author    David Woodhouse <dwmw2@infradead.org>  2006-10-21 11:46:04 -0400
committer David Woodhouse <dwmw2@infradead.org>  2006-10-21 11:46:04 -0400
commit    513b046c96cc2fbce730a3474f6f7ff0c4fdd05c (patch)
tree      e8006368b6f643067486f92405a404757807d6da /fs
parent    82810b7b6cc7a74c68881a13b0eb66c7a6370fcc (diff)
parent    c7a3bd177f248d01ee18a01d22048c80e071c331 (diff)
Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6
Diffstat (limited to 'fs')
-rw-r--r--  fs/Kconfig | 131
-rw-r--r--  fs/Makefile | 5
-rw-r--r--  fs/afs/dir.c | 8
-rw-r--r--  fs/autofs/autofs_i.h | 1
-rw-r--r--  fs/autofs/dirhash.c | 1
-rw-r--r--  fs/autofs/init.c | 2
-rw-r--r--  fs/autofs/inode.c | 4
-rw-r--r--  fs/autofs4/autofs_i.h | 3
-rw-r--r--  fs/autofs4/init.c | 2
-rw-r--r--  fs/autofs4/inode.c | 22
-rw-r--r--  fs/autofs4/waitq.c | 1
-rw-r--r--  fs/befs/befs.h | 6
-rw-r--r--  fs/befs/befs_fs_types.h | 112
-rw-r--r--  fs/befs/btree.c | 29
-rw-r--r--  fs/befs/datastream.c | 11
-rw-r--r--  fs/befs/debug.c | 12
-rw-r--r--  fs/befs/endian.h | 57
-rw-r--r--  fs/befs/inode.c | 1
-rw-r--r--  fs/befs/linuxvfs.c | 1
-rw-r--r--  fs/befs/super.c | 1
-rw-r--r--  fs/binfmt_elf.c | 10
-rw-r--r--  fs/binfmt_som.c | 18
-rw-r--r--  fs/bio.c | 9
-rw-r--r--  fs/buffer.c | 39
-rw-r--r--  fs/cifs/cifsacl.h | 4
-rw-r--r--  fs/cifs/cifsencrypt.h | 2
-rw-r--r--  fs/cifs/cifsfs.c | 27
-rw-r--r--  fs/cifs/cifsfs.h | 2
-rw-r--r--  fs/cifs/cifsglob.h | 15
-rw-r--r--  fs/cifs/cifspdu.h | 12
-rw-r--r--  fs/cifs/cifsproto.h | 12
-rw-r--r--  fs/cifs/cifssmb.c | 102
-rw-r--r--  fs/cifs/connect.c | 35
-rw-r--r--  fs/cifs/inode.c | 12
-rw-r--r--  fs/cifs/link.c | 6
-rw-r--r--  fs/cifs/md5.c | 8
-rw-r--r--  fs/cifs/md5.h | 8
-rw-r--r--  fs/cifs/misc.c | 44
-rw-r--r--  fs/cifs/netmisc.c | 58
-rw-r--r--  fs/cifs/readdir.c | 27
-rw-r--r--  fs/cifs/sess.c | 23
-rw-r--r--  fs/cifs/smbdes.c | 6
-rw-r--r--  fs/cifs/smbencrypt.c | 11
-rw-r--r--  fs/compat.c | 2
-rw-r--r--  fs/compat_ioctl.c | 10
-rw-r--r--  fs/configfs/file.c | 14
-rw-r--r--  fs/configfs/item.c | 2
-rw-r--r--  fs/dcache.c | 139
-rw-r--r--  fs/dlm/Kconfig | 20
-rw-r--r--  fs/dlm/Makefile | 19
-rw-r--r--  fs/dlm/ast.c | 173
-rw-r--r--  fs/dlm/ast.h | 26
-rw-r--r--  fs/dlm/config.c | 789
-rw-r--r--  fs/dlm/config.h | 42
-rw-r--r--  fs/dlm/debug_fs.c | 387
-rw-r--r--  fs/dlm/dir.c | 423
-rw-r--r--  fs/dlm/dir.h | 30
-rw-r--r--  fs/dlm/dlm_internal.h | 543
-rw-r--r--  fs/dlm/lock.c | 3871
-rw-r--r--  fs/dlm/lock.h | 62
-rw-r--r--  fs/dlm/lockspace.c | 717
-rw-r--r--  fs/dlm/lockspace.h | 25
-rw-r--r--  fs/dlm/lowcomms.c | 1239
-rw-r--r--  fs/dlm/lowcomms.h | 26
-rw-r--r--  fs/dlm/lvb_table.h | 18
-rw-r--r--  fs/dlm/main.c | 97
-rw-r--r--  fs/dlm/member.c | 327
-rw-r--r--  fs/dlm/member.h | 24
-rw-r--r--  fs/dlm/memory.c | 116
-rw-r--r--  fs/dlm/memory.h | 29
-rw-r--r--  fs/dlm/midcomms.c | 140
-rw-r--r--  fs/dlm/midcomms.h | 21
-rw-r--r--  fs/dlm/rcom.c | 472
-rw-r--r--  fs/dlm/rcom.h | 24
-rw-r--r--  fs/dlm/recover.c | 765
-rw-r--r--  fs/dlm/recover.h | 34
-rw-r--r--  fs/dlm/recoverd.c | 290
-rw-r--r--  fs/dlm/recoverd.h | 24
-rw-r--r--  fs/dlm/requestqueue.c | 184
-rw-r--r--  fs/dlm/requestqueue.h | 22
-rw-r--r--  fs/dlm/user.c | 788
-rw-r--r--  fs/dlm/user.h | 16
-rw-r--r--  fs/dlm/util.c | 161
-rw-r--r--  fs/dlm/util.h | 22
-rw-r--r--  fs/ecryptfs/Makefile | 7
-rw-r--r--  fs/ecryptfs/crypto.c | 1659
-rw-r--r--  fs/ecryptfs/debug.c | 123
-rw-r--r--  fs/ecryptfs/dentry.c | 87
-rw-r--r--  fs/ecryptfs/ecryptfs_kernel.h | 482
-rw-r--r--  fs/ecryptfs/file.c | 440
-rw-r--r--  fs/ecryptfs/inode.c | 1079
-rw-r--r--  fs/ecryptfs/keystore.c | 1061
-rw-r--r--  fs/ecryptfs/main.c | 828
-rw-r--r--  fs/ecryptfs/mmap.c | 788
-rw-r--r--  fs/ecryptfs/super.c | 198
-rw-r--r--  fs/eventpoll.c | 56
-rw-r--r--  fs/ext2/super.c | 16
-rw-r--r--  fs/ext3/super.c | 2
-rw-r--r--  fs/ext4/Makefile | 12
-rw-r--r--  fs/ext4/acl.c | 551
-rw-r--r--  fs/ext4/acl.h | 81
-rw-r--r--  fs/ext4/balloc.c | 1833
-rw-r--r--  fs/ext4/bitmap.c | 32
-rw-r--r--  fs/ext4/dir.c | 518
-rw-r--r--  fs/ext4/extents.c | 2152
-rw-r--r--  fs/ext4/file.c | 139
-rw-r--r--  fs/ext4/fsync.c | 88
-rw-r--r--  fs/ext4/hash.c | 152
-rw-r--r--  fs/ext4/ialloc.c | 772
-rw-r--r--  fs/ext4/inode.c | 3233
-rw-r--r--  fs/ext4/ioctl.c | 306
-rw-r--r--  fs/ext4/namei.c | 2395
-rw-r--r--  fs/ext4/namei.h | 8
-rw-r--r--  fs/ext4/resize.c | 1045
-rw-r--r--  fs/ext4/super.c | 2829
-rw-r--r--  fs/ext4/symlink.c | 54
-rw-r--r--  fs/ext4/xattr.c | 1317
-rw-r--r--  fs/ext4/xattr.h | 145
-rw-r--r--  fs/ext4/xattr_security.c | 77
-rw-r--r--  fs/ext4/xattr_trusted.c | 62
-rw-r--r--  fs/ext4/xattr_user.c | 64
-rw-r--r--  fs/fat/file.c | 3
-rw-r--r--  fs/fat/inode.c | 4
-rw-r--r--  fs/fuse/dir.c | 107
-rw-r--r--  fs/fuse/file.c | 12
-rw-r--r--  fs/fuse/fuse_i.h | 3
-rw-r--r--  fs/fuse/inode.c | 15
-rw-r--r--  fs/gfs2/Kconfig | 44
-rw-r--r--  fs/gfs2/Makefile | 10
-rw-r--r--  fs/gfs2/acl.c | 309
-rw-r--r--  fs/gfs2/acl.h | 39
-rw-r--r--  fs/gfs2/bmap.c | 1222
-rw-r--r--  fs/gfs2/bmap.h | 31
-rw-r--r--  fs/gfs2/daemon.c | 196
-rw-r--r--  fs/gfs2/daemon.h | 19
-rw-r--r--  fs/gfs2/dir.c | 1957
-rw-r--r--  fs/gfs2/dir.h | 79
-rw-r--r--  fs/gfs2/eaops.c | 230
-rw-r--r--  fs/gfs2/eaops.h | 30
-rw-r--r--  fs/gfs2/eattr.c | 1501
-rw-r--r--  fs/gfs2/eattr.h | 100
-rw-r--r--  fs/gfs2/gfs2.h | 31
-rw-r--r--  fs/gfs2/glock.c | 2231
-rw-r--r--  fs/gfs2/glock.h | 153
-rw-r--r--  fs/gfs2/glops.c | 615
-rw-r--r--  fs/gfs2/glops.h | 25
-rw-r--r--  fs/gfs2/incore.h | 634
-rw-r--r--  fs/gfs2/inode.c | 1379
-rw-r--r--  fs/gfs2/inode.h | 56
-rw-r--r--  fs/gfs2/lm.c | 217
-rw-r--r--  fs/gfs2/lm.h | 42
-rw-r--r--  fs/gfs2/locking.c | 184
-rw-r--r--  fs/gfs2/locking/dlm/Makefile | 3
-rw-r--r--  fs/gfs2/locking/dlm/lock.c | 524
-rw-r--r--  fs/gfs2/locking/dlm/lock_dlm.h | 187
-rw-r--r--  fs/gfs2/locking/dlm/main.c | 64
-rw-r--r--  fs/gfs2/locking/dlm/mount.c | 255
-rw-r--r--  fs/gfs2/locking/dlm/plock.c | 301
-rw-r--r--  fs/gfs2/locking/dlm/sysfs.c | 226
-rw-r--r--  fs/gfs2/locking/dlm/thread.c | 359
-rw-r--r--  fs/gfs2/locking/nolock/Makefile | 3
-rw-r--r--  fs/gfs2/locking/nolock/main.c | 246
-rw-r--r--  fs/gfs2/log.c | 688
-rw-r--r--  fs/gfs2/log.h | 65
-rw-r--r--  fs/gfs2/lops.c | 809
-rw-r--r--  fs/gfs2/lops.h | 99
-rw-r--r--  fs/gfs2/main.c | 150
-rw-r--r--  fs/gfs2/meta_io.c | 590
-rw-r--r--  fs/gfs2/meta_io.h | 78
-rw-r--r--  fs/gfs2/mount.c | 214
-rw-r--r--  fs/gfs2/mount.h | 17
-rw-r--r--  fs/gfs2/ondisk.c | 308
-rw-r--r--  fs/gfs2/ops_address.c | 793
-rw-r--r--  fs/gfs2/ops_address.h | 22
-rw-r--r--  fs/gfs2/ops_dentry.c | 119
-rw-r--r--  fs/gfs2/ops_dentry.h | 17
-rw-r--r--  fs/gfs2/ops_export.c | 298
-rw-r--r--  fs/gfs2/ops_export.h | 22
-rw-r--r--  fs/gfs2/ops_file.c | 661
-rw-r--r--  fs/gfs2/ops_file.h | 24
-rw-r--r--  fs/gfs2/ops_fstype.c | 925
-rw-r--r--  fs/gfs2/ops_fstype.h | 18
-rw-r--r--  fs/gfs2/ops_inode.c | 1151
-rw-r--r--  fs/gfs2/ops_inode.h | 20
-rw-r--r--  fs/gfs2/ops_super.c | 468
-rw-r--r--  fs/gfs2/ops_super.h | 17
-rw-r--r--  fs/gfs2/ops_vm.c | 184
-rw-r--r--  fs/gfs2/ops_vm.h | 18
-rw-r--r--  fs/gfs2/quota.c | 1228
-rw-r--r--  fs/gfs2/quota.h | 35
-rw-r--r--  fs/gfs2/recovery.c | 571
-rw-r--r--  fs/gfs2/recovery.h | 34
-rw-r--r--  fs/gfs2/rgrp.c | 1513
-rw-r--r--  fs/gfs2/rgrp.h | 69
-rw-r--r--  fs/gfs2/super.c | 976
-rw-r--r--  fs/gfs2/super.h | 55
-rw-r--r--  fs/gfs2/sys.c | 583
-rw-r--r--  fs/gfs2/sys.h | 27
-rw-r--r--  fs/gfs2/trans.c | 184
-rw-r--r--  fs/gfs2/trans.h | 39
-rw-r--r--  fs/gfs2/util.c | 245
-rw-r--r--  fs/gfs2/util.h | 170
-rw-r--r--  fs/hpfs/inode.c | 11
-rw-r--r--  fs/hppfs/hppfs_kern.c | 2
-rw-r--r--  fs/hugetlbfs/inode.c | 2
-rw-r--r--  fs/inode.c | 2
-rw-r--r--  fs/ioprio.c | 5
-rw-r--r--  fs/isofs/joliet.c | 10
-rw-r--r--  fs/isofs/namei.c | 1
-rw-r--r--  fs/jbd/journal.c | 3
-rw-r--r--  fs/jbd/transaction.c | 5
-rw-r--r--  fs/jbd2/Makefile | 7
-rw-r--r--  fs/jbd2/checkpoint.c | 697
-rw-r--r--  fs/jbd2/commit.c | 920
-rw-r--r--  fs/jbd2/journal.c | 2084
-rw-r--r--  fs/jbd2/recovery.c | 609
-rw-r--r--  fs/jbd2/revoke.c | 712
-rw-r--r--  fs/jbd2/transaction.c | 2081
-rw-r--r--  fs/jffs2/super.c | 8
-rw-r--r--  fs/lockd/clntlock.c | 58
-rw-r--r--  fs/lockd/clntproc.c | 17
-rw-r--r--  fs/lockd/host.c | 325
-rw-r--r--  fs/lockd/mon.c | 77
-rw-r--r--  fs/lockd/svc.c | 19
-rw-r--r--  fs/lockd/svc4proc.c | 85
-rw-r--r--  fs/lockd/svclock.c | 207
-rw-r--r--  fs/lockd/svcproc.c | 89
-rw-r--r--  fs/lockd/svcshare.c | 24
-rw-r--r--  fs/lockd/svcsubs.c | 183
-rw-r--r--  fs/lockd/xdr.c | 76
-rw-r--r--  fs/lockd/xdr4.c | 80
-rw-r--r--  fs/minix/inode.c | 8
-rw-r--r--  fs/ncpfs/ioctl.c | 2
-rw-r--r--  fs/nfs/callback.h | 10
-rw-r--r--  fs/nfs/callback_proc.c | 6
-rw-r--r--  fs/nfs/callback_xdr.c | 106
-rw-r--r--  fs/nfs/client.c | 52
-rw-r--r--  fs/nfs/dir.c | 16
-rw-r--r--  fs/nfs/direct.c | 25
-rw-r--r--  fs/nfs/getroot.c | 1
-rw-r--r--  fs/nfs/inode.c | 30
-rw-r--r--  fs/nfs/internal.h | 6
-rw-r--r--  fs/nfs/mount_clnt.c | 6
-rw-r--r--  fs/nfs/namespace.c | 2
-rw-r--r--  fs/nfs/nfs2xdr.c | 78
-rw-r--r--  fs/nfs/nfs3proc.c | 2
-rw-r--r--  fs/nfs/nfs3xdr.c | 118
-rw-r--r--  fs/nfs/nfs4_fs.h | 2
-rw-r--r--  fs/nfs/nfs4namespace.c | 2
-rw-r--r--  fs/nfs/nfs4proc.c | 16
-rw-r--r--  fs/nfs/nfs4xdr.c | 360
-rw-r--r--  fs/nfs/nfsroot.c | 1
-rw-r--r--  fs/nfs/super.c | 3
-rw-r--r--  fs/nfs/write.c | 8
-rw-r--r--  fs/nfs_common/nfsacl.c | 4
-rw-r--r--  fs/nfsd/export.c | 153
-rw-r--r--  fs/nfsd/lockd.c | 16
-rw-r--r--  fs/nfsd/nfs2acl.c | 37
-rw-r--r--  fs/nfsd/nfs3acl.c | 23
-rw-r--r--  fs/nfsd/nfs3proc.c | 108
-rw-r--r--  fs/nfsd/nfs3xdr.c | 182
-rw-r--r--  fs/nfsd/nfs4acl.c | 711
-rw-r--r--  fs/nfsd/nfs4callback.c | 26
-rw-r--r--  fs/nfsd/nfs4proc.c | 140
-rw-r--r--  fs/nfsd/nfs4recover.c | 14
-rw-r--r--  fs/nfsd/nfs4state.c | 119
-rw-r--r--  fs/nfsd/nfs4xdr.c | 454
-rw-r--r--  fs/nfsd/nfscache.c | 8
-rw-r--r--  fs/nfsd/nfsctl.c | 49
-rw-r--r--  fs/nfsd/nfsfh.c | 10
-rw-r--r--  fs/nfsd/nfsproc.c | 91
-rw-r--r--  fs/nfsd/nfssvc.c | 29
-rw-r--r--  fs/nfsd/nfsxdr.c | 115
-rw-r--r--  fs/nfsd/vfs.c | 381
-rw-r--r--  fs/ocfs2/cluster/nodemanager.c | 10
-rw-r--r--  fs/ocfs2/file.c | 51
-rw-r--r--  fs/ocfs2/namei.c | 8
-rw-r--r--  fs/ocfs2/super.c | 2
-rw-r--r--  fs/partitions/check.c | 50
-rw-r--r--  fs/partitions/msdos.c | 6
-rw-r--r--  fs/proc/base.c | 6
-rw-r--r--  fs/proc/proc_misc.c | 2
-rw-r--r--  fs/reiserfs/bitmap.c | 4
-rw-r--r--  fs/reiserfs/file.c | 1
-rw-r--r--  fs/reiserfs/inode.c | 2
-rw-r--r--  fs/reiserfs/journal.c | 3
-rw-r--r--  fs/reiserfs/super.c | 31
-rw-r--r--  fs/splice.c | 6
-rw-r--r--  fs/super.c | 12
-rw-r--r--  fs/sysfs/file.c | 7
-rw-r--r--  fs/sysv/super.c | 15
-rw-r--r--  fs/udf/super.c | 3
-rw-r--r--  fs/ufs/util.c | 14
-rw-r--r--  fs/xattr.c | 33
-rw-r--r--  fs/xfs/linux-2.6/kmem.c | 5
-rw-r--r--  fs/xfs/linux-2.6/xfs_buf.c | 3
296 files changed, 74480 insertions, 2597 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index 68f4561423ff..fee318e6f4bb 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -140,6 +140,73 @@ config EXT3_FS_SECURITY
 	  If you are not using a security module that requires using
 	  extended attributes for file security labels, say N.
 
+config EXT4DEV_FS
+	tristate "Ext4dev/ext4 extended fs support development (EXPERIMENTAL)"
+	depends on EXPERIMENTAL
+	select JBD2
+	help
+	  Ext4dev is a predecessor filesystem of the next generation
+	  extended fs ext4, based on ext3 filesystem code. It will be
+	  renamed ext4 fs later, once ext4dev is mature and stabilized.
+
+	  Unlike the change from ext2 filesystem to ext3 filesystem,
+	  the on-disk format of ext4dev is not the same as ext3 any more:
+	  it is based on extent maps and it supports 48-bit physical block
+	  numbers. These combined on-disk format changes will allow
+	  ext4dev/ext4 to handle more than 16 TB filesystem volumes --
+	  a hard limit that ext3 cannot overcome without changing the
+	  on-disk format.
+
+	  Other than extent maps and 48-bit block numbers, ext4dev also is
+	  likely to have other new features such as persistent preallocation,
+	  high resolution time stamps, and larger file support etc. These
+	  features will be added to ext4dev gradually.
+
+	  To compile this file system support as a module, choose M here. The
+	  module will be called ext4dev. Be aware, however, that the filesystem
+	  of your root partition (the one containing the directory /) cannot
+	  be compiled as a module, and so this could be dangerous.
+
+	  If unsure, say N.
+
+config EXT4DEV_FS_XATTR
+	bool "Ext4dev extended attributes"
+	depends on EXT4DEV_FS
+	default y
+	help
+	  Extended attributes are name:value pairs associated with inodes by
+	  the kernel or by users (see the attr(5) manual page, or visit
+	  <http://acl.bestbits.at/> for details).
+
+	  If unsure, say N.
+
+	  You need this for POSIX ACL support on ext4dev/ext4.
+
+config EXT4DEV_FS_POSIX_ACL
+	bool "Ext4dev POSIX Access Control Lists"
+	depends on EXT4DEV_FS_XATTR
+	select FS_POSIX_ACL
+	help
+	  POSIX Access Control Lists (ACLs) support permissions for users and
+	  groups beyond the owner/group/world scheme.
+
+	  To learn more about Access Control Lists, visit the POSIX ACLs for
+	  Linux website <http://acl.bestbits.at/>.
+
+	  If you don't know what Access Control Lists are, say N
+
+config EXT4DEV_FS_SECURITY
+	bool "Ext4dev Security Labels"
+	depends on EXT4DEV_FS_XATTR
+	help
+	  Security labels support alternative access control models
+	  implemented by security modules like SELinux. This option
+	  enables an extended attribute handler for file security
+	  labels in the ext4dev/ext4 filesystem.
+
+	  If you are not using a security module that requires using
+	  extended attributes for file security labels, say N.
+
 config JBD
 	tristate
 	help
@@ -172,12 +239,44 @@ config JBD_DEBUG
 	  generated. To turn debugging off again, do
 	  "echo 0 > /proc/sys/fs/jbd-debug".
 
+config JBD2
+	tristate
+	help
+	  This is a generic journaling layer for block devices that support
+	  both 32-bit and 64-bit block numbers. It is currently used by
+	  the ext4dev/ext4 filesystem, but it could also be used to add
+	  journal support to other file systems or block devices such
+	  as RAID or LVM.
+
+	  If you are using ext4dev/ext4, you need to say Y here. If you are not
+	  using ext4dev/ext4 then you will probably want to say N.
+
+	  To compile this device as a module, choose M here. The module will be
+	  called jbd2. If you are compiling ext4dev/ext4 into the kernel,
+	  you cannot compile this code as a module.
+
+config JBD2_DEBUG
+	bool "JBD2 (ext4dev/ext4) debugging support"
+	depends on JBD2
+	help
+	  If you are using the ext4dev/ext4 journaled file system (or
+	  potentially any other filesystem/device using JBD2), this option
+	  allows you to enable debugging output while the system is running,
+	  in order to help track down any problems you are having.
+	  By default, the debugging output will be turned off.
+
+	  If you select Y here, then you will be able to turn on debugging
+	  with "echo N > /proc/sys/fs/jbd2-debug", where N is a number between
+	  1 and 5. The higher the number, the more debugging output is
+	  generated. To turn debugging off again, do
+	  "echo 0 > /proc/sys/fs/jbd2-debug".
+
 config FS_MBCACHE
-# Meta block cache for Extended Attributes (ext2/ext3)
+# Meta block cache for Extended Attributes (ext2/ext3/ext4)
 	tristate
-	depends on EXT2_FS_XATTR || EXT3_FS_XATTR
-	default y if EXT2_FS=y || EXT3_FS=y
-	default m if EXT2_FS=m || EXT3_FS=m
+	depends on EXT2_FS_XATTR || EXT3_FS_XATTR || EXT4DEV_FS_XATTR
+	default y if EXT2_FS=y || EXT3_FS=y || EXT4DEV_FS=y
+	default m if EXT2_FS=m || EXT3_FS=m || EXT4DEV_FS=m
 
 config REISERFS_FS
 	tristate "Reiserfs support"
@@ -325,6 +424,7 @@ config FS_POSIX_ACL
 	default n
 
 source "fs/xfs/Kconfig"
+source "fs/gfs2/Kconfig"
 
 config OCFS2_FS
 	tristate "OCFS2 file system support"
@@ -534,6 +634,10 @@ config FUSE_FS
 	  If you want to develop a userspace FS, or if you want to use
 	  a filesystem based on FUSE, answer Y or M.
 
+config GENERIC_ACL
+	bool
+	select FS_POSIX_ACL
+
 if BLOCK
 menu "CD-ROM/DVD Filesystems"
 
@@ -995,6 +1099,18 @@ config AFFS_FS
 	  To compile this file system support as a module, choose M here: the
 	  module will be called affs. If unsure, say N.
 
+config ECRYPT_FS
+	tristate "eCrypt filesystem layer support (EXPERIMENTAL)"
+	depends on EXPERIMENTAL && KEYS && CRYPTO
+	help
+	  Encrypted filesystem that operates on the VFS layer. See
+	  <file:Documentation/ecryptfs.txt> to learn more about
+	  eCryptfs. Userspace components are required and can be
+	  obtained from <http://ecryptfs.sf.net>.
+
+	  To compile this file system support as a module, choose M here: the
+	  module will be called ecryptfs.
+
 config HFS_FS
 	tristate "Apple Macintosh file system support (EXPERIMENTAL)"
 	depends on BLOCK && EXPERIMENTAL
@@ -1874,7 +1990,7 @@ config CIFS_EXPERIMENTAL
 config CIFS_UPCALL
 	bool "Kerberos/SPNEGO advanced session setup (EXPERIMENTAL)"
 	depends on CIFS_EXPERIMENTAL
-	select CONNECTOR
+	depends on CONNECTOR
 	help
 	  Enables an upcall mechanism for CIFS which will be used to contact
 	  userspace helper utilities to provide SPNEGO packaged Kerberos
@@ -1968,10 +2084,6 @@ config 9P_FS
 
 	  If unsure, say N.
 
-config GENERIC_ACL
-	bool
-	select FS_POSIX_ACL
-
 endmenu
 
 if BLOCK
@@ -1983,6 +2095,7 @@ endmenu
 endif
 
 source "fs/nls/Kconfig"
+source "fs/dlm/Kconfig"
 
 endmenu
 
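
The JBD2_DEBUG help text above documents a /proc tunable. As a minimal userspace sketch of driving it (an assumption-laden illustration: it presumes CONFIG_JBD2_DEBUG is enabled so that /proc/sys/fs/jbd2-debug exists exactly as the help text describes):

#include <stdio.h>

/* Sketch only: writes the verbosity level described in the JBD2_DEBUG
 * help text above; 0 turns debugging off, 5 is the most verbose. */
static int set_jbd2_debug(int level)
{
	FILE *f = fopen("/proc/sys/fs/jbd2-debug", "w");

	if (!f) {
		perror("jbd2-debug");
		return -1;
	}
	fprintf(f, "%d\n", level);
	return fclose(f);
}

int main(void)
{
	set_jbd2_debug(5);	/* maximum debugging output */
	set_jbd2_debug(0);	/* debugging off again */
	return 0;
}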
diff --git a/fs/Makefile b/fs/Makefile
index 819b2a93bebe..9a5ce9323bfd 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -57,11 +57,14 @@ obj-$(CONFIG_CONFIGFS_FS) += configfs/
 obj-y				+= devpts/
 
 obj-$(CONFIG_PROFILING)		+= dcookies.o
+obj-$(CONFIG_DLM)		+= dlm/
 
 # Do not add any filesystems before this line
 obj-$(CONFIG_REISERFS_FS)	+= reiserfs/
 obj-$(CONFIG_EXT3_FS)		+= ext3/ # Before ext2 so root fs can be ext3
+obj-$(CONFIG_EXT4DEV_FS)	+= ext4/ # Before ext2 so root fs can be ext4dev
 obj-$(CONFIG_JBD)		+= jbd/
+obj-$(CONFIG_JBD2)		+= jbd2/
 obj-$(CONFIG_EXT2_FS)		+= ext2/
 obj-$(CONFIG_CRAMFS)		+= cramfs/
 obj-$(CONFIG_RAMFS)		+= ramfs/
@@ -75,6 +78,7 @@ obj-$(CONFIG_BFS_FS) += bfs/
 obj-$(CONFIG_ISO9660_FS)	+= isofs/
 obj-$(CONFIG_HFSPLUS_FS)	+= hfsplus/ # Before hfs to find wrapped HFS+
 obj-$(CONFIG_HFS_FS)		+= hfs/
+obj-$(CONFIG_ECRYPT_FS)		+= ecryptfs/
 obj-$(CONFIG_VXFS_FS)		+= freevxfs/
 obj-$(CONFIG_NFS_FS)		+= nfs/
 obj-$(CONFIG_EXPORTFS)		+= exportfs/
@@ -109,3 +113,4 @@ obj-$(CONFIG_HOSTFS) += hostfs/
 obj-$(CONFIG_HPPFS)		+= hppfs/
 obj-$(CONFIG_DEBUG_FS)		+= debugfs/
 obj-$(CONFIG_OCFS2_FS)		+= ocfs2/
+obj-$(CONFIG_GFS2_FS)		+= gfs2/
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index cf8a2cb28505..a6ec75c56fcf 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -211,8 +211,8 @@ static int afs_dir_open(struct inode *inode, struct file *file)
 {
 	_enter("{%lu}", inode->i_ino);
 
-	BUG_ON(sizeof(union afs_dir_block) != 2048);
-	BUG_ON(sizeof(union afs_dirent) != 32);
+	BUILD_BUG_ON(sizeof(union afs_dir_block) != 2048);
+	BUILD_BUG_ON(sizeof(union afs_dirent) != 32);
 
 	if (AFS_FS_I(inode)->flags & AFS_VNODE_DELETED)
 		return -ENOENT;
@@ -446,8 +446,8 @@ static struct dentry *afs_dir_lookup(struct inode *dir, struct dentry *dentry,
 	_enter("{%lu},%p{%s}", dir->i_ino, dentry, dentry->d_name.name);
 
 	/* insanity checks first */
-	BUG_ON(sizeof(union afs_dir_block) != 2048);
-	BUG_ON(sizeof(union afs_dirent) != 32);
+	BUILD_BUG_ON(sizeof(union afs_dir_block) != 2048);
+	BUILD_BUG_ON(sizeof(union afs_dirent) != 32);
 
 	if (dentry->d_name.len > 255) {
 		_leave(" = -ENAMETOOLONG");
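
The AFS change above swaps runtime BUG_ON() size checks for compile-time BUILD_BUG_ON() ones. A self-contained sketch of the idea, using the negative-array-size trick the kernel macro of this era was built on (reproduced from memory, so treat the exact definition as an assumption):

/* If the condition is true, the array size goes negative and the build
 * fails, so a wrong on-disk structure size is caught at compile time
 * instead of panicking at runtime the way BUG_ON() would. */
#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2 * !!(condition)]))

union demo_dirent {
	char raw[32];
};

int main(void)
{
	BUILD_BUG_ON(sizeof(union demo_dirent) != 32);	/* false: compiles */
	/* BUILD_BUG_ON(sizeof(union demo_dirent) != 16);  would break the build */
	return 0;
}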
diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h
index c7700d9b3f96..906ba5ce2261 100644
--- a/fs/autofs/autofs_i.h
+++ b/fs/autofs/autofs_i.h
@@ -149,6 +149,7 @@ extern const struct file_operations autofs_root_operations;
 /* Initializing function */
 
 int autofs_fill_super(struct super_block *, void *, int);
+void autofs_kill_sb(struct super_block *sb);
 
 /* Queue management functions */
 
diff --git a/fs/autofs/dirhash.c b/fs/autofs/dirhash.c
index 3fded389d06b..bf8c8af98004 100644
--- a/fs/autofs/dirhash.c
+++ b/fs/autofs/dirhash.c
@@ -246,5 +246,4 @@ void autofs_hash_nuke(struct autofs_sb_info *sbi)
 			kfree(ent);
 		}
 	}
-	shrink_dcache_sb(sbi->sb);
 }
diff --git a/fs/autofs/init.c b/fs/autofs/init.c
index aca123752406..cea5219b4f37 100644
--- a/fs/autofs/init.c
+++ b/fs/autofs/init.c
@@ -24,7 +24,7 @@ static struct file_system_type autofs_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "autofs",
 	.get_sb		= autofs_get_sb,
-	.kill_sb	= kill_anon_super,
+	.kill_sb	= autofs_kill_sb,
 };
 
 static int __init init_autofs_fs(void)
diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c
index 2c9759baad61..54c518c89e4c 100644
--- a/fs/autofs/inode.c
+++ b/fs/autofs/inode.c
@@ -20,7 +20,7 @@
 #include "autofs_i.h"
 #include <linux/module.h>
 
-static void autofs_put_super(struct super_block *sb)
+void autofs_kill_sb(struct super_block *sb)
 {
 	struct autofs_sb_info *sbi = autofs_sbi(sb);
 	unsigned int n;
@@ -37,13 +37,13 @@ static void autofs_put_super(struct super_block *sb)
 	kfree(sb->s_fs_info);
 
 	DPRINTK(("autofs: shutting down\n"));
+	kill_anon_super(sb);
 }
 
 static void autofs_read_inode(struct inode *inode);
 
 static struct super_operations autofs_sops = {
 	.read_inode	= autofs_read_inode,
-	.put_super	= autofs_put_super,
 	.statfs		= simple_statfs,
 };
 
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index 480ab178cba5..b13f32c8aeee 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -94,7 +94,6 @@ struct autofs_wait_queue {
 
 struct autofs_sb_info {
 	u32 magic;
-	struct dentry *root;
 	int pipefd;
 	struct file *pipe;
 	pid_t oz_pgrp;
@@ -229,4 +228,4 @@ out:
 }
 
 void autofs4_dentry_release(struct dentry *);
-
+extern void autofs4_kill_sb(struct super_block *);
diff --git a/fs/autofs4/init.c b/fs/autofs4/init.c
index 5d9193332bef..723a1c5e361b 100644
--- a/fs/autofs4/init.c
+++ b/fs/autofs4/init.c
@@ -24,7 +24,7 @@ static struct file_system_type autofs_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "autofs",
 	.get_sb		= autofs_get_sb,
-	.kill_sb	= kill_anon_super,
+	.kill_sb	= autofs4_kill_sb,
 };
 
 static int __init init_autofs4_fs(void)
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index 800ce876caec..51fd8595bf85 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -96,7 +96,7 @@ void autofs4_free_ino(struct autofs_info *ino)
  */
 static void autofs4_force_release(struct autofs_sb_info *sbi)
 {
-	struct dentry *this_parent = sbi->root;
+	struct dentry *this_parent = sbi->sb->s_root;
 	struct list_head *next;
 
 	spin_lock(&dcache_lock);
@@ -127,7 +127,7 @@ resume:
 		spin_lock(&dcache_lock);
 	}
 
-	if (this_parent != sbi->root) {
+	if (this_parent != sbi->sb->s_root) {
 		struct dentry *dentry = this_parent;
 
 		next = this_parent->d_u.d_child.next;
@@ -140,15 +140,9 @@ resume:
 		goto resume;
 	}
 	spin_unlock(&dcache_lock);
-
-	dput(sbi->root);
-	sbi->root = NULL;
-	shrink_dcache_sb(sbi->sb);
-
-	return;
 }
 
-static void autofs4_put_super(struct super_block *sb)
+void autofs4_kill_sb(struct super_block *sb)
 {
 	struct autofs_sb_info *sbi = autofs4_sbi(sb);
 
@@ -163,6 +157,7 @@ static void autofs4_put_super(struct super_block *sb)
 	kfree(sbi);
 
 	DPRINTK("shutting down");
+	kill_anon_super(sb);
 }
 
 static int autofs4_show_options(struct seq_file *m, struct vfsmount *mnt)
@@ -189,7 +184,6 @@ static int autofs4_show_options(struct seq_file *m, struct vfsmount *mnt)
 }
 
 static struct super_operations autofs4_sops = {
-	.put_super	= autofs4_put_super,
 	.statfs		= simple_statfs,
 	.show_options	= autofs4_show_options,
 };
@@ -315,7 +309,6 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
 
 	s->s_fs_info = sbi;
 	sbi->magic = AUTOFS_SBI_MAGIC;
-	sbi->root = NULL;
 	sbi->pipefd = -1;
 	sbi->catatonic = 0;
 	sbi->exp_timeout = 0;
@@ -397,13 +390,6 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
 	sbi->pipefd = pipefd;
 
 	/*
-	 * Take a reference to the root dentry so we get a chance to
-	 * clean up the dentry tree on umount.
-	 * See autofs4_force_release.
-	 */
-	sbi->root = dget(root);
-
-	/*
 	 * Success! Install the root dentry now to indicate completion.
 	 */
 	s->s_root = root;
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index ce103e7b0bc3..c0a6c8d445c7 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -45,7 +45,6 @@ void autofs4_catatonic_mode(struct autofs_sb_info *sbi)
 		fput(sbi->pipe);	/* Close the pipe */
 		sbi->pipe = NULL;
 	}
-	shrink_dcache_sb(sbi->sb);
 }
 
 static int autofs4_write(struct file *file, const void *addr, int bytes)
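
The autofs and autofs4 changes above share one shape: per-filesystem teardown moves out of ->put_super and into a ->kill_sb method that frees private state and then chains to kill_anon_super(). A userspace sketch of that shape, with hypothetical stand-in types (super_block and kill_anon_super here are local stubs, not the kernel's):

#include <stdio.h>
#include <stdlib.h>

struct super_block {
	void *s_fs_info;	/* fs-private info, as in the kernel struct */
};

/* Stand-in for the generic VFS teardown the real code chains to. */
static void kill_anon_super(struct super_block *sb)
{
	printf("generic superblock shutdown\n");
}

/* Shape of autofs_kill_sb()/autofs4_kill_sb() after this series:
 * fs-private cleanup runs first, then the generic helper. */
static void example_kill_sb(struct super_block *sb)
{
	free(sb->s_fs_info);
	kill_anon_super(sb);
}

int main(void)
{
	struct super_block sb = { .s_fs_info = malloc(16) };

	example_kill_sb(&sb);
	return 0;
}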
diff --git a/fs/befs/befs.h b/fs/befs/befs.h
index 057a2c3d73b7..d9a40abda6b7 100644
--- a/fs/befs/befs.h
+++ b/fs/befs/befs.h
@@ -94,7 +94,7 @@ void befs_debug(const struct super_block *sb, const char *fmt, ...);
 
 void befs_dump_super_block(const struct super_block *sb, befs_super_block *);
 void befs_dump_inode(const struct super_block *sb, befs_inode *);
-void befs_dump_index_entry(const struct super_block *sb, befs_btree_super *);
+void befs_dump_index_entry(const struct super_block *sb, befs_disk_btree_super *);
 void befs_dump_index_node(const struct super_block *sb, befs_btree_nodehead *);
 /****************************/
 
@@ -136,7 +136,7 @@ blockno2iaddr(struct super_block *sb, befs_blocknr_t blockno)
 static inline unsigned int
 befs_iaddrs_per_block(struct super_block *sb)
 {
-	return BEFS_SB(sb)->block_size / sizeof (befs_inode_addr);
+	return BEFS_SB(sb)->block_size / sizeof (befs_disk_inode_addr);
 }
 
 static inline int
@@ -151,4 +151,6 @@ befs_brun_size(struct super_block *sb, befs_block_run run)
 	return BEFS_SB(sb)->block_size * run.len;
 }
 
+#include "endian.h"
+
 #endif				/* _LINUX_BEFS_H */
diff --git a/fs/befs/befs_fs_types.h b/fs/befs/befs_fs_types.h
index 63ef1e18fb84..e2595c2c403a 100644
--- a/fs/befs/befs_fs_types.h
+++ b/fs/befs/befs_fs_types.h
@@ -79,17 +79,27 @@ enum inode_flags {
  * On-Disk datastructures of BeFS
  */
 
+typedef u64 __bitwise fs64;
+typedef u32 __bitwise fs32;
+typedef u16 __bitwise fs16;
+
 typedef u64 befs_off_t;
-typedef u64 befs_time_t;
-typedef void befs_binode_etc;
+typedef fs64 befs_time_t;
 
 /* Block runs */
 typedef struct {
+	fs32 allocation_group;
+	fs16 start;
+	fs16 len;
+} PACKED befs_disk_block_run;
+
+typedef struct {
 	u32 allocation_group;
 	u16 start;
 	u16 len;
 } PACKED befs_block_run;
 
+typedef befs_disk_block_run befs_disk_inode_addr;
 typedef befs_block_run befs_inode_addr;
 
 /*
@@ -97,31 +107,31 @@ typedef befs_block_run befs_inode_addr;
  */
 typedef struct {
 	char name[B_OS_NAME_LENGTH];
-	u32 magic1;
-	u32 fs_byte_order;
+	fs32 magic1;
+	fs32 fs_byte_order;
 
-	u32 block_size;
-	u32 block_shift;
+	fs32 block_size;
+	fs32 block_shift;
 
-	befs_off_t num_blocks;
-	befs_off_t used_blocks;
+	fs64 num_blocks;
+	fs64 used_blocks;
 
-	u32 inode_size;
+	fs32 inode_size;
 
-	u32 magic2;
-	u32 blocks_per_ag;
-	u32 ag_shift;
-	u32 num_ags;
+	fs32 magic2;
+	fs32 blocks_per_ag;
+	fs32 ag_shift;
+	fs32 num_ags;
 
-	u32 flags;
+	fs32 flags;
 
-	befs_block_run log_blocks;
-	befs_off_t log_start;
-	befs_off_t log_end;
+	befs_disk_block_run log_blocks;
+	fs64 log_start;
+	fs64 log_end;
 
-	u32 magic3;
-	befs_inode_addr root_dir;
-	befs_inode_addr indices;
+	fs32 magic3;
+	befs_disk_inode_addr root_dir;
+	befs_disk_inode_addr indices;
 
 } PACKED befs_super_block;
 
@@ -130,6 +140,16 @@ typedef struct {
  * be longer than one block!
  */
 typedef struct {
+	befs_disk_block_run direct[BEFS_NUM_DIRECT_BLOCKS];
+	fs64 max_direct_range;
+	befs_disk_block_run indirect;
+	fs64 max_indirect_range;
+	befs_disk_block_run double_indirect;
+	fs64 max_double_indirect_range;
+	fs64 size;
+} PACKED befs_disk_data_stream;
+
+typedef struct {
 	befs_block_run direct[BEFS_NUM_DIRECT_BLOCKS];
 	befs_off_t max_direct_range;
 	befs_block_run indirect;
@@ -141,35 +161,35 @@ typedef struct {
 
 /* Attribute */
 typedef struct {
-	u32 type;
-	u16 name_size;
-	u16 data_size;
+	fs32 type;
+	fs16 name_size;
+	fs16 data_size;
 	char name[1];
 } PACKED befs_small_data;
 
 /* Inode structure */
 typedef struct {
-	u32 magic1;
-	befs_inode_addr inode_num;
-	u32 uid;
-	u32 gid;
-	u32 mode;
-	u32 flags;
+	fs32 magic1;
+	befs_disk_inode_addr inode_num;
+	fs32 uid;
+	fs32 gid;
+	fs32 mode;
+	fs32 flags;
 	befs_time_t create_time;
 	befs_time_t last_modified_time;
-	befs_inode_addr parent;
-	befs_inode_addr attributes;
-	u32 type;
+	befs_disk_inode_addr parent;
+	befs_disk_inode_addr attributes;
+	fs32 type;
 
-	u32 inode_size;
-	u32 etc;		/* not use */
+	fs32 inode_size;
+	fs32 etc;		/* not use */
 
 	union {
-		befs_data_stream datastream;
+		befs_disk_data_stream datastream;
 		char symlink[BEFS_SYMLINK_LEN];
 	} data;
 
-	u32 pad[4];		/* not use */
+	fs32 pad[4];		/* not use */
 	befs_small_data small_data[1];
 } PACKED befs_inode;
 
@@ -190,6 +210,16 @@ enum btree_types {
 };
 
 typedef struct {
+	fs32 magic;
+	fs32 node_size;
+	fs32 max_depth;
+	fs32 data_type;
+	fs64 root_node_ptr;
+	fs64 free_node_ptr;
+	fs64 max_size;
+} PACKED befs_disk_btree_super;
+
+typedef struct {
 	u32 magic;
 	u32 node_size;
 	u32 max_depth;
@@ -203,11 +233,19 @@ typedef struct {
  * Header stucture of each btree node
  */
 typedef struct {
+	fs64 left;
+	fs64 right;
+	fs64 overflow;
+	fs16 all_key_count;
+	fs16 all_key_length;
+} PACKED befs_btree_nodehead;
+
+typedef struct {
 	befs_off_t left;
 	befs_off_t right;
 	befs_off_t overflow;
 	u16 all_key_count;
 	u16 all_key_length;
-} PACKED befs_btree_nodehead;
+} PACKED befs_host_btree_nodehead;
 
 #endif				/* _LINUX_BEFS_FS_TYPES */
diff --git a/fs/befs/btree.c b/fs/befs/btree.c
index 76e219799409..81b042ee24e6 100644
--- a/fs/befs/btree.c
+++ b/fs/befs/btree.c
@@ -30,7 +30,6 @@
 #include "befs.h"
 #include "btree.h"
 #include "datastream.h"
-#include "endian.h"
 
 /*
  * The btree functions in this file are built on top of the
@@ -80,7 +79,7 @@
  * In memory structure of each btree node
  */
 typedef struct {
-	befs_btree_nodehead head;	/* head of node converted to cpu byteorder */
+	befs_host_btree_nodehead head;	/* head of node converted to cpu byteorder */
 	struct buffer_head *bh;
 	befs_btree_nodehead *od_node;	/* on disk node */
 } befs_btree_node;
@@ -102,9 +101,9 @@ static int befs_bt_read_node(struct super_block *sb, befs_data_stream * ds,
 
 static int befs_leafnode(befs_btree_node * node);
 
-static u16 *befs_bt_keylen_index(befs_btree_node * node);
+static fs16 *befs_bt_keylen_index(befs_btree_node * node);
 
-static befs_off_t *befs_bt_valarray(befs_btree_node * node);
+static fs64 *befs_bt_valarray(befs_btree_node * node);
 
 static char *befs_bt_keydata(befs_btree_node * node);
 
@@ -136,7 +135,7 @@ befs_bt_read_super(struct super_block *sb, befs_data_stream * ds,
 		   befs_btree_super * sup)
 {
 	struct buffer_head *bh = NULL;
-	befs_btree_super *od_sup = NULL;
+	befs_disk_btree_super *od_sup = NULL;
 
 	befs_debug(sb, "---> befs_btree_read_super()");
 
@@ -146,7 +145,7 @@ befs_bt_read_super(struct super_block *sb, befs_data_stream * ds,
 		befs_error(sb, "Couldn't read index header.");
 		goto error;
 	}
-	od_sup = (befs_btree_super *) bh->b_data;
+	od_sup = (befs_disk_btree_super *) bh->b_data;
 	befs_dump_index_entry(sb, od_sup);
 
 	sup->magic = fs32_to_cpu(sb, od_sup->magic);
@@ -342,7 +341,7 @@ befs_find_key(struct super_block *sb, befs_btree_node * node,
 	u16 keylen;
 	int findkey_len;
 	char *thiskey;
-	befs_off_t *valarray;
+	fs64 *valarray;
 
 	befs_debug(sb, "---> befs_find_key() %s", findkey);
 
@@ -422,7 +421,7 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds,
 	befs_btree_super bt_super;
 	befs_off_t node_off = 0;
 	int cur_key;
-	befs_off_t *valarray;
+	fs64 *valarray;
 	char *keystart;
 	u16 keylen;
 	int res;
@@ -572,7 +571,7 @@ befs_btree_seekleaf(struct super_block *sb, befs_data_stream * ds,
 			   this_node->head.overflow);
 		*node_off = this_node->head.overflow;
 	} else {
-		befs_off_t *valarray = befs_bt_valarray(this_node);
+		fs64 *valarray = befs_bt_valarray(this_node);
 		*node_off = fs64_to_cpu(sb, valarray[0]);
 	}
 	if (befs_bt_read_node(sb, ds, this_node, *node_off) != BEFS_OK) {
@@ -622,7 +621,7 @@ befs_leafnode(befs_btree_node * node)
  *
  * Except that rounding up to 8 works, and rounding up to 4 doesn't.
  */
-static u16 *
+static fs16 *
 befs_bt_keylen_index(befs_btree_node * node)
 {
 	const int keylen_align = 8;
@@ -633,7 +632,7 @@ befs_bt_keylen_index(befs_btree_node * node)
 	if (tmp)
 		off += keylen_align - tmp;
 
-	return (u16 *) ((void *) node->od_node + off);
+	return (fs16 *) ((void *) node->od_node + off);
 }
 
 /**
@@ -643,13 +642,13 @@ befs_bt_keylen_index(befs_btree_node * node)
  * Returns a pointer to the start of the value array
  * of the node pointed to by the node header
  */
-static befs_off_t *
+static fs64 *
 befs_bt_valarray(befs_btree_node * node)
 {
 	void *keylen_index_start = (void *) befs_bt_keylen_index(node);
-	size_t keylen_index_size = node->head.all_key_count * sizeof (u16);
+	size_t keylen_index_size = node->head.all_key_count * sizeof (fs16);
 
-	return (befs_off_t *) (keylen_index_start + keylen_index_size);
+	return (fs64 *) (keylen_index_start + keylen_index_size);
 }
 
 /**
@@ -681,7 +680,7 @@ befs_bt_get_key(struct super_block *sb, befs_btree_node * node,
 {
 	int prev_key_end;
 	char *keystart;
-	u16 *keylen_index;
+	fs16 *keylen_index;
 
 	if (index < 0 || index > node->head.all_key_count) {
 		*keylen = 0;
diff --git a/fs/befs/datastream.c b/fs/befs/datastream.c
index b7d6b920f65f..aacb4da6298a 100644
--- a/fs/befs/datastream.c
+++ b/fs/befs/datastream.c
@@ -18,7 +18,6 @@
 #include "befs.h"
 #include "datastream.h"
 #include "io.h"
-#include "endian.h"
 
 const befs_inode_addr BAD_IADDR = { 0, 0, 0 };
 
@@ -312,7 +311,7 @@ befs_find_brun_indirect(struct super_block *sb,
 	befs_blocknr_t indir_start_blk;
 	befs_blocknr_t search_blk;
 	struct buffer_head *indirblock;
-	befs_block_run *array;
+	befs_disk_block_run *array;
 
 	befs_block_run indirect = data->indirect;
 	befs_blocknr_t indirblockno = iaddr2blockno(sb, &indirect);
@@ -334,7 +333,7 @@ befs_find_brun_indirect(struct super_block *sb,
 		return BEFS_ERR;
 	}
 
-	array = (befs_block_run *) indirblock->b_data;
+	array = (befs_disk_block_run *) indirblock->b_data;
 
 	for (j = 0; j < arraylen; ++j) {
 		int len = fs16_to_cpu(sb, array[j].len);
@@ -427,7 +426,7 @@ befs_find_brun_dblindirect(struct super_block *sb,
 	struct buffer_head *dbl_indir_block;
 	struct buffer_head *indir_block;
 	befs_block_run indir_run;
-	befs_inode_addr *iaddr_array = NULL;
+	befs_disk_inode_addr *iaddr_array = NULL;
 	befs_sb_info *befs_sb = BEFS_SB(sb);
 
 	befs_blocknr_t indir_start_blk =
@@ -482,7 +481,7 @@ befs_find_brun_dblindirect(struct super_block *sb,
 
 	dbl_block_indx =
 	    dblindir_indx - (dbl_which_block * befs_iaddrs_per_block(sb));
-	iaddr_array = (befs_inode_addr *) dbl_indir_block->b_data;
+	iaddr_array = (befs_disk_inode_addr *) dbl_indir_block->b_data;
 	indir_run = fsrun_to_cpu(sb, iaddr_array[dbl_block_indx]);
 	brelse(dbl_indir_block);
 	iaddr_array = NULL;
@@ -507,7 +506,7 @@ befs_find_brun_dblindirect(struct super_block *sb,
 	}
 
 	block_indx = indir_indx - (which_block * befs_iaddrs_per_block(sb));
-	iaddr_array = (befs_inode_addr *) indir_block->b_data;
+	iaddr_array = (befs_disk_inode_addr *) indir_block->b_data;
 	*run = fsrun_to_cpu(sb, iaddr_array[block_indx]);
 	brelse(indir_block);
 	iaddr_array = NULL;
diff --git a/fs/befs/debug.c b/fs/befs/debug.c
index 875cc0aa318c..e831a8f30849 100644
--- a/fs/befs/debug.c
+++ b/fs/befs/debug.c
@@ -21,7 +21,6 @@
 #endif				/* __KERNEL__ */
 
 #include "befs.h"
-#include "endian.h"
 
 #define ERRBUFSIZE 1024
 
@@ -125,7 +124,7 @@ befs_dump_inode(const struct super_block *sb, befs_inode * inode)
 	befs_debug(sb, "  type %08x", fs32_to_cpu(sb, inode->type));
 	befs_debug(sb, "  inode_size %u", fs32_to_cpu(sb, inode->inode_size));
 
-	if (S_ISLNK(inode->mode)) {
+	if (S_ISLNK(fs32_to_cpu(sb, inode->mode))) {
 		befs_debug(sb, "  Symbolic link [%s]", inode->data.symlink);
 	} else {
 		int i;
@@ -231,21 +230,20 @@ befs_dump_small_data(const struct super_block *sb, befs_small_data * sd)
 
 /* unused */
 void
-befs_dump_run(const struct super_block *sb, befs_block_run run)
+befs_dump_run(const struct super_block *sb, befs_disk_block_run run)
 {
 #ifdef CONFIG_BEFS_DEBUG
 
-	run = fsrun_to_cpu(sb, run);
+	befs_block_run n = fsrun_to_cpu(sb, run);
 
-	befs_debug(sb, "[%u, %hu, %hu]",
-		   run.allocation_group, run.start, run.len);
+	befs_debug(sb, "[%u, %hu, %hu]", n.allocation_group, n.start, n.len);
 
 #endif				//CONFIG_BEFS_DEBUG
 }
 #endif				/* 0 */
 
 void
-befs_dump_index_entry(const struct super_block *sb, befs_btree_super * super)
+befs_dump_index_entry(const struct super_block *sb, befs_disk_btree_super * super)
 {
 #ifdef CONFIG_BEFS_DEBUG
 
diff --git a/fs/befs/endian.h b/fs/befs/endian.h
index 9ecaea4e3325..e254a20869f4 100644
--- a/fs/befs/endian.h
+++ b/fs/befs/endian.h
@@ -10,85 +10,84 @@
 #define LINUX_BEFS_ENDIAN
 
 #include <linux/byteorder/generic.h>
-#include "befs.h"
 
 static inline u64
-fs64_to_cpu(const struct super_block *sb, u64 n)
+fs64_to_cpu(const struct super_block *sb, fs64 n)
 {
 	if (BEFS_SB(sb)->byte_order == BEFS_BYTESEX_LE)
-		return le64_to_cpu(n);
+		return le64_to_cpu((__force __le64)n);
 	else
-		return be64_to_cpu(n);
+		return be64_to_cpu((__force __be64)n);
 }
 
-static inline u64
+static inline fs64
 cpu_to_fs64(const struct super_block *sb, u64 n)
 {
 	if (BEFS_SB(sb)->byte_order == BEFS_BYTESEX_LE)
-		return cpu_to_le64(n);
+		return (__force fs64)cpu_to_le64(n);
 	else
-		return cpu_to_be64(n);
+		return (__force fs64)cpu_to_be64(n);
 }
 
 static inline u32
-fs32_to_cpu(const struct super_block *sb, u32 n)
+fs32_to_cpu(const struct super_block *sb, fs32 n)
 {
 	if (BEFS_SB(sb)->byte_order == BEFS_BYTESEX_LE)
-		return le32_to_cpu(n);
+		return le32_to_cpu((__force __le32)n);
 	else
-		return be32_to_cpu(n);
+		return be32_to_cpu((__force __be32)n);
 }
 
-static inline u32
+static inline fs32
 cpu_to_fs32(const struct super_block *sb, u32 n)
 {
 	if (BEFS_SB(sb)->byte_order == BEFS_BYTESEX_LE)
-		return cpu_to_le32(n);
+		return (__force fs32)cpu_to_le32(n);
 	else
-		return cpu_to_be32(n);
+		return (__force fs32)cpu_to_be32(n);
 }
 
 static inline u16
-fs16_to_cpu(const struct super_block *sb, u16 n)
+fs16_to_cpu(const struct super_block *sb, fs16 n)
 {
 	if (BEFS_SB(sb)->byte_order == BEFS_BYTESEX_LE)
-		return le16_to_cpu(n);
+		return le16_to_cpu((__force __le16)n);
 	else
-		return be16_to_cpu(n);
+		return be16_to_cpu((__force __be16)n);
 }
 
-static inline u16
+static inline fs16
 cpu_to_fs16(const struct super_block *sb, u16 n)
 {
 	if (BEFS_SB(sb)->byte_order == BEFS_BYTESEX_LE)
-		return cpu_to_le16(n);
+		return (__force fs16)cpu_to_le16(n);
 	else
-		return cpu_to_be16(n);
+		return (__force fs16)cpu_to_be16(n);
 }
 
 /* Composite types below here */
 
 static inline befs_block_run
-fsrun_to_cpu(const struct super_block *sb, befs_block_run n)
+fsrun_to_cpu(const struct super_block *sb, befs_disk_block_run n)
 {
 	befs_block_run run;
 
 	if (BEFS_SB(sb)->byte_order == BEFS_BYTESEX_LE) {
-		run.allocation_group = le32_to_cpu(n.allocation_group);
-		run.start = le16_to_cpu(n.start);
-		run.len = le16_to_cpu(n.len);
+		run.allocation_group = le32_to_cpu((__force __le32)n.allocation_group);
+		run.start = le16_to_cpu((__force __le16)n.start);
+		run.len = le16_to_cpu((__force __le16)n.len);
 	} else {
-		run.allocation_group = be32_to_cpu(n.allocation_group);
-		run.start = be16_to_cpu(n.start);
-		run.len = be16_to_cpu(n.len);
+		run.allocation_group = be32_to_cpu((__force __be32)n.allocation_group);
+		run.start = be16_to_cpu((__force __be16)n.start);
+		run.len = be16_to_cpu((__force __be16)n.len);
 	}
 	return run;
 }
 
-static inline befs_block_run
+static inline befs_disk_block_run
 cpu_to_fsrun(const struct super_block *sb, befs_block_run n)
 {
-	befs_block_run run;
+	befs_disk_block_run run;
 
 	if (BEFS_SB(sb)->byte_order == BEFS_BYTESEX_LE) {
 		run.allocation_group = cpu_to_le32(n.allocation_group);
@@ -103,7 +102,7 @@ cpu_to_fsrun(const struct super_block *sb, befs_block_run n)
 }
 
 static inline befs_data_stream
-fsds_to_cpu(const struct super_block *sb, befs_data_stream n)
+fsds_to_cpu(const struct super_block *sb, befs_disk_data_stream n)
 {
 	befs_data_stream data;
 	int i;
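
The whole BeFS series above exists so that sparse can type-check byte order: on-disk words become opaque fs16/fs32/fs64 "bitwise" types, and only the helpers in endian.h may convert them. A self-contained userspace sketch of the idiom (the u32/fs32 typedefs here are local stand-ins, and a little-endian host is assumed in the conversion helper):

/* Under sparse, fs32 is a distinct bitwise type: using it directly in
 * arithmetic or comparisons, without the conversion helper, draws a
 * warning; plain compilers see an ordinary u32. */
#ifdef __CHECKER__
#define __bitwise __attribute__((bitwise))
#define __force   __attribute__((force))
#else
#define __bitwise
#define __force
#endif

typedef unsigned int u32;
typedef u32 __bitwise fs32;

static inline u32 fs32_to_cpu(fs32 n)
{
	return (__force u32)n;	/* little-endian host assumed in this sketch */
}

int main(void)
{
	fs32 on_disk = (__force fs32)42u;	/* value as read from disk */
	u32 host = fs32_to_cpu(on_disk);	/* fine: converted explicitly */

	return host == 42 ? 0 : 1;
	/* return (__force u32)on_disk + 1;  direct use is what sparse flags */
}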
diff --git a/fs/befs/inode.c b/fs/befs/inode.c
index d41c9247ae8a..94c17f9a9576 100644
--- a/fs/befs/inode.c
+++ b/fs/befs/inode.c
@@ -8,7 +8,6 @@
 
 #include "befs.h"
 #include "inode.h"
-#include "endian.h"
 
 /*
 	Validates the correctness of the befs inode
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 57020c7a7e65..07f7144f0e2e 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -22,7 +22,6 @@
 #include "datastream.h"
 #include "super.h"
 #include "io.h"
-#include "endian.h"
 
 MODULE_DESCRIPTION("BeOS File System (BeFS) driver");
 MODULE_AUTHOR("Will Dyson");
diff --git a/fs/befs/super.c b/fs/befs/super.c
index 4557acbac528..8c3401ff6d6a 100644
--- a/fs/befs/super.c
+++ b/fs/befs/super.c
@@ -11,7 +11,6 @@
 
 #include "befs.h"
 #include "super.h"
-#include "endian.h"
 
 /**
  * load_befs_sb -- Read from disk and properly byteswap all the fields
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 06435f3665f4..79b05a1a4365 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1152,7 +1152,7 @@ static int dump_write(struct file *file, const void *addr, int nr)
 static int dump_seek(struct file *file, loff_t off)
 {
 	if (file->f_op->llseek && file->f_op->llseek != no_llseek) {
-		if (file->f_op->llseek(file, off, 1) != off)
+		if (file->f_op->llseek(file, off, SEEK_CUR) < 0)
 			return 0;
 	} else {
 		char *buf = (char *)get_zeroed_page(GFP_KERNEL);
@@ -1220,7 +1220,7 @@ static int notesize(struct memelfnote *en)
 
 static int alignfile(struct file *file, loff_t *foffset)
 {
-	char buf[4] = { 0, };
+	static const char buf[4] = { 0, };
 	DUMP_WRITE(buf, roundup(*foffset, 4) - *foffset, foffset);
 	return 1;
 }
@@ -1569,7 +1569,8 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file)
 
 	DUMP_WRITE(elf, sizeof(*elf));
 	offset += sizeof(*elf);				/* Elf header */
-	offset += (segs+1) * sizeof(struct elf_phdr);	/* Program headers */
+	offset += (segs + 1) * sizeof(struct elf_phdr);	/* Program headers */
+	foffset = offset;
 
 	/* Write notes phdr entry */
 	{
@@ -1586,8 +1587,6 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file)
 		DUMP_WRITE(&phdr, sizeof(phdr));
 	}
 
-	foffset = offset;
-
 	dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE);
 
 	/* Write program headers for segments dump */
@@ -1612,7 +1611,6 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file)
 		phdr.p_align = ELF_EXEC_PAGESIZE;
 
 		DUMP_WRITE(&phdr, sizeof(phdr));
-		foffset += sizeof(phdr);
 	}
 
 #ifdef ELF_CORE_WRITE_EXTRA_PHDRS
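
The dump_seek() fix above works because llseek with SEEK_CUR returns the resulting absolute file position, not the distance moved, so the old "!= off" comparison misfired whenever the current offset was nonzero; checking for a negative return is the correct error test. A userspace demonstration of the same semantics with lseek(2):

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/tmp/seek-demo", O_RDWR | O_CREAT | O_TRUNC, 0600);
	off_t pos;

	if (fd < 0) {
		perror("open");
		return 1;
	}
	lseek(fd, 100, SEEK_SET);	/* position is now 100 */
	pos = lseek(fd, 50, SEEK_CUR);	/* returns 150 (new position), not 50 */
	printf("after a relative seek of 50: pos=%lld\n", (long long)pos);
	close(fd);
	return 0;
}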
diff --git a/fs/binfmt_som.c b/fs/binfmt_som.c
index 32b5d625ce9c..5bcdaaf4eae0 100644
--- a/fs/binfmt_som.c
+++ b/fs/binfmt_som.c
@@ -29,6 +29,7 @@
 #include <linux/personality.h>
 #include <linux/init.h>
 
+#include <asm/a.out.h>
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
 
@@ -194,6 +195,7 @@ load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 	unsigned long som_entry;
 	struct som_hdr *som_ex;
 	struct som_exec_auxhdr *hpuxhdr;
+	struct files_struct *files;
 
 	/* Get the exec-header */
 	som_ex = (struct som_hdr *) bprm->buf;
@@ -208,15 +210,27 @@ load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 	size = som_ex->aux_header_size;
 	if (size > SOM_PAGESIZE)
 		goto out;
-	hpuxhdr = (struct som_exec_auxhdr *) kmalloc(size, GFP_KERNEL);
+	hpuxhdr = kmalloc(size, GFP_KERNEL);
 	if (!hpuxhdr)
 		goto out;
 
 	retval = kernel_read(bprm->file, som_ex->aux_header_location,
 			(char *) hpuxhdr, size);
+	if (retval != size) {
+		if (retval >= 0)
+			retval = -EIO;
+		goto out_free;
+	}
+
+	files = current->files;	/* Refcounted so ok */
+	retval = unshare_files();
 	if (retval < 0)
 		goto out_free;
-#error "Fix security hole before enabling me"
+	if (files == current->files) {
+		put_files_struct(files);
+		files = NULL;
+	}
+
 	retval = get_unused_fd();
 	if (retval < 0)
 		goto out_free;
diff --git a/fs/bio.c b/fs/bio.c
index 8f93e939f213..f95c8749499f 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -79,7 +79,6 @@ static struct bio_set *fs_bio_set;
 static inline struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, struct bio_set *bs)
 {
 	struct bio_vec *bvl;
-	struct biovec_slab *bp;
 
 	/*
 	 * see comment near bvec_array define!
@@ -98,10 +97,12 @@ static inline struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned lon
98 * idx now points to the pool we want to allocate from 97 * idx now points to the pool we want to allocate from
99 */ 98 */
100 99
101 bp = bvec_slabs + *idx;
102 bvl = mempool_alloc(bs->bvec_pools[*idx], gfp_mask); 100 bvl = mempool_alloc(bs->bvec_pools[*idx], gfp_mask);
103 if (bvl) 101 if (bvl) {
102 struct biovec_slab *bp = bvec_slabs + *idx;
103
104 memset(bvl, 0, bp->nr_vecs * sizeof(struct bio_vec)); 104 memset(bvl, 0, bp->nr_vecs * sizeof(struct bio_vec));
105 }
105 106
106 return bvl; 107 return bvl;
107} 108}
@@ -166,7 +167,7 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
166 167
167 bio_init(bio); 168 bio_init(bio);
168 if (likely(nr_iovecs)) { 169 if (likely(nr_iovecs)) {
169 unsigned long idx; 170 unsigned long idx = 0; /* shut up gcc */
170 171
171 bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx, bs); 172 bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx, bs);
172 if (unlikely(!bvl)) { 173 if (unlikely(!bvl)) {
diff --git a/fs/buffer.c b/fs/buffer.c
index 16cfbcd254f1..35527dca1dbc 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -452,6 +452,7 @@ static void end_buffer_async_write(struct buffer_head *bh, int uptodate)
 			       bdevname(bh->b_bdev, b));
 		}
 		set_bit(AS_EIO, &page->mapping->flags);
+		set_buffer_write_io_error(bh);
 		clear_buffer_uptodate(bh);
 		SetPageError(page);
 	}
@@ -571,6 +572,10 @@ EXPORT_SYMBOL(mark_buffer_async_write);
 static inline void __remove_assoc_queue(struct buffer_head *bh)
 {
 	list_del_init(&bh->b_assoc_buffers);
+	WARN_ON(!bh->b_assoc_map);
+	if (buffer_write_io_error(bh))
+		set_bit(AS_EIO, &bh->b_assoc_map->flags);
+	bh->b_assoc_map = NULL;
 }
 
 int inode_has_buffers(struct inode *inode)
@@ -669,6 +674,7 @@ void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
 		spin_lock(&buffer_mapping->private_lock);
 		list_move_tail(&bh->b_assoc_buffers,
 				&mapping->private_list);
+		bh->b_assoc_map = mapping;
 		spin_unlock(&buffer_mapping->private_lock);
 	}
 }
@@ -701,7 +707,10 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode);
  */
 int __set_page_dirty_buffers(struct page *page)
 {
-	struct address_space * const mapping = page->mapping;
+	struct address_space * const mapping = page_mapping(page);
+
+	if (unlikely(!mapping))
+		return !TestSetPageDirty(page);
 
 	spin_lock(&mapping->private_lock);
 	if (page_has_buffers(page)) {
@@ -762,7 +771,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
 	spin_lock(lock);
 	while (!list_empty(list)) {
 		bh = BH_ENTRY(list->next);
-		list_del_init(&bh->b_assoc_buffers);
+		__remove_assoc_queue(bh);
 		if (buffer_dirty(bh) || buffer_locked(bh)) {
 			list_add(&bh->b_assoc_buffers, &tmp);
 			if (buffer_dirty(bh)) {
@@ -783,7 +792,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
 
 	while (!list_empty(&tmp)) {
 		bh = BH_ENTRY(tmp.prev);
-		__remove_assoc_queue(bh);
+		list_del_init(&bh->b_assoc_buffers);
 		get_bh(bh);
 		spin_unlock(lock);
 		wait_on_buffer(bh);
@@ -1039,8 +1048,21 @@ grow_buffers(struct block_device *bdev, sector_t block, int size)
 	} while ((size << sizebits) < PAGE_SIZE);
 
 	index = block >> sizebits;
-	block = index << sizebits;
 
+	/*
+	 * Check for a block which wants to lie outside our maximum possible
+	 * pagecache index. (this comparison is done using sector_t types).
+	 */
+	if (unlikely(index != block >> sizebits)) {
+		char b[BDEVNAME_SIZE];
+
+		printk(KERN_ERR "%s: requested out-of-range block %llu for "
+			"device %s\n",
+			__FUNCTION__, (unsigned long long)block,
+			bdevname(bdev, b));
+		return -EIO;
+	}
+	block = index << sizebits;
 	/* Create a page with the proper size buffers.. */
 	page = grow_dev_page(bdev, block, index, size);
 	if (!page)
@@ -1067,12 +1089,16 @@ __getblk_slow(struct block_device *bdev, sector_t block, int size)
 
 	for (;;) {
 		struct buffer_head * bh;
+		int ret;
 
 		bh = __find_get_block(bdev, block, size);
 		if (bh)
 			return bh;
 
-		if (!grow_buffers(bdev, block, size))
+		ret = grow_buffers(bdev, block, size);
+		if (ret < 0)
+			return NULL;
+		if (ret == 0)
 			free_more_memory();
 	}
 }
@@ -1147,6 +1173,7 @@ void __bforget(struct buffer_head *bh)
 
 		spin_lock(&buffer_mapping->private_lock);
 		list_del_init(&bh->b_assoc_buffers);
+		bh->b_assoc_map = NULL;
 		spin_unlock(&buffer_mapping->private_lock);
 	}
 	__brelse(bh);
@@ -1834,6 +1861,7 @@ static int __block_prepare_write(struct inode *inode, struct page *page,
 			clear_buffer_new(bh);
 			kaddr = kmap_atomic(page, KM_USER0);
 			memset(kaddr+block_start, 0, bh->b_size);
+			flush_dcache_page(page);
 			kunmap_atomic(kaddr, KM_USER0);
 			set_buffer_uptodate(bh);
 			mark_buffer_dirty(bh);
@@ -2340,6 +2368,7 @@ failed:
 	 */
 	kaddr = kmap_atomic(page, KM_USER0);
 	memset(kaddr, 0, PAGE_CACHE_SIZE);
+	flush_dcache_page(page);
 	kunmap_atomic(kaddr, KM_USER0);
 	SetPageUptodate(page);
 	set_page_dirty(page);
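The grow_buffers() hunk above rejects blocks whose page index cannot be represented: where sector_t is wider than the page index, block >> sizebits may truncate, and without the check the request would silently map a different page. A standalone sketch of the same comparison (the 32-bit pgoff_t and 64-bit sector_t widths are assumed here to mimic a 32-bit machine with large block device support):

#include <stdint.h>
#include <stdio.h>

typedef uint64_t sector_t;	/* assumed widths, for illustration only */
typedef uint32_t pgoff_t;

static int block_in_range(sector_t block, int sizebits)
{
	pgoff_t index = block >> sizebits;	/* may truncate high bits */

	/* same idea as grow_buffers(): redo the shift in the wide type */
	return (sector_t)index == (block >> sizebits);
}

int main(void)
{
	/* 512-byte blocks in 4K pages => sizebits = 3 */
	printf("%d\n", block_in_range(0x00000000fffffff8ULL, 3)); /* 1: fits */
	printf("%d\n", block_in_range(0x0000001000000000ULL, 3)); /* 0: overflows */
	return 0;
}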
diff --git a/fs/cifs/cifsacl.h b/fs/cifs/cifsacl.h
index d0776ac2b804..5eff35d6e564 100644
--- a/fs/cifs/cifsacl.h
+++ b/fs/cifs/cifsacl.h
@@ -31,8 +31,8 @@ struct cifs_sid {
 } __attribute__((packed));
 
 /* everyone */
-extern const struct cifs_sid sid_everyone;
+/* extern const struct cifs_sid sid_everyone;*/
 /* group users */
-extern const struct cifs_sid sid_user;
+/* extern const struct cifs_sid sid_user;*/
 
 #endif /* _CIFSACL_H */
diff --git a/fs/cifs/cifsencrypt.h b/fs/cifs/cifsencrypt.h
index 03e359b32861..152fa2dcfc6c 100644
--- a/fs/cifs/cifsencrypt.h
+++ b/fs/cifs/cifsencrypt.h
@@ -27,8 +27,6 @@ extern void mdfour(unsigned char *out, unsigned char *in, int n);
 /* smbdes.c */
 extern void E_P16(unsigned char *p14, unsigned char *p16);
 extern void E_P24(unsigned char *p21, unsigned char *c8, unsigned char *p24);
-extern void D_P16(unsigned char *p14, unsigned char *in, unsigned char *out);
-extern void E_old_pw_hash(unsigned char *, unsigned char *, unsigned char *);
 
 
 
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index c00c654f2e11..84976cdbe713 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -63,6 +63,7 @@ extern struct task_struct * oplockThread; /* remove sparse warning */
 struct task_struct * oplockThread = NULL;
 extern struct task_struct * dnotifyThread; /* remove sparse warning */
 struct task_struct * dnotifyThread = NULL;
+static struct super_operations cifs_super_ops;
 unsigned int CIFSMaxBufSize = CIFS_MAX_MSGSIZE;
 module_param(CIFSMaxBufSize, int, 0);
 MODULE_PARM_DESC(CIFSMaxBufSize,"Network buffer size (not including header). Default: 16384 Range: 8192 to 130048");
@@ -198,10 +199,12 @@ cifs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	/* Only need to call the old QFSInfo if failed
 	   on newer one */
 	if(rc)
-		rc = CIFSSMBQFSInfo(xid, pTcon, buf);
+	if(pTcon->ses->capabilities & CAP_NT_SMBS)
+		rc = CIFSSMBQFSInfo(xid, pTcon, buf); /* not supported by OS2 */
 
-	/* Old Windows servers do not support level 103, retry with level
-	   one if old server failed the previous call */
+	/* Some old Windows servers also do not support level 103, retry with
+	   older level one if old server failed the previous call or we
+	   bypassed it because we detected that this was an older LANMAN sess */
 	if(rc)
 		rc = SMBOldQFSInfo(xid, pTcon, buf);
 	/*
@@ -435,13 +438,21 @@ static void cifs_umount_begin(struct vfsmount * vfsmnt, int flags)
 	return;
 }
 
+#ifdef CONFIG_CIFS_STATS2
+static int cifs_show_stats(struct seq_file *s, struct vfsmount *mnt)
+{
+	/* BB FIXME */
+	return 0;
+}
+#endif
+
 static int cifs_remount(struct super_block *sb, int *flags, char *data)
 {
 	*flags |= MS_NODIRATIME;
 	return 0;
 }
 
-struct super_operations cifs_super_ops = {
+static struct super_operations cifs_super_ops = {
 	.read_inode = cifs_read_inode,
 	.put_super = cifs_put_super,
 	.statfs = cifs_statfs,
@@ -454,6 +465,9 @@ struct super_operations cifs_super_ops = {
 	.show_options = cifs_show_options,
 	.umount_begin = cifs_umount_begin,
 	.remount_fs = cifs_remount,
+#ifdef CONFIG_CIFS_STATS2
+	.show_stats = cifs_show_stats,
+#endif
 };
 
 static int
@@ -495,7 +509,7 @@ static ssize_t cifs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 static loff_t cifs_llseek(struct file *file, loff_t offset, int origin)
 {
 	/* origin == SEEK_END => we must revalidate the cached file length */
-	if (origin == 2) {
+	if (origin == SEEK_END) {
 		int retval = cifs_revalidate(file->f_dentry);
 		if (retval < 0)
 			return (loff_t)retval;
@@ -903,7 +917,7 @@ init_cifs(void)
 #ifdef CONFIG_PROC_FS
 	cifs_proc_init();
 #endif
-	INIT_LIST_HEAD(&GlobalServerList); /* BB not implemented yet */
+/*	INIT_LIST_HEAD(&GlobalServerList);*/ /* BB not implemented yet */
 	INIT_LIST_HEAD(&GlobalSMBSessionList);
 	INIT_LIST_HEAD(&GlobalTreeConnectionList);
 	INIT_LIST_HEAD(&GlobalOplock_Q);
@@ -931,6 +945,7 @@ init_cifs(void)
 	GlobalCurrentXid = 0;
 	GlobalTotalActiveXid = 0;
 	GlobalMaxActiveXid = 0;
+	memset(Local_System_Name, 0, 15);
 	rwlock_init(&GlobalSMBSeslock);
 	spin_lock_init(&GlobalMid_Lock);
 
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index bea875d9a46a..a243f779b363 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -36,7 +36,7 @@ extern const struct address_space_operations cifs_addr_ops;
 extern const struct address_space_operations cifs_addr_ops_smallbuf;
 
 /* Functions related to super block operations */
-extern struct super_operations cifs_super_ops;
+/* extern struct super_operations cifs_super_ops;*/
 extern void cifs_read_inode(struct inode *);
 extern void cifs_delete_inode(struct inode *);
 /* extern void cifs_write_inode(struct inode *); *//* BB not needed yet */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index b24006c47df1..74d3ccbb103b 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -153,7 +153,7 @@ struct TCP_Server_Info {
 	char sessid[4];		/* unique token id for this session */
 				/* (returned on Negotiate */
 	int capabilities; /* allow selective disabling of caps by smb sess */
-	__u16 timeZone;
+	int timeAdj;  /* Adjust for difference in server time zone in sec */
 	__u16 CurrentMid;         /* multiplex id - rotating counter */
 	char cryptKey[CIFS_CRYPTO_KEY_SIZE];
 	/* 16th byte of RFC1001 workstation name is always null */
@@ -203,9 +203,14 @@ struct cifsSesInfo {
 	char * domainName;
 	char * password;
 };
-/* session flags */
+/* no more than one of the following three session flags may be set */
 #define CIFS_SES_NT4 1
-
+#define CIFS_SES_OS2 2
+#define CIFS_SES_W9X 4
+/* following flag is set for old servers such as OS2 (and Win95?)
+   which do not negotiate NTLM or POSIX dialects, but instead
+   negotiate one of the older LANMAN dialects */
+#define CIFS_SES_LANMAN 8
 /*
  * there is one of these for each connection to a resource on a particular
  * session
@@ -512,7 +517,8 @@ require use of the stronger protocol */
 * This list helps improve performance and eliminate the messages indicating
 * that we had a communications error talking to the server in this list.
 */
-GLOBAL_EXTERN struct servers_not_supported *NotSuppList;	/*@z4a */
+/* Feature not supported */
+/* GLOBAL_EXTERN struct servers_not_supported *NotSuppList; */
 
 /*
  * The following is a hash table of all the users we know about.
@@ -568,7 +574,6 @@ GLOBAL_EXTERN unsigned int lookupCacheEnabled;
 GLOBAL_EXTERN unsigned int extended_security;	/* if on, session setup sent
 				with more secure ntlmssp2 challenge/resp */
 GLOBAL_EXTERN unsigned int sign_CIFS_PDUs;	/* enable smb packet signing */
-GLOBAL_EXTERN unsigned int secFlags;
 GLOBAL_EXTERN unsigned int linuxExtEnabled;/*enable Linux/Unix CIFS extensions*/
 GLOBAL_EXTERN unsigned int CIFSMaxBufSize;  /* max size not including hdr */
 GLOBAL_EXTERN unsigned int cifs_min_rcv;    /* min size of big ntwrk buf pool */
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index 81df2bf8e75a..6df9dadba647 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -26,7 +26,8 @@
 
 #ifdef CONFIG_CIFS_WEAK_PW_HASH
 #define LANMAN_PROT 0
-#define CIFS_PROT 1
+#define LANMAN2_PROT 1
+#define CIFS_PROT 2
 #else
 #define CIFS_PROT 0
 #endif
@@ -408,6 +409,8 @@ typedef struct negotiate_req {
 
 /* Dialect index is 13 for LANMAN */
 
+#define MIN_TZ_ADJ (15 * 60) /* minimum grid for timezones in seconds */
+
 typedef struct lanman_neg_rsp {
 	struct smb_hdr hdr;	/* wct = 13 */
 	__le16 DialectIndex;
@@ -417,7 +420,10 @@ typedef struct lanman_neg_rsp {
 	__le16 MaxNumberVcs;
 	__le16 RawMode;
 	__le32 SessionKey;
-	__le32 ServerTime;
+	struct {
+		__le16 Time;
+		__le16 Date;
+	} __attribute__((packed)) SrvTime;
 	__le16 ServerTimeZone;
 	__le16 EncryptionKeyLength;
 	__le16 Reserved;
@@ -674,7 +680,7 @@ typedef union smb_com_tree_disconnect { /* as an altetnative can use flag on
 typedef struct smb_com_close_req {
 	struct smb_hdr hdr;	/* wct = 3 */
 	__u16 FileID;
-	__u32 LastWriteTime;	/* should be zero */
+	__u32 LastWriteTime;	/* should be zero or -1 */
 	__u16 ByteCount;	/* 0 */
 } __attribute__((packed)) CLOSE_REQ;
 
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index b35c55c3c8bb..f1f8225102f0 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -50,12 +50,12 @@ extern int SendReceive(const unsigned int /* xid */ , struct cifsSesInfo *,
 extern int SendReceive2(const unsigned int /* xid */ , struct cifsSesInfo *,
 			struct kvec *, int /* nvec to send */,
 			int * /* type of buf returned */ , const int long_op);
-extern int SendReceiveBlockingLock(const unsigned int /* xid */ , struct cifsTconInfo *,
+extern int SendReceiveBlockingLock(const unsigned int /* xid */ ,
+			struct cifsTconInfo *,
 			struct smb_hdr * /* input */ ,
 			struct smb_hdr * /* out */ ,
 			int * /* bytes returned */);
-extern int checkSMBhdr(struct smb_hdr *smb, __u16 mid);
-extern int checkSMB(struct smb_hdr *smb, __u16 mid, int length);
+extern int checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length);
 extern int is_valid_oplock_break(struct smb_hdr *smb, struct TCP_Server_Info *);
 extern int is_size_safe_to_change(struct cifsInodeInfo *);
 extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *);
@@ -80,6 +80,9 @@ extern struct oplock_q_entry * AllocOplockQEntry(struct inode *, u16,
 extern void DeleteOplockQEntry(struct oplock_q_entry *);
 extern struct timespec cifs_NTtimeToUnix(u64 /* utc nanoseconds since 1601 */ );
 extern u64 cifs_UnixTimeToNT(struct timespec);
+extern __le64 cnvrtDosCifsTm(__u16 date, __u16 time);
+extern struct timespec cnvrtDosUnixTm(__u16 date, __u16 time);
+
 extern int cifs_get_inode_info(struct inode **pinode,
 			const unsigned char *search_path,
 			FILE_ALL_INFO * pfile_info,
@@ -116,6 +119,7 @@ extern int CIFSFindClose(const int, struct cifsTconInfo *tcon,
 extern int CIFSSMBQPathInfo(const int xid, struct cifsTconInfo *tcon,
 			const unsigned char *searchName,
 			FILE_ALL_INFO * findData,
+			int legacy /* whether to use old info level */,
 			const struct nls_table *nls_codepage, int remap);
 extern int SMBQueryInformation(const int xid, struct cifsTconInfo *tcon,
 			const unsigned char *searchName,
@@ -279,8 +283,6 @@ extern void sesInfoFree(struct cifsSesInfo *);
 extern struct cifsTconInfo *tconInfoAlloc(void);
 extern void tconInfoFree(struct cifsTconInfo *);
 
-extern int cifs_reconnect(struct TCP_Server_Info *server);
-
 extern int cifs_sign_smb(struct smb_hdr *, struct TCP_Server_Info *,__u32 *);
 extern int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *,
 			  __u32 *);
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 075d8fb3d376..098790eb2aa1 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -46,6 +46,7 @@ static struct {
 } protocols[] = {
 #ifdef CONFIG_CIFS_WEAK_PW_HASH
 	{LANMAN_PROT, "\2LM1.2X002"},
+	{LANMAN2_PROT, "\2LANMAN2.1"},
 #endif /* weak password hashing for legacy clients */
 	{CIFS_PROT, "\2NT LM 0.12"},
 	{POSIX_PROT, "\2POSIX 2"},
@@ -58,6 +59,7 @@ static struct {
 } protocols[] = {
 #ifdef CONFIG_CIFS_WEAK_PW_HASH
 	{LANMAN_PROT, "\2LM1.2X002"},
+	{LANMAN2_PROT, "\2LANMAN2.1"},
 #endif /* weak password hashing for legacy clients */
 	{CIFS_PROT, "\2NT LM 0.12"},
 	{BAD_PROT, "\2"}
@@ -67,13 +69,13 @@ static struct {
 /* define the number of elements in the cifs dialect array */
 #ifdef CONFIG_CIFS_POSIX
 #ifdef CONFIG_CIFS_WEAK_PW_HASH
-#define CIFS_NUM_PROT 3
+#define CIFS_NUM_PROT 4
 #else
 #define CIFS_NUM_PROT 2
 #endif /* CIFS_WEAK_PW_HASH */
 #else /* not posix */
 #ifdef CONFIG_CIFS_WEAK_PW_HASH
-#define CIFS_NUM_PROT 2
+#define CIFS_NUM_PROT 3
 #else
 #define CIFS_NUM_PROT 1
 #endif /* CONFIG_CIFS_WEAK_PW_HASH */
@@ -397,6 +399,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 	struct TCP_Server_Info * server;
 	u16 count;
 	unsigned int secFlags;
+	u16 dialect;
 
 	if(ses->server)
 		server = ses->server;
@@ -436,9 +439,10 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 	if (rc != 0)
 		goto neg_err_exit;
 
-	cFYI(1,("Dialect: %d", pSMBr->DialectIndex));
+	dialect = le16_to_cpu(pSMBr->DialectIndex);
+	cFYI(1,("Dialect: %d", dialect));
 	/* Check wct = 1 error case */
-	if((pSMBr->hdr.WordCount < 13) || (pSMBr->DialectIndex == BAD_PROT)) {
+	if((pSMBr->hdr.WordCount < 13) || (dialect == BAD_PROT)) {
 		/* core returns wct = 1, but we do not ask for core - otherwise
 		small wct just comes when dialect index is -1 indicating we
 		could not negotiate a common dialect */
@@ -446,7 +450,9 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 		goto neg_err_exit;
 #ifdef CONFIG_CIFS_WEAK_PW_HASH
 	} else if((pSMBr->hdr.WordCount == 13)
-		&& (pSMBr->DialectIndex == LANMAN_PROT)) {
+		&& ((dialect == LANMAN_PROT)
+		|| (dialect == LANMAN2_PROT))) {
+		__s16 tmp;
 		struct lanman_neg_rsp * rsp = (struct lanman_neg_rsp *)pSMBr;
 
 		if((secFlags & CIFSSEC_MAY_LANMAN) ||
@@ -472,12 +478,44 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 			server->maxRw = 0;/* we do not need to use raw anyway */
 			server->capabilities = CAP_MPX_MODE;
 		}
-		server->timeZone = le16_to_cpu(rsp->ServerTimeZone);
+		tmp = (__s16)le16_to_cpu(rsp->ServerTimeZone);
+		if (tmp == -1) {
+			/* OS/2 often does not set timezone therefore
+			 * we must use server time to calc time zone.
+			 * Could deviate slightly from the right zone.
+			 * Smallest defined timezone difference is 15 minutes
+			 * (i.e. Nepal). Rounding up/down is done to match
+			 * this requirement.
+			 */
+			int val, seconds, remain, result;
+			struct timespec ts, utc;
+			utc = CURRENT_TIME;
+			ts = cnvrtDosUnixTm(le16_to_cpu(rsp->SrvTime.Date),
+						le16_to_cpu(rsp->SrvTime.Time));
+			cFYI(1,("SrvTime: %d sec since 1970 (utc: %d) diff: %d",
+				(int)ts.tv_sec, (int)utc.tv_sec,
+				(int)(utc.tv_sec - ts.tv_sec)));
+			val = (int)(utc.tv_sec - ts.tv_sec);
+			seconds = val < 0 ? -val : val;
+			result = (seconds / MIN_TZ_ADJ) * MIN_TZ_ADJ;
+			remain = seconds % MIN_TZ_ADJ;
+			if(remain >= (MIN_TZ_ADJ / 2))
+				result += MIN_TZ_ADJ;
+			if(val < 0)
+				result = - result;
+			server->timeAdj = result;
+		} else {
+			server->timeAdj = (int)tmp;
+			server->timeAdj *= 60; /* also in seconds */
+		}
+		cFYI(1,("server->timeAdj: %d seconds", server->timeAdj));
+
 
 		/* BB get server time for time conversions and add
 		code to use it and timezone since this is not UTC */
 
-		if (rsp->EncryptionKeyLength == cpu_to_le16(CIFS_CRYPTO_KEY_SIZE)) {
+		if (rsp->EncryptionKeyLength ==
+				cpu_to_le16(CIFS_CRYPTO_KEY_SIZE)) {
 			memcpy(server->cryptKey, rsp->EncryptionKey,
 				CIFS_CRYPTO_KEY_SIZE);
 		} else if (server->secMode & SECMODE_PW_ENCRYPT) {
@@ -531,7 +569,8 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 	cFYI(0, ("Max buf = %d", ses->server->maxBuf));
 	GETU32(ses->server->sessid) = le32_to_cpu(pSMBr->SessionKey);
 	server->capabilities = le32_to_cpu(pSMBr->Capabilities);
-	server->timeZone = le16_to_cpu(pSMBr->ServerTimeZone);
+	server->timeAdj = (int)(__s16)le16_to_cpu(pSMBr->ServerTimeZone);
+	server->timeAdj *= 60;
 	if (pSMBr->EncryptionKeyLength == CIFS_CRYPTO_KEY_SIZE) {
 		memcpy(server->cryptKey, pSMBr->u.EncryptionKey,
 		       CIFS_CRYPTO_KEY_SIZE);
@@ -1617,7 +1656,7 @@ CIFSSMBClose(const int xid, struct cifsTconInfo *tcon, int smb_file_id)
 	pSMBr = (CLOSE_RSP *)pSMB; /* BB removeme BB */
 
 	pSMB->FileID = (__u16) smb_file_id;
-	pSMB->LastWriteTime = 0;
+	pSMB->LastWriteTime = 0xFFFFFFFF;
 	pSMB->ByteCount = 0;
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
@@ -2773,9 +2812,11 @@ GetExtAttrOut:
 
 
 /* security id for everyone */
-const struct cifs_sid sid_everyone = {1, 1, {0, 0, 0, 0, 0, 0}, {0, 0, 0, 0}};
+const static struct cifs_sid sid_everyone =
+		{1, 1, {0, 0, 0, 0, 0, 0}, {0, 0, 0, 0}};
 /* group users */
-const struct cifs_sid sid_user = {1, 2 , {0, 0, 0, 0, 0, 5}, {32, 545, 0, 0}};
+const static struct cifs_sid sid_user =
+		{1, 2 , {0, 0, 0, 0, 0, 5}, {32, 545, 0, 0}};
 
 /* Convert CIFS ACL to POSIX form */
 static int parse_sec_desc(struct cifs_sid * psec_desc, int acl_len)
@@ -2856,7 +2897,6 @@ qsec_out:
 	return rc;
 }
 
-
 /* Legacy Query Path Information call for lookup to old servers such
    as Win9x/WinME */
 int SMBQueryInformation(const int xid, struct cifsTconInfo *tcon,
@@ -2898,7 +2938,16 @@ QInfRetry:
 	if (rc) {
 		cFYI(1, ("Send error in QueryInfo = %d", rc));
 	} else if (pFinfo) {            /* decode response */
+		struct timespec ts;
+		__u32 time = le32_to_cpu(pSMBr->last_write_time);
+		/* BB FIXME - add time zone adjustment BB */
 		memset(pFinfo, 0, sizeof(FILE_ALL_INFO));
+		ts.tv_nsec = 0;
+		ts.tv_sec = time;
+		/* decode time fields */
+		pFinfo->ChangeTime = cpu_to_le64(cifs_UnixTimeToNT(ts));
+		pFinfo->LastWriteTime = pFinfo->ChangeTime;
+		pFinfo->LastAccessTime = 0;
 		pFinfo->AllocationSize =
 			cpu_to_le64(le32_to_cpu(pSMBr->size));
 		pFinfo->EndOfFile = pFinfo->AllocationSize;
@@ -2922,6 +2971,7 @@ int
 CIFSSMBQPathInfo(const int xid, struct cifsTconInfo *tcon,
 		 const unsigned char *searchName,
 		 FILE_ALL_INFO * pFindData,
+		 int legacy /* old style infolevel */,
 		 const struct nls_table *nls_codepage, int remap)
 {
 /* level 263 SMB_QUERY_FILE_ALL_INFO */
@@ -2970,7 +3020,10 @@ QPathInfoRetry:
 	byte_count = params + 1 /* pad */ ;
 	pSMB->TotalParameterCount = cpu_to_le16(params);
 	pSMB->ParameterCount = pSMB->TotalParameterCount;
-	pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FILE_ALL_INFO);
+	if(legacy)
+		pSMB->InformationLevel = cpu_to_le16(SMB_INFO_STANDARD);
+	else
+		pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FILE_ALL_INFO);
 	pSMB->Reserved4 = 0;
 	pSMB->hdr.smb_buf_length += byte_count;
 	pSMB->ByteCount = cpu_to_le16(byte_count);
@@ -2982,13 +3035,24 @@ QPathInfoRetry:
 	} else {		/* decode response */
 		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 
-		if (rc || (pSMBr->ByteCount < 40))
+		if (rc) /* BB add auto retry on EOPNOTSUPP? */
+			rc = -EIO;
+		else if (!legacy && (pSMBr->ByteCount < 40))
 			rc = -EIO;	/* bad smb */
+		else if(legacy && (pSMBr->ByteCount < 24))
+			rc = -EIO; /* 24 or 26 expected but we do not read last field */
 		else if (pFindData){
+			int size;
 			__u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
+			if(legacy) /* we do not read the last field, EAsize, fortunately
+				   since it varies by subdialect and on Set vs. Get, is
+				   two bytes or 4 bytes depending but we don't care here */
+				size = sizeof(FILE_INFO_STANDARD);
+			else
+				size = sizeof(FILE_ALL_INFO);
 			memcpy((char *) pFindData,
 			       (char *) &pSMBr->hdr.Protocol +
-			       data_offset, sizeof (FILE_ALL_INFO));
+			       data_offset, size);
 		} else
 			rc = -ENOMEM;
 	}
@@ -3613,6 +3677,14 @@ getDFSRetry:
 		strncpy(pSMB->RequestFileName, searchName, name_len);
 	}
 
+	if(ses->server) {
+		if(ses->server->secMode &
+		   (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
+			pSMB->hdr.Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
+	}
+
+	pSMB->hdr.Uid = ses->Suid;
+
 	params = 2 /* level */ + name_len /*includes null */ ;
 	pSMB->TotalDataCount = 0;
 	pSMB->DataCount = 0;
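When an OS/2-era server reports ServerTimeZone as -1, the CIFSSMBNegotiate() hunk above derives the zone from the gap between client and server clocks, snapped to the 15-minute grid (MIN_TZ_ADJ) that real timezones use. A userspace sketch of just that rounding step (round_tz_adj is a made-up name for illustration; the kernel code does this inline):

#include <stdio.h>
#include <stdlib.h>

#define MIN_TZ_ADJ (15 * 60)	/* smallest timezone step, e.g. Nepal */

static int round_tz_adj(int val)	/* val = utc minus server time, in seconds */
{
	int seconds = abs(val);
	int result = (seconds / MIN_TZ_ADJ) * MIN_TZ_ADJ;

	if (seconds % MIN_TZ_ADJ >= MIN_TZ_ADJ / 2)
		result += MIN_TZ_ADJ;	/* round to the nearer grid point */
	return (val < 0) ? -result : result;
}

int main(void)
{
	/* a clock skew of -4h59m rounds to a clean -5h adjustment */
	printf("%d\n", round_tz_adj(-17940));	/* prints -18000 */
	return 0;
}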
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index c78762051da4..4093d5332930 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -109,7 +109,7 @@ static int ipv6_connect(struct sockaddr_in6 *psin_server,
  * wake up waiters on reconnection? - (not needed currently)
  */
 
-int
+static int
 cifs_reconnect(struct TCP_Server_Info *server)
 {
 	int rc = 0;
@@ -771,13 +771,18 @@ cifs_parse_mount_options(char *options, const char *devname,struct smb_vol *vol)
 	separator[0] = ',';
 	separator[1] = 0;
 
-	memset(vol->source_rfc1001_name,0x20,15);
-	for(i=0;i < strnlen(utsname()->nodename,15);i++) {
-		/* does not have to be a perfect mapping since the field is
-		informational, only used for servers that do not support
-		port 445 and it can be overridden at mount time */
-		vol->source_rfc1001_name[i] =
-			toupper(utsname()->nodename[i]);
+	if (Local_System_Name[0] != 0)
+		memcpy(vol->source_rfc1001_name, Local_System_Name,15);
+	else {
+		char *nodename = utsname()->nodename;
+		int n = strnlen(nodename,15);
+		memset(vol->source_rfc1001_name,0x20,15);
+		for(i=0 ; i < n ; i++) {
+			/* does not have to be perfect mapping since field is
+			informational, only used for servers that do not support
+			port 445 and it can be overridden at mount time */
+			vol->source_rfc1001_name[i] = toupper(nodename[i]);
+		}
 	}
 	vol->source_rfc1001_name[15] = 0;
 	/* null target name indicates to use *SMBSERVR default called name
@@ -3215,7 +3220,9 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
 		}
 		/* else do not bother copying these informational fields */
 	}
-	if(smb_buffer_response->WordCount == 3)
+	if((smb_buffer_response->WordCount == 3) ||
+	   (smb_buffer_response->WordCount == 7))
+		/* field is in same location */
 		tcon->Flags = le16_to_cpu(pSMBr->OptionalSupport);
 	else
 		tcon->Flags = 0;
@@ -3312,19 +3319,21 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo,
 			first_time = 1;
 	}
 	if (!rc) {
+		pSesInfo->flags = 0;
 		pSesInfo->capabilities = pSesInfo->server->capabilities;
 		if(linuxExtEnabled == 0)
 			pSesInfo->capabilities &= (~CAP_UNIX);
 	/*	pSesInfo->sequence_number = 0;*/
-		cFYI(1,("Security Mode: 0x%x Capabilities: 0x%x Time Zone: %d",
+		cFYI(1,("Security Mode: 0x%x Capabilities: 0x%x TimeAdjust: %d",
 			pSesInfo->server->secMode,
 			pSesInfo->server->capabilities,
-			pSesInfo->server->timeZone));
+			pSesInfo->server->timeAdj));
 		if(experimEnabled < 2)
 			rc = CIFS_SessSetup(xid, pSesInfo,
 					    first_time, nls_info);
 		else if (extended_security
-				&& (pSesInfo->capabilities & CAP_EXTENDED_SECURITY)
+				&& (pSesInfo->capabilities
+					& CAP_EXTENDED_SECURITY)
 				&& (pSesInfo->server->secType == NTLMSSP)) {
 			rc = -EOPNOTSUPP;
 		} else if (extended_security
@@ -3338,7 +3347,7 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo,
 		if (!rc) {
 			if(ntlmv2_flag) {
 				char * v2_response;
-				cFYI(1,("Can use more secure NTLM version 2 password hash"));
+				cFYI(1,("more secure NTLM ver2 hash"));
 				if(CalcNTLMv2_partial_mac_key(pSesInfo,
 					nls_info)) {
 					rc = -ENOMEM;
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 6b90ef98e4cf..35d54bb0869a 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -337,6 +337,7 @@ int cifs_get_inode_info(struct inode **pinode,
 		pfindData = (FILE_ALL_INFO *)buf;
 		/* could do find first instead but this returns more info */
 		rc = CIFSSMBQPathInfo(xid, pTcon, search_path, pfindData,
+			      0 /* not legacy */,
 			      cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
 				CIFS_MOUNT_MAP_SPECIAL_CHR);
 		/* BB optimize code so we do not make the above call
@@ -384,8 +385,10 @@ int cifs_get_inode_info(struct inode **pinode,
 		/* get new inode */
 		if (*pinode == NULL) {
 			*pinode = new_inode(sb);
-			if (*pinode == NULL)
+			if (*pinode == NULL) {
+				kfree(buf);
 				return -ENOMEM;
+			}
 			/* Is an i_ino of zero legal? Can we use that to check
 			   if the server supports returning inode numbers?  Are
 			   there other sanity checks we can use to ensure that
@@ -431,8 +434,11 @@ int cifs_get_inode_info(struct inode **pinode,
 		   (pTcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE) & 0xFFFFFE00;*/
 
 		/* Linux can not store file creation time so ignore it */
-		inode->i_atime =
-			cifs_NTtimeToUnix(le64_to_cpu(pfindData->LastAccessTime));
+		if(pfindData->LastAccessTime)
+			inode->i_atime = cifs_NTtimeToUnix
+				(le64_to_cpu(pfindData->LastAccessTime));
+		else /* do not need to use current_fs_time - time not stored */
+			inode->i_atime = CURRENT_TIME;
 		inode->i_mtime =
 		    cifs_NTtimeToUnix(le64_to_cpu(pfindData->LastWriteTime));
 		inode->i_ctime =
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index a57f5d6e6213..0bee8b7e521a 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -254,7 +254,11 @@ cifs_readlink(struct dentry *direntry, char __user *pBuffer, int buflen)
 						tmpbuffer,
 						len - 1,
 						cifs_sb->local_nls);
-	else {
+	else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) {
+		cERROR(1,("SFU style symlinks not implemented yet"));
+		/* add open and read as in fs/cifs/inode.c */
+
+	} else {
 		rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_OPEN, GENERIC_READ,
 				OPEN_REPARSE_POINT,&fid, &oplock, NULL,
 				cifs_sb->local_nls,
diff --git a/fs/cifs/md5.c b/fs/cifs/md5.c
index 7aa23490541f..ccebf9b7eb86 100644
--- a/fs/cifs/md5.c
+++ b/fs/cifs/md5.c
@@ -252,10 +252,11 @@ MD5Transform(__u32 buf[4], __u32 const in[16])
 	buf[3] += d;
 }
 
+#if 0   /* currently unused */
 /***********************************************************************
  the rfc 2104 version of hmac_md5 initialisation.
 ***********************************************************************/
-void
+static void
 hmac_md5_init_rfc2104(unsigned char *key, int key_len,
 		      struct HMACMD5Context *ctx)
 {
@@ -289,6 +290,7 @@ hmac_md5_init_rfc2104(unsigned char *key, int key_len,
 	MD5Init(&ctx->ctx);
 	MD5Update(&ctx->ctx, ctx->k_ipad, 64);
 }
+#endif
 
 /***********************************************************************
  the microsoft version of hmac_md5 initialisation.
@@ -350,7 +352,8 @@ hmac_md5_final(unsigned char *digest, struct HMACMD5Context *ctx)
  single function to calculate an HMAC MD5 digest from data.
  use the microsoft hmacmd5 init method because the key is 16 bytes.
 ************************************************************/
-void
+#if 0 /* currently unused */
+static void
 hmac_md5(unsigned char key[16], unsigned char *data, int data_len,
 	 unsigned char *digest)
 {
@@ -361,3 +364,4 @@ hmac_md5(unsigned char key[16], unsigned char *data, int data_len,
 	}
 	hmac_md5_final(digest, &ctx);
 }
+#endif
diff --git a/fs/cifs/md5.h b/fs/cifs/md5.h
index 00e1c5394fe1..f7d4f4197bac 100644
--- a/fs/cifs/md5.h
+++ b/fs/cifs/md5.h
@@ -27,12 +27,12 @@ void MD5Final(unsigned char digest[16], struct MD5Context *context);
 
 /* The following definitions come from lib/hmacmd5.c  */
 
-void hmac_md5_init_rfc2104(unsigned char *key, int key_len,
-			struct HMACMD5Context *ctx);
+/* void hmac_md5_init_rfc2104(unsigned char *key, int key_len,
+			struct HMACMD5Context *ctx);*/
 void hmac_md5_init_limK_to_64(const unsigned char *key, int key_len,
 			struct HMACMD5Context *ctx);
 void hmac_md5_update(const unsigned char *text, int text_len,
 			struct HMACMD5Context *ctx);
 void hmac_md5_final(unsigned char *digest, struct HMACMD5Context *ctx);
-void hmac_md5(unsigned char key[16], unsigned char *data, int data_len,
-		unsigned char *digest);
+/* void hmac_md5(unsigned char key[16], unsigned char *data, int data_len,
+		unsigned char *digest);*/
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 22c937e5884f..bbc9cd34b6ea 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -389,7 +389,7 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ ,
 	return;
 }
 
-int
+static int
 checkSMBhdr(struct smb_hdr *smb, __u16 mid)
 {
 	/* Make sure that this really is an SMB, that it is a response,
@@ -418,26 +418,42 @@ checkSMBhdr(struct smb_hdr *smb, __u16 mid)
 }
 
 int
-checkSMB(struct smb_hdr *smb, __u16 mid, int length)
+checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length)
 {
 	__u32 len = smb->smb_buf_length;
 	__u32 clc_len;  /* calculated length */
 	cFYI(0, ("checkSMB Length: 0x%x, smb_buf_length: 0x%x", length, len));
-	if (((unsigned int)length < 2 + sizeof (struct smb_hdr)) ||
-	    (len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4)) {
-		if ((unsigned int)length < 2 + sizeof (struct smb_hdr)) {
-			if (((unsigned int)length >=
-				sizeof (struct smb_hdr) - 1)
+
+	if (length < 2 + sizeof (struct smb_hdr)) {
+		if ((length >= sizeof (struct smb_hdr) - 1)
 			    && (smb->Status.CifsError != 0)) {
 			smb->WordCount = 0;
 			/* some error cases do not return wct and bcc */
+			return 0;
+		} else if ((length == sizeof(struct smb_hdr) + 1) &&
+				(smb->WordCount == 0)) {
+			char * tmp = (char *)smb;
+			/* Need to work around a bug in two servers here */
+			/* First, check if the part of bcc they sent was zero */
+			if (tmp[sizeof(struct smb_hdr)] == 0) {
+				/* some servers return only half of bcc
+				 * on simple responses (wct, bcc both zero)
+				 * in particular have seen this on
+				 * ulogoffX and FindClose. This leaves
+				 * one byte of bcc potentially unitialized
+				 */
+				/* zero rest of bcc */
+				tmp[sizeof(struct smb_hdr)+1] = 0;
 				return 0;
-		} else {
-			cERROR(1, ("Length less than smb header size"));
 			}
+			cERROR(1,("rcvd invalid byte count (bcc)"));
+		} else {
+			cERROR(1, ("Length less than smb header size"));
 		}
-		if (len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4)
-			cERROR(1, ("smb length greater than MaxBufSize, mid=%d",
+		return 1;
+	}
+	if (len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) {
+		cERROR(1, ("smb length greater than MaxBufSize, mid=%d",
 				   smb->Mid));
 		return 1;
 	}
@@ -446,7 +462,7 @@ checkSMB(struct smb_hdr *smb, __u16 mid, int length)
 		return 1;
 	clc_len = smbCalcSize_LE(smb);
 
-	if(4 + len != (unsigned int)length) {
+	if(4 + len != length) {
 		cERROR(1, ("Length read does not match RFC1001 length %d",len));
 		return 1;
 	}
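The checkSMB() rewrite above also encodes a workaround: some servers answer simple requests (ulogoffX, FindClose) with wct and bcc both zero but send only the first byte of the 16-bit byte count, so the reply is exactly one byte longer than the header. If that first byte is zero, zeroing the missing second byte yields a valid empty bcc. A condensed sketch of that branch (HDR_SIZE is a placeholder for sizeof(struct smb_hdr), not the real wire layout; the caller is assumed to own a buffer with room for the extra byte):

#include <stdio.h>

#define HDR_SIZE 32	/* placeholder for sizeof(struct smb_hdr) */

/* returns 0 if the truncated reply could be fixed up, 1 otherwise */
static int fixup_half_bcc(unsigned char *buf, unsigned int length)
{
	if (length == HDR_SIZE + 1 && buf[HDR_SIZE] == 0) {
		buf[HDR_SIZE + 1] = 0;	/* complete the 16-bit byte count */
		return 0;
	}
	return 1;
}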
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index ce87550e918f..992e80edc720 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -909,3 +909,61 @@ cifs_UnixTimeToNT(struct timespec t)
 	/* Convert to 100ns intervals and then add the NTFS time offset. */
 	return (u64) t.tv_sec * 10000000 + t.tv_nsec/100 + NTFS_TIME_OFFSET;
 }
+
+static int total_days_of_prev_months[] =
+{0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334};
+
+
+__le64 cnvrtDosCifsTm(__u16 date, __u16 time)
+{
+	return cpu_to_le64(cifs_UnixTimeToNT(cnvrtDosUnixTm(date, time)));
+}
+
+struct timespec cnvrtDosUnixTm(__u16 date, __u16 time)
+{
+	struct timespec ts;
+	int sec, min, days, month, year;
+	SMB_TIME * st = (SMB_TIME *)&time;
+	SMB_DATE * sd = (SMB_DATE *)&date;
+
+	cFYI(1,("date %d time %d",date, time));
+
+	sec = 2 * st->TwoSeconds;
+	min = st->Minutes;
+	if((sec > 59) || (min > 59))
+		cERROR(1,("illegal time min %d sec %d", min, sec));
+	sec += (min * 60);
+	sec += 60 * 60 * st->Hours;
+	if(st->Hours > 24)
+		cERROR(1,("illegal hours %d",st->Hours));
+	days = sd->Day;
+	month = sd->Month;
+	if((days > 31) || (month > 12))
+		cERROR(1,("illegal date, month %d day: %d", month, days));
+	month -= 1;
+	days += total_days_of_prev_months[month];
+	days += 3652; /* account for difference in days between 1980 and 1970 */
+	year = sd->Year;
+	days += year * 365;
+	days += (year/4); /* leap year */
+	/* generalized leap year calculation is more complex, ie no leap year
+	for years/100 except for years/400, but since the maximum number for DOS
+	year is 2**7, the last year is 1980+127, which means we need only
+	consider 2 special case years, ie the years 2000 and 2100, and only
+	adjust for the lack of leap year for the year 2100, as 2000 was a
+	leap year (divisable by 400) */
+	if(year >= 120)  /* the year 2100 */
+		days = days - 1;  /* do not count leap year for the year 2100 */
+
+	/* adjust for leap year where we are still before leap day */
+	if(year != 120)
+		days -= ((year & 0x03) == 0) && (month < 2 ? 1 : 0);
+	sec += 24 * 60 * 60 * days;
+
+	ts.tv_sec = sec;
+
+	/* cFYI(1,("sec after cnvrt dos to unix time %d",sec)); */
+
+	ts.tv_nsec = 0;
+	return ts;
+}
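cnvrtDosUnixTm() above unpacks the legacy DOS stamp through the SMB_TIME/SMB_DATE bitfield structs: the time word carries seconds/2, minutes and hours, and the date word carries day, month and years since 1980. A standalone decoder using explicit shifts instead of those structs (the standard FAT field layout is assumed):

#include <stdint.h>
#include <stdio.h>

static void decode_dos_stamp(uint16_t date, uint16_t time)
{
	int sec   = (time & 0x1f) * 2;		/* 5 bits, 2 s resolution */
	int min   = (time >> 5) & 0x3f;		/* 6 bits */
	int hour  = (time >> 11) & 0x1f;	/* 5 bits */
	int day   = date & 0x1f;		/* 5 bits */
	int month = (date >> 5) & 0x0f;		/* 4 bits, 1-12 */
	int year  = ((date >> 9) & 0x7f) + 1980;	/* 7 bits from 1980 */

	printf("%04d-%02d-%02d %02d:%02d:%02d\n",
	       year, month, day, hour, min, sec);
}

int main(void)
{
	decode_dos_stamp(0x352a, 0x4c20);	/* prints 2006-09-10 09:33:00 */
	return 0;
}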
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index b27b34537bf2..b5b0a2a41bef 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -106,6 +106,17 @@ static int construct_dentry(struct qstr *qstring, struct file *file,
 	return rc;
 }
 
+static void AdjustForTZ(struct cifsTconInfo * tcon, struct inode * inode)
+{
+	if((tcon) && (tcon->ses) && (tcon->ses->server)) {
+		inode->i_ctime.tv_sec += tcon->ses->server->timeAdj;
+		inode->i_mtime.tv_sec += tcon->ses->server->timeAdj;
+		inode->i_atime.tv_sec += tcon->ses->server->timeAdj;
+	}
+	return;
+}
+
+
 static void fill_in_inode(struct inode *tmp_inode, int new_buf_type,
 		char * buf, int *pobject_type, int isNewInode)
 {
@@ -135,16 +146,23 @@ static void fill_in_inode(struct inode *tmp_inode, int new_buf_type,
 		tmp_inode->i_ctime =
 		      cifs_NTtimeToUnix(le64_to_cpu(pfindData->ChangeTime));
 	} else { /* legacy, OS2 and DOS style */
+/*		struct timespec ts;*/
 		FIND_FILE_STANDARD_INFO * pfindData =
 			(FIND_FILE_STANDARD_INFO *)buf;
 
+		tmp_inode->i_mtime = cnvrtDosUnixTm(
+				le16_to_cpu(pfindData->LastWriteDate),
+				le16_to_cpu(pfindData->LastWriteTime));
+		tmp_inode->i_atime = cnvrtDosUnixTm(
+				le16_to_cpu(pfindData->LastAccessDate),
+				le16_to_cpu(pfindData->LastAccessTime));
+		tmp_inode->i_ctime = cnvrtDosUnixTm(
+				le16_to_cpu(pfindData->LastWriteDate),
+				le16_to_cpu(pfindData->LastWriteTime));
+		AdjustForTZ(cifs_sb->tcon, tmp_inode);
 		attr = le16_to_cpu(pfindData->Attributes);
 		allocation_size = le32_to_cpu(pfindData->AllocationSize);
 		end_of_file = le32_to_cpu(pfindData->DataSize);
-		tmp_inode->i_atime = CURRENT_TIME;
-		/* tmp_inode->i_mtime =  BB FIXME - add dos time handling
-		tmp_inode->i_ctime = 0;   BB FIXME */
-
 	}
 
 	/* Linux can not store file creation time unfortunately so ignore it */
@@ -938,6 +956,7 @@ static int cifs_save_resume_key(const char *current_entry,
 		filename = &pFindData->FileName[0];
 		/* one byte length, no name conversion */
 		len = (unsigned int)pFindData->FileNameLength;
+		cifsFile->srch_inf.resume_key = pFindData->ResumeKey;
 	} else {
 		cFYI(1,("Unknown findfirst level %d",level));
 		return -EINVAL;
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 22b4c35dcfe3..a8a083543ba0 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -268,6 +268,10 @@ static int decode_ascii_ssetup(char ** pbcc_area, int bleft, struct cifsSesInfo
268 ses->serverOS = kzalloc(len + 1, GFP_KERNEL); 268 ses->serverOS = kzalloc(len + 1, GFP_KERNEL);
269 if(ses->serverOS) 269 if(ses->serverOS)
270 strncpy(ses->serverOS, bcc_ptr, len); 270 strncpy(ses->serverOS, bcc_ptr, len);
271 if(strncmp(ses->serverOS, "OS/2",4) == 0) {
272 cFYI(1,("OS/2 server"));
273 ses->flags |= CIFS_SES_OS2;
274 }
271 275
272 bcc_ptr += len + 1; 276 bcc_ptr += len + 1;
273 bleft -= len + 1; 277 bleft -= len + 1;
@@ -290,16 +294,11 @@ static int decode_ascii_ssetup(char ** pbcc_area, int bleft, struct cifsSesInfo
 	if(len > bleft)
 		return rc;
 
-	if(ses->serverDomain)
-		kfree(ses->serverDomain);
-
-	ses->serverDomain = kzalloc(len + 1, GFP_KERNEL);
-	if(ses->serverOS)
-		strncpy(ses->serverOS, bcc_ptr, len);
-
-	bcc_ptr += len + 1;
-	bleft -= len + 1;
-
+	/* No domain field in LANMAN case. Domain is
+	   returned by old servers in the SMB negprot response */
+	/* BB For newer servers which do not support Unicode,
+	   but thus do return domain here we could add parsing
+	   for it later, but it is not very important */
 	cFYI(1,("ascii: bytes left %d",bleft));
 
 	return rc;
@@ -366,6 +365,8 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
 	str_area = kmalloc(2000, GFP_KERNEL);
 	bcc_ptr = str_area;
 
+	ses->flags &= ~CIFS_SES_LANMAN;
+
 	if(type == LANMAN) {
 #ifdef CONFIG_CIFS_WEAK_PW_HASH
 		char lnm_session_key[CIFS_SESS_KEY_SIZE];
@@ -377,7 +378,7 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
 		/* and copy into bcc */
 
 		calc_lanman_hash(ses, lnm_session_key);
-
+		ses->flags |= CIFS_SES_LANMAN;
 /* #ifdef CONFIG_CIFS_DEBUG2
 		cifs_dump_mem("cryptkey: ",ses->server->cryptKey,
 			CIFS_SESS_KEY_SIZE);
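
The two sess.c hunks above follow a clear-then-set pattern for session feature flags: CIFS_SES_LANMAN is cleared at the top of every setup attempt and set again only on the path that actually computed a LANMAN hash, so a renegotiated session cannot inherit a stale flag. A standalone sketch of the idiom; the EX_SES_* values are illustrative stand-ins, not the real CIFS_SES_* definitions from the CIFS headers:

    #define EX_SES_LANMAN 0x01  /* illustrative flags only */
    #define EX_SES_OS2    0x02

    static void example_setup(unsigned int *flags, int lanman, int os2_server)
    {
            *flags &= ~EX_SES_LANMAN;   /* never trust a previous attempt */
            if (lanman)
                    *flags |= EX_SES_LANMAN;
            if (os2_server)
                    *flags |= EX_SES_OS2;  /* later code branches on this */
    }
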
diff --git a/fs/cifs/smbdes.c b/fs/cifs/smbdes.c
index efaa044523a7..7a1b2b961ec8 100644
--- a/fs/cifs/smbdes.c
+++ b/fs/cifs/smbdes.c
@@ -364,20 +364,20 @@ E_P24(unsigned char *p21, unsigned char *c8, unsigned char *p24)
 	smbhash(p24 + 16, c8, p21 + 14, 1);
 }
 
-void
+#if 0 /* currently unused */
+static void
 D_P16(unsigned char *p14, unsigned char *in, unsigned char *out)
 {
 	smbhash(out, in, p14, 0);
 	smbhash(out + 8, in + 8, p14 + 7, 0);
 }
 
-void
+static void
 E_old_pw_hash(unsigned char *p14, unsigned char *in, unsigned char *out)
 {
 	smbhash(out, in, p14, 1);
 	smbhash(out + 8, in + 8, p14 + 7, 1);
 }
-#if 0
 /* these routines are currently unneeded, but may be
    needed later */
 void
diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c
index f518c5e45035..4b25ba92180d 100644
--- a/fs/cifs/smbencrypt.c
+++ b/fs/cifs/smbencrypt.c
@@ -51,11 +51,8 @@
 
 void SMBencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24);
 void E_md4hash(const unsigned char *passwd, unsigned char *p16);
-void nt_lm_owf_gen(char *pwd, unsigned char nt_p16[16], unsigned char p16[16]);
 static void SMBOWFencrypt(unsigned char passwd[16], unsigned char *c8,
 			unsigned char p24[24]);
-void NTLMSSPOWFencrypt(unsigned char passwd[8],
-			unsigned char *ntlmchalresp, unsigned char p24[24]);
 void SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24);
 
 /*
@@ -144,8 +141,9 @@ E_md4hash(const unsigned char *passwd, unsigned char *p16)
 	memset(wpwd,0,129 * 2);
 }
 
+#if 0 /* currently unused */
 /* Does both the NT and LM owfs of a user's password */
-void
+static void
 nt_lm_owf_gen(char *pwd, unsigned char nt_p16[16], unsigned char p16[16])
 {
 	char passwd[514];
@@ -171,6 +169,7 @@ nt_lm_owf_gen(char *pwd, unsigned char nt_p16[16], unsigned char p16[16])
 	/* clear out local copy of user's password (just being paranoid). */
 	memset(passwd, '\0', sizeof (passwd));
 }
+#endif
 
 /* Does the NTLMv2 owfs of a user's password */
 #if 0 /* function not needed yet - but will be soon */
@@ -223,7 +222,8 @@ SMBOWFencrypt(unsigned char passwd[16], unsigned char *c8,
 }
 
 /* Does the des encryption from the FIRST 8 BYTES of the NT or LM MD4 hash. */
-void
+#if 0 /* currently unused */
+static void
 NTLMSSPOWFencrypt(unsigned char passwd[8],
 		unsigned char *ntlmchalresp, unsigned char p24[24])
 {
@@ -235,6 +235,7 @@ NTLMSSPOWFencrypt(unsigned char passwd[8],
 
 	E_P24(p21, ntlmchalresp, p24);
 }
+#endif
 
 /* Does the NT MD4 hash then des encryption. */
 
diff --git a/fs/compat.c b/fs/compat.c
index 4d3fbcb2ddb1..50624d4a70c6 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1316,7 +1316,7 @@ compat_sys_vmsplice(int fd, const struct compat_iovec __user *iov32,
 		    unsigned int nr_segs, unsigned int flags)
 {
 	unsigned i;
-	struct iovec *iov;
+	struct iovec __user *iov;
 	if (nr_segs > UIO_MAXIOV)
 		return -EINVAL;
 	iov = compat_alloc_user_space(nr_segs * sizeof(struct iovec));
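
The one-line fix above is purely an annotation change: compat_alloc_user_space() hands back memory in the user address space, so the pointer must carry __user or sparse (make C=1) reports an address-space mismatch on every access. A minimal sketch of the rule, assuming the usual uaccess helpers:

    #include <linux/errno.h>
    #include <linux/uio.h>
    #include <asm/uaccess.h>

    /* A __user pointer may only be touched through copy_to_user() /
     * copy_from_user(); a direct dereference is a sparse error. */
    static int fill_one_iovec(struct iovec __user *uiov,
                              void __user *base, size_t len)
    {
            struct iovec kiov = { .iov_base = base, .iov_len = len };

            if (copy_to_user(uiov, &kiov, sizeof(kiov)))
                    return -EFAULT;
            return 0;
    }
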
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 27ca1aa30562..a91f2628c981 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -2438,13 +2438,17 @@ HANDLE_IOCTL(0x1260, broken_blkgetsize)
 HANDLE_IOCTL(BLKFRAGET, w_long)
 HANDLE_IOCTL(BLKSECTGET, w_long)
 HANDLE_IOCTL(BLKPG, blkpg_ioctl_trans)
-HANDLE_IOCTL(HDIO_GET_KEEPSETTINGS, hdio_ioctl_trans)
 HANDLE_IOCTL(HDIO_GET_UNMASKINTR, hdio_ioctl_trans)
-HANDLE_IOCTL(HDIO_GET_DMA, hdio_ioctl_trans)
-HANDLE_IOCTL(HDIO_GET_32BIT, hdio_ioctl_trans)
 HANDLE_IOCTL(HDIO_GET_MULTCOUNT, hdio_ioctl_trans)
+HANDLE_IOCTL(HDIO_GET_KEEPSETTINGS, hdio_ioctl_trans)
+HANDLE_IOCTL(HDIO_GET_32BIT, hdio_ioctl_trans)
 HANDLE_IOCTL(HDIO_GET_NOWERR, hdio_ioctl_trans)
+HANDLE_IOCTL(HDIO_GET_DMA, hdio_ioctl_trans)
 HANDLE_IOCTL(HDIO_GET_NICE, hdio_ioctl_trans)
+HANDLE_IOCTL(HDIO_GET_WCACHE, hdio_ioctl_trans)
+HANDLE_IOCTL(HDIO_GET_ACOUSTIC, hdio_ioctl_trans)
+HANDLE_IOCTL(HDIO_GET_ADDRESS, hdio_ioctl_trans)
+HANDLE_IOCTL(HDIO_GET_BUSSTATE, hdio_ioctl_trans)
 HANDLE_IOCTL(FDSETPRM32, fd_ioctl_trans)
 HANDLE_IOCTL(FDDEFPRM32, fd_ioctl_trans)
 HANDLE_IOCTL(FDGETPRM32, fd_ioctl_trans)
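
Each HANDLE_IOCTL() line registers one 32-bit ioctl command together with a translation handler; the additions above fill out the HDIO_GET_* block so every HDIO query ioctl issued by a 32-bit process gets its argument repacked for a 64-bit kernel. Conceptually each entry contributes something like the following; this is a sketch of the shape, not the actual macro expansion in compat_ioctl.c:

    /* Hypothetical translation-table entry: the command a 32-bit
     * process issued, and the handler that converts its argument
     * layout before and after calling the native ioctl. */
    struct example_ioctl_trans {
            unsigned int cmd;           /* e.g. HDIO_GET_WCACHE */
            int (*handler)(unsigned int fd, unsigned int cmd,
                           unsigned long arg, struct file *file);
    };
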
diff --git a/fs/configfs/file.c b/fs/configfs/file.c
index e6d5754a715e..cf33fac68c84 100644
--- a/fs/configfs/file.c
+++ b/fs/configfs/file.c
@@ -275,13 +275,14 @@ static int check_perm(struct inode * inode, struct file * file)
 	 * it in file->private_data for easy access.
 	 */
 	buffer = kzalloc(sizeof(struct configfs_buffer),GFP_KERNEL);
-	if (buffer) {
-		init_MUTEX(&buffer->sem);
-		buffer->needs_read_fill = 1;
-		buffer->ops = ops;
-		file->private_data = buffer;
-	} else
+	if (!buffer) {
 		error = -ENOMEM;
+		goto Enomem;
+	}
+	init_MUTEX(&buffer->sem);
+	buffer->needs_read_fill = 1;
+	buffer->ops = ops;
+	file->private_data = buffer;
 	goto Done;
 
 Einval:
@@ -289,6 +290,7 @@ static int check_perm(struct inode * inode, struct file * file)
 	goto Done;
 Eaccess:
 	error = -EACCES;
+ Enomem:
 	module_put(attr->ca_owner);
 Done:
 	if (error && item)
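
This rework moves check_perm() to the kernel's usual goto-unwind shape: bail out early to a label, let the labels fall through the shared cleanup (Enomem now hits the same module_put() as Eaccess), and keep the success path flat. The idiom in isolation, as a sketch; example_ctx is made up for the illustration:

    #include <linux/errno.h>
    #include <linux/slab.h>

    struct example_ctx {
            void *private;
    };

    static int example_open(struct example_ctx *ctx)
    {
            int error = 0;
            void *buffer = kzalloc(64, GFP_KERNEL);

            if (!buffer) {
                    error = -ENOMEM;
                    goto Enomem;
            }
            ctx->private = buffer;
            goto Done;
     Enomem:
            /* shared cleanup lives here (module_put() in the real code) */
     Done:
            return error;
    }
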
diff --git a/fs/configfs/item.c b/fs/configfs/item.c
index e07485ac50ad..24421209f854 100644
--- a/fs/configfs/item.c
+++ b/fs/configfs/item.c
@@ -224,4 +224,4 @@ EXPORT_SYMBOL(config_item_init);
 EXPORT_SYMBOL(config_group_init);
 EXPORT_SYMBOL(config_item_get);
 EXPORT_SYMBOL(config_item_put);
-
+EXPORT_SYMBOL(config_group_find_obj);
diff --git a/fs/dcache.c b/fs/dcache.c
index fc2faa44f8d1..2bac4ba1d1d3 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -291,9 +291,9 @@ struct dentry * dget_locked(struct dentry *dentry)
  * it can be unhashed only if it has no children, or if it is the root
  * of a filesystem.
  *
- * If the inode has a DCACHE_DISCONNECTED alias, then prefer
+ * If the inode has an IS_ROOT, DCACHE_DISCONNECTED alias, then prefer
  * any other hashed alias over that one unless @want_discon is set,
- * in which case only return a DCACHE_DISCONNECTED alias.
+ * in which case only return an IS_ROOT, DCACHE_DISCONNECTED alias.
  */
 
 static struct dentry * __d_find_alias(struct inode *inode, int want_discon)
@@ -309,7 +309,8 @@ static struct dentry * __d_find_alias(struct inode *inode, int want_discon)
 		prefetch(next);
 		alias = list_entry(tmp, struct dentry, d_alias);
 		if (S_ISDIR(inode->i_mode) || !d_unhashed(alias)) {
-			if (alias->d_flags & DCACHE_DISCONNECTED)
+			if (IS_ROOT(alias) &&
+			    (alias->d_flags & DCACHE_DISCONNECTED))
 				discon_alias = alias;
 			else if (!want_discon) {
 				__dget_locked(alias);
@@ -548,6 +549,136 @@ repeat:
 }
 
 /*
552 * destroy a single subtree of dentries for unmount
553 * - see the comments on shrink_dcache_for_umount() for a description of the
554 * locking
555 */
556static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
557{
558 struct dentry *parent;
559
560 BUG_ON(!IS_ROOT(dentry));
561
562 /* detach this root from the system */
563 spin_lock(&dcache_lock);
564 if (!list_empty(&dentry->d_lru)) {
565 dentry_stat.nr_unused--;
566 list_del_init(&dentry->d_lru);
567 }
568 __d_drop(dentry);
569 spin_unlock(&dcache_lock);
570
571 for (;;) {
572 /* descend to the first leaf in the current subtree */
573 while (!list_empty(&dentry->d_subdirs)) {
574 struct dentry *loop;
575
576 /* this is a branch with children - detach all of them
577 * from the system in one go */
578 spin_lock(&dcache_lock);
579 list_for_each_entry(loop, &dentry->d_subdirs,
580 d_u.d_child) {
581 if (!list_empty(&loop->d_lru)) {
582 dentry_stat.nr_unused--;
583 list_del_init(&loop->d_lru);
584 }
585
586 __d_drop(loop);
587 cond_resched_lock(&dcache_lock);
588 }
589 spin_unlock(&dcache_lock);
590
591 /* move to the first child */
592 dentry = list_entry(dentry->d_subdirs.next,
593 struct dentry, d_u.d_child);
594 }
595
596 /* consume the dentries from this leaf up through its parents
597 * until we find one with children or run out altogether */
598 do {
599 struct inode *inode;
600
601 if (atomic_read(&dentry->d_count) != 0) {
602 printk(KERN_ERR
603 "BUG: Dentry %p{i=%lx,n=%s}"
604 " still in use (%d)"
605 " [unmount of %s %s]\n",
606 dentry,
607 dentry->d_inode ?
608 dentry->d_inode->i_ino : 0UL,
609 dentry->d_name.name,
610 atomic_read(&dentry->d_count),
611 dentry->d_sb->s_type->name,
612 dentry->d_sb->s_id);
613 BUG();
614 }
615
616 parent = dentry->d_parent;
617 if (parent == dentry)
618 parent = NULL;
619 else
620 atomic_dec(&parent->d_count);
621
622 list_del(&dentry->d_u.d_child);
623 dentry_stat.nr_dentry--; /* For d_free, below */
624
625 inode = dentry->d_inode;
626 if (inode) {
627 dentry->d_inode = NULL;
628 list_del_init(&dentry->d_alias);
629 if (dentry->d_op && dentry->d_op->d_iput)
630 dentry->d_op->d_iput(dentry, inode);
631 else
632 iput(inode);
633 }
634
635 d_free(dentry);
636
637 /* finished when we fall off the top of the tree,
638 * otherwise we ascend to the parent and move to the
639 * next sibling if there is one */
640 if (!parent)
641 return;
642
643 dentry = parent;
644
645 } while (list_empty(&dentry->d_subdirs));
646
647 dentry = list_entry(dentry->d_subdirs.next,
648 struct dentry, d_u.d_child);
649 }
650}
651
652/*
653 * destroy the dentries attached to a superblock on unmounting
654 * - we don't need to use dentry->d_lock, and only need dcache_lock when
655 * removing the dentry from the system lists and hashes because:
656 * - the superblock is detached from all mountings and open files, so the
657 * dentry trees will not be rearranged by the VFS
658 * - s_umount is write-locked, so the memory pressure shrinker will ignore
659 * any dentries belonging to this superblock that it comes across
660 * - the filesystem itself is no longer permitted to rearrange the dentries
661 * in this superblock
662 */
663void shrink_dcache_for_umount(struct super_block *sb)
664{
665 struct dentry *dentry;
666
667 if (down_read_trylock(&sb->s_umount))
668 BUG();
669
670 dentry = sb->s_root;
671 sb->s_root = NULL;
672 atomic_dec(&dentry->d_count);
673 shrink_dcache_for_umount_subtree(dentry);
674
675 while (!hlist_empty(&sb->s_anon)) {
676 dentry = hlist_entry(sb->s_anon.first, struct dentry, d_hash);
677 shrink_dcache_for_umount_subtree(dentry);
678 }
679}
680
681/*
  * Search for at least 1 mount point in the dentry's subdirs.
  * We descend to the next level whenever the d_subdirs
  * list is non-empty and continue searching.
@@ -1004,7 +1135,7 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
 {
 	struct dentry *new = NULL;
 
-	if (inode) {
+	if (inode && S_ISDIR(inode->i_mode)) {
 		spin_lock(&dcache_lock);
 		new = __d_find_alias(inode, 1);
 		if (new) {
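
shrink_dcache_for_umount() is only safe under the three conditions listed in its comment block, and it enforces the s_umount rule itself: the down_read_trylock() check BUGs if the caller is not already holding s_umount for writing. A hedged sketch of the expected call site in the generic unmount path:

    #include <linux/fs.h>
    #include <linux/dcache.h>

    /* Sketch: the caller holds sb->s_umount for writing (the generic
     * superblock shutdown path), so no shrinker or VFS lookup can race
     * with the teardown. */
    static void example_kill_super(struct super_block *sb)
    {
            shrink_dcache_for_umount(sb); /* discards s_root + s_anon trees */
            /* ... then invalidate inodes and call the fs put_super() ... */
    }
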
diff --git a/fs/dlm/Kconfig b/fs/dlm/Kconfig
new file mode 100644
index 000000000000..81b2c6465eeb
--- /dev/null
+++ b/fs/dlm/Kconfig
@@ -0,0 +1,20 @@
1menu "Distributed Lock Manager"
2 depends on INET && IP_SCTP && EXPERIMENTAL
3
4config DLM
5 tristate "Distributed Lock Manager (DLM)"
6 depends on IPV6 || IPV6=n
7 select CONFIGFS_FS
8 help
9 A general purpose distributed lock manager for kernel or userspace
10 applications.
11
12config DLM_DEBUG
13 bool "DLM debugging"
14 depends on DLM
15 help
16 Under the debugfs mount point, the name of each lockspace will
17 appear as a file in the "dlm" directory. The output is the
18	  list of resources and locks the local node knows about.
19
20endmenu
diff --git a/fs/dlm/Makefile b/fs/dlm/Makefile
new file mode 100644
index 000000000000..1832e0297f7d
--- /dev/null
+++ b/fs/dlm/Makefile
@@ -0,0 +1,19 @@
1obj-$(CONFIG_DLM) += dlm.o
2dlm-y := ast.o \
3 config.o \
4 dir.o \
5 lock.o \
6 lockspace.o \
7 lowcomms.o \
8 main.o \
9 member.o \
10 memory.o \
11 midcomms.o \
12 rcom.o \
13 recover.o \
14 recoverd.o \
15 requestqueue.o \
16 user.o \
17 util.o
18dlm-$(CONFIG_DLM_DEBUG) += debug_fs.o
19
diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c
new file mode 100644
index 000000000000..f91d39cb1e0b
--- /dev/null
+++ b/fs/dlm/ast.c
@@ -0,0 +1,173 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#include "dlm_internal.h"
15#include "lock.h"
16#include "user.h"
17
18#define WAKE_ASTS 0
19
20static struct list_head ast_queue;
21static spinlock_t ast_queue_lock;
22static struct task_struct * astd_task;
23static unsigned long astd_wakeflags;
24static struct mutex astd_running;
25
26
27void dlm_del_ast(struct dlm_lkb *lkb)
28{
29 spin_lock(&ast_queue_lock);
30 if (lkb->lkb_ast_type & (AST_COMP | AST_BAST))
31 list_del(&lkb->lkb_astqueue);
32 spin_unlock(&ast_queue_lock);
33}
34
35void dlm_add_ast(struct dlm_lkb *lkb, int type)
36{
37 if (lkb->lkb_flags & DLM_IFL_USER) {
38 dlm_user_add_ast(lkb, type);
39 return;
40 }
41 DLM_ASSERT(lkb->lkb_astaddr != DLM_FAKE_USER_AST, dlm_print_lkb(lkb););
42
43 spin_lock(&ast_queue_lock);
44 if (!(lkb->lkb_ast_type & (AST_COMP | AST_BAST))) {
45 kref_get(&lkb->lkb_ref);
46 list_add_tail(&lkb->lkb_astqueue, &ast_queue);
47 }
48 lkb->lkb_ast_type |= type;
49 spin_unlock(&ast_queue_lock);
50
51 set_bit(WAKE_ASTS, &astd_wakeflags);
52 wake_up_process(astd_task);
53}
54
55static void process_asts(void)
56{
57 struct dlm_ls *ls = NULL;
58 struct dlm_rsb *r = NULL;
59 struct dlm_lkb *lkb;
60 void (*cast) (long param);
61 void (*bast) (long param, int mode);
62 int type = 0, found, bmode;
63
64 for (;;) {
65 found = 0;
66 spin_lock(&ast_queue_lock);
67 list_for_each_entry(lkb, &ast_queue, lkb_astqueue) {
68 r = lkb->lkb_resource;
69 ls = r->res_ls;
70
71 if (dlm_locking_stopped(ls))
72 continue;
73
74 list_del(&lkb->lkb_astqueue);
75 type = lkb->lkb_ast_type;
76 lkb->lkb_ast_type = 0;
77 found = 1;
78 break;
79 }
80 spin_unlock(&ast_queue_lock);
81
82 if (!found)
83 break;
84
85 cast = lkb->lkb_astaddr;
86 bast = lkb->lkb_bastaddr;
87 bmode = lkb->lkb_bastmode;
88
89 if ((type & AST_COMP) && cast)
90 cast(lkb->lkb_astparam);
91
92 /* FIXME: Is it safe to look at lkb_grmode here
93 without doing a lock_rsb() ?
94 Look at other checks in v1 to avoid basts. */
95
96 if ((type & AST_BAST) && bast)
97 if (!dlm_modes_compat(lkb->lkb_grmode, bmode))
98 bast(lkb->lkb_astparam, bmode);
99
100 /* this removes the reference added by dlm_add_ast
101 and may result in the lkb being freed */
102 dlm_put_lkb(lkb);
103
104 schedule();
105 }
106}
107
108static inline int no_asts(void)
109{
110 int ret;
111
112 spin_lock(&ast_queue_lock);
113 ret = list_empty(&ast_queue);
114 spin_unlock(&ast_queue_lock);
115 return ret;
116}
117
118static int dlm_astd(void *data)
119{
120 while (!kthread_should_stop()) {
121 set_current_state(TASK_INTERRUPTIBLE);
122 if (!test_bit(WAKE_ASTS, &astd_wakeflags))
123 schedule();
124 set_current_state(TASK_RUNNING);
125
126 mutex_lock(&astd_running);
127 if (test_and_clear_bit(WAKE_ASTS, &astd_wakeflags))
128 process_asts();
129 mutex_unlock(&astd_running);
130 }
131 return 0;
132}
133
134void dlm_astd_wake(void)
135{
136 if (!no_asts()) {
137 set_bit(WAKE_ASTS, &astd_wakeflags);
138 wake_up_process(astd_task);
139 }
140}
141
142int dlm_astd_start(void)
143{
144 struct task_struct *p;
145 int error = 0;
146
147 INIT_LIST_HEAD(&ast_queue);
148 spin_lock_init(&ast_queue_lock);
149 mutex_init(&astd_running);
150
151 p = kthread_run(dlm_astd, NULL, "dlm_astd");
152 if (IS_ERR(p))
153 error = PTR_ERR(p);
154 else
155 astd_task = p;
156 return error;
157}
158
159void dlm_astd_stop(void)
160{
161 kthread_stop(astd_task);
162}
163
164void dlm_astd_suspend(void)
165{
166 mutex_lock(&astd_running);
167}
168
169void dlm_astd_resume(void)
170{
171 mutex_unlock(&astd_running);
172}
173
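
Since every delivered callback batch in ast.c runs under astd_running, the suspend/resume pair doubles as a barrier: once dlm_astd_suspend() returns, no completion or blocking AST can fire until dlm_astd_resume(). A sketch of how recovery code would presumably use it:

    /* Fence the AST thread across a critical section (sketch,
     * using the dlm_astd_* interface declared in ast.h). */
    static void example_recovery_fence(void)
    {
            dlm_astd_suspend();     /* waits out any in-flight batch */
            /* ... safely requeue or adjust lkbs while no ASTs run ... */
            dlm_astd_resume();
            dlm_astd_wake();        /* deliver anything queued meanwhile */
    }
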
diff --git a/fs/dlm/ast.h b/fs/dlm/ast.h
new file mode 100644
index 000000000000..6ee276c74c52
--- /dev/null
+++ b/fs/dlm/ast.h
@@ -0,0 +1,26 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#ifndef __ASTD_DOT_H__
14#define __ASTD_DOT_H__
15
16void dlm_add_ast(struct dlm_lkb *lkb, int type);
17void dlm_del_ast(struct dlm_lkb *lkb);
18
19void dlm_astd_wake(void);
20int dlm_astd_start(void);
21void dlm_astd_stop(void);
22void dlm_astd_suspend(void);
23void dlm_astd_resume(void);
24
25#endif
26
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
new file mode 100644
index 000000000000..88553054bbfa
--- /dev/null
+++ b/fs/dlm/config.c
@@ -0,0 +1,789 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#include <linux/kernel.h>
15#include <linux/module.h>
16#include <linux/configfs.h>
17#include <net/sock.h>
18
19#include "config.h"
20#include "lowcomms.h"
21
22/*
23 * /config/dlm/<cluster>/spaces/<space>/nodes/<node>/nodeid
24 * /config/dlm/<cluster>/spaces/<space>/nodes/<node>/weight
25 * /config/dlm/<cluster>/comms/<comm>/nodeid
26 * /config/dlm/<cluster>/comms/<comm>/local
27 * /config/dlm/<cluster>/comms/<comm>/addr
28 * The <cluster> level is useless, but I haven't figured out how to avoid it.
29 */
30
31static struct config_group *space_list;
32static struct config_group *comm_list;
33static struct comm *local_comm;
34
35struct clusters;
36struct cluster;
37struct spaces;
38struct space;
39struct comms;
40struct comm;
41struct nodes;
42struct node;
43
44static struct config_group *make_cluster(struct config_group *, const char *);
45static void drop_cluster(struct config_group *, struct config_item *);
46static void release_cluster(struct config_item *);
47static struct config_group *make_space(struct config_group *, const char *);
48static void drop_space(struct config_group *, struct config_item *);
49static void release_space(struct config_item *);
50static struct config_item *make_comm(struct config_group *, const char *);
51static void drop_comm(struct config_group *, struct config_item *);
52static void release_comm(struct config_item *);
53static struct config_item *make_node(struct config_group *, const char *);
54static void drop_node(struct config_group *, struct config_item *);
55static void release_node(struct config_item *);
56
57static ssize_t show_comm(struct config_item *i, struct configfs_attribute *a,
58 char *buf);
59static ssize_t store_comm(struct config_item *i, struct configfs_attribute *a,
60 const char *buf, size_t len);
61static ssize_t show_node(struct config_item *i, struct configfs_attribute *a,
62 char *buf);
63static ssize_t store_node(struct config_item *i, struct configfs_attribute *a,
64 const char *buf, size_t len);
65
66static ssize_t comm_nodeid_read(struct comm *cm, char *buf);
67static ssize_t comm_nodeid_write(struct comm *cm, const char *buf, size_t len);
68static ssize_t comm_local_read(struct comm *cm, char *buf);
69static ssize_t comm_local_write(struct comm *cm, const char *buf, size_t len);
70static ssize_t comm_addr_write(struct comm *cm, const char *buf, size_t len);
71static ssize_t node_nodeid_read(struct node *nd, char *buf);
72static ssize_t node_nodeid_write(struct node *nd, const char *buf, size_t len);
73static ssize_t node_weight_read(struct node *nd, char *buf);
74static ssize_t node_weight_write(struct node *nd, const char *buf, size_t len);
75
76enum {
77 COMM_ATTR_NODEID = 0,
78 COMM_ATTR_LOCAL,
79 COMM_ATTR_ADDR,
80};
81
82struct comm_attribute {
83 struct configfs_attribute attr;
84 ssize_t (*show)(struct comm *, char *);
85 ssize_t (*store)(struct comm *, const char *, size_t);
86};
87
88static struct comm_attribute comm_attr_nodeid = {
89 .attr = { .ca_owner = THIS_MODULE,
90 .ca_name = "nodeid",
91 .ca_mode = S_IRUGO | S_IWUSR },
92 .show = comm_nodeid_read,
93 .store = comm_nodeid_write,
94};
95
96static struct comm_attribute comm_attr_local = {
97 .attr = { .ca_owner = THIS_MODULE,
98 .ca_name = "local",
99 .ca_mode = S_IRUGO | S_IWUSR },
100 .show = comm_local_read,
101 .store = comm_local_write,
102};
103
104static struct comm_attribute comm_attr_addr = {
105 .attr = { .ca_owner = THIS_MODULE,
106 .ca_name = "addr",
107 .ca_mode = S_IRUGO | S_IWUSR },
108 .store = comm_addr_write,
109};
110
111static struct configfs_attribute *comm_attrs[] = {
112 [COMM_ATTR_NODEID] = &comm_attr_nodeid.attr,
113 [COMM_ATTR_LOCAL] = &comm_attr_local.attr,
114 [COMM_ATTR_ADDR] = &comm_attr_addr.attr,
115 NULL,
116};
117
118enum {
119 NODE_ATTR_NODEID = 0,
120 NODE_ATTR_WEIGHT,
121};
122
123struct node_attribute {
124 struct configfs_attribute attr;
125 ssize_t (*show)(struct node *, char *);
126 ssize_t (*store)(struct node *, const char *, size_t);
127};
128
129static struct node_attribute node_attr_nodeid = {
130 .attr = { .ca_owner = THIS_MODULE,
131 .ca_name = "nodeid",
132 .ca_mode = S_IRUGO | S_IWUSR },
133 .show = node_nodeid_read,
134 .store = node_nodeid_write,
135};
136
137static struct node_attribute node_attr_weight = {
138 .attr = { .ca_owner = THIS_MODULE,
139 .ca_name = "weight",
140 .ca_mode = S_IRUGO | S_IWUSR },
141 .show = node_weight_read,
142 .store = node_weight_write,
143};
144
145static struct configfs_attribute *node_attrs[] = {
146 [NODE_ATTR_NODEID] = &node_attr_nodeid.attr,
147 [NODE_ATTR_WEIGHT] = &node_attr_weight.attr,
148 NULL,
149};
150
151struct clusters {
152 struct configfs_subsystem subsys;
153};
154
155struct cluster {
156 struct config_group group;
157};
158
159struct spaces {
160 struct config_group ss_group;
161};
162
163struct space {
164 struct config_group group;
165 struct list_head members;
166 struct mutex members_lock;
167 int members_count;
168};
169
170struct comms {
171 struct config_group cs_group;
172};
173
174struct comm {
175 struct config_item item;
176 int nodeid;
177 int local;
178 int addr_count;
179 struct sockaddr_storage *addr[DLM_MAX_ADDR_COUNT];
180};
181
182struct nodes {
183 struct config_group ns_group;
184};
185
186struct node {
187 struct config_item item;
188 struct list_head list; /* space->members */
189 int nodeid;
190 int weight;
191};
192
193static struct configfs_group_operations clusters_ops = {
194 .make_group = make_cluster,
195 .drop_item = drop_cluster,
196};
197
198static struct configfs_item_operations cluster_ops = {
199 .release = release_cluster,
200};
201
202static struct configfs_group_operations spaces_ops = {
203 .make_group = make_space,
204 .drop_item = drop_space,
205};
206
207static struct configfs_item_operations space_ops = {
208 .release = release_space,
209};
210
211static struct configfs_group_operations comms_ops = {
212 .make_item = make_comm,
213 .drop_item = drop_comm,
214};
215
216static struct configfs_item_operations comm_ops = {
217 .release = release_comm,
218 .show_attribute = show_comm,
219 .store_attribute = store_comm,
220};
221
222static struct configfs_group_operations nodes_ops = {
223 .make_item = make_node,
224 .drop_item = drop_node,
225};
226
227static struct configfs_item_operations node_ops = {
228 .release = release_node,
229 .show_attribute = show_node,
230 .store_attribute = store_node,
231};
232
233static struct config_item_type clusters_type = {
234 .ct_group_ops = &clusters_ops,
235 .ct_owner = THIS_MODULE,
236};
237
238static struct config_item_type cluster_type = {
239 .ct_item_ops = &cluster_ops,
240 .ct_owner = THIS_MODULE,
241};
242
243static struct config_item_type spaces_type = {
244 .ct_group_ops = &spaces_ops,
245 .ct_owner = THIS_MODULE,
246};
247
248static struct config_item_type space_type = {
249 .ct_item_ops = &space_ops,
250 .ct_owner = THIS_MODULE,
251};
252
253static struct config_item_type comms_type = {
254 .ct_group_ops = &comms_ops,
255 .ct_owner = THIS_MODULE,
256};
257
258static struct config_item_type comm_type = {
259 .ct_item_ops = &comm_ops,
260 .ct_attrs = comm_attrs,
261 .ct_owner = THIS_MODULE,
262};
263
264static struct config_item_type nodes_type = {
265 .ct_group_ops = &nodes_ops,
266 .ct_owner = THIS_MODULE,
267};
268
269static struct config_item_type node_type = {
270 .ct_item_ops = &node_ops,
271 .ct_attrs = node_attrs,
272 .ct_owner = THIS_MODULE,
273};
274
275static struct cluster *to_cluster(struct config_item *i)
276{
277 return i ? container_of(to_config_group(i), struct cluster, group):NULL;
278}
279
280static struct space *to_space(struct config_item *i)
281{
282 return i ? container_of(to_config_group(i), struct space, group) : NULL;
283}
284
285static struct comm *to_comm(struct config_item *i)
286{
287 return i ? container_of(i, struct comm, item) : NULL;
288}
289
290static struct node *to_node(struct config_item *i)
291{
292 return i ? container_of(i, struct node, item) : NULL;
293}
294
295static struct config_group *make_cluster(struct config_group *g,
296 const char *name)
297{
298 struct cluster *cl = NULL;
299 struct spaces *sps = NULL;
300 struct comms *cms = NULL;
301 void *gps = NULL;
302
303 cl = kzalloc(sizeof(struct cluster), GFP_KERNEL);
304 gps = kcalloc(3, sizeof(struct config_group *), GFP_KERNEL);
305 sps = kzalloc(sizeof(struct spaces), GFP_KERNEL);
306 cms = kzalloc(sizeof(struct comms), GFP_KERNEL);
307
308 if (!cl || !gps || !sps || !cms)
309 goto fail;
310
311 config_group_init_type_name(&cl->group, name, &cluster_type);
312 config_group_init_type_name(&sps->ss_group, "spaces", &spaces_type);
313 config_group_init_type_name(&cms->cs_group, "comms", &comms_type);
314
315 cl->group.default_groups = gps;
316 cl->group.default_groups[0] = &sps->ss_group;
317 cl->group.default_groups[1] = &cms->cs_group;
318 cl->group.default_groups[2] = NULL;
319
320 space_list = &sps->ss_group;
321 comm_list = &cms->cs_group;
322 return &cl->group;
323
324 fail:
325 kfree(cl);
326 kfree(gps);
327 kfree(sps);
328 kfree(cms);
329 return NULL;
330}
331
332static void drop_cluster(struct config_group *g, struct config_item *i)
333{
334 struct cluster *cl = to_cluster(i);
335 struct config_item *tmp;
336 int j;
337
338 for (j = 0; cl->group.default_groups[j]; j++) {
339 tmp = &cl->group.default_groups[j]->cg_item;
340 cl->group.default_groups[j] = NULL;
341 config_item_put(tmp);
342 }
343
344 space_list = NULL;
345 comm_list = NULL;
346
347 config_item_put(i);
348}
349
350static void release_cluster(struct config_item *i)
351{
352 struct cluster *cl = to_cluster(i);
353 kfree(cl->group.default_groups);
354 kfree(cl);
355}
356
357static struct config_group *make_space(struct config_group *g, const char *name)
358{
359 struct space *sp = NULL;
360 struct nodes *nds = NULL;
361 void *gps = NULL;
362
363 sp = kzalloc(sizeof(struct space), GFP_KERNEL);
364 gps = kcalloc(2, sizeof(struct config_group *), GFP_KERNEL);
365 nds = kzalloc(sizeof(struct nodes), GFP_KERNEL);
366
367 if (!sp || !gps || !nds)
368 goto fail;
369
370 config_group_init_type_name(&sp->group, name, &space_type);
371 config_group_init_type_name(&nds->ns_group, "nodes", &nodes_type);
372
373 sp->group.default_groups = gps;
374 sp->group.default_groups[0] = &nds->ns_group;
375 sp->group.default_groups[1] = NULL;
376
377 INIT_LIST_HEAD(&sp->members);
378 mutex_init(&sp->members_lock);
379 sp->members_count = 0;
380 return &sp->group;
381
382 fail:
383 kfree(sp);
384 kfree(gps);
385 kfree(nds);
386 return NULL;
387}
388
389static void drop_space(struct config_group *g, struct config_item *i)
390{
391 struct space *sp = to_space(i);
392 struct config_item *tmp;
393 int j;
394
395 /* assert list_empty(&sp->members) */
396
397 for (j = 0; sp->group.default_groups[j]; j++) {
398 tmp = &sp->group.default_groups[j]->cg_item;
399 sp->group.default_groups[j] = NULL;
400 config_item_put(tmp);
401 }
402
403 config_item_put(i);
404}
405
406static void release_space(struct config_item *i)
407{
408 struct space *sp = to_space(i);
409 kfree(sp->group.default_groups);
410 kfree(sp);
411}
412
413static struct config_item *make_comm(struct config_group *g, const char *name)
414{
415 struct comm *cm;
416
417 cm = kzalloc(sizeof(struct comm), GFP_KERNEL);
418 if (!cm)
419 return NULL;
420
421 config_item_init_type_name(&cm->item, name, &comm_type);
422 cm->nodeid = -1;
423 cm->local = 0;
424 cm->addr_count = 0;
425 return &cm->item;
426}
427
428static void drop_comm(struct config_group *g, struct config_item *i)
429{
430 struct comm *cm = to_comm(i);
431 if (local_comm == cm)
432 local_comm = NULL;
433 dlm_lowcomms_close(cm->nodeid);
434 while (cm->addr_count--)
435 kfree(cm->addr[cm->addr_count]);
436 config_item_put(i);
437}
438
439static void release_comm(struct config_item *i)
440{
441 struct comm *cm = to_comm(i);
442 kfree(cm);
443}
444
445static struct config_item *make_node(struct config_group *g, const char *name)
446{
447 struct space *sp = to_space(g->cg_item.ci_parent);
448 struct node *nd;
449
450 nd = kzalloc(sizeof(struct node), GFP_KERNEL);
451 if (!nd)
452 return NULL;
453
454 config_item_init_type_name(&nd->item, name, &node_type);
455 nd->nodeid = -1;
456 nd->weight = 1; /* default weight of 1 if none is set */
457
458 mutex_lock(&sp->members_lock);
459 list_add(&nd->list, &sp->members);
460 sp->members_count++;
461 mutex_unlock(&sp->members_lock);
462
463 return &nd->item;
464}
465
466static void drop_node(struct config_group *g, struct config_item *i)
467{
468 struct space *sp = to_space(g->cg_item.ci_parent);
469 struct node *nd = to_node(i);
470
471 mutex_lock(&sp->members_lock);
472 list_del(&nd->list);
473 sp->members_count--;
474 mutex_unlock(&sp->members_lock);
475
476 config_item_put(i);
477}
478
479static void release_node(struct config_item *i)
480{
481 struct node *nd = to_node(i);
482 kfree(nd);
483}
484
485static struct clusters clusters_root = {
486 .subsys = {
487 .su_group = {
488 .cg_item = {
489 .ci_namebuf = "dlm",
490 .ci_type = &clusters_type,
491 },
492 },
493 },
494};
495
496int dlm_config_init(void)
497{
498 config_group_init(&clusters_root.subsys.su_group);
499 init_MUTEX(&clusters_root.subsys.su_sem);
500 return configfs_register_subsystem(&clusters_root.subsys);
501}
502
503void dlm_config_exit(void)
504{
505 configfs_unregister_subsystem(&clusters_root.subsys);
506}
507
508/*
509 * Functions for user space to read/write attributes
510 */
511
512static ssize_t show_comm(struct config_item *i, struct configfs_attribute *a,
513 char *buf)
514{
515 struct comm *cm = to_comm(i);
516 struct comm_attribute *cma =
517 container_of(a, struct comm_attribute, attr);
518 return cma->show ? cma->show(cm, buf) : 0;
519}
520
521static ssize_t store_comm(struct config_item *i, struct configfs_attribute *a,
522 const char *buf, size_t len)
523{
524 struct comm *cm = to_comm(i);
525 struct comm_attribute *cma =
526 container_of(a, struct comm_attribute, attr);
527 return cma->store ? cma->store(cm, buf, len) : -EINVAL;
528}
529
530static ssize_t comm_nodeid_read(struct comm *cm, char *buf)
531{
532 return sprintf(buf, "%d\n", cm->nodeid);
533}
534
535static ssize_t comm_nodeid_write(struct comm *cm, const char *buf, size_t len)
536{
537 cm->nodeid = simple_strtol(buf, NULL, 0);
538 return len;
539}
540
541static ssize_t comm_local_read(struct comm *cm, char *buf)
542{
543 return sprintf(buf, "%d\n", cm->local);
544}
545
546static ssize_t comm_local_write(struct comm *cm, const char *buf, size_t len)
547{
548 cm->local= simple_strtol(buf, NULL, 0);
549 if (cm->local && !local_comm)
550 local_comm = cm;
551 return len;
552}
553
554static ssize_t comm_addr_write(struct comm *cm, const char *buf, size_t len)
555{
556 struct sockaddr_storage *addr;
557
558 if (len != sizeof(struct sockaddr_storage))
559 return -EINVAL;
560
561 if (cm->addr_count >= DLM_MAX_ADDR_COUNT)
562 return -ENOSPC;
563
564 addr = kzalloc(sizeof(*addr), GFP_KERNEL);
565 if (!addr)
566 return -ENOMEM;
567
568 memcpy(addr, buf, len);
569 cm->addr[cm->addr_count++] = addr;
570 return len;
571}
572
573static ssize_t show_node(struct config_item *i, struct configfs_attribute *a,
574 char *buf)
575{
576 struct node *nd = to_node(i);
577 struct node_attribute *nda =
578 container_of(a, struct node_attribute, attr);
579 return nda->show ? nda->show(nd, buf) : 0;
580}
581
582static ssize_t store_node(struct config_item *i, struct configfs_attribute *a,
583 const char *buf, size_t len)
584{
585 struct node *nd = to_node(i);
586 struct node_attribute *nda =
587 container_of(a, struct node_attribute, attr);
588 return nda->store ? nda->store(nd, buf, len) : -EINVAL;
589}
590
591static ssize_t node_nodeid_read(struct node *nd, char *buf)
592{
593 return sprintf(buf, "%d\n", nd->nodeid);
594}
595
596static ssize_t node_nodeid_write(struct node *nd, const char *buf, size_t len)
597{
598 nd->nodeid = simple_strtol(buf, NULL, 0);
599 return len;
600}
601
602static ssize_t node_weight_read(struct node *nd, char *buf)
603{
604 return sprintf(buf, "%d\n", nd->weight);
605}
606
607static ssize_t node_weight_write(struct node *nd, const char *buf, size_t len)
608{
609 nd->weight = simple_strtol(buf, NULL, 0);
610 return len;
611}
612
613/*
614 * Functions for the dlm to get the info that's been configured
615 */
616
617static struct space *get_space(char *name)
618{
619 if (!space_list)
620 return NULL;
621 return to_space(config_group_find_obj(space_list, name));
622}
623
624static void put_space(struct space *sp)
625{
626 config_item_put(&sp->group.cg_item);
627}
628
629static struct comm *get_comm(int nodeid, struct sockaddr_storage *addr)
630{
631 struct config_item *i;
632 struct comm *cm = NULL;
633 int found = 0;
634
635 if (!comm_list)
636 return NULL;
637
638 down(&clusters_root.subsys.su_sem);
639
640 list_for_each_entry(i, &comm_list->cg_children, ci_entry) {
641 cm = to_comm(i);
642
643 if (nodeid) {
644 if (cm->nodeid != nodeid)
645 continue;
646 found = 1;
647 break;
648 } else {
649 if (!cm->addr_count ||
650 memcmp(cm->addr[0], addr, sizeof(*addr)))
651 continue;
652 found = 1;
653 break;
654 }
655 }
656 up(&clusters_root.subsys.su_sem);
657
658 if (found)
659 config_item_get(i);
660 else
661 cm = NULL;
662 return cm;
663}
664
665static void put_comm(struct comm *cm)
666{
667 config_item_put(&cm->item);
668}
669
670/* caller must free mem */
671int dlm_nodeid_list(char *lsname, int **ids_out)
672{
673 struct space *sp;
674 struct node *nd;
675 int i = 0, rv = 0;
676 int *ids;
677
678 sp = get_space(lsname);
679 if (!sp)
680 return -EEXIST;
681
682 mutex_lock(&sp->members_lock);
683 if (!sp->members_count) {
684 rv = 0;
685 goto out;
686 }
687
688 ids = kcalloc(sp->members_count, sizeof(int), GFP_KERNEL);
689 if (!ids) {
690 rv = -ENOMEM;
691 goto out;
692 }
693
694 rv = sp->members_count;
695 list_for_each_entry(nd, &sp->members, list)
696 ids[i++] = nd->nodeid;
697
698 if (rv != i)
699 printk("bad nodeid count %d %d\n", rv, i);
700
701 *ids_out = ids;
702 out:
703 mutex_unlock(&sp->members_lock);
704 put_space(sp);
705 return rv;
706}
707
708int dlm_node_weight(char *lsname, int nodeid)
709{
710 struct space *sp;
711 struct node *nd;
712 int w = -EEXIST;
713
714 sp = get_space(lsname);
715 if (!sp)
716 goto out;
717
718 mutex_lock(&sp->members_lock);
719 list_for_each_entry(nd, &sp->members, list) {
720 if (nd->nodeid != nodeid)
721 continue;
722 w = nd->weight;
723 break;
724 }
725 mutex_unlock(&sp->members_lock);
726 put_space(sp);
727 out:
728 return w;
729}
730
731int dlm_nodeid_to_addr(int nodeid, struct sockaddr_storage *addr)
732{
733 struct comm *cm = get_comm(nodeid, NULL);
734 if (!cm)
735 return -EEXIST;
736 if (!cm->addr_count)
737 return -ENOENT;
738 memcpy(addr, cm->addr[0], sizeof(*addr));
739 put_comm(cm);
740 return 0;
741}
742
743int dlm_addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid)
744{
745 struct comm *cm = get_comm(0, addr);
746 if (!cm)
747 return -EEXIST;
748 *nodeid = cm->nodeid;
749 put_comm(cm);
750 return 0;
751}
752
753int dlm_our_nodeid(void)
754{
755 return local_comm ? local_comm->nodeid : 0;
756}
757
758/* num 0 is first addr, num 1 is second addr */
759int dlm_our_addr(struct sockaddr_storage *addr, int num)
760{
761 if (!local_comm)
762 return -1;
763 if (num + 1 > local_comm->addr_count)
764 return -1;
765 memcpy(addr, local_comm->addr[num], sizeof(*addr));
766 return 0;
767}
768
769/* Config file defaults */
770#define DEFAULT_TCP_PORT 21064
771#define DEFAULT_BUFFER_SIZE 4096
772#define DEFAULT_RSBTBL_SIZE 256
773#define DEFAULT_LKBTBL_SIZE 1024
774#define DEFAULT_DIRTBL_SIZE 512
775#define DEFAULT_RECOVER_TIMER 5
776#define DEFAULT_TOSS_SECS 10
777#define DEFAULT_SCAN_SECS 5
778
779struct dlm_config_info dlm_config = {
780 .tcp_port = DEFAULT_TCP_PORT,
781 .buffer_size = DEFAULT_BUFFER_SIZE,
782 .rsbtbl_size = DEFAULT_RSBTBL_SIZE,
783 .lkbtbl_size = DEFAULT_LKBTBL_SIZE,
784 .dirtbl_size = DEFAULT_DIRTBL_SIZE,
785 .recover_timer = DEFAULT_RECOVER_TIMER,
786 .toss_secs = DEFAULT_TOSS_SECS,
787 .scan_secs = DEFAULT_SCAN_SECS
788};
789
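
dlm_nodeid_list() above returns the member count on success and hands ownership of the ids array to the caller ("caller must free mem"). A minimal sketch of honoring that contract:

    #include <linux/kernel.h>
    #include <linux/slab.h>
    #include "config.h"

    static void example_print_members(char *lsname)
    {
            int *ids = NULL;
            int i, count;

            count = dlm_nodeid_list(lsname, &ids);
            if (count < 0)
                    return;     /* -EEXIST (no lockspace) or -ENOMEM */
            for (i = 0; i < count; i++)
                    printk("member nodeid %d\n", ids[i]);
            kfree(ids);         /* safe even when count == 0 (ids is NULL) */
    }
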
diff --git a/fs/dlm/config.h b/fs/dlm/config.h
new file mode 100644
index 000000000000..9da7839958a9
--- /dev/null
+++ b/fs/dlm/config.h
@@ -0,0 +1,42 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __CONFIG_DOT_H__
15#define __CONFIG_DOT_H__
16
17#define DLM_MAX_ADDR_COUNT 3
18
19struct dlm_config_info {
20 int tcp_port;
21 int buffer_size;
22 int rsbtbl_size;
23 int lkbtbl_size;
24 int dirtbl_size;
25 int recover_timer;
26 int toss_secs;
27 int scan_secs;
28};
29
30extern struct dlm_config_info dlm_config;
31
32int dlm_config_init(void);
33void dlm_config_exit(void);
34int dlm_node_weight(char *lsname, int nodeid);
35int dlm_nodeid_list(char *lsname, int **ids_out);
36int dlm_nodeid_to_addr(int nodeid, struct sockaddr_storage *addr);
37int dlm_addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid);
38int dlm_our_nodeid(void);
39int dlm_our_addr(struct sockaddr_storage *addr, int num);
40
41#endif /* __CONFIG_DOT_H__ */
42
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
new file mode 100644
index 000000000000..ca94a837a5bb
--- /dev/null
+++ b/fs/dlm/debug_fs.c
@@ -0,0 +1,387 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#include <linux/pagemap.h>
14#include <linux/seq_file.h>
15#include <linux/module.h>
16#include <linux/ctype.h>
17#include <linux/debugfs.h>
18
19#include "dlm_internal.h"
20
21#define DLM_DEBUG_BUF_LEN 4096
22static char debug_buf[DLM_DEBUG_BUF_LEN];
23static struct mutex debug_buf_lock;
24
25static struct dentry *dlm_root;
26
27struct rsb_iter {
28 int entry;
29 struct dlm_ls *ls;
30 struct list_head *next;
31 struct dlm_rsb *rsb;
32};
33
34/*
35 * dump all rsb's in the lockspace hash table
36 */
37
38static char *print_lockmode(int mode)
39{
40 switch (mode) {
41 case DLM_LOCK_IV:
42 return "--";
43 case DLM_LOCK_NL:
44 return "NL";
45 case DLM_LOCK_CR:
46 return "CR";
47 case DLM_LOCK_CW:
48 return "CW";
49 case DLM_LOCK_PR:
50 return "PR";
51 case DLM_LOCK_PW:
52 return "PW";
53 case DLM_LOCK_EX:
54 return "EX";
55 default:
56 return "??";
57 }
58}
59
60static void print_lock(struct seq_file *s, struct dlm_lkb *lkb,
61 struct dlm_rsb *res)
62{
63 seq_printf(s, "%08x %s", lkb->lkb_id, print_lockmode(lkb->lkb_grmode));
64
65 if (lkb->lkb_status == DLM_LKSTS_CONVERT
66 || lkb->lkb_status == DLM_LKSTS_WAITING)
67 seq_printf(s, " (%s)", print_lockmode(lkb->lkb_rqmode));
68
69 if (lkb->lkb_nodeid) {
70 if (lkb->lkb_nodeid != res->res_nodeid)
71 seq_printf(s, " Remote: %3d %08x", lkb->lkb_nodeid,
72 lkb->lkb_remid);
73 else
74 seq_printf(s, " Master: %08x", lkb->lkb_remid);
75 }
76
77 if (lkb->lkb_wait_type)
78 seq_printf(s, " wait_type: %d", lkb->lkb_wait_type);
79
80 seq_printf(s, "\n");
81}
82
83static int print_resource(struct dlm_rsb *res, struct seq_file *s)
84{
85 struct dlm_lkb *lkb;
86 int i, lvblen = res->res_ls->ls_lvblen, recover_list, root_list;
87
88 seq_printf(s, "\nResource %p Name (len=%d) \"", res, res->res_length);
89 for (i = 0; i < res->res_length; i++) {
90 if (isprint(res->res_name[i]))
91 seq_printf(s, "%c", res->res_name[i]);
92 else
93 seq_printf(s, "%c", '.');
94 }
95 if (res->res_nodeid > 0)
96 seq_printf(s, "\" \nLocal Copy, Master is node %d\n",
97 res->res_nodeid);
98 else if (res->res_nodeid == 0)
99 seq_printf(s, "\" \nMaster Copy\n");
100 else if (res->res_nodeid == -1)
101 seq_printf(s, "\" \nLooking up master (lkid %x)\n",
102 res->res_first_lkid);
103 else
104 seq_printf(s, "\" \nInvalid master %d\n", res->res_nodeid);
105
106 /* Print the LVB: */
107 if (res->res_lvbptr) {
108 seq_printf(s, "LVB: ");
109 for (i = 0; i < lvblen; i++) {
110 if (i == lvblen / 2)
111 seq_printf(s, "\n ");
112 seq_printf(s, "%02x ",
113 (unsigned char) res->res_lvbptr[i]);
114 }
115 if (rsb_flag(res, RSB_VALNOTVALID))
116 seq_printf(s, " (INVALID)");
117 seq_printf(s, "\n");
118 }
119
120 root_list = !list_empty(&res->res_root_list);
121 recover_list = !list_empty(&res->res_recover_list);
122
123 if (root_list || recover_list) {
124 seq_printf(s, "Recovery: root %d recover %d flags %lx "
125 "count %d\n", root_list, recover_list,
126 res->res_flags, res->res_recover_locks_count);
127 }
128
129 /* Print the locks attached to this resource */
130 seq_printf(s, "Granted Queue\n");
131 list_for_each_entry(lkb, &res->res_grantqueue, lkb_statequeue)
132 print_lock(s, lkb, res);
133
134 seq_printf(s, "Conversion Queue\n");
135 list_for_each_entry(lkb, &res->res_convertqueue, lkb_statequeue)
136 print_lock(s, lkb, res);
137
138 seq_printf(s, "Waiting Queue\n");
139 list_for_each_entry(lkb, &res->res_waitqueue, lkb_statequeue)
140 print_lock(s, lkb, res);
141
142 if (list_empty(&res->res_lookup))
143 goto out;
144
145 seq_printf(s, "Lookup Queue\n");
146 list_for_each_entry(lkb, &res->res_lookup, lkb_rsb_lookup) {
147 seq_printf(s, "%08x %s", lkb->lkb_id,
148 print_lockmode(lkb->lkb_rqmode));
149 if (lkb->lkb_wait_type)
150 seq_printf(s, " wait_type: %d", lkb->lkb_wait_type);
151 seq_printf(s, "\n");
152 }
153 out:
154 return 0;
155}
156
157static int rsb_iter_next(struct rsb_iter *ri)
158{
159 struct dlm_ls *ls = ri->ls;
160 int i;
161
162 if (!ri->next) {
163 top:
164 /* Find the next non-empty hash bucket */
165 for (i = ri->entry; i < ls->ls_rsbtbl_size; i++) {
166 read_lock(&ls->ls_rsbtbl[i].lock);
167 if (!list_empty(&ls->ls_rsbtbl[i].list)) {
168 ri->next = ls->ls_rsbtbl[i].list.next;
169 read_unlock(&ls->ls_rsbtbl[i].lock);
170 break;
171 }
172 read_unlock(&ls->ls_rsbtbl[i].lock);
173 }
174 ri->entry = i;
175
176 if (ri->entry >= ls->ls_rsbtbl_size)
177 return 1;
178 } else {
179 i = ri->entry;
180 read_lock(&ls->ls_rsbtbl[i].lock);
181 ri->next = ri->next->next;
182 if (ri->next->next == ls->ls_rsbtbl[i].list.next) {
183 /* End of list - move to next bucket */
184 ri->next = NULL;
185 ri->entry++;
186 read_unlock(&ls->ls_rsbtbl[i].lock);
187 goto top;
188 }
189 read_unlock(&ls->ls_rsbtbl[i].lock);
190 }
191 ri->rsb = list_entry(ri->next, struct dlm_rsb, res_hashchain);
192
193 return 0;
194}
195
196static void rsb_iter_free(struct rsb_iter *ri)
197{
198 kfree(ri);
199}
200
201static struct rsb_iter *rsb_iter_init(struct dlm_ls *ls)
202{
203 struct rsb_iter *ri;
204
205 ri = kmalloc(sizeof *ri, GFP_KERNEL);
206 if (!ri)
207 return NULL;
208
209 ri->ls = ls;
210 ri->entry = 0;
211 ri->next = NULL;
212
213 if (rsb_iter_next(ri)) {
214 rsb_iter_free(ri);
215 return NULL;
216 }
217
218 return ri;
219}
220
221static void *rsb_seq_start(struct seq_file *file, loff_t *pos)
222{
223 struct rsb_iter *ri;
224 loff_t n = *pos;
225
226 ri = rsb_iter_init(file->private);
227 if (!ri)
228 return NULL;
229
230 while (n--) {
231 if (rsb_iter_next(ri)) {
232 rsb_iter_free(ri);
233 return NULL;
234 }
235 }
236
237 return ri;
238}
239
240static void *rsb_seq_next(struct seq_file *file, void *iter_ptr, loff_t *pos)
241{
242 struct rsb_iter *ri = iter_ptr;
243
244 (*pos)++;
245
246 if (rsb_iter_next(ri)) {
247 rsb_iter_free(ri);
248 return NULL;
249 }
250
251 return ri;
252}
253
254static void rsb_seq_stop(struct seq_file *file, void *iter_ptr)
255{
256 /* nothing for now */
257}
258
259static int rsb_seq_show(struct seq_file *file, void *iter_ptr)
260{
261 struct rsb_iter *ri = iter_ptr;
262
263 print_resource(ri->rsb, file);
264
265 return 0;
266}
267
268static struct seq_operations rsb_seq_ops = {
269 .start = rsb_seq_start,
270 .next = rsb_seq_next,
271 .stop = rsb_seq_stop,
272 .show = rsb_seq_show,
273};
274
275static int rsb_open(struct inode *inode, struct file *file)
276{
277 struct seq_file *seq;
278 int ret;
279
280 ret = seq_open(file, &rsb_seq_ops);
281 if (ret)
282 return ret;
283
284 seq = file->private_data;
285 seq->private = inode->i_private;
286
287 return 0;
288}
289
290static struct file_operations rsb_fops = {
291 .owner = THIS_MODULE,
292 .open = rsb_open,
293 .read = seq_read,
294 .llseek = seq_lseek,
295 .release = seq_release
296};
297
298/*
299 * dump lkb's on the ls_waiters list
300 */
301
302static int waiters_open(struct inode *inode, struct file *file)
303{
304 file->private_data = inode->i_private;
305 return 0;
306}
307
308static ssize_t waiters_read(struct file *file, char __user *userbuf,
309 size_t count, loff_t *ppos)
310{
311 struct dlm_ls *ls = file->private_data;
312 struct dlm_lkb *lkb;
313 size_t len = DLM_DEBUG_BUF_LEN, pos = 0, ret, rv;
314
315 mutex_lock(&debug_buf_lock);
316 mutex_lock(&ls->ls_waiters_mutex);
317 memset(debug_buf, 0, sizeof(debug_buf));
318
319 list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) {
320 ret = snprintf(debug_buf + pos, len - pos, "%x %d %d %s\n",
321 lkb->lkb_id, lkb->lkb_wait_type,
322 lkb->lkb_nodeid, lkb->lkb_resource->res_name);
323 if (ret >= len - pos)
324 break;
325 pos += ret;
326 }
327 mutex_unlock(&ls->ls_waiters_mutex);
328
329 rv = simple_read_from_buffer(userbuf, count, ppos, debug_buf, pos);
330 mutex_unlock(&debug_buf_lock);
331 return rv;
332}
333
334static struct file_operations waiters_fops = {
335 .owner = THIS_MODULE,
336 .open = waiters_open,
337 .read = waiters_read
338};
339
340int dlm_create_debug_file(struct dlm_ls *ls)
341{
342 char name[DLM_LOCKSPACE_LEN+8];
343
344 ls->ls_debug_rsb_dentry = debugfs_create_file(ls->ls_name,
345 S_IFREG | S_IRUGO,
346 dlm_root,
347 ls,
348 &rsb_fops);
349 if (!ls->ls_debug_rsb_dentry)
350 return -ENOMEM;
351
352 memset(name, 0, sizeof(name));
353 snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_waiters", ls->ls_name);
354
355 ls->ls_debug_waiters_dentry = debugfs_create_file(name,
356 S_IFREG | S_IRUGO,
357 dlm_root,
358 ls,
359 &waiters_fops);
360 if (!ls->ls_debug_waiters_dentry) {
361 debugfs_remove(ls->ls_debug_rsb_dentry);
362 return -ENOMEM;
363 }
364
365 return 0;
366}
367
368void dlm_delete_debug_file(struct dlm_ls *ls)
369{
370 if (ls->ls_debug_rsb_dentry)
371 debugfs_remove(ls->ls_debug_rsb_dentry);
372 if (ls->ls_debug_waiters_dentry)
373 debugfs_remove(ls->ls_debug_waiters_dentry);
374}
375
376int dlm_register_debugfs(void)
377{
378 mutex_init(&debug_buf_lock);
379 dlm_root = debugfs_create_dir("dlm", NULL);
380 return dlm_root ? 0 : -ENOMEM;
381}
382
383void dlm_unregister_debugfs(void)
384{
385 debugfs_remove(dlm_root);
386}
387
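
The debugfs pieces above wire together in two stages: dlm_register_debugfs() creates the shared "dlm" directory once at module init, then each lockspace adds a <name> file (the seq_file resource dump) and a <name>_waiters file beside it. A sketch of the assumed lifecycle:

    /* Sketch: module init registers the directory; per-lockspace
     * setup and teardown use the create/delete pair shown above. */
    static int example_module_init(void)
    {
            int error = dlm_register_debugfs();
            if (error)
                    return error;
            return 0;
    }

    /* per lockspace, after it is set up:
     *     dlm_create_debug_file(ls);  -> <debugfs>/dlm/<ls_name>
     *                                    <debugfs>/dlm/<ls_name>_waiters
     * and dlm_delete_debug_file(ls) before the lockspace is freed,
     * with dlm_unregister_debugfs() at module exit. */
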
diff --git a/fs/dlm/dir.c b/fs/dlm/dir.c
new file mode 100644
index 000000000000..46754553fdcc
--- /dev/null
+++ b/fs/dlm/dir.c
@@ -0,0 +1,423 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#include "dlm_internal.h"
15#include "lockspace.h"
16#include "member.h"
17#include "lowcomms.h"
18#include "rcom.h"
19#include "config.h"
20#include "memory.h"
21#include "recover.h"
22#include "util.h"
23#include "lock.h"
24#include "dir.h"
25
26
27static void put_free_de(struct dlm_ls *ls, struct dlm_direntry *de)
28{
29 spin_lock(&ls->ls_recover_list_lock);
30 list_add(&de->list, &ls->ls_recover_list);
31 spin_unlock(&ls->ls_recover_list_lock);
32}
33
34static struct dlm_direntry *get_free_de(struct dlm_ls *ls, int len)
35{
36 int found = 0;
37 struct dlm_direntry *de;
38
39 spin_lock(&ls->ls_recover_list_lock);
40 list_for_each_entry(de, &ls->ls_recover_list, list) {
41 if (de->length == len) {
42 list_del(&de->list);
43 de->master_nodeid = 0;
44 memset(de->name, 0, len);
45 found = 1;
46 break;
47 }
48 }
49 spin_unlock(&ls->ls_recover_list_lock);
50
51 if (!found)
52 de = allocate_direntry(ls, len);
53 return de;
54}
55
56void dlm_clear_free_entries(struct dlm_ls *ls)
57{
58 struct dlm_direntry *de;
59
60 spin_lock(&ls->ls_recover_list_lock);
61 while (!list_empty(&ls->ls_recover_list)) {
62 de = list_entry(ls->ls_recover_list.next, struct dlm_direntry,
63 list);
64 list_del(&de->list);
65 free_direntry(de);
66 }
67 spin_unlock(&ls->ls_recover_list_lock);
68}
69
70/*
71 * We use the upper 16 bits of the hash value to select the directory node.
72 * Low bits are used for distribution of rsb's among hash buckets on each node.
73 *
74 * To give the exact range wanted (0 to num_nodes-1), we apply a modulus of
75 * num_nodes to the hash value. This value in the desired range is used as an
76 * offset into the sorted list of nodeid's to give the particular nodeid.
77 */
78
79int dlm_hash2nodeid(struct dlm_ls *ls, uint32_t hash)
80{
81 struct list_head *tmp;
82 struct dlm_member *memb = NULL;
83 uint32_t node, n = 0;
84 int nodeid;
85
86 if (ls->ls_num_nodes == 1) {
87 nodeid = dlm_our_nodeid();
88 goto out;
89 }
90
91 if (ls->ls_node_array) {
92 node = (hash >> 16) % ls->ls_total_weight;
93 nodeid = ls->ls_node_array[node];
94 goto out;
95 }
96
97 /* make_member_array() failed to kmalloc ls_node_array... */
98
99 node = (hash >> 16) % ls->ls_num_nodes;
100
101 list_for_each(tmp, &ls->ls_nodes) {
102 if (n++ != node)
103 continue;
104 memb = list_entry(tmp, struct dlm_member, list);
105 break;
106 }
107
108 DLM_ASSERT(memb, printk("num_nodes=%u n=%u node=%u\n",
109 ls->ls_num_nodes, n, node););
110 nodeid = memb->nodeid;
111 out:
112 return nodeid;
113}
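/* A worked example of the selection above (hypothetical values, not
   from a real lockspace): with hash 0x8001beef and a node array of
   total weight 4, the index is (0x8001beef >> 16) % 4 = 0x8001 % 4 = 1,
   so ls_node_array[1] is the directory node for every resource name
   hashing to that value. */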
114
115int dlm_dir_nodeid(struct dlm_rsb *r)
116{
117 return dlm_hash2nodeid(r->res_ls, r->res_hash);
118}
119
120static inline uint32_t dir_hash(struct dlm_ls *ls, char *name, int len)
121{
122 uint32_t val;
123
124 val = jhash(name, len, 0);
125 val &= (ls->ls_dirtbl_size - 1);
126
127 return val;
128}
129
130static void add_entry_to_hash(struct dlm_ls *ls, struct dlm_direntry *de)
131{
132 uint32_t bucket;
133
134 bucket = dir_hash(ls, de->name, de->length);
135 list_add_tail(&de->list, &ls->ls_dirtbl[bucket].list);
136}
137
138static struct dlm_direntry *search_bucket(struct dlm_ls *ls, char *name,
139 int namelen, uint32_t bucket)
140{
141 struct dlm_direntry *de;
142
143 list_for_each_entry(de, &ls->ls_dirtbl[bucket].list, list) {
144 if (de->length == namelen && !memcmp(name, de->name, namelen))
145 goto out;
146 }
147 de = NULL;
148 out:
149 return de;
150}
151
152void dlm_dir_remove_entry(struct dlm_ls *ls, int nodeid, char *name, int namelen)
153{
154 struct dlm_direntry *de;
155 uint32_t bucket;
156
157 bucket = dir_hash(ls, name, namelen);
158
159 write_lock(&ls->ls_dirtbl[bucket].lock);
160
161 de = search_bucket(ls, name, namelen, bucket);
162
163 if (!de) {
164 log_error(ls, "remove fr %u none", nodeid);
165 goto out;
166 }
167
168 if (de->master_nodeid != nodeid) {
169 log_error(ls, "remove fr %u ID %u", nodeid, de->master_nodeid);
170 goto out;
171 }
172
173 list_del(&de->list);
174 free_direntry(de);
175 out:
176 write_unlock(&ls->ls_dirtbl[bucket].lock);
177}
178
179void dlm_dir_clear(struct dlm_ls *ls)
180{
181 struct list_head *head;
182 struct dlm_direntry *de;
183 int i;
184
185 DLM_ASSERT(list_empty(&ls->ls_recover_list), );
186
187 for (i = 0; i < ls->ls_dirtbl_size; i++) {
188 write_lock(&ls->ls_dirtbl[i].lock);
189 head = &ls->ls_dirtbl[i].list;
190 while (!list_empty(head)) {
191 de = list_entry(head->next, struct dlm_direntry, list);
192 list_del(&de->list);
193 put_free_de(ls, de);
194 }
195 write_unlock(&ls->ls_dirtbl[i].lock);
196 }
197}
198
199int dlm_recover_directory(struct dlm_ls *ls)
200{
201 struct dlm_member *memb;
202 struct dlm_direntry *de;
203 char *b, *last_name = NULL;
204 int error = -ENOMEM, last_len, count = 0;
205 uint16_t namelen;
206
207 log_debug(ls, "dlm_recover_directory");
208
209 if (dlm_no_directory(ls))
210 goto out_status;
211
212 dlm_dir_clear(ls);
213
214 last_name = kmalloc(DLM_RESNAME_MAXLEN, GFP_KERNEL);
215 if (!last_name)
216 goto out;
217
218 list_for_each_entry(memb, &ls->ls_nodes, list) {
219 memset(last_name, 0, DLM_RESNAME_MAXLEN);
220 last_len = 0;
221
222 for (;;) {
223 error = dlm_recovery_stopped(ls);
224 if (error)
225 goto out_free;
226
227 error = dlm_rcom_names(ls, memb->nodeid,
228 last_name, last_len);
229 if (error)
230 goto out_free;
231
232 schedule();
233
234 /*
235 * pick namelen/name pairs out of received buffer
236 */
237
238 b = ls->ls_recover_buf + sizeof(struct dlm_rcom);
239
240 for (;;) {
241 memcpy(&namelen, b, sizeof(uint16_t));
242 namelen = be16_to_cpu(namelen);
243 b += sizeof(uint16_t);
244
245 /* namelen of 0xFFFF marks end of names for
246 this node; namelen of 0 marks end of the
247 buffer */
248
249 if (namelen == 0xFFFF)
250 goto done;
251 if (!namelen)
252 break;
253
254 error = -ENOMEM;
255 de = get_free_de(ls, namelen);
256 if (!de)
257 goto out_free;
258
259 de->master_nodeid = memb->nodeid;
260 de->length = namelen;
261 last_len = namelen;
262 memcpy(de->name, b, namelen);
263 memcpy(last_name, b, namelen);
264 b += namelen;
265
266 add_entry_to_hash(ls, de);
267 count++;
268 }
269 }
270 done:
271 ;
272 }
273
274 out_status:
275 error = 0;
276 dlm_set_recover_status(ls, DLM_RS_DIR);
277 log_debug(ls, "dlm_recover_directory %d entries", count);
278 out_free:
279 kfree(last_name);
280 out:
281 dlm_clear_free_entries(ls);
282 return error;
283}
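/* The name block parsed above has this layout on the wire (lengths
   big-endian, as produced by dlm_copy_master_names() below):

     uint16_t len1, char name1[len1], uint16_t len2, char name2[len2],
     ... terminated by 0x0000 (end of this buffer; request the next one
     with last_name/last_len) or 0xFFFF (no more names from this node). */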
284
285static int get_entry(struct dlm_ls *ls, int nodeid, char *name,
286 int namelen, int *r_nodeid)
287{
288 struct dlm_direntry *de, *tmp;
289 uint32_t bucket;
290
291 bucket = dir_hash(ls, name, namelen);
292
293 write_lock(&ls->ls_dirtbl[bucket].lock);
294 de = search_bucket(ls, name, namelen, bucket);
295 if (de) {
296 *r_nodeid = de->master_nodeid;
297 write_unlock(&ls->ls_dirtbl[bucket].lock);
298 if (*r_nodeid == nodeid)
299 return -EEXIST;
300 return 0;
301 }
302
303 write_unlock(&ls->ls_dirtbl[bucket].lock);
304
305 de = allocate_direntry(ls, namelen);
306 if (!de)
307 return -ENOMEM;
308
309 de->master_nodeid = nodeid;
310 de->length = namelen;
311 memcpy(de->name, name, namelen);
312
313 write_lock(&ls->ls_dirtbl[bucket].lock);
314 tmp = search_bucket(ls, name, namelen, bucket);
315 if (tmp) {
316 free_direntry(de);
317 de = tmp;
318 } else {
319 list_add_tail(&de->list, &ls->ls_dirtbl[bucket].list);
320 }
321 *r_nodeid = de->master_nodeid;
322 write_unlock(&ls->ls_dirtbl[bucket].lock);
323 return 0;
324}
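/* Note the pattern in get_entry() above: the bucket lock is dropped
   around the blocking allocate_direntry() call, so the bucket must be
   searched again once the lock is retaken -- another thread may have
   added the same name in the meantime, in which case the new entry is
   freed and the existing one is used. */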
325
326int dlm_dir_lookup(struct dlm_ls *ls, int nodeid, char *name, int namelen,
327 int *r_nodeid)
328{
329 return get_entry(ls, nodeid, name, namelen, r_nodeid);
330}
331
332/* Copy the names of master rsb's into the buffer provided.
333 Only select names whose dir node is the given nodeid. */
334
335void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen,
336 char *outbuf, int outlen, int nodeid)
337{
338 struct list_head *list;
339 struct dlm_rsb *start_r = NULL, *r = NULL;
340 int offset = 0, start_namelen, error, dir_nodeid;
341 char *start_name;
342 uint16_t be_namelen;
343
344 /*
345 * Find the rsb where we left off (or start again)
346 */
347
348 start_namelen = inlen;
349 start_name = inbuf;
350
351 if (start_namelen > 1) {
352 /*
353 * We could also use a find_rsb_root() function here that
354 * searched the ls_root_list.
355 */
356 error = dlm_find_rsb(ls, start_name, start_namelen, R_MASTER,
357 &start_r);
358 DLM_ASSERT(!error && start_r,
359 printk("error %d\n", error););
360 DLM_ASSERT(!list_empty(&start_r->res_root_list),
361 dlm_print_rsb(start_r););
362 dlm_put_rsb(start_r);
363 }
364
365 /*
366 * Send rsb names for rsb's we're master of and whose directory node
367 * matches the requesting node.
368 */
369
370 down_read(&ls->ls_root_sem);
371 if (start_r)
372 list = start_r->res_root_list.next;
373 else
374 list = ls->ls_root_list.next;
375
376 for (offset = 0; list != &ls->ls_root_list; list = list->next) {
377 r = list_entry(list, struct dlm_rsb, res_root_list);
378 if (r->res_nodeid)
379 continue;
380
381 dir_nodeid = dlm_dir_nodeid(r);
382 if (dir_nodeid != nodeid)
383 continue;
384
385 /*
386 * The block ends when we can't fit the following in the
387 * remaining buffer space:
388 * namelen (uint16_t) +
389 * name (r->res_length) +
390 * end-of-block record 0x0000 (uint16_t)
391 */
392
393 if (offset + sizeof(uint16_t)*2 + r->res_length > outlen) {
394 /* Write end-of-block record */
395 be_namelen = 0;
396 memcpy(outbuf + offset, &be_namelen, sizeof(uint16_t));
397 offset += sizeof(uint16_t);
398 goto out;
399 }
400
401 be_namelen = cpu_to_be16(r->res_length);
402 memcpy(outbuf + offset, &be_namelen, sizeof(uint16_t));
403 offset += sizeof(uint16_t);
404 memcpy(outbuf + offset, r->res_name, r->res_length);
405 offset += r->res_length;
406 }
407
408 /*
409 * If we've reached the end of the list (and there's room) write a
410 * terminating record.
411 */
412
413 if ((list == &ls->ls_root_list) &&
414 (offset + sizeof(uint16_t) <= outlen)) {
415 be_namelen = 0xFFFF;
416 memcpy(outbuf + offset, &be_namelen, sizeof(uint16_t));
417 offset += sizeof(uint16_t);
418 }
419
420 out:
421 up_read(&ls->ls_root_sem);
422}
423
diff --git a/fs/dlm/dir.h b/fs/dlm/dir.h
new file mode 100644
index 000000000000..0b0eb1267b6e
--- /dev/null
+++ b/fs/dlm/dir.h
@@ -0,0 +1,30 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __DIR_DOT_H__
15#define __DIR_DOT_H__
16
17
18int dlm_dir_nodeid(struct dlm_rsb *rsb);
19int dlm_hash2nodeid(struct dlm_ls *ls, uint32_t hash);
20void dlm_dir_remove_entry(struct dlm_ls *ls, int nodeid, char *name, int len);
21void dlm_dir_clear(struct dlm_ls *ls);
22void dlm_clear_free_entries(struct dlm_ls *ls);
23int dlm_recover_directory(struct dlm_ls *ls);
24int dlm_dir_lookup(struct dlm_ls *ls, int nodeid, char *name, int namelen,
25 int *r_nodeid);
26void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen,
27 char *outbuf, int outlen, int nodeid);
28
29#endif /* __DIR_DOT_H__ */
30
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
new file mode 100644
index 000000000000..1e5cd67e1b7a
--- /dev/null
+++ b/fs/dlm/dlm_internal.h
@@ -0,0 +1,543 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __DLM_INTERNAL_DOT_H__
15#define __DLM_INTERNAL_DOT_H__
16
17/*
18 * This is the main header file to be included in each DLM source file.
19 */
20
21#include <linux/module.h>
22#include <linux/slab.h>
23#include <linux/sched.h>
24#include <linux/types.h>
25#include <linux/ctype.h>
26#include <linux/spinlock.h>
27#include <linux/vmalloc.h>
28#include <linux/list.h>
29#include <linux/errno.h>
30#include <linux/random.h>
31#include <linux/delay.h>
32#include <linux/socket.h>
33#include <linux/kthread.h>
34#include <linux/kobject.h>
35#include <linux/kref.h>
36#include <linux/kernel.h>
37#include <linux/jhash.h>
38#include <linux/miscdevice.h>
39#include <linux/mutex.h>
40#include <asm/semaphore.h>
41#include <asm/uaccess.h>
42
43#include <linux/dlm.h>
44
45#define DLM_LOCKSPACE_LEN 64
46
47/* Size of the temp buffer midcomms allocates on the stack.
48 We try to make this large enough so most messages fit.
49 FIXME: should sctp make this unnecessary? */
50
51#define DLM_INBUF_LEN 148
52
53struct dlm_ls;
54struct dlm_lkb;
55struct dlm_rsb;
56struct dlm_member;
57struct dlm_lkbtable;
58struct dlm_rsbtable;
59struct dlm_dirtable;
60struct dlm_direntry;
61struct dlm_recover;
62struct dlm_header;
63struct dlm_message;
64struct dlm_rcom;
65struct dlm_mhandle;
66
67#define log_print(fmt, args...) \
68 printk(KERN_ERR "dlm: "fmt"\n" , ##args)
69#define log_error(ls, fmt, args...) \
70 printk(KERN_ERR "dlm: %s: " fmt "\n", (ls)->ls_name , ##args)
71
72#define DLM_LOG_DEBUG
73#ifdef DLM_LOG_DEBUG
74#define log_debug(ls, fmt, args...) log_error(ls, fmt, ##args)
75#else
76#define log_debug(ls, fmt, args...)
77#endif
78
79#define DLM_ASSERT(x, do) \
80{ \
81 if (!(x)) \
82 { \
83 printk(KERN_ERR "\nDLM: Assertion failed on line %d of file %s\n" \
84 "DLM: assertion: \"%s\"\n" \
85 "DLM: time = %lu\n", \
86 __LINE__, __FILE__, #x, jiffies); \
87 {do} \
88 printk("\n"); \
89 BUG(); \
90 panic("DLM: Record message above and reboot.\n"); \
91 } \
92}
93
94#define DLM_FAKE_USER_AST ERR_PTR(-EINVAL)
95
96
97struct dlm_direntry {
98 struct list_head list;
99 uint32_t master_nodeid;
100 uint16_t length;
101 char name[1];
102};
103
104struct dlm_dirtable {
105 struct list_head list;
106 rwlock_t lock;
107};
108
109struct dlm_rsbtable {
110 struct list_head list;
111 struct list_head toss;
112 rwlock_t lock;
113};
114
115struct dlm_lkbtable {
116 struct list_head list;
117 rwlock_t lock;
118 uint16_t counter;
119};
120
121/*
122 * Lockspace member (per node in a ls)
123 */
124
125struct dlm_member {
126 struct list_head list;
127 int nodeid;
128 int weight;
129};
130
131/*
132 * Save and manage recovery state for a lockspace.
133 */
134
135struct dlm_recover {
136 struct list_head list;
137 int *nodeids;
138 int node_count;
139 uint64_t seq;
140};
141
142/*
143 * Pass input args to second stage locking function.
144 */
145
146struct dlm_args {
147 uint32_t flags;
148 void *astaddr;
149 long astparam;
150 void *bastaddr;
151 int mode;
152 struct dlm_lksb *lksb;
153};
154
155
156/*
157 * Lock block
158 *
159 * A lock can be one of three types:
160 *
161 * local copy lock is mastered locally
162 * (lkb_nodeid is zero and DLM_LKF_MSTCPY is not set)
163 * process copy lock is mastered on a remote node
164 * (lkb_nodeid is non-zero and DLM_LKF_MSTCPY is not set)
165 * master copy master node's copy of a lock owned by remote node
166 * (lkb_nodeid is non-zero and DLM_LKF_MSTCPY is set)
167 *
168 * lkb_exflags: a copy of the most recent flags arg provided to dlm_lock or
169 * dlm_unlock. The dlm does not modify these or use any private flags in
170 * this field; it only contains DLM_LKF_ flags from dlm.h. These flags
171 * are sent as-is to the remote master when the lock is remote.
172 *
173 * lkb_flags: internal dlm flags (DLM_IFL_ prefix) from dlm_internal.h.
174 * Some internal flags are shared between the master and process nodes;
175 * these shared flags are kept in the lower two bytes. One of these
176 * flags set on the master copy will be propagated to the process copy
177 * and v.v. Other internal flags are private to the master or process
178 * node (e.g. DLM_IFL_MSTCPY). These are kept in the high two bytes.
179 *
180 * lkb_sbflags: status block flags. These flags are copied directly into
181 * the caller's lksb.sb_flags prior to the dlm_lock/dlm_unlock completion
182 * ast. All defined in dlm.h with DLM_SBF_ prefix.
183 *
184 * lkb_status: the lock status indicates which rsb queue the lock is
185 * on, grant, convert, or wait. DLM_LKSTS_ WAITING/GRANTED/CONVERT
186 *
187 * lkb_wait_type: the dlm message type (DLM_MSG_ prefix) for which a
188 * reply is needed. Only set when the lkb is on the lockspace waiters
189 * list awaiting a reply from a remote node.
190 *
191 * lkb_nodeid: when the lkb is a local copy, nodeid is 0; when the lkb
192 * is a master copy, nodeid specifies the remote lock holder, when the
193 * lkb is a process copy, the nodeid specifies the lock master.
194 */
195
196/* lkb_ast_type */
197
198#define AST_COMP 1
199#define AST_BAST 2
200
201/* lkb_status */
202
203#define DLM_LKSTS_WAITING 1
204#define DLM_LKSTS_GRANTED 2
205#define DLM_LKSTS_CONVERT 3
206
207/* lkb_flags */
208
209#define DLM_IFL_MSTCPY 0x00010000
210#define DLM_IFL_RESEND 0x00020000
211#define DLM_IFL_DEAD 0x00040000
212#define DLM_IFL_USER 0x00000001
213#define DLM_IFL_ORPHAN 0x00000002
214
215struct dlm_lkb {
216 struct dlm_rsb *lkb_resource; /* the rsb */
217 struct kref lkb_ref;
218 int lkb_nodeid; /* copied from rsb */
219 int lkb_ownpid; /* pid of lock owner */
220 uint32_t lkb_id; /* our lock ID */
221 uint32_t lkb_remid; /* lock ID on remote partner */
222 uint32_t lkb_exflags; /* external flags from caller */
223 uint32_t lkb_sbflags; /* lksb flags */
224 uint32_t lkb_flags; /* internal flags */
225 uint32_t lkb_lvbseq; /* lvb sequence number */
226
227 int8_t lkb_status; /* granted, waiting, convert */
228 int8_t lkb_rqmode; /* requested lock mode */
229 int8_t lkb_grmode; /* granted lock mode */
230 int8_t lkb_bastmode; /* requested mode */
231 int8_t lkb_highbast; /* highest mode bast sent for */
232
233 int8_t lkb_wait_type; /* type of reply waiting for */
234 int8_t lkb_ast_type; /* type of ast queued for */
235
236 struct list_head lkb_idtbl_list; /* lockspace lkbtbl */
237 struct list_head lkb_statequeue; /* rsb g/c/w list */
238 struct list_head lkb_rsb_lookup; /* waiting for rsb lookup */
239 struct list_head lkb_wait_reply; /* waiting for remote reply */
240 struct list_head lkb_astqueue; /* need ast to be sent */
241 struct list_head lkb_ownqueue; /* list of locks for a process */
242
243 char *lkb_lvbptr;
244 struct dlm_lksb *lkb_lksb; /* caller's status block */
245 void *lkb_astaddr; /* caller's ast function */
246 void *lkb_bastaddr; /* caller's bast function */
247 long lkb_astparam; /* caller's ast arg */
248};
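/* A minimal sketch (illustrative, not part of the dlm API) of
   classifying an lkb into the three lock types documented above: */

static inline const char *lkb_type_name(struct dlm_lkb *lkb)
{
	if (lkb->lkb_flags & DLM_IFL_MSTCPY)
		return "master copy";	/* master's copy of a remote holder's lock */
	if (lkb->lkb_nodeid)
		return "process copy";	/* our lock, mastered on a remote node */
	return "local copy";		/* our lock, mastered locally */
}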
249
250
251struct dlm_rsb {
252 struct dlm_ls *res_ls; /* the lockspace */
253 struct kref res_ref;
254 struct mutex res_mutex;
255 unsigned long res_flags;
256 int res_length; /* length of rsb name */
257 int res_nodeid;
258 uint32_t res_lvbseq;
259 uint32_t res_hash;
260 uint32_t res_bucket; /* rsbtbl */
261 unsigned long res_toss_time;
262 uint32_t res_first_lkid;
263 struct list_head res_lookup; /* lkbs waiting on first */
264 struct list_head res_hashchain; /* rsbtbl */
265 struct list_head res_grantqueue;
266 struct list_head res_convertqueue;
267 struct list_head res_waitqueue;
268
269 struct list_head res_root_list; /* used for recovery */
270 struct list_head res_recover_list; /* used for recovery */
271 int res_recover_locks_count;
272
273 char *res_lvbptr;
274 char res_name[1];
275};
276
277/* find_rsb() flags */
278
279#define R_MASTER 1 /* only return rsb if it's a master */
280#define R_CREATE 2 /* create/add rsb if not found */
281
282/* rsb_flags */
283
284enum rsb_flags {
285 RSB_MASTER_UNCERTAIN,
286 RSB_VALNOTVALID,
287 RSB_VALNOTVALID_PREV,
288 RSB_NEW_MASTER,
289 RSB_NEW_MASTER2,
290 RSB_RECOVER_CONVERT,
291 RSB_LOCKS_PURGED,
292};
293
294static inline void rsb_set_flag(struct dlm_rsb *r, enum rsb_flags flag)
295{
296 __set_bit(flag, &r->res_flags);
297}
298
299static inline void rsb_clear_flag(struct dlm_rsb *r, enum rsb_flags flag)
300{
301 __clear_bit(flag, &r->res_flags);
302}
303
304static inline int rsb_flag(struct dlm_rsb *r, enum rsb_flags flag)
305{
306 return test_bit(flag, &r->res_flags);
307}
308
309
310/* dlm_header is first element of all structs sent between nodes */
311
312#define DLM_HEADER_MAJOR 0x00020000
313#define DLM_HEADER_MINOR 0x00000001
314
315#define DLM_MSG 1
316#define DLM_RCOM 2
317
318struct dlm_header {
319 uint32_t h_version;
320 uint32_t h_lockspace;
321 uint32_t h_nodeid; /* nodeid of sender */
322 uint16_t h_length;
323 uint8_t h_cmd; /* DLM_MSG, DLM_RCOM */
324 uint8_t h_pad;
325};
326
327
328#define DLM_MSG_REQUEST 1
329#define DLM_MSG_CONVERT 2
330#define DLM_MSG_UNLOCK 3
331#define DLM_MSG_CANCEL 4
332#define DLM_MSG_REQUEST_REPLY 5
333#define DLM_MSG_CONVERT_REPLY 6
334#define DLM_MSG_UNLOCK_REPLY 7
335#define DLM_MSG_CANCEL_REPLY 8
336#define DLM_MSG_GRANT 9
337#define DLM_MSG_BAST 10
338#define DLM_MSG_LOOKUP 11
339#define DLM_MSG_REMOVE 12
340#define DLM_MSG_LOOKUP_REPLY 13
341
342struct dlm_message {
343 struct dlm_header m_header;
344 uint32_t m_type; /* DLM_MSG_ */
345 uint32_t m_nodeid;
346 uint32_t m_pid;
347 uint32_t m_lkid; /* lkid on sender */
348 uint32_t m_remid; /* lkid on receiver */
349 uint32_t m_parent_lkid;
350 uint32_t m_parent_remid;
351 uint32_t m_exflags;
352 uint32_t m_sbflags;
353 uint32_t m_flags;
354 uint32_t m_lvbseq;
355 uint32_t m_hash;
356 int m_status;
357 int m_grmode;
358 int m_rqmode;
359 int m_bastmode;
360 int m_asts;
361 int m_result; /* 0 or -EXXX */
362 char m_extra[0]; /* name or lvb */
363};
364
365
366#define DLM_RS_NODES 0x00000001
367#define DLM_RS_NODES_ALL 0x00000002
368#define DLM_RS_DIR 0x00000004
369#define DLM_RS_DIR_ALL 0x00000008
370#define DLM_RS_LOCKS 0x00000010
371#define DLM_RS_LOCKS_ALL 0x00000020
372#define DLM_RS_DONE 0x00000040
373#define DLM_RS_DONE_ALL 0x00000080
374
375#define DLM_RCOM_STATUS 1
376#define DLM_RCOM_NAMES 2
377#define DLM_RCOM_LOOKUP 3
378#define DLM_RCOM_LOCK 4
379#define DLM_RCOM_STATUS_REPLY 5
380#define DLM_RCOM_NAMES_REPLY 6
381#define DLM_RCOM_LOOKUP_REPLY 7
382#define DLM_RCOM_LOCK_REPLY 8
383
384struct dlm_rcom {
385 struct dlm_header rc_header;
386 uint32_t rc_type; /* DLM_RCOM_ */
387 int rc_result; /* multi-purpose */
388 uint64_t rc_id; /* match reply with request */
389 char rc_buf[0];
390};
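/* The directory recovery loop in dir.c drives DLM_RCOM_NAMES requests
   via dlm_rcom_names(); the reply's rc_buf carries the big-endian
   namelen/name block that dlm_recover_directory() parses. */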
391
392struct rcom_config {
393 uint32_t rf_lvblen;
394 uint32_t rf_lsflags;
395 uint64_t rf_unused;
396};
397
398struct rcom_lock {
399 uint32_t rl_ownpid;
400 uint32_t rl_lkid;
401 uint32_t rl_remid;
402 uint32_t rl_parent_lkid;
403 uint32_t rl_parent_remid;
404 uint32_t rl_exflags;
405 uint32_t rl_flags;
406 uint32_t rl_lvbseq;
407 int rl_result;
408 int8_t rl_rqmode;
409 int8_t rl_grmode;
410 int8_t rl_status;
411 int8_t rl_asts;
412 uint16_t rl_wait_type;
413 uint16_t rl_namelen;
414 char rl_name[DLM_RESNAME_MAXLEN];
415 char rl_lvb[0];
416};
417
418struct dlm_ls {
419 struct list_head ls_list; /* list of lockspaces */
420 dlm_lockspace_t *ls_local_handle;
421 uint32_t ls_global_id; /* global unique lockspace ID */
422 uint32_t ls_exflags;
423 int ls_lvblen;
424 int ls_count; /* reference count */
425 unsigned long ls_flags; /* LSFL_ */
426 struct kobject ls_kobj;
427
428 struct dlm_rsbtable *ls_rsbtbl;
429 uint32_t ls_rsbtbl_size;
430
431 struct dlm_lkbtable *ls_lkbtbl;
432 uint32_t ls_lkbtbl_size;
433
434 struct dlm_dirtable *ls_dirtbl;
435 uint32_t ls_dirtbl_size;
436
437 struct mutex ls_waiters_mutex;
438 struct list_head ls_waiters; /* lkbs needing a reply */
439
440 struct list_head ls_nodes; /* current nodes in ls */
441 struct list_head ls_nodes_gone; /* dead node list, recovery */
442 int ls_num_nodes; /* number of nodes in ls */
443 int ls_low_nodeid;
444 int ls_total_weight;
445 int *ls_node_array;
446
447 struct dlm_rsb ls_stub_rsb; /* for returning errors */
448 struct dlm_lkb ls_stub_lkb; /* for returning errors */
449 struct dlm_message ls_stub_ms; /* for faking a reply */
450
451 struct dentry *ls_debug_rsb_dentry; /* debugfs */
452 struct dentry *ls_debug_waiters_dentry; /* debugfs */
453
454 wait_queue_head_t ls_uevent_wait; /* user part of join/leave */
455 int ls_uevent_result;
456
457 struct miscdevice ls_device;
458
459 /* recovery related */
460
461 struct timer_list ls_timer;
462 struct task_struct *ls_recoverd_task;
463 struct mutex ls_recoverd_active;
464 spinlock_t ls_recover_lock;
465 uint32_t ls_recover_status; /* DLM_RS_ */
466 uint64_t ls_recover_seq;
467 struct dlm_recover *ls_recover_args;
468 struct rw_semaphore ls_in_recovery; /* block local requests */
469 struct list_head ls_requestqueue;/* queue remote requests */
470 struct mutex ls_requestqueue_mutex;
471 char *ls_recover_buf;
472 int ls_recover_nodeid; /* for debugging */
473 uint64_t ls_rcom_seq;
474 struct list_head ls_recover_list;
475 spinlock_t ls_recover_list_lock;
476 int ls_recover_list_count;
477 wait_queue_head_t ls_wait_general;
478 struct mutex ls_clear_proc_locks;
479
480 struct list_head ls_root_list; /* root resources */
481 struct rw_semaphore ls_root_sem; /* protect root_list */
482
483 int ls_namelen;
484 char ls_name[1];
485};
486
487#define LSFL_WORK 0
488#define LSFL_RUNNING 1
489#define LSFL_RECOVERY_STOP 2
490#define LSFL_RCOM_READY 3
491#define LSFL_UEVENT_WAIT 4
492
493/* much of this is just saving user space pointers associated with the
494 lock that we pass back to the user lib with an ast */
495
496struct dlm_user_args {
497 struct dlm_user_proc *proc; /* each process that opens the lockspace
498 device has private data
499 (dlm_user_proc) on the struct file,
500 the process's locks point back to it */
501 struct dlm_lksb lksb;
502 int old_mode;
503 int update_user_lvb;
504 struct dlm_lksb __user *user_lksb;
505 void __user *castparam;
506 void __user *castaddr;
507 void __user *bastparam;
508 void __user *bastaddr;
509};
510
511#define DLM_PROC_FLAGS_CLOSING 1
512#define DLM_PROC_FLAGS_COMPAT 2
513
514/* locks list is kept so we can remove all a process's locks when it
515 exits (or orphan those that are persistent) */
516
517struct dlm_user_proc {
518 dlm_lockspace_t *lockspace;
519 unsigned long flags; /* DLM_PROC_FLAGS */
520 struct list_head asts;
521 spinlock_t asts_spin;
522 struct list_head locks;
523 spinlock_t locks_spin;
524 wait_queue_head_t wait;
525};
526
527static inline int dlm_locking_stopped(struct dlm_ls *ls)
528{
529 return !test_bit(LSFL_RUNNING, &ls->ls_flags);
530}
531
532static inline int dlm_recovery_stopped(struct dlm_ls *ls)
533{
534 return test_bit(LSFL_RECOVERY_STOP, &ls->ls_flags);
535}
536
537static inline int dlm_no_directory(struct dlm_ls *ls)
538{
539 return (ls->ls_exflags & DLM_LSFL_NODIR) ? 1 : 0;
540}
541
542#endif /* __DLM_INTERNAL_DOT_H__ */
543
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
new file mode 100644
index 000000000000..3f2befa4797b
--- /dev/null
+++ b/fs/dlm/lock.c
@@ -0,0 +1,3871 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13/* Central locking logic has four stages:
14
15 dlm_lock()
16 dlm_unlock()
17
18 request_lock(ls, lkb)
19 convert_lock(ls, lkb)
20 unlock_lock(ls, lkb)
21 cancel_lock(ls, lkb)
22
23 _request_lock(r, lkb)
24 _convert_lock(r, lkb)
25 _unlock_lock(r, lkb)
26 _cancel_lock(r, lkb)
27
28 do_request(r, lkb)
29 do_convert(r, lkb)
30 do_unlock(r, lkb)
31 do_cancel(r, lkb)
32
33 Stage 1 (lock, unlock) is mainly about checking input args and
34 splitting into one of the four main operations:
35
36 dlm_lock = request_lock
37 dlm_lock+CONVERT = convert_lock
38 dlm_unlock = unlock_lock
39 dlm_unlock+CANCEL = cancel_lock
40
41 Stage 2, xxxx_lock(), just finds and locks the relevant rsb which is
42 provided to the next stage.
43
44 Stage 3, _xxxx_lock(), determines if the operation is local or remote.
45 When remote, it calls send_xxxx(), when local it calls do_xxxx().
46
47 Stage 4, do_xxxx(), is the guts of the operation. It manipulates the
48 given rsb and lkb and queues callbacks.
49
50 For remote operations, send_xxxx() results in the corresponding do_xxxx()
51 function being executed on the remote node. The connecting send/receive
52 calls on local (L) and remote (R) nodes:
53
54 L: send_xxxx() -> R: receive_xxxx()
55 R: do_xxxx()
56 L: receive_xxxx_reply() <- R: send_xxxx_reply()
57*/
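/* So a request that must be mastered remotely traces as
     dlm_lock() -> request_lock() -> _request_lock() -> send_request()
   on the local node, then
     receive_request() -> do_request() -> send_request_reply()
   on the master, with receive_request_reply() completing the sequence
   back on the local node. */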
58#include <linux/types.h>
59#include "dlm_internal.h"
60#include <linux/dlm_device.h>
61#include "memory.h"
62#include "lowcomms.h"
63#include "requestqueue.h"
64#include "util.h"
65#include "dir.h"
66#include "member.h"
67#include "lockspace.h"
68#include "ast.h"
69#include "lock.h"
70#include "rcom.h"
71#include "recover.h"
72#include "lvb_table.h"
73#include "user.h"
74#include "config.h"
75
76static int send_request(struct dlm_rsb *r, struct dlm_lkb *lkb);
77static int send_convert(struct dlm_rsb *r, struct dlm_lkb *lkb);
78static int send_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb);
79static int send_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb);
80static int send_grant(struct dlm_rsb *r, struct dlm_lkb *lkb);
81static int send_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int mode);
82static int send_lookup(struct dlm_rsb *r, struct dlm_lkb *lkb);
83static int send_remove(struct dlm_rsb *r);
84static int _request_lock(struct dlm_rsb *r, struct dlm_lkb *lkb);
85static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
86 struct dlm_message *ms);
87static int receive_extralen(struct dlm_message *ms);
88
89/*
90 * Lock compatibility matrix - thanks Steve
91 * UN = Unlocked state. Not really a state, used as a flag
92 * PD = Padding. Used to make the matrix a nice power of two in size
93 * Other states are the same as the VMS DLM.
94 * Usage: matrix[grmode+1][rqmode+1] (although m[rq+1][gr+1] is the same)
95 */
96
97static const int __dlm_compat_matrix[8][8] = {
98 /* UN NL CR CW PR PW EX PD */
99 {1, 1, 1, 1, 1, 1, 1, 0}, /* UN */
100 {1, 1, 1, 1, 1, 1, 1, 0}, /* NL */
101 {1, 1, 1, 1, 1, 1, 0, 0}, /* CR */
102 {1, 1, 1, 1, 0, 0, 0, 0}, /* CW */
103 {1, 1, 1, 0, 1, 0, 0, 0}, /* PR */
104 {1, 1, 1, 0, 0, 0, 0, 0}, /* PW */
105 {1, 1, 0, 0, 0, 0, 0, 0}, /* EX */
106 {0, 0, 0, 0, 0, 0, 0, 0} /* PD */
107};
108
109/*
110 * This defines the direction of transfer of LVB data.
111 * Granted mode is the row; requested mode is the column.
112 * Usage: matrix[grmode+1][rqmode+1]
113 * 1 = LVB is returned to the caller
114 * 0 = LVB is written to the resource
115 * -1 = nothing happens to the LVB
116 */
117
118const int dlm_lvb_operations[8][8] = {
119 /* UN NL CR CW PR PW EX PD*/
120 { -1, 1, 1, 1, 1, 1, 1, -1 }, /* UN */
121 { -1, 1, 1, 1, 1, 1, 1, 0 }, /* NL */
122 { -1, -1, 1, 1, 1, 1, 1, 0 }, /* CR */
123 { -1, -1, -1, 1, 1, 1, 1, 0 }, /* CW */
124 { -1, -1, -1, -1, 1, 1, 1, 0 }, /* PR */
125 { -1, 0, 0, 0, 0, 0, 1, 0 }, /* PW */
126 { -1, 0, 0, 0, 0, 0, 0, 0 }, /* EX */
127 { -1, 0, 0, 0, 0, 0, 0, 0 } /* PD */
128};
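/* Reading the table above: a PR -> EX conversion (row PR, column EX)
   gives 1, so the resource's LVB is copied back to the caller; an
   EX -> NL conversion (row EX, column NL) gives 0, so the caller's LVB
   is written into the resource. */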
129
130#define modes_compat(gr, rq) \
131 __dlm_compat_matrix[(gr)->lkb_grmode + 1][(rq)->lkb_rqmode + 1]
132
133int dlm_modes_compat(int mode1, int mode2)
134{
135 return __dlm_compat_matrix[mode1 + 1][mode2 + 1];
136}
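/* For example, from the compatibility table: a granted PR lock is
   compatible with a new CR or PR request but not with PW or EX, so
   dlm_modes_compat(DLM_LOCK_PR, DLM_LOCK_CR) returns 1 while
   dlm_modes_compat(DLM_LOCK_PR, DLM_LOCK_PW) returns 0. */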
137
138/*
139 * Compatibility matrix for conversions with QUECVT set.
140 * Granted mode is the row; requested mode is the column.
141 * Usage: matrix[grmode+1][rqmode+1]
142 */
143
144static const int __quecvt_compat_matrix[8][8] = {
145 /* UN NL CR CW PR PW EX PD */
146 {0, 0, 0, 0, 0, 0, 0, 0}, /* UN */
147 {0, 0, 1, 1, 1, 1, 1, 0}, /* NL */
148 {0, 0, 0, 1, 1, 1, 1, 0}, /* CR */
149 {0, 0, 0, 0, 1, 1, 1, 0}, /* CW */
150 {0, 0, 0, 1, 0, 1, 1, 0}, /* PR */
151 {0, 0, 0, 0, 0, 0, 1, 0}, /* PW */
152 {0, 0, 0, 0, 0, 0, 0, 0}, /* EX */
153 {0, 0, 0, 0, 0, 0, 0, 0} /* PD */
154};
155
156void dlm_print_lkb(struct dlm_lkb *lkb)
157{
158 printk(KERN_ERR "lkb: nodeid %d id %x remid %x exflags %x flags %x\n"
159 " status %d rqmode %d grmode %d wait_type %d ast_type %d\n",
160 lkb->lkb_nodeid, lkb->lkb_id, lkb->lkb_remid, lkb->lkb_exflags,
161 lkb->lkb_flags, lkb->lkb_status, lkb->lkb_rqmode,
162 lkb->lkb_grmode, lkb->lkb_wait_type, lkb->lkb_ast_type);
163}
164
165void dlm_print_rsb(struct dlm_rsb *r)
166{
167 printk(KERN_ERR "rsb: nodeid %d flags %lx first %x rlc %d name %s\n",
168 r->res_nodeid, r->res_flags, r->res_first_lkid,
169 r->res_recover_locks_count, r->res_name);
170}
171
172void dlm_dump_rsb(struct dlm_rsb *r)
173{
174 struct dlm_lkb *lkb;
175
176 dlm_print_rsb(r);
177
178 printk(KERN_ERR "rsb: root_list empty %d recover_list empty %d\n",
179 list_empty(&r->res_root_list), list_empty(&r->res_recover_list));
180 printk(KERN_ERR "rsb lookup list\n");
181 list_for_each_entry(lkb, &r->res_lookup, lkb_rsb_lookup)
182 dlm_print_lkb(lkb);
183 printk(KERN_ERR "rsb grant queue:\n");
184 list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue)
185 dlm_print_lkb(lkb);
186 printk(KERN_ERR "rsb convert queue:\n");
187 list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue)
188 dlm_print_lkb(lkb);
189 printk(KERN_ERR "rsb wait queue:\n");
190 list_for_each_entry(lkb, &r->res_waitqueue, lkb_statequeue)
191 dlm_print_lkb(lkb);
192}
193
194/* Threads cannot use the lockspace while it's being recovered */
195
196static inline void lock_recovery(struct dlm_ls *ls)
197{
198 down_read(&ls->ls_in_recovery);
199}
200
201static inline void unlock_recovery(struct dlm_ls *ls)
202{
203 up_read(&ls->ls_in_recovery);
204}
205
206static inline int lock_recovery_try(struct dlm_ls *ls)
207{
208 return down_read_trylock(&ls->ls_in_recovery);
209}
210
211static inline int can_be_queued(struct dlm_lkb *lkb)
212{
213 return !(lkb->lkb_exflags & DLM_LKF_NOQUEUE);
214}
215
216static inline int force_blocking_asts(struct dlm_lkb *lkb)
217{
218 return (lkb->lkb_exflags & DLM_LKF_NOQUEUEBAST);
219}
220
221static inline int is_demoted(struct dlm_lkb *lkb)
222{
223 return (lkb->lkb_sbflags & DLM_SBF_DEMOTED);
224}
225
226static inline int is_remote(struct dlm_rsb *r)
227{
228 DLM_ASSERT(r->res_nodeid >= 0, dlm_print_rsb(r););
229 return !!r->res_nodeid;
230}
231
232static inline int is_process_copy(struct dlm_lkb *lkb)
233{
234 return (lkb->lkb_nodeid && !(lkb->lkb_flags & DLM_IFL_MSTCPY));
235}
236
237static inline int is_master_copy(struct dlm_lkb *lkb)
238{
239 if (lkb->lkb_flags & DLM_IFL_MSTCPY)
240 DLM_ASSERT(lkb->lkb_nodeid, dlm_print_lkb(lkb););
241 return (lkb->lkb_flags & DLM_IFL_MSTCPY) ? 1 : 0;
242}
243
244static inline int middle_conversion(struct dlm_lkb *lkb)
245{
246 if ((lkb->lkb_grmode==DLM_LOCK_PR && lkb->lkb_rqmode==DLM_LOCK_CW) ||
247 (lkb->lkb_rqmode==DLM_LOCK_PR && lkb->lkb_grmode==DLM_LOCK_CW))
248 return 1;
249 return 0;
250}
251
252static inline int down_conversion(struct dlm_lkb *lkb)
253{
254 return (!middle_conversion(lkb) && lkb->lkb_rqmode < lkb->lkb_grmode);
255}
256
257static void queue_cast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
258{
259 if (is_master_copy(lkb))
260 return;
261
262 DLM_ASSERT(lkb->lkb_lksb, dlm_print_lkb(lkb););
263
264 lkb->lkb_lksb->sb_status = rv;
265 lkb->lkb_lksb->sb_flags = lkb->lkb_sbflags;
266
267 dlm_add_ast(lkb, AST_COMP);
268}
269
270static void queue_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rqmode)
271{
272 if (is_master_copy(lkb))
273 send_bast(r, lkb, rqmode);
274 else {
275 lkb->lkb_bastmode = rqmode;
276 dlm_add_ast(lkb, AST_BAST);
277 }
278}
279
280/*
281 * Basic operations on rsb's and lkb's
282 */
283
284static struct dlm_rsb *create_rsb(struct dlm_ls *ls, char *name, int len)
285{
286 struct dlm_rsb *r;
287
288 r = allocate_rsb(ls, len);
289 if (!r)
290 return NULL;
291
292 r->res_ls = ls;
293 r->res_length = len;
294 memcpy(r->res_name, name, len);
295 mutex_init(&r->res_mutex);
296
297 INIT_LIST_HEAD(&r->res_lookup);
298 INIT_LIST_HEAD(&r->res_grantqueue);
299 INIT_LIST_HEAD(&r->res_convertqueue);
300 INIT_LIST_HEAD(&r->res_waitqueue);
301 INIT_LIST_HEAD(&r->res_root_list);
302 INIT_LIST_HEAD(&r->res_recover_list);
303
304 return r;
305}
306
307static int search_rsb_list(struct list_head *head, char *name, int len,
308 unsigned int flags, struct dlm_rsb **r_ret)
309{
310 struct dlm_rsb *r;
311 int error = 0;
312
313 list_for_each_entry(r, head, res_hashchain) {
314 if (len == r->res_length && !memcmp(name, r->res_name, len))
315 goto found;
316 }
317 return -EBADR;
318
319 found:
320 if (r->res_nodeid && (flags & R_MASTER))
321 error = -ENOTBLK;
322 *r_ret = r;
323 return error;
324}
325
326static int _search_rsb(struct dlm_ls *ls, char *name, int len, int b,
327 unsigned int flags, struct dlm_rsb **r_ret)
328{
329 struct dlm_rsb *r;
330 int error;
331
332 error = search_rsb_list(&ls->ls_rsbtbl[b].list, name, len, flags, &r);
333 if (!error) {
334 kref_get(&r->res_ref);
335 goto out;
336 }
337 error = search_rsb_list(&ls->ls_rsbtbl[b].toss, name, len, flags, &r);
338 if (error)
339 goto out;
340
341 list_move(&r->res_hashchain, &ls->ls_rsbtbl[b].list);
342
343 if (dlm_no_directory(ls))
344 goto out;
345
346 if (r->res_nodeid == -1) {
347 rsb_clear_flag(r, RSB_MASTER_UNCERTAIN);
348 r->res_first_lkid = 0;
349 } else if (r->res_nodeid > 0) {
350 rsb_set_flag(r, RSB_MASTER_UNCERTAIN);
351 r->res_first_lkid = 0;
352 } else {
353 DLM_ASSERT(r->res_nodeid == 0, dlm_print_rsb(r););
354 DLM_ASSERT(!rsb_flag(r, RSB_MASTER_UNCERTAIN),);
355 }
356 out:
357 *r_ret = r;
358 return error;
359}
360
361static int search_rsb(struct dlm_ls *ls, char *name, int len, int b,
362 unsigned int flags, struct dlm_rsb **r_ret)
363{
364 int error;
365 write_lock(&ls->ls_rsbtbl[b].lock);
366 error = _search_rsb(ls, name, len, b, flags, r_ret);
367 write_unlock(&ls->ls_rsbtbl[b].lock);
368 return error;
369}
370
371/*
372 * Find rsb in rsbtbl and potentially create/add one
373 *
374 * Delaying the release of rsb's has a similar benefit to applications keeping
375 * NL locks on an rsb, but without the guarantee that the cached master value
376 * will still be valid when the rsb is reused. Apps aren't always smart enough
377 * to keep NL locks on an rsb that they may lock again shortly; this can lead
378 * to excessive master lookups and removals if we don't delay the release.
379 *
380 * Searching for an rsb means looking through both the normal list and toss
381 * list. When found on the toss list the rsb is moved to the normal list with
382 * ref count of 1; when found on normal list the ref count is incremented.
383 */
384
385static int find_rsb(struct dlm_ls *ls, char *name, int namelen,
386 unsigned int flags, struct dlm_rsb **r_ret)
387{
388 struct dlm_rsb *r, *tmp;
389 uint32_t hash, bucket;
390 int error = 0;
391
392 if (dlm_no_directory(ls))
393 flags |= R_CREATE;
394
395 hash = jhash(name, namelen, 0);
396 bucket = hash & (ls->ls_rsbtbl_size - 1);
397
398 error = search_rsb(ls, name, namelen, bucket, flags, &r);
399 if (!error)
400 goto out;
401
402 if (error == -EBADR && !(flags & R_CREATE))
403 goto out;
404
405 /* the rsb was found but wasn't a master copy */
406 if (error == -ENOTBLK)
407 goto out;
408
409 error = -ENOMEM;
410 r = create_rsb(ls, name, namelen);
411 if (!r)
412 goto out;
413
414 r->res_hash = hash;
415 r->res_bucket = bucket;
416 r->res_nodeid = -1;
417 kref_init(&r->res_ref);
418
419 /* With no directory, the master can be set immediately */
420 if (dlm_no_directory(ls)) {
421 int nodeid = dlm_dir_nodeid(r);
422 if (nodeid == dlm_our_nodeid())
423 nodeid = 0;
424 r->res_nodeid = nodeid;
425 }
426
427 write_lock(&ls->ls_rsbtbl[bucket].lock);
428 error = _search_rsb(ls, name, namelen, bucket, 0, &tmp);
429 if (!error) {
430 write_unlock(&ls->ls_rsbtbl[bucket].lock);
431 free_rsb(r);
432 r = tmp;
433 goto out;
434 }
435 list_add(&r->res_hashchain, &ls->ls_rsbtbl[bucket].list);
436 write_unlock(&ls->ls_rsbtbl[bucket].lock);
437 error = 0;
438 out:
439 *r_ret = r;
440 return error;
441}
442
443int dlm_find_rsb(struct dlm_ls *ls, char *name, int namelen,
444 unsigned int flags, struct dlm_rsb **r_ret)
445{
446 return find_rsb(ls, name, namelen, flags, r_ret);
447}
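/* Callers pair dlm_find_rsb() with dlm_put_rsb(): the reference taken
   in _search_rsb() (or by kref_init() for a newly created rsb) is what
   keeps the rsb off the toss list while it's in use.  See
   dlm_copy_master_names() in dir.c for an example. */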
448
449/* This is only called to add a reference when the code already holds
450 a valid reference to the rsb, so there's no need for locking. */
451
452static inline void hold_rsb(struct dlm_rsb *r)
453{
454 kref_get(&r->res_ref);
455}
456
457void dlm_hold_rsb(struct dlm_rsb *r)
458{
459 hold_rsb(r);
460}
461
462static void toss_rsb(struct kref *kref)
463{
464 struct dlm_rsb *r = container_of(kref, struct dlm_rsb, res_ref);
465 struct dlm_ls *ls = r->res_ls;
466
467 DLM_ASSERT(list_empty(&r->res_root_list), dlm_print_rsb(r););
468 kref_init(&r->res_ref);
469 list_move(&r->res_hashchain, &ls->ls_rsbtbl[r->res_bucket].toss);
470 r->res_toss_time = jiffies;
471 if (r->res_lvbptr) {
472 free_lvb(r->res_lvbptr);
473 r->res_lvbptr = NULL;
474 }
475}
476
477/* When all references to the rsb are gone it's transferred to
478 the tossed list for later disposal. */
479
480static void put_rsb(struct dlm_rsb *r)
481{
482 struct dlm_ls *ls = r->res_ls;
483 uint32_t bucket = r->res_bucket;
484
485 write_lock(&ls->ls_rsbtbl[bucket].lock);
486 kref_put(&r->res_ref, toss_rsb);
487 write_unlock(&ls->ls_rsbtbl[bucket].lock);
488}
489
490void dlm_put_rsb(struct dlm_rsb *r)
491{
492 put_rsb(r);
493}
494
495/* See comment for unhold_lkb */
496
497static void unhold_rsb(struct dlm_rsb *r)
498{
499 int rv;
500 rv = kref_put(&r->res_ref, toss_rsb);
501 DLM_ASSERT(!rv, dlm_dump_rsb(r););
502}
503
504static void kill_rsb(struct kref *kref)
505{
506 struct dlm_rsb *r = container_of(kref, struct dlm_rsb, res_ref);
507
508 /* All work is done after the return from kref_put() so we
509 can release the write_lock before the remove and free. */
510
511 DLM_ASSERT(list_empty(&r->res_lookup), dlm_dump_rsb(r););
512 DLM_ASSERT(list_empty(&r->res_grantqueue), dlm_dump_rsb(r););
513 DLM_ASSERT(list_empty(&r->res_convertqueue), dlm_dump_rsb(r););
514 DLM_ASSERT(list_empty(&r->res_waitqueue), dlm_dump_rsb(r););
515 DLM_ASSERT(list_empty(&r->res_root_list), dlm_dump_rsb(r););
516 DLM_ASSERT(list_empty(&r->res_recover_list), dlm_dump_rsb(r););
517}
518
519/* Attaching/detaching lkb's from rsb's is for rsb reference counting.
520 The rsb must exist as long as any lkb's for it do. */
521
522static void attach_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb)
523{
524 hold_rsb(r);
525 lkb->lkb_resource = r;
526}
527
528static void detach_lkb(struct dlm_lkb *lkb)
529{
530 if (lkb->lkb_resource) {
531 put_rsb(lkb->lkb_resource);
532 lkb->lkb_resource = NULL;
533 }
534}
535
536static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret)
537{
538 struct dlm_lkb *lkb, *tmp;
539 uint32_t lkid = 0;
540 uint16_t bucket;
541
542 lkb = allocate_lkb(ls);
543 if (!lkb)
544 return -ENOMEM;
545
546 lkb->lkb_nodeid = -1;
547 lkb->lkb_grmode = DLM_LOCK_IV;
548 kref_init(&lkb->lkb_ref);
549 INIT_LIST_HEAD(&lkb->lkb_ownqueue);
550
551 get_random_bytes(&bucket, sizeof(bucket));
552 bucket &= (ls->ls_lkbtbl_size - 1);
553
554 write_lock(&ls->ls_lkbtbl[bucket].lock);
555
556 /* counter can roll over so we must verify lkid is not in use */
557
558 while (lkid == 0) {
559 lkid = bucket | (ls->ls_lkbtbl[bucket].counter++ << 16);
560
561 list_for_each_entry(tmp, &ls->ls_lkbtbl[bucket].list,
562 lkb_idtbl_list) {
563 if (tmp->lkb_id != lkid)
564 continue;
565 lkid = 0;
566 break;
567 }
568 }
569
570 lkb->lkb_id = lkid;
571 list_add(&lkb->lkb_idtbl_list, &ls->ls_lkbtbl[bucket].list);
572 write_unlock(&ls->ls_lkbtbl[bucket].lock);
573
574 *lkb_ret = lkb;
575 return 0;
576}
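/* An lkid is thus <16-bit per-bucket counter><16-bit bucket>, so
   __find_lkb() below can recover the bucket with (lkid & 0xFFFF) and
   search only that hash chain. */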
577
578static struct dlm_lkb *__find_lkb(struct dlm_ls *ls, uint32_t lkid)
579{
580 uint16_t bucket = lkid & 0xFFFF;
581 struct dlm_lkb *lkb;
582
583 list_for_each_entry(lkb, &ls->ls_lkbtbl[bucket].list, lkb_idtbl_list) {
584 if (lkb->lkb_id == lkid)
585 return lkb;
586 }
587 return NULL;
588}
589
590static int find_lkb(struct dlm_ls *ls, uint32_t lkid, struct dlm_lkb **lkb_ret)
591{
592 struct dlm_lkb *lkb;
593 uint16_t bucket = lkid & 0xFFFF;
594
595 if (bucket >= ls->ls_lkbtbl_size)
596 return -EBADSLT;
597
598 read_lock(&ls->ls_lkbtbl[bucket].lock);
599 lkb = __find_lkb(ls, lkid);
600 if (lkb)
601 kref_get(&lkb->lkb_ref);
602 read_unlock(&ls->ls_lkbtbl[bucket].lock);
603
604 *lkb_ret = lkb;
605 return lkb ? 0 : -ENOENT;
606}
607
608static void kill_lkb(struct kref *kref)
609{
610 struct dlm_lkb *lkb = container_of(kref, struct dlm_lkb, lkb_ref);
611
612 /* All work is done after the return from kref_put() so we
613 can release the write_lock before the detach_lkb */
614
615 DLM_ASSERT(!lkb->lkb_status, dlm_print_lkb(lkb););
616}
617
618/* __put_lkb() is used when an lkb may not have an rsb attached to
619 it so we need to provide the lockspace explicitly */
620
621static int __put_lkb(struct dlm_ls *ls, struct dlm_lkb *lkb)
622{
623 uint16_t bucket = lkb->lkb_id & 0xFFFF;
624
625 write_lock(&ls->ls_lkbtbl[bucket].lock);
626 if (kref_put(&lkb->lkb_ref, kill_lkb)) {
627 list_del(&lkb->lkb_idtbl_list);
628 write_unlock(&ls->ls_lkbtbl[bucket].lock);
629
630 detach_lkb(lkb);
631
632 /* for local/process lkbs, lvbptr points to caller's lksb */
633 if (lkb->lkb_lvbptr && is_master_copy(lkb))
634 free_lvb(lkb->lkb_lvbptr);
635 free_lkb(lkb);
636 return 1;
637 } else {
638 write_unlock(&ls->ls_lkbtbl[bucket].lock);
639 return 0;
640 }
641}
642
643int dlm_put_lkb(struct dlm_lkb *lkb)
644{
645 struct dlm_ls *ls;
646
647 DLM_ASSERT(lkb->lkb_resource, dlm_print_lkb(lkb););
648 DLM_ASSERT(lkb->lkb_resource->res_ls, dlm_print_lkb(lkb););
649
650 ls = lkb->lkb_resource->res_ls;
651 return __put_lkb(ls, lkb);
652}
653
654/* This is only called to add a reference when the code already holds
655 a valid reference to the lkb, so there's no need for locking. */
656
657static inline void hold_lkb(struct dlm_lkb *lkb)
658{
659 kref_get(&lkb->lkb_ref);
660}
661
662/* This is called when we need to remove a reference and are certain
663 it's not the last ref. e.g. del_lkb is always called between a
664 find_lkb/put_lkb and is always the inverse of a previous add_lkb.
665 put_lkb would work fine, but would involve unnecessary locking */
666
667static inline void unhold_lkb(struct dlm_lkb *lkb)
668{
669 int rv;
670 rv = kref_put(&lkb->lkb_ref, kill_lkb);
671 DLM_ASSERT(!rv, dlm_print_lkb(lkb););
672}
673
674static void lkb_add_ordered(struct list_head *new, struct list_head *head,
675 int mode)
676{
677 struct dlm_lkb *lkb = NULL;
678
679 list_for_each_entry(lkb, head, lkb_statequeue)
680 if (lkb->lkb_rqmode < mode)
681 break;
682
683 if (!lkb)
684 list_add_tail(new, head);
685 else
686 __list_add(new, lkb->lkb_statequeue.prev, &lkb->lkb_statequeue);
687}
688
689/* add/remove lkb to rsb's grant/convert/wait queue */
690
691static void add_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb, int status)
692{
693 kref_get(&lkb->lkb_ref);
694
695 DLM_ASSERT(!lkb->lkb_status, dlm_print_lkb(lkb););
696
697 lkb->lkb_status = status;
698
699 switch (status) {
700 case DLM_LKSTS_WAITING:
701 if (lkb->lkb_exflags & DLM_LKF_HEADQUE)
702 list_add(&lkb->lkb_statequeue, &r->res_waitqueue);
703 else
704 list_add_tail(&lkb->lkb_statequeue, &r->res_waitqueue);
705 break;
706 case DLM_LKSTS_GRANTED:
707 /* convention says granted locks kept in order of grmode */
708 lkb_add_ordered(&lkb->lkb_statequeue, &r->res_grantqueue,
709 lkb->lkb_grmode);
710 break;
711 case DLM_LKSTS_CONVERT:
712 if (lkb->lkb_exflags & DLM_LKF_HEADQUE)
713 list_add(&lkb->lkb_statequeue, &r->res_convertqueue);
714 else
715 list_add_tail(&lkb->lkb_statequeue,
716 &r->res_convertqueue);
717 break;
718 default:
719 DLM_ASSERT(0, dlm_print_lkb(lkb); printk("sts=%d\n", status););
720 }
721}
722
723static void del_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb)
724{
725 lkb->lkb_status = 0;
726 list_del(&lkb->lkb_statequeue);
727 unhold_lkb(lkb);
728}
729
730static void move_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb, int sts)
731{
732 hold_lkb(lkb);
733 del_lkb(r, lkb);
734 add_lkb(r, lkb, sts);
735 unhold_lkb(lkb);
736}
737
738/* add/remove lkb from global waiters list of lkb's waiting for
739 a reply from a remote node */
740
741static void add_to_waiters(struct dlm_lkb *lkb, int mstype)
742{
743 struct dlm_ls *ls = lkb->lkb_resource->res_ls;
744
745 mutex_lock(&ls->ls_waiters_mutex);
746 if (lkb->lkb_wait_type) {
747 log_print("add_to_waiters error %d", lkb->lkb_wait_type);
748 goto out;
749 }
750 lkb->lkb_wait_type = mstype;
751 kref_get(&lkb->lkb_ref);
752 list_add(&lkb->lkb_wait_reply, &ls->ls_waiters);
753 out:
754 mutex_unlock(&ls->ls_waiters_mutex);
755}
756
757static int _remove_from_waiters(struct dlm_lkb *lkb)
758{
759 int error = 0;
760
761 if (!lkb->lkb_wait_type) {
762 log_print("remove_from_waiters error");
763 error = -EINVAL;
764 goto out;
765 }
766 lkb->lkb_wait_type = 0;
767 list_del(&lkb->lkb_wait_reply);
768 unhold_lkb(lkb);
769 out:
770 return error;
771}
772
773static int remove_from_waiters(struct dlm_lkb *lkb)
774{
775 struct dlm_ls *ls = lkb->lkb_resource->res_ls;
776 int error;
777
778 mutex_lock(&ls->ls_waiters_mutex);
779 error = _remove_from_waiters(lkb);
780 mutex_unlock(&ls->ls_waiters_mutex);
781 return error;
782}
783
784static void dir_remove(struct dlm_rsb *r)
785{
786 int to_nodeid;
787
788 if (dlm_no_directory(r->res_ls))
789 return;
790
791 to_nodeid = dlm_dir_nodeid(r);
792 if (to_nodeid != dlm_our_nodeid())
793 send_remove(r);
794 else
795 dlm_dir_remove_entry(r->res_ls, to_nodeid,
796 r->res_name, r->res_length);
797}
798
799/* FIXME: shouldn't this be able to exit as soon as one non-due rsb is
800 found since they are in order of newest to oldest? */
801
802static int shrink_bucket(struct dlm_ls *ls, int b)
803{
804 struct dlm_rsb *r;
805 int count = 0, found;
806
807 for (;;) {
808 found = 0;
809 write_lock(&ls->ls_rsbtbl[b].lock);
810 list_for_each_entry_reverse(r, &ls->ls_rsbtbl[b].toss,
811 res_hashchain) {
812 if (!time_after_eq(jiffies, r->res_toss_time +
813 dlm_config.toss_secs * HZ))
814 continue;
815 found = 1;
816 break;
817 }
818
819 if (!found) {
820 write_unlock(&ls->ls_rsbtbl[b].lock);
821 break;
822 }
823
824 if (kref_put(&r->res_ref, kill_rsb)) {
825 list_del(&r->res_hashchain);
826 write_unlock(&ls->ls_rsbtbl[b].lock);
827
828 if (is_master(r))
829 dir_remove(r);
830 free_rsb(r);
831 count++;
832 } else {
833 write_unlock(&ls->ls_rsbtbl[b].lock);
834 log_error(ls, "tossed rsb in use %s", r->res_name);
835 }
836 }
837
838 return count;
839}
840
841void dlm_scan_rsbs(struct dlm_ls *ls)
842{
843 int i;
844
845 if (dlm_locking_stopped(ls))
846 return;
847
848 for (i = 0; i < ls->ls_rsbtbl_size; i++) {
849 shrink_bucket(ls, i);
850 cond_resched();
851 }
852}
853
854/* lkb is master or local copy */
855
856static void set_lvb_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
857{
858 int b, len = r->res_ls->ls_lvblen;
859
860 /* b=1 lvb returned to caller
861 b=0 lvb written to rsb or invalidated
862 b=-1 do nothing */
863
864 b = dlm_lvb_operations[lkb->lkb_grmode + 1][lkb->lkb_rqmode + 1];
865
866 if (b == 1) {
867 if (!lkb->lkb_lvbptr)
868 return;
869
870 if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
871 return;
872
873 if (!r->res_lvbptr)
874 return;
875
876 memcpy(lkb->lkb_lvbptr, r->res_lvbptr, len);
877 lkb->lkb_lvbseq = r->res_lvbseq;
878
879 } else if (b == 0) {
880 if (lkb->lkb_exflags & DLM_LKF_IVVALBLK) {
881 rsb_set_flag(r, RSB_VALNOTVALID);
882 return;
883 }
884
885 if (!lkb->lkb_lvbptr)
886 return;
887
888 if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
889 return;
890
891 if (!r->res_lvbptr)
892 r->res_lvbptr = allocate_lvb(r->res_ls);
893
894 if (!r->res_lvbptr)
895 return;
896
897 memcpy(r->res_lvbptr, lkb->lkb_lvbptr, len);
898 r->res_lvbseq++;
899 lkb->lkb_lvbseq = r->res_lvbseq;
900 rsb_clear_flag(r, RSB_VALNOTVALID);
901 }
902
903 if (rsb_flag(r, RSB_VALNOTVALID))
904 lkb->lkb_sbflags |= DLM_SBF_VALNOTVALID;
905}
906
907static void set_lvb_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
908{
909 if (lkb->lkb_grmode < DLM_LOCK_PW)
910 return;
911
912 if (lkb->lkb_exflags & DLM_LKF_IVVALBLK) {
913 rsb_set_flag(r, RSB_VALNOTVALID);
914 return;
915 }
916
917 if (!lkb->lkb_lvbptr)
918 return;
919
920 if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
921 return;
922
923 if (!r->res_lvbptr)
924 r->res_lvbptr = allocate_lvb(r->res_ls);
925
926 if (!r->res_lvbptr)
927 return;
928
929 memcpy(r->res_lvbptr, lkb->lkb_lvbptr, r->res_ls->ls_lvblen);
930 r->res_lvbseq++;
931 rsb_clear_flag(r, RSB_VALNOTVALID);
932}
933
934/* lkb is process copy (pc) */
935
936static void set_lvb_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb,
937 struct dlm_message *ms)
938{
939 int b;
940
941 if (!lkb->lkb_lvbptr)
942 return;
943
944 if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
945 return;
946
947 b = dlm_lvb_operations[lkb->lkb_grmode + 1][lkb->lkb_rqmode + 1];
948 if (b == 1) {
949 int len = receive_extralen(ms);
950 memcpy(lkb->lkb_lvbptr, ms->m_extra, len);
951 lkb->lkb_lvbseq = ms->m_lvbseq;
952 }
953}
954
955/* Manipulate lkb's on rsb's convert/granted/waiting queues
956 remove_lock -- used for unlock, removes lkb from granted
957 revert_lock -- used for cancel, moves lkb from convert to granted
958 grant_lock -- used for request and convert, adds lkb to granted or
959 moves lkb from convert or waiting to granted
960
961 Each of these is used for master or local copy lkb's. There is
962 also a _pc() variation used to make the corresponding change on
963 a process copy (pc) lkb. */
964
965static void _remove_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
966{
967 del_lkb(r, lkb);
968 lkb->lkb_grmode = DLM_LOCK_IV;
969 /* this unhold undoes the original ref from create_lkb()
970 so this leads to the lkb being freed */
971 unhold_lkb(lkb);
972}
973
974static void remove_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
975{
976 set_lvb_unlock(r, lkb);
977 _remove_lock(r, lkb);
978}
979
980static void remove_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb)
981{
982 _remove_lock(r, lkb);
983}
984
985static void revert_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
986{
987 lkb->lkb_rqmode = DLM_LOCK_IV;
988
989 switch (lkb->lkb_status) {
990 case DLM_LKSTS_GRANTED:
991 break;
992 case DLM_LKSTS_CONVERT:
993 move_lkb(r, lkb, DLM_LKSTS_GRANTED);
994 break;
995 case DLM_LKSTS_WAITING:
996 del_lkb(r, lkb);
997 lkb->lkb_grmode = DLM_LOCK_IV;
998 /* this unhold undoes the original ref from create_lkb()
999 so this leads to the lkb being freed */
1000 unhold_lkb(lkb);
1001 break;
1002 default:
1003 log_print("invalid status for revert %d", lkb->lkb_status);
1004 }
1005}
1006
1007static void revert_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb)
1008{
1009 revert_lock(r, lkb);
1010}
1011
1012static void _grant_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
1013{
1014 if (lkb->lkb_grmode != lkb->lkb_rqmode) {
1015 lkb->lkb_grmode = lkb->lkb_rqmode;
1016 if (lkb->lkb_status)
1017 move_lkb(r, lkb, DLM_LKSTS_GRANTED);
1018 else
1019 add_lkb(r, lkb, DLM_LKSTS_GRANTED);
1020 }
1021
1022 lkb->lkb_rqmode = DLM_LOCK_IV;
1023}
1024
1025static void grant_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
1026{
1027 set_lvb_lock(r, lkb);
1028 _grant_lock(r, lkb);
1029 lkb->lkb_highbast = 0;
1030}
1031
1032static void grant_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb,
1033 struct dlm_message *ms)
1034{
1035 set_lvb_lock_pc(r, lkb, ms);
1036 _grant_lock(r, lkb);
1037}
1038
1039/* called by grant_pending_locks() which means an async grant message must
1040 be sent to the requesting node in addition to granting the lock if the
1041 lkb belongs to a remote node. */
1042
1043static void grant_lock_pending(struct dlm_rsb *r, struct dlm_lkb *lkb)
1044{
1045 grant_lock(r, lkb);
1046 if (is_master_copy(lkb))
1047 send_grant(r, lkb);
1048 else
1049 queue_cast(r, lkb, 0);
1050}
1051
1052static inline int first_in_list(struct dlm_lkb *lkb, struct list_head *head)
1053{
1054 struct dlm_lkb *first = list_entry(head->next, struct dlm_lkb,
1055 lkb_statequeue);
1056 if (lkb->lkb_id == first->lkb_id)
1057 return 1;
1058
1059 return 0;
1060}
1061
1062/* Check if the given lkb conflicts with another lkb on the queue. */
1063
1064static int queue_conflict(struct list_head *head, struct dlm_lkb *lkb)
1065{
1066 struct dlm_lkb *this;
1067
1068 list_for_each_entry(this, head, lkb_statequeue) {
1069 if (this == lkb)
1070 continue;
1071 if (!modes_compat(this, lkb))
1072 return 1;
1073 }
1074 return 0;
1075}
1076
1077/*
1078 * "A conversion deadlock arises with a pair of lock requests in the converting
1079 * queue for one resource. The granted mode of each lock blocks the requested
1080 * mode of the other lock."
1081 *
1082 * Part 2: if the granted mode of lkb is preventing the first lkb in the
1083 * convert queue from being granted, then demote lkb (set grmode to NL).
1084 * This second form requires that we check for conv-deadlk even when
1085 * now == 0 in _can_be_granted().
1086 *
1087 * Example:
1088 * Granted Queue: empty
1089 * Convert Queue: NL->EX (first lock)
1090 * PR->EX (second lock)
1091 *
1092 * The first lock can't be granted because of the granted mode of the second
1093 * lock and the second lock can't be granted because it's not first in the
1094 * list. We demote the granted mode of the second lock (the lkb passed to this
1095 * function).
1096 *
1097 * After the resolution, the "grant pending" function needs to go back and try
1098 * to grant locks on the convert queue again since the first lock can now be
1099 * granted.
1100 */
1101
1102static int conversion_deadlock_detect(struct dlm_rsb *rsb, struct dlm_lkb *lkb)
1103{
1104 struct dlm_lkb *this, *first = NULL, *self = NULL;
1105
1106 list_for_each_entry(this, &rsb->res_convertqueue, lkb_statequeue) {
1107 if (!first)
1108 first = this;
1109 if (this == lkb) {
1110 self = lkb;
1111 continue;
1112 }
1113
1114 if (!modes_compat(this, lkb) && !modes_compat(lkb, this))
1115 return 1;
1116 }
1117
1118 /* if lkb is on the convert queue and is preventing the first
1119 from being granted, then there's deadlock and we demote lkb.
1120 multiple converting locks may need to do this before the first
1121 converting lock can be granted. */
1122
1123 if (self && self != first) {
1124 if (!modes_compat(lkb, first) &&
1125 !queue_conflict(&rsb->res_grantqueue, first))
1126 return 1;
1127 }
1128
1129 return 0;
1130}
1131
1132/*
1133 * Return 1 if the lock can be granted, 0 otherwise.
1134 * Also detect and resolve conversion deadlocks.
1135 *
1136 * lkb is the lock to be granted
1137 *
1138 * now is 1 if the function is being called in the context of the
1139 * immediate request; it is 0 if called later, after the lock has been
1140 * queued.
1141 *
1142 * References are from chapter 6 of "VAXcluster Principles" by Roy Davis
1143 */
1144
1145static int _can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now)
1146{
1147 int8_t conv = (lkb->lkb_grmode != DLM_LOCK_IV);
1148
1149 /*
1150 * 6-10: Version 5.4 introduced an option to address the phenomenon of
1151 * a new request for a NL mode lock being blocked.
1152 *
1153 * 6-11: If the optional EXPEDITE flag is used with the new NL mode
1154 * request, then it would be granted. In essence, the use of this flag
1155 * tells the Lock Manager to expedite this request by not considering
1156 * what may be in the CONVERTING or WAITING queues... As of this
1157 * writing, the EXPEDITE flag can be used only with new requests for NL
1158 * mode locks. This flag is not valid for conversion requests.
1159 *
1160 * A shortcut. Earlier checks return an error if EXPEDITE is used in a
1161 * conversion or used with a non-NL requested mode. We also know an
1162 * EXPEDITE request is always granted immediately, so now must always
1163 * be 1. The full condition to grant an expedite request: (now &&
1164 * !conv && lkb->rqmode == DLM_LOCK_NL && (flags & EXPEDITE)) can
1165 * therefore be shortened to just checking the flag.
1166 */
1167
1168 if (lkb->lkb_exflags & DLM_LKF_EXPEDITE)
1169 return 1;
1170
1171 /*
1172 * A shortcut. Without this, !queue_conflict(grantqueue, lkb) would be
1173 * added to the remaining conditions.
1174 */
1175
1176 if (queue_conflict(&r->res_grantqueue, lkb))
1177 goto out;
1178
1179 /*
1180 * 6-3: By default, a conversion request is immediately granted if the
1181 * requested mode is compatible with the modes of all other granted
1182 * locks
1183 */
1184
1185 if (queue_conflict(&r->res_convertqueue, lkb))
1186 goto out;
1187
1188 /*
1189 * 6-5: But the default algorithm for deciding whether to grant or
1190 * queue conversion requests does not by itself guarantee that such
1191 * requests are serviced on a "first come first serve" basis. This, in
1192 * turn, can lead to a phenomenon known as "indefinite postponement".
1193 *
1194 * 6-7: This issue is dealt with by using the optional QUECVT flag with
1195 * the system service employed to request a lock conversion. This flag
1196 * forces certain conversion requests to be queued, even if they are
1197 * compatible with the granted modes of other locks on the same
1198 * resource. Thus, the use of this flag results in conversion requests
1199 * being ordered on a "first come first serve" basis.
1200 *
1201 * DCT: This condition is all about new conversions being able to occur
1202 * "in place" while the lock remains on the granted queue (assuming
1203 * nothing else conflicts). IOW, if QUECVT isn't set, a conversion
1204 * doesn't _have_ to go onto the convert queue where it's processed in
1205 * order. The "now" variable is necessary to distinguish converts
1206 * being received and processed for the first time now, because once a
1207 * convert is moved to the conversion queue the condition below applies
1208 * requiring fifo granting.
1209 */
1210
1211 if (now && conv && !(lkb->lkb_exflags & DLM_LKF_QUECVT))
1212 return 1;
1213
1214 /*
1215 * The NOORDER flag is set to avoid the standard vms rules on grant
1216 * order.
1217 */
1218
1219 if (lkb->lkb_exflags & DLM_LKF_NOORDER)
1220 return 1;
1221
1222 /*
1223 * 6-3: Once in that queue [CONVERTING], a conversion request cannot be
1224 * granted until all other conversion requests ahead of it are granted
1225 * and/or canceled.
1226 */
1227
1228 if (!now && conv && first_in_list(lkb, &r->res_convertqueue))
1229 return 1;
1230
1231 /*
1232 * 6-4: By default, a new request is immediately granted only if all
1233 * three of the following conditions are satisfied when the request is
1234 * issued:
1235 * - The queue of ungranted conversion requests for the resource is
1236 * empty.
1237 * - The queue of ungranted new requests for the resource is empty.
1238 * - The mode of the new request is compatible with the most
1239 * restrictive mode of all granted locks on the resource.
1240 */
1241
1242 if (now && !conv && list_empty(&r->res_convertqueue) &&
1243 list_empty(&r->res_waitqueue))
1244 return 1;
1245
1246 /*
1247 * 6-4: Once a lock request is in the queue of ungranted new requests,
1248 * it cannot be granted until the queue of ungranted conversion
1249 * requests is empty, all ungranted new requests ahead of it are
1250 * granted and/or canceled, and it is compatible with the granted mode
1251 * of the most restrictive lock granted on the resource.
1252 */
1253
1254 if (!now && !conv && list_empty(&r->res_convertqueue) &&
1255 first_in_list(lkb, &r->res_waitqueue))
1256 return 1;
1257
1258 out:
1259 /*
1260 * The following, enabled by CONVDEADLK, departs from VMS.
1261 */
1262
1263 if (conv && (lkb->lkb_exflags & DLM_LKF_CONVDEADLK) &&
1264 conversion_deadlock_detect(r, lkb)) {
1265 lkb->lkb_grmode = DLM_LOCK_NL;
1266 lkb->lkb_sbflags |= DLM_SBF_DEMOTED;
1267 }
1268
1269 return 0;
1270}
1271
1272/*
1273 * The ALTPR and ALTCW flags aren't traditional lock manager flags, but are a
1274 * simple way to provide a big optimization to applications that can use them.
1275 */
1276
1277static int can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now)
1278{
1279 uint32_t flags = lkb->lkb_exflags;
1280 int rv;
1281 int8_t alt = 0, rqmode = lkb->lkb_rqmode;
1282
1283 rv = _can_be_granted(r, lkb, now);
1284 if (rv)
1285 goto out;
1286
1287 if (lkb->lkb_sbflags & DLM_SBF_DEMOTED)
1288 goto out;
1289
1290 if (rqmode != DLM_LOCK_PR && flags & DLM_LKF_ALTPR)
1291 alt = DLM_LOCK_PR;
1292 else if (rqmode != DLM_LOCK_CW && flags & DLM_LKF_ALTCW)
1293 alt = DLM_LOCK_CW;
1294
1295 if (alt) {
1296 lkb->lkb_rqmode = alt;
1297 rv = _can_be_granted(r, lkb, now);
1298 if (rv)
1299 lkb->lkb_sbflags |= DLM_SBF_ALTMODE;
1300 else
1301 lkb->lkb_rqmode = rqmode;
1302 }
1303 out:
1304 return rv;
1305}
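
/* Illustrative sketch (editor's note, not original code): a hypothetical
   caller requests PW with DLM_LKF_ALTPR on a resource that already has a
   PR lock granted.  PW conflicts with PR, so the first _can_be_granted()
   fails; the retry with the alternate mode succeeds because PR is
   compatible with PR, and the caller learns which mode it actually holds
   from the DLM_SBF_ALTMODE status flag:

	rv = _can_be_granted(r, lkb, now);    /- rqmode PW: conflicts, rv 0 -/
	lkb->lkb_rqmode = DLM_LOCK_PR;        /- fall back to the ALTPR mode -/
	rv = _can_be_granted(r, lkb, now);    /- rqmode PR: compatible, rv 1 -/
	lkb->lkb_sbflags |= DLM_SBF_ALTMODE;  /- reported back in the lksb -/
*/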
1306
1307static int grant_pending_convert(struct dlm_rsb *r, int high)
1308{
1309 struct dlm_lkb *lkb, *s;
1310 int hi, demoted, quit, grant_restart, demote_restart;
1311
1312 quit = 0;
1313 restart:
1314 grant_restart = 0;
1315 demote_restart = 0;
1316 hi = DLM_LOCK_IV;
1317
1318 list_for_each_entry_safe(lkb, s, &r->res_convertqueue, lkb_statequeue) {
1319 demoted = is_demoted(lkb);
1320 if (can_be_granted(r, lkb, 0)) {
1321 grant_lock_pending(r, lkb);
1322 grant_restart = 1;
1323 } else {
1324 hi = max_t(int, lkb->lkb_rqmode, hi);
1325 if (!demoted && is_demoted(lkb))
1326 demote_restart = 1;
1327 }
1328 }
1329
1330 if (grant_restart)
1331 goto restart;
1332 if (demote_restart && !quit) {
1333 quit = 1;
1334 goto restart;
1335 }
1336
1337 return max_t(int, high, hi);
1338}
1339
1340static int grant_pending_wait(struct dlm_rsb *r, int high)
1341{
1342 struct dlm_lkb *lkb, *s;
1343
1344 list_for_each_entry_safe(lkb, s, &r->res_waitqueue, lkb_statequeue) {
1345 if (can_be_granted(r, lkb, 0))
1346 grant_lock_pending(r, lkb);
1347 else
1348 high = max_t(int, lkb->lkb_rqmode, high);
1349 }
1350
1351 return high;
1352}
1353
1354static void grant_pending_locks(struct dlm_rsb *r)
1355{
1356 struct dlm_lkb *lkb, *s;
1357 int high = DLM_LOCK_IV;
1358
1359 DLM_ASSERT(is_master(r), dlm_dump_rsb(r););
1360
1361 high = grant_pending_convert(r, high);
1362 high = grant_pending_wait(r, high);
1363
1364 if (high == DLM_LOCK_IV)
1365 return;
1366
1367 /*
1368 * If there are locks left on the wait/convert queue then send blocking
1369 * ASTs to granted locks based on the largest requested mode (high)
1370 * found above. FIXME: highbast < high comparison not valid for PR/CW.
1371 */
1372
1373 list_for_each_entry_safe(lkb, s, &r->res_grantqueue, lkb_statequeue) {
1374 if (lkb->lkb_bastaddr && (lkb->lkb_highbast < high) &&
1375 !__dlm_compat_matrix[lkb->lkb_grmode+1][high+1]) {
1376 queue_bast(r, lkb, high);
1377 lkb->lkb_highbast = high;
1378 }
1379 }
1380}
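
/* Illustration (editor's note): if locks remain on the wait/convert
   queues after the grant passes above, "high" is the largest ungranted
   rqmode.  With high == DLM_LOCK_EX, for example, every granted lock
   above NL is incompatible with EX, so each such lkb with a bast callback
   and lkb_highbast < EX gets a blocking ast asking its holder to release
   or demote; lkb_highbast is then raised so the same holder is not
   re-notified for EX again. */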
1381
1382static void send_bast_queue(struct dlm_rsb *r, struct list_head *head,
1383 struct dlm_lkb *lkb)
1384{
1385 struct dlm_lkb *gr;
1386
1387 list_for_each_entry(gr, head, lkb_statequeue) {
1388 if (gr->lkb_bastaddr &&
1389 gr->lkb_highbast < lkb->lkb_rqmode &&
1390 !modes_compat(gr, lkb)) {
1391 queue_bast(r, gr, lkb->lkb_rqmode);
1392 gr->lkb_highbast = lkb->lkb_rqmode;
1393 }
1394 }
1395}
1396
1397static void send_blocking_asts(struct dlm_rsb *r, struct dlm_lkb *lkb)
1398{
1399 send_bast_queue(r, &r->res_grantqueue, lkb);
1400}
1401
1402static void send_blocking_asts_all(struct dlm_rsb *r, struct dlm_lkb *lkb)
1403{
1404 send_bast_queue(r, &r->res_grantqueue, lkb);
1405 send_bast_queue(r, &r->res_convertqueue, lkb);
1406}
1407
1408/* set_master(r, lkb) -- set the master nodeid of a resource
1409
1410 The purpose of this function is to set the nodeid field in the given
1411 lkb using the nodeid field in the given rsb. If the rsb's nodeid is
1412 known, it can just be copied to the lkb and the function will return
1413 0. If the rsb's nodeid is _not_ known, it needs to be looked up
1414 before it can be copied to the lkb.
1415
1416 When the rsb nodeid is being looked up remotely, the initial lkb
1417 causing the lookup is kept on the ls_waiters list waiting for the
1418 lookup reply. Other lkb's waiting for the same rsb lookup are kept
1419 on the rsb's res_lookup list until the master is verified.
1420
1421 Return values:
1422 0: nodeid is set in rsb/lkb and the caller should go ahead and use it
1423 1: the rsb master is not available and the lkb has been placed on
1424 a wait queue
1425*/
1426
1427static int set_master(struct dlm_rsb *r, struct dlm_lkb *lkb)
1428{
1429 struct dlm_ls *ls = r->res_ls;
1430 int error, dir_nodeid, ret_nodeid, our_nodeid = dlm_our_nodeid();
1431
1432 if (rsb_flag(r, RSB_MASTER_UNCERTAIN)) {
1433 rsb_clear_flag(r, RSB_MASTER_UNCERTAIN);
1434 r->res_first_lkid = lkb->lkb_id;
1435 lkb->lkb_nodeid = r->res_nodeid;
1436 return 0;
1437 }
1438
1439 if (r->res_first_lkid && r->res_first_lkid != lkb->lkb_id) {
1440 list_add_tail(&lkb->lkb_rsb_lookup, &r->res_lookup);
1441 return 1;
1442 }
1443
1444 if (r->res_nodeid == 0) {
1445 lkb->lkb_nodeid = 0;
1446 return 0;
1447 }
1448
1449 if (r->res_nodeid > 0) {
1450 lkb->lkb_nodeid = r->res_nodeid;
1451 return 0;
1452 }
1453
1454 DLM_ASSERT(r->res_nodeid == -1, dlm_dump_rsb(r););
1455
1456 dir_nodeid = dlm_dir_nodeid(r);
1457
1458 if (dir_nodeid != our_nodeid) {
1459 r->res_first_lkid = lkb->lkb_id;
1460 send_lookup(r, lkb);
1461 return 1;
1462 }
1463
1464 for (;;) {
1465 /* It's possible for dlm_scand to remove an old rsb for
1466		   this same resource from the toss list, for us to create
1467 a new one, look up the master locally, and find it
1468 already exists just before dlm_scand does the
1469 dir_remove() on the previous rsb. */
1470
1471 error = dlm_dir_lookup(ls, our_nodeid, r->res_name,
1472 r->res_length, &ret_nodeid);
1473 if (!error)
1474 break;
1475 log_debug(ls, "dir_lookup error %d %s", error, r->res_name);
1476 schedule();
1477 }
1478
1479 if (ret_nodeid == our_nodeid) {
1480 r->res_first_lkid = 0;
1481 r->res_nodeid = 0;
1482 lkb->lkb_nodeid = 0;
1483 } else {
1484 r->res_first_lkid = lkb->lkb_id;
1485 r->res_nodeid = ret_nodeid;
1486 lkb->lkb_nodeid = ret_nodeid;
1487 }
1488 return 0;
1489}
1490
1491static void process_lookup_list(struct dlm_rsb *r)
1492{
1493 struct dlm_lkb *lkb, *safe;
1494
1495 list_for_each_entry_safe(lkb, safe, &r->res_lookup, lkb_rsb_lookup) {
1496 list_del(&lkb->lkb_rsb_lookup);
1497 _request_lock(r, lkb);
1498 schedule();
1499 }
1500}
1501
1502/* confirm_master -- confirm (or deny) an rsb's master nodeid */
1503
1504static void confirm_master(struct dlm_rsb *r, int error)
1505{
1506 struct dlm_lkb *lkb;
1507
1508 if (!r->res_first_lkid)
1509 return;
1510
1511 switch (error) {
1512 case 0:
1513 case -EINPROGRESS:
1514 r->res_first_lkid = 0;
1515 process_lookup_list(r);
1516 break;
1517
1518 case -EAGAIN:
1519 /* the remote master didn't queue our NOQUEUE request;
1520 make a waiting lkb the first_lkid */
1521
1522 r->res_first_lkid = 0;
1523
1524 if (!list_empty(&r->res_lookup)) {
1525 lkb = list_entry(r->res_lookup.next, struct dlm_lkb,
1526 lkb_rsb_lookup);
1527 list_del(&lkb->lkb_rsb_lookup);
1528 r->res_first_lkid = lkb->lkb_id;
1529 _request_lock(r, lkb);
1530 } else
1531 r->res_nodeid = -1;
1532 break;
1533
1534 default:
1535 log_error(r->res_ls, "confirm_master unknown error %d", error);
1536 }
1537}
1538
1539static int set_lock_args(int mode, struct dlm_lksb *lksb, uint32_t flags,
1540 int namelen, uint32_t parent_lkid, void *ast,
1541 void *astarg, void *bast, struct dlm_args *args)
1542{
1543 int rv = -EINVAL;
1544
1545 /* check for invalid arg usage */
1546
1547 if (mode < 0 || mode > DLM_LOCK_EX)
1548 goto out;
1549
1550 if (!(flags & DLM_LKF_CONVERT) && (namelen > DLM_RESNAME_MAXLEN))
1551 goto out;
1552
1553 if (flags & DLM_LKF_CANCEL)
1554 goto out;
1555
1556 if (flags & DLM_LKF_QUECVT && !(flags & DLM_LKF_CONVERT))
1557 goto out;
1558
1559 if (flags & DLM_LKF_CONVDEADLK && !(flags & DLM_LKF_CONVERT))
1560 goto out;
1561
1562 if (flags & DLM_LKF_CONVDEADLK && flags & DLM_LKF_NOQUEUE)
1563 goto out;
1564
1565 if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_CONVERT)
1566 goto out;
1567
1568 if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_QUECVT)
1569 goto out;
1570
1571 if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_NOQUEUE)
1572 goto out;
1573
1574 if (flags & DLM_LKF_EXPEDITE && mode != DLM_LOCK_NL)
1575 goto out;
1576
1577 if (!ast || !lksb)
1578 goto out;
1579
1580 if (flags & DLM_LKF_VALBLK && !lksb->sb_lvbptr)
1581 goto out;
1582
1583 /* parent/child locks not yet supported */
1584 if (parent_lkid)
1585 goto out;
1586
1587 if (flags & DLM_LKF_CONVERT && !lksb->sb_lkid)
1588 goto out;
1589
1590	/* these args will be copied to the lkb in validate_lock_args;
1591 it cannot be done now because when converting locks, fields in
1592 an active lkb cannot be modified before locking the rsb */
1593
1594 args->flags = flags;
1595 args->astaddr = ast;
1596 args->astparam = (long) astarg;
1597 args->bastaddr = bast;
1598 args->mode = mode;
1599 args->lksb = lksb;
1600 rv = 0;
1601 out:
1602 return rv;
1603}
1604
1605static int set_unlock_args(uint32_t flags, void *astarg, struct dlm_args *args)
1606{
1607 if (flags & ~(DLM_LKF_CANCEL | DLM_LKF_VALBLK | DLM_LKF_IVVALBLK |
1608 DLM_LKF_FORCEUNLOCK))
1609 return -EINVAL;
1610
1611 args->flags = flags;
1612 args->astparam = (long) astarg;
1613 return 0;
1614}
1615
1616static int validate_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
1617 struct dlm_args *args)
1618{
1619 int rv = -EINVAL;
1620
1621 if (args->flags & DLM_LKF_CONVERT) {
1622 if (lkb->lkb_flags & DLM_IFL_MSTCPY)
1623 goto out;
1624
1625 if (args->flags & DLM_LKF_QUECVT &&
1626 !__quecvt_compat_matrix[lkb->lkb_grmode+1][args->mode+1])
1627 goto out;
1628
1629 rv = -EBUSY;
1630 if (lkb->lkb_status != DLM_LKSTS_GRANTED)
1631 goto out;
1632
1633 if (lkb->lkb_wait_type)
1634 goto out;
1635 }
1636
1637 lkb->lkb_exflags = args->flags;
1638 lkb->lkb_sbflags = 0;
1639 lkb->lkb_astaddr = args->astaddr;
1640 lkb->lkb_astparam = args->astparam;
1641 lkb->lkb_bastaddr = args->bastaddr;
1642 lkb->lkb_rqmode = args->mode;
1643 lkb->lkb_lksb = args->lksb;
1644 lkb->lkb_lvbptr = args->lksb->sb_lvbptr;
1645 lkb->lkb_ownpid = (int) current->pid;
1646 rv = 0;
1647 out:
1648 return rv;
1649}
1650
1651static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args)
1652{
1653 int rv = -EINVAL;
1654
1655 if (lkb->lkb_flags & DLM_IFL_MSTCPY)
1656 goto out;
1657
1658 if (args->flags & DLM_LKF_FORCEUNLOCK)
1659 goto out_ok;
1660
1661 if (args->flags & DLM_LKF_CANCEL &&
1662 lkb->lkb_status == DLM_LKSTS_GRANTED)
1663 goto out;
1664
1665 if (!(args->flags & DLM_LKF_CANCEL) &&
1666 lkb->lkb_status != DLM_LKSTS_GRANTED)
1667 goto out;
1668
1669 rv = -EBUSY;
1670 if (lkb->lkb_wait_type)
1671 goto out;
1672
1673 out_ok:
1674 lkb->lkb_exflags = args->flags;
1675 lkb->lkb_sbflags = 0;
1676 lkb->lkb_astparam = args->astparam;
1677
1678 rv = 0;
1679 out:
1680 return rv;
1681}
1682
1683/*
1684 * Four stage 4 varieties:
1685 * do_request(), do_convert(), do_unlock(), do_cancel()
1686 * These are called from the central locking logic, on the node
1687 * that masters the given lock.
1688 */
1689
1690static int do_request(struct dlm_rsb *r, struct dlm_lkb *lkb)
1691{
1692 int error = 0;
1693
1694 if (can_be_granted(r, lkb, 1)) {
1695 grant_lock(r, lkb);
1696 queue_cast(r, lkb, 0);
1697 goto out;
1698 }
1699
1700 if (can_be_queued(lkb)) {
1701 error = -EINPROGRESS;
1702 add_lkb(r, lkb, DLM_LKSTS_WAITING);
1703 send_blocking_asts(r, lkb);
1704 goto out;
1705 }
1706
1707 error = -EAGAIN;
1708 if (force_blocking_asts(lkb))
1709 send_blocking_asts_all(r, lkb);
1710 queue_cast(r, lkb, -EAGAIN);
1711
1712 out:
1713 return error;
1714}
1715
1716static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
1717{
1718 int error = 0;
1719
1720 /* changing an existing lock may allow others to be granted */
1721
1722 if (can_be_granted(r, lkb, 1)) {
1723 grant_lock(r, lkb);
1724 queue_cast(r, lkb, 0);
1725 grant_pending_locks(r);
1726 goto out;
1727 }
1728
1729 if (can_be_queued(lkb)) {
1730 if (is_demoted(lkb))
1731 grant_pending_locks(r);
1732 error = -EINPROGRESS;
1733 del_lkb(r, lkb);
1734 add_lkb(r, lkb, DLM_LKSTS_CONVERT);
1735 send_blocking_asts(r, lkb);
1736 goto out;
1737 }
1738
1739 error = -EAGAIN;
1740 if (force_blocking_asts(lkb))
1741 send_blocking_asts_all(r, lkb);
1742 queue_cast(r, lkb, -EAGAIN);
1743
1744 out:
1745 return error;
1746}
1747
1748static int do_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
1749{
1750 remove_lock(r, lkb);
1751 queue_cast(r, lkb, -DLM_EUNLOCK);
1752 grant_pending_locks(r);
1753 return -DLM_EUNLOCK;
1754}
1755
1756/* FIXME: if revert_lock() finds that the lkb is granted, we should
1757 skip the queue_cast(ECANCEL). It indicates that the request/convert
1758 completed (and queued a normal ast) just before the cancel; we don't
1759 want to clobber the sb_result for the normal ast with ECANCEL. */
1760
1761static int do_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb)
1762{
1763 revert_lock(r, lkb);
1764 queue_cast(r, lkb, -DLM_ECANCEL);
1765 grant_pending_locks(r);
1766 return -DLM_ECANCEL;
1767}
1768
1769/*
1770 * Four stage 3 varieties:
1771 * _request_lock(), _convert_lock(), _unlock_lock(), _cancel_lock()
1772 */
1773
1774/* add a new lkb to a possibly new rsb, called by requesting process */
1775
1776static int _request_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
1777{
1778 int error;
1779
1780 /* set_master: sets lkb nodeid from r */
1781
1782 error = set_master(r, lkb);
1783 if (error < 0)
1784 goto out;
1785 if (error) {
1786 error = 0;
1787 goto out;
1788 }
1789
1790 if (is_remote(r))
1791 /* receive_request() calls do_request() on remote node */
1792 error = send_request(r, lkb);
1793 else
1794 error = do_request(r, lkb);
1795 out:
1796 return error;
1797}
1798
1799/* change some property of an existing lkb, e.g. mode */
1800
1801static int _convert_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
1802{
1803 int error;
1804
1805 if (is_remote(r))
1806 /* receive_convert() calls do_convert() on remote node */
1807 error = send_convert(r, lkb);
1808 else
1809 error = do_convert(r, lkb);
1810
1811 return error;
1812}
1813
1814/* remove an existing lkb from the granted queue */
1815
1816static int _unlock_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
1817{
1818 int error;
1819
1820 if (is_remote(r))
1821 /* receive_unlock() calls do_unlock() on remote node */
1822 error = send_unlock(r, lkb);
1823 else
1824 error = do_unlock(r, lkb);
1825
1826 return error;
1827}
1828
1829/* remove an existing lkb from the convert or wait queue */
1830
1831static int _cancel_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
1832{
1833 int error;
1834
1835 if (is_remote(r))
1836 /* receive_cancel() calls do_cancel() on remote node */
1837 error = send_cancel(r, lkb);
1838 else
1839 error = do_cancel(r, lkb);
1840
1841 return error;
1842}
1843
1844/*
1845 * Four stage 2 varieties:
1846 * request_lock(), convert_lock(), unlock_lock(), cancel_lock()
1847 */
1848
1849static int request_lock(struct dlm_ls *ls, struct dlm_lkb *lkb, char *name,
1850 int len, struct dlm_args *args)
1851{
1852 struct dlm_rsb *r;
1853 int error;
1854
1855 error = validate_lock_args(ls, lkb, args);
1856 if (error)
1857 goto out;
1858
1859 error = find_rsb(ls, name, len, R_CREATE, &r);
1860 if (error)
1861 goto out;
1862
1863 lock_rsb(r);
1864
1865 attach_lkb(r, lkb);
1866 lkb->lkb_lksb->sb_lkid = lkb->lkb_id;
1867
1868 error = _request_lock(r, lkb);
1869
1870 unlock_rsb(r);
1871 put_rsb(r);
1872
1873 out:
1874 return error;
1875}
1876
1877static int convert_lock(struct dlm_ls *ls, struct dlm_lkb *lkb,
1878 struct dlm_args *args)
1879{
1880 struct dlm_rsb *r;
1881 int error;
1882
1883 r = lkb->lkb_resource;
1884
1885 hold_rsb(r);
1886 lock_rsb(r);
1887
1888 error = validate_lock_args(ls, lkb, args);
1889 if (error)
1890 goto out;
1891
1892 error = _convert_lock(r, lkb);
1893 out:
1894 unlock_rsb(r);
1895 put_rsb(r);
1896 return error;
1897}
1898
1899static int unlock_lock(struct dlm_ls *ls, struct dlm_lkb *lkb,
1900 struct dlm_args *args)
1901{
1902 struct dlm_rsb *r;
1903 int error;
1904
1905 r = lkb->lkb_resource;
1906
1907 hold_rsb(r);
1908 lock_rsb(r);
1909
1910 error = validate_unlock_args(lkb, args);
1911 if (error)
1912 goto out;
1913
1914 error = _unlock_lock(r, lkb);
1915 out:
1916 unlock_rsb(r);
1917 put_rsb(r);
1918 return error;
1919}
1920
1921static int cancel_lock(struct dlm_ls *ls, struct dlm_lkb *lkb,
1922 struct dlm_args *args)
1923{
1924 struct dlm_rsb *r;
1925 int error;
1926
1927 r = lkb->lkb_resource;
1928
1929 hold_rsb(r);
1930 lock_rsb(r);
1931
1932 error = validate_unlock_args(lkb, args);
1933 if (error)
1934 goto out;
1935
1936 error = _cancel_lock(r, lkb);
1937 out:
1938 unlock_rsb(r);
1939 put_rsb(r);
1940 return error;
1941}
1942
1943/*
1944 * Two stage 1 varieties: dlm_lock() and dlm_unlock()
1945 */
1946
1947int dlm_lock(dlm_lockspace_t *lockspace,
1948 int mode,
1949 struct dlm_lksb *lksb,
1950 uint32_t flags,
1951 void *name,
1952 unsigned int namelen,
1953 uint32_t parent_lkid,
1954 void (*ast) (void *astarg),
1955 void *astarg,
1956 void (*bast) (void *astarg, int mode))
1957{
1958 struct dlm_ls *ls;
1959 struct dlm_lkb *lkb;
1960 struct dlm_args args;
1961 int error, convert = flags & DLM_LKF_CONVERT;
1962
1963 ls = dlm_find_lockspace_local(lockspace);
1964 if (!ls)
1965 return -EINVAL;
1966
1967 lock_recovery(ls);
1968
1969 if (convert)
1970 error = find_lkb(ls, lksb->sb_lkid, &lkb);
1971 else
1972 error = create_lkb(ls, &lkb);
1973
1974 if (error)
1975 goto out;
1976
1977 error = set_lock_args(mode, lksb, flags, namelen, parent_lkid, ast,
1978 astarg, bast, &args);
1979 if (error)
1980 goto out_put;
1981
1982 if (convert)
1983 error = convert_lock(ls, lkb, &args);
1984 else
1985 error = request_lock(ls, lkb, name, namelen, &args);
1986
1987 if (error == -EINPROGRESS)
1988 error = 0;
1989 out_put:
1990 if (convert || error)
1991 __put_lkb(ls, lkb);
1992 if (error == -EAGAIN)
1993 error = 0;
1994 out:
1995 unlock_recovery(ls);
1996 dlm_put_lockspace(ls);
1997 return error;
1998}
1999
2000int dlm_unlock(dlm_lockspace_t *lockspace,
2001 uint32_t lkid,
2002 uint32_t flags,
2003 struct dlm_lksb *lksb,
2004 void *astarg)
2005{
2006 struct dlm_ls *ls;
2007 struct dlm_lkb *lkb;
2008 struct dlm_args args;
2009 int error;
2010
2011 ls = dlm_find_lockspace_local(lockspace);
2012 if (!ls)
2013 return -EINVAL;
2014
2015 lock_recovery(ls);
2016
2017 error = find_lkb(ls, lkid, &lkb);
2018 if (error)
2019 goto out;
2020
2021 error = set_unlock_args(flags, astarg, &args);
2022 if (error)
2023 goto out_put;
2024
2025 if (flags & DLM_LKF_CANCEL)
2026 error = cancel_lock(ls, lkb, &args);
2027 else
2028 error = unlock_lock(ls, lkb, &args);
2029
2030 if (error == -DLM_EUNLOCK || error == -DLM_ECANCEL)
2031 error = 0;
2032 out_put:
2033 dlm_put_lkb(lkb);
2034 out:
2035 unlock_recovery(ls);
2036 dlm_put_lockspace(ls);
2037 return error;
2038}
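
/* Example usage (editor's sketch, not part of this file): a kernel
   caller taking and dropping an EX lock.  The lockspace "ls" and the
   callbacks my_ast()/my_bast() are assumed to be provided by the caller
   (e.g. from dlm_new_lockspace()); error handling is omitted, and the
   final status of each call arrives asynchronously in lksb.sb_status
   via the completion ast.

	struct dlm_lksb lksb;
	int error;

	memset(&lksb, 0, sizeof(lksb));
	error = dlm_lock(ls, DLM_LOCK_EX, &lksb, 0, "resone", 6, 0,
			 my_ast, &lksb, my_bast);
	... wait for my_ast(), then check lksb.sb_status ...
	error = dlm_unlock(ls, lksb.sb_lkid, 0, &lksb, &lksb);

   Passing DLM_LKF_CANCEL in the dlm_unlock() flags would instead cancel
   a request still sitting on the convert or wait queue. */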
2039
2040/*
2041 * send/receive routines for remote operations and replies
2042 *
2043 * send_args
2044 * send_common
2045 * send_request receive_request
2046 * send_convert receive_convert
2047 * send_unlock receive_unlock
2048 * send_cancel receive_cancel
2049 * send_grant receive_grant
2050 * send_bast receive_bast
2051 * send_lookup receive_lookup
2052 * send_remove receive_remove
2053 *
2054 * send_common_reply
2055 * receive_request_reply send_request_reply
2056 * receive_convert_reply send_convert_reply
2057 * receive_unlock_reply send_unlock_reply
2058 * receive_cancel_reply send_cancel_reply
2059 * receive_lookup_reply send_lookup_reply
2060 */
2061
2062static int create_message(struct dlm_rsb *r, struct dlm_lkb *lkb,
2063 int to_nodeid, int mstype,
2064 struct dlm_message **ms_ret,
2065 struct dlm_mhandle **mh_ret)
2066{
2067 struct dlm_message *ms;
2068 struct dlm_mhandle *mh;
2069 char *mb;
2070 int mb_len = sizeof(struct dlm_message);
2071
2072 switch (mstype) {
2073 case DLM_MSG_REQUEST:
2074 case DLM_MSG_LOOKUP:
2075 case DLM_MSG_REMOVE:
2076 mb_len += r->res_length;
2077 break;
2078 case DLM_MSG_CONVERT:
2079 case DLM_MSG_UNLOCK:
2080 case DLM_MSG_REQUEST_REPLY:
2081 case DLM_MSG_CONVERT_REPLY:
2082 case DLM_MSG_GRANT:
2083 if (lkb && lkb->lkb_lvbptr)
2084 mb_len += r->res_ls->ls_lvblen;
2085 break;
2086 }
2087
2088 /* get_buffer gives us a message handle (mh) that we need to
2089 pass into lowcomms_commit and a message buffer (mb) that we
2090 write our data into */
2091
2092 mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, GFP_KERNEL, &mb);
2093 if (!mh)
2094 return -ENOBUFS;
2095
2096 memset(mb, 0, mb_len);
2097
2098 ms = (struct dlm_message *) mb;
2099
2100 ms->m_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
2101 ms->m_header.h_lockspace = r->res_ls->ls_global_id;
2102 ms->m_header.h_nodeid = dlm_our_nodeid();
2103 ms->m_header.h_length = mb_len;
2104 ms->m_header.h_cmd = DLM_MSG;
2105
2106 ms->m_type = mstype;
2107
2108 *mh_ret = mh;
2109 *ms_ret = ms;
2110 return 0;
2111}
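
/* Illustration (editor's note): for a DLM_MSG_REQUEST on a resource
   whose res_length is 6, the buffer is sized as
   sizeof(struct dlm_message) + 6 and send_args() copies the name into
   m_extra; receive_extralen() below recovers that 6 on the receiving
   node by subtracting the fixed message size from h_length. */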
2112
2113/* further lowcomms enhancements or alternate implementations may make
2114 the return value from this function useful at some point */
2115
2116static int send_message(struct dlm_mhandle *mh, struct dlm_message *ms)
2117{
2118 dlm_message_out(ms);
2119 dlm_lowcomms_commit_buffer(mh);
2120 return 0;
2121}
2122
2123static void send_args(struct dlm_rsb *r, struct dlm_lkb *lkb,
2124 struct dlm_message *ms)
2125{
2126 ms->m_nodeid = lkb->lkb_nodeid;
2127 ms->m_pid = lkb->lkb_ownpid;
2128 ms->m_lkid = lkb->lkb_id;
2129 ms->m_remid = lkb->lkb_remid;
2130 ms->m_exflags = lkb->lkb_exflags;
2131 ms->m_sbflags = lkb->lkb_sbflags;
2132 ms->m_flags = lkb->lkb_flags;
2133 ms->m_lvbseq = lkb->lkb_lvbseq;
2134 ms->m_status = lkb->lkb_status;
2135 ms->m_grmode = lkb->lkb_grmode;
2136 ms->m_rqmode = lkb->lkb_rqmode;
2137 ms->m_hash = r->res_hash;
2138
2139 /* m_result and m_bastmode are set from function args,
2140 not from lkb fields */
2141
2142 if (lkb->lkb_bastaddr)
2143 ms->m_asts |= AST_BAST;
2144 if (lkb->lkb_astaddr)
2145 ms->m_asts |= AST_COMP;
2146
2147 if (ms->m_type == DLM_MSG_REQUEST || ms->m_type == DLM_MSG_LOOKUP)
2148 memcpy(ms->m_extra, r->res_name, r->res_length);
2149
2150 else if (lkb->lkb_lvbptr)
2151 memcpy(ms->m_extra, lkb->lkb_lvbptr, r->res_ls->ls_lvblen);
2152
2153}
2154
2155static int send_common(struct dlm_rsb *r, struct dlm_lkb *lkb, int mstype)
2156{
2157 struct dlm_message *ms;
2158 struct dlm_mhandle *mh;
2159 int to_nodeid, error;
2160
2161 add_to_waiters(lkb, mstype);
2162
2163 to_nodeid = r->res_nodeid;
2164
2165 error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh);
2166 if (error)
2167 goto fail;
2168
2169 send_args(r, lkb, ms);
2170
2171 error = send_message(mh, ms);
2172 if (error)
2173 goto fail;
2174 return 0;
2175
2176 fail:
2177 remove_from_waiters(lkb);
2178 return error;
2179}
2180
2181static int send_request(struct dlm_rsb *r, struct dlm_lkb *lkb)
2182{
2183 return send_common(r, lkb, DLM_MSG_REQUEST);
2184}
2185
2186static int send_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
2187{
2188 int error;
2189
2190 error = send_common(r, lkb, DLM_MSG_CONVERT);
2191
2192 /* down conversions go without a reply from the master */
2193 if (!error && down_conversion(lkb)) {
2194 remove_from_waiters(lkb);
2195 r->res_ls->ls_stub_ms.m_result = 0;
2196 r->res_ls->ls_stub_ms.m_flags = lkb->lkb_flags;
2197 __receive_convert_reply(r, lkb, &r->res_ls->ls_stub_ms);
2198 }
2199
2200 return error;
2201}
2202
2203/* FIXME: if this lkb is the only lock we hold on the rsb, then set
2204 MASTER_UNCERTAIN to force the next request on the rsb to confirm
2205 that the master is still correct. */
2206
2207static int send_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
2208{
2209 return send_common(r, lkb, DLM_MSG_UNLOCK);
2210}
2211
2212static int send_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb)
2213{
2214 return send_common(r, lkb, DLM_MSG_CANCEL);
2215}
2216
2217static int send_grant(struct dlm_rsb *r, struct dlm_lkb *lkb)
2218{
2219 struct dlm_message *ms;
2220 struct dlm_mhandle *mh;
2221 int to_nodeid, error;
2222
2223 to_nodeid = lkb->lkb_nodeid;
2224
2225 error = create_message(r, lkb, to_nodeid, DLM_MSG_GRANT, &ms, &mh);
2226 if (error)
2227 goto out;
2228
2229 send_args(r, lkb, ms);
2230
2231 ms->m_result = 0;
2232
2233 error = send_message(mh, ms);
2234 out:
2235 return error;
2236}
2237
2238static int send_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int mode)
2239{
2240 struct dlm_message *ms;
2241 struct dlm_mhandle *mh;
2242 int to_nodeid, error;
2243
2244 to_nodeid = lkb->lkb_nodeid;
2245
2246 error = create_message(r, NULL, to_nodeid, DLM_MSG_BAST, &ms, &mh);
2247 if (error)
2248 goto out;
2249
2250 send_args(r, lkb, ms);
2251
2252 ms->m_bastmode = mode;
2253
2254 error = send_message(mh, ms);
2255 out:
2256 return error;
2257}
2258
2259static int send_lookup(struct dlm_rsb *r, struct dlm_lkb *lkb)
2260{
2261 struct dlm_message *ms;
2262 struct dlm_mhandle *mh;
2263 int to_nodeid, error;
2264
2265 add_to_waiters(lkb, DLM_MSG_LOOKUP);
2266
2267 to_nodeid = dlm_dir_nodeid(r);
2268
2269 error = create_message(r, NULL, to_nodeid, DLM_MSG_LOOKUP, &ms, &mh);
2270 if (error)
2271 goto fail;
2272
2273 send_args(r, lkb, ms);
2274
2275 error = send_message(mh, ms);
2276 if (error)
2277 goto fail;
2278 return 0;
2279
2280 fail:
2281 remove_from_waiters(lkb);
2282 return error;
2283}
2284
2285static int send_remove(struct dlm_rsb *r)
2286{
2287 struct dlm_message *ms;
2288 struct dlm_mhandle *mh;
2289 int to_nodeid, error;
2290
2291 to_nodeid = dlm_dir_nodeid(r);
2292
2293 error = create_message(r, NULL, to_nodeid, DLM_MSG_REMOVE, &ms, &mh);
2294 if (error)
2295 goto out;
2296
2297 memcpy(ms->m_extra, r->res_name, r->res_length);
2298 ms->m_hash = r->res_hash;
2299
2300 error = send_message(mh, ms);
2301 out:
2302 return error;
2303}
2304
2305static int send_common_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
2306 int mstype, int rv)
2307{
2308 struct dlm_message *ms;
2309 struct dlm_mhandle *mh;
2310 int to_nodeid, error;
2311
2312 to_nodeid = lkb->lkb_nodeid;
2313
2314 error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh);
2315 if (error)
2316 goto out;
2317
2318 send_args(r, lkb, ms);
2319
2320 ms->m_result = rv;
2321
2322 error = send_message(mh, ms);
2323 out:
2324 return error;
2325}
2326
2327static int send_request_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
2328{
2329 return send_common_reply(r, lkb, DLM_MSG_REQUEST_REPLY, rv);
2330}
2331
2332static int send_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
2333{
2334 return send_common_reply(r, lkb, DLM_MSG_CONVERT_REPLY, rv);
2335}
2336
2337static int send_unlock_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
2338{
2339 return send_common_reply(r, lkb, DLM_MSG_UNLOCK_REPLY, rv);
2340}
2341
2342static int send_cancel_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
2343{
2344 return send_common_reply(r, lkb, DLM_MSG_CANCEL_REPLY, rv);
2345}
2346
2347static int send_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms_in,
2348 int ret_nodeid, int rv)
2349{
2350 struct dlm_rsb *r = &ls->ls_stub_rsb;
2351 struct dlm_message *ms;
2352 struct dlm_mhandle *mh;
2353 int error, nodeid = ms_in->m_header.h_nodeid;
2354
2355 error = create_message(r, NULL, nodeid, DLM_MSG_LOOKUP_REPLY, &ms, &mh);
2356 if (error)
2357 goto out;
2358
2359 ms->m_lkid = ms_in->m_lkid;
2360 ms->m_result = rv;
2361 ms->m_nodeid = ret_nodeid;
2362
2363 error = send_message(mh, ms);
2364 out:
2365 return error;
2366}
2367
2368/* which args we save from a received message depends heavily on the type
2369 of message, unlike the send side where we can safely send everything about
2370 the lkb for any type of message */
2371
2372static void receive_flags(struct dlm_lkb *lkb, struct dlm_message *ms)
2373{
2374 lkb->lkb_exflags = ms->m_exflags;
2375 lkb->lkb_flags = (lkb->lkb_flags & 0xFFFF0000) |
2376 (ms->m_flags & 0x0000FFFF);
2377}
2378
2379static void receive_flags_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
2380{
2381 lkb->lkb_sbflags = ms->m_sbflags;
2382 lkb->lkb_flags = (lkb->lkb_flags & 0xFFFF0000) |
2383 (ms->m_flags & 0x0000FFFF);
2384}
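
/* Illustration (editor's note): only the low 16 bits of lkb_flags are
   meaningful on the wire; internal flags such as DLM_IFL_MSTCPY live in
   the high word and must survive a received message, hence the merge
   both helpers above perform:

	lkb_flags = (local & 0xFFFF0000) | (remote & 0x0000FFFF);
*/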
2385
2386static int receive_extralen(struct dlm_message *ms)
2387{
2388 return (ms->m_header.h_length - sizeof(struct dlm_message));
2389}
2390
2391static int receive_lvb(struct dlm_ls *ls, struct dlm_lkb *lkb,
2392 struct dlm_message *ms)
2393{
2394 int len;
2395
2396 if (lkb->lkb_exflags & DLM_LKF_VALBLK) {
2397 if (!lkb->lkb_lvbptr)
2398 lkb->lkb_lvbptr = allocate_lvb(ls);
2399 if (!lkb->lkb_lvbptr)
2400 return -ENOMEM;
2401 len = receive_extralen(ms);
2402 memcpy(lkb->lkb_lvbptr, ms->m_extra, len);
2403 }
2404 return 0;
2405}
2406
2407static int receive_request_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
2408 struct dlm_message *ms)
2409{
2410 lkb->lkb_nodeid = ms->m_header.h_nodeid;
2411 lkb->lkb_ownpid = ms->m_pid;
2412 lkb->lkb_remid = ms->m_lkid;
2413 lkb->lkb_grmode = DLM_LOCK_IV;
2414 lkb->lkb_rqmode = ms->m_rqmode;
2415 lkb->lkb_bastaddr = (void *) (long) (ms->m_asts & AST_BAST);
2416 lkb->lkb_astaddr = (void *) (long) (ms->m_asts & AST_COMP);
2417
2418 DLM_ASSERT(is_master_copy(lkb), dlm_print_lkb(lkb););
2419
2420 if (receive_lvb(ls, lkb, ms))
2421 return -ENOMEM;
2422
2423 return 0;
2424}
2425
2426static int receive_convert_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
2427 struct dlm_message *ms)
2428{
2429 if (lkb->lkb_nodeid != ms->m_header.h_nodeid) {
2430 log_error(ls, "convert_args nodeid %d %d lkid %x %x",
2431 lkb->lkb_nodeid, ms->m_header.h_nodeid,
2432 lkb->lkb_id, lkb->lkb_remid);
2433 return -EINVAL;
2434 }
2435
2436 if (!is_master_copy(lkb))
2437 return -EINVAL;
2438
2439 if (lkb->lkb_status != DLM_LKSTS_GRANTED)
2440 return -EBUSY;
2441
2442 if (receive_lvb(ls, lkb, ms))
2443 return -ENOMEM;
2444
2445 lkb->lkb_rqmode = ms->m_rqmode;
2446 lkb->lkb_lvbseq = ms->m_lvbseq;
2447
2448 return 0;
2449}
2450
2451static int receive_unlock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
2452 struct dlm_message *ms)
2453{
2454 if (!is_master_copy(lkb))
2455 return -EINVAL;
2456 if (receive_lvb(ls, lkb, ms))
2457 return -ENOMEM;
2458 return 0;
2459}
2460
2461/* We fill in the stub-lkb fields with the info that send_xxxx_reply()
2462 uses to send a reply and that the remote end uses to process the reply. */
2463
2464static void setup_stub_lkb(struct dlm_ls *ls, struct dlm_message *ms)
2465{
2466 struct dlm_lkb *lkb = &ls->ls_stub_lkb;
2467 lkb->lkb_nodeid = ms->m_header.h_nodeid;
2468 lkb->lkb_remid = ms->m_lkid;
2469}
2470
2471static void receive_request(struct dlm_ls *ls, struct dlm_message *ms)
2472{
2473 struct dlm_lkb *lkb;
2474 struct dlm_rsb *r;
2475 int error, namelen;
2476
2477 error = create_lkb(ls, &lkb);
2478 if (error)
2479 goto fail;
2480
2481 receive_flags(lkb, ms);
2482 lkb->lkb_flags |= DLM_IFL_MSTCPY;
2483 error = receive_request_args(ls, lkb, ms);
2484 if (error) {
2485 __put_lkb(ls, lkb);
2486 goto fail;
2487 }
2488
2489 namelen = receive_extralen(ms);
2490
2491 error = find_rsb(ls, ms->m_extra, namelen, R_MASTER, &r);
2492 if (error) {
2493 __put_lkb(ls, lkb);
2494 goto fail;
2495 }
2496
2497 lock_rsb(r);
2498
2499 attach_lkb(r, lkb);
2500 error = do_request(r, lkb);
2501 send_request_reply(r, lkb, error);
2502
2503 unlock_rsb(r);
2504 put_rsb(r);
2505
2506 if (error == -EINPROGRESS)
2507 error = 0;
2508 if (error)
2509 dlm_put_lkb(lkb);
2510 return;
2511
2512 fail:
2513 setup_stub_lkb(ls, ms);
2514 send_request_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
2515}
2516
2517static void receive_convert(struct dlm_ls *ls, struct dlm_message *ms)
2518{
2519 struct dlm_lkb *lkb;
2520 struct dlm_rsb *r;
2521 int error, reply = 1;
2522
2523 error = find_lkb(ls, ms->m_remid, &lkb);
2524 if (error)
2525 goto fail;
2526
2527 r = lkb->lkb_resource;
2528
2529 hold_rsb(r);
2530 lock_rsb(r);
2531
2532 receive_flags(lkb, ms);
2533 error = receive_convert_args(ls, lkb, ms);
2534 if (error)
2535 goto out;
2536 reply = !down_conversion(lkb);
2537
2538 error = do_convert(r, lkb);
2539 out:
2540 if (reply)
2541 send_convert_reply(r, lkb, error);
2542
2543 unlock_rsb(r);
2544 put_rsb(r);
2545 dlm_put_lkb(lkb);
2546 return;
2547
2548 fail:
2549 setup_stub_lkb(ls, ms);
2550 send_convert_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
2551}
2552
2553static void receive_unlock(struct dlm_ls *ls, struct dlm_message *ms)
2554{
2555 struct dlm_lkb *lkb;
2556 struct dlm_rsb *r;
2557 int error;
2558
2559 error = find_lkb(ls, ms->m_remid, &lkb);
2560 if (error)
2561 goto fail;
2562
2563 r = lkb->lkb_resource;
2564
2565 hold_rsb(r);
2566 lock_rsb(r);
2567
2568 receive_flags(lkb, ms);
2569 error = receive_unlock_args(ls, lkb, ms);
2570 if (error)
2571 goto out;
2572
2573 error = do_unlock(r, lkb);
2574 out:
2575 send_unlock_reply(r, lkb, error);
2576
2577 unlock_rsb(r);
2578 put_rsb(r);
2579 dlm_put_lkb(lkb);
2580 return;
2581
2582 fail:
2583 setup_stub_lkb(ls, ms);
2584 send_unlock_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
2585}
2586
2587static void receive_cancel(struct dlm_ls *ls, struct dlm_message *ms)
2588{
2589 struct dlm_lkb *lkb;
2590 struct dlm_rsb *r;
2591 int error;
2592
2593 error = find_lkb(ls, ms->m_remid, &lkb);
2594 if (error)
2595 goto fail;
2596
2597 receive_flags(lkb, ms);
2598
2599 r = lkb->lkb_resource;
2600
2601 hold_rsb(r);
2602 lock_rsb(r);
2603
2604 error = do_cancel(r, lkb);
2605 send_cancel_reply(r, lkb, error);
2606
2607 unlock_rsb(r);
2608 put_rsb(r);
2609 dlm_put_lkb(lkb);
2610 return;
2611
2612 fail:
2613 setup_stub_lkb(ls, ms);
2614 send_cancel_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
2615}
2616
2617static void receive_grant(struct dlm_ls *ls, struct dlm_message *ms)
2618{
2619 struct dlm_lkb *lkb;
2620 struct dlm_rsb *r;
2621 int error;
2622
2623 error = find_lkb(ls, ms->m_remid, &lkb);
2624 if (error) {
2625 log_error(ls, "receive_grant no lkb");
2626 return;
2627 }
2628 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
2629
2630 r = lkb->lkb_resource;
2631
2632 hold_rsb(r);
2633 lock_rsb(r);
2634
2635 receive_flags_reply(lkb, ms);
2636 grant_lock_pc(r, lkb, ms);
2637 queue_cast(r, lkb, 0);
2638
2639 unlock_rsb(r);
2640 put_rsb(r);
2641 dlm_put_lkb(lkb);
2642}
2643
2644static void receive_bast(struct dlm_ls *ls, struct dlm_message *ms)
2645{
2646 struct dlm_lkb *lkb;
2647 struct dlm_rsb *r;
2648 int error;
2649
2650 error = find_lkb(ls, ms->m_remid, &lkb);
2651 if (error) {
2652 log_error(ls, "receive_bast no lkb");
2653 return;
2654 }
2655 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
2656
2657 r = lkb->lkb_resource;
2658
2659 hold_rsb(r);
2660 lock_rsb(r);
2661
2662 queue_bast(r, lkb, ms->m_bastmode);
2663
2664 unlock_rsb(r);
2665 put_rsb(r);
2666 dlm_put_lkb(lkb);
2667}
2668
2669static void receive_lookup(struct dlm_ls *ls, struct dlm_message *ms)
2670{
2671 int len, error, ret_nodeid, dir_nodeid, from_nodeid, our_nodeid;
2672
2673 from_nodeid = ms->m_header.h_nodeid;
2674 our_nodeid = dlm_our_nodeid();
2675
2676 len = receive_extralen(ms);
2677
2678 dir_nodeid = dlm_hash2nodeid(ls, ms->m_hash);
2679 if (dir_nodeid != our_nodeid) {
2680 log_error(ls, "lookup dir_nodeid %d from %d",
2681 dir_nodeid, from_nodeid);
2682 error = -EINVAL;
2683 ret_nodeid = -1;
2684 goto out;
2685 }
2686
2687 error = dlm_dir_lookup(ls, from_nodeid, ms->m_extra, len, &ret_nodeid);
2688
2689 /* Optimization: we're master so treat lookup as a request */
2690 if (!error && ret_nodeid == our_nodeid) {
2691 receive_request(ls, ms);
2692 return;
2693 }
2694 out:
2695 send_lookup_reply(ls, ms, ret_nodeid, error);
2696}
2697
2698static void receive_remove(struct dlm_ls *ls, struct dlm_message *ms)
2699{
2700 int len, dir_nodeid, from_nodeid;
2701
2702 from_nodeid = ms->m_header.h_nodeid;
2703
2704 len = receive_extralen(ms);
2705
2706 dir_nodeid = dlm_hash2nodeid(ls, ms->m_hash);
2707 if (dir_nodeid != dlm_our_nodeid()) {
2708 log_error(ls, "remove dir entry dir_nodeid %d from %d",
2709 dir_nodeid, from_nodeid);
2710 return;
2711 }
2712
2713 dlm_dir_remove_entry(ls, from_nodeid, ms->m_extra, len);
2714}
2715
2716static void receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms)
2717{
2718 struct dlm_lkb *lkb;
2719 struct dlm_rsb *r;
2720 int error, mstype;
2721
2722 error = find_lkb(ls, ms->m_remid, &lkb);
2723 if (error) {
2724 log_error(ls, "receive_request_reply no lkb");
2725 return;
2726 }
2727 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
2728
2729 mstype = lkb->lkb_wait_type;
2730 error = remove_from_waiters(lkb);
2731 if (error) {
2732 log_error(ls, "receive_request_reply not on waiters");
2733 goto out;
2734 }
2735
2736 /* this is the value returned from do_request() on the master */
2737 error = ms->m_result;
2738
2739 r = lkb->lkb_resource;
2740 hold_rsb(r);
2741 lock_rsb(r);
2742
2743 /* Optimization: the dir node was also the master, so it took our
2744 lookup as a request and sent request reply instead of lookup reply */
2745 if (mstype == DLM_MSG_LOOKUP) {
2746 r->res_nodeid = ms->m_header.h_nodeid;
2747 lkb->lkb_nodeid = r->res_nodeid;
2748 }
2749
2750 switch (error) {
2751 case -EAGAIN:
2752 /* request would block (be queued) on remote master;
2753 the unhold undoes the original ref from create_lkb()
2754 so it leads to the lkb being freed */
2755 queue_cast(r, lkb, -EAGAIN);
2756 confirm_master(r, -EAGAIN);
2757 unhold_lkb(lkb);
2758 break;
2759
2760 case -EINPROGRESS:
2761 case 0:
2762 /* request was queued or granted on remote master */
2763 receive_flags_reply(lkb, ms);
2764 lkb->lkb_remid = ms->m_lkid;
2765 if (error)
2766 add_lkb(r, lkb, DLM_LKSTS_WAITING);
2767 else {
2768 grant_lock_pc(r, lkb, ms);
2769 queue_cast(r, lkb, 0);
2770 }
2771 confirm_master(r, error);
2772 break;
2773
2774 case -EBADR:
2775 case -ENOTBLK:
2776 /* find_rsb failed to find rsb or rsb wasn't master */
2777 r->res_nodeid = -1;
2778 lkb->lkb_nodeid = -1;
2779 _request_lock(r, lkb);
2780 break;
2781
2782 default:
2783 log_error(ls, "receive_request_reply error %d", error);
2784 }
2785
2786 unlock_rsb(r);
2787 put_rsb(r);
2788 out:
2789 dlm_put_lkb(lkb);
2790}
2791
2792static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
2793 struct dlm_message *ms)
2794{
2795 int error = ms->m_result;
2796
2797 /* this is the value returned from do_convert() on the master */
2798
2799 switch (error) {
2800 case -EAGAIN:
2801 /* convert would block (be queued) on remote master */
2802 queue_cast(r, lkb, -EAGAIN);
2803 break;
2804
2805 case -EINPROGRESS:
2806 /* convert was queued on remote master */
2807 del_lkb(r, lkb);
2808 add_lkb(r, lkb, DLM_LKSTS_CONVERT);
2809 break;
2810
2811 case 0:
2812 /* convert was granted on remote master */
2813 receive_flags_reply(lkb, ms);
2814 grant_lock_pc(r, lkb, ms);
2815 queue_cast(r, lkb, 0);
2816 break;
2817
2818 default:
2819 log_error(r->res_ls, "receive_convert_reply error %d", error);
2820 }
2821}
2822
2823static void _receive_convert_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
2824{
2825 struct dlm_rsb *r = lkb->lkb_resource;
2826
2827 hold_rsb(r);
2828 lock_rsb(r);
2829
2830 __receive_convert_reply(r, lkb, ms);
2831
2832 unlock_rsb(r);
2833 put_rsb(r);
2834}
2835
2836static void receive_convert_reply(struct dlm_ls *ls, struct dlm_message *ms)
2837{
2838 struct dlm_lkb *lkb;
2839 int error;
2840
2841 error = find_lkb(ls, ms->m_remid, &lkb);
2842 if (error) {
2843 log_error(ls, "receive_convert_reply no lkb");
2844 return;
2845 }
2846 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
2847
2848 error = remove_from_waiters(lkb);
2849 if (error) {
2850 log_error(ls, "receive_convert_reply not on waiters");
2851 goto out;
2852 }
2853
2854 _receive_convert_reply(lkb, ms);
2855 out:
2856 dlm_put_lkb(lkb);
2857}
2858
2859static void _receive_unlock_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
2860{
2861 struct dlm_rsb *r = lkb->lkb_resource;
2862 int error = ms->m_result;
2863
2864 hold_rsb(r);
2865 lock_rsb(r);
2866
2867 /* this is the value returned from do_unlock() on the master */
2868
2869 switch (error) {
2870 case -DLM_EUNLOCK:
2871 receive_flags_reply(lkb, ms);
2872 remove_lock_pc(r, lkb);
2873 queue_cast(r, lkb, -DLM_EUNLOCK);
2874 break;
2875 default:
2876 log_error(r->res_ls, "receive_unlock_reply error %d", error);
2877 }
2878
2879 unlock_rsb(r);
2880 put_rsb(r);
2881}
2882
2883static void receive_unlock_reply(struct dlm_ls *ls, struct dlm_message *ms)
2884{
2885 struct dlm_lkb *lkb;
2886 int error;
2887
2888 error = find_lkb(ls, ms->m_remid, &lkb);
2889 if (error) {
2890 log_error(ls, "receive_unlock_reply no lkb");
2891 return;
2892 }
2893 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
2894
2895 error = remove_from_waiters(lkb);
2896 if (error) {
2897 log_error(ls, "receive_unlock_reply not on waiters");
2898 goto out;
2899 }
2900
2901 _receive_unlock_reply(lkb, ms);
2902 out:
2903 dlm_put_lkb(lkb);
2904}
2905
2906static void _receive_cancel_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
2907{
2908 struct dlm_rsb *r = lkb->lkb_resource;
2909 int error = ms->m_result;
2910
2911 hold_rsb(r);
2912 lock_rsb(r);
2913
2914 /* this is the value returned from do_cancel() on the master */
2915
2916 switch (error) {
2917 case -DLM_ECANCEL:
2918 receive_flags_reply(lkb, ms);
2919 revert_lock_pc(r, lkb);
2920 queue_cast(r, lkb, -DLM_ECANCEL);
2921 break;
2922 default:
2923 log_error(r->res_ls, "receive_cancel_reply error %d", error);
2924 }
2925
2926 unlock_rsb(r);
2927 put_rsb(r);
2928}
2929
2930static void receive_cancel_reply(struct dlm_ls *ls, struct dlm_message *ms)
2931{
2932 struct dlm_lkb *lkb;
2933 int error;
2934
2935 error = find_lkb(ls, ms->m_remid, &lkb);
2936 if (error) {
2937 log_error(ls, "receive_cancel_reply no lkb");
2938 return;
2939 }
2940 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
2941
2942 error = remove_from_waiters(lkb);
2943 if (error) {
2944 log_error(ls, "receive_cancel_reply not on waiters");
2945 goto out;
2946 }
2947
2948 _receive_cancel_reply(lkb, ms);
2949 out:
2950 dlm_put_lkb(lkb);
2951}
2952
2953static void receive_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms)
2954{
2955 struct dlm_lkb *lkb;
2956 struct dlm_rsb *r;
2957 int error, ret_nodeid;
2958
2959 error = find_lkb(ls, ms->m_lkid, &lkb);
2960 if (error) {
2961 log_error(ls, "receive_lookup_reply no lkb");
2962 return;
2963 }
2964
2965 error = remove_from_waiters(lkb);
2966 if (error) {
2967 log_error(ls, "receive_lookup_reply not on waiters");
2968 goto out;
2969 }
2970
2971 /* this is the value returned by dlm_dir_lookup on dir node
2972 FIXME: will a non-zero error ever be returned? */
2973 error = ms->m_result;
2974
2975 r = lkb->lkb_resource;
2976 hold_rsb(r);
2977 lock_rsb(r);
2978
2979 ret_nodeid = ms->m_nodeid;
2980 if (ret_nodeid == dlm_our_nodeid()) {
2981 r->res_nodeid = 0;
2982 ret_nodeid = 0;
2983 r->res_first_lkid = 0;
2984 } else {
2985 /* set_master() will copy res_nodeid to lkb_nodeid */
2986 r->res_nodeid = ret_nodeid;
2987 }
2988
2989 _request_lock(r, lkb);
2990
2991 if (!ret_nodeid)
2992 process_lookup_list(r);
2993
2994 unlock_rsb(r);
2995 put_rsb(r);
2996 out:
2997 dlm_put_lkb(lkb);
2998}
2999
3000int dlm_receive_message(struct dlm_header *hd, int nodeid, int recovery)
3001{
3002 struct dlm_message *ms = (struct dlm_message *) hd;
3003 struct dlm_ls *ls;
3004 int error;
3005
3006 if (!recovery)
3007 dlm_message_in(ms);
3008
3009 ls = dlm_find_lockspace_global(hd->h_lockspace);
3010 if (!ls) {
3011 log_print("drop message %d from %d for unknown lockspace %d",
3012 ms->m_type, nodeid, hd->h_lockspace);
3013 return -EINVAL;
3014 }
3015
3016 /* recovery may have just ended leaving a bunch of backed-up requests
3017 in the requestqueue; wait while dlm_recoverd clears them */
3018
3019 if (!recovery)
3020 dlm_wait_requestqueue(ls);
3021
3022 /* recovery may have just started while there were a bunch of
3023 in-flight requests -- save them in requestqueue to be processed
3024 after recovery. we can't let dlm_recvd block on the recovery
3025 lock. if dlm_recoverd is calling this function to clear the
3026 requestqueue, it needs to be interrupted (-EINTR) if another
3027 recovery operation is starting. */
3028
3029 while (1) {
3030 if (dlm_locking_stopped(ls)) {
3031 if (!recovery)
3032 dlm_add_requestqueue(ls, nodeid, hd);
3033 error = -EINTR;
3034 goto out;
3035 }
3036
3037 if (lock_recovery_try(ls))
3038 break;
3039 schedule();
3040 }
3041
3042 switch (ms->m_type) {
3043
3044 /* messages sent to a master node */
3045
3046 case DLM_MSG_REQUEST:
3047 receive_request(ls, ms);
3048 break;
3049
3050 case DLM_MSG_CONVERT:
3051 receive_convert(ls, ms);
3052 break;
3053
3054 case DLM_MSG_UNLOCK:
3055 receive_unlock(ls, ms);
3056 break;
3057
3058 case DLM_MSG_CANCEL:
3059 receive_cancel(ls, ms);
3060 break;
3061
3062 /* messages sent from a master node (replies to above) */
3063
3064 case DLM_MSG_REQUEST_REPLY:
3065 receive_request_reply(ls, ms);
3066 break;
3067
3068 case DLM_MSG_CONVERT_REPLY:
3069 receive_convert_reply(ls, ms);
3070 break;
3071
3072 case DLM_MSG_UNLOCK_REPLY:
3073 receive_unlock_reply(ls, ms);
3074 break;
3075
3076 case DLM_MSG_CANCEL_REPLY:
3077 receive_cancel_reply(ls, ms);
3078 break;
3079
3080 /* messages sent from a master node (only two types of async msg) */
3081
3082 case DLM_MSG_GRANT:
3083 receive_grant(ls, ms);
3084 break;
3085
3086 case DLM_MSG_BAST:
3087 receive_bast(ls, ms);
3088 break;
3089
3090 /* messages sent to a dir node */
3091
3092 case DLM_MSG_LOOKUP:
3093 receive_lookup(ls, ms);
3094 break;
3095
3096 case DLM_MSG_REMOVE:
3097 receive_remove(ls, ms);
3098 break;
3099
3100 /* messages sent from a dir node (remove has no reply) */
3101
3102 case DLM_MSG_LOOKUP_REPLY:
3103 receive_lookup_reply(ls, ms);
3104 break;
3105
3106 default:
3107 log_error(ls, "unknown message type %d", ms->m_type);
3108 }
3109
3110 unlock_recovery(ls);
3111 out:
3112 dlm_put_lockspace(ls);
3113 dlm_astd_wake();
3114 return 0;
3115}
3116
3117
3118/*
3119 * Recovery related
3120 */
3121
3122static void recover_convert_waiter(struct dlm_ls *ls, struct dlm_lkb *lkb)
3123{
3124 if (middle_conversion(lkb)) {
3125 hold_lkb(lkb);
3126 ls->ls_stub_ms.m_result = -EINPROGRESS;
3127 _remove_from_waiters(lkb);
3128 _receive_convert_reply(lkb, &ls->ls_stub_ms);
3129
3130 /* Same special case as in receive_rcom_lock_args() */
3131 lkb->lkb_grmode = DLM_LOCK_IV;
3132 rsb_set_flag(lkb->lkb_resource, RSB_RECOVER_CONVERT);
3133 unhold_lkb(lkb);
3134
3135 } else if (lkb->lkb_rqmode >= lkb->lkb_grmode) {
3136 lkb->lkb_flags |= DLM_IFL_RESEND;
3137 }
3138
3139 /* lkb->lkb_rqmode < lkb->lkb_grmode shouldn't happen since down
3140 conversions are async; there's no reply from the remote master */
3141}
3142
3143/* A waiting lkb needs recovery if the master node has failed, or
3144 the master node is changing (only when no directory is used) */
3145
3146static int waiter_needs_recovery(struct dlm_ls *ls, struct dlm_lkb *lkb)
3147{
3148 if (dlm_is_removed(ls, lkb->lkb_nodeid))
3149 return 1;
3150
3151 if (!dlm_no_directory(ls))
3152 return 0;
3153
3154 if (dlm_dir_nodeid(lkb->lkb_resource) != lkb->lkb_nodeid)
3155 return 1;
3156
3157 return 0;
3158}
3159
3160/* Recovery for locks that are waiting for replies from nodes that are now
3161 gone. We can just complete unlocks and cancels by faking a reply from the
3162 dead node. Requests and up-conversions we flag to be resent after
3163 recovery. Down-conversions can just be completed with a fake reply like
3164 unlocks. Conversions between PR and CW need special attention. */
3165
3166void dlm_recover_waiters_pre(struct dlm_ls *ls)
3167{
3168 struct dlm_lkb *lkb, *safe;
3169
3170 mutex_lock(&ls->ls_waiters_mutex);
3171
3172 list_for_each_entry_safe(lkb, safe, &ls->ls_waiters, lkb_wait_reply) {
3173 log_debug(ls, "pre recover waiter lkid %x type %d flags %x",
3174 lkb->lkb_id, lkb->lkb_wait_type, lkb->lkb_flags);
3175
3176 /* all outstanding lookups, regardless of destination will be
3177 resent after recovery is done */
3178
3179 if (lkb->lkb_wait_type == DLM_MSG_LOOKUP) {
3180 lkb->lkb_flags |= DLM_IFL_RESEND;
3181 continue;
3182 }
3183
3184 if (!waiter_needs_recovery(ls, lkb))
3185 continue;
3186
3187 switch (lkb->lkb_wait_type) {
3188
3189 case DLM_MSG_REQUEST:
3190 lkb->lkb_flags |= DLM_IFL_RESEND;
3191 break;
3192
3193 case DLM_MSG_CONVERT:
3194 recover_convert_waiter(ls, lkb);
3195 break;
3196
3197 case DLM_MSG_UNLOCK:
3198 hold_lkb(lkb);
3199 ls->ls_stub_ms.m_result = -DLM_EUNLOCK;
3200 _remove_from_waiters(lkb);
3201 _receive_unlock_reply(lkb, &ls->ls_stub_ms);
3202 dlm_put_lkb(lkb);
3203 break;
3204
3205 case DLM_MSG_CANCEL:
3206 hold_lkb(lkb);
3207 ls->ls_stub_ms.m_result = -DLM_ECANCEL;
3208 _remove_from_waiters(lkb);
3209 _receive_cancel_reply(lkb, &ls->ls_stub_ms);
3210 dlm_put_lkb(lkb);
3211 break;
3212
3213 default:
3214 log_error(ls, "invalid lkb wait_type %d",
3215 lkb->lkb_wait_type);
3216 }
3217 schedule();
3218 }
3219 mutex_unlock(&ls->ls_waiters_mutex);
3220}
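
/* Illustration (editor's note): for an UNLOCK or CANCEL whose master has
   died, the loop above fakes the reply the dead node would have sent: it
   fills ls_stub_ms.m_result with -DLM_EUNLOCK or -DLM_ECANCEL and feeds
   the stub message to _receive_unlock_reply() or _receive_cancel_reply(),
   exactly as if it had arrived off the wire.  Requests and up-conversions
   cannot be completed this way, so they are flagged DLM_IFL_RESEND and
   reissued by dlm_recover_waiters_post(). */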
3221
3222static int remove_resend_waiter(struct dlm_ls *ls, struct dlm_lkb **lkb_ret)
3223{
3224 struct dlm_lkb *lkb;
3225 int rv = 0;
3226
3227 mutex_lock(&ls->ls_waiters_mutex);
3228 list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) {
3229 if (lkb->lkb_flags & DLM_IFL_RESEND) {
3230 rv = lkb->lkb_wait_type;
3231 _remove_from_waiters(lkb);
3232 lkb->lkb_flags &= ~DLM_IFL_RESEND;
3233 break;
3234 }
3235 }
3236 mutex_unlock(&ls->ls_waiters_mutex);
3237
3238 if (!rv)
3239 lkb = NULL;
3240 *lkb_ret = lkb;
3241 return rv;
3242}
3243
3244/* Deal with lookups and lkb's marked RESEND from _pre. We may now be the
3245 master or dir-node for r. Processing the lkb may result in it being placed
3246 back on waiters. */
3247
3248int dlm_recover_waiters_post(struct dlm_ls *ls)
3249{
3250 struct dlm_lkb *lkb;
3251 struct dlm_rsb *r;
3252 int error = 0, mstype;
3253
3254 while (1) {
3255 if (dlm_locking_stopped(ls)) {
3256 log_debug(ls, "recover_waiters_post aborted");
3257 error = -EINTR;
3258 break;
3259 }
3260
3261 mstype = remove_resend_waiter(ls, &lkb);
3262 if (!mstype)
3263 break;
3264
3265 r = lkb->lkb_resource;
3266
3267 log_debug(ls, "recover_waiters_post %x type %d flags %x %s",
3268 lkb->lkb_id, mstype, lkb->lkb_flags, r->res_name);
3269
3270 switch (mstype) {
3271
3272 case DLM_MSG_LOOKUP:
3273 hold_rsb(r);
3274 lock_rsb(r);
3275 _request_lock(r, lkb);
3276 if (is_master(r))
3277 confirm_master(r, 0);
3278 unlock_rsb(r);
3279 put_rsb(r);
3280 break;
3281
3282 case DLM_MSG_REQUEST:
3283 hold_rsb(r);
3284 lock_rsb(r);
3285 _request_lock(r, lkb);
3286 if (is_master(r))
3287 confirm_master(r, 0);
3288 unlock_rsb(r);
3289 put_rsb(r);
3290 break;
3291
3292 case DLM_MSG_CONVERT:
3293 hold_rsb(r);
3294 lock_rsb(r);
3295 _convert_lock(r, lkb);
3296 unlock_rsb(r);
3297 put_rsb(r);
3298 break;
3299
3300 default:
3301 log_error(ls, "recover_waiters_post type %d", mstype);
3302 }
3303 }
3304
3305 return error;
3306}
3307
3308static void purge_queue(struct dlm_rsb *r, struct list_head *queue,
3309 int (*test)(struct dlm_ls *ls, struct dlm_lkb *lkb))
3310{
3311 struct dlm_ls *ls = r->res_ls;
3312 struct dlm_lkb *lkb, *safe;
3313
3314 list_for_each_entry_safe(lkb, safe, queue, lkb_statequeue) {
3315 if (test(ls, lkb)) {
3316 rsb_set_flag(r, RSB_LOCKS_PURGED);
3317 del_lkb(r, lkb);
3318 /* this put should free the lkb */
3319 if (!dlm_put_lkb(lkb))
3320 log_error(ls, "purged lkb not released");
3321 }
3322 }
3323}
3324
3325static int purge_dead_test(struct dlm_ls *ls, struct dlm_lkb *lkb)
3326{
3327 return (is_master_copy(lkb) && dlm_is_removed(ls, lkb->lkb_nodeid));
3328}
3329
3330static int purge_mstcpy_test(struct dlm_ls *ls, struct dlm_lkb *lkb)
3331{
3332 return is_master_copy(lkb);
3333}
3334
3335static void purge_dead_locks(struct dlm_rsb *r)
3336{
3337 purge_queue(r, &r->res_grantqueue, &purge_dead_test);
3338 purge_queue(r, &r->res_convertqueue, &purge_dead_test);
3339 purge_queue(r, &r->res_waitqueue, &purge_dead_test);
3340}
3341
3342void dlm_purge_mstcpy_locks(struct dlm_rsb *r)
3343{
3344 purge_queue(r, &r->res_grantqueue, &purge_mstcpy_test);
3345 purge_queue(r, &r->res_convertqueue, &purge_mstcpy_test);
3346 purge_queue(r, &r->res_waitqueue, &purge_mstcpy_test);
3347}
3348
3349/* Get rid of locks held by nodes that are gone. */
3350
3351int dlm_purge_locks(struct dlm_ls *ls)
3352{
3353 struct dlm_rsb *r;
3354
3355 log_debug(ls, "dlm_purge_locks");
3356
3357 down_write(&ls->ls_root_sem);
3358 list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
3359 hold_rsb(r);
3360 lock_rsb(r);
3361 if (is_master(r))
3362 purge_dead_locks(r);
3363 unlock_rsb(r);
3364 unhold_rsb(r);
3365
3366 schedule();
3367 }
3368 up_write(&ls->ls_root_sem);
3369
3370 return 0;
3371}
3372
3373static struct dlm_rsb *find_purged_rsb(struct dlm_ls *ls, int bucket)
3374{
3375 struct dlm_rsb *r, *r_ret = NULL;
3376
3377 read_lock(&ls->ls_rsbtbl[bucket].lock);
3378 list_for_each_entry(r, &ls->ls_rsbtbl[bucket].list, res_hashchain) {
3379 if (!rsb_flag(r, RSB_LOCKS_PURGED))
3380 continue;
3381 hold_rsb(r);
3382 rsb_clear_flag(r, RSB_LOCKS_PURGED);
3383 r_ret = r;
3384 break;
3385 }
3386 read_unlock(&ls->ls_rsbtbl[bucket].lock);
3387 return r_ret;
3388}
3389
3390void dlm_grant_after_purge(struct dlm_ls *ls)
3391{
3392 struct dlm_rsb *r;
3393 int bucket = 0;
3394
3395 while (1) {
3396 r = find_purged_rsb(ls, bucket);
3397 if (!r) {
3398 if (bucket == ls->ls_rsbtbl_size - 1)
3399 break;
3400 bucket++;
3401 continue;
3402 }
3403 lock_rsb(r);
3404 if (is_master(r)) {
3405 grant_pending_locks(r);
3406 confirm_master(r, 0);
3407 }
3408 unlock_rsb(r);
3409 put_rsb(r);
3410 schedule();
3411 }
3412}
3413
3414static struct dlm_lkb *search_remid_list(struct list_head *head, int nodeid,
3415 uint32_t remid)
3416{
3417 struct dlm_lkb *lkb;
3418
3419 list_for_each_entry(lkb, head, lkb_statequeue) {
3420 if (lkb->lkb_nodeid == nodeid && lkb->lkb_remid == remid)
3421 return lkb;
3422 }
3423 return NULL;
3424}
3425
3426static struct dlm_lkb *search_remid(struct dlm_rsb *r, int nodeid,
3427 uint32_t remid)
3428{
3429 struct dlm_lkb *lkb;
3430
3431 lkb = search_remid_list(&r->res_grantqueue, nodeid, remid);
3432 if (lkb)
3433 return lkb;
3434 lkb = search_remid_list(&r->res_convertqueue, nodeid, remid);
3435 if (lkb)
3436 return lkb;
3437 lkb = search_remid_list(&r->res_waitqueue, nodeid, remid);
3438 if (lkb)
3439 return lkb;
3440 return NULL;
3441}
3442
3443static int receive_rcom_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
3444 struct dlm_rsb *r, struct dlm_rcom *rc)
3445{
3446 struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf;
3447 int lvblen;
3448
3449 lkb->lkb_nodeid = rc->rc_header.h_nodeid;
3450 lkb->lkb_ownpid = rl->rl_ownpid;
3451 lkb->lkb_remid = rl->rl_lkid;
3452 lkb->lkb_exflags = rl->rl_exflags;
3453 lkb->lkb_flags = rl->rl_flags & 0x0000FFFF;
3454 lkb->lkb_flags |= DLM_IFL_MSTCPY;
3455 lkb->lkb_lvbseq = rl->rl_lvbseq;
3456 lkb->lkb_rqmode = rl->rl_rqmode;
3457 lkb->lkb_grmode = rl->rl_grmode;
3458	/* don't set lkb_status because add_lkb wants to set it itself */
3459
3460 lkb->lkb_bastaddr = (void *) (long) (rl->rl_asts & AST_BAST);
3461 lkb->lkb_astaddr = (void *) (long) (rl->rl_asts & AST_COMP);
3462
3463 if (lkb->lkb_exflags & DLM_LKF_VALBLK) {
3464 lkb->lkb_lvbptr = allocate_lvb(ls);
3465 if (!lkb->lkb_lvbptr)
3466 return -ENOMEM;
3467 lvblen = rc->rc_header.h_length - sizeof(struct dlm_rcom) -
3468 sizeof(struct rcom_lock);
3469 memcpy(lkb->lkb_lvbptr, rl->rl_lvb, lvblen);
3470 }
3471
3472 /* Conversions between PR and CW (middle modes) need special handling.
3473 The real granted mode of these converting locks cannot be determined
3474 until all locks have been rebuilt on the rsb (recover_conversion) */
3475
3476 if (rl->rl_wait_type == DLM_MSG_CONVERT && middle_conversion(lkb)) {
3477 rl->rl_status = DLM_LKSTS_CONVERT;
3478 lkb->lkb_grmode = DLM_LOCK_IV;
3479 rsb_set_flag(r, RSB_RECOVER_CONVERT);
3480 }
3481
3482 return 0;
3483}
3484
3485/* This lkb may have been recovered in a previous aborted recovery so we need
3486 to check if the rsb already has an lkb with the given remote nodeid/lkid.
3487 If so we just send back a standard reply. If not, we create a new lkb with
3488 the given values and send back our lkid. We send back our lkid by sending
3489 back the rcom_lock struct we got but with the remid field filled in. */
3490
3491int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc)
3492{
3493 struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf;
3494 struct dlm_rsb *r;
3495 struct dlm_lkb *lkb;
3496 int error;
3497
3498 if (rl->rl_parent_lkid) {
3499 error = -EOPNOTSUPP;
3500 goto out;
3501 }
3502
3503 error = find_rsb(ls, rl->rl_name, rl->rl_namelen, R_MASTER, &r);
3504 if (error)
3505 goto out;
3506
3507 lock_rsb(r);
3508
3509 lkb = search_remid(r, rc->rc_header.h_nodeid, rl->rl_lkid);
3510 if (lkb) {
3511 error = -EEXIST;
3512 goto out_remid;
3513 }
3514
3515 error = create_lkb(ls, &lkb);
3516 if (error)
3517 goto out_unlock;
3518
3519 error = receive_rcom_lock_args(ls, lkb, r, rc);
3520 if (error) {
3521 __put_lkb(ls, lkb);
3522 goto out_unlock;
3523 }
3524
3525 attach_lkb(r, lkb);
3526 add_lkb(r, lkb, rl->rl_status);
3527 error = 0;
3528
3529 out_remid:
3530 /* this is the new value returned to the lock holder for
3531 saving in its process-copy lkb */
3532 rl->rl_remid = lkb->lkb_id;
3533
3534 out_unlock:
3535 unlock_rsb(r);
3536 put_rsb(r);
3537 out:
3538 if (error)
3539 log_print("recover_master_copy %d %x", error, rl->rl_lkid);
3540 rl->rl_result = error;
3541 return error;
3542}
3543
3544int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc)
3545{
3546 struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf;
3547 struct dlm_rsb *r;
3548 struct dlm_lkb *lkb;
3549 int error;
3550
3551 error = find_lkb(ls, rl->rl_lkid, &lkb);
3552 if (error) {
3553 log_error(ls, "recover_process_copy no lkid %x", rl->rl_lkid);
3554 return error;
3555 }
3556
3557 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
3558
3559 error = rl->rl_result;
3560
3561 r = lkb->lkb_resource;
3562 hold_rsb(r);
3563 lock_rsb(r);
3564
3565 switch (error) {
3566 case -EEXIST:
3567 log_debug(ls, "master copy exists %x", lkb->lkb_id);
3568 /* fall through */
3569 case 0:
3570 lkb->lkb_remid = rl->rl_remid;
3571 break;
3572 default:
3573 log_error(ls, "dlm_recover_process_copy unknown error %d %x",
3574 error, lkb->lkb_id);
3575 }
3576
3577 /* an ack for dlm_recover_locks() which waits for replies from
3578 all the locks it sends to new masters */
3579 dlm_recovered_lock(r);
3580
3581 unlock_rsb(r);
3582 put_rsb(r);
3583 dlm_put_lkb(lkb);
3584
3585 return 0;
3586}
3587
3588int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua,
3589 int mode, uint32_t flags, void *name, unsigned int namelen,
3590 uint32_t parent_lkid)
3591{
3592 struct dlm_lkb *lkb;
3593 struct dlm_args args;
3594 int error;
3595
3596 lock_recovery(ls);
3597
3598 error = create_lkb(ls, &lkb);
3599 if (error) {
3600 kfree(ua);
3601 goto out;
3602 }
3603
3604 if (flags & DLM_LKF_VALBLK) {
3605 ua->lksb.sb_lvbptr = kmalloc(DLM_USER_LVB_LEN, GFP_KERNEL);
3606 if (!ua->lksb.sb_lvbptr) {
3607 kfree(ua);
3608 __put_lkb(ls, lkb);
3609 error = -ENOMEM;
3610 goto out;
3611 }
3612 }
3613
3614 /* After ua is attached to lkb it will be freed by free_lkb().
3615 When DLM_IFL_USER is set, the dlm knows that this is a userspace
3616 lock and that lkb_astparam is the dlm_user_args structure. */
3617
3618 error = set_lock_args(mode, &ua->lksb, flags, namelen, parent_lkid,
3619 DLM_FAKE_USER_AST, ua, DLM_FAKE_USER_AST, &args);
3620 lkb->lkb_flags |= DLM_IFL_USER;
3621 ua->old_mode = DLM_LOCK_IV;
3622
3623 if (error) {
3624 __put_lkb(ls, lkb);
3625 goto out;
3626 }
3627
3628 error = request_lock(ls, lkb, name, namelen, &args);
3629
3630 switch (error) {
3631 case 0:
3632 break;
3633 case -EINPROGRESS:
3634 error = 0;
3635 break;
3636 case -EAGAIN:
3637 error = 0;
3638 /* fall through */
3639 default:
3640 __put_lkb(ls, lkb);
3641 goto out;
3642 }
3643
3644 /* add this new lkb to the per-process list of locks */
3645 spin_lock(&ua->proc->locks_spin);
3646 kref_get(&lkb->lkb_ref);
3647 list_add_tail(&lkb->lkb_ownqueue, &ua->proc->locks);
3648 spin_unlock(&ua->proc->locks_spin);
3649 out:
3650 unlock_recovery(ls);
3651 return error;
3652}
3653
3654int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
3655 int mode, uint32_t flags, uint32_t lkid, char *lvb_in)
3656{
3657 struct dlm_lkb *lkb;
3658 struct dlm_args args;
3659 struct dlm_user_args *ua;
3660 int error;
3661
3662 lock_recovery(ls);
3663
3664 error = find_lkb(ls, lkid, &lkb);
3665 if (error)
3666 goto out;
3667
3668 /* user can change the params on its lock when it converts it, or
3669 add an lvb that didn't exist before */
3670
3671 ua = (struct dlm_user_args *)lkb->lkb_astparam;
3672
3673 if (flags & DLM_LKF_VALBLK && !ua->lksb.sb_lvbptr) {
3674 ua->lksb.sb_lvbptr = kmalloc(DLM_USER_LVB_LEN, GFP_KERNEL);
3675 if (!ua->lksb.sb_lvbptr) {
3676 error = -ENOMEM;
3677 goto out_put;
3678 }
3679 }
3680 if (lvb_in && ua->lksb.sb_lvbptr)
3681 memcpy(ua->lksb.sb_lvbptr, lvb_in, DLM_USER_LVB_LEN);
3682
3683 ua->castparam = ua_tmp->castparam;
3684 ua->castaddr = ua_tmp->castaddr;
3685 ua->bastparam = ua_tmp->bastparam;
3686 ua->bastaddr = ua_tmp->bastaddr;
3687 ua->user_lksb = ua_tmp->user_lksb;
3688 ua->old_mode = lkb->lkb_grmode;
3689
3690 error = set_lock_args(mode, &ua->lksb, flags, 0, 0, DLM_FAKE_USER_AST,
3691 ua, DLM_FAKE_USER_AST, &args);
3692 if (error)
3693 goto out_put;
3694
3695 error = convert_lock(ls, lkb, &args);
3696
3697 if (error == -EINPROGRESS || error == -EAGAIN)
3698 error = 0;
3699 out_put:
3700 dlm_put_lkb(lkb);
3701 out:
3702 unlock_recovery(ls);
3703 kfree(ua_tmp);
3704 return error;
3705}
3706
3707int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
3708 uint32_t flags, uint32_t lkid, char *lvb_in)
3709{
3710 struct dlm_lkb *lkb;
3711 struct dlm_args args;
3712 struct dlm_user_args *ua;
3713 int error;
3714
3715 lock_recovery(ls);
3716
3717 error = find_lkb(ls, lkid, &lkb);
3718 if (error)
3719 goto out;
3720
3721 ua = (struct dlm_user_args *)lkb->lkb_astparam;
3722
3723 if (lvb_in && ua->lksb.sb_lvbptr)
3724 memcpy(ua->lksb.sb_lvbptr, lvb_in, DLM_USER_LVB_LEN);
3725 ua->castparam = ua_tmp->castparam;
3726 ua->user_lksb = ua_tmp->user_lksb;
3727
3728 error = set_unlock_args(flags, ua, &args);
3729 if (error)
3730 goto out_put;
3731
3732 error = unlock_lock(ls, lkb, &args);
3733
3734 if (error == -DLM_EUNLOCK)
3735 error = 0;
3736 if (error)
3737 goto out_put;
3738
3739 spin_lock(&ua->proc->locks_spin);
3740 list_del_init(&lkb->lkb_ownqueue);
3741 spin_unlock(&ua->proc->locks_spin);
3742
3743 /* this removes the reference for the proc->locks list added by
3744 dlm_user_request */
3745 unhold_lkb(lkb);
3746 out_put:
3747 dlm_put_lkb(lkb);
3748 out:
3749 unlock_recovery(ls);
3750 return error;
3751}
3752
3753int dlm_user_cancel(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
3754 uint32_t flags, uint32_t lkid)
3755{
3756 struct dlm_lkb *lkb;
3757 struct dlm_args args;
3758 struct dlm_user_args *ua;
3759 int error;
3760
3761 lock_recovery(ls);
3762
3763 error = find_lkb(ls, lkid, &lkb);
3764 if (error)
3765 goto out;
3766
3767 ua = (struct dlm_user_args *)lkb->lkb_astparam;
3768 ua->castparam = ua_tmp->castparam;
3769 ua->user_lksb = ua_tmp->user_lksb;
3770
3771 error = set_unlock_args(flags, ua, &args);
3772 if (error)
3773 goto out_put;
3774
3775 error = cancel_lock(ls, lkb, &args);
3776
3777 if (error == -DLM_ECANCEL)
3778 error = 0;
3779 if (error)
3780 goto out_put;
3781
3782 /* this lkb was removed from the WAITING queue */
3783 if (lkb->lkb_grmode == DLM_LOCK_IV) {
3784 spin_lock(&ua->proc->locks_spin);
3785 list_del_init(&lkb->lkb_ownqueue);
3786 spin_unlock(&ua->proc->locks_spin);
3787 unhold_lkb(lkb);
3788 }
3789 out_put:
3790 dlm_put_lkb(lkb);
3791 out:
3792 unlock_recovery(ls);
3793 return error;
3794}
3795
3796static int orphan_proc_lock(struct dlm_ls *ls, struct dlm_lkb *lkb)
3797{
3798 struct dlm_user_args *ua = (struct dlm_user_args *)lkb->lkb_astparam;
3799
3800 if (ua->lksb.sb_lvbptr)
3801 kfree(ua->lksb.sb_lvbptr);
3802 kfree(ua);
3803 lkb->lkb_astparam = (long)NULL;
3804
3805	/* TODO: propagate to master if needed */
3806 return 0;
3807}
3808
3809/* The force flag allows the unlock to go ahead even if the lkb isn't granted.
3810 Regardless of what rsb queue the lock is on, it's removed and freed. */
3811
3812static int unlock_proc_lock(struct dlm_ls *ls, struct dlm_lkb *lkb)
3813{
3814 struct dlm_user_args *ua = (struct dlm_user_args *)lkb->lkb_astparam;
3815 struct dlm_args args;
3816 int error;
3817
3818 /* FIXME: we need to handle the case where the lkb is in limbo
3819 while the rsb is being looked up, currently we assert in
3820 _unlock_lock/is_remote because rsb nodeid is -1. */
3821
3822 set_unlock_args(DLM_LKF_FORCEUNLOCK, ua, &args);
3823
3824 error = unlock_lock(ls, lkb, &args);
3825 if (error == -DLM_EUNLOCK)
3826 error = 0;
3827 return error;
3828}
3829
3830/* The ls_clear_proc_locks mutex protects against dlm_user_add_asts() which
3831 1) references lkb->ua which we free here and 2) adds lkbs to proc->asts,
3832 which we clear here. */
3833
3834/* proc CLOSING flag is set so no more device_reads should look at proc->asts
3835   list, and no more device_writes should add lkbs to proc->locks list; so we
3836   shouldn't need to take asts_spin or locks_spin here. This assumes that
3837   device reads/writes/closes are serialized -- FIXME: we may need to serialize
3838   them ourselves. */
3839
3840void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc)
3841{
3842 struct dlm_lkb *lkb, *safe;
3843
3844 lock_recovery(ls);
3845 mutex_lock(&ls->ls_clear_proc_locks);
3846
3847 list_for_each_entry_safe(lkb, safe, &proc->locks, lkb_ownqueue) {
3848 if (lkb->lkb_ast_type) {
3849 list_del(&lkb->lkb_astqueue);
3850 unhold_lkb(lkb);
3851 }
3852
3853 list_del_init(&lkb->lkb_ownqueue);
3854
3855 if (lkb->lkb_exflags & DLM_LKF_PERSISTENT) {
3856 lkb->lkb_flags |= DLM_IFL_ORPHAN;
3857 orphan_proc_lock(ls, lkb);
3858 } else {
3859 lkb->lkb_flags |= DLM_IFL_DEAD;
3860 unlock_proc_lock(ls, lkb);
3861 }
3862
3863 /* this removes the reference for the proc->locks list
3864 added by dlm_user_request, it may result in the lkb
3865 being freed */
3866
3867 dlm_put_lkb(lkb);
3868 }
3869 mutex_unlock(&ls->ls_clear_proc_locks);
3870 unlock_recovery(ls);
3871}
diff --git a/fs/dlm/lock.h b/fs/dlm/lock.h
new file mode 100644
index 000000000000..0843a3073ec3
--- /dev/null
+++ b/fs/dlm/lock.h
@@ -0,0 +1,62 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#ifndef __LOCK_DOT_H__
14#define __LOCK_DOT_H__
15
16void dlm_print_rsb(struct dlm_rsb *r);
17void dlm_dump_rsb(struct dlm_rsb *r);
18void dlm_print_lkb(struct dlm_lkb *lkb);
19int dlm_receive_message(struct dlm_header *hd, int nodeid, int recovery);
20int dlm_modes_compat(int mode1, int mode2);
21int dlm_find_rsb(struct dlm_ls *ls, char *name, int namelen,
22 unsigned int flags, struct dlm_rsb **r_ret);
23void dlm_put_rsb(struct dlm_rsb *r);
24void dlm_hold_rsb(struct dlm_rsb *r);
25int dlm_put_lkb(struct dlm_lkb *lkb);
26void dlm_scan_rsbs(struct dlm_ls *ls);
27
28int dlm_purge_locks(struct dlm_ls *ls);
29void dlm_purge_mstcpy_locks(struct dlm_rsb *r);
30void dlm_grant_after_purge(struct dlm_ls *ls);
31int dlm_recover_waiters_post(struct dlm_ls *ls);
32void dlm_recover_waiters_pre(struct dlm_ls *ls);
33int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc);
34int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc);
35
36int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, int mode,
37 uint32_t flags, void *name, unsigned int namelen, uint32_t parent_lkid);
38int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
39 int mode, uint32_t flags, uint32_t lkid, char *lvb_in);
40int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
41 uint32_t flags, uint32_t lkid, char *lvb_in);
42int dlm_user_cancel(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
43 uint32_t flags, uint32_t lkid);
44void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc);
45
46static inline int is_master(struct dlm_rsb *r)
47{
48 return !r->res_nodeid;
49}
50
51static inline void lock_rsb(struct dlm_rsb *r)
52{
53 mutex_lock(&r->res_mutex);
54}
55
56static inline void unlock_rsb(struct dlm_rsb *r)
57{
58 mutex_unlock(&r->res_mutex);
59}
60
61#endif
62
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
new file mode 100644
index 000000000000..109333c8ecb9
--- /dev/null
+++ b/fs/dlm/lockspace.c
@@ -0,0 +1,717 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#include "dlm_internal.h"
15#include "lockspace.h"
16#include "member.h"
17#include "recoverd.h"
18#include "ast.h"
19#include "dir.h"
20#include "lowcomms.h"
21#include "config.h"
22#include "memory.h"
23#include "lock.h"
24#include "recover.h"
25
26#ifdef CONFIG_DLM_DEBUG
27int dlm_create_debug_file(struct dlm_ls *ls);
28void dlm_delete_debug_file(struct dlm_ls *ls);
29#else
30static inline int dlm_create_debug_file(struct dlm_ls *ls) { return 0; }
31static inline void dlm_delete_debug_file(struct dlm_ls *ls) { }
32#endif
33
34static int ls_count;
35static struct mutex ls_lock;
36static struct list_head lslist;
37static spinlock_t lslist_lock;
38static struct task_struct *scand_task;
39
40
41static ssize_t dlm_control_store(struct dlm_ls *ls, const char *buf, size_t len)
42{
43 ssize_t ret = len;
44 int n = simple_strtol(buf, NULL, 0);
45
46 switch (n) {
47 case 0:
48 dlm_ls_stop(ls);
49 break;
50 case 1:
51 dlm_ls_start(ls);
52 break;
53 default:
54 ret = -EINVAL;
55 }
56 return ret;
57}
58
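/* Illustrative note (a sketch of the userspace side, not code from this
   patch): cluster management drives recovery by writing "0" or "1" to
   /sys/kernel/dlm/<lockspace>/control, which reaches dlm_ls_stop() or
   dlm_ls_start() via the handler above; the path assumes the "dlm" kset
   registered below under kernel_subsys. */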
59static ssize_t dlm_event_store(struct dlm_ls *ls, const char *buf, size_t len)
60{
61 ls->ls_uevent_result = simple_strtol(buf, NULL, 0);
62 set_bit(LSFL_UEVENT_WAIT, &ls->ls_flags);
63 wake_up(&ls->ls_uevent_wait);
64 return len;
65}
66
67static ssize_t dlm_id_show(struct dlm_ls *ls, char *buf)
68{
69 return snprintf(buf, PAGE_SIZE, "%u\n", ls->ls_global_id);
70}
71
72static ssize_t dlm_id_store(struct dlm_ls *ls, const char *buf, size_t len)
73{
74 ls->ls_global_id = simple_strtoul(buf, NULL, 0);
75 return len;
76}
77
78static ssize_t dlm_recover_status_show(struct dlm_ls *ls, char *buf)
79{
80 uint32_t status = dlm_recover_status(ls);
81 return snprintf(buf, PAGE_SIZE, "%x\n", status);
82}
83
84static ssize_t dlm_recover_nodeid_show(struct dlm_ls *ls, char *buf)
85{
86 return snprintf(buf, PAGE_SIZE, "%d\n", ls->ls_recover_nodeid);
87}
88
89struct dlm_attr {
90 struct attribute attr;
91 ssize_t (*show)(struct dlm_ls *, char *);
92 ssize_t (*store)(struct dlm_ls *, const char *, size_t);
93};
94
95static struct dlm_attr dlm_attr_control = {
96 .attr = {.name = "control", .mode = S_IWUSR},
97 .store = dlm_control_store
98};
99
100static struct dlm_attr dlm_attr_event = {
101 .attr = {.name = "event_done", .mode = S_IWUSR},
102 .store = dlm_event_store
103};
104
105static struct dlm_attr dlm_attr_id = {
106 .attr = {.name = "id", .mode = S_IRUGO | S_IWUSR},
107 .show = dlm_id_show,
108 .store = dlm_id_store
109};
110
111static struct dlm_attr dlm_attr_recover_status = {
112 .attr = {.name = "recover_status", .mode = S_IRUGO},
113 .show = dlm_recover_status_show
114};
115
116static struct dlm_attr dlm_attr_recover_nodeid = {
117 .attr = {.name = "recover_nodeid", .mode = S_IRUGO},
118 .show = dlm_recover_nodeid_show
119};
120
121static struct attribute *dlm_attrs[] = {
122 &dlm_attr_control.attr,
123 &dlm_attr_event.attr,
124 &dlm_attr_id.attr,
125 &dlm_attr_recover_status.attr,
126 &dlm_attr_recover_nodeid.attr,
127 NULL,
128};
129
130static ssize_t dlm_attr_show(struct kobject *kobj, struct attribute *attr,
131 char *buf)
132{
133 struct dlm_ls *ls = container_of(kobj, struct dlm_ls, ls_kobj);
134 struct dlm_attr *a = container_of(attr, struct dlm_attr, attr);
135 return a->show ? a->show(ls, buf) : 0;
136}
137
138static ssize_t dlm_attr_store(struct kobject *kobj, struct attribute *attr,
139 const char *buf, size_t len)
140{
141 struct dlm_ls *ls = container_of(kobj, struct dlm_ls, ls_kobj);
142 struct dlm_attr *a = container_of(attr, struct dlm_attr, attr);
143 return a->store ? a->store(ls, buf, len) : len;
144}
145
146static struct sysfs_ops dlm_attr_ops = {
147 .show = dlm_attr_show,
148 .store = dlm_attr_store,
149};
150
151static struct kobj_type dlm_ktype = {
152 .default_attrs = dlm_attrs,
153 .sysfs_ops = &dlm_attr_ops,
154};
155
156static struct kset dlm_kset = {
157 .subsys = &kernel_subsys,
158 .kobj = {.name = "dlm",},
159 .ktype = &dlm_ktype,
160};
161
162static int kobject_setup(struct dlm_ls *ls)
163{
164 char lsname[DLM_LOCKSPACE_LEN];
165 int error;
166
167 memset(lsname, 0, DLM_LOCKSPACE_LEN);
168 snprintf(lsname, DLM_LOCKSPACE_LEN, "%s", ls->ls_name);
169
170 error = kobject_set_name(&ls->ls_kobj, "%s", lsname);
171 if (error)
172 return error;
173
174 ls->ls_kobj.kset = &dlm_kset;
175 ls->ls_kobj.ktype = &dlm_ktype;
176 return 0;
177}
178
179static int do_uevent(struct dlm_ls *ls, int in)
180{
181 int error;
182
183 if (in)
184 kobject_uevent(&ls->ls_kobj, KOBJ_ONLINE);
185 else
186 kobject_uevent(&ls->ls_kobj, KOBJ_OFFLINE);
187
188 error = wait_event_interruptible(ls->ls_uevent_wait,
189 test_and_clear_bit(LSFL_UEVENT_WAIT, &ls->ls_flags));
190 if (error)
191 goto out;
192
193 error = ls->ls_uevent_result;
194 out:
195 return error;
196}
197
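/* do_uevent() pairs with dlm_event_store() above: the ONLINE/OFFLINE uevent
   asks userspace to join or leave the lockspace, and userspace reports back
   by writing the result to the "event_done" attribute, which sets
   LSFL_UEVENT_WAIT and wakes the wait_event_interruptible() above. */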
198
199int dlm_lockspace_init(void)
200{
201 int error;
202
203 ls_count = 0;
204 mutex_init(&ls_lock);
205 INIT_LIST_HEAD(&lslist);
206 spin_lock_init(&lslist_lock);
207
208 error = kset_register(&dlm_kset);
209 if (error)
210 printk("dlm_lockspace_init: cannot register kset %d\n", error);
211 return error;
212}
213
214void dlm_lockspace_exit(void)
215{
216 kset_unregister(&dlm_kset);
217}
218
219static int dlm_scand(void *data)
220{
221 struct dlm_ls *ls;
222
223 while (!kthread_should_stop()) {
224 list_for_each_entry(ls, &lslist, ls_list)
225 dlm_scan_rsbs(ls);
226 schedule_timeout_interruptible(dlm_config.scan_secs * HZ);
227 }
228 return 0;
229}
230
231static int dlm_scand_start(void)
232{
233 struct task_struct *p;
234 int error = 0;
235
236 p = kthread_run(dlm_scand, NULL, "dlm_scand");
237 if (IS_ERR(p))
238 error = PTR_ERR(p);
239 else
240 scand_task = p;
241 return error;
242}
243
244static void dlm_scand_stop(void)
245{
246 kthread_stop(scand_task);
247}
248
249static struct dlm_ls *dlm_find_lockspace_name(char *name, int namelen)
250{
251 struct dlm_ls *ls;
252
253 spin_lock(&lslist_lock);
254
255 list_for_each_entry(ls, &lslist, ls_list) {
256 if (ls->ls_namelen == namelen &&
257 memcmp(ls->ls_name, name, namelen) == 0)
258 goto out;
259 }
260 ls = NULL;
261 out:
262 spin_unlock(&lslist_lock);
263 return ls;
264}
265
266struct dlm_ls *dlm_find_lockspace_global(uint32_t id)
267{
268 struct dlm_ls *ls;
269
270 spin_lock(&lslist_lock);
271
272 list_for_each_entry(ls, &lslist, ls_list) {
273 if (ls->ls_global_id == id) {
274 ls->ls_count++;
275 goto out;
276 }
277 }
278 ls = NULL;
279 out:
280 spin_unlock(&lslist_lock);
281 return ls;
282}
283
284struct dlm_ls *dlm_find_lockspace_local(dlm_lockspace_t *lockspace)
285{
286 struct dlm_ls *ls;
287
288 spin_lock(&lslist_lock);
289 list_for_each_entry(ls, &lslist, ls_list) {
290 if (ls->ls_local_handle == lockspace) {
291 ls->ls_count++;
292 goto out;
293 }
294 }
295 ls = NULL;
296 out:
297 spin_unlock(&lslist_lock);
298 return ls;
299}
300
301struct dlm_ls *dlm_find_lockspace_device(int minor)
302{
303 struct dlm_ls *ls;
304
305 spin_lock(&lslist_lock);
306 list_for_each_entry(ls, &lslist, ls_list) {
307 if (ls->ls_device.minor == minor) {
308 ls->ls_count++;
309 goto out;
310 }
311 }
312 ls = NULL;
313 out:
314 spin_unlock(&lslist_lock);
315 return ls;
316}
317
318void dlm_put_lockspace(struct dlm_ls *ls)
319{
320 spin_lock(&lslist_lock);
321 ls->ls_count--;
322 spin_unlock(&lslist_lock);
323}
324
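/* Each dlm_find_lockspace_*() above takes a reference by bumping ls_count;
   dlm_put_lockspace() drops it. remove_lockspace() below polls until all
   references are gone before unlinking the lockspace from lslist. */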
325static void remove_lockspace(struct dlm_ls *ls)
326{
327 for (;;) {
328 spin_lock(&lslist_lock);
329 if (ls->ls_count == 0) {
330 list_del(&ls->ls_list);
331 spin_unlock(&lslist_lock);
332 return;
333 }
334 spin_unlock(&lslist_lock);
335 ssleep(1);
336 }
337}
338
339static int threads_start(void)
340{
341 int error;
342
343	/* Thread which processes lock requests for all lockspaces */
344 error = dlm_astd_start();
345 if (error) {
346 log_print("cannot start dlm_astd thread %d", error);
347 goto fail;
348 }
349
350 error = dlm_scand_start();
351 if (error) {
352 log_print("cannot start dlm_scand thread %d", error);
353 goto astd_fail;
354 }
355
356	/* Thread for sending/receiving messages for all lockspaces */
357 error = dlm_lowcomms_start();
358 if (error) {
359 log_print("cannot start dlm lowcomms %d", error);
360 goto scand_fail;
361 }
362
363 return 0;
364
365 scand_fail:
366 dlm_scand_stop();
367 astd_fail:
368 dlm_astd_stop();
369 fail:
370 return error;
371}
372
373static void threads_stop(void)
374{
375 dlm_scand_stop();
376 dlm_lowcomms_stop();
377 dlm_astd_stop();
378}
379
380static int new_lockspace(char *name, int namelen, void **lockspace,
381 uint32_t flags, int lvblen)
382{
383 struct dlm_ls *ls;
384 int i, size, error = -ENOMEM;
385
386 if (namelen > DLM_LOCKSPACE_LEN)
387 return -EINVAL;
388
389 if (!lvblen || (lvblen % 8))
390 return -EINVAL;
391
392 if (!try_module_get(THIS_MODULE))
393 return -EINVAL;
394
395 ls = dlm_find_lockspace_name(name, namelen);
396 if (ls) {
397 *lockspace = ls;
398 module_put(THIS_MODULE);
399 return -EEXIST;
400 }
401
402 ls = kzalloc(sizeof(struct dlm_ls) + namelen, GFP_KERNEL);
403 if (!ls)
404 goto out;
405 memcpy(ls->ls_name, name, namelen);
406 ls->ls_namelen = namelen;
407 ls->ls_exflags = flags;
408 ls->ls_lvblen = lvblen;
409 ls->ls_count = 0;
410 ls->ls_flags = 0;
411
412 size = dlm_config.rsbtbl_size;
413 ls->ls_rsbtbl_size = size;
414
415 ls->ls_rsbtbl = kmalloc(sizeof(struct dlm_rsbtable) * size, GFP_KERNEL);
416 if (!ls->ls_rsbtbl)
417 goto out_lsfree;
418 for (i = 0; i < size; i++) {
419 INIT_LIST_HEAD(&ls->ls_rsbtbl[i].list);
420 INIT_LIST_HEAD(&ls->ls_rsbtbl[i].toss);
421 rwlock_init(&ls->ls_rsbtbl[i].lock);
422 }
423
424 size = dlm_config.lkbtbl_size;
425 ls->ls_lkbtbl_size = size;
426
427 ls->ls_lkbtbl = kmalloc(sizeof(struct dlm_lkbtable) * size, GFP_KERNEL);
428 if (!ls->ls_lkbtbl)
429 goto out_rsbfree;
430 for (i = 0; i < size; i++) {
431 INIT_LIST_HEAD(&ls->ls_lkbtbl[i].list);
432 rwlock_init(&ls->ls_lkbtbl[i].lock);
433 ls->ls_lkbtbl[i].counter = 1;
434 }
435
436 size = dlm_config.dirtbl_size;
437 ls->ls_dirtbl_size = size;
438
439 ls->ls_dirtbl = kmalloc(sizeof(struct dlm_dirtable) * size, GFP_KERNEL);
440 if (!ls->ls_dirtbl)
441 goto out_lkbfree;
442 for (i = 0; i < size; i++) {
443 INIT_LIST_HEAD(&ls->ls_dirtbl[i].list);
444 rwlock_init(&ls->ls_dirtbl[i].lock);
445 }
446
447 INIT_LIST_HEAD(&ls->ls_waiters);
448 mutex_init(&ls->ls_waiters_mutex);
449
450 INIT_LIST_HEAD(&ls->ls_nodes);
451 INIT_LIST_HEAD(&ls->ls_nodes_gone);
452 ls->ls_num_nodes = 0;
453 ls->ls_low_nodeid = 0;
454 ls->ls_total_weight = 0;
455 ls->ls_node_array = NULL;
456
457 memset(&ls->ls_stub_rsb, 0, sizeof(struct dlm_rsb));
458 ls->ls_stub_rsb.res_ls = ls;
459
460 ls->ls_debug_rsb_dentry = NULL;
461 ls->ls_debug_waiters_dentry = NULL;
462
463 init_waitqueue_head(&ls->ls_uevent_wait);
464 ls->ls_uevent_result = 0;
465
466 ls->ls_recoverd_task = NULL;
467 mutex_init(&ls->ls_recoverd_active);
468 spin_lock_init(&ls->ls_recover_lock);
469 ls->ls_recover_status = 0;
470 ls->ls_recover_seq = 0;
471 ls->ls_recover_args = NULL;
472 init_rwsem(&ls->ls_in_recovery);
473 INIT_LIST_HEAD(&ls->ls_requestqueue);
474 mutex_init(&ls->ls_requestqueue_mutex);
475 mutex_init(&ls->ls_clear_proc_locks);
476
477 ls->ls_recover_buf = kmalloc(dlm_config.buffer_size, GFP_KERNEL);
478 if (!ls->ls_recover_buf)
479 goto out_dirfree;
480
481 INIT_LIST_HEAD(&ls->ls_recover_list);
482 spin_lock_init(&ls->ls_recover_list_lock);
483 ls->ls_recover_list_count = 0;
484 ls->ls_local_handle = ls;
485 init_waitqueue_head(&ls->ls_wait_general);
486 INIT_LIST_HEAD(&ls->ls_root_list);
487 init_rwsem(&ls->ls_root_sem);
488
489 down_write(&ls->ls_in_recovery);
490
491 spin_lock(&lslist_lock);
492 list_add(&ls->ls_list, &lslist);
493 spin_unlock(&lslist_lock);
494
495 /* needs to find ls in lslist */
496 error = dlm_recoverd_start(ls);
497 if (error) {
498 log_error(ls, "can't start dlm_recoverd %d", error);
499 goto out_rcomfree;
500 }
501
502 dlm_create_debug_file(ls);
503
504 error = kobject_setup(ls);
505 if (error)
506 goto out_del;
507
508 error = kobject_register(&ls->ls_kobj);
509 if (error)
510 goto out_del;
511
512 error = do_uevent(ls, 1);
513 if (error)
514 goto out_unreg;
515
516 *lockspace = ls;
517 return 0;
518
519 out_unreg:
520 kobject_unregister(&ls->ls_kobj);
521 out_del:
522 dlm_delete_debug_file(ls);
523 dlm_recoverd_stop(ls);
524 out_rcomfree:
525 spin_lock(&lslist_lock);
526 list_del(&ls->ls_list);
527 spin_unlock(&lslist_lock);
528 kfree(ls->ls_recover_buf);
529 out_dirfree:
530 kfree(ls->ls_dirtbl);
531 out_lkbfree:
532 kfree(ls->ls_lkbtbl);
533 out_rsbfree:
534 kfree(ls->ls_rsbtbl);
535 out_lsfree:
536 kfree(ls);
537 out:
538 module_put(THIS_MODULE);
539 return error;
540}
541
542int dlm_new_lockspace(char *name, int namelen, void **lockspace,
543 uint32_t flags, int lvblen)
544{
545 int error = 0;
546
547 mutex_lock(&ls_lock);
548 if (!ls_count)
549 error = threads_start();
550 if (error)
551 goto out;
552
553 error = new_lockspace(name, namelen, lockspace, flags, lvblen);
554 if (!error)
555 ls_count++;
556 out:
557 mutex_unlock(&ls_lock);
558 return error;
559}
560
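/* Illustrative usage (a sketch, not code from this patch): a kernel user
   such as a cluster filesystem might create its lockspace with

	dlm_lockspace_t *ls;
	error = dlm_new_lockspace("myfs", strlen("myfs"), (void **)&ls, 0, 32);

   where "myfs" is a made-up name and lvblen (32 here) must be a nonzero
   multiple of 8, as checked in new_lockspace() above. */
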
561/* Return 1 if the lockspace still has active remote locks,
562 * 2 if the lockspace still has active local locks.
563 */
564static int lockspace_busy(struct dlm_ls *ls)
565{
566 int i, lkb_found = 0;
567 struct dlm_lkb *lkb;
568
569 /* NOTE: We check the lockidtbl here rather than the resource table.
570 This is because there may be LKBs queued as ASTs that have been
571 unlinked from their RSBs and are pending deletion once the AST has
572 been delivered */
573
574 for (i = 0; i < ls->ls_lkbtbl_size; i++) {
575 read_lock(&ls->ls_lkbtbl[i].lock);
576 if (!list_empty(&ls->ls_lkbtbl[i].list)) {
577 lkb_found = 1;
578 list_for_each_entry(lkb, &ls->ls_lkbtbl[i].list,
579 lkb_idtbl_list) {
580 if (!lkb->lkb_nodeid) {
581 read_unlock(&ls->ls_lkbtbl[i].lock);
582 return 2;
583 }
584 }
585 }
586 read_unlock(&ls->ls_lkbtbl[i].lock);
587 }
588 return lkb_found;
589}
590
591static int release_lockspace(struct dlm_ls *ls, int force)
592{
593 struct dlm_lkb *lkb;
594 struct dlm_rsb *rsb;
595 struct list_head *head;
596 int i;
597 int busy = lockspace_busy(ls);
598
599 if (busy > force)
600 return -EBUSY;
601
602 if (force < 3)
603 do_uevent(ls, 0);
604
605 dlm_recoverd_stop(ls);
606
607 remove_lockspace(ls);
608
609 dlm_delete_debug_file(ls);
610
611 dlm_astd_suspend();
612
613 kfree(ls->ls_recover_buf);
614
615 /*
616 * Free direntry structs.
617 */
618
619 dlm_dir_clear(ls);
620 kfree(ls->ls_dirtbl);
621
622 /*
623 * Free all lkb's on lkbtbl[] lists.
624 */
625
626 for (i = 0; i < ls->ls_lkbtbl_size; i++) {
627 head = &ls->ls_lkbtbl[i].list;
628 while (!list_empty(head)) {
629 lkb = list_entry(head->next, struct dlm_lkb,
630 lkb_idtbl_list);
631
632 list_del(&lkb->lkb_idtbl_list);
633
634 dlm_del_ast(lkb);
635
636 if (lkb->lkb_lvbptr && lkb->lkb_flags & DLM_IFL_MSTCPY)
637 free_lvb(lkb->lkb_lvbptr);
638
639 free_lkb(lkb);
640 }
641 }
642 dlm_astd_resume();
643
644 kfree(ls->ls_lkbtbl);
645
646 /*
647 * Free all rsb's on rsbtbl[] lists
648 */
649
650 for (i = 0; i < ls->ls_rsbtbl_size; i++) {
651 head = &ls->ls_rsbtbl[i].list;
652 while (!list_empty(head)) {
653 rsb = list_entry(head->next, struct dlm_rsb,
654 res_hashchain);
655
656 list_del(&rsb->res_hashchain);
657 free_rsb(rsb);
658 }
659
660 head = &ls->ls_rsbtbl[i].toss;
661 while (!list_empty(head)) {
662 rsb = list_entry(head->next, struct dlm_rsb,
663 res_hashchain);
664 list_del(&rsb->res_hashchain);
665 free_rsb(rsb);
666 }
667 }
668
669 kfree(ls->ls_rsbtbl);
670
671 /*
672 * Free structures on any other lists
673 */
674
675 kfree(ls->ls_recover_args);
676 dlm_clear_free_entries(ls);
677 dlm_clear_members(ls);
678 dlm_clear_members_gone(ls);
679 kfree(ls->ls_node_array);
680 kobject_unregister(&ls->ls_kobj);
681 kfree(ls);
682
683 mutex_lock(&ls_lock);
684 ls_count--;
685 if (!ls_count)
686 threads_stop();
687 mutex_unlock(&ls_lock);
688
689 module_put(THIS_MODULE);
690 return 0;
691}
692
693/*
694 * Called when a system has released all its locks and is not going to use the
695 * lockspace any longer. We free everything we're managing for this lockspace.
696 * Remaining nodes will go through the recovery process as if we'd died. The
697 * lockspace must continue to function as usual, participating in recoveries,
698 * until this returns.
699 *
700 * Force has 4 possible values:
701 * 0 - don't destroy lockspace if it has any LKBs
702 * 1 - destroy lockspace if it has remote LKBs but not if it has local LKBs
703 * 2 - destroy lockspace regardless of LKBs
704 * 3 - destroy lockspace as part of a forced shutdown
705 */
706
707int dlm_release_lockspace(void *lockspace, int force)
708{
709 struct dlm_ls *ls;
710
711 ls = dlm_find_lockspace_local(lockspace);
712 if (!ls)
713 return -EINVAL;
714 dlm_put_lockspace(ls);
715 return release_lockspace(ls, force);
716}
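
/* Illustrative shutdown sequence (a sketch under the force semantics
   documented above, not code from this patch):

	error = dlm_release_lockspace(ls, 0);
	if (error == -EBUSY)
		error = dlm_release_lockspace(ls, 2);

   force=0 fails with -EBUSY while any LKBs remain (lockspace_busy() > 0);
   force=2 tears the lockspace down regardless of remaining LKBs. */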
717
diff --git a/fs/dlm/lockspace.h b/fs/dlm/lockspace.h
new file mode 100644
index 000000000000..891eabbdd021
--- /dev/null
+++ b/fs/dlm/lockspace.h
@@ -0,0 +1,25 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __LOCKSPACE_DOT_H__
15#define __LOCKSPACE_DOT_H__
16
17int dlm_lockspace_init(void);
18void dlm_lockspace_exit(void);
19struct dlm_ls *dlm_find_lockspace_global(uint32_t id);
20struct dlm_ls *dlm_find_lockspace_local(void *id);
21struct dlm_ls *dlm_find_lockspace_device(int minor);
22void dlm_put_lockspace(struct dlm_ls *ls);
23
24#endif /* __LOCKSPACE_DOT_H__ */
25
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
new file mode 100644
index 000000000000..6da6b14d5a61
--- /dev/null
+++ b/fs/dlm/lowcomms.c
@@ -0,0 +1,1239 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14/*
15 * lowcomms.c
16 *
17 * This is the "low-level" comms layer.
18 *
19 * It is responsible for sending/receiving messages
20 * from other nodes in the cluster.
21 *
22 * Cluster nodes are referred to by their nodeids. nodeids are
23 * simply 32 bit numbers to the locking module - if they need to
24 * be expanded for the cluster infrastructure then that is its
25 * responsibility. It is this layer's responsibility to resolve
26 * these into IP addresses or whatever it needs for inter-node
27 * communication.
28 *
29 * The comms level is two kernel threads that deal mainly with
30 * the receiving of messages from other nodes and passing them
31 * up to the mid-level comms layer (which understands the
32 * message format) for execution by the locking core, and
33 * a send thread which does all the setting up of connections
34 * to remote nodes and the sending of data. Threads are not allowed
35 * to send their own data because it may cause them to wait in times
36 * of high load. Also, this way, the sending thread can collect together
37 * messages bound for one node and send them in one block.
38 *
39 * I don't see any problem with the recv thread executing the locking
40 * code on behalf of remote processes as the locking code is
41 * short, efficient and never (well, hardly ever) waits.
42 *
43 */
44
45#include <asm/ioctls.h>
46#include <net/sock.h>
47#include <net/tcp.h>
48#include <net/sctp/user.h>
49#include <linux/pagemap.h>
50#include <linux/socket.h>
51#include <linux/idr.h>
52
53#include "dlm_internal.h"
54#include "lowcomms.h"
55#include "config.h"
56#include "midcomms.h"
57
58static struct sockaddr_storage *dlm_local_addr[DLM_MAX_ADDR_COUNT];
59static int dlm_local_count;
60static int dlm_local_nodeid;
61
62/* One of these per connected node */
63
64#define NI_INIT_PENDING 1
65#define NI_WRITE_PENDING 2
66
67struct nodeinfo {
68 spinlock_t lock;
69 sctp_assoc_t assoc_id;
70 unsigned long flags;
71 struct list_head write_list; /* nodes with pending writes */
72 struct list_head writequeue; /* outgoing writequeue_entries */
73 spinlock_t writequeue_lock;
74 int nodeid;
75};
76
77static DEFINE_IDR(nodeinfo_idr);
78static struct rw_semaphore nodeinfo_lock;
79static int max_nodeid;
80
81struct cbuf {
82 unsigned base;
83 unsigned len;
84 unsigned mask;
85};
86
87/* Just the one of these, now. But this struct keeps
88 the connection-specific variables together */
89
90#define CF_READ_PENDING 1
91
92struct connection {
93 struct socket *sock;
94 unsigned long flags;
95 struct page *rx_page;
96 atomic_t waiting_requests;
97 struct cbuf cb;
98 int eagain_flag;
99};
100
101/* An entry waiting to be sent */
102
103struct writequeue_entry {
104 struct list_head list;
105 struct page *page;
106 int offset;
107 int len;
108 int end;
109 int users;
110 struct nodeinfo *ni;
111};
112
113#define CBUF_ADD(cb, n) do { (cb)->len += (n); } while(0)
114#define CBUF_EMPTY(cb) ((cb)->len == 0)
115#define CBUF_MAY_ADD(cb, n) (((cb)->len + (n)) < ((cb)->mask + 1))
116#define CBUF_DATA(cb) (((cb)->base + (cb)->len) & (cb)->mask)
117
118#define CBUF_INIT(cb, size) \
119do { \
120 (cb)->base = (cb)->len = 0; \
121 (cb)->mask = ((size)-1); \
122} while(0)
123
124#define CBUF_EAT(cb, n) \
125do { \
126 (cb)->len -= (n); \
127 (cb)->base += (n); \
128 (cb)->base &= (cb)->mask; \
129} while(0)
130
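/* Worked example of the ring arithmetic above: after CBUF_INIT(cb, 16)
   (base=0, len=0, mask=15) and CBUF_ADD(cb, 10), new data occupies offsets
   0..9 and CBUF_DATA(cb) == 10. CBUF_EAT(cb, 4) then leaves base=4, len=6,
   so the next write still lands at (4+6) & 15 == 10 while unread data now
   starts at offset 4. */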
131
132/* List of nodes which have writes pending */
133static struct list_head write_nodes;
134static spinlock_t write_nodes_lock;
135
136/* Maximum number of incoming messages to process before
137 * doing a schedule()
138 */
139#define MAX_RX_MSG_COUNT 25
140
141/* Manage daemons */
142static struct task_struct *recv_task;
143static struct task_struct *send_task;
144static wait_queue_head_t lowcomms_recv_wait;
145static atomic_t accepting;
146
147/* The SCTP connection */
148static struct connection sctp_con;
149
150
151static int nodeid_to_addr(int nodeid, struct sockaddr *retaddr)
152{
153 struct sockaddr_storage addr;
154 int error;
155
156 if (!dlm_local_count)
157 return -1;
158
159 error = dlm_nodeid_to_addr(nodeid, &addr);
160 if (error)
161 return error;
162
163 if (dlm_local_addr[0]->ss_family == AF_INET) {
164 struct sockaddr_in *in4 = (struct sockaddr_in *) &addr;
165 struct sockaddr_in *ret4 = (struct sockaddr_in *) retaddr;
166 ret4->sin_addr.s_addr = in4->sin_addr.s_addr;
167 } else {
168 struct sockaddr_in6 *in6 = (struct sockaddr_in6 *) &addr;
169 struct sockaddr_in6 *ret6 = (struct sockaddr_in6 *) retaddr;
170 memcpy(&ret6->sin6_addr, &in6->sin6_addr,
171 sizeof(in6->sin6_addr));
172 }
173
174 return 0;
175}
176
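/* Map a nodeid to its nodeinfo, optionally allocating one. The idr acts as
   a sparse nodeid -> nodeinfo table; the second idr_find() under the write
   lock handles the race where another caller created the entry between our
   read-side lookup and taking nodeinfo_lock for write. */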
177static struct nodeinfo *nodeid2nodeinfo(int nodeid, gfp_t alloc)
178{
179 struct nodeinfo *ni;
180 int r;
181 int n;
182
183 down_read(&nodeinfo_lock);
184 ni = idr_find(&nodeinfo_idr, nodeid);
185 up_read(&nodeinfo_lock);
186
187 if (!ni && alloc) {
188 down_write(&nodeinfo_lock);
189
190 ni = idr_find(&nodeinfo_idr, nodeid);
191 if (ni)
192 goto out_up;
193
194 r = idr_pre_get(&nodeinfo_idr, alloc);
195 if (!r)
196 goto out_up;
197
198 ni = kmalloc(sizeof(struct nodeinfo), alloc);
199 if (!ni)
200 goto out_up;
201
202 r = idr_get_new_above(&nodeinfo_idr, ni, nodeid, &n);
203 if (r) {
204 kfree(ni);
205 ni = NULL;
206 goto out_up;
207 }
208 if (n != nodeid) {
209 idr_remove(&nodeinfo_idr, n);
210 kfree(ni);
211 ni = NULL;
212 goto out_up;
213 }
214 memset(ni, 0, sizeof(struct nodeinfo));
215 spin_lock_init(&ni->lock);
216 INIT_LIST_HEAD(&ni->writequeue);
217 spin_lock_init(&ni->writequeue_lock);
218 ni->nodeid = nodeid;
219
220 if (nodeid > max_nodeid)
221 max_nodeid = nodeid;
222 out_up:
223 up_write(&nodeinfo_lock);
224 }
225
226 return ni;
227}
228
229/* Don't call this too often... */
230static struct nodeinfo *assoc2nodeinfo(sctp_assoc_t assoc)
231{
232 int i;
233 struct nodeinfo *ni;
234
235 for (i=1; i<=max_nodeid; i++) {
236 ni = nodeid2nodeinfo(i, 0);
237 if (ni && ni->assoc_id == assoc)
238 return ni;
239 }
240 return NULL;
241}
242
243/* Data or notification available on socket */
244static void lowcomms_data_ready(struct sock *sk, int count_unused)
245{
246 atomic_inc(&sctp_con.waiting_requests);
247 if (test_and_set_bit(CF_READ_PENDING, &sctp_con.flags))
248 return;
249
250 wake_up_interruptible(&lowcomms_recv_wait);
251}
252
253
254/* Add the port number to an IP6 or 4 sockaddr and return the address length.
255   Also pad out the struct with zeros to make comparisons meaningful */
256
257static void make_sockaddr(struct sockaddr_storage *saddr, uint16_t port,
258 int *addr_len)
259{
260 struct sockaddr_in *local4_addr;
261 struct sockaddr_in6 *local6_addr;
262
263 if (!dlm_local_count)
264 return;
265
266 if (!port) {
267 if (dlm_local_addr[0]->ss_family == AF_INET) {
268 local4_addr = (struct sockaddr_in *)dlm_local_addr[0];
269 port = be16_to_cpu(local4_addr->sin_port);
270 } else {
271 local6_addr = (struct sockaddr_in6 *)dlm_local_addr[0];
272 port = be16_to_cpu(local6_addr->sin6_port);
273 }
274 }
275
276 saddr->ss_family = dlm_local_addr[0]->ss_family;
277 if (dlm_local_addr[0]->ss_family == AF_INET) {
278 struct sockaddr_in *in4_addr = (struct sockaddr_in *)saddr;
279 in4_addr->sin_port = cpu_to_be16(port);
280 memset(&in4_addr->sin_zero, 0, sizeof(in4_addr->sin_zero));
281 memset(in4_addr+1, 0, sizeof(struct sockaddr_storage) -
282 sizeof(struct sockaddr_in));
283 *addr_len = sizeof(struct sockaddr_in);
284 } else {
285 struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)saddr;
286 in6_addr->sin6_port = cpu_to_be16(port);
287 memset(in6_addr+1, 0, sizeof(struct sockaddr_storage) -
288 sizeof(struct sockaddr_in6));
289 *addr_len = sizeof(struct sockaddr_in6);
290 }
291}
292
293/* Close the connection and tidy up */
294static void close_connection(void)
295{
296 if (sctp_con.sock) {
297 sock_release(sctp_con.sock);
298 sctp_con.sock = NULL;
299 }
300
301 if (sctp_con.rx_page) {
302 __free_page(sctp_con.rx_page);
303 sctp_con.rx_page = NULL;
304 }
305}
306
307/* We only send shutdown messages to nodes that are not part of the cluster */
308static void send_shutdown(sctp_assoc_t associd)
309{
310 static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
311 struct msghdr outmessage;
312 struct cmsghdr *cmsg;
313 struct sctp_sndrcvinfo *sinfo;
314 int ret;
315
316 outmessage.msg_name = NULL;
317 outmessage.msg_namelen = 0;
318 outmessage.msg_control = outcmsg;
319 outmessage.msg_controllen = sizeof(outcmsg);
320 outmessage.msg_flags = MSG_EOR;
321
322 cmsg = CMSG_FIRSTHDR(&outmessage);
323 cmsg->cmsg_level = IPPROTO_SCTP;
324 cmsg->cmsg_type = SCTP_SNDRCV;
325 cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
326 outmessage.msg_controllen = cmsg->cmsg_len;
327 sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
328 memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
329
330 sinfo->sinfo_flags |= MSG_EOF;
331 sinfo->sinfo_assoc_id = associd;
332
333 ret = kernel_sendmsg(sctp_con.sock, &outmessage, NULL, 0, 0);
334
335 if (ret != 0)
336 log_print("send EOF to node failed: %d", ret);
337}
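
/* Note: a zero-length sendmsg with sinfo_flags MSG_EOF is how the SCTP
   one-to-many API requests a graceful shutdown of a single association,
   as done in send_shutdown() above. */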
338
339
340/* INIT failed but we don't know which node...
341 restart INIT on all pending nodes */
342static void init_failed(void)
343{
344 int i;
345 struct nodeinfo *ni;
346
347 for (i=1; i<=max_nodeid; i++) {
348 ni = nodeid2nodeinfo(i, 0);
349 if (!ni)
350 continue;
351
352 if (test_and_clear_bit(NI_INIT_PENDING, &ni->flags)) {
353 ni->assoc_id = 0;
354 if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
355 spin_lock_bh(&write_nodes_lock);
356 list_add_tail(&ni->write_list, &write_nodes);
357 spin_unlock_bh(&write_nodes_lock);
358 }
359 }
360 }
361 wake_up_process(send_task);
362}
363
364/* Something happened to an association */
365static void process_sctp_notification(struct msghdr *msg, char *buf)
366{
367 union sctp_notification *sn = (union sctp_notification *)buf;
368
369 if (sn->sn_header.sn_type == SCTP_ASSOC_CHANGE) {
370 switch (sn->sn_assoc_change.sac_state) {
371
372 case SCTP_COMM_UP:
373 case SCTP_RESTART:
374 {
375 /* Check that the new node is in the lockspace */
376 struct sctp_prim prim;
377 mm_segment_t fs;
378 int nodeid;
379 int prim_len, ret;
380 int addr_len;
381 struct nodeinfo *ni;
382
383			/* This seems to happen when we receive a connection
384			 * too early... or something... anyway, it happens, but
385			 * we always seem to get a real message too; see
386			 * receive_from_sock */
387
388 if ((int)sn->sn_assoc_change.sac_assoc_id <= 0) {
389 log_print("COMM_UP for invalid assoc ID %d",
390 (int)sn->sn_assoc_change.sac_assoc_id);
391 init_failed();
392 return;
393 }
394 memset(&prim, 0, sizeof(struct sctp_prim));
395 prim_len = sizeof(struct sctp_prim);
396 prim.ssp_assoc_id = sn->sn_assoc_change.sac_assoc_id;
397
398 fs = get_fs();
399 set_fs(get_ds());
400 ret = sctp_con.sock->ops->getsockopt(sctp_con.sock,
401 IPPROTO_SCTP, SCTP_PRIMARY_ADDR,
402 (char*)&prim, &prim_len);
403 set_fs(fs);
404 if (ret < 0) {
405 struct nodeinfo *ni;
406
407 log_print("getsockopt/sctp_primary_addr on "
408 "new assoc %d failed : %d",
409 (int)sn->sn_assoc_change.sac_assoc_id, ret);
410
411 /* Retry INIT later */
412 ni = assoc2nodeinfo(sn->sn_assoc_change.sac_assoc_id);
413 if (ni)
414 clear_bit(NI_INIT_PENDING, &ni->flags);
415 return;
416 }
417 make_sockaddr(&prim.ssp_addr, 0, &addr_len);
418 if (dlm_addr_to_nodeid(&prim.ssp_addr, &nodeid)) {
419 log_print("reject connect from unknown addr");
420 send_shutdown(prim.ssp_assoc_id);
421 return;
422 }
423
424 ni = nodeid2nodeinfo(nodeid, GFP_KERNEL);
425 if (!ni)
426 return;
427
428 /* Save the assoc ID */
429 spin_lock(&ni->lock);
430 ni->assoc_id = sn->sn_assoc_change.sac_assoc_id;
431 spin_unlock(&ni->lock);
432
433 log_print("got new/restarted association %d nodeid %d",
434 (int)sn->sn_assoc_change.sac_assoc_id, nodeid);
435
436 /* Send any pending writes */
437 clear_bit(NI_INIT_PENDING, &ni->flags);
438 if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
439 spin_lock_bh(&write_nodes_lock);
440 list_add_tail(&ni->write_list, &write_nodes);
441 spin_unlock_bh(&write_nodes_lock);
442 }
443 wake_up_process(send_task);
444 }
445 break;
446
447 case SCTP_COMM_LOST:
448 case SCTP_SHUTDOWN_COMP:
449 {
450 struct nodeinfo *ni;
451
452 ni = assoc2nodeinfo(sn->sn_assoc_change.sac_assoc_id);
453 if (ni) {
454 spin_lock(&ni->lock);
455 ni->assoc_id = 0;
456 spin_unlock(&ni->lock);
457 }
458 }
459 break;
460
461	/* We don't know which INIT failed, so clear the PENDING flags
462	 * on them all. If assoc_id is zero, it will then try
463	 * again */
464
465 case SCTP_CANT_STR_ASSOC:
466 {
467 log_print("Can't start SCTP association - retrying");
468 init_failed();
469 }
470 break;
471
472 default:
473 log_print("unexpected SCTP assoc change id=%d state=%d",
474 (int)sn->sn_assoc_change.sac_assoc_id,
475 sn->sn_assoc_change.sac_state);
476 }
477 }
478}
479
480/* Data received from remote end */
481static int receive_from_sock(void)
482{
483 int ret = 0;
484 struct msghdr msg;
485 struct kvec iov[2];
486 unsigned len;
487 int r;
488 struct sctp_sndrcvinfo *sinfo;
489 struct cmsghdr *cmsg;
490 struct nodeinfo *ni;
491
492 /* These two are marginally too big for stack allocation, but this
493 * function is (currently) only called by dlm_recvd so static should be
494 * OK.
495 */
496 static struct sockaddr_storage msgname;
497 static char incmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
498
499 if (sctp_con.sock == NULL)
500 goto out;
501
502 if (sctp_con.rx_page == NULL) {
503 /*
504 * This doesn't need to be atomic, but I think it should
505 * improve performance if it is.
506 */
507 sctp_con.rx_page = alloc_page(GFP_ATOMIC);
508 if (sctp_con.rx_page == NULL)
509 goto out_resched;
510 CBUF_INIT(&sctp_con.cb, PAGE_CACHE_SIZE);
511 }
512
513	memset(&incmsg, 0, sizeof(incmsg));
514	memset(&msgname, 0, sizeof(msgname));
515
517 msg.msg_name = &msgname;
518 msg.msg_namelen = sizeof(msgname);
519 msg.msg_flags = 0;
520 msg.msg_control = incmsg;
521 msg.msg_controllen = sizeof(incmsg);
522 msg.msg_iovlen = 1;
523
524	/* I don't see why this circular buffer stuff is necessary for SCTP,
525	 * which is a packet-based protocol, but the whole thing breaks under
526 * load without it! The overhead is minimal (and is in the TCP lowcomms
527 * anyway, of course) so I'll leave it in until I can figure out what's
528 * really happening.
529 */
530
531 /*
532 * iov[0] is the bit of the circular buffer between the current end
533 * point (cb.base + cb.len) and the end of the buffer.
534 */
535 iov[0].iov_len = sctp_con.cb.base - CBUF_DATA(&sctp_con.cb);
536 iov[0].iov_base = page_address(sctp_con.rx_page) +
537 CBUF_DATA(&sctp_con.cb);
538 iov[1].iov_len = 0;
539
540 /*
541 * iov[1] is the bit of the circular buffer between the start of the
542 * buffer and the start of the currently used section (cb.base)
543 */
544 if (CBUF_DATA(&sctp_con.cb) >= sctp_con.cb.base) {
545 iov[0].iov_len = PAGE_CACHE_SIZE - CBUF_DATA(&sctp_con.cb);
546 iov[1].iov_len = sctp_con.cb.base;
547 iov[1].iov_base = page_address(sctp_con.rx_page);
548 msg.msg_iovlen = 2;
549 }
550 len = iov[0].iov_len + iov[1].iov_len;
551
552 r = ret = kernel_recvmsg(sctp_con.sock, &msg, iov, msg.msg_iovlen, len,
553 MSG_NOSIGNAL | MSG_DONTWAIT);
554 if (ret <= 0)
555 goto out_close;
556
557 msg.msg_control = incmsg;
558 msg.msg_controllen = sizeof(incmsg);
559 cmsg = CMSG_FIRSTHDR(&msg);
560 sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
561
562 if (msg.msg_flags & MSG_NOTIFICATION) {
563 process_sctp_notification(&msg, page_address(sctp_con.rx_page));
564 return 0;
565 }
566
567 /* Is this a new association ? */
568 ni = nodeid2nodeinfo(le32_to_cpu(sinfo->sinfo_ppid), GFP_KERNEL);
569 if (ni) {
570 ni->assoc_id = sinfo->sinfo_assoc_id;
571 if (test_and_clear_bit(NI_INIT_PENDING, &ni->flags)) {
572
573 if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
574 spin_lock_bh(&write_nodes_lock);
575 list_add_tail(&ni->write_list, &write_nodes);
576 spin_unlock_bh(&write_nodes_lock);
577 }
578 wake_up_process(send_task);
579 }
580 }
581
582 /* INIT sends a message with length of 1 - ignore it */
583 if (r == 1)
584 return 0;
585
586 CBUF_ADD(&sctp_con.cb, ret);
587 ret = dlm_process_incoming_buffer(cpu_to_le32(sinfo->sinfo_ppid),
588 page_address(sctp_con.rx_page),
589 sctp_con.cb.base, sctp_con.cb.len,
590 PAGE_CACHE_SIZE);
591 if (ret < 0)
592 goto out_close;
593 CBUF_EAT(&sctp_con.cb, ret);
594
595 out:
596 ret = 0;
597 goto out_ret;
598
599 out_resched:
600 lowcomms_data_ready(sctp_con.sock->sk, 0);
601 ret = 0;
602 schedule();
603 goto out_ret;
604
605 out_close:
606 if (ret != -EAGAIN)
607 log_print("error reading from sctp socket: %d", ret);
608 out_ret:
609 return ret;
610}
611
612/* Bind to an IP address. SCTP allows multiple addresses so it can do multi-homing */
613static int add_bind_addr(struct sockaddr_storage *addr, int addr_len, int num)
614{
615 mm_segment_t fs;
616 int result = 0;
617
618 fs = get_fs();
619 set_fs(get_ds());
620 if (num == 1)
621 result = sctp_con.sock->ops->bind(sctp_con.sock,
622 (struct sockaddr *) addr, addr_len);
623 else
624 result = sctp_con.sock->ops->setsockopt(sctp_con.sock, SOL_SCTP,
625 SCTP_SOCKOPT_BINDX_ADD, (char *)addr, addr_len);
626 set_fs(fs);
627
628 if (result < 0)
629 log_print("Can't bind to port %d addr number %d",
630 dlm_config.tcp_port, num);
631
632 return result;
633}
634
635static void init_local(void)
636{
637 struct sockaddr_storage sas, *addr;
638 int i;
639
640 dlm_local_nodeid = dlm_our_nodeid();
641
642 for (i = 0; i < DLM_MAX_ADDR_COUNT - 1; i++) {
643 if (dlm_our_addr(&sas, i))
644 break;
645
646 addr = kmalloc(sizeof(*addr), GFP_KERNEL);
647 if (!addr)
648 break;
649 memcpy(addr, &sas, sizeof(*addr));
650 dlm_local_addr[dlm_local_count++] = addr;
651 }
652}
653
654/* Initialise SCTP socket and bind to all interfaces */
655static int init_sock(void)
656{
657 mm_segment_t fs;
658 struct socket *sock = NULL;
659 struct sockaddr_storage localaddr;
660 struct sctp_event_subscribe subscribe;
661 int result = -EINVAL, num = 1, i, addr_len;
662
663 if (!dlm_local_count) {
664 init_local();
665 if (!dlm_local_count) {
666 log_print("no local IP address has been set");
667 goto out;
668 }
669 }
670
671 result = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_SEQPACKET,
672 IPPROTO_SCTP, &sock);
673 if (result < 0) {
674 log_print("Can't create comms socket, check SCTP is loaded");
675 goto out;
676 }
677
678 /* Listen for events */
679 memset(&subscribe, 0, sizeof(subscribe));
680 subscribe.sctp_data_io_event = 1;
681 subscribe.sctp_association_event = 1;
682 subscribe.sctp_send_failure_event = 1;
683 subscribe.sctp_shutdown_event = 1;
684 subscribe.sctp_partial_delivery_event = 1;
685
686 fs = get_fs();
687 set_fs(get_ds());
688 result = sock->ops->setsockopt(sock, SOL_SCTP, SCTP_EVENTS,
689 (char *)&subscribe, sizeof(subscribe));
690 set_fs(fs);
691
692 if (result < 0) {
693 log_print("Failed to set SCTP_EVENTS on socket: result=%d",
694 result);
695 goto create_delsock;
696 }
697
698 /* Init con struct */
699 sock->sk->sk_user_data = &sctp_con;
700 sctp_con.sock = sock;
701 sctp_con.sock->sk->sk_data_ready = lowcomms_data_ready;
702
703 /* Bind to all interfaces. */
704 for (i = 0; i < dlm_local_count; i++) {
705 memcpy(&localaddr, dlm_local_addr[i], sizeof(localaddr));
706 make_sockaddr(&localaddr, dlm_config.tcp_port, &addr_len);
707
708 result = add_bind_addr(&localaddr, addr_len, num);
709 if (result)
710 goto create_delsock;
711 ++num;
712 }
713
714 result = sock->ops->listen(sock, 5);
715 if (result < 0) {
716 log_print("Can't set socket listening");
717 goto create_delsock;
718 }
719
720 return 0;
721
722 create_delsock:
723 sock_release(sock);
724 sctp_con.sock = NULL;
725 out:
726 return result;
727}
728
729
730static struct writequeue_entry *new_writequeue_entry(gfp_t allocation)
731{
732 struct writequeue_entry *entry;
733
734 entry = kmalloc(sizeof(struct writequeue_entry), allocation);
735 if (!entry)
736 return NULL;
737
738 entry->page = alloc_page(allocation);
739 if (!entry->page) {
740 kfree(entry);
741 return NULL;
742 }
743
744 entry->offset = 0;
745 entry->len = 0;
746 entry->end = 0;
747 entry->users = 0;
748
749 return entry;
750}
751
752void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc)
753{
754 struct writequeue_entry *e;
755 int offset = 0;
756 int users = 0;
757 struct nodeinfo *ni;
758
759 if (!atomic_read(&accepting))
760 return NULL;
761
762 ni = nodeid2nodeinfo(nodeid, allocation);
763 if (!ni)
764 return NULL;
765
766 spin_lock(&ni->writequeue_lock);
767 e = list_entry(ni->writequeue.prev, struct writequeue_entry, list);
768 if (((struct list_head *) e == &ni->writequeue) ||
769 (PAGE_CACHE_SIZE - e->end < len)) {
770 e = NULL;
771 } else {
772 offset = e->end;
773 e->end += len;
774 users = e->users++;
775 }
776 spin_unlock(&ni->writequeue_lock);
777
778 if (e) {
779 got_one:
780 if (users == 0)
781 kmap(e->page);
782 *ppc = page_address(e->page) + offset;
783 return e;
784 }
785
786 e = new_writequeue_entry(allocation);
787 if (e) {
788 spin_lock(&ni->writequeue_lock);
789 offset = e->end;
790 e->end += len;
791 e->ni = ni;
792 users = e->users++;
793 list_add_tail(&e->list, &ni->writequeue);
794 spin_unlock(&ni->writequeue_lock);
795 goto got_one;
796 }
797 return NULL;
798}
799
800void dlm_lowcomms_commit_buffer(void *arg)
801{
802 struct writequeue_entry *e = (struct writequeue_entry *) arg;
803 int users;
804 struct nodeinfo *ni = e->ni;
805
806 if (!atomic_read(&accepting))
807 return;
808
809 spin_lock(&ni->writequeue_lock);
810 users = --e->users;
811 if (users)
812 goto out;
813 e->len = e->end - e->offset;
814 kunmap(e->page);
815 spin_unlock(&ni->writequeue_lock);
816
817 if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
818 spin_lock_bh(&write_nodes_lock);
819 list_add_tail(&ni->write_list, &write_nodes);
820 spin_unlock_bh(&write_nodes_lock);
821 wake_up_process(send_task);
822 }
823 return;
824
825 out:
826 spin_unlock(&ni->writequeue_lock);
827 return;
828}
829
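 829
dlm_lowcomms_get_buffer() and dlm_lowcomms_commit_buffer() form a two-phase
send API: reserve space in a per-node page, fill the message in place, then
commit it so dlm_sendd picks the node up. A minimal caller sketch (struct
my_msg is hypothetical; create_rcom()/send_rcom() in rcom.c follow exactly
this pattern):

	struct dlm_mhandle *mh;		/* opaque handle to a writequeue_entry */
	char *mb;

	mh = dlm_lowcomms_get_buffer(nodeid, sizeof(struct my_msg),
				     GFP_KERNEL, &mb);
	if (!mh)
		return -ENOBUFS;
	memset(mb, 0, sizeof(struct my_msg));
	/* ... fill in the message at mb ... */
	dlm_lowcomms_commit_buffer(mh);	/* queues the node for dlm_sendd */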
830static void free_entry(struct writequeue_entry *e)
831{
832 __free_page(e->page);
833 kfree(e);
834}
835
836/* Initiate an SCTP association. In theory we could just use sendmsg() on
837 the first IP address and it should work, but this allows us to set up the
838 association before sending any valuable data that we can't afford to lose.
839 It also keeps the send path clean as it can now always use the association ID */
840static void initiate_association(int nodeid)
841{
842 struct sockaddr_storage rem_addr;
843 static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
844 struct msghdr outmessage;
845 struct cmsghdr *cmsg;
846 struct sctp_sndrcvinfo *sinfo;
847 int ret;
848 int addrlen;
849 char buf[1];
850 struct kvec iov[1];
851 struct nodeinfo *ni;
852
853 log_print("Initiating association with node %d", nodeid);
854
855 ni = nodeid2nodeinfo(nodeid, GFP_KERNEL);
856 if (!ni)
857 return;
858
859 if (nodeid_to_addr(nodeid, (struct sockaddr *)&rem_addr)) {
860 log_print("no address for nodeid %d", nodeid);
861 return;
862 }
863
864 make_sockaddr(&rem_addr, dlm_config.tcp_port, &addrlen);
865
866 outmessage.msg_name = &rem_addr;
867 outmessage.msg_namelen = addrlen;
868 outmessage.msg_control = outcmsg;
869 outmessage.msg_controllen = sizeof(outcmsg);
870 outmessage.msg_flags = MSG_EOR;
871
872 iov[0].iov_base = buf;
873 iov[0].iov_len = 1;
874
875 /* Real INIT messages seem to cause trouble. Just send a 1 byte message
876 we can afford to lose */
877 cmsg = CMSG_FIRSTHDR(&outmessage);
878 cmsg->cmsg_level = IPPROTO_SCTP;
879 cmsg->cmsg_type = SCTP_SNDRCV;
880 cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
881 sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
882 memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
883 sinfo->sinfo_ppid = cpu_to_le32(dlm_local_nodeid);
884
885 outmessage.msg_controllen = cmsg->cmsg_len;
886 ret = kernel_sendmsg(sctp_con.sock, &outmessage, iov, 1, 1);
887 if (ret < 0) {
888 log_print("send INIT to node failed: %d", ret);
889 /* Try again later */
890 clear_bit(NI_INIT_PENDING, &ni->flags);
891 }
892}
893
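 893
A subtlety worth spelling out: the nodeid rides in the SCTP payload protocol
id. The sender stores cpu_to_le32(dlm_local_nodeid) in sinfo_ppid, and the
receive path applies cpu_to_le32() again before calling
dlm_process_incoming_buffer(). A sketch of why the round trip works
(assuming, as this code does, that all cluster nodes share one endianness):

	/* send:    sinfo->sinfo_ppid = cpu_to_le32(nodeid);
	 * receive: nodeid = cpu_to_le32(sinfo->sinfo_ppid);
	 *
	 * cpu_to_le32() is an involution (identity on little-endian,
	 * byte swap on big-endian), so applying it twice recovers the
	 * original host-order nodeid. */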
894/* Send a message */
895static int send_to_sock(struct nodeinfo *ni)
896{
897 int ret = 0;
898 struct writequeue_entry *e;
899 int len, offset;
900 struct msghdr outmsg;
901 static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
902 struct cmsghdr *cmsg;
903 struct sctp_sndrcvinfo *sinfo;
904 struct kvec iov;
905
906 /* See if we need to init an association before we start
907 sending precious messages */
908 spin_lock(&ni->lock);
909 if (!ni->assoc_id && !test_and_set_bit(NI_INIT_PENDING, &ni->flags)) {
910 spin_unlock(&ni->lock);
911 initiate_association(ni->nodeid);
912 return 0;
913 }
914 spin_unlock(&ni->lock);
915
916 outmsg.msg_name = NULL; /* We use assoc_id */
917 outmsg.msg_namelen = 0;
918 outmsg.msg_control = outcmsg;
919 outmsg.msg_controllen = sizeof(outcmsg);
920 outmsg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL | MSG_EOR;
921
922 cmsg = CMSG_FIRSTHDR(&outmsg);
923 cmsg->cmsg_level = IPPROTO_SCTP;
924 cmsg->cmsg_type = SCTP_SNDRCV;
925 cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
926 sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
927 memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
928 sinfo->sinfo_ppid = cpu_to_le32(dlm_local_nodeid);
929 sinfo->sinfo_assoc_id = ni->assoc_id;
930 outmsg.msg_controllen = cmsg->cmsg_len;
931
932 spin_lock(&ni->writequeue_lock);
933 for (;;) {
934 if (list_empty(&ni->writequeue))
935 break;
936 e = list_entry(ni->writequeue.next, struct writequeue_entry,
937 list);
938 len = e->len;
939 offset = e->offset;
940 BUG_ON(len == 0 && e->users == 0);
941 spin_unlock(&ni->writequeue_lock);
942 kmap(e->page);
943
944 ret = 0;
945 if (len) {
946 iov.iov_base = page_address(e->page)+offset;
947 iov.iov_len = len;
948
949 ret = kernel_sendmsg(sctp_con.sock, &outmsg, &iov, 1,
950 len);
951 if (ret == -EAGAIN) {
952 sctp_con.eagain_flag = 1;
953 goto out;
954 } else if (ret < 0)
955 goto send_error;
956 } else {
957 /* Don't starve people filling buffers */
958 schedule();
959 }
960
961 spin_lock(&ni->writequeue_lock);
962 e->offset += ret;
963 e->len -= ret;
964
965 if (e->len == 0 && e->users == 0) {
966 list_del(&e->list);
967 free_entry(e);
968 continue;
969 }
970 }
971 spin_unlock(&ni->writequeue_lock);
972 out:
973 return ret;
974
975 send_error:
976 log_print("Error sending to node %d %d", ni->nodeid, ret);
977 spin_lock(&ni->lock);
978 if (!test_and_set_bit(NI_INIT_PENDING, &ni->flags)) {
979 ni->assoc_id = 0;
980 spin_unlock(&ni->lock);
981 initiate_association(ni->nodeid);
982 } else
983 spin_unlock(&ni->lock);
984
985 return ret;
986}
987
988/* Try to send any messages that are pending */
989static void process_output_queue(void)
990{
991 struct list_head *list;
992 struct list_head *temp;
993
994 spin_lock_bh(&write_nodes_lock);
995 list_for_each_safe(list, temp, &write_nodes) {
996 struct nodeinfo *ni =
997 list_entry(list, struct nodeinfo, write_list);
998 clear_bit(NI_WRITE_PENDING, &ni->flags);
999 list_del(&ni->write_list);
1000
1001 spin_unlock_bh(&write_nodes_lock);
1002
1003 send_to_sock(ni);
1004 spin_lock_bh(&write_nodes_lock);
1005 }
1006 spin_unlock_bh(&write_nodes_lock);
1007}
1008
1009/* Called after we've had -EAGAIN and been woken up */
1010static void refill_write_queue(void)
1011{
1012 int i;
1013
1014 for (i=1; i<=max_nodeid; i++) {
1015 struct nodeinfo *ni = nodeid2nodeinfo(i, 0);
1016
1017 if (ni) {
1018 if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
1019 spin_lock_bh(&write_nodes_lock);
1020 list_add_tail(&ni->write_list, &write_nodes);
1021 spin_unlock_bh(&write_nodes_lock);
1022 }
1023 }
1024 }
1025}
1026
1027static void clean_one_writequeue(struct nodeinfo *ni)
1028{
1029 struct list_head *list;
1030 struct list_head *temp;
1031
1032 spin_lock(&ni->writequeue_lock);
1033 list_for_each_safe(list, temp, &ni->writequeue) {
1034 struct writequeue_entry *e =
1035 list_entry(list, struct writequeue_entry, list);
1036 list_del(&e->list);
1037 free_entry(e);
1038 }
1039 spin_unlock(&ni->writequeue_lock);
1040}
1041
1042static void clean_writequeues(void)
1043{
1044 int i;
1045
1046 for (i=1; i<=max_nodeid; i++) {
1047 struct nodeinfo *ni = nodeid2nodeinfo(i, 0);
1048 if (ni)
1049 clean_one_writequeue(ni);
1050 }
1051}
1052
1053
1054static void dealloc_nodeinfo(void)
1055{
1056 int i;
1057
1058 for (i=1; i<=max_nodeid; i++) {
1059 struct nodeinfo *ni = nodeid2nodeinfo(i, 0);
1060 if (ni) {
1061 idr_remove(&nodeinfo_idr, i);
1062 kfree(ni);
1063 }
1064 }
1065}
1066
1067int dlm_lowcomms_close(int nodeid)
1068{
1069 struct nodeinfo *ni;
1070
1071 ni = nodeid2nodeinfo(nodeid, 0);
1072 if (!ni)
1073 return -1;
1074
1075 spin_lock(&ni->lock);
1076 if (ni->assoc_id) {
1077 ni->assoc_id = 0;
1078 /* Don't send shutdown here, sctp will just queue it
1079 till the node comes back up! */
1080 }
1081 spin_unlock(&ni->lock);
1082
1083 clean_one_writequeue(ni);
1084 clear_bit(NI_INIT_PENDING, &ni->flags);
1085 return 0;
1086}
1087
1088static int write_list_empty(void)
1089{
1090 int status;
1091
1092 spin_lock_bh(&write_nodes_lock);
1093 status = list_empty(&write_nodes);
1094 spin_unlock_bh(&write_nodes_lock);
1095
1096 return status;
1097}
1098
1099static int dlm_recvd(void *data)
1100{
1101 DECLARE_WAITQUEUE(wait, current);
1102
1103 while (!kthread_should_stop()) {
1104 int count = 0;
1105
1106 set_current_state(TASK_INTERRUPTIBLE);
1107 add_wait_queue(&lowcomms_recv_wait, &wait);
1108 if (!test_bit(CF_READ_PENDING, &sctp_con.flags))
1109 schedule();
1110 remove_wait_queue(&lowcomms_recv_wait, &wait);
1111 set_current_state(TASK_RUNNING);
1112
1113 if (test_and_clear_bit(CF_READ_PENDING, &sctp_con.flags)) {
1114 int ret;
1115
1116 do {
1117 ret = receive_from_sock();
1118
1119 /* Don't starve out everyone else */
1120 if (++count >= MAX_RX_MSG_COUNT) {
1121 schedule();
1122 count = 0;
1123 }
1124			} while (!kthread_should_stop() && ret >= 0);
1125 }
1126 schedule();
1127 }
1128
1129 return 0;
1130}
1131
1132static int dlm_sendd(void *data)
1133{
1134 DECLARE_WAITQUEUE(wait, current);
1135
1136 add_wait_queue(sctp_con.sock->sk->sk_sleep, &wait);
1137
1138 while (!kthread_should_stop()) {
1139 set_current_state(TASK_INTERRUPTIBLE);
1140 if (write_list_empty())
1141 schedule();
1142 set_current_state(TASK_RUNNING);
1143
1144 if (sctp_con.eagain_flag) {
1145 sctp_con.eagain_flag = 0;
1146 refill_write_queue();
1147 }
1148 process_output_queue();
1149 }
1150
1151 remove_wait_queue(sctp_con.sock->sk->sk_sleep, &wait);
1152
1153 return 0;
1154}
1155
1156static void daemons_stop(void)
1157{
1158 kthread_stop(recv_task);
1159 kthread_stop(send_task);
1160}
1161
1162static int daemons_start(void)
1163{
1164 struct task_struct *p;
1165 int error;
1166
1167 p = kthread_run(dlm_recvd, NULL, "dlm_recvd");
1168	error = PTR_ERR(p);
1169	if (IS_ERR(p)) {
1170 log_print("can't start dlm_recvd %d", error);
1171 return error;
1172 }
1173 recv_task = p;
1174
1175 p = kthread_run(dlm_sendd, NULL, "dlm_sendd");
1176	error = PTR_ERR(p);
1177	if (IS_ERR(p)) {
1178 log_print("can't start dlm_sendd %d", error);
1179 kthread_stop(recv_task);
1180 return error;
1181 }
1182 send_task = p;
1183
1184 return 0;
1185}
1186
1187/*
1188 * This is quite likely to sleep...
1189 */
1190int dlm_lowcomms_start(void)
1191{
1192 int error;
1193
1194 error = init_sock();
1195 if (error)
1196 goto fail_sock;
1197 error = daemons_start();
1198 if (error)
1199 goto fail_sock;
1200 atomic_set(&accepting, 1);
1201 return 0;
1202
1203 fail_sock:
1204 close_connection();
1205 return error;
1206}
1207
1208/* Set all the activity flags to prevent any socket activity. */
1209
1210void dlm_lowcomms_stop(void)
1211{
1212 atomic_set(&accepting, 0);
1213 sctp_con.flags = 0x7;
1214 daemons_stop();
1215 clean_writequeues();
1216 close_connection();
1217 dealloc_nodeinfo();
1218 max_nodeid = 0;
1219}
1220
1221int dlm_lowcomms_init(void)
1222{
1223 init_waitqueue_head(&lowcomms_recv_wait);
1224 spin_lock_init(&write_nodes_lock);
1225 INIT_LIST_HEAD(&write_nodes);
1226 init_rwsem(&nodeinfo_lock);
1227 return 0;
1228}
1229
1230void dlm_lowcomms_exit(void)
1231{
1232 int i;
1233
1234 for (i = 0; i < dlm_local_count; i++)
1235 kfree(dlm_local_addr[i]);
1236 dlm_local_count = 0;
1237 dlm_local_nodeid = 0;
1238}
1239
diff --git a/fs/dlm/lowcomms.h b/fs/dlm/lowcomms.h
new file mode 100644
index 000000000000..2d045e0daae1
--- /dev/null
+++ b/fs/dlm/lowcomms.h
@@ -0,0 +1,26 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __LOWCOMMS_DOT_H__
15#define __LOWCOMMS_DOT_H__
16
17int dlm_lowcomms_init(void);
18void dlm_lowcomms_exit(void);
19int dlm_lowcomms_start(void);
20void dlm_lowcomms_stop(void);
21int dlm_lowcomms_close(int nodeid);
22void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc);
23void dlm_lowcomms_commit_buffer(void *mh);
24
25#endif /* __LOWCOMMS_DOT_H__ */
26
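Taken together, the interface separates module lifetime from cluster
lifetime; a hedged usage sketch (the real callers live elsewhere in the
dlm tree):

	dlm_lowcomms_init();	/* module load: lists, locks, waitqueue */
	dlm_lowcomms_start();	/* join: bind SCTP socket, start daemons */
	/* ... dlm_lowcomms_get_buffer()/commit_buffer() traffic ... */
	dlm_lowcomms_stop();	/* leave: stop daemons, drop queued writes */
	dlm_lowcomms_exit();	/* module unload: free cached addresses */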
diff --git a/fs/dlm/lvb_table.h b/fs/dlm/lvb_table.h
new file mode 100644
index 000000000000..cc3e92f3feef
--- /dev/null
+++ b/fs/dlm/lvb_table.h
@@ -0,0 +1,18 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#ifndef __LVB_TABLE_DOT_H__
14#define __LVB_TABLE_DOT_H__
15
16extern const int dlm_lvb_operations[8][8];
17
18#endif
diff --git a/fs/dlm/main.c b/fs/dlm/main.c
new file mode 100644
index 000000000000..a8da8dc36b2e
--- /dev/null
+++ b/fs/dlm/main.c
@@ -0,0 +1,97 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#include "dlm_internal.h"
15#include "lockspace.h"
16#include "lock.h"
17#include "user.h"
18#include "memory.h"
19#include "lowcomms.h"
20#include "config.h"
21
22#ifdef CONFIG_DLM_DEBUG
23int dlm_register_debugfs(void);
24void dlm_unregister_debugfs(void);
25#else
26static inline int dlm_register_debugfs(void) { return 0; }
27static inline void dlm_unregister_debugfs(void) { }
28#endif
29
30static int __init init_dlm(void)
31{
32 int error;
33
34 error = dlm_memory_init();
35 if (error)
36 goto out;
37
38 error = dlm_lockspace_init();
39 if (error)
40 goto out_mem;
41
42 error = dlm_config_init();
43 if (error)
44 goto out_lockspace;
45
46 error = dlm_register_debugfs();
47 if (error)
48 goto out_config;
49
50 error = dlm_lowcomms_init();
51 if (error)
52 goto out_debug;
53
54 error = dlm_user_init();
55 if (error)
56 goto out_lowcomms;
57
58 printk("DLM (built %s %s) installed\n", __DATE__, __TIME__);
59
60 return 0;
61
62 out_lowcomms:
63 dlm_lowcomms_exit();
64 out_debug:
65 dlm_unregister_debugfs();
66 out_config:
67 dlm_config_exit();
68 out_lockspace:
69 dlm_lockspace_exit();
70 out_mem:
71 dlm_memory_exit();
72 out:
73 return error;
74}
75
76static void __exit exit_dlm(void)
77{
78 dlm_user_exit();
79 dlm_lowcomms_exit();
80 dlm_config_exit();
81 dlm_memory_exit();
82 dlm_lockspace_exit();
83 dlm_unregister_debugfs();
84}
85
86module_init(init_dlm);
87module_exit(exit_dlm);
88
89MODULE_DESCRIPTION("Distributed Lock Manager");
90MODULE_AUTHOR("Red Hat, Inc.");
91MODULE_LICENSE("GPL");
92
93EXPORT_SYMBOL_GPL(dlm_new_lockspace);
94EXPORT_SYMBOL_GPL(dlm_release_lockspace);
95EXPORT_SYMBOL_GPL(dlm_lock);
96EXPORT_SYMBOL_GPL(dlm_unlock);
97
diff --git a/fs/dlm/member.c b/fs/dlm/member.c
new file mode 100644
index 000000000000..a3f7de7f3a8f
--- /dev/null
+++ b/fs/dlm/member.c
@@ -0,0 +1,327 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#include "dlm_internal.h"
14#include "lockspace.h"
15#include "member.h"
16#include "recoverd.h"
17#include "recover.h"
18#include "rcom.h"
19#include "config.h"
20
21/*
22 * Following called by dlm_recoverd thread
23 */
24
25static void add_ordered_member(struct dlm_ls *ls, struct dlm_member *new)
26{
27 struct dlm_member *memb = NULL;
28 struct list_head *tmp;
29 struct list_head *newlist = &new->list;
30 struct list_head *head = &ls->ls_nodes;
31
32 list_for_each(tmp, head) {
33 memb = list_entry(tmp, struct dlm_member, list);
34 if (new->nodeid < memb->nodeid)
35 break;
36 }
37
38 if (!memb)
39 list_add_tail(newlist, head);
40 else {
41 /* FIXME: can use list macro here */
42 newlist->prev = tmp->prev;
43 newlist->next = tmp;
44 tmp->prev->next = newlist;
45 tmp->prev = newlist;
46 }
47}
48
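As the FIXME above notes, the open-coded splice is an insertion before tmp,
which the generic helper already provides: list_add_tail(new, pos) links new
immediately before pos. A sketch of the equivalent tail of
add_ordered_member() (after list_for_each(), tmp is either the first member
with a larger nodeid or the list head itself, so one call covers both
branches):

	list_add_tail(newlist, tmp);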
49static int dlm_add_member(struct dlm_ls *ls, int nodeid)
50{
51 struct dlm_member *memb;
52 int w;
53
 54	w = dlm_node_weight(ls->ls_name, nodeid);
 55	if (w < 0)
 56		return w;
 57
 58	memb = kzalloc(sizeof(struct dlm_member), GFP_KERNEL);
 59	if (!memb)
 60		return -ENOMEM;
61
62 memb->nodeid = nodeid;
63 memb->weight = w;
64 add_ordered_member(ls, memb);
65 ls->ls_num_nodes++;
66 return 0;
67}
68
69static void dlm_remove_member(struct dlm_ls *ls, struct dlm_member *memb)
70{
71 list_move(&memb->list, &ls->ls_nodes_gone);
72 ls->ls_num_nodes--;
73}
74
75static int dlm_is_member(struct dlm_ls *ls, int nodeid)
76{
77 struct dlm_member *memb;
78
79 list_for_each_entry(memb, &ls->ls_nodes, list) {
80 if (memb->nodeid == nodeid)
81 return 1;
82 }
83 return 0;
84}
85
86int dlm_is_removed(struct dlm_ls *ls, int nodeid)
87{
88 struct dlm_member *memb;
89
90 list_for_each_entry(memb, &ls->ls_nodes_gone, list) {
91 if (memb->nodeid == nodeid)
92 return 1;
93 }
94 return 0;
95}
96
97static void clear_memb_list(struct list_head *head)
98{
99 struct dlm_member *memb;
100
101 while (!list_empty(head)) {
102 memb = list_entry(head->next, struct dlm_member, list);
103 list_del(&memb->list);
104 kfree(memb);
105 }
106}
107
108void dlm_clear_members(struct dlm_ls *ls)
109{
110 clear_memb_list(&ls->ls_nodes);
111 ls->ls_num_nodes = 0;
112}
113
114void dlm_clear_members_gone(struct dlm_ls *ls)
115{
116 clear_memb_list(&ls->ls_nodes_gone);
117}
118
119static void make_member_array(struct dlm_ls *ls)
120{
121 struct dlm_member *memb;
122 int i, w, x = 0, total = 0, all_zero = 0, *array;
123
124 kfree(ls->ls_node_array);
125 ls->ls_node_array = NULL;
126
127 list_for_each_entry(memb, &ls->ls_nodes, list) {
128 if (memb->weight)
129 total += memb->weight;
130 }
131
132 /* all nodes revert to weight of 1 if all have weight 0 */
133
134 if (!total) {
135 total = ls->ls_num_nodes;
136 all_zero = 1;
137 }
138
139 ls->ls_total_weight = total;
140
141 array = kmalloc(sizeof(int) * total, GFP_KERNEL);
142 if (!array)
143 return;
144
145 list_for_each_entry(memb, &ls->ls_nodes, list) {
146 if (!all_zero && !memb->weight)
147 continue;
148
149 if (all_zero)
150 w = 1;
151 else
152 w = memb->weight;
153
154 DLM_ASSERT(x < total, printk("total %d x %d\n", total, x););
155
156 for (i = 0; i < w; i++)
157 array[x++] = memb->nodeid;
158 }
159
160 ls->ls_node_array = array;
161}
162
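A worked example of the flattening, with hypothetical membership: nodes
{1: weight 2, 2: weight 0, 3: weight 1} give ls_total_weight = 3 and
ls_node_array = { 1, 1, 3 }. Zero-weight nodes get no slots, so an index
drawn uniformly over the array picks a node in proportion to its weight;
if every weight were 0, the all_zero fallback would produce { 1, 2, 3 }.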
163/* send a status request to all members just to establish comms connections */
164
165static int ping_members(struct dlm_ls *ls)
166{
167 struct dlm_member *memb;
168 int error = 0;
169
170 list_for_each_entry(memb, &ls->ls_nodes, list) {
171 error = dlm_recovery_stopped(ls);
172 if (error)
173 break;
174 error = dlm_rcom_status(ls, memb->nodeid);
175 if (error)
176 break;
177 }
178 if (error)
179 log_debug(ls, "ping_members aborted %d last nodeid %d",
180 error, ls->ls_recover_nodeid);
181 return error;
182}
183
184int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
185{
186 struct dlm_member *memb, *safe;
187 int i, error, found, pos = 0, neg = 0, low = -1;
188
189 /* move departed members from ls_nodes to ls_nodes_gone */
190
191 list_for_each_entry_safe(memb, safe, &ls->ls_nodes, list) {
192 found = 0;
193 for (i = 0; i < rv->node_count; i++) {
194 if (memb->nodeid == rv->nodeids[i]) {
195 found = 1;
196 break;
197 }
198 }
199
200 if (!found) {
201 neg++;
202 dlm_remove_member(ls, memb);
203 log_debug(ls, "remove member %d", memb->nodeid);
204 }
205 }
206
207 /* add new members to ls_nodes */
208
209 for (i = 0; i < rv->node_count; i++) {
210 if (dlm_is_member(ls, rv->nodeids[i]))
211 continue;
212 dlm_add_member(ls, rv->nodeids[i]);
213 pos++;
214 log_debug(ls, "add member %d", rv->nodeids[i]);
215 }
216
217 list_for_each_entry(memb, &ls->ls_nodes, list) {
218 if (low == -1 || memb->nodeid < low)
219 low = memb->nodeid;
220 }
221 ls->ls_low_nodeid = low;
222
223 make_member_array(ls);
224 dlm_set_recover_status(ls, DLM_RS_NODES);
225 *neg_out = neg;
226
227 error = ping_members(ls);
228 if (error)
229 goto out;
230
231 error = dlm_recover_members_wait(ls);
232 out:
233 log_debug(ls, "total members %d error %d", ls->ls_num_nodes, error);
234 return error;
235}
236
237/*
238 * Following called from lockspace.c
239 */
240
241int dlm_ls_stop(struct dlm_ls *ls)
242{
243 int new;
244
245 /*
246 * A stop cancels any recovery that's in progress (see RECOVERY_STOP,
247 * dlm_recovery_stopped()) and prevents any new locks from being
248 * processed (see RUNNING, dlm_locking_stopped()).
249 */
250
251 spin_lock(&ls->ls_recover_lock);
252 set_bit(LSFL_RECOVERY_STOP, &ls->ls_flags);
253 new = test_and_clear_bit(LSFL_RUNNING, &ls->ls_flags);
254 ls->ls_recover_seq++;
255 spin_unlock(&ls->ls_recover_lock);
256
257 /*
258 * This in_recovery lock does two things:
259 *
260 * 1) Keeps this function from returning until all threads are out
261	 * of locking routines and locking is truly stopped.
262 * 2) Keeps any new requests from being processed until it's unlocked
263 * when recovery is complete.
264 */
265
266 if (new)
267 down_write(&ls->ls_in_recovery);
268
269 /*
270 * The recoverd suspend/resume makes sure that dlm_recoverd (if
271 * running) has noticed the clearing of RUNNING above and quit
272 * processing the previous recovery. This will be true for all nodes
273 * before any nodes start the new recovery.
274 */
275
276 dlm_recoverd_suspend(ls);
277 ls->ls_recover_status = 0;
278 dlm_recoverd_resume(ls);
279 return 0;
280}
281
282int dlm_ls_start(struct dlm_ls *ls)
283{
284 struct dlm_recover *rv = NULL, *rv_old;
285 int *ids = NULL;
286 int error, count;
287
288 rv = kzalloc(sizeof(struct dlm_recover), GFP_KERNEL);
289 if (!rv)
290 return -ENOMEM;
291
292 error = count = dlm_nodeid_list(ls->ls_name, &ids);
293 if (error <= 0)
294 goto fail;
295
296 spin_lock(&ls->ls_recover_lock);
297
298 /* the lockspace needs to be stopped before it can be started */
299
300 if (!dlm_locking_stopped(ls)) {
301 spin_unlock(&ls->ls_recover_lock);
302 log_error(ls, "start ignored: lockspace running");
303 error = -EINVAL;
304 goto fail;
305 }
306
307 rv->nodeids = ids;
308 rv->node_count = count;
309 rv->seq = ++ls->ls_recover_seq;
310 rv_old = ls->ls_recover_args;
311 ls->ls_recover_args = rv;
312 spin_unlock(&ls->ls_recover_lock);
313
314 if (rv_old) {
315 kfree(rv_old->nodeids);
316 kfree(rv_old);
317 }
318
319 dlm_recoverd_kick(ls);
320 return 0;
321
322 fail:
323 kfree(rv);
324 kfree(ids);
325 return error;
326}
327
diff --git a/fs/dlm/member.h b/fs/dlm/member.h
new file mode 100644
index 000000000000..927c08c19214
--- /dev/null
+++ b/fs/dlm/member.h
@@ -0,0 +1,24 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#ifndef __MEMBER_DOT_H__
14#define __MEMBER_DOT_H__
15
16int dlm_ls_stop(struct dlm_ls *ls);
17int dlm_ls_start(struct dlm_ls *ls);
18void dlm_clear_members(struct dlm_ls *ls);
19void dlm_clear_members_gone(struct dlm_ls *ls);
20int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv,int *neg_out);
21int dlm_is_removed(struct dlm_ls *ls, int nodeid);
22
23#endif /* __MEMBER_DOT_H__ */
24
diff --git a/fs/dlm/memory.c b/fs/dlm/memory.c
new file mode 100644
index 000000000000..989b608fd836
--- /dev/null
+++ b/fs/dlm/memory.c
@@ -0,0 +1,116 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#include "dlm_internal.h"
15#include "config.h"
16#include "memory.h"
17
18static kmem_cache_t *lkb_cache;
19
20
21int dlm_memory_init(void)
22{
23 int ret = 0;
24
25 lkb_cache = kmem_cache_create("dlm_lkb", sizeof(struct dlm_lkb),
26 __alignof__(struct dlm_lkb), 0, NULL, NULL);
27 if (!lkb_cache)
28 ret = -ENOMEM;
29 return ret;
30}
31
32void dlm_memory_exit(void)
33{
34 if (lkb_cache)
35 kmem_cache_destroy(lkb_cache);
36}
37
38char *allocate_lvb(struct dlm_ls *ls)
39{
40 char *p;
41
42 p = kmalloc(ls->ls_lvblen, GFP_KERNEL);
43 if (p)
44 memset(p, 0, ls->ls_lvblen);
45 return p;
46}
47
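The kmalloc()+memset() pairs in this file predate routine use of kzalloc();
a behaviour-preserving sketch of the same allocation:

	char *allocate_lvb(struct dlm_ls *ls)
	{
		/* kzalloc() returns zeroed memory, folding the memset() away */
		return kzalloc(ls->ls_lvblen, GFP_KERNEL);
	}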
48void free_lvb(char *p)
49{
50 kfree(p);
51}
52
53/* FIXME: have some minimal space built-in to rsb for the name and
54 kmalloc a separate name if needed, like dentries are done */
55
56struct dlm_rsb *allocate_rsb(struct dlm_ls *ls, int namelen)
57{
58 struct dlm_rsb *r;
59
60 DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,);
61
62 r = kmalloc(sizeof(*r) + namelen, GFP_KERNEL);
63 if (r)
64 memset(r, 0, sizeof(*r) + namelen);
65 return r;
66}
67
68void free_rsb(struct dlm_rsb *r)
69{
70 if (r->res_lvbptr)
71 free_lvb(r->res_lvbptr);
72 kfree(r);
73}
74
75struct dlm_lkb *allocate_lkb(struct dlm_ls *ls)
76{
77 struct dlm_lkb *lkb;
78
79 lkb = kmem_cache_alloc(lkb_cache, GFP_KERNEL);
80 if (lkb)
81 memset(lkb, 0, sizeof(*lkb));
82 return lkb;
83}
84
85void free_lkb(struct dlm_lkb *lkb)
86{
87 if (lkb->lkb_flags & DLM_IFL_USER) {
88 struct dlm_user_args *ua;
89 ua = (struct dlm_user_args *)lkb->lkb_astparam;
90 if (ua) {
91 if (ua->lksb.sb_lvbptr)
92 kfree(ua->lksb.sb_lvbptr);
93 kfree(ua);
94 }
95 }
96 kmem_cache_free(lkb_cache, lkb);
97}
98
99struct dlm_direntry *allocate_direntry(struct dlm_ls *ls, int namelen)
100{
101 struct dlm_direntry *de;
102
103 DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,
104 printk("namelen = %d\n", namelen););
105
106 de = kmalloc(sizeof(*de) + namelen, GFP_KERNEL);
107 if (de)
108 memset(de, 0, sizeof(*de) + namelen);
109 return de;
110}
111
112void free_direntry(struct dlm_direntry *de)
113{
114 kfree(de);
115}
116
diff --git a/fs/dlm/memory.h b/fs/dlm/memory.h
new file mode 100644
index 000000000000..6ead158ccc5c
--- /dev/null
+++ b/fs/dlm/memory.h
@@ -0,0 +1,29 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __MEMORY_DOT_H__
15#define __MEMORY_DOT_H__
16
17int dlm_memory_init(void);
18void dlm_memory_exit(void);
19struct dlm_rsb *allocate_rsb(struct dlm_ls *ls, int namelen);
20void free_rsb(struct dlm_rsb *r);
21struct dlm_lkb *allocate_lkb(struct dlm_ls *ls);
22void free_lkb(struct dlm_lkb *l);
23struct dlm_direntry *allocate_direntry(struct dlm_ls *ls, int namelen);
24void free_direntry(struct dlm_direntry *de);
25char *allocate_lvb(struct dlm_ls *ls);
26void free_lvb(char *l);
27
28#endif /* __MEMORY_DOT_H__ */
29
diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c
new file mode 100644
index 000000000000..c9b1c3d535f4
--- /dev/null
+++ b/fs/dlm/midcomms.c
@@ -0,0 +1,140 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14/*
15 * midcomms.c
16 *
17 * This is the appallingly named "mid-level" comms layer.
18 *
19 * Its purpose is to take buffers from the "real" comms layer,
20 * split them up into complete messages and pass them to the interested
21 * part of the locking mechanism.
22 *
23 * It also takes messages from the locking layer, formats them
24 * into packets and sends them to the comms layer.
25 */
26
27#include "dlm_internal.h"
28#include "lowcomms.h"
29#include "config.h"
30#include "rcom.h"
31#include "lock.h"
32#include "midcomms.h"
33
34
35static void copy_from_cb(void *dst, const void *base, unsigned offset,
36 unsigned len, unsigned limit)
37{
38 unsigned copy = len;
39
40 if ((copy + offset) > limit)
41 copy = limit - offset;
42 memcpy(dst, base + offset, copy);
43 len -= copy;
44 if (len)
45 memcpy(dst + copy, base, len);
46}
47
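copy_from_cb() reads len bytes starting at offset from a circular buffer of
size limit, wrapping past the end. A small worked example with hypothetical
values (ring_base is an assumed 8-byte ring):

	/* limit = 8, offset = 6, len = 4: the first memcpy() copies bytes
	 * [6..7] (copy is clamped to 2), the second copies the wrapped
	 * remainder [0..1], reassembling a message that straddled the end. */
	char msg[4];
	copy_from_cb(msg, ring_base, 6, sizeof(msg), 8);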
48/*
49 * Called from the low-level comms layer to process a buffer of
50 * commands.
51 *
52 * Only complete messages are processed here; any "spare" bytes from
53 * the end of a buffer are saved and tacked onto the front of the next
54 * message that comes in. I doubt this will happen very often but we
55 * need to be able to cope with it and I don't want the task to be waiting
56 * for packets to come in when there is useful work to be done.
57 */
58
59int dlm_process_incoming_buffer(int nodeid, const void *base,
60 unsigned offset, unsigned len, unsigned limit)
61{
62 unsigned char __tmp[DLM_INBUF_LEN];
63 struct dlm_header *msg = (struct dlm_header *) __tmp;
64 int ret = 0;
65 int err = 0;
66 uint16_t msglen;
67 uint32_t lockspace;
68
69 while (len > sizeof(struct dlm_header)) {
70
71 /* Copy just the header to check the total length. The
72 message may wrap around the end of the buffer back to the
73 start, so we need to use a temp buffer and copy_from_cb. */
74
75 copy_from_cb(msg, base, offset, sizeof(struct dlm_header),
76 limit);
77
78 msglen = le16_to_cpu(msg->h_length);
79 lockspace = msg->h_lockspace;
80
81 err = -EINVAL;
82 if (msglen < sizeof(struct dlm_header))
83 break;
84 err = -E2BIG;
85 if (msglen > dlm_config.buffer_size) {
86 log_print("message size %d from %d too big, buf len %d",
87 msglen, nodeid, len);
88 break;
89 }
90 err = 0;
91
92 /* If only part of the full message is contained in this
93 buffer, then do nothing and wait for lowcomms to call
94 us again later with more data. We return 0 meaning
95 we've consumed none of the input buffer. */
96
97 if (msglen > len)
98 break;
99
100 /* Allocate a larger temp buffer if the full message won't fit
101 in the buffer on the stack (which should work for most
102 ordinary messages). */
103
104 if (msglen > sizeof(__tmp) &&
105 msg == (struct dlm_header *) __tmp) {
106 msg = kmalloc(dlm_config.buffer_size, GFP_KERNEL);
107 if (msg == NULL)
108 return ret;
109 }
110
111 copy_from_cb(msg, base, offset, msglen, limit);
112
113 BUG_ON(lockspace != msg->h_lockspace);
114
115 ret += msglen;
116 offset += msglen;
117 offset &= (limit - 1);
118 len -= msglen;
119
120 switch (msg->h_cmd) {
121 case DLM_MSG:
122 dlm_receive_message(msg, nodeid, 0);
123 break;
124
125 case DLM_RCOM:
126 dlm_receive_rcom(msg, nodeid);
127 break;
128
129 default:
130 log_print("unknown msg type %x from %u: %u %u %u %u",
131 msg->h_cmd, nodeid, msglen, len, offset, ret);
132 }
133 }
134
135 if (msg != (struct dlm_header *) __tmp)
136 kfree(msg);
137
138 return err ? err : ret;
139}
140
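One implicit contract here: the consume step wraps offset with a mask
("offset &= (limit - 1)"), which is only a modulo when limit is a power of
two; PAGE_CACHE_SIZE, as passed in by the lowcomms receive path, satisfies
that. A defensive restatement a paranoid caller could add:

	BUG_ON(limit & (limit - 1));	/* require power-of-two ring size */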
diff --git a/fs/dlm/midcomms.h b/fs/dlm/midcomms.h
new file mode 100644
index 000000000000..95852a5f111d
--- /dev/null
+++ b/fs/dlm/midcomms.h
@@ -0,0 +1,21 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __MIDCOMMS_DOT_H__
15#define __MIDCOMMS_DOT_H__
16
17int dlm_process_incoming_buffer(int nodeid, const void *base, unsigned offset,
18 unsigned len, unsigned limit);
19
20#endif /* __MIDCOMMS_DOT_H__ */
21
diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c
new file mode 100644
index 000000000000..518239a8b1e9
--- /dev/null
+++ b/fs/dlm/rcom.c
@@ -0,0 +1,472 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#include "dlm_internal.h"
15#include "lockspace.h"
16#include "member.h"
17#include "lowcomms.h"
18#include "midcomms.h"
19#include "rcom.h"
20#include "recover.h"
21#include "dir.h"
22#include "config.h"
23#include "memory.h"
24#include "lock.h"
25#include "util.h"
26
27
28static int rcom_response(struct dlm_ls *ls)
29{
30 return test_bit(LSFL_RCOM_READY, &ls->ls_flags);
31}
32
33static int create_rcom(struct dlm_ls *ls, int to_nodeid, int type, int len,
34 struct dlm_rcom **rc_ret, struct dlm_mhandle **mh_ret)
35{
36 struct dlm_rcom *rc;
37 struct dlm_mhandle *mh;
38 char *mb;
39 int mb_len = sizeof(struct dlm_rcom) + len;
40
41 mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, GFP_KERNEL, &mb);
42 if (!mh) {
43 log_print("create_rcom to %d type %d len %d ENOBUFS",
44 to_nodeid, type, len);
45 return -ENOBUFS;
46 }
47 memset(mb, 0, mb_len);
48
49 rc = (struct dlm_rcom *) mb;
50
51 rc->rc_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
52 rc->rc_header.h_lockspace = ls->ls_global_id;
53 rc->rc_header.h_nodeid = dlm_our_nodeid();
54 rc->rc_header.h_length = mb_len;
55 rc->rc_header.h_cmd = DLM_RCOM;
56
57 rc->rc_type = type;
58
59 *mh_ret = mh;
60 *rc_ret = rc;
61 return 0;
62}
63
64static void send_rcom(struct dlm_ls *ls, struct dlm_mhandle *mh,
65 struct dlm_rcom *rc)
66{
67 dlm_rcom_out(rc);
68 dlm_lowcomms_commit_buffer(mh);
69}
70
71/* When replying to a status request, a node also sends back its
72 configuration values. The requesting node then checks that the remote
73 node is configured the same way as itself. */
74
75static void make_config(struct dlm_ls *ls, struct rcom_config *rf)
76{
77 rf->rf_lvblen = ls->ls_lvblen;
78 rf->rf_lsflags = ls->ls_exflags;
79}
80
81static int check_config(struct dlm_ls *ls, struct rcom_config *rf, int nodeid)
82{
83 if (rf->rf_lvblen != ls->ls_lvblen ||
84 rf->rf_lsflags != ls->ls_exflags) {
85 log_error(ls, "config mismatch: %d,%x nodeid %d: %d,%x",
86 ls->ls_lvblen, ls->ls_exflags,
87 nodeid, rf->rf_lvblen, rf->rf_lsflags);
88 return -EINVAL;
89 }
90 return 0;
91}
92
93int dlm_rcom_status(struct dlm_ls *ls, int nodeid)
94{
95 struct dlm_rcom *rc;
96 struct dlm_mhandle *mh;
97 int error = 0;
98
99 memset(ls->ls_recover_buf, 0, dlm_config.buffer_size);
100 ls->ls_recover_nodeid = nodeid;
101
102 if (nodeid == dlm_our_nodeid()) {
103 rc = (struct dlm_rcom *) ls->ls_recover_buf;
104 rc->rc_result = dlm_recover_status(ls);
105 goto out;
106 }
107
108 error = create_rcom(ls, nodeid, DLM_RCOM_STATUS, 0, &rc, &mh);
109 if (error)
110 goto out;
111 rc->rc_id = ++ls->ls_rcom_seq;
112
113 send_rcom(ls, mh, rc);
114
115 error = dlm_wait_function(ls, &rcom_response);
116 clear_bit(LSFL_RCOM_READY, &ls->ls_flags);
117 if (error)
118 goto out;
119
120 rc = (struct dlm_rcom *) ls->ls_recover_buf;
121
122 if (rc->rc_result == -ESRCH) {
123 /* we pretend the remote lockspace exists with 0 status */
124 log_debug(ls, "remote node %d not ready", nodeid);
125 rc->rc_result = 0;
126 } else
127 error = check_config(ls, (struct rcom_config *) rc->rc_buf,
128 nodeid);
129 /* the caller looks at rc_result for the remote recovery status */
130 out:
131 return error;
132}
133
134static void receive_rcom_status(struct dlm_ls *ls, struct dlm_rcom *rc_in)
135{
136 struct dlm_rcom *rc;
137 struct dlm_mhandle *mh;
138 int error, nodeid = rc_in->rc_header.h_nodeid;
139
140 error = create_rcom(ls, nodeid, DLM_RCOM_STATUS_REPLY,
141 sizeof(struct rcom_config), &rc, &mh);
142 if (error)
143 return;
144 rc->rc_id = rc_in->rc_id;
145 rc->rc_result = dlm_recover_status(ls);
146 make_config(ls, (struct rcom_config *) rc->rc_buf);
147
148 send_rcom(ls, mh, rc);
149}
150
151static void receive_sync_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
152{
153 if (rc_in->rc_id != ls->ls_rcom_seq) {
154 log_debug(ls, "reject old reply %d got %llx wanted %llx",
155 rc_in->rc_type, rc_in->rc_id, ls->ls_rcom_seq);
156 return;
157 }
158 memcpy(ls->ls_recover_buf, rc_in, rc_in->rc_header.h_length);
159 set_bit(LSFL_RCOM_READY, &ls->ls_flags);
160 wake_up(&ls->ls_wait_general);
161}
162
163static void receive_rcom_status_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
164{
165 receive_sync_reply(ls, rc_in);
166}
167
168int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name, int last_len)
169{
170 struct dlm_rcom *rc;
171 struct dlm_mhandle *mh;
172 int error = 0, len = sizeof(struct dlm_rcom);
173
174 memset(ls->ls_recover_buf, 0, dlm_config.buffer_size);
175 ls->ls_recover_nodeid = nodeid;
176
177 if (nodeid == dlm_our_nodeid()) {
178 dlm_copy_master_names(ls, last_name, last_len,
179 ls->ls_recover_buf + len,
180 dlm_config.buffer_size - len, nodeid);
181 goto out;
182 }
183
184 error = create_rcom(ls, nodeid, DLM_RCOM_NAMES, last_len, &rc, &mh);
185 if (error)
186 goto out;
187 memcpy(rc->rc_buf, last_name, last_len);
188 rc->rc_id = ++ls->ls_rcom_seq;
189
190 send_rcom(ls, mh, rc);
191
192 error = dlm_wait_function(ls, &rcom_response);
193 clear_bit(LSFL_RCOM_READY, &ls->ls_flags);
194 out:
195 return error;
196}
197
198static void receive_rcom_names(struct dlm_ls *ls, struct dlm_rcom *rc_in)
199{
200 struct dlm_rcom *rc;
201 struct dlm_mhandle *mh;
202 int error, inlen, outlen;
203 int nodeid = rc_in->rc_header.h_nodeid;
204 uint32_t status = dlm_recover_status(ls);
205
206 /*
207 * We can't run dlm_copy_master_names (which uses ls_nodes) while
208 * dlm_recoverd is running dlm_recover_members (which changes ls_nodes).
209 * It could only happen in rare cases where we get a late NAMES
210 * message from a previous instance of recovery.
211 */
212
213 if (!(status & DLM_RS_NODES)) {
214 log_debug(ls, "ignoring RCOM_NAMES from %u", nodeid);
215 return;
216 }
217
218 nodeid = rc_in->rc_header.h_nodeid;
219 inlen = rc_in->rc_header.h_length - sizeof(struct dlm_rcom);
220 outlen = dlm_config.buffer_size - sizeof(struct dlm_rcom);
221
222 error = create_rcom(ls, nodeid, DLM_RCOM_NAMES_REPLY, outlen, &rc, &mh);
223 if (error)
224 return;
225 rc->rc_id = rc_in->rc_id;
226
227 dlm_copy_master_names(ls, rc_in->rc_buf, inlen, rc->rc_buf, outlen,
228 nodeid);
229 send_rcom(ls, mh, rc);
230}
231
232static void receive_rcom_names_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
233{
234 receive_sync_reply(ls, rc_in);
235}
236
237int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid)
238{
239 struct dlm_rcom *rc;
240 struct dlm_mhandle *mh;
241 struct dlm_ls *ls = r->res_ls;
242 int error;
243
244 error = create_rcom(ls, dir_nodeid, DLM_RCOM_LOOKUP, r->res_length,
245 &rc, &mh);
246 if (error)
247 goto out;
248 memcpy(rc->rc_buf, r->res_name, r->res_length);
249 rc->rc_id = (unsigned long) r;
250
251 send_rcom(ls, mh, rc);
252 out:
253 return error;
254}
255
256static void receive_rcom_lookup(struct dlm_ls *ls, struct dlm_rcom *rc_in)
257{
258 struct dlm_rcom *rc;
259 struct dlm_mhandle *mh;
260 int error, ret_nodeid, nodeid = rc_in->rc_header.h_nodeid;
261 int len = rc_in->rc_header.h_length - sizeof(struct dlm_rcom);
262
263 error = create_rcom(ls, nodeid, DLM_RCOM_LOOKUP_REPLY, 0, &rc, &mh);
264 if (error)
265 return;
266
267 error = dlm_dir_lookup(ls, nodeid, rc_in->rc_buf, len, &ret_nodeid);
268 if (error)
269 ret_nodeid = error;
270 rc->rc_result = ret_nodeid;
271 rc->rc_id = rc_in->rc_id;
272
273 send_rcom(ls, mh, rc);
274}
275
276static void receive_rcom_lookup_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
277{
278 dlm_recover_master_reply(ls, rc_in);
279}
280
281static void pack_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb,
282 struct rcom_lock *rl)
283{
284 memset(rl, 0, sizeof(*rl));
285
286 rl->rl_ownpid = lkb->lkb_ownpid;
287 rl->rl_lkid = lkb->lkb_id;
288 rl->rl_exflags = lkb->lkb_exflags;
289 rl->rl_flags = lkb->lkb_flags;
290 rl->rl_lvbseq = lkb->lkb_lvbseq;
291 rl->rl_rqmode = lkb->lkb_rqmode;
292 rl->rl_grmode = lkb->lkb_grmode;
293 rl->rl_status = lkb->lkb_status;
294 rl->rl_wait_type = lkb->lkb_wait_type;
295
296 if (lkb->lkb_bastaddr)
297 rl->rl_asts |= AST_BAST;
298 if (lkb->lkb_astaddr)
299 rl->rl_asts |= AST_COMP;
300
301 rl->rl_namelen = r->res_length;
302 memcpy(rl->rl_name, r->res_name, r->res_length);
303
304	/* FIXME: might we have an lvb without DLM_LKF_VALBLK set?
305 If so, receive_rcom_lock_args() won't take this copy. */
306
307 if (lkb->lkb_lvbptr)
308 memcpy(rl->rl_lvb, lkb->lkb_lvbptr, r->res_ls->ls_lvblen);
309}
310
311int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
312{
313 struct dlm_ls *ls = r->res_ls;
314 struct dlm_rcom *rc;
315 struct dlm_mhandle *mh;
316 struct rcom_lock *rl;
317 int error, len = sizeof(struct rcom_lock);
318
319 if (lkb->lkb_lvbptr)
320 len += ls->ls_lvblen;
321
322 error = create_rcom(ls, r->res_nodeid, DLM_RCOM_LOCK, len, &rc, &mh);
323 if (error)
324 goto out;
325
326 rl = (struct rcom_lock *) rc->rc_buf;
327 pack_rcom_lock(r, lkb, rl);
328 rc->rc_id = (unsigned long) r;
329
330 send_rcom(ls, mh, rc);
331 out:
332 return error;
333}
334
335static void receive_rcom_lock(struct dlm_ls *ls, struct dlm_rcom *rc_in)
336{
337 struct dlm_rcom *rc;
338 struct dlm_mhandle *mh;
339 int error, nodeid = rc_in->rc_header.h_nodeid;
340
341 dlm_recover_master_copy(ls, rc_in);
342
343 error = create_rcom(ls, nodeid, DLM_RCOM_LOCK_REPLY,
344 sizeof(struct rcom_lock), &rc, &mh);
345 if (error)
346 return;
347
348 /* We send back the same rcom_lock struct we received, but
349 dlm_recover_master_copy() has filled in rl_remid and rl_result */
350
351 memcpy(rc->rc_buf, rc_in->rc_buf, sizeof(struct rcom_lock));
352 rc->rc_id = rc_in->rc_id;
353
354 send_rcom(ls, mh, rc);
355}
356
357static void receive_rcom_lock_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
358{
359 uint32_t status = dlm_recover_status(ls);
360
361 if (!(status & DLM_RS_DIR)) {
362 log_debug(ls, "ignoring RCOM_LOCK_REPLY from %u",
363 rc_in->rc_header.h_nodeid);
364 return;
365 }
366
367 dlm_recover_process_copy(ls, rc_in);
368}
369
370static int send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in)
371{
372 struct dlm_rcom *rc;
373 struct dlm_mhandle *mh;
374 char *mb;
375 int mb_len = sizeof(struct dlm_rcom);
376
377 mh = dlm_lowcomms_get_buffer(nodeid, mb_len, GFP_KERNEL, &mb);
378 if (!mh)
379 return -ENOBUFS;
380 memset(mb, 0, mb_len);
381
382 rc = (struct dlm_rcom *) mb;
383
384 rc->rc_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
385 rc->rc_header.h_lockspace = rc_in->rc_header.h_lockspace;
386 rc->rc_header.h_nodeid = dlm_our_nodeid();
387 rc->rc_header.h_length = mb_len;
388 rc->rc_header.h_cmd = DLM_RCOM;
389
390 rc->rc_type = DLM_RCOM_STATUS_REPLY;
391 rc->rc_id = rc_in->rc_id;
392 rc->rc_result = -ESRCH;
393
394 dlm_rcom_out(rc);
395 dlm_lowcomms_commit_buffer(mh);
396
397 return 0;
398}
399
400/* Called by dlm_recvd; corresponds to dlm_receive_message() but special
401 recovery-only comms are sent through here. */
402
403void dlm_receive_rcom(struct dlm_header *hd, int nodeid)
404{
405 struct dlm_rcom *rc = (struct dlm_rcom *) hd;
406 struct dlm_ls *ls;
407
408 dlm_rcom_in(rc);
409
410 /* If the lockspace doesn't exist then still send a status message
411 back; it's possible that it just doesn't have its global_id yet. */
412
413 ls = dlm_find_lockspace_global(hd->h_lockspace);
414 if (!ls) {
415 log_print("lockspace %x from %d not found",
416 hd->h_lockspace, nodeid);
417 send_ls_not_ready(nodeid, rc);
418 return;
419 }
420
421 if (dlm_recovery_stopped(ls) && (rc->rc_type != DLM_RCOM_STATUS)) {
422 log_error(ls, "ignoring recovery message %x from %d",
423 rc->rc_type, nodeid);
424 goto out;
425 }
426
427 if (nodeid != rc->rc_header.h_nodeid) {
428 log_error(ls, "bad rcom nodeid %d from %d",
429 rc->rc_header.h_nodeid, nodeid);
430 goto out;
431 }
432
433 switch (rc->rc_type) {
434 case DLM_RCOM_STATUS:
435 receive_rcom_status(ls, rc);
436 break;
437
438 case DLM_RCOM_NAMES:
439 receive_rcom_names(ls, rc);
440 break;
441
442 case DLM_RCOM_LOOKUP:
443 receive_rcom_lookup(ls, rc);
444 break;
445
446 case DLM_RCOM_LOCK:
447 receive_rcom_lock(ls, rc);
448 break;
449
450 case DLM_RCOM_STATUS_REPLY:
451 receive_rcom_status_reply(ls, rc);
452 break;
453
454 case DLM_RCOM_NAMES_REPLY:
455 receive_rcom_names_reply(ls, rc);
456 break;
457
458 case DLM_RCOM_LOOKUP_REPLY:
459 receive_rcom_lookup_reply(ls, rc);
460 break;
461
462 case DLM_RCOM_LOCK_REPLY:
463 receive_rcom_lock_reply(ls, rc);
464 break;
465
466 default:
467 DLM_ASSERT(0, printk("rc_type=%x\n", rc->rc_type););
468 }
469 out:
470 dlm_put_lockspace(ls);
471}
472
diff --git a/fs/dlm/rcom.h b/fs/dlm/rcom.h
new file mode 100644
index 000000000000..d7984321ff41
--- /dev/null
+++ b/fs/dlm/rcom.h
@@ -0,0 +1,24 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __RCOM_DOT_H__
15#define __RCOM_DOT_H__
16
17int dlm_rcom_status(struct dlm_ls *ls, int nodeid);
18int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name,int last_len);
19int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid);
20int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb);
21void dlm_receive_rcom(struct dlm_header *hd, int nodeid);
22
23#endif
24
diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c
new file mode 100644
index 000000000000..a5e6d184872e
--- /dev/null
+++ b/fs/dlm/recover.c
@@ -0,0 +1,765 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#include "dlm_internal.h"
15#include "lockspace.h"
16#include "dir.h"
17#include "config.h"
18#include "ast.h"
19#include "memory.h"
20#include "rcom.h"
21#include "lock.h"
22#include "lowcomms.h"
23#include "member.h"
24#include "recover.h"
25
26
27/*
28 * Recovery waiting routines: these functions wait for a particular reply from
29 * a remote node, or for the remote node to report a certain status. They need
30 * to abort if the lockspace is stopped, indicating a node has failed (perhaps
31 * the one being waited for).
32 */
33
34/*
35 * Wait until given function returns non-zero or lockspace is stopped
36 * (LSFL_RECOVERY_STOP set due to failure of a node in ls_nodes). When another
37 * function thinks it could have completed the waited-on task, it should wake
38 * up ls_wait_general to get an immediate response rather than waiting for the
39 * timer to detect the result. A timer wakes us up periodically while waiting
40 * to see if we should abort due to a node failure. This should only be called
41 * by the dlm_recoverd thread.
42 */
43
44static void dlm_wait_timer_fn(unsigned long data)
45{
46 struct dlm_ls *ls = (struct dlm_ls *) data;
47 mod_timer(&ls->ls_timer, jiffies + (dlm_config.recover_timer * HZ));
48 wake_up(&ls->ls_wait_general);
49}
50
51int dlm_wait_function(struct dlm_ls *ls, int (*testfn) (struct dlm_ls *ls))
52{
53 int error = 0;
54
55 init_timer(&ls->ls_timer);
56 ls->ls_timer.function = dlm_wait_timer_fn;
57 ls->ls_timer.data = (long) ls;
58 ls->ls_timer.expires = jiffies + (dlm_config.recover_timer * HZ);
59 add_timer(&ls->ls_timer);
60
61 wait_event(ls->ls_wait_general, testfn(ls) || dlm_recovery_stopped(ls));
62 del_timer_sync(&ls->ls_timer);
63
64 if (dlm_recovery_stopped(ls)) {
65 log_debug(ls, "dlm_wait_function aborted");
66 error = -EINTR;
67 }
68 return error;
69}
70
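The hand-rolled timer exists only so the wait condition (including the
stopped check) is re-evaluated periodically. A sketch of the same idea using
wait_event_timeout() instead (a semantics sketch, not a drop-in replacement):

	while (!testfn(ls) && !dlm_recovery_stopped(ls))
		wait_event_timeout(ls->ls_wait_general,
				   testfn(ls) || dlm_recovery_stopped(ls),
				   dlm_config.recover_timer * HZ);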
71/*
72 * An efficient way for all nodes to wait for all others to have a certain
73 * status. The node with the lowest nodeid polls all the others for their
74 * status (wait_status_all) and all the others poll the node with the low id
75 * for its accumulated result (wait_status_low). When all nodes have set
76 * status flag X, then status flag X_ALL will be set on the low nodeid.
77 */
78
79uint32_t dlm_recover_status(struct dlm_ls *ls)
80{
81 uint32_t status;
82 spin_lock(&ls->ls_recover_lock);
83 status = ls->ls_recover_status;
84 spin_unlock(&ls->ls_recover_lock);
85 return status;
86}
87
88void dlm_set_recover_status(struct dlm_ls *ls, uint32_t status)
89{
90 spin_lock(&ls->ls_recover_lock);
91 ls->ls_recover_status |= status;
92 spin_unlock(&ls->ls_recover_lock);
93}
94
95static int wait_status_all(struct dlm_ls *ls, uint32_t wait_status)
96{
97 struct dlm_rcom *rc = (struct dlm_rcom *) ls->ls_recover_buf;
98 struct dlm_member *memb;
99 int error = 0, delay;
100
101 list_for_each_entry(memb, &ls->ls_nodes, list) {
102 delay = 0;
103 for (;;) {
104 if (dlm_recovery_stopped(ls)) {
105 error = -EINTR;
106 goto out;
107 }
108
109 error = dlm_rcom_status(ls, memb->nodeid);
110 if (error)
111 goto out;
112
113 if (rc->rc_result & wait_status)
114 break;
115 if (delay < 1000)
116 delay += 20;
117 msleep(delay);
118 }
119 }
120 out:
121 return error;
122}
123
124static int wait_status_low(struct dlm_ls *ls, uint32_t wait_status)
125{
126 struct dlm_rcom *rc = (struct dlm_rcom *) ls->ls_recover_buf;
127 int error = 0, delay = 0, nodeid = ls->ls_low_nodeid;
128
129 for (;;) {
130 if (dlm_recovery_stopped(ls)) {
131 error = -EINTR;
132 goto out;
133 }
134
135 error = dlm_rcom_status(ls, nodeid);
136 if (error)
137 break;
138
139 if (rc->rc_result & wait_status)
140 break;
141 if (delay < 1000)
142 delay += 20;
143 msleep(delay);
144 }
145 out:
146 return error;
147}
148
149static int wait_status(struct dlm_ls *ls, uint32_t status)
150{
151 uint32_t status_all = status << 1;
152 int error;
153
154 if (ls->ls_low_nodeid == dlm_our_nodeid()) {
155 error = wait_status_all(ls, status);
156 if (!error)
157 dlm_set_recover_status(ls, status_all);
158 } else
159 error = wait_status_low(ls, status_all);
160
161 return error;
162}
163
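wait_status() derives the aggregate flag by shifting, which presumes each
*_ALL status bit is defined one position above its base bit (the pairing is
assumed here, not quoted from dlm_internal.h):

	/* DLM_RS_NODES -> DLM_RS_NODES_ALL == DLM_RS_NODES << 1, and
	 * likewise for DIR, LOCKS and DONE. The low nodeid sets X_ALL
	 * once wait_status_all() has seen X on every member; all other
	 * nodes poll the low nodeid (wait_status_low) for X_ALL. */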
164int dlm_recover_members_wait(struct dlm_ls *ls)
165{
166 return wait_status(ls, DLM_RS_NODES);
167}
168
169int dlm_recover_directory_wait(struct dlm_ls *ls)
170{
171 return wait_status(ls, DLM_RS_DIR);
172}
173
174int dlm_recover_locks_wait(struct dlm_ls *ls)
175{
176 return wait_status(ls, DLM_RS_LOCKS);
177}
178
179int dlm_recover_done_wait(struct dlm_ls *ls)
180{
181 return wait_status(ls, DLM_RS_DONE);
182}
183
184/*
185 * The recover_list contains all the rsb's for which we've requested the new
186 * master nodeid. As replies are returned from the resource directories the
187 * rsb's are removed from the list. When the list is empty we're done.
188 *
189 * The recover_list is later similarly used for all rsb's for which we've sent
190 * new lkb's and need to receive new corresponding lkid's.
191 *
192 * We use the address of the rsb struct as a simple local identifier for the
193 * rsb so we can match an rcom reply with the rsb it was sent for.
194 */
195
196static int recover_list_empty(struct dlm_ls *ls)
197{
198 int empty;
199
200 spin_lock(&ls->ls_recover_list_lock);
201 empty = list_empty(&ls->ls_recover_list);
202 spin_unlock(&ls->ls_recover_list_lock);
203
204 return empty;
205}
206
207static void recover_list_add(struct dlm_rsb *r)
208{
209 struct dlm_ls *ls = r->res_ls;
210
211 spin_lock(&ls->ls_recover_list_lock);
212 if (list_empty(&r->res_recover_list)) {
213 list_add_tail(&r->res_recover_list, &ls->ls_recover_list);
214 ls->ls_recover_list_count++;
215 dlm_hold_rsb(r);
216 }
217 spin_unlock(&ls->ls_recover_list_lock);
218}
219
220static void recover_list_del(struct dlm_rsb *r)
221{
222 struct dlm_ls *ls = r->res_ls;
223
224 spin_lock(&ls->ls_recover_list_lock);
225 list_del_init(&r->res_recover_list);
226 ls->ls_recover_list_count--;
227 spin_unlock(&ls->ls_recover_list_lock);
228
229 dlm_put_rsb(r);
230}
231
232static struct dlm_rsb *recover_list_find(struct dlm_ls *ls, uint64_t id)
233{
234 struct dlm_rsb *r = NULL;
235
236 spin_lock(&ls->ls_recover_list_lock);
237
238 list_for_each_entry(r, &ls->ls_recover_list, res_recover_list) {
239 if (id == (unsigned long) r)
240 goto out;
241 }
242 r = NULL;
243 out:
244 spin_unlock(&ls->ls_recover_list_lock);
245 return r;
246}
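
The send side of this pointer-as-cookie scheme is dlm_send_rcom_lookup() in rcom.c, which is not part of this hunk; it presumably stamps rc_id with the rsb's address. A hedged standalone sketch of the round trip:

#include <stdint.h>
#include <stdio.h>

struct rsb { int res_nodeid; };
struct rcom { uint64_t rc_id; };

int main(void)
{
	struct rsb r = { 0 };
	struct rcom rc;

	/* send: the rsb's address becomes an opaque 64-bit cookie */
	rc.rc_id = (unsigned long) &r;

	/* reply: recover the rsb by comparing the cookie against each
	   list entry's address, exactly as recover_list_find() does */
	if (rc.rc_id == (unsigned long) &r)
		printf("reply matched rsb %p\n", (void *) &r);
	return 0;
}
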
247
248static void recover_list_clear(struct dlm_ls *ls)
249{
250 struct dlm_rsb *r, *s;
251
252 spin_lock(&ls->ls_recover_list_lock);
253 list_for_each_entry_safe(r, s, &ls->ls_recover_list, res_recover_list) {
254 list_del_init(&r->res_recover_list);
255 dlm_put_rsb(r);
256 ls->ls_recover_list_count--;
257 }
258
259 if (ls->ls_recover_list_count != 0) {
260 log_error(ls, "warning: recover_list_count %d",
261 ls->ls_recover_list_count);
262 ls->ls_recover_list_count = 0;
263 }
264 spin_unlock(&ls->ls_recover_list_lock);
265}
266
267
268/* Master recovery: find new master node for rsb's that were
269 mastered on nodes that have been removed.
270
271 dlm_recover_masters
272 recover_master
273 dlm_send_rcom_lookup -> receive_rcom_lookup
274 dlm_dir_lookup
275 receive_rcom_lookup_reply <-
276 dlm_recover_master_reply
277 set_new_master
278 set_master_lkbs
279 set_lock_master
280*/
281
282/*
 283 * Set the lock master for all LKBs in a lock queue.
 284 * If we are the new master of the rsb, we may have already received
 285 * new MSTCPY locks from other nodes, which we need to ignore when
 286 * setting the new nodeid.
287 */
288
289static void set_lock_master(struct list_head *queue, int nodeid)
290{
291 struct dlm_lkb *lkb;
292
293 list_for_each_entry(lkb, queue, lkb_statequeue)
294 if (!(lkb->lkb_flags & DLM_IFL_MSTCPY))
295 lkb->lkb_nodeid = nodeid;
296}
297
298static void set_master_lkbs(struct dlm_rsb *r)
299{
300 set_lock_master(&r->res_grantqueue, r->res_nodeid);
301 set_lock_master(&r->res_convertqueue, r->res_nodeid);
302 set_lock_master(&r->res_waitqueue, r->res_nodeid);
303}
304
305/*
 306 * Propagate the new master nodeid to locks.
307 * The NEW_MASTER flag tells dlm_recover_locks() which rsb's to consider.
308 * The NEW_MASTER2 flag tells recover_lvb() and set_locks_purged() which
309 * rsb's to consider.
310 */
311
312static void set_new_master(struct dlm_rsb *r, int nodeid)
313{
314 lock_rsb(r);
315 r->res_nodeid = nodeid;
316 set_master_lkbs(r);
317 rsb_set_flag(r, RSB_NEW_MASTER);
318 rsb_set_flag(r, RSB_NEW_MASTER2);
319 unlock_rsb(r);
320}
321
322/*
323 * We do async lookups on rsb's that need new masters. The rsb's
324 * waiting for a lookup reply are kept on the recover_list.
325 */
326
327static int recover_master(struct dlm_rsb *r)
328{
329 struct dlm_ls *ls = r->res_ls;
330 int error, dir_nodeid, ret_nodeid, our_nodeid = dlm_our_nodeid();
331
332 dir_nodeid = dlm_dir_nodeid(r);
333
334 if (dir_nodeid == our_nodeid) {
335 error = dlm_dir_lookup(ls, our_nodeid, r->res_name,
336 r->res_length, &ret_nodeid);
337 if (error)
338 log_error(ls, "recover dir lookup error %d", error);
339
340 if (ret_nodeid == our_nodeid)
341 ret_nodeid = 0;
342 set_new_master(r, ret_nodeid);
343 } else {
344 recover_list_add(r);
345 error = dlm_send_rcom_lookup(r, dir_nodeid);
346 }
347
348 return error;
349}
350
351/*
352 * When not using a directory, most resource names will hash to a new static
353 * master nodeid and the resource will need to be remastered.
354 */
355
356static int recover_master_static(struct dlm_rsb *r)
357{
358 int master = dlm_dir_nodeid(r);
359
360 if (master == dlm_our_nodeid())
361 master = 0;
362
363 if (r->res_nodeid != master) {
364 if (is_master(r))
365 dlm_purge_mstcpy_locks(r);
366 set_new_master(r, master);
367 return 1;
368 }
369 return 0;
370}
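
dlm_dir_nodeid() is defined elsewhere (dir.c); the idea is that the resource name is hashed and the hash mapped onto the member list, so with no directory the name alone determines the static master. Everything below (the hash function, the node table) is an illustrative assumption, not the kernel implementation:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* toy hash standing in for the kernel's name hash; illustrative only */
static uint32_t toy_hash(const char *name, size_t len)
{
	uint32_t h = 5381;
	while (len--)
		h = h * 33 + (unsigned char) *name++;
	return h;
}

int main(void)
{
	int nodeids[] = { 1, 2, 3 };    /* assumed lockspace members */
	const char *res_name = "example-resource";
	uint32_t hash = toy_hash(res_name, strlen(res_name));

	/* with no directory, the name's hash alone picks the master,
	   so a membership change statically remaps most resources */
	printf("static master: node %d\n", nodeids[hash % 3]);
	return 0;
}
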
371
372/*
 373 * Go through local root resources and, for each rsb whose master has
 374 * departed, get the new master nodeid from the directory. The dir will
375 * assign mastery to the first node to look up the new master. That means
376 * we'll discover in this lookup if we're the new master of any rsb's.
377 *
378 * We fire off all the dir lookup requests individually and asynchronously to
379 * the correct dir node.
380 */
381
382int dlm_recover_masters(struct dlm_ls *ls)
383{
384 struct dlm_rsb *r;
385 int error = 0, count = 0;
386
387 log_debug(ls, "dlm_recover_masters");
388
389 down_read(&ls->ls_root_sem);
390 list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
391 if (dlm_recovery_stopped(ls)) {
392 up_read(&ls->ls_root_sem);
393 error = -EINTR;
394 goto out;
395 }
396
397 if (dlm_no_directory(ls))
398 count += recover_master_static(r);
399 else if (!is_master(r) && dlm_is_removed(ls, r->res_nodeid)) {
400 recover_master(r);
401 count++;
402 }
403
404 schedule();
405 }
406 up_read(&ls->ls_root_sem);
407
408 log_debug(ls, "dlm_recover_masters %d resources", count);
409
410 error = dlm_wait_function(ls, &recover_list_empty);
411 out:
412 if (error)
413 recover_list_clear(ls);
414 return error;
415}
416
417int dlm_recover_master_reply(struct dlm_ls *ls, struct dlm_rcom *rc)
418{
419 struct dlm_rsb *r;
420 int nodeid;
421
422 r = recover_list_find(ls, rc->rc_id);
423 if (!r) {
424 log_error(ls, "dlm_recover_master_reply no id %llx",
425 (unsigned long long)rc->rc_id);
426 goto out;
427 }
428
429 nodeid = rc->rc_result;
430 if (nodeid == dlm_our_nodeid())
431 nodeid = 0;
432
433 set_new_master(r, nodeid);
434 recover_list_del(r);
435
436 if (recover_list_empty(ls))
437 wake_up(&ls->ls_wait_general);
438 out:
439 return 0;
440}
441
442
443/* Lock recovery: rebuild the process-copy locks we hold on a
444 remastered rsb on the new rsb master.
445
446 dlm_recover_locks
447 recover_locks
448 recover_locks_queue
449 dlm_send_rcom_lock -> receive_rcom_lock
450 dlm_recover_master_copy
451 receive_rcom_lock_reply <-
452 dlm_recover_process_copy
453*/
454
455
456/*
457 * keep a count of the number of lkb's we send to the new master; when we get
458 * an equal number of replies then recovery for the rsb is done
459 */
460
461static int recover_locks_queue(struct dlm_rsb *r, struct list_head *head)
462{
463 struct dlm_lkb *lkb;
464 int error = 0;
465
466 list_for_each_entry(lkb, head, lkb_statequeue) {
467 error = dlm_send_rcom_lock(r, lkb);
468 if (error)
469 break;
470 r->res_recover_locks_count++;
471 }
472
473 return error;
474}
475
476static int recover_locks(struct dlm_rsb *r)
477{
478 int error = 0;
479
480 lock_rsb(r);
481
482 DLM_ASSERT(!r->res_recover_locks_count, dlm_dump_rsb(r););
483
484 error = recover_locks_queue(r, &r->res_grantqueue);
485 if (error)
486 goto out;
487 error = recover_locks_queue(r, &r->res_convertqueue);
488 if (error)
489 goto out;
490 error = recover_locks_queue(r, &r->res_waitqueue);
491 if (error)
492 goto out;
493
494 if (r->res_recover_locks_count)
495 recover_list_add(r);
496 else
497 rsb_clear_flag(r, RSB_NEW_MASTER);
498 out:
499 unlock_rsb(r);
500 return error;
501}
502
503int dlm_recover_locks(struct dlm_ls *ls)
504{
505 struct dlm_rsb *r;
506 int error, count = 0;
507
508 log_debug(ls, "dlm_recover_locks");
509
510 down_read(&ls->ls_root_sem);
511 list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
512 if (is_master(r)) {
513 rsb_clear_flag(r, RSB_NEW_MASTER);
514 continue;
515 }
516
517 if (!rsb_flag(r, RSB_NEW_MASTER))
518 continue;
519
520 if (dlm_recovery_stopped(ls)) {
521 error = -EINTR;
522 up_read(&ls->ls_root_sem);
523 goto out;
524 }
525
526 error = recover_locks(r);
527 if (error) {
528 up_read(&ls->ls_root_sem);
529 goto out;
530 }
531
532 count += r->res_recover_locks_count;
533 }
534 up_read(&ls->ls_root_sem);
535
536 log_debug(ls, "dlm_recover_locks %d locks", count);
537
538 error = dlm_wait_function(ls, &recover_list_empty);
539 out:
540 if (error)
541 recover_list_clear(ls);
542 else
543 dlm_set_recover_status(ls, DLM_RS_LOCKS);
544 return error;
545}
546
547void dlm_recovered_lock(struct dlm_rsb *r)
548{
549 DLM_ASSERT(rsb_flag(r, RSB_NEW_MASTER), dlm_dump_rsb(r););
550
551 r->res_recover_locks_count--;
552 if (!r->res_recover_locks_count) {
553 rsb_clear_flag(r, RSB_NEW_MASTER);
554 recover_list_del(r);
555 }
556
557 if (recover_list_empty(r->res_ls))
558 wake_up(&r->res_ls->ls_wait_general);
559}
560
561/*
562 * The lvb needs to be recovered on all master rsb's. This includes setting
563 * the VALNOTVALID flag if necessary, and determining the correct lvb contents
564 * based on the lvb's of the locks held on the rsb.
565 *
566 * RSB_VALNOTVALID is set if there are only NL/CR locks on the rsb. If it
567 * was already set prior to recovery, it's not cleared, regardless of locks.
568 *
569 * The LVB contents are only considered for changing when this is a new master
570 * of the rsb (NEW_MASTER2). Then, the rsb's lvb is taken from any lkb with
571 * mode > CR. If no lkb's exist with mode above CR, the lvb contents are taken
572 * from the lkb with the largest lvb sequence number.
573 */
574
575static void recover_lvb(struct dlm_rsb *r)
576{
577 struct dlm_lkb *lkb, *high_lkb = NULL;
578 uint32_t high_seq = 0;
579 int lock_lvb_exists = 0;
580 int big_lock_exists = 0;
581 int lvblen = r->res_ls->ls_lvblen;
582
583 list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) {
584 if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
585 continue;
586
587 lock_lvb_exists = 1;
588
589 if (lkb->lkb_grmode > DLM_LOCK_CR) {
590 big_lock_exists = 1;
591 goto setflag;
592 }
593
594 if (((int)lkb->lkb_lvbseq - (int)high_seq) >= 0) {
595 high_lkb = lkb;
596 high_seq = lkb->lkb_lvbseq;
597 }
598 }
599
600 list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) {
601 if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
602 continue;
603
604 lock_lvb_exists = 1;
605
606 if (lkb->lkb_grmode > DLM_LOCK_CR) {
607 big_lock_exists = 1;
608 goto setflag;
609 }
610
611 if (((int)lkb->lkb_lvbseq - (int)high_seq) >= 0) {
612 high_lkb = lkb;
613 high_seq = lkb->lkb_lvbseq;
614 }
615 }
616
617 setflag:
618 if (!lock_lvb_exists)
619 goto out;
620
621 if (!big_lock_exists)
622 rsb_set_flag(r, RSB_VALNOTVALID);
623
624 /* don't mess with the lvb unless we're the new master */
625 if (!rsb_flag(r, RSB_NEW_MASTER2))
626 goto out;
627
628 if (!r->res_lvbptr) {
629 r->res_lvbptr = allocate_lvb(r->res_ls);
630 if (!r->res_lvbptr)
631 goto out;
632 }
633
634 if (big_lock_exists) {
635 r->res_lvbseq = lkb->lkb_lvbseq;
636 memcpy(r->res_lvbptr, lkb->lkb_lvbptr, lvblen);
637 } else if (high_lkb) {
638 r->res_lvbseq = high_lkb->lkb_lvbseq;
639 memcpy(r->res_lvbptr, high_lkb->lkb_lvbptr, lvblen);
640 } else {
641 r->res_lvbseq = 0;
642 memset(r->res_lvbptr, 0, lvblen);
643 }
644 out:
645 return;
646}
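
The ((int)lkb->lkb_lvbseq - (int)high_seq) >= 0 test above is a variant of the classic serial-number comparison for wrapping counters: take the difference and interpret it as signed, so the ordering survives a 32-bit wrap. A standalone illustration:

#include <stdint.h>
#include <stdio.h>

static int seq_after_eq(uint32_t a, uint32_t b)
{
	/* unsigned subtraction, reinterpreted as signed: a is "at or
	   after" b even when the counter has wrapped past zero */
	return (int32_t)(a - b) >= 0;
}

int main(void)
{
	printf("%d\n", seq_after_eq(5, 3));            /* 1 */
	printf("%d\n", seq_after_eq(3, 5));            /* 0 */
	/* 2 is "after" 0xfffffffe despite being numerically smaller */
	printf("%d\n", seq_after_eq(2, 0xfffffffeU));  /* 1 */
	return 0;
}
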
647
648/* All master rsb's flagged RECOVER_CONVERT need to be looked at. The locks
649 converting PR->CW or CW->PR need to have their lkb_grmode set. */
650
651static void recover_conversion(struct dlm_rsb *r)
652{
653 struct dlm_lkb *lkb;
654 int grmode = -1;
655
656 list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) {
657 if (lkb->lkb_grmode == DLM_LOCK_PR ||
658 lkb->lkb_grmode == DLM_LOCK_CW) {
659 grmode = lkb->lkb_grmode;
660 break;
661 }
662 }
663
664 list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) {
665 if (lkb->lkb_grmode != DLM_LOCK_IV)
666 continue;
667 if (grmode == -1)
668 lkb->lkb_grmode = lkb->lkb_rqmode;
669 else
670 lkb->lkb_grmode = grmode;
671 }
672}
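
Why only PR and CW? They are the one pair of modes where each is incompatible with the other yet neither strictly dominates, so a lock caught mid-conversion between them (grmode rebuilt as IV) cannot have its granted mode inferred from the request alone. The sketch below reproduces the usual VMS-style compatibility matrix from memory; verify the values against __dlm_compat_matrix in lock.c before relying on them:

#include <stdio.h>

/* DLM lock modes as in <linux/dlm.h> (IV = -1 omitted from the table) */
enum { NL, CR, CW, PR, PW, EX };

/* 1 = grantable together; reproduced from memory of the VMS-style
   table -- check against __dlm_compat_matrix in lock.c */
static const int compat[6][6] = {
	/*        NL CR CW PR PW EX */
	/* NL */ { 1, 1, 1, 1, 1, 1 },
	/* CR */ { 1, 1, 1, 1, 1, 0 },
	/* CW */ { 1, 1, 1, 0, 0, 0 },
	/* PR */ { 1, 1, 0, 1, 0, 0 },
	/* PW */ { 1, 1, 0, 0, 0, 0 },
	/* EX */ { 1, 0, 0, 0, 0, 0 },
};

int main(void)
{
	/* PR and CW sit at the same level: each tolerates NL/CR and
	   itself, but not the other -- hence the ambiguity that
	   recover_conversion() resolves */
	printf("PR vs CW grantable together: %d\n", compat[PR][CW]);
	printf("PR vs PR grantable together: %d\n", compat[PR][PR]);
	return 0;
}
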
673
 674/* We've become the new master for this rsb; waiting/converting locks may
 675   now be grantable in dlm_grant_after_purge() because locks that belonged
 676   to a removed node have been purged. */
677
678static void set_locks_purged(struct dlm_rsb *r)
679{
680 if (!list_empty(&r->res_waitqueue) || !list_empty(&r->res_convertqueue))
681 rsb_set_flag(r, RSB_LOCKS_PURGED);
682}
683
684void dlm_recover_rsbs(struct dlm_ls *ls)
685{
686 struct dlm_rsb *r;
687 int count = 0;
688
689 log_debug(ls, "dlm_recover_rsbs");
690
691 down_read(&ls->ls_root_sem);
692 list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
693 lock_rsb(r);
694 if (is_master(r)) {
695 if (rsb_flag(r, RSB_RECOVER_CONVERT))
696 recover_conversion(r);
697 if (rsb_flag(r, RSB_NEW_MASTER2))
698 set_locks_purged(r);
699 recover_lvb(r);
700 count++;
701 }
702 rsb_clear_flag(r, RSB_RECOVER_CONVERT);
703 rsb_clear_flag(r, RSB_NEW_MASTER2);
704 unlock_rsb(r);
705 }
706 up_read(&ls->ls_root_sem);
707
708 log_debug(ls, "dlm_recover_rsbs %d rsbs", count);
709}
710
711/* Create a single list of all root rsb's to be used during recovery */
712
713int dlm_create_root_list(struct dlm_ls *ls)
714{
715 struct dlm_rsb *r;
716 int i, error = 0;
717
718 down_write(&ls->ls_root_sem);
719 if (!list_empty(&ls->ls_root_list)) {
720 log_error(ls, "root list not empty");
721 error = -EINVAL;
722 goto out;
723 }
724
725 for (i = 0; i < ls->ls_rsbtbl_size; i++) {
726 read_lock(&ls->ls_rsbtbl[i].lock);
727 list_for_each_entry(r, &ls->ls_rsbtbl[i].list, res_hashchain) {
728 list_add(&r->res_root_list, &ls->ls_root_list);
729 dlm_hold_rsb(r);
730 }
731 read_unlock(&ls->ls_rsbtbl[i].lock);
732 }
733 out:
734 up_write(&ls->ls_root_sem);
735 return error;
736}
737
738void dlm_release_root_list(struct dlm_ls *ls)
739{
740 struct dlm_rsb *r, *safe;
741
742 down_write(&ls->ls_root_sem);
743 list_for_each_entry_safe(r, safe, &ls->ls_root_list, res_root_list) {
744 list_del_init(&r->res_root_list);
745 dlm_put_rsb(r);
746 }
747 up_write(&ls->ls_root_sem);
748}
749
750void dlm_clear_toss_list(struct dlm_ls *ls)
751{
752 struct dlm_rsb *r, *safe;
753 int i;
754
755 for (i = 0; i < ls->ls_rsbtbl_size; i++) {
756 write_lock(&ls->ls_rsbtbl[i].lock);
757 list_for_each_entry_safe(r, safe, &ls->ls_rsbtbl[i].toss,
758 res_hashchain) {
759 list_del(&r->res_hashchain);
760 free_rsb(r);
761 }
762 write_unlock(&ls->ls_rsbtbl[i].lock);
763 }
764}
765
diff --git a/fs/dlm/recover.h b/fs/dlm/recover.h
new file mode 100644
index 000000000000..ebd0363f1e08
--- /dev/null
+++ b/fs/dlm/recover.h
@@ -0,0 +1,34 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __RECOVER_DOT_H__
15#define __RECOVER_DOT_H__
16
17int dlm_wait_function(struct dlm_ls *ls, int (*testfn) (struct dlm_ls *ls));
18uint32_t dlm_recover_status(struct dlm_ls *ls);
19void dlm_set_recover_status(struct dlm_ls *ls, uint32_t status);
20int dlm_recover_members_wait(struct dlm_ls *ls);
21int dlm_recover_directory_wait(struct dlm_ls *ls);
22int dlm_recover_locks_wait(struct dlm_ls *ls);
23int dlm_recover_done_wait(struct dlm_ls *ls);
24int dlm_recover_masters(struct dlm_ls *ls);
25int dlm_recover_master_reply(struct dlm_ls *ls, struct dlm_rcom *rc);
26int dlm_recover_locks(struct dlm_ls *ls);
27void dlm_recovered_lock(struct dlm_rsb *r);
28int dlm_create_root_list(struct dlm_ls *ls);
29void dlm_release_root_list(struct dlm_ls *ls);
30void dlm_clear_toss_list(struct dlm_ls *ls);
31void dlm_recover_rsbs(struct dlm_ls *ls);
32
33#endif /* __RECOVER_DOT_H__ */
34
diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c
new file mode 100644
index 000000000000..362e3eff4dc9
--- /dev/null
+++ b/fs/dlm/recoverd.c
@@ -0,0 +1,290 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#include "dlm_internal.h"
15#include "lockspace.h"
16#include "member.h"
17#include "dir.h"
18#include "ast.h"
19#include "recover.h"
20#include "lowcomms.h"
21#include "lock.h"
22#include "requestqueue.h"
23#include "recoverd.h"
24
25
26/* If the start for which we're re-enabling locking (seq) has been superseded
27 by a newer stop (ls_recover_seq), we need to leave locking disabled. */
28
29static int enable_locking(struct dlm_ls *ls, uint64_t seq)
30{
31 int error = -EINTR;
32
33 spin_lock(&ls->ls_recover_lock);
34 if (ls->ls_recover_seq == seq) {
35 set_bit(LSFL_RUNNING, &ls->ls_flags);
36 up_write(&ls->ls_in_recovery);
37 error = 0;
38 }
39 spin_unlock(&ls->ls_recover_lock);
40 return error;
41}
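
This is the sequence-guard pattern in miniature: a start may re-enable locking only if no newer stop has bumped the sequence in the meantime. A standalone sketch, with plain variables standing in for the spinlock-protected fields:

#include <stdint.h>
#include <stdio.h>

struct ls { uint64_t recover_seq; int running; };

/* only the start matching the latest stop may re-enable locking */
static int enable_locking(struct ls *ls, uint64_t seq)
{
	if (ls->recover_seq != seq)
		return -1;	/* superseded: stay disabled */
	ls->running = 1;
	return 0;
}

int main(void)
{
	struct ls ls = { 7, 0 };	/* a newer stop already ran */
	printf("stale start (6): %d\n", enable_locking(&ls, 6));
	printf("current start (7): %d\n", enable_locking(&ls, 7));
	return 0;
}
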
42
43static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
44{
45 unsigned long start;
46 int error, neg = 0;
47
48 log_debug(ls, "recover %llx", rv->seq);
49
50 mutex_lock(&ls->ls_recoverd_active);
51
52 /*
53 * Suspending and resuming dlm_astd ensures that no lkb's from this ls
54 * will be processed by dlm_astd during recovery.
55 */
56
57 dlm_astd_suspend();
58 dlm_astd_resume();
59
60 /*
61 * This list of root rsb's will be the basis of most of the recovery
62 * routines.
63 */
64
65 dlm_create_root_list(ls);
66
67 /*
68 * Free all the tossed rsb's so we don't have to recover them.
69 */
70
71 dlm_clear_toss_list(ls);
72
73 /*
74 * Add or remove nodes from the lockspace's ls_nodes list.
75 * Also waits for all nodes to complete dlm_recover_members.
76 */
77
78 error = dlm_recover_members(ls, rv, &neg);
79 if (error) {
80 log_error(ls, "recover_members failed %d", error);
81 goto fail;
82 }
83 start = jiffies;
84
85 /*
86 * Rebuild our own share of the directory by collecting from all other
87 * nodes their master rsb names that hash to us.
88 */
89
90 error = dlm_recover_directory(ls);
91 if (error) {
92 log_error(ls, "recover_directory failed %d", error);
93 goto fail;
94 }
95
96 /*
97 * Purge directory-related requests that are saved in requestqueue.
98 * All dir requests from before recovery are invalid now due to the dir
99 * rebuild and will be resent by the requesting nodes.
100 */
101
102 dlm_purge_requestqueue(ls);
103
104 /*
105 * Wait for all nodes to complete directory rebuild.
106 */
107
108 error = dlm_recover_directory_wait(ls);
109 if (error) {
110 log_error(ls, "recover_directory_wait failed %d", error);
111 goto fail;
112 }
113
114 /*
115 * We may have outstanding operations that are waiting for a reply from
116 * a failed node. Mark these to be resent after recovery. Unlock and
117 * cancel ops can just be completed.
118 */
119
120 dlm_recover_waiters_pre(ls);
121
122 error = dlm_recovery_stopped(ls);
123 if (error)
124 goto fail;
125
126 if (neg || dlm_no_directory(ls)) {
127 /*
128 * Clear lkb's for departed nodes.
129 */
130
131 dlm_purge_locks(ls);
132
133 /*
134 * Get new master nodeid's for rsb's that were mastered on
135 * departed nodes.
136 */
137
138 error = dlm_recover_masters(ls);
139 if (error) {
140 log_error(ls, "recover_masters failed %d", error);
141 goto fail;
142 }
143
144 /*
145 * Send our locks on remastered rsb's to the new masters.
146 */
147
148 error = dlm_recover_locks(ls);
149 if (error) {
150 log_error(ls, "recover_locks failed %d", error);
151 goto fail;
152 }
153
154 error = dlm_recover_locks_wait(ls);
155 if (error) {
156 log_error(ls, "recover_locks_wait failed %d", error);
157 goto fail;
158 }
159
160 /*
161 * Finalize state in master rsb's now that all locks can be
162 * checked. This includes conversion resolution and lvb
163 * settings.
164 */
165
166 dlm_recover_rsbs(ls);
167 }
168
169 dlm_release_root_list(ls);
170
171 dlm_set_recover_status(ls, DLM_RS_DONE);
172 error = dlm_recover_done_wait(ls);
173 if (error) {
174 log_error(ls, "recover_done_wait failed %d", error);
175 goto fail;
176 }
177
178 dlm_clear_members_gone(ls);
179
180 error = enable_locking(ls, rv->seq);
181 if (error) {
182 log_error(ls, "enable_locking failed %d", error);
183 goto fail;
184 }
185
186 error = dlm_process_requestqueue(ls);
187 if (error) {
188 log_error(ls, "process_requestqueue failed %d", error);
189 goto fail;
190 }
191
192 error = dlm_recover_waiters_post(ls);
193 if (error) {
194 log_error(ls, "recover_waiters_post failed %d", error);
195 goto fail;
196 }
197
198 dlm_grant_after_purge(ls);
199
200 dlm_astd_wake();
201
202 log_debug(ls, "recover %llx done: %u ms", rv->seq,
203 jiffies_to_msecs(jiffies - start));
204 mutex_unlock(&ls->ls_recoverd_active);
205
206 return 0;
207
208 fail:
209 dlm_release_root_list(ls);
210 log_debug(ls, "recover %llx error %d", rv->seq, error);
211 mutex_unlock(&ls->ls_recoverd_active);
212 return error;
213}
214
215static void do_ls_recovery(struct dlm_ls *ls)
216{
217 struct dlm_recover *rv = NULL;
218
219 spin_lock(&ls->ls_recover_lock);
220 rv = ls->ls_recover_args;
221 ls->ls_recover_args = NULL;
222 clear_bit(LSFL_RECOVERY_STOP, &ls->ls_flags);
223 spin_unlock(&ls->ls_recover_lock);
224
225 if (rv) {
226 ls_recover(ls, rv);
227 kfree(rv->nodeids);
228 kfree(rv);
229 }
230}
231
232static int dlm_recoverd(void *arg)
233{
234 struct dlm_ls *ls;
235
236 ls = dlm_find_lockspace_local(arg);
237 if (!ls) {
238 log_print("dlm_recoverd: no lockspace %p", arg);
239 return -1;
240 }
241
242 while (!kthread_should_stop()) {
243 set_current_state(TASK_INTERRUPTIBLE);
244 if (!test_bit(LSFL_WORK, &ls->ls_flags))
245 schedule();
246 set_current_state(TASK_RUNNING);
247
248 if (test_and_clear_bit(LSFL_WORK, &ls->ls_flags))
249 do_ls_recovery(ls);
250 }
251
252 dlm_put_lockspace(ls);
253 return 0;
254}
255
256void dlm_recoverd_kick(struct dlm_ls *ls)
257{
258 set_bit(LSFL_WORK, &ls->ls_flags);
259 wake_up_process(ls->ls_recoverd_task);
260}
261
262int dlm_recoverd_start(struct dlm_ls *ls)
263{
264 struct task_struct *p;
265 int error = 0;
266
267 p = kthread_run(dlm_recoverd, ls, "dlm_recoverd");
268 if (IS_ERR(p))
269 error = PTR_ERR(p);
270 else
271 ls->ls_recoverd_task = p;
272 return error;
273}
274
275void dlm_recoverd_stop(struct dlm_ls *ls)
276{
277 kthread_stop(ls->ls_recoverd_task);
278}
279
280void dlm_recoverd_suspend(struct dlm_ls *ls)
281{
282 wake_up(&ls->ls_wait_general);
283 mutex_lock(&ls->ls_recoverd_active);
284}
285
286void dlm_recoverd_resume(struct dlm_ls *ls)
287{
288 mutex_unlock(&ls->ls_recoverd_active);
289}
290
diff --git a/fs/dlm/recoverd.h b/fs/dlm/recoverd.h
new file mode 100644
index 000000000000..866657c5d69d
--- /dev/null
+++ b/fs/dlm/recoverd.h
@@ -0,0 +1,24 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __RECOVERD_DOT_H__
15#define __RECOVERD_DOT_H__
16
17void dlm_recoverd_kick(struct dlm_ls *ls);
18void dlm_recoverd_stop(struct dlm_ls *ls);
19int dlm_recoverd_start(struct dlm_ls *ls);
20void dlm_recoverd_suspend(struct dlm_ls *ls);
21void dlm_recoverd_resume(struct dlm_ls *ls);
22
23#endif /* __RECOVERD_DOT_H__ */
24
diff --git a/fs/dlm/requestqueue.c b/fs/dlm/requestqueue.c
new file mode 100644
index 000000000000..7b2b089634a2
--- /dev/null
+++ b/fs/dlm/requestqueue.c
@@ -0,0 +1,184 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#include "dlm_internal.h"
14#include "member.h"
15#include "lock.h"
16#include "dir.h"
17#include "config.h"
18#include "requestqueue.h"
19
20struct rq_entry {
21 struct list_head list;
22 int nodeid;
23 char request[1];
24};
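
The request[1] member is the pre-C99 "trailing array" idiom: the struct is deliberately over-allocated so the one-element array can hold the whole saved message. A minimal userspace sketch of the same allocation pattern, with malloc standing in for kmalloc:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct entry {
	int nodeid;
	char request[1];	/* really 'length' bytes, see malloc below */
};

int main(void)
{
	const char msg[] = "saved message";
	size_t length = sizeof(msg);

	/* over-allocate: struct header plus the real payload length */
	struct entry *e = malloc(sizeof(struct entry) + length);
	if (!e)
		return 1;
	e->nodeid = 2;
	memcpy(e->request, msg, length);
	printf("from node %d: %s\n", e->nodeid, e->request);
	free(e);
	return 0;
}
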
25
26/*
27 * Requests received while the lockspace is in recovery get added to the
28 * request queue and processed when recovery is complete. This happens when
29 * the lockspace is suspended on some nodes before it is on others, or the
30 * lockspace is enabled on some while still suspended on others.
31 */
32
33void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_header *hd)
34{
35 struct rq_entry *e;
36 int length = hd->h_length;
37
38 if (dlm_is_removed(ls, nodeid))
39 return;
40
41 e = kmalloc(sizeof(struct rq_entry) + length, GFP_KERNEL);
42 if (!e) {
 43		log_print("dlm_add_requestqueue: out of memory");
44 return;
45 }
46
47 e->nodeid = nodeid;
48 memcpy(e->request, hd, length);
49
50 mutex_lock(&ls->ls_requestqueue_mutex);
51 list_add_tail(&e->list, &ls->ls_requestqueue);
52 mutex_unlock(&ls->ls_requestqueue_mutex);
53}
54
55int dlm_process_requestqueue(struct dlm_ls *ls)
56{
57 struct rq_entry *e;
58 struct dlm_header *hd;
59 int error = 0;
60
61 mutex_lock(&ls->ls_requestqueue_mutex);
62
63 for (;;) {
64 if (list_empty(&ls->ls_requestqueue)) {
65 mutex_unlock(&ls->ls_requestqueue_mutex);
66 error = 0;
67 break;
68 }
69 e = list_entry(ls->ls_requestqueue.next, struct rq_entry, list);
70 mutex_unlock(&ls->ls_requestqueue_mutex);
71
72 hd = (struct dlm_header *) e->request;
73 error = dlm_receive_message(hd, e->nodeid, 1);
74
75 if (error == -EINTR) {
76 /* entry is left on requestqueue */
77 log_debug(ls, "process_requestqueue abort eintr");
78 break;
79 }
80
81 mutex_lock(&ls->ls_requestqueue_mutex);
82 list_del(&e->list);
83 kfree(e);
84
85 if (dlm_locking_stopped(ls)) {
86 log_debug(ls, "process_requestqueue abort running");
87 mutex_unlock(&ls->ls_requestqueue_mutex);
88 error = -EINTR;
89 break;
90 }
91 schedule();
92 }
93
94 return error;
95}
96
97/*
98 * After recovery is done, locking is resumed and dlm_recoverd takes all the
99 * saved requests and processes them as they would have been by dlm_recvd. At
100 * the same time, dlm_recvd will start receiving new requests from remote
101 * nodes. We want to delay dlm_recvd processing new requests until
102 * dlm_recoverd has finished processing the old saved requests.
103 */
104
105void dlm_wait_requestqueue(struct dlm_ls *ls)
106{
107 for (;;) {
108 mutex_lock(&ls->ls_requestqueue_mutex);
109 if (list_empty(&ls->ls_requestqueue))
110 break;
111 if (dlm_locking_stopped(ls))
112 break;
113 mutex_unlock(&ls->ls_requestqueue_mutex);
114 schedule();
115 }
116 mutex_unlock(&ls->ls_requestqueue_mutex);
117}
118
119static int purge_request(struct dlm_ls *ls, struct dlm_message *ms, int nodeid)
120{
121 uint32_t type = ms->m_type;
122
123 if (dlm_is_removed(ls, nodeid))
124 return 1;
125
126 /* directory operations are always purged because the directory is
127 always rebuilt during recovery and the lookups resent */
128
129 if (type == DLM_MSG_REMOVE ||
130 type == DLM_MSG_LOOKUP ||
131 type == DLM_MSG_LOOKUP_REPLY)
132 return 1;
133
134 if (!dlm_no_directory(ls))
135 return 0;
136
137 /* with no directory, the master is likely to change as a part of
138 recovery; requests to/from the defunct master need to be purged */
139
140 switch (type) {
141 case DLM_MSG_REQUEST:
142 case DLM_MSG_CONVERT:
143 case DLM_MSG_UNLOCK:
144 case DLM_MSG_CANCEL:
 145		/* we're no longer the master of this resource; the sender
 146		   will resend to the new master (see waiter_needs_recovery) */
147
148 if (dlm_hash2nodeid(ls, ms->m_hash) != dlm_our_nodeid())
149 return 1;
150 break;
151
152 case DLM_MSG_REQUEST_REPLY:
153 case DLM_MSG_CONVERT_REPLY:
154 case DLM_MSG_UNLOCK_REPLY:
155 case DLM_MSG_CANCEL_REPLY:
156 case DLM_MSG_GRANT:
 157		/* this reply is from the former master of the resource;
 158		   we'll resend to the new master if needed */
159
160 if (dlm_hash2nodeid(ls, ms->m_hash) != nodeid)
161 return 1;
162 break;
163 }
164
165 return 0;
166}
167
168void dlm_purge_requestqueue(struct dlm_ls *ls)
169{
170 struct dlm_message *ms;
171 struct rq_entry *e, *safe;
172
173 mutex_lock(&ls->ls_requestqueue_mutex);
174 list_for_each_entry_safe(e, safe, &ls->ls_requestqueue, list) {
175 ms = (struct dlm_message *) e->request;
176
177 if (purge_request(ls, ms, e->nodeid)) {
178 list_del(&e->list);
179 kfree(e);
180 }
181 }
182 mutex_unlock(&ls->ls_requestqueue_mutex);
183}
184
diff --git a/fs/dlm/requestqueue.h b/fs/dlm/requestqueue.h
new file mode 100644
index 000000000000..349f0d292d95
--- /dev/null
+++ b/fs/dlm/requestqueue.h
@@ -0,0 +1,22 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#ifndef __REQUESTQUEUE_DOT_H__
14#define __REQUESTQUEUE_DOT_H__
15
16void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_header *hd);
17int dlm_process_requestqueue(struct dlm_ls *ls);
18void dlm_wait_requestqueue(struct dlm_ls *ls);
19void dlm_purge_requestqueue(struct dlm_ls *ls);
20
21#endif
22
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
new file mode 100644
index 000000000000..c37e93e4f2df
--- /dev/null
+++ b/fs/dlm/user.c
@@ -0,0 +1,788 @@
1/*
2 * Copyright (C) 2006 Red Hat, Inc. All rights reserved.
3 *
4 * This copyrighted material is made available to anyone wishing to use,
5 * modify, copy, or redistribute it subject to the terms and conditions
6 * of the GNU General Public License v.2.
7 */
8
9#include <linux/miscdevice.h>
10#include <linux/init.h>
11#include <linux/wait.h>
12#include <linux/module.h>
13#include <linux/file.h>
14#include <linux/fs.h>
15#include <linux/poll.h>
16#include <linux/signal.h>
17#include <linux/spinlock.h>
18#include <linux/dlm.h>
19#include <linux/dlm_device.h>
20
21#include "dlm_internal.h"
22#include "lockspace.h"
23#include "lock.h"
24#include "lvb_table.h"
25
26static const char *name_prefix="dlm";
27static struct miscdevice ctl_device;
28static struct file_operations device_fops;
29
30#ifdef CONFIG_COMPAT
31
32struct dlm_lock_params32 {
33 __u8 mode;
34 __u8 namelen;
35 __u16 flags;
36 __u32 lkid;
37 __u32 parent;
38
39 __u32 castparam;
40 __u32 castaddr;
41 __u32 bastparam;
42 __u32 bastaddr;
43 __u32 lksb;
44
45 char lvb[DLM_USER_LVB_LEN];
46 char name[0];
47};
48
49struct dlm_write_request32 {
50 __u32 version[3];
51 __u8 cmd;
52 __u8 is64bit;
53 __u8 unused[2];
54
55 union {
56 struct dlm_lock_params32 lock;
57 struct dlm_lspace_params lspace;
58 } i;
59};
60
61struct dlm_lksb32 {
62 __u32 sb_status;
63 __u32 sb_lkid;
64 __u8 sb_flags;
65 __u32 sb_lvbptr;
66};
67
68struct dlm_lock_result32 {
69 __u32 length;
70 __u32 user_astaddr;
71 __u32 user_astparam;
72 __u32 user_lksb;
73 struct dlm_lksb32 lksb;
74 __u8 bast_mode;
75 __u8 unused[3];
76 /* Offsets may be zero if no data is present */
77 __u32 lvb_offset;
78};
79
80static void compat_input(struct dlm_write_request *kb,
81 struct dlm_write_request32 *kb32)
82{
83 kb->version[0] = kb32->version[0];
84 kb->version[1] = kb32->version[1];
85 kb->version[2] = kb32->version[2];
86
87 kb->cmd = kb32->cmd;
88 kb->is64bit = kb32->is64bit;
89 if (kb->cmd == DLM_USER_CREATE_LOCKSPACE ||
90 kb->cmd == DLM_USER_REMOVE_LOCKSPACE) {
91 kb->i.lspace.flags = kb32->i.lspace.flags;
92 kb->i.lspace.minor = kb32->i.lspace.minor;
93 strcpy(kb->i.lspace.name, kb32->i.lspace.name);
94 } else {
95 kb->i.lock.mode = kb32->i.lock.mode;
96 kb->i.lock.namelen = kb32->i.lock.namelen;
97 kb->i.lock.flags = kb32->i.lock.flags;
98 kb->i.lock.lkid = kb32->i.lock.lkid;
99 kb->i.lock.parent = kb32->i.lock.parent;
100 kb->i.lock.castparam = (void *)(long)kb32->i.lock.castparam;
101 kb->i.lock.castaddr = (void *)(long)kb32->i.lock.castaddr;
102 kb->i.lock.bastparam = (void *)(long)kb32->i.lock.bastparam;
103 kb->i.lock.bastaddr = (void *)(long)kb32->i.lock.bastaddr;
104 kb->i.lock.lksb = (void *)(long)kb32->i.lock.lksb;
105 memcpy(kb->i.lock.lvb, kb32->i.lock.lvb, DLM_USER_LVB_LEN);
106 memcpy(kb->i.lock.name, kb32->i.lock.name, kb->i.lock.namelen);
107 }
108}
109
110static void compat_output(struct dlm_lock_result *res,
111 struct dlm_lock_result32 *res32)
112{
113 res32->length = res->length - (sizeof(struct dlm_lock_result) -
114 sizeof(struct dlm_lock_result32));
115 res32->user_astaddr = (__u32)(long)res->user_astaddr;
116 res32->user_astparam = (__u32)(long)res->user_astparam;
117 res32->user_lksb = (__u32)(long)res->user_lksb;
118 res32->bast_mode = res->bast_mode;
119
120 res32->lvb_offset = res->lvb_offset;
121 res32->length = res->length;
122
123 res32->lksb.sb_status = res->lksb.sb_status;
124 res32->lksb.sb_flags = res->lksb.sb_flags;
125 res32->lksb.sb_lkid = res->lksb.sb_lkid;
126 res32->lksb.sb_lvbptr = (__u32)(long)res->lksb.sb_lvbptr;
127}
128#endif
129
130
131void dlm_user_add_ast(struct dlm_lkb *lkb, int type)
132{
133 struct dlm_ls *ls;
134 struct dlm_user_args *ua;
135 struct dlm_user_proc *proc;
136 int remove_ownqueue = 0;
137
138 /* dlm_clear_proc_locks() sets ORPHAN/DEAD flag on each
139 lkb before dealing with it. We need to check this
140 flag before taking ls_clear_proc_locks mutex because if
141 it's set, dlm_clear_proc_locks() holds the mutex. */
142
143 if (lkb->lkb_flags & (DLM_IFL_ORPHAN | DLM_IFL_DEAD)) {
144 /* log_print("user_add_ast skip1 %x", lkb->lkb_flags); */
145 return;
146 }
147
148 ls = lkb->lkb_resource->res_ls;
149 mutex_lock(&ls->ls_clear_proc_locks);
150
151 /* If ORPHAN/DEAD flag is set, it means the process is dead so an ast
152 can't be delivered. For ORPHAN's, dlm_clear_proc_locks() freed
153 lkb->ua so we can't try to use it. */
154
155 if (lkb->lkb_flags & (DLM_IFL_ORPHAN | DLM_IFL_DEAD)) {
156 /* log_print("user_add_ast skip2 %x", lkb->lkb_flags); */
157 goto out;
158 }
159
160 DLM_ASSERT(lkb->lkb_astparam, dlm_print_lkb(lkb););
161 ua = (struct dlm_user_args *)lkb->lkb_astparam;
162 proc = ua->proc;
163
164 if (type == AST_BAST && ua->bastaddr == NULL)
165 goto out;
166
167 spin_lock(&proc->asts_spin);
168 if (!(lkb->lkb_ast_type & (AST_COMP | AST_BAST))) {
169 kref_get(&lkb->lkb_ref);
170 list_add_tail(&lkb->lkb_astqueue, &proc->asts);
171 lkb->lkb_ast_type |= type;
172 wake_up_interruptible(&proc->wait);
173 }
174
 175	/* noqueue requests that fail may need to be removed from the
 176	   proc's locks list; there should be a better way of detecting
 177	   this situation than checking all these things... */
178
179 if (type == AST_COMP && lkb->lkb_grmode == DLM_LOCK_IV &&
180 ua->lksb.sb_status == -EAGAIN && !list_empty(&lkb->lkb_ownqueue))
181 remove_ownqueue = 1;
182
183 /* We want to copy the lvb to userspace when the completion
184 ast is read if the status is 0, the lock has an lvb and
185 lvb_ops says we should. We could probably have set_lvb_lock()
186 set update_user_lvb instead and not need old_mode */
187
188 if ((lkb->lkb_ast_type & AST_COMP) &&
189 (lkb->lkb_lksb->sb_status == 0) &&
190 lkb->lkb_lksb->sb_lvbptr &&
191 dlm_lvb_operations[ua->old_mode + 1][lkb->lkb_grmode + 1])
192 ua->update_user_lvb = 1;
193 else
194 ua->update_user_lvb = 0;
195
196 spin_unlock(&proc->asts_spin);
197
198 if (remove_ownqueue) {
199 spin_lock(&ua->proc->locks_spin);
200 list_del_init(&lkb->lkb_ownqueue);
201 spin_unlock(&ua->proc->locks_spin);
202 dlm_put_lkb(lkb);
203 }
204 out:
205 mutex_unlock(&ls->ls_clear_proc_locks);
206}
207
208static int device_user_lock(struct dlm_user_proc *proc,
209 struct dlm_lock_params *params)
210{
211 struct dlm_ls *ls;
212 struct dlm_user_args *ua;
213 int error = -ENOMEM;
214
215 ls = dlm_find_lockspace_local(proc->lockspace);
216 if (!ls)
217 return -ENOENT;
218
219 if (!params->castaddr || !params->lksb) {
220 error = -EINVAL;
221 goto out;
222 }
223
224 ua = kzalloc(sizeof(struct dlm_user_args), GFP_KERNEL);
225 if (!ua)
226 goto out;
227 ua->proc = proc;
228 ua->user_lksb = params->lksb;
229 ua->castparam = params->castparam;
230 ua->castaddr = params->castaddr;
231 ua->bastparam = params->bastparam;
232 ua->bastaddr = params->bastaddr;
233
234 if (params->flags & DLM_LKF_CONVERT)
235 error = dlm_user_convert(ls, ua,
236 params->mode, params->flags,
237 params->lkid, params->lvb);
238 else {
239 error = dlm_user_request(ls, ua,
240 params->mode, params->flags,
241 params->name, params->namelen,
242 params->parent);
243 if (!error)
244 error = ua->lksb.sb_lkid;
245 }
246 out:
247 dlm_put_lockspace(ls);
248 return error;
249}
250
251static int device_user_unlock(struct dlm_user_proc *proc,
252 struct dlm_lock_params *params)
253{
254 struct dlm_ls *ls;
255 struct dlm_user_args *ua;
256 int error = -ENOMEM;
257
258 ls = dlm_find_lockspace_local(proc->lockspace);
259 if (!ls)
260 return -ENOENT;
261
262 ua = kzalloc(sizeof(struct dlm_user_args), GFP_KERNEL);
263 if (!ua)
264 goto out;
265 ua->proc = proc;
266 ua->user_lksb = params->lksb;
267 ua->castparam = params->castparam;
268 ua->castaddr = params->castaddr;
269
270 if (params->flags & DLM_LKF_CANCEL)
271 error = dlm_user_cancel(ls, ua, params->flags, params->lkid);
272 else
273 error = dlm_user_unlock(ls, ua, params->flags, params->lkid,
274 params->lvb);
275 out:
276 dlm_put_lockspace(ls);
277 return error;
278}
279
280static int device_create_lockspace(struct dlm_lspace_params *params)
281{
282 dlm_lockspace_t *lockspace;
283 struct dlm_ls *ls;
284 int error, len;
285
286 if (!capable(CAP_SYS_ADMIN))
287 return -EPERM;
288
289 error = dlm_new_lockspace(params->name, strlen(params->name),
290 &lockspace, 0, DLM_USER_LVB_LEN);
291 if (error)
292 return error;
293
294 ls = dlm_find_lockspace_local(lockspace);
295 if (!ls)
296 return -ENOENT;
297
298 error = -ENOMEM;
299 len = strlen(params->name) + strlen(name_prefix) + 2;
300 ls->ls_device.name = kzalloc(len, GFP_KERNEL);
301 if (!ls->ls_device.name)
302 goto fail;
303 snprintf((char *)ls->ls_device.name, len, "%s_%s", name_prefix,
304 params->name);
305 ls->ls_device.fops = &device_fops;
306 ls->ls_device.minor = MISC_DYNAMIC_MINOR;
307
308 error = misc_register(&ls->ls_device);
309 if (error) {
310 kfree(ls->ls_device.name);
311 goto fail;
312 }
313
314 error = ls->ls_device.minor;
315 dlm_put_lockspace(ls);
316 return error;
317
318 fail:
319 dlm_put_lockspace(ls);
320 dlm_release_lockspace(lockspace, 0);
321 return error;
322}
323
324static int device_remove_lockspace(struct dlm_lspace_params *params)
325{
326 dlm_lockspace_t *lockspace;
327 struct dlm_ls *ls;
328 int error, force = 0;
329
330 if (!capable(CAP_SYS_ADMIN))
331 return -EPERM;
332
333 ls = dlm_find_lockspace_device(params->minor);
334 if (!ls)
335 return -ENOENT;
336
337 error = misc_deregister(&ls->ls_device);
338 if (error) {
339 dlm_put_lockspace(ls);
340 goto out;
341 }
342 kfree(ls->ls_device.name);
343
344 if (params->flags & DLM_USER_LSFLG_FORCEFREE)
345 force = 2;
346
347 lockspace = ls->ls_local_handle;
348
349 /* dlm_release_lockspace waits for references to go to zero,
350 so all processes will need to close their device for the ls
 351	   before the release will proceed */
352
353 dlm_put_lockspace(ls);
354 error = dlm_release_lockspace(lockspace, force);
355 out:
356 return error;
357}
358
359/* Check the user's version matches ours */
360static int check_version(struct dlm_write_request *req)
361{
362 if (req->version[0] != DLM_DEVICE_VERSION_MAJOR ||
363 (req->version[0] == DLM_DEVICE_VERSION_MAJOR &&
364 req->version[1] > DLM_DEVICE_VERSION_MINOR)) {
365
366 printk(KERN_DEBUG "dlm: process %s (%d) version mismatch "
367 "user (%d.%d.%d) kernel (%d.%d.%d)\n",
368 current->comm,
369 current->pid,
370 req->version[0],
371 req->version[1],
372 req->version[2],
373 DLM_DEVICE_VERSION_MAJOR,
374 DLM_DEVICE_VERSION_MINOR,
375 DLM_DEVICE_VERSION_PATCH);
376 return -EINVAL;
377 }
378 return 0;
379}
380
381/*
382 * device_write
383 *
384 * device_user_lock
385 * dlm_user_request -> request_lock
386 * dlm_user_convert -> convert_lock
387 *
388 * device_user_unlock
389 * dlm_user_unlock -> unlock_lock
390 * dlm_user_cancel -> cancel_lock
391 *
392 * device_create_lockspace
393 * dlm_new_lockspace
394 *
395 * device_remove_lockspace
396 * dlm_release_lockspace
397 */
398
399/* a write to a lockspace device is a lock or unlock request, a write
400 to the control device is to create/remove a lockspace */
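
For orientation, a hedged userspace sketch of the control-device half follows; the device path and the exact dlm_device.h layout are assumptions that must match the installed headers. On success the write returns the minor number of the newly created lockspace device, mirroring device_create_lockspace() below:

/* Sketch: create a lockspace by writing a dlm_write_request to the
 * control device. Assumes <linux/dlm_device.h> is available and that
 * the control node is /dev/misc/dlm-control (path varies by distro). */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <linux/dlm_device.h>

int main(void)
{
	const char name[] = "testls";
	size_t len = sizeof(struct dlm_write_request) + sizeof(name);
	struct dlm_write_request *req = calloc(1, len);
	int fd, minor;

	if (!req)
		return 1;
	req->version[0] = DLM_DEVICE_VERSION_MAJOR;
	req->version[1] = DLM_DEVICE_VERSION_MINOR;
	req->version[2] = DLM_DEVICE_VERSION_PATCH;
	req->cmd = DLM_USER_CREATE_LOCKSPACE;
	strcpy(req->i.lspace.name, name);

	fd = open("/dev/misc/dlm-control", O_WRONLY);
	if (fd < 0) { perror("open"); return 1; }

	/* on success the write returns the new device's minor number */
	minor = write(fd, req, len);
	printf("lockspace minor: %d\n", minor);
	close(fd);
	free(req);
	return 0;
}
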
401
402static ssize_t device_write(struct file *file, const char __user *buf,
403 size_t count, loff_t *ppos)
404{
405 struct dlm_user_proc *proc = file->private_data;
406 struct dlm_write_request *kbuf;
407 sigset_t tmpsig, allsigs;
408 int error;
409
410#ifdef CONFIG_COMPAT
411 if (count < sizeof(struct dlm_write_request32))
412#else
413 if (count < sizeof(struct dlm_write_request))
414#endif
415 return -EINVAL;
416
417 kbuf = kmalloc(count, GFP_KERNEL);
418 if (!kbuf)
419 return -ENOMEM;
420
421 if (copy_from_user(kbuf, buf, count)) {
422 error = -EFAULT;
423 goto out_free;
424 }
425
426 if (check_version(kbuf)) {
427 error = -EBADE;
428 goto out_free;
429 }
430
431#ifdef CONFIG_COMPAT
432 if (!kbuf->is64bit) {
433 struct dlm_write_request32 *k32buf;
434 k32buf = (struct dlm_write_request32 *)kbuf;
435 kbuf = kmalloc(count + (sizeof(struct dlm_write_request) -
436 sizeof(struct dlm_write_request32)), GFP_KERNEL);
 437		if (!kbuf) {
 438			kfree(k32buf);
 439			return -ENOMEM;
 440		}
439
440 if (proc)
441 set_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags);
442 compat_input(kbuf, k32buf);
443 kfree(k32buf);
444 }
445#endif
446
 447	/* do we really need this? can a write happen after a close? */
 448	if (proc && (kbuf->cmd == DLM_USER_LOCK || kbuf->cmd == DLM_USER_UNLOCK) &&
 449	    test_bit(DLM_PROC_FLAGS_CLOSING, &proc->flags)) {
 450		error = -EINVAL;
 451		goto out_free;
 452	}
451
452 sigfillset(&allsigs);
453 sigprocmask(SIG_BLOCK, &allsigs, &tmpsig);
454
455 error = -EINVAL;
456
457 switch (kbuf->cmd)
458 {
459 case DLM_USER_LOCK:
460 if (!proc) {
461 log_print("no locking on control device");
462 goto out_sig;
463 }
464 error = device_user_lock(proc, &kbuf->i.lock);
465 break;
466
467 case DLM_USER_UNLOCK:
468 if (!proc) {
469 log_print("no locking on control device");
470 goto out_sig;
471 }
472 error = device_user_unlock(proc, &kbuf->i.lock);
473 break;
474
475 case DLM_USER_CREATE_LOCKSPACE:
476 if (proc) {
477 log_print("create/remove only on control device");
478 goto out_sig;
479 }
480 error = device_create_lockspace(&kbuf->i.lspace);
481 break;
482
483 case DLM_USER_REMOVE_LOCKSPACE:
484 if (proc) {
485 log_print("create/remove only on control device");
486 goto out_sig;
487 }
488 error = device_remove_lockspace(&kbuf->i.lspace);
489 break;
490
491 default:
492 log_print("Unknown command passed to DLM device : %d\n",
493 kbuf->cmd);
494 }
495
496 out_sig:
497 sigprocmask(SIG_SETMASK, &tmpsig, NULL);
498 recalc_sigpending();
499 out_free:
500 kfree(kbuf);
501 return error;
502}
503
504/* Every process that opens the lockspace device has its own "proc" structure
505 hanging off the open file that's used to keep track of locks owned by the
506 process and asts that need to be delivered to the process. */
507
508static int device_open(struct inode *inode, struct file *file)
509{
510 struct dlm_user_proc *proc;
511 struct dlm_ls *ls;
512
513 ls = dlm_find_lockspace_device(iminor(inode));
514 if (!ls)
515 return -ENOENT;
516
517 proc = kzalloc(sizeof(struct dlm_user_proc), GFP_KERNEL);
518 if (!proc) {
519 dlm_put_lockspace(ls);
520 return -ENOMEM;
521 }
522
523 proc->lockspace = ls->ls_local_handle;
524 INIT_LIST_HEAD(&proc->asts);
525 INIT_LIST_HEAD(&proc->locks);
526 spin_lock_init(&proc->asts_spin);
527 spin_lock_init(&proc->locks_spin);
528 init_waitqueue_head(&proc->wait);
529 file->private_data = proc;
530
531 return 0;
532}
533
534static int device_close(struct inode *inode, struct file *file)
535{
536 struct dlm_user_proc *proc = file->private_data;
537 struct dlm_ls *ls;
538 sigset_t tmpsig, allsigs;
539
540 ls = dlm_find_lockspace_local(proc->lockspace);
541 if (!ls)
542 return -ENOENT;
543
544 sigfillset(&allsigs);
545 sigprocmask(SIG_BLOCK, &allsigs, &tmpsig);
546
547 set_bit(DLM_PROC_FLAGS_CLOSING, &proc->flags);
548
549 dlm_clear_proc_locks(ls, proc);
550
551 /* at this point no more lkb's should exist for this lockspace,
552 so there's no chance of dlm_user_add_ast() being called and
553 looking for lkb->ua->proc */
554
555 kfree(proc);
556 file->private_data = NULL;
557
558 dlm_put_lockspace(ls);
559 dlm_put_lockspace(ls); /* for the find in device_open() */
560
561 /* FIXME: AUTOFREE: if this ls is no longer used do
562 device_remove_lockspace() */
563
564 sigprocmask(SIG_SETMASK, &tmpsig, NULL);
565 recalc_sigpending();
566
567 return 0;
568}
569
570static int copy_result_to_user(struct dlm_user_args *ua, int compat, int type,
571 int bmode, char __user *buf, size_t count)
572{
573#ifdef CONFIG_COMPAT
574 struct dlm_lock_result32 result32;
575#endif
576 struct dlm_lock_result result;
577 void *resultptr;
578 int error=0;
579 int len;
580 int struct_len;
581
582 memset(&result, 0, sizeof(struct dlm_lock_result));
583 memcpy(&result.lksb, &ua->lksb, sizeof(struct dlm_lksb));
584 result.user_lksb = ua->user_lksb;
585
 586	/* FIXME: dlm1 updates the user's bastparam/addr in a conversion only
 587	   if the conversion is successful. See code in dlm_user_convert()
 588	   for updating ua from ua_tmp. OpenVMS, though, notes that a new
 589	   blocking AST address and parameter are set even if the conversion
 590	   fails, so maybe we should just do that. */
591
592 if (type == AST_BAST) {
593 result.user_astaddr = ua->bastaddr;
594 result.user_astparam = ua->bastparam;
595 result.bast_mode = bmode;
596 } else {
597 result.user_astaddr = ua->castaddr;
598 result.user_astparam = ua->castparam;
599 }
600
601#ifdef CONFIG_COMPAT
602 if (compat)
603 len = sizeof(struct dlm_lock_result32);
604 else
605#endif
606 len = sizeof(struct dlm_lock_result);
607 struct_len = len;
608
609 /* copy lvb to userspace if there is one, it's been updated, and
610 the user buffer has space for it */
611
612 if (ua->update_user_lvb && ua->lksb.sb_lvbptr &&
613 count >= len + DLM_USER_LVB_LEN) {
614 if (copy_to_user(buf+len, ua->lksb.sb_lvbptr,
615 DLM_USER_LVB_LEN)) {
616 error = -EFAULT;
617 goto out;
618 }
619
620 result.lvb_offset = len;
621 len += DLM_USER_LVB_LEN;
622 }
623
624 result.length = len;
625 resultptr = &result;
626#ifdef CONFIG_COMPAT
627 if (compat) {
628 compat_output(&result, &result32);
629 resultptr = &result32;
630 }
631#endif
632
633 if (copy_to_user(buf, resultptr, struct_len))
634 error = -EFAULT;
635 else
636 error = len;
637 out:
638 return error;
639}
640
641/* a read returns a single ast described in a struct dlm_lock_result */
642
643static ssize_t device_read(struct file *file, char __user *buf, size_t count,
644 loff_t *ppos)
645{
646 struct dlm_user_proc *proc = file->private_data;
647 struct dlm_lkb *lkb;
648 struct dlm_user_args *ua;
649 DECLARE_WAITQUEUE(wait, current);
650 int error, type=0, bmode=0, removed = 0;
651
652#ifdef CONFIG_COMPAT
653 if (count < sizeof(struct dlm_lock_result32))
654#else
655 if (count < sizeof(struct dlm_lock_result))
656#endif
657 return -EINVAL;
658
659 /* do we really need this? can a read happen after a close? */
660 if (test_bit(DLM_PROC_FLAGS_CLOSING, &proc->flags))
661 return -EINVAL;
662
663 spin_lock(&proc->asts_spin);
664 if (list_empty(&proc->asts)) {
665 if (file->f_flags & O_NONBLOCK) {
666 spin_unlock(&proc->asts_spin);
667 return -EAGAIN;
668 }
669
670 add_wait_queue(&proc->wait, &wait);
671
672 repeat:
673 set_current_state(TASK_INTERRUPTIBLE);
674 if (list_empty(&proc->asts) && !signal_pending(current)) {
675 spin_unlock(&proc->asts_spin);
676 schedule();
677 spin_lock(&proc->asts_spin);
678 goto repeat;
679 }
680 set_current_state(TASK_RUNNING);
681 remove_wait_queue(&proc->wait, &wait);
682
683 if (signal_pending(current)) {
684 spin_unlock(&proc->asts_spin);
685 return -ERESTARTSYS;
686 }
687 }
688
689 if (list_empty(&proc->asts)) {
690 spin_unlock(&proc->asts_spin);
691 return -EAGAIN;
692 }
693
 694	/* there may be both completion and blocking asts to return for
 695	   the lkb; don't remove lkb from asts list unless no asts remain */
696
697 lkb = list_entry(proc->asts.next, struct dlm_lkb, lkb_astqueue);
698
699 if (lkb->lkb_ast_type & AST_COMP) {
700 lkb->lkb_ast_type &= ~AST_COMP;
701 type = AST_COMP;
702 } else if (lkb->lkb_ast_type & AST_BAST) {
703 lkb->lkb_ast_type &= ~AST_BAST;
704 type = AST_BAST;
705 bmode = lkb->lkb_bastmode;
706 }
707
708 if (!lkb->lkb_ast_type) {
709 list_del(&lkb->lkb_astqueue);
710 removed = 1;
711 }
712 spin_unlock(&proc->asts_spin);
713
714 ua = (struct dlm_user_args *)lkb->lkb_astparam;
715 error = copy_result_to_user(ua,
716 test_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags),
717 type, bmode, buf, count);
718
 719	/* removes the reference taken for the proc->asts list by
 720	   dlm_user_add_ast() and may result in the lkb being freed */
721 if (removed)
722 dlm_put_lkb(lkb);
723
724 return error;
725}
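
And the matching hedged sketch of the consumer side: one read drains one AST, laid out as copy_result_to_user() above describes (the struct first, the optional LVB at lvb_offset). The device name follows the name_prefix scheme in device_create_lockspace(); the path is an assumption:

/* Sketch: drain one AST from an open lockspace device. The buffer must
 * be at least sizeof(struct dlm_lock_result) plus room for an LVB. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <linux/dlm_device.h>

int main(void)
{
	char buf[sizeof(struct dlm_lock_result) + DLM_USER_LVB_LEN];
	struct dlm_lock_result *res = (struct dlm_lock_result *) buf;
	int fd = open("/dev/misc/dlm_testls", O_RDWR); /* assumed name */
	ssize_t n;

	if (fd < 0) { perror("open"); return 1; }

	/* blocks until a completion or blocking AST is queued */
	n = read(fd, buf, sizeof(buf));
	if (n > 0)
		printf("lkid %u status %d lvb at offset %u\n",
		       res->lksb.sb_lkid, res->lksb.sb_status,
		       res->lvb_offset);
	close(fd);
	return 0;
}
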
726
727static unsigned int device_poll(struct file *file, poll_table *wait)
728{
729 struct dlm_user_proc *proc = file->private_data;
730
731 poll_wait(file, &proc->wait, wait);
732
733 spin_lock(&proc->asts_spin);
734 if (!list_empty(&proc->asts)) {
735 spin_unlock(&proc->asts_spin);
736 return POLLIN | POLLRDNORM;
737 }
738 spin_unlock(&proc->asts_spin);
739 return 0;
740}
741
742static int ctl_device_open(struct inode *inode, struct file *file)
743{
744 file->private_data = NULL;
745 return 0;
746}
747
748static int ctl_device_close(struct inode *inode, struct file *file)
749{
750 return 0;
751}
752
753static struct file_operations device_fops = {
754 .open = device_open,
755 .release = device_close,
756 .read = device_read,
757 .write = device_write,
758 .poll = device_poll,
759 .owner = THIS_MODULE,
760};
761
762static struct file_operations ctl_device_fops = {
763 .open = ctl_device_open,
764 .release = ctl_device_close,
765 .write = device_write,
766 .owner = THIS_MODULE,
767};
768
769int dlm_user_init(void)
770{
771 int error;
772
773 ctl_device.name = "dlm-control";
774 ctl_device.fops = &ctl_device_fops;
775 ctl_device.minor = MISC_DYNAMIC_MINOR;
776
777 error = misc_register(&ctl_device);
778 if (error)
779 log_print("misc_register failed for control device");
780
781 return error;
782}
783
784void dlm_user_exit(void)
785{
786 misc_deregister(&ctl_device);
787}
788
diff --git a/fs/dlm/user.h b/fs/dlm/user.h
new file mode 100644
index 000000000000..d38e9f3e4151
--- /dev/null
+++ b/fs/dlm/user.h
@@ -0,0 +1,16 @@
1/*
2 * Copyright (C) 2006 Red Hat, Inc. All rights reserved.
3 *
4 * This copyrighted material is made available to anyone wishing to use,
5 * modify, copy, or redistribute it subject to the terms and conditions
6 * of the GNU General Public License v.2.
7 */
8
9#ifndef __USER_DOT_H__
10#define __USER_DOT_H__
11
12void dlm_user_add_ast(struct dlm_lkb *lkb, int type);
13int dlm_user_init(void);
14void dlm_user_exit(void);
15
16#endif
diff --git a/fs/dlm/util.c b/fs/dlm/util.c
new file mode 100644
index 000000000000..767197db9944
--- /dev/null
+++ b/fs/dlm/util.c
@@ -0,0 +1,161 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#include "dlm_internal.h"
14#include "rcom.h"
15#include "util.h"
16
17static void header_out(struct dlm_header *hd)
18{
19 hd->h_version = cpu_to_le32(hd->h_version);
20 hd->h_lockspace = cpu_to_le32(hd->h_lockspace);
21 hd->h_nodeid = cpu_to_le32(hd->h_nodeid);
22 hd->h_length = cpu_to_le16(hd->h_length);
23}
24
25static void header_in(struct dlm_header *hd)
26{
27 hd->h_version = le32_to_cpu(hd->h_version);
28 hd->h_lockspace = le32_to_cpu(hd->h_lockspace);
29 hd->h_nodeid = le32_to_cpu(hd->h_nodeid);
30 hd->h_length = le16_to_cpu(hd->h_length);
31}
32
33void dlm_message_out(struct dlm_message *ms)
34{
35 struct dlm_header *hd = (struct dlm_header *) ms;
36
37 header_out(hd);
38
39 ms->m_type = cpu_to_le32(ms->m_type);
40 ms->m_nodeid = cpu_to_le32(ms->m_nodeid);
41 ms->m_pid = cpu_to_le32(ms->m_pid);
42 ms->m_lkid = cpu_to_le32(ms->m_lkid);
43 ms->m_remid = cpu_to_le32(ms->m_remid);
44 ms->m_parent_lkid = cpu_to_le32(ms->m_parent_lkid);
45 ms->m_parent_remid = cpu_to_le32(ms->m_parent_remid);
46 ms->m_exflags = cpu_to_le32(ms->m_exflags);
47 ms->m_sbflags = cpu_to_le32(ms->m_sbflags);
48 ms->m_flags = cpu_to_le32(ms->m_flags);
49 ms->m_lvbseq = cpu_to_le32(ms->m_lvbseq);
50 ms->m_hash = cpu_to_le32(ms->m_hash);
51 ms->m_status = cpu_to_le32(ms->m_status);
52 ms->m_grmode = cpu_to_le32(ms->m_grmode);
53 ms->m_rqmode = cpu_to_le32(ms->m_rqmode);
54 ms->m_bastmode = cpu_to_le32(ms->m_bastmode);
55 ms->m_asts = cpu_to_le32(ms->m_asts);
56 ms->m_result = cpu_to_le32(ms->m_result);
57}
58
59void dlm_message_in(struct dlm_message *ms)
60{
61 struct dlm_header *hd = (struct dlm_header *) ms;
62
63 header_in(hd);
64
65 ms->m_type = le32_to_cpu(ms->m_type);
66 ms->m_nodeid = le32_to_cpu(ms->m_nodeid);
67 ms->m_pid = le32_to_cpu(ms->m_pid);
68 ms->m_lkid = le32_to_cpu(ms->m_lkid);
69 ms->m_remid = le32_to_cpu(ms->m_remid);
70 ms->m_parent_lkid = le32_to_cpu(ms->m_parent_lkid);
71 ms->m_parent_remid = le32_to_cpu(ms->m_parent_remid);
72 ms->m_exflags = le32_to_cpu(ms->m_exflags);
73 ms->m_sbflags = le32_to_cpu(ms->m_sbflags);
74 ms->m_flags = le32_to_cpu(ms->m_flags);
75 ms->m_lvbseq = le32_to_cpu(ms->m_lvbseq);
76 ms->m_hash = le32_to_cpu(ms->m_hash);
77 ms->m_status = le32_to_cpu(ms->m_status);
78 ms->m_grmode = le32_to_cpu(ms->m_grmode);
79 ms->m_rqmode = le32_to_cpu(ms->m_rqmode);
80 ms->m_bastmode = le32_to_cpu(ms->m_bastmode);
81 ms->m_asts = le32_to_cpu(ms->m_asts);
82 ms->m_result = le32_to_cpu(ms->m_result);
83}
84
85static void rcom_lock_out(struct rcom_lock *rl)
86{
87 rl->rl_ownpid = cpu_to_le32(rl->rl_ownpid);
88 rl->rl_lkid = cpu_to_le32(rl->rl_lkid);
89 rl->rl_remid = cpu_to_le32(rl->rl_remid);
90 rl->rl_parent_lkid = cpu_to_le32(rl->rl_parent_lkid);
91 rl->rl_parent_remid = cpu_to_le32(rl->rl_parent_remid);
92 rl->rl_exflags = cpu_to_le32(rl->rl_exflags);
93 rl->rl_flags = cpu_to_le32(rl->rl_flags);
94 rl->rl_lvbseq = cpu_to_le32(rl->rl_lvbseq);
95 rl->rl_result = cpu_to_le32(rl->rl_result);
96 rl->rl_wait_type = cpu_to_le16(rl->rl_wait_type);
97 rl->rl_namelen = cpu_to_le16(rl->rl_namelen);
98}
99
100static void rcom_lock_in(struct rcom_lock *rl)
101{
102 rl->rl_ownpid = le32_to_cpu(rl->rl_ownpid);
103 rl->rl_lkid = le32_to_cpu(rl->rl_lkid);
104 rl->rl_remid = le32_to_cpu(rl->rl_remid);
105 rl->rl_parent_lkid = le32_to_cpu(rl->rl_parent_lkid);
106 rl->rl_parent_remid = le32_to_cpu(rl->rl_parent_remid);
107 rl->rl_exflags = le32_to_cpu(rl->rl_exflags);
108 rl->rl_flags = le32_to_cpu(rl->rl_flags);
109 rl->rl_lvbseq = le32_to_cpu(rl->rl_lvbseq);
110 rl->rl_result = le32_to_cpu(rl->rl_result);
111 rl->rl_wait_type = le16_to_cpu(rl->rl_wait_type);
112 rl->rl_namelen = le16_to_cpu(rl->rl_namelen);
113}
114
115static void rcom_config_out(struct rcom_config *rf)
116{
117 rf->rf_lvblen = cpu_to_le32(rf->rf_lvblen);
118 rf->rf_lsflags = cpu_to_le32(rf->rf_lsflags);
119}
120
121static void rcom_config_in(struct rcom_config *rf)
122{
123 rf->rf_lvblen = le32_to_cpu(rf->rf_lvblen);
124 rf->rf_lsflags = le32_to_cpu(rf->rf_lsflags);
125}
126
127void dlm_rcom_out(struct dlm_rcom *rc)
128{
129 struct dlm_header *hd = (struct dlm_header *) rc;
130 int type = rc->rc_type;
131
132 header_out(hd);
133
134 rc->rc_type = cpu_to_le32(rc->rc_type);
135 rc->rc_result = cpu_to_le32(rc->rc_result);
136 rc->rc_id = cpu_to_le64(rc->rc_id);
137
138 if (type == DLM_RCOM_LOCK)
139 rcom_lock_out((struct rcom_lock *) rc->rc_buf);
140
141 else if (type == DLM_RCOM_STATUS_REPLY)
142 rcom_config_out((struct rcom_config *) rc->rc_buf);
143}
144
145void dlm_rcom_in(struct dlm_rcom *rc)
146{
147 struct dlm_header *hd = (struct dlm_header *) rc;
148
149 header_in(hd);
150
151 rc->rc_type = le32_to_cpu(rc->rc_type);
152 rc->rc_result = le32_to_cpu(rc->rc_result);
153 rc->rc_id = le64_to_cpu(rc->rc_id);
154
155 if (rc->rc_type == DLM_RCOM_LOCK)
156 rcom_lock_in((struct rcom_lock *) rc->rc_buf);
157
158 else if (rc->rc_type == DLM_RCOM_STATUS_REPLY)
159 rcom_config_in((struct rcom_config *) rc->rc_buf);
160}
161
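The out/in pairs above convert every fixed-width field between host byte
order and the little-endian wire format, once before send and once after
receive. A minimal user-space sketch of the same pattern, with glibc's
<endian.h> helpers standing in for cpu_to_le32()/le32_to_cpu(); the demo
struct and its fields are hypothetical:

#include <endian.h>
#include <stdint.h>
#include <stdio.h>

struct demo_msg {
	uint32_t m_type;
	uint32_t m_nodeid;
};

static void demo_msg_out(struct demo_msg *ms)	/* host -> wire */
{
	ms->m_type = htole32(ms->m_type);
	ms->m_nodeid = htole32(ms->m_nodeid);
}

static void demo_msg_in(struct demo_msg *ms)	/* wire -> host */
{
	ms->m_type = le32toh(ms->m_type);
	ms->m_nodeid = le32toh(ms->m_nodeid);
}

int main(void)
{
	struct demo_msg ms = { .m_type = 7, .m_nodeid = 42 };

	demo_msg_out(&ms);	/* byte order is now fixed for the wire */
	demo_msg_in(&ms);	/* and back in host order */
	printf("type=%u nodeid=%u\n", ms.m_type, ms.m_nodeid);
	return 0;
}

Because every field is converted in place, the same struct memory can be
handed straight to the transport after _out() and used directly after _in().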
diff --git a/fs/dlm/util.h b/fs/dlm/util.h
new file mode 100644
index 000000000000..2b25915161c0
--- /dev/null
+++ b/fs/dlm/util.h
@@ -0,0 +1,22 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#ifndef __UTIL_DOT_H__
14#define __UTIL_DOT_H__
15
16void dlm_message_out(struct dlm_message *ms);
17void dlm_message_in(struct dlm_message *ms);
18void dlm_rcom_out(struct dlm_rcom *rc);
19void dlm_rcom_in(struct dlm_rcom *rc);
20
21#endif
22
diff --git a/fs/ecryptfs/Makefile b/fs/ecryptfs/Makefile
new file mode 100644
index 000000000000..ca6562451eeb
--- /dev/null
+++ b/fs/ecryptfs/Makefile
@@ -0,0 +1,7 @@
1#
2# Makefile for the Linux 2.6 eCryptfs
3#
4
5obj-$(CONFIG_ECRYPT_FS) += ecryptfs.o
6
7ecryptfs-objs := dentry.o file.o inode.o main.o super.o mmap.o crypto.o keystore.o debug.o
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
new file mode 100644
index 000000000000..ed35a9712fa1
--- /dev/null
+++ b/fs/ecryptfs/crypto.c
@@ -0,0 +1,1659 @@
1/**
2 * eCryptfs: Linux filesystem encryption layer
3 *
4 * Copyright (C) 1997-2004 Erez Zadok
5 * Copyright (C) 2001-2004 Stony Brook University
6 * Copyright (C) 2004-2006 International Business Machines Corp.
7 * Author(s): Michael A. Halcrow <mahalcro@us.ibm.com>
8 * Michael C. Thompson <mcthomps@us.ibm.com>
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License as
12 * published by the Free Software Foundation; either version 2 of the
13 * License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful, but
16 * WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
23 * 02111-1307, USA.
24 */
25
26#include <linux/fs.h>
27#include <linux/mount.h>
28#include <linux/pagemap.h>
29#include <linux/random.h>
30#include <linux/compiler.h>
31#include <linux/key.h>
32#include <linux/namei.h>
33#include <linux/crypto.h>
34#include <linux/file.h>
35#include <linux/scatterlist.h>
36#include "ecryptfs_kernel.h"
37
38static int
39ecryptfs_decrypt_page_offset(struct ecryptfs_crypt_stat *crypt_stat,
40 struct page *dst_page, int dst_offset,
41 struct page *src_page, int src_offset, int size,
42 unsigned char *iv);
43static int
44ecryptfs_encrypt_page_offset(struct ecryptfs_crypt_stat *crypt_stat,
45 struct page *dst_page, int dst_offset,
46 struct page *src_page, int src_offset, int size,
47 unsigned char *iv);
48
49/**
50 * ecryptfs_to_hex
51 * @dst: Buffer to take hex character representation of contents of
52 * src; must be at least of size (src_size * 2)
53 * @src: Buffer to be converted to a hex string representation
54 * @src_size: number of bytes to convert
55 */
56void ecryptfs_to_hex(char *dst, char *src, size_t src_size)
57{
58 int x;
59
60 for (x = 0; x < src_size; x++)
61 sprintf(&dst[x * 2], "%.2x", (unsigned char)src[x]);
62}
63
64/**
65 * ecryptfs_from_hex
66 * @dst: Buffer to take the bytes from src hex; must hold at least
67 * dst_size bytes
68 * @src: Buffer to be converted from a hex string representation to raw value
69 * @dst_size: Size of dst buffer, or number of hex character pairs to convert
70 */
71void ecryptfs_from_hex(char *dst, char *src, int dst_size)
72{
73 int x;
74 char tmp[3] = { 0, };
75
76 for (x = 0; x < dst_size; x++) {
77 tmp[0] = src[x * 2];
78 tmp[1] = src[x * 2 + 1];
79 dst[x] = (unsigned char)simple_strtol(tmp, NULL, 16);
80 }
81}
82
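A user-space round trip of the same hex scheme the two helpers above
implement (standard C only; strtol() replaces the kernel's
simple_strtol()):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static void to_hex(char *dst, const unsigned char *src, size_t src_size)
{
	size_t x;

	for (x = 0; x < src_size; x++)
		sprintf(&dst[x * 2], "%.2x", src[x]);
}

static void from_hex(unsigned char *dst, const char *src, size_t dst_size)
{
	char tmp[3] = { 0, };
	size_t x;

	for (x = 0; x < dst_size; x++) {
		tmp[0] = src[x * 2];
		tmp[1] = src[x * 2 + 1];
		dst[x] = (unsigned char)strtol(tmp, NULL, 16);
	}
}

int main(void)
{
	unsigned char raw[4] = { 0xde, 0xad, 0xbe, 0xef }, back[4];
	char hex[sizeof(raw) * 2 + 1];

	to_hex(hex, raw, sizeof(raw));
	hex[sizeof(raw) * 2] = '\0';	/* the kernel helper does not terminate */
	from_hex(back, hex, sizeof(back));
	printf("%s round-trips: %s\n", hex,
	       memcmp(raw, back, sizeof(raw)) ? "no" : "yes");
	return 0;
}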
83/**
84 * ecryptfs_calculate_md5 - calculates the md5 of @src
85 * @dst: Pointer to 16 bytes of allocated memory
86 * @crypt_stat: Pointer to crypt_stat struct for the current inode
87 * @src: Data to be md5'd
88 * @len: Length of @src
89 *
90 * Uses the allocated crypto context that crypt_stat references to
91 * generate the MD5 sum of the contents of src.
92 */
93static int ecryptfs_calculate_md5(char *dst,
94 struct ecryptfs_crypt_stat *crypt_stat,
95 char *src, int len)
96{
97 int rc = 0;
98 struct scatterlist sg;
99
100 mutex_lock(&crypt_stat->cs_md5_tfm_mutex);
101 sg_init_one(&sg, (u8 *)src, len);
102 if (!crypt_stat->md5_tfm) {
103 crypt_stat->md5_tfm =
104 crypto_alloc_tfm("md5", CRYPTO_TFM_REQ_MAY_SLEEP);
105 if (!crypt_stat->md5_tfm) {
106 rc = -ENOMEM;
107 ecryptfs_printk(KERN_ERR, "Error attempting to "
108 "allocate crypto context\n");
109 goto out;
110 }
111 }
112 crypto_digest_init(crypt_stat->md5_tfm);
113 crypto_digest_update(crypt_stat->md5_tfm, &sg, 1);
114 crypto_digest_final(crypt_stat->md5_tfm, dst);
115 mutex_unlock(&crypt_stat->cs_md5_tfm_mutex);
116out:
117 return rc;
118}
119
120/**
121 * ecryptfs_derive_iv
122 * @iv: Destination for the derived IV value
123 * @crypt_stat: Pointer to crypt_stat struct for the current inode
124 * @offset: Offset of the page whose IV we are to derive
125 *
126 * Generate the initialization vector from the given root IV and page
127 * offset.
128 *
129 * Returns zero on success; non-zero on error.
130 */
131static int ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat,
132 pgoff_t offset)
133{
134 int rc = 0;
135 char dst[MD5_DIGEST_SIZE];
136 char src[ECRYPTFS_MAX_IV_BYTES + 16];
137
138 if (unlikely(ecryptfs_verbosity > 0)) {
139 ecryptfs_printk(KERN_DEBUG, "root iv:\n");
140 ecryptfs_dump_hex(crypt_stat->root_iv, crypt_stat->iv_bytes);
141 }
142 /* TODO: It is probably secure to just cast the least
143 * significant bits of the root IV into an unsigned long and
144 * add the offset to that rather than go through all this
145 * hashing business. -Halcrow */
146 memcpy(src, crypt_stat->root_iv, crypt_stat->iv_bytes);
147 memset((src + crypt_stat->iv_bytes), 0, 16);
148 snprintf((src + crypt_stat->iv_bytes), 16, "%ld", offset);
149 if (unlikely(ecryptfs_verbosity > 0)) {
150 ecryptfs_printk(KERN_DEBUG, "source:\n");
151 ecryptfs_dump_hex(src, (crypt_stat->iv_bytes + 16));
152 }
153 rc = ecryptfs_calculate_md5(dst, crypt_stat, src,
154 (crypt_stat->iv_bytes + 16));
155 if (rc) {
156 ecryptfs_printk(KERN_WARNING, "Error attempting to compute "
157 "MD5 while generating IV for a page\n");
158 goto out;
159 }
160 memcpy(iv, dst, crypt_stat->iv_bytes);
161 if (unlikely(ecryptfs_verbosity > 0)) {
162 ecryptfs_printk(KERN_DEBUG, "derived iv:\n");
163 ecryptfs_dump_hex(iv, crypt_stat->iv_bytes);
164 }
165out:
166 return rc;
167}
168
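The derivation above is: IV = the first iv_bytes of MD5(root_iv ||
zero-padded ASCII page offset). A user-space sketch, assuming OpenSSL's
legacy MD5() from <openssl/md5.h> (build with -lcrypto); the 16-byte IV
size is an assumption for the demo:

#include <openssl/md5.h>
#include <stdio.h>
#include <string.h>

#define IV_BYTES 16

static void derive_iv(unsigned char *iv, const unsigned char *root_iv,
		      long offset)
{
	unsigned char src[IV_BYTES + 16];
	unsigned char dst[MD5_DIGEST_LENGTH];

	memcpy(src, root_iv, IV_BYTES);
	memset(src + IV_BYTES, 0, 16);
	snprintf((char *)(src + IV_BYTES), 16, "%ld", offset);
	MD5(src, IV_BYTES + 16, dst);		/* hash root IV + offset */
	memcpy(iv, dst, IV_BYTES);		/* truncate to IV size */
}

int main(void)
{
	unsigned char root_iv[IV_BYTES] = { 0 }, iv[IV_BYTES];
	int i;

	derive_iv(iv, root_iv, 3);	/* per-page IV for page 3 */
	for (i = 0; i < IV_BYTES; i++)
		printf("%.2x", iv[i]);
	printf("\n");
	return 0;
}

Deriving each IV from the root IV and the offset keeps IVs unique per
page without storing one IV per page on disk.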
169/**
170 * ecryptfs_init_crypt_stat
171 * @crypt_stat: Pointer to the crypt_stat struct to initialize.
172 *
173 * Initialize the crypt_stat structure.
174 */
175void
176ecryptfs_init_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat)
177{
178 memset((void *)crypt_stat, 0, sizeof(struct ecryptfs_crypt_stat));
179 mutex_init(&crypt_stat->cs_mutex);
180 mutex_init(&crypt_stat->cs_tfm_mutex);
181 mutex_init(&crypt_stat->cs_md5_tfm_mutex);
182 ECRYPTFS_SET_FLAG(crypt_stat->flags, ECRYPTFS_STRUCT_INITIALIZED);
183}
184
185/**
186 * ecryptfs_destruct_crypt_stat
187 * @crypt_stat: Pointer to the crypt_stat struct to free.
188 *
189 * Releases all memory associated with a crypt_stat struct.
190 */
191void ecryptfs_destruct_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat)
192{
193 if (crypt_stat->tfm)
194 crypto_free_tfm(crypt_stat->tfm);
195 if (crypt_stat->md5_tfm)
196 crypto_free_tfm(crypt_stat->md5_tfm);
197 memset(crypt_stat, 0, sizeof(struct ecryptfs_crypt_stat));
198}
199
200void ecryptfs_destruct_mount_crypt_stat(
201 struct ecryptfs_mount_crypt_stat *mount_crypt_stat)
202{
203 if (mount_crypt_stat->global_auth_tok_key)
204 key_put(mount_crypt_stat->global_auth_tok_key);
205 if (mount_crypt_stat->global_key_tfm)
206 crypto_free_tfm(mount_crypt_stat->global_key_tfm);
207 memset(mount_crypt_stat, 0, sizeof(struct ecryptfs_mount_crypt_stat));
208}
209
210/**
211 * virt_to_scatterlist
212 * @addr: Virtual address
213 * @size: Size of data; should be an even multiple of the block size
214 * @sg: Pointer to scatterlist array; set to NULL to obtain only
215 * the number of scatterlist structs required in array
216 * @sg_size: Max array size
217 *
218 * Fills in a scatterlist array with page references for a passed
219 * virtual address.
220 *
221 * Returns the number of scatterlist structs used in the array
222 */
223int virt_to_scatterlist(const void *addr, int size, struct scatterlist *sg,
224 int sg_size)
225{
226 int i = 0;
227 struct page *pg;
228 int offset;
229 int remainder_of_page;
230
231 while (size > 0 && i < sg_size) {
232 pg = virt_to_page(addr);
233 offset = offset_in_page(addr);
234 if (sg) {
235 sg[i].page = pg;
236 sg[i].offset = offset;
237 }
238 remainder_of_page = PAGE_CACHE_SIZE - offset;
239 if (size >= remainder_of_page) {
240 if (sg)
241 sg[i].length = remainder_of_page;
242 addr += remainder_of_page;
243 size -= remainder_of_page;
244 } else {
245 if (sg)
246 sg[i].length = size;
247 addr += size;
248 size = 0;
249 }
250 i++;
251 }
252 if (size > 0)
253 return -ENOMEM;
254 return i;
255}
256
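The loop above is just page-walking arithmetic: each scatterlist entry
covers from the current offset to the end of its page, or to the end of
the buffer, whichever comes first. A user-space sketch of the entry count
it produces (PAGE_SIZE stands in for PAGE_CACHE_SIZE):

#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE 4096UL

static int count_sg_entries(uintptr_t addr, long size)
{
	int i = 0;

	while (size > 0) {
		long offset = addr & (PAGE_SIZE - 1);
		long chunk = PAGE_SIZE - offset;	/* rest of this page */

		if (chunk > size)
			chunk = size;
		addr += chunk;
		size -= chunk;
		i++;
	}
	return i;
}

int main(void)
{
	/* 9000 bytes starting 100 bytes into a page touch three pages. */
	printf("%d entries\n", count_sg_entries(0x1000 + 100, 9000));
	return 0;
}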
257/**
258 * encrypt_scatterlist
259 * @crypt_stat: Pointer to the crypt_stat struct holding the key and cipher context
260 * @dest_sg: Destination of encrypted data
261 * @src_sg: Data to be encrypted
262 * @size: Length of data to be encrypted
263 * @iv: IV to use during encryption
264 *
265 * Returns zero on success; negative value on error
266 */
267static int encrypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat,
268 struct scatterlist *dest_sg,
269 struct scatterlist *src_sg, int size,
270 unsigned char *iv)
271{
272 int rc = 0;
273
274 BUG_ON(!crypt_stat || !crypt_stat->tfm
275 || !ECRYPTFS_CHECK_FLAG(crypt_stat->flags,
276 ECRYPTFS_STRUCT_INITIALIZED));
277 if (unlikely(ecryptfs_verbosity > 0)) {
278 ecryptfs_printk(KERN_DEBUG, "Key size [%d]; key:\n",
279 crypt_stat->key_size);
280 ecryptfs_dump_hex(crypt_stat->key,
281 crypt_stat->key_size);
282 }
283 /* Consider doing this once, when the file is opened */
284 mutex_lock(&crypt_stat->cs_tfm_mutex);
285 rc = crypto_cipher_setkey(crypt_stat->tfm, crypt_stat->key,
286 crypt_stat->key_size);
287 if (rc) {
288 ecryptfs_printk(KERN_ERR, "Error setting key; rc = [%d]\n",
289 rc);
290 mutex_unlock(&crypt_stat->cs_tfm_mutex);
291 rc = -EINVAL;
292 goto out;
293 }
294 ecryptfs_printk(KERN_DEBUG, "Encrypting [%d] bytes.\n", size);
295 crypto_cipher_encrypt_iv(crypt_stat->tfm, dest_sg, src_sg, size, iv);
296 mutex_unlock(&crypt_stat->cs_tfm_mutex);
297out:
298 return rc;
299}
300
301static void
302ecryptfs_extent_to_lwr_pg_idx_and_offset(unsigned long *lower_page_idx,
303 int *byte_offset,
304 struct ecryptfs_crypt_stat *crypt_stat,
305 unsigned long extent_num)
306{
307 unsigned long lower_extent_num;
308 int extents_occupied_by_headers_at_front;
309 int bytes_occupied_by_headers_at_front;
310 int extent_offset;
311 int extents_per_page;
312
313 bytes_occupied_by_headers_at_front =
314 ( crypt_stat->header_extent_size
315 * crypt_stat->num_header_extents_at_front );
316 extents_occupied_by_headers_at_front =
317 ( bytes_occupied_by_headers_at_front
318 / crypt_stat->extent_size );
319 lower_extent_num = extents_occupied_by_headers_at_front + extent_num;
320 extents_per_page = PAGE_CACHE_SIZE / crypt_stat->extent_size;
321 (*lower_page_idx) = lower_extent_num / extents_per_page;
322 extent_offset = lower_extent_num % extents_per_page;
323 (*byte_offset) = extent_offset * crypt_stat->extent_size;
324 ecryptfs_printk(KERN_DEBUG, " * crypt_stat->header_extent_size = "
325 "[%d]\n", crypt_stat->header_extent_size);
326 ecryptfs_printk(KERN_DEBUG, " * crypt_stat->"
327 "num_header_extents_at_front = [%d]\n",
328 crypt_stat->num_header_extents_at_front);
329 ecryptfs_printk(KERN_DEBUG, " * extents_occupied_by_headers_at_"
330 "front = [%d]\n", extents_occupied_by_headers_at_front);
331 ecryptfs_printk(KERN_DEBUG, " * lower_extent_num = [0x%.16x]\n",
332 lower_extent_num);
333 ecryptfs_printk(KERN_DEBUG, " * extents_per_page = [%d]\n",
334 extents_per_page);
335 ecryptfs_printk(KERN_DEBUG, " * (*lower_page_idx) = [0x%.16x]\n",
336 (*lower_page_idx));
337 ecryptfs_printk(KERN_DEBUG, " * extent_offset = [%d]\n",
338 extent_offset);
339 ecryptfs_printk(KERN_DEBUG, " * (*byte_offset) = [%d]\n",
340 (*byte_offset));
341}
342
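Worked example of the mapping above: with 4K extents, 4K host pages, and
one 8K header extent at the front (a file created on an 8K-page machine),
data extent 0 lives at lower page 2, byte offset 0. The numbers are
illustrative; a user-space check:

#include <stdio.h>

#define PAGE_SIZE 4096UL

int main(void)
{
	unsigned long header_extent_size = 8192;
	unsigned long num_header_extents_at_front = 1;
	unsigned long extent_size = 4096;
	unsigned long extent_num = 0;		/* first data extent */

	unsigned long hdr_bytes = header_extent_size
				  * num_header_extents_at_front;
	unsigned long hdr_extents = hdr_bytes / extent_size;
	unsigned long lower_extent = hdr_extents + extent_num;
	unsigned long extents_per_page = PAGE_SIZE / extent_size;

	printf("lower page %lu, byte offset %lu\n",
	       lower_extent / extents_per_page,
	       (lower_extent % extents_per_page) * extent_size);
	return 0;
}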
343static int ecryptfs_write_out_page(struct ecryptfs_page_crypt_context *ctx,
344 struct page *lower_page,
345 struct inode *lower_inode,
346 int byte_offset_in_page, int bytes_to_write)
347{
348 int rc = 0;
349
350 if (ctx->mode == ECRYPTFS_PREPARE_COMMIT_MODE) {
351 rc = ecryptfs_commit_lower_page(lower_page, lower_inode,
352 ctx->param.lower_file,
353 byte_offset_in_page,
354 bytes_to_write);
355 if (rc) {
356 ecryptfs_printk(KERN_ERR, "Error calling lower "
357 "commit; rc = [%d]\n", rc);
358 goto out;
359 }
360 } else {
361 rc = ecryptfs_writepage_and_release_lower_page(lower_page,
362 lower_inode,
363 ctx->param.wbc);
364 if (rc) {
365 ecryptfs_printk(KERN_ERR, "Error calling lower "
366 "writepage(); rc = [%d]\n", rc);
367 goto out;
368 }
369 }
370out:
371 return rc;
372}
373
374static int ecryptfs_read_in_page(struct ecryptfs_page_crypt_context *ctx,
375 struct page **lower_page,
376 struct inode *lower_inode,
377 unsigned long lower_page_idx,
378 int byte_offset_in_page)
379{
380 int rc = 0;
381
382 if (ctx->mode == ECRYPTFS_PREPARE_COMMIT_MODE) {
383 /* TODO: Limit this to only the data extents that are
384 * needed */
385 rc = ecryptfs_get_lower_page(lower_page, lower_inode,
386 ctx->param.lower_file,
387 lower_page_idx,
388 byte_offset_in_page,
389 (PAGE_CACHE_SIZE
390 - byte_offset_in_page));
391 if (rc) {
392 ecryptfs_printk(
393 KERN_ERR, "Error attempting to grab, map, "
394 "and prepare_write lower page with index "
395 "[0x%.16x]; rc = [%d]\n", lower_page_idx, rc);
396 goto out;
397 }
398 } else {
399 rc = ecryptfs_grab_and_map_lower_page(lower_page, NULL,
400 lower_inode,
401 lower_page_idx);
402 if (rc) {
403 ecryptfs_printk(
404 KERN_ERR, "Error attempting to grab and map "
405 "lower page with index [0x%.16x]; rc = [%d]\n",
406 lower_page_idx, rc);
407 goto out;
408 }
409 }
410out:
411 return rc;
412}
413
414/**
415 * ecryptfs_encrypt_page
416 * @ctx: The context of the page
417 *
418 * Encrypt an eCryptfs page. This is done on a per-extent basis. Note
419 * that eCryptfs pages may straddle the lower pages -- for instance,
420 * if the file was created on a machine with an 8K page size
421 * (resulting in an 8K header), and then the file is copied onto a
422 * host with a 32K page size, then when reading page 0 of the eCryptfs
423 * file, 24K of page 0 of the lower file will be read and decrypted,
424 * and then 8K of page 1 of the lower file will be read and decrypted.
425 *
426 * The actual operations performed on each page depends on the
427 * contents of the ecryptfs_page_crypt_context struct.
428 *
429 * Returns zero on success; negative on error
430 */
431int ecryptfs_encrypt_page(struct ecryptfs_page_crypt_context *ctx)
432{
433 char extent_iv[ECRYPTFS_MAX_IV_BYTES];
434 unsigned long base_extent;
435 unsigned long extent_offset = 0;
436 unsigned long lower_page_idx = 0;
437 unsigned long prior_lower_page_idx = 0;
438 struct page *lower_page;
439 struct inode *lower_inode;
440 struct ecryptfs_inode_info *inode_info;
441 struct ecryptfs_crypt_stat *crypt_stat;
442 int rc = 0;
443 int lower_byte_offset = 0;
444 int orig_byte_offset = 0;
445 int num_extents_per_page;
446#define ECRYPTFS_PAGE_STATE_UNREAD 0
447#define ECRYPTFS_PAGE_STATE_READ 1
448#define ECRYPTFS_PAGE_STATE_MODIFIED 2
449#define ECRYPTFS_PAGE_STATE_WRITTEN 3
450 int page_state;
451
452 lower_inode = ecryptfs_inode_to_lower(ctx->page->mapping->host);
453 inode_info = ecryptfs_inode_to_private(ctx->page->mapping->host);
454 crypt_stat = &inode_info->crypt_stat;
455 if (!ECRYPTFS_CHECK_FLAG(crypt_stat->flags, ECRYPTFS_ENCRYPTED)) {
456 rc = ecryptfs_copy_page_to_lower(ctx->page, lower_inode,
457 ctx->param.lower_file);
458 if (rc)
459 ecryptfs_printk(KERN_ERR, "Error attempting to copy "
460 "page at index [0x%.16x]\n",
461 ctx->page->index);
462 goto out;
463 }
464 num_extents_per_page = PAGE_CACHE_SIZE / crypt_stat->extent_size;
465 base_extent = (ctx->page->index * num_extents_per_page);
466 page_state = ECRYPTFS_PAGE_STATE_UNREAD;
467 while (extent_offset < num_extents_per_page) {
468 ecryptfs_extent_to_lwr_pg_idx_and_offset(
469 &lower_page_idx, &lower_byte_offset, crypt_stat,
470 (base_extent + extent_offset));
471 if (prior_lower_page_idx != lower_page_idx
472 && page_state == ECRYPTFS_PAGE_STATE_MODIFIED) {
473 rc = ecryptfs_write_out_page(ctx, lower_page,
474 lower_inode,
475 orig_byte_offset,
476 (PAGE_CACHE_SIZE
477 - orig_byte_offset));
478 if (rc) {
479 ecryptfs_printk(KERN_ERR, "Error attempting "
480 "to write out page; rc = [%d]"
481 "\n", rc);
482 goto out;
483 }
484 page_state = ECRYPTFS_PAGE_STATE_WRITTEN;
485 }
486 if (page_state == ECRYPTFS_PAGE_STATE_UNREAD
487 || page_state == ECRYPTFS_PAGE_STATE_WRITTEN) {
488 rc = ecryptfs_read_in_page(ctx, &lower_page,
489 lower_inode, lower_page_idx,
490 lower_byte_offset);
491 if (rc) {
492 ecryptfs_printk(KERN_ERR, "Error attempting "
493 "to read in lower page with "
494 "index [0x%.16x]; rc = [%d]\n",
495 lower_page_idx, rc);
496 goto out;
497 }
498 orig_byte_offset = lower_byte_offset;
499 prior_lower_page_idx = lower_page_idx;
500 page_state = ECRYPTFS_PAGE_STATE_READ;
501 }
502 BUG_ON(!(page_state == ECRYPTFS_PAGE_STATE_MODIFIED
503 || page_state == ECRYPTFS_PAGE_STATE_READ));
504 rc = ecryptfs_derive_iv(extent_iv, crypt_stat,
505 (base_extent + extent_offset));
506 if (rc) {
507 ecryptfs_printk(KERN_ERR, "Error attempting to "
508 "derive IV for extent [0x%.16x]; "
509 "rc = [%d]\n",
510 (base_extent + extent_offset), rc);
511 goto out;
512 }
513 if (unlikely(ecryptfs_verbosity > 0)) {
514 ecryptfs_printk(KERN_DEBUG, "Encrypting extent "
515 "with iv:\n");
516 ecryptfs_dump_hex(extent_iv, crypt_stat->iv_bytes);
517 ecryptfs_printk(KERN_DEBUG, "First 8 bytes before "
518 "encryption:\n");
519 ecryptfs_dump_hex((char *)
520 (page_address(ctx->page)
521 + (extent_offset
522 * crypt_stat->extent_size)), 8);
523 }
524 rc = ecryptfs_encrypt_page_offset(
525 crypt_stat, lower_page, lower_byte_offset, ctx->page,
526 (extent_offset * crypt_stat->extent_size),
527 crypt_stat->extent_size, extent_iv);
528 ecryptfs_printk(KERN_DEBUG, "Encrypt extent [0x%.16x]; "
529 "rc = [%d]\n",
530 (base_extent + extent_offset), rc);
531 if (unlikely(ecryptfs_verbosity > 0)) {
532 ecryptfs_printk(KERN_DEBUG, "First 8 bytes after "
533 "encryption:\n");
534 ecryptfs_dump_hex((char *)(page_address(lower_page)
535 + lower_byte_offset), 8);
536 }
537 page_state = ECRYPTFS_PAGE_STATE_MODIFIED;
538 extent_offset++;
539 }
540 BUG_ON(orig_byte_offset != 0);
541 rc = ecryptfs_write_out_page(ctx, lower_page, lower_inode, 0,
542 (lower_byte_offset
543 + crypt_stat->extent_size));
544 if (rc) {
545 ecryptfs_printk(KERN_ERR, "Error attempting to write out "
546 "page; rc = [%d]\n", rc);
547 goto out;
548 }
549out:
550 return rc;
551}
552
553/**
554 * ecryptfs_decrypt_page
555 * @file: The ecryptfs file
556 * @page: The page in ecryptfs to decrypt
557 *
558 * Decrypt an eCryptfs page. This is done on a per-extent basis. Note
559 * that eCryptfs pages may straddle the lower pages -- for instance,
560 * if the file was created on a machine with an 8K page size
561 * (resulting in an 8K header), and then the file is copied onto a
562 * host with a 32K page size, then when reading page 0 of the eCryptfs
563 * file, 24K of page 0 of the lower file will be read and decrypted,
564 * and then 8K of page 1 of the lower file will be read and decrypted.
565 *
566 * Returns zero on success; negative on error
567 */
568int ecryptfs_decrypt_page(struct file *file, struct page *page)
569{
570 char extent_iv[ECRYPTFS_MAX_IV_BYTES];
571 unsigned long base_extent;
572 unsigned long extent_offset = 0;
573 unsigned long lower_page_idx = 0;
574 unsigned long prior_lower_page_idx = 0;
575 struct page *lower_page;
576 char *lower_page_virt = NULL;
577 struct inode *lower_inode;
578 struct ecryptfs_crypt_stat *crypt_stat;
579 int rc = 0;
580 int byte_offset;
581 int num_extents_per_page;
582 int page_state;
583
584 crypt_stat = &(ecryptfs_inode_to_private(
585 page->mapping->host)->crypt_stat);
586 lower_inode = ecryptfs_inode_to_lower(page->mapping->host);
587 if (!ECRYPTFS_CHECK_FLAG(crypt_stat->flags, ECRYPTFS_ENCRYPTED)) {
588 rc = ecryptfs_do_readpage(file, page, page->index);
589 if (rc)
590 ecryptfs_printk(KERN_ERR, "Error attempting to copy "
591 "page at index [0x%.16x]\n",
592 page->index);
593 goto out;
594 }
595 num_extents_per_page = PAGE_CACHE_SIZE / crypt_stat->extent_size;
596 base_extent = (page->index * num_extents_per_page);
597 lower_page_virt = kmem_cache_alloc(ecryptfs_lower_page_cache,
598 SLAB_KERNEL);
599 if (!lower_page_virt) {
600 rc = -ENOMEM;
601 ecryptfs_printk(KERN_ERR, "Error getting page for encrypted "
602 "lower page(s)\n");
603 goto out;
604 }
605 lower_page = virt_to_page(lower_page_virt);
606 page_state = ECRYPTFS_PAGE_STATE_UNREAD;
607 while (extent_offset < num_extents_per_page) {
608 ecryptfs_extent_to_lwr_pg_idx_and_offset(
609 &lower_page_idx, &byte_offset, crypt_stat,
610 (base_extent + extent_offset));
611 if (prior_lower_page_idx != lower_page_idx
612 || page_state == ECRYPTFS_PAGE_STATE_UNREAD) {
613 rc = ecryptfs_do_readpage(file, lower_page,
614 lower_page_idx);
615 if (rc) {
616 ecryptfs_printk(KERN_ERR, "Error reading "
617 "lower encrypted page; rc = "
618 "[%d]\n", rc);
619 goto out;
620 }
621 prior_lower_page_idx = lower_page_idx;
622 page_state = ECRYPTFS_PAGE_STATE_READ;
623 }
624 rc = ecryptfs_derive_iv(extent_iv, crypt_stat,
625 (base_extent + extent_offset));
626 if (rc) {
627 ecryptfs_printk(KERN_ERR, "Error attempting to "
628 "derive IV for extent [0x%.16x]; rc = "
629 "[%d]\n",
630 (base_extent + extent_offset), rc);
631 goto out;
632 }
633 if (unlikely(ecryptfs_verbosity > 0)) {
634 ecryptfs_printk(KERN_DEBUG, "Decrypting extent "
635 "with iv:\n");
636 ecryptfs_dump_hex(extent_iv, crypt_stat->iv_bytes);
637 ecryptfs_printk(KERN_DEBUG, "First 8 bytes before "
638 "decryption:\n");
639 ecryptfs_dump_hex((lower_page_virt + byte_offset), 8);
640 }
641 rc = ecryptfs_decrypt_page_offset(crypt_stat, page,
642 (extent_offset
643 * crypt_stat->extent_size),
644 lower_page, byte_offset,
645 crypt_stat->extent_size,
646 extent_iv);
647 if (rc != crypt_stat->extent_size) {
648 ecryptfs_printk(KERN_ERR, "Error attempting to "
649 "decrypt extent [0x%.16x]\n",
650 (base_extent + extent_offset));
651 goto out;
652 }
653 rc = 0;
654 if (unlikely(ecryptfs_verbosity > 0)) {
655 ecryptfs_printk(KERN_DEBUG, "First 8 bytes after "
656 "decryption:\n");
657 ecryptfs_dump_hex((char *)(page_address(page)
658 + byte_offset), 8);
659 }
660 extent_offset++;
661 }
662out:
663 if (lower_page_virt)
664 kmem_cache_free(ecryptfs_lower_page_cache, lower_page_virt);
665 return rc;
666}
667
668/**
669 * decrypt_scatterlist
670 *
671 * Returns the number of bytes decrypted; negative value on error
672 */
673static int decrypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat,
674 struct scatterlist *dest_sg,
675 struct scatterlist *src_sg, int size,
676 unsigned char *iv)
677{
678 int rc = 0;
679
680 /* Consider doing this once, when the file is opened */
681 mutex_lock(&crypt_stat->cs_tfm_mutex);
682 rc = crypto_cipher_setkey(crypt_stat->tfm, crypt_stat->key,
683 crypt_stat->key_size);
684 if (rc) {
685 ecryptfs_printk(KERN_ERR, "Error setting key; rc = [%d]\n",
686 rc);
687 mutex_unlock(&crypt_stat->cs_tfm_mutex);
688 rc = -EINVAL;
689 goto out;
690 }
691 ecryptfs_printk(KERN_DEBUG, "Decrypting [%d] bytes.\n", size);
692 rc = crypto_cipher_decrypt_iv(crypt_stat->tfm, dest_sg, src_sg, size,
693 iv);
694 mutex_unlock(&crypt_stat->cs_tfm_mutex);
695 if (rc) {
696 ecryptfs_printk(KERN_ERR, "Error decrypting; rc = [%d]\n",
697 rc);
698 goto out;
699 }
700 rc = size;
701out:
702 return rc;
703}
704
705/**
706 * ecryptfs_encrypt_page_offset
707 *
708 * Returns zero on success; negative value on error
709 */
710static int
711ecryptfs_encrypt_page_offset(struct ecryptfs_crypt_stat *crypt_stat,
712 struct page *dst_page, int dst_offset,
713 struct page *src_page, int src_offset, int size,
714 unsigned char *iv)
715{
716 struct scatterlist src_sg, dst_sg;
717
718 src_sg.page = src_page;
719 src_sg.offset = src_offset;
720 src_sg.length = size;
721 dst_sg.page = dst_page;
722 dst_sg.offset = dst_offset;
723 dst_sg.length = size;
724 return encrypt_scatterlist(crypt_stat, &dst_sg, &src_sg, size, iv);
725}
726
727/**
728 * ecryptfs_decrypt_page_offset
729 *
730 * Returns the number of bytes decrypted
731 */
732static int
733ecryptfs_decrypt_page_offset(struct ecryptfs_crypt_stat *crypt_stat,
734 struct page *dst_page, int dst_offset,
735 struct page *src_page, int src_offset, int size,
736 unsigned char *iv)
737{
738 struct scatterlist src_sg, dst_sg;
739
740 src_sg.page = src_page;
741 src_sg.offset = src_offset;
742 src_sg.length = size;
743 dst_sg.page = dst_page;
744 dst_sg.offset = dst_offset;
745 dst_sg.length = size;
746 return decrypt_scatterlist(crypt_stat, &dst_sg, &src_sg, size, iv);
747}
748
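Both scatterlist paths boil down to: set the key, then run a CBC encrypt
or decrypt with an explicit IV over one extent. A user-space round trip
of that shape, assuming OpenSSL's EVP API (build with -lcrypto);
eCryptfs itself uses the in-kernel crypto API:

#include <openssl/evp.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	unsigned char key[16] = "0123456789abcde";	/* 16 bytes incl. NUL */
	unsigned char iv[16] = { 0 };			/* per-extent IV */
	unsigned char pt[16] = "extent payload!";
	unsigned char ct[16], back[16];
	int len;
	EVP_CIPHER_CTX *ctx = EVP_CIPHER_CTX_new();

	/* Encrypt: key + IV; padding off since extents are block-aligned. */
	EVP_EncryptInit_ex(ctx, EVP_aes_128_cbc(), NULL, key, iv);
	EVP_CIPHER_CTX_set_padding(ctx, 0);
	EVP_EncryptUpdate(ctx, ct, &len, pt, sizeof(pt));

	/* Decrypt with the same key and IV. */
	EVP_DecryptInit_ex(ctx, EVP_aes_128_cbc(), NULL, key, iv);
	EVP_CIPHER_CTX_set_padding(ctx, 0);
	EVP_DecryptUpdate(ctx, back, &len, ct, sizeof(ct));

	printf("round trip %s\n",
	       memcmp(pt, back, sizeof(pt)) ? "failed" : "ok");
	EVP_CIPHER_CTX_free(ctx);
	return 0;
}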
749#define ECRYPTFS_MAX_SCATTERLIST_LEN 4
750
751/**
752 * ecryptfs_init_crypt_ctx
753 * @crypt_stat: Uninitialized crypt_stat structure
754 *
755 * Initialize the crypto context.
756 *
757 * TODO: Performance: Keep a cache of initialized cipher contexts;
758 * only init if needed
759 */
760int ecryptfs_init_crypt_ctx(struct ecryptfs_crypt_stat *crypt_stat)
761{
762 int rc = -EINVAL;
763
764 if (!crypt_stat->cipher) {
765 ecryptfs_printk(KERN_ERR, "No cipher specified\n");
766 goto out;
767 }
768 ecryptfs_printk(KERN_DEBUG,
769 "Initializing cipher [%s]; strlen = [%d]; "
770 "key_size_bits = [%d]\n",
771 crypt_stat->cipher, (int)strlen(crypt_stat->cipher),
772 crypt_stat->key_size << 3);
773 if (crypt_stat->tfm) {
774 rc = 0;
775 goto out;
776 }
777 mutex_lock(&crypt_stat->cs_tfm_mutex);
778 crypt_stat->tfm = crypto_alloc_tfm(crypt_stat->cipher,
779 ECRYPTFS_DEFAULT_CHAINING_MODE
780 | CRYPTO_TFM_REQ_WEAK_KEY);
781 mutex_unlock(&crypt_stat->cs_tfm_mutex);
782 if (!crypt_stat->tfm) {
783 ecryptfs_printk(KERN_ERR, "cryptfs: init_crypt_ctx(): "
784 "Error initializing cipher [%s]\n",
785 crypt_stat->cipher);
786 goto out;
787 }
788 rc = 0;
789out:
790 return rc;
791}
792
793static void set_extent_mask_and_shift(struct ecryptfs_crypt_stat *crypt_stat)
794{
795 int extent_size_tmp;
796
797 crypt_stat->extent_mask = 0xFFFFFFFF;
798 crypt_stat->extent_shift = 0;
799 if (crypt_stat->extent_size == 0)
800 return;
801 extent_size_tmp = crypt_stat->extent_size;
802 while ((extent_size_tmp & 0x01) == 0) {
803 extent_size_tmp >>= 1;
804 crypt_stat->extent_mask <<= 1;
805 crypt_stat->extent_shift++;
806 }
807}
808
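For any power-of-two extent size this yields the usual shift/mask pair;
a 4096-byte extent gives shift 12 and (as a 32-bit value) mask
0xfffff000. A quick user-space check of the same loop:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t mask = 0xFFFFFFFF;
	int shift = 0;
	int tmp = 4096;			/* extent_size */

	while ((tmp & 0x01) == 0) {	/* strip trailing zero bits */
		tmp >>= 1;
		mask <<= 1;
		shift++;
	}
	printf("shift=%d mask=0x%.8x\n", shift, mask);
	return 0;
}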
809void ecryptfs_set_default_sizes(struct ecryptfs_crypt_stat *crypt_stat)
810{
811 /* Default values; may be overwritten as we are parsing the
812 * packets. */
813 crypt_stat->extent_size = ECRYPTFS_DEFAULT_EXTENT_SIZE;
814 set_extent_mask_and_shift(crypt_stat);
815 crypt_stat->iv_bytes = ECRYPTFS_DEFAULT_IV_BYTES;
816 if (PAGE_CACHE_SIZE <= ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE) {
817 crypt_stat->header_extent_size =
818 ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE;
819 } else
820 crypt_stat->header_extent_size = PAGE_CACHE_SIZE;
821 crypt_stat->num_header_extents_at_front = 1;
822}
823
824/**
825 * ecryptfs_compute_root_iv
826 * @crypt_stat
827 *
828 * On error, sets the root IV to all 0's.
829 */
830int ecryptfs_compute_root_iv(struct ecryptfs_crypt_stat *crypt_stat)
831{
832 int rc = 0;
833 char dst[MD5_DIGEST_SIZE];
834
835 BUG_ON(crypt_stat->iv_bytes > MD5_DIGEST_SIZE);
836 BUG_ON(crypt_stat->iv_bytes <= 0);
837 if (!ECRYPTFS_CHECK_FLAG(crypt_stat->flags, ECRYPTFS_KEY_VALID)) {
838 rc = -EINVAL;
839 ecryptfs_printk(KERN_WARNING, "Session key not valid; "
840 "cannot generate root IV\n");
841 goto out;
842 }
843 rc = ecryptfs_calculate_md5(dst, crypt_stat, crypt_stat->key,
844 crypt_stat->key_size);
845 if (rc) {
846 ecryptfs_printk(KERN_WARNING, "Error attempting to compute "
847 "MD5 while generating root IV\n");
848 goto out;
849 }
850 memcpy(crypt_stat->root_iv, dst, crypt_stat->iv_bytes);
851out:
852 if (rc) {
853 memset(crypt_stat->root_iv, 0, crypt_stat->iv_bytes);
854 ECRYPTFS_SET_FLAG(crypt_stat->flags,
855 ECRYPTFS_SECURITY_WARNING);
856 }
857 return rc;
858}
859
860static void ecryptfs_generate_new_key(struct ecryptfs_crypt_stat *crypt_stat)
861{
862 get_random_bytes(crypt_stat->key, crypt_stat->key_size);
863 ECRYPTFS_SET_FLAG(crypt_stat->flags, ECRYPTFS_KEY_VALID);
864 ecryptfs_compute_root_iv(crypt_stat);
865 if (unlikely(ecryptfs_verbosity > 0)) {
866 ecryptfs_printk(KERN_DEBUG, "Generated new session key:\n");
867 ecryptfs_dump_hex(crypt_stat->key,
868 crypt_stat->key_size);
869 }
870}
871
872/**
873 * ecryptfs_set_default_crypt_stat_vals
874 * @crypt_stat
875 *
876 * Default values in the event that policy does not override them.
877 */
878static void ecryptfs_set_default_crypt_stat_vals(
879 struct ecryptfs_crypt_stat *crypt_stat,
880 struct ecryptfs_mount_crypt_stat *mount_crypt_stat)
881{
882 ecryptfs_set_default_sizes(crypt_stat);
883 strcpy(crypt_stat->cipher, ECRYPTFS_DEFAULT_CIPHER);
884 crypt_stat->key_size = ECRYPTFS_DEFAULT_KEY_BYTES;
885 ECRYPTFS_CLEAR_FLAG(crypt_stat->flags, ECRYPTFS_KEY_VALID);
886 crypt_stat->file_version = ECRYPTFS_FILE_VERSION;
887 crypt_stat->mount_crypt_stat = mount_crypt_stat;
888}
889
890/**
891 * ecryptfs_new_file_context
892 * @ecryptfs_dentry
893 *
894 * If the crypto context for the file has not yet been established,
895 * this is where we do that. Establishing a new crypto context
896 * involves the following decisions:
897 * - What cipher to use?
898 * - What set of authentication tokens to use?
899 * Here we just worry about getting enough information into the
900 * authentication tokens so that we know that they are available.
901 * We associate the available authentication tokens with the new file
902 * via the set of signatures in the crypt_stat struct. Later, when
903 * the headers are actually written out, we may again defer to
904 * userspace to perform the encryption of the session key; for the
905 * foreseeable future, this will be the case with public key packets.
906 *
907 * Returns zero on success; non-zero otherwise
908 */
909/* Associate authentication token(s) with the file */
910int ecryptfs_new_file_context(struct dentry *ecryptfs_dentry)
911{
912 int rc = 0;
913 struct ecryptfs_crypt_stat *crypt_stat =
914 &ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->crypt_stat;
915 struct ecryptfs_mount_crypt_stat *mount_crypt_stat =
916 &ecryptfs_superblock_to_private(
917 ecryptfs_dentry->d_sb)->mount_crypt_stat;
918 int cipher_name_len;
919
920 ecryptfs_set_default_crypt_stat_vals(crypt_stat, mount_crypt_stat);
921 /* See if there are mount crypt options */
922 if (mount_crypt_stat->global_auth_tok) {
923 ecryptfs_printk(KERN_DEBUG, "Initializing context for new "
924 "file using mount_crypt_stat\n");
925 ECRYPTFS_SET_FLAG(crypt_stat->flags, ECRYPTFS_ENCRYPTED);
926 ECRYPTFS_SET_FLAG(crypt_stat->flags, ECRYPTFS_KEY_VALID);
927 memcpy(crypt_stat->keysigs[crypt_stat->num_keysigs++],
928 mount_crypt_stat->global_auth_tok_sig,
929 ECRYPTFS_SIG_SIZE_HEX);
930 cipher_name_len =
931 strlen(mount_crypt_stat->global_default_cipher_name);
932 memcpy(crypt_stat->cipher,
933 mount_crypt_stat->global_default_cipher_name,
934 cipher_name_len);
935 crypt_stat->cipher[cipher_name_len] = '\0';
936 crypt_stat->key_size =
937 mount_crypt_stat->global_default_cipher_key_size;
938 ecryptfs_generate_new_key(crypt_stat);
939 } else
940 /* We should not encounter this scenario since we
941 * should detect lack of global_auth_tok at mount time
942 * TODO: Applies to 0.1 release only; remove in future
943 * release */
944 BUG();
945 rc = ecryptfs_init_crypt_ctx(crypt_stat);
946 if (rc)
947 ecryptfs_printk(KERN_ERR, "Error initializing cryptographic "
948 "context for cipher [%s]: rc = [%d]\n",
949 crypt_stat->cipher, rc);
950 return rc;
951}
952
953/**
954 * contains_ecryptfs_marker - check for the ecryptfs marker
955 * @data: The data block in which to check
956 *
957 * Returns one if marker found; zero if not found
958 */
959int contains_ecryptfs_marker(char *data)
960{
961 u32 m_1, m_2;
962
963 memcpy(&m_1, data, 4);
964 m_1 = be32_to_cpu(m_1);
965 memcpy(&m_2, (data + 4), 4);
966 m_2 = be32_to_cpu(m_2);
967 if ((m_1 ^ MAGIC_ECRYPTFS_MARKER) == m_2)
968 return 1;
969 ecryptfs_printk(KERN_DEBUG, "m_1 = [0x%.8x]; m_2 = [0x%.8x]; "
970 "MAGIC_ECRYPTFS_MARKER = [0x%.8x]\n", m_1, m_2,
971 MAGIC_ECRYPTFS_MARKER);
972 ecryptfs_printk(KERN_DEBUG, "(m_1 ^ MAGIC_ECRYPTFS_MARKER) = "
973 "[0x%.8x]\n", (m_1 ^ MAGIC_ECRYPTFS_MARKER));
974 return 0;
975}
976
977struct ecryptfs_flag_map_elem {
978 u32 file_flag;
979 u32 local_flag;
980};
981
982/* Add support for additional flags by adding elements here. */
983static struct ecryptfs_flag_map_elem ecryptfs_flag_map[] = {
984 {0x00000001, ECRYPTFS_ENABLE_HMAC},
985 {0x00000002, ECRYPTFS_ENCRYPTED}
986};
987
988/**
989 * ecryptfs_process_flags
990 * @crypt_stat
991 * @page_virt: Source data to be parsed
992 * @bytes_read: Updated with the number of bytes read
993 *
994 * Returns zero on success; non-zero if the flag set is invalid
995 */
996static int ecryptfs_process_flags(struct ecryptfs_crypt_stat *crypt_stat,
997 char *page_virt, int *bytes_read)
998{
999 int rc = 0;
1000 int i;
1001 u32 flags;
1002
1003 memcpy(&flags, page_virt, 4);
1004 flags = be32_to_cpu(flags);
1005 for (i = 0; i < ((sizeof(ecryptfs_flag_map)
1006 / sizeof(struct ecryptfs_flag_map_elem))); i++)
1007 if (flags & ecryptfs_flag_map[i].file_flag) {
1008 ECRYPTFS_SET_FLAG(crypt_stat->flags,
1009 ecryptfs_flag_map[i].local_flag);
1010 } else
1011 ECRYPTFS_CLEAR_FLAG(crypt_stat->flags,
1012 ecryptfs_flag_map[i].local_flag);
1013 /* Version is in top 8 bits of the 32-bit flag vector */
1014 crypt_stat->file_version = ((flags >> 24) & 0xFF);
1015 (*bytes_read) = 4;
1016 return rc;
1017}
1018
1019/**
1020 * write_ecryptfs_marker
1021 * @page_virt: The pointer to in a page to begin writing the marker
1022 * @written: Number of bytes written
1023 *
1024 * Marker = 0x3c81b7f5
1025 */
1026static void write_ecryptfs_marker(char *page_virt, size_t *written)
1027{
1028 u32 m_1, m_2;
1029
1030 get_random_bytes(&m_1, (MAGIC_ECRYPTFS_MARKER_SIZE_BYTES / 2));
1031 m_2 = (m_1 ^ MAGIC_ECRYPTFS_MARKER);
1032 m_1 = cpu_to_be32(m_1);
1033 memcpy(page_virt, &m_1, (MAGIC_ECRYPTFS_MARKER_SIZE_BYTES / 2));
1034 m_2 = cpu_to_be32(m_2);
1035 memcpy(page_virt + (MAGIC_ECRYPTFS_MARKER_SIZE_BYTES / 2), &m_2,
1036 (MAGIC_ECRYPTFS_MARKER_SIZE_BYTES / 2));
1037 (*written) = MAGIC_ECRYPTFS_MARKER_SIZE_BYTES;
1038}
1039
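The marker is randomized: m_1 is fresh random data and m_2 = m_1 ^
MAGIC_ECRYPTFS_MARKER, so contains_ecryptfs_marker() can verify a file
without a fixed byte pattern ever appearing on disk. A user-space sketch
of the write/verify pair:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define MAGIC_ECRYPTFS_MARKER 0x3c81b7f5

int main(void)
{
	uint32_t m_1, m_2;

	srand((unsigned)time(NULL));
	m_1 = (uint32_t)rand();			/* write side */
	m_2 = m_1 ^ MAGIC_ECRYPTFS_MARKER;

	printf("marker %s\n",			/* read side */
	       ((m_1 ^ MAGIC_ECRYPTFS_MARKER) == m_2) ? "found" : "missing");
	return 0;
}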
1040static void
1041write_ecryptfs_flags(char *page_virt, struct ecryptfs_crypt_stat *crypt_stat,
1042 size_t *written)
1043{
1044 u32 flags = 0;
1045 int i;
1046
1047 for (i = 0; i < ((sizeof(ecryptfs_flag_map)
1048 / sizeof(struct ecryptfs_flag_map_elem))); i++)
1049 if (ECRYPTFS_CHECK_FLAG(crypt_stat->flags,
1050 ecryptfs_flag_map[i].local_flag))
1051 flags |= ecryptfs_flag_map[i].file_flag;
1052 /* Version is in top 8 bits of the 32-bit flag vector */
1053 flags |= ((((u8)crypt_stat->file_version) << 24) & 0xFF000000);
1054 flags = cpu_to_be32(flags);
1055 memcpy(page_virt, &flags, 4);
1056 (*written) = 4;
1057}
1058
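A user-space sketch of the flag word that ecryptfs_process_flags() parses
and write_ecryptfs_flags() emits: feature bits in the low bits, file
version in the top 8 bits (the 0x00000002 "encrypted" bit comes from the
flag map above):

#include <stdint.h>
#include <stdio.h>

#define DEMO_FLAG_ENCRYPTED 0x00000002

int main(void)
{
	uint8_t version = 1;
	uint32_t flags = DEMO_FLAG_ENCRYPTED;

	/* pack, as write_ecryptfs_flags() does */
	flags |= (((uint32_t)version) << 24) & 0xFF000000;

	/* unpack, as ecryptfs_process_flags() does */
	printf("version=%u encrypted=%s\n",
	       (flags >> 24) & 0xFF,
	       (flags & DEMO_FLAG_ENCRYPTED) ? "yes" : "no");
	return 0;
}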
1059struct ecryptfs_cipher_code_str_map_elem {
1060 char cipher_str[16];
1061 u16 cipher_code;
1062};
1063
1064/* Add support for additional ciphers by adding elements here. The
1065 * cipher_code is whatever OpenPGP applications use to identify the
1066 * ciphers. List in order of probability. */
1067static struct ecryptfs_cipher_code_str_map_elem
1068ecryptfs_cipher_code_str_map[] = {
1069	{"aes", RFC2440_CIPHER_AES_128},
1070 {"blowfish", RFC2440_CIPHER_BLOWFISH},
1071 {"des3_ede", RFC2440_CIPHER_DES3_EDE},
1072 {"cast5", RFC2440_CIPHER_CAST_5},
1073 {"twofish", RFC2440_CIPHER_TWOFISH},
1074 {"cast6", RFC2440_CIPHER_CAST_6},
1075 {"aes", RFC2440_CIPHER_AES_192},
1076 {"aes", RFC2440_CIPHER_AES_256}
1077};
1078
1079/**
1080 * ecryptfs_code_for_cipher_string
1081 * @crypt_stat: The crypt_stat struct providing the cipher name and key size
1082 *
1083 * Returns zero on no match, or the cipher code on match
1084 */
1085u16 ecryptfs_code_for_cipher_string(struct ecryptfs_crypt_stat *crypt_stat)
1086{
1087 int i;
1088 u16 code = 0;
1089 struct ecryptfs_cipher_code_str_map_elem *map =
1090 ecryptfs_cipher_code_str_map;
1091
1092 if (strcmp(crypt_stat->cipher, "aes") == 0) {
1093 switch (crypt_stat->key_size) {
1094 case 16:
1095 code = RFC2440_CIPHER_AES_128;
1096 break;
1097 case 24:
1098 code = RFC2440_CIPHER_AES_192;
1099 break;
1100 case 32:
1101 code = RFC2440_CIPHER_AES_256;
1102 }
1103 } else {
1104 for (i = 0; i < ARRAY_SIZE(ecryptfs_cipher_code_str_map); i++)
1105 if (strcmp(crypt_stat->cipher, map[i].cipher_str) == 0){
1106 code = map[i].cipher_code;
1107 break;
1108 }
1109 }
1110 return code;
1111}
1112
1113/**
1114 * ecryptfs_cipher_code_to_string
1115 * @str: Destination to write out the cipher name
1116 * @cipher_code: The code to convert to cipher name string
1117 *
1118 * Returns zero on success
1119 */
1120int ecryptfs_cipher_code_to_string(char *str, u16 cipher_code)
1121{
1122 int rc = 0;
1123 int i;
1124
1125 str[0] = '\0';
1126 for (i = 0; i < ARRAY_SIZE(ecryptfs_cipher_code_str_map); i++)
1127 if (cipher_code == ecryptfs_cipher_code_str_map[i].cipher_code)
1128 strcpy(str, ecryptfs_cipher_code_str_map[i].cipher_str);
1129 if (str[0] == '\0') {
1130 ecryptfs_printk(KERN_WARNING, "Cipher code not recognized: "
1131 "[%d]\n", cipher_code);
1132 rc = -EINVAL;
1133 }
1134 return rc;
1135}
1136
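A user-space sketch of the string-to-code direction of this map. The
numeric values follow RFC 2440's symmetric-algorithm registry (CAST5 = 3,
Blowfish = 4, AES-128 = 7) and stand in for the RFC2440_* constants
above; treat the exact values as illustrative:

#include <stdio.h>
#include <string.h>

struct code_str_map_elem {
	char cipher_str[16];
	unsigned short cipher_code;
};

static struct code_str_map_elem map[] = {
	{"aes", 7},
	{"blowfish", 4},
	{"cast5", 3},
};

static unsigned short code_for_string(const char *str)
{
	unsigned int i;

	for (i = 0; i < sizeof(map) / sizeof(map[0]); i++)
		if (strcmp(str, map[i].cipher_str) == 0)
			return map[i].cipher_code;
	return 0;	/* no match, as in the kernel helper */
}

int main(void)
{
	printf("blowfish -> %u\n", code_for_string("blowfish"));
	return 0;
}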
1137/**
1138 * ecryptfs_read_header_region
1139 * @data
1140 * @dentry
1141 * @mnt
1142 *
1143 * Returns zero on success; non-zero otherwise
1144 */
1145int ecryptfs_read_header_region(char *data, struct dentry *dentry,
1146 struct vfsmount *mnt)
1147{
1148 struct file *file;
1149 mm_segment_t oldfs;
1150 int rc;
1151
1152 mnt = mntget(mnt);
1153 file = dentry_open(dentry, mnt, O_RDONLY);
1154 if (IS_ERR(file)) {
1155 ecryptfs_printk(KERN_DEBUG, "Error opening file to "
1156 "read header region\n");
1157 mntput(mnt);
1158 rc = PTR_ERR(file);
1159 goto out;
1160 }
1161 file->f_pos = 0;
1162 oldfs = get_fs();
1163 set_fs(get_ds());
1164 /* For releases 0.1 and 0.2, all of the header information
1165 * fits in the first data extent-sized region. */
1166 rc = file->f_op->read(file, (char __user *)data,
1167 ECRYPTFS_DEFAULT_EXTENT_SIZE, &file->f_pos);
1168 set_fs(oldfs);
1169 fput(file);
1170 rc = 0;
1171out:
1172 return rc;
1173}
1174
1175static void
1176write_header_metadata(char *virt, struct ecryptfs_crypt_stat *crypt_stat,
1177 size_t *written)
1178{
1179 u32 header_extent_size;
1180 u16 num_header_extents_at_front;
1181
1182 header_extent_size = (u32)crypt_stat->header_extent_size;
1183 num_header_extents_at_front =
1184 (u16)crypt_stat->num_header_extents_at_front;
1185 header_extent_size = cpu_to_be32(header_extent_size);
1186 memcpy(virt, &header_extent_size, 4);
1187 virt += 4;
1188 num_header_extents_at_front = cpu_to_be16(num_header_extents_at_front);
1189 memcpy(virt, &num_header_extents_at_front, 2);
1190 (*written) = 6;
1191}
1192
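The 6 bytes written above are a big-endian u32 header extent size followed
by a big-endian u16 extent count, exactly what parse_header_metadata()
reads back below. A user-space sketch using glibc's <endian.h> helpers:

#include <endian.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	unsigned char virt[6];
	uint32_t header_extent_size = htobe32(4096);
	uint16_t num_header_extents_at_front = htobe16(1);
	int i;

	memcpy(virt, &header_extent_size, 4);
	memcpy(virt + 4, &num_header_extents_at_front, 2);
	for (i = 0; i < 6; i++)
		printf("%.2x ", virt[i]);	/* 00 00 10 00 00 01 */
	printf("\n");
	return 0;
}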
1193struct kmem_cache *ecryptfs_header_cache_0;
1194struct kmem_cache *ecryptfs_header_cache_1;
1195struct kmem_cache *ecryptfs_header_cache_2;
1196
1197/**
1198 * ecryptfs_write_headers_virt
1199 * @page_virt
1200 * @crypt_stat
1201 * @ecryptfs_dentry
1202 *
1203 * Format version: 1
1204 *
1205 * Header Extent:
1206 * Octets 0-7: Unencrypted file size (big-endian)
1207 * Octets 8-15: eCryptfs special marker
1208 * Octets 16-19: Flags
1209 * Octet 16: File format version number (between 0 and 255)
1210 * Octets 17-18: Reserved
1211 * Octet 19: Bit 1 (lsb): Reserved
1212 * Bit 2: Encrypted?
1213 * Bits 3-8: Reserved
1214 * Octets 20-23: Header extent size (big-endian)
1215 * Octets 24-25: Number of header extents at front of file
1216 * (big-endian)
1217 * Octet 26: Begin RFC 2440 authentication token packet set
1218 * Data Extent 0:
1219 * Lower data (CBC encrypted)
1220 * Data Extent 1:
1221 * Lower data (CBC encrypted)
1222 * ...
1223 *
1224 * Returns zero on success
1225 */
1226int ecryptfs_write_headers_virt(char *page_virt,
1227 struct ecryptfs_crypt_stat *crypt_stat,
1228 struct dentry *ecryptfs_dentry)
1229{
1230 int rc;
1231 size_t written;
1232 size_t offset;
1233
1234 offset = ECRYPTFS_FILE_SIZE_BYTES;
1235 write_ecryptfs_marker((page_virt + offset), &written);
1236 offset += written;
1237 write_ecryptfs_flags((page_virt + offset), crypt_stat, &written);
1238 offset += written;
1239 write_header_metadata((page_virt + offset), crypt_stat, &written);
1240 offset += written;
1241 rc = ecryptfs_generate_key_packet_set((page_virt + offset), crypt_stat,
1242 ecryptfs_dentry, &written,
1243 PAGE_CACHE_SIZE - offset);
1244 if (rc)
1245 ecryptfs_printk(KERN_WARNING, "Error generating key packet "
1246 "set; rc = [%d]\n", rc);
1247 return rc;
1248}
1249
1250/**
1251 * ecryptfs_write_headers
1252 * @lower_file: The lower file struct, which was returned from dentry_open
1253 *
1254 * Write the file headers out. This will likely involve a userspace
1255 * callout, in which the session key is encrypted with one or more
1256 * public keys and/or the passphrase necessary to do the encryption is
1257 * retrieved via a prompt. Exactly what happens at this point should
1258 * be policy-dependent.
1259 *
1260 * Returns zero on success; non-zero on error
1261 */
1262int ecryptfs_write_headers(struct dentry *ecryptfs_dentry,
1263 struct file *lower_file)
1264{
1265 mm_segment_t oldfs;
1266 struct ecryptfs_crypt_stat *crypt_stat;
1267 char *page_virt;
1268 int current_header_page;
1269 int header_pages;
1270 int rc = 0;
1271
1272 crypt_stat = &ecryptfs_inode_to_private(
1273 ecryptfs_dentry->d_inode)->crypt_stat;
1274 if (likely(ECRYPTFS_CHECK_FLAG(crypt_stat->flags,
1275 ECRYPTFS_ENCRYPTED))) {
1276 if (!ECRYPTFS_CHECK_FLAG(crypt_stat->flags,
1277 ECRYPTFS_KEY_VALID)) {
1278 ecryptfs_printk(KERN_DEBUG, "Key is "
1279 "invalid; bailing out\n");
1280 rc = -EINVAL;
1281 goto out;
1282 }
1283 } else {
1284 rc = -EINVAL;
1285 ecryptfs_printk(KERN_WARNING,
1286 "Called with crypt_stat->encrypted == 0\n");
1287 goto out;
1288 }
1289 /* Released in this function */
1290 page_virt = kmem_cache_alloc(ecryptfs_header_cache_0, SLAB_USER);
1291 if (!page_virt) {
1292 ecryptfs_printk(KERN_ERR, "Out of memory\n");
1293 rc = -ENOMEM;
1294 goto out;
1295 }
1296 memset(page_virt, 0, PAGE_CACHE_SIZE);
1297 rc = ecryptfs_write_headers_virt(page_virt, crypt_stat,
1298 ecryptfs_dentry);
1299 if (unlikely(rc)) {
1300 ecryptfs_printk(KERN_ERR, "Error whilst writing headers\n");
1301 memset(page_virt, 0, PAGE_CACHE_SIZE);
1302 goto out_free;
1303 }
1304 ecryptfs_printk(KERN_DEBUG,
1305 "Writing key packet set to underlying file\n");
1306 lower_file->f_pos = 0;
1307 oldfs = get_fs();
1308 set_fs(get_ds());
1309 ecryptfs_printk(KERN_DEBUG, "Calling lower_file->f_op->"
1310 "write() w/ header page; lower_file->f_pos = "
1311 "[0x%.16x]\n", lower_file->f_pos);
1312 lower_file->f_op->write(lower_file, (char __user *)page_virt,
1313 PAGE_CACHE_SIZE, &lower_file->f_pos);
1314 header_pages = ((crypt_stat->header_extent_size
1315 * crypt_stat->num_header_extents_at_front)
1316 / PAGE_CACHE_SIZE);
1317 memset(page_virt, 0, PAGE_CACHE_SIZE);
1318 current_header_page = 1;
1319 while (current_header_page < header_pages) {
1320 ecryptfs_printk(KERN_DEBUG, "Calling lower_file->f_op->"
1321 "write() w/ zero'd page; lower_file->f_pos = "
1322 "[0x%.16x]\n", lower_file->f_pos);
1323 lower_file->f_op->write(lower_file, (char __user *)page_virt,
1324 PAGE_CACHE_SIZE, &lower_file->f_pos);
1325 current_header_page++;
1326 }
1327 set_fs(oldfs);
1328 ecryptfs_printk(KERN_DEBUG,
1329 "Done writing key packet set to underlying file.\n");
1330out_free:
1331 kmem_cache_free(ecryptfs_header_cache_0, page_virt);
1332out:
1333 return rc;
1334}
1335
1336static int parse_header_metadata(struct ecryptfs_crypt_stat *crypt_stat,
1337 char *virt, int *bytes_read)
1338{
1339 int rc = 0;
1340 u32 header_extent_size;
1341 u16 num_header_extents_at_front;
1342
1343 memcpy(&header_extent_size, virt, 4);
1344 header_extent_size = be32_to_cpu(header_extent_size);
1345 virt += 4;
1346 memcpy(&num_header_extents_at_front, virt, 2);
1347 num_header_extents_at_front = be16_to_cpu(num_header_extents_at_front);
1348 crypt_stat->header_extent_size = (int)header_extent_size;
1349 crypt_stat->num_header_extents_at_front =
1350 (int)num_header_extents_at_front;
1351 (*bytes_read) = 6;
1352 if ((crypt_stat->header_extent_size
1353 * crypt_stat->num_header_extents_at_front)
1354 < ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE) {
1355 rc = -EINVAL;
1356 ecryptfs_printk(KERN_WARNING, "Invalid header extent size: "
1357 "[%d]\n", crypt_stat->header_extent_size);
1358 }
1359 return rc;
1360}
1361
1362/**
1363 * set_default_header_data
1364 *
1365 * For version 0 file format; this function is only for backwards
1366 * compatibility for files created with the prior versions of
1367 * eCryptfs.
1368 */
1369static void set_default_header_data(struct ecryptfs_crypt_stat *crypt_stat)
1370{
1371 crypt_stat->header_extent_size = 4096;
1372 crypt_stat->num_header_extents_at_front = 1;
1373}
1374
1375/**
1376 * ecryptfs_read_headers_virt
1377 *
1378 * Read/parse the header data. The header format is detailed in the
1379 * comment block for the ecryptfs_write_headers_virt() function.
1380 *
1381 * Returns zero on success
1382 */
1383static int ecryptfs_read_headers_virt(char *page_virt,
1384 struct ecryptfs_crypt_stat *crypt_stat,
1385 struct dentry *ecryptfs_dentry)
1386{
1387 int rc = 0;
1388 int offset;
1389 int bytes_read;
1390
1391 ecryptfs_set_default_sizes(crypt_stat);
1392 crypt_stat->mount_crypt_stat = &ecryptfs_superblock_to_private(
1393 ecryptfs_dentry->d_sb)->mount_crypt_stat;
1394 offset = ECRYPTFS_FILE_SIZE_BYTES;
1395 rc = contains_ecryptfs_marker(page_virt + offset);
1396 if (rc == 0) {
1397 rc = -EINVAL;
1398 goto out;
1399 }
1400 offset += MAGIC_ECRYPTFS_MARKER_SIZE_BYTES;
1401 rc = ecryptfs_process_flags(crypt_stat, (page_virt + offset),
1402 &bytes_read);
1403 if (rc) {
1404 ecryptfs_printk(KERN_WARNING, "Error processing flags\n");
1405 goto out;
1406 }
1407 if (crypt_stat->file_version > ECRYPTFS_SUPPORTED_FILE_VERSION) {
1408 ecryptfs_printk(KERN_WARNING, "File version is [%d]; only "
1409 "file version [%d] is supported by this "
1410 "version of eCryptfs\n",
1411 crypt_stat->file_version,
1412 ECRYPTFS_SUPPORTED_FILE_VERSION);
1413 rc = -EINVAL;
1414 goto out;
1415 }
1416 offset += bytes_read;
1417 if (crypt_stat->file_version >= 1) {
1418 rc = parse_header_metadata(crypt_stat, (page_virt + offset),
1419 &bytes_read);
1420 if (rc) {
1421 ecryptfs_printk(KERN_WARNING, "Error reading header "
1422 "metadata; rc = [%d]\n", rc);
1423 }
1424 offset += bytes_read;
1425 } else
1426 set_default_header_data(crypt_stat);
1427 rc = ecryptfs_parse_packet_set(crypt_stat, (page_virt + offset),
1428 ecryptfs_dentry);
1429out:
1430 return rc;
1431}
1432
1433/**
1434 * ecryptfs_read_headers
1435 *
1436 * Returns zero if valid headers found and parsed; non-zero otherwise
1437 */
1438int ecryptfs_read_headers(struct dentry *ecryptfs_dentry,
1439 struct file *lower_file)
1440{
1441 int rc = 0;
1442 char *page_virt = NULL;
1443 mm_segment_t oldfs;
1444 ssize_t bytes_read;
1445 struct ecryptfs_crypt_stat *crypt_stat =
1446 &ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->crypt_stat;
1447
1448 /* Read the first page from the underlying file */
1449 page_virt = kmem_cache_alloc(ecryptfs_header_cache_1, SLAB_USER);
1450 if (!page_virt) {
1451 rc = -ENOMEM;
1452 ecryptfs_printk(KERN_ERR, "Unable to allocate page_virt\n");
1453 goto out;
1454 }
1455 lower_file->f_pos = 0;
1456 oldfs = get_fs();
1457 set_fs(get_ds());
1458 bytes_read = lower_file->f_op->read(lower_file,
1459 (char __user *)page_virt,
1460 ECRYPTFS_DEFAULT_EXTENT_SIZE,
1461 &lower_file->f_pos);
1462 set_fs(oldfs);
1463 if (bytes_read != ECRYPTFS_DEFAULT_EXTENT_SIZE) {
1464 rc = -EINVAL;
1465 goto out;
1466 }
1467 rc = ecryptfs_read_headers_virt(page_virt, crypt_stat,
1468 ecryptfs_dentry);
1469 if (rc) {
1470 ecryptfs_printk(KERN_DEBUG, "Valid eCryptfs headers not "
1471 "found\n");
1472 rc = -EINVAL;
1473 }
1474out:
1475 if (page_virt) {
1476 memset(page_virt, 0, PAGE_CACHE_SIZE);
1477 kmem_cache_free(ecryptfs_header_cache_1, page_virt);
1478 }
1479 return rc;
1480}
1481
1482/**
1483 * ecryptfs_encode_filename - converts a plaintext file name to cipher text
1484 * @crypt_stat: The crypt_stat struct associated with the file name to encode
1485 * @name: The plaintext name
1486 * @length: The length of the plaintext
1487 * @encoded_name: The encrypted name
1488 *
1489 * Encrypts and encodes a filename into something that constitutes a
1490 * valid filename for a filesystem, with printable characters.
1491 *
1492 * We assume that we have a properly initialized crypto context,
1493 * pointed to by crypt_stat->tfm.
1494 *
1495 * TODO: Implement filename encoding and encryption here, in place of
1496 * memcpy. We are keeping the framework around for now to (1)
1497 * facilitate testing of the components needed to implement filename
1498 * encryption and (2) to provide a code base from which other
1499 * developers in the community can easily implement this feature.
1500 *
1501 * Returns the length of encoded filename; negative if error
1502 */
1503int
1504ecryptfs_encode_filename(struct ecryptfs_crypt_stat *crypt_stat,
1505 const char *name, int length, char **encoded_name)
1506{
1507 int error = 0;
1508
1509 (*encoded_name) = kmalloc(length + 2, GFP_KERNEL);
1510 if (!(*encoded_name)) {
1511 error = -ENOMEM;
1512 goto out;
1513 }
1514 /* TODO: Filename encryption is a scheduled feature for a
1515 * future version of eCryptfs. This function is here only for
1516 * the purpose of providing a framework for other developers
1517 * to easily implement filename encryption. Hint: Replace this
1518 * memcpy() with a call to encrypt and encode the
1519 * filename, then set the length accordingly. */
1520 memcpy((void *)(*encoded_name), (void *)name, length);
1521 (*encoded_name)[length] = '\0';
1522 error = length + 1;
1523out:
1524 return error;
1525}
1526
1527/**
1528 * ecryptfs_decode_filename - converts the cipher text name to plaintext
1529 * @crypt_stat: The crypt_stat struct associated with the file
1530 * @name: The filename in cipher text
1531 * @length: The length of the cipher text name
1532 * @decrypted_name: The plaintext name
1533 *
1534 * Decodes and decrypts the filename.
1535 *
1536 * We assume that we have a properly initialized crypto context,
1537 * pointed to by crypt_stat->tfm.
1538 *
1539 * TODO: Implement filename decoding and decryption here, in place of
1540 * memcpy. We are keeping the framework around for now to (1)
1541 * facilitate testing of the components needed to implement filename
1542 * encryption and (2) to provide a code base from which other
1543 * developers in the community can easily implement this feature.
1544 *
1545 * Returns the length of decoded filename; negative if error
1546 */
1547int
1548ecryptfs_decode_filename(struct ecryptfs_crypt_stat *crypt_stat,
1549 const char *name, int length, char **decrypted_name)
1550{
1551 int error = 0;
1552
1553 (*decrypted_name) = kmalloc(length + 2, GFP_KERNEL);
1554 if (!(*decrypted_name)) {
1555 error = -ENOMEM;
1556 goto out;
1557 }
1558 /* TODO: Filename encryption is a scheduled feature for a
1559 * future version of eCryptfs. This function is here only for
1560 * the purpose of providing a framework for other developers
1561 * to easily implement filename encryption. Hint: Replace this
1562 * memcpy() with a call to decode and decrypt the
1563 * filename, then set the length accordingly. */
1564 memcpy((void *)(*decrypted_name), (void *)name, length);
1565	(*decrypted_name)[length] = '\0'; /* Only for convenience
1566 * in printing out the
1567 * string in debug
1568 * messages */
1569 error = length;
1570out:
1571 return error;
1572}
1573
1574/**
1575 * ecryptfs_process_cipher - Perform cipher initialization.
1576 * @tfm: Crypto context set by this function
1577 * @key_tfm: Crypto context for key material, set by this function
1578 * @cipher_name: Name of the cipher.
1579 * @key_size: Size of the key in bytes.
1580 *
1581 * Returns zero on success. Any crypto_tfm structs allocated here
1582 * should be released by other functions, such as on a superblock put
1583 * event, regardless of whether this function succeeds or fails.
1584 */
1585int
1586ecryptfs_process_cipher(struct crypto_tfm **tfm, struct crypto_tfm **key_tfm,
1587 char *cipher_name, size_t key_size)
1588{
1589 char dummy_key[ECRYPTFS_MAX_KEY_BYTES];
1590 int rc;
1591
1592 *tfm = *key_tfm = NULL;
1593 if (key_size > ECRYPTFS_MAX_KEY_BYTES) {
1594 rc = -EINVAL;
1595 printk(KERN_ERR "Requested key size is [%Zd] bytes; maximum "
1596 "allowable is [%d]\n", key_size, ECRYPTFS_MAX_KEY_BYTES);
1597 goto out;
1598 }
1599 *tfm = crypto_alloc_tfm(cipher_name, (ECRYPTFS_DEFAULT_CHAINING_MODE
1600 | CRYPTO_TFM_REQ_WEAK_KEY));
1601 if (!(*tfm)) {
1602 rc = -EINVAL;
1603 printk(KERN_ERR "Unable to allocate crypto cipher with name "
1604 "[%s]\n", cipher_name);
1605 goto out;
1606 }
1607 *key_tfm = crypto_alloc_tfm(cipher_name, CRYPTO_TFM_REQ_WEAK_KEY);
1608 if (!(*key_tfm)) {
1609 rc = -EINVAL;
1610 printk(KERN_ERR "Unable to allocate crypto cipher with name "
1611 "[%s]\n", cipher_name);
1612 goto out;
1613 }
1614 if (key_size < crypto_tfm_alg_min_keysize(*tfm)) {
1615 rc = -EINVAL;
1616		printk(KERN_ERR "Requested key size is [%Zd]; minimum key size "
1617 "supported by cipher [%s] is [%d]\n", key_size,
1618 cipher_name, crypto_tfm_alg_min_keysize(*tfm));
1619 goto out;
1620 }
1621 if (key_size < crypto_tfm_alg_min_keysize(*key_tfm)) {
1622 rc = -EINVAL;
1623		printk(KERN_ERR "Requested key size is [%Zd]; minimum key size "
1624 "supported by cipher [%s] is [%d]\n", key_size,
1625 cipher_name, crypto_tfm_alg_min_keysize(*key_tfm));
1626 goto out;
1627 }
1628 if (key_size > crypto_tfm_alg_max_keysize(*tfm)) {
1629 rc = -EINVAL;
1630		printk(KERN_ERR "Requested key size is [%Zd]; maximum key size "
1631		       "supported by cipher [%s] is [%d]\n", key_size,
1632		       cipher_name, crypto_tfm_alg_max_keysize(*tfm));
1633 goto out;
1634 }
1635 if (key_size > crypto_tfm_alg_max_keysize(*key_tfm)) {
1636 rc = -EINVAL;
1637		printk(KERN_ERR "Requested key size is [%Zd]; maximum key size "
1638		       "supported by cipher [%s] is [%d]\n", key_size,
1639		       cipher_name, crypto_tfm_alg_max_keysize(*key_tfm));
1640 goto out;
1641 }
1642 get_random_bytes(dummy_key, key_size);
1643 rc = crypto_cipher_setkey(*tfm, dummy_key, key_size);
1644 if (rc) {
1645 printk(KERN_ERR "Error attempting to set key of size [%Zd] for "
1646 "cipher [%s]; rc = [%d]\n", key_size, cipher_name, rc);
1647 rc = -EINVAL;
1648 goto out;
1649 }
1650 rc = crypto_cipher_setkey(*key_tfm, dummy_key, key_size);
1651 if (rc) {
1652 printk(KERN_ERR "Error attempting to set key of size [%Zd] for "
1653 "cipher [%s]; rc = [%d]\n", key_size, cipher_name, rc);
1654 rc = -EINVAL;
1655 goto out;
1656 }
1657out:
1658 return rc;
1659}
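
The kernel-doc above puts the burden of freeing the two tfms on the
caller even when ecryptfs_process_cipher() fails (note that the
function NULLs both pointers up front, so the caller can test them).
A minimal caller sketch, assuming cleanup via crypto_free_tfm(); the
patch itself defers this to a superblock put event rather than doing
it inline like this:

	struct crypto_tfm *tfm;
	struct crypto_tfm *key_tfm;
	int rc;

	rc = ecryptfs_process_cipher(&tfm, &key_tfm, "aes", 16);
	if (rc) {
		if (tfm)
			crypto_free_tfm(tfm);
		if (key_tfm)
			crypto_free_tfm(key_tfm);
	}
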
diff --git a/fs/ecryptfs/debug.c b/fs/ecryptfs/debug.c
new file mode 100644
index 000000000000..61f8e894284f
--- /dev/null
+++ b/fs/ecryptfs/debug.c
@@ -0,0 +1,123 @@
1/**
2 * eCryptfs: Linux filesystem encryption layer
3 * Functions only useful for debugging.
4 *
5 * Copyright (C) 2006 International Business Machines Corp.
6 * Author(s): Michael A. Halcrow <mahalcro@us.ibm.com>
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License as
10 * published by the Free Software Foundation; either version 2 of the
11 * License, or (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful, but
14 * WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
21 * 02111-1307, USA.
22 */
23
24#include "ecryptfs_kernel.h"
25
26/**
27 * ecryptfs_dump_auth_tok - debug function to print auth toks
28 * @auth_tok: The authentication token to dump
29 * This function prints the contents of an eCryptfs authentication
30 * token.
31 */
32void ecryptfs_dump_auth_tok(struct ecryptfs_auth_tok *auth_tok)
33{
34 char salt[ECRYPTFS_SALT_SIZE * 2 + 1];
35 char sig[ECRYPTFS_SIG_SIZE_HEX + 1];
36
37 ecryptfs_printk(KERN_DEBUG, "Auth tok at mem loc [%p]:\n",
38 auth_tok);
39 if (ECRYPTFS_CHECK_FLAG(auth_tok->flags, ECRYPTFS_PRIVATE_KEY)) {
40 ecryptfs_printk(KERN_DEBUG, " * private key type\n");
41 ecryptfs_printk(KERN_DEBUG, " * (NO PRIVATE KEY SUPPORT "
42 "IN ECRYPTFS VERSION 0.1)\n");
43 } else {
44 ecryptfs_printk(KERN_DEBUG, " * passphrase type\n");
45 ecryptfs_to_hex(salt, auth_tok->token.password.salt,
46 ECRYPTFS_SALT_SIZE);
47 salt[ECRYPTFS_SALT_SIZE * 2] = '\0';
48 ecryptfs_printk(KERN_DEBUG, " * salt = [%s]\n", salt);
49 if (ECRYPTFS_CHECK_FLAG(auth_tok->token.password.flags,
50 ECRYPTFS_PERSISTENT_PASSWORD)) {
51 ecryptfs_printk(KERN_DEBUG, " * persistent\n");
52 }
53 memcpy(sig, auth_tok->token.password.signature,
54 ECRYPTFS_SIG_SIZE_HEX);
55 sig[ECRYPTFS_SIG_SIZE_HEX] = '\0';
56 ecryptfs_printk(KERN_DEBUG, " * signature = [%s]\n", sig);
57 }
58 ecryptfs_printk(KERN_DEBUG, " * session_key.flags = [0x%x]\n",
59 auth_tok->session_key.flags);
60 if (auth_tok->session_key.flags
61 & ECRYPTFS_USERSPACE_SHOULD_TRY_TO_DECRYPT)
62 ecryptfs_printk(KERN_DEBUG,
63 " * Userspace decrypt request set\n");
64 if (auth_tok->session_key.flags
65 & ECRYPTFS_USERSPACE_SHOULD_TRY_TO_ENCRYPT)
66 ecryptfs_printk(KERN_DEBUG,
67 " * Userspace encrypt request set\n");
68 if (auth_tok->session_key.flags & ECRYPTFS_CONTAINS_DECRYPTED_KEY) {
69 ecryptfs_printk(KERN_DEBUG, " * Contains decrypted key\n");
70 ecryptfs_printk(KERN_DEBUG,
71 " * session_key.decrypted_key_size = [0x%x]\n",
72 auth_tok->session_key.decrypted_key_size);
73 ecryptfs_printk(KERN_DEBUG, " * Decrypted session key "
74 "dump:\n");
75 if (ecryptfs_verbosity > 0)
76 ecryptfs_dump_hex(auth_tok->session_key.decrypted_key,
77 ECRYPTFS_DEFAULT_KEY_BYTES);
78 }
79 if (auth_tok->session_key.flags & ECRYPTFS_CONTAINS_ENCRYPTED_KEY) {
80 ecryptfs_printk(KERN_DEBUG, " * Contains encrypted key\n");
81 ecryptfs_printk(KERN_DEBUG,
82 " * session_key.encrypted_key_size = [0x%x]\n",
83 auth_tok->session_key.encrypted_key_size);
84 ecryptfs_printk(KERN_DEBUG, " * Encrypted session key "
85 "dump:\n");
86 if (ecryptfs_verbosity > 0)
87 ecryptfs_dump_hex(auth_tok->session_key.encrypted_key,
88 auth_tok->session_key.
89 encrypted_key_size);
90 }
91}
92
93/**
94 * ecryptfs_dump_hex - debug hex printer
95 * @data: string of bytes to be printed
96 * @bytes: number of bytes to print
97 *
98 * Dump hexadecimal representation of char array
99 */
100void ecryptfs_dump_hex(char *data, int bytes)
101{
102 int i = 0;
103 int add_newline = 1;
104
105 if (ecryptfs_verbosity < 1)
106 return;
107 if (bytes != 0) {
108 printk(KERN_DEBUG "0x%.2x.", (unsigned char)data[i]);
109 i++;
110 }
111 while (i < bytes) {
112 printk("0x%.2x.", (unsigned char)data[i]);
113 i++;
114 if (i % 16 == 0) {
115 printk("\n");
116 add_newline = 0;
117 } else
118 add_newline = 1;
119 }
120 if (add_newline)
121 printk("\n");
122}
123
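
ecryptfs_dump_auth_tok() above leans on ecryptfs_to_hex(), which is
only declared in ecryptfs_kernel.h below; its definition lives
elsewhere in this patch. A minimal sketch of such a helper, assuming
it simply expands each input byte into two hex digits and leaves
NUL-termination to the caller (as debug.c does with
salt[ECRYPTFS_SALT_SIZE * 2] = '\0'):

	#include <linux/kernel.h>	/* sprintf */

	void ecryptfs_to_hex(char *dst, char *src, size_t src_size)
	{
		size_t x;

		/* Each source byte becomes two hex characters in dst */
		for (x = 0; x < src_size; x++)
			sprintf(&dst[x * 2], "%.2x", (unsigned char)src[x]);
	}
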
diff --git a/fs/ecryptfs/dentry.c b/fs/ecryptfs/dentry.c
new file mode 100644
index 000000000000..f0d2a433242b
--- /dev/null
+++ b/fs/ecryptfs/dentry.c
@@ -0,0 +1,87 @@
1/**
2 * eCryptfs: Linux filesystem encryption layer
3 *
4 * Copyright (C) 1997-2003 Erez Zadok
5 * Copyright (C) 2001-2003 Stony Brook University
6 * Copyright (C) 2004-2006 International Business Machines Corp.
7 * Author(s): Michael A. Halcrow <mahalcro@us.ibm.com>
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License as
11 * published by the Free Software Foundation; either version 2 of the
12 * License, or (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful, but
15 * WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
22 * 02111-1307, USA.
23 */
24
25#include <linux/dcache.h>
26#include <linux/namei.h>
27#include "ecryptfs_kernel.h"
28
29/**
30 * ecryptfs_d_revalidate - revalidate an ecryptfs dentry
31 * @dentry: The ecryptfs dentry
32 * @nd: The associated nameidata
33 *
34 * Called when the VFS needs to revalidate a dentry. This
35 * is called whenever a name lookup finds a dentry in the
36 * dcache. Most filesystems leave this as NULL, because all their
37 * dentries in the dcache are valid.
38 *
39 * Returns 1 if valid, 0 otherwise.
40 *
41 */
42static int ecryptfs_d_revalidate(struct dentry *dentry, struct nameidata *nd)
43{
44 struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
45 struct vfsmount *lower_mnt = ecryptfs_dentry_to_lower_mnt(dentry);
46 struct dentry *dentry_save;
47 struct vfsmount *vfsmount_save;
48 int rc = 1;
49
50 if (!lower_dentry->d_op || !lower_dentry->d_op->d_revalidate)
51 goto out;
52 dentry_save = nd->dentry;
53 vfsmount_save = nd->mnt;
54 nd->dentry = lower_dentry;
55 nd->mnt = lower_mnt;
56 rc = lower_dentry->d_op->d_revalidate(lower_dentry, nd);
57 nd->dentry = dentry_save;
58 nd->mnt = vfsmount_save;
59out:
60 return rc;
61}
62
63struct kmem_cache *ecryptfs_dentry_info_cache;
64
65/**
66 * ecryptfs_d_release
67 * @dentry: The ecryptfs dentry
68 *
69 * Called when a dentry is really deallocated.
70 */
71static void ecryptfs_d_release(struct dentry *dentry)
72{
73 struct dentry *lower_dentry;
74
75 lower_dentry = ecryptfs_dentry_to_lower(dentry);
76 if (ecryptfs_dentry_to_private(dentry))
77 kmem_cache_free(ecryptfs_dentry_info_cache,
78 ecryptfs_dentry_to_private(dentry));
79 if (lower_dentry)
80 dput(lower_dentry);
81 return;
82}
83
84struct dentry_operations ecryptfs_dops = {
85 .d_revalidate = ecryptfs_d_revalidate,
86 .d_release = ecryptfs_d_release,
87};
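
These operations only take effect once a dentry is pointed at this
table; ecryptfs_lookup() in inode.c further down wires that up with:

	dentry->d_op = &ecryptfs_dops;
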
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
new file mode 100644
index 000000000000..872c9958531a
--- /dev/null
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -0,0 +1,482 @@
1/**
2 * eCryptfs: Linux filesystem encryption layer
3 * Kernel declarations.
4 *
5 * Copyright (C) 1997-2003 Erez Zadok
6 * Copyright (C) 2001-2003 Stony Brook University
7 * Copyright (C) 2004-2006 International Business Machines Corp.
8 * Author(s): Michael A. Halcrow <mahalcro@us.ibm.com>
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License as
12 * published by the Free Software Foundation; either version 2 of the
13 * License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful, but
16 * WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
23 * 02111-1307, USA.
24 */
25
26#ifndef ECRYPTFS_KERNEL_H
27#define ECRYPTFS_KERNEL_H
28
29#include <keys/user-type.h>
30#include <linux/fs.h>
31#include <linux/scatterlist.h>
32
33/* Version verification for shared data structures w/ userspace */
34#define ECRYPTFS_VERSION_MAJOR 0x00
35#define ECRYPTFS_VERSION_MINOR 0x04
36#define ECRYPTFS_SUPPORTED_FILE_VERSION 0x01
37/* These flags indicate which features are supported by the kernel
38 * module; userspace tools such as the mount helper read
39 * ECRYPTFS_VERSIONING_MASK from a sysfs handle in order to determine
40 * how to behave. */
41#define ECRYPTFS_VERSIONING_PASSPHRASE 0x00000001
42#define ECRYPTFS_VERSIONING_PUBKEY 0x00000002
43#define ECRYPTFS_VERSIONING_PLAINTEXT_PASSTHROUGH 0x00000004
44#define ECRYPTFS_VERSIONING_POLICY 0x00000008
45#define ECRYPTFS_VERSIONING_MASK (ECRYPTFS_VERSIONING_PASSPHRASE \
46 | ECRYPTFS_VERSIONING_PLAINTEXT_PASSTHROUGH)
47
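/* As the comment above notes, userspace reads this mask through sysfs
 * to discover what the module supports. A hypothetical userspace
 * check -- the sysfs path below is illustrative only and not defined
 * by this header:
 *
 *	unsigned int mask = 0;
 *	FILE *f = fopen("/sys/fs/ecryptfs/version", "r"); // path is an assumption
 *	if (f && fscanf(f, "%x", &mask) == 1
 *	    && (mask & ECRYPTFS_VERSIONING_PASSPHRASE))
 *		printf("passphrase auth toks supported\n");
 */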
48#define ECRYPTFS_MAX_PASSWORD_LENGTH 64
49#define ECRYPTFS_MAX_PASSPHRASE_BYTES ECRYPTFS_MAX_PASSWORD_LENGTH
50#define ECRYPTFS_SALT_SIZE 8
51#define ECRYPTFS_SALT_SIZE_HEX (ECRYPTFS_SALT_SIZE*2)
52/* The original signature size is only for what is stored on disk; all
53 * in-memory representations are expanded hex, making them better suited
54 * to being passed around or referenced on the command line */
55#define ECRYPTFS_SIG_SIZE 8
56#define ECRYPTFS_SIG_SIZE_HEX (ECRYPTFS_SIG_SIZE*2)
57#define ECRYPTFS_PASSWORD_SIG_SIZE ECRYPTFS_SIG_SIZE_HEX
58#define ECRYPTFS_MAX_KEY_BYTES 64
59#define ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES 512
60#define ECRYPTFS_DEFAULT_IV_BYTES 16
61#define ECRYPTFS_FILE_VERSION 0x01
62#define ECRYPTFS_DEFAULT_HEADER_EXTENT_SIZE 8192
63#define ECRYPTFS_DEFAULT_EXTENT_SIZE 4096
64#define ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE 8192
65
66#define RFC2440_CIPHER_DES3_EDE 0x02
67#define RFC2440_CIPHER_CAST_5 0x03
68#define RFC2440_CIPHER_BLOWFISH 0x04
69#define RFC2440_CIPHER_AES_128 0x07
70#define RFC2440_CIPHER_AES_192 0x08
71#define RFC2440_CIPHER_AES_256 0x09
72#define RFC2440_CIPHER_TWOFISH 0x0a
73#define RFC2440_CIPHER_CAST_6 0x0b
74
75#define ECRYPTFS_SET_FLAG(flag_bit_vector, flag) (flag_bit_vector |= (flag))
76#define ECRYPTFS_CLEAR_FLAG(flag_bit_vector, flag) (flag_bit_vector &= ~(flag))
77#define ECRYPTFS_CHECK_FLAG(flag_bit_vector, flag) (flag_bit_vector & (flag))
78
79/**
80 * For convenience, we may need to pass around the encrypted session
81 * key between kernel and userspace because the authentication token
82 * may not be extractable. For example, the TPM may not release the
83 * private key, instead requiring the encrypted data and returning the
84 * decrypted data.
85 */
86struct ecryptfs_session_key {
87#define ECRYPTFS_USERSPACE_SHOULD_TRY_TO_DECRYPT 0x00000001
88#define ECRYPTFS_USERSPACE_SHOULD_TRY_TO_ENCRYPT 0x00000002
89#define ECRYPTFS_CONTAINS_DECRYPTED_KEY 0x00000004
90#define ECRYPTFS_CONTAINS_ENCRYPTED_KEY 0x00000008
91 u32 flags;
92 u32 encrypted_key_size;
93 u32 decrypted_key_size;
94 u8 encrypted_key[ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES];
95 u8 decrypted_key[ECRYPTFS_MAX_KEY_BYTES];
96};
97
98struct ecryptfs_password {
99 u32 password_bytes;
100 s32 hash_algo;
101 u32 hash_iterations;
102 u32 session_key_encryption_key_bytes;
103#define ECRYPTFS_PERSISTENT_PASSWORD 0x01
104#define ECRYPTFS_SESSION_KEY_ENCRYPTION_KEY_SET 0x02
105 u32 flags;
106 /* Iterated-hash concatenation of salt and passphrase */
107 u8 session_key_encryption_key[ECRYPTFS_MAX_KEY_BYTES];
108 u8 signature[ECRYPTFS_PASSWORD_SIG_SIZE + 1];
109 /* Always in expanded hex */
110 u8 salt[ECRYPTFS_SALT_SIZE];
111};
112
113enum ecryptfs_token_types {ECRYPTFS_PASSWORD, ECRYPTFS_PRIVATE_KEY};
114
115/* May be a password or a private key */
116struct ecryptfs_auth_tok {
117 u16 version; /* 8-bit major and 8-bit minor */
118 u16 token_type;
119 u32 flags;
120 struct ecryptfs_session_key session_key;
121 u8 reserved[32];
122 union {
123 struct ecryptfs_password password;
124 /* Private key is in future eCryptfs releases */
125 } token;
126} __attribute__ ((packed));
127
128void ecryptfs_dump_auth_tok(struct ecryptfs_auth_tok *auth_tok);
129extern void ecryptfs_to_hex(char *dst, char *src, size_t src_size);
130extern void ecryptfs_from_hex(char *dst, char *src, int dst_size);
131
132struct ecryptfs_key_record {
133 unsigned char type;
134 size_t enc_key_size;
135 unsigned char sig[ECRYPTFS_SIG_SIZE];
136 unsigned char enc_key[ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES];
137};
138
139struct ecryptfs_auth_tok_list {
140 struct ecryptfs_auth_tok *auth_tok;
141 struct list_head list;
142};
143
144struct ecryptfs_crypt_stat;
145struct ecryptfs_mount_crypt_stat;
146
147struct ecryptfs_page_crypt_context {
148 struct page *page;
149#define ECRYPTFS_PREPARE_COMMIT_MODE 0
150#define ECRYPTFS_WRITEPAGE_MODE 1
151 unsigned int mode;
152 union {
153 struct file *lower_file;
154 struct writeback_control *wbc;
155 } param;
156};
157
158static inline struct ecryptfs_auth_tok *
159ecryptfs_get_key_payload_data(struct key *key)
160{
161 return (struct ecryptfs_auth_tok *)
162 (((struct user_key_payload*)key->payload.data)->data);
163}
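
/* A sketch of how a caller might reach this helper, assuming the auth
 * tok was loaded into the keyring as a "user"-type key whose
 * description is the auth tok signature (key_type_user comes from
 * <keys/user-type.h>, included above; sig is the NUL-terminated
 * signature string):
 *
 *	struct key *auth_tok_key;
 *	struct ecryptfs_auth_tok *auth_tok;
 *
 *	auth_tok_key = request_key(&key_type_user, sig, NULL);
 *	if (!IS_ERR(auth_tok_key))
 *		auth_tok = ecryptfs_get_key_payload_data(auth_tok_key);
 */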
164
165#define ECRYPTFS_SUPER_MAGIC 0xf15f
166#define ECRYPTFS_MAX_KEYSET_SIZE 1024
167#define ECRYPTFS_MAX_CIPHER_NAME_SIZE 32
168#define ECRYPTFS_MAX_NUM_ENC_KEYS 64
169#define ECRYPTFS_MAX_NUM_KEYSIGS 2 /* TODO: Make this a linked list */
170#define ECRYPTFS_MAX_IV_BYTES 16 /* 128 bits */
171#define ECRYPTFS_SALT_BYTES 2
172#define MAGIC_ECRYPTFS_MARKER 0x3c81b7f5
173#define MAGIC_ECRYPTFS_MARKER_SIZE_BYTES 8 /* 4*2 */
174#define ECRYPTFS_FILE_SIZE_BYTES 8
175#define ECRYPTFS_DEFAULT_CIPHER "aes"
176#define ECRYPTFS_DEFAULT_KEY_BYTES 16
177#define ECRYPTFS_DEFAULT_CHAINING_MODE CRYPTO_TFM_MODE_CBC
178#define ECRYPTFS_TAG_3_PACKET_TYPE 0x8C
179#define ECRYPTFS_TAG_11_PACKET_TYPE 0xED
180#define MD5_DIGEST_SIZE 16
181
182/**
183 * This is the primary struct associated with each encrypted file.
184 *
185 * TODO: cache align/pack?
186 */
187struct ecryptfs_crypt_stat {
188#define ECRYPTFS_STRUCT_INITIALIZED 0x00000001
189#define ECRYPTFS_POLICY_APPLIED 0x00000002
190#define ECRYPTFS_NEW_FILE 0x00000004
191#define ECRYPTFS_ENCRYPTED 0x00000008
192#define ECRYPTFS_SECURITY_WARNING 0x00000010
193#define ECRYPTFS_ENABLE_HMAC 0x00000020
194#define ECRYPTFS_ENCRYPT_IV_PAGES 0x00000040
195#define ECRYPTFS_KEY_VALID 0x00000080
196 u32 flags;
197 unsigned int file_version;
198 size_t iv_bytes;
199 size_t num_keysigs;
200 size_t header_extent_size;
201 size_t num_header_extents_at_front;
202 size_t extent_size; /* Data extent size; default is 4096 */
203 size_t key_size;
204 size_t extent_shift;
205 unsigned int extent_mask;
206 struct ecryptfs_mount_crypt_stat *mount_crypt_stat;
207 struct crypto_tfm *tfm;
208 struct crypto_tfm *md5_tfm; /* Crypto context for generating
209 * the initialization vectors */
210 unsigned char cipher[ECRYPTFS_MAX_CIPHER_NAME_SIZE];
211 unsigned char key[ECRYPTFS_MAX_KEY_BYTES];
212 unsigned char root_iv[ECRYPTFS_MAX_IV_BYTES];
213 unsigned char keysigs[ECRYPTFS_MAX_NUM_KEYSIGS][ECRYPTFS_SIG_SIZE_HEX];
214 struct mutex cs_tfm_mutex;
215 struct mutex cs_md5_tfm_mutex;
216 struct mutex cs_mutex;
217};
218
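/* Worked example, assuming extent_shift is log2(extent_size) as the
 * name suggests: with the default 4096-byte extents, extent_shift is
 * 12, so file offset 10000 falls in extent 10000 >> 12 == 2.
 * extent_mask plays the complementary role of isolating offset bits;
 * both are presumably initialized from extent_size in crypto.c (the
 * earlier part of this diff).
 */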
219/* inode private data. */
220struct ecryptfs_inode_info {
221 struct inode vfs_inode;
222 struct inode *wii_inode;
223 struct ecryptfs_crypt_stat crypt_stat;
224};
225
226/* dentry private data. Each dentry must keep track of a lower
227 * vfsmount too. */
228struct ecryptfs_dentry_info {
229 struct dentry *wdi_dentry;
230 struct vfsmount *lower_mnt;
231 struct ecryptfs_crypt_stat *crypt_stat;
232};
233
234/**
235 * This struct is to enable a mount-wide passphrase/salt combo. This
236 * is more or less a stopgap to provide similar functionality to other
237 * crypto filesystems like EncFS or CFS until full policy support is
238 * implemented in eCryptfs.
239 */
240struct ecryptfs_mount_crypt_stat {
241 /* Pointers to memory we do not own, do not free these */
242#define ECRYPTFS_PLAINTEXT_PASSTHROUGH_ENABLED 0x00000001
243 u32 flags;
244 struct ecryptfs_auth_tok *global_auth_tok;
245 struct key *global_auth_tok_key;
246 size_t global_default_cipher_key_size;
247 struct crypto_tfm *global_key_tfm;
248 struct mutex global_key_tfm_mutex;
249 unsigned char global_default_cipher_name[ECRYPTFS_MAX_CIPHER_NAME_SIZE
250 + 1];
251 unsigned char global_auth_tok_sig[ECRYPTFS_SIG_SIZE_HEX + 1];
252};
253
254/* superblock private data. */
255struct ecryptfs_sb_info {
256 struct super_block *wsi_sb;
257 struct ecryptfs_mount_crypt_stat mount_crypt_stat;
258};
259
260/* file private data. */
261struct ecryptfs_file_info {
262 struct file *wfi_file;
263 struct ecryptfs_crypt_stat *crypt_stat;
264};
265
266/* auth_tok <=> encrypted_session_key mappings */
267struct ecryptfs_auth_tok_list_item {
268 unsigned char encrypted_session_key[ECRYPTFS_MAX_KEY_BYTES];
269 struct list_head list;
270 struct ecryptfs_auth_tok auth_tok;
271};
272
273static inline struct ecryptfs_file_info *
274ecryptfs_file_to_private(struct file *file)
275{
276 return (struct ecryptfs_file_info *)file->private_data;
277}
278
279static inline void
280ecryptfs_set_file_private(struct file *file,
281 struct ecryptfs_file_info *file_info)
282{
283 file->private_data = file_info;
284}
285
286static inline struct file *ecryptfs_file_to_lower(struct file *file)
287{
288 return ((struct ecryptfs_file_info *)file->private_data)->wfi_file;
289}
290
291static inline void
292ecryptfs_set_file_lower(struct file *file, struct file *lower_file)
293{
294 ((struct ecryptfs_file_info *)file->private_data)->wfi_file =
295 lower_file;
296}
297
298static inline struct ecryptfs_inode_info *
299ecryptfs_inode_to_private(struct inode *inode)
300{
301 return container_of(inode, struct ecryptfs_inode_info, vfs_inode);
302}
303
304static inline struct inode *ecryptfs_inode_to_lower(struct inode *inode)
305{
306 return ecryptfs_inode_to_private(inode)->wii_inode;
307}
308
309static inline void
310ecryptfs_set_inode_lower(struct inode *inode, struct inode *lower_inode)
311{
312 ecryptfs_inode_to_private(inode)->wii_inode = lower_inode;
313}
314
315static inline struct ecryptfs_sb_info *
316ecryptfs_superblock_to_private(struct super_block *sb)
317{
318 return (struct ecryptfs_sb_info *)sb->s_fs_info;
319}
320
321static inline void
322ecryptfs_set_superblock_private(struct super_block *sb,
323 struct ecryptfs_sb_info *sb_info)
324{
325 sb->s_fs_info = sb_info;
326}
327
328static inline struct super_block *
329ecryptfs_superblock_to_lower(struct super_block *sb)
330{
331 return ((struct ecryptfs_sb_info *)sb->s_fs_info)->wsi_sb;
332}
333
334static inline void
335ecryptfs_set_superblock_lower(struct super_block *sb,
336 struct super_block *lower_sb)
337{
338 ((struct ecryptfs_sb_info *)sb->s_fs_info)->wsi_sb = lower_sb;
339}
340
341static inline struct ecryptfs_dentry_info *
342ecryptfs_dentry_to_private(struct dentry *dentry)
343{
344 return (struct ecryptfs_dentry_info *)dentry->d_fsdata;
345}
346
347static inline void
348ecryptfs_set_dentry_private(struct dentry *dentry,
349 struct ecryptfs_dentry_info *dentry_info)
350{
351 dentry->d_fsdata = dentry_info;
352}
353
354static inline struct dentry *
355ecryptfs_dentry_to_lower(struct dentry *dentry)
356{
357 return ((struct ecryptfs_dentry_info *)dentry->d_fsdata)->wdi_dentry;
358}
359
360static inline void
361ecryptfs_set_dentry_lower(struct dentry *dentry, struct dentry *lower_dentry)
362{
363 ((struct ecryptfs_dentry_info *)dentry->d_fsdata)->wdi_dentry =
364 lower_dentry;
365}
366
367static inline struct vfsmount *
368ecryptfs_dentry_to_lower_mnt(struct dentry *dentry)
369{
370 return ((struct ecryptfs_dentry_info *)dentry->d_fsdata)->lower_mnt;
371}
372
373static inline void
374ecryptfs_set_dentry_lower_mnt(struct dentry *dentry, struct vfsmount *lower_mnt)
375{
376 ((struct ecryptfs_dentry_info *)dentry->d_fsdata)->lower_mnt =
377 lower_mnt;
378}
379
380#define ecryptfs_printk(type, fmt, arg...) \
381        __ecryptfs_printk(type "%s: " fmt, __FUNCTION__, ## arg)
382void __ecryptfs_printk(const char *fmt, ...);
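/* String pasting makes this macro prefix every message with the
 * calling function's name; for example
 *
 *	ecryptfs_printk(KERN_DEBUG, "rc = [%d]\n", rc);
 *
 * expands to
 *
 *	__ecryptfs_printk(KERN_DEBUG "%s: rc = [%d]\n", __FUNCTION__, rc);
 */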
383
384extern const struct file_operations ecryptfs_main_fops;
385extern const struct file_operations ecryptfs_dir_fops;
386extern struct inode_operations ecryptfs_main_iops;
387extern struct inode_operations ecryptfs_dir_iops;
388extern struct inode_operations ecryptfs_symlink_iops;
389extern struct super_operations ecryptfs_sops;
390extern struct dentry_operations ecryptfs_dops;
391extern struct address_space_operations ecryptfs_aops;
392extern int ecryptfs_verbosity;
393
394extern struct kmem_cache *ecryptfs_auth_tok_list_item_cache;
395extern struct kmem_cache *ecryptfs_file_info_cache;
396extern struct kmem_cache *ecryptfs_dentry_info_cache;
397extern struct kmem_cache *ecryptfs_inode_info_cache;
398extern struct kmem_cache *ecryptfs_sb_info_cache;
399extern struct kmem_cache *ecryptfs_header_cache_0;
400extern struct kmem_cache *ecryptfs_header_cache_1;
401extern struct kmem_cache *ecryptfs_header_cache_2;
402extern struct kmem_cache *ecryptfs_lower_page_cache;
403
404int ecryptfs_interpose(struct dentry *hidden_dentry,
405 struct dentry *this_dentry, struct super_block *sb,
406 int flag);
407int ecryptfs_fill_zeros(struct file *file, loff_t new_length);
408int ecryptfs_decode_filename(struct ecryptfs_crypt_stat *crypt_stat,
409 const char *name, int length,
410 char **decrypted_name);
411int ecryptfs_encode_filename(struct ecryptfs_crypt_stat *crypt_stat,
412 const char *name, int length,
413 char **encoded_name);
414struct dentry *ecryptfs_lower_dentry(struct dentry *this_dentry);
415void ecryptfs_copy_attr_atime(struct inode *dest, const struct inode *src);
416void ecryptfs_copy_attr_all(struct inode *dest, const struct inode *src);
417void ecryptfs_copy_inode_size(struct inode *dst, const struct inode *src);
418void ecryptfs_dump_hex(char *data, int bytes);
419int virt_to_scatterlist(const void *addr, int size, struct scatterlist *sg,
420 int sg_size);
421int ecryptfs_compute_root_iv(struct ecryptfs_crypt_stat *crypt_stat);
422void ecryptfs_rotate_iv(unsigned char *iv);
423void ecryptfs_init_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat);
424void ecryptfs_destruct_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat);
425void ecryptfs_destruct_mount_crypt_stat(
426 struct ecryptfs_mount_crypt_stat *mount_crypt_stat);
427int ecryptfs_init_crypt_ctx(struct ecryptfs_crypt_stat *crypt_stat);
428int ecryptfs_write_inode_size_to_header(struct file *lower_file,
429 struct inode *lower_inode,
430 struct inode *inode);
431int ecryptfs_get_lower_page(struct page **lower_page, struct inode *lower_inode,
432 struct file *lower_file,
433 unsigned long lower_page_index, int byte_offset,
434 int region_bytes);
435int
436ecryptfs_commit_lower_page(struct page *lower_page, struct inode *lower_inode,
437 struct file *lower_file, int byte_offset,
438 int region_size);
439int ecryptfs_copy_page_to_lower(struct page *page, struct inode *lower_inode,
440 struct file *lower_file);
441int ecryptfs_do_readpage(struct file *file, struct page *page,
442 pgoff_t lower_page_index);
443int ecryptfs_grab_and_map_lower_page(struct page **lower_page,
444 char **lower_virt,
445 struct inode *lower_inode,
446 unsigned long lower_page_index);
447int ecryptfs_writepage_and_release_lower_page(struct page *lower_page,
448 struct inode *lower_inode,
449 struct writeback_control *wbc);
450int ecryptfs_encrypt_page(struct ecryptfs_page_crypt_context *ctx);
451int ecryptfs_decrypt_page(struct file *file, struct page *page);
452int ecryptfs_write_headers(struct dentry *ecryptfs_dentry,
453 struct file *lower_file);
454int ecryptfs_write_headers_virt(char *page_virt,
455 struct ecryptfs_crypt_stat *crypt_stat,
456 struct dentry *ecryptfs_dentry);
457int ecryptfs_read_headers(struct dentry *ecryptfs_dentry,
458 struct file *lower_file);
459int ecryptfs_new_file_context(struct dentry *ecryptfs_dentry);
460int contains_ecryptfs_marker(char *data);
461int ecryptfs_read_header_region(char *data, struct dentry *dentry,
462 struct vfsmount *mnt);
463u16 ecryptfs_code_for_cipher_string(struct ecryptfs_crypt_stat *crypt_stat);
464int ecryptfs_cipher_code_to_string(char *str, u16 cipher_code);
465void ecryptfs_set_default_sizes(struct ecryptfs_crypt_stat *crypt_stat);
466int ecryptfs_generate_key_packet_set(char *dest_base,
467 struct ecryptfs_crypt_stat *crypt_stat,
468 struct dentry *ecryptfs_dentry,
469 size_t *len, size_t max);
470int process_request_key_err(long err_code);
471int
472ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat,
473 unsigned char *src, struct dentry *ecryptfs_dentry);
474int ecryptfs_truncate(struct dentry *dentry, loff_t new_length);
475int
476ecryptfs_process_cipher(struct crypto_tfm **tfm, struct crypto_tfm **key_tfm,
477 char *cipher_name, size_t key_size);
478int ecryptfs_inode_test(struct inode *inode, void *candidate_lower_inode);
479int ecryptfs_inode_set(struct inode *inode, void *lower_inode);
480void ecryptfs_init_inode(struct inode *inode, struct inode *lower_inode);
481
482#endif /* #ifndef ECRYPTFS_KERNEL_H */
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
new file mode 100644
index 000000000000..c8550c9f9cd2
--- /dev/null
+++ b/fs/ecryptfs/file.c
@@ -0,0 +1,440 @@
1/**
2 * eCryptfs: Linux filesystem encryption layer
3 *
4 * Copyright (C) 1997-2004 Erez Zadok
5 * Copyright (C) 2001-2004 Stony Brook University
6 * Copyright (C) 2004-2006 International Business Machines Corp.
7 * Author(s): Michael A. Halcrow <mhalcrow@us.ibm.com>
8 * Michael C. Thompson <mcthomps@us.ibm.com>
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License as
12 * published by the Free Software Foundation; either version 2 of the
13 * License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful, but
16 * WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
23 * 02111-1307, USA.
24 */
25
26#include <linux/file.h>
27#include <linux/poll.h>
28#include <linux/mount.h>
29#include <linux/pagemap.h>
30#include <linux/security.h>
31#include <linux/smp_lock.h>
32#include <linux/compat.h>
33#include "ecryptfs_kernel.h"
34
35/**
36 * ecryptfs_llseek
37 * @file: File we are seeking in
38 * @offset: The offset to seek to
39 * @origin: 2 - offset from i_size; 1 - offset from f_pos
40 *
41 * Returns the resulting file position, or a negative value on error
42 */
43static loff_t ecryptfs_llseek(struct file *file, loff_t offset, int origin)
44{
45 loff_t rv;
46 loff_t new_end_pos;
47 int rc;
48 int expanding_file = 0;
49 struct inode *inode = file->f_mapping->host;
50
51 /* If our offset is past the end of our file, we're going to
52 * need to grow it so we have a valid length of 0's */
53 new_end_pos = offset;
54 switch (origin) {
55 case 2:
56 new_end_pos += i_size_read(inode);
57 expanding_file = 1;
58 break;
59 case 1:
60 new_end_pos += file->f_pos;
61 if (new_end_pos > i_size_read(inode)) {
62 ecryptfs_printk(KERN_DEBUG, "new_end_pos(=[0x%.16x]) "
63 "> i_size_read(inode)(=[0x%.16x])\n",
64 new_end_pos, i_size_read(inode));
65 expanding_file = 1;
66 }
67 break;
68 default:
69 if (new_end_pos > i_size_read(inode)) {
70 ecryptfs_printk(KERN_DEBUG, "new_end_pos(=[0x%.16x]) "
71 "> i_size_read(inode)(=[0x%.16x])\n",
72 new_end_pos, i_size_read(inode));
73 expanding_file = 1;
74 }
75 }
76 ecryptfs_printk(KERN_DEBUG, "new_end_pos = [0x%.16x]\n", new_end_pos);
77 if (expanding_file) {
78 rc = ecryptfs_truncate(file->f_dentry, new_end_pos);
79 if (rc) {
80 rv = rc;
81 ecryptfs_printk(KERN_ERR, "Error on attempt to "
82 "truncate to (higher) offset [0x%.16x];"
83 " rc = [%d]\n", new_end_pos, rc);
84 goto out;
85 }
86 }
87 rv = generic_file_llseek(file, offset, origin);
88out:
89 return rv;
90}
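/* The zero-fill above is what makes a sparse seek-and-write behave as
 * expected from userspace. A hypothetical sequence against a 100-byte
 * eCryptfs file (names are illustrative):
 *
 *	fd = open("secret.txt", O_RDWR);
 *	lseek(fd, 4096, SEEK_SET);	// past EOF: ecryptfs_truncate()
 *					// grows the file with zeros
 *	write(fd, "x", 1);		// file is now 4097 bytes long
 */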
91
92/**
93 * ecryptfs_read_update_atime
94 *
95 * generic_file_aio_read() updates the atime of the upper layer inode,
96 * but it does not give us a chance to update the atime of the lower
97 * layer inode. This function wraps generic_file_aio_read(); it
98 * updates the atime of the lower level inode if the read returns
99 * without any errors. This is to be used only for file reads.
100 * The function to be used for directory reads is ecryptfs_read.
101 */
102static ssize_t ecryptfs_read_update_atime(struct kiocb *iocb,
103 const struct iovec *iov,
104 unsigned long nr_segs, loff_t pos)
105{
106 int rc;
107 struct dentry *lower_dentry;
108 struct vfsmount *lower_vfsmount;
109 struct file *file = iocb->ki_filp;
110
111 rc = generic_file_aio_read(iocb, iov, nr_segs, pos);
112 /*
113 * Even though this is a async interface, we need to wait
114 * for IO to finish to update atime
115 */
116 if (-EIOCBQUEUED == rc)
117 rc = wait_on_sync_kiocb(iocb);
118 if (rc >= 0) {
119 lower_dentry = ecryptfs_dentry_to_lower(file->f_dentry);
120 lower_vfsmount = ecryptfs_dentry_to_lower_mnt(file->f_dentry);
121 touch_atime(lower_vfsmount, lower_dentry);
122 }
123 return rc;
124}
125
126struct ecryptfs_getdents_callback {
127 void *dirent;
128 struct dentry *dentry;
129 filldir_t filldir;
130 int err;
131 int filldir_called;
132 int entries_written;
133};
134
135/* Inspired by generic filldir in fs/readdir.c */
136static int
137ecryptfs_filldir(void *dirent, const char *name, int namelen, loff_t offset,
138 u64 ino, unsigned int d_type)
139{
140 struct ecryptfs_crypt_stat *crypt_stat;
141 struct ecryptfs_getdents_callback *buf =
142 (struct ecryptfs_getdents_callback *)dirent;
143 int rc;
144 int decoded_length;
145 char *decoded_name;
146
147 crypt_stat = ecryptfs_dentry_to_private(buf->dentry)->crypt_stat;
148 buf->filldir_called++;
149 decoded_length = ecryptfs_decode_filename(crypt_stat, name, namelen,
150 &decoded_name);
151 if (decoded_length < 0) {
152 rc = decoded_length;
153 goto out;
154 }
155 rc = buf->filldir(buf->dirent, decoded_name, decoded_length, offset,
156 ino, d_type);
157 kfree(decoded_name);
158 if (rc >= 0)
159 buf->entries_written++;
160out:
161 return rc;
162}
163
164/**
165 * ecryptfs_readdir
166 * @file: The ecryptfs file struct
167 * @dirent: Directory entry
168 * @filldir: The filldir callback function
169 */
170static int ecryptfs_readdir(struct file *file, void *dirent, filldir_t filldir)
171{
172 int rc;
173 struct file *lower_file;
174 struct inode *inode;
175 struct ecryptfs_getdents_callback buf;
176
177 lower_file = ecryptfs_file_to_lower(file);
178 lower_file->f_pos = file->f_pos;
179 inode = file->f_dentry->d_inode;
180 memset(&buf, 0, sizeof(buf));
181 buf.dirent = dirent;
182 buf.dentry = file->f_dentry;
183 buf.filldir = filldir;
184retry:
185 buf.filldir_called = 0;
186 buf.entries_written = 0;
187 buf.err = 0;
188 rc = vfs_readdir(lower_file, ecryptfs_filldir, (void *)&buf);
189 if (buf.err)
190 rc = buf.err;
191 if (buf.filldir_called && !buf.entries_written)
192 goto retry;
193 file->f_pos = lower_file->f_pos;
194 if (rc >= 0)
195 ecryptfs_copy_attr_atime(inode, lower_file->f_dentry->d_inode);
196 return rc;
197}
198
199struct kmem_cache *ecryptfs_file_info_cache;
200
201/**
202 * ecryptfs_open
203 * @inode: inode specifying the file to open
204 * @file: Structure to return filled in
205 *
206 * Opens the file specified by inode.
207 *
208 * Returns zero on success; non-zero otherwise
209 */
210static int ecryptfs_open(struct inode *inode, struct file *file)
211{
212 int rc = 0;
213 struct ecryptfs_crypt_stat *crypt_stat = NULL;
214 struct ecryptfs_mount_crypt_stat *mount_crypt_stat;
215 struct dentry *ecryptfs_dentry = file->f_dentry;
216 /* Private value of ecryptfs_dentry allocated in
217 * ecryptfs_lookup() */
218 struct dentry *lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry);
219 struct inode *lower_inode = NULL;
220 struct file *lower_file = NULL;
221 struct vfsmount *lower_mnt;
222 struct ecryptfs_file_info *file_info;
223 int lower_flags;
224
225 /* Released in ecryptfs_release or end of function if failure */
226 file_info = kmem_cache_alloc(ecryptfs_file_info_cache, SLAB_KERNEL);
227 ecryptfs_set_file_private(file, file_info);
228 if (!file_info) {
229 ecryptfs_printk(KERN_ERR,
230 "Error attempting to allocate memory\n");
231 rc = -ENOMEM;
232 goto out;
233 }
234 memset(file_info, 0, sizeof(*file_info));
235 lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry);
236 crypt_stat = &ecryptfs_inode_to_private(inode)->crypt_stat;
237 mount_crypt_stat = &ecryptfs_superblock_to_private(
238 ecryptfs_dentry->d_sb)->mount_crypt_stat;
239 mutex_lock(&crypt_stat->cs_mutex);
240 if (!ECRYPTFS_CHECK_FLAG(crypt_stat->flags, ECRYPTFS_POLICY_APPLIED)) {
241 ecryptfs_printk(KERN_DEBUG, "Setting flags for stat...\n");
242 /* Policy code enabled in future release */
243 ECRYPTFS_SET_FLAG(crypt_stat->flags, ECRYPTFS_POLICY_APPLIED);
244 ECRYPTFS_SET_FLAG(crypt_stat->flags, ECRYPTFS_ENCRYPTED);
245 }
246 mutex_unlock(&crypt_stat->cs_mutex);
247 /* This mntget & dget is undone via fput when the file is released */
248 dget(lower_dentry);
249 lower_flags = file->f_flags;
250 if ((lower_flags & O_ACCMODE) == O_WRONLY)
251 lower_flags = (lower_flags & O_ACCMODE) | O_RDWR;
252 if (file->f_flags & O_APPEND)
253 lower_flags &= ~O_APPEND;
254 lower_mnt = ecryptfs_dentry_to_lower_mnt(ecryptfs_dentry);
255 mntget(lower_mnt);
256 /* Corresponding fput() in ecryptfs_release() */
257 lower_file = dentry_open(lower_dentry, lower_mnt, lower_flags);
258 if (IS_ERR(lower_file)) {
259 rc = PTR_ERR(lower_file);
260 ecryptfs_printk(KERN_ERR, "Error opening lower file\n");
261 goto out_puts;
262 }
263 ecryptfs_set_file_lower(file, lower_file);
264 /* Isn't this check the same as the one in lookup? */
265 lower_inode = lower_dentry->d_inode;
266 if (S_ISDIR(ecryptfs_dentry->d_inode->i_mode)) {
267 ecryptfs_printk(KERN_DEBUG, "This is a directory\n");
268 ECRYPTFS_CLEAR_FLAG(crypt_stat->flags, ECRYPTFS_ENCRYPTED);
269 rc = 0;
270 goto out;
271 }
272 mutex_lock(&crypt_stat->cs_mutex);
273 if (i_size_read(lower_inode) < ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE) {
274 if (!(mount_crypt_stat->flags
275 & ECRYPTFS_PLAINTEXT_PASSTHROUGH_ENABLED)) {
276 rc = -EIO;
277 printk(KERN_WARNING "Attempt to read file that is "
278 "not in a valid eCryptfs format, and plaintext "
279 "passthrough mode is not enabled; returning "
280 "-EIO\n");
281 mutex_unlock(&crypt_stat->cs_mutex);
282 goto out_puts;
283 }
284		ECRYPTFS_CLEAR_FLAG(crypt_stat->flags, ECRYPTFS_ENCRYPTED);
285 rc = 0;
286 mutex_unlock(&crypt_stat->cs_mutex);
287 goto out;
288 } else if (!ECRYPTFS_CHECK_FLAG(crypt_stat->flags,
289 ECRYPTFS_POLICY_APPLIED)
290 || !ECRYPTFS_CHECK_FLAG(crypt_stat->flags,
291 ECRYPTFS_KEY_VALID)) {
292 rc = ecryptfs_read_headers(ecryptfs_dentry, lower_file);
293 if (rc) {
294 ecryptfs_printk(KERN_DEBUG,
295 "Valid headers not found\n");
296 if (!(mount_crypt_stat->flags
297 & ECRYPTFS_PLAINTEXT_PASSTHROUGH_ENABLED)) {
298 rc = -EIO;
299 printk(KERN_WARNING "Attempt to read file that "
300 "is not in a valid eCryptfs format, "
301 "and plaintext passthrough mode is not "
302 "enabled; returning -EIO\n");
303 mutex_unlock(&crypt_stat->cs_mutex);
304 goto out_puts;
305 }
306 ECRYPTFS_CLEAR_FLAG(crypt_stat->flags,
307 ECRYPTFS_ENCRYPTED);
308 rc = 0;
309 mutex_unlock(&crypt_stat->cs_mutex);
310 goto out;
311 }
312 }
313 mutex_unlock(&crypt_stat->cs_mutex);
314 ecryptfs_printk(KERN_DEBUG, "inode w/ addr = [0x%p], i_ino = [0x%.16x] "
315 "size: [0x%.16x]\n", inode, inode->i_ino,
316 i_size_read(inode));
317 ecryptfs_set_file_lower(file, lower_file);
318 goto out;
319out_puts:
320 mntput(lower_mnt);
321 dput(lower_dentry);
322 kmem_cache_free(ecryptfs_file_info_cache,
323 ecryptfs_file_to_private(file));
324out:
325 return rc;
326}
327
328static int ecryptfs_flush(struct file *file, fl_owner_t td)
329{
330 int rc = 0;
331 struct file *lower_file = NULL;
332
333 lower_file = ecryptfs_file_to_lower(file);
334 if (lower_file->f_op && lower_file->f_op->flush)
335 rc = lower_file->f_op->flush(lower_file, td);
336 return rc;
337}
338
339static int ecryptfs_release(struct inode *inode, struct file *file)
340{
341 struct file *lower_file = ecryptfs_file_to_lower(file);
342 struct ecryptfs_file_info *file_info = ecryptfs_file_to_private(file);
343 struct inode *lower_inode = ecryptfs_inode_to_lower(inode);
344
345 fput(lower_file);
346 inode->i_blocks = lower_inode->i_blocks;
347 kmem_cache_free(ecryptfs_file_info_cache, file_info);
348 return 0;
349}
350
351static int
352ecryptfs_fsync(struct file *file, struct dentry *dentry, int datasync)
353{
354 struct file *lower_file = ecryptfs_file_to_lower(file);
355 struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
356 struct inode *lower_inode = lower_dentry->d_inode;
357 int rc = -EINVAL;
358
359 if (lower_inode->i_fop->fsync) {
360 mutex_lock(&lower_inode->i_mutex);
361 rc = lower_inode->i_fop->fsync(lower_file, lower_dentry,
362 datasync);
363 mutex_unlock(&lower_inode->i_mutex);
364 }
365 return rc;
366}
367
368static int ecryptfs_fasync(int fd, struct file *file, int flag)
369{
370 int rc = 0;
371 struct file *lower_file = NULL;
372
373 lower_file = ecryptfs_file_to_lower(file);
374 if (lower_file->f_op && lower_file->f_op->fasync)
375 rc = lower_file->f_op->fasync(fd, lower_file, flag);
376 return rc;
377}
378
379static ssize_t ecryptfs_sendfile(struct file *file, loff_t * ppos,
380 size_t count, read_actor_t actor, void *target)
381{
382 struct file *lower_file = NULL;
383 int rc = -EINVAL;
384
385 lower_file = ecryptfs_file_to_lower(file);
386 if (lower_file->f_op && lower_file->f_op->sendfile)
387 rc = lower_file->f_op->sendfile(lower_file, ppos, count,
388 actor, target);
389
390 return rc;
391}
392
393static int ecryptfs_ioctl(struct inode *inode, struct file *file,
394 unsigned int cmd, unsigned long arg);
395
396const struct file_operations ecryptfs_dir_fops = {
397 .readdir = ecryptfs_readdir,
398 .ioctl = ecryptfs_ioctl,
399 .mmap = generic_file_mmap,
400 .open = ecryptfs_open,
401 .flush = ecryptfs_flush,
402 .release = ecryptfs_release,
403 .fsync = ecryptfs_fsync,
404 .fasync = ecryptfs_fasync,
405 .sendfile = ecryptfs_sendfile,
406};
407
408const struct file_operations ecryptfs_main_fops = {
409 .llseek = ecryptfs_llseek,
410 .read = do_sync_read,
411 .aio_read = ecryptfs_read_update_atime,
412 .write = do_sync_write,
413 .aio_write = generic_file_aio_write,
414 .readdir = ecryptfs_readdir,
415 .ioctl = ecryptfs_ioctl,
416 .mmap = generic_file_mmap,
417 .open = ecryptfs_open,
418 .flush = ecryptfs_flush,
419 .release = ecryptfs_release,
420 .fsync = ecryptfs_fsync,
421 .fasync = ecryptfs_fasync,
422 .sendfile = ecryptfs_sendfile,
423};
424
425static int
426ecryptfs_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
427 unsigned long arg)
428{
429 int rc = 0;
430 struct file *lower_file = NULL;
431
432 if (ecryptfs_file_to_private(file))
433 lower_file = ecryptfs_file_to_lower(file);
434 if (lower_file && lower_file->f_op && lower_file->f_op->ioctl)
435 rc = lower_file->f_op->ioctl(ecryptfs_inode_to_lower(inode),
436 lower_file, cmd, arg);
437 else
438 rc = -ENOTTY;
439 return rc;
440}
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
new file mode 100644
index 000000000000..efdd2b7b62d7
--- /dev/null
+++ b/fs/ecryptfs/inode.c
@@ -0,0 +1,1079 @@
1/**
2 * eCryptfs: Linux filesystem encryption layer
3 *
4 * Copyright (C) 1997-2004 Erez Zadok
5 * Copyright (C) 2001-2004 Stony Brook University
6 * Copyright (C) 2004-2006 International Business Machines Corp.
7 * Author(s): Michael A. Halcrow <mahalcro@us.ibm.com>
8 * Michael C. Thompson <mcthomps@us.ibm.com>
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License as
12 * published by the Free Software Foundation; either version 2 of the
13 * License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful, but
16 * WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
23 * 02111-1307, USA.
24 */
25
26#include <linux/file.h>
27#include <linux/vmalloc.h>
28#include <linux/pagemap.h>
29#include <linux/dcache.h>
30#include <linux/namei.h>
31#include <linux/mount.h>
32#include <linux/crypto.h>
33#include "ecryptfs_kernel.h"
34
35static struct dentry *lock_parent(struct dentry *dentry)
36{
37 struct dentry *dir;
38
39 dir = dget(dentry->d_parent);
40 mutex_lock(&(dir->d_inode->i_mutex));
41 return dir;
42}
43
44static void unlock_parent(struct dentry *dentry)
45{
46 mutex_unlock(&(dentry->d_parent->d_inode->i_mutex));
47 dput(dentry->d_parent);
48}
49
50static void unlock_dir(struct dentry *dir)
51{
52 mutex_unlock(&dir->d_inode->i_mutex);
53 dput(dir);
54}
55
56void ecryptfs_copy_inode_size(struct inode *dst, const struct inode *src)
57{
58 i_size_write(dst, i_size_read((struct inode *)src));
59 dst->i_blocks = src->i_blocks;
60}
61
62void ecryptfs_copy_attr_atime(struct inode *dest, const struct inode *src)
63{
64 dest->i_atime = src->i_atime;
65}
66
67static void ecryptfs_copy_attr_times(struct inode *dest,
68 const struct inode *src)
69{
70 dest->i_atime = src->i_atime;
71 dest->i_mtime = src->i_mtime;
72 dest->i_ctime = src->i_ctime;
73}
74
75static void ecryptfs_copy_attr_timesizes(struct inode *dest,
76 const struct inode *src)
77{
78 dest->i_atime = src->i_atime;
79 dest->i_mtime = src->i_mtime;
80 dest->i_ctime = src->i_ctime;
81 ecryptfs_copy_inode_size(dest, src);
82}
83
84void ecryptfs_copy_attr_all(struct inode *dest, const struct inode *src)
85{
86 dest->i_mode = src->i_mode;
87 dest->i_nlink = src->i_nlink;
88 dest->i_uid = src->i_uid;
89 dest->i_gid = src->i_gid;
90 dest->i_rdev = src->i_rdev;
91 dest->i_atime = src->i_atime;
92 dest->i_mtime = src->i_mtime;
93 dest->i_ctime = src->i_ctime;
94 dest->i_blkbits = src->i_blkbits;
95 dest->i_flags = src->i_flags;
96}
97
98/**
99 * ecryptfs_create_underlying_file
100 * @lower_dir_inode: inode of the parent in the lower fs of the new file
101 * @dentry: New file's dentry in eCryptfs; the lower dentry and mount
102 * are derived from it via the ecryptfs_dentry_to_lower* helpers
103 * @mode: The mode of the new file
104 * @nd: nameidata of ecryptfs' parent's dentry & vfsmount
105 *
106 * Creates the file in the lower file system.
107 *
108 * Returns zero on success; non-zero on error condition
109 */
110static int
111ecryptfs_create_underlying_file(struct inode *lower_dir_inode,
112 struct dentry *dentry, int mode,
113 struct nameidata *nd)
114{
115 struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
116 struct vfsmount *lower_mnt = ecryptfs_dentry_to_lower_mnt(dentry);
117 struct dentry *dentry_save;
118 struct vfsmount *vfsmount_save;
119 int rc;
120
121 dentry_save = nd->dentry;
122 vfsmount_save = nd->mnt;
123 nd->dentry = lower_dentry;
124 nd->mnt = lower_mnt;
125 rc = vfs_create(lower_dir_inode, lower_dentry, mode, nd);
126 nd->dentry = dentry_save;
127 nd->mnt = vfsmount_save;
128 return rc;
129}
130
131/**
132 * ecryptfs_do_create
133 * @directory_inode: inode of the new file's dentry's parent in ecryptfs
134 * @ecryptfs_dentry: New file's dentry in ecryptfs
135 * @mode: The mode of the new file
136 * @nd: nameidata of ecryptfs' parent's dentry & vfsmount
137 *
138 * Creates the underlying file and the eCryptfs inode which will link to
139 * it. It will also update the eCryptfs directory inode to mimic the
140 * stat of the lower directory inode.
141 *
142 * Returns zero on success; non-zero on error condition
143 */
144static int
145ecryptfs_do_create(struct inode *directory_inode,
146 struct dentry *ecryptfs_dentry, int mode,
147 struct nameidata *nd)
148{
149 int rc;
150 struct dentry *lower_dentry;
151 struct dentry *lower_dir_dentry;
152
153 lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry);
154 lower_dir_dentry = lock_parent(lower_dentry);
155 if (unlikely(IS_ERR(lower_dir_dentry))) {
156 ecryptfs_printk(KERN_ERR, "Error locking directory of "
157 "dentry\n");
158 rc = PTR_ERR(lower_dir_dentry);
159 goto out;
160 }
161 rc = ecryptfs_create_underlying_file(lower_dir_dentry->d_inode,
162 ecryptfs_dentry, mode, nd);
163 if (unlikely(rc)) {
164 ecryptfs_printk(KERN_ERR,
165 "Failure to create underlying file\n");
166 goto out_lock;
167 }
168 rc = ecryptfs_interpose(lower_dentry, ecryptfs_dentry,
169 directory_inode->i_sb, 0);
170 if (rc) {
171 ecryptfs_printk(KERN_ERR, "Failure in ecryptfs_interpose\n");
172 goto out_lock;
173 }
174 ecryptfs_copy_attr_timesizes(directory_inode,
175 lower_dir_dentry->d_inode);
176out_lock:
177 unlock_dir(lower_dir_dentry);
178out:
179 return rc;
180}
181
182/**
183 * grow_file
184 * @ecryptfs_dentry: the ecryptfs dentry
185 * @lower_file: The lower file
186 * @inode: The ecryptfs inode
187 * @lower_inode: The lower inode
188 *
189 * This is the code which will grow the file to its correct size.
190 */
191static int grow_file(struct dentry *ecryptfs_dentry, struct file *lower_file,
192 struct inode *inode, struct inode *lower_inode)
193{
194 int rc = 0;
195 struct file fake_file;
196 struct ecryptfs_file_info tmp_file_info;
197
198 memset(&fake_file, 0, sizeof(fake_file));
199 fake_file.f_dentry = ecryptfs_dentry;
200 memset(&tmp_file_info, 0, sizeof(tmp_file_info));
201 ecryptfs_set_file_private(&fake_file, &tmp_file_info);
202 ecryptfs_set_file_lower(&fake_file, lower_file);
203 rc = ecryptfs_fill_zeros(&fake_file, 1);
204 if (rc) {
205 ECRYPTFS_SET_FLAG(
206 ecryptfs_inode_to_private(inode)->crypt_stat.flags,
207 ECRYPTFS_SECURITY_WARNING);
208 ecryptfs_printk(KERN_WARNING, "Error attempting to fill zeros "
209 "in file; rc = [%d]\n", rc);
210 goto out;
211 }
212 i_size_write(inode, 0);
213 ecryptfs_write_inode_size_to_header(lower_file, lower_inode, inode);
214 ECRYPTFS_SET_FLAG(ecryptfs_inode_to_private(inode)->crypt_stat.flags,
215 ECRYPTFS_NEW_FILE);
216out:
217 return rc;
218}
219
220/**
221 * ecryptfs_initialize_file
222 * @ecryptfs_dentry: the eCryptfs dentry of the new file
223 * Cause the file to be changed from a basic empty file to an ecryptfs
224 * file with a header and first data page.
225 *
226 * Returns zero on success
227 */
228static int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry)
229{
230 int rc = 0;
231 int lower_flags;
232 struct ecryptfs_crypt_stat *crypt_stat;
233 struct dentry *lower_dentry;
234 struct dentry *tlower_dentry = NULL;
235 struct file *lower_file;
236 struct inode *inode, *lower_inode;
237 struct vfsmount *lower_mnt;
238
239 lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry);
240 ecryptfs_printk(KERN_DEBUG, "lower_dentry->d_name.name = [%s]\n",
241 lower_dentry->d_name.name);
242 inode = ecryptfs_dentry->d_inode;
243 crypt_stat = &ecryptfs_inode_to_private(inode)->crypt_stat;
244 tlower_dentry = dget(lower_dentry);
245 if (!tlower_dentry) {
246 rc = -ENOMEM;
247 ecryptfs_printk(KERN_ERR, "Error dget'ing lower_dentry\n");
248 goto out;
249 }
250 lower_flags = ((O_CREAT | O_WRONLY | O_TRUNC) & O_ACCMODE) | O_RDWR;
251#if BITS_PER_LONG != 32
252 lower_flags |= O_LARGEFILE;
253#endif
254 lower_mnt = ecryptfs_dentry_to_lower_mnt(ecryptfs_dentry);
255 mntget(lower_mnt);
256 /* Corresponding fput() at end of this function */
257 lower_file = dentry_open(tlower_dentry, lower_mnt, lower_flags);
258 if (IS_ERR(lower_file)) {
259 rc = PTR_ERR(lower_file);
260 ecryptfs_printk(KERN_ERR,
261 "Error opening dentry; rc = [%i]\n", rc);
262 goto out;
263 }
264 /* fput(lower_file) should handle the puts if we do this */
265 lower_file->f_dentry = tlower_dentry;
266 lower_file->f_vfsmnt = lower_mnt;
267 lower_inode = tlower_dentry->d_inode;
268 if (S_ISDIR(ecryptfs_dentry->d_inode->i_mode)) {
269 ecryptfs_printk(KERN_DEBUG, "This is a directory\n");
270 ECRYPTFS_CLEAR_FLAG(crypt_stat->flags, ECRYPTFS_ENCRYPTED);
271 goto out_fput;
272 }
273 ECRYPTFS_SET_FLAG(crypt_stat->flags, ECRYPTFS_NEW_FILE);
274 ecryptfs_printk(KERN_DEBUG, "Initializing crypto context\n");
275 rc = ecryptfs_new_file_context(ecryptfs_dentry);
276 if (rc) {
277 ecryptfs_printk(KERN_DEBUG, "Error creating new file "
278 "context\n");
279 goto out_fput;
280 }
281 rc = ecryptfs_write_headers(ecryptfs_dentry, lower_file);
282 if (rc) {
283 ecryptfs_printk(KERN_DEBUG, "Error writing headers\n");
284 goto out_fput;
285 }
286 rc = grow_file(ecryptfs_dentry, lower_file, inode, lower_inode);
287out_fput:
288 fput(lower_file);
289out:
290 return rc;
291}
292
293/**
294 * ecryptfs_create
295 * @directory_inode: The inode of the directory in which to create the file.
296 * @ecryptfs_dentry: The eCryptfs dentry
297 * @mode: The mode of the new file.
298 * @nd: nameidata
299 *
300 * Creates a new file.
301 *
302 * Returns zero on success; non-zero on error condition
303 */
304static int
305ecryptfs_create(struct inode *directory_inode, struct dentry *ecryptfs_dentry,
306 int mode, struct nameidata *nd)
307{
308 int rc;
309
310 rc = ecryptfs_do_create(directory_inode, ecryptfs_dentry, mode, nd);
311 if (unlikely(rc)) {
312		ecryptfs_printk(KERN_WARNING, "Failed to create file in "
313				"lower filesystem\n");
314 goto out;
315 }
316 /* At this point, a file exists on "disk"; we need to make sure
317 * that this on disk file is prepared to be an ecryptfs file */
318 rc = ecryptfs_initialize_file(ecryptfs_dentry);
319out:
320 return rc;
321}
322
323/**
324 * ecryptfs_lookup
325 * @dir: inode
326 * @dentry: The dentry
327 * @nd: nameidata, may be NULL
328 *
329 * Find a file on disk. If the file does not exist, add a negative
330 * dentry to the dentry cache; otherwise, read its headers from disk.
331 */
332static struct dentry *ecryptfs_lookup(struct inode *dir, struct dentry *dentry,
333 struct nameidata *nd)
334{
335 int rc = 0;
336 struct dentry *lower_dir_dentry;
337 struct dentry *lower_dentry;
338 struct vfsmount *lower_mnt;
339 struct dentry *tlower_dentry = NULL;
340 char *encoded_name;
341	int encoded_namelen;
342 struct ecryptfs_crypt_stat *crypt_stat = NULL;
343 char *page_virt = NULL;
344 struct inode *lower_inode;
345 u64 file_size;
346
347 lower_dir_dentry = ecryptfs_dentry_to_lower(dentry->d_parent);
348 dentry->d_op = &ecryptfs_dops;
349 if ((dentry->d_name.len == 1 && !strcmp(dentry->d_name.name, "."))
350 || (dentry->d_name.len == 2 && !strcmp(dentry->d_name.name, "..")))
351 goto out_drop;
352 encoded_namelen = ecryptfs_encode_filename(crypt_stat,
353 dentry->d_name.name,
354 dentry->d_name.len,
355 &encoded_name);
356 if (encoded_namelen < 0) {
357 rc = encoded_namelen;
358 goto out_drop;
359 }
360 ecryptfs_printk(KERN_DEBUG, "encoded_name = [%s]; encoded_namelen "
361 "= [%d]\n", encoded_name, encoded_namelen);
362 lower_dentry = lookup_one_len(encoded_name, lower_dir_dentry,
363 encoded_namelen - 1);
364 kfree(encoded_name);
365 lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(dentry->d_parent));
366 if (IS_ERR(lower_dentry)) {
367 ecryptfs_printk(KERN_ERR, "ERR from lower_dentry\n");
368 rc = PTR_ERR(lower_dentry);
369 goto out_drop;
370 }
371 ecryptfs_printk(KERN_DEBUG, "lower_dentry = [%p]; lower_dentry->"
372 "d_name.name = [%s]\n", lower_dentry,
373 lower_dentry->d_name.name);
374 lower_inode = lower_dentry->d_inode;
375 ecryptfs_copy_attr_atime(dir, lower_dir_dentry->d_inode);
376 BUG_ON(!atomic_read(&lower_dentry->d_count));
377 ecryptfs_set_dentry_private(dentry,
378 kmem_cache_alloc(ecryptfs_dentry_info_cache,
379 SLAB_KERNEL));
380 if (!ecryptfs_dentry_to_private(dentry)) {
381 rc = -ENOMEM;
382 ecryptfs_printk(KERN_ERR, "Out of memory whilst attempting "
383 "to allocate ecryptfs_dentry_info struct\n");
384 goto out_dput;
385 }
386 ecryptfs_set_dentry_lower(dentry, lower_dentry);
387 ecryptfs_set_dentry_lower_mnt(dentry, lower_mnt);
388 if (!lower_dentry->d_inode) {
389 /* We want to add because we couldn't find in lower */
390 d_add(dentry, NULL);
391 goto out;
392 }
393 rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb, 1);
394 if (rc) {
395 ecryptfs_printk(KERN_ERR, "Error interposing\n");
396 goto out_dput;
397 }
398 if (S_ISDIR(lower_inode->i_mode)) {
399 ecryptfs_printk(KERN_DEBUG, "Is a directory; returning\n");
400 goto out;
401 }
402 if (S_ISLNK(lower_inode->i_mode)) {
403 ecryptfs_printk(KERN_DEBUG, "Is a symlink; returning\n");
404 goto out;
405 }
406 if (!nd) {
407		ecryptfs_printk(KERN_DEBUG, "We have a NULL nd, just leave "
408				"as we *think* we are about to unlink\n");
409 goto out;
410 }
411 tlower_dentry = dget(lower_dentry);
412 if (!tlower_dentry || IS_ERR(tlower_dentry)) {
413 rc = -ENOMEM;
414 ecryptfs_printk(KERN_ERR, "Cannot dget lower_dentry\n");
415 goto out_dput;
416 }
417 /* Released in this function */
418 page_virt =
419 (char *)kmem_cache_alloc(ecryptfs_header_cache_2,
420 SLAB_USER);
421 if (!page_virt) {
422 rc = -ENOMEM;
423 ecryptfs_printk(KERN_ERR,
424 "Cannot ecryptfs_kmalloc a page\n");
425 goto out_dput;
426 }
427 memset(page_virt, 0, PAGE_CACHE_SIZE);
428 rc = ecryptfs_read_header_region(page_virt, tlower_dentry, nd->mnt);
429 crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat;
430 if (!ECRYPTFS_CHECK_FLAG(crypt_stat->flags, ECRYPTFS_POLICY_APPLIED))
431 ecryptfs_set_default_sizes(crypt_stat);
432 if (rc) {
433 rc = 0;
434 ecryptfs_printk(KERN_WARNING, "Error reading header region;"
435 " assuming unencrypted\n");
436 } else {
437 if (!contains_ecryptfs_marker(page_virt
438 + ECRYPTFS_FILE_SIZE_BYTES)) {
439 kmem_cache_free(ecryptfs_header_cache_2, page_virt);
440 goto out;
441 }
442 memcpy(&file_size, page_virt, sizeof(file_size));
443 file_size = be64_to_cpu(file_size);
444 i_size_write(dentry->d_inode, (loff_t)file_size);
445 }
446 kmem_cache_free(ecryptfs_header_cache_2, page_virt);
447 goto out;
448
449out_dput:
450 dput(lower_dentry);
451 if (tlower_dentry)
452 dput(tlower_dentry);
453out_drop:
454 d_drop(dentry);
455out:
456 return ERR_PTR(rc);
457}
458
459static int ecryptfs_link(struct dentry *old_dentry, struct inode *dir,
460 struct dentry *new_dentry)
461{
462 struct dentry *lower_old_dentry;
463 struct dentry *lower_new_dentry;
464 struct dentry *lower_dir_dentry;
465 u64 file_size_save;
466 int rc;
467
468 file_size_save = i_size_read(old_dentry->d_inode);
469 lower_old_dentry = ecryptfs_dentry_to_lower(old_dentry);
470 lower_new_dentry = ecryptfs_dentry_to_lower(new_dentry);
471 dget(lower_old_dentry);
472 dget(lower_new_dentry);
473 lower_dir_dentry = lock_parent(lower_new_dentry);
474 rc = vfs_link(lower_old_dentry, lower_dir_dentry->d_inode,
475 lower_new_dentry);
476 if (rc || !lower_new_dentry->d_inode)
477 goto out_lock;
478 rc = ecryptfs_interpose(lower_new_dentry, new_dentry, dir->i_sb, 0);
479 if (rc)
480 goto out_lock;
481 ecryptfs_copy_attr_timesizes(dir, lower_new_dentry->d_inode);
482 old_dentry->d_inode->i_nlink =
483 ecryptfs_inode_to_lower(old_dentry->d_inode)->i_nlink;
484 i_size_write(new_dentry->d_inode, file_size_save);
485out_lock:
486 unlock_dir(lower_dir_dentry);
487 dput(lower_new_dentry);
488 dput(lower_old_dentry);
489 if (!new_dentry->d_inode)
490 d_drop(new_dentry);
491 return rc;
492}
493
494static int ecryptfs_unlink(struct inode *dir, struct dentry *dentry)
495{
496 int rc = 0;
497 struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
498 struct inode *lower_dir_inode = ecryptfs_inode_to_lower(dir);
499
500 lock_parent(lower_dentry);
501 rc = vfs_unlink(lower_dir_inode, lower_dentry);
502 if (rc) {
503 ecryptfs_printk(KERN_ERR, "Error in vfs_unlink\n");
504 goto out_unlock;
505 }
506 ecryptfs_copy_attr_times(dir, lower_dir_inode);
507 dentry->d_inode->i_nlink =
508 ecryptfs_inode_to_lower(dentry->d_inode)->i_nlink;
509 dentry->d_inode->i_ctime = dir->i_ctime;
510out_unlock:
511 unlock_parent(lower_dentry);
512 return rc;
513}
514
515static int ecryptfs_symlink(struct inode *dir, struct dentry *dentry,
516 const char *symname)
517{
518 int rc;
519 struct dentry *lower_dentry;
520 struct dentry *lower_dir_dentry;
521 umode_t mode;
522 char *encoded_symname;
523 int encoded_symlen;
524 struct ecryptfs_crypt_stat *crypt_stat = NULL;
525
526 lower_dentry = ecryptfs_dentry_to_lower(dentry);
527 dget(lower_dentry);
528 lower_dir_dentry = lock_parent(lower_dentry);
529 mode = S_IALLUGO;
530 encoded_symlen = ecryptfs_encode_filename(crypt_stat, symname,
531 strlen(symname),
532 &encoded_symname);
533 if (encoded_symlen < 0) {
534 rc = encoded_symlen;
535 goto out_lock;
536 }
537 rc = vfs_symlink(lower_dir_dentry->d_inode, lower_dentry,
538 encoded_symname, mode);
539 kfree(encoded_symname);
540 if (rc || !lower_dentry->d_inode)
541 goto out_lock;
542 rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb, 0);
543 if (rc)
544 goto out_lock;
545 ecryptfs_copy_attr_timesizes(dir, lower_dir_dentry->d_inode);
546out_lock:
547 unlock_dir(lower_dir_dentry);
548 dput(lower_dentry);
549 if (!dentry->d_inode)
550 d_drop(dentry);
551 return rc;
552}
553
554static int ecryptfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
555{
556 int rc;
557 struct dentry *lower_dentry;
558 struct dentry *lower_dir_dentry;
559
560 lower_dentry = ecryptfs_dentry_to_lower(dentry);
561 lower_dir_dentry = lock_parent(lower_dentry);
562 rc = vfs_mkdir(lower_dir_dentry->d_inode, lower_dentry, mode);
563 if (rc || !lower_dentry->d_inode)
564 goto out;
565 rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb, 0);
566 if (rc)
567 goto out;
568 ecryptfs_copy_attr_timesizes(dir, lower_dir_dentry->d_inode);
569 dir->i_nlink = lower_dir_dentry->d_inode->i_nlink;
570out:
571 unlock_dir(lower_dir_dentry);
572 if (!dentry->d_inode)
573 d_drop(dentry);
574 return rc;
575}
576
577static int ecryptfs_rmdir(struct inode *dir, struct dentry *dentry)
578{
579 int rc = 0;
580 struct dentry *tdentry = NULL;
581 struct dentry *lower_dentry;
582 struct dentry *tlower_dentry = NULL;
583 struct dentry *lower_dir_dentry;
584
585 lower_dentry = ecryptfs_dentry_to_lower(dentry);
586 if (!(tdentry = dget(dentry))) {
587 rc = -EINVAL;
588 ecryptfs_printk(KERN_ERR, "Error dget'ing dentry [%p]\n",
589 dentry);
590 goto out;
591 }
592 lower_dir_dentry = lock_parent(lower_dentry);
593 if (!(tlower_dentry = dget(lower_dentry))) {
594 rc = -EINVAL;
595 ecryptfs_printk(KERN_ERR, "Error dget'ing lower_dentry "
596 "[%p]\n", lower_dentry);
597 goto out;
598 }
599 rc = vfs_rmdir(lower_dir_dentry->d_inode, lower_dentry);
600 if (!rc) {
601 d_delete(tlower_dentry);
602 tlower_dentry = NULL;
603 }
604 ecryptfs_copy_attr_times(dir, lower_dir_dentry->d_inode);
605 dir->i_nlink = lower_dir_dentry->d_inode->i_nlink;
606 unlock_dir(lower_dir_dentry);
607 if (!rc)
608 d_drop(dentry);
609out:
610 if (tdentry)
611 dput(tdentry);
612 if (tlower_dentry)
613 dput(tlower_dentry);
614 return rc;
615}
616
617static int
618ecryptfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
619{
620 int rc;
621 struct dentry *lower_dentry;
622 struct dentry *lower_dir_dentry;
623
624 lower_dentry = ecryptfs_dentry_to_lower(dentry);
625 lower_dir_dentry = lock_parent(lower_dentry);
626 rc = vfs_mknod(lower_dir_dentry->d_inode, lower_dentry, mode, dev);
627 if (rc || !lower_dentry->d_inode)
628 goto out;
629 rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb, 0);
630 if (rc)
631 goto out;
632 ecryptfs_copy_attr_timesizes(dir, lower_dir_dentry->d_inode);
633out:
634 unlock_dir(lower_dir_dentry);
635 if (!dentry->d_inode)
636 d_drop(dentry);
637 return rc;
638}
639
640static int
641ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry,
642 struct inode *new_dir, struct dentry *new_dentry)
643{
644 int rc;
645 struct dentry *lower_old_dentry;
646 struct dentry *lower_new_dentry;
647 struct dentry *lower_old_dir_dentry;
648 struct dentry *lower_new_dir_dentry;
649
650 lower_old_dentry = ecryptfs_dentry_to_lower(old_dentry);
651 lower_new_dentry = ecryptfs_dentry_to_lower(new_dentry);
652 dget(lower_old_dentry);
653 dget(lower_new_dentry);
654 lower_old_dir_dentry = dget_parent(lower_old_dentry);
655 lower_new_dir_dentry = dget_parent(lower_new_dentry);
656 lock_rename(lower_old_dir_dentry, lower_new_dir_dentry);
657 rc = vfs_rename(lower_old_dir_dentry->d_inode, lower_old_dentry,
658 lower_new_dir_dentry->d_inode, lower_new_dentry);
659 if (rc)
660 goto out_lock;
661 ecryptfs_copy_attr_all(new_dir, lower_new_dir_dentry->d_inode);
662 if (new_dir != old_dir)
663 ecryptfs_copy_attr_all(old_dir, lower_old_dir_dentry->d_inode);
664out_lock:
665 unlock_rename(lower_old_dir_dentry, lower_new_dir_dentry);
666 dput(lower_new_dentry);
667 dput(lower_old_dentry);
668 return rc;
669}
670
671static int
672ecryptfs_readlink(struct dentry *dentry, char __user * buf, int bufsiz)
673{
674 int rc;
675 struct dentry *lower_dentry;
676 char *decoded_name;
677 char *lower_buf;
678 mm_segment_t old_fs;
679 struct ecryptfs_crypt_stat *crypt_stat;
680
681 lower_dentry = ecryptfs_dentry_to_lower(dentry);
682 if (!lower_dentry->d_inode->i_op ||
683 !lower_dentry->d_inode->i_op->readlink) {
684 rc = -EINVAL;
685 goto out;
686 }
687 /* Released in this function */
688 lower_buf = kmalloc(bufsiz, GFP_KERNEL);
689 if (lower_buf == NULL) {
690 ecryptfs_printk(KERN_ERR, "Out of memory\n");
691 rc = -ENOMEM;
692 goto out;
693 }
694 old_fs = get_fs();
695 set_fs(get_ds());
696 ecryptfs_printk(KERN_DEBUG, "Calling readlink w/ "
697 "lower_dentry->d_name.name = [%s]\n",
698 lower_dentry->d_name.name);
699 rc = lower_dentry->d_inode->i_op->readlink(lower_dentry,
700 (char __user *)lower_buf,
701 bufsiz);
702 set_fs(old_fs);
703 if (rc >= 0) {
704 crypt_stat = NULL;
705 rc = ecryptfs_decode_filename(crypt_stat, lower_buf, rc,
706 &decoded_name);
707 if (rc == -ENOMEM)
708 goto out_free_lower_buf;
709 if (rc > 0) {
710 ecryptfs_printk(KERN_DEBUG, "Copying [%d] bytes "
711 "to userspace: [%*s]\n", rc,
712 decoded_name);
713 if (copy_to_user(buf, decoded_name, rc))
714 rc = -EFAULT;
715 }
716 kfree(decoded_name);
717 ecryptfs_copy_attr_atime(dentry->d_inode,
718 lower_dentry->d_inode);
719 }
720out_free_lower_buf:
721 kfree(lower_buf);
722out:
723 return rc;
724}
725
726static void *ecryptfs_follow_link(struct dentry *dentry, struct nameidata *nd)
727{
728 char *buf;
729 int len = PAGE_SIZE, rc;
730 mm_segment_t old_fs;
731
732 /* Released in ecryptfs_put_link(); only release here on error */
733 buf = kmalloc(len, GFP_KERNEL);
734 if (!buf) {
735 rc = -ENOMEM;
736 goto out;
737 }
738 old_fs = get_fs();
739 set_fs(get_ds());
740 ecryptfs_printk(KERN_DEBUG, "Calling readlink w/ "
741 "dentry->d_name.name = [%s]\n", dentry->d_name.name);
742 rc = dentry->d_inode->i_op->readlink(dentry, (char __user *)buf, len);
743 set_fs(old_fs);
744 if (rc < 0)
745 goto out_free;
746 buf[rc] = '\0';
747 rc = 0;
748 nd_set_link(nd, buf);
749 goto out;
750out_free:
751 kfree(buf);
752out:
753 return ERR_PTR(rc);
754}
755
756static void
757ecryptfs_put_link(struct dentry *dentry, struct nameidata *nd, void *ptr)
758{
759 /* Free the char* */
760 kfree(nd_get_link(nd));
761}
762
763/**
764 * upper_size_to_lower_size
765 * @crypt_stat: Crypt_stat associated with file
766 * @upper_size: Size of the upper file
767 *
768 * Calculate the required size of the lower file based on the
769 * specified size of the upper file. This calculation is based on the
770 * number of headers in the underlying file and the extent size.
771 *
772 * Returns Calculated size of the lower file.
773 */
774static loff_t
775upper_size_to_lower_size(struct ecryptfs_crypt_stat *crypt_stat,
776 loff_t upper_size)
777{
778 loff_t lower_size;
779
780 lower_size = (crypt_stat->header_extent_size
781 * crypt_stat->num_header_extents_at_front);
782 if (upper_size != 0) {
783 loff_t num_extents;
784
785 num_extents = upper_size >> crypt_stat->extent_shift;
786 if (upper_size & ~crypt_stat->extent_mask)
787 num_extents++;
788 lower_size += (num_extents * crypt_stat->extent_size);
789 }
790 return lower_size;
791}
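
As a concrete check of the interpolation above: with an assumed geometry of two 4096-byte header extents at the front and 4096-byte data extents (illustrative values, not necessarily this build's defaults), a 10000-byte upper file needs three data extents, so the lower file is 8192 + 3 * 4096 = 20480 bytes. A standalone sketch of the same arithmetic:

#include <stdio.h>

/* Userspace mirror of upper_size_to_lower_size(); the geometry values
 * below are assumptions for illustration only. */
int main(void)
{
	long long header_bytes = 2 * 4096;	/* header_extent_size * num_header_extents_at_front */
	long long extent_size = 4096;
	long long upper_size = 10000;
	long long num_extents = upper_size / extent_size;
	long long lower_size;

	if (upper_size % extent_size)		/* partial final extent */
		num_extents++;
	lower_size = header_bytes + num_extents * extent_size;
	printf("upper %lld -> lower %lld\n", upper_size, lower_size);	/* 20480 */
	return 0;
}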
792
793/**
794 * ecryptfs_truncate
795 * @dentry: The ecryptfs layer dentry
796 * @new_length: The new length for the file
797 *
798 * Function to handle truncations modifying the size of the file. Note
799 * that the file sizes are interpolated. When expanding, we are simply
800 * writing strings of 0's out. When truncating, we need to modify the
801 * underlying file size according to the page index interpolations.
802 *
803 * Returns zero on success; non-zero otherwise
804 */
805int ecryptfs_truncate(struct dentry *dentry, loff_t new_length)
806{
807 int rc = 0;
808 struct inode *inode = dentry->d_inode;
809 struct dentry *lower_dentry;
810 struct vfsmount *lower_mnt;
811 struct file fake_ecryptfs_file, *lower_file = NULL;
812 struct ecryptfs_crypt_stat *crypt_stat;
813 loff_t i_size = i_size_read(inode);
814 loff_t lower_size_before_truncate;
815 loff_t lower_size_after_truncate;
816
817 if (unlikely(new_length == i_size))
818 goto out;
819 crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat;
820 /* Set up a fake ecryptfs file, this is used to interface with
821 * the file in the underlying filesystem so that the
822 * truncation has an effect there as well. */
823 memset(&fake_ecryptfs_file, 0, sizeof(fake_ecryptfs_file));
824 fake_ecryptfs_file.f_dentry = dentry;
825 /* Released at out_free: label */
826 ecryptfs_set_file_private(&fake_ecryptfs_file,
827 kmem_cache_alloc(ecryptfs_file_info_cache,
828 SLAB_KERNEL));
829 if (unlikely(!ecryptfs_file_to_private(&fake_ecryptfs_file))) {
830 rc = -ENOMEM;
831 goto out;
832 }
833 lower_dentry = ecryptfs_dentry_to_lower(dentry);
834 /* This dget & mntget is released through fput at out_fput: */
835 dget(lower_dentry);
836 lower_mnt = ecryptfs_dentry_to_lower_mnt(dentry);
837 mntget(lower_mnt);
838 lower_file = dentry_open(lower_dentry, lower_mnt, O_RDWR);
839 if (unlikely(IS_ERR(lower_file))) {
840 rc = PTR_ERR(lower_file);
841 goto out_free;
842 }
843 ecryptfs_set_file_lower(&fake_ecryptfs_file, lower_file);
844 /* Switch on growing or shrinking file */
845 if (new_length > i_size) {
846 rc = ecryptfs_fill_zeros(&fake_ecryptfs_file, new_length);
847 if (rc) {
848 ecryptfs_printk(KERN_ERR,
849 "Problem with fill_zeros\n");
850 goto out_fput;
851 }
852 i_size_write(inode, new_length);
853 rc = ecryptfs_write_inode_size_to_header(lower_file,
854 lower_dentry->d_inode,
855 inode);
856 if (rc) {
857 ecryptfs_printk(KERN_ERR,
858 "Problem with ecryptfs_write"
859 "_inode_size\n");
860 goto out_fput;
861 }
862 } else { /* new_length < i_size_read(inode) */
863 vmtruncate(inode, new_length);
864 ecryptfs_write_inode_size_to_header(lower_file,
865 lower_dentry->d_inode,
866 inode);
867 /* We are reducing the size of the ecryptfs file, and need to
868 * know if we need to reduce the size of the lower file. */
869 lower_size_before_truncate =
870 upper_size_to_lower_size(crypt_stat, i_size);
871 lower_size_after_truncate =
872 upper_size_to_lower_size(crypt_stat, new_length);
873 if (lower_size_after_truncate < lower_size_before_truncate)
874 vmtruncate(lower_dentry->d_inode,
875 lower_size_after_truncate);
876 }
877 /* Update the access times */
878 lower_dentry->d_inode->i_mtime = lower_dentry->d_inode->i_ctime
879 = CURRENT_TIME;
880 mark_inode_dirty_sync(inode);
881out_fput:
882 fput(lower_file);
883out_free:
884 if (ecryptfs_file_to_private(&fake_ecryptfs_file))
885 kmem_cache_free(ecryptfs_file_info_cache,
886 ecryptfs_file_to_private(&fake_ecryptfs_file));
887out:
888 return rc;
889}
890
891static int
892ecryptfs_permission(struct inode *inode, int mask, struct nameidata *nd)
893{
894 int rc;
895
896 if (nd) {
897 struct vfsmount *vfsmnt_save = nd->mnt;
898 struct dentry *dentry_save = nd->dentry;
899
900 nd->mnt = ecryptfs_dentry_to_lower_mnt(nd->dentry);
901 nd->dentry = ecryptfs_dentry_to_lower(nd->dentry);
902 rc = permission(ecryptfs_inode_to_lower(inode), mask, nd);
903 nd->mnt = vfsmnt_save;
904 nd->dentry = dentry_save;
905 } else
906 rc = permission(ecryptfs_inode_to_lower(inode), mask, NULL);
907 return rc;
908}
909
910/**
911 * ecryptfs_setattr
912 * @dentry: dentry handle to the inode to modify
913 * @ia: Structure with flags of what to change and values
914 *
915 * Updates the metadata of an inode. If the update is to the size
916 * i.e. truncation, then ecryptfs_truncate will handle the size modification
917 * of both the ecryptfs inode and the lower inode.
918 *
919 * All other metadata changes will be passed right to the lower filesystem,
920 * and we will just update our inode to look like the lower.
921 */
922static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia)
923{
924 int rc = 0;
925 struct dentry *lower_dentry;
926 struct inode *inode;
927 struct inode *lower_inode;
928 struct ecryptfs_crypt_stat *crypt_stat;
929
930 crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat;
931 lower_dentry = ecryptfs_dentry_to_lower(dentry);
932 inode = dentry->d_inode;
933 lower_inode = ecryptfs_inode_to_lower(inode);
934 if (ia->ia_valid & ATTR_SIZE) {
935 ecryptfs_printk(KERN_DEBUG,
936 "ia->ia_valid = [0x%x] ATTR_SIZE" " = [0x%x]\n",
937 ia->ia_valid, ATTR_SIZE);
938 rc = ecryptfs_truncate(dentry, ia->ia_size);
939 /* ecryptfs_truncate handles resizing of the lower file */
940 ia->ia_valid &= ~ATTR_SIZE;
941 ecryptfs_printk(KERN_DEBUG, "ia->ia_valid = [%x]\n",
942 ia->ia_valid);
943 if (rc < 0)
944 goto out;
945 }
946 rc = notify_change(lower_dentry, ia);
947out:
948 ecryptfs_copy_attr_all(inode, lower_inode);
949 return rc;
950}
951
952static int
953ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value,
954 size_t size, int flags)
955{
956 int rc = 0;
957 struct dentry *lower_dentry;
958
959 lower_dentry = ecryptfs_dentry_to_lower(dentry);
960 if (!lower_dentry->d_inode->i_op->setxattr) {
961 rc = -ENOSYS;
962 goto out;
963 }
964 mutex_lock(&lower_dentry->d_inode->i_mutex);
965 rc = lower_dentry->d_inode->i_op->setxattr(lower_dentry, name, value,
966 size, flags);
967 mutex_unlock(&lower_dentry->d_inode->i_mutex);
968out:
969 return rc;
970}
971
972static ssize_t
973ecryptfs_getxattr(struct dentry *dentry, const char *name, void *value,
974 size_t size)
975{
976 int rc = 0;
977 struct dentry *lower_dentry;
978
979 lower_dentry = ecryptfs_dentry_to_lower(dentry);
980 if (!lower_dentry->d_inode->i_op->getxattr) {
981 rc = -ENOSYS;
982 goto out;
983 }
984 mutex_lock(&lower_dentry->d_inode->i_mutex);
985 rc = lower_dentry->d_inode->i_op->getxattr(lower_dentry, name, value,
986 size);
987 mutex_unlock(&lower_dentry->d_inode->i_mutex);
988out:
989 return rc;
990}
991
992static ssize_t
993ecryptfs_listxattr(struct dentry *dentry, char *list, size_t size)
994{
995 int rc = 0;
996 struct dentry *lower_dentry;
997
998 lower_dentry = ecryptfs_dentry_to_lower(dentry);
999 if (!lower_dentry->d_inode->i_op->listxattr) {
1000 rc = -ENOSYS;
1001 goto out;
1002 }
1003 mutex_lock(&lower_dentry->d_inode->i_mutex);
1004 rc = lower_dentry->d_inode->i_op->listxattr(lower_dentry, list, size);
1005 mutex_unlock(&lower_dentry->d_inode->i_mutex);
1006out:
1007 return rc;
1008}
1009
1010static int ecryptfs_removexattr(struct dentry *dentry, const char *name)
1011{
1012 int rc = 0;
1013 struct dentry *lower_dentry;
1014
1015 lower_dentry = ecryptfs_dentry_to_lower(dentry);
1016 if (!lower_dentry->d_inode->i_op->removexattr) {
1017 rc = -ENOSYS;
1018 goto out;
1019 }
1020 mutex_lock(&lower_dentry->d_inode->i_mutex);
1021 rc = lower_dentry->d_inode->i_op->removexattr(lower_dentry, name);
1022 mutex_unlock(&lower_dentry->d_inode->i_mutex);
1023out:
1024 return rc;
1025}
1026
1027int ecryptfs_inode_test(struct inode *inode, void *candidate_lower_inode)
1028{
1029 if ((ecryptfs_inode_to_lower(inode)
1030 == (struct inode *)candidate_lower_inode))
1031 return 1;
1032 else
1033 return 0;
1034}
1035
1036int ecryptfs_inode_set(struct inode *inode, void *lower_inode)
1037{
1038 ecryptfs_init_inode(inode, (struct inode *)lower_inode);
1039 return 0;
1040}
1041
1042struct inode_operations ecryptfs_symlink_iops = {
1043 .readlink = ecryptfs_readlink,
1044 .follow_link = ecryptfs_follow_link,
1045 .put_link = ecryptfs_put_link,
1046 .permission = ecryptfs_permission,
1047 .setattr = ecryptfs_setattr,
1048 .setxattr = ecryptfs_setxattr,
1049 .getxattr = ecryptfs_getxattr,
1050 .listxattr = ecryptfs_listxattr,
1051 .removexattr = ecryptfs_removexattr
1052};
1053
1054struct inode_operations ecryptfs_dir_iops = {
1055 .create = ecryptfs_create,
1056 .lookup = ecryptfs_lookup,
1057 .link = ecryptfs_link,
1058 .unlink = ecryptfs_unlink,
1059 .symlink = ecryptfs_symlink,
1060 .mkdir = ecryptfs_mkdir,
1061 .rmdir = ecryptfs_rmdir,
1062 .mknod = ecryptfs_mknod,
1063 .rename = ecryptfs_rename,
1064 .permission = ecryptfs_permission,
1065 .setattr = ecryptfs_setattr,
1066 .setxattr = ecryptfs_setxattr,
1067 .getxattr = ecryptfs_getxattr,
1068 .listxattr = ecryptfs_listxattr,
1069 .removexattr = ecryptfs_removexattr
1070};
1071
1072struct inode_operations ecryptfs_main_iops = {
1073 .permission = ecryptfs_permission,
1074 .setattr = ecryptfs_setattr,
1075 .setxattr = ecryptfs_setxattr,
1076 .getxattr = ecryptfs_getxattr,
1077 .listxattr = ecryptfs_listxattr,
1078 .removexattr = ecryptfs_removexattr
1079};
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
new file mode 100644
index 000000000000..ba454785a0c5
--- /dev/null
+++ b/fs/ecryptfs/keystore.c
@@ -0,0 +1,1061 @@
1/**
2 * eCryptfs: Linux filesystem encryption layer
3 * In-kernel key management code. Includes functions to parse and
4 * write authentication token-related packets with the underlying
5 * file.
6 *
7 * Copyright (C) 2004-2006 International Business Machines Corp.
8 * Author(s): Michael A. Halcrow <mhalcrow@us.ibm.com>
9 * Michael C. Thompson <mcthomps@us.ibm.com>
10 *
11 * This program is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU General Public License as
13 * published by the Free Software Foundation; either version 2 of the
14 * License, or (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful, but
17 * WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
24 * 02111-1307, USA.
25 */
26
27#include <linux/string.h>
28#include <linux/sched.h>
29#include <linux/syscalls.h>
30#include <linux/pagemap.h>
31#include <linux/key.h>
32#include <linux/random.h>
33#include <linux/crypto.h>
34#include <linux/scatterlist.h>
35#include "ecryptfs_kernel.h"
36
37/**
38 * request_key returned an error instead of a valid key address;
39 * determine the type of error, make appropriate log entries, and
40 * return an error code.
41 */
42int process_request_key_err(long err_code)
43{
44 int rc = 0;
45
46 switch (err_code) {
47 case ENOKEY:
48 ecryptfs_printk(KERN_WARNING, "No key\n");
49 rc = -ENOENT;
50 break;
51 case EKEYEXPIRED:
52 ecryptfs_printk(KERN_WARNING, "Key expired\n");
53 rc = -ETIME;
54 break;
55 case EKEYREVOKED:
56 ecryptfs_printk(KERN_WARNING, "Key revoked\n");
57 rc = -EINVAL;
58 break;
59 default:
60 ecryptfs_printk(KERN_WARNING, "Unknown error code: "
61 "[0x%.16x]\n", err_code);
62 rc = -EINVAL;
63 }
64 return rc;
65}
66
67static void wipe_auth_tok_list(struct list_head *auth_tok_list_head)
68{
69 struct list_head *walker;
70 struct ecryptfs_auth_tok_list_item *auth_tok_list_item;
71
72 walker = auth_tok_list_head->next;
73 while (walker != auth_tok_list_head) {
74 auth_tok_list_item =
75 list_entry(walker, struct ecryptfs_auth_tok_list_item,
76 list);
77 walker = auth_tok_list_item->list.next;
78 memset(auth_tok_list_item, 0,
79 sizeof(struct ecryptfs_auth_tok_list_item));
80 kmem_cache_free(ecryptfs_auth_tok_list_item_cache,
81 auth_tok_list_item);
82 }
83}
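
The advance-before-free in the walk above is exactly what the list_for_each_safe() helper encapsulates; an equivalent form of the same loop, sketched with the helper (same types and slab cache as above), would be:

static void wipe_auth_tok_list_alt(struct list_head *auth_tok_list_head)
{
	struct ecryptfs_auth_tok_list_item *item;
	struct list_head *walker, *next;

	/* The _safe variant caches the next pointer before the entry
	 * is wiped and freed. */
	list_for_each_safe(walker, next, auth_tok_list_head) {
		item = list_entry(walker,
				  struct ecryptfs_auth_tok_list_item, list);
		memset(item, 0, sizeof(*item));
		kmem_cache_free(ecryptfs_auth_tok_list_item_cache, item);
	}
}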
84
85struct kmem_cache *ecryptfs_auth_tok_list_item_cache;
86
87/**
88 * parse_packet_length
89 * @data: Pointer to memory containing length at offset
90 * @size: This function writes the decoded size to this memory
91 * address; zero on error
92 * @length_size: The number of bytes occupied by the encoded length
93 *
94 * Returns Zero on success
95 */
96static int parse_packet_length(unsigned char *data, size_t *size,
97 size_t *length_size)
98{
99 int rc = 0;
100
101 (*length_size) = 0;
102 (*size) = 0;
103 if (data[0] < 192) {
104 /* One-byte length */
105 (*size) = data[0];
106 (*length_size) = 1;
107 } else if (data[0] < 224) {
108 /* Two-byte length */
109 (*size) = ((data[0] - 192) * 256);
110 (*size) += (data[1] + 192);
111 (*length_size) = 2;
112 } else if (data[0] == 255) {
113 /* Five-byte length; we're not supposed to see this */
114 ecryptfs_printk(KERN_ERR, "Five-byte packet length not "
115 "supported\n");
116 rc = -EINVAL;
117 goto out;
118 } else {
119 ecryptfs_printk(KERN_ERR, "Error parsing packet length\n");
120 rc = -EINVAL;
121 goto out;
122 }
123out:
124 return rc;
125}
126
127/**
128 * write_packet_length
129 * @dest: The byte array target into which to write the
130 * length. Must have at least 5 bytes allocated.
131 * @size: The length to write.
132 * @packet_size_length: The number of bytes used to encode the
133 * packet length is written to this address.
134 *
135 * Returns zero on success; non-zero on error.
136 */
137static int write_packet_length(char *dest, size_t size,
138 size_t *packet_size_length)
139{
140 int rc = 0;
141
142 if (size < 192) {
143 dest[0] = size;
144 (*packet_size_length) = 1;
145 } else if (size < 65536) {
146 dest[0] = (((size - 192) / 256) + 192);
147 dest[1] = ((size - 192) % 256);
148 (*packet_size_length) = 2;
149 } else {
150 rc = -EINVAL;
151 ecryptfs_printk(KERN_WARNING,
152 "Unsupported packet size: [%d]\n", size);
153 }
154 return rc;
155}
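
Both helpers above implement the RFC 2440 new-format length encoding: lengths below 192 take one byte, and the two-byte form covers 192 through 8383 (so the size < 65536 guard above is more permissive than what two bytes can actually represent). A minimal userspace round trip:

#include <stdio.h>

/* Encode with the one-/two-byte forms used above, then decode. */
static int encode_length(unsigned char *dest, unsigned int size)
{
	if (size < 192) {
		dest[0] = size;
		return 1;
	}
	dest[0] = ((size - 192) / 256) + 192;	/* first octet: 192..223 */
	dest[1] = (size - 192) % 256;
	return 2;
}

int main(void)
{
	unsigned char buf[2];
	unsigned int decoded;

	encode_length(buf, 1000);
	decoded = ((buf[0] - 192) * 256) + buf[1] + 192;
	printf("0x%02x 0x%02x decodes to %u\n", buf[0], buf[1], decoded);
	return 0;	/* prints: 0xc3 0x28 decodes to 1000 */
}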
156
157/**
158 * parse_tag_3_packet
159 * @crypt_stat: The cryptographic context to modify based on packet
160 * contents.
161 * @data: The raw bytes of the packet.
162 * @auth_tok_list: eCryptfs parses packets into authentication tokens;
163 * a new authentication token will be placed at the end
164 * of this list for this packet.
165 * @new_auth_tok: Pointer to a pointer to memory that this function
166 * allocates; sets the memory address of the pointer to
167 * NULL on error. This object is added to the
168 * auth_tok_list.
169 * @packet_size: This function writes the size of the parsed packet
170 * into this memory location; zero on error.
171 * @max_packet_size: maximum number of bytes to parse
172 *
173 * Returns zero on success; non-zero on error.
174 */
175static int
176parse_tag_3_packet(struct ecryptfs_crypt_stat *crypt_stat,
177 unsigned char *data, struct list_head *auth_tok_list,
178 struct ecryptfs_auth_tok **new_auth_tok,
179 size_t *packet_size, size_t max_packet_size)
180{
181 int rc = 0;
182 size_t body_size;
183 struct ecryptfs_auth_tok_list_item *auth_tok_list_item;
184 size_t length_size;
185
186 (*packet_size) = 0;
187 (*new_auth_tok) = NULL;
188
189 /* we check that:
190 * one byte for the Tag 3 ID flag
191 * two bytes for the body size
192 * do not exceed the maximum_packet_size
193 */
194 if (unlikely((*packet_size) + 3 > max_packet_size)) {
195 ecryptfs_printk(KERN_ERR, "Packet size exceeds max\n");
196 rc = -EINVAL;
197 goto out;
198 }
199
200 /* check for Tag 3 identifier - one byte */
201 if (data[(*packet_size)++] != ECRYPTFS_TAG_3_PACKET_TYPE) {
202 ecryptfs_printk(KERN_ERR, "Enter w/ first byte != 0x%.2x\n",
203 ECRYPTFS_TAG_3_PACKET_TYPE);
204 rc = -EINVAL;
205 goto out;
206 }
207 /* Released: wipe_auth_tok_list called in ecryptfs_parse_packet_set or
208 * at end of function upon failure */
209 auth_tok_list_item =
210 kmem_cache_alloc(ecryptfs_auth_tok_list_item_cache, SLAB_KERNEL);
211 if (!auth_tok_list_item) {
212 ecryptfs_printk(KERN_ERR, "Unable to allocate memory\n");
213 rc = -ENOMEM;
214 goto out;
215 }
216 memset(auth_tok_list_item, 0,
217 sizeof(struct ecryptfs_auth_tok_list_item));
218 (*new_auth_tok) = &auth_tok_list_item->auth_tok;
219
220 /* check for body size - one to two bytes */
221 rc = parse_packet_length(&data[(*packet_size)], &body_size,
222 &length_size);
223 if (rc) {
224 ecryptfs_printk(KERN_WARNING, "Error parsing packet length; "
225 "rc = [%d]\n", rc);
226 goto out_free;
227 }
228 if (unlikely(body_size < (0x05 + ECRYPTFS_SALT_SIZE))) {
229 ecryptfs_printk(KERN_WARNING, "Invalid body size ([%d])\n",
230 body_size);
231 rc = -EINVAL;
232 goto out_free;
233 }
234 (*packet_size) += length_size;
235
236 /* now we know the length of the remaining Tag 3 packet size:
237 * 5 fixed bytes for: version string, cipher, S2K ID, hash algo,
238 * number of hash iterations
239 * ECRYPTFS_SALT_SIZE bytes for salt
240 * body_size bytes minus the stuff above is the encrypted key size
241 */
242 if (unlikely((*packet_size) + body_size > max_packet_size)) {
243 ecryptfs_printk(KERN_ERR, "Packet size exceeds max\n");
244 rc = -EINVAL;
245 goto out_free;
246 }
247
248 /* There are 5 characters of additional information in the
249 * packet */
250 (*new_auth_tok)->session_key.encrypted_key_size =
251 body_size - (0x05 + ECRYPTFS_SALT_SIZE);
252 ecryptfs_printk(KERN_DEBUG, "Encrypted key size = [%d]\n",
253 (*new_auth_tok)->session_key.encrypted_key_size);
254
255 /* Version 4 (from RFC2440) - one byte */
256 if (unlikely(data[(*packet_size)++] != 0x04)) {
257 ecryptfs_printk(KERN_DEBUG, "Unknown version number "
258 "[%d]\n", data[(*packet_size) - 1]);
259 rc = -EINVAL;
260 goto out_free;
261 }
262
263 /* cipher - one byte */
264 ecryptfs_cipher_code_to_string(crypt_stat->cipher,
265 (u16)data[(*packet_size)]);
266 /* A little extra work to differentiate among the AES key
267 * sizes; see RFC2440 */
268 switch(data[(*packet_size)++]) {
269 case RFC2440_CIPHER_AES_192:
270 crypt_stat->key_size = 24;
271 break;
272 default:
273 crypt_stat->key_size =
274 (*new_auth_tok)->session_key.encrypted_key_size;
275 }
276 ecryptfs_init_crypt_ctx(crypt_stat);
277 /* S2K identifier 3 (from RFC2440) */
278 if (unlikely(data[(*packet_size)++] != 0x03)) {
279 ecryptfs_printk(KERN_ERR, "Only S2K ID 3 is currently "
280 "supported\n");
281 rc = -ENOSYS;
282 goto out_free;
283 }
284
285 /* TODO: finish the hash mapping */
286 /* hash algorithm - one byte */
287 switch (data[(*packet_size)++]) {
288 case 0x01: /* See RFC2440 for these numbers and their mappings */
289 /* Choose MD5 */
290 /* salt - ECRYPTFS_SALT_SIZE bytes */
291 memcpy((*new_auth_tok)->token.password.salt,
292 &data[(*packet_size)], ECRYPTFS_SALT_SIZE);
293 (*packet_size) += ECRYPTFS_SALT_SIZE;
294
295 /* This conversion was taken straight from RFC2440 */
296 /* number of hash iterations - one byte */
297 (*new_auth_tok)->token.password.hash_iterations =
298 ((u32) 16 + (data[(*packet_size)] & 15))
299 << ((data[(*packet_size)] >> 4) + 6);
300 (*packet_size)++;
301
302 /* encrypted session key -
303 * (body_size-5-ECRYPTFS_SALT_SIZE) bytes */
304 memcpy((*new_auth_tok)->session_key.encrypted_key,
305 &data[(*packet_size)],
306 (*new_auth_tok)->session_key.encrypted_key_size);
307 (*packet_size) +=
308 (*new_auth_tok)->session_key.encrypted_key_size;
309 (*new_auth_tok)->session_key.flags &=
310 ~ECRYPTFS_CONTAINS_DECRYPTED_KEY;
311 (*new_auth_tok)->session_key.flags |=
312 ECRYPTFS_CONTAINS_ENCRYPTED_KEY;
313 (*new_auth_tok)->token.password.hash_algo = 0x01;
314 break;
315 default:
316 ecryptfs_printk(KERN_ERR, "Unsupported hash algorithm: "
317 "[%d]\n", data[(*packet_size) - 1]);
318 rc = -ENOSYS;
319 goto out_free;
320 }
321 (*new_auth_tok)->token_type = ECRYPTFS_PASSWORD;
322 /* TODO: Parameterize; we might actually want userspace to
323 * decrypt the session key. */
324 ECRYPTFS_CLEAR_FLAG((*new_auth_tok)->session_key.flags,
325 ECRYPTFS_USERSPACE_SHOULD_TRY_TO_DECRYPT);
326 ECRYPTFS_CLEAR_FLAG((*new_auth_tok)->session_key.flags,
327 ECRYPTFS_USERSPACE_SHOULD_TRY_TO_ENCRYPT);
328 list_add(&auth_tok_list_item->list, auth_tok_list);
329 goto out;
330out_free:
331 (*new_auth_tok) = NULL;
332 memset(auth_tok_list_item, 0,
333 sizeof(struct ecryptfs_auth_tok_list_item));
334 kmem_cache_free(ecryptfs_auth_tok_list_item_cache,
335 auth_tok_list_item);
336out:
337 if (rc)
338 (*packet_size) = 0;
339 return rc;
340}
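
The coded-count conversion in the hash-iterations branch above is RFC 2440's iterated-and-salted S2K formula. As a sanity check, the byte 0x60 that write_tag_3_packet() later emits decodes to 65536 iterations:

#include <stdio.h>

/* RFC 2440 S2K coded count: (16 + (c & 15)) << ((c >> 4) + 6) */
int main(void)
{
	unsigned char c = 0x60;
	unsigned long iterations = (16UL + (c & 15)) << ((c >> 4) + 6);

	printf("coded byte 0x%02x -> %lu iterations\n", c, iterations);
	return 0;	/* prints 65536 */
}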
341
342/**
343 * parse_tag_11_packet
344 * @data: The raw bytes of the packet
345 * @contents: This function writes the data contents of the literal
346 * packet into this memory location
347 * @max_contents_bytes: The maximum number of bytes that this function
348 * is allowed to write into contents
349 * @tag_11_contents_size: This function writes the size of the parsed
350 * contents into this memory location; zero on
351 * error
352 * @packet_size: This function writes the size of the parsed packet
353 * into this memory location; zero on error
354 * @max_packet_size: maximum number of bytes to parse
355 *
356 * Returns zero on success; non-zero on error.
357 */
358static int
359parse_tag_11_packet(unsigned char *data, unsigned char *contents,
360 size_t max_contents_bytes, size_t *tag_11_contents_size,
361 size_t *packet_size, size_t max_packet_size)
362{
363 int rc = 0;
364 size_t body_size;
365 size_t length_size;
366
367 (*packet_size) = 0;
368 (*tag_11_contents_size) = 0;
369
370 /* check that:
371 * one byte for the Tag 11 ID flag
372 * two bytes for the Tag 11 length
373 * do not exceed the maximum_packet_size
374 */
375 if (unlikely((*packet_size) + 3 > max_packet_size)) {
376 ecryptfs_printk(KERN_ERR, "Packet size exceeds max\n");
377 rc = -EINVAL;
378 goto out;
379 }
380
381 /* check for Tag 11 identifier - one byte */
382 if (data[(*packet_size)++] != ECRYPTFS_TAG_11_PACKET_TYPE) {
383 ecryptfs_printk(KERN_WARNING,
384 "Invalid tag 11 packet format\n");
385 rc = -EINVAL;
386 goto out;
387 }
388
389 /* get Tag 11 content length - one or two bytes */
390 rc = parse_packet_length(&data[(*packet_size)], &body_size,
391 &length_size);
392 if (rc) {
393 ecryptfs_printk(KERN_WARNING,
394 "Invalid tag 11 packet format\n");
395 goto out;
396 }
397 (*packet_size) += length_size;
398
399 if (body_size < 13) {
400 ecryptfs_printk(KERN_WARNING, "Invalid body size ([%d])\n",
401 body_size);
402 rc = -EINVAL;
403 goto out;
404 }
405 /* We have 13 bytes of surrounding packet values */
406 (*tag_11_contents_size) = (body_size - 13);
407
408 /* now we know the length of the remaining Tag 11 packet size:
409 * 14 fixed bytes for: special flag one, special flag two,
410 * 12 skipped bytes
411 * body_size bytes minus the stuff above is the Tag 11 content
412 */
413 /* FIXME why is the body size one byte smaller than the actual
414 * size of the body?
415 * this seems to be an error here as well as in
416 * write_tag_11_packet() */
417 if (unlikely((*packet_size) + body_size + 1 > max_packet_size)) {
418 ecryptfs_printk(KERN_ERR, "Packet size exceeds max\n");
419 rc = -EINVAL;
420 goto out;
421 }
422
423 /* special flag one - one byte */
424 if (data[(*packet_size)++] != 0x62) {
425 ecryptfs_printk(KERN_WARNING, "Unrecognizable packet\n");
426 rc = -EINVAL;
427 goto out;
428 }
429
430 /* special flag two - one byte */
431 if (data[(*packet_size)++] != 0x08) {
432 ecryptfs_printk(KERN_WARNING, "Unrecognizable packet\n");
433 rc = -EINVAL;
434 goto out;
435 }
436
437 /* skip the next 12 bytes */
438 (*packet_size) += 12; /* We don't care about the filename or
439 * the timestamp */
440
441 /* get the Tag 11 contents - tag_11_contents_size bytes */
442 memcpy(contents, &data[(*packet_size)], (*tag_11_contents_size));
443 (*packet_size) += (*tag_11_contents_size);
444
445out:
446 if (rc) {
447 (*packet_size) = 0;
448 (*tag_11_contents_size) = 0;
449 }
450 return rc;
451}
452
453/**
454 * decrypt_session_key - Decrypt the session key with the given auth_tok.
455 *
456 * Returns Zero on success; non-zero error otherwise.
457 */
458static int decrypt_session_key(struct ecryptfs_auth_tok *auth_tok,
459 struct ecryptfs_crypt_stat *crypt_stat)
460{
461 int rc = 0;
462 struct ecryptfs_password *password_s_ptr;
463 struct crypto_tfm *tfm = NULL;
464 struct scatterlist src_sg[2], dst_sg[2];
465 struct mutex *tfm_mutex = NULL;
466 /* TODO: Use virt_to_scatterlist for these */
467 char *encrypted_session_key;
468 char *session_key;
469
470 password_s_ptr = &auth_tok->token.password;
471 if (ECRYPTFS_CHECK_FLAG(password_s_ptr->flags,
472 ECRYPTFS_SESSION_KEY_ENCRYPTION_KEY_SET))
473 ecryptfs_printk(KERN_DEBUG, "Session key encryption key "
474 "set; skipping key generation\n");
475 ecryptfs_printk(KERN_DEBUG, "Session key encryption key (size [%d])"
476 ":\n",
477 password_s_ptr->session_key_encryption_key_bytes);
478 if (ecryptfs_verbosity > 0)
479 ecryptfs_dump_hex(password_s_ptr->session_key_encryption_key,
480 password_s_ptr->
481 session_key_encryption_key_bytes);
482 if (!strcmp(crypt_stat->cipher,
483 crypt_stat->mount_crypt_stat->global_default_cipher_name)
484 && crypt_stat->mount_crypt_stat->global_key_tfm) {
485 tfm = crypt_stat->mount_crypt_stat->global_key_tfm;
486 tfm_mutex = &crypt_stat->mount_crypt_stat->global_key_tfm_mutex;
487 } else {
488 tfm = crypto_alloc_tfm(crypt_stat->cipher,
489 CRYPTO_TFM_REQ_WEAK_KEY);
490 if (!tfm) {
491 printk(KERN_ERR "Error allocating crypto context\n");
492 rc = -ENOMEM;
493 goto out;
494 }
495 }
496 if (password_s_ptr->session_key_encryption_key_bytes
497 < crypto_tfm_alg_min_keysize(tfm)) {
498 printk(KERN_WARNING "Session key encryption key is [%d] bytes; "
499 "minimum keysize for selected cipher is [%d] bytes.\n",
500 password_s_ptr->session_key_encryption_key_bytes,
501 crypto_tfm_alg_min_keysize(tfm));
502 rc = -EINVAL;
503 goto out;
504 }
505 if (tfm_mutex)
506 mutex_lock(tfm_mutex);
507 crypto_cipher_setkey(tfm, password_s_ptr->session_key_encryption_key,
508 crypt_stat->key_size);
509 /* TODO: virt_to_scatterlist */
510 encrypted_session_key = (char *)__get_free_page(GFP_KERNEL);
511 if (!encrypted_session_key) {
512 ecryptfs_printk(KERN_ERR, "Out of memory\n");
513 rc = -ENOMEM;
514 goto out_free_tfm;
515 }
516 session_key = (char *)__get_free_page(GFP_KERNEL);
517 if (!session_key) {
518 kfree(encrypted_session_key);
519 ecryptfs_printk(KERN_ERR, "Out of memory\n");
520 rc = -ENOMEM;
521 goto out_free_tfm;
522 }
523 memcpy(encrypted_session_key, auth_tok->session_key.encrypted_key,
524 auth_tok->session_key.encrypted_key_size);
525 src_sg[0].page = virt_to_page(encrypted_session_key);
526 src_sg[0].offset = 0;
527 BUG_ON(auth_tok->session_key.encrypted_key_size > PAGE_CACHE_SIZE);
528 src_sg[0].length = auth_tok->session_key.encrypted_key_size;
529 dst_sg[0].page = virt_to_page(session_key);
530 dst_sg[0].offset = 0;
531 auth_tok->session_key.decrypted_key_size =
532 auth_tok->session_key.encrypted_key_size;
533 dst_sg[0].length = auth_tok->session_key.encrypted_key_size;
534 /* TODO: Handle error condition */
535 crypto_cipher_decrypt(tfm, dst_sg, src_sg,
536 auth_tok->session_key.encrypted_key_size);
537 auth_tok->session_key.decrypted_key_size =
538 auth_tok->session_key.encrypted_key_size;
539 memcpy(auth_tok->session_key.decrypted_key, session_key,
540 auth_tok->session_key.decrypted_key_size);
541 auth_tok->session_key.flags |= ECRYPTFS_CONTAINS_DECRYPTED_KEY;
542 memcpy(crypt_stat->key, auth_tok->session_key.decrypted_key,
543 auth_tok->session_key.decrypted_key_size);
544 ECRYPTFS_SET_FLAG(crypt_stat->flags, ECRYPTFS_KEY_VALID);
545 ecryptfs_printk(KERN_DEBUG, "Decrypted session key:\n");
546 if (ecryptfs_verbosity > 0)
547 ecryptfs_dump_hex(crypt_stat->key,
548 crypt_stat->key_size);
549 memset(encrypted_session_key, 0, PAGE_CACHE_SIZE);
550 free_page((unsigned long)encrypted_session_key);
551 memset(session_key, 0, PAGE_CACHE_SIZE);
552 free_page((unsigned long)session_key);
553out_free_tfm:
554 if (tfm_mutex)
555 mutex_unlock(tfm_mutex);
556 else
557 crypto_free_tfm(tfm);
558out:
559 return rc;
560}
561
562/**
563 * ecryptfs_parse_packet_set
564 * @dest: The header page in memory
565 * @version: Version of file format, to guide parsing behavior
566 *
567 * Get crypt_stat to have the file's session key if the requisite key
568 * is available to decrypt the session key.
569 *
570 * Returns Zero if a valid authentication token was retrieved and
571 * processed; negative value for file not encrypted or for error
572 * conditions.
573 */
574int ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat,
575 unsigned char *src,
576 struct dentry *ecryptfs_dentry)
577{
578 size_t i = 0;
579 int rc = 0;
580 size_t found_auth_tok = 0;
581 size_t next_packet_is_auth_tok_packet;
582 char sig[ECRYPTFS_SIG_SIZE_HEX];
583 struct list_head auth_tok_list;
584 struct list_head *walker;
585 struct ecryptfs_auth_tok *chosen_auth_tok = NULL;
586 struct ecryptfs_mount_crypt_stat *mount_crypt_stat =
587 &ecryptfs_superblock_to_private(
588 ecryptfs_dentry->d_sb)->mount_crypt_stat;
589 struct ecryptfs_auth_tok *candidate_auth_tok = NULL;
590 size_t packet_size;
591 struct ecryptfs_auth_tok *new_auth_tok;
592 unsigned char sig_tmp_space[ECRYPTFS_SIG_SIZE];
593 size_t tag_11_contents_size;
594 size_t tag_11_packet_size;
595
596 INIT_LIST_HEAD(&auth_tok_list);
597 /* Parse the header to find as many packets as we can; these will
598 * be added to our &auth_tok_list */
599 next_packet_is_auth_tok_packet = 1;
600 while (next_packet_is_auth_tok_packet) {
601 size_t max_packet_size = ((PAGE_CACHE_SIZE - 8) - i);
602
603 switch (src[i]) {
604 case ECRYPTFS_TAG_3_PACKET_TYPE:
605 rc = parse_tag_3_packet(crypt_stat,
606 (unsigned char *)&src[i],
607 &auth_tok_list, &new_auth_tok,
608 &packet_size, max_packet_size);
609 if (rc) {
610 ecryptfs_printk(KERN_ERR, "Error parsing "
611 "tag 3 packet\n");
612 rc = -EIO;
613 goto out_wipe_list;
614 }
615 i += packet_size;
616 rc = parse_tag_11_packet((unsigned char *)&src[i],
617 sig_tmp_space,
618 ECRYPTFS_SIG_SIZE,
619 &tag_11_contents_size,
620 &tag_11_packet_size,
621 max_packet_size);
622 if (rc) {
623 ecryptfs_printk(KERN_ERR, "No valid "
624 "(ecryptfs-specific) literal "
625 "packet containing "
626 "authentication token "
627 "signature found after "
628 "tag 3 packet\n");
629 rc = -EIO;
630 goto out_wipe_list;
631 }
632 i += tag_11_packet_size;
633 if (ECRYPTFS_SIG_SIZE != tag_11_contents_size) {
634 ecryptfs_printk(KERN_ERR, "Expected "
635 "signature of size [%d]; "
636 "read size [%d]\n",
637 ECRYPTFS_SIG_SIZE,
638 tag_11_contents_size);
639 rc = -EIO;
640 goto out_wipe_list;
641 }
642 ecryptfs_to_hex(new_auth_tok->token.password.signature,
643 sig_tmp_space, tag_11_contents_size);
644 new_auth_tok->token.password.signature[
645 ECRYPTFS_PASSWORD_SIG_SIZE] = '\0';
646 ECRYPTFS_SET_FLAG(crypt_stat->flags,
647 ECRYPTFS_ENCRYPTED);
648 break;
649 case ECRYPTFS_TAG_11_PACKET_TYPE:
650 ecryptfs_printk(KERN_WARNING, "Invalid packet set "
651 "(Tag 11 not allowed by itself)\n");
652 rc = -EIO;
653 goto out_wipe_list;
654 break;
655 default:
656 ecryptfs_printk(KERN_DEBUG, "No packet at offset "
657 "[%d] of the file header; hex value of "
658 "character is [0x%.2x]\n", i, src[i]);
659 next_packet_is_auth_tok_packet = 0;
660 }
661 }
662 if (list_empty(&auth_tok_list)) {
663 rc = -EINVAL; /* Do not support non-encrypted files in
664 * the 0.1 release */
665 goto out;
666 }
667 /* If we have a global auth tok, then we should try to use
668 * it */
669 if (mount_crypt_stat->global_auth_tok) {
670 memcpy(sig, mount_crypt_stat->global_auth_tok_sig,
671 ECRYPTFS_SIG_SIZE_HEX);
672 chosen_auth_tok = mount_crypt_stat->global_auth_tok;
673 } else
674 BUG(); /* We should always have a global auth tok in
675 * the 0.1 release */
676 /* Scan list to see if our chosen_auth_tok works */
677 list_for_each(walker, &auth_tok_list) {
678 struct ecryptfs_auth_tok_list_item *auth_tok_list_item;
679 auth_tok_list_item =
680 list_entry(walker, struct ecryptfs_auth_tok_list_item,
681 list);
682 candidate_auth_tok = &auth_tok_list_item->auth_tok;
683 if (unlikely(ecryptfs_verbosity > 0)) {
684 ecryptfs_printk(KERN_DEBUG,
685 "Considering cadidate auth tok:\n");
686 ecryptfs_dump_auth_tok(candidate_auth_tok);
687 }
688 /* TODO: Replace ECRYPTFS_SIG_SIZE_HEX w/ dynamic value */
689 if (candidate_auth_tok->token_type == ECRYPTFS_PASSWORD
690 && !strncmp(candidate_auth_tok->token.password.signature,
691 sig, ECRYPTFS_SIG_SIZE_HEX)) {
692 found_auth_tok = 1;
693 goto leave_list;
694 /* TODO: Transfer the common salt into the
695 * crypt_stat salt */
696 }
697 }
698leave_list:
699 if (!found_auth_tok) {
700 ecryptfs_printk(KERN_ERR, "Could not find authentication "
701 "token on temporary list for sig [%.*s]\n",
702 ECRYPTFS_SIG_SIZE_HEX, sig);
703 rc = -EIO;
704 goto out_wipe_list;
705 } else {
706 memcpy(&(candidate_auth_tok->token.password),
707 &(chosen_auth_tok->token.password),
708 sizeof(struct ecryptfs_password));
709 rc = decrypt_session_key(candidate_auth_tok, crypt_stat);
710 if (rc) {
711 ecryptfs_printk(KERN_ERR, "Error decrypting the "
712 "session key\n");
713 goto out_wipe_list;
714 }
715 rc = ecryptfs_compute_root_iv(crypt_stat);
716 if (rc) {
717 ecryptfs_printk(KERN_ERR, "Error computing "
718 "the root IV\n");
719 goto out_wipe_list;
720 }
721 }
722 rc = ecryptfs_init_crypt_ctx(crypt_stat);
723 if (rc) {
724 ecryptfs_printk(KERN_ERR, "Error initializing crypto "
725 "context for cipher [%s]; rc = [%d]\n",
726 crypt_stat->cipher, rc);
727 }
728out_wipe_list:
729 wipe_auth_tok_list(&auth_tok_list);
730out:
731 return rc;
732}
733
734/**
735 * write_tag_11_packet
736 * @dest: Target into which Tag 11 packet is to be written
737 * @max: Maximum packet length
738 * @contents: Byte array of contents to copy in
739 * @contents_length: Number of bytes in contents
740 * @packet_length: Length of the Tag 11 packet written; zero on error
741 *
742 * Returns zero on success; non-zero on error.
743 */
744static int
745write_tag_11_packet(char *dest, int max, char *contents, size_t contents_length,
746 size_t *packet_length)
747{
748 int rc = 0;
749 size_t packet_size_length;
750
751 (*packet_length) = 0;
752 if ((13 + contents_length) > max) {
753 rc = -EINVAL;
754 ecryptfs_printk(KERN_ERR, "Packet length larger than "
755 "maximum allowable\n");
756 goto out;
757 }
758 /* General packet header */
759 /* Packet tag */
760 dest[(*packet_length)++] = ECRYPTFS_TAG_11_PACKET_TYPE;
761 /* Packet length */
762 rc = write_packet_length(&dest[(*packet_length)],
763 (13 + contents_length), &packet_size_length);
764 if (rc) {
765 ecryptfs_printk(KERN_ERR, "Error generating tag 11 packet "
766 "header; cannot generate packet length\n");
767 goto out;
768 }
769 (*packet_length) += packet_size_length;
770 /* Tag 11 specific */
771 /* One-octet field that describes how the data is formatted */
772 dest[(*packet_length)++] = 0x62; /* binary data */
773 /* One-octet filename length followed by filename */
774 dest[(*packet_length)++] = 8;
775 memcpy(&dest[(*packet_length)], "_CONSOLE", 8);
776 (*packet_length) += 8;
777 /* Four-octet number indicating modification date */
778 memset(&dest[(*packet_length)], 0x00, 4);
779 (*packet_length) += 4;
780 /* Remainder is literal data */
781 memcpy(&dest[(*packet_length)], contents, contents_length);
782 (*packet_length) += contents_length;
783 out:
784 if (rc)
785 (*packet_length) = 0;
786 return rc;
787}
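
A userspace mock-up of the packet this function produces, for an 8-byte payload (the tag value is an assumption about ECRYPTFS_TAG_11_PACKET_TYPE, and the payload bytes are invented). Note that it writes 14 framing bytes after the length while declaring only 13 + contents_length, matching the off-by-one flagged in the FIXME in parse_tag_11_packet():

#include <stdio.h>
#include <string.h>

#define TAG_11_TYPE 0xED	/* assumed ECRYPTFS_TAG_11_PACKET_TYPE value */

int main(void)
{
	unsigned char pkt[64];
	const unsigned char sig[8] = { 0xde, 0xad, 0xbe, 0xef, 0, 1, 2, 3 };
	size_t n = 0;

	pkt[n++] = TAG_11_TYPE;
	pkt[n++] = 13 + sizeof(sig);	/* declared body length, one-byte form */
	pkt[n++] = 0x62;		/* 'b': binary literal data */
	pkt[n++] = 8;			/* filename length */
	memcpy(&pkt[n], "_CONSOLE", 8);
	n += 8;
	memset(&pkt[n], 0x00, 4);	/* four-octet modification date */
	n += 4;
	memcpy(&pkt[n], sig, sizeof(sig));
	n += sizeof(sig);
	printf("wrote %zu bytes, declared body %u\n", n, pkt[1]);
	return 0;	/* wrote 24 bytes, declared body 21 */
}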
788
789/**
790 * write_tag_3_packet
791 * @dest: Buffer into which to write the packet
792 * @max: Maximum number of bytes that can be written
793 * @auth_tok: Authentication token
794 * @crypt_stat: The cryptographic context
795 * @key_rec: encrypted key
796 * @packet_size: This function will write the number of bytes that end
797 * up constituting the packet; set to zero on error
798 *
799 * Returns zero on success; non-zero on error.
800 */
801static int
802write_tag_3_packet(char *dest, size_t max, struct ecryptfs_auth_tok *auth_tok,
803 struct ecryptfs_crypt_stat *crypt_stat,
804 struct ecryptfs_key_record *key_rec, size_t *packet_size)
805{
806 int rc = 0;
807
808 size_t i;
809 size_t signature_is_valid = 0;
810 size_t encrypted_session_key_valid = 0;
811 char session_key_encryption_key[ECRYPTFS_MAX_KEY_BYTES];
812 struct scatterlist dest_sg[2];
813 struct scatterlist src_sg[2];
814 struct crypto_tfm *tfm = NULL;
815 struct mutex *tfm_mutex = NULL;
816 size_t key_rec_size;
817 size_t packet_size_length;
818 size_t cipher_code;
819
820 (*packet_size) = 0;
821 /* Check for a valid signature on the auth_tok */
822 for (i = 0; i < ECRYPTFS_SIG_SIZE_HEX; i++)
823 signature_is_valid |= auth_tok->token.password.signature[i];
824 if (!signature_is_valid)
825 BUG();
826 ecryptfs_from_hex((*key_rec).sig, auth_tok->token.password.signature,
827 ECRYPTFS_SIG_SIZE);
828 encrypted_session_key_valid = 0;
829 for (i = 0; i < crypt_stat->key_size; i++)
830 encrypted_session_key_valid |=
831 auth_tok->session_key.encrypted_key[i];
832 if (encrypted_session_key_valid) {
833 memcpy((*key_rec).enc_key,
834 auth_tok->session_key.encrypted_key,
835 auth_tok->session_key.encrypted_key_size);
836 goto encrypted_session_key_set;
837 }
838 if (auth_tok->session_key.encrypted_key_size == 0)
839 auth_tok->session_key.encrypted_key_size =
840 crypt_stat->key_size;
841 if (crypt_stat->key_size == 24
842 && strcmp("aes", crypt_stat->cipher) == 0) {
843 memset((crypt_stat->key + 24), 0, 8);
844 auth_tok->session_key.encrypted_key_size = 32;
845 }
846 (*key_rec).enc_key_size =
847 auth_tok->session_key.encrypted_key_size;
848 if (ECRYPTFS_CHECK_FLAG(auth_tok->token.password.flags,
849 ECRYPTFS_SESSION_KEY_ENCRYPTION_KEY_SET)) {
850 ecryptfs_printk(KERN_DEBUG, "Using previously generated "
851 "session key encryption key of size [%d]\n",
852 auth_tok->token.password.
853 session_key_encryption_key_bytes);
854 memcpy(session_key_encryption_key,
855 auth_tok->token.password.session_key_encryption_key,
856 crypt_stat->key_size);
857 ecryptfs_printk(KERN_DEBUG,
858 "Cached session key " "encryption key: \n");
859 if (ecryptfs_verbosity > 0)
860 ecryptfs_dump_hex(session_key_encryption_key, 16);
861 }
862 if (unlikely(ecryptfs_verbosity > 0)) {
863 ecryptfs_printk(KERN_DEBUG, "Session key encryption key:\n");
864 ecryptfs_dump_hex(session_key_encryption_key, 16);
865 }
866 rc = virt_to_scatterlist(crypt_stat->key,
867 (*key_rec).enc_key_size, src_sg, 2);
868 if (!rc) {
869 ecryptfs_printk(KERN_ERR, "Error generating scatterlist "
870 "for crypt_stat session key\n");
871 rc = -ENOMEM;
872 goto out;
873 }
874 rc = virt_to_scatterlist((*key_rec).enc_key,
875 (*key_rec).enc_key_size, dest_sg, 2);
876 if (!rc) {
877 ecryptfs_printk(KERN_ERR, "Error generating scatterlist "
878 "for crypt_stat encrypted session key\n");
879 rc = -ENOMEM;
880 goto out;
881 }
882 if (!strcmp(crypt_stat->cipher,
883 crypt_stat->mount_crypt_stat->global_default_cipher_name)
884 && crypt_stat->mount_crypt_stat->global_key_tfm) {
885 tfm = crypt_stat->mount_crypt_stat->global_key_tfm;
886 tfm_mutex = &crypt_stat->mount_crypt_stat->global_key_tfm_mutex;
887 } else
888 tfm = crypto_alloc_tfm(crypt_stat->cipher, 0);
889 if (!tfm) {
890 ecryptfs_printk(KERN_ERR, "Could not initialize crypto "
891 "context for cipher [%s]\n",
892 crypt_stat->cipher);
893 rc = -EINVAL;
894 goto out;
895 }
896 if (tfm_mutex)
897 mutex_lock(tfm_mutex);
898 rc = crypto_cipher_setkey(tfm, session_key_encryption_key,
899 crypt_stat->key_size);
900 if (rc < 0) {
901 if (tfm_mutex)
902 mutex_unlock(tfm_mutex);
903 ecryptfs_printk(KERN_ERR, "Error setting key for crypto "
904 "context\n");
905 goto out;
906 }
907 rc = 0;
908 ecryptfs_printk(KERN_DEBUG, "Encrypting [%d] bytes of the key\n",
909 crypt_stat->key_size);
910 crypto_cipher_encrypt(tfm, dest_sg, src_sg,
911 (*key_rec).enc_key_size);
912 if (tfm_mutex)
913 mutex_unlock(tfm_mutex);
914 ecryptfs_printk(KERN_DEBUG, "This should be the encrypted key:\n");
915 if (ecryptfs_verbosity > 0)
916 ecryptfs_dump_hex((*key_rec).enc_key,
917 (*key_rec).enc_key_size);
918encrypted_session_key_set:
919 /* Now we have a valid key_rec. Append it to the
920 * key_rec set. */
921 key_rec_size = (sizeof(struct ecryptfs_key_record)
922 - ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES
923 + ((*key_rec).enc_key_size));
924 /* TODO: Include a packet size limit as a parameter to this
925 * function once we have multi-packet headers (for versions
926 * later than 0.1) */
927 if (key_rec_size >= ECRYPTFS_MAX_KEYSET_SIZE) {
928 ecryptfs_printk(KERN_ERR, "Keyset too large\n");
929 rc = -EINVAL;
930 goto out;
931 }
932 /* TODO: Packet size limit */
933 /* We have 5 bytes of surrounding packet data */
934 if ((0x05 + ECRYPTFS_SALT_SIZE
935 + (*key_rec).enc_key_size) >= max) {
936 ecryptfs_printk(KERN_ERR, "Authentication token is too "
937 "large\n");
938 rc = -EINVAL;
939 goto out;
940 }
941 /* This format is inspired by OpenPGP; see RFC 2440
942 * packet tag 3 */
943 dest[(*packet_size)++] = ECRYPTFS_TAG_3_PACKET_TYPE;
944 /* ver+cipher+s2k+hash+salt+iter+enc_key */
945 rc = write_packet_length(&dest[(*packet_size)],
946 (0x05 + ECRYPTFS_SALT_SIZE
947 + (*key_rec).enc_key_size),
948 &packet_size_length);
949 if (rc) {
950 ecryptfs_printk(KERN_ERR, "Error generating tag 3 packet "
951 "header; cannot generate packet length\n");
952 goto out;
953 }
954 (*packet_size) += packet_size_length;
955 dest[(*packet_size)++] = 0x04; /* version 4 */
956 cipher_code = ecryptfs_code_for_cipher_string(crypt_stat);
957 if (cipher_code == 0) {
958 ecryptfs_printk(KERN_WARNING, "Unable to generate code for "
959 "cipher [%s]\n", crypt_stat->cipher);
960 rc = -EINVAL;
961 goto out;
962 }
963 dest[(*packet_size)++] = cipher_code;
964 dest[(*packet_size)++] = 0x03; /* S2K */
965 dest[(*packet_size)++] = 0x01; /* MD5 (TODO: parameterize) */
966 memcpy(&dest[(*packet_size)], auth_tok->token.password.salt,
967 ECRYPTFS_SALT_SIZE);
968 (*packet_size) += ECRYPTFS_SALT_SIZE; /* salt */
969 dest[(*packet_size)++] = 0x60; /* hash iterations (65536) */
970 memcpy(&dest[(*packet_size)], (*key_rec).enc_key,
971 (*key_rec).enc_key_size);
972 (*packet_size) += (*key_rec).enc_key_size;
973out:
974 if (tfm && !tfm_mutex)
975 crypto_free_tfm(tfm);
976 if (rc)
977 (*packet_size) = 0;
978 return rc;
979}
980
981/**
982 * ecryptfs_generate_key_packet_set
983 * @dest: Virtual address from which to write the key record set
984 * @crypt_stat: The cryptographic context from which the
985 * authentication tokens will be retrieved
986 * @ecryptfs_dentry: The dentry, used to retrieve the mount crypt stat
987 * for the global parameters
988 * @len: The amount written
989 * @max: The maximum amount of data allowed to be written
990 *
991 * Generates a key packet set and writes it to the virtual address
992 * passed in.
993 *
994 * Returns zero on success; non-zero on error.
995 */
996int
997ecryptfs_generate_key_packet_set(char *dest_base,
998 struct ecryptfs_crypt_stat *crypt_stat,
999 struct dentry *ecryptfs_dentry, size_t *len,
1000 size_t max)
1001{
1002 int rc = 0;
1003 struct ecryptfs_auth_tok *auth_tok;
1004 struct ecryptfs_mount_crypt_stat *mount_crypt_stat =
1005 &ecryptfs_superblock_to_private(
1006 ecryptfs_dentry->d_sb)->mount_crypt_stat;
1007 size_t written;
1008 struct ecryptfs_key_record key_rec;
1009
1010 (*len) = 0;
1011 if (mount_crypt_stat->global_auth_tok) {
1012 auth_tok = mount_crypt_stat->global_auth_tok;
1013 if (auth_tok->token_type == ECRYPTFS_PASSWORD) {
1014 rc = write_tag_3_packet((dest_base + (*len)),
1015 max, auth_tok,
1016 crypt_stat, &key_rec,
1017 &written);
1018 if (rc) {
1019 ecryptfs_printk(KERN_WARNING, "Error "
1020 "writing tag 3 packet\n");
1021 goto out;
1022 }
1023 (*len) += written;
1024 /* Write auth tok signature packet */
1025 rc = write_tag_11_packet(
1026 (dest_base + (*len)),
1027 (max - (*len)),
1028 key_rec.sig, ECRYPTFS_SIG_SIZE, &written);
1029 if (rc) {
1030 ecryptfs_printk(KERN_ERR, "Error writing "
1031 "auth tok signature packet\n");
1032 goto out;
1033 }
1034 (*len) += written;
1035 } else {
1036 ecryptfs_printk(KERN_WARNING, "Unsupported "
1037 "authentication token type\n");
1038 rc = -EINVAL;
1039 goto out;
1040 }
1041 if (rc) {
1042 ecryptfs_printk(KERN_WARNING, "Error writing "
1043 "authentication token packet with sig "
1044 "= [%s]\n",
1045 mount_crypt_stat->global_auth_tok_sig);
1046 rc = -EIO;
1047 goto out;
1048 }
1049 } else
1050 BUG();
1051 if (likely((max - (*len)) > 0)) {
1052 dest_base[(*len)] = 0x00;
1053 } else {
1054 ecryptfs_printk(KERN_ERR, "Error writing boundary byte\n");
1055 rc = -EIO;
1056 }
1057out:
1058 if (rc)
1059 (*len) = 0;
1060 return rc;
1061}
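
Putting write_tag_3_packet() and write_tag_11_packet() together, the key packet set at the front of each lower file in this release has the following byte layout (a summary of the code above, not a normative specification):

Tag 3:  [type] [length] [0x04 version] [cipher code] [0x03 S2K] [0x01 MD5]
        [salt: ECRYPTFS_SALT_SIZE bytes] [0x60 iterations] [encrypted key]
Tag 11: [type] [length] [0x62] [0x08] ["_CONSOLE"] [4 x 0x00 date]
        [key signature: ECRYPTFS_SIG_SIZE bytes]
Final:  [0x00 boundary byte]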
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
new file mode 100644
index 000000000000..5938a232d11b
--- /dev/null
+++ b/fs/ecryptfs/main.c
@@ -0,0 +1,828 @@
1/**
2 * eCryptfs: Linux filesystem encryption layer
3 *
4 * Copyright (C) 1997-2003 Erez Zadok
5 * Copyright (C) 2001-2003 Stony Brook University
6 * Copyright (C) 2004-2006 International Business Machines Corp.
7 * Author(s): Michael A. Halcrow <mahalcro@us.ibm.com>
8 * Michael C. Thompson <mcthomps@us.ibm.com>
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License as
12 * published by the Free Software Foundation; either version 2 of the
13 * License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful, but
16 * WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
23 * 02111-1307, USA.
24 */
25
26#include <linux/dcache.h>
27#include <linux/file.h>
28#include <linux/module.h>
29#include <linux/namei.h>
30#include <linux/skbuff.h>
31#include <linux/crypto.h>
32#include <linux/netlink.h>
33#include <linux/mount.h>
34#include <linux/dcache.h>
35#include <linux/pagemap.h>
36#include <linux/key.h>
37#include <linux/parser.h>
38#include "ecryptfs_kernel.h"
39
40/**
41 * Module parameter that defines the ecryptfs_verbosity level.
42 */
43int ecryptfs_verbosity = 0;
44
45module_param(ecryptfs_verbosity, int, 0);
46MODULE_PARM_DESC(ecryptfs_verbosity,
47 "Initial verbosity level (0 or 1; defaults to "
48 "0, which is Quiet)");
49
50void __ecryptfs_printk(const char *fmt, ...)
51{
52 va_list args;
53 va_start(args, fmt);
54 if (fmt[1] == '7') { /* KERN_DEBUG */
55 if (ecryptfs_verbosity >= 1)
56 vprintk(fmt, args);
57 } else
58 vprintk(fmt, args);
59 va_end(args);
60}
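
The fmt[1] == '7' test relies on the KERN_* prefixes of this kernel generation being literal strings pasted onto the format ("<7>" for KERN_DEBUG); for example:

#include <stdio.h>

#define KERN_DEBUG "<7>"	/* historical definition assumed by the check */

int main(void)
{
	const char *fmt = KERN_DEBUG "eCryptfs: example message\n";

	printf("debug-level message: %s\n", fmt[1] == '7' ? "yes" : "no");
	return 0;
}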
61
62/**
63 * ecryptfs_interpose
64 * @lower_dentry: Existing dentry in the lower filesystem
65 * @dentry: ecryptfs' dentry
66 * @sb: ecryptfs's super_block
67 * @flag: If set to true, then d_add is called, else d_instantiate is called
68 *
69 * Interposes upper and lower dentries.
70 *
71 * Returns zero on success; non-zero otherwise
72 */
73int ecryptfs_interpose(struct dentry *lower_dentry, struct dentry *dentry,
74 struct super_block *sb, int flag)
75{
76 struct inode *lower_inode;
77 struct inode *inode;
78 int rc = 0;
79
80 lower_inode = lower_dentry->d_inode;
81 if (lower_inode->i_sb != ecryptfs_superblock_to_lower(sb)) {
82 rc = -EXDEV;
83 goto out;
84 }
85 if (!igrab(lower_inode)) {
86 rc = -ESTALE;
87 goto out;
88 }
89 inode = iget5_locked(sb, (unsigned long)lower_inode,
90 ecryptfs_inode_test, ecryptfs_inode_set,
91 lower_inode);
92 if (!inode) {
93 rc = -EACCES;
94 iput(lower_inode);
95 goto out;
96 }
97 if (inode->i_state & I_NEW)
98 unlock_new_inode(inode);
99 else
100 iput(lower_inode);
101 if (S_ISLNK(lower_inode->i_mode))
102 inode->i_op = &ecryptfs_symlink_iops;
103 else if (S_ISDIR(lower_inode->i_mode))
104 inode->i_op = &ecryptfs_dir_iops;
105 if (S_ISDIR(lower_inode->i_mode))
106 inode->i_fop = &ecryptfs_dir_fops;
107 if (special_file(lower_inode->i_mode))
108 init_special_inode(inode, lower_inode->i_mode,
109 lower_inode->i_rdev);
110 dentry->d_op = &ecryptfs_dops;
111 if (flag)
112 d_add(dentry, inode);
113 else
114 d_instantiate(dentry, inode);
115 ecryptfs_copy_attr_all(inode, lower_inode);
116 /* This size will be overwritten for real files w/ headers and
117 * other metadata */
118 ecryptfs_copy_inode_size(inode, lower_inode);
119out:
120 return rc;
121}
122
123enum { ecryptfs_opt_sig, ecryptfs_opt_ecryptfs_sig, ecryptfs_opt_debug,
124 ecryptfs_opt_ecryptfs_debug, ecryptfs_opt_cipher,
125 ecryptfs_opt_ecryptfs_cipher, ecryptfs_opt_ecryptfs_key_bytes,
126 ecryptfs_opt_passthrough, ecryptfs_opt_err };
127
128static match_table_t tokens = {
129 {ecryptfs_opt_sig, "sig=%s"},
130 {ecryptfs_opt_ecryptfs_sig, "ecryptfs_sig=%s"},
131 {ecryptfs_opt_debug, "debug=%u"},
132 {ecryptfs_opt_ecryptfs_debug, "ecryptfs_debug=%u"},
133 {ecryptfs_opt_cipher, "cipher=%s"},
134 {ecryptfs_opt_ecryptfs_cipher, "ecryptfs_cipher=%s"},
135 {ecryptfs_opt_ecryptfs_key_bytes, "ecryptfs_key_bytes=%u"},
136 {ecryptfs_opt_passthrough, "ecryptfs_passthrough"},
137 {ecryptfs_opt_err, NULL}
138};
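For illustration, an options string that this match table accepts could look like the following (the sig value is a placeholder, not a real key description):

        sig=0123456789abcdef,cipher=aes,ecryptfs_key_bytes=16,ecryptfs_passthrough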
139
140/**
141 * ecryptfs_verify_version
142 * @version: The version number to confirm
143 *
144 * Returns zero on good version; non-zero otherwise
145 */
146static int ecryptfs_verify_version(u16 version)
147{
148 int rc = 0;
149 unsigned char major;
150 unsigned char minor;
151
152 major = ((version >> 8) & 0xFF);
153 minor = (version & 0xFF);
154 if (major != ECRYPTFS_VERSION_MAJOR) {
155 ecryptfs_printk(KERN_ERR, "Major version number mismatch. "
156 "Expected [%d]; got [%d]\n",
157 ECRYPTFS_VERSION_MAJOR, major);
158 rc = -EINVAL;
159 goto out;
160 }
161 if (minor != ECRYPTFS_VERSION_MINOR) {
162 ecryptfs_printk(KERN_ERR, "Minor version number mismatch. "
163 "Expected [%d]; got [%d]\n",
164 ECRYPTFS_VERSION_MINOR, minor);
165 rc = -EINVAL;
166 goto out;
167 }
168out:
169 return rc;
170}
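The check above assumes the 16-bit version word carries the major number in the high byte and the minor number in the low byte; a matching userspace tool would pack it as in this sketch:

        u16 version = (ECRYPTFS_VERSION_MAJOR << 8) | ECRYPTFS_VERSION_MINOR;
        unsigned char major = (version >> 8) & 0xFF;    /* high byte */
        unsigned char minor = version & 0xFF;           /* low byte  */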
171
172/**
173 * ecryptfs_parse_options
174 * @sb: The ecryptfs super block
175 * @options: The options passed to the kernel
176 *
177 * Parse mount options:
178 * debug=N - ecryptfs_verbosity level for debug output
179 * sig=XXX - description(signature) of the key to use
180 *
181 * The lower-level (lower/interposed) directory is named by the mount
182 * source (dev_name); we mount our stackable file system on top of
183 * that lower directory (see ecryptfs_read_super()).
184 *
185 * The signature of the key to use must be the description of a key
186 * already in the keyring. Mounting will fail if the key cannot be
187 * found.
188 *
189 * Returns zero on success; non-zero on error
190 */
191static int ecryptfs_parse_options(struct super_block *sb, char *options)
192{
193 char *p;
194 int rc = 0;
195 int sig_set = 0;
196 int cipher_name_set = 0;
197 int cipher_key_bytes;
198 int cipher_key_bytes_set = 0;
199 struct key *auth_tok_key = NULL;
200 struct ecryptfs_auth_tok *auth_tok = NULL;
201 struct ecryptfs_mount_crypt_stat *mount_crypt_stat =
202 &ecryptfs_superblock_to_private(sb)->mount_crypt_stat;
203 substring_t args[MAX_OPT_ARGS];
204 int token;
205 char *sig_src;
206 char *sig_dst;
207 char *debug_src;
208 char *cipher_name_dst;
209 char *cipher_name_src;
210 char *cipher_key_bytes_src;
211 struct crypto_tfm *tmp_tfm;
212 int cipher_name_len;
213
214 if (!options) {
215 rc = -EINVAL;
216 goto out;
217 }
218 while ((p = strsep(&options, ",")) != NULL) {
219 if (!*p)
220 continue;
221 token = match_token(p, tokens, args);
222 switch (token) {
223 case ecryptfs_opt_sig:
224 case ecryptfs_opt_ecryptfs_sig:
225 sig_src = args[0].from;
226 sig_dst =
227 mount_crypt_stat->global_auth_tok_sig;
228 memcpy(sig_dst, sig_src, ECRYPTFS_SIG_SIZE_HEX);
229 sig_dst[ECRYPTFS_SIG_SIZE_HEX] = '\0';
230 ecryptfs_printk(KERN_DEBUG,
231 "The mount_crypt_stat "
232 "global_auth_tok_sig set to: "
233 "[%s]\n", sig_dst);
234 sig_set = 1;
235 break;
236 case ecryptfs_opt_debug:
237 case ecryptfs_opt_ecryptfs_debug:
238 debug_src = args[0].from;
239 ecryptfs_verbosity =
240 (int)simple_strtol(debug_src, &debug_src,
241 0);
242 ecryptfs_printk(KERN_DEBUG,
243 "Verbosity set to [%d]\n",
244 ecryptfs_verbosity);
245 break;
246 case ecryptfs_opt_cipher:
247 case ecryptfs_opt_ecryptfs_cipher:
248 cipher_name_src = args[0].from;
249 cipher_name_dst =
250 mount_crypt_stat->
251 global_default_cipher_name;
252 strncpy(cipher_name_dst, cipher_name_src,
253 ECRYPTFS_MAX_CIPHER_NAME_SIZE);
254 ecryptfs_printk(KERN_DEBUG,
255 "The mount_crypt_stat "
256 "global_default_cipher_name set to: "
257 "[%s]\n", cipher_name_dst);
258 cipher_name_set = 1;
259 break;
260 case ecryptfs_opt_ecryptfs_key_bytes:
261 cipher_key_bytes_src = args[0].from;
262 cipher_key_bytes =
263 (int)simple_strtol(cipher_key_bytes_src,
264 &cipher_key_bytes_src, 0);
265 mount_crypt_stat->global_default_cipher_key_size =
266 cipher_key_bytes;
267 ecryptfs_printk(KERN_DEBUG,
268 "The mount_crypt_stat "
269 "global_default_cipher_key_size "
270 "set to: [%d]\n", mount_crypt_stat->
271 global_default_cipher_key_size);
272 cipher_key_bytes_set = 1;
273 break;
274 case ecryptfs_opt_passthrough:
275 mount_crypt_stat->flags |=
276 ECRYPTFS_PLAINTEXT_PASSTHROUGH_ENABLED;
277 break;
278 case ecryptfs_opt_err:
279 default:
280 ecryptfs_printk(KERN_WARNING,
281 "eCryptfs: unrecognized option '%s'\n",
282 p);
283 }
284 }
285 /* A mount-wide passphrase signature is mandatory in the 0.1
286 * release */
287 if (!sig_set) {
288 rc = -EINVAL;
289 ecryptfs_printk(KERN_ERR, "You must supply a valid "
290 "passphrase auth tok signature as a mount "
291 "parameter; see the eCryptfs README\n");
292 goto out;
293 }
294 if (!cipher_name_set) {
295 cipher_name_len = strlen(ECRYPTFS_DEFAULT_CIPHER);
296 if (unlikely(cipher_name_len
297 >= ECRYPTFS_MAX_CIPHER_NAME_SIZE)) {
298 rc = -EINVAL;
299 BUG();
300 goto out;
301 }
302 memcpy(mount_crypt_stat->global_default_cipher_name,
303 ECRYPTFS_DEFAULT_CIPHER, cipher_name_len);
304 mount_crypt_stat->global_default_cipher_name[cipher_name_len]
305 = '\0';
306 }
307 if (!cipher_key_bytes_set) {
308 mount_crypt_stat->global_default_cipher_key_size =
309 ECRYPTFS_DEFAULT_KEY_BYTES;
310 ecryptfs_printk(KERN_DEBUG, "Cipher key size was not "
311 "specified. Defaulting to [%d]\n",
312 mount_crypt_stat->
313 global_default_cipher_key_size);
314 }
315 rc = ecryptfs_process_cipher(
316 &tmp_tfm,
317 &mount_crypt_stat->global_key_tfm,
318 mount_crypt_stat->global_default_cipher_name,
319 mount_crypt_stat->global_default_cipher_key_size);
320 if (tmp_tfm)
321 crypto_free_tfm(tmp_tfm);
322 if (rc) {
323 printk(KERN_ERR "Error attempting to initialize cipher [%s] "
324 "with key size [%Zd] bytes; rc = [%d]\n",
325 mount_crypt_stat->global_default_cipher_name,
326 mount_crypt_stat->global_default_cipher_key_size, rc);
327 rc = -EINVAL;
328 goto out;
329 }
330 mutex_init(&mount_crypt_stat->global_key_tfm_mutex);
331 ecryptfs_printk(KERN_DEBUG, "Requesting the key with description: "
332 "[%s]\n", mount_crypt_stat->global_auth_tok_sig);
333 /* The reference to this key is held until umount; the
334 * call to key_put() is done in ecryptfs_put_super() */
335 auth_tok_key = request_key(&key_type_user,
336 mount_crypt_stat->global_auth_tok_sig,
337 NULL);
338 if (!auth_tok_key || IS_ERR(auth_tok_key)) {
339 ecryptfs_printk(KERN_ERR, "Could not find key with "
340 "description: [%s]\n",
341 mount_crypt_stat->global_auth_tok_sig);
342 process_request_key_err(PTR_ERR(auth_tok_key));
343 rc = -EINVAL;
344 goto out;
345 }
346 auth_tok = ecryptfs_get_key_payload_data(auth_tok_key);
347 if (ecryptfs_verify_version(auth_tok->version)) {
348 ecryptfs_printk(KERN_ERR, "Data structure version mismatch. "
349 "Userspace tools must match eCryptfs kernel "
350 "module with major version [%d] and minor "
351 "version [%d]\n", ECRYPTFS_VERSION_MAJOR,
352 ECRYPTFS_VERSION_MINOR);
353 rc = -EINVAL;
354 goto out;
355 }
356 if (auth_tok->token_type != ECRYPTFS_PASSWORD) {
357 ecryptfs_printk(KERN_ERR, "Invalid auth_tok structure "
358 "returned from key\n");
359 rc = -EINVAL;
360 goto out;
361 }
362 mount_crypt_stat->global_auth_tok_key = auth_tok_key;
363 mount_crypt_stat->global_auth_tok = auth_tok;
364out:
365 return rc;
366}
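Assuming a passphrase auth tok has already been added to the user keyring under the (made-up) description "0123456789abcdef", a mount(2) call that exercises this parser might look like the following sketch; paths and the sig value are hypothetical:

        /* The options string is what ecryptfs_parse_options() receives
         * as its raw_data-derived argument. */
        mount("/lower/dir", "/mnt/ecryptfs", "ecryptfs", 0,
              "sig=0123456789abcdef,cipher=aes,ecryptfs_key_bytes=16");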
367
368struct kmem_cache *ecryptfs_sb_info_cache;
369
370/**
371 * ecryptfs_fill_super
372 * @sb: The ecryptfs super block
373 * @raw_data: The options passed to mount
374 * @silent: Not used but required by function prototype
375 *
376 * Sets up what we can of the sb, rest is done in ecryptfs_read_super
377 *
378 * Returns zero on success; non-zero otherwise
379 */
380static int
381ecryptfs_fill_super(struct super_block *sb, void *raw_data, int silent)
382{
383 int rc = 0;
384
385 /* Released in ecryptfs_put_super() */
386 ecryptfs_set_superblock_private(sb,
387 kmem_cache_alloc(ecryptfs_sb_info_cache,
388 SLAB_KERNEL));
389 if (!ecryptfs_superblock_to_private(sb)) {
390 ecryptfs_printk(KERN_WARNING, "Out of memory\n");
391 rc = -ENOMEM;
392 goto out;
393 }
394 memset(ecryptfs_superblock_to_private(sb), 0,
395 sizeof(struct ecryptfs_sb_info));
396 sb->s_op = &ecryptfs_sops;
397 /* Released through deactivate_super(sb) from get_sb_nodev */
398 sb->s_root = d_alloc(NULL, &(const struct qstr) {
399 .hash = 0, .name = "/", .len = 1 });
400 if (!sb->s_root) {
401 ecryptfs_printk(KERN_ERR, "d_alloc failed\n");
402 rc = -ENOMEM;
403 goto out;
404 }
405 sb->s_root->d_op = &ecryptfs_dops;
406 sb->s_root->d_sb = sb;
407 sb->s_root->d_parent = sb->s_root;
408 /* Released in d_release when dput(sb->s_root) is called */
409 /* through deactivate_super(sb) from get_sb_nodev() */
410 ecryptfs_set_dentry_private(sb->s_root,
411 kmem_cache_alloc(ecryptfs_dentry_info_cache,
412 SLAB_KERNEL));
413 if (!ecryptfs_dentry_to_private(sb->s_root)) {
414 ecryptfs_printk(KERN_ERR,
415 "dentry_info_cache alloc failed\n");
416 rc = -ENOMEM;
417 goto out;
418 }
419 memset(ecryptfs_dentry_to_private(sb->s_root), 0,
420 sizeof(struct ecryptfs_dentry_info));
421 rc = 0;
422out:
423 /* Should be able to rely on deactivate_super called from
424 * get_sb_nodev */
425 return rc;
426}
427
428/**
429 * ecryptfs_read_super
430 * @sb: The ecryptfs super block
431 * @dev_name: The path to mount over
432 *
433 * Read the super block of the lower filesystem, and use
434 * ecryptfs_interpose to create our initial inode and super block
435 * struct.
436 */
437static int ecryptfs_read_super(struct super_block *sb, const char *dev_name)
438{
439 int rc;
440 struct nameidata nd;
441 struct dentry *lower_root;
442 struct vfsmount *lower_mnt;
443
444 memset(&nd, 0, sizeof(struct nameidata));
445 rc = path_lookup(dev_name, LOOKUP_FOLLOW, &nd);
446 if (rc) {
447 ecryptfs_printk(KERN_WARNING, "path_lookup() failed\n");
448 goto out_free;
449 }
450 lower_root = nd.dentry;
451 if (!lower_root->d_inode) {
452 ecryptfs_printk(KERN_WARNING,
453 "No directory to interpose on\n");
454 rc = -ENOENT;
455 goto out_free;
456 }
457 lower_mnt = nd.mnt;
458 ecryptfs_set_superblock_lower(sb, lower_root->d_sb);
459 sb->s_maxbytes = lower_root->d_sb->s_maxbytes;
460 ecryptfs_set_dentry_lower(sb->s_root, lower_root);
461 ecryptfs_set_dentry_lower_mnt(sb->s_root, lower_mnt);
462 if ((rc = ecryptfs_interpose(lower_root, sb->s_root, sb, 0)))
463 goto out_free;
464 rc = 0;
465 goto out;
466out_free:
467 path_release(&nd);
468out:
469 return rc;
470}
471
472/**
473 * ecryptfs_get_sb
474 * @fs_type: The filesystem type
475 * @flags: Mount flags
476 * @dev_name: The path to mount over
477 * @raw_data: The options passed into the kernel
478 *
479 * The whole ecryptfs_get_sb process is broken into 4 functions:
480 * ecryptfs_parse_options(): handle options passed to ecryptfs, if any
481 * ecryptfs_fill_super(): used by get_sb_nodev, fills out the super_block
482 * with as much information as it can before needing
483 * the lower filesystem.
484 * ecryptfs_read_super(): this accesses the lower filesystem and uses
485 * ecryptfs_interpose to perform most of the linking
486 * ecryptfs_interpose(): links the lower filesystem into ecryptfs
487 */
488static int ecryptfs_get_sb(struct file_system_type *fs_type, int flags,
489 const char *dev_name, void *raw_data,
490 struct vfsmount *mnt)
491{
492 int rc;
493 struct super_block *sb;
494
495 rc = get_sb_nodev(fs_type, flags, raw_data, ecryptfs_fill_super, mnt);
496 if (rc < 0) {
497 printk(KERN_ERR "Getting sb failed; rc = [%d]\n", rc);
498 goto out;
499 }
500 sb = mnt->mnt_sb;
501 rc = ecryptfs_parse_options(sb, raw_data);
502 if (rc) {
503 printk(KERN_ERR "Error parsing options; rc = [%d]\n", rc);
504 goto out_abort;
505 }
506 rc = ecryptfs_read_super(sb, dev_name);
507 if (rc) {
508 printk(KERN_ERR "Reading sb failed; rc = [%d]\n", rc);
509 goto out_abort;
510 }
511 goto out;
512out_abort:
513 dput(sb->s_root);
514 up_write(&sb->s_umount);
515 deactivate_super(sb);
516out:
517 return rc;
518}
519
520/**
521 * ecryptfs_kill_block_super
522 * @sb: The ecryptfs super block
523 *
524 * Used to bring the superblock down and free the private data.
525 * Private data is freed in ecryptfs_put_super()
526 */
527static void ecryptfs_kill_block_super(struct super_block *sb)
528{
529 generic_shutdown_super(sb);
530}
531
532static struct file_system_type ecryptfs_fs_type = {
533 .owner = THIS_MODULE,
534 .name = "ecryptfs",
535 .get_sb = ecryptfs_get_sb,
536 .kill_sb = ecryptfs_kill_block_super,
537 .fs_flags = 0
538};
539
540/**
541 * inode_info_init_once
542 *
543 * Initializes the ecryptfs_inode_info_cache when it is created
544 */
545static void
546inode_info_init_once(void *vptr, struct kmem_cache *cachep, unsigned long flags)
547{
548 struct ecryptfs_inode_info *ei = (struct ecryptfs_inode_info *)vptr;
549
550 if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) ==
551 SLAB_CTOR_CONSTRUCTOR)
552 inode_init_once(&ei->vfs_inode);
553}
554
555static struct ecryptfs_cache_info {
556 kmem_cache_t **cache;
557 const char *name;
558 size_t size;
559 void (*ctor)(void*, struct kmem_cache *, unsigned long);
560} ecryptfs_cache_infos[] = {
561 {
562 .cache = &ecryptfs_auth_tok_list_item_cache,
563 .name = "ecryptfs_auth_tok_list_item",
564 .size = sizeof(struct ecryptfs_auth_tok_list_item),
565 },
566 {
567 .cache = &ecryptfs_file_info_cache,
568 .name = "ecryptfs_file_cache",
569 .size = sizeof(struct ecryptfs_file_info),
570 },
571 {
572 .cache = &ecryptfs_dentry_info_cache,
573 .name = "ecryptfs_dentry_info_cache",
574 .size = sizeof(struct ecryptfs_dentry_info),
575 },
576 {
577 .cache = &ecryptfs_inode_info_cache,
578 .name = "ecryptfs_inode_cache",
579 .size = sizeof(struct ecryptfs_inode_info),
580 .ctor = inode_info_init_once,
581 },
582 {
583 .cache = &ecryptfs_sb_info_cache,
584 .name = "ecryptfs_sb_cache",
585 .size = sizeof(struct ecryptfs_sb_info),
586 },
587 {
588 .cache = &ecryptfs_header_cache_0,
589 .name = "ecryptfs_headers_0",
590 .size = PAGE_CACHE_SIZE,
591 },
592 {
593 .cache = &ecryptfs_header_cache_1,
594 .name = "ecryptfs_headers_1",
595 .size = PAGE_CACHE_SIZE,
596 },
597 {
598 .cache = &ecryptfs_header_cache_2,
599 .name = "ecryptfs_headers_2",
600 .size = PAGE_CACHE_SIZE,
601 },
602 {
603 .cache = &ecryptfs_lower_page_cache,
604 .name = "ecryptfs_lower_page_cache",
605 .size = PAGE_CACHE_SIZE,
606 },
607};
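Extending this table-driven registry is a one-entry change; a hypothetical example entry follows (both the cache pointer and the struct are made up for illustration):

        {
                .cache = &ecryptfs_example_info_cache,  /* hypothetical */
                .name = "ecryptfs_example_cache",
                .size = sizeof(struct ecryptfs_example_info),
        },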
608
609static void ecryptfs_free_kmem_caches(void)
610{
611 int i;
612
613 for (i = 0; i < ARRAY_SIZE(ecryptfs_cache_infos); i++) {
614 struct ecryptfs_cache_info *info;
615
616 info = &ecryptfs_cache_infos[i];
617 if (*(info->cache))
618 kmem_cache_destroy(*(info->cache));
619 }
620}
621
622/**
623 * ecryptfs_init_kmem_caches
624 *
625 * Returns zero on success; non-zero otherwise
626 */
627static int ecryptfs_init_kmem_caches(void)
628{
629 int i;
630
631 for (i = 0; i < ARRAY_SIZE(ecryptfs_cache_infos); i++) {
632 struct ecryptfs_cache_info *info;
633
634 info = &ecryptfs_cache_infos[i];
635 *(info->cache) = kmem_cache_create(info->name, info->size,
636 0, SLAB_HWCACHE_ALIGN, info->ctor, NULL);
637 if (!*(info->cache)) {
638 ecryptfs_free_kmem_caches();
639 ecryptfs_printk(KERN_WARNING, "%s: "
640 "kmem_cache_create failed\n",
641 info->name);
642 return -ENOMEM;
643 }
644 }
645 return 0;
646}
647
648struct ecryptfs_obj {
649 char *name;
650 struct list_head slot_list;
651 struct kobject kobj;
652};
653
654struct ecryptfs_attribute {
655 struct attribute attr;
656 ssize_t(*show) (struct ecryptfs_obj *, char *);
657 ssize_t(*store) (struct ecryptfs_obj *, const char *, size_t);
658};
659
660static ssize_t
661ecryptfs_attr_store(struct kobject *kobj,
662 struct attribute *attr, const char *buf, size_t len)
663{
664 struct ecryptfs_obj *obj = container_of(kobj, struct ecryptfs_obj,
665 kobj);
666 struct ecryptfs_attribute *attribute =
667 container_of(attr, struct ecryptfs_attribute, attr);
668
669 return (attribute->store ? attribute->store(obj, buf, len) : 0);
670}
671
672static ssize_t
673ecryptfs_attr_show(struct kobject *kobj, struct attribute *attr, char *buf)
674{
675 struct ecryptfs_obj *obj = container_of(kobj, struct ecryptfs_obj,
676 kobj);
677 struct ecryptfs_attribute *attribute =
678 container_of(attr, struct ecryptfs_attribute, attr);
679
680 return (attribute->show ? attribute->show(obj, buf) : 0);
681}
682
683static struct sysfs_ops ecryptfs_sysfs_ops = {
684 .show = ecryptfs_attr_show,
685 .store = ecryptfs_attr_store
686};
687
688static struct kobj_type ecryptfs_ktype = {
689 .sysfs_ops = &ecryptfs_sysfs_ops
690};
691
692static decl_subsys(ecryptfs, &ecryptfs_ktype, NULL);
693
694static ssize_t version_show(struct ecryptfs_obj *obj, char *buff)
695{
696 return snprintf(buff, PAGE_SIZE, "%d\n", ECRYPTFS_VERSIONING_MASK);
697}
698
699static struct ecryptfs_attribute sysfs_attr_version = __ATTR_RO(version);
700
701struct ecryptfs_version_str_map_elem {
702 u32 flag;
703 char *str;
704} ecryptfs_version_str_map[] = {
705 {ECRYPTFS_VERSIONING_PASSPHRASE, "passphrase"},
706 {ECRYPTFS_VERSIONING_PUBKEY, "pubkey"},
707 {ECRYPTFS_VERSIONING_PLAINTEXT_PASSTHROUGH, "plaintext passthrough"},
708 {ECRYPTFS_VERSIONING_POLICY, "policy"}
709};
710
711static ssize_t version_str_show(struct ecryptfs_obj *obj, char *buff)
712{
713 int i;
714 int remaining = PAGE_SIZE;
715 int total_written = 0;
716
717 buff[0] = '\0';
718 for (i = 0; i < ARRAY_SIZE(ecryptfs_version_str_map); i++) {
719 int entry_size;
720
721 if (!(ECRYPTFS_VERSIONING_MASK
722 & ecryptfs_version_str_map[i].flag))
723 continue;
724 entry_size = strlen(ecryptfs_version_str_map[i].str);
725 if ((entry_size + 2) > remaining)
726 goto out;
727 memcpy(buff, ecryptfs_version_str_map[i].str, entry_size);
728 buff[entry_size++] = '\n';
729 buff[entry_size] = '\0';
730 buff += entry_size;
731 total_written += entry_size;
732 remaining -= entry_size;
733 }
734out:
735 return total_written;
736}
737
738static struct ecryptfs_attribute sysfs_attr_version_str = __ATTR_RO(version_str);
739
740static int do_sysfs_registration(void)
741{
742 int rc;
743
744 if ((rc = subsystem_register(&ecryptfs_subsys))) {
745 printk(KERN_ERR
746 "Unable to register ecryptfs sysfs subsystem\n");
747 goto out;
748 }
749 rc = sysfs_create_file(&ecryptfs_subsys.kset.kobj,
750 &sysfs_attr_version.attr);
751 if (rc) {
752 printk(KERN_ERR
753 "Unable to create ecryptfs version attribute\n");
754 subsystem_unregister(&ecryptfs_subsys);
755 goto out;
756 }
757 rc = sysfs_create_file(&ecryptfs_subsys.kset.kobj,
758 &sysfs_attr_version_str.attr);
759 if (rc) {
760 printk(KERN_ERR
761 "Unable to create ecryptfs version_str attribute\n");
762 sysfs_remove_file(&ecryptfs_subsys.kset.kobj,
763 &sysfs_attr_version.attr);
764 subsystem_unregister(&ecryptfs_subsys);
765 goto out;
766 }
767out:
768 return rc;
769}
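Given the decl_subsys()/fs_subsys wiring in this file, the two attributes should surface under /sys/fs/ecryptfs once registration succeeds (paths assumed from that wiring):

        /sys/fs/ecryptfs/version      (numeric ECRYPTFS_VERSIONING_MASK)
        /sys/fs/ecryptfs/version_str  (one supported feature name per line)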
770
771static int __init ecryptfs_init(void)
772{
773 int rc;
774
775 if (ECRYPTFS_DEFAULT_EXTENT_SIZE > PAGE_CACHE_SIZE) {
776 rc = -EINVAL;
777 ecryptfs_printk(KERN_ERR, "The eCryptfs extent size is "
778 "larger than the host's page size, and so "
779 "eCryptfs cannot run on this system. The "
780 "default eCryptfs extent size is [%d] bytes; "
781 "the page size is [%d] bytes.\n",
782 ECRYPTFS_DEFAULT_EXTENT_SIZE, PAGE_CACHE_SIZE);
783 goto out;
784 }
785 rc = ecryptfs_init_kmem_caches();
786 if (rc) {
787 printk(KERN_ERR
788 "Failed to allocate one or more kmem_cache objects\n");
789 goto out;
790 }
791 rc = register_filesystem(&ecryptfs_fs_type);
792 if (rc) {
793 printk(KERN_ERR "Failed to register filesystem\n");
794 ecryptfs_free_kmem_caches();
795 goto out;
796 }
797 kset_set_kset_s(&ecryptfs_subsys, fs_subsys);
798 sysfs_attr_version.attr.owner = THIS_MODULE;
799 sysfs_attr_version_str.attr.owner = THIS_MODULE;
800 rc = do_sysfs_registration();
801 if (rc) {
802 printk(KERN_ERR "sysfs registration failed\n");
803 unregister_filesystem(&ecryptfs_fs_type);
804 ecryptfs_free_kmem_caches();
805 goto out;
806 }
807out:
808 return rc;
809}
810
811static void __exit ecryptfs_exit(void)
812{
813 sysfs_remove_file(&ecryptfs_subsys.kset.kobj,
814 &sysfs_attr_version.attr);
815 sysfs_remove_file(&ecryptfs_subsys.kset.kobj,
816 &sysfs_attr_version_str.attr);
817 subsystem_unregister(&ecryptfs_subsys);
818 unregister_filesystem(&ecryptfs_fs_type);
819 ecryptfs_free_kmem_caches();
820}
821
822MODULE_AUTHOR("Michael A. Halcrow <mhalcrow@us.ibm.com>");
823MODULE_DESCRIPTION("eCryptfs");
824
825MODULE_LICENSE("GPL");
826
827module_init(ecryptfs_init)
828module_exit(ecryptfs_exit)
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
new file mode 100644
index 000000000000..924dd90a4cf5
--- /dev/null
+++ b/fs/ecryptfs/mmap.c
@@ -0,0 +1,788 @@
1/**
2 * eCryptfs: Linux filesystem encryption layer
3 * This is where eCryptfs coordinates the symmetric encryption and
4 * decryption of the file data as it passes between the lower
5 * encrypted file and the upper decrypted file.
6 *
7 * Copyright (C) 1997-2003 Erez Zadok
8 * Copyright (C) 2001-2003 Stony Brook University
9 * Copyright (C) 2004-2006 International Business Machines Corp.
10 * Author(s): Michael A. Halcrow <mahalcro@us.ibm.com>
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License as
14 * published by the Free Software Foundation; either version 2 of the
15 * License, or (at your option) any later version.
16 *
17 * This program is distributed in the hope that it will be useful, but
18 * WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, write to the Free Software
24 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
25 * 02111-1307, USA.
26 */
27
28#include <linux/pagemap.h>
29#include <linux/writeback.h>
30#include <linux/page-flags.h>
31#include <linux/mount.h>
32#include <linux/file.h>
33#include <linux/crypto.h>
34#include <linux/scatterlist.h>
35#include "ecryptfs_kernel.h"
36
37struct kmem_cache *ecryptfs_lower_page_cache;
38
39/**
40 * ecryptfs_get1page
41 *
42 * Get one page from the page cache or the lower f/s; return an error
43 * pointer otherwise.
44 *
45 * Returns an unlocked, up-to-date page (if ok) with its refcount incremented.
46 */
47static struct page *ecryptfs_get1page(struct file *file, int index)
48{
49 struct page *page;
50 struct dentry *dentry;
51 struct inode *inode;
52 struct address_space *mapping;
53
54 dentry = file->f_dentry;
55 inode = dentry->d_inode;
56 mapping = inode->i_mapping;
57 page = read_cache_page(mapping, index,
58 (filler_t *)mapping->a_ops->readpage,
59 (void *)file);
60 if (IS_ERR(page))
61 goto out;
62 wait_on_page_locked(page);
63out:
64 return page;
65}
66
67static
68int write_zeros(struct file *file, pgoff_t index, int start, int num_zeros);
69
70/**
71 * ecryptfs_fill_zeros
72 * @file: The ecryptfs file
73 * @new_length: The new length of the data in the underlying file;
74 * everything between the prior end of the file and the
75 * new end of the file will be filled with zeros.
76 * new_length must be greater than the current length.
77 *
78 * Function for handling lseek-ing past the end of the file.
79 *
80 * This function does not support shrinking, only growing a file.
81 *
82 * Returns zero on success; non-zero otherwise.
83 */
84int ecryptfs_fill_zeros(struct file *file, loff_t new_length)
85{
86 int rc = 0;
87 struct dentry *dentry = file->f_dentry;
88 struct inode *inode = dentry->d_inode;
89 pgoff_t old_end_page_index = 0;
90 pgoff_t index = old_end_page_index;
91 int old_end_pos_in_page = -1;
92 pgoff_t new_end_page_index;
93 int new_end_pos_in_page;
94 loff_t cur_length = i_size_read(inode);
95
96 if (cur_length != 0) {
97 index = old_end_page_index =
98 ((cur_length - 1) >> PAGE_CACHE_SHIFT);
99 old_end_pos_in_page = ((cur_length - 1) & ~PAGE_CACHE_MASK);
100 }
101 new_end_page_index = ((new_length - 1) >> PAGE_CACHE_SHIFT);
102 new_end_pos_in_page = ((new_length - 1) & ~PAGE_CACHE_MASK);
103 ecryptfs_printk(KERN_DEBUG, "old_end_page_index = [0x%.16x]; "
104 "old_end_pos_in_page = [%d]; "
105 "new_end_page_index = [0x%.16x]; "
106 "new_end_pos_in_page = [%d]\n",
107 old_end_page_index, old_end_pos_in_page,
108 new_end_page_index, new_end_pos_in_page);
109 if (old_end_page_index == new_end_page_index) {
110 /* Start and end are in the same page; we just need to
111 * set a portion of the existing page to zeros */
112 rc = write_zeros(file, index, (old_end_pos_in_page + 1),
113 (new_end_pos_in_page - old_end_pos_in_page));
114 if (rc)
115 ecryptfs_printk(KERN_ERR, "write_zeros(file=[%p], "
116 "index=[0x%.16x], "
117 "old_end_pos_in_page=[%d], "
118 "(PAGE_CACHE_SIZE - new_end_pos_in_page"
119 "=[%d]"
120 ")=[%d]) returned [%d]\n", file, index,
121 old_end_pos_in_page,
122 new_end_pos_in_page,
123 (PAGE_CACHE_SIZE - new_end_pos_in_page),
124 rc);
125 goto out;
126 }
127 /* Fill the remainder of the previous last page with zeros */
128 rc = write_zeros(file, index, (old_end_pos_in_page + 1),
129 ((PAGE_CACHE_SIZE - 1) - old_end_pos_in_page));
130 if (rc) {
131 ecryptfs_printk(KERN_ERR, "write_zeros(file=[%p], "
132 "index=[0x%.16x], old_end_pos_in_page=[%d], "
133 "(PAGE_CACHE_SIZE - old_end_pos_in_page)=[%d]) "
134 "returned [%d]\n", file, index,
135 old_end_pos_in_page,
136 (PAGE_CACHE_SIZE - old_end_pos_in_page), rc);
137 goto out;
138 }
139 index++;
140 while (index < new_end_page_index) {
141 /* Fill all intermediate pages with zeros */
142 rc = write_zeros(file, index, 0, PAGE_CACHE_SIZE);
143 if (rc) {
144 ecryptfs_printk(KERN_ERR, "write_zeros(file=[%p], "
145 "index=[0x%.16x], "
146 "old_end_pos_in_page=[%d], "
147 "(PAGE_CACHE_SIZE - new_end_pos_in_page"
148 "=[%d]"
149 ")=[%d]) returned [%d]\n", file, index,
150 old_end_pos_in_page,
151 new_end_pos_in_page,
152 (PAGE_CACHE_SIZE - new_end_pos_in_page),
153 rc);
154 goto out;
155 }
156 index++;
157 }
158 /* Fill the portion at the beginning of the last new page with
159 * zero's */
160 rc = write_zeros(file, index, 0, (new_end_pos_in_page + 1));
161 if (rc) {
162 ecryptfs_printk(KERN_ERR, "write_zeros(file="
163 "[%p], index=[0x%.16x], 0, "
164 "new_end_pos_in_page=[%d]) "
165 "returned [%d]\n", file, index,
166 new_end_pos_in_page, rc);
167 goto out;
168 }
169out:
170 return rc;
171}
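The index/offset arithmetic above can be sanity-checked with a worked example, assuming 4096-byte pages (PAGE_CACHE_SHIFT == 12, so ~PAGE_CACHE_MASK == 4095):

        /* For a file length of 5000 bytes, the last byte is offset 4999 */
        pgoff_t last_index = (5000 - 1) >> 12;   /* == 1 (second page)   */
        int last_pos       = (5000 - 1) & 4095;  /* == 903 within page 1 */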
172
173/**
174 * ecryptfs_writepage
175 * @page: Page that is locked before this call is made
176 *
177 * Returns zero on success; non-zero otherwise
178 */
179static int ecryptfs_writepage(struct page *page, struct writeback_control *wbc)
180{
181 struct ecryptfs_page_crypt_context ctx;
182 int rc;
183
184 ctx.page = page;
185 ctx.mode = ECRYPTFS_WRITEPAGE_MODE;
186 ctx.param.wbc = wbc;
187 rc = ecryptfs_encrypt_page(&ctx);
188 if (rc) {
189 ecryptfs_printk(KERN_WARNING, "Error encrypting "
190 "page (upper index [0x%.16x])\n", page->index);
191 ClearPageUptodate(page);
192 goto out;
193 }
194 SetPageUptodate(page);
195 unlock_page(page);
196out:
197 return rc;
198}
199
200/**
201 * ecryptfs_do_readpage
202 * @page: Page to fill
203 * @lower_page_index: Index of the page in the lower file to get
204 *
205 * Reads the data from the lower file at lower_page_index into page.
206 */
207int ecryptfs_do_readpage(struct file *file, struct page *page,
208 pgoff_t lower_page_index)
209{
210 int rc;
211 struct dentry *dentry;
212 struct file *lower_file;
213 struct dentry *lower_dentry;
214 struct inode *inode;
215 struct inode *lower_inode;
216 char *page_data;
217 struct page *lower_page = NULL;
218 char *lower_page_data;
219 const struct address_space_operations *lower_a_ops;
220
221 dentry = file->f_dentry;
222 lower_file = ecryptfs_file_to_lower(file);
223 lower_dentry = ecryptfs_dentry_to_lower(dentry);
224 inode = dentry->d_inode;
225 lower_inode = ecryptfs_inode_to_lower(inode);
226 lower_a_ops = lower_inode->i_mapping->a_ops;
227 lower_page = read_cache_page(lower_inode->i_mapping, lower_page_index,
228 (filler_t *)lower_a_ops->readpage,
229 (void *)lower_file);
230 if (IS_ERR(lower_page)) {
231 rc = PTR_ERR(lower_page);
232 lower_page = NULL;
233 ecryptfs_printk(KERN_ERR, "Error reading from page cache\n");
234 goto out;
235 }
236 wait_on_page_locked(lower_page);
237 page_data = (char *)kmap(page);
238 if (!page_data) {
239 rc = -ENOMEM;
240 ecryptfs_printk(KERN_ERR, "Error mapping page\n");
241 goto out;
242 }
243 lower_page_data = (char *)kmap(lower_page);
244 if (!lower_page_data) {
245 rc = -ENOMEM;
246 ecryptfs_printk(KERN_ERR, "Error mapping page\n");
247 kunmap(page);
248 goto out;
249 }
250 memcpy(page_data, lower_page_data, PAGE_CACHE_SIZE);
251 kunmap(lower_page);
252 kunmap(page);
253 rc = 0;
254out:
255 if (likely(lower_page))
256 page_cache_release(lower_page);
257 if (rc == 0)
258 SetPageUptodate(page);
259 else
260 ClearPageUptodate(page);
261 return rc;
262}
263
264/**
265 * ecryptfs_readpage
266 * @file: This is an ecryptfs file
267 * @page: ecryptfs associated page to stick the read data into
268 *
269 * Read in a page, decrypting if necessary.
270 *
271 * Returns zero on success; non-zero on error.
272 */
273static int ecryptfs_readpage(struct file *file, struct page *page)
274{
275 int rc = 0;
276 struct ecryptfs_crypt_stat *crypt_stat;
277
278 BUG_ON(!(file && file->f_dentry && file->f_dentry->d_inode));
279 crypt_stat =
280 &ecryptfs_inode_to_private(file->f_dentry->d_inode)->crypt_stat;
281 if (!crypt_stat
282 || !ECRYPTFS_CHECK_FLAG(crypt_stat->flags, ECRYPTFS_ENCRYPTED)
283 || ECRYPTFS_CHECK_FLAG(crypt_stat->flags, ECRYPTFS_NEW_FILE)) {
284 ecryptfs_printk(KERN_DEBUG,
285 "Passing through unencrypted page\n");
286 rc = ecryptfs_do_readpage(file, page, page->index);
287 if (rc) {
288 ecryptfs_printk(KERN_ERR, "Error reading page; rc = "
289 "[%d]\n", rc);
290 goto out;
291 }
292 } else {
293 rc = ecryptfs_decrypt_page(file, page);
294 if (rc) {
296 ecryptfs_printk(KERN_ERR, "Error decrypting page; "
297 "rc = [%d]\n", rc);
298 goto out;
299 }
300 }
301 SetPageUptodate(page);
302out:
303 if (rc)
304 ClearPageUptodate(page);
305 ecryptfs_printk(KERN_DEBUG, "Unlocking page with index = [0x%.16x]\n",
306 page->index);
307 unlock_page(page);
308 return rc;
309}
310
311static int fill_zeros_to_end_of_page(struct page *page, unsigned int to)
312{
313 struct inode *inode = page->mapping->host;
314 int end_byte_in_page;
315 int rc = 0;
316 char *page_virt;
317
318 if ((i_size_read(inode) / PAGE_CACHE_SIZE) == page->index) {
319 end_byte_in_page = i_size_read(inode) % PAGE_CACHE_SIZE;
320 if (to > end_byte_in_page)
321 end_byte_in_page = to;
322 page_virt = kmap(page);
323 if (!page_virt) {
324 rc = -ENOMEM;
325 ecryptfs_printk(KERN_WARNING,
326 "Could not map page\n");
327 goto out;
328 }
329 memset((page_virt + end_byte_in_page), 0,
330 (PAGE_CACHE_SIZE - end_byte_in_page));
331 kunmap(page);
332 }
333out:
334 return rc;
335}
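A worked example of the zero-fill above, assuming PAGE_CACHE_SIZE == 4096: for i_size == 8200, only page index 2 (8200 / 4096) is the final page, end_byte_in_page == 8200 % 4096 == 8, and bytes 8..4095 of that page are zeroed, unless the write end `to' already reaches past byte 8, in which case zeroing starts at `to' instead.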
336
337static int ecryptfs_prepare_write(struct file *file, struct page *page,
338 unsigned from, unsigned to)
339{
340 int rc = 0;
341
342 kmap(page);
343 if (from == 0 && to == PAGE_CACHE_SIZE)
344 goto out; /* If we are writing a full page, it will be
345 up to date. */
346 if (!PageUptodate(page))
347 rc = ecryptfs_do_readpage(file, page, page->index);
348out:
349 return rc;
350}
351
352int ecryptfs_grab_and_map_lower_page(struct page **lower_page,
353 char **lower_virt,
354 struct inode *lower_inode,
355 unsigned long lower_page_index)
356{
357 int rc = 0;
358
359 (*lower_page) = grab_cache_page(lower_inode->i_mapping,
360 lower_page_index);
361 if (!(*lower_page)) {
362 ecryptfs_printk(KERN_ERR, "grab_cache_page for "
363 "lower_page_index = [0x%.16x] failed\n",
364 lower_page_index);
365 rc = -EINVAL;
366 goto out;
367 }
368 if (lower_virt)
369 (*lower_virt) = kmap((*lower_page));
370 else
371 kmap((*lower_page));
372out:
373 return rc;
374}
375
376int ecryptfs_writepage_and_release_lower_page(struct page *lower_page,
377 struct inode *lower_inode,
378 struct writeback_control *wbc)
379{
380 int rc = 0;
381
382 rc = lower_inode->i_mapping->a_ops->writepage(lower_page, wbc);
383 if (rc) {
384 ecryptfs_printk(KERN_ERR, "Error calling lower writepage(); "
385 "rc = [%d]\n", rc);
386 goto out;
387 }
388 lower_inode->i_mtime = lower_inode->i_ctime = CURRENT_TIME;
389 page_cache_release(lower_page);
390out:
391 return rc;
392}
393
394static void ecryptfs_unmap_and_release_lower_page(struct page *lower_page)
395{
396 kunmap(lower_page);
397 ecryptfs_printk(KERN_DEBUG, "Unlocking lower page with index = "
398 "[0x%.16x]\n", lower_page->index);
399 unlock_page(lower_page);
400 page_cache_release(lower_page);
401}
402
403/**
404 * ecryptfs_write_inode_size_to_header
405 *
406 * Writes the inode size into the first 8 bytes of the lower file's header.
407 *
408 * Returns zero on success; non-zero on error.
409 */
410int
411ecryptfs_write_inode_size_to_header(struct file *lower_file,
412 struct inode *lower_inode,
413 struct inode *inode)
414{
415 int rc = 0;
416 struct page *header_page;
417 char *header_virt;
418 const struct address_space_operations *lower_a_ops;
419 u64 file_size;
420
421 rc = ecryptfs_grab_and_map_lower_page(&header_page, &header_virt,
422 lower_inode, 0);
423 if (rc) {
424 ecryptfs_printk(KERN_ERR, "grab_cache_page for header page "
425 "failed\n");
426 goto out;
427 }
428 lower_a_ops = lower_inode->i_mapping->a_ops;
429 rc = lower_a_ops->prepare_write(lower_file, header_page, 0, 8);
430 file_size = (u64)i_size_read(inode);
431 ecryptfs_printk(KERN_DEBUG, "Writing size: [0x%.16x]\n", file_size);
432 file_size = cpu_to_be64(file_size);
433 memcpy(header_virt, &file_size, sizeof(u64));
434 rc = lower_a_ops->commit_write(lower_file, header_page, 0, 8);
435 if (rc < 0)
436 ecryptfs_printk(KERN_ERR, "Error committing header page "
437 "write\n");
438 ecryptfs_unmap_and_release_lower_page(header_page);
439 lower_inode->i_mtime = lower_inode->i_ctime = CURRENT_TIME;
440 mark_inode_dirty_sync(inode);
441out:
442 return rc;
443}
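The size field written above is a big-endian u64 occupying the first 8 bytes of the lower file; a reader recovers it with the inverse conversion, as in this sketch (header_virt is assumed to point at the mapped header page):

        u64 file_size;
        memcpy(&file_size, header_virt, sizeof(u64));
        file_size = be64_to_cpu(file_size);     /* undo the cpu_to_be64() */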
444
445int ecryptfs_get_lower_page(struct page **lower_page, struct inode *lower_inode,
446 struct file *lower_file,
447 unsigned long lower_page_index, int byte_offset,
448 int region_bytes)
449{
450 int rc = 0;
451
452 rc = ecryptfs_grab_and_map_lower_page(lower_page, NULL, lower_inode,
453 lower_page_index);
454 if (rc) {
455 ecryptfs_printk(KERN_ERR, "Error attempting to grab and map "
456 "lower page with index [0x%.16x]\n",
457 lower_page_index);
458 goto out;
459 }
460 rc = lower_inode->i_mapping->a_ops->prepare_write(lower_file,
461 (*lower_page),
462 byte_offset,
463 region_bytes);
464 if (rc) {
465 ecryptfs_printk(KERN_ERR, "prepare_write for "
466 "lower_page_index = [0x%.16x] failed; rc = "
467 "[%d]\n", lower_page_index, rc);
468 }
469out:
470 if (rc && (*lower_page)) {
471 ecryptfs_unmap_and_release_lower_page(*lower_page);
472 (*lower_page) = NULL;
473 }
474 return rc;
475}
476
477/**
478 * ecryptfs_commit_lower_page
479 *
480 * Returns zero on success; non-zero on error
481 */
482int
483ecryptfs_commit_lower_page(struct page *lower_page, struct inode *lower_inode,
484 struct file *lower_file, int byte_offset,
485 int region_size)
486{
487 int rc = 0;
488
489 rc = lower_inode->i_mapping->a_ops->commit_write(
490 lower_file, lower_page, byte_offset, region_size);
491 if (rc < 0) {
492 ecryptfs_printk(KERN_ERR,
493 "Error committing write; rc = [%d]\n", rc);
494 } else
495 rc = 0;
496 ecryptfs_unmap_and_release_lower_page(lower_page);
497 return rc;
498}
499
500/**
501 * ecryptfs_copy_page_to_lower
502 *
503 * Used for plaintext pass-through; no page index interpolation
504 * required.
505 */
506int ecryptfs_copy_page_to_lower(struct page *page, struct inode *lower_inode,
507 struct file *lower_file)
508{
509 int rc = 0;
510 struct page *lower_page;
511
512 rc = ecryptfs_get_lower_page(&lower_page, lower_inode, lower_file,
513 page->index, 0, PAGE_CACHE_SIZE);
514 if (rc) {
515 ecryptfs_printk(KERN_ERR, "Error attempting to get page "
516 "at index [0x%.16x]\n", page->index);
517 goto out;
518 }
519 /* TODO: aops */
520 memcpy((char *)page_address(lower_page), page_address(page),
521 PAGE_CACHE_SIZE);
522 rc = ecryptfs_commit_lower_page(lower_page, lower_inode, lower_file,
523 0, PAGE_CACHE_SIZE);
524 if (rc)
525 ecryptfs_printk(KERN_ERR, "Error attempting to commit page "
526 "at index [0x%.16x]\n", page->index);
527out:
528 return rc;
529}
530
531static int
532process_new_file(struct ecryptfs_crypt_stat *crypt_stat,
533 struct file *file, struct inode *inode)
534{
535 struct page *header_page;
536 const struct address_space_operations *lower_a_ops;
537 struct inode *lower_inode;
538 struct file *lower_file;
539 char *header_virt;
540 int rc = 0;
541 int current_header_page = 0;
542 int header_pages;
543 int more_header_data_to_be_written = 1;
544
545 lower_inode = ecryptfs_inode_to_lower(inode);
546 lower_file = ecryptfs_file_to_lower(file);
547 lower_a_ops = lower_inode->i_mapping->a_ops;
548 header_pages = ((crypt_stat->header_extent_size
549 * crypt_stat->num_header_extents_at_front)
550 / PAGE_CACHE_SIZE);
551 BUG_ON(header_pages < 1);
552 while (current_header_page < header_pages) {
553 rc = ecryptfs_grab_and_map_lower_page(&header_page,
554 &header_virt,
555 lower_inode,
556 current_header_page);
557 if (rc) {
558 ecryptfs_printk(KERN_ERR, "grab_cache_page for "
559 "header page [%d] failed; rc = [%d]\n",
560 current_header_page, rc);
561 goto out;
562 }
563 rc = lower_a_ops->prepare_write(lower_file, header_page, 0,
564 PAGE_CACHE_SIZE);
565 if (rc) {
566 ecryptfs_printk(KERN_ERR, "Error preparing to write "
567 "header page out; rc = [%d]\n", rc);
568 goto out;
569 }
570 memset(header_virt, 0, PAGE_CACHE_SIZE);
571 if (more_header_data_to_be_written) {
572 rc = ecryptfs_write_headers_virt(header_virt,
573 crypt_stat,
574 file->f_dentry);
575 if (rc) {
576 ecryptfs_printk(KERN_WARNING, "Error "
577 "generating header; rc = "
578 "[%d]\n", rc);
579 rc = -EIO;
580 memset(header_virt, 0, PAGE_CACHE_SIZE);
581 ecryptfs_unmap_and_release_lower_page(
582 header_page);
583 goto out;
584 }
585 if (current_header_page == 0)
586 memset(header_virt, 0, 8);
587 more_header_data_to_be_written = 0;
588 }
589 rc = lower_a_ops->commit_write(lower_file, header_page, 0,
590 PAGE_CACHE_SIZE);
591 ecryptfs_unmap_and_release_lower_page(header_page);
592 if (rc < 0) {
593 ecryptfs_printk(KERN_ERR,
594 "Error committing header page write; "
595 "rc = [%d]\n", rc);
596 break;
597 }
598 current_header_page++;
599 }
600 if (rc >= 0) {
601 rc = 0;
602 ecryptfs_printk(KERN_DEBUG, "lower_inode->i_blocks = "
603 "[0x%.16x]\n", lower_inode->i_blocks);
604 i_size_write(inode, 0);
605 lower_inode->i_mtime = lower_inode->i_ctime = CURRENT_TIME;
606 mark_inode_dirty_sync(inode);
607 }
608 ecryptfs_printk(KERN_DEBUG, "Clearing ECRYPTFS_NEW_FILE flag in "
609 "crypt_stat at memory location [%p]\n", crypt_stat);
610 ECRYPTFS_CLEAR_FLAG(crypt_stat->flags, ECRYPTFS_NEW_FILE);
611out:
612 return rc;
613}
614
615/**
616 * ecryptfs_commit_write
617 * @file: The eCryptfs file object
618 * @page: The eCryptfs page
619 * @from: Ignored (we rotate the page IV on each write)
620 * @to: Ignored
621 *
622 * This is where we encrypt the data and pass the encrypted data to
623 * the lower filesystem. In OpenPGP-compatible mode, we operate on
624 * entire underlying packets.
625 */
626static int ecryptfs_commit_write(struct file *file, struct page *page,
627 unsigned from, unsigned to)
628{
629 struct ecryptfs_page_crypt_context ctx;
630 loff_t pos;
631 struct inode *inode;
632 struct inode *lower_inode;
633 struct file *lower_file;
634 struct ecryptfs_crypt_stat *crypt_stat;
635 int rc;
636
637 inode = page->mapping->host;
638 lower_inode = ecryptfs_inode_to_lower(inode);
639 lower_file = ecryptfs_file_to_lower(file);
640 mutex_lock(&lower_inode->i_mutex);
641 crypt_stat =
642 &ecryptfs_inode_to_private(file->f_dentry->d_inode)->crypt_stat;
643 if (ECRYPTFS_CHECK_FLAG(crypt_stat->flags, ECRYPTFS_NEW_FILE)) {
644 ecryptfs_printk(KERN_DEBUG, "ECRYPTFS_NEW_FILE flag set in "
645 "crypt_stat at memory location [%p]\n", crypt_stat);
646 rc = process_new_file(crypt_stat, file, inode);
647 if (rc) {
648 ecryptfs_printk(KERN_ERR, "Error processing new "
649 "file; rc = [%d]\n", rc);
650 goto out;
651 }
652 } else
653 ecryptfs_printk(KERN_DEBUG, "Not a new file\n");
654 ecryptfs_printk(KERN_DEBUG, "Calling fill_zeros_to_end_of_page"
655 "(page w/ index = [0x%.16x], to = [%d])\n", page->index,
656 to);
657 rc = fill_zeros_to_end_of_page(page, to);
658 if (rc) {
659 ecryptfs_printk(KERN_WARNING, "Error attempting to fill "
660 "zeros in page with index = [0x%.16x]\n",
661 page->index);
662 goto out;
663 }
664 ctx.page = page;
665 ctx.mode = ECRYPTFS_PREPARE_COMMIT_MODE;
666 ctx.param.lower_file = lower_file;
667 rc = ecryptfs_encrypt_page(&ctx);
668 if (rc) {
669 ecryptfs_printk(KERN_WARNING, "Error encrypting page (upper "
670 "index [0x%.16x])\n", page->index);
671 goto out;
672 }
673 rc = 0;
674 inode->i_blocks = lower_inode->i_blocks;
675 pos = (page->index << PAGE_CACHE_SHIFT) + to;
676 if (pos > i_size_read(inode)) {
677 i_size_write(inode, pos);
678 ecryptfs_printk(KERN_DEBUG, "Expanded file size to "
679 "[0x%.16x]\n", i_size_read(inode));
680 }
681 ecryptfs_write_inode_size_to_header(lower_file, lower_inode, inode);
682 lower_inode->i_mtime = lower_inode->i_ctime = CURRENT_TIME;
683 mark_inode_dirty_sync(inode);
684out:
685 kunmap(page); /* mapped in prior call (prepare_write) */
686 if (rc < 0)
687 ClearPageUptodate(page);
688 else
689 SetPageUptodate(page);
690 mutex_unlock(&lower_inode->i_mutex);
691 return rc;
692}
693
694/**
695 * write_zeros
696 * @file: The ecryptfs file
697 * @index: The index in which we are writing
698 * @start: The position after the last block of data
699 * @num_zeros: The number of zeros to write
700 *
701 * Write a specified number of zeros to a page.
702 *
703 * (start + num_zeros) must be less than or equal to PAGE_CACHE_SIZE
704 */
705static
706int write_zeros(struct file *file, pgoff_t index, int start, int num_zeros)
707{
708 int rc = 0;
709 struct page *tmp_page;
710
711 tmp_page = ecryptfs_get1page(file, index);
712 if (IS_ERR(tmp_page)) {
713 ecryptfs_printk(KERN_ERR, "Error getting page at index "
714 "[0x%.16x]\n", index);
715 rc = PTR_ERR(tmp_page);
716 goto out;
717 }
718 kmap(tmp_page);
719 rc = ecryptfs_prepare_write(file, tmp_page, start, start + num_zeros);
720 if (rc) {
721 ecryptfs_printk(KERN_ERR, "Error preparing to write zeros "
722 "to remainder of page at index [0x%.16x]\n",
723 index);
724 kunmap(tmp_page);
725 page_cache_release(tmp_page);
726 goto out;
727 }
728 memset(((char *)page_address(tmp_page) + start), 0, num_zeros);
729 rc = ecryptfs_commit_write(file, tmp_page, start, start + num_zeros);
730 if (rc < 0) {
731 ecryptfs_printk(KERN_ERR, "Error attempting to write zeros "
732 "to remainder of page at index [0x%.16x]\n",
733 index);
734 kunmap(tmp_page);
735 page_cache_release(tmp_page);
736 goto out;
737 }
738 rc = 0;
739 kunmap(tmp_page);
740 page_cache_release(tmp_page);
741out:
742 return rc;
743}
744
745static sector_t ecryptfs_bmap(struct address_space *mapping, sector_t block)
746{
747 int rc = 0;
748 struct inode *inode;
749 struct inode *lower_inode;
750
751 inode = (struct inode *)mapping->host;
752 lower_inode = ecryptfs_inode_to_lower(inode);
753 if (lower_inode->i_mapping->a_ops->bmap)
754 rc = lower_inode->i_mapping->a_ops->bmap(lower_inode->i_mapping,
755 block);
756 return rc;
757}
758
759static void ecryptfs_sync_page(struct page *page)
760{
761 struct inode *inode;
762 struct inode *lower_inode;
763 struct page *lower_page;
764
765 inode = page->mapping->host;
766 lower_inode = ecryptfs_inode_to_lower(inode);
767 /* NOTE: find_lock_page() recently replaced grab_cache_page() here,
768 * since sync_page() just makes sure that pending I/O gets done. */
769 lower_page = find_lock_page(lower_inode->i_mapping, page->index);
770 if (!lower_page) {
771 ecryptfs_printk(KERN_DEBUG, "find_lock_page failed\n");
772 return;
773 }
774 lower_page->mapping->a_ops->sync_page(lower_page);
775 ecryptfs_printk(KERN_DEBUG, "Unlocking page with index = [0x%.16x]\n",
776 lower_page->index);
777 unlock_page(lower_page);
778 page_cache_release(lower_page);
779}
780
781struct address_space_operations ecryptfs_aops = {
782 .writepage = ecryptfs_writepage,
783 .readpage = ecryptfs_readpage,
784 .prepare_write = ecryptfs_prepare_write,
785 .commit_write = ecryptfs_commit_write,
786 .bmap = ecryptfs_bmap,
787 .sync_page = ecryptfs_sync_page,
788};
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
new file mode 100644
index 000000000000..c337c0410fb1
--- /dev/null
+++ b/fs/ecryptfs/super.c
@@ -0,0 +1,198 @@
1/**
2 * eCryptfs: Linux filesystem encryption layer
3 *
4 * Copyright (C) 1997-2003 Erez Zadok
5 * Copyright (C) 2001-2003 Stony Brook University
6 * Copyright (C) 2004-2006 International Business Machines Corp.
7 * Author(s): Michael A. Halcrow <mahalcro@us.ibm.com>
8 * Michael C. Thompson <mcthomps@us.ibm.com>
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License as
12 * published by the Free Software Foundation; either version 2 of the
13 * License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful, but
16 * WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
23 * 02111-1307, USA.
24 */
25
26#include <linux/fs.h>
27#include <linux/mount.h>
28#include <linux/key.h>
29#include <linux/seq_file.h>
30#include <linux/crypto.h>
31#include "ecryptfs_kernel.h"
32
33struct kmem_cache *ecryptfs_inode_info_cache;
34
35/**
36 * ecryptfs_alloc_inode - allocate an ecryptfs inode
37 * @sb: Pointer to the ecryptfs super block
38 *
39 * Called to bring an inode into existence.
40 *
41 * Only handles allocation; setting up structures should be done in
42 * ecryptfs_read_inode. This is because the kernel, between now and
43 * then, will zero out the private data pointer.
44 *
45 * Returns a pointer to a newly allocated inode, NULL otherwise
46 */
47static struct inode *ecryptfs_alloc_inode(struct super_block *sb)
48{
49 struct ecryptfs_inode_info *ecryptfs_inode;
50 struct inode *inode = NULL;
51
52 ecryptfs_inode = kmem_cache_alloc(ecryptfs_inode_info_cache,
53 SLAB_KERNEL);
54 if (unlikely(!ecryptfs_inode))
55 goto out;
56 ecryptfs_init_crypt_stat(&ecryptfs_inode->crypt_stat);
57 inode = &ecryptfs_inode->vfs_inode;
58out:
59 return inode;
60}
61
62/**
63 * ecryptfs_destroy_inode
64 * @inode: The ecryptfs inode
65 *
66 * This is used during the final destruction of the inode.
67 * All allocation of memory related to the inode, including allocated
68 * memory in the crypt_stat struct, will be released here.
69 * There should be no chance that this deallocation will be missed.
70 */
71static void ecryptfs_destroy_inode(struct inode *inode)
72{
73 struct ecryptfs_inode_info *inode_info;
74
75 inode_info = ecryptfs_inode_to_private(inode);
76 ecryptfs_destruct_crypt_stat(&inode_info->crypt_stat);
77 kmem_cache_free(ecryptfs_inode_info_cache, inode_info);
78}
79
80/**
81 * ecryptfs_init_inode
82 * @inode: The ecryptfs inode
83 *
84 * Set up the ecryptfs inode.
85 */
86void ecryptfs_init_inode(struct inode *inode, struct inode *lower_inode)
87{
88 ecryptfs_set_inode_lower(inode, lower_inode);
89 inode->i_ino = lower_inode->i_ino;
90 inode->i_version++;
91 inode->i_op = &ecryptfs_main_iops;
92 inode->i_fop = &ecryptfs_main_fops;
93 inode->i_mapping->a_ops = &ecryptfs_aops;
94}
95
96/**
97 * ecryptfs_put_super
98 * @sb: Pointer to the ecryptfs super block
99 *
100 * Final actions when unmounting a file system.
101 * This will handle deallocation and release of our private data.
102 */
103static void ecryptfs_put_super(struct super_block *sb)
104{
105 struct ecryptfs_sb_info *sb_info = ecryptfs_superblock_to_private(sb);
106
107 ecryptfs_destruct_mount_crypt_stat(&sb_info->mount_crypt_stat);
108 kmem_cache_free(ecryptfs_sb_info_cache, sb_info);
109 ecryptfs_set_superblock_private(sb, NULL);
110}
111
112/**
113 * ecryptfs_statfs
114 * @sb: The ecryptfs super block
115 * @buf: The struct kstatfs to fill in with stats
116 *
117 * Get the filesystem statistics. Currently, we let this pass right through
118 * to the lower filesystem and take no action ourselves.
119 */
120static int ecryptfs_statfs(struct dentry *dentry, struct kstatfs *buf)
121{
122 return vfs_statfs(ecryptfs_dentry_to_lower(dentry), buf);
123}
124
125/**
126 * ecryptfs_clear_inode
127 * @inode: The ecryptfs inode
128 *
129 * Called by iput() when the inode reference count reaches zero
130 * and the inode is not hashed anywhere. Used to clear anything
131 * that needs to be, before the inode is completely destroyed and put
132 * on the inode free list. We use this to drop our reference to the
133 * lower inode.
134 */
135static void ecryptfs_clear_inode(struct inode *inode)
136{
137 iput(ecryptfs_inode_to_lower(inode));
138}
139
140/**
141 * ecryptfs_umount_begin
142 *
143 * Called in do_umount().
144 */
145static void ecryptfs_umount_begin(struct vfsmount *vfsmnt, int flags)
146{
147 struct vfsmount *lower_mnt =
148 ecryptfs_dentry_to_lower_mnt(vfsmnt->mnt_sb->s_root);
149 struct super_block *lower_sb;
150
151 mntput(lower_mnt);
152 lower_sb = lower_mnt->mnt_sb;
153 if (lower_sb->s_op->umount_begin)
154 lower_sb->s_op->umount_begin(lower_mnt, flags);
155}
156
157/**
158 * ecryptfs_show_options
159 *
160 * Prints the directory we are currently mounted over.
161 * Returns zero on success; non-zero otherwise
162 */
163static int ecryptfs_show_options(struct seq_file *m, struct vfsmount *mnt)
164{
165 struct super_block *sb = mnt->mnt_sb;
166 struct dentry *lower_root_dentry = ecryptfs_dentry_to_lower(sb->s_root);
167 struct vfsmount *lower_mnt = ecryptfs_dentry_to_lower_mnt(sb->s_root);
168 char *tmp_page;
169 char *path;
170 int rc = 0;
171
172 tmp_page = (char *)__get_free_page(GFP_KERNEL);
173 if (!tmp_page) {
174 rc = -ENOMEM;
175 goto out;
176 }
177 path = d_path(lower_root_dentry, lower_mnt, tmp_page, PAGE_SIZE);
178 if (IS_ERR(path)) {
179 rc = PTR_ERR(path);
180 goto out;
181 }
182 seq_printf(m, ",dir=%s", path);
183 free_page((unsigned long)tmp_page);
184out:
185 return rc;
186}
187
188struct super_operations ecryptfs_sops = {
189 .alloc_inode = ecryptfs_alloc_inode,
190 .destroy_inode = ecryptfs_destroy_inode,
191 .drop_inode = generic_delete_inode,
192 .put_super = ecryptfs_put_super,
193 .statfs = ecryptfs_statfs,
194 .remount_fs = NULL,
195 .clear_inode = ecryptfs_clear_inode,
196 .umount_begin = ecryptfs_umount_begin,
197 .show_options = ecryptfs_show_options
198};
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 557d5b614fae..ae228ec54e94 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -105,6 +105,8 @@
 /* Maximum msec timeout value storeable in a long int */
 #define EP_MAX_MSTIMEO min(1000ULL * MAX_SCHEDULE_TIMEOUT / HZ, (LONG_MAX - 999ULL) / HZ)
 
+#define EP_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event))
+
 
 struct epoll_filefd {
 	struct file *file;
@@ -497,7 +499,7 @@ void eventpoll_release_file(struct file *file)
  */
 asmlinkage long sys_epoll_create(int size)
 {
-	int error, fd;
+	int error, fd = -1;
 	struct eventpoll *ep;
 	struct inode *inode;
 	struct file *file;
@@ -640,7 +642,6 @@ eexit_1:
 	return error;
 }
 
-#define MAX_EVENTS (INT_MAX / sizeof(struct epoll_event))
 
 /*
  * Implement the event wait interface for the eventpoll file. It is the kernel
@@ -657,7 +658,7 @@ asmlinkage long sys_epoll_wait(int epfd, struct epoll_event __user *events,
 		     current, epfd, events, maxevents, timeout));
 
 	/* The maximum number of event must be greater than zero */
-	if (maxevents <= 0 || maxevents > MAX_EVENTS)
+	if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
 		return -EINVAL;
 
 	/* Verify that the area passed by the user is writeable */
@@ -699,6 +700,55 @@ eexit_1:
 }
 
 
+#ifdef TIF_RESTORE_SIGMASK
+
+/*
+ * Implement the event wait interface for the eventpoll file. It is the kernel
+ * part of the user space epoll_pwait(2).
+ */
+asmlinkage long sys_epoll_pwait(int epfd, struct epoll_event __user *events,
+		int maxevents, int timeout, const sigset_t __user *sigmask,
+		size_t sigsetsize)
+{
+	int error;
+	sigset_t ksigmask, sigsaved;
+
+	/*
+	 * If the caller wants a certain signal mask to be set during the wait,
+	 * we apply it here.
+	 */
+	if (sigmask) {
+		if (sigsetsize != sizeof(sigset_t))
+			return -EINVAL;
+		if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask)))
+			return -EFAULT;
+		sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP));
+		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
+	}
+
+	error = sys_epoll_wait(epfd, events, maxevents, timeout);
+
+	/*
+	 * If we changed the signal mask, we need to restore the original one.
+	 * In case we've got a signal while waiting, we do not restore the
+	 * signal mask yet, and we allow do_signal() to deliver the signal on
+	 * the way back to userspace, before the signal mask is restored.
+	 */
+	if (sigmask) {
+		if (error == -EINTR) {
+			memcpy(&current->saved_sigmask, &sigsaved,
+				sizeof(sigsaved));
+			set_thread_flag(TIF_RESTORE_SIGMASK);
+		} else
+			sigprocmask(SIG_SETMASK, &sigsaved, NULL);
+	}
+
+	return error;
+}
+
+#endif /* #ifdef TIF_RESTORE_SIGMASK */
+
+
 /*
  * Creates the file descriptor to be used by the epoll interface.
  */
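For context, the added syscall backs the userspace epoll_pwait(2) interface; a minimal usage sketch follows (epfd is assumed to come from a successful epoll_create() call):

        #include <sys/epoll.h>
        #include <signal.h>

        struct epoll_event events[16];
        sigset_t mask;

        sigemptyset(&mask);
        sigaddset(&mask, SIGINT);
        /* Block SIGINT atomically for the duration of the wait only */
        int n = epoll_pwait(epfd, events, 16, -1 /* no timeout */, &mask);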
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 513cd421ac0b..d8b9abd95d07 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -364,7 +364,6 @@ static int parse_options (char * options,
 {
 	char * p;
 	substring_t args[MAX_OPT_ARGS];
-	unsigned long kind = EXT2_MOUNT_ERRORS_CONT;
 	int option;
 
 	if (!options)
@@ -404,13 +403,19 @@ static int parse_options (char * options,
 			/* *sb_block = match_int(&args[0]); */
 			break;
 		case Opt_err_panic:
-			kind = EXT2_MOUNT_ERRORS_PANIC;
+			clear_opt (sbi->s_mount_opt, ERRORS_CONT);
+			clear_opt (sbi->s_mount_opt, ERRORS_RO);
+			set_opt (sbi->s_mount_opt, ERRORS_PANIC);
 			break;
 		case Opt_err_ro:
-			kind = EXT2_MOUNT_ERRORS_RO;
+			clear_opt (sbi->s_mount_opt, ERRORS_CONT);
+			clear_opt (sbi->s_mount_opt, ERRORS_PANIC);
+			set_opt (sbi->s_mount_opt, ERRORS_RO);
 			break;
 		case Opt_err_cont:
-			kind = EXT2_MOUNT_ERRORS_CONT;
+			clear_opt (sbi->s_mount_opt, ERRORS_RO);
+			clear_opt (sbi->s_mount_opt, ERRORS_PANIC);
+			set_opt (sbi->s_mount_opt, ERRORS_CONT);
 			break;
 		case Opt_nouid32:
 			set_opt (sbi->s_mount_opt, NO_UID32);
@@ -489,7 +494,6 @@ static int parse_options (char * options,
 			return 0;
 		}
 	}
-	sbi->s_mount_opt |= kind;
 	return 1;
 }
 
@@ -715,6 +719,8 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
 		set_opt(sbi->s_mount_opt, ERRORS_PANIC);
 	else if (le16_to_cpu(sbi->s_es->s_errors) == EXT2_ERRORS_RO)
 		set_opt(sbi->s_mount_opt, ERRORS_RO);
+	else
+		set_opt(sbi->s_mount_opt, ERRORS_CONT);
 
 	sbi->s_resuid = le16_to_cpu(es->s_def_resuid);
 	sbi->s_resgid = le16_to_cpu(es->s_def_resgid);
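The effect of this change is that the three error-behavior mount options now explicitly clear one another, so the last one given wins and the flags can never be set simultaneously. With ext2's usual option names (errors=continue, errors=remount-ro and errors=panic; assumed here, since the match table is outside this hunk), a mount such as the following ends up with only ERRORS_PANIC set:

        mount -t ext2 -o errors=continue,errors=panic /dev/sdb1 /mnt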
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 8bfd56ef18ca..afc2d4f42d77 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -1470,6 +1470,8 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 		set_opt(sbi->s_mount_opt, ERRORS_PANIC);
 	else if (le16_to_cpu(sbi->s_es->s_errors) == EXT3_ERRORS_RO)
 		set_opt(sbi->s_mount_opt, ERRORS_RO);
+	else
+		set_opt(sbi->s_mount_opt, ERRORS_CONT);
 
 	sbi->s_resuid = le16_to_cpu(es->s_def_resuid);
 	sbi->s_resgid = le16_to_cpu(es->s_def_resgid);
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
new file mode 100644
index 000000000000..a6acb96ebeb9
--- /dev/null
+++ b/fs/ext4/Makefile
@@ -0,0 +1,12 @@
1#
2# Makefile for the linux ext4-filesystem routines.
3#
4
5obj-$(CONFIG_EXT4DEV_FS) += ext4dev.o
6
7ext4dev-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
8 ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o
9
10ext4dev-$(CONFIG_EXT4DEV_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
11ext4dev-$(CONFIG_EXT4DEV_FS_POSIX_ACL) += acl.o
12ext4dev-$(CONFIG_EXT4DEV_FS_SECURITY) += xattr_security.o
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
new file mode 100644
index 000000000000..9e882546d91a
--- /dev/null
+++ b/fs/ext4/acl.c
@@ -0,0 +1,551 @@
1/*
2 * linux/fs/ext4/acl.c
3 *
4 * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
5 */
6
7#include <linux/init.h>
8#include <linux/sched.h>
9#include <linux/slab.h>
10#include <linux/capability.h>
11#include <linux/fs.h>
12#include <linux/ext4_jbd2.h>
13#include <linux/ext4_fs.h>
14#include "xattr.h"
15#include "acl.h"
16
17/*
18 * Convert from filesystem to in-memory representation.
19 */
20static struct posix_acl *
21ext4_acl_from_disk(const void *value, size_t size)
22{
23 const char *end = (char *)value + size;
24 int n, count;
25 struct posix_acl *acl;
26
27 if (!value)
28 return NULL;
29 if (size < sizeof(ext4_acl_header))
30 return ERR_PTR(-EINVAL);
31 if (((ext4_acl_header *)value)->a_version !=
32 cpu_to_le32(EXT4_ACL_VERSION))
33 return ERR_PTR(-EINVAL);
34 value = (char *)value + sizeof(ext4_acl_header);
35 count = ext4_acl_count(size);
36 if (count < 0)
37 return ERR_PTR(-EINVAL);
38 if (count == 0)
39 return NULL;
40 acl = posix_acl_alloc(count, GFP_KERNEL);
41 if (!acl)
42 return ERR_PTR(-ENOMEM);
43 for (n=0; n < count; n++) {
44 ext4_acl_entry *entry =
45 (ext4_acl_entry *)value;
46 if ((char *)value + sizeof(ext4_acl_entry_short) > end)
47 goto fail;
48 acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag);
49 acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm);
50 switch(acl->a_entries[n].e_tag) {
51 case ACL_USER_OBJ:
52 case ACL_GROUP_OBJ:
53 case ACL_MASK:
54 case ACL_OTHER:
55 value = (char *)value +
56 sizeof(ext4_acl_entry_short);
57 acl->a_entries[n].e_id = ACL_UNDEFINED_ID;
58 break;
59
60 case ACL_USER:
61 case ACL_GROUP:
62 value = (char *)value + sizeof(ext4_acl_entry);
63 if ((char *)value > end)
64 goto fail;
65 acl->a_entries[n].e_id =
66 le32_to_cpu(entry->e_id);
67 break;
68
69 default:
70 goto fail;
71 }
72 }
73 if (value != end)
74 goto fail;
75 return acl;
76
77fail:
78 posix_acl_release(acl);
79 return ERR_PTR(-EINVAL);
80}
81
82/*
83 * Convert from in-memory to filesystem representation.
84 */
85static void *
86ext4_acl_to_disk(const struct posix_acl *acl, size_t *size)
87{
88 ext4_acl_header *ext_acl;
89 char *e;
90 size_t n;
91
92 *size = ext4_acl_size(acl->a_count);
93 ext_acl = kmalloc(sizeof(ext4_acl_header) + acl->a_count *
94 sizeof(ext4_acl_entry), GFP_KERNEL);
95 if (!ext_acl)
96 return ERR_PTR(-ENOMEM);
97 ext_acl->a_version = cpu_to_le32(EXT4_ACL_VERSION);
98 e = (char *)ext_acl + sizeof(ext4_acl_header);
99 for (n=0; n < acl->a_count; n++) {
100 ext4_acl_entry *entry = (ext4_acl_entry *)e;
101 entry->e_tag = cpu_to_le16(acl->a_entries[n].e_tag);
102 entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm);
103 switch(acl->a_entries[n].e_tag) {
104 case ACL_USER:
105 case ACL_GROUP:
106 entry->e_id =
107 cpu_to_le32(acl->a_entries[n].e_id);
108 e += sizeof(ext4_acl_entry);
109 break;
110
111 case ACL_USER_OBJ:
112 case ACL_GROUP_OBJ:
113 case ACL_MASK:
114 case ACL_OTHER:
115 e += sizeof(ext4_acl_entry_short);
116 break;
117
118 default:
119 goto fail;
120 }
121 }
122 return (char *)ext_acl;
123
124fail:
125 kfree(ext_acl);
126 return ERR_PTR(-EINVAL);
127}
128
129static inline struct posix_acl *
130ext4_iget_acl(struct inode *inode, struct posix_acl **i_acl)
131{
132 struct posix_acl *acl = EXT4_ACL_NOT_CACHED;
133
134 spin_lock(&inode->i_lock);
135 if (*i_acl != EXT4_ACL_NOT_CACHED)
136 acl = posix_acl_dup(*i_acl);
137 spin_unlock(&inode->i_lock);
138
139 return acl;
140}
141
142static inline void
143ext4_iset_acl(struct inode *inode, struct posix_acl **i_acl,
144 struct posix_acl *acl)
145{
146 spin_lock(&inode->i_lock);
147 if (*i_acl != EXT4_ACL_NOT_CACHED)
148 posix_acl_release(*i_acl);
149 *i_acl = posix_acl_dup(acl);
150 spin_unlock(&inode->i_lock);
151}
152
153/*
154 * Inode operation get_posix_acl().
155 *
156 * inode->i_mutex: don't care
157 */
158static struct posix_acl *
159ext4_get_acl(struct inode *inode, int type)
160{
161 struct ext4_inode_info *ei = EXT4_I(inode);
162 int name_index;
163 char *value = NULL;
164 struct posix_acl *acl;
165 int retval;
166
167 if (!test_opt(inode->i_sb, POSIX_ACL))
168 return NULL;
169
170 switch(type) {
171 case ACL_TYPE_ACCESS:
172 acl = ext4_iget_acl(inode, &ei->i_acl);
173 if (acl != EXT4_ACL_NOT_CACHED)
174 return acl;
175 name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS;
176 break;
177
178 case ACL_TYPE_DEFAULT:
179 acl = ext4_iget_acl(inode, &ei->i_default_acl);
180 if (acl != EXT4_ACL_NOT_CACHED)
181 return acl;
182 name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT;
183 break;
184
185 default:
186 return ERR_PTR(-EINVAL);
187 }
188 retval = ext4_xattr_get(inode, name_index, "", NULL, 0);
189 if (retval > 0) {
190 value = kmalloc(retval, GFP_KERNEL);
191 if (!value)
192 return ERR_PTR(-ENOMEM);
193 retval = ext4_xattr_get(inode, name_index, "", value, retval);
194 }
195 if (retval > 0)
196 acl = ext4_acl_from_disk(value, retval);
197 else if (retval == -ENODATA || retval == -ENOSYS)
198 acl = NULL;
199 else
200 acl = ERR_PTR(retval);
201 kfree(value);
202
203 if (!IS_ERR(acl)) {
204 switch(type) {
205 case ACL_TYPE_ACCESS:
206 ext4_iset_acl(inode, &ei->i_acl, acl);
207 break;
208
209 case ACL_TYPE_DEFAULT:
210 ext4_iset_acl(inode, &ei->i_default_acl, acl);
211 break;
212 }
213 }
214 return acl;
215}
216
217/*
218 * Set the access or default ACL of an inode.
219 *
220 * inode->i_mutex: down unless called from ext4_new_inode
221 */
222static int
223ext4_set_acl(handle_t *handle, struct inode *inode, int type,
224 struct posix_acl *acl)
225{
226 struct ext4_inode_info *ei = EXT4_I(inode);
227 int name_index;
228 void *value = NULL;
229 size_t size = 0;
230 int error;
231
232 if (S_ISLNK(inode->i_mode))
233 return -EOPNOTSUPP;
234
235 switch(type) {
236 case ACL_TYPE_ACCESS:
237 name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS;
238 if (acl) {
239 mode_t mode = inode->i_mode;
240 error = posix_acl_equiv_mode(acl, &mode);
241 if (error < 0)
242 return error;
243 else {
244 inode->i_mode = mode;
245 ext4_mark_inode_dirty(handle, inode);
246 if (error == 0)
247 acl = NULL;
248 }
249 }
250 break;
251
252 case ACL_TYPE_DEFAULT:
253 name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT;
254 if (!S_ISDIR(inode->i_mode))
255 return acl ? -EACCES : 0;
256 break;
257
258 default:
259 return -EINVAL;
260 }
261 if (acl) {
262 value = ext4_acl_to_disk(acl, &size);
263 if (IS_ERR(value))
264 return (int)PTR_ERR(value);
265 }
266
267 error = ext4_xattr_set_handle(handle, inode, name_index, "",
268 value, size, 0);
269
270 kfree(value);
271 if (!error) {
272 switch(type) {
273 case ACL_TYPE_ACCESS:
274 ext4_iset_acl(inode, &ei->i_acl, acl);
275 break;
276
277 case ACL_TYPE_DEFAULT:
278 ext4_iset_acl(inode, &ei->i_default_acl, acl);
279 break;
280 }
281 }
282 return error;
283}
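/* Editor's sketch, not part of the patch: the posix_acl_equiv_mode()
 * contract relied on above -- negative errno on error, 0 when the ACL is
 * fully representable in the mode bits (so the xattr can be dropped),
 * positive when extended entries remain and the xattr must be kept. */
#include <linux/posix_acl.h>

static int equiv_mode_example(void)
{
	mode_t mode = 0;
	struct posix_acl *acl = posix_acl_from_mode(0644, GFP_KERNEL);
	int rc;

	if (!acl)
		return -ENOMEM;
	/* an ACL synthesized from 0644 has no named entries... */
	rc = posix_acl_equiv_mode(acl, &mode);	/* ...so rc == 0, mode == 0644 */
	posix_acl_release(acl);
	return rc;
}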
284
285static int
286ext4_check_acl(struct inode *inode, int mask)
287{
288 struct posix_acl *acl = ext4_get_acl(inode, ACL_TYPE_ACCESS);
289
290 if (IS_ERR(acl))
291 return PTR_ERR(acl);
292 if (acl) {
293 int error = posix_acl_permission(inode, acl, mask);
294 posix_acl_release(acl);
295 return error;
296 }
297
298 return -EAGAIN;
299}
300
301int
302ext4_permission(struct inode *inode, int mask, struct nameidata *nd)
303{
304 return generic_permission(inode, mask, ext4_check_acl);
305}
306
307/*
308 * Initialize the ACLs of a new inode. Called from ext4_new_inode.
309 *
310 * dir->i_mutex: down
311 * inode->i_mutex: up (access to inode is still exclusive)
312 */
313int
314ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
315{
316 struct posix_acl *acl = NULL;
317 int error = 0;
318
319 if (!S_ISLNK(inode->i_mode)) {
320 if (test_opt(dir->i_sb, POSIX_ACL)) {
321 acl = ext4_get_acl(dir, ACL_TYPE_DEFAULT);
322 if (IS_ERR(acl))
323 return PTR_ERR(acl);
324 }
325 if (!acl)
326 inode->i_mode &= ~current->fs->umask;
327 }
328 if (test_opt(inode->i_sb, POSIX_ACL) && acl) {
329 struct posix_acl *clone;
330 mode_t mode;
331
332 if (S_ISDIR(inode->i_mode)) {
333 error = ext4_set_acl(handle, inode,
334 ACL_TYPE_DEFAULT, acl);
335 if (error)
336 goto cleanup;
337 }
338 clone = posix_acl_clone(acl, GFP_KERNEL);
339 error = -ENOMEM;
340 if (!clone)
341 goto cleanup;
342
343 mode = inode->i_mode;
344 error = posix_acl_create_masq(clone, &mode);
345 if (error >= 0) {
346 inode->i_mode = mode;
347 if (error > 0) {
348 /* This is an extended ACL */
349 error = ext4_set_acl(handle, inode,
350 ACL_TYPE_ACCESS, clone);
351 }
352 }
353 posix_acl_release(clone);
354 }
355cleanup:
356 posix_acl_release(acl);
357 return error;
358}
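/* Editor's note, not part of the patch: this function is why a default
 * ACL on the parent directory overrides the process umask. With umask
 * 022 and no default ACL, open(..., O_CREAT, 0666) yields 0644 through
 * the "inode->i_mode &= ~current->fs->umask" path above; if the parent
 * does carry a default ACL, posix_acl_create_masq() masks the mode
 * against that ACL instead and the umask is never applied, as POSIX
 * ACL semantics require. */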
359
360/*
361 * Does chmod for an inode that may have an Access Control List. The
362 * inode->i_mode field must be updated to the desired value by the caller
363 * before calling this function.
364 * Returns 0 on success, or a negative error number.
365 *
366 * We change the ACL rather than storing some ACL entries in the file
367 * mode permission bits (which would be more efficient), because that
368 * would break once additional permissions (like ACL_APPEND, ACL_DELETE
369 * for directories) are added. There are no more bits available in the
370 * file mode.
371 *
372 * inode->i_mutex: down
373 */
374int
375ext4_acl_chmod(struct inode *inode)
376{
377 struct posix_acl *acl, *clone;
378 int error;
379
380 if (S_ISLNK(inode->i_mode))
381 return -EOPNOTSUPP;
382 if (!test_opt(inode->i_sb, POSIX_ACL))
383 return 0;
384 acl = ext4_get_acl(inode, ACL_TYPE_ACCESS);
385 if (IS_ERR(acl) || !acl)
386 return PTR_ERR(acl);
387 clone = posix_acl_clone(acl, GFP_KERNEL);
388 posix_acl_release(acl);
389 if (!clone)
390 return -ENOMEM;
391 error = posix_acl_chmod_masq(clone, inode->i_mode);
392 if (!error) {
393 handle_t *handle;
394 int retries = 0;
395
396 retry:
397 handle = ext4_journal_start(inode,
398 EXT4_DATA_TRANS_BLOCKS(inode->i_sb));
399 if (IS_ERR(handle)) {
400 error = PTR_ERR(handle);
401 ext4_std_error(inode->i_sb, error);
402 goto out;
403 }
404 error = ext4_set_acl(handle, inode, ACL_TYPE_ACCESS, clone);
405 ext4_journal_stop(handle);
406 if (error == -ENOSPC &&
407 ext4_should_retry_alloc(inode->i_sb, &retries))
408 goto retry;
409 }
410out:
411 posix_acl_release(clone);
412 return error;
413}
414
415/*
416 * Extended attribute handlers
417 */
418static size_t
419ext4_xattr_list_acl_access(struct inode *inode, char *list, size_t list_len,
420 const char *name, size_t name_len)
421{
422 const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS);
423
424 if (!test_opt(inode->i_sb, POSIX_ACL))
425 return 0;
426 if (list && size <= list_len)
427 memcpy(list, POSIX_ACL_XATTR_ACCESS, size);
428 return size;
429}
430
431static size_t
432ext4_xattr_list_acl_default(struct inode *inode, char *list, size_t list_len,
433 const char *name, size_t name_len)
434{
435 const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT);
436
437 if (!test_opt(inode->i_sb, POSIX_ACL))
438 return 0;
439 if (list && size <= list_len)
440 memcpy(list, POSIX_ACL_XATTR_DEFAULT, size);
441 return size;
442}
443
444static int
445ext4_xattr_get_acl(struct inode *inode, int type, void *buffer, size_t size)
446{
447 struct posix_acl *acl;
448 int error;
449
450 if (!test_opt(inode->i_sb, POSIX_ACL))
451 return -EOPNOTSUPP;
452
453 acl = ext4_get_acl(inode, type);
454 if (IS_ERR(acl))
455 return PTR_ERR(acl);
456 if (acl == NULL)
457 return -ENODATA;
458 error = posix_acl_to_xattr(acl, buffer, size);
459 posix_acl_release(acl);
460
461 return error;
462}
463
464static int
465ext4_xattr_get_acl_access(struct inode *inode, const char *name,
466 void *buffer, size_t size)
467{
468 if (strcmp(name, "") != 0)
469 return -EINVAL;
470 return ext4_xattr_get_acl(inode, ACL_TYPE_ACCESS, buffer, size);
471}
472
473static int
474ext4_xattr_get_acl_default(struct inode *inode, const char *name,
475 void *buffer, size_t size)
476{
477 if (strcmp(name, "") != 0)
478 return -EINVAL;
479 return ext4_xattr_get_acl(inode, ACL_TYPE_DEFAULT, buffer, size);
480}
481
482static int
483ext4_xattr_set_acl(struct inode *inode, int type, const void *value,
484 size_t size)
485{
486 handle_t *handle;
487 struct posix_acl *acl;
488 int error, retries = 0;
489
490 if (!test_opt(inode->i_sb, POSIX_ACL))
491 return -EOPNOTSUPP;
492 if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
493 return -EPERM;
494
495 if (value) {
496 acl = posix_acl_from_xattr(value, size);
497 if (IS_ERR(acl))
498 return PTR_ERR(acl);
499 else if (acl) {
500 error = posix_acl_valid(acl);
501 if (error)
502 goto release_and_out;
503 }
504 } else
505 acl = NULL;
506
507retry:
508 handle = ext4_journal_start(inode, EXT4_DATA_TRANS_BLOCKS(inode->i_sb));
509 if (IS_ERR(handle))
510 return PTR_ERR(handle);
511 error = ext4_set_acl(handle, inode, type, acl);
512 ext4_journal_stop(handle);
513 if (error == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
514 goto retry;
515
516release_and_out:
517 posix_acl_release(acl);
518 return error;
519}
520
521static int
522ext4_xattr_set_acl_access(struct inode *inode, const char *name,
523 const void *value, size_t size, int flags)
524{
525 if (strcmp(name, "") != 0)
526 return -EINVAL;
527 return ext4_xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size);
528}
529
530static int
531ext4_xattr_set_acl_default(struct inode *inode, const char *name,
532 const void *value, size_t size, int flags)
533{
534 if (strcmp(name, "") != 0)
535 return -EINVAL;
536 return ext4_xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size);
537}
538
539struct xattr_handler ext4_xattr_acl_access_handler = {
540 .prefix = POSIX_ACL_XATTR_ACCESS,
541 .list = ext4_xattr_list_acl_access,
542 .get = ext4_xattr_get_acl_access,
543 .set = ext4_xattr_set_acl_access,
544};
545
546struct xattr_handler ext4_xattr_acl_default_handler = {
547 .prefix = POSIX_ACL_XATTR_DEFAULT,
548 .list = ext4_xattr_list_acl_default,
549 .get = ext4_xattr_get_acl_default,
550 .set = ext4_xattr_set_acl_default,
551};
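/* Editor's aside, not part of the patch: a userspace sketch (assuming
 * libacl) of what feeds these handlers. The named g:staff entry makes
 * the ACL "extended", so it is stored through the
 * system.posix_acl_access handler above rather than in i_mode alone. */
#include <sys/types.h>
#include <sys/acl.h>

static int set_example_acl(const char *path)
{
	acl_t acl = acl_from_text("u::rwx,g::r-x,o::---,g:staff:r-x,m::r-x");
	int rc = -1;

	if (acl) {
		rc = acl_set_file(path, ACL_TYPE_ACCESS, acl);
		acl_free(acl);
	}
	return rc;
}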
diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h
new file mode 100644
index 000000000000..26a5c1abf147
--- /dev/null
+++ b/fs/ext4/acl.h
@@ -0,0 +1,81 @@
1/*
2 File: fs/ext4/acl.h
3
4 (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org>
5*/
6
7#include <linux/posix_acl_xattr.h>
8
9#define EXT4_ACL_VERSION 0x0001
10
11typedef struct {
12 __le16 e_tag;
13 __le16 e_perm;
14 __le32 e_id;
15} ext4_acl_entry;
16
17typedef struct {
18 __le16 e_tag;
19 __le16 e_perm;
20} ext4_acl_entry_short;
21
22typedef struct {
23 __le32 a_version;
24} ext4_acl_header;
25
26static inline size_t ext4_acl_size(int count)
27{
28 if (count <= 4) {
29 return sizeof(ext4_acl_header) +
30 count * sizeof(ext4_acl_entry_short);
31 } else {
32 return sizeof(ext4_acl_header) +
33 4 * sizeof(ext4_acl_entry_short) +
34 (count - 4) * sizeof(ext4_acl_entry);
35 }
36}
37
38static inline int ext4_acl_count(size_t size)
39{
40 ssize_t s;
41 size -= sizeof(ext4_acl_header);
42 s = size - 4 * sizeof(ext4_acl_entry_short);
43 if (s < 0) {
44 if (size % sizeof(ext4_acl_entry_short))
45 return -1;
46 return size / sizeof(ext4_acl_entry_short);
47 } else {
48 if (s % sizeof(ext4_acl_entry))
49 return -1;
50 return s / sizeof(ext4_acl_entry) + 4;
51 }
52}
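/* Editor's worked example, not part of the patch (compiled as plain C
 * against the definitions above): the header is 4 bytes, a short entry
 * 4 bytes and a full entry 8 bytes; the first four entries are stored
 * short, the rest full. */
#include <assert.h>

static void acl_size_example(void)
{
	assert(ext4_acl_size(6) == 4 + 4 * 4 + 2 * 8);	/* 36 bytes */
	assert(ext4_acl_count(36) == 6);		/* and back again */
}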
53
54#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
55
56/* Value for inode->u.ext4_i.i_acl and inode->u.ext4_i.i_default_acl
57 if the ACL has not been cached */
58#define EXT4_ACL_NOT_CACHED ((void *)-1)
59
60/* acl.c */
61extern int ext4_permission (struct inode *, int, struct nameidata *);
62extern int ext4_acl_chmod (struct inode *);
63extern int ext4_init_acl (handle_t *, struct inode *, struct inode *);
64
65#else /* CONFIG_EXT4DEV_FS_POSIX_ACL */
66#include <linux/sched.h>
67#define ext4_permission NULL
68
69static inline int
70ext4_acl_chmod(struct inode *inode)
71{
72 return 0;
73}
74
75static inline int
76ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
77{
78 return 0;
79}
80#endif /* CONFIG_EXT4DEV_FS_POSIX_ACL */
81
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
new file mode 100644
index 000000000000..5d45582f9517
--- /dev/null
+++ b/fs/ext4/balloc.c
@@ -0,0 +1,1833 @@
1/*
2 * linux/fs/ext4/balloc.c
3 *
4 * Copyright (C) 1992, 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI)
8 *
9 * Enhanced block allocation by Stephen Tweedie (sct@redhat.com), 1993
10 * Big-endian to little-endian byte-swapping/bitmaps by
11 * David S. Miller (davem@caip.rutgers.edu), 1995
12 */
13
14#include <linux/time.h>
15#include <linux/capability.h>
16#include <linux/fs.h>
17#include <linux/jbd2.h>
18#include <linux/ext4_fs.h>
19#include <linux/ext4_jbd2.h>
20#include <linux/quotaops.h>
21#include <linux/buffer_head.h>
22
23/*
24 * balloc.c contains the blocks allocation and deallocation routines
25 */
26
27/*
28 * Calculate the block group number and offset, given a block number
29 */
30void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
31 unsigned long *blockgrpp, ext4_grpblk_t *offsetp)
32{
33 struct ext4_super_block *es = EXT4_SB(sb)->s_es;
34 ext4_grpblk_t offset;
35
36 blocknr = blocknr - le32_to_cpu(es->s_first_data_block);
37 offset = do_div(blocknr, EXT4_BLOCKS_PER_GROUP(sb));
38 if (offsetp)
39 *offsetp = offset;
40 if (blockgrpp)
41 *blockgrpp = blocknr;
42
43}
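/* Editor's worked example with hypothetical numbers, not part of the
 * patch: with s_first_data_block == 1 and 8192 blocks per group, block
 * 20000 lands in group 2 at offset 3615. */
static void group_no_example(void)
{
	unsigned long long blocknr = 20000 - 1;		/* minus s_first_data_block */
	unsigned int offset = do_div(blocknr, 8192);	/* do_div() leaves the quotient behind */

	/* now blocknr == 2 (the group) and offset == 3615 */
	(void)blocknr; (void)offset;
}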
44
45/*
46 * The free blocks are managed by bitmaps. A file system contains several
 47 * block groups. Each group contains 1 bitmap block for blocks, 1 bitmap
48 * block for inodes, N blocks for the inode table and data blocks.
49 *
50 * The file system contains group descriptors which are located after the
51 * super block. Each descriptor contains the number of the bitmap block and
52 * the free blocks count in the block. The descriptors are loaded in memory
53 * when a file system is mounted (see ext4_read_super).
54 */
55
56
57#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
58
59/**
60 * ext4_get_group_desc() -- load group descriptor from disk
61 * @sb: super block
62 * @block_group: given block group
63 * @bh: pointer to the buffer head to store the block
64 * group descriptor
65 */
66struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
67 unsigned int block_group,
68 struct buffer_head ** bh)
69{
70 unsigned long group_desc;
71 unsigned long offset;
72 struct ext4_group_desc * desc;
73 struct ext4_sb_info *sbi = EXT4_SB(sb);
74
75 if (block_group >= sbi->s_groups_count) {
76 ext4_error (sb, "ext4_get_group_desc",
77 "block_group >= groups_count - "
78 "block_group = %d, groups_count = %lu",
79 block_group, sbi->s_groups_count);
80
81 return NULL;
82 }
83 smp_rmb();
84
85 group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb);
86 offset = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1);
87 if (!sbi->s_group_desc[group_desc]) {
88 ext4_error (sb, "ext4_get_group_desc",
89 "Group descriptor not loaded - "
90 "block_group = %d, group_desc = %lu, desc = %lu",
91 block_group, group_desc, offset);
92 return NULL;
93 }
94
95 desc = (struct ext4_group_desc *)(
96 (__u8 *)sbi->s_group_desc[group_desc]->b_data +
97 offset * EXT4_DESC_SIZE(sb));
98 if (bh)
99 *bh = sbi->s_group_desc[group_desc];
100 return desc;
101}
102
103/**
104 * read_block_bitmap()
105 * @sb: super block
106 * @block_group: given block group
107 *
108 * Read the bitmap for a given block_group, reading into the specified
109 * slot in the superblock's bitmap cache.
110 *
111 * Return buffer_head on success or NULL in case of failure.
112 */
113static struct buffer_head *
114read_block_bitmap(struct super_block *sb, unsigned int block_group)
115{
116 struct ext4_group_desc * desc;
117 struct buffer_head * bh = NULL;
118
119 desc = ext4_get_group_desc (sb, block_group, NULL);
120 if (!desc)
121 goto error_out;
122 bh = sb_bread(sb, ext4_block_bitmap(sb, desc));
123 if (!bh)
124 ext4_error (sb, "read_block_bitmap",
125 "Cannot read block bitmap - "
126 "block_group = %d, block_bitmap = %llu",
127 block_group,
128 ext4_block_bitmap(sb, desc));
129error_out:
130 return bh;
131}
132/*
133 * The reservation window structure operations
134 * --------------------------------------------
135 * Operations include:
136 * dump, find, add, remove, is_empty, find_next_reservable_window, etc.
137 *
138 * We use a red-black tree to represent per-filesystem reservation
139 * windows.
140 *
141 */
142
143/**
144 * __rsv_window_dump() -- Dump the filesystem block allocation reservation map
145 * @rb_root: root of per-filesystem reservation rb tree
146 * @verbose: verbose mode
147 * @fn: function which wishes to dump the reservation map
148 *
149 * If verbose is turned on, it will print the whole block reservation
150 * windows(start, end). Otherwise, it will only print out the "bad" windows,
151 * those windows that overlap with their immediate neighbors.
152 */
153#if 1
154static void __rsv_window_dump(struct rb_root *root, int verbose,
155 const char *fn)
156{
157 struct rb_node *n;
158 struct ext4_reserve_window_node *rsv, *prev;
159 int bad;
160
161restart:
162 n = rb_first(root);
163 bad = 0;
164 prev = NULL;
165
166 printk("Block Allocation Reservation Windows Map (%s):\n", fn);
167 while (n) {
168 rsv = list_entry(n, struct ext4_reserve_window_node, rsv_node);
169 if (verbose)
170 printk("reservation window 0x%p "
171 "start: %llu, end: %llu\n",
172 rsv, rsv->rsv_start, rsv->rsv_end);
173 if (rsv->rsv_start && rsv->rsv_start >= rsv->rsv_end) {
174 printk("Bad reservation %p (start >= end)\n",
175 rsv);
176 bad = 1;
177 }
178 if (prev && prev->rsv_end >= rsv->rsv_start) {
179 printk("Bad reservation %p (prev->end >= start)\n",
180 rsv);
181 bad = 1;
182 }
183 if (bad) {
184 if (!verbose) {
185 printk("Restarting reservation walk in verbose mode\n");
186 verbose = 1;
187 goto restart;
188 }
189 }
190 n = rb_next(n);
191 prev = rsv;
192 }
193 printk("Window map complete.\n");
194 if (bad)
195 BUG();
196}
197#define rsv_window_dump(root, verbose) \
198 __rsv_window_dump((root), (verbose), __FUNCTION__)
199#else
200#define rsv_window_dump(root, verbose) do {} while (0)
201#endif
202
203/**
204 * goal_in_my_reservation()
205 * @rsv: inode's reservation window
206 * @grp_goal: given goal block relative to the allocation block group
207 * @group: the current allocation block group
208 * @sb: filesystem super block
209 *
210 * Test if the given goal block (group relative) is within the file's
211 * own block reservation window range.
212 *
213 * If the reservation window is outside the goal allocation group, return 0;
214 * grp_goal (given goal block) could be -1, which means no specific
215 * goal block. In this case, always return 1.
216 * If the goal block is within the reservation window, return 1;
217 * otherwise, return 0;
218 */
219static int
220goal_in_my_reservation(struct ext4_reserve_window *rsv, ext4_grpblk_t grp_goal,
221 unsigned int group, struct super_block * sb)
222{
223 ext4_fsblk_t group_first_block, group_last_block;
224
225 group_first_block = ext4_group_first_block_no(sb, group);
226 group_last_block = group_first_block + (EXT4_BLOCKS_PER_GROUP(sb) - 1);
227
228 if ((rsv->_rsv_start > group_last_block) ||
229 (rsv->_rsv_end < group_first_block))
230 return 0;
231 if ((grp_goal >= 0) && ((grp_goal + group_first_block < rsv->_rsv_start)
232 || (grp_goal + group_first_block > rsv->_rsv_end)))
233 return 0;
234 return 1;
235}
236
237/**
238 * search_reserve_window()
239 * @rb_root: root of reservation tree
240 * @goal: target allocation block
241 *
242 * Find the reserved window which includes the goal, or the previous one
243 * if the goal is not in any window.
244 * Returns NULL if there are no windows or if all windows start after the goal.
245 */
246static struct ext4_reserve_window_node *
247search_reserve_window(struct rb_root *root, ext4_fsblk_t goal)
248{
249 struct rb_node *n = root->rb_node;
250 struct ext4_reserve_window_node *rsv;
251
252 if (!n)
253 return NULL;
254
255 do {
256 rsv = rb_entry(n, struct ext4_reserve_window_node, rsv_node);
257
258 if (goal < rsv->rsv_start)
259 n = n->rb_left;
260 else if (goal > rsv->rsv_end)
261 n = n->rb_right;
262 else
263 return rsv;
264 } while (n);
265 /*
266 * We've fallen off the end of the tree: the goal wasn't inside
267 * any particular node. OK, the previous node must be to one
268 * side of the interval containing the goal. If it's the RHS,
269 * we need to back up one.
270 */
271 if (rsv->rsv_start > goal) {
272 n = rb_prev(&rsv->rsv_node);
273 rsv = rb_entry(n, struct ext4_reserve_window_node, rsv_node);
274 }
275 return rsv;
276}
277
278/**
279 * ext4_rsv_window_add() -- Insert a window to the block reservation rb tree.
280 * @sb: super block
281 * @rsv: reservation window to add
282 *
 283 * Must be called with rsv_lock held.
284 */
285void ext4_rsv_window_add(struct super_block *sb,
286 struct ext4_reserve_window_node *rsv)
287{
288 struct rb_root *root = &EXT4_SB(sb)->s_rsv_window_root;
289 struct rb_node *node = &rsv->rsv_node;
290 ext4_fsblk_t start = rsv->rsv_start;
291
292 struct rb_node ** p = &root->rb_node;
293 struct rb_node * parent = NULL;
294 struct ext4_reserve_window_node *this;
295
296 while (*p)
297 {
298 parent = *p;
299 this = rb_entry(parent, struct ext4_reserve_window_node, rsv_node);
300
301 if (start < this->rsv_start)
302 p = &(*p)->rb_left;
303 else if (start > this->rsv_end)
304 p = &(*p)->rb_right;
305 else {
306 rsv_window_dump(root, 1);
307 BUG();
308 }
309 }
310
311 rb_link_node(node, parent, p);
312 rb_insert_color(node, root);
313}
314
315/**
316 * ext4_rsv_window_remove() -- unlink a window from the reservation rb tree
317 * @sb: super block
318 * @rsv: reservation window to remove
319 *
320 * Mark the block reservation window as not allocated, and unlink it
321 * from the filesystem reservation window rb tree. Must be called with
 322 * rsv_lock held.
323 */
324static void rsv_window_remove(struct super_block *sb,
325 struct ext4_reserve_window_node *rsv)
326{
327 rsv->rsv_start = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
328 rsv->rsv_end = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
329 rsv->rsv_alloc_hit = 0;
330 rb_erase(&rsv->rsv_node, &EXT4_SB(sb)->s_rsv_window_root);
331}
332
333/*
 334 * rsv_is_empty() -- Check whether the reservation window is unallocated.
335 * @rsv: given reservation window to check
336 *
337 * returns 1 if the end block is EXT4_RESERVE_WINDOW_NOT_ALLOCATED.
338 */
339static inline int rsv_is_empty(struct ext4_reserve_window *rsv)
340{
341 /* a valid reservation end block could not be 0 */
342 return rsv->_rsv_end == EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
343}
344
345/**
346 * ext4_init_block_alloc_info()
347 * @inode: file inode structure
348 *
349 * Allocate and initialize the reservation window structure, and
 350 * finally link the window to the ext4 inode structure.
351 *
 352 * The reservation window structure is only dynamically allocated
 353 * and linked to the ext4 inode the first time the open file
 354 * needs a new block. So, before every ext4_new_block(s) call, for
 355 * regular files, we should check whether the reservation window
 356 * structure exists or not. In the latter case, this function is called.
 357 * Failure to do so will result in block reservation being turned off for
 358 * that open file.
359 *
360 * This function is called from ext4_get_blocks_handle(), also called
361 * when setting the reservation window size through ioctl before the file
362 * is open for write (needs block allocation).
363 *
 364 * Needs truncate_mutex protection prior to calling this function.
365 */
366void ext4_init_block_alloc_info(struct inode *inode)
367{
368 struct ext4_inode_info *ei = EXT4_I(inode);
369 struct ext4_block_alloc_info *block_i = ei->i_block_alloc_info;
370 struct super_block *sb = inode->i_sb;
371
372 block_i = kmalloc(sizeof(*block_i), GFP_NOFS);
373 if (block_i) {
374 struct ext4_reserve_window_node *rsv = &block_i->rsv_window_node;
375
376 rsv->rsv_start = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
377 rsv->rsv_end = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
378
379 /*
380 * if filesystem is mounted with NORESERVATION, the goal
381 * reservation window size is set to zero to indicate
382 * block reservation is off
383 */
384 if (!test_opt(sb, RESERVATION))
385 rsv->rsv_goal_size = 0;
386 else
387 rsv->rsv_goal_size = EXT4_DEFAULT_RESERVE_BLOCKS;
388 rsv->rsv_alloc_hit = 0;
389 block_i->last_alloc_logical_block = 0;
390 block_i->last_alloc_physical_block = 0;
391 }
392 ei->i_block_alloc_info = block_i;
393}
394
395/**
396 * ext4_discard_reservation()
397 * @inode: inode
398 *
 399 * Discard (free) the block reservation window on last file close,
 400 * on truncate, or at last iput().
 401 *
 402 * It is called in three cases:
 403 * ext4_release_file(): the last writer closes the file
 404 * ext4_clear_inode(): last iput(), when nobody links to this file.
405 * ext4_truncate(): when the block indirect map is about to change.
406 *
407 */
408void ext4_discard_reservation(struct inode *inode)
409{
410 struct ext4_inode_info *ei = EXT4_I(inode);
411 struct ext4_block_alloc_info *block_i = ei->i_block_alloc_info;
412 struct ext4_reserve_window_node *rsv;
413 spinlock_t *rsv_lock = &EXT4_SB(inode->i_sb)->s_rsv_window_lock;
414
415 if (!block_i)
416 return;
417
418 rsv = &block_i->rsv_window_node;
419 if (!rsv_is_empty(&rsv->rsv_window)) {
420 spin_lock(rsv_lock);
421 if (!rsv_is_empty(&rsv->rsv_window))
422 rsv_window_remove(inode->i_sb, rsv);
423 spin_unlock(rsv_lock);
424 }
425}
426
427/**
428 * ext4_free_blocks_sb() -- Free given blocks and update quota
429 * @handle: handle to this transaction
430 * @sb: super block
 431 * @block: start physical block to free
432 * @count: number of blocks to free
 433 * @pdquot_freed_blocks: pointer to the count of blocks freed for quota
434 */
435void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb,
436 ext4_fsblk_t block, unsigned long count,
437 unsigned long *pdquot_freed_blocks)
438{
439 struct buffer_head *bitmap_bh = NULL;
440 struct buffer_head *gd_bh;
441 unsigned long block_group;
442 ext4_grpblk_t bit;
443 unsigned long i;
444 unsigned long overflow;
445 struct ext4_group_desc * desc;
446 struct ext4_super_block * es;
447 struct ext4_sb_info *sbi;
448 int err = 0, ret;
449 ext4_grpblk_t group_freed;
450
451 *pdquot_freed_blocks = 0;
452 sbi = EXT4_SB(sb);
453 es = sbi->s_es;
454 if (block < le32_to_cpu(es->s_first_data_block) ||
455 block + count < block ||
456 block + count > ext4_blocks_count(es)) {
457 ext4_error (sb, "ext4_free_blocks",
458 "Freeing blocks not in datazone - "
459 "block = %llu, count = %lu", block, count);
460 goto error_return;
461 }
462
463 ext4_debug ("freeing block(s) %llu-%llu\n", block, block + count - 1);
464
465do_more:
466 overflow = 0;
467 ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
468 /*
469 * Check to see if we are freeing blocks across a group
470 * boundary.
471 */
472 if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) {
473 overflow = bit + count - EXT4_BLOCKS_PER_GROUP(sb);
474 count -= overflow;
475 }
476 brelse(bitmap_bh);
477 bitmap_bh = read_block_bitmap(sb, block_group);
478 if (!bitmap_bh)
479 goto error_return;
480 desc = ext4_get_group_desc (sb, block_group, &gd_bh);
481 if (!desc)
482 goto error_return;
483
484 if (in_range(ext4_block_bitmap(sb, desc), block, count) ||
485 in_range(ext4_inode_bitmap(sb, desc), block, count) ||
486 in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
487 in_range(block + count - 1, ext4_inode_table(sb, desc),
488 sbi->s_itb_per_group))
489 ext4_error (sb, "ext4_free_blocks",
490 "Freeing blocks in system zones - "
491 "Block = %llu, count = %lu",
492 block, count);
493
494 /*
495 * We are about to start releasing blocks in the bitmap,
496 * so we need undo access.
497 */
498 /* @@@ check errors */
499 BUFFER_TRACE(bitmap_bh, "getting undo access");
500 err = ext4_journal_get_undo_access(handle, bitmap_bh);
501 if (err)
502 goto error_return;
503
504 /*
505 * We are about to modify some metadata. Call the journal APIs
506 * to unshare ->b_data if a currently-committing transaction is
507 * using it
508 */
509 BUFFER_TRACE(gd_bh, "get_write_access");
510 err = ext4_journal_get_write_access(handle, gd_bh);
511 if (err)
512 goto error_return;
513
514 jbd_lock_bh_state(bitmap_bh);
515
516 for (i = 0, group_freed = 0; i < count; i++) {
517 /*
518 * An HJ special. This is expensive...
519 */
520#ifdef CONFIG_JBD_DEBUG
521 jbd_unlock_bh_state(bitmap_bh);
522 {
523 struct buffer_head *debug_bh;
524 debug_bh = sb_find_get_block(sb, block + i);
525 if (debug_bh) {
526 BUFFER_TRACE(debug_bh, "Deleted!");
527 if (!bh2jh(bitmap_bh)->b_committed_data)
528 BUFFER_TRACE(debug_bh,
 529 "No committed data in bitmap");
530 BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap");
531 __brelse(debug_bh);
532 }
533 }
534 jbd_lock_bh_state(bitmap_bh);
535#endif
536 if (need_resched()) {
537 jbd_unlock_bh_state(bitmap_bh);
538 cond_resched();
539 jbd_lock_bh_state(bitmap_bh);
540 }
541 /* @@@ This prevents newly-allocated data from being
542 * freed and then reallocated within the same
543 * transaction.
544 *
545 * Ideally we would want to allow that to happen, but to
546 * do so requires making jbd2_journal_forget() capable of
547 * revoking the queued write of a data block, which
548 * implies blocking on the journal lock. *forget()
549 * cannot block due to truncate races.
550 *
551 * Eventually we can fix this by making jbd2_journal_forget()
552 * return a status indicating whether or not it was able
553 * to revoke the buffer. On successful revoke, it is
554 * safe not to set the allocation bit in the committed
555 * bitmap, because we know that there is no outstanding
556 * activity on the buffer any more and so it is safe to
557 * reallocate it.
558 */
559 BUFFER_TRACE(bitmap_bh, "set in b_committed_data");
560 J_ASSERT_BH(bitmap_bh,
561 bh2jh(bitmap_bh)->b_committed_data != NULL);
562 ext4_set_bit_atomic(sb_bgl_lock(sbi, block_group), bit + i,
563 bh2jh(bitmap_bh)->b_committed_data);
564
565 /*
566 * We clear the bit in the bitmap after setting the committed
567 * data bit, because this is the reverse order to that which
568 * the allocator uses.
569 */
570 BUFFER_TRACE(bitmap_bh, "clear bit");
571 if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group),
572 bit + i, bitmap_bh->b_data)) {
573 jbd_unlock_bh_state(bitmap_bh);
574 ext4_error(sb, __FUNCTION__,
575 "bit already cleared for block %llu",
576 (ext4_fsblk_t)(block + i));
577 jbd_lock_bh_state(bitmap_bh);
578 BUFFER_TRACE(bitmap_bh, "bit already cleared");
579 } else {
580 group_freed++;
581 }
582 }
583 jbd_unlock_bh_state(bitmap_bh);
584
585 spin_lock(sb_bgl_lock(sbi, block_group));
586 desc->bg_free_blocks_count =
587 cpu_to_le16(le16_to_cpu(desc->bg_free_blocks_count) +
588 group_freed);
589 spin_unlock(sb_bgl_lock(sbi, block_group));
590 percpu_counter_mod(&sbi->s_freeblocks_counter, count);
591
592 /* We dirtied the bitmap block */
593 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
594 err = ext4_journal_dirty_metadata(handle, bitmap_bh);
595
596 /* And the group descriptor block */
597 BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
598 ret = ext4_journal_dirty_metadata(handle, gd_bh);
599 if (!err) err = ret;
600 *pdquot_freed_blocks += group_freed;
601
602 if (overflow && !err) {
603 block += count;
604 count = overflow;
605 goto do_more;
606 }
607 sb->s_dirt = 1;
608error_return:
609 brelse(bitmap_bh);
610 ext4_std_error(sb, err);
611 return;
612}
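/* Editor's worked example, not part of the patch: with
 * EXT4_BLOCKS_PER_GROUP(sb) == 8192, freeing count = 500 blocks from
 * group-relative bit 8000 gives overflow = 8000 + 500 - 8192 = 308;
 * the first pass frees bits 8000..8191 (192 blocks) in this group,
 * then jumps back to do_more with count = 308 for the next group. */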
613
614/**
615 * ext4_free_blocks() -- Free given blocks and update quota
616 * @handle: handle for this transaction
617 * @inode: inode
618 * @block: start physical block to free
 619 * @count: number of blocks to free
620 */
621void ext4_free_blocks(handle_t *handle, struct inode *inode,
622 ext4_fsblk_t block, unsigned long count)
623{
624 struct super_block * sb;
625 unsigned long dquot_freed_blocks;
626
627 sb = inode->i_sb;
628 if (!sb) {
629 printk ("ext4_free_blocks: nonexistent device");
630 return;
631 }
632 ext4_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks);
633 if (dquot_freed_blocks)
634 DQUOT_FREE_BLOCK(inode, dquot_freed_blocks);
635 return;
636}
637
638/**
639 * ext4_test_allocatable()
 640 * @nr: given block number (group relative) to test
 641 * @bh: bufferhead containing the bitmap of the given block group
642 *
643 * For ext4 allocations, we must not reuse any blocks which are
644 * allocated in the bitmap buffer's "last committed data" copy. This
645 * prevents deletes from freeing up the page for reuse until we have
646 * committed the delete transaction.
647 *
648 * If we didn't do this, then deleting something and reallocating it as
649 * data would allow the old block to be overwritten before the
650 * transaction committed (because we force data to disk before commit).
651 * This would lead to corruption if we crashed between overwriting the
652 * data and committing the delete.
653 *
654 * @@@ We may want to make this allocation behaviour conditional on
655 * data-writes at some point, and disable it for metadata allocations or
656 * sync-data inodes.
657 */
658static int ext4_test_allocatable(ext4_grpblk_t nr, struct buffer_head *bh)
659{
660 int ret;
661 struct journal_head *jh = bh2jh(bh);
662
663 if (ext4_test_bit(nr, bh->b_data))
664 return 0;
665
666 jbd_lock_bh_state(bh);
667 if (!jh->b_committed_data)
668 ret = 1;
669 else
670 ret = !ext4_test_bit(nr, jh->b_committed_data);
671 jbd_unlock_bh_state(bh);
672 return ret;
673}
674
675/**
676 * bitmap_search_next_usable_block()
677 * @start: the starting block (group relative) of the search
678 * @bh: bufferhead contains the block group bitmap
679 * @maxblocks: the ending block (group relative) of the reservation
680 *
681 * The bitmap search --- search forward alternately through the actual
682 * bitmap on disk and the last-committed copy in journal, until we find a
683 * bit free in both bitmaps.
684 */
685static ext4_grpblk_t
686bitmap_search_next_usable_block(ext4_grpblk_t start, struct buffer_head *bh,
687 ext4_grpblk_t maxblocks)
688{
689 ext4_grpblk_t next;
690 struct journal_head *jh = bh2jh(bh);
691
692 while (start < maxblocks) {
693 next = ext4_find_next_zero_bit(bh->b_data, maxblocks, start);
694 if (next >= maxblocks)
695 return -1;
696 if (ext4_test_allocatable(next, bh))
697 return next;
698 jbd_lock_bh_state(bh);
699 if (jh->b_committed_data)
700 start = ext4_find_next_zero_bit(jh->b_committed_data,
701 maxblocks, next);
702 jbd_unlock_bh_state(bh);
703 }
704 return -1;
705}
706
707/**
708 * find_next_usable_block()
709 * @start: the starting block (group relative) to find next
710 * allocatable block in bitmap.
711 * @bh: bufferhead contains the block group bitmap
712 * @maxblocks: the ending block (group relative) for the search
713 *
714 * Find an allocatable block in a bitmap. We honor both the bitmap and
715 * its last-committed copy (if that exists), and perform the "most
716 * appropriate allocation" algorithm of looking for a free block near
717 * the initial goal; then for a free byte somewhere in the bitmap; then
718 * for any free bit in the bitmap.
719 */
720static ext4_grpblk_t
721find_next_usable_block(ext4_grpblk_t start, struct buffer_head *bh,
722 ext4_grpblk_t maxblocks)
723{
724 ext4_grpblk_t here, next;
725 char *p, *r;
726
727 if (start > 0) {
728 /*
729 * The goal was occupied; search forward for a free
730 * block within the next XX blocks.
731 *
732 * end_goal is more or less random, but it has to be
733 * less than EXT4_BLOCKS_PER_GROUP. Aligning up to the
734 * next 64-bit boundary is simple..
735 */
736 ext4_grpblk_t end_goal = (start + 63) & ~63;
737 if (end_goal > maxblocks)
738 end_goal = maxblocks;
739 here = ext4_find_next_zero_bit(bh->b_data, end_goal, start);
740 if (here < end_goal && ext4_test_allocatable(here, bh))
741 return here;
742 ext4_debug("Bit not found near goal\n");
743 }
744
745 here = start;
746 if (here < 0)
747 here = 0;
748
749 p = ((char *)bh->b_data) + (here >> 3);
750 r = memscan(p, 0, (maxblocks - here + 7) >> 3);
751 next = (r - ((char *)bh->b_data)) << 3;
752
753 if (next < maxblocks && next >= start && ext4_test_allocatable(next, bh))
754 return next;
755
756 /*
757 * The bitmap search --- search forward alternately through the actual
758 * bitmap and the last-committed copy until we find a bit free in
759 * both
760 */
761 here = bitmap_search_next_usable_block(here, bh, maxblocks);
762 return here;
763}
764
765/**
766 * claim_block()
767 * @block: the free block (group relative) to allocate
 768 * @bh: the bufferhead containing the block group bitmap
769 *
770 * We think we can allocate this block in this bitmap. Try to set the bit.
771 * If that succeeds then check that nobody has allocated and then freed the
 772 * block since we saw that it was not marked in b_committed_data. If it _was_
773 * allocated and freed then clear the bit in the bitmap again and return
774 * zero (failure).
775 */
776static inline int
777claim_block(spinlock_t *lock, ext4_grpblk_t block, struct buffer_head *bh)
778{
779 struct journal_head *jh = bh2jh(bh);
780 int ret;
781
782 if (ext4_set_bit_atomic(lock, block, bh->b_data))
783 return 0;
784 jbd_lock_bh_state(bh);
785 if (jh->b_committed_data && ext4_test_bit(block,jh->b_committed_data)) {
786 ext4_clear_bit_atomic(lock, block, bh->b_data);
787 ret = 0;
788 } else {
789 ret = 1;
790 }
791 jbd_unlock_bh_state(bh);
792 return ret;
793}
794
795/**
796 * ext4_try_to_allocate()
797 * @sb: superblock
798 * @handle: handle to this transaction
799 * @group: given allocation block group
800 * @bitmap_bh: bufferhead holds the block bitmap
801 * @grp_goal: given target block within the group
802 * @count: target number of blocks to allocate
803 * @my_rsv: reservation window
804 *
 805 * Attempt to allocate blocks within a given range. Set the range of allocation
 806 * first, then find the first free bit(s) from the bitmap (within the range),
 807 * and finally allocate the blocks by marking the found free bits as allocated.
808 *
809 * To set the range of this allocation:
810 * if there is a reservation window, only try to allocate block(s) from the
811 * file's own reservation window;
 812 * Otherwise, the allocation range starts from the given goal block and ends
 813 * at the block group's last block.
814 *
815 * If we failed to allocate the desired block then we may end up crossing to a
816 * new bitmap. In that case we must release write access to the old one via
817 * ext4_journal_release_buffer(), else we'll run out of credits.
818 */
819static ext4_grpblk_t
820ext4_try_to_allocate(struct super_block *sb, handle_t *handle, int group,
821 struct buffer_head *bitmap_bh, ext4_grpblk_t grp_goal,
822 unsigned long *count, struct ext4_reserve_window *my_rsv)
823{
824 ext4_fsblk_t group_first_block;
825 ext4_grpblk_t start, end;
826 unsigned long num = 0;
827
828 /* we do allocation within the reservation window if we have a window */
829 if (my_rsv) {
830 group_first_block = ext4_group_first_block_no(sb, group);
831 if (my_rsv->_rsv_start >= group_first_block)
832 start = my_rsv->_rsv_start - group_first_block;
833 else
834 /* reservation window cross group boundary */
835 start = 0;
836 end = my_rsv->_rsv_end - group_first_block + 1;
837 if (end > EXT4_BLOCKS_PER_GROUP(sb))
838 /* reservation window crosses group boundary */
839 end = EXT4_BLOCKS_PER_GROUP(sb);
840 if ((start <= grp_goal) && (grp_goal < end))
841 start = grp_goal;
842 else
843 grp_goal = -1;
844 } else {
845 if (grp_goal > 0)
846 start = grp_goal;
847 else
848 start = 0;
849 end = EXT4_BLOCKS_PER_GROUP(sb);
850 }
851
852 BUG_ON(start > EXT4_BLOCKS_PER_GROUP(sb));
853
854repeat:
855 if (grp_goal < 0 || !ext4_test_allocatable(grp_goal, bitmap_bh)) {
856 grp_goal = find_next_usable_block(start, bitmap_bh, end);
857 if (grp_goal < 0)
858 goto fail_access;
859 if (!my_rsv) {
860 int i;
861
862 for (i = 0; i < 7 && grp_goal > start &&
863 ext4_test_allocatable(grp_goal - 1,
864 bitmap_bh);
865 i++, grp_goal--)
866 ;
867 }
868 }
869 start = grp_goal;
870
871 if (!claim_block(sb_bgl_lock(EXT4_SB(sb), group),
872 grp_goal, bitmap_bh)) {
873 /*
874 * The block was allocated by another thread, or it was
875 * allocated and then freed by another thread
876 */
877 start++;
878 grp_goal++;
879 if (start >= end)
880 goto fail_access;
881 goto repeat;
882 }
883 num++;
884 grp_goal++;
885 while (num < *count && grp_goal < end
886 && ext4_test_allocatable(grp_goal, bitmap_bh)
887 && claim_block(sb_bgl_lock(EXT4_SB(sb), group),
888 grp_goal, bitmap_bh)) {
889 num++;
890 grp_goal++;
891 }
892 *count = num;
893 return grp_goal - num;
894fail_access:
895 *count = num;
896 return -1;
897}
898
899/**
900 * find_next_reservable_window():
901 * find a reservable space within the given range.
902 * It does not allocate the reservation window for now:
903 * alloc_new_reservation() will do the work later.
904 *
905 * @search_head: the head of the searching list;
906 * This is not necessarily the list head of the whole filesystem
907 *
908 * We have both head and start_block to assist the search
909 * for the reservable space. The list starts from head,
910 * but we will shift to the place where start_block is,
911 * then start from there, when looking for a reservable space.
912 *
913 * @size: the target new reservation window size
914 *
915 * @group_first_block: the first block we consider to start
916 * the real search from
917 *
918 * @last_block:
919 * the maximum block number that our goal reservable space
920 * could start from. This is normally the last block in this
 921 * group. The search will end when the start of the next
 922 * possible reservable space is beyond this boundary.
 923 * This handles reservation window requests that cross the
 924 * group boundary.
925 *
 926 * Basically we search the given range (start_block, last_block),
 927 * rather than the whole reservation tree, to find a free
 928 * region that is of the requested size and has not
 929 * been reserved.
930 *
931 */
932static int find_next_reservable_window(
933 struct ext4_reserve_window_node *search_head,
934 struct ext4_reserve_window_node *my_rsv,
935 struct super_block * sb,
936 ext4_fsblk_t start_block,
937 ext4_fsblk_t last_block)
938{
939 struct rb_node *next;
940 struct ext4_reserve_window_node *rsv, *prev;
941 ext4_fsblk_t cur;
942 int size = my_rsv->rsv_goal_size;
943
944 /* TODO: make the start of the reservation window byte-aligned */
945 /* cur = *start_block & ~7;*/
946 cur = start_block;
947 rsv = search_head;
948 if (!rsv)
949 return -1;
950
951 while (1) {
952 if (cur <= rsv->rsv_end)
953 cur = rsv->rsv_end + 1;
954
955 /* TODO?
 956 * in the case we could not find a reservable space of the
 957 * size we expected, then during the re-search we could
 958 * remember the largest reservable space we saw
 959 * and return that one.
 960 *
 961 * For now it will fail if we could not find a reservable
 962 * space of the expected size (or more)...
963 */
964 if (cur > last_block)
965 return -1; /* fail */
966
967 prev = rsv;
968 next = rb_next(&rsv->rsv_node);
969 rsv = list_entry(next,struct ext4_reserve_window_node,rsv_node);
970
971 /*
972 * Reached the last reservation, we can just append to the
973 * previous one.
974 */
975 if (!next)
976 break;
977
978 if (cur + size <= rsv->rsv_start) {
979 /*
 980 * Found a reservable space big enough. We could
981 * have a reservation across the group boundary here
982 */
983 break;
984 }
985 }
986 /*
 987 * We come here either:
 988 * when we reach the end of the whole list and there is
 989 * empty reservable space after the last entry in the list,
 990 * in which case we append to the end of the list;
 991 *
 992 * or when we find a reservable space in the middle of the
 993 * list, in which case we return the reservation window
 994 * that we could append to. Success.
995 */
996
997 if ((prev != my_rsv) && (!rsv_is_empty(&my_rsv->rsv_window)))
998 rsv_window_remove(sb, my_rsv);
999
1000 /*
 1001 * Let's book the whole available window for now. We will check the
 1002 * disk bitmap later and, if there are free blocks, we adjust
 1003 * the window size if it's larger than requested.
 1004 * Otherwise, we will remove this node from the tree the next time
 1005 * find_next_reservable_window() is called.
1006 */
1007 my_rsv->rsv_start = cur;
1008 my_rsv->rsv_end = cur + size - 1;
1009 my_rsv->rsv_alloc_hit = 0;
1010
1011 if (prev != my_rsv)
1012 ext4_rsv_window_add(sb, my_rsv);
1013
1014 return 0;
1015}
1016
1017/**
1018 * alloc_new_reservation()--allocate a new reservation window
1019 *
1020 * To make a new reservation, we search part of the filesystem
 1021 * reservation list (the list inside the group). We try to
1022 * allocate a new reservation window near the allocation goal,
1023 * or the beginning of the group, if there is no goal.
1024 *
1025 * We first find a reservable space after the goal, then from
1026 * there, we check the bitmap for the first free block after
 1027 * it. If there is no free block up to the end of the group, the
 1028 * whole group is full and we fail. Otherwise, we check whether
 1029 * the free block is inside the expected reservable space; if so,
 1030 * we succeed.
 1031 * If the first free block is outside the reservable space, we
 1032 * search for the next available space starting from that free
 1033 * block, and go on.
 1034 *
 1035 * On success, a new reservation is found and inserted into the list.
 1036 * It contains at least one free block, and it does not overlap with other
 1037 * reservation windows.
 1038 *
 1039 * On failure, we failed to find a reservation window in this group.
1040 *
1041 * @rsv: the reservation
1042 *
1043 * @grp_goal: The goal (group-relative). It is where the search for a
1044 * free reservable space should start from.
 1045 * If we have a grp_goal (grp_goal > 0), then we start from there;
 1046 * with no grp_goal (grp_goal = -1), we start from the first block
 1047 * of the group.
1048 *
1049 * @sb: the super block
1050 * @group: the group we are trying to allocate in
1051 * @bitmap_bh: the block group block bitmap
1052 *
1053 */
1054static int alloc_new_reservation(struct ext4_reserve_window_node *my_rsv,
1055 ext4_grpblk_t grp_goal, struct super_block *sb,
1056 unsigned int group, struct buffer_head *bitmap_bh)
1057{
1058 struct ext4_reserve_window_node *search_head;
1059 ext4_fsblk_t group_first_block, group_end_block, start_block;
1060 ext4_grpblk_t first_free_block;
1061 struct rb_root *fs_rsv_root = &EXT4_SB(sb)->s_rsv_window_root;
1062 unsigned long size;
1063 int ret;
1064 spinlock_t *rsv_lock = &EXT4_SB(sb)->s_rsv_window_lock;
1065
1066 group_first_block = ext4_group_first_block_no(sb, group);
1067 group_end_block = group_first_block + (EXT4_BLOCKS_PER_GROUP(sb) - 1);
1068
1069 if (grp_goal < 0)
1070 start_block = group_first_block;
1071 else
1072 start_block = grp_goal + group_first_block;
1073
1074 size = my_rsv->rsv_goal_size;
1075
1076 if (!rsv_is_empty(&my_rsv->rsv_window)) {
1077 /*
 1078 * if the old reservation crosses the group boundary
 1079 * and if the goal is inside the old reservation window,
 1080 * we will come here when we just failed to allocate from
 1081 * the first part of the window. We still have another part
 1082 * that belongs to the next group. In this case, there is no
 1083 * point in discarding our window and trying to allocate a
 1084 * new one in this group (which will fail); we should
 1085 * keep the reservation window and simply move on.
1086 *
1087 * Maybe we could shift the start block of the reservation
1088 * window to the first block of next group.
1089 */
1090
1091 if ((my_rsv->rsv_start <= group_end_block) &&
1092 (my_rsv->rsv_end > group_end_block) &&
1093 (start_block >= my_rsv->rsv_start))
1094 return -1;
1095
1096 if ((my_rsv->rsv_alloc_hit >
1097 (my_rsv->rsv_end - my_rsv->rsv_start + 1) / 2)) {
1098 /*
 1099 * if the previous allocation hit ratio is
1100 * greater than 1/2, then we double the size of
1101 * the reservation window the next time,
1102 * otherwise we keep the same size window
1103 */
1104 size = size * 2;
1105 if (size > EXT4_MAX_RESERVE_BLOCKS)
1106 size = EXT4_MAX_RESERVE_BLOCKS;
1107 my_rsv->rsv_goal_size= size;
1108 }
1109 }
1110
1111 spin_lock(rsv_lock);
1112 /*
1113 * shift the search start to the window near the goal block
1114 */
1115 search_head = search_reserve_window(fs_rsv_root, start_block);
1116
1117 /*
1118 * find_next_reservable_window() simply finds a reservable window
1119 * inside the given range(start_block, group_end_block).
1120 *
1121 * To make sure the reservation window has a free bit inside it, we
1122 * need to check the bitmap after we found a reservable window.
1123 */
1124retry:
1125 ret = find_next_reservable_window(search_head, my_rsv, sb,
1126 start_block, group_end_block);
1127
1128 if (ret == -1) {
1129 if (!rsv_is_empty(&my_rsv->rsv_window))
1130 rsv_window_remove(sb, my_rsv);
1131 spin_unlock(rsv_lock);
1132 return -1;
1133 }
1134
1135 /*
1136 * On success, find_next_reservable_window() returns the
1137 * reservation window where there is a reservable space after it.
1138 * Before we reserve this reservable space, we need
1139 * to make sure there is at least a free block inside this region.
1140 *
 1141 * We search for the first free bit, alternating between the block
 1142 * bitmap and the copy of the last committed bitmap, until we find
 1143 * an allocatable block. The search starts from the start block of
 1144 * the reservable space we just found.
1145 */
1146 spin_unlock(rsv_lock);
1147 first_free_block = bitmap_search_next_usable_block(
1148 my_rsv->rsv_start - group_first_block,
1149 bitmap_bh, group_end_block - group_first_block + 1);
1150
1151 if (first_free_block < 0) {
1152 /*
 1153 * no free block left on the bitmap, so there is no point
 1154 * in reserving the space. return failure.
1155 */
1156 spin_lock(rsv_lock);
1157 if (!rsv_is_empty(&my_rsv->rsv_window))
1158 rsv_window_remove(sb, my_rsv);
1159 spin_unlock(rsv_lock);
1160 return -1; /* failed */
1161 }
1162
1163 start_block = first_free_block + group_first_block;
1164 /*
1165 * check if the first free block is within the
1166 * free space we just reserved
1167 */
1168 if (start_block >= my_rsv->rsv_start && start_block < my_rsv->rsv_end)
1169 return 0; /* success */
1170 /*
 1171 * if the first free bit we found is outside the reservable space,
 1172 * continue the search for the next reservable space,
 1173 * starting from where that free block is;
 1174 * we also shift the list head to where we stopped last time
1175 */
1176 search_head = my_rsv;
1177 spin_lock(rsv_lock);
1178 goto retry;
1179}
1180
1181/**
1182 * try_to_extend_reservation()
1183 * @my_rsv: given reservation window
1184 * @sb: super block
1185 * @size: the delta to extend
1186 *
 1187 * Attempt to expand the reservation window so that it is large
 1188 * enough to hold the required number of free blocks
1189 *
1190 * Since ext4_try_to_allocate() will always allocate blocks within
1191 * the reservation window range, if the window size is too small,
 1192 * a multiple-block allocation has to stop at the end of the reservation
 1193 * window. To make this more efficient, given the total number of
 1194 * blocks needed and the current size of the window, we try to
 1195 * expand the reservation window size if necessary on a best-effort
 1196 * basis before ext4_new_blocks() tries to allocate blocks.
1197 */
1198static void try_to_extend_reservation(struct ext4_reserve_window_node *my_rsv,
1199 struct super_block *sb, int size)
1200{
1201 struct ext4_reserve_window_node *next_rsv;
1202 struct rb_node *next;
1203 spinlock_t *rsv_lock = &EXT4_SB(sb)->s_rsv_window_lock;
1204
1205 if (!spin_trylock(rsv_lock))
1206 return;
1207
1208 next = rb_next(&my_rsv->rsv_node);
1209
1210 if (!next)
1211 my_rsv->rsv_end += size;
1212 else {
1213 next_rsv = list_entry(next, struct ext4_reserve_window_node, rsv_node);
1214
1215 if ((next_rsv->rsv_start - my_rsv->rsv_end - 1) >= size)
1216 my_rsv->rsv_end += size;
1217 else
1218 my_rsv->rsv_end = next_rsv->rsv_start - 1;
1219 }
1220 spin_unlock(rsv_lock);
1221}
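To make the three extension cases above concrete, here is a minimal user-space sketch of the gap arithmetic, using hypothetical window positions (rsv_end is inclusive, as in the kernel structures):

#include <stdio.h>

int main(void)
{
	unsigned long long rsv_end = 107;	/* window covers [100, 107] */
	unsigned long long next_start = 120;	/* neighbour starts at 120 */
	unsigned long long size = 16;		/* extra blocks wanted */

	/* mirror of the logic above: the gap to the neighbour is
	 * next_start - rsv_end - 1 = 12 blocks here */
	if (next_start - rsv_end - 1 >= size)
		rsv_end += size;		/* gap is large enough */
	else
		rsv_end = next_start - 1;	/* grow up to the neighbour */

	printf("new rsv_end = %llu\n", rsv_end);	/* prints 119 */
	return 0;
}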
1222
1223/**
1224 * ext4_try_to_allocate_with_rsv()
1225 * @sb: superblock
1226 * @handle: handle to this transaction
1227 * @group: given allocation block group
1228 * @bitmap_bh: bufferhead holds the block bitmap
1229 * @grp_goal: given target block within the group
1230 * @count: target number of blocks to allocate
1231 * @my_rsv: reservation window
1232 * @errp: pointer to store the error code
1233 *
1234 * This is the main function used to allocate a new block and its reservation
1235 * window.
1236 *
1237 * Each time a new block allocation is needed, we first try to allocate
1238 * from the file's own reservation. If the file does not have a reservation
1239 * window yet, then instead of looking for a free bit in the bitmap and
1240 * checking the reservation list to see whether that bit falls inside
1241 * somebody else's window, we try to allocate a reservation window for the
1242 * file, starting from the goal, and then allocate blocks within that window.
1243 *
1244 * This avoids searching the reservation list again and again when
1245 * somebody is looking for a free block (without a reservation)
1246 * while there are lots of free blocks, but they are all
1247 * being reserved.
1248 *
1249 * We use a red-black tree for the per-filesystem reservation list.
1250 *
1251 */
1252static ext4_grpblk_t
1253ext4_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle,
1254 unsigned int group, struct buffer_head *bitmap_bh,
1255 ext4_grpblk_t grp_goal,
1256 struct ext4_reserve_window_node * my_rsv,
1257 unsigned long *count, int *errp)
1258{
1259 ext4_fsblk_t group_first_block, group_last_block;
1260 ext4_grpblk_t ret = 0;
1261 int fatal;
1262 unsigned long num = *count;
1263
1264 *errp = 0;
1265
1266 /*
1267 * Make sure we use undo access for the bitmap, because it is critical
1268 * that we do the frozen_data COW on bitmap buffers in all cases even
1269 * if the buffer is in BJ_Forget state in the committing transaction.
1270 */
1271 BUFFER_TRACE(bitmap_bh, "get undo access for new block");
1272 fatal = ext4_journal_get_undo_access(handle, bitmap_bh);
1273 if (fatal) {
1274 *errp = fatal;
1275 return -1;
1276 }
1277
1278 /*
1279	 * we don't deal with reservations when the
1280	 * filesystem is mounted without reservations,
1281	 * or the file is not a regular file,
1282	 * or the last attempt to allocate a block with reservations turned on failed
1283 */
1284	if (my_rsv == NULL) {
1285 ret = ext4_try_to_allocate(sb, handle, group, bitmap_bh,
1286 grp_goal, count, NULL);
1287 goto out;
1288 }
1289 /*
1290	 * grp_goal is a group-relative block number (if there is a goal):
1291	 * 0 <= grp_goal < EXT4_BLOCKS_PER_GROUP(sb).
1292	 * group_first_block is a filesystem-wide block number; it is
1293	 * the block number of the first block in this group.
1294 */
1295 group_first_block = ext4_group_first_block_no(sb, group);
1296 group_last_block = group_first_block + (EXT4_BLOCKS_PER_GROUP(sb) - 1);
1297
1298 /*
1299	 * Basically we will allocate a new block from the inode's reservation
1300	 * window.
1301	 *
1302	 * We need to allocate a new reservation window if:
1303	 * a) the inode does not have a reservation window; or
1304	 * b) the last attempt to allocate a block from the existing
1305	 * reservation failed; or
1306	 * c) we come here with a goal that lies outside the existing window.
1307	 *
1308	 * We do not need to allocate a new reservation window if we come here
1309	 * at the beginning with a goal and the goal is inside the window, or
1310	 * if we don't have a goal but already have a reservation window;
1311	 * then we can allocate from the reservation window directly.
1312 */
1313 while (1) {
1314 if (rsv_is_empty(&my_rsv->rsv_window) || (ret < 0) ||
1315 !goal_in_my_reservation(&my_rsv->rsv_window,
1316 grp_goal, group, sb)) {
1317 if (my_rsv->rsv_goal_size < *count)
1318 my_rsv->rsv_goal_size = *count;
1319 ret = alloc_new_reservation(my_rsv, grp_goal, sb,
1320 group, bitmap_bh);
1321 if (ret < 0)
1322 break; /* failed */
1323
1324 if (!goal_in_my_reservation(&my_rsv->rsv_window,
1325 grp_goal, group, sb))
1326 grp_goal = -1;
1327 } else if (grp_goal > 0 &&
1328 (my_rsv->rsv_end-grp_goal+1) < *count)
1329 try_to_extend_reservation(my_rsv, sb,
1330 *count-my_rsv->rsv_end + grp_goal - 1);
1331
1332 if ((my_rsv->rsv_start > group_last_block) ||
1333 (my_rsv->rsv_end < group_first_block)) {
1334 rsv_window_dump(&EXT4_SB(sb)->s_rsv_window_root, 1);
1335 BUG();
1336 }
1337 ret = ext4_try_to_allocate(sb, handle, group, bitmap_bh,
1338 grp_goal, &num, &my_rsv->rsv_window);
1339 if (ret >= 0) {
1340 my_rsv->rsv_alloc_hit += num;
1341 *count = num;
1342 break; /* succeed */
1343 }
1344 num = *count;
1345 }
1346out:
1347 if (ret >= 0) {
1348 BUFFER_TRACE(bitmap_bh, "journal_dirty_metadata for "
1349 "bitmap block");
1350 fatal = ext4_journal_dirty_metadata(handle, bitmap_bh);
1351 if (fatal) {
1352 *errp = fatal;
1353 return -1;
1354 }
1355 return ret;
1356 }
1357
1358 BUFFER_TRACE(bitmap_bh, "journal_release_buffer");
1359 ext4_journal_release_buffer(handle, bitmap_bh);
1360 return ret;
1361}
1362
1363/**
1364 * ext4_has_free_blocks()
1365 * @sbi: in-core super block structure.
1366 *
1367 * Check if filesystem has at least 1 free block available for allocation.
1368 */
1369static int ext4_has_free_blocks(struct ext4_sb_info *sbi)
1370{
1371 ext4_fsblk_t free_blocks, root_blocks;
1372
1373 free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
1374 root_blocks = ext4_r_blocks_count(sbi->s_es);
1375 if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) &&
1376 sbi->s_resuid != current->fsuid &&
1377 (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) {
1378 return 0;
1379 }
1380 return 1;
1381}
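A small user-space sketch of the reserved-blocks check above, with hypothetical counts; "privileged" stands in for the CAP_SYS_RESOURCE/resuid/resgid tests:

#include <stdio.h>

static int has_free_blocks(unsigned long long free_blocks,
			   unsigned long long root_blocks, int privileged)
{
	/* same shape as the check above: ordinary users may not dip
	 * into the blocks reserved for root */
	if (free_blocks < root_blocks + 1 && !privileged)
		return 0;
	return 1;
}

int main(void)
{
	/* 600 free vs. 1000 reserved: refused for ordinary users
	 * (600 < 1001), still allowed for privileged ones */
	printf("%d %d\n", has_free_blocks(600, 1000, 0),
	       has_free_blocks(600, 1000, 1));
	return 0;
}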
1382
1383/**
1384 * ext4_should_retry_alloc()
1385 * @sb: super block
1386 * @retries: number of attempts that have been made
1387 *
1388 * ext4_should_retry_alloc() is called when ENOSPC is returned. If
1389 * it is profitable to retry the operation, this function will wait
1390 * for the current or committing transaction to complete, and then
1391 * return TRUE.
1392 *
1393 * If the total number of retries exceeds three, return FALSE.
1394 */
1395int ext4_should_retry_alloc(struct super_block *sb, int *retries)
1396{
1397 if (!ext4_has_free_blocks(EXT4_SB(sb)) || (*retries)++ > 3)
1398 return 0;
1399
1400 jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id);
1401
1402 return jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal);
1403}
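A sketch of the intended caller pattern; the allocation function named here is hypothetical:

/*
 *	int retries = 0;
 *	int err;
 * retry:
 *	err = some_ext4_allocation(handle, inode, ...);
 *	if (err == -ENOSPC && ext4_should_retry_alloc(sb, &retries))
 *		goto retry;
 */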
1404
1405/**
1406 * ext4_new_blocks() -- core block(s) allocation function
1407 * @handle: handle to this transaction
1408 * @inode: file inode
1409 * @goal: given target block(filesystem wide)
1410 * @count: target number of blocks to allocate
1411 * @errp: error code
1412 *
1413 * ext4_new_blocks uses a goal block to assist allocation. It tries to
1414 * allocate block(s) from the block group that contains the goal block first. If that
1415 * fails, it will try to allocate block(s) from other block groups without
1416 * any specific goal block.
1417 *
1418 */
1419ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
1420 ext4_fsblk_t goal, unsigned long *count, int *errp)
1421{
1422 struct buffer_head *bitmap_bh = NULL;
1423 struct buffer_head *gdp_bh;
1424 unsigned long group_no;
1425 int goal_group;
1426 ext4_grpblk_t grp_target_blk; /* blockgroup relative goal block */
1427 ext4_grpblk_t grp_alloc_blk; /* blockgroup-relative allocated block*/
1428	ext4_fsblk_t ret_block;		/* filesystem-wide allocated block */
1429 int bgi; /* blockgroup iteration index */
1430 int fatal = 0, err;
1431 int performed_allocation = 0;
1432 ext4_grpblk_t free_blocks; /* number of free blocks in a group */
1433 struct super_block *sb;
1434 struct ext4_group_desc *gdp;
1435 struct ext4_super_block *es;
1436 struct ext4_sb_info *sbi;
1437 struct ext4_reserve_window_node *my_rsv = NULL;
1438 struct ext4_block_alloc_info *block_i;
1439 unsigned short windowsz = 0;
1440#ifdef EXT4FS_DEBUG
1441 static int goal_hits, goal_attempts;
1442#endif
1443 unsigned long ngroups;
1444 unsigned long num = *count;
1445
1446 *errp = -ENOSPC;
1447 sb = inode->i_sb;
1448 if (!sb) {
1449		printk(KERN_ERR "ext4_new_block: nonexistent device\n");
1450 return 0;
1451 }
1452
1453 /*
1454 * Check quota for allocation of this block.
1455 */
1456 if (DQUOT_ALLOC_BLOCK(inode, num)) {
1457 *errp = -EDQUOT;
1458 return 0;
1459 }
1460
1461 sbi = EXT4_SB(sb);
1462 es = EXT4_SB(sb)->s_es;
1463	ext4_debug("goal=%llu.\n", goal);
1464 /*
1465	 * Allocate a block from the reservation only when the
1466	 * filesystem is mounted with reservations (the default, -o reservation),
1467	 * it's a regular file, and
1468	 * the desired window size is greater than 0 (one could use the ioctl
1469	 * command EXT4_IOC_SETRSVSZ to set the window size to 0 to turn off
1470	 * reservation on that particular file)
1471 */
1472 block_i = EXT4_I(inode)->i_block_alloc_info;
1473 if (block_i && ((windowsz = block_i->rsv_window_node.rsv_goal_size) > 0))
1474 my_rsv = &block_i->rsv_window_node;
1475
1476 if (!ext4_has_free_blocks(sbi)) {
1477 *errp = -ENOSPC;
1478 goto out;
1479 }
1480
1481 /*
1482 * First, test whether the goal block is free.
1483 */
1484 if (goal < le32_to_cpu(es->s_first_data_block) ||
1485 goal >= ext4_blocks_count(es))
1486 goal = le32_to_cpu(es->s_first_data_block);
1487 ext4_get_group_no_and_offset(sb, goal, &group_no, &grp_target_blk);
1488 goal_group = group_no;
1489retry_alloc:
1490 gdp = ext4_get_group_desc(sb, group_no, &gdp_bh);
1491 if (!gdp)
1492 goto io_error;
1493
1494 free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
1495 /*
1496	 * if there are not enough free blocks to make a new reservation,
1497	 * turn off reservation for this allocation
1498 */
1499 if (my_rsv && (free_blocks < windowsz)
1500 && (rsv_is_empty(&my_rsv->rsv_window)))
1501 my_rsv = NULL;
1502
1503 if (free_blocks > 0) {
1504 bitmap_bh = read_block_bitmap(sb, group_no);
1505 if (!bitmap_bh)
1506 goto io_error;
1507 grp_alloc_blk = ext4_try_to_allocate_with_rsv(sb, handle,
1508 group_no, bitmap_bh, grp_target_blk,
1509 my_rsv, &num, &fatal);
1510 if (fatal)
1511 goto out;
1512 if (grp_alloc_blk >= 0)
1513 goto allocated;
1514 }
1515
1516 ngroups = EXT4_SB(sb)->s_groups_count;
1517 smp_rmb();
1518
1519 /*
1520 * Now search the rest of the groups. We assume that
1521	 * group_no and gdp correctly point to the last group visited.
1522 */
1523 for (bgi = 0; bgi < ngroups; bgi++) {
1524 group_no++;
1525 if (group_no >= ngroups)
1526 group_no = 0;
1527 gdp = ext4_get_group_desc(sb, group_no, &gdp_bh);
1528 if (!gdp) {
1529 *errp = -EIO;
1530 goto out;
1531 }
1532 free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
1533 /*
1534 * skip this group if the number of
1535 * free blocks is less than half of the reservation
1536 * window size.
1537 */
1538 if (free_blocks <= (windowsz/2))
1539 continue;
1540
1541 brelse(bitmap_bh);
1542 bitmap_bh = read_block_bitmap(sb, group_no);
1543 if (!bitmap_bh)
1544 goto io_error;
1545 /*
1546		 * try to allocate block(s) from this group, without a goal (-1).
1547 */
1548 grp_alloc_blk = ext4_try_to_allocate_with_rsv(sb, handle,
1549 group_no, bitmap_bh, -1, my_rsv,
1550 &num, &fatal);
1551 if (fatal)
1552 goto out;
1553 if (grp_alloc_blk >= 0)
1554 goto allocated;
1555 }
1556 /*
1557	 * We may end up with a bogus ENOSPC error because the
1558	 * filesystem is "full" of reservations, while there may
1559	 * indeed be free blocks available on disk. In this case,
1560	 * we just forget about the reservations and do the block
1561	 * allocation as if there were no reservations.
1562 */
1563 if (my_rsv) {
1564 my_rsv = NULL;
1565 group_no = goal_group;
1566 goto retry_alloc;
1567 }
1568 /* No space left on the device */
1569 *errp = -ENOSPC;
1570 goto out;
1571
1572allocated:
1573
1574	ext4_debug("using block group %lu(%d)\n",
1575			group_no, le16_to_cpu(gdp->bg_free_blocks_count));
1576
1577 BUFFER_TRACE(gdp_bh, "get_write_access");
1578 fatal = ext4_journal_get_write_access(handle, gdp_bh);
1579 if (fatal)
1580 goto out;
1581
1582 ret_block = grp_alloc_blk + ext4_group_first_block_no(sb, group_no);
1583
1584 if (in_range(ext4_block_bitmap(sb, gdp), ret_block, num) ||
1585	    in_range(ext4_inode_bitmap(sb, gdp), ret_block, num) ||
1586 in_range(ret_block, ext4_inode_table(sb, gdp),
1587 EXT4_SB(sb)->s_itb_per_group) ||
1588 in_range(ret_block + num - 1, ext4_inode_table(sb, gdp),
1589 EXT4_SB(sb)->s_itb_per_group))
1590 ext4_error(sb, "ext4_new_block",
1591 "Allocating block in system zone - "
1592 "blocks from %llu, length %lu",
1593 ret_block, num);
1594
1595 performed_allocation = 1;
1596
1597#ifdef CONFIG_JBD_DEBUG
1598 {
1599 struct buffer_head *debug_bh;
1600
1601 /* Record bitmap buffer state in the newly allocated block */
1602 debug_bh = sb_find_get_block(sb, ret_block);
1603 if (debug_bh) {
1604 BUFFER_TRACE(debug_bh, "state when allocated");
1605 BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap state");
1606 brelse(debug_bh);
1607 }
1608 }
1609 jbd_lock_bh_state(bitmap_bh);
1610 spin_lock(sb_bgl_lock(sbi, group_no));
1611 if (buffer_jbd(bitmap_bh) && bh2jh(bitmap_bh)->b_committed_data) {
1612 int i;
1613
1614 for (i = 0; i < num; i++) {
1615 if (ext4_test_bit(grp_alloc_blk+i,
1616 bh2jh(bitmap_bh)->b_committed_data)) {
1617 printk("%s: block was unexpectedly set in "
1618 "b_committed_data\n", __FUNCTION__);
1619 }
1620 }
1621 }
1622 ext4_debug("found bit %d\n", grp_alloc_blk);
1623 spin_unlock(sb_bgl_lock(sbi, group_no));
1624 jbd_unlock_bh_state(bitmap_bh);
1625#endif
1626
1627 if (ret_block + num - 1 >= ext4_blocks_count(es)) {
1628 ext4_error(sb, "ext4_new_block",
1629 "block(%llu) >= blocks count(%llu) - "
1630 "block_group = %lu, es == %p ", ret_block,
1631 ext4_blocks_count(es), group_no, es);
1632 goto out;
1633 }
1634
1635 /*
1636 * It is up to the caller to add the new buffer to a journal
1637 * list of some description. We don't know in advance whether
1638 * the caller wants to use it as metadata or data.
1639 */
1640 ext4_debug("allocating block %lu. Goal hits %d of %d.\n",
1641 ret_block, goal_hits, goal_attempts);
1642
1643 spin_lock(sb_bgl_lock(sbi, group_no));
1644 gdp->bg_free_blocks_count =
1645 cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count)-num);
1646 spin_unlock(sb_bgl_lock(sbi, group_no));
1647 percpu_counter_mod(&sbi->s_freeblocks_counter, -num);
1648
1649 BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor");
1650 err = ext4_journal_dirty_metadata(handle, gdp_bh);
1651 if (!fatal)
1652 fatal = err;
1653
1654 sb->s_dirt = 1;
1655 if (fatal)
1656 goto out;
1657
1658 *errp = 0;
1659 brelse(bitmap_bh);
1660 DQUOT_FREE_BLOCK(inode, *count-num);
1661 *count = num;
1662 return ret_block;
1663
1664io_error:
1665 *errp = -EIO;
1666out:
1667 if (fatal) {
1668 *errp = fatal;
1669 ext4_std_error(sb, fatal);
1670 }
1671 /*
1672 * Undo the block allocation
1673 */
1674 if (!performed_allocation)
1675 DQUOT_FREE_BLOCK(inode, *count);
1676 brelse(bitmap_bh);
1677 return 0;
1678}
1679
1680ext4_fsblk_t ext4_new_block(handle_t *handle, struct inode *inode,
1681 ext4_fsblk_t goal, int *errp)
1682{
1683 unsigned long count = 1;
1684
1685 return ext4_new_blocks(handle, inode, goal, &count, errp);
1686}
1687
1688/**
1689 * ext4_count_free_blocks() -- count filesystem free blocks
1690 * @sb: superblock
1691 *
1692 * Adds up the number of free blocks from each block group.
1693 */
1694ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
1695{
1696 ext4_fsblk_t desc_count;
1697 struct ext4_group_desc *gdp;
1698 int i;
1699 unsigned long ngroups = EXT4_SB(sb)->s_groups_count;
1700#ifdef EXT4FS_DEBUG
1701 struct ext4_super_block *es;
1702 ext4_fsblk_t bitmap_count;
1703 unsigned long x;
1704 struct buffer_head *bitmap_bh = NULL;
1705
1706 es = EXT4_SB(sb)->s_es;
1707 desc_count = 0;
1708 bitmap_count = 0;
1709 gdp = NULL;
1710
1711 smp_rmb();
1712 for (i = 0; i < ngroups; i++) {
1713 gdp = ext4_get_group_desc(sb, i, NULL);
1714 if (!gdp)
1715 continue;
1716 desc_count += le16_to_cpu(gdp->bg_free_blocks_count);
1717 brelse(bitmap_bh);
1718 bitmap_bh = read_block_bitmap(sb, i);
1719 if (bitmap_bh == NULL)
1720 continue;
1721
1722 x = ext4_count_free(bitmap_bh, sb->s_blocksize);
1723 printk("group %d: stored = %d, counted = %lu\n",
1724 i, le16_to_cpu(gdp->bg_free_blocks_count), x);
1725 bitmap_count += x;
1726 }
1727 brelse(bitmap_bh);
1728 printk("ext4_count_free_blocks: stored = %llu"
1729 ", computed = %llu, %llu\n",
1730 EXT4_FREE_BLOCKS_COUNT(es),
1731 desc_count, bitmap_count);
1732 return bitmap_count;
1733#else
1734 desc_count = 0;
1735 smp_rmb();
1736 for (i = 0; i < ngroups; i++) {
1737 gdp = ext4_get_group_desc(sb, i, NULL);
1738 if (!gdp)
1739 continue;
1740 desc_count += le16_to_cpu(gdp->bg_free_blocks_count);
1741 }
1742
1743 return desc_count;
1744#endif
1745}
1746
1747static inline int
1748block_in_use(ext4_fsblk_t block, struct super_block *sb, unsigned char *map)
1749{
1750 ext4_grpblk_t offset;
1751
1752 ext4_get_group_no_and_offset(sb, block, NULL, &offset);
1753 return ext4_test_bit (offset, map);
1754}
1755
1756static inline int test_root(int a, int b)
1757{
1758 int num = b;
1759
1760 while (a > num)
1761 num *= b;
1762 return num == a;
1763}
1764
1765static int ext4_group_sparse(int group)
1766{
1767 if (group <= 1)
1768 return 1;
1769 if (!(group & 1))
1770 return 0;
1771 return (test_root(group, 7) || test_root(group, 5) ||
1772 test_root(group, 3));
1773}
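The placement rule above is easy to check outside the kernel; this user-space sketch copies the two helpers above (ext4_group_sparse renamed group_sparse) and prints which of the first 100 groups carry a superblock backup:

#include <stdio.h>

static int test_root(int a, int b)
{
	int num = b;

	while (a > num)
		num *= b;
	return num == a;
}

static int group_sparse(int group)
{
	if (group <= 1)
		return 1;
	if (!(group & 1))
		return 0;
	return test_root(group, 7) || test_root(group, 5) ||
	       test_root(group, 3);
}

int main(void)
{
	int g;

	/* prints: 0 1 3 5 7 9 25 27 49 81 */
	for (g = 0; g < 100; g++)
		if (group_sparse(g))
			printf("%d ", g);
	printf("\n");
	return 0;
}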
1774
1775/**
1776 * ext4_bg_has_super - number of blocks used by the superblock in group
1777 * @sb: superblock for filesystem
1778 * @group: group number to check
1779 *
1780 * Return the number of blocks used by the superblock (primary or backup)
1781 * in this group. Currently this will be only 0 or 1.
1782 */
1783int ext4_bg_has_super(struct super_block *sb, int group)
1784{
1785 if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
1786 EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER) &&
1787 !ext4_group_sparse(group))
1788 return 0;
1789 return 1;
1790}
1791
1792static unsigned long ext4_bg_num_gdb_meta(struct super_block *sb, int group)
1793{
1794 unsigned long metagroup = group / EXT4_DESC_PER_BLOCK(sb);
1795 unsigned long first = metagroup * EXT4_DESC_PER_BLOCK(sb);
1796 unsigned long last = first + EXT4_DESC_PER_BLOCK(sb) - 1;
1797
1798 if (group == first || group == first + 1 || group == last)
1799 return 1;
1800 return 0;
1801}
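A worked example for the META_BG case above, assuming a 4KiB block size with 32-byte group descriptors (so EXT4_DESC_PER_BLOCK(sb) == 128):

/*
 * Metagroup 0 spans groups 0..127; its single descriptor block is
 * replicated in groups 0, 1 and 127, and every other group in the
 * metagroup holds no descriptor blocks at all.
 */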
1802
1803static unsigned long ext4_bg_num_gdb_nometa(struct super_block *sb, int group)
1804{
1805 if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
1806 EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER) &&
1807 !ext4_group_sparse(group))
1808 return 0;
1809 return EXT4_SB(sb)->s_gdb_count;
1810}
1811
1812/**
1813 * ext4_bg_num_gdb - number of blocks used by the group table in group
1814 * @sb: superblock for filesystem
1815 * @group: group number to check
1816 *
1817 * Return the number of blocks used by the group descriptor table
1818 * (primary or backup) in this group. In the future there may be a
1819 * different number of descriptor blocks in each group.
1820 */
1821unsigned long ext4_bg_num_gdb(struct super_block *sb, int group)
1822{
1823 unsigned long first_meta_bg =
1824 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg);
1825 unsigned long metagroup = group / EXT4_DESC_PER_BLOCK(sb);
1826
1827 if (!EXT4_HAS_INCOMPAT_FEATURE(sb,EXT4_FEATURE_INCOMPAT_META_BG) ||
1828 metagroup < first_meta_bg)
1829 return ext4_bg_num_gdb_nometa(sb,group);
1830
1831 return ext4_bg_num_gdb_meta(sb,group);
1832
1833}
diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c
new file mode 100644
index 000000000000..11e93c169bcf
--- /dev/null
+++ b/fs/ext4/bitmap.c
@@ -0,0 +1,32 @@
1/*
2 * linux/fs/ext4/bitmap.c
3 *
4 * Copyright (C) 1992, 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI)
8 */
9
10#include <linux/buffer_head.h>
11#include <linux/jbd2.h>
12#include <linux/ext4_fs.h>
13
14#ifdef EXT4FS_DEBUG
15
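/*
 * nibblemap[n] is the number of zero bits in the 4-bit value n, e.g.
 * nibblemap[0x5] == 2 (binary 0101 has two clear bits); counting free
 * blocks then costs two table lookups per bitmap byte.
 */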
16static int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0};
17
18unsigned long ext4_count_free (struct buffer_head * map, unsigned int numchars)
19{
20 unsigned int i;
21 unsigned long sum = 0;
22
23 if (!map)
24 return (0);
25 for (i = 0; i < numchars; i++)
26 sum += nibblemap[map->b_data[i] & 0xf] +
27 nibblemap[(map->b_data[i] >> 4) & 0xf];
28 return (sum);
29}
30
31#endif /* EXT4FS_DEBUG */
32
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
new file mode 100644
index 000000000000..f8595787a70e
--- /dev/null
+++ b/fs/ext4/dir.c
@@ -0,0 +1,518 @@
1/*
2 * linux/fs/ext4/dir.c
3 *
4 * Copyright (C) 1992, 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI)
8 *
9 * from
10 *
11 * linux/fs/minix/dir.c
12 *
13 * Copyright (C) 1991, 1992 Linus Torvalds
14 *
15 * ext4 directory handling functions
16 *
17 * Big-endian to little-endian byte-swapping/bitmaps by
18 * David S. Miller (davem@caip.rutgers.edu), 1995
19 *
20 * Hash Tree Directory indexing (c) 2001 Daniel Phillips
21 *
22 */
23
24#include <linux/fs.h>
25#include <linux/jbd2.h>
26#include <linux/ext4_fs.h>
27#include <linux/buffer_head.h>
28#include <linux/smp_lock.h>
29#include <linux/slab.h>
30#include <linux/rbtree.h>
31
32static unsigned char ext4_filetype_table[] = {
33 DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
34};
35
36static int ext4_readdir(struct file *, void *, filldir_t);
37static int ext4_dx_readdir(struct file * filp,
38 void * dirent, filldir_t filldir);
39static int ext4_release_dir (struct inode * inode,
40 struct file * filp);
41
42const struct file_operations ext4_dir_operations = {
43 .llseek = generic_file_llseek,
44 .read = generic_read_dir,
45	.readdir	= ext4_readdir,		/* we take BKL. needed? */
46 .ioctl = ext4_ioctl, /* BKL held */
47#ifdef CONFIG_COMPAT
48 .compat_ioctl = ext4_compat_ioctl,
49#endif
50 .fsync = ext4_sync_file, /* BKL held */
51#ifdef CONFIG_EXT4_INDEX
52 .release = ext4_release_dir,
53#endif
54};
55
56
57static unsigned char get_dtype(struct super_block *sb, int filetype)
58{
59 if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE) ||
60 (filetype >= EXT4_FT_MAX))
61 return DT_UNKNOWN;
62
63 return (ext4_filetype_table[filetype]);
64}
65
66
67int ext4_check_dir_entry (const char * function, struct inode * dir,
68 struct ext4_dir_entry_2 * de,
69 struct buffer_head * bh,
70 unsigned long offset)
71{
72 const char * error_msg = NULL;
73 const int rlen = le16_to_cpu(de->rec_len);
74
75 if (rlen < EXT4_DIR_REC_LEN(1))
76 error_msg = "rec_len is smaller than minimal";
77 else if (rlen % 4 != 0)
78 error_msg = "rec_len % 4 != 0";
79 else if (rlen < EXT4_DIR_REC_LEN(de->name_len))
80 error_msg = "rec_len is too small for name_len";
81 else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize)
82 error_msg = "directory entry across blocks";
83 else if (le32_to_cpu(de->inode) >
84 le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count))
85 error_msg = "inode out of bounds";
86
87 if (error_msg != NULL)
88 ext4_error (dir->i_sb, function,
89 "bad entry in directory #%lu: %s - "
90 "offset=%lu, inode=%lu, rec_len=%d, name_len=%d",
91 dir->i_ino, error_msg, offset,
92 (unsigned long) le32_to_cpu(de->inode),
93 rlen, de->name_len);
94 return error_msg == NULL ? 1 : 0;
95}
96
97static int ext4_readdir(struct file * filp,
98 void * dirent, filldir_t filldir)
99{
100 int error = 0;
101 unsigned long offset;
102 int i, stored;
103 struct ext4_dir_entry_2 *de;
104 struct super_block *sb;
105 int err;
106 struct inode *inode = filp->f_dentry->d_inode;
107 int ret = 0;
108
109 sb = inode->i_sb;
110
111#ifdef CONFIG_EXT4_INDEX
112 if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
113 EXT4_FEATURE_COMPAT_DIR_INDEX) &&
114 ((EXT4_I(inode)->i_flags & EXT4_INDEX_FL) ||
115 ((inode->i_size >> sb->s_blocksize_bits) == 1))) {
116 err = ext4_dx_readdir(filp, dirent, filldir);
117 if (err != ERR_BAD_DX_DIR) {
118 ret = err;
119 goto out;
120 }
121 /*
122 * We don't set the inode dirty flag since it's not
123 * critical that it get flushed back to the disk.
124 */
125 EXT4_I(filp->f_dentry->d_inode)->i_flags &= ~EXT4_INDEX_FL;
126 }
127#endif
128 stored = 0;
129 offset = filp->f_pos & (sb->s_blocksize - 1);
130
131 while (!error && !stored && filp->f_pos < inode->i_size) {
132 unsigned long blk = filp->f_pos >> EXT4_BLOCK_SIZE_BITS(sb);
133 struct buffer_head map_bh;
134 struct buffer_head *bh = NULL;
135
136 map_bh.b_state = 0;
137 err = ext4_get_blocks_wrap(NULL, inode, blk, 1, &map_bh, 0, 0);
138 if (err > 0) {
139 page_cache_readahead(sb->s_bdev->bd_inode->i_mapping,
140 &filp->f_ra,
141 filp,
142 map_bh.b_blocknr >>
143 (PAGE_CACHE_SHIFT - inode->i_blkbits),
144 1);
145 bh = ext4_bread(NULL, inode, blk, 0, &err);
146 }
147
148 /*
149 * We ignore I/O errors on directories so users have a chance
150 * of recovering data when there's a bad sector
151 */
152 if (!bh) {
153 ext4_error (sb, "ext4_readdir",
154 "directory #%lu contains a hole at offset %lu",
155 inode->i_ino, (unsigned long)filp->f_pos);
156 filp->f_pos += sb->s_blocksize - offset;
157 continue;
158 }
159
160revalidate:
161 /* If the dir block has changed since the last call to
162 * readdir(2), then we might be pointing to an invalid
163 * dirent right now. Scan from the start of the block
164 * to make sure. */
165 if (filp->f_version != inode->i_version) {
166 for (i = 0; i < sb->s_blocksize && i < offset; ) {
167 de = (struct ext4_dir_entry_2 *)
168 (bh->b_data + i);
169 /* It's too expensive to do a full
170 * dirent test each time round this
171 * loop, but we do have to test at
172 * least that it is non-zero. A
173 * failure will be detected in the
174 * dirent test below. */
175 if (le16_to_cpu(de->rec_len) <
176 EXT4_DIR_REC_LEN(1))
177 break;
178 i += le16_to_cpu(de->rec_len);
179 }
180 offset = i;
181 filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1))
182 | offset;
183 filp->f_version = inode->i_version;
184 }
185
186 while (!error && filp->f_pos < inode->i_size
187 && offset < sb->s_blocksize) {
188 de = (struct ext4_dir_entry_2 *) (bh->b_data + offset);
189 if (!ext4_check_dir_entry ("ext4_readdir", inode, de,
190 bh, offset)) {
191 /*
192 * On error, skip the f_pos to the next block
193 */
194 filp->f_pos = (filp->f_pos |
195 (sb->s_blocksize - 1)) + 1;
196 brelse (bh);
197 ret = stored;
198 goto out;
199 }
200 offset += le16_to_cpu(de->rec_len);
201 if (le32_to_cpu(de->inode)) {
202 /* We might block in the next section
203 * if the data destination is
204 * currently swapped out. So, use a
205 * version stamp to detect whether or
206 * not the directory has been modified
207 * during the copy operation.
208 */
209 unsigned long version = filp->f_version;
210
211 error = filldir(dirent, de->name,
212 de->name_len,
213 filp->f_pos,
214 le32_to_cpu(de->inode),
215 get_dtype(sb, de->file_type));
216 if (error)
217 break;
218 if (version != filp->f_version)
219 goto revalidate;
220 stored ++;
221 }
222 filp->f_pos += le16_to_cpu(de->rec_len);
223 }
224 offset = 0;
225 brelse (bh);
226 }
227out:
228 return ret;
229}
230
231#ifdef CONFIG_EXT4_INDEX
232/*
233 * These functions convert from the major/minor hash to an f_pos
234 * value.
235 *
236 * Currently we only use the major hash number. This is unfortunate, but
237 * on 32-bit machines, the same VFS interface is used for lseek and
238 * llseek, so if we use the 64 bit offset, then the 32-bit versions of
239 * lseek/telldir/seekdir will blow out spectacularly, and from within
240 * the ext2 low-level routine, we don't know if we're being called by
241 * a 64-bit version of the system call or the 32-bit version of the
242 * system call. Worse yet, NFSv2 only allows for a 32-bit readdir
243 * cookie. Sigh.
244 */
245#define hash2pos(major, minor) (major >> 1)
246#define pos2maj_hash(pos) ((pos << 1) & 0xffffffff)
247#define pos2min_hash(pos) (0)
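A worked example of the conversion macros above:

/*
 * Worked example: major hash 0x12345678 maps to f_pos 0x091a2b3c, and
 * pos2maj_hash() recovers 0x12345678 from it. An odd major hash would
 * lose its low bit in the round trip.
 */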
248
249/*
250 * This structure holds the nodes of the red-black tree used to store
251 * the directory entry in hash order.
252 */
253struct fname {
254 __u32 hash;
255 __u32 minor_hash;
256 struct rb_node rb_hash;
257 struct fname *next;
258 __u32 inode;
259 __u8 name_len;
260 __u8 file_type;
261 char name[0];
262};
263
264/*
265 * This function implements a non-recursive way of freeing all of the
266 * nodes in the red-black tree.
267 */
268static void free_rb_tree_fname(struct rb_root *root)
269{
270 struct rb_node *n = root->rb_node;
271 struct rb_node *parent;
272 struct fname *fname;
273
274 while (n) {
275 /* Do the node's children first */
276 if ((n)->rb_left) {
277 n = n->rb_left;
278 continue;
279 }
280 if (n->rb_right) {
281 n = n->rb_right;
282 continue;
283 }
284 /*
285 * The node has no children; free it, and then zero
286 * out parent's link to it. Finally go to the
287 * beginning of the loop and try to free the parent
288 * node.
289 */
290 parent = rb_parent(n);
291 fname = rb_entry(n, struct fname, rb_hash);
292 while (fname) {
293 struct fname * old = fname;
294 fname = fname->next;
295 kfree (old);
296 }
297 if (!parent)
298 root->rb_node = NULL;
299 else if (parent->rb_left == n)
300 parent->rb_left = NULL;
301 else if (parent->rb_right == n)
302 parent->rb_right = NULL;
303 n = parent;
304 }
305 root->rb_node = NULL;
306}
307
308
309static struct dir_private_info *create_dir_info(loff_t pos)
310{
311 struct dir_private_info *p;
312
313 p = kmalloc(sizeof(struct dir_private_info), GFP_KERNEL);
314 if (!p)
315 return NULL;
316 p->root.rb_node = NULL;
317 p->curr_node = NULL;
318 p->extra_fname = NULL;
319 p->last_pos = 0;
320 p->curr_hash = pos2maj_hash(pos);
321 p->curr_minor_hash = pos2min_hash(pos);
322 p->next_hash = 0;
323 return p;
324}
325
326void ext4_htree_free_dir_info(struct dir_private_info *p)
327{
328 free_rb_tree_fname(&p->root);
329 kfree(p);
330}
331
332/*
333 * Given a directory entry, enter it into the fname rb tree.
334 */
335int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
336 __u32 minor_hash,
337 struct ext4_dir_entry_2 *dirent)
338{
339 struct rb_node **p, *parent = NULL;
340 struct fname * fname, *new_fn;
341 struct dir_private_info *info;
342 int len;
343
344 info = (struct dir_private_info *) dir_file->private_data;
345 p = &info->root.rb_node;
346
347 /* Create and allocate the fname structure */
348 len = sizeof(struct fname) + dirent->name_len + 1;
349 new_fn = kzalloc(len, GFP_KERNEL);
350 if (!new_fn)
351 return -ENOMEM;
352 new_fn->hash = hash;
353 new_fn->minor_hash = minor_hash;
354 new_fn->inode = le32_to_cpu(dirent->inode);
355 new_fn->name_len = dirent->name_len;
356 new_fn->file_type = dirent->file_type;
357 memcpy(new_fn->name, dirent->name, dirent->name_len);
358 new_fn->name[dirent->name_len] = 0;
359
360 while (*p) {
361 parent = *p;
362 fname = rb_entry(parent, struct fname, rb_hash);
363
364 /*
365 * If the hash and minor hash match up, then we put
366 * them on a linked list. This rarely happens...
367 */
368 if ((new_fn->hash == fname->hash) &&
369 (new_fn->minor_hash == fname->minor_hash)) {
370 new_fn->next = fname->next;
371 fname->next = new_fn;
372 return 0;
373 }
374
375 if (new_fn->hash < fname->hash)
376 p = &(*p)->rb_left;
377 else if (new_fn->hash > fname->hash)
378 p = &(*p)->rb_right;
379 else if (new_fn->minor_hash < fname->minor_hash)
380 p = &(*p)->rb_left;
381 else /* if (new_fn->minor_hash > fname->minor_hash) */
382 p = &(*p)->rb_right;
383 }
384
385 rb_link_node(&new_fn->rb_hash, parent, p);
386 rb_insert_color(&new_fn->rb_hash, &info->root);
387 return 0;
388}
389
390
391
392/*
393 * This is a helper function for ext4_dx_readdir. It calls filldir
394 * for all entries on the fname linked list. (Normally there is only
395 * one entry on the linked list, unless there are 62 bit hash collisions.)
396 */
397static int call_filldir(struct file * filp, void * dirent,
398 filldir_t filldir, struct fname *fname)
399{
400 struct dir_private_info *info = filp->private_data;
401 loff_t curr_pos;
402 struct inode *inode = filp->f_dentry->d_inode;
403 struct super_block * sb;
404 int error;
405
406 sb = inode->i_sb;
407
408 if (!fname) {
409 printk("call_filldir: called with null fname?!?\n");
410 return 0;
411 }
412 curr_pos = hash2pos(fname->hash, fname->minor_hash);
413 while (fname) {
414 error = filldir(dirent, fname->name,
415 fname->name_len, curr_pos,
416 fname->inode,
417 get_dtype(sb, fname->file_type));
418 if (error) {
419 filp->f_pos = curr_pos;
420 info->extra_fname = fname->next;
421 return error;
422 }
423 fname = fname->next;
424 }
425 return 0;
426}
427
428static int ext4_dx_readdir(struct file * filp,
429 void * dirent, filldir_t filldir)
430{
431 struct dir_private_info *info = filp->private_data;
432 struct inode *inode = filp->f_dentry->d_inode;
433 struct fname *fname;
434 int ret;
435
436 if (!info) {
437 info = create_dir_info(filp->f_pos);
438 if (!info)
439 return -ENOMEM;
440 filp->private_data = info;
441 }
442
443 if (filp->f_pos == EXT4_HTREE_EOF)
444 return 0; /* EOF */
445
446	/* Someone has messed with f_pos; reset the world */
447 if (info->last_pos != filp->f_pos) {
448 free_rb_tree_fname(&info->root);
449 info->curr_node = NULL;
450 info->extra_fname = NULL;
451 info->curr_hash = pos2maj_hash(filp->f_pos);
452 info->curr_minor_hash = pos2min_hash(filp->f_pos);
453 }
454
455 /*
456 * If there are any leftover names on the hash collision
457 * chain, return them first.
458 */
459 if (info->extra_fname &&
460 call_filldir(filp, dirent, filldir, info->extra_fname))
461 goto finished;
462
463 if (!info->curr_node)
464 info->curr_node = rb_first(&info->root);
465
466 while (1) {
467 /*
468 * Fill the rbtree if we have no more entries,
469 * or the inode has changed since we last read in the
470 * cached entries.
471 */
472 if ((!info->curr_node) ||
473 (filp->f_version != inode->i_version)) {
474 info->curr_node = NULL;
475 free_rb_tree_fname(&info->root);
476 filp->f_version = inode->i_version;
477 ret = ext4_htree_fill_tree(filp, info->curr_hash,
478 info->curr_minor_hash,
479 &info->next_hash);
480 if (ret < 0)
481 return ret;
482 if (ret == 0) {
483 filp->f_pos = EXT4_HTREE_EOF;
484 break;
485 }
486 info->curr_node = rb_first(&info->root);
487 }
488
489 fname = rb_entry(info->curr_node, struct fname, rb_hash);
490 info->curr_hash = fname->hash;
491 info->curr_minor_hash = fname->minor_hash;
492 if (call_filldir(filp, dirent, filldir, fname))
493 break;
494
495 info->curr_node = rb_next(info->curr_node);
496 if (!info->curr_node) {
497 if (info->next_hash == ~0) {
498 filp->f_pos = EXT4_HTREE_EOF;
499 break;
500 }
501 info->curr_hash = info->next_hash;
502 info->curr_minor_hash = 0;
503 }
504 }
505finished:
506 info->last_pos = filp->f_pos;
507 return 0;
508}
509
510static int ext4_release_dir (struct inode * inode, struct file * filp)
511{
512 if (filp->private_data)
513 ext4_htree_free_dir_info(filp->private_data);
514
515 return 0;
516}
517
518#endif
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
new file mode 100644
index 000000000000..2608dce18f3e
--- /dev/null
+++ b/fs/ext4/extents.c
@@ -0,0 +1,2152 @@
1/*
2 * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
3 * Written by Alex Tomas <alex@clusterfs.com>
4 *
5 * Architecture independence:
6 * Copyright (c) 2005, Bull S.A.
7 * Written by Pierre Peiffer <pierre.peiffer@bull.net>
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License version 2 as
11 * published by the Free Software Foundation.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 */
22
23/*
24 * Extents support for EXT4
25 *
26 * TODO:
27 * - ext4*_error() should be used in some situations
28 * - analyze all BUG()/BUG_ON(), use -EIO where appropriate
29 * - smart tree reduction
30 */
31
32#include <linux/module.h>
33#include <linux/fs.h>
34#include <linux/time.h>
35#include <linux/ext4_jbd2.h>
36#include <linux/jbd.h>
37#include <linux/smp_lock.h>
38#include <linux/highuid.h>
39#include <linux/pagemap.h>
40#include <linux/quotaops.h>
41#include <linux/string.h>
42#include <linux/slab.h>
43#include <linux/ext4_fs_extents.h>
44#include <asm/uaccess.h>
45
46
47/*
48 * ext_pblock:
49 * combine low and high parts of physical block number into ext4_fsblk_t
50 */
51static inline ext4_fsblk_t ext_pblock(struct ext4_extent *ex)
52{
53 ext4_fsblk_t block;
54
55 block = le32_to_cpu(ex->ee_start);
56 block |= ((ext4_fsblk_t) le16_to_cpu(ex->ee_start_hi) << 31) << 1;
57 return block;
58}
59
60/*
61 * idx_pblock:
62 * combine low and high parts of a leaf physical block number into ext4_fsblk_t
63 */
64static inline ext4_fsblk_t idx_pblock(struct ext4_extent_idx *ix)
65{
66 ext4_fsblk_t block;
67
68 block = le32_to_cpu(ix->ei_leaf);
69 block |= ((ext4_fsblk_t) le16_to_cpu(ix->ei_leaf_hi) << 31) << 1;
70 return block;
71}
72
73/*
74 * ext4_ext_store_pblock:
75 * stores a large physical block number into an extent struct,
76 * breaking it into parts
77 */
78static inline void ext4_ext_store_pblock(struct ext4_extent *ex, ext4_fsblk_t pb)
79{
80 ex->ee_start = cpu_to_le32((unsigned long) (pb & 0xffffffff));
81 ex->ee_start_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
82}
83
84/*
85 * ext4_idx_store_pblock:
86 * stores a large physical block number into an index struct,
87 * breaking it into parts
88 */
89static inline void ext4_idx_store_pblock(struct ext4_extent_idx *ix, ext4_fsblk_t pb)
90{
91 ix->ei_leaf = cpu_to_le32((unsigned long) (pb & 0xffffffff));
92 ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
93}
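A worked example of the 48-bit split performed by the helpers above; the two-step "<< 31 << 1" is presumably there so the shift stays defined even if ext4_fsblk_t were only 32 bits wide:

/*
 * Worked example: physical block 0x123456789abc is stored as
 * ee_start = 0x56789abc (low 32 bits) and ee_start_hi = 0x1234
 * (high 16 bits); ext_pblock() reassembles 0x123456789abc.
 */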
94
95static int ext4_ext_check_header(const char *function, struct inode *inode,
96 struct ext4_extent_header *eh)
97{
98 const char *error_msg = NULL;
99
100 if (unlikely(eh->eh_magic != EXT4_EXT_MAGIC)) {
101 error_msg = "invalid magic";
102 goto corrupted;
103 }
104 if (unlikely(eh->eh_max == 0)) {
105 error_msg = "invalid eh_max";
106 goto corrupted;
107 }
108 if (unlikely(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max))) {
109 error_msg = "invalid eh_entries";
110 goto corrupted;
111 }
112 return 0;
113
114corrupted:
115 ext4_error(inode->i_sb, function,
116 "bad header in inode #%lu: %s - magic %x, "
117 "entries %u, max %u, depth %u",
118 inode->i_ino, error_msg, le16_to_cpu(eh->eh_magic),
119 le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
120 le16_to_cpu(eh->eh_depth));
121
122 return -EIO;
123}
124
125static handle_t *ext4_ext_journal_restart(handle_t *handle, int needed)
126{
127 int err;
128
129 if (handle->h_buffer_credits > needed)
130 return handle;
131 if (!ext4_journal_extend(handle, needed))
132 return handle;
133 err = ext4_journal_restart(handle, needed);
134
135 return handle;
136}
137
138/*
139 * could return:
140 * - EROFS
141 * - ENOMEM
142 */
143static int ext4_ext_get_access(handle_t *handle, struct inode *inode,
144 struct ext4_ext_path *path)
145{
146 if (path->p_bh) {
147 /* path points to block */
148 return ext4_journal_get_write_access(handle, path->p_bh);
149 }
150 /* path points to leaf/index in inode body */
151 /* we use in-core data, no need to protect them */
152 return 0;
153}
154
155/*
156 * could return:
157 * - EROFS
158 * - ENOMEM
159 * - EIO
160 */
161static int ext4_ext_dirty(handle_t *handle, struct inode *inode,
162 struct ext4_ext_path *path)
163{
164 int err;
165 if (path->p_bh) {
166 /* path points to block */
167 err = ext4_journal_dirty_metadata(handle, path->p_bh);
168 } else {
169 /* path points to leaf/index in inode body */
170 err = ext4_mark_inode_dirty(handle, inode);
171 }
172 return err;
173}
174
175static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
176 struct ext4_ext_path *path,
177 ext4_fsblk_t block)
178{
179 struct ext4_inode_info *ei = EXT4_I(inode);
180 ext4_fsblk_t bg_start;
181 ext4_grpblk_t colour;
182 int depth;
183
184 if (path) {
185 struct ext4_extent *ex;
186 depth = path->p_depth;
187
188 /* try to predict block placement */
189 if ((ex = path[depth].p_ext))
190 return ext_pblock(ex)+(block-le32_to_cpu(ex->ee_block));
191
192 /* it looks like index is empty;
193 * try to find starting block from index itself */
194 if (path[depth].p_bh)
195 return path[depth].p_bh->b_blocknr;
196 }
197
198 /* OK. use inode's group */
199 bg_start = (ei->i_block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) +
200 le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_first_data_block);
201 colour = (current->pid % 16) *
202 (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
203 return bg_start + colour + block;
204}
205
206static ext4_fsblk_t
207ext4_ext_new_block(handle_t *handle, struct inode *inode,
208 struct ext4_ext_path *path,
209 struct ext4_extent *ex, int *err)
210{
211 ext4_fsblk_t goal, newblock;
212
213 goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
214 newblock = ext4_new_block(handle, inode, goal, err);
215 return newblock;
216}
217
218static inline int ext4_ext_space_block(struct inode *inode)
219{
220 int size;
221
222 size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
223 / sizeof(struct ext4_extent);
224#ifdef AGRESSIVE_TEST
225 if (size > 6)
226 size = 6;
227#endif
228 return size;
229}
230
231static inline int ext4_ext_space_block_idx(struct inode *inode)
232{
233 int size;
234
235 size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
236 / sizeof(struct ext4_extent_idx);
237#ifdef AGRESSIVE_TEST
238 if (size > 5)
239 size = 5;
240#endif
241 return size;
242}
243
244static inline int ext4_ext_space_root(struct inode *inode)
245{
246 int size;
247
248 size = sizeof(EXT4_I(inode)->i_data);
249 size -= sizeof(struct ext4_extent_header);
250 size /= sizeof(struct ext4_extent);
251#ifdef AGRESSIVE_TEST
252 if (size > 3)
253 size = 3;
254#endif
255 return size;
256}
257
258static inline int ext4_ext_space_root_idx(struct inode *inode)
259{
260 int size;
261
262 size = sizeof(EXT4_I(inode)->i_data);
263 size -= sizeof(struct ext4_extent_header);
264 size /= sizeof(struct ext4_extent_idx);
265#ifdef AGRESSIVE_TEST
266 if (size > 4)
267 size = 4;
268#endif
269 return size;
270}
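A worked example of the root capacities computed above (on-disk sizes: i_data is 60 bytes; the extent header, struct ext4_extent and struct ext4_extent_idx are 12 bytes each):

/*
 * (60 - 12) / 12 = 4, so the root in the inode body holds at most
 * four entries, whether extents or indexes (ignoring the
 * AGRESSIVE_TEST caps used for testing deep trees).
 */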
271
272#ifdef EXT_DEBUG
273static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
274{
275 int k, l = path->p_depth;
276
277 ext_debug("path:");
278 for (k = 0; k <= l; k++, path++) {
279 if (path->p_idx) {
280 ext_debug(" %d->%llu", le32_to_cpu(path->p_idx->ei_block),
281 idx_pblock(path->p_idx));
282 } else if (path->p_ext) {
283 ext_debug(" %d:%d:%llu ",
284 le32_to_cpu(path->p_ext->ee_block),
285 le16_to_cpu(path->p_ext->ee_len),
286 ext_pblock(path->p_ext));
287 } else
288 ext_debug(" []");
289 }
290 ext_debug("\n");
291}
292
293static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
294{
295 int depth = ext_depth(inode);
296 struct ext4_extent_header *eh;
297 struct ext4_extent *ex;
298 int i;
299
300 if (!path)
301 return;
302
303 eh = path[depth].p_hdr;
304 ex = EXT_FIRST_EXTENT(eh);
305
306 for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) {
307 ext_debug("%d:%d:%llu ", le32_to_cpu(ex->ee_block),
308 le16_to_cpu(ex->ee_len), ext_pblock(ex));
309 }
310 ext_debug("\n");
311}
312#else
313#define ext4_ext_show_path(inode,path)
314#define ext4_ext_show_leaf(inode,path)
315#endif
316
317static void ext4_ext_drop_refs(struct ext4_ext_path *path)
318{
319 int depth = path->p_depth;
320 int i;
321
322 for (i = 0; i <= depth; i++, path++)
323 if (path->p_bh) {
324 brelse(path->p_bh);
325 path->p_bh = NULL;
326 }
327}
328
329/*
330 * ext4_ext_binsearch_idx:
331 * binary search for the closest index of the given block
332 */
333static void
334ext4_ext_binsearch_idx(struct inode *inode, struct ext4_ext_path *path, int block)
335{
336 struct ext4_extent_header *eh = path->p_hdr;
337 struct ext4_extent_idx *r, *l, *m;
338
339 BUG_ON(eh->eh_magic != EXT4_EXT_MAGIC);
340 BUG_ON(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max));
341 BUG_ON(le16_to_cpu(eh->eh_entries) <= 0);
342
343 ext_debug("binsearch for %d(idx): ", block);
344
345 l = EXT_FIRST_INDEX(eh) + 1;
346 r = EXT_FIRST_INDEX(eh) + le16_to_cpu(eh->eh_entries) - 1;
347 while (l <= r) {
348 m = l + (r - l) / 2;
349 if (block < le32_to_cpu(m->ei_block))
350 r = m - 1;
351 else
352 l = m + 1;
353 ext_debug("%p(%u):%p(%u):%p(%u) ", l, l->ei_block,
354 m, m->ei_block, r, r->ei_block);
355 }
356
357 path->p_idx = l - 1;
358 ext_debug(" -> %d->%lld ", le32_to_cpu(path->p_idx->ei_block),
359			idx_pblock(path->p_idx));
360
361#ifdef CHECK_BINSEARCH
362 {
363 struct ext4_extent_idx *chix, *ix;
364 int k;
365
366 chix = ix = EXT_FIRST_INDEX(eh);
367 for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) {
368 if (k != 0 &&
369 le32_to_cpu(ix->ei_block) <= le32_to_cpu(ix[-1].ei_block)) {
370 printk("k=%d, ix=0x%p, first=0x%p\n", k,
371 ix, EXT_FIRST_INDEX(eh));
372 printk("%u <= %u\n",
373 le32_to_cpu(ix->ei_block),
374 le32_to_cpu(ix[-1].ei_block));
375 }
376 BUG_ON(k && le32_to_cpu(ix->ei_block)
377 <= le32_to_cpu(ix[-1].ei_block));
378 if (block < le32_to_cpu(ix->ei_block))
379 break;
380 chix = ix;
381 }
382 BUG_ON(chix != path->p_idx);
383 }
384#endif
385
386}
387
388/*
389 * ext4_ext_binsearch:
390 * binary search for closest extent of the given block
391 */
392static void
393ext4_ext_binsearch(struct inode *inode, struct ext4_ext_path *path, int block)
394{
395 struct ext4_extent_header *eh = path->p_hdr;
396 struct ext4_extent *r, *l, *m;
397
398 BUG_ON(eh->eh_magic != EXT4_EXT_MAGIC);
399 BUG_ON(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max));
400
401 if (eh->eh_entries == 0) {
402 /*
403 * this leaf is empty:
404 * we get such a leaf in split/add case
405 */
406 return;
407 }
408
409 ext_debug("binsearch for %d: ", block);
410
411 l = EXT_FIRST_EXTENT(eh) + 1;
412 r = EXT_FIRST_EXTENT(eh) + le16_to_cpu(eh->eh_entries) - 1;
413
414 while (l <= r) {
415 m = l + (r - l) / 2;
416 if (block < le32_to_cpu(m->ee_block))
417 r = m - 1;
418 else
419 l = m + 1;
420 ext_debug("%p(%u):%p(%u):%p(%u) ", l, l->ee_block,
421 m, m->ee_block, r, r->ee_block);
422 }
423
424 path->p_ext = l - 1;
425 ext_debug(" -> %d:%llu:%d ",
426 le32_to_cpu(path->p_ext->ee_block),
427 ext_pblock(path->p_ext),
428 le16_to_cpu(path->p_ext->ee_len));
429
430#ifdef CHECK_BINSEARCH
431 {
432 struct ext4_extent *chex, *ex;
433 int k;
434
435 chex = ex = EXT_FIRST_EXTENT(eh);
436 for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ex++) {
437 BUG_ON(k && le32_to_cpu(ex->ee_block)
438 <= le32_to_cpu(ex[-1].ee_block));
439 if (block < le32_to_cpu(ex->ee_block))
440 break;
441 chex = ex;
442 }
443 BUG_ON(chex != path->p_ext);
444 }
445#endif
446
447}
448
449int ext4_ext_tree_init(handle_t *handle, struct inode *inode)
450{
451 struct ext4_extent_header *eh;
452
453 eh = ext_inode_hdr(inode);
454 eh->eh_depth = 0;
455 eh->eh_entries = 0;
456 eh->eh_magic = EXT4_EXT_MAGIC;
457 eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode));
458 ext4_mark_inode_dirty(handle, inode);
459 ext4_ext_invalidate_cache(inode);
460 return 0;
461}
462
463struct ext4_ext_path *
464ext4_ext_find_extent(struct inode *inode, int block, struct ext4_ext_path *path)
465{
466 struct ext4_extent_header *eh;
467 struct buffer_head *bh;
468 short int depth, i, ppos = 0, alloc = 0;
469
470 eh = ext_inode_hdr(inode);
471 BUG_ON(eh == NULL);
472 if (ext4_ext_check_header(__FUNCTION__, inode, eh))
473 return ERR_PTR(-EIO);
474
475 i = depth = ext_depth(inode);
476
477 /* account possible depth increase */
478 if (!path) {
479 path = kmalloc(sizeof(struct ext4_ext_path) * (depth + 2),
480 GFP_NOFS);
481 if (!path)
482 return ERR_PTR(-ENOMEM);
483 alloc = 1;
484 }
485 memset(path, 0, sizeof(struct ext4_ext_path) * (depth + 1));
486 path[0].p_hdr = eh;
487
488 /* walk through the tree */
489 while (i) {
490 ext_debug("depth %d: num %d, max %d\n",
491 ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
492 ext4_ext_binsearch_idx(inode, path + ppos, block);
493 path[ppos].p_block = idx_pblock(path[ppos].p_idx);
494 path[ppos].p_depth = i;
495 path[ppos].p_ext = NULL;
496
497 bh = sb_bread(inode->i_sb, path[ppos].p_block);
498 if (!bh)
499 goto err;
500
501 eh = ext_block_hdr(bh);
502 ppos++;
503 BUG_ON(ppos > depth);
504 path[ppos].p_bh = bh;
505 path[ppos].p_hdr = eh;
506 i--;
507
508 if (ext4_ext_check_header(__FUNCTION__, inode, eh))
509 goto err;
510 }
511
512 path[ppos].p_depth = i;
513 path[ppos].p_hdr = eh;
514 path[ppos].p_ext = NULL;
515 path[ppos].p_idx = NULL;
516
517 if (ext4_ext_check_header(__FUNCTION__, inode, eh))
518 goto err;
519
520 /* find extent */
521 ext4_ext_binsearch(inode, path + ppos, block);
522
523 ext4_ext_show_path(inode, path);
524
525 return path;
526
527err:
528 ext4_ext_drop_refs(path);
529 if (alloc)
530 kfree(path);
531 return ERR_PTR(-EIO);
532}
533
534/*
535 * ext4_ext_insert_index:
536 * insert new index [@logical;@ptr] into the block at @curp;
537 * check where to insert: before @curp or after @curp
538 */
539static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
540 struct ext4_ext_path *curp,
541 int logical, ext4_fsblk_t ptr)
542{
543 struct ext4_extent_idx *ix;
544 int len, err;
545
546 if ((err = ext4_ext_get_access(handle, inode, curp)))
547 return err;
548
549 BUG_ON(logical == le32_to_cpu(curp->p_idx->ei_block));
550 len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx;
551 if (logical > le32_to_cpu(curp->p_idx->ei_block)) {
552 /* insert after */
553 if (curp->p_idx != EXT_LAST_INDEX(curp->p_hdr)) {
554 len = (len - 1) * sizeof(struct ext4_extent_idx);
555 len = len < 0 ? 0 : len;
556 ext_debug("insert new index %d after: %d. "
557 "move %d from 0x%p to 0x%p\n",
558 logical, ptr, len,
559 (curp->p_idx + 1), (curp->p_idx + 2));
560 memmove(curp->p_idx + 2, curp->p_idx + 1, len);
561 }
562 ix = curp->p_idx + 1;
563 } else {
564 /* insert before */
565 len = len * sizeof(struct ext4_extent_idx);
566 len = len < 0 ? 0 : len;
567 ext_debug("insert new index %d before: %d. "
568 "move %d from 0x%p to 0x%p\n",
569 logical, ptr, len,
570 curp->p_idx, (curp->p_idx + 1));
571 memmove(curp->p_idx + 1, curp->p_idx, len);
572 ix = curp->p_idx;
573 }
574
575 ix->ei_block = cpu_to_le32(logical);
576 ext4_idx_store_pblock(ix, ptr);
577 curp->p_hdr->eh_entries = cpu_to_le16(le16_to_cpu(curp->p_hdr->eh_entries)+1);
578
579 BUG_ON(le16_to_cpu(curp->p_hdr->eh_entries)
580 > le16_to_cpu(curp->p_hdr->eh_max));
581 BUG_ON(ix > EXT_LAST_INDEX(curp->p_hdr));
582
583 err = ext4_ext_dirty(handle, inode, curp);
584 ext4_std_error(inode->i_sb, err);
585
586 return err;
587}
588
589/*
590 * ext4_ext_split:
591 * inserts new subtree into the path, using free index entry
592 * at depth @at:
593 * - allocates all needed blocks (new leaf and all intermediate index blocks)
594 * - makes decision where to split
595 * - moves remaining extents and index entries (right to the split point)
596 * into the newly allocated blocks
597 * - initializes subtree
598 */
599static int ext4_ext_split(handle_t *handle, struct inode *inode,
600 struct ext4_ext_path *path,
601 struct ext4_extent *newext, int at)
602{
603 struct buffer_head *bh = NULL;
604 int depth = ext_depth(inode);
605 struct ext4_extent_header *neh;
606 struct ext4_extent_idx *fidx;
607 struct ext4_extent *ex;
608 int i = at, k, m, a;
609 ext4_fsblk_t newblock, oldblock;
610 __le32 border;
611 ext4_fsblk_t *ablocks = NULL; /* array of allocated blocks */
612 int err = 0;
613
614 /* make decision: where to split? */
615 /* FIXME: now decision is simplest: at current extent */
616
617 /* if current leaf will be split, then we should use
618 * border from split point */
619 BUG_ON(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr));
620 if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) {
621 border = path[depth].p_ext[1].ee_block;
622 ext_debug("leaf will be split."
623 " next leaf starts at %d\n",
624 le32_to_cpu(border));
625 } else {
626 border = newext->ee_block;
627 ext_debug("leaf will be added."
628 " next leaf starts at %d\n",
629 le32_to_cpu(border));
630 }
631
632 /*
633	 * If an error occurs, we break processing
634	 * and mark the filesystem read-only. The index won't
635	 * be inserted and the tree will remain in a consistent
636	 * state. The next mount will repair the buffers, too.
637 */
638
639 /*
640 * Get array to track all allocated blocks.
641 * We need this to handle errors and free blocks
642 * upon them.
643 */
644 ablocks = kmalloc(sizeof(ext4_fsblk_t) * depth, GFP_NOFS);
645 if (!ablocks)
646 return -ENOMEM;
647 memset(ablocks, 0, sizeof(ext4_fsblk_t) * depth);
648
649 /* allocate all needed blocks */
650 ext_debug("allocate %d blocks for indexes/leaf\n", depth - at);
651 for (a = 0; a < depth - at; a++) {
652 newblock = ext4_ext_new_block(handle, inode, path, newext, &err);
653 if (newblock == 0)
654 goto cleanup;
655 ablocks[a] = newblock;
656 }
657
658 /* initialize new leaf */
659 newblock = ablocks[--a];
660 BUG_ON(newblock == 0);
661 bh = sb_getblk(inode->i_sb, newblock);
662 if (!bh) {
663 err = -EIO;
664 goto cleanup;
665 }
666 lock_buffer(bh);
667
668 if ((err = ext4_journal_get_create_access(handle, bh)))
669 goto cleanup;
670
671 neh = ext_block_hdr(bh);
672 neh->eh_entries = 0;
673 neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode));
674 neh->eh_magic = EXT4_EXT_MAGIC;
675 neh->eh_depth = 0;
676 ex = EXT_FIRST_EXTENT(neh);
677
678 /* move remainder of path[depth] to the new leaf */
679 BUG_ON(path[depth].p_hdr->eh_entries != path[depth].p_hdr->eh_max);
680 /* start copy from next extent */
681 /* TODO: we could do it by single memmove */
682 m = 0;
683 path[depth].p_ext++;
684 while (path[depth].p_ext <=
685 EXT_MAX_EXTENT(path[depth].p_hdr)) {
686 ext_debug("move %d:%llu:%d in new leaf %llu\n",
687 le32_to_cpu(path[depth].p_ext->ee_block),
688 ext_pblock(path[depth].p_ext),
689 le16_to_cpu(path[depth].p_ext->ee_len),
690 newblock);
691 /*memmove(ex++, path[depth].p_ext++,
692 sizeof(struct ext4_extent));
693 neh->eh_entries++;*/
694 path[depth].p_ext++;
695 m++;
696 }
697 if (m) {
698 memmove(ex, path[depth].p_ext-m, sizeof(struct ext4_extent)*m);
699 neh->eh_entries = cpu_to_le16(le16_to_cpu(neh->eh_entries)+m);
700 }
701
702 set_buffer_uptodate(bh);
703 unlock_buffer(bh);
704
705 if ((err = ext4_journal_dirty_metadata(handle, bh)))
706 goto cleanup;
707 brelse(bh);
708 bh = NULL;
709
710 /* correct old leaf */
711 if (m) {
712 if ((err = ext4_ext_get_access(handle, inode, path + depth)))
713 goto cleanup;
714 path[depth].p_hdr->eh_entries =
715 cpu_to_le16(le16_to_cpu(path[depth].p_hdr->eh_entries)-m);
716 if ((err = ext4_ext_dirty(handle, inode, path + depth)))
717 goto cleanup;
718
719 }
720
721 /* create intermediate indexes */
722 k = depth - at - 1;
723 BUG_ON(k < 0);
724 if (k)
725 ext_debug("create %d intermediate indices\n", k);
726 /* insert new index into current index block */
727 /* current depth stored in i var */
728 i = depth - 1;
729 while (k--) {
730 oldblock = newblock;
731 newblock = ablocks[--a];
732 bh = sb_getblk(inode->i_sb, (ext4_fsblk_t)newblock);
733 if (!bh) {
734 err = -EIO;
735 goto cleanup;
736 }
737 lock_buffer(bh);
738
739 if ((err = ext4_journal_get_create_access(handle, bh)))
740 goto cleanup;
741
742 neh = ext_block_hdr(bh);
743 neh->eh_entries = cpu_to_le16(1);
744 neh->eh_magic = EXT4_EXT_MAGIC;
745 neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode));
746 neh->eh_depth = cpu_to_le16(depth - i);
747 fidx = EXT_FIRST_INDEX(neh);
748 fidx->ei_block = border;
749 ext4_idx_store_pblock(fidx, oldblock);
750
751 ext_debug("int.index at %d (block %llu): %lu -> %llu\n", i,
752 newblock, (unsigned long) le32_to_cpu(border),
753 oldblock);
754 /* copy indexes */
755 m = 0;
756 path[i].p_idx++;
757
758 ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx,
759 EXT_MAX_INDEX(path[i].p_hdr));
760 BUG_ON(EXT_MAX_INDEX(path[i].p_hdr) !=
761 EXT_LAST_INDEX(path[i].p_hdr));
762 while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) {
763 ext_debug("%d: move %d:%d in new index %llu\n", i,
764 le32_to_cpu(path[i].p_idx->ei_block),
765 idx_pblock(path[i].p_idx),
766 newblock);
767 /*memmove(++fidx, path[i].p_idx++,
768 sizeof(struct ext4_extent_idx));
769 neh->eh_entries++;
770 BUG_ON(neh->eh_entries > neh->eh_max);*/
771 path[i].p_idx++;
772 m++;
773 }
774 if (m) {
775 memmove(++fidx, path[i].p_idx - m,
776 sizeof(struct ext4_extent_idx) * m);
777 neh->eh_entries =
778 cpu_to_le16(le16_to_cpu(neh->eh_entries) + m);
779 }
780 set_buffer_uptodate(bh);
781 unlock_buffer(bh);
782
783 if ((err = ext4_journal_dirty_metadata(handle, bh)))
784 goto cleanup;
785 brelse(bh);
786 bh = NULL;
787
788 /* correct old index */
789 if (m) {
790 err = ext4_ext_get_access(handle, inode, path + i);
791 if (err)
792 goto cleanup;
793 path[i].p_hdr->eh_entries = cpu_to_le16(le16_to_cpu(path[i].p_hdr->eh_entries)-m);
794 err = ext4_ext_dirty(handle, inode, path + i);
795 if (err)
796 goto cleanup;
797 }
798
799 i--;
800 }
801
802 /* insert new index */
803 if (err)
804 goto cleanup;
805
806 err = ext4_ext_insert_index(handle, inode, path + at,
807 le32_to_cpu(border), newblock);
808
809cleanup:
810 if (bh) {
811 if (buffer_locked(bh))
812 unlock_buffer(bh);
813 brelse(bh);
814 }
815
816 if (err) {
817 /* free all allocated blocks in error case */
818 for (i = 0; i < depth; i++) {
819 if (!ablocks[i])
820 continue;
821 ext4_free_blocks(handle, inode, ablocks[i], 1);
822 }
823 }
824 kfree(ablocks);
825
826 return err;
827}
828
829/*
830 * ext4_ext_grow_indepth:
831 * implements tree growing procedure:
832 * - allocates new block
833 * - moves top-level data (index block or leaf) into the new block
834 * - initializes new top-level, creating index that points to the
835 * just created block
836 */
837static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
838 struct ext4_ext_path *path,
839 struct ext4_extent *newext)
840{
841 struct ext4_ext_path *curp = path;
842 struct ext4_extent_header *neh;
843 struct ext4_extent_idx *fidx;
844 struct buffer_head *bh;
845 ext4_fsblk_t newblock;
846 int err = 0;
847
848 newblock = ext4_ext_new_block(handle, inode, path, newext, &err);
849 if (newblock == 0)
850 return err;
851
852 bh = sb_getblk(inode->i_sb, newblock);
853 if (!bh) {
854 err = -EIO;
855 ext4_std_error(inode->i_sb, err);
856 return err;
857 }
858 lock_buffer(bh);
859
860 if ((err = ext4_journal_get_create_access(handle, bh))) {
861 unlock_buffer(bh);
862 goto out;
863 }
864
865 /* move top-level index/leaf into new block */
866 memmove(bh->b_data, curp->p_hdr, sizeof(EXT4_I(inode)->i_data));
867
868 /* set size of new block */
869 neh = ext_block_hdr(bh);
870 /* old root could have indexes or leaves,
871 * so calculate eh_max the right way */
872 if (ext_depth(inode))
873 neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode));
874 else
875 neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode));
876 neh->eh_magic = EXT4_EXT_MAGIC;
877 set_buffer_uptodate(bh);
878 unlock_buffer(bh);
879
880 if ((err = ext4_journal_dirty_metadata(handle, bh)))
881 goto out;
882
883 /* create index in new top-level index: num,max,pointer */
884 if ((err = ext4_ext_get_access(handle, inode, curp)))
885 goto out;
886
887 curp->p_hdr->eh_magic = EXT4_EXT_MAGIC;
888 curp->p_hdr->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode));
889 curp->p_hdr->eh_entries = cpu_to_le16(1);
890 curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr);
891 /* FIXME: it works, but actually path[0] can be index */
892 curp->p_idx->ei_block = EXT_FIRST_EXTENT(path[0].p_hdr)->ee_block;
893 ext4_idx_store_pblock(curp->p_idx, newblock);
894
895 neh = ext_inode_hdr(inode);
896 fidx = EXT_FIRST_INDEX(neh);
897 ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n",
898 le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max),
899 le32_to_cpu(fidx->ei_block), idx_pblock(fidx));
900
901 neh->eh_depth = cpu_to_le16(path->p_depth + 1);
902 err = ext4_ext_dirty(handle, inode, curp);
903out:
904 brelse(bh);
905
906 return err;
907}
908
909/*
910 * ext4_ext_create_new_leaf:
911 * finds empty index and adds new leaf.
912 * if no free index is found, then it requests growing the tree in depth.
913 */
914static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode,
915 struct ext4_ext_path *path,
916 struct ext4_extent *newext)
917{
918 struct ext4_ext_path *curp;
919 int depth, i, err = 0;
920
921repeat:
922 i = depth = ext_depth(inode);
923
924 /* walk up to the tree and look for free index entry */
925 curp = path + depth;
926 while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) {
927 i--;
928 curp--;
929 }
930
931 /* we use already allocated block for index block,
932 * so subsequent data blocks should be contiguous */
933 if (EXT_HAS_FREE_INDEX(curp)) {
934 /* if we found index with free entry, then use that
935 * entry: create all needed subtree and add new leaf */
936 err = ext4_ext_split(handle, inode, path, newext, i);
937
938 /* refill path */
939 ext4_ext_drop_refs(path);
940 path = ext4_ext_find_extent(inode,
941 le32_to_cpu(newext->ee_block),
942 path);
943 if (IS_ERR(path))
944 err = PTR_ERR(path);
945 } else {
946 /* tree is full, time to grow in depth */
947 err = ext4_ext_grow_indepth(handle, inode, path, newext);
948 if (err)
949 goto out;
950
951 /* refill path */
952 ext4_ext_drop_refs(path);
953 path = ext4_ext_find_extent(inode,
954 le32_to_cpu(newext->ee_block),
955 path);
956 if (IS_ERR(path)) {
957 err = PTR_ERR(path);
958 goto out;
959 }
960
961 /*
962 * only the first grow (depth 0 -> 1) is guaranteed to produce free space;
963 * in all other cases we have to split the grown tree
964 */
965 depth = ext_depth(inode);
966 if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) {
967 /* now we need to split */
968 goto repeat;
969 }
970 }
971
972out:
973 return err;
974}
975
976/*
977 * ext4_ext_next_allocated_block:
978 * returns allocated block in subsequent extent or EXT_MAX_BLOCK.
979 * NOTE: it considers block number from index entry as
980 * allocated block. Thus, index entries have to be consistent
981 * with leaves.
982 */
983static unsigned long
984ext4_ext_next_allocated_block(struct ext4_ext_path *path)
985{
986 int depth;
987
988 BUG_ON(path == NULL);
989 depth = path->p_depth;
990
991 if (depth == 0 && path->p_ext == NULL)
992 return EXT_MAX_BLOCK;
993
994 while (depth >= 0) {
995 if (depth == path->p_depth) {
996 /* leaf */
997 if (path[depth].p_ext !=
998 EXT_LAST_EXTENT(path[depth].p_hdr))
999 return le32_to_cpu(path[depth].p_ext[1].ee_block);
1000 } else {
1001 /* index */
1002 if (path[depth].p_idx !=
1003 EXT_LAST_INDEX(path[depth].p_hdr))
1004 return le32_to_cpu(path[depth].p_idx[1].ei_block);
1005 }
1006 depth--;
1007 }
1008
1009 return EXT_MAX_BLOCK;
1010}
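
/*
 * A worked illustration (hypothetical layout) of the walk above: in a
 * depth-1 tree, when path[1].p_ext points at the last extent of its
 * leaf, the leaf level yields nothing and the loop drops to the index
 * level; if path[0].p_idx is not the last index entry, the starting
 * block of the following index entry is returned, since index entries
 * are consistent with the leaves below them.  Only when every level is
 * positioned at its last entry does the function fall through to
 * EXT_MAX_BLOCK.
 */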
1011
1012/*
1013 * ext4_ext_next_leaf_block:
1014 * returns first allocated block from next leaf or EXT_MAX_BLOCK
1015 */
1016static unsigned ext4_ext_next_leaf_block(struct inode *inode,
1017 struct ext4_ext_path *path)
1018{
1019 int depth;
1020
1021 BUG_ON(path == NULL);
1022 depth = path->p_depth;
1023
1024 /* a zero-depth tree has no leaf blocks at all */
1025 if (depth == 0)
1026 return EXT_MAX_BLOCK;
1027
1028 /* go to index block */
1029 depth--;
1030
1031 while (depth >= 0) {
1032 if (path[depth].p_idx !=
1033 EXT_LAST_INDEX(path[depth].p_hdr))
1034 return le32_to_cpu(path[depth].p_idx[1].ei_block);
1035 depth--;
1036 }
1037
1038 return EXT_MAX_BLOCK;
1039}
1040
1041/*
1042 * ext4_ext_correct_indexes:
1043 * if leaf gets modified and modified extent is first in the leaf,
1044 * then we have to correct all indexes above.
1045 * TODO: do we need to correct tree in all cases?
1046 */
1047int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
1048 struct ext4_ext_path *path)
1049{
1050 struct ext4_extent_header *eh;
1051 int depth = ext_depth(inode);
1052 struct ext4_extent *ex;
1053 __le32 border;
1054 int k, err = 0;
1055
1056 eh = path[depth].p_hdr;
1057 ex = path[depth].p_ext;
1058 BUG_ON(ex == NULL);
1059 BUG_ON(eh == NULL);
1060
1061 if (depth == 0) {
1062 /* there is no tree at all */
1063 return 0;
1064 }
1065
1066 if (ex != EXT_FIRST_EXTENT(eh)) {
1067 /* we correct tree if first leaf got modified only */
1068 return 0;
1069 }
1070
1071 /*
1072 * TODO: we need correction if border is smaller than current one
1073 */
1074 k = depth - 1;
1075 border = path[depth].p_ext->ee_block;
1076 if ((err = ext4_ext_get_access(handle, inode, path + k)))
1077 return err;
1078 path[k].p_idx->ei_block = border;
1079 if ((err = ext4_ext_dirty(handle, inode, path + k)))
1080 return err;
1081
1082 while (k--) {
1083 /* change all left-side indexes */
1084 if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr))
1085 break;
1086 if ((err = ext4_ext_get_access(handle, inode, path + k)))
1087 break;
1088 path[k].p_idx->ei_block = border;
1089 if ((err = ext4_ext_dirty(handle, inode, path + k)))
1090 break;
1091 }
1092
1093 return err;
1094}
1095
1096static int inline
1097ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
1098 struct ext4_extent *ex2)
1099{
1100 if (le32_to_cpu(ex1->ee_block) + le16_to_cpu(ex1->ee_len) !=
1101 le32_to_cpu(ex2->ee_block))
1102 return 0;
1103
1104 /*
1105 * To allow future support for preallocated extents to be added
1106 * as an RO_COMPAT feature, refuse to merge two extents if
1107 * this can result in the top bit of ee_len being set.
1108 */
1109 if (le16_to_cpu(ex1->ee_len) + le16_to_cpu(ex2->ee_len) > EXT_MAX_LEN)
1110 return 0;
1111#ifdef AGRESSIVE_TEST
1112 if (le16_to_cpu(ex1->ee_len) >= 4)
1113 return 0;
1114#endif
1115
1116 if (ext_pblock(ex1) + le16_to_cpu(ex1->ee_len) == ext_pblock(ex2))
1117 return 1;
1118 return 0;
1119}
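
/*
 * A worked example of the merge test above (hypothetical numbers):
 * ex1 = {ee_block 100, ee_len 8} mapped at physical block 500 and
 * ex2 = {ee_block 108, ee_len 4} mapped at physical block 508 satisfy
 *   100 + 8 == 108           (logically contiguous)
 *   8 + 4 <= EXT_MAX_LEN     (merged length keeps the top bit clear)
 *   500 + 8 == 508           (physically contiguous)
 * and may be merged into a single 100:500:12 extent.
 */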
1120
1121/*
1122 * ext4_ext_insert_extent:
1123 * tries to merge the requested extent into an existing extent or
1124 * inserts the requested extent as a new one into the tree,
1125 * creating new leaf in the no-space case.
1126 */
1127int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1128 struct ext4_ext_path *path,
1129 struct ext4_extent *newext)
1130{
1131 struct ext4_extent_header * eh;
1132 struct ext4_extent *ex, *fex;
1133 struct ext4_extent *nearex; /* nearest extent */
1134 struct ext4_ext_path *npath = NULL;
1135 int depth, len, err, next;
1136
1137 BUG_ON(newext->ee_len == 0);
1138 depth = ext_depth(inode);
1139 ex = path[depth].p_ext;
1140 BUG_ON(path[depth].p_hdr == NULL);
1141
1142 /* try to insert block into found extent and return */
1143 if (ex && ext4_can_extents_be_merged(inode, ex, newext)) {
1144 ext_debug("append %d block to %d:%d (from %llu)\n",
1145 le16_to_cpu(newext->ee_len),
1146 le32_to_cpu(ex->ee_block),
1147 le16_to_cpu(ex->ee_len), ext_pblock(ex));
1148 if ((err = ext4_ext_get_access(handle, inode, path + depth)))
1149 return err;
1150 ex->ee_len = cpu_to_le16(le16_to_cpu(ex->ee_len)
1151 + le16_to_cpu(newext->ee_len));
1152 eh = path[depth].p_hdr;
1153 nearex = ex;
1154 goto merge;
1155 }
1156
1157repeat:
1158 depth = ext_depth(inode);
1159 eh = path[depth].p_hdr;
1160 if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max))
1161 goto has_space;
1162
1163 /* probably next leaf has space for us? */
1164 fex = EXT_LAST_EXTENT(eh);
1165 next = ext4_ext_next_leaf_block(inode, path);
1166 if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block)
1167 && next != EXT_MAX_BLOCK) {
1168 ext_debug("next leaf block - %d\n", next);
1169 BUG_ON(npath != NULL);
1170 npath = ext4_ext_find_extent(inode, next, NULL);
1171 if (IS_ERR(npath))
1172 return PTR_ERR(npath);
1173 BUG_ON(npath->p_depth != path->p_depth);
1174 eh = npath[depth].p_hdr;
1175 if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) {
1176 ext_debug("next leaf isnt full(%d)\n",
1177 le16_to_cpu(eh->eh_entries));
1178 path = npath;
1179 goto repeat;
1180 }
1181 ext_debug("next leaf has no free space(%d,%d)\n",
1182 le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
1183 }
1184
1185 /*
1186 * There is no free space in the found leaf.
1187 * We're gonna add a new leaf in the tree.
1188 */
1189 err = ext4_ext_create_new_leaf(handle, inode, path, newext);
1190 if (err)
1191 goto cleanup;
1192 depth = ext_depth(inode);
1193 eh = path[depth].p_hdr;
1194
1195has_space:
1196 nearex = path[depth].p_ext;
1197
1198 if ((err = ext4_ext_get_access(handle, inode, path + depth)))
1199 goto cleanup;
1200
1201 if (!nearex) {
1202 /* there is no extent in this leaf, create first one */
1203 ext_debug("first extent in the leaf: %d:%llu:%d\n",
1204 le32_to_cpu(newext->ee_block),
1205 ext_pblock(newext),
1206 le16_to_cpu(newext->ee_len));
1207 path[depth].p_ext = EXT_FIRST_EXTENT(eh);
1208 } else if (le32_to_cpu(newext->ee_block)
1209 > le32_to_cpu(nearex->ee_block)) {
1210/* BUG_ON(newext->ee_block == nearex->ee_block); */
1211 if (nearex != EXT_LAST_EXTENT(eh)) {
1212 len = EXT_MAX_EXTENT(eh) - nearex;
1213 len = (len - 1) * sizeof(struct ext4_extent);
1214 len = len < 0 ? 0 : len;
1215 ext_debug("insert %d:%llu:%d after: nearest 0x%p, "
1216 "move %d from 0x%p to 0x%p\n",
1217 le32_to_cpu(newext->ee_block),
1218 ext_pblock(newext),
1219 le16_to_cpu(newext->ee_len),
1220 nearex, len, nearex + 1, nearex + 2);
1221 memmove(nearex + 2, nearex + 1, len);
1222 }
1223 path[depth].p_ext = nearex + 1;
1224 } else {
1225 BUG_ON(newext->ee_block == nearex->ee_block);
1226 len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext4_extent);
1227 len = len < 0 ? 0 : len;
1228 ext_debug("insert %d:%llu:%d before: nearest 0x%p, "
1229 "move %d from 0x%p to 0x%p\n",
1230 le32_to_cpu(newext->ee_block),
1231 ext_pblock(newext),
1232 le16_to_cpu(newext->ee_len),
1233 nearex, len, nearex + 1, nearex + 2);
1234 memmove(nearex + 1, nearex, len);
1235 path[depth].p_ext = nearex;
1236 }
1237
1238 eh->eh_entries = cpu_to_le16(le16_to_cpu(eh->eh_entries)+1);
1239 nearex = path[depth].p_ext;
1240 nearex->ee_block = newext->ee_block;
1241 nearex->ee_start = newext->ee_start;
1242 nearex->ee_start_hi = newext->ee_start_hi;
1243 nearex->ee_len = newext->ee_len;
1244
1245merge:
1246 /* try to merge extents to the right */
1247 while (nearex < EXT_LAST_EXTENT(eh)) {
1248 if (!ext4_can_extents_be_merged(inode, nearex, nearex + 1))
1249 break;
1250 /* merge with next extent! */
1251 nearex->ee_len = cpu_to_le16(le16_to_cpu(nearex->ee_len)
1252 + le16_to_cpu(nearex[1].ee_len));
1253 if (nearex + 1 < EXT_LAST_EXTENT(eh)) {
1254 len = (EXT_LAST_EXTENT(eh) - nearex - 1)
1255 * sizeof(struct ext4_extent);
1256 memmove(nearex + 1, nearex + 2, len);
1257 }
1258 eh->eh_entries = cpu_to_le16(le16_to_cpu(eh->eh_entries)-1);
1259 BUG_ON(eh->eh_entries == 0);
1260 }
1261
1262 /* try to merge extents to the left */
1263
1264 /* time to correct all indexes above */
1265 err = ext4_ext_correct_indexes(handle, inode, path);
1266 if (err)
1267 goto cleanup;
1268
1269 err = ext4_ext_dirty(handle, inode, path + depth);
1270
1271cleanup:
1272 if (npath) {
1273 ext4_ext_drop_refs(npath);
1274 kfree(npath);
1275 }
1276 ext4_ext_tree_changed(inode);
1277 ext4_ext_invalidate_cache(inode);
1278 return err;
1279}
1280
1281int ext4_ext_walk_space(struct inode *inode, unsigned long block,
1282 unsigned long num, ext_prepare_callback func,
1283 void *cbdata)
1284{
1285 struct ext4_ext_path *path = NULL;
1286 struct ext4_ext_cache cbex;
1287 struct ext4_extent *ex;
1288 unsigned long next, start = 0, end = 0;
1289 unsigned long last = block + num;
1290 int depth, exists, err = 0;
1291
1292 BUG_ON(func == NULL);
1293 BUG_ON(inode == NULL);
1294
1295 while (block < last && block != EXT_MAX_BLOCK) {
1296 num = last - block;
1297 /* find extent for this block */
1298 path = ext4_ext_find_extent(inode, block, path);
1299 if (IS_ERR(path)) {
1300 err = PTR_ERR(path);
1301 path = NULL;
1302 break;
1303 }
1304
1305 depth = ext_depth(inode);
1306 BUG_ON(path[depth].p_hdr == NULL);
1307 ex = path[depth].p_ext;
1308 next = ext4_ext_next_allocated_block(path);
1309
1310 exists = 0;
1311 if (!ex) {
1312 /* there is no extent yet, so try to allocate
1313 * all requested space */
1314 start = block;
1315 end = block + num;
1316 } else if (le32_to_cpu(ex->ee_block) > block) {
1317 /* need to allocate space before found extent */
1318 start = block;
1319 end = le32_to_cpu(ex->ee_block);
1320 if (block + num < end)
1321 end = block + num;
1322 } else if (block >=
1323 le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len)) {
1324 /* need to allocate space after found extent */
1325 start = block;
1326 end = block + num;
1327 if (end >= next)
1328 end = next;
1329 } else if (block >= le32_to_cpu(ex->ee_block)) {
1330 /*
1331 * some part of requested space is covered
1332 * by found extent
1333 */
1334 start = block;
1335 end = le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len);
1336 if (block + num < end)
1337 end = block + num;
1338 exists = 1;
1339 } else {
1340 BUG();
1341 }
1342 BUG_ON(end <= start);
1343
1344 if (!exists) {
1345 cbex.ec_block = start;
1346 cbex.ec_len = end - start;
1347 cbex.ec_start = 0;
1348 cbex.ec_type = EXT4_EXT_CACHE_GAP;
1349 } else {
1350 cbex.ec_block = le32_to_cpu(ex->ee_block);
1351 cbex.ec_len = le16_to_cpu(ex->ee_len);
1352 cbex.ec_start = ext_pblock(ex);
1353 cbex.ec_type = EXT4_EXT_CACHE_EXTENT;
1354 }
1355
1356 BUG_ON(cbex.ec_len == 0);
1357 err = func(inode, path, &cbex, cbdata);
1358 ext4_ext_drop_refs(path);
1359
1360 if (err < 0)
1361 break;
1362 if (err == EXT_REPEAT)
1363 continue;
1364 else if (err == EXT_BREAK) {
1365 err = 0;
1366 break;
1367 }
1368
1369 if (ext_depth(inode) != depth) {
1370 /* depth was changed. we have to realloc path */
1371 kfree(path);
1372 path = NULL;
1373 }
1374
1375 block = cbex.ec_block + cbex.ec_len;
1376 }
1377
1378 if (path) {
1379 ext4_ext_drop_refs(path);
1380 kfree(path);
1381 }
1382
1383 return err;
1384}
1385
1386static inline void
1387ext4_ext_put_in_cache(struct inode *inode, __u32 block,
1388 __u32 len, __u32 start, int type)
1389{
1390 struct ext4_ext_cache *cex;
1391 BUG_ON(len == 0);
1392 cex = &EXT4_I(inode)->i_cached_extent;
1393 cex->ec_type = type;
1394 cex->ec_block = block;
1395 cex->ec_len = len;
1396 cex->ec_start = start;
1397}
1398
1399/*
1400 * ext4_ext_put_gap_in_cache:
1401 * calculate boundaries of the gap that the requested block fits into
1402 * and cache this gap
1403 */
1404static inline void
1405ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
1406 unsigned long block)
1407{
1408 int depth = ext_depth(inode);
1409 unsigned long lblock, len;
1410 struct ext4_extent *ex;
1411
1412 ex = path[depth].p_ext;
1413 if (ex == NULL) {
1414 /* there is no extent yet, so gap is [0;-] */
1415 lblock = 0;
1416 len = EXT_MAX_BLOCK;
1417 ext_debug("cache gap(whole file):");
1418 } else if (block < le32_to_cpu(ex->ee_block)) {
1419 lblock = block;
1420 len = le32_to_cpu(ex->ee_block) - block;
1421 ext_debug("cache gap(before): %lu [%lu:%lu]",
1422 (unsigned long) block,
1423 (unsigned long) le32_to_cpu(ex->ee_block),
1424 (unsigned long) le16_to_cpu(ex->ee_len));
1425 } else if (block >= le32_to_cpu(ex->ee_block)
1426 + le16_to_cpu(ex->ee_len)) {
1427 lblock = le32_to_cpu(ex->ee_block)
1428 + le16_to_cpu(ex->ee_len);
1429 len = ext4_ext_next_allocated_block(path);
1430 ext_debug("cache gap(after): [%lu:%lu] %lu",
1431 (unsigned long) le32_to_cpu(ex->ee_block),
1432 (unsigned long) le16_to_cpu(ex->ee_len),
1433 (unsigned long) block);
1434 BUG_ON(len == lblock);
1435 len = len - lblock;
1436 } else {
1437 lblock = len = 0;
1438 BUG();
1439 }
1440
1441 ext_debug(" -> %lu:%lu\n", (unsigned long) lblock, len);
1442 ext4_ext_put_in_cache(inode, lblock, len, 0, EXT4_EXT_CACHE_GAP);
1443}
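
/*
 * A worked example (hypothetical numbers) of the gap computation above:
 * if the nearest extent covers logical blocks 100..107 and
 * ext4_ext_next_allocated_block() reports 200, a lookup of block 150
 * takes the "cache gap(after)" branch:
 *   lblock = 100 + 8 = 108
 *   len    = 200 - 108 = 92
 * so the gap [108..199] is cached and later lookups in this range are
 * answered without another tree walk.
 */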
1444
1445static inline int
1446ext4_ext_in_cache(struct inode *inode, unsigned long block,
1447 struct ext4_extent *ex)
1448{
1449 struct ext4_ext_cache *cex;
1450
1451 cex = &EXT4_I(inode)->i_cached_extent;
1452
1453 /* has cache valid data? */
1454 if (cex->ec_type == EXT4_EXT_CACHE_NO)
1455 return EXT4_EXT_CACHE_NO;
1456
1457 BUG_ON(cex->ec_type != EXT4_EXT_CACHE_GAP &&
1458 cex->ec_type != EXT4_EXT_CACHE_EXTENT);
1459 if (block >= cex->ec_block && block < cex->ec_block + cex->ec_len) {
1460 ex->ee_block = cpu_to_le32(cex->ec_block);
1461 ext4_ext_store_pblock(ex, cex->ec_start);
1462 ex->ee_len = cpu_to_le16(cex->ec_len);
1463 ext_debug("%lu cached by %lu:%lu:%llu\n",
1464 (unsigned long) block,
1465 (unsigned long) cex->ec_block,
1466 (unsigned long) cex->ec_len,
1467 cex->ec_start);
1468 return cex->ec_type;
1469 }
1470
1471 /* not in cache */
1472 return EXT4_EXT_CACHE_NO;
1473}
1474
1475/*
1476 * ext4_ext_rm_idx:
1477 * removes index from the index block.
1478 * It's used in truncate case only, thus all requests are for
1479 * last index in the block only.
1480 */
1481int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
1482 struct ext4_ext_path *path)
1483{
1484 struct buffer_head *bh;
1485 int err;
1486 ext4_fsblk_t leaf;
1487
1488 /* free index block */
1489 path--;
1490 leaf = idx_pblock(path->p_idx);
1491 BUG_ON(path->p_hdr->eh_entries == 0);
1492 if ((err = ext4_ext_get_access(handle, inode, path)))
1493 return err;
1494 path->p_hdr->eh_entries = cpu_to_le16(le16_to_cpu(path->p_hdr->eh_entries)-1);
1495 if ((err = ext4_ext_dirty(handle, inode, path)))
1496 return err;
1497 ext_debug("index is empty, remove it, free block %llu\n", leaf);
1498 bh = sb_find_get_block(inode->i_sb, leaf);
1499 ext4_forget(handle, 1, inode, bh, leaf);
1500 ext4_free_blocks(handle, inode, leaf, 1);
1501 return err;
1502}
1503
1504/*
1505 * ext4_ext_calc_credits_for_insert:
1506 * This routine returns the max. credits that the extent tree can consume.
1507 * It should be OK for low-performance paths like ->writepage().
1508 * To allow many writing processes to fit into a single transaction,
1509 * the caller should calculate credits under truncate_mutex and
1510 * pass the actual path.
1511 */
1512int inline ext4_ext_calc_credits_for_insert(struct inode *inode,
1513 struct ext4_ext_path *path)
1514{
1515 int depth, needed;
1516
1517 if (path) {
1518 /* probably there is space in leaf? */
1519 depth = ext_depth(inode);
1520 if (le16_to_cpu(path[depth].p_hdr->eh_entries)
1521 < le16_to_cpu(path[depth].p_hdr->eh_max))
1522 return 1;
1523 }
1524
1525 /*
1526 * given 32-bit logical block (4294967296 blocks), max. tree
1527 * can be 4 levels in depth -- 4 * 340^4 == 53453440000.
1528 * Let's also add one more level for imbalance.
1529 */
1530 depth = 5;
1531
1532 /* allocation of new data block(s) */
1533 needed = 2;
1534
1535 /*
1536 * tree can be full, so it would need to grow in depth:
1537 * allocation + old root + new root
1538 */
1539 needed += 2 + 1 + 1;
1540
1541 /*
1542 * Index split can happen, we would need:
1543 * allocate intermediate indexes (bitmap + group)
1544 * + change two blocks at each level, but root (already included)
1545 */
1546 needed += (depth * 2) + (depth * 2);
1547
1548 /* any allocation modifies superblock */
1549 needed += 1;
1550
1551 return needed;
1552}
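
/*
 * Sketch of the capacity arithmetic behind depth = 5 above, assuming
 * 4KB blocks: an extent block holds a 12-byte header plus 12-byte
 * entries, so ext4_ext_space_block() gives (4096 - 12) / 12 = 340
 * entries per block.  Four levels address roughly 340^4 ~= 1.3e10
 * entries, already beyond the 2^32 possible logical blocks, and one
 * extra level is budgeted for tree imbalance.
 */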
1553
1554static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
1555 struct ext4_extent *ex,
1556 unsigned long from, unsigned long to)
1557{
1558 struct buffer_head *bh;
1559 int i;
1560
1561#ifdef EXTENTS_STATS
1562 {
1563 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1564 unsigned short ee_len = le16_to_cpu(ex->ee_len);
1565 spin_lock(&sbi->s_ext_stats_lock);
1566 sbi->s_ext_blocks += ee_len;
1567 sbi->s_ext_extents++;
1568 if (ee_len < sbi->s_ext_min)
1569 sbi->s_ext_min = ee_len;
1570 if (ee_len > sbi->s_ext_max)
1571 sbi->s_ext_max = ee_len;
1572 if (ext_depth(inode) > sbi->s_depth_max)
1573 sbi->s_depth_max = ext_depth(inode);
1574 spin_unlock(&sbi->s_ext_stats_lock);
1575 }
1576#endif
1577 if (from >= le32_to_cpu(ex->ee_block)
1578 && to == le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len) - 1) {
1579 /* tail removal */
1580 unsigned long num;
1581 ext4_fsblk_t start;
1582 num = le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len) - from;
1583 start = ext_pblock(ex) + le16_to_cpu(ex->ee_len) - num;
1584 ext_debug("free last %lu blocks starting %llu\n", num, start);
1585 for (i = 0; i < num; i++) {
1586 bh = sb_find_get_block(inode->i_sb, start + i);
1587 ext4_forget(handle, 0, inode, bh, start + i);
1588 }
1589 ext4_free_blocks(handle, inode, start, num);
1590 } else if (from == le32_to_cpu(ex->ee_block)
1591 && to <= le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len) - 1) {
1592 printk("strange request: removal %lu-%lu from %u:%u\n",
1593 from, to, le32_to_cpu(ex->ee_block), le16_to_cpu(ex->ee_len));
1594 } else {
1595 printk("strange request: removal(2) %lu-%lu from %u:%u\n",
1596 from, to, le32_to_cpu(ex->ee_block), le16_to_cpu(ex->ee_len));
1597 }
1598 return 0;
1599}
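
/*
 * A worked example of the tail-removal arithmetic above (hypothetical
 * numbers): for an extent {ee_block 100, ee_len 8} mapped at physical
 * block 500, removing from = 104 to = 107 gives
 *   num   = 100 + 8 - 104 = 4    (blocks to free)
 *   start = 500 + 8 - 4   = 504  (first physical block freed)
 * so physical blocks 504..507 are forgotten and freed while the first
 * four blocks of the extent stay allocated.
 */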
1600
1601static int
1602ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
1603 struct ext4_ext_path *path, unsigned long start)
1604{
1605 int err = 0, correct_index = 0;
1606 int depth = ext_depth(inode), credits;
1607 struct ext4_extent_header *eh;
1608 unsigned a, b, block, num;
1609 unsigned long ex_ee_block;
1610 unsigned short ex_ee_len;
1611 struct ext4_extent *ex;
1612
1613 ext_debug("truncate since %lu in leaf\n", start);
1614 if (!path[depth].p_hdr)
1615 path[depth].p_hdr = ext_block_hdr(path[depth].p_bh);
1616 eh = path[depth].p_hdr;
1617 BUG_ON(eh == NULL);
1618 BUG_ON(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max));
1619 BUG_ON(eh->eh_magic != EXT4_EXT_MAGIC);
1620
1621 /* find where to start removing */
1622 ex = EXT_LAST_EXTENT(eh);
1623
1624 ex_ee_block = le32_to_cpu(ex->ee_block);
1625 ex_ee_len = le16_to_cpu(ex->ee_len);
1626
1627 while (ex >= EXT_FIRST_EXTENT(eh) &&
1628 ex_ee_block + ex_ee_len > start) {
1629 ext_debug("remove ext %lu:%u\n", ex_ee_block, ex_ee_len);
1630 path[depth].p_ext = ex;
1631
1632 a = ex_ee_block > start ? ex_ee_block : start;
1633 b = ex_ee_block + ex_ee_len - 1 < EXT_MAX_BLOCK ?
1634 ex_ee_block + ex_ee_len - 1 : EXT_MAX_BLOCK;
1635
1636 ext_debug(" border %u:%u\n", a, b);
1637
1638 if (a != ex_ee_block && b != ex_ee_block + ex_ee_len - 1) {
1639 block = 0;
1640 num = 0;
1641 BUG();
1642 } else if (a != ex_ee_block) {
1643 /* remove tail of the extent */
1644 block = ex_ee_block;
1645 num = a - block;
1646 } else if (b != ex_ee_block + ex_ee_len - 1) {
1647 /* remove head of the extent */
1648 block = a;
1649 num = b - a;
1650 /* there is no "make a hole" API yet */
1651 BUG();
1652 } else {
1653 /* remove whole extent: excellent! */
1654 block = ex_ee_block;
1655 num = 0;
1656 BUG_ON(a != ex_ee_block);
1657 BUG_ON(b != ex_ee_block + ex_ee_len - 1);
1658 }
1659
1660 /* at present, extent can't cross block group: */
1661 /* leaf + bitmap + group desc + sb + inode */
1662 credits = 5;
1663 if (ex == EXT_FIRST_EXTENT(eh)) {
1664 correct_index = 1;
1665 credits += (ext_depth(inode)) + 1;
1666 }
1667#ifdef CONFIG_QUOTA
1668 credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
1669#endif
1670
1671 handle = ext4_ext_journal_restart(handle, credits);
1672 if (IS_ERR(handle)) {
1673 err = PTR_ERR(handle);
1674 goto out;
1675 }
1676
1677 err = ext4_ext_get_access(handle, inode, path + depth);
1678 if (err)
1679 goto out;
1680
1681 err = ext4_remove_blocks(handle, inode, ex, a, b);
1682 if (err)
1683 goto out;
1684
1685 if (num == 0) {
1686 /* this extent is removed; mark slot entirely unused */
1687 ext4_ext_store_pblock(ex, 0);
1688 eh->eh_entries = cpu_to_le16(le16_to_cpu(eh->eh_entries)-1);
1689 }
1690
1691 ex->ee_block = cpu_to_le32(block);
1692 ex->ee_len = cpu_to_le16(num);
1693
1694 err = ext4_ext_dirty(handle, inode, path + depth);
1695 if (err)
1696 goto out;
1697
1698 ext_debug("new extent: %u:%u:%llu\n", block, num,
1699 ext_pblock(ex));
1700 ex--;
1701 ex_ee_block = le32_to_cpu(ex->ee_block);
1702 ex_ee_len = le16_to_cpu(ex->ee_len);
1703 }
1704
1705 if (correct_index && eh->eh_entries)
1706 err = ext4_ext_correct_indexes(handle, inode, path);
1707
1708 /* if this leaf is free, then we should
1709 * remove it from index block above */
1710 if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL)
1711 err = ext4_ext_rm_idx(handle, inode, path + depth);
1712
1713out:
1714 return err;
1715}
1716
1717/*
1718 * ext4_ext_more_to_rm:
1719 * returns 1 if current index has to be freed (even partial)
1720 */
1721static int inline
1722ext4_ext_more_to_rm(struct ext4_ext_path *path)
1723{
1724 BUG_ON(path->p_idx == NULL);
1725
1726 if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr))
1727 return 0;
1728
1729 /*
1730 * if truncate on deeper level happened, it wasn't partial,
1731 * so we have to consider current index for truncation
1732 */
1733 if (le16_to_cpu(path->p_hdr->eh_entries) == path->p_block)
1734 return 0;
1735 return 1;
1736}
1737
1738int ext4_ext_remove_space(struct inode *inode, unsigned long start)
1739{
1740 struct super_block *sb = inode->i_sb;
1741 int depth = ext_depth(inode);
1742 struct ext4_ext_path *path;
1743 handle_t *handle;
1744 int i = 0, err = 0;
1745
1746 ext_debug("truncate since %lu\n", start);
1747
1748 /* probably first extent we're gonna free will be last in block */
1749 handle = ext4_journal_start(inode, depth + 1);
1750 if (IS_ERR(handle))
1751 return PTR_ERR(handle);
1752
1753 ext4_ext_invalidate_cache(inode);
1754
1755 /*
1756 * We start scanning from right side, freeing all the blocks
1757 * after i_size and walking into the tree depth-wise.
1758 */
1759 path = kmalloc(sizeof(struct ext4_ext_path) * (depth + 1), GFP_KERNEL);
1760 if (path == NULL) {
1761 ext4_journal_stop(handle);
1762 return -ENOMEM;
1763 }
1764 memset(path, 0, sizeof(struct ext4_ext_path) * (depth + 1));
1765 path[0].p_hdr = ext_inode_hdr(inode);
1766 if (ext4_ext_check_header(__FUNCTION__, inode, path[0].p_hdr)) {
1767 err = -EIO;
1768 goto out;
1769 }
1770 path[0].p_depth = depth;
1771
1772 while (i >= 0 && err == 0) {
1773 if (i == depth) {
1774 /* this is leaf block */
1775 err = ext4_ext_rm_leaf(handle, inode, path, start);
1776 /* root level has p_bh == NULL, brelse() eats this */
1777 brelse(path[i].p_bh);
1778 path[i].p_bh = NULL;
1779 i--;
1780 continue;
1781 }
1782
1783 /* this is index block */
1784 if (!path[i].p_hdr) {
1785 ext_debug("initialize header\n");
1786 path[i].p_hdr = ext_block_hdr(path[i].p_bh);
1787 if (ext4_ext_check_header(__FUNCTION__, inode,
1788 path[i].p_hdr)) {
1789 err = -EIO;
1790 goto out;
1791 }
1792 }
1793
1794 BUG_ON(le16_to_cpu(path[i].p_hdr->eh_entries)
1795 > le16_to_cpu(path[i].p_hdr->eh_max));
1796 BUG_ON(path[i].p_hdr->eh_magic != EXT4_EXT_MAGIC);
1797
1798 if (!path[i].p_idx) {
1799 /* this level hasn't been touched yet */
1800 path[i].p_idx = EXT_LAST_INDEX(path[i].p_hdr);
1801 path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries)+1;
1802 ext_debug("init index ptr: hdr 0x%p, num %d\n",
1803 path[i].p_hdr,
1804 le16_to_cpu(path[i].p_hdr->eh_entries));
1805 } else {
1806 /* we were already here, see at next index */
1807 path[i].p_idx--;
1808 }
1809
1810 ext_debug("level %d - index, first 0x%p, cur 0x%p\n",
1811 i, EXT_FIRST_INDEX(path[i].p_hdr),
1812 path[i].p_idx);
1813 if (ext4_ext_more_to_rm(path + i)) {
1814 /* go to the next level */
1815 ext_debug("move to level %d (block %llu)\n",
1816 i + 1, idx_pblock(path[i].p_idx));
1817 memset(path + i + 1, 0, sizeof(*path));
1818 path[i+1].p_bh =
1819 sb_bread(sb, idx_pblock(path[i].p_idx));
1820 if (!path[i+1].p_bh) {
1821 /* should we reset i_size? */
1822 err = -EIO;
1823 break;
1824 }
1825
1826 /* save actual number of indexes since this
1827 * number is changed at the next iteration */
1828 path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries);
1829 i++;
1830 } else {
1831 /* we finished processing this index, go up */
1832 if (path[i].p_hdr->eh_entries == 0 && i > 0) {
1833 /* index is empty, remove it;
1834 * the handle has already been prepared by
1835 * ext4_ext_rm_leaf() */
1836 err = ext4_ext_rm_idx(handle, inode, path + i);
1837 }
1838 /* root level has p_bh == NULL, brelse() eats this */
1839 brelse(path[i].p_bh);
1840 path[i].p_bh = NULL;
1841 i--;
1842 ext_debug("return to level %d\n", i);
1843 }
1844 }
1845
1846 /* TODO: flexible tree reduction should be here */
1847 if (path->p_hdr->eh_entries == 0) {
1848 /*
1849 * truncate to zero freed all the tree,
1850 * so we need to correct eh_depth
1851 */
1852 err = ext4_ext_get_access(handle, inode, path);
1853 if (err == 0) {
1854 ext_inode_hdr(inode)->eh_depth = 0;
1855 ext_inode_hdr(inode)->eh_max =
1856 cpu_to_le16(ext4_ext_space_root(inode));
1857 err = ext4_ext_dirty(handle, inode, path);
1858 }
1859 }
1860out:
1861 ext4_ext_tree_changed(inode);
1862 ext4_ext_drop_refs(path);
1863 kfree(path);
1864 ext4_journal_stop(handle);
1865
1866 return err;
1867}
1868
1869/*
1870 * called at mount time
1871 */
1872void ext4_ext_init(struct super_block *sb)
1873{
1874 /*
1875 * possible initialization would be here
1876 */
1877
1878 if (test_opt(sb, EXTENTS)) {
1879 printk("EXT4-fs: file extents enabled");
1880#ifdef AGRESSIVE_TEST
1881 printk(", agressive tests");
1882#endif
1883#ifdef CHECK_BINSEARCH
1884 printk(", check binsearch");
1885#endif
1886#ifdef EXTENTS_STATS
1887 printk(", stats");
1888#endif
1889 printk("\n");
1890#ifdef EXTENTS_STATS
1891 spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock);
1892 EXT4_SB(sb)->s_ext_min = 1 << 30;
1893 EXT4_SB(sb)->s_ext_max = 0;
1894#endif
1895 }
1896}
1897
1898/*
1899 * called at umount time
1900 */
1901void ext4_ext_release(struct super_block *sb)
1902{
1903 if (!test_opt(sb, EXTENTS))
1904 return;
1905
1906#ifdef EXTENTS_STATS
1907 if (EXT4_SB(sb)->s_ext_blocks && EXT4_SB(sb)->s_ext_extents) {
1908 struct ext4_sb_info *sbi = EXT4_SB(sb);
1909 printk(KERN_ERR "EXT4-fs: %lu blocks in %lu extents (%lu ave)\n",
1910 sbi->s_ext_blocks, sbi->s_ext_extents,
1911 sbi->s_ext_blocks / sbi->s_ext_extents);
1912 printk(KERN_ERR "EXT4-fs: extents: %lu min, %lu max, max depth %lu\n",
1913 sbi->s_ext_min, sbi->s_ext_max, sbi->s_depth_max);
1914 }
1915#endif
1916}
1917
1918int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
1919 ext4_fsblk_t iblock,
1920 unsigned long max_blocks, struct buffer_head *bh_result,
1921 int create, int extend_disksize)
1922{
1923 struct ext4_ext_path *path = NULL;
1924 struct ext4_extent newex, *ex;
1925 ext4_fsblk_t goal, newblock;
1926 int err = 0, depth;
1927 unsigned long allocated = 0;
1928
1929 __clear_bit(BH_New, &bh_result->b_state);
1930 ext_debug("blocks %d/%lu requested for inode %u\n", (int) iblock,
1931 max_blocks, (unsigned) inode->i_ino);
1932 mutex_lock(&EXT4_I(inode)->truncate_mutex);
1933
1934 /* check in cache */
1935 if ((goal = ext4_ext_in_cache(inode, iblock, &newex))) {
1936 if (goal == EXT4_EXT_CACHE_GAP) {
1937 if (!create) {
1938 /* block isn't allocated yet and
1939 * user doesn't want to allocate it */
1940 goto out2;
1941 }
1942 /* we should allocate requested block */
1943 } else if (goal == EXT4_EXT_CACHE_EXTENT) {
1944 /* block is already allocated */
1945 newblock = iblock
1946 - le32_to_cpu(newex.ee_block)
1947 + ext_pblock(&newex);
1948 /* number of remaining blocks in the extent */
1949 allocated = le16_to_cpu(newex.ee_len) -
1950 (iblock - le32_to_cpu(newex.ee_block));
1951 goto out;
1952 } else {
1953 BUG();
1954 }
1955 }
1956
1957 /* find extent for this block */
1958 path = ext4_ext_find_extent(inode, iblock, NULL);
1959 if (IS_ERR(path)) {
1960 err = PTR_ERR(path);
1961 path = NULL;
1962 goto out2;
1963 }
1964
1965 depth = ext_depth(inode);
1966
1967 /*
1968 * consistent leaf must not be empty;
1969 * this situation is possible, though, _during_ tree modification;
1970 * this is why assert can't be put in ext4_ext_find_extent()
1971 */
1972 BUG_ON(path[depth].p_ext == NULL && depth != 0);
1973
1974 if ((ex = path[depth].p_ext)) {
1975 unsigned long ee_block = le32_to_cpu(ex->ee_block);
1976 ext4_fsblk_t ee_start = ext_pblock(ex);
1977 unsigned short ee_len = le16_to_cpu(ex->ee_len);
1978
1979 /*
1980 * Allow future support for preallocated extents to be added
1981 * as an RO_COMPAT feature:
1982 * Uninitialized extents are treated as holes, except that
1983 * we avoid (fail) allocating new blocks during a write.
1984 */
1985 if (ee_len > EXT_MAX_LEN)
1986 goto out2;
1987 /* if found extent covers block, simply return it */
1988 if (iblock >= ee_block && iblock < ee_block + ee_len) {
1989 newblock = iblock - ee_block + ee_start;
1990 /* number of remaining blocks in the extent */
1991 allocated = ee_len - (iblock - ee_block);
1992 ext_debug("%d fit into %lu:%d -> %llu\n", (int) iblock,
1993 ee_block, ee_len, newblock);
1994 ext4_ext_put_in_cache(inode, ee_block, ee_len,
1995 ee_start, EXT4_EXT_CACHE_EXTENT);
1996 goto out;
1997 }
1998 }
1999
2000 /*
2001 * requested block isn't allocated yet;
2002 * we must not try to create one if the create flag is zero
2003 */
2004 if (!create) {
2005 /* put just found gap into cache to speed up
2006 * subsequent requests */
2007 ext4_ext_put_gap_in_cache(inode, path, iblock);
2008 goto out2;
2009 }
2010 /*
2011 * Okay, we need to do block allocation. Lazily initialize the block
2012 * allocation info here if necessary.
2013 */
2014 if (S_ISREG(inode->i_mode) && (!EXT4_I(inode)->i_block_alloc_info))
2015 ext4_init_block_alloc_info(inode);
2016
2017 /* allocate new block */
2018 goal = ext4_ext_find_goal(inode, path, iblock);
2019 allocated = max_blocks;
2020 newblock = ext4_new_blocks(handle, inode, goal, &allocated, &err);
2021 if (!newblock)
2022 goto out2;
2023 ext_debug("allocate new block: goal %llu, found %llu/%lu\n",
2024 goal, newblock, allocated);
2025
2026 /* try to insert new extent into found leaf and return */
2027 newex.ee_block = cpu_to_le32(iblock);
2028 ext4_ext_store_pblock(&newex, newblock);
2029 newex.ee_len = cpu_to_le16(allocated);
2030 err = ext4_ext_insert_extent(handle, inode, path, &newex);
2031 if (err)
2032 goto out2;
2033
2034 if (extend_disksize && inode->i_size > EXT4_I(inode)->i_disksize)
2035 EXT4_I(inode)->i_disksize = inode->i_size;
2036
2037 /* previous routine could use block we allocated */
2038 newblock = ext_pblock(&newex);
2039 __set_bit(BH_New, &bh_result->b_state);
2040
2041 ext4_ext_put_in_cache(inode, iblock, allocated, newblock,
2042 EXT4_EXT_CACHE_EXTENT);
2043out:
2044 if (allocated > max_blocks)
2045 allocated = max_blocks;
2046 ext4_ext_show_leaf(inode, path);
2047 __set_bit(BH_Mapped, &bh_result->b_state);
2048 bh_result->b_bdev = inode->i_sb->s_bdev;
2049 bh_result->b_blocknr = newblock;
2050out2:
2051 if (path) {
2052 ext4_ext_drop_refs(path);
2053 kfree(path);
2054 }
2055 mutex_unlock(&EXT4_I(inode)->truncate_mutex);
2056
2057 return err ? err : allocated;
2058}
2059
2060void ext4_ext_truncate(struct inode * inode, struct page *page)
2061{
2062 struct address_space *mapping = inode->i_mapping;
2063 struct super_block *sb = inode->i_sb;
2064 unsigned long last_block;
2065 handle_t *handle;
2066 int err = 0;
2067
2068 /*
2069 * probably first extent we're gonna free will be last in block
2070 */
2071 err = ext4_writepage_trans_blocks(inode) + 3;
2072 handle = ext4_journal_start(inode, err);
2073 if (IS_ERR(handle)) {
2074 if (page) {
2075 clear_highpage(page);
2076 flush_dcache_page(page);
2077 unlock_page(page);
2078 page_cache_release(page);
2079 }
2080 return;
2081 }
2082
2083 if (page)
2084 ext4_block_truncate_page(handle, page, mapping, inode->i_size);
2085
2086 mutex_lock(&EXT4_I(inode)->truncate_mutex);
2087 ext4_ext_invalidate_cache(inode);
2088
2089 /*
2090 * TODO: optimization is possible here.
2091 * Probably we need not scan at all,
2092 * because page truncation is enough.
2093 */
2094 if (ext4_orphan_add(handle, inode))
2095 goto out_stop;
2096
2097 /* we have to know where to truncate from in crash case */
2098 EXT4_I(inode)->i_disksize = inode->i_size;
2099 ext4_mark_inode_dirty(handle, inode);
2100
2101 last_block = (inode->i_size + sb->s_blocksize - 1)
2102 >> EXT4_BLOCK_SIZE_BITS(sb);
2103 err = ext4_ext_remove_space(inode, last_block);
2104
2105 /* In a multi-transaction truncate, we only make the final
2106 * transaction synchronous. */
2107 if (IS_SYNC(inode))
2108 handle->h_sync = 1;
2109
2110out_stop:
2111 /*
2112 * If this was a simple ftruncate() and the file will remain alive,
2113 * then we need to clear up the orphan record which we created above.
2114 * However, if this was a real unlink then we were called by
2115 * ext4_delete_inode(), and we allow that function to clean up the
2116 * orphan info for us.
2117 */
2118 if (inode->i_nlink)
2119 ext4_orphan_del(handle, inode);
2120
2121 mutex_unlock(&EXT4_I(inode)->truncate_mutex);
2122 ext4_journal_stop(handle);
2123}
2124
2125/*
2126 * ext4_ext_writepage_trans_blocks:
2127 * calculate max number of blocks we could modify
2128 * in order to allocate new block for an inode
2129 */
2130int ext4_ext_writepage_trans_blocks(struct inode *inode, int num)
2131{
2132 int needed;
2133
2134 needed = ext4_ext_calc_credits_for_insert(inode, NULL);
2135
2136 /* caller wants to allocate num blocks, but note it includes sb */
2137 needed = needed * num - (num - 1);
2138
2139#ifdef CONFIG_QUOTA
2140 needed += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
2141#endif
2142
2143 return needed;
2144}
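
/*
 * A short worked example of the formula above (hypothetical numbers):
 * if one allocation needs c credits including the superblock, then num
 * allocations share that single superblock credit, giving
 * c * num - (num - 1).  E.g. c = 11, num = 4: 11 * 4 - 3 = 41 instead
 * of the naive 44, because the superblock is only counted once.
 */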
2145
2146EXPORT_SYMBOL(ext4_mark_inode_dirty);
2147EXPORT_SYMBOL(ext4_ext_invalidate_cache);
2148EXPORT_SYMBOL(ext4_ext_insert_extent);
2149EXPORT_SYMBOL(ext4_ext_walk_space);
2150EXPORT_SYMBOL(ext4_ext_find_goal);
2151EXPORT_SYMBOL(ext4_ext_calc_credits_for_insert);
2152
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
new file mode 100644
index 000000000000..0b622c0624b7
--- /dev/null
+++ b/fs/ext4/file.c
@@ -0,0 +1,139 @@
1/*
2 * linux/fs/ext4/file.c
3 *
4 * Copyright (C) 1992, 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI)
8 *
9 * from
10 *
11 * linux/fs/minix/file.c
12 *
13 * Copyright (C) 1991, 1992 Linus Torvalds
14 *
15 * ext4 fs regular file handling primitives
16 *
17 * 64-bit file support on 64-bit platforms by Jakub Jelinek
18 * (jj@sunsite.ms.mff.cuni.cz)
19 */
20
21#include <linux/time.h>
22#include <linux/fs.h>
23#include <linux/jbd2.h>
24#include <linux/ext4_fs.h>
25#include <linux/ext4_jbd2.h>
26#include "xattr.h"
27#include "acl.h"
28
29/*
30 * Called when an inode is released. Note that this is different
31 * from ext4_file_open: open gets called at every open, but release
32 * gets called only when /all/ the files are closed.
33 */
34static int ext4_release_file (struct inode * inode, struct file * filp)
35{
36 /* if we are the last writer on the inode, drop the block reservation */
37 if ((filp->f_mode & FMODE_WRITE) &&
38 (atomic_read(&inode->i_writecount) == 1))
39 {
40 mutex_lock(&EXT4_I(inode)->truncate_mutex);
41 ext4_discard_reservation(inode);
42 mutex_unlock(&EXT4_I(inode)->truncate_mutex);
43 }
44 if (is_dx(inode) && filp->private_data)
45 ext4_htree_free_dir_info(filp->private_data);
46
47 return 0;
48}
49
50static ssize_t
51ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
52 unsigned long nr_segs, loff_t pos)
53{
54 struct file *file = iocb->ki_filp;
55 struct inode *inode = file->f_dentry->d_inode;
56 ssize_t ret;
57 int err;
58
59 ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
60
61 /*
62 * Skip flushing if there was an error, or if nothing was written.
63 */
64 if (ret <= 0)
65 return ret;
66
67 /*
68 * If the inode is IS_SYNC, or is O_SYNC and we are doing data
69 * journalling then we need to make sure that we force the transaction
70 * to disk to keep all metadata uptodate synchronously.
71 */
72 if (file->f_flags & O_SYNC) {
73 /*
74 * If we are non-data-journaled, then the dirty data has
75 * already been flushed to backing store by generic_osync_inode,
76 * and the inode has been flushed too if there have been any
77 * modifications other than mere timestamp updates.
78 *
79 * Open question --- do we care about flushing timestamps too
80 * if the inode is IS_SYNC?
81 */
82 if (!ext4_should_journal_data(inode))
83 return ret;
84
85 goto force_commit;
86 }
87
88 /*
89 * So we know that there has been no forced data flush. If the inode
90 * is marked IS_SYNC, we need to force one ourselves.
91 */
92 if (!IS_SYNC(inode))
93 return ret;
94
95 /*
96 * Open question #2 --- should we force data to disk here too? If we
97 * don't, the only impact is that data=writeback filesystems won't
98 * flush data to disk automatically on IS_SYNC, only metadata (but
99 * historically, that is what ext2 has done.)
100 */
101
102force_commit:
103 err = ext4_force_commit(inode->i_sb);
104 if (err)
105 return err;
106 return ret;
107}
108
109const struct file_operations ext4_file_operations = {
110 .llseek = generic_file_llseek,
111 .read = do_sync_read,
112 .write = do_sync_write,
113 .aio_read = generic_file_aio_read,
114 .aio_write = ext4_file_write,
115 .ioctl = ext4_ioctl,
116#ifdef CONFIG_COMPAT
117 .compat_ioctl = ext4_compat_ioctl,
118#endif
119 .mmap = generic_file_mmap,
120 .open = generic_file_open,
121 .release = ext4_release_file,
122 .fsync = ext4_sync_file,
123 .sendfile = generic_file_sendfile,
124 .splice_read = generic_file_splice_read,
125 .splice_write = generic_file_splice_write,
126};
127
128struct inode_operations ext4_file_inode_operations = {
129 .truncate = ext4_truncate,
130 .setattr = ext4_setattr,
131#ifdef CONFIG_EXT4DEV_FS_XATTR
132 .setxattr = generic_setxattr,
133 .getxattr = generic_getxattr,
134 .listxattr = ext4_listxattr,
135 .removexattr = generic_removexattr,
136#endif
137 .permission = ext4_permission,
138};
139
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
new file mode 100644
index 000000000000..2a167d7131fa
--- /dev/null
+++ b/fs/ext4/fsync.c
@@ -0,0 +1,88 @@
1/*
2 * linux/fs/ext4/fsync.c
3 *
4 * Copyright (C) 1993 Stephen Tweedie (sct@redhat.com)
5 * from
6 * Copyright (C) 1992 Remy Card (card@masi.ibp.fr)
7 * Laboratoire MASI - Institut Blaise Pascal
8 * Universite Pierre et Marie Curie (Paris VI)
9 * from
10 * linux/fs/minix/truncate.c Copyright (C) 1991, 1992 Linus Torvalds
11 *
12 * ext4fs fsync primitive
13 *
14 * Big-endian to little-endian byte-swapping/bitmaps by
15 * David S. Miller (davem@caip.rutgers.edu), 1995
16 *
17 * Removed unnecessary code duplication for little endian machines
18 * and excessive __inline__s.
19 * Andi Kleen, 1997
20 *
21 * Major simplifications and cleanup - we only need to do the metadata, because
22 * we can depend on generic_block_fdatasync() to sync the data blocks.
23 */
24
25#include <linux/time.h>
26#include <linux/fs.h>
27#include <linux/sched.h>
28#include <linux/writeback.h>
29#include <linux/jbd2.h>
30#include <linux/ext4_fs.h>
31#include <linux/ext4_jbd2.h>
32
33/*
34 * akpm: A new design for ext4_sync_file().
35 *
36 * This is only called from sys_fsync(), sys_fdatasync() and sys_msync().
37 * There cannot be a transaction open by this task.
38 * Another task could have dirtied this inode. Its data can be in any
39 * state in the journalling system.
40 *
41 * What we do is just kick off a commit and wait on it. This will snapshot the
42 * inode to disk.
43 */
44
45int ext4_sync_file(struct file * file, struct dentry *dentry, int datasync)
46{
47 struct inode *inode = dentry->d_inode;
48 int ret = 0;
49
50 J_ASSERT(ext4_journal_current_handle() == 0);
51
52 /*
53 * data=writeback:
54 * The caller's filemap_fdatawrite()/wait will sync the data.
55 * sync_inode() will sync the metadata
56 *
57 * data=ordered:
58 * The caller's filemap_fdatawrite() will write the data and
59 * sync_inode() will write the inode if it is dirty. Then the caller's
60 * filemap_fdatawait() will wait on the pages.
61 *
62 * data=journal:
63 * filemap_fdatawrite won't do anything (the buffers are clean).
64 * ext4_force_commit will write the file data into the journal and
65 * will wait on that.
66 * filemap_fdatawait() will encounter a ton of newly-dirtied pages
67 * (they were dirtied by commit). But that's OK - the blocks are
68 * safe in-journal, which is all fsync() needs to ensure.
69 */
70 if (ext4_should_journal_data(inode)) {
71 ret = ext4_force_commit(inode->i_sb);
72 goto out;
73 }
74
75 /*
76 * The VFS has written the file data. If the inode is unaltered
77 * then we need not start a commit.
78 */
79 if (inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC)) {
80 struct writeback_control wbc = {
81 .sync_mode = WB_SYNC_ALL,
82 .nr_to_write = 0, /* sys_fsync did this */
83 };
84 ret = sync_inode(inode, &wbc);
85 }
86out:
87 return ret;
88}
diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c
new file mode 100644
index 000000000000..a67966385e06
--- /dev/null
+++ b/fs/ext4/hash.c
@@ -0,0 +1,152 @@
1/*
2 * linux/fs/ext4/hash.c
3 *
4 * Copyright (C) 2002 by Theodore Ts'o
5 *
6 * This file is released under the GPL v2.
7 *
8 * This file may be redistributed under the terms of the GNU Public
9 * License.
10 */
11
12#include <linux/fs.h>
13#include <linux/jbd2.h>
14#include <linux/sched.h>
15#include <linux/ext4_fs.h>
16#include <linux/cryptohash.h>
17
18#define DELTA 0x9E3779B9
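/* 0x9E3779B9 is 2^32 divided by the golden ratio -- the standard TEA constant */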
19
20static void TEA_transform(__u32 buf[4], __u32 const in[])
21{
22 __u32 sum = 0;
23 __u32 b0 = buf[0], b1 = buf[1];
24 __u32 a = in[0], b = in[1], c = in[2], d = in[3];
25 int n = 16;
26
27 do {
28 sum += DELTA;
29 b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b);
30 b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d);
31 } while(--n);
32
33 buf[0] += b0;
34 buf[1] += b1;
35}
36
37
38/* The old legacy hash */
39static __u32 dx_hack_hash (const char *name, int len)
40{
41 __u32 hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
42 while (len--) {
43 __u32 hash = hash1 + (hash0 ^ (*name++ * 7152373));
44
45 if (hash & 0x80000000) hash -= 0x7fffffff;
46 hash1 = hash0;
47 hash0 = hash;
48 }
49 return (hash0 << 1);
50}
51
52static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
53{
54 __u32 pad, val;
55 int i;
56
57 pad = (__u32)len | ((__u32)len << 8);
58 pad |= pad << 16;
59
60 val = pad;
61 if (len > num*4)
62 len = num * 4;
63 for (i=0; i < len; i++) {
64 if ((i % 4) == 0)
65 val = pad;
66 val = msg[i] + (val << 8);
67 if ((i % 4) == 3) {
68 *buf++ = val;
69 val = pad;
70 num--;
71 }
72 }
73 if (--num >= 0)
74 *buf++ = val;
75 while (--num >= 0)
76 *buf++ = pad;
77}
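
/*
 * A worked example of the padding above: for msg = "abc" with len = 3
 * and num = 4, pad becomes 0x03030303 (the length replicated into every
 * byte).  The loop folds 'a', 'b', 'c' into val without completing a
 * 4-byte word, so the trailing "if (--num >= 0)" store emits the
 * partial word 0x03616263, and the remaining three words of buf are
 * filled with the pad value 0x03030303.
 */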
78
79/*
80 * Returns the hash of a filename. If len is 0 and name is NULL, then
81 * this function can be used to test whether or not a hash version is
82 * supported.
83 *
84 * The seed is a 4-longword (32-bit) "secret" which can be used to
85 * uniquify a hash. If the seed is all zeros, then some default seed
86 * may be used.
87 *
88 * A particular hash version specifies whether or not the seed is
89 * represented, and whether or not the returned hash is 32 bits or 64
90 * bits. 32 bit hashes will return 0 for the minor hash.
91 */
92int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
93{
94 __u32 hash;
95 __u32 minor_hash = 0;
96 const char *p;
97 int i;
98 __u32 in[8], buf[4];
99
100 /* Initialize the default seed for the hash checksum functions */
101 buf[0] = 0x67452301;
102 buf[1] = 0xefcdab89;
103 buf[2] = 0x98badcfe;
104 buf[3] = 0x10325476;
105
106 /* Check to see if the seed is all zeros */
107 if (hinfo->seed) {
108 for (i=0; i < 4; i++) {
109 if (hinfo->seed[i])
110 break;
111 }
112 if (i < 4)
113 memcpy(buf, hinfo->seed, sizeof(buf));
114 }
115
116 switch (hinfo->hash_version) {
117 case DX_HASH_LEGACY:
118 hash = dx_hack_hash(name, len);
119 break;
120 case DX_HASH_HALF_MD4:
121 p = name;
122 while (len > 0) {
123 str2hashbuf(p, len, in, 8);
124 half_md4_transform(buf, in);
125 len -= 32;
126 p += 32;
127 }
128 minor_hash = buf[2];
129 hash = buf[1];
130 break;
131 case DX_HASH_TEA:
132 p = name;
133 while (len > 0) {
134 str2hashbuf(p, len, in, 4);
135 TEA_transform(buf, in);
136 len -= 16;
137 p += 16;
138 }
139 hash = buf[0];
140 minor_hash = buf[1];
141 break;
142 default:
143 hinfo->hash = 0;
144 return -1;
145 }
146 hash = hash & ~1;
147 if (hash == (EXT4_HTREE_EOF << 1))
148 hash = (EXT4_HTREE_EOF-1) << 1;
149 hinfo->hash = hash;
150 hinfo->minor_hash = minor_hash;
151 return 0;
152}
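
/*
 * A minimal usage sketch (hedged: example_name_hash() is hypothetical,
 * not part of this patch): callers fill a dx_hash_info, pick a hash
 * version, and read the major/minor hash back out of the struct.
 */
static inline __u32 example_name_hash(const char *name, int len)
{
	struct dx_hash_info hinfo;

	hinfo.hash_version = DX_HASH_TEA;
	hinfo.seed = NULL;	/* all-zero/absent seed: use the default */
	if (ext4fs_dirhash(name, len, &hinfo) < 0)
		return 0;	/* unsupported hash version */
	return hinfo.hash;	/* minor hash is in hinfo.minor_hash */
}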
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
new file mode 100644
index 000000000000..c88b439ba5cd
--- /dev/null
+++ b/fs/ext4/ialloc.c
@@ -0,0 +1,772 @@
1/*
2 * linux/fs/ext4/ialloc.c
3 *
4 * Copyright (C) 1992, 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI)
8 *
9 * BSD ufs-inspired inode and directory allocation by
10 * Stephen Tweedie (sct@redhat.com), 1993
11 * Big-endian to little-endian byte-swapping/bitmaps by
12 * David S. Miller (davem@caip.rutgers.edu), 1995
13 */
14
15#include <linux/time.h>
16#include <linux/fs.h>
17#include <linux/jbd2.h>
18#include <linux/ext4_fs.h>
19#include <linux/ext4_jbd2.h>
20#include <linux/stat.h>
21#include <linux/string.h>
22#include <linux/quotaops.h>
23#include <linux/buffer_head.h>
24#include <linux/random.h>
25#include <linux/bitops.h>
26#include <linux/blkdev.h>
27#include <asm/byteorder.h>
28
29#include "xattr.h"
30#include "acl.h"
31
32/*
33 * ialloc.c contains the inodes allocation and deallocation routines
34 */
35
36/*
37 * The free inodes are managed by bitmaps. A file system contains several
38 * blocks groups. Each group contains 1 bitmap block for blocks, 1 bitmap
39 * block for inodes, N blocks for the inode table and data blocks.
40 *
41 * The file system contains group descriptors which are located after the
42 * super block. Each descriptor contains the number of the bitmap block and
43 * the free blocks count in the block.
44 */
45
46
47/*
48 * Read the inode allocation bitmap for a given block_group, reading
49 * into the specified slot in the superblock's bitmap cache.
50 *
51 * Return buffer_head of bitmap on success or NULL.
52 */
53static struct buffer_head *
54read_inode_bitmap(struct super_block * sb, unsigned long block_group)
55{
56 struct ext4_group_desc *desc;
57 struct buffer_head *bh = NULL;
58
59 desc = ext4_get_group_desc(sb, block_group, NULL);
60 if (!desc)
61 goto error_out;
62
63 bh = sb_bread(sb, ext4_inode_bitmap(sb, desc));
64 if (!bh)
65 ext4_error(sb, "read_inode_bitmap",
66 "Cannot read inode bitmap - "
67 "block_group = %lu, inode_bitmap = %llu",
68 block_group, ext4_inode_bitmap(sb, desc));
69error_out:
70 return bh;
71}
72
73/*
74 * NOTE! When we get the inode, we're the only people
75 * that have access to it, and as such there are no
76 * race conditions we have to worry about. The inode
77 * is not on the hash-lists, and it cannot be reached
78 * through the filesystem because the directory entry
79 * has been deleted earlier.
80 *
81 * HOWEVER: we must make sure that we get no aliases,
82 * which means that we have to call "clear_inode()"
83 * _before_ we mark the inode not in use in the inode
84 * bitmaps. Otherwise a newly created file might use
85 * the same inode number (not actually the same pointer
86 * though), and then we'd have two inodes sharing the
87 * same inode number and space on the harddisk.
88 */
89void ext4_free_inode (handle_t *handle, struct inode * inode)
90{
91 struct super_block * sb = inode->i_sb;
92 int is_directory;
93 unsigned long ino;
94 struct buffer_head *bitmap_bh = NULL;
95 struct buffer_head *bh2;
96 unsigned long block_group;
97 unsigned long bit;
98 struct ext4_group_desc * gdp;
99 struct ext4_super_block * es;
100 struct ext4_sb_info *sbi;
101 int fatal = 0, err;
102
103 if (atomic_read(&inode->i_count) > 1) {
104 printk ("ext4_free_inode: inode has count=%d\n",
105 atomic_read(&inode->i_count));
106 return;
107 }
108 if (inode->i_nlink) {
109 printk ("ext4_free_inode: inode has nlink=%d\n",
110 inode->i_nlink);
111 return;
112 }
113 if (!sb) {
114 printk("ext4_free_inode: inode on nonexistent device\n");
115 return;
116 }
117 sbi = EXT4_SB(sb);
118
119 ino = inode->i_ino;
120 ext4_debug ("freeing inode %lu\n", ino);
121
122 /*
123 * Note: we must free any quota before locking the superblock,
124 * as writing the quota to disk may need the lock as well.
125 */
126 DQUOT_INIT(inode);
127 ext4_xattr_delete_inode(handle, inode);
128 DQUOT_FREE_INODE(inode);
129 DQUOT_DROP(inode);
130
131 is_directory = S_ISDIR(inode->i_mode);
132
133 /* Do this BEFORE marking the inode not in use or returning an error */
134 clear_inode (inode);
135
136 es = EXT4_SB(sb)->s_es;
137 if (ino < EXT4_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) {
138 ext4_error (sb, "ext4_free_inode",
139 "reserved or nonexistent inode %lu", ino);
140 goto error_return;
141 }
142 block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
143 bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
144 bitmap_bh = read_inode_bitmap(sb, block_group);
145 if (!bitmap_bh)
146 goto error_return;
147
148 BUFFER_TRACE(bitmap_bh, "get_write_access");
149 fatal = ext4_journal_get_write_access(handle, bitmap_bh);
150 if (fatal)
151 goto error_return;
152
153 /* Ok, now we can actually update the inode bitmaps.. */
154 if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group),
155 bit, bitmap_bh->b_data))
156 ext4_error (sb, "ext4_free_inode",
157 "bit already cleared for inode %lu", ino);
158 else {
159 gdp = ext4_get_group_desc (sb, block_group, &bh2);
160
161 BUFFER_TRACE(bh2, "get_write_access");
162 fatal = ext4_journal_get_write_access(handle, bh2);
163 if (fatal) goto error_return;
164
165 if (gdp) {
166 spin_lock(sb_bgl_lock(sbi, block_group));
167 gdp->bg_free_inodes_count = cpu_to_le16(
168 le16_to_cpu(gdp->bg_free_inodes_count) + 1);
169 if (is_directory)
170 gdp->bg_used_dirs_count = cpu_to_le16(
171 le16_to_cpu(gdp->bg_used_dirs_count) - 1);
172 spin_unlock(sb_bgl_lock(sbi, block_group));
173 percpu_counter_inc(&sbi->s_freeinodes_counter);
174 if (is_directory)
175 percpu_counter_dec(&sbi->s_dirs_counter);
176
177 }
178 BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata");
179 err = ext4_journal_dirty_metadata(handle, bh2);
180 if (!fatal) fatal = err;
181 }
182 BUFFER_TRACE(bitmap_bh, "call ext4_journal_dirty_metadata");
183 err = ext4_journal_dirty_metadata(handle, bitmap_bh);
184 if (!fatal)
185 fatal = err;
186 sb->s_dirt = 1;
187error_return:
188 brelse(bitmap_bh);
189 ext4_std_error(sb, fatal);
190}
191
192/*
193 * There are two policies for allocating an inode. If the new inode is
194 * a directory, then a forward search is made for a block group with both
195 * free space and a low directory-to-inode ratio; if that fails, then of
196 * the groups with above-average free space, the one with the fewest
197 * directories is chosen.
198 *
199 * For other inodes, search forward from the parent directory's block
200 * group to find a free inode.
201 */
202static int find_group_dir(struct super_block *sb, struct inode *parent)
203{
204 int ngroups = EXT4_SB(sb)->s_groups_count;
205 unsigned int freei, avefreei;
206 struct ext4_group_desc *desc, *best_desc = NULL;
207 struct buffer_head *bh;
208 int group, best_group = -1;
209
210 freei = percpu_counter_read_positive(&EXT4_SB(sb)->s_freeinodes_counter);
211 avefreei = freei / ngroups;
212
213 for (group = 0; group < ngroups; group++) {
214 desc = ext4_get_group_desc (sb, group, &bh);
215 if (!desc || !desc->bg_free_inodes_count)
216 continue;
217 if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei)
218 continue;
219 if (!best_desc ||
220 (le16_to_cpu(desc->bg_free_blocks_count) >
221 le16_to_cpu(best_desc->bg_free_blocks_count))) {
222 best_group = group;
223 best_desc = desc;
224 }
225 }
226 return best_group;
227}
228
229/*
230 * Orlov's allocator for directories.
231 *
232 * We always try to spread first-level directories.
233 *
234 * If there are blockgroups with both free inode and free block counts
235 * not worse than average, we return the one with the smallest directory
236 * count. Otherwise we simply return a random group.
237 *
238 * The remaining rules look like this:
239 *
240 * It's OK to put a directory into a group unless
241 * it has too many directories already (max_dirs) or
242 * it has too few free inodes left (min_inodes) or
243 * it has too few free blocks left (min_blocks) or
244 * it's already carrying too large a debt (max_debt).
245 * The parent's group is preferred; if it doesn't satisfy these
246 * conditions we search cyclically through the rest. If none
247 * of the groups look good we just look for a group with more
248 * free inodes than average (starting at parent's group).
249 *
250 * Debt is incremented each time we allocate a directory and decremented
251 * when we allocate an inode, within 0--255.
252 */
253
254#define INODE_COST 64
255#define BLOCK_COST 256
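/*
 * A worked example of the max_debt clamp in find_group_orlov() below,
 * with hypothetical numbers: assume 4KB blocks, so
 * EXT4_BLOCKS_PER_GROUP(sb) == 32768 and inodes_per_group == 16384,
 * and an observed blocks_per_dir of about 100. Then:
 *
 * max_debt = 32768 / max(100, BLOCK_COST) = 32768 / 256 = 128
 * 128 * INODE_COST == 8192 <= 16384, so no clamp by inode cost,
 * and 128 is within 1..255, so max_debt ends up as 128.
 */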
256
257static int find_group_orlov(struct super_block *sb, struct inode *parent)
258{
259 int parent_group = EXT4_I(parent)->i_block_group;
260 struct ext4_sb_info *sbi = EXT4_SB(sb);
261 struct ext4_super_block *es = sbi->s_es;
262 int ngroups = sbi->s_groups_count;
263 int inodes_per_group = EXT4_INODES_PER_GROUP(sb);
264 unsigned int freei, avefreei;
265 ext4_fsblk_t freeb, avefreeb;
266 ext4_fsblk_t blocks_per_dir;
267 unsigned int ndirs;
268 int max_debt, max_dirs, min_inodes;
269 ext4_grpblk_t min_blocks;
270 int group = -1, i;
271 struct ext4_group_desc *desc;
272 struct buffer_head *bh;
273
274 freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter);
275 avefreei = freei / ngroups;
276 freeb = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
277 avefreeb = freeb;
278 do_div(avefreeb, ngroups);
279 ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter);
280
281 if ((parent == sb->s_root->d_inode) ||
282 (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL)) {
283 int best_ndir = inodes_per_group;
284 int best_group = -1;
285
286 get_random_bytes(&group, sizeof(group));
287 parent_group = (unsigned)group % ngroups;
288 for (i = 0; i < ngroups; i++) {
289 group = (parent_group + i) % ngroups;
290 desc = ext4_get_group_desc (sb, group, &bh);
291 if (!desc || !desc->bg_free_inodes_count)
292 continue;
293 if (le16_to_cpu(desc->bg_used_dirs_count) >= best_ndir)
294 continue;
295 if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei)
296 continue;
297 if (le16_to_cpu(desc->bg_free_blocks_count) < avefreeb)
298 continue;
299 best_group = group;
300 best_ndir = le16_to_cpu(desc->bg_used_dirs_count);
301 }
302 if (best_group >= 0)
303 return best_group;
304 goto fallback;
305 }
306
307 blocks_per_dir = ext4_blocks_count(es) - freeb;
308 do_div(blocks_per_dir, ndirs);
309
310 max_dirs = ndirs / ngroups + inodes_per_group / 16;
311 min_inodes = avefreei - inodes_per_group / 4;
312 min_blocks = avefreeb - EXT4_BLOCKS_PER_GROUP(sb) / 4;
313
314 max_debt = EXT4_BLOCKS_PER_GROUP(sb);
315 max_debt /= max_t(int, blocks_per_dir, BLOCK_COST);
316 if (max_debt * INODE_COST > inodes_per_group)
317 max_debt = inodes_per_group / INODE_COST;
318 if (max_debt > 255)
319 max_debt = 255;
320 if (max_debt == 0)
321 max_debt = 1;
322
323 for (i = 0; i < ngroups; i++) {
324 group = (parent_group + i) % ngroups;
325 desc = ext4_get_group_desc (sb, group, &bh);
326 if (!desc || !desc->bg_free_inodes_count)
327 continue;
328 if (le16_to_cpu(desc->bg_used_dirs_count) >= max_dirs)
329 continue;
330 if (le16_to_cpu(desc->bg_free_inodes_count) < min_inodes)
331 continue;
332 if (le16_to_cpu(desc->bg_free_blocks_count) < min_blocks)
333 continue;
334 return group;
335 }
336
337fallback:
338 for (i = 0; i < ngroups; i++) {
339 group = (parent_group + i) % ngroups;
340 desc = ext4_get_group_desc (sb, group, &bh);
341 if (!desc || !desc->bg_free_inodes_count)
342 continue;
343 if (le16_to_cpu(desc->bg_free_inodes_count) >= avefreei)
344 return group;
345 }
346
347 if (avefreei) {
348 /*
349 * The free-inodes counter is approximate, and for really small
350 * filesystems the above test can fail to find any blockgroups
351 */
352 avefreei = 0;
353 goto fallback;
354 }
355
356 return -1;
357}
358
359static int find_group_other(struct super_block *sb, struct inode *parent)
360{
361 int parent_group = EXT4_I(parent)->i_block_group;
362 int ngroups = EXT4_SB(sb)->s_groups_count;
363 struct ext4_group_desc *desc;
364 struct buffer_head *bh;
365 int group, i;
366
367 /*
368 * Try to place the inode in its parent directory
369 */
370 group = parent_group;
371 desc = ext4_get_group_desc (sb, group, &bh);
372 if (desc && le16_to_cpu(desc->bg_free_inodes_count) &&
373 le16_to_cpu(desc->bg_free_blocks_count))
374 return group;
375
376 /*
377 * We're going to place this inode in a different blockgroup from its
378 * parent. We want to cause files in a common directory to all land in
379 * the same blockgroup. But we want files which are in a different
380 * directory which shares a blockgroup with our parent to land in a
381 * different blockgroup.
382 *
383 * So add our directory's i_ino into the starting point for the hash.
384 */
385 group = (group + parent->i_ino) % ngroups;
386
387 /*
388 * Use a quadratic hash to find a group with a free inode and some free
389 * blocks.
390 */
391 for (i = 1; i < ngroups; i <<= 1) {
392 group += i;
393 if (group >= ngroups)
394 group -= ngroups;
395 desc = ext4_get_group_desc (sb, group, &bh);
396 if (desc && le16_to_cpu(desc->bg_free_inodes_count) &&
397 le16_to_cpu(desc->bg_free_blocks_count))
398 return group;
399 }
400
401 /*
402 * That failed: try linear search for a free inode, even if that group
403 * has no free blocks.
404 */
405 group = parent_group;
406 for (i = 0; i < ngroups; i++) {
407 if (++group >= ngroups)
408 group = 0;
409 desc = ext4_get_group_desc (sb, group, &bh);
410 if (desc && le16_to_cpu(desc->bg_free_inodes_count))
411 return group;
412 }
413
414 return -1;
415}
416
417/*
418 * There are two policies for allocating an inode. If the new inode is
419 * a directory, then a forward search is made for a block group with both
420 * free space and a low directory-to-inode ratio; if that fails, then of
421 * the groups with above-average free space, the one with the fewest
422 * directories is chosen.
423 *
424 * For other inodes, search forward from the parent directory's block
425 * group to find a free inode.
426 */
427struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
428{
429 struct super_block *sb;
430 struct buffer_head *bitmap_bh = NULL;
431 struct buffer_head *bh2;
432 int group;
433 unsigned long ino = 0;
434 struct inode * inode;
435 struct ext4_group_desc * gdp = NULL;
436 struct ext4_super_block * es;
437 struct ext4_inode_info *ei;
438 struct ext4_sb_info *sbi;
439 int err = 0;
440 struct inode *ret;
441 int i;
442
443 /* Cannot create files in a deleted directory */
444 if (!dir || !dir->i_nlink)
445 return ERR_PTR(-EPERM);
446
447 sb = dir->i_sb;
448 inode = new_inode(sb);
449 if (!inode)
450 return ERR_PTR(-ENOMEM);
451 ei = EXT4_I(inode);
452
453 sbi = EXT4_SB(sb);
454 es = sbi->s_es;
455 if (S_ISDIR(mode)) {
456 if (test_opt (sb, OLDALLOC))
457 group = find_group_dir(sb, dir);
458 else
459 group = find_group_orlov(sb, dir);
460 } else
461 group = find_group_other(sb, dir);
462
463 err = -ENOSPC;
464 if (group == -1)
465 goto out;
466
467 for (i = 0; i < sbi->s_groups_count; i++) {
468 err = -EIO;
469
470 gdp = ext4_get_group_desc(sb, group, &bh2);
471 if (!gdp)
472 goto fail;
473
474 brelse(bitmap_bh);
475 bitmap_bh = read_inode_bitmap(sb, group);
476 if (!bitmap_bh)
477 goto fail;
478
479 ino = 0;
480
481repeat_in_this_group:
482 ino = ext4_find_next_zero_bit((unsigned long *)
483 bitmap_bh->b_data, EXT4_INODES_PER_GROUP(sb), ino);
484 if (ino < EXT4_INODES_PER_GROUP(sb)) {
485
486 BUFFER_TRACE(bitmap_bh, "get_write_access");
487 err = ext4_journal_get_write_access(handle, bitmap_bh);
488 if (err)
489 goto fail;
490
491 if (!ext4_set_bit_atomic(sb_bgl_lock(sbi, group),
492 ino, bitmap_bh->b_data)) {
493 /* we won it */
494 BUFFER_TRACE(bitmap_bh,
495 "call ext4_journal_dirty_metadata");
496 err = ext4_journal_dirty_metadata(handle,
497 bitmap_bh);
498 if (err)
499 goto fail;
500 goto got;
501 }
502 /* we lost it */
503 jbd2_journal_release_buffer(handle, bitmap_bh);
504
505 if (++ino < EXT4_INODES_PER_GROUP(sb))
506 goto repeat_in_this_group;
507 }
508
509 /*
510 * This case is possible in a concurrent environment. It is very
511 * rare. We cannot repeat the find_group_xxx() call because
512 * that will simply return the same blockgroup, because the
513 * group descriptor metadata has not yet been updated.
514 * So we just go on to the next blockgroup.
515 */
516 if (++group == sbi->s_groups_count)
517 group = 0;
518 }
519 err = -ENOSPC;
520 goto out;
521
522got:
523 ino += group * EXT4_INODES_PER_GROUP(sb) + 1;
524 if (ino < EXT4_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) {
525 ext4_error (sb, "ext4_new_inode",
526 "reserved inode or inode > inodes count - "
527 "block_group = %d, inode=%lu", group, ino);
528 err = -EIO;
529 goto fail;
530 }
531
532 BUFFER_TRACE(bh2, "get_write_access");
533 err = ext4_journal_get_write_access(handle, bh2);
534 if (err) goto fail;
535 spin_lock(sb_bgl_lock(sbi, group));
536 gdp->bg_free_inodes_count =
537 cpu_to_le16(le16_to_cpu(gdp->bg_free_inodes_count) - 1);
538 if (S_ISDIR(mode)) {
539 gdp->bg_used_dirs_count =
540 cpu_to_le16(le16_to_cpu(gdp->bg_used_dirs_count) + 1);
541 }
542 spin_unlock(sb_bgl_lock(sbi, group));
543 BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata");
544 err = ext4_journal_dirty_metadata(handle, bh2);
545 if (err) goto fail;
546
547 percpu_counter_dec(&sbi->s_freeinodes_counter);
548 if (S_ISDIR(mode))
549 percpu_counter_inc(&sbi->s_dirs_counter);
550 sb->s_dirt = 1;
551
552 inode->i_uid = current->fsuid;
553 if (test_opt (sb, GRPID))
554 inode->i_gid = dir->i_gid;
555 else if (dir->i_mode & S_ISGID) {
556 inode->i_gid = dir->i_gid;
557 if (S_ISDIR(mode))
558 mode |= S_ISGID;
559 } else
560 inode->i_gid = current->fsgid;
561 inode->i_mode = mode;
562
563 inode->i_ino = ino;
564 /* This is the optimal IO size (for stat), not the fs block size */
565 inode->i_blocks = 0;
566 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
567
568 memset(ei->i_data, 0, sizeof(ei->i_data));
569 ei->i_dir_start_lookup = 0;
570 ei->i_disksize = 0;
571
572 ei->i_flags = EXT4_I(dir)->i_flags & ~EXT4_INDEX_FL;
573 if (S_ISLNK(mode))
574 ei->i_flags &= ~(EXT4_IMMUTABLE_FL|EXT4_APPEND_FL);
575 /* dirsync only applies to directories */
576 if (!S_ISDIR(mode))
577 ei->i_flags &= ~EXT4_DIRSYNC_FL;
578#ifdef EXT4_FRAGMENTS
579 ei->i_faddr = 0;
580 ei->i_frag_no = 0;
581 ei->i_frag_size = 0;
582#endif
583 ei->i_file_acl = 0;
584 ei->i_dir_acl = 0;
585 ei->i_dtime = 0;
586 ei->i_block_alloc_info = NULL;
587 ei->i_block_group = group;
588
589 ext4_set_inode_flags(inode);
590 if (IS_DIRSYNC(inode))
591 handle->h_sync = 1;
592 insert_inode_hash(inode);
593 spin_lock(&sbi->s_next_gen_lock);
594 inode->i_generation = sbi->s_next_generation++;
595 spin_unlock(&sbi->s_next_gen_lock);
596
597 ei->i_state = EXT4_STATE_NEW;
598 ei->i_extra_isize =
599 (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) ?
600 sizeof(struct ext4_inode) - EXT4_GOOD_OLD_INODE_SIZE : 0;
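/*
 * For example, on a filesystem built with 256-byte on-disk inodes this
 * leaves i_extra_isize == sizeof(struct ext4_inode) - 128: the space
 * beyond the classic 128-byte inode that newer fields can occupy.
 */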
601
602 ret = inode;
603 if (DQUOT_ALLOC_INODE(inode)) {
604 err = -EDQUOT;
605 goto fail_drop;
606 }
607
608 err = ext4_init_acl(handle, inode, dir);
609 if (err)
610 goto fail_free_drop;
611
612 err = ext4_init_security(handle, inode, dir);
613 if (err)
614 goto fail_free_drop;
615
616 err = ext4_mark_inode_dirty(handle, inode);
617 if (err) {
618 ext4_std_error(sb, err);
619 goto fail_free_drop;
620 }
621 if (test_opt(sb, EXTENTS)) {
622 EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL;
623 ext4_ext_tree_init(handle, inode);
624 if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
625 err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
626 if (err) goto fail;
627 EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS);
628 BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "call ext4_journal_dirty_metadata");
629 err = ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh);
630 }
631 }
632
633 ext4_debug("allocating inode %lu\n", inode->i_ino);
634 goto really_out;
635fail:
636 ext4_std_error(sb, err);
637out:
638 iput(inode);
639 ret = ERR_PTR(err);
640really_out:
641 brelse(bitmap_bh);
642 return ret;
643
644fail_free_drop:
645 DQUOT_FREE_INODE(inode);
646
647fail_drop:
648 DQUOT_DROP(inode);
649 inode->i_flags |= S_NOQUOTA;
650 inode->i_nlink = 0;
651 iput(inode);
652 brelse(bitmap_bh);
653 return ERR_PTR(err);
654}
655
656/* Verify that we are loading a valid orphan from disk */
657struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
658{
659 unsigned long max_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count);
660 unsigned long block_group;
661 int bit;
662 struct buffer_head *bitmap_bh = NULL;
663 struct inode *inode = NULL;
664
665 /* Error cases - e2fsck has already cleaned up for us */
666 if (ino > max_ino) {
667 ext4_warning(sb, __FUNCTION__,
668 "bad orphan ino %lu! e2fsck was run?", ino);
669 goto out;
670 }
671
672 block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
673 bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
674 bitmap_bh = read_inode_bitmap(sb, block_group);
675 if (!bitmap_bh) {
676 ext4_warning(sb, __FUNCTION__,
677 "inode bitmap error for orphan %lu", ino);
678 goto out;
679 }
680
681 /* Having the inode bit set should be a 100% indicator that this
682 * is a valid orphan (no e2fsck run on fs). Orphans also include
683 * inodes that were being truncated, so we can't check i_nlink==0.
684 */
685 if (!ext4_test_bit(bit, bitmap_bh->b_data) ||
686 !(inode = iget(sb, ino)) || is_bad_inode(inode) ||
687 NEXT_ORPHAN(inode) > max_ino) {
688 ext4_warning(sb, __FUNCTION__,
689 "bad orphan inode %lu! e2fsck was run?", ino);
690 printk(KERN_NOTICE "ext4_test_bit(bit=%d, block=%llu) = %d\n",
691 bit, (unsigned long long)bitmap_bh->b_blocknr,
692 ext4_test_bit(bit, bitmap_bh->b_data));
693 printk(KERN_NOTICE "inode=%p\n", inode);
694 if (inode) {
695 printk(KERN_NOTICE "is_bad_inode(inode)=%d\n",
696 is_bad_inode(inode));
697 printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%u\n",
698 NEXT_ORPHAN(inode));
699 printk(KERN_NOTICE "max_ino=%lu\n", max_ino);
700 }
701 /* Avoid freeing blocks if we got a bad deleted inode */
702 if (inode && inode->i_nlink == 0)
703 inode->i_blocks = 0;
704 iput(inode);
705 inode = NULL;
706 }
707out:
708 brelse(bitmap_bh);
709 return inode;
710}
711
712unsigned long ext4_count_free_inodes (struct super_block * sb)
713{
714 unsigned long desc_count;
715 struct ext4_group_desc *gdp;
716 int i;
717#ifdef EXT4FS_DEBUG
718 struct ext4_super_block *es;
719 unsigned long bitmap_count, x;
720 struct buffer_head *bitmap_bh = NULL;
721
722 es = EXT4_SB(sb)->s_es;
723 desc_count = 0;
724 bitmap_count = 0;
725 gdp = NULL;
726 for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) {
727 gdp = ext4_get_group_desc (sb, i, NULL);
728 if (!gdp)
729 continue;
730 desc_count += le16_to_cpu(gdp->bg_free_inodes_count);
731 brelse(bitmap_bh);
732 bitmap_bh = read_inode_bitmap(sb, i);
733 if (!bitmap_bh)
734 continue;
735
736 x = ext4_count_free(bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8);
737 printk("group %d: stored = %d, counted = %lu\n",
738 i, le16_to_cpu(gdp->bg_free_inodes_count), x);
739 bitmap_count += x;
740 }
741 brelse(bitmap_bh);
742 printk("ext4_count_free_inodes: stored = %u, computed = %lu, %lu\n",
743 le32_to_cpu(es->s_free_inodes_count), desc_count, bitmap_count);
744 return desc_count;
745#else
746 desc_count = 0;
747 for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) {
748 gdp = ext4_get_group_desc (sb, i, NULL);
749 if (!gdp)
750 continue;
751 desc_count += le16_to_cpu(gdp->bg_free_inodes_count);
752 cond_resched();
753 }
754 return desc_count;
755#endif
756}
757
758/* Called at mount-time, super-block is locked */
759unsigned long ext4_count_dirs (struct super_block * sb)
760{
761 unsigned long count = 0;
762 int i;
763
764 for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) {
765 struct ext4_group_desc *gdp = ext4_get_group_desc (sb, i, NULL);
766 if (!gdp)
767 continue;
768 count += le16_to_cpu(gdp->bg_used_dirs_count);
769 }
770 return count;
771}
772
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
new file mode 100644
index 000000000000..0a60ec5a16db
--- /dev/null
+++ b/fs/ext4/inode.c
@@ -0,0 +1,3233 @@
1/*
2 * linux/fs/ext4/inode.c
3 *
4 * Copyright (C) 1992, 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI)
8 *
9 * from
10 *
11 * linux/fs/minix/inode.c
12 *
13 * Copyright (C) 1991, 1992 Linus Torvalds
14 *
15 * Goal-directed block allocation by Stephen Tweedie
16 * (sct@redhat.com), 1993, 1998
17 * Big-endian to little-endian byte-swapping/bitmaps by
18 * David S. Miller (davem@caip.rutgers.edu), 1995
19 * 64-bit file support on 64-bit platforms by Jakub Jelinek
20 * (jj@sunsite.ms.mff.cuni.cz)
21 *
22 * Assorted race fixes, rewrite of ext4_get_block() by Al Viro, 2000
23 */
24
25#include <linux/module.h>
26#include <linux/fs.h>
27#include <linux/time.h>
28#include <linux/ext4_jbd2.h>
29#include <linux/jbd2.h>
30#include <linux/smp_lock.h>
31#include <linux/highuid.h>
32#include <linux/pagemap.h>
33#include <linux/quotaops.h>
34#include <linux/string.h>
35#include <linux/buffer_head.h>
36#include <linux/writeback.h>
37#include <linux/mpage.h>
38#include <linux/uio.h>
39#include <linux/bio.h>
40#include "xattr.h"
41#include "acl.h"
42
43/*
44 * Test whether an inode is a fast symlink.
45 */
46static int ext4_inode_is_fast_symlink(struct inode *inode)
47{
48 int ea_blocks = EXT4_I(inode)->i_file_acl ?
49 (inode->i_sb->s_blocksize >> 9) : 0;
50
51 return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0);
52}
53
54/*
55 * The ext4 forget function must perform a revoke if we are freeing data
56 * which has been journaled. Metadata (e.g. indirect blocks) must be
57 * revoked in all cases.
58 *
59 * "bh" may be NULL: a metadata block may have been freed from memory
60 * but there may still be a record of it in the journal, and that record
61 * still needs to be revoked.
62 */
63int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
64 struct buffer_head *bh, ext4_fsblk_t blocknr)
65{
66 int err;
67
68 might_sleep();
69
70 BUFFER_TRACE(bh, "enter");
71
72 jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
73 "data mode %lx\n",
74 bh, is_metadata, inode->i_mode,
75 test_opt(inode->i_sb, DATA_FLAGS));
76
77 /* Never use the revoke function if we are doing full data
78 * journaling: there is no need to, and a V1 superblock won't
79 * support it. Otherwise, only skip the revoke on un-journaled
80 * data blocks. */
81
82 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
83 (!is_metadata && !ext4_should_journal_data(inode))) {
84 if (bh) {
85 BUFFER_TRACE(bh, "call jbd2_journal_forget");
86 return ext4_journal_forget(handle, bh);
87 }
88 return 0;
89 }
90
91 /*
92 * data!=journal && (is_metadata || should_journal_data(inode))
93 */
94 BUFFER_TRACE(bh, "call ext4_journal_revoke");
95 err = ext4_journal_revoke(handle, blocknr, bh);
96 if (err)
97 ext4_abort(inode->i_sb, __FUNCTION__,
98 "error %d when attempting revoke", err);
99 BUFFER_TRACE(bh, "exit");
100 return err;
101}
102
103/*
104 * Work out how many blocks we need to proceed with the next chunk of a
105 * truncate transaction.
106 */
107static unsigned long blocks_for_truncate(struct inode *inode)
108{
109 unsigned long needed;
110
111 needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);
112
113 /* Give ourselves just enough room to cope with inodes in which
114 * i_blocks is corrupt: we've seen disk corruptions in the past
115 * which resulted in random data in an inode which looked enough
116 * like a regular file for ext4 to try to delete it. Things
117 * will go a bit crazy if that happens, but at least we should
118 * try not to panic the whole kernel. */
119 if (needed < 2)
120 needed = 2;
121
122 /* But we need to bound the transaction so we don't overflow the
123 * journal. */
124 if (needed > EXT4_MAX_TRANS_DATA)
125 needed = EXT4_MAX_TRANS_DATA;
126
127 return EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + needed;
128}
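/*
 * A quick sketch of the arithmetic above, with hypothetical numbers: on
 * a 4KB-block filesystem (s_blocksize_bits == 12) an inode with
 * i_blocks == 4096 (512-byte units, i.e. 2MB of data) gives
 *
 * needed = 4096 >> (12 - 9) = 512
 *
 * which the EXT4_MAX_TRANS_DATA test then bounds, so the handle is
 * sized at EXT4_DATA_TRANS_BLOCKS(inode->i_sb) plus that capped value.
 */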
129
130/*
131 * Truncate transactions can be complex and absolutely huge. So we need to
132 * be able to restart the transaction at a convenient checkpoint to make
133 * sure we don't overflow the journal.
134 *
135 * start_transaction gets us a new handle for a truncate transaction,
136 * and extend_transaction tries to extend the existing one a bit. If
137 * extend fails, we need to propagate the failure up and restart the
138 * transaction in the top-level truncate loop. --sct
139 */
140static handle_t *start_transaction(struct inode *inode)
141{
142 handle_t *result;
143
144 result = ext4_journal_start(inode, blocks_for_truncate(inode));
145 if (!IS_ERR(result))
146 return result;
147
148 ext4_std_error(inode->i_sb, PTR_ERR(result));
149 return result;
150}
151
152/*
153 * Try to extend this transaction for the purposes of truncation.
154 *
155 * Returns 0 if we managed to create more room. If we can't create more
156 * room and the transaction must be restarted, we return 1.
157 */
158static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
159{
160 if (handle->h_buffer_credits > EXT4_RESERVE_TRANS_BLOCKS)
161 return 0;
162 if (!ext4_journal_extend(handle, blocks_for_truncate(inode)))
163 return 0;
164 return 1;
165}
166
167/*
168 * Restart the transaction associated with *handle. This does a commit,
169 * so before we call here everything must be consistently dirtied against
170 * this transaction.
171 */
172static int ext4_journal_test_restart(handle_t *handle, struct inode *inode)
173{
174 jbd_debug(2, "restarting handle %p\n", handle);
175 return ext4_journal_restart(handle, blocks_for_truncate(inode));
176}
177
178/*
179 * Called at the last iput() if i_nlink is zero.
180 */
181void ext4_delete_inode (struct inode * inode)
182{
183 handle_t *handle;
184
185 truncate_inode_pages(&inode->i_data, 0);
186
187 if (is_bad_inode(inode))
188 goto no_delete;
189
190 handle = start_transaction(inode);
191 if (IS_ERR(handle)) {
192 /*
193 * If we're going to skip the normal cleanup, we still need to
194 * make sure that the in-core orphan linked list is properly
195 * cleaned up.
196 */
197 ext4_orphan_del(NULL, inode);
198 goto no_delete;
199 }
200
201 if (IS_SYNC(inode))
202 handle->h_sync = 1;
203 inode->i_size = 0;
204 if (inode->i_blocks)
205 ext4_truncate(inode);
206 /*
207 * Kill off the orphan record which ext4_truncate created.
208 * AKPM: I think this can be inside the above `if'.
209 * Note that ext4_orphan_del() has to be able to cope with the
210 * deletion of a non-existent orphan - this is because we don't
211 * know if ext4_truncate() actually created an orphan record.
212 * (Well, we could do this if we need to, but heck - it works)
213 */
214 ext4_orphan_del(handle, inode);
215 EXT4_I(inode)->i_dtime = get_seconds();
216
217 /*
218 * One subtle ordering requirement: if anything has gone wrong
219 * (transaction abort, IO errors, whatever), then we can still
220 * do these next steps (the fs will already have been marked as
221 * having errors), but we can't free the inode if the mark_dirty
222 * fails.
223 */
224 if (ext4_mark_inode_dirty(handle, inode))
225 /* If that failed, just do the required in-core inode clear. */
226 clear_inode(inode);
227 else
228 ext4_free_inode(handle, inode);
229 ext4_journal_stop(handle);
230 return;
231no_delete:
232 clear_inode(inode); /* We must guarantee clearing of inode... */
233}
234
235typedef struct {
236 __le32 *p;
237 __le32 key;
238 struct buffer_head *bh;
239} Indirect;
240
241static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v)
242{
243 p->key = *(p->p = v);
244 p->bh = bh;
245}
246
247static int verify_chain(Indirect *from, Indirect *to)
248{
249 while (from <= to && from->key == *from->p)
250 from++;
251 return (from > to);
252}
253
254/**
255 * ext4_block_to_path - parse the block number into array of offsets
256 * @inode: inode in question (we are only interested in its superblock)
257 * @i_block: block number to be parsed
258 * @offsets: array to store the offsets in
259 * @boundary: set this non-zero if the referred-to block is likely to be
260 * followed (on disk) by an indirect block.
261 *
262 * To store the locations of a file's data, ext4 uses a data structure
263 * common to UNIX filesystems - a tree of pointers anchored in the inode,
264 * with data blocks at the leaves and indirect blocks in intermediate nodes.
265 * This function translates the block number into a path in that tree -
266 * the return value is the path length and @offsets[n] is the offset of the
267 * pointer to the (n+1)th node in the nth one. If @block is out of range
268 * (negative or too large), a warning is printed and zero is returned.
269 *
270 * Note: function doesn't find node addresses, so no IO is needed. All
271 * we need to know is the capacity of indirect blocks (taken from the
272 * inode->i_sb).
273 */
274
275/*
276 * Portability note: the last comparison (check that we fit into triple
277 * indirect block) is spelled differently, because otherwise on an
278 * architecture with 32-bit longs and 8Kb pages we might get into trouble
279 * if our filesystem had 8Kb blocks. We might use long long, but that would
280 * kill us on x86. Oh, well, at least the sign propagation does not matter -
281 * i_block would have to be negative in the very beginning, so we would not
282 * get there at all.
283 */
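/*
 * A worked example (hypothetical numbers): with 4KB blocks we have
 * ptrs == 1024 and ptrs_bits == 10, so for i_block == 5000:
 *
 * 5000 - 12 (direct) = 4988; 4988 - 1024 (indirect) = 3964;
 * 3964 < 1024 * 1024, so the block is doubly-indirect and
 * offsets[] = { EXT4_DIND_BLOCK, 3964 >> 10 == 3, 3964 & 1023 == 892 }
 *
 * giving n == 3 and *boundary == 1023 - 892 == 131 blocks left before
 * the next indirect-block boundary.
 */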
284
285static int ext4_block_to_path(struct inode *inode,
286 long i_block, int offsets[4], int *boundary)
287{
288 int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb);
289 int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb);
290 const long direct_blocks = EXT4_NDIR_BLOCKS,
291 indirect_blocks = ptrs,
292 double_blocks = (1 << (ptrs_bits * 2));
293 int n = 0;
294 int final = 0;
295
296 if (i_block < 0) {
297 ext4_warning (inode->i_sb, "ext4_block_to_path", "block < 0");
298 } else if (i_block < direct_blocks) {
299 offsets[n++] = i_block;
300 final = direct_blocks;
301 } else if ((i_block -= direct_blocks) < indirect_blocks) {
302 offsets[n++] = EXT4_IND_BLOCK;
303 offsets[n++] = i_block;
304 final = ptrs;
305 } else if ((i_block -= indirect_blocks) < double_blocks) {
306 offsets[n++] = EXT4_DIND_BLOCK;
307 offsets[n++] = i_block >> ptrs_bits;
308 offsets[n++] = i_block & (ptrs - 1);
309 final = ptrs;
310 } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) {
311 offsets[n++] = EXT4_TIND_BLOCK;
312 offsets[n++] = i_block >> (ptrs_bits * 2);
313 offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1);
314 offsets[n++] = i_block & (ptrs - 1);
315 final = ptrs;
316 } else {
317 ext4_warning(inode->i_sb, "ext4_block_to_path", "block > big");
318 }
319 if (boundary)
320 *boundary = final - 1 - (i_block & (ptrs - 1));
321 return n;
322}
323
324/**
325 * ext4_get_branch - read the chain of indirect blocks leading to data
326 * @inode: inode in question
327 * @depth: depth of the chain (1 - direct pointer, etc.)
328 * @offsets: offsets of pointers in inode/indirect blocks
329 * @chain: place to store the result
330 * @err: here we store the error value
331 *
332 * Function fills the array of triples <key, p, bh> and returns %NULL
333 * if everything went OK or the pointer to the last filled triple
334 * (incomplete one) otherwise. Upon the return chain[i].key contains
335 * the number of (i+1)-th block in the chain (as it is stored in memory,
336 * i.e. little-endian 32-bit), chain[i].p contains the address of that
337 * number (it points into struct inode for i==0 and into the bh->b_data
338 * for i>0) and chain[i].bh points to the buffer_head of i-th indirect
339 * block for i>0 and NULL for i==0. In other words, it holds the block
340 * numbers of the chain, addresses they were taken from (and where we can
341 * verify that chain did not change) and buffer_heads hosting these
342 * numbers.
343 *
344 * Function stops when it stumbles upon zero pointer (absent block)
345 * (pointer to last triple returned, *@err == 0)
346 * or when it gets an IO error reading an indirect block
347 * (ditto, *@err == -EIO)
348 * or when it notices that chain had been changed while it was reading
349 * (ditto, *@err == -EAGAIN)
350 * or when it reads all @depth-1 indirect blocks successfully and finds
351 * the whole chain, all the way to the data (returns %NULL, *err == 0).
352 */
353static Indirect *ext4_get_branch(struct inode *inode, int depth, int *offsets,
354 Indirect chain[4], int *err)
355{
356 struct super_block *sb = inode->i_sb;
357 Indirect *p = chain;
358 struct buffer_head *bh;
359
360 *err = 0;
361 /* i_data is not going away, no lock needed */
362 add_chain (chain, NULL, EXT4_I(inode)->i_data + *offsets);
363 if (!p->key)
364 goto no_block;
365 while (--depth) {
366 bh = sb_bread(sb, le32_to_cpu(p->key));
367 if (!bh)
368 goto failure;
369 /* Reader: pointers */
370 if (!verify_chain(chain, p))
371 goto changed;
372 add_chain(++p, bh, (__le32*)bh->b_data + *++offsets);
373 /* Reader: end */
374 if (!p->key)
375 goto no_block;
376 }
377 return NULL;
378
379changed:
380 brelse(bh);
381 *err = -EAGAIN;
382 goto no_block;
383failure:
384 *err = -EIO;
385no_block:
386 return p;
387}
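/*
 * To illustrate the chain layout (a sketch, reusing the hypothetical
 * doubly-indirect example with offsets == { EXT4_DIND_BLOCK, 3, 892 }):
 *
 * chain[0].p -> &EXT4_I(inode)->i_data[EXT4_DIND_BLOCK], bh == NULL
 * chain[1].p -> b_data[3] of the double-indirect block's bh
 * chain[2].p -> b_data[892] of the indirect block's bh
 *
 * and chain[2].key is then the little-endian number of the data block
 * itself, or zero if the file has a hole there.
 */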
388
389/**
390 * ext4_find_near - find a place for allocation with sufficient locality
391 * @inode: owner
392 * @ind: descriptor of indirect block.
393 *
394 * This function returns the preferred place for block allocation.
395 * It is used when heuristic for sequential allocation fails.
396 * Rules are:
397 * + if there is a block to the left of our position - allocate near it.
398 * + if pointer will live in indirect block - allocate near that block.
399 * + if pointer will live in inode - allocate in the same
400 * cylinder group.
401 *
402 * In the latter case we colour the starting block by the caller's PID to
403 * prevent it from clashing with concurrent allocations for a different inode
404 * in the same block group. The PID is used here so that functionally related
405 * files will be close-by on-disk.
406 *
407 * Caller must make sure that @ind is valid and will stay that way.
408 */
409static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
410{
411 struct ext4_inode_info *ei = EXT4_I(inode);
412 __le32 *start = ind->bh ? (__le32*) ind->bh->b_data : ei->i_data;
413 __le32 *p;
414 ext4_fsblk_t bg_start;
415 ext4_grpblk_t colour;
416
417 /* Try to find previous block */
418 for (p = ind->p - 1; p >= start; p--) {
419 if (*p)
420 return le32_to_cpu(*p);
421 }
422
423 /* No such thing, so let's try location of indirect block */
424 if (ind->bh)
425 return ind->bh->b_blocknr;
426
427 /*
428 * Is it going to be referred to from the inode itself? OK, just put it
429 * into the same cylinder group then.
430 */
431 bg_start = ext4_group_first_block_no(inode->i_sb, ei->i_block_group);
432 colour = (current->pid % 16) *
433 (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
434 return bg_start + colour;
435}
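/*
 * A sketch of the colouring above, with hypothetical numbers: with
 * 32768 blocks per group, a caller with pid 4242 gets
 *
 * colour = (4242 % 16) * (32768 / 16) = 2 * 2048 = 4096
 *
 * so its allocations start 4096 blocks into the group, one of 16 evenly
 * spaced lanes that keep concurrent allocators out of each other's way.
 */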
436
437/**
438 * ext4_find_goal - find a preferred place for allocation.
439 * @inode: owner
440 * @block: block we want
441 * @chain: chain of indirect blocks
442 * @partial: pointer to the last triple within a chain
443 * @goal: place to store the result.
444 *
445 * Normally this function finds the preferred place for block allocation
446 * and returns it.
447 */
448
449static ext4_fsblk_t ext4_find_goal(struct inode *inode, long block,
450 Indirect chain[4], Indirect *partial)
451{
452 struct ext4_block_alloc_info *block_i;
453
454 block_i = EXT4_I(inode)->i_block_alloc_info;
455
456 /*
457 * try the heuristic for sequential allocation,
458 * failing that at least try to get decent locality.
459 */
460 if (block_i && (block == block_i->last_alloc_logical_block + 1)
461 && (block_i->last_alloc_physical_block != 0)) {
462 return block_i->last_alloc_physical_block + 1;
463 }
464
465 return ext4_find_near(inode, partial);
466}
467
468/**
469 * ext4_blks_to_allocate: Look up the block map and count the number
470 * of direct blocks that need to be allocated for the given branch.
471 *
472 * @branch: chain of indirect blocks
473 * @k: number of blocks needed for indirect blocks
474 * @blks: number of data blocks to be mapped.
475 * @blocks_to_boundary: the offset in the indirect block
476 *
477 * Returns the number of direct blocks to allocate; the @k indirect
478 * blocks are accounted for separately by the caller.
479 */
480static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned long blks,
481 int blocks_to_boundary)
482{
483 unsigned long count = 0;
484
485 /*
486 * Simple case: the [t,d]indirect block(s) have not been allocated
487 * yet, so clearly the blocks on that path have not been allocated either
488 */
489 if (k > 0) {
490 /* right now we don't handle cross boundary allocation */
491 if (blks < blocks_to_boundary + 1)
492 count += blks;
493 else
494 count += blocks_to_boundary + 1;
495 return count;
496 }
497
498 count++;
499 while (count < blks && count <= blocks_to_boundary &&
500 le32_to_cpu(*(branch[0].p + count)) == 0) {
501 count++;
502 }
503 return count;
504}
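/*
 * A sketch with hypothetical numbers: with k == 2 indirect blocks still
 * missing, blks == 8 data blocks wanted and blocks_to_boundary == 5,
 * the k > 0 branch above returns min(8, 5 + 1) == 6 direct blocks; the
 * two indirect blocks are accounted for separately by the caller.
 */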
505
506/**
507 * ext4_alloc_blocks: allocate multiple blocks needed for a branch
508 * @indirect_blks: the number of blocks needed for the indirect
509 * blocks
510 *
511 * @new_blocks: on return it will store the new block numbers for
512 * the indirect blocks(if needed) and the first direct block,
513 * @blks: on return it will store the total number of allocated
514 * direct blocks
515 */
516static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
517 ext4_fsblk_t goal, int indirect_blks, int blks,
518 ext4_fsblk_t new_blocks[4], int *err)
519{
520 int target, i;
521 unsigned long count = 0;
522 int index = 0;
523 ext4_fsblk_t current_block = 0;
524 int ret = 0;
525
526 /*
527 * Here we try to allocate the requested multiple blocks at once,
528 * on a best-effort basis.
529 * To build a branch, we should allocate blocks for
530 * the indirect blocks (if not allocated yet), and at least
531 * the first direct block of this branch. That's the
532 * minimum number of blocks we need to allocate (required).
533 */
534 target = blks + indirect_blks;
535
536 while (1) {
537 count = target;
538 /* allocating blocks for indirect blocks and direct blocks */
539 current_block = ext4_new_blocks(handle, inode, goal, &count, err);
540 if (*err)
541 goto failed_out;
542
543 target -= count;
544 /* allocate blocks for indirect blocks */
545 while (index < indirect_blks && count) {
546 new_blocks[index++] = current_block++;
547 count--;
548 }
549
550 if (count > 0)
551 break;
552 }
553
554 /* save the new block number for the first direct block */
555 new_blocks[index] = current_block;
556
557 /* total number of blocks allocated for direct blocks */
558 ret = count;
559 *err = 0;
560 return ret;
561failed_out:
562 for (i = 0; i < index; i++)
563 ext4_free_blocks(handle, inode, new_blocks[i], 1);
564 return ret;
565}
566
567/**
568 * ext4_alloc_branch - allocate and set up a chain of blocks.
569 * @inode: owner
570 * @indirect_blks: number of allocated indirect blocks
571 * @blks: number of allocated direct blocks
572 * @offsets: offsets (in the blocks) to store the pointers to next.
573 * @branch: place to store the chain in.
574 *
575 * This function allocates blocks, zeroes out all but the last one,
576 * links them into chain and (if we are synchronous) writes them to disk.
577 * In other words, it prepares a branch that can be spliced onto the
578 * inode. It stores the information about that chain in the branch[], in
579 * the same format as ext4_get_branch() would do. We are calling it after
580 * we had read the existing part of chain and partial points to the last
581 * triple of that (one with zero ->key). Upon the exit we have the same
582 * picture as after the successful ext4_get_block(), except that in one
583 * place chain is disconnected - *branch->p is still zero (we did not
584 * set the last link), but branch->key contains the number that should
585 * be placed into *branch->p to fill that gap.
586 *
587 * If allocation fails we free all blocks we've allocated (and forget
588 * their buffer_heads) and return the error value the from failed
589 * ext4_alloc_block() (normally -ENOSPC). Otherwise we set the chain
590 * as described above and return 0.
591 */
592static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
593 int indirect_blks, int *blks, ext4_fsblk_t goal,
594 int *offsets, Indirect *branch)
595{
596 int blocksize = inode->i_sb->s_blocksize;
597 int i, n = 0;
598 int err = 0;
599 struct buffer_head *bh;
600 int num;
601 ext4_fsblk_t new_blocks[4];
602 ext4_fsblk_t current_block;
603
604 num = ext4_alloc_blocks(handle, inode, goal, indirect_blks,
605 *blks, new_blocks, &err);
606 if (err)
607 return err;
608
609 branch[0].key = cpu_to_le32(new_blocks[0]);
610 /*
611 * metadata blocks and data blocks are allocated.
612 */
613 for (n = 1; n <= indirect_blks; n++) {
614 /*
615 * Get buffer_head for parent block, zero it out
616 * and set the pointer to new one, then send
617 * parent to disk.
618 */
619 bh = sb_getblk(inode->i_sb, new_blocks[n-1]);
620 branch[n].bh = bh;
621 lock_buffer(bh);
622 BUFFER_TRACE(bh, "call get_create_access");
623 err = ext4_journal_get_create_access(handle, bh);
624 if (err) {
625 unlock_buffer(bh);
626 brelse(bh);
627 goto failed;
628 }
629
630 memset(bh->b_data, 0, blocksize);
631 branch[n].p = (__le32 *) bh->b_data + offsets[n];
632 branch[n].key = cpu_to_le32(new_blocks[n]);
633 *branch[n].p = branch[n].key;
634 if (n == indirect_blks) {
635 current_block = new_blocks[n];
636 /*
637 * End of chain: update the last new metablock of
638 * the chain to point to the newly allocated
639 * data block numbers
640 */
641 for (i = 1; i < num; i++)
642 *(branch[n].p + i) = cpu_to_le32(++current_block);
643 }
644 BUFFER_TRACE(bh, "marking uptodate");
645 set_buffer_uptodate(bh);
646 unlock_buffer(bh);
647
648 BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
649 err = ext4_journal_dirty_metadata(handle, bh);
650 if (err)
651 goto failed;
652 }
653 *blks = num;
654 return err;
655failed:
656 /* Allocation failed, free what we already allocated */
657 for (i = 1; i <= n ; i++) {
658 BUFFER_TRACE(branch[i].bh, "call jbd2_journal_forget");
659 ext4_journal_forget(handle, branch[i].bh);
660 }
661 for (i = 0; i < indirect_blks; i++)
662 ext4_free_blocks(handle, inode, new_blocks[i], 1);
663
664 ext4_free_blocks(handle, inode, new_blocks[i], num);
665
666 return err;
667}
668
669/**
670 * ext4_splice_branch - splice the allocated branch onto inode.
671 * @inode: owner
672 * @block: (logical) number of block we are adding
673 * @chain: chain of indirect blocks (with a missing link - see
674 * ext4_alloc_branch)
675 * @where: location of missing link
676 * @num: number of indirect blocks we are adding
677 * @blks: number of direct blocks we are adding
678 *
679 * This function fills the missing link and does all housekeeping needed in
680 * inode (->i_blocks, etc.). In case of success we end up with the full
681 * chain to new block and return 0.
682 */
683static int ext4_splice_branch(handle_t *handle, struct inode *inode,
684 long block, Indirect *where, int num, int blks)
685{
686 int i;
687 int err = 0;
688 struct ext4_block_alloc_info *block_i;
689 ext4_fsblk_t current_block;
690
691 block_i = EXT4_I(inode)->i_block_alloc_info;
692 /*
693 * If we're splicing into a [td]indirect block (as opposed to the
694 * inode) then we need to get write access to the [td]indirect block
695 * before the splice.
696 */
697 if (where->bh) {
698 BUFFER_TRACE(where->bh, "get_write_access");
699 err = ext4_journal_get_write_access(handle, where->bh);
700 if (err)
701 goto err_out;
702 }
703 /* That's it */
704
705 *where->p = where->key;
706
707 /*
708 * Update the host buffer_head or inode to point to the just-allocated
709 * direct blocks
710 */
711 if (num == 0 && blks > 1) {
712 current_block = le32_to_cpu(where->key) + 1;
713 for (i = 1; i < blks; i++)
714 *(where->p + i) = cpu_to_le32(current_block++);
715 }
716
717 /*
718 * update the most recently allocated logical & physical block
719 * in i_block_alloc_info, to assist in finding the proper goal block for the next
720 * allocation
721 */
722 if (block_i) {
723 block_i->last_alloc_logical_block = block + blks - 1;
724 block_i->last_alloc_physical_block =
725 le32_to_cpu(where[num].key) + blks - 1;
726 }
727
728 /* We are done with atomic stuff, now do the rest of housekeeping */
729
730 inode->i_ctime = CURRENT_TIME_SEC;
731 ext4_mark_inode_dirty(handle, inode);
732
733 /* had we spliced it onto indirect block? */
734 if (where->bh) {
735 /*
736 * If we spliced it onto an indirect block, we haven't
737 * altered the inode. Note however that if it is being spliced
738 * onto an indirect block at the very end of the file (the
739 * file is growing) then we *will* alter the inode to reflect
740 * the new i_size. But that is not done here - it is done in
741 * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode.
742 */
743 jbd_debug(5, "splicing indirect only\n");
744 BUFFER_TRACE(where->bh, "call ext4_journal_dirty_metadata");
745 err = ext4_journal_dirty_metadata(handle, where->bh);
746 if (err)
747 goto err_out;
748 } else {
749 /*
750 * OK, we spliced it into the inode itself on a direct block.
751 * Inode was dirtied above.
752 */
753 jbd_debug(5, "splicing direct\n");
754 }
755 return err;
756
757err_out:
758 for (i = 1; i <= num; i++) {
759 BUFFER_TRACE(where[i].bh, "call jbd2_journal_forget");
760 ext4_journal_forget(handle, where[i].bh);
761 ext4_free_blocks(handle, inode, le32_to_cpu(where[i-1].key), 1);
762 }
763 ext4_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks);
764
765 return err;
766}
767
768/*
769 * Allocation strategy is simple: if we have to allocate something, we will
770 * have to go the whole way to the leaf. So let's do it before attaching anything
771 * to tree, set linkage between the newborn blocks, write them if sync is
772 * required, recheck the path, free and repeat if check fails, otherwise
773 * set the last missing link (that will protect us from any truncate-generated
774 * removals - all blocks on the path are immune now) and possibly force the
775 * write on the parent block.
776 * That has a nice additional property: no special recovery from the failed
777 * allocations is needed - we simply release blocks and do not touch anything
778 * reachable from inode.
779 *
780 * `handle' can be NULL if create == 0.
781 *
782 * The BKL may not be held on entry here. Be sure to take it early.
783 * return > 0, # of blocks mapped or allocated.
784 * return = 0, if plain lookup failed.
785 * return < 0, error case.
786 */
787int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
788 sector_t iblock, unsigned long maxblocks,
789 struct buffer_head *bh_result,
790 int create, int extend_disksize)
791{
792 int err = -EIO;
793 int offsets[4];
794 Indirect chain[4];
795 Indirect *partial;
796 ext4_fsblk_t goal;
797 int indirect_blks;
798 int blocks_to_boundary = 0;
799 int depth;
800 struct ext4_inode_info *ei = EXT4_I(inode);
801 int count = 0;
802 ext4_fsblk_t first_block = 0;
803
804
805 J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL));
806 J_ASSERT(handle != NULL || create == 0);
807 depth = ext4_block_to_path(inode, iblock, offsets, &blocks_to_boundary);
808
809 if (depth == 0)
810 goto out;
811
812 partial = ext4_get_branch(inode, depth, offsets, chain, &err);
813
814 /* Simplest case - block found, no allocation needed */
815 if (!partial) {
816 first_block = le32_to_cpu(chain[depth - 1].key);
817 clear_buffer_new(bh_result);
818 count++;
819 /* map more blocks */
820 while (count < maxblocks && count <= blocks_to_boundary) {
821 ext4_fsblk_t blk;
822
823 if (!verify_chain(chain, partial)) {
824 /*
825 * Indirect block might be removed by
826 * truncate while we were reading it.
827 * Handling of that case: forget what we've
828 * got now. Flag the err as EAGAIN, so it
829 * will reread.
830 */
831 err = -EAGAIN;
832 count = 0;
833 break;
834 }
835 blk = le32_to_cpu(*(chain[depth-1].p + count));
836
837 if (blk == first_block + count)
838 count++;
839 else
840 break;
841 }
842 if (err != -EAGAIN)
843 goto got_it;
844 }
845
846 /* Next simple case - plain lookup or failed read of indirect block */
847 if (!create || err == -EIO)
848 goto cleanup;
849
850 mutex_lock(&ei->truncate_mutex);
851
852 /*
853 * If the indirect block is missing while we are reading
854 * the chain (ext4_get_branch() returns the -EAGAIN err), or
855 * if the chain has been changed after we grab the semaphore,
856 * (either because another process truncated this branch, or
857 * another get_block allocated this branch) re-grab the chain to see if
858 * the requested block has been allocated or not.
859 *
860 * Since we already block the truncate/other get_block
861 * at this point, we will have the current copy of the chain when we
862 * splice the branch into the tree.
863 */
864 if (err == -EAGAIN || !verify_chain(chain, partial)) {
865 while (partial > chain) {
866 brelse(partial->bh);
867 partial--;
868 }
869 partial = ext4_get_branch(inode, depth, offsets, chain, &err);
870 if (!partial) {
871 count++;
872 mutex_unlock(&ei->truncate_mutex);
873 if (err)
874 goto cleanup;
875 clear_buffer_new(bh_result);
876 goto got_it;
877 }
878 }
879
880 /*
881 * Okay, we need to do block allocation. Lazily initialize the block
882 * allocation info here if necessary
883 */
884 if (S_ISREG(inode->i_mode) && (!ei->i_block_alloc_info))
885 ext4_init_block_alloc_info(inode);
886
887 goal = ext4_find_goal(inode, iblock, chain, partial);
888
889 /* the number of blocks need to allocate for [d,t]indirect blocks */
890 indirect_blks = (chain + depth) - partial - 1;
891
892 /*
893 * Next look up the indirect map to count the total number of
894 * direct blocks to allocate for this branch.
895 */
896 count = ext4_blks_to_allocate(partial, indirect_blks,
897 maxblocks, blocks_to_boundary);
898 /*
899 * Block out ext4_truncate while we alter the tree
900 */
901 err = ext4_alloc_branch(handle, inode, indirect_blks, &count, goal,
902 offsets + (partial - chain), partial);
903
904 /*
905 * The ext4_splice_branch call will free and forget any buffers
906 * on the new chain if there is a failure, but that risks using
907 * up transaction credits, especially for bitmaps where the
908 * credits cannot be returned. Can we handle this somehow? We
909 * may need to return -EAGAIN upwards in the worst case. --sct
910 */
911 if (!err)
912 err = ext4_splice_branch(handle, inode, iblock,
913 partial, indirect_blks, count);
914 /*
915 * i_disksize growing is protected by truncate_mutex. Don't forget to
916 * protect it if you're about to implement concurrent
917 * ext4_get_block() -bzzz
918 */
919 if (!err && extend_disksize && inode->i_size > ei->i_disksize)
920 ei->i_disksize = inode->i_size;
921 mutex_unlock(&ei->truncate_mutex);
922 if (err)
923 goto cleanup;
924
925 set_buffer_new(bh_result);
926got_it:
927 map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
928 if (count > blocks_to_boundary)
929 set_buffer_boundary(bh_result);
930 err = count;
931 /* Clean up and exit */
932 partial = chain + depth - 1; /* the whole chain */
933cleanup:
934 while (partial > chain) {
935 BUFFER_TRACE(partial->bh, "call brelse");
936 brelse(partial->bh);
937 partial--;
938 }
939 BUFFER_TRACE(bh_result, "returned");
940out:
941 return err;
942}
943
944#define DIO_CREDITS (EXT4_RESERVE_TRANS_BLOCKS + 32)
945
946static int ext4_get_block(struct inode *inode, sector_t iblock,
947 struct buffer_head *bh_result, int create)
948{
949 handle_t *handle = journal_current_handle();
950 int ret = 0;
951 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
952
953 if (!create)
954 goto get_block; /* A read */
955
956 if (max_blocks == 1)
957 goto get_block; /* A single block get */
958
959 if (handle->h_transaction->t_state == T_LOCKED) {
960 /*
961 * Huge direct-io writes can hold off commits for long
962 * periods of time. Let this commit run.
963 */
964 ext4_journal_stop(handle);
965 handle = ext4_journal_start(inode, DIO_CREDITS);
966 if (IS_ERR(handle))
967 ret = PTR_ERR(handle);
968 goto get_block;
969 }
970
971 if (handle->h_buffer_credits <= EXT4_RESERVE_TRANS_BLOCKS) {
972 /*
973 * Getting low on buffer credits...
974 */
975 ret = ext4_journal_extend(handle, DIO_CREDITS);
976 if (ret > 0) {
977 /*
978 * Couldn't extend the transaction. Start a new one.
979 */
980 ret = ext4_journal_restart(handle, DIO_CREDITS);
981 }
982 }
983
984get_block:
985 if (ret == 0) {
986 ret = ext4_get_blocks_wrap(handle, inode, iblock,
987 max_blocks, bh_result, create, 0);
988 if (ret > 0) {
989 bh_result->b_size = (ret << inode->i_blkbits);
990 ret = 0;
991 }
992 }
993 return ret;
994}
995
996/*
997 * `handle' can be NULL if create is zero
998 */
999struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
1000 long block, int create, int *errp)
1001{
1002 struct buffer_head dummy;
1003 int fatal = 0, err;
1004
1005 J_ASSERT(handle != NULL || create == 0);
1006
1007 dummy.b_state = 0;
1008 dummy.b_blocknr = -1000;
1009 buffer_trace_init(&dummy.b_history);
1010 err = ext4_get_blocks_wrap(handle, inode, block, 1,
1011 &dummy, create, 1);
1012 /*
1013 * ext4_get_blocks_handle() returns the number of blocks
1014 * mapped - 0 in the case of a hole.
1015 */
1016 if (err > 0) {
1017 if (err > 1)
1018 WARN_ON(1);
1019 err = 0;
1020 }
1021 *errp = err;
1022 if (!err && buffer_mapped(&dummy)) {
1023 struct buffer_head *bh;
1024 bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
1025 if (!bh) {
1026 *errp = -EIO;
1027 goto err;
1028 }
1029 if (buffer_new(&dummy)) {
1030 J_ASSERT(create != 0);
1031 J_ASSERT(handle != 0);
1032
1033 /*
1034 * Now that we do not always journal data, we should
1035 * keep in mind whether this should always journal the
1036 * new buffer as metadata. For now, regular file
1037 * writes use ext4_get_block instead, so it's not a
1038 * problem.
1039 */
1040 lock_buffer(bh);
1041 BUFFER_TRACE(bh, "call get_create_access");
1042 fatal = ext4_journal_get_create_access(handle, bh);
1043 if (!fatal && !buffer_uptodate(bh)) {
1044 memset(bh->b_data, 0, inode->i_sb->s_blocksize);
1045 set_buffer_uptodate(bh);
1046 }
1047 unlock_buffer(bh);
1048 BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
1049 err = ext4_journal_dirty_metadata(handle, bh);
1050 if (!fatal)
1051 fatal = err;
1052 } else {
1053 BUFFER_TRACE(bh, "not a new buffer");
1054 }
1055 if (fatal) {
1056 *errp = fatal;
1057 brelse(bh);
1058 bh = NULL;
1059 }
1060 return bh;
1061 }
1062err:
1063 return NULL;
1064}
1065
1066struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
1067 int block, int create, int *err)
1068{
1069 struct buffer_head * bh;
1070
1071 bh = ext4_getblk(handle, inode, block, create, err);
1072 if (!bh)
1073 return bh;
1074 if (buffer_uptodate(bh))
1075 return bh;
1076 ll_rw_block(READ_META, 1, &bh);
1077 wait_on_buffer(bh);
1078 if (buffer_uptodate(bh))
1079 return bh;
1080 put_bh(bh);
1081 *err = -EIO;
1082 return NULL;
1083}
1084
1085static int walk_page_buffers( handle_t *handle,
1086 struct buffer_head *head,
1087 unsigned from,
1088 unsigned to,
1089 int *partial,
1090 int (*fn)( handle_t *handle,
1091 struct buffer_head *bh))
1092{
1093 struct buffer_head *bh;
1094 unsigned block_start, block_end;
1095 unsigned blocksize = head->b_size;
1096 int err, ret = 0;
1097 struct buffer_head *next;
1098
1099 for ( bh = head, block_start = 0;
1100 ret == 0 && (bh != head || !block_start);
1101 block_start = block_end, bh = next)
1102 {
1103 next = bh->b_this_page;
1104 block_end = block_start + blocksize;
1105 if (block_end <= from || block_start >= to) {
1106 if (partial && !buffer_uptodate(bh))
1107 *partial = 1;
1108 continue;
1109 }
1110 err = (*fn)(handle, bh);
1111 if (!ret)
1112 ret = err;
1113 }
1114 return ret;
1115}
1116
1117/*
1118 * To preserve ordering, it is essential that the hole instantiation and
1119 * the data write be encapsulated in a single transaction. We cannot
1120 * close off a transaction and start a new one between the ext4_get_block()
1121 * and the commit_write(). So doing the jbd2_journal_start at the start of
1122 * prepare_write() is the right place.
1123 *
1124 * Also, this function can nest inside ext4_writepage() ->
1125 * block_write_full_page(). In that case, we *know* that ext4_writepage()
1126 * has generated enough buffer credits to do the whole page. So we won't
1127 * block on the journal in that case, which is good, because the caller may
1128 * be PF_MEMALLOC.
1129 *
1130 * By accident, ext4 can be reentered when a transaction is open via
1131 * quota file writes. If we were to commit the transaction while thus
1132 * reentered, there can be a deadlock - we would be holding a quota
1133 * lock, and the commit would never complete if another thread had a
1134 * transaction open and was blocking on the quota lock - a ranking
1135 * violation.
1136 *
1137 * So what we do is to rely on the fact that jbd2_journal_stop/journal_start
1138 * will _not_ run commit under these circumstances because handle->h_ref
1139 * is elevated. We'll still have enough credits for the tiny quotafile
1140 * write.
1141 */
1142static int do_journal_get_write_access(handle_t *handle,
1143 struct buffer_head *bh)
1144{
1145 if (!buffer_mapped(bh) || buffer_freed(bh))
1146 return 0;
1147 return ext4_journal_get_write_access(handle, bh);
1148}
1149
1150static int ext4_prepare_write(struct file *file, struct page *page,
1151 unsigned from, unsigned to)
1152{
1153 struct inode *inode = page->mapping->host;
1154 int ret, needed_blocks = ext4_writepage_trans_blocks(inode);
1155 handle_t *handle;
1156 int retries = 0;
1157
1158retry:
1159 handle = ext4_journal_start(inode, needed_blocks);
1160 if (IS_ERR(handle)) {
1161 ret = PTR_ERR(handle);
1162 goto out;
1163 }
1164 if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
1165 ret = nobh_prepare_write(page, from, to, ext4_get_block);
1166 else
1167 ret = block_prepare_write(page, from, to, ext4_get_block);
1168 if (ret)
1169 goto prepare_write_failed;
1170
1171 if (ext4_should_journal_data(inode)) {
1172 ret = walk_page_buffers(handle, page_buffers(page),
1173 from, to, NULL, do_journal_get_write_access);
1174 }
1175prepare_write_failed:
1176 if (ret)
1177 ext4_journal_stop(handle);
1178 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
1179 goto retry;
1180out:
1181 return ret;
1182}
1183
1184int ext4_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
1185{
1186 int err = jbd2_journal_dirty_data(handle, bh);
1187 if (err)
1188 ext4_journal_abort_handle(__FUNCTION__, __FUNCTION__,
1189 bh, handle, err);
1190 return err;
1191}
1192
1193/* For commit_write() in data=journal mode */
1194static int commit_write_fn(handle_t *handle, struct buffer_head *bh)
1195{
1196 if (!buffer_mapped(bh) || buffer_freed(bh))
1197 return 0;
1198 set_buffer_uptodate(bh);
1199 return ext4_journal_dirty_metadata(handle, bh);
1200}
1201
1202/*
1203 * We need to pick up the new inode size which generic_commit_write gave us.
1204 * `file' can be NULL - eg, when called from page_symlink().
1205 *
1206 * ext4 never places buffers on inode->i_mapping->private_list. Metadata
1207 * buffers are managed internally.
1208 */
1209static int ext4_ordered_commit_write(struct file *file, struct page *page,
1210 unsigned from, unsigned to)
1211{
1212 handle_t *handle = ext4_journal_current_handle();
1213 struct inode *inode = page->mapping->host;
1214 int ret = 0, ret2;
1215
1216 ret = walk_page_buffers(handle, page_buffers(page),
1217 from, to, NULL, ext4_journal_dirty_data);
1218
1219 if (ret == 0) {
1220 /*
1221 * generic_commit_write() will run mark_inode_dirty() if i_size
1222 * changes. So let's piggyback the i_disksize mark_inode_dirty
1223 * into that.
1224 */
1225 loff_t new_i_size;
1226
1227 new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
1228 if (new_i_size > EXT4_I(inode)->i_disksize)
1229 EXT4_I(inode)->i_disksize = new_i_size;
1230 ret = generic_commit_write(file, page, from, to);
1231 }
1232 ret2 = ext4_journal_stop(handle);
1233 if (!ret)
1234 ret = ret2;
1235 return ret;
1236}
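/*
 * Worked example (illustrative numbers): for page->index == 3 with 4k
 * pages (PAGE_CACHE_SHIFT == 12) and to == 100, new_i_size above is
 * (3 << 12) + 100 == 12388, i.e. the write ends 100 bytes into the
 * file's fourth page.
 */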
1237
1238static int ext4_writeback_commit_write(struct file *file, struct page *page,
1239 unsigned from, unsigned to)
1240{
1241 handle_t *handle = ext4_journal_current_handle();
1242 struct inode *inode = page->mapping->host;
1243 int ret = 0, ret2;
1244 loff_t new_i_size;
1245
1246 new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
1247 if (new_i_size > EXT4_I(inode)->i_disksize)
1248 EXT4_I(inode)->i_disksize = new_i_size;
1249
1250 if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
1251 ret = nobh_commit_write(file, page, from, to);
1252 else
1253 ret = generic_commit_write(file, page, from, to);
1254
1255 ret2 = ext4_journal_stop(handle);
1256 if (!ret)
1257 ret = ret2;
1258 return ret;
1259}
1260
1261static int ext4_journalled_commit_write(struct file *file,
1262 struct page *page, unsigned from, unsigned to)
1263{
1264 handle_t *handle = ext4_journal_current_handle();
1265 struct inode *inode = page->mapping->host;
1266 int ret = 0, ret2;
1267 int partial = 0;
1268 loff_t pos;
1269
1270 /*
1271 * Here we duplicate the generic_commit_write() functionality
1272 */
1273 pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
1274
1275 ret = walk_page_buffers(handle, page_buffers(page), from,
1276 to, &partial, commit_write_fn);
1277 if (!partial)
1278 SetPageUptodate(page);
1279 if (pos > inode->i_size)
1280 i_size_write(inode, pos);
1281 EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
1282 if (inode->i_size > EXT4_I(inode)->i_disksize) {
1283 EXT4_I(inode)->i_disksize = inode->i_size;
1284 ret2 = ext4_mark_inode_dirty(handle, inode);
1285 if (!ret)
1286 ret = ret2;
1287 }
1288 ret2 = ext4_journal_stop(handle);
1289 if (!ret)
1290 ret = ret2;
1291 return ret;
1292}
1293
1294/*
1295 * bmap() is special. It gets used by applications such as lilo and by
1296 * the swapper to find the on-disk block of a specific piece of data.
1297 *
1298 * Naturally, this is dangerous if the block concerned is still in the
1299 * journal. If somebody makes a swapfile on an ext4 data-journaling
1300 * filesystem and enables swap, then they may get a nasty shock when the
1301 * data getting swapped to that swapfile suddenly gets overwritten by
1302 * the original zeros written out previously to the journal and
1303 * awaiting writeback in the kernel's buffer cache.
1304 *
1305 * So, if we see any bmap calls here on a modified, data-journaled file,
1306 * take extra steps to flush any blocks which might be in the cache.
1307 */
1308static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
1309{
1310 struct inode *inode = mapping->host;
1311 journal_t *journal;
1312 int err;
1313
1314 if (EXT4_I(inode)->i_state & EXT4_STATE_JDATA) {
1315 /*
1316 * This is a REALLY heavyweight approach, but the use of
1317 * bmap on dirty files is expected to be extremely rare:
1318 * only if we run lilo or swapon on a freshly made file
1319 * do we expect this to happen.
1320 *
1321 * (bmap requires CAP_SYS_RAWIO so this does not
1322 * represent an unprivileged user DOS attack --- we'd be
1323 * in trouble if mortal users could trigger this path at
1324 * will.)
1325 *
1326 * NB. EXT4_STATE_JDATA is not set on files other than
1327 * regular files. If somebody wants to bmap a directory
1328 * or symlink and gets confused because the buffer
1329 * hasn't yet been flushed to disk, they deserve
1330 * everything they get.
1331 */
1332
1333 EXT4_I(inode)->i_state &= ~EXT4_STATE_JDATA;
1334 journal = EXT4_JOURNAL(inode);
1335 jbd2_journal_lock_updates(journal);
1336 err = jbd2_journal_flush(journal);
1337 jbd2_journal_unlock_updates(journal);
1338
1339 if (err)
1340 return 0;
1341 }
1342
1343	return generic_block_bmap(mapping, block, ext4_get_block);
1344}
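/*
 * Illustrative userspace sketch (assumption: a tool built against
 * <linux/fs.h>): this is how lilo-style programs end up in ext4_bmap(),
 * via the FIBMAP ioctl, which needs CAP_SYS_RAWIO as noted above.
 */
#if 0
#include <sys/ioctl.h>
#include <linux/fs.h>

static long fibmap_block(int fd, int logical_block)
{
	int blk = logical_block;	/* in: logical block, out: physical */

	if (ioctl(fd, FIBMAP, &blk) < 0)
		return -1;
	return blk;
}
#endif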
1345
1346static int bget_one(handle_t *handle, struct buffer_head *bh)
1347{
1348 get_bh(bh);
1349 return 0;
1350}
1351
1352static int bput_one(handle_t *handle, struct buffer_head *bh)
1353{
1354 put_bh(bh);
1355 return 0;
1356}
1357
1358static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
1359{
1360 if (buffer_mapped(bh))
1361 return ext4_journal_dirty_data(handle, bh);
1362 return 0;
1363}
1364
1365/*
1366 * Note that we always start a transaction even if we're not journalling
1367 * data. This is to preserve ordering: any hole instantiation within
1368 * __block_write_full_page -> ext4_get_block() should be journalled
1369 * along with the data so we don't crash and then get metadata which
1370 * refers to old data.
1371 *
1372 * In all journalling modes block_write_full_page() will start the I/O.
1373 *
1374 * Problem:
1375 *
1376 * ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
1377 * ext4_writepage()
1378 *
1379 * Similar for:
1380 *
1381 * ext4_file_write() -> generic_file_write() -> __alloc_pages() -> ...
1382 *
1383 * Same applies to ext4_get_block(). We will deadlock on various things like
1384 * lock_journal and i_truncate_mutex.
1385 *
1386 * Setting PF_MEMALLOC here doesn't work - too many internal memory
1387 * allocations fail.
1388 *
1389 * 16May01: If we're reentered then journal_current_handle() will be
1390 * non-zero. We simply *return*.
1391 *
1392 * 1 July 2001: @@@ FIXME:
1393 * In journalled data mode, a data buffer may be metadata against the
1394 * current transaction. But the same file is part of a shared mapping
1395 * and someone does a writepage() on it.
1396 *
1397 * We will move the buffer onto the async_data list, but *after* it has
1398 * been dirtied. So there's a small window where we have dirty data on
1399 * BJ_Metadata.
1400 *
1401 * Note that this only applies to the last partial page in the file: the
1402 * bit which block_write_full_page() uses prepare/commit for.  (That's
1403 * broken code anyway: it's wrong for msync().)
1404 *
1405 * It's a rare case: it affects the final partial page, for journalled data
1406 * where the file is subject to both write() and writepage() in the same
1407 * transaction.  To fix it we'll need a custom block_write_full_page().
1408 * We'll probably need that anyway for journalling writepage() output.
1409 *
1410 * We don't honour synchronous mounts for writepage(). That would be
1411 * disastrous. Any write() or metadata operation will sync the fs for
1412 * us.
1413 *
1414 * AKPM2: if all the page's buffers are mapped to disk and !data=journal,
1415 * we don't need to open a transaction here.
1416 */
1417static int ext4_ordered_writepage(struct page *page,
1418 struct writeback_control *wbc)
1419{
1420 struct inode *inode = page->mapping->host;
1421 struct buffer_head *page_bufs;
1422 handle_t *handle = NULL;
1423 int ret = 0;
1424 int err;
1425
1426 J_ASSERT(PageLocked(page));
1427
1428 /*
1429 * We give up here if we're reentered, because it might be for a
1430 * different filesystem.
1431 */
1432 if (ext4_journal_current_handle())
1433 goto out_fail;
1434
1435 handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
1436
1437 if (IS_ERR(handle)) {
1438 ret = PTR_ERR(handle);
1439 goto out_fail;
1440 }
1441
1442 if (!page_has_buffers(page)) {
1443 create_empty_buffers(page, inode->i_sb->s_blocksize,
1444 (1 << BH_Dirty)|(1 << BH_Uptodate));
1445 }
1446 page_bufs = page_buffers(page);
1447 walk_page_buffers(handle, page_bufs, 0,
1448 PAGE_CACHE_SIZE, NULL, bget_one);
1449
1450 ret = block_write_full_page(page, ext4_get_block, wbc);
1451
1452 /*
1453 * The page can become unlocked at any point now, and
1454 * truncate can then come in and change things. So we
1455 * can't touch *page from now on. But *page_bufs is
1456 * safe due to elevated refcount.
1457 */
1458
1459 /*
1460 * And attach them to the current transaction. But only if
1461 * block_write_full_page() succeeded. Otherwise they are unmapped,
1462 * and generally junk.
1463 */
1464 if (ret == 0) {
1465 err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE,
1466 NULL, jbd2_journal_dirty_data_fn);
1467 if (!ret)
1468 ret = err;
1469 }
1470 walk_page_buffers(handle, page_bufs, 0,
1471 PAGE_CACHE_SIZE, NULL, bput_one);
1472 err = ext4_journal_stop(handle);
1473 if (!ret)
1474 ret = err;
1475 return ret;
1476
1477out_fail:
1478 redirty_page_for_writepage(wbc, page);
1479 unlock_page(page);
1480 return ret;
1481}
1482
1483static int ext4_writeback_writepage(struct page *page,
1484 struct writeback_control *wbc)
1485{
1486 struct inode *inode = page->mapping->host;
1487 handle_t *handle = NULL;
1488 int ret = 0;
1489 int err;
1490
1491 if (ext4_journal_current_handle())
1492 goto out_fail;
1493
1494 handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
1495 if (IS_ERR(handle)) {
1496 ret = PTR_ERR(handle);
1497 goto out_fail;
1498 }
1499
1500 if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
1501 ret = nobh_writepage(page, ext4_get_block, wbc);
1502 else
1503 ret = block_write_full_page(page, ext4_get_block, wbc);
1504
1505 err = ext4_journal_stop(handle);
1506 if (!ret)
1507 ret = err;
1508 return ret;
1509
1510out_fail:
1511 redirty_page_for_writepage(wbc, page);
1512 unlock_page(page);
1513 return ret;
1514}
1515
1516static int ext4_journalled_writepage(struct page *page,
1517 struct writeback_control *wbc)
1518{
1519 struct inode *inode = page->mapping->host;
1520 handle_t *handle = NULL;
1521 int ret = 0;
1522 int err;
1523
1524 if (ext4_journal_current_handle())
1525 goto no_write;
1526
1527 handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
1528 if (IS_ERR(handle)) {
1529 ret = PTR_ERR(handle);
1530 goto no_write;
1531 }
1532
1533 if (!page_has_buffers(page) || PageChecked(page)) {
1534 /*
1535 * It's mmapped pagecache. Add buffers and journal it. There
1536 * doesn't seem much point in redirtying the page here.
1537 */
1538 ClearPageChecked(page);
1539 ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
1540 ext4_get_block);
1541 if (ret != 0) {
1542 ext4_journal_stop(handle);
1543 goto out_unlock;
1544 }
1545 ret = walk_page_buffers(handle, page_buffers(page), 0,
1546 PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
1547
1548 err = walk_page_buffers(handle, page_buffers(page), 0,
1549 PAGE_CACHE_SIZE, NULL, commit_write_fn);
1550 if (ret == 0)
1551 ret = err;
1552 EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
1553 unlock_page(page);
1554 } else {
1555 /*
1556 * It may be a page full of checkpoint-mode buffers. We don't
1557 * really know unless we go poke around in the buffer_heads.
1558 * But block_write_full_page will do the right thing.
1559 */
1560 ret = block_write_full_page(page, ext4_get_block, wbc);
1561 }
1562 err = ext4_journal_stop(handle);
1563 if (!ret)
1564 ret = err;
1565out:
1566 return ret;
1567
1568no_write:
1569 redirty_page_for_writepage(wbc, page);
1570out_unlock:
1571 unlock_page(page);
1572 goto out;
1573}
1574
1575static int ext4_readpage(struct file *file, struct page *page)
1576{
1577 return mpage_readpage(page, ext4_get_block);
1578}
1579
1580static int
1581ext4_readpages(struct file *file, struct address_space *mapping,
1582 struct list_head *pages, unsigned nr_pages)
1583{
1584 return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);
1585}
1586
1587static void ext4_invalidatepage(struct page *page, unsigned long offset)
1588{
1589 journal_t *journal = EXT4_JOURNAL(page->mapping->host);
1590
1591 /*
1592 * If it's a full truncate we just forget about the pending dirtying
1593 */
1594 if (offset == 0)
1595 ClearPageChecked(page);
1596
1597 jbd2_journal_invalidatepage(journal, page, offset);
1598}
1599
1600static int ext4_releasepage(struct page *page, gfp_t wait)
1601{
1602 journal_t *journal = EXT4_JOURNAL(page->mapping->host);
1603
1604 WARN_ON(PageChecked(page));
1605 if (!page_has_buffers(page))
1606 return 0;
1607 return jbd2_journal_try_to_free_buffers(journal, page, wait);
1608}
1609
1610/*
1611 * If the O_DIRECT write will extend the file then add this inode to the
1612 * orphan list. So recovery will truncate it back to the original size
1613 * if the machine crashes during the write.
1614 *
1615 * If the O_DIRECT write is instantiating holes inside i_size and the machine
1616 * crashes then stale disk data _may_ be exposed inside the file.
1617 */
1618static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
1619 const struct iovec *iov, loff_t offset,
1620 unsigned long nr_segs)
1621{
1622 struct file *file = iocb->ki_filp;
1623 struct inode *inode = file->f_mapping->host;
1624 struct ext4_inode_info *ei = EXT4_I(inode);
1625 handle_t *handle = NULL;
1626 ssize_t ret;
1627 int orphan = 0;
1628 size_t count = iov_length(iov, nr_segs);
1629
1630 if (rw == WRITE) {
1631 loff_t final_size = offset + count;
1632
1633 handle = ext4_journal_start(inode, DIO_CREDITS);
1634 if (IS_ERR(handle)) {
1635 ret = PTR_ERR(handle);
1636 goto out;
1637 }
1638 if (final_size > inode->i_size) {
1639 ret = ext4_orphan_add(handle, inode);
1640 if (ret)
1641 goto out_stop;
1642 orphan = 1;
1643 ei->i_disksize = inode->i_size;
1644 }
1645 }
1646
1647 ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
1648 offset, nr_segs,
1649 ext4_get_block, NULL);
1650
1651 /*
1652 * Reacquire the handle: ext4_get_block() can restart the transaction
1653 */
1654 handle = journal_current_handle();
1655
1656out_stop:
1657 if (handle) {
1658 int err;
1659
1660 if (orphan && inode->i_nlink)
1661 ext4_orphan_del(handle, inode);
1662 if (orphan && ret > 0) {
1663 loff_t end = offset + ret;
1664 if (end > inode->i_size) {
1665 ei->i_disksize = end;
1666 i_size_write(inode, end);
1667 /*
1668 * We're going to return a positive `ret'
1669 * here due to non-zero-length I/O, so there's
1670 * no way of reporting error returns from
1671 * ext4_mark_inode_dirty() to userspace. So
1672 * ignore it.
1673 */
1674 ext4_mark_inode_dirty(handle, inode);
1675 }
1676 }
1677 err = ext4_journal_stop(handle);
1678 if (ret == 0)
1679 ret = err;
1680 }
1681out:
1682 return ret;
1683}
1684
1685/*
1686 * Pages can be marked dirty completely asynchronously from ext4's journalling
1687 * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do
1688 * much here because ->set_page_dirty is called under VFS locks. The page is
1689 * not necessarily locked.
1690 *
1691 * We cannot just dirty the page and leave attached buffers clean, because the
1692 * buffers' dirty state is "definitive". We cannot just set the buffers dirty
1693 * or jbddirty because all the journalling code will explode.
1694 *
1695 * So what we do is to mark the page "pending dirty" and next time writepage
1696 * is called, propagate that into the buffers appropriately.
1697 */
1698static int ext4_journalled_set_page_dirty(struct page *page)
1699{
1700 SetPageChecked(page);
1701 return __set_page_dirty_nobuffers(page);
1702}
1703
1704static const struct address_space_operations ext4_ordered_aops = {
1705 .readpage = ext4_readpage,
1706 .readpages = ext4_readpages,
1707 .writepage = ext4_ordered_writepage,
1708 .sync_page = block_sync_page,
1709 .prepare_write = ext4_prepare_write,
1710 .commit_write = ext4_ordered_commit_write,
1711 .bmap = ext4_bmap,
1712 .invalidatepage = ext4_invalidatepage,
1713 .releasepage = ext4_releasepage,
1714 .direct_IO = ext4_direct_IO,
1715 .migratepage = buffer_migrate_page,
1716};
1717
1718static const struct address_space_operations ext4_writeback_aops = {
1719 .readpage = ext4_readpage,
1720 .readpages = ext4_readpages,
1721 .writepage = ext4_writeback_writepage,
1722 .sync_page = block_sync_page,
1723 .prepare_write = ext4_prepare_write,
1724 .commit_write = ext4_writeback_commit_write,
1725 .bmap = ext4_bmap,
1726 .invalidatepage = ext4_invalidatepage,
1727 .releasepage = ext4_releasepage,
1728 .direct_IO = ext4_direct_IO,
1729 .migratepage = buffer_migrate_page,
1730};
1731
1732static const struct address_space_operations ext4_journalled_aops = {
1733 .readpage = ext4_readpage,
1734 .readpages = ext4_readpages,
1735 .writepage = ext4_journalled_writepage,
1736 .sync_page = block_sync_page,
1737 .prepare_write = ext4_prepare_write,
1738 .commit_write = ext4_journalled_commit_write,
1739 .set_page_dirty = ext4_journalled_set_page_dirty,
1740 .bmap = ext4_bmap,
1741 .invalidatepage = ext4_invalidatepage,
1742 .releasepage = ext4_releasepage,
1743};
1744
1745void ext4_set_aops(struct inode *inode)
1746{
1747 if (ext4_should_order_data(inode))
1748 inode->i_mapping->a_ops = &ext4_ordered_aops;
1749 else if (ext4_should_writeback_data(inode))
1750 inode->i_mapping->a_ops = &ext4_writeback_aops;
1751 else
1752 inode->i_mapping->a_ops = &ext4_journalled_aops;
1753}
1754
1755/*
1756 * ext4_block_truncate_page() zeroes out a mapping from file offset `from'
1757 * up to the end of the block which corresponds to `from'.
1758 * This is required during truncate.  We need to physically zero the tail end
1759 * of that block so it doesn't yield old data if the file is later grown.
1760 */
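/*
 * Worked example (illustrative numbers): with a 4096-byte page and
 * 1024-byte blocks, from = 6500 gives offset = 6500 & 4095 = 2404, so the
 * buffer search below lands on the third block in the page (pos steps
 * through 1024, 2048, 3072) and length = 1024 - (2404 & 1023) = 668 bytes
 * of tail get zeroed.
 */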
1761int ext4_block_truncate_page(handle_t *handle, struct page *page,
1762 struct address_space *mapping, loff_t from)
1763{
1764 ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
1765 unsigned offset = from & (PAGE_CACHE_SIZE-1);
1766 unsigned blocksize, iblock, length, pos;
1767 struct inode *inode = mapping->host;
1768 struct buffer_head *bh;
1769 int err = 0;
1770 void *kaddr;
1771
1772 blocksize = inode->i_sb->s_blocksize;
1773 length = blocksize - (offset & (blocksize - 1));
1774 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
1775
1776 /*
1777 * For "nobh" option, we can only work if we don't need to
1778 * read-in the page - otherwise we create buffers to do the IO.
1779 */
1780 if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH) &&
1781 ext4_should_writeback_data(inode) && PageUptodate(page)) {
1782 kaddr = kmap_atomic(page, KM_USER0);
1783 memset(kaddr + offset, 0, length);
1784 flush_dcache_page(page);
1785 kunmap_atomic(kaddr, KM_USER0);
1786 set_page_dirty(page);
1787 goto unlock;
1788 }
1789
1790 if (!page_has_buffers(page))
1791 create_empty_buffers(page, blocksize, 0);
1792
1793 /* Find the buffer that contains "offset" */
1794 bh = page_buffers(page);
1795 pos = blocksize;
1796 while (offset >= pos) {
1797 bh = bh->b_this_page;
1798 iblock++;
1799 pos += blocksize;
1800 }
1801
1802 err = 0;
1803 if (buffer_freed(bh)) {
1804 BUFFER_TRACE(bh, "freed: skip");
1805 goto unlock;
1806 }
1807
1808 if (!buffer_mapped(bh)) {
1809 BUFFER_TRACE(bh, "unmapped");
1810 ext4_get_block(inode, iblock, bh, 0);
1811 /* unmapped? It's a hole - nothing to do */
1812 if (!buffer_mapped(bh)) {
1813 BUFFER_TRACE(bh, "still unmapped");
1814 goto unlock;
1815 }
1816 }
1817
1818 /* Ok, it's mapped. Make sure it's up-to-date */
1819 if (PageUptodate(page))
1820 set_buffer_uptodate(bh);
1821
1822 if (!buffer_uptodate(bh)) {
1823 err = -EIO;
1824 ll_rw_block(READ, 1, &bh);
1825 wait_on_buffer(bh);
1826 /* Uhhuh. Read error. Complain and punt. */
1827 if (!buffer_uptodate(bh))
1828 goto unlock;
1829 }
1830
1831 if (ext4_should_journal_data(inode)) {
1832 BUFFER_TRACE(bh, "get write access");
1833 err = ext4_journal_get_write_access(handle, bh);
1834 if (err)
1835 goto unlock;
1836 }
1837
1838 kaddr = kmap_atomic(page, KM_USER0);
1839 memset(kaddr + offset, 0, length);
1840 flush_dcache_page(page);
1841 kunmap_atomic(kaddr, KM_USER0);
1842
1843 BUFFER_TRACE(bh, "zeroed end of block");
1844
1845 err = 0;
1846 if (ext4_should_journal_data(inode)) {
1847 err = ext4_journal_dirty_metadata(handle, bh);
1848 } else {
1849 if (ext4_should_order_data(inode))
1850 err = ext4_journal_dirty_data(handle, bh);
1851 mark_buffer_dirty(bh);
1852 }
1853
1854unlock:
1855 unlock_page(page);
1856 page_cache_release(page);
1857 return err;
1858}
1859
1860/*
1861 * Probably it should be a library function... search for first non-zero word
1862 * or memcmp with zero_page, whatever is better for particular architecture.
1863 * Linus?
1864 */
1865static inline int all_zeroes(__le32 *p, __le32 *q)
1866{
1867 while (p < q)
1868 if (*p++)
1869 return 0;
1870 return 1;
1871}
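/*
 * A memcmp-based variant, as the comment above muses (sketch only; assumes
 * the span fits in one page and that the architecture exports
 * empty_zero_page):
 */
#if 0
static inline int all_zeroes_memcmp(__le32 *p, __le32 *q)
{
	return !memcmp(p, empty_zero_page, (char *)q - (char *)p);
}
#endif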
1872
1873/**
1874 * ext4_find_shared - find the indirect blocks for partial truncation.
1875 * @inode: inode in question
1876 * @depth: depth of the affected branch
1877 * @offsets: offsets of pointers in that branch (see ext4_block_to_path)
1878 * @chain: place to store the pointers to partial indirect blocks
1879 * @top: place to the (detached) top of branch
1880 *
1881 * This is a helper function used by ext4_truncate().
1882 *
1883 * When we do truncate() we may have to clean the ends of several
1884 * indirect blocks but leave the blocks themselves alive. A block is
1885 * partially truncated if some data below the new i_size is referenced
1886 * from it (and it is on the path to the first completely truncated
1887 * data block, indeed). We have to free the top of that path along
1888 * with everything to the right of the path. Since no allocation
1889 * past the truncation point is possible until ext4_truncate()
1890 * finishes, we may safely do the latter, but top of branch may
1891 * require special attention - pageout below the truncation point
1892 * might try to populate it.
1893 *
1894 * We atomically detach the top of branch from the tree, store the
1895 * block number of its root in *@top, pointers to buffer_heads of
1896 * partially truncated blocks - in @chain[].bh and pointers to
1897 * their last elements that should not be removed - in
1898 * @chain[].p. Return value is the pointer to last filled element
1899 * of @chain.
1900 *
1901 * The work left to the caller is the actual freeing of subtrees:
1902 * a) free the subtree starting from *@top
1903 * b) free the subtrees whose roots are stored in
1904 * (@chain[i].p+1 .. end of @chain[i].bh->b_data)
1905 * c) free the subtrees growing from the inode past the @chain[0].
1906 * (no partially truncated stuff there). */
1907
1908static Indirect *ext4_find_shared(struct inode *inode, int depth,
1909 int offsets[4], Indirect chain[4], __le32 *top)
1910{
1911 Indirect *partial, *p;
1912 int k, err;
1913
1914 *top = 0;
1915	/* Make k index the deepest non-null offset + 1 */
1916 for (k = depth; k > 1 && !offsets[k-1]; k--)
1917 ;
1918 partial = ext4_get_branch(inode, k, offsets, chain, &err);
1919 /* Writer: pointers */
1920 if (!partial)
1921 partial = chain + k-1;
1922 /*
1923	 * If the branch acquired a continuation since we last looked at it,
1924	 * fine: it should all survive and the (new) top doesn't belong to us.
1925 */
1926 if (!partial->key && *partial->p)
1927 /* Writer: end */
1928 goto no_top;
1929 for (p=partial; p>chain && all_zeroes((__le32*)p->bh->b_data,p->p); p--)
1930 ;
1931 /*
1932 * OK, we've found the last block that must survive. The rest of our
1933 * branch should be detached before unlocking. However, if that rest
1934 * of branch is all ours and does not grow immediately from the inode
1935 * it's easier to cheat and just decrement partial->p.
1936 */
1937 if (p == chain + k - 1 && p > chain) {
1938 p->p--;
1939 } else {
1940 *top = *p->p;
1941 /* Nope, don't do this in ext4. Must leave the tree intact */
1942#if 0
1943 *p->p = 0;
1944#endif
1945 }
1946 /* Writer: end */
1947
1948	while (partial > p) {
1949 brelse(partial->bh);
1950 partial--;
1951 }
1952no_top:
1953 return partial;
1954}
1955
1956/*
1957 * Zero a number of block pointers in either an inode or an indirect block.
1958 * If we restart the transaction we must again get write access to the
1959 * indirect block for further modification.
1960 *
1961 * We release `count' blocks on disk, but (last - first) may be greater
1962 * than `count' because there can be holes in there.
1963 */
1964static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
1965 struct buffer_head *bh, ext4_fsblk_t block_to_free,
1966 unsigned long count, __le32 *first, __le32 *last)
1967{
1968 __le32 *p;
1969 if (try_to_extend_transaction(handle, inode)) {
1970 if (bh) {
1971 BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
1972 ext4_journal_dirty_metadata(handle, bh);
1973 }
1974 ext4_mark_inode_dirty(handle, inode);
1975 ext4_journal_test_restart(handle, inode);
1976 if (bh) {
1977 BUFFER_TRACE(bh, "retaking write access");
1978 ext4_journal_get_write_access(handle, bh);
1979 }
1980 }
1981
1982 /*
1983 * Any buffers which are on the journal will be in memory. We find
1984 * them on the hash table so jbd2_journal_revoke() will run jbd2_journal_forget()
1985 * on them. We've already detached each block from the file, so
1986 * bforget() in jbd2_journal_forget() should be safe.
1987 *
1988 * AKPM: turn on bforget in jbd2_journal_forget()!!!
1989 */
1990 for (p = first; p < last; p++) {
1991 u32 nr = le32_to_cpu(*p);
1992 if (nr) {
1993 struct buffer_head *bh;
1994
1995 *p = 0;
1996 bh = sb_find_get_block(inode->i_sb, nr);
1997 ext4_forget(handle, 0, inode, bh, nr);
1998 }
1999 }
2000
2001 ext4_free_blocks(handle, inode, block_to_free, count);
2002}
2003
2004/**
2005 * ext4_free_data - free a list of data blocks
2006 * @handle: handle for this transaction
2007 * @inode: inode we are dealing with
2008 * @this_bh: indirect buffer_head which contains *@first and *@last
2009 * @first: array of block numbers
2010 * @last: points immediately past the end of array
2011 *
2012 * We are freeing all blocks referenced from that array (numbers are stored as
2013 * little-endian 32-bit) and updating @inode->i_blocks appropriately.
2014 *
2015 * We accumulate contiguous runs of blocks to free. Conveniently, if these
2016 * blocks are contiguous then releasing them at one time will only affect one
2017 * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't
2018 * actually use a lot of journal space.
2019 *
2020 * @this_bh will be %NULL if @first and @last point into the inode's direct
2021 * block pointers.
2022 */
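/*
 * Example (illustrative): for pointer contents { 100, 101, 102, 0, 250 }
 * the loop below makes two ext4_clear_blocks() calls - one for the run
 * 100..102 (count == 3) and one for the lone block 250.  Zero entries
 * (holes) are skipped and do not by themselves end a run.
 */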
2023static void ext4_free_data(handle_t *handle, struct inode *inode,
2024 struct buffer_head *this_bh,
2025 __le32 *first, __le32 *last)
2026{
2027 ext4_fsblk_t block_to_free = 0; /* Starting block # of a run */
2028 unsigned long count = 0; /* Number of blocks in the run */
2029 __le32 *block_to_free_p = NULL; /* Pointer into inode/ind
2030 corresponding to
2031 block_to_free */
2032 ext4_fsblk_t nr; /* Current block # */
2033 __le32 *p; /* Pointer into inode/ind
2034 for current block */
2035 int err;
2036
2037 if (this_bh) { /* For indirect block */
2038 BUFFER_TRACE(this_bh, "get_write_access");
2039 err = ext4_journal_get_write_access(handle, this_bh);
2040 /* Important: if we can't update the indirect pointers
2041 * to the blocks, we can't free them. */
2042 if (err)
2043 return;
2044 }
2045
2046 for (p = first; p < last; p++) {
2047 nr = le32_to_cpu(*p);
2048 if (nr) {
2049 /* accumulate blocks to free if they're contiguous */
2050 if (count == 0) {
2051 block_to_free = nr;
2052 block_to_free_p = p;
2053 count = 1;
2054 } else if (nr == block_to_free + count) {
2055 count++;
2056 } else {
2057 ext4_clear_blocks(handle, inode, this_bh,
2058 block_to_free,
2059 count, block_to_free_p, p);
2060 block_to_free = nr;
2061 block_to_free_p = p;
2062 count = 1;
2063 }
2064 }
2065 }
2066
2067 if (count > 0)
2068 ext4_clear_blocks(handle, inode, this_bh, block_to_free,
2069 count, block_to_free_p, p);
2070
2071 if (this_bh) {
2072 BUFFER_TRACE(this_bh, "call ext4_journal_dirty_metadata");
2073 ext4_journal_dirty_metadata(handle, this_bh);
2074 }
2075}
2076
2077/**
2078 * ext4_free_branches - free an array of branches
2079 * @handle: JBD handle for this transaction
2080 * @inode: inode we are dealing with
2081 * @parent_bh: the buffer_head which contains *@first and *@last
2082 * @first: array of block numbers
2083 * @last: pointer immediately past the end of array
2084 * @depth: depth of the branches to free
2085 *
2086 * We are freeing all blocks referenced from these branches (numbers are
2087 * stored as little-endian 32-bit) and updating @inode->i_blocks
2088 * appropriately.
2089 */
2090static void ext4_free_branches(handle_t *handle, struct inode *inode,
2091 struct buffer_head *parent_bh,
2092 __le32 *first, __le32 *last, int depth)
2093{
2094 ext4_fsblk_t nr;
2095 __le32 *p;
2096
2097 if (is_handle_aborted(handle))
2098 return;
2099
2100 if (depth--) {
2101 struct buffer_head *bh;
2102 int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
2103 p = last;
2104 while (--p >= first) {
2105 nr = le32_to_cpu(*p);
2106 if (!nr)
2107 continue; /* A hole */
2108
2109 /* Go read the buffer for the next level down */
2110 bh = sb_bread(inode->i_sb, nr);
2111
2112 /*
2113 * A read failure? Report error and clear slot
2114 * (should be rare).
2115 */
2116 if (!bh) {
2117 ext4_error(inode->i_sb, "ext4_free_branches",
2118 "Read failure, inode=%lu, block=%llu",
2119 inode->i_ino, nr);
2120 continue;
2121 }
2122
2123 /* This zaps the entire block. Bottom up. */
2124 BUFFER_TRACE(bh, "free child branches");
2125 ext4_free_branches(handle, inode, bh,
2126 (__le32*)bh->b_data,
2127 (__le32*)bh->b_data + addr_per_block,
2128 depth);
2129
2130 /*
2131 * We've probably journalled the indirect block several
2132 * times during the truncate. But it's no longer
2133 * needed and we now drop it from the transaction via
2134 * jbd2_journal_revoke().
2135 *
2136 * That's easy if it's exclusively part of this
2137 * transaction. But if it's part of the committing
2138 * transaction then jbd2_journal_forget() will simply
2139 * brelse() it. That means that if the underlying
2140 * block is reallocated in ext4_get_block(),
2141 * unmap_underlying_metadata() will find this block
2142 * and will try to get rid of it. damn, damn.
2143 *
2144 * If this block has already been committed to the
2145 * journal, a revoke record will be written. And
2146 * revoke records must be emitted *before* clearing
2147 * this block's bit in the bitmaps.
2148 */
2149 ext4_forget(handle, 1, inode, bh, bh->b_blocknr);
2150
2151 /*
2152			 * Everything below this pointer has been
2153 * released. Now let this top-of-subtree go.
2154 *
2155 * We want the freeing of this indirect block to be
2156 * atomic in the journal with the updating of the
2157 * bitmap block which owns it. So make some room in
2158 * the journal.
2159 *
2160 * We zero the parent pointer *after* freeing its
2161 * pointee in the bitmaps, so if extend_transaction()
2162 * for some reason fails to put the bitmap changes and
2163 * the release into the same transaction, recovery
2164 * will merely complain about releasing a free block,
2165 * rather than leaking blocks.
2166 */
2167 if (is_handle_aborted(handle))
2168 return;
2169 if (try_to_extend_transaction(handle, inode)) {
2170 ext4_mark_inode_dirty(handle, inode);
2171 ext4_journal_test_restart(handle, inode);
2172 }
2173
2174 ext4_free_blocks(handle, inode, nr, 1);
2175
2176 if (parent_bh) {
2177 /*
2178 * The block which we have just freed is
2179 * pointed to by an indirect block: journal it
2180 */
2181 BUFFER_TRACE(parent_bh, "get_write_access");
2182 if (!ext4_journal_get_write_access(handle,
2183 parent_bh)){
2184 *p = 0;
2185 BUFFER_TRACE(parent_bh,
2186 "call ext4_journal_dirty_metadata");
2187 ext4_journal_dirty_metadata(handle,
2188 parent_bh);
2189 }
2190 }
2191 }
2192 } else {
2193 /* We have reached the bottom of the tree. */
2194 BUFFER_TRACE(parent_bh, "free data blocks");
2195 ext4_free_data(handle, inode, parent_bh, first, last);
2196 }
2197}
2198
2199/*
2200 * ext4_truncate()
2201 *
2202 * We block out ext4_get_block() block instantiations across the entire
2203 * transaction, and VFS/VM ensures that ext4_truncate() cannot run
2204 * simultaneously on behalf of the same inode.
2205 *
2206 * As we work through the truncate and commit bits of it to the journal there
2207 * is one core, guiding principle: the file's tree must always be consistent on
2208 * disk. We must be able to restart the truncate after a crash.
2209 *
2210 * The file's tree may be transiently inconsistent in memory (although it
2211 * probably isn't), but whenever we close off and commit a journal transaction,
2212 * the contents of (the filesystem + the journal) must be consistent and
2213 * restartable. It's pretty simple, really: bottom up, right to left (although
2214 * left-to-right works OK too).
2215 *
2216 * Note that at recovery time, journal replay occurs *before* the restart of
2217 * truncate against the orphan inode list.
2218 *
2219 * The committed inode has the new, desired i_size (which is the same as
2220 * i_disksize in this case). After a crash, ext4_orphan_cleanup() will see
2221 * that this inode's truncate did not complete and it will again call
2222 * ext4_truncate() to have another go. So there will be instantiated blocks
2223 * to the right of the truncation point in a crashed ext4 filesystem. But
2224 * that's fine - as long as they are linked from the inode, the post-crash
2225 * ext4_truncate() run will find them and release them.
2226 */
2227void ext4_truncate(struct inode *inode)
2228{
2229 handle_t *handle;
2230 struct ext4_inode_info *ei = EXT4_I(inode);
2231 __le32 *i_data = ei->i_data;
2232 int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
2233 struct address_space *mapping = inode->i_mapping;
2234 int offsets[4];
2235 Indirect chain[4];
2236 Indirect *partial;
2237 __le32 nr = 0;
2238 int n;
2239 long last_block;
2240 unsigned blocksize = inode->i_sb->s_blocksize;
2241 struct page *page;
2242
2243 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
2244 S_ISLNK(inode->i_mode)))
2245 return;
2246 if (ext4_inode_is_fast_symlink(inode))
2247 return;
2248 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
2249 return;
2250
2251 /*
2252 * We have to lock the EOF page here, because lock_page() nests
2253 * outside jbd2_journal_start().
2254 */
2255 if ((inode->i_size & (blocksize - 1)) == 0) {
2256 /* Block boundary? Nothing to do */
2257 page = NULL;
2258 } else {
2259 page = grab_cache_page(mapping,
2260 inode->i_size >> PAGE_CACHE_SHIFT);
2261 if (!page)
2262 return;
2263 }
2264
2265 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
2266 return ext4_ext_truncate(inode, page);
2267
2268 handle = start_transaction(inode);
2269 if (IS_ERR(handle)) {
2270 if (page) {
2271 clear_highpage(page);
2272 flush_dcache_page(page);
2273 unlock_page(page);
2274 page_cache_release(page);
2275 }
2276 return; /* AKPM: return what? */
2277 }
2278
2279 last_block = (inode->i_size + blocksize-1)
2280 >> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
2281
2282 if (page)
2283 ext4_block_truncate_page(handle, page, mapping, inode->i_size);
2284
2285 n = ext4_block_to_path(inode, last_block, offsets, NULL);
2286 if (n == 0)
2287 goto out_stop; /* error */
2288
2289 /*
2290 * OK. This truncate is going to happen. We add the inode to the
2291 * orphan list, so that if this truncate spans multiple transactions,
2292 * and we crash, we will resume the truncate when the filesystem
2293 * recovers. It also marks the inode dirty, to catch the new size.
2294 *
2295 * Implication: the file must always be in a sane, consistent
2296 * truncatable state while each transaction commits.
2297 */
2298 if (ext4_orphan_add(handle, inode))
2299 goto out_stop;
2300
2301 /*
2302 * The orphan list entry will now protect us from any crash which
2303 * occurs before the truncate completes, so it is now safe to propagate
2304 * the new, shorter inode size (held for now in i_size) into the
2305 * on-disk inode. We do this via i_disksize, which is the value which
2306 * ext4 *really* writes onto the disk inode.
2307 */
2308 ei->i_disksize = inode->i_size;
2309
2310 /*
2311 * From here we block out all ext4_get_block() callers who want to
2312 * modify the block allocation tree.
2313 */
2314 mutex_lock(&ei->truncate_mutex);
2315
2316 if (n == 1) { /* direct blocks */
2317 ext4_free_data(handle, inode, NULL, i_data+offsets[0],
2318 i_data + EXT4_NDIR_BLOCKS);
2319 goto do_indirects;
2320 }
2321
2322 partial = ext4_find_shared(inode, n, offsets, chain, &nr);
2323 /* Kill the top of shared branch (not detached) */
2324 if (nr) {
2325 if (partial == chain) {
2326 /* Shared branch grows from the inode */
2327 ext4_free_branches(handle, inode, NULL,
2328 &nr, &nr+1, (chain+n-1) - partial);
2329 *partial->p = 0;
2330 /*
2331 * We mark the inode dirty prior to restart,
2332 * and prior to stop. No need for it here.
2333 */
2334 } else {
2335 /* Shared branch grows from an indirect block */
2336 BUFFER_TRACE(partial->bh, "get_write_access");
2337 ext4_free_branches(handle, inode, partial->bh,
2338 partial->p,
2339 partial->p+1, (chain+n-1) - partial);
2340 }
2341 }
2342 /* Clear the ends of indirect blocks on the shared branch */
2343 while (partial > chain) {
2344 ext4_free_branches(handle, inode, partial->bh, partial->p + 1,
2345 (__le32*)partial->bh->b_data+addr_per_block,
2346 (chain+n-1) - partial);
2347 BUFFER_TRACE(partial->bh, "call brelse");
2348 brelse (partial->bh);
2349 partial--;
2350 }
2351do_indirects:
2352 /* Kill the remaining (whole) subtrees */
2353 switch (offsets[0]) {
2354 default:
2355 nr = i_data[EXT4_IND_BLOCK];
2356 if (nr) {
2357 ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1);
2358 i_data[EXT4_IND_BLOCK] = 0;
2359 }
2360 case EXT4_IND_BLOCK:
2361 nr = i_data[EXT4_DIND_BLOCK];
2362 if (nr) {
2363 ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2);
2364 i_data[EXT4_DIND_BLOCK] = 0;
2365 }
2366 case EXT4_DIND_BLOCK:
2367 nr = i_data[EXT4_TIND_BLOCK];
2368 if (nr) {
2369 ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3);
2370 i_data[EXT4_TIND_BLOCK] = 0;
2371 }
2372 case EXT4_TIND_BLOCK:
2373 ;
2374 }
2375
2376 ext4_discard_reservation(inode);
2377
2378 mutex_unlock(&ei->truncate_mutex);
2379 inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
2380 ext4_mark_inode_dirty(handle, inode);
2381
2382 /*
2383 * In a multi-transaction truncate, we only make the final transaction
2384 * synchronous
2385 */
2386 if (IS_SYNC(inode))
2387 handle->h_sync = 1;
2388out_stop:
2389 /*
2390 * If this was a simple ftruncate(), and the file will remain alive
2391 * then we need to clear up the orphan record which we created above.
2392 * However, if this was a real unlink then we were called by
2393 * ext4_delete_inode(), and we allow that function to clean up the
2394 * orphan info for us.
2395 */
2396 if (inode->i_nlink)
2397 ext4_orphan_del(handle, inode);
2398
2399 ext4_journal_stop(handle);
2400}
2401
2402static ext4_fsblk_t ext4_get_inode_block(struct super_block *sb,
2403 unsigned long ino, struct ext4_iloc *iloc)
2404{
2405 unsigned long desc, group_desc, block_group;
2406 unsigned long offset;
2407 ext4_fsblk_t block;
2408 struct buffer_head *bh;
2409 struct ext4_group_desc * gdp;
2410
2411 if (!ext4_valid_inum(sb, ino)) {
2412 /*
2413 * This error is already checked for in namei.c unless we are
2414 * looking at an NFS filehandle, in which case no error
2415 * report is needed
2416 */
2417 return 0;
2418 }
2419
2420 block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
2421 if (block_group >= EXT4_SB(sb)->s_groups_count) {
2422		ext4_error(sb, "ext4_get_inode_block", "group >= groups count");
2423 return 0;
2424 }
2425 smp_rmb();
2426 group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb);
2427 desc = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1);
2428 bh = EXT4_SB(sb)->s_group_desc[group_desc];
2429 if (!bh) {
2430		ext4_error(sb, "ext4_get_inode_block",
2431 "Descriptor not loaded");
2432 return 0;
2433 }
2434
2435 gdp = (struct ext4_group_desc *)((__u8 *)bh->b_data +
2436 desc * EXT4_DESC_SIZE(sb));
2437 /*
2438 * Figure out the offset within the block group inode table
2439 */
2440 offset = ((ino - 1) % EXT4_INODES_PER_GROUP(sb)) *
2441 EXT4_INODE_SIZE(sb);
2442 block = ext4_inode_table(sb, gdp) +
2443 (offset >> EXT4_BLOCK_SIZE_BITS(sb));
2444
2445 iloc->block_group = block_group;
2446 iloc->offset = offset & (EXT4_BLOCK_SIZE(sb) - 1);
2447 return block;
2448}
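/*
 * Worked example (illustrative numbers): with 8192 inodes per group and
 * 128-byte inodes, ino 16390 gives block_group = 16389 / 8192 = 2 and
 * offset = (16389 % 8192) * 128 = 5 * 128 = 640, so the inode sits 640
 * bytes into group 2's inode table (its first block, for 4k blocks).
 */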
2449
2450/*
2451 * ext4_get_inode_loc returns with an extra refcount against the inode's
2452 * underlying buffer_head on success. If 'in_mem' is true, we have all
2453 * data in memory that is needed to recreate the on-disk version of this
2454 * inode.
2455 */
2456static int __ext4_get_inode_loc(struct inode *inode,
2457 struct ext4_iloc *iloc, int in_mem)
2458{
2459 ext4_fsblk_t block;
2460 struct buffer_head *bh;
2461
2462 block = ext4_get_inode_block(inode->i_sb, inode->i_ino, iloc);
2463 if (!block)
2464 return -EIO;
2465
2466 bh = sb_getblk(inode->i_sb, block);
2467 if (!bh) {
2468		ext4_error(inode->i_sb, "ext4_get_inode_loc",
2469 "unable to read inode block - "
2470 "inode=%lu, block=%llu",
2471 inode->i_ino, block);
2472 return -EIO;
2473 }
2474 if (!buffer_uptodate(bh)) {
2475 lock_buffer(bh);
2476 if (buffer_uptodate(bh)) {
2477 /* someone brought it uptodate while we waited */
2478 unlock_buffer(bh);
2479 goto has_buffer;
2480 }
2481
2482 /*
2483 * If we have all information of the inode in memory and this
2484 * is the only valid inode in the block, we need not read the
2485 * block.
2486 */
2487 if (in_mem) {
2488 struct buffer_head *bitmap_bh;
2489 struct ext4_group_desc *desc;
2490 int inodes_per_buffer;
2491 int inode_offset, i;
2492 int block_group;
2493 int start;
2494
2495 block_group = (inode->i_ino - 1) /
2496 EXT4_INODES_PER_GROUP(inode->i_sb);
2497 inodes_per_buffer = bh->b_size /
2498 EXT4_INODE_SIZE(inode->i_sb);
2499 inode_offset = ((inode->i_ino - 1) %
2500 EXT4_INODES_PER_GROUP(inode->i_sb));
2501 start = inode_offset & ~(inodes_per_buffer - 1);
2502
2503 /* Is the inode bitmap in cache? */
2504 desc = ext4_get_group_desc(inode->i_sb,
2505 block_group, NULL);
2506 if (!desc)
2507 goto make_io;
2508
2509 bitmap_bh = sb_getblk(inode->i_sb,
2510 ext4_inode_bitmap(inode->i_sb, desc));
2511 if (!bitmap_bh)
2512 goto make_io;
2513
2514 /*
2515 * If the inode bitmap isn't in cache then the
2516 * optimisation may end up performing two reads instead
2517 * of one, so skip it.
2518 */
2519 if (!buffer_uptodate(bitmap_bh)) {
2520 brelse(bitmap_bh);
2521 goto make_io;
2522 }
2523 for (i = start; i < start + inodes_per_buffer; i++) {
2524 if (i == inode_offset)
2525 continue;
2526 if (ext4_test_bit(i, bitmap_bh->b_data))
2527 break;
2528 }
2529 brelse(bitmap_bh);
2530 if (i == start + inodes_per_buffer) {
2531 /* all other inodes are free, so skip I/O */
2532 memset(bh->b_data, 0, bh->b_size);
2533 set_buffer_uptodate(bh);
2534 unlock_buffer(bh);
2535 goto has_buffer;
2536 }
2537 }
2538
2539make_io:
2540 /*
2541 * There are other valid inodes in the buffer, this inode
2542 * has in-inode xattrs, or we don't have this inode in memory.
2543 * Read the block from disk.
2544 */
2545 get_bh(bh);
2546 bh->b_end_io = end_buffer_read_sync;
2547 submit_bh(READ_META, bh);
2548 wait_on_buffer(bh);
2549 if (!buffer_uptodate(bh)) {
2550 ext4_error(inode->i_sb, "ext4_get_inode_loc",
2551 "unable to read inode block - "
2552 "inode=%lu, block=%llu",
2553 inode->i_ino, block);
2554 brelse(bh);
2555 return -EIO;
2556 }
2557 }
2558has_buffer:
2559 iloc->bh = bh;
2560 return 0;
2561}
2562
2563int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)
2564{
2565 /* We have all inode data except xattrs in memory here. */
2566 return __ext4_get_inode_loc(inode, iloc,
2567 !(EXT4_I(inode)->i_state & EXT4_STATE_XATTR));
2568}
2569
2570void ext4_set_inode_flags(struct inode *inode)
2571{
2572 unsigned int flags = EXT4_I(inode)->i_flags;
2573
2574 inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
2575 if (flags & EXT4_SYNC_FL)
2576 inode->i_flags |= S_SYNC;
2577 if (flags & EXT4_APPEND_FL)
2578 inode->i_flags |= S_APPEND;
2579 if (flags & EXT4_IMMUTABLE_FL)
2580 inode->i_flags |= S_IMMUTABLE;
2581 if (flags & EXT4_NOATIME_FL)
2582 inode->i_flags |= S_NOATIME;
2583 if (flags & EXT4_DIRSYNC_FL)
2584 inode->i_flags |= S_DIRSYNC;
2585}
2586
2587void ext4_read_inode(struct inode * inode)
2588{
2589 struct ext4_iloc iloc;
2590 struct ext4_inode *raw_inode;
2591 struct ext4_inode_info *ei = EXT4_I(inode);
2592 struct buffer_head *bh;
2593 int block;
2594
2595#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
2596 ei->i_acl = EXT4_ACL_NOT_CACHED;
2597 ei->i_default_acl = EXT4_ACL_NOT_CACHED;
2598#endif
2599 ei->i_block_alloc_info = NULL;
2600
2601 if (__ext4_get_inode_loc(inode, &iloc, 0))
2602 goto bad_inode;
2603 bh = iloc.bh;
2604 raw_inode = ext4_raw_inode(&iloc);
2605 inode->i_mode = le16_to_cpu(raw_inode->i_mode);
2606 inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
2607 inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
2608	if (!(test_opt(inode->i_sb, NO_UID32))) {
2609 inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
2610 inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
2611 }
2612 inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
2613 inode->i_size = le32_to_cpu(raw_inode->i_size);
2614 inode->i_atime.tv_sec = le32_to_cpu(raw_inode->i_atime);
2615 inode->i_ctime.tv_sec = le32_to_cpu(raw_inode->i_ctime);
2616 inode->i_mtime.tv_sec = le32_to_cpu(raw_inode->i_mtime);
2617 inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_mtime.tv_nsec = 0;
2618
2619 ei->i_state = 0;
2620 ei->i_dir_start_lookup = 0;
2621 ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
2622	/* We now have enough fields to check if the inode was active or not.
2623	 * This is needed because nfsd might try to access dead inodes;
2624	 * the test is the same one that e2fsck uses.
2625	 * NeilBrown 1999oct15
2626	 */
2627 if (inode->i_nlink == 0) {
2628 if (inode->i_mode == 0 ||
2629 !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) {
2630 /* this inode is deleted */
2631 brelse (bh);
2632 goto bad_inode;
2633 }
2634 /* The only unlinked inodes we let through here have
2635 * valid i_mode and are being read by the orphan
2636 * recovery code: that's fine, we're about to complete
2637 * the process of deleting those. */
2638 }
2639 inode->i_blocks = le32_to_cpu(raw_inode->i_blocks);
2640 ei->i_flags = le32_to_cpu(raw_inode->i_flags);
2641#ifdef EXT4_FRAGMENTS
2642 ei->i_faddr = le32_to_cpu(raw_inode->i_faddr);
2643 ei->i_frag_no = raw_inode->i_frag;
2644 ei->i_frag_size = raw_inode->i_fsize;
2645#endif
2646 ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl);
2647 if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
2648 cpu_to_le32(EXT4_OS_HURD))
2649 ei->i_file_acl |=
2650 ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
2651 if (!S_ISREG(inode->i_mode)) {
2652 ei->i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl);
2653 } else {
2654 inode->i_size |=
2655 ((__u64)le32_to_cpu(raw_inode->i_size_high)) << 32;
2656 }
2657 ei->i_disksize = inode->i_size;
2658 inode->i_generation = le32_to_cpu(raw_inode->i_generation);
2659 ei->i_block_group = iloc.block_group;
2660 /*
2661 * NOTE! The in-memory inode i_data array is in little-endian order
2662 * even on big-endian machines: we do NOT byteswap the block numbers!
2663 */
2664 for (block = 0; block < EXT4_N_BLOCKS; block++)
2665 ei->i_data[block] = raw_inode->i_block[block];
2666 INIT_LIST_HEAD(&ei->i_orphan);
2667
2668 if (inode->i_ino >= EXT4_FIRST_INO(inode->i_sb) + 1 &&
2669 EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
2670 /*
2671 * When mke2fs creates big inodes it does not zero out
2672 * the unused bytes above EXT4_GOOD_OLD_INODE_SIZE,
2673 * so ignore those first few inodes.
2674 */
2675 ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
2676 if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
2677 EXT4_INODE_SIZE(inode->i_sb))
2678 goto bad_inode;
2679 if (ei->i_extra_isize == 0) {
2680 /* The extra space is currently unused. Use it. */
2681 ei->i_extra_isize = sizeof(struct ext4_inode) -
2682 EXT4_GOOD_OLD_INODE_SIZE;
2683 } else {
2684 __le32 *magic = (void *)raw_inode +
2685 EXT4_GOOD_OLD_INODE_SIZE +
2686 ei->i_extra_isize;
2687 if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC))
2688 ei->i_state |= EXT4_STATE_XATTR;
2689 }
2690 } else
2691 ei->i_extra_isize = 0;
2692
2693 if (S_ISREG(inode->i_mode)) {
2694 inode->i_op = &ext4_file_inode_operations;
2695 inode->i_fop = &ext4_file_operations;
2696 ext4_set_aops(inode);
2697 } else if (S_ISDIR(inode->i_mode)) {
2698 inode->i_op = &ext4_dir_inode_operations;
2699 inode->i_fop = &ext4_dir_operations;
2700 } else if (S_ISLNK(inode->i_mode)) {
2701 if (ext4_inode_is_fast_symlink(inode))
2702 inode->i_op = &ext4_fast_symlink_inode_operations;
2703 else {
2704 inode->i_op = &ext4_symlink_inode_operations;
2705 ext4_set_aops(inode);
2706 }
2707 } else {
2708 inode->i_op = &ext4_special_inode_operations;
2709 if (raw_inode->i_block[0])
2710 init_special_inode(inode, inode->i_mode,
2711 old_decode_dev(le32_to_cpu(raw_inode->i_block[0])));
2712 else
2713 init_special_inode(inode, inode->i_mode,
2714 new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
2715 }
2716 brelse (iloc.bh);
2717 ext4_set_inode_flags(inode);
2718 return;
2719
2720bad_inode:
2721 make_bad_inode(inode);
2722 return;
2723}
2724
2725/*
2726 * Post the struct inode info into an on-disk inode location in the
2727 * buffer-cache. This gobbles the caller's reference to the
2728 * buffer_head in the inode location struct.
2729 *
2730 * The caller must have write access to iloc->bh.
2731 */
2732static int ext4_do_update_inode(handle_t *handle,
2733 struct inode *inode,
2734 struct ext4_iloc *iloc)
2735{
2736 struct ext4_inode *raw_inode = ext4_raw_inode(iloc);
2737 struct ext4_inode_info *ei = EXT4_I(inode);
2738 struct buffer_head *bh = iloc->bh;
2739 int err = 0, rc, block;
2740
2741	/* For fields not tracked in the in-memory inode,
2742 * initialise them to zero for new inodes. */
2743 if (ei->i_state & EXT4_STATE_NEW)
2744 memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
2745
2746 raw_inode->i_mode = cpu_to_le16(inode->i_mode);
2747	if (!(test_opt(inode->i_sb, NO_UID32))) {
2748 raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid));
2749 raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid));
2750/*
2751 * Fix up interoperability with old kernels. Otherwise, old inodes get
2752 * re-used with the upper 16 bits of the uid/gid intact
2753 */
2754		if (!ei->i_dtime) {
2755 raw_inode->i_uid_high =
2756 cpu_to_le16(high_16_bits(inode->i_uid));
2757 raw_inode->i_gid_high =
2758 cpu_to_le16(high_16_bits(inode->i_gid));
2759 } else {
2760 raw_inode->i_uid_high = 0;
2761 raw_inode->i_gid_high = 0;
2762 }
2763 } else {
2764 raw_inode->i_uid_low =
2765 cpu_to_le16(fs_high2lowuid(inode->i_uid));
2766 raw_inode->i_gid_low =
2767 cpu_to_le16(fs_high2lowgid(inode->i_gid));
2768 raw_inode->i_uid_high = 0;
2769 raw_inode->i_gid_high = 0;
2770 }
2771 raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
2772 raw_inode->i_size = cpu_to_le32(ei->i_disksize);
2773 raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
2774 raw_inode->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
2775 raw_inode->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
2776 raw_inode->i_blocks = cpu_to_le32(inode->i_blocks);
2777 raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
2778 raw_inode->i_flags = cpu_to_le32(ei->i_flags);
2779#ifdef EXT4_FRAGMENTS
2780 raw_inode->i_faddr = cpu_to_le32(ei->i_faddr);
2781 raw_inode->i_frag = ei->i_frag_no;
2782 raw_inode->i_fsize = ei->i_frag_size;
2783#endif
2784 if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
2785 cpu_to_le32(EXT4_OS_HURD))
2786 raw_inode->i_file_acl_high =
2787 cpu_to_le16(ei->i_file_acl >> 32);
2788 raw_inode->i_file_acl = cpu_to_le32(ei->i_file_acl);
2789 if (!S_ISREG(inode->i_mode)) {
2790 raw_inode->i_dir_acl = cpu_to_le32(ei->i_dir_acl);
2791 } else {
2792 raw_inode->i_size_high =
2793 cpu_to_le32(ei->i_disksize >> 32);
2794 if (ei->i_disksize > 0x7fffffffULL) {
2795 struct super_block *sb = inode->i_sb;
2796 if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
2797 EXT4_FEATURE_RO_COMPAT_LARGE_FILE) ||
2798 EXT4_SB(sb)->s_es->s_rev_level ==
2799 cpu_to_le32(EXT4_GOOD_OLD_REV)) {
2800 /* If this is the first large file
2801 * created, add a flag to the superblock.
2802 */
2803 err = ext4_journal_get_write_access(handle,
2804 EXT4_SB(sb)->s_sbh);
2805 if (err)
2806 goto out_brelse;
2807 ext4_update_dynamic_rev(sb);
2808 EXT4_SET_RO_COMPAT_FEATURE(sb,
2809 EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
2810 sb->s_dirt = 1;
2811 handle->h_sync = 1;
2812 err = ext4_journal_dirty_metadata(handle,
2813 EXT4_SB(sb)->s_sbh);
2814 }
2815 }
2816 }
2817 raw_inode->i_generation = cpu_to_le32(inode->i_generation);
2818 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
2819 if (old_valid_dev(inode->i_rdev)) {
2820 raw_inode->i_block[0] =
2821 cpu_to_le32(old_encode_dev(inode->i_rdev));
2822 raw_inode->i_block[1] = 0;
2823 } else {
2824 raw_inode->i_block[0] = 0;
2825 raw_inode->i_block[1] =
2826 cpu_to_le32(new_encode_dev(inode->i_rdev));
2827 raw_inode->i_block[2] = 0;
2828 }
2829 } else for (block = 0; block < EXT4_N_BLOCKS; block++)
2830 raw_inode->i_block[block] = ei->i_data[block];
2831
2832 if (ei->i_extra_isize)
2833 raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
2834
2835 BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
2836 rc = ext4_journal_dirty_metadata(handle, bh);
2837 if (!err)
2838 err = rc;
2839 ei->i_state &= ~EXT4_STATE_NEW;
2840
2841out_brelse:
2842 brelse (bh);
2843 ext4_std_error(inode->i_sb, err);
2844 return err;
2845}
2846
2847/*
2848 * ext4_write_inode()
2849 *
2850 * We are called from a few places:
2851 *
2852 * - Within generic_file_write() for O_SYNC files.
2853 * Here, there will be no transaction running. We wait for any running
2854 *   transaction to commit.
2855 *
2856 * - Within sys_sync(), kupdate and such.
2857 *   We wait on commit, if told to.
2858 *
2859 * - Within prune_icache() (PF_MEMALLOC == true)
2860 * Here we simply return. We can't afford to block kswapd on the
2861 * journal commit.
2862 *
2863 * In all cases it is actually safe for us to return without doing anything,
2864 * because the inode has been copied into a raw inode buffer in
2865 * ext4_mark_inode_dirty(). This is a correctness thing for O_SYNC and for
2866 * knfsd.
2867 *
2868 * Note that we are absolutely dependent upon all inode dirtiers doing the
2869 * right thing: they *must* call mark_inode_dirty() after dirtying info in
2870 * which we are interested.
2871 *
2872 * It would be a bug for them to not do this. The code:
2873 *
2874 * mark_inode_dirty(inode)
2875 * stuff();
2876 * inode->i_size = expr;
2877 *
2878 * is in error because a kswapd-driven write_inode() could occur while
2879 * `stuff()' is running, and the new i_size will be lost. Plus the inode
2880 * will no longer be on the superblock's dirty inode list.
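 *
 * The correct order is, of course:
 *
 *	stuff();
 *	inode->i_size = expr;
 *	mark_inode_dirty(inode);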
2881 */
2882int ext4_write_inode(struct inode *inode, int wait)
2883{
2884 if (current->flags & PF_MEMALLOC)
2885 return 0;
2886
2887 if (ext4_journal_current_handle()) {
2888 jbd_debug(0, "called recursively, non-PF_MEMALLOC!\n");
2889 dump_stack();
2890 return -EIO;
2891 }
2892
2893 if (!wait)
2894 return 0;
2895
2896 return ext4_force_commit(inode->i_sb);
2897}
2898
2899/*
2900 * ext4_setattr()
2901 *
2902 * Called from notify_change.
2903 *
2904 * We want to trap VFS attempts to truncate the file as soon as
2905 * possible. In particular, we want to make sure that when the VFS
2906 * shrinks i_size, we put the inode on the orphan list and modify
2907 * i_disksize immediately, so that during the subsequent flushing of
2908 * dirty pages and freeing of disk blocks, we can guarantee that any
2909 * commit will leave the blocks being flushed in an unused state on
2910 * disk. (On recovery, the inode will get truncated and the blocks will
2911 * be freed, so we have a strong guarantee that no future commit will
2912 * leave these blocks visible to the user.)
2913 *
2914 * Called with inode->sem down.
2915 */
2916int ext4_setattr(struct dentry *dentry, struct iattr *attr)
2917{
2918 struct inode *inode = dentry->d_inode;
2919 int error, rc = 0;
2920 const unsigned int ia_valid = attr->ia_valid;
2921
2922 error = inode_change_ok(inode, attr);
2923 if (error)
2924 return error;
2925
2926 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
2927 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
2928 handle_t *handle;
2929
2930 /* (user+group)*(old+new) structure, inode write (sb,
2931 * inode block, ? - but truncate inode update has it) */
2932 handle = ext4_journal_start(inode, 2*(EXT4_QUOTA_INIT_BLOCKS(inode->i_sb)+
2933 EXT4_QUOTA_DEL_BLOCKS(inode->i_sb))+3);
2934 if (IS_ERR(handle)) {
2935 error = PTR_ERR(handle);
2936 goto err_out;
2937 }
2938 error = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0;
2939 if (error) {
2940 ext4_journal_stop(handle);
2941 return error;
2942 }
2943 /* Update corresponding info in inode so that everything is in
2944 * one transaction */
2945 if (attr->ia_valid & ATTR_UID)
2946 inode->i_uid = attr->ia_uid;
2947 if (attr->ia_valid & ATTR_GID)
2948 inode->i_gid = attr->ia_gid;
2949 error = ext4_mark_inode_dirty(handle, inode);
2950 ext4_journal_stop(handle);
2951 }
2952
2953 if (S_ISREG(inode->i_mode) &&
2954 attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) {
2955 handle_t *handle;
2956
2957 handle = ext4_journal_start(inode, 3);
2958 if (IS_ERR(handle)) {
2959 error = PTR_ERR(handle);
2960 goto err_out;
2961 }
2962
2963 error = ext4_orphan_add(handle, inode);
2964 EXT4_I(inode)->i_disksize = attr->ia_size;
2965 rc = ext4_mark_inode_dirty(handle, inode);
2966 if (!error)
2967 error = rc;
2968 ext4_journal_stop(handle);
2969 }
2970
2971 rc = inode_setattr(inode, attr);
2972
2973 /* If inode_setattr's call to ext4_truncate failed to get a
2974 * transaction handle at all, we need to clean up the in-core
2975 * orphan list manually. */
2976 if (inode->i_nlink)
2977 ext4_orphan_del(NULL, inode);
2978
2979 if (!rc && (ia_valid & ATTR_MODE))
2980 rc = ext4_acl_chmod(inode);
2981
2982err_out:
2983 ext4_std_error(inode->i_sb, error);
2984 if (!error)
2985 error = rc;
2986 return error;
2987}
2988
2989
2990/*
2991 * How many blocks doth make a writepage()?
2992 *
2993 * With N blocks per page, it may be:
2994 * N data blocks
2995 * 2 indirect blocks
2996 * 2 dindirect
2997 * 1 tindirect
2998 * N+5 bitmap blocks (from the above)
2999 * N+5 group descriptor summary blocks
3000 * 1 inode block
3001 * 1 superblock.
3002 * 2 * EXT4_SINGLEDATA_TRANS_BLOCKS for the quota files
3003 *
3004 * 3 * (N + 5) + 2 + 2 * EXT4_SINGLEDATA_TRANS_BLOCKS
3005 *
3006 * With ordered or writeback data it's the same, less the N data blocks.
3007 *
3008 * If the inode's direct blocks can hold an integral number of pages then a
3009 * page cannot straddle two indirect blocks, and we can only touch one indirect
3010 * and dindirect block, and the "5" above becomes "3".
3011 *
3012 * This still overestimates under most circumstances. If we were to pass the
3013 * start and end offsets in here as well we could do block_to_path() on each
3014 * block and work out the exact number of indirects which are touched. Pah.
3015 */
3016
3017int ext4_writepage_trans_blocks(struct inode *inode)
3018{
3019 int bpp = ext4_journal_blocks_per_page(inode);
3020 int indirects = (EXT4_NDIR_BLOCKS % bpp) ? 5 : 3;
3021 int ret;
3022
3023 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
3024 return ext4_ext_writepage_trans_blocks(inode, bpp);
3025
3026 if (ext4_should_journal_data(inode))
3027 ret = 3 * (bpp + indirects) + 2;
3028 else
3029 ret = 2 * (bpp + indirects) + 2;
3030
3031#ifdef CONFIG_QUOTA
3032 /* We know that structure was already allocated during DQUOT_INIT so
3033 * we will be updating only the data blocks + inodes */
3034 ret += 2*EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
3035#endif
3036
3037 return ret;
3038}
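/*
 * Worked example of the estimate above, assuming 4k blocks on 4k pages
 * (so bpp == 1): EXT4_NDIR_BLOCKS (12) % 1 == 0, hence indirects == 3,
 * and data=journal mode reserves 3 * (1 + 3) + 2 = 14 buffer credits,
 * plus 2 * EXT4_QUOTA_TRANS_BLOCKS when quota is compiled in.
 */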
3039
3040/*
3041 * The caller must have previously called ext4_reserve_inode_write().
3042 * Given this, we know that the caller already has write access to iloc->bh.
3043 */
3044int ext4_mark_iloc_dirty(handle_t *handle,
3045 struct inode *inode, struct ext4_iloc *iloc)
3046{
3047 int err = 0;
3048
3049 /* the do_update_inode consumes one bh->b_count */
3050 get_bh(iloc->bh);
3051
3052 /* ext4_do_update_inode() does jbd2_journal_dirty_metadata */
3053 err = ext4_do_update_inode(handle, inode, iloc);
3054 put_bh(iloc->bh);
3055 return err;
3056}
3057
3058/*
3059 * On success, we end up with an outstanding reference count against
3060 * iloc->bh. This _must_ be cleaned up later.
3061 */
3062
3063int
3064ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
3065 struct ext4_iloc *iloc)
3066{
3067 int err = 0;
3068 if (handle) {
3069 err = ext4_get_inode_loc(inode, iloc);
3070 if (!err) {
3071 BUFFER_TRACE(iloc->bh, "get_write_access");
3072 err = ext4_journal_get_write_access(handle, iloc->bh);
3073 if (err) {
3074 brelse(iloc->bh);
3075 iloc->bh = NULL;
3076 }
3077 }
3078 }
3079 ext4_std_error(inode->i_sb, err);
3080 return err;
3081}
3082
3083/*
3084 * What we do here is to mark the in-core inode as clean with respect to inode
3085 * dirtiness (it may still be data-dirty).
3086 * This means that the in-core inode may be reaped by prune_icache
3087 * without having to perform any I/O. This is a very good thing,
3088 * because *any* task may call prune_icache - even ones which
3089 * have a transaction open against a different journal.
3090 *
3091 * Is this cheating? Not really. Sure, we haven't written the
3092 * inode out, but prune_icache isn't a user-visible syncing function.
3093 * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync)
3094 * we start and wait on commits.
3095 *
3096 * Is this efficient/effective? Well, we're being nice to the system
3097 * by cleaning up our inodes proactively so they can be reaped
3098 * without I/O. But we are potentially leaving up to five seconds'
3099 * worth of inodes floating about which prune_icache wants us to
3100 * write out. One way to fix that would be to get prune_icache()
3101 * to do a write_super() to free up some memory. It has the desired
3102 * effect.
3103 */
3104int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
3105{
3106 struct ext4_iloc iloc;
3107 int err;
3108
3109 might_sleep();
3110 err = ext4_reserve_inode_write(handle, inode, &iloc);
3111 if (!err)
3112 err = ext4_mark_iloc_dirty(handle, inode, &iloc);
3113 return err;
3114}
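/*
 * Hypothetical caller sketch (not code from this file): the usual
 * pattern is to start a handle, update the in-core inode, then mark it
 * dirty so the raw inode buffer joins the running transaction:
 *
 *	handle = ext4_journal_start(inode, 1);
 *	if (IS_ERR(handle))
 *		return PTR_ERR(handle);
 *	inode->i_ctime = CURRENT_TIME_SEC;
 *	err = ext4_mark_inode_dirty(handle, inode);
 *	ext4_journal_stop(handle);
 */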
3115
3116/*
3117 * ext4_dirty_inode() is called from __mark_inode_dirty()
3118 *
3119 * We're really interested in the case where a file is being extended.
3120 * i_size has been changed by generic_commit_write() and we thus need
3121 * to include the updated inode in the current transaction.
3122 *
3123 * Also, DQUOT_ALLOC_SPACE() will always dirty the inode when blocks
3124 * are allocated to the file.
3125 *
3126 * If the inode is marked synchronous, we don't honour that here - doing
3127 * so would cause a commit on atime updates, which we don't bother doing.
3128 * We handle synchronous inodes at the highest possible level.
3129 */
3130void ext4_dirty_inode(struct inode *inode)
3131{
3132 handle_t *current_handle = ext4_journal_current_handle();
3133 handle_t *handle;
3134
3135 handle = ext4_journal_start(inode, 2);
3136 if (IS_ERR(handle))
3137 goto out;
3138 if (current_handle &&
3139 current_handle->h_transaction != handle->h_transaction) {
3140 /* This task has a transaction open against a different fs */
3141 printk(KERN_EMERG "%s: transactions do not match!\n",
3142 __FUNCTION__);
3143 } else {
3144 jbd_debug(5, "marking dirty. outer handle=%p\n",
3145 current_handle);
3146 ext4_mark_inode_dirty(handle, inode);
3147 }
3148 ext4_journal_stop(handle);
3149out:
3150 return;
3151}
3152
3153#if 0
3154/*
3155 * Bind an inode's backing buffer_head into this transaction, to prevent
3156 * it from being flushed to disk early. Unlike
3157 * ext4_reserve_inode_write, this leaves behind no bh reference and
3158 * returns no iloc structure, so the caller needs to repeat the iloc
3159 * lookup to mark the inode dirty later.
3160 */
3161static int ext4_pin_inode(handle_t *handle, struct inode *inode)
3162{
3163 struct ext4_iloc iloc;
3164
3165 int err = 0;
3166 if (handle) {
3167 err = ext4_get_inode_loc(inode, &iloc);
3168 if (!err) {
3169 BUFFER_TRACE(iloc.bh, "get_write_access");
3170 err = jbd2_journal_get_write_access(handle, iloc.bh);
3171 if (!err)
3172 err = ext4_journal_dirty_metadata(handle,
3173 iloc.bh);
3174 brelse(iloc.bh);
3175 }
3176 }
3177 ext4_std_error(inode->i_sb, err);
3178 return err;
3179}
3180#endif
3181
3182int ext4_change_inode_journal_flag(struct inode *inode, int val)
3183{
3184 journal_t *journal;
3185 handle_t *handle;
3186 int err;
3187
3188 /*
3189 * We have to be very careful here: changing a data block's
3190 * journaling status dynamically is dangerous. If we write a
3191 * data block to the journal, change the status and then delete
3192 * that block, we risk forgetting to revoke the old log record
3193 * from the journal and so a subsequent replay can corrupt data.
3194 * So, first we make sure that the journal is empty and that
3195 * nobody is changing anything.
3196 */
3197
3198 journal = EXT4_JOURNAL(inode);
3199 if (is_journal_aborted(journal) || IS_RDONLY(inode))
3200 return -EROFS;
3201
3202 jbd2_journal_lock_updates(journal);
3203 jbd2_journal_flush(journal);
3204
3205 /*
3206 * OK, there are no updates running now, and all cached data is
3207 * synced to disk. We are now in a completely consistent state
3208 * which doesn't have anything in the journal, and we know that
3209 * no filesystem updates are running, so it is safe to modify
3210 * the inode's in-core data-journaling state flag now.
3211 */
3212
3213 if (val)
3214 EXT4_I(inode)->i_flags |= EXT4_JOURNAL_DATA_FL;
3215 else
3216 EXT4_I(inode)->i_flags &= ~EXT4_JOURNAL_DATA_FL;
3217 ext4_set_aops(inode);
3218
3219 jbd2_journal_unlock_updates(journal);
3220
3221 /* Finally we can mark the inode as dirty. */
3222
3223 handle = ext4_journal_start(inode, 1);
3224 if (IS_ERR(handle))
3225 return PTR_ERR(handle);
3226
3227 err = ext4_mark_inode_dirty(handle, inode);
3228 handle->h_sync = 1;
3229 ext4_journal_stop(handle);
3230 ext4_std_error(inode->i_sb, err);
3231
3232 return err;
3233}
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
new file mode 100644
index 000000000000..22a737c306c7
--- /dev/null
+++ b/fs/ext4/ioctl.c
@@ -0,0 +1,306 @@
1/*
2 * linux/fs/ext4/ioctl.c
3 *
4 * Copyright (C) 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI)
8 */
9
10#include <linux/fs.h>
11#include <linux/jbd2.h>
12#include <linux/capability.h>
13#include <linux/ext4_fs.h>
14#include <linux/ext4_jbd2.h>
15#include <linux/time.h>
16#include <linux/compat.h>
17#include <linux/smp_lock.h>
18#include <asm/uaccess.h>
19
20int ext4_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
21 unsigned long arg)
22{
23 struct ext4_inode_info *ei = EXT4_I(inode);
24 unsigned int flags;
25 unsigned short rsv_window_size;
26
27 ext4_debug ("cmd = %u, arg = %lu\n", cmd, arg);
28
29 switch (cmd) {
30 case EXT4_IOC_GETFLAGS:
31 flags = ei->i_flags & EXT4_FL_USER_VISIBLE;
32 return put_user(flags, (int __user *) arg);
33 case EXT4_IOC_SETFLAGS: {
34 handle_t *handle = NULL;
35 int err;
36 struct ext4_iloc iloc;
37 unsigned int oldflags;
38 unsigned int jflag;
39
40 if (IS_RDONLY(inode))
41 return -EROFS;
42
43 if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
44 return -EACCES;
45
46 if (get_user(flags, (int __user *) arg))
47 return -EFAULT;
48
49 if (!S_ISDIR(inode->i_mode))
50 flags &= ~EXT4_DIRSYNC_FL;
51
52 mutex_lock(&inode->i_mutex);
53 oldflags = ei->i_flags;
54
55 /* The JOURNAL_DATA flag is modifiable only by root */
56 jflag = flags & EXT4_JOURNAL_DATA_FL;
57
58 /*
59 * The IMMUTABLE and APPEND_ONLY flags can only be changed by
60 * the relevant capability.
61 *
62 * This test looks nicer. Thanks to Pauline Middelink
63 */
64 if ((flags ^ oldflags) & (EXT4_APPEND_FL | EXT4_IMMUTABLE_FL)) {
65 if (!capable(CAP_LINUX_IMMUTABLE)) {
66 mutex_unlock(&inode->i_mutex);
67 return -EPERM;
68 }
69 }
70
71 /*
72 * The JOURNAL_DATA flag can only be changed by
73 * the relevant capability.
74 */
75 if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) {
76 if (!capable(CAP_SYS_RESOURCE)) {
77 mutex_unlock(&inode->i_mutex);
78 return -EPERM;
79 }
80 }
81
82
83 handle = ext4_journal_start(inode, 1);
84 if (IS_ERR(handle)) {
85 mutex_unlock(&inode->i_mutex);
86 return PTR_ERR(handle);
87 }
88 if (IS_SYNC(inode))
89 handle->h_sync = 1;
90 err = ext4_reserve_inode_write(handle, inode, &iloc);
91 if (err)
92 goto flags_err;
93
94 flags = flags & EXT4_FL_USER_MODIFIABLE;
95 flags |= oldflags & ~EXT4_FL_USER_MODIFIABLE;
96 ei->i_flags = flags;
97
98 ext4_set_inode_flags(inode);
99 inode->i_ctime = CURRENT_TIME_SEC;
100
101 err = ext4_mark_iloc_dirty(handle, inode, &iloc);
102flags_err:
103 ext4_journal_stop(handle);
104 if (err) {
105 mutex_unlock(&inode->i_mutex);
106 return err;
107 }
108
109 if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL))
110 err = ext4_change_inode_journal_flag(inode, jflag);
111 mutex_unlock(&inode->i_mutex);
112 return err;
113 }
114 case EXT4_IOC_GETVERSION:
115 case EXT4_IOC_GETVERSION_OLD:
116 return put_user(inode->i_generation, (int __user *) arg);
117 case EXT4_IOC_SETVERSION:
118 case EXT4_IOC_SETVERSION_OLD: {
119 handle_t *handle;
120 struct ext4_iloc iloc;
121 __u32 generation;
122 int err;
123
124 if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
125 return -EPERM;
126 if (IS_RDONLY(inode))
127 return -EROFS;
128 if (get_user(generation, (int __user *) arg))
129 return -EFAULT;
130
131 handle = ext4_journal_start(inode, 1);
132 if (IS_ERR(handle))
133 return PTR_ERR(handle);
134 err = ext4_reserve_inode_write(handle, inode, &iloc);
135 if (err == 0) {
136 inode->i_ctime = CURRENT_TIME_SEC;
137 inode->i_generation = generation;
138 err = ext4_mark_iloc_dirty(handle, inode, &iloc);
139 }
140 ext4_journal_stop(handle);
141 return err;
142 }
143#ifdef CONFIG_JBD_DEBUG
144 case EXT4_IOC_WAIT_FOR_READONLY:
145 /*
146 * This is racy - by the time we're woken up and running,
147 * the superblock could be released. And the module could
148 * have been unloaded. So sue me.
149 *
150 * Returns 1 if it slept, else zero.
151 */
152 {
153 struct super_block *sb = inode->i_sb;
154 DECLARE_WAITQUEUE(wait, current);
155 int ret = 0;
156
157 set_current_state(TASK_INTERRUPTIBLE);
158 add_wait_queue(&EXT4_SB(sb)->ro_wait_queue, &wait);
159 if (timer_pending(&EXT4_SB(sb)->turn_ro_timer)) {
160 schedule();
161 ret = 1;
162 }
163 remove_wait_queue(&EXT4_SB(sb)->ro_wait_queue, &wait);
164 return ret;
165 }
166#endif
167 case EXT4_IOC_GETRSVSZ:
168 if (test_opt(inode->i_sb, RESERVATION)
169 && S_ISREG(inode->i_mode)
170 && ei->i_block_alloc_info) {
171 rsv_window_size = ei->i_block_alloc_info->rsv_window_node.rsv_goal_size;
172 return put_user(rsv_window_size, (int __user *)arg);
173 }
174 return -ENOTTY;
175 case EXT4_IOC_SETRSVSZ: {
176
177		if (!test_opt(inode->i_sb, RESERVATION) || !S_ISREG(inode->i_mode))
178 return -ENOTTY;
179
180 if (IS_RDONLY(inode))
181 return -EROFS;
182
183 if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
184 return -EACCES;
185
186 if (get_user(rsv_window_size, (int __user *)arg))
187 return -EFAULT;
188
189 if (rsv_window_size > EXT4_MAX_RESERVE_BLOCKS)
190 rsv_window_size = EXT4_MAX_RESERVE_BLOCKS;
191
192 /*
193		 * need to allocate the reservation structure for this inode
194		 * before setting the window size
195 */
196 mutex_lock(&ei->truncate_mutex);
197 if (!ei->i_block_alloc_info)
198 ext4_init_block_alloc_info(inode);
199
200		if (ei->i_block_alloc_info) {
201 struct ext4_reserve_window_node *rsv = &ei->i_block_alloc_info->rsv_window_node;
202 rsv->rsv_goal_size = rsv_window_size;
203 }
204 mutex_unlock(&ei->truncate_mutex);
205 return 0;
206 }
207 case EXT4_IOC_GROUP_EXTEND: {
208 ext4_fsblk_t n_blocks_count;
209 struct super_block *sb = inode->i_sb;
210 int err;
211
212 if (!capable(CAP_SYS_RESOURCE))
213 return -EPERM;
214
215 if (IS_RDONLY(inode))
216 return -EROFS;
217
218 if (get_user(n_blocks_count, (__u32 __user *)arg))
219 return -EFAULT;
220
221 err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count);
222 jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
223 jbd2_journal_flush(EXT4_SB(sb)->s_journal);
224 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
225
226 return err;
227 }
228 case EXT4_IOC_GROUP_ADD: {
229 struct ext4_new_group_data input;
230 struct super_block *sb = inode->i_sb;
231 int err;
232
233 if (!capable(CAP_SYS_RESOURCE))
234 return -EPERM;
235
236 if (IS_RDONLY(inode))
237 return -EROFS;
238
239 if (copy_from_user(&input, (struct ext4_new_group_input __user *)arg,
240 sizeof(input)))
241 return -EFAULT;
242
243 err = ext4_group_add(sb, &input);
244 jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
245 jbd2_journal_flush(EXT4_SB(sb)->s_journal);
246 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
247
248 return err;
249 }
250
251 default:
252 return -ENOTTY;
253 }
254}
255
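/*
 * Hypothetical userspace sketch of the flags interface implemented
 * above (ioctl numbers from the ext4 headers; error handling omitted):
 *
 *	int flags;
 *	ioctl(fd, EXT4_IOC_GETFLAGS, &flags);
 *	flags |= EXT4_APPEND_FL;
 *	ioctl(fd, EXT4_IOC_SETFLAGS, &flags);
 */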
256#ifdef CONFIG_COMPAT
257long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
258{
259 struct inode *inode = file->f_dentry->d_inode;
260 int ret;
261
262	/* These are just misnamed; they actually get/put an int from/to user space */
263 switch (cmd) {
264 case EXT4_IOC32_GETFLAGS:
265 cmd = EXT4_IOC_GETFLAGS;
266 break;
267 case EXT4_IOC32_SETFLAGS:
268 cmd = EXT4_IOC_SETFLAGS;
269 break;
270 case EXT4_IOC32_GETVERSION:
271 cmd = EXT4_IOC_GETVERSION;
272 break;
273 case EXT4_IOC32_SETVERSION:
274 cmd = EXT4_IOC_SETVERSION;
275 break;
276 case EXT4_IOC32_GROUP_EXTEND:
277 cmd = EXT4_IOC_GROUP_EXTEND;
278 break;
279 case EXT4_IOC32_GETVERSION_OLD:
280 cmd = EXT4_IOC_GETVERSION_OLD;
281 break;
282 case EXT4_IOC32_SETVERSION_OLD:
283 cmd = EXT4_IOC_SETVERSION_OLD;
284 break;
285#ifdef CONFIG_JBD_DEBUG
286 case EXT4_IOC32_WAIT_FOR_READONLY:
287 cmd = EXT4_IOC_WAIT_FOR_READONLY;
288 break;
289#endif
290 case EXT4_IOC32_GETRSVSZ:
291 cmd = EXT4_IOC_GETRSVSZ;
292 break;
293 case EXT4_IOC32_SETRSVSZ:
294 cmd = EXT4_IOC_SETRSVSZ;
295 break;
296 case EXT4_IOC_GROUP_ADD:
297 break;
298 default:
299 return -ENOIOCTLCMD;
300 }
301 lock_kernel();
302 ret = ext4_ioctl(inode, file, cmd, (unsigned long) compat_ptr(arg));
303 unlock_kernel();
304 return ret;
305}
306#endif
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
new file mode 100644
index 000000000000..8b1bd03d20f5
--- /dev/null
+++ b/fs/ext4/namei.c
@@ -0,0 +1,2395 @@
1/*
2 * linux/fs/ext4/namei.c
3 *
4 * Copyright (C) 1992, 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI)
8 *
9 * from
10 *
11 * linux/fs/minix/namei.c
12 *
13 * Copyright (C) 1991, 1992 Linus Torvalds
14 *
15 * Big-endian to little-endian byte-swapping/bitmaps by
16 * David S. Miller (davem@caip.rutgers.edu), 1995
17 * Directory entry file type support and forward compatibility hooks
18 * for B-tree directories by Theodore Ts'o (tytso@mit.edu), 1998
19 * Hash Tree Directory indexing (c)
20 * Daniel Phillips, 2001
21 * Hash Tree Directory indexing porting
22 * Christopher Li, 2002
23 * Hash Tree Directory indexing cleanup
24 * Theodore Ts'o, 2002
25 */
26
27#include <linux/fs.h>
28#include <linux/pagemap.h>
29#include <linux/jbd2.h>
30#include <linux/time.h>
31#include <linux/ext4_fs.h>
32#include <linux/ext4_jbd2.h>
33#include <linux/fcntl.h>
34#include <linux/stat.h>
35#include <linux/string.h>
36#include <linux/quotaops.h>
37#include <linux/buffer_head.h>
38#include <linux/bio.h>
39#include <linux/smp_lock.h>
40
41#include "namei.h"
42#include "xattr.h"
43#include "acl.h"
44
45/*
46 * define how far ahead to read directories while searching them.
47 */
48#define NAMEI_RA_CHUNKS 2
49#define NAMEI_RA_BLOCKS 4
50#define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
51#define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b))
52
53static struct buffer_head *ext4_append(handle_t *handle,
54 struct inode *inode,
55 u32 *block, int *err)
56{
57 struct buffer_head *bh;
58
59 *block = inode->i_size >> inode->i_sb->s_blocksize_bits;
60
61 if ((bh = ext4_bread(handle, inode, *block, 1, err))) {
62 inode->i_size += inode->i_sb->s_blocksize;
63 EXT4_I(inode)->i_disksize = inode->i_size;
64 ext4_journal_get_write_access(handle,bh);
65 }
66 return bh;
67}
68
69#ifndef assert
70#define assert(test) J_ASSERT(test)
71#endif
72
73#ifndef swap
74#define swap(x, y) do { typeof(x) z = x; x = y; y = z; } while (0)
75#endif
76
77#ifdef DX_DEBUG
78#define dxtrace(command) command
79#else
80#define dxtrace(command)
81#endif
82
83struct fake_dirent
84{
85 __le32 inode;
86 __le16 rec_len;
87 u8 name_len;
88 u8 file_type;
89};
90
91struct dx_countlimit
92{
93 __le16 limit;
94 __le16 count;
95};
96
97struct dx_entry
98{
99 __le32 hash;
100 __le32 block;
101};
102
103/*
104 * dx_root_info is laid out so that if it should somehow get overlaid by a
105 * dirent the two low bits of the hash version will be zero. Therefore, the
106 * hash version mod 4 should never be 0. Sincerely, the paranoia department.
107 */
108
109struct dx_root
110{
111 struct fake_dirent dot;
112 char dot_name[4];
113 struct fake_dirent dotdot;
114 char dotdot_name[4];
115 struct dx_root_info
116 {
117 __le32 reserved_zero;
118 u8 hash_version;
119 u8 info_length; /* 8 */
120 u8 indirect_levels;
121 u8 unused_flags;
122 }
123 info;
124 struct dx_entry entries[0];
125};
126
127struct dx_node
128{
129 struct fake_dirent fake;
130 struct dx_entry entries[0];
131};
132
133
134struct dx_frame
135{
136 struct buffer_head *bh;
137 struct dx_entry *entries;
138 struct dx_entry *at;
139};
140
141struct dx_map_entry
142{
143 u32 hash;
144 u32 offs;
145};
146
147#ifdef CONFIG_EXT4_INDEX
148static inline unsigned dx_get_block (struct dx_entry *entry);
149static void dx_set_block (struct dx_entry *entry, unsigned value);
150static inline unsigned dx_get_hash (struct dx_entry *entry);
151static void dx_set_hash (struct dx_entry *entry, unsigned value);
152static unsigned dx_get_count (struct dx_entry *entries);
153static unsigned dx_get_limit (struct dx_entry *entries);
154static void dx_set_count (struct dx_entry *entries, unsigned value);
155static void dx_set_limit (struct dx_entry *entries, unsigned value);
156static unsigned dx_root_limit (struct inode *dir, unsigned infosize);
157static unsigned dx_node_limit (struct inode *dir);
158static struct dx_frame *dx_probe(struct dentry *dentry,
159 struct inode *dir,
160 struct dx_hash_info *hinfo,
161 struct dx_frame *frame,
162 int *err);
163static void dx_release (struct dx_frame *frames);
164static int dx_make_map (struct ext4_dir_entry_2 *de, int size,
165 struct dx_hash_info *hinfo, struct dx_map_entry map[]);
166static void dx_sort_map(struct dx_map_entry *map, unsigned count);
167static struct ext4_dir_entry_2 *dx_move_dirents (char *from, char *to,
168 struct dx_map_entry *offsets, int count);
169static struct ext4_dir_entry_2* dx_pack_dirents (char *base, int size);
170static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block);
171static int ext4_htree_next_block(struct inode *dir, __u32 hash,
172 struct dx_frame *frame,
173 struct dx_frame *frames,
174 __u32 *start_hash);
175static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry,
176 struct ext4_dir_entry_2 **res_dir, int *err);
177static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
178 struct inode *inode);
179
180/*
181 * Future: use high four bits of block for coalesce-on-delete flags
182 * Mask them off for now.
183 */
184
185static inline unsigned dx_get_block (struct dx_entry *entry)
186{
187 return le32_to_cpu(entry->block) & 0x00ffffff;
188}
189
190static inline void dx_set_block (struct dx_entry *entry, unsigned value)
191{
192 entry->block = cpu_to_le32(value);
193}
194
195static inline unsigned dx_get_hash (struct dx_entry *entry)
196{
197 return le32_to_cpu(entry->hash);
198}
199
200static inline void dx_set_hash (struct dx_entry *entry, unsigned value)
201{
202 entry->hash = cpu_to_le32(value);
203}
204
205static inline unsigned dx_get_count (struct dx_entry *entries)
206{
207 return le16_to_cpu(((struct dx_countlimit *) entries)->count);
208}
209
210static inline unsigned dx_get_limit (struct dx_entry *entries)
211{
212 return le16_to_cpu(((struct dx_countlimit *) entries)->limit);
213}
214
215static inline void dx_set_count (struct dx_entry *entries, unsigned value)
216{
217 ((struct dx_countlimit *) entries)->count = cpu_to_le16(value);
218}
219
220static inline void dx_set_limit (struct dx_entry *entries, unsigned value)
221{
222 ((struct dx_countlimit *) entries)->limit = cpu_to_le16(value);
223}
224
225static inline unsigned dx_root_limit (struct inode *dir, unsigned infosize)
226{
227 unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) -
228 EXT4_DIR_REC_LEN(2) - infosize;
229 return 0? 20: entry_space / sizeof(struct dx_entry);
230}
231
232static inline unsigned dx_node_limit (struct inode *dir)
233{
234 unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0);
235 return 0? 22: entry_space / sizeof(struct dx_entry);
236}
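/*
 * Worked example of the two limits above for a 4k block: the root
 * block loses EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) (12 + 12 bytes
 * for the "." and ".." stubs) plus the 8-byte info, leaving
 * (4096 - 32) / 8 = 508 dx_entry slots; an interior node only loses
 * the 8-byte fake dirent, giving (4096 - 8) / 8 = 511 slots.
 */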
237
238/*
239 * Debug
240 */
241#ifdef DX_DEBUG
242static void dx_show_index (char * label, struct dx_entry *entries)
243{
244 int i, n = dx_get_count (entries);
245 printk("%s index ", label);
246 for (i = 0; i < n; i++) {
247 printk("%x->%u ", i? dx_get_hash(entries + i) :
248 0, dx_get_block(entries + i));
249 }
250 printk("\n");
251}
252
253struct stats
254{
255 unsigned names;
256 unsigned space;
257 unsigned bcount;
258};
259
260static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext4_dir_entry_2 *de,
261 int size, int show_names)
262{
263 unsigned names = 0, space = 0;
264 char *base = (char *) de;
265 struct dx_hash_info h = *hinfo;
266
267 printk("names: ");
268 while ((char *) de < base + size)
269 {
270 if (de->inode)
271 {
272 if (show_names)
273 {
274 int len = de->name_len;
275 char *name = de->name;
276 while (len--) printk("%c", *name++);
277 ext4fs_dirhash(de->name, de->name_len, &h);
278 printk(":%x.%u ", h.hash,
279 ((char *) de - base));
280 }
281 space += EXT4_DIR_REC_LEN(de->name_len);
282 names++;
283 }
284 de = (struct ext4_dir_entry_2 *) ((char *) de + le16_to_cpu(de->rec_len));
285 }
286 printk("(%i)\n", names);
287 return (struct stats) { names, space, 1 };
288}
289
290struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
291 struct dx_entry *entries, int levels)
292{
293 unsigned blocksize = dir->i_sb->s_blocksize;
294 unsigned count = dx_get_count (entries), names = 0, space = 0, i;
295 unsigned bcount = 0;
296 struct buffer_head *bh;
297 int err;
298 printk("%i indexed blocks...\n", count);
299 for (i = 0; i < count; i++, entries++)
300 {
301 u32 block = dx_get_block(entries), hash = i? dx_get_hash(entries): 0;
302 u32 range = i < count - 1? (dx_get_hash(entries + 1) - hash): ~hash;
303 struct stats stats;
304 printk("%s%3u:%03u hash %8x/%8x ",levels?"":" ", i, block, hash, range);
305 if (!(bh = ext4_bread (NULL,dir, block, 0,&err))) continue;
306 stats = levels?
307 dx_show_entries(hinfo, dir, ((struct dx_node *) bh->b_data)->entries, levels - 1):
308 dx_show_leaf(hinfo, (struct ext4_dir_entry_2 *) bh->b_data, blocksize, 0);
309 names += stats.names;
310 space += stats.space;
311 bcount += stats.bcount;
312 brelse (bh);
313 }
314 if (bcount)
315 printk("%snames %u, fullness %u (%u%%)\n", levels?"":" ",
316 names, space/bcount,(space/bcount)*100/blocksize);
317 return (struct stats) { names, space, bcount};
318}
319#endif /* DX_DEBUG */
320
321/*
322 * Probe for a directory leaf block to search.
323 *
324 * dx_probe can return ERR_BAD_DX_DIR, which means there was a format
325 * error in the directory index, and the caller should fall back to
326 * searching the directory normally. The callers of dx_probe **MUST**
327 * check for this error code, and make sure it never gets reflected
328 * back to userspace.
329 */
330static struct dx_frame *
331dx_probe(struct dentry *dentry, struct inode *dir,
332 struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err)
333{
334 unsigned count, indirect;
335 struct dx_entry *at, *entries, *p, *q, *m;
336 struct dx_root *root;
337 struct buffer_head *bh;
338 struct dx_frame *frame = frame_in;
339 u32 hash;
340
341 frame->bh = NULL;
342 if (dentry)
343 dir = dentry->d_parent->d_inode;
344 if (!(bh = ext4_bread (NULL,dir, 0, 0, err)))
345 goto fail;
346 root = (struct dx_root *) bh->b_data;
347 if (root->info.hash_version != DX_HASH_TEA &&
348 root->info.hash_version != DX_HASH_HALF_MD4 &&
349 root->info.hash_version != DX_HASH_LEGACY) {
350 ext4_warning(dir->i_sb, __FUNCTION__,
351 "Unrecognised inode hash code %d",
352 root->info.hash_version);
353 brelse(bh);
354 *err = ERR_BAD_DX_DIR;
355 goto fail;
356 }
357 hinfo->hash_version = root->info.hash_version;
358 hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed;
359 if (dentry)
360 ext4fs_dirhash(dentry->d_name.name, dentry->d_name.len, hinfo);
361 hash = hinfo->hash;
362
363 if (root->info.unused_flags & 1) {
364 ext4_warning(dir->i_sb, __FUNCTION__,
365 "Unimplemented inode hash flags: %#06x",
366 root->info.unused_flags);
367 brelse(bh);
368 *err = ERR_BAD_DX_DIR;
369 goto fail;
370 }
371
372 if ((indirect = root->info.indirect_levels) > 1) {
373 ext4_warning(dir->i_sb, __FUNCTION__,
374 "Unimplemented inode hash depth: %#06x",
375 root->info.indirect_levels);
376 brelse(bh);
377 *err = ERR_BAD_DX_DIR;
378 goto fail;
379 }
380
381 entries = (struct dx_entry *) (((char *)&root->info) +
382 root->info.info_length);
383 assert(dx_get_limit(entries) == dx_root_limit(dir,
384 root->info.info_length));
385 dxtrace (printk("Look up %x", hash));
386 while (1)
387 {
388 count = dx_get_count(entries);
389 assert (count && count <= dx_get_limit(entries));
390 p = entries + 1;
391 q = entries + count - 1;
392 while (p <= q)
393 {
394 m = p + (q - p)/2;
395 dxtrace(printk("."));
396 if (dx_get_hash(m) > hash)
397 q = m - 1;
398 else
399 p = m + 1;
400 }
401
402 if (0) // linear search cross check
403 {
404 unsigned n = count - 1;
405 at = entries;
406 while (n--)
407 {
408 dxtrace(printk(","));
409 if (dx_get_hash(++at) > hash)
410 {
411 at--;
412 break;
413 }
414 }
415 assert (at == p - 1);
416 }
417
418 at = p - 1;
419 dxtrace(printk(" %x->%u\n", at == entries? 0: dx_get_hash(at), dx_get_block(at)));
420 frame->bh = bh;
421 frame->entries = entries;
422 frame->at = at;
423 if (!indirect--) return frame;
424 if (!(bh = ext4_bread (NULL,dir, dx_get_block(at), 0, err)))
425 goto fail2;
426 at = entries = ((struct dx_node *) bh->b_data)->entries;
427 assert (dx_get_limit(entries) == dx_node_limit (dir));
428 frame++;
429 }
430fail2:
431 while (frame >= frame_in) {
432 brelse(frame->bh);
433 frame--;
434 }
435fail:
436 return NULL;
437}
438
439static void dx_release (struct dx_frame *frames)
440{
441 if (frames[0].bh == NULL)
442 return;
443
444 if (((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels)
445 brelse(frames[1].bh);
446 brelse(frames[0].bh);
447}
448
449/*
450 * This function increments the frame pointer to search the next leaf
451 * block, and reads in the necessary intervening nodes if the search
452 * should be necessary. Whether or not the search is necessary is
453 * controlled by the hash parameter. If the hash value is even, then
454 * the search is only continued if the next block starts with that
455 * hash value. This is used if we are searching for a specific file.
456 *
457 * If the hash value is HASH_NB_ALWAYS, then always go to the next block.
458 *
459 * This function returns 1 if the caller should continue to search,
460 * or 0 if it should not. If there is an error reading one of the
461 * index blocks, it will return a negative error code.
462 *
463 * If start_hash is non-null, it will be filled in with the starting
464 * hash of the next page.
465 */
466static int ext4_htree_next_block(struct inode *dir, __u32 hash,
467 struct dx_frame *frame,
468 struct dx_frame *frames,
469 __u32 *start_hash)
470{
471 struct dx_frame *p;
472 struct buffer_head *bh;
473 int err, num_frames = 0;
474 __u32 bhash;
475
476 p = frame;
477 /*
478 * Find the next leaf page by incrementing the frame pointer.
479 * If we run out of entries in the interior node, loop around and
480 * increment pointer in the parent node. When we break out of
481 * this loop, num_frames indicates the number of interior
482	 * nodes that need to be read.
483 */
484 while (1) {
485 if (++(p->at) < p->entries + dx_get_count(p->entries))
486 break;
487 if (p == frames)
488 return 0;
489 num_frames++;
490 p--;
491 }
492
493 /*
494 * If the hash is 1, then continue only if the next page has a
495 * continuation hash of any value. This is used for readdir
496 * handling. Otherwise, check to see if the hash matches the
497	 * desired continuation hash. If it doesn't, return since
498	 * there's no point in reading the successive index pages.
499 */
500 bhash = dx_get_hash(p->at);
501 if (start_hash)
502 *start_hash = bhash;
503 if ((hash & 1) == 0) {
504 if ((bhash & ~1) != hash)
505 return 0;
506 }
507 /*
508 * If the hash is HASH_NB_ALWAYS, we always go to the next
509 * block so no check is necessary
510 */
511 while (num_frames--) {
512 if (!(bh = ext4_bread(NULL, dir, dx_get_block(p->at),
513 0, &err)))
514 return err; /* Failure */
515 p++;
516 brelse (p->bh);
517 p->bh = bh;
518 p->at = p->entries = ((struct dx_node *) bh->b_data)->entries;
519 }
520 return 1;
521}
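/*
 * Worked example of the even/odd hash convention used above: leaf
 * hashes always have the low bit clear, and a block whose first entry
 * collides with the last entry of the previous block advertises a
 * start hash of (hash | 1).  So when looking up an even hash h, the
 * scan continues into the next block only if that block starts at h or
 * at h | 1 - i.e. (bhash & ~1) == h.
 */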
522
523
524/*
525 * p is at least 6 bytes before the end of page
526 */
527static inline struct ext4_dir_entry_2 *ext4_next_entry(struct ext4_dir_entry_2 *p)
528{
529 return (struct ext4_dir_entry_2 *)((char*)p + le16_to_cpu(p->rec_len));
530}
531
532/*
533 * This function fills a red-black tree with information from a
534 * directory block. It returns the number of directory entries loaded
535 * into the tree. If there is an error it is returned in err.
536 */
537static int htree_dirblock_to_tree(struct file *dir_file,
538 struct inode *dir, int block,
539 struct dx_hash_info *hinfo,
540 __u32 start_hash, __u32 start_minor_hash)
541{
542 struct buffer_head *bh;
543 struct ext4_dir_entry_2 *de, *top;
544 int err, count = 0;
545
546 dxtrace(printk("In htree dirblock_to_tree: block %d\n", block));
547 if (!(bh = ext4_bread (NULL, dir, block, 0, &err)))
548 return err;
549
550 de = (struct ext4_dir_entry_2 *) bh->b_data;
551 top = (struct ext4_dir_entry_2 *) ((char *) de +
552 dir->i_sb->s_blocksize -
553 EXT4_DIR_REC_LEN(0));
554 for (; de < top; de = ext4_next_entry(de)) {
555 ext4fs_dirhash(de->name, de->name_len, hinfo);
556 if ((hinfo->hash < start_hash) ||
557 ((hinfo->hash == start_hash) &&
558 (hinfo->minor_hash < start_minor_hash)))
559 continue;
560 if (de->inode == 0)
561 continue;
562 if ((err = ext4_htree_store_dirent(dir_file,
563 hinfo->hash, hinfo->minor_hash, de)) != 0) {
564 brelse(bh);
565 return err;
566 }
567 count++;
568 }
569 brelse(bh);
570 return count;
571}
572
573
574/*
575 * This function fills a red-black tree with information from a
576 * directory. We start scanning the directory in hash order, starting
577 * at start_hash and start_minor_hash.
578 *
579 * This function returns the number of entries inserted into the tree,
580 * or a negative error code.
581 */
582int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
583 __u32 start_minor_hash, __u32 *next_hash)
584{
585 struct dx_hash_info hinfo;
586 struct ext4_dir_entry_2 *de;
587 struct dx_frame frames[2], *frame;
588 struct inode *dir;
589 int block, err;
590 int count = 0;
591 int ret;
592 __u32 hashval;
593
594 dxtrace(printk("In htree_fill_tree, start hash: %x:%x\n", start_hash,
595 start_minor_hash));
596 dir = dir_file->f_dentry->d_inode;
597 if (!(EXT4_I(dir)->i_flags & EXT4_INDEX_FL)) {
598 hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
599 hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
600 count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo,
601 start_hash, start_minor_hash);
602 *next_hash = ~0;
603 return count;
604 }
605 hinfo.hash = start_hash;
606 hinfo.minor_hash = 0;
607 frame = dx_probe(NULL, dir_file->f_dentry->d_inode, &hinfo, frames, &err);
608 if (!frame)
609 return err;
610
611 /* Add '.' and '..' from the htree header */
612 if (!start_hash && !start_minor_hash) {
613 de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data;
614 if ((err = ext4_htree_store_dirent(dir_file, 0, 0, de)) != 0)
615 goto errout;
616 count++;
617 }
618	if (start_hash < 2 || (start_hash == 2 && start_minor_hash == 0)) {
619 de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data;
620 de = ext4_next_entry(de);
621 if ((err = ext4_htree_store_dirent(dir_file, 2, 0, de)) != 0)
622 goto errout;
623 count++;
624 }
625
626 while (1) {
627 block = dx_get_block(frame->at);
628 ret = htree_dirblock_to_tree(dir_file, dir, block, &hinfo,
629 start_hash, start_minor_hash);
630 if (ret < 0) {
631 err = ret;
632 goto errout;
633 }
634 count += ret;
635 hashval = ~0;
636 ret = ext4_htree_next_block(dir, HASH_NB_ALWAYS,
637 frame, frames, &hashval);
638 *next_hash = hashval;
639 if (ret < 0) {
640 err = ret;
641 goto errout;
642 }
643 /*
644 * Stop if: (a) there are no more entries, or
645 * (b) we have inserted at least one entry and the
646 * next hash value is not a continuation
647 */
648 if ((ret == 0) ||
649 (count && ((hashval & 1) == 0)))
650 break;
651 }
652 dx_release(frames);
653 dxtrace(printk("Fill tree: returned %d entries, next hash: %x\n",
654 count, *next_hash));
655 return count;
656errout:
657 dx_release(frames);
658 return (err);
659}
660
661
662/*
663 * Directory block splitting, compacting
664 */
665
666static int dx_make_map (struct ext4_dir_entry_2 *de, int size,
667 struct dx_hash_info *hinfo, struct dx_map_entry *map_tail)
668{
669 int count = 0;
670 char *base = (char *) de;
671 struct dx_hash_info h = *hinfo;
672
673 while ((char *) de < base + size)
674 {
675 if (de->name_len && de->inode) {
676 ext4fs_dirhash(de->name, de->name_len, &h);
677 map_tail--;
678 map_tail->hash = h.hash;
679 map_tail->offs = (u32) ((char *) de - base);
680 count++;
681 cond_resched();
682 }
683 /* XXX: do we need to check rec_len == 0 case? -Chris */
684 de = (struct ext4_dir_entry_2 *) ((char *) de + le16_to_cpu(de->rec_len));
685 }
686 return count;
687}
688
689static void dx_sort_map (struct dx_map_entry *map, unsigned count)
690{
691 struct dx_map_entry *p, *q, *top = map + count - 1;
692 int more;
693 /* Combsort until bubble sort doesn't suck */
694 while (count > 2) {
695 count = count*10/13;
696 if (count - 9 < 2) /* 9, 10 -> 11 */
697 count = 11;
698 for (p = top, q = p - count; q >= map; p--, q--)
699 if (p->hash < q->hash)
700 swap(*p, *q);
701 }
702 /* Garden variety bubble sort */
703 do {
704 more = 0;
705 q = top;
706 while (q-- > map) {
707 if (q[1].hash >= q[0].hash)
708 continue;
709 swap(*(q+1), *q);
710 more = 1;
711 }
712 } while(more);
713}
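/*
 * Worked example of the comb sort above for count == 100: the gap
 * sequence is 76, 58, 44, 33, 25, 19, 14, then 11 (the unsigned
 * "9, 10 -> 11" fixup), then 8, 6, 4, 3, 2, after which the
 * bubble-sort pass handles the nearly-sorted remainder.
 */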
714
715static void dx_insert_block(struct dx_frame *frame, u32 hash, u32 block)
716{
717 struct dx_entry *entries = frame->entries;
718 struct dx_entry *old = frame->at, *new = old + 1;
719 int count = dx_get_count(entries);
720
721 assert(count < dx_get_limit(entries));
722 assert(old < entries + count);
723 memmove(new + 1, new, (char *)(entries + count) - (char *)(new));
724 dx_set_hash(new, hash);
725 dx_set_block(new, block);
726 dx_set_count(entries, count + 1);
727}
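/*
 * Worked example of the insert above: with count == 3 entries hashed
 * (0, h1, h3) and frame->at pointing at the h1 slot, inserting h2
 * memmoves the h3 slot up one position and writes (h2, block) into the
 * gap, leaving (0, h1, h2, h3) with count == 4.
 */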
728#endif
729
730
731static void ext4_update_dx_flag(struct inode *inode)
732{
733 if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
734 EXT4_FEATURE_COMPAT_DIR_INDEX))
735 EXT4_I(inode)->i_flags &= ~EXT4_INDEX_FL;
736}
737
738/*
739 * NOTE! unlike strncmp, ext4_match returns 1 for success, 0 for failure.
740 *
741 * `len <= EXT4_NAME_LEN' is guaranteed by caller.
742 * `de != NULL' is guaranteed by caller.
743 */
744static inline int ext4_match (int len, const char * const name,
745 struct ext4_dir_entry_2 * de)
746{
747 if (len != de->name_len)
748 return 0;
749 if (!de->inode)
750 return 0;
751 return !memcmp(name, de->name, len);
752}
753
754/*
755 * Returns 0 if not found, -1 on failure, and 1 on success
756 */
757static inline int search_dirblock(struct buffer_head * bh,
758 struct inode *dir,
759 struct dentry *dentry,
760 unsigned long offset,
761 struct ext4_dir_entry_2 ** res_dir)
762{
763 struct ext4_dir_entry_2 * de;
764 char * dlimit;
765 int de_len;
766 const char *name = dentry->d_name.name;
767 int namelen = dentry->d_name.len;
768
769 de = (struct ext4_dir_entry_2 *) bh->b_data;
770 dlimit = bh->b_data + dir->i_sb->s_blocksize;
771 while ((char *) de < dlimit) {
772 /* this code is executed quadratically often */
773 /* do minimal checking `by hand' */
774
775 if ((char *) de + namelen <= dlimit &&
776 ext4_match (namelen, name, de)) {
777 /* found a match - just to be sure, do a full check */
778 if (!ext4_check_dir_entry("ext4_find_entry",
779 dir, de, bh, offset))
780 return -1;
781 *res_dir = de;
782 return 1;
783 }
784 /* prevent looping on a bad block */
785 de_len = le16_to_cpu(de->rec_len);
786 if (de_len <= 0)
787 return -1;
788 offset += de_len;
789 de = (struct ext4_dir_entry_2 *) ((char *) de + de_len);
790 }
791 return 0;
792}
793
794
795/*
796 * ext4_find_entry()
797 *
798 * finds an entry in the specified directory with the wanted name. It
799 * returns the cache buffer in which the entry was found, and the entry
800 * itself (as a parameter - res_dir). It does NOT read the inode of the
801 * entry - you'll have to do that yourself if you want to.
802 *
803 * The returned buffer_head has ->b_count elevated. The caller is expected
804 * to brelse() it when appropriate.
805 */
806static struct buffer_head * ext4_find_entry (struct dentry *dentry,
807 struct ext4_dir_entry_2 ** res_dir)
808{
809 struct super_block * sb;
810 struct buffer_head * bh_use[NAMEI_RA_SIZE];
811 struct buffer_head * bh, *ret = NULL;
812 unsigned long start, block, b;
813 int ra_max = 0; /* Number of bh's in the readahead
814 buffer, bh_use[] */
815 int ra_ptr = 0; /* Current index into readahead
816 buffer */
817 int num = 0;
818 int nblocks, i, err;
819 struct inode *dir = dentry->d_parent->d_inode;
820 int namelen;
821 const u8 *name;
822 unsigned blocksize;
823
824 *res_dir = NULL;
825 sb = dir->i_sb;
826 blocksize = sb->s_blocksize;
827 namelen = dentry->d_name.len;
828 name = dentry->d_name.name;
829 if (namelen > EXT4_NAME_LEN)
830 return NULL;
831#ifdef CONFIG_EXT4_INDEX
832 if (is_dx(dir)) {
833 bh = ext4_dx_find_entry(dentry, res_dir, &err);
834 /*
835 * On success, or if the error was file not found,
836 * return. Otherwise, fall back to doing a search the
837 * old fashioned way.
838 */
839 if (bh || (err != ERR_BAD_DX_DIR))
840 return bh;
841 dxtrace(printk("ext4_find_entry: dx failed, falling back\n"));
842 }
843#endif
844 nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb);
845 start = EXT4_I(dir)->i_dir_start_lookup;
846 if (start >= nblocks)
847 start = 0;
848 block = start;
849restart:
850 do {
851 /*
852 * We deal with the read-ahead logic here.
853 */
854 if (ra_ptr >= ra_max) {
855 /* Refill the readahead buffer */
856 ra_ptr = 0;
857 b = block;
858 for (ra_max = 0; ra_max < NAMEI_RA_SIZE; ra_max++) {
859 /*
860 * Terminate if we reach the end of the
861 * directory and must wrap, or if our
862 * search has finished at this block.
863 */
864 if (b >= nblocks || (num && block == start)) {
865 bh_use[ra_max] = NULL;
866 break;
867 }
868 num++;
869 bh = ext4_getblk(NULL, dir, b++, 0, &err);
870 bh_use[ra_max] = bh;
871 if (bh)
872 ll_rw_block(READ_META, 1, &bh);
873 }
874 }
875 if ((bh = bh_use[ra_ptr++]) == NULL)
876 goto next;
877 wait_on_buffer(bh);
878 if (!buffer_uptodate(bh)) {
879 /* read error, skip block & hope for the best */
880 ext4_error(sb, __FUNCTION__, "reading directory #%lu "
881 "offset %lu", dir->i_ino, block);
882 brelse(bh);
883 goto next;
884 }
885 i = search_dirblock(bh, dir, dentry,
886 block << EXT4_BLOCK_SIZE_BITS(sb), res_dir);
887 if (i == 1) {
888 EXT4_I(dir)->i_dir_start_lookup = block;
889 ret = bh;
890 goto cleanup_and_exit;
891 } else {
892 brelse(bh);
893 if (i < 0)
894 goto cleanup_and_exit;
895 }
896 next:
897 if (++block >= nblocks)
898 block = 0;
899 } while (block != start);
900
901 /*
902 * If the directory has grown while we were searching, then
903 * search the last part of the directory before giving up.
904 */
905 block = nblocks;
906 nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb);
907 if (block < nblocks) {
908 start = 0;
909 goto restart;
910 }
911
912cleanup_and_exit:
913 /* Clean up the read-ahead blocks */
914 for (; ra_ptr < ra_max; ra_ptr++)
915 brelse (bh_use[ra_ptr]);
916 return ret;
917}
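/*
 * Readahead arithmetic for the scan above: NAMEI_RA_SIZE is
 * NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS == 8, so each refill of bh_use[]
 * issues up to eight single-block READ_META requests before the search
 * waits on the first of them.
 */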
918
919#ifdef CONFIG_EXT4_INDEX
920static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry,
921 struct ext4_dir_entry_2 **res_dir, int *err)
922{
923 struct super_block * sb;
924 struct dx_hash_info hinfo;
925 u32 hash;
926 struct dx_frame frames[2], *frame;
927 struct ext4_dir_entry_2 *de, *top;
928 struct buffer_head *bh;
929 unsigned long block;
930 int retval;
931 int namelen = dentry->d_name.len;
932 const u8 *name = dentry->d_name.name;
933 struct inode *dir = dentry->d_parent->d_inode;
934
935 sb = dir->i_sb;
936 /* NFS may look up ".." - look at dx_root directory block */
937 if (namelen > 2 || name[0] != '.'||(name[1] != '.' && name[1] != '\0')){
938 if (!(frame = dx_probe(dentry, NULL, &hinfo, frames, err)))
939 return NULL;
940 } else {
941 frame = frames;
942 frame->bh = NULL; /* for dx_release() */
943 frame->at = (struct dx_entry *)frames; /* hack for zero entry*/
944 dx_set_block(frame->at, 0); /* dx_root block is 0 */
945 }
946 hash = hinfo.hash;
947 do {
948 block = dx_get_block(frame->at);
949 if (!(bh = ext4_bread (NULL,dir, block, 0, err)))
950 goto errout;
951 de = (struct ext4_dir_entry_2 *) bh->b_data;
952 top = (struct ext4_dir_entry_2 *) ((char *) de + sb->s_blocksize -
953 EXT4_DIR_REC_LEN(0));
954 for (; de < top; de = ext4_next_entry(de))
955 if (ext4_match (namelen, name, de)) {
956 if (!ext4_check_dir_entry("ext4_find_entry",
957 dir, de, bh,
958 (block<<EXT4_BLOCK_SIZE_BITS(sb))
959 +((char *)de - bh->b_data))) {
960 brelse (bh);
961 goto errout;
962 }
963 *res_dir = de;
964 dx_release (frames);
965 return bh;
966 }
967 brelse (bh);
968 /* Check to see if we should continue to search */
969 retval = ext4_htree_next_block(dir, hash, frame,
970 frames, NULL);
971 if (retval < 0) {
972 ext4_warning(sb, __FUNCTION__,
973 "error reading index page in directory #%lu",
974 dir->i_ino);
975 *err = retval;
976 goto errout;
977 }
978 } while (retval == 1);
979
980 *err = -ENOENT;
981errout:
982 dxtrace(printk("%s not found\n", name));
983 dx_release (frames);
984 return NULL;
985}
986#endif
987
988static struct dentry *ext4_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd)
989{
990 struct inode * inode;
991 struct ext4_dir_entry_2 * de;
992 struct buffer_head * bh;
993
994 if (dentry->d_name.len > EXT4_NAME_LEN)
995 return ERR_PTR(-ENAMETOOLONG);
996
997 bh = ext4_find_entry(dentry, &de);
998 inode = NULL;
999 if (bh) {
1000 unsigned long ino = le32_to_cpu(de->inode);
1001 brelse (bh);
1002 if (!ext4_valid_inum(dir->i_sb, ino)) {
1003 ext4_error(dir->i_sb, "ext4_lookup",
1004 "bad inode number: %lu", ino);
1005 inode = NULL;
1006 } else
1007 inode = iget(dir->i_sb, ino);
1008
1009 if (!inode)
1010 return ERR_PTR(-EACCES);
1011 }
1012 return d_splice_alias(inode, dentry);
1013}
1014
1015
1016struct dentry *ext4_get_parent(struct dentry *child)
1017{
1018 unsigned long ino;
1019 struct dentry *parent;
1020 struct inode *inode;
1021 struct dentry dotdot;
1022 struct ext4_dir_entry_2 * de;
1023 struct buffer_head *bh;
1024
1025 dotdot.d_name.name = "..";
1026 dotdot.d_name.len = 2;
1027 dotdot.d_parent = child; /* confusing, isn't it! */
1028
1029 bh = ext4_find_entry(&dotdot, &de);
1030 inode = NULL;
1031 if (!bh)
1032 return ERR_PTR(-ENOENT);
1033 ino = le32_to_cpu(de->inode);
1034 brelse(bh);
1035
1036 if (!ext4_valid_inum(child->d_inode->i_sb, ino)) {
1037 ext4_error(child->d_inode->i_sb, "ext4_get_parent",
1038 "bad inode number: %lu", ino);
1039 inode = NULL;
1040 } else
1041 inode = iget(child->d_inode->i_sb, ino);
1042
1043 if (!inode)
1044 return ERR_PTR(-EACCES);
1045
1046 parent = d_alloc_anon(inode);
1047 if (!parent) {
1048 iput(inode);
1049 parent = ERR_PTR(-ENOMEM);
1050 }
1051 return parent;
1052}
1053
1054#define S_SHIFT 12
1055static unsigned char ext4_type_by_mode[S_IFMT >> S_SHIFT] = {
1056 [S_IFREG >> S_SHIFT] = EXT4_FT_REG_FILE,
1057 [S_IFDIR >> S_SHIFT] = EXT4_FT_DIR,
1058 [S_IFCHR >> S_SHIFT] = EXT4_FT_CHRDEV,
1059 [S_IFBLK >> S_SHIFT] = EXT4_FT_BLKDEV,
1060 [S_IFIFO >> S_SHIFT] = EXT4_FT_FIFO,
1061 [S_IFSOCK >> S_SHIFT] = EXT4_FT_SOCK,
1062 [S_IFLNK >> S_SHIFT] = EXT4_FT_SYMLINK,
1063};
1064
1065static inline void ext4_set_de_type(struct super_block *sb,
1066 struct ext4_dir_entry_2 *de,
1067 umode_t mode) {
1068 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE))
1069 de->file_type = ext4_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
1070}
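/*
 * Worked example of the mode-to-file-type mapping above: S_IFDIR is
 * 0040000, so S_IFDIR >> S_SHIFT == 4 and a directory picks up
 * file_type EXT4_FT_DIR; without the INCOMPAT_FILETYPE feature the
 * field is left alone (callers initialize it to EXT4_FT_UNKNOWN).
 */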
1071
1072#ifdef CONFIG_EXT4_INDEX
1073static struct ext4_dir_entry_2 *
1074dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count)
1075{
1076 unsigned rec_len = 0;
1077
1078 while (count--) {
1079 struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *) (from + map->offs);
1080 rec_len = EXT4_DIR_REC_LEN(de->name_len);
1081 memcpy (to, de, rec_len);
1082 ((struct ext4_dir_entry_2 *) to)->rec_len =
1083 cpu_to_le16(rec_len);
1084 de->inode = 0;
1085 map++;
1086 to += rec_len;
1087 }
1088 return (struct ext4_dir_entry_2 *) (to - rec_len);
1089}
1090
1091static struct ext4_dir_entry_2* dx_pack_dirents(char *base, int size)
1092{
1093 struct ext4_dir_entry_2 *next, *to, *prev, *de = (struct ext4_dir_entry_2 *) base;
1094 unsigned rec_len = 0;
1095
1096 prev = to = de;
1097 while ((char*)de < base + size) {
1098 next = (struct ext4_dir_entry_2 *) ((char *) de +
1099 le16_to_cpu(de->rec_len));
1100 if (de->inode && de->name_len) {
1101 rec_len = EXT4_DIR_REC_LEN(de->name_len);
1102 if (de > to)
1103 memmove(to, de, rec_len);
1104 to->rec_len = cpu_to_le16(rec_len);
1105 prev = to;
1106 to = (struct ext4_dir_entry_2 *) (((char *) to) + rec_len);
1107 }
1108 de = next;
1109 }
1110 return prev;
1111}
1112
1113static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
1114 struct buffer_head **bh,struct dx_frame *frame,
1115 struct dx_hash_info *hinfo, int *error)
1116{
1117 unsigned blocksize = dir->i_sb->s_blocksize;
1118 unsigned count, continued;
1119 struct buffer_head *bh2;
1120 u32 newblock;
1121 u32 hash2;
1122 struct dx_map_entry *map;
1123 char *data1 = (*bh)->b_data, *data2;
1124 unsigned split;
1125 struct ext4_dir_entry_2 *de = NULL, *de2;
1126 int err;
1127
1128 bh2 = ext4_append (handle, dir, &newblock, error);
1129 if (!(bh2)) {
1130 brelse(*bh);
1131 *bh = NULL;
1132 goto errout;
1133 }
1134
1135 BUFFER_TRACE(*bh, "get_write_access");
1136 err = ext4_journal_get_write_access(handle, *bh);
1137 if (err) {
1138 journal_error:
1139 brelse(*bh);
1140 brelse(bh2);
1141 *bh = NULL;
1142 ext4_std_error(dir->i_sb, err);
1143 goto errout;
1144 }
1145 BUFFER_TRACE(frame->bh, "get_write_access");
1146 err = ext4_journal_get_write_access(handle, frame->bh);
1147 if (err)
1148 goto journal_error;
1149
1150 data2 = bh2->b_data;
1151
1152 /* create map in the end of data2 block */
1153 map = (struct dx_map_entry *) (data2 + blocksize);
1154 count = dx_make_map ((struct ext4_dir_entry_2 *) data1,
1155 blocksize, hinfo, map);
1156 map -= count;
1157 split = count/2; // need to adjust to actual middle
1158 dx_sort_map (map, count);
1159 hash2 = map[split].hash;
1160 continued = hash2 == map[split - 1].hash;
1161 dxtrace(printk("Split block %i at %x, %i/%i\n",
1162 dx_get_block(frame->at), hash2, split, count-split));
1163
1164 /* Fancy dance to stay within two buffers */
1165 de2 = dx_move_dirents(data1, data2, map + split, count - split);
1166 de = dx_pack_dirents(data1,blocksize);
1167 de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de);
1168 de2->rec_len = cpu_to_le16(data2 + blocksize - (char *) de2);
1169 dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1));
1170 dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1));
1171
1172 /* Which block gets the new entry? */
1173 if (hinfo->hash >= hash2)
1174 {
1175 swap(*bh, bh2);
1176 de = de2;
1177 }
1178 dx_insert_block (frame, hash2 + continued, newblock);
1179 err = ext4_journal_dirty_metadata (handle, bh2);
1180 if (err)
1181 goto journal_error;
1182 err = ext4_journal_dirty_metadata (handle, frame->bh);
1183 if (err)
1184 goto journal_error;
1185 brelse (bh2);
1186 dxtrace(dx_show_index ("frame", frame->entries));
1187errout:
1188 return de;
1189}
1190#endif
1191
1192
1193/*
1194 * Add a new entry into a directory (leaf) block. If de is non-NULL,
1195 * it points to a directory entry which is guaranteed to be large
1196 * enough for the new directory entry. If de is NULL, then
1197 * add_dirent_to_buf will attempt to search the directory block for
1198 * space. It will return -ENOSPC if no space is available, -EIO if
1199 * the block is corrupted, and -EEXIST if the directory entry already exists.
1200 *
1201 * NOTE! bh is NOT released in the case where ENOSPC is returned. In
1202 * all other cases bh is released.
1203 */
1204static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1205 struct inode *inode, struct ext4_dir_entry_2 *de,
1206 struct buffer_head * bh)
1207{
1208 struct inode *dir = dentry->d_parent->d_inode;
1209 const char *name = dentry->d_name.name;
1210 int namelen = dentry->d_name.len;
1211 unsigned long offset = 0;
1212 unsigned short reclen;
1213 int nlen, rlen, err;
1214 char *top;
1215
1216 reclen = EXT4_DIR_REC_LEN(namelen);
1217 if (!de) {
1218 de = (struct ext4_dir_entry_2 *)bh->b_data;
1219 top = bh->b_data + dir->i_sb->s_blocksize - reclen;
1220 while ((char *) de <= top) {
1221 if (!ext4_check_dir_entry("ext4_add_entry", dir, de,
1222 bh, offset)) {
1223 brelse (bh);
1224 return -EIO;
1225 }
1226 if (ext4_match (namelen, name, de)) {
1227 brelse (bh);
1228 return -EEXIST;
1229 }
1230 nlen = EXT4_DIR_REC_LEN(de->name_len);
1231 rlen = le16_to_cpu(de->rec_len);
1232 if ((de->inode? rlen - nlen: rlen) >= reclen)
1233 break;
1234 de = (struct ext4_dir_entry_2 *)((char *)de + rlen);
1235 offset += rlen;
1236 }
1237 if ((char *) de > top)
1238 return -ENOSPC;
1239 }
1240 BUFFER_TRACE(bh, "get_write_access");
1241 err = ext4_journal_get_write_access(handle, bh);
1242 if (err) {
1243 ext4_std_error(dir->i_sb, err);
1244 brelse(bh);
1245 return err;
1246 }
1247
1248 /* By now the buffer is marked for journaling */
1249 nlen = EXT4_DIR_REC_LEN(de->name_len);
1250 rlen = le16_to_cpu(de->rec_len);
1251 if (de->inode) {
1252 struct ext4_dir_entry_2 *de1 = (struct ext4_dir_entry_2 *)((char *)de + nlen);
1253 de1->rec_len = cpu_to_le16(rlen - nlen);
1254 de->rec_len = cpu_to_le16(nlen);
1255 de = de1;
1256 }
1257 de->file_type = EXT4_FT_UNKNOWN;
1258 if (inode) {
1259 de->inode = cpu_to_le32(inode->i_ino);
1260 ext4_set_de_type(dir->i_sb, de, inode->i_mode);
1261 } else
1262 de->inode = 0;
1263 de->name_len = namelen;
1264 memcpy (de->name, name, namelen);
1265 /*
1266 * XXX shouldn't update any times until successful
1267 * completion of syscall, but too many callers depend
1268 * on this.
1269 *
1270 * XXX similarly, too many callers depend on
1271 * ext4_new_inode() setting the times, but error
1272 * recovery deletes the inode, so the worst that can
1273 * happen is that the times are slightly out of date
1274 * and/or different from the directory change time.
1275 */
1276 dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
1277 ext4_update_dx_flag(dir);
1278 dir->i_version++;
1279 ext4_mark_inode_dirty(handle, dir);
1280 BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
1281 err = ext4_journal_dirty_metadata(handle, bh);
1282 if (err)
1283 ext4_std_error(dir->i_sb, err);
1284 brelse(bh);
1285 return 0;
1286}
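/*
 * Worked example of the nlen/rlen split above: suppose the scan stops
 * at a live entry with name_len == 5 whose on-disk rec_len is 40.
 * nlen = EXT4_DIR_REC_LEN(5) == 16, leaving rlen - nlen == 24 bytes of
 * slack, enough for any new name of up to 16 characters
 * (EXT4_DIR_REC_LEN(16) == 24); the old entry is trimmed to rec_len 16
 * and the new entry claims the remaining 24 bytes.
 */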
1287
1288#ifdef CONFIG_EXT4_INDEX
1289/*
1290 * This converts a one block unindexed directory to a 3 block indexed
1291 * directory, and adds the dentry to the indexed directory.
1292 */
1293static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1294 struct inode *inode, struct buffer_head *bh)
1295{
1296 struct inode *dir = dentry->d_parent->d_inode;
1297 const char *name = dentry->d_name.name;
1298 int namelen = dentry->d_name.len;
1299 struct buffer_head *bh2;
1300 struct dx_root *root;
1301 struct dx_frame frames[2], *frame;
1302 struct dx_entry *entries;
1303 struct ext4_dir_entry_2 *de, *de2;
1304 char *data1, *top;
1305 unsigned len;
1306 int retval;
1307 unsigned blocksize;
1308 struct dx_hash_info hinfo;
1309 u32 block;
1310 struct fake_dirent *fde;
1311
1312 blocksize = dir->i_sb->s_blocksize;
1313 dxtrace(printk("Creating index\n"));
1314 retval = ext4_journal_get_write_access(handle, bh);
1315 if (retval) {
1316 ext4_std_error(dir->i_sb, retval);
1317 brelse(bh);
1318 return retval;
1319 }
1320 root = (struct dx_root *) bh->b_data;
1321
1322 bh2 = ext4_append (handle, dir, &block, &retval);
1323 if (!bh2) {
1324 brelse(bh);
1325 return retval;
1326 }
1327 EXT4_I(dir)->i_flags |= EXT4_INDEX_FL;
1328 data1 = bh2->b_data;
1329
1330 /* The 0th block becomes the root, move the dirents out */
1331 fde = &root->dotdot;
1332 de = (struct ext4_dir_entry_2 *)((char *)fde + le16_to_cpu(fde->rec_len));
1333 len = ((char *) root) + blocksize - (char *) de;
1334 memcpy (data1, de, len);
1335 de = (struct ext4_dir_entry_2 *) data1;
1336 top = data1 + len;
1337 while ((char *)(de2=(void*)de+le16_to_cpu(de->rec_len)) < top)
1338 de = de2;
1339 de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de);
1340 /* Initialize the root; the dot dirents already exist */
1341 de = (struct ext4_dir_entry_2 *) (&root->dotdot);
1342 de->rec_len = cpu_to_le16(blocksize - EXT4_DIR_REC_LEN(2));
1343 memset (&root->info, 0, sizeof(root->info));
1344 root->info.info_length = sizeof(root->info);
1345 root->info.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
1346 entries = root->entries;
1347 dx_set_block (entries, 1);
1348 dx_set_count (entries, 1);
1349 dx_set_limit (entries, dx_root_limit(dir, sizeof(root->info)));
1350
1351 /* Initialize as for dx_probe */
1352 hinfo.hash_version = root->info.hash_version;
1353 hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
1354 ext4fs_dirhash(name, namelen, &hinfo);
1355 frame = frames;
1356 frame->entries = entries;
1357 frame->at = entries;
1358 frame->bh = bh;
1359 bh = bh2;
1360 de = do_split(handle, dir, &bh, frame, &hinfo, &retval);
1361 dx_release (frames);
1362 if (!de)
1363 return retval;
1364
1365 return add_dirent_to_buf(handle, dentry, inode, de, bh);
1366}
1367#endif
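/*
 * Editor's note -- illustrative sketch, not part of the original patch.
 * After make_indexed_dir() the directory occupies three blocks: block 0
 * is the dx_root (the "." and ".." dirents, then dx_root_info and the
 * dx_entry array), while the old dirents live in blocks 1 and 2 after
 * do_split().  A hypothetical sanity check of the freshly built root:
 */
#if 0
static int example_root_ok(struct dx_root *root)
{
	return root->info.info_length == sizeof(root->info) &&
	       root->info.indirect_levels == 0 &&
	       dx_get_count(root->entries) >= 1 &&
	       dx_get_block(root->entries) == 1; /* leaves start in block 1 */
}
#endif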
1368
1369/*
1370 * ext4_add_entry()
1371 *
1372 * adds a file entry to the specified directory, using the same
1373 * semantics as ext4_find_entry(). It returns a negative error code if it failed.
1374 *
1375 * NOTE!! The inode part of 'de' is left at 0 - which means you
1376 * may not sleep between calling this and putting something into
1377 * the entry, as someone else might have used it while you slept.
1378 */
1379static int ext4_add_entry (handle_t *handle, struct dentry *dentry,
1380 struct inode *inode)
1381{
1382 struct inode *dir = dentry->d_parent->d_inode;
1383 unsigned long offset;
1384 struct buffer_head * bh;
1385 struct ext4_dir_entry_2 *de;
1386 struct super_block * sb;
1387 int retval;
1388#ifdef CONFIG_EXT4_INDEX
1389 int dx_fallback=0;
1390#endif
1391 unsigned blocksize;
1392 u32 block, blocks;
1393
1394 sb = dir->i_sb;
1395 blocksize = sb->s_blocksize;
1396 if (!dentry->d_name.len)
1397 return -EINVAL;
1398#ifdef CONFIG_EXT4_INDEX
1399 if (is_dx(dir)) {
1400 retval = ext4_dx_add_entry(handle, dentry, inode);
1401 if (!retval || (retval != ERR_BAD_DX_DIR))
1402 return retval;
1403 EXT4_I(dir)->i_flags &= ~EXT4_INDEX_FL;
1404 dx_fallback++;
1405 ext4_mark_inode_dirty(handle, dir);
1406 }
1407#endif
1408 blocks = dir->i_size >> sb->s_blocksize_bits;
1409 for (block = 0, offset = 0; block < blocks; block++) {
1410 bh = ext4_bread(handle, dir, block, 0, &retval);
1411 if(!bh)
1412 return retval;
1413 retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
1414 if (retval != -ENOSPC)
1415 return retval;
1416
1417#ifdef CONFIG_EXT4_INDEX
1418 if (blocks == 1 && !dx_fallback &&
1419 EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX))
1420 return make_indexed_dir(handle, dentry, inode, bh);
1421#endif
1422 brelse(bh);
1423 }
1424 bh = ext4_append(handle, dir, &block, &retval);
1425 if (!bh)
1426 return retval;
1427 de = (struct ext4_dir_entry_2 *) bh->b_data;
1428 de->inode = 0;
1429 de->rec_len = cpu_to_le16(blocksize);
1430 return add_dirent_to_buf(handle, dentry, inode, de, bh);
1431}
1432
1433#ifdef CONFIG_EXT4_INDEX
1434/*
1435 * Returns 0 for success, or a negative error value
1436 */
1437static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
1438 struct inode *inode)
1439{
1440 struct dx_frame frames[2], *frame;
1441 struct dx_entry *entries, *at;
1442 struct dx_hash_info hinfo;
1443 struct buffer_head * bh;
1444 struct inode *dir = dentry->d_parent->d_inode;
1445 struct super_block * sb = dir->i_sb;
1446 struct ext4_dir_entry_2 *de;
1447 int err;
1448
1449 frame = dx_probe(dentry, NULL, &hinfo, frames, &err);
1450 if (!frame)
1451 return err;
1452 entries = frame->entries;
1453 at = frame->at;
1454
1455 if (!(bh = ext4_bread(handle, dir, dx_get_block(frame->at), 0, &err)))
1456 goto cleanup;
1457
1458 BUFFER_TRACE(bh, "get_write_access");
1459 err = ext4_journal_get_write_access(handle, bh);
1460 if (err)
1461 goto journal_error;
1462
1463 err = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
1464 if (err != -ENOSPC) {
1465 bh = NULL;
1466 goto cleanup;
1467 }
1468
1469 /* Block full, should compress but for now just split */
1470 dxtrace(printk("using %u of %u node entries\n",
1471 dx_get_count(entries), dx_get_limit(entries)));
1472 /* Need to split index? */
1473 if (dx_get_count(entries) == dx_get_limit(entries)) {
1474 u32 newblock;
1475 unsigned icount = dx_get_count(entries);
1476 int levels = frame - frames;
1477 struct dx_entry *entries2;
1478 struct dx_node *node2;
1479 struct buffer_head *bh2;
1480
1481 if (levels && (dx_get_count(frames->entries) ==
1482 dx_get_limit(frames->entries))) {
1483 ext4_warning(sb, __FUNCTION__,
1484 "Directory index full!");
1485 err = -ENOSPC;
1486 goto cleanup;
1487 }
1488 bh2 = ext4_append (handle, dir, &newblock, &err);
1489 if (!bh2)
1490 goto cleanup;
1491 node2 = (struct dx_node *)(bh2->b_data);
1492 entries2 = node2->entries;
1493 node2->fake.rec_len = cpu_to_le16(sb->s_blocksize);
1494 node2->fake.inode = 0;
1495 BUFFER_TRACE(frame->bh, "get_write_access");
1496 err = ext4_journal_get_write_access(handle, frame->bh);
1497 if (err)
1498 goto journal_error;
1499 if (levels) {
1500 unsigned icount1 = icount/2, icount2 = icount - icount1;
1501 unsigned hash2 = dx_get_hash(entries + icount1);
1502 dxtrace(printk("Split index %i/%i\n", icount1, icount2));
1503
1504 BUFFER_TRACE(frames[0].bh, "get_write_access"); /* index root */
1505 err = ext4_journal_get_write_access(handle,
1506 frames[0].bh);
1507 if (err)
1508 goto journal_error;
1509
1510 memcpy ((char *) entries2, (char *) (entries + icount1),
1511 icount2 * sizeof(struct dx_entry));
1512 dx_set_count (entries, icount1);
1513 dx_set_count (entries2, icount2);
1514 dx_set_limit (entries2, dx_node_limit(dir));
1515
1516 /* Which index block gets the new entry? */
1517 if (at - entries >= icount1) {
1518 frame->at = at = at - entries - icount1 + entries2;
1519 frame->entries = entries = entries2;
1520 swap(frame->bh, bh2);
1521 }
1522 dx_insert_block (frames + 0, hash2, newblock);
1523 dxtrace(dx_show_index ("node", frames[1].entries));
1524 dxtrace(dx_show_index ("node",
1525 ((struct dx_node *) bh2->b_data)->entries));
1526 err = ext4_journal_dirty_metadata(handle, bh2);
1527 if (err)
1528 goto journal_error;
1529 brelse (bh2);
1530 } else {
1531 dxtrace(printk("Creating second level index...\n"));
1532 memcpy((char *) entries2, (char *) entries,
1533 icount * sizeof(struct dx_entry));
1534 dx_set_limit(entries2, dx_node_limit(dir));
1535
1536 /* Set up root */
1537 dx_set_count(entries, 1);
1538 dx_set_block(entries + 0, newblock);
1539 ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1;
1540
1541 /* Add new access path frame */
1542 frame = frames + 1;
1543 frame->at = at = at - entries + entries2;
1544 frame->entries = entries = entries2;
1545 frame->bh = bh2;
1546 err = ext4_journal_get_write_access(handle,
1547 frame->bh);
1548 if (err)
1549 goto journal_error;
1550 }
1551 ext4_journal_dirty_metadata(handle, frames[0].bh);
1552 }
1553 de = do_split(handle, dir, &bh, frame, &hinfo, &err);
1554 if (!de)
1555 goto cleanup;
1556 err = add_dirent_to_buf(handle, dentry, inode, de, bh);
1557 bh = NULL;
1558 goto cleanup;
1559
1560journal_error:
1561 ext4_std_error(dir->i_sb, err);
1562cleanup:
1563 if (bh)
1564 brelse(bh);
1565 dx_release(frames);
1566 return err;
1567}
1568#endif
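/*
 * Editor's note -- illustrative sketch, not part of the original patch.
 * The interior-node split above boils down to: move the upper half of
 * the dx_entry array into a fresh block and tell the parent about it
 * with one new (hash, block) pair.  Hypothetical helper:
 */
#if 0
static void example_split_node(struct dx_entry *entries, unsigned icount,
			       struct dx_entry *entries2, u32 newblock,
			       struct dx_frame *parent)
{
	unsigned icount1 = icount / 2, icount2 = icount - icount1;
	u32 hash2 = dx_get_hash(entries + icount1);

	memcpy(entries2, entries + icount1, icount2 * sizeof(struct dx_entry));
	dx_set_count(entries, icount1);
	dx_set_count(entries2, icount2);
	dx_insert_block(parent, hash2, newblock);
}
#endif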
1569
1570/*
1571 * ext4_delete_entry deletes a directory entry by merging it with the
1572 * previous entry
1573 */
1574static int ext4_delete_entry (handle_t *handle,
1575 struct inode * dir,
1576 struct ext4_dir_entry_2 * de_del,
1577 struct buffer_head * bh)
1578{
1579 struct ext4_dir_entry_2 * de, * pde;
1580 int i;
1581
1582 i = 0;
1583 pde = NULL;
1584 de = (struct ext4_dir_entry_2 *) bh->b_data;
1585 while (i < bh->b_size) {
1586 if (!ext4_check_dir_entry("ext4_delete_entry", dir, de, bh, i))
1587 return -EIO;
1588 if (de == de_del) {
1589 BUFFER_TRACE(bh, "get_write_access");
1590 ext4_journal_get_write_access(handle, bh);
1591 if (pde)
1592 pde->rec_len =
1593 cpu_to_le16(le16_to_cpu(pde->rec_len) +
1594 le16_to_cpu(de->rec_len));
1595 else
1596 de->inode = 0;
1597 dir->i_version++;
1598 BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
1599 ext4_journal_dirty_metadata(handle, bh);
1600 return 0;
1601 }
1602 i += le16_to_cpu(de->rec_len);
1603 pde = de;
1604 de = (struct ext4_dir_entry_2 *)
1605 ((char *) de + le16_to_cpu(de->rec_len));
1606 }
1607 return -ENOENT;
1608}
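/*
 * Editor's note -- worked example, not part of the original patch.
 * For a block holding A -> B -> C where B (rec_len 24) is deleted and
 * A's rec_len is 16, the merge above leaves:
 *
 *	before:	[A rec_len=16][B rec_len=24][C ...]
 *	after:	[A rec_len=40]              [C ...]
 *
 * B's bytes stay on disk but are now covered by A's rec_len, so the
 * slack test in add_dirent_to_buf() can hand them to a later insert.
 * If the victim is the first entry of the block there is no previous
 * entry to widen, so its inode field is zeroed instead.
 */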
1609
1610/*
1611 * ext4_mark_inode_dirty is somewhat expensive, so unlike ext2 we
1612 * do not perform it in these functions. We perform it at the call site,
1613 * if it is needed.
1614 */
1615static inline void ext4_inc_count(handle_t *handle, struct inode *inode)
1616{
1617 inc_nlink(inode);
1618}
1619
1620static inline void ext4_dec_count(handle_t *handle, struct inode *inode)
1621{
1622 drop_nlink(inode);
1623}
1624
1625static int ext4_add_nondir(handle_t *handle,
1626 struct dentry *dentry, struct inode *inode)
1627{
1628 int err = ext4_add_entry(handle, dentry, inode);
1629 if (!err) {
1630 ext4_mark_inode_dirty(handle, inode);
1631 d_instantiate(dentry, inode);
1632 return 0;
1633 }
1634 ext4_dec_count(handle, inode);
1635 iput(inode);
1636 return err;
1637}
1638
1639/*
1640 * By the time this is called, we already have created
1641 * the directory cache entry for the new file, but it
1642 * is so far negative - it has no inode.
1643 *
1644 * If the create succeeds, we fill in the inode information
1645 * with d_instantiate().
1646 */
1647static int ext4_create (struct inode * dir, struct dentry * dentry, int mode,
1648 struct nameidata *nd)
1649{
1650 handle_t *handle;
1651 struct inode * inode;
1652 int err, retries = 0;
1653
1654retry:
1655 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
1656 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
1657 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb));
1658 if (IS_ERR(handle))
1659 return PTR_ERR(handle);
1660
1661 if (IS_DIRSYNC(dir))
1662 handle->h_sync = 1;
1663
1664 inode = ext4_new_inode (handle, dir, mode);
1665 err = PTR_ERR(inode);
1666 if (!IS_ERR(inode)) {
1667 inode->i_op = &ext4_file_inode_operations;
1668 inode->i_fop = &ext4_file_operations;
1669 ext4_set_aops(inode);
1670 err = ext4_add_nondir(handle, dentry, inode);
1671 }
1672 ext4_journal_stop(handle);
1673 if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
1674 goto retry;
1675 return err;
1676}
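/*
 * Editor's note on the credit reservation above -- a reading aid, not
 * part of the original patch.  The journal_start budget is, roughly:
 * EXT4_DATA_TRANS_BLOCKS for one block of directory data with its
 * bitmap/descriptor updates, EXT4_INDEX_EXTRA_TRANS_BLOCKS for a
 * possible htree split of the parent, 3 more for the new inode, its
 * bitmap and its group descriptor, and 2*EXT4_QUOTA_INIT_BLOCKS for
 * initializing the user and group quota files.  mknod, mkdir and
 * symlink below reserve along the same lines.
 */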
1677
1678static int ext4_mknod (struct inode * dir, struct dentry *dentry,
1679 int mode, dev_t rdev)
1680{
1681 handle_t *handle;
1682 struct inode *inode;
1683 int err, retries = 0;
1684
1685 if (!new_valid_dev(rdev))
1686 return -EINVAL;
1687
1688retry:
1689 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
1690 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
1691 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb));
1692 if (IS_ERR(handle))
1693 return PTR_ERR(handle);
1694
1695 if (IS_DIRSYNC(dir))
1696 handle->h_sync = 1;
1697
1698 inode = ext4_new_inode (handle, dir, mode);
1699 err = PTR_ERR(inode);
1700 if (!IS_ERR(inode)) {
1701 init_special_inode(inode, inode->i_mode, rdev);
1702#ifdef CONFIG_EXT4DEV_FS_XATTR
1703 inode->i_op = &ext4_special_inode_operations;
1704#endif
1705 err = ext4_add_nondir(handle, dentry, inode);
1706 }
1707 ext4_journal_stop(handle);
1708 if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
1709 goto retry;
1710 return err;
1711}
1712
1713static int ext4_mkdir(struct inode * dir, struct dentry * dentry, int mode)
1714{
1715 handle_t *handle;
1716 struct inode * inode;
1717 struct buffer_head * dir_block;
1718 struct ext4_dir_entry_2 * de;
1719 int err, retries = 0;
1720
1721 if (dir->i_nlink >= EXT4_LINK_MAX)
1722 return -EMLINK;
1723
1724retry:
1725 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
1726 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
1727 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb));
1728 if (IS_ERR(handle))
1729 return PTR_ERR(handle);
1730
1731 if (IS_DIRSYNC(dir))
1732 handle->h_sync = 1;
1733
1734 inode = ext4_new_inode (handle, dir, S_IFDIR | mode);
1735 err = PTR_ERR(inode);
1736 if (IS_ERR(inode))
1737 goto out_stop;
1738
1739 inode->i_op = &ext4_dir_inode_operations;
1740 inode->i_fop = &ext4_dir_operations;
1741 inode->i_size = EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize;
1742 dir_block = ext4_bread (handle, inode, 0, 1, &err);
1743 if (!dir_block) {
1744 drop_nlink(inode); /* is this nlink == 0? */
1745 ext4_mark_inode_dirty(handle, inode);
1746 iput (inode);
1747 goto out_stop;
1748 }
1749 BUFFER_TRACE(dir_block, "get_write_access");
1750 ext4_journal_get_write_access(handle, dir_block);
1751 de = (struct ext4_dir_entry_2 *) dir_block->b_data;
1752 de->inode = cpu_to_le32(inode->i_ino);
1753 de->name_len = 1;
1754 de->rec_len = cpu_to_le16(EXT4_DIR_REC_LEN(de->name_len));
1755 strcpy (de->name, ".");
1756 ext4_set_de_type(dir->i_sb, de, S_IFDIR);
1757 de = (struct ext4_dir_entry_2 *)
1758 ((char *) de + le16_to_cpu(de->rec_len));
1759 de->inode = cpu_to_le32(dir->i_ino);
1760 de->rec_len = cpu_to_le16(inode->i_sb->s_blocksize-EXT4_DIR_REC_LEN(1));
1761 de->name_len = 2;
1762 strcpy (de->name, "..");
1763 ext4_set_de_type(dir->i_sb, de, S_IFDIR);
1764 inode->i_nlink = 2;
1765 BUFFER_TRACE(dir_block, "call ext4_journal_dirty_metadata");
1766 ext4_journal_dirty_metadata(handle, dir_block);
1767 brelse (dir_block);
1768 ext4_mark_inode_dirty(handle, inode);
1769 err = ext4_add_entry (handle, dentry, inode);
1770 if (err) {
1771 inode->i_nlink = 0;
1772 ext4_mark_inode_dirty(handle, inode);
1773 iput (inode);
1774 goto out_stop;
1775 }
1776 inc_nlink(dir);
1777 ext4_update_dx_flag(dir);
1778 ext4_mark_inode_dirty(handle, dir);
1779 d_instantiate(dentry, inode);
1780out_stop:
1781 ext4_journal_stop(handle);
1782 if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
1783 goto retry;
1784 return err;
1785}
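/*
 * Editor's note -- illustrative layout, not part of the original patch.
 * The block built above holds exactly two entries:
 *
 *	[ "."  rec_len = EXT4_DIR_REC_LEN(1)              inode = new dir ]
 *	[ ".." rec_len = blocksize - EXT4_DIR_REC_LEN(1)  inode = parent  ]
 *
 * ".."'s rec_len is padded out to the end of the block, so later
 * entries are placed in its slack by add_dirent_to_buf().
 */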
1786
1787/*
1788 * routine to check that the specified directory is empty (for rmdir)
1789 */
1790static int empty_dir (struct inode * inode)
1791{
1792 unsigned long offset;
1793 struct buffer_head * bh;
1794 struct ext4_dir_entry_2 * de, * de1;
1795 struct super_block * sb;
1796 int err = 0;
1797
1798 sb = inode->i_sb;
1799 if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) ||
1800 !(bh = ext4_bread (NULL, inode, 0, 0, &err))) {
1801 if (err)
1802 ext4_error(inode->i_sb, __FUNCTION__,
1803 "error %d reading directory #%lu offset 0",
1804 err, inode->i_ino);
1805 else
1806 ext4_warning(inode->i_sb, __FUNCTION__,
1807 "bad directory (dir #%lu) - no data block",
1808 inode->i_ino);
1809 return 1;
1810 }
1811 de = (struct ext4_dir_entry_2 *) bh->b_data;
1812 de1 = (struct ext4_dir_entry_2 *)
1813 ((char *) de + le16_to_cpu(de->rec_len));
1814 if (le32_to_cpu(de->inode) != inode->i_ino ||
1815 !le32_to_cpu(de1->inode) ||
1816 strcmp (".", de->name) ||
1817 strcmp ("..", de1->name)) {
1818 ext4_warning (inode->i_sb, "empty_dir",
1819 "bad directory (dir #%lu) - no `.' or `..'",
1820 inode->i_ino);
1821 brelse (bh);
1822 return 1;
1823 }
1824 offset = le16_to_cpu(de->rec_len) + le16_to_cpu(de1->rec_len);
1825 de = (struct ext4_dir_entry_2 *)
1826 ((char *) de1 + le16_to_cpu(de1->rec_len));
1827 while (offset < inode->i_size) {
1828 if (!bh ||
1829 (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) {
1830 err = 0;
1831 brelse (bh);
1832 bh = ext4_bread (NULL, inode,
1833 offset >> EXT4_BLOCK_SIZE_BITS(sb), 0, &err);
1834 if (!bh) {
1835 if (err)
1836 ext4_error(sb, __FUNCTION__,
1837 "error %d reading directory"
1838 " #%lu offset %lu",
1839 err, inode->i_ino, offset);
1840 offset += sb->s_blocksize;
1841 continue;
1842 }
1843 de = (struct ext4_dir_entry_2 *) bh->b_data;
1844 }
1845 if (!ext4_check_dir_entry("empty_dir", inode, de, bh, offset)) {
1846 de = (struct ext4_dir_entry_2 *)(bh->b_data +
1847 sb->s_blocksize);
1848 offset = (offset | (sb->s_blocksize - 1)) + 1;
1849 continue;
1850 }
1851 if (le32_to_cpu(de->inode)) {
1852 brelse (bh);
1853 return 0;
1854 }
1855 offset += le16_to_cpu(de->rec_len);
1856 de = (struct ext4_dir_entry_2 *)
1857 ((char *) de + le16_to_cpu(de->rec_len));
1858 }
1859 brelse (bh);
1860 return 1;
1861}
1862
1863/* ext4_orphan_add() links an unlinked or truncated inode into a list of
1864 * such inodes, starting at the superblock, in case we crash before the
1865 * file is closed/deleted, or in case the inode truncate spans multiple
1866 * transactions and the last transaction is not recovered after a crash.
1867 *
1868 * At filesystem recovery time, we walk this list deleting unlinked
1869 * inodes and truncating linked inodes in ext4_orphan_cleanup().
1870 */
1871int ext4_orphan_add(handle_t *handle, struct inode *inode)
1872{
1873 struct super_block *sb = inode->i_sb;
1874 struct ext4_iloc iloc;
1875 int err = 0, rc;
1876
1877 lock_super(sb);
1878 if (!list_empty(&EXT4_I(inode)->i_orphan))
1879 goto out_unlock;
1880
1881 /* Orphan handling is only valid for files with data blocks
1882 * being truncated, or files being unlinked. */
1883
1884 /* @@@ FIXME: Observation from aviro:
1885 * I think I can trigger J_ASSERT in ext4_orphan_add(). We block
1886 * here (on lock_super()), so we can race with ext4_link(), which
1887 * might bump ->i_nlink. Take, say, a character device: not a regular
1888 * file, not a directory, not a symlink, and ->i_nlink > 0.
1889 */
1890 J_ASSERT ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
1891 S_ISLNK(inode->i_mode)) || inode->i_nlink == 0);
1892
1893 BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access");
1894 err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
1895 if (err)
1896 goto out_unlock;
1897
1898 err = ext4_reserve_inode_write(handle, inode, &iloc);
1899 if (err)
1900 goto out_unlock;
1901
1902 /* Insert this inode at the head of the on-disk orphan list... */
1903 NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan);
1904 EXT4_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
1905 err = ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh);
1906 rc = ext4_mark_iloc_dirty(handle, inode, &iloc);
1907 if (!err)
1908 err = rc;
1909
1910 /* Only add to the head of the in-memory list if all the
1911 * previous operations succeeded. If the orphan_add is going to
1912 * fail (possibly taking the journal offline), we can't risk
1913 * leaving the inode on the orphan list: stray orphan-list
1914 * entries can cause panics at unmount time.
1915 *
1916 * This is safe: on error we're going to ignore the orphan list
1917 * anyway on the next recovery. */
1918 if (!err)
1919 list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
1920
1921 jbd_debug(4, "superblock will point to %lu\n", inode->i_ino);
1922 jbd_debug(4, "orphan inode %lu will point to %d\n",
1923 inode->i_ino, NEXT_ORPHAN(inode));
1924out_unlock:
1925 unlock_super(sb);
1926 ext4_std_error(inode->i_sb, err);
1927 return err;
1928}
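/*
 * Editor's note -- illustrative sketch, not part of the original patch.
 * The on-disk orphan list built above is a singly linked list: the
 * superblock's s_last_orphan holds the head inode number and each
 * orphan's NEXT_ORPHAN() field holds its successor (0 terminates).
 * Recovery (ext4_orphan_cleanup()) walks it roughly like this
 * hypothetical sketch, using the 2.6.19-era iget():
 */
#if 0
static void example_walk_orphans(struct super_block *sb)
{
	unsigned long ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan);

	while (ino) {
		struct inode *inode = iget(sb, ino);

		if (!inode || is_bad_inode(inode))
			break;
		ino = NEXT_ORPHAN(inode);	/* 0 ends the list */
		/* ... delete or finish truncating the inode here ... */
		iput(inode);
	}
}
#endif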
1929
1930/*
1931 * ext4_orphan_del() removes an unlinked or truncated inode from the list
1932 * of such inodes stored on disk, because it is finally being cleaned up.
1933 */
1934int ext4_orphan_del(handle_t *handle, struct inode *inode)
1935{
1936 struct list_head *prev;
1937 struct ext4_inode_info *ei = EXT4_I(inode);
1938 struct ext4_sb_info *sbi;
1939 unsigned long ino_next;
1940 struct ext4_iloc iloc;
1941 int err = 0;
1942
1943 lock_super(inode->i_sb);
1944 if (list_empty(&ei->i_orphan)) {
1945 unlock_super(inode->i_sb);
1946 return 0;
1947 }
1948
1949 ino_next = NEXT_ORPHAN(inode);
1950 prev = ei->i_orphan.prev;
1951 sbi = EXT4_SB(inode->i_sb);
1952
1953 jbd_debug(4, "remove inode %lu from orphan list\n", inode->i_ino);
1954
1955 list_del_init(&ei->i_orphan);
1956
1957 /* If we're on an error path, we may not have a valid
1958 * transaction handle with which to update the orphan list on
1959 * disk, but we still need to remove the inode from the linked
1960 * list in memory. */
1961 if (!handle)
1962 goto out;
1963
1964 err = ext4_reserve_inode_write(handle, inode, &iloc);
1965 if (err)
1966 goto out_err;
1967
1968 if (prev == &sbi->s_orphan) {
1969 jbd_debug(4, "superblock will point to %lu\n", ino_next);
1970 BUFFER_TRACE(sbi->s_sbh, "get_write_access");
1971 err = ext4_journal_get_write_access(handle, sbi->s_sbh);
1972 if (err)
1973 goto out_brelse;
1974 sbi->s_es->s_last_orphan = cpu_to_le32(ino_next);
1975 err = ext4_journal_dirty_metadata(handle, sbi->s_sbh);
1976 } else {
1977 struct ext4_iloc iloc2;
1978 struct inode *i_prev =
1979 &list_entry(prev, struct ext4_inode_info, i_orphan)->vfs_inode;
1980
1981 jbd_debug(4, "orphan inode %lu will point to %lu\n",
1982 i_prev->i_ino, ino_next);
1983 err = ext4_reserve_inode_write(handle, i_prev, &iloc2);
1984 if (err)
1985 goto out_brelse;
1986 NEXT_ORPHAN(i_prev) = ino_next;
1987 err = ext4_mark_iloc_dirty(handle, i_prev, &iloc2);
1988 }
1989 if (err)
1990 goto out_brelse;
1991 NEXT_ORPHAN(inode) = 0;
1992 err = ext4_mark_iloc_dirty(handle, inode, &iloc);
1993
1994out_err:
1995 ext4_std_error(inode->i_sb, err);
1996out:
1997 unlock_super(inode->i_sb);
1998 return err;
1999
2000out_brelse:
2001 brelse(iloc.bh);
2002 goto out_err;
2003}
2004
2005static int ext4_rmdir (struct inode * dir, struct dentry *dentry)
2006{
2007 int retval;
2008 struct inode * inode;
2009 struct buffer_head * bh;
2010 struct ext4_dir_entry_2 * de;
2011 handle_t *handle;
2012
2013 /* Initialize quotas before so that eventual writes go in
2014 * separate transaction */
2015 DQUOT_INIT(dentry->d_inode);
2016 handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb));
2017 if (IS_ERR(handle))
2018 return PTR_ERR(handle);
2019
2020 retval = -ENOENT;
2021 bh = ext4_find_entry (dentry, &de);
2022 if (!bh)
2023 goto end_rmdir;
2024
2025 if (IS_DIRSYNC(dir))
2026 handle->h_sync = 1;
2027
2028 inode = dentry->d_inode;
2029
2030 retval = -EIO;
2031 if (le32_to_cpu(de->inode) != inode->i_ino)
2032 goto end_rmdir;
2033
2034 retval = -ENOTEMPTY;
2035 if (!empty_dir (inode))
2036 goto end_rmdir;
2037
2038 retval = ext4_delete_entry(handle, dir, de, bh);
2039 if (retval)
2040 goto end_rmdir;
2041 if (inode->i_nlink != 2)
2042 ext4_warning (inode->i_sb, "ext4_rmdir",
2043 "empty directory has nlink!=2 (%d)",
2044 inode->i_nlink);
2045 inode->i_version++;
2046 clear_nlink(inode);
2047 /* There's no need to set i_disksize: the fact that i_nlink is
2048 * zero will ensure that the right thing happens during any
2049 * recovery. */
2050 inode->i_size = 0;
2051 ext4_orphan_add(handle, inode);
2052 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
2053 ext4_mark_inode_dirty(handle, inode);
2054 drop_nlink(dir);
2055 ext4_update_dx_flag(dir);
2056 ext4_mark_inode_dirty(handle, dir);
2057
2058end_rmdir:
2059 ext4_journal_stop(handle);
2060 brelse (bh);
2061 return retval;
2062}
2063
2064static int ext4_unlink(struct inode * dir, struct dentry *dentry)
2065{
2066 int retval;
2067 struct inode * inode;
2068 struct buffer_head * bh;
2069 struct ext4_dir_entry_2 * de;
2070 handle_t *handle;
2071
2072 /* Initialize quotas before so that eventual writes go
2073 * in separate transaction */
2074 DQUOT_INIT(dentry->d_inode);
2075 handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb));
2076 if (IS_ERR(handle))
2077 return PTR_ERR(handle);
2078
2079 if (IS_DIRSYNC(dir))
2080 handle->h_sync = 1;
2081
2082 retval = -ENOENT;
2083 bh = ext4_find_entry (dentry, &de);
2084 if (!bh)
2085 goto end_unlink;
2086
2087 inode = dentry->d_inode;
2088
2089 retval = -EIO;
2090 if (le32_to_cpu(de->inode) != inode->i_ino)
2091 goto end_unlink;
2092
2093 if (!inode->i_nlink) {
2094 ext4_warning (inode->i_sb, "ext4_unlink",
2095 "Deleting nonexistent file (%lu), %d",
2096 inode->i_ino, inode->i_nlink);
2097 inode->i_nlink = 1;
2098 }
2099 retval = ext4_delete_entry(handle, dir, de, bh);
2100 if (retval)
2101 goto end_unlink;
2102 dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
2103 ext4_update_dx_flag(dir);
2104 ext4_mark_inode_dirty(handle, dir);
2105 drop_nlink(inode);
2106 if (!inode->i_nlink)
2107 ext4_orphan_add(handle, inode);
2108 inode->i_ctime = dir->i_ctime;
2109 ext4_mark_inode_dirty(handle, inode);
2110 retval = 0;
2111
2112end_unlink:
2113 ext4_journal_stop(handle);
2114 brelse (bh);
2115 return retval;
2116}
2117
2118static int ext4_symlink (struct inode * dir,
2119 struct dentry *dentry, const char * symname)
2120{
2121 handle_t *handle;
2122 struct inode * inode;
2123 int l, err, retries = 0;
2124
2125 l = strlen(symname)+1;
2126 if (l > dir->i_sb->s_blocksize)
2127 return -ENAMETOOLONG;
2128
2129retry:
2130 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
2131 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 5 +
2132 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb));
2133 if (IS_ERR(handle))
2134 return PTR_ERR(handle);
2135
2136 if (IS_DIRSYNC(dir))
2137 handle->h_sync = 1;
2138
2139 inode = ext4_new_inode (handle, dir, S_IFLNK|S_IRWXUGO);
2140 err = PTR_ERR(inode);
2141 if (IS_ERR(inode))
2142 goto out_stop;
2143
2144 if (l > sizeof (EXT4_I(inode)->i_data)) {
2145 inode->i_op = &ext4_symlink_inode_operations;
2146 ext4_set_aops(inode);
2147 /*
2148 * page_symlink() calls into ext4_prepare/commit_write.
2149 * We have a transaction open. All is sweetness. It also sets
2150 * i_size in generic_commit_write().
2151 */
2152 err = __page_symlink(inode, symname, l,
2153 mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
2154 if (err) {
2155 ext4_dec_count(handle, inode);
2156 ext4_mark_inode_dirty(handle, inode);
2157 iput (inode);
2158 goto out_stop;
2159 }
2160 } else {
2161 inode->i_op = &ext4_fast_symlink_inode_operations;
2162 memcpy((char*)&EXT4_I(inode)->i_data,symname,l);
2163 inode->i_size = l-1;
2164 }
2165 EXT4_I(inode)->i_disksize = inode->i_size;
2166 err = ext4_add_nondir(handle, dentry, inode);
2167out_stop:
2168 ext4_journal_stop(handle);
2169 if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
2170 goto retry;
2171 return err;
2172}
2173
2174static int ext4_link (struct dentry * old_dentry,
2175 struct inode * dir, struct dentry *dentry)
2176{
2177 handle_t *handle;
2178 struct inode *inode = old_dentry->d_inode;
2179 int err, retries = 0;
2180
2181 if (inode->i_nlink >= EXT4_LINK_MAX)
2182 return -EMLINK;
2183
2184retry:
2185 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
2186 EXT4_INDEX_EXTRA_TRANS_BLOCKS);
2187 if (IS_ERR(handle))
2188 return PTR_ERR(handle);
2189
2190 if (IS_DIRSYNC(dir))
2191 handle->h_sync = 1;
2192
2193 inode->i_ctime = CURRENT_TIME_SEC;
2194 ext4_inc_count(handle, inode);
2195 atomic_inc(&inode->i_count);
2196
2197 err = ext4_add_nondir(handle, dentry, inode);
2198 ext4_journal_stop(handle);
2199 if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
2200 goto retry;
2201 return err;
2202}
2203
2204#define PARENT_INO(buffer) \
2205 ((struct ext4_dir_entry_2 *) ((char *) buffer + \
2206 le16_to_cpu(((struct ext4_dir_entry_2 *) buffer)->rec_len)))->inode
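/*
 * Editor's note (not part of the original patch): block 0 of every
 * directory starts with the "." entry followed by "..", so PARENT_INO()
 * simply steps over "." by its rec_len and reads the inode field of
 * "..".  ext4_rename() below both checks it (is the source really in
 * old_dir?) and rewrites it when a directory moves to a new parent.
 */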
2207
2208/*
2209 * Anybody can rename anything with this: the permission checks are left to the
2210 * higher-level routines.
2211 */
2212static int ext4_rename (struct inode * old_dir, struct dentry *old_dentry,
2213 struct inode * new_dir,struct dentry *new_dentry)
2214{
2215 handle_t *handle;
2216 struct inode * old_inode, * new_inode;
2217 struct buffer_head * old_bh, * new_bh, * dir_bh;
2218 struct ext4_dir_entry_2 * old_de, * new_de;
2219 int retval;
2220
2221 old_bh = new_bh = dir_bh = NULL;
2222
2223 /* Initialize quotas before so that eventual writes go
2224 * in separate transaction */
2225 if (new_dentry->d_inode)
2226 DQUOT_INIT(new_dentry->d_inode);
2227 handle = ext4_journal_start(old_dir, 2 *
2228 EXT4_DATA_TRANS_BLOCKS(old_dir->i_sb) +
2229 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2);
2230 if (IS_ERR(handle))
2231 return PTR_ERR(handle);
2232
2233 if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir))
2234 handle->h_sync = 1;
2235
2236 old_bh = ext4_find_entry (old_dentry, &old_de);
2237 /*
2238 * The check on the inode number is _not_ there because of possible IO errors.
2239 * We might rmdir the source, keep it as pwd of some process
2240 * and merrily kill the link to whatever was created under the
2241 * same name. Goodbye sticky bit ;-<
2242 */
2243 old_inode = old_dentry->d_inode;
2244 retval = -ENOENT;
2245 if (!old_bh || le32_to_cpu(old_de->inode) != old_inode->i_ino)
2246 goto end_rename;
2247
2248 new_inode = new_dentry->d_inode;
2249 new_bh = ext4_find_entry (new_dentry, &new_de);
2250 if (new_bh) {
2251 if (!new_inode) {
2252 brelse (new_bh);
2253 new_bh = NULL;
2254 }
2255 }
2256 if (S_ISDIR(old_inode->i_mode)) {
2257 if (new_inode) {
2258 retval = -ENOTEMPTY;
2259 if (!empty_dir (new_inode))
2260 goto end_rename;
2261 }
2262 retval = -EIO;
2263 dir_bh = ext4_bread (handle, old_inode, 0, 0, &retval);
2264 if (!dir_bh)
2265 goto end_rename;
2266 if (le32_to_cpu(PARENT_INO(dir_bh->b_data)) != old_dir->i_ino)
2267 goto end_rename;
2268 retval = -EMLINK;
2269 if (!new_inode && new_dir!=old_dir &&
2270 new_dir->i_nlink >= EXT4_LINK_MAX)
2271 goto end_rename;
2272 }
2273 if (!new_bh) {
2274 retval = ext4_add_entry (handle, new_dentry, old_inode);
2275 if (retval)
2276 goto end_rename;
2277 } else {
2278 BUFFER_TRACE(new_bh, "get write access");
2279 ext4_journal_get_write_access(handle, new_bh);
2280 new_de->inode = cpu_to_le32(old_inode->i_ino);
2281 if (EXT4_HAS_INCOMPAT_FEATURE(new_dir->i_sb,
2282 EXT4_FEATURE_INCOMPAT_FILETYPE))
2283 new_de->file_type = old_de->file_type;
2284 new_dir->i_version++;
2285 BUFFER_TRACE(new_bh, "call ext4_journal_dirty_metadata");
2286 ext4_journal_dirty_metadata(handle, new_bh);
2287 brelse(new_bh);
2288 new_bh = NULL;
2289 }
2290
2291 /*
2292 * Like most other Unix systems, set the ctime for inodes on a
2293 * rename.
2294 */
2295 old_inode->i_ctime = CURRENT_TIME_SEC;
2296 ext4_mark_inode_dirty(handle, old_inode);
2297
2298 /*
2299 * ok, that's it
2300 */
2301 if (le32_to_cpu(old_de->inode) != old_inode->i_ino ||
2302 old_de->name_len != old_dentry->d_name.len ||
2303 strncmp(old_de->name, old_dentry->d_name.name, old_de->name_len) ||
2304 (retval = ext4_delete_entry(handle, old_dir,
2305 old_de, old_bh)) == -ENOENT) {
2306 /* old_de could have moved from under us during htree split, so
2307 * make sure that we are deleting the right entry. We might
2308 * also be pointing to a stale entry in the unused part of
2309 * old_bh so just checking inum and the name isn't enough. */
2310 struct buffer_head *old_bh2;
2311 struct ext4_dir_entry_2 *old_de2;
2312
2313 old_bh2 = ext4_find_entry(old_dentry, &old_de2);
2314 if (old_bh2) {
2315 retval = ext4_delete_entry(handle, old_dir,
2316 old_de2, old_bh2);
2317 brelse(old_bh2);
2318 }
2319 }
2320 if (retval) {
2321 ext4_warning(old_dir->i_sb, "ext4_rename",
2322 "Deleting old file (%lu), %d, error=%d",
2323 old_dir->i_ino, old_dir->i_nlink, retval);
2324 }
2325
2326 if (new_inode) {
2327 drop_nlink(new_inode);
2328 new_inode->i_ctime = CURRENT_TIME_SEC;
2329 }
2330 old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME_SEC;
2331 ext4_update_dx_flag(old_dir);
2332 if (dir_bh) {
2333 BUFFER_TRACE(dir_bh, "get_write_access");
2334 ext4_journal_get_write_access(handle, dir_bh);
2335 PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino);
2336 BUFFER_TRACE(dir_bh, "call ext4_journal_dirty_metadata");
2337 ext4_journal_dirty_metadata(handle, dir_bh);
2338 drop_nlink(old_dir);
2339 if (new_inode) {
2340 drop_nlink(new_inode);
2341 } else {
2342 inc_nlink(new_dir);
2343 ext4_update_dx_flag(new_dir);
2344 ext4_mark_inode_dirty(handle, new_dir);
2345 }
2346 }
2347 ext4_mark_inode_dirty(handle, old_dir);
2348 if (new_inode) {
2349 ext4_mark_inode_dirty(handle, new_inode);
2350 if (!new_inode->i_nlink)
2351 ext4_orphan_add(handle, new_inode);
2352 }
2353 retval = 0;
2354
2355end_rename:
2356 brelse (dir_bh);
2357 brelse (old_bh);
2358 brelse (new_bh);
2359 ext4_journal_stop(handle);
2360 return retval;
2361}
2362
2363/*
2364 * directories can handle most operations...
2365 */
2366struct inode_operations ext4_dir_inode_operations = {
2367 .create = ext4_create,
2368 .lookup = ext4_lookup,
2369 .link = ext4_link,
2370 .unlink = ext4_unlink,
2371 .symlink = ext4_symlink,
2372 .mkdir = ext4_mkdir,
2373 .rmdir = ext4_rmdir,
2374 .mknod = ext4_mknod,
2375 .rename = ext4_rename,
2376 .setattr = ext4_setattr,
2377#ifdef CONFIG_EXT4DEV_FS_XATTR
2378 .setxattr = generic_setxattr,
2379 .getxattr = generic_getxattr,
2380 .listxattr = ext4_listxattr,
2381 .removexattr = generic_removexattr,
2382#endif
2383 .permission = ext4_permission,
2384};
2385
2386struct inode_operations ext4_special_inode_operations = {
2387 .setattr = ext4_setattr,
2388#ifdef CONFIG_EXT4DEV_FS_XATTR
2389 .setxattr = generic_setxattr,
2390 .getxattr = generic_getxattr,
2391 .listxattr = ext4_listxattr,
2392 .removexattr = generic_removexattr,
2393#endif
2394 .permission = ext4_permission,
2395};
diff --git a/fs/ext4/namei.h b/fs/ext4/namei.h
new file mode 100644
index 000000000000..5e4dfff36a00
--- /dev/null
+++ b/fs/ext4/namei.h
@@ -0,0 +1,8 @@
1/* linux/fs/ext4/namei.h
2 *
3 * Copyright (C) 2005 Simtec Electronics
4 * Ben Dooks <ben@simtec.co.uk>
5 *
6*/
7
8extern struct dentry *ext4_get_parent(struct dentry *child);
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
new file mode 100644
index 000000000000..1e9578052cd3
--- /dev/null
+++ b/fs/ext4/resize.c
@@ -0,0 +1,1045 @@
1/*
2 * linux/fs/ext4/resize.c
3 *
4 * Support for resizing an ext4 filesystem while it is mounted.
5 *
6 * Copyright (C) 2001, 2002 Andreas Dilger <adilger@clusterfs.com>
7 *
8 * This could probably be made into a module, because it is not often used.
9 */
10
11
12#define EXT4FS_DEBUG
13
14#include <linux/sched.h>
15#include <linux/smp_lock.h>
16#include <linux/ext4_jbd2.h>
17
18#include <linux/errno.h>
19#include <linux/slab.h>
20
21
22#define outside(b, first, last) ((b) < (first) || (b) >= (last))
23#define inside(b, first, last) ((b) >= (first) && (b) < (last))
24
25static int verify_group_input(struct super_block *sb,
26 struct ext4_new_group_data *input)
27{
28 struct ext4_sb_info *sbi = EXT4_SB(sb);
29 struct ext4_super_block *es = sbi->s_es;
30 ext4_fsblk_t start = ext4_blocks_count(es);
31 ext4_fsblk_t end = start + input->blocks_count;
32 unsigned group = input->group;
33 ext4_fsblk_t itend = input->inode_table + sbi->s_itb_per_group;
34 unsigned overhead = ext4_bg_has_super(sb, group) ?
35 (1 + ext4_bg_num_gdb(sb, group) +
36 le16_to_cpu(es->s_reserved_gdt_blocks)) : 0;
37 ext4_fsblk_t metaend = start + overhead;
38 struct buffer_head *bh = NULL;
39 ext4_grpblk_t free_blocks_count, offset;
40 int err = -EINVAL;
41
42 input->free_blocks_count = free_blocks_count =
43 input->blocks_count - 2 - overhead - sbi->s_itb_per_group;
44
45 if (test_opt(sb, DEBUG))
46 printk(KERN_DEBUG "EXT4-fs: adding %s group %u: %u blocks "
47 "(%d free, %u reserved)\n",
48 ext4_bg_has_super(sb, input->group) ? "normal" :
49 "no-super", input->group, input->blocks_count,
50 free_blocks_count, input->reserved_blocks);
51
52 ext4_get_group_no_and_offset(sb, start, NULL, &offset);
53 if (group != sbi->s_groups_count)
54 ext4_warning(sb, __FUNCTION__,
55 "Cannot add at group %u (only %lu groups)",
56 input->group, sbi->s_groups_count);
57 else if (offset != 0)
58 ext4_warning(sb, __FUNCTION__, "Last group not full");
59 else if (input->reserved_blocks > input->blocks_count / 5)
60 ext4_warning(sb, __FUNCTION__, "Reserved blocks too high (%u)",
61 input->reserved_blocks);
62 else if (free_blocks_count < 0)
63 ext4_warning(sb, __FUNCTION__, "Bad blocks count %u",
64 input->blocks_count);
65 else if (!(bh = sb_bread(sb, end - 1)))
66 ext4_warning(sb, __FUNCTION__,
67 "Cannot read last block (%llu)",
68 end - 1);
69 else if (outside(input->block_bitmap, start, end))
70 ext4_warning(sb, __FUNCTION__,
71 "Block bitmap not in group (block %llu)",
72 input->block_bitmap);
73 else if (outside(input->inode_bitmap, start, end))
74 ext4_warning(sb, __FUNCTION__,
75 "Inode bitmap not in group (block %llu)",
76 input->inode_bitmap);
77 else if (outside(input->inode_table, start, end) ||
78 outside(itend - 1, start, end))
79 ext4_warning(sb, __FUNCTION__,
80 "Inode table not in group (blocks %llu-%llu)",
81 input->inode_table, itend - 1);
82 else if (input->inode_bitmap == input->block_bitmap)
83 ext4_warning(sb, __FUNCTION__,
84 "Block bitmap same as inode bitmap (%llu)",
85 input->block_bitmap);
86 else if (inside(input->block_bitmap, input->inode_table, itend))
87 ext4_warning(sb, __FUNCTION__,
88 "Block bitmap (%llu) in inode table (%llu-%llu)",
89 input->block_bitmap, input->inode_table, itend-1);
90 else if (inside(input->inode_bitmap, input->inode_table, itend))
91 ext4_warning(sb, __FUNCTION__,
92 "Inode bitmap (%llu) in inode table (%llu-%llu)",
93 input->inode_bitmap, input->inode_table, itend-1);
94 else if (inside(input->block_bitmap, start, metaend))
95 ext4_warning(sb, __FUNCTION__,
96 "Block bitmap (%llu) in GDT table"
97 " (%llu-%llu)",
98 input->block_bitmap, start, metaend - 1);
99 else if (inside(input->inode_bitmap, start, metaend))
100 ext4_warning(sb, __FUNCTION__,
101 "Inode bitmap (%llu) in GDT table"
102 " (%llu-%llu)",
103 input->inode_bitmap, start, metaend - 1);
104 else if (inside(input->inode_table, start, metaend) ||
105 inside(itend - 1, start, metaend))
106 ext4_warning(sb, __FUNCTION__,
107 "Inode table (%llu-%llu) overlaps"
108 "GDT table (%llu-%llu)",
109 input->inode_table, itend - 1, start, metaend - 1);
110 else
111 err = 0;
112 brelse(bh);
113
114 return err;
115}
116
117static struct buffer_head *bclean(handle_t *handle, struct super_block *sb,
118 ext4_fsblk_t blk)
119{
120 struct buffer_head *bh;
121 int err;
122
123 bh = sb_getblk(sb, blk);
124 if (!bh)
125 return ERR_PTR(-EIO);
126 if ((err = ext4_journal_get_write_access(handle, bh))) {
127 brelse(bh);
128 bh = ERR_PTR(err);
129 } else {
130 lock_buffer(bh);
131 memset(bh->b_data, 0, sb->s_blocksize);
132 set_buffer_uptodate(bh);
133 unlock_buffer(bh);
134 }
135
136 return bh;
137}
138
139/*
140 * To avoid calling the atomic setbit hundreds or thousands of times, we only
141 * need to use it within a single byte (to ensure we get endianness right).
142 * We can use memset for the rest of the bitmap as there are no other users.
143 */
144static void mark_bitmap_end(int start_bit, int end_bit, char *bitmap)
145{
146 int i;
147
148 if (start_bit >= end_bit)
149 return;
150
151 ext4_debug("mark end bits +%d through +%d used\n", start_bit, end_bit);
152 for (i = start_bit; i < ((start_bit + 7) & ~7UL); i++)
153 ext4_set_bit(i, bitmap);
154 if (i < end_bit)
155 memset(bitmap + (i >> 3), 0xff, (end_bit - i) >> 3);
156}
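/*
 * Editor's note -- worked example, not part of the original patch.
 * With start_bit = 12 and end_bit = 32, the loop above sets bits 12..15
 * one at a time (up to the byte boundary at bit 16), then the memset
 * writes 0xff into bytes 2 and 3, covering bits 16..31 with two plain
 * stores instead of sixteen atomic ops.
 */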
157
158/*
159 * Set up the block and inode bitmaps, and the inode table for the new group.
160 * This doesn't need to be part of the main transaction, since we are only
161 * changing blocks outside the actual filesystem. We still do journaling to
162 * ensure the recovery is correct in case of a failure just after resize.
163 * If any part of this fails, we simply abort the resize.
164 */
165static int setup_new_group_blocks(struct super_block *sb,
166 struct ext4_new_group_data *input)
167{
168 struct ext4_sb_info *sbi = EXT4_SB(sb);
169 ext4_fsblk_t start = ext4_group_first_block_no(sb, input->group);
170 int reserved_gdb = ext4_bg_has_super(sb, input->group) ?
171 le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) : 0;
172 unsigned long gdblocks = ext4_bg_num_gdb(sb, input->group);
173 struct buffer_head *bh;
174 handle_t *handle;
175 ext4_fsblk_t block;
176 ext4_grpblk_t bit;
177 int i;
178 int err = 0, err2;
179
180 handle = ext4_journal_start_sb(sb, reserved_gdb + gdblocks +
181 2 + sbi->s_itb_per_group);
182 if (IS_ERR(handle))
183 return PTR_ERR(handle);
184
185 lock_super(sb);
186 if (input->group != sbi->s_groups_count) {
187 err = -EBUSY;
188 goto exit_journal;
189 }
190
191 if (IS_ERR(bh = bclean(handle, sb, input->block_bitmap))) {
192 err = PTR_ERR(bh);
193 goto exit_journal;
194 }
195
196 if (ext4_bg_has_super(sb, input->group)) {
197 ext4_debug("mark backup superblock %#04lx (+0)\n", start);
198 ext4_set_bit(0, bh->b_data);
199 }
200
201 /* Copy all of the GDT blocks into the backup in this group */
202 for (i = 0, bit = 1, block = start + 1;
203 i < gdblocks; i++, block++, bit++) {
204 struct buffer_head *gdb;
205
206 ext4_debug("update backup group %#04lx (+%d)\n", block, bit);
207
208 gdb = sb_getblk(sb, block);
209 if (!gdb) {
210 err = -EIO;
211 goto exit_bh;
212 }
213 if ((err = ext4_journal_get_write_access(handle, gdb))) {
214 brelse(gdb);
215 goto exit_bh;
216 }
217 lock_buffer(gdb);
218 memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size);
219 set_buffer_uptodate(gdb);
220 unlock_buffer(gdb);
221 ext4_journal_dirty_metadata(handle, gdb);
222 ext4_set_bit(bit, bh->b_data);
223 brelse(gdb);
224 }
225
226 /* Zero out all of the reserved backup group descriptor table blocks */
227 for (i = 0, bit = gdblocks + 1, block = start + bit;
228 i < reserved_gdb; i++, block++, bit++) {
229 struct buffer_head *gdb;
230
231 ext4_debug("clear reserved block %#04lx (+%d)\n", block, bit);
232
233 if (IS_ERR(gdb = bclean(handle, sb, block))) {
234 err = PTR_ERR(gdb);
235 goto exit_bh;
236 }
237 ext4_journal_dirty_metadata(handle, gdb);
238 ext4_set_bit(bit, bh->b_data);
239 brelse(gdb);
240 }
241 ext4_debug("mark block bitmap %#04x (+%ld)\n", input->block_bitmap,
242 input->block_bitmap - start);
243 ext4_set_bit(input->block_bitmap - start, bh->b_data);
244 ext4_debug("mark inode bitmap %#04x (+%ld)\n", input->inode_bitmap,
245 input->inode_bitmap - start);
246 ext4_set_bit(input->inode_bitmap - start, bh->b_data);
247
248 /* Zero out all of the inode table blocks */
249 for (i = 0, block = input->inode_table, bit = block - start;
250 i < sbi->s_itb_per_group; i++, bit++, block++) {
251 struct buffer_head *it;
252
253 ext4_debug("clear inode block %#04lx (+%d)\n", block, bit);
254 if (IS_ERR(it = bclean(handle, sb, block))) {
255 err = PTR_ERR(it);
256 goto exit_bh;
257 }
258 ext4_journal_dirty_metadata(handle, it);
259 brelse(it);
260 ext4_set_bit(bit, bh->b_data);
261 }
262 mark_bitmap_end(input->blocks_count, EXT4_BLOCKS_PER_GROUP(sb),
263 bh->b_data);
264 ext4_journal_dirty_metadata(handle, bh);
265 brelse(bh);
266
267 /* Mark unused entries in inode bitmap used */
268 ext4_debug("clear inode bitmap %#04x (+%ld)\n",
269 input->inode_bitmap, input->inode_bitmap - start);
270 if (IS_ERR(bh = bclean(handle, sb, input->inode_bitmap))) {
271 err = PTR_ERR(bh);
272 goto exit_journal;
273 }
274
275 mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), EXT4_BLOCKS_PER_GROUP(sb),
276 bh->b_data);
277 ext4_journal_dirty_metadata(handle, bh);
278exit_bh:
279 brelse(bh);
280
281exit_journal:
282 unlock_super(sb);
283 if ((err2 = ext4_journal_stop(handle)) && !err)
284 err = err2;
285
286 return err;
287}
288
289
290/*
291 * Iterate through the groups which hold BACKUP superblock/GDT copies in an
292 * ext4 filesystem. The counters should be initialized to 1, 5, and 7 before
293 * calling this for the first time. In a sparse filesystem it will be the
294 * sequence of powers of 3, 5, and 7: 1, 3, 5, 7, 9, 25, 27, 49, 81, ...
295 * For a non-sparse filesystem it will be every group: 1, 2, 3, 4, ...
296 */
297static unsigned ext4_list_backups(struct super_block *sb, unsigned *three,
298 unsigned *five, unsigned *seven)
299{
300 unsigned *min = three;
301 int mult = 3;
302 unsigned ret;
303
304 if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
305 EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER)) {
306 ret = *min;
307 *min += 1;
308 return ret;
309 }
310
311 if (*five < *min) {
312 min = five;
313 mult = 5;
314 }
315 if (*seven < *min) {
316 min = seven;
317 mult = 7;
318 }
319
320 ret = *min;
321 *min *= mult;
322
323 return ret;
324}
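/*
 * Editor's note -- illustrative sketch, not part of the original patch.
 * A hypothetical caller, mirroring the loops in verify_reserved_gdb()
 * and update_backups() below: with sparse_super this prints 1, 3, 5,
 * 7, 9, 25, 27, 49, 81, ... until the group count is reached.
 */
#if 0
static void example_print_backups(struct super_block *sb)
{
	unsigned three = 1, five = 5, seven = 7, group;

	while ((group = ext4_list_backups(sb, &three, &five, &seven)) <
	       EXT4_SB(sb)->s_groups_count)
		printk(KERN_DEBUG "backup metadata in group %u\n", group);
}
#endif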
325
326/*
327 * Check that all of the backup GDT blocks are held in the primary GDT block.
328 * It is assumed that they are stored in group order. Returns the number of
329 * groups in the current filesystem that have backups, or a negative error code.
330 */
331static int verify_reserved_gdb(struct super_block *sb,
332 struct buffer_head *primary)
333{
334 const ext4_fsblk_t blk = primary->b_blocknr;
335 const unsigned long end = EXT4_SB(sb)->s_groups_count;
336 unsigned three = 1;
337 unsigned five = 5;
338 unsigned seven = 7;
339 unsigned grp;
340 __le32 *p = (__le32 *)primary->b_data;
341 int gdbackups = 0;
342
343 while ((grp = ext4_list_backups(sb, &three, &five, &seven)) < end) {
344 if (le32_to_cpu(*p++) !=
345 grp * EXT4_BLOCKS_PER_GROUP(sb) + blk){
346 ext4_warning(sb, __FUNCTION__,
347 "reserved GDT %llu"
348 " missing grp %d (%llu)",
349 blk, grp,
350 grp *
351 (ext4_fsblk_t)EXT4_BLOCKS_PER_GROUP(sb) +
352 blk);
353 return -EINVAL;
354 }
355 if (++gdbackups > EXT4_ADDR_PER_BLOCK(sb))
356 return -EFBIG;
357 }
358
359 return gdbackups;
360}
361
362/*
363 * Called when we need to bring a reserved group descriptor table block into
364 * use from the resize inode. The primary copy of the new GDT block currently
365 * is an indirect block (under the double indirect block in the resize inode).
366 * The new backup GDT blocks will be stored as leaf blocks in this indirect
367 * block, in group order. Even though we know all the block numbers we need,
368 * we check to ensure that the resize inode has actually reserved these blocks.
369 *
370 * Don't need to update the block bitmaps because the blocks are still in use.
371 *
372 * We get all of the error cases out of the way, so that we are sure to not
373 * fail once we start modifying the data on disk, because JBD has no rollback.
374 */
375static int add_new_gdb(handle_t *handle, struct inode *inode,
376 struct ext4_new_group_data *input,
377 struct buffer_head **primary)
378{
379 struct super_block *sb = inode->i_sb;
380 struct ext4_super_block *es = EXT4_SB(sb)->s_es;
381 unsigned long gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb);
382 ext4_fsblk_t gdblock = EXT4_SB(sb)->s_sbh->b_blocknr + 1 + gdb_num;
383 struct buffer_head **o_group_desc, **n_group_desc;
384 struct buffer_head *dind;
385 int gdbackups;
386 struct ext4_iloc iloc;
387 __le32 *data;
388 int err;
389
390 if (test_opt(sb, DEBUG))
391 printk(KERN_DEBUG
392 "EXT4-fs: ext4_add_new_gdb: adding group block %lu\n",
393 gdb_num);
394
395 /*
396 * If we are not using the primary superblock/GDT copy don't resize,
397 * because the user tools have no way of handling this. Probably a
398 * bad time to do it anyways.
399 */
400 if (EXT4_SB(sb)->s_sbh->b_blocknr !=
401 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) {
402 ext4_warning(sb, __FUNCTION__,
403 "won't resize using backup superblock at %llu",
404 (unsigned long long)EXT4_SB(sb)->s_sbh->b_blocknr);
405 return -EPERM;
406 }
407
408 *primary = sb_bread(sb, gdblock);
409 if (!*primary)
410 return -EIO;
411
412 if ((gdbackups = verify_reserved_gdb(sb, *primary)) < 0) {
413 err = gdbackups;
414 goto exit_bh;
415 }
416
417 data = EXT4_I(inode)->i_data + EXT4_DIND_BLOCK;
418 dind = sb_bread(sb, le32_to_cpu(*data));
419 if (!dind) {
420 err = -EIO;
421 goto exit_bh;
422 }
423
424 data = (__le32 *)dind->b_data;
425 if (le32_to_cpu(data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)]) != gdblock) {
426 ext4_warning(sb, __FUNCTION__,
427 "new group %u GDT block %llu not reserved",
428 input->group, gdblock);
429 err = -EINVAL;
430 goto exit_dind;
431 }
432
433 if ((err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh)))
434 goto exit_dind;
435
436 if ((err = ext4_journal_get_write_access(handle, *primary)))
437 goto exit_sbh;
438
439 if ((err = ext4_journal_get_write_access(handle, dind)))
440 goto exit_primary;
441
442 /* ext4_reserve_inode_write() gets a reference on the iloc */
443 if ((err = ext4_reserve_inode_write(handle, inode, &iloc)))
444 goto exit_dindj;
445
446 n_group_desc = kmalloc((gdb_num + 1) * sizeof(struct buffer_head *),
447 GFP_KERNEL);
448 if (!n_group_desc) {
449 err = -ENOMEM;
450 ext4_warning (sb, __FUNCTION__,
451 "not enough memory for %lu groups", gdb_num + 1);
452 goto exit_inode;
453 }
454
455 /*
456 * Finally, we have all of the possible failures behind us...
457 *
458 * Remove new GDT block from inode double-indirect block and clear out
459 * the new GDT block for use (which also "frees" the backup GDT blocks
460 * from the reserved inode). We don't need to change the bitmaps for
461 * these blocks, because they are marked as in-use from being in the
462 * reserved inode, and will become GDT blocks (primary and backup).
463 */
464 data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)] = 0;
465 ext4_journal_dirty_metadata(handle, dind);
466 brelse(dind);
467 inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9;
468 ext4_mark_iloc_dirty(handle, inode, &iloc);
469 memset((*primary)->b_data, 0, sb->s_blocksize);
470 ext4_journal_dirty_metadata(handle, *primary);
471
472 o_group_desc = EXT4_SB(sb)->s_group_desc;
473 memcpy(n_group_desc, o_group_desc,
474 EXT4_SB(sb)->s_gdb_count * sizeof(struct buffer_head *));
475 n_group_desc[gdb_num] = *primary;
476 EXT4_SB(sb)->s_group_desc = n_group_desc;
477 EXT4_SB(sb)->s_gdb_count++;
478 kfree(o_group_desc);
479
480 es->s_reserved_gdt_blocks =
481 cpu_to_le16(le16_to_cpu(es->s_reserved_gdt_blocks) - 1);
482 ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh);
483
484 return 0;
485
486exit_inode:
487 //ext4_journal_release_buffer(handle, iloc.bh);
488 brelse(iloc.bh);
489exit_dindj:
490 //ext4_journal_release_buffer(handle, dind);
491exit_primary:
492 //ext4_journal_release_buffer(handle, *primary);
493exit_sbh:
494 //ext4_journal_release_buffer(handle, *primary);
495exit_dind:
496 brelse(dind);
497exit_bh:
498 brelse(*primary);
499
500 ext4_debug("leaving with error %d\n", err);
501 return err;
502}
503
504/*
505 * Called when we are adding a new group which has a backup copy of each of
506 * the GDT blocks (i.e. sparse group) and there are reserved GDT blocks.
507 * We need to add these reserved backup GDT blocks to the resize inode, so
508 * that they are kept for future resizing and not allocated to files.
509 *
510 * Each reserved backup GDT block will go into a different indirect block.
511 * The indirect blocks are actually the primary reserved GDT blocks,
512 * so we know in advance what their block numbers are. We only get the
513 * double-indirect block to verify it is pointing to the primary reserved
514 * GDT blocks so we don't overwrite a data block by accident. The reserved
515 * backup GDT blocks are stored in their reserved primary GDT block.
516 */
517static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
518 struct ext4_new_group_data *input)
519{
520 struct super_block *sb = inode->i_sb;
521 int reserved_gdb = le16_to_cpu(EXT4_SB(sb)->s_es->s_reserved_gdt_blocks);
522 struct buffer_head **primary;
523 struct buffer_head *dind;
524 struct ext4_iloc iloc;
525 ext4_fsblk_t blk;
526 __le32 *data, *end;
527 int gdbackups = 0;
528 int res, i;
529 int err;
530
531 primary = kmalloc(reserved_gdb * sizeof(*primary), GFP_KERNEL);
532 if (!primary)
533 return -ENOMEM;
534
535 data = EXT4_I(inode)->i_data + EXT4_DIND_BLOCK;
536 dind = sb_bread(sb, le32_to_cpu(*data));
537 if (!dind) {
538 err = -EIO;
539 goto exit_free;
540 }
541
542 blk = EXT4_SB(sb)->s_sbh->b_blocknr + 1 + EXT4_SB(sb)->s_gdb_count;
543 data = (__le32 *)dind->b_data + EXT4_SB(sb)->s_gdb_count;
544 end = (__le32 *)dind->b_data + EXT4_ADDR_PER_BLOCK(sb);
545
546 /* Get each reserved primary GDT block and verify it holds backups */
547 for (res = 0; res < reserved_gdb; res++, blk++) {
548 if (le32_to_cpu(*data) != blk) {
549 ext4_warning(sb, __FUNCTION__,
550 "reserved block %llu"
551 " not at offset %ld",
552 blk,
553 (long)(data - (__le32 *)dind->b_data));
554 err = -EINVAL;
555 goto exit_bh;
556 }
557 primary[res] = sb_bread(sb, blk);
558 if (!primary[res]) {
559 err = -EIO;
560 goto exit_bh;
561 }
562 if ((gdbackups = verify_reserved_gdb(sb, primary[res])) < 0) {
563 brelse(primary[res]);
564 err = gdbackups;
565 goto exit_bh;
566 }
567 if (++data >= end)
568 data = (__le32 *)dind->b_data;
569 }
570
571 for (i = 0; i < reserved_gdb; i++) {
572 if ((err = ext4_journal_get_write_access(handle, primary[i]))) {
573 /*
574 int j;
575 for (j = 0; j < i; j++)
576 ext4_journal_release_buffer(handle, primary[j]);
577 */
578 goto exit_bh;
579 }
580 }
581
582 if ((err = ext4_reserve_inode_write(handle, inode, &iloc)))
583 goto exit_bh;
584
585 /*
586 * Finally we can add each of the reserved backup GDT blocks from
587 * the new group to its reserved primary GDT block.
588 */
589 blk = input->group * EXT4_BLOCKS_PER_GROUP(sb);
590 for (i = 0; i < reserved_gdb; i++) {
591 int err2;
592 data = (__le32 *)primary[i]->b_data;
593 /* printk("reserving backup %lu[%u] = %lu\n",
594 primary[i]->b_blocknr, gdbackups,
595 blk + primary[i]->b_blocknr); */
596 data[gdbackups] = cpu_to_le32(blk + primary[i]->b_blocknr);
597 err2 = ext4_journal_dirty_metadata(handle, primary[i]);
598 if (!err)
599 err = err2;
600 }
601 inode->i_blocks += reserved_gdb * sb->s_blocksize >> 9;
602 ext4_mark_iloc_dirty(handle, inode, &iloc);
603
604exit_bh:
605 while (--res >= 0)
606 brelse(primary[res]);
607 brelse(dind);
608
609exit_free:
610 kfree(primary);
611
612 return err;
613}
614
615/*
616 * Update the backup copies of the ext4 metadata. These don't need to be part
617 * of the main resize transaction, because e2fsck will re-write them if there
618 * is a problem (basically only OOM will cause a problem). However, we
619 * _should_ update the backups if possible, in case the primary gets trashed
620 * for some reason and we need to run e2fsck from a backup superblock. The
621 * important part is that the new block and inode counts are in the backup
622 * superblocks, and the location of the new group metadata in the GDT backups.
623 *
624 * We do not need lock_super() for this, because these blocks are not
625 * otherwise touched by the filesystem code when it is mounted. We don't
626 * need to worry about last changing from sbi->s_groups_count, because the
627 * worst that can happen is that we do not copy the full number of backups
628 * at this time. The resize which changed s_groups_count will backup again.
629 */
630static void update_backups(struct super_block *sb,
631 int blk_off, char *data, int size)
632{
633 struct ext4_sb_info *sbi = EXT4_SB(sb);
634 const unsigned long last = sbi->s_groups_count;
635 const int bpg = EXT4_BLOCKS_PER_GROUP(sb);
636 unsigned three = 1;
637 unsigned five = 5;
638 unsigned seven = 7;
639 unsigned group;
640 int rest = sb->s_blocksize - size;
641 handle_t *handle;
642 int err = 0, err2;
643
644 handle = ext4_journal_start_sb(sb, EXT4_MAX_TRANS_DATA);
645 if (IS_ERR(handle)) {
646 group = 1;
647 err = PTR_ERR(handle);
648 goto exit_err;
649 }
650
651 while ((group = ext4_list_backups(sb, &three, &five, &seven)) < last) {
652 struct buffer_head *bh;
653
654 /* Out of journal space, and can't get more - abort - so sad */
655 if (handle->h_buffer_credits == 0 &&
656 ext4_journal_extend(handle, EXT4_MAX_TRANS_DATA) &&
657 (err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA)))
658 break;
659
660 bh = sb_getblk(sb, group * bpg + blk_off);
661 if (!bh) {
662 err = -EIO;
663 break;
664 }
665 ext4_debug("update metadata backup %#04lx\n",
666 (unsigned long)bh->b_blocknr);
667 if ((err = ext4_journal_get_write_access(handle, bh)))
668 break;
669 lock_buffer(bh);
670 memcpy(bh->b_data, data, size);
671 if (rest)
672 memset(bh->b_data + size, 0, rest);
673 set_buffer_uptodate(bh);
674 unlock_buffer(bh);
675 ext4_journal_dirty_metadata(handle, bh);
676 brelse(bh);
677 }
678 if ((err2 = ext4_journal_stop(handle)) && !err)
679 err = err2;
680
681 /*
682 * Ugh! Need to have e2fsck write the backup copies. It is too
683 * late to revert the resize, we shouldn't fail just because of
684 * the backup copies (they are only needed in case of corruption).
685 *
686 * However, if we got here we have a journal problem too, so we
687 * can't really start a transaction to mark the superblock.
688 * Chicken out and just set the flag on the hope it will be written
689 * to disk, and if not - we will simply wait until next fsck.
690 */
691exit_err:
692 if (err) {
693 ext4_warning(sb, __FUNCTION__,
694 "can't update backup for group %d (err %d), "
695 "forcing fsck on next reboot", group, err);
696 sbi->s_mount_state &= ~EXT4_VALID_FS;
697 sbi->s_es->s_state &= cpu_to_le16(~EXT4_VALID_FS);
698 mark_buffer_dirty(sbi->s_sbh);
699 }
700}
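
/*
 * A minimal sketch of the backup-group enumeration that
 * ext4_list_backups() performs for the loop in update_backups() above
 * (the helper itself is defined earlier in resize.c and is not shown
 * in this hunk). For SPARSE_SUPER filesystems the backups live in
 * group 1 and in groups that are powers of 3, 5 and 7; this assumes
 * the same three/five/seven cursor convention, seeded with 1, 5 and 7:
 */
static unsigned list_backups_sketch(unsigned *three, unsigned *five,
				    unsigned *seven)
{
	unsigned *min = three;	/* pick the smallest of the three cursors */
	int mult = 3;
	unsigned ret;

	if (*five < *min) {
		min = five;
		mult = 5;
	}
	if (*seven < *min) {
		min = seven;
		mult = 7;
	}
	ret = *min;
	*min *= mult;		/* advance the cursor just consumed */
	return ret;		/* yields 1, 3, 5, 7, 9, 25, 27, 49, ... */
}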
701
702/* Add group descriptor data to an existing or new group descriptor block.
703 * Ensure we handle all possible error conditions _before_ we start modifying
704 * the filesystem, because we cannot abort the transaction and not have it
705 * write the data to disk.
706 *
707 * If we are on a GDT block boundary, we need to get the reserved GDT block.
708 * Otherwise, we may need to add backup GDT blocks for a sparse group.
709 *
710 * We only need to hold the superblock lock while we are actually adding
711 * in the new group's counts to the superblock. Prior to that we have
712 * not really "added" the group at all. We re-check that we are still
713 * adding in the last group in case things have changed since verifying.
714 */
715int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
716{
717 struct ext4_sb_info *sbi = EXT4_SB(sb);
718 struct ext4_super_block *es = sbi->s_es;
719 int reserved_gdb = ext4_bg_has_super(sb, input->group) ?
720 le16_to_cpu(es->s_reserved_gdt_blocks) : 0;
721 struct buffer_head *primary = NULL;
722 struct ext4_group_desc *gdp;
723 struct inode *inode = NULL;
724 handle_t *handle;
725 int gdb_off, gdb_num;
726 int err, err2;
727
728 gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb);
729 gdb_off = input->group % EXT4_DESC_PER_BLOCK(sb);
730
731 if (gdb_off == 0 && !EXT4_HAS_RO_COMPAT_FEATURE(sb,
732 EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER)) {
733 ext4_warning(sb, __FUNCTION__,
734 "Can't resize non-sparse filesystem further");
735 return -EPERM;
736 }
737
738 if (ext4_blocks_count(es) + input->blocks_count <
739 ext4_blocks_count(es)) {
740 ext4_warning(sb, __FUNCTION__, "blocks_count overflow\n");
741 return -EINVAL;
742 }
743
744 if (le32_to_cpu(es->s_inodes_count) + EXT4_INODES_PER_GROUP(sb) <
745 le32_to_cpu(es->s_inodes_count)) {
746 ext4_warning(sb, __FUNCTION__, "inodes_count overflow\n");
747 return -EINVAL;
748 }
749
750 if (reserved_gdb || gdb_off == 0) {
751 if (!EXT4_HAS_COMPAT_FEATURE(sb,
752 EXT4_FEATURE_COMPAT_RESIZE_INODE)){
753 ext4_warning(sb, __FUNCTION__,
754 "No reserved GDT blocks, can't resize");
755 return -EPERM;
756 }
757 inode = iget(sb, EXT4_RESIZE_INO);
758 if (!inode || is_bad_inode(inode)) {
759 ext4_warning(sb, __FUNCTION__,
760 "Error opening resize inode");
761 iput(inode);
762 return -ENOENT;
763 }
764 }
765
766 if ((err = verify_group_input(sb, input)))
767 goto exit_put;
768
769 if ((err = setup_new_group_blocks(sb, input)))
770 goto exit_put;
771
772 /*
773 * We will always be modifying at least the superblock and a GDT
774 * block. If we are adding a group past the last current GDT block,
775 * we will also modify the inode and the dindirect block. If we
776 * are adding a group with superblock/GDT backups we will also
777 * modify each of the reserved GDT dindirect blocks.
778 */
779 handle = ext4_journal_start_sb(sb,
780 ext4_bg_has_super(sb, input->group) ?
781 3 + reserved_gdb : 4);
782 if (IS_ERR(handle)) {
783 err = PTR_ERR(handle);
784 goto exit_put;
785 }
786
787 lock_super(sb);
788 if (input->group != sbi->s_groups_count) {
789 ext4_warning(sb, __FUNCTION__,
790 "multiple resizers run on filesystem!");
791 err = -EBUSY;
792 goto exit_journal;
793 }
794
795 if ((err = ext4_journal_get_write_access(handle, sbi->s_sbh)))
796 goto exit_journal;
797
798 /*
799 * We will only either add reserved group blocks to a backup group
800 * or remove reserved blocks for the first group in a new group block.
801 * Doing both would mean more complex code, and sane people don't
802 * use non-sparse filesystems anymore. This is already checked above.
803 */
804 if (gdb_off) {
805 primary = sbi->s_group_desc[gdb_num];
806 if ((err = ext4_journal_get_write_access(handle, primary)))
807 goto exit_journal;
808
809 if (reserved_gdb && ext4_bg_num_gdb(sb, input->group) &&
810 (err = reserve_backup_gdb(handle, inode, input)))
811 goto exit_journal;
812 } else if ((err = add_new_gdb(handle, inode, input, &primary)))
813 goto exit_journal;
814
815 /*
816 * OK, now we've set up the new group. Time to make it active.
817 *
818 * Current kernels don't lock all allocations via lock_super(),
819 * so we have to be safe wrt. concurrent accesses to the group
820 * data. So we need to be careful to set all of the relevant
821 * group descriptor data etc. *before* we enable the group.
822 *
823 * The key field here is sbi->s_groups_count: as long as
824 * that retains its old value, nobody is going to access the new
825 * group.
826 *
827 * So first we update all the descriptor metadata for the new
828 * group; then we update the total disk blocks count; then we
829 * update the groups count to enable the group; then finally we
830 * update the free space counts so that the system can start
831 * using the new disk blocks.
832 */
833
834 /* Update group descriptor block for new group */
835 gdp = (struct ext4_group_desc *)primary->b_data + gdb_off;
836
837 ext4_block_bitmap_set(sb, gdp, input->block_bitmap); /* LV FIXME */
838 ext4_inode_bitmap_set(sb, gdp, input->inode_bitmap); /* LV FIXME */
839 ext4_inode_table_set(sb, gdp, input->inode_table); /* LV FIXME */
840 gdp->bg_free_blocks_count = cpu_to_le16(input->free_blocks_count);
841 gdp->bg_free_inodes_count = cpu_to_le16(EXT4_INODES_PER_GROUP(sb));
842
843 /*
844 * Make the new blocks and inodes valid next. We do this before
845 * increasing the group count so that once the group is enabled,
846 * all of its blocks and inodes are already valid.
847 *
848 * We always allocate group-by-group, then block-by-block or
849 * inode-by-inode within a group, so enabling these
850 * blocks/inodes before the group is live won't actually let us
851 * allocate the new space yet.
852 */
853 ext4_blocks_count_set(es, ext4_blocks_count(es) +
854 input->blocks_count);
855 es->s_inodes_count = cpu_to_le32(le32_to_cpu(es->s_inodes_count) +
856 EXT4_INODES_PER_GROUP(sb));
857
858 /*
859 * We need to protect s_groups_count against other CPUs seeing
860 * inconsistent state in the superblock.
861 *
862 * The precise rules we use are:
863 *
864 * * Writers of s_groups_count *must* hold lock_super
865 * AND
866 * * Writers must perform a smp_wmb() after updating all dependent
867 * data and before modifying the groups count
868 *
869 * * Readers must hold lock_super() over the access
870 * OR
871 * * Readers must perform an smp_rmb() after reading the groups count
872 * and before reading any dependent data.
873 *
874 * NB. These rules can be relaxed when checking the group count
875 * while freeing data, as we can only allocate from a block
876 * group after serialising against the group count, and we can
877 * only then free after serialising in turn against that
878 * allocation.
879 */
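	/*
	 * Sketch of the reader side these rules imply (illustrative):
	 *
	 *	groups = sbi->s_groups_count;
	 *	smp_rmb();	pairs with the smp_wmb() below
	 *	... read descriptor data for any group < groups ...
	 */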
880 smp_wmb();
881
882 /* Update the global fs size fields */
883 sbi->s_groups_count++;
884
885 ext4_journal_dirty_metadata(handle, primary);
886
887 /* Update the reserved block counts only once the new group is
888 * active. */
889 ext4_r_blocks_count_set(es, ext4_r_blocks_count(es) +
890 input->reserved_blocks);
891
892 /* Update the free space counts */
893 percpu_counter_mod(&sbi->s_freeblocks_counter,
894 input->free_blocks_count);
895 percpu_counter_mod(&sbi->s_freeinodes_counter,
896 EXT4_INODES_PER_GROUP(sb));
897
898 ext4_journal_dirty_metadata(handle, sbi->s_sbh);
899 sb->s_dirt = 1;
900
901exit_journal:
902 unlock_super(sb);
903 if ((err2 = ext4_journal_stop(handle)) && !err)
904 err = err2;
905 if (!err) {
906 update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es,
907 sizeof(struct ext4_super_block));
908 update_backups(sb, primary->b_blocknr, primary->b_data,
909 primary->b_size);
910 }
911exit_put:
912 iput(inode);
913 return err;
914} /* ext4_group_add */
915
916/* Extend the filesystem to the new number of blocks specified. This entry
917 * point is only used to extend the current filesystem to the end of the last
918 * existing group. It can be accessed via ioctl, or by "remount,resize=<size>"
919 * for emergencies (because it has no dependencies on reserved blocks).
920 *
921 * If we _really_ wanted, we could use default values to call ext4_group_add()
922 * and allow the "remount" trick to work for arbitrary resizing, assuming enough
923 * GDT blocks are reserved to grow to the desired size.
924 */
925int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
926 ext4_fsblk_t n_blocks_count)
927{
928 ext4_fsblk_t o_blocks_count;
929 unsigned long o_groups_count;
930 ext4_grpblk_t last;
931 ext4_grpblk_t add;
932 struct buffer_head * bh;
933 handle_t *handle;
934 int err;
935 unsigned long freed_blocks;
936
937 /* We don't need to worry about locking wrt other resizers just
938 * yet: we're going to revalidate es->s_blocks_count after
939 * taking lock_super() below. */
940 o_blocks_count = ext4_blocks_count(es);
941 o_groups_count = EXT4_SB(sb)->s_groups_count;
942
943 if (test_opt(sb, DEBUG))
944		printk(KERN_DEBUG "EXT4-fs: extending last group from %llu up to %llu blocks\n",
945 o_blocks_count, n_blocks_count);
946
947 if (n_blocks_count == 0 || n_blocks_count == o_blocks_count)
948 return 0;
949
950 if (n_blocks_count > (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) {
951 printk(KERN_ERR "EXT4-fs: filesystem on %s:"
952 " too large to resize to %llu blocks safely\n",
953 sb->s_id, n_blocks_count);
954 if (sizeof(sector_t) < 8)
955 ext4_warning(sb, __FUNCTION__,
956 "CONFIG_LBD not enabled\n");
957 return -EINVAL;
958 }
959
960 if (n_blocks_count < o_blocks_count) {
961 ext4_warning(sb, __FUNCTION__,
962 "can't shrink FS - resize aborted");
963 return -EBUSY;
964 }
965
966 /* Handle the remaining blocks in the last group only. */
967 ext4_get_group_no_and_offset(sb, o_blocks_count, NULL, &last);
968
969 if (last == 0) {
970 ext4_warning(sb, __FUNCTION__,
971 "need to use ext2online to resize further");
972 return -EPERM;
973 }
974
975 add = EXT4_BLOCKS_PER_GROUP(sb) - last;
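	/*
	 * Illustrative numbers: with 32768 blocks per group and the old
	 * size ending 1000 blocks into the last group (last == 1000),
	 * add starts at 31768; the checks below then trim it so we
	 * never grow past n_blocks_count.
	 */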
976
977 if (o_blocks_count + add < o_blocks_count) {
978 ext4_warning(sb, __FUNCTION__, "blocks_count overflow");
979 return -EINVAL;
980 }
981
982 if (o_blocks_count + add > n_blocks_count)
983 add = n_blocks_count - o_blocks_count;
984
985 if (o_blocks_count + add < n_blocks_count)
986 ext4_warning(sb, __FUNCTION__,
987 "will only finish group (%llu"
988 " blocks, %u new)",
989 o_blocks_count + add, add);
990
991 /* See if the device is actually as big as what was requested */
992	bh = sb_bread(sb, o_blocks_count + add - 1);
993 if (!bh) {
994 ext4_warning(sb, __FUNCTION__,
995 "can't read last block, resize aborted");
996 return -ENOSPC;
997 }
998 brelse(bh);
999
1000 /* We will update the superblock, one block bitmap, and
1001 * one group descriptor via ext4_free_blocks().
1002 */
1003 handle = ext4_journal_start_sb(sb, 3);
1004 if (IS_ERR(handle)) {
1005 err = PTR_ERR(handle);
1006		ext4_warning(sb, __FUNCTION__, "error %d on journal start", err);
1007 goto exit_put;
1008 }
1009
1010 lock_super(sb);
1011 if (o_blocks_count != ext4_blocks_count(es)) {
1012 ext4_warning(sb, __FUNCTION__,
1013 "multiple resizers run on filesystem!");
1014 unlock_super(sb);
1015 err = -EBUSY;
1016 goto exit_put;
1017 }
1018
1019 if ((err = ext4_journal_get_write_access(handle,
1020 EXT4_SB(sb)->s_sbh))) {
1021 ext4_warning(sb, __FUNCTION__,
1022 "error %d on journal write access", err);
1023 unlock_super(sb);
1024 ext4_journal_stop(handle);
1025 goto exit_put;
1026 }
1027 ext4_blocks_count_set(es, o_blocks_count + add);
1028 ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh);
1029 sb->s_dirt = 1;
1030 unlock_super(sb);
1031 ext4_debug("freeing blocks %lu through %llu\n", o_blocks_count,
1032 o_blocks_count + add);
1033 ext4_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks);
1034 ext4_debug("freed blocks %llu through %llu\n", o_blocks_count,
1035 o_blocks_count + add);
1036 if ((err = ext4_journal_stop(handle)))
1037 goto exit_put;
1038 if (test_opt(sb, DEBUG))
1039 printk(KERN_DEBUG "EXT4-fs: extended group to %llu blocks\n",
1040 ext4_blocks_count(es));
1041 update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr, (char *)es,
1042 sizeof(struct ext4_super_block));
1043exit_put:
1044 return err;
1045} /* ext4_group_extend */
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
new file mode 100644
index 000000000000..b4b022aa2bc2
--- /dev/null
+++ b/fs/ext4/super.c
@@ -0,0 +1,2829 @@
1/*
2 * linux/fs/ext4/super.c
3 *
4 * Copyright (C) 1992, 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI)
8 *
9 * from
10 *
11 * linux/fs/minix/inode.c
12 *
13 * Copyright (C) 1991, 1992 Linus Torvalds
14 *
15 * Big-endian to little-endian byte-swapping/bitmaps by
16 * David S. Miller (davem@caip.rutgers.edu), 1995
17 */
18
19#include <linux/module.h>
20#include <linux/string.h>
21#include <linux/fs.h>
22#include <linux/time.h>
23#include <linux/jbd2.h>
24#include <linux/ext4_fs.h>
25#include <linux/ext4_jbd2.h>
26#include <linux/slab.h>
27#include <linux/init.h>
28#include <linux/blkdev.h>
29#include <linux/parser.h>
30#include <linux/smp_lock.h>
31#include <linux/buffer_head.h>
32#include <linux/vfs.h>
33#include <linux/random.h>
34#include <linux/mount.h>
35#include <linux/namei.h>
36#include <linux/quotaops.h>
37#include <linux/seq_file.h>
38
39#include <asm/uaccess.h>
40
41#include "xattr.h"
42#include "acl.h"
43#include "namei.h"
44
45static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
46 unsigned long journal_devnum);
47static int ext4_create_journal(struct super_block *, struct ext4_super_block *,
48 unsigned int);
49static void ext4_commit_super (struct super_block * sb,
50 struct ext4_super_block * es,
51 int sync);
52static void ext4_mark_recovery_complete(struct super_block * sb,
53 struct ext4_super_block * es);
54static void ext4_clear_journal_err(struct super_block * sb,
55 struct ext4_super_block * es);
56static int ext4_sync_fs(struct super_block *sb, int wait);
57static const char *ext4_decode_error(struct super_block * sb, int errno,
58 char nbuf[16]);
59static int ext4_remount (struct super_block * sb, int * flags, char * data);
60static int ext4_statfs (struct dentry * dentry, struct kstatfs * buf);
61static void ext4_unlockfs(struct super_block *sb);
62static void ext4_write_super (struct super_block * sb);
63static void ext4_write_super_lockfs(struct super_block *sb);
64
65
66ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
67 struct ext4_group_desc *bg)
68{
69 return le32_to_cpu(bg->bg_block_bitmap) |
70 (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
71 (ext4_fsblk_t)le32_to_cpu(bg->bg_block_bitmap_hi) << 32 : 0);
72}
73
74ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb,
75 struct ext4_group_desc *bg)
76{
77 return le32_to_cpu(bg->bg_inode_bitmap) |
78 (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
79 (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_bitmap_hi) << 32 : 0);
80}
81
82ext4_fsblk_t ext4_inode_table(struct super_block *sb,
83 struct ext4_group_desc *bg)
84{
85 return le32_to_cpu(bg->bg_inode_table) |
86 (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
87 (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0);
88}
89
90void ext4_block_bitmap_set(struct super_block *sb,
91 struct ext4_group_desc *bg, ext4_fsblk_t blk)
92{
93 bg->bg_block_bitmap = cpu_to_le32((u32)blk);
94 if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
95 bg->bg_block_bitmap_hi = cpu_to_le32(blk >> 32);
96}
97
98void ext4_inode_bitmap_set(struct super_block *sb,
99 struct ext4_group_desc *bg, ext4_fsblk_t blk)
100{
101 bg->bg_inode_bitmap = cpu_to_le32((u32)blk);
102 if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
103 bg->bg_inode_bitmap_hi = cpu_to_le32(blk >> 32);
104}
105
106void ext4_inode_table_set(struct super_block *sb,
107 struct ext4_group_desc *bg, ext4_fsblk_t blk)
108{
109 bg->bg_inode_table = cpu_to_le32((u32)blk);
110 if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
111 bg->bg_inode_table_hi = cpu_to_le32(blk >> 32);
112}
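
/*
 * The three pairs of helpers above split a 64-bit block number across
 * the classic 32-bit descriptor field and its _hi extension. For
 * example, blk == 0x100000010 stores 0x00000010 in bg_block_bitmap and
 * 0x1 in bg_block_bitmap_hi; the high word is only written (and read
 * back) when EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT.
 */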
113
114/*
115 * Wrappers for jbd2_journal_start/end.
116 *
117 * The only special thing we need to do here is to make sure that all
118 * journal_end calls result in the superblock being marked dirty, so
119 * that sync() will call the filesystem's write_super callback if
120 * appropriate.
121 */
122handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
123{
124 journal_t *journal;
125
126 if (sb->s_flags & MS_RDONLY)
127 return ERR_PTR(-EROFS);
128
129 /* Special case here: if the journal has aborted behind our
130 * backs (eg. EIO in the commit thread), then we still need to
131 * take the FS itself readonly cleanly. */
132 journal = EXT4_SB(sb)->s_journal;
133 if (is_journal_aborted(journal)) {
134 ext4_abort(sb, __FUNCTION__,
135 "Detected aborted journal");
136 return ERR_PTR(-EROFS);
137 }
138
139 return jbd2_journal_start(journal, nblocks);
140}
141
142/*
143 * The only special thing we need to do here is to make sure that all
144 * jbd2_journal_stop calls result in the superblock being marked dirty, so
145 * that sync() will call the filesystem's write_super callback if
146 * appropriate.
147 */
148int __ext4_journal_stop(const char *where, handle_t *handle)
149{
150 struct super_block *sb;
151 int err;
152 int rc;
153
154 sb = handle->h_transaction->t_journal->j_private;
155 err = handle->h_err;
156 rc = jbd2_journal_stop(handle);
157
158 if (!err)
159 err = rc;
160 if (err)
161 __ext4_std_error(sb, where, err);
162 return err;
163}
164
165void ext4_journal_abort_handle(const char *caller, const char *err_fn,
166 struct buffer_head *bh, handle_t *handle, int err)
167{
168 char nbuf[16];
169 const char *errstr = ext4_decode_error(NULL, err, nbuf);
170
171 if (bh)
172 BUFFER_TRACE(bh, "abort");
173
174 if (!handle->h_err)
175 handle->h_err = err;
176
177 if (is_handle_aborted(handle))
178 return;
179
180 printk(KERN_ERR "%s: aborting transaction: %s in %s\n",
181 caller, errstr, err_fn);
182
183 jbd2_journal_abort_handle(handle);
184}
185
186/* Deal with the reporting of failure conditions on a filesystem such as
187 * inconsistencies detected or read IO failures.
188 *
189 * On ext2, we can store the error state of the filesystem in the
190 * superblock. That is not possible on ext4, because we may have other
191 * write ordering constraints on the superblock which prevent us from
192 * writing it out straight away; and given that the journal is about to
193 * be aborted, we can't rely on the current, or future, transactions to
194 * write out the superblock safely.
195 *
196 * We'll just use the jbd2_journal_abort() error code to record an error in
197 * the journal instead. On recovery, the journal will complain about
198 * that error until we've noted it down and cleared it.
199 */
200
201static void ext4_handle_error(struct super_block *sb)
202{
203 struct ext4_super_block *es = EXT4_SB(sb)->s_es;
204
205 EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
206 es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
207
208 if (sb->s_flags & MS_RDONLY)
209 return;
210
211 if (!test_opt (sb, ERRORS_CONT)) {
212 journal_t *journal = EXT4_SB(sb)->s_journal;
213
214 EXT4_SB(sb)->s_mount_opt |= EXT4_MOUNT_ABORT;
215 if (journal)
216 jbd2_journal_abort(journal, -EIO);
217 }
218 if (test_opt (sb, ERRORS_RO)) {
219 printk (KERN_CRIT "Remounting filesystem read-only\n");
220 sb->s_flags |= MS_RDONLY;
221 }
222 ext4_commit_super(sb, es, 1);
223 if (test_opt(sb, ERRORS_PANIC))
224 panic("EXT4-fs (device %s): panic forced after error\n",
225 sb->s_id);
226}
227
228void ext4_error (struct super_block * sb, const char * function,
229 const char * fmt, ...)
230{
231 va_list args;
232
233 va_start(args, fmt);
234	printk(KERN_CRIT "EXT4-fs error (device %s): %s: ", sb->s_id, function);
235 vprintk(fmt, args);
236 printk("\n");
237 va_end(args);
238
239 ext4_handle_error(sb);
240}
241
242static const char *ext4_decode_error(struct super_block * sb, int errno,
243 char nbuf[16])
244{
245 char *errstr = NULL;
246
247 switch (errno) {
248 case -EIO:
249 errstr = "IO failure";
250 break;
251 case -ENOMEM:
252 errstr = "Out of memory";
253 break;
254 case -EROFS:
255 if (!sb || EXT4_SB(sb)->s_journal->j_flags & JBD2_ABORT)
256 errstr = "Journal has aborted";
257 else
258 errstr = "Readonly filesystem";
259 break;
260 default:
261 /* If the caller passed in an extra buffer for unknown
262 * errors, textualise them now. Else we just return
263 * NULL. */
264 if (nbuf) {
265 /* Check for truncated error codes... */
266 if (snprintf(nbuf, 16, "error %d", -errno) >= 0)
267 errstr = nbuf;
268 }
269 break;
270 }
271
272 return errstr;
273}
274
275/* __ext4_std_error decodes expected errors from journaling functions
276 * automatically and invokes the appropriate error response. */
277
278void __ext4_std_error (struct super_block * sb, const char * function,
279 int errno)
280{
281 char nbuf[16];
282 const char *errstr;
283
284 /* Special case: if the error is EROFS, and we're not already
285 * inside a transaction, then there's really no point in logging
286 * an error. */
287 if (errno == -EROFS && journal_current_handle() == NULL &&
288 (sb->s_flags & MS_RDONLY))
289 return;
290
291 errstr = ext4_decode_error(sb, errno, nbuf);
292 printk (KERN_CRIT "EXT4-fs error (device %s) in %s: %s\n",
293 sb->s_id, function, errstr);
294
295 ext4_handle_error(sb);
296}
297
298/*
299 * ext4_abort is a much stronger failure handler than ext4_error. The
300 * abort function may be used to deal with unrecoverable failures such
301 * as journal IO errors or ENOMEM at a critical moment in log management.
302 *
303 * We unconditionally force the filesystem into an ABORT|READONLY state,
304 * unless the error response on the fs has been set to panic in which
305 * case we take the easy way out and panic immediately.
306 */
307
308void ext4_abort (struct super_block * sb, const char * function,
309 const char * fmt, ...)
310{
311 va_list args;
312
313 printk (KERN_CRIT "ext4_abort called.\n");
314
315 va_start(args, fmt);
316	printk(KERN_CRIT "EXT4-fs error (device %s): %s: ", sb->s_id, function);
317 vprintk(fmt, args);
318 printk("\n");
319 va_end(args);
320
321 if (test_opt(sb, ERRORS_PANIC))
322 panic("EXT4-fs panic from previous error\n");
323
324 if (sb->s_flags & MS_RDONLY)
325 return;
326
327 printk(KERN_CRIT "Remounting filesystem read-only\n");
328 EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
329 sb->s_flags |= MS_RDONLY;
330 EXT4_SB(sb)->s_mount_opt |= EXT4_MOUNT_ABORT;
331 jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
332}
333
334void ext4_warning (struct super_block * sb, const char * function,
335 const char * fmt, ...)
336{
337 va_list args;
338
339 va_start(args, fmt);
340 printk(KERN_WARNING "EXT4-fs warning (device %s): %s: ",
341 sb->s_id, function);
342 vprintk(fmt, args);
343 printk("\n");
344 va_end(args);
345}
346
347void ext4_update_dynamic_rev(struct super_block *sb)
348{
349 struct ext4_super_block *es = EXT4_SB(sb)->s_es;
350
351 if (le32_to_cpu(es->s_rev_level) > EXT4_GOOD_OLD_REV)
352 return;
353
354 ext4_warning(sb, __FUNCTION__,
355 "updating to rev %d because of new feature flag, "
356 "running e2fsck is recommended",
357 EXT4_DYNAMIC_REV);
358
359 es->s_first_ino = cpu_to_le32(EXT4_GOOD_OLD_FIRST_INO);
360 es->s_inode_size = cpu_to_le16(EXT4_GOOD_OLD_INODE_SIZE);
361 es->s_rev_level = cpu_to_le32(EXT4_DYNAMIC_REV);
362 /* leave es->s_feature_*compat flags alone */
363 /* es->s_uuid will be set by e2fsck if empty */
364
365 /*
366 * The rest of the superblock fields should be zero, and if not it
367 * means they are likely already in use, so leave them alone. We
368 * can leave it up to e2fsck to clean up any inconsistencies there.
369 */
370}
371
372/*
373 * Open the external journal device
374 */
375static struct block_device *ext4_blkdev_get(dev_t dev)
376{
377 struct block_device *bdev;
378 char b[BDEVNAME_SIZE];
379
380 bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE);
381 if (IS_ERR(bdev))
382 goto fail;
383 return bdev;
384
385fail:
386 printk(KERN_ERR "EXT4: failed to open journal device %s: %ld\n",
387 __bdevname(dev, b), PTR_ERR(bdev));
388 return NULL;
389}
390
391/*
392 * Release the journal device
393 */
394static int ext4_blkdev_put(struct block_device *bdev)
395{
396 bd_release(bdev);
397 return blkdev_put(bdev);
398}
399
400static int ext4_blkdev_remove(struct ext4_sb_info *sbi)
401{
402 struct block_device *bdev;
403 int ret = -ENODEV;
404
405 bdev = sbi->journal_bdev;
406 if (bdev) {
407 ret = ext4_blkdev_put(bdev);
408 sbi->journal_bdev = NULL;
409 }
410 return ret;
411}
412
413static inline struct inode *orphan_list_entry(struct list_head *l)
414{
415 return &list_entry(l, struct ext4_inode_info, i_orphan)->vfs_inode;
416}
417
418static void dump_orphan_list(struct super_block *sb, struct ext4_sb_info *sbi)
419{
420 struct list_head *l;
421
422 printk(KERN_ERR "sb orphan head is %d\n",
423 le32_to_cpu(sbi->s_es->s_last_orphan));
424
425 printk(KERN_ERR "sb_info orphan list:\n");
426 list_for_each(l, &sbi->s_orphan) {
427 struct inode *inode = orphan_list_entry(l);
428 printk(KERN_ERR " "
429 "inode %s:%lu at %p: mode %o, nlink %d, next %d\n",
430 inode->i_sb->s_id, inode->i_ino, inode,
431 inode->i_mode, inode->i_nlink,
432 NEXT_ORPHAN(inode));
433 }
434}
435
436static void ext4_put_super (struct super_block * sb)
437{
438 struct ext4_sb_info *sbi = EXT4_SB(sb);
439 struct ext4_super_block *es = sbi->s_es;
440 int i;
441
442 ext4_ext_release(sb);
443 ext4_xattr_put_super(sb);
444 jbd2_journal_destroy(sbi->s_journal);
445 if (!(sb->s_flags & MS_RDONLY)) {
446 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
447 es->s_state = cpu_to_le16(sbi->s_mount_state);
448 BUFFER_TRACE(sbi->s_sbh, "marking dirty");
449 mark_buffer_dirty(sbi->s_sbh);
450 ext4_commit_super(sb, es, 1);
451 }
452
453 for (i = 0; i < sbi->s_gdb_count; i++)
454 brelse(sbi->s_group_desc[i]);
455 kfree(sbi->s_group_desc);
456 percpu_counter_destroy(&sbi->s_freeblocks_counter);
457 percpu_counter_destroy(&sbi->s_freeinodes_counter);
458 percpu_counter_destroy(&sbi->s_dirs_counter);
459 brelse(sbi->s_sbh);
460#ifdef CONFIG_QUOTA
461 for (i = 0; i < MAXQUOTAS; i++)
462 kfree(sbi->s_qf_names[i]);
463#endif
464
465 /* Debugging code just in case the in-memory inode orphan list
466 * isn't empty. The on-disk one can be non-empty if we've
467 * detected an error and taken the fs readonly, but the
468 * in-memory list had better be clean by this point. */
469 if (!list_empty(&sbi->s_orphan))
470 dump_orphan_list(sb, sbi);
471 J_ASSERT(list_empty(&sbi->s_orphan));
472
473 invalidate_bdev(sb->s_bdev, 0);
474 if (sbi->journal_bdev && sbi->journal_bdev != sb->s_bdev) {
475 /*
476 * Invalidate the journal device's buffers. We don't want them
477 * floating about in memory - the physical journal device may
478		 * be hotswapped, and it breaks the `ro-after' testing code.
479 */
480 sync_blockdev(sbi->journal_bdev);
481 invalidate_bdev(sbi->journal_bdev, 0);
482 ext4_blkdev_remove(sbi);
483 }
484 sb->s_fs_info = NULL;
485 kfree(sbi);
486 return;
487}
488
489static kmem_cache_t *ext4_inode_cachep;
490
491/*
492 * Called inside transaction, so use GFP_NOFS
493 */
494static struct inode *ext4_alloc_inode(struct super_block *sb)
495{
496 struct ext4_inode_info *ei;
497
498 ei = kmem_cache_alloc(ext4_inode_cachep, SLAB_NOFS);
499 if (!ei)
500 return NULL;
501#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
502 ei->i_acl = EXT4_ACL_NOT_CACHED;
503 ei->i_default_acl = EXT4_ACL_NOT_CACHED;
504#endif
505 ei->i_block_alloc_info = NULL;
506 ei->vfs_inode.i_version = 1;
507 memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache));
508 return &ei->vfs_inode;
509}
510
511static void ext4_destroy_inode(struct inode *inode)
512{
513 kmem_cache_free(ext4_inode_cachep, EXT4_I(inode));
514}
515
516static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
517{
518 struct ext4_inode_info *ei = (struct ext4_inode_info *) foo;
519
520 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
521 SLAB_CTOR_CONSTRUCTOR) {
522 INIT_LIST_HEAD(&ei->i_orphan);
523#ifdef CONFIG_EXT4DEV_FS_XATTR
524 init_rwsem(&ei->xattr_sem);
525#endif
526 mutex_init(&ei->truncate_mutex);
527 inode_init_once(&ei->vfs_inode);
528 }
529}
530
531static int init_inodecache(void)
532{
533 ext4_inode_cachep = kmem_cache_create("ext4_inode_cache",
534 sizeof(struct ext4_inode_info),
535 0, (SLAB_RECLAIM_ACCOUNT|
536 SLAB_MEM_SPREAD),
537 init_once, NULL);
538 if (ext4_inode_cachep == NULL)
539 return -ENOMEM;
540 return 0;
541}
542
543static void destroy_inodecache(void)
544{
545 kmem_cache_destroy(ext4_inode_cachep);
546}
547
548static void ext4_clear_inode(struct inode *inode)
549{
550 struct ext4_block_alloc_info *rsv = EXT4_I(inode)->i_block_alloc_info;
551#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
552 if (EXT4_I(inode)->i_acl &&
553 EXT4_I(inode)->i_acl != EXT4_ACL_NOT_CACHED) {
554 posix_acl_release(EXT4_I(inode)->i_acl);
555 EXT4_I(inode)->i_acl = EXT4_ACL_NOT_CACHED;
556 }
557 if (EXT4_I(inode)->i_default_acl &&
558 EXT4_I(inode)->i_default_acl != EXT4_ACL_NOT_CACHED) {
559 posix_acl_release(EXT4_I(inode)->i_default_acl);
560 EXT4_I(inode)->i_default_acl = EXT4_ACL_NOT_CACHED;
561 }
562#endif
563 ext4_discard_reservation(inode);
564 EXT4_I(inode)->i_block_alloc_info = NULL;
565 if (unlikely(rsv))
566 kfree(rsv);
567}
568
569static inline void ext4_show_quota_options(struct seq_file *seq, struct super_block *sb)
570{
571#if defined(CONFIG_QUOTA)
572 struct ext4_sb_info *sbi = EXT4_SB(sb);
573
574 if (sbi->s_jquota_fmt)
575 seq_printf(seq, ",jqfmt=%s",
576 (sbi->s_jquota_fmt == QFMT_VFS_OLD) ? "vfsold": "vfsv0");
577
578 if (sbi->s_qf_names[USRQUOTA])
579 seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]);
580
581 if (sbi->s_qf_names[GRPQUOTA])
582 seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]);
583
584 if (sbi->s_mount_opt & EXT4_MOUNT_USRQUOTA)
585 seq_puts(seq, ",usrquota");
586
587 if (sbi->s_mount_opt & EXT4_MOUNT_GRPQUOTA)
588 seq_puts(seq, ",grpquota");
589#endif
590}
591
592static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
593{
594 struct super_block *sb = vfs->mnt_sb;
595
596 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
597 seq_puts(seq, ",data=journal");
598 else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
599 seq_puts(seq, ",data=ordered");
600 else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
601 seq_puts(seq, ",data=writeback");
602
603 ext4_show_quota_options(seq, sb);
604
605 return 0;
606}
607
608
609static struct dentry *ext4_get_dentry(struct super_block *sb, void *vobjp)
610{
611 __u32 *objp = vobjp;
612 unsigned long ino = objp[0];
613 __u32 generation = objp[1];
614 struct inode *inode;
615 struct dentry *result;
616
617 if (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO)
618 return ERR_PTR(-ESTALE);
619 if (ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))
620 return ERR_PTR(-ESTALE);
621
622 /* iget isn't really right if the inode is currently unallocated!!
623 *
624 * ext4_read_inode will return a bad_inode if the inode had been
625 * deleted, so we should be safe.
626 *
627 * Currently we don't know the generation for parent directory, so
628 * a generation of 0 means "accept any"
629 */
630 inode = iget(sb, ino);
631 if (inode == NULL)
632 return ERR_PTR(-ENOMEM);
633 if (is_bad_inode(inode) ||
634 (generation && inode->i_generation != generation)) {
635 iput(inode);
636 return ERR_PTR(-ESTALE);
637 }
638 /* now to find a dentry.
639 * If possible, get a well-connected one
640 */
641 result = d_alloc_anon(inode);
642 if (!result) {
643 iput(inode);
644 return ERR_PTR(-ENOMEM);
645 }
646 return result;
647}
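
/*
 * For reference: vobjp is the decoded NFS file handle, laid out as
 * { inode number, generation }. A handle of { 12, 0 }, for instance,
 * means "inode 12, any generation" -- the generation-0 wildcard the
 * comment above describes for parent directories.
 */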
648
649#ifdef CONFIG_QUOTA
650#define QTYPE2NAME(t) ((t)==USRQUOTA?"user":"group")
651#define QTYPE2MOPT(on, t) ((t)==USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA))
652
653static int ext4_dquot_initialize(struct inode *inode, int type);
654static int ext4_dquot_drop(struct inode *inode);
655static int ext4_write_dquot(struct dquot *dquot);
656static int ext4_acquire_dquot(struct dquot *dquot);
657static int ext4_release_dquot(struct dquot *dquot);
658static int ext4_mark_dquot_dirty(struct dquot *dquot);
659static int ext4_write_info(struct super_block *sb, int type);
660static int ext4_quota_on(struct super_block *sb, int type, int format_id, char *path);
661static int ext4_quota_on_mount(struct super_block *sb, int type);
662static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
663 size_t len, loff_t off);
664static ssize_t ext4_quota_write(struct super_block *sb, int type,
665 const char *data, size_t len, loff_t off);
666
667static struct dquot_operations ext4_quota_operations = {
668 .initialize = ext4_dquot_initialize,
669 .drop = ext4_dquot_drop,
670 .alloc_space = dquot_alloc_space,
671 .alloc_inode = dquot_alloc_inode,
672 .free_space = dquot_free_space,
673 .free_inode = dquot_free_inode,
674 .transfer = dquot_transfer,
675 .write_dquot = ext4_write_dquot,
676 .acquire_dquot = ext4_acquire_dquot,
677 .release_dquot = ext4_release_dquot,
678 .mark_dirty = ext4_mark_dquot_dirty,
679 .write_info = ext4_write_info
680};
681
682static struct quotactl_ops ext4_qctl_operations = {
683 .quota_on = ext4_quota_on,
684 .quota_off = vfs_quota_off,
685 .quota_sync = vfs_quota_sync,
686 .get_info = vfs_get_dqinfo,
687 .set_info = vfs_set_dqinfo,
688 .get_dqblk = vfs_get_dqblk,
689 .set_dqblk = vfs_set_dqblk
690};
691#endif
692
693static struct super_operations ext4_sops = {
694 .alloc_inode = ext4_alloc_inode,
695 .destroy_inode = ext4_destroy_inode,
696 .read_inode = ext4_read_inode,
697 .write_inode = ext4_write_inode,
698 .dirty_inode = ext4_dirty_inode,
699 .delete_inode = ext4_delete_inode,
700 .put_super = ext4_put_super,
701 .write_super = ext4_write_super,
702 .sync_fs = ext4_sync_fs,
703 .write_super_lockfs = ext4_write_super_lockfs,
704 .unlockfs = ext4_unlockfs,
705 .statfs = ext4_statfs,
706 .remount_fs = ext4_remount,
707 .clear_inode = ext4_clear_inode,
708 .show_options = ext4_show_options,
709#ifdef CONFIG_QUOTA
710 .quota_read = ext4_quota_read,
711 .quota_write = ext4_quota_write,
712#endif
713};
714
715static struct export_operations ext4_export_ops = {
716 .get_parent = ext4_get_parent,
717 .get_dentry = ext4_get_dentry,
718};
719
720enum {
721 Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid,
722 Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro,
723 Opt_nouid32, Opt_nocheck, Opt_debug, Opt_oldalloc, Opt_orlov,
724 Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
725 Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh,
726 Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev,
727 Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
728 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
729 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
730 Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
731 Opt_grpquota, Opt_extents,
732};
733
734static match_table_t tokens = {
735 {Opt_bsd_df, "bsddf"},
736 {Opt_minix_df, "minixdf"},
737 {Opt_grpid, "grpid"},
738 {Opt_grpid, "bsdgroups"},
739 {Opt_nogrpid, "nogrpid"},
740 {Opt_nogrpid, "sysvgroups"},
741 {Opt_resgid, "resgid=%u"},
742 {Opt_resuid, "resuid=%u"},
743 {Opt_sb, "sb=%u"},
744 {Opt_err_cont, "errors=continue"},
745 {Opt_err_panic, "errors=panic"},
746 {Opt_err_ro, "errors=remount-ro"},
747 {Opt_nouid32, "nouid32"},
748 {Opt_nocheck, "nocheck"},
749 {Opt_nocheck, "check=none"},
750 {Opt_debug, "debug"},
751 {Opt_oldalloc, "oldalloc"},
752 {Opt_orlov, "orlov"},
753 {Opt_user_xattr, "user_xattr"},
754 {Opt_nouser_xattr, "nouser_xattr"},
755 {Opt_acl, "acl"},
756 {Opt_noacl, "noacl"},
757 {Opt_reservation, "reservation"},
758 {Opt_noreservation, "noreservation"},
759 {Opt_noload, "noload"},
760 {Opt_nobh, "nobh"},
761 {Opt_bh, "bh"},
762 {Opt_commit, "commit=%u"},
763 {Opt_journal_update, "journal=update"},
764 {Opt_journal_inum, "journal=%u"},
765 {Opt_journal_dev, "journal_dev=%u"},
766 {Opt_abort, "abort"},
767 {Opt_data_journal, "data=journal"},
768 {Opt_data_ordered, "data=ordered"},
769 {Opt_data_writeback, "data=writeback"},
770 {Opt_offusrjquota, "usrjquota="},
771 {Opt_usrjquota, "usrjquota=%s"},
772 {Opt_offgrpjquota, "grpjquota="},
773 {Opt_grpjquota, "grpjquota=%s"},
774 {Opt_jqfmt_vfsold, "jqfmt=vfsold"},
775 {Opt_jqfmt_vfsv0, "jqfmt=vfsv0"},
776 {Opt_grpquota, "grpquota"},
777 {Opt_noquota, "noquota"},
778 {Opt_quota, "quota"},
779 {Opt_usrquota, "usrquota"},
780 {Opt_barrier, "barrier=%u"},
781 {Opt_extents, "extents"},
782	{Opt_resize, "resize"},
783	{Opt_err, NULL},
784};
785
786static ext4_fsblk_t get_sb_block(void **data)
787{
788 ext4_fsblk_t sb_block;
789 char *options = (char *) *data;
790
791 if (!options || strncmp(options, "sb=", 3) != 0)
792 return 1; /* Default location */
793 options += 3;
794	/* todo: use simple_strtoll with >32bit ext4 */
795 sb_block = simple_strtoul(options, &options, 0);
796 if (*options && *options != ',') {
797 printk("EXT4-fs: Invalid sb specification: %s\n",
798 (char *) *data);
799 return 1;
800 }
801 if (*options == ',')
802 options++;
803 *data = (void *) options;
804 return sb_block;
805}
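
/*
 * Illustrative call: for *data pointing at "sb=32768,errors=panic"
 * this returns 32768 (a typical backup superblock location on a
 * 4KB-block filesystem) and leaves *data at "errors=panic" for
 * parse_options() to consume.
 */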
806
807static int parse_options (char *options, struct super_block *sb,
808 unsigned int *inum, unsigned long *journal_devnum,
809 ext4_fsblk_t *n_blocks_count, int is_remount)
810{
811 struct ext4_sb_info *sbi = EXT4_SB(sb);
812 char * p;
813 substring_t args[MAX_OPT_ARGS];
814 int data_opt = 0;
815 int option;
816#ifdef CONFIG_QUOTA
817 int qtype;
818 char *qname;
819#endif
820
821 if (!options)
822 return 1;
823
824 while ((p = strsep (&options, ",")) != NULL) {
825 int token;
826 if (!*p)
827 continue;
828
829 token = match_token(p, tokens, args);
830 switch (token) {
831 case Opt_bsd_df:
832 clear_opt (sbi->s_mount_opt, MINIX_DF);
833 break;
834 case Opt_minix_df:
835 set_opt (sbi->s_mount_opt, MINIX_DF);
836 break;
837 case Opt_grpid:
838 set_opt (sbi->s_mount_opt, GRPID);
839 break;
840 case Opt_nogrpid:
841 clear_opt (sbi->s_mount_opt, GRPID);
842 break;
843 case Opt_resuid:
844 if (match_int(&args[0], &option))
845 return 0;
846 sbi->s_resuid = option;
847 break;
848 case Opt_resgid:
849 if (match_int(&args[0], &option))
850 return 0;
851 sbi->s_resgid = option;
852 break;
853 case Opt_sb:
854 /* handled by get_sb_block() instead of here */
855 /* *sb_block = match_int(&args[0]); */
856 break;
857 case Opt_err_panic:
858 clear_opt (sbi->s_mount_opt, ERRORS_CONT);
859 clear_opt (sbi->s_mount_opt, ERRORS_RO);
860 set_opt (sbi->s_mount_opt, ERRORS_PANIC);
861 break;
862 case Opt_err_ro:
863 clear_opt (sbi->s_mount_opt, ERRORS_CONT);
864 clear_opt (sbi->s_mount_opt, ERRORS_PANIC);
865 set_opt (sbi->s_mount_opt, ERRORS_RO);
866 break;
867 case Opt_err_cont:
868 clear_opt (sbi->s_mount_opt, ERRORS_RO);
869 clear_opt (sbi->s_mount_opt, ERRORS_PANIC);
870 set_opt (sbi->s_mount_opt, ERRORS_CONT);
871 break;
872 case Opt_nouid32:
873 set_opt (sbi->s_mount_opt, NO_UID32);
874 break;
875 case Opt_nocheck:
876 clear_opt (sbi->s_mount_opt, CHECK);
877 break;
878 case Opt_debug:
879 set_opt (sbi->s_mount_opt, DEBUG);
880 break;
881 case Opt_oldalloc:
882 set_opt (sbi->s_mount_opt, OLDALLOC);
883 break;
884 case Opt_orlov:
885 clear_opt (sbi->s_mount_opt, OLDALLOC);
886 break;
887#ifdef CONFIG_EXT4DEV_FS_XATTR
888 case Opt_user_xattr:
889 set_opt (sbi->s_mount_opt, XATTR_USER);
890 break;
891 case Opt_nouser_xattr:
892 clear_opt (sbi->s_mount_opt, XATTR_USER);
893 break;
894#else
895 case Opt_user_xattr:
896 case Opt_nouser_xattr:
897 printk("EXT4 (no)user_xattr options not supported\n");
898 break;
899#endif
900#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
901 case Opt_acl:
902 set_opt(sbi->s_mount_opt, POSIX_ACL);
903 break;
904 case Opt_noacl:
905 clear_opt(sbi->s_mount_opt, POSIX_ACL);
906 break;
907#else
908 case Opt_acl:
909 case Opt_noacl:
910 printk("EXT4 (no)acl options not supported\n");
911 break;
912#endif
913 case Opt_reservation:
914 set_opt(sbi->s_mount_opt, RESERVATION);
915 break;
916 case Opt_noreservation:
917 clear_opt(sbi->s_mount_opt, RESERVATION);
918 break;
919 case Opt_journal_update:
920 /* @@@ FIXME */
921 /* Eventually we will want to be able to create
922 a journal file here. For now, only allow the
923 user to specify an existing inode to be the
924 journal file. */
925 if (is_remount) {
926 printk(KERN_ERR "EXT4-fs: cannot specify "
927 "journal on remount\n");
928 return 0;
929 }
930 set_opt (sbi->s_mount_opt, UPDATE_JOURNAL);
931 break;
932 case Opt_journal_inum:
933 if (is_remount) {
934 printk(KERN_ERR "EXT4-fs: cannot specify "
935 "journal on remount\n");
936 return 0;
937 }
938 if (match_int(&args[0], &option))
939 return 0;
940 *inum = option;
941 break;
942 case Opt_journal_dev:
943 if (is_remount) {
944 printk(KERN_ERR "EXT4-fs: cannot specify "
945 "journal on remount\n");
946 return 0;
947 }
948 if (match_int(&args[0], &option))
949 return 0;
950 *journal_devnum = option;
951 break;
952 case Opt_noload:
953 set_opt (sbi->s_mount_opt, NOLOAD);
954 break;
955 case Opt_commit:
956 if (match_int(&args[0], &option))
957 return 0;
958 if (option < 0)
959 return 0;
960 if (option == 0)
961 option = JBD_DEFAULT_MAX_COMMIT_AGE;
962 sbi->s_commit_interval = HZ * option;
963 break;
964 case Opt_data_journal:
965 data_opt = EXT4_MOUNT_JOURNAL_DATA;
966 goto datacheck;
967 case Opt_data_ordered:
968 data_opt = EXT4_MOUNT_ORDERED_DATA;
969 goto datacheck;
970 case Opt_data_writeback:
971 data_opt = EXT4_MOUNT_WRITEBACK_DATA;
972 datacheck:
973 if (is_remount) {
974 if ((sbi->s_mount_opt & EXT4_MOUNT_DATA_FLAGS)
975 != data_opt) {
976 printk(KERN_ERR
977 "EXT4-fs: cannot change data "
978 "mode on remount\n");
979 return 0;
980 }
981 } else {
982 sbi->s_mount_opt &= ~EXT4_MOUNT_DATA_FLAGS;
983 sbi->s_mount_opt |= data_opt;
984 }
985 break;
986#ifdef CONFIG_QUOTA
987 case Opt_usrjquota:
988 qtype = USRQUOTA;
989 goto set_qf_name;
990 case Opt_grpjquota:
991 qtype = GRPQUOTA;
992set_qf_name:
993 if (sb_any_quota_enabled(sb)) {
994 printk(KERN_ERR
995 "EXT4-fs: Cannot change journalled "
996 "quota options when quota turned on.\n");
997 return 0;
998 }
999 qname = match_strdup(&args[0]);
1000 if (!qname) {
1001 printk(KERN_ERR
1002 "EXT4-fs: not enough memory for "
1003 "storing quotafile name.\n");
1004 return 0;
1005 }
1006 if (sbi->s_qf_names[qtype] &&
1007 strcmp(sbi->s_qf_names[qtype], qname)) {
1008 printk(KERN_ERR
1009 "EXT4-fs: %s quota file already "
1010 "specified.\n", QTYPE2NAME(qtype));
1011 kfree(qname);
1012 return 0;
1013 }
1014 sbi->s_qf_names[qtype] = qname;
1015 if (strchr(sbi->s_qf_names[qtype], '/')) {
1016 printk(KERN_ERR
1017 "EXT4-fs: quotafile must be on "
1018 "filesystem root.\n");
1019 kfree(sbi->s_qf_names[qtype]);
1020 sbi->s_qf_names[qtype] = NULL;
1021 return 0;
1022 }
1023 set_opt(sbi->s_mount_opt, QUOTA);
1024 break;
1025 case Opt_offusrjquota:
1026 qtype = USRQUOTA;
1027 goto clear_qf_name;
1028 case Opt_offgrpjquota:
1029 qtype = GRPQUOTA;
1030clear_qf_name:
1031 if (sb_any_quota_enabled(sb)) {
1032 printk(KERN_ERR "EXT4-fs: Cannot change "
1033 "journalled quota options when "
1034 "quota turned on.\n");
1035 return 0;
1036 }
1037 /*
1038 * The space will be released later when all options
1039 * are confirmed to be correct
1040 */
1041 sbi->s_qf_names[qtype] = NULL;
1042 break;
1043 case Opt_jqfmt_vfsold:
1044 sbi->s_jquota_fmt = QFMT_VFS_OLD;
1045 break;
1046 case Opt_jqfmt_vfsv0:
1047 sbi->s_jquota_fmt = QFMT_VFS_V0;
1048 break;
1049 case Opt_quota:
1050 case Opt_usrquota:
1051 set_opt(sbi->s_mount_opt, QUOTA);
1052 set_opt(sbi->s_mount_opt, USRQUOTA);
1053 break;
1054 case Opt_grpquota:
1055 set_opt(sbi->s_mount_opt, QUOTA);
1056 set_opt(sbi->s_mount_opt, GRPQUOTA);
1057 break;
1058 case Opt_noquota:
1059 if (sb_any_quota_enabled(sb)) {
1060 printk(KERN_ERR "EXT4-fs: Cannot change quota "
1061 "options when quota turned on.\n");
1062 return 0;
1063 }
1064 clear_opt(sbi->s_mount_opt, QUOTA);
1065 clear_opt(sbi->s_mount_opt, USRQUOTA);
1066 clear_opt(sbi->s_mount_opt, GRPQUOTA);
1067 break;
1068#else
1069 case Opt_quota:
1070 case Opt_usrquota:
1071 case Opt_grpquota:
1072 case Opt_usrjquota:
1073 case Opt_grpjquota:
1074 case Opt_offusrjquota:
1075 case Opt_offgrpjquota:
1076 case Opt_jqfmt_vfsold:
1077 case Opt_jqfmt_vfsv0:
1078 printk(KERN_ERR
1079 "EXT4-fs: journalled quota options not "
1080 "supported.\n");
1081 break;
1082 case Opt_noquota:
1083 break;
1084#endif
1085 case Opt_abort:
1086 set_opt(sbi->s_mount_opt, ABORT);
1087 break;
1088 case Opt_barrier:
1089 if (match_int(&args[0], &option))
1090 return 0;
1091 if (option)
1092 set_opt(sbi->s_mount_opt, BARRIER);
1093 else
1094 clear_opt(sbi->s_mount_opt, BARRIER);
1095 break;
1096 case Opt_ignore:
1097 break;
1098 case Opt_resize:
1099 if (!is_remount) {
1100 printk("EXT4-fs: resize option only available "
1101 "for remount\n");
1102 return 0;
1103 }
1104 if (match_int(&args[0], &option) != 0)
1105 return 0;
1106 *n_blocks_count = option;
1107 break;
1108 case Opt_nobh:
1109 set_opt(sbi->s_mount_opt, NOBH);
1110 break;
1111 case Opt_bh:
1112 clear_opt(sbi->s_mount_opt, NOBH);
1113 break;
1114 case Opt_extents:
1115 set_opt (sbi->s_mount_opt, EXTENTS);
1116 break;
1117 default:
1118 printk (KERN_ERR
1119 "EXT4-fs: Unrecognized mount option \"%s\" "
1120 "or missing value\n", p);
1121 return 0;
1122 }
1123 }
1124#ifdef CONFIG_QUOTA
1125 if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
1126 if ((sbi->s_mount_opt & EXT4_MOUNT_USRQUOTA) &&
1127 sbi->s_qf_names[USRQUOTA])
1128 clear_opt(sbi->s_mount_opt, USRQUOTA);
1129
1130 if ((sbi->s_mount_opt & EXT4_MOUNT_GRPQUOTA) &&
1131 sbi->s_qf_names[GRPQUOTA])
1132 clear_opt(sbi->s_mount_opt, GRPQUOTA);
1133
1134 if ((sbi->s_qf_names[USRQUOTA] &&
1135 (sbi->s_mount_opt & EXT4_MOUNT_GRPQUOTA)) ||
1136 (sbi->s_qf_names[GRPQUOTA] &&
1137 (sbi->s_mount_opt & EXT4_MOUNT_USRQUOTA))) {
1138 printk(KERN_ERR "EXT4-fs: old and new quota "
1139 "format mixing.\n");
1140 return 0;
1141 }
1142
1143 if (!sbi->s_jquota_fmt) {
1144 printk(KERN_ERR "EXT4-fs: journalled quota format "
1145 "not specified.\n");
1146 return 0;
1147 }
1148 } else {
1149 if (sbi->s_jquota_fmt) {
1150 printk(KERN_ERR "EXT4-fs: journalled quota format "
1151 "specified with no journalling "
1152 "enabled.\n");
1153 return 0;
1154 }
1155 }
1156#endif
1157 return 1;
1158}
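
/*
 * Illustrative walk-through: an option string such as
 * "data=ordered,errors=remount-ro,commit=15" takes three trips through
 * the strsep() loop above. data=ordered reaches the datacheck label
 * and sets EXT4_MOUNT_ORDERED_DATA, errors=remount-ro clears
 * ERRORS_CONT/ERRORS_PANIC and sets ERRORS_RO, and commit=15 sets
 * s_commit_interval to 15 * HZ.
 */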
1159
1160static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
1161 int read_only)
1162{
1163 struct ext4_sb_info *sbi = EXT4_SB(sb);
1164 int res = 0;
1165
1166 if (le32_to_cpu(es->s_rev_level) > EXT4_MAX_SUPP_REV) {
1167 printk (KERN_ERR "EXT4-fs warning: revision level too high, "
1168 "forcing read-only mode\n");
1169 res = MS_RDONLY;
1170 }
1171 if (read_only)
1172 return res;
1173 if (!(sbi->s_mount_state & EXT4_VALID_FS))
1174 printk (KERN_WARNING "EXT4-fs warning: mounting unchecked fs, "
1175 "running e2fsck is recommended\n");
1176 else if ((sbi->s_mount_state & EXT4_ERROR_FS))
1177 printk (KERN_WARNING
1178 "EXT4-fs warning: mounting fs with errors, "
1179 "running e2fsck is recommended\n");
1180 else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 &&
1181 le16_to_cpu(es->s_mnt_count) >=
1182 (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count))
1183 printk (KERN_WARNING
1184 "EXT4-fs warning: maximal mount count reached, "
1185 "running e2fsck is recommended\n");
1186 else if (le32_to_cpu(es->s_checkinterval) &&
1187 (le32_to_cpu(es->s_lastcheck) +
1188 le32_to_cpu(es->s_checkinterval) <= get_seconds()))
1189 printk (KERN_WARNING
1190 "EXT4-fs warning: checktime reached, "
1191 "running e2fsck is recommended\n");
1192#if 0
1193 /* @@@ We _will_ want to clear the valid bit if we find
1194 * inconsistencies, to force a fsck at reboot. But for
1195 * a plain journaled filesystem we can keep it set as
1196 * valid forever! :)
1197 */
1198 es->s_state = cpu_to_le16(le16_to_cpu(es->s_state) & ~EXT4_VALID_FS);
1199#endif
1200 if (!(__s16) le16_to_cpu(es->s_max_mnt_count))
1201 es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT);
1202	es->s_mnt_count = cpu_to_le16(le16_to_cpu(es->s_mnt_count) + 1);
1203 es->s_mtime = cpu_to_le32(get_seconds());
1204 ext4_update_dynamic_rev(sb);
1205 EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
1206
1207 ext4_commit_super(sb, es, 1);
1208 if (test_opt(sb, DEBUG))
1209 printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%lu, "
1210 "bpg=%lu, ipg=%lu, mo=%04lx]\n",
1211 sb->s_blocksize,
1212 sbi->s_groups_count,
1213 EXT4_BLOCKS_PER_GROUP(sb),
1214 EXT4_INODES_PER_GROUP(sb),
1215 sbi->s_mount_opt);
1216
1217 printk(KERN_INFO "EXT4 FS on %s, ", sb->s_id);
1218 if (EXT4_SB(sb)->s_journal->j_inode == NULL) {
1219 char b[BDEVNAME_SIZE];
1220
1221 printk("external journal on %s\n",
1222 bdevname(EXT4_SB(sb)->s_journal->j_dev, b));
1223 } else {
1224 printk("internal journal\n");
1225 }
1226 return res;
1227}
1228
1229/* Called at mount-time, super-block is locked */
1230static int ext4_check_descriptors (struct super_block * sb)
1231{
1232 struct ext4_sb_info *sbi = EXT4_SB(sb);
1233 ext4_fsblk_t first_block = le32_to_cpu(sbi->s_es->s_first_data_block);
1234 ext4_fsblk_t last_block;
1235 ext4_fsblk_t block_bitmap;
1236 ext4_fsblk_t inode_bitmap;
1237 ext4_fsblk_t inode_table;
1238 struct ext4_group_desc * gdp = NULL;
1239 int desc_block = 0;
1240 int i;
1241
1242 ext4_debug ("Checking group descriptors");
1243
1244 for (i = 0; i < sbi->s_groups_count; i++)
1245 {
1246 if (i == sbi->s_groups_count - 1)
1247 last_block = ext4_blocks_count(sbi->s_es) - 1;
1248 else
1249 last_block = first_block +
1250 (EXT4_BLOCKS_PER_GROUP(sb) - 1);
1251
1252 if ((i % EXT4_DESC_PER_BLOCK(sb)) == 0)
1253 gdp = (struct ext4_group_desc *)
1254 sbi->s_group_desc[desc_block++]->b_data;
1255 block_bitmap = ext4_block_bitmap(sb, gdp);
1256 if (block_bitmap < first_block || block_bitmap > last_block)
1257 {
1258 ext4_error (sb, "ext4_check_descriptors",
1259 "Block bitmap for group %d"
1260 " not in group (block %llu)!",
1261 i, block_bitmap);
1262 return 0;
1263 }
1264 inode_bitmap = ext4_inode_bitmap(sb, gdp);
1265 if (inode_bitmap < first_block || inode_bitmap > last_block)
1266 {
1267 ext4_error (sb, "ext4_check_descriptors",
1268 "Inode bitmap for group %d"
1269 " not in group (block %llu)!",
1270 i, inode_bitmap);
1271 return 0;
1272 }
1273 inode_table = ext4_inode_table(sb, gdp);
1274 if (inode_table < first_block ||
1275 inode_table + sbi->s_itb_per_group > last_block)
1276 {
1277 ext4_error (sb, "ext4_check_descriptors",
1278 "Inode table for group %d"
1279 " not in group (block %llu)!",
1280 i, inode_table);
1281 return 0;
1282 }
1283 first_block += EXT4_BLOCKS_PER_GROUP(sb);
1284 gdp = (struct ext4_group_desc *)
1285 ((__u8 *)gdp + EXT4_DESC_SIZE(sb));
1286 }
1287
1288 ext4_free_blocks_count_set(sbi->s_es, ext4_count_free_blocks(sb));
1289	sbi->s_es->s_free_inodes_count = cpu_to_le32(ext4_count_free_inodes(sb));
1290 return 1;
1291}
1292
1293
1294/* ext4_orphan_cleanup() walks a singly-linked list of inodes (starting at
1295 * the superblock) which were deleted from all directories, but held open by
1296 * a process at the time of a crash. We walk the list and try to delete these
1297 * inodes at recovery time (only with a read-write filesystem).
1298 *
1299 * In order to keep the orphan inode chain consistent during traversal (in
1300 * case of crash during recovery), we link each inode into the superblock
1301 * orphan list_head and handle it the same way as an inode deletion during
1302 * normal operation (which journals the operations for us).
1303 *
1304 * We only do an iget() and an iput() on each inode, which is very safe if we
1305 * accidentally point at an in-use or already deleted inode. The worst that
1306 * can happen in this case is that we get a "bit already cleared" message from
1307 * ext4_free_inode(). The only reason we would point at a wrong inode is if
1308 * e2fsck was run on this filesystem, and it must have already done the orphan
1309 * inode cleanup for us, so we can safely abort without any further action.
1310 */
1311static void ext4_orphan_cleanup (struct super_block * sb,
1312 struct ext4_super_block * es)
1313{
1314 unsigned int s_flags = sb->s_flags;
1315 int nr_orphans = 0, nr_truncates = 0;
1316#ifdef CONFIG_QUOTA
1317 int i;
1318#endif
1319 if (!es->s_last_orphan) {
1320 jbd_debug(4, "no orphan inodes to clean up\n");
1321 return;
1322 }
1323
1324 if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) {
1325 if (es->s_last_orphan)
1326 jbd_debug(1, "Errors on filesystem, "
1327 "clearing orphan list.\n");
1328 es->s_last_orphan = 0;
1329 jbd_debug(1, "Skipping orphan recovery on fs with errors.\n");
1330 return;
1331 }
1332
1333 if (s_flags & MS_RDONLY) {
1334 printk(KERN_INFO "EXT4-fs: %s: orphan cleanup on readonly fs\n",
1335 sb->s_id);
1336 sb->s_flags &= ~MS_RDONLY;
1337 }
1338#ifdef CONFIG_QUOTA
1339 /* Needed for iput() to work correctly and not trash data */
1340 sb->s_flags |= MS_ACTIVE;
1341 /* Turn on quotas so that they are updated correctly */
1342 for (i = 0; i < MAXQUOTAS; i++) {
1343 if (EXT4_SB(sb)->s_qf_names[i]) {
1344 int ret = ext4_quota_on_mount(sb, i);
1345 if (ret < 0)
1346 printk(KERN_ERR
1347 "EXT4-fs: Cannot turn on journalled "
1348 "quota: error %d\n", ret);
1349 }
1350 }
1351#endif
1352
1353 while (es->s_last_orphan) {
1354 struct inode *inode;
1355
1356 if (!(inode =
1357 ext4_orphan_get(sb, le32_to_cpu(es->s_last_orphan)))) {
1358 es->s_last_orphan = 0;
1359 break;
1360 }
1361
1362 list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
1363 DQUOT_INIT(inode);
1364 if (inode->i_nlink) {
1365 printk(KERN_DEBUG
1366 "%s: truncating inode %lu to %Ld bytes\n",
1367 __FUNCTION__, inode->i_ino, inode->i_size);
1368 jbd_debug(2, "truncating inode %lu to %Ld bytes\n",
1369 inode->i_ino, inode->i_size);
1370 ext4_truncate(inode);
1371 nr_truncates++;
1372 } else {
1373 printk(KERN_DEBUG
1374 "%s: deleting unreferenced inode %lu\n",
1375 __FUNCTION__, inode->i_ino);
1376 jbd_debug(2, "deleting unreferenced inode %lu\n",
1377 inode->i_ino);
1378 nr_orphans++;
1379 }
1380 iput(inode); /* The delete magic happens here! */
1381 }
1382
1383#define PLURAL(x) (x), ((x)==1) ? "" : "s"
1384
1385 if (nr_orphans)
1386 printk(KERN_INFO "EXT4-fs: %s: %d orphan inode%s deleted\n",
1387 sb->s_id, PLURAL(nr_orphans));
1388 if (nr_truncates)
1389 printk(KERN_INFO "EXT4-fs: %s: %d truncate%s cleaned up\n",
1390 sb->s_id, PLURAL(nr_truncates));
1391#ifdef CONFIG_QUOTA
1392 /* Turn quotas off */
1393 for (i = 0; i < MAXQUOTAS; i++) {
1394 if (sb_dqopt(sb)->files[i])
1395 vfs_quota_off(sb, i);
1396 }
1397#endif
1398 sb->s_flags = s_flags; /* Restore MS_RDONLY status */
1399}
1400
1401#define log2(n) ffz(~(n))
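
/*
 * For a power-of-two n this computes log2 by finding the first zero
 * bit of the complement, e.g. log2(4096) == ffz(~4096) == 12; it is
 * not meaningful for non-power-of-two inputs.
 */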
1402
1403/*
1404 * Maximal file size. There is a direct, and {,double-,triple-}indirect
1405 * block limit, and also a limit of (2^32 - 1) 512-byte sectors in i_blocks.
1406 * We need to be 1 filesystem block less than the 2^32 sector limit.
1407 */
1408static loff_t ext4_max_size(int bits)
1409{
1410 loff_t res = EXT4_NDIR_BLOCKS;
1411 /* This constant is calculated to be the largest file size for a
1412 * dense, 4k-blocksize file such that the total number of
1413 * sectors in the file, including data and all indirect blocks,
1414 * does not exceed 2^32. */
1415 const loff_t upper_limit = 0x1ff7fffd000LL;
1416
1417 res += 1LL << (bits-2);
1418 res += 1LL << (2*(bits-2));
1419 res += 1LL << (3*(bits-2));
1420 res <<= bits;
1421 if (res > upper_limit)
1422 res = upper_limit;
1423 return res;
1424}
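
/*
 * Worked example, assuming 4KB blocks (bits == 12): res starts at 12
 * direct blocks, then gains 2^10 single-, 2^20 double- and 2^30
 * triple-indirect blocks, for 1,074,791,436 blocks (~4TB after the
 * shift by bits). That exceeds upper_limit (0x1ff7fffd000, just under
 * the 2^32 * 512-byte == 2TB sector cap once indirect-block sectors
 * are accounted for), so the result is clamped to roughly 2TB.
 */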
1425
1426static ext4_fsblk_t descriptor_loc(struct super_block *sb,
1427 ext4_fsblk_t logical_sb_block, int nr)
1428{
1429 struct ext4_sb_info *sbi = EXT4_SB(sb);
1430 unsigned long bg, first_meta_bg;
1431 int has_super = 0;
1432
1433 first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg);
1434
1435 if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG) ||
1436 nr < first_meta_bg)
1437 return logical_sb_block + nr + 1;
1438 bg = sbi->s_desc_per_block * nr;
1439 if (ext4_bg_has_super(sb, bg))
1440 has_super = 1;
1441 return (has_super + ext4_group_first_block_no(sb, bg));
1442}
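
/*
 * Illustrative numbers, assuming classic 32-byte descriptors on a
 * 4KB-block filesystem (s_desc_per_block == 128): without META_BG,
 * descriptor block nr == 3 is read from logical_sb_block + 4. With
 * META_BG and first_meta_bg <= 3, that block instead lives at the
 * first block of group 128 * 3 == 384, shifted up by one if that
 * group carries a superblock backup.
 */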
1443
1444
1445static int ext4_fill_super (struct super_block *sb, void *data, int silent)
1446{
1447 struct buffer_head * bh;
1448 struct ext4_super_block *es = NULL;
1449 struct ext4_sb_info *sbi;
1450 ext4_fsblk_t block;
1451 ext4_fsblk_t sb_block = get_sb_block(&data);
1452 ext4_fsblk_t logical_sb_block;
1453 unsigned long offset = 0;
1454 unsigned int journal_inum = 0;
1455 unsigned long journal_devnum = 0;
1456 unsigned long def_mount_opts;
1457 struct inode *root;
1458 int blocksize;
1459 int hblock;
1460 int db_count;
1461 int i;
1462 int needs_recovery;
1463 __le32 features;
1464 __u64 blocks_count;
1465
1466 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
1467 if (!sbi)
1468 return -ENOMEM;
1469 sb->s_fs_info = sbi;
1470 sbi->s_mount_opt = 0;
1471 sbi->s_resuid = EXT4_DEF_RESUID;
1472 sbi->s_resgid = EXT4_DEF_RESGID;
1473
1474 unlock_kernel();
1475
1476 blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE);
1477 if (!blocksize) {
1478 printk(KERN_ERR "EXT4-fs: unable to set blocksize\n");
1479 goto out_fail;
1480 }
1481
1482 /*
1483 * The ext4 superblock will not be buffer aligned for other than 1kB
1484 * block sizes. We need to calculate the offset from buffer start.
1485 */
1486 if (blocksize != EXT4_MIN_BLOCK_SIZE) {
1487 logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE;
1488 offset = do_div(logical_sb_block, blocksize);
1489 } else {
1490 logical_sb_block = sb_block;
1491 }
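	/*
	 * Worked example (editor's note): the superblock always lives
	 * 1024 bytes into the device, i.e. sb_block units of
	 * EXT4_MIN_BLOCK_SIZE. With the default sb_block = 1:
	 *   blocksize 1024: logical block 1, offset 0 (the else branch)
	 *   blocksize 2048: 1024 / 2048 -> logical block 0, offset 1024
	 *   blocksize 4096: 1024 / 4096 -> logical block 0, offset 1024
	 */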
1492
1493 if (!(bh = sb_bread(sb, logical_sb_block))) {
1494 printk (KERN_ERR "EXT4-fs: unable to read superblock\n");
1495 goto out_fail;
1496 }
1497 /*
1498 * Note: s_es must be initialized as soon as possible because
1499	 * some ext4 macros depend on its value
1500 */
1501 es = (struct ext4_super_block *) (((char *)bh->b_data) + offset);
1502 sbi->s_es = es;
1503 sb->s_magic = le16_to_cpu(es->s_magic);
1504 if (sb->s_magic != EXT4_SUPER_MAGIC)
1505 goto cantfind_ext4;
1506
1507 /* Set defaults before we parse the mount options */
1508 def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
1509 if (def_mount_opts & EXT4_DEFM_DEBUG)
1510 set_opt(sbi->s_mount_opt, DEBUG);
1511 if (def_mount_opts & EXT4_DEFM_BSDGROUPS)
1512 set_opt(sbi->s_mount_opt, GRPID);
1513 if (def_mount_opts & EXT4_DEFM_UID16)
1514 set_opt(sbi->s_mount_opt, NO_UID32);
1515 if (def_mount_opts & EXT4_DEFM_XATTR_USER)
1516 set_opt(sbi->s_mount_opt, XATTR_USER);
1517 if (def_mount_opts & EXT4_DEFM_ACL)
1518 set_opt(sbi->s_mount_opt, POSIX_ACL);
1519 if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA)
1520 sbi->s_mount_opt |= EXT4_MOUNT_JOURNAL_DATA;
1521 else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED)
1522 sbi->s_mount_opt |= EXT4_MOUNT_ORDERED_DATA;
1523 else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK)
1524 sbi->s_mount_opt |= EXT4_MOUNT_WRITEBACK_DATA;
1525
1526 if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC)
1527 set_opt(sbi->s_mount_opt, ERRORS_PANIC);
1528 else if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_RO)
1529 set_opt(sbi->s_mount_opt, ERRORS_RO);
1530 else
1531 set_opt(sbi->s_mount_opt, ERRORS_CONT);
1532
1533 sbi->s_resuid = le16_to_cpu(es->s_def_resuid);
1534 sbi->s_resgid = le16_to_cpu(es->s_def_resgid);
1535
1536 set_opt(sbi->s_mount_opt, RESERVATION);
1537
1538 if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum,
1539 NULL, 0))
1540 goto failed_mount;
1541
1542 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
1543 ((sbi->s_mount_opt & EXT4_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0);
1544
1545 if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV &&
1546 (EXT4_HAS_COMPAT_FEATURE(sb, ~0U) ||
1547 EXT4_HAS_RO_COMPAT_FEATURE(sb, ~0U) ||
1548 EXT4_HAS_INCOMPAT_FEATURE(sb, ~0U)))
1549 printk(KERN_WARNING
1550 "EXT4-fs warning: feature flags set on rev 0 fs, "
1551 "running e2fsck is recommended\n");
1552 /*
1553 * Check feature flags regardless of the revision level, since we
1554 * previously didn't change the revision level when setting the flags,
1555 * so there is a chance incompat flags are set on a rev 0 filesystem.
1556 */
1557 features = EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP);
1558 if (features) {
1559 printk(KERN_ERR "EXT4-fs: %s: couldn't mount because of "
1560 "unsupported optional features (%x).\n",
1561 sb->s_id, le32_to_cpu(features));
1562 goto failed_mount;
1563 }
1564 features = EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP);
1565 if (!(sb->s_flags & MS_RDONLY) && features) {
1566 printk(KERN_ERR "EXT4-fs: %s: couldn't mount RDWR because of "
1567 "unsupported optional features (%x).\n",
1568 sb->s_id, le32_to_cpu(features));
1569 goto failed_mount;
1570 }
1571 blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
1572
1573 if (blocksize < EXT4_MIN_BLOCK_SIZE ||
1574 blocksize > EXT4_MAX_BLOCK_SIZE) {
1575 printk(KERN_ERR
1576 "EXT4-fs: Unsupported filesystem blocksize %d on %s.\n",
1577 blocksize, sb->s_id);
1578 goto failed_mount;
1579 }
1580
1581 hblock = bdev_hardsect_size(sb->s_bdev);
1582 if (sb->s_blocksize != blocksize) {
1583 /*
1584		 * Make sure the filesystem blocksize is at least as large
1585		 * as the hardware sector size for the machine.
1586 */
1587 if (blocksize < hblock) {
1588 printk(KERN_ERR "EXT4-fs: blocksize %d too small for "
1589 "device blocksize %d.\n", blocksize, hblock);
1590 goto failed_mount;
1591 }
1592
1593 brelse (bh);
1594 sb_set_blocksize(sb, blocksize);
1595 logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE;
1596 offset = do_div(logical_sb_block, blocksize);
1597 bh = sb_bread(sb, logical_sb_block);
1598 if (!bh) {
1599 printk(KERN_ERR
1600 "EXT4-fs: Can't read superblock on 2nd try.\n");
1601 goto failed_mount;
1602 }
1603 es = (struct ext4_super_block *)(((char *)bh->b_data) + offset);
1604 sbi->s_es = es;
1605 if (es->s_magic != cpu_to_le16(EXT4_SUPER_MAGIC)) {
1606 printk (KERN_ERR
1607			       "EXT4-fs: Magic mismatch, very weird!\n");
1608 goto failed_mount;
1609 }
1610 }
1611
1612 sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits);
1613
1614 if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) {
1615 sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE;
1616 sbi->s_first_ino = EXT4_GOOD_OLD_FIRST_INO;
1617 } else {
1618 sbi->s_inode_size = le16_to_cpu(es->s_inode_size);
1619 sbi->s_first_ino = le32_to_cpu(es->s_first_ino);
1620 if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) ||
1621 (sbi->s_inode_size & (sbi->s_inode_size - 1)) ||
1622 (sbi->s_inode_size > blocksize)) {
1623 printk (KERN_ERR
1624 "EXT4-fs: unsupported inode size: %d\n",
1625 sbi->s_inode_size);
1626 goto failed_mount;
1627 }
1628 }
1629 sbi->s_frag_size = EXT4_MIN_FRAG_SIZE <<
1630 le32_to_cpu(es->s_log_frag_size);
1631 if (blocksize != sbi->s_frag_size) {
1632 printk(KERN_ERR
1633 "EXT4-fs: fragsize %lu != blocksize %u (unsupported)\n",
1634 sbi->s_frag_size, blocksize);
1635 goto failed_mount;
1636 }
1637 sbi->s_desc_size = le16_to_cpu(es->s_desc_size);
1638 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT)) {
1639 if (sbi->s_desc_size < EXT4_MIN_DESC_SIZE_64BIT ||
1640 sbi->s_desc_size > EXT4_MAX_DESC_SIZE ||
1641 sbi->s_desc_size & (sbi->s_desc_size - 1)) {
1642 printk(KERN_ERR
1643 "EXT4-fs: unsupported descriptor size %lu\n",
1644 sbi->s_desc_size);
1645 goto failed_mount;
1646 }
1647 } else
1648 sbi->s_desc_size = EXT4_MIN_DESC_SIZE;
1649 sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group);
1650 sbi->s_frags_per_group = le32_to_cpu(es->s_frags_per_group);
1651 sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group);
1652 if (EXT4_INODE_SIZE(sb) == 0)
1653 goto cantfind_ext4;
1654 sbi->s_inodes_per_block = blocksize / EXT4_INODE_SIZE(sb);
1655 if (sbi->s_inodes_per_block == 0)
1656 goto cantfind_ext4;
1657 sbi->s_itb_per_group = sbi->s_inodes_per_group /
1658 sbi->s_inodes_per_block;
1659 sbi->s_desc_per_block = blocksize / EXT4_DESC_SIZE(sb);
1660 sbi->s_sbh = bh;
1661 sbi->s_mount_state = le16_to_cpu(es->s_state);
1662 sbi->s_addr_per_block_bits = log2(EXT4_ADDR_PER_BLOCK(sb));
1663 sbi->s_desc_per_block_bits = log2(EXT4_DESC_PER_BLOCK(sb));
1664 for (i=0; i < 4; i++)
1665 sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
1666 sbi->s_def_hash_version = es->s_def_hash_version;
1667
1668 if (sbi->s_blocks_per_group > blocksize * 8) {
1669 printk (KERN_ERR
1670 "EXT4-fs: #blocks per group too big: %lu\n",
1671 sbi->s_blocks_per_group);
1672 goto failed_mount;
1673 }
1674 if (sbi->s_frags_per_group > blocksize * 8) {
1675 printk (KERN_ERR
1676 "EXT4-fs: #fragments per group too big: %lu\n",
1677 sbi->s_frags_per_group);
1678 goto failed_mount;
1679 }
1680 if (sbi->s_inodes_per_group > blocksize * 8) {
1681 printk (KERN_ERR
1682 "EXT4-fs: #inodes per group too big: %lu\n",
1683 sbi->s_inodes_per_group);
1684 goto failed_mount;
1685 }
1686
1687 if (ext4_blocks_count(es) >
1688 (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) {
1689 printk(KERN_ERR "EXT4-fs: filesystem on %s:"
1690 " too large to mount safely\n", sb->s_id);
1691 if (sizeof(sector_t) < 8)
1692 printk(KERN_WARNING "EXT4-fs: CONFIG_LBD not "
1693 "enabled\n");
1694 goto failed_mount;
1695 }
1696
1697 if (EXT4_BLOCKS_PER_GROUP(sb) == 0)
1698 goto cantfind_ext4;
1699 blocks_count = (ext4_blocks_count(es) -
1700 le32_to_cpu(es->s_first_data_block) +
1701 EXT4_BLOCKS_PER_GROUP(sb) - 1);
1702 do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb));
1703 sbi->s_groups_count = blocks_count;
1704 db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
1705 EXT4_DESC_PER_BLOCK(sb);
1706 sbi->s_group_desc = kmalloc(db_count * sizeof (struct buffer_head *),
1707 GFP_KERNEL);
1708 if (sbi->s_group_desc == NULL) {
1709 printk (KERN_ERR "EXT4-fs: not enough memory\n");
1710 goto failed_mount;
1711 }
1712
1713 bgl_lock_init(&sbi->s_blockgroup_lock);
1714
1715 for (i = 0; i < db_count; i++) {
1716 block = descriptor_loc(sb, logical_sb_block, i);
1717 sbi->s_group_desc[i] = sb_bread(sb, block);
1718 if (!sbi->s_group_desc[i]) {
1719 printk (KERN_ERR "EXT4-fs: "
1720 "can't read group descriptor %d\n", i);
1721 db_count = i;
1722 goto failed_mount2;
1723 }
1724 }
1725 if (!ext4_check_descriptors (sb)) {
1726 printk(KERN_ERR "EXT4-fs: group descriptors corrupted!\n");
1727 goto failed_mount2;
1728 }
1729 sbi->s_gdb_count = db_count;
1730 get_random_bytes(&sbi->s_next_generation, sizeof(u32));
1731 spin_lock_init(&sbi->s_next_gen_lock);
1732
1733 percpu_counter_init(&sbi->s_freeblocks_counter,
1734 ext4_count_free_blocks(sb));
1735 percpu_counter_init(&sbi->s_freeinodes_counter,
1736 ext4_count_free_inodes(sb));
1737 percpu_counter_init(&sbi->s_dirs_counter,
1738 ext4_count_dirs(sb));
1739
1740	/* per-filesystem reservation list head & lock */
1741 spin_lock_init(&sbi->s_rsv_window_lock);
1742 sbi->s_rsv_window_root = RB_ROOT;
1743 /* Add a single, static dummy reservation to the start of the
1744 * reservation window list --- it gives us a placeholder for
1745 * append-at-start-of-list which makes the allocation logic
1746 * _much_ simpler. */
1747 sbi->s_rsv_window_head.rsv_start = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
1748 sbi->s_rsv_window_head.rsv_end = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
1749 sbi->s_rsv_window_head.rsv_alloc_hit = 0;
1750 sbi->s_rsv_window_head.rsv_goal_size = 0;
1751 ext4_rsv_window_add(sb, &sbi->s_rsv_window_head);
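	/*
	 * Editor's note: the dummy head is the classic sentinel-node
	 * trick: every real window is inserted after some existing
	 * node, so insert-at-front needs no empty-tree special case.
	 */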
1752
1753 /*
1754 * set up enough so that it can read an inode
1755 */
1756 sb->s_op = &ext4_sops;
1757 sb->s_export_op = &ext4_export_ops;
1758 sb->s_xattr = ext4_xattr_handlers;
1759#ifdef CONFIG_QUOTA
1760 sb->s_qcop = &ext4_qctl_operations;
1761 sb->dq_op = &ext4_quota_operations;
1762#endif
1763 INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
1764
1765 sb->s_root = NULL;
1766
1767 needs_recovery = (es->s_last_orphan != 0 ||
1768 EXT4_HAS_INCOMPAT_FEATURE(sb,
1769 EXT4_FEATURE_INCOMPAT_RECOVER));
1770
1771 /*
1772 * The first inode we look at is the journal inode. Don't try
1773 * root first: it may be modified in the journal!
1774 */
1775 if (!test_opt(sb, NOLOAD) &&
1776 EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) {
1777 if (ext4_load_journal(sb, es, journal_devnum))
1778 goto failed_mount3;
1779 } else if (journal_inum) {
1780 if (ext4_create_journal(sb, es, journal_inum))
1781 goto failed_mount3;
1782 } else {
1783 if (!silent)
1784 printk (KERN_ERR
1785 "ext4: No journal on filesystem on %s\n",
1786 sb->s_id);
1787 goto failed_mount3;
1788 }
1789
1790 /* We have now updated the journal if required, so we can
1791 * validate the data journaling mode. */
1792 switch (test_opt(sb, DATA_FLAGS)) {
1793 case 0:
1794 /* No mode set, assume a default based on the journal
1795 * capabilities: ORDERED_DATA if the journal can
1796 * cope, else JOURNAL_DATA
1797 */
1798 if (jbd2_journal_check_available_features
1799 (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE))
1800 set_opt(sbi->s_mount_opt, ORDERED_DATA);
1801 else
1802 set_opt(sbi->s_mount_opt, JOURNAL_DATA);
1803 break;
1804
1805 case EXT4_MOUNT_ORDERED_DATA:
1806 case EXT4_MOUNT_WRITEBACK_DATA:
1807 if (!jbd2_journal_check_available_features
1808 (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) {
1809 printk(KERN_ERR "EXT4-fs: Journal does not support "
1810 "requested data journaling mode\n");
1811 goto failed_mount4;
1812 }
1813 default:
1814 break;
1815 }
1816
1817 if (test_opt(sb, NOBH)) {
1818 if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) {
1819 printk(KERN_WARNING "EXT4-fs: Ignoring nobh option - "
1820			       "it is supported only with writeback mode\n");
1821 clear_opt(sbi->s_mount_opt, NOBH);
1822 }
1823 }
1824 /*
1825 * The jbd2_journal_load will have done any necessary log recovery,
1826 * so we can safely mount the rest of the filesystem now.
1827 */
1828
1829 root = iget(sb, EXT4_ROOT_INO);
1830 sb->s_root = d_alloc_root(root);
1831 if (!sb->s_root) {
1832 printk(KERN_ERR "EXT4-fs: get root inode failed\n");
1833 iput(root);
1834 goto failed_mount4;
1835 }
1836 if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
1837 dput(sb->s_root);
1838 sb->s_root = NULL;
1839 printk(KERN_ERR "EXT4-fs: corrupt root inode, run e2fsck\n");
1840 goto failed_mount4;
1841 }
1842
1843 ext4_setup_super (sb, es, sb->s_flags & MS_RDONLY);
1844 /*
1845 * akpm: core read_super() calls in here with the superblock locked.
1846 * That deadlocks, because orphan cleanup needs to lock the superblock
1847 * in numerous places. Here we just pop the lock - it's relatively
1848 * harmless, because we are now ready to accept write_super() requests,
1849 * and aviro says that's the only reason for hanging onto the
1850 * superblock lock.
1851 */
1852 EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS;
1853 ext4_orphan_cleanup(sb, es);
1854 EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS;
1855 if (needs_recovery)
1856 printk (KERN_INFO "EXT4-fs: recovery complete.\n");
1857 ext4_mark_recovery_complete(sb, es);
1858 printk (KERN_INFO "EXT4-fs: mounted filesystem with %s data mode.\n",
1859 test_opt(sb,DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ? "journal":
1860 test_opt(sb,DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA ? "ordered":
1861 "writeback");
1862
1863 ext4_ext_init(sb);
1864
1865 lock_kernel();
1866 return 0;
1867
1868cantfind_ext4:
1869 if (!silent)
1870 printk(KERN_ERR "VFS: Can't find ext4 filesystem on dev %s.\n",
1871 sb->s_id);
1872 goto failed_mount;
1873
1874failed_mount4:
1875 jbd2_journal_destroy(sbi->s_journal);
1876failed_mount3:
1877 percpu_counter_destroy(&sbi->s_freeblocks_counter);
1878 percpu_counter_destroy(&sbi->s_freeinodes_counter);
1879 percpu_counter_destroy(&sbi->s_dirs_counter);
1880failed_mount2:
1881 for (i = 0; i < db_count; i++)
1882 brelse(sbi->s_group_desc[i]);
1883 kfree(sbi->s_group_desc);
1884failed_mount:
1885#ifdef CONFIG_QUOTA
1886 for (i = 0; i < MAXQUOTAS; i++)
1887 kfree(sbi->s_qf_names[i]);
1888#endif
1889 ext4_blkdev_remove(sbi);
1890 brelse(bh);
1891out_fail:
1892 sb->s_fs_info = NULL;
1893 kfree(sbi);
1894 lock_kernel();
1895 return -EINVAL;
1896}
1897
1898/*
1899 * Setup any per-fs journal parameters now. We'll do this both on
1900 * initial mount, once the journal has been initialised but before we've
1901 * done any recovery; and again on any subsequent remount.
1902 */
1903static void ext4_init_journal_params(struct super_block *sb, journal_t *journal)
1904{
1905 struct ext4_sb_info *sbi = EXT4_SB(sb);
1906
1907 if (sbi->s_commit_interval)
1908 journal->j_commit_interval = sbi->s_commit_interval;
1909 /* We could also set up an ext4-specific default for the commit
1910 * interval here, but for now we'll just fall back to the jbd
1911 * default. */
1912
1913 spin_lock(&journal->j_state_lock);
1914 if (test_opt(sb, BARRIER))
1915 journal->j_flags |= JBD2_BARRIER;
1916 else
1917 journal->j_flags &= ~JBD2_BARRIER;
1918 spin_unlock(&journal->j_state_lock);
1919}
1920
1921static journal_t *ext4_get_journal(struct super_block *sb,
1922 unsigned int journal_inum)
1923{
1924 struct inode *journal_inode;
1925 journal_t *journal;
1926
1927 /* First, test for the existence of a valid inode on disk. Bad
1928 * things happen if we iget() an unused inode, as the subsequent
1929 * iput() will try to delete it. */
1930
1931 journal_inode = iget(sb, journal_inum);
1932 if (!journal_inode) {
1933 printk(KERN_ERR "EXT4-fs: no journal found.\n");
1934 return NULL;
1935 }
1936 if (!journal_inode->i_nlink) {
1937 make_bad_inode(journal_inode);
1938 iput(journal_inode);
1939 printk(KERN_ERR "EXT4-fs: journal inode is deleted.\n");
1940 return NULL;
1941 }
1942
1943 jbd_debug(2, "Journal inode found at %p: %Ld bytes\n",
1944 journal_inode, journal_inode->i_size);
1945 if (is_bad_inode(journal_inode) || !S_ISREG(journal_inode->i_mode)) {
1946 printk(KERN_ERR "EXT4-fs: invalid journal inode.\n");
1947 iput(journal_inode);
1948 return NULL;
1949 }
1950
1951 journal = jbd2_journal_init_inode(journal_inode);
1952 if (!journal) {
1953 printk(KERN_ERR "EXT4-fs: Could not load journal inode\n");
1954 iput(journal_inode);
1955 return NULL;
1956 }
1957 journal->j_private = sb;
1958 ext4_init_journal_params(sb, journal);
1959 return journal;
1960}
1961
1962static journal_t *ext4_get_dev_journal(struct super_block *sb,
1963 dev_t j_dev)
1964{
1965 struct buffer_head * bh;
1966 journal_t *journal;
1967 ext4_fsblk_t start;
1968 ext4_fsblk_t len;
1969 int hblock, blocksize;
1970 ext4_fsblk_t sb_block;
1971 unsigned long offset;
1972 struct ext4_super_block * es;
1973 struct block_device *bdev;
1974
1975 bdev = ext4_blkdev_get(j_dev);
1976 if (bdev == NULL)
1977 return NULL;
1978
1979 if (bd_claim(bdev, sb)) {
1980 printk(KERN_ERR
1981 "EXT4: failed to claim external journal device.\n");
1982 blkdev_put(bdev);
1983 return NULL;
1984 }
1985
1986 blocksize = sb->s_blocksize;
1987 hblock = bdev_hardsect_size(bdev);
1988 if (blocksize < hblock) {
1989 printk(KERN_ERR
1990 "EXT4-fs: blocksize too small for journal device.\n");
1991 goto out_bdev;
1992 }
1993
1994 sb_block = EXT4_MIN_BLOCK_SIZE / blocksize;
1995 offset = EXT4_MIN_BLOCK_SIZE % blocksize;
1996 set_blocksize(bdev, blocksize);
1997 if (!(bh = __bread(bdev, sb_block, blocksize))) {
1998 printk(KERN_ERR "EXT4-fs: couldn't read superblock of "
1999 "external journal\n");
2000 goto out_bdev;
2001 }
2002
2003 es = (struct ext4_super_block *) (((char *)bh->b_data) + offset);
2004 if ((le16_to_cpu(es->s_magic) != EXT4_SUPER_MAGIC) ||
2005 !(le32_to_cpu(es->s_feature_incompat) &
2006 EXT4_FEATURE_INCOMPAT_JOURNAL_DEV)) {
2007 printk(KERN_ERR "EXT4-fs: external journal has "
2008 "bad superblock\n");
2009 brelse(bh);
2010 goto out_bdev;
2011 }
2012
2013 if (memcmp(EXT4_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) {
2014 printk(KERN_ERR "EXT4-fs: journal UUID does not match\n");
2015 brelse(bh);
2016 goto out_bdev;
2017 }
2018
2019 len = ext4_blocks_count(es);
2020 start = sb_block + 1;
2021 brelse(bh); /* we're done with the superblock */
2022
2023 journal = jbd2_journal_init_dev(bdev, sb->s_bdev,
2024 start, len, blocksize);
2025 if (!journal) {
2026 printk(KERN_ERR "EXT4-fs: failed to create device journal\n");
2027 goto out_bdev;
2028 }
2029 journal->j_private = sb;
2030 ll_rw_block(READ, 1, &journal->j_sb_buffer);
2031 wait_on_buffer(journal->j_sb_buffer);
2032 if (!buffer_uptodate(journal->j_sb_buffer)) {
2033 printk(KERN_ERR "EXT4-fs: I/O error on journal device\n");
2034 goto out_journal;
2035 }
2036 if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) {
2037 printk(KERN_ERR "EXT4-fs: External journal has more than one "
2038 "user (unsupported) - %d\n",
2039 be32_to_cpu(journal->j_superblock->s_nr_users));
2040 goto out_journal;
2041 }
2042 EXT4_SB(sb)->journal_bdev = bdev;
2043 ext4_init_journal_params(sb, journal);
2044 return journal;
2045out_journal:
2046 jbd2_journal_destroy(journal);
2047out_bdev:
2048 ext4_blkdev_put(bdev);
2049 return NULL;
2050}
2051
2052static int ext4_load_journal(struct super_block *sb,
2053 struct ext4_super_block *es,
2054 unsigned long journal_devnum)
2055{
2056 journal_t *journal;
2057 unsigned int journal_inum = le32_to_cpu(es->s_journal_inum);
2058 dev_t journal_dev;
2059 int err = 0;
2060 int really_read_only;
2061
2062 if (journal_devnum &&
2063 journal_devnum != le32_to_cpu(es->s_journal_dev)) {
2064 printk(KERN_INFO "EXT4-fs: external journal device major/minor "
2065 "numbers have changed\n");
2066 journal_dev = new_decode_dev(journal_devnum);
2067 } else
2068 journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev));
2069
2070 really_read_only = bdev_read_only(sb->s_bdev);
2071
2072 /*
2073 * Are we loading a blank journal or performing recovery after a
2074 * crash? For recovery, we need to check in advance whether we
2075 * can get read-write access to the device.
2076 */
2077
2078 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) {
2079 if (sb->s_flags & MS_RDONLY) {
2080 printk(KERN_INFO "EXT4-fs: INFO: recovery "
2081 "required on readonly filesystem.\n");
2082 if (really_read_only) {
2083 printk(KERN_ERR "EXT4-fs: write access "
2084 "unavailable, cannot proceed.\n");
2085 return -EROFS;
2086 }
2087 printk (KERN_INFO "EXT4-fs: write access will "
2088 "be enabled during recovery.\n");
2089 }
2090 }
2091
2092 if (journal_inum && journal_dev) {
2093 printk(KERN_ERR "EXT4-fs: filesystem has both journal "
2094 "and inode journals!\n");
2095 return -EINVAL;
2096 }
2097
2098 if (journal_inum) {
2099 if (!(journal = ext4_get_journal(sb, journal_inum)))
2100 return -EINVAL;
2101 } else {
2102 if (!(journal = ext4_get_dev_journal(sb, journal_dev)))
2103 return -EINVAL;
2104 }
2105
2106 if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) {
2107 err = jbd2_journal_update_format(journal);
2108 if (err) {
2109 printk(KERN_ERR "EXT4-fs: error updating journal.\n");
2110 jbd2_journal_destroy(journal);
2111 return err;
2112 }
2113 }
2114
2115 if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER))
2116 err = jbd2_journal_wipe(journal, !really_read_only);
2117 if (!err)
2118 err = jbd2_journal_load(journal);
2119
2120 if (err) {
2121 printk(KERN_ERR "EXT4-fs: error loading journal.\n");
2122 jbd2_journal_destroy(journal);
2123 return err;
2124 }
2125
2126 EXT4_SB(sb)->s_journal = journal;
2127 ext4_clear_journal_err(sb, es);
2128
2129 if (journal_devnum &&
2130 journal_devnum != le32_to_cpu(es->s_journal_dev)) {
2131 es->s_journal_dev = cpu_to_le32(journal_devnum);
2132 sb->s_dirt = 1;
2133
2134 /* Make sure we flush the recovery flag to disk. */
2135 ext4_commit_super(sb, es, 1);
2136 }
2137
2138 return 0;
2139}
2140
2141static int ext4_create_journal(struct super_block * sb,
2142 struct ext4_super_block * es,
2143 unsigned int journal_inum)
2144{
2145 journal_t *journal;
2146
2147 if (sb->s_flags & MS_RDONLY) {
2148 printk(KERN_ERR "EXT4-fs: readonly filesystem when trying to "
2149 "create journal.\n");
2150 return -EROFS;
2151 }
2152
2153 if (!(journal = ext4_get_journal(sb, journal_inum)))
2154 return -EINVAL;
2155
2156 printk(KERN_INFO "EXT4-fs: creating new journal on inode %u\n",
2157 journal_inum);
2158
2159 if (jbd2_journal_create(journal)) {
2160 printk(KERN_ERR "EXT4-fs: error creating journal.\n");
2161 jbd2_journal_destroy(journal);
2162 return -EIO;
2163 }
2164
2165 EXT4_SB(sb)->s_journal = journal;
2166
2167 ext4_update_dynamic_rev(sb);
2168 EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
2169 EXT4_SET_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL);
2170
2171 es->s_journal_inum = cpu_to_le32(journal_inum);
2172 sb->s_dirt = 1;
2173
2174 /* Make sure we flush the recovery flag to disk. */
2175 ext4_commit_super(sb, es, 1);
2176
2177 return 0;
2178}
2179
2180static void ext4_commit_super (struct super_block * sb,
2181 struct ext4_super_block * es,
2182 int sync)
2183{
2184 struct buffer_head *sbh = EXT4_SB(sb)->s_sbh;
2185
2186 if (!sbh)
2187 return;
2188 es->s_wtime = cpu_to_le32(get_seconds());
2189 ext4_free_blocks_count_set(es, ext4_count_free_blocks(sb));
2190 es->s_free_inodes_count = cpu_to_le32(ext4_count_free_inodes(sb));
2191 BUFFER_TRACE(sbh, "marking dirty");
2192 mark_buffer_dirty(sbh);
2193 if (sync)
2194 sync_dirty_buffer(sbh);
2195}
2196
2197
2198/*
2199 * Have we just finished recovery? If so, and if we are mounting (or
2200 * remounting) the filesystem readonly, then we will end up with a
2201 * consistent fs on disk. Record that fact.
2202 */
2203static void ext4_mark_recovery_complete(struct super_block * sb,
2204 struct ext4_super_block * es)
2205{
2206 journal_t *journal = EXT4_SB(sb)->s_journal;
2207
2208 jbd2_journal_lock_updates(journal);
2209 jbd2_journal_flush(journal);
2210 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER) &&
2211 sb->s_flags & MS_RDONLY) {
2212 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
2213 sb->s_dirt = 0;
2214 ext4_commit_super(sb, es, 1);
2215 }
2216 jbd2_journal_unlock_updates(journal);
2217}
2218
2219/*
2220 * If we are mounting (or read-write remounting) a filesystem whose journal
2221 * has recorded an error from a previous lifetime, move that error to the
2222 * main filesystem now.
2223 */
2224static void ext4_clear_journal_err(struct super_block * sb,
2225 struct ext4_super_block * es)
2226{
2227 journal_t *journal;
2228 int j_errno;
2229 const char *errstr;
2230
2231 journal = EXT4_SB(sb)->s_journal;
2232
2233 /*
2234 * Now check for any error status which may have been recorded in the
2235 * journal by a prior ext4_error() or ext4_abort()
2236 */
2237
2238 j_errno = jbd2_journal_errno(journal);
2239 if (j_errno) {
2240 char nbuf[16];
2241
2242 errstr = ext4_decode_error(sb, j_errno, nbuf);
2243 ext4_warning(sb, __FUNCTION__, "Filesystem error recorded "
2244 "from previous mount: %s", errstr);
2245 ext4_warning(sb, __FUNCTION__, "Marking fs in need of "
2246 "filesystem check.");
2247
2248 EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
2249 es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
2250 ext4_commit_super (sb, es, 1);
2251
2252 jbd2_journal_clear_err(journal);
2253 }
2254}
2255
2256/*
2257 * Force the running and committing transactions to commit,
2258 * and wait on the commit.
2259 */
2260int ext4_force_commit(struct super_block *sb)
2261{
2262 journal_t *journal;
2263 int ret;
2264
2265 if (sb->s_flags & MS_RDONLY)
2266 return 0;
2267
2268 journal = EXT4_SB(sb)->s_journal;
2269 sb->s_dirt = 0;
2270 ret = ext4_journal_force_commit(journal);
2271 return ret;
2272}
2273
2274/*
2275 * Ext4 always journals updates to the superblock itself, so we don't
2276 * have to propagate any other updates to the superblock on disk at this
2277 * point. Just start an async writeback to get the buffers on their way
2278 * to the disk.
2279 *
2280 * This implicitly triggers the writebehind on sync().
2281 */
2282
2283static void ext4_write_super (struct super_block * sb)
2284{
2285 if (mutex_trylock(&sb->s_lock) != 0)
2286 BUG();
2287 sb->s_dirt = 0;
2288}
2289
2290static int ext4_sync_fs(struct super_block *sb, int wait)
2291{
2292 tid_t target;
2293
2294 sb->s_dirt = 0;
2295 if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, &target)) {
2296 if (wait)
2297 jbd2_log_wait_commit(EXT4_SB(sb)->s_journal, target);
2298 }
2299 return 0;
2300}
2301
2302/*
2303 * LVM calls this function before a (read-only) snapshot is created. This
2304 * gives us a chance to flush the journal completely and mark the fs clean.
2305 */
2306static void ext4_write_super_lockfs(struct super_block *sb)
2307{
2308 sb->s_dirt = 0;
2309
2310 if (!(sb->s_flags & MS_RDONLY)) {
2311 journal_t *journal = EXT4_SB(sb)->s_journal;
2312
2313 /* Now we set up the journal barrier. */
2314 jbd2_journal_lock_updates(journal);
2315 jbd2_journal_flush(journal);
2316
2317 /* Journal blocked and flushed, clear needs_recovery flag. */
2318 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
2319 ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1);
2320 }
2321}
2322
2323/*
2324 * Called by LVM after the snapshot is done. We need to reset the RECOVER
2325 * flag here, even though the filesystem is not technically dirty yet.
2326 */
2327static void ext4_unlockfs(struct super_block *sb)
2328{
2329 if (!(sb->s_flags & MS_RDONLY)) {
2330 lock_super(sb);
2331		/* Reset the needs_recovery flag before the fs is unlocked. */
2332 EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
2333 ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1);
2334 unlock_super(sb);
2335 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
2336 }
2337}
2338
2339static int ext4_remount (struct super_block * sb, int * flags, char * data)
2340{
2341 struct ext4_super_block * es;
2342 struct ext4_sb_info *sbi = EXT4_SB(sb);
2343 ext4_fsblk_t n_blocks_count = 0;
2344 unsigned long old_sb_flags;
2345 struct ext4_mount_options old_opts;
2346 int err;
2347#ifdef CONFIG_QUOTA
2348 int i;
2349#endif
2350
2351 /* Store the original options */
2352 old_sb_flags = sb->s_flags;
2353 old_opts.s_mount_opt = sbi->s_mount_opt;
2354 old_opts.s_resuid = sbi->s_resuid;
2355 old_opts.s_resgid = sbi->s_resgid;
2356 old_opts.s_commit_interval = sbi->s_commit_interval;
2357#ifdef CONFIG_QUOTA
2358 old_opts.s_jquota_fmt = sbi->s_jquota_fmt;
2359 for (i = 0; i < MAXQUOTAS; i++)
2360 old_opts.s_qf_names[i] = sbi->s_qf_names[i];
2361#endif
2362
2363 /*
2364 * Allow the "check" option to be passed as a remount option.
2365 */
2366 if (!parse_options(data, sb, NULL, NULL, &n_blocks_count, 1)) {
2367 err = -EINVAL;
2368 goto restore_opts;
2369 }
2370
2371 if (sbi->s_mount_opt & EXT4_MOUNT_ABORT)
2372 ext4_abort(sb, __FUNCTION__, "Abort forced by user");
2373
2374 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
2375 ((sbi->s_mount_opt & EXT4_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0);
2376
2377 es = sbi->s_es;
2378
2379 ext4_init_journal_params(sb, sbi->s_journal);
2380
2381 if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) ||
2382 n_blocks_count > ext4_blocks_count(es)) {
2383 if (sbi->s_mount_opt & EXT4_MOUNT_ABORT) {
2384 err = -EROFS;
2385 goto restore_opts;
2386 }
2387
2388 if (*flags & MS_RDONLY) {
2389 /*
2390 * First of all, the unconditional stuff we have to do
2391 * to disable replay of the journal when we next remount
2392 */
2393 sb->s_flags |= MS_RDONLY;
2394
2395 /*
2396 * OK, test if we are remounting a valid rw partition
2397 * readonly, and if so set the rdonly flag and then
2398 * mark the partition as valid again.
2399 */
2400 if (!(es->s_state & cpu_to_le16(EXT4_VALID_FS)) &&
2401 (sbi->s_mount_state & EXT4_VALID_FS))
2402 es->s_state = cpu_to_le16(sbi->s_mount_state);
2403
2404 ext4_mark_recovery_complete(sb, es);
2405 } else {
2406 __le32 ret;
2407 if ((ret = EXT4_HAS_RO_COMPAT_FEATURE(sb,
2408 ~EXT4_FEATURE_RO_COMPAT_SUPP))) {
2409 printk(KERN_WARNING "EXT4-fs: %s: couldn't "
2410 "remount RDWR because of unsupported "
2411 "optional features (%x).\n",
2412 sb->s_id, le32_to_cpu(ret));
2413 err = -EROFS;
2414 goto restore_opts;
2415 }
2416 /*
2417 * Mounting a RDONLY partition read-write, so reread
2418 * and store the current valid flag. (It may have
2419 * been changed by e2fsck since we originally mounted
2420 * the partition.)
2421 */
2422 ext4_clear_journal_err(sb, es);
2423 sbi->s_mount_state = le16_to_cpu(es->s_state);
2424 if ((err = ext4_group_extend(sb, es, n_blocks_count)))
2425 goto restore_opts;
2426 if (!ext4_setup_super (sb, es, 0))
2427 sb->s_flags &= ~MS_RDONLY;
2428 }
2429 }
2430#ifdef CONFIG_QUOTA
2431 /* Release old quota file names */
2432 for (i = 0; i < MAXQUOTAS; i++)
2433 if (old_opts.s_qf_names[i] &&
2434 old_opts.s_qf_names[i] != sbi->s_qf_names[i])
2435 kfree(old_opts.s_qf_names[i]);
2436#endif
2437 return 0;
2438restore_opts:
2439 sb->s_flags = old_sb_flags;
2440 sbi->s_mount_opt = old_opts.s_mount_opt;
2441 sbi->s_resuid = old_opts.s_resuid;
2442 sbi->s_resgid = old_opts.s_resgid;
2443 sbi->s_commit_interval = old_opts.s_commit_interval;
2444#ifdef CONFIG_QUOTA
2445 sbi->s_jquota_fmt = old_opts.s_jquota_fmt;
2446 for (i = 0; i < MAXQUOTAS; i++) {
2447 if (sbi->s_qf_names[i] &&
2448 old_opts.s_qf_names[i] != sbi->s_qf_names[i])
2449 kfree(sbi->s_qf_names[i]);
2450 sbi->s_qf_names[i] = old_opts.s_qf_names[i];
2451 }
2452#endif
2453 return err;
2454}
2455
2456static int ext4_statfs (struct dentry * dentry, struct kstatfs * buf)
2457{
2458 struct super_block *sb = dentry->d_sb;
2459 struct ext4_sb_info *sbi = EXT4_SB(sb);
2460 struct ext4_super_block *es = sbi->s_es;
2461 ext4_fsblk_t overhead;
2462 int i;
2463
2464 if (test_opt (sb, MINIX_DF))
2465 overhead = 0;
2466 else {
2467 unsigned long ngroups;
2468 ngroups = EXT4_SB(sb)->s_groups_count;
2469 smp_rmb();
2470
2471 /*
2472 * Compute the overhead (FS structures)
2473 */
2474
2475 /*
2476 * All of the blocks before first_data_block are
2477 * overhead
2478 */
2479 overhead = le32_to_cpu(es->s_first_data_block);
2480
2481 /*
2482 * Add the overhead attributed to the superblock and
2483 * block group descriptors. If the sparse superblocks
2484 * feature is turned on, then not all groups have this.
2485 */
2486 for (i = 0; i < ngroups; i++) {
2487 overhead += ext4_bg_has_super(sb, i) +
2488 ext4_bg_num_gdb(sb, i);
2489 cond_resched();
2490 }
2491
2492 /*
2493 * Every block group has an inode bitmap, a block
2494 * bitmap, and an inode table.
2495 */
2496 overhead += (ngroups * (2 + EXT4_SB(sb)->s_itb_per_group));
2497 }
2498
2499 buf->f_type = EXT4_SUPER_MAGIC;
2500 buf->f_bsize = sb->s_blocksize;
2501 buf->f_blocks = ext4_blocks_count(es) - overhead;
2502 buf->f_bfree = percpu_counter_sum(&sbi->s_freeblocks_counter);
2503 buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es);
2504 if (buf->f_bfree < ext4_r_blocks_count(es))
2505 buf->f_bavail = 0;
2506 buf->f_files = le32_to_cpu(es->s_inodes_count);
2507 buf->f_ffree = percpu_counter_sum(&sbi->s_freeinodes_counter);
2508 buf->f_namelen = EXT4_NAME_LEN;
2509 return 0;
2510}
2511
2512/* Helper function for writing quotas on sync - we need to start a transaction
2513 * before the quota file is locked for write. Otherwise there are possible deadlocks:
2514 * Process 1 Process 2
2515 * ext4_create() quota_sync()
2516 * jbd2_journal_start() write_dquot()
2517 * DQUOT_INIT() down(dqio_mutex)
2518 * down(dqio_mutex) jbd2_journal_start()
2519 *
2520 */
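/*
 * The quota helpers below all follow the same shape to avoid that
 * inversion (a sketch, not kernel code):
 *
 *	handle = ext4_journal_start(...);   <- transaction first
 *	ret = dquot_commit(...);            <- takes dqio_mutex inside
 *	err = ext4_journal_stop(handle);
 *
 * so both columns of the table above end up acquiring the two locks in
 * one consistent order.
 */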
2521
2522#ifdef CONFIG_QUOTA
2523
2524static inline struct inode *dquot_to_inode(struct dquot *dquot)
2525{
2526 return sb_dqopt(dquot->dq_sb)->files[dquot->dq_type];
2527}
2528
2529static int ext4_dquot_initialize(struct inode *inode, int type)
2530{
2531 handle_t *handle;
2532 int ret, err;
2533
2534 /* We may create quota structure so we need to reserve enough blocks */
2535 handle = ext4_journal_start(inode, 2*EXT4_QUOTA_INIT_BLOCKS(inode->i_sb));
2536 if (IS_ERR(handle))
2537 return PTR_ERR(handle);
2538 ret = dquot_initialize(inode, type);
2539 err = ext4_journal_stop(handle);
2540 if (!ret)
2541 ret = err;
2542 return ret;
2543}
2544
2545static int ext4_dquot_drop(struct inode *inode)
2546{
2547 handle_t *handle;
2548 int ret, err;
2549
2550 /* We may delete quota structure so we need to reserve enough blocks */
2551 handle = ext4_journal_start(inode, 2*EXT4_QUOTA_DEL_BLOCKS(inode->i_sb));
2552 if (IS_ERR(handle))
2553 return PTR_ERR(handle);
2554 ret = dquot_drop(inode);
2555 err = ext4_journal_stop(handle);
2556 if (!ret)
2557 ret = err;
2558 return ret;
2559}
2560
2561static int ext4_write_dquot(struct dquot *dquot)
2562{
2563 int ret, err;
2564 handle_t *handle;
2565 struct inode *inode;
2566
2567 inode = dquot_to_inode(dquot);
2568 handle = ext4_journal_start(inode,
2569 EXT4_QUOTA_TRANS_BLOCKS(dquot->dq_sb));
2570 if (IS_ERR(handle))
2571 return PTR_ERR(handle);
2572 ret = dquot_commit(dquot);
2573 err = ext4_journal_stop(handle);
2574 if (!ret)
2575 ret = err;
2576 return ret;
2577}
2578
2579static int ext4_acquire_dquot(struct dquot *dquot)
2580{
2581 int ret, err;
2582 handle_t *handle;
2583
2584 handle = ext4_journal_start(dquot_to_inode(dquot),
2585 EXT4_QUOTA_INIT_BLOCKS(dquot->dq_sb));
2586 if (IS_ERR(handle))
2587 return PTR_ERR(handle);
2588 ret = dquot_acquire(dquot);
2589 err = ext4_journal_stop(handle);
2590 if (!ret)
2591 ret = err;
2592 return ret;
2593}
2594
2595static int ext4_release_dquot(struct dquot *dquot)
2596{
2597 int ret, err;
2598 handle_t *handle;
2599
2600 handle = ext4_journal_start(dquot_to_inode(dquot),
2601 EXT4_QUOTA_DEL_BLOCKS(dquot->dq_sb));
2602 if (IS_ERR(handle))
2603 return PTR_ERR(handle);
2604 ret = dquot_release(dquot);
2605 err = ext4_journal_stop(handle);
2606 if (!ret)
2607 ret = err;
2608 return ret;
2609}
2610
2611static int ext4_mark_dquot_dirty(struct dquot *dquot)
2612{
2613 /* Are we journalling quotas? */
2614 if (EXT4_SB(dquot->dq_sb)->s_qf_names[USRQUOTA] ||
2615 EXT4_SB(dquot->dq_sb)->s_qf_names[GRPQUOTA]) {
2616 dquot_mark_dquot_dirty(dquot);
2617 return ext4_write_dquot(dquot);
2618 } else {
2619 return dquot_mark_dquot_dirty(dquot);
2620 }
2621}
2622
2623static int ext4_write_info(struct super_block *sb, int type)
2624{
2625 int ret, err;
2626 handle_t *handle;
2627
2628 /* Data block + inode block */
2629 handle = ext4_journal_start(sb->s_root->d_inode, 2);
2630 if (IS_ERR(handle))
2631 return PTR_ERR(handle);
2632 ret = dquot_commit_info(sb, type);
2633 err = ext4_journal_stop(handle);
2634 if (!ret)
2635 ret = err;
2636 return ret;
2637}
2638
2639/*
2640 * Turn on quotas during mount time - we need to find
2641 * the quota file and such...
2642 */
2643static int ext4_quota_on_mount(struct super_block *sb, int type)
2644{
2645 return vfs_quota_on_mount(sb, EXT4_SB(sb)->s_qf_names[type],
2646 EXT4_SB(sb)->s_jquota_fmt, type);
2647}
2648
2649/*
2650 * Standard function to be called on quota_on
2651 */
2652static int ext4_quota_on(struct super_block *sb, int type, int format_id,
2653 char *path)
2654{
2655 int err;
2656 struct nameidata nd;
2657
2658 if (!test_opt(sb, QUOTA))
2659 return -EINVAL;
2660 /* Not journalling quota? */
2661 if (!EXT4_SB(sb)->s_qf_names[USRQUOTA] &&
2662 !EXT4_SB(sb)->s_qf_names[GRPQUOTA])
2663 return vfs_quota_on(sb, type, format_id, path);
2664 err = path_lookup(path, LOOKUP_FOLLOW, &nd);
2665 if (err)
2666 return err;
2667 /* Quotafile not on the same filesystem? */
2668 if (nd.mnt->mnt_sb != sb) {
2669 path_release(&nd);
2670 return -EXDEV;
2671 }
2672	/* Quota file not in the fs root? */
2673 if (nd.dentry->d_parent->d_inode != sb->s_root->d_inode)
2674 printk(KERN_WARNING
2675 "EXT4-fs: Quota file not on filesystem root. "
2676 "Journalled quota will not work.\n");
2677 path_release(&nd);
2678 return vfs_quota_on(sb, type, format_id, path);
2679}
2680
2681/* Read data from quotafile - avoid pagecache and such because we cannot afford
2682 * acquiring the locks... As quota files are never truncated and quota code
2683 * itself serializes the operations (and no one else should touch the files)
2684 * we don't have to be afraid of races */
2685static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
2686 size_t len, loff_t off)
2687{
2688 struct inode *inode = sb_dqopt(sb)->files[type];
2689 sector_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb);
2690 int err = 0;
2691 int offset = off & (sb->s_blocksize - 1);
2692 int tocopy;
2693 size_t toread;
2694 struct buffer_head *bh;
2695 loff_t i_size = i_size_read(inode);
2696
2697 if (off > i_size)
2698 return 0;
2699 if (off+len > i_size)
2700 len = i_size-off;
2701 toread = len;
2702 while (toread > 0) {
2703 tocopy = sb->s_blocksize - offset < toread ?
2704 sb->s_blocksize - offset : toread;
2705 bh = ext4_bread(NULL, inode, blk, 0, &err);
2706 if (err)
2707 return err;
2708 if (!bh) /* A hole? */
2709 memset(data, 0, tocopy);
2710 else
2711 memcpy(data, bh->b_data+offset, tocopy);
2712 brelse(bh);
2713 offset = 0;
2714 toread -= tocopy;
2715 data += tocopy;
2716 blk++;
2717 }
2718 return len;
2719}
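The block/offset bookkeeping in the loop above splits a request at block boundaries; a small self-contained editor's sketch of the same arithmetic (4096-byte blocks assumed, values arbitrary):

        #include <stdio.h>

        int main(void)
        {
                const long long bs = 4096;
                long long off = 4000, len = 200;
                long long blk = off / bs, offset = off % bs, toread = len;

                while (toread > 0) {
                        long long tocopy =
                                bs - offset < toread ? bs - offset : toread;
                        printf("block %lld, offset %lld, %lld bytes\n",
                               blk, offset, tocopy);
                        offset = 0;
                        toread -= tocopy;
                        blk++;
                }
                /* prints: block 0, offset 4000, 96 bytes
                 *         block 1, offset 0, 104 bytes */
                return 0;
        }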
2720
2721/* Write to quotafile (we know the transaction is already started and has
2722 * enough credits) */
2723static ssize_t ext4_quota_write(struct super_block *sb, int type,
2724 const char *data, size_t len, loff_t off)
2725{
2726 struct inode *inode = sb_dqopt(sb)->files[type];
2727 sector_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb);
2728 int err = 0;
2729 int offset = off & (sb->s_blocksize - 1);
2730 int tocopy;
2731 int journal_quota = EXT4_SB(sb)->s_qf_names[type] != NULL;
2732 size_t towrite = len;
2733 struct buffer_head *bh;
2734 handle_t *handle = journal_current_handle();
2735
2736 mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA);
2737 while (towrite > 0) {
2738 tocopy = sb->s_blocksize - offset < towrite ?
2739 sb->s_blocksize - offset : towrite;
2740 bh = ext4_bread(handle, inode, blk, 1, &err);
2741 if (!bh)
2742 goto out;
2743 if (journal_quota) {
2744 err = ext4_journal_get_write_access(handle, bh);
2745 if (err) {
2746 brelse(bh);
2747 goto out;
2748 }
2749 }
2750 lock_buffer(bh);
2751 memcpy(bh->b_data+offset, data, tocopy);
2752 flush_dcache_page(bh->b_page);
2753 unlock_buffer(bh);
2754 if (journal_quota)
2755 err = ext4_journal_dirty_metadata(handle, bh);
2756 else {
2757 /* Always do at least ordered writes for quotas */
2758 err = ext4_journal_dirty_data(handle, bh);
2759 mark_buffer_dirty(bh);
2760 }
2761 brelse(bh);
2762 if (err)
2763 goto out;
2764 offset = 0;
2765 towrite -= tocopy;
2766 data += tocopy;
2767 blk++;
2768 }
2769out:
2770 if (len == towrite)
2771 return err;
2772 if (inode->i_size < off+len-towrite) {
2773 i_size_write(inode, off+len-towrite);
2774 EXT4_I(inode)->i_disksize = inode->i_size;
2775 }
2776 inode->i_version++;
2777 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
2778 ext4_mark_inode_dirty(handle, inode);
2779 mutex_unlock(&inode->i_mutex);
2780 return len - towrite;
2781}
2782
2783#endif
2784
2785static int ext4_get_sb(struct file_system_type *fs_type,
2786 int flags, const char *dev_name, void *data, struct vfsmount *mnt)
2787{
2788 return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super, mnt);
2789}
2790
2791static struct file_system_type ext4dev_fs_type = {
2792 .owner = THIS_MODULE,
2793 .name = "ext4dev",
2794 .get_sb = ext4_get_sb,
2795 .kill_sb = kill_block_super,
2796 .fs_flags = FS_REQUIRES_DEV,
2797};
2798
2799static int __init init_ext4_fs(void)
2800{
2801 int err = init_ext4_xattr();
2802 if (err)
2803 return err;
2804 err = init_inodecache();
2805 if (err)
2806 goto out1;
2807 err = register_filesystem(&ext4dev_fs_type);
2808 if (err)
2809 goto out;
2810 return 0;
2811out:
2812 destroy_inodecache();
2813out1:
2814 exit_ext4_xattr();
2815 return err;
2816}
2817
2818static void __exit exit_ext4_fs(void)
2819{
2820 unregister_filesystem(&ext4dev_fs_type);
2821 destroy_inodecache();
2822 exit_ext4_xattr();
2823}
2824
2825MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
2826MODULE_DESCRIPTION("Fourth Extended Filesystem with extents");
2827MODULE_LICENSE("GPL");
2828module_init(init_ext4_fs)
2829module_exit(exit_ext4_fs)
diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c
new file mode 100644
index 000000000000..fcf527286d75
--- /dev/null
+++ b/fs/ext4/symlink.c
@@ -0,0 +1,54 @@
1/*
2 * linux/fs/ext4/symlink.c
3 *
4 * Only fast symlinks left here - the rest is done by generic code. AV, 1999
5 *
6 * Copyright (C) 1992, 1993, 1994, 1995
7 * Remy Card (card@masi.ibp.fr)
8 * Laboratoire MASI - Institut Blaise Pascal
9 * Universite Pierre et Marie Curie (Paris VI)
10 *
11 * from
12 *
13 * linux/fs/minix/symlink.c
14 *
15 * Copyright (C) 1991, 1992 Linus Torvalds
16 *
17 * ext4 symlink handling code
18 */
19
20#include <linux/fs.h>
21#include <linux/jbd2.h>
22#include <linux/ext4_fs.h>
23#include <linux/namei.h>
24#include "xattr.h"
25
26static void * ext4_follow_link(struct dentry *dentry, struct nameidata *nd)
27{
28 struct ext4_inode_info *ei = EXT4_I(dentry->d_inode);
29 nd_set_link(nd, (char*)ei->i_data);
30 return NULL;
31}
32
33struct inode_operations ext4_symlink_inode_operations = {
34 .readlink = generic_readlink,
35 .follow_link = page_follow_link_light,
36 .put_link = page_put_link,
37#ifdef CONFIG_EXT4DEV_FS_XATTR
38 .setxattr = generic_setxattr,
39 .getxattr = generic_getxattr,
40 .listxattr = ext4_listxattr,
41 .removexattr = generic_removexattr,
42#endif
43};
44
45struct inode_operations ext4_fast_symlink_inode_operations = {
46 .readlink = generic_readlink,
47 .follow_link = ext4_follow_link,
48#ifdef CONFIG_EXT4DEV_FS_XATTR
49 .setxattr = generic_setxattr,
50 .getxattr = generic_getxattr,
51 .listxattr = ext4_listxattr,
52 .removexattr = generic_removexattr,
53#endif
54};
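Editor's note: the "fast" in ext4_fast_symlink_inode_operations refers to the classic ext2/3 scheme this code appears to carry over. A target short enough to fit in the inode's i_data area (the 60 bytes that would otherwise hold the 15 block pointers) is stored there directly, so ext4_follow_link() simply hands that buffer to nd_set_link(); longer targets use page_follow_link_light(), which reads the target from the symlink's data block.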
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
new file mode 100644
index 000000000000..63233cd946a7
--- /dev/null
+++ b/fs/ext4/xattr.c
@@ -0,0 +1,1317 @@
1/*
2 * linux/fs/ext4/xattr.c
3 *
4 * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
5 *
6 * Fix by Harrison Xing <harrison@mountainviewdata.com>.
7 * Ext4 code with a lot of help from Eric Jarman <ejarman@acm.org>.
8 * Extended attributes for symlinks and special files added per
9 * suggestion of Luka Renko <luka.renko@hermes.si>.
10 * xattr consolidation Copyright (c) 2004 James Morris <jmorris@redhat.com>,
11 * Red Hat Inc.
12 * ea-in-inode support by Alex Tomas <alex@clusterfs.com> aka bzzz
13 * and Andreas Gruenbacher <agruen@suse.de>.
14 */
15
16/*
17 * Extended attributes are stored directly in inodes (on file systems with
18 * inodes bigger than 128 bytes) and on additional disk blocks. The i_file_acl
19 * field contains the block number if an inode uses an additional block. All
20 * attributes must fit in the inode and one additional block. Blocks that
21 * contain the identical set of attributes may be shared among several inodes.
22 * Identical blocks are detected by keeping a cache of blocks that have
23 * recently been accessed.
24 *
25 * The attributes in inodes and on blocks have a different header; the entries
26 * are stored in the same format:
27 *
28 * +------------------+
29 * | header |
30 * | entry 1 | |
31 * | entry 2 | | growing downwards
32 * | entry 3 | v
33 * | four null bytes |
34 * | . . . |
35 * | value 1 | ^
36 * | value 3 | | growing upwards
37 * | value 2 | |
38 * +------------------+
39 *
40 * The header is followed by multiple entry descriptors. In disk blocks, the
41 * entry descriptors are kept sorted. In inodes, they are unsorted. The
42 * attribute values are aligned to the end of the block in no specific order.
43 *
44 * Locking strategy
45 * ----------------
46 * EXT4_I(inode)->i_file_acl is protected by EXT4_I(inode)->xattr_sem.
47 * EA blocks are only changed if they are exclusive to an inode, so
48 * holding xattr_sem also means that nothing but the EA block's reference
49 * count can change. Multiple writers to the same block are synchronized
50 * by the buffer lock.
51 */
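To illustrate the entry walk that the layout above implies, here is a simplified, self-contained editor's sketch. demo_entry is a hypothetical flattening of ext4_xattr_entry: real names are variable-length and values live at offsets from the end of the block, neither of which matters for the walk itself:

        #include <stdio.h>

        struct demo_entry {
                unsigned char e_name_len;      /* 0 marks the terminator */
                char          e_name[8];
                unsigned int  e_value_offs;
        };

        int main(void)
        {
                struct demo_entry entries[] = {
                        { 4, "mime",  4072 },
                        { 5, "owner", 4040 },
                        { 0, "",      0 },  /* "four null bytes" terminator */
                };
                struct demo_entry *e;

                /* Walk the downward-growing entry list until the
                 * terminator, mirroring the IS_LAST_ENTRY() loops
                 * used throughout this file. */
                for (e = entries; e->e_name_len; e++)
                        printf("%.*s -> value at offset %u\n",
                               e->e_name_len, e->e_name, e->e_value_offs);
                return 0;
        }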
52
53#include <linux/init.h>
54#include <linux/fs.h>
55#include <linux/slab.h>
56#include <linux/ext4_jbd2.h>
57#include <linux/ext4_fs.h>
58#include <linux/mbcache.h>
59#include <linux/quotaops.h>
60#include <linux/rwsem.h>
61#include "xattr.h"
62#include "acl.h"
63
64#define BHDR(bh) ((struct ext4_xattr_header *)((bh)->b_data))
65#define ENTRY(ptr) ((struct ext4_xattr_entry *)(ptr))
66#define BFIRST(bh) ENTRY(BHDR(bh)+1)
67#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0)
68
69#define IHDR(inode, raw_inode) \
70 ((struct ext4_xattr_ibody_header *) \
71 ((void *)raw_inode + \
72 EXT4_GOOD_OLD_INODE_SIZE + \
73 EXT4_I(inode)->i_extra_isize))
74#define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1))
75
76#ifdef EXT4_XATTR_DEBUG
77# define ea_idebug(inode, f...) do { \
78 printk(KERN_DEBUG "inode %s:%lu: ", \
79 inode->i_sb->s_id, inode->i_ino); \
80 printk(f); \
81 printk("\n"); \
82 } while (0)
83# define ea_bdebug(bh, f...) do { \
84 char b[BDEVNAME_SIZE]; \
85 printk(KERN_DEBUG "block %s:%lu: ", \
86 bdevname(bh->b_bdev, b), \
87 (unsigned long) bh->b_blocknr); \
88 printk(f); \
89 printk("\n"); \
90 } while (0)
91#else
92# define ea_idebug(f...)
93# define ea_bdebug(f...)
94#endif
95
96static void ext4_xattr_cache_insert(struct buffer_head *);
97static struct buffer_head *ext4_xattr_cache_find(struct inode *,
98 struct ext4_xattr_header *,
99 struct mb_cache_entry **);
100static void ext4_xattr_rehash(struct ext4_xattr_header *,
101 struct ext4_xattr_entry *);
102
103static struct mb_cache *ext4_xattr_cache;
104
105static struct xattr_handler *ext4_xattr_handler_map[] = {
106 [EXT4_XATTR_INDEX_USER] = &ext4_xattr_user_handler,
107#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
108 [EXT4_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext4_xattr_acl_access_handler,
109 [EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT] = &ext4_xattr_acl_default_handler,
110#endif
111 [EXT4_XATTR_INDEX_TRUSTED] = &ext4_xattr_trusted_handler,
112#ifdef CONFIG_EXT4DEV_FS_SECURITY
113 [EXT4_XATTR_INDEX_SECURITY] = &ext4_xattr_security_handler,
114#endif
115};
116
117struct xattr_handler *ext4_xattr_handlers[] = {
118 &ext4_xattr_user_handler,
119 &ext4_xattr_trusted_handler,
120#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
121 &ext4_xattr_acl_access_handler,
122 &ext4_xattr_acl_default_handler,
123#endif
124#ifdef CONFIG_EXT4DEV_FS_SECURITY
125 &ext4_xattr_security_handler,
126#endif
127 NULL
128};
129
130static inline struct xattr_handler *
131ext4_xattr_handler(int name_index)
132{
133 struct xattr_handler *handler = NULL;
134
135 if (name_index > 0 && name_index < ARRAY_SIZE(ext4_xattr_handler_map))
136 handler = ext4_xattr_handler_map[name_index];
137 return handler;
138}
139
140/*
141 * Inode operation listxattr()
142 *
143 * dentry->d_inode->i_mutex: don't care
144 */
145ssize_t
146ext4_listxattr(struct dentry *dentry, char *buffer, size_t size)
147{
148 return ext4_xattr_list(dentry->d_inode, buffer, size);
149}
150
151static int
152ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end)
153{
154 while (!IS_LAST_ENTRY(entry)) {
155 struct ext4_xattr_entry *next = EXT4_XATTR_NEXT(entry);
156 if ((void *)next >= end)
157 return -EIO;
158 entry = next;
159 }
160 return 0;
161}
162
163static inline int
164ext4_xattr_check_block(struct buffer_head *bh)
165{
166 int error;
167
168 if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) ||
169 BHDR(bh)->h_blocks != cpu_to_le32(1))
170 return -EIO;
171 error = ext4_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size);
172 return error;
173}
174
175static inline int
176ext4_xattr_check_entry(struct ext4_xattr_entry *entry, size_t size)
177{
178 size_t value_size = le32_to_cpu(entry->e_value_size);
179
180 if (entry->e_value_block != 0 || value_size > size ||
181 le16_to_cpu(entry->e_value_offs) + value_size > size)
182 return -EIO;
183 return 0;
184}
185
186static int
187ext4_xattr_find_entry(struct ext4_xattr_entry **pentry, int name_index,
188 const char *name, size_t size, int sorted)
189{
190 struct ext4_xattr_entry *entry;
191 size_t name_len;
192 int cmp = 1;
193
194 if (name == NULL)
195 return -EINVAL;
196 name_len = strlen(name);
197 entry = *pentry;
198 for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) {
199 cmp = name_index - entry->e_name_index;
200 if (!cmp)
201 cmp = name_len - entry->e_name_len;
202 if (!cmp)
203 cmp = memcmp(name, entry->e_name, name_len);
204 if (cmp <= 0 && (sorted || cmp == 0))
205 break;
206 }
207 *pentry = entry;
208 if (!cmp && ext4_xattr_check_entry(entry, size))
209 return -EIO;
210 return cmp ? -ENODATA : 0;
211}
212
213static int
214ext4_xattr_block_get(struct inode *inode, int name_index, const char *name,
215 void *buffer, size_t buffer_size)
216{
217 struct buffer_head *bh = NULL;
218 struct ext4_xattr_entry *entry;
219 size_t size;
220 int error;
221
222 ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld",
223 name_index, name, buffer, (long)buffer_size);
224
225 error = -ENODATA;
226 if (!EXT4_I(inode)->i_file_acl)
227 goto cleanup;
228 ea_idebug(inode, "reading block %u", EXT4_I(inode)->i_file_acl);
229 bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
230 if (!bh)
231 goto cleanup;
232 ea_bdebug(bh, "b_count=%d, refcount=%d",
233 atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
234 if (ext4_xattr_check_block(bh)) {
235bad_block: ext4_error(inode->i_sb, __FUNCTION__,
236 "inode %lu: bad block %llu", inode->i_ino,
237 EXT4_I(inode)->i_file_acl);
238 error = -EIO;
239 goto cleanup;
240 }
241 ext4_xattr_cache_insert(bh);
242 entry = BFIRST(bh);
243 error = ext4_xattr_find_entry(&entry, name_index, name, bh->b_size, 1);
244 if (error == -EIO)
245 goto bad_block;
246 if (error)
247 goto cleanup;
248 size = le32_to_cpu(entry->e_value_size);
249 if (buffer) {
250 error = -ERANGE;
251 if (size > buffer_size)
252 goto cleanup;
253 memcpy(buffer, bh->b_data + le16_to_cpu(entry->e_value_offs),
254 size);
255 }
256 error = size;
257
258cleanup:
259 brelse(bh);
260 return error;
261}
262
263static int
264ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name,
265 void *buffer, size_t buffer_size)
266{
267 struct ext4_xattr_ibody_header *header;
268 struct ext4_xattr_entry *entry;
269 struct ext4_inode *raw_inode;
270 struct ext4_iloc iloc;
271 size_t size;
272 void *end;
273 int error;
274
275 if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR))
276 return -ENODATA;
277 error = ext4_get_inode_loc(inode, &iloc);
278 if (error)
279 return error;
280 raw_inode = ext4_raw_inode(&iloc);
281 header = IHDR(inode, raw_inode);
282 entry = IFIRST(header);
283 end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
284 error = ext4_xattr_check_names(entry, end);
285 if (error)
286 goto cleanup;
287 error = ext4_xattr_find_entry(&entry, name_index, name,
288 end - (void *)entry, 0);
289 if (error)
290 goto cleanup;
291 size = le32_to_cpu(entry->e_value_size);
292 if (buffer) {
293 error = -ERANGE;
294 if (size > buffer_size)
295 goto cleanup;
296 memcpy(buffer, (void *)IFIRST(header) +
297 le16_to_cpu(entry->e_value_offs), size);
298 }
299 error = size;
300
301cleanup:
302 brelse(iloc.bh);
303 return error;
304}
305
306/*
307 * ext4_xattr_get()
308 *
309 * Copy an extended attribute into the buffer
310 * provided, or compute the buffer size required.
311 * Buffer is NULL to compute the size of the buffer required.
312 *
313 * Returns a negative error number on failure, or the number of bytes
314 * used / required on success.
315 */
316int
317ext4_xattr_get(struct inode *inode, int name_index, const char *name,
318 void *buffer, size_t buffer_size)
319{
320 int error;
321
322 down_read(&EXT4_I(inode)->xattr_sem);
323 error = ext4_xattr_ibody_get(inode, name_index, name, buffer,
324 buffer_size);
325 if (error == -ENODATA)
326 error = ext4_xattr_block_get(inode, name_index, name, buffer,
327 buffer_size);
328 up_read(&EXT4_I(inode)->xattr_sem);
329 return error;
330}
331
332static int
333ext4_xattr_list_entries(struct inode *inode, struct ext4_xattr_entry *entry,
334 char *buffer, size_t buffer_size)
335{
336 size_t rest = buffer_size;
337
338 for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) {
339 struct xattr_handler *handler =
340 ext4_xattr_handler(entry->e_name_index);
341
342 if (handler) {
343 size_t size = handler->list(inode, buffer, rest,
344 entry->e_name,
345 entry->e_name_len);
346 if (buffer) {
347 if (size > rest)
348 return -ERANGE;
349 buffer += size;
350 }
351 rest -= size;
352 }
353 }
354 return buffer_size - rest;
355}
356
357static int
358ext4_xattr_block_list(struct inode *inode, char *buffer, size_t buffer_size)
359{
360 struct buffer_head *bh = NULL;
361 int error;
362
363 ea_idebug(inode, "buffer=%p, buffer_size=%ld",
364 buffer, (long)buffer_size);
365
366 error = 0;
367 if (!EXT4_I(inode)->i_file_acl)
368 goto cleanup;
369 ea_idebug(inode, "reading block %u", EXT4_I(inode)->i_file_acl);
370 bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
371 error = -EIO;
372 if (!bh)
373 goto cleanup;
374 ea_bdebug(bh, "b_count=%d, refcount=%d",
375 atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
376 if (ext4_xattr_check_block(bh)) {
377 ext4_error(inode->i_sb, __FUNCTION__,
378 "inode %lu: bad block %llu", inode->i_ino,
379 EXT4_I(inode)->i_file_acl);
380 error = -EIO;
381 goto cleanup;
382 }
383 ext4_xattr_cache_insert(bh);
384 error = ext4_xattr_list_entries(inode, BFIRST(bh), buffer, buffer_size);
385
386cleanup:
387 brelse(bh);
388
389 return error;
390}
391
392static int
393ext4_xattr_ibody_list(struct inode *inode, char *buffer, size_t buffer_size)
394{
395 struct ext4_xattr_ibody_header *header;
396 struct ext4_inode *raw_inode;
397 struct ext4_iloc iloc;
398 void *end;
399 int error;
400
401 if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR))
402 return 0;
403 error = ext4_get_inode_loc(inode, &iloc);
404 if (error)
405 return error;
406 raw_inode = ext4_raw_inode(&iloc);
407 header = IHDR(inode, raw_inode);
408 end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
409 error = ext4_xattr_check_names(IFIRST(header), end);
410 if (error)
411 goto cleanup;
412 error = ext4_xattr_list_entries(inode, IFIRST(header),
413 buffer, buffer_size);
414
415cleanup:
416 brelse(iloc.bh);
417 return error;
418}
419
420/*
421 * ext4_xattr_list()
422 *
423 * Copy a list of attribute names into the buffer
424 * provided, or compute the buffer size required.
425 * If buffer is NULL, only the required size is computed.
426 *
427 * Returns a negative error number on failure, or the number of bytes
428 * used / required on success.
429 */
430int
431ext4_xattr_list(struct inode *inode, char *buffer, size_t buffer_size)
432{
433 int i_error, b_error;
434
435 down_read(&EXT4_I(inode)->xattr_sem);
436 i_error = ext4_xattr_ibody_list(inode, buffer, buffer_size);
437 if (i_error < 0) {
438 b_error = 0;
439 } else {
440 if (buffer) {
441 buffer += i_error;
442 buffer_size -= i_error;
443 }
444 b_error = ext4_xattr_block_list(inode, buffer, buffer_size);
445 if (b_error < 0)
446 i_error = 0;
447 }
448 up_read(&EXT4_I(inode)->xattr_sem);
449 return i_error + b_error;
450}
451
452/*
453 * If the EXT4_FEATURE_COMPAT_EXT_ATTR feature of this file system is
454 * not set, set it.
455 */
456static void ext4_xattr_update_super_block(handle_t *handle,
457 struct super_block *sb)
458{
459 if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_EXT_ATTR))
460 return;
461
462 lock_super(sb);
463 if (ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh) == 0) {
464 EXT4_SB(sb)->s_es->s_feature_compat |=
465 cpu_to_le32(EXT4_FEATURE_COMPAT_EXT_ATTR);
466 sb->s_dirt = 1;
467 ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh);
468 }
469 unlock_super(sb);
470}
471
472/*
473 * Release the xattr block BH: if the reference count is > 1, decrement
474 * it; otherwise free the block.
475 */
476static void
477ext4_xattr_release_block(handle_t *handle, struct inode *inode,
478 struct buffer_head *bh)
479{
480 struct mb_cache_entry *ce = NULL;
481
482 ce = mb_cache_entry_get(ext4_xattr_cache, bh->b_bdev, bh->b_blocknr);
483 if (BHDR(bh)->h_refcount == cpu_to_le32(1)) {
484 ea_bdebug(bh, "refcount now=0; freeing");
485 if (ce)
486 mb_cache_entry_free(ce);
487 ext4_free_blocks(handle, inode, bh->b_blocknr, 1);
488 get_bh(bh);
489 ext4_forget(handle, 1, inode, bh, bh->b_blocknr);
490 } else {
491 if (ext4_journal_get_write_access(handle, bh) == 0) {
492 lock_buffer(bh);
493 BHDR(bh)->h_refcount = cpu_to_le32(
494 le32_to_cpu(BHDR(bh)->h_refcount) - 1);
495 ext4_journal_dirty_metadata(handle, bh);
496 if (IS_SYNC(inode))
497 handle->h_sync = 1;
498 DQUOT_FREE_BLOCK(inode, 1);
499 unlock_buffer(bh);
500 ea_bdebug(bh, "refcount now=%d; releasing",
501 le32_to_cpu(BHDR(bh)->h_refcount));
502 }
503 if (ce)
504 mb_cache_entry_release(ce);
505 }
506}
507
508struct ext4_xattr_info {
509 int name_index;
510 const char *name;
511 const void *value;
512 size_t value_len;
513};
514
515struct ext4_xattr_search {
516 struct ext4_xattr_entry *first;
517 void *base;
518 void *end;
519 struct ext4_xattr_entry *here;
520 int not_found;
521};
522
523static int
524ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s)
525{
526 struct ext4_xattr_entry *last;
527 size_t free, min_offs = s->end - s->base, name_len = strlen(i->name);
528
529 /* Compute min_offs and last. */
530 last = s->first;
531 for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) {
532 if (!last->e_value_block && last->e_value_size) {
533 size_t offs = le16_to_cpu(last->e_value_offs);
534 if (offs < min_offs)
535 min_offs = offs;
536 }
537 }
538 free = min_offs - ((void *)last - s->base) - sizeof(__u32);
539 if (!s->not_found) {
540 if (!s->here->e_value_block && s->here->e_value_size) {
541 size_t size = le32_to_cpu(s->here->e_value_size);
542 free += EXT4_XATTR_SIZE(size);
543 }
544 free += EXT4_XATTR_LEN(name_len);
545 }
546 if (i->value) {
547 if (free < EXT4_XATTR_SIZE(i->value_len) ||
548 free < EXT4_XATTR_LEN(name_len) +
549 EXT4_XATTR_SIZE(i->value_len))
550 return -ENOSPC;
551 }
552
553 if (i->value && s->not_found) {
554 /* Insert the new name. */
555 size_t size = EXT4_XATTR_LEN(name_len);
556 size_t rest = (void *)last - (void *)s->here + sizeof(__u32);
557 memmove((void *)s->here + size, s->here, rest);
558 memset(s->here, 0, size);
559 s->here->e_name_index = i->name_index;
560 s->here->e_name_len = name_len;
561 memcpy(s->here->e_name, i->name, name_len);
562 } else {
563 if (!s->here->e_value_block && s->here->e_value_size) {
564 void *first_val = s->base + min_offs;
565 size_t offs = le16_to_cpu(s->here->e_value_offs);
566 void *val = s->base + offs;
567 size_t size = EXT4_XATTR_SIZE(
568 le32_to_cpu(s->here->e_value_size));
569
570 if (i->value && size == EXT4_XATTR_SIZE(i->value_len)) {
571 /* The old and the new value have the same
572 size. Just replace. */
573 s->here->e_value_size =
574 cpu_to_le32(i->value_len);
575 memset(val + size - EXT4_XATTR_PAD, 0,
576 EXT4_XATTR_PAD); /* Clear pad bytes. */
577 memcpy(val, i->value, i->value_len);
578 return 0;
579 }
580
581 /* Remove the old value. */
582 memmove(first_val + size, first_val, val - first_val);
583 memset(first_val, 0, size);
584 s->here->e_value_size = 0;
585 s->here->e_value_offs = 0;
586 min_offs += size;
587
588 /* Adjust all value offsets. */
589 last = s->first;
590 while (!IS_LAST_ENTRY(last)) {
591 size_t o = le16_to_cpu(last->e_value_offs);
592 if (!last->e_value_block &&
593 last->e_value_size && o < offs)
594 last->e_value_offs =
595 cpu_to_le16(o + size);
596 last = EXT4_XATTR_NEXT(last);
597 }
598 }
599 if (!i->value) {
600 /* Remove the old name. */
601 size_t size = EXT4_XATTR_LEN(name_len);
602 last = ENTRY((void *)last - size);
603 memmove(s->here, (void *)s->here + size,
604 (void *)last - (void *)s->here + sizeof(__u32));
605 memset(last, 0, size);
606 }
607 }
608
609 if (i->value) {
610 /* Insert the new value. */
611 s->here->e_value_size = cpu_to_le32(i->value_len);
612 if (i->value_len) {
613 size_t size = EXT4_XATTR_SIZE(i->value_len);
614 void *val = s->base + min_offs - size;
615 s->here->e_value_offs = cpu_to_le16(min_offs - size);
616 memset(val + size - EXT4_XATTR_PAD, 0,
617 EXT4_XATTR_PAD); /* Clear the pad bytes. */
618 memcpy(val, i->value, i->value_len);
619 }
620 }
621 return 0;
622}
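/*
 * Layout assumed throughout ext4_xattr_set_entry(), for both the
 * in-inode area and a separate block: entry descriptors are packed
 * from s->first toward the end, values are carved from s->end downward,
 * min_offs marks the start of the value area, and the free gap is the
 * space between the terminating __u32 zero after the last entry and
 * min_offs.
 */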
623
624struct ext4_xattr_block_find {
625 struct ext4_xattr_search s;
626 struct buffer_head *bh;
627};
628
629static int
630ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i,
631 struct ext4_xattr_block_find *bs)
632{
633 struct super_block *sb = inode->i_sb;
634 int error;
635
636 ea_idebug(inode, "name=%d.%s, value=%p, value_len=%ld",
637 i->name_index, i->name, i->value, (long)i->value_len);
638
639 if (EXT4_I(inode)->i_file_acl) {
640 /* The inode already has an extended attribute block. */
641 bs->bh = sb_bread(sb, EXT4_I(inode)->i_file_acl);
642 error = -EIO;
643 if (!bs->bh)
644 goto cleanup;
645 ea_bdebug(bs->bh, "b_count=%d, refcount=%d",
646 atomic_read(&(bs->bh->b_count)),
647 le32_to_cpu(BHDR(bs->bh)->h_refcount));
648 if (ext4_xattr_check_block(bs->bh)) {
649 ext4_error(sb, __FUNCTION__,
650 "inode %lu: bad block %llu", inode->i_ino,
651 EXT4_I(inode)->i_file_acl);
652 error = -EIO;
653 goto cleanup;
654 }
655 /* Find the named attribute. */
656 bs->s.base = BHDR(bs->bh);
657 bs->s.first = BFIRST(bs->bh);
658 bs->s.end = bs->bh->b_data + bs->bh->b_size;
659 bs->s.here = bs->s.first;
660 error = ext4_xattr_find_entry(&bs->s.here, i->name_index,
661 i->name, bs->bh->b_size, 1);
662 if (error && error != -ENODATA)
663 goto cleanup;
664 bs->s.not_found = error;
665 }
666 error = 0;
667
668cleanup:
669 return error;
670}
671
672static int
673ext4_xattr_block_set(handle_t *handle, struct inode *inode,
674 struct ext4_xattr_info *i,
675 struct ext4_xattr_block_find *bs)
676{
677 struct super_block *sb = inode->i_sb;
678 struct buffer_head *new_bh = NULL;
679 struct ext4_xattr_search *s = &bs->s;
680 struct mb_cache_entry *ce = NULL;
681 int error;
682
683#define header(x) ((struct ext4_xattr_header *)(x))
684
685 if (i->value && i->value_len > sb->s_blocksize)
686 return -ENOSPC;
687 if (s->base) {
688 ce = mb_cache_entry_get(ext4_xattr_cache, bs->bh->b_bdev,
689 bs->bh->b_blocknr);
690 if (header(s->base)->h_refcount == cpu_to_le32(1)) {
691 if (ce) {
692 mb_cache_entry_free(ce);
693 ce = NULL;
694 }
695 ea_bdebug(bs->bh, "modifying in-place");
696 error = ext4_journal_get_write_access(handle, bs->bh);
697 if (error)
698 goto cleanup;
699 lock_buffer(bs->bh);
700 error = ext4_xattr_set_entry(i, s);
701 if (!error) {
702 if (!IS_LAST_ENTRY(s->first))
703 ext4_xattr_rehash(header(s->base),
704 s->here);
705 ext4_xattr_cache_insert(bs->bh);
706 }
707 unlock_buffer(bs->bh);
708 if (error == -EIO)
709 goto bad_block;
710 if (!error)
711 error = ext4_journal_dirty_metadata(handle,
712 bs->bh);
713 if (error)
714 goto cleanup;
715 goto inserted;
716 } else {
717 int offset = (char *)s->here - bs->bh->b_data;
718
719 if (ce) {
720 mb_cache_entry_release(ce);
721 ce = NULL;
722 }
723 ea_bdebug(bs->bh, "cloning");
724 s->base = kmalloc(bs->bh->b_size, GFP_KERNEL);
725 error = -ENOMEM;
726 if (s->base == NULL)
727 goto cleanup;
728 memcpy(s->base, BHDR(bs->bh), bs->bh->b_size);
729 s->first = ENTRY(header(s->base)+1);
730 header(s->base)->h_refcount = cpu_to_le32(1);
731 s->here = ENTRY(s->base + offset);
732 s->end = s->base + bs->bh->b_size;
733 }
734 } else {
735 /* Allocate a buffer where we construct the new block. */
736 s->base = kmalloc(sb->s_blocksize, GFP_KERNEL);
737 /* assert(header == s->base) */
738 error = -ENOMEM;
739 if (s->base == NULL)
740 goto cleanup;
741 memset(s->base, 0, sb->s_blocksize);
742 header(s->base)->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC);
743 header(s->base)->h_blocks = cpu_to_le32(1);
744 header(s->base)->h_refcount = cpu_to_le32(1);
745 s->first = ENTRY(header(s->base)+1);
746 s->here = ENTRY(header(s->base)+1);
747 s->end = s->base + sb->s_blocksize;
748 }
749
750 error = ext4_xattr_set_entry(i, s);
751 if (error == -EIO)
752 goto bad_block;
753 if (error)
754 goto cleanup;
755 if (!IS_LAST_ENTRY(s->first))
756 ext4_xattr_rehash(header(s->base), s->here);
757
758inserted:
759 if (!IS_LAST_ENTRY(s->first)) {
760 new_bh = ext4_xattr_cache_find(inode, header(s->base), &ce);
761 if (new_bh) {
762 /* We found an identical block in the cache. */
763 if (new_bh == bs->bh)
764 ea_bdebug(new_bh, "keeping");
765 else {
766 /* The old block is released after updating
767 the inode. */
768 error = -EDQUOT;
769 if (DQUOT_ALLOC_BLOCK(inode, 1))
770 goto cleanup;
771 error = ext4_journal_get_write_access(handle,
772 new_bh);
773 if (error)
774 goto cleanup_dquot;
775 lock_buffer(new_bh);
776 BHDR(new_bh)->h_refcount = cpu_to_le32(1 +
777 le32_to_cpu(BHDR(new_bh)->h_refcount));
778 ea_bdebug(new_bh, "reusing; refcount now=%d",
779 le32_to_cpu(BHDR(new_bh)->h_refcount));
780 unlock_buffer(new_bh);
781 error = ext4_journal_dirty_metadata(handle,
782 new_bh);
783 if (error)
784 goto cleanup_dquot;
785 }
786 mb_cache_entry_release(ce);
787 ce = NULL;
788 } else if (bs->bh && s->base == bs->bh->b_data) {
789 /* We were modifying this block in-place. */
790 ea_bdebug(bs->bh, "keeping this block");
791 new_bh = bs->bh;
792 get_bh(new_bh);
793 } else {
794 /* We need to allocate a new block */
795 ext4_fsblk_t goal = le32_to_cpu(
796 EXT4_SB(sb)->s_es->s_first_data_block) +
797 (ext4_fsblk_t)EXT4_I(inode)->i_block_group *
798 EXT4_BLOCKS_PER_GROUP(sb);
799 ext4_fsblk_t block = ext4_new_block(handle, inode,
800 goal, &error);
801 if (error)
802 goto cleanup;
803 ea_idebug(inode, "creating block %d", block);
804
805 new_bh = sb_getblk(sb, block);
806 if (!new_bh) {
807getblk_failed:
808 ext4_free_blocks(handle, inode, block, 1);
809 error = -EIO;
810 goto cleanup;
811 }
812 lock_buffer(new_bh);
813 error = ext4_journal_get_create_access(handle, new_bh);
814 if (error) {
815 unlock_buffer(new_bh);
816 goto getblk_failed;
817 }
818 memcpy(new_bh->b_data, s->base, new_bh->b_size);
819 set_buffer_uptodate(new_bh);
820 unlock_buffer(new_bh);
821 ext4_xattr_cache_insert(new_bh);
822 error = ext4_journal_dirty_metadata(handle, new_bh);
823 if (error)
824 goto cleanup;
825 }
826 }
827
828 /* Update the inode. */
829 EXT4_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0;
830
831 /* Drop the previous xattr block. */
832 if (bs->bh && bs->bh != new_bh)
833 ext4_xattr_release_block(handle, inode, bs->bh);
834 error = 0;
835
836cleanup:
837 if (ce)
838 mb_cache_entry_release(ce);
839 brelse(new_bh);
840 if (!(bs->bh && s->base == bs->bh->b_data))
841 kfree(s->base);
842
843 return error;
844
845cleanup_dquot:
846 DQUOT_FREE_BLOCK(inode, 1);
847 goto cleanup;
848
849bad_block:
850 ext4_error(inode->i_sb, __FUNCTION__,
851 "inode %lu: bad block %llu", inode->i_ino,
852 EXT4_I(inode)->i_file_acl);
853 goto cleanup;
854
855#undef header
856}
857
858struct ext4_xattr_ibody_find {
859 struct ext4_xattr_search s;
860 struct ext4_iloc iloc;
861};
862
863static int
864ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
865 struct ext4_xattr_ibody_find *is)
866{
867 struct ext4_xattr_ibody_header *header;
868 struct ext4_inode *raw_inode;
869 int error;
870
871 if (EXT4_I(inode)->i_extra_isize == 0)
872 return 0;
873 raw_inode = ext4_raw_inode(&is->iloc);
874 header = IHDR(inode, raw_inode);
875 is->s.base = is->s.first = IFIRST(header);
876 is->s.here = is->s.first;
877 is->s.end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
878 if (EXT4_I(inode)->i_state & EXT4_STATE_XATTR) {
879 error = ext4_xattr_check_names(IFIRST(header), is->s.end);
880 if (error)
881 return error;
882 /* Find the named attribute. */
883 error = ext4_xattr_find_entry(&is->s.here, i->name_index,
884 i->name, is->s.end -
885 (void *)is->s.base, 0);
886 if (error && error != -ENODATA)
887 return error;
888 is->s.not_found = error;
889 }
890 return 0;
891}
892
893static int
894ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
895 struct ext4_xattr_info *i,
896 struct ext4_xattr_ibody_find *is)
897{
898 struct ext4_xattr_ibody_header *header;
899 struct ext4_xattr_search *s = &is->s;
900 int error;
901
902 if (EXT4_I(inode)->i_extra_isize == 0)
903 return -ENOSPC;
904 error = ext4_xattr_set_entry(i, s);
905 if (error)
906 return error;
907 header = IHDR(inode, ext4_raw_inode(&is->iloc));
908 if (!IS_LAST_ENTRY(s->first)) {
909 header->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC);
910 EXT4_I(inode)->i_state |= EXT4_STATE_XATTR;
911 } else {
912 header->h_magic = cpu_to_le32(0);
913 EXT4_I(inode)->i_state &= ~EXT4_STATE_XATTR;
914 }
915 return 0;
916}
917
918/*
919 * ext4_xattr_set_handle()
920 *
921 * Create, replace or remove an extended attribute for this inode. Value
922 * is NULL to remove an existing extended attribute, and non-NULL to
923 * either replace an existing extended attribute, or create a new extended
924 * attribute. The flags XATTR_REPLACE and XATTR_CREATE
925 * specify that an extended attribute must exist and must not exist
926 * prior to the call, respectively.
927 *
928 * Returns 0, or a negative error number on failure.
929 */
930int
931ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
932 const char *name, const void *value, size_t value_len,
933 int flags)
934{
935 struct ext4_xattr_info i = {
936 .name_index = name_index,
937 .name = name,
938 .value = value,
939 .value_len = value_len,
940
941 };
942 struct ext4_xattr_ibody_find is = {
943 .s = { .not_found = -ENODATA, },
944 };
945 struct ext4_xattr_block_find bs = {
946 .s = { .not_found = -ENODATA, },
947 };
948 int error;
949
950 if (!name)
951 return -EINVAL;
952 if (strlen(name) > 255)
953 return -ERANGE;
954 down_write(&EXT4_I(inode)->xattr_sem);
955 error = ext4_get_inode_loc(inode, &is.iloc);
956 if (error)
957 goto cleanup;
958
959 if (EXT4_I(inode)->i_state & EXT4_STATE_NEW) {
960 struct ext4_inode *raw_inode = ext4_raw_inode(&is.iloc);
961 memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
962 EXT4_I(inode)->i_state &= ~EXT4_STATE_NEW;
963 }
964
965 error = ext4_xattr_ibody_find(inode, &i, &is);
966 if (error)
967 goto cleanup;
968 if (is.s.not_found)
969 error = ext4_xattr_block_find(inode, &i, &bs);
970 if (error)
971 goto cleanup;
972 if (is.s.not_found && bs.s.not_found) {
973 error = -ENODATA;
974 if (flags & XATTR_REPLACE)
975 goto cleanup;
976 error = 0;
977 if (!value)
978 goto cleanup;
979 } else {
980 error = -EEXIST;
981 if (flags & XATTR_CREATE)
982 goto cleanup;
983 }
984 error = ext4_journal_get_write_access(handle, is.iloc.bh);
985 if (error)
986 goto cleanup;
987 if (!value) {
988 if (!is.s.not_found)
989 error = ext4_xattr_ibody_set(handle, inode, &i, &is);
990 else if (!bs.s.not_found)
991 error = ext4_xattr_block_set(handle, inode, &i, &bs);
992 } else {
993 error = ext4_xattr_ibody_set(handle, inode, &i, &is);
994 if (!error && !bs.s.not_found) {
995 i.value = NULL;
996 error = ext4_xattr_block_set(handle, inode, &i, &bs);
997 } else if (error == -ENOSPC) {
998 error = ext4_xattr_block_set(handle, inode, &i, &bs);
999 if (error)
1000 goto cleanup;
1001 if (!is.s.not_found) {
1002 i.value = NULL;
1003 error = ext4_xattr_ibody_set(handle, inode, &i,
1004 &is);
1005 }
1006 }
1007 }
1008 if (!error) {
1009 ext4_xattr_update_super_block(handle, inode->i_sb);
1010 inode->i_ctime = CURRENT_TIME_SEC;
1011 error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);
1012 /*
1013 * The bh is consumed by ext4_mark_iloc_dirty, even with
1014 * error != 0.
1015 */
1016 is.iloc.bh = NULL;
1017 if (IS_SYNC(inode))
1018 handle->h_sync = 1;
1019 }
1020
1021cleanup:
1022 brelse(is.iloc.bh);
1023 brelse(bs.bh);
1024 up_write(&EXT4_I(inode)->xattr_sem);
1025 return error;
1026}
1027
1028/*
1029 * ext4_xattr_set()
1030 *
1031 * Like ext4_xattr_set_handle, but starts its own journal handle. This
1032 * extended attribute modification is a filesystem transaction by itself.
1033 *
1034 * Returns 0, or a negative error number on failure.
1035 */
1036int
1037ext4_xattr_set(struct inode *inode, int name_index, const char *name,
1038 const void *value, size_t value_len, int flags)
1039{
1040 handle_t *handle;
1041 int error, retries = 0;
1042
1043retry:
1044 handle = ext4_journal_start(inode, EXT4_DATA_TRANS_BLOCKS(inode->i_sb));
1045 if (IS_ERR(handle)) {
1046 error = PTR_ERR(handle);
1047 } else {
1048 int error2;
1049
1050 error = ext4_xattr_set_handle(handle, inode, name_index, name,
1051 value, value_len, flags);
1052 error2 = ext4_journal_stop(handle);
1053 if (error == -ENOSPC &&
1054 ext4_should_retry_alloc(inode->i_sb, &retries))
1055 goto retry;
1056 if (error == 0)
1057 error = error2;
1058 }
1059
1060 return error;
1061}
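/*
 * Flag semantics in a sketch (the attribute name and values are
 * hypothetical; each ext4_xattr_set() call is its own transaction):
 */
static int example_tag_inode(struct inode *inode)
{
	/* Create only: fails with -EEXIST if "origin" already exists. */
	int err = ext4_xattr_set(inode, EXT4_XATTR_INDEX_USER, "origin",
				 "cdrom", 5, XATTR_CREATE);
	if (err)
		return err;
	/* Replace only: fails with -ENODATA if "origin" does not exist. */
	err = ext4_xattr_set(inode, EXT4_XATTR_INDEX_USER, "origin",
			     "net", 3, XATTR_REPLACE);
	if (err)
		return err;
	/* A NULL value removes the attribute. */
	return ext4_xattr_set(inode, EXT4_XATTR_INDEX_USER, "origin",
			      NULL, 0, 0);
}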
1062
1063/*
1064 * ext4_xattr_delete_inode()
1065 *
1066 * Free extended attribute resources associated with this inode. This
1067 * is called immediately before an inode is freed. We have exclusive
1068 * access to the inode.
1069 */
1070void
1071ext4_xattr_delete_inode(handle_t *handle, struct inode *inode)
1072{
1073 struct buffer_head *bh = NULL;
1074
1075 if (!EXT4_I(inode)->i_file_acl)
1076 goto cleanup;
1077 bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
1078 if (!bh) {
1079 ext4_error(inode->i_sb, __FUNCTION__,
1080 "inode %lu: block %llu read error", inode->i_ino,
1081 EXT4_I(inode)->i_file_acl);
1082 goto cleanup;
1083 }
1084 if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) ||
1085 BHDR(bh)->h_blocks != cpu_to_le32(1)) {
1086 ext4_error(inode->i_sb, __FUNCTION__,
1087 "inode %lu: bad block %llu", inode->i_ino,
1088 EXT4_I(inode)->i_file_acl);
1089 goto cleanup;
1090 }
1091 ext4_xattr_release_block(handle, inode, bh);
1092 EXT4_I(inode)->i_file_acl = 0;
1093
1094cleanup:
1095 brelse(bh);
1096}
1097
1098/*
1099 * ext4_xattr_put_super()
1100 *
1101 * This is called when a file system is unmounted.
1102 */
1103void
1104ext4_xattr_put_super(struct super_block *sb)
1105{
1106 mb_cache_shrink(sb->s_bdev);
1107}
1108
1109/*
1110 * ext4_xattr_cache_insert()
1111 *
1112 * Create a new entry in the extended attribute cache, and insert
1113 * it unless such an entry is already in the cache.
1114 *
1115 * Failures are silently ignored; the function has no return value.
1116 */
1117static void
1118ext4_xattr_cache_insert(struct buffer_head *bh)
1119{
1120 __u32 hash = le32_to_cpu(BHDR(bh)->h_hash);
1121 struct mb_cache_entry *ce;
1122 int error;
1123
1124 ce = mb_cache_entry_alloc(ext4_xattr_cache);
1125 if (!ce) {
1126 ea_bdebug(bh, "out of memory");
1127 return;
1128 }
1129 error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, &hash);
1130 if (error) {
1131 mb_cache_entry_free(ce);
1132 if (error == -EBUSY) {
1133 ea_bdebug(bh, "already in cache");
1134 error = 0;
1135 }
1136 } else {
1137 ea_bdebug(bh, "inserting [%x]", (int)hash);
1138 mb_cache_entry_release(ce);
1139 }
1140}
1141
1142/*
1143 * ext4_xattr_cmp()
1144 *
1145 * Compare two extended attribute blocks for equality.
1146 *
1147 * Returns 0 if the blocks are equal, 1 if they differ, and
1148 * a negative error number on errors.
1149 */
1150static int
1151ext4_xattr_cmp(struct ext4_xattr_header *header1,
1152 struct ext4_xattr_header *header2)
1153{
1154 struct ext4_xattr_entry *entry1, *entry2;
1155
1156 entry1 = ENTRY(header1+1);
1157 entry2 = ENTRY(header2+1);
1158 while (!IS_LAST_ENTRY(entry1)) {
1159 if (IS_LAST_ENTRY(entry2))
1160 return 1;
1161 if (entry1->e_hash != entry2->e_hash ||
1162 entry1->e_name_index != entry2->e_name_index ||
1163 entry1->e_name_len != entry2->e_name_len ||
1164 entry1->e_value_size != entry2->e_value_size ||
1165 memcmp(entry1->e_name, entry2->e_name, entry1->e_name_len))
1166 return 1;
1167 if (entry1->e_value_block != 0 || entry2->e_value_block != 0)
1168 return -EIO;
1169 if (memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs),
1170 (char *)header2 + le16_to_cpu(entry2->e_value_offs),
1171 le32_to_cpu(entry1->e_value_size)))
1172 return 1;
1173
1174 entry1 = EXT4_XATTR_NEXT(entry1);
1175 entry2 = EXT4_XATTR_NEXT(entry2);
1176 }
1177 if (!IS_LAST_ENTRY(entry2))
1178 return 1;
1179 return 0;
1180}
1181
1182/*
1183 * ext4_xattr_cache_find()
1184 *
1185 * Find an identical extended attribute block.
1186 *
1187 * Returns a pointer to the block found, or NULL if such a block was
1188 * not found or an error occurred.
1189 */
1190static struct buffer_head *
1191ext4_xattr_cache_find(struct inode *inode, struct ext4_xattr_header *header,
1192 struct mb_cache_entry **pce)
1193{
1194 __u32 hash = le32_to_cpu(header->h_hash);
1195 struct mb_cache_entry *ce;
1196
1197 if (!header->h_hash)
1198 return NULL; /* never share */
1199 ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);
1200again:
1201 ce = mb_cache_entry_find_first(ext4_xattr_cache, 0,
1202 inode->i_sb->s_bdev, hash);
1203 while (ce) {
1204 struct buffer_head *bh;
1205
1206 if (IS_ERR(ce)) {
1207 if (PTR_ERR(ce) == -EAGAIN)
1208 goto again;
1209 break;
1210 }
1211 bh = sb_bread(inode->i_sb, ce->e_block);
1212 if (!bh) {
1213 ext4_error(inode->i_sb, __FUNCTION__,
1214 "inode %lu: block %lu read error",
1215 inode->i_ino, (unsigned long) ce->e_block);
1216 } else if (le32_to_cpu(BHDR(bh)->h_refcount) >=
1217 EXT4_XATTR_REFCOUNT_MAX) {
1218 ea_idebug(inode, "block %lu refcount %d>=%d",
1219 (unsigned long) ce->e_block,
1220 le32_to_cpu(BHDR(bh)->h_refcount),
1221 EXT4_XATTR_REFCOUNT_MAX);
1222 } else if (ext4_xattr_cmp(header, BHDR(bh)) == 0) {
1223 *pce = ce;
1224 return bh;
1225 }
1226 brelse(bh);
1227 ce = mb_cache_entry_find_next(ce, 0, inode->i_sb->s_bdev, hash);
1228 }
1229 return NULL;
1230}
1231
1232#define NAME_HASH_SHIFT 5
1233#define VALUE_HASH_SHIFT 16
1234
1235/*
1236 * ext4_xattr_hash_entry()
1237 *
1238 * Compute the hash of an extended attribute.
1239 */
1240static inline void ext4_xattr_hash_entry(struct ext4_xattr_header *header,
1241 struct ext4_xattr_entry *entry)
1242{
1243 __u32 hash = 0;
1244 char *name = entry->e_name;
1245 int n;
1246
1247 for (n=0; n < entry->e_name_len; n++) {
1248 hash = (hash << NAME_HASH_SHIFT) ^
1249 (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^
1250 *name++;
1251 }
1252
1253 if (entry->e_value_block == 0 && entry->e_value_size != 0) {
1254 __le32 *value = (__le32 *)((char *)header +
1255 le16_to_cpu(entry->e_value_offs));
1256 for (n = (le32_to_cpu(entry->e_value_size) +
1257 EXT4_XATTR_ROUND) >> EXT4_XATTR_PAD_BITS; n; n--) {
1258 hash = (hash << VALUE_HASH_SHIFT) ^
1259 (hash >> (8*sizeof(hash) - VALUE_HASH_SHIFT)) ^
1260 le32_to_cpu(*value++);
1261 }
1262 }
1263 entry->e_hash = cpu_to_le32(hash);
1264}
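/*
 * The two loops above are rotate-and-xor recurrences: since
 * (h << s) ^ (h >> (32 - s)) is a 32-bit rotate left by s bits, the
 * name folds in as hash = rol32(hash, 5) ^ name_byte and the padded
 * value as hash = rol32(hash, 16) ^ le32_to_cpu(value_word).
 */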
1265
1266#undef NAME_HASH_SHIFT
1267#undef VALUE_HASH_SHIFT
1268
1269#define BLOCK_HASH_SHIFT 16
1270
1271/*
1272 * ext4_xattr_rehash()
1273 *
1274 * Re-compute the extended attribute hash value after an entry has changed.
1275 */
1276static void ext4_xattr_rehash(struct ext4_xattr_header *header,
1277 struct ext4_xattr_entry *entry)
1278{
1279 struct ext4_xattr_entry *here;
1280 __u32 hash = 0;
1281
1282 ext4_xattr_hash_entry(header, entry);
1283 here = ENTRY(header+1);
1284 while (!IS_LAST_ENTRY(here)) {
1285 if (!here->e_hash) {
1286 /* Block is not shared if an entry's hash value == 0 */
1287 hash = 0;
1288 break;
1289 }
1290 hash = (hash << BLOCK_HASH_SHIFT) ^
1291 (hash >> (8*sizeof(hash) - BLOCK_HASH_SHIFT)) ^
1292 le32_to_cpu(here->e_hash);
1293 here = EXT4_XATTR_NEXT(here);
1294 }
1295 header->h_hash = cpu_to_le32(hash);
1296}
1297
1298#undef BLOCK_HASH_SHIFT
1299
1300int __init
1301init_ext4_xattr(void)
1302{
1303 ext4_xattr_cache = mb_cache_create("ext4_xattr", NULL,
1304 sizeof(struct mb_cache_entry) +
1305 sizeof(((struct mb_cache_entry *) 0)->e_indexes[0]), 1, 6);
1306 if (!ext4_xattr_cache)
1307 return -ENOMEM;
1308 return 0;
1309}
1310
1311void
1312exit_ext4_xattr(void)
1313{
1314 if (ext4_xattr_cache)
1315 mb_cache_destroy(ext4_xattr_cache);
1316 ext4_xattr_cache = NULL;
1317}
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
new file mode 100644
index 000000000000..79432b35398f
--- /dev/null
+++ b/fs/ext4/xattr.h
@@ -0,0 +1,145 @@
1/*
2 File: fs/ext4/xattr.h
3
4 On-disk format of extended attributes for the ext4 filesystem.
5
6 (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org>
7*/
8
9#include <linux/xattr.h>
10
11/* Magic value in attribute blocks */
12#define EXT4_XATTR_MAGIC 0xEA020000
13
14/* Maximum number of references to one attribute block */
15#define EXT4_XATTR_REFCOUNT_MAX 1024
16
17/* Name indexes */
18#define EXT4_XATTR_INDEX_USER 1
19#define EXT4_XATTR_INDEX_POSIX_ACL_ACCESS 2
20#define EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT 3
21#define EXT4_XATTR_INDEX_TRUSTED 4
22#define EXT4_XATTR_INDEX_LUSTRE 5
23#define EXT4_XATTR_INDEX_SECURITY 6
24
25struct ext4_xattr_header {
26 __le32 h_magic; /* magic number for identification */
27 __le32 h_refcount; /* reference count */
28 __le32 h_blocks; /* number of disk blocks used */
29 __le32 h_hash; /* hash value of all attributes */
30 __u32 h_reserved[4]; /* zero right now */
31};
32
33struct ext4_xattr_ibody_header {
34 __le32 h_magic; /* magic number for identification */
35};
36
37struct ext4_xattr_entry {
38 __u8 e_name_len; /* length of name */
39 __u8 e_name_index; /* attribute name index */
40 __le16 e_value_offs; /* offset in disk block of value */
41 __le32 e_value_block; /* disk block attribute is stored on (not implemented) */
42 __le32 e_value_size; /* size of attribute value */
43 __le32 e_hash; /* hash value of name and value */
44 char e_name[0]; /* attribute name */
45};
46
47#define EXT4_XATTR_PAD_BITS 2
48#define EXT4_XATTR_PAD (1<<EXT4_XATTR_PAD_BITS)
49#define EXT4_XATTR_ROUND (EXT4_XATTR_PAD-1)
50#define EXT4_XATTR_LEN(name_len) \
51 (((name_len) + EXT4_XATTR_ROUND + \
52 sizeof(struct ext4_xattr_entry)) & ~EXT4_XATTR_ROUND)
53#define EXT4_XATTR_NEXT(entry) \
54 ( (struct ext4_xattr_entry *)( \
55 (char *)(entry) + EXT4_XATTR_LEN((entry)->e_name_len)) )
56#define EXT4_XATTR_SIZE(size) \
57 (((size) + EXT4_XATTR_ROUND) & ~EXT4_XATTR_ROUND)
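/*
 * Worked example (sketch): with sizeof(struct ext4_xattr_entry) == 16,
 * an attribute named "foo" (e_name_len == 3; the namespace prefix is
 * carried by e_name_index) with a 10-byte value costs
 *
 *	EXT4_XATTR_LEN(3)   = (3 + 3 + 16) & ~3 = 20 bytes of entry,
 *	EXT4_XATTR_SIZE(10) = (10 + 3) & ~3     = 12 bytes of value,
 *
 * both rounded up to the 4-byte EXT4_XATTR_PAD boundary.
 */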
58
59# ifdef CONFIG_EXT4DEV_FS_XATTR
60
61extern struct xattr_handler ext4_xattr_user_handler;
62extern struct xattr_handler ext4_xattr_trusted_handler;
63extern struct xattr_handler ext4_xattr_acl_access_handler;
64extern struct xattr_handler ext4_xattr_acl_default_handler;
65extern struct xattr_handler ext4_xattr_security_handler;
66
67extern ssize_t ext4_listxattr(struct dentry *, char *, size_t);
68
69extern int ext4_xattr_get(struct inode *, int, const char *, void *, size_t);
70extern int ext4_xattr_list(struct inode *, char *, size_t);
71extern int ext4_xattr_set(struct inode *, int, const char *, const void *, size_t, int);
72extern int ext4_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int);
73
74extern void ext4_xattr_delete_inode(handle_t *, struct inode *);
75extern void ext4_xattr_put_super(struct super_block *);
76
77extern int init_ext4_xattr(void);
78extern void exit_ext4_xattr(void);
79
80extern struct xattr_handler *ext4_xattr_handlers[];
81
82# else /* CONFIG_EXT4DEV_FS_XATTR */
83
84static inline int
85ext4_xattr_get(struct inode *inode, int name_index, const char *name,
86 void *buffer, size_t size, int flags)
87{
88 return -EOPNOTSUPP;
89}
90
91static inline int
92ext4_xattr_list(struct inode *inode, void *buffer, size_t size)
93{
94 return -EOPNOTSUPP;
95}
96
97static inline int
98ext4_xattr_set(struct inode *inode, int name_index, const char *name,
99 const void *value, size_t size, int flags)
100{
101 return -EOPNOTSUPP;
102}
103
104static inline int
105ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
106 const char *name, const void *value, size_t size, int flags)
107{
108 return -EOPNOTSUPP;
109}
110
111static inline void
112ext4_xattr_delete_inode(handle_t *handle, struct inode *inode)
113{
114}
115
116static inline void
117ext4_xattr_put_super(struct super_block *sb)
118{
119}
120
121static inline int
122init_ext4_xattr(void)
123{
124 return 0;
125}
126
127static inline void
128exit_ext4_xattr(void)
129{
130}
131
132#define ext4_xattr_handlers NULL
133
134# endif /* CONFIG_EXT4DEV_FS_XATTR */
135
136#ifdef CONFIG_EXT4DEV_FS_SECURITY
137extern int ext4_init_security(handle_t *handle, struct inode *inode,
138 struct inode *dir);
139#else
140static inline int ext4_init_security(handle_t *handle, struct inode *inode,
141 struct inode *dir)
142{
143 return 0;
144}
145#endif
diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c
new file mode 100644
index 000000000000..b6a6861951f9
--- /dev/null
+++ b/fs/ext4/xattr_security.c
@@ -0,0 +1,77 @@
1/*
2 * linux/fs/ext4/xattr_security.c
3 * Handler for storing security labels as extended attributes.
4 */
5
6#include <linux/module.h>
7#include <linux/string.h>
8#include <linux/fs.h>
9#include <linux/smp_lock.h>
10#include <linux/ext4_jbd2.h>
11#include <linux/ext4_fs.h>
12#include <linux/security.h>
13#include "xattr.h"
14
15static size_t
16ext4_xattr_security_list(struct inode *inode, char *list, size_t list_size,
17 const char *name, size_t name_len)
18{
19 const size_t prefix_len = sizeof(XATTR_SECURITY_PREFIX)-1;
20 const size_t total_len = prefix_len + name_len + 1;
21
22
23 if (list && total_len <= list_size) {
24 memcpy(list, XATTR_SECURITY_PREFIX, prefix_len);
25 memcpy(list+prefix_len, name, name_len);
26 list[prefix_len + name_len] = '\0';
27 }
28 return total_len;
29}
30
31static int
32ext4_xattr_security_get(struct inode *inode, const char *name,
33 void *buffer, size_t size)
34{
35 if (strcmp(name, "") == 0)
36 return -EINVAL;
37 return ext4_xattr_get(inode, EXT4_XATTR_INDEX_SECURITY, name,
38 buffer, size);
39}
40
41static int
42ext4_xattr_security_set(struct inode *inode, const char *name,
43 const void *value, size_t size, int flags)
44{
45 if (strcmp(name, "") == 0)
46 return -EINVAL;
47 return ext4_xattr_set(inode, EXT4_XATTR_INDEX_SECURITY, name,
48 value, size, flags);
49}
50
51int
52ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir)
53{
54 int err;
55 size_t len;
56 void *value;
57 char *name;
58
59 err = security_inode_init_security(inode, dir, &name, &value, &len);
60 if (err) {
61 if (err == -EOPNOTSUPP)
62 return 0;
63 return err;
64 }
65 err = ext4_xattr_set_handle(handle, inode, EXT4_XATTR_INDEX_SECURITY,
66 name, value, len, 0);
67 kfree(name);
68 kfree(value);
69 return err;
70}
71
72struct xattr_handler ext4_xattr_security_handler = {
73 .prefix = XATTR_SECURITY_PREFIX,
74 .list = ext4_xattr_security_list,
75 .get = ext4_xattr_security_get,
76 .set = ext4_xattr_security_set,
77};
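/*
 * Sketch of the intended call site (an assumption modelled on the ext3
 * precedent; the real hookup lives in the inode-allocation path):
 *
 *	inode = ext4_new_inode(handle, dir, mode);
 *	if (!IS_ERR(inode)) {
 *		err = ext4_init_security(handle, inode, dir);
 *		...
 *	}
 *
 * Note that ext4_init_security() maps -EOPNOTSUPP from the LSM to
 * success, so kernels without a security module pay no penalty.
 */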
diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c
new file mode 100644
index 000000000000..b76f2dbc82da
--- /dev/null
+++ b/fs/ext4/xattr_trusted.c
@@ -0,0 +1,62 @@
1/*
2 * linux/fs/ext4/xattr_trusted.c
3 * Handler for trusted extended attributes.
4 *
5 * Copyright (C) 2003 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
6 */
7
8#include <linux/module.h>
9#include <linux/string.h>
10#include <linux/capability.h>
11#include <linux/fs.h>
12#include <linux/smp_lock.h>
13#include <linux/ext4_jbd2.h>
14#include <linux/ext4_fs.h>
15#include "xattr.h"
16
17#define XATTR_TRUSTED_PREFIX "trusted."
18
19static size_t
20ext4_xattr_trusted_list(struct inode *inode, char *list, size_t list_size,
21 const char *name, size_t name_len)
22{
23 const size_t prefix_len = sizeof(XATTR_TRUSTED_PREFIX)-1;
24 const size_t total_len = prefix_len + name_len + 1;
25
26 if (!capable(CAP_SYS_ADMIN))
27 return 0;
28
29 if (list && total_len <= list_size) {
30 memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len);
31 memcpy(list+prefix_len, name, name_len);
32 list[prefix_len + name_len] = '\0';
33 }
34 return total_len;
35}
36
37static int
38ext4_xattr_trusted_get(struct inode *inode, const char *name,
39 void *buffer, size_t size)
40{
41 if (strcmp(name, "") == 0)
42 return -EINVAL;
43 return ext4_xattr_get(inode, EXT4_XATTR_INDEX_TRUSTED, name,
44 buffer, size);
45}
46
47static int
48ext4_xattr_trusted_set(struct inode *inode, const char *name,
49 const void *value, size_t size, int flags)
50{
51 if (strcmp(name, "") == 0)
52 return -EINVAL;
53 return ext4_xattr_set(inode, EXT4_XATTR_INDEX_TRUSTED, name,
54 value, size, flags);
55}
56
57struct xattr_handler ext4_xattr_trusted_handler = {
58 .prefix = XATTR_TRUSTED_PREFIX,
59 .list = ext4_xattr_trusted_list,
60 .get = ext4_xattr_trusted_get,
61 .set = ext4_xattr_trusted_set,
62};
diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c
new file mode 100644
index 000000000000..c53cded0761a
--- /dev/null
+++ b/fs/ext4/xattr_user.c
@@ -0,0 +1,64 @@
1/*
2 * linux/fs/ext4/xattr_user.c
3 * Handler for extended user attributes.
4 *
5 * Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
6 */
7
8#include <linux/module.h>
9#include <linux/string.h>
10#include <linux/fs.h>
11#include <linux/smp_lock.h>
12#include <linux/ext4_jbd2.h>
13#include <linux/ext4_fs.h>
14#include "xattr.h"
15
16#define XATTR_USER_PREFIX "user."
17
18static size_t
19ext4_xattr_user_list(struct inode *inode, char *list, size_t list_size,
20 const char *name, size_t name_len)
21{
22 const size_t prefix_len = sizeof(XATTR_USER_PREFIX)-1;
23 const size_t total_len = prefix_len + name_len + 1;
24
25 if (!test_opt(inode->i_sb, XATTR_USER))
26 return 0;
27
28 if (list && total_len <= list_size) {
29 memcpy(list, XATTR_USER_PREFIX, prefix_len);
30 memcpy(list+prefix_len, name, name_len);
31 list[prefix_len + name_len] = '\0';
32 }
33 return total_len;
34}
35
36static int
37ext4_xattr_user_get(struct inode *inode, const char *name,
38 void *buffer, size_t size)
39{
40 if (strcmp(name, "") == 0)
41 return -EINVAL;
42 if (!test_opt(inode->i_sb, XATTR_USER))
43 return -EOPNOTSUPP;
44 return ext4_xattr_get(inode, EXT4_XATTR_INDEX_USER, name, buffer, size);
45}
46
47static int
48ext4_xattr_user_set(struct inode *inode, const char *name,
49 const void *value, size_t size, int flags)
50{
51 if (strcmp(name, "") == 0)
52 return -EINVAL;
53 if (!test_opt(inode->i_sb, XATTR_USER))
54 return -EOPNOTSUPP;
55 return ext4_xattr_set(inode, EXT4_XATTR_INDEX_USER, name,
56 value, size, flags);
57}
58
59struct xattr_handler ext4_xattr_user_handler = {
60 .prefix = XATTR_USER_PREFIX,
61 .list = ext4_xattr_user_list,
62 .get = ext4_xattr_user_get,
63 .set = ext4_xattr_user_set,
64};
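/*
 * Usage note (sketch): the user.* namespace is gated on the user_xattr
 * mount option tested above, so from userspace something like
 *
 *	mount -o remount,user_xattr /mnt
 *	setfattr -n user.comment -v hello /mnt/file
 *	getfattr -n user.comment /mnt/file
 *
 * reaches ext4_xattr_user_set()/ext4_xattr_user_get(); without the
 * option both return -EOPNOTSUPP.
 */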
diff --git a/fs/fat/file.c b/fs/fat/file.c
index f4b8f8b3fbdd..8337451e7897 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -13,6 +13,7 @@
13#include <linux/smp_lock.h> 13#include <linux/smp_lock.h>
14#include <linux/buffer_head.h> 14#include <linux/buffer_head.h>
15#include <linux/writeback.h> 15#include <linux/writeback.h>
16#include <linux/backing-dev.h>
16#include <linux/blkdev.h> 17#include <linux/blkdev.h>
17 18
18int fat_generic_ioctl(struct inode *inode, struct file *filp, 19int fat_generic_ioctl(struct inode *inode, struct file *filp,
@@ -118,7 +119,7 @@ static int fat_file_release(struct inode *inode, struct file *filp)
118 if ((filp->f_mode & FMODE_WRITE) && 119 if ((filp->f_mode & FMODE_WRITE) &&
119 MSDOS_SB(inode->i_sb)->options.flush) { 120 MSDOS_SB(inode->i_sb)->options.flush) {
120 fat_flush_inodes(inode->i_sb, inode, NULL); 121 fat_flush_inodes(inode->i_sb, inode, NULL);
121 blk_congestion_wait(WRITE, HZ/10); 122 congestion_wait(WRITE, HZ/10);
122 } 123 }
123 return 0; 124 return 0;
124} 125}
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 045738032a83..78945b53b0f8 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -384,7 +384,7 @@ static int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de)
384 le16_to_cpu(de->cdate)) + secs; 384 le16_to_cpu(de->cdate)) + secs;
385 inode->i_ctime.tv_nsec = csecs * 10000000; 385 inode->i_ctime.tv_nsec = csecs * 10000000;
386 inode->i_atime.tv_sec = 386 inode->i_atime.tv_sec =
387 date_dos2unix(le16_to_cpu(0), le16_to_cpu(de->adate)); 387 date_dos2unix(0, le16_to_cpu(de->adate));
388 inode->i_atime.tv_nsec = 0; 388 inode->i_atime.tv_nsec = 0;
389 } else 389 } else
390 inode->i_ctime = inode->i_atime = inode->i_mtime; 390 inode->i_ctime = inode->i_atime = inode->i_mtime;
@@ -1472,7 +1472,7 @@ int fat_flush_inodes(struct super_block *sb, struct inode *i1, struct inode *i2)
1472 ret = writeback_inode(i1); 1472 ret = writeback_inode(i1);
1473 if (!ret && i2) 1473 if (!ret && i2)
1474 ret = writeback_inode(i2); 1474 ret = writeback_inode(i2);
1475 if (!ret && sb) { 1475 if (!ret) {
1476 struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping; 1476 struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping;
1477 ret = filemap_flush(mapping); 1477 ret = filemap_flush(mapping);
1478 } 1478 }
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 8605155db171..cfc8f81e60d0 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -138,6 +138,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
138 struct fuse_entry_out outarg; 138 struct fuse_entry_out outarg;
139 struct fuse_conn *fc; 139 struct fuse_conn *fc;
140 struct fuse_req *req; 140 struct fuse_req *req;
141 struct dentry *parent;
141 142
142 /* Doesn't hurt to "reset" the validity timeout */ 143 /* Doesn't hurt to "reset" the validity timeout */
143 fuse_invalidate_entry_cache(entry); 144 fuse_invalidate_entry_cache(entry);
@@ -151,8 +152,10 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
151 if (IS_ERR(req)) 152 if (IS_ERR(req))
152 return 0; 153 return 0;
153 154
154 fuse_lookup_init(req, entry->d_parent->d_inode, entry, &outarg); 155 parent = dget_parent(entry);
156 fuse_lookup_init(req, parent->d_inode, entry, &outarg);
155 request_send(fc, req); 157 request_send(fc, req);
158 dput(parent);
156 err = req->out.h.error; 159 err = req->out.h.error;
157 /* Zero nodeid is same as -ENOENT */ 160 /* Zero nodeid is same as -ENOENT */
158 if (!err && !outarg.nodeid) 161 if (!err && !outarg.nodeid)
@@ -163,7 +166,9 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
163 fuse_send_forget(fc, req, outarg.nodeid, 1); 166 fuse_send_forget(fc, req, outarg.nodeid, 1);
164 return 0; 167 return 0;
165 } 168 }
169 spin_lock(&fc->lock);
166 fi->nlookup ++; 170 fi->nlookup ++;
171 spin_unlock(&fc->lock);
167 } 172 }
168 fuse_put_request(fc, req); 173 fuse_put_request(fc, req);
169 if (err || (outarg.attr.mode ^ inode->i_mode) & S_IFMT) 174 if (err || (outarg.attr.mode ^ inode->i_mode) & S_IFMT)
@@ -175,22 +180,6 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
175 return 1; 180 return 1;
176} 181}
177 182
178/*
179 * Check if there's already a hashed alias of this directory inode.
180 * If yes, then lookup and mkdir must not create a new alias.
181 */
182static int dir_alias(struct inode *inode)
183{
184 if (S_ISDIR(inode->i_mode)) {
185 struct dentry *alias = d_find_alias(inode);
186 if (alias) {
187 dput(alias);
188 return 1;
189 }
190 }
191 return 0;
192}
193
194static int invalid_nodeid(u64 nodeid) 183static int invalid_nodeid(u64 nodeid)
195{ 184{
196 return !nodeid || nodeid == FUSE_ROOT_ID; 185 return !nodeid || nodeid == FUSE_ROOT_ID;
@@ -206,6 +195,24 @@ static int valid_mode(int m)
206 S_ISBLK(m) || S_ISFIFO(m) || S_ISSOCK(m); 195 S_ISBLK(m) || S_ISFIFO(m) || S_ISSOCK(m);
207} 196}
208 197
198/*
199 * Add a directory inode to a dentry, ensuring that no other dentry
200 * refers to this inode. Called with fc->inst_mutex.
201 */
202static int fuse_d_add_directory(struct dentry *entry, struct inode *inode)
203{
204 struct dentry *alias = d_find_alias(inode);
205 if (alias) {
206 /* This tries to shrink the subtree below alias */
207 fuse_invalidate_entry(alias);
208 dput(alias);
209 if (!list_empty(&inode->i_dentry))
210 return -EBUSY;
211 }
212 d_add(entry, inode);
213 return 0;
214}
215
209static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry, 216static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
210 struct nameidata *nd) 217 struct nameidata *nd)
211{ 218{
@@ -241,11 +248,17 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
241 if (err && err != -ENOENT) 248 if (err && err != -ENOENT)
242 return ERR_PTR(err); 249 return ERR_PTR(err);
243 250
244 if (inode && dir_alias(inode)) { 251 if (inode && S_ISDIR(inode->i_mode)) {
245 iput(inode); 252 mutex_lock(&fc->inst_mutex);
246 return ERR_PTR(-EIO); 253 err = fuse_d_add_directory(entry, inode);
247 } 254 mutex_unlock(&fc->inst_mutex);
248 d_add(entry, inode); 255 if (err) {
256 iput(inode);
257 return ERR_PTR(err);
258 }
259 } else
260 d_add(entry, inode);
261
249 entry->d_op = &fuse_dentry_operations; 262 entry->d_op = &fuse_dentry_operations;
250 if (!err) 263 if (!err)
251 fuse_change_timeout(entry, &outarg); 264 fuse_change_timeout(entry, &outarg);
@@ -401,12 +414,22 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req,
401 } 414 }
402 fuse_put_request(fc, req); 415 fuse_put_request(fc, req);
403 416
404 if (dir_alias(inode)) { 417 if (S_ISDIR(inode->i_mode)) {
405 iput(inode); 418 struct dentry *alias;
406 return -EIO; 419 mutex_lock(&fc->inst_mutex);
407 } 420 alias = d_find_alias(inode);
421 if (alias) {
422 /* New directory must have moved since mkdir */
423 mutex_unlock(&fc->inst_mutex);
424 dput(alias);
425 iput(inode);
426 return -EBUSY;
427 }
428 d_instantiate(entry, inode);
429 mutex_unlock(&fc->inst_mutex);
430 } else
431 d_instantiate(entry, inode);
408 432
409 d_instantiate(entry, inode);
410 fuse_change_timeout(entry, &outarg); 433 fuse_change_timeout(entry, &outarg);
411 fuse_invalidate_attr(dir); 434 fuse_invalidate_attr(dir);
412 return 0; 435 return 0;
@@ -935,14 +958,30 @@ static void iattr_to_fattr(struct iattr *iattr, struct fuse_setattr_in *arg)
935 } 958 }
936} 959}
937 960
961static void fuse_vmtruncate(struct inode *inode, loff_t offset)
962{
963 struct fuse_conn *fc = get_fuse_conn(inode);
964 int need_trunc;
965
966 spin_lock(&fc->lock);
967 need_trunc = inode->i_size > offset;
968 i_size_write(inode, offset);
969 spin_unlock(&fc->lock);
970
971 if (need_trunc) {
972 struct address_space *mapping = inode->i_mapping;
973 unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
974 truncate_inode_pages(mapping, offset);
975 }
976}
977
938/* 978/*
939 * Set attributes, and at the same time refresh them. 979 * Set attributes, and at the same time refresh them.
940 * 980 *
941 * Truncation is slightly complicated, because the 'truncate' request 981 * Truncation is slightly complicated, because the 'truncate' request
942 * may fail, in which case we don't want to touch the mapping. 982 * may fail, in which case we don't want to touch the mapping.
943 * vmtruncate() doesn't allow for this case. So do the rlimit 983 * vmtruncate() doesn't allow for this case, so do the rlimit checking
944 * checking by hand and call vmtruncate() only after the file has 984 * and the actual truncation by hand.
945 * actually been truncated.
946 */ 985 */
947static int fuse_setattr(struct dentry *entry, struct iattr *attr) 986static int fuse_setattr(struct dentry *entry, struct iattr *attr)
948{ 987{
@@ -993,12 +1032,8 @@ static int fuse_setattr(struct dentry *entry, struct iattr *attr)
993 make_bad_inode(inode); 1032 make_bad_inode(inode);
994 err = -EIO; 1033 err = -EIO;
995 } else { 1034 } else {
996 if (is_truncate) { 1035 if (is_truncate)
997 loff_t origsize = i_size_read(inode); 1036 fuse_vmtruncate(inode, outarg.attr.size);
998 i_size_write(inode, outarg.attr.size);
999 if (origsize > outarg.attr.size)
1000 vmtruncate(inode, outarg.attr.size);
1001 }
1002 fuse_change_attributes(inode, &outarg.attr); 1037 fuse_change_attributes(inode, &outarg.attr);
1003 fi->i_time = time_to_jiffies(outarg.attr_valid, 1038 fi->i_time = time_to_jiffies(outarg.attr_valid,
1004 outarg.attr_valid_nsec); 1039 outarg.attr_valid_nsec);
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 183626868eea..2bb5ace3882d 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -481,8 +481,10 @@ static int fuse_commit_write(struct file *file, struct page *page,
481 err = -EIO; 481 err = -EIO;
482 if (!err) { 482 if (!err) {
483 pos += count; 483 pos += count;
484 if (pos > i_size_read(inode)) 484 spin_lock(&fc->lock);
485 if (pos > inode->i_size)
485 i_size_write(inode, pos); 486 i_size_write(inode, pos);
487 spin_unlock(&fc->lock);
486 488
487 if (offset == 0 && to == PAGE_CACHE_SIZE) { 489 if (offset == 0 && to == PAGE_CACHE_SIZE) {
488 clear_page_dirty(page); 490 clear_page_dirty(page);
@@ -586,8 +588,12 @@ static ssize_t fuse_direct_io(struct file *file, const char __user *buf,
586 } 588 }
587 fuse_put_request(fc, req); 589 fuse_put_request(fc, req);
588 if (res > 0) { 590 if (res > 0) {
589 if (write && pos > i_size_read(inode)) 591 if (write) {
590 i_size_write(inode, pos); 592 spin_lock(&fc->lock);
593 if (pos > inode->i_size)
594 i_size_write(inode, pos);
595 spin_unlock(&fc->lock);
596 }
591 *ppos = pos; 597 *ppos = pos;
592 } 598 }
593 fuse_invalidate_attr(inode); 599 fuse_invalidate_attr(inode);
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 69c7750d55b8..91edb8932d90 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -239,6 +239,9 @@ struct fuse_conn {
239 /** Lock protecting accesses to members of this structure */ 239 /** Lock protecting accesses to members of this structure */
240 spinlock_t lock; 240 spinlock_t lock;
241 241
242 /** Mutex protecting against directory alias creation */
243 struct mutex inst_mutex;
244
242 /** Refcount */ 245 /** Refcount */
243 atomic_t count; 246 atomic_t count;
244 247
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 7d0a9aee01f2..fc4203570370 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -109,6 +109,7 @@ static int fuse_remount_fs(struct super_block *sb, int *flags, char *data)
109 109
110void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr) 110void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr)
111{ 111{
112 struct fuse_conn *fc = get_fuse_conn(inode);
112 if (S_ISREG(inode->i_mode) && i_size_read(inode) != attr->size) 113 if (S_ISREG(inode->i_mode) && i_size_read(inode) != attr->size)
113 invalidate_inode_pages(inode->i_mapping); 114 invalidate_inode_pages(inode->i_mapping);
114 115
@@ -117,7 +118,9 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr)
117 inode->i_nlink = attr->nlink; 118 inode->i_nlink = attr->nlink;
118 inode->i_uid = attr->uid; 119 inode->i_uid = attr->uid;
119 inode->i_gid = attr->gid; 120 inode->i_gid = attr->gid;
121 spin_lock(&fc->lock);
120 i_size_write(inode, attr->size); 122 i_size_write(inode, attr->size);
123 spin_unlock(&fc->lock);
121 inode->i_blocks = attr->blocks; 124 inode->i_blocks = attr->blocks;
122 inode->i_atime.tv_sec = attr->atime; 125 inode->i_atime.tv_sec = attr->atime;
123 inode->i_atime.tv_nsec = attr->atimensec; 126 inode->i_atime.tv_nsec = attr->atimensec;
@@ -130,7 +133,7 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr)
130static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr) 133static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr)
131{ 134{
132 inode->i_mode = attr->mode & S_IFMT; 135 inode->i_mode = attr->mode & S_IFMT;
133 i_size_write(inode, attr->size); 136 inode->i_size = attr->size;
134 if (S_ISREG(inode->i_mode)) { 137 if (S_ISREG(inode->i_mode)) {
135 fuse_init_common(inode); 138 fuse_init_common(inode);
136 fuse_init_file_inode(inode); 139 fuse_init_file_inode(inode);
@@ -169,7 +172,6 @@ struct inode *fuse_iget(struct super_block *sb, unsigned long nodeid,
169 struct inode *inode; 172 struct inode *inode;
170 struct fuse_inode *fi; 173 struct fuse_inode *fi;
171 struct fuse_conn *fc = get_fuse_conn_super(sb); 174 struct fuse_conn *fc = get_fuse_conn_super(sb);
172 int retried = 0;
173 175
174 retry: 176 retry:
175 inode = iget5_locked(sb, nodeid, fuse_inode_eq, fuse_inode_set, &nodeid); 177 inode = iget5_locked(sb, nodeid, fuse_inode_eq, fuse_inode_set, &nodeid);
@@ -183,16 +185,16 @@ struct inode *fuse_iget(struct super_block *sb, unsigned long nodeid,
183 fuse_init_inode(inode, attr); 185 fuse_init_inode(inode, attr);
184 unlock_new_inode(inode); 186 unlock_new_inode(inode);
185 } else if ((inode->i_mode ^ attr->mode) & S_IFMT) { 187 } else if ((inode->i_mode ^ attr->mode) & S_IFMT) {
186 BUG_ON(retried);
187 /* Inode has changed type, any I/O on the old should fail */ 188 /* Inode has changed type, any I/O on the old should fail */
188 make_bad_inode(inode); 189 make_bad_inode(inode);
189 iput(inode); 190 iput(inode);
190 retried = 1;
191 goto retry; 191 goto retry;
192 } 192 }
193 193
194 fi = get_fuse_inode(inode); 194 fi = get_fuse_inode(inode);
195 spin_lock(&fc->lock);
195 fi->nlookup ++; 196 fi->nlookup ++;
197 spin_unlock(&fc->lock);
196 fuse_change_attributes(inode, attr); 198 fuse_change_attributes(inode, attr);
197 return inode; 199 return inode;
198} 200}
@@ -377,6 +379,7 @@ static struct fuse_conn *new_conn(void)
377 fc = kzalloc(sizeof(*fc), GFP_KERNEL); 379 fc = kzalloc(sizeof(*fc), GFP_KERNEL);
378 if (fc) { 380 if (fc) {
379 spin_lock_init(&fc->lock); 381 spin_lock_init(&fc->lock);
382 mutex_init(&fc->inst_mutex);
380 atomic_set(&fc->count, 1); 383 atomic_set(&fc->count, 1);
381 init_waitqueue_head(&fc->waitq); 384 init_waitqueue_head(&fc->waitq);
382 init_waitqueue_head(&fc->blocked_waitq); 385 init_waitqueue_head(&fc->blocked_waitq);
@@ -396,8 +399,10 @@ static struct fuse_conn *new_conn(void)
396 399
397void fuse_conn_put(struct fuse_conn *fc) 400void fuse_conn_put(struct fuse_conn *fc)
398{ 401{
399 if (atomic_dec_and_test(&fc->count)) 402 if (atomic_dec_and_test(&fc->count)) {
403 mutex_destroy(&fc->inst_mutex);
400 kfree(fc); 404 kfree(fc);
405 }
401} 406}
402 407
403struct fuse_conn *fuse_conn_get(struct fuse_conn *fc) 408struct fuse_conn *fuse_conn_get(struct fuse_conn *fc)
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
new file mode 100644
index 000000000000..8c27de8b9568
--- /dev/null
+++ b/fs/gfs2/Kconfig
@@ -0,0 +1,44 @@
1config GFS2_FS
2 tristate "GFS2 file system support"
3 depends on EXPERIMENTAL
4 select FS_POSIX_ACL
5 help
6 A cluster filesystem.
7
8 Allows a cluster of computers to simultaneously use a block device
9 that is shared between them (with FC, iSCSI, NBD, etc.). GFS reads
10 and writes to the block device like a local filesystem, but also uses
11 a lock module to allow the computers to coordinate their I/O so
12 filesystem consistency is maintained. One of the nifty features of
13 GFS is perfect consistency -- changes made to the filesystem on one
14 machine show up immediately on all other machines in the cluster.
15
16 To use the GFS2 filesystem, you will need to enable one or more of
17 the below locking modules. Documentation and utilities for GFS2 can
18 be found here: http://sources.redhat.com/cluster
19
20config GFS2_FS_LOCKING_NOLOCK
21 tristate "GFS2 \"nolock\" locking module"
22 depends on GFS2_FS
23 help
24 Single node locking module for GFS2.
25
26 Use this module if you want to use GFS2 on a single node without
27 its clustering features. You can still take advantage of the
28 large file support, and upgrade to running a full cluster later on
29 if required.
30
31 If you will only be using GFS2 in cluster mode, you do not need this
32 module.
33
34config GFS2_FS_LOCKING_DLM
35 tristate "GFS2 DLM locking module"
36 depends on GFS2_FS
37 select DLM
38 help
39 Multiple node locking module for GFS2
40
41 Most users of GFS2 will require this module. It provides the locking
42 interface between GFS2 and the DLM, which is required to use GFS2
43 in a cluster environment.
44
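# Example fragment (sketch): a single-node test configuration would
# typically set
#	CONFIG_GFS2_FS=m
#	CONFIG_GFS2_FS_LOCKING_NOLOCK=m
# while a cluster build enables CONFIG_GFS2_FS_LOCKING_DLM=m instead
# (which pulls in the DLM via "select DLM").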
diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile
new file mode 100644
index 000000000000..e3f1ada643ac
--- /dev/null
+++ b/fs/gfs2/Makefile
@@ -0,0 +1,10 @@
1obj-$(CONFIG_GFS2_FS) += gfs2.o
2gfs2-y := acl.o bmap.o daemon.o dir.o eaops.o eattr.o glock.o \
3 glops.o inode.o lm.o log.o lops.o locking.o main.o meta_io.o \
4 mount.o ondisk.o ops_address.o ops_dentry.o ops_export.o ops_file.o \
5 ops_fstype.o ops_inode.o ops_super.o ops_vm.o quota.o \
6 recovery.o rgrp.o super.o sys.o trans.o util.o
7
8obj-$(CONFIG_GFS2_FS_LOCKING_NOLOCK) += locking/nolock/
9obj-$(CONFIG_GFS2_FS_LOCKING_DLM) += locking/dlm/
10
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
new file mode 100644
index 000000000000..5f959b8ce406
--- /dev/null
+++ b/fs/gfs2/acl.c
@@ -0,0 +1,309 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/posix_acl.h>
16#include <linux/posix_acl_xattr.h>
17#include <linux/gfs2_ondisk.h>
18#include <linux/lm_interface.h>
19
20#include "gfs2.h"
21#include "incore.h"
22#include "acl.h"
23#include "eaops.h"
24#include "eattr.h"
25#include "glock.h"
26#include "inode.h"
27#include "meta_io.h"
28#include "trans.h"
29#include "util.h"
30
31#define ACL_ACCESS 1
32#define ACL_DEFAULT 0
33
34int gfs2_acl_validate_set(struct gfs2_inode *ip, int access,
35 struct gfs2_ea_request *er,
36 int *remove, mode_t *mode)
37{
38 struct posix_acl *acl;
39 int error;
40
41 error = gfs2_acl_validate_remove(ip, access);
42 if (error)
43 return error;
44
45 if (!er->er_data)
46 return -EINVAL;
47
48 acl = posix_acl_from_xattr(er->er_data, er->er_data_len);
49 if (IS_ERR(acl))
50 return PTR_ERR(acl);
51 if (!acl) {
52 *remove = 1;
53 return 0;
54 }
55
56 error = posix_acl_valid(acl);
57 if (error)
58 goto out;
59
60 if (access) {
61 error = posix_acl_equiv_mode(acl, mode);
62 if (!error)
63 *remove = 1;
64 else if (error > 0)
65 error = 0;
66 }
67
68out:
69 posix_acl_release(acl);
70 return error;
71}
72
73int gfs2_acl_validate_remove(struct gfs2_inode *ip, int access)
74{
75 if (!GFS2_SB(&ip->i_inode)->sd_args.ar_posix_acl)
76 return -EOPNOTSUPP;
77 if (current->fsuid != ip->i_di.di_uid && !capable(CAP_FOWNER))
78 return -EPERM;
79 if (S_ISLNK(ip->i_di.di_mode))
80 return -EOPNOTSUPP;
81 if (!access && !S_ISDIR(ip->i_di.di_mode))
82 return -EACCES;
83
84 return 0;
85}
86
87static int acl_get(struct gfs2_inode *ip, int access, struct posix_acl **acl,
88 struct gfs2_ea_location *el, char **data, unsigned int *len)
89{
90 struct gfs2_ea_request er;
91 struct gfs2_ea_location el_this;
92 int error;
93
94 if (!ip->i_di.di_eattr)
95 return 0;
96
97 memset(&er, 0, sizeof(struct gfs2_ea_request));
98 if (access) {
99 er.er_name = GFS2_POSIX_ACL_ACCESS;
100 er.er_name_len = GFS2_POSIX_ACL_ACCESS_LEN;
101 } else {
102 er.er_name = GFS2_POSIX_ACL_DEFAULT;
103 er.er_name_len = GFS2_POSIX_ACL_DEFAULT_LEN;
104 }
105 er.er_type = GFS2_EATYPE_SYS;
106
107 if (!el)
108 el = &el_this;
109
110 error = gfs2_ea_find(ip, &er, el);
111 if (error)
112 return error;
113 if (!el->el_ea)
114 return 0;
115 if (!GFS2_EA_DATA_LEN(el->el_ea))
116 goto out;
117
118 er.er_data_len = GFS2_EA_DATA_LEN(el->el_ea);
119 er.er_data = kmalloc(er.er_data_len, GFP_KERNEL);
120 error = -ENOMEM;
121 if (!er.er_data)
122 goto out;
123
124 error = gfs2_ea_get_copy(ip, el, er.er_data);
125 if (error)
126 goto out_kfree;
127
128 if (acl) {
129 *acl = posix_acl_from_xattr(er.er_data, er.er_data_len);
130 if (IS_ERR(*acl))
131 error = PTR_ERR(*acl);
132 }
133
134out_kfree:
135 if (error || !data)
136 kfree(er.er_data);
137 else {
138 *data = er.er_data;
139 *len = er.er_data_len;
140 }
141out:
142 if (error || el == &el_this)
143 brelse(el->el_bh);
144 return error;
145}
146
147/**
148 * gfs2_check_acl_locked - Check an ACL to see if we're allowed to do something
149 * @inode: the file we want to do something to
150 * @mask: what we want to do
151 *
152 * Returns: errno
153 */
154
155int gfs2_check_acl_locked(struct inode *inode, int mask)
156{
157 struct posix_acl *acl = NULL;
158 int error;
159
160 error = acl_get(GFS2_I(inode), ACL_ACCESS, &acl, NULL, NULL, NULL);
161 if (error)
162 return error;
163
164 if (acl) {
165 error = posix_acl_permission(inode, acl, mask);
166 posix_acl_release(acl);
167 return error;
168 }
169
170 return -EAGAIN;
171}
172
173int gfs2_check_acl(struct inode *inode, int mask)
174{
175 struct gfs2_inode *ip = GFS2_I(inode);
176 struct gfs2_holder i_gh;
177 int error;
178
179 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
180 if (!error) {
181 error = gfs2_check_acl_locked(inode, mask);
182 gfs2_glock_dq_uninit(&i_gh);
183 }
184
185 return error;
186}
187
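/*
 * A minimal sketch of how a ->permission handler can consume the
 * -EAGAIN convention above (-EAGAIN means "no access ACL present", so
 * the caller falls back to the classic mode bits). The function name is
 * illustrative only, and the three-argument generic_permission() of
 * this kernel era is assumed:
 */
static inline int example_permission(struct inode *inode, int mask)
{
	int error = gfs2_check_acl(inode, mask);

	if (error != -EAGAIN)
		return error;
	return generic_permission(inode, mask, NULL);
}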
188static int munge_mode(struct gfs2_inode *ip, mode_t mode)
189{
190 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
191 struct buffer_head *dibh;
192 int error;
193
194 error = gfs2_trans_begin(sdp, RES_DINODE, 0);
195 if (error)
196 return error;
197
198 error = gfs2_meta_inode_buffer(ip, &dibh);
199 if (!error) {
200 gfs2_assert_withdraw(sdp,
201 (ip->i_di.di_mode & S_IFMT) == (mode & S_IFMT));
202 ip->i_di.di_mode = mode;
203 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
204 gfs2_dinode_out(&ip->i_di, dibh->b_data);
205 brelse(dibh);
206 }
207
208 gfs2_trans_end(sdp);
209
210 return 0;
211}
212
213int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip)
214{
215 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
216 struct posix_acl *acl = NULL, *clone;
217 struct gfs2_ea_request er;
218 mode_t mode = ip->i_di.di_mode;
219 int error;
220
221 if (!sdp->sd_args.ar_posix_acl)
222 return 0;
223 if (S_ISLNK(ip->i_di.di_mode))
224 return 0;
225
226 memset(&er, 0, sizeof(struct gfs2_ea_request));
227 er.er_type = GFS2_EATYPE_SYS;
228
229 error = acl_get(dip, ACL_DEFAULT, &acl, NULL,
230 &er.er_data, &er.er_data_len);
231 if (error)
232 return error;
233 if (!acl) {
234 mode &= ~current->fs->umask;
235 if (mode != ip->i_di.di_mode)
236 error = munge_mode(ip, mode);
237 return error;
238 }
239
240 clone = posix_acl_clone(acl, GFP_KERNEL);
241 error = -ENOMEM;
242 if (!clone)
243 goto out;
244 posix_acl_release(acl);
245 acl = clone;
246
247 if (S_ISDIR(ip->i_di.di_mode)) {
248 er.er_name = GFS2_POSIX_ACL_DEFAULT;
249 er.er_name_len = GFS2_POSIX_ACL_DEFAULT_LEN;
250 error = gfs2_system_eaops.eo_set(ip, &er);
251 if (error)
252 goto out;
253 }
254
255 error = posix_acl_create_masq(acl, &mode);
256 if (error < 0)
257 goto out;
258 if (error > 0) {
259 er.er_name = GFS2_POSIX_ACL_ACCESS;
260 er.er_name_len = GFS2_POSIX_ACL_ACCESS_LEN;
261 posix_acl_to_xattr(acl, er.er_data, er.er_data_len);
262 er.er_mode = mode;
263 er.er_flags = GFS2_ERF_MODE;
264 error = gfs2_system_eaops.eo_set(ip, &er);
265 if (error)
266 goto out;
267 } else
268 munge_mode(ip, mode);
269
270out:
271 posix_acl_release(acl);
272 kfree(er.er_data);
273 return error;
274}
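/*
 * Worked illustration of the posix_acl_create_masq() branch above (the
 * ACL text and numbers are assumed, not taken from this filesystem): a
 * parent default ACL of "u::rwx,g::rwx,o::r-x,u:1000:rw-,m::rwx" masked
 * against a requested mode of 0644 returns > 0, because the named entry
 * "u:1000:rw-" cannot be expressed in mode bits, so the masked ACL is
 * written out as the new file's access ACL. A default ACL holding only
 * owner/group/other entries would return 0, and only the mode bits are
 * then updated via munge_mode().
 */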
275
276int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr)
277{
278 struct posix_acl *acl = NULL, *clone;
279 struct gfs2_ea_location el;
280 char *data;
281 unsigned int len;
282 int error;
283
284 error = acl_get(ip, ACL_ACCESS, &acl, &el, &data, &len);
285 if (error)
286 return error;
287 if (!acl)
288 return gfs2_setattr_simple(ip, attr);
289
290 clone = posix_acl_clone(acl, GFP_KERNEL);
291 error = -ENOMEM;
292 if (!clone)
293 goto out;
294 posix_acl_release(acl);
295 acl = clone;
296
297 error = posix_acl_chmod_masq(acl, attr->ia_mode);
298 if (!error) {
299 posix_acl_to_xattr(acl, data, len);
300 error = gfs2_ea_acl_chmod(ip, &el, attr, data);
301 }
302
303out:
304 posix_acl_release(acl);
305 brelse(el.el_bh);
306 kfree(data);
307 return error;
308}
309
diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h
new file mode 100644
index 000000000000..05c294fe0d78
--- /dev/null
+++ b/fs/gfs2/acl.h
@@ -0,0 +1,39 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __ACL_DOT_H__
11#define __ACL_DOT_H__
12
13#include "incore.h"
14
15#define GFS2_POSIX_ACL_ACCESS "posix_acl_access"
16#define GFS2_POSIX_ACL_ACCESS_LEN 16
17#define GFS2_POSIX_ACL_DEFAULT "posix_acl_default"
18#define GFS2_POSIX_ACL_DEFAULT_LEN 17
19
20#define GFS2_ACL_IS_ACCESS(name, len) \
21 ((len) == GFS2_POSIX_ACL_ACCESS_LEN && \
22 !memcmp(GFS2_POSIX_ACL_ACCESS, (name), (len)))
23
24#define GFS2_ACL_IS_DEFAULT(name, len) \
25 ((len) == GFS2_POSIX_ACL_DEFAULT_LEN && \
26 !memcmp(GFS2_POSIX_ACL_DEFAULT, (name), (len)))
27
28struct gfs2_ea_request;
29
30int gfs2_acl_validate_set(struct gfs2_inode *ip, int access,
31 struct gfs2_ea_request *er,
32 int *remove, mode_t *mode);
33int gfs2_acl_validate_remove(struct gfs2_inode *ip, int access);
34int gfs2_check_acl_locked(struct inode *inode, int mask);
35int gfs2_check_acl(struct inode *inode, int mask);
36int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip);
37int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr);
38
39#endif /* __ACL_DOT_H__ */
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
new file mode 100644
index 000000000000..06e9a8cb45e9
--- /dev/null
+++ b/fs/gfs2/bmap.c
@@ -0,0 +1,1222 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/gfs2_ondisk.h>
16#include <linux/crc32.h>
17#include <linux/lm_interface.h>
18
19#include "gfs2.h"
20#include "incore.h"
21#include "bmap.h"
22#include "glock.h"
23#include "inode.h"
24#include "meta_io.h"
25#include "quota.h"
26#include "rgrp.h"
27#include "trans.h"
28#include "dir.h"
29#include "util.h"
30#include "ops_address.h"
31
32/* This doesn't need to be that large, as the maximum number of 64-bit
33 * pointers in a 4k block is 512, so a __u16 index is fine. Keeping it
34 * small saves stack space.
35 */
36struct metapath {
37 __u16 mp_list[GFS2_MAX_META_HEIGHT];
38};
39
40typedef int (*block_call_t) (struct gfs2_inode *ip, struct buffer_head *dibh,
41 struct buffer_head *bh, u64 *top,
42 u64 *bottom, unsigned int height,
43 void *data);
44
45struct strip_mine {
46 int sm_first;
47 unsigned int sm_height;
48};
49
50/**
51 * gfs2_unstuffer_page - unstuff a stuffed inode into a block cached by a page
52 * @ip: the inode
53 * @dibh: the dinode buffer
54 * @block: the block number that was allocated
55 * @page: any locked page held by the caller process
56 *
57 * Returns: errno
58 */
59
60static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
61 u64 block, struct page *page)
62{
63 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
64 struct inode *inode = &ip->i_inode;
65 struct buffer_head *bh;
66 int release = 0;
67
68 if (!page || page->index) {
69 page = grab_cache_page(inode->i_mapping, 0);
70 if (!page)
71 return -ENOMEM;
72 release = 1;
73 }
74
75 if (!PageUptodate(page)) {
76 void *kaddr = kmap(page);
77
78 memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode),
79 ip->i_di.di_size);
80 memset(kaddr + ip->i_di.di_size, 0,
81 PAGE_CACHE_SIZE - ip->i_di.di_size);
82 kunmap(page);
83
84 SetPageUptodate(page);
85 }
86
87 if (!page_has_buffers(page))
88 create_empty_buffers(page, 1 << inode->i_blkbits,
89 (1 << BH_Uptodate));
90
91 bh = page_buffers(page);
92
93 if (!buffer_mapped(bh))
94 map_bh(bh, inode->i_sb, block);
95
96 set_buffer_uptodate(bh);
97 if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip))
98 gfs2_trans_add_bh(ip->i_gl, bh, 0);
99 mark_buffer_dirty(bh);
100
101 if (release) {
102 unlock_page(page);
103 page_cache_release(page);
104 }
105
106 return 0;
107}
108
109/**
110 * gfs2_unstuff_dinode - Unstuff a dinode when the data has grown too big
111 * @ip: The GFS2 inode to unstuff
112 * @page: a locked page held by the caller, or NULL (in which case one
113 * is grabbed from the page cache as needed)
114 *
115 * This routine unstuffs a dinode and returns it to a "normal" state such
116 * that the height can be grown in the traditional way.
117 *
118 * Returns: errno
119 */
120
121int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
122{
123 struct buffer_head *bh, *dibh;
124 struct gfs2_dinode *di;
125 u64 block = 0;
126 int isdir = gfs2_is_dir(ip);
127 int error;
128
129 down_write(&ip->i_rw_mutex);
130
131 error = gfs2_meta_inode_buffer(ip, &dibh);
132 if (error)
133 goto out;
134
135 if (ip->i_di.di_size) {
136 /* Get a free block, fill it with the stuffed data,
137 and write it out to disk */
138
139 if (isdir) {
140 block = gfs2_alloc_meta(ip);
141
142 error = gfs2_dir_get_new_buffer(ip, block, &bh);
143 if (error)
144 goto out_brelse;
145 gfs2_buffer_copy_tail(bh, sizeof(struct gfs2_meta_header),
146 dibh, sizeof(struct gfs2_dinode));
147 brelse(bh);
148 } else {
149 block = gfs2_alloc_data(ip);
150
151 error = gfs2_unstuffer_page(ip, dibh, block, page);
152 if (error)
153 goto out_brelse;
154 }
155 }
156
157 /* Set up the pointer to the new block */
158
159 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
160 di = (struct gfs2_dinode *)dibh->b_data;
161 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
162
163 if (ip->i_di.di_size) {
164 *(__be64 *)(di + 1) = cpu_to_be64(block);
165 ip->i_di.di_blocks++;
166 di->di_blocks = cpu_to_be64(ip->i_di.di_blocks);
167 }
168
169 ip->i_di.di_height = 1;
170 di->di_height = cpu_to_be16(1);
171
172out_brelse:
173 brelse(dibh);
174out:
175 up_write(&ip->i_rw_mutex);
176 return error;
177}
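/*
 * Sizing note for the unstuff path above (a sketch; 232 bytes is the
 * assumed sizeof(struct gfs2_dinode) for a 4096-byte block): a stuffed
 * inode holds at most 4096 - 232 = 3864 bytes of data directly in the
 * dinode block, so any file growing past that is unstuffed into its own
 * data (or, for directories, metadata) block, and di_height goes from
 * 0 to 1.
 */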
178
179/**
180 * calc_tree_height - Calculate the height of a metadata tree
181 * @ip: The GFS2 inode
182 * @size: The proposed size of the file
183 *
184 * Work out how tall a metadata tree needs to be in order to accommodate a
185 * file of a particular size. If size is less than the current size of
186 * the inode, then the current size of the inode is used instead of the
187 * supplied one.
188 *
189 * Returns: the height the tree should be
190 */
191
192static unsigned int calc_tree_height(struct gfs2_inode *ip, u64 size)
193{
194 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
195 u64 *arr;
196 unsigned int max, height;
197
198 if (ip->i_di.di_size > size)
199 size = ip->i_di.di_size;
200
201 if (gfs2_is_dir(ip)) {
202 arr = sdp->sd_jheightsize;
203 max = sdp->sd_max_jheight;
204 } else {
205 arr = sdp->sd_heightsize;
206 max = sdp->sd_max_height;
207 }
208
209 for (height = 0; height < max; height++)
210 if (arr[height] >= size)
211 break;
212
213 return height;
214}
215
216/**
217 * build_height - Build a metadata tree of the requested height
218 * @inode: The inode to build the tree for
219 * @height: The height to build to
220 *
221 *
222 * Returns: errno
223 */
224
225static int build_height(struct inode *inode, unsigned height)
226{
227 struct gfs2_inode *ip = GFS2_I(inode);
228 unsigned new_height = height - ip->i_di.di_height;
229 struct buffer_head *dibh;
230 struct buffer_head *blocks[GFS2_MAX_META_HEIGHT];
231 struct gfs2_dinode *di;
232 int error;
233 u64 *bp;
234 u64 bn;
235 unsigned n;
236
237 if (height <= ip->i_di.di_height)
238 return 0;
239
240 error = gfs2_meta_inode_buffer(ip, &dibh);
241 if (error)
242 return error;
243
244 for(n = 0; n < new_height; n++) {
245 bn = gfs2_alloc_meta(ip);
246 blocks[n] = gfs2_meta_new(ip->i_gl, bn);
247 gfs2_trans_add_bh(ip->i_gl, blocks[n], 1);
248 }
249
250 n = 0;
251 bn = blocks[0]->b_blocknr;
252 if (new_height > 1) {
253 for(; n < new_height-1; n++) {
254 gfs2_metatype_set(blocks[n], GFS2_METATYPE_IN,
255 GFS2_FORMAT_IN);
256 gfs2_buffer_clear_tail(blocks[n],
257 sizeof(struct gfs2_meta_header));
258 bp = (u64 *)(blocks[n]->b_data +
259 sizeof(struct gfs2_meta_header));
260 *bp = cpu_to_be64(blocks[n+1]->b_blocknr);
261 brelse(blocks[n]);
262 blocks[n] = NULL;
263 }
264 }
265 gfs2_metatype_set(blocks[n], GFS2_METATYPE_IN, GFS2_FORMAT_IN);
266 gfs2_buffer_copy_tail(blocks[n], sizeof(struct gfs2_meta_header),
267 dibh, sizeof(struct gfs2_dinode));
268 brelse(blocks[n]);
269 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
270 di = (struct gfs2_dinode *)dibh->b_data;
271 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
272 *(__be64 *)(di + 1) = cpu_to_be64(bn);
273 ip->i_di.di_height += new_height;
274 ip->i_di.di_blocks += new_height;
275 di->di_height = cpu_to_be16(ip->i_di.di_height);
276 di->di_blocks = cpu_to_be64(ip->i_di.di_blocks);
277 brelse(dibh);
278 return error;
279}
280
281/**
282 * find_metapath - Find path through the metadata tree
283 * @ip: The inode pointer
284 * @mp: The metapath to return the result in
285 * @block: The disk block to look up
286 *
287 * This routine returns a struct metapath structure that defines a path
288 * through the metadata of inode "ip" to get to block "block".
289 *
290 * Example:
291 * Given: "ip" is a height 3 file, "block" is the logical block that
292 * holds byte offset 101342453, and this is a filesystem with a
293 * blocksize of 4096,
294 *
295 * find_metapath() would return a struct metapath structure with
296 * mp_list[0] = 0, mp_list[1] = 48, and mp_list[2] = 165.
297 *
298 * That means that in order to get to the block containing the byte at
299 * offset 101342453, we would load the indirect block pointed to by pointer
300 * 0 in the dinode. We would then load the indirect block pointed to by
301 * pointer 48 in that indirect block. We would then load the data block
302 * pointed to by pointer 165 in that indirect block.
303 *
304 * ----------------------------------------
305 * | Dinode | |
306 * | | 4|
307 * | |0 1 2 3 4 5 9|
308 * | | 6|
309 * ----------------------------------------
310 * |
311 * |
312 * V
313 * ----------------------------------------
314 * | Indirect Block |
315 * | 5|
316 * | 4 4 4 4 4 5 5 1|
317 * |0 5 6 7 8 9 0 1 2|
318 * ----------------------------------------
319 * |
320 * |
321 * V
322 * ----------------------------------------
323 * | Indirect Block |
324 * | 1 1 1 1 1 5|
325 * | 6 6 6 6 6 1|
326 * |0 3 4 5 6 7 2|
327 * ----------------------------------------
328 * |
329 * |
330 * V
331 * ----------------------------------------
332 * | Data block containing offset |
333 * | 101342453 |
334 * | |
335 * | |
336 * ----------------------------------------
337 *
338 */
339
340static void find_metapath(struct gfs2_inode *ip, u64 block,
341 struct metapath *mp)
342{
343 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
344 u64 b = block;
345 unsigned int i;
346
347 for (i = ip->i_di.di_height; i--;)
348 mp->mp_list[i] = do_div(b, sdp->sd_inptrs);
349
350}
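/*
 * Worked check of the example in the comment above (a sketch, assuming
 * a 4096-byte block size and 512 pointers per indirect block, i.e.
 * sdp->sd_inptrs == 512):
 *
 *   101342453 / 4096 = 24741  (the logical block holding that offset)
 *   24741 % 512 = 165  ->  mp_list[2] = 165,  24741 / 512 = 48
 *      48 % 512 =  48  ->  mp_list[1] =  48,     48 / 512 =  0
 *       0 % 512 =   0  ->  mp_list[0] =   0
 *
 * which reproduces the mp_list values quoted above.
 */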
351
352/**
353 * metapointer - Return pointer to start of metadata in a buffer
354 * @bh: The buffer
355 * @height: The metadata height (0 = dinode)
356 * @mp: The metapath
357 *
358 * Return a pointer to the block number of the next height of the metadata
359 * tree given a buffer containing the pointer to the current height of the
360 * metadata tree.
361 */
362
363static inline u64 *metapointer(struct buffer_head *bh, int *boundary,
364 unsigned int height, const struct metapath *mp)
365{
366 unsigned int head_size = (height > 0) ?
367 sizeof(struct gfs2_meta_header) : sizeof(struct gfs2_dinode);
368 u64 *ptr;
369 *boundary = 0;
370 ptr = ((u64 *)(bh->b_data + head_size)) + mp->mp_list[height];
371 if (ptr + 1 == (u64 *)(bh->b_data + bh->b_size))
372 *boundary = 1;
373 return ptr;
374}
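/*
 * Boundary arithmetic for metapointer() above, as a worked example
 * (assuming 4096-byte blocks and a 24-byte gfs2_meta_header, so 509
 * pointers fit in an indirect block): the pointers occupy bytes
 * 24..4095, and *boundary becomes 1 only when mp_list[height] == 508,
 * i.e. the returned pointer is the last slot before
 * bh->b_data + bh->b_size.
 */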
375
376/**
377 * lookup_block - Get the next metadata block in metadata tree
378 * @ip: The GFS2 inode
379 * @bh: Buffer containing the pointers to metadata blocks
380 * @height: The height of the tree (0 = dinode)
381 * @mp: The metapath
382 * @create: Non-zero if we may create a new metadata block
383 * @new: Used to indicate if we did create a new metadata block
384 * @block: the returned disk block number
385 *
386 * Given a metatree, complete to a particular height, checks to see if the next
387 * height of the tree exists. If not, and @create is set, it is created.
388 * The block number of the next height of the metadata tree is returned.
389 *
390 */
391
392static int lookup_block(struct gfs2_inode *ip, struct buffer_head *bh,
393 unsigned int height, struct metapath *mp, int create,
394 int *new, u64 *block)
395{
396 int boundary;
397 u64 *ptr = metapointer(bh, &boundary, height, mp);
398
399 if (*ptr) {
400 *block = be64_to_cpu(*ptr);
401 return boundary;
402 }
403
404 *block = 0;
405
406 if (!create)
407 return 0;
408
409 if (height == ip->i_di.di_height - 1 && !gfs2_is_dir(ip))
410 *block = gfs2_alloc_data(ip);
411 else
412 *block = gfs2_alloc_meta(ip);
413
414 gfs2_trans_add_bh(ip->i_gl, bh, 1);
415
416 *ptr = cpu_to_be64(*block);
417 ip->i_di.di_blocks++;
418
419 *new = 1;
420 return 0;
421}
422
423/**
424 * gfs2_block_pointers - Map a block from an inode to a disk block
425 * @inode: The inode
426 * @lblock: The logical block number
427 * @bh_map: The bh to be mapped
428 * @mp: metapath to use
429 *
430 * Find the block number on the current device which corresponds to an
431 * inode's block. With @create set, a missing block is allocated and @bh_map is marked new.
432 *
433 * Returns: errno
434 */
435
436static int gfs2_block_pointers(struct inode *inode, u64 lblock, int create,
437 struct buffer_head *bh_map, struct metapath *mp)
438{
439 struct gfs2_inode *ip = GFS2_I(inode);
440 struct gfs2_sbd *sdp = GFS2_SB(inode);
441 struct buffer_head *bh;
442 unsigned int bsize;
443 unsigned int height;
444 unsigned int end_of_metadata;
445 unsigned int x;
446 int error = 0;
447 int new = 0;
448 u64 dblock = 0;
449 int boundary;
450 unsigned int maxlen = bh_map->b_size >> inode->i_blkbits;
451
452 BUG_ON(maxlen == 0);
453
454 if (gfs2_assert_warn(sdp, !gfs2_is_stuffed(ip)))
455 return 0;
456
457 bsize = gfs2_is_dir(ip) ? sdp->sd_jbsize : sdp->sd_sb.sb_bsize;
458
459 height = calc_tree_height(ip, (lblock + 1) * bsize);
460 if (ip->i_di.di_height < height) {
461 if (!create)
462 return 0;
463
464 error = build_height(inode, height);
465 if (error)
466 return error;
467 }
468
469 find_metapath(ip, lblock, mp);
470 end_of_metadata = ip->i_di.di_height - 1;
471
472 error = gfs2_meta_inode_buffer(ip, &bh);
473 if (error)
474 return error;
475
476 for (x = 0; x < end_of_metadata; x++) {
477 lookup_block(ip, bh, x, mp, create, &new, &dblock);
478 brelse(bh);
479 if (!dblock)
480 return 0;
481
482 error = gfs2_meta_indirect_buffer(ip, x+1, dblock, new, &bh);
483 if (error)
484 return error;
485 }
486
487 boundary = lookup_block(ip, bh, end_of_metadata, mp, create, &new, &dblock);
488 clear_buffer_mapped(bh_map);
489 clear_buffer_new(bh_map);
490 clear_buffer_boundary(bh_map);
491
492 if (dblock) {
493 map_bh(bh_map, inode->i_sb, dblock);
494 if (boundary)
495 set_buffer_boundary(bh_map);
496 if (new) {
497 struct buffer_head *dibh;
498 error = gfs2_meta_inode_buffer(ip, &dibh);
499 if (!error) {
500 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
501 gfs2_dinode_out(&ip->i_di, dibh->b_data);
502 brelse(dibh);
503 }
504 set_buffer_new(bh_map);
505 goto out_brelse;
506 }
507 while(--maxlen && !buffer_boundary(bh_map)) {
508 u64 eblock;
509
510 mp->mp_list[end_of_metadata]++;
511 boundary = lookup_block(ip, bh, end_of_metadata, mp, 0, &new, &eblock);
512 if (eblock != ++dblock)
513 break;
514 bh_map->b_size += (1 << inode->i_blkbits);
515 if (boundary)
516 set_buffer_boundary(bh_map);
517 }
518 }
519out_brelse:
520 brelse(bh);
521 return 0;
522}
523
524
525static inline void bmap_lock(struct inode *inode, int create)
526{
527 struct gfs2_inode *ip = GFS2_I(inode);
528 if (create)
529 down_write(&ip->i_rw_mutex);
530 else
531 down_read(&ip->i_rw_mutex);
532}
533
534static inline void bmap_unlock(struct inode *inode, int create)
535{
536 struct gfs2_inode *ip = GFS2_I(inode);
537 if (create)
538 up_write(&ip->i_rw_mutex);
539 else
540 up_read(&ip->i_rw_mutex);
541}
542
543int gfs2_block_map(struct inode *inode, u64 lblock, int create,
544 struct buffer_head *bh)
545{
546 struct metapath mp;
547 int ret;
548
549 bmap_lock(inode, create);
550 ret = gfs2_block_pointers(inode, lblock, create, bh, &mp);
551 bmap_unlock(inode, create);
552 return ret;
553}
554
555int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen)
556{
557 struct metapath mp;
558 struct buffer_head bh = { .b_state = 0, .b_blocknr = 0 };
559 int ret;
560 int create = *new;
561
562 BUG_ON(!extlen);
563 BUG_ON(!dblock);
564 BUG_ON(!new);
565
566 bh.b_size = 1 << (inode->i_blkbits + 5);
567 bmap_lock(inode, create);
568 ret = gfs2_block_pointers(inode, lblock, create, &bh, &mp);
569 bmap_unlock(inode, create);
570 *extlen = bh.b_size >> inode->i_blkbits;
571 *dblock = bh.b_blocknr;
572 if (buffer_new(&bh))
573 *new = 1;
574 else
575 *new = 0;
576 return ret;
577}
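/*
 * The b_size seeding in gfs2_extent_map() above caps how far the extent
 * probe runs: b_size = 1 << (i_blkbits + 5) is exactly 32 blocks, so
 * *extlen is at most 32. As a worked example (assuming 4096-byte
 * blocks, i_blkbits == 12): b_size = 1 << 17 = 131072 bytes = 32 blocks.
 */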
578
579/**
580 * recursive_scan - recursively scan through the metadata tree of a file
581 * @ip: the inode
582 * @dibh: the dinode buffer
583 * @mp: the path through the metadata to the point to start
584 * @height: the height the recursion is at
585 * @block: the indirect block to look at
586 * @first: 1 if this is the first block
587 * @bc: the call to make for each piece of metadata
588 * @data: data opaque to this function to pass to @bc
589 *
590 * When this is first called @height and @block should be zero and
591 * @first should be 1.
592 *
593 * Returns: errno
594 */
595
596static int recursive_scan(struct gfs2_inode *ip, struct buffer_head *dibh,
597 struct metapath *mp, unsigned int height,
598 u64 block, int first, block_call_t bc,
599 void *data)
600{
601 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
602 struct buffer_head *bh = NULL;
603 u64 *top, *bottom;
604 u64 bn;
605 int error;
606 int mh_size = sizeof(struct gfs2_meta_header);
607
608 if (!height) {
609 error = gfs2_meta_inode_buffer(ip, &bh);
610 if (error)
611 return error;
612 dibh = bh;
613
614 top = (u64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + mp->mp_list[0];
615 bottom = (u64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + sdp->sd_diptrs;
616 } else {
617 error = gfs2_meta_indirect_buffer(ip, height, block, 0, &bh);
618 if (error)
619 return error;
620
621 top = (u64 *)(bh->b_data + mh_size) +
622 (first ? mp->mp_list[height] : 0);
623
624 bottom = (u64 *)(bh->b_data + mh_size) + sdp->sd_inptrs;
625 }
626
627 error = bc(ip, dibh, bh, top, bottom, height, data);
628 if (error)
629 goto out;
630
631 if (height < ip->i_di.di_height - 1)
632 for (; top < bottom; top++, first = 0) {
633 if (!*top)
634 continue;
635
636 bn = be64_to_cpu(*top);
637
638 error = recursive_scan(ip, dibh, mp, height + 1, bn,
639 first, bc, data);
640 if (error)
641 break;
642 }
643
644out:
645 brelse(bh);
646 return error;
647}
648
649/**
650 * do_strip - Look for a particular layer of the file and strip it off
651 * @ip: the inode
652 * @dibh: the dinode buffer
653 * @bh: A buffer of pointers
654 * @top: The first pointer in the buffer
655 * @bottom: One more than the last pointer
656 * @height: the height this buffer is at
657 * @data: a pointer to a struct strip_mine
658 *
659 * Returns: errno
660 */
661
662static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
663 struct buffer_head *bh, u64 *top, u64 *bottom,
664 unsigned int height, void *data)
665{
666 struct strip_mine *sm = data;
667 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
668 struct gfs2_rgrp_list rlist;
669 u64 bn, bstart;
670 u32 blen;
671 u64 *p;
672 unsigned int rg_blocks = 0;
673 int metadata;
674 unsigned int revokes = 0;
675 int x;
676 int error;
677
678 if (!*top)
679 sm->sm_first = 0;
680
681 if (height != sm->sm_height)
682 return 0;
683
684 if (sm->sm_first) {
685 top++;
686 sm->sm_first = 0;
687 }
688
689 metadata = (height != ip->i_di.di_height - 1);
690 if (metadata)
691 revokes = (height) ? sdp->sd_inptrs : sdp->sd_diptrs;
692
693 error = gfs2_rindex_hold(sdp, &ip->i_alloc.al_ri_gh);
694 if (error)
695 return error;
696
697 memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));
698 bstart = 0;
699 blen = 0;
700
701 for (p = top; p < bottom; p++) {
702 if (!*p)
703 continue;
704
705 bn = be64_to_cpu(*p);
706
707 if (bstart + blen == bn)
708 blen++;
709 else {
710 if (bstart)
711 gfs2_rlist_add(sdp, &rlist, bstart);
712
713 bstart = bn;
714 blen = 1;
715 }
716 }
717
718 if (bstart)
719 gfs2_rlist_add(sdp, &rlist, bstart);
720 else
721 goto out; /* Nothing to do */
722
723 gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE, 0);
724
725 for (x = 0; x < rlist.rl_rgrps; x++) {
726 struct gfs2_rgrpd *rgd;
727 rgd = rlist.rl_ghs[x].gh_gl->gl_object;
728 rg_blocks += rgd->rd_ri.ri_length;
729 }
730
731 error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs);
732 if (error)
733 goto out_rlist;
734
735 error = gfs2_trans_begin(sdp, rg_blocks + RES_DINODE +
736 RES_INDIRECT + RES_STATFS + RES_QUOTA,
737 revokes);
738 if (error)
739 goto out_rg_gunlock;
740
741 down_write(&ip->i_rw_mutex);
742
743 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
744 gfs2_trans_add_bh(ip->i_gl, bh, 1);
745
746 bstart = 0;
747 blen = 0;
748
749 for (p = top; p < bottom; p++) {
750 if (!*p)
751 continue;
752
753 bn = be64_to_cpu(*p);
754
755 if (bstart + blen == bn)
756 blen++;
757 else {
758 if (bstart) {
759 if (metadata)
760 gfs2_free_meta(ip, bstart, blen);
761 else
762 gfs2_free_data(ip, bstart, blen);
763 }
764
765 bstart = bn;
766 blen = 1;
767 }
768
769 *p = 0;
770 if (!ip->i_di.di_blocks)
771 gfs2_consist_inode(ip);
772 ip->i_di.di_blocks--;
773 }
774 if (bstart) {
775 if (metadata)
776 gfs2_free_meta(ip, bstart, blen);
777 else
778 gfs2_free_data(ip, bstart, blen);
779 }
780
781 ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
782
783 gfs2_dinode_out(&ip->i_di, dibh->b_data);
784
785 up_write(&ip->i_rw_mutex);
786
787 gfs2_trans_end(sdp);
788
789out_rg_gunlock:
790 gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs);
791out_rlist:
792 gfs2_rlist_free(&rlist);
793out:
794 gfs2_glock_dq_uninit(&ip->i_alloc.al_ri_gh);
795 return error;
796}
797
798/**
799 * do_grow - Make a file look bigger than it is
800 * @ip: the inode
801 * @size: the size to set the file to
802 *
803 * Called with an exclusive lock on @ip.
804 *
805 * Returns: errno
806 */
807
808static int do_grow(struct gfs2_inode *ip, u64 size)
809{
810 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
811 struct gfs2_alloc *al;
812 struct buffer_head *dibh;
813 unsigned int h;
814 int error;
815
816 al = gfs2_alloc_get(ip);
817
818 error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
819 if (error)
820 goto out;
821
822 error = gfs2_quota_check(ip, ip->i_di.di_uid, ip->i_di.di_gid);
823 if (error)
824 goto out_gunlock_q;
825
826 al->al_requested = sdp->sd_max_height + RES_DATA;
827
828 error = gfs2_inplace_reserve(ip);
829 if (error)
830 goto out_gunlock_q;
831
832 error = gfs2_trans_begin(sdp,
833 sdp->sd_max_height + al->al_rgd->rd_ri.ri_length +
834 RES_JDATA + RES_DINODE + RES_STATFS + RES_QUOTA, 0);
835 if (error)
836 goto out_ipres;
837
838 if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) {
839 if (gfs2_is_stuffed(ip)) {
840 error = gfs2_unstuff_dinode(ip, NULL);
841 if (error)
842 goto out_end_trans;
843 }
844
845 h = calc_tree_height(ip, size);
846 if (ip->i_di.di_height < h) {
847 down_write(&ip->i_rw_mutex);
848 error = build_height(&ip->i_inode, h);
849 up_write(&ip->i_rw_mutex);
850 if (error)
851 goto out_end_trans;
852 }
853 }
854
855 ip->i_di.di_size = size;
856 ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
857
858 error = gfs2_meta_inode_buffer(ip, &dibh);
859 if (error)
860 goto out_end_trans;
861
862 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
863 gfs2_dinode_out(&ip->i_di, dibh->b_data);
864 brelse(dibh);
865
866out_end_trans:
867 gfs2_trans_end(sdp);
868out_ipres:
869 gfs2_inplace_release(ip);
870out_gunlock_q:
871 gfs2_quota_unlock(ip);
872out:
873 gfs2_alloc_put(ip);
874 return error;
875}
876
877
878/**
879 * gfs2_block_truncate_page - Deal with zeroing out data for truncate
880 *
881 * This is partly borrowed from ext3.
882 */
883static int gfs2_block_truncate_page(struct address_space *mapping)
884{
885 struct inode *inode = mapping->host;
886 struct gfs2_inode *ip = GFS2_I(inode);
887 struct gfs2_sbd *sdp = GFS2_SB(inode);
888 loff_t from = inode->i_size;
889 unsigned long index = from >> PAGE_CACHE_SHIFT;
890 unsigned offset = from & (PAGE_CACHE_SIZE-1);
891 unsigned blocksize, iblock, length, pos;
892 struct buffer_head *bh;
893 struct page *page;
894 void *kaddr;
895 int err;
896
897 page = grab_cache_page(mapping, index);
898 if (!page)
899 return 0;
900
901 blocksize = inode->i_sb->s_blocksize;
902 length = blocksize - (offset & (blocksize - 1));
903 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
904
905 if (!page_has_buffers(page))
906 create_empty_buffers(page, blocksize, 0);
907
908 /* Find the buffer that contains "offset" */
909 bh = page_buffers(page);
910 pos = blocksize;
911 while (offset >= pos) {
912 bh = bh->b_this_page;
913 iblock++;
914 pos += blocksize;
915 }
916
917 err = 0;
918
919 if (!buffer_mapped(bh)) {
920 gfs2_get_block(inode, iblock, bh, 0);
921 /* unmapped? It's a hole - nothing to do */
922 if (!buffer_mapped(bh))
923 goto unlock;
924 }
925
926 /* Ok, it's mapped. Make sure it's up-to-date */
927 if (PageUptodate(page))
928 set_buffer_uptodate(bh);
929
930 if (!buffer_uptodate(bh)) {
931 err = -EIO;
932 ll_rw_block(READ, 1, &bh);
933 wait_on_buffer(bh);
934 /* Uhhuh. Read error. Complain and punt. */
935 if (!buffer_uptodate(bh))
936 goto unlock;
937 }
938
939 if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip))
940 gfs2_trans_add_bh(ip->i_gl, bh, 0);
941
942 kaddr = kmap_atomic(page, KM_USER0);
943 memset(kaddr + offset, 0, length);
944 flush_dcache_page(page);
945 kunmap_atomic(kaddr, KM_USER0);
946
947unlock:
948 unlock_page(page);
949 page_cache_release(page);
950 return err;
951}
952
953static int trunc_start(struct gfs2_inode *ip, u64 size)
954{
955 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
956 struct buffer_head *dibh;
957 int journaled = gfs2_is_jdata(ip);
958 int error;
959
960 error = gfs2_trans_begin(sdp,
961 RES_DINODE + (journaled ? RES_JDATA : 0), 0);
962 if (error)
963 return error;
964
965 error = gfs2_meta_inode_buffer(ip, &dibh);
966 if (error)
967 goto out;
968
969 if (gfs2_is_stuffed(ip)) {
970 ip->i_di.di_size = size;
971 ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
972 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
973 gfs2_dinode_out(&ip->i_di, dibh->b_data);
974 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + size);
975 error = 1;
976
977 } else {
978 if (size & (u64)(sdp->sd_sb.sb_bsize - 1))
979 error = gfs2_block_truncate_page(ip->i_inode.i_mapping);
980
981 if (!error) {
982 ip->i_di.di_size = size;
983 ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
984 ip->i_di.di_flags |= GFS2_DIF_TRUNC_IN_PROG;
985 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
986 gfs2_dinode_out(&ip->i_di, dibh->b_data);
987 }
988 }
989
990 brelse(dibh);
991
992out:
993 gfs2_trans_end(sdp);
994 return error;
995}
996
997static int trunc_dealloc(struct gfs2_inode *ip, u64 size)
998{
999 unsigned int height = ip->i_di.di_height;
1000 u64 lblock;
1001 struct metapath mp;
1002 int error;
1003
1004 if (!size)
1005 lblock = 0;
1006 else
1007 lblock = (size - 1) >> GFS2_SB(&ip->i_inode)->sd_sb.sb_bsize_shift;
1008
1009 find_metapath(ip, lblock, &mp);
1010 gfs2_alloc_get(ip);
1011
1012 error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
1013 if (error)
1014 goto out;
1015
1016 while (height--) {
1017 struct strip_mine sm;
1018 sm.sm_first = !!size;
1019 sm.sm_height = height;
1020
1021 error = recursive_scan(ip, NULL, &mp, 0, 0, 1, do_strip, &sm);
1022 if (error)
1023 break;
1024 }
1025
1026 gfs2_quota_unhold(ip);
1027
1028out:
1029 gfs2_alloc_put(ip);
1030 return error;
1031}
1032
1033static int trunc_end(struct gfs2_inode *ip)
1034{
1035 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1036 struct buffer_head *dibh;
1037 int error;
1038
1039 error = gfs2_trans_begin(sdp, RES_DINODE, 0);
1040 if (error)
1041 return error;
1042
1043 down_write(&ip->i_rw_mutex);
1044
1045 error = gfs2_meta_inode_buffer(ip, &dibh);
1046 if (error)
1047 goto out;
1048
1049 if (!ip->i_di.di_size) {
1050 ip->i_di.di_height = 0;
1051 ip->i_di.di_goal_meta =
1052 ip->i_di.di_goal_data =
1053 ip->i_num.no_addr;
1054 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
1055 }
1056 ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
1057 ip->i_di.di_flags &= ~GFS2_DIF_TRUNC_IN_PROG;
1058
1059 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1060 gfs2_dinode_out(&ip->i_di, dibh->b_data);
1061 brelse(dibh);
1062
1063out:
1064 up_write(&ip->i_rw_mutex);
1065 gfs2_trans_end(sdp);
1066 return error;
1067}
1068
1069/**
1070 * do_shrink - make a file smaller
1071 * @ip: the inode
1072 * @size: the size to make the file
1074 *
1075 * Called with an exclusive lock on @ip.
1076 *
1077 * Returns: errno
1078 */
1079
1080static int do_shrink(struct gfs2_inode *ip, u64 size)
1081{
1082 int error;
1083
1084 error = trunc_start(ip, size);
1085 if (error < 0)
1086 return error;
1087 if (error > 0)
1088 return 0;
1089
1090 error = trunc_dealloc(ip, size);
1091 if (!error)
1092 error = trunc_end(ip);
1093
1094 return error;
1095}
1096
1097/**
1098 * gfs2_truncatei - make a file a given size
1099 * @ip: the inode
1100 * @size: the size to make the file
1102 *
1103 * The file size can grow, shrink, or stay the same size.
1104 *
1105 * Returns: errno
1106 */
1107
1108int gfs2_truncatei(struct gfs2_inode *ip, u64 size)
1109{
1110 int error;
1111
1112 if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), S_ISREG(ip->i_di.di_mode)))
1113 return -EINVAL;
1114
1115 if (size > ip->i_di.di_size)
1116 error = do_grow(ip, size);
1117 else
1118 error = do_shrink(ip, size);
1119
1120 return error;
1121}
1122
1123int gfs2_truncatei_resume(struct gfs2_inode *ip)
1124{
1125 int error;
1126 error = trunc_dealloc(ip, ip->i_di.di_size);
1127 if (!error)
1128 error = trunc_end(ip);
1129 return error;
1130}
1131
1132int gfs2_file_dealloc(struct gfs2_inode *ip)
1133{
1134 return trunc_dealloc(ip, 0);
1135}
1136
1137/**
1138 * gfs2_write_calc_reserv - calculate number of blocks needed to write to a file
1139 * @ip: the file
1140 * @len: the number of bytes to be written to the file
1141 * @data_blocks: returns the number of data blocks required
1142 * @ind_blocks: returns the number of indirect blocks required
1143 *
1144 */
1145
1146void gfs2_write_calc_reserv(struct gfs2_inode *ip, unsigned int len,
1147 unsigned int *data_blocks, unsigned int *ind_blocks)
1148{
1149 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1150 unsigned int tmp;
1151
1152 if (gfs2_is_dir(ip)) {
1153 *data_blocks = DIV_ROUND_UP(len, sdp->sd_jbsize) + 2;
1154 *ind_blocks = 3 * (sdp->sd_max_jheight - 1);
1155 } else {
1156 *data_blocks = (len >> sdp->sd_sb.sb_bsize_shift) + 3;
1157 *ind_blocks = 3 * (sdp->sd_max_height - 1);
1158 }
1159
1160 for (tmp = *data_blocks; tmp > sdp->sd_diptrs;) {
1161 tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs);
1162 *ind_blocks += tmp;
1163 }
1164}
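/*
 * Worked example for the reservation above (a sketch; the pointer
 * counts are assumed values for a 4096-byte block size, roughly
 * sd_diptrs ~ 483 and sd_inptrs ~ 509): a 16 MiB write to a regular
 * file gives *data_blocks = (16777216 >> 12) + 3 = 4099. Since
 * 4099 > 483, one loop pass adds DIV_ROUND_UP(4099, 509) = 9 extra
 * indirect blocks on top of the 3 * (sd_max_height - 1) baseline.
 */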
1165
1166/**
1167 * gfs2_write_alloc_required - figure out if a write will require an allocation
1168 * @ip: the file being written to
1169 * @offset: the offset to write to
1170 * @len: the number of bytes being written
1171 * @alloc_required: set to 1 if an alloc is required, 0 otherwise
1172 *
1173 * Returns: errno
1174 */
1175
1176int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
1177 unsigned int len, int *alloc_required)
1178{
1179 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1180 u64 lblock, lblock_stop, dblock;
1181 u32 extlen;
1182 int new = 0;
1183 int error = 0;
1184
1185 *alloc_required = 0;
1186
1187 if (!len)
1188 return 0;
1189
1190 if (gfs2_is_stuffed(ip)) {
1191 if (offset + len >
1192 sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode))
1193 *alloc_required = 1;
1194 return 0;
1195 }
1196
1197 if (gfs2_is_dir(ip)) {
1198 unsigned int bsize = sdp->sd_jbsize;
1199 lblock = offset;
1200 do_div(lblock, bsize);
1201 lblock_stop = offset + len + bsize - 1;
1202 do_div(lblock_stop, bsize);
1203 } else {
1204 unsigned int shift = sdp->sd_sb.sb_bsize_shift;
1205 lblock = offset >> shift;
1206 lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift;
1207 }
1208
1209 for (; lblock < lblock_stop; lblock += extlen) {
1210 error = gfs2_extent_map(&ip->i_inode, lblock, &new, &dblock, &extlen);
1211 if (error)
1212 return error;
1213
1214 if (!dblock) {
1215 *alloc_required = 1;
1216 return 0;
1217 }
1218 }
1219
1220 return 0;
1221}
1222
diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h
new file mode 100644
index 000000000000..ac2fd04370dc
--- /dev/null
+++ b/fs/gfs2/bmap.h
@@ -0,0 +1,31 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __BMAP_DOT_H__
11#define __BMAP_DOT_H__
12
13struct inode;
14struct gfs2_inode;
15struct page;
16
17int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page);
18int gfs2_block_map(struct inode *inode, u64 lblock, int create, struct buffer_head *bh);
19int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen);
20
21int gfs2_truncatei(struct gfs2_inode *ip, u64 size);
22int gfs2_truncatei_resume(struct gfs2_inode *ip);
23int gfs2_file_dealloc(struct gfs2_inode *ip);
24
25void gfs2_write_calc_reserv(struct gfs2_inode *ip, unsigned int len,
26 unsigned int *data_blocks,
27 unsigned int *ind_blocks);
28int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
29 unsigned int len, int *alloc_required);
30
31#endif /* __BMAP_DOT_H__ */
diff --git a/fs/gfs2/daemon.c b/fs/gfs2/daemon.c
new file mode 100644
index 000000000000..cab1f68d4685
--- /dev/null
+++ b/fs/gfs2/daemon.c
@@ -0,0 +1,196 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/kthread.h>
16#include <linux/delay.h>
17#include <linux/gfs2_ondisk.h>
18#include <linux/lm_interface.h>
19
20#include "gfs2.h"
21#include "incore.h"
22#include "daemon.h"
23#include "glock.h"
24#include "log.h"
25#include "quota.h"
26#include "recovery.h"
27#include "super.h"
28#include "util.h"
29
30/* This uses schedule_timeout() instead of msleep() because it's good for
31 the daemons to wake up more often than the timeout when unmounting so
32 the user's unmount doesn't sit there forever.
33
34 The kthread functions used to start these daemons block and flush signals. */
35
36/**
37 * gfs2_scand - Look for cached glocks and inodes to toss from memory
38 * @sdp: Pointer to GFS2 superblock
39 *
40 * One of these daemons runs, finding candidates to add to sd_reclaim_list.
41 * See gfs2_glockd()
42 */
43
44int gfs2_scand(void *data)
45{
46 struct gfs2_sbd *sdp = data;
47 unsigned long t;
48
49 while (!kthread_should_stop()) {
50 gfs2_scand_internal(sdp);
51 t = gfs2_tune_get(sdp, gt_scand_secs) * HZ;
52 schedule_timeout_interruptible(t);
53 }
54
55 return 0;
56}
57
58/**
59 * gfs2_glockd - Reclaim unused glock structures
60 * @sdp: Pointer to GFS2 superblock
61 *
62 * One or more of these daemons run, reclaiming glocks on sd_reclaim_list.
63 * Number of daemons can be set by user, with num_glockd mount option.
64 */
65
66int gfs2_glockd(void *data)
67{
68 struct gfs2_sbd *sdp = data;
69
70 while (!kthread_should_stop()) {
71 while (atomic_read(&sdp->sd_reclaim_count))
72 gfs2_reclaim_glock(sdp);
73
74 wait_event_interruptible(sdp->sd_reclaim_wq,
75 (atomic_read(&sdp->sd_reclaim_count) ||
76 kthread_should_stop()));
77 }
78
79 return 0;
80}
81
82/**
83 * gfs2_recoverd - Recover dead machine's journals
84 * @sdp: Pointer to GFS2 superblock
85 *
86 */
87
88int gfs2_recoverd(void *data)
89{
90 struct gfs2_sbd *sdp = data;
91 unsigned long t;
92
93 while (!kthread_should_stop()) {
94 gfs2_check_journals(sdp);
95 t = gfs2_tune_get(sdp, gt_recoverd_secs) * HZ;
96 schedule_timeout_interruptible(t);
97 }
98
99 return 0;
100}
101
102/**
103 * gfs2_logd - Update log tail as Active Items get flushed to in-place blocks
104 * @sdp: Pointer to GFS2 superblock
105 *
106 * Also, periodically check to make sure that we're using the most recent
107 * journal index.
108 */
109
110int gfs2_logd(void *data)
111{
112 struct gfs2_sbd *sdp = data;
113 struct gfs2_holder ji_gh;
114 unsigned long t;
115
116 while (!kthread_should_stop()) {
117 /* Advance the log tail */
118
119 t = sdp->sd_log_flush_time +
120 gfs2_tune_get(sdp, gt_log_flush_secs) * HZ;
121
122 gfs2_ail1_empty(sdp, DIO_ALL);
123
124 if (time_after_eq(jiffies, t)) {
125 gfs2_log_flush(sdp, NULL);
126 sdp->sd_log_flush_time = jiffies;
127 }
128
129 /* Check for latest journal index */
130
131 t = sdp->sd_jindex_refresh_time +
132 gfs2_tune_get(sdp, gt_jindex_refresh_secs) * HZ;
133
134 if (time_after_eq(jiffies, t)) {
135 if (!gfs2_jindex_hold(sdp, &ji_gh))
136 gfs2_glock_dq_uninit(&ji_gh);
137 sdp->sd_jindex_refresh_time = jiffies;
138 }
139
140 t = gfs2_tune_get(sdp, gt_logd_secs) * HZ;
141 schedule_timeout_interruptible(t);
142 }
143
144 return 0;
145}
146
147/**
148 * gfs2_quotad - Write cached quota changes into the quota file
149 * @sdp: Pointer to GFS2 superblock
150 *
151 */
152
153int gfs2_quotad(void *data)
154{
155 struct gfs2_sbd *sdp = data;
156 unsigned long t;
157 int error;
158
159 while (!kthread_should_stop()) {
160 /* Update the master statfs file */
161
162 t = sdp->sd_statfs_sync_time +
163 gfs2_tune_get(sdp, gt_statfs_quantum) * HZ;
164
165 if (time_after_eq(jiffies, t)) {
166 error = gfs2_statfs_sync(sdp);
167 if (error &&
168 error != -EROFS &&
169 !test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
170 fs_err(sdp, "quotad: (1) error=%d\n", error);
171 sdp->sd_statfs_sync_time = jiffies;
172 }
173
174 /* Update quota file */
175
176 t = sdp->sd_quota_sync_time +
177 gfs2_tune_get(sdp, gt_quota_quantum) * HZ;
178
179 if (time_after_eq(jiffies, t)) {
180 error = gfs2_quota_sync(sdp);
181 if (error &&
182 error != -EROFS &&
183 !test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
184 fs_err(sdp, "quotad: (2) error=%d\n", error);
185 sdp->sd_quota_sync_time = jiffies;
186 }
187
188 gfs2_quota_scan(sdp);
189
190 t = gfs2_tune_get(sdp, gt_quotad_secs) * HZ;
191 schedule_timeout_interruptible(t);
192 }
193
194 return 0;
195}
196
diff --git a/fs/gfs2/daemon.h b/fs/gfs2/daemon.h
new file mode 100644
index 000000000000..801007120fb2
--- /dev/null
+++ b/fs/gfs2/daemon.h
@@ -0,0 +1,19 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __DAEMON_DOT_H__
11#define __DAEMON_DOT_H__
12
13int gfs2_scand(void *data);
14int gfs2_glockd(void *data);
15int gfs2_recoverd(void *data);
16int gfs2_logd(void *data);
17int gfs2_quotad(void *data);
18
19#endif /* __DAEMON_DOT_H__ */
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
new file mode 100644
index 000000000000..e24af28b1a12
--- /dev/null
+++ b/fs/gfs2/dir.c
@@ -0,0 +1,1957 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10/*
11 * Implements Extendible Hashing as described in:
12 * "Extendible Hashing" by Fagin, et al in
13 * __ACM Trans. on Database Systems__, Sept 1979.
14 *
15 *
16 * Here's the layout of dirents which is essentially the same as that of ext2
17 * within a single block. The field de_name_len is the number of bytes
18 * actually required for the name (no null terminator). The field de_rec_len
19 * is the number of bytes allocated to the dirent. The offset of the next
20 * dirent in the block is (dirent + dirent->de_rec_len). When a dirent is
21 * deleted, the preceding dirent inherits its allocated space, ie
22 * prev->de_rec_len += deleted->de_rec_len. Since the next dirent is obtained
23 * by adding de_rec_len to the current dirent, this essentially causes the
24 * deleted dirent to get jumped over when iterating through all the dirents.
25 *
26 * When deleting the first dirent in a block, there is no previous dirent so
27 * the field de_ino is set to zero to designate it as deleted. When allocating
28 * a dirent, gfs2_dirent_alloc iterates through the dirents in a block. If the
29 * first dirent has (de_ino == 0) and de_rec_len is large enough, this first
30 * dirent is allocated. Otherwise it must go through all the 'used' dirents
31 * searching for one in which the amount of total space minus the amount of
32 * used space will provide enough space for the new dirent.
33 *
34 * There are two types of blocks in which dirents reside. In a stuffed dinode,
35 * the dirents begin at offset sizeof(struct gfs2_dinode) from the beginning of
36 * the block. In leaves, they begin at offset sizeof(struct gfs2_leaf) from the
37 * beginning of the leaf block. The dirents reside in leaves when
38 *
39 * dip->i_di.di_flags & GFS2_DIF_EXHASH is true
40 *
41 * Otherwise, the dirents are "linear", within a single stuffed dinode block.
42 *
43 * When the dirents are in leaves, the actual contents of the directory file are
44 * used as an array of 64-bit block pointers pointing to the leaf blocks. The
45 * dirents are NOT in the directory file itself. There can be more than one
46 * block pointer in the array that points to the same leaf. In fact, when a
47 * directory is first converted from linear to exhash, all of the pointers
48 * point to the same leaf.
49 *
50 * When a leaf is completely full, the size of the hash table can be
51 * doubled, unless it is already at the maximum size, which is hard coded
52 * into GFS2_DIR_MAX_DEPTH; only then are full leaves chained together in
53 * a linked list. (A walk over dirents via de_rec_len is sketched below.)
54 */
55
56#include <linux/sched.h>
57#include <linux/slab.h>
58#include <linux/spinlock.h>
59#include <linux/buffer_head.h>
60#include <linux/sort.h>
61#include <linux/gfs2_ondisk.h>
62#include <linux/crc32.h>
63#include <linux/vmalloc.h>
64#include <linux/lm_interface.h>
65
66#include "gfs2.h"
67#include "incore.h"
68#include "dir.h"
69#include "glock.h"
70#include "inode.h"
71#include "meta_io.h"
72#include "quota.h"
73#include "rgrp.h"
74#include "trans.h"
75#include "bmap.h"
76#include "util.h"
77
78#define IS_LEAF 1 /* Hashed (leaf) directory */
79#define IS_DINODE 2 /* Linear (stuffed dinode block) directory */
80
81#define gfs2_disk_hash2offset(h) (((u64)(h)) >> 1)
82#define gfs2_dir_offset2hash(p) ((u32)(((u64)(p)) << 1))
83
84typedef int (*leaf_call_t) (struct gfs2_inode *dip, u32 index, u32 len,
85 u64 leaf_no, void *data);
86typedef int (*gfs2_dscan_t)(const struct gfs2_dirent *dent,
87 const struct qstr *name, void *opaque);
88
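/*
 * A minimal sketch of the de_rec_len chaining described at the top of
 * this file. It assumes "buf"/"len" describe one stuffed dinode block,
 * omits the size validation that gfs2_check_dirent() performs, and is
 * illustrative only (not called anywhere below):
 */
static inline unsigned int example_count_dirents(void *buf, unsigned int len)
{
	unsigned int offset = sizeof(struct gfs2_dinode);
	unsigned int count = 0;

	while (offset < len) {
		struct gfs2_dirent *dent = buf + offset;

		/* de_inum.no_addr == 0 marks a deleted (skipped) entry */
		if (dent->de_inum.no_addr)
			count++;
		offset += be16_to_cpu(dent->de_rec_len);
	}
	return count;
}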
89
90int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block,
91 struct buffer_head **bhp)
92{
93 struct buffer_head *bh;
94
95 bh = gfs2_meta_new(ip->i_gl, block);
96 gfs2_trans_add_bh(ip->i_gl, bh, 1);
97 gfs2_metatype_set(bh, GFS2_METATYPE_JD, GFS2_FORMAT_JD);
98 gfs2_buffer_clear_tail(bh, sizeof(struct gfs2_meta_header));
99 *bhp = bh;
100 return 0;
101}
102
103static int gfs2_dir_get_existing_buffer(struct gfs2_inode *ip, u64 block,
104 struct buffer_head **bhp)
105{
106 struct buffer_head *bh;
107 int error;
108
109 error = gfs2_meta_read(ip->i_gl, block, DIO_WAIT, &bh);
110 if (error)
111 return error;
112 if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), bh, GFS2_METATYPE_JD)) {
113 brelse(bh);
114 return -EIO;
115 }
116 *bhp = bh;
117 return 0;
118}
119
120static int gfs2_dir_write_stuffed(struct gfs2_inode *ip, const char *buf,
121 unsigned int offset, unsigned int size)
122{
123 struct buffer_head *dibh;
124 int error;
125
126 error = gfs2_meta_inode_buffer(ip, &dibh);
127 if (error)
128 return error;
129
130 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
131 memcpy(dibh->b_data + offset + sizeof(struct gfs2_dinode), buf, size);
132 if (ip->i_di.di_size < offset + size)
133 ip->i_di.di_size = offset + size;
134 ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
135 gfs2_dinode_out(&ip->i_di, dibh->b_data);
136
137 brelse(dibh);
138
139 return size;
140}
141
142
143
144/**
145 * gfs2_dir_write_data - Write directory information to the inode
146 * @ip: The GFS2 inode
147 * @buf: The buffer containing information to be written
148 * @offset: The file offset to start writing at
149 * @size: The amount of data to write
150 *
151 * Returns: The number of bytes correctly written or error code
152 */
153static int gfs2_dir_write_data(struct gfs2_inode *ip, const char *buf,
154 u64 offset, unsigned int size)
155{
156 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
157 struct buffer_head *dibh;
158 u64 lblock, dblock;
159 u32 extlen = 0;
160 unsigned int o;
161 int copied = 0;
162 int error = 0;
163
164 if (!size)
165 return 0;
166
167 if (gfs2_is_stuffed(ip) &&
168 offset + size <= sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode))
169 return gfs2_dir_write_stuffed(ip, buf, (unsigned int)offset,
170 size);
171
172 if (gfs2_assert_warn(sdp, gfs2_is_jdata(ip)))
173 return -EINVAL;
174
175 if (gfs2_is_stuffed(ip)) {
176 error = gfs2_unstuff_dinode(ip, NULL);
177 if (error)
178 return error;
179 }
180
181 lblock = offset;
182 o = do_div(lblock, sdp->sd_jbsize) + sizeof(struct gfs2_meta_header);
183
184 while (copied < size) {
185 unsigned int amount;
186 struct buffer_head *bh;
187 int new = 0;
188
189 amount = size - copied;
190 if (amount > sdp->sd_sb.sb_bsize - o)
191 amount = sdp->sd_sb.sb_bsize - o;
192
193 if (!extlen) {
194 new = 1;
195 error = gfs2_extent_map(&ip->i_inode, lblock, &new,
196 &dblock, &extlen);
197 if (error)
198 goto fail;
199 error = -EIO;
200 if (gfs2_assert_withdraw(sdp, dblock))
201 goto fail;
202 }
203
204 if (amount == sdp->sd_jbsize || new)
205 error = gfs2_dir_get_new_buffer(ip, dblock, &bh);
206 else
207 error = gfs2_dir_get_existing_buffer(ip, dblock, &bh);
208
209 if (error)
210 goto fail;
211
212 gfs2_trans_add_bh(ip->i_gl, bh, 1);
213 memcpy(bh->b_data + o, buf, amount);
214 brelse(bh);
215
216 buf += amount;
217 copied += amount;
218 lblock++;
219 dblock++;
220 extlen--;
221
222 o = sizeof(struct gfs2_meta_header);
223 }
224
225out:
226 error = gfs2_meta_inode_buffer(ip, &dibh);
227 if (error)
228 return error;
229
230 if (ip->i_di.di_size < offset + copied)
231 ip->i_di.di_size = offset + copied;
232 ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
233
234 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
235 gfs2_dinode_out(&ip->i_di, dibh->b_data);
236 brelse(dibh);
237
238 return copied;
239fail:
240 if (copied)
241 goto out;
242 return error;
243}
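/*
 * Worked example of the offset split above (assuming a 4096-byte block
 * and a 24-byte gfs2_meta_header, so sdp->sd_jbsize == 4072): for
 * offset 10000, do_div() leaves lblock = 2 and returns
 * 10000 - 2 * 4072 = 1856, so the copy starts at byte
 * o = 1856 + 24 = 1880 of journaled data block 2.
 */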
244
245static int gfs2_dir_read_stuffed(struct gfs2_inode *ip, char *buf,
246 u64 offset, unsigned int size)
247{
248 struct buffer_head *dibh;
249 int error;
250
251 error = gfs2_meta_inode_buffer(ip, &dibh);
252 if (!error) {
253 offset += sizeof(struct gfs2_dinode);
254 memcpy(buf, dibh->b_data + offset, size);
255 brelse(dibh);
256 }
257
258 return (error) ? error : size;
259}
260
261
262/**
263 * gfs2_dir_read_data - Read data from a directory inode
264 * @ip: The GFS2 Inode
265 * @buf: The buffer to place result into
266 * @offset: File offset to begin reading from
267 * @size: Amount of data to transfer
268 *
269 * Returns: The amount of data actually copied or the error
270 */
271static int gfs2_dir_read_data(struct gfs2_inode *ip, char *buf, u64 offset,
272 unsigned int size, unsigned ra)
273{
274 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
275 u64 lblock, dblock;
276 u32 extlen = 0;
277 unsigned int o;
278 int copied = 0;
279 int error = 0;
280
281 if (offset >= ip->i_di.di_size)
282 return 0;
283
284 if (offset + size > ip->i_di.di_size)
285 size = ip->i_di.di_size - offset;
286
287 if (!size)
288 return 0;
289
290 if (gfs2_is_stuffed(ip))
291 return gfs2_dir_read_stuffed(ip, buf, offset, size);
292
293 if (gfs2_assert_warn(sdp, gfs2_is_jdata(ip)))
294 return -EINVAL;
295
296 lblock = offset;
297 o = do_div(lblock, sdp->sd_jbsize) + sizeof(struct gfs2_meta_header);
298
299 while (copied < size) {
300 unsigned int amount;
301 struct buffer_head *bh;
302 int new;
303
304 amount = size - copied;
305 if (amount > sdp->sd_sb.sb_bsize - o)
306 amount = sdp->sd_sb.sb_bsize - o;
307
308 if (!extlen) {
309 new = 0;
310 error = gfs2_extent_map(&ip->i_inode, lblock, &new,
311 &dblock, &extlen);
312 if (error || !dblock)
313 goto fail;
314 BUG_ON(extlen < 1);
315 if (!ra)
316 extlen = 1;
317 bh = gfs2_meta_ra(ip->i_gl, dblock, extlen);
318 } else {
319 error = gfs2_meta_read(ip->i_gl, dblock, DIO_WAIT, &bh);
320 if (error)
321 goto fail;
322 }
323 error = gfs2_metatype_check(sdp, bh, GFS2_METATYPE_JD);
324 if (error) {
325 brelse(bh);
326 goto fail;
327 }
328 dblock++;
329 extlen--;
330 memcpy(buf, bh->b_data + o, amount);
331 brelse(bh);
332 buf += amount;
333 copied += amount;
334 lblock++;
335 o = sizeof(struct gfs2_meta_header);
336 }
337
338 return copied;
339fail:
340 return (copied) ? copied : error;
341}
342
343static inline int __gfs2_dirent_find(const struct gfs2_dirent *dent,
344 const struct qstr *name, int ret)
345{
346 if (dent->de_inum.no_addr != 0 &&
347 be32_to_cpu(dent->de_hash) == name->hash &&
348 be16_to_cpu(dent->de_name_len) == name->len &&
349 memcmp(dent+1, name->name, name->len) == 0)
350 return ret;
351 return 0;
352}
353
354static int gfs2_dirent_find(const struct gfs2_dirent *dent,
355 const struct qstr *name,
356 void *opaque)
357{
358 return __gfs2_dirent_find(dent, name, 1);
359}
360
361static int gfs2_dirent_prev(const struct gfs2_dirent *dent,
362 const struct qstr *name,
363 void *opaque)
364{
365 return __gfs2_dirent_find(dent, name, 2);
366}
367
368/*
369 * name->name holds ptr to start of block.
370 * name->len holds size of block.
371 */
372static int gfs2_dirent_last(const struct gfs2_dirent *dent,
373 const struct qstr *name,
374 void *opaque)
375{
376 const char *start = name->name;
377 const char *end = (const char *)dent + be16_to_cpu(dent->de_rec_len);
378 if (name->len == (end - start))
379 return 1;
380 return 0;
381}
382
383static int gfs2_dirent_find_space(const struct gfs2_dirent *dent,
384 const struct qstr *name,
385 void *opaque)
386{
387 unsigned required = GFS2_DIRENT_SIZE(name->len);
388 unsigned actual = GFS2_DIRENT_SIZE(be16_to_cpu(dent->de_name_len));
389 unsigned totlen = be16_to_cpu(dent->de_rec_len);
390
391 if (!dent->de_inum.no_addr)
392 actual = GFS2_DIRENT_SIZE(0);
393 if (totlen - actual >= required)
394 return 1;
395 return 0;
396}
397
398struct dirent_gather {
399 const struct gfs2_dirent **pdent;
400 unsigned offset;
401};
402
403static int gfs2_dirent_gather(const struct gfs2_dirent *dent,
404 const struct qstr *name,
405 void *opaque)
406{
407 struct dirent_gather *g = opaque;
408 if (dent->de_inum.no_addr) {
409 g->pdent[g->offset++] = dent;
410 }
411 return 0;
412}
413
414/*
415 * Other possible things to check:
416 * - Inode located within filesystem size (and on valid block)
417 * - Valid directory entry type
418 * Not sure how heavy-weight we want to make this... could also check
419 * that the hash is correct, for example, but that would take a lot of extra time.
420 * For now the most important thing is to check that the various sizes
421 * are correct.
422 */
423static int gfs2_check_dirent(struct gfs2_dirent *dent, unsigned int offset,
424 unsigned int size, unsigned int len, int first)
425{
426 const char *msg = "gfs2_dirent too small";
427 if (unlikely(size < sizeof(struct gfs2_dirent)))
428 goto error;
429 msg = "gfs2_dirent misaligned";
430 if (unlikely(offset & 0x7))
431 goto error;
432 msg = "gfs2_dirent points beyond end of block";
433 if (unlikely(offset + size > len))
434 goto error;
435 msg = "zero inode number";
436 if (unlikely(!first && !dent->de_inum.no_addr))
437 goto error;
438 msg = "name length is greater than space in dirent";
439 if (dent->de_inum.no_addr &&
440 unlikely(sizeof(struct gfs2_dirent)+be16_to_cpu(dent->de_name_len) >
441 size))
442 goto error;
443 return 0;
444error:
445 printk(KERN_WARNING "gfs2_check_dirent: %s (%s)\n", msg,
446 first ? "first in block" : "not first in block");
447 return -EIO;
448}
449
450static int gfs2_dirent_offset(const void *buf)
451{
452 const struct gfs2_meta_header *h = buf;
453 int offset;
454
455 BUG_ON(buf == NULL);
456
457 switch(be32_to_cpu(h->mh_type)) {
458 case GFS2_METATYPE_LF:
459 offset = sizeof(struct gfs2_leaf);
460 break;
461 case GFS2_METATYPE_DI:
462 offset = sizeof(struct gfs2_dinode);
463 break;
464 default:
465 goto wrong_type;
466 }
467 return offset;
468wrong_type:
469 printk(KERN_WARNING "gfs2_scan_dirent: wrong block type %u\n",
470 be32_to_cpu(h->mh_type));
471 return -1;
472}
473
474static struct gfs2_dirent *gfs2_dirent_scan(struct inode *inode, void *buf,
475 unsigned int len, gfs2_dscan_t scan,
476 const struct qstr *name,
477 void *opaque)
478{
479 struct gfs2_dirent *dent, *prev;
480 unsigned offset;
481 unsigned size;
482 int ret = 0;
483
484 ret = gfs2_dirent_offset(buf);
485 if (ret < 0)
486 goto consist_inode;
487
488 offset = ret;
489 prev = NULL;
490 dent = buf + offset;
491 size = be16_to_cpu(dent->de_rec_len);
492 if (gfs2_check_dirent(dent, offset, size, len, 1))
493 goto consist_inode;
494 do {
495 ret = scan(dent, name, opaque);
496 if (ret)
497 break;
498 offset += size;
499 if (offset == len)
500 break;
501 prev = dent;
502 dent = buf + offset;
503 size = be16_to_cpu(dent->de_rec_len);
504 if (gfs2_check_dirent(dent, offset, size, len, 0))
505 goto consist_inode;
506 } while(1);
507
508 switch(ret) {
509 case 0:
510 return NULL;
511 case 1:
512 return dent;
513 case 2:
514 return prev ? prev : dent;
515 default:
516 BUG_ON(ret > 0);
517 return ERR_PTR(ret);
518 }
519
520consist_inode:
521 gfs2_consist_inode(GFS2_I(inode));
522 return ERR_PTR(-EIO);
523}
524
525
526/**
527 * dirent_first - Return the first dirent
528 * @dip: the directory
529 * @bh: The buffer
530 * @dent: Pointer to list of dirents
531 *
532 * Return the first dirent, whether bh points to a leaf or a stuffed dinode
533 *
534 * Returns: IS_LEAF, IS_DINODE, or -errno
535 */
536
537static int dirent_first(struct gfs2_inode *dip, struct buffer_head *bh,
538 struct gfs2_dirent **dent)
539{
540 struct gfs2_meta_header *h = (struct gfs2_meta_header *)bh->b_data;
541
542 if (be32_to_cpu(h->mh_type) == GFS2_METATYPE_LF) {
543 if (gfs2_meta_check(GFS2_SB(&dip->i_inode), bh))
544 return -EIO;
545 *dent = (struct gfs2_dirent *)(bh->b_data +
546 sizeof(struct gfs2_leaf));
547 return IS_LEAF;
548 } else {
549 if (gfs2_metatype_check(GFS2_SB(&dip->i_inode), bh, GFS2_METATYPE_DI))
550 return -EIO;
551 *dent = (struct gfs2_dirent *)(bh->b_data +
552 sizeof(struct gfs2_dinode));
553 return IS_DINODE;
554 }
555}
556
557static int dirent_check_reclen(struct gfs2_inode *dip,
558 const struct gfs2_dirent *d, const void *end_p)
559{
560 const void *ptr = d;
561 u16 rec_len = be16_to_cpu(d->de_rec_len);
562
563 if (unlikely(rec_len < sizeof(struct gfs2_dirent)))
564 goto broken;
565 ptr += rec_len;
566 if (ptr < end_p)
567 return rec_len;
568 if (ptr == end_p)
569 return -ENOENT;
570broken:
571 gfs2_consist_inode(dip);
572 return -EIO;
573}
574
575/**
576 * dirent_next - Next dirent
577 * @dip: the directory
578 * @bh: The buffer
579 * @dent: Pointer to list of dirents
580 *
581 * Returns: 0 on success, error code otherwise
582 */
583
584static int dirent_next(struct gfs2_inode *dip, struct buffer_head *bh,
585 struct gfs2_dirent **dent)
586{
587 struct gfs2_dirent *cur = *dent, *tmp;
588 char *bh_end = bh->b_data + bh->b_size;
589 int ret;
590
591 ret = dirent_check_reclen(dip, cur, bh_end);
592 if (ret < 0)
593 return ret;
594
595 tmp = (void *)cur + ret;
596 ret = dirent_check_reclen(dip, tmp, bh_end);
597 if (ret == -EIO)
598 return ret;
599
600 /* Only the first dent could ever have de_inum.no_addr == 0 */
601 if (!tmp->de_inum.no_addr) {
602 gfs2_consist_inode(dip);
603 return -EIO;
604 }
605
606 *dent = tmp;
607 return 0;
608}
609
610/**
611 * dirent_del - Delete a dirent
612 * @dip: The GFS2 inode
613 * @bh: The buffer
614 * @prev: The previous dirent
615 * @cur: The current dirent
616 *
617 */
618
619static void dirent_del(struct gfs2_inode *dip, struct buffer_head *bh,
620 struct gfs2_dirent *prev, struct gfs2_dirent *cur)
621{
622 u16 cur_rec_len, prev_rec_len;
623
624 if (!cur->de_inum.no_addr) {
625 gfs2_consist_inode(dip);
626 return;
627 }
628
629 gfs2_trans_add_bh(dip->i_gl, bh, 1);
630
631 /* If there is no prev entry, this is the first entry in the block.
632 The de_rec_len is already as big as it needs to be. Just zero
633 out the inode number and return. */
634
635 if (!prev) {
636 cur->de_inum.no_addr = 0; /* No endianness worries */
637 return;
638 }
639
640 /* Combine this dirent with the previous one. */
641
642 prev_rec_len = be16_to_cpu(prev->de_rec_len);
643 cur_rec_len = be16_to_cpu(cur->de_rec_len);
644
645 if ((char *)prev + prev_rec_len != (char *)cur)
646 gfs2_consist_inode(dip);
647 if ((char *)cur + cur_rec_len > bh->b_data + bh->b_size)
648 gfs2_consist_inode(dip);
649
650 prev_rec_len += cur_rec_len;
651 prev->de_rec_len = cpu_to_be16(prev_rec_len);
652}
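
A worked example of the coalescing path above, with illustrative numbers: if prev starts at byte 0 of the block with de_rec_len = 48 and cur immediately follows at byte 48 with de_rec_len = 72, deleting cur just sets prev->de_rec_len = cpu_to_be16(48 + 72) = 120, so prev's record now spans the dead space and no bytes are moved. Had cur been first in its block, only its inode number would have been zeroed, leaving a reusable hole for gfs2_dirent_find_space() to find.
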
653
654/*
655 * Takes as an argument the dent from which to grab space. Returns the
656 * newly created dent.
657 */
658static struct gfs2_dirent *gfs2_init_dirent(struct inode *inode,
659 struct gfs2_dirent *dent,
660 const struct qstr *name,
661 struct buffer_head *bh)
662{
663 struct gfs2_inode *ip = GFS2_I(inode);
664 struct gfs2_dirent *ndent;
665 unsigned offset = 0, totlen;
666
667 if (dent->de_inum.no_addr)
668 offset = GFS2_DIRENT_SIZE(be16_to_cpu(dent->de_name_len));
669 totlen = be16_to_cpu(dent->de_rec_len);
670 BUG_ON(offset + name->len > totlen);
671 gfs2_trans_add_bh(ip->i_gl, bh, 1);
672 ndent = (struct gfs2_dirent *)((char *)dent + offset);
673 dent->de_rec_len = cpu_to_be16(offset);
674 gfs2_qstr2dirent(name, totlen - offset, ndent);
675 return ndent;
676}
677
678static struct gfs2_dirent *gfs2_dirent_alloc(struct inode *inode,
679 struct buffer_head *bh,
680 const struct qstr *name)
681{
682 struct gfs2_dirent *dent;
683 dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size,
684 gfs2_dirent_find_space, name, NULL);
685 if (!dent || IS_ERR(dent))
686 return dent;
687 return gfs2_init_dirent(inode, dent, name, bh);
688}
689
690static int get_leaf(struct gfs2_inode *dip, u64 leaf_no,
691 struct buffer_head **bhp)
692{
693 int error;
694
695 error = gfs2_meta_read(dip->i_gl, leaf_no, DIO_WAIT, bhp);
696 if (!error && gfs2_metatype_check(GFS2_SB(&dip->i_inode), *bhp, GFS2_METATYPE_LF)) {
697 /* printk(KERN_INFO "block num=%llu\n", leaf_no); */
698 error = -EIO;
699 }
700
701 return error;
702}
703
704/**
705 * get_leaf_nr - Get a leaf number associated with the index
706 * @dip: The GFS2 inode
707 * @index: hash table index of the leaf pointer to look up
708 * @leaf_out: filled in with the leaf block number
709 *
710 * Returns: 0 on success, error code otherwise
711 */
712
713static int get_leaf_nr(struct gfs2_inode *dip, u32 index,
714 u64 *leaf_out)
715{
716 u64 leaf_no;
717 int error;
718
719 error = gfs2_dir_read_data(dip, (char *)&leaf_no,
720 index * sizeof(u64),
721 sizeof(u64), 0);
722 if (error != sizeof(u64))
723 return (error < 0) ? error : -EIO;
724
725 *leaf_out = be64_to_cpu(leaf_no);
726
727 return 0;
728}
729
730static int get_first_leaf(struct gfs2_inode *dip, u32 index,
731 struct buffer_head **bh_out)
732{
733 u64 leaf_no;
734 int error;
735
736 error = get_leaf_nr(dip, index, &leaf_no);
737 if (!error)
738 error = get_leaf(dip, leaf_no, bh_out);
739
740 return error;
741}
742
743static struct gfs2_dirent *gfs2_dirent_search(struct inode *inode,
744 const struct qstr *name,
745 gfs2_dscan_t scan,
746 struct buffer_head **pbh)
747{
748 struct buffer_head *bh;
749 struct gfs2_dirent *dent;
750 struct gfs2_inode *ip = GFS2_I(inode);
751 int error;
752
753 if (ip->i_di.di_flags & GFS2_DIF_EXHASH) {
754 struct gfs2_leaf *leaf;
755 unsigned hsize = 1 << ip->i_di.di_depth;
756 unsigned index;
757 u64 ln;
758 if (hsize * sizeof(u64) != ip->i_di.di_size) {
759 gfs2_consist_inode(ip);
760 return ERR_PTR(-EIO);
761 }
762
763 index = name->hash >> (32 - ip->i_di.di_depth);
764 error = get_first_leaf(ip, index, &bh);
765 if (error)
766 return ERR_PTR(error);
767 do {
768 dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size,
769 scan, name, NULL);
770 if (dent)
771 goto got_dent;
772 leaf = (struct gfs2_leaf *)bh->b_data;
773 ln = be64_to_cpu(leaf->lf_next);
774 brelse(bh);
775 if (!ln)
776 break;
777
778 error = get_leaf(ip, ln, &bh);
779 } while(!error);
780
781 return error ? ERR_PTR(error) : NULL;
782 }
783
784
785 error = gfs2_meta_inode_buffer(ip, &bh);
786 if (error)
787 return ERR_PTR(error);
788 dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size, scan, name, NULL);
789got_dent:
790 if (unlikely(dent == NULL || IS_ERR(dent))) {
791 brelse(bh);
792 bh = NULL;
793 }
794 *pbh = bh;
795 return dent;
796}
797
798static struct gfs2_leaf *new_leaf(struct inode *inode, struct buffer_head **pbh, u16 depth)
799{
800 struct gfs2_inode *ip = GFS2_I(inode);
801 u64 bn = gfs2_alloc_meta(ip);
802 struct buffer_head *bh = gfs2_meta_new(ip->i_gl, bn);
803 struct gfs2_leaf *leaf;
804 struct gfs2_dirent *dent;
805 struct qstr name = { .name = "", .len = 0, .hash = 0 };
806 if (!bh)
807 return NULL;
808
809 gfs2_trans_add_bh(ip->i_gl, bh, 1);
810 gfs2_metatype_set(bh, GFS2_METATYPE_LF, GFS2_FORMAT_LF);
811 leaf = (struct gfs2_leaf *)bh->b_data;
812 leaf->lf_depth = cpu_to_be16(depth);
813 leaf->lf_entries = 0;
814 leaf->lf_dirent_format = cpu_to_be32(GFS2_FORMAT_DE);
815 leaf->lf_next = 0;
816 memset(leaf->lf_reserved, 0, sizeof(leaf->lf_reserved));
817 dent = (struct gfs2_dirent *)(leaf+1);
818 gfs2_qstr2dirent(&name, bh->b_size - sizeof(struct gfs2_leaf), dent);
819 *pbh = bh;
820 return leaf;
821}
822
823/**
824 * dir_make_exhash - Convert a stuffed directory into an ExHash directory
825 * @inode: The directory inode to be converted
826 *
827 * Returns: 0 on success, error code otherwise
828 */
829
830static int dir_make_exhash(struct inode *inode)
831{
832 struct gfs2_inode *dip = GFS2_I(inode);
833 struct gfs2_sbd *sdp = GFS2_SB(inode);
834 struct gfs2_dirent *dent;
835 struct qstr args;
836 struct buffer_head *bh, *dibh;
837 struct gfs2_leaf *leaf;
838 int y;
839 u32 x;
840 u64 *lp, bn;
841 int error;
842
843 error = gfs2_meta_inode_buffer(dip, &dibh);
844 if (error)
845 return error;
846
847 /* Turn over a new leaf */
848
849 leaf = new_leaf(inode, &bh, 0);
850 if (!leaf) {	/* release dibh on this error path too */
851 brelse(dibh); return -ENOSPC; }
852 bn = bh->b_blocknr;
853
854 gfs2_assert(sdp, dip->i_di.di_entries < (1 << 16));
855 leaf->lf_entries = cpu_to_be16(dip->i_di.di_entries);
856
857 /* Copy dirents */
858
859 gfs2_buffer_copy_tail(bh, sizeof(struct gfs2_leaf), dibh,
860 sizeof(struct gfs2_dinode));
861
862 /* Find last entry */
863
864 x = 0;
865 args.len = bh->b_size - sizeof(struct gfs2_dinode) +
866 sizeof(struct gfs2_leaf);
867 args.name = bh->b_data;
868 dent = gfs2_dirent_scan(&dip->i_inode, bh->b_data, bh->b_size,
869 gfs2_dirent_last, &args, NULL);
870 if (!dent) {
871 brelse(bh);
872 brelse(dibh);
873 return -EIO;
874 }
875 if (IS_ERR(dent)) {
876 brelse(bh);
877 brelse(dibh);
878 return PTR_ERR(dent);
879 }
880
881 /* Adjust the last dirent's record length
882 (Remember that dent still points to the last entry.) */
883
884 dent->de_rec_len = cpu_to_be16(be16_to_cpu(dent->de_rec_len) +
885 sizeof(struct gfs2_dinode) -
886 sizeof(struct gfs2_leaf));
887
888 brelse(bh);
889
890 /* We're done with the new leaf block, now setup the new
891 hash table. */
892
893 gfs2_trans_add_bh(dip->i_gl, dibh, 1);
894 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
895
896 lp = (u64 *)(dibh->b_data + sizeof(struct gfs2_dinode));
897
898 for (x = sdp->sd_hash_ptrs; x--; lp++)
899 *lp = cpu_to_be64(bn);
900
901 dip->i_di.di_size = sdp->sd_sb.sb_bsize / 2;
902 dip->i_di.di_blocks++;
903 dip->i_di.di_flags |= GFS2_DIF_EXHASH;
904 dip->i_di.di_payload_format = 0;
905
906 for (x = sdp->sd_hash_ptrs, y = -1; x; x >>= 1, y++) ;
907 dip->i_di.di_depth = y;
908
909 gfs2_dinode_out(&dip->i_di, dibh->b_data);
910
911 brelse(dibh);
912
913 return 0;
914}
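
To make the sizing above concrete, a worked example assuming 4096-byte blocks: sd_hash_ptrs is half a block's worth of u64 pointers, 4096 / 2 / 8 = 256, so the fresh hash table occupies di_size = 256 * 8 = sb_bsize / 2 = 2048 bytes, with every slot initially pointing at the single new leaf. The depth loop then yields di_depth = log2(256) = 8 (x halves nine times from 256 down to 0 while y climbs from -1 to 8), meaning the top 8 bits of a name hash select a table slot.
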
915
916/**
917 * dir_split_leaf - Split a leaf block into two
918 * @inode: The directory inode to work on
919 * @name: The new entry's name; its hash selects the leaf to be split
920 *
921 *
922 * Returns: 0 on success, error code on failure
923 */
924
925static int dir_split_leaf(struct inode *inode, const struct qstr *name)
926{
927 struct gfs2_inode *dip = GFS2_I(inode);
928 struct buffer_head *nbh, *obh, *dibh;
929 struct gfs2_leaf *nleaf, *oleaf;
930 struct gfs2_dirent *dent = NULL, *prev = NULL, *next = NULL, *new;
931 u32 start, len, half_len, divider;
932 u64 bn, *lp, leaf_no;
933 u32 index;
934 int x, moved = 0;
935 int error;
936
937 index = name->hash >> (32 - dip->i_di.di_depth);
938 error = get_leaf_nr(dip, index, &leaf_no);
939 if (error)
940 return error;
941
942 /* Get the old leaf block */
943 error = get_leaf(dip, leaf_no, &obh);
944 if (error)
945 return error;
946
947 oleaf = (struct gfs2_leaf *)obh->b_data;
948 if (dip->i_di.di_depth == be16_to_cpu(oleaf->lf_depth)) {
949 brelse(obh);
950 return 1; /* can't split */
951 }
952
953 gfs2_trans_add_bh(dip->i_gl, obh, 1);
954
955 nleaf = new_leaf(inode, &nbh, be16_to_cpu(oleaf->lf_depth) + 1);
956 if (!nleaf) {
957 brelse(obh);
958 return -ENOSPC;
959 }
960 bn = nbh->b_blocknr;
961
962 /* Compute the start and len of leaf pointers in the hash table. */
963 len = 1 << (dip->i_di.di_depth - be16_to_cpu(oleaf->lf_depth));
964 half_len = len >> 1;
965 if (!half_len) {
966 printk(KERN_WARNING "di_depth %u lf_depth %u index %u\n", dip->i_di.di_depth, be16_to_cpu(oleaf->lf_depth), index);
967 gfs2_consist_inode(dip);
968 error = -EIO;
969 goto fail_brelse;
970 }
971
972 start = (index & ~(len - 1));
973
974 /* Change the pointers.
975 Don't bother distinguishing stuffed from non-stuffed.
976 This code is complicated enough already. */
977 lp = kmalloc(half_len * sizeof(u64), GFP_NOFS | __GFP_NOFAIL);
978
979 for (x = 0; x < half_len; x++)
980 lp[x] = cpu_to_be64(bn);
981
982 error = gfs2_dir_write_data(dip, (char *)lp, start * sizeof(u64),
983 half_len * sizeof(u64));
984 if (error != half_len * sizeof(u64)) {
985 if (error >= 0)
986 error = -EIO;
987 goto fail_lpfree;
988 }
989
990 kfree(lp);
991
992 /* Compute the divider */
993 divider = (start + half_len) << (32 - dip->i_di.di_depth);
994
995 /* Copy the entries */
996 dirent_first(dip, obh, &dent);
997
998 do {
999 next = dent;
1000 if (dirent_next(dip, obh, &next))
1001 next = NULL;
1002
1003 if (dent->de_inum.no_addr &&
1004 be32_to_cpu(dent->de_hash) < divider) {
1005 struct qstr str;
1006 str.name = (char*)(dent+1);
1007 str.len = be16_to_cpu(dent->de_name_len);
1008 str.hash = be32_to_cpu(dent->de_hash);
1009 new = gfs2_dirent_alloc(inode, nbh, &str);
1010 if (IS_ERR(new)) {
1011 error = PTR_ERR(new);
1012 break;
1013 }
1014
1015 new->de_inum = dent->de_inum; /* No endian worries */
1016 new->de_type = dent->de_type; /* No endian worries */
1017 nleaf->lf_entries = cpu_to_be16(be16_to_cpu(nleaf->lf_entries)+1);
1018
1019 dirent_del(dip, obh, prev, dent);
1020
1021 if (!oleaf->lf_entries)
1022 gfs2_consist_inode(dip);
1023 oleaf->lf_entries = cpu_to_be16(be16_to_cpu(oleaf->lf_entries)-1);
1024
1025 if (!prev)
1026 prev = dent;
1027
1028 moved = 1;
1029 } else {
1030 prev = dent;
1031 }
1032 dent = next;
1033 } while (dent);
1034
1035 oleaf->lf_depth = nleaf->lf_depth;
1036
1037 error = gfs2_meta_inode_buffer(dip, &dibh);
1038 if (!gfs2_assert_withdraw(GFS2_SB(&dip->i_inode), !error)) {
1039 dip->i_di.di_blocks++;
1040 gfs2_dinode_out(&dip->i_di, dibh->b_data);
1041 brelse(dibh);
1042 }
1043
1044 brelse(obh);
1045 brelse(nbh);
1046
1047 return error;
1048
1049fail_lpfree:
1050 kfree(lp);
1051
1052fail_brelse:
1053 brelse(obh);
1054 brelse(nbh);
1055 return error;
1056}
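
A worked example of the split arithmetic, with illustrative numbers: suppose di_depth = 8 and the old leaf sits at lf_depth = 6, so len = 1 << (8 - 6) = 4 hash-table slots point at it and half_len = 2. For index = 13 the run begins at start = 13 & ~3 = 12; slots 12 and 13 are rewritten to point at the new leaf while 14 and 15 keep the old one. The divider, (12 + 2) << (32 - 8) = 14 << 24, then decides migration: entries whose full 32-bit hash falls below it belong to slots 12 and 13 and move to the new leaf; the rest stay put.
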
1057
1058/**
1059 * dir_double_exhash - Double size of ExHash table
1060 * @dip: The GFS2 dinode
1061 *
1062 * Returns: 0 on success, error code on failure
1063 */
1064
1065static int dir_double_exhash(struct gfs2_inode *dip)
1066{
1067 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
1068 struct buffer_head *dibh;
1069 u32 hsize;
1070 u64 *buf;
1071 u64 *from, *to;
1072 u64 block;
1073 int x;
1074 int error = 0;
1075
1076 hsize = 1 << dip->i_di.di_depth;
1077 if (hsize * sizeof(u64) != dip->i_di.di_size) {
1078 gfs2_consist_inode(dip);
1079 return -EIO;
1080 }
1081
1082 /* Allocate both the "from" and "to" buffers in one big chunk */
1083
1084 buf = kcalloc(3, sdp->sd_hash_bsize, GFP_KERNEL | __GFP_NOFAIL);
1085
1086 for (block = dip->i_di.di_size >> sdp->sd_hash_bsize_shift; block--;) {
1087 error = gfs2_dir_read_data(dip, (char *)buf,
1088 block * sdp->sd_hash_bsize,
1089 sdp->sd_hash_bsize, 1);
1090 if (error != sdp->sd_hash_bsize) {
1091 if (error >= 0)
1092 error = -EIO;
1093 goto fail;
1094 }
1095
1096 from = buf;
1097 to = (u64 *)((char *)buf + sdp->sd_hash_bsize);
1098
1099 for (x = sdp->sd_hash_ptrs; x--; from++) {
1100 *to++ = *from; /* No endianness worries */
1101 *to++ = *from;
1102 }
1103
1104 error = gfs2_dir_write_data(dip,
1105 (char *)buf + sdp->sd_hash_bsize,
1106 block * sdp->sd_sb.sb_bsize,
1107 sdp->sd_sb.sb_bsize);
1108 if (error != sdp->sd_sb.sb_bsize) {
1109 if (error >= 0)
1110 error = -EIO;
1111 goto fail;
1112 }
1113 }
1114
1115 kfree(buf);
1116
1117 error = gfs2_meta_inode_buffer(dip, &dibh);
1118 if (!gfs2_assert_withdraw(sdp, !error)) {
1119 dip->i_di.di_depth++;
1120 gfs2_dinode_out(&dip->i_di, dibh->b_data);
1121 brelse(dibh);
1122 }
1123
1124 return error;
1125
1126fail:
1127 kfree(buf);
1128 return error;
1129}
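
The inner copy loop above writes each old pointer out twice, which is exactly what one extra depth bit requires: a table [A, B] at depth 1 becomes [A, A, B, B] at depth 2, so a hash whose top bit selected A now selects one of the two A slots with its top two bits. A minimal sketch of that step:

#include <stdint.h>

/* Double a table of n pointers from src into dst (sized 2 * n). */
static void ex_double_table(const uint64_t *src, uint64_t *dst, unsigned n)
{
	unsigned i;
	for (i = 0; i < n; i++) {
		dst[2 * i] = src[i];		/* each old slot... */
		dst[2 * i + 1] = src[i];	/* ...now covers two new ones */
	}
}
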
1130
1131/**
1132 * compare_dents - compare directory entries by hash value
1133 * @a: first dent
1134 * @b: second dent
1135 *
1136 * When comparing the hash entries of @a to @b:
1137 * gt: returns 1
1138 * lt: returns -1
1139 * eq: returns 0
1140 */
1141
1142static int compare_dents(const void *a, const void *b)
1143{
1144 const struct gfs2_dirent *dent_a, *dent_b;
1145 u32 hash_a, hash_b;
1146 int ret = 0;
1147
1148 dent_a = *(const struct gfs2_dirent **)a;
1149 hash_a = be32_to_cpu(dent_a->de_hash);
1150
1151 dent_b = *(const struct gfs2_dirent **)b;
1152 hash_b = be32_to_cpu(dent_b->de_hash);
1153
1154 if (hash_a > hash_b)
1155 ret = 1;
1156 else if (hash_a < hash_b)
1157 ret = -1;
1158 else {
1159 unsigned int len_a = be16_to_cpu(dent_a->de_name_len);
1160 unsigned int len_b = be16_to_cpu(dent_b->de_name_len);
1161
1162 if (len_a > len_b)
1163 ret = 1;
1164 else if (len_a < len_b)
1165 ret = -1;
1166 else
1167 ret = memcmp(dent_a + 1, dent_b + 1, len_a);
1168 }
1169
1170 return ret;
1171}
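
compare_dents() is shaped for the kernel's sort() over an array of dirent pointers; the same ordering (hash, then name length, then the name bytes) in a hedged userspace form, using a simplified entry type and qsort(3):

#include <stdlib.h>
#include <string.h>
#include <stdint.h>

struct ex_ent { uint32_t hash; uint16_t name_len; const char *name; };

static int ex_cmp(const void *a, const void *b)
{
	const struct ex_ent *ea = *(const struct ex_ent * const *)a;
	const struct ex_ent *eb = *(const struct ex_ent * const *)b;

	if (ea->hash != eb->hash)
		return ea->hash > eb->hash ? 1 : -1;
	if (ea->name_len != eb->name_len)
		return ea->name_len > eb->name_len ? 1 : -1;
	return memcmp(ea->name, eb->name, ea->name_len);
}

/* usage: qsort(ptrs, nr, sizeof(struct ex_ent *), ex_cmp); */
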
1172
1173/**
1174 * do_filldir_main - read out directory entries
1175 * @dip: The GFS2 inode
1176 * @offset: The offset in the file to read from
1177 * @opaque: opaque data to pass to filldir
1178 * @filldir: The function to pass entries to
1179 * @darr: an array of struct gfs2_dirent pointers to read
1180 * @entries: the number of entries in darr
1181 * @copied: pointer to int that's non-zero if an entry has been copied out
1182 *
1183 * Jump through some hoops to make sure that if there are hash collisions,
1184 * they are read out at the beginning of a buffer. We want to minimize
1185 * the possibility that they will fall into different readdir buffers or
1186 * that someone will want to seek to that location.
1187 *
1188 * Returns: errno, >0 on exception from filldir
1189 */
1190
1191static int do_filldir_main(struct gfs2_inode *dip, u64 *offset,
1192 void *opaque, gfs2_filldir_t filldir,
1193 const struct gfs2_dirent **darr, u32 entries,
1194 int *copied)
1195{
1196 const struct gfs2_dirent *dent, *dent_next;
1197 struct gfs2_inum inum;
1198 u64 off, off_next;
1199 unsigned int x, y;
1200 int run = 0;
1201 int error = 0;
1202
1203 sort(darr, entries, sizeof(struct gfs2_dirent *), compare_dents, NULL);
1204
1205 dent_next = darr[0];
1206 off_next = be32_to_cpu(dent_next->de_hash);
1207 off_next = gfs2_disk_hash2offset(off_next);
1208
1209 for (x = 0, y = 1; x < entries; x++, y++) {
1210 dent = dent_next;
1211 off = off_next;
1212
1213 if (y < entries) {
1214 dent_next = darr[y];
1215 off_next = be32_to_cpu(dent_next->de_hash);
1216 off_next = gfs2_disk_hash2offset(off_next);
1217
1218 if (off < *offset)
1219 continue;
1220 *offset = off;
1221
1222 if (off_next == off) {
1223 if (*copied && !run)
1224 return 1;
1225 run = 1;
1226 } else
1227 run = 0;
1228 } else {
1229 if (off < *offset)
1230 continue;
1231 *offset = off;
1232 }
1233
1234 gfs2_inum_in(&inum, (char *)&dent->de_inum);
1235
1236 error = filldir(opaque, (const char *)(dent + 1),
1237 be16_to_cpu(dent->de_name_len),
1238 off, &inum,
1239 be16_to_cpu(dent->de_type));
1240 if (error)
1241 return 1;
1242
1243 *copied = 1;
1244 }
1245
1246 /* Increment *offset by one, so that the next time we come into this
1247 function we get the next entry instead of the last one in the
1248 current leaf */
1249
1250 (*offset)++;
1251
1252 return 0;
1253}
1254
1255static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque,
1256 gfs2_filldir_t filldir, int *copied,
1257 unsigned *depth, u64 leaf_no)
1258{
1259 struct gfs2_inode *ip = GFS2_I(inode);
1260 struct buffer_head *bh;
1261 struct gfs2_leaf *lf;
1262 unsigned entries = 0;
1263 unsigned leaves = 0;
1264 const struct gfs2_dirent **darr, *dent;
1265 struct dirent_gather g;
1266 struct buffer_head **larr;
1267 int leaf = 0;
1268 int error, i;
1269 u64 lfn = leaf_no;
1270
1271 do {
1272 error = get_leaf(ip, lfn, &bh);
1273 if (error)
1274 goto out;
1275 lf = (struct gfs2_leaf *)bh->b_data;
1276 if (leaves == 0)
1277 *depth = be16_to_cpu(lf->lf_depth);
1278 entries += be16_to_cpu(lf->lf_entries);
1279 leaves++;
1280 lfn = be64_to_cpu(lf->lf_next);
1281 brelse(bh);
1282 } while(lfn);
1283
1284 if (!entries)
1285 return 0;
1286
1287 error = -ENOMEM;
1288 larr = vmalloc((leaves + entries) * sizeof(void *));
1289 if (!larr)
1290 goto out;
1291 darr = (const struct gfs2_dirent **)(larr + leaves);
1292 g.pdent = darr;
1293 g.offset = 0;
1294 lfn = leaf_no;
1295
1296 do {
1297 error = get_leaf(ip, lfn, &bh);
1298 if (error)
1299 goto out_kfree;
1300 lf = (struct gfs2_leaf *)bh->b_data;
1301 lfn = be64_to_cpu(lf->lf_next);
1302 if (lf->lf_entries) {
1303 dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size,
1304 gfs2_dirent_gather, NULL, &g);
1305 error = PTR_ERR(dent);
1306 if (IS_ERR(dent)) {
1307 goto out_kfree;
1308 }
1309 error = 0;
1310 larr[leaf++] = bh;
1311 } else {
1312 brelse(bh);
1313 }
1314 } while(lfn);
1315
1316 error = do_filldir_main(ip, offset, opaque, filldir, darr,
1317 entries, copied);
1318out_kfree:
1319 for(i = 0; i < leaf; i++)
1320 brelse(larr[i]);
1321 vfree(larr);
1322out:
1323 return error;
1324}
1325
1326/**
1327 * dir_e_read - Reads the entries from a directory into a filldir buffer
1328 * @inode: the directory inode
1329 * @offset: the hash of the last entry read shifted to the right once
1330 * @opaque: opaque data to pass through to the filldir function
1331 * @filldir: the filldir function to use
1332 *
1333 * Returns: errno
1334 */
1335
1336static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
1337 gfs2_filldir_t filldir)
1338{
1339 struct gfs2_inode *dip = GFS2_I(inode);
1340 struct gfs2_sbd *sdp = GFS2_SB(inode);
1341 u32 hsize, len = 0;
1342 u32 ht_offset, lp_offset, ht_offset_cur = -1;
1343 u32 hash, index;
1344 u64 *lp;
1345 int copied = 0;
1346 int error = 0;
1347 unsigned depth = 0;
1348
1349 hsize = 1 << dip->i_di.di_depth;
1350 if (hsize * sizeof(u64) != dip->i_di.di_size) {
1351 gfs2_consist_inode(dip);
1352 return -EIO;
1353 }
1354
1355 hash = gfs2_dir_offset2hash(*offset);
1356 index = hash >> (32 - dip->i_di.di_depth);
1357
1358 lp = kmalloc(sdp->sd_hash_bsize, GFP_KERNEL);
1359 if (!lp)
1360 return -ENOMEM;
1361
1362 while (index < hsize) {
1363 lp_offset = index & (sdp->sd_hash_ptrs - 1);
1364 ht_offset = index - lp_offset;
1365
1366 if (ht_offset_cur != ht_offset) {
1367 error = gfs2_dir_read_data(dip, (char *)lp,
1368 ht_offset * sizeof(u64),
1369 sdp->sd_hash_bsize, 1);
1370 if (error != sdp->sd_hash_bsize) {
1371 if (error >= 0)
1372 error = -EIO;
1373 goto out;
1374 }
1375 ht_offset_cur = ht_offset;
1376 }
1377
1378 error = gfs2_dir_read_leaf(inode, offset, opaque, filldir,
1379 &copied, &depth,
1380 be64_to_cpu(lp[lp_offset]));
1381 if (error)
1382 break;
1383
1384 len = 1 << (dip->i_di.di_depth - depth);
1385 index = (index & ~(len - 1)) + len;
1386 }
1387
1388out:
1389 kfree(lp);
1390 if (error > 0)
1391 error = 0;
1392 return error;
1393}
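
dir_e_read() depends on the readdir offset being the 32-bit entry hash shifted right once (see the @offset note above), with the inverse used when resuming a scan. A sketch of the assumed shape of the two helpers:

#include <stdint.h>

/* assumed shape of gfs2_disk_hash2offset()/gfs2_dir_offset2hash() */
static uint64_t ex_hash2offset(uint32_t hash) { return (uint64_t)hash >> 1; }
static uint32_t ex_offset2hash(uint64_t off)  { return (uint32_t)(off << 1); }
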
1394
1395int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
1396 gfs2_filldir_t filldir)
1397{
1398 struct gfs2_inode *dip = GFS2_I(inode);
1399 struct dirent_gather g;
1400 const struct gfs2_dirent **darr, *dent;
1401 struct buffer_head *dibh;
1402 int copied = 0;
1403 int error;
1404
1405 if (!dip->i_di.di_entries)
1406 return 0;
1407
1408 if (dip->i_di.di_flags & GFS2_DIF_EXHASH)
1409 return dir_e_read(inode, offset, opaque, filldir);
1410
1411 if (!gfs2_is_stuffed(dip)) {
1412 gfs2_consist_inode(dip);
1413 return -EIO;
1414 }
1415
1416 error = gfs2_meta_inode_buffer(dip, &dibh);
1417 if (error)
1418 return error;
1419
1420 error = -ENOMEM;
1421 darr = kmalloc(dip->i_di.di_entries * sizeof(struct gfs2_dirent *),
1422 GFP_KERNEL);
1423 if (darr) {
1424 g.pdent = darr;
1425 g.offset = 0;
1426 dent = gfs2_dirent_scan(inode, dibh->b_data, dibh->b_size,
1427 gfs2_dirent_gather, NULL, &g);
1428 if (IS_ERR(dent)) {
1429 error = PTR_ERR(dent);
1430 goto out;
1431 }
1432 error = do_filldir_main(dip, offset, opaque, filldir, darr,
1433 dip->i_di.di_entries, &copied);
1434out:
1435 kfree(darr);
1436 }
1437
1438 if (error > 0)
1439 error = 0;
1440
1441 brelse(dibh);
1442
1443 return error;
1444}
1445
1446/**
1447 * gfs2_dir_search - Search a directory
1448 * @dir: The directory inode to search
1449 * @name: The name to look up
1450 * @inum: (and @type) filled in if the entry is found
1451 *
1452 * This routine searches a directory for a file or another directory.
1453 * Assumes a glock is held on @dir.
1454 *
1455 * Returns: errno
1456 */
1457
1458int gfs2_dir_search(struct inode *dir, const struct qstr *name,
1459 struct gfs2_inum *inum, unsigned int *type)
1460{
1461 struct buffer_head *bh;
1462 struct gfs2_dirent *dent;
1463
1464 dent = gfs2_dirent_search(dir, name, gfs2_dirent_find, &bh);
1465 if (dent) {
1466 if (IS_ERR(dent))
1467 return PTR_ERR(dent);
1468 if (inum)
1469 gfs2_inum_in(inum, (char *)&dent->de_inum);
1470 if (type)
1471 *type = be16_to_cpu(dent->de_type);
1472 brelse(bh);
1473 return 0;
1474 }
1475 return -ENOENT;
1476}
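
A hedged usage sketch for gfs2_dir_search(), pairing it with the gfs2_str2qstr() helper from dir.h below; the caller is assumed to hold the directory glock, as the comment above requires:

static int ex_lookup(struct inode *dir, const char *fname,
		     struct gfs2_inum *inum)
{
	struct qstr q;
	unsigned int type;

	gfs2_str2qstr(&q, fname);	/* fills in name, len and hash */
	return gfs2_dir_search(dir, &q, inum, &type); /* 0 or -ENOENT */
}
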
1477
1478static int dir_new_leaf(struct inode *inode, const struct qstr *name)
1479{
1480 struct buffer_head *bh, *obh;
1481 struct gfs2_inode *ip = GFS2_I(inode);
1482 struct gfs2_leaf *leaf, *oleaf;
1483 int error;
1484 u32 index;
1485 u64 bn;
1486
1487 index = name->hash >> (32 - ip->i_di.di_depth);
1488 error = get_first_leaf(ip, index, &obh);
1489 if (error)
1490 return error;
1491 do {
1492 oleaf = (struct gfs2_leaf *)obh->b_data;
1493 bn = be64_to_cpu(oleaf->lf_next);
1494 if (!bn)
1495 break;
1496 brelse(obh);
1497 error = get_leaf(ip, bn, &obh);
1498 if (error)
1499 return error;
1500 } while(1);
1501
1502 gfs2_trans_add_bh(ip->i_gl, obh, 1);
1503
1504 leaf = new_leaf(inode, &bh, be16_to_cpu(oleaf->lf_depth));
1505 if (!leaf) {
1506 brelse(obh);
1507 return -ENOSPC;
1508 }
1509 oleaf->lf_next = cpu_to_be64(bh->b_blocknr);
1510 brelse(bh);
1511 brelse(obh);
1512
1513 error = gfs2_meta_inode_buffer(ip, &bh);
1514 if (error)
1515 return error;
1516 gfs2_trans_add_bh(ip->i_gl, bh, 1);
1517 ip->i_di.di_blocks++;
1518 gfs2_dinode_out(&ip->i_di, bh->b_data);
1519 brelse(bh);
1520 return 0;
1521}
1522
1523/**
1524 * gfs2_dir_add - Add new filename into directory
1525 * @inode: The directory inode
1526 * @name: The new name
1527 * @inum: The inode number of the new entry
1528 * @type: The type of the new entry
1529 *
1530 * Returns: 0 on success, error code on failure
1531 */
1532
1533int gfs2_dir_add(struct inode *inode, const struct qstr *name,
1534 const struct gfs2_inum *inum, unsigned type)
1535{
1536 struct gfs2_inode *ip = GFS2_I(inode);
1537 struct buffer_head *bh;
1538 struct gfs2_dirent *dent;
1539 struct gfs2_leaf *leaf;
1540 int error;
1541
1542 while(1) {
1543 dent = gfs2_dirent_search(inode, name, gfs2_dirent_find_space,
1544 &bh);
1545 if (dent) {
1546 if (IS_ERR(dent))
1547 return PTR_ERR(dent);
1548 dent = gfs2_init_dirent(inode, dent, name, bh);
1549 gfs2_inum_out(inum, (char *)&dent->de_inum);
1550 dent->de_type = cpu_to_be16(type);
1551 if (ip->i_di.di_flags & GFS2_DIF_EXHASH) {
1552 leaf = (struct gfs2_leaf *)bh->b_data;
1553 leaf->lf_entries = cpu_to_be16(be16_to_cpu(leaf->lf_entries) + 1);
1554 }
1555 brelse(bh);
1556 error = gfs2_meta_inode_buffer(ip, &bh);
1557 if (error)
1558 break;
1559 gfs2_trans_add_bh(ip->i_gl, bh, 1);
1560 ip->i_di.di_entries++;
1561 ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
1562 gfs2_dinode_out(&ip->i_di, bh->b_data);
1563 brelse(bh);
1564 error = 0;
1565 break;
1566 }
1567 if (!(ip->i_di.di_flags & GFS2_DIF_EXHASH)) {
1568 error = dir_make_exhash(inode);
1569 if (error)
1570 break;
1571 continue;
1572 }
1573 error = dir_split_leaf(inode, name);
1574 if (error == 0)
1575 continue;
1576 if (error < 0)
1577 break;
1578 if (ip->i_di.di_depth < GFS2_DIR_MAX_DEPTH) {
1579 error = dir_double_exhash(ip);
1580 if (error)
1581 break;
1582 error = dir_split_leaf(inode, name);
1583 if (error < 0)
1584 break;
1585 if (error == 0)
1586 continue;
1587 }
1588 error = dir_new_leaf(inode, name);
1589 if (!error)
1590 continue;
1591 error = -ENOSPC;
1592 break;
1593 }
1594 return error;
1595}
1596
1597
1598/**
1599 * gfs2_dir_del - Delete a directory entry
1600 * @dip: The GFS2 directory inode
1601 * @name: The name of the entry to delete
1602 *
1603 * Returns: 0 on success, error code on failure
1604 */
1605
1606int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *name)
1607{
1608 struct gfs2_dirent *dent, *prev = NULL;
1609 struct buffer_head *bh;
1610 int error;
1611
1612 /* Returns _either_ the entry (if it's first in the block) or the
1613 previous entry otherwise */
1614 dent = gfs2_dirent_search(&dip->i_inode, name, gfs2_dirent_prev, &bh);
1615 if (!dent) {
1616 gfs2_consist_inode(dip);
1617 return -EIO;
1618 }
1619 if (IS_ERR(dent)) {
1620 gfs2_consist_inode(dip);
1621 return PTR_ERR(dent);
1622 }
1623 /* If not first in block, adjust pointers accordingly */
1624 if (gfs2_dirent_find(dent, name, NULL) == 0) {
1625 prev = dent;
1626 dent = (struct gfs2_dirent *)((char *)dent + be16_to_cpu(prev->de_rec_len));
1627 }
1628
1629 dirent_del(dip, bh, prev, dent);
1630 if (dip->i_di.di_flags & GFS2_DIF_EXHASH) {
1631 struct gfs2_leaf *leaf = (struct gfs2_leaf *)bh->b_data;
1632 u16 entries = be16_to_cpu(leaf->lf_entries);
1633 if (!entries)
1634 gfs2_consist_inode(dip);
1635 leaf->lf_entries = cpu_to_be16(--entries);
1636 }
1637 brelse(bh);
1638
1639 error = gfs2_meta_inode_buffer(dip, &bh);
1640 if (error)
1641 return error;
1642
1643 if (!dip->i_di.di_entries)
1644 gfs2_consist_inode(dip);
1645 gfs2_trans_add_bh(dip->i_gl, bh, 1);
1646 dip->i_di.di_entries--;
1647 dip->i_di.di_mtime = dip->i_di.di_ctime = get_seconds();
1648 gfs2_dinode_out(&dip->i_di, bh->b_data);
1649 brelse(bh);
1650 mark_inode_dirty(&dip->i_inode);
1651
1652 return error;
1653}
1654
1655/**
1656 * gfs2_dir_mvino - Change inode number of directory entry
1657 * @dip: The GFS2 directory inode
1658 * @filename: The name of the entry to change
1659 * @inum: The new inode number for the entry
1660 *
1661 * This routine changes the inode number of a directory entry. It's used
1662 * by rename to change ".." when a directory is moved.
1663 * Assumes a glock is held on @dip.
1664 *
1665 * Returns: errno
1666 */
1667
1668int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
1669 struct gfs2_inum *inum, unsigned int new_type)
1670{
1671 struct buffer_head *bh;
1672 struct gfs2_dirent *dent;
1673 int error;
1674
1675 dent = gfs2_dirent_search(&dip->i_inode, filename, gfs2_dirent_find, &bh);
1676 if (!dent) {
1677 gfs2_consist_inode(dip);
1678 return -EIO;
1679 }
1680 if (IS_ERR(dent))
1681 return PTR_ERR(dent);
1682
1683 gfs2_trans_add_bh(dip->i_gl, bh, 1);
1684 gfs2_inum_out(inum, (char *)&dent->de_inum);
1685 dent->de_type = cpu_to_be16(new_type);
1686
1687 if (dip->i_di.di_flags & GFS2_DIF_EXHASH) {
1688 brelse(bh);
1689 error = gfs2_meta_inode_buffer(dip, &bh);
1690 if (error)
1691 return error;
1692 gfs2_trans_add_bh(dip->i_gl, bh, 1);
1693 }
1694
1695 dip->i_di.di_mtime = dip->i_di.di_ctime = get_seconds();
1696 gfs2_dinode_out(&dip->i_di, bh->b_data);
1697 brelse(bh);
1698 return 0;
1699}
1700
1701/**
1702 * foreach_leaf - call a function for each leaf in a directory
1703 * @dip: the directory
1704 * @lc: the function to call for each leaf
1705 * @data: private data to pass to it
1706 *
1707 * Returns: errno
1708 */
1709
1710static int foreach_leaf(struct gfs2_inode *dip, leaf_call_t lc, void *data)
1711{
1712 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
1713 struct buffer_head *bh;
1714 struct gfs2_leaf *leaf;
1715 u32 hsize, len;
1716 u32 ht_offset, lp_offset, ht_offset_cur = -1;
1717 u32 index = 0;
1718 u64 *lp;
1719 u64 leaf_no;
1720 int error = 0;
1721
1722 hsize = 1 << dip->i_di.di_depth;
1723 if (hsize * sizeof(u64) != dip->i_di.di_size) {
1724 gfs2_consist_inode(dip);
1725 return -EIO;
1726 }
1727
1728 lp = kmalloc(sdp->sd_hash_bsize, GFP_KERNEL);
1729 if (!lp)
1730 return -ENOMEM;
1731
1732 while (index < hsize) {
1733 lp_offset = index & (sdp->sd_hash_ptrs - 1);
1734 ht_offset = index - lp_offset;
1735
1736 if (ht_offset_cur != ht_offset) {
1737 error = gfs2_dir_read_data(dip, (char *)lp,
1738 ht_offset * sizeof(u64),
1739 sdp->sd_hash_bsize, 1);
1740 if (error != sdp->sd_hash_bsize) {
1741 if (error >= 0)
1742 error = -EIO;
1743 goto out;
1744 }
1745 ht_offset_cur = ht_offset;
1746 }
1747
1748 leaf_no = be64_to_cpu(lp[lp_offset]);
1749 if (leaf_no) {
1750 error = get_leaf(dip, leaf_no, &bh);
1751 if (error)
1752 goto out;
1753 leaf = (struct gfs2_leaf *)bh->b_data;
1754 len = 1 << (dip->i_di.di_depth - be16_to_cpu(leaf->lf_depth));
1755 brelse(bh);
1756
1757 error = lc(dip, index, len, leaf_no, data);
1758 if (error)
1759 goto out;
1760
1761 index = (index & ~(len - 1)) + len;
1762 } else
1763 index++;
1764 }
1765
1766 if (index != hsize) {
1767 gfs2_consist_inode(dip);
1768 error = -EIO;
1769 }
1770
1771out:
1772 kfree(lp);
1773
1774 return error;
1775}
1776
1777/**
1778 * leaf_dealloc - Deallocate a directory leaf
1779 * @dip: the directory
1780 * @index: the hash table offset in the directory
1781 * @len: the number of pointers to this leaf
1782 * @leaf_no: the leaf number
1783 * @data: not used
1784 *
1785 * Returns: errno
1786 */
1787
1788static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len,
1789 u64 leaf_no, void *data)
1790{
1791 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
1792 struct gfs2_leaf *tmp_leaf;
1793 struct gfs2_rgrp_list rlist;
1794 struct buffer_head *bh, *dibh;
1795 u64 blk, nblk;
1796 unsigned int rg_blocks = 0, l_blocks = 0;
1797 char *ht;
1798 unsigned int x, size = len * sizeof(u64);
1799 int error;
1800
1801 memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));
1802
1803 ht = kzalloc(size, GFP_KERNEL);
1804 if (!ht)
1805 return -ENOMEM;
1806
1807 gfs2_alloc_get(dip);
1808
1809 error = gfs2_quota_hold(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
1810 if (error)
1811 goto out;
1812
1813 error = gfs2_rindex_hold(sdp, &dip->i_alloc.al_ri_gh);
1814 if (error)
1815 goto out_qs;
1816
1817 /* Count the number of leaves */
1818
1819 for (blk = leaf_no; blk; blk = nblk) {
1820 error = get_leaf(dip, blk, &bh);
1821 if (error)
1822 goto out_rlist;
1823 tmp_leaf = (struct gfs2_leaf *)bh->b_data;
1824 nblk = be64_to_cpu(tmp_leaf->lf_next);
1825 brelse(bh);
1826
1827 gfs2_rlist_add(sdp, &rlist, blk);
1828 l_blocks++;
1829 }
1830
1831 gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE, 0);
1832
1833 for (x = 0; x < rlist.rl_rgrps; x++) {
1834 struct gfs2_rgrpd *rgd;
1835 rgd = rlist.rl_ghs[x].gh_gl->gl_object;
1836 rg_blocks += rgd->rd_ri.ri_length;
1837 }
1838
1839 error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs);
1840 if (error)
1841 goto out_rlist;
1842
1843 error = gfs2_trans_begin(sdp,
1844 rg_blocks + (DIV_ROUND_UP(size, sdp->sd_jbsize) + 1) +
1845 RES_DINODE + RES_STATFS + RES_QUOTA, l_blocks);
1846 if (error)
1847 goto out_rg_gunlock;
1848
1849 for (blk = leaf_no; blk; blk = nblk) {
1850 error = get_leaf(dip, blk, &bh);
1851 if (error)
1852 goto out_end_trans;
1853 tmp_leaf = (struct gfs2_leaf *)bh->b_data;
1854 nblk = be64_to_cpu(tmp_leaf->lf_next);
1855 brelse(bh);
1856
1857 gfs2_free_meta(dip, blk, 1);
1858
1859 if (!dip->i_di.di_blocks)
1860 gfs2_consist_inode(dip);
1861 dip->i_di.di_blocks--;
1862 }
1863
1864 error = gfs2_dir_write_data(dip, ht, index * sizeof(u64), size);
1865 if (error != size) {
1866 if (error >= 0)
1867 error = -EIO;
1868 goto out_end_trans;
1869 }
1870
1871 error = gfs2_meta_inode_buffer(dip, &dibh);
1872 if (error)
1873 goto out_end_trans;
1874
1875 gfs2_trans_add_bh(dip->i_gl, dibh, 1);
1876 gfs2_dinode_out(&dip->i_di, dibh->b_data);
1877 brelse(dibh);
1878
1879out_end_trans:
1880 gfs2_trans_end(sdp);
1881out_rg_gunlock:
1882 gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs);
1883out_rlist:
1884 gfs2_rlist_free(&rlist);
1885 gfs2_glock_dq_uninit(&dip->i_alloc.al_ri_gh);
1886out_qs:
1887 gfs2_quota_unhold(dip);
1888out:
1889 gfs2_alloc_put(dip);
1890 kfree(ht);
1891 return error;
1892}
1893
1894/**
1895 * gfs2_dir_exhash_dealloc - free all the leaf blocks in a directory
1896 * @dip: the directory
1897 *
1898 * Dealloc all on-disk directory leaves to FREEMETA state
1899 * Change on-disk inode type to "regular file"
1900 *
1901 * Returns: errno
1902 */
1903
1904int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip)
1905{
1906 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
1907 struct buffer_head *bh;
1908 int error;
1909
1910 /* Dealloc on-disk leaves to FREEMETA state */
1911 error = foreach_leaf(dip, leaf_dealloc, NULL);
1912 if (error)
1913 return error;
1914
1915 /* Make this a regular file in case we crash.
1916 (We don't want to free these blocks a second time.) */
1917
1918 error = gfs2_trans_begin(sdp, RES_DINODE, 0);
1919 if (error)
1920 return error;
1921
1922 error = gfs2_meta_inode_buffer(dip, &bh);
1923 if (!error) {
1924 gfs2_trans_add_bh(dip->i_gl, bh, 1);
1925 ((struct gfs2_dinode *)bh->b_data)->di_mode =
1926 cpu_to_be32(S_IFREG);
1927 brelse(bh);
1928 }
1929
1930 gfs2_trans_end(sdp);
1931
1932 return error;
1933}
1934
1935/**
1936 * gfs2_diradd_alloc_required - find if adding entry will require an allocation
1937 * @inode: the directory being added to
1938 * @name: the filename that's going to be added
1939 *
1940 * Returns: 1 if alloc required, 0 if not, -ve on error
1941 */
1942
1943int gfs2_diradd_alloc_required(struct inode *inode, const struct qstr *name)
1944{
1945 struct gfs2_dirent *dent;
1946 struct buffer_head *bh;
1947
1948 dent = gfs2_dirent_search(inode, name, gfs2_dirent_find_space, &bh);
1949 if (!dent) {
1950 return 1;
1951 }
1952 if (IS_ERR(dent))
1953 return PTR_ERR(dent);
1954 brelse(bh);
1955 return 0;
1956}
1957
diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h
new file mode 100644
index 000000000000..371233419b07
--- /dev/null
+++ b/fs/gfs2/dir.h
@@ -0,0 +1,79 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __DIR_DOT_H__
11#define __DIR_DOT_H__
12
13#include <linux/dcache.h>
14
15struct inode;
16struct gfs2_inode;
17struct gfs2_inum;
18
19/**
20 * gfs2_filldir_t - Report a directory entry to the caller of gfs2_dir_read()
21 * @opaque: opaque data used by the function
22 * @name: the name of the directory entry
23 * @length: the length of the name
24 * @offset: the entry's offset in the directory
25 * @inum: the inode number the entry points to
26 * @type: the type of inode the entry points to
27 *
28 * Returns: 0 on success, 1 if buffer full
29 */
30
31typedef int (*gfs2_filldir_t) (void *opaque,
32 const char *name, unsigned int length,
33 u64 offset,
34 struct gfs2_inum *inum, unsigned int type);
35
36int gfs2_dir_search(struct inode *dir, const struct qstr *filename,
37 struct gfs2_inum *inum, unsigned int *type);
38int gfs2_dir_add(struct inode *inode, const struct qstr *filename,
39 const struct gfs2_inum *inum, unsigned int type);
40int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *filename);
41int gfs2_dir_read(struct inode *inode, u64 * offset, void *opaque,
42 gfs2_filldir_t filldir);
43int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
44 struct gfs2_inum *new_inum, unsigned int new_type);
45
46int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip);
47
48int gfs2_diradd_alloc_required(struct inode *dir,
49 const struct qstr *filename);
50int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block,
51 struct buffer_head **bhp);
52
53static inline u32 gfs2_disk_hash(const char *data, int len)
54{
55 return crc32_le((u32)~0, data, len) ^ (u32)~0;
56}
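
A short worked example of how this hash feeds the exhash lookup in dir.c: the name hash is the bit-inverted little-endian CRC32 of the name bytes, and at a given di_depth the top bits select a hash-table slot:

/* index derivation, as used by gfs2_dirent_search() in dir.c */
static inline u32 ex_hash_index(u32 hash, unsigned int depth)
{
	return hash >> (32 - depth);	/* top 'depth' bits pick the slot */
}
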
57
58
59static inline void gfs2_str2qstr(struct qstr *name, const char *fname)
60{
61 name->name = fname;
62 name->len = strlen(fname);
63 name->hash = gfs2_disk_hash(name->name, name->len);
64}
65
66/* N.B. This probably ought to take inum & type as args as well */
67static inline void gfs2_qstr2dirent(const struct qstr *name, u16 reclen, struct gfs2_dirent *dent)
68{
69 dent->de_inum.no_addr = cpu_to_be64(0);
70 dent->de_inum.no_formal_ino = cpu_to_be64(0);
71 dent->de_hash = cpu_to_be32(name->hash);
72 dent->de_rec_len = cpu_to_be16(reclen);
73 dent->de_name_len = cpu_to_be16(name->len);
74 dent->de_type = cpu_to_be16(0);
75 memset(dent->__pad, 0, sizeof(dent->__pad));
76 memcpy(dent + 1, name->name, name->len);
77}
78
79#endif /* __DIR_DOT_H__ */
diff --git a/fs/gfs2/eaops.c b/fs/gfs2/eaops.c
new file mode 100644
index 000000000000..92c54e9b0dc3
--- /dev/null
+++ b/fs/gfs2/eaops.c
@@ -0,0 +1,230 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/xattr.h>
16#include <linux/gfs2_ondisk.h>
17#include <linux/lm_interface.h>
18#include <asm/uaccess.h>
19
20#include "gfs2.h"
21#include "incore.h"
22#include "acl.h"
23#include "eaops.h"
24#include "eattr.h"
25#include "util.h"
26
27/**
28 * gfs2_ea_name2type - get the type of the ea, and strip the type prefix
29 * @name: ea name, possibly with a type prefix ("user.", "system.", "security.")
30 * @truncated_name: if non-NULL, set to the name with the prefix stripped
31 * Returns: GFS2_EATYPE_XXX
32 */
33
34unsigned int gfs2_ea_name2type(const char *name, const char **truncated_name)
35{
36 unsigned int type;
37
38 if (strncmp(name, "system.", 7) == 0) {
39 type = GFS2_EATYPE_SYS;
40 if (truncated_name)
41 *truncated_name = name + sizeof("system.") - 1;
42 } else if (strncmp(name, "user.", 5) == 0) {
43 type = GFS2_EATYPE_USR;
44 if (truncated_name)
45 *truncated_name = name + sizeof("user.") - 1;
46 } else if (strncmp(name, "security.", 9) == 0) {
47 type = GFS2_EATYPE_SECURITY;
48 if (truncated_name)
49 *truncated_name = name + sizeof("security.") - 1;
50 } else {
51 type = GFS2_EATYPE_UNUSED;
52 if (truncated_name)
53 *truncated_name = NULL;
54 }
55
56 return type;
57}
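
A usage sketch for gfs2_ea_name2type(); the results follow directly from the string comparisons above:

static void ex_name2type(void)
{
	const char *stripped;
	unsigned int type;

	type = gfs2_ea_name2type("user.mime_type", &stripped);
	/* type == GFS2_EATYPE_USR, stripped points at "mime_type" */

	type = gfs2_ea_name2type("comment", &stripped);
	/* no recognized prefix: GFS2_EATYPE_UNUSED, stripped == NULL */
}
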
58
59static int user_eo_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
60{
61 struct inode *inode = &ip->i_inode;
62 int error = permission(inode, MAY_READ, NULL);
63 if (error)
64 return error;
65
66 return gfs2_ea_get_i(ip, er);
67}
68
69static int user_eo_set(struct gfs2_inode *ip, struct gfs2_ea_request *er)
70{
71 struct inode *inode = &ip->i_inode;
72
73 if (S_ISREG(inode->i_mode) ||
74 (S_ISDIR(inode->i_mode) && !(inode->i_mode & S_ISVTX))) {
75 int error = permission(inode, MAY_WRITE, NULL);
76 if (error)
77 return error;
78 } else
79 return -EPERM;
80
81 return gfs2_ea_set_i(ip, er);
82}
83
84static int user_eo_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er)
85{
86 struct inode *inode = &ip->i_inode;
87
88 if (S_ISREG(inode->i_mode) ||
89 (S_ISDIR(inode->i_mode) && !(inode->i_mode & S_ISVTX))) {
90 int error = permission(inode, MAY_WRITE, NULL);
91 if (error)
92 return error;
93 } else
94 return -EPERM;
95
96 return gfs2_ea_remove_i(ip, er);
97}
98
99static int system_eo_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
100{
101 if (!GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len) &&
102 !GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len) &&
103 !capable(CAP_SYS_ADMIN))
104 return -EPERM;
105
106 if (GFS2_SB(&ip->i_inode)->sd_args.ar_posix_acl == 0 &&
107 (GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len) ||
108 GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len)))
109 return -EOPNOTSUPP;
110
113 return gfs2_ea_get_i(ip, er);
114}
115
116static int system_eo_set(struct gfs2_inode *ip, struct gfs2_ea_request *er)
117{
118 int remove = 0;
119 int error;
120
121 if (GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len)) {
122 if (!(er->er_flags & GFS2_ERF_MODE)) {
123 er->er_mode = ip->i_di.di_mode;
124 er->er_flags |= GFS2_ERF_MODE;
125 }
126 error = gfs2_acl_validate_set(ip, 1, er,
127 &remove, &er->er_mode);
128 if (error)
129 return error;
130 error = gfs2_ea_set_i(ip, er);
131 if (error)
132 return error;
133 if (remove)
134 gfs2_ea_remove_i(ip, er);
135 return 0;
136
137 } else if (GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len)) {
138 error = gfs2_acl_validate_set(ip, 0, er,
139 &remove, NULL);
140 if (error)
141 return error;
142 if (!remove)
143 error = gfs2_ea_set_i(ip, er);
144 else {
145 error = gfs2_ea_remove_i(ip, er);
146 if (error == -ENODATA)
147 error = 0;
148 }
149 return error;
150 }
151
152 return -EPERM;
153}
154
155static int system_eo_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er)
156{
157 if (GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len)) {
158 int error = gfs2_acl_validate_remove(ip, 1);
159 if (error)
160 return error;
161
162 } else if (GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len)) {
163 int error = gfs2_acl_validate_remove(ip, 0);
164 if (error)
165 return error;
166
167 } else
168 return -EPERM;
169
170 return gfs2_ea_remove_i(ip, er);
171}
172
173static int security_eo_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
174{
175 struct inode *inode = &ip->i_inode;
176 int error = permission(inode, MAY_READ, NULL);
177 if (error)
178 return error;
179
180 return gfs2_ea_get_i(ip, er);
181}
182
183static int security_eo_set(struct gfs2_inode *ip, struct gfs2_ea_request *er)
184{
185 struct inode *inode = &ip->i_inode;
186 int error = permission(inode, MAY_WRITE, NULL);
187 if (error)
188 return error;
189
190 return gfs2_ea_set_i(ip, er);
191}
192
193static int security_eo_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er)
194{
195 struct inode *inode = &ip->i_inode;
196 int error = permission(inode, MAY_WRITE, NULL);
197 if (error)
198 return error;
199
200 return gfs2_ea_remove_i(ip, er);
201}
202
203static struct gfs2_eattr_operations gfs2_user_eaops = {
204 .eo_get = user_eo_get,
205 .eo_set = user_eo_set,
206 .eo_remove = user_eo_remove,
207 .eo_name = "user",
208};
209
210struct gfs2_eattr_operations gfs2_system_eaops = {
211 .eo_get = system_eo_get,
212 .eo_set = system_eo_set,
213 .eo_remove = system_eo_remove,
214 .eo_name = "system",
215};
216
217static struct gfs2_eattr_operations gfs2_security_eaops = {
218 .eo_get = security_eo_get,
219 .eo_set = security_eo_set,
220 .eo_remove = security_eo_remove,
221 .eo_name = "security",
222};
223
224struct gfs2_eattr_operations *gfs2_ea_ops[] = {
225 NULL,
226 &gfs2_user_eaops,
227 &gfs2_system_eaops,
228 &gfs2_security_eaops,
229};
230
diff --git a/fs/gfs2/eaops.h b/fs/gfs2/eaops.h
new file mode 100644
index 000000000000..508b4f7a2449
--- /dev/null
+++ b/fs/gfs2/eaops.h
@@ -0,0 +1,30 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __EAOPS_DOT_H__
11#define __EAOPS_DOT_H__
12
13struct gfs2_ea_request;
14struct gfs2_inode;
15
16struct gfs2_eattr_operations {
17 int (*eo_get) (struct gfs2_inode *ip, struct gfs2_ea_request *er);
18 int (*eo_set) (struct gfs2_inode *ip, struct gfs2_ea_request *er);
19 int (*eo_remove) (struct gfs2_inode *ip, struct gfs2_ea_request *er);
20 char *eo_name;
21};
22
23unsigned int gfs2_ea_name2type(const char *name, const char **truncated_name);
24
25extern struct gfs2_eattr_operations gfs2_system_eaops;
26
27extern struct gfs2_eattr_operations *gfs2_ea_ops[];
28
29#endif /* __EAOPS_DOT_H__ */
30
diff --git a/fs/gfs2/eattr.c b/fs/gfs2/eattr.c
new file mode 100644
index 000000000000..a65a4ccfd4dd
--- /dev/null
+++ b/fs/gfs2/eattr.c
@@ -0,0 +1,1501 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/xattr.h>
16#include <linux/gfs2_ondisk.h>
17#include <linux/lm_interface.h>
18#include <asm/uaccess.h>
19
20#include "gfs2.h"
21#include "incore.h"
22#include "acl.h"
23#include "eaops.h"
24#include "eattr.h"
25#include "glock.h"
26#include "inode.h"
27#include "meta_io.h"
28#include "quota.h"
29#include "rgrp.h"
30#include "trans.h"
31#include "util.h"
32
33/**
34 * ea_calc_size - returns the actual number of bytes the request will take up
35 * (not counting any unstuffed data blocks)
36 * @sdp: the filesystem
37 * @er: the extended attribute request
38 * @size: filled in with the computed size
39 *
40 * Returns: 1 if the EA should be stuffed, 0 if it must be unstuffed
41 */
42
43static int ea_calc_size(struct gfs2_sbd *sdp, struct gfs2_ea_request *er,
44 unsigned int *size)
45{
46 *size = GFS2_EAREQ_SIZE_STUFFED(er);
47 if (*size <= sdp->sd_jbsize)
48 return 1;
49
50 *size = GFS2_EAREQ_SIZE_UNSTUFFED(sdp, er);
51
52 return 0;
53}
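
A worked example of the stuffed/unstuffed decision above, on stated assumptions: with 4096-byte blocks, sd_jbsize is the block size minus the metadata header (assumed 24 bytes), i.e. 4072 bytes. A request whose stuffed size (EA header plus name plus value, rounded up for alignment) fits in those 4072 bytes keeps the value inside the EA block itself and ea_calc_size() returns 1; a larger request is unstuffed, its value going to separate data blocks addressed by pointers after the header, and *size is recomputed accordingly.
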
54
55static int ea_check_size(struct gfs2_sbd *sdp, struct gfs2_ea_request *er)
56{
57 unsigned int size;
58
59 if (er->er_data_len > GFS2_EA_MAX_DATA_LEN)
60 return -ERANGE;
61
62 ea_calc_size(sdp, er, &size);
63
64 /* This can only happen with 512 byte blocks */
65 if (size > sdp->sd_jbsize)
66 return -ERANGE;
67
68 return 0;
69}
70
71typedef int (*ea_call_t) (struct gfs2_inode *ip, struct buffer_head *bh,
72 struct gfs2_ea_header *ea,
73 struct gfs2_ea_header *prev, void *private);
74
75static int ea_foreach_i(struct gfs2_inode *ip, struct buffer_head *bh,
76 ea_call_t ea_call, void *data)
77{
78 struct gfs2_ea_header *ea, *prev = NULL;
79 int error = 0;
80
81 if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), bh, GFS2_METATYPE_EA))
82 return -EIO;
83
84 for (ea = GFS2_EA_BH2FIRST(bh);; prev = ea, ea = GFS2_EA2NEXT(ea)) {
85 if (!GFS2_EA_REC_LEN(ea))
86 goto fail;
87 if (!(bh->b_data <= (char *)ea && (char *)GFS2_EA2NEXT(ea) <=
88 bh->b_data + bh->b_size))
89 goto fail;
90 if (!GFS2_EATYPE_VALID(ea->ea_type))
91 goto fail;
92
93 error = ea_call(ip, bh, ea, prev, data);
94 if (error)
95 return error;
96
97 if (GFS2_EA_IS_LAST(ea)) {
98 if ((char *)GFS2_EA2NEXT(ea) !=
99 bh->b_data + bh->b_size)
100 goto fail;
101 break;
102 }
103 }
104
105 return error;
106
107fail:
108 gfs2_consist_inode(ip);
109 return -EIO;
110}
111
112static int ea_foreach(struct gfs2_inode *ip, ea_call_t ea_call, void *data)
113{
114 struct buffer_head *bh, *eabh;
115 u64 *eablk, *end;
116 int error;
117
118 error = gfs2_meta_read(ip->i_gl, ip->i_di.di_eattr, DIO_WAIT, &bh);
119 if (error)
120 return error;
121
122 if (!(ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT)) {
123 error = ea_foreach_i(ip, bh, ea_call, data);
124 goto out;
125 }
126
127 if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), bh, GFS2_METATYPE_IN)) {
128 error = -EIO;
129 goto out;
130 }
131
132 eablk = (u64 *)(bh->b_data + sizeof(struct gfs2_meta_header));
133 end = eablk + GFS2_SB(&ip->i_inode)->sd_inptrs;
134
135 for (; eablk < end; eablk++) {
136 u64 bn;
137
138 if (!*eablk)
139 break;
140 bn = be64_to_cpu(*eablk);
141
142 error = gfs2_meta_read(ip->i_gl, bn, DIO_WAIT, &eabh);
143 if (error)
144 break;
145 error = ea_foreach_i(ip, eabh, ea_call, data);
146 brelse(eabh);
147 if (error)
148 break;
149 }
150out:
151 brelse(bh);
152 return error;
153}
154
155struct ea_find {
156 struct gfs2_ea_request *ef_er;
157 struct gfs2_ea_location *ef_el;
158};
159
160static int ea_find_i(struct gfs2_inode *ip, struct buffer_head *bh,
161 struct gfs2_ea_header *ea, struct gfs2_ea_header *prev,
162 void *private)
163{
164 struct ea_find *ef = private;
165 struct gfs2_ea_request *er = ef->ef_er;
166
167 if (ea->ea_type == GFS2_EATYPE_UNUSED)
168 return 0;
169
170 if (ea->ea_type == er->er_type) {
171 if (ea->ea_name_len == er->er_name_len &&
172 !memcmp(GFS2_EA2NAME(ea), er->er_name, ea->ea_name_len)) {
173 struct gfs2_ea_location *el = ef->ef_el;
174 get_bh(bh);
175 el->el_bh = bh;
176 el->el_ea = ea;
177 el->el_prev = prev;
178 return 1;
179 }
180 }
181
182 return 0;
183}
184
185int gfs2_ea_find(struct gfs2_inode *ip, struct gfs2_ea_request *er,
186 struct gfs2_ea_location *el)
187{
188 struct ea_find ef;
189 int error;
190
191 ef.ef_er = er;
192 ef.ef_el = el;
193
194 memset(el, 0, sizeof(struct gfs2_ea_location));
195
196 error = ea_foreach(ip, ea_find_i, &ef);
197 if (error > 0)
198 return 0;
199
200 return error;
201}
202
203/**
204 * ea_dealloc_unstuffed - free the data blocks of an unstuffed EA
205 * @ip: the inode owning the EA
206 * @bh: the buffer holding the EA block
207 * @ea: the EA header whose data blocks are to be freed
208 * @prev: the previous EA header in the block, if any
209 * @private: if non-NULL, leave the header itself in place
210 *
211 * Take advantage of the fact that all unstuffed blocks are
212 * allocated from the same RG. But watch, this may not always
213 * be true.
214 *
215 * Returns: errno
216 */
217
218static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
219 struct gfs2_ea_header *ea,
220 struct gfs2_ea_header *prev, void *private)
221{
222 int *leave = private;
223 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
224 struct gfs2_rgrpd *rgd;
225 struct gfs2_holder rg_gh;
226 struct buffer_head *dibh;
227 u64 *dataptrs, bn = 0;
228 u64 bstart = 0;
229 unsigned int blen = 0;
230 unsigned int blks = 0;
231 unsigned int x;
232 int error;
233
234 if (GFS2_EA_IS_STUFFED(ea))
235 return 0;
236
237 dataptrs = GFS2_EA2DATAPTRS(ea);
238 for (x = 0; x < ea->ea_num_ptrs; x++, dataptrs++) {
239 if (*dataptrs) {
240 blks++;
241 bn = be64_to_cpu(*dataptrs);
242 }
243 }
244 if (!blks)
245 return 0;
246
247 rgd = gfs2_blk2rgrpd(sdp, bn);
248 if (!rgd) {
249 gfs2_consist_inode(ip);
250 return -EIO;
251 }
252
253 error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &rg_gh);
254 if (error)
255 return error;
256
257 error = gfs2_trans_begin(sdp, rgd->rd_ri.ri_length + RES_DINODE +
258 RES_EATTR + RES_STATFS + RES_QUOTA, blks);
259 if (error)
260 goto out_gunlock;
261
262 gfs2_trans_add_bh(ip->i_gl, bh, 1);
263
264 dataptrs = GFS2_EA2DATAPTRS(ea);
265 for (x = 0; x < ea->ea_num_ptrs; x++, dataptrs++) {
266 if (!*dataptrs)
267 break;
268 bn = be64_to_cpu(*dataptrs);
269
270 if (bstart + blen == bn)
271 blen++;
272 else {
273 if (bstart)
274 gfs2_free_meta(ip, bstart, blen);
275 bstart = bn;
276 blen = 1;
277 }
278
279 *dataptrs = 0;
280 if (!ip->i_di.di_blocks)
281 gfs2_consist_inode(ip);
282 ip->i_di.di_blocks--;
283 }
284 if (bstart)
285 gfs2_free_meta(ip, bstart, blen);
286
287 if (prev && !leave) {
288 u32 len;
289
290 len = GFS2_EA_REC_LEN(prev) + GFS2_EA_REC_LEN(ea);
291 prev->ea_rec_len = cpu_to_be32(len);
292
293 if (GFS2_EA_IS_LAST(ea))
294 prev->ea_flags |= GFS2_EAFLAG_LAST;
295 } else {
296 ea->ea_type = GFS2_EATYPE_UNUSED;
297 ea->ea_num_ptrs = 0;
298 }
299
300 error = gfs2_meta_inode_buffer(ip, &dibh);
301 if (!error) {
302 ip->i_di.di_ctime = get_seconds();
303 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
304 gfs2_dinode_out(&ip->i_di, dibh->b_data);
305 brelse(dibh);
306 }
307
308 gfs2_trans_end(sdp);
309
310out_gunlock:
311 gfs2_glock_dq_uninit(&rg_gh);
312 return error;
313}
314
315static int ea_remove_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
316 struct gfs2_ea_header *ea,
317 struct gfs2_ea_header *prev, int leave)
318{
319 struct gfs2_alloc *al;
320 int error;
321
322 al = gfs2_alloc_get(ip);
323
324 error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
325 if (error)
326 goto out_alloc;
327
328 error = gfs2_rindex_hold(GFS2_SB(&ip->i_inode), &al->al_ri_gh);
329 if (error)
330 goto out_quota;
331
332 error = ea_dealloc_unstuffed(ip, bh, ea, prev, (leave) ? &error : NULL);
333
334 gfs2_glock_dq_uninit(&al->al_ri_gh);
335
336out_quota:
337 gfs2_quota_unhold(ip);
338out_alloc:
339 gfs2_alloc_put(ip);
340 return error;
341}
342
343struct ea_list {
344 struct gfs2_ea_request *ei_er;
345 unsigned int ei_size;
346};
347
348static int ea_list_i(struct gfs2_inode *ip, struct buffer_head *bh,
349 struct gfs2_ea_header *ea, struct gfs2_ea_header *prev,
350 void *private)
351{
352 struct ea_list *ei = private;
353 struct gfs2_ea_request *er = ei->ei_er;
354 unsigned int ea_size = gfs2_ea_strlen(ea);
355
356 if (ea->ea_type == GFS2_EATYPE_UNUSED)
357 return 0;
358
359 if (er->er_data_len) {
360 char *prefix = NULL;
361 unsigned int l = 0;
362 char c = 0;
363
364 if (ei->ei_size + ea_size > er->er_data_len)
365 return -ERANGE;
366
367 switch (ea->ea_type) {
368 case GFS2_EATYPE_USR:
369 prefix = "user.";
370 l = 5;
371 break;
372 case GFS2_EATYPE_SYS:
373 prefix = "system.";
374 l = 7;
375 break;
376 case GFS2_EATYPE_SECURITY:
377 prefix = "security.";
378 l = 9;
379 break;
380 }
381
382 BUG_ON(l == 0);
383
384 memcpy(er->er_data + ei->ei_size, prefix, l);
385 memcpy(er->er_data + ei->ei_size + l, GFS2_EA2NAME(ea),
386 ea->ea_name_len);
387 memcpy(er->er_data + ei->ei_size + ea_size - 1, &c, 1);
388 }
389
390 ei->ei_size += ea_size;
391
392 return 0;
393}
394
395/**
396 * gfs2_ea_list - list the names of the extended attributes on an inode
397 * @ip: The GFS2 inode
398 * @er: The request structure; er_data, if non-NULL, receives the names
399 *
400 * Returns: actual size of data on success, -errno on error
401 */
402
403int gfs2_ea_list(struct gfs2_inode *ip, struct gfs2_ea_request *er)
404{
405 struct gfs2_holder i_gh;
406 int error;
407
408 if (!er->er_data || !er->er_data_len) {
409 er->er_data = NULL;
410 er->er_data_len = 0;
411 }
412
413 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
414 if (error)
415 return error;
416
417 if (ip->i_di.di_eattr) {
418 struct ea_list ei = { .ei_er = er, .ei_size = 0 };
419
420 error = ea_foreach(ip, ea_list_i, &ei);
421 if (!error)
422 error = ei.ei_size;
423 }
424
425 gfs2_glock_dq_uninit(&i_gh);
426
427 return error;
428}
429
430/**
431 * ea_get_unstuffed - actually copies the unstuffed data into the
432 * request buffer
433 * @ip: The GFS2 inode
434 * @ea: The extended attribute header structure
435 * @data: The buffer the data is copied into
436 *
437 * Returns: errno
438 */
439
440static int ea_get_unstuffed(struct gfs2_inode *ip, struct gfs2_ea_header *ea,
441 char *data)
442{
443 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
444 struct buffer_head **bh;
445 unsigned int amount = GFS2_EA_DATA_LEN(ea);
446 unsigned int nptrs = DIV_ROUND_UP(amount, sdp->sd_jbsize);
447 u64 *dataptrs = GFS2_EA2DATAPTRS(ea);
448 unsigned int x;
449 int error = 0;
450
451 bh = kcalloc(nptrs, sizeof(struct buffer_head *), GFP_KERNEL);
452 if (!bh)
453 return -ENOMEM;
454
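	/* First pass: submit all the reads without waiting, so the I/O can overlap */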
455 for (x = 0; x < nptrs; x++) {
456 error = gfs2_meta_read(ip->i_gl, be64_to_cpu(*dataptrs), 0,
457 bh + x);
458 if (error) {
459 while (x--)
460 brelse(bh[x]);
461 goto out;
462 }
463 dataptrs++;
464 }
465
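	/* Second pass: wait for each read, verify the metatype, and copy
	   out up to one journal block's worth of data per buffer */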
466 for (x = 0; x < nptrs; x++) {
467 error = gfs2_meta_wait(sdp, bh[x]);
468 if (error) {
469 for (; x < nptrs; x++)
470 brelse(bh[x]);
471 goto out;
472 }
473 if (gfs2_metatype_check(sdp, bh[x], GFS2_METATYPE_ED)) {
474 for (; x < nptrs; x++)
475 brelse(bh[x]);
476 error = -EIO;
477 goto out;
478 }
479
480 memcpy(data, bh[x]->b_data + sizeof(struct gfs2_meta_header),
481 (sdp->sd_jbsize > amount) ? amount : sdp->sd_jbsize);
482
483 amount -= sdp->sd_jbsize;
484 data += sdp->sd_jbsize;
485
486 brelse(bh[x]);
487 }
488
489out:
490 kfree(bh);
491 return error;
492}
493
494int gfs2_ea_get_copy(struct gfs2_inode *ip, struct gfs2_ea_location *el,
495 char *data)
496{
497 if (GFS2_EA_IS_STUFFED(el->el_ea)) {
498 memcpy(data, GFS2_EA2DATA(el->el_ea), GFS2_EA_DATA_LEN(el->el_ea));
499 return 0;
500 } else
501 return ea_get_unstuffed(ip, el->el_ea, data);
502}
503
504/**
505 * gfs2_ea_get_i - find an extended attribute and copy its data out
506 * @ip: The GFS2 inode
507 * @er: The request structure
508 *
509 * Returns: actual size of data on success, -errno on error
510 */
511
512int gfs2_ea_get_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
513{
514 struct gfs2_ea_location el;
515 int error;
516
517 if (!ip->i_di.di_eattr)
518 return -ENODATA;
519
520 error = gfs2_ea_find(ip, er, &el);
521 if (error)
522 return error;
523 if (!el.el_ea)
524 return -ENODATA;
525
526 if (er->er_data_len) {
527 if (GFS2_EA_DATA_LEN(el.el_ea) > er->er_data_len)
528 error = -ERANGE;
529 else
530 error = gfs2_ea_get_copy(ip, &el, er->er_data);
531 }
532 if (!error)
533 error = GFS2_EA_DATA_LEN(el.el_ea);
534
535 brelse(el.el_bh);
536
537 return error;
538}
539
540/**
541 * gfs2_ea_get - read an extended attribute under a shared inode glock
542 * @ip: The GFS2 inode
543 * @er: The request structure
544 *
545 * Returns: actual size of data on success, -errno on error
546 */
547
548int gfs2_ea_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
549{
550 struct gfs2_holder i_gh;
551 int error;
552
553 if (!er->er_name_len ||
554 er->er_name_len > GFS2_EA_MAX_NAME_LEN)
555 return -EINVAL;
556 if (!er->er_data || !er->er_data_len) {
557 er->er_data = NULL;
558 er->er_data_len = 0;
559 }
560
561 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
562 if (error)
563 return error;
564
565 error = gfs2_ea_ops[er->er_type]->eo_get(ip, er);
566
567 gfs2_glock_dq_uninit(&i_gh);
568
569 return error;
570}
571
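/*
 * An illustrative usage sketch of the call above: a request with
 * er_data == NULL returns just the attribute size, so a caller can size
 * its buffer before the copying call.  The function name and attribute
 * name below are hypothetical; kmalloc/kfree and GFP_KERNEL are the
 * standard kernel allocator API.  (A racing setxattr could still change
 * the size between the two calls, so -ERANGE remains possible.)
 */
static int example_read_user_ea(struct gfs2_inode *ip, char **bufp)
{
	struct gfs2_ea_request er = {
		.er_name = "example",		/* hypothetical name */
		.er_name_len = 7,
		.er_type = GFS2_EATYPE_USR,
	};
	int size = gfs2_ea_get(ip, &er);	/* size query only */

	if (size < 0)
		return size;
	er.er_data = kmalloc(size, GFP_KERNEL);
	if (!er.er_data)
		return -ENOMEM;
	er.er_data_len = size;
	size = gfs2_ea_get(ip, &er);		/* copies the data */
	if (size < 0)
		kfree(er.er_data);
	else
		*bufp = er.er_data;
	return size;
}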
572/**
573 * ea_alloc_blk - allocates a new block for extended attributes.
574 * @ip: A pointer to the inode that's getting extended attributes
575 * @bhp: Pointer to pointer to a struct buffer_head
576 *
577 * Returns: errno
578 */
579
580static int ea_alloc_blk(struct gfs2_inode *ip, struct buffer_head **bhp)
581{
582 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
583 struct gfs2_ea_header *ea;
584 u64 block;
585
586 block = gfs2_alloc_meta(ip);
587
588 *bhp = gfs2_meta_new(ip->i_gl, block);
589 gfs2_trans_add_bh(ip->i_gl, *bhp, 1);
590 gfs2_metatype_set(*bhp, GFS2_METATYPE_EA, GFS2_FORMAT_EA);
591 gfs2_buffer_clear_tail(*bhp, sizeof(struct gfs2_meta_header));
592
593 ea = GFS2_EA_BH2FIRST(*bhp);
594 ea->ea_rec_len = cpu_to_be32(sdp->sd_jbsize);
595 ea->ea_type = GFS2_EATYPE_UNUSED;
596 ea->ea_flags = GFS2_EAFLAG_LAST;
597 ea->ea_num_ptrs = 0;
598
599 ip->i_di.di_blocks++;
600
601 return 0;
602}
603
604/**
605 * ea_write - writes the request info to an ea, creating new blocks if
606 * necessary
607 * @ip: inode that is being modified
608 * @ea: the location of the new ea in a block
609 * @er: the write request
610 *
611 * Note: does not update ea_rec_len or the GFS2_EAFLAG_LAST bit of ea_flags
612 *
613 * Returns: errno
614 */
615
616static int ea_write(struct gfs2_inode *ip, struct gfs2_ea_header *ea,
617 struct gfs2_ea_request *er)
618{
619 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
620
621 ea->ea_data_len = cpu_to_be32(er->er_data_len);
622 ea->ea_name_len = er->er_name_len;
623 ea->ea_type = er->er_type;
624 ea->__pad = 0;
625
626 memcpy(GFS2_EA2NAME(ea), er->er_name, er->er_name_len);
627
628 if (GFS2_EAREQ_SIZE_STUFFED(er) <= sdp->sd_jbsize) {
629 ea->ea_num_ptrs = 0;
630 memcpy(GFS2_EA2DATA(ea), er->er_data, er->er_data_len);
631 } else {
632 u64 *dataptr = GFS2_EA2DATAPTRS(ea);
633 const char *data = er->er_data;
634 unsigned int data_len = er->er_data_len;
635 unsigned int copy;
636 unsigned int x;
637
638 ea->ea_num_ptrs = DIV_ROUND_UP(er->er_data_len, sdp->sd_jbsize);
639 for (x = 0; x < ea->ea_num_ptrs; x++) {
640 struct buffer_head *bh;
641 u64 block;
642 int mh_size = sizeof(struct gfs2_meta_header);
643
644 block = gfs2_alloc_meta(ip);
645
646 bh = gfs2_meta_new(ip->i_gl, block);
647 gfs2_trans_add_bh(ip->i_gl, bh, 1);
648 gfs2_metatype_set(bh, GFS2_METATYPE_ED, GFS2_FORMAT_ED);
649
650 ip->i_di.di_blocks++;
651
652 copy = data_len > sdp->sd_jbsize ? sdp->sd_jbsize :
653 data_len;
654 memcpy(bh->b_data + mh_size, data, copy);
655 if (copy < sdp->sd_jbsize)
656 memset(bh->b_data + mh_size + copy, 0,
657 sdp->sd_jbsize - copy);
658
659 *dataptr++ = cpu_to_be64(bh->b_blocknr);
660 data += copy;
661 data_len -= copy;
662
663 brelse(bh);
664 }
665
666 gfs2_assert_withdraw(sdp, !data_len);
667 }
668
669 return 0;
670}
671
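/*
 * Worked example for ea_write() above, assuming 4096-byte blocks and a
 * 24-byte gfs2_meta_header (so sd_jbsize = 4072): a 10000-byte value
 * does not stuff, so ea_num_ptrs = DIV_ROUND_UP(10000, 4072) = 3; the
 * first two data blocks take 4072 bytes each and the third takes the
 * remaining 1856 bytes, with its trailing 2216 bytes zeroed.
 */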
672typedef int (*ea_skeleton_call_t) (struct gfs2_inode *ip,
673 struct gfs2_ea_request *er, void *private);
674
675static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
676 unsigned int blks,
677 ea_skeleton_call_t skeleton_call, void *private)
678{
679 struct gfs2_alloc *al;
680 struct buffer_head *dibh;
681 int error;
682
683 al = gfs2_alloc_get(ip);
684
685 error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
686 if (error)
687 goto out;
688
689 error = gfs2_quota_check(ip, ip->i_di.di_uid, ip->i_di.di_gid);
690 if (error)
691 goto out_gunlock_q;
692
693 al->al_requested = blks;
694
695 error = gfs2_inplace_reserve(ip);
696 if (error)
697 goto out_gunlock_q;
698
699 error = gfs2_trans_begin(GFS2_SB(&ip->i_inode),
700 blks + al->al_rgd->rd_ri.ri_length +
701 RES_DINODE + RES_STATFS + RES_QUOTA, 0);
702 if (error)
703 goto out_ipres;
704
705 error = skeleton_call(ip, er, private);
706 if (error)
707 goto out_end_trans;
708
709 error = gfs2_meta_inode_buffer(ip, &dibh);
710 if (!error) {
711 if (er->er_flags & GFS2_ERF_MODE) {
712 gfs2_assert_withdraw(GFS2_SB(&ip->i_inode),
713 (ip->i_di.di_mode & S_IFMT) ==
714 (er->er_mode & S_IFMT));
715 ip->i_di.di_mode = er->er_mode;
716 }
717 ip->i_di.di_ctime = get_seconds();
718 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
719 gfs2_dinode_out(&ip->i_di, dibh->b_data);
720 brelse(dibh);
721 }
722
723out_end_trans:
724 gfs2_trans_end(GFS2_SB(&ip->i_inode));
725out_ipres:
726 gfs2_inplace_release(ip);
727out_gunlock_q:
728 gfs2_quota_unlock(ip);
729out:
730 gfs2_alloc_put(ip);
731 return error;
732}
733
734static int ea_init_i(struct gfs2_inode *ip, struct gfs2_ea_request *er,
735 void *private)
736{
737 struct buffer_head *bh;
738 int error;
739
740 error = ea_alloc_blk(ip, &bh);
741 if (error)
742 return error;
743
744 ip->i_di.di_eattr = bh->b_blocknr;
745 error = ea_write(ip, GFS2_EA_BH2FIRST(bh), er);
746
747 brelse(bh);
748
749 return error;
750}
751
752/**
753 * ea_init - initializes a new eattr block
754 * @ip: the inode the new eattr block will belong to
755 * @er: the first write request to store in the new block
756 *
757 * Returns: errno
758 */
759
760static int ea_init(struct gfs2_inode *ip, struct gfs2_ea_request *er)
761{
762 unsigned int jbsize = GFS2_SB(&ip->i_inode)->sd_jbsize;
763 unsigned int blks = 1;
764
765 if (GFS2_EAREQ_SIZE_STUFFED(er) > jbsize)
766 blks += DIV_ROUND_UP(er->er_data_len, jbsize);
767
768 return ea_alloc_skeleton(ip, er, blks, ea_init_i, NULL);
769}
770
771static struct gfs2_ea_header *ea_split_ea(struct gfs2_ea_header *ea)
772{
773 u32 ea_size = GFS2_EA_SIZE(ea);
774 struct gfs2_ea_header *new = (struct gfs2_ea_header *)((char *)ea +
775 ea_size);
776 u32 new_size = GFS2_EA_REC_LEN(ea) - ea_size;
777 int last = ea->ea_flags & GFS2_EAFLAG_LAST;
778
779 ea->ea_rec_len = cpu_to_be32(ea_size);
780 ea->ea_flags ^= last;
781
782 new->ea_rec_len = cpu_to_be32(new_size);
783 new->ea_flags = last;
784
785 return new;
786}
787
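/*
 * Example of the split above: a record with ea_rec_len 256 whose live
 * size GFS2_EA_SIZE(ea) is 64 shrinks to a 64-byte record, and a new
 * 192-byte record is carved out immediately after it; if the original
 * carried GFS2_EAFLAG_LAST, that bit moves to the new record.
 */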
788static void ea_set_remove_stuffed(struct gfs2_inode *ip,
789 struct gfs2_ea_location *el)
790{
791 struct gfs2_ea_header *ea = el->el_ea;
792 struct gfs2_ea_header *prev = el->el_prev;
793 u32 len;
794
795 gfs2_trans_add_bh(ip->i_gl, el->el_bh, 1);
796
797 if (!prev || !GFS2_EA_IS_STUFFED(ea)) {
798 ea->ea_type = GFS2_EATYPE_UNUSED;
799 return;
800 } else if (GFS2_EA2NEXT(prev) != ea) {
801 prev = GFS2_EA2NEXT(prev);
802 gfs2_assert_withdraw(GFS2_SB(&ip->i_inode), GFS2_EA2NEXT(prev) == ea);
803 }
804
805 len = GFS2_EA_REC_LEN(prev) + GFS2_EA_REC_LEN(ea);
806 prev->ea_rec_len = cpu_to_be32(len);
807
808 if (GFS2_EA_IS_LAST(ea))
809 prev->ea_flags |= GFS2_EAFLAG_LAST;
810}
811
812struct ea_set {
813 int ea_split;
814
815 struct gfs2_ea_request *es_er;
816 struct gfs2_ea_location *es_el;
817
818 struct buffer_head *es_bh;
819 struct gfs2_ea_header *es_ea;
820};
821
822static int ea_set_simple_noalloc(struct gfs2_inode *ip, struct buffer_head *bh,
823 struct gfs2_ea_header *ea, struct ea_set *es)
824{
825 struct gfs2_ea_request *er = es->es_er;
826 struct buffer_head *dibh;
827 int error;
828
829 error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE + 2 * RES_EATTR, 0);
830 if (error)
831 return error;
832
833 gfs2_trans_add_bh(ip->i_gl, bh, 1);
834
835 if (es->ea_split)
836 ea = ea_split_ea(ea);
837
838 ea_write(ip, ea, er);
839
840 if (es->es_el)
841 ea_set_remove_stuffed(ip, es->es_el);
842
843 error = gfs2_meta_inode_buffer(ip, &dibh);
844 if (error)
845 goto out;
846
847 if (er->er_flags & GFS2_ERF_MODE) {
848 gfs2_assert_withdraw(GFS2_SB(&ip->i_inode),
849 (ip->i_di.di_mode & S_IFMT) == (er->er_mode & S_IFMT));
850 ip->i_di.di_mode = er->er_mode;
851 }
852 ip->i_di.di_ctime = get_seconds();
853 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
854 gfs2_dinode_out(&ip->i_di, dibh->b_data);
855 brelse(dibh);
856out:
857 gfs2_trans_end(GFS2_SB(&ip->i_inode));
858 return error;
859}
860
861static int ea_set_simple_alloc(struct gfs2_inode *ip,
862 struct gfs2_ea_request *er, void *private)
863{
864 struct ea_set *es = private;
865 struct gfs2_ea_header *ea = es->es_ea;
866 int error;
867
868 gfs2_trans_add_bh(ip->i_gl, es->es_bh, 1);
869
870 if (es->ea_split)
871 ea = ea_split_ea(ea);
872
873 error = ea_write(ip, ea, er);
874 if (error)
875 return error;
876
877 if (es->es_el)
878 ea_set_remove_stuffed(ip, es->es_el);
879
880 return 0;
881}
882
883static int ea_set_simple(struct gfs2_inode *ip, struct buffer_head *bh,
884 struct gfs2_ea_header *ea, struct gfs2_ea_header *prev,
885 void *private)
886{
887 struct ea_set *es = private;
888 unsigned int size;
889 int stuffed;
890 int error;
891
892 stuffed = ea_calc_size(GFS2_SB(&ip->i_inode), es->es_er, &size);
893
894 if (ea->ea_type == GFS2_EATYPE_UNUSED) {
895 if (GFS2_EA_REC_LEN(ea) < size)
896 return 0;
897 if (!GFS2_EA_IS_STUFFED(ea)) {
898 error = ea_remove_unstuffed(ip, bh, ea, prev, 1);
899 if (error)
900 return error;
901 }
902 es->ea_split = 0;
903 } else if (GFS2_EA_REC_LEN(ea) - GFS2_EA_SIZE(ea) >= size)
904 es->ea_split = 1;
905 else
906 return 0;
907
908 if (stuffed) {
909 error = ea_set_simple_noalloc(ip, bh, ea, es);
910 if (error)
911 return error;
912 } else {
913 unsigned int blks;
914
915 es->es_bh = bh;
916 es->es_ea = ea;
917 blks = 2 + DIV_ROUND_UP(es->es_er->er_data_len,
918 GFS2_SB(&ip->i_inode)->sd_jbsize);
919
920 error = ea_alloc_skeleton(ip, es->es_er, blks,
921 ea_set_simple_alloc, es);
922 if (error)
923 return error;
924 }
925
926 return 1;
927}
928
929static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er,
930 void *private)
931{
932 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
933 struct buffer_head *indbh, *newbh;
934 u64 *eablk;
935 int error;
936 int mh_size = sizeof(struct gfs2_meta_header);
937
938 if (ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT) {
939 u64 *end;
940
941 error = gfs2_meta_read(ip->i_gl, ip->i_di.di_eattr, DIO_WAIT,
942 &indbh);
943 if (error)
944 return error;
945
946 if (gfs2_metatype_check(sdp, indbh, GFS2_METATYPE_IN)) {
947 error = -EIO;
948 goto out;
949 }
950
951 eablk = (u64 *)(indbh->b_data + mh_size);
952 end = eablk + sdp->sd_inptrs;
953
954 for (; eablk < end; eablk++)
955 if (!*eablk)
956 break;
957
958 if (eablk == end) {
959 error = -ENOSPC;
960 goto out;
961 }
962
963 gfs2_trans_add_bh(ip->i_gl, indbh, 1);
964 } else {
965 u64 blk;
966
967 blk = gfs2_alloc_meta(ip);
968
969 indbh = gfs2_meta_new(ip->i_gl, blk);
970 gfs2_trans_add_bh(ip->i_gl, indbh, 1);
971 gfs2_metatype_set(indbh, GFS2_METATYPE_IN, GFS2_FORMAT_IN);
972 gfs2_buffer_clear_tail(indbh, mh_size);
973
974 eablk = (u64 *)(indbh->b_data + mh_size);
975 *eablk = cpu_to_be64(ip->i_di.di_eattr);
976 ip->i_di.di_eattr = blk;
977 ip->i_di.di_flags |= GFS2_DIF_EA_INDIRECT;
978 ip->i_di.di_blocks++;
979
980 eablk++;
981 }
982
983 error = ea_alloc_blk(ip, &newbh);
984 if (error)
985 goto out;
986
987 *eablk = cpu_to_be64((u64)newbh->b_blocknr);
988 error = ea_write(ip, GFS2_EA_BH2FIRST(newbh), er);
989 brelse(newbh);
990 if (error)
991 goto out;
992
993 if (private)
994 ea_set_remove_stuffed(ip, private);
995
996out:
997 brelse(indbh);
998 return error;
999}
1000
1001static int ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er,
1002 struct gfs2_ea_location *el)
1003{
1004 struct ea_set es;
1005 unsigned int blks = 2;
1006 int error;
1007
1008 memset(&es, 0, sizeof(struct ea_set));
1009 es.es_er = er;
1010 es.es_el = el;
1011
1012 error = ea_foreach(ip, ea_set_simple, &es);
1013 if (error > 0)
1014 return 0;
1015 if (error)
1016 return error;
1017
1018 if (!(ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT))
1019 blks++;
1020 if (GFS2_EAREQ_SIZE_STUFFED(er) > GFS2_SB(&ip->i_inode)->sd_jbsize)
1021 blks += DIV_ROUND_UP(er->er_data_len, GFS2_SB(&ip->i_inode)->sd_jbsize);
1022
1023 return ea_alloc_skeleton(ip, er, blks, ea_set_block, el);
1024}
1025
1026static int ea_set_remove_unstuffed(struct gfs2_inode *ip,
1027 struct gfs2_ea_location *el)
1028{
1029 if (el->el_prev && GFS2_EA2NEXT(el->el_prev) != el->el_ea) {
1030 el->el_prev = GFS2_EA2NEXT(el->el_prev);
1031 gfs2_assert_withdraw(GFS2_SB(&ip->i_inode),
1032 GFS2_EA2NEXT(el->el_prev) == el->el_ea);
1033 }
1034
1035	return ea_remove_unstuffed(ip, el->el_bh, el->el_ea, el->el_prev, 0);
1036}
1037
1038int gfs2_ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
1039{
1040 struct gfs2_ea_location el;
1041 int error;
1042
1043 if (!ip->i_di.di_eattr) {
1044 if (er->er_flags & XATTR_REPLACE)
1045 return -ENODATA;
1046 return ea_init(ip, er);
1047 }
1048
1049 error = gfs2_ea_find(ip, er, &el);
1050 if (error)
1051 return error;
1052
1053 if (el.el_ea) {
1054 if (ip->i_di.di_flags & GFS2_DIF_APPENDONLY) {
1055 brelse(el.el_bh);
1056 return -EPERM;
1057 }
1058
1059 error = -EEXIST;
1060 if (!(er->er_flags & XATTR_CREATE)) {
1061 int unstuffed = !GFS2_EA_IS_STUFFED(el.el_ea);
1062 error = ea_set_i(ip, er, &el);
1063 if (!error && unstuffed)
1064 ea_set_remove_unstuffed(ip, &el);
1065 }
1066
1067 brelse(el.el_bh);
1068 } else {
1069 error = -ENODATA;
1070 if (!(er->er_flags & XATTR_REPLACE))
1071 error = ea_set_i(ip, er, NULL);
1072 }
1073
1074 return error;
1075}
1076
1077int gfs2_ea_set(struct gfs2_inode *ip, struct gfs2_ea_request *er)
1078{
1079 struct gfs2_holder i_gh;
1080 int error;
1081
1082 if (!er->er_name_len || er->er_name_len > GFS2_EA_MAX_NAME_LEN)
1083 return -EINVAL;
1084 if (!er->er_data || !er->er_data_len) {
1085 er->er_data = NULL;
1086 er->er_data_len = 0;
1087 }
1088 error = ea_check_size(GFS2_SB(&ip->i_inode), er);
1089 if (error)
1090 return error;
1091
1092 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
1093 if (error)
1094 return error;
1095
1096 if (IS_IMMUTABLE(&ip->i_inode))
1097 error = -EPERM;
1098 else
1099 error = gfs2_ea_ops[er->er_type]->eo_set(ip, er);
1100
1101 gfs2_glock_dq_uninit(&i_gh);
1102
1103 return error;
1104}
1105
1106static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el)
1107{
1108 struct gfs2_ea_header *ea = el->el_ea;
1109 struct gfs2_ea_header *prev = el->el_prev;
1110 struct buffer_head *dibh;
1111 int error;
1112
1113 error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE + RES_EATTR, 0);
1114 if (error)
1115 return error;
1116
1117 gfs2_trans_add_bh(ip->i_gl, el->el_bh, 1);
1118
1119 if (prev) {
1120 u32 len;
1121
1122 len = GFS2_EA_REC_LEN(prev) + GFS2_EA_REC_LEN(ea);
1123 prev->ea_rec_len = cpu_to_be32(len);
1124
1125 if (GFS2_EA_IS_LAST(ea))
1126 prev->ea_flags |= GFS2_EAFLAG_LAST;
1127 } else
1128 ea->ea_type = GFS2_EATYPE_UNUSED;
1129
1130 error = gfs2_meta_inode_buffer(ip, &dibh);
1131 if (!error) {
1132 ip->i_di.di_ctime = get_seconds();
1133 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1134 gfs2_dinode_out(&ip->i_di, dibh->b_data);
1135 brelse(dibh);
1136 }
1137
1138 gfs2_trans_end(GFS2_SB(&ip->i_inode));
1139
1140 return error;
1141}
1142
1143int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
1144{
1145 struct gfs2_ea_location el;
1146 int error;
1147
1148 if (!ip->i_di.di_eattr)
1149 return -ENODATA;
1150
1151 error = gfs2_ea_find(ip, er, &el);
1152 if (error)
1153 return error;
1154 if (!el.el_ea)
1155 return -ENODATA;
1156
1157 if (GFS2_EA_IS_STUFFED(el.el_ea))
1158 error = ea_remove_stuffed(ip, &el);
1159 else
1160 error = ea_remove_unstuffed(ip, el.el_bh, el.el_ea, el.el_prev,
1161 0);
1162
1163 brelse(el.el_bh);
1164
1165 return error;
1166}
1167
1168/**
1169 * gfs2_ea_remove - remove an extended attribute from an inode
1170 * @ip: pointer to the inode of the target file
1171 * @er: request information
1172 *
1173 * Returns: errno
1174 */
1175
1176int gfs2_ea_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er)
1177{
1178 struct gfs2_holder i_gh;
1179 int error;
1180
1181 if (!er->er_name_len || er->er_name_len > GFS2_EA_MAX_NAME_LEN)
1182 return -EINVAL;
1183
1184 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
1185 if (error)
1186 return error;
1187
1188 if (IS_IMMUTABLE(&ip->i_inode) || IS_APPEND(&ip->i_inode))
1189 error = -EPERM;
1190 else
1191 error = gfs2_ea_ops[er->er_type]->eo_remove(ip, er);
1192
1193 gfs2_glock_dq_uninit(&i_gh);
1194
1195 return error;
1196}
1197
1198static int ea_acl_chmod_unstuffed(struct gfs2_inode *ip,
1199 struct gfs2_ea_header *ea, char *data)
1200{
1201 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1202 struct buffer_head **bh;
1203 unsigned int amount = GFS2_EA_DATA_LEN(ea);
1204 unsigned int nptrs = DIV_ROUND_UP(amount, sdp->sd_jbsize);
1205 u64 *dataptrs = GFS2_EA2DATAPTRS(ea);
1206 unsigned int x;
1207 int error;
1208
1209 bh = kcalloc(nptrs, sizeof(struct buffer_head *), GFP_KERNEL);
1210 if (!bh)
1211 return -ENOMEM;
1212
1213 error = gfs2_trans_begin(sdp, nptrs + RES_DINODE, 0);
1214 if (error)
1215 goto out;
1216
1217 for (x = 0; x < nptrs; x++) {
1218 error = gfs2_meta_read(ip->i_gl, be64_to_cpu(*dataptrs), 0,
1219 bh + x);
1220 if (error) {
1221 while (x--)
1222 brelse(bh[x]);
1223 goto fail;
1224 }
1225 dataptrs++;
1226 }
1227
1228 for (x = 0; x < nptrs; x++) {
1229 error = gfs2_meta_wait(sdp, bh[x]);
1230 if (error) {
1231 for (; x < nptrs; x++)
1232 brelse(bh[x]);
1233 goto fail;
1234 }
1235 if (gfs2_metatype_check(sdp, bh[x], GFS2_METATYPE_ED)) {
1236 for (; x < nptrs; x++)
1237 brelse(bh[x]);
1238 error = -EIO;
1239 goto fail;
1240 }
1241
1242 gfs2_trans_add_bh(ip->i_gl, bh[x], 1);
1243
1244 memcpy(bh[x]->b_data + sizeof(struct gfs2_meta_header), data,
1245 (sdp->sd_jbsize > amount) ? amount : sdp->sd_jbsize);
1246
1247 amount -= sdp->sd_jbsize;
1248 data += sdp->sd_jbsize;
1249
1250 brelse(bh[x]);
1251 }
1252
1253out:
1254 kfree(bh);
1255 return error;
1256
1257fail:
1258 gfs2_trans_end(sdp);
1259 kfree(bh);
1260 return error;
1261}
1262
1263int gfs2_ea_acl_chmod(struct gfs2_inode *ip, struct gfs2_ea_location *el,
1264 struct iattr *attr, char *data)
1265{
1266 struct buffer_head *dibh;
1267 int error;
1268
1269 if (GFS2_EA_IS_STUFFED(el->el_ea)) {
1270 error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE + RES_EATTR, 0);
1271 if (error)
1272 return error;
1273
1274 gfs2_trans_add_bh(ip->i_gl, el->el_bh, 1);
1275 memcpy(GFS2_EA2DATA(el->el_ea), data,
1276 GFS2_EA_DATA_LEN(el->el_ea));
1277 } else
1278 error = ea_acl_chmod_unstuffed(ip, el->el_ea, data);
1279
1280 if (error)
1281 return error;
1282
1283 error = gfs2_meta_inode_buffer(ip, &dibh);
1284 if (!error) {
1285 error = inode_setattr(&ip->i_inode, attr);
1286 gfs2_assert_warn(GFS2_SB(&ip->i_inode), !error);
1287 gfs2_inode_attr_out(ip);
1288 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1289 gfs2_dinode_out(&ip->i_di, dibh->b_data);
1290 brelse(dibh);
1291 }
1292
1293 gfs2_trans_end(GFS2_SB(&ip->i_inode));
1294
1295 return error;
1296}
1297
1298static int ea_dealloc_indirect(struct gfs2_inode *ip)
1299{
1300 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1301 struct gfs2_rgrp_list rlist;
1302 struct buffer_head *indbh, *dibh;
1303 u64 *eablk, *end;
1304 unsigned int rg_blocks = 0;
1305 u64 bstart = 0;
1306 unsigned int blen = 0;
1307 unsigned int blks = 0;
1308 unsigned int x;
1309 int error;
1310
1311 memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));
1312
1313 error = gfs2_meta_read(ip->i_gl, ip->i_di.di_eattr, DIO_WAIT, &indbh);
1314 if (error)
1315 return error;
1316
1317 if (gfs2_metatype_check(sdp, indbh, GFS2_METATYPE_IN)) {
1318 error = -EIO;
1319 goto out;
1320 }
1321
1322 eablk = (u64 *)(indbh->b_data + sizeof(struct gfs2_meta_header));
1323 end = eablk + sdp->sd_inptrs;
1324
1325 for (; eablk < end; eablk++) {
1326 u64 bn;
1327
1328 if (!*eablk)
1329 break;
1330 bn = be64_to_cpu(*eablk);
1331
1332 if (bstart + blen == bn)
1333 blen++;
1334 else {
1335 if (bstart)
1336 gfs2_rlist_add(sdp, &rlist, bstart);
1337 bstart = bn;
1338 blen = 1;
1339 }
1340 blks++;
1341 }
1342 if (bstart)
1343 gfs2_rlist_add(sdp, &rlist, bstart);
1344 else
1345 goto out;
1346
1347 gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE, 0);
1348
1349 for (x = 0; x < rlist.rl_rgrps; x++) {
1350 struct gfs2_rgrpd *rgd;
1351 rgd = rlist.rl_ghs[x].gh_gl->gl_object;
1352 rg_blocks += rgd->rd_ri.ri_length;
1353 }
1354
1355 error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs);
1356 if (error)
1357 goto out_rlist_free;
1358
1359 error = gfs2_trans_begin(sdp, rg_blocks + RES_DINODE + RES_INDIRECT +
1360 RES_STATFS + RES_QUOTA, blks);
1361 if (error)
1362 goto out_gunlock;
1363
1364 gfs2_trans_add_bh(ip->i_gl, indbh, 1);
1365
1366 eablk = (u64 *)(indbh->b_data + sizeof(struct gfs2_meta_header));
1367 bstart = 0;
1368 blen = 0;
1369
1370 for (; eablk < end; eablk++) {
1371 u64 bn;
1372
1373 if (!*eablk)
1374 break;
1375 bn = be64_to_cpu(*eablk);
1376
1377 if (bstart + blen == bn)
1378 blen++;
1379 else {
1380 if (bstart)
1381 gfs2_free_meta(ip, bstart, blen);
1382 bstart = bn;
1383 blen = 1;
1384 }
1385
1386 *eablk = 0;
1387 if (!ip->i_di.di_blocks)
1388 gfs2_consist_inode(ip);
1389 ip->i_di.di_blocks--;
1390 }
1391 if (bstart)
1392 gfs2_free_meta(ip, bstart, blen);
1393
1394 ip->i_di.di_flags &= ~GFS2_DIF_EA_INDIRECT;
1395
1396 error = gfs2_meta_inode_buffer(ip, &dibh);
1397 if (!error) {
1398 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1399 gfs2_dinode_out(&ip->i_di, dibh->b_data);
1400 brelse(dibh);
1401 }
1402
1403 gfs2_trans_end(sdp);
1404
1405out_gunlock:
1406 gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs);
1407out_rlist_free:
1408 gfs2_rlist_free(&rlist);
1409out:
1410 brelse(indbh);
1411 return error;
1412}
1413
1414static int ea_dealloc_block(struct gfs2_inode *ip)
1415{
1416 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1417 struct gfs2_alloc *al = &ip->i_alloc;
1418 struct gfs2_rgrpd *rgd;
1419 struct buffer_head *dibh;
1420 int error;
1421
1422 rgd = gfs2_blk2rgrpd(sdp, ip->i_di.di_eattr);
1423 if (!rgd) {
1424 gfs2_consist_inode(ip);
1425 return -EIO;
1426 }
1427
1428 error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0,
1429 &al->al_rgd_gh);
1430 if (error)
1431 return error;
1432
1433 error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_DINODE + RES_STATFS +
1434 RES_QUOTA, 1);
1435 if (error)
1436 goto out_gunlock;
1437
1438 gfs2_free_meta(ip, ip->i_di.di_eattr, 1);
1439
1440 ip->i_di.di_eattr = 0;
1441 if (!ip->i_di.di_blocks)
1442 gfs2_consist_inode(ip);
1443 ip->i_di.di_blocks--;
1444
1445 error = gfs2_meta_inode_buffer(ip, &dibh);
1446 if (!error) {
1447 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1448 gfs2_dinode_out(&ip->i_di, dibh->b_data);
1449 brelse(dibh);
1450 }
1451
1452 gfs2_trans_end(sdp);
1453
1454out_gunlock:
1455 gfs2_glock_dq_uninit(&al->al_rgd_gh);
1456 return error;
1457}
1458
1459/**
1460 * gfs2_ea_dealloc - deallocate the extended attribute fork
1461 * @ip: the inode
1462 *
1463 * Returns: errno
1464 */
1465
1466int gfs2_ea_dealloc(struct gfs2_inode *ip)
1467{
1468 struct gfs2_alloc *al;
1469 int error;
1470
1471 al = gfs2_alloc_get(ip);
1472
1473 error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
1474 if (error)
1475 goto out_alloc;
1476
1477 error = gfs2_rindex_hold(GFS2_SB(&ip->i_inode), &al->al_ri_gh);
1478 if (error)
1479 goto out_quota;
1480
1481 error = ea_foreach(ip, ea_dealloc_unstuffed, NULL);
1482 if (error)
1483 goto out_rindex;
1484
1485 if (ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT) {
1486 error = ea_dealloc_indirect(ip);
1487 if (error)
1488 goto out_rindex;
1489 }
1490
1491 error = ea_dealloc_block(ip);
1492
1493out_rindex:
1494 gfs2_glock_dq_uninit(&al->al_ri_gh);
1495out_quota:
1496 gfs2_quota_unhold(ip);
1497out_alloc:
1498 gfs2_alloc_put(ip);
1499 return error;
1500}
1501
diff --git a/fs/gfs2/eattr.h b/fs/gfs2/eattr.h
new file mode 100644
index 000000000000..ffa65947d686
--- /dev/null
+++ b/fs/gfs2/eattr.h
@@ -0,0 +1,100 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __EATTR_DOT_H__
11#define __EATTR_DOT_H__
12
13struct gfs2_inode;
14struct iattr;
15
16#define GFS2_EA_REC_LEN(ea) be32_to_cpu((ea)->ea_rec_len)
17#define GFS2_EA_DATA_LEN(ea) be32_to_cpu((ea)->ea_data_len)
18
19#define GFS2_EA_SIZE(ea) \
20ALIGN(sizeof(struct gfs2_ea_header) + (ea)->ea_name_len + \
21 ((GFS2_EA_IS_STUFFED(ea)) ? GFS2_EA_DATA_LEN(ea) : \
22 (sizeof(u64) * (ea)->ea_num_ptrs)), 8)
23
24#define GFS2_EA_IS_STUFFED(ea) (!(ea)->ea_num_ptrs)
25#define GFS2_EA_IS_LAST(ea) ((ea)->ea_flags & GFS2_EAFLAG_LAST)
26
27#define GFS2_EAREQ_SIZE_STUFFED(er) \
28ALIGN(sizeof(struct gfs2_ea_header) + (er)->er_name_len + (er)->er_data_len, 8)
29
30#define GFS2_EAREQ_SIZE_UNSTUFFED(sdp, er) \
31ALIGN(sizeof(struct gfs2_ea_header) + (er)->er_name_len + \
32 sizeof(u64) * DIV_ROUND_UP((er)->er_data_len, (sdp)->sd_jbsize), 8)
33
34#define GFS2_EA2NAME(ea) ((char *)((struct gfs2_ea_header *)(ea) + 1))
35#define GFS2_EA2DATA(ea) (GFS2_EA2NAME(ea) + (ea)->ea_name_len)
36
37#define GFS2_EA2DATAPTRS(ea) \
38((u64 *)(GFS2_EA2NAME(ea) + ALIGN((ea)->ea_name_len, 8)))
39
40#define GFS2_EA2NEXT(ea) \
41((struct gfs2_ea_header *)((char *)(ea) + GFS2_EA_REC_LEN(ea)))
42
43#define GFS2_EA_BH2FIRST(bh) \
44((struct gfs2_ea_header *)((bh)->b_data + sizeof(struct gfs2_meta_header)))
45
46#define GFS2_ERF_MODE 0x80000000
47
48struct gfs2_ea_request {
49 const char *er_name;
50 char *er_data;
51 unsigned int er_name_len;
52 unsigned int er_data_len;
53 unsigned int er_type; /* GFS2_EATYPE_... */
54 int er_flags;
55 mode_t er_mode;
56};
57
58struct gfs2_ea_location {
59 struct buffer_head *el_bh;
60 struct gfs2_ea_header *el_ea;
61 struct gfs2_ea_header *el_prev;
62};
63
64int gfs2_ea_get_i(struct gfs2_inode *ip, struct gfs2_ea_request *er);
65int gfs2_ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er);
66int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er);
67
68int gfs2_ea_list(struct gfs2_inode *ip, struct gfs2_ea_request *er);
69int gfs2_ea_get(struct gfs2_inode *ip, struct gfs2_ea_request *er);
70int gfs2_ea_set(struct gfs2_inode *ip, struct gfs2_ea_request *er);
71int gfs2_ea_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er);
72
73int gfs2_ea_dealloc(struct gfs2_inode *ip);
74
75/* Exported to acl.c */
76
77int gfs2_ea_find(struct gfs2_inode *ip,
78 struct gfs2_ea_request *er,
79 struct gfs2_ea_location *el);
80int gfs2_ea_get_copy(struct gfs2_inode *ip,
81 struct gfs2_ea_location *el,
82 char *data);
83int gfs2_ea_acl_chmod(struct gfs2_inode *ip, struct gfs2_ea_location *el,
84 struct iattr *attr, char *data);
85
86static inline unsigned int gfs2_ea_strlen(struct gfs2_ea_header *ea)
87{
88 switch (ea->ea_type) {
89 case GFS2_EATYPE_USR:
90 return 5 + ea->ea_name_len + 1;
91 case GFS2_EATYPE_SYS:
92 return 7 + ea->ea_name_len + 1;
93 case GFS2_EATYPE_SECURITY:
94 return 9 + ea->ea_name_len + 1;
95 default:
96 return 0;
97 }
98}
99
100#endif /* __EATTR_DOT_H__ */
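/*
 * An illustrative sketch of the accessor macros above (the helper is
 * hypothetical and assumes the usual buffer_head/printk declarations):
 * the macros are enough to walk every record in one ea block, which is
 * essentially what ea_foreach_i() in eattr.c does.
 */
static void example_walk_ea_block(struct buffer_head *bh)
{
	struct gfs2_ea_header *ea;

	for (ea = GFS2_EA_BH2FIRST(bh); ; ea = GFS2_EA2NEXT(ea)) {
		if (ea->ea_type != GFS2_EATYPE_UNUSED)
			printk(KERN_DEBUG "ea: %.*s\n",
			       ea->ea_name_len, GFS2_EA2NAME(ea));
		if (GFS2_EA_IS_LAST(ea))
			break;
	}
}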
diff --git a/fs/gfs2/gfs2.h b/fs/gfs2/gfs2.h
new file mode 100644
index 000000000000..3bb11c0f8b56
--- /dev/null
+++ b/fs/gfs2/gfs2.h
@@ -0,0 +1,31 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __GFS2_DOT_H__
11#define __GFS2_DOT_H__
12
13enum {
14 NO_CREATE = 0,
15 CREATE = 1,
16};
17
18enum {
19 NO_WAIT = 0,
20 WAIT = 1,
21};
22
23enum {
24 NO_FORCE = 0,
25 FORCE = 1,
26};
27
28#define GFS2_FAST_NAME_SIZE 8
29
30#endif /* __GFS2_DOT_H__ */
31
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
new file mode 100644
index 000000000000..78fe0fae23ff
--- /dev/null
+++ b/fs/gfs2/glock.c
@@ -0,0 +1,2231 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/delay.h>
16#include <linux/sort.h>
17#include <linux/jhash.h>
18#include <linux/kallsyms.h>
19#include <linux/gfs2_ondisk.h>
20#include <linux/list.h>
21#include <linux/lm_interface.h>
22#include <asm/uaccess.h>
23
24#include "gfs2.h"
25#include "incore.h"
26#include "glock.h"
27#include "glops.h"
28#include "inode.h"
29#include "lm.h"
30#include "lops.h"
31#include "meta_io.h"
32#include "quota.h"
33#include "super.h"
34#include "util.h"
35
36struct greedy {
37 struct gfs2_holder gr_gh;
38 struct work_struct gr_work;
39};
40
41struct gfs2_gl_hash_bucket {
42 struct hlist_head hb_list;
43};
44
45typedef void (*glock_examiner) (struct gfs2_glock * gl);
46
47static int gfs2_dump_lockstate(struct gfs2_sbd *sdp);
48static int dump_glock(struct gfs2_glock *gl);
49static int dump_inode(struct gfs2_inode *ip);
50
51#define GFS2_GL_HASH_SHIFT 15
52#define GFS2_GL_HASH_SIZE (1 << GFS2_GL_HASH_SHIFT)
53#define GFS2_GL_HASH_MASK (GFS2_GL_HASH_SIZE - 1)
54
55static struct gfs2_gl_hash_bucket gl_hash_table[GFS2_GL_HASH_SIZE];
56
57/*
58 * Despite what you might think, the numbers below are not arbitrary :-)
59 * They are taken from the ipv4 routing hash code, which is well tested
60 * and thus should be nearly optimal. Later on we might tweak the numbers
61 * but for now this should be fine.
62 *
63 * The reason for putting the locks in a separate array from the list heads
64 * is that we can have fewer locks than list heads and save memory. We use
65 * the same hash function for both, but with a different hash mask.
66 */
67#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
68 defined(CONFIG_PROVE_LOCKING)
69
70#ifdef CONFIG_LOCKDEP
71# define GL_HASH_LOCK_SZ 256
72#else
73# if NR_CPUS >= 32
74# define GL_HASH_LOCK_SZ 4096
75# elif NR_CPUS >= 16
76# define GL_HASH_LOCK_SZ 2048
77# elif NR_CPUS >= 8
78# define GL_HASH_LOCK_SZ 1024
79# elif NR_CPUS >= 4
80# define GL_HASH_LOCK_SZ 512
81# else
82# define GL_HASH_LOCK_SZ 256
83# endif
84#endif
85
86/* We never want more locks than chains */
87#if GFS2_GL_HASH_SIZE < GL_HASH_LOCK_SZ
88# undef GL_HASH_LOCK_SZ
89# define GL_HASH_LOCK_SZ GFS2_GL_HASH_SIZE
90#endif
91
92static rwlock_t gl_hash_locks[GL_HASH_LOCK_SZ];
93
94static inline rwlock_t *gl_lock_addr(unsigned int x)
95{
96 return &gl_hash_locks[x & (GL_HASH_LOCK_SZ-1)];
97}
98#else /* not SMP, so no spinlocks required */
99static inline rwlock_t *gl_lock_addr(unsigned int x)
100{
101 return NULL;
102}
103#endif
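/*
 * Example of the striping above: with GFS2_GL_HASH_SIZE = 32768 and,
 * say, GL_HASH_LOCK_SZ = 256, gl_lock_addr() masks the bucket number
 * with 255, so each rwlock guards the 128 buckets whose low eight bits
 * match -- far fewer locks than chains, as intended.
 */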
104
105/**
106 * relaxed_state_ok - is a requested lock compatible with the current lock mode?
107 * @actual: the current state of the lock
108 * @requested: the lock state that was requested by the caller
109 * @flags: the modifier flags passed in by the caller
110 *
111 * Returns: 1 if the locks are compatible, 0 otherwise
112 */
113
114static inline int relaxed_state_ok(unsigned int actual, unsigned requested,
115 int flags)
116{
117 if (actual == requested)
118 return 1;
119
120 if (flags & GL_EXACT)
121 return 0;
122
123 if (actual == LM_ST_EXCLUSIVE && requested == LM_ST_SHARED)
124 return 1;
125
126 if (actual != LM_ST_UNLOCKED && (flags & LM_FLAG_ANY))
127 return 1;
128
129 return 0;
130}
131
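/*
 * For example, relaxed_state_ok() above lets a glock already held in
 * LM_ST_EXCLUSIVE satisfy a new LM_ST_SHARED request unless the caller
 * passed GL_EXACT, and lets any state other than LM_ST_UNLOCKED satisfy
 * a LM_FLAG_ANY request.
 */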
132/**
133 * gl_hash() - Turn glock number into hash bucket number
134 * @sdp: The GFS2 superblock
135 * @name: The lock name
136 * Returns: The number of the corresponding hash bucket
137 */
138
139static unsigned int gl_hash(const struct gfs2_sbd *sdp,
140 const struct lm_lockname *name)
141{
142 unsigned int h;
143
144 h = jhash(&name->ln_number, sizeof(u64), 0);
145 h = jhash(&name->ln_type, sizeof(unsigned int), h);
146 h = jhash(&sdp, sizeof(struct gfs2_sbd *), h);
147 h &= GFS2_GL_HASH_MASK;
148
149 return h;
150}
151
152/**
153 * glock_free() - Perform a few checks and then release struct gfs2_glock
154 * @gl: The glock to release
155 *
156 * Also calls lock module to release its internal structure for this glock.
157 *
158 */
159
160static void glock_free(struct gfs2_glock *gl)
161{
162 struct gfs2_sbd *sdp = gl->gl_sbd;
163 struct inode *aspace = gl->gl_aspace;
164
165 gfs2_lm_put_lock(sdp, gl->gl_lock);
166
167 if (aspace)
168 gfs2_aspace_put(aspace);
169
170 kmem_cache_free(gfs2_glock_cachep, gl);
171}
172
173/**
174 * gfs2_glock_hold() - increment reference count on glock
175 * @gl: The glock to hold
176 *
177 */
178
179void gfs2_glock_hold(struct gfs2_glock *gl)
180{
181 atomic_inc(&gl->gl_ref);
182}
183
184/**
185 * gfs2_glock_put() - Decrement reference count on glock
186 * @gl: The glock to put
187 *
188 */
189
190int gfs2_glock_put(struct gfs2_glock *gl)
191{
192 int rv = 0;
193 struct gfs2_sbd *sdp = gl->gl_sbd;
194
195 write_lock(gl_lock_addr(gl->gl_hash));
196 if (atomic_dec_and_test(&gl->gl_ref)) {
197 hlist_del(&gl->gl_list);
198 write_unlock(gl_lock_addr(gl->gl_hash));
199 BUG_ON(spin_is_locked(&gl->gl_spin));
200 gfs2_assert(sdp, gl->gl_state == LM_ST_UNLOCKED);
201 gfs2_assert(sdp, list_empty(&gl->gl_reclaim));
202 gfs2_assert(sdp, list_empty(&gl->gl_holders));
203 gfs2_assert(sdp, list_empty(&gl->gl_waiters1));
204 gfs2_assert(sdp, list_empty(&gl->gl_waiters2));
205 gfs2_assert(sdp, list_empty(&gl->gl_waiters3));
206 glock_free(gl);
207 rv = 1;
208 goto out;
209 }
210 write_unlock(gl_lock_addr(gl->gl_hash));
211out:
212 return rv;
213}
214
215/**
216 * queue_empty - check to see if a glock's queue is empty
217 * @gl: the glock
218 * @head: the head of the queue to check
219 *
220 * This function protects the list in the event that a process already
221 * has a holder on the list and is adding a second holder for itself.
222 * The glmutex lock is what generally prevents processes from working
223 * on the same glock at once, but the special case of adding a second
224 * holder for yourself ("recursive" locking) doesn't involve locking
225 * glmutex, making the spin lock necessary.
226 *
227 * Returns: 1 if the queue is empty
228 */
229
230static inline int queue_empty(struct gfs2_glock *gl, struct list_head *head)
231{
232 int empty;
233 spin_lock(&gl->gl_spin);
234 empty = list_empty(head);
235 spin_unlock(&gl->gl_spin);
236 return empty;
237}
238
239/**
240 * search_bucket() - Find struct gfs2_glock by lock number
241 * @hash: the hash bucket number to search
 * @sdp: The GFS2 superblock
242 * @name: The lock name
243 *
244 * Returns: NULL, or the struct gfs2_glock with the requested number
245 */
246
247static struct gfs2_glock *search_bucket(unsigned int hash,
248 const struct gfs2_sbd *sdp,
249 const struct lm_lockname *name)
250{
251 struct gfs2_glock *gl;
252 struct hlist_node *h;
253
254 hlist_for_each_entry(gl, h, &gl_hash_table[hash].hb_list, gl_list) {
255 if (!lm_name_equal(&gl->gl_name, name))
256 continue;
257 if (gl->gl_sbd != sdp)
258 continue;
259
260 atomic_inc(&gl->gl_ref);
261
262 return gl;
263 }
264
265 return NULL;
266}
267
268/**
269 * gfs2_glock_find() - Find glock by lock number
270 * @sdp: The GFS2 superblock
271 * @name: The lock name
272 *
273 * Returns: NULL, or the struct gfs2_glock with the requested number
274 */
275
276static struct gfs2_glock *gfs2_glock_find(const struct gfs2_sbd *sdp,
277 const struct lm_lockname *name)
278{
279 unsigned int hash = gl_hash(sdp, name);
280 struct gfs2_glock *gl;
281
282 read_lock(gl_lock_addr(hash));
283 gl = search_bucket(hash, sdp, name);
284 read_unlock(gl_lock_addr(hash));
285
286 return gl;
287}
288
289/**
290 * gfs2_glock_get() - Get a glock, or create one if one doesn't exist
291 * @sdp: The GFS2 superblock
292 * @number: the lock number
293 * @glops: The glock_operations to use
294 * @create: If 0, don't create the glock if it doesn't exist
295 * @glp: the glock is returned here
296 *
297 * This does not lock a glock, just finds/creates structures for one.
298 *
299 * Returns: errno
300 */
301
302int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
303 const struct gfs2_glock_operations *glops, int create,
304 struct gfs2_glock **glp)
305{
306 struct lm_lockname name = { .ln_number = number, .ln_type = glops->go_type };
307 struct gfs2_glock *gl, *tmp;
308 unsigned int hash = gl_hash(sdp, &name);
309 int error;
310
311 read_lock(gl_lock_addr(hash));
312 gl = search_bucket(hash, sdp, &name);
313 read_unlock(gl_lock_addr(hash));
314
315 if (gl || !create) {
316 *glp = gl;
317 return 0;
318 }
319
320 gl = kmem_cache_alloc(gfs2_glock_cachep, GFP_KERNEL);
321 if (!gl)
322 return -ENOMEM;
323
324 gl->gl_flags = 0;
325 gl->gl_name = name;
326 atomic_set(&gl->gl_ref, 1);
327 gl->gl_state = LM_ST_UNLOCKED;
328 gl->gl_hash = hash;
329 gl->gl_owner = NULL;
330 gl->gl_ip = 0;
331 gl->gl_ops = glops;
332 gl->gl_req_gh = NULL;
333 gl->gl_req_bh = NULL;
334 gl->gl_vn = 0;
335 gl->gl_stamp = jiffies;
336 gl->gl_object = NULL;
337 gl->gl_sbd = sdp;
338 gl->gl_aspace = NULL;
339 lops_init_le(&gl->gl_le, &gfs2_glock_lops);
340
341 /* If this glock protects actual on-disk data or metadata blocks,
342 create a VFS inode to manage the pages/buffers holding them. */
343 if (glops == &gfs2_inode_glops || glops == &gfs2_rgrp_glops) {
344 gl->gl_aspace = gfs2_aspace_get(sdp);
345 if (!gl->gl_aspace) {
346 error = -ENOMEM;
347 goto fail;
348 }
349 }
350
351 error = gfs2_lm_get_lock(sdp, &name, &gl->gl_lock);
352 if (error)
353 goto fail_aspace;
354
355 write_lock(gl_lock_addr(hash));
356 tmp = search_bucket(hash, sdp, &name);
357 if (tmp) {
358 write_unlock(gl_lock_addr(hash));
359 glock_free(gl);
360 gl = tmp;
361 } else {
362 hlist_add_head(&gl->gl_list, &gl_hash_table[hash].hb_list);
363 write_unlock(gl_lock_addr(hash));
364 }
365
366 *glp = gl;
367
368 return 0;
369
370fail_aspace:
371 if (gl->gl_aspace)
372 gfs2_aspace_put(gl->gl_aspace);
373fail:
374 kmem_cache_free(gfs2_glock_cachep, gl);
375 return error;
376}
377
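/*
 * gfs2_glock_get() above follows the classic optimistic-creation
 * pattern: search under the read lock, allocate and initialize with no
 * lock held, then re-search under the write lock and free the new
 * glock if another CPU inserted one first.
 */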
378/**
379 * gfs2_holder_init - initialize a struct gfs2_holder in the default way
380 * @gl: the glock
381 * @state: the state we're requesting
382 * @flags: the modifier flags
383 * @gh: the holder structure
384 *
385 */
386
387void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags,
388 struct gfs2_holder *gh)
389{
390 INIT_LIST_HEAD(&gh->gh_list);
391 gh->gh_gl = gl;
392 gh->gh_ip = (unsigned long)__builtin_return_address(0);
393 gh->gh_owner = current;
394 gh->gh_state = state;
395 gh->gh_flags = flags;
396 gh->gh_error = 0;
397 gh->gh_iflags = 0;
398 init_completion(&gh->gh_wait);
399
400 if (gh->gh_state == LM_ST_EXCLUSIVE)
401 gh->gh_flags |= GL_LOCAL_EXCL;
402
403 gfs2_glock_hold(gl);
404}
405
406/**
407 * gfs2_holder_reinit - reinitialize a struct gfs2_holder so we can requeue it
408 * @state: the state we're requesting
409 * @flags: the modifier flags
410 * @gh: the holder structure
411 *
412 * Don't mess with the glock.
413 *
414 */
415
416void gfs2_holder_reinit(unsigned int state, unsigned flags, struct gfs2_holder *gh)
417{
418 gh->gh_state = state;
419 gh->gh_flags = flags;
420 if (gh->gh_state == LM_ST_EXCLUSIVE)
421 gh->gh_flags |= GL_LOCAL_EXCL;
422
423 gh->gh_iflags &= 1 << HIF_ALLOCED;
424 gh->gh_ip = (unsigned long)__builtin_return_address(0);
425}
426
427/**
428 * gfs2_holder_uninit - uninitialize a holder structure (drop glock reference)
429 * @gh: the holder structure
430 *
431 */
432
433void gfs2_holder_uninit(struct gfs2_holder *gh)
434{
435 gfs2_glock_put(gh->gh_gl);
436 gh->gh_gl = NULL;
437 gh->gh_ip = 0;
438}
439
440/**
441 * gfs2_holder_get - get a struct gfs2_holder structure
442 * @gl: the glock
443 * @state: the state we're requesting
444 * @flags: the modifier flags
445 * @gfp_flags: the memory allocation flags to use
446 *
447 * Figure out how big an impact this function has. Either:
448 * 1) Replace it with a cache of structures hanging off the struct gfs2_sbd
449 * 2) Leave it like it is
450 *
451 * Returns: the holder structure, NULL on ENOMEM
452 */
453
454static struct gfs2_holder *gfs2_holder_get(struct gfs2_glock *gl,
455 unsigned int state,
456 int flags, gfp_t gfp_flags)
457{
458 struct gfs2_holder *gh;
459
460 gh = kmalloc(sizeof(struct gfs2_holder), gfp_flags);
461 if (!gh)
462 return NULL;
463
464 gfs2_holder_init(gl, state, flags, gh);
465 set_bit(HIF_ALLOCED, &gh->gh_iflags);
466 gh->gh_ip = (unsigned long)__builtin_return_address(0);
467 return gh;
468}
469
470/**
471 * gfs2_holder_put - get rid of a struct gfs2_holder structure
472 * @gh: the holder structure
473 *
474 */
475
476static void gfs2_holder_put(struct gfs2_holder *gh)
477{
478 gfs2_holder_uninit(gh);
479 kfree(gh);
480}
481
482/**
483 * rq_mutex - process a mutex request in the queue
484 * @gh: the glock holder
485 *
486 * Returns: 1 if the queue is blocked
487 */
488
489static int rq_mutex(struct gfs2_holder *gh)
490{
491 struct gfs2_glock *gl = gh->gh_gl;
492
493 list_del_init(&gh->gh_list);
494 /* gh->gh_error never examined. */
495 set_bit(GLF_LOCK, &gl->gl_flags);
496 complete(&gh->gh_wait);
497
498 return 1;
499}
500
501/**
502 * rq_promote - process a promote request in the queue
503 * @gh: the glock holder
504 *
505 * Acquire a new inter-node lock, or change a lock state to more restrictive.
506 *
507 * Returns: 1 if the queue is blocked
508 */
509
510static int rq_promote(struct gfs2_holder *gh)
511{
512 struct gfs2_glock *gl = gh->gh_gl;
513 struct gfs2_sbd *sdp = gl->gl_sbd;
514 const struct gfs2_glock_operations *glops = gl->gl_ops;
515
516 if (!relaxed_state_ok(gl->gl_state, gh->gh_state, gh->gh_flags)) {
517 if (list_empty(&gl->gl_holders)) {
518 gl->gl_req_gh = gh;
519 set_bit(GLF_LOCK, &gl->gl_flags);
520 spin_unlock(&gl->gl_spin);
521
522 if (atomic_read(&sdp->sd_reclaim_count) >
523 gfs2_tune_get(sdp, gt_reclaim_limit) &&
524 !(gh->gh_flags & LM_FLAG_PRIORITY)) {
525 gfs2_reclaim_glock(sdp);
526 gfs2_reclaim_glock(sdp);
527 }
528
529 glops->go_xmote_th(gl, gh->gh_state, gh->gh_flags);
530 spin_lock(&gl->gl_spin);
531 }
532 return 1;
533 }
534
535 if (list_empty(&gl->gl_holders)) {
536 set_bit(HIF_FIRST, &gh->gh_iflags);
537 set_bit(GLF_LOCK, &gl->gl_flags);
538 } else {
539 struct gfs2_holder *next_gh;
540 if (gh->gh_flags & GL_LOCAL_EXCL)
541 return 1;
542 next_gh = list_entry(gl->gl_holders.next, struct gfs2_holder,
543 gh_list);
544 if (next_gh->gh_flags & GL_LOCAL_EXCL)
545 return 1;
546 }
547
548 list_move_tail(&gh->gh_list, &gl->gl_holders);
549 gh->gh_error = 0;
550 set_bit(HIF_HOLDER, &gh->gh_iflags);
551
552 complete(&gh->gh_wait);
553
554 return 0;
555}
556
557/**
558 * rq_demote - process a demote request in the queue
559 * @gh: the glock holder
560 *
561 * Returns: 1 if the queue is blocked
562 */
563
564static int rq_demote(struct gfs2_holder *gh)
565{
566 struct gfs2_glock *gl = gh->gh_gl;
567 const struct gfs2_glock_operations *glops = gl->gl_ops;
568
569 if (!list_empty(&gl->gl_holders))
570 return 1;
571
572 if (gl->gl_state == gh->gh_state || gl->gl_state == LM_ST_UNLOCKED) {
573 list_del_init(&gh->gh_list);
574 gh->gh_error = 0;
575 spin_unlock(&gl->gl_spin);
576 if (test_bit(HIF_DEALLOC, &gh->gh_iflags))
577 gfs2_holder_put(gh);
578 else
579 complete(&gh->gh_wait);
580 spin_lock(&gl->gl_spin);
581 } else {
582 gl->gl_req_gh = gh;
583 set_bit(GLF_LOCK, &gl->gl_flags);
584 spin_unlock(&gl->gl_spin);
585
586 if (gh->gh_state == LM_ST_UNLOCKED ||
587 gl->gl_state != LM_ST_EXCLUSIVE)
588 glops->go_drop_th(gl);
589 else
590 glops->go_xmote_th(gl, gh->gh_state, gh->gh_flags);
591
592 spin_lock(&gl->gl_spin);
593 }
594
595 return 0;
596}
597
598/**
599 * rq_greedy - process a queued request to drop greedy status
600 * @gh: the glock holder
601 *
602 * Returns: 0 (a greedy request never blocks the queue)
603 */
604
605static int rq_greedy(struct gfs2_holder *gh)
606{
607 struct gfs2_glock *gl = gh->gh_gl;
608
609 list_del_init(&gh->gh_list);
610 /* gh->gh_error never examined. */
611 clear_bit(GLF_GREEDY, &gl->gl_flags);
612 spin_unlock(&gl->gl_spin);
613
614 gfs2_holder_uninit(gh);
615 kfree(container_of(gh, struct greedy, gr_gh));
616
617 spin_lock(&gl->gl_spin);
618
619 return 0;
620}
621
622/**
623 * run_queue - process holder structures on a glock
624 * @gl: the glock
625 *
626 */
627static void run_queue(struct gfs2_glock *gl)
628{
629 struct gfs2_holder *gh;
630 int blocked = 1;
631
632 for (;;) {
633 if (test_bit(GLF_LOCK, &gl->gl_flags))
634 break;
635
636 if (!list_empty(&gl->gl_waiters1)) {
637 gh = list_entry(gl->gl_waiters1.next,
638 struct gfs2_holder, gh_list);
639
640 if (test_bit(HIF_MUTEX, &gh->gh_iflags))
641 blocked = rq_mutex(gh);
642 else
643 gfs2_assert_warn(gl->gl_sbd, 0);
644
645 } else if (!list_empty(&gl->gl_waiters2) &&
646 !test_bit(GLF_SKIP_WAITERS2, &gl->gl_flags)) {
647 gh = list_entry(gl->gl_waiters2.next,
648 struct gfs2_holder, gh_list);
649
650 if (test_bit(HIF_DEMOTE, &gh->gh_iflags))
651 blocked = rq_demote(gh);
652 else if (test_bit(HIF_GREEDY, &gh->gh_iflags))
653 blocked = rq_greedy(gh);
654 else
655 gfs2_assert_warn(gl->gl_sbd, 0);
656
657 } else if (!list_empty(&gl->gl_waiters3)) {
658 gh = list_entry(gl->gl_waiters3.next,
659 struct gfs2_holder, gh_list);
660
661 if (test_bit(HIF_PROMOTE, &gh->gh_iflags))
662 blocked = rq_promote(gh);
663 else
664 gfs2_assert_warn(gl->gl_sbd, 0);
665
666 } else
667 break;
668
669 if (blocked)
670 break;
671 }
672}
673
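/*
 * run_queue() above services the three waiter lists in strict priority
 * order: gl_waiters1 (glmutex requests), then gl_waiters2 (demote and
 * greedy requests, unless GLF_SKIP_WAITERS2 is set), then gl_waiters3
 * (promote requests), stopping as soon as a request blocks the queue.
 */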
674/**
675 * gfs2_glmutex_lock - acquire a local lock on a glock
676 * @gl: the glock
677 *
678 * Gives caller exclusive access to manipulate a glock structure.
679 */
680
681static void gfs2_glmutex_lock(struct gfs2_glock *gl)
682{
683 struct gfs2_holder gh;
684
685 gfs2_holder_init(gl, 0, 0, &gh);
686 set_bit(HIF_MUTEX, &gh.gh_iflags);
687
688 spin_lock(&gl->gl_spin);
689 if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) {
690 list_add_tail(&gh.gh_list, &gl->gl_waiters1);
691 } else {
692 gl->gl_owner = current;
693 gl->gl_ip = (unsigned long)__builtin_return_address(0);
694 complete(&gh.gh_wait);
695 }
696 spin_unlock(&gl->gl_spin);
697
698 wait_for_completion(&gh.gh_wait);
699 gfs2_holder_uninit(&gh);
700}
701
702/**
703 * gfs2_glmutex_trylock - try to acquire a local lock on a glock
704 * @gl: the glock
705 *
706 * Returns: 1 if the glock is acquired
707 */
708
709static int gfs2_glmutex_trylock(struct gfs2_glock *gl)
710{
711 int acquired = 1;
712
713 spin_lock(&gl->gl_spin);
714 if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) {
715 acquired = 0;
716 } else {
717 gl->gl_owner = current;
718 gl->gl_ip = (unsigned long)__builtin_return_address(0);
719 }
720 spin_unlock(&gl->gl_spin);
721
722 return acquired;
723}
724
725/**
726 * gfs2_glmutex_unlock - release a local lock on a glock
727 * @gl: the glock
728 *
729 */
730
731static void gfs2_glmutex_unlock(struct gfs2_glock *gl)
732{
733 spin_lock(&gl->gl_spin);
734 clear_bit(GLF_LOCK, &gl->gl_flags);
735 gl->gl_owner = NULL;
736 gl->gl_ip = 0;
737 run_queue(gl);
738 BUG_ON(!spin_is_locked(&gl->gl_spin));
739 spin_unlock(&gl->gl_spin);
740}
741
742/**
743 * handle_callback - add a demote request to a lock's queue
744 * @gl: the glock
745 * @state: the state the caller wants us to change to
746 *
747 * Note: This may fail silently if we are out of memory.
748 */
749
750static void handle_callback(struct gfs2_glock *gl, unsigned int state)
751{
752 struct gfs2_holder *gh, *new_gh = NULL;
753
754restart:
755 spin_lock(&gl->gl_spin);
756
757 list_for_each_entry(gh, &gl->gl_waiters2, gh_list) {
758 if (test_bit(HIF_DEMOTE, &gh->gh_iflags) &&
759 gl->gl_req_gh != gh) {
760 if (gh->gh_state != state)
761 gh->gh_state = LM_ST_UNLOCKED;
762 goto out;
763 }
764 }
765
766 if (new_gh) {
767 list_add_tail(&new_gh->gh_list, &gl->gl_waiters2);
768 new_gh = NULL;
769 } else {
770 spin_unlock(&gl->gl_spin);
771
772 new_gh = gfs2_holder_get(gl, state, LM_FLAG_TRY, GFP_KERNEL);
773 if (!new_gh)
774 return;
775 set_bit(HIF_DEMOTE, &new_gh->gh_iflags);
776 set_bit(HIF_DEALLOC, &new_gh->gh_iflags);
777
778 goto restart;
779 }
780
781out:
782 spin_unlock(&gl->gl_spin);
783
784 if (new_gh)
785 gfs2_holder_put(new_gh);
786}
787
788void gfs2_glock_inode_squish(struct inode *inode)
789{
790 struct gfs2_holder gh;
791 struct gfs2_glock *gl = GFS2_I(inode)->i_gl;
792 gfs2_holder_init(gl, LM_ST_UNLOCKED, 0, &gh);
793 set_bit(HIF_DEMOTE, &gh.gh_iflags);
794 spin_lock(&gl->gl_spin);
795 gfs2_assert(inode->i_sb->s_fs_info, list_empty(&gl->gl_holders));
796 list_add_tail(&gh.gh_list, &gl->gl_waiters2);
797 run_queue(gl);
798 spin_unlock(&gl->gl_spin);
799 wait_for_completion(&gh.gh_wait);
800 gfs2_holder_uninit(&gh);
801}
802
803/**
804 * state_change - record that the glock is now in a different state
805 * @gl: the glock
806 * @new_state: the new state
807 *
808 */
809
810static void state_change(struct gfs2_glock *gl, unsigned int new_state)
811{
812 int held1, held2;
813
814 held1 = (gl->gl_state != LM_ST_UNLOCKED);
815 held2 = (new_state != LM_ST_UNLOCKED);
816
817 if (held1 != held2) {
818 if (held2)
819 gfs2_glock_hold(gl);
820 else
821 gfs2_glock_put(gl);
822 }
823
824 gl->gl_state = new_state;
825}
826
827/**
828 * xmote_bh - Called after the lock module is done acquiring a lock
829 * @gl: The glock in question
830 * @ret: the int returned from the lock module
831 *
832 */
833
834static void xmote_bh(struct gfs2_glock *gl, unsigned int ret)
835{
836 struct gfs2_sbd *sdp = gl->gl_sbd;
837 const struct gfs2_glock_operations *glops = gl->gl_ops;
838 struct gfs2_holder *gh = gl->gl_req_gh;
839 int prev_state = gl->gl_state;
840 int op_done = 1;
841
842 gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
843 gfs2_assert_warn(sdp, queue_empty(gl, &gl->gl_holders));
844 gfs2_assert_warn(sdp, !(ret & LM_OUT_ASYNC));
845
846 state_change(gl, ret & LM_OUT_ST_MASK);
847
848 if (prev_state != LM_ST_UNLOCKED && !(ret & LM_OUT_CACHEABLE)) {
849 if (glops->go_inval)
850 glops->go_inval(gl, DIO_METADATA | DIO_DATA);
851 } else if (gl->gl_state == LM_ST_DEFERRED) {
852 /* We might not want to do this here.
853 Look at moving to the inode glops. */
854 if (glops->go_inval)
855 glops->go_inval(gl, DIO_DATA);
856 }
857
858 /* Deal with each possible exit condition */
859
860 if (!gh)
861 gl->gl_stamp = jiffies;
862 else if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) {
863 spin_lock(&gl->gl_spin);
864 list_del_init(&gh->gh_list);
865 gh->gh_error = -EIO;
866 spin_unlock(&gl->gl_spin);
867 } else if (test_bit(HIF_DEMOTE, &gh->gh_iflags)) {
868 spin_lock(&gl->gl_spin);
869 list_del_init(&gh->gh_list);
870 if (gl->gl_state == gh->gh_state ||
871 gl->gl_state == LM_ST_UNLOCKED) {
872 gh->gh_error = 0;
873 } else {
874 if (gfs2_assert_warn(sdp, gh->gh_flags &
875 (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) == -1)
876 fs_warn(sdp, "ret = 0x%.8X\n", ret);
877 gh->gh_error = GLR_TRYFAILED;
878 }
879 spin_unlock(&gl->gl_spin);
880
881 if (ret & LM_OUT_CANCELED)
882 handle_callback(gl, LM_ST_UNLOCKED);
883
884 } else if (ret & LM_OUT_CANCELED) {
885 spin_lock(&gl->gl_spin);
886 list_del_init(&gh->gh_list);
887 gh->gh_error = GLR_CANCELED;
888 spin_unlock(&gl->gl_spin);
889
890 } else if (relaxed_state_ok(gl->gl_state, gh->gh_state, gh->gh_flags)) {
891 spin_lock(&gl->gl_spin);
892 list_move_tail(&gh->gh_list, &gl->gl_holders);
893 gh->gh_error = 0;
894 set_bit(HIF_HOLDER, &gh->gh_iflags);
895 spin_unlock(&gl->gl_spin);
896
897 set_bit(HIF_FIRST, &gh->gh_iflags);
898
899 op_done = 0;
900
901 } else if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) {
902 spin_lock(&gl->gl_spin);
903 list_del_init(&gh->gh_list);
904 gh->gh_error = GLR_TRYFAILED;
905 spin_unlock(&gl->gl_spin);
906
907 } else {
908 if (gfs2_assert_withdraw(sdp, 0) == -1)
909 fs_err(sdp, "ret = 0x%.8X\n", ret);
910 }
911
912 if (glops->go_xmote_bh)
913 glops->go_xmote_bh(gl);
914
915 if (op_done) {
916 spin_lock(&gl->gl_spin);
917 gl->gl_req_gh = NULL;
918 gl->gl_req_bh = NULL;
919 clear_bit(GLF_LOCK, &gl->gl_flags);
920 run_queue(gl);
921 spin_unlock(&gl->gl_spin);
922 }
923
924 gfs2_glock_put(gl);
925
926 if (gh) {
927 if (test_bit(HIF_DEALLOC, &gh->gh_iflags))
928 gfs2_holder_put(gh);
929 else
930 complete(&gh->gh_wait);
931 }
932}
933
934/**
935 * gfs2_glock_xmote_th - Call into the lock module to acquire or change a glock
936 * @gl: The glock in question
937 * @state: the requested state
938 * @flags: modifier flags to the lock call
939 *
940 */
941
942void gfs2_glock_xmote_th(struct gfs2_glock *gl, unsigned int state, int flags)
943{
944 struct gfs2_sbd *sdp = gl->gl_sbd;
945 const struct gfs2_glock_operations *glops = gl->gl_ops;
946 int lck_flags = flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB |
947 LM_FLAG_NOEXP | LM_FLAG_ANY |
948 LM_FLAG_PRIORITY);
949 unsigned int lck_ret;
950
951 gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
952 gfs2_assert_warn(sdp, queue_empty(gl, &gl->gl_holders));
953 gfs2_assert_warn(sdp, state != LM_ST_UNLOCKED);
954 gfs2_assert_warn(sdp, state != gl->gl_state);
955
956 if (gl->gl_state == LM_ST_EXCLUSIVE && glops->go_sync)
957 glops->go_sync(gl, DIO_METADATA | DIO_DATA | DIO_RELEASE);
958
959 gfs2_glock_hold(gl);
960 gl->gl_req_bh = xmote_bh;
961
962 lck_ret = gfs2_lm_lock(sdp, gl->gl_lock, gl->gl_state, state, lck_flags);
963
964 if (gfs2_assert_withdraw(sdp, !(lck_ret & LM_OUT_ERROR)))
965 return;
966
967 if (lck_ret & LM_OUT_ASYNC)
968 gfs2_assert_warn(sdp, lck_ret == LM_OUT_ASYNC);
969 else
970 xmote_bh(gl, lck_ret);
971}
972
973/**
974 * drop_bh - Called after a lock module unlock completes
975 * @gl: the glock
976 * @ret: the return status
977 *
978 * Doesn't wake up the process waiting on the struct gfs2_holder (if any)
979 * Doesn't drop the reference on the glock the top half took out
980 *
981 */
982
983static void drop_bh(struct gfs2_glock *gl, unsigned int ret)
984{
985 struct gfs2_sbd *sdp = gl->gl_sbd;
986 const struct gfs2_glock_operations *glops = gl->gl_ops;
987 struct gfs2_holder *gh = gl->gl_req_gh;
988
989 clear_bit(GLF_PREFETCH, &gl->gl_flags);
990
991 gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
992 gfs2_assert_warn(sdp, queue_empty(gl, &gl->gl_holders));
993 gfs2_assert_warn(sdp, !ret);
994
995 state_change(gl, LM_ST_UNLOCKED);
996
997 if (glops->go_inval)
998 glops->go_inval(gl, DIO_METADATA | DIO_DATA);
999
1000 if (gh) {
1001 spin_lock(&gl->gl_spin);
1002 list_del_init(&gh->gh_list);
1003 gh->gh_error = 0;
1004 spin_unlock(&gl->gl_spin);
1005 }
1006
1007 if (glops->go_drop_bh)
1008 glops->go_drop_bh(gl);
1009
1010 spin_lock(&gl->gl_spin);
1011 gl->gl_req_gh = NULL;
1012 gl->gl_req_bh = NULL;
1013 clear_bit(GLF_LOCK, &gl->gl_flags);
1014 run_queue(gl);
1015 spin_unlock(&gl->gl_spin);
1016
1017 gfs2_glock_put(gl);
1018
1019 if (gh) {
1020 if (test_bit(HIF_DEALLOC, &gh->gh_iflags))
1021 gfs2_holder_put(gh);
1022 else
1023 complete(&gh->gh_wait);
1024 }
1025}
1026
1027/**
1028 * gfs2_glock_drop_th - call into the lock module to unlock a lock
1029 * @gl: the glock
1030 *
1031 */
1032
1033void gfs2_glock_drop_th(struct gfs2_glock *gl)
1034{
1035 struct gfs2_sbd *sdp = gl->gl_sbd;
1036 const struct gfs2_glock_operations *glops = gl->gl_ops;
1037 unsigned int ret;
1038
1039 gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
1040 gfs2_assert_warn(sdp, queue_empty(gl, &gl->gl_holders));
1041 gfs2_assert_warn(sdp, gl->gl_state != LM_ST_UNLOCKED);
1042
1043 if (gl->gl_state == LM_ST_EXCLUSIVE && glops->go_sync)
1044 glops->go_sync(gl, DIO_METADATA | DIO_DATA | DIO_RELEASE);
1045
1046 gfs2_glock_hold(gl);
1047 gl->gl_req_bh = drop_bh;
1048
1049 ret = gfs2_lm_unlock(sdp, gl->gl_lock, gl->gl_state);
1050
1051 if (gfs2_assert_withdraw(sdp, !(ret & LM_OUT_ERROR)))
1052 return;
1053
1054 if (!ret)
1055 drop_bh(gl, ret);
1056 else
1057 gfs2_assert_warn(sdp, ret == LM_OUT_ASYNC);
1058}
1059
1060/**
1061 * do_cancels - cancel requests for locks stuck waiting on an expire flag
1062 * @gh: the LM_FLAG_PRIORITY holder waiting to acquire the lock
1063 *
1064 * Don't cancel GL_NOCANCEL requests.
1065 */
1066
1067static void do_cancels(struct gfs2_holder *gh)
1068{
1069 struct gfs2_glock *gl = gh->gh_gl;
1070
1071 spin_lock(&gl->gl_spin);
1072
1073 while (gl->gl_req_gh != gh &&
1074 !test_bit(HIF_HOLDER, &gh->gh_iflags) &&
1075 !list_empty(&gh->gh_list)) {
1076 if (gl->gl_req_bh && !(gl->gl_req_gh &&
1077 (gl->gl_req_gh->gh_flags & GL_NOCANCEL))) {
1078 spin_unlock(&gl->gl_spin);
1079 gfs2_lm_cancel(gl->gl_sbd, gl->gl_lock);
1080 msleep(100);
1081 spin_lock(&gl->gl_spin);
1082 } else {
1083 spin_unlock(&gl->gl_spin);
1084 msleep(100);
1085 spin_lock(&gl->gl_spin);
1086 }
1087 }
1088
1089 spin_unlock(&gl->gl_spin);
1090}
1091
1092/**
1093 * glock_wait_internal - wait on a glock acquisition
1094 * @gh: the glock holder
1095 *
1096 * Returns: 0 on success
1097 */
1098
1099static int glock_wait_internal(struct gfs2_holder *gh)
1100{
1101 struct gfs2_glock *gl = gh->gh_gl;
1102 struct gfs2_sbd *sdp = gl->gl_sbd;
1103 const struct gfs2_glock_operations *glops = gl->gl_ops;
1104
1105 if (test_bit(HIF_ABORTED, &gh->gh_iflags))
1106 return -EIO;
1107
1108 if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) {
1109 spin_lock(&gl->gl_spin);
1110 if (gl->gl_req_gh != gh &&
1111 !test_bit(HIF_HOLDER, &gh->gh_iflags) &&
1112 !list_empty(&gh->gh_list)) {
1113 list_del_init(&gh->gh_list);
1114 gh->gh_error = GLR_TRYFAILED;
1115 run_queue(gl);
1116 spin_unlock(&gl->gl_spin);
1117 return gh->gh_error;
1118 }
1119 spin_unlock(&gl->gl_spin);
1120 }
1121
1122 if (gh->gh_flags & LM_FLAG_PRIORITY)
1123 do_cancels(gh);
1124
1125 wait_for_completion(&gh->gh_wait);
1126
1127 if (gh->gh_error)
1128 return gh->gh_error;
1129
1130 gfs2_assert_withdraw(sdp, test_bit(HIF_HOLDER, &gh->gh_iflags));
1131 gfs2_assert_withdraw(sdp, relaxed_state_ok(gl->gl_state, gh->gh_state,
1132 gh->gh_flags));
1133
1134 if (test_bit(HIF_FIRST, &gh->gh_iflags)) {
1135 gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
1136
1137 if (glops->go_lock) {
1138 gh->gh_error = glops->go_lock(gh);
1139 if (gh->gh_error) {
1140 spin_lock(&gl->gl_spin);
1141 list_del_init(&gh->gh_list);
1142 spin_unlock(&gl->gl_spin);
1143 }
1144 }
1145
1146 spin_lock(&gl->gl_spin);
1147 gl->gl_req_gh = NULL;
1148 gl->gl_req_bh = NULL;
1149 clear_bit(GLF_LOCK, &gl->gl_flags);
1150 run_queue(gl);
1151 spin_unlock(&gl->gl_spin);
1152 }
1153
1154 return gh->gh_error;
1155}
1156
1157static inline struct gfs2_holder *
1158find_holder_by_owner(struct list_head *head, struct task_struct *owner)
1159{
1160 struct gfs2_holder *gh;
1161
1162 list_for_each_entry(gh, head, gh_list) {
1163 if (gh->gh_owner == owner)
1164 return gh;
1165 }
1166
1167 return NULL;
1168}
1169
1170/**
1171 * add_to_queue - Add a holder to the wait queue (but look for recursion)
1172 * @gh: the holder structure to add
1173 *
1174 */
1175
1176static void add_to_queue(struct gfs2_holder *gh)
1177{
1178 struct gfs2_glock *gl = gh->gh_gl;
1179 struct gfs2_holder *existing;
1180
1181 BUG_ON(!gh->gh_owner);
1182
1183 existing = find_holder_by_owner(&gl->gl_holders, gh->gh_owner);
1184 if (existing) {
1185 print_symbol(KERN_WARNING "original: %s\n", existing->gh_ip);
1186 printk(KERN_INFO "pid : %d\n", existing->gh_owner->pid);
1187 printk(KERN_INFO "lock type : %d lock state : %d\n",
1188 existing->gh_gl->gl_name.ln_type, existing->gh_gl->gl_state);
1189 print_symbol(KERN_WARNING "new: %s\n", gh->gh_ip);
1190 printk(KERN_INFO "pid : %d\n", gh->gh_owner->pid);
1191 printk(KERN_INFO "lock type : %d lock state : %d\n",
1192 gl->gl_name.ln_type, gl->gl_state);
1193 BUG();
1194 }
1195
1196 existing = find_holder_by_owner(&gl->gl_waiters3, gh->gh_owner);
1197 if (existing) {
1198 print_symbol(KERN_WARNING "original: %s\n", existing->gh_ip);
1199 print_symbol(KERN_WARNING "new: %s\n", gh->gh_ip);
1200 BUG();
1201 }
1202
1203 if (gh->gh_flags & LM_FLAG_PRIORITY)
1204 list_add(&gh->gh_list, &gl->gl_waiters3);
1205 else
1206 list_add_tail(&gh->gh_list, &gl->gl_waiters3);
1207}
1208
1209/**
1210 * gfs2_glock_nq - enqueue a struct gfs2_holder onto a glock (acquire a glock)
1211 * @gh: the holder structure
1212 *
1213 * if (gh->gh_flags & GL_ASYNC), this never returns an error
1214 *
1215 * Returns: 0, GLR_TRYFAILED, or errno on failure
1216 */
1217
1218int gfs2_glock_nq(struct gfs2_holder *gh)
1219{
1220 struct gfs2_glock *gl = gh->gh_gl;
1221 struct gfs2_sbd *sdp = gl->gl_sbd;
1222 int error = 0;
1223
1224restart:
1225 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) {
1226 set_bit(HIF_ABORTED, &gh->gh_iflags);
1227 return -EIO;
1228 }
1229
1230 set_bit(HIF_PROMOTE, &gh->gh_iflags);
1231
1232 spin_lock(&gl->gl_spin);
1233 add_to_queue(gh);
1234 run_queue(gl);
1235 spin_unlock(&gl->gl_spin);
1236
1237 if (!(gh->gh_flags & GL_ASYNC)) {
1238 error = glock_wait_internal(gh);
1239 if (error == GLR_CANCELED) {
1240 msleep(100);
1241 goto restart;
1242 }
1243 }
1244
1245 clear_bit(GLF_PREFETCH, &gl->gl_flags);
1246
1247 if (error == GLR_TRYFAILED && (gh->gh_flags & GL_DUMP))
1248 dump_glock(gl);
1249
1250 return error;
1251}
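/*
 * Illustrative usage (annotation, not part of this patch): a
 * synchronous acquire/release cycle with a trylock flag.  The glock
 * pointer "gl" is assumed to come from gfs2_glock_get().
 *
 *	struct gfs2_holder gh;
 *	int error;
 *
 *	gfs2_holder_init(gl, LM_ST_SHARED, LM_FLAG_TRY, &gh);
 *	error = gfs2_glock_nq(&gh);
 *	if (error) {
 *		gfs2_holder_uninit(&gh);
 *		return error;
 *	}
 *	... the glock is held in LM_ST_SHARED here ...
 *	gfs2_glock_dq_uninit(&gh);
 *
 * With LM_FLAG_TRY the error may be GLR_TRYFAILED, which a real
 * caller would usually retry rather than propagate.
 */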
1252
1253/**
1254 * gfs2_glock_poll - poll to see if an async request has been completed
1255 * @gh: the holder
1256 *
1257 * Returns: 1 if the request is ready to be gfs2_glock_wait()ed on
1258 */
1259
1260int gfs2_glock_poll(struct gfs2_holder *gh)
1261{
1262 struct gfs2_glock *gl = gh->gh_gl;
1263 int ready = 0;
1264
1265 spin_lock(&gl->gl_spin);
1266
1267 if (test_bit(HIF_HOLDER, &gh->gh_iflags))
1268 ready = 1;
1269 else if (list_empty(&gh->gh_list)) {
1270 if (gh->gh_error == GLR_CANCELED) {
1271 spin_unlock(&gl->gl_spin);
1272 msleep(100);
1273 if (gfs2_glock_nq(gh))
1274 return 1;
1275 return 0;
1276 } else
1277 ready = 1;
1278 }
1279
1280 spin_unlock(&gl->gl_spin);
1281
1282 return ready;
1283}
1284
1285/**
1286 * gfs2_glock_wait - wait for an asynchronous (GL_ASYNC) lock acquisition
1287 * @gh: the holder structure
1288 *
1289 * Returns: 0, GLR_TRYFAILED, or errno on failure
1290 */
1291
1292int gfs2_glock_wait(struct gfs2_holder *gh)
1293{
1294 int error;
1295
1296 error = glock_wait_internal(gh);
1297 if (error == GLR_CANCELED) {
1298 msleep(100);
1299 gh->gh_flags &= ~GL_ASYNC;
1300 error = gfs2_glock_nq(gh);
1301 }
1302
1303 return error;
1304}
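/*
 * Illustrative sketch (annotation, not part of this patch): the
 * GL_ASYNC pattern.  gfs2_glock_nq() queues the request without
 * blocking (it never fails when GL_ASYNC is set), completion is
 * detected with gfs2_glock_poll() and collected with
 * gfs2_glock_wait().
 *
 *	struct gfs2_holder gh;
 *	int error;
 *
 *	gfs2_holder_init(gl, LM_ST_EXCLUSIVE, GL_ASYNC, &gh);
 *	gfs2_glock_nq(&gh);
 *	... do other work while the request is in flight ...
 *	while (!gfs2_glock_poll(&gh))
 *		msleep(10);
 *	error = gfs2_glock_wait(&gh);
 *	if (!error) {
 *		... the glock is held here ...
 *		gfs2_glock_dq(&gh);
 *	}
 *	gfs2_holder_uninit(&gh);
 */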
1305
1306/**
1307 * gfs2_glock_dq - dequeue a struct gfs2_holder from a glock (release a glock)
1308 * @gh: the glock holder
1309 *
1310 */
1311
1312void gfs2_glock_dq(struct gfs2_holder *gh)
1313{
1314 struct gfs2_glock *gl = gh->gh_gl;
1315 const struct gfs2_glock_operations *glops = gl->gl_ops;
1316
1317 if (gh->gh_flags & GL_NOCACHE)
1318 handle_callback(gl, LM_ST_UNLOCKED);
1319
1320 gfs2_glmutex_lock(gl);
1321
1322 spin_lock(&gl->gl_spin);
1323 list_del_init(&gh->gh_list);
1324
1325 if (list_empty(&gl->gl_holders)) {
1326 spin_unlock(&gl->gl_spin);
1327
1328 if (glops->go_unlock)
1329 glops->go_unlock(gh);
1330
1331 gl->gl_stamp = jiffies;
1332
1333 spin_lock(&gl->gl_spin);
1334 }
1335
1336 clear_bit(GLF_LOCK, &gl->gl_flags);
1337 run_queue(gl);
1338 spin_unlock(&gl->gl_spin);
1339}
1340
1341/**
1342 * gfs2_glock_prefetch - Try to prefetch a glock
1343 * @gl: the glock
1344 * @state: the state to prefetch in
1345 * @flags: flags passed to go_xmote_th()
1346 *
1347 */
1348
1349static void gfs2_glock_prefetch(struct gfs2_glock *gl, unsigned int state,
1350 int flags)
1351{
1352 const struct gfs2_glock_operations *glops = gl->gl_ops;
1353
1354 spin_lock(&gl->gl_spin);
1355
1356 if (test_bit(GLF_LOCK, &gl->gl_flags) || !list_empty(&gl->gl_holders) ||
1357 !list_empty(&gl->gl_waiters1) || !list_empty(&gl->gl_waiters2) ||
1358 !list_empty(&gl->gl_waiters3) ||
1359 relaxed_state_ok(gl->gl_state, state, flags)) {
1360 spin_unlock(&gl->gl_spin);
1361 return;
1362 }
1363
1364 set_bit(GLF_PREFETCH, &gl->gl_flags);
1365 set_bit(GLF_LOCK, &gl->gl_flags);
1366 spin_unlock(&gl->gl_spin);
1367
1368 glops->go_xmote_th(gl, state, flags);
1369}
1370
1371static void greedy_work(void *data)
1372{
1373 struct greedy *gr = data;
1374 struct gfs2_holder *gh = &gr->gr_gh;
1375 struct gfs2_glock *gl = gh->gh_gl;
1376 const struct gfs2_glock_operations *glops = gl->gl_ops;
1377
1378 clear_bit(GLF_SKIP_WAITERS2, &gl->gl_flags);
1379
1380 if (glops->go_greedy)
1381 glops->go_greedy(gl);
1382
1383 spin_lock(&gl->gl_spin);
1384
1385 if (list_empty(&gl->gl_waiters2)) {
1386 clear_bit(GLF_GREEDY, &gl->gl_flags);
1387 spin_unlock(&gl->gl_spin);
1388 gfs2_holder_uninit(gh);
1389 kfree(gr);
1390 } else {
1391 gfs2_glock_hold(gl);
1392 list_add_tail(&gh->gh_list, &gl->gl_waiters2);
1393 run_queue(gl);
1394 spin_unlock(&gl->gl_spin);
1395 gfs2_glock_put(gl);
1396 }
1397}
1398
1399/**
1400 * gfs2_glock_be_greedy - hold on to a glock greedily for a while
1401 * @gl: the glock
1402 * @time: how long to stay greedy, in jiffies
1403 *
1404 * Returns: 0 if go_greedy will be called, 1 otherwise
1405 */
1406
1407int gfs2_glock_be_greedy(struct gfs2_glock *gl, unsigned int time)
1408{
1409 struct greedy *gr;
1410 struct gfs2_holder *gh;
1411
1412 if (!time || gl->gl_sbd->sd_args.ar_localcaching ||
1413 test_and_set_bit(GLF_GREEDY, &gl->gl_flags))
1414 return 1;
1415
1416 gr = kmalloc(sizeof(struct greedy), GFP_KERNEL);
1417 if (!gr) {
1418 clear_bit(GLF_GREEDY, &gl->gl_flags);
1419 return 1;
1420 }
1421 gh = &gr->gr_gh;
1422
1423 gfs2_holder_init(gl, 0, 0, gh);
1424 set_bit(HIF_GREEDY, &gh->gh_iflags);
1425 INIT_WORK(&gr->gr_work, greedy_work, gr);
1426
1427 set_bit(GLF_SKIP_WAITERS2, &gl->gl_flags);
1428 schedule_delayed_work(&gr->gr_work, time);
1429
1430 return 0;
1431}
1432
1433/**
1434 * gfs2_glock_dq_uninit - dequeue a holder from a glock and uninitialize it
1435 * @gh: the holder structure
1436 *
1437 */
1438
1439void gfs2_glock_dq_uninit(struct gfs2_holder *gh)
1440{
1441 gfs2_glock_dq(gh);
1442 gfs2_holder_uninit(gh);
1443}
1444
1445/**
1446 * gfs2_glock_nq_num - acquire a glock based on lock number
1447 * @sdp: the filesystem
1448 * @number: the lock number
1449 * @glops: the glock operations for the type of glock
1450 * @state: the state to acquire the glock in
1451 * @flags: modifier flags for the acquisition
1452 * @gh: the struct gfs2_holder
1453 *
1454 * Returns: errno
1455 */
1456
1457int gfs2_glock_nq_num(struct gfs2_sbd *sdp, u64 number,
1458 const struct gfs2_glock_operations *glops,
1459 unsigned int state, int flags, struct gfs2_holder *gh)
1460{
1461 struct gfs2_glock *gl;
1462 int error;
1463
1464 error = gfs2_glock_get(sdp, number, glops, CREATE, &gl);
1465 if (!error) {
1466 error = gfs2_glock_nq_init(gl, state, flags, gh);
1467 gfs2_glock_put(gl);
1468 }
1469
1470 return error;
1471}
1472
1473/**
1474 * glock_compare - Compare two struct gfs2_holder structures for sorting
1475 * @arg_a: the first structure
1476 * @arg_b: the second structure
1477 *
1478 */
1479
1480static int glock_compare(const void *arg_a, const void *arg_b)
1481{
1482 const struct gfs2_holder *gh_a = *(const struct gfs2_holder **)arg_a;
1483 const struct gfs2_holder *gh_b = *(const struct gfs2_holder **)arg_b;
1484 const struct lm_lockname *a = &gh_a->gh_gl->gl_name;
1485 const struct lm_lockname *b = &gh_b->gh_gl->gl_name;
1486
1487 if (a->ln_number > b->ln_number)
1488 return 1;
1489 if (a->ln_number < b->ln_number)
1490 return -1;
1491 if (gh_a->gh_state == LM_ST_SHARED && gh_b->gh_state == LM_ST_EXCLUSIVE)
1492 return 1;
1493 if (!(gh_a->gh_flags & GL_LOCAL_EXCL) && (gh_b->gh_flags & GL_LOCAL_EXCL))
1494 return 1;
1495 return 0;
1496}
1497
1498/**
1499 * nq_m_sync - synchronously acquire more than one glock in deadlock-free order
1500 * @num_gh: the number of structures
1501 * @ghs: an array of struct gfs2_holder structures
1502 *
1503 * Returns: 0 on success (all glocks acquired),
1504 * errno on failure (no glocks acquired)
1505 */
1506
1507static int nq_m_sync(unsigned int num_gh, struct gfs2_holder *ghs,
1508 struct gfs2_holder **p)
1509{
1510 unsigned int x;
1511 int error = 0;
1512
1513 for (x = 0; x < num_gh; x++)
1514 p[x] = &ghs[x];
1515
1516 sort(p, num_gh, sizeof(struct gfs2_holder *), glock_compare, NULL);
1517
1518 for (x = 0; x < num_gh; x++) {
1519 p[x]->gh_flags &= ~(LM_FLAG_TRY | GL_ASYNC);
1520
1521 error = gfs2_glock_nq(p[x]);
1522 if (error) {
1523 while (x--)
1524 gfs2_glock_dq(p[x]);
1525 break;
1526 }
1527 }
1528
1529 return error;
1530}
1531
1532/**
1533 * gfs2_glock_nq_m - acquire multiple glocks
1534 * @num_gh: the number of structures
1535 * @ghs: an array of struct gfs2_holder structures
1536 *
1537 * Figure out how big an impact this function has. Either:
1538 * 1) Replace this code with code that calls gfs2_glock_prefetch()
1539 * 2) Forget async stuff and just call nq_m_sync()
1540 * 3) Leave it like it is
1541 *
1542 * Returns: 0 on success (all glocks acquired),
1543 * errno on failure (no glocks acquired)
1544 */
1545
1546int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs)
1547{
1548 int *e;
1549 unsigned int x;
1550 int borked = 0, serious = 0;
1551 int error = 0;
1552
1553 if (!num_gh)
1554 return 0;
1555
1556 if (num_gh == 1) {
1557 ghs->gh_flags &= ~(LM_FLAG_TRY | GL_ASYNC);
1558 return gfs2_glock_nq(ghs);
1559 }
1560
1561 e = kcalloc(num_gh, sizeof(struct gfs2_holder *), GFP_KERNEL);
1562 if (!e)
1563 return -ENOMEM;
1564
1565 for (x = 0; x < num_gh; x++) {
1566 ghs[x].gh_flags |= LM_FLAG_TRY | GL_ASYNC;
1567 error = gfs2_glock_nq(&ghs[x]);
1568 if (error) {
1569 borked = 1;
1570 serious = error;
1571 num_gh = x;
1572 break;
1573 }
1574 }
1575
1576 for (x = 0; x < num_gh; x++) {
1577 error = e[x] = glock_wait_internal(&ghs[x]);
1578 if (error) {
1579 borked = 1;
1580 if (error != GLR_TRYFAILED && error != GLR_CANCELED)
1581 serious = error;
1582 }
1583 }
1584
1585 if (!borked) {
1586 kfree(e);
1587 return 0;
1588 }
1589
1590 for (x = 0; x < num_gh; x++)
1591 if (!e[x])
1592 gfs2_glock_dq(&ghs[x]);
1593
1594 if (serious)
1595 error = serious;
1596 else {
1597 for (x = 0; x < num_gh; x++)
1598 gfs2_holder_reinit(ghs[x].gh_state, ghs[x].gh_flags,
1599 &ghs[x]);
1600 error = nq_m_sync(num_gh, ghs, (struct gfs2_holder **)e);
1601 }
1602
1603 kfree(e);
1604
1605 return error;
1606}
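/*
 * Illustrative sketch (annotation, not part of this patch): taking
 * two inode glocks deadlock-free, as a rename between two directories
 * would need.  ip1 and ip2 are hypothetical struct gfs2_inode
 * pointers.
 *
 *	struct gfs2_holder ghs[2];
 *	int error;
 *
 *	gfs2_holder_init(ip1->i_gl, LM_ST_EXCLUSIVE, 0, &ghs[0]);
 *	gfs2_holder_init(ip2->i_gl, LM_ST_EXCLUSIVE, 0, &ghs[1]);
 *	error = gfs2_glock_nq_m(2, ghs);
 *	if (!error) {
 *		... both glocks held ...
 *		gfs2_glock_dq_uninit_m(2, ghs);
 *	} else {
 *		gfs2_holder_uninit(&ghs[0]);
 *		gfs2_holder_uninit(&ghs[1]);
 *	}
 *
 * On success all of the glocks are held, on failure none are; the
 * internal sort by lock number is what makes the order safe.
 */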
1607
1608/**
1609 * gfs2_glock_dq_m - release multiple glocks
1610 * @num_gh: the number of structures
1611 * @ghs: an array of struct gfs2_holder structures
1612 *
1613 */
1614
1615void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs)
1616{
1617 unsigned int x;
1618
1619 for (x = 0; x < num_gh; x++)
1620 gfs2_glock_dq(&ghs[x]);
1621}
1622
1623/**
1624 * gfs2_glock_dq_uninit_m - release multiple glocks and uninitialize their holders
1625 * @num_gh: the number of structures
1626 * @ghs: an array of struct gfs2_holder structures
1627 *
1628 */
1629
1630void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs)
1631{
1632 unsigned int x;
1633
1634 for (x = 0; x < num_gh; x++)
1635 gfs2_glock_dq_uninit(&ghs[x]);
1636}
1637
1638/**
1639 * gfs2_glock_prefetch_num - prefetch a glock based on lock number
1640 * @sdp: the filesystem
1641 * @number: the lock number
1642 * @glops: the glock operations for the type of glock
1643 * @state: the state to acquire the glock in
1644 * @flags: modifier flags for the acquisition
1647 */
1648
1649void gfs2_glock_prefetch_num(struct gfs2_sbd *sdp, u64 number,
1650 const struct gfs2_glock_operations *glops,
1651 unsigned int state, int flags)
1652{
1653 struct gfs2_glock *gl;
1654 int error;
1655
1656 if (atomic_read(&sdp->sd_reclaim_count) <
1657 gfs2_tune_get(sdp, gt_reclaim_limit)) {
1658 error = gfs2_glock_get(sdp, number, glops, CREATE, &gl);
1659 if (!error) {
1660 gfs2_glock_prefetch(gl, state, flags);
1661 gfs2_glock_put(gl);
1662 }
1663 }
1664}
1665
1666/**
1667 * gfs2_lvb_hold - attach a LVB to a glock
1668 * @gl: The glock in question
1669 *
1670 */
1671
1672int gfs2_lvb_hold(struct gfs2_glock *gl)
1673{
1674 int error;
1675
1676 gfs2_glmutex_lock(gl);
1677
1678 if (!atomic_read(&gl->gl_lvb_count)) {
1679 error = gfs2_lm_hold_lvb(gl->gl_sbd, gl->gl_lock, &gl->gl_lvb);
1680 if (error) {
1681 gfs2_glmutex_unlock(gl);
1682 return error;
1683 }
1684 gfs2_glock_hold(gl);
1685 }
1686 atomic_inc(&gl->gl_lvb_count);
1687
1688 gfs2_glmutex_unlock(gl);
1689
1690 return 0;
1691}
1692
1693/**
1694 * gfs2_lvb_unhold - detach a LVB from a glock
1695 * @gl: The glock in question
1696 *
1697 */
1698
1699void gfs2_lvb_unhold(struct gfs2_glock *gl)
1700{
1701 gfs2_glock_hold(gl);
1702 gfs2_glmutex_lock(gl);
1703
1704 gfs2_assert(gl->gl_sbd, atomic_read(&gl->gl_lvb_count) > 0);
1705 if (atomic_dec_and_test(&gl->gl_lvb_count)) {
1706 gfs2_lm_unhold_lvb(gl->gl_sbd, gl->gl_lock, gl->gl_lvb);
1707 gl->gl_lvb = NULL;
1708 gfs2_glock_put(gl);
1709 }
1710
1711 gfs2_glmutex_unlock(gl);
1712 gfs2_glock_put(gl);
1713}
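/*
 * Illustrative sketch (annotation, not part of this patch): holding
 * a Lock Value Block.  The LVB layout itself is whatever the users of
 * the lock agree on.
 *
 *	int error;
 *
 *	error = gfs2_lvb_hold(gl);
 *	if (error)
 *		return error;
 *	... gl->gl_lvb now points at the lock value block ...
 *	gfs2_lvb_unhold(gl);
 */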
1714
1715static void blocking_cb(struct gfs2_sbd *sdp, struct lm_lockname *name,
1716 unsigned int state)
1717{
1718 struct gfs2_glock *gl;
1719
1720 gl = gfs2_glock_find(sdp, name);
1721 if (!gl)
1722 return;
1723
1724 if (gl->gl_ops->go_callback)
1725 gl->gl_ops->go_callback(gl, state);
1726 handle_callback(gl, state);
1727
1728 spin_lock(&gl->gl_spin);
1729 run_queue(gl);
1730 spin_unlock(&gl->gl_spin);
1731
1732 gfs2_glock_put(gl);
1733}
1734
1735/**
1736 * gfs2_glock_cb - Callback used by locking module
1737 * @cb_data: Pointer to the superblock
1738 * @type: Type of callback
1739 * @data: Type dependent data pointer
1740 *
1741 * Called by the locking module when it wants to tell us something.
1742 * Either we need to drop a lock, one of our ASYNC requests completed, or
1743 * a journal from another client needs to be recovered.
1744 */
1745
1746void gfs2_glock_cb(void *cb_data, unsigned int type, void *data)
1747{
1748 struct gfs2_sbd *sdp = cb_data;
1749
1750 switch (type) {
1751 case LM_CB_NEED_E:
1752 blocking_cb(sdp, data, LM_ST_UNLOCKED);
1753 return;
1754
1755 case LM_CB_NEED_D:
1756 blocking_cb(sdp, data, LM_ST_DEFERRED);
1757 return;
1758
1759 case LM_CB_NEED_S:
1760 blocking_cb(sdp, data, LM_ST_SHARED);
1761 return;
1762
1763 case LM_CB_ASYNC: {
1764 struct lm_async_cb *async = data;
1765 struct gfs2_glock *gl;
1766
1767 gl = gfs2_glock_find(sdp, &async->lc_name);
1768 if (gfs2_assert_warn(sdp, gl))
1769 return;
1770 if (!gfs2_assert_warn(sdp, gl->gl_req_bh))
1771 gl->gl_req_bh(gl, async->lc_ret);
1772 gfs2_glock_put(gl);
1773 return;
1774 }
1775
1776 case LM_CB_NEED_RECOVERY:
1777 gfs2_jdesc_make_dirty(sdp, *(unsigned int *)data);
1778 if (sdp->sd_recoverd_process)
1779 wake_up_process(sdp->sd_recoverd_process);
1780 return;
1781
1782 case LM_CB_DROPLOCKS:
1783 gfs2_gl_hash_clear(sdp, NO_WAIT);
1784 gfs2_quota_scan(sdp);
1785 return;
1786
1787 default:
1788 gfs2_assert_warn(sdp, 0);
1789 return;
1790 }
1791}
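/*
 * Illustrative sketch (annotation, not part of this patch): how a
 * lock module might ask this node to demote a lock to SHARED.  The
 * lock name below is hypothetical.
 *
 *	struct lm_lockname name = {
 *		.ln_number = 0x1234,
 *		.ln_type = LM_TYPE_INODE,
 *	};
 *
 *	gfs2_glock_cb(sdp, LM_CB_NEED_S, &name);
 */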
1792
1793/**
1794 * demote_ok - Check to see if it's ok to unlock a glock
1795 * @gl: the glock
1796 *
1797 * Returns: 1 if it's ok
1798 */
1799
1800static int demote_ok(struct gfs2_glock *gl)
1801{
1802 struct gfs2_sbd *sdp = gl->gl_sbd;
1803 const struct gfs2_glock_operations *glops = gl->gl_ops;
1804 int demote = 1;
1805
1806 if (test_bit(GLF_STICKY, &gl->gl_flags))
1807 demote = 0;
1808 else if (test_bit(GLF_PREFETCH, &gl->gl_flags))
1809 demote = time_after_eq(jiffies, gl->gl_stamp +
1810 gfs2_tune_get(sdp, gt_prefetch_secs) * HZ);
1811 else if (glops->go_demote_ok)
1812 demote = glops->go_demote_ok(gl);
1813
1814 return demote;
1815}
1816
1817/**
1818 * gfs2_glock_schedule_for_reclaim - Add a glock to the reclaim list
1819 * @gl: the glock
1820 *
1821 */
1822
1823void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl)
1824{
1825 struct gfs2_sbd *sdp = gl->gl_sbd;
1826
1827 spin_lock(&sdp->sd_reclaim_lock);
1828 if (list_empty(&gl->gl_reclaim)) {
1829 gfs2_glock_hold(gl);
1830 list_add(&gl->gl_reclaim, &sdp->sd_reclaim_list);
1831 atomic_inc(&sdp->sd_reclaim_count);
1832 }
1833 spin_unlock(&sdp->sd_reclaim_lock);
1834
1835 wake_up(&sdp->sd_reclaim_wq);
1836}
1837
1838/**
1839 * gfs2_reclaim_glock - process the next glock on the filesystem's reclaim list
1840 * @sdp: the filesystem
1841 *
1842 * Called from gfs2_glockd() glock reclaim daemon, or when promoting a
1843 * different glock and we notice that there are a lot of glocks in the
1844 * reclaim list.
1845 *
1846 */
1847
1848void gfs2_reclaim_glock(struct gfs2_sbd *sdp)
1849{
1850 struct gfs2_glock *gl;
1851
1852 spin_lock(&sdp->sd_reclaim_lock);
1853 if (list_empty(&sdp->sd_reclaim_list)) {
1854 spin_unlock(&sdp->sd_reclaim_lock);
1855 return;
1856 }
1857 gl = list_entry(sdp->sd_reclaim_list.next,
1858 struct gfs2_glock, gl_reclaim);
1859 list_del_init(&gl->gl_reclaim);
1860 spin_unlock(&sdp->sd_reclaim_lock);
1861
1862 atomic_dec(&sdp->sd_reclaim_count);
1863 atomic_inc(&sdp->sd_reclaimed);
1864
1865 if (gfs2_glmutex_trylock(gl)) {
1866 if (queue_empty(gl, &gl->gl_holders) &&
1867 gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl))
1868 handle_callback(gl, LM_ST_UNLOCKED);
1869 gfs2_glmutex_unlock(gl);
1870 }
1871
1872 gfs2_glock_put(gl);
1873}
1874
1875/**
1876 * examine_bucket - Call a function for each glock in a hash bucket
1877 * @examiner: the function
1878 * @sdp: the filesystem
1879 * @hash: the hash bucket index
1880 *
1881 * Returns: 1 if the bucket has entries
1882 */
1883
1884static int examine_bucket(glock_examiner examiner, struct gfs2_sbd *sdp,
1885 unsigned int hash)
1886{
1887 struct gfs2_glock *gl, *prev = NULL;
1888 int has_entries = 0;
1889 struct hlist_head *head = &gl_hash_table[hash].hb_list;
1890
1891 read_lock(gl_lock_addr(hash));
1892 /* Can't use hlist_for_each_entry - don't want prefetch here */
1893 if (hlist_empty(head))
1894 goto out;
1895 gl = list_entry(head->first, struct gfs2_glock, gl_list);
1896 while(1) {
1897 if (gl->gl_sbd == sdp) {
1898 gfs2_glock_hold(gl);
1899 read_unlock(gl_lock_addr(hash));
1900 if (prev)
1901 gfs2_glock_put(prev);
1902 prev = gl;
1903 examiner(gl);
1904 has_entries = 1;
1905 read_lock(gl_lock_addr(hash));
1906 }
1907 if (gl->gl_list.next == NULL)
1908 break;
1909 gl = list_entry(gl->gl_list.next, struct gfs2_glock, gl_list);
1910 }
1911out:
1912 read_unlock(gl_lock_addr(hash));
1913 if (prev)
1914 gfs2_glock_put(prev);
1915 return has_entries;
1916}
1917
1918/**
1919 * scan_glock - look at a glock and see if we can reclaim it
1920 * @gl: the glock to look at
1921 *
1922 */
1923
1924static void scan_glock(struct gfs2_glock *gl)
1925{
1926 if (gl->gl_ops == &gfs2_inode_glops)
1927 return;
1928
1929 if (gfs2_glmutex_trylock(gl)) {
1930 if (queue_empty(gl, &gl->gl_holders) &&
1931 gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl))
1932 goto out_schedule;
1933 gfs2_glmutex_unlock(gl);
1934 }
1935 return;
1936
1937out_schedule:
1938 gfs2_glmutex_unlock(gl);
1939 gfs2_glock_schedule_for_reclaim(gl);
1940}
1941
1942/**
1943 * gfs2_scand_internal - Look for glocks and inodes to toss from memory
1944 * @sdp: the filesystem
1945 *
1946 */
1947
1948void gfs2_scand_internal(struct gfs2_sbd *sdp)
1949{
1950 unsigned int x;
1951
1952 for (x = 0; x < GFS2_GL_HASH_SIZE; x++)
1953 examine_bucket(scan_glock, sdp, x);
1954}
1955
1956/**
1957 * clear_glock - look at a glock and see if we can free it from glock cache
1958 * @gl: the glock to look at
1959 *
1960 */
1961
1962static void clear_glock(struct gfs2_glock *gl)
1963{
1964 struct gfs2_sbd *sdp = gl->gl_sbd;
1965 int released;
1966
1967 spin_lock(&sdp->sd_reclaim_lock);
1968 if (!list_empty(&gl->gl_reclaim)) {
1969 list_del_init(&gl->gl_reclaim);
1970 atomic_dec(&sdp->sd_reclaim_count);
1971 spin_unlock(&sdp->sd_reclaim_lock);
1972 released = gfs2_glock_put(gl);
1973 gfs2_assert(sdp, !released);
1974 } else {
1975 spin_unlock(&sdp->sd_reclaim_lock);
1976 }
1977
1978 if (gfs2_glmutex_trylock(gl)) {
1979 if (queue_empty(gl, &gl->gl_holders) &&
1980 gl->gl_state != LM_ST_UNLOCKED)
1981 handle_callback(gl, LM_ST_UNLOCKED);
1982 gfs2_glmutex_unlock(gl);
1983 }
1984}
1985
1986/**
1987 * gfs2_gl_hash_clear - Empty out the glock hash table
1988 * @sdp: the filesystem
1989 * @wait: wait until it's all gone
1990 *
1991 * Called when unmounting the filesystem, or when the inter-node lock manager
1992 * requests DROPLOCKS because it is running out of capacity.
1993 */
1994
1995void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait)
1996{
1997 unsigned long t;
1998 unsigned int x;
1999 int cont;
2000
2001 t = jiffies;
2002
2003 for (;;) {
2004 cont = 0;
2005 for (x = 0; x < GFS2_GL_HASH_SIZE; x++) {
2006 if (examine_bucket(clear_glock, sdp, x))
2007 cont = 1;
2008 }
2009
2010 if (!wait || !cont)
2011 break;
2012
2013 if (time_after_eq(jiffies,
2014 t + gfs2_tune_get(sdp, gt_stall_secs) * HZ)) {
2015 fs_warn(sdp, "Unmount seems to be stalled. "
2016 "Dumping lock state...\n");
2017 gfs2_dump_lockstate(sdp);
2018 t = jiffies;
2019 }
2020
2021 invalidate_inodes(sdp->sd_vfs);
2022 msleep(10);
2023 }
2024}
2025
2026/*
2027 * Diagnostic routines to help debug distributed deadlock
2028 */
2029
2030/**
2031 * dump_holder - print information about a glock holder
2032 * @str: a string naming the type of holder
2033 * @gh: the glock holder
2034 *
2035 * Returns: 0 on success
2036 */
2037
2038static int dump_holder(char *str, struct gfs2_holder *gh)
2039{
2040 unsigned int x;
2041 int error = -ENOBUFS;
2042
2043 printk(KERN_INFO " %s\n", str);
2044 printk(KERN_INFO " owner = %ld\n",
2045 (gh->gh_owner) ? (long)gh->gh_owner->pid : -1);
2046 printk(KERN_INFO " gh_state = %u\n", gh->gh_state);
2047 printk(KERN_INFO " gh_flags =");
2048 for (x = 0; x < 32; x++)
2049 if (gh->gh_flags & (1 << x))
2050 printk(" %u", x);
2051 printk(" \n");
2052 printk(KERN_INFO " error = %d\n", gh->gh_error);
2053 printk(KERN_INFO " gh_iflags =");
2054 for (x = 0; x < 32; x++)
2055 if (test_bit(x, &gh->gh_iflags))
2056 printk(" %u", x);
2057 printk(" \n");
2058 print_symbol(KERN_INFO " initialized at: %s\n", gh->gh_ip);
2059
2060 error = 0;
2061
2062 return error;
2063}
2064
2065/**
2066 * dump_inode - print information about an inode
2067 * @ip: the inode
2068 *
2069 * Returns: 0 on success
2070 */
2071
2072static int dump_inode(struct gfs2_inode *ip)
2073{
2074 unsigned int x;
2075 int error = -ENOBUFS;
2076
2077 printk(KERN_INFO " Inode:\n");
2078 printk(KERN_INFO " num = %llu %llu\n",
2079 (unsigned long long)ip->i_num.no_formal_ino,
2080 (unsigned long long)ip->i_num.no_addr);
2081 printk(KERN_INFO " type = %u\n", IF2DT(ip->i_di.di_mode));
2082 printk(KERN_INFO " i_flags =");
2083 for (x = 0; x < 32; x++)
2084 if (test_bit(x, &ip->i_flags))
2085 printk(" %u", x);
2086 printk(" \n");
2087
2088 error = 0;
2089
2090 return error;
2091}
2092
2093/**
2094 * dump_glock - print information about a glock
2095 * @gl: the glock
2097 *
2098 * Returns: 0 on success, -ENOBUFS when we run out of space
2099 */
2100
2101static int dump_glock(struct gfs2_glock *gl)
2102{
2103 struct gfs2_holder *gh;
2104 unsigned int x;
2105 int error = -ENOBUFS;
2106
2107 spin_lock(&gl->gl_spin);
2108
2109 printk(KERN_INFO "Glock 0x%p (%u, %llu)\n", gl, gl->gl_name.ln_type,
2110 (unsigned long long)gl->gl_name.ln_number);
2111 printk(KERN_INFO " gl_flags =");
2112 for (x = 0; x < 32; x++) {
2113 if (test_bit(x, &gl->gl_flags))
2114 printk(" %u", x);
2115 }
2116 printk(" \n");
2117 printk(KERN_INFO " gl_ref = %d\n", atomic_read(&gl->gl_ref));
2118 printk(KERN_INFO " gl_state = %u\n", gl->gl_state);
2119 printk(KERN_INFO " gl_owner = %s\n", gl->gl_owner->comm);
2120 print_symbol(KERN_INFO " gl_ip = %s\n", gl->gl_ip);
2121 printk(KERN_INFO " req_gh = %s\n", (gl->gl_req_gh) ? "yes" : "no");
2122 printk(KERN_INFO " req_bh = %s\n", (gl->gl_req_bh) ? "yes" : "no");
2123 printk(KERN_INFO " lvb_count = %d\n", atomic_read(&gl->gl_lvb_count));
2124 printk(KERN_INFO " object = %s\n", (gl->gl_object) ? "yes" : "no");
2125 printk(KERN_INFO " le = %s\n",
2126 (list_empty(&gl->gl_le.le_list)) ? "no" : "yes");
2127 printk(KERN_INFO " reclaim = %s\n",
2128 (list_empty(&gl->gl_reclaim)) ? "no" : "yes");
2129 if (gl->gl_aspace)
2130 printk(KERN_INFO " aspace = 0x%p nrpages = %lu\n", gl->gl_aspace,
2131 gl->gl_aspace->i_mapping->nrpages);
2132 else
2133 printk(KERN_INFO " aspace = no\n");
2134 printk(KERN_INFO " ail = %d\n", atomic_read(&gl->gl_ail_count));
2135 if (gl->gl_req_gh) {
2136 error = dump_holder("Request", gl->gl_req_gh);
2137 if (error)
2138 goto out;
2139 }
2140 list_for_each_entry(gh, &gl->gl_holders, gh_list) {
2141 error = dump_holder("Holder", gh);
2142 if (error)
2143 goto out;
2144 }
2145 list_for_each_entry(gh, &gl->gl_waiters1, gh_list) {
2146 error = dump_holder("Waiter1", gh);
2147 if (error)
2148 goto out;
2149 }
2150 list_for_each_entry(gh, &gl->gl_waiters2, gh_list) {
2151 error = dump_holder("Waiter2", gh);
2152 if (error)
2153 goto out;
2154 }
2155 list_for_each_entry(gh, &gl->gl_waiters3, gh_list) {
2156 error = dump_holder("Waiter3", gh);
2157 if (error)
2158 goto out;
2159 }
2160 if (gl->gl_ops == &gfs2_inode_glops && gl->gl_object) {
2161 if (!test_bit(GLF_LOCK, &gl->gl_flags) &&
2162 list_empty(&gl->gl_holders)) {
2163 error = dump_inode(gl->gl_object);
2164 if (error)
2165 goto out;
2166 } else {
2167 error = -ENOBUFS;
2168 printk(KERN_INFO " Inode: busy\n");
2169 }
2170 }
2171
2172 error = 0;
2173
2174out:
2175 spin_unlock(&gl->gl_spin);
2176 return error;
2177}
2178
2179/**
2180 * gfs2_dump_lockstate - print out the current lockstate
2181 * @sdp: the filesystem
2182 *
2183 * Dumps the lockstate to the console.
2185 *
2186 */
2187
2188static int gfs2_dump_lockstate(struct gfs2_sbd *sdp)
2189{
2190 struct gfs2_glock *gl;
2191 struct hlist_node *h;
2192 unsigned int x;
2193 int error = 0;
2194
2195 for (x = 0; x < GFS2_GL_HASH_SIZE; x++) {
2196
2197 read_lock(gl_lock_addr(x));
2198
2199 hlist_for_each_entry(gl, h, &gl_hash_table[x].hb_list, gl_list) {
2200 if (gl->gl_sbd != sdp)
2201 continue;
2202
2203 error = dump_glock(gl);
2204 if (error)
2205 break;
2206 }
2207
2208 read_unlock(gl_lock_addr(x));
2209
2210 if (error)
2211 break;
2212 }
2213
2214
2215 return error;
2216}
2217
2218int __init gfs2_glock_init(void)
2219{
2220 unsigned i;
2221 for(i = 0; i < GFS2_GL_HASH_SIZE; i++) {
2222 INIT_HLIST_HEAD(&gl_hash_table[i].hb_list);
2223 }
2224#ifdef GL_HASH_LOCK_SZ
2225 for(i = 0; i < GL_HASH_LOCK_SZ; i++) {
2226 rwlock_init(&gl_hash_locks[i]);
2227 }
2228#endif
2229 return 0;
2230}
2231
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
new file mode 100644
index 000000000000..2b2a889ee2cc
--- /dev/null
+++ b/fs/gfs2/glock.h
@@ -0,0 +1,153 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __GLOCK_DOT_H__
11#define __GLOCK_DOT_H__
12
13#include "incore.h"
14
15/* Flags for lock requests; used in gfs2_holder gh_flags field.
16 From lm_interface.h:
17#define LM_FLAG_TRY 0x00000001
18#define LM_FLAG_TRY_1CB 0x00000002
19#define LM_FLAG_NOEXP 0x00000004
20#define LM_FLAG_ANY 0x00000008
21#define LM_FLAG_PRIORITY 0x00000010 */
22
23#define GL_LOCAL_EXCL 0x00000020
24#define GL_ASYNC 0x00000040
25#define GL_EXACT 0x00000080
26#define GL_SKIP 0x00000100
27#define GL_ATIME 0x00000200
28#define GL_NOCACHE 0x00000400
29#define GL_NOCANCEL 0x00001000
30#define GL_AOP 0x00004000
31#define GL_DUMP 0x00008000
32
33#define GLR_TRYFAILED 13
34#define GLR_CANCELED 14
35
36static inline int gfs2_glock_is_locked_by_me(struct gfs2_glock *gl)
37{
38 struct gfs2_holder *gh;
39 int locked = 0;
40
41 /* Look in glock's list of holders for one with current task as owner */
42 spin_lock(&gl->gl_spin);
43 list_for_each_entry(gh, &gl->gl_holders, gh_list) {
44 if (gh->gh_owner == current) {
45 locked = 1;
46 break;
47 }
48 }
49 spin_unlock(&gl->gl_spin);
50
51 return locked;
52}
53
54static inline int gfs2_glock_is_held_excl(struct gfs2_glock *gl)
55{
56 return gl->gl_state == LM_ST_EXCLUSIVE;
57}
58
59static inline int gfs2_glock_is_held_dfrd(struct gfs2_glock *gl)
60{
61 return gl->gl_state == LM_ST_DEFERRED;
62}
63
64static inline int gfs2_glock_is_held_shrd(struct gfs2_glock *gl)
65{
66 return gl->gl_state == LM_ST_SHARED;
67}
68
69static inline int gfs2_glock_is_blocking(struct gfs2_glock *gl)
70{
71 int ret;
72 spin_lock(&gl->gl_spin);
73 ret = !list_empty(&gl->gl_waiters2) || !list_empty(&gl->gl_waiters3);
74 spin_unlock(&gl->gl_spin);
75 return ret;
76}
77
78int gfs2_glock_get(struct gfs2_sbd *sdp,
79 u64 number, const struct gfs2_glock_operations *glops,
80 int create, struct gfs2_glock **glp);
81void gfs2_glock_hold(struct gfs2_glock *gl);
82int gfs2_glock_put(struct gfs2_glock *gl);
83void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags,
84 struct gfs2_holder *gh);
85void gfs2_holder_reinit(unsigned int state, unsigned flags,
86 struct gfs2_holder *gh);
87void gfs2_holder_uninit(struct gfs2_holder *gh);
88
89void gfs2_glock_xmote_th(struct gfs2_glock *gl, unsigned int state, int flags);
90void gfs2_glock_drop_th(struct gfs2_glock *gl);
91
92int gfs2_glock_nq(struct gfs2_holder *gh);
93int gfs2_glock_poll(struct gfs2_holder *gh);
94int gfs2_glock_wait(struct gfs2_holder *gh);
95void gfs2_glock_dq(struct gfs2_holder *gh);
96
97int gfs2_glock_be_greedy(struct gfs2_glock *gl, unsigned int time);
98
99void gfs2_glock_dq_uninit(struct gfs2_holder *gh);
100int gfs2_glock_nq_num(struct gfs2_sbd *sdp,
101 u64 number, const struct gfs2_glock_operations *glops,
102 unsigned int state, int flags, struct gfs2_holder *gh);
103
104int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs);
105void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs);
106void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs);
107
108void gfs2_glock_prefetch_num(struct gfs2_sbd *sdp, u64 number,
109 const struct gfs2_glock_operations *glops,
110 unsigned int state, int flags);
111void gfs2_glock_inode_squish(struct inode *inode);
112
113/**
114 * gfs2_glock_nq_init - initialize a holder and enqueue it on a glock
115 * @gl: the glock
116 * @state: the state we're requesting
117 * @flags: the modifier flags
118 * @gh: the holder structure
119 *
120 * Returns: 0, GLR_*, or errno
121 */
122
123static inline int gfs2_glock_nq_init(struct gfs2_glock *gl,
124 unsigned int state, int flags,
125 struct gfs2_holder *gh)
126{
127 int error;
128
129 gfs2_holder_init(gl, state, flags, gh);
130
131 error = gfs2_glock_nq(gh);
132 if (error)
133 gfs2_holder_uninit(gh);
134
135 return error;
136}
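/* Illustrative usage (annotation, not part of this header):
   gfs2_glock_nq_init() pairs with gfs2_glock_dq_uninit(), so a
   protected region reduces to the following; "ip" is a hypothetical
   struct gfs2_inode pointer.

	struct gfs2_holder gh;
	int error;

	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
	if (error)
		return error;
	... protected region ...
	gfs2_glock_dq_uninit(&gh);

   On failure the holder has already been uninitialized, so no
   cleanup is needed on the error path. */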
137
138/* Lock Value Block functions */
139
140int gfs2_lvb_hold(struct gfs2_glock *gl);
141void gfs2_lvb_unhold(struct gfs2_glock *gl);
142
143void gfs2_glock_cb(void *cb_data, unsigned int type, void *data);
144
145void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl);
146void gfs2_reclaim_glock(struct gfs2_sbd *sdp);
147
148void gfs2_scand_internal(struct gfs2_sbd *sdp);
149void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait);
150
151int __init gfs2_glock_init(void);
152
153#endif /* __GLOCK_DOT_H__ */
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
new file mode 100644
index 000000000000..41a6b6818a50
--- /dev/null
+++ b/fs/gfs2/glops.c
@@ -0,0 +1,615 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/gfs2_ondisk.h>
16#include <linux/lm_interface.h>
17
18#include "gfs2.h"
19#include "incore.h"
20#include "bmap.h"
21#include "glock.h"
22#include "glops.h"
23#include "inode.h"
24#include "log.h"
25#include "meta_io.h"
26#include "recovery.h"
27#include "rgrp.h"
28#include "util.h"
29#include "trans.h"
30
31/**
32 * ail_empty_gl - remove all buffers for a given lock from the AIL
33 * @gl: the glock
34 *
35 * None of the buffers should be dirty, locked, or pinned.
36 */
37
38static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
39{
40 struct gfs2_sbd *sdp = gl->gl_sbd;
41 unsigned int blocks;
42 struct list_head *head = &gl->gl_ail_list;
43 struct gfs2_bufdata *bd;
44 struct buffer_head *bh;
45 u64 blkno;
46 int error;
47
48 blocks = atomic_read(&gl->gl_ail_count);
49 if (!blocks)
50 return;
51
52 error = gfs2_trans_begin(sdp, 0, blocks);
53 if (gfs2_assert_withdraw(sdp, !error))
54 return;
55
56 gfs2_log_lock(sdp);
57 while (!list_empty(head)) {
58 bd = list_entry(head->next, struct gfs2_bufdata,
59 bd_ail_gl_list);
60 bh = bd->bd_bh;
61 blkno = bh->b_blocknr;
62 gfs2_assert_withdraw(sdp, !buffer_busy(bh));
63
64 bd->bd_ail = NULL;
65 list_del(&bd->bd_ail_st_list);
66 list_del(&bd->bd_ail_gl_list);
67 atomic_dec(&gl->gl_ail_count);
68 brelse(bh);
69 gfs2_log_unlock(sdp);
70
71 gfs2_trans_add_revoke(sdp, blkno);
72
73 gfs2_log_lock(sdp);
74 }
75 gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count));
76 gfs2_log_unlock(sdp);
77
78 gfs2_trans_end(sdp);
79 gfs2_log_flush(sdp, NULL);
80}
81
82/**
83 * gfs2_pte_inval - Sync and invalidate all PTEs associated with a glock
84 * @gl: the glock
85 *
86 */
87
88static void gfs2_pte_inval(struct gfs2_glock *gl)
89{
90 struct gfs2_inode *ip;
91 struct inode *inode;
92
93 ip = gl->gl_object;
94 if (!ip || !S_ISREG(ip->i_di.di_mode))
95 return;
96 inode = &ip->i_inode;
97
98 if (!test_bit(GIF_PAGED, &ip->i_flags))
99 return;
100
101 unmap_shared_mapping_range(inode->i_mapping, 0, 0);
102
103 if (test_bit(GIF_SW_PAGED, &ip->i_flags))
104 set_bit(GLF_DIRTY, &gl->gl_flags);
105
106 clear_bit(GIF_SW_PAGED, &ip->i_flags);
107}
108
109/**
110 * gfs2_page_inval - Invalidate all pages associated with a glock
111 * @gl: the glock
112 *
113 */
114
115static void gfs2_page_inval(struct gfs2_glock *gl)
116{
117 struct gfs2_inode *ip;
118 struct inode *inode;
119
120 ip = gl->gl_object;
121 if (!ip || !S_ISREG(ip->i_di.di_mode))
122 return;
123 inode = &ip->i_inode;
124
125 truncate_inode_pages(inode->i_mapping, 0);
126 gfs2_assert_withdraw(GFS2_SB(&ip->i_inode), !inode->i_mapping->nrpages);
127 clear_bit(GIF_PAGED, &ip->i_flags);
128}
129
130/**
131 * gfs2_page_wait - Wait for writeback of data
132 * @gl: the glock
133 *
134 * Syncs data (not metadata) for a regular file.
135 * No-op for all other types.
136 */
137
138static void gfs2_page_wait(struct gfs2_glock *gl)
139{
140 struct gfs2_inode *ip = gl->gl_object;
141 struct inode *inode = &ip->i_inode;
142 struct address_space *mapping = inode->i_mapping;
143 int error;
144
145 if (!S_ISREG(ip->i_di.di_mode))
146 return;
147
148 error = filemap_fdatawait(mapping);
149
150 /* Put back any errors cleared by filemap_fdatawait()
151 so they can be caught by someone who can pass them
152 up to user space. */
153
154 if (error == -ENOSPC)
155 set_bit(AS_ENOSPC, &mapping->flags);
156 else if (error)
157 set_bit(AS_EIO, &mapping->flags);
158
159}
160
161static void gfs2_page_writeback(struct gfs2_glock *gl)
162{
163 struct gfs2_inode *ip = gl->gl_object;
164 struct inode *inode = &ip->i_inode;
165 struct address_space *mapping = inode->i_mapping;
166
167 if (!S_ISREG(ip->i_di.di_mode))
168 return;
169
170 filemap_fdatawrite(mapping);
171}
172
173/**
174 * meta_go_sync - sync out the metadata for this glock
175 * @gl: the glock
176 * @flags: DIO_*
177 *
178 * Called when demoting or unlocking an EX glock. We must flush
179 * to disk all dirty buffers/pages relating to this glock, and must not
180 * return to the caller to demote/unlock the glock until I/O is complete.
181 */
182
183static void meta_go_sync(struct gfs2_glock *gl, int flags)
184{
185 if (!(flags & DIO_METADATA))
186 return;
187
188 if (test_and_clear_bit(GLF_DIRTY, &gl->gl_flags)) {
189 gfs2_log_flush(gl->gl_sbd, gl);
190 gfs2_meta_sync(gl);
191 if (flags & DIO_RELEASE)
192 gfs2_ail_empty_gl(gl);
193 }
194
195}
196
197/**
198 * meta_go_inval - invalidate the metadata for this glock
199 * @gl: the glock
200 * @flags:
201 *
202 */
203
204static void meta_go_inval(struct gfs2_glock *gl, int flags)
205{
206 if (!(flags & DIO_METADATA))
207 return;
208
209 gfs2_meta_inval(gl);
210 gl->gl_vn++;
211}
212
213/**
214 * inode_go_xmote_th - promote/demote a glock
215 * @gl: the glock
216 * @state: the requested state
217 * @flags:
218 *
219 */
220
221static void inode_go_xmote_th(struct gfs2_glock *gl, unsigned int state,
222 int flags)
223{
224 if (gl->gl_state != LM_ST_UNLOCKED)
225 gfs2_pte_inval(gl);
226 gfs2_glock_xmote_th(gl, state, flags);
227}
228
229/**
230 * inode_go_xmote_bh - After promoting/demoting a glock
231 * @gl: the glock
232 *
233 */
234
235static void inode_go_xmote_bh(struct gfs2_glock *gl)
236{
237 struct gfs2_holder *gh = gl->gl_req_gh;
238 struct buffer_head *bh;
239 int error;
240
241 if (gl->gl_state != LM_ST_UNLOCKED &&
242 (!gh || !(gh->gh_flags & GL_SKIP))) {
243 error = gfs2_meta_read(gl, gl->gl_name.ln_number, 0, &bh);
244 if (!error)
245 brelse(bh);
246 }
247}
248
249/**
250 * inode_go_drop_th - unlock a glock
251 * @gl: the glock
252 *
253 * Invoked from rq_demote().
254 * Another node needs the lock in EXCLUSIVE mode, or the lock (unused for
255 * too long) is being purged from our node's glock cache; we drop the lock.
256 */
257
258static void inode_go_drop_th(struct gfs2_glock *gl)
259{
260 gfs2_pte_inval(gl);
261 gfs2_glock_drop_th(gl);
262}
263
264/**
265 * inode_go_sync - Sync the dirty data and/or metadata for an inode glock
266 * @gl: the glock protecting the inode
267 * @flags:
268 *
269 */
270
271static void inode_go_sync(struct gfs2_glock *gl, int flags)
272{
273 int meta = (flags & DIO_METADATA);
274 int data = (flags & DIO_DATA);
275
276 if (test_bit(GLF_DIRTY, &gl->gl_flags)) {
277 if (meta && data) {
278 gfs2_page_writeback(gl);
279 gfs2_log_flush(gl->gl_sbd, gl);
280 gfs2_meta_sync(gl);
281 gfs2_page_wait(gl);
282 clear_bit(GLF_DIRTY, &gl->gl_flags);
283 } else if (meta) {
284 gfs2_log_flush(gl->gl_sbd, gl);
285 gfs2_meta_sync(gl);
286 } else if (data) {
287 gfs2_page_writeback(gl);
288 gfs2_page_wait(gl);
289 }
290 if (flags & DIO_RELEASE)
291 gfs2_ail_empty_gl(gl);
292 }
293}
294
295/**
296 * inode_go_inval - prepare an inode glock to be released
297 * @gl: the glock
298 * @flags:
299 *
300 */
301
302static void inode_go_inval(struct gfs2_glock *gl, int flags)
303{
304 int meta = (flags & DIO_METADATA);
305 int data = (flags & DIO_DATA);
306
307 if (meta) {
308 gfs2_meta_inval(gl);
309 gl->gl_vn++;
310 }
311 if (data)
312 gfs2_page_inval(gl);
313}
314
315/**
316 * inode_go_demote_ok - Check to see if it's ok to unlock an inode glock
317 * @gl: the glock
318 *
319 * Returns: 1 if it's ok
320 */
321
322static int inode_go_demote_ok(struct gfs2_glock *gl)
323{
324 struct gfs2_sbd *sdp = gl->gl_sbd;
325 int demote = 0;
326
327 if (!gl->gl_object && !gl->gl_aspace->i_mapping->nrpages)
328 demote = 1;
329 else if (!sdp->sd_args.ar_localcaching &&
330 time_after_eq(jiffies, gl->gl_stamp +
331 gfs2_tune_get(sdp, gt_demote_secs) * HZ))
332 demote = 1;
333
334 return demote;
335}
336
337/**
338 * inode_go_lock - operation done after an inode lock is locked by a process
339 * @gl: the glock
340 * @flags:
341 *
342 * Returns: errno
343 */
344
345static int inode_go_lock(struct gfs2_holder *gh)
346{
347 struct gfs2_glock *gl = gh->gh_gl;
348 struct gfs2_inode *ip = gl->gl_object;
349 int error = 0;
350
351 if (!ip)
352 return 0;
353
354 if (ip->i_vn != gl->gl_vn) {
355 error = gfs2_inode_refresh(ip);
356 if (error)
357 return error;
358 gfs2_inode_attr_in(ip);
359 }
360
361 if ((ip->i_di.di_flags & GFS2_DIF_TRUNC_IN_PROG) &&
362 (gl->gl_state == LM_ST_EXCLUSIVE) &&
363 (gh->gh_flags & GL_LOCAL_EXCL))
364 error = gfs2_truncatei_resume(ip);
365
366 return error;
367}
368
369/**
370 * inode_go_unlock - operation done before an inode lock is unlocked by a
371 * process
372 * @gl: the glock
373 * @flags:
374 *
375 */
376
377static void inode_go_unlock(struct gfs2_holder *gh)
378{
379 struct gfs2_glock *gl = gh->gh_gl;
380 struct gfs2_inode *ip = gl->gl_object;
381
382 if (ip == NULL)
383 return;
384 if (test_bit(GLF_DIRTY, &gl->gl_flags))
385 gfs2_inode_attr_in(ip);
386 gfs2_meta_cache_flush(ip);
387}
388
389/**
390 * inode_greedy - adjust the inode's greedy time based on page fault frequency
391 * @gl: the glock
392 *
393 */
394
395static void inode_greedy(struct gfs2_glock *gl)
396{
397 struct gfs2_sbd *sdp = gl->gl_sbd;
398 struct gfs2_inode *ip = gl->gl_object;
399 unsigned int quantum = gfs2_tune_get(sdp, gt_greedy_quantum);
400 unsigned int max = gfs2_tune_get(sdp, gt_greedy_max);
401 unsigned int new_time;
402
403 spin_lock(&ip->i_spin);
404
405 if (time_after(ip->i_last_pfault + quantum, jiffies)) {
406 new_time = ip->i_greedy + quantum;
407 if (new_time > max)
408 new_time = max;
409 } else {
410 new_time = ip->i_greedy - quantum;
411 if (!new_time || new_time > max)
412 new_time = 1;
413 }
414
415 ip->i_greedy = new_time;
416
417 spin_unlock(&ip->i_spin);
418
419 iput(&ip->i_inode);
420}
421
422/**
423 * rgrp_go_demote_ok - Check to see if it's ok to unlock a RG's glock
424 * @gl: the glock
425 *
426 * Returns: 1 if it's ok
427 */
428
429static int rgrp_go_demote_ok(struct gfs2_glock *gl)
430{
431 return !gl->gl_aspace->i_mapping->nrpages;
432}
433
434/**
435 * rgrp_go_lock - operation done after an rgrp lock is locked by
436 * the first holder on this node.
437 * @gl: the glock
438 * @flags:
439 *
440 * Returns: errno
441 */
442
443static int rgrp_go_lock(struct gfs2_holder *gh)
444{
445 return gfs2_rgrp_bh_get(gh->gh_gl->gl_object);
446}
447
448/**
449 * rgrp_go_unlock - operation done before an rgrp lock is unlocked by
450 * the last holder on this node.
451 * @gl: the glock
452 * @flags:
453 *
454 */
455
456static void rgrp_go_unlock(struct gfs2_holder *gh)
457{
458 gfs2_rgrp_bh_put(gh->gh_gl->gl_object);
459}
460
461/**
462 * trans_go_xmote_th - promote/demote the transaction glock
463 * @gl: the glock
464 * @state: the requested state
465 * @flags:
466 *
467 */
468
469static void trans_go_xmote_th(struct gfs2_glock *gl, unsigned int state,
470 int flags)
471{
472 struct gfs2_sbd *sdp = gl->gl_sbd;
473
474 if (gl->gl_state != LM_ST_UNLOCKED &&
475 test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
476 gfs2_meta_syncfs(sdp);
477 gfs2_log_shutdown(sdp);
478 }
479
480 gfs2_glock_xmote_th(gl, state, flags);
481}
482
483/**
484 * trans_go_xmote_bh - After promoting/demoting the transaction glock
485 * @gl: the glock
486 *
487 */
488
489static void trans_go_xmote_bh(struct gfs2_glock *gl)
490{
491 struct gfs2_sbd *sdp = gl->gl_sbd;
492 struct gfs2_inode *ip = GFS2_I(sdp->sd_jdesc->jd_inode);
493 struct gfs2_glock *j_gl = ip->i_gl;
494 struct gfs2_log_header head;
495 int error;
496
497 if (gl->gl_state != LM_ST_UNLOCKED &&
498 test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
499 gfs2_meta_cache_flush(GFS2_I(sdp->sd_jdesc->jd_inode));
500 j_gl->gl_ops->go_inval(j_gl, DIO_METADATA | DIO_DATA);
501
502 error = gfs2_find_jhead(sdp->sd_jdesc, &head);
503 if (error)
504 gfs2_consist(sdp);
505 if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT))
506 gfs2_consist(sdp);
507
508 /* Initialize some head of the log stuff */
509 if (!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) {
510 sdp->sd_log_sequence = head.lh_sequence + 1;
511 gfs2_log_pointers_init(sdp, head.lh_blkno);
512 }
513 }
514}
515
516/**
517 * trans_go_drop_th - unlock the transaction glock
518 * @gl: the glock
519 *
520 * We want to sync the device even with localcaching. Remember
521 * that localcaching journal replay only marks buffers dirty.
522 */
523
524static void trans_go_drop_th(struct gfs2_glock *gl)
525{
526 struct gfs2_sbd *sdp = gl->gl_sbd;
527
528 if (test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
529 gfs2_meta_syncfs(sdp);
530 gfs2_log_shutdown(sdp);
531 }
532
533 gfs2_glock_drop_th(gl);
534}
535
536/**
537 * quota_go_demote_ok - Check to see if it's ok to unlock a quota glock
538 * @gl: the glock
539 *
540 * Returns: 1 if it's ok
541 */
542
543static int quota_go_demote_ok(struct gfs2_glock *gl)
544{
545 return !atomic_read(&gl->gl_lvb_count);
546}
547
548const struct gfs2_glock_operations gfs2_meta_glops = {
549 .go_xmote_th = gfs2_glock_xmote_th,
550 .go_drop_th = gfs2_glock_drop_th,
551 .go_type = LM_TYPE_META,
552};
553
554const struct gfs2_glock_operations gfs2_inode_glops = {
555 .go_xmote_th = inode_go_xmote_th,
556 .go_xmote_bh = inode_go_xmote_bh,
557 .go_drop_th = inode_go_drop_th,
558 .go_sync = inode_go_sync,
559 .go_inval = inode_go_inval,
560 .go_demote_ok = inode_go_demote_ok,
561 .go_lock = inode_go_lock,
562 .go_unlock = inode_go_unlock,
563 .go_greedy = inode_greedy,
564 .go_type = LM_TYPE_INODE,
565};
566
567const struct gfs2_glock_operations gfs2_rgrp_glops = {
568 .go_xmote_th = gfs2_glock_xmote_th,
569 .go_drop_th = gfs2_glock_drop_th,
570 .go_sync = meta_go_sync,
571 .go_inval = meta_go_inval,
572 .go_demote_ok = rgrp_go_demote_ok,
573 .go_lock = rgrp_go_lock,
574 .go_unlock = rgrp_go_unlock,
575 .go_type = LM_TYPE_RGRP,
576};
577
578const struct gfs2_glock_operations gfs2_trans_glops = {
579 .go_xmote_th = trans_go_xmote_th,
580 .go_xmote_bh = trans_go_xmote_bh,
581 .go_drop_th = trans_go_drop_th,
582 .go_type = LM_TYPE_NONDISK,
583};
584
585const struct gfs2_glock_operations gfs2_iopen_glops = {
586 .go_xmote_th = gfs2_glock_xmote_th,
587 .go_drop_th = gfs2_glock_drop_th,
588 .go_type = LM_TYPE_IOPEN,
589};
590
591const struct gfs2_glock_operations gfs2_flock_glops = {
592 .go_xmote_th = gfs2_glock_xmote_th,
593 .go_drop_th = gfs2_glock_drop_th,
594 .go_type = LM_TYPE_FLOCK,
595};
596
597const struct gfs2_glock_operations gfs2_nondisk_glops = {
598 .go_xmote_th = gfs2_glock_xmote_th,
599 .go_drop_th = gfs2_glock_drop_th,
600 .go_type = LM_TYPE_NONDISK,
601};
602
603const struct gfs2_glock_operations gfs2_quota_glops = {
604 .go_xmote_th = gfs2_glock_xmote_th,
605 .go_drop_th = gfs2_glock_drop_th,
606 .go_demote_ok = quota_go_demote_ok,
607 .go_type = LM_TYPE_QUOTA,
608};
609
610const struct gfs2_glock_operations gfs2_journal_glops = {
611 .go_xmote_th = gfs2_glock_xmote_th,
612 .go_drop_th = gfs2_glock_drop_th,
613 .go_type = LM_TYPE_JOURNAL,
614};
615
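/*
 * Illustrative sketch (annotation, not part of this file): judging by
 * the three-field tables above, a minimal operations vector needs only
 * go_xmote_th, go_drop_th and go_type; the other hooks are checked for
 * NULL before glock.c invokes them.  A hypothetical new type would be:
 *
 *	static const struct gfs2_glock_operations example_glops = {
 *		.go_xmote_th = gfs2_glock_xmote_th,
 *		.go_drop_th = gfs2_glock_drop_th,
 *		.go_type = LM_TYPE_NONDISK,
 *	};
 */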
diff --git a/fs/gfs2/glops.h b/fs/gfs2/glops.h
new file mode 100644
index 000000000000..a1d9b5b024e6
--- /dev/null
+++ b/fs/gfs2/glops.h
@@ -0,0 +1,25 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __GLOPS_DOT_H__
11#define __GLOPS_DOT_H__
12
13#include "incore.h"
14
15extern const struct gfs2_glock_operations gfs2_meta_glops;
16extern const struct gfs2_glock_operations gfs2_inode_glops;
17extern const struct gfs2_glock_operations gfs2_rgrp_glops;
18extern const struct gfs2_glock_operations gfs2_trans_glops;
19extern const struct gfs2_glock_operations gfs2_iopen_glops;
20extern const struct gfs2_glock_operations gfs2_flock_glops;
21extern const struct gfs2_glock_operations gfs2_nondisk_glops;
22extern const struct gfs2_glock_operations gfs2_quota_glops;
23extern const struct gfs2_glock_operations gfs2_journal_glops;
24
25#endif /* __GLOPS_DOT_H__ */
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
new file mode 100644
index 000000000000..118dc693d111
--- /dev/null
+++ b/fs/gfs2/incore.h
@@ -0,0 +1,634 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __INCORE_DOT_H__
11#define __INCORE_DOT_H__
12
13#include <linux/fs.h>
14
15#define DIO_WAIT 0x00000010
16#define DIO_METADATA 0x00000020
17#define DIO_DATA 0x00000040
18#define DIO_RELEASE 0x00000080
19#define DIO_ALL 0x00000100
20
21struct gfs2_log_operations;
22struct gfs2_log_element;
23struct gfs2_holder;
24struct gfs2_glock;
25struct gfs2_quota_data;
26struct gfs2_trans;
27struct gfs2_ail;
28struct gfs2_jdesc;
29struct gfs2_sbd;
30
31typedef void (*gfs2_glop_bh_t) (struct gfs2_glock *gl, unsigned int ret);
32
33/*
34 * Structure of operations that are associated with each
35 * type of element in the log.
36 */
37
38struct gfs2_log_operations {
39 void (*lo_add) (struct gfs2_sbd *sdp, struct gfs2_log_element *le);
40 void (*lo_incore_commit) (struct gfs2_sbd *sdp, struct gfs2_trans *tr);
41 void (*lo_before_commit) (struct gfs2_sbd *sdp);
42 void (*lo_after_commit) (struct gfs2_sbd *sdp, struct gfs2_ail *ai);
43 void (*lo_before_scan) (struct gfs2_jdesc *jd,
44 struct gfs2_log_header *head, int pass);
45 int (*lo_scan_elements) (struct gfs2_jdesc *jd, unsigned int start,
46 struct gfs2_log_descriptor *ld, __be64 *ptr,
47 int pass);
48 void (*lo_after_scan) (struct gfs2_jdesc *jd, int error, int pass);
49 const char *lo_name;
50};
51
52struct gfs2_log_element {
53 struct list_head le_list;
54 const struct gfs2_log_operations *le_ops;
55};
56
57struct gfs2_bitmap {
58 struct buffer_head *bi_bh;
59 char *bi_clone;
60 u32 bi_offset;
61 u32 bi_start;
62 u32 bi_len;
63};
64
65struct gfs2_rgrpd {
66 struct list_head rd_list; /* Link with superblock */
67 struct list_head rd_list_mru;
68 struct list_head rd_recent; /* Recently used rgrps */
69 struct gfs2_glock *rd_gl; /* Glock for this rgrp */
70 struct gfs2_rindex rd_ri;
71 struct gfs2_rgrp rd_rg;
72 u64 rd_rg_vn;
73 struct gfs2_bitmap *rd_bits;
74 unsigned int rd_bh_count;
75 struct mutex rd_mutex;
76 u32 rd_free_clone;
77 struct gfs2_log_element rd_le;
78 u32 rd_last_alloc_data;
79 u32 rd_last_alloc_meta;
80 struct gfs2_sbd *rd_sbd;
81};
82
83enum gfs2_state_bits {
84 BH_Pinned = BH_PrivateStart,
85 BH_Escaped = BH_PrivateStart + 1,
86};
87
88BUFFER_FNS(Pinned, pinned)
89TAS_BUFFER_FNS(Pinned, pinned)
90BUFFER_FNS(Escaped, escaped)
91TAS_BUFFER_FNS(Escaped, escaped)
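/*
 * For reference, BUFFER_FNS() and TAS_BUFFER_FNS() come from
 * <linux/buffer_head.h> and generate the usual buffer-flag accessors.
 * Roughly (a sketch of the BH_Pinned expansion, not verbatim):
 *
 *	static inline void set_buffer_pinned(struct buffer_head *bh)
 *	{
 *		set_bit(BH_Pinned, &bh->b_state);
 *	}
 *	static inline int buffer_pinned(const struct buffer_head *bh)
 *	{
 *		return test_bit(BH_Pinned, &bh->b_state);
 *	}
 *	static inline int test_set_buffer_pinned(struct buffer_head *bh)
 *	{
 *		return test_and_set_bit(BH_Pinned, &bh->b_state);
 *	}
 */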
92
93struct gfs2_bufdata {
94 struct buffer_head *bd_bh;
95 struct gfs2_glock *bd_gl;
96
97 struct list_head bd_list_tr;
98 struct gfs2_log_element bd_le;
99
100 struct gfs2_ail *bd_ail;
101 struct list_head bd_ail_st_list;
102 struct list_head bd_ail_gl_list;
103};
104
105struct gfs2_glock_operations {
106 void (*go_xmote_th) (struct gfs2_glock * gl, unsigned int state,
107 int flags);
108 void (*go_xmote_bh) (struct gfs2_glock * gl);
109 void (*go_drop_th) (struct gfs2_glock * gl);
110 void (*go_drop_bh) (struct gfs2_glock * gl);
111 void (*go_sync) (struct gfs2_glock * gl, int flags);
112 void (*go_inval) (struct gfs2_glock * gl, int flags);
113 int (*go_demote_ok) (struct gfs2_glock * gl);
114 int (*go_lock) (struct gfs2_holder * gh);
115 void (*go_unlock) (struct gfs2_holder * gh);
116 void (*go_callback) (struct gfs2_glock * gl, unsigned int state);
117 void (*go_greedy) (struct gfs2_glock * gl);
118 const int go_type;
119};
120
121enum {
122 /* Actions */
123 HIF_MUTEX = 0,
124 HIF_PROMOTE = 1,
125 HIF_DEMOTE = 2,
126 HIF_GREEDY = 3,
127
128 /* States */
129 HIF_ALLOCED = 4,
130 HIF_DEALLOC = 5,
131 HIF_HOLDER = 6,
132 HIF_FIRST = 7,
133 HIF_ABORTED = 9,
134};
135
136struct gfs2_holder {
137 struct list_head gh_list;
138
139 struct gfs2_glock *gh_gl;
140 struct task_struct *gh_owner;
141 unsigned int gh_state;
142 unsigned gh_flags;
143
144 int gh_error;
145 unsigned long gh_iflags;
146 struct completion gh_wait;
147 unsigned long gh_ip;
148};
149
150enum {
151 GLF_LOCK = 1,
152 GLF_STICKY = 2,
153 GLF_PREFETCH = 3,
154 GLF_DIRTY = 5,
155 GLF_SKIP_WAITERS2 = 6,
156 GLF_GREEDY = 7,
157};
158
159struct gfs2_glock {
160 struct hlist_node gl_list;
161 unsigned long gl_flags; /* GLF_... */
162 struct lm_lockname gl_name;
163 atomic_t gl_ref;
164
165 spinlock_t gl_spin;
166
167 unsigned int gl_state;
168 unsigned int gl_hash;
169 struct task_struct *gl_owner;
170 unsigned long gl_ip;
171 struct list_head gl_holders;
172 struct list_head gl_waiters1; /* HIF_MUTEX */
173 struct list_head gl_waiters2; /* HIF_DEMOTE, HIF_GREEDY */
174 struct list_head gl_waiters3; /* HIF_PROMOTE */
175
176 const struct gfs2_glock_operations *gl_ops;
177
178 struct gfs2_holder *gl_req_gh;
179 gfs2_glop_bh_t gl_req_bh;
180
181 void *gl_lock;
182 char *gl_lvb;
183 atomic_t gl_lvb_count;
184
185 u64 gl_vn;
186 unsigned long gl_stamp;
187 void *gl_object;
188
189 struct list_head gl_reclaim;
190
191 struct gfs2_sbd *gl_sbd;
192
193 struct inode *gl_aspace;
194 struct gfs2_log_element gl_le;
195 struct list_head gl_ail_list;
196 atomic_t gl_ail_count;
197};
198
199struct gfs2_alloc {
200 /* Quota stuff */
201
202 struct gfs2_quota_data *al_qd[2*MAXQUOTAS];
203 struct gfs2_holder al_qd_ghs[2*MAXQUOTAS];
204 unsigned int al_qd_num;
205
206 u32 al_requested; /* Filled in by caller of gfs2_inplace_reserve() */
207 u32 al_alloced; /* Filled in by gfs2_alloc_*() */
208
209 /* Filled in by gfs2_inplace_reserve() */
210
211 unsigned int al_line;
212 char *al_file;
213 struct gfs2_holder al_ri_gh;
214 struct gfs2_holder al_rgd_gh;
215 struct gfs2_rgrpd *al_rgd;
216
217};
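/*
 * The usual life cycle of a gfs2_alloc, as exercised by alloc_dinode()
 * and link_dinode() in inode.c later in this patch, is (sketch):
 *
 *	al = gfs2_alloc_get(ip);
 *	al->al_requested = blocks;		allocation size wanted
 *	error = gfs2_inplace_reserve(ip);
 *	... allocate (e.g. gfs2_alloc_di()) inside a transaction ...
 *	gfs2_inplace_release(ip);
 *	gfs2_alloc_put(ip);
 */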
218
219enum {
220 GIF_QD_LOCKED = 1,
221 GIF_PAGED = 2,
222 GIF_SW_PAGED = 3,
223};
224
225struct gfs2_inode {
226 struct inode i_inode;
227 struct gfs2_inum i_num;
228
229 unsigned long i_flags; /* GIF_... */
230
231 u64 i_vn;
232 struct gfs2_dinode i_di; /* To be replaced by ref to block */
233
234 struct gfs2_glock *i_gl; /* Move into i_gh? */
235 struct gfs2_holder i_iopen_gh;
236 struct gfs2_holder i_gh; /* for prepare/commit_write only */
237 struct gfs2_alloc i_alloc;
238 u64 i_last_rg_alloc;
239
240 spinlock_t i_spin;
241 struct rw_semaphore i_rw_mutex;
242 unsigned int i_greedy;
243 unsigned long i_last_pfault;
244
245 struct buffer_head *i_cache[GFS2_MAX_META_HEIGHT];
246};
247
248/*
249 * Since i_inode is the first element of struct gfs2_inode,
250 * this is effectively a cast.
251 */
252static inline struct gfs2_inode *GFS2_I(struct inode *inode)
253{
254 return container_of(inode, struct gfs2_inode, i_inode);
255}
256
257/* To be removed? */
258static inline struct gfs2_sbd *GFS2_SB(struct inode *inode)
259{
260 return inode->i_sb->s_fs_info;
261}
262
263enum {
264 GFF_DID_DIRECT_ALLOC = 0,
265 GFF_EXLOCK = 1,
266};
267
268struct gfs2_file {
269 unsigned long f_flags; /* GFF_... */
270 struct mutex f_fl_mutex;
271 struct gfs2_holder f_fl_gh;
272};
273
274struct gfs2_revoke {
275 struct gfs2_log_element rv_le;
276 u64 rv_blkno;
277};
278
279struct gfs2_revoke_replay {
280 struct list_head rr_list;
281 u64 rr_blkno;
282 unsigned int rr_where;
283};
284
285enum {
286 QDF_USER = 0,
287 QDF_CHANGE = 1,
288 QDF_LOCKED = 2,
289};
290
291struct gfs2_quota_lvb {
292 __be32 qb_magic;
293 u32 __pad;
294 __be64 qb_limit; /* Hard limit of # blocks to alloc */
295 __be64 qb_warn; /* Warn user when alloc is above this # */
296 __be64 qb_value; /* Current # blocks allocated */
297};
298
299struct gfs2_quota_data {
300 struct list_head qd_list;
301 unsigned int qd_count;
302
303 u32 qd_id;
304 unsigned long qd_flags; /* QDF_... */
305
306 s64 qd_change;
307 s64 qd_change_sync;
308
309 unsigned int qd_slot;
310 unsigned int qd_slot_count;
311
312 struct buffer_head *qd_bh;
313 struct gfs2_quota_change *qd_bh_qc;
314 unsigned int qd_bh_count;
315
316 struct gfs2_glock *qd_gl;
317 struct gfs2_quota_lvb qd_qb;
318
319 u64 qd_sync_gen;
320 unsigned long qd_last_warn;
321 unsigned long qd_last_touched;
322};
323
324struct gfs2_log_buf {
325 struct list_head lb_list;
326 struct buffer_head *lb_bh;
327 struct buffer_head *lb_real;
328};
329
330struct gfs2_trans {
331 unsigned long tr_ip;
332
333 unsigned int tr_blocks;
334 unsigned int tr_revokes;
335 unsigned int tr_reserved;
336
337 struct gfs2_holder tr_t_gh;
338
339 int tr_touched;
340
341 unsigned int tr_num_buf;
342 unsigned int tr_num_buf_new;
343 unsigned int tr_num_buf_rm;
344 struct list_head tr_list_buf;
345
346 unsigned int tr_num_revoke;
347 unsigned int tr_num_revoke_rm;
348};
349
350struct gfs2_ail {
351 struct list_head ai_list;
352
353 unsigned int ai_first;
354 struct list_head ai_ail1_list;
355 struct list_head ai_ail2_list;
356
357 u64 ai_sync_gen;
358};
359
360struct gfs2_jdesc {
361 struct list_head jd_list;
362
363 struct inode *jd_inode;
364 unsigned int jd_jid;
365 int jd_dirty;
366
367 unsigned int jd_blocks;
368};
369
370#define GFS2_GLOCKD_DEFAULT 1
371#define GFS2_GLOCKD_MAX 16
372
373#define GFS2_QUOTA_DEFAULT GFS2_QUOTA_OFF
374#define GFS2_QUOTA_OFF 0
375#define GFS2_QUOTA_ACCOUNT 1
376#define GFS2_QUOTA_ON 2
377
378#define GFS2_DATA_DEFAULT GFS2_DATA_ORDERED
379#define GFS2_DATA_WRITEBACK 1
380#define GFS2_DATA_ORDERED 2
381
382struct gfs2_args {
383 char ar_lockproto[GFS2_LOCKNAME_LEN]; /* Name of the Lock Protocol */
384 char ar_locktable[GFS2_LOCKNAME_LEN]; /* Name of the Lock Table */
385 char ar_hostdata[GFS2_LOCKNAME_LEN]; /* Host specific data */
386 int ar_spectator; /* Don't get a journal because we're always RO */
387 int ar_ignore_local_fs; /* Don't optimize even if local_fs is 1 */
388 int ar_localflocks; /* Let the VFS do flock|fcntl locks for us */
389 int ar_localcaching; /* Local-style caching (dangerous on multihost) */
390 int ar_debug; /* Oops on errors instead of trying to be graceful */
391 int ar_upgrade; /* Upgrade ondisk/multihost format */
392 unsigned int ar_num_glockd; /* Number of glockd threads */
393 int ar_posix_acl; /* Enable posix acls */
394 int ar_quota; /* off/account/on */
395 int ar_suiddir; /* suiddir support */
396 int ar_data; /* ordered/writeback */
397};
398
399struct gfs2_tune {
400 spinlock_t gt_spin;
401
402 unsigned int gt_ilimit;
403 unsigned int gt_ilimit_tries;
404 unsigned int gt_ilimit_min;
405 unsigned int gt_demote_secs; /* Cache retention for unheld glock */
406 unsigned int gt_incore_log_blocks;
407 unsigned int gt_log_flush_secs;
408 unsigned int gt_jindex_refresh_secs; /* Check for new journal index */
409
410 unsigned int gt_scand_secs;
411 unsigned int gt_recoverd_secs;
412 unsigned int gt_logd_secs;
413 unsigned int gt_quotad_secs;
414
415 unsigned int gt_quota_simul_sync; /* Max quotavals to sync at once */
416 unsigned int gt_quota_warn_period; /* Secs between quota warn msgs */
417 unsigned int gt_quota_scale_num; /* Numerator */
418 unsigned int gt_quota_scale_den; /* Denominator */
419 unsigned int gt_quota_cache_secs;
420 unsigned int gt_quota_quantum; /* Secs between syncs to quota file */
421 unsigned int gt_atime_quantum; /* Min secs between atime updates */
422 unsigned int gt_new_files_jdata;
423 unsigned int gt_new_files_directio;
424 unsigned int gt_max_atomic_write; /* Split big writes into this size */
425 unsigned int gt_max_readahead; /* Max bytes to read-ahead from disk */
426 unsigned int gt_lockdump_size;
427 unsigned int gt_stall_secs; /* Detects trouble! */
428 unsigned int gt_complain_secs;
429 unsigned int gt_reclaim_limit; /* Max num of glocks in reclaim list */
430 unsigned int gt_entries_per_readdir;
431 unsigned int gt_prefetch_secs; /* Usage window for prefetched glocks */
432 unsigned int gt_greedy_default;
433 unsigned int gt_greedy_quantum;
434 unsigned int gt_greedy_max;
435 unsigned int gt_statfs_quantum;
436 unsigned int gt_statfs_slow;
437};
438
439enum {
440 SDF_JOURNAL_CHECKED = 0,
441 SDF_JOURNAL_LIVE = 1,
442 SDF_SHUTDOWN = 2,
443 SDF_NOATIME = 3,
444};
445
446#define GFS2_FSNAME_LEN 256
447
448struct gfs2_sbd {
449 struct super_block *sd_vfs;
450 struct super_block *sd_vfs_meta;
451 struct kobject sd_kobj;
452 unsigned long sd_flags; /* SDF_... */
453 struct gfs2_sb sd_sb;
454
455 /* Constants computed on mount */
456
457 u32 sd_fsb2bb;
458 u32 sd_fsb2bb_shift;
459 u32 sd_diptrs; /* Number of pointers in a dinode */
460	u32 sd_inptrs;	/* Number of pointers in an indirect block */
461 u32 sd_jbsize; /* Size of a journaled data block */
462 u32 sd_hash_bsize; /* sizeof(exhash block) */
463 u32 sd_hash_bsize_shift;
464 u32 sd_hash_ptrs; /* Number of pointers in a hash block */
465 u32 sd_qc_per_block;
466 u32 sd_max_dirres; /* Max blocks needed to add a directory entry */
467 u32 sd_max_height; /* Max height of a file's metadata tree */
468 u64 sd_heightsize[GFS2_MAX_META_HEIGHT];
469 u32 sd_max_jheight; /* Max height of journaled file's meta tree */
470 u64 sd_jheightsize[GFS2_MAX_META_HEIGHT];
471
472 struct gfs2_args sd_args; /* Mount arguments */
473 struct gfs2_tune sd_tune; /* Filesystem tuning structure */
474
475 /* Lock Stuff */
476
477 struct lm_lockstruct sd_lockstruct;
478 struct list_head sd_reclaim_list;
479 spinlock_t sd_reclaim_lock;
480 wait_queue_head_t sd_reclaim_wq;
481 atomic_t sd_reclaim_count;
482 struct gfs2_holder sd_live_gh;
483 struct gfs2_glock *sd_rename_gl;
484 struct gfs2_glock *sd_trans_gl;
485
486 /* Inode Stuff */
487
488 struct inode *sd_master_dir;
489 struct inode *sd_jindex;
490 struct inode *sd_inum_inode;
491 struct inode *sd_statfs_inode;
492 struct inode *sd_ir_inode;
493 struct inode *sd_sc_inode;
494 struct inode *sd_qc_inode;
495 struct inode *sd_rindex;
496 struct inode *sd_quota_inode;
497
498 /* Inum stuff */
499
500 struct mutex sd_inum_mutex;
501
502 /* StatFS stuff */
503
504 spinlock_t sd_statfs_spin;
505 struct mutex sd_statfs_mutex;
506 struct gfs2_statfs_change sd_statfs_master;
507 struct gfs2_statfs_change sd_statfs_local;
508 unsigned long sd_statfs_sync_time;
509
510 /* Resource group stuff */
511
512 u64 sd_rindex_vn;
513 spinlock_t sd_rindex_spin;
514 struct mutex sd_rindex_mutex;
515 struct list_head sd_rindex_list;
516 struct list_head sd_rindex_mru_list;
517 struct list_head sd_rindex_recent_list;
518 struct gfs2_rgrpd *sd_rindex_forward;
519 unsigned int sd_rgrps;
520
521 /* Journal index stuff */
522
523 struct list_head sd_jindex_list;
524 spinlock_t sd_jindex_spin;
525 struct mutex sd_jindex_mutex;
526 unsigned int sd_journals;
527 unsigned long sd_jindex_refresh_time;
528
529 struct gfs2_jdesc *sd_jdesc;
530 struct gfs2_holder sd_journal_gh;
531 struct gfs2_holder sd_jinode_gh;
532
533 struct gfs2_holder sd_ir_gh;
534 struct gfs2_holder sd_sc_gh;
535 struct gfs2_holder sd_qc_gh;
536
537 /* Daemon stuff */
538
539 struct task_struct *sd_scand_process;
540 struct task_struct *sd_recoverd_process;
541 struct task_struct *sd_logd_process;
542 struct task_struct *sd_quotad_process;
543 struct task_struct *sd_glockd_process[GFS2_GLOCKD_MAX];
544 unsigned int sd_glockd_num;
545
546 /* Quota stuff */
547
548 struct list_head sd_quota_list;
549 atomic_t sd_quota_count;
550 spinlock_t sd_quota_spin;
551 struct mutex sd_quota_mutex;
552
553 unsigned int sd_quota_slots;
554 unsigned int sd_quota_chunks;
555 unsigned char **sd_quota_bitmap;
556
557 u64 sd_quota_sync_gen;
558 unsigned long sd_quota_sync_time;
559
560 /* Log stuff */
561
562 spinlock_t sd_log_lock;
563
564 unsigned int sd_log_blks_reserved;
565 unsigned int sd_log_commited_buf;
566 unsigned int sd_log_commited_revoke;
567
568 unsigned int sd_log_num_gl;
569 unsigned int sd_log_num_buf;
570 unsigned int sd_log_num_revoke;
571 unsigned int sd_log_num_rg;
572 unsigned int sd_log_num_databuf;
573 unsigned int sd_log_num_jdata;
574 unsigned int sd_log_num_hdrs;
575
576 struct list_head sd_log_le_gl;
577 struct list_head sd_log_le_buf;
578 struct list_head sd_log_le_revoke;
579 struct list_head sd_log_le_rg;
580 struct list_head sd_log_le_databuf;
581
582 unsigned int sd_log_blks_free;
583 struct mutex sd_log_reserve_mutex;
584
585 u64 sd_log_sequence;
586 unsigned int sd_log_head;
587 unsigned int sd_log_tail;
588 int sd_log_idle;
589
590 unsigned long sd_log_flush_time;
591 struct rw_semaphore sd_log_flush_lock;
592 struct list_head sd_log_flush_list;
593
594 unsigned int sd_log_flush_head;
595 u64 sd_log_flush_wrapped;
596
597 struct list_head sd_ail1_list;
598 struct list_head sd_ail2_list;
599 u64 sd_ail_sync_gen;
600
601 /* Replay stuff */
602
603 struct list_head sd_revoke_list;
604 unsigned int sd_replay_tail;
605
606 unsigned int sd_found_blocks;
607 unsigned int sd_found_revokes;
608 unsigned int sd_replayed_blocks;
609
610 /* For quiescing the filesystem */
611
612 struct gfs2_holder sd_freeze_gh;
613 struct mutex sd_freeze_lock;
614 unsigned int sd_freeze_count;
615
616 /* Counters */
617
618 atomic_t sd_glock_count;
619 atomic_t sd_glock_held_count;
620 atomic_t sd_inode_count;
621 atomic_t sd_reclaimed;
622
623 char sd_fsname[GFS2_FSNAME_LEN];
624 char sd_table_name[GFS2_FSNAME_LEN];
625 char sd_proto_name[GFS2_FSNAME_LEN];
626
627 /* Debugging crud */
628
629 unsigned long sd_last_warning;
630 struct vfsmount *sd_gfs2mnt;
631};
632
633#endif /* __INCORE_DOT_H__ */
634
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
new file mode 100644
index 000000000000..57c43ac47925
--- /dev/null
+++ b/fs/gfs2/inode.c
@@ -0,0 +1,1379 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/posix_acl.h>
16#include <linux/sort.h>
17#include <linux/gfs2_ondisk.h>
18#include <linux/crc32.h>
19#include <linux/lm_interface.h>
20#include <linux/security.h>
21
22#include "gfs2.h"
23#include "incore.h"
24#include "acl.h"
25#include "bmap.h"
26#include "dir.h"
27#include "eattr.h"
28#include "glock.h"
29#include "glops.h"
30#include "inode.h"
31#include "log.h"
32#include "meta_io.h"
33#include "ops_address.h"
34#include "ops_file.h"
35#include "ops_inode.h"
36#include "quota.h"
37#include "rgrp.h"
38#include "trans.h"
39#include "util.h"
40
41/**
42 * gfs2_inode_attr_in - Copy attributes from the dinode into the VFS inode
43 * @ip: The GFS2 inode (with embedded disk inode data)
45 *
46 */
47
48void gfs2_inode_attr_in(struct gfs2_inode *ip)
49{
50 struct inode *inode = &ip->i_inode;
51 struct gfs2_dinode *di = &ip->i_di;
52
53 inode->i_ino = ip->i_num.no_addr;
54
55 switch (di->di_mode & S_IFMT) {
56 case S_IFBLK:
57 case S_IFCHR:
58 inode->i_rdev = MKDEV(di->di_major, di->di_minor);
59 break;
60 default:
61 inode->i_rdev = 0;
62 break;
63	}
64
65 inode->i_mode = di->di_mode;
66 inode->i_nlink = di->di_nlink;
67 inode->i_uid = di->di_uid;
68 inode->i_gid = di->di_gid;
69 i_size_write(inode, di->di_size);
70 inode->i_atime.tv_sec = di->di_atime;
71 inode->i_mtime.tv_sec = di->di_mtime;
72 inode->i_ctime.tv_sec = di->di_ctime;
73 inode->i_atime.tv_nsec = 0;
74 inode->i_mtime.tv_nsec = 0;
75 inode->i_ctime.tv_nsec = 0;
76 inode->i_blocks = di->di_blocks <<
77 (GFS2_SB(inode)->sd_sb.sb_bsize_shift - GFS2_BASIC_BLOCK_SHIFT);
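	/*
	 * Note: i_blocks counts 512-byte basic blocks while di_blocks
	 * counts filesystem blocks; e.g. with a 4096-byte block size
	 * (sb_bsize_shift == 12, GFS2_BASIC_BLOCK_SHIFT == 9) each fs
	 * block contributes 1 << 3 == 8 to i_blocks.
	 */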
78
79 if (di->di_flags & GFS2_DIF_IMMUTABLE)
80 inode->i_flags |= S_IMMUTABLE;
81 else
82 inode->i_flags &= ~S_IMMUTABLE;
83
84 if (di->di_flags & GFS2_DIF_APPENDONLY)
85 inode->i_flags |= S_APPEND;
86 else
87 inode->i_flags &= ~S_APPEND;
88}
89
90/**
91 * gfs2_inode_attr_out - Copy attributes from VFS inode into the dinode
92 * @ip: The GFS2 inode
93 *
94 * Only copy out the attributes that we want the VFS layer
95 * to be able to modify.
96 */
97
98void gfs2_inode_attr_out(struct gfs2_inode *ip)
99{
100 struct inode *inode = &ip->i_inode;
101 struct gfs2_dinode *di = &ip->i_di;
102 gfs2_assert_withdraw(GFS2_SB(inode),
103 (di->di_mode & S_IFMT) == (inode->i_mode & S_IFMT));
104 di->di_mode = inode->i_mode;
105 di->di_uid = inode->i_uid;
106 di->di_gid = inode->i_gid;
107 di->di_atime = inode->i_atime.tv_sec;
108 di->di_mtime = inode->i_mtime.tv_sec;
109 di->di_ctime = inode->i_ctime.tv_sec;
110}
111
112static int iget_test(struct inode *inode, void *opaque)
113{
114 struct gfs2_inode *ip = GFS2_I(inode);
115 struct gfs2_inum *inum = opaque;
116
117 if (ip && ip->i_num.no_addr == inum->no_addr)
118 return 1;
119
120 return 0;
121}
122
123static int iget_set(struct inode *inode, void *opaque)
124{
125 struct gfs2_inode *ip = GFS2_I(inode);
126 struct gfs2_inum *inum = opaque;
127
128 ip->i_num = *inum;
129 return 0;
130}
131
132struct inode *gfs2_ilookup(struct super_block *sb, struct gfs2_inum *inum)
133{
134 return ilookup5(sb, (unsigned long)inum->no_formal_ino,
135 iget_test, inum);
136}
137
138static struct inode *gfs2_iget(struct super_block *sb, struct gfs2_inum *inum)
139{
140 return iget5_locked(sb, (unsigned long)inum->no_formal_ino,
141 iget_test, iget_set, inum);
142}
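/*
 * Note: the inode hash above is keyed on no_formal_ino, while
 * iget_test() matches on no_addr, so both halves of the gfs2_inum
 * take part in the lookup.
 */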
143
144/**
145 * gfs2_inode_lookup - Lookup an inode
146 * @sb: The super block
147 * @inum: The inode number
148 * @type: The type of the inode
149 *
150 * Returns: A VFS inode, or an error
151 */
152
153struct inode *gfs2_inode_lookup(struct super_block *sb, struct gfs2_inum *inum, unsigned int type)
154{
155 struct inode *inode = gfs2_iget(sb, inum);
156 struct gfs2_inode *ip = GFS2_I(inode);
157 struct gfs2_glock *io_gl;
158 int error;
159
160 if (inode->i_state & I_NEW) {
161 struct gfs2_sbd *sdp = GFS2_SB(inode);
162 umode_t mode = DT2IF(type);
163 inode->i_private = ip;
164 inode->i_mode = mode;
165
166 if (S_ISREG(mode)) {
167 inode->i_op = &gfs2_file_iops;
168 inode->i_fop = &gfs2_file_fops;
169 inode->i_mapping->a_ops = &gfs2_file_aops;
170 } else if (S_ISDIR(mode)) {
171 inode->i_op = &gfs2_dir_iops;
172 inode->i_fop = &gfs2_dir_fops;
173 } else if (S_ISLNK(mode)) {
174 inode->i_op = &gfs2_symlink_iops;
175 } else {
176 inode->i_op = &gfs2_dev_iops;
177 }
178
179 error = gfs2_glock_get(sdp, inum->no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl);
180 if (unlikely(error))
181 goto fail;
182 ip->i_gl->gl_object = ip;
183
184 error = gfs2_glock_get(sdp, inum->no_addr, &gfs2_iopen_glops, CREATE, &io_gl);
185 if (unlikely(error))
186 goto fail_put;
187
188 ip->i_vn = ip->i_gl->gl_vn - 1;
189 error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh);
190 if (unlikely(error))
191 goto fail_iopen;
192
193 gfs2_glock_put(io_gl);
194 unlock_new_inode(inode);
195 }
196
197 return inode;
198fail_iopen:
199 gfs2_glock_put(io_gl);
200fail_put:
201 ip->i_gl->gl_object = NULL;
202 gfs2_glock_put(ip->i_gl);
203fail:
204 iput(inode);
205 return ERR_PTR(error);
206}
207
208/**
209 * gfs2_inode_refresh - Refresh the incore copy of the dinode
210 * @ip: The GFS2 inode
211 *
212 * Returns: errno
213 */
214
215int gfs2_inode_refresh(struct gfs2_inode *ip)
216{
217 struct buffer_head *dibh;
218 int error;
219
220 error = gfs2_meta_inode_buffer(ip, &dibh);
221 if (error)
222 return error;
223
224 if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), dibh, GFS2_METATYPE_DI)) {
225 brelse(dibh);
226 return -EIO;
227 }
228
229 gfs2_dinode_in(&ip->i_di, dibh->b_data);
230
231 brelse(dibh);
232
233 if (ip->i_num.no_addr != ip->i_di.di_num.no_addr) {
234 if (gfs2_consist_inode(ip))
235 gfs2_dinode_print(&ip->i_di);
236 return -EIO;
237 }
238 if (ip->i_num.no_formal_ino != ip->i_di.di_num.no_formal_ino)
239 return -ESTALE;
240
241 ip->i_vn = ip->i_gl->gl_vn;
242
243 return 0;
244}
245
246int gfs2_dinode_dealloc(struct gfs2_inode *ip)
247{
248 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
249 struct gfs2_alloc *al;
250 struct gfs2_rgrpd *rgd;
251 int error;
252
253 if (ip->i_di.di_blocks != 1) {
254 if (gfs2_consist_inode(ip))
255 gfs2_dinode_print(&ip->i_di);
256 return -EIO;
257 }
258
259 al = gfs2_alloc_get(ip);
260
261 error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
262 if (error)
263 goto out;
264
265 error = gfs2_rindex_hold(sdp, &al->al_ri_gh);
266 if (error)
267 goto out_qs;
268
269 rgd = gfs2_blk2rgrpd(sdp, ip->i_num.no_addr);
270 if (!rgd) {
271 gfs2_consist_inode(ip);
272 error = -EIO;
273 goto out_rindex_relse;
274 }
275
276 error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0,
277 &al->al_rgd_gh);
278 if (error)
279 goto out_rindex_relse;
280
281 error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_STATFS + RES_QUOTA, 1);
282 if (error)
283 goto out_rg_gunlock;
284
285 gfs2_trans_add_gl(ip->i_gl);
286
287 gfs2_free_di(rgd, ip);
288
289 gfs2_trans_end(sdp);
290 clear_bit(GLF_STICKY, &ip->i_gl->gl_flags);
291
292out_rg_gunlock:
293 gfs2_glock_dq_uninit(&al->al_rgd_gh);
294out_rindex_relse:
295 gfs2_glock_dq_uninit(&al->al_ri_gh);
296out_qs:
297 gfs2_quota_unhold(ip);
298out:
299 gfs2_alloc_put(ip);
300 return error;
301}
302
303/**
304 * gfs2_change_nlink - Change nlink count on inode
305 * @ip: The GFS2 inode
306 * @diff: The change in the nlink count required
307 *
308 * Returns: errno
309 */
310
311int gfs2_change_nlink(struct gfs2_inode *ip, int diff)
312{
313 struct gfs2_sbd *sdp = ip->i_inode.i_sb->s_fs_info;
314 struct buffer_head *dibh;
315 u32 nlink;
316 int error;
317
318 BUG_ON(ip->i_di.di_nlink != ip->i_inode.i_nlink);
319 nlink = ip->i_di.di_nlink + diff;
320
321 /* If we are reducing the nlink count, but the new value ends up being
322 bigger than the old one, we must have underflowed. */
323 if (diff < 0 && nlink > ip->i_di.di_nlink) {
324 if (gfs2_consist_inode(ip))
325 gfs2_dinode_print(&ip->i_di);
326 return -EIO;
327 }
328
329 error = gfs2_meta_inode_buffer(ip, &dibh);
330 if (error)
331 return error;
332
333 ip->i_di.di_nlink = nlink;
334 ip->i_di.di_ctime = get_seconds();
335 ip->i_inode.i_nlink = nlink;
336
337 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
338 gfs2_dinode_out(&ip->i_di, dibh->b_data);
339 brelse(dibh);
340 mark_inode_dirty(&ip->i_inode);
341
342 if (ip->i_di.di_nlink == 0) {
343 struct gfs2_rgrpd *rgd;
344 struct gfs2_holder ri_gh, rg_gh;
345
346 error = gfs2_rindex_hold(sdp, &ri_gh);
347 if (error)
348 goto out;
349 error = -EIO;
350 rgd = gfs2_blk2rgrpd(sdp, ip->i_num.no_addr);
351 if (!rgd)
352 goto out_norgrp;
353 error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &rg_gh);
354 if (error)
355 goto out_norgrp;
356
357 clear_nlink(&ip->i_inode);
358 gfs2_unlink_di(&ip->i_inode); /* mark inode unlinked */
359 gfs2_glock_dq_uninit(&rg_gh);
360out_norgrp:
361 gfs2_glock_dq_uninit(&ri_gh);
362 }
363out:
364 return error;
365}
366
367struct inode *gfs2_lookup_simple(struct inode *dip, const char *name)
368{
369 struct qstr qstr;
370 gfs2_str2qstr(&qstr, name);
371 return gfs2_lookupi(dip, &qstr, 1, NULL);
372}
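/*
 * Usage sketch (illustrative; "jindex" is an example name): because
 * gfs2_lookupi() maps -ENOENT to a NULL return, callers check both
 * IS_ERR() and NULL:
 *
 *	struct inode *inode = gfs2_lookup_simple(sdp->sd_master_dir,
 *						 "jindex");
 *	if (IS_ERR(inode))
 *		return PTR_ERR(inode);
 *	if (!inode)
 *		return -ENOENT;
 */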
373
374
375/**
376 * gfs2_lookupi - Look up a filename in a directory and return its inode
377 * @dir: The directory to search in
378 * @name: The name of the inode to look for
379 * @is_root: If 1, ignore the caller's permissions
380 * @nd: The nameidata from the VFS lookup (may be NULL)
381 *
382 * There will always be a vnode (Linux VFS inode) for the @dir inode unless
383 * @is_root is true.
384 *
385 * Returns: The inode, NULL if the name does not exist, or an ERR_PTR
386 */
387
388struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
389 int is_root, struct nameidata *nd)
390{
391 struct super_block *sb = dir->i_sb;
392 struct gfs2_inode *dip = GFS2_I(dir);
393 struct gfs2_holder d_gh;
394 struct gfs2_inum inum;
395 unsigned int type;
396 int error = 0;
397 struct inode *inode = NULL;
398
399 if (!name->len || name->len > GFS2_FNAMESIZE)
400 return ERR_PTR(-ENAMETOOLONG);
401
402 if ((name->len == 1 && memcmp(name->name, ".", 1) == 0) ||
403 (name->len == 2 && memcmp(name->name, "..", 2) == 0 &&
404 dir == sb->s_root->d_inode)) {
405 igrab(dir);
406 return dir;
407 }
408
409 error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh);
410 if (error)
411 return ERR_PTR(error);
412
413 if (!is_root) {
414 error = permission(dir, MAY_EXEC, NULL);
415 if (error)
416 goto out;
417 }
418
419 error = gfs2_dir_search(dir, name, &inum, &type);
420 if (error)
421 goto out;
422
423 inode = gfs2_inode_lookup(sb, &inum, type);
424
425out:
426 gfs2_glock_dq_uninit(&d_gh);
427 if (error == -ENOENT)
428 return NULL;
429 return inode;
430}
431
432static int pick_formal_ino_1(struct gfs2_sbd *sdp, u64 *formal_ino)
433{
434 struct gfs2_inode *ip = GFS2_I(sdp->sd_ir_inode);
435 struct buffer_head *bh;
436 struct gfs2_inum_range ir;
437 int error;
438
439 error = gfs2_trans_begin(sdp, RES_DINODE, 0);
440 if (error)
441 return error;
442 mutex_lock(&sdp->sd_inum_mutex);
443
444 error = gfs2_meta_inode_buffer(ip, &bh);
445 if (error) {
446 mutex_unlock(&sdp->sd_inum_mutex);
447 gfs2_trans_end(sdp);
448 return error;
449 }
450
451 gfs2_inum_range_in(&ir, bh->b_data + sizeof(struct gfs2_dinode));
452
453 if (ir.ir_length) {
454 *formal_ino = ir.ir_start++;
455 ir.ir_length--;
456 gfs2_trans_add_bh(ip->i_gl, bh, 1);
457 gfs2_inum_range_out(&ir,
458 bh->b_data + sizeof(struct gfs2_dinode));
459 brelse(bh);
460 mutex_unlock(&sdp->sd_inum_mutex);
461 gfs2_trans_end(sdp);
462 return 0;
463 }
464
465 brelse(bh);
466
467 mutex_unlock(&sdp->sd_inum_mutex);
468 gfs2_trans_end(sdp);
469
470 return 1;
471}
472
473static int pick_formal_ino_2(struct gfs2_sbd *sdp, u64 *formal_ino)
474{
475 struct gfs2_inode *ip = GFS2_I(sdp->sd_ir_inode);
476 struct gfs2_inode *m_ip = GFS2_I(sdp->sd_inum_inode);
477 struct gfs2_holder gh;
478 struct buffer_head *bh;
479 struct gfs2_inum_range ir;
480 int error;
481
482 error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
483 if (error)
484 return error;
485
486 error = gfs2_trans_begin(sdp, 2 * RES_DINODE, 0);
487 if (error)
488 goto out;
489 mutex_lock(&sdp->sd_inum_mutex);
490
491 error = gfs2_meta_inode_buffer(ip, &bh);
492 if (error)
493 goto out_end_trans;
494
495 gfs2_inum_range_in(&ir, bh->b_data + sizeof(struct gfs2_dinode));
496
497 if (!ir.ir_length) {
498 struct buffer_head *m_bh;
499 u64 x, y;
500
501 error = gfs2_meta_inode_buffer(m_ip, &m_bh);
502 if (error)
503 goto out_brelse;
504
505 x = *(u64 *)(m_bh->b_data + sizeof(struct gfs2_dinode));
506 x = y = be64_to_cpu(x);
507 ir.ir_start = x;
508 ir.ir_length = GFS2_INUM_QUANTUM;
509 x += GFS2_INUM_QUANTUM;
510 if (x < y)
511 gfs2_consist_inode(m_ip);
512 x = cpu_to_be64(x);
513 gfs2_trans_add_bh(m_ip->i_gl, m_bh, 1);
514 *(u64 *)(m_bh->b_data + sizeof(struct gfs2_dinode)) = x;
515
516 brelse(m_bh);
517 }
518
519 *formal_ino = ir.ir_start++;
520 ir.ir_length--;
521
522 gfs2_trans_add_bh(ip->i_gl, bh, 1);
523 gfs2_inum_range_out(&ir, bh->b_data + sizeof(struct gfs2_dinode));
524
525out_brelse:
526 brelse(bh);
527out_end_trans:
528 mutex_unlock(&sdp->sd_inum_mutex);
529 gfs2_trans_end(sdp);
530out:
531 gfs2_glock_dq_uninit(&gh);
532 return error;
533}
534
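/*
 * Formal inode numbers are handed out in two levels: pick_formal_ino_1()
 * consumes this node's private range and returns 1 once the range is
 * empty, in which case pick_formal_ino_2() refills it with
 * GFS2_INUM_QUANTUM numbers taken from the cluster-wide counter under
 * an exclusive glock on the inum inode.
 */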
535static int pick_formal_ino(struct gfs2_sbd *sdp, u64 *inum)
536{
537 int error;
538
539 error = pick_formal_ino_1(sdp, inum);
540 if (error <= 0)
541 return error;
542
543 error = pick_formal_ino_2(sdp, inum);
544
545 return error;
546}
547
548/**
549 * create_ok - OK to create a new on-disk inode here?
550 * @dip: Directory in which dinode is to be created
551 * @name: Name of new dinode
552 * @mode: The proposed mode of the new dinode
553 *
554 * Returns: errno
555 */
556
557static int create_ok(struct gfs2_inode *dip, const struct qstr *name,
558 unsigned int mode)
559{
560 int error;
561
562 error = permission(&dip->i_inode, MAY_WRITE | MAY_EXEC, NULL);
563 if (error)
564 return error;
565
566 /* Don't create entries in an unlinked directory */
567 if (!dip->i_di.di_nlink)
568 return -EPERM;
569
570 error = gfs2_dir_search(&dip->i_inode, name, NULL, NULL);
571 switch (error) {
572 case -ENOENT:
573 error = 0;
574 break;
575 case 0:
576 return -EEXIST;
577 default:
578 return error;
579 }
580
581 if (dip->i_di.di_entries == (u32)-1)
582 return -EFBIG;
583 if (S_ISDIR(mode) && dip->i_di.di_nlink == (u32)-1)
584 return -EMLINK;
585
586 return 0;
587}
588
589static void munge_mode_uid_gid(struct gfs2_inode *dip, unsigned int *mode,
590 unsigned int *uid, unsigned int *gid)
591{
592 if (GFS2_SB(&dip->i_inode)->sd_args.ar_suiddir &&
593 (dip->i_di.di_mode & S_ISUID) && dip->i_di.di_uid) {
594 if (S_ISDIR(*mode))
595 *mode |= S_ISUID;
596 else if (dip->i_di.di_uid != current->fsuid)
597 *mode &= ~07111;
598 *uid = dip->i_di.di_uid;
599 } else
600 *uid = current->fsuid;
601
602 if (dip->i_di.di_mode & S_ISGID) {
603 if (S_ISDIR(*mode))
604 *mode |= S_ISGID;
605 *gid = dip->i_di.di_gid;
606 } else
607 *gid = current->fsgid;
608}
609
610static int alloc_dinode(struct gfs2_inode *dip, struct gfs2_inum *inum,
611 u64 *generation)
612{
613 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
614 int error;
615
616 gfs2_alloc_get(dip);
617
618 dip->i_alloc.al_requested = RES_DINODE;
619 error = gfs2_inplace_reserve(dip);
620 if (error)
621 goto out;
622
623 error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_STATFS, 0);
624 if (error)
625 goto out_ipreserv;
626
627 inum->no_addr = gfs2_alloc_di(dip, generation);
628
629 gfs2_trans_end(sdp);
630
631out_ipreserv:
632 gfs2_inplace_release(dip);
633out:
634 gfs2_alloc_put(dip);
635 return error;
636}
637
638/**
639 * init_dinode - Fill in a new dinode structure
640 * @dip: the directory this inode is being created in
641 * @gl: The glock covering the new inode
642 * @inum: the inode number
643 * @mode: the file permissions
644 * @uid: The uid of the new dinode's owner
645 * @gid: The gid of the new dinode's group
646 *
647 */
648
649static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
650 const struct gfs2_inum *inum, unsigned int mode,
651 unsigned int uid, unsigned int gid,
652 const u64 *generation)
653{
654 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
655 struct gfs2_dinode *di;
656 struct buffer_head *dibh;
657
658 dibh = gfs2_meta_new(gl, inum->no_addr);
659 gfs2_trans_add_bh(gl, dibh, 1);
660 gfs2_metatype_set(dibh, GFS2_METATYPE_DI, GFS2_FORMAT_DI);
661 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
662 di = (struct gfs2_dinode *)dibh->b_data;
663
664 di->di_num.no_formal_ino = cpu_to_be64(inum->no_formal_ino);
665 di->di_num.no_addr = cpu_to_be64(inum->no_addr);
666 di->di_mode = cpu_to_be32(mode);
667 di->di_uid = cpu_to_be32(uid);
668 di->di_gid = cpu_to_be32(gid);
669 di->di_nlink = cpu_to_be32(0);
670 di->di_size = cpu_to_be64(0);
671 di->di_blocks = cpu_to_be64(1);
672 di->di_atime = di->di_mtime = di->di_ctime = cpu_to_be64(get_seconds());
673 di->di_major = di->di_minor = cpu_to_be32(0);
674 di->di_goal_meta = di->di_goal_data = cpu_to_be64(inum->no_addr);
675 di->di_generation = cpu_to_be64(*generation);
676 di->di_flags = cpu_to_be32(0);
677
678 if (S_ISREG(mode)) {
679 if ((dip->i_di.di_flags & GFS2_DIF_INHERIT_JDATA) ||
680 gfs2_tune_get(sdp, gt_new_files_jdata))
681 di->di_flags |= cpu_to_be32(GFS2_DIF_JDATA);
682 if ((dip->i_di.di_flags & GFS2_DIF_INHERIT_DIRECTIO) ||
683 gfs2_tune_get(sdp, gt_new_files_directio))
684 di->di_flags |= cpu_to_be32(GFS2_DIF_DIRECTIO);
685 } else if (S_ISDIR(mode)) {
686 di->di_flags |= cpu_to_be32(dip->i_di.di_flags &
687 GFS2_DIF_INHERIT_DIRECTIO);
688 di->di_flags |= cpu_to_be32(dip->i_di.di_flags &
689 GFS2_DIF_INHERIT_JDATA);
690 }
691
692 di->__pad1 = 0;
693 di->di_payload_format = cpu_to_be32(0);
694 di->di_height = cpu_to_be32(0);
695 di->__pad2 = 0;
696 di->__pad3 = 0;
697 di->di_depth = cpu_to_be16(0);
698 di->di_entries = cpu_to_be32(0);
699 memset(&di->__pad4, 0, sizeof(di->__pad4));
700 di->di_eattr = cpu_to_be64(0);
701 memset(&di->di_reserved, 0, sizeof(di->di_reserved));
702
703 brelse(dibh);
704}
705
706static int make_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
707 unsigned int mode, const struct gfs2_inum *inum,
708 const u64 *generation)
709{
710 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
711 unsigned int uid, gid;
712 int error;
713
714 munge_mode_uid_gid(dip, &mode, &uid, &gid);
715 gfs2_alloc_get(dip);
716
717 error = gfs2_quota_lock(dip, uid, gid);
718 if (error)
719 goto out;
720
721 error = gfs2_quota_check(dip, uid, gid);
722 if (error)
723 goto out_quota;
724
725 error = gfs2_trans_begin(sdp, RES_DINODE + RES_QUOTA, 0);
726 if (error)
727 goto out_quota;
728
729 init_dinode(dip, gl, inum, mode, uid, gid, generation);
730 gfs2_quota_change(dip, +1, uid, gid);
731 gfs2_trans_end(sdp);
732
733out_quota:
734 gfs2_quota_unlock(dip);
735out:
736 gfs2_alloc_put(dip);
737 return error;
738}
739
740static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
741 struct gfs2_inode *ip)
742{
743 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
744 struct gfs2_alloc *al;
745 int alloc_required;
746 struct buffer_head *dibh;
747 int error;
748
749 al = gfs2_alloc_get(dip);
750
751 error = gfs2_quota_lock(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
752 if (error)
753 goto fail;
754
755 error = alloc_required = gfs2_diradd_alloc_required(&dip->i_inode, name);
756 if (alloc_required < 0)
757 goto fail;
758 if (alloc_required) {
759 error = gfs2_quota_check(dip, dip->i_di.di_uid,
760 dip->i_di.di_gid);
761 if (error)
762 goto fail_quota_locks;
763
764 al->al_requested = sdp->sd_max_dirres;
765
766 error = gfs2_inplace_reserve(dip);
767 if (error)
768 goto fail_quota_locks;
769
770 error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
771 al->al_rgd->rd_ri.ri_length +
772 2 * RES_DINODE +
773 RES_STATFS + RES_QUOTA, 0);
774 if (error)
775 goto fail_ipreserv;
776 } else {
777 error = gfs2_trans_begin(sdp, RES_LEAF + 2 * RES_DINODE, 0);
778 if (error)
779 goto fail_quota_locks;
780 }
781
782 error = gfs2_dir_add(&dip->i_inode, name, &ip->i_num, IF2DT(ip->i_di.di_mode));
783 if (error)
784 goto fail_end_trans;
785
786 error = gfs2_meta_inode_buffer(ip, &dibh);
787 if (error)
788 goto fail_end_trans;
789 ip->i_di.di_nlink = 1;
790 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
791 gfs2_dinode_out(&ip->i_di, dibh->b_data);
792 brelse(dibh);
793 return 0;
794
795fail_end_trans:
796 gfs2_trans_end(sdp);
797
798fail_ipreserv:
799 if (dip->i_alloc.al_rgd)
800 gfs2_inplace_release(dip);
801
802fail_quota_locks:
803 gfs2_quota_unlock(dip);
804
805fail:
806 gfs2_alloc_put(dip);
807 return error;
808}
809
810static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip)
811{
812 int err;
813 size_t len;
814 void *value;
815 char *name;
816 struct gfs2_ea_request er;
817
818 err = security_inode_init_security(&ip->i_inode, &dip->i_inode,
819 &name, &value, &len);
820
821 if (err) {
822 if (err == -EOPNOTSUPP)
823 return 0;
824 return err;
825 }
826
827 memset(&er, 0, sizeof(struct gfs2_ea_request));
828
829 er.er_type = GFS2_EATYPE_SECURITY;
830 er.er_name = name;
831 er.er_data = value;
832 er.er_name_len = strlen(name);
833 er.er_data_len = len;
834
835 err = gfs2_ea_set_i(ip, &er);
836
837 kfree(value);
838 kfree(name);
839
840 return err;
841}
842
843/**
844 * gfs2_createi - Create a new inode
845 * @ghs: An array of two holders
846 * @name: The name of the new file
847 * @mode: the permissions on the new inode
848 *
849 * @ghs[0] is an initialized holder for the directory
850 * @ghs[1] is the holder for the inode lock
851 *
852 * If the return value is not NULL, the glocks on both the directory and the new
853 * file are held. A transaction has been started and an inplace reservation
854 * is held, as well.
855 *
856 * Returns: An inode
857 */
858
859struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
860 unsigned int mode)
861{
862 struct inode *inode;
863 struct gfs2_inode *dip = ghs->gh_gl->gl_object;
864 struct inode *dir = &dip->i_inode;
865 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
866 struct gfs2_inum inum;
867 int error;
868 u64 generation;
869
870 if (!name->len || name->len > GFS2_FNAMESIZE)
871 return ERR_PTR(-ENAMETOOLONG);
872
873 gfs2_holder_reinit(LM_ST_EXCLUSIVE, 0, ghs);
874 error = gfs2_glock_nq(ghs);
875 if (error)
876 goto fail;
877
878 error = create_ok(dip, name, mode);
879 if (error)
880 goto fail_gunlock;
881
882 error = pick_formal_ino(sdp, &inum.no_formal_ino);
883 if (error)
884 goto fail_gunlock;
885
886 error = alloc_dinode(dip, &inum, &generation);
887 if (error)
888 goto fail_gunlock;
889
890 if (inum.no_addr < dip->i_num.no_addr) {
891 gfs2_glock_dq(ghs);
892
893 error = gfs2_glock_nq_num(sdp, inum.no_addr,
894 &gfs2_inode_glops, LM_ST_EXCLUSIVE,
895 GL_SKIP, ghs + 1);
896 if (error) {
897 return ERR_PTR(error);
898 }
899
900 gfs2_holder_reinit(LM_ST_EXCLUSIVE, 0, ghs);
901 error = gfs2_glock_nq(ghs);
902 if (error) {
903 gfs2_glock_dq_uninit(ghs + 1);
904 return ERR_PTR(error);
905 }
906
907 error = create_ok(dip, name, mode);
908 if (error)
909 goto fail_gunlock2;
910 } else {
911 error = gfs2_glock_nq_num(sdp, inum.no_addr,
912 &gfs2_inode_glops, LM_ST_EXCLUSIVE,
913 GL_SKIP, ghs + 1);
914 if (error)
915 goto fail_gunlock;
916 }
917
918 error = make_dinode(dip, ghs[1].gh_gl, mode, &inum, &generation);
919 if (error)
920 goto fail_gunlock2;
921
	inode = gfs2_inode_lookup(dir->i_sb, &inum, IF2DT(mode));
	if (!inode) {
		error = -ENOMEM;
		goto fail_gunlock2;
	}
	if (IS_ERR(inode)) {
		error = PTR_ERR(inode);
		goto fail_gunlock2;
	}
925
926 error = gfs2_inode_refresh(GFS2_I(inode));
927 if (error)
928 goto fail_iput;
929
930 error = gfs2_acl_create(dip, GFS2_I(inode));
931 if (error)
932 goto fail_iput;
933
934 error = gfs2_security_init(dip, GFS2_I(inode));
935 if (error)
936 goto fail_iput;
937
938 error = link_dinode(dip, name, GFS2_I(inode));
939 if (error)
940 goto fail_iput;
941
944 return inode;
945
946fail_iput:
947 iput(inode);
948fail_gunlock2:
949 gfs2_glock_dq_uninit(ghs + 1);
950fail_gunlock:
951 gfs2_glock_dq(ghs);
952fail:
953 return ERR_PTR(error);
954}
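/*
 * Usage sketch (not part of this patch): a typical caller passes two
 * holders, then unwinds the transaction, reservation, and quota locks
 * that gfs2_createi() leaves held. The exact unwind calls below are
 * assumptions based on the reservation and transaction started above.
 *
 *	struct gfs2_holder ghs[2];
 *	struct inode *inode;
 *
 *	gfs2_holder_init(dip->i_gl, 0, 0, ghs);
 *	inode = gfs2_createi(ghs, &dentry->d_name, S_IFREG | mode);
 *	if (IS_ERR(inode)) {
 *		gfs2_holder_uninit(ghs);
 *		return PTR_ERR(inode);
 *	}
 *	gfs2_trans_end(sdp);
 *	gfs2_inplace_release(dip);
 *	gfs2_quota_unlock(dip);
 *	gfs2_alloc_put(dip);
 *	gfs2_glock_dq_uninit_m(2, ghs);
 */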
955
956/**
957 * gfs2_rmdiri - Remove a directory
958 * @dip: The parent directory of the directory to be removed
959 * @name: The name of the directory to be removed
960 * @ip: The GFS2 inode of the directory to be removed
961 *
962 * Assumes Glocks on dip and ip are held
963 *
964 * Returns: errno
965 */
966
967int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
968 struct gfs2_inode *ip)
969{
970 struct qstr dotname;
971 int error;
972
973 if (ip->i_di.di_entries != 2) {
974 if (gfs2_consist_inode(ip))
975 gfs2_dinode_print(&ip->i_di);
976 return -EIO;
977 }
978
979 error = gfs2_dir_del(dip, name);
980 if (error)
981 return error;
982
983 error = gfs2_change_nlink(dip, -1);
984 if (error)
985 return error;
986
987 gfs2_str2qstr(&dotname, ".");
988 error = gfs2_dir_del(ip, &dotname);
989 if (error)
990 return error;
991
992 gfs2_str2qstr(&dotname, "..");
993 error = gfs2_dir_del(ip, &dotname);
994 if (error)
995 return error;
996
997 error = gfs2_change_nlink(ip, -2);
998 if (error)
999 return error;
1000
1001	return 0;
1002}
1003
1004/*
1005 * gfs2_unlink_ok - check to see that an inode is still in a directory
1006 * @dip: the directory
1007 * @name: the name of the file
1008 * @ip: the inode
1009 *
1010 * Assumes that the lock on (at least) @dip is held.
1011 *
1012 * Returns: 0 if the parent/child relationship is correct, errno if it isn't
1013 */
1014
1015int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
1016 struct gfs2_inode *ip)
1017{
1018 struct gfs2_inum inum;
1019 unsigned int type;
1020 int error;
1021
1022 if (IS_IMMUTABLE(&ip->i_inode) || IS_APPEND(&ip->i_inode))
1023 return -EPERM;
1024
1025 if ((dip->i_di.di_mode & S_ISVTX) &&
1026 dip->i_di.di_uid != current->fsuid &&
1027 ip->i_di.di_uid != current->fsuid && !capable(CAP_FOWNER))
1028 return -EPERM;
1029
1030 if (IS_APPEND(&dip->i_inode))
1031 return -EPERM;
1032
1033 error = permission(&dip->i_inode, MAY_WRITE | MAY_EXEC, NULL);
1034 if (error)
1035 return error;
1036
1037 error = gfs2_dir_search(&dip->i_inode, name, &inum, &type);
1038 if (error)
1039 return error;
1040
1041 if (!gfs2_inum_equal(&inum, &ip->i_num))
1042 return -ENOENT;
1043
1044 if (IF2DT(ip->i_di.di_mode) != type) {
1045 gfs2_consist_inode(dip);
1046 return -EIO;
1047 }
1048
1049 return 0;
1050}
1051
1052/*
1053 * gfs2_ok_to_move - check if it's ok to move a directory to another directory
1054 * @this: move this
1055 * @to: to here
1056 *
1057 * Follow @to back to the root and make sure we don't encounter @this
1058 * Assumes we already hold the rename lock.
1059 *
1060 * Returns: errno
1061 */
1062
1063int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to)
1064{
1065 struct inode *dir = &to->i_inode;
1066 struct super_block *sb = dir->i_sb;
1067 struct inode *tmp;
1068 struct qstr dotdot;
1069 int error = 0;
1070
1071 gfs2_str2qstr(&dotdot, "..");
1072
1073 igrab(dir);
1074
1075 for (;;) {
1076 if (dir == &this->i_inode) {
1077 error = -EINVAL;
1078 break;
1079 }
1080 if (dir == sb->s_root->d_inode) {
1081 error = 0;
1082 break;
1083 }
1084
1085 tmp = gfs2_lookupi(dir, &dotdot, 1, NULL);
1086 if (IS_ERR(tmp)) {
1087 error = PTR_ERR(tmp);
1088 break;
1089 }
1090
1091 iput(dir);
1092 dir = tmp;
1093 }
1094
1095 iput(dir);
1096
1097 return error;
1098}
1099
1100/**
1101 * gfs2_readlinki - return the contents of a symlink
1102 * @ip: the symlink's inode
1103 * @buf: a pointer to the buffer to be filled
1104 * @len: a pointer to the length of @buf
1105 *
1106 * If @buf is too small, a piece of memory is kmalloc()ed and needs
1107 * to be freed by the caller.
1108 *
1109 * Returns: errno
1110 */
1111
1112int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len)
1113{
1114 struct gfs2_holder i_gh;
1115 struct buffer_head *dibh;
1116 unsigned int x;
1117 int error;
1118
1119 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &i_gh);
1120 error = gfs2_glock_nq_atime(&i_gh);
1121 if (error) {
1122 gfs2_holder_uninit(&i_gh);
1123 return error;
1124 }
1125
1126 if (!ip->i_di.di_size) {
1127 gfs2_consist_inode(ip);
1128 error = -EIO;
1129 goto out;
1130 }
1131
1132 error = gfs2_meta_inode_buffer(ip, &dibh);
1133 if (error)
1134 goto out;
1135
1136 x = ip->i_di.di_size + 1;
1137 if (x > *len) {
1138 *buf = kmalloc(x, GFP_KERNEL);
1139 if (!*buf) {
1140 error = -ENOMEM;
1141 goto out_brelse;
1142 }
1143 }
1144
1145 memcpy(*buf, dibh->b_data + sizeof(struct gfs2_dinode), x);
1146 *len = x;
1147
1148out_brelse:
1149 brelse(dibh);
1150out:
1151 gfs2_glock_dq_uninit(&i_gh);
1152 return error;
1153}
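/*
 * Caller sketch (illustrative, not part of this patch): since
 * gfs2_readlinki() may replace the supplied buffer with a kmalloc()ed
 * one, the caller must compare pointers and free the bigger buffer.
 */
#if 0	/* example only */
static int gfs2_readlinki_example(struct gfs2_inode *ip)
{
	char array[64];
	char *buf = array;
	unsigned int len = sizeof(array);
	int error;

	error = gfs2_readlinki(ip, &buf, &len);
	if (error)
		return error;

	/* ... use the len bytes at buf ... */

	if (buf != array)
		kfree(buf);
	return 0;
}
#endif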
1154
1155/**
1156 * gfs2_glock_nq_atime - Acquire a hold on an inode's glock, and
1157 * conditionally update the inode's atime
1158 * @gh: the holder to acquire
1159 *
1160 * Tests atime (access time) for gfs2_read, gfs2_readdir and gfs2_mmap
1161 * Update if the difference between the current time and the inode's current
1162 * atime is greater than an interval specified at mount.
1163 *
1164 * Returns: errno
1165 */
1166
1167int gfs2_glock_nq_atime(struct gfs2_holder *gh)
1168{
1169 struct gfs2_glock *gl = gh->gh_gl;
1170 struct gfs2_sbd *sdp = gl->gl_sbd;
1171 struct gfs2_inode *ip = gl->gl_object;
1172 s64 curtime, quantum = gfs2_tune_get(sdp, gt_atime_quantum);
1173 unsigned int state;
1174 int flags;
1175 int error;
1176
1177 if (gfs2_assert_warn(sdp, gh->gh_flags & GL_ATIME) ||
1178 gfs2_assert_warn(sdp, !(gh->gh_flags & GL_ASYNC)) ||
1179 gfs2_assert_warn(sdp, gl->gl_ops == &gfs2_inode_glops))
1180 return -EINVAL;
1181
1182 state = gh->gh_state;
1183 flags = gh->gh_flags;
1184
1185 error = gfs2_glock_nq(gh);
1186 if (error)
1187 return error;
1188
1189 if (test_bit(SDF_NOATIME, &sdp->sd_flags) ||
1190 (sdp->sd_vfs->s_flags & MS_RDONLY))
1191 return 0;
1192
1193 curtime = get_seconds();
1194 if (curtime - ip->i_di.di_atime >= quantum) {
1195 gfs2_glock_dq(gh);
1196 gfs2_holder_reinit(LM_ST_EXCLUSIVE, gh->gh_flags & ~LM_FLAG_ANY,
1197 gh);
1198 error = gfs2_glock_nq(gh);
1199 if (error)
1200 return error;
1201
1202 /* Verify that atime hasn't been updated while we were
1203 trying to get exclusive lock. */
1204
1205 curtime = get_seconds();
1206 if (curtime - ip->i_di.di_atime >= quantum) {
1207 struct buffer_head *dibh;
1208 struct gfs2_dinode *di;
1209
1210 error = gfs2_trans_begin(sdp, RES_DINODE, 0);
1211 if (error == -EROFS)
1212 return 0;
1213 if (error)
1214 goto fail;
1215
1216 error = gfs2_meta_inode_buffer(ip, &dibh);
1217 if (error)
1218 goto fail_end_trans;
1219
1220 ip->i_di.di_atime = curtime;
1221
1222 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1223 di = (struct gfs2_dinode *)dibh->b_data;
1224 di->di_atime = cpu_to_be64(ip->i_di.di_atime);
1225 brelse(dibh);
1226
1227 gfs2_trans_end(sdp);
1228 }
1229
1230 /* If someone else has asked for the glock,
1231 unlock and let them have it. Then reacquire
1232 in the original state. */
1233 if (gfs2_glock_is_blocking(gl)) {
1234 gfs2_glock_dq(gh);
1235 gfs2_holder_reinit(state, flags, gh);
1236 return gfs2_glock_nq(gh);
1237 }
1238 }
1239
1240 return 0;
1241
1242fail_end_trans:
1243 gfs2_trans_end(sdp);
1244fail:
1245 gfs2_glock_dq(gh);
1246 return error;
1247}
1248
1249/**
1250 * glock_compare_atime - Compare two struct gfs2_holder structures for sort
1251 * @arg_a: the first structure
1252 * @arg_b: the second structure
1253 *
1254 * Returns: 1 if A > B
1255 * -1 if A < B
1256 * 0 if A == B
1257 */
1258
1259static int glock_compare_atime(const void *arg_a, const void *arg_b)
1260{
1261 const struct gfs2_holder *gh_a = *(const struct gfs2_holder **)arg_a;
1262 const struct gfs2_holder *gh_b = *(const struct gfs2_holder **)arg_b;
1263 const struct lm_lockname *a = &gh_a->gh_gl->gl_name;
1264 const struct lm_lockname *b = &gh_b->gh_gl->gl_name;
1265
1266 if (a->ln_number > b->ln_number)
1267 return 1;
1268 if (a->ln_number < b->ln_number)
1269 return -1;
1270 if (gh_a->gh_state == LM_ST_SHARED && gh_b->gh_state == LM_ST_EXCLUSIVE)
1271 return 1;
1272 if (gh_a->gh_state == LM_ST_SHARED && (gh_b->gh_flags & GL_ATIME))
1273 return 1;
1274
1275 return 0;
1276}
1277
1278/**
1279 * gfs2_glock_nq_m_atime - acquire multiple glocks where one may need an
1280 * atime update
1281 * @num_gh: the number of structures
1282 * @ghs: an array of struct gfs2_holder structures
1283 *
1284 * Returns: 0 on success (all glocks acquired),
1285 * errno on failure (no glocks acquired)
1286 */
1287
1288int gfs2_glock_nq_m_atime(unsigned int num_gh, struct gfs2_holder *ghs)
1289{
1290 struct gfs2_holder **p;
1291 unsigned int x;
1292 int error = 0;
1293
1294 if (!num_gh)
1295 return 0;
1296
1297 if (num_gh == 1) {
1298 ghs->gh_flags &= ~(LM_FLAG_TRY | GL_ASYNC);
1299 if (ghs->gh_flags & GL_ATIME)
1300 error = gfs2_glock_nq_atime(ghs);
1301 else
1302 error = gfs2_glock_nq(ghs);
1303 return error;
1304 }
1305
1306 p = kcalloc(num_gh, sizeof(struct gfs2_holder *), GFP_KERNEL);
1307 if (!p)
1308 return -ENOMEM;
1309
1310 for (x = 0; x < num_gh; x++)
1311 p[x] = &ghs[x];
1312
1313	sort(p, num_gh, sizeof(struct gfs2_holder *), glock_compare_atime, NULL);
1314
1315 for (x = 0; x < num_gh; x++) {
1316 p[x]->gh_flags &= ~(LM_FLAG_TRY | GL_ASYNC);
1317
1318 if (p[x]->gh_flags & GL_ATIME)
1319 error = gfs2_glock_nq_atime(p[x]);
1320 else
1321 error = gfs2_glock_nq(p[x]);
1322
1323 if (error) {
1324 while (x--)
1325 gfs2_glock_dq(p[x]);
1326 break;
1327 }
1328 }
1329
1330 kfree(p);
1331 return error;
1332}
1333
1334
1335static int
1336__gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
1337{
1338 struct buffer_head *dibh;
1339 int error;
1340
1341 error = gfs2_meta_inode_buffer(ip, &dibh);
1342 if (!error) {
1343 error = inode_setattr(&ip->i_inode, attr);
1344 gfs2_assert_warn(GFS2_SB(&ip->i_inode), !error);
1345 gfs2_inode_attr_out(ip);
1346
1347 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1348 gfs2_dinode_out(&ip->i_di, dibh->b_data);
1349 brelse(dibh);
1350 }
1351 return error;
1352}
1353
1354/**
1355 * gfs2_setattr_simple - Set attributes on an inode
1356 * @ip: The GFS2 inode
1357 * @attr: The attributes to set
1358 *
1359 * Called with a reference on the vnode.
1360 *
1361 * Returns: errno
1362 */
1363
1364int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
1365{
1366 int error;
1367
1368 if (current->journal_info)
1369 return __gfs2_setattr_simple(ip, attr);
1370
1371 error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE, 0);
1372 if (error)
1373 return error;
1374
1375 error = __gfs2_setattr_simple(ip, attr);
1376 gfs2_trans_end(GFS2_SB(&ip->i_inode));
1377 return error;
1378}
1379
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
new file mode 100644
index 000000000000..f5d861760579
--- /dev/null
+++ b/fs/gfs2/inode.h
@@ -0,0 +1,56 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __INODE_DOT_H__
11#define __INODE_DOT_H__
12
13static inline int gfs2_is_stuffed(struct gfs2_inode *ip)
14{
15 return !ip->i_di.di_height;
16}
17
18static inline int gfs2_is_jdata(struct gfs2_inode *ip)
19{
20 return ip->i_di.di_flags & GFS2_DIF_JDATA;
21}
22
23static inline int gfs2_is_dir(struct gfs2_inode *ip)
24{
25 return S_ISDIR(ip->i_di.di_mode);
26}
27
28void gfs2_inode_attr_in(struct gfs2_inode *ip);
29void gfs2_inode_attr_out(struct gfs2_inode *ip);
30struct inode *gfs2_inode_lookup(struct super_block *sb, struct gfs2_inum *inum, unsigned type);
31struct inode *gfs2_ilookup(struct super_block *sb, struct gfs2_inum *inum);
32
33int gfs2_inode_refresh(struct gfs2_inode *ip);
34
35int gfs2_dinode_dealloc(struct gfs2_inode *inode);
36int gfs2_change_nlink(struct gfs2_inode *ip, int diff);
37struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
38 int is_root, struct nameidata *nd);
39struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
40 unsigned int mode);
41int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
42 struct gfs2_inode *ip);
43int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
44 struct gfs2_inode *ip);
45int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to);
46int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len);
47
48int gfs2_glock_nq_atime(struct gfs2_holder *gh);
49int gfs2_glock_nq_m_atime(unsigned int num_gh, struct gfs2_holder *ghs);
50
51int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr);
52
53struct inode *gfs2_lookup_simple(struct inode *dip, const char *name);
54
55#endif /* __INODE_DOT_H__ */
56
diff --git a/fs/gfs2/lm.c b/fs/gfs2/lm.c
new file mode 100644
index 000000000000..effe4a337c1d
--- /dev/null
+++ b/fs/gfs2/lm.c
@@ -0,0 +1,217 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/delay.h>
16#include <linux/gfs2_ondisk.h>
17#include <linux/lm_interface.h>
18
19#include "gfs2.h"
20#include "incore.h"
21#include "glock.h"
22#include "lm.h"
23#include "super.h"
24#include "util.h"
25
26/**
27 * gfs2_lm_mount - mount a locking protocol
28 * @sdp: the filesystem
30 * @silent: if 1, don't complain if the FS isn't a GFS2 fs
31 *
32 * Returns: errno
33 */
34
35int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent)
36{
37 char *proto = sdp->sd_proto_name;
38 char *table = sdp->sd_table_name;
39 int flags = 0;
40 int error;
41
42 if (sdp->sd_args.ar_spectator)
43 flags |= LM_MFLAG_SPECTATOR;
44
45 fs_info(sdp, "Trying to join cluster \"%s\", \"%s\"\n", proto, table);
46
47 error = gfs2_mount_lockproto(proto, table, sdp->sd_args.ar_hostdata,
48 gfs2_glock_cb, sdp,
49 GFS2_MIN_LVB_SIZE, flags,
50 &sdp->sd_lockstruct, &sdp->sd_kobj);
51 if (error) {
52 fs_info(sdp, "can't mount proto=%s, table=%s, hostdata=%s\n",
53 proto, table, sdp->sd_args.ar_hostdata);
54 goto out;
55 }
56
57 if (gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_lockspace) ||
58 gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_ops) ||
59 gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_lvb_size >=
60 GFS2_MIN_LVB_SIZE)) {
61 gfs2_unmount_lockproto(&sdp->sd_lockstruct);
62 goto out;
63 }
64
65 if (sdp->sd_args.ar_spectator)
66 snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.s", table);
67 else
68 snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.%u", table,
69 sdp->sd_lockstruct.ls_jid);
70
71 fs_info(sdp, "Joined cluster. Now mounting FS...\n");
72
73 if ((sdp->sd_lockstruct.ls_flags & LM_LSFLAG_LOCAL) &&
74 !sdp->sd_args.ar_ignore_local_fs) {
75 sdp->sd_args.ar_localflocks = 1;
76 sdp->sd_args.ar_localcaching = 1;
77 }
78
79out:
80 return error;
81}
82
83void gfs2_lm_others_may_mount(struct gfs2_sbd *sdp)
84{
85 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
86 sdp->sd_lockstruct.ls_ops->lm_others_may_mount(
87 sdp->sd_lockstruct.ls_lockspace);
88}
89
90void gfs2_lm_unmount(struct gfs2_sbd *sdp)
91{
92 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
93 gfs2_unmount_lockproto(&sdp->sd_lockstruct);
94}
95
96int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...)
97{
98 va_list args;
99
100 if (test_and_set_bit(SDF_SHUTDOWN, &sdp->sd_flags))
101 return 0;
102
103 va_start(args, fmt);
104 vprintk(fmt, args);
105 va_end(args);
106
107 fs_err(sdp, "about to withdraw from the cluster\n");
108 BUG_ON(sdp->sd_args.ar_debug);
109
110
111 fs_err(sdp, "waiting for outstanding I/O\n");
112
113 /* FIXME: suspend the dm device so outstanding bios complete
114 and all further I/O requests fail */
115
116 fs_err(sdp, "telling LM to withdraw\n");
117 gfs2_withdraw_lockproto(&sdp->sd_lockstruct);
118 fs_err(sdp, "withdrawn\n");
119 dump_stack();
120
121 return -1;
122}
123
124int gfs2_lm_get_lock(struct gfs2_sbd *sdp, struct lm_lockname *name,
125 void **lockp)
126{
127 int error = -EIO;
128 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
129 error = sdp->sd_lockstruct.ls_ops->lm_get_lock(
130 sdp->sd_lockstruct.ls_lockspace, name, lockp);
131 return error;
132}
133
134void gfs2_lm_put_lock(struct gfs2_sbd *sdp, void *lock)
135{
136 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
137 sdp->sd_lockstruct.ls_ops->lm_put_lock(lock);
138}
139
140unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock,
141 unsigned int cur_state, unsigned int req_state,
142 unsigned int flags)
143{
144 int ret = 0;
145 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
146 ret = sdp->sd_lockstruct.ls_ops->lm_lock(lock, cur_state,
147 req_state, flags);
148 return ret;
149}
150
151unsigned int gfs2_lm_unlock(struct gfs2_sbd *sdp, void *lock,
152 unsigned int cur_state)
153{
154 int ret = 0;
155 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
156 ret = sdp->sd_lockstruct.ls_ops->lm_unlock(lock, cur_state);
157 return ret;
158}
159
160void gfs2_lm_cancel(struct gfs2_sbd *sdp, void *lock)
161{
162 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
163 sdp->sd_lockstruct.ls_ops->lm_cancel(lock);
164}
165
166int gfs2_lm_hold_lvb(struct gfs2_sbd *sdp, void *lock, char **lvbp)
167{
168 int error = -EIO;
169 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
170 error = sdp->sd_lockstruct.ls_ops->lm_hold_lvb(lock, lvbp);
171 return error;
172}
173
174void gfs2_lm_unhold_lvb(struct gfs2_sbd *sdp, void *lock, char *lvb)
175{
176 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
177 sdp->sd_lockstruct.ls_ops->lm_unhold_lvb(lock, lvb);
178}
179
180int gfs2_lm_plock_get(struct gfs2_sbd *sdp, struct lm_lockname *name,
181 struct file *file, struct file_lock *fl)
182{
183 int error = -EIO;
184 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
185 error = sdp->sd_lockstruct.ls_ops->lm_plock_get(
186 sdp->sd_lockstruct.ls_lockspace, name, file, fl);
187 return error;
188}
189
190int gfs2_lm_plock(struct gfs2_sbd *sdp, struct lm_lockname *name,
191 struct file *file, int cmd, struct file_lock *fl)
192{
193 int error = -EIO;
194 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
195 error = sdp->sd_lockstruct.ls_ops->lm_plock(
196 sdp->sd_lockstruct.ls_lockspace, name, file, cmd, fl);
197 return error;
198}
199
200int gfs2_lm_punlock(struct gfs2_sbd *sdp, struct lm_lockname *name,
201 struct file *file, struct file_lock *fl)
202{
203 int error = -EIO;
204 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
205 error = sdp->sd_lockstruct.ls_ops->lm_punlock(
206 sdp->sd_lockstruct.ls_lockspace, name, file, fl);
207 return error;
208}
209
210void gfs2_lm_recovery_done(struct gfs2_sbd *sdp, unsigned int jid,
211 unsigned int message)
212{
213 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
214 sdp->sd_lockstruct.ls_ops->lm_recovery_done(
215 sdp->sd_lockstruct.ls_lockspace, jid, message);
216}
217
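Every gfs2_lm_* wrapper above follows the same shape: test SDF_SHUTDOWN once and call through sd_lockstruct.ls_ops only while the bit is clear, so a withdrawn filesystem stops issuing requests to the lock module without taking any lock of its own. As a sketch of that pattern (lm_frob is an invented hook, not part of lm_interface.h), one more wrapper would look like this:

/* Sketch only: lm_frob is a hypothetical ls_ops hook, shown to
 * illustrate the SDF_SHUTDOWN guard shared by the wrappers above. */
int gfs2_lm_frob(struct gfs2_sbd *sdp, void *lock)
{
	int error = -EIO;	/* default result once withdrawn */

	if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
		error = sdp->sd_lockstruct.ls_ops->lm_frob(lock);
	return error;
}

The unlocked test_bit() read means a request can still race with gfs2_lm_withdraw() setting the bit; the guard only ensures no new calls are issued once the withdraw has been observed.
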
diff --git a/fs/gfs2/lm.h b/fs/gfs2/lm.h
new file mode 100644
index 000000000000..21cdc30ee08c
--- /dev/null
+++ b/fs/gfs2/lm.h
@@ -0,0 +1,42 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __LM_DOT_H__
11#define __LM_DOT_H__
12
13struct gfs2_sbd;
14
15#define GFS2_MIN_LVB_SIZE 32
16
17int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent);
18void gfs2_lm_others_may_mount(struct gfs2_sbd *sdp);
19void gfs2_lm_unmount(struct gfs2_sbd *sdp);
20int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...)
21 __attribute__ ((format(printf, 2, 3)));
22int gfs2_lm_get_lock(struct gfs2_sbd *sdp, struct lm_lockname *name,
23 void **lockp);
24void gfs2_lm_put_lock(struct gfs2_sbd *sdp, void *lock);
25unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock,
26 unsigned int cur_state, unsigned int req_state,
27 unsigned int flags);
28unsigned int gfs2_lm_unlock(struct gfs2_sbd *sdp, void *lock,
29 unsigned int cur_state);
30void gfs2_lm_cancel(struct gfs2_sbd *sdp, void *lock);
31int gfs2_lm_hold_lvb(struct gfs2_sbd *sdp, void *lock, char **lvbp);
32void gfs2_lm_unhold_lvb(struct gfs2_sbd *sdp, void *lock, char *lvb);
33int gfs2_lm_plock_get(struct gfs2_sbd *sdp, struct lm_lockname *name,
34 struct file *file, struct file_lock *fl);
35int gfs2_lm_plock(struct gfs2_sbd *sdp, struct lm_lockname *name,
36 struct file *file, int cmd, struct file_lock *fl);
37int gfs2_lm_punlock(struct gfs2_sbd *sdp, struct lm_lockname *name,
38 struct file *file, struct file_lock *fl);
39void gfs2_lm_recovery_done(struct gfs2_sbd *sdp, unsigned int jid,
40 unsigned int message);
41
42#endif /* __LM_DOT_H__ */
diff --git a/fs/gfs2/locking.c b/fs/gfs2/locking.c
new file mode 100644
index 000000000000..663fee728783
--- /dev/null
+++ b/fs/gfs2/locking.c
@@ -0,0 +1,184 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/module.h>
11#include <linux/init.h>
12#include <linux/string.h>
13#include <linux/slab.h>
14#include <linux/wait.h>
15#include <linux/sched.h>
16#include <linux/kmod.h>
17#include <linux/fs.h>
18#include <linux/delay.h>
19#include <linux/lm_interface.h>
20
21struct lmh_wrapper {
22 struct list_head lw_list;
23 const struct lm_lockops *lw_ops;
24};
25
26/* List of registered low-level locking protocols. A file system selects one
27 of them by name at mount time, e.g. lock_nolock, lock_dlm. */
28
29static LIST_HEAD(lmh_list);
30static DEFINE_MUTEX(lmh_lock);
31
32/**
33 * gfs2_register_lockproto - Register a low-level locking protocol
34 * @proto: the protocol definition
35 *
36 * Returns: 0 on success, -EXXX on failure
37 */
38
39int gfs2_register_lockproto(const struct lm_lockops *proto)
40{
41 struct lmh_wrapper *lw;
42
43 mutex_lock(&lmh_lock);
44
45 list_for_each_entry(lw, &lmh_list, lw_list) {
46 if (!strcmp(lw->lw_ops->lm_proto_name, proto->lm_proto_name)) {
47 mutex_unlock(&lmh_lock);
48 printk(KERN_INFO "GFS2: protocol %s already exists\n",
49 proto->lm_proto_name);
50 return -EEXIST;
51 }
52 }
53
54 lw = kzalloc(sizeof(struct lmh_wrapper), GFP_KERNEL);
55 if (!lw) {
56 mutex_unlock(&lmh_lock);
57 return -ENOMEM;
58 }
59
60 lw->lw_ops = proto;
61 list_add(&lw->lw_list, &lmh_list);
62
63 mutex_unlock(&lmh_lock);
64
65 return 0;
66}
67
68/**
69 * gfs2_unregister_lockproto - Unregister a low-level locking protocol
70 * @proto: the protocol definition
71 *
72 */
73
74void gfs2_unregister_lockproto(const struct lm_lockops *proto)
75{
76 struct lmh_wrapper *lw;
77
78 mutex_lock(&lmh_lock);
79
80 list_for_each_entry(lw, &lmh_list, lw_list) {
81 if (!strcmp(lw->lw_ops->lm_proto_name, proto->lm_proto_name)) {
82 list_del(&lw->lw_list);
83 mutex_unlock(&lmh_lock);
84 kfree(lw);
85 return;
86 }
87 }
88
89 mutex_unlock(&lmh_lock);
90
91 printk(KERN_WARNING "GFS2: can't unregister lock protocol %s\n",
92 proto->lm_proto_name);
93}
94
95/**
96 * gfs2_mount_lockproto - Mount a lock protocol
97 * @proto_name: the name of the protocol
98 * @table_name: the name of the lock space
99 * @host_data: data specific to this host
100 * @cb: the callback to the code using the lock module
101 * @cb_data: private data passed back through @cb (GFS2 passes its superblock)
102 * @min_lvb_size: the minimum LVB size that the caller can deal with
103 * @flags: LM_MFLAG_*
104 * @lockstruct: a structure returned describing the mount
105 *
106 * Returns: 0 on success, -EXXX on failure
107 */
108
109int gfs2_mount_lockproto(char *proto_name, char *table_name, char *host_data,
110 lm_callback_t cb, void *cb_data,
111 unsigned int min_lvb_size, int flags,
112 struct lm_lockstruct *lockstruct,
113 struct kobject *fskobj)
114{
115 struct lmh_wrapper *lw = NULL;
116 int try = 0;
117 int error, found;
118
119retry:
120 mutex_lock(&lmh_lock);
121
122 found = 0;
123 list_for_each_entry(lw, &lmh_list, lw_list) {
124 if (!strcmp(lw->lw_ops->lm_proto_name, proto_name)) {
125 found = 1;
126 break;
127 }
128 }
129
130 if (!found) {
131 if (!try && capable(CAP_SYS_MODULE)) {
132 try = 1;
133 mutex_unlock(&lmh_lock);
134 request_module(proto_name);
135 goto retry;
136 }
137 printk(KERN_INFO "GFS2: can't find protocol %s\n", proto_name);
138 error = -ENOENT;
139 goto out;
140 }
141
142 if (!try_module_get(lw->lw_ops->lm_owner)) {
143 try = 0;
144 mutex_unlock(&lmh_lock);
145 msleep(1000);
146 goto retry;
147 }
148
149 error = lw->lw_ops->lm_mount(table_name, host_data, cb, cb_data,
150 min_lvb_size, flags, lockstruct, fskobj);
151 if (error)
152 module_put(lw->lw_ops->lm_owner);
153out:
154 mutex_unlock(&lmh_lock);
155 return error;
156}
157
158void gfs2_unmount_lockproto(struct lm_lockstruct *lockstruct)
159{
160 mutex_lock(&lmh_lock);
161 lockstruct->ls_ops->lm_unmount(lockstruct->ls_lockspace);
162 if (lockstruct->ls_ops->lm_owner)
163 module_put(lockstruct->ls_ops->lm_owner);
164 mutex_unlock(&lmh_lock);
165}
166
167/**
168 * gfs2_withdraw_lockproto - abnormally unmount a lock module
169 * @lockstruct: the lockstruct passed into mount
170 *
171 */
172
173void gfs2_withdraw_lockproto(struct lm_lockstruct *lockstruct)
174{
175 mutex_lock(&lmh_lock);
176 lockstruct->ls_ops->lm_withdraw(lockstruct->ls_lockspace);
177 if (lockstruct->ls_ops->lm_owner)
178 module_put(lockstruct->ls_ops->lm_owner);
179 mutex_unlock(&lmh_lock);
180}
181
182EXPORT_SYMBOL_GPL(gfs2_register_lockproto);
183EXPORT_SYMBOL_GPL(gfs2_unregister_lockproto);
184
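The consumer side of this registry is a lock module that fills in a struct lm_lockops and registers it from its module init, as the lock_dlm module later in this patch does. A minimal skeleton, assuming an invented protocol name "lock_example" and eliding the real lock operations:

/* Sketch of a module registering with the harness above; the ops
 * table is deliberately incomplete and "lock_example" is made up. */
#include <linux/module.h>
#include <linux/init.h>
#include <linux/lm_interface.h>

static const struct lm_lockops example_ops = {
	.lm_proto_name = "lock_example",
	.lm_owner = THIS_MODULE,
	/* .lm_mount, .lm_unmount, .lm_lock, ... */
};

static int __init init_lock_example(void)
{
	return gfs2_register_lockproto(&example_ops);
}

static void __exit exit_lock_example(void)
{
	gfs2_unregister_lockproto(&example_ops);
}

module_init(init_lock_example);
module_exit(exit_lock_example);
MODULE_LICENSE("GPL");

Since gfs2_mount_lockproto() falls back to request_module() with the protocol name, the module should be named after its lm_proto_name for autoloading to work.
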
diff --git a/fs/gfs2/locking/dlm/Makefile b/fs/gfs2/locking/dlm/Makefile
new file mode 100644
index 000000000000..89b93b6b45cf
--- /dev/null
+++ b/fs/gfs2/locking/dlm/Makefile
@@ -0,0 +1,3 @@
1obj-$(CONFIG_GFS2_FS_LOCKING_DLM) += lock_dlm.o
2lock_dlm-y := lock.o main.o mount.o sysfs.o thread.o plock.o
3
diff --git a/fs/gfs2/locking/dlm/lock.c b/fs/gfs2/locking/dlm/lock.c
new file mode 100644
index 000000000000..b167addf9fd1
--- /dev/null
+++ b/fs/gfs2/locking/dlm/lock.c
@@ -0,0 +1,524 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include "lock_dlm.h"
11
12static char junk_lvb[GDLM_LVB_SIZE];
13
14static void queue_complete(struct gdlm_lock *lp)
15{
16 struct gdlm_ls *ls = lp->ls;
17
18 clear_bit(LFL_ACTIVE, &lp->flags);
19
20 spin_lock(&ls->async_lock);
21 list_add_tail(&lp->clist, &ls->complete);
22 spin_unlock(&ls->async_lock);
23 wake_up(&ls->thread_wait);
24}
25
26static inline void gdlm_ast(void *astarg)
27{
28 queue_complete(astarg);
29}
30
31static inline void gdlm_bast(void *astarg, int mode)
32{
33 struct gdlm_lock *lp = astarg;
34 struct gdlm_ls *ls = lp->ls;
35
36 if (!mode) {
37 printk(KERN_INFO "lock_dlm: bast mode zero %x,%llx\n",
38 lp->lockname.ln_type,
39 (unsigned long long)lp->lockname.ln_number);
40 return;
41 }
42
43 spin_lock(&ls->async_lock);
44 if (!lp->bast_mode) {
45 list_add_tail(&lp->blist, &ls->blocking);
46 lp->bast_mode = mode;
47 } else if (lp->bast_mode < mode)
48 lp->bast_mode = mode;
49 spin_unlock(&ls->async_lock);
50 wake_up(&ls->thread_wait);
51}
52
53void gdlm_queue_delayed(struct gdlm_lock *lp)
54{
55 struct gdlm_ls *ls = lp->ls;
56
57 spin_lock(&ls->async_lock);
58 list_add_tail(&lp->delay_list, &ls->delayed);
59 spin_unlock(&ls->async_lock);
60}
61
62/* convert gfs lock-state to dlm lock-mode */
63
64static s16 make_mode(s16 lmstate)
65{
66 switch (lmstate) {
67 case LM_ST_UNLOCKED:
68 return DLM_LOCK_NL;
69 case LM_ST_EXCLUSIVE:
70 return DLM_LOCK_EX;
71 case LM_ST_DEFERRED:
72 return DLM_LOCK_CW;
73 case LM_ST_SHARED:
74 return DLM_LOCK_PR;
75 }
76 gdlm_assert(0, "unknown LM state %d", lmstate);
77 return -1;
78}
79
80/* convert dlm lock-mode to gfs lock-state */
81
82s16 gdlm_make_lmstate(s16 dlmmode)
83{
84 switch (dlmmode) {
85 case DLM_LOCK_IV:
86 case DLM_LOCK_NL:
87 return LM_ST_UNLOCKED;
88 case DLM_LOCK_EX:
89 return LM_ST_EXCLUSIVE;
90 case DLM_LOCK_CW:
91 return LM_ST_DEFERRED;
92 case DLM_LOCK_PR:
93 return LM_ST_SHARED;
94 }
95 gdlm_assert(0, "unknown DLM mode %d", dlmmode);
96 return -1;
97}
98
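make_mode() and gdlm_make_lmstate() must remain inverses of each other over the four GFS states (DLM_LOCK_IV additionally folds into LM_ST_UNLOCKED). A standalone round-trip check; the enum values below are placeholders rather than the real lm_interface.h/dlm.h constants, and the property does not depend on them:

/* Userspace re-statement of the two mode maps above, plus a
 * round-trip assertion over the four GFS lock states. */
#include <assert.h>

enum { LM_ST_UNLOCKED, LM_ST_EXCLUSIVE, LM_ST_DEFERRED, LM_ST_SHARED };
enum { DLM_LOCK_IV = -1, DLM_LOCK_NL, DLM_LOCK_CW, DLM_LOCK_PR,
       DLM_LOCK_EX };

static short make_mode(short lmstate)
{
	switch (lmstate) {
	case LM_ST_UNLOCKED:	return DLM_LOCK_NL;
	case LM_ST_EXCLUSIVE:	return DLM_LOCK_EX;
	case LM_ST_DEFERRED:	return DLM_LOCK_CW;
	case LM_ST_SHARED:	return DLM_LOCK_PR;
	}
	return -1;
}

static short make_lmstate(short mode)
{
	switch (mode) {
	case DLM_LOCK_IV:
	case DLM_LOCK_NL:	return LM_ST_UNLOCKED;
	case DLM_LOCK_EX:	return LM_ST_EXCLUSIVE;
	case DLM_LOCK_CW:	return LM_ST_DEFERRED;
	case DLM_LOCK_PR:	return LM_ST_SHARED;
	}
	return -1;
}

int main(void)
{
	short s;

	for (s = LM_ST_UNLOCKED; s <= LM_ST_SHARED; s++)
		assert(make_lmstate(make_mode(s)) == s);
	return 0;
}
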
99/* Verify agreement with GFS on the current lock state. NB: DLM_LOCK_NL and
100 DLM_LOCK_IV are both considered LM_ST_UNLOCKED by GFS. */
101
102static void check_cur_state(struct gdlm_lock *lp, unsigned int cur_state)
103{
104 s16 cur = make_mode(cur_state);
105 if (lp->cur != DLM_LOCK_IV)
106 gdlm_assert(lp->cur == cur, "%d, %d", lp->cur, cur);
107}
108
109static inline unsigned int make_flags(struct gdlm_lock *lp,
110 unsigned int gfs_flags,
111 s16 cur, s16 req)
112{
113 unsigned int lkf = 0;
114
115 if (gfs_flags & LM_FLAG_TRY)
116 lkf |= DLM_LKF_NOQUEUE;
117
118 if (gfs_flags & LM_FLAG_TRY_1CB) {
119 lkf |= DLM_LKF_NOQUEUE;
120 lkf |= DLM_LKF_NOQUEUEBAST;
121 }
122
123 if (gfs_flags & LM_FLAG_PRIORITY) {
124 lkf |= DLM_LKF_NOORDER;
125 lkf |= DLM_LKF_HEADQUE;
126 }
127
128 if (gfs_flags & LM_FLAG_ANY) {
129 if (req == DLM_LOCK_PR)
130 lkf |= DLM_LKF_ALTCW;
131 else if (req == DLM_LOCK_CW)
132 lkf |= DLM_LKF_ALTPR;
133 }
134
135 if (lp->lksb.sb_lkid != 0) {
136 lkf |= DLM_LKF_CONVERT;
137
138 /* Conversion deadlock avoidance by DLM */
139
140 if (!test_bit(LFL_FORCE_PROMOTE, &lp->flags) &&
141 !(lkf & DLM_LKF_NOQUEUE) &&
142 cur > DLM_LOCK_NL && req > DLM_LOCK_NL && cur != req)
143 lkf |= DLM_LKF_CONVDEADLK;
144 }
145
146 if (lp->lvb)
147 lkf |= DLM_LKF_VALBLK;
148
149 return lkf;
150}
151
152/* make_strname - convert GFS lock numbers to a string */
153
154static inline void make_strname(struct lm_lockname *lockname,
155 struct gdlm_strname *str)
156{
157 sprintf(str->name, "%8x%16llx", lockname->ln_type,
158 (unsigned long long)lockname->ln_number);
159 str->namelen = GDLM_STRNAME_BYTES;
160}
161
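The "%8x%16llx" format always produces 8 + 16 = 24 space-padded hex characters, exactly GDLM_STRNAME_BYTES; sprintf() also appends a NUL, so a standalone check needs one spare byte. A userspace sketch:

/* Userspace check of the 24-byte resource-name encoding above. */
#include <assert.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	char name[25];	/* 24 characters plus the NUL sprintf appends */
	unsigned int ln_type = 0x2;
	unsigned long long ln_number = 0x123456ULL;

	sprintf(name, "%8x%16llx", ln_type, ln_number);
	assert(strlen(name) == 24);
	printf("\"%s\"\n", name);	/* type and number, space-padded */
	return 0;
}
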
162static int gdlm_create_lp(struct gdlm_ls *ls, struct lm_lockname *name,
163 struct gdlm_lock **lpp)
164{
165 struct gdlm_lock *lp;
166
167 lp = kzalloc(sizeof(struct gdlm_lock), GFP_KERNEL);
168 if (!lp)
169 return -ENOMEM;
170
171 lp->lockname = *name;
172 lp->ls = ls;
173 lp->cur = DLM_LOCK_IV;
174 lp->lvb = NULL;
175 lp->hold_null = NULL;
176 init_completion(&lp->ast_wait);
177 INIT_LIST_HEAD(&lp->clist);
178 INIT_LIST_HEAD(&lp->blist);
179 INIT_LIST_HEAD(&lp->delay_list);
180
181 spin_lock(&ls->async_lock);
182 list_add(&lp->all_list, &ls->all_locks);
183 ls->all_locks_count++;
184 spin_unlock(&ls->async_lock);
185
186 *lpp = lp;
187 return 0;
188}
189
190void gdlm_delete_lp(struct gdlm_lock *lp)
191{
192 struct gdlm_ls *ls = lp->ls;
193
194 spin_lock(&ls->async_lock);
195 if (!list_empty(&lp->clist))
196 list_del_init(&lp->clist);
197 if (!list_empty(&lp->blist))
198 list_del_init(&lp->blist);
199 if (!list_empty(&lp->delay_list))
200 list_del_init(&lp->delay_list);
201 gdlm_assert(!list_empty(&lp->all_list), "%x,%llx", lp->lockname.ln_type,
202 (unsigned long long)lp->lockname.ln_number);
203 list_del_init(&lp->all_list);
204 ls->all_locks_count--;
205 spin_unlock(&ls->async_lock);
206
207 kfree(lp);
208}
209
210int gdlm_get_lock(void *lockspace, struct lm_lockname *name,
211 void **lockp)
212{
213 struct gdlm_lock *lp = NULL;
214 int error;
215
216 error = gdlm_create_lp(lockspace, name, &lp);
217
218 *lockp = lp;
219 return error;
220}
221
222void gdlm_put_lock(void *lock)
223{
224 gdlm_delete_lp(lock);
225}
226
227unsigned int gdlm_do_lock(struct gdlm_lock *lp)
228{
229 struct gdlm_ls *ls = lp->ls;
230 struct gdlm_strname str;
231 int error, bast = 1;
232
233 /*
234 * When recovery is in progress, delay lock requests; they are
235 * submitted once recovery is done. Requests used for recovery (NOEXP)
236 * and unlocks can pass.
237 */
238
239 if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) &&
240 !test_bit(LFL_NOBLOCK, &lp->flags) && lp->req != DLM_LOCK_NL) {
241 gdlm_queue_delayed(lp);
242 return LM_OUT_ASYNC;
243 }
244
245 /*
246 * Submit the actual lock request.
247 */
248
249 if (test_bit(LFL_NOBAST, &lp->flags))
250 bast = 0;
251
252 make_strname(&lp->lockname, &str);
253
254 set_bit(LFL_ACTIVE, &lp->flags);
255
256 log_debug("lk %x,%llx id %x %d,%d %x", lp->lockname.ln_type,
257 (unsigned long long)lp->lockname.ln_number, lp->lksb.sb_lkid,
258 lp->cur, lp->req, lp->lkf);
259
260 error = dlm_lock(ls->dlm_lockspace, lp->req, &lp->lksb, lp->lkf,
261 str.name, str.namelen, 0, gdlm_ast, lp,
262 bast ? gdlm_bast : NULL);
263
264 if ((error == -EAGAIN) && (lp->lkf & DLM_LKF_NOQUEUE)) {
265 lp->lksb.sb_status = -EAGAIN;
266 queue_complete(lp);
267 error = 0;
268 }
269
270 if (error) {
271 log_debug("%s: gdlm_lock %x,%llx err=%d cur=%d req=%d lkf=%x "
272 "flags=%lx", ls->fsname, lp->lockname.ln_type,
273 (unsigned long long)lp->lockname.ln_number, error,
274 lp->cur, lp->req, lp->lkf, lp->flags);
275 return LM_OUT_ERROR;
276 }
277 return LM_OUT_ASYNC;
278}
279
280static unsigned int gdlm_do_unlock(struct gdlm_lock *lp)
281{
282 struct gdlm_ls *ls = lp->ls;
283 unsigned int lkf = 0;
284 int error;
285
286 set_bit(LFL_DLM_UNLOCK, &lp->flags);
287 set_bit(LFL_ACTIVE, &lp->flags);
288
289 if (lp->lvb)
290 lkf = DLM_LKF_VALBLK;
291
292 log_debug("un %x,%llx %x %d %x", lp->lockname.ln_type,
293 (unsigned long long)lp->lockname.ln_number,
294 lp->lksb.sb_lkid, lp->cur, lkf);
295
296 error = dlm_unlock(ls->dlm_lockspace, lp->lksb.sb_lkid, lkf, NULL, lp);
297
298 if (error) {
299 log_debug("%s: gdlm_unlock %x,%llx err=%d cur=%d req=%d lkf=%x "
300 "flags=%lx", ls->fsname, lp->lockname.ln_type,
301 (unsigned long long)lp->lockname.ln_number, error,
302 lp->cur, lp->req, lp->lkf, lp->flags);
303 return LM_OUT_ERROR;
304 }
305 return LM_OUT_ASYNC;
306}
307
308unsigned int gdlm_lock(void *lock, unsigned int cur_state,
309 unsigned int req_state, unsigned int flags)
310{
311 struct gdlm_lock *lp = lock;
312
313 clear_bit(LFL_DLM_CANCEL, &lp->flags);
314 if (flags & LM_FLAG_NOEXP)
315 set_bit(LFL_NOBLOCK, &lp->flags);
316
317 check_cur_state(lp, cur_state);
318 lp->req = make_mode(req_state);
319 lp->lkf = make_flags(lp, flags, lp->cur, lp->req);
320
321 return gdlm_do_lock(lp);
322}
323
324unsigned int gdlm_unlock(void *lock, unsigned int cur_state)
325{
326 struct gdlm_lock *lp = lock;
327
328 clear_bit(LFL_DLM_CANCEL, &lp->flags);
329 if (lp->cur == DLM_LOCK_IV)
330 return 0;
331 return gdlm_do_unlock(lp);
332}
333
334void gdlm_cancel(void *lock)
335{
336 struct gdlm_lock *lp = lock;
337 struct gdlm_ls *ls = lp->ls;
338 int error, delay_list = 0;
339
340 if (test_bit(LFL_DLM_CANCEL, &lp->flags))
341 return;
342
343 log_info("gdlm_cancel %x,%llx flags %lx", lp->lockname.ln_type,
344 (unsigned long long)lp->lockname.ln_number, lp->flags);
345
346 spin_lock(&ls->async_lock);
347 if (!list_empty(&lp->delay_list)) {
348 list_del_init(&lp->delay_list);
349 delay_list = 1;
350 }
351 spin_unlock(&ls->async_lock);
352
353 if (delay_list) {
354 set_bit(LFL_CANCEL, &lp->flags);
355 set_bit(LFL_ACTIVE, &lp->flags);
356 queue_complete(lp);
357 return;
358 }
359
360 if (!test_bit(LFL_ACTIVE, &lp->flags) ||
361 test_bit(LFL_DLM_UNLOCK, &lp->flags)) {
362 log_info("gdlm_cancel skip %x,%llx flags %lx",
363 lp->lockname.ln_type,
364 (unsigned long long)lp->lockname.ln_number, lp->flags);
365 return;
366 }
367
368 /* the lock is blocked in the dlm */
369
370 set_bit(LFL_DLM_CANCEL, &lp->flags);
371 set_bit(LFL_ACTIVE, &lp->flags);
372
373 error = dlm_unlock(ls->dlm_lockspace, lp->lksb.sb_lkid, DLM_LKF_CANCEL,
374 NULL, lp);
375
376 log_info("gdlm_cancel rv %d %x,%llx flags %lx", error,
377 lp->lockname.ln_type,
378 (unsigned long long)lp->lockname.ln_number, lp->flags);
379
380 if (error == -EBUSY)
381 clear_bit(LFL_DLM_CANCEL, &lp->flags);
382}
383
384static int gdlm_add_lvb(struct gdlm_lock *lp)
385{
386 char *lvb;
387
388 lvb = kzalloc(GDLM_LVB_SIZE, GFP_KERNEL);
389 if (!lvb)
390 return -ENOMEM;
391
392 lp->lksb.sb_lvbptr = lvb;
393 lp->lvb = lvb;
394 return 0;
395}
396
397static void gdlm_del_lvb(struct gdlm_lock *lp)
398{
399 kfree(lp->lvb);
400 lp->lvb = NULL;
401 lp->lksb.sb_lvbptr = NULL;
402}
403
404/* This can do a synchronous dlm request (requiring a lock_dlm thread to get
405 the completion) because gfs won't call hold_lvb() during a callback (from
406 the context of a lock_dlm thread). */
407
408static int hold_null_lock(struct gdlm_lock *lp)
409{
410 struct gdlm_lock *lpn = NULL;
411 int error;
412
413 if (lp->hold_null) {
414 printk(KERN_INFO "lock_dlm: lvb already held\n");
415 return 0;
416 }
417
418 error = gdlm_create_lp(lp->ls, &lp->lockname, &lpn);
419 if (error)
420 goto out;
421
422 lpn->lksb.sb_lvbptr = junk_lvb;
423 lpn->lvb = junk_lvb;
424
425 lpn->req = DLM_LOCK_NL;
426 lpn->lkf = DLM_LKF_VALBLK | DLM_LKF_EXPEDITE;
427 set_bit(LFL_NOBAST, &lpn->flags);
428 set_bit(LFL_INLOCK, &lpn->flags);
429
430 init_completion(&lpn->ast_wait);
431 gdlm_do_lock(lpn);
432 wait_for_completion(&lpn->ast_wait);
433 error = lpn->lksb.sb_status;
434 if (error) {
435 printk(KERN_INFO "lock_dlm: hold_null_lock dlm error %d\n",
436 error);
437 gdlm_delete_lp(lpn);
438 lpn = NULL;
439 }
440out:
441 lp->hold_null = lpn;
442 return error;
443}
444
445/* This cannot do a synchronous dlm request (requiring a lock_dlm thread to get
446 the completion) because gfs may call unhold_lvb() during a callback (from
447 the context of a lock_dlm thread) which could cause a deadlock since the
448 other lock_dlm thread could be engaged in recovery. */
449
450static void unhold_null_lock(struct gdlm_lock *lp)
451{
452 struct gdlm_lock *lpn = lp->hold_null;
453
454 gdlm_assert(lpn, "%x,%llx", lp->lockname.ln_type,
455 (unsigned long long)lp->lockname.ln_number);
456 lpn->lksb.sb_lvbptr = NULL;
457 lpn->lvb = NULL;
458 set_bit(LFL_UNLOCK_DELETE, &lpn->flags);
459 gdlm_do_unlock(lpn);
460 lp->hold_null = NULL;
461}
462
463/* Acquire an NL lock because gfs requires the value block to remain
464 intact on the resource while the lvb is "held", even if gfs holds no locks
465 on the resource. */
466
467int gdlm_hold_lvb(void *lock, char **lvbp)
468{
469 struct gdlm_lock *lp = lock;
470 int error;
471
472 error = gdlm_add_lvb(lp);
473 if (error)
474 return error;
475
476 *lvbp = lp->lvb;
477
478 error = hold_null_lock(lp);
479 if (error)
480 gdlm_del_lvb(lp);
481
482 return error;
483}
484
485void gdlm_unhold_lvb(void *lock, char *lvb)
486{
487 struct gdlm_lock *lp = lock;
488
489 unhold_null_lock(lp);
490 gdlm_del_lvb(lp);
491}
492
493void gdlm_submit_delayed(struct gdlm_ls *ls)
494{
495 struct gdlm_lock *lp, *safe;
496
497 spin_lock(&ls->async_lock);
498 list_for_each_entry_safe(lp, safe, &ls->delayed, delay_list) {
499 list_del_init(&lp->delay_list);
500 list_add_tail(&lp->delay_list, &ls->submit);
501 }
502 spin_unlock(&ls->async_lock);
503 wake_up(&ls->thread_wait);
504}
505
506int gdlm_release_all_locks(struct gdlm_ls *ls)
507{
508 struct gdlm_lock *lp, *safe;
509 int count = 0;
510
511 spin_lock(&ls->async_lock);
512 list_for_each_entry_safe(lp, safe, &ls->all_locks, all_list) {
513 list_del_init(&lp->all_list);
514
515 if (lp->lvb && lp->lvb != junk_lvb)
516 kfree(lp->lvb);
517 kfree(lp);
518 count++;
519 }
520 spin_unlock(&ls->async_lock);
521
522 return count;
523}
524
diff --git a/fs/gfs2/locking/dlm/lock_dlm.h b/fs/gfs2/locking/dlm/lock_dlm.h
new file mode 100644
index 000000000000..33af707a4d3f
--- /dev/null
+++ b/fs/gfs2/locking/dlm/lock_dlm.h
@@ -0,0 +1,187 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef LOCK_DLM_DOT_H
11#define LOCK_DLM_DOT_H
12
13#include <linux/module.h>
14#include <linux/slab.h>
15#include <linux/spinlock.h>
17#include <linux/types.h>
18#include <linux/string.h>
19#include <linux/list.h>
20#include <linux/socket.h>
21#include <linux/delay.h>
22#include <linux/kthread.h>
23#include <linux/kobject.h>
24#include <linux/fcntl.h>
25#include <linux/wait.h>
26#include <net/sock.h>
27
28#include <linux/dlm.h>
29#include <linux/lm_interface.h>
30
31/*
32 * Internally, we prefix things with gdlm_ and GDLM_ (for gfs-dlm) since a
33 * prefix of lock_dlm_ gets awkward. Externally, GFS refers to this module
34 * as "lock_dlm".
35 */
36
37#define GDLM_STRNAME_BYTES 24
38#define GDLM_LVB_SIZE 32
39#define GDLM_DROP_COUNT 50000
40#define GDLM_DROP_PERIOD 60
41#define GDLM_NAME_LEN 128
42
43/* GFS uses 12 bytes to identify a resource (32 bit type + 64 bit number).
44 We sprintf these numbers into a 24 byte string of hex values to make them
45 human-readable (to make debugging simpler). */
46
47struct gdlm_strname {
48 unsigned char name[GDLM_STRNAME_BYTES];
49 unsigned short namelen;
50};
51
52enum {
53 DFL_BLOCK_LOCKS = 0,
54 DFL_SPECTATOR = 1,
55 DFL_WITHDRAW = 2,
56};
57
58struct gdlm_ls {
59 u32 id;
60 int jid;
61 int first;
62 int first_done;
63 unsigned long flags;
64 struct kobject kobj;
65 char clustername[GDLM_NAME_LEN];
66 char fsname[GDLM_NAME_LEN];
67 int fsflags;
68 dlm_lockspace_t *dlm_lockspace;
69 lm_callback_t fscb;
70 struct gfs2_sbd *sdp;
71 int recover_jid;
72 int recover_jid_done;
73 int recover_jid_status;
74 spinlock_t async_lock;
75 struct list_head complete;
76 struct list_head blocking;
77 struct list_head delayed;
78 struct list_head submit;
79 struct list_head all_locks;
80 u32 all_locks_count;
81 wait_queue_head_t wait_control;
82 struct task_struct *thread1;
83 struct task_struct *thread2;
84 wait_queue_head_t thread_wait;
85 unsigned long drop_time;
86 int drop_locks_count;
87 int drop_locks_period;
88};
89
90enum {
91 LFL_NOBLOCK = 0,
92 LFL_NOCACHE = 1,
93 LFL_DLM_UNLOCK = 2,
94 LFL_DLM_CANCEL = 3,
95 LFL_SYNC_LVB = 4,
96 LFL_FORCE_PROMOTE = 5,
97 LFL_REREQUEST = 6,
98 LFL_ACTIVE = 7,
99 LFL_INLOCK = 8,
100 LFL_CANCEL = 9,
101 LFL_NOBAST = 10,
102 LFL_HEADQUE = 11,
103 LFL_UNLOCK_DELETE = 12,
104};
105
106struct gdlm_lock {
107 struct gdlm_ls *ls;
108 struct lm_lockname lockname;
109 char *lvb;
110 struct dlm_lksb lksb;
111
112 s16 cur;
113 s16 req;
114 s16 prev_req;
115 u32 lkf; /* dlm flags DLM_LKF_ */
116 unsigned long flags; /* lock_dlm flags LFL_ */
117
118 int bast_mode; /* protected by async_lock */
119 struct completion ast_wait;
120
121 struct list_head clist; /* complete */
122 struct list_head blist; /* blocking */
123 struct list_head delay_list; /* delayed */
124 struct list_head all_list; /* all locks for the fs */
125 struct gdlm_lock *hold_null; /* NL lock for hold_lvb */
126};
127
128#define gdlm_assert(assertion, fmt, args...) \
129do { \
130 if (unlikely(!(assertion))) { \
131 printk(KERN_EMERG "lock_dlm: fatal assertion failed \"%s\"\n" \
132 "lock_dlm: " fmt "\n", \
133 #assertion, ##args); \
134 BUG(); \
135 } \
136} while (0)
137
138#define log_print(lev, fmt, arg...) printk(lev "lock_dlm: " fmt "\n" , ## arg)
139#define log_info(fmt, arg...) log_print(KERN_INFO , fmt , ## arg)
140#define log_error(fmt, arg...) log_print(KERN_ERR , fmt , ## arg)
141#ifdef LOCK_DLM_LOG_DEBUG
142#define log_debug(fmt, arg...) log_print(KERN_DEBUG , fmt , ## arg)
143#else
144#define log_debug(fmt, arg...)
145#endif
146
147/* sysfs.c */
148
149int gdlm_sysfs_init(void);
150void gdlm_sysfs_exit(void);
151int gdlm_kobject_setup(struct gdlm_ls *, struct kobject *);
152void gdlm_kobject_release(struct gdlm_ls *);
153
154/* thread.c */
155
156int gdlm_init_threads(struct gdlm_ls *);
157void gdlm_release_threads(struct gdlm_ls *);
158
159/* lock.c */
160
161s16 gdlm_make_lmstate(s16);
162void gdlm_queue_delayed(struct gdlm_lock *);
163void gdlm_submit_delayed(struct gdlm_ls *);
164int gdlm_release_all_locks(struct gdlm_ls *);
165void gdlm_delete_lp(struct gdlm_lock *);
166unsigned int gdlm_do_lock(struct gdlm_lock *);
167
168int gdlm_get_lock(void *, struct lm_lockname *, void **);
169void gdlm_put_lock(void *);
170unsigned int gdlm_lock(void *, unsigned int, unsigned int, unsigned int);
171unsigned int gdlm_unlock(void *, unsigned int);
172void gdlm_cancel(void *);
173int gdlm_hold_lvb(void *, char **);
174void gdlm_unhold_lvb(void *, char *);
175
176/* plock.c */
177
178int gdlm_plock_init(void);
179void gdlm_plock_exit(void);
180int gdlm_plock(void *, struct lm_lockname *, struct file *, int,
181 struct file_lock *);
182int gdlm_plock_get(void *, struct lm_lockname *, struct file *,
183 struct file_lock *);
184int gdlm_punlock(void *, struct lm_lockname *, struct file *,
185 struct file_lock *);
186#endif
187
diff --git a/fs/gfs2/locking/dlm/main.c b/fs/gfs2/locking/dlm/main.c
new file mode 100644
index 000000000000..2194b1d5b5ec
--- /dev/null
+++ b/fs/gfs2/locking/dlm/main.c
@@ -0,0 +1,64 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/init.h>
11
12#include "lock_dlm.h"
13
14extern int gdlm_drop_count;
15extern int gdlm_drop_period;
16
17extern struct lm_lockops gdlm_ops;
18
19static int __init init_lock_dlm(void)
20{
21 int error;
22
23 error = gfs2_register_lockproto(&gdlm_ops);
24 if (error) {
25 printk(KERN_WARNING "lock_dlm: can't register protocol: %d\n",
26 error);
27 return error;
28 }
29
30 error = gdlm_sysfs_init();
31 if (error) {
32 gfs2_unregister_lockproto(&gdlm_ops);
33 return error;
34 }
35
36 error = gdlm_plock_init();
37 if (error) {
38 gdlm_sysfs_exit();
39 gfs2_unregister_lockproto(&gdlm_ops);
40 return error;
41 }
42
43 gdlm_drop_count = GDLM_DROP_COUNT;
44 gdlm_drop_period = GDLM_DROP_PERIOD;
45
46 printk(KERN_INFO
47 "Lock_DLM (built %s %s) installed\n", __DATE__, __TIME__);
48 return 0;
49}
50
51static void __exit exit_lock_dlm(void)
52{
53 gdlm_plock_exit();
54 gdlm_sysfs_exit();
55 gfs2_unregister_lockproto(&gdlm_ops);
56}
57
58module_init(init_lock_dlm);
59module_exit(exit_lock_dlm);
60
61MODULE_DESCRIPTION("GFS DLM Locking Module");
62MODULE_AUTHOR("Red Hat, Inc.");
63MODULE_LICENSE("GPL");
64
diff --git a/fs/gfs2/locking/dlm/mount.c b/fs/gfs2/locking/dlm/mount.c
new file mode 100644
index 000000000000..cdd1694e889b
--- /dev/null
+++ b/fs/gfs2/locking/dlm/mount.c
@@ -0,0 +1,255 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include "lock_dlm.h"
11
12int gdlm_drop_count;
13int gdlm_drop_period;
14const struct lm_lockops gdlm_ops;
15
16
17static struct gdlm_ls *init_gdlm(lm_callback_t cb, struct gfs2_sbd *sdp,
18 int flags, char *table_name)
19{
20 struct gdlm_ls *ls;
21 char buf[256], *p;
22
23 ls = kzalloc(sizeof(struct gdlm_ls), GFP_KERNEL);
24 if (!ls)
25 return NULL;
26
27 ls->drop_locks_count = gdlm_drop_count;
28 ls->drop_locks_period = gdlm_drop_period;
29 ls->fscb = cb;
30 ls->sdp = sdp;
31 ls->fsflags = flags;
32 spin_lock_init(&ls->async_lock);
33 INIT_LIST_HEAD(&ls->complete);
34 INIT_LIST_HEAD(&ls->blocking);
35 INIT_LIST_HEAD(&ls->delayed);
36 INIT_LIST_HEAD(&ls->submit);
37 INIT_LIST_HEAD(&ls->all_locks);
38 init_waitqueue_head(&ls->thread_wait);
39 init_waitqueue_head(&ls->wait_control);
40 ls->thread1 = NULL;
41 ls->thread2 = NULL;
42 ls->drop_time = jiffies;
43 ls->jid = -1;
44
45 strncpy(buf, table_name, 256);
46 buf[255] = '\0';
47
48 p = strchr(buf, ':');
49 if (!p) {
50 log_info("invalid table_name \"%s\"", table_name);
51 kfree(ls);
52 return NULL;
53 }
54 *p = '\0';
55 p++;
56
57 strncpy(ls->clustername, buf, GDLM_NAME_LEN);
58 strncpy(ls->fsname, p, GDLM_NAME_LEN);
59
60 return ls;
61}
62
63static int make_args(struct gdlm_ls *ls, char *data_arg, int *nodir)
64{
65 char data[256];
66 char *options, *x, *y;
67 int error = 0;
68
69 memset(data, 0, 256);
70 strncpy(data, data_arg, 255);
71
72 for (options = data; (x = strsep(&options, ":")); ) {
73 if (!*x)
74 continue;
75
76 y = strchr(x, '=');
77 if (y)
78 *y++ = 0;
79
80 if (!strcmp(x, "jid")) {
81 if (!y) {
82 log_error("need argument to jid");
83 error = -EINVAL;
84 break;
85 }
86 sscanf(y, "%u", &ls->jid);
87
88 } else if (!strcmp(x, "first")) {
89 if (!y) {
90 log_error("need argument to first");
91 error = -EINVAL;
92 break;
93 }
94 sscanf(y, "%u", &ls->first);
95
96 } else if (!strcmp(x, "id")) {
97 if (!y) {
98 log_error("need argument to id");
99 error = -EINVAL;
100 break;
101 }
102 sscanf(y, "%u", &ls->id);
103
104 } else if (!strcmp(x, "nodir")) {
105 if (!y) {
106 log_error("need argument to nodir");
107 error = -EINVAL;
108 break;
109 }
110 sscanf(y, "%u", nodir);
111
112 } else {
113 log_error("unknown option: %s", x);
114 error = -EINVAL;
115 break;
116 }
117 }
118
119 return error;
120}
121
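The hostdata string parsed above is a colon-separated list of key=value options with keys jid, first, id and nodir; "jid=0:id=262154:first=1" is a representative, made-up example. The same strsep() walk can be exercised in userspace:

/* Userspace sketch of the option walk used by make_args(); the
 * sample hostdata string is illustrative only. */
#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>

int main(void)
{
	char data[256] = "jid=0:id=262154:first=1:nodir=0";
	char *options = data, *x, *y;

	while ((x = strsep(&options, ":"))) {
		if (!*x)
			continue;
		y = strchr(x, '=');
		if (y)
			*y++ = '\0';
		printf("option %s = %s\n", x, y ? y : "(none)");
	}
	return 0;
}
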
122static int gdlm_mount(char *table_name, char *host_data,
123 lm_callback_t cb, void *cb_data,
124 unsigned int min_lvb_size, int flags,
125 struct lm_lockstruct *lockstruct,
126 struct kobject *fskobj)
127{
128 struct gdlm_ls *ls;
129 int error = -ENOMEM, nodir = 0;
130
131 if (min_lvb_size > GDLM_LVB_SIZE)
132 goto out;
133
134 ls = init_gdlm(cb, cb_data, flags, table_name);
135 if (!ls)
136 goto out;
137
138 error = make_args(ls, host_data, &nodir);
139 if (error)
140 goto out_free;
141
142 error = gdlm_init_threads(ls);
143 if (error)
144 goto out_free;
145
146 error = gdlm_kobject_setup(ls, fskobj);
147 if (error)
148 goto out_thread;
149
150 error = dlm_new_lockspace(ls->fsname, strlen(ls->fsname),
151 &ls->dlm_lockspace,
152 nodir ? DLM_LSFL_NODIR : 0,
153 GDLM_LVB_SIZE);
154 if (error) {
155 log_error("dlm_new_lockspace error %d", error);
156 goto out_kobj;
157 }
158
159 lockstruct->ls_jid = ls->jid;
160 lockstruct->ls_first = ls->first;
161 lockstruct->ls_lockspace = ls;
162 lockstruct->ls_ops = &gdlm_ops;
163 lockstruct->ls_flags = 0;
164 lockstruct->ls_lvb_size = GDLM_LVB_SIZE;
165 return 0;
166
167out_kobj:
168 gdlm_kobject_release(ls);
169out_thread:
170 gdlm_release_threads(ls);
171out_free:
172 kfree(ls);
173out:
174 return error;
175}
176
177static void gdlm_unmount(void *lockspace)
178{
179 struct gdlm_ls *ls = lockspace;
180 int rv;
181
182 log_debug("unmount flags %lx", ls->flags);
183
184 /* FIXME: serialize unmount and withdraw in case they
185 happen at once. Also, if unmount follows withdraw,
186 wait for withdraw to finish. */
187
188 if (test_bit(DFL_WITHDRAW, &ls->flags))
189 goto out;
190
191 gdlm_kobject_release(ls);
192 dlm_release_lockspace(ls->dlm_lockspace, 2);
193 gdlm_release_threads(ls);
194 rv = gdlm_release_all_locks(ls);
195 if (rv)
196 log_info("gdlm_unmount: %d stray locks freed", rv);
197out:
198 kfree(ls);
199}
200
201static void gdlm_recovery_done(void *lockspace, unsigned int jid,
202 unsigned int message)
203{
204 struct gdlm_ls *ls = lockspace;
205 ls->recover_jid_done = jid;
206 ls->recover_jid_status = message;
207 kobject_uevent(&ls->kobj, KOBJ_CHANGE);
208}
209
210static void gdlm_others_may_mount(void *lockspace)
211{
212 struct gdlm_ls *ls = lockspace;
213 ls->first_done = 1;
214 kobject_uevent(&ls->kobj, KOBJ_CHANGE);
215}
216
217/* Userspace gets the offline uevent, blocks new gfs locks on
218 other mounters, and lets us know (sets WITHDRAW flag). Then,
219 userspace leaves the mount group while we leave the lockspace. */
220
221static void gdlm_withdraw(void *lockspace)
222{
223 struct gdlm_ls *ls = lockspace;
224
225 kobject_uevent(&ls->kobj, KOBJ_OFFLINE);
226
227 wait_event_interruptible(ls->wait_control,
228 test_bit(DFL_WITHDRAW, &ls->flags));
229
230 dlm_release_lockspace(ls->dlm_lockspace, 2);
231 gdlm_release_threads(ls);
232 gdlm_release_all_locks(ls);
233 gdlm_kobject_release(ls);
234}
235
236const struct lm_lockops gdlm_ops = {
237 .lm_proto_name = "lock_dlm",
238 .lm_mount = gdlm_mount,
239 .lm_others_may_mount = gdlm_others_may_mount,
240 .lm_unmount = gdlm_unmount,
241 .lm_withdraw = gdlm_withdraw,
242 .lm_get_lock = gdlm_get_lock,
243 .lm_put_lock = gdlm_put_lock,
244 .lm_lock = gdlm_lock,
245 .lm_unlock = gdlm_unlock,
246 .lm_plock = gdlm_plock,
247 .lm_punlock = gdlm_punlock,
248 .lm_plock_get = gdlm_plock_get,
249 .lm_cancel = gdlm_cancel,
250 .lm_hold_lvb = gdlm_hold_lvb,
251 .lm_unhold_lvb = gdlm_unhold_lvb,
252 .lm_recovery_done = gdlm_recovery_done,
253 .lm_owner = THIS_MODULE,
254};
255
diff --git a/fs/gfs2/locking/dlm/plock.c b/fs/gfs2/locking/dlm/plock.c
new file mode 100644
index 000000000000..7365aec9511b
--- /dev/null
+++ b/fs/gfs2/locking/dlm/plock.c
@@ -0,0 +1,301 @@
1/*
2 * Copyright (C) 2005 Red Hat, Inc. All rights reserved.
3 *
4 * This copyrighted material is made available to anyone wishing to use,
5 * modify, copy, or redistribute it subject to the terms and conditions
6 * of the GNU General Public License version 2.
7 */
8
9#include <linux/miscdevice.h>
10#include <linux/lock_dlm_plock.h>
11
12#include "lock_dlm.h"
13
14
15static spinlock_t ops_lock;
16static struct list_head send_list;
17static struct list_head recv_list;
18static wait_queue_head_t send_wq;
19static wait_queue_head_t recv_wq;
20
21struct plock_op {
22 struct list_head list;
23 int done;
24 struct gdlm_plock_info info;
25};
26
27static inline void set_version(struct gdlm_plock_info *info)
28{
29 info->version[0] = GDLM_PLOCK_VERSION_MAJOR;
30 info->version[1] = GDLM_PLOCK_VERSION_MINOR;
31 info->version[2] = GDLM_PLOCK_VERSION_PATCH;
32}
33
34static int check_version(struct gdlm_plock_info *info)
35{
36 if ((GDLM_PLOCK_VERSION_MAJOR != info->version[0]) ||
37 (GDLM_PLOCK_VERSION_MINOR < info->version[1])) {
38 log_error("plock device version mismatch: "
39 "kernel (%u.%u.%u), user (%u.%u.%u)",
40 GDLM_PLOCK_VERSION_MAJOR,
41 GDLM_PLOCK_VERSION_MINOR,
42 GDLM_PLOCK_VERSION_PATCH,
43 info->version[0],
44 info->version[1],
45 info->version[2]);
46 return -EINVAL;
47 }
48 return 0;
49}
50
51static void send_op(struct plock_op *op)
52{
53 set_version(&op->info);
54 INIT_LIST_HEAD(&op->list);
55 spin_lock(&ops_lock);
56 list_add_tail(&op->list, &send_list);
57 spin_unlock(&ops_lock);
58 wake_up(&send_wq);
59}
60
61int gdlm_plock(void *lockspace, struct lm_lockname *name,
62 struct file *file, int cmd, struct file_lock *fl)
63{
64 struct gdlm_ls *ls = lockspace;
65 struct plock_op *op;
66 int rv;
67
68 op = kzalloc(sizeof(*op), GFP_KERNEL);
69 if (!op)
70 return -ENOMEM;
71
72 op->info.optype = GDLM_PLOCK_OP_LOCK;
73 op->info.pid = fl->fl_pid;
74 op->info.ex = (fl->fl_type == F_WRLCK);
75 op->info.wait = IS_SETLKW(cmd);
76 op->info.fsid = ls->id;
77 op->info.number = name->ln_number;
78 op->info.start = fl->fl_start;
79 op->info.end = fl->fl_end;
80 op->info.owner = (__u64)(long) fl->fl_owner;
81
82 send_op(op);
83 wait_event(recv_wq, (op->done != 0));
84
85 spin_lock(&ops_lock);
86 if (!list_empty(&op->list)) {
87 printk(KERN_INFO "plock op on list\n");
88 list_del(&op->list);
89 }
90 spin_unlock(&ops_lock);
91
92 rv = op->info.rv;
93
94 if (!rv) {
95 if (posix_lock_file_wait(file, fl) < 0)
96 log_error("gdlm_plock: vfs lock error %x,%llx",
97 name->ln_type,
98 (unsigned long long)name->ln_number);
99 }
100
101 kfree(op);
102 return rv;
103}
104
105int gdlm_punlock(void *lockspace, struct lm_lockname *name,
106 struct file *file, struct file_lock *fl)
107{
108 struct gdlm_ls *ls = lockspace;
109 struct plock_op *op;
110 int rv;
111
112 op = kzalloc(sizeof(*op), GFP_KERNEL);
113 if (!op)
114 return -ENOMEM;
115
116 if (posix_lock_file_wait(file, fl) < 0)
117 log_error("gdlm_punlock: vfs unlock error %x,%llx",
118 name->ln_type, (unsigned long long)name->ln_number);
119
120 op->info.optype = GDLM_PLOCK_OP_UNLOCK;
121 op->info.pid = fl->fl_pid;
122 op->info.fsid = ls->id;
123 op->info.number = name->ln_number;
124 op->info.start = fl->fl_start;
125 op->info.end = fl->fl_end;
126 op->info.owner = (__u64)(long) fl->fl_owner;
127
128 send_op(op);
129 wait_event(recv_wq, (op->done != 0));
130
131 spin_lock(&ops_lock);
132 if (!list_empty(&op->list)) {
133 printk(KERN_INFO "punlock op on list\n");
134 list_del(&op->list);
135 }
136 spin_unlock(&ops_lock);
137
138 rv = op->info.rv;
139
140 kfree(op);
141 return rv;
142}
143
144int gdlm_plock_get(void *lockspace, struct lm_lockname *name,
145 struct file *file, struct file_lock *fl)
146{
147 struct gdlm_ls *ls = lockspace;
148 struct plock_op *op;
149 int rv;
150
151 op = kzalloc(sizeof(*op), GFP_KERNEL);
152 if (!op)
153 return -ENOMEM;
154
155 op->info.optype = GDLM_PLOCK_OP_GET;
156 op->info.pid = fl->fl_pid;
157 op->info.ex = (fl->fl_type == F_WRLCK);
158 op->info.fsid = ls->id;
159 op->info.number = name->ln_number;
160 op->info.start = fl->fl_start;
161 op->info.end = fl->fl_end;
162
163 send_op(op);
164 wait_event(recv_wq, (op->done != 0));
165
166 spin_lock(&ops_lock);
167 if (!list_empty(&op->list)) {
168 printk(KERN_INFO "plock_get op on list\n");
169 list_del(&op->list);
170 }
171 spin_unlock(&ops_lock);
172
173 rv = op->info.rv;
174
175 if (rv == 0)
176 fl->fl_type = F_UNLCK;
177 else if (rv > 0) {
178 fl->fl_type = (op->info.ex) ? F_WRLCK : F_RDLCK;
179 fl->fl_pid = op->info.pid;
180 fl->fl_start = op->info.start;
181 fl->fl_end = op->info.end;
182 }
183
184 kfree(op);
185 return rv;
186}
187
188/* a read copies out one plock request from the send list */
189static ssize_t dev_read(struct file *file, char __user *u, size_t count,
190 loff_t *ppos)
191{
192 struct gdlm_plock_info info;
193 struct plock_op *op = NULL;
194
195 if (count < sizeof(info))
196 return -EINVAL;
197
198 spin_lock(&ops_lock);
199 if (!list_empty(&send_list)) {
200 op = list_entry(send_list.next, struct plock_op, list);
201 list_move(&op->list, &recv_list);
202 memcpy(&info, &op->info, sizeof(info));
203 }
204 spin_unlock(&ops_lock);
205
206 if (!op)
207 return -EAGAIN;
208
209 if (copy_to_user(u, &info, sizeof(info)))
210 return -EFAULT;
211 return sizeof(info);
212}
213
214/* a write copies in one plock result that should match a plock_op
215 on the recv list */
216static ssize_t dev_write(struct file *file, const char __user *u, size_t count,
217 loff_t *ppos)
218{
219 struct gdlm_plock_info info;
220 struct plock_op *op;
221 int found = 0;
222
223 if (count != sizeof(info))
224 return -EINVAL;
225
226 if (copy_from_user(&info, u, sizeof(info)))
227 return -EFAULT;
228
229 if (check_version(&info))
230 return -EINVAL;
231
232 spin_lock(&ops_lock);
233 list_for_each_entry(op, &recv_list, list) {
234 if (op->info.fsid == info.fsid && op->info.number == info.number &&
235 op->info.owner == info.owner) {
236 list_del_init(&op->list);
237 found = 1;
238 op->done = 1;
239 memcpy(&op->info, &info, sizeof(info));
240 break;
241 }
242 }
243 spin_unlock(&ops_lock);
244
245 if (found)
246 wake_up(&recv_wq);
247 else
248 printk(KERN_INFO "gdlm dev_write no op %x %llx\n", info.fsid,
249 (unsigned long long)info.number);
250 return count;
251}
252
253static unsigned int dev_poll(struct file *file, poll_table *wait)
254{
255 poll_wait(file, &send_wq, wait);
256
257 spin_lock(&ops_lock);
258 if (!list_empty(&send_list)) {
259 spin_unlock(&ops_lock);
260 return POLLIN | POLLRDNORM;
261 }
262 spin_unlock(&ops_lock);
263 return 0;
264}
265
266static struct file_operations dev_fops = {
267 .read = dev_read,
268 .write = dev_write,
269 .poll = dev_poll,
270 .owner = THIS_MODULE
271};
272
273static struct miscdevice plock_dev_misc = {
274 .minor = MISC_DYNAMIC_MINOR,
275 .name = GDLM_PLOCK_MISC_NAME,
276 .fops = &dev_fops
277};
278
279int gdlm_plock_init(void)
280{
281 int rv;
282
283 spin_lock_init(&ops_lock);
284 INIT_LIST_HEAD(&send_list);
285 INIT_LIST_HEAD(&recv_list);
286 init_waitqueue_head(&send_wq);
287 init_waitqueue_head(&recv_wq);
288
289 rv = misc_register(&plock_dev_misc);
290 if (rv)
291 printk(KERN_INFO "gdlm_plock_init: misc_register failed %d\n",
292 rv);
293 return rv;
294}
295
296void gdlm_plock_exit(void)
297{
298 if (misc_deregister(&plock_dev_misc) < 0)
299 printk(KERN_INFO "gdlm_plock_exit: misc_deregister failed\n");
300}
301
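The misc device above implements a small request/response protocol: userspace reads one struct gdlm_plock_info per read(), decides the posix-lock request, and writes the struct back, which wakes the kernel-side waiter. A sketch of the daemon loop; the device path is assumed from the usual /dev/<misc name> convention, and the header is assumed to be visible to userspace:

/* Sketch of a plock daemon; "grant everything" is a demo policy,
 * and the device path is an assumption. */
#include <fcntl.h>
#include <poll.h>
#include <unistd.h>
#include <linux/lock_dlm_plock.h>

int main(void)
{
	struct gdlm_plock_info info;
	struct pollfd pfd;
	int fd = open("/dev/lock_dlm_plock", O_RDWR);

	if (fd < 0)
		return 1;
	pfd.fd = fd;
	pfd.events = POLLIN;
	while (poll(&pfd, 1, -1) > 0) {
		if (read(fd, &info, sizeof(info)) != sizeof(info))
			continue;	/* dev_read returns -EAGAIN if empty */
		info.rv = 0;		/* demo: grant without conflict checks */
		if (write(fd, &info, sizeof(info)) != sizeof(info))
			break;
	}
	close(fd);
	return 0;
}

Echoing back the struct that was read preserves the version fields set by send_op(), so the write passes check_version().
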
diff --git a/fs/gfs2/locking/dlm/sysfs.c b/fs/gfs2/locking/dlm/sysfs.c
new file mode 100644
index 000000000000..29ae06f94944
--- /dev/null
+++ b/fs/gfs2/locking/dlm/sysfs.c
@@ -0,0 +1,226 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/ctype.h>
11#include <linux/stat.h>
12
13#include "lock_dlm.h"
14
15extern struct lm_lockops gdlm_ops;
16
17static ssize_t proto_name_show(struct gdlm_ls *ls, char *buf)
18{
19 return sprintf(buf, "%s\n", gdlm_ops.lm_proto_name);
20}
21
22static ssize_t block_show(struct gdlm_ls *ls, char *buf)
23{
24 ssize_t ret;
25 int val = 0;
26
27 if (test_bit(DFL_BLOCK_LOCKS, &ls->flags))
28 val = 1;
29 ret = sprintf(buf, "%d\n", val);
30 return ret;
31}
32
33static ssize_t block_store(struct gdlm_ls *ls, const char *buf, size_t len)
34{
35 ssize_t ret = len;
36 int val;
37
38 val = simple_strtol(buf, NULL, 0);
39
40 if (val == 1)
41 set_bit(DFL_BLOCK_LOCKS, &ls->flags);
42 else if (val == 0) {
43 clear_bit(DFL_BLOCK_LOCKS, &ls->flags);
44 gdlm_submit_delayed(ls);
45 } else {
46 ret = -EINVAL;
47 }
48 return ret;
49}
50
51static ssize_t withdraw_show(struct gdlm_ls *ls, char *buf)
52{
53 ssize_t ret;
54 int val = 0;
55
56 if (test_bit(DFL_WITHDRAW, &ls->flags))
57 val = 1;
58 ret = sprintf(buf, "%d\n", val);
59 return ret;
60}
61
62static ssize_t withdraw_store(struct gdlm_ls *ls, const char *buf, size_t len)
63{
64 ssize_t ret = len;
65 int val;
66
67 val = simple_strtol(buf, NULL, 0);
68
69 if (val == 1)
70 set_bit(DFL_WITHDRAW, &ls->flags);
71 else
72 ret = -EINVAL;
73 wake_up(&ls->wait_control);
74 return ret;
75}
76
77static ssize_t id_show(struct gdlm_ls *ls, char *buf)
78{
79 return sprintf(buf, "%u\n", ls->id);
80}
81
82static ssize_t jid_show(struct gdlm_ls *ls, char *buf)
83{
84 return sprintf(buf, "%d\n", ls->jid);
85}
86
87static ssize_t first_show(struct gdlm_ls *ls, char *buf)
88{
89 return sprintf(buf, "%d\n", ls->first);
90}
91
92static ssize_t first_done_show(struct gdlm_ls *ls, char *buf)
93{
94 return sprintf(buf, "%d\n", ls->first_done);
95}
96
97static ssize_t recover_show(struct gdlm_ls *ls, char *buf)
98{
99 return sprintf(buf, "%d\n", ls->recover_jid);
100}
101
102static ssize_t recover_store(struct gdlm_ls *ls, const char *buf, size_t len)
103{
104 ls->recover_jid = simple_strtol(buf, NULL, 0);
105 ls->fscb(ls->sdp, LM_CB_NEED_RECOVERY, &ls->recover_jid);
106 return len;
107}
108
109static ssize_t recover_done_show(struct gdlm_ls *ls, char *buf)
110{
111 return sprintf(buf, "%d\n", ls->recover_jid_done);
112}
113
114static ssize_t recover_status_show(struct gdlm_ls *ls, char *buf)
115{
116 return sprintf(buf, "%d\n", ls->recover_jid_status);
117}
118
119struct gdlm_attr {
120 struct attribute attr;
121 ssize_t (*show)(struct gdlm_ls *, char *);
122 ssize_t (*store)(struct gdlm_ls *, const char *, size_t);
123};
124
125#define GDLM_ATTR(_name,_mode,_show,_store) \
126static struct gdlm_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store)
127
128GDLM_ATTR(proto_name, 0444, proto_name_show, NULL);
129GDLM_ATTR(block, 0644, block_show, block_store);
130GDLM_ATTR(withdraw, 0644, withdraw_show, withdraw_store);
131GDLM_ATTR(id, 0444, id_show, NULL);
132GDLM_ATTR(jid, 0444, jid_show, NULL);
133GDLM_ATTR(first, 0444, first_show, NULL);
134GDLM_ATTR(first_done, 0444, first_done_show, NULL);
135GDLM_ATTR(recover, 0644, recover_show, recover_store);
136GDLM_ATTR(recover_done, 0444, recover_done_show, NULL);
137GDLM_ATTR(recover_status, 0444, recover_status_show, NULL);
138
139static struct attribute *gdlm_attrs[] = {
140 &gdlm_attr_proto_name.attr,
141 &gdlm_attr_block.attr,
142 &gdlm_attr_withdraw.attr,
143 &gdlm_attr_id.attr,
144 &gdlm_attr_jid.attr,
145 &gdlm_attr_first.attr,
146 &gdlm_attr_first_done.attr,
147 &gdlm_attr_recover.attr,
148 &gdlm_attr_recover_done.attr,
149 &gdlm_attr_recover_status.attr,
150 NULL,
151};
152
153static ssize_t gdlm_attr_show(struct kobject *kobj, struct attribute *attr,
154 char *buf)
155{
156 struct gdlm_ls *ls = container_of(kobj, struct gdlm_ls, kobj);
157 struct gdlm_attr *a = container_of(attr, struct gdlm_attr, attr);
158 return a->show ? a->show(ls, buf) : 0;
159}
160
161static ssize_t gdlm_attr_store(struct kobject *kobj, struct attribute *attr,
162 const char *buf, size_t len)
163{
164 struct gdlm_ls *ls = container_of(kobj, struct gdlm_ls, kobj);
165 struct gdlm_attr *a = container_of(attr, struct gdlm_attr, attr);
166 return a->store ? a->store(ls, buf, len) : len;
167}
168
169static struct sysfs_ops gdlm_attr_ops = {
170 .show = gdlm_attr_show,
171 .store = gdlm_attr_store,
172};
173
174static struct kobj_type gdlm_ktype = {
175 .default_attrs = gdlm_attrs,
176 .sysfs_ops = &gdlm_attr_ops,
177};
178
179static struct kset gdlm_kset = {
180 .subsys = &kernel_subsys,
181 .kobj = {.name = "lock_dlm",},
182 .ktype = &gdlm_ktype,
183};
184
185int gdlm_kobject_setup(struct gdlm_ls *ls, struct kobject *fskobj)
186{
187 int error;
188
189 error = kobject_set_name(&ls->kobj, "%s", "lock_module");
190 if (error) {
191 log_error("can't set kobj name %d", error);
192 return error;
193 }
194
195 ls->kobj.kset = &gdlm_kset;
196 ls->kobj.ktype = &gdlm_ktype;
197 ls->kobj.parent = fskobj;
198
199 error = kobject_register(&ls->kobj);
200 if (error)
201 log_error("can't register kobj %d", error);
202
203 return error;
204}
205
206void gdlm_kobject_release(struct gdlm_ls *ls)
207{
208 kobject_unregister(&ls->kobj);
209}
210
211int gdlm_sysfs_init(void)
212{
213 int error;
214
215 error = kset_register(&gdlm_kset);
216 if (error)
217 printk(KERN_ERR "lock_dlm: cannot register kset %d\n", error);
218
219 return error;
220}
221
222void gdlm_sysfs_exit(void)
223{
224 kset_unregister(&gdlm_kset);
225}
226
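These attributes let cluster tooling drive recovery from userspace: write 1 to block before recovery, write a journal id to recover, read recover_done and recover_status, then write 0 to block, which also resubmits delayed locks via gdlm_submit_delayed(). A sketch; the path is an assumption built from the kobject names above, with "myfs" standing in for the per-filesystem directory:

/* Sketch: write one lock_module attribute. The sysfs path is an
 * assumption; "myfs" is a made-up filesystem directory name. */
#include <stdio.h>

static int attr_write(const char *path, int val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fprintf(f, "%d\n", val);
	return fclose(f);
}

int main(void)
{
	return attr_write("/sys/fs/gfs2/myfs/lock_module/block", 1);
}
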
diff --git a/fs/gfs2/locking/dlm/thread.c b/fs/gfs2/locking/dlm/thread.c
new file mode 100644
index 000000000000..9cf1f168eaf8
--- /dev/null
+++ b/fs/gfs2/locking/dlm/thread.c
@@ -0,0 +1,359 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include "lock_dlm.h"
11
12/* A lock placed on this queue is re-submitted to DLM as soon as the lock_dlm
13 thread gets to it. */
14
15static void queue_submit(struct gdlm_lock *lp)
16{
17 struct gdlm_ls *ls = lp->ls;
18
19 spin_lock(&ls->async_lock);
20 list_add_tail(&lp->delay_list, &ls->submit);
21 spin_unlock(&ls->async_lock);
22 wake_up(&ls->thread_wait);
23}
24
25static void process_blocking(struct gdlm_lock *lp, int bast_mode)
26{
27 struct gdlm_ls *ls = lp->ls;
28 unsigned int cb = 0;
29
30 switch (gdlm_make_lmstate(bast_mode)) {
31 case LM_ST_EXCLUSIVE:
32 cb = LM_CB_NEED_E;
33 break;
34 case LM_ST_DEFERRED:
35 cb = LM_CB_NEED_D;
36 break;
37 case LM_ST_SHARED:
38 cb = LM_CB_NEED_S;
39 break;
40 default:
41 gdlm_assert(0, "unknown bast mode %u", bast_mode);
42 }
43
44 ls->fscb(ls->sdp, cb, &lp->lockname);
45}
46
47static void process_complete(struct gdlm_lock *lp)
48{
49 struct gdlm_ls *ls = lp->ls;
50 struct lm_async_cb acb;
51 s16 prev_mode = lp->cur;
52
53 memset(&acb, 0, sizeof(acb));
54
55 if (lp->lksb.sb_status == -DLM_ECANCEL) {
56 log_info("complete dlm cancel %x,%llx flags %lx",
57 lp->lockname.ln_type,
58 (unsigned long long)lp->lockname.ln_number,
59 lp->flags);
60
61 lp->req = lp->cur;
62 acb.lc_ret |= LM_OUT_CANCELED;
63 if (lp->cur == DLM_LOCK_IV)
64 lp->lksb.sb_lkid = 0;
65 goto out;
66 }
67
68 if (test_and_clear_bit(LFL_DLM_UNLOCK, &lp->flags)) {
69 if (lp->lksb.sb_status != -DLM_EUNLOCK) {
70 log_info("unlock sb_status %d %x,%llx flags %lx",
71 lp->lksb.sb_status, lp->lockname.ln_type,
72 (unsigned long long)lp->lockname.ln_number,
73 lp->flags);
74 return;
75 }
76
77 lp->cur = DLM_LOCK_IV;
78 lp->req = DLM_LOCK_IV;
79 lp->lksb.sb_lkid = 0;
80
81 if (test_and_clear_bit(LFL_UNLOCK_DELETE, &lp->flags)) {
82 gdlm_delete_lp(lp);
83 return;
84 }
85 goto out;
86 }
87
88 if (lp->lksb.sb_flags & DLM_SBF_VALNOTVALID)
89 memset(lp->lksb.sb_lvbptr, 0, GDLM_LVB_SIZE);
90
91 if (lp->lksb.sb_flags & DLM_SBF_ALTMODE) {
92 if (lp->req == DLM_LOCK_PR)
93 lp->req = DLM_LOCK_CW;
94 else if (lp->req == DLM_LOCK_CW)
95 lp->req = DLM_LOCK_PR;
96 }
97
98 /*
99 * A canceled lock request. The lock was just taken off the delayed
100 * list and was never even submitted to dlm.
101 */
102
103 if (test_and_clear_bit(LFL_CANCEL, &lp->flags)) {
104 log_info("complete internal cancel %x,%llx",
105 lp->lockname.ln_type,
106 (unsigned long long)lp->lockname.ln_number);
107 lp->req = lp->cur;
108 acb.lc_ret |= LM_OUT_CANCELED;
109 goto out;
110 }
111
112 /*
113 * An error occurred.
114 */
115
116 if (lp->lksb.sb_status) {
117 /* a "normal" error */
118 if ((lp->lksb.sb_status == -EAGAIN) &&
119 (lp->lkf & DLM_LKF_NOQUEUE)) {
120 lp->req = lp->cur;
121 if (lp->cur == DLM_LOCK_IV)
122 lp->lksb.sb_lkid = 0;
123 goto out;
124 }
125
126 /* this could only happen with cancels I think */
127 log_info("ast sb_status %d %x,%llx flags %lx",
128 lp->lksb.sb_status, lp->lockname.ln_type,
129 (unsigned long long)lp->lockname.ln_number,
130 lp->flags);
131 return;
132 }
133
134 /*
135 * This is an AST for an EX->EX conversion for sync_lvb from GFS.
136 */
137
138 if (test_and_clear_bit(LFL_SYNC_LVB, &lp->flags)) {
139 complete(&lp->ast_wait);
140 return;
141 }
142
143 /*
144 * A lock has been demoted to NL because it initially completed during
145 * BLOCK_LOCKS. Now it must be requested in the originally requested
146 * mode.
147 */
148
149 if (test_and_clear_bit(LFL_REREQUEST, &lp->flags)) {
150 gdlm_assert(lp->req == DLM_LOCK_NL, "%x,%llx",
151 lp->lockname.ln_type,
152 (unsigned long long)lp->lockname.ln_number);
153 gdlm_assert(lp->prev_req > DLM_LOCK_NL, "%x,%llx",
154 lp->lockname.ln_type,
155 (unsigned long long)lp->lockname.ln_number);
156
157 lp->cur = DLM_LOCK_NL;
158 lp->req = lp->prev_req;
159 lp->prev_req = DLM_LOCK_IV;
160 lp->lkf &= ~DLM_LKF_CONVDEADLK;
161
162 set_bit(LFL_NOCACHE, &lp->flags);
163
164 if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) &&
165 !test_bit(LFL_NOBLOCK, &lp->flags))
166 gdlm_queue_delayed(lp);
167 else
168 queue_submit(lp);
169 return;
170 }
171
172 /*
173 * A request is granted during dlm recovery. It may be granted
174 * because the locks of a failed node were cleared. In that case,
175 * there may be inconsistent data beneath this lock and we must wait
176 * for recovery to complete before using it. When gfs recovery is done, this
177 * granted lock will be converted to NL and then reacquired in this
178 * granted state.
179 */
180
181 if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) &&
182 !test_bit(LFL_NOBLOCK, &lp->flags) &&
183 lp->req != DLM_LOCK_NL) {
184
185 lp->cur = lp->req;
186 lp->prev_req = lp->req;
187 lp->req = DLM_LOCK_NL;
188 lp->lkf |= DLM_LKF_CONVERT;
189 lp->lkf &= ~DLM_LKF_CONVDEADLK;
190
191 log_debug("rereq %x,%llx id %x %d,%d",
192 lp->lockname.ln_type,
193 (unsigned long long)lp->lockname.ln_number,
194 lp->lksb.sb_lkid, lp->cur, lp->req);
195
196 set_bit(LFL_REREQUEST, &lp->flags);
197 queue_submit(lp);
198 return;
199 }
200
201 /*
202 * DLM demoted the lock to NL before it was granted so GFS must be
203 * told it cannot cache data for this lock.
204 */
205
206 if (lp->lksb.sb_flags & DLM_SBF_DEMOTED)
207 set_bit(LFL_NOCACHE, &lp->flags);
208
209out:
210 /*
211 * This is an internal lock_dlm lock
212 */
213
214 if (test_bit(LFL_INLOCK, &lp->flags)) {
215 clear_bit(LFL_NOBLOCK, &lp->flags);
216 lp->cur = lp->req;
217 complete(&lp->ast_wait);
218 return;
219 }
220
221 /*
222 * Normal completion of a lock request. Tell GFS it now has the lock.
223 */
224
225 clear_bit(LFL_NOBLOCK, &lp->flags);
226 lp->cur = lp->req;
227
228 acb.lc_name = lp->lockname;
229 acb.lc_ret |= gdlm_make_lmstate(lp->cur);
230
231 if (!test_and_clear_bit(LFL_NOCACHE, &lp->flags) &&
232 (lp->cur > DLM_LOCK_NL) && (prev_mode > DLM_LOCK_NL))
233 acb.lc_ret |= LM_OUT_CACHEABLE;
234
235 ls->fscb(ls->sdp, LM_CB_ASYNC, &acb);
236}
237
238static inline int no_work(struct gdlm_ls *ls, int blocking)
239{
240 int ret;
241
242 spin_lock(&ls->async_lock);
243 ret = list_empty(&ls->complete) && list_empty(&ls->submit);
244 if (ret && blocking)
245 ret = list_empty(&ls->blocking);
246 spin_unlock(&ls->async_lock);
247
248 return ret;
249}
250
251static inline int check_drop(struct gdlm_ls *ls)
252{
253 if (!ls->drop_locks_count)
254 return 0;
255
256 if (time_after(jiffies, ls->drop_time + ls->drop_locks_period * HZ)) {
257 ls->drop_time = jiffies;
258 if (ls->all_locks_count >= ls->drop_locks_count)
259 return 1;
260 }
261 return 0;
262}
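
[Editorial note, not part of the patch] check_drop() above is a rate-limited threshold test: at most once per drop_locks_period seconds it asks whether the lock count has reached drop_locks_count, and it resets the window even when the count is still below the threshold. A minimal user-space sketch of the same pattern, with time(2) standing in for jiffies/HZ and all names illustrative:

	#include <time.h>

	struct throttle {
		time_t last;		/* start of the current window */
		unsigned period;	/* seconds between firings */
		unsigned threshold;	/* fire only at or above this count */
	};

	static int should_fire(struct throttle *t, unsigned count)
	{
		time_t now = time(NULL);

		if (!t->threshold)	/* feature disabled, as in check_drop() */
			return 0;
		if (now - t->last >= (time_t)t->period) {
			t->last = now;	/* window resets even below threshold */
			if (count >= t->threshold)
				return 1;
		}
		return 0;
	}
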
263
264static int gdlm_thread(void *data)
265{
266 struct gdlm_ls *ls = (struct gdlm_ls *) data;
267 struct gdlm_lock *lp = NULL;
268 int blist = 0;
269 uint8_t complete, blocking, submit, drop;
270 DECLARE_WAITQUEUE(wait, current);
271
272 /* Only thread1 is allowed to do blocking callbacks since gfs
273 may wait for a completion callback within a blocking cb. */
274
275 if (current == ls->thread1)
276 blist = 1;
277
278 while (!kthread_should_stop()) {
279 set_current_state(TASK_INTERRUPTIBLE);
280 add_wait_queue(&ls->thread_wait, &wait);
281 if (no_work(ls, blist))
282 schedule();
283 remove_wait_queue(&ls->thread_wait, &wait);
284 set_current_state(TASK_RUNNING);
285
286 complete = blocking = submit = drop = 0;
287
288 spin_lock(&ls->async_lock);
289
290 if (blist && !list_empty(&ls->blocking)) {
291 lp = list_entry(ls->blocking.next, struct gdlm_lock,
292 blist);
293 list_del_init(&lp->blist);
294 blocking = lp->bast_mode;
295 lp->bast_mode = 0;
296 } else if (!list_empty(&ls->complete)) {
297 lp = list_entry(ls->complete.next, struct gdlm_lock,
298 clist);
299 list_del_init(&lp->clist);
300 complete = 1;
301 } else if (!list_empty(&ls->submit)) {
302 lp = list_entry(ls->submit.next, struct gdlm_lock,
303 delay_list);
304 list_del_init(&lp->delay_list);
305 submit = 1;
306 }
307
308 drop = check_drop(ls);
309 spin_unlock(&ls->async_lock);
310
311 if (complete)
312 process_complete(lp);
313
314 else if (blocking)
315 process_blocking(lp, blocking);
316
317 else if (submit)
318 gdlm_do_lock(lp);
319
320 if (drop)
321 ls->fscb(ls->sdp, LM_CB_DROPLOCKS, NULL);
322
323 schedule();
324 }
325
326 return 0;
327}
328
329int gdlm_init_threads(struct gdlm_ls *ls)
330{
331 struct task_struct *p;
332 int error;
333
334 p = kthread_run(gdlm_thread, ls, "lock_dlm1");
335	error = IS_ERR(p) ? PTR_ERR(p) : 0;
336 if (error) {
337 log_error("can't start lock_dlm1 thread %d", error);
338 return error;
339 }
340 ls->thread1 = p;
341
342 p = kthread_run(gdlm_thread, ls, "lock_dlm2");
343	error = IS_ERR(p) ? PTR_ERR(p) : 0;
344 if (error) {
345 log_error("can't start lock_dlm2 thread %d", error);
346 kthread_stop(ls->thread1);
347 return error;
348 }
349 ls->thread2 = p;
350
351 return 0;
352}
353
354void gdlm_release_threads(struct gdlm_ls *ls)
355{
356 kthread_stop(ls->thread1);
357 kthread_stop(ls->thread2);
358}
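
[Editorial note] gdlm_init_threads() above follows the standard kthread lifecycle; as adjusted in the error paths, the caller should hand back the -errno from PTR_ERR() rather than the bare 1 that IS_ERR() yields. A minimal kernel-context sketch of the idiom (my_thread_fn and "my_worker" are placeholders):

	#include <linux/kthread.h>
	#include <linux/err.h>

	static int start_worker(int (*my_thread_fn)(void *), void *my_data,
				struct task_struct **out)
	{
		struct task_struct *p = kthread_run(my_thread_fn, my_data,
						    "my_worker");

		if (IS_ERR(p))
			return PTR_ERR(p);	/* a real -errno, not just 1 */
		*out = p;	/* later: kthread_stop(*out) makes
				 * kthread_should_stop() return true */
		return 0;
	}
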
359
diff --git a/fs/gfs2/locking/nolock/Makefile b/fs/gfs2/locking/nolock/Makefile
new file mode 100644
index 000000000000..35e9730bc3a8
--- /dev/null
+++ b/fs/gfs2/locking/nolock/Makefile
@@ -0,0 +1,3 @@
1obj-$(CONFIG_GFS2_FS_LOCKING_NOLOCK) += lock_nolock.o
2lock_nolock-y := main.o
3
diff --git a/fs/gfs2/locking/nolock/main.c b/fs/gfs2/locking/nolock/main.c
new file mode 100644
index 000000000000..acfbc941f319
--- /dev/null
+++ b/fs/gfs2/locking/nolock/main.c
@@ -0,0 +1,246 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/module.h>
11#include <linux/slab.h>
13#include <linux/init.h>
14#include <linux/types.h>
15#include <linux/fs.h>
16#include <linux/smp_lock.h>
17#include <linux/lm_interface.h>
18
19struct nolock_lockspace {
20 unsigned int nl_lvb_size;
21};
22
23static const struct lm_lockops nolock_ops;
24
25static int nolock_mount(char *table_name, char *host_data,
26 lm_callback_t cb, void *cb_data,
27 unsigned int min_lvb_size, int flags,
28 struct lm_lockstruct *lockstruct,
29 struct kobject *fskobj)
30{
31 char *c;
32 unsigned int jid;
33 struct nolock_lockspace *nl;
34
35 c = strstr(host_data, "jid=");
36 if (!c)
37 jid = 0;
38 else {
39 c += 4;
40 sscanf(c, "%u", &jid);
41 }
42
43 nl = kzalloc(sizeof(struct nolock_lockspace), GFP_KERNEL);
44 if (!nl)
45 return -ENOMEM;
46
47 nl->nl_lvb_size = min_lvb_size;
48
49 lockstruct->ls_jid = jid;
50 lockstruct->ls_first = 1;
51 lockstruct->ls_lvb_size = min_lvb_size;
52 lockstruct->ls_lockspace = nl;
53 lockstruct->ls_ops = &nolock_ops;
54 lockstruct->ls_flags = LM_LSFLAG_LOCAL;
55
56 return 0;
57}
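
[Editorial note] nolock_mount() pulls an optional journal id out of the host data with strstr()/sscanf(), defaulting to 0 when no "jid=" token is present. The same parsing as a standalone user-space helper (a sketch; parse_jid is a hypothetical name, not part of the patch):

	#include <stdio.h>
	#include <string.h>

	static unsigned int parse_jid(const char *host_data)
	{
		unsigned int jid = 0;
		const char *c = strstr(host_data, "jid=");

		if (c)
			sscanf(c + 4, "%u", &jid);	/* skip "jid=" */
		return jid;
	}

	/* parse_jid("hostdata=foo:jid=2") == 2; parse_jid("") == 0 */
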
58
59static void nolock_others_may_mount(void *lockspace)
60{
61}
62
63static void nolock_unmount(void *lockspace)
64{
65 struct nolock_lockspace *nl = lockspace;
66 kfree(nl);
67}
68
69static void nolock_withdraw(void *lockspace)
70{
71}
72
73/**
74	 * nolock_get_lock - get an lm_lock_t given a description of the lock
75 * @lockspace: the lockspace the lock lives in
76 * @name: the name of the lock
77 * @lockp: return the lm_lock_t here
78 *
79 * Returns: 0 on success, -EXXX on failure
80 */
81
82static int nolock_get_lock(void *lockspace, struct lm_lockname *name,
83 void **lockp)
84{
85 *lockp = lockspace;
86 return 0;
87}
88
89/**
90 * nolock_put_lock - get rid of a lock structure
91 * @lock: the lock to throw away
92 *
93 */
94
95static void nolock_put_lock(void *lock)
96{
97}
98
99/**
100 * nolock_lock - acquire a lock
101 * @lock: the lock to manipulate
102 * @cur_state: the current state
103 * @req_state: the requested state
104 * @flags: modifier flags
105 *
106 * Returns: A bitmap of LM_OUT_*
107 */
108
109static unsigned int nolock_lock(void *lock, unsigned int cur_state,
110 unsigned int req_state, unsigned int flags)
111{
112 return req_state | LM_OUT_CACHEABLE;
113}
114
115/**
116 * nolock_unlock - unlock a lock
117 * @lock: the lock to manipulate
118 * @cur_state: the current state
119 *
120 * Returns: 0
121 */
122
123static unsigned int nolock_unlock(void *lock, unsigned int cur_state)
124{
125 return 0;
126}
127
128static void nolock_cancel(void *lock)
129{
130}
131
132/**
133 * nolock_hold_lvb - hold on to a lock value block
134 * @lock: the lock the LVB is associated with
135 * @lvbp: return the lm_lvb_t here
136 *
137 * Returns: 0 on success, -EXXX on failure
138 */
139
140static int nolock_hold_lvb(void *lock, char **lvbp)
141{
142 struct nolock_lockspace *nl = lock;
143 int error = 0;
144
145 *lvbp = kzalloc(nl->nl_lvb_size, GFP_KERNEL);
146 if (!*lvbp)
147 error = -ENOMEM;
148
149 return error;
150}
151
152/**
153 * nolock_unhold_lvb - release a LVB
154 * @lock: the lock the LVB is associated with
155 * @lvb: the lock value block
156 *
157 */
158
159static void nolock_unhold_lvb(void *lock, char *lvb)
160{
161 kfree(lvb);
162}
163
164static int nolock_plock_get(void *lockspace, struct lm_lockname *name,
165 struct file *file, struct file_lock *fl)
166{
167 struct file_lock tmp;
168 int ret;
169
170 ret = posix_test_lock(file, fl, &tmp);
171 fl->fl_type = F_UNLCK;
172 if (ret)
173 memcpy(fl, &tmp, sizeof(struct file_lock));
174
175 return 0;
176}
177
178static int nolock_plock(void *lockspace, struct lm_lockname *name,
179 struct file *file, int cmd, struct file_lock *fl)
180{
181 int error;
182 error = posix_lock_file_wait(file, fl);
183 return error;
184}
185
186static int nolock_punlock(void *lockspace, struct lm_lockname *name,
187 struct file *file, struct file_lock *fl)
188{
189 int error;
190 error = posix_lock_file_wait(file, fl);
191 return error;
192}
193
194static void nolock_recovery_done(void *lockspace, unsigned int jid,
195 unsigned int message)
196{
197}
198
199static const struct lm_lockops nolock_ops = {
200 .lm_proto_name = "lock_nolock",
201 .lm_mount = nolock_mount,
202 .lm_others_may_mount = nolock_others_may_mount,
203 .lm_unmount = nolock_unmount,
204 .lm_withdraw = nolock_withdraw,
205 .lm_get_lock = nolock_get_lock,
206 .lm_put_lock = nolock_put_lock,
207 .lm_lock = nolock_lock,
208 .lm_unlock = nolock_unlock,
209 .lm_cancel = nolock_cancel,
210 .lm_hold_lvb = nolock_hold_lvb,
211 .lm_unhold_lvb = nolock_unhold_lvb,
212 .lm_plock_get = nolock_plock_get,
213 .lm_plock = nolock_plock,
214 .lm_punlock = nolock_punlock,
215 .lm_recovery_done = nolock_recovery_done,
216 .lm_owner = THIS_MODULE,
217};
218
219static int __init init_nolock(void)
220{
221 int error;
222
223 error = gfs2_register_lockproto(&nolock_ops);
224 if (error) {
225 printk(KERN_WARNING
226 "lock_nolock: can't register protocol: %d\n", error);
227 return error;
228 }
229
230 printk(KERN_INFO
231 "Lock_Nolock (built %s %s) installed\n", __DATE__, __TIME__);
232 return 0;
233}
234
235static void __exit exit_nolock(void)
236{
237 gfs2_unregister_lockproto(&nolock_ops);
238}
239
240module_init(init_nolock);
241module_exit(exit_nolock);
242
243MODULE_DESCRIPTION("GFS Nolock Locking Module");
244MODULE_AUTHOR("Red Hat, Inc.");
245MODULE_LICENSE("GPL");
246
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
new file mode 100644
index 000000000000..0cace3da9dbb
--- /dev/null
+++ b/fs/gfs2/log.c
@@ -0,0 +1,688 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/gfs2_ondisk.h>
16#include <linux/crc32.h>
17#include <linux/lm_interface.h>
18
19#include "gfs2.h"
20#include "incore.h"
21#include "bmap.h"
22#include "glock.h"
23#include "log.h"
24#include "lops.h"
25#include "meta_io.h"
26#include "util.h"
27#include "dir.h"
28
29#define PULL 1
30
31/**
32	 * gfs2_struct2blk - compute the number of log descriptor blocks needed
33 * @sdp: the filesystem
34 * @nstruct: the number of structures
35 * @ssize: the size of the structures
36 *
37 * Compute the number of log descriptor blocks needed to hold a certain number
38 * of structures of a certain size.
39 *
40 * Returns: the number of blocks needed (minimum is always 1)
41 */
42
43unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct,
44 unsigned int ssize)
45{
46 unsigned int blks;
47 unsigned int first, second;
48
49 blks = 1;
50 first = (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_log_descriptor)) / ssize;
51
52 if (nstruct > first) {
53 second = (sdp->sd_sb.sb_bsize -
54 sizeof(struct gfs2_meta_header)) / ssize;
55 blks += DIV_ROUND_UP(nstruct - first, second);
56 }
57
58 return blks;
59}
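
[Editorial note] A worked example, with the struct sizes inferred from the "for 4k blocks, limit = 503" comment in lops.c later in this patch rather than stated here: with 4 KiB blocks, the descriptor block holds first = (4096 - 72)/8 = 503 u64 entries, and each continuation block, headed by a 24-byte gfs2_meta_header, holds second = (4096 - 24)/8 = 509. So for nstruct = 1200 revoke entries of ssize = sizeof(u64), blks = 1 + DIV_ROUND_UP(1200 - 503, 509) = 1 + 2 = 3 log blocks.
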
60
61/**
62 * gfs2_ail1_start_one - Start I/O on a part of the AIL
63 * @sdp: the filesystem
64	 * @ai: the AIL entry
65 *
66 */
67
68static void gfs2_ail1_start_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
69{
70 struct gfs2_bufdata *bd, *s;
71 struct buffer_head *bh;
72 int retry;
73
74 BUG_ON(!spin_is_locked(&sdp->sd_log_lock));
75
76 do {
77 retry = 0;
78
79 list_for_each_entry_safe_reverse(bd, s, &ai->ai_ail1_list,
80 bd_ail_st_list) {
81 bh = bd->bd_bh;
82
83 gfs2_assert(sdp, bd->bd_ail == ai);
84
85 if (!buffer_busy(bh)) {
86 if (!buffer_uptodate(bh)) {
87 gfs2_log_unlock(sdp);
88 gfs2_io_error_bh(sdp, bh);
89 gfs2_log_lock(sdp);
90 }
91 list_move(&bd->bd_ail_st_list, &ai->ai_ail2_list);
92 continue;
93 }
94
95 if (!buffer_dirty(bh))
96 continue;
97
98 list_move(&bd->bd_ail_st_list, &ai->ai_ail1_list);
99
100 gfs2_log_unlock(sdp);
101 wait_on_buffer(bh);
102 ll_rw_block(WRITE, 1, &bh);
103 gfs2_log_lock(sdp);
104
105 retry = 1;
106 break;
107 }
108 } while (retry);
109}
110
111/**
112 * gfs2_ail1_empty_one - Check whether or not a trans in the AIL has been synced
113 * @sdp: the filesystem
114 * @ai: the AIL entry
115 *
116 */
117
118static int gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai, int flags)
119{
120 struct gfs2_bufdata *bd, *s;
121 struct buffer_head *bh;
122
123 list_for_each_entry_safe_reverse(bd, s, &ai->ai_ail1_list,
124 bd_ail_st_list) {
125 bh = bd->bd_bh;
126
127 gfs2_assert(sdp, bd->bd_ail == ai);
128
129 if (buffer_busy(bh)) {
130 if (flags & DIO_ALL)
131 continue;
132 else
133 break;
134 }
135
136 if (!buffer_uptodate(bh))
137 gfs2_io_error_bh(sdp, bh);
138
139 list_move(&bd->bd_ail_st_list, &ai->ai_ail2_list);
140 }
141
142 return list_empty(&ai->ai_ail1_list);
143}
144
145void gfs2_ail1_start(struct gfs2_sbd *sdp, int flags)
146{
147 struct list_head *head = &sdp->sd_ail1_list;
148 u64 sync_gen;
149 struct list_head *first;
150 struct gfs2_ail *first_ai, *ai, *tmp;
151 int done = 0;
152
153 gfs2_log_lock(sdp);
154 if (list_empty(head)) {
155 gfs2_log_unlock(sdp);
156 return;
157 }
158 sync_gen = sdp->sd_ail_sync_gen++;
159
160 first = head->prev;
161 first_ai = list_entry(first, struct gfs2_ail, ai_list);
162 first_ai->ai_sync_gen = sync_gen;
163 gfs2_ail1_start_one(sdp, first_ai); /* This may drop log lock */
164
165 if (flags & DIO_ALL)
166 first = NULL;
167
168 while(!done) {
169 if (first && (head->prev != first ||
170 gfs2_ail1_empty_one(sdp, first_ai, 0)))
171 break;
172
173 done = 1;
174 list_for_each_entry_safe_reverse(ai, tmp, head, ai_list) {
175 if (ai->ai_sync_gen >= sync_gen)
176 continue;
177 ai->ai_sync_gen = sync_gen;
178 gfs2_ail1_start_one(sdp, ai); /* This may drop log lock */
179 done = 0;
180 break;
181 }
182 }
183
184 gfs2_log_unlock(sdp);
185}
186
187int gfs2_ail1_empty(struct gfs2_sbd *sdp, int flags)
188{
189 struct gfs2_ail *ai, *s;
190 int ret;
191
192 gfs2_log_lock(sdp);
193
194 list_for_each_entry_safe_reverse(ai, s, &sdp->sd_ail1_list, ai_list) {
195 if (gfs2_ail1_empty_one(sdp, ai, flags))
196 list_move(&ai->ai_list, &sdp->sd_ail2_list);
197 else if (!(flags & DIO_ALL))
198 break;
199 }
200
201 ret = list_empty(&sdp->sd_ail1_list);
202
203 gfs2_log_unlock(sdp);
204
205 return ret;
206}
207
208
209/**
210	 * gfs2_ail2_empty_one - Release the buffers of a fully synced AIL entry
211 * @sdp: the filesystem
212 * @ai: the AIL entry
213 *
214 */
215
216static void gfs2_ail2_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
217{
218 struct list_head *head = &ai->ai_ail2_list;
219 struct gfs2_bufdata *bd;
220
221 while (!list_empty(head)) {
222 bd = list_entry(head->prev, struct gfs2_bufdata,
223 bd_ail_st_list);
224 gfs2_assert(sdp, bd->bd_ail == ai);
225 bd->bd_ail = NULL;
226 list_del(&bd->bd_ail_st_list);
227 list_del(&bd->bd_ail_gl_list);
228 atomic_dec(&bd->bd_gl->gl_ail_count);
229 brelse(bd->bd_bh);
230 }
231}
232
233static void ail2_empty(struct gfs2_sbd *sdp, unsigned int new_tail)
234{
235 struct gfs2_ail *ai, *safe;
236 unsigned int old_tail = sdp->sd_log_tail;
237 int wrap = (new_tail < old_tail);
238 int a, b, rm;
239
240 gfs2_log_lock(sdp);
241
242 list_for_each_entry_safe(ai, safe, &sdp->sd_ail2_list, ai_list) {
243 a = (old_tail <= ai->ai_first);
244 b = (ai->ai_first < new_tail);
245 rm = (wrap) ? (a || b) : (a && b);
246 if (!rm)
247 continue;
248
249 gfs2_ail2_empty_one(sdp, ai);
250 list_del(&ai->ai_list);
251 gfs2_assert_warn(sdp, list_empty(&ai->ai_ail1_list));
252 gfs2_assert_warn(sdp, list_empty(&ai->ai_ail2_list));
253 kfree(ai);
254 }
255
256 gfs2_log_unlock(sdp);
257}
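
[Editorial note] ail2_empty() frees an AIL entry when its first block falls in the half-open span the tail just moved across, [old_tail, new_tail); when that span wraps past block 0 the containment test flips from AND to OR. The test in isolation (a sketch, names illustrative):

	/* Nonzero when blk lies in [old_tail, new_tail) on a circular
	 * journal; the interval wraps when new_tail < old_tail. */
	static int in_tail_range(unsigned int blk, unsigned int old_tail,
				 unsigned int new_tail)
	{
		int a = (old_tail <= blk);
		int b = (blk < new_tail);

		return (new_tail < old_tail) ? (a || b) : (a && b);
	}

	/* e.g. old_tail = 8000, new_tail = 100 on an 8192-block journal:
	 * blocks 8000..8191 and 0..99 are in range; block 4000 is not. */
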
258
259/**
260 * gfs2_log_reserve - Make a log reservation
261 * @sdp: The GFS2 superblock
262 * @blks: The number of blocks to reserve
263 *
264 * Returns: errno
265 */
266
267int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks)
268{
269 unsigned int try = 0;
270
271 if (gfs2_assert_warn(sdp, blks) ||
272 gfs2_assert_warn(sdp, blks <= sdp->sd_jdesc->jd_blocks))
273 return -EINVAL;
274
275 mutex_lock(&sdp->sd_log_reserve_mutex);
276 gfs2_log_lock(sdp);
277 while(sdp->sd_log_blks_free <= blks) {
278 gfs2_log_unlock(sdp);
279 gfs2_ail1_empty(sdp, 0);
280 gfs2_log_flush(sdp, NULL);
281
282 if (try++)
283 gfs2_ail1_start(sdp, 0);
284 gfs2_log_lock(sdp);
285 }
286 sdp->sd_log_blks_free -= blks;
287 gfs2_log_unlock(sdp);
288 mutex_unlock(&sdp->sd_log_reserve_mutex);
289
290 down_read(&sdp->sd_log_flush_lock);
291
292 return 0;
293}
294
295/**
296 * gfs2_log_release - Release a given number of log blocks
297 * @sdp: The GFS2 superblock
298 * @blks: The number of blocks
299 *
300 */
301
302void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks)
303{
304
305 gfs2_log_lock(sdp);
306 sdp->sd_log_blks_free += blks;
307 gfs2_assert_withdraw(sdp,
308 sdp->sd_log_blks_free <= sdp->sd_jdesc->jd_blocks);
309 gfs2_log_unlock(sdp);
310 up_read(&sdp->sd_log_flush_lock);
311}
312
313static u64 log_bmap(struct gfs2_sbd *sdp, unsigned int lbn)
314{
315 struct inode *inode = sdp->sd_jdesc->jd_inode;
316 int error;
317 struct buffer_head bh_map = { .b_state = 0, .b_blocknr = 0 };
318
319 bh_map.b_size = 1 << inode->i_blkbits;
320 error = gfs2_block_map(inode, lbn, 0, &bh_map);
321 if (error || !bh_map.b_blocknr)
322		printk(KERN_INFO "error=%d, dbn=%llu lbn=%u\n", error, (unsigned long long)bh_map.b_blocknr, lbn);
323 gfs2_assert_withdraw(sdp, !error && bh_map.b_blocknr);
324
325 return bh_map.b_blocknr;
326}
327
328/**
329 * log_distance - Compute distance between two journal blocks
330 * @sdp: The GFS2 superblock
331 * @newer: The most recent journal block of the pair
332 * @older: The older journal block of the pair
333 *
334 * Compute the distance (in the journal direction) between two
335 * blocks in the journal
336 *
337 * Returns: the distance in blocks
338 */
339
340static inline unsigned int log_distance(struct gfs2_sbd *sdp, unsigned int newer,
341 unsigned int older)
342{
343 int dist;
344
345 dist = newer - older;
346 if (dist < 0)
347 dist += sdp->sd_jdesc->jd_blocks;
348
349 return dist;
350}
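
[Editorial note] Worked example: with jd_blocks = 8192, log_distance(sdp, 10, 8100) first computes 10 - 8100 = -8090; the wrap correction adds 8192, giving a distance of 102 blocks in the journal direction.
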
351
352static unsigned int current_tail(struct gfs2_sbd *sdp)
353{
354 struct gfs2_ail *ai;
355 unsigned int tail;
356
357 gfs2_log_lock(sdp);
358
359 if (list_empty(&sdp->sd_ail1_list)) {
360 tail = sdp->sd_log_head;
361 } else {
362 ai = list_entry(sdp->sd_ail1_list.prev, struct gfs2_ail, ai_list);
363 tail = ai->ai_first;
364 }
365
366 gfs2_log_unlock(sdp);
367
368 return tail;
369}
370
371static inline void log_incr_head(struct gfs2_sbd *sdp)
372{
373 if (sdp->sd_log_flush_head == sdp->sd_log_tail)
374 gfs2_assert_withdraw(sdp, sdp->sd_log_flush_head == sdp->sd_log_head);
375
376 if (++sdp->sd_log_flush_head == sdp->sd_jdesc->jd_blocks) {
377 sdp->sd_log_flush_head = 0;
378 sdp->sd_log_flush_wrapped = 1;
379 }
380}
381
382/**
383 * gfs2_log_get_buf - Get and initialize a buffer to use for log control data
384 * @sdp: The GFS2 superblock
385 *
386 * Returns: the buffer_head
387 */
388
389struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp)
390{
391 u64 blkno = log_bmap(sdp, sdp->sd_log_flush_head);
392 struct gfs2_log_buf *lb;
393 struct buffer_head *bh;
394
395 lb = kzalloc(sizeof(struct gfs2_log_buf), GFP_NOFS | __GFP_NOFAIL);
396 list_add(&lb->lb_list, &sdp->sd_log_flush_list);
397
398 bh = lb->lb_bh = sb_getblk(sdp->sd_vfs, blkno);
399 lock_buffer(bh);
400 memset(bh->b_data, 0, bh->b_size);
401 set_buffer_uptodate(bh);
402 clear_buffer_dirty(bh);
403 unlock_buffer(bh);
404
405 log_incr_head(sdp);
406
407 return bh;
408}
409
410/**
411 * gfs2_log_fake_buf - Build a fake buffer head to write metadata buffer to log
412 * @sdp: the filesystem
413	 * @real: the buffer_head whose data is to be written to the log
414	 *
415	 * Returns: the fake buffer_head aliasing @real's data
416 */
417
418struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp,
419 struct buffer_head *real)
420{
421 u64 blkno = log_bmap(sdp, sdp->sd_log_flush_head);
422 struct gfs2_log_buf *lb;
423 struct buffer_head *bh;
424
425 lb = kzalloc(sizeof(struct gfs2_log_buf), GFP_NOFS | __GFP_NOFAIL);
426 list_add(&lb->lb_list, &sdp->sd_log_flush_list);
427 lb->lb_real = real;
428
429 bh = lb->lb_bh = alloc_buffer_head(GFP_NOFS | __GFP_NOFAIL);
430 atomic_set(&bh->b_count, 1);
431 bh->b_state = (1 << BH_Mapped) | (1 << BH_Uptodate);
432 set_bh_page(bh, real->b_page, bh_offset(real));
433 bh->b_blocknr = blkno;
434 bh->b_size = sdp->sd_sb.sb_bsize;
435 bh->b_bdev = sdp->sd_vfs->s_bdev;
436
437 log_incr_head(sdp);
438
439 return bh;
440}
441
442static void log_pull_tail(struct gfs2_sbd *sdp, unsigned int new_tail, int pull)
443{
444 unsigned int dist = log_distance(sdp, new_tail, sdp->sd_log_tail);
445
446 ail2_empty(sdp, new_tail);
447
448 gfs2_log_lock(sdp);
449 sdp->sd_log_blks_free += dist - (pull ? 1 : 0);
450 gfs2_assert_withdraw(sdp, sdp->sd_log_blks_free <= sdp->sd_jdesc->jd_blocks);
451 gfs2_log_unlock(sdp);
452
453 sdp->sd_log_tail = new_tail;
454}
455
456/**
457	 * log_write_header - Write a journal header at the current flush head
458	 * @sdp: The GFS2 superblock
459	 *
460	 * Writes the header synchronously and pulls the log tail when needed.
461 */
462
463static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull)
464{
465 u64 blkno = log_bmap(sdp, sdp->sd_log_flush_head);
466 struct buffer_head *bh;
467 struct gfs2_log_header *lh;
468 unsigned int tail;
469 u32 hash;
470
471 bh = sb_getblk(sdp->sd_vfs, blkno);
472 lock_buffer(bh);
473 memset(bh->b_data, 0, bh->b_size);
474 set_buffer_uptodate(bh);
475 clear_buffer_dirty(bh);
476 unlock_buffer(bh);
477
478 gfs2_ail1_empty(sdp, 0);
479 tail = current_tail(sdp);
480
481 lh = (struct gfs2_log_header *)bh->b_data;
482 memset(lh, 0, sizeof(struct gfs2_log_header));
483 lh->lh_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
484 lh->lh_header.mh_type = cpu_to_be32(GFS2_METATYPE_LH);
485 lh->lh_header.mh_format = cpu_to_be32(GFS2_FORMAT_LH);
486 lh->lh_sequence = cpu_to_be64(sdp->sd_log_sequence++);
487 lh->lh_flags = cpu_to_be32(flags);
488 lh->lh_tail = cpu_to_be32(tail);
489 lh->lh_blkno = cpu_to_be32(sdp->sd_log_flush_head);
490 hash = gfs2_disk_hash(bh->b_data, sizeof(struct gfs2_log_header));
491 lh->lh_hash = cpu_to_be32(hash);
492
493 set_buffer_dirty(bh);
494 if (sync_dirty_buffer(bh))
495 gfs2_io_error_bh(sdp, bh);
496 brelse(bh);
497
498 if (sdp->sd_log_tail != tail)
499 log_pull_tail(sdp, tail, pull);
500 else
501 gfs2_assert_withdraw(sdp, !pull);
502
503 sdp->sd_log_idle = (tail == sdp->sd_log_flush_head);
504 log_incr_head(sdp);
505}
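
[Editorial note] The header checksum above is computed while lh_hash is still zero from the memset, and only then stored into the block, so a verifier re-zeroes the field before re-hashing. A kernel-context sketch of that "checksum with the checksum field zeroed" pattern; that gfs2_disk_hash() is crc32_le with ~0 pre/post-conditioning is an assumption here, not stated in this patch:

	#include <linux/crc32.h>

	static u32 lh_hash_of(struct gfs2_log_header *lh)
	{
		__be32 saved = lh->lh_hash;
		u32 hash;

		lh->lh_hash = 0;	/* field is zero while being hashed */
		hash = crc32_le(0xffffffff, (unsigned char *)lh,
				sizeof(*lh)) ^ 0xffffffff;
		lh->lh_hash = saved;
		return hash;
	}
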
506
507static void log_flush_commit(struct gfs2_sbd *sdp)
508{
509 struct list_head *head = &sdp->sd_log_flush_list;
510 struct gfs2_log_buf *lb;
511 struct buffer_head *bh;
512
513 while (!list_empty(head)) {
514 lb = list_entry(head->next, struct gfs2_log_buf, lb_list);
515 list_del(&lb->lb_list);
516 bh = lb->lb_bh;
517
518 wait_on_buffer(bh);
519 if (!buffer_uptodate(bh))
520 gfs2_io_error_bh(sdp, bh);
521 if (lb->lb_real) {
522 while (atomic_read(&bh->b_count) != 1) /* Grrrr... */
523 schedule();
524 free_buffer_head(bh);
525 } else
526 brelse(bh);
527 kfree(lb);
528 }
529
530 log_write_header(sdp, 0, 0);
531}
532
533/**
534 * gfs2_log_flush - flush incore transaction(s)
535 * @sdp: the filesystem
536 * @gl: The glock structure to flush. If NULL, flush the whole incore log
537 *
538 */
539
540void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
541{
542 struct gfs2_ail *ai;
543
544 down_write(&sdp->sd_log_flush_lock);
545
546 if (gl) {
547 gfs2_log_lock(sdp);
548 if (list_empty(&gl->gl_le.le_list)) {
549 gfs2_log_unlock(sdp);
550 up_write(&sdp->sd_log_flush_lock);
551 return;
552 }
553 gfs2_log_unlock(sdp);
554 }
555
556 ai = kzalloc(sizeof(struct gfs2_ail), GFP_NOFS | __GFP_NOFAIL);
557 INIT_LIST_HEAD(&ai->ai_ail1_list);
558 INIT_LIST_HEAD(&ai->ai_ail2_list);
559
560 gfs2_assert_withdraw(sdp, sdp->sd_log_num_buf == sdp->sd_log_commited_buf);
561 gfs2_assert_withdraw(sdp,
562 sdp->sd_log_num_revoke == sdp->sd_log_commited_revoke);
563
564 sdp->sd_log_flush_head = sdp->sd_log_head;
565 sdp->sd_log_flush_wrapped = 0;
566 ai->ai_first = sdp->sd_log_flush_head;
567
568 lops_before_commit(sdp);
569 if (!list_empty(&sdp->sd_log_flush_list))
570 log_flush_commit(sdp);
571 else if (sdp->sd_log_tail != current_tail(sdp) && !sdp->sd_log_idle)
572 log_write_header(sdp, 0, PULL);
573 lops_after_commit(sdp, ai);
574
575 gfs2_log_lock(sdp);
576 sdp->sd_log_head = sdp->sd_log_flush_head;
577 sdp->sd_log_blks_free -= sdp->sd_log_num_hdrs;
578 sdp->sd_log_blks_reserved = 0;
579 sdp->sd_log_commited_buf = 0;
580 sdp->sd_log_num_hdrs = 0;
581 sdp->sd_log_commited_revoke = 0;
582
583 if (!list_empty(&ai->ai_ail1_list)) {
584 list_add(&ai->ai_list, &sdp->sd_ail1_list);
585 ai = NULL;
586 }
587 gfs2_log_unlock(sdp);
588
589 sdp->sd_vfs->s_dirt = 0;
590 up_write(&sdp->sd_log_flush_lock);
591
592 kfree(ai);
593}
594
595static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
596{
597 unsigned int reserved = 0;
598 unsigned int old;
599
600 gfs2_log_lock(sdp);
601
602 sdp->sd_log_commited_buf += tr->tr_num_buf_new - tr->tr_num_buf_rm;
603 gfs2_assert_withdraw(sdp, ((int)sdp->sd_log_commited_buf) >= 0);
604 sdp->sd_log_commited_revoke += tr->tr_num_revoke - tr->tr_num_revoke_rm;
605 gfs2_assert_withdraw(sdp, ((int)sdp->sd_log_commited_revoke) >= 0);
606
607 if (sdp->sd_log_commited_buf)
608 reserved += sdp->sd_log_commited_buf;
609 if (sdp->sd_log_commited_revoke)
610 reserved += gfs2_struct2blk(sdp, sdp->sd_log_commited_revoke,
611 sizeof(u64));
612 if (reserved)
613 reserved++;
614
615 old = sdp->sd_log_blks_free;
616 sdp->sd_log_blks_free += tr->tr_reserved -
617 (reserved - sdp->sd_log_blks_reserved);
618
619 gfs2_assert_withdraw(sdp, sdp->sd_log_blks_free >= old);
620 gfs2_assert_withdraw(sdp,
621 sdp->sd_log_blks_free <= sdp->sd_jdesc->jd_blocks +
622 sdp->sd_log_num_hdrs);
623
624 sdp->sd_log_blks_reserved = reserved;
625
626 gfs2_log_unlock(sdp);
627}
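
[Editorial note] log_refund() hands back what a transaction reserved but did not use. Worked example: suppose tr_reserved = 10, the transaction committed 4 new buffers and 2 revokes, and nothing was previously reserved. Then reserved = 4 + gfs2_struct2blk(sdp, 2, sizeof(u64)) + 1 = 4 + 1 + 1 = 6, and sd_log_blks_free grows by 10 - (6 - 0) = 4 blocks.
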
628
629/**
630 * gfs2_log_commit - Commit a transaction to the log
631 * @sdp: the filesystem
632 * @tr: the transaction
633 *
634 * Returns: errno
635 */
636
637void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
638{
639 log_refund(sdp, tr);
640 lops_incore_commit(sdp, tr);
641
642 sdp->sd_vfs->s_dirt = 1;
643 up_read(&sdp->sd_log_flush_lock);
644
645 gfs2_log_lock(sdp);
646 if (sdp->sd_log_num_buf > gfs2_tune_get(sdp, gt_incore_log_blocks)) {
647 gfs2_log_unlock(sdp);
648 gfs2_log_flush(sdp, NULL);
649 } else {
650 gfs2_log_unlock(sdp);
651 }
652}
653
654/**
655 * gfs2_log_shutdown - write a shutdown header into a journal
656 * @sdp: the filesystem
657 *
658 */
659
660void gfs2_log_shutdown(struct gfs2_sbd *sdp)
661{
662 down_write(&sdp->sd_log_flush_lock);
663
664 gfs2_assert_withdraw(sdp, !sdp->sd_log_blks_reserved);
665 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_gl);
666 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_buf);
667 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_jdata);
668 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke);
669 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_rg);
670 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_databuf);
671 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_hdrs);
672 gfs2_assert_withdraw(sdp, list_empty(&sdp->sd_ail1_list));
673
674 sdp->sd_log_flush_head = sdp->sd_log_head;
675 sdp->sd_log_flush_wrapped = 0;
676
677 log_write_header(sdp, GFS2_LOG_HEAD_UNMOUNT, 0);
678
679 gfs2_assert_warn(sdp, sdp->sd_log_blks_free == sdp->sd_jdesc->jd_blocks);
680 gfs2_assert_warn(sdp, sdp->sd_log_head == sdp->sd_log_tail);
681 gfs2_assert_warn(sdp, list_empty(&sdp->sd_ail2_list));
682
683 sdp->sd_log_head = sdp->sd_log_flush_head;
684 sdp->sd_log_tail = sdp->sd_log_head;
685
686 up_write(&sdp->sd_log_flush_lock);
687}
688
diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h
new file mode 100644
index 000000000000..7f5737d55612
--- /dev/null
+++ b/fs/gfs2/log.h
@@ -0,0 +1,65 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __LOG_DOT_H__
11#define __LOG_DOT_H__
12
13#include <linux/list.h>
14#include <linux/spinlock.h>
15#include "incore.h"
16
17/**
18 * gfs2_log_lock - acquire the right to mess with the log manager
19 * @sdp: the filesystem
20 *
21 */
22
23static inline void gfs2_log_lock(struct gfs2_sbd *sdp)
24{
25 spin_lock(&sdp->sd_log_lock);
26}
27
28/**
29 * gfs2_log_unlock - release the right to mess with the log manager
30 * @sdp: the filesystem
31 *
32 */
33
34static inline void gfs2_log_unlock(struct gfs2_sbd *sdp)
35{
36 spin_unlock(&sdp->sd_log_lock);
37}
38
39static inline void gfs2_log_pointers_init(struct gfs2_sbd *sdp,
40 unsigned int value)
41{
42 if (++value == sdp->sd_jdesc->jd_blocks) {
43 value = 0;
44 }
45 sdp->sd_log_head = sdp->sd_log_tail = value;
46}
47
48unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct,
49 unsigned int ssize);
50
51void gfs2_ail1_start(struct gfs2_sbd *sdp, int flags);
52int gfs2_ail1_empty(struct gfs2_sbd *sdp, int flags);
53
54int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks);
55void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks);
56
57struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp);
58struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp,
59 struct buffer_head *real);
60void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl);
61void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans);
62
63void gfs2_log_shutdown(struct gfs2_sbd *sdp);
64
65#endif /* __LOG_DOT_H__ */
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
new file mode 100644
index 000000000000..ab6d1115f95d
--- /dev/null
+++ b/fs/gfs2/lops.c
@@ -0,0 +1,809 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/gfs2_ondisk.h>
16#include <linux/lm_interface.h>
17
18#include "gfs2.h"
19#include "incore.h"
20#include "glock.h"
21#include "log.h"
22#include "lops.h"
23#include "meta_io.h"
24#include "recovery.h"
25#include "rgrp.h"
26#include "trans.h"
27#include "util.h"
28
29static void glock_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
30{
31 struct gfs2_glock *gl;
32 struct gfs2_trans *tr = current->journal_info;
33
34 tr->tr_touched = 1;
35
36 if (!list_empty(&le->le_list))
37 return;
38
39 gl = container_of(le, struct gfs2_glock, gl_le);
40 if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(gl)))
41 return;
42 gfs2_glock_hold(gl);
43 set_bit(GLF_DIRTY, &gl->gl_flags);
44
45 gfs2_log_lock(sdp);
46 sdp->sd_log_num_gl++;
47 list_add(&le->le_list, &sdp->sd_log_le_gl);
48 gfs2_log_unlock(sdp);
49}
50
51static void glock_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
52{
53 struct list_head *head = &sdp->sd_log_le_gl;
54 struct gfs2_glock *gl;
55
56 while (!list_empty(head)) {
57 gl = list_entry(head->next, struct gfs2_glock, gl_le.le_list);
58 list_del_init(&gl->gl_le.le_list);
59 sdp->sd_log_num_gl--;
60
61 gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(gl));
62 gfs2_glock_put(gl);
63 }
64 gfs2_assert_warn(sdp, !sdp->sd_log_num_gl);
65}
66
67static void buf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
68{
69 struct gfs2_bufdata *bd = container_of(le, struct gfs2_bufdata, bd_le);
70 struct gfs2_trans *tr;
71
72 if (!list_empty(&bd->bd_list_tr))
73 return;
74
75 tr = current->journal_info;
76 tr->tr_touched = 1;
77 tr->tr_num_buf++;
78 list_add(&bd->bd_list_tr, &tr->tr_list_buf);
79
80 if (!list_empty(&le->le_list))
81 return;
82
83 gfs2_trans_add_gl(bd->bd_gl);
84
85 gfs2_meta_check(sdp, bd->bd_bh);
86 gfs2_pin(sdp, bd->bd_bh);
87
88 gfs2_log_lock(sdp);
89 sdp->sd_log_num_buf++;
90 list_add(&le->le_list, &sdp->sd_log_le_buf);
91 gfs2_log_unlock(sdp);
92
93 tr->tr_num_buf_new++;
94}
95
96static void buf_lo_incore_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
97{
98 struct list_head *head = &tr->tr_list_buf;
99 struct gfs2_bufdata *bd;
100
101 while (!list_empty(head)) {
102 bd = list_entry(head->next, struct gfs2_bufdata, bd_list_tr);
103 list_del_init(&bd->bd_list_tr);
104 tr->tr_num_buf--;
105 }
106 gfs2_assert_warn(sdp, !tr->tr_num_buf);
107}
108
109static void buf_lo_before_commit(struct gfs2_sbd *sdp)
110{
111 struct buffer_head *bh;
112 struct gfs2_log_descriptor *ld;
113 struct gfs2_bufdata *bd1 = NULL, *bd2;
114 unsigned int total = sdp->sd_log_num_buf;
115 unsigned int offset = sizeof(struct gfs2_log_descriptor);
116 unsigned int limit;
117 unsigned int num;
118 unsigned n;
119 __be64 *ptr;
120
121 offset += sizeof(__be64) - 1;
122 offset &= ~(sizeof(__be64) - 1);
123 limit = (sdp->sd_sb.sb_bsize - offset)/sizeof(__be64);
124 /* for 4k blocks, limit = 503 */
125
126 bd1 = bd2 = list_prepare_entry(bd1, &sdp->sd_log_le_buf, bd_le.le_list);
127 while(total) {
128 num = total;
129 if (total > limit)
130 num = limit;
131 bh = gfs2_log_get_buf(sdp);
132 sdp->sd_log_num_hdrs++;
133 ld = (struct gfs2_log_descriptor *)bh->b_data;
134 ptr = (__be64 *)(bh->b_data + offset);
135 ld->ld_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
136 ld->ld_header.mh_type = cpu_to_be32(GFS2_METATYPE_LD);
137 ld->ld_header.mh_format = cpu_to_be32(GFS2_FORMAT_LD);
138 ld->ld_type = cpu_to_be32(GFS2_LOG_DESC_METADATA);
139 ld->ld_length = cpu_to_be32(num + 1);
140 ld->ld_data1 = cpu_to_be32(num);
141 ld->ld_data2 = cpu_to_be32(0);
142 memset(ld->ld_reserved, 0, sizeof(ld->ld_reserved));
143
144 n = 0;
145 list_for_each_entry_continue(bd1, &sdp->sd_log_le_buf,
146 bd_le.le_list) {
147 *ptr++ = cpu_to_be64(bd1->bd_bh->b_blocknr);
148 if (++n >= num)
149 break;
150 }
151
152 set_buffer_dirty(bh);
153 ll_rw_block(WRITE, 1, &bh);
154
155 n = 0;
156 list_for_each_entry_continue(bd2, &sdp->sd_log_le_buf,
157 bd_le.le_list) {
158 bh = gfs2_log_fake_buf(sdp, bd2->bd_bh);
159 set_buffer_dirty(bh);
160 ll_rw_block(WRITE, 1, &bh);
161 if (++n >= num)
162 break;
163 }
164
165 total -= num;
166 }
167}
168
169static void buf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
170{
171 struct list_head *head = &sdp->sd_log_le_buf;
172 struct gfs2_bufdata *bd;
173
174 while (!list_empty(head)) {
175 bd = list_entry(head->next, struct gfs2_bufdata, bd_le.le_list);
176 list_del_init(&bd->bd_le.le_list);
177 sdp->sd_log_num_buf--;
178
179 gfs2_unpin(sdp, bd->bd_bh, ai);
180 }
181 gfs2_assert_warn(sdp, !sdp->sd_log_num_buf);
182}
183
184static void buf_lo_before_scan(struct gfs2_jdesc *jd,
185 struct gfs2_log_header *head, int pass)
186{
187 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
188
189 if (pass != 0)
190 return;
191
192 sdp->sd_found_blocks = 0;
193 sdp->sd_replayed_blocks = 0;
194}
195
196static int buf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
197 struct gfs2_log_descriptor *ld, __be64 *ptr,
198 int pass)
199{
200 struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
201 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
202 struct gfs2_glock *gl = ip->i_gl;
203 unsigned int blks = be32_to_cpu(ld->ld_data1);
204 struct buffer_head *bh_log, *bh_ip;
205 u64 blkno;
206 int error = 0;
207
208 if (pass != 1 || be32_to_cpu(ld->ld_type) != GFS2_LOG_DESC_METADATA)
209 return 0;
210
211 gfs2_replay_incr_blk(sdp, &start);
212
213 for (; blks; gfs2_replay_incr_blk(sdp, &start), blks--) {
214 blkno = be64_to_cpu(*ptr++);
215
216 sdp->sd_found_blocks++;
217
218 if (gfs2_revoke_check(sdp, blkno, start))
219 continue;
220
221 error = gfs2_replay_read_block(jd, start, &bh_log);
222 if (error)
223 return error;
224
225 bh_ip = gfs2_meta_new(gl, blkno);
226 memcpy(bh_ip->b_data, bh_log->b_data, bh_log->b_size);
227
228 if (gfs2_meta_check(sdp, bh_ip))
229 error = -EIO;
230 else
231 mark_buffer_dirty(bh_ip);
232
233 brelse(bh_log);
234 brelse(bh_ip);
235
236 if (error)
237 break;
238
239 sdp->sd_replayed_blocks++;
240 }
241
242 return error;
243}
244
245static void buf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
246{
247 struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
248 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
249
250 if (error) {
251 gfs2_meta_sync(ip->i_gl);
252 return;
253 }
254 if (pass != 1)
255 return;
256
257 gfs2_meta_sync(ip->i_gl);
258
259 fs_info(sdp, "jid=%u: Replayed %u of %u blocks\n",
260 jd->jd_jid, sdp->sd_replayed_blocks, sdp->sd_found_blocks);
261}
262
263static void revoke_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
264{
265 struct gfs2_trans *tr;
266
267 tr = current->journal_info;
268 tr->tr_touched = 1;
269 tr->tr_num_revoke++;
270
271 gfs2_log_lock(sdp);
272 sdp->sd_log_num_revoke++;
273 list_add(&le->le_list, &sdp->sd_log_le_revoke);
274 gfs2_log_unlock(sdp);
275}
276
277static void revoke_lo_before_commit(struct gfs2_sbd *sdp)
278{
279 struct gfs2_log_descriptor *ld;
280 struct gfs2_meta_header *mh;
281 struct buffer_head *bh;
282 unsigned int offset;
283 struct list_head *head = &sdp->sd_log_le_revoke;
284 struct gfs2_revoke *rv;
285
286 if (!sdp->sd_log_num_revoke)
287 return;
288
289 bh = gfs2_log_get_buf(sdp);
290 ld = (struct gfs2_log_descriptor *)bh->b_data;
291 ld->ld_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
292 ld->ld_header.mh_type = cpu_to_be32(GFS2_METATYPE_LD);
293 ld->ld_header.mh_format = cpu_to_be32(GFS2_FORMAT_LD);
294 ld->ld_type = cpu_to_be32(GFS2_LOG_DESC_REVOKE);
295 ld->ld_length = cpu_to_be32(gfs2_struct2blk(sdp, sdp->sd_log_num_revoke,
296 sizeof(u64)));
297 ld->ld_data1 = cpu_to_be32(sdp->sd_log_num_revoke);
298 ld->ld_data2 = cpu_to_be32(0);
299 memset(ld->ld_reserved, 0, sizeof(ld->ld_reserved));
300 offset = sizeof(struct gfs2_log_descriptor);
301
302 while (!list_empty(head)) {
303 rv = list_entry(head->next, struct gfs2_revoke, rv_le.le_list);
304 list_del_init(&rv->rv_le.le_list);
305 sdp->sd_log_num_revoke--;
306
307 if (offset + sizeof(u64) > sdp->sd_sb.sb_bsize) {
308 set_buffer_dirty(bh);
309 ll_rw_block(WRITE, 1, &bh);
310
311 bh = gfs2_log_get_buf(sdp);
312 mh = (struct gfs2_meta_header *)bh->b_data;
313 mh->mh_magic = cpu_to_be32(GFS2_MAGIC);
314 mh->mh_type = cpu_to_be32(GFS2_METATYPE_LB);
315 mh->mh_format = cpu_to_be32(GFS2_FORMAT_LB);
316 offset = sizeof(struct gfs2_meta_header);
317 }
318
319 *(__be64 *)(bh->b_data + offset) = cpu_to_be64(rv->rv_blkno);
320 kfree(rv);
321
322 offset += sizeof(u64);
323 }
324 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke);
325
326 set_buffer_dirty(bh);
327 ll_rw_block(WRITE, 1, &bh);
328}
329
330static void revoke_lo_before_scan(struct gfs2_jdesc *jd,
331 struct gfs2_log_header *head, int pass)
332{
333 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
334
335 if (pass != 0)
336 return;
337
338 sdp->sd_found_revokes = 0;
339 sdp->sd_replay_tail = head->lh_tail;
340}
341
342static int revoke_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
343 struct gfs2_log_descriptor *ld, __be64 *ptr,
344 int pass)
345{
346 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
347 unsigned int blks = be32_to_cpu(ld->ld_length);
348 unsigned int revokes = be32_to_cpu(ld->ld_data1);
349 struct buffer_head *bh;
350 unsigned int offset;
351 u64 blkno;
352 int first = 1;
353 int error;
354
355 if (pass != 0 || be32_to_cpu(ld->ld_type) != GFS2_LOG_DESC_REVOKE)
356 return 0;
357
358 offset = sizeof(struct gfs2_log_descriptor);
359
360 for (; blks; gfs2_replay_incr_blk(sdp, &start), blks--) {
361 error = gfs2_replay_read_block(jd, start, &bh);
362 if (error)
363 return error;
364
365 if (!first)
366 gfs2_metatype_check(sdp, bh, GFS2_METATYPE_LB);
367
368 while (offset + sizeof(u64) <= sdp->sd_sb.sb_bsize) {
369 blkno = be64_to_cpu(*(__be64 *)(bh->b_data + offset));
370
371 error = gfs2_revoke_add(sdp, blkno, start);
372 if (error < 0)
373 return error;
374 else if (error)
375 sdp->sd_found_revokes++;
376
377 if (!--revokes)
378 break;
379 offset += sizeof(u64);
380 }
381
382 brelse(bh);
383 offset = sizeof(struct gfs2_meta_header);
384 first = 0;
385 }
386
387 return 0;
388}
389
390static void revoke_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
391{
392 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
393
394 if (error) {
395 gfs2_revoke_clean(sdp);
396 return;
397 }
398 if (pass != 1)
399 return;
400
401 fs_info(sdp, "jid=%u: Found %u revoke tags\n",
402 jd->jd_jid, sdp->sd_found_revokes);
403
404 gfs2_revoke_clean(sdp);
405}
406
407static void rg_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
408{
409 struct gfs2_rgrpd *rgd;
410 struct gfs2_trans *tr = current->journal_info;
411
412 tr->tr_touched = 1;
413
414 if (!list_empty(&le->le_list))
415 return;
416
417 rgd = container_of(le, struct gfs2_rgrpd, rd_le);
418 gfs2_rgrp_bh_hold(rgd);
419
420 gfs2_log_lock(sdp);
421 sdp->sd_log_num_rg++;
422 list_add(&le->le_list, &sdp->sd_log_le_rg);
423 gfs2_log_unlock(sdp);
424}
425
426static void rg_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
427{
428 struct list_head *head = &sdp->sd_log_le_rg;
429 struct gfs2_rgrpd *rgd;
430
431 while (!list_empty(head)) {
432 rgd = list_entry(head->next, struct gfs2_rgrpd, rd_le.le_list);
433 list_del_init(&rgd->rd_le.le_list);
434 sdp->sd_log_num_rg--;
435
436 gfs2_rgrp_repolish_clones(rgd);
437 gfs2_rgrp_bh_put(rgd);
438 }
439 gfs2_assert_warn(sdp, !sdp->sd_log_num_rg);
440}
441
442/**
443 * databuf_lo_add - Add a databuf to the transaction.
444 *
445 * This is used in two distinct cases:
446 * i) In ordered write mode
447	 * We put the data buffer on a list so that we can ensure that it's
448 * synced to disk at the right time
449 * ii) In journaled data mode
450 * We need to journal the data block in the same way as metadata in
451 * the functions above. The difference is that here we have a tag
452 * which is two __be64's being the block number (as per meta data)
453 * and a flag which says whether the data block needs escaping or
454 * not. This means we need a new log entry for each 251 or so data
455 * blocks, which isn't an enormous overhead but twice as much as
456 * for normal metadata blocks.
457 */
458static void databuf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
459{
460 struct gfs2_bufdata *bd = container_of(le, struct gfs2_bufdata, bd_le);
461 struct gfs2_trans *tr = current->journal_info;
462 struct address_space *mapping = bd->bd_bh->b_page->mapping;
463 struct gfs2_inode *ip = GFS2_I(mapping->host);
464
465 tr->tr_touched = 1;
466 if (list_empty(&bd->bd_list_tr) &&
467 (ip->i_di.di_flags & GFS2_DIF_JDATA)) {
468 tr->tr_num_buf++;
469 list_add(&bd->bd_list_tr, &tr->tr_list_buf);
470 gfs2_pin(sdp, bd->bd_bh);
471 tr->tr_num_buf_new++;
472 }
473 gfs2_trans_add_gl(bd->bd_gl);
474 gfs2_log_lock(sdp);
475 if (list_empty(&le->le_list)) {
476 if (ip->i_di.di_flags & GFS2_DIF_JDATA)
477 sdp->sd_log_num_jdata++;
478 sdp->sd_log_num_databuf++;
479 list_add(&le->le_list, &sdp->sd_log_le_databuf);
480 }
481 gfs2_log_unlock(sdp);
482}
483
484static int gfs2_check_magic(struct buffer_head *bh)
485{
486 struct page *page = bh->b_page;
487 void *kaddr;
488 __be32 *ptr;
489 int rv = 0;
490
491 kaddr = kmap_atomic(page, KM_USER0);
492 ptr = kaddr + bh_offset(bh);
493 if (*ptr == cpu_to_be32(GFS2_MAGIC))
494 rv = 1;
495 kunmap_atomic(kaddr, KM_USER0);
496
497 return rv;
498}
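
[Editorial note] Journaled data blocks that happen to begin with GFS2_MAGIC must be "escaped", or replay could mistake them for metadata: the log copy has its first four bytes zeroed and the block's log tag records a flag, which databuf_lo_scan_elements() below uses to restore the magic. The round trip in isolation (a user-space sketch; GFS2_MAGIC is 0x01161970, stored big-endian):

	#include <string.h>

	#define MAGIC_BYTES "\x01\x16\x19\x70"	/* GFS2_MAGIC, big-endian */

	/* On the way into the log: returns the escape flag for the tag. */
	static int escape_block(unsigned char *blk)
	{
		int escaped = !memcmp(blk, MAGIC_BYTES, 4);

		if (escaped)
			memset(blk, 0, 4);	/* log copy must not look
						 * like metadata */
		return escaped;
	}

	/* On replay: undo the escape using the flag from the tag. */
	static void unescape_block(unsigned char *blk, int escaped)
	{
		if (escaped)
			memcpy(blk, MAGIC_BYTES, 4);
	}
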
499
500/**
501 * databuf_lo_before_commit - Scan the data buffers, writing as we go
502 *
503 * Here we scan through the lists of buffers and make the assumption
504	 * that any buffer that's been pinned is being journaled, and that
505 * any unpinned buffer is an ordered write data buffer and therefore
506 * will be written back rather than journaled.
507 */
508static void databuf_lo_before_commit(struct gfs2_sbd *sdp)
509{
510 LIST_HEAD(started);
511 struct gfs2_bufdata *bd1 = NULL, *bd2, *bdt;
512 struct buffer_head *bh = NULL;
513 unsigned int offset = sizeof(struct gfs2_log_descriptor);
514 struct gfs2_log_descriptor *ld;
515 unsigned int limit;
516 unsigned int total_dbuf = sdp->sd_log_num_databuf;
517 unsigned int total_jdata = sdp->sd_log_num_jdata;
518 unsigned int num, n;
519 __be64 *ptr = NULL;
520
521 offset += 2*sizeof(__be64) - 1;
522 offset &= ~(2*sizeof(__be64) - 1);
523 limit = (sdp->sd_sb.sb_bsize - offset)/sizeof(__be64);
524
525 /*
526 * Start writing ordered buffers, write journaled buffers
527 * into the log along with a header
528 */
529 gfs2_log_lock(sdp);
530 bd2 = bd1 = list_prepare_entry(bd1, &sdp->sd_log_le_databuf,
531 bd_le.le_list);
532 while(total_dbuf) {
533 num = total_jdata;
534 if (num > limit)
535 num = limit;
536 n = 0;
537 list_for_each_entry_safe_continue(bd1, bdt,
538 &sdp->sd_log_le_databuf,
539 bd_le.le_list) {
540 /* An ordered write buffer */
541 if (bd1->bd_bh && !buffer_pinned(bd1->bd_bh)) {
542 list_move(&bd1->bd_le.le_list, &started);
543 if (bd1 == bd2) {
544 bd2 = NULL;
545 bd2 = list_prepare_entry(bd2,
546 &sdp->sd_log_le_databuf,
547 bd_le.le_list);
548 }
549 total_dbuf--;
550 if (bd1->bd_bh) {
551 get_bh(bd1->bd_bh);
552 if (buffer_dirty(bd1->bd_bh)) {
553 gfs2_log_unlock(sdp);
554 wait_on_buffer(bd1->bd_bh);
555 ll_rw_block(WRITE, 1,
556 &bd1->bd_bh);
557 gfs2_log_lock(sdp);
558 }
559 brelse(bd1->bd_bh);
560 continue;
561 }
562 continue;
563 } else if (bd1->bd_bh) { /* A journaled buffer */
564 int magic;
565 gfs2_log_unlock(sdp);
566 if (!bh) {
567 bh = gfs2_log_get_buf(sdp);
568 sdp->sd_log_num_hdrs++;
569 ld = (struct gfs2_log_descriptor *)
570 bh->b_data;
571 ptr = (__be64 *)(bh->b_data + offset);
572 ld->ld_header.mh_magic =
573 cpu_to_be32(GFS2_MAGIC);
574 ld->ld_header.mh_type =
575 cpu_to_be32(GFS2_METATYPE_LD);
576 ld->ld_header.mh_format =
577 cpu_to_be32(GFS2_FORMAT_LD);
578 ld->ld_type =
579 cpu_to_be32(GFS2_LOG_DESC_JDATA);
580 ld->ld_length = cpu_to_be32(num + 1);
581 ld->ld_data1 = cpu_to_be32(num);
582 ld->ld_data2 = cpu_to_be32(0);
583 memset(ld->ld_reserved, 0, sizeof(ld->ld_reserved));
584 }
585 magic = gfs2_check_magic(bd1->bd_bh);
586 *ptr++ = cpu_to_be64(bd1->bd_bh->b_blocknr);
587 *ptr++ = cpu_to_be64((__u64)magic);
588 clear_buffer_escaped(bd1->bd_bh);
589 if (unlikely(magic != 0))
590 set_buffer_escaped(bd1->bd_bh);
591 gfs2_log_lock(sdp);
592 if (n++ > num)
593 break;
594 } else if (!bd1->bd_bh) {
595 total_dbuf--;
596 sdp->sd_log_num_databuf--;
597 list_del_init(&bd1->bd_le.le_list);
598 if (bd1 == bd2) {
599 bd2 = NULL;
600 bd2 = list_prepare_entry(bd2,
601 &sdp->sd_log_le_databuf,
602 bd_le.le_list);
603 }
604 kmem_cache_free(gfs2_bufdata_cachep, bd1);
605 }
606 }
607 gfs2_log_unlock(sdp);
608 if (bh) {
609 set_buffer_dirty(bh);
610 ll_rw_block(WRITE, 1, &bh);
611 bh = NULL;
612 }
613 n = 0;
614 gfs2_log_lock(sdp);
615 list_for_each_entry_continue(bd2, &sdp->sd_log_le_databuf,
616 bd_le.le_list) {
617 if (!bd2->bd_bh)
618 continue;
619 /* copy buffer if it needs escaping */
620 gfs2_log_unlock(sdp);
621 if (unlikely(buffer_escaped(bd2->bd_bh))) {
622 void *kaddr;
623 struct page *page = bd2->bd_bh->b_page;
624 bh = gfs2_log_get_buf(sdp);
625 kaddr = kmap_atomic(page, KM_USER0);
626 memcpy(bh->b_data,
627 kaddr + bh_offset(bd2->bd_bh),
628 sdp->sd_sb.sb_bsize);
629 kunmap_atomic(kaddr, KM_USER0);
630 *(__be32 *)bh->b_data = 0;
631 } else {
632 bh = gfs2_log_fake_buf(sdp, bd2->bd_bh);
633 }
634 set_buffer_dirty(bh);
635 ll_rw_block(WRITE, 1, &bh);
636 gfs2_log_lock(sdp);
637 if (++n >= num)
638 break;
639 }
640 bh = NULL;
641 total_dbuf -= num;
642 total_jdata -= num;
643 }
644 gfs2_log_unlock(sdp);
645
646 /* Wait on all ordered buffers */
647 while (!list_empty(&started)) {
648 gfs2_log_lock(sdp);
649 bd1 = list_entry(started.next, struct gfs2_bufdata,
650 bd_le.le_list);
651 list_del_init(&bd1->bd_le.le_list);
652 sdp->sd_log_num_databuf--;
653 bh = bd1->bd_bh;
654 if (bh) {
655 bh->b_private = NULL;
656 get_bh(bh);
657 gfs2_log_unlock(sdp);
658 wait_on_buffer(bh);
659 brelse(bh);
660 } else
661 gfs2_log_unlock(sdp);
662
663 kmem_cache_free(gfs2_bufdata_cachep, bd1);
664 }
665
666 /* We've removed all the ordered write bufs here, so only jdata left */
667 gfs2_assert_warn(sdp, sdp->sd_log_num_databuf == sdp->sd_log_num_jdata);
668}
669
670static int databuf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
671 struct gfs2_log_descriptor *ld,
672 __be64 *ptr, int pass)
673{
674 struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
675 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
676 struct gfs2_glock *gl = ip->i_gl;
677 unsigned int blks = be32_to_cpu(ld->ld_data1);
678 struct buffer_head *bh_log, *bh_ip;
679 u64 blkno;
680 u64 esc;
681 int error = 0;
682
683 if (pass != 1 || be32_to_cpu(ld->ld_type) != GFS2_LOG_DESC_JDATA)
684 return 0;
685
686 gfs2_replay_incr_blk(sdp, &start);
687 for (; blks; gfs2_replay_incr_blk(sdp, &start), blks--) {
688 blkno = be64_to_cpu(*ptr++);
689 esc = be64_to_cpu(*ptr++);
690
691 sdp->sd_found_blocks++;
692
693 if (gfs2_revoke_check(sdp, blkno, start))
694 continue;
695
696 error = gfs2_replay_read_block(jd, start, &bh_log);
697 if (error)
698 return error;
699
700 bh_ip = gfs2_meta_new(gl, blkno);
701 memcpy(bh_ip->b_data, bh_log->b_data, bh_log->b_size);
702
703 /* Unescape */
704 if (esc) {
705 __be32 *eptr = (__be32 *)bh_ip->b_data;
706 *eptr = cpu_to_be32(GFS2_MAGIC);
707 }
708 mark_buffer_dirty(bh_ip);
709
710 brelse(bh_log);
711 brelse(bh_ip);
712 if (error)
713 break;
714
715 sdp->sd_replayed_blocks++;
716 }
717
718 return error;
719}
720
721/* FIXME: sort out accounting for log blocks etc. */
722
723static void databuf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
724{
725 struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
726 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
727
728 if (error) {
729 gfs2_meta_sync(ip->i_gl);
730 return;
731 }
732 if (pass != 1)
733 return;
734
735 /* data sync? */
736 gfs2_meta_sync(ip->i_gl);
737
738 fs_info(sdp, "jid=%u: Replayed %u of %u data blocks\n",
739 jd->jd_jid, sdp->sd_replayed_blocks, sdp->sd_found_blocks);
740}
741
742static void databuf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
743{
744 struct list_head *head = &sdp->sd_log_le_databuf;
745 struct gfs2_bufdata *bd;
746
747 while (!list_empty(head)) {
748 bd = list_entry(head->next, struct gfs2_bufdata, bd_le.le_list);
749 list_del_init(&bd->bd_le.le_list);
750 sdp->sd_log_num_databuf--;
751 sdp->sd_log_num_jdata--;
752 gfs2_unpin(sdp, bd->bd_bh, ai);
753 }
754 gfs2_assert_warn(sdp, !sdp->sd_log_num_databuf);
755 gfs2_assert_warn(sdp, !sdp->sd_log_num_jdata);
756}
757
758
759const struct gfs2_log_operations gfs2_glock_lops = {
760 .lo_add = glock_lo_add,
761 .lo_after_commit = glock_lo_after_commit,
762 .lo_name = "glock",
763};
764
765const struct gfs2_log_operations gfs2_buf_lops = {
766 .lo_add = buf_lo_add,
767 .lo_incore_commit = buf_lo_incore_commit,
768 .lo_before_commit = buf_lo_before_commit,
769 .lo_after_commit = buf_lo_after_commit,
770 .lo_before_scan = buf_lo_before_scan,
771 .lo_scan_elements = buf_lo_scan_elements,
772 .lo_after_scan = buf_lo_after_scan,
773 .lo_name = "buf",
774};
775
776const struct gfs2_log_operations gfs2_revoke_lops = {
777 .lo_add = revoke_lo_add,
778 .lo_before_commit = revoke_lo_before_commit,
779 .lo_before_scan = revoke_lo_before_scan,
780 .lo_scan_elements = revoke_lo_scan_elements,
781 .lo_after_scan = revoke_lo_after_scan,
782 .lo_name = "revoke",
783};
784
785const struct gfs2_log_operations gfs2_rg_lops = {
786 .lo_add = rg_lo_add,
787 .lo_after_commit = rg_lo_after_commit,
788 .lo_name = "rg",
789};
790
791const struct gfs2_log_operations gfs2_databuf_lops = {
792 .lo_add = databuf_lo_add,
793 .lo_incore_commit = buf_lo_incore_commit,
794 .lo_before_commit = databuf_lo_before_commit,
795 .lo_after_commit = databuf_lo_after_commit,
796 .lo_scan_elements = databuf_lo_scan_elements,
797 .lo_after_scan = databuf_lo_after_scan,
798 .lo_name = "databuf",
799};
800
801const struct gfs2_log_operations *gfs2_log_ops[] = {
802 &gfs2_glock_lops,
803 &gfs2_buf_lops,
804 &gfs2_revoke_lops,
805 &gfs2_rg_lops,
806 &gfs2_databuf_lops,
807 NULL,
808};
809
diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h
new file mode 100644
index 000000000000..5839c05ae6be
--- /dev/null
+++ b/fs/gfs2/lops.h
@@ -0,0 +1,99 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __LOPS_DOT_H__
11#define __LOPS_DOT_H__
12
13#include <linux/list.h>
14#include "incore.h"
15
16extern const struct gfs2_log_operations gfs2_glock_lops;
17extern const struct gfs2_log_operations gfs2_buf_lops;
18extern const struct gfs2_log_operations gfs2_revoke_lops;
19extern const struct gfs2_log_operations gfs2_rg_lops;
20extern const struct gfs2_log_operations gfs2_databuf_lops;
21
22extern const struct gfs2_log_operations *gfs2_log_ops[];
23
24static inline void lops_init_le(struct gfs2_log_element *le,
25 const struct gfs2_log_operations *lops)
26{
27 INIT_LIST_HEAD(&le->le_list);
28 le->le_ops = lops;
29}
30
31static inline void lops_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
32{
33 if (le->le_ops->lo_add)
34 le->le_ops->lo_add(sdp, le);
35}
36
37static inline void lops_incore_commit(struct gfs2_sbd *sdp,
38 struct gfs2_trans *tr)
39{
40 int x;
41 for (x = 0; gfs2_log_ops[x]; x++)
42 if (gfs2_log_ops[x]->lo_incore_commit)
43 gfs2_log_ops[x]->lo_incore_commit(sdp, tr);
44}
45
46static inline void lops_before_commit(struct gfs2_sbd *sdp)
47{
48 int x;
49 for (x = 0; gfs2_log_ops[x]; x++)
50 if (gfs2_log_ops[x]->lo_before_commit)
51 gfs2_log_ops[x]->lo_before_commit(sdp);
52}
53
54static inline void lops_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
55{
56 int x;
57 for (x = 0; gfs2_log_ops[x]; x++)
58 if (gfs2_log_ops[x]->lo_after_commit)
59 gfs2_log_ops[x]->lo_after_commit(sdp, ai);
60}
61
62static inline void lops_before_scan(struct gfs2_jdesc *jd,
63 struct gfs2_log_header *head,
64 unsigned int pass)
65{
66 int x;
67 for (x = 0; gfs2_log_ops[x]; x++)
68 if (gfs2_log_ops[x]->lo_before_scan)
69 gfs2_log_ops[x]->lo_before_scan(jd, head, pass);
70}
71
72static inline int lops_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
73 struct gfs2_log_descriptor *ld,
74 __be64 *ptr,
75 unsigned int pass)
76{
77 int x, error;
78 for (x = 0; gfs2_log_ops[x]; x++)
79 if (gfs2_log_ops[x]->lo_scan_elements) {
80 error = gfs2_log_ops[x]->lo_scan_elements(jd, start,
81 ld, ptr, pass);
82 if (error)
83 return error;
84 }
85
86 return 0;
87}
88
89static inline void lops_after_scan(struct gfs2_jdesc *jd, int error,
90 unsigned int pass)
91{
92 int x;
93 for (x = 0; gfs2_log_ops[x]; x++)
94		if (gfs2_log_ops[x]->lo_after_scan)
95 gfs2_log_ops[x]->lo_after_scan(jd, error, pass);
96}
97
98#endif /* __LOPS_DOT_H__ */
99
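[Editorial note] All of the lops_* helpers above share one shape: walk the NULL-terminated gfs2_log_ops[] table and invoke only the hooks an element type actually implements; that is why the lo_before_scan/lo_after_scan mismatch fixed in lops_after_scan() mattered. The pattern in miniature (a user-space sketch, names illustrative):

	#include <stdio.h>

	struct ops {
		const char *name;
		void (*hook)(void);	/* optional: may be NULL */
	};

	static void buf_hook(void) { puts("buf"); }

	static const struct ops glock_ops = { "glock", NULL };
	static const struct ops buf_ops = { "buf", buf_hook };

	static const struct ops *table[] = {
		&glock_ops,	/* no hook: skipped */
		&buf_ops,
		NULL,		/* terminator, like gfs2_log_ops[] */
	};

	static void run_hooks(void)
	{
		int x;

		for (x = 0; table[x]; x++)
			if (table[x]->hook)	/* test the hook you call */
				table[x]->hook();
	}
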
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
new file mode 100644
index 000000000000..21508a13bb78
--- /dev/null
+++ b/fs/gfs2/main.c
@@ -0,0 +1,150 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/module.h>
16#include <linux/init.h>
17#include <linux/gfs2_ondisk.h>
18#include <linux/lm_interface.h>
19#include <asm/atomic.h>
20
21#include "gfs2.h"
22#include "incore.h"
23#include "ops_fstype.h"
24#include "sys.h"
25#include "util.h"
26#include "glock.h"
27
28static void gfs2_init_inode_once(void *foo, kmem_cache_t *cachep, unsigned long flags)
29{
30 struct gfs2_inode *ip = foo;
31 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
32 SLAB_CTOR_CONSTRUCTOR) {
33 inode_init_once(&ip->i_inode);
34 spin_lock_init(&ip->i_spin);
35 init_rwsem(&ip->i_rw_mutex);
36 memset(ip->i_cache, 0, sizeof(ip->i_cache));
37 }
38}
39
40static void gfs2_init_glock_once(void *foo, kmem_cache_t *cachep, unsigned long flags)
41{
42 struct gfs2_glock *gl = foo;
43 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
44 SLAB_CTOR_CONSTRUCTOR) {
45 INIT_HLIST_NODE(&gl->gl_list);
46 spin_lock_init(&gl->gl_spin);
47 INIT_LIST_HEAD(&gl->gl_holders);
48 INIT_LIST_HEAD(&gl->gl_waiters1);
49 INIT_LIST_HEAD(&gl->gl_waiters2);
50 INIT_LIST_HEAD(&gl->gl_waiters3);
51 gl->gl_lvb = NULL;
52 atomic_set(&gl->gl_lvb_count, 0);
53 INIT_LIST_HEAD(&gl->gl_reclaim);
54 INIT_LIST_HEAD(&gl->gl_ail_list);
55 atomic_set(&gl->gl_ail_count, 0);
56 }
57}
58
59/**
60 * init_gfs2_fs - Register GFS2 as a filesystem
61 *
62 * Returns: 0 on success, error code on failure
63 */
64
65static int __init init_gfs2_fs(void)
66{
67 int error;
68
69 error = gfs2_sys_init();
70 if (error)
71 return error;
72
73 error = gfs2_glock_init();
74 if (error)
75 goto fail;
76
77 error = -ENOMEM;
78 gfs2_glock_cachep = kmem_cache_create("gfs2_glock",
79 sizeof(struct gfs2_glock),
80 0, 0,
81 gfs2_init_glock_once, NULL);
82 if (!gfs2_glock_cachep)
83 goto fail;
84
85 gfs2_inode_cachep = kmem_cache_create("gfs2_inode",
86 sizeof(struct gfs2_inode),
87 0, (SLAB_RECLAIM_ACCOUNT|
88 SLAB_PANIC|SLAB_MEM_SPREAD),
89 gfs2_init_inode_once, NULL);
90 if (!gfs2_inode_cachep)
91 goto fail;
92
93 gfs2_bufdata_cachep = kmem_cache_create("gfs2_bufdata",
94 sizeof(struct gfs2_bufdata),
95 0, 0, NULL, NULL);
96 if (!gfs2_bufdata_cachep)
97 goto fail;
98
99 error = register_filesystem(&gfs2_fs_type);
100 if (error)
101 goto fail;
102
103 error = register_filesystem(&gfs2meta_fs_type);
104 if (error)
105 goto fail_unregister;
106
107 printk("GFS2 (built %s %s) installed\n", __DATE__, __TIME__);
108
109 return 0;
110
111fail_unregister:
112 unregister_filesystem(&gfs2_fs_type);
113fail:
114 if (gfs2_bufdata_cachep)
115 kmem_cache_destroy(gfs2_bufdata_cachep);
116
117 if (gfs2_inode_cachep)
118 kmem_cache_destroy(gfs2_inode_cachep);
119
120 if (gfs2_glock_cachep)
121 kmem_cache_destroy(gfs2_glock_cachep);
122
123 gfs2_sys_uninit();
124 return error;
125}
126
127/**
128 * exit_gfs2_fs - Unregister the file system
129 *
130 */
131
132static void __exit exit_gfs2_fs(void)
133{
134 unregister_filesystem(&gfs2_fs_type);
135 unregister_filesystem(&gfs2meta_fs_type);
136
137 kmem_cache_destroy(gfs2_bufdata_cachep);
138 kmem_cache_destroy(gfs2_inode_cachep);
139 kmem_cache_destroy(gfs2_glock_cachep);
140
141 gfs2_sys_uninit();
142}
143
144MODULE_DESCRIPTION("Global File System");
145MODULE_AUTHOR("Red Hat, Inc.");
146MODULE_LICENSE("GPL");
147
148module_init(init_gfs2_fs);
149module_exit(exit_gfs2_fs);
150
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
new file mode 100644
index 000000000000..3912d6a4b1e6
--- /dev/null
+++ b/fs/gfs2/meta_io.c
@@ -0,0 +1,590 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/mm.h>
16#include <linux/pagemap.h>
17#include <linux/writeback.h>
18#include <linux/swap.h>
19#include <linux/delay.h>
20#include <linux/bio.h>
21#include <linux/gfs2_ondisk.h>
22#include <linux/lm_interface.h>
23
24#include "gfs2.h"
25#include "incore.h"
26#include "glock.h"
27#include "glops.h"
28#include "inode.h"
29#include "log.h"
30#include "lops.h"
31#include "meta_io.h"
32#include "rgrp.h"
33#include "trans.h"
34#include "util.h"
35#include "ops_address.h"
36
37static int aspace_get_block(struct inode *inode, sector_t lblock,
38 struct buffer_head *bh_result, int create)
39{
40 gfs2_assert_warn(inode->i_sb->s_fs_info, 0);
41 return -EOPNOTSUPP;
42}
43
44static int gfs2_aspace_writepage(struct page *page,
45 struct writeback_control *wbc)
46{
47 return block_write_full_page(page, aspace_get_block, wbc);
48}
49
50static const struct address_space_operations aspace_aops = {
51 .writepage = gfs2_aspace_writepage,
52 .releasepage = gfs2_releasepage,
53};
54
55/**
56 * gfs2_aspace_get - Create and initialize a struct inode
57 * @sdp: the filesystem the aspace is in
58 *
59 * Right now a struct inode is just a struct inode. Maybe Linux
60 * will supply a more lightweight address space construct (that works)
61 * in the future.
62 *
63 * Make sure pages/buffers in this aspace aren't in high memory.
64 *
65 * Returns: the aspace
66 */
67
68struct inode *gfs2_aspace_get(struct gfs2_sbd *sdp)
69{
70 struct inode *aspace;
71
72 aspace = new_inode(sdp->sd_vfs);
73 if (aspace) {
74 mapping_set_gfp_mask(aspace->i_mapping, GFP_NOFS);
75 aspace->i_mapping->a_ops = &aspace_aops;
76 aspace->i_size = ~0ULL;
77 aspace->i_private = NULL;
78 insert_inode_hash(aspace);
79 }
80 return aspace;
81}
82
83void gfs2_aspace_put(struct inode *aspace)
84{
85 remove_inode_hash(aspace);
86 iput(aspace);
87}
88
89/**
90 * gfs2_meta_inval - Invalidate all buffers associated with a glock
91 * @gl: the glock
92 *
93 */
94
95void gfs2_meta_inval(struct gfs2_glock *gl)
96{
97 struct gfs2_sbd *sdp = gl->gl_sbd;
98 struct inode *aspace = gl->gl_aspace;
99 struct address_space *mapping = gl->gl_aspace->i_mapping;
100
101 gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count));
102
103 atomic_inc(&aspace->i_writecount);
104 truncate_inode_pages(mapping, 0);
105 atomic_dec(&aspace->i_writecount);
106
107 gfs2_assert_withdraw(sdp, !mapping->nrpages);
108}
109
110/**
111 * gfs2_meta_sync - Sync all buffers associated with a glock
112 * @gl: The glock
113 *
114 */
115
116void gfs2_meta_sync(struct gfs2_glock *gl)
117{
118 struct address_space *mapping = gl->gl_aspace->i_mapping;
119 int error;
120
121 filemap_fdatawrite(mapping);
122 error = filemap_fdatawait(mapping);
123
124 if (error)
125 gfs2_io_error(gl->gl_sbd);
126}
127
128/**
129 * getbuf - Get a buffer with a given address space
130 * @sdp: the filesystem
131 * @aspace: the address space
132 * @blkno: the block number (filesystem scope)
133 * @create: 1 if the buffer should be created
134 *
135 * Returns: the buffer
136 */
137
138static struct buffer_head *getbuf(struct gfs2_sbd *sdp, struct inode *aspace,
139 u64 blkno, int create)
140{
141 struct page *page;
142 struct buffer_head *bh;
143 unsigned int shift;
144 unsigned long index;
145 unsigned int bufnum;
146
147 shift = PAGE_CACHE_SHIFT - sdp->sd_sb.sb_bsize_shift;
148 index = blkno >> shift; /* convert block to page */
149 bufnum = blkno - (index << shift); /* block buf index within page */
150
151 if (create) {
152 for (;;) {
153 page = grab_cache_page(aspace->i_mapping, index);
154 if (page)
155 break;
156 yield();
157 }
158 } else {
159 page = find_lock_page(aspace->i_mapping, index);
160 if (!page)
161 return NULL;
162 }
163
164 if (!page_has_buffers(page))
165 create_empty_buffers(page, sdp->sd_sb.sb_bsize, 0);
166
167 /* Locate header for our buffer within our page */
168 for (bh = page_buffers(page); bufnum--; bh = bh->b_this_page)
169 /* Do nothing */;
170 get_bh(bh);
171
172 if (!buffer_mapped(bh))
173 map_bh(bh, sdp->sd_vfs, blkno);
174
175 unlock_page(page);
176 mark_page_accessed(page);
177 page_cache_release(page);
178
179 return bh;
180}
181
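getbuf() locates the page backing a filesystem block with pure shift arithmetic, which works because the block size always divides the page size. A standalone sketch of the index math, assuming 4096-byte pages and 1024-byte blocks (illustrative values):

#include <stdio.h>
#include <stdint.h>

#define DEMO_PAGE_SHIFT 12U	/* assumed 4096-byte pages */

int main(void)
{
	unsigned int bsize_shift = 10;	/* assumed 1024-byte blocks */
	unsigned int shift = DEMO_PAGE_SHIFT - bsize_shift;
	uint64_t blkno = 4103;

	uint64_t index = blkno >> shift;	/* page that caches this block */
	unsigned int bufnum = (unsigned int)(blkno - (index << shift));

	printf("block %llu -> page %llu, buffer %u of %u\n",
	       (unsigned long long)blkno, (unsigned long long)index,
	       bufnum, 1U << shift);
	/* prints: block 4103 -> page 1025, buffer 3 of 4 */
	return 0;
}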
182static void meta_prep_new(struct buffer_head *bh)
183{
184 struct gfs2_meta_header *mh = (struct gfs2_meta_header *)bh->b_data;
185
186 lock_buffer(bh);
187 clear_buffer_dirty(bh);
188 set_buffer_uptodate(bh);
189 unlock_buffer(bh);
190
191 mh->mh_magic = cpu_to_be32(GFS2_MAGIC);
192}
193
194/**
195 * gfs2_meta_new - Get a block
196 * @gl: The glock associated with this block
197 * @blkno: The block number
198 *
199 * Returns: The buffer
200 */
201
202struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno)
203{
204 struct buffer_head *bh;
205 bh = getbuf(gl->gl_sbd, gl->gl_aspace, blkno, CREATE);
206 meta_prep_new(bh);
207 return bh;
208}
209
210/**
211 * gfs2_meta_read - Read a block from disk
212 * @gl: The glock covering the block
213 * @blkno: The block number
214 * @flags: flags
215 * @bhp: the place where the buffer is returned (NULL on failure)
216 *
217 * Returns: errno
218 */
219
220int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
221 struct buffer_head **bhp)
222{
223 *bhp = getbuf(gl->gl_sbd, gl->gl_aspace, blkno, CREATE);
224 if (!buffer_uptodate(*bhp))
225 ll_rw_block(READ_META, 1, bhp);
226 if (flags & DIO_WAIT) {
227 int error = gfs2_meta_wait(gl->gl_sbd, *bhp);
228 if (error) {
229 brelse(*bhp);
230 return error;
231 }
232 }
233
234 return 0;
235}
236
237/**
238 * gfs2_meta_wait - Wait for a block read to complete
239 * @sdp: the filesystem
240 * @bh: The block to wait for
241 *
242 * Returns: errno
243 */
244
245int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh)
246{
247 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
248 return -EIO;
249
250 wait_on_buffer(bh);
251
252 if (!buffer_uptodate(bh)) {
253 struct gfs2_trans *tr = current->journal_info;
254 if (tr && tr->tr_touched)
255 gfs2_io_error_bh(sdp, bh);
256 return -EIO;
257 }
258 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
259 return -EIO;
260
261 return 0;
262}
263
264/**
265 * gfs2_attach_bufdata - attach a struct gfs2_bufdata structure to a buffer
266 * @gl: the glock the buffer belongs to
267 * @bh: The buffer to be attached to
268 * @meta: Flag to indicate whether it is metadata or not
269 */
270
271void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh,
272 int meta)
273{
274 struct gfs2_bufdata *bd;
275
276 if (meta)
277 lock_page(bh->b_page);
278
279 if (bh->b_private) {
280 if (meta)
281 unlock_page(bh->b_page);
282 return;
283 }
284
285 bd = kmem_cache_alloc(gfs2_bufdata_cachep, GFP_NOFS | __GFP_NOFAIL);
286 memset(bd, 0, sizeof(struct gfs2_bufdata));
287 bd->bd_bh = bh;
288 bd->bd_gl = gl;
289
290 INIT_LIST_HEAD(&bd->bd_list_tr);
291 if (meta)
292 lops_init_le(&bd->bd_le, &gfs2_buf_lops);
293 else
294 lops_init_le(&bd->bd_le, &gfs2_databuf_lops);
295 bh->b_private = bd;
296
297 if (meta)
298 unlock_page(bh->b_page);
299}
300
301/**
302 * gfs2_pin - Pin a buffer in memory
303 * @sdp: the filesystem the buffer belongs to
304 * @bh: The buffer to be pinned
305 *
306 */
307
308void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh)
309{
310 struct gfs2_bufdata *bd = bh->b_private;
311
312 gfs2_assert_withdraw(sdp, test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags));
313
314 if (test_set_buffer_pinned(bh))
315 gfs2_assert_withdraw(sdp, 0);
316
317 wait_on_buffer(bh);
318
319 /* If this buffer is in the AIL and it has already been written
320 to in-place disk block, remove it from the AIL. */
321
322 gfs2_log_lock(sdp);
323 if (bd->bd_ail && !buffer_in_io(bh))
324 list_move(&bd->bd_ail_st_list, &bd->bd_ail->ai_ail2_list);
325 gfs2_log_unlock(sdp);
326
327 clear_buffer_dirty(bh);
328 wait_on_buffer(bh);
329
330 if (!buffer_uptodate(bh))
331 gfs2_io_error_bh(sdp, bh);
332
333 get_bh(bh);
334}
335
336/**
337 * gfs2_unpin - Unpin a buffer
338 * @sdp: the filesystem the buffer belongs to
339 * @bh: The buffer to unpin
340 * @ai: the AIL entry to attach the buffer to
341 *
342 */
343
344void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
345 struct gfs2_ail *ai)
346{
347 struct gfs2_bufdata *bd = bh->b_private;
348
349 gfs2_assert_withdraw(sdp, buffer_uptodate(bh));
350
351 if (!buffer_pinned(bh))
352 gfs2_assert_withdraw(sdp, 0);
353
354 mark_buffer_dirty(bh);
355 clear_buffer_pinned(bh);
356
357 gfs2_log_lock(sdp);
358 if (bd->bd_ail) {
359 list_del(&bd->bd_ail_st_list);
360 brelse(bh);
361 } else {
362 struct gfs2_glock *gl = bd->bd_gl;
363 list_add(&bd->bd_ail_gl_list, &gl->gl_ail_list);
364 atomic_inc(&gl->gl_ail_count);
365 }
366 bd->bd_ail = ai;
367 list_add(&bd->bd_ail_st_list, &ai->ai_ail1_list);
368 gfs2_log_unlock(sdp);
369}
370
371/**
372 * gfs2_meta_wipe - make sure an inode's buffers are no longer dirty or pinned
373 * @ip: the inode who owns the buffers
374 * @bstart: the first buffer in the run
375 * @blen: the number of buffers in the run
376 *
377 */
378
379void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen)
380{
381 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
382 struct inode *aspace = ip->i_gl->gl_aspace;
383 struct buffer_head *bh;
384
385 while (blen) {
386 bh = getbuf(sdp, aspace, bstart, NO_CREATE);
387 if (bh) {
388 struct gfs2_bufdata *bd = bh->b_private;
389
390 if (test_clear_buffer_pinned(bh)) {
391 struct gfs2_trans *tr = current->journal_info;
392 gfs2_log_lock(sdp);
393 list_del_init(&bd->bd_le.le_list);
394 gfs2_assert_warn(sdp, sdp->sd_log_num_buf);
395 sdp->sd_log_num_buf--;
396 gfs2_log_unlock(sdp);
397 tr->tr_num_buf_rm++;
398 brelse(bh);
399 }
400 if (bd) {
401 gfs2_log_lock(sdp);
402 if (bd->bd_ail) {
403 u64 blkno = bh->b_blocknr;
404 bd->bd_ail = NULL;
405 list_del(&bd->bd_ail_st_list);
406 list_del(&bd->bd_ail_gl_list);
407 atomic_dec(&bd->bd_gl->gl_ail_count);
408 brelse(bh);
409 gfs2_log_unlock(sdp);
410 gfs2_trans_add_revoke(sdp, blkno);
411 } else
412 gfs2_log_unlock(sdp);
413 }
414
415 lock_buffer(bh);
416 clear_buffer_dirty(bh);
417 clear_buffer_uptodate(bh);
418 unlock_buffer(bh);
419
420 brelse(bh);
421 }
422
423 bstart++;
424 blen--;
425 }
426}
427
428/**
429 * gfs2_meta_cache_flush - get rid of any references on buffers for this inode
430 * @ip: The GFS2 inode
431 *
432 * This releases buffers that are in the most-recently-used array of
433 * blocks used for indirect block addressing for this inode.
434 */
435
436void gfs2_meta_cache_flush(struct gfs2_inode *ip)
437{
438 struct buffer_head **bh_slot;
439 unsigned int x;
440
441 spin_lock(&ip->i_spin);
442
443 for (x = 0; x < GFS2_MAX_META_HEIGHT; x++) {
444 bh_slot = &ip->i_cache[x];
445 if (!*bh_slot)
446 break;
447 brelse(*bh_slot);
448 *bh_slot = NULL;
449 }
450
451 spin_unlock(&ip->i_spin);
452}
453
454/**
455 * gfs2_meta_indirect_buffer - Get a metadata buffer
456 * @ip: The GFS2 inode
457 * @height: The level of this buf in the metadata (indir addr) tree (if any)
458 * @num: The block number (device relative) of the buffer
459 * @new: Non-zero if we may create a new buffer
460 * @bhp: the buffer is returned here
461 *
462 * Try to use the gfs2_inode's MRU metadata tree cache.
463 *
464 * Returns: errno
465 */
466
467int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, u64 num,
468 int new, struct buffer_head **bhp)
469{
470 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
471 struct gfs2_glock *gl = ip->i_gl;
472 struct buffer_head *bh = NULL, **bh_slot = ip->i_cache + height;
473 int in_cache = 0;
474
475 spin_lock(&ip->i_spin);
476 if (*bh_slot && (*bh_slot)->b_blocknr == num) {
477 bh = *bh_slot;
478 get_bh(bh);
479 in_cache = 1;
480 }
481 spin_unlock(&ip->i_spin);
482
483 if (!bh)
484 bh = getbuf(gl->gl_sbd, gl->gl_aspace, num, CREATE);
485
486 if (!bh)
487 return -ENOBUFS;
488
489 if (new) {
490 if (gfs2_assert_warn(sdp, height))
491 goto err;
492 meta_prep_new(bh);
493 gfs2_trans_add_bh(ip->i_gl, bh, 1);
494 gfs2_metatype_set(bh, GFS2_METATYPE_IN, GFS2_FORMAT_IN);
495 gfs2_buffer_clear_tail(bh, sizeof(struct gfs2_meta_header));
496 } else {
497 u32 mtype = height ? GFS2_METATYPE_IN : GFS2_METATYPE_DI;
498 if (!buffer_uptodate(bh)) {
499 ll_rw_block(READ_META, 1, &bh);
500 if (gfs2_meta_wait(sdp, bh))
501 goto err;
502 }
503 if (gfs2_metatype_check(sdp, bh, mtype))
504 goto err;
505 }
506
507 if (!in_cache) {
508 spin_lock(&ip->i_spin);
509 if (*bh_slot)
510 brelse(*bh_slot);
511 *bh_slot = bh;
512 get_bh(bh);
513 spin_unlock(&ip->i_spin);
514 }
515
516 *bhp = bh;
517 return 0;
518err:
519 brelse(bh);
520 return -EIO;
521}
522
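gfs2_meta_indirect_buffer() consults a one-slot-per-height cache (ip->i_cache[]) before calling getbuf(). A toy model of that lookup, with block numbers standing in for buffer_head pointers (purely illustrative):

#include <stdio.h>
#include <stdint.h>

#define DEMO_MAX_META_HEIGHT 10	/* mirrors GFS2_MAX_META_HEIGHT */

/* One remembered block per metadata-tree height; 0 means empty. */
static uint64_t cache[DEMO_MAX_META_HEIGHT];

static int cache_lookup(int height, uint64_t num)
{
	if (cache[height] == num)
		return 1;	/* hit: the cached buffer can be reused */
	cache[height] = num;	/* miss: remember the newly read block */
	return 0;
}

int main(void)
{
	printf("%d\n", cache_lookup(1, 4103));	/* 0: first access misses */
	printf("%d\n", cache_lookup(1, 4103));	/* 1: repeat access hits */
	printf("%d\n", cache_lookup(1, 9999));	/* 0: new block replaces it */
	return 0;
}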
523/**
524 * gfs2_meta_ra - start readahead on an extent of a file
525 * @gl: the glock the blocks belong to
526 * @dblock: the starting disk block
527 * @extlen: the number of blocks in the extent
528 *
529 * Returns: the first buffer in the extent
530 */
531
532struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen)
533{
534 struct gfs2_sbd *sdp = gl->gl_sbd;
535 struct inode *aspace = gl->gl_aspace;
536 struct buffer_head *first_bh, *bh;
537 u32 max_ra = gfs2_tune_get(sdp, gt_max_readahead) >>
538 sdp->sd_sb.sb_bsize_shift;
539
540 BUG_ON(!extlen);
541
542 if (max_ra < 1)
543 max_ra = 1;
544 if (extlen > max_ra)
545 extlen = max_ra;
546
547 first_bh = getbuf(sdp, aspace, dblock, CREATE);
548
549 if (buffer_uptodate(first_bh))
550 goto out;
551 if (!buffer_locked(first_bh))
552 ll_rw_block(READ_META, 1, &first_bh);
553
554 dblock++;
555 extlen--;
556
557 while (extlen) {
558 bh = getbuf(sdp, aspace, dblock, CREATE);
559
560 if (!buffer_uptodate(bh) && !buffer_locked(bh))
561 ll_rw_block(READA, 1, &bh);
562 brelse(bh);
563 dblock++;
564 extlen--;
565 if (!buffer_locked(first_bh) && buffer_uptodate(first_bh))
566 goto out;
567 }
568
569 wait_on_buffer(first_bh);
570out:
571 return first_bh;
572}
573
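The clamp at the top of gfs2_meta_ra() converts the byte-valued gt_max_readahead tunable into blocks and bounds the extent with it. A sketch of just that arithmetic (the tunable value below is an assumed example, not a GFS2 default):

#include <stdio.h>
#include <stdint.h>

static uint32_t clamp_extent(uint32_t extlen, uint32_t max_readahead_bytes,
			     unsigned int bsize_shift)
{
	uint32_t max_ra = max_readahead_bytes >> bsize_shift;

	if (max_ra < 1)
		max_ra = 1;	/* always read at least the first block */
	if (extlen > max_ra)
		extlen = max_ra;
	return extlen;
}

int main(void)
{
	/* 256 KiB of readahead with 4 KiB blocks allows 64 blocks. */
	printf("%u\n", clamp_extent(100, 256 * 1024, 12));	/* 64 */
	printf("%u\n", clamp_extent(10, 256 * 1024, 12));	/* 10 */
	return 0;
}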
574/**
575 * gfs2_meta_syncfs - sync all the buffers in a filesystem
576 * @sdp: the filesystem
577 *
578 */
579
580void gfs2_meta_syncfs(struct gfs2_sbd *sdp)
581{
582 gfs2_log_flush(sdp, NULL);
583 for (;;) {
584 gfs2_ail1_start(sdp, DIO_ALL);
585 if (gfs2_ail1_empty(sdp, DIO_ALL))
586 break;
587 msleep(10);
588 }
589}
590
diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h
new file mode 100644
index 000000000000..3ec939e20dff
--- /dev/null
+++ b/fs/gfs2/meta_io.h
@@ -0,0 +1,78 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __DIO_DOT_H__
11#define __DIO_DOT_H__
12
13#include <linux/buffer_head.h>
14#include <linux/string.h>
15#include "incore.h"
16
17static inline void gfs2_buffer_clear(struct buffer_head *bh)
18{
19 memset(bh->b_data, 0, bh->b_size);
20}
21
22static inline void gfs2_buffer_clear_tail(struct buffer_head *bh, int head)
23{
24 BUG_ON(head > bh->b_size);
25 memset(bh->b_data + head, 0, bh->b_size - head);
26}
27
28static inline void gfs2_buffer_copy_tail(struct buffer_head *to_bh,
29 int to_head,
30 struct buffer_head *from_bh,
31 int from_head)
32{
33 BUG_ON(from_head < to_head);
34 memcpy(to_bh->b_data + to_head, from_bh->b_data + from_head,
35 from_bh->b_size - from_head);
36 memset(to_bh->b_data + to_bh->b_size + to_head - from_head,
37 0, from_head - to_head);
38}
39
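gfs2_buffer_copy_tail() copies everything past the source header to just past the destination's (shorter) header, then zeroes the slack so the destination is fully initialized. A userspace model with plain byte arrays, assuming equal-sized buffers as in the inline above:

#include <stdio.h>
#include <string.h>
#include <assert.h>

static void copy_tail(char *to, size_t size, int to_head,
		      const char *from, int from_head)
{
	assert(from_head >= to_head);
	memcpy(to + to_head, from + from_head, size - from_head);
	/* The copied tail is shorter than the space after to_head;
	   zero the remaining bytes at the end of the destination. */
	memset(to + size + to_head - from_head, 0, from_head - to_head);
}

int main(void)
{
	char from[16] = "HDRHDRpayload!!";	/* 6-byte header, then data */
	char to[16];

	memset(to, 'x', sizeof(to));
	copy_tail(to, sizeof(to), 3, from, 6);	/* destination header: 3 bytes */
	printf("%s\n", to + 3);	/* prints: payload!! */
	return 0;
}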
40struct inode *gfs2_aspace_get(struct gfs2_sbd *sdp);
41void gfs2_aspace_put(struct inode *aspace);
42
43void gfs2_meta_inval(struct gfs2_glock *gl);
44void gfs2_meta_sync(struct gfs2_glock *gl);
45
46struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno);
47int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno,
48 int flags, struct buffer_head **bhp);
49int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh);
50
51void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh,
52 int meta);
53void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh);
54void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
55 struct gfs2_ail *ai);
56
57void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen);
58
59void gfs2_meta_cache_flush(struct gfs2_inode *ip);
60int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, u64 num,
61 int new, struct buffer_head **bhp);
62
63static inline int gfs2_meta_inode_buffer(struct gfs2_inode *ip,
64 struct buffer_head **bhp)
65{
66 return gfs2_meta_indirect_buffer(ip, 0, ip->i_num.no_addr, 0, bhp);
67}
68
69struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen);
70void gfs2_meta_syncfs(struct gfs2_sbd *sdp);
71
72#define buffer_busy(bh) \
73((bh)->b_state & ((1ul << BH_Dirty) | (1ul << BH_Lock) | (1ul << BH_Pinned)))
74#define buffer_in_io(bh) \
75((bh)->b_state & ((1ul << BH_Dirty) | (1ul << BH_Lock)))
76
77#endif /* __DIO_DOT_H__ */
78
diff --git a/fs/gfs2/mount.c b/fs/gfs2/mount.c
new file mode 100644
index 000000000000..ef3092e29607
--- /dev/null
+++ b/fs/gfs2/mount.c
@@ -0,0 +1,214 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/gfs2_ondisk.h>
16#include <linux/lm_interface.h>
17
18#include "gfs2.h"
19#include "incore.h"
20#include "mount.h"
21#include "sys.h"
22#include "util.h"
23
24/**
25 * gfs2_mount_args - Parse mount options
26 * @sdp: the filesystem superblock
27 * @data_arg: the mount options string
28 * @remount: non-zero if this is a remount
29 * Returns: errno
30 */
31
32int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount)
33{
34 struct gfs2_args *args = &sdp->sd_args;
35 char *data = data_arg;
36 char *options, *o, *v;
37 int error = 0;
38
39 if (!remount) {
40 /* If someone preloaded options, use those instead */
41 spin_lock(&gfs2_sys_margs_lock);
42 if (gfs2_sys_margs) {
43 data = gfs2_sys_margs;
44 gfs2_sys_margs = NULL;
45 }
46 spin_unlock(&gfs2_sys_margs_lock);
47
48 /* Set some defaults */
49 args->ar_num_glockd = GFS2_GLOCKD_DEFAULT;
50 args->ar_quota = GFS2_QUOTA_DEFAULT;
51 args->ar_data = GFS2_DATA_DEFAULT;
52 }
53
54 /* Split the options into tokens with the "," character and
55 process them */
56
57 for (options = data; (o = strsep(&options, ",")); ) {
58 if (!*o)
59 continue;
60
61 v = strchr(o, '=');
62 if (v)
63 *v++ = 0;
64
65 if (!strcmp(o, "lockproto")) {
66 if (!v)
67 goto need_value;
68 if (remount && strcmp(v, args->ar_lockproto))
69 goto cant_remount;
70 strncpy(args->ar_lockproto, v, GFS2_LOCKNAME_LEN);
71 args->ar_lockproto[GFS2_LOCKNAME_LEN - 1] = 0;
72 }
73
74 else if (!strcmp(o, "locktable")) {
75 if (!v)
76 goto need_value;
77 if (remount && strcmp(v, args->ar_locktable))
78 goto cant_remount;
79 strncpy(args->ar_locktable, v, GFS2_LOCKNAME_LEN);
80 args->ar_locktable[GFS2_LOCKNAME_LEN - 1] = 0;
81 }
82
83 else if (!strcmp(o, "hostdata")) {
84 if (!v)
85 goto need_value;
86 if (remount && strcmp(v, args->ar_hostdata))
87 goto cant_remount;
88 strncpy(args->ar_hostdata, v, GFS2_LOCKNAME_LEN);
89 args->ar_hostdata[GFS2_LOCKNAME_LEN - 1] = 0;
90 }
91
92 else if (!strcmp(o, "spectator")) {
93 if (remount && !args->ar_spectator)
94 goto cant_remount;
95 args->ar_spectator = 1;
96 sdp->sd_vfs->s_flags |= MS_RDONLY;
97 }
98
99 else if (!strcmp(o, "ignore_local_fs")) {
100 if (remount && !args->ar_ignore_local_fs)
101 goto cant_remount;
102 args->ar_ignore_local_fs = 1;
103 }
104
105 else if (!strcmp(o, "localflocks")) {
106 if (remount && !args->ar_localflocks)
107 goto cant_remount;
108 args->ar_localflocks = 1;
109 }
110
111 else if (!strcmp(o, "localcaching")) {
112 if (remount && !args->ar_localcaching)
113 goto cant_remount;
114 args->ar_localcaching = 1;
115 }
116
117 else if (!strcmp(o, "debug"))
118 args->ar_debug = 1;
119
120 else if (!strcmp(o, "nodebug"))
121 args->ar_debug = 0;
122
123 else if (!strcmp(o, "upgrade")) {
124 if (remount && !args->ar_upgrade)
125 goto cant_remount;
126 args->ar_upgrade = 1;
127 }
128
129 else if (!strcmp(o, "num_glockd")) {
130 unsigned int x;
131 if (!v)
132 goto need_value;
133 sscanf(v, "%u", &x);
134 if (remount && x != args->ar_num_glockd)
135 goto cant_remount;
136 if (!x || x > GFS2_GLOCKD_MAX) {
137 fs_info(sdp, "0 < num_glockd <= %u (not %u)\n",
138 GFS2_GLOCKD_MAX, x);
139 error = -EINVAL;
140 break;
141 }
142 args->ar_num_glockd = x;
143 }
144
145 else if (!strcmp(o, "acl")) {
146 args->ar_posix_acl = 1;
147 sdp->sd_vfs->s_flags |= MS_POSIXACL;
148 }
149
150 else if (!strcmp(o, "noacl")) {
151 args->ar_posix_acl = 0;
152 sdp->sd_vfs->s_flags &= ~MS_POSIXACL;
153 }
154
155 else if (!strcmp(o, "quota")) {
156 if (!v)
157 goto need_value;
158 if (!strcmp(v, "off"))
159 args->ar_quota = GFS2_QUOTA_OFF;
160 else if (!strcmp(v, "account"))
161 args->ar_quota = GFS2_QUOTA_ACCOUNT;
162 else if (!strcmp(v, "on"))
163 args->ar_quota = GFS2_QUOTA_ON;
164 else {
165 fs_info(sdp, "invalid value for quota\n");
166 error = -EINVAL;
167 break;
168 }
169 }
170
171 else if (!strcmp(o, "suiddir"))
172 args->ar_suiddir = 1;
173
174 else if (!strcmp(o, "nosuiddir"))
175 args->ar_suiddir = 0;
176
177 else if (!strcmp(o, "data")) {
178 if (!v)
179 goto need_value;
180 if (!strcmp(v, "writeback"))
181 args->ar_data = GFS2_DATA_WRITEBACK;
182 else if (!strcmp(v, "ordered"))
183 args->ar_data = GFS2_DATA_ORDERED;
184 else {
185 fs_info(sdp, "invalid value for data\n");
186 error = -EINVAL;
187 break;
188 }
189 }
190
191 else {
192 fs_info(sdp, "unknown option: %s\n", o);
193 error = -EINVAL;
194 break;
195 }
196 }
197
198 if (error)
199 fs_info(sdp, "invalid mount option(s)\n");
200
201 if (data != data_arg)
202 kfree(data);
203
204 return error;
205
206need_value:
207 fs_info(sdp, "need value for option %s\n", o);
208 return -EINVAL;
209
210cant_remount:
211 fs_info(sdp, "can't remount with option %s\n", o);
212 return -EINVAL;
213}
214
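gfs2_mount_args() tokenizes the option string on commas with strsep() and splits each token at '='. The same parsing skeleton in standalone form:

#define _GNU_SOURCE	/* for strsep() and strdup() */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	/* strsep() modifies its argument, so parse a writable copy. */
	char *data = strdup("lockproto=lock_dlm,debug,quota=on");
	char *options = data, *o, *v;

	while ((o = strsep(&options, ",")) != NULL) {
		if (!*o)
			continue;	/* skip empty tokens such as ",," */

		v = strchr(o, '=');
		if (v)
			*v++ = '\0';	/* split "option=value" in place */

		printf("option '%s' value '%s'\n", o, v ? v : "(none)");
	}

	free(data);
	return 0;
}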
diff --git a/fs/gfs2/mount.h b/fs/gfs2/mount.h
new file mode 100644
index 000000000000..401288acfdf3
--- /dev/null
+++ b/fs/gfs2/mount.h
@@ -0,0 +1,17 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __MOUNT_DOT_H__
11#define __MOUNT_DOT_H__
12
13struct gfs2_sbd;
14
15int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount);
16
17#endif /* __MOUNT_DOT_H__ */
diff --git a/fs/gfs2/ondisk.c b/fs/gfs2/ondisk.c
new file mode 100644
index 000000000000..1025960b0e6e
--- /dev/null
+++ b/fs/gfs2/ondisk.c
@@ -0,0 +1,308 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15
16#include "gfs2.h"
17#include <linux/gfs2_ondisk.h>
18
19#define pv(struct, member, fmt) printk(KERN_INFO " "#member" = "fmt"\n", \
20 struct->member);
21
22/*
23 * gfs2_xxx_in - read in an xxx struct
24 * first arg: the cpu-order structure
25 * buf: the disk-order buffer
26 *
27 * gfs2_xxx_out - write out an xxx struct
28 * first arg: the cpu-order structure
29 * buf: the disk-order buffer
30 *
31 * gfs2_xxx_print - print out an xxx struct
32 * first arg: the cpu-order structure
33 */
34
35void gfs2_inum_in(struct gfs2_inum *no, const void *buf)
36{
37 const struct gfs2_inum *str = buf;
38
39 no->no_formal_ino = be64_to_cpu(str->no_formal_ino);
40 no->no_addr = be64_to_cpu(str->no_addr);
41}
42
43void gfs2_inum_out(const struct gfs2_inum *no, void *buf)
44{
45 struct gfs2_inum *str = buf;
46
47 str->no_formal_ino = cpu_to_be64(no->no_formal_ino);
48 str->no_addr = cpu_to_be64(no->no_addr);
49}
50
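Every on-disk structure gets an _in (disk-to-cpu) and _out (cpu-to-disk) pair like gfs2_inum_in/gfs2_inum_out above. A userspace model of the round trip; be64toh()/htobe64() stand in for the kernel's be64_to_cpu()/cpu_to_be64(), and demo_inum is an illustrative struct, not the real on-disk layout:

#define _DEFAULT_SOURCE	/* for be64toh()/htobe64() on glibc */
#include <stdio.h>
#include <stdint.h>
#include <endian.h>

/* Disk and cpu views share one layout; only byte order differs. */
struct demo_inum {
	uint64_t no_formal_ino;
	uint64_t no_addr;
};

static void demo_inum_in(struct demo_inum *no, const void *buf)
{
	const struct demo_inum *str = buf;

	no->no_formal_ino = be64toh(str->no_formal_ino);
	no->no_addr = be64toh(str->no_addr);
}

static void demo_inum_out(const struct demo_inum *no, void *buf)
{
	struct demo_inum *str = buf;

	str->no_formal_ino = htobe64(no->no_formal_ino);
	str->no_addr = htobe64(no->no_addr);
}

int main(void)
{
	struct demo_inum cpu = { 42, 123456 }, disk, round;

	demo_inum_out(&cpu, &disk);	/* cpu order -> big-endian buffer */
	demo_inum_in(&round, &disk);	/* big-endian buffer -> cpu order */
	printf("%llu %llu\n", (unsigned long long)round.no_formal_ino,
	       (unsigned long long)round.no_addr);	/* 42 123456 */
	return 0;
}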
51static void gfs2_inum_print(const struct gfs2_inum *no)
52{
53 printk(KERN_INFO " no_formal_ino = %llu\n", (unsigned long long)no->no_formal_ino);
54 printk(KERN_INFO " no_addr = %llu\n", (unsigned long long)no->no_addr);
55}
56
57static void gfs2_meta_header_in(struct gfs2_meta_header *mh, const void *buf)
58{
59 const struct gfs2_meta_header *str = buf;
60
61 mh->mh_magic = be32_to_cpu(str->mh_magic);
62 mh->mh_type = be32_to_cpu(str->mh_type);
63 mh->mh_format = be32_to_cpu(str->mh_format);
64}
65
66static void gfs2_meta_header_out(const struct gfs2_meta_header *mh, void *buf)
67{
68 struct gfs2_meta_header *str = buf;
69
70 str->mh_magic = cpu_to_be32(mh->mh_magic);
71 str->mh_type = cpu_to_be32(mh->mh_type);
72 str->mh_format = cpu_to_be32(mh->mh_format);
73}
74
75static void gfs2_meta_header_print(const struct gfs2_meta_header *mh)
76{
77 pv(mh, mh_magic, "0x%.8X");
78 pv(mh, mh_type, "%u");
79 pv(mh, mh_format, "%u");
80}
81
82void gfs2_sb_in(struct gfs2_sb *sb, const void *buf)
83{
84 const struct gfs2_sb *str = buf;
85
86 gfs2_meta_header_in(&sb->sb_header, buf);
87
88 sb->sb_fs_format = be32_to_cpu(str->sb_fs_format);
89 sb->sb_multihost_format = be32_to_cpu(str->sb_multihost_format);
90 sb->sb_bsize = be32_to_cpu(str->sb_bsize);
91 sb->sb_bsize_shift = be32_to_cpu(str->sb_bsize_shift);
92
93 gfs2_inum_in(&sb->sb_master_dir, (char *)&str->sb_master_dir);
94 gfs2_inum_in(&sb->sb_root_dir, (char *)&str->sb_root_dir);
95
96 memcpy(sb->sb_lockproto, str->sb_lockproto, GFS2_LOCKNAME_LEN);
97 memcpy(sb->sb_locktable, str->sb_locktable, GFS2_LOCKNAME_LEN);
98}
99
100void gfs2_rindex_in(struct gfs2_rindex *ri, const void *buf)
101{
102 const struct gfs2_rindex *str = buf;
103
104 ri->ri_addr = be64_to_cpu(str->ri_addr);
105 ri->ri_length = be32_to_cpu(str->ri_length);
106 ri->ri_data0 = be64_to_cpu(str->ri_data0);
107 ri->ri_data = be32_to_cpu(str->ri_data);
108 ri->ri_bitbytes = be32_to_cpu(str->ri_bitbytes);
109
110}
111
112void gfs2_rindex_print(const struct gfs2_rindex *ri)
113{
114 printk(KERN_INFO " ri_addr = %llu\n", (unsigned long long)ri->ri_addr);
115 pv(ri, ri_length, "%u");
116
117 printk(KERN_INFO " ri_data0 = %llu\n", (unsigned long long)ri->ri_data0);
118 pv(ri, ri_data, "%u");
119
120 pv(ri, ri_bitbytes, "%u");
121}
122
123void gfs2_rgrp_in(struct gfs2_rgrp *rg, const void *buf)
124{
125 const struct gfs2_rgrp *str = buf;
126
127 gfs2_meta_header_in(&rg->rg_header, buf);
128 rg->rg_flags = be32_to_cpu(str->rg_flags);
129 rg->rg_free = be32_to_cpu(str->rg_free);
130 rg->rg_dinodes = be32_to_cpu(str->rg_dinodes);
131 rg->rg_igeneration = be64_to_cpu(str->rg_igeneration);
132}
133
134void gfs2_rgrp_out(const struct gfs2_rgrp *rg, void *buf)
135{
136 struct gfs2_rgrp *str = buf;
137
138 gfs2_meta_header_out(&rg->rg_header, buf);
139 str->rg_flags = cpu_to_be32(rg->rg_flags);
140 str->rg_free = cpu_to_be32(rg->rg_free);
141 str->rg_dinodes = cpu_to_be32(rg->rg_dinodes);
142 str->__pad = cpu_to_be32(0);
143 str->rg_igeneration = cpu_to_be64(rg->rg_igeneration);
144 memset(&str->rg_reserved, 0, sizeof(str->rg_reserved));
145}
146
147void gfs2_quota_in(struct gfs2_quota *qu, const void *buf)
148{
149 const struct gfs2_quota *str = buf;
150
151 qu->qu_limit = be64_to_cpu(str->qu_limit);
152 qu->qu_warn = be64_to_cpu(str->qu_warn);
153 qu->qu_value = be64_to_cpu(str->qu_value);
154}
155
156void gfs2_dinode_in(struct gfs2_dinode *di, const void *buf)
157{
158 const struct gfs2_dinode *str = buf;
159
160 gfs2_meta_header_in(&di->di_header, buf);
161 gfs2_inum_in(&di->di_num, &str->di_num);
162
163 di->di_mode = be32_to_cpu(str->di_mode);
164 di->di_uid = be32_to_cpu(str->di_uid);
165 di->di_gid = be32_to_cpu(str->di_gid);
166 di->di_nlink = be32_to_cpu(str->di_nlink);
167 di->di_size = be64_to_cpu(str->di_size);
168 di->di_blocks = be64_to_cpu(str->di_blocks);
169 di->di_atime = be64_to_cpu(str->di_atime);
170 di->di_mtime = be64_to_cpu(str->di_mtime);
171 di->di_ctime = be64_to_cpu(str->di_ctime);
172 di->di_major = be32_to_cpu(str->di_major);
173 di->di_minor = be32_to_cpu(str->di_minor);
174
175 di->di_goal_meta = be64_to_cpu(str->di_goal_meta);
176 di->di_goal_data = be64_to_cpu(str->di_goal_data);
177 di->di_generation = be64_to_cpu(str->di_generation);
178
179 di->di_flags = be32_to_cpu(str->di_flags);
180 di->di_payload_format = be32_to_cpu(str->di_payload_format);
181 di->di_height = be16_to_cpu(str->di_height);
182
183 di->di_depth = be16_to_cpu(str->di_depth);
184 di->di_entries = be32_to_cpu(str->di_entries);
185
186 di->di_eattr = be64_to_cpu(str->di_eattr);
187
188}
189
190void gfs2_dinode_out(const struct gfs2_dinode *di, void *buf)
191{
192 struct gfs2_dinode *str = buf;
193
194 gfs2_meta_header_out(&di->di_header, buf);
195 gfs2_inum_out(&di->di_num, (char *)&str->di_num);
196
197 str->di_mode = cpu_to_be32(di->di_mode);
198 str->di_uid = cpu_to_be32(di->di_uid);
199 str->di_gid = cpu_to_be32(di->di_gid);
200 str->di_nlink = cpu_to_be32(di->di_nlink);
201 str->di_size = cpu_to_be64(di->di_size);
202 str->di_blocks = cpu_to_be64(di->di_blocks);
203 str->di_atime = cpu_to_be64(di->di_atime);
204 str->di_mtime = cpu_to_be64(di->di_mtime);
205 str->di_ctime = cpu_to_be64(di->di_ctime);
206 str->di_major = cpu_to_be32(di->di_major);
207 str->di_minor = cpu_to_be32(di->di_minor);
208
209 str->di_goal_meta = cpu_to_be64(di->di_goal_meta);
210 str->di_goal_data = cpu_to_be64(di->di_goal_data);
211 str->di_generation = cpu_to_be64(di->di_generation);
212
213 str->di_flags = cpu_to_be32(di->di_flags);
214 str->di_payload_format = cpu_to_be32(di->di_payload_format);
215 str->di_height = cpu_to_be16(di->di_height);
216
217 str->di_depth = cpu_to_be16(di->di_depth);
218 str->di_entries = cpu_to_be32(di->di_entries);
219
220 str->di_eattr = cpu_to_be64(di->di_eattr);
221
222}
223
224void gfs2_dinode_print(const struct gfs2_dinode *di)
225{
226 gfs2_meta_header_print(&di->di_header);
227 gfs2_inum_print(&di->di_num);
228
229 pv(di, di_mode, "0%o");
230 pv(di, di_uid, "%u");
231 pv(di, di_gid, "%u");
232 pv(di, di_nlink, "%u");
233 printk(KERN_INFO " di_size = %llu\n", (unsigned long long)di->di_size);
234 printk(KERN_INFO " di_blocks = %llu\n", (unsigned long long)di->di_blocks);
235 printk(KERN_INFO " di_atime = %lld\n", (long long)di->di_atime);
236 printk(KERN_INFO " di_mtime = %lld\n", (long long)di->di_mtime);
237 printk(KERN_INFO " di_ctime = %lld\n", (long long)di->di_ctime);
238 pv(di, di_major, "%u");
239 pv(di, di_minor, "%u");
240
241 printk(KERN_INFO " di_goal_meta = %llu\n", (unsigned long long)di->di_goal_meta);
242 printk(KERN_INFO " di_goal_data = %llu\n", (unsigned long long)di->di_goal_data);
243
244 pv(di, di_flags, "0x%.8X");
245 pv(di, di_payload_format, "%u");
246 pv(di, di_height, "%u");
247
248 pv(di, di_depth, "%u");
249 pv(di, di_entries, "%u");
250
251 printk(KERN_INFO " di_eattr = %llu\n", (unsigned long long)di->di_eattr);
252}
253
254void gfs2_log_header_in(struct gfs2_log_header *lh, const void *buf)
255{
256 const struct gfs2_log_header *str = buf;
257
258 gfs2_meta_header_in(&lh->lh_header, buf);
259 lh->lh_sequence = be64_to_cpu(str->lh_sequence);
260 lh->lh_flags = be32_to_cpu(str->lh_flags);
261 lh->lh_tail = be32_to_cpu(str->lh_tail);
262 lh->lh_blkno = be32_to_cpu(str->lh_blkno);
263 lh->lh_hash = be32_to_cpu(str->lh_hash);
264}
265
266void gfs2_inum_range_in(struct gfs2_inum_range *ir, const void *buf)
267{
268 const struct gfs2_inum_range *str = buf;
269
270 ir->ir_start = be64_to_cpu(str->ir_start);
271 ir->ir_length = be64_to_cpu(str->ir_length);
272}
273
274void gfs2_inum_range_out(const struct gfs2_inum_range *ir, void *buf)
275{
276 struct gfs2_inum_range *str = buf;
277
278 str->ir_start = cpu_to_be64(ir->ir_start);
279 str->ir_length = cpu_to_be64(ir->ir_length);
280}
281
282void gfs2_statfs_change_in(struct gfs2_statfs_change *sc, const void *buf)
283{
284 const struct gfs2_statfs_change *str = buf;
285
286 sc->sc_total = be64_to_cpu(str->sc_total);
287 sc->sc_free = be64_to_cpu(str->sc_free);
288 sc->sc_dinodes = be64_to_cpu(str->sc_dinodes);
289}
290
291void gfs2_statfs_change_out(const struct gfs2_statfs_change *sc, void *buf)
292{
293 struct gfs2_statfs_change *str = buf;
294
295 str->sc_total = cpu_to_be64(sc->sc_total);
296 str->sc_free = cpu_to_be64(sc->sc_free);
297 str->sc_dinodes = cpu_to_be64(sc->sc_dinodes);
298}
299
300void gfs2_quota_change_in(struct gfs2_quota_change *qc, const void *buf)
301{
302 const struct gfs2_quota_change *str = buf;
303
304 qc->qc_change = be64_to_cpu(str->qc_change);
305 qc->qc_flags = be32_to_cpu(str->qc_flags);
306 qc->qc_id = be32_to_cpu(str->qc_id);
307}
308
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
new file mode 100644
index 000000000000..8d5963c7e123
--- /dev/null
+++ b/fs/gfs2/ops_address.c
@@ -0,0 +1,793 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/pagemap.h>
16#include <linux/pagevec.h>
17#include <linux/mpage.h>
18#include <linux/fs.h>
19#include <linux/gfs2_ondisk.h>
20#include <linux/lm_interface.h>
21
22#include "gfs2.h"
23#include "incore.h"
24#include "bmap.h"
25#include "glock.h"
26#include "inode.h"
27#include "log.h"
28#include "meta_io.h"
29#include "ops_address.h"
30#include "quota.h"
31#include "trans.h"
32#include "rgrp.h"
33#include "ops_file.h"
34#include "util.h"
35#include "glops.h"
36
37
38static void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page,
39 unsigned int from, unsigned int to)
40{
41 struct buffer_head *head = page_buffers(page);
42 unsigned int bsize = head->b_size;
43 struct buffer_head *bh;
44 unsigned int start, end;
45
46 for (bh = head, start = 0; bh != head || !start;
47 bh = bh->b_this_page, start = end) {
48 end = start + bsize;
49 if (end <= from || start >= to)
50 continue;
51 gfs2_trans_add_bh(ip->i_gl, bh, 0);
52 }
53}
54
55/**
56 * gfs2_get_block - Fills in a buffer head with details about a block
57 * @inode: The inode
58 * @lblock: The block number to look up
59 * @bh_result: The buffer head to return the result in
60 * @create: Non-zero if we may add block to the file
61 *
62 * Returns: errno
63 */
64
65int gfs2_get_block(struct inode *inode, sector_t lblock,
66 struct buffer_head *bh_result, int create)
67{
68 return gfs2_block_map(inode, lblock, create, bh_result);
69}
70
71/**
72 * gfs2_get_block_noalloc - Fills in a buffer head with details about a block
73 * @inode: The inode
74 * @lblock: The block number to look up
75 * @bh_result: The buffer head to return the result in
76 * @create: Non-zero if we may add block to the file
77 *
78 * Returns: errno
79 */
80
81static int gfs2_get_block_noalloc(struct inode *inode, sector_t lblock,
82 struct buffer_head *bh_result, int create)
83{
84 int error;
85
86 error = gfs2_block_map(inode, lblock, 0, bh_result);
87 if (error)
88 return error;
89 if (bh_result->b_blocknr == 0)
90 return -EIO;
91 return 0;
92}
93
94static int gfs2_get_block_direct(struct inode *inode, sector_t lblock,
95 struct buffer_head *bh_result, int create)
96{
97 return gfs2_block_map(inode, lblock, 0, bh_result);
98}
99
100/**
101 * gfs2_writepage - Write complete page
102 * @page: Page to write
103 *
104 * Returns: errno
105 *
106 * Some of this is copied from block_write_full_page() although we still
107 * call it to do most of the work.
108 */
109
110static int gfs2_writepage(struct page *page, struct writeback_control *wbc)
111{
112 struct inode *inode = page->mapping->host;
113 struct gfs2_inode *ip = GFS2_I(inode);
114 struct gfs2_sbd *sdp = GFS2_SB(inode);
115 loff_t i_size = i_size_read(inode);
116 pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
117 unsigned offset;
118 int error;
119 int done_trans = 0;
120
121 if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(ip->i_gl))) {
122 unlock_page(page);
123 return -EIO;
124 }
125 if (current->journal_info)
126 goto out_ignore;
127
128 /* Is the page fully outside i_size? (truncate in progress) */
129 offset = i_size & (PAGE_CACHE_SIZE-1);
130 if (page->index > end_index || (page->index == end_index && !offset)) {
131 page->mapping->a_ops->invalidatepage(page, 0);
132 unlock_page(page);
133 return 0; /* don't care */
134 }
135
136 if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip)) {
137 error = gfs2_trans_begin(sdp, RES_DINODE + 1, 0);
138 if (error)
139 goto out_ignore;
140 if (!page_has_buffers(page)) {
141 create_empty_buffers(page, inode->i_sb->s_blocksize,
142 (1 << BH_Dirty)|(1 << BH_Uptodate));
143 }
144 gfs2_page_add_databufs(ip, page, 0, sdp->sd_vfs->s_blocksize-1);
145 done_trans = 1;
146 }
147 error = block_write_full_page(page, gfs2_get_block_noalloc, wbc);
148 if (done_trans)
149 gfs2_trans_end(sdp);
150 gfs2_meta_cache_flush(ip);
151 return error;
152
153out_ignore:
154 redirty_page_for_writepage(wbc, page);
155 unlock_page(page);
156 return 0;
157}
158
159static int zero_readpage(struct page *page)
160{
161 void *kaddr;
162
163 kaddr = kmap_atomic(page, KM_USER0);
164 memset(kaddr, 0, PAGE_CACHE_SIZE);
165 kunmap_atomic(kaddr, KM_USER0);
166
167 SetPageUptodate(page);
168
169 return 0;
170}
171
172/**
173 * stuffed_readpage - Fill in a Linux page with stuffed file data
174 * @ip: the inode
175 * @page: the page
176 *
177 * Returns: errno
178 */
179
180static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
181{
182 struct buffer_head *dibh;
183 void *kaddr;
184 int error;
185
186 /* Only the first page of a stuffed file might contain data */
187 if (unlikely(page->index))
188 return zero_readpage(page);
189
190 error = gfs2_meta_inode_buffer(ip, &dibh);
191 if (error)
192 return error;
193
194 kaddr = kmap_atomic(page, KM_USER0);
195 memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode),
196 ip->i_di.di_size);
197 memset(kaddr + ip->i_di.di_size, 0, PAGE_CACHE_SIZE - ip->i_di.di_size);
198 kunmap_atomic(kaddr, KM_USER0);
199
200 brelse(dibh);
201
202 SetPageUptodate(page);
203
204 return 0;
205}
206
207
208/**
209 * gfs2_readpage - readpage with locking
210 * @file: The file to read a page for. N.B. This may be NULL if we are
211 * reading an internal file.
212 * @page: The page to read
213 *
214 * Returns: errno
215 */
216
217static int gfs2_readpage(struct file *file, struct page *page)
218{
219 struct gfs2_inode *ip = GFS2_I(page->mapping->host);
220 struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host);
221 struct gfs2_file *gf = NULL;
222 struct gfs2_holder gh;
223 int error;
224 int do_unlock = 0;
225
226 if (likely(file != &gfs2_internal_file_sentinel)) {
227 if (file) {
228 gf = file->private_data;
229 if (test_bit(GFF_EXLOCK, &gf->f_flags))
230 /* gfs2_sharewrite_nopage has grabbed the ip->i_gl already */
231 goto skip_lock;
232 }
233 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME|GL_AOP, &gh);
234 do_unlock = 1;
235 error = gfs2_glock_nq_m_atime(1, &gh);
236 if (unlikely(error))
237 goto out_unlock;
238 }
239
240skip_lock:
241 if (gfs2_is_stuffed(ip)) {
242 error = stuffed_readpage(ip, page);
243 unlock_page(page);
244 } else
245 error = mpage_readpage(page, gfs2_get_block);
246
247 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
248 error = -EIO;
249
250 if (do_unlock) {
251 gfs2_glock_dq_m(1, &gh);
252 gfs2_holder_uninit(&gh);
253 }
254out:
255 return error;
256out_unlock:
257 unlock_page(page);
258 if (do_unlock)
259 gfs2_holder_uninit(&gh);
260 goto out;
261}
262
263/**
264 * gfs2_readpages - Read a bunch of pages at once
265 *
266 * Some notes:
267 * 1. This is only for readahead, so we can simply ignore any things
268 * which are slightly inconvenient (such as locking conflicts between
269 * the page lock and the glock) and return having done no I/O. Its
270 * obviously not something we'd want to do on too regular a basis.
271 * Any I/O we ignore at this time will be done via readpage later.
272 * 2. We have to handle stuffed files here too.
273 * 3. mpage_readpages() does most of the heavy lifting in the common case.
274 * 4. gfs2_get_block() is relied upon to set BH_Boundary in the right places.
275 * 5. We use LM_FLAG_TRY_1CB here, effectively we then have lock-ahead as
276 * well as read-ahead.
277 */
278static int gfs2_readpages(struct file *file, struct address_space *mapping,
279 struct list_head *pages, unsigned nr_pages)
280{
281 struct inode *inode = mapping->host;
282 struct gfs2_inode *ip = GFS2_I(inode);
283 struct gfs2_sbd *sdp = GFS2_SB(inode);
284 struct gfs2_holder gh;
285 unsigned page_idx;
286 int ret;
287 int do_unlock = 0;
288
289 if (likely(file != &gfs2_internal_file_sentinel)) {
290 if (file) {
291 struct gfs2_file *gf = file->private_data;
292 if (test_bit(GFF_EXLOCK, &gf->f_flags))
293 goto skip_lock;
294 }
295 gfs2_holder_init(ip->i_gl, LM_ST_SHARED,
296 LM_FLAG_TRY_1CB|GL_ATIME|GL_AOP, &gh);
297 do_unlock = 1;
298 ret = gfs2_glock_nq_m_atime(1, &gh);
299 if (ret == GLR_TRYFAILED)
300 goto out_noerror;
301 if (unlikely(ret))
302 goto out_unlock;
303 }
304skip_lock:
305 if (gfs2_is_stuffed(ip)) {
306 struct pagevec lru_pvec;
307 pagevec_init(&lru_pvec, 0);
308 for (page_idx = 0; page_idx < nr_pages; page_idx++) {
309 struct page *page = list_entry(pages->prev, struct page, lru);
310 prefetchw(&page->flags);
311 list_del(&page->lru);
312 if (!add_to_page_cache(page, mapping,
313 page->index, GFP_KERNEL)) {
314 ret = stuffed_readpage(ip, page);
315 unlock_page(page);
316 if (!pagevec_add(&lru_pvec, page))
317 __pagevec_lru_add(&lru_pvec);
318 } else {
319 page_cache_release(page);
320 }
321 }
322 pagevec_lru_add(&lru_pvec);
323 ret = 0;
324 } else {
325 /* What we really want to do .... */
326 ret = mpage_readpages(mapping, pages, nr_pages, gfs2_get_block);
327 }
328
329 if (do_unlock) {
330 gfs2_glock_dq_m(1, &gh);
331 gfs2_holder_uninit(&gh);
332 }
333out:
334 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
335 ret = -EIO;
336 return ret;
337out_noerror:
338 ret = 0;
339out_unlock:
340 /* unlock all pages, we can't do any I/O right now */
341 for (page_idx = 0; page_idx < nr_pages; page_idx++) {
342 struct page *page = list_entry(pages->prev, struct page, lru);
343 list_del(&page->lru);
344 unlock_page(page);
345 page_cache_release(page);
346 }
347 if (do_unlock)
348 gfs2_holder_uninit(&gh);
349 goto out;
350}
351
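The notes above boil down to: readahead is best effort, so a failed try-lock means do no I/O and report success, and readpage() fills the pages in later. A distilled skeleton of that control flow (the types and names here are illustrative, not the GFS2 locking API):

#include <stdio.h>

enum { LOCK_OK, LOCK_TRYFAILED, LOCK_ERROR };

/* Stand-in for a trylock-style cluster lock acquisition. */
static int trylock_glock(int contended)
{
	return contended ? LOCK_TRYFAILED : LOCK_OK;
}

static int readahead(int contended, unsigned int nr_pages)
{
	int ret = trylock_glock(contended);

	if (ret == LOCK_TRYFAILED)
		return 0;	/* contention is not an error: just skip */
	if (ret != LOCK_OK)
		return -1;	/* a real locking failure */

	printf("issuing readahead for %u pages\n", nr_pages);
	/* ... submit the reads, then drop the lock ... */
	return 0;
}

int main(void)
{
	readahead(0, 16);	/* uncontended: readahead proceeds */
	readahead(1, 16);	/* contended: silently skipped */
	return 0;
}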
352/**
353 * gfs2_prepare_write - Prepare to write a page to a file
354 * @file: The file to write to
355 * @page: The page which is to be prepared for writing
356 * @from: From (byte range within page)
357 * @to: To (byte range within page)
358 *
359 * Returns: errno
360 */
361
362static int gfs2_prepare_write(struct file *file, struct page *page,
363 unsigned from, unsigned to)
364{
365 struct gfs2_inode *ip = GFS2_I(page->mapping->host);
366 struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host);
367 unsigned int data_blocks, ind_blocks, rblocks;
368 int alloc_required;
369 int error = 0;
370 loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + from;
371 loff_t end = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
372 struct gfs2_alloc *al;
373 unsigned int write_len = to - from;
374
375
376 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_ATIME|GL_AOP, &ip->i_gh);
377 error = gfs2_glock_nq_m_atime(1, &ip->i_gh);
378 if (error)
379 goto out_uninit;
380
381 gfs2_write_calc_reserv(ip, write_len, &data_blocks, &ind_blocks);
382
383 error = gfs2_write_alloc_required(ip, pos, write_len, &alloc_required);
384 if (error)
385 goto out_unlock;
386
387
388 ip->i_alloc.al_requested = 0;
389 if (alloc_required) {
390 al = gfs2_alloc_get(ip);
391
392 error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
393 if (error)
394 goto out_alloc_put;
395
396 error = gfs2_quota_check(ip, ip->i_di.di_uid, ip->i_di.di_gid);
397 if (error)
398 goto out_qunlock;
399
400 al->al_requested = data_blocks + ind_blocks;
401 error = gfs2_inplace_reserve(ip);
402 if (error)
403 goto out_qunlock;
404 }
405
406 rblocks = RES_DINODE + ind_blocks;
407 if (gfs2_is_jdata(ip))
408 rblocks += data_blocks ? data_blocks : 1;
409 if (ind_blocks || data_blocks)
410 rblocks += RES_STATFS + RES_QUOTA;
411
412 error = gfs2_trans_begin(sdp, rblocks, 0);
413 if (error)
414 goto out;
415
416 if (gfs2_is_stuffed(ip)) {
417 if (end > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) {
418 error = gfs2_unstuff_dinode(ip, page);
419 if (error == 0)
420 goto prepare_write;
421 } else if (!PageUptodate(page))
422 error = stuffed_readpage(ip, page);
423 goto out;
424 }
425
426prepare_write:
427 error = block_prepare_write(page, from, to, gfs2_get_block);
428
429out:
430 if (error) {
431 gfs2_trans_end(sdp);
432 if (alloc_required) {
433 gfs2_inplace_release(ip);
434out_qunlock:
435 gfs2_quota_unlock(ip);
436out_alloc_put:
437 gfs2_alloc_put(ip);
438 }
439out_unlock:
440 gfs2_glock_dq_m(1, &ip->i_gh);
441out_uninit:
442 gfs2_holder_uninit(&ip->i_gh);
443 }
444
445 return error;
446}
447
448/**
449 * gfs2_commit_write - Commit write to a file
450 * @file: The file to write to
451 * @page: The page containing the data
452 * @from: From (byte range within page)
453 * @to: To (byte range within page)
454 *
455 * Returns: errno
456 */
457
458static int gfs2_commit_write(struct file *file, struct page *page,
459 unsigned from, unsigned to)
460{
461 struct inode *inode = page->mapping->host;
462 struct gfs2_inode *ip = GFS2_I(inode);
463 struct gfs2_sbd *sdp = GFS2_SB(inode);
464 int error = -EOPNOTSUPP;
465 struct buffer_head *dibh;
466 struct gfs2_alloc *al = &ip->i_alloc;
467 struct gfs2_dinode *di;
468
469 if (gfs2_assert_withdraw(sdp, gfs2_glock_is_locked_by_me(ip->i_gl)))
470 goto fail_nounlock;
471
472 error = gfs2_meta_inode_buffer(ip, &dibh);
473 if (error)
474 goto fail_endtrans;
475
476 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
477 di = (struct gfs2_dinode *)dibh->b_data;
478
479 if (gfs2_is_stuffed(ip)) {
480 u64 file_size;
481 void *kaddr;
482
483 file_size = ((u64)page->index << PAGE_CACHE_SHIFT) + to;
484
485 kaddr = kmap_atomic(page, KM_USER0);
486 memcpy(dibh->b_data + sizeof(struct gfs2_dinode) + from,
487 kaddr + from, to - from);
488 kunmap_atomic(kaddr, KM_USER0);
489
490 SetPageUptodate(page);
491
492 if (inode->i_size < file_size)
493 i_size_write(inode, file_size);
494 } else {
495 if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED ||
496 gfs2_is_jdata(ip))
497 gfs2_page_add_databufs(ip, page, from, to);
498 error = generic_commit_write(file, page, from, to);
499 if (error)
500 goto fail;
501 }
502
503 if (ip->i_di.di_size < inode->i_size) {
504 ip->i_di.di_size = inode->i_size;
505 di->di_size = cpu_to_be64(inode->i_size);
506 }
507
508 di->di_mode = cpu_to_be32(inode->i_mode);
509 di->di_atime = cpu_to_be64(inode->i_atime.tv_sec);
510 di->di_mtime = cpu_to_be64(inode->i_mtime.tv_sec);
511 di->di_ctime = cpu_to_be64(inode->i_ctime.tv_sec);
512
513 brelse(dibh);
514 gfs2_trans_end(sdp);
515 if (al->al_requested) {
516 gfs2_inplace_release(ip);
517 gfs2_quota_unlock(ip);
518 gfs2_alloc_put(ip);
519 }
520 gfs2_glock_dq_m(1, &ip->i_gh);
521 gfs2_holder_uninit(&ip->i_gh);
522 return 0;
523
524fail:
525 brelse(dibh);
526fail_endtrans:
527 gfs2_trans_end(sdp);
528 if (al->al_requested) {
529 gfs2_inplace_release(ip);
530 gfs2_quota_unlock(ip);
531 gfs2_alloc_put(ip);
532 }
533 gfs2_glock_dq_m(1, &ip->i_gh);
534 gfs2_holder_uninit(&ip->i_gh);
535fail_nounlock:
536 ClearPageUptodate(page);
537 return error;
538}
539
540/**
541 * gfs2_bmap - Block map function
542 * @mapping: Address space info
543 * @lblock: The block to map
544 *
545 * Returns: The disk address for the block or 0 on hole or error
546 */
547
548static sector_t gfs2_bmap(struct address_space *mapping, sector_t lblock)
549{
550 struct gfs2_inode *ip = GFS2_I(mapping->host);
551 struct gfs2_holder i_gh;
552 sector_t dblock = 0;
553 int error;
554
555 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
556 if (error)
557 return 0;
558
559 if (!gfs2_is_stuffed(ip))
560 dblock = generic_block_bmap(mapping, lblock, gfs2_get_block);
561
562 gfs2_glock_dq_uninit(&i_gh);
563
564 return dblock;
565}
566
567static void discard_buffer(struct gfs2_sbd *sdp, struct buffer_head *bh)
568{
569 struct gfs2_bufdata *bd;
570
571 gfs2_log_lock(sdp);
572 bd = bh->b_private;
573 if (bd) {
574 bd->bd_bh = NULL;
575 bh->b_private = NULL;
576 }
577 gfs2_log_unlock(sdp);
578
579 lock_buffer(bh);
580 clear_buffer_dirty(bh);
581 bh->b_bdev = NULL;
582 clear_buffer_mapped(bh);
583 clear_buffer_req(bh);
584 clear_buffer_new(bh);
585 clear_buffer_delay(bh);
586 unlock_buffer(bh);
587}
588
589static void gfs2_invalidatepage(struct page *page, unsigned long offset)
590{
591 struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host);
592 struct buffer_head *head, *bh, *next;
593 unsigned int curr_off = 0;
594
595 BUG_ON(!PageLocked(page));
596 if (!page_has_buffers(page))
597 return;
598
599 bh = head = page_buffers(page);
600 do {
601 unsigned int next_off = curr_off + bh->b_size;
602 next = bh->b_this_page;
603
604 if (offset <= curr_off)
605 discard_buffer(sdp, bh);
606
607 curr_off = next_off;
608 bh = next;
609 } while (bh != head);
610
611 if (!offset)
612 try_to_release_page(page, 0);
613
614 return;
615}
616
617static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb,
618 const struct iovec *iov, loff_t offset,
619 unsigned long nr_segs)
620{
621 struct file *file = iocb->ki_filp;
622 struct inode *inode = file->f_mapping->host;
623 struct gfs2_inode *ip = GFS2_I(inode);
624 struct gfs2_holder gh;
625 int rv;
626
627 if (rw == READ)
628 mutex_lock(&inode->i_mutex);
629 /*
630 * Shared lock, even if it's a write, since we do no allocation
631 * on this path. All we need change is atime.
632 */
633 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &gh);
634 rv = gfs2_glock_nq_m_atime(1, &gh);
635 if (rv)
636 goto out;
637
638 if (offset > i_size_read(inode))
639 goto out;
640
641 /*
642 * Should we return an error here? I can't see that O_DIRECT for
643 * a journaled file makes any sense. For now we'll silently fall
644 * back to buffered I/O, likewise we do the same for stuffed
645 * files since they are (a) small and (b) unaligned.
646 */
647 if (gfs2_is_jdata(ip))
648 goto out;
649
650 if (gfs2_is_stuffed(ip))
651 goto out;
652
653 rv = blockdev_direct_IO_own_locking(rw, iocb, inode,
654 inode->i_sb->s_bdev,
655 iov, offset, nr_segs,
656 gfs2_get_block_direct, NULL);
657out:
658 gfs2_glock_dq_m(1, &gh);
659 gfs2_holder_uninit(&gh);
660 if (rw == READ)
661 mutex_unlock(&inode->i_mutex);
662
663 return rv;
664}
665
666/**
667 * stuck_releasepage - We're stuck in gfs2_releasepage(). Print stuff out.
668 * @bh: the buffer we're stuck on
669 *
670 */
671
672static void stuck_releasepage(struct buffer_head *bh)
673{
674 struct inode *inode = bh->b_page->mapping->host;
675 struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
676 struct gfs2_bufdata *bd = bh->b_private;
677 struct gfs2_glock *gl;
678 static unsigned limit = 0;
679
680 if (limit > 3)
681 return;
682 limit++;
683
684 fs_warn(sdp, "stuck in gfs2_releasepage() %p\n", inode);
685 fs_warn(sdp, "blkno = %llu, bh->b_count = %d\n",
686 (unsigned long long)bh->b_blocknr, atomic_read(&bh->b_count));
687 fs_warn(sdp, "pinned = %u\n", buffer_pinned(bh));
688 fs_warn(sdp, "bh->b_private = %s\n", (bd) ? "!NULL" : "NULL");
689
690 if (!bd)
691 return;
692
693 gl = bd->bd_gl;
694
695 fs_warn(sdp, "gl = (%u, %llu)\n",
696 gl->gl_name.ln_type, (unsigned long long)gl->gl_name.ln_number);
697
698 fs_warn(sdp, "bd_list_tr = %s, bd_le.le_list = %s\n",
699 (list_empty(&bd->bd_list_tr)) ? "no" : "yes",
700 (list_empty(&bd->bd_le.le_list)) ? "no" : "yes");
701
702 if (gl->gl_ops == &gfs2_inode_glops) {
703 struct gfs2_inode *ip = gl->gl_object;
704 unsigned int x;
705
706 if (!ip)
707 return;
708
709 fs_warn(sdp, "ip = %llu %llu\n",
710 (unsigned long long)ip->i_num.no_formal_ino,
711 (unsigned long long)ip->i_num.no_addr);
712
713 for (x = 0; x < GFS2_MAX_META_HEIGHT; x++)
714 fs_warn(sdp, "ip->i_cache[%u] = %s\n",
715 x, (ip->i_cache[x]) ? "!NULL" : "NULL");
716 }
717}
718
719/**
720 * gfs2_releasepage - free the metadata associated with a page
721 * @page: the page that's being released
722 * @gfp_mask: passed from Linux VFS, ignored by us
723 *
724 * Call try_to_free_buffers() if the buffers in this page can be
725 * released.
726 *
727 * Returns: 0
728 */
729
730int gfs2_releasepage(struct page *page, gfp_t gfp_mask)
731{
732 struct inode *aspace = page->mapping->host;
733 struct gfs2_sbd *sdp = aspace->i_sb->s_fs_info;
734 struct buffer_head *bh, *head;
735 struct gfs2_bufdata *bd;
736 unsigned long t = jiffies + gfs2_tune_get(sdp, gt_stall_secs) * HZ;
737
738 if (!page_has_buffers(page))
739 goto out;
740
741 head = bh = page_buffers(page);
742 do {
743 while (atomic_read(&bh->b_count)) {
744 if (!atomic_read(&aspace->i_writecount))
745 return 0;
746
747 if (time_after_eq(jiffies, t)) {
748 stuck_releasepage(bh);
749 /* should we withdraw here? */
750 return 0;
751 }
752
753 yield();
754 }
755
756 gfs2_assert_warn(sdp, !buffer_pinned(bh));
757 gfs2_assert_warn(sdp, !buffer_dirty(bh));
758
759 gfs2_log_lock(sdp);
760 bd = bh->b_private;
761 if (bd) {
762 gfs2_assert_warn(sdp, bd->bd_bh == bh);
763 gfs2_assert_warn(sdp, list_empty(&bd->bd_list_tr));
764 gfs2_assert_warn(sdp, !bd->bd_ail);
765 bd->bd_bh = NULL;
766 if (!list_empty(&bd->bd_le.le_list))
767 bd = NULL;
768 bh->b_private = NULL;
769 }
770 gfs2_log_unlock(sdp);
771 if (bd)
772 kmem_cache_free(gfs2_bufdata_cachep, bd);
773
774 bh = bh->b_this_page;
775 } while (bh != head);
776
777out:
778 return try_to_free_buffers(page);
779}
780
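gfs2_releasepage() above is a bounded busy-wait: it yields while a buffer is still referenced and, once gt_stall_secs elapses, gives up and dumps diagnostics instead of hanging. The same pattern as a standalone userspace sketch, with wall-clock time standing in for jiffies:

#include <stdio.h>
#include <time.h>
#include <sched.h>

/* Yield until busy() clears or the deadline passes; returns 0 if we
 * gave up, mirroring the stuck_releasepage() escape hatch above. */
static int wait_until_free(int (*busy)(void), time_t deadline)
{
	while (busy()) {
		if (time(NULL) >= deadline)
			return 0;
		sched_yield();
	}
	return 1;
}

static int always_busy(void)
{
	return 1;
}

int main(void)
{
	/* spins for ~2 seconds, then reports the stall */
	if (!wait_until_free(always_busy, time(NULL) + 2))
		fprintf(stderr, "stuck: would dump diagnostics here\n");
	return 0;
}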
781const struct address_space_operations gfs2_file_aops = {
782 .writepage = gfs2_writepage,
783 .readpage = gfs2_readpage,
784 .readpages = gfs2_readpages,
785 .sync_page = block_sync_page,
786 .prepare_write = gfs2_prepare_write,
787 .commit_write = gfs2_commit_write,
788 .bmap = gfs2_bmap,
789 .invalidatepage = gfs2_invalidatepage,
790 .releasepage = gfs2_releasepage,
791 .direct_IO = gfs2_direct_IO,
792};
793
diff --git a/fs/gfs2/ops_address.h b/fs/gfs2/ops_address.h
new file mode 100644
index 000000000000..35aaee4aa7e1
--- /dev/null
+++ b/fs/gfs2/ops_address.h
@@ -0,0 +1,22 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __OPS_ADDRESS_DOT_H__
11#define __OPS_ADDRESS_DOT_H__
12
13#include <linux/fs.h>
14#include <linux/buffer_head.h>
15#include <linux/mm.h>
16
17extern const struct address_space_operations gfs2_file_aops;
18extern int gfs2_get_block(struct inode *inode, sector_t lblock,
19 struct buffer_head *bh_result, int create);
20extern int gfs2_releasepage(struct page *page, gfp_t gfp_mask);
21
22#endif /* __OPS_ADDRESS_DOT_H__ */
diff --git a/fs/gfs2/ops_dentry.c b/fs/gfs2/ops_dentry.c
new file mode 100644
index 000000000000..00041b1b8025
--- /dev/null
+++ b/fs/gfs2/ops_dentry.c
@@ -0,0 +1,119 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/smp_lock.h>
16#include <linux/gfs2_ondisk.h>
17#include <linux/crc32.h>
18#include <linux/lm_interface.h>
19
20#include "gfs2.h"
21#include "incore.h"
22#include "dir.h"
23#include "glock.h"
24#include "ops_dentry.h"
25#include "util.h"
26
27/**
28 * gfs2_drevalidate - Check directory lookup consistency
29 * @dentry: the dentry to revalidate
30 * @nd: lookup intent data from the VFS (unused here)
31 *
32 * Check to make sure the lookup necessary to arrive at this inode from its
33 * parent is still good.
34 *
35 * Returns: 1 if the dentry is ok, 0 if it isn't
36 */
37
38static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd)
39{
40 struct dentry *parent = dget_parent(dentry);
41 struct gfs2_sbd *sdp = GFS2_SB(parent->d_inode);
42 struct gfs2_inode *dip = GFS2_I(parent->d_inode);
43 struct inode *inode = dentry->d_inode;
44 struct gfs2_holder d_gh;
45 struct gfs2_inode *ip;
46 struct gfs2_inum inum;
47 unsigned int type;
48 int error;
49
50 if (inode && is_bad_inode(inode))
51 goto invalid;
52
53 if (sdp->sd_args.ar_localcaching)
54 goto valid;
55
56 error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh);
57 if (error)
58 goto fail;
59
60 error = gfs2_dir_search(parent->d_inode, &dentry->d_name, &inum, &type);
61 switch (error) {
62 case 0:
63 if (!inode)
64 goto invalid_gunlock;
65 break;
66 case -ENOENT:
67 if (!inode)
68 goto valid_gunlock;
69 goto invalid_gunlock;
70 default:
71 goto fail_gunlock;
72 }
73
74 ip = GFS2_I(inode);
75
76 if (!gfs2_inum_equal(&ip->i_num, &inum))
77 goto invalid_gunlock;
78
79 if (IF2DT(ip->i_di.di_mode) != type) {
80 gfs2_consist_inode(dip);
81 goto fail_gunlock;
82 }
83
84valid_gunlock:
85 gfs2_glock_dq_uninit(&d_gh);
86valid:
87 dput(parent);
88 return 1;
89
90invalid_gunlock:
91 gfs2_glock_dq_uninit(&d_gh);
92invalid:
93 if (inode && S_ISDIR(inode->i_mode)) {
94 if (have_submounts(dentry))
95 goto valid;
96 shrink_dcache_parent(dentry);
97 }
98 d_drop(dentry);
99 dput(parent);
100 return 0;
101
102fail_gunlock:
103 gfs2_glock_dq_uninit(&d_gh);
104fail:
105 dput(parent);
106 return 0;
107}
108
109static int gfs2_dhash(struct dentry *dentry, struct qstr *str)
110{
111 str->hash = gfs2_disk_hash(str->name, str->len);
112 return 0;
113}
114
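gfs2_dhash() stores GFS2's on-disk name hash into the qstr, so the dcache and the extendible directory hashing agree on every name. gfs2_disk_hash() appears to be a plain pre/post-inverted CRC32; assuming that holds, the hash of a name can be reproduced in userspace with zlib:

#include <stdio.h>
#include <string.h>
#include <zlib.h>		/* link with -lz */

int main(void)
{
	const char *name = "example";
	/* zlib's crc32() is the same ~0-seeded, ~0-finalised CRC32 */
	unsigned long hash = crc32(0L, (const unsigned char *)name,
				   strlen(name));

	printf("hash(\"%s\") = %#lx\n", name, hash);
	return 0;
}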
115struct dentry_operations gfs2_dops = {
116 .d_revalidate = gfs2_drevalidate,
117 .d_hash = gfs2_dhash,
118};
119
diff --git a/fs/gfs2/ops_dentry.h b/fs/gfs2/ops_dentry.h
new file mode 100644
index 000000000000..5caa3db4d3f5
--- /dev/null
+++ b/fs/gfs2/ops_dentry.h
@@ -0,0 +1,17 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __OPS_DENTRY_DOT_H__
11#define __OPS_DENTRY_DOT_H__
12
13#include <linux/dcache.h>
14
15extern struct dentry_operations gfs2_dops;
16
17#endif /* __OPS_DENTRY_DOT_H__ */
diff --git a/fs/gfs2/ops_export.c b/fs/gfs2/ops_export.c
new file mode 100644
index 000000000000..86127d93bd35
--- /dev/null
+++ b/fs/gfs2/ops_export.c
@@ -0,0 +1,298 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/gfs2_ondisk.h>
16#include <linux/crc32.h>
17#include <linux/lm_interface.h>
18
19#include "gfs2.h"
20#include "incore.h"
21#include "dir.h"
22#include "glock.h"
23#include "glops.h"
24#include "inode.h"
25#include "ops_export.h"
26#include "rgrp.h"
27#include "util.h"
28
29static struct dentry *gfs2_decode_fh(struct super_block *sb,
30 __u32 *fh,
31 int fh_len,
32 int fh_type,
33 int (*acceptable)(void *context,
34 struct dentry *dentry),
35 void *context)
36{
37 struct gfs2_fh_obj fh_obj;
38 struct gfs2_inum *this, parent;
39
40 if (fh_type != fh_len)
41 return NULL;
42
43 this = &fh_obj.this;
44 fh_obj.imode = DT_UNKNOWN;
45 memset(&parent, 0, sizeof(struct gfs2_inum));
46
47 switch (fh_type) {
48 case GFS2_LARGE_FH_SIZE:
49 parent.no_formal_ino = ((u64)be32_to_cpu(fh[4])) << 32;
50 parent.no_formal_ino |= be32_to_cpu(fh[5]);
51 parent.no_addr = ((u64)be32_to_cpu(fh[6])) << 32;
52 parent.no_addr |= be32_to_cpu(fh[7]);
53 fh_obj.imode = be32_to_cpu(fh[8]);
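	/* fall through: a large handle embeds the small one */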
54 case GFS2_SMALL_FH_SIZE:
55 this->no_formal_ino = ((u64)be32_to_cpu(fh[0])) << 32;
56 this->no_formal_ino |= be32_to_cpu(fh[1]);
57 this->no_addr = ((u64)be32_to_cpu(fh[2])) << 32;
58 this->no_addr |= be32_to_cpu(fh[3]);
59 break;
60 default:
61 return NULL;
62 }
63
64 return gfs2_export_ops.find_exported_dentry(sb, &fh_obj, &parent,
65 acceptable, context);
66}
67
68static int gfs2_encode_fh(struct dentry *dentry, __u32 *fh, int *len,
69 int connectable)
70{
71 struct inode *inode = dentry->d_inode;
72 struct super_block *sb = inode->i_sb;
73 struct gfs2_inode *ip = GFS2_I(inode);
74
75 if (*len < GFS2_SMALL_FH_SIZE ||
76 (connectable && *len < GFS2_LARGE_FH_SIZE))
77 return 255;
78
79 fh[0] = ip->i_num.no_formal_ino >> 32;
80 fh[0] = cpu_to_be32(fh[0]);
81 fh[1] = ip->i_num.no_formal_ino & 0xFFFFFFFF;
82 fh[1] = cpu_to_be32(fh[1]);
83 fh[2] = ip->i_num.no_addr >> 32;
84 fh[2] = cpu_to_be32(fh[2]);
85 fh[3] = ip->i_num.no_addr & 0xFFFFFFFF;
86 fh[3] = cpu_to_be32(fh[3]);
87 *len = GFS2_SMALL_FH_SIZE;
88
89 if (!connectable || inode == sb->s_root->d_inode)
90 return *len;
91
92 spin_lock(&dentry->d_lock);
93 inode = dentry->d_parent->d_inode;
94 ip = GFS2_I(inode);
95 igrab(inode);
96 spin_unlock(&dentry->d_lock);
97
98 fh[4] = ip->i_num.no_formal_ino >> 32;
99 fh[4] = cpu_to_be32(fh[4]);
100 fh[5] = ip->i_num.no_formal_ino & 0xFFFFFFFF;
101 fh[5] = cpu_to_be32(fh[5]);
102 fh[6] = ip->i_num.no_addr >> 32;
103 fh[6] = cpu_to_be32(fh[6]);
104 fh[7] = ip->i_num.no_addr & 0xFFFFFFFF;
105 fh[7] = cpu_to_be32(fh[7]);
106
107 fh[8] = cpu_to_be32(inode->i_mode);
108 fh[9] = 0; /* pad to double word */
109 *len = GFS2_LARGE_FH_SIZE;
110
111 iput(inode);
112
113 return *len;
114}
115
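Both handle routines above split each 64-bit field (no_formal_ino, no_addr) into two big-endian 32-bit words. A self-contained sketch of that round trip:

#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>		/* htonl()/ntohl() for the BE words */

static void pack64(uint32_t *fh, uint64_t v)
{
	fh[0] = htonl(v >> 32);
	fh[1] = htonl(v & 0xFFFFFFFF);
}

static uint64_t unpack64(const uint32_t *fh)
{
	return ((uint64_t)ntohl(fh[0]) << 32) | ntohl(fh[1]);
}

int main(void)
{
	uint32_t fh[2];

	pack64(fh, 0x123456789abcdef0ULL);
	/* prints 123456789abcdef0: the value round-trips */
	printf("%llx\n", (unsigned long long)unpack64(fh));
	return 0;
}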
116struct get_name_filldir {
117 struct gfs2_inum inum;
118 char *name;
119};
120
121static int get_name_filldir(void *opaque, const char *name, unsigned int length,
122 u64 offset, struct gfs2_inum *inum,
123 unsigned int type)
124{
125 struct get_name_filldir *gnfd = (struct get_name_filldir *)opaque;
126
127 if (!gfs2_inum_equal(inum, &gnfd->inum))
128 return 0;
129
130 memcpy(gnfd->name, name, length);
131 gnfd->name[length] = 0;
132
133 return 1;
134}
135
136static int gfs2_get_name(struct dentry *parent, char *name,
137 struct dentry *child)
138{
139 struct inode *dir = parent->d_inode;
140 struct inode *inode = child->d_inode;
141 struct gfs2_inode *dip, *ip;
142 struct get_name_filldir gnfd;
143 struct gfs2_holder gh;
144 u64 offset = 0;
145 int error;
146
147 if (!dir)
148 return -EINVAL;
149
150 if (!S_ISDIR(dir->i_mode) || !inode)
151 return -EINVAL;
152
153 dip = GFS2_I(dir);
154 ip = GFS2_I(inode);
155
156 *name = 0;
157 gnfd.inum = ip->i_num;
158 gnfd.name = name;
159
160 error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &gh);
161 if (error)
162 return error;
163
164 error = gfs2_dir_read(dir, &offset, &gnfd, get_name_filldir);
165
166 gfs2_glock_dq_uninit(&gh);
167
168 if (!error && !*name)
169 error = -ENOENT;
170
171 return error;
172}
173
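gfs2_get_name() is the reverse lookup NFS reconnection needs: walk the parent directory until an entry's inode number matches the child. Its rough userspace analogue with readdir(), plain d_ino standing in for GFS2's two-part gfs2_inum:

#include <stdio.h>
#include <stdlib.h>
#include <dirent.h>
#include <sys/types.h>

int main(int argc, char **argv)
{
	DIR *d;
	struct dirent *de;
	ino_t target;

	if (argc < 3)
		return 2;
	d = opendir(argv[1]);
	if (!d)
		return 1;
	target = (ino_t)strtoull(argv[2], NULL, 0);
	/* scan until the inode number matches, as get_name_filldir does */
	while ((de = readdir(d)) != NULL) {
		if (de->d_ino == target) {
			printf("%s\n", de->d_name);
			break;
		}
	}
	closedir(d);
	return 0;
}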
174static struct dentry *gfs2_get_parent(struct dentry *child)
175{
176 struct qstr dotdot;
177 struct inode *inode;
178 struct dentry *dentry;
179
180 gfs2_str2qstr(&dotdot, "..");
181 inode = gfs2_lookupi(child->d_inode, &dotdot, 1, NULL);
182
183 if (!inode)
184 return ERR_PTR(-ENOENT);
185 /*
186 * On error, @inode carries the error encoded as a pointer,
187 * which we must propagate as an (invalid) dentry pointer.
188 */
189 if (IS_ERR(inode))
190 return ERR_PTR(PTR_ERR(inode));
191
192 dentry = d_alloc_anon(inode);
193 if (!dentry) {
194 iput(inode);
195 return ERR_PTR(-ENOMEM);
196 }
197
198 return dentry;
199}
200
201static struct dentry *gfs2_get_dentry(struct super_block *sb, void *inum_obj)
202{
203 struct gfs2_sbd *sdp = sb->s_fs_info;
204 struct gfs2_fh_obj *fh_obj = (struct gfs2_fh_obj *)inum_obj;
205 struct gfs2_inum *inum = &fh_obj->this;
206 struct gfs2_holder i_gh, ri_gh, rgd_gh;
207 struct gfs2_rgrpd *rgd;
208 struct inode *inode;
209 struct dentry *dentry;
210 int error;
211
212 /* System files? */
213
214 inode = gfs2_ilookup(sb, inum);
215 if (inode) {
216 if (GFS2_I(inode)->i_num.no_formal_ino != inum->no_formal_ino) {
217 iput(inode);
218 return ERR_PTR(-ESTALE);
219 }
220 goto out_inode;
221 }
222
223 error = gfs2_glock_nq_num(sdp, inum->no_addr, &gfs2_inode_glops,
224 LM_ST_SHARED, LM_FLAG_ANY | GL_LOCAL_EXCL,
225 &i_gh);
226 if (error)
227 return ERR_PTR(error);
228
229 error = gfs2_rindex_hold(sdp, &ri_gh);
230 if (error)
231 goto fail;
232
233 error = -EINVAL;
234 rgd = gfs2_blk2rgrpd(sdp, inum->no_addr);
235 if (!rgd)
236 goto fail_rindex;
237
238 error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_SHARED, 0, &rgd_gh);
239 if (error)
240 goto fail_rindex;
241
242 error = -ESTALE;
243 if (gfs2_get_block_type(rgd, inum->no_addr) != GFS2_BLKST_DINODE)
244 goto fail_rgd;
245
246 gfs2_glock_dq_uninit(&rgd_gh);
247 gfs2_glock_dq_uninit(&ri_gh);
248
249 inode = gfs2_inode_lookup(sb, inum, fh_obj->imode);
250 if (!inode)
251 goto fail;
252 if (IS_ERR(inode)) {
253 error = PTR_ERR(inode);
254 goto fail;
255 }
256
257 error = gfs2_inode_refresh(GFS2_I(inode));
258 if (error) {
259 iput(inode);
260 goto fail;
261 }
262
263 error = -EIO;
264 if (GFS2_I(inode)->i_di.di_flags & GFS2_DIF_SYSTEM) {
265 iput(inode);
266 goto fail;
267 }
268
269 gfs2_glock_dq_uninit(&i_gh);
270
271out_inode:
272 dentry = d_alloc_anon(inode);
273 if (!dentry) {
274 iput(inode);
275 return ERR_PTR(-ENOMEM);
276 }
277
278 return dentry;
279
280fail_rgd:
281 gfs2_glock_dq_uninit(&rgd_gh);
282
283fail_rindex:
284 gfs2_glock_dq_uninit(&ri_gh);
285
286fail:
287 gfs2_glock_dq_uninit(&i_gh);
288 return ERR_PTR(error);
289}
290
291struct export_operations gfs2_export_ops = {
292 .decode_fh = gfs2_decode_fh,
293 .encode_fh = gfs2_encode_fh,
294 .get_name = gfs2_get_name,
295 .get_parent = gfs2_get_parent,
296 .get_dentry = gfs2_get_dentry,
297};
298
diff --git a/fs/gfs2/ops_export.h b/fs/gfs2/ops_export.h
new file mode 100644
index 000000000000..09aca5046fb1
--- /dev/null
+++ b/fs/gfs2/ops_export.h
@@ -0,0 +1,22 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __OPS_EXPORT_DOT_H__
11#define __OPS_EXPORT_DOT_H__
12
13#define GFS2_SMALL_FH_SIZE 4
14#define GFS2_LARGE_FH_SIZE 10
15
16extern struct export_operations gfs2_export_ops;
17struct gfs2_fh_obj {
18 struct gfs2_inum this;
19 __u32 imode;
20};
21
22#endif /* __OPS_EXPORT_DOT_H__ */
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
new file mode 100644
index 000000000000..3064f133bf3c
--- /dev/null
+++ b/fs/gfs2/ops_file.c
@@ -0,0 +1,661 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/pagemap.h>
16#include <linux/uio.h>
17#include <linux/blkdev.h>
18#include <linux/mm.h>
19#include <linux/smp_lock.h>
20#include <linux/fs.h>
21#include <linux/gfs2_ondisk.h>
22#include <linux/ext2_fs.h>
23#include <linux/crc32.h>
24#include <linux/lm_interface.h>
25#include <asm/uaccess.h>
26
27#include "gfs2.h"
28#include "incore.h"
29#include "bmap.h"
30#include "dir.h"
31#include "glock.h"
32#include "glops.h"
33#include "inode.h"
34#include "lm.h"
35#include "log.h"
36#include "meta_io.h"
37#include "ops_file.h"
38#include "ops_vm.h"
39#include "quota.h"
40#include "rgrp.h"
41#include "trans.h"
42#include "util.h"
43#include "eaops.h"
44
45/* Used by the regular readdir path, as opposed to NFS's get_name */
46struct filldir_reg {
47 struct gfs2_sbd *fdr_sbd;
48 int fdr_prefetch;
49
50 filldir_t fdr_filldir;
51 void *fdr_opaque;
52};
53
54/*
55 * Most fields left uninitialised to catch anybody who tries to
56 * use them. f_flags set to prevent file_accessed() from touching
57 * any other part of this. Its use is purely as a flag so that we
58 * know (in readpage()) whether or not to do locking.
59 */
60struct file gfs2_internal_file_sentinel = {
61 .f_flags = O_NOATIME|O_RDONLY,
62};
63
64static int gfs2_read_actor(read_descriptor_t *desc, struct page *page,
65 unsigned long offset, unsigned long size)
66{
67 char *kaddr;
68 unsigned long count = desc->count;
69
70 if (size > count)
71 size = count;
72
73 kaddr = kmap(page);
74 memcpy(desc->arg.buf, kaddr + offset, size);
75 kunmap(page);
76
77 desc->count = count - size;
78 desc->written += size;
79 desc->arg.buf += size;
80 return size;
81}
82
83int gfs2_internal_read(struct gfs2_inode *ip, struct file_ra_state *ra_state,
84 char *buf, loff_t *pos, unsigned size)
85{
86 struct inode *inode = &ip->i_inode;
87 read_descriptor_t desc;
88 desc.written = 0;
89 desc.arg.buf = buf;
90 desc.count = size;
91 desc.error = 0;
92 do_generic_mapping_read(inode->i_mapping, ra_state,
93 &gfs2_internal_file_sentinel, pos, &desc,
94 gfs2_read_actor);
95 return desc.written ? desc.written : desc.error;
96}
97
98/**
99 * gfs2_llseek - seek to a location in a file
100 * @file: the file
101 * @offset: the offset
102 * @origin: Where to seek from (SEEK_SET, SEEK_CUR, or SEEK_END)
103 *
104 * SEEK_END requires the glock for the file because it references the
105 * file's size.
106 *
107 * Returns: The new offset, or errno
108 */
109
110static loff_t gfs2_llseek(struct file *file, loff_t offset, int origin)
111{
112 struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
113 struct gfs2_holder i_gh;
114 loff_t error;
115
116 if (origin == SEEK_END) {
117 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY,
118 &i_gh);
119 if (!error) {
120 error = remote_llseek(file, offset, origin);
121 gfs2_glock_dq_uninit(&i_gh);
122 }
123 } else
124 error = remote_llseek(file, offset, origin);
125
126 return error;
127}
128
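Any SEEK_END seek from userspace takes the shared-glock path above, since the file size must be current across the cluster before it can be used:

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	int fd;

	if (argc < 2 || (fd = open(argv[1], O_RDONLY)) < 0) {
		perror("open");
		return 1;
	}
	/* SEEK_END: the kernel must have an up-to-date i_size */
	printf("size: %lld\n", (long long)lseek(fd, 0, SEEK_END));
	close(fd);
	return 0;
}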
129/**
130 * filldir_func - Report a directory entry to the caller of gfs2_dir_read()
131 * @opaque: opaque data used by the function
132 * @name: the name of the directory entry
133 * @length: the length of the name
134 * @offset: the entry's offset in the directory
135 * @inum: the inode number the entry points to
136 * @type: the type of inode the entry points to
137 *
138 * Returns: 0 on success, 1 if buffer full
139 */
140
141static int filldir_func(void *opaque, const char *name, unsigned int length,
142 u64 offset, struct gfs2_inum *inum,
143 unsigned int type)
144{
145 struct filldir_reg *fdr = (struct filldir_reg *)opaque;
146 struct gfs2_sbd *sdp = fdr->fdr_sbd;
147 int error;
148
149 error = fdr->fdr_filldir(fdr->fdr_opaque, name, length, offset,
150 inum->no_addr, type);
151 if (error)
152 return 1;
153
154 if (fdr->fdr_prefetch && !(length == 1 && *name == '.')) {
155 gfs2_glock_prefetch_num(sdp, inum->no_addr, &gfs2_inode_glops,
156 LM_ST_SHARED, LM_FLAG_TRY | LM_FLAG_ANY);
157 gfs2_glock_prefetch_num(sdp, inum->no_addr, &gfs2_iopen_glops,
158 LM_ST_SHARED, LM_FLAG_TRY);
159 }
160
161 return 0;
162}
163
164/**
165 * gfs2_readdir - Read directory entries from a directory
166 * @file: The directory to read from
167 * @dirent: Buffer for dirents
168 * @filldir: Function used to do the copying
169 *
170 * Returns: errno
171 */
172
173static int gfs2_readdir(struct file *file, void *dirent, filldir_t filldir)
174{
175 struct inode *dir = file->f_mapping->host;
176 struct gfs2_inode *dip = GFS2_I(dir);
177 struct filldir_reg fdr;
178 struct gfs2_holder d_gh;
179 u64 offset = file->f_pos;
180 int error;
181
182 fdr.fdr_sbd = GFS2_SB(dir);
183 fdr.fdr_prefetch = 1;
184 fdr.fdr_filldir = filldir;
185 fdr.fdr_opaque = dirent;
186
187 gfs2_holder_init(dip->i_gl, LM_ST_SHARED, GL_ATIME, &d_gh);
188 error = gfs2_glock_nq_atime(&d_gh);
189 if (error) {
190 gfs2_holder_uninit(&d_gh);
191 return error;
192 }
193
194 error = gfs2_dir_read(dir, &offset, &fdr, filldir_func);
195
196 gfs2_glock_dq_uninit(&d_gh);
197
198 file->f_pos = offset;
199
200 return error;
201}
202
203/**
204 * fsflags_cvt
205 * @table: A table of 32 u32 flags
206 * @val: a 32 bit value to convert
207 *
208 * This function can be used to convert between fsflags values and
209 * GFS2's own flags values.
210 *
211 * Returns: the converted flags
212 */
213static u32 fsflags_cvt(const u32 *table, u32 val)
214{
215 u32 res = 0;
216 while(val) {
217 if (val & 1)
218 res |= *table;
219 table++;
220 val >>= 1;
221 }
222 return res;
223}
224
225static const u32 fsflags_to_gfs2[32] = {
226 [3] = GFS2_DIF_SYNC,
227 [4] = GFS2_DIF_IMMUTABLE,
228 [5] = GFS2_DIF_APPENDONLY,
229 [7] = GFS2_DIF_NOATIME,
230 [12] = GFS2_DIF_EXHASH,
231 [14] = GFS2_DIF_JDATA,
232 [20] = GFS2_DIF_DIRECTIO,
233};
234
235static const u32 gfs2_to_fsflags[32] = {
236 [gfs2fl_Sync] = FS_SYNC_FL,
237 [gfs2fl_Immutable] = FS_IMMUTABLE_FL,
238 [gfs2fl_AppendOnly] = FS_APPEND_FL,
239 [gfs2fl_NoAtime] = FS_NOATIME_FL,
240 [gfs2fl_ExHash] = FS_INDEX_FL,
241 [gfs2fl_Jdata] = FS_JOURNAL_DATA_FL,
242 [gfs2fl_Directio] = FS_DIRECTIO_FL,
243 [gfs2fl_InheritDirectio] = FS_DIRECTIO_FL,
244 [gfs2fl_InheritJdata] = FS_JOURNAL_DATA_FL,
245};
246
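fsflags_cvt() walks @val one bit at a time and ORs in the table entry for each set bit, which is why the two tables above are indexed by source bit number. A standalone sketch with made-up table values, not the real GFS2_DIF_* constants:

#include <stdio.h>
#include <stdint.h>

static uint32_t fsflags_cvt(const uint32_t *table, uint32_t val)
{
	uint32_t res = 0;

	while (val) {
		if (val & 1)
			res |= *table;
		table++;
		val >>= 1;
	}
	return res;
}

int main(void)
{
	/* made-up mapping: input bit 3 -> 0x8, input bit 4 -> 0x1 */
	static const uint32_t demo[32] = { [3] = 0x8, [4] = 0x1 };

	printf("%#x\n", (unsigned)fsflags_cvt(demo, 1u << 3));	/* 0x8 */
	printf("%#x\n", (unsigned)fsflags_cvt(demo, (1u << 3) | (1u << 4)));	/* 0x9 */
	return 0;
}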
247static int gfs2_get_flags(struct file *filp, u32 __user *ptr)
248{
249 struct inode *inode = filp->f_dentry->d_inode;
250 struct gfs2_inode *ip = GFS2_I(inode);
251 struct gfs2_holder gh;
252 int error;
253 u32 fsflags;
254
255 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &gh);
256 error = gfs2_glock_nq_m_atime(1, &gh);
257 if (error)
258 return error;
259
260 fsflags = fsflags_cvt(gfs2_to_fsflags, ip->i_di.di_flags);
261 if (put_user(fsflags, ptr))
262 error = -EFAULT;
263
264 gfs2_glock_dq_m(1, &gh);
265 gfs2_holder_uninit(&gh);
266 return error;
267}
268
269/* Flags that can be set by user space */
270#define GFS2_FLAGS_USER_SET (GFS2_DIF_JDATA| \
271 GFS2_DIF_DIRECTIO| \
272 GFS2_DIF_IMMUTABLE| \
273 GFS2_DIF_APPENDONLY| \
274 GFS2_DIF_NOATIME| \
275 GFS2_DIF_SYNC| \
276 GFS2_DIF_SYSTEM| \
277 GFS2_DIF_INHERIT_DIRECTIO| \
278 GFS2_DIF_INHERIT_JDATA)
279
280/**
281 * do_gfs2_set_flags - set flags on an inode
282 * @filp: file pointer
283 * @reqflags: The flags to set
284 * @mask: Indicates which flags are valid
285 *
286 */
287static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
288{
289 struct inode *inode = filp->f_dentry->d_inode;
290 struct gfs2_inode *ip = GFS2_I(inode);
291 struct gfs2_sbd *sdp = GFS2_SB(inode);
292 struct buffer_head *bh;
293 struct gfs2_holder gh;
294 int error;
295 u32 new_flags, flags;
296
297 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
298 if (error)
299 return error;
300
301 flags = ip->i_di.di_flags;
302 new_flags = (flags & ~mask) | (reqflags & mask);
303 if ((new_flags ^ flags) == 0)
304 goto out;
305
306 if (S_ISDIR(inode->i_mode)) {
307 if ((new_flags ^ flags) & GFS2_DIF_JDATA)
308 new_flags ^= (GFS2_DIF_JDATA|GFS2_DIF_INHERIT_JDATA);
309 if ((new_flags ^ flags) & GFS2_DIF_DIRECTIO)
310 new_flags ^= (GFS2_DIF_DIRECTIO|GFS2_DIF_INHERIT_DIRECTIO);
311 }
312
313 error = -EINVAL;
314 if ((new_flags ^ flags) & ~GFS2_FLAGS_USER_SET)
315 goto out;
316
317 error = -EPERM;
318 if (IS_IMMUTABLE(inode) && (new_flags & GFS2_DIF_IMMUTABLE))
319 goto out;
320 if (IS_APPEND(inode) && (new_flags & GFS2_DIF_APPENDONLY))
321 goto out;
322 if (((new_flags ^ flags) & GFS2_DIF_IMMUTABLE) &&
323 !capable(CAP_LINUX_IMMUTABLE))
324 goto out;
325 if (!IS_IMMUTABLE(inode)) {
326 error = permission(inode, MAY_WRITE, NULL);
327 if (error)
328 goto out;
329 }
330
331 error = gfs2_trans_begin(sdp, RES_DINODE, 0);
332 if (error)
333 goto out;
334 error = gfs2_meta_inode_buffer(ip, &bh);
335 if (error)
336 goto out_trans_end;
337 gfs2_trans_add_bh(ip->i_gl, bh, 1);
338 ip->i_di.di_flags = new_flags;
339 gfs2_dinode_out(&ip->i_di, bh->b_data);
340 brelse(bh);
341out_trans_end:
342 gfs2_trans_end(sdp);
343out:
344 gfs2_glock_dq_uninit(&gh);
345 return error;
346}
347
348static int gfs2_set_flags(struct file *filp, u32 __user *ptr)
349{
350 u32 fsflags, gfsflags;
351 if (get_user(fsflags, ptr))
352 return -EFAULT;
353 gfsflags = fsflags_cvt(fsflags_to_gfs2, fsflags);
354 return do_gfs2_set_flags(filp, gfsflags, ~0);
355}
356
357static long gfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
358{
359 switch(cmd) {
360 case FS_IOC_GETFLAGS:
361 return gfs2_get_flags(filp, (u32 __user *)arg);
362 case FS_IOC_SETFLAGS:
363 return gfs2_set_flags(filp, (u32 __user *)arg);
364 }
365 return -ENOTTY;
366}
367
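These are the ioctls behind lsattr(1) and chattr(1). A minimal userspace reader; the unsigned int matches the u32 that gfs2_get_flags() copies out:

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>		/* FS_IOC_GETFLAGS, FS_*_FL */

int main(int argc, char **argv)
{
	unsigned int flags;	/* the handler above copies out a u32 */
	int fd;

	if (argc < 2 || (fd = open(argv[1], O_RDONLY)) < 0)
		return 2;
	if (ioctl(fd, FS_IOC_GETFLAGS, &flags) < 0) {
		perror("FS_IOC_GETFLAGS");
		return 1;
	}
	printf("flags: %#x%s\n", flags,
	       (flags & FS_APPEND_FL) ? " (append-only)" : "");
	close(fd);
	return 0;
}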
368
369/**
370 * gfs2_mmap - set up a memory mapping for a file
371 * @file: The file to map
372 * @vma: The VMA which describes the mapping
373 *
374 * Returns: 0 or error code
375 */
376
377static int gfs2_mmap(struct file *file, struct vm_area_struct *vma)
378{
379 struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
380 struct gfs2_holder i_gh;
381 int error;
382
383 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &i_gh);
384 error = gfs2_glock_nq_atime(&i_gh);
385 if (error) {
386 gfs2_holder_uninit(&i_gh);
387 return error;
388 }
389
390 /* This is VM_MAYWRITE instead of VM_WRITE because a call
391 to mprotect() can turn on VM_WRITE later. */
392
393 if ((vma->vm_flags & (VM_MAYSHARE | VM_MAYWRITE)) ==
394 (VM_MAYSHARE | VM_MAYWRITE))
395 vma->vm_ops = &gfs2_vm_ops_sharewrite;
396 else
397 vma->vm_ops = &gfs2_vm_ops_private;
398
399 gfs2_glock_dq_uninit(&i_gh);
400
401 return error;
402}
403
404/**
405 * gfs2_open - open a file
406 * @inode: the inode to open
407 * @file: the struct file for this opening
408 *
409 * Returns: errno
410 */
411
412static int gfs2_open(struct inode *inode, struct file *file)
413{
414 struct gfs2_inode *ip = GFS2_I(inode);
415 struct gfs2_holder i_gh;
416 struct gfs2_file *fp;
417 int error;
418
419 fp = kzalloc(sizeof(struct gfs2_file), GFP_KERNEL);
420 if (!fp)
421 return -ENOMEM;
422
423 mutex_init(&fp->f_fl_mutex);
424
425 gfs2_assert_warn(GFS2_SB(inode), !file->private_data);
426 file->private_data = fp;
427
428 if (S_ISREG(ip->i_di.di_mode)) {
429 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY,
430 &i_gh);
431 if (error)
432 goto fail;
433
434 if (!(file->f_flags & O_LARGEFILE) &&
435 ip->i_di.di_size > MAX_NON_LFS) {
436 error = -EFBIG;
437 goto fail_gunlock;
438 }
439
440 /* Listen to the Direct I/O flag */
441
442 if (ip->i_di.di_flags & GFS2_DIF_DIRECTIO)
443 file->f_flags |= O_DIRECT;
444
445 gfs2_glock_dq_uninit(&i_gh);
446 }
447
448 return 0;
449
450fail_gunlock:
451 gfs2_glock_dq_uninit(&i_gh);
452fail:
453 file->private_data = NULL;
454 kfree(fp);
455 return error;
456}
457
458/**
459 * gfs2_close - called to close a struct file
460 * @inode: the inode the struct file belongs to
461 * @file: the struct file being closed
462 *
463 * Returns: errno
464 */
465
466static int gfs2_close(struct inode *inode, struct file *file)
467{
468 struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
469 struct gfs2_file *fp;
470
471 fp = file->private_data;
472 file->private_data = NULL;
473
474 if (gfs2_assert_warn(sdp, fp))
475 return -EIO;
476
477 kfree(fp);
478
479 return 0;
480}
481
482/**
483 * gfs2_fsync - sync the dirty data for a file (across the cluster)
484 * @file: the file that points to the dentry (we ignore this)
485 * @dentry: the dentry that points to the inode to sync
 * @datasync: non-zero for fdatasync() (ignored; the log is flushed regardless)
486 *
487 * Returns: errno
488 */
489
490static int gfs2_fsync(struct file *file, struct dentry *dentry, int datasync)
491{
492 struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
493
494 gfs2_log_flush(ip->i_gl->gl_sbd, ip->i_gl);
495
496 return 0;
497}
498
499/**
500 * gfs2_lock - acquire/release a posix lock on a file
501 * @file: the file pointer
502 * @cmd: either modify or retrieve lock state, possibly wait
503 * @fl: type and range of lock
504 *
505 * Returns: errno
506 */
507
508static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl)
509{
510 struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
511 struct gfs2_sbd *sdp = GFS2_SB(file->f_mapping->host);
512 struct lm_lockname name =
513 { .ln_number = ip->i_num.no_addr,
514 .ln_type = LM_TYPE_PLOCK };
515
516 if (!(fl->fl_flags & FL_POSIX))
517 return -ENOLCK;
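	/* setgid without group-execute means mandatory locking, unsupported */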
518 if ((ip->i_di.di_mode & (S_ISGID | S_IXGRP)) == S_ISGID)
519 return -ENOLCK;
520
521 if (sdp->sd_args.ar_localflocks) {
522 if (IS_GETLK(cmd)) {
523 struct file_lock tmp;
524 int ret;
525 ret = posix_test_lock(file, fl, &tmp);
526 fl->fl_type = F_UNLCK;
527 if (ret)
528 memcpy(fl, &tmp, sizeof(struct file_lock));
529 return 0;
530 } else {
531 return posix_lock_file_wait(file, fl);
532 }
533 }
534
535 if (IS_GETLK(cmd))
536 return gfs2_lm_plock_get(sdp, &name, file, fl);
537 else if (fl->fl_type == F_UNLCK)
538 return gfs2_lm_punlock(sdp, &name, file, fl);
539 else
540 return gfs2_lm_plock(sdp, &name, file, cmd, fl);
541}
542
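Userspace reaches gfs2_lock() through fcntl(2). A whole-file POSIX write lock which, without the localflocks mount option, becomes a cluster-wide plock via the lock module:

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	struct flock fl = {
		.l_type = F_WRLCK,
		.l_whence = SEEK_SET,
		.l_start = 0,
		.l_len = 0,	/* 0 means "to end of file" */
	};
	int fd;

	if (argc < 2 || (fd = open(argv[1], O_RDWR)) < 0)
		return 2;
	if (fcntl(fd, F_SETLKW, &fl) < 0) {	/* wait for the lock */
		perror("F_SETLKW");
		return 1;
	}
	puts("got a cluster-wide POSIX write lock");
	fl.l_type = F_UNLCK;
	fcntl(fd, F_SETLK, &fl);
	close(fd);
	return 0;
}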
543static int do_flock(struct file *file, int cmd, struct file_lock *fl)
544{
545 struct gfs2_file *fp = file->private_data;
546 struct gfs2_holder *fl_gh = &fp->f_fl_gh;
547 struct gfs2_inode *ip = GFS2_I(file->f_dentry->d_inode);
548 struct gfs2_glock *gl;
549 unsigned int state;
550 int flags;
551 int error = 0;
552
553 state = (fl->fl_type == F_WRLCK) ? LM_ST_EXCLUSIVE : LM_ST_SHARED;
554 flags = (IS_SETLKW(cmd) ? 0 : LM_FLAG_TRY) | GL_EXACT | GL_NOCACHE;
555
556 mutex_lock(&fp->f_fl_mutex);
557
558 gl = fl_gh->gh_gl;
559 if (gl) {
560 if (fl_gh->gh_state == state)
561 goto out;
562 gfs2_glock_hold(gl);
563 flock_lock_file_wait(file,
564 &(struct file_lock){.fl_type = F_UNLCK});
565 gfs2_glock_dq_uninit(fl_gh);
566 } else {
567 error = gfs2_glock_get(GFS2_SB(&ip->i_inode),
568 ip->i_num.no_addr, &gfs2_flock_glops,
569 CREATE, &gl);
570 if (error)
571 goto out;
572 }
573
574 gfs2_holder_init(gl, state, flags, fl_gh);
575 gfs2_glock_put(gl);
576
577 error = gfs2_glock_nq(fl_gh);
578 if (error) {
579 gfs2_holder_uninit(fl_gh);
580 if (error == GLR_TRYFAILED)
581 error = -EAGAIN;
582 } else {
583 error = flock_lock_file_wait(file, fl);
584 gfs2_assert_warn(GFS2_SB(&ip->i_inode), !error);
585 }
586
587out:
588 mutex_unlock(&fp->f_fl_mutex);
589 return error;
590}
591
592static void do_unflock(struct file *file, struct file_lock *fl)
593{
594 struct gfs2_file *fp = file->private_data;
595 struct gfs2_holder *fl_gh = &fp->f_fl_gh;
596
597 mutex_lock(&fp->f_fl_mutex);
598 flock_lock_file_wait(file, fl);
599 if (fl_gh->gh_gl)
600 gfs2_glock_dq_uninit(fl_gh);
601 mutex_unlock(&fp->f_fl_mutex);
602}
603
604/**
605 * gfs2_flock - acquire/release a flock lock on a file
606 * @file: the file pointer
607 * @cmd: either modify or retrieve lock state, possibly wait
608 * @fl: type and range of lock
609 *
610 * Returns: errno
611 */
612
613static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl)
614{
615 struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
616 struct gfs2_sbd *sdp = GFS2_SB(file->f_mapping->host);
617
618 if (!(fl->fl_flags & FL_FLOCK))
619 return -ENOLCK;
620 if ((ip->i_di.di_mode & (S_ISGID | S_IXGRP)) == S_ISGID)
621 return -ENOLCK;
622
623 if (sdp->sd_args.ar_localflocks)
624 return flock_lock_file_wait(file, fl);
625
626 if (fl->fl_type == F_UNLCK) {
627 do_unflock(file, fl);
628 return 0;
629 } else {
630 return do_flock(file, cmd, fl);
631 }
632}
633
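And the flock(2) side, which do_flock() above maps onto a dedicated flock glock:

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/file.h>

int main(int argc, char **argv)
{
	int fd;

	if (argc < 2 || (fd = open(argv[1], O_RDONLY)) < 0)
		return 2;
	if (flock(fd, LOCK_EX | LOCK_NB) < 0) {	/* LOCK_NB: the LM_FLAG_TRY path */
		perror("flock");
		return 1;
	}
	puts("got an exclusive flock");
	flock(fd, LOCK_UN);
	close(fd);
	return 0;
}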
634const struct file_operations gfs2_file_fops = {
635 .llseek = gfs2_llseek,
636 .read = do_sync_read,
637 .aio_read = generic_file_aio_read,
638 .write = do_sync_write,
639 .aio_write = generic_file_aio_write,
640 .unlocked_ioctl = gfs2_ioctl,
641 .mmap = gfs2_mmap,
642 .open = gfs2_open,
643 .release = gfs2_close,
644 .fsync = gfs2_fsync,
645 .lock = gfs2_lock,
646 .sendfile = generic_file_sendfile,
647 .flock = gfs2_flock,
648 .splice_read = generic_file_splice_read,
649 .splice_write = generic_file_splice_write,
650};
651
652const struct file_operations gfs2_dir_fops = {
653 .readdir = gfs2_readdir,
654 .unlocked_ioctl = gfs2_ioctl,
655 .open = gfs2_open,
656 .release = gfs2_close,
657 .fsync = gfs2_fsync,
658 .lock = gfs2_lock,
659 .flock = gfs2_flock,
660};
661
diff --git a/fs/gfs2/ops_file.h b/fs/gfs2/ops_file.h
new file mode 100644
index 000000000000..ce319f89ec8e
--- /dev/null
+++ b/fs/gfs2/ops_file.h
@@ -0,0 +1,24 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __OPS_FILE_DOT_H__
11#define __OPS_FILE_DOT_H__
12
13#include <linux/fs.h>
14struct gfs2_inode;
15
16extern struct file gfs2_internal_file_sentinel;
17extern int gfs2_internal_read(struct gfs2_inode *ip,
18 struct file_ra_state *ra_state,
19 char *buf, loff_t *pos, unsigned size);
20
21extern const struct file_operations gfs2_file_fops;
22extern const struct file_operations gfs2_dir_fops;
23
24#endif /* __OPS_FILE_DOT_H__ */
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
new file mode 100644
index 000000000000..882873a6bd69
--- /dev/null
+++ b/fs/gfs2/ops_fstype.c
@@ -0,0 +1,925 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/blkdev.h>
16#include <linux/kthread.h>
17#include <linux/namei.h>
18#include <linux/mount.h>
19#include <linux/gfs2_ondisk.h>
20#include <linux/lm_interface.h>
21
22#include "gfs2.h"
23#include "incore.h"
24#include "daemon.h"
25#include "glock.h"
26#include "glops.h"
27#include "inode.h"
28#include "lm.h"
29#include "mount.h"
30#include "ops_export.h"
31#include "ops_fstype.h"
32#include "ops_super.h"
33#include "recovery.h"
34#include "rgrp.h"
35#include "super.h"
36#include "sys.h"
37#include "util.h"
38
39#define DO 0
40#define UNDO 1
41
42extern struct dentry_operations gfs2_dops;
43
44static struct gfs2_sbd *init_sbd(struct super_block *sb)
45{
46 struct gfs2_sbd *sdp;
47
48 sdp = kzalloc(sizeof(struct gfs2_sbd), GFP_KERNEL);
49 if (!sdp)
50 return NULL;
51
52 sb->s_fs_info = sdp;
53 sdp->sd_vfs = sb;
54
55 gfs2_tune_init(&sdp->sd_tune);
56
57 INIT_LIST_HEAD(&sdp->sd_reclaim_list);
58 spin_lock_init(&sdp->sd_reclaim_lock);
59 init_waitqueue_head(&sdp->sd_reclaim_wq);
60
61 mutex_init(&sdp->sd_inum_mutex);
62 spin_lock_init(&sdp->sd_statfs_spin);
63 mutex_init(&sdp->sd_statfs_mutex);
64
65 spin_lock_init(&sdp->sd_rindex_spin);
66 mutex_init(&sdp->sd_rindex_mutex);
67 INIT_LIST_HEAD(&sdp->sd_rindex_list);
68 INIT_LIST_HEAD(&sdp->sd_rindex_mru_list);
69 INIT_LIST_HEAD(&sdp->sd_rindex_recent_list);
70
71 INIT_LIST_HEAD(&sdp->sd_jindex_list);
72 spin_lock_init(&sdp->sd_jindex_spin);
73 mutex_init(&sdp->sd_jindex_mutex);
74
75 INIT_LIST_HEAD(&sdp->sd_quota_list);
76 spin_lock_init(&sdp->sd_quota_spin);
77 mutex_init(&sdp->sd_quota_mutex);
78
79 spin_lock_init(&sdp->sd_log_lock);
80
81 INIT_LIST_HEAD(&sdp->sd_log_le_gl);
82 INIT_LIST_HEAD(&sdp->sd_log_le_buf);
83 INIT_LIST_HEAD(&sdp->sd_log_le_revoke);
84 INIT_LIST_HEAD(&sdp->sd_log_le_rg);
85 INIT_LIST_HEAD(&sdp->sd_log_le_databuf);
86
87 mutex_init(&sdp->sd_log_reserve_mutex);
88 INIT_LIST_HEAD(&sdp->sd_ail1_list);
89 INIT_LIST_HEAD(&sdp->sd_ail2_list);
90
91 init_rwsem(&sdp->sd_log_flush_lock);
92 INIT_LIST_HEAD(&sdp->sd_log_flush_list);
93
94 INIT_LIST_HEAD(&sdp->sd_revoke_list);
95
96 mutex_init(&sdp->sd_freeze_lock);
97
98 return sdp;
99}
100
101static void init_vfs(struct super_block *sb, unsigned noatime)
102{
103 struct gfs2_sbd *sdp = sb->s_fs_info;
104
105 sb->s_magic = GFS2_MAGIC;
106 sb->s_op = &gfs2_super_ops;
107 sb->s_export_op = &gfs2_export_ops;
108 sb->s_maxbytes = MAX_LFS_FILESIZE;
109
110 if (sb->s_flags & (MS_NOATIME | MS_NODIRATIME))
111 set_bit(noatime, &sdp->sd_flags);
112
113 /* Don't let the VFS update atimes. GFS2 handles this itself. */
114 sb->s_flags |= MS_NOATIME | MS_NODIRATIME;
115}
116
117static int init_names(struct gfs2_sbd *sdp, int silent)
118{
119 struct page *page;
120 char *proto, *table;
121 int error = 0;
122
123 proto = sdp->sd_args.ar_lockproto;
124 table = sdp->sd_args.ar_locktable;
125
126 /* Try to autodetect */
127
128 if (!proto[0] || !table[0]) {
129 struct gfs2_sb *sb;
130 page = gfs2_read_super(sdp->sd_vfs, GFS2_SB_ADDR >> sdp->sd_fsb2bb_shift);
131 if (!page)
132 return -ENOBUFS;
133 sb = kmap(page);
134 gfs2_sb_in(&sdp->sd_sb, sb);
135 kunmap(page);
136 __free_page(page);
137
138 error = gfs2_check_sb(sdp, &sdp->sd_sb, silent);
139 if (error)
140 goto out;
141
142 if (!proto[0])
143 proto = sdp->sd_sb.sb_lockproto;
144 if (!table[0])
145 table = sdp->sd_sb.sb_locktable;
146 }
147
148 if (!table[0])
149 table = sdp->sd_vfs->s_id;
150
151 snprintf(sdp->sd_proto_name, GFS2_FSNAME_LEN, "%s", proto);
152 snprintf(sdp->sd_table_name, GFS2_FSNAME_LEN, "%s", table);
153
154out:
155 return error;
156}
157
158static int init_locking(struct gfs2_sbd *sdp, struct gfs2_holder *mount_gh,
159 int undo)
160{
161 struct task_struct *p;
162 int error = 0;
163
164 if (undo)
165 goto fail_trans;
166
167 p = kthread_run(gfs2_scand, sdp, "gfs2_scand");
168 error = IS_ERR(p);
169 if (error) {
170 fs_err(sdp, "can't start scand thread: %d\n", error);
171 return error;
172 }
173 sdp->sd_scand_process = p;
174
175 for (sdp->sd_glockd_num = 0;
176 sdp->sd_glockd_num < sdp->sd_args.ar_num_glockd;
177 sdp->sd_glockd_num++) {
178 p = kthread_run(gfs2_glockd, sdp, "gfs2_glockd");
179 error = IS_ERR(p);
180 if (error) {
181 fs_err(sdp, "can't start glockd thread: %d\n", error);
182 goto fail;
183 }
184 sdp->sd_glockd_process[sdp->sd_glockd_num] = p;
185 }
186
187 error = gfs2_glock_nq_num(sdp,
188 GFS2_MOUNT_LOCK, &gfs2_nondisk_glops,
189 LM_ST_EXCLUSIVE, LM_FLAG_NOEXP | GL_NOCACHE,
190 mount_gh);
191 if (error) {
192 fs_err(sdp, "can't acquire mount glock: %d\n", error);
193 goto fail;
194 }
195
196 error = gfs2_glock_nq_num(sdp,
197 GFS2_LIVE_LOCK, &gfs2_nondisk_glops,
198 LM_ST_SHARED,
199 LM_FLAG_NOEXP | GL_EXACT,
200 &sdp->sd_live_gh);
201 if (error) {
202 fs_err(sdp, "can't acquire live glock: %d\n", error);
203 goto fail_mount;
204 }
205
206 error = gfs2_glock_get(sdp, GFS2_RENAME_LOCK, &gfs2_nondisk_glops,
207 CREATE, &sdp->sd_rename_gl);
208 if (error) {
209 fs_err(sdp, "can't create rename glock: %d\n", error);
210 goto fail_live;
211 }
212
213 error = gfs2_glock_get(sdp, GFS2_TRANS_LOCK, &gfs2_trans_glops,
214 CREATE, &sdp->sd_trans_gl);
215 if (error) {
216 fs_err(sdp, "can't create transaction glock: %d\n", error);
217 goto fail_rename;
218 }
219 set_bit(GLF_STICKY, &sdp->sd_trans_gl->gl_flags);
220
221 return 0;
222
223fail_trans:
224 gfs2_glock_put(sdp->sd_trans_gl);
225fail_rename:
226 gfs2_glock_put(sdp->sd_rename_gl);
227fail_live:
228 gfs2_glock_dq_uninit(&sdp->sd_live_gh);
229fail_mount:
230 gfs2_glock_dq_uninit(mount_gh);
231fail:
232 while (sdp->sd_glockd_num--)
233 kthread_stop(sdp->sd_glockd_process[sdp->sd_glockd_num]);
234
235 kthread_stop(sdp->sd_scand_process);
236 return error;
237}
238
239static struct inode *gfs2_lookup_root(struct super_block *sb,
240 struct gfs2_inum *inum)
241{
242 return gfs2_inode_lookup(sb, inum, DT_DIR);
243}
244
245static int init_sb(struct gfs2_sbd *sdp, int silent, int undo)
246{
247 struct super_block *sb = sdp->sd_vfs;
248 struct gfs2_holder sb_gh;
249 struct gfs2_inum *inum;
250 struct inode *inode;
251 int error = 0;
252
253 if (undo) {
254 if (sb->s_root) {
255 dput(sb->s_root);
256 sb->s_root = NULL;
257 }
258 return 0;
259 }
260
261 error = gfs2_glock_nq_num(sdp, GFS2_SB_LOCK, &gfs2_meta_glops,
262 LM_ST_SHARED, 0, &sb_gh);
263 if (error) {
264 fs_err(sdp, "can't acquire superblock glock: %d\n", error);
265 return error;
266 }
267
268 error = gfs2_read_sb(sdp, sb_gh.gh_gl, silent);
269 if (error) {
270 fs_err(sdp, "can't read superblock: %d\n", error);
271 goto out;
272 }
273
274 /* Set up the buffer cache and SB for real */
275 if (sdp->sd_sb.sb_bsize < bdev_hardsect_size(sb->s_bdev)) {
276 error = -EINVAL;
277 fs_err(sdp, "FS block size (%u) is too small for device "
278 "block size (%u)\n",
279 sdp->sd_sb.sb_bsize, bdev_hardsect_size(sb->s_bdev));
280 goto out;
281 }
282 if (sdp->sd_sb.sb_bsize > PAGE_SIZE) {
283 error = -EINVAL;
284 fs_err(sdp, "FS block size (%u) is too big for machine "
285 "page size (%u)\n",
286 sdp->sd_sb.sb_bsize, (unsigned int)PAGE_SIZE);
287 goto out;
288 }
289 sb_set_blocksize(sb, sdp->sd_sb.sb_bsize);
290
291 /* Get the root inode */
292 inum = &sdp->sd_sb.sb_root_dir;
293 if (sb->s_type == &gfs2meta_fs_type)
294 inum = &sdp->sd_sb.sb_master_dir;
295 inode = gfs2_lookup_root(sb, inum);
296 if (IS_ERR(inode)) {
297 error = PTR_ERR(inode);
298 fs_err(sdp, "can't read in root inode: %d\n", error);
299 goto out;
300 }
301
302 sb->s_root = d_alloc_root(inode);
303 if (!sb->s_root) {
304 fs_err(sdp, "can't get root dentry\n");
305 error = -ENOMEM;
306 iput(inode);
307 } else
308 sb->s_root->d_op = &gfs2_dops;
309out:
310 gfs2_glock_dq_uninit(&sb_gh);
311 return error;
312}
313
314static int init_journal(struct gfs2_sbd *sdp, int undo)
315{
316 struct gfs2_holder ji_gh;
317 struct task_struct *p;
318 struct gfs2_inode *ip;
319 int jindex = 1;
320 int error = 0;
321
322 if (undo) {
323 jindex = 0;
324 goto fail_recoverd;
325 }
326
327 sdp->sd_jindex = gfs2_lookup_simple(sdp->sd_master_dir, "jindex");
328 if (IS_ERR(sdp->sd_jindex)) {
329 fs_err(sdp, "can't lookup journal index: %d\n", (int)PTR_ERR(sdp->sd_jindex));
330 return PTR_ERR(sdp->sd_jindex);
331 }
332 ip = GFS2_I(sdp->sd_jindex);
333 set_bit(GLF_STICKY, &ip->i_gl->gl_flags);
334
335 /* Load in the journal index special file */
336
337 error = gfs2_jindex_hold(sdp, &ji_gh);
338 if (error) {
339 fs_err(sdp, "can't read journal index: %d\n", error);
340 goto fail;
341 }
342
343 error = -EINVAL;
344 if (!gfs2_jindex_size(sdp)) {
345 fs_err(sdp, "no journals!\n");
346 goto fail_jindex;
347 }
348
349 if (sdp->sd_args.ar_spectator) {
350 sdp->sd_jdesc = gfs2_jdesc_find(sdp, 0);
351 sdp->sd_log_blks_free = sdp->sd_jdesc->jd_blocks;
352 } else {
353 if (sdp->sd_lockstruct.ls_jid >= gfs2_jindex_size(sdp)) {
354 fs_err(sdp, "can't mount journal #%u\n",
355 sdp->sd_lockstruct.ls_jid);
356 fs_err(sdp, "there are only %u journals (0 - %u)\n",
357 gfs2_jindex_size(sdp),
358 gfs2_jindex_size(sdp) - 1);
359 goto fail_jindex;
360 }
361 sdp->sd_jdesc = gfs2_jdesc_find(sdp, sdp->sd_lockstruct.ls_jid);
362
363 error = gfs2_glock_nq_num(sdp, sdp->sd_lockstruct.ls_jid,
364 &gfs2_journal_glops,
365 LM_ST_EXCLUSIVE, LM_FLAG_NOEXP,
366 &sdp->sd_journal_gh);
367 if (error) {
368 fs_err(sdp, "can't acquire journal glock: %d\n", error);
369 goto fail_jindex;
370 }
371
372 ip = GFS2_I(sdp->sd_jdesc->jd_inode);
373 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED,
374 LM_FLAG_NOEXP | GL_EXACT,
375 &sdp->sd_jinode_gh);
376 if (error) {
377 fs_err(sdp, "can't acquire journal inode glock: %d\n",
378 error);
379 goto fail_journal_gh;
380 }
381
382 error = gfs2_jdesc_check(sdp->sd_jdesc);
383 if (error) {
384 fs_err(sdp, "my journal (%u) is bad: %d\n",
385 sdp->sd_jdesc->jd_jid, error);
386 goto fail_jinode_gh;
387 }
388 sdp->sd_log_blks_free = sdp->sd_jdesc->jd_blocks;
389 }
390
391 if (sdp->sd_lockstruct.ls_first) {
392 unsigned int x;
393 for (x = 0; x < sdp->sd_journals; x++) {
394 error = gfs2_recover_journal(gfs2_jdesc_find(sdp, x));
395 if (error) {
396 fs_err(sdp, "error recovering journal %u: %d\n",
397 x, error);
398 goto fail_jinode_gh;
399 }
400 }
401
402 gfs2_lm_others_may_mount(sdp);
403 } else if (!sdp->sd_args.ar_spectator) {
404 error = gfs2_recover_journal(sdp->sd_jdesc);
405 if (error) {
406 fs_err(sdp, "error recovering my journal: %d\n", error);
407 goto fail_jinode_gh;
408 }
409 }
410
411 set_bit(SDF_JOURNAL_CHECKED, &sdp->sd_flags);
412 gfs2_glock_dq_uninit(&ji_gh);
413 jindex = 0;
414
415 p = kthread_run(gfs2_recoverd, sdp, "gfs2_recoverd");
416 error = IS_ERR(p);
417 if (error) {
418 fs_err(sdp, "can't start recoverd thread: %d\n", error);
419 goto fail_jinode_gh;
420 }
421 sdp->sd_recoverd_process = p;
422
423 return 0;
424
425fail_recoverd:
426 kthread_stop(sdp->sd_recoverd_process);
427fail_jinode_gh:
428 if (!sdp->sd_args.ar_spectator)
429 gfs2_glock_dq_uninit(&sdp->sd_jinode_gh);
430fail_journal_gh:
431 if (!sdp->sd_args.ar_spectator)
432 gfs2_glock_dq_uninit(&sdp->sd_journal_gh);
433fail_jindex:
434 gfs2_jindex_free(sdp);
435 if (jindex)
436 gfs2_glock_dq_uninit(&ji_gh);
437fail:
438 iput(sdp->sd_jindex);
439 return error;
440}
441
442
443static int init_inodes(struct gfs2_sbd *sdp, int undo)
444{
445 int error = 0;
446 struct gfs2_inode *ip;
447 struct inode *inode;
448
449 if (undo)
450 goto fail_qinode;
451
452 inode = gfs2_lookup_root(sdp->sd_vfs, &sdp->sd_sb.sb_master_dir);
453 if (IS_ERR(inode)) {
454 error = PTR_ERR(inode);
455 fs_err(sdp, "can't read in master directory: %d\n", error);
456 goto fail;
457 }
458 sdp->sd_master_dir = inode;
459
460 error = init_journal(sdp, undo);
461 if (error)
462 goto fail_master;
463
464 /* Read in the master inode number inode */
465 sdp->sd_inum_inode = gfs2_lookup_simple(sdp->sd_master_dir, "inum");
466 if (IS_ERR(sdp->sd_inum_inode)) {
467 error = PTR_ERR(sdp->sd_inum_inode);
468 fs_err(sdp, "can't read in inum inode: %d\n", error);
469 goto fail_journal;
470 }
471
472
473 /* Read in the master statfs inode */
474 sdp->sd_statfs_inode = gfs2_lookup_simple(sdp->sd_master_dir, "statfs");
475 if (IS_ERR(sdp->sd_statfs_inode)) {
476 error = PTR_ERR(sdp->sd_statfs_inode);
477 fs_err(sdp, "can't read in statfs inode: %d\n", error);
478 goto fail_inum;
479 }
480
481 /* Read in the resource index inode */
482 sdp->sd_rindex = gfs2_lookup_simple(sdp->sd_master_dir, "rindex");
483 if (IS_ERR(sdp->sd_rindex)) {
484 error = PTR_ERR(sdp->sd_rindex);
485 fs_err(sdp, "can't get resource index inode: %d\n", error);
486 goto fail_statfs;
487 }
488 ip = GFS2_I(sdp->sd_rindex);
489 set_bit(GLF_STICKY, &ip->i_gl->gl_flags);
490 sdp->sd_rindex_vn = ip->i_gl->gl_vn - 1;
491
492 /* Read in the quota inode */
493 sdp->sd_quota_inode = gfs2_lookup_simple(sdp->sd_master_dir, "quota");
494 if (IS_ERR(sdp->sd_quota_inode)) {
495 error = PTR_ERR(sdp->sd_quota_inode);
496 fs_err(sdp, "can't get quota file inode: %d\n", error);
497 goto fail_rindex;
498 }
499 return 0;
500
501fail_qinode:
502 iput(sdp->sd_quota_inode);
503fail_rindex:
504 gfs2_clear_rgrpd(sdp);
505 iput(sdp->sd_rindex);
506fail_statfs:
507 iput(sdp->sd_statfs_inode);
508fail_inum:
509 iput(sdp->sd_inum_inode);
510fail_journal:
511 init_journal(sdp, UNDO);
512fail_master:
513 iput(sdp->sd_master_dir);
514fail:
515 return error;
516}
517
518static int init_per_node(struct gfs2_sbd *sdp, int undo)
519{
520 struct inode *pn = NULL;
521 char buf[30];
522 int error = 0;
523 struct gfs2_inode *ip;
524
525 if (sdp->sd_args.ar_spectator)
526 return 0;
527
528 if (undo)
529 goto fail_qc_gh;
530
531 pn = gfs2_lookup_simple(sdp->sd_master_dir, "per_node");
532 if (IS_ERR(pn)) {
533 error = PTR_ERR(pn);
534 fs_err(sdp, "can't find per_node directory: %d\n", error);
535 return error;
536 }
537
538 sprintf(buf, "inum_range%u", sdp->sd_jdesc->jd_jid);
539 sdp->sd_ir_inode = gfs2_lookup_simple(pn, buf);
540 if (IS_ERR(sdp->sd_ir_inode)) {
541 error = PTR_ERR(sdp->sd_ir_inode);
542 fs_err(sdp, "can't find local \"ir\" file: %d\n", error);
543 goto fail;
544 }
545
546 sprintf(buf, "statfs_change%u", sdp->sd_jdesc->jd_jid);
547 sdp->sd_sc_inode = gfs2_lookup_simple(pn, buf);
548 if (IS_ERR(sdp->sd_sc_inode)) {
549 error = PTR_ERR(sdp->sd_sc_inode);
550 fs_err(sdp, "can't find local \"sc\" file: %d\n", error);
551 goto fail_ir_i;
552 }
553
554 sprintf(buf, "quota_change%u", sdp->sd_jdesc->jd_jid);
555 sdp->sd_qc_inode = gfs2_lookup_simple(pn, buf);
556 if (IS_ERR(sdp->sd_qc_inode)) {
557 error = PTR_ERR(sdp->sd_qc_inode);
558 fs_err(sdp, "can't find local \"qc\" file: %d\n", error);
559 goto fail_ut_i;
560 }
561
562 iput(pn);
563 pn = NULL;
564
565 ip = GFS2_I(sdp->sd_ir_inode);
566 error = gfs2_glock_nq_init(ip->i_gl,
567 LM_ST_EXCLUSIVE, 0,
568 &sdp->sd_ir_gh);
569 if (error) {
570 fs_err(sdp, "can't lock local \"ir\" file: %d\n", error);
571 goto fail_qc_i;
572 }
573
574 ip = GFS2_I(sdp->sd_sc_inode);
575 error = gfs2_glock_nq_init(ip->i_gl,
576 LM_ST_EXCLUSIVE, 0,
577 &sdp->sd_sc_gh);
578 if (error) {
579 fs_err(sdp, "can't lock local \"sc\" file: %d\n", error);
580 goto fail_ir_gh;
581 }
582
583 ip = GFS2_I(sdp->sd_qc_inode);
584 error = gfs2_glock_nq_init(ip->i_gl,
585 LM_ST_EXCLUSIVE, 0,
586 &sdp->sd_qc_gh);
587 if (error) {
588 fs_err(sdp, "can't lock local \"qc\" file: %d\n", error);
589 goto fail_ut_gh;
590 }
591
592 return 0;
593
594fail_qc_gh:
595 gfs2_glock_dq_uninit(&sdp->sd_qc_gh);
596fail_ut_gh:
597 gfs2_glock_dq_uninit(&sdp->sd_sc_gh);
598fail_ir_gh:
599 gfs2_glock_dq_uninit(&sdp->sd_ir_gh);
600fail_qc_i:
601 iput(sdp->sd_qc_inode);
602fail_ut_i:
603 iput(sdp->sd_sc_inode);
604fail_ir_i:
605 iput(sdp->sd_ir_inode);
606fail:
607 if (pn)
608 iput(pn);
609 return error;
610}
611
612static int init_threads(struct gfs2_sbd *sdp, int undo)
613{
614 struct task_struct *p;
615 int error = 0;
616
617 if (undo)
618 goto fail_quotad;
619
620 sdp->sd_log_flush_time = jiffies;
621 sdp->sd_jindex_refresh_time = jiffies;
622
623 p = kthread_run(gfs2_logd, sdp, "gfs2_logd");
624 error = IS_ERR(p);
625 if (error) {
626 fs_err(sdp, "can't start logd thread: %d\n", error);
627 return error;
628 }
629 sdp->sd_logd_process = p;
630
631 sdp->sd_statfs_sync_time = jiffies;
632 sdp->sd_quota_sync_time = jiffies;
633
634 p = kthread_run(gfs2_quotad, sdp, "gfs2_quotad");
635 error = IS_ERR(p);
636 if (error) {
637 fs_err(sdp, "can't start quotad thread: %d\n", error);
638 goto fail;
639 }
640 sdp->sd_quotad_process = p;
641
642 return 0;
643
644
645fail_quotad:
646 kthread_stop(sdp->sd_quotad_process);
647fail:
648 kthread_stop(sdp->sd_logd_process);
649 return error;
650}
651
652/**
653 * fill_super - Read in superblock
654 * @sb: The VFS superblock
655 * @data: Mount options
656 * @silent: Don't complain if it's not a GFS2 filesystem
657 *
658 * Returns: errno
659 */
660
661static int fill_super(struct super_block *sb, void *data, int silent)
662{
663 struct gfs2_sbd *sdp;
664 struct gfs2_holder mount_gh;
665 int error;
666
667 sdp = init_sbd(sb);
668 if (!sdp) {
669 printk(KERN_WARNING "GFS2: can't alloc struct gfs2_sbd\n");
670 return -ENOMEM;
671 }
672
673 error = gfs2_mount_args(sdp, (char *)data, 0);
674 if (error) {
675 printk(KERN_WARNING "GFS2: can't parse mount arguments\n");
676 goto fail;
677 }
678
679 init_vfs(sb, SDF_NOATIME);
680
681 /* Set up the buffer cache and fill in some fake block size values
682 to allow us to read-in the on-disk superblock. */
683 sdp->sd_sb.sb_bsize = sb_min_blocksize(sb, GFS2_BASIC_BLOCK);
684 sdp->sd_sb.sb_bsize_shift = sb->s_blocksize_bits;
685 sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift -
686 GFS2_BASIC_BLOCK_SHIFT;
687 sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift;
688
689 error = init_names(sdp, silent);
690 if (error)
691 goto fail;
692
693 error = gfs2_sys_fs_add(sdp);
694 if (error)
695 goto fail;
696
697 error = gfs2_lm_mount(sdp, silent);
698 if (error)
699 goto fail_sys;
700
701 error = init_locking(sdp, &mount_gh, DO);
702 if (error)
703 goto fail_lm;
704
705 error = init_sb(sdp, silent, DO);
706 if (error)
707 goto fail_locking;
708
709 error = init_inodes(sdp, DO);
710 if (error)
711 goto fail_sb;
712
713 error = init_per_node(sdp, DO);
714 if (error)
715 goto fail_inodes;
716
717 error = gfs2_statfs_init(sdp);
718 if (error) {
719 fs_err(sdp, "can't initialize statfs subsystem: %d\n", error);
720 goto fail_per_node;
721 }
722
723 error = init_threads(sdp, DO);
724 if (error)
725 goto fail_per_node;
726
727 if (!(sb->s_flags & MS_RDONLY)) {
728 error = gfs2_make_fs_rw(sdp);
729 if (error) {
730 fs_err(sdp, "can't make FS RW: %d\n", error);
731 goto fail_threads;
732 }
733 }
734
735 gfs2_glock_dq_uninit(&mount_gh);
736
737 return 0;
738
739fail_threads:
740 init_threads(sdp, UNDO);
741fail_per_node:
742 init_per_node(sdp, UNDO);
743fail_inodes:
744 init_inodes(sdp, UNDO);
745fail_sb:
746 init_sb(sdp, 0, UNDO);
747fail_locking:
748 init_locking(sdp, &mount_gh, UNDO);
749fail_lm:
750 gfs2_gl_hash_clear(sdp, WAIT);
751 gfs2_lm_unmount(sdp);
752 while (invalidate_inodes(sb))
753 yield();
754fail_sys:
755 gfs2_sys_fs_del(sdp);
756fail:
757 kfree(sdp);
758 sb->s_fs_info = NULL;
759 return error;
760}
761
762static int gfs2_get_sb(struct file_system_type *fs_type, int flags,
763 const char *dev_name, void *data, struct vfsmount *mnt)
764{
765 struct super_block *sb;
766 struct gfs2_sbd *sdp;
767 int error = get_sb_bdev(fs_type, flags, dev_name, data, fill_super, mnt);
768 if (error)
769 goto out;
770 sb = mnt->mnt_sb;
771 sdp = sb->s_fs_info;
772 sdp->sd_gfs2mnt = mnt;
773out:
774 return error;
775}
776
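For reference, a mount(2) call that would drive this path (run as root). The device, mount point and lock table below are hypothetical; lockproto= and locktable= are among the options gfs2_mount_args() parses via fill_super():

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* hypothetical device, target and cluster:fsname lock table */
	if (mount("/dev/sdb1", "/mnt/gfs2", "gfs2", MS_NOATIME,
		  "lockproto=lock_dlm,locktable=mycluster:myfs") < 0) {
		perror("mount");
		return 1;
	}
	return 0;
}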
777static int fill_super_meta(struct super_block *sb, struct super_block *new,
778 void *data, int silent)
779{
780 struct gfs2_sbd *sdp = sb->s_fs_info;
781 struct inode *inode;
782 int error = 0;
783
784 new->s_fs_info = sdp;
785 sdp->sd_vfs_meta = sb;
786
787 init_vfs(new, SDF_NOATIME);
788
789 /* Get the master inode */
790 inode = igrab(sdp->sd_master_dir);
791
792 new->s_root = d_alloc_root(inode);
793 if (!new->s_root) {
794 fs_err(sdp, "can't get root dentry\n");
795 error = -ENOMEM;
796 iput(inode);
797 } else
798 new->s_root->d_op = &gfs2_dops;
799
800 return error;
801}
802
803static int set_bdev_super(struct super_block *s, void *data)
804{
805 s->s_bdev = data;
806 s->s_dev = s->s_bdev->bd_dev;
807 return 0;
808}
809
810static int test_bdev_super(struct super_block *s, void *data)
811{
812 return s->s_bdev == data;
813}
814
815static struct super_block *get_gfs2_sb(const char *dev_name)
816{
817 struct kstat stat;
818 struct nameidata nd;
819 struct file_system_type *fstype;
820 struct super_block *sb = NULL, *s;
821 struct list_head *l;
822 int error;
823
824 error = path_lookup(dev_name, LOOKUP_FOLLOW, &nd);
825 if (error) {
826 printk(KERN_WARNING "GFS2: path_lookup on %s returned error %d\n",
827 dev_name, error);
828 goto out;
829 }
830 error = vfs_getattr(nd.mnt, nd.dentry, &stat);
831
832 fstype = get_fs_type("gfs2");
833 list_for_each(l, &fstype->fs_supers) {
834 s = list_entry(l, struct super_block, s_instances);
835 if ((S_ISBLK(stat.mode) && s->s_dev == stat.rdev) ||
836 (S_ISDIR(stat.mode) && s == nd.dentry->d_inode->i_sb)) {
837 sb = s;
838 goto free_nd;
839 }
840 }
841
842 printk(KERN_WARNING "GFS2: Unrecognized block device or "
843 "mount point %s", dev_name);
844
845free_nd:
846 path_release(&nd);
847out:
848 return sb;
849}
850
851static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags,
852 const char *dev_name, void *data, struct vfsmount *mnt)
853{
854 int error = 0;
855 struct super_block *sb = NULL, *new;
856 struct gfs2_sbd *sdp;
857
858 sb = get_gfs2_sb(dev_name);
859 if (!sb) {
860 printk(KERN_WARNING "GFS2: gfs2 mount does not exist\n");
861 error = -ENOENT;
862 goto error;
863 }
864 sdp = sb->s_fs_info;
865 if (sdp->sd_vfs_meta) {
866 printk(KERN_WARNING "GFS2: gfs2meta mount already exists\n");
867 error = -EBUSY;
868 goto error;
869 }
870 mutex_lock(&sb->s_bdev->bd_mount_mutex);
871 new = sget(fs_type, test_bdev_super, set_bdev_super, sb->s_bdev);
872 mutex_unlock(&sb->s_bdev->bd_mount_mutex);
873 if (IS_ERR(new)) {
874 error = PTR_ERR(new);
875 goto error;
876 }
877 module_put(fs_type->owner);
878 new->s_flags = flags;
879 strlcpy(new->s_id, sb->s_id, sizeof(new->s_id));
880 sb_set_blocksize(new, sb->s_blocksize);
881 error = fill_super_meta(sb, new, data, flags & MS_SILENT ? 1 : 0);
882 if (error) {
883 up_write(&new->s_umount);
884 deactivate_super(new);
885 goto error;
886 }
887
888 new->s_flags |= MS_ACTIVE;
889
890 /* Grab a reference to the gfs2 mount point */
891 atomic_inc(&sdp->sd_gfs2mnt->mnt_count);
892 return simple_set_mnt(mnt, new);
893error:
894 return error;
895}
896
897static void gfs2_kill_sb(struct super_block *sb)
898{
899 kill_block_super(sb);
900}
901
902static void gfs2_kill_sb_meta(struct super_block *sb)
903{
904 struct gfs2_sbd *sdp = sb->s_fs_info;
905 generic_shutdown_super(sb);
906 sdp->sd_vfs_meta = NULL;
907 atomic_dec(&sdp->sd_gfs2mnt->mnt_count);
908}
909
910struct file_system_type gfs2_fs_type = {
911 .name = "gfs2",
912 .fs_flags = FS_REQUIRES_DEV,
913 .get_sb = gfs2_get_sb,
914 .kill_sb = gfs2_kill_sb,
915 .owner = THIS_MODULE,
916};
917
918struct file_system_type gfs2meta_fs_type = {
919 .name = "gfs2meta",
920 .fs_flags = FS_REQUIRES_DEV,
921 .get_sb = gfs2_get_sb_meta,
922 .kill_sb = gfs2_kill_sb_meta,
923 .owner = THIS_MODULE,
924};
925
diff --git a/fs/gfs2/ops_fstype.h b/fs/gfs2/ops_fstype.h
new file mode 100644
index 000000000000..7cc2c296271b
--- /dev/null
+++ b/fs/gfs2/ops_fstype.h
@@ -0,0 +1,18 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __OPS_FSTYPE_DOT_H__
11#define __OPS_FSTYPE_DOT_H__
12
13#include <linux/fs.h>
14
15extern struct file_system_type gfs2_fs_type;
16extern struct file_system_type gfs2meta_fs_type;
17
18#endif /* __OPS_FSTYPE_DOT_H__ */
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
new file mode 100644
index 000000000000..ef6e5ed70e94
--- /dev/null
+++ b/fs/gfs2/ops_inode.c
@@ -0,0 +1,1151 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/namei.h>
16#include <linux/utsname.h>
17#include <linux/mm.h>
18#include <linux/xattr.h>
19#include <linux/posix_acl.h>
20#include <linux/gfs2_ondisk.h>
21#include <linux/crc32.h>
22#include <linux/lm_interface.h>
23#include <asm/uaccess.h>
24
25#include "gfs2.h"
26#include "incore.h"
27#include "acl.h"
28#include "bmap.h"
29#include "dir.h"
30#include "eaops.h"
31#include "eattr.h"
32#include "glock.h"
33#include "inode.h"
34#include "meta_io.h"
35#include "ops_dentry.h"
36#include "ops_inode.h"
37#include "quota.h"
38#include "rgrp.h"
39#include "trans.h"
40#include "util.h"
41
42/**
43 * gfs2_create - Create a file
44 * @dir: The directory in which to create the file
45 * @dentry: The dentry of the new file
46 * @mode: The mode of the new file
47 * @nd: The nameidata passed from the VFS (the open intent flags are used)
48 * Returns: errno
49 */
50
51static int gfs2_create(struct inode *dir, struct dentry *dentry,
52 int mode, struct nameidata *nd)
53{
54 struct gfs2_inode *dip = GFS2_I(dir);
55 struct gfs2_sbd *sdp = GFS2_SB(dir);
56 struct gfs2_holder ghs[2];
57 struct inode *inode;
58
59 gfs2_holder_init(dip->i_gl, 0, 0, ghs);
60
61 for (;;) {
62 inode = gfs2_createi(ghs, &dentry->d_name, S_IFREG | mode);
63 if (!IS_ERR(inode)) {
64 gfs2_trans_end(sdp);
65 if (dip->i_alloc.al_rgd)
66 gfs2_inplace_release(dip);
67 gfs2_quota_unlock(dip);
68 gfs2_alloc_put(dip);
69 gfs2_glock_dq_uninit_m(2, ghs);
70 mark_inode_dirty(inode);
71 break;
72 } else if (PTR_ERR(inode) != -EEXIST ||
73 (nd->intent.open.flags & O_EXCL)) {
74 gfs2_holder_uninit(ghs);
75 return PTR_ERR(inode);
76 }
77
78 inode = gfs2_lookupi(dir, &dentry->d_name, 0, nd);
79 if (inode) {
80 if (!IS_ERR(inode)) {
81 gfs2_holder_uninit(ghs);
82 break;
83 } else {
84 gfs2_holder_uninit(ghs);
85 return PTR_ERR(inode);
86 }
87 }
88 }
89
90 d_instantiate(dentry, inode);
91
92 return 0;
93}
94
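The loop in gfs2_create() handles a cluster-wide create/create race: if gfs2_createi() fails with -EEXIST and the caller did not ask for O_EXCL, the code falls back to a lookup; if the racing creator backed out again before the lookup completes, the whole sequence is retried. A self-contained user-space sketch of that retry shape, with create_once() and lookup_once() as hypothetical stand-ins for gfs2_createi() and gfs2_lookupi():

	#include <errno.h>
	#include <stdio.h>

	static int slot = -1;			/* -1: no inode, >= 0: "inode number" */

	static int create_once(void)
	{
		if (slot >= 0)
			return -EEXIST;		/* lost the race to another creator */
		slot = 42;
		return slot;
	}

	static int lookup_once(void)
	{
		return slot;			/* -1 if the racing creator backed out */
	}

	static int create_or_get(int excl)
	{
		for (;;) {
			int ret = create_once();
			if (ret >= 0)
				return ret;	/* we created it ourselves */
			if (ret != -EEXIST || excl)
				return ret;	/* O_EXCL makes the race fatal */
			ret = lookup_once();
			if (ret >= 0)
				return ret;	/* use the inode the racer made */
			/* creator backed out between create and lookup: retry */
		}
	}

	int main(void)
	{
		printf("inode %d\n", create_or_get(0));
		return 0;
	}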
95/**
96 * gfs2_lookup - Look up a filename in a directory and return its inode
97 * @dir: The directory inode
98 * @dentry: The dentry of the new inode
99 * @nd: passed from Linux VFS, ignored by us
100 *
101 * Called by the VFS layer. Lock dir and call gfs2_lookupi()
102 *
103 * Returns: errno
104 */
105
106static struct dentry *gfs2_lookup(struct inode *dir, struct dentry *dentry,
107 struct nameidata *nd)
108{
109 struct inode *inode = NULL;
110
111 dentry->d_op = &gfs2_dops;
112
113 inode = gfs2_lookupi(dir, &dentry->d_name, 0, nd);
114 if (inode && IS_ERR(inode))
115 return ERR_PTR(PTR_ERR(inode));
116
117 if (inode)
118 return d_splice_alias(inode, dentry);
119 d_add(dentry, inode);
120
121 return NULL;
122}
123
124/**
125 * gfs2_link - Link to a file
126 * @old_dentry: The inode to link
127 * @dir: Add link to this directory
128 * @dentry: The name of the link
129 *
130 * Link the inode in "old_dentry" into the directory "dir" with the
131 * name in "dentry".
132 *
133 * Returns: errno
134 */
135
136static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
137 struct dentry *dentry)
138{
139 struct gfs2_inode *dip = GFS2_I(dir);
140 struct gfs2_sbd *sdp = GFS2_SB(dir);
141 struct inode *inode = old_dentry->d_inode;
142 struct gfs2_inode *ip = GFS2_I(inode);
143 struct gfs2_holder ghs[2];
144 int alloc_required;
145 int error;
146
147 if (S_ISDIR(ip->i_di.di_mode))
148 return -EPERM;
149
150 gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
151 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1);
152
153 error = gfs2_glock_nq_m(2, ghs);
154 if (error)
155 goto out;
156
157 error = permission(dir, MAY_WRITE | MAY_EXEC, NULL);
158 if (error)
159 goto out_gunlock;
160
161 error = gfs2_dir_search(dir, &dentry->d_name, NULL, NULL);
162 switch (error) {
163 case -ENOENT:
164 break;
165 case 0:
166 error = -EEXIST;
167 default:
168 goto out_gunlock;
169 }
170
171 error = -EINVAL;
172 if (!dip->i_di.di_nlink)
173 goto out_gunlock;
174 error = -EFBIG;
175 if (dip->i_di.di_entries == (u32)-1)
176 goto out_gunlock;
177 error = -EPERM;
178 if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
179 goto out_gunlock;
180 error = -EINVAL;
181 if (!ip->i_di.di_nlink)
182 goto out_gunlock;
183 error = -EMLINK;
184 if (ip->i_di.di_nlink == (u32)-1)
185 goto out_gunlock;
186
187 alloc_required = error = gfs2_diradd_alloc_required(dir, &dentry->d_name);
188 if (error < 0)
189 goto out_gunlock;
190 error = 0;
191
192 if (alloc_required) {
193 struct gfs2_alloc *al = gfs2_alloc_get(dip);
194
195 error = gfs2_quota_lock(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
196 if (error)
197 goto out_alloc;
198
199 error = gfs2_quota_check(dip, dip->i_di.di_uid,
200 dip->i_di.di_gid);
201 if (error)
202 goto out_gunlock_q;
203
204 al->al_requested = sdp->sd_max_dirres;
205
206 error = gfs2_inplace_reserve(dip);
207 if (error)
208 goto out_gunlock_q;
209
210 error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
211 al->al_rgd->rd_ri.ri_length +
212 2 * RES_DINODE + RES_STATFS +
213 RES_QUOTA, 0);
214 if (error)
215 goto out_ipres;
216 } else {
217 error = gfs2_trans_begin(sdp, 2 * RES_DINODE + RES_LEAF, 0);
218 if (error)
219 goto out_ipres;
220 }
221
222 error = gfs2_dir_add(dir, &dentry->d_name, &ip->i_num,
223 IF2DT(ip->i_di.di_mode));
224 if (error)
225 goto out_end_trans;
226
227 error = gfs2_change_nlink(ip, +1);
228
229out_end_trans:
230 gfs2_trans_end(sdp);
231out_ipres:
232 if (alloc_required)
233 gfs2_inplace_release(dip);
234out_gunlock_q:
235 if (alloc_required)
236 gfs2_quota_unlock(dip);
237out_alloc:
238 if (alloc_required)
239 gfs2_alloc_put(dip);
240out_gunlock:
241 gfs2_glock_dq_m(2, ghs);
242out:
243 gfs2_holder_uninit(ghs);
244 gfs2_holder_uninit(ghs + 1);
245 if (!error) {
246 atomic_inc(&inode->i_count);
247 d_instantiate(dentry, inode);
248 mark_inode_dirty(inode);
249 }
250 return error;
251}
252
253/**
254 * gfs2_unlink - Unlink a file
255 * @dir: The inode of the directory containing the file to unlink
256 * @dentry: The file itself
257 *
258 * Unlink a file. Remove the directory entry and decrement the link count.
259 *
260 * Returns: errno
261 */
262
263static int gfs2_unlink(struct inode *dir, struct dentry *dentry)
264{
265 struct gfs2_inode *dip = GFS2_I(dir);
266 struct gfs2_sbd *sdp = GFS2_SB(dir);
267 struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
268 struct gfs2_holder ghs[2];
269 int error;
270
271 gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
272 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1);
273
274 error = gfs2_glock_nq_m(2, ghs);
275 if (error)
276 goto out;
277
278 error = gfs2_unlink_ok(dip, &dentry->d_name, ip);
279 if (error)
280 goto out_gunlock;
281
282 error = gfs2_trans_begin(sdp, 2*RES_DINODE + RES_LEAF + RES_RG_BIT, 0);
283 if (error)
284 goto out_gunlock;
285
286 error = gfs2_dir_del(dip, &dentry->d_name);
287 if (error)
288 goto out_end_trans;
289
290 error = gfs2_change_nlink(ip, -1);
291
292out_end_trans:
293 gfs2_trans_end(sdp);
294out_gunlock:
295 gfs2_glock_dq_m(2, ghs);
296out:
297 gfs2_holder_uninit(ghs);
298 gfs2_holder_uninit(ghs + 1);
299 return error;
300}
301
302/**
303 * gfs2_symlink - Create a symlink
304 * @dir: The directory to create the symlink in
305 * @dentry: The dentry to put the symlink in
306 * @symname: The thing which the link points to
307 *
308 * Returns: errno
309 */
310
311static int gfs2_symlink(struct inode *dir, struct dentry *dentry,
312 const char *symname)
313{
314 struct gfs2_inode *dip = GFS2_I(dir), *ip;
315 struct gfs2_sbd *sdp = GFS2_SB(dir);
316 struct gfs2_holder ghs[2];
317 struct inode *inode;
318 struct buffer_head *dibh;
319 int size;
320 int error;
321
322 /* Must be stuffed with a null terminator for gfs2_follow_link() */
323 size = strlen(symname);
324 if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode) - 1)
325 return -ENAMETOOLONG;
326
327 gfs2_holder_init(dip->i_gl, 0, 0, ghs);
328
329 inode = gfs2_createi(ghs, &dentry->d_name, S_IFLNK | S_IRWXUGO);
330 if (IS_ERR(inode)) {
331 gfs2_holder_uninit(ghs);
332 return PTR_ERR(inode);
333 }
334
335 ip = ghs[1].gh_gl->gl_object;
336
337 ip->i_di.di_size = size;
338
339 error = gfs2_meta_inode_buffer(ip, &dibh);
340
341 if (!gfs2_assert_withdraw(sdp, !error)) {
342 gfs2_dinode_out(&ip->i_di, dibh->b_data);
343 memcpy(dibh->b_data + sizeof(struct gfs2_dinode), symname,
344 size);
345 brelse(dibh);
346 }
347
348 gfs2_trans_end(sdp);
349 if (dip->i_alloc.al_rgd)
350 gfs2_inplace_release(dip);
351 gfs2_quota_unlock(dip);
352 gfs2_alloc_put(dip);
353
354 gfs2_glock_dq_uninit_m(2, ghs);
355
356 d_instantiate(dentry, inode);
357 mark_inode_dirty(inode);
358
359 return 0;
360}
361
362/**
363 * gfs2_mkdir - Make a directory
364 * @dir: The parent directory of the new one
365 * @dentry: The dentry of the new directory
366 * @mode: The mode of the new directory
367 *
368 * Returns: errno
369 */
370
371static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
372{
373 struct gfs2_inode *dip = GFS2_I(dir), *ip;
374 struct gfs2_sbd *sdp = GFS2_SB(dir);
375 struct gfs2_holder ghs[2];
376 struct inode *inode;
377 struct buffer_head *dibh;
378 int error;
379
380 gfs2_holder_init(dip->i_gl, 0, 0, ghs);
381
382 inode = gfs2_createi(ghs, &dentry->d_name, S_IFDIR | mode);
383 if (IS_ERR(inode)) {
384 gfs2_holder_uninit(ghs);
385 return PTR_ERR(inode);
386 }
387
388 ip = ghs[1].gh_gl->gl_object;
389
390 ip->i_di.di_nlink = 2;
391 ip->i_di.di_size = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode);
392 ip->i_di.di_flags |= GFS2_DIF_JDATA;
393 ip->i_di.di_payload_format = GFS2_FORMAT_DE;
394 ip->i_di.di_entries = 2;
395
396 error = gfs2_meta_inode_buffer(ip, &dibh);
397
398 if (!gfs2_assert_withdraw(sdp, !error)) {
399 struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data;
400 struct gfs2_dirent *dent = (struct gfs2_dirent *)(di+1);
401 struct qstr str;
402
403 gfs2_str2qstr(&str, ".");
404 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
405 gfs2_qstr2dirent(&str, GFS2_DIRENT_SIZE(str.len), dent);
406 dent->de_inum = di->di_num; /* already GFS2 endian */
407 dent->de_type = cpu_to_be16(DT_DIR);
408 di->di_entries = cpu_to_be32(1);
409
410 gfs2_str2qstr(&str, "..");
411 dent = (struct gfs2_dirent *)((char*)dent + GFS2_DIRENT_SIZE(1));
412 gfs2_qstr2dirent(&str, dibh->b_size - GFS2_DIRENT_SIZE(1) - sizeof(struct gfs2_dinode), dent);
413
414 gfs2_inum_out(&dip->i_num, &dent->de_inum);
415 dent->de_type = cpu_to_be16(DT_DIR);
416
417 gfs2_dinode_out(&ip->i_di, di);
418
419 brelse(dibh);
420 }
421
422 error = gfs2_change_nlink(dip, +1);
423 gfs2_assert_withdraw(sdp, !error); /* dip already pinned */
424
425 gfs2_trans_end(sdp);
426 if (dip->i_alloc.al_rgd)
427 gfs2_inplace_release(dip);
428 gfs2_quota_unlock(dip);
429 gfs2_alloc_put(dip);
430
431 gfs2_glock_dq_uninit_m(2, ghs);
432
433 d_instantiate(dentry, inode);
434 mark_inode_dirty(inode);
435
436 return 0;
437}
438
439/**
440 * gfs2_rmdir - Remove a directory
441 * @dir: The parent directory of the directory to be removed
442 * @dentry: The dentry of the directory to remove
443 *
444 * Remove a directory. Call gfs2_rmdiri()
445 *
446 * Returns: errno
447 */
448
449static int gfs2_rmdir(struct inode *dir, struct dentry *dentry)
450{
451 struct gfs2_inode *dip = GFS2_I(dir);
452 struct gfs2_sbd *sdp = GFS2_SB(dir);
453 struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
454 struct gfs2_holder ghs[2];
455 int error;
456
457 gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
458 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1);
459
460 error = gfs2_glock_nq_m(2, ghs);
461 if (error)
462 goto out;
463
464 error = gfs2_unlink_ok(dip, &dentry->d_name, ip);
465 if (error)
466 goto out_gunlock;
467
468 if (ip->i_di.di_entries < 2) {
469 if (gfs2_consist_inode(ip))
470 gfs2_dinode_print(&ip->i_di);
471 error = -EIO;
472 goto out_gunlock;
473 }
474 if (ip->i_di.di_entries > 2) {
475 error = -ENOTEMPTY;
476 goto out_gunlock;
477 }
478
479 error = gfs2_trans_begin(sdp, 2 * RES_DINODE + 3 * RES_LEAF + RES_RG_BIT, 0);
480 if (error)
481 goto out_gunlock;
482
483 error = gfs2_rmdiri(dip, &dentry->d_name, ip);
484
485 gfs2_trans_end(sdp);
486
487out_gunlock:
488 gfs2_glock_dq_m(2, ghs);
489out:
490 gfs2_holder_uninit(ghs);
491 gfs2_holder_uninit(ghs + 1);
492 return error;
493}
494
495/**
496 * gfs2_mknod - Make a special file
497 * @dir: The directory in which the special file will reside
498 * @dentry: The dentry of the special file
499 * @mode: The mode of the special file
500 * @dev: The device specification of the special file
501 * Returns: errno
502 */
503
504static int gfs2_mknod(struct inode *dir, struct dentry *dentry, int mode,
505 dev_t dev)
506{
507 struct gfs2_inode *dip = GFS2_I(dir), *ip;
508 struct gfs2_sbd *sdp = GFS2_SB(dir);
509 struct gfs2_holder ghs[2];
510 struct inode *inode;
511 struct buffer_head *dibh;
512 u32 major = 0, minor = 0;
513 int error;
514
515 switch (mode & S_IFMT) {
516 case S_IFBLK:
517 case S_IFCHR:
518 major = MAJOR(dev);
519 minor = MINOR(dev);
520 break;
521 case S_IFIFO:
522 case S_IFSOCK:
523 break;
524 default:
525 return -EOPNOTSUPP;
526	}
527
528 gfs2_holder_init(dip->i_gl, 0, 0, ghs);
529
530 inode = gfs2_createi(ghs, &dentry->d_name, mode);
531 if (IS_ERR(inode)) {
532 gfs2_holder_uninit(ghs);
533 return PTR_ERR(inode);
534 }
535
536 ip = ghs[1].gh_gl->gl_object;
537
538 ip->i_di.di_major = major;
539 ip->i_di.di_minor = minor;
540
541 error = gfs2_meta_inode_buffer(ip, &dibh);
542
543 if (!gfs2_assert_withdraw(sdp, !error)) {
544 gfs2_dinode_out(&ip->i_di, dibh->b_data);
545 brelse(dibh);
546 }
547
548 gfs2_trans_end(sdp);
549 if (dip->i_alloc.al_rgd)
550 gfs2_inplace_release(dip);
551 gfs2_quota_unlock(dip);
552 gfs2_alloc_put(dip);
553
554 gfs2_glock_dq_uninit_m(2, ghs);
555
556 d_instantiate(dentry, inode);
557 mark_inode_dirty(inode);
558
559 return 0;
560}
561
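gfs2_mknod() only stores the MAJOR()/MINOR() halves of the device number for block and character nodes, so a node created with e.g. "mknod /mnt/gfs2/null c 1 3" ends up with di_major = 1 and di_minor = 3. A quick user-space check of the same split (the mknod path above is illustrative):

	#include <stdio.h>
	#include <sys/sysmacros.h>
	#include <sys/types.h>

	int main(void)
	{
		dev_t dev = makedev(1, 3);	/* the classic /dev/null numbers */

		printf("major %u minor %u\n", major(dev), minor(dev));
		return 0;
	}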
562/**
563 * gfs2_rename - Rename a file
564 * @odir: Parent directory of old file name
565 * @odentry: The old dentry of the file
566 * @ndir: Parent directory of new file name
567 * @ndentry: The new dentry of the file
568 *
569 * Returns: errno
570 */
571
572static int gfs2_rename(struct inode *odir, struct dentry *odentry,
573 struct inode *ndir, struct dentry *ndentry)
574{
575 struct gfs2_inode *odip = GFS2_I(odir);
576 struct gfs2_inode *ndip = GFS2_I(ndir);
577 struct gfs2_inode *ip = GFS2_I(odentry->d_inode);
578 struct gfs2_inode *nip = NULL;
579 struct gfs2_sbd *sdp = GFS2_SB(odir);
580 struct gfs2_holder ghs[4], r_gh;
581 unsigned int num_gh;
582 int dir_rename = 0;
583 int alloc_required;
584 unsigned int x;
585 int error;
586
587 if (ndentry->d_inode) {
588 nip = GFS2_I(ndentry->d_inode);
589 if (ip == nip)
590 return 0;
591 }
592
593	/* Make sure we aren't trying to move a directory into its subdir */
594
595 if (S_ISDIR(ip->i_di.di_mode) && odip != ndip) {
596 dir_rename = 1;
597
598 error = gfs2_glock_nq_init(sdp->sd_rename_gl,
599 LM_ST_EXCLUSIVE, 0,
600 &r_gh);
601 if (error)
602 goto out;
603
604 error = gfs2_ok_to_move(ip, ndip);
605 if (error)
606 goto out_gunlock_r;
607 }
608
609 num_gh = 1;
610 gfs2_holder_init(odip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
611 if (odip != ndip) {
612 gfs2_holder_init(ndip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh);
613 num_gh++;
614 }
615 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh);
616 num_gh++;
617
618 if (nip) {
619 gfs2_holder_init(nip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh);
620 num_gh++;
621 }
622
623 error = gfs2_glock_nq_m(num_gh, ghs);
624 if (error)
625 goto out_uninit;
626
627 /* Check out the old directory */
628
629 error = gfs2_unlink_ok(odip, &odentry->d_name, ip);
630 if (error)
631 goto out_gunlock;
632
633 /* Check out the new directory */
634
635 if (nip) {
636 error = gfs2_unlink_ok(ndip, &ndentry->d_name, nip);
637 if (error)
638 goto out_gunlock;
639
640 if (S_ISDIR(nip->i_di.di_mode)) {
641 if (nip->i_di.di_entries < 2) {
642 if (gfs2_consist_inode(nip))
643 gfs2_dinode_print(&nip->i_di);
644 error = -EIO;
645 goto out_gunlock;
646 }
647 if (nip->i_di.di_entries > 2) {
648 error = -ENOTEMPTY;
649 goto out_gunlock;
650 }
651 }
652 } else {
653 error = permission(ndir, MAY_WRITE | MAY_EXEC, NULL);
654 if (error)
655 goto out_gunlock;
656
657 error = gfs2_dir_search(ndir, &ndentry->d_name, NULL, NULL);
658 switch (error) {
659 case -ENOENT:
660 error = 0;
661 break;
662 case 0:
663 error = -EEXIST;
664 default:
665 goto out_gunlock;
666		}
667
668 if (odip != ndip) {
669 if (!ndip->i_di.di_nlink) {
670 error = -EINVAL;
671 goto out_gunlock;
672 }
673 if (ndip->i_di.di_entries == (u32)-1) {
674 error = -EFBIG;
675 goto out_gunlock;
676 }
677 if (S_ISDIR(ip->i_di.di_mode) &&
678 ndip->i_di.di_nlink == (u32)-1) {
679 error = -EMLINK;
680 goto out_gunlock;
681 }
682 }
683 }
684
685 /* Check out the dir to be renamed */
686
687 if (dir_rename) {
688 error = permission(odentry->d_inode, MAY_WRITE, NULL);
689 if (error)
690 goto out_gunlock;
691 }
692
693 alloc_required = error = gfs2_diradd_alloc_required(ndir, &ndentry->d_name);
694 if (error < 0)
695 goto out_gunlock;
696 error = 0;
697
698 if (alloc_required) {
699 struct gfs2_alloc *al = gfs2_alloc_get(ndip);
700
701 error = gfs2_quota_lock(ndip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
702 if (error)
703 goto out_alloc;
704
705 error = gfs2_quota_check(ndip, ndip->i_di.di_uid,
706 ndip->i_di.di_gid);
707 if (error)
708 goto out_gunlock_q;
709
710 al->al_requested = sdp->sd_max_dirres;
711
712 error = gfs2_inplace_reserve(ndip);
713 if (error)
714 goto out_gunlock_q;
715
716 error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
717 al->al_rgd->rd_ri.ri_length +
718 4 * RES_DINODE + 4 * RES_LEAF +
719 RES_STATFS + RES_QUOTA, 0);
720 if (error)
721 goto out_ipreserv;
722 } else {
723 error = gfs2_trans_begin(sdp, 4 * RES_DINODE +
724 5 * RES_LEAF, 0);
725 if (error)
726 goto out_gunlock;
727 }
728
729 /* Remove the target file, if it exists */
730
731 if (nip) {
732 if (S_ISDIR(nip->i_di.di_mode))
733 error = gfs2_rmdiri(ndip, &ndentry->d_name, nip);
734 else {
735 error = gfs2_dir_del(ndip, &ndentry->d_name);
736 if (error)
737 goto out_end_trans;
738 error = gfs2_change_nlink(nip, -1);
739 }
740 if (error)
741 goto out_end_trans;
742 }
743
744 if (dir_rename) {
745 struct qstr name;
746 gfs2_str2qstr(&name, "..");
747
748 error = gfs2_change_nlink(ndip, +1);
749 if (error)
750 goto out_end_trans;
751 error = gfs2_change_nlink(odip, -1);
752 if (error)
753 goto out_end_trans;
754
755 error = gfs2_dir_mvino(ip, &name, &ndip->i_num, DT_DIR);
756 if (error)
757 goto out_end_trans;
758 } else {
759 struct buffer_head *dibh;
760 error = gfs2_meta_inode_buffer(ip, &dibh);
761 if (error)
762 goto out_end_trans;
763 ip->i_di.di_ctime = get_seconds();
764 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
765 gfs2_dinode_out(&ip->i_di, dibh->b_data);
766 brelse(dibh);
767 }
768
769 error = gfs2_dir_del(odip, &odentry->d_name);
770 if (error)
771 goto out_end_trans;
772
773 error = gfs2_dir_add(ndir, &ndentry->d_name, &ip->i_num,
774 IF2DT(ip->i_di.di_mode));
775 if (error)
776 goto out_end_trans;
777
778out_end_trans:
779 gfs2_trans_end(sdp);
780out_ipreserv:
781 if (alloc_required)
782 gfs2_inplace_release(ndip);
783out_gunlock_q:
784 if (alloc_required)
785 gfs2_quota_unlock(ndip);
786out_alloc:
787 if (alloc_required)
788 gfs2_alloc_put(ndip);
789out_gunlock:
790 gfs2_glock_dq_m(num_gh, ghs);
791out_uninit:
792 for (x = 0; x < num_gh; x++)
793 gfs2_holder_uninit(ghs + x);
794out_gunlock_r:
795 if (dir_rename)
796 gfs2_glock_dq_uninit(&r_gh);
797out:
798 return error;
799}
800
801/**
802 * gfs2_readlink - Read the value of a symlink
803 * @dentry: the symlink
804 * @user_buf: the user buffer to read the symlink data into
805 * @user_size: the size of the user buffer
806 *
807 * Returns: errno
808 */
809
810static int gfs2_readlink(struct dentry *dentry, char __user *user_buf,
811 int user_size)
812{
813 struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
814 char array[GFS2_FAST_NAME_SIZE], *buf = array;
815 unsigned int len = GFS2_FAST_NAME_SIZE;
816 int error;
817
818 error = gfs2_readlinki(ip, &buf, &len);
819 if (error)
820 return error;
821
822 if (user_size > len - 1)
823 user_size = len - 1;
824
825 if (copy_to_user(user_buf, buf, user_size))
826 error = -EFAULT;
827 else
828 error = user_size;
829
830 if (buf != array)
831 kfree(buf);
832
833 return error;
834}
835
836/**
837 * gfs2_follow_link - Follow a symbolic link
838 * @dentry: The dentry of the link
839 * @nd: Data that we pass to vfs_follow_link()
840 *
841 * This can handle symlinks of any size. It is optimised for symlinks
842 * under GFS2_FAST_NAME_SIZE.
843 *
844 * Returns: 0 on success or error code
845 */
846
847static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
848{
849 struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
850 char array[GFS2_FAST_NAME_SIZE], *buf = array;
851 unsigned int len = GFS2_FAST_NAME_SIZE;
852 int error;
853
854 error = gfs2_readlinki(ip, &buf, &len);
855 if (!error) {
856 error = vfs_follow_link(nd, buf);
857 if (buf != array)
858 kfree(buf);
859 }
860
861 return ERR_PTR(error);
862}
863
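gfs2_readlink() and gfs2_follow_link() share a buffer idiom: start with a small on-stack array, let gfs2_readlinki() swap in a kmalloc'd buffer only when the target is longer than GFS2_FAST_NAME_SIZE, and free the buffer only if the pointer moved. A minimal user-space sketch of the same idiom, with fill_name() as a hypothetical stand-in for gfs2_readlinki():

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	#define FAST_NAME_SIZE 8		/* stands in for GFS2_FAST_NAME_SIZE */

	static int fill_name(char **buf, unsigned int *len, const char *src)
	{
		if (strlen(src) + 1 > *len) {
			*len = strlen(src) + 1;
			*buf = malloc(*len);	/* caller's array is too small */
			if (!*buf)
				return -1;
		}
		strcpy(*buf, src);
		return 0;
	}

	int main(void)
	{
		char array[FAST_NAME_SIZE], *buf = array;
		unsigned int len = FAST_NAME_SIZE;

		if (fill_name(&buf, &len, "a/rather/long/target") == 0)
			printf("%s\n", buf);
		if (buf != array)		/* free only what fill_name allocated */
			free(buf);
		return 0;
	}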
864/**
865 * gfs2_permission - Check permissions on an inode
866 * @inode: The inode to check
867 * @mask: The permissions requested (MAY_READ, MAY_WRITE and/or MAY_EXEC)
868 * @nd: passed from Linux VFS, ignored by us
869 *
870 * Returns: errno
871 */
872
873static int gfs2_permission(struct inode *inode, int mask, struct nameidata *nd)
874{
875 struct gfs2_inode *ip = GFS2_I(inode);
876 struct gfs2_holder i_gh;
877 int error;
878
879 if (ip->i_vn == ip->i_gl->gl_vn)
880 return generic_permission(inode, mask, gfs2_check_acl);
881
882 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
883 if (!error) {
884 error = generic_permission(inode, mask, gfs2_check_acl_locked);
885 gfs2_glock_dq_uninit(&i_gh);
886 }
887
888 return error;
889}
890
891static int setattr_size(struct inode *inode, struct iattr *attr)
892{
893 struct gfs2_inode *ip = GFS2_I(inode);
894 int error;
895
896 if (attr->ia_size != ip->i_di.di_size) {
897 error = vmtruncate(inode, attr->ia_size);
898 if (error)
899 return error;
900 }
901
902 error = gfs2_truncatei(ip, attr->ia_size);
903 if (error)
904 return error;
905
906	return 0;
907}
908
909static int setattr_chown(struct inode *inode, struct iattr *attr)
910{
911 struct gfs2_inode *ip = GFS2_I(inode);
912 struct gfs2_sbd *sdp = GFS2_SB(inode);
913 struct buffer_head *dibh;
914 u32 ouid, ogid, nuid, ngid;
915 int error;
916
917 ouid = ip->i_di.di_uid;
918 ogid = ip->i_di.di_gid;
919 nuid = attr->ia_uid;
920 ngid = attr->ia_gid;
921
922 if (!(attr->ia_valid & ATTR_UID) || ouid == nuid)
923 ouid = nuid = NO_QUOTA_CHANGE;
924 if (!(attr->ia_valid & ATTR_GID) || ogid == ngid)
925 ogid = ngid = NO_QUOTA_CHANGE;
926
927 gfs2_alloc_get(ip);
928
929 error = gfs2_quota_lock(ip, nuid, ngid);
930 if (error)
931 goto out_alloc;
932
933 if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) {
934 error = gfs2_quota_check(ip, nuid, ngid);
935 if (error)
936 goto out_gunlock_q;
937 }
938
939 error = gfs2_trans_begin(sdp, RES_DINODE + 2 * RES_QUOTA, 0);
940 if (error)
941 goto out_gunlock_q;
942
943 error = gfs2_meta_inode_buffer(ip, &dibh);
944 if (error)
945 goto out_end_trans;
946
947 error = inode_setattr(inode, attr);
948 gfs2_assert_warn(sdp, !error);
949 gfs2_inode_attr_out(ip);
950
951 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
952 gfs2_dinode_out(&ip->i_di, dibh->b_data);
953 brelse(dibh);
954
955 if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) {
956 gfs2_quota_change(ip, -ip->i_di.di_blocks, ouid, ogid);
957 gfs2_quota_change(ip, ip->i_di.di_blocks, nuid, ngid);
958 }
959
960out_end_trans:
961 gfs2_trans_end(sdp);
962out_gunlock_q:
963 gfs2_quota_unlock(ip);
964out_alloc:
965 gfs2_alloc_put(ip);
966 return error;
967}
968
969/**
970 * gfs2_setattr - Change attributes on an inode
971 * @dentry: The dentry which is changing
972 * @attr: The structure describing the change
973 *
974 * The VFS layer wants to change one or more of an inodes attributes. Write
975 * that change out to disk.
976 *
977 * Returns: errno
978 */
979
980static int gfs2_setattr(struct dentry *dentry, struct iattr *attr)
981{
982 struct inode *inode = dentry->d_inode;
983 struct gfs2_inode *ip = GFS2_I(inode);
984 struct gfs2_holder i_gh;
985 int error;
986
987 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
988 if (error)
989 return error;
990
991 error = -EPERM;
992 if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
993 goto out;
994
995 error = inode_change_ok(inode, attr);
996 if (error)
997 goto out;
998
999 if (attr->ia_valid & ATTR_SIZE)
1000 error = setattr_size(inode, attr);
1001 else if (attr->ia_valid & (ATTR_UID | ATTR_GID))
1002 error = setattr_chown(inode, attr);
1003 else if ((attr->ia_valid & ATTR_MODE) && IS_POSIXACL(inode))
1004 error = gfs2_acl_chmod(ip, attr);
1005 else
1006 error = gfs2_setattr_simple(ip, attr);
1007
1008out:
1009 gfs2_glock_dq_uninit(&i_gh);
1010 if (!error)
1011 mark_inode_dirty(inode);
1012 return error;
1013}
1014
1015/**
1016 * gfs2_getattr - Read out an inode's attributes
1017 * @mnt: The vfsmount the inode is being accessed from
1018 * @dentry: The dentry to stat
1019 * @stat: The inode's stats
1020 *
1021 * Returns: errno
1022 */
1023
1024static int gfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
1025 struct kstat *stat)
1026{
1027 struct inode *inode = dentry->d_inode;
1028 struct gfs2_inode *ip = GFS2_I(inode);
1029 struct gfs2_holder gh;
1030 int error;
1031
1032 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh);
1033 if (!error) {
1034 generic_fillattr(inode, stat);
1035 gfs2_glock_dq_uninit(&gh);
1036 }
1037
1038 return error;
1039}
1040
1041static int gfs2_setxattr(struct dentry *dentry, const char *name,
1042 const void *data, size_t size, int flags)
1043{
1044 struct inode *inode = dentry->d_inode;
1045 struct gfs2_ea_request er;
1046
1047 memset(&er, 0, sizeof(struct gfs2_ea_request));
1048 er.er_type = gfs2_ea_name2type(name, &er.er_name);
1049 if (er.er_type == GFS2_EATYPE_UNUSED)
1050 return -EOPNOTSUPP;
1051 er.er_data = (char *)data;
1052 er.er_name_len = strlen(er.er_name);
1053 er.er_data_len = size;
1054 er.er_flags = flags;
1055
1056 gfs2_assert_warn(GFS2_SB(inode), !(er.er_flags & GFS2_ERF_MODE));
1057
1058 return gfs2_ea_set(GFS2_I(inode), &er);
1059}
1060
1061static ssize_t gfs2_getxattr(struct dentry *dentry, const char *name,
1062 void *data, size_t size)
1063{
1064 struct gfs2_ea_request er;
1065
1066 memset(&er, 0, sizeof(struct gfs2_ea_request));
1067 er.er_type = gfs2_ea_name2type(name, &er.er_name);
1068 if (er.er_type == GFS2_EATYPE_UNUSED)
1069 return -EOPNOTSUPP;
1070 er.er_data = data;
1071 er.er_name_len = strlen(er.er_name);
1072 er.er_data_len = size;
1073
1074 return gfs2_ea_get(GFS2_I(dentry->d_inode), &er);
1075}
1076
1077static ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size)
1078{
1079 struct gfs2_ea_request er;
1080
1081 memset(&er, 0, sizeof(struct gfs2_ea_request));
1082 er.er_data = (size) ? buffer : NULL;
1083 er.er_data_len = size;
1084
1085 return gfs2_ea_list(GFS2_I(dentry->d_inode), &er);
1086}
1087
1088static int gfs2_removexattr(struct dentry *dentry, const char *name)
1089{
1090 struct gfs2_ea_request er;
1091
1092 memset(&er, 0, sizeof(struct gfs2_ea_request));
1093 er.er_type = gfs2_ea_name2type(name, &er.er_name);
1094 if (er.er_type == GFS2_EATYPE_UNUSED)
1095 return -EOPNOTSUPP;
1096 er.er_name_len = strlen(er.er_name);
1097
1098 return gfs2_ea_remove(GFS2_I(dentry->d_inode), &er);
1099}
1100
1101struct inode_operations gfs2_file_iops = {
1102 .permission = gfs2_permission,
1103 .setattr = gfs2_setattr,
1104 .getattr = gfs2_getattr,
1105 .setxattr = gfs2_setxattr,
1106 .getxattr = gfs2_getxattr,
1107 .listxattr = gfs2_listxattr,
1108 .removexattr = gfs2_removexattr,
1109};
1110
1111struct inode_operations gfs2_dev_iops = {
1112 .permission = gfs2_permission,
1113 .setattr = gfs2_setattr,
1114 .getattr = gfs2_getattr,
1115 .setxattr = gfs2_setxattr,
1116 .getxattr = gfs2_getxattr,
1117 .listxattr = gfs2_listxattr,
1118 .removexattr = gfs2_removexattr,
1119};
1120
1121struct inode_operations gfs2_dir_iops = {
1122 .create = gfs2_create,
1123 .lookup = gfs2_lookup,
1124 .link = gfs2_link,
1125 .unlink = gfs2_unlink,
1126 .symlink = gfs2_symlink,
1127 .mkdir = gfs2_mkdir,
1128 .rmdir = gfs2_rmdir,
1129 .mknod = gfs2_mknod,
1130 .rename = gfs2_rename,
1131 .permission = gfs2_permission,
1132 .setattr = gfs2_setattr,
1133 .getattr = gfs2_getattr,
1134 .setxattr = gfs2_setxattr,
1135 .getxattr = gfs2_getxattr,
1136 .listxattr = gfs2_listxattr,
1137 .removexattr = gfs2_removexattr,
1138};
1139
1140struct inode_operations gfs2_symlink_iops = {
1141 .readlink = gfs2_readlink,
1142 .follow_link = gfs2_follow_link,
1143 .permission = gfs2_permission,
1144 .setattr = gfs2_setattr,
1145 .getattr = gfs2_getattr,
1146 .setxattr = gfs2_setxattr,
1147 .getxattr = gfs2_getxattr,
1148 .listxattr = gfs2_listxattr,
1149 .removexattr = gfs2_removexattr,
1150};
1151
diff --git a/fs/gfs2/ops_inode.h b/fs/gfs2/ops_inode.h
new file mode 100644
index 000000000000..b15acb4fd34c
--- /dev/null
+++ b/fs/gfs2/ops_inode.h
@@ -0,0 +1,20 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __OPS_INODE_DOT_H__
11#define __OPS_INODE_DOT_H__
12
13#include <linux/fs.h>
14
15extern struct inode_operations gfs2_file_iops;
16extern struct inode_operations gfs2_dir_iops;
17extern struct inode_operations gfs2_symlink_iops;
18extern struct inode_operations gfs2_dev_iops;
19
20#endif /* __OPS_INODE_DOT_H__ */
diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c
new file mode 100644
index 000000000000..06f06f7773d0
--- /dev/null
+++ b/fs/gfs2/ops_super.c
@@ -0,0 +1,468 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/statfs.h>
16#include <linux/seq_file.h>
17#include <linux/mount.h>
18#include <linux/kthread.h>
19#include <linux/delay.h>
20#include <linux/gfs2_ondisk.h>
21#include <linux/crc32.h>
22#include <linux/lm_interface.h>
23
24#include "gfs2.h"
25#include "incore.h"
26#include "glock.h"
27#include "inode.h"
28#include "lm.h"
29#include "log.h"
30#include "mount.h"
31#include "ops_super.h"
32#include "quota.h"
33#include "recovery.h"
34#include "rgrp.h"
35#include "super.h"
36#include "sys.h"
37#include "util.h"
38#include "trans.h"
39#include "dir.h"
40#include "eattr.h"
41#include "bmap.h"
42
43/**
44 * gfs2_write_inode - Make sure the inode is stable on the disk
45 * @inode: The inode
46 * @sync: synchronous write flag
47 *
48 * Returns: errno
49 */
50
51static int gfs2_write_inode(struct inode *inode, int sync)
52{
53 struct gfs2_inode *ip = GFS2_I(inode);
54
55 /* Check this is a "normal" inode */
56 if (inode->i_private) {
57 if (current->flags & PF_MEMALLOC)
58 return 0;
59 if (sync)
60 gfs2_log_flush(GFS2_SB(inode), ip->i_gl);
61 }
62
63 return 0;
64}
65
66/**
67 * gfs2_put_super - Unmount the filesystem
68 * @sb: The VFS superblock
69 *
70 */
71
72static void gfs2_put_super(struct super_block *sb)
73{
74 struct gfs2_sbd *sdp = sb->s_fs_info;
75 int error;
76
77 if (!sdp)
78 return;
79
80 if (!strncmp(sb->s_type->name, "gfs2meta", 8))
81 return; /* Nothing to do */
82
83 /* Unfreeze the filesystem, if we need to */
84
85 mutex_lock(&sdp->sd_freeze_lock);
86 if (sdp->sd_freeze_count)
87 gfs2_glock_dq_uninit(&sdp->sd_freeze_gh);
88 mutex_unlock(&sdp->sd_freeze_lock);
89
90 kthread_stop(sdp->sd_quotad_process);
91 kthread_stop(sdp->sd_logd_process);
92 kthread_stop(sdp->sd_recoverd_process);
93 while (sdp->sd_glockd_num--)
94 kthread_stop(sdp->sd_glockd_process[sdp->sd_glockd_num]);
95 kthread_stop(sdp->sd_scand_process);
96
97 if (!(sb->s_flags & MS_RDONLY)) {
98 error = gfs2_make_fs_ro(sdp);
99 if (error)
100 gfs2_io_error(sdp);
101 }
102 /* At this point, we're through modifying the disk */
103
104 /* Release stuff */
105
106 iput(sdp->sd_master_dir);
107 iput(sdp->sd_jindex);
108 iput(sdp->sd_inum_inode);
109 iput(sdp->sd_statfs_inode);
110 iput(sdp->sd_rindex);
111 iput(sdp->sd_quota_inode);
112
113 gfs2_glock_put(sdp->sd_rename_gl);
114 gfs2_glock_put(sdp->sd_trans_gl);
115
116 if (!sdp->sd_args.ar_spectator) {
117 gfs2_glock_dq_uninit(&sdp->sd_journal_gh);
118 gfs2_glock_dq_uninit(&sdp->sd_jinode_gh);
119 gfs2_glock_dq_uninit(&sdp->sd_ir_gh);
120 gfs2_glock_dq_uninit(&sdp->sd_sc_gh);
121 gfs2_glock_dq_uninit(&sdp->sd_qc_gh);
122 iput(sdp->sd_ir_inode);
123 iput(sdp->sd_sc_inode);
124 iput(sdp->sd_qc_inode);
125 }
126
127 gfs2_glock_dq_uninit(&sdp->sd_live_gh);
128 gfs2_clear_rgrpd(sdp);
129 gfs2_jindex_free(sdp);
130 /* Take apart glock structures and buffer lists */
131 gfs2_gl_hash_clear(sdp, WAIT);
132 /* Unmount the locking protocol */
133 gfs2_lm_unmount(sdp);
134
135 /* At this point, we're through participating in the lockspace */
136 gfs2_sys_fs_del(sdp);
137 kfree(sdp);
138}
139
140/**
141 * gfs2_write_super - disk commit all incore transactions
142 * @sb: the filesystem
143 *
144 * This function is called every time sync(2) is called.
145 * After this exits, all dirty buffers are synced.
146 */
147
148static void gfs2_write_super(struct super_block *sb)
149{
150 gfs2_log_flush(sb->s_fs_info, NULL);
151}
152
153/**
154 * gfs2_write_super_lockfs - prevent further writes to the filesystem
155 * @sb: the VFS structure for the filesystem
156 *
157 */
158
159static void gfs2_write_super_lockfs(struct super_block *sb)
160{
161 struct gfs2_sbd *sdp = sb->s_fs_info;
162 int error;
163
164 for (;;) {
165 error = gfs2_freeze_fs(sdp);
166 if (!error)
167 break;
168
169 switch (error) {
170 case -EBUSY:
171 fs_err(sdp, "waiting for recovery before freeze\n");
172 break;
173
174 default:
175 fs_err(sdp, "error freezing FS: %d\n", error);
176 break;
177 }
178
179 fs_err(sdp, "retrying...\n");
180 msleep(1000);
181 }
182}
183
184/**
185 * gfs2_unlockfs - reallow writes to the filesystem
186 * @sb: the VFS structure for the filesystem
187 *
188 */
189
190static void gfs2_unlockfs(struct super_block *sb)
191{
192 gfs2_unfreeze_fs(sb->s_fs_info);
193}
194
195/**
196 * gfs2_statfs - Gather and return stats about the filesystem
197 * @dentry: A dentry on the filesystem to be queried
198 * @buf: The kstatfs buffer to fill
199 *
200 * Returns: 0 on success or error code
201 */
202
203static int gfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
204{
205 struct super_block *sb = dentry->d_inode->i_sb;
206 struct gfs2_sbd *sdp = sb->s_fs_info;
207 struct gfs2_statfs_change sc;
208 int error;
209
210 if (gfs2_tune_get(sdp, gt_statfs_slow))
211 error = gfs2_statfs_slow(sdp, &sc);
212 else
213 error = gfs2_statfs_i(sdp, &sc);
214
215 if (error)
216 return error;
217
218 buf->f_type = GFS2_MAGIC;
219 buf->f_bsize = sdp->sd_sb.sb_bsize;
220 buf->f_blocks = sc.sc_total;
221 buf->f_bfree = sc.sc_free;
222 buf->f_bavail = sc.sc_free;
223 buf->f_files = sc.sc_dinodes + sc.sc_free;
224 buf->f_ffree = sc.sc_free;
225 buf->f_namelen = GFS2_FNAMESIZE;
226
227 return 0;
228}
229
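What gfs2_statfs() fills in is visible from user space through statfs(2); a small sketch, assuming an illustrative mount point (GFS2_MAGIC, 0x01161970, is the value that shows up in f_type):

	#include <stdio.h>
	#include <sys/vfs.h>

	int main(void)
	{
		struct statfs st;

		if (statfs("/mnt/gfs2", &st) != 0) {	/* path is illustrative */
			perror("statfs");
			return 1;
		}
		printf("type=0x%lx bsize=%ld blocks=%ld free=%ld\n",
		       (long)st.f_type, (long)st.f_bsize,
		       (long)st.f_blocks, (long)st.f_bfree);
		return 0;
	}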
230/**
231 * gfs2_remount_fs - called when the FS is remounted
232 * @sb: the filesystem
233 * @flags: the remount flags
234 * @data: extra data passed in (not used right now)
235 *
236 * Returns: errno
237 */
238
239static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
240{
241 struct gfs2_sbd *sdp = sb->s_fs_info;
242 int error;
243
244 error = gfs2_mount_args(sdp, data, 1);
245 if (error)
246 return error;
247
248 if (sdp->sd_args.ar_spectator)
249 *flags |= MS_RDONLY;
250 else {
251 if (*flags & MS_RDONLY) {
252 if (!(sb->s_flags & MS_RDONLY))
253 error = gfs2_make_fs_ro(sdp);
254 } else if (!(*flags & MS_RDONLY) &&
255 (sb->s_flags & MS_RDONLY)) {
256 error = gfs2_make_fs_rw(sdp);
257 }
258 }
259
260 if (*flags & (MS_NOATIME | MS_NODIRATIME))
261 set_bit(SDF_NOATIME, &sdp->sd_flags);
262 else
263 clear_bit(SDF_NOATIME, &sdp->sd_flags);
264
265 /* Don't let the VFS update atimes. GFS2 handles this itself. */
266 *flags |= MS_NOATIME | MS_NODIRATIME;
267
268 return error;
269}
270
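In practice gfs2_remount_fs() is driven by remount requests, for example (mount point illustrative):

	mount -o remount,ro /mnt/gfs2	# transitions via gfs2_make_fs_ro()
	mount -o remount,rw /mnt/gfs2	# transitions via gfs2_make_fs_rw()

Whatever the caller asks for, the function forces MS_NOATIME | MS_NODIRATIME back on before returning, since GFS2 handles atime updates itself.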
271/**
272 * gfs2_clear_inode - Deallocate an inode when VFS is done with it
273 * @inode: The VFS inode
274 *
275 */
276
277static void gfs2_clear_inode(struct inode *inode)
278{
279	/* This tells us it's a "real" inode and not one which only
280 * serves to contain an address space (see rgrp.c, meta_io.c)
281 * which therefore doesn't have its own glocks.
282 */
283 if (inode->i_private) {
284 struct gfs2_inode *ip = GFS2_I(inode);
285 gfs2_glock_inode_squish(inode);
286 gfs2_assert(inode->i_sb->s_fs_info, ip->i_gl->gl_state == LM_ST_UNLOCKED);
287 ip->i_gl->gl_object = NULL;
288 gfs2_glock_schedule_for_reclaim(ip->i_gl);
289 gfs2_glock_put(ip->i_gl);
290 ip->i_gl = NULL;
291 if (ip->i_iopen_gh.gh_gl)
292 gfs2_glock_dq_uninit(&ip->i_iopen_gh);
293 }
294}
295
296/**
297 * gfs2_show_options - Show mount options for /proc/mounts
298 * @s: seq_file structure
299 * @mnt: vfsmount
300 *
301 * Returns: 0 on success or error code
302 */
303
304static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
305{
306 struct gfs2_sbd *sdp = mnt->mnt_sb->s_fs_info;
307 struct gfs2_args *args = &sdp->sd_args;
308
309 if (args->ar_lockproto[0])
310 seq_printf(s, ",lockproto=%s", args->ar_lockproto);
311 if (args->ar_locktable[0])
312 seq_printf(s, ",locktable=%s", args->ar_locktable);
313 if (args->ar_hostdata[0])
314 seq_printf(s, ",hostdata=%s", args->ar_hostdata);
315 if (args->ar_spectator)
316 seq_printf(s, ",spectator");
317 if (args->ar_ignore_local_fs)
318 seq_printf(s, ",ignore_local_fs");
319 if (args->ar_localflocks)
320 seq_printf(s, ",localflocks");
321 if (args->ar_localcaching)
322 seq_printf(s, ",localcaching");
323 if (args->ar_debug)
324 seq_printf(s, ",debug");
325 if (args->ar_upgrade)
326 seq_printf(s, ",upgrade");
327 if (args->ar_num_glockd != GFS2_GLOCKD_DEFAULT)
328 seq_printf(s, ",num_glockd=%u", args->ar_num_glockd);
329 if (args->ar_posix_acl)
330 seq_printf(s, ",acl");
331 if (args->ar_quota != GFS2_QUOTA_DEFAULT) {
332 char *state;
333 switch (args->ar_quota) {
334 case GFS2_QUOTA_OFF:
335 state = "off";
336 break;
337 case GFS2_QUOTA_ACCOUNT:
338 state = "account";
339 break;
340 case GFS2_QUOTA_ON:
341 state = "on";
342 break;
343 default:
344 state = "unknown";
345 break;
346 }
347 seq_printf(s, ",quota=%s", state);
348 }
349 if (args->ar_suiddir)
350 seq_printf(s, ",suiddir");
351 if (args->ar_data != GFS2_DATA_DEFAULT) {
352 char *state;
353 switch (args->ar_data) {
354 case GFS2_DATA_WRITEBACK:
355 state = "writeback";
356 break;
357 case GFS2_DATA_ORDERED:
358 state = "ordered";
359 break;
360 default:
361 state = "unknown";
362 break;
363 }
364 seq_printf(s, ",data=%s", state);
365 }
366
367 return 0;
368}
369
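Given the handling above, the options column of a gfs2 entry in /proc/mounts is built from comma-prefixed fragments, and only non-default settings are emitted (a plain mount can therefore show an empty set). With illustrative values it might read:

	lockproto=lock_dlm,locktable=mycluster:fs1,acl,quota=on,data=writeback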
370/*
371 * We have to (at the moment) hold the inode's main lock to cover
372 * the gap between unlocking the shared lock on the iopen lock and
373 * taking the exclusive lock. I'd rather do a shared -> exclusive
374 * conversion on the iopen lock, but we can change that later. This
375 * is safe, just less efficient.
376 */
377static void gfs2_delete_inode(struct inode *inode)
378{
379 struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
380 struct gfs2_inode *ip = GFS2_I(inode);
381 struct gfs2_holder gh;
382 int error;
383
384 if (!inode->i_private)
385 goto out;
386
387 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &gh);
388 if (unlikely(error)) {
389 gfs2_glock_dq_uninit(&ip->i_iopen_gh);
390 goto out;
391 }
392
393 gfs2_glock_dq(&ip->i_iopen_gh);
394 gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh);
395 error = gfs2_glock_nq(&ip->i_iopen_gh);
396 if (error)
397 goto out_uninit;
398
399 if (S_ISDIR(ip->i_di.di_mode) &&
400 (ip->i_di.di_flags & GFS2_DIF_EXHASH)) {
401 error = gfs2_dir_exhash_dealloc(ip);
402 if (error)
403 goto out_unlock;
404 }
405
406 if (ip->i_di.di_eattr) {
407 error = gfs2_ea_dealloc(ip);
408 if (error)
409 goto out_unlock;
410 }
411
412 if (!gfs2_is_stuffed(ip)) {
413 error = gfs2_file_dealloc(ip);
414 if (error)
415 goto out_unlock;
416 }
417
418 error = gfs2_dinode_dealloc(ip);
419
420out_unlock:
421 gfs2_glock_dq(&ip->i_iopen_gh);
422out_uninit:
423 gfs2_holder_uninit(&ip->i_iopen_gh);
424 gfs2_glock_dq_uninit(&gh);
425 if (error)
426 fs_warn(sdp, "gfs2_delete_inode: %d\n", error);
427out:
428 truncate_inode_pages(&inode->i_data, 0);
429 clear_inode(inode);
430}
431
432
433
434static struct inode *gfs2_alloc_inode(struct super_block *sb)
435{
436 struct gfs2_sbd *sdp = sb->s_fs_info;
437 struct gfs2_inode *ip;
438
439 ip = kmem_cache_alloc(gfs2_inode_cachep, GFP_KERNEL);
440 if (ip) {
441 ip->i_flags = 0;
442 ip->i_gl = NULL;
443 ip->i_greedy = gfs2_tune_get(sdp, gt_greedy_default);
444 ip->i_last_pfault = jiffies;
445 }
446	return ip ? &ip->i_inode : NULL;
447}
448
449static void gfs2_destroy_inode(struct inode *inode)
450{
451 kmem_cache_free(gfs2_inode_cachep, inode);
452}
453
454struct super_operations gfs2_super_ops = {
455 .alloc_inode = gfs2_alloc_inode,
456 .destroy_inode = gfs2_destroy_inode,
457 .write_inode = gfs2_write_inode,
458 .delete_inode = gfs2_delete_inode,
459 .put_super = gfs2_put_super,
460 .write_super = gfs2_write_super,
461 .write_super_lockfs = gfs2_write_super_lockfs,
462 .unlockfs = gfs2_unlockfs,
463 .statfs = gfs2_statfs,
464 .remount_fs = gfs2_remount_fs,
465 .clear_inode = gfs2_clear_inode,
466 .show_options = gfs2_show_options,
467};
468
diff --git a/fs/gfs2/ops_super.h b/fs/gfs2/ops_super.h
new file mode 100644
index 000000000000..9de73f042f78
--- /dev/null
+++ b/fs/gfs2/ops_super.h
@@ -0,0 +1,17 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __OPS_SUPER_DOT_H__
11#define __OPS_SUPER_DOT_H__
12
13#include <linux/fs.h>
14
15extern struct super_operations gfs2_super_ops;
16
17#endif /* __OPS_SUPER_DOT_H__ */
diff --git a/fs/gfs2/ops_vm.c b/fs/gfs2/ops_vm.c
new file mode 100644
index 000000000000..5453d2947ab3
--- /dev/null
+++ b/fs/gfs2/ops_vm.c
@@ -0,0 +1,184 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/mm.h>
16#include <linux/pagemap.h>
17#include <linux/gfs2_ondisk.h>
18#include <linux/lm_interface.h>
19
20#include "gfs2.h"
21#include "incore.h"
22#include "bmap.h"
23#include "glock.h"
24#include "inode.h"
25#include "ops_vm.h"
26#include "quota.h"
27#include "rgrp.h"
28#include "trans.h"
29#include "util.h"
30
31static void pfault_be_greedy(struct gfs2_inode *ip)
32{
33 unsigned int time;
34
35 spin_lock(&ip->i_spin);
36 time = ip->i_greedy;
37 ip->i_last_pfault = jiffies;
38 spin_unlock(&ip->i_spin);
39
40 igrab(&ip->i_inode);
41 if (gfs2_glock_be_greedy(ip->i_gl, time))
42 iput(&ip->i_inode);
43}
44
45static struct page *gfs2_private_nopage(struct vm_area_struct *area,
46 unsigned long address, int *type)
47{
48 struct gfs2_inode *ip = GFS2_I(area->vm_file->f_mapping->host);
49 struct page *result;
50
51 set_bit(GIF_PAGED, &ip->i_flags);
52
53 result = filemap_nopage(area, address, type);
54
55 if (result && result != NOPAGE_OOM)
56 pfault_be_greedy(ip);
57
58 return result;
59}
60
61static int alloc_page_backing(struct gfs2_inode *ip, struct page *page)
62{
63 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
64 unsigned long index = page->index;
65 u64 lblock = index << (PAGE_CACHE_SHIFT -
66 sdp->sd_sb.sb_bsize_shift);
67 unsigned int blocks = PAGE_CACHE_SIZE >> sdp->sd_sb.sb_bsize_shift;
68 struct gfs2_alloc *al;
69 unsigned int data_blocks, ind_blocks;
70 unsigned int x;
71 int error;
72
73 al = gfs2_alloc_get(ip);
74
75 error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
76 if (error)
77 goto out;
78
79 error = gfs2_quota_check(ip, ip->i_di.di_uid, ip->i_di.di_gid);
80 if (error)
81 goto out_gunlock_q;
82
83 gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks);
84
85 al->al_requested = data_blocks + ind_blocks;
86
87 error = gfs2_inplace_reserve(ip);
88 if (error)
89 goto out_gunlock_q;
90
91 error = gfs2_trans_begin(sdp, al->al_rgd->rd_ri.ri_length +
92 ind_blocks + RES_DINODE +
93 RES_STATFS + RES_QUOTA, 0);
94 if (error)
95 goto out_ipres;
96
97 if (gfs2_is_stuffed(ip)) {
98 error = gfs2_unstuff_dinode(ip, NULL);
99 if (error)
100 goto out_trans;
101 }
102
103 for (x = 0; x < blocks; ) {
104 u64 dblock;
105 unsigned int extlen;
106 int new = 1;
107
108 error = gfs2_extent_map(&ip->i_inode, lblock, &new, &dblock, &extlen);
109 if (error)
110 goto out_trans;
111
112 lblock += extlen;
113 x += extlen;
114 }
115
116 gfs2_assert_warn(sdp, al->al_alloced);
117
118out_trans:
119 gfs2_trans_end(sdp);
120out_ipres:
121 gfs2_inplace_release(ip);
122out_gunlock_q:
123 gfs2_quota_unlock(ip);
124out:
125 gfs2_alloc_put(ip);
126 return error;
127}
128
129static struct page *gfs2_sharewrite_nopage(struct vm_area_struct *area,
130 unsigned long address, int *type)
131{
132 struct file *file = area->vm_file;
133 struct gfs2_file *gf = file->private_data;
134 struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
135 struct gfs2_holder i_gh;
136 struct page *result = NULL;
137 unsigned long index = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) +
138 area->vm_pgoff;
139 int alloc_required;
140 int error;
141
142 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
143 if (error)
144 return NULL;
145
146 set_bit(GIF_PAGED, &ip->i_flags);
147 set_bit(GIF_SW_PAGED, &ip->i_flags);
148
149 error = gfs2_write_alloc_required(ip, (u64)index << PAGE_CACHE_SHIFT,
150 PAGE_CACHE_SIZE, &alloc_required);
151 if (error)
152 goto out;
153
154 set_bit(GFF_EXLOCK, &gf->f_flags);
155 result = filemap_nopage(area, address, type);
156 clear_bit(GFF_EXLOCK, &gf->f_flags);
157 if (!result || result == NOPAGE_OOM)
158 goto out;
159
160 if (alloc_required) {
161 error = alloc_page_backing(ip, result);
162 if (error) {
163 page_cache_release(result);
164 result = NULL;
165 goto out;
166 }
167 set_page_dirty(result);
168 }
169
170 pfault_be_greedy(ip);
171out:
172 gfs2_glock_dq_uninit(&i_gh);
173
174 return result;
175}
176
177struct vm_operations_struct gfs2_vm_ops_private = {
178 .nopage = gfs2_private_nopage,
179};
180
181struct vm_operations_struct gfs2_vm_ops_sharewrite = {
182 .nopage = gfs2_sharewrite_nopage,
183};
184
diff --git a/fs/gfs2/ops_vm.h b/fs/gfs2/ops_vm.h
new file mode 100644
index 000000000000..4ae8f43ed5e3
--- /dev/null
+++ b/fs/gfs2/ops_vm.h
@@ -0,0 +1,18 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __OPS_VM_DOT_H__
11#define __OPS_VM_DOT_H__
12
13#include <linux/mm.h>
14
15extern struct vm_operations_struct gfs2_vm_ops_private;
16extern struct vm_operations_struct gfs2_vm_ops_sharewrite;
17
18#endif /* __OPS_VM_DOT_H__ */
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
new file mode 100644
index 000000000000..a3deae7416c9
--- /dev/null
+++ b/fs/gfs2/quota.c
@@ -0,0 +1,1228 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10/*
11 * Quota change tags are associated with each transaction that allocates or
12 * deallocates space. Those changes are accumulated locally to each node (in a
13 * per-node file) and then are periodically synced to the quota file. This
14 * avoids the bottleneck of constantly touching the quota file, but introduces
15 * fuzziness in the current usage value of IDs that are being used on different
16 * nodes in the cluster simultaneously. So, it is possible for a user on
17 * multiple nodes to overrun their quota, but that overrun is controllable.
18 * Since quota tags are part of transactions, there is no need for a quota
19 * check program to be run after node crashes or anything like that.
20 *
21 * There are a couple of knobs that let the administrator manage the quota
22 * fuzziness. "quota_quantum" sets the maximum time a quota change can be
23 * sitting on one node before being synced to the quota file. (The default is
24 * 60 seconds.) Another knob, "quota_scale", controls how quickly the frequency
25 * of quota file syncs increases as the user moves closer to their limit. The
26 * more frequent the syncs, the more accurate the quota enforcement, but that
27 * means that there is more contention between the nodes for the quota file.
28 * The default value is one. This sets the maximum theoretical quota overrun
29 * (with an infinite number of nodes and infinite bandwidth) to twice the
30 * user's limit. (In practice, the maximum overrun you see should be much
31 * less.) A "quota_scale" number greater than one makes quota syncs more
32 * frequent and reduces the maximum overrun. Numbers less than one (but
33 * greater than zero) make quota syncs less frequent.
34 *
35 * GFS quotas also use per-ID Lock Value Blocks (LVBs) to cache the contents of
36 * the quota file, so it is not being constantly read.
37 */
38
39#include <linux/sched.h>
40#include <linux/slab.h>
41#include <linux/spinlock.h>
42#include <linux/completion.h>
43#include <linux/buffer_head.h>
44#include <linux/sort.h>
45#include <linux/fs.h>
46#include <linux/bio.h>
47#include <linux/gfs2_ondisk.h>
48#include <linux/lm_interface.h>
49
50#include "gfs2.h"
51#include "incore.h"
52#include "bmap.h"
53#include "glock.h"
54#include "glops.h"
55#include "log.h"
56#include "meta_io.h"
57#include "quota.h"
58#include "rgrp.h"
59#include "super.h"
60#include "trans.h"
61#include "inode.h"
62#include "ops_file.h"
63#include "ops_address.h"
64#include "util.h"
65
66#define QUOTA_USER 1
67#define QUOTA_GROUP 0
68
69static u64 qd2offset(struct gfs2_quota_data *qd)
70{
71 u64 offset;
72
73 offset = 2 * (u64)qd->qd_id + !test_bit(QDF_USER, &qd->qd_flags);
74 offset *= sizeof(struct gfs2_quota);
75
76 return offset;
77}
78
79static int qd_alloc(struct gfs2_sbd *sdp, int user, u32 id,
80 struct gfs2_quota_data **qdp)
81{
82 struct gfs2_quota_data *qd;
83 int error;
84
85 qd = kzalloc(sizeof(struct gfs2_quota_data), GFP_KERNEL);
86 if (!qd)
87 return -ENOMEM;
88
89 qd->qd_count = 1;
90 qd->qd_id = id;
91 if (user)
92 set_bit(QDF_USER, &qd->qd_flags);
93 qd->qd_slot = -1;
94
95 error = gfs2_glock_get(sdp, 2 * (u64)id + !user,
96 &gfs2_quota_glops, CREATE, &qd->qd_gl);
97 if (error)
98 goto fail;
99
100 error = gfs2_lvb_hold(qd->qd_gl);
101 gfs2_glock_put(qd->qd_gl);
102 if (error)
103 goto fail;
104
105 *qdp = qd;
106
107 return 0;
108
109fail:
110 kfree(qd);
111 return error;
112}
113
114static int qd_get(struct gfs2_sbd *sdp, int user, u32 id, int create,
115 struct gfs2_quota_data **qdp)
116{
117 struct gfs2_quota_data *qd = NULL, *new_qd = NULL;
118 int error, found;
119
120 *qdp = NULL;
121
122 for (;;) {
123 found = 0;
124 spin_lock(&sdp->sd_quota_spin);
125 list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) {
126 if (qd->qd_id == id &&
127 !test_bit(QDF_USER, &qd->qd_flags) == !user) {
128 qd->qd_count++;
129 found = 1;
130 break;
131 }
132 }
133
134 if (!found)
135 qd = NULL;
136
137 if (!qd && new_qd) {
138 qd = new_qd;
139 list_add(&qd->qd_list, &sdp->sd_quota_list);
140 atomic_inc(&sdp->sd_quota_count);
141 new_qd = NULL;
142 }
143
144 spin_unlock(&sdp->sd_quota_spin);
145
146 if (qd || !create) {
147 if (new_qd) {
148 gfs2_lvb_unhold(new_qd->qd_gl);
149 kfree(new_qd);
150 }
151 *qdp = qd;
152 return 0;
153 }
154
155 error = qd_alloc(sdp, user, id, &new_qd);
156 if (error)
157 return error;
158 }
159}
160
161static void qd_hold(struct gfs2_quota_data *qd)
162{
163 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
164
165 spin_lock(&sdp->sd_quota_spin);
166 gfs2_assert(sdp, qd->qd_count);
167 qd->qd_count++;
168 spin_unlock(&sdp->sd_quota_spin);
169}
170
171static void qd_put(struct gfs2_quota_data *qd)
172{
173 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
174 spin_lock(&sdp->sd_quota_spin);
175 gfs2_assert(sdp, qd->qd_count);
176 if (!--qd->qd_count)
177 qd->qd_last_touched = jiffies;
178 spin_unlock(&sdp->sd_quota_spin);
179}
180
181static int slot_get(struct gfs2_quota_data *qd)
182{
183 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
184 unsigned int c, o = 0, b;
185 unsigned char byte = 0;
186
187 spin_lock(&sdp->sd_quota_spin);
188
189 if (qd->qd_slot_count++) {
190 spin_unlock(&sdp->sd_quota_spin);
191 return 0;
192 }
193
194 for (c = 0; c < sdp->sd_quota_chunks; c++)
195 for (o = 0; o < PAGE_SIZE; o++) {
196 byte = sdp->sd_quota_bitmap[c][o];
197 if (byte != 0xFF)
198 goto found;
199 }
200
201 goto fail;
202
203found:
204 for (b = 0; b < 8; b++)
205 if (!(byte & (1 << b)))
206 break;
207 qd->qd_slot = c * (8 * PAGE_SIZE) + o * 8 + b;
208
209 if (qd->qd_slot >= sdp->sd_quota_slots)
210 goto fail;
211
212 sdp->sd_quota_bitmap[c][o] |= 1 << b;
213
214 spin_unlock(&sdp->sd_quota_spin);
215
216 return 0;
217
218fail:
219 qd->qd_slot_count--;
220 spin_unlock(&sdp->sd_quota_spin);
221 return -ENOSPC;
222}
223
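slot_get() scans sd_quota_chunks pages of bitmap for the first byte with a clear bit, then the first clear bit within that byte; the slot number folds the chunk, byte and bit indices back together. A self-contained user-space sketch of the same scan, shrunk to two 4-byte chunks:

	#include <stdio.h>

	#define CHUNKS 2
	#define CHUNK  4		/* stands in for PAGE_SIZE */

	static unsigned char bitmap[CHUNKS][CHUNK] = {
		{ 0xFF, 0xFF, 0xFF, 0xFF },
		{ 0xFF, 0x0F, 0x00, 0x00 },
	};

	static int first_free_slot(void)
	{
		unsigned int c, o, b;

		for (c = 0; c < CHUNKS; c++)
			for (o = 0; o < CHUNK; o++) {
				unsigned char byte = bitmap[c][o];
				if (byte == 0xFF)
					continue;
				for (b = 0; b < 8; b++)
					if (!(byte & (1 << b)))
						break;
				bitmap[c][o] |= 1 << b;		/* claim it */
				return c * (8 * CHUNK) + o * 8 + b;
			}
		return -1;
	}

	int main(void)
	{
		printf("slot %d\n", first_free_slot());	/* 32 + 8 + 4 = 44 */
		return 0;
	}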
224static void slot_hold(struct gfs2_quota_data *qd)
225{
226 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
227
228 spin_lock(&sdp->sd_quota_spin);
229 gfs2_assert(sdp, qd->qd_slot_count);
230 qd->qd_slot_count++;
231 spin_unlock(&sdp->sd_quota_spin);
232}
233
234static void slot_put(struct gfs2_quota_data *qd)
235{
236 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
237
238 spin_lock(&sdp->sd_quota_spin);
239 gfs2_assert(sdp, qd->qd_slot_count);
240 if (!--qd->qd_slot_count) {
241 gfs2_icbit_munge(sdp, sdp->sd_quota_bitmap, qd->qd_slot, 0);
242 qd->qd_slot = -1;
243 }
244 spin_unlock(&sdp->sd_quota_spin);
245}
246
247static int bh_get(struct gfs2_quota_data *qd)
248{
249 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
250 struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode);
251 unsigned int block, offset;
252 struct buffer_head *bh;
253 int error;
254 struct buffer_head bh_map = { .b_state = 0, .b_blocknr = 0 };
255
256 mutex_lock(&sdp->sd_quota_mutex);
257
258 if (qd->qd_bh_count++) {
259 mutex_unlock(&sdp->sd_quota_mutex);
260 return 0;
261 }
262
263 block = qd->qd_slot / sdp->sd_qc_per_block;
264	offset = qd->qd_slot % sdp->sd_qc_per_block;
265
266 bh_map.b_size = 1 << ip->i_inode.i_blkbits;
267 error = gfs2_block_map(&ip->i_inode, block, 0, &bh_map);
268 if (error)
269 goto fail;
270 error = gfs2_meta_read(ip->i_gl, bh_map.b_blocknr, DIO_WAIT, &bh);
271 if (error)
272 goto fail;
273 error = -EIO;
274 if (gfs2_metatype_check(sdp, bh, GFS2_METATYPE_QC))
275 goto fail_brelse;
276
277 qd->qd_bh = bh;
278 qd->qd_bh_qc = (struct gfs2_quota_change *)
279 (bh->b_data + sizeof(struct gfs2_meta_header) +
280 offset * sizeof(struct gfs2_quota_change));
281
282	mutex_unlock(&sdp->sd_quota_mutex);
283
284 return 0;
285
286fail_brelse:
287 brelse(bh);
288fail:
289 qd->qd_bh_count--;
290 mutex_unlock(&sdp->sd_quota_mutex);
291 return error;
292}
293
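bh_get() turns a quota slot number into a (block, offset) pair with a simple div/mod by sd_qc_per_block, i.e. the number of gfs2_quota_change records that fit in one quota-change block. The per-block count is not derived in this hunk; assuming, purely for illustration, 4096-byte blocks, a 24-byte metadata header and 16-byte records, it would be (4096 - 24) / 16 = 254:

	#include <stdio.h>

	int main(void)
	{
		unsigned int qc_per_block = (4096 - 24) / 16;	/* 254, illustrative */
		unsigned int slot = 300;

		printf("block %u offset %u\n",
		       slot / qc_per_block, slot % qc_per_block);	/* block 1, offset 46 */
		return 0;
	}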
294static void bh_put(struct gfs2_quota_data *qd)
295{
296 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
297
298 mutex_lock(&sdp->sd_quota_mutex);
299 gfs2_assert(sdp, qd->qd_bh_count);
300 if (!--qd->qd_bh_count) {
301 brelse(qd->qd_bh);
302 qd->qd_bh = NULL;
303 qd->qd_bh_qc = NULL;
304 }
305 mutex_unlock(&sdp->sd_quota_mutex);
306}
307
308static int qd_fish(struct gfs2_sbd *sdp, struct gfs2_quota_data **qdp)
309{
310 struct gfs2_quota_data *qd = NULL;
311 int error;
312 int found = 0;
313
314 *qdp = NULL;
315
316 if (sdp->sd_vfs->s_flags & MS_RDONLY)
317 return 0;
318
319 spin_lock(&sdp->sd_quota_spin);
320
321 list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) {
322 if (test_bit(QDF_LOCKED, &qd->qd_flags) ||
323 !test_bit(QDF_CHANGE, &qd->qd_flags) ||
324 qd->qd_sync_gen >= sdp->sd_quota_sync_gen)
325 continue;
326
327 list_move_tail(&qd->qd_list, &sdp->sd_quota_list);
328
329 set_bit(QDF_LOCKED, &qd->qd_flags);
330 gfs2_assert_warn(sdp, qd->qd_count);
331 qd->qd_count++;
332 qd->qd_change_sync = qd->qd_change;
333 gfs2_assert_warn(sdp, qd->qd_slot_count);
334 qd->qd_slot_count++;
335 found = 1;
336
337 break;
338 }
339
340 if (!found)
341 qd = NULL;
342
343 spin_unlock(&sdp->sd_quota_spin);
344
345 if (qd) {
346 gfs2_assert_warn(sdp, qd->qd_change_sync);
347 error = bh_get(qd);
348 if (error) {
349 clear_bit(QDF_LOCKED, &qd->qd_flags);
350 slot_put(qd);
351 qd_put(qd);
352 return error;
353 }
354 }
355
356 *qdp = qd;
357
358 return 0;
359}
360
361static int qd_trylock(struct gfs2_quota_data *qd)
362{
363 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
364
365 if (sdp->sd_vfs->s_flags & MS_RDONLY)
366 return 0;
367
368 spin_lock(&sdp->sd_quota_spin);
369
370 if (test_bit(QDF_LOCKED, &qd->qd_flags) ||
371 !test_bit(QDF_CHANGE, &qd->qd_flags)) {
372 spin_unlock(&sdp->sd_quota_spin);
373 return 0;
374 }
375
376 list_move_tail(&qd->qd_list, &sdp->sd_quota_list);
377
378 set_bit(QDF_LOCKED, &qd->qd_flags);
379 gfs2_assert_warn(sdp, qd->qd_count);
380 qd->qd_count++;
381 qd->qd_change_sync = qd->qd_change;
382 gfs2_assert_warn(sdp, qd->qd_slot_count);
383 qd->qd_slot_count++;
384
385 spin_unlock(&sdp->sd_quota_spin);
386
387 gfs2_assert_warn(sdp, qd->qd_change_sync);
388 if (bh_get(qd)) {
389 clear_bit(QDF_LOCKED, &qd->qd_flags);
390 slot_put(qd);
391 qd_put(qd);
392 return 0;
393 }
394
395 return 1;
396}
397
398static void qd_unlock(struct gfs2_quota_data *qd)
399{
400 gfs2_assert_warn(qd->qd_gl->gl_sbd,
401 test_bit(QDF_LOCKED, &qd->qd_flags));
402 clear_bit(QDF_LOCKED, &qd->qd_flags);
403 bh_put(qd);
404 slot_put(qd);
405 qd_put(qd);
406}
407
408static int qdsb_get(struct gfs2_sbd *sdp, int user, u32 id, int create,
409 struct gfs2_quota_data **qdp)
410{
411 int error;
412
413 error = qd_get(sdp, user, id, create, qdp);
414 if (error)
415 return error;
416
417 error = slot_get(*qdp);
418 if (error)
419 goto fail;
420
421 error = bh_get(*qdp);
422 if (error)
423 goto fail_slot;
424
425 return 0;
426
427fail_slot:
428 slot_put(*qdp);
429fail:
430 qd_put(*qdp);
431 return error;
432}
433
434static void qdsb_put(struct gfs2_quota_data *qd)
435{
436 bh_put(qd);
437 slot_put(qd);
438 qd_put(qd);
439}
440
441int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid)
442{
443 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
444 struct gfs2_alloc *al = &ip->i_alloc;
445 struct gfs2_quota_data **qd = al->al_qd;
446 int error;
447
448 if (gfs2_assert_warn(sdp, !al->al_qd_num) ||
449 gfs2_assert_warn(sdp, !test_bit(GIF_QD_LOCKED, &ip->i_flags)))
450 return -EIO;
451
452 if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF)
453 return 0;
454
455 error = qdsb_get(sdp, QUOTA_USER, ip->i_di.di_uid, CREATE, qd);
456 if (error)
457 goto out;
458 al->al_qd_num++;
459 qd++;
460
461 error = qdsb_get(sdp, QUOTA_GROUP, ip->i_di.di_gid, CREATE, qd);
462 if (error)
463 goto out;
464 al->al_qd_num++;
465 qd++;
466
467 if (uid != NO_QUOTA_CHANGE && uid != ip->i_di.di_uid) {
468 error = qdsb_get(sdp, QUOTA_USER, uid, CREATE, qd);
469 if (error)
470 goto out;
471 al->al_qd_num++;
472 qd++;
473 }
474
475 if (gid != NO_QUOTA_CHANGE && gid != ip->i_di.di_gid) {
476 error = qdsb_get(sdp, QUOTA_GROUP, gid, CREATE, qd);
477 if (error)
478 goto out;
479 al->al_qd_num++;
480 qd++;
481 }
482
483out:
484 if (error)
485 gfs2_quota_unhold(ip);
486 return error;
487}
488
489void gfs2_quota_unhold(struct gfs2_inode *ip)
490{
491 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
492 struct gfs2_alloc *al = &ip->i_alloc;
493 unsigned int x;
494
495 gfs2_assert_warn(sdp, !test_bit(GIF_QD_LOCKED, &ip->i_flags));
496
497 for (x = 0; x < al->al_qd_num; x++) {
498 qdsb_put(al->al_qd[x]);
499 al->al_qd[x] = NULL;
500 }
501 al->al_qd_num = 0;
502}
503
504static int sort_qd(const void *a, const void *b)
505{
506 const struct gfs2_quota_data *qd_a = *(const struct gfs2_quota_data **)a;
507 const struct gfs2_quota_data *qd_b = *(const struct gfs2_quota_data **)b;
508
509 if (!test_bit(QDF_USER, &qd_a->qd_flags) !=
510 !test_bit(QDF_USER, &qd_b->qd_flags)) {
511 if (test_bit(QDF_USER, &qd_a->qd_flags))
512 return -1;
513 else
514 return 1;
515 }
516 if (qd_a->qd_id < qd_b->qd_id)
517 return -1;
518 if (qd_a->qd_id > qd_b->qd_id)
519 return 1;
520
521 return 0;
522}
523
524static void do_qc(struct gfs2_quota_data *qd, s64 change)
525{
526 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
527 struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode);
528 struct gfs2_quota_change *qc = qd->qd_bh_qc;
529 s64 x;
530
531 mutex_lock(&sdp->sd_quota_mutex);
532 gfs2_trans_add_bh(ip->i_gl, qd->qd_bh, 1);
533
534 if (!test_bit(QDF_CHANGE, &qd->qd_flags)) {
535 qc->qc_change = 0;
536 qc->qc_flags = 0;
537 if (test_bit(QDF_USER, &qd->qd_flags))
538 qc->qc_flags = cpu_to_be32(GFS2_QCF_USER);
539 qc->qc_id = cpu_to_be32(qd->qd_id);
540 }
541
542 x = qc->qc_change;
543 x = be64_to_cpu(x) + change;
544 qc->qc_change = cpu_to_be64(x);
545
546 spin_lock(&sdp->sd_quota_spin);
547 qd->qd_change = x;
548 spin_unlock(&sdp->sd_quota_spin);
549
550 if (!x) {
551 gfs2_assert_warn(sdp, test_bit(QDF_CHANGE, &qd->qd_flags));
552 clear_bit(QDF_CHANGE, &qd->qd_flags);
553 qc->qc_flags = 0;
554 qc->qc_id = 0;
555 slot_put(qd);
556 qd_put(qd);
557 } else if (!test_and_set_bit(QDF_CHANGE, &qd->qd_flags)) {
558 qd_hold(qd);
559 slot_hold(qd);
560 }
561
562 mutex_unlock(&sdp->sd_quota_mutex);
563}
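do_qc() above keeps the object's lifetime in lockstep with its on-disk change counter: the first transition away from zero sets QDF_CHANGE and takes an extra reference plus a slot hold, and the transition back to zero drops both, so a gfs2_quota_data stays pinned exactly while it carries an unsynced change. A loose sketch of that pin-while-dirty pattern; the struct and helper below are hypothetical, not GFS2 code.

/* Pin an object while its pending-change counter is non-zero; the
 * transitions here mirror the hold/put pairing in do_qc(). Locking
 * is omitted for brevity. */
struct pinned_counter {
        long long pending;      /* accumulated unsynced change */
        int refs;               /* holds pinning the object */
};

static void apply_change(struct pinned_counter *pc, long long change)
{
        long long old = pc->pending;

        pc->pending += change;
        if (old == 0 && pc->pending != 0)
                pc->refs++;             /* became dirty: pin */
        else if (old != 0 && pc->pending == 0)
                pc->refs--;             /* clean again: unpin */
}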
564
565/**
566 * gfs2_adjust_quota
567 *
568 * This function was mostly borrowed from gfs2_block_truncate_page which was
569 * in turn mostly borrowed from ext3
570 */
571static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
572 s64 change, struct gfs2_quota_data *qd)
573{
574 struct inode *inode = &ip->i_inode;
575 struct address_space *mapping = inode->i_mapping;
576 unsigned long index = loc >> PAGE_CACHE_SHIFT;
577 unsigned offset = loc & (PAGE_CACHE_SIZE - 1);
578 unsigned blocksize, iblock, pos;
579 struct buffer_head *bh;
580 struct page *page;
581 void *kaddr;
582 __be64 *ptr;
583 s64 value;
584 int err = -EIO;
585
586 page = grab_cache_page(mapping, index);
587 if (!page)
588 return -ENOMEM;
589
590 blocksize = inode->i_sb->s_blocksize;
591 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
592
593 if (!page_has_buffers(page))
594 create_empty_buffers(page, blocksize, 0);
595
596 bh = page_buffers(page);
597 pos = blocksize;
598 while (offset >= pos) {
599 bh = bh->b_this_page;
600 iblock++;
601 pos += blocksize;
602 }
603
604 if (!buffer_mapped(bh)) {
605 gfs2_get_block(inode, iblock, bh, 1);
606 if (!buffer_mapped(bh))
607 goto unlock;
608 }
609
610 if (PageUptodate(page))
611 set_buffer_uptodate(bh);
612
613 if (!buffer_uptodate(bh)) {
614 ll_rw_block(READ_META, 1, &bh);
615 wait_on_buffer(bh);
616 if (!buffer_uptodate(bh))
617 goto unlock;
618 }
619
620 gfs2_trans_add_bh(ip->i_gl, bh, 0);
621
622 kaddr = kmap_atomic(page, KM_USER0);
623 ptr = kaddr + offset;
624 value = (s64)be64_to_cpu(*ptr) + change;
625 *ptr = cpu_to_be64(value);
626 flush_dcache_page(page);
627 kunmap_atomic(kaddr, KM_USER0);
628 err = 0;
629 qd->qd_qb.qb_magic = cpu_to_be32(GFS2_MAGIC);
630 qd->qd_qb.qb_value = cpu_to_be64(value);
631unlock:
632 unlock_page(page);
633 page_cache_release(page);
634 return err;
635}
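The page arithmetic in gfs2_adjust_quota() splits the quota-file offset into a page index (loc >> PAGE_CACHE_SHIFT) and an in-page byte offset (loc & (PAGE_CACHE_SIZE - 1)), then walks the page's buffer ring to the buffer covering that offset. A stand-alone sketch of the same math, assuming 4096-byte pages and 1024-byte blocks; every name here is hypothetical. The kernel function finds iblock incrementally while walking b_this_page, which is equivalent to the closed form below.

#include <stdio.h>

int main(void)
{
        const unsigned int page_shift = 12;     /* 4096-byte pages */
        const unsigned int block_bits = 10;     /* 1024-byte blocks */
        unsigned long long loc = 0x3456;        /* example file offset */

        unsigned long index = loc >> page_shift;                /* page 3 */
        unsigned int offset = loc & ((1u << page_shift) - 1);   /* 0x456 */
        unsigned long iblock = (index << (page_shift - block_bits)) +
                               (offset >> block_bits);          /* block 13 */

        printf("page %lu, in-page offset 0x%x, logical block %lu\n",
               index, offset, iblock);
        return 0;
}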
636
637static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
638{
639 struct gfs2_sbd *sdp = (*qda)->qd_gl->gl_sbd;
640 struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode);
641 unsigned int data_blocks, ind_blocks;
642 struct gfs2_holder *ghs, i_gh;
643 unsigned int qx, x;
644 struct gfs2_quota_data *qd;
645 loff_t offset;
646 unsigned int nalloc = 0;
647 struct gfs2_alloc *al = NULL;
648 int error;
649
650 gfs2_write_calc_reserv(ip, sizeof(struct gfs2_quota),
651 &data_blocks, &ind_blocks);
652
653 ghs = kcalloc(num_qd, sizeof(struct gfs2_holder), GFP_KERNEL);
654 if (!ghs)
655 return -ENOMEM;
656
657 sort(qda, num_qd, sizeof(struct gfs2_quota_data *), sort_qd, NULL);
658 for (qx = 0; qx < num_qd; qx++) {
659 error = gfs2_glock_nq_init(qda[qx]->qd_gl,
660 LM_ST_EXCLUSIVE,
661 GL_NOCACHE, &ghs[qx]);
662 if (error)
663 goto out;
664 }
665
666 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
667 if (error)
668 goto out;
669
670 for (x = 0; x < num_qd; x++) {
671 int alloc_required;
672
673 offset = qd2offset(qda[x]);
674 error = gfs2_write_alloc_required(ip, offset,
675 sizeof(struct gfs2_quota),
676 &alloc_required);
677 if (error)
678 goto out_gunlock;
679 if (alloc_required)
680 nalloc++;
681 }
682
683 if (nalloc) {
684 al = gfs2_alloc_get(ip);
685
686 al->al_requested = nalloc * (data_blocks + ind_blocks);
687
688 error = gfs2_inplace_reserve(ip);
689 if (error)
690 goto out_alloc;
691
692 error = gfs2_trans_begin(sdp,
693 al->al_rgd->rd_ri.ri_length +
694 num_qd * data_blocks +
695 nalloc * ind_blocks +
696 RES_DINODE + num_qd +
697 RES_STATFS, 0);
698 if (error)
699 goto out_ipres;
700 } else {
701 error = gfs2_trans_begin(sdp,
702 num_qd * data_blocks +
703 RES_DINODE + num_qd, 0);
704 if (error)
705 goto out_gunlock;
706 }
707
708 for (x = 0; x < num_qd; x++) {
709 qd = qda[x];
710 offset = qd2offset(qd);
711 error = gfs2_adjust_quota(ip, offset, qd->qd_change_sync,
712 (struct gfs2_quota_data *)
713 qd->qd_gl->gl_lvb);
714 if (error)
715 goto out_end_trans;
716
717 do_qc(qd, -qd->qd_change_sync);
718 }
719
720 error = 0;
721
722out_end_trans:
723 gfs2_trans_end(sdp);
724out_ipres:
725 if (nalloc)
726 gfs2_inplace_release(ip);
727out_alloc:
728 if (nalloc)
729 gfs2_alloc_put(ip);
730out_gunlock:
731 gfs2_glock_dq_uninit(&i_gh);
732out:
733 while (qx--)
734 gfs2_glock_dq_uninit(&ghs[qx]);
735 kfree(ghs);
736 gfs2_log_flush(ip->i_gl->gl_sbd, ip->i_gl);
737 return error;
738}
739
740static int do_glock(struct gfs2_quota_data *qd, int force_refresh,
741 struct gfs2_holder *q_gh)
742{
743 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
744 struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode);
745 struct gfs2_holder i_gh;
746 struct gfs2_quota q;
747 char buf[sizeof(struct gfs2_quota)];
748 struct file_ra_state ra_state;
749 int error;
750 struct gfs2_quota_lvb *qlvb;
751
752 file_ra_state_init(&ra_state, sdp->sd_quota_inode->i_mapping);
753restart:
754 error = gfs2_glock_nq_init(qd->qd_gl, LM_ST_SHARED, 0, q_gh);
755 if (error)
756 return error;
757
758 qd->qd_qb = *(struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb;
759
760 if (force_refresh || qd->qd_qb.qb_magic != cpu_to_be32(GFS2_MAGIC)) {
761 loff_t pos;
762 gfs2_glock_dq_uninit(q_gh);
763 error = gfs2_glock_nq_init(qd->qd_gl,
764 LM_ST_EXCLUSIVE, GL_NOCACHE,
765 q_gh);
766 if (error)
767 return error;
768
769 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh);
770 if (error)
771 goto fail;
772
773 memset(buf, 0, sizeof(struct gfs2_quota));
774 pos = qd2offset(qd);
775 error = gfs2_internal_read(ip, &ra_state, buf,
776 &pos, sizeof(struct gfs2_quota));
777 if (error < 0)
778 goto fail_gunlock;
779
780 gfs2_glock_dq_uninit(&i_gh);
781
782
783 gfs2_quota_in(&q, buf);
784 qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb;
785 qlvb->qb_magic = cpu_to_be32(GFS2_MAGIC);
786 qlvb->__pad = 0;
787 qlvb->qb_limit = cpu_to_be64(q.qu_limit);
788 qlvb->qb_warn = cpu_to_be64(q.qu_warn);
789 qlvb->qb_value = cpu_to_be64(q.qu_value);
790 qd->qd_qb = *qlvb;
791
792 if (gfs2_glock_is_blocking(qd->qd_gl)) {
793 gfs2_glock_dq_uninit(q_gh);
794 force_refresh = 0;
795 goto restart;
796 }
797 }
798
799 return 0;
800
801fail_gunlock:
802 gfs2_glock_dq_uninit(&i_gh);
803fail:
804 gfs2_glock_dq_uninit(q_gh);
805 return error;
806}
807
808int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid)
809{
810 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
811 struct gfs2_alloc *al = &ip->i_alloc;
812 unsigned int x;
813 int error = 0;
814
815 gfs2_quota_hold(ip, uid, gid);
816
817 if (capable(CAP_SYS_RESOURCE) ||
818 sdp->sd_args.ar_quota != GFS2_QUOTA_ON)
819 return 0;
820
821 sort(al->al_qd, al->al_qd_num, sizeof(struct gfs2_quota_data *),
822 sort_qd, NULL);
823
824 for (x = 0; x < al->al_qd_num; x++) {
825 error = do_glock(al->al_qd[x], NO_FORCE, &al->al_qd_ghs[x]);
826 if (error)
827 break;
828 }
829
830 if (!error)
831 set_bit(GIF_QD_LOCKED, &ip->i_flags);
832 else {
833 while (x--)
834 gfs2_glock_dq_uninit(&al->al_qd_ghs[x]);
835 gfs2_quota_unhold(ip);
836 }
837
838 return error;
839}
840
841static int need_sync(struct gfs2_quota_data *qd)
842{
843 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
844 struct gfs2_tune *gt = &sdp->sd_tune;
845 s64 value;
846 unsigned int num, den;
847 int do_sync = 1;
848
849 if (!qd->qd_qb.qb_limit)
850 return 0;
851
852 spin_lock(&sdp->sd_quota_spin);
853 value = qd->qd_change;
854 spin_unlock(&sdp->sd_quota_spin);
855
856 spin_lock(&gt->gt_spin);
857 num = gt->gt_quota_scale_num;
858 den = gt->gt_quota_scale_den;
859 spin_unlock(&gt->gt_spin);
860
861 if (value < 0)
862 do_sync = 0;
863 else if ((s64)be64_to_cpu(qd->qd_qb.qb_value) >=
864 (s64)be64_to_cpu(qd->qd_qb.qb_limit))
865 do_sync = 0;
866 else {
867 value *= gfs2_jindex_size(sdp) * num;
868 do_div(value, den);
869 value += (s64)be64_to_cpu(qd->qd_qb.qb_value);
870 if (value < (s64)be64_to_cpu(qd->qd_qb.qb_limit))
871 do_sync = 0;
872 }
873
874 return do_sync;
875}
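need_sync() above is a heuristic, not an exact test: a negative local delta never forces a sync, a quota already at or past its limit gains nothing from syncing, and otherwise the local delta is scaled by the journal count (every node may be accumulating a similar unsynced change) and by the gt_quota_scale_num/den tunable before being compared with the limit. A worked example with made-up numbers:

/* Illustrative numbers only: 100 blocks changed locally, 4 journals,
 * scale 1/1, last synced value 9700 against a limit of 10000.
 * Projected usage 100 * 4 * 1 / 1 + 9700 = 10100 >= 10000, so the
 * change is worth pushing to the master quota file now. */
static int need_sync_example(void)
{
        long long local_change = 100;   /* qd->qd_change */
        long long journals = 4;         /* gfs2_jindex_size(sdp) */
        long long num = 1, den = 1;     /* gt_quota_scale_num/_den */
        long long lvb_value = 9700;     /* be64_to_cpu(qb_value) */
        long long limit = 10000;        /* be64_to_cpu(qb_limit) */

        long long projected = local_change * journals * num / den + lvb_value;

        return projected >= limit;      /* 1: sync now */
}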
876
877void gfs2_quota_unlock(struct gfs2_inode *ip)
878{
879 struct gfs2_alloc *al = &ip->i_alloc;
880 struct gfs2_quota_data *qda[4];
881 unsigned int count = 0;
882 unsigned int x;
883
884 if (!test_and_clear_bit(GIF_QD_LOCKED, &ip->i_flags))
885 goto out;
886
887 for (x = 0; x < al->al_qd_num; x++) {
888 struct gfs2_quota_data *qd;
889 int sync;
890
891 qd = al->al_qd[x];
892 sync = need_sync(qd);
893
894 gfs2_glock_dq_uninit(&al->al_qd_ghs[x]);
895
896 if (sync && qd_trylock(qd))
897 qda[count++] = qd;
898 }
899
900 if (count) {
901 do_sync(count, qda);
902 for (x = 0; x < count; x++)
903 qd_unlock(qda[x]);
904 }
905
906out:
907 gfs2_quota_unhold(ip);
908}
909
910#define MAX_LINE 256
911
912static int print_message(struct gfs2_quota_data *qd, char *type)
913{
914 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
915
916 printk(KERN_INFO "GFS2: fsid=%s: quota %s for %s %u\n",
917 sdp->sd_fsname, type,
918 (test_bit(QDF_USER, &qd->qd_flags)) ? "user" : "group",
919 qd->qd_id);
920
921 return 0;
922}
923
924int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid)
925{
926 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
927 struct gfs2_alloc *al = &ip->i_alloc;
928 struct gfs2_quota_data *qd;
929 s64 value;
930 unsigned int x;
931 int error = 0;
932
933 if (!test_bit(GIF_QD_LOCKED, &ip->i_flags))
934 return 0;
935
936 if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON)
937 return 0;
938
939 for (x = 0; x < al->al_qd_num; x++) {
940 qd = al->al_qd[x];
941
942 if (!((qd->qd_id == uid && test_bit(QDF_USER, &qd->qd_flags)) ||
943 (qd->qd_id == gid && !test_bit(QDF_USER, &qd->qd_flags))))
944 continue;
945
946 value = (s64)be64_to_cpu(qd->qd_qb.qb_value);
947 spin_lock(&sdp->sd_quota_spin);
948 value += qd->qd_change;
949 spin_unlock(&sdp->sd_quota_spin);
950
951 if (be64_to_cpu(qd->qd_qb.qb_limit) && (s64)be64_to_cpu(qd->qd_qb.qb_limit) < value) {
952 print_message(qd, "exceeded");
953 error = -EDQUOT;
954 break;
955 } else if (be64_to_cpu(qd->qd_qb.qb_warn) &&
956 (s64)be64_to_cpu(qd->qd_qb.qb_warn) < value &&
957 time_after_eq(jiffies, qd->qd_last_warn +
958 gfs2_tune_get(sdp,
959 gt_quota_warn_period) * HZ)) {
960 error = print_message(qd, "warning");
961 qd->qd_last_warn = jiffies;
962 }
963 }
964
965 return error;
966}
967
968void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
969 u32 uid, u32 gid)
970{
971 struct gfs2_alloc *al = &ip->i_alloc;
972 struct gfs2_quota_data *qd;
973 unsigned int x;
974 unsigned int found = 0;
975
976 if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), change))
977 return;
978 if (ip->i_di.di_flags & GFS2_DIF_SYSTEM)
979 return;
980
981 for (x = 0; x < al->al_qd_num; x++) {
982 qd = al->al_qd[x];
983
984 if ((qd->qd_id == uid && test_bit(QDF_USER, &qd->qd_flags)) ||
985 (qd->qd_id == gid && !test_bit(QDF_USER, &qd->qd_flags))) {
986 do_qc(qd, change);
987 found++;
988 }
989 }
990}
991
992int gfs2_quota_sync(struct gfs2_sbd *sdp)
993{
994 struct gfs2_quota_data **qda;
995 unsigned int max_qd = gfs2_tune_get(sdp, gt_quota_simul_sync);
996 unsigned int num_qd;
997 unsigned int x;
998 int error = 0;
999
1000 sdp->sd_quota_sync_gen++;
1001
1002 qda = kcalloc(max_qd, sizeof(struct gfs2_quota_data *), GFP_KERNEL);
1003 if (!qda)
1004 return -ENOMEM;
1005
1006 do {
1007 num_qd = 0;
1008
1009 for (;;) {
1010 error = qd_fish(sdp, qda + num_qd);
1011 if (error || !qda[num_qd])
1012 break;
1013 if (++num_qd == max_qd)
1014 break;
1015 }
1016
1017 if (num_qd) {
1018 if (!error)
1019 error = do_sync(num_qd, qda);
1020 if (!error)
1021 for (x = 0; x < num_qd; x++)
1022 qda[x]->qd_sync_gen =
1023 sdp->sd_quota_sync_gen;
1024
1025 for (x = 0; x < num_qd; x++)
1026 qd_unlock(qda[x]);
1027 }
1028 } while (!error && num_qd == max_qd);
1029
1030 kfree(qda);
1031
1032 return error;
1033}
1034
1035int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id)
1036{
1037 struct gfs2_quota_data *qd;
1038 struct gfs2_holder q_gh;
1039 int error;
1040
1041 error = qd_get(sdp, user, id, CREATE, &qd);
1042 if (error)
1043 return error;
1044
1045 error = do_glock(qd, FORCE, &q_gh);
1046 if (!error)
1047 gfs2_glock_dq_uninit(&q_gh);
1048
1049 qd_put(qd);
1050
1051 return error;
1052}
1053
1054int gfs2_quota_init(struct gfs2_sbd *sdp)
1055{
1056 struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode);
1057 unsigned int blocks = ip->i_di.di_size >> sdp->sd_sb.sb_bsize_shift;
1058 unsigned int x, slot = 0;
1059 unsigned int found = 0;
1060 u64 dblock;
1061 u32 extlen = 0;
1062 int error;
1063
1064 if (!ip->i_di.di_size || ip->i_di.di_size > (64 << 20) ||
1065 ip->i_di.di_size & (sdp->sd_sb.sb_bsize - 1)) {
1066 gfs2_consist_inode(ip);
1067 return -EIO;
1068 }
1069 sdp->sd_quota_slots = blocks * sdp->sd_qc_per_block;
1070 sdp->sd_quota_chunks = DIV_ROUND_UP(sdp->sd_quota_slots, 8 * PAGE_SIZE);
1071
1072 error = -ENOMEM;
1073
1074 sdp->sd_quota_bitmap = kcalloc(sdp->sd_quota_chunks,
1075 sizeof(unsigned char *), GFP_KERNEL);
1076 if (!sdp->sd_quota_bitmap)
1077 return error;
1078
1079 for (x = 0; x < sdp->sd_quota_chunks; x++) {
1080 sdp->sd_quota_bitmap[x] = kzalloc(PAGE_SIZE, GFP_KERNEL);
1081 if (!sdp->sd_quota_bitmap[x])
1082 goto fail;
1083 }
1084
1085 for (x = 0; x < blocks; x++) {
1086 struct buffer_head *bh;
1087 unsigned int y;
1088
1089 if (!extlen) {
1090 int new = 0;
1091 error = gfs2_extent_map(&ip->i_inode, x, &new, &dblock, &extlen);
1092 if (error)
1093 goto fail;
1094 }
1095 error = -EIO;
1096 bh = gfs2_meta_ra(ip->i_gl, dblock, extlen);
1097 if (!bh)
1098 goto fail;
1099 if (gfs2_metatype_check(sdp, bh, GFS2_METATYPE_QC)) {
1100 brelse(bh);
1101 goto fail;
1102 }
1103
1104 for (y = 0; y < sdp->sd_qc_per_block && slot < sdp->sd_quota_slots;
1105 y++, slot++) {
1106 struct gfs2_quota_change qc;
1107 struct gfs2_quota_data *qd;
1108
1109 gfs2_quota_change_in(&qc, bh->b_data +
1110 sizeof(struct gfs2_meta_header) +
1111 y * sizeof(struct gfs2_quota_change));
1112 if (!qc.qc_change)
1113 continue;
1114
1115 error = qd_alloc(sdp, (qc.qc_flags & GFS2_QCF_USER),
1116 qc.qc_id, &qd);
1117 if (error) {
1118 brelse(bh);
1119 goto fail;
1120 }
1121
1122 set_bit(QDF_CHANGE, &qd->qd_flags);
1123 qd->qd_change = qc.qc_change;
1124 qd->qd_slot = slot;
1125 qd->qd_slot_count = 1;
1126 qd->qd_last_touched = jiffies;
1127
1128 spin_lock(&sdp->sd_quota_spin);
1129 gfs2_icbit_munge(sdp, sdp->sd_quota_bitmap, slot, 1);
1130 list_add(&qd->qd_list, &sdp->sd_quota_list);
1131 atomic_inc(&sdp->sd_quota_count);
1132 spin_unlock(&sdp->sd_quota_spin);
1133
1134 found++;
1135 }
1136
1137 brelse(bh);
1138 dblock++;
1139 extlen--;
1140 }
1141
1142 if (found)
1143 fs_info(sdp, "found %u quota changes\n", found);
1144
1145 return 0;
1146
1147fail:
1148 gfs2_quota_cleanup(sdp);
1149 return error;
1150}
1151
1152void gfs2_quota_scan(struct gfs2_sbd *sdp)
1153{
1154 struct gfs2_quota_data *qd, *safe;
1155 LIST_HEAD(dead);
1156
1157 spin_lock(&sdp->sd_quota_spin);
1158 list_for_each_entry_safe(qd, safe, &sdp->sd_quota_list, qd_list) {
1159 if (!qd->qd_count &&
1160 time_after_eq(jiffies, qd->qd_last_touched +
1161 gfs2_tune_get(sdp, gt_quota_cache_secs) * HZ)) {
1162 list_move(&qd->qd_list, &dead);
1163 gfs2_assert_warn(sdp,
1164 atomic_read(&sdp->sd_quota_count) > 0);
1165 atomic_dec(&sdp->sd_quota_count);
1166 }
1167 }
1168 spin_unlock(&sdp->sd_quota_spin);
1169
1170 while (!list_empty(&dead)) {
1171 qd = list_entry(dead.next, struct gfs2_quota_data, qd_list);
1172 list_del(&qd->qd_list);
1173
1174 gfs2_assert_warn(sdp, !qd->qd_change);
1175 gfs2_assert_warn(sdp, !qd->qd_slot_count);
1176 gfs2_assert_warn(sdp, !qd->qd_bh_count);
1177
1178 gfs2_lvb_unhold(qd->qd_gl);
1179 kfree(qd);
1180 }
1181}
1182
1183void gfs2_quota_cleanup(struct gfs2_sbd *sdp)
1184{
1185 struct list_head *head = &sdp->sd_quota_list;
1186 struct gfs2_quota_data *qd;
1187 unsigned int x;
1188
1189 spin_lock(&sdp->sd_quota_spin);
1190 while (!list_empty(head)) {
1191 qd = list_entry(head->prev, struct gfs2_quota_data, qd_list);
1192
1193 if (qd->qd_count > 1 ||
1194 (qd->qd_count && !test_bit(QDF_CHANGE, &qd->qd_flags))) {
1195 list_move(&qd->qd_list, head);
1196 spin_unlock(&sdp->sd_quota_spin);
1197 schedule();
1198 spin_lock(&sdp->sd_quota_spin);
1199 continue;
1200 }
1201
1202 list_del(&qd->qd_list);
1203 atomic_dec(&sdp->sd_quota_count);
1204 spin_unlock(&sdp->sd_quota_spin);
1205
1206 if (!qd->qd_count) {
1207 gfs2_assert_warn(sdp, !qd->qd_change);
1208 gfs2_assert_warn(sdp, !qd->qd_slot_count);
1209 } else
1210 gfs2_assert_warn(sdp, qd->qd_slot_count == 1);
1211 gfs2_assert_warn(sdp, !qd->qd_bh_count);
1212
1213 gfs2_lvb_unhold(qd->qd_gl);
1214 kfree(qd);
1215
1216 spin_lock(&sdp->sd_quota_spin);
1217 }
1218 spin_unlock(&sdp->sd_quota_spin);
1219
1220 gfs2_assert_warn(sdp, !atomic_read(&sdp->sd_quota_count));
1221
1222 if (sdp->sd_quota_bitmap) {
1223 for (x = 0; x < sdp->sd_quota_chunks; x++)
1224 kfree(sdp->sd_quota_bitmap[x]);
1225 kfree(sdp->sd_quota_bitmap);
1226 }
1227}
1228
diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
new file mode 100644
index 000000000000..a8be1417051f
--- /dev/null
+++ b/fs/gfs2/quota.h
@@ -0,0 +1,35 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __QUOTA_DOT_H__
11#define __QUOTA_DOT_H__
12
13struct gfs2_inode;
14struct gfs2_sbd;
15
16#define NO_QUOTA_CHANGE ((u32)-1)
17
18int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid);
19void gfs2_quota_unhold(struct gfs2_inode *ip);
20
21int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid);
22void gfs2_quota_unlock(struct gfs2_inode *ip);
23
24int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid);
25void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
26 u32 uid, u32 gid);
27
28int gfs2_quota_sync(struct gfs2_sbd *sdp);
29int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id);
30
31int gfs2_quota_init(struct gfs2_sbd *sdp);
32void gfs2_quota_scan(struct gfs2_sbd *sdp);
33void gfs2_quota_cleanup(struct gfs2_sbd *sdp);
34
35#endif /* __QUOTA_DOT_H__ */
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
new file mode 100644
index 000000000000..62cd223819b7
--- /dev/null
+++ b/fs/gfs2/recovery.c
@@ -0,0 +1,571 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/gfs2_ondisk.h>
16#include <linux/crc32.h>
17#include <linux/lm_interface.h>
18
19#include "gfs2.h"
20#include "incore.h"
21#include "bmap.h"
22#include "glock.h"
23#include "glops.h"
24#include "lm.h"
25#include "lops.h"
26#include "meta_io.h"
27#include "recovery.h"
28#include "super.h"
29#include "util.h"
30#include "dir.h"
31
32int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk,
33 struct buffer_head **bh)
34{
35 struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
36 struct gfs2_glock *gl = ip->i_gl;
37 int new = 0;
38 u64 dblock;
39 u32 extlen;
40 int error;
41
42 error = gfs2_extent_map(&ip->i_inode, blk, &new, &dblock, &extlen);
43 if (error)
44 return error;
45 if (!dblock) {
46 gfs2_consist_inode(ip);
47 return -EIO;
48 }
49
50 *bh = gfs2_meta_ra(gl, dblock, extlen);
51
52 return error;
53}
54
55int gfs2_revoke_add(struct gfs2_sbd *sdp, u64 blkno, unsigned int where)
56{
57 struct list_head *head = &sdp->sd_revoke_list;
58 struct gfs2_revoke_replay *rr;
59 int found = 0;
60
61 list_for_each_entry(rr, head, rr_list) {
62 if (rr->rr_blkno == blkno) {
63 found = 1;
64 break;
65 }
66 }
67
68 if (found) {
69 rr->rr_where = where;
70 return 0;
71 }
72
73 rr = kmalloc(sizeof(struct gfs2_revoke_replay), GFP_KERNEL);
74 if (!rr)
75 return -ENOMEM;
76
77 rr->rr_blkno = blkno;
78 rr->rr_where = where;
79 list_add(&rr->rr_list, head);
80
81 return 1;
82}
83
84int gfs2_revoke_check(struct gfs2_sbd *sdp, u64 blkno, unsigned int where)
85{
86 struct gfs2_revoke_replay *rr;
87 int wrap, a, b, revoke;
88 int found = 0;
89
90 list_for_each_entry(rr, &sdp->sd_revoke_list, rr_list) {
91 if (rr->rr_blkno == blkno) {
92 found = 1;
93 break;
94 }
95 }
96
97 if (!found)
98 return 0;
99
100 wrap = (rr->rr_where < sdp->sd_replay_tail);
101 a = (sdp->sd_replay_tail < where);
102 b = (where < rr->rr_where);
103 revoke = (wrap) ? (a || b) : (a && b);
104
105 return revoke;
106}
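The wrap/a/b expression in gfs2_revoke_check() is the standard membership test for an interval on a ring: the block is revoked when its replay position `where` lies strictly between the replay tail and the recorded revoke position in circular log order, with the interval possibly wrapping past block 0. Restated as a stand-alone helper (hypothetical name):

/* Is `where` inside the circular interval (tail, rr_where) on a ring
 * of journal blocks? Without wrap the interval is contiguous (a && b);
 * with wrap it is the union of its two ends (a || b). */
static int in_circular_interval(unsigned int tail, unsigned int rr_where,
                                unsigned int where)
{
        int wrap = rr_where < tail;     /* interval crosses block 0 */
        int a = tail < where;
        int b = where < rr_where;

        return wrap ? (a || b) : (a && b);
}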
107
108void gfs2_revoke_clean(struct gfs2_sbd *sdp)
109{
110 struct list_head *head = &sdp->sd_revoke_list;
111 struct gfs2_revoke_replay *rr;
112
113 while (!list_empty(head)) {
114 rr = list_entry(head->next, struct gfs2_revoke_replay, rr_list);
115 list_del(&rr->rr_list);
116 kfree(rr);
117 }
118}
119
120/**
121 * get_log_header - read the log header for a given segment
122 * @jd: the journal
123 * @blk: the block to look at
124 * @lh: the log header to return
125 *
126 * Read the log header for a given segment in a given journal. Do a few
127 * sanity checks on it.
128 *
129 * Returns: 0 on success,
130 * 1 if the header was invalid or incomplete,
131 * errno on error
132 */
133
134static int get_log_header(struct gfs2_jdesc *jd, unsigned int blk,
135 struct gfs2_log_header *head)
136{
137 struct buffer_head *bh;
138 struct gfs2_log_header lh;
139 u32 hash;
140 int error;
141
142 error = gfs2_replay_read_block(jd, blk, &bh);
143 if (error)
144 return error;
145
146 memcpy(&lh, bh->b_data, sizeof(struct gfs2_log_header));
147 lh.lh_hash = 0;
148 hash = gfs2_disk_hash((char *)&lh, sizeof(struct gfs2_log_header));
149 gfs2_log_header_in(&lh, bh->b_data);
150
151 brelse(bh);
152
153 if (lh.lh_header.mh_magic != GFS2_MAGIC ||
154 lh.lh_header.mh_type != GFS2_METATYPE_LH ||
155 lh.lh_blkno != blk || lh.lh_hash != hash)
156 return 1;
157
158 *head = lh;
159
160 return 0;
161}
162
163/**
164 * find_good_lh - find a good log header
165 * @jd: the journal
166 * @blk: the segment to start searching from
167 * @lh: the log header to fill in
168 * @forward: if true search forward in the log, else search backward
169 *
170 * Call get_log_header() to get a log header for a segment, but if the
171 * segment is bad, either scan forward or backward until we find a good one.
172 *
173 * Returns: errno
174 */
175
176static int find_good_lh(struct gfs2_jdesc *jd, unsigned int *blk,
177 struct gfs2_log_header *head)
178{
179 unsigned int orig_blk = *blk;
180 int error;
181
182 for (;;) {
183 error = get_log_header(jd, *blk, head);
184 if (error <= 0)
185 return error;
186
187 if (++*blk == jd->jd_blocks)
188 *blk = 0;
189
190 if (*blk == orig_blk) {
191 gfs2_consist_inode(GFS2_I(jd->jd_inode));
192 return -EIO;
193 }
194 }
195}
196
197/**
198 * jhead_scan - make sure we've found the head of the log
199 * @jd: the journal
200 * @head: this is filled in with the log descriptor of the head
201 *
202 * At this point, seg and lh should be either the head of the log or just
203 * before. Scan forward until we find the head.
204 *
205 * Returns: errno
206 */
207
208static int jhead_scan(struct gfs2_jdesc *jd, struct gfs2_log_header *head)
209{
210 unsigned int blk = head->lh_blkno;
211 struct gfs2_log_header lh;
212 int error;
213
214 for (;;) {
215 if (++blk == jd->jd_blocks)
216 blk = 0;
217
218 error = get_log_header(jd, blk, &lh);
219 if (error < 0)
220 return error;
221 if (error == 1)
222 continue;
223
224 if (lh.lh_sequence == head->lh_sequence) {
225 gfs2_consist_inode(GFS2_I(jd->jd_inode));
226 return -EIO;
227 }
228 if (lh.lh_sequence < head->lh_sequence)
229 break;
230
231 *head = lh;
232 }
233
234 return 0;
235}
236
237/**
238 * gfs2_find_jhead - find the head of a log
239 * @jd: the journal
240 * @head: the log descriptor for the head of the log is returned here
241 *
242 * Do a binary search of a journal and find the valid log entry with the
243 * highest sequence number. (i.e. the log head)
244 *
245 * Returns: errno
246 */
247
248int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header *head)
249{
250 struct gfs2_log_header lh_1, lh_m;
251 u32 blk_1, blk_2, blk_m;
252 int error;
253
254 blk_1 = 0;
255 blk_2 = jd->jd_blocks - 1;
256
257 for (;;) {
258 blk_m = (blk_1 + blk_2) / 2;
259
260 error = find_good_lh(jd, &blk_1, &lh_1);
261 if (error)
262 return error;
263
264 error = find_good_lh(jd, &blk_m, &lh_m);
265 if (error)
266 return error;
267
268 if (blk_1 == blk_m || blk_m == blk_2)
269 break;
270
271 if (lh_1.lh_sequence <= lh_m.lh_sequence)
272 blk_1 = blk_m;
273 else
274 blk_2 = blk_m;
275 }
276
277 error = jhead_scan(jd, &lh_1);
278 if (error)
279 return error;
280
281 *head = lh_1;
282
283 return error;
284}
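Sequence numbers grow monotonically around the journal and fall back at the head, so the on-disk log looks like a rotated sorted sequence; the bisection in gfs2_find_jhead() narrows in on the rotation point by comparing the midpoint's sequence with the left endpoint. A user-space analogue over a plain array (hypothetical helper; the kernel version additionally skips corrupt headers via find_good_lh() and finishes with a linear jhead_scan()):

/* Index of the maximum in a rotated strictly increasing sequence of
 * n >= 1 entries. */
static unsigned int find_max_rotated(const unsigned long long *seq,
                                     unsigned int n)
{
        unsigned int lo = 0, hi = n - 1;

        while (lo < hi) {
                unsigned int mid = (lo + hi) / 2;

                if (mid == lo)
                        break;
                if (seq[lo] <= seq[mid])
                        lo = mid;       /* head is at mid or beyond */
                else
                        hi = mid;       /* wrap point precedes mid */
        }
        return seq[lo] >= seq[hi] ? lo : hi;
}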
285
286/**
287 * foreach_descriptor - go through the active part of the log
288 * @jd: the journal
289 * @start: the first log header in the active region
290 * @end: the last log header (don't process the contents of this entry)
291 *
292 * Call a given function once for every log descriptor in the active
293 * portion of the log.
294 *
295 * Returns: errno
296 */
297
298static int foreach_descriptor(struct gfs2_jdesc *jd, unsigned int start,
299 unsigned int end, int pass)
300{
301 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
302 struct buffer_head *bh;
303 struct gfs2_log_descriptor *ld;
304 int error = 0;
305 u32 length;
306 __be64 *ptr;
307 unsigned int offset = sizeof(struct gfs2_log_descriptor);
308 offset += sizeof(__be64) - 1;
309 offset &= ~(sizeof(__be64) - 1);
310
311 while (start != end) {
312 error = gfs2_replay_read_block(jd, start, &bh);
313 if (error)
314 return error;
315 if (gfs2_meta_check(sdp, bh)) {
316 brelse(bh);
317 return -EIO;
318 }
319 ld = (struct gfs2_log_descriptor *)bh->b_data;
320 length = be32_to_cpu(ld->ld_length);
321
322 if (be32_to_cpu(ld->ld_header.mh_type) == GFS2_METATYPE_LH) {
323 struct gfs2_log_header lh;
324 error = get_log_header(jd, start, &lh);
325 if (!error) {
326 gfs2_replay_incr_blk(sdp, &start);
327 brelse(bh);
328 continue;
329 }
330 if (error == 1) {
331 gfs2_consist_inode(GFS2_I(jd->jd_inode));
332 error = -EIO;
333 }
334 brelse(bh);
335 return error;
336 } else if (gfs2_metatype_check(sdp, bh, GFS2_METATYPE_LD)) {
337 brelse(bh);
338 return -EIO;
339 }
340 ptr = (__be64 *)(bh->b_data + offset);
341 error = lops_scan_elements(jd, start, ld, ptr, pass);
342 if (error) {
343 brelse(bh);
344 return error;
345 }
346
347 while (length--)
348 gfs2_replay_incr_blk(sdp, &start);
349
350 brelse(bh);
351 }
352
353 return 0;
354}
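The three `offset` statements at the top of foreach_descriptor() are the usual add-then-mask round-up, aligning the payload pointer to the next sizeof(__be64) boundary; the mask trick is valid only because the alignment is a power of two. As a generic helper:

/* Round `offset` up to the next multiple of `align`; `align` must be
 * a power of two. */
static unsigned int align_up(unsigned int offset, unsigned int align)
{
        return (offset + align - 1) & ~(align - 1);
}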
355
356/**
357 * clean_journal - mark a dirty journal as being clean
358 * @sdp: the filesystem
359 * @jd: the journal
360 * @gl: the journal's glock
361 * @head: the head journal to start from
362 *
363 * Returns: errno
364 */
365
366static int clean_journal(struct gfs2_jdesc *jd, struct gfs2_log_header *head)
367{
368 struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
369 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
370 unsigned int lblock;
371 struct gfs2_log_header *lh;
372 u32 hash;
373 struct buffer_head *bh;
374 int error;
375 struct buffer_head bh_map = { .b_state = 0, .b_blocknr = 0 };
376
377 lblock = head->lh_blkno;
378 gfs2_replay_incr_blk(sdp, &lblock);
379 bh_map.b_size = 1 << ip->i_inode.i_blkbits;
380 error = gfs2_block_map(&ip->i_inode, lblock, 0, &bh_map);
381 if (error)
382 return error;
383 if (!bh_map.b_blocknr) {
384 gfs2_consist_inode(ip);
385 return -EIO;
386 }
387
388 bh = sb_getblk(sdp->sd_vfs, bh_map.b_blocknr);
389 lock_buffer(bh);
390 memset(bh->b_data, 0, bh->b_size);
391 set_buffer_uptodate(bh);
392 clear_buffer_dirty(bh);
393 unlock_buffer(bh);
394
395 lh = (struct gfs2_log_header *)bh->b_data;
396 memset(lh, 0, sizeof(struct gfs2_log_header));
397 lh->lh_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
398 lh->lh_header.mh_type = cpu_to_be32(GFS2_METATYPE_LH);
399 lh->lh_header.mh_format = cpu_to_be32(GFS2_FORMAT_LH);
400 lh->lh_sequence = cpu_to_be64(head->lh_sequence + 1);
401 lh->lh_flags = cpu_to_be32(GFS2_LOG_HEAD_UNMOUNT);
402 lh->lh_blkno = cpu_to_be32(lblock);
403 hash = gfs2_disk_hash((const char *)lh, sizeof(struct gfs2_log_header));
404 lh->lh_hash = cpu_to_be32(hash);
405
406 set_buffer_dirty(bh);
407 if (sync_dirty_buffer(bh))
408 gfs2_io_error_bh(sdp, bh);
409 brelse(bh);
410
411 return error;
412}
413
414/**
415 * gfs2_recover_journal - recover a given journal
416 * @jd: the struct gfs2_jdesc describing the journal
417 *
418 * Acquire the journal's lock, check to see if the journal is clean, and
419 * do recovery if necessary.
420 *
421 * Returns: errno
422 */
423
424int gfs2_recover_journal(struct gfs2_jdesc *jd)
425{
426 struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
427 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
428 struct gfs2_log_header head;
429 struct gfs2_holder j_gh, ji_gh, t_gh;
430 unsigned long t;
431 int ro = 0;
432 unsigned int pass;
433 int error;
434
435 if (jd->jd_jid != sdp->sd_lockstruct.ls_jid) {
436 fs_info(sdp, "jid=%u: Trying to acquire journal lock...\n",
437 jd->jd_jid);
438
439 /* Acquire the journal lock so we can do recovery */
440
441 error = gfs2_glock_nq_num(sdp, jd->jd_jid, &gfs2_journal_glops,
442 LM_ST_EXCLUSIVE,
443 LM_FLAG_NOEXP | LM_FLAG_TRY | GL_NOCACHE,
444 &j_gh);
445 switch (error) {
446 case 0:
447 break;
448
449 case GLR_TRYFAILED:
450 fs_info(sdp, "jid=%u: Busy\n", jd->jd_jid);
451 error = 0;
452 /* fall through */
453 default:
454 goto fail;
455 }
456
457 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED,
458 LM_FLAG_NOEXP, &ji_gh);
459 if (error)
460 goto fail_gunlock_j;
461 } else {
462 fs_info(sdp, "jid=%u, already locked for use\n", jd->jd_jid);
463 }
464
465 fs_info(sdp, "jid=%u: Looking at journal...\n", jd->jd_jid);
466
467 error = gfs2_jdesc_check(jd);
468 if (error)
469 goto fail_gunlock_ji;
470
471 error = gfs2_find_jhead(jd, &head);
472 if (error)
473 goto fail_gunlock_ji;
474
475 if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) {
476 fs_info(sdp, "jid=%u: Acquiring the transaction lock...\n",
477 jd->jd_jid);
478
479 t = jiffies;
480
481 /* Acquire a shared hold on the transaction lock */
482
483 error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED,
484 LM_FLAG_NOEXP | LM_FLAG_PRIORITY |
485 GL_NOCANCEL | GL_NOCACHE, &t_gh);
486 if (error)
487 goto fail_gunlock_ji;
488
489 if (test_bit(SDF_JOURNAL_CHECKED, &sdp->sd_flags)) {
490 if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags))
491 ro = 1;
492 } else {
493 if (sdp->sd_vfs->s_flags & MS_RDONLY)
494 ro = 1;
495 }
496
497 if (ro) {
498 fs_warn(sdp, "jid=%u: Can't replay: read-only FS\n",
499 jd->jd_jid);
500 error = -EROFS;
501 goto fail_gunlock_tr;
502 }
503
504 fs_info(sdp, "jid=%u: Replaying journal...\n", jd->jd_jid);
505
506 for (pass = 0; pass < 2; pass++) {
507 lops_before_scan(jd, &head, pass);
508 error = foreach_descriptor(jd, head.lh_tail,
509 head.lh_blkno, pass);
510 lops_after_scan(jd, error, pass);
511 if (error)
512 goto fail_gunlock_tr;
513 }
514
515 error = clean_journal(jd, &head);
516 if (error)
517 goto fail_gunlock_tr;
518
519 gfs2_glock_dq_uninit(&t_gh);
520 t = DIV_ROUND_UP(jiffies - t, HZ);
521 fs_info(sdp, "jid=%u: Journal replayed in %lus\n",
522 jd->jd_jid, t);
523 }
524
525 if (jd->jd_jid != sdp->sd_lockstruct.ls_jid)
526 gfs2_glock_dq_uninit(&ji_gh);
527
528 gfs2_lm_recovery_done(sdp, jd->jd_jid, LM_RD_SUCCESS);
529
530 if (jd->jd_jid != sdp->sd_lockstruct.ls_jid)
531 gfs2_glock_dq_uninit(&j_gh);
532
533 fs_info(sdp, "jid=%u: Done\n", jd->jd_jid);
534 return 0;
535
536fail_gunlock_tr:
537 gfs2_glock_dq_uninit(&t_gh);
538fail_gunlock_ji:
539 if (jd->jd_jid != sdp->sd_lockstruct.ls_jid) {
540 gfs2_glock_dq_uninit(&ji_gh);
541fail_gunlock_j:
542 gfs2_glock_dq_uninit(&j_gh);
543 }
544
545 fs_info(sdp, "jid=%u: %s\n", jd->jd_jid, (error) ? "Failed" : "Done");
546
547fail:
548 gfs2_lm_recovery_done(sdp, jd->jd_jid, LM_RD_GAVEUP);
549 return error;
550}
551
552/**
553 * gfs2_check_journals - Recover any dirty journals
554 * @sdp: the filesystem
555 *
556 */
557
558void gfs2_check_journals(struct gfs2_sbd *sdp)
559{
560 struct gfs2_jdesc *jd;
561
562 for (;;) {
563 jd = gfs2_jdesc_find_dirty(sdp);
564 if (!jd)
565 break;
566
567 if (jd != sdp->sd_jdesc)
568 gfs2_recover_journal(jd);
569 }
570}
571
diff --git a/fs/gfs2/recovery.h b/fs/gfs2/recovery.h
new file mode 100644
index 000000000000..961feedf4d8b
--- /dev/null
+++ b/fs/gfs2/recovery.h
@@ -0,0 +1,34 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __RECOVERY_DOT_H__
11#define __RECOVERY_DOT_H__
12
13#include "incore.h"
14
15static inline void gfs2_replay_incr_blk(struct gfs2_sbd *sdp, unsigned int *blk)
16{
17 if (++*blk == sdp->sd_jdesc->jd_blocks)
18 *blk = 0;
19}
20
21int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk,
22 struct buffer_head **bh);
23
24int gfs2_revoke_add(struct gfs2_sbd *sdp, u64 blkno, unsigned int where);
25int gfs2_revoke_check(struct gfs2_sbd *sdp, u64 blkno, unsigned int where);
26void gfs2_revoke_clean(struct gfs2_sbd *sdp);
27
28int gfs2_find_jhead(struct gfs2_jdesc *jd,
29 struct gfs2_log_header *head);
30int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd);
31void gfs2_check_journals(struct gfs2_sbd *sdp);
32
33#endif /* __RECOVERY_DOT_H__ */
34
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
new file mode 100644
index 000000000000..b261385c0065
--- /dev/null
+++ b/fs/gfs2/rgrp.c
@@ -0,0 +1,1513 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/fs.h>
16#include <linux/gfs2_ondisk.h>
17#include <linux/lm_interface.h>
18
19#include "gfs2.h"
20#include "incore.h"
21#include "glock.h"
22#include "glops.h"
23#include "lops.h"
24#include "meta_io.h"
25#include "quota.h"
26#include "rgrp.h"
27#include "super.h"
28#include "trans.h"
29#include "ops_file.h"
30#include "util.h"
31
32#define BFITNOENT ((u32)~0)
33
34/*
35 * These routines are used by the resource group routines (rgrp.c)
36 * to keep track of block allocation. Each block is represented by two
37 * bits. So, each byte represents GFS2_NBBY (i.e. 4) blocks.
38 *
39 * 0 = Free
40 * 1 = Used (not metadata)
41 * 2 = Unlinked (still in use) inode
42 * 3 = Used (metadata)
43 */
44
45static const char valid_change[16] = {
46 /* current */
47 /* n */ 0, 1, 1, 1,
48 /* e */ 1, 0, 0, 0,
49 /* w */ 0, 0, 0, 1,
50 1, 0, 0, 0
51};
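With two bits per block, one byte covers GFS2_NBBY (4) blocks, and valid_change is indexed as new_state * 4 + cur_state to whitelist legal transitions. A minimal sketch of the two-bit packing itself; the constants and helpers are illustrative stand-ins (gfs2_setbit() below performs the update with an XOR/OR pair after consulting valid_change):

/* Two-bit block states packed four to a byte; pair i of a byte lives
 * at bit position 2 * i. Illustrative only. */
#define EX_NBBY         4
#define EX_BIT_SIZE     2
#define EX_BIT_MASK     0x3

static unsigned char get_state(const unsigned char *buf, unsigned int block)
{
        unsigned int bit = (block % EX_NBBY) * EX_BIT_SIZE;

        return (buf[block / EX_NBBY] >> bit) & EX_BIT_MASK;
}

static void set_state(unsigned char *buf, unsigned int block,
                      unsigned char state)
{
        unsigned int bit = (block % EX_NBBY) * EX_BIT_SIZE;
        unsigned int idx = block / EX_NBBY;

        buf[idx] = (buf[idx] & ~(EX_BIT_MASK << bit)) | (state << bit);
}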
52
53/**
54 * gfs2_setbit - Set a bit in the bitmaps
55 * @buffer: the buffer that holds the bitmaps
56 * @buflen: the length (in bytes) of the buffer
57 * @block: the block to set
58 * @new_state: the new state of the block
59 *
60 */
61
62static void gfs2_setbit(struct gfs2_rgrpd *rgd, unsigned char *buffer,
63 unsigned int buflen, u32 block,
64 unsigned char new_state)
65{
66 unsigned char *byte, *end, cur_state;
67 unsigned int bit;
68
69 byte = buffer + (block / GFS2_NBBY);
70 bit = (block % GFS2_NBBY) * GFS2_BIT_SIZE;
71 end = buffer + buflen;
72
73 gfs2_assert(rgd->rd_sbd, byte < end);
74
75 cur_state = (*byte >> bit) & GFS2_BIT_MASK;
76
77 if (valid_change[new_state * 4 + cur_state]) {
78 *byte ^= cur_state << bit;
79 *byte |= new_state << bit;
80 } else
81 gfs2_consist_rgrpd(rgd);
82}
83
84/**
85 * gfs2_testbit - test a bit in the bitmaps
86 * @buffer: the buffer that holds the bitmaps
87 * @buflen: the length (in bytes) of the buffer
88 * @block: the block to read
89 *
90 */
91
92static unsigned char gfs2_testbit(struct gfs2_rgrpd *rgd, unsigned char *buffer,
93 unsigned int buflen, u32 block)
94{
95 unsigned char *byte, *end, cur_state;
96 unsigned int bit;
97
98 byte = buffer + (block / GFS2_NBBY);
99 bit = (block % GFS2_NBBY) * GFS2_BIT_SIZE;
100 end = buffer + buflen;
101
102 gfs2_assert(rgd->rd_sbd, byte < end);
103
104 cur_state = (*byte >> bit) & GFS2_BIT_MASK;
105
106 return cur_state;
107}
108
109/**
110 * gfs2_bitfit - Search an rgrp's bitmap buffer to find a bit-pair representing
111 * a block in a given allocation state.
112 * @buffer: the buffer that holds the bitmaps
113 * @buflen: the length (in bytes) of the buffer
114 * @goal: start search at this block's bit-pair (within @buffer)
115 * @old_state: GFS2_BLKST_XXX the state of the block we're looking for;
116 * bit 0 = alloc(1)/free(0), bit 1 = meta(1)/data(0)
117 *
118 * Scope of @goal and returned block number is only within this bitmap buffer,
119 * not entire rgrp or filesystem. @buffer will be offset from the actual
120 * beginning of a bitmap block buffer, skipping any header structures.
121 *
122 * Return: the block number (bitmap buffer scope) that was found
123 */
124
125static u32 gfs2_bitfit(struct gfs2_rgrpd *rgd, unsigned char *buffer,
126 unsigned int buflen, u32 goal,
127 unsigned char old_state)
128{
129 unsigned char *byte, *end, alloc;
130 u32 blk = goal;
131 unsigned int bit;
132
133 byte = buffer + (goal / GFS2_NBBY);
134 bit = (goal % GFS2_NBBY) * GFS2_BIT_SIZE;
135 end = buffer + buflen;
136 alloc = (old_state & 1) ? 0 : 0x55;
137
138 while (byte < end) {
139 if ((*byte & 0x55) == alloc) {
140 blk += (8 - bit) >> 1;
141
142 bit = 0;
143 byte++;
144
145 continue;
146 }
147
148 if (((*byte >> bit) & GFS2_BIT_MASK) == old_state)
149 return blk;
150
151 bit += GFS2_BIT_SIZE;
152 if (bit >= 8) {
153 bit = 0;
154 byte++;
155 }
156
157 blk++;
158 }
159
160 return BFITNOENT;
161}
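The `(*byte & 0x55) == alloc` test in gfs2_bitfit() is a whole-byte fast path: 0x55 (binary 01010101) selects bit 0, the allocated/free bit, of all four bit-pairs at once, so a byte in which every pair's allocated bit disagrees with the state being searched for is skipped without examining pairs one by one. Restated as a predicate (hypothetical name):

/* Can this byte possibly contain a pair in `old_state`? When hunting
 * for a free state (bit 0 clear), a byte whose four allocated bits are
 * all set cannot match; when hunting for an allocated state, a byte
 * whose four allocated bits are all clear cannot match. */
static int byte_may_match(unsigned char byte, unsigned char old_state)
{
        unsigned char alloc = (old_state & 1) ? 0 : 0x55;

        return (byte & 0x55) != alloc;
}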
162
163/**
164 * gfs2_bitcount - count the number of bits in a certain state
165 * @buffer: the buffer that holds the bitmaps
166 * @buflen: the length (in bytes) of the buffer
167 * @state: the state of the block we're looking for
168 *
169 * Returns: The number of bits
170 */
171
172static u32 gfs2_bitcount(struct gfs2_rgrpd *rgd, unsigned char *buffer,
173 unsigned int buflen, unsigned char state)
174{
175 unsigned char *byte = buffer;
176 unsigned char *end = buffer + buflen;
177 unsigned char state1 = state << 2;
178 unsigned char state2 = state << 4;
179 unsigned char state3 = state << 6;
180 u32 count = 0;
181
182 for (; byte < end; byte++) {
183 if (((*byte) & 0x03) == state)
184 count++;
185 if (((*byte) & 0x0C) == state1)
186 count++;
187 if (((*byte) & 0x30) == state2)
188 count++;
189 if (((*byte) & 0xC0) == state3)
190 count++;
191 }
192
193 return count;
194}
195
196/**
197 * gfs2_rgrp_verify - Verify that a resource group is consistent
198 * @sdp: the filesystem
199 * @rgd: the rgrp
200 *
201 */
202
203void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd)
204{
205 struct gfs2_sbd *sdp = rgd->rd_sbd;
206 struct gfs2_bitmap *bi = NULL;
207 u32 length = rgd->rd_ri.ri_length;
208 u32 count[4], tmp;
209 int buf, x;
210
211 memset(count, 0, 4 * sizeof(u32));
212
213 /* Count # blocks in each of 4 possible allocation states */
214 for (buf = 0; buf < length; buf++) {
215 bi = rgd->rd_bits + buf;
216 for (x = 0; x < 4; x++)
217 count[x] += gfs2_bitcount(rgd,
218 bi->bi_bh->b_data +
219 bi->bi_offset,
220 bi->bi_len, x);
221 }
222
223 if (count[0] != rgd->rd_rg.rg_free) {
224 if (gfs2_consist_rgrpd(rgd))
225 fs_err(sdp, "free data mismatch: %u != %u\n",
226 count[0], rgd->rd_rg.rg_free);
227 return;
228 }
229
230 tmp = rgd->rd_ri.ri_data -
231 rgd->rd_rg.rg_free -
232 rgd->rd_rg.rg_dinodes;
233 if (count[1] + count[2] != tmp) {
234 if (gfs2_consist_rgrpd(rgd))
235 fs_err(sdp, "used data mismatch: %u != %u\n",
236 count[1], tmp);
237 return;
238 }
239
240 if (count[3] != rgd->rd_rg.rg_dinodes) {
241 if (gfs2_consist_rgrpd(rgd))
242 fs_err(sdp, "used metadata mismatch: %u != %u\n",
243 count[3], rgd->rd_rg.rg_dinodes);
244 return;
245 }
246
247 if (count[2] > count[3]) {
248 if (gfs2_consist_rgrpd(rgd))
249 fs_err(sdp, "unlinked inodes > inodes: %u\n",
250 count[2]);
251 return;
252 }
253
254}
255
256static inline int rgrp_contains_block(struct gfs2_rindex *ri, u64 block)
257{
258 u64 first = ri->ri_data0;
259 u64 last = first + ri->ri_data;
260 return first <= block && block < last;
261}
262
263/**
264 * gfs2_blk2rgrpd - Find resource group for a given data/meta block number
265 * @sdp: The GFS2 superblock
266 * @n: The data block number
267 *
268 * Returns: The resource group, or NULL if not found
269 */
270
271struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk)
272{
273 struct gfs2_rgrpd *rgd;
274
275 spin_lock(&sdp->sd_rindex_spin);
276
277 list_for_each_entry(rgd, &sdp->sd_rindex_mru_list, rd_list_mru) {
278 if (rgrp_contains_block(&rgd->rd_ri, blk)) {
279 list_move(&rgd->rd_list_mru, &sdp->sd_rindex_mru_list);
280 spin_unlock(&sdp->sd_rindex_spin);
281 return rgd;
282 }
283 }
284
285 spin_unlock(&sdp->sd_rindex_spin);
286
287 return NULL;
288}
289
290/**
291 * gfs2_rgrpd_get_first - get the first Resource Group in the filesystem
292 * @sdp: The GFS2 superblock
293 *
294 * Returns: The first rgrp in the filesystem
295 */
296
297struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp)
298{
299 gfs2_assert(sdp, !list_empty(&sdp->sd_rindex_list));
300 return list_entry(sdp->sd_rindex_list.next, struct gfs2_rgrpd, rd_list);
301}
302
303/**
304 * gfs2_rgrpd_get_next - get the next RG
305 * @rgd: A RG
306 *
307 * Returns: The next rgrp
308 */
309
310struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd)
311{
312 if (rgd->rd_list.next == &rgd->rd_sbd->sd_rindex_list)
313 return NULL;
314 return list_entry(rgd->rd_list.next, struct gfs2_rgrpd, rd_list);
315}
316
317static void clear_rgrpdi(struct gfs2_sbd *sdp)
318{
319 struct list_head *head;
320 struct gfs2_rgrpd *rgd;
321 struct gfs2_glock *gl;
322
323 spin_lock(&sdp->sd_rindex_spin);
324 sdp->sd_rindex_forward = NULL;
325 head = &sdp->sd_rindex_recent_list;
326 while (!list_empty(head)) {
327 rgd = list_entry(head->next, struct gfs2_rgrpd, rd_recent);
328 list_del(&rgd->rd_recent);
329 }
330 spin_unlock(&sdp->sd_rindex_spin);
331
332 head = &sdp->sd_rindex_list;
333 while (!list_empty(head)) {
334 rgd = list_entry(head->next, struct gfs2_rgrpd, rd_list);
335 gl = rgd->rd_gl;
336
337 list_del(&rgd->rd_list);
338 list_del(&rgd->rd_list_mru);
339
340 if (gl) {
341 gl->gl_object = NULL;
342 gfs2_glock_put(gl);
343 }
344
345 kfree(rgd->rd_bits);
346 kfree(rgd);
347 }
348}
349
350void gfs2_clear_rgrpd(struct gfs2_sbd *sdp)
351{
352 mutex_lock(&sdp->sd_rindex_mutex);
353 clear_rgrpdi(sdp);
354 mutex_unlock(&sdp->sd_rindex_mutex);
355}
356
357/**
358 * compute_bitstructs - Compute the bitmap sizes
359 * @rgd: The resource group descriptor
360 *
361 * Calculates bitmap descriptors, one for each block that contains bitmap data
362 *
363 * Returns: errno
364 */
365
366static int compute_bitstructs(struct gfs2_rgrpd *rgd)
367{
368 struct gfs2_sbd *sdp = rgd->rd_sbd;
369 struct gfs2_bitmap *bi;
370 u32 length = rgd->rd_ri.ri_length; /* # blocks in hdr & bitmap */
371 u32 bytes_left, bytes;
372 int x;
373
374 if (!length)
375 return -EINVAL;
376
377 rgd->rd_bits = kcalloc(length, sizeof(struct gfs2_bitmap), GFP_NOFS);
378 if (!rgd->rd_bits)
379 return -ENOMEM;
380
381 bytes_left = rgd->rd_ri.ri_bitbytes;
382
383 for (x = 0; x < length; x++) {
384 bi = rgd->rd_bits + x;
385
386 /* small rgrp; bitmap stored completely in header block */
387 if (length == 1) {
388 bytes = bytes_left;
389 bi->bi_offset = sizeof(struct gfs2_rgrp);
390 bi->bi_start = 0;
391 bi->bi_len = bytes;
392 /* header block */
393 } else if (x == 0) {
394 bytes = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_rgrp);
395 bi->bi_offset = sizeof(struct gfs2_rgrp);
396 bi->bi_start = 0;
397 bi->bi_len = bytes;
398 /* last block */
399 } else if (x + 1 == length) {
400 bytes = bytes_left;
401 bi->bi_offset = sizeof(struct gfs2_meta_header);
402 bi->bi_start = rgd->rd_ri.ri_bitbytes - bytes_left;
403 bi->bi_len = bytes;
404 /* other blocks */
405 } else {
406 bytes = sdp->sd_sb.sb_bsize -
407 sizeof(struct gfs2_meta_header);
408 bi->bi_offset = sizeof(struct gfs2_meta_header);
409 bi->bi_start = rgd->rd_ri.ri_bitbytes - bytes_left;
410 bi->bi_len = bytes;
411 }
412
413 bytes_left -= bytes;
414 }
415
416 if (bytes_left) {
417 gfs2_consist_rgrpd(rgd);
418 return -EIO;
419 }
420 bi = rgd->rd_bits + (length - 1);
421 if ((bi->bi_start + bi->bi_len) * GFS2_NBBY != rgd->rd_ri.ri_data) {
422 if (gfs2_consist_rgrpd(rgd)) {
423 gfs2_rindex_print(&rgd->rd_ri);
424 fs_err(sdp, "start=%u len=%u offset=%u\n",
425 bi->bi_start, bi->bi_len, bi->bi_offset);
426 }
427 return -EIO;
428 }
429
430 return 0;
431}
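compute_bitstructs() slices one resource group's bitmap across ri_length blocks: in the first block the bitmap starts after the full struct gfs2_rgrp header, in later blocks after a plain struct gfs2_meta_header, and the last block takes whatever bytes remain. A stand-alone sketch of the same layout computation, with made-up struct and size names:

/* Compute (offset-in-block, start-in-bitmap, length) for each of the
 * `length` blocks holding `bitbytes` of bitmap. All names here are
 * illustrative; in GFS2 the headers are struct gfs2_rgrp and
 * struct gfs2_meta_header. */
struct bm_slice {
        unsigned int offset;    /* where the bitmap starts in the block */
        unsigned int start;     /* byte offset within the whole bitmap */
        unsigned int len;       /* bitmap bytes held by this block */
};

static void layout_bitmap(struct bm_slice *s, unsigned int length,
                          unsigned int bsize, unsigned int rg_hdr,
                          unsigned int mh_hdr, unsigned int bitbytes)
{
        unsigned int x, bytes, left = bitbytes;

        for (x = 0; x < length; x++) {
                unsigned int hdr = (x == 0) ? rg_hdr : mh_hdr;

                bytes = (x + 1 == length) ? left : bsize - hdr;
                s[x].offset = hdr;
                s[x].start = bitbytes - left;
                s[x].len = bytes;
                left -= bytes;
        }
}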
432
433/**
434 * gfs2_ri_update - Pull in a new resource index from the disk
436 * @ip: the rindex inode
436 *
437 * Returns: 0 on successful update, error code otherwise
438 */
439
440static int gfs2_ri_update(struct gfs2_inode *ip)
441{
442 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
443 struct inode *inode = &ip->i_inode;
444 struct gfs2_rgrpd *rgd;
445 char buf[sizeof(struct gfs2_rindex)];
446 struct file_ra_state ra_state;
447 u64 junk = ip->i_di.di_size;
448 int error;
449
450 if (do_div(junk, sizeof(struct gfs2_rindex))) {
451 gfs2_consist_inode(ip);
452 return -EIO;
453 }
454
455 clear_rgrpdi(sdp);
456
457 file_ra_state_init(&ra_state, inode->i_mapping);
458 for (sdp->sd_rgrps = 0;; sdp->sd_rgrps++) {
459 loff_t pos = sdp->sd_rgrps * sizeof(struct gfs2_rindex);
460 error = gfs2_internal_read(ip, &ra_state, buf, &pos,
461 sizeof(struct gfs2_rindex));
462 if (!error)
463 break;
464 if (error != sizeof(struct gfs2_rindex)) {
465 if (error > 0)
466 error = -EIO;
467 goto fail;
468 }
469
470 rgd = kzalloc(sizeof(struct gfs2_rgrpd), GFP_NOFS);
471 error = -ENOMEM;
472 if (!rgd)
473 goto fail;
474
475 mutex_init(&rgd->rd_mutex);
476 lops_init_le(&rgd->rd_le, &gfs2_rg_lops);
477 rgd->rd_sbd = sdp;
478
479 list_add_tail(&rgd->rd_list, &sdp->sd_rindex_list);
480 list_add_tail(&rgd->rd_list_mru, &sdp->sd_rindex_mru_list);
481
482 gfs2_rindex_in(&rgd->rd_ri, buf);
483 error = compute_bitstructs(rgd);
484 if (error)
485 goto fail;
486
487 error = gfs2_glock_get(sdp, rgd->rd_ri.ri_addr,
488 &gfs2_rgrp_glops, CREATE, &rgd->rd_gl);
489 if (error)
490 goto fail;
491
492 rgd->rd_gl->gl_object = rgd;
493 rgd->rd_rg_vn = rgd->rd_gl->gl_vn - 1;
494 }
495
496 sdp->sd_rindex_vn = ip->i_gl->gl_vn;
497 return 0;
498
499fail:
500 clear_rgrpdi(sdp);
501 return error;
502}
503
504/**
505 * gfs2_rindex_hold - Grab a lock on the rindex
506 * @sdp: The GFS2 superblock
507 * @ri_gh: the glock holder
508 *
509 * We grab a lock on the rindex inode to make sure that it doesn't
510 * change whilst we are performing an operation. We keep this lock
511 * for quite long periods of time compared to other locks. This
512 * doesn't matter, since it is shared and it is very, very rarely
513 * accessed in the exclusive mode (i.e. only when expanding the filesystem).
514 *
515 * This makes sure that we're using the latest copy of the resource index
516 * special file, which might have been updated if someone expanded the
517 * filesystem (via gfs2_grow utility), which adds new resource groups.
518 *
519 * Returns: 0 on success, error code otherwise
520 */
521
522int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh)
523{
524 struct gfs2_inode *ip = GFS2_I(sdp->sd_rindex);
525 struct gfs2_glock *gl = ip->i_gl;
526 int error;
527
528 error = gfs2_glock_nq_init(gl, LM_ST_SHARED, 0, ri_gh);
529 if (error)
530 return error;
531
532 /* Read new copy from disk if we don't have the latest */
533 if (sdp->sd_rindex_vn != gl->gl_vn) {
534 mutex_lock(&sdp->sd_rindex_mutex);
535 if (sdp->sd_rindex_vn != gl->gl_vn) {
536 error = gfs2_ri_update(ip);
537 if (error)
538 gfs2_glock_dq_uninit(ri_gh);
539 }
540 mutex_unlock(&sdp->sd_rindex_mutex);
541 }
542
543 return error;
544}
545
546/**
547 * gfs2_rgrp_bh_get - Read in a RG's header and bitmaps
548 * @rgd: the struct gfs2_rgrpd describing the RG to read in
549 *
550 * Read in all of a Resource Group's header and bitmap blocks.
551 * Caller must eventually call gfs2_rgrp_relse() to free the bitmaps.
552 *
553 * Returns: errno
554 */
555
556int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
557{
558 struct gfs2_sbd *sdp = rgd->rd_sbd;
559 struct gfs2_glock *gl = rgd->rd_gl;
560 unsigned int length = rgd->rd_ri.ri_length;
561 struct gfs2_bitmap *bi;
562 unsigned int x, y;
563 int error;
564
565 mutex_lock(&rgd->rd_mutex);
566
567 spin_lock(&sdp->sd_rindex_spin);
568 if (rgd->rd_bh_count) {
569 rgd->rd_bh_count++;
570 spin_unlock(&sdp->sd_rindex_spin);
571 mutex_unlock(&rgd->rd_mutex);
572 return 0;
573 }
574 spin_unlock(&sdp->sd_rindex_spin);
575
576 for (x = 0; x < length; x++) {
577 bi = rgd->rd_bits + x;
578 error = gfs2_meta_read(gl, rgd->rd_ri.ri_addr + x, 0, &bi->bi_bh);
579 if (error)
580 goto fail;
581 }
582
583 for (y = length; y--;) {
584 bi = rgd->rd_bits + y;
585 error = gfs2_meta_wait(sdp, bi->bi_bh);
586 if (error)
587 goto fail;
588 if (gfs2_metatype_check(sdp, bi->bi_bh, y ? GFS2_METATYPE_RB :
589 GFS2_METATYPE_RG)) {
590 error = -EIO;
591 goto fail;
592 }
593 }
594
595 if (rgd->rd_rg_vn != gl->gl_vn) {
596 gfs2_rgrp_in(&rgd->rd_rg, (rgd->rd_bits[0].bi_bh)->b_data);
597 rgd->rd_rg_vn = gl->gl_vn;
598 }
599
600 spin_lock(&sdp->sd_rindex_spin);
601 rgd->rd_free_clone = rgd->rd_rg.rg_free;
602 rgd->rd_bh_count++;
603 spin_unlock(&sdp->sd_rindex_spin);
604
605 mutex_unlock(&rgd->rd_mutex);
606
607 return 0;
608
609fail:
610 while (x--) {
611 bi = rgd->rd_bits + x;
612 brelse(bi->bi_bh);
613 bi->bi_bh = NULL;
614 gfs2_assert_warn(sdp, !bi->bi_clone);
615 }
616 mutex_unlock(&rgd->rd_mutex);
617
618 return error;
619}
620
621void gfs2_rgrp_bh_hold(struct gfs2_rgrpd *rgd)
622{
623 struct gfs2_sbd *sdp = rgd->rd_sbd;
624
625 spin_lock(&sdp->sd_rindex_spin);
626 gfs2_assert_warn(rgd->rd_sbd, rgd->rd_bh_count);
627 rgd->rd_bh_count++;
628 spin_unlock(&sdp->sd_rindex_spin);
629}
630
631/**
632 * gfs2_rgrp_bh_put - Release RG bitmaps read in with gfs2_rgrp_bh_get()
633 * @rgd: the struct gfs2_rgrpd describing the RG to read in
634 *
635 */
636
637void gfs2_rgrp_bh_put(struct gfs2_rgrpd *rgd)
638{
639 struct gfs2_sbd *sdp = rgd->rd_sbd;
640 int x, length = rgd->rd_ri.ri_length;
641
642 spin_lock(&sdp->sd_rindex_spin);
643 gfs2_assert_warn(rgd->rd_sbd, rgd->rd_bh_count);
644 if (--rgd->rd_bh_count) {
645 spin_unlock(&sdp->sd_rindex_spin);
646 return;
647 }
648
649 for (x = 0; x < length; x++) {
650 struct gfs2_bitmap *bi = rgd->rd_bits + x;
651 kfree(bi->bi_clone);
652 bi->bi_clone = NULL;
653 brelse(bi->bi_bh);
654 bi->bi_bh = NULL;
655 }
656
657 spin_unlock(&sdp->sd_rindex_spin);
658}
659
660void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd)
661{
662 struct gfs2_sbd *sdp = rgd->rd_sbd;
663 unsigned int length = rgd->rd_ri.ri_length;
664 unsigned int x;
665
666 for (x = 0; x < length; x++) {
667 struct gfs2_bitmap *bi = rgd->rd_bits + x;
668 if (!bi->bi_clone)
669 continue;
670 memcpy(bi->bi_clone + bi->bi_offset,
671 bi->bi_bh->b_data + bi->bi_offset, bi->bi_len);
672 }
673
674 spin_lock(&sdp->sd_rindex_spin);
675 rgd->rd_free_clone = rgd->rd_rg.rg_free;
676 spin_unlock(&sdp->sd_rindex_spin);
677}
678
679/**
680 * gfs2_alloc_get - get the struct gfs2_alloc structure for an inode
681 * @ip: the incore GFS2 inode structure
682 *
683 * Returns: the struct gfs2_alloc
684 */
685
686struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip)
687{
688 struct gfs2_alloc *al = &ip->i_alloc;
689
690 /* FIXME: Should assert that the correct locks are held here... */
691 memset(al, 0, sizeof(*al));
692 return al;
693}
694
695/**
696 * try_rgrp_fit - See if a given reservation will fit in a given RG
697 * @rgd: the RG data
698 * @al: the struct gfs2_alloc structure describing the reservation
699 *
700 * If there's room for the requested blocks to be allocated from the RG:
701 * Sets the @al_rgd field in @al to point at the chosen RG.
702 * (The check is simply that the RG's cloned free-block count covers
703 * @al_requested; no other fields are reserved here.)
704 *
705 * Returns: 1 on success (it fits), 0 on failure (it doesn't fit)
706 */
707
708static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al)
709{
710 struct gfs2_sbd *sdp = rgd->rd_sbd;
711 int ret = 0;
712
713 spin_lock(&sdp->sd_rindex_spin);
714 if (rgd->rd_free_clone >= al->al_requested) {
715 al->al_rgd = rgd;
716 ret = 1;
717 }
718 spin_unlock(&sdp->sd_rindex_spin);
719
720 return ret;
721}
722
723/**
724 * recent_rgrp_first - get first RG from "recent" list
725 * @sdp: The GFS2 superblock
726 * @rglast: address of the rgrp used last
727 *
728 * Returns: The first rgrp in the recent list
729 */
730
731static struct gfs2_rgrpd *recent_rgrp_first(struct gfs2_sbd *sdp,
732 u64 rglast)
733{
734 struct gfs2_rgrpd *rgd = NULL;
735
736 spin_lock(&sdp->sd_rindex_spin);
737
738 if (list_empty(&sdp->sd_rindex_recent_list))
739 goto out;
740
741 if (!rglast)
742 goto first;
743
744 list_for_each_entry(rgd, &sdp->sd_rindex_recent_list, rd_recent) {
745 if (rgd->rd_ri.ri_addr == rglast)
746 goto out;
747 }
748
749first:
750 rgd = list_entry(sdp->sd_rindex_recent_list.next, struct gfs2_rgrpd,
751 rd_recent);
752out:
753 spin_unlock(&sdp->sd_rindex_spin);
754 return rgd;
755}
756
757/**
758 * recent_rgrp_next - get next RG from "recent" list
759 * @cur_rgd: current rgrp
760 * @remove: if set, take @cur_rgd off the "recent" list
761 *
762 * Returns: The next rgrp in the recent list
763 */
764
765static struct gfs2_rgrpd *recent_rgrp_next(struct gfs2_rgrpd *cur_rgd,
766 int remove)
767{
768 struct gfs2_sbd *sdp = cur_rgd->rd_sbd;
769 struct list_head *head;
770 struct gfs2_rgrpd *rgd;
771
772 spin_lock(&sdp->sd_rindex_spin);
773
774 head = &sdp->sd_rindex_recent_list;
775
776 list_for_each_entry(rgd, head, rd_recent) {
777 if (rgd == cur_rgd) {
778 if (cur_rgd->rd_recent.next != head)
779 rgd = list_entry(cur_rgd->rd_recent.next,
780 struct gfs2_rgrpd, rd_recent);
781 else
782 rgd = NULL;
783
784 if (remove)
785 list_del(&cur_rgd->rd_recent);
786
787 goto out;
788 }
789 }
790
791 rgd = NULL;
792 if (!list_empty(head))
793 rgd = list_entry(head->next, struct gfs2_rgrpd, rd_recent);
794
795out:
796 spin_unlock(&sdp->sd_rindex_spin);
797 return rgd;
798}
799
800/**
801 * recent_rgrp_add - add an RG to tail of "recent" list
802 * @new_rgd: The rgrp to add
803 *
804 */
805
806static void recent_rgrp_add(struct gfs2_rgrpd *new_rgd)
807{
808 struct gfs2_sbd *sdp = new_rgd->rd_sbd;
809 struct gfs2_rgrpd *rgd;
810 unsigned int count = 0;
811 unsigned int max = sdp->sd_rgrps / gfs2_jindex_size(sdp);
812
813 spin_lock(&sdp->sd_rindex_spin);
814
815 list_for_each_entry(rgd, &sdp->sd_rindex_recent_list, rd_recent) {
816 if (rgd == new_rgd)
817 goto out;
818
819 if (++count >= max)
820 goto out;
821 }
822 list_add_tail(&new_rgd->rd_recent, &sdp->sd_rindex_recent_list);
823
824out:
825 spin_unlock(&sdp->sd_rindex_spin);
826}
827
828/**
829 * forward_rgrp_get - get an rgrp to try next from full list
830 * @sdp: The GFS2 superblock
831 *
832 * Returns: The rgrp to try next
833 */
834
835static struct gfs2_rgrpd *forward_rgrp_get(struct gfs2_sbd *sdp)
836{
837 struct gfs2_rgrpd *rgd;
838 unsigned int journals = gfs2_jindex_size(sdp);
839 unsigned int rg = 0, x;
840
841 spin_lock(&sdp->sd_rindex_spin);
842
843 rgd = sdp->sd_rindex_forward;
844 if (!rgd) {
845 if (sdp->sd_rgrps >= journals)
846 rg = sdp->sd_rgrps * sdp->sd_jdesc->jd_jid / journals;
847
848 for (x = 0, rgd = gfs2_rgrpd_get_first(sdp); x < rg;
849 x++, rgd = gfs2_rgrpd_get_next(rgd))
850 /* Do Nothing */;
851
852 sdp->sd_rindex_forward = rgd;
853 }
854
855 spin_unlock(&sdp->sd_rindex_spin);
856
857 return rgd;
858}
859
860/**
861 * forward_rgrp_set - set the forward rgrp pointer
862 * @sdp: the filesystem
863 * @rgd: The new forward rgrp
864 *
865 */
866
867static void forward_rgrp_set(struct gfs2_sbd *sdp, struct gfs2_rgrpd *rgd)
868{
869 spin_lock(&sdp->sd_rindex_spin);
870 sdp->sd_rindex_forward = rgd;
871 spin_unlock(&sdp->sd_rindex_spin);
872}
873
874/**
875 * get_local_rgrp - Choose and lock a rgrp for allocation
876 * @ip: the inode to reserve space for
877 *
878 * Try to acquire an rgrp in a way which avoids contending with others.
879 * The chosen rgrp is recorded, locked, in @ip's gfs2_alloc structure.
880 *
881 * Returns: errno
882 */
883
884static int get_local_rgrp(struct gfs2_inode *ip)
885{
886 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
887 struct gfs2_rgrpd *rgd, *begin = NULL;
888 struct gfs2_alloc *al = &ip->i_alloc;
889 int flags = LM_FLAG_TRY;
890 int skipped = 0;
891 int loops = 0;
892 int error;
893
894 /* Try recently successful rgrps */
895
896 rgd = recent_rgrp_first(sdp, ip->i_last_rg_alloc);
897
898 while (rgd) {
899 error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE,
900 LM_FLAG_TRY, &al->al_rgd_gh);
901 switch (error) {
902 case 0:
903 if (try_rgrp_fit(rgd, al))
904 goto out;
905 gfs2_glock_dq_uninit(&al->al_rgd_gh);
906 rgd = recent_rgrp_next(rgd, 1);
907 break;
908
909 case GLR_TRYFAILED:
910 rgd = recent_rgrp_next(rgd, 0);
911 break;
912
913 default:
914 return error;
915 }
916 }
917
918 /* Go through full list of rgrps */
919
920 begin = rgd = forward_rgrp_get(sdp);
921
922 for (;;) {
923 error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, flags,
924 &al->al_rgd_gh);
925 switch (error) {
926 case 0:
927 if (try_rgrp_fit(rgd, al))
928 goto out;
929 gfs2_glock_dq_uninit(&al->al_rgd_gh);
930 break;
931
932 case GLR_TRYFAILED:
933 skipped++;
934 break;
935
936 default:
937 return error;
938 }
939
940 rgd = gfs2_rgrpd_get_next(rgd);
941 if (!rgd)
942 rgd = gfs2_rgrpd_get_first(sdp);
943
944 if (rgd == begin) {
945 if (++loops >= 2 || !skipped)
946 return -ENOSPC;
947 flags = 0;
948 }
949 }
950
951out:
952 ip->i_last_rg_alloc = rgd->rd_ri.ri_addr;
953
954 if (begin) {
955 recent_rgrp_add(rgd);
956 rgd = gfs2_rgrpd_get_next(rgd);
957 if (!rgd)
958 rgd = gfs2_rgrpd_get_first(sdp);
959 forward_rgrp_set(sdp, rgd);
960 }
961
962 return 0;
963}
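
get_local_rgrp() is a two-pass search: the first sweep uses try-locks
(LM_FLAG_TRY) so this node skips rgrps other nodes are busy with, and only if
a full loop found groups that were merely busy rather than unsuitable does a
later sweep block for them. A rough userspace analogue of that strategy
(pthreads; fits() is a hypothetical stand-in for try_rgrp_fit()):

#include <errno.h>
#include <pthread.h>

static int pick_group(pthread_mutex_t *locks, int n,
                      int (*fits)(int), int *out)
{
        int pass, i, skipped = 0;

        for (pass = 0; pass < 2; pass++) {
                for (i = 0; i < n; i++) {
                        int err = pass ? pthread_mutex_lock(&locks[i]) :
                                         pthread_mutex_trylock(&locks[i]);
                        if (err == EBUSY) {     /* like GLR_TRYFAILED */
                                skipped++;
                                continue;
                        }
                        if (err)
                                return -err;
                        if (fits(i)) {
                                *out = i;       /* return holding the lock */
                                return 0;
                        }
                        pthread_mutex_unlock(&locks[i]);
                }
                if (!skipped)           /* nothing was busy: really full */
                        break;
        }
        return -ENOSPC;
}
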
964
965/**
966 * gfs2_inplace_reserve_i - Reserve space in the filesystem
967 * @ip: the inode to reserve space for
968 * @file: the caller's source file and @line: its line number (debugging aid)
969 * Returns: errno
970 */
971
972int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, unsigned int line)
973{
974 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
975 struct gfs2_alloc *al = &ip->i_alloc;
976 int error;
977
978 if (gfs2_assert_warn(sdp, al->al_requested))
979 return -EINVAL;
980
981 error = gfs2_rindex_hold(sdp, &al->al_ri_gh);
982 if (error)
983 return error;
984
985 error = get_local_rgrp(ip);
986 if (error) {
987 gfs2_glock_dq_uninit(&al->al_ri_gh);
988 return error;
989 }
990
991 al->al_file = file;
992 al->al_line = line;
993
994 return 0;
995}
996
997/**
998 * gfs2_inplace_release - release an inplace reservation
999 * @ip: the inode the reservation was taken out on
1000 *
1001 * Release a reservation made by gfs2_inplace_reserve().
1002 */
1003
1004void gfs2_inplace_release(struct gfs2_inode *ip)
1005{
1006 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1007 struct gfs2_alloc *al = &ip->i_alloc;
1008
1009 if (gfs2_assert_warn(sdp, al->al_alloced <= al->al_requested) == -1)
1010 fs_warn(sdp, "al_alloced = %u, al_requested = %u "
1011 "al_file = %s, al_line = %u\n",
1012 al->al_alloced, al->al_requested, al->al_file,
1013 al->al_line);
1014
1015 al->al_rgd = NULL;
1016 gfs2_glock_dq_uninit(&al->al_rgd_gh);
1017 gfs2_glock_dq_uninit(&al->al_ri_gh);
1018}
1019
1020/**
1021 * gfs2_get_block_type - Determine the allocation state of a block in an RG
1022 * @rgd: the resource group holding the block
1023 * @block: the block number
1024 *
1025 * Returns: The block type (GFS2_BLKST_*)
1026 */
1027
1028unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block)
1029{
1030 struct gfs2_bitmap *bi = NULL;
1031 u32 length, rgrp_block, buf_block;
1032 unsigned int buf;
1033 unsigned char type;
1034
1035 length = rgd->rd_ri.ri_length;
1036 rgrp_block = block - rgd->rd_ri.ri_data0;
1037
1038 for (buf = 0; buf < length; buf++) {
1039 bi = rgd->rd_bits + buf;
1040 if (rgrp_block < (bi->bi_start + bi->bi_len) * GFS2_NBBY)
1041 break;
1042 }
1043
1044 gfs2_assert(rgd->rd_sbd, buf < length);
1045 buf_block = rgrp_block - bi->bi_start * GFS2_NBBY;
1046
1047 type = gfs2_testbit(rgd, bi->bi_bh->b_data + bi->bi_offset,
1048 bi->bi_len, buf_block);
1049
1050 return type;
1051}
1052
1053/**
1054 * rgblk_search - find a block in @old_state, change allocation
1055 * state to @new_state
1056 * @rgd: the resource group descriptor
1057 * @goal: the goal block within the RG (start here to search for avail block)
1058 * @old_state: GFS2_BLKST_XXX the before-allocation state to find
1059 * @new_state: GFS2_BLKST_XXX the after-allocation block state
1060 *
1061 * Walk rgrp's bitmap to find bits that represent a block in @old_state.
1062 * Add the found bitmap buffer to the transaction.
1063 * Set the found bits to @new_state to change block's allocation state.
1064 *
1065 * This function never fails, because we wouldn't call it unless we
1066 * know (from reservation results, etc.) that a block is available.
1067 *
1068 * Scope of @goal and returned block is just within rgrp, not the whole
1069 * filesystem.
1070 *
1071 * Returns: the block number allocated
1072 */
1073
1074static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal,
1075 unsigned char old_state, unsigned char new_state)
1076{
1077 struct gfs2_bitmap *bi = NULL;
1078 u32 length = rgd->rd_ri.ri_length;
1079 u32 blk = 0;
1080 unsigned int buf, x;
1081
1082 /* Find bitmap block that contains bits for goal block */
1083 for (buf = 0; buf < length; buf++) {
1084 bi = rgd->rd_bits + buf;
1085 if (goal < (bi->bi_start + bi->bi_len) * GFS2_NBBY)
1086 break;
1087 }
1088
1089 gfs2_assert(rgd->rd_sbd, buf < length);
1090
1091 /* Convert scope of "goal" from rgrp-wide to within found bit block */
1092 goal -= bi->bi_start * GFS2_NBBY;
1093
1094	/* Search (up to the entire) bitmap in this rgrp for an allocatable block.
1095	   "x <= length", instead of "x < length", because we typically start
1096	   the search in the middle of a bit block, but if we can't find an
1097	   allocatable block anywhere else, we want to be able to wrap around and
1098	   search in the first part of our first-searched bit block. */
1099 for (x = 0; x <= length; x++) {
1100 if (bi->bi_clone)
1101 blk = gfs2_bitfit(rgd, bi->bi_clone + bi->bi_offset,
1102 bi->bi_len, goal, old_state);
1103 else
1104 blk = gfs2_bitfit(rgd,
1105 bi->bi_bh->b_data + bi->bi_offset,
1106 bi->bi_len, goal, old_state);
1107 if (blk != BFITNOENT)
1108 break;
1109
1110 /* Try next bitmap block (wrap back to rgrp header if at end) */
1111 buf = (buf + 1) % length;
1112 bi = rgd->rd_bits + buf;
1113 goal = 0;
1114 }
1115
1116 if (gfs2_assert_withdraw(rgd->rd_sbd, x <= length))
1117 blk = 0;
1118
1119 gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1);
1120 gfs2_setbit(rgd, bi->bi_bh->b_data + bi->bi_offset,
1121 bi->bi_len, blk, new_state);
1122 if (bi->bi_clone)
1123 gfs2_setbit(rgd, bi->bi_clone + bi->bi_offset,
1124 bi->bi_len, blk, new_state);
1125
1126 return bi->bi_start * GFS2_NBBY + blk;
1127}
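
rgblk_search() and its helpers operate on a packed bitmap with two bits per
block: each byte covers GFS2_NBBY (4) blocks, and the 2-bit values are the
GFS2_BLKST_* states. A standalone sketch of that encoding; it mirrors the
divide/modulo arithmetic, but the authoritative bit layout is in the kernel's
gfs2_testbit()/gfs2_setbit() themselves:

#define BLKS_PER_BYTE 4                 /* like GFS2_NBBY */

static unsigned char bmap_get(const unsigned char *bitmap, unsigned int blk)
{
        unsigned int bit = (blk % BLKS_PER_BYTE) * 2;

        return (bitmap[blk / BLKS_PER_BYTE] >> bit) & 0x3;
}

static void bmap_set(unsigned char *bitmap, unsigned int blk,
                     unsigned char state)
{
        unsigned int bit = (blk % BLKS_PER_BYTE) * 2;
        unsigned char *byte = &bitmap[blk / BLKS_PER_BYTE];

        *byte = (*byte & ~(0x3 << bit)) | ((state & 0x3) << bit);
}
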
1128
1129/**
1130 * rgblk_free - Change alloc state of given block(s)
1131 * @sdp: the filesystem
1132 * @bstart: the start of a run of blocks to free
1133 * @blen: the length of the block run (all must lie within ONE RG!)
1134 * @new_state: GFS2_BLKST_XXX the after-allocation block state
1135 *
1136 * Returns: Resource group containing the block(s)
1137 */
1138
1139static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
1140 u32 blen, unsigned char new_state)
1141{
1142 struct gfs2_rgrpd *rgd;
1143 struct gfs2_bitmap *bi = NULL;
1144 u32 length, rgrp_blk, buf_blk;
1145 unsigned int buf;
1146
1147 rgd = gfs2_blk2rgrpd(sdp, bstart);
1148 if (!rgd) {
1149 if (gfs2_consist(sdp))
1150 fs_err(sdp, "block = %llu\n", (unsigned long long)bstart);
1151 return NULL;
1152 }
1153
1154 length = rgd->rd_ri.ri_length;
1155
1156 rgrp_blk = bstart - rgd->rd_ri.ri_data0;
1157
1158 while (blen--) {
1159 for (buf = 0; buf < length; buf++) {
1160 bi = rgd->rd_bits + buf;
1161 if (rgrp_blk < (bi->bi_start + bi->bi_len) * GFS2_NBBY)
1162 break;
1163 }
1164
1165 gfs2_assert(rgd->rd_sbd, buf < length);
1166
1167 buf_blk = rgrp_blk - bi->bi_start * GFS2_NBBY;
1168 rgrp_blk++;
1169
1170 if (!bi->bi_clone) {
1171 bi->bi_clone = kmalloc(bi->bi_bh->b_size,
1172 GFP_NOFS | __GFP_NOFAIL);
1173 memcpy(bi->bi_clone + bi->bi_offset,
1174 bi->bi_bh->b_data + bi->bi_offset,
1175 bi->bi_len);
1176 }
1177 gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1);
1178 gfs2_setbit(rgd, bi->bi_bh->b_data + bi->bi_offset,
1179 bi->bi_len, buf_blk, new_state);
1180 }
1181
1182 return rgd;
1183}
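
The bi_clone buffer created here is a lazy copy-on-write snapshot: since
rgblk_search() prefers bi_clone when it exists, blocks freed in the current
transaction still look allocated to the allocator until it is safe to reuse
them and gfs2_rgrp_repolish_clones() resyncs the copy. A userspace sketch of
the clone-before-first-write step (names hypothetical):

#include <stdlib.h>
#include <string.h>

struct bmapbuf {
        unsigned char *live;            /* data about to be modified */
        unsigned char *clone;           /* pre-modification snapshot */
        size_t len;
};

static int clone_before_write(struct bmapbuf *b)
{
        if (b->clone)                   /* snapshot already taken */
                return 0;
        b->clone = malloc(b->len);
        if (!b->clone)
                return -1;
        memcpy(b->clone, b->live, b->len);
        return 0;
}
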
1184
1185/**
1186 * gfs2_alloc_data - Allocate a data block
1187 * @ip: the inode to allocate the data block for
1188 *
1189 * Returns: the allocated block
1190 */
1191
1192u64 gfs2_alloc_data(struct gfs2_inode *ip)
1193{
1194 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1195 struct gfs2_alloc *al = &ip->i_alloc;
1196 struct gfs2_rgrpd *rgd = al->al_rgd;
1197 u32 goal, blk;
1198 u64 block;
1199
1200 if (rgrp_contains_block(&rgd->rd_ri, ip->i_di.di_goal_data))
1201 goal = ip->i_di.di_goal_data - rgd->rd_ri.ri_data0;
1202 else
1203 goal = rgd->rd_last_alloc_data;
1204
1205 blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, GFS2_BLKST_USED);
1206 rgd->rd_last_alloc_data = blk;
1207
1208 block = rgd->rd_ri.ri_data0 + blk;
1209 ip->i_di.di_goal_data = block;
1210
1211 gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free);
1212 rgd->rd_rg.rg_free--;
1213
1214 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
1215 gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
1216
1217 al->al_alloced++;
1218
1219 gfs2_statfs_change(sdp, 0, -1, 0);
1220 gfs2_quota_change(ip, +1, ip->i_di.di_uid, ip->i_di.di_gid);
1221
1222 spin_lock(&sdp->sd_rindex_spin);
1223 rgd->rd_free_clone--;
1224 spin_unlock(&sdp->sd_rindex_spin);
1225
1226 return block;
1227}
1228
1229/**
1230 * gfs2_alloc_meta - Allocate a metadata block
1231 * @ip: the inode to allocate the metadata block for
1232 *
1233 * Returns: the allocated block
1234 */
1235
1236u64 gfs2_alloc_meta(struct gfs2_inode *ip)
1237{
1238 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1239 struct gfs2_alloc *al = &ip->i_alloc;
1240 struct gfs2_rgrpd *rgd = al->al_rgd;
1241 u32 goal, blk;
1242 u64 block;
1243
1244 if (rgrp_contains_block(&rgd->rd_ri, ip->i_di.di_goal_meta))
1245 goal = ip->i_di.di_goal_meta - rgd->rd_ri.ri_data0;
1246 else
1247 goal = rgd->rd_last_alloc_meta;
1248
1249 blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, GFS2_BLKST_USED);
1250 rgd->rd_last_alloc_meta = blk;
1251
1252 block = rgd->rd_ri.ri_data0 + blk;
1253 ip->i_di.di_goal_meta = block;
1254
1255 gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free);
1256 rgd->rd_rg.rg_free--;
1257
1258 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
1259 gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
1260
1261 al->al_alloced++;
1262
1263 gfs2_statfs_change(sdp, 0, -1, 0);
1264 gfs2_quota_change(ip, +1, ip->i_di.di_uid, ip->i_di.di_gid);
1265 gfs2_trans_add_unrevoke(sdp, block);
1266
1267 spin_lock(&sdp->sd_rindex_spin);
1268 rgd->rd_free_clone--;
1269 spin_unlock(&sdp->sd_rindex_spin);
1270
1271 return block;
1272}
1273
1274/**
1275 * gfs2_alloc_di - Allocate a dinode
1276 * @dip: the directory that the inode is going in
1277 * @generation: set to the new inode's generation number
1278 * Returns: the block allocated
1279 */
1280
1281u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation)
1282{
1283 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
1284 struct gfs2_alloc *al = &dip->i_alloc;
1285 struct gfs2_rgrpd *rgd = al->al_rgd;
1286 u32 blk;
1287 u64 block;
1288
1289 blk = rgblk_search(rgd, rgd->rd_last_alloc_meta,
1290 GFS2_BLKST_FREE, GFS2_BLKST_DINODE);
1291
1292 rgd->rd_last_alloc_meta = blk;
1293
1294 block = rgd->rd_ri.ri_data0 + blk;
1295
1296 gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free);
1297 rgd->rd_rg.rg_free--;
1298 rgd->rd_rg.rg_dinodes++;
1299 *generation = rgd->rd_rg.rg_igeneration++;
1300 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
1301 gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
1302
1303 al->al_alloced++;
1304
1305 gfs2_statfs_change(sdp, 0, -1, +1);
1306 gfs2_trans_add_unrevoke(sdp, block);
1307
1308 spin_lock(&sdp->sd_rindex_spin);
1309 rgd->rd_free_clone--;
1310 spin_unlock(&sdp->sd_rindex_spin);
1311
1312 return block;
1313}
1314
1315/**
1316 * gfs2_free_data - free a contiguous run of data block(s)
1317 * @ip: the inode these blocks are being freed from
1318 * @bstart: first block of a run of contiguous blocks
1319 * @blen: the length of the block run
1320 *
1321 */
1322
1323void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen)
1324{
1325 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1326 struct gfs2_rgrpd *rgd;
1327
1328 rgd = rgblk_free(sdp, bstart, blen, GFS2_BLKST_FREE);
1329 if (!rgd)
1330 return;
1331
1332 rgd->rd_rg.rg_free += blen;
1333
1334 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
1335 gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
1336
1337 gfs2_trans_add_rg(rgd);
1338
1339 gfs2_statfs_change(sdp, 0, +blen, 0);
1340 gfs2_quota_change(ip, -(s64)blen,
1341 ip->i_di.di_uid, ip->i_di.di_gid);
1342}
1343
1344/**
1345 * gfs2_free_meta - free a contiguous run of metadata block(s)
1346 * @ip: the inode these blocks are being freed from
1347 * @bstart: first block of a run of contiguous blocks
1348 * @blen: the length of the block run
1349 *
1350 */
1351
1352void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen)
1353{
1354 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1355 struct gfs2_rgrpd *rgd;
1356
1357 rgd = rgblk_free(sdp, bstart, blen, GFS2_BLKST_FREE);
1358 if (!rgd)
1359 return;
1360
1361 rgd->rd_rg.rg_free += blen;
1362
1363 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
1364 gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
1365
1366 gfs2_trans_add_rg(rgd);
1367
1368 gfs2_statfs_change(sdp, 0, +blen, 0);
1369 gfs2_quota_change(ip, -(s64)blen, ip->i_di.di_uid, ip->i_di.di_gid);
1370 gfs2_meta_wipe(ip, bstart, blen);
1371}
1372
1373void gfs2_unlink_di(struct inode *inode)
1374{
1375 struct gfs2_inode *ip = GFS2_I(inode);
1376 struct gfs2_sbd *sdp = GFS2_SB(inode);
1377 struct gfs2_rgrpd *rgd;
1378 u64 blkno = ip->i_num.no_addr;
1379
1380 rgd = rgblk_free(sdp, blkno, 1, GFS2_BLKST_UNLINKED);
1381 if (!rgd)
1382 return;
1383 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
1384 gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
1385 gfs2_trans_add_rg(rgd);
1386}
1387
1388static void gfs2_free_uninit_di(struct gfs2_rgrpd *rgd, u64 blkno)
1389{
1390 struct gfs2_sbd *sdp = rgd->rd_sbd;
1391 struct gfs2_rgrpd *tmp_rgd;
1392
1393 tmp_rgd = rgblk_free(sdp, blkno, 1, GFS2_BLKST_FREE);
1394 if (!tmp_rgd)
1395 return;
1396 gfs2_assert_withdraw(sdp, rgd == tmp_rgd);
1397
1398 if (!rgd->rd_rg.rg_dinodes)
1399 gfs2_consist_rgrpd(rgd);
1400 rgd->rd_rg.rg_dinodes--;
1401 rgd->rd_rg.rg_free++;
1402
1403 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
1404 gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
1405
1406 gfs2_statfs_change(sdp, 0, +1, -1);
1407 gfs2_trans_add_rg(rgd);
1408}
1409
1410
1411void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip)
1412{
1413 gfs2_free_uninit_di(rgd, ip->i_num.no_addr);
1414 gfs2_quota_change(ip, -1, ip->i_di.di_uid, ip->i_di.di_gid);
1415 gfs2_meta_wipe(ip, ip->i_num.no_addr, 1);
1416}
1417
1418/**
1419 * gfs2_rlist_add - add a RG to a list of RGs
1420 * @sdp: the filesystem
1421 * @rlist: the list of resource groups
1422 * @block: the block
1423 *
1424 * Figure out what RG a block belongs to and add that RG to the list
1425 *
1426 * FIXME: Don't use NOFAIL
1427 *
1428 */
1429
1430void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist,
1431 u64 block)
1432{
1433 struct gfs2_rgrpd *rgd;
1434 struct gfs2_rgrpd **tmp;
1435 unsigned int new_space;
1436 unsigned int x;
1437
1438 if (gfs2_assert_warn(sdp, !rlist->rl_ghs))
1439 return;
1440
1441 rgd = gfs2_blk2rgrpd(sdp, block);
1442 if (!rgd) {
1443 if (gfs2_consist(sdp))
1444 fs_err(sdp, "block = %llu\n", (unsigned long long)block);
1445 return;
1446 }
1447
1448 for (x = 0; x < rlist->rl_rgrps; x++)
1449 if (rlist->rl_rgd[x] == rgd)
1450 return;
1451
1452 if (rlist->rl_rgrps == rlist->rl_space) {
1453 new_space = rlist->rl_space + 10;
1454
1455 tmp = kcalloc(new_space, sizeof(struct gfs2_rgrpd *),
1456 GFP_NOFS | __GFP_NOFAIL);
1457
1458 if (rlist->rl_rgd) {
1459 memcpy(tmp, rlist->rl_rgd,
1460 rlist->rl_space * sizeof(struct gfs2_rgrpd *));
1461 kfree(rlist->rl_rgd);
1462 }
1463
1464 rlist->rl_space = new_space;
1465 rlist->rl_rgd = tmp;
1466 }
1467
1468 rlist->rl_rgd[rlist->rl_rgrps++] = rgd;
1469}
1470
1471/**
1472 * gfs2_rlist_alloc - all RGs have been added to the rlist, now allocate
1473 * and initialize an array of glock holders for them
1474 * @rlist: the list of resource groups
1475 * @state: the lock state to acquire the RG lock in
1476 * @flags: the modifier flags for the holder structures
1477 *
1478 * FIXME: Don't use NOFAIL
1479 *
1480 */
1481
1482void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state,
1483 int flags)
1484{
1485 unsigned int x;
1486
1487 rlist->rl_ghs = kcalloc(rlist->rl_rgrps, sizeof(struct gfs2_holder),
1488 GFP_NOFS | __GFP_NOFAIL);
1489 for (x = 0; x < rlist->rl_rgrps; x++)
1490 gfs2_holder_init(rlist->rl_rgd[x]->rd_gl,
1491 state, flags,
1492 &rlist->rl_ghs[x]);
1493}
1494
1495/**
1496 * gfs2_rlist_free - free a resource group list
1497 * @rlist: the list of resource groups
1498 *
1499 */
1500
1501void gfs2_rlist_free(struct gfs2_rgrp_list *rlist)
1502{
1503 unsigned int x;
1504
1505 kfree(rlist->rl_rgd);
1506
1507 if (rlist->rl_ghs) {
1508 for (x = 0; x < rlist->rl_rgrps; x++)
1509 gfs2_holder_uninit(&rlist->rl_ghs[x]);
1510 kfree(rlist->rl_ghs);
1511 }
1512}
1513
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
new file mode 100644
index 000000000000..b01e0cfc99b5
--- /dev/null
+++ b/fs/gfs2/rgrp.h
@@ -0,0 +1,69 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __RGRP_DOT_H__
11#define __RGRP_DOT_H__
12
13struct gfs2_rgrpd;
14struct gfs2_sbd;
15struct gfs2_holder;
16
17void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd);
18
19struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk);
20struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp);
21struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd);
22
23void gfs2_clear_rgrpd(struct gfs2_sbd *sdp);
24int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh);
25
26int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd);
27void gfs2_rgrp_bh_hold(struct gfs2_rgrpd *rgd);
28void gfs2_rgrp_bh_put(struct gfs2_rgrpd *rgd);
29
30void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd);
31
32struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip);
33static inline void gfs2_alloc_put(struct gfs2_inode *ip)
34{
35 return; /* So we can see where ip->i_alloc is used */
36}
37
38int gfs2_inplace_reserve_i(struct gfs2_inode *ip,
39 char *file, unsigned int line);
40#define gfs2_inplace_reserve(ip) \
41gfs2_inplace_reserve_i((ip), __FILE__, __LINE__)
42
43void gfs2_inplace_release(struct gfs2_inode *ip);
44
45unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block);
46
47u64 gfs2_alloc_data(struct gfs2_inode *ip);
48u64 gfs2_alloc_meta(struct gfs2_inode *ip);
49u64 gfs2_alloc_di(struct gfs2_inode *ip, u64 *generation);
50
51void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen);
52void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen);
53void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip);
54void gfs2_unlink_di(struct inode *inode);
55
56struct gfs2_rgrp_list {
57 unsigned int rl_rgrps;
58 unsigned int rl_space;
59 struct gfs2_rgrpd **rl_rgd;
60 struct gfs2_holder *rl_ghs;
61};
62
63void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist,
64 u64 block);
65void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state,
66 int flags);
67void gfs2_rlist_free(struct gfs2_rgrp_list *rlist);
68
69#endif /* __RGRP_DOT_H__ */
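
A hypothetical caller, to show the intended pairing of the API above. Error
handling is trimmed, and real callers also hold quota locks and open a
transaction before calling gfs2_alloc_data():

/* Sketch only: example_alloc_one() is not part of GFS2. */
static int example_alloc_one(struct gfs2_inode *ip, u64 *blkno)
{
        struct gfs2_alloc *al = gfs2_alloc_get(ip);
        int error;

        al->al_requested = 1;                   /* worst-case block count */
        error = gfs2_inplace_reserve(ip);       /* picks and locks an rgrp */
        if (error)
                goto out;

        *blkno = gfs2_alloc_data(ip);           /* consume the reservation */

        gfs2_inplace_release(ip);               /* drop rgrp + rindex locks */
out:
        gfs2_alloc_put(ip);
        return error;
}
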
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
new file mode 100644
index 000000000000..6a78b1b32e25
--- /dev/null
+++ b/fs/gfs2/super.c
@@ -0,0 +1,976 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/crc32.h>
16#include <linux/gfs2_ondisk.h>
17#include <linux/bio.h>
18#include <linux/lm_interface.h>
19
20#include "gfs2.h"
21#include "incore.h"
22#include "bmap.h"
23#include "dir.h"
24#include "glock.h"
25#include "glops.h"
26#include "inode.h"
27#include "log.h"
28#include "meta_io.h"
29#include "quota.h"
30#include "recovery.h"
31#include "rgrp.h"
32#include "super.h"
33#include "trans.h"
34#include "util.h"
35
36static const u32 gfs2_old_fs_formats[] = {
37 0
38};
39
40static const u32 gfs2_old_multihost_formats[] = {
41 0
42};
43
44/**
45 * gfs2_tune_init - Fill a gfs2_tune structure with default values
46 * @gt: tune
47 *
48 */
49
50void gfs2_tune_init(struct gfs2_tune *gt)
51{
52 spin_lock_init(&gt->gt_spin);
53
54 gt->gt_ilimit = 100;
55 gt->gt_ilimit_tries = 3;
56 gt->gt_ilimit_min = 1;
57 gt->gt_demote_secs = 300;
58 gt->gt_incore_log_blocks = 1024;
59 gt->gt_log_flush_secs = 60;
60 gt->gt_jindex_refresh_secs = 60;
61 gt->gt_scand_secs = 15;
62 gt->gt_recoverd_secs = 60;
63 gt->gt_logd_secs = 1;
64 gt->gt_quotad_secs = 5;
65 gt->gt_quota_simul_sync = 64;
66 gt->gt_quota_warn_period = 10;
67 gt->gt_quota_scale_num = 1;
68 gt->gt_quota_scale_den = 1;
69 gt->gt_quota_cache_secs = 300;
70 gt->gt_quota_quantum = 60;
71 gt->gt_atime_quantum = 3600;
72 gt->gt_new_files_jdata = 0;
73 gt->gt_new_files_directio = 0;
74 gt->gt_max_atomic_write = 4 << 20;
75 gt->gt_max_readahead = 1 << 18;
76 gt->gt_lockdump_size = 131072;
77 gt->gt_stall_secs = 600;
78 gt->gt_complain_secs = 10;
79 gt->gt_reclaim_limit = 5000;
80 gt->gt_entries_per_readdir = 32;
81 gt->gt_prefetch_secs = 10;
82 gt->gt_greedy_default = HZ / 10;
83 gt->gt_greedy_quantum = HZ / 40;
84 gt->gt_greedy_max = HZ / 4;
85 gt->gt_statfs_quantum = 30;
86 gt->gt_statfs_slow = 0;
87}
88
89/**
90 * gfs2_check_sb - Check superblock
91 * @sdp: the filesystem
92 * @sb: The superblock
93 * @silent: Don't print a message if the check fails
94 *
95 * Checks the version code of the FS is one that we understand how to
96 * read and that the sizes of the various on-disk structures have not
97 * changed.
98 */
99
100int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb *sb, int silent)
101{
102 unsigned int x;
103
104 if (sb->sb_header.mh_magic != GFS2_MAGIC ||
105 sb->sb_header.mh_type != GFS2_METATYPE_SB) {
106 if (!silent)
107 printk(KERN_WARNING "GFS2: not a GFS2 filesystem\n");
108 return -EINVAL;
109 }
110
111 /* If format numbers match exactly, we're done. */
112
113 if (sb->sb_fs_format == GFS2_FORMAT_FS &&
114 sb->sb_multihost_format == GFS2_FORMAT_MULTI)
115 return 0;
116
117 if (sb->sb_fs_format != GFS2_FORMAT_FS) {
118 for (x = 0; gfs2_old_fs_formats[x]; x++)
119 if (gfs2_old_fs_formats[x] == sb->sb_fs_format)
120 break;
121
122 if (!gfs2_old_fs_formats[x]) {
123 printk(KERN_WARNING
124 "GFS2: code version (%u, %u) is incompatible "
125 "with ondisk format (%u, %u)\n",
126 GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
127 sb->sb_fs_format, sb->sb_multihost_format);
128 printk(KERN_WARNING
129 "GFS2: I don't know how to upgrade this FS\n");
130 return -EINVAL;
131 }
132 }
133
134 if (sb->sb_multihost_format != GFS2_FORMAT_MULTI) {
135 for (x = 0; gfs2_old_multihost_formats[x]; x++)
136 if (gfs2_old_multihost_formats[x] ==
137 sb->sb_multihost_format)
138 break;
139
140 if (!gfs2_old_multihost_formats[x]) {
141 printk(KERN_WARNING
142 "GFS2: code version (%u, %u) is incompatible "
143 "with ondisk format (%u, %u)\n",
144 GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
145 sb->sb_fs_format, sb->sb_multihost_format);
146 printk(KERN_WARNING
147 "GFS2: I don't know how to upgrade this FS\n");
148 return -EINVAL;
149 }
150 }
151
152 if (!sdp->sd_args.ar_upgrade) {
153 printk(KERN_WARNING
154 "GFS2: code version (%u, %u) is incompatible "
155 "with ondisk format (%u, %u)\n",
156 GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
157 sb->sb_fs_format, sb->sb_multihost_format);
158 printk(KERN_INFO
159 "GFS2: Use the \"upgrade\" mount option to upgrade "
160 "the FS\n");
161 printk(KERN_INFO "GFS2: See the manual for more details\n");
162 return -EINVAL;
163 }
164
165 return 0;
166}
167
168
169static int end_bio_io_page(struct bio *bio, unsigned int bytes_done, int error)
170{
171 struct page *page = bio->bi_private;
172 if (bio->bi_size)
173 return 1;
174
175 if (!error)
176 SetPageUptodate(page);
177 else
178 printk(KERN_WARNING "gfs2: error %d reading superblock\n", error);
179 unlock_page(page);
180 return 0;
181}
182
183struct page *gfs2_read_super(struct super_block *sb, sector_t sector)
184{
185 struct page *page;
186 struct bio *bio;
187
188 page = alloc_page(GFP_KERNEL);
189 if (unlikely(!page))
190 return NULL;
191
192 ClearPageUptodate(page);
193 ClearPageDirty(page);
194 lock_page(page);
195
196 bio = bio_alloc(GFP_KERNEL, 1);
197 if (unlikely(!bio)) {
198 __free_page(page);
199 return NULL;
200 }
201
202 bio->bi_sector = sector;
203 bio->bi_bdev = sb->s_bdev;
204 bio_add_page(bio, page, PAGE_SIZE, 0);
205
206 bio->bi_end_io = end_bio_io_page;
207 bio->bi_private = page;
208 submit_bio(READ_SYNC | (1 << BIO_RW_META), bio);
209 wait_on_page_locked(page);
210 bio_put(bio);
211 if (!PageUptodate(page)) {
212 __free_page(page);
213 return NULL;
214 }
215 return page;
216}
217
218/**
219 * gfs2_read_sb - Read super block
220 * @sdp: The GFS2 superblock
221 * @gl: the glock for the superblock (assumed to be held)
222 * @silent: Don't print message if mount fails
223 *
224 */
225
226int gfs2_read_sb(struct gfs2_sbd *sdp, struct gfs2_glock *gl, int silent)
227{
228 u32 hash_blocks, ind_blocks, leaf_blocks;
229 u32 tmp_blocks;
230 unsigned int x;
231 int error;
232 struct page *page;
233 char *sb;
234
235 page = gfs2_read_super(sdp->sd_vfs, GFS2_SB_ADDR >> sdp->sd_fsb2bb_shift);
236 if (!page) {
237 if (!silent)
238 fs_err(sdp, "can't read superblock\n");
239 return -EIO;
240 }
241 sb = kmap(page);
242 gfs2_sb_in(&sdp->sd_sb, sb);
243 kunmap(page);
244 __free_page(page);
245
246 error = gfs2_check_sb(sdp, &sdp->sd_sb, silent);
247 if (error)
248 return error;
249
250 sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift -
251 GFS2_BASIC_BLOCK_SHIFT;
252 sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift;
253 sdp->sd_diptrs = (sdp->sd_sb.sb_bsize -
254 sizeof(struct gfs2_dinode)) / sizeof(u64);
255 sdp->sd_inptrs = (sdp->sd_sb.sb_bsize -
256 sizeof(struct gfs2_meta_header)) / sizeof(u64);
257 sdp->sd_jbsize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_meta_header);
258 sdp->sd_hash_bsize = sdp->sd_sb.sb_bsize / 2;
259 sdp->sd_hash_bsize_shift = sdp->sd_sb.sb_bsize_shift - 1;
260 sdp->sd_hash_ptrs = sdp->sd_hash_bsize / sizeof(u64);
261 sdp->sd_qc_per_block = (sdp->sd_sb.sb_bsize -
262 sizeof(struct gfs2_meta_header)) /
263 sizeof(struct gfs2_quota_change);
264
265	/* Compute maximum reservation required to add an entry to a directory */
266
267 hash_blocks = DIV_ROUND_UP(sizeof(u64) * (1 << GFS2_DIR_MAX_DEPTH),
268 sdp->sd_jbsize);
269
270 ind_blocks = 0;
271 for (tmp_blocks = hash_blocks; tmp_blocks > sdp->sd_diptrs;) {
272 tmp_blocks = DIV_ROUND_UP(tmp_blocks, sdp->sd_inptrs);
273 ind_blocks += tmp_blocks;
274 }
275
276 leaf_blocks = 2 + GFS2_DIR_MAX_DEPTH;
277
278 sdp->sd_max_dirres = hash_blocks + ind_blocks + leaf_blocks;
279
280 sdp->sd_heightsize[0] = sdp->sd_sb.sb_bsize -
281 sizeof(struct gfs2_dinode);
282 sdp->sd_heightsize[1] = sdp->sd_sb.sb_bsize * sdp->sd_diptrs;
283 for (x = 2;; x++) {
284 u64 space, d;
285 u32 m;
286
287 space = sdp->sd_heightsize[x - 1] * sdp->sd_inptrs;
288 d = space;
289 m = do_div(d, sdp->sd_inptrs);
290
291 if (d != sdp->sd_heightsize[x - 1] || m)
292 break;
293 sdp->sd_heightsize[x] = space;
294 }
295 sdp->sd_max_height = x;
296 gfs2_assert(sdp, sdp->sd_max_height <= GFS2_MAX_META_HEIGHT);
297
298 sdp->sd_jheightsize[0] = sdp->sd_sb.sb_bsize -
299 sizeof(struct gfs2_dinode);
300 sdp->sd_jheightsize[1] = sdp->sd_jbsize * sdp->sd_diptrs;
301 for (x = 2;; x++) {
302 u64 space, d;
303 u32 m;
304
305 space = sdp->sd_jheightsize[x - 1] * sdp->sd_inptrs;
306 d = space;
307 m = do_div(d, sdp->sd_inptrs);
308
309 if (d != sdp->sd_jheightsize[x - 1] || m)
310 break;
311 sdp->sd_jheightsize[x] = space;
312 }
313 sdp->sd_max_jheight = x;
314 gfs2_assert(sdp, sdp->sd_max_jheight <= GFS2_MAX_META_HEIGHT);
315
316 return 0;
317}
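
The two loops above size the metadata tree by repeated multiplication until a
u64 would overflow. A standalone sketch of the same calculation; the on-disk
struct sizes are illustrative assumptions for a 4096-byte block, not values
taken from this code:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t bsize = 4096;
        uint64_t dinode_sz = 232;       /* assumed dinode header size */
        uint64_t mh_sz = 24;            /* assumed meta header size */
        uint64_t diptrs = (bsize - dinode_sz) / 8;  /* ptrs in the dinode */
        uint64_t inptrs = (bsize - mh_sz) / 8;      /* ptrs per indirect */
        uint64_t size = bsize * diptrs;             /* bytes at height 1 */
        unsigned int height = 1;

        printf("height %u: %llu bytes\n", height, (unsigned long long)size);
        while (size <= UINT64_MAX / inptrs) {   /* same overflow-test idea */
                size *= inptrs;
                printf("height %u: %llu bytes\n", ++height,
                       (unsigned long long)size);
        }
        return 0;
}
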
318
319/**
320 * gfs2_jindex_hold - Grab a lock on the jindex
321 * @sdp: The GFS2 superblock
322 * @ji_gh: the holder for the jindex glock
323 *
324 * This is very similar to the gfs2_rindex_hold() function, except that
325 * in general we hold the jindex lock for longer periods of time and
326 * grab it far less frequently than the rgrp lock.
327 *
328 * Returns: errno
329 */
330
331int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh)
332{
333 struct gfs2_inode *dip = GFS2_I(sdp->sd_jindex);
334 struct qstr name;
335 char buf[20];
336 struct gfs2_jdesc *jd;
337 int error;
338
339 name.name = buf;
340
341 mutex_lock(&sdp->sd_jindex_mutex);
342
343 for (;;) {
344 error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED,
345 GL_LOCAL_EXCL, ji_gh);
346 if (error)
347 break;
348
349 name.len = sprintf(buf, "journal%u", sdp->sd_journals);
350 name.hash = gfs2_disk_hash(name.name, name.len);
351
352 error = gfs2_dir_search(sdp->sd_jindex, &name, NULL, NULL);
353 if (error == -ENOENT) {
354 error = 0;
355 break;
356 }
357
358 gfs2_glock_dq_uninit(ji_gh);
359
360 if (error)
361 break;
362
363 error = -ENOMEM;
364 jd = kzalloc(sizeof(struct gfs2_jdesc), GFP_KERNEL);
365 if (!jd)
366 break;
367
368 jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1, NULL);
369 if (!jd->jd_inode || IS_ERR(jd->jd_inode)) {
370 if (!jd->jd_inode)
371 error = -ENOENT;
372 else
373 error = PTR_ERR(jd->jd_inode);
374 kfree(jd);
375 break;
376 }
377
378 spin_lock(&sdp->sd_jindex_spin);
379 jd->jd_jid = sdp->sd_journals++;
380 list_add_tail(&jd->jd_list, &sdp->sd_jindex_list);
381 spin_unlock(&sdp->sd_jindex_spin);
382 }
383
384 mutex_unlock(&sdp->sd_jindex_mutex);
385
386 return error;
387}
388
389/**
390 * gfs2_jindex_free - Clear all the journal index information
391 * @sdp: The GFS2 superblock
392 *
393 */
394
395void gfs2_jindex_free(struct gfs2_sbd *sdp)
396{
397 struct list_head list;
398 struct gfs2_jdesc *jd;
399
400 spin_lock(&sdp->sd_jindex_spin);
401 list_add(&list, &sdp->sd_jindex_list);
402 list_del_init(&sdp->sd_jindex_list);
403 sdp->sd_journals = 0;
404 spin_unlock(&sdp->sd_jindex_spin);
405
406 while (!list_empty(&list)) {
407 jd = list_entry(list.next, struct gfs2_jdesc, jd_list);
408 list_del(&jd->jd_list);
409 iput(jd->jd_inode);
410 kfree(jd);
411 }
412}
413
414static struct gfs2_jdesc *jdesc_find_i(struct list_head *head, unsigned int jid)
415{
416 struct gfs2_jdesc *jd;
417 int found = 0;
418
419 list_for_each_entry(jd, head, jd_list) {
420 if (jd->jd_jid == jid) {
421 found = 1;
422 break;
423 }
424 }
425
426 if (!found)
427 jd = NULL;
428
429 return jd;
430}
431
432struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid)
433{
434 struct gfs2_jdesc *jd;
435
436 spin_lock(&sdp->sd_jindex_spin);
437 jd = jdesc_find_i(&sdp->sd_jindex_list, jid);
438 spin_unlock(&sdp->sd_jindex_spin);
439
440 return jd;
441}
442
443void gfs2_jdesc_make_dirty(struct gfs2_sbd *sdp, unsigned int jid)
444{
445 struct gfs2_jdesc *jd;
446
447 spin_lock(&sdp->sd_jindex_spin);
448 jd = jdesc_find_i(&sdp->sd_jindex_list, jid);
449 if (jd)
450 jd->jd_dirty = 1;
451 spin_unlock(&sdp->sd_jindex_spin);
452}
453
454struct gfs2_jdesc *gfs2_jdesc_find_dirty(struct gfs2_sbd *sdp)
455{
456 struct gfs2_jdesc *jd;
457 int found = 0;
458
459 spin_lock(&sdp->sd_jindex_spin);
460
461 list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
462 if (jd->jd_dirty) {
463 jd->jd_dirty = 0;
464 found = 1;
465 break;
466 }
467 }
468 spin_unlock(&sdp->sd_jindex_spin);
469
470 if (!found)
471 jd = NULL;
472
473 return jd;
474}
475
476int gfs2_jdesc_check(struct gfs2_jdesc *jd)
477{
478 struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
479 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
480 int ar;
481 int error;
482
483 if (ip->i_di.di_size < (8 << 20) || ip->i_di.di_size > (1 << 30) ||
484 (ip->i_di.di_size & (sdp->sd_sb.sb_bsize - 1))) {
485 gfs2_consist_inode(ip);
486 return -EIO;
487 }
488 jd->jd_blocks = ip->i_di.di_size >> sdp->sd_sb.sb_bsize_shift;
489
490 error = gfs2_write_alloc_required(ip, 0, ip->i_di.di_size, &ar);
491 if (!error && ar) {
492 gfs2_consist_inode(ip);
493 error = -EIO;
494 }
495
496 return error;
497}
498
499/**
500 * gfs2_make_fs_rw - Turn a Read-Only FS into a Read-Write one
501 * @sdp: the filesystem
502 *
503 * Returns: errno
504 */
505
506int gfs2_make_fs_rw(struct gfs2_sbd *sdp)
507{
508 struct gfs2_inode *ip = GFS2_I(sdp->sd_jdesc->jd_inode);
509 struct gfs2_glock *j_gl = ip->i_gl;
510 struct gfs2_holder t_gh;
511 struct gfs2_log_header head;
512 int error;
513
514 error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED,
515 GL_LOCAL_EXCL, &t_gh);
516 if (error)
517 return error;
518
519 gfs2_meta_cache_flush(ip);
520 j_gl->gl_ops->go_inval(j_gl, DIO_METADATA | DIO_DATA);
521
522 error = gfs2_find_jhead(sdp->sd_jdesc, &head);
523 if (error)
524 goto fail;
525
526 if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) {
527 gfs2_consist(sdp);
528 error = -EIO;
529 goto fail;
530 }
531
532	/* Initialize the log head from what was found in the journal */
533 sdp->sd_log_sequence = head.lh_sequence + 1;
534 gfs2_log_pointers_init(sdp, head.lh_blkno);
535
536 error = gfs2_quota_init(sdp);
537 if (error)
538 goto fail;
539
540 set_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
541
542 gfs2_glock_dq_uninit(&t_gh);
543
544 return 0;
545
546fail:
547 t_gh.gh_flags |= GL_NOCACHE;
548 gfs2_glock_dq_uninit(&t_gh);
549
550 return error;
551}
552
553/**
554 * gfs2_make_fs_ro - Turn a Read-Write FS into a Read-Only one
555 * @sdp: the filesystem
556 *
557 * Returns: errno
558 */
559
560int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
561{
562 struct gfs2_holder t_gh;
563 int error;
564
565 gfs2_quota_sync(sdp);
566 gfs2_statfs_sync(sdp);
567
568 error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED,
569 GL_LOCAL_EXCL | GL_NOCACHE,
570 &t_gh);
571 if (error && !test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
572 return error;
573
574 gfs2_meta_syncfs(sdp);
575 gfs2_log_shutdown(sdp);
576
577 clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
578
579 if (t_gh.gh_gl)
580 gfs2_glock_dq_uninit(&t_gh);
581
582 gfs2_quota_cleanup(sdp);
583
584 return error;
585}
586
587int gfs2_statfs_init(struct gfs2_sbd *sdp)
588{
589 struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
590 struct gfs2_statfs_change *m_sc = &sdp->sd_statfs_master;
591 struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode);
592 struct gfs2_statfs_change *l_sc = &sdp->sd_statfs_local;
593 struct buffer_head *m_bh, *l_bh;
594 struct gfs2_holder gh;
595 int error;
596
597 error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, GL_NOCACHE,
598 &gh);
599 if (error)
600 return error;
601
602 error = gfs2_meta_inode_buffer(m_ip, &m_bh);
603 if (error)
604 goto out;
605
606 if (sdp->sd_args.ar_spectator) {
607 spin_lock(&sdp->sd_statfs_spin);
608 gfs2_statfs_change_in(m_sc, m_bh->b_data +
609 sizeof(struct gfs2_dinode));
610 spin_unlock(&sdp->sd_statfs_spin);
611 } else {
612 error = gfs2_meta_inode_buffer(l_ip, &l_bh);
613 if (error)
614 goto out_m_bh;
615
616 spin_lock(&sdp->sd_statfs_spin);
617 gfs2_statfs_change_in(m_sc, m_bh->b_data +
618 sizeof(struct gfs2_dinode));
619 gfs2_statfs_change_in(l_sc, l_bh->b_data +
620 sizeof(struct gfs2_dinode));
621 spin_unlock(&sdp->sd_statfs_spin);
622
623 brelse(l_bh);
624 }
625
626out_m_bh:
627 brelse(m_bh);
628out:
629 gfs2_glock_dq_uninit(&gh);
630	return error;
631}
632
633void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free,
634 s64 dinodes)
635{
636 struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode);
637 struct gfs2_statfs_change *l_sc = &sdp->sd_statfs_local;
638 struct buffer_head *l_bh;
639 int error;
640
641 error = gfs2_meta_inode_buffer(l_ip, &l_bh);
642 if (error)
643 return;
644
645 mutex_lock(&sdp->sd_statfs_mutex);
646 gfs2_trans_add_bh(l_ip->i_gl, l_bh, 1);
647 mutex_unlock(&sdp->sd_statfs_mutex);
648
649 spin_lock(&sdp->sd_statfs_spin);
650 l_sc->sc_total += total;
651 l_sc->sc_free += free;
652 l_sc->sc_dinodes += dinodes;
653 gfs2_statfs_change_out(l_sc, l_bh->b_data + sizeof(struct gfs2_dinode));
654 spin_unlock(&sdp->sd_statfs_spin);
655
656 brelse(l_bh);
657}
658
659int gfs2_statfs_sync(struct gfs2_sbd *sdp)
660{
661 struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
662 struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode);
663 struct gfs2_statfs_change *m_sc = &sdp->sd_statfs_master;
664 struct gfs2_statfs_change *l_sc = &sdp->sd_statfs_local;
665 struct gfs2_holder gh;
666 struct buffer_head *m_bh, *l_bh;
667 int error;
668
669 error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, GL_NOCACHE,
670 &gh);
671 if (error)
672 return error;
673
674 error = gfs2_meta_inode_buffer(m_ip, &m_bh);
675 if (error)
676 goto out;
677
678 spin_lock(&sdp->sd_statfs_spin);
679 gfs2_statfs_change_in(m_sc, m_bh->b_data +
680 sizeof(struct gfs2_dinode));
681 if (!l_sc->sc_total && !l_sc->sc_free && !l_sc->sc_dinodes) {
682 spin_unlock(&sdp->sd_statfs_spin);
683 goto out_bh;
684 }
685 spin_unlock(&sdp->sd_statfs_spin);
686
687 error = gfs2_meta_inode_buffer(l_ip, &l_bh);
688 if (error)
689 goto out_bh;
690
691 error = gfs2_trans_begin(sdp, 2 * RES_DINODE, 0);
692 if (error)
693 goto out_bh2;
694
695 mutex_lock(&sdp->sd_statfs_mutex);
696 gfs2_trans_add_bh(l_ip->i_gl, l_bh, 1);
697 mutex_unlock(&sdp->sd_statfs_mutex);
698
699 spin_lock(&sdp->sd_statfs_spin);
700 m_sc->sc_total += l_sc->sc_total;
701 m_sc->sc_free += l_sc->sc_free;
702 m_sc->sc_dinodes += l_sc->sc_dinodes;
703 memset(l_sc, 0, sizeof(struct gfs2_statfs_change));
704 memset(l_bh->b_data + sizeof(struct gfs2_dinode),
705 0, sizeof(struct gfs2_statfs_change));
706 spin_unlock(&sdp->sd_statfs_spin);
707
708 gfs2_trans_add_bh(m_ip->i_gl, m_bh, 1);
709 gfs2_statfs_change_out(m_sc, m_bh->b_data + sizeof(struct gfs2_dinode));
710
711 gfs2_trans_end(sdp);
712
713out_bh2:
714 brelse(l_bh);
715out_bh:
716 brelse(m_bh);
717out:
718 gfs2_glock_dq_uninit(&gh);
719 return error;
720}
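
gfs2_statfs_change() and gfs2_statfs_sync() split statfs into a per-node
delta file and a shared master: changes land in the local record under cheap
local locking, and the deltas are folded into the master only when the
expensive exclusive (cluster-wide) lock is taken anyway. A userspace sketch
of that split (pthreads; names hypothetical):

#include <pthread.h>

struct counts {
        long total, free, dinodes;
};

static struct counts master;            /* shared, cluster-wide lock */
static struct counts local_delta;       /* this node only */
static pthread_mutex_t master_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t local_lock = PTHREAD_MUTEX_INITIALIZER;

static void statfs_change(long total, long free, long dinodes)
{
        pthread_mutex_lock(&local_lock);        /* cheap: local only */
        local_delta.total += total;
        local_delta.free += free;
        local_delta.dinodes += dinodes;
        pthread_mutex_unlock(&local_lock);
}

static void statfs_sync(void)
{
        struct counts zero = { 0, 0, 0 };

        pthread_mutex_lock(&master_lock);       /* expensive: shared */
        pthread_mutex_lock(&local_lock);
        master.total += local_delta.total;
        master.free += local_delta.free;
        master.dinodes += local_delta.dinodes;
        local_delta = zero;                     /* deltas now in master */
        pthread_mutex_unlock(&local_lock);
        pthread_mutex_unlock(&master_lock);
}
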
721
722/**
723 * gfs2_statfs_i - Do a statfs
724 * @sdp: the filesystem
725 * @sc: the statfs structure to fill in
726 *
727 * Returns: errno
728 */
729
730int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change *sc)
731{
732 struct gfs2_statfs_change *m_sc = &sdp->sd_statfs_master;
733 struct gfs2_statfs_change *l_sc = &sdp->sd_statfs_local;
734
735 spin_lock(&sdp->sd_statfs_spin);
736
737 *sc = *m_sc;
738 sc->sc_total += l_sc->sc_total;
739 sc->sc_free += l_sc->sc_free;
740 sc->sc_dinodes += l_sc->sc_dinodes;
741
742 spin_unlock(&sdp->sd_statfs_spin);
743
744 if (sc->sc_free < 0)
745 sc->sc_free = 0;
746 if (sc->sc_free > sc->sc_total)
747 sc->sc_free = sc->sc_total;
748 if (sc->sc_dinodes < 0)
749 sc->sc_dinodes = 0;
750
751 return 0;
752}
753
754/**
755 * statfs_slow_fill - fill in the sc for a given RG
756 * @rgd: the RG
757 * @sc: the sc structure
758 *
759 * Returns: 0 on success
760 */
761
762static int statfs_slow_fill(struct gfs2_rgrpd *rgd,
763 struct gfs2_statfs_change *sc)
764{
765 gfs2_rgrp_verify(rgd);
766 sc->sc_total += rgd->rd_ri.ri_data;
767 sc->sc_free += rgd->rd_rg.rg_free;
768 sc->sc_dinodes += rgd->rd_rg.rg_dinodes;
769 return 0;
770}
771
772/**
773 * gfs2_statfs_slow - Stat a filesystem using asynchronous locking
774 * @sdp: the filesystem
775 * @sc: the sc info that will be returned
776 *
777 * Any error (other than a signal) will cause this routine to fall back
778 * to the synchronous version.
779 *
780 * FIXME: This really shouldn't busy wait like this.
781 *
782 * Returns: errno
783 */
784
785int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change *sc)
786{
787 struct gfs2_holder ri_gh;
788 struct gfs2_rgrpd *rgd_next;
789 struct gfs2_holder *gha, *gh;
790 unsigned int slots = 64;
791 unsigned int x;
792 int done;
793 int error = 0, err;
794
795 memset(sc, 0, sizeof(struct gfs2_statfs_change));
796 gha = kcalloc(slots, sizeof(struct gfs2_holder), GFP_KERNEL);
797 if (!gha)
798 return -ENOMEM;
799
800 error = gfs2_rindex_hold(sdp, &ri_gh);
801 if (error)
802 goto out;
803
804 rgd_next = gfs2_rgrpd_get_first(sdp);
805
806 for (;;) {
807 done = 1;
808
809 for (x = 0; x < slots; x++) {
810 gh = gha + x;
811
812 if (gh->gh_gl && gfs2_glock_poll(gh)) {
813 err = gfs2_glock_wait(gh);
814 if (err) {
815 gfs2_holder_uninit(gh);
816 error = err;
817 } else {
818 if (!error)
819 error = statfs_slow_fill(
820 gh->gh_gl->gl_object, sc);
821 gfs2_glock_dq_uninit(gh);
822 }
823 }
824
825 if (gh->gh_gl)
826 done = 0;
827 else if (rgd_next && !error) {
828 error = gfs2_glock_nq_init(rgd_next->rd_gl,
829 LM_ST_SHARED,
830 GL_ASYNC,
831 gh);
832 rgd_next = gfs2_rgrpd_get_next(rgd_next);
833 done = 0;
834 }
835
836 if (signal_pending(current))
837 error = -ERESTARTSYS;
838 }
839
840 if (done)
841 break;
842
843 yield();
844 }
845
846 gfs2_glock_dq_uninit(&ri_gh);
847
848out:
849 kfree(gha);
850 return error;
851}
852
853struct lfcc {
854 struct list_head list;
855 struct gfs2_holder gh;
856};
857
858/**
859 * gfs2_lock_fs_check_clean - Stop all writes to the FS and check that all
860 * journals are clean
861 * @sdp: the file system
862 * @t_gh: the hold on the transaction lock
863 *
864 * The transaction lock is taken in the deferred state.
865 * Returns: errno
866 */
867
868static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp,
869 struct gfs2_holder *t_gh)
870{
871 struct gfs2_inode *ip;
872 struct gfs2_holder ji_gh;
873 struct gfs2_jdesc *jd;
874 struct lfcc *lfcc;
875 LIST_HEAD(list);
876 struct gfs2_log_header lh;
877 int error;
878
879 error = gfs2_jindex_hold(sdp, &ji_gh);
880 if (error)
881 return error;
882
883 list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
884 lfcc = kmalloc(sizeof(struct lfcc), GFP_KERNEL);
885 if (!lfcc) {
886 error = -ENOMEM;
887 goto out;
888 }
889 ip = GFS2_I(jd->jd_inode);
890 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &lfcc->gh);
891 if (error) {
892 kfree(lfcc);
893 goto out;
894 }
895 list_add(&lfcc->list, &list);
896 }
897
898 error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_DEFERRED,
899 LM_FLAG_PRIORITY | GL_NOCACHE,
900 t_gh);
901
902 list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
903 error = gfs2_jdesc_check(jd);
904 if (error)
905 break;
906 error = gfs2_find_jhead(jd, &lh);
907 if (error)
908 break;
909 if (!(lh.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) {
910 error = -EBUSY;
911 break;
912 }
913 }
914
915 if (error)
916 gfs2_glock_dq_uninit(t_gh);
917
918out:
919 while (!list_empty(&list)) {
920 lfcc = list_entry(list.next, struct lfcc, list);
921 list_del(&lfcc->list);
922 gfs2_glock_dq_uninit(&lfcc->gh);
923 kfree(lfcc);
924 }
925 gfs2_glock_dq_uninit(&ji_gh);
926 return error;
927}
928
929/**
930 * gfs2_freeze_fs - freezes the file system
931 * @sdp: the file system
932 *
933 * This function flushes data and metadata for all machines by
934 * acquiring the transaction log exclusively. All journals are
935 * ensured to be in a clean state as well.
936 *
937 * Returns: errno
938 */
939
940int gfs2_freeze_fs(struct gfs2_sbd *sdp)
941{
942 int error = 0;
943
944 mutex_lock(&sdp->sd_freeze_lock);
945
946 if (!sdp->sd_freeze_count++) {
947 error = gfs2_lock_fs_check_clean(sdp, &sdp->sd_freeze_gh);
948 if (error)
949 sdp->sd_freeze_count--;
950 }
951
952 mutex_unlock(&sdp->sd_freeze_lock);
953
954 return error;
955}
956
957/**
958 * gfs2_unfreeze_fs - unfreezes the file system
959 * @sdp: the file system
960 *
961 * This function allows the file system to proceed by unlocking
962 * the exclusively held transaction lock. Other GFS2 nodes are
963 * now free to acquire the lock shared and go on with their lives.
964 *
965 */
966
967void gfs2_unfreeze_fs(struct gfs2_sbd *sdp)
968{
969 mutex_lock(&sdp->sd_freeze_lock);
970
971 if (sdp->sd_freeze_count && !--sdp->sd_freeze_count)
972 gfs2_glock_dq_uninit(&sdp->sd_freeze_gh);
973
974 mutex_unlock(&sdp->sd_freeze_lock);
975}
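
The freeze count above in a standalone sketch: nested freeze requests stack,
and only the 0 -> 1 transition (or the matching final unfreeze) does the real
work. The do_freeze()/do_unfreeze() callbacks stand in for
gfs2_lock_fs_check_clean() and the glock release:

#include <pthread.h>

static pthread_mutex_t freeze_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned int freeze_count;

static int freeze(int (*do_freeze)(void))
{
        int error = 0;

        pthread_mutex_lock(&freeze_lock);
        if (!freeze_count++) {
                error = do_freeze();
                if (error)
                        freeze_count--;         /* undo on failure */
        }
        pthread_mutex_unlock(&freeze_lock);
        return error;
}

static void unfreeze(void (*do_unfreeze)(void))
{
        pthread_mutex_lock(&freeze_lock);
        if (freeze_count && !--freeze_count)
                do_unfreeze();
        pthread_mutex_unlock(&freeze_lock);
}
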
976
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h
new file mode 100644
index 000000000000..5bb443ae0f59
--- /dev/null
+++ b/fs/gfs2/super.h
@@ -0,0 +1,55 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __SUPER_DOT_H__
11#define __SUPER_DOT_H__
12
13#include "incore.h"
14
15void gfs2_tune_init(struct gfs2_tune *gt);
16
17int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb *sb, int silent);
18int gfs2_read_sb(struct gfs2_sbd *sdp, struct gfs2_glock *gl, int silent);
19struct page *gfs2_read_super(struct super_block *sb, sector_t sector);
20
21static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp)
22{
23 unsigned int x;
24 spin_lock(&sdp->sd_jindex_spin);
25 x = sdp->sd_journals;
26 spin_unlock(&sdp->sd_jindex_spin);
27 return x;
28}
29
30int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh);
31void gfs2_jindex_free(struct gfs2_sbd *sdp);
32
33struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid);
34void gfs2_jdesc_make_dirty(struct gfs2_sbd *sdp, unsigned int jid);
35struct gfs2_jdesc *gfs2_jdesc_find_dirty(struct gfs2_sbd *sdp);
36int gfs2_jdesc_check(struct gfs2_jdesc *jd);
37
38int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename,
39 struct gfs2_inode **ipp);
40
41int gfs2_make_fs_rw(struct gfs2_sbd *sdp);
42int gfs2_make_fs_ro(struct gfs2_sbd *sdp);
43
44int gfs2_statfs_init(struct gfs2_sbd *sdp);
45void gfs2_statfs_change(struct gfs2_sbd *sdp,
46 s64 total, s64 free, s64 dinodes);
47int gfs2_statfs_sync(struct gfs2_sbd *sdp);
48int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change *sc);
49int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change *sc);
50
51int gfs2_freeze_fs(struct gfs2_sbd *sdp);
52void gfs2_unfreeze_fs(struct gfs2_sbd *sdp);
53
54#endif /* __SUPER_DOT_H__ */
55
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
new file mode 100644
index 000000000000..0e0ec988f731
--- /dev/null
+++ b/fs/gfs2/sys.c
@@ -0,0 +1,583 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/module.h>
16#include <linux/kobject.h>
17#include <linux/gfs2_ondisk.h>
18#include <linux/lm_interface.h>
19#include <asm/uaccess.h>
20
21#include "gfs2.h"
22#include "incore.h"
23#include "lm.h"
24#include "sys.h"
25#include "super.h"
26#include "glock.h"
27#include "quota.h"
28#include "util.h"
29
30char *gfs2_sys_margs;
31spinlock_t gfs2_sys_margs_lock;
32
33static ssize_t id_show(struct gfs2_sbd *sdp, char *buf)
34{
35 return snprintf(buf, PAGE_SIZE, "%s\n", sdp->sd_vfs->s_id);
36}
37
38static ssize_t fsname_show(struct gfs2_sbd *sdp, char *buf)
39{
40 return snprintf(buf, PAGE_SIZE, "%s\n", sdp->sd_fsname);
41}
42
43static ssize_t freeze_show(struct gfs2_sbd *sdp, char *buf)
44{
45 unsigned int count;
46
47 mutex_lock(&sdp->sd_freeze_lock);
48 count = sdp->sd_freeze_count;
49 mutex_unlock(&sdp->sd_freeze_lock);
50
51 return snprintf(buf, PAGE_SIZE, "%u\n", count);
52}
53
54static ssize_t freeze_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
55{
56 ssize_t ret = len;
57 int error = 0;
58 int n = simple_strtol(buf, NULL, 0);
59
60 if (!capable(CAP_SYS_ADMIN))
61 return -EACCES;
62
63 switch (n) {
64 case 0:
65 gfs2_unfreeze_fs(sdp);
66 break;
67 case 1:
68 error = gfs2_freeze_fs(sdp);
69 break;
70 default:
71 ret = -EINVAL;
72 }
73
74 if (error)
75	fs_warn(sdp, "freeze %d error %d\n", n, error);
76
77 return ret;
78}
79
80static ssize_t withdraw_show(struct gfs2_sbd *sdp, char *buf)
81{
82 unsigned int b = test_bit(SDF_SHUTDOWN, &sdp->sd_flags);
83 return snprintf(buf, PAGE_SIZE, "%u\n", b);
84}
85
86static ssize_t withdraw_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
87{
88 if (!capable(CAP_SYS_ADMIN))
89 return -EACCES;
90
91 if (simple_strtol(buf, NULL, 0) != 1)
92 return -EINVAL;
93
94 gfs2_lm_withdraw(sdp,
95 "GFS2: fsid=%s: withdrawing from cluster at user's request\n",
96 sdp->sd_fsname);
97 return len;
98}
99
100static ssize_t statfs_sync_store(struct gfs2_sbd *sdp, const char *buf,
101 size_t len)
102{
103 if (!capable(CAP_SYS_ADMIN))
104 return -EACCES;
105
106 if (simple_strtol(buf, NULL, 0) != 1)
107 return -EINVAL;
108
109 gfs2_statfs_sync(sdp);
110 return len;
111}
112
113static ssize_t shrink_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
114{
115 if (!capable(CAP_SYS_ADMIN))
116 return -EACCES;
117
118 if (simple_strtol(buf, NULL, 0) != 1)
119 return -EINVAL;
120
121 gfs2_gl_hash_clear(sdp, NO_WAIT);
122 return len;
123}
124
125static ssize_t quota_sync_store(struct gfs2_sbd *sdp, const char *buf,
126 size_t len)
127{
128 if (!capable(CAP_SYS_ADMIN))
129 return -EACCES;
130
131 if (simple_strtol(buf, NULL, 0) != 1)
132 return -EINVAL;
133
134 gfs2_quota_sync(sdp);
135 return len;
136}
137
138static ssize_t quota_refresh_user_store(struct gfs2_sbd *sdp, const char *buf,
139 size_t len)
140{
141 u32 id;
142
143 if (!capable(CAP_SYS_ADMIN))
144 return -EACCES;
145
146 id = simple_strtoul(buf, NULL, 0);
147
148 gfs2_quota_refresh(sdp, 1, id);
149 return len;
150}
151
152static ssize_t quota_refresh_group_store(struct gfs2_sbd *sdp, const char *buf,
153 size_t len)
154{
155 u32 id;
156
157 if (!capable(CAP_SYS_ADMIN))
158 return -EACCES;
159
160 id = simple_strtoul(buf, NULL, 0);
161
162 gfs2_quota_refresh(sdp, 0, id);
163 return len;
164}
165
166struct gfs2_attr {
167 struct attribute attr;
168 ssize_t (*show)(struct gfs2_sbd *, char *);
169 ssize_t (*store)(struct gfs2_sbd *, const char *, size_t);
170};
171
172#define GFS2_ATTR(name, mode, show, store) \
173static struct gfs2_attr gfs2_attr_##name = __ATTR(name, mode, show, store)
174
175GFS2_ATTR(id, 0444, id_show, NULL);
176GFS2_ATTR(fsname, 0444, fsname_show, NULL);
177GFS2_ATTR(freeze, 0644, freeze_show, freeze_store);
178GFS2_ATTR(shrink, 0200, NULL, shrink_store);
179GFS2_ATTR(withdraw, 0644, withdraw_show, withdraw_store);
180GFS2_ATTR(statfs_sync, 0200, NULL, statfs_sync_store);
181GFS2_ATTR(quota_sync, 0200, NULL, quota_sync_store);
182GFS2_ATTR(quota_refresh_user, 0200, NULL, quota_refresh_user_store);
183GFS2_ATTR(quota_refresh_group, 0200, NULL, quota_refresh_group_store);
184
185static struct attribute *gfs2_attrs[] = {
186 &gfs2_attr_id.attr,
187 &gfs2_attr_fsname.attr,
188 &gfs2_attr_freeze.attr,
189 &gfs2_attr_shrink.attr,
190 &gfs2_attr_withdraw.attr,
191 &gfs2_attr_statfs_sync.attr,
192 &gfs2_attr_quota_sync.attr,
193 &gfs2_attr_quota_refresh_user.attr,
194 &gfs2_attr_quota_refresh_group.attr,
195 NULL,
196};
197
198static ssize_t gfs2_attr_show(struct kobject *kobj, struct attribute *attr,
199 char *buf)
200{
201 struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj);
202 struct gfs2_attr *a = container_of(attr, struct gfs2_attr, attr);
203 return a->show ? a->show(sdp, buf) : 0;
204}
205
206static ssize_t gfs2_attr_store(struct kobject *kobj, struct attribute *attr,
207 const char *buf, size_t len)
208{
209 struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj);
210 struct gfs2_attr *a = container_of(attr, struct gfs2_attr, attr);
211 return a->store ? a->store(sdp, buf, len) : len;
212}
213
214static struct sysfs_ops gfs2_attr_ops = {
215 .show = gfs2_attr_show,
216 .store = gfs2_attr_store,
217};
218
219static struct kobj_type gfs2_ktype = {
220 .default_attrs = gfs2_attrs,
221 .sysfs_ops = &gfs2_attr_ops,
222};
223
224static struct kset gfs2_kset = {
225 .subsys = &fs_subsys,
226 .kobj = {.name = "gfs2"},
227 .ktype = &gfs2_ktype,
228};
229
230/*
231 * display struct lm_lockstruct fields
232 */
233
234struct lockstruct_attr {
235 struct attribute attr;
236 ssize_t (*show)(struct gfs2_sbd *, char *);
237};
238
239#define LOCKSTRUCT_ATTR(name, fmt) \
240static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \
241{ \
242 return snprintf(buf, PAGE_SIZE, fmt, sdp->sd_lockstruct.ls_##name); \
243} \
244static struct lockstruct_attr lockstruct_attr_##name = __ATTR_RO(name)
245
246LOCKSTRUCT_ATTR(jid, "%u\n");
247LOCKSTRUCT_ATTR(first, "%u\n");
248LOCKSTRUCT_ATTR(lvb_size, "%u\n");
249LOCKSTRUCT_ATTR(flags, "%d\n");
250
251static struct attribute *lockstruct_attrs[] = {
252 &lockstruct_attr_jid.attr,
253 &lockstruct_attr_first.attr,
254 &lockstruct_attr_lvb_size.attr,
255 &lockstruct_attr_flags.attr,
256 NULL,
257};
258
259/*
260 * display struct gfs2_args fields
261 */
262
263struct args_attr {
264 struct attribute attr;
265 ssize_t (*show)(struct gfs2_sbd *, char *);
266};
267
268#define ARGS_ATTR(name, fmt) \
269static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \
270{ \
271 return snprintf(buf, PAGE_SIZE, fmt, sdp->sd_args.ar_##name); \
272} \
273static struct args_attr args_attr_##name = __ATTR_RO(name)
274
275ARGS_ATTR(lockproto, "%s\n");
276ARGS_ATTR(locktable, "%s\n");
277ARGS_ATTR(hostdata, "%s\n");
278ARGS_ATTR(spectator, "%d\n");
279ARGS_ATTR(ignore_local_fs, "%d\n");
280ARGS_ATTR(localcaching, "%d\n");
281ARGS_ATTR(localflocks, "%d\n");
282ARGS_ATTR(debug, "%d\n");
283ARGS_ATTR(upgrade, "%d\n");
284ARGS_ATTR(num_glockd, "%u\n");
285ARGS_ATTR(posix_acl, "%d\n");
286ARGS_ATTR(quota, "%u\n");
287ARGS_ATTR(suiddir, "%d\n");
288ARGS_ATTR(data, "%d\n");
289
290/* one oddball doesn't fit the macro mold */
291static ssize_t noatime_show(struct gfs2_sbd *sdp, char *buf)
292{
293 return snprintf(buf, PAGE_SIZE, "%d\n",
294 !!test_bit(SDF_NOATIME, &sdp->sd_flags));
295}
296static struct args_attr args_attr_noatime = __ATTR_RO(noatime);
297
298static struct attribute *args_attrs[] = {
299 &args_attr_lockproto.attr,
300 &args_attr_locktable.attr,
301 &args_attr_hostdata.attr,
302 &args_attr_spectator.attr,
303 &args_attr_ignore_local_fs.attr,
304 &args_attr_localcaching.attr,
305 &args_attr_localflocks.attr,
306 &args_attr_debug.attr,
307 &args_attr_upgrade.attr,
308 &args_attr_num_glockd.attr,
309 &args_attr_posix_acl.attr,
310 &args_attr_quota.attr,
311 &args_attr_suiddir.attr,
312 &args_attr_data.attr,
313 &args_attr_noatime.attr,
314 NULL,
315};
316
317/*
318 * display counters from superblock
319 */
320
321struct counters_attr {
322 struct attribute attr;
323 ssize_t (*show)(struct gfs2_sbd *, char *);
324};
325
326#define COUNTERS_ATTR(name, fmt) \
327static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \
328{ \
329 return snprintf(buf, PAGE_SIZE, fmt, \
330 (unsigned int)atomic_read(&sdp->sd_##name)); \
331} \
332static struct counters_attr counters_attr_##name = __ATTR_RO(name)
333
334COUNTERS_ATTR(glock_count, "%u\n");
335COUNTERS_ATTR(glock_held_count, "%u\n");
336COUNTERS_ATTR(inode_count, "%u\n");
337COUNTERS_ATTR(reclaimed, "%u\n");
338
339static struct attribute *counters_attrs[] = {
340 &counters_attr_glock_count.attr,
341 &counters_attr_glock_held_count.attr,
342 &counters_attr_inode_count.attr,
343 &counters_attr_reclaimed.attr,
344 NULL,
345};
346
347/*
348 * get and set struct gfs2_tune fields
349 */
350
351static ssize_t quota_scale_show(struct gfs2_sbd *sdp, char *buf)
352{
353 return snprintf(buf, PAGE_SIZE, "%u %u\n",
354 sdp->sd_tune.gt_quota_scale_num,
355 sdp->sd_tune.gt_quota_scale_den);
356}
357
358static ssize_t quota_scale_store(struct gfs2_sbd *sdp, const char *buf,
359 size_t len)
360{
361 struct gfs2_tune *gt = &sdp->sd_tune;
362 unsigned int x, y;
363
364 if (!capable(CAP_SYS_ADMIN))
365 return -EACCES;
366
367 if (sscanf(buf, "%u %u", &x, &y) != 2 || !y)
368 return -EINVAL;
369
370 spin_lock(&gt->gt_spin);
371 gt->gt_quota_scale_num = x;
372 gt->gt_quota_scale_den = y;
373 spin_unlock(&gt->gt_spin);
374 return len;
375}
376
377static ssize_t tune_set(struct gfs2_sbd *sdp, unsigned int *field,
378 int check_zero, const char *buf, size_t len)
379{
380 struct gfs2_tune *gt = &sdp->sd_tune;
381 unsigned int x;
382
383 if (!capable(CAP_SYS_ADMIN))
384 return -EACCES;
385
386 x = simple_strtoul(buf, NULL, 0);
387
388 if (check_zero && !x)
389 return -EINVAL;
390
391 spin_lock(&gt->gt_spin);
392 *field = x;
393 spin_unlock(&gt->gt_spin);
394 return len;
395}
396
397struct tune_attr {
398 struct attribute attr;
399 ssize_t (*show)(struct gfs2_sbd *, char *);
400 ssize_t (*store)(struct gfs2_sbd *, const char *, size_t);
401};
402
403#define TUNE_ATTR_3(name, show, store) \
404static struct tune_attr tune_attr_##name = __ATTR(name, 0644, show, store)
405
406#define TUNE_ATTR_2(name, store) \
407static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \
408{ \
409 return snprintf(buf, PAGE_SIZE, "%u\n", sdp->sd_tune.gt_##name); \
410} \
411TUNE_ATTR_3(name, name##_show, store)
412
413#define TUNE_ATTR(name, check_zero) \
414static ssize_t name##_store(struct gfs2_sbd *sdp, const char *buf, size_t len)\
415{ \
416 return tune_set(sdp, &sdp->sd_tune.gt_##name, check_zero, buf, len); \
417} \
418TUNE_ATTR_2(name, name##_store)
419
420#define TUNE_ATTR_DAEMON(name, process) \
421static ssize_t name##_store(struct gfs2_sbd *sdp, const char *buf, size_t len)\
422{ \
423 ssize_t r = tune_set(sdp, &sdp->sd_tune.gt_##name, 1, buf, len); \
424 wake_up_process(sdp->sd_##process); \
425 return r; \
426} \
427TUNE_ATTR_2(name, name##_store)
428
429TUNE_ATTR(ilimit, 0);
430TUNE_ATTR(ilimit_tries, 0);
431TUNE_ATTR(ilimit_min, 0);
432TUNE_ATTR(demote_secs, 0);
433TUNE_ATTR(incore_log_blocks, 0);
434TUNE_ATTR(log_flush_secs, 0);
435TUNE_ATTR(jindex_refresh_secs, 0);
436TUNE_ATTR(quota_warn_period, 0);
437TUNE_ATTR(quota_quantum, 0);
438TUNE_ATTR(atime_quantum, 0);
439TUNE_ATTR(max_readahead, 0);
440TUNE_ATTR(complain_secs, 0);
441TUNE_ATTR(reclaim_limit, 0);
442TUNE_ATTR(prefetch_secs, 0);
443TUNE_ATTR(statfs_slow, 0);
444TUNE_ATTR(new_files_jdata, 0);
445TUNE_ATTR(new_files_directio, 0);
446TUNE_ATTR(quota_simul_sync, 1);
447TUNE_ATTR(quota_cache_secs, 1);
448TUNE_ATTR(max_atomic_write, 1);
449TUNE_ATTR(stall_secs, 1);
450TUNE_ATTR(entries_per_readdir, 1);
451TUNE_ATTR(greedy_default, 1);
452TUNE_ATTR(greedy_quantum, 1);
453TUNE_ATTR(greedy_max, 1);
454TUNE_ATTR(statfs_quantum, 1);
455TUNE_ATTR_DAEMON(scand_secs, scand_process);
456TUNE_ATTR_DAEMON(recoverd_secs, recoverd_process);
457TUNE_ATTR_DAEMON(logd_secs, logd_process);
458TUNE_ATTR_DAEMON(quotad_secs, quotad_process);
459TUNE_ATTR_3(quota_scale, quota_scale_show, quota_scale_store);
460
461static struct attribute *tune_attrs[] = {
462 &tune_attr_ilimit.attr,
463 &tune_attr_ilimit_tries.attr,
464 &tune_attr_ilimit_min.attr,
465 &tune_attr_demote_secs.attr,
466 &tune_attr_incore_log_blocks.attr,
467 &tune_attr_log_flush_secs.attr,
468 &tune_attr_jindex_refresh_secs.attr,
469 &tune_attr_quota_warn_period.attr,
470 &tune_attr_quota_quantum.attr,
471 &tune_attr_atime_quantum.attr,
472 &tune_attr_max_readahead.attr,
473 &tune_attr_complain_secs.attr,
474 &tune_attr_reclaim_limit.attr,
475 &tune_attr_prefetch_secs.attr,
476 &tune_attr_statfs_slow.attr,
477 &tune_attr_quota_simul_sync.attr,
478 &tune_attr_quota_cache_secs.attr,
479 &tune_attr_max_atomic_write.attr,
480 &tune_attr_stall_secs.attr,
481 &tune_attr_entries_per_readdir.attr,
482 &tune_attr_greedy_default.attr,
483 &tune_attr_greedy_quantum.attr,
484 &tune_attr_greedy_max.attr,
485 &tune_attr_statfs_quantum.attr,
486 &tune_attr_scand_secs.attr,
487 &tune_attr_recoverd_secs.attr,
488 &tune_attr_logd_secs.attr,
489 &tune_attr_quotad_secs.attr,
490 &tune_attr_quota_scale.attr,
491 &tune_attr_new_files_jdata.attr,
492 &tune_attr_new_files_directio.attr,
493 NULL,
494};
495
496static struct attribute_group lockstruct_group = {
497 .name = "lockstruct",
498 .attrs = lockstruct_attrs,
499};
500
501static struct attribute_group counters_group = {
502 .name = "counters",
503 .attrs = counters_attrs,
504};
505
506static struct attribute_group args_group = {
507 .name = "args",
508 .attrs = args_attrs,
509};
510
511static struct attribute_group tune_group = {
512 .name = "tune",
513 .attrs = tune_attrs,
514};
515
516int gfs2_sys_fs_add(struct gfs2_sbd *sdp)
517{
518 int error;
519
520 sdp->sd_kobj.kset = &gfs2_kset;
521 sdp->sd_kobj.ktype = &gfs2_ktype;
522
523 error = kobject_set_name(&sdp->sd_kobj, "%s", sdp->sd_table_name);
524 if (error)
525 goto fail;
526
527 error = kobject_register(&sdp->sd_kobj);
528 if (error)
529 goto fail;
530
531 error = sysfs_create_group(&sdp->sd_kobj, &lockstruct_group);
532 if (error)
533 goto fail_reg;
534
535 error = sysfs_create_group(&sdp->sd_kobj, &counters_group);
536 if (error)
537 goto fail_lockstruct;
538
539 error = sysfs_create_group(&sdp->sd_kobj, &args_group);
540 if (error)
541 goto fail_counters;
542
543 error = sysfs_create_group(&sdp->sd_kobj, &tune_group);
544 if (error)
545 goto fail_args;
546
547 return 0;
548
549fail_args:
550 sysfs_remove_group(&sdp->sd_kobj, &args_group);
551fail_counters:
552 sysfs_remove_group(&sdp->sd_kobj, &counters_group);
553fail_lockstruct:
554 sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group);
555fail_reg:
556 kobject_unregister(&sdp->sd_kobj);
557fail:
558 fs_err(sdp, "error %d adding sysfs files\n", error);
559 return error;
560}
561
562void gfs2_sys_fs_del(struct gfs2_sbd *sdp)
563{
564 sysfs_remove_group(&sdp->sd_kobj, &tune_group);
565 sysfs_remove_group(&sdp->sd_kobj, &args_group);
566 sysfs_remove_group(&sdp->sd_kobj, &counters_group);
567 sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group);
568 kobject_unregister(&sdp->sd_kobj);
569}
570
571int gfs2_sys_init(void)
572{
573 gfs2_sys_margs = NULL;
574 spin_lock_init(&gfs2_sys_margs_lock);
575 return kset_register(&gfs2_kset);
576}
577
578void gfs2_sys_uninit(void)
579{
580 kfree(gfs2_sys_margs);
581 kset_unregister(&gfs2_kset);
582}
583
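[Editorial sketch, not part of the merge.] The sysfs plumbing above is the stock kobject attribute-dispatch pattern: each gfs2_attr embeds a struct attribute, the superblock embeds its kobject (sd_kobj), and the gfs2_attr_show()/gfs2_attr_store() trampolines use container_of() to climb back to the enclosing objects. With the kset registered under fs_subsys as "gfs2" and the kobject named after sd_table_name, the files surface as /sys/fs/gfs2/<table_name>/{id,fsname,freeze,...}. A minimal stand-alone illustration of that dispatch, using hypothetical demo_* names and plain userspace C:

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct demo_attr {
	const char *name;
};

struct demo_sbd {
	unsigned int freeze_count;
	struct demo_attr kobj_attr;	/* stands in for sd_kobj + gfs2_attr */
};

/* Trampoline in the style of gfs2_attr_show(): climb from the embedded
 * member back to the enclosing object, then format the answer. */
static int demo_show(struct demo_attr *attr, char *buf, size_t len)
{
	struct demo_sbd *sdp = container_of(attr, struct demo_sbd, kobj_attr);
	return snprintf(buf, len, "%u\n", sdp->freeze_count);
}

int main(void)
{
	struct demo_sbd sdp = { .freeze_count = 1, .kobj_attr = { "freeze" } };
	char buf[16];

	demo_show(&sdp.kobj_attr, buf, sizeof(buf));
	fputs(buf, stdout);	/* prints "1" */
	return 0;
}

From the shell the real interface behaves accordingly: reading freeze returns sd_freeze_count, and writing 1 to it (root only, per the capable(CAP_SYS_ADMIN) check) calls gfs2_freeze_fs().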
diff --git a/fs/gfs2/sys.h b/fs/gfs2/sys.h
new file mode 100644
index 000000000000..1ca8cdac5304
--- /dev/null
+++ b/fs/gfs2/sys.h
@@ -0,0 +1,27 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __SYS_DOT_H__
11#define __SYS_DOT_H__
12
13#include <linux/spinlock.h>
14struct gfs2_sbd;
15
16/* Allow args to be passed to GFS2 when using an initial ram disk */
17extern char *gfs2_sys_margs;
18extern spinlock_t gfs2_sys_margs_lock;
19
20int gfs2_sys_fs_add(struct gfs2_sbd *sdp);
21void gfs2_sys_fs_del(struct gfs2_sbd *sdp);
22
23int gfs2_sys_init(void);
24void gfs2_sys_uninit(void);
25
26#endif /* __SYS_DOT_H__ */
27
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
new file mode 100644
index 000000000000..f8dabf8446bb
--- /dev/null
+++ b/fs/gfs2/trans.c
@@ -0,0 +1,184 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/gfs2_ondisk.h>
16#include <linux/kallsyms.h>
17#include <linux/lm_interface.h>
18
19#include "gfs2.h"
20#include "incore.h"
21#include "glock.h"
22#include "log.h"
23#include "lops.h"
24#include "meta_io.h"
25#include "trans.h"
26#include "util.h"
27
28int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
29 unsigned int revokes)
30{
31 struct gfs2_trans *tr;
32 int error;
33
34 BUG_ON(current->journal_info);
35 BUG_ON(blocks == 0 && revokes == 0);
36
37 tr = kzalloc(sizeof(struct gfs2_trans), GFP_NOFS);
38 if (!tr)
39 return -ENOMEM;
40
41 tr->tr_ip = (unsigned long)__builtin_return_address(0);
42 tr->tr_blocks = blocks;
43 tr->tr_revokes = revokes;
44 tr->tr_reserved = 1;
45 if (blocks)
46 tr->tr_reserved += 6 + blocks;
47 if (revokes)
48 tr->tr_reserved += gfs2_struct2blk(sdp, revokes,
49 sizeof(u64));
50 INIT_LIST_HEAD(&tr->tr_list_buf);
51
52 gfs2_holder_init(sdp->sd_trans_gl, LM_ST_SHARED, 0, &tr->tr_t_gh);
53
54 error = gfs2_glock_nq(&tr->tr_t_gh);
55 if (error)
56 goto fail_holder_uninit;
57
58 if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
59 tr->tr_t_gh.gh_flags |= GL_NOCACHE;
60 error = -EROFS;
61 goto fail_gunlock;
62 }
63
64 error = gfs2_log_reserve(sdp, tr->tr_reserved);
65 if (error)
66 goto fail_gunlock;
67
68 current->journal_info = tr;
69
70 return 0;
71
72fail_gunlock:
73 gfs2_glock_dq(&tr->tr_t_gh);
74
75fail_holder_uninit:
76 gfs2_holder_uninit(&tr->tr_t_gh);
77 kfree(tr);
78
79 return error;
80}
81
82void gfs2_trans_end(struct gfs2_sbd *sdp)
83{
84 struct gfs2_trans *tr = current->journal_info;
85
86 BUG_ON(!tr);
87 current->journal_info = NULL;
88
89 if (!tr->tr_touched) {
90 gfs2_log_release(sdp, tr->tr_reserved);
91 gfs2_glock_dq(&tr->tr_t_gh);
92 gfs2_holder_uninit(&tr->tr_t_gh);
93 kfree(tr);
94 return;
95 }
96
97 if (gfs2_assert_withdraw(sdp, tr->tr_num_buf <= tr->tr_blocks)) {
98 fs_err(sdp, "tr_num_buf = %u, tr_blocks = %u\n",
99 tr->tr_num_buf, tr->tr_blocks);
100 print_symbol(KERN_WARNING "GFS2: Transaction created at: %s\n", tr->tr_ip);
101 }
102 if (gfs2_assert_withdraw(sdp, tr->tr_num_revoke <= tr->tr_revokes)) {
103 fs_err(sdp, "tr_num_revoke = %u, tr_revokes = %u\n",
104 tr->tr_num_revoke, tr->tr_revokes);
105 print_symbol(KERN_WARNING "GFS2: Transaction created at: %s\n", tr->tr_ip);
106 }
107
108 gfs2_log_commit(sdp, tr);
109 gfs2_glock_dq(&tr->tr_t_gh);
110 gfs2_holder_uninit(&tr->tr_t_gh);
111 kfree(tr);
112
113 if (sdp->sd_vfs->s_flags & MS_SYNCHRONOUS)
114 gfs2_log_flush(sdp, NULL);
115}
116
117void gfs2_trans_add_gl(struct gfs2_glock *gl)
118{
119 lops_add(gl->gl_sbd, &gl->gl_le);
120}
121
122/**
123 * gfs2_trans_add_bh - Add a to-be-modified buffer to the current transaction
124 * @gl: the glock the buffer belongs to
125 * @bh: The buffer to add
126 * @meta: True in the case of adding metadata
127 *
128 */
129
130void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta)
131{
132 struct gfs2_sbd *sdp = gl->gl_sbd;
133 struct gfs2_bufdata *bd;
134
135 bd = bh->b_private;
136 if (bd)
137 gfs2_assert(sdp, bd->bd_gl == gl);
138 else {
139 gfs2_attach_bufdata(gl, bh, meta);
140 bd = bh->b_private;
141 }
142 lops_add(sdp, &bd->bd_le);
143}
144
145void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, u64 blkno)
146{
147 struct gfs2_revoke *rv = kmalloc(sizeof(struct gfs2_revoke),
148 GFP_NOFS | __GFP_NOFAIL);
149 lops_init_le(&rv->rv_le, &gfs2_revoke_lops);
150 rv->rv_blkno = blkno;
151 lops_add(sdp, &rv->rv_le);
152}
153
154void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno)
155{
156 struct gfs2_revoke *rv;
157 int found = 0;
158
159 gfs2_log_lock(sdp);
160
161 list_for_each_entry(rv, &sdp->sd_log_le_revoke, rv_le.le_list) {
162 if (rv->rv_blkno == blkno) {
163 list_del(&rv->rv_le.le_list);
164 gfs2_assert_withdraw(sdp, sdp->sd_log_num_revoke);
165 sdp->sd_log_num_revoke--;
166 found = 1;
167 break;
168 }
169 }
170
171 gfs2_log_unlock(sdp);
172
173 if (found) {
174 struct gfs2_trans *tr = current->journal_info;
175 kfree(rv);
176 tr->tr_num_revoke_rm++;
177 }
178}
179
180void gfs2_trans_add_rg(struct gfs2_rgrpd *rgd)
181{
182 lops_add(rgd->rd_sbd, &rgd->rd_le);
183}
184
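[Editorial sketch, not part of the merge.] Taken together, the API above gives the usual transaction bracket: reserve log space with gfs2_trans_begin() (sized via the RES_* constants declared in trans.h below), pin each buffer with gfs2_trans_add_bh() before touching it, then commit with gfs2_trans_end(). A schematic kernel-side caller, assuming a hypothetical update (demo_update) that dirties one dinode block and one resource-group bitmap block:

/* Hypothetical caller; assumes the declarations above and a buffer_head
 * for the dinode already held by the caller. */
static int demo_update(struct gfs2_inode *ip, struct buffer_head *dibh)
{
	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
	int error;

	/* Reserve journal space: RES_DINODE + RES_RG_BIT blocks, no revokes. */
	error = gfs2_trans_begin(sdp, RES_DINODE + RES_RG_BIT, 0);
	if (error)
		return error;

	/* Mark the buffer as journaled metadata before modifying it. */
	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
	/* ... modify dibh->b_data here ... */

	gfs2_trans_end(sdp);
	return 0;
}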
diff --git a/fs/gfs2/trans.h b/fs/gfs2/trans.h
new file mode 100644
index 000000000000..23d4cbe1de5b
--- /dev/null
+++ b/fs/gfs2/trans.h
@@ -0,0 +1,39 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __TRANS_DOT_H__
11#define __TRANS_DOT_H__
12
13#include <linux/buffer_head.h>
14struct gfs2_sbd;
15struct gfs2_rgrpd;
16struct gfs2_glock;
17
18#define RES_DINODE 1
19#define RES_INDIRECT 1
20#define RES_JDATA 1
21#define RES_DATA 1
22#define RES_LEAF 1
23#define RES_RG_BIT 2
24#define RES_EATTR 1
25#define RES_STATFS 1
26#define RES_QUOTA 2
27
28int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
29 unsigned int revokes);
30
31void gfs2_trans_end(struct gfs2_sbd *sdp);
32
33void gfs2_trans_add_gl(struct gfs2_glock *gl);
34void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta);
35void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, u64 blkno);
36void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno);
37void gfs2_trans_add_rg(struct gfs2_rgrpd *rgd);
38
39#endif /* __TRANS_DOT_H__ */
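[Editorial note.] gfs2_trans_begin() above turns these constants into a log reservation of 1 + (6 + blocks) when blocks is nonzero, plus whatever gfs2_struct2blk() computes for the revoke records. For example, with blocks = RES_DINODE + RES_RG_BIT = 3 and no revokes, the reservation is 1 + 6 + 3 = 10 journal blocks.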
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
new file mode 100644
index 000000000000..196c604faadc
--- /dev/null
+++ b/fs/gfs2/util.c
@@ -0,0 +1,245 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/crc32.h>
16#include <linux/gfs2_ondisk.h>
17#include <linux/lm_interface.h>
18#include <asm/uaccess.h>
19
20#include "gfs2.h"
21#include "incore.h"
22#include "glock.h"
23#include "lm.h"
24#include "util.h"
25
26kmem_cache_t *gfs2_glock_cachep __read_mostly;
27kmem_cache_t *gfs2_inode_cachep __read_mostly;
28kmem_cache_t *gfs2_bufdata_cachep __read_mostly;
29
30void gfs2_assert_i(struct gfs2_sbd *sdp)
31{
32 printk(KERN_EMERG "GFS2: fsid=%s: fatal assertion failed\n",
33 sdp->sd_fsname);
34}
35
36/**
37 * gfs2_assert_withdraw_i - Cause the machine to withdraw if @assertion is false
38 * Returns: -1 if this call withdrew the machine,
39 * -2 if it was already withdrawn
40 */
41
42int gfs2_assert_withdraw_i(struct gfs2_sbd *sdp, char *assertion,
43 const char *function, char *file, unsigned int line)
44{
45 int me;
46 me = gfs2_lm_withdraw(sdp,
47 "GFS2: fsid=%s: fatal: assertion \"%s\" failed\n"
48 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
49 sdp->sd_fsname, assertion,
50 sdp->sd_fsname, function, file, line);
51 dump_stack();
52 return (me) ? -1 : -2;
53}
54
55/**
56 * gfs2_assert_warn_i - Print a message to the console if @assertion is false
57 * Returns: -1 if we printed something
58 * -2 if we didn't
59 */
60
61int gfs2_assert_warn_i(struct gfs2_sbd *sdp, char *assertion,
62 const char *function, char *file, unsigned int line)
63{
64 if (time_before(jiffies,
65 sdp->sd_last_warning +
66 gfs2_tune_get(sdp, gt_complain_secs) * HZ))
67 return -2;
68
69 printk(KERN_WARNING
70 "GFS2: fsid=%s: warning: assertion \"%s\" failed\n"
71 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
72 sdp->sd_fsname, assertion,
73 sdp->sd_fsname, function, file, line);
74
75 if (sdp->sd_args.ar_debug)
76 BUG();
77 else
78 dump_stack();
79
80 sdp->sd_last_warning = jiffies;
81
82 return -1;
83}
84
85/**
86 * gfs2_consist_i - Flag a filesystem consistency error and withdraw
87 * Returns: -1 if this call withdrew the machine,
88 * 0 if it was already withdrawn
89 */
90
91int gfs2_consist_i(struct gfs2_sbd *sdp, int cluster_wide, const char *function,
92 char *file, unsigned int line)
93{
94 int rv;
95 rv = gfs2_lm_withdraw(sdp,
96 "GFS2: fsid=%s: fatal: filesystem consistency error\n"
97 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
98 sdp->sd_fsname,
99 sdp->sd_fsname, function, file, line);
100 return rv;
101}
102
103/**
104 * gfs2_consist_inode_i - Flag an inode consistency error and withdraw
105 * Returns: -1 if this call withdrew the machine,
106 * 0 if it was already withdrawn
107 */
108
109int gfs2_consist_inode_i(struct gfs2_inode *ip, int cluster_wide,
110 const char *function, char *file, unsigned int line)
111{
112 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
113 int rv;
114 rv = gfs2_lm_withdraw(sdp,
115 "GFS2: fsid=%s: fatal: filesystem consistency error\n"
116 "GFS2: fsid=%s: inode = %llu %llu\n"
117 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
118 sdp->sd_fsname,
119 sdp->sd_fsname, (unsigned long long)ip->i_num.no_formal_ino,
120 (unsigned long long)ip->i_num.no_addr,
121 sdp->sd_fsname, function, file, line);
122 return rv;
123}
124
125/**
126 * gfs2_consist_rgrpd_i - Flag a RG consistency error and withdraw
127 * Returns: -1 if this call withdrew the machine,
128 * 0 if it was already withdrawn
129 */
130
131int gfs2_consist_rgrpd_i(struct gfs2_rgrpd *rgd, int cluster_wide,
132 const char *function, char *file, unsigned int line)
133{
134 struct gfs2_sbd *sdp = rgd->rd_sbd;
135 int rv;
136 rv = gfs2_lm_withdraw(sdp,
137 "GFS2: fsid=%s: fatal: filesystem consistency error\n"
138 "GFS2: fsid=%s: RG = %llu\n"
139 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
140 sdp->sd_fsname,
141 sdp->sd_fsname, (unsigned long long)rgd->rd_ri.ri_addr,
142 sdp->sd_fsname, function, file, line);
143 return rv;
144}
145
146/**
147 * gfs2_meta_check_ii - Flag a magic number consistency error and withdraw
148 * Returns: -1 if this call withdrew the machine,
149 * -2 if it was already withdrawn
150 */
151
152int gfs2_meta_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
153 const char *type, const char *function, char *file,
154 unsigned int line)
155{
156 int me;
157 me = gfs2_lm_withdraw(sdp,
158 "GFS2: fsid=%s: fatal: invalid metadata block\n"
159 "GFS2: fsid=%s: bh = %llu (%s)\n"
160 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
161 sdp->sd_fsname,
162 sdp->sd_fsname, (unsigned long long)bh->b_blocknr, type,
163 sdp->sd_fsname, function, file, line);
164 return (me) ? -1 : -2;
165}
166
167/**
168 * gfs2_metatype_check_ii - Flag a metadata type consistency error and withdraw
169 * Returns: -1 if this call withdrew the machine,
170 * -2 if it was already withdrawn
171 */
172
173int gfs2_metatype_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
174 u16 type, u16 t, const char *function,
175 char *file, unsigned int line)
176{
177 int me;
178 me = gfs2_lm_withdraw(sdp,
179 "GFS2: fsid=%s: fatal: invalid metadata block\n"
180 "GFS2: fsid=%s: bh = %llu (type: exp=%u, found=%u)\n"
181 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
182 sdp->sd_fsname,
183 sdp->sd_fsname, (unsigned long long)bh->b_blocknr, type, t,
184 sdp->sd_fsname, function, file, line);
185 return (me) ? -1 : -2;
186}
187
188/**
189 * gfs2_io_error_i - Flag an I/O error and withdraw
190 * Returns: -1 if this call withdrew the machine,
191 * 0 if it was already withdrawn
192 */
193
194int gfs2_io_error_i(struct gfs2_sbd *sdp, const char *function, char *file,
195 unsigned int line)
196{
197 int rv;
198 rv = gfs2_lm_withdraw(sdp,
199 "GFS2: fsid=%s: fatal: I/O error\n"
200 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
201 sdp->sd_fsname,
202 sdp->sd_fsname, function, file, line);
203 return rv;
204}
205
206/**
207 * gfs2_io_error_bh_i - Flag a buffer I/O error and withdraw
208 * Returns: -1 if this call withdrew the machine,
209 * 0 if it was already withdrawn
210 */
211
212int gfs2_io_error_bh_i(struct gfs2_sbd *sdp, struct buffer_head *bh,
213 const char *function, char *file, unsigned int line)
214{
215 int rv;
216 rv = gfs2_lm_withdraw(sdp,
217 "GFS2: fsid=%s: fatal: I/O error\n"
218 "GFS2: fsid=%s: block = %llu\n"
219 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
220 sdp->sd_fsname,
221 sdp->sd_fsname, (unsigned long long)bh->b_blocknr,
222 sdp->sd_fsname, function, file, line);
223 return rv;
224}
225
226void gfs2_icbit_munge(struct gfs2_sbd *sdp, unsigned char **bitmap,
227 unsigned int bit, int new_value)
228{
229 unsigned int c, o, b = bit;
230 int old_value;
231
232 c = b / (8 * PAGE_SIZE);
233 b %= 8 * PAGE_SIZE;
234 o = b / 8;
235 b %= 8;
236
237 old_value = (bitmap[c][o] & (1 << b));
238 gfs2_assert_withdraw(sdp, !old_value != !new_value);
239
240 if (new_value)
241 bitmap[c][o] |= 1 << b;
242 else
243 bitmap[c][o] &= ~(1 << b);
244}
245
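[Editorial sketch, not part of the merge.] gfs2_icbit_munge() addresses one bit inside an array of page-sized bitmaps by peeling the index apart in three steps: page (c), byte within the page (o), then bit within the byte (b). A worked example of that arithmetic, assuming a 4096-byte page (plain userspace C):

#include <stdio.h>

#define DEMO_PAGE_SIZE 4096UL	/* assumption for the example */

int main(void)
{
	unsigned long bit = 70000, b = bit;
	unsigned long c, o;

	c = b / (8 * DEMO_PAGE_SIZE);	/* which page: 70000 / 32768 = 2 */
	b %= 8 * DEMO_PAGE_SIZE;	/* bit within the page: 4464     */
	o = b / 8;			/* byte within the page: 558     */
	b %= 8;				/* bit within the byte: 0        */

	/* prints: bit 70000 -> bitmap[2][558], mask 1<<0 */
	printf("bit %lu -> bitmap[%lu][%lu], mask 1<<%lu\n", bit, c, o, b);
	return 0;
}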
diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h
new file mode 100644
index 000000000000..76a50899fe9e
--- /dev/null
+++ b/fs/gfs2/util.h
@@ -0,0 +1,170 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __UTIL_DOT_H__
11#define __UTIL_DOT_H__
12
13#include "incore.h"
14
15#define fs_printk(level, fs, fmt, arg...) \
16 printk(level "GFS2: fsid=%s: " fmt , (fs)->sd_fsname , ## arg)
17
18#define fs_info(fs, fmt, arg...) \
19 fs_printk(KERN_INFO , fs , fmt , ## arg)
20
21#define fs_warn(fs, fmt, arg...) \
22 fs_printk(KERN_WARNING , fs , fmt , ## arg)
23
24#define fs_err(fs, fmt, arg...) \
25 fs_printk(KERN_ERR, fs , fmt , ## arg)
26
27
28void gfs2_assert_i(struct gfs2_sbd *sdp);
29
30#define gfs2_assert(sdp, assertion) \
31do { \
32 if (unlikely(!(assertion))) { \
33 gfs2_assert_i(sdp); \
34 BUG(); \
35 } \
36} while (0)
37
38
39int gfs2_assert_withdraw_i(struct gfs2_sbd *sdp, char *assertion,
40 const char *function, char *file, unsigned int line);
41
42#define gfs2_assert_withdraw(sdp, assertion) \
43((likely(assertion)) ? 0 : gfs2_assert_withdraw_i((sdp), #assertion, \
44 __FUNCTION__, __FILE__, __LINE__))
45
46
47int gfs2_assert_warn_i(struct gfs2_sbd *sdp, char *assertion,
48 const char *function, char *file, unsigned int line);
49
50#define gfs2_assert_warn(sdp, assertion) \
51((likely(assertion)) ? 0 : gfs2_assert_warn_i((sdp), #assertion, \
52 __FUNCTION__, __FILE__, __LINE__))
53
54
55int gfs2_consist_i(struct gfs2_sbd *sdp, int cluster_wide,
56 const char *function, char *file, unsigned int line);
57
58#define gfs2_consist(sdp) \
59gfs2_consist_i((sdp), 0, __FUNCTION__, __FILE__, __LINE__)
60
61
62int gfs2_consist_inode_i(struct gfs2_inode *ip, int cluster_wide,
63 const char *function, char *file, unsigned int line);
64
65#define gfs2_consist_inode(ip) \
66gfs2_consist_inode_i((ip), 0, __FUNCTION__, __FILE__, __LINE__)
67
68
69int gfs2_consist_rgrpd_i(struct gfs2_rgrpd *rgd, int cluster_wide,
70 const char *function, char *file, unsigned int line);
71
72#define gfs2_consist_rgrpd(rgd) \
73gfs2_consist_rgrpd_i((rgd), 0, __FUNCTION__, __FILE__, __LINE__)
74
75
76int gfs2_meta_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
77 const char *type, const char *function,
78 char *file, unsigned int line);
79
80static inline int gfs2_meta_check_i(struct gfs2_sbd *sdp,
81 struct buffer_head *bh,
82 const char *function,
83 char *file, unsigned int line)
84{
85 struct gfs2_meta_header *mh = (struct gfs2_meta_header *)bh->b_data;
86 u32 magic = mh->mh_magic;
87 magic = be32_to_cpu(magic);
88 if (unlikely(magic != GFS2_MAGIC))
89 return gfs2_meta_check_ii(sdp, bh, "magic number", function,
90 file, line);
91 return 0;
92}
93
94#define gfs2_meta_check(sdp, bh) \
95gfs2_meta_check_i((sdp), (bh), __FUNCTION__, __FILE__, __LINE__)
96
97
98int gfs2_metatype_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
99 u16 type, u16 t,
100 const char *function,
101 char *file, unsigned int line);
102
103static inline int gfs2_metatype_check_i(struct gfs2_sbd *sdp,
104 struct buffer_head *bh,
105 u16 type,
106 const char *function,
107 char *file, unsigned int line)
108{
109 struct gfs2_meta_header *mh = (struct gfs2_meta_header *)bh->b_data;
110 u32 magic = mh->mh_magic;
111 u16 t = be32_to_cpu(mh->mh_type);
112 magic = be32_to_cpu(magic);
113 if (unlikely(magic != GFS2_MAGIC))
114 return gfs2_meta_check_ii(sdp, bh, "magic number", function,
115 file, line);
116 if (unlikely(t != type))
117 return gfs2_metatype_check_ii(sdp, bh, type, t, function,
118 file, line);
119 return 0;
120}
121
122#define gfs2_metatype_check(sdp, bh, type) \
123gfs2_metatype_check_i((sdp), (bh), (type), __FUNCTION__, __FILE__, __LINE__)
124
125static inline void gfs2_metatype_set(struct buffer_head *bh, u16 type,
126 u16 format)
127{
128 struct gfs2_meta_header *mh;
129 mh = (struct gfs2_meta_header *)bh->b_data;
130 mh->mh_type = cpu_to_be32(type);
131 mh->mh_format = cpu_to_be32(format);
132}
133
134
135int gfs2_io_error_i(struct gfs2_sbd *sdp, const char *function,
136 char *file, unsigned int line);
137
138#define gfs2_io_error(sdp) \
139gfs2_io_error_i((sdp), __FUNCTION__, __FILE__, __LINE__)
140
141
142int gfs2_io_error_bh_i(struct gfs2_sbd *sdp, struct buffer_head *bh,
143 const char *function, char *file, unsigned int line);
144
145#define gfs2_io_error_bh(sdp, bh) \
146gfs2_io_error_bh_i((sdp), (bh), __FUNCTION__, __FILE__, __LINE__)
147
148
149extern kmem_cache_t *gfs2_glock_cachep;
150extern kmem_cache_t *gfs2_inode_cachep;
151extern kmem_cache_t *gfs2_bufdata_cachep;
152
153static inline unsigned int gfs2_tune_get_i(struct gfs2_tune *gt,
154 unsigned int *p)
155{
156 unsigned int x;
157 spin_lock(&gt->gt_spin);
158 x = *p;
159 spin_unlock(&gt->gt_spin);
160 return x;
161}
162
163#define gfs2_tune_get(sdp, field) \
164gfs2_tune_get_i(&(sdp)->sd_tune, &(sdp)->sd_tune.field)
165
166void gfs2_icbit_munge(struct gfs2_sbd *sdp, unsigned char **bitmap,
167 unsigned int bit, int new_value);
168
169#endif /* __UTIL_DOT_H__ */
170
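[Editorial sketch, not part of the merge.] One convention worth noting in these macros: gfs2_assert_warn() and gfs2_assert_withdraw() evaluate to 0 when the assertion holds and to a nonzero code once the *_i() helper has fired, so callers can hang extra diagnostics off the failure path, as gfs2_trans_end() does above. A small kernel-side sketch of that idiom (hypothetical condition and function):

/* Hypothetical consumer of gfs2_assert_withdraw(); mirrors the pattern
 * used in gfs2_trans_end() above. */
static void demo_check(struct gfs2_sbd *sdp, unsigned int used,
		       unsigned int reserved)
{
	/* 0 when the assertion holds; -1/-2 after a withdraw. */
	if (gfs2_assert_withdraw(sdp, used <= reserved))
		fs_err(sdp, "used = %u, reserved = %u\n", used, reserved);
}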
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c
index bcf6ee36e065..7faef8544f32 100644
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -60,14 +60,14 @@ void hpfs_read_inode(struct inode *i)
 	if (hpfs_sb(i->i_sb)->sb_eas) {
 		if ((ea = hpfs_get_ea(i->i_sb, fnode, "UID", &ea_size))) {
 			if (ea_size == 2) {
-				i->i_uid = le16_to_cpu(*(u16*)ea);
+				i->i_uid = le16_to_cpu(*(__le16*)ea);
 				hpfs_inode->i_ea_uid = 1;
 			}
 			kfree(ea);
 		}
 		if ((ea = hpfs_get_ea(i->i_sb, fnode, "GID", &ea_size))) {
 			if (ea_size == 2) {
-				i->i_gid = le16_to_cpu(*(u16*)ea);
+				i->i_gid = le16_to_cpu(*(__le16*)ea);
 				hpfs_inode->i_ea_gid = 1;
 			}
 			kfree(ea);
@@ -87,7 +87,7 @@ void hpfs_read_inode(struct inode *i)
 			int rdev = 0;
 			umode_t mode = hpfs_sb(sb)->sb_mode;
 			if (ea_size == 2) {
-				mode = le16_to_cpu(*(u16*)ea);
+				mode = le16_to_cpu(*(__le16*)ea);
 				hpfs_inode->i_ea_mode = 1;
 			}
 			kfree(ea);
@@ -95,7 +95,7 @@ void hpfs_read_inode(struct inode *i)
 			if (S_ISBLK(mode) || S_ISCHR(mode)) {
 				if ((ea = hpfs_get_ea(i->i_sb, fnode, "DEV", &ea_size))) {
 					if (ea_size == 4)
-						rdev = le32_to_cpu(*(u32*)ea);
+						rdev = le32_to_cpu(*(__le32*)ea);
 					kfree(ea);
 				}
 			}
@@ -148,7 +148,7 @@ static void hpfs_write_inode_ea(struct inode *i, struct fnode *fnode)
 	   we'd better not overwrite them
 	hpfs_error(i->i_sb, "fnode %08x has some unknown HPFS386 stuctures", i->i_ino);
 	} else*/ if (hpfs_sb(i->i_sb)->sb_eas >= 2) {
-		u32 ea;
+		__le32 ea;
 		if ((i->i_uid != hpfs_sb(i->i_sb)->sb_uid) || hpfs_inode->i_ea_uid) {
 			ea = cpu_to_le32(i->i_uid);
 			hpfs_set_ea(i, fnode, "UID", (char*)&ea, 2);
@@ -165,6 +165,7 @@ static void hpfs_write_inode_ea(struct inode *i, struct fnode *fnode)
 	    && i->i_mode != ((hpfs_sb(i->i_sb)->sb_mode & ~(S_ISDIR(i->i_mode) ? 0222 : 0333))
 	    | (S_ISDIR(i->i_mode) ? S_IFDIR : S_IFREG))) || hpfs_inode->i_ea_mode) {
 		ea = cpu_to_le32(i->i_mode);
+		/* sick, but legal */
 		hpfs_set_ea(i, fnode, "MODE", (char *)&ea, 2);
 		hpfs_inode->i_ea_mode = 1;
 	}
diff --git a/fs/hppfs/hppfs_kern.c b/fs/hppfs/hppfs_kern.c
index dcb6d2e988b8..642675fc394a 100644
--- a/fs/hppfs/hppfs_kern.c
+++ b/fs/hppfs/hppfs_kern.c
@@ -572,7 +572,7 @@ struct hppfs_dirent {
 };
 
 static int hppfs_filldir(void *d, const char *name, int size,
-			 loff_t offset, ino_t inode, unsigned int type)
+			 loff_t offset, u64 inode, unsigned int type)
 {
 	struct hppfs_dirent *dirent = d;
 
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 5e03b2f67b93..4ee3f006b861 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -293,7 +293,7 @@ hugetlb_vmtruncate_list(struct prio_tree_root *root, unsigned long h_pgoff)
 		if (h_vm_pgoff >= h_pgoff)
 			v_offset = 0;
 
-		unmap_hugepage_range(vma,
+		__unmap_hugepage_range(vma,
 				vma->vm_start + v_offset, vma->vm_end);
 	}
 }
diff --git a/fs/inode.c b/fs/inode.c
index bf6bec4e54ff..d9a21d122926 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -162,7 +162,7 @@ static struct inode *alloc_inode(struct super_block *sb)
 			bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
 			mapping->backing_dev_info = bdi;
 		}
-		inode->i_private = 0;
+		inode->i_private = NULL;
 		inode->i_mapping = mapping;
 	}
 	return inode;
diff --git a/fs/ioprio.c b/fs/ioprio.c
index 6dc6721d9e82..89e8da112a75 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -150,11 +150,6 @@ int ioprio_best(unsigned short aprio, unsigned short bprio)
 	unsigned short aclass = IOPRIO_PRIO_CLASS(aprio);
 	unsigned short bclass = IOPRIO_PRIO_CLASS(bprio);
 
-	if (!ioprio_valid(aprio))
-		return bprio;
-	if (!ioprio_valid(bprio))
-		return aprio;
-
 	if (aclass == IOPRIO_CLASS_NONE)
 		aclass = IOPRIO_CLASS_BE;
 	if (bclass == IOPRIO_CLASS_NONE)
diff --git a/fs/isofs/joliet.c b/fs/isofs/joliet.c
index 81a90e170ac3..fb8fe7a9ddc6 100644
--- a/fs/isofs/joliet.c
+++ b/fs/isofs/joliet.c
@@ -14,9 +14,9 @@
  * Convert Unicode 16 to UTF-8 or ASCII.
  */
 static int
-uni16_to_x8(unsigned char *ascii, u16 *uni, int len, struct nls_table *nls)
+uni16_to_x8(unsigned char *ascii, __be16 *uni, int len, struct nls_table *nls)
 {
-	wchar_t *ip, ch;
+	__be16 *ip, ch;
 	unsigned char *op;
 
 	ip = uni;
@@ -24,8 +24,8 @@ uni16_to_x8(unsigned char *ascii, u16 *uni, int len, struct nls_table *nls)
 
 	while ((ch = get_unaligned(ip)) && len) {
 		int llen;
-		ch = be16_to_cpu(ch);
-		if ((llen = nls->uni2char(ch, op, NLS_MAX_CHARSET_SIZE)) > 0)
+		llen = nls->uni2char(be16_to_cpu(ch), op, NLS_MAX_CHARSET_SIZE);
+		if (llen > 0)
 			op += llen;
 		else
 			*op++ = '?';
@@ -82,7 +82,7 @@ get_joliet_filename(struct iso_directory_record * de, unsigned char *outname, st
 		len = wcsntombs_be(outname, de->name,
 				   de->name_len[0] >> 1, PAGE_SIZE);
 	} else {
-		len = uni16_to_x8(outname, (u16 *) de->name,
+		len = uni16_to_x8(outname, (__be16 *) de->name,
 				  de->name_len[0] >> 1, nls);
 	}
 	if ((len > 2) && (outname[len-2] == ';') && (outname[len-1] == '1')) {
diff --git a/fs/isofs/namei.c b/fs/isofs/namei.c
index e7ba0c30e071..c04b3a14a3e9 100644
--- a/fs/isofs/namei.c
+++ b/fs/isofs/namei.c
@@ -6,7 +6,6 @@
  * (C) 1991 Linus Torvalds - minix filesystem
  */
 
-#include <linux/config.h> /* Joliet? */
 #include <linux/smp_lock.h>
 #include "isofs.h"
 
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index c518dd8fe60a..b85c686b60db 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -725,6 +725,7 @@ journal_t * journal_init_dev(struct block_device *bdev,
 			__FUNCTION__);
 		kfree(journal);
 		journal = NULL;
+		goto out;
 	}
 	journal->j_dev = bdev;
 	journal->j_fs_dev = fs_dev;
@@ -735,7 +736,7 @@ journal_t * journal_init_dev(struct block_device *bdev,
 	J_ASSERT(bh != NULL);
 	journal->j_sb_buffer = bh;
 	journal->j_superblock = (journal_superblock_t *)bh->b_data;
-
+out:
 	return journal;
 }
 
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index e1b3c8af4d17..d5c63047a8b3 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -1314,13 +1314,14 @@ int journal_stop(handle_t *handle)
 	int old_handle_count, err;
 	pid_t pid;
 
-	J_ASSERT(transaction->t_updates > 0);
 	J_ASSERT(journal_current_handle() == handle);
 
 	if (is_handle_aborted(handle))
 		err = -EIO;
-	else
+	else {
+		J_ASSERT(transaction->t_updates > 0);
 		err = 0;
+	}
 
 	if (--handle->h_ref > 0) {
 		jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1,
diff --git a/fs/jbd2/Makefile b/fs/jbd2/Makefile
new file mode 100644
index 000000000000..802a3413872a
--- /dev/null
+++ b/fs/jbd2/Makefile
@@ -0,0 +1,7 @@
1#
2# Makefile for the linux journaling routines.
3#
4
5obj-$(CONFIG_JBD2) += jbd2.o
6
7jbd2-objs := transaction.o commit.o recovery.o checkpoint.o revoke.o journal.o
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
new file mode 100644
index 000000000000..68039fa9a566
--- /dev/null
+++ b/fs/jbd2/checkpoint.c
@@ -0,0 +1,697 @@
1/*
2 * linux/fs/checkpoint.c
3 *
4 * Written by Stephen C. Tweedie <sct@redhat.com>, 1999
5 *
6 * Copyright 1999 Red Hat Software --- All Rights Reserved
7 *
8 * This file is part of the Linux kernel and is made available under
9 * the terms of the GNU General Public License, version 2, or at your
10 * option, any later version, incorporated herein by reference.
11 *
12 * Checkpoint routines for the generic filesystem journaling code.
13 * Part of the ext2fs journaling system.
14 *
15 * Checkpointing is the process of ensuring that a section of the log is
16 * committed fully to disk, so that that portion of the log can be
17 * reused.
18 */
19
20#include <linux/time.h>
21#include <linux/fs.h>
22#include <linux/jbd2.h>
23#include <linux/errno.h>
24#include <linux/slab.h>
25
26/*
27 * Unlink a buffer from a transaction checkpoint list.
28 *
29 * Called with j_list_lock held.
30 */
31static inline void __buffer_unlink_first(struct journal_head *jh)
32{
33 transaction_t *transaction = jh->b_cp_transaction;
34
35 jh->b_cpnext->b_cpprev = jh->b_cpprev;
36 jh->b_cpprev->b_cpnext = jh->b_cpnext;
37 if (transaction->t_checkpoint_list == jh) {
38 transaction->t_checkpoint_list = jh->b_cpnext;
39 if (transaction->t_checkpoint_list == jh)
40 transaction->t_checkpoint_list = NULL;
41 }
42}
43
44/*
45 * Unlink a buffer from a transaction checkpoint(io) list.
46 *
47 * Called with j_list_lock held.
48 */
49static inline void __buffer_unlink(struct journal_head *jh)
50{
51 transaction_t *transaction = jh->b_cp_transaction;
52
53 __buffer_unlink_first(jh);
54 if (transaction->t_checkpoint_io_list == jh) {
55 transaction->t_checkpoint_io_list = jh->b_cpnext;
56 if (transaction->t_checkpoint_io_list == jh)
57 transaction->t_checkpoint_io_list = NULL;
58 }
59}
60
61/*
62 * Move a buffer from the checkpoint list to the checkpoint io list
63 *
64 * Called with j_list_lock held
65 */
66static inline void __buffer_relink_io(struct journal_head *jh)
67{
68 transaction_t *transaction = jh->b_cp_transaction;
69
70 __buffer_unlink_first(jh);
71
72 if (!transaction->t_checkpoint_io_list) {
73 jh->b_cpnext = jh->b_cpprev = jh;
74 } else {
75 jh->b_cpnext = transaction->t_checkpoint_io_list;
76 jh->b_cpprev = transaction->t_checkpoint_io_list->b_cpprev;
77 jh->b_cpprev->b_cpnext = jh;
78 jh->b_cpnext->b_cpprev = jh;
79 }
80 transaction->t_checkpoint_io_list = jh;
81}
82
83/*
84 * Try to release a checkpointed buffer from its transaction.
85 * Returns 1 if we released it and 2 if we also released the
86 * whole transaction.
87 *
88 * Requires j_list_lock
89 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
90 */
91static int __try_to_free_cp_buf(struct journal_head *jh)
92{
93 int ret = 0;
94 struct buffer_head *bh = jh2bh(jh);
95
96 if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh)) {
97 JBUFFER_TRACE(jh, "remove from checkpoint list");
98 ret = __jbd2_journal_remove_checkpoint(jh) + 1;
99 jbd_unlock_bh_state(bh);
100 jbd2_journal_remove_journal_head(bh);
101 BUFFER_TRACE(bh, "release");
102 __brelse(bh);
103 } else {
104 jbd_unlock_bh_state(bh);
105 }
106 return ret;
107}
108
109/*
110 * __jbd2_log_wait_for_space: wait until there is space in the journal.
111 *
112 * Called under j_state_lock *only*. It will be unlocked if we have to wait
113 * for a checkpoint to free up some space in the log.
114 */
115void __jbd2_log_wait_for_space(journal_t *journal)
116{
117 int nblocks;
118 assert_spin_locked(&journal->j_state_lock);
119
120 nblocks = jbd_space_needed(journal);
121 while (__jbd2_log_space_left(journal) < nblocks) {
122 if (journal->j_flags & JBD2_ABORT)
123 return;
124 spin_unlock(&journal->j_state_lock);
125 mutex_lock(&journal->j_checkpoint_mutex);
126
127 /*
128 * Test again, another process may have checkpointed while we
129 * were waiting for the checkpoint lock
130 */
131 spin_lock(&journal->j_state_lock);
132 nblocks = jbd_space_needed(journal);
133 if (__jbd2_log_space_left(journal) < nblocks) {
134 spin_unlock(&journal->j_state_lock);
135 jbd2_log_do_checkpoint(journal);
136 spin_lock(&journal->j_state_lock);
137 }
138 mutex_unlock(&journal->j_checkpoint_mutex);
139 }
140}
141
142/*
143 * We were unable to perform jbd_trylock_bh_state() inside j_list_lock.
144 * The caller must restart a list walk. Wait for someone else to run
145 * jbd_unlock_bh_state().
146 */
147static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh)
148 __releases(journal->j_list_lock)
149{
150 get_bh(bh);
151 spin_unlock(&journal->j_list_lock);
152 jbd_lock_bh_state(bh);
153 jbd_unlock_bh_state(bh);
154 put_bh(bh);
155}
156
157/*
158 * Clean up transaction's list of buffers submitted for io.
159 * We wait for any pending IO to complete and remove any clean
160 * buffers. Note that we take the buffers in the opposite ordering
161 * from the one in which they were submitted for IO.
162 *
163 * Called with j_list_lock held.
164 */
165static void __wait_cp_io(journal_t *journal, transaction_t *transaction)
166{
167 struct journal_head *jh;
168 struct buffer_head *bh;
169 tid_t this_tid;
170 int released = 0;
171
172 this_tid = transaction->t_tid;
173restart:
174 /* Did somebody clean up the transaction in the meanwhile? */
175 if (journal->j_checkpoint_transactions != transaction ||
176 transaction->t_tid != this_tid)
177 return;
178 while (!released && transaction->t_checkpoint_io_list) {
179 jh = transaction->t_checkpoint_io_list;
180 bh = jh2bh(jh);
181 if (!jbd_trylock_bh_state(bh)) {
182 jbd_sync_bh(journal, bh);
183 spin_lock(&journal->j_list_lock);
184 goto restart;
185 }
186 if (buffer_locked(bh)) {
187 atomic_inc(&bh->b_count);
188 spin_unlock(&journal->j_list_lock);
189 jbd_unlock_bh_state(bh);
190 wait_on_buffer(bh);
191 /* the journal_head may have gone by now */
192 BUFFER_TRACE(bh, "brelse");
193 __brelse(bh);
194 spin_lock(&journal->j_list_lock);
195 goto restart;
196 }
197 /*
198 * Now in whatever state the buffer currently is, we know that
199 * it has been written out and so we can drop it from the list
200 */
201 released = __jbd2_journal_remove_checkpoint(jh);
202 jbd_unlock_bh_state(bh);
203 jbd2_journal_remove_journal_head(bh);
204 __brelse(bh);
205 }
206}
207
208#define NR_BATCH 64
209
210static void
211__flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
212{
213 int i;
214
215 ll_rw_block(SWRITE, *batch_count, bhs);
216 for (i = 0; i < *batch_count; i++) {
217 struct buffer_head *bh = bhs[i];
218 clear_buffer_jwrite(bh);
219 BUFFER_TRACE(bh, "brelse");
220 __brelse(bh);
221 }
222 *batch_count = 0;
223}
224
225/*
226 * Try to flush one buffer from the checkpoint list to disk.
227 *
228 * Return 1 if something happened which requires us to abort the current
229 * scan of the checkpoint list.
230 *
231 * Called with j_list_lock held and drops it if 1 is returned
232 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
233 */
234static int __process_buffer(journal_t *journal, struct journal_head *jh,
235 struct buffer_head **bhs, int *batch_count)
236{
237 struct buffer_head *bh = jh2bh(jh);
238 int ret = 0;
239
240 if (buffer_locked(bh)) {
241 atomic_inc(&bh->b_count);
242 spin_unlock(&journal->j_list_lock);
243 jbd_unlock_bh_state(bh);
244 wait_on_buffer(bh);
245 /* the journal_head may have gone by now */
246 BUFFER_TRACE(bh, "brelse");
247 __brelse(bh);
248 ret = 1;
249 } else if (jh->b_transaction != NULL) {
250 transaction_t *t = jh->b_transaction;
251 tid_t tid = t->t_tid;
252
253 spin_unlock(&journal->j_list_lock);
254 jbd_unlock_bh_state(bh);
255 jbd2_log_start_commit(journal, tid);
256 jbd2_log_wait_commit(journal, tid);
257 ret = 1;
258 } else if (!buffer_dirty(bh)) {
259 J_ASSERT_JH(jh, !buffer_jbddirty(bh));
260 BUFFER_TRACE(bh, "remove from checkpoint");
261 __jbd2_journal_remove_checkpoint(jh);
262 spin_unlock(&journal->j_list_lock);
263 jbd_unlock_bh_state(bh);
264 jbd2_journal_remove_journal_head(bh);
265 __brelse(bh);
266 ret = 1;
267 } else {
268 /*
269 * Important: we are about to write the buffer, and
270 * possibly block, while still holding the journal lock.
271 * We cannot afford to let the transaction logic start
272 * messing around with this buffer before we write it to
273 * disk, as that would break recoverability.
274 */
275 BUFFER_TRACE(bh, "queue");
276 get_bh(bh);
277 J_ASSERT_BH(bh, !buffer_jwrite(bh));
278 set_buffer_jwrite(bh);
279 bhs[*batch_count] = bh;
280 __buffer_relink_io(jh);
281 jbd_unlock_bh_state(bh);
282 (*batch_count)++;
283 if (*batch_count == NR_BATCH) {
284 spin_unlock(&journal->j_list_lock);
285 __flush_batch(journal, bhs, batch_count);
286 ret = 1;
287 }
288 }
289 return ret;
290}
291
292/*
293 * Perform an actual checkpoint. We take the first transaction on the
294 * list of transactions to be checkpointed and send all its buffers
295 * to disk. We submit larger chunks of data at once.
296 *
297 * The journal should be locked before calling this function.
298 */
299int jbd2_log_do_checkpoint(journal_t *journal)
300{
301 transaction_t *transaction;
302 tid_t this_tid;
303 int result;
304
305 jbd_debug(1, "Start checkpoint\n");
306
307 /*
308 * First thing: if there are any transactions in the log which
309 * don't need checkpointing, just eliminate them from the
310 * journal straight away.
311 */
312 result = jbd2_cleanup_journal_tail(journal);
313 jbd_debug(1, "cleanup_journal_tail returned %d\n", result);
314 if (result <= 0)
315 return result;
316
317 /*
318 * OK, we need to start writing disk blocks. Take one transaction
319 * and write it.
320 */
321 spin_lock(&journal->j_list_lock);
322 if (!journal->j_checkpoint_transactions)
323 goto out;
324 transaction = journal->j_checkpoint_transactions;
325 this_tid = transaction->t_tid;
326restart:
327 /*
328 * If someone cleaned up this transaction while we slept, we're
329 * done (maybe it's a new transaction, but it fell at the same
330 * address).
331 */
332 if (journal->j_checkpoint_transactions == transaction &&
333 transaction->t_tid == this_tid) {
334 int batch_count = 0;
335 struct buffer_head *bhs[NR_BATCH];
336 struct journal_head *jh;
337 int retry = 0;
338
339 while (!retry && transaction->t_checkpoint_list) {
340 struct buffer_head *bh;
341
342 jh = transaction->t_checkpoint_list;
343 bh = jh2bh(jh);
344 if (!jbd_trylock_bh_state(bh)) {
345 jbd_sync_bh(journal, bh);
346 retry = 1;
347 break;
348 }
349 retry = __process_buffer(journal, jh, bhs,&batch_count);
350 if (!retry && lock_need_resched(&journal->j_list_lock)){
351 spin_unlock(&journal->j_list_lock);
352 retry = 1;
353 break;
354 }
355 }
356
357 if (batch_count) {
358 if (!retry) {
359 spin_unlock(&journal->j_list_lock);
360 retry = 1;
361 }
362 __flush_batch(journal, bhs, &batch_count);
363 }
364
365 if (retry) {
366 spin_lock(&journal->j_list_lock);
367 goto restart;
368 }
369 /*
370 * Now we have cleaned up the first transaction's checkpoint
371 * list. Let's clean up the second one
372 */
373 __wait_cp_io(journal, transaction);
374 }
375out:
376 spin_unlock(&journal->j_list_lock);
377 result = jbd2_cleanup_journal_tail(journal);
378 if (result < 0)
379 return result;
380 return 0;
381}
382
383/*
384 * Check the list of checkpoint transactions for the journal to see if
385 * we have already got rid of any since the last update of the log tail
386 * in the journal superblock. If so, we can instantly roll the
387 * superblock forward to remove those transactions from the log.
388 *
389 * Return <0 on error, 0 on success, 1 if there was nothing to clean up.
390 *
391 * Called with the journal lock held.
392 *
393 * This is the only part of the journaling code which really needs to be
394 * aware of transaction aborts. Checkpointing involves writing to the
395 * main filesystem area rather than to the journal, so it can proceed
396 * even in abort state, but we must not update the journal superblock if
397 * we have an abort error outstanding.
398 */
399
400int jbd2_cleanup_journal_tail(journal_t *journal)
401{
402 transaction_t * transaction;
403 tid_t first_tid;
404 unsigned long blocknr, freed;
405
406 /* OK, work out the oldest transaction remaining in the log, and
407 * the log block it starts at.
408 *
409 * If the log is now empty, we need to work out which is the
410 * next transaction ID we will write, and where it will
411 * start. */
412
413 spin_lock(&journal->j_state_lock);
414 spin_lock(&journal->j_list_lock);
415 transaction = journal->j_checkpoint_transactions;
416 if (transaction) {
417 first_tid = transaction->t_tid;
418 blocknr = transaction->t_log_start;
419 } else if ((transaction = journal->j_committing_transaction) != NULL) {
420 first_tid = transaction->t_tid;
421 blocknr = transaction->t_log_start;
422 } else if ((transaction = journal->j_running_transaction) != NULL) {
423 first_tid = transaction->t_tid;
424 blocknr = journal->j_head;
425 } else {
426 first_tid = journal->j_transaction_sequence;
427 blocknr = journal->j_head;
428 }
429 spin_unlock(&journal->j_list_lock);
430 J_ASSERT(blocknr != 0);
431
432 /* If the oldest pinned transaction is at the tail of the log
433 already then there's not much we can do right now. */
434 if (journal->j_tail_sequence == first_tid) {
435 spin_unlock(&journal->j_state_lock);
436 return 1;
437 }
438
439 /* OK, update the superblock to recover the freed space.
440 * Physical blocks come first: have we wrapped beyond the end of
441 * the log? */
442 freed = blocknr - journal->j_tail;
443 if (blocknr < journal->j_tail)
444 freed = freed + journal->j_last - journal->j_first;
445
446 jbd_debug(1,
447 "Cleaning journal tail from %d to %d (offset %lu), "
448 "freeing %lu\n",
449 journal->j_tail_sequence, first_tid, blocknr, freed);
450
451 journal->j_free += freed;
452 journal->j_tail_sequence = first_tid;
453 journal->j_tail = blocknr;
454 spin_unlock(&journal->j_state_lock);
455 if (!(journal->j_flags & JBD2_ABORT))
456 jbd2_journal_update_superblock(journal, 1);
457 return 0;
458}
459
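/*
 * Worked example (editorial, not in the original file) of the wrap-around
 * arithmetic above: with j_first = 1, j_last = 1024, j_tail = 1000 and a
 * new tail at blocknr = 8, "freed = blocknr - j_tail" underflows, and
 * adding j_last - j_first = 1023 corrects it to 31 blocks: 24 from block
 * 1000 up to j_last, plus 7 from j_first up to block 8.
 */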
460
461/* Checkpoint list management */
462
463/*
464 * journal_clean_one_cp_list
465 *
466 * Find all the written-back checkpoint buffers in the given list and release them.
467 *
468 * Called with the journal locked.
469 * Called with j_list_lock held.
470 * Returns the number of buffers reaped (for debug)
471 */
472
473static int journal_clean_one_cp_list(struct journal_head *jh, int *released)
474{
475 struct journal_head *last_jh;
476 struct journal_head *next_jh = jh;
477 int ret, freed = 0;
478
479 *released = 0;
480 if (!jh)
481 return 0;
482
483 last_jh = jh->b_cpprev;
484 do {
485 jh = next_jh;
486 next_jh = jh->b_cpnext;
487 /* Use trylock because of the ranking */
488 if (jbd_trylock_bh_state(jh2bh(jh))) {
489 ret = __try_to_free_cp_buf(jh);
490 if (ret) {
491 freed++;
492 if (ret == 2) {
493 *released = 1;
494 return freed;
495 }
496 }
497 }
498 /*
499 * This function only frees up some memory
500 * if possible so we don't have an obligation
501 * to finish processing. Bail out if preemption
502 * requested:
503 */
504 if (need_resched())
505 return freed;
506 } while (jh != last_jh);
507
508 return freed;
509}
510
511/*
512 * journal_clean_checkpoint_list
513 *
514 * Find all the written-back checkpoint buffers in the journal and release them.
515 *
516 * Called with the journal locked.
517 * Called with j_list_lock held.
518 * Returns number of buffers reaped (for debug)
519 */
520
521int __jbd2_journal_clean_checkpoint_list(journal_t *journal)
522{
523 transaction_t *transaction, *last_transaction, *next_transaction;
524 int ret = 0;
525 int released;
526
527 transaction = journal->j_checkpoint_transactions;
528 if (!transaction)
529 goto out;
530
531 last_transaction = transaction->t_cpprev;
532 next_transaction = transaction;
533 do {
534 transaction = next_transaction;
535 next_transaction = transaction->t_cpnext;
536 ret += journal_clean_one_cp_list(transaction->
537 t_checkpoint_list, &released);
538 /*
539 * This function only frees up some memory if possible, so we
540 * don't have an obligation to finish processing. Bail out if
541 * preemption is requested:
542 */
543 if (need_resched())
544 goto out;
545 if (released)
546 continue;
547 /*
548 * It is essential that we are as careful removing the buffer
549 * from this list as in the t_checkpoint_list case, since we may
550 * see buffers on io_list that have not yet been submitted
551 */
552 ret += journal_clean_one_cp_list(transaction->
553 t_checkpoint_io_list, &released);
554 if (need_resched())
555 goto out;
556 } while (transaction != last_transaction);
557out:
558 return ret;
559}
560
561/*
562 * journal_remove_checkpoint: called after a buffer has been committed
563 * to disk (either by being write-back flushed to disk, or being
564 * committed to the log).
565 *
566 * We cannot safely clean a transaction out of the log until all of the
567 * buffer updates committed in that transaction have safely been stored
568 * elsewhere on disk. To achieve this, all of the buffers in a
569 * transaction need to be maintained on the transaction's checkpoint
570 * lists until they have been rewritten, at which point this function is
571 * called to remove the buffer from the existing transaction's
572 * checkpoint lists.
573 *
574 * The function returns 1 if it frees the transaction, 0 otherwise.
575 *
576 * This function is called with the journal locked.
577 * This function is called with j_list_lock held.
578 * This function is called with jbd_lock_bh_state(jh2bh(jh)) held.
579 */
580
581int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
582{
583 transaction_t *transaction;
584 journal_t *journal;
585 int ret = 0;
586
587 JBUFFER_TRACE(jh, "entry");
588
589 if ((transaction = jh->b_cp_transaction) == NULL) {
590 JBUFFER_TRACE(jh, "not on transaction");
591 goto out;
592 }
593 journal = transaction->t_journal;
594
595 __buffer_unlink(jh);
596 jh->b_cp_transaction = NULL;
597
598 if (transaction->t_checkpoint_list != NULL ||
599 transaction->t_checkpoint_io_list != NULL)
600 goto out;
601 JBUFFER_TRACE(jh, "transaction has no more buffers");
602
603 /*
604 * There is one special case to worry about: if we have just pulled the
605 * buffer off a committing transaction's forget list, then even if the
606 * checkpoint list is empty, the transaction obviously cannot be
607 * dropped!
608 *
609 * The locking here around j_committing_transaction is a bit sleazy.
610 * See the comment at the end of jbd2_journal_commit_transaction().
611 */
612 if (transaction == journal->j_committing_transaction) {
613 JBUFFER_TRACE(jh, "belongs to committing transaction");
614 goto out;
615 }
616
617 /* OK, that was the last buffer for the transaction: we can now
618 safely remove this transaction from the log */
619
620 __jbd2_journal_drop_transaction(journal, transaction);
621
622 /* Just in case anybody was waiting for more transactions to be
623 checkpointed... */
624 wake_up(&journal->j_wait_logspace);
625 ret = 1;
626out:
627 JBUFFER_TRACE(jh, "exit");
628 return ret;
629}
630
631/*
632 * journal_insert_checkpoint: put a committed buffer onto a checkpoint
633 * list so that we know when it is safe to clean the transaction out of
634 * the log.
635 *
636 * Called with the journal locked.
637 * Called with j_list_lock held.
638 */
639void __jbd2_journal_insert_checkpoint(struct journal_head *jh,
640 transaction_t *transaction)
641{
642 JBUFFER_TRACE(jh, "entry");
643 J_ASSERT_JH(jh, buffer_dirty(jh2bh(jh)) || buffer_jbddirty(jh2bh(jh)));
644 J_ASSERT_JH(jh, jh->b_cp_transaction == NULL);
645
646 jh->b_cp_transaction = transaction;
647
648 if (!transaction->t_checkpoint_list) {
649 jh->b_cpnext = jh->b_cpprev = jh;
650 } else {
651 jh->b_cpnext = transaction->t_checkpoint_list;
652 jh->b_cpprev = transaction->t_checkpoint_list->b_cpprev;
653 jh->b_cpprev->b_cpnext = jh;
654 jh->b_cpnext->b_cpprev = jh;
655 }
656 transaction->t_checkpoint_list = jh;
657}
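
The insertion above links the new journal_head into a circular doubly-linked
list in front of the current head, then makes it the head. A self-contained
userspace sketch of the same pattern follows; the node type, field names and
the main() test values are invented for illustration.

#include <assert.h>
#include <stddef.h>
#include <stdio.h>

struct node {
        int id;
        struct node *next, *prev;
};

static void insert_head(struct node **head, struct node *n)
{
        if (!*head) {
                n->next = n->prev = n;          /* singleton circle */
        } else {
                n->next = *head;
                n->prev = (*head)->prev;        /* old tail of the circle */
                n->prev->next = n;
                n->next->prev = n;
        }
        *head = n;                              /* new node becomes the head */
}

int main(void)
{
        struct node a = { .id = 1 }, b = { .id = 2 };
        struct node *head = NULL;

        insert_head(&head, &a);
        insert_head(&head, &b);
        assert(head == &b && head->next == &a && head->prev == &a);
        assert(a.next == &b && a.prev == &b);   /* two-element circle */
        printf("circular insert ok\n");
        return 0;
}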
658
659/*
660 * We've finished with this transaction structure: adios...
661 *
662 * The transaction must have no links except for the checkpoint by this
663 * point.
664 *
665 * Called with the journal locked.
666 * Called with j_list_lock held.
667 */
668
669void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transaction)
670{
671 assert_spin_locked(&journal->j_list_lock);
672 if (transaction->t_cpnext) {
673 transaction->t_cpnext->t_cpprev = transaction->t_cpprev;
674 transaction->t_cpprev->t_cpnext = transaction->t_cpnext;
675 if (journal->j_checkpoint_transactions == transaction)
676 journal->j_checkpoint_transactions =
677 transaction->t_cpnext;
678 if (journal->j_checkpoint_transactions == transaction)
679 journal->j_checkpoint_transactions = NULL;
680 }
681
682 J_ASSERT(transaction->t_state == T_FINISHED);
683 J_ASSERT(transaction->t_buffers == NULL);
684 J_ASSERT(transaction->t_sync_datalist == NULL);
685 J_ASSERT(transaction->t_forget == NULL);
686 J_ASSERT(transaction->t_iobuf_list == NULL);
687 J_ASSERT(transaction->t_shadow_list == NULL);
688 J_ASSERT(transaction->t_log_list == NULL);
689 J_ASSERT(transaction->t_checkpoint_list == NULL);
690 J_ASSERT(transaction->t_checkpoint_io_list == NULL);
691 J_ASSERT(transaction->t_updates == 0);
692 J_ASSERT(journal->j_committing_transaction != transaction);
693 J_ASSERT(journal->j_running_transaction != transaction);
694
695 jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid);
696 kfree(transaction);
697}
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
new file mode 100644
index 000000000000..70b2ae1ef281
--- /dev/null
+++ b/fs/jbd2/commit.c
@@ -0,0 +1,920 @@
1/*
2 * linux/fs/jbd2/commit.c
3 *
4 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
5 *
6 * Copyright 1998 Red Hat corp --- All Rights Reserved
7 *
8 * This file is part of the Linux kernel and is made available under
9 * the terms of the GNU General Public License, version 2, or at your
10 * option, any later version, incorporated herein by reference.
11 *
12 * Journal commit routines for the generic filesystem journaling code;
13 * part of the ext2fs journaling system.
14 */
15
16#include <linux/time.h>
17#include <linux/fs.h>
18#include <linux/jbd2.h>
19#include <linux/errno.h>
20#include <linux/slab.h>
21#include <linux/mm.h>
22#include <linux/pagemap.h>
23#include <linux/smp_lock.h>
24
25/*
26 * Default IO end handler for temporary BJ_IO buffer_heads.
27 */
28static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
29{
30 BUFFER_TRACE(bh, "");
31 if (uptodate)
32 set_buffer_uptodate(bh);
33 else
34 clear_buffer_uptodate(bh);
35 unlock_buffer(bh);
36}
37
38/*
39 * When an ext3-ordered file is truncated, it is possible that many pages are
40 * not successfully freed, because they are attached to a committing transaction.
41 * After the transaction commits, these pages are left on the LRU, with no
42 * ->mapping, and with attached buffers. These pages are trivially reclaimable
43 * by the VM, but their apparent absence upsets the VM accounting, and it makes
44 * the numbers in /proc/meminfo look odd.
45 *
46 * So here, we have a buffer which has just come off the forget list. Look to
47 * see if we can strip all buffers from the backing page.
48 *
49 * Called under lock_journal(), and possibly under journal_datalist_lock. The
50 * caller provided us with a ref against the buffer, and we drop that here.
51 */
52static void release_buffer_page(struct buffer_head *bh)
53{
54 struct page *page;
55
56 if (buffer_dirty(bh))
57 goto nope;
58 if (atomic_read(&bh->b_count) != 1)
59 goto nope;
60 page = bh->b_page;
61 if (!page)
62 goto nope;
63 if (page->mapping)
64 goto nope;
65
66 /* OK, it's a truncated page */
67 if (TestSetPageLocked(page))
68 goto nope;
69
70 page_cache_get(page);
71 __brelse(bh);
72 try_to_free_buffers(page);
73 unlock_page(page);
74 page_cache_release(page);
75 return;
76
77nope:
78 __brelse(bh);
79}
80
81/*
82 * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
83 * held. For ranking reasons we must trylock. If we lose, schedule away and
84 * return 0. j_list_lock is dropped in this case.
85 */
86static int inverted_lock(journal_t *journal, struct buffer_head *bh)
87{
88 if (!jbd_trylock_bh_state(bh)) {
89 spin_unlock(&journal->j_list_lock);
90 schedule();
91 return 0;
92 }
93 return 1;
94}
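
inverted_lock() acquires the bh_state lock against the documented lock ranking,
so it must trylock and, on failure, drop j_list_lock and let the other side
run. A hedged pthread sketch of the same back-off pattern; the mutexes and
names here are invented userspace stand-ins, not the kernel locking API.

#include <pthread.h>
#include <sched.h>
#include <stdio.h>

static pthread_mutex_t state_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t list_lock  = PTHREAD_MUTEX_INITIALIZER;

/* Returns 1 with both locks held; returns 0 after dropping list_lock. */
static int inverted_lock_sketch(void)
{
        if (pthread_mutex_trylock(&state_lock) != 0) {
                pthread_mutex_unlock(&list_lock);  /* back off, no deadlock */
                sched_yield();                     /* let the holder finish */
                return 0;
        }
        return 1;
}

int main(void)
{
        pthread_mutex_lock(&list_lock);
        if (inverted_lock_sketch()) {
                printf("acquired out-of-order lock safely\n");
                pthread_mutex_unlock(&state_lock);
                pthread_mutex_unlock(&list_lock);
        }
        return 0;
}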
95
96/* Done it all: now write the commit record. We should have
97 * cleaned up our previous buffers by now, so if we are in abort
98 * mode we can now just skip the rest of the journal write
99 * entirely.
100 *
101 * Returns 1 if the journal needs to be aborted or 0 on success
102 */
103static int journal_write_commit_record(journal_t *journal,
104 transaction_t *commit_transaction)
105{
106 struct journal_head *descriptor;
107 struct buffer_head *bh;
108 int i, ret;
109 int barrier_done = 0;
110
111 if (is_journal_aborted(journal))
112 return 0;
113
114 descriptor = jbd2_journal_get_descriptor_buffer(journal);
115 if (!descriptor)
116 return 1;
117
118 bh = jh2bh(descriptor);
119
120 /* Write a commit header at each 512-byte boundary of the block */
121 for (i = 0; i < bh->b_size; i += 512) {
122 journal_header_t *tmp = (journal_header_t *)(bh->b_data + i);
123 tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
124 tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
125 tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
126 }
127
128 JBUFFER_TRACE(descriptor, "write commit block");
129 set_buffer_dirty(bh);
130 if (journal->j_flags & JBD2_BARRIER) {
131 set_buffer_ordered(bh);
132 barrier_done = 1;
133 }
134 ret = sync_dirty_buffer(bh);
135 /* is it possible for another commit to fail at roughly
136 * the same time as this one? If so, we don't want to
137 * trust the barrier flag in the super, but instead want
138 * to remember if we sent a barrier request
139 */
140 if (ret == -EOPNOTSUPP && barrier_done) {
141 char b[BDEVNAME_SIZE];
142
143 printk(KERN_WARNING
144 "JBD: barrier-based sync failed on %s - "
145 "disabling barriers\n",
146 bdevname(journal->j_dev, b));
147 spin_lock(&journal->j_state_lock);
148 journal->j_flags &= ~JBD2_BARRIER;
149 spin_unlock(&journal->j_state_lock);
150
151 /* And try again, without the barrier */
152 clear_buffer_ordered(bh);
153 set_buffer_uptodate(bh);
154 set_buffer_dirty(bh);
155 ret = sync_dirty_buffer(bh);
156 }
157 put_bh(bh); /* One for getblk() */
158 jbd2_journal_put_journal_head(descriptor);
159
160 return (ret == -EIO);
161}
162
163static void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
164{
165 int i;
166
167 for (i = 0; i < bufs; i++) {
168 wbuf[i]->b_end_io = end_buffer_write_sync;
169 /* We use up our safety reference in submit_bh() */
170 submit_bh(WRITE, wbuf[i]);
171 }
172}
173
174/*
175 * Submit all the data buffers to disk
176 */
177static void journal_submit_data_buffers(journal_t *journal,
178 transaction_t *commit_transaction)
179{
180 struct journal_head *jh;
181 struct buffer_head *bh;
182 int locked;
183 int bufs = 0;
184 struct buffer_head **wbuf = journal->j_wbuf;
185
186 /*
187 * Whenever we unlock the journal and sleep, things can get added
188 * onto ->t_sync_datalist, so we have to keep looping back to
189 * write_out_data until we *know* that the list is empty.
190 *
191 * Clean up any flushed data buffers from the data list. Even in
192 * abort mode, we want to flush this out as soon as possible.
193 */
194write_out_data:
195 cond_resched();
196 spin_lock(&journal->j_list_lock);
197
198 while (commit_transaction->t_sync_datalist) {
199 jh = commit_transaction->t_sync_datalist;
200 bh = jh2bh(jh);
201 locked = 0;
202
203 /* Get reference just to make sure buffer does not disappear
204 * when we are forced to drop various locks */
205 get_bh(bh);
206 /* If the buffer is dirty, we need to submit IO and hence
207 * we need the buffer lock. We try to lock the buffer without
208 * blocking. If we fail, we need to drop j_list_lock and do
209 * blocking lock_buffer().
210 */
211 if (buffer_dirty(bh)) {
212 if (test_set_buffer_locked(bh)) {
213 BUFFER_TRACE(bh, "needs blocking lock");
214 spin_unlock(&journal->j_list_lock);
215 /* Write out all data to prevent deadlocks */
216 journal_do_submit_data(wbuf, bufs);
217 bufs = 0;
218 lock_buffer(bh);
219 spin_lock(&journal->j_list_lock);
220 }
221 locked = 1;
222 }
223 /* We have to get bh_state lock. Again out of order, sigh. */
224 if (!inverted_lock(journal, bh)) {
225 jbd_lock_bh_state(bh);
226 spin_lock(&journal->j_list_lock);
227 }
228 /* Someone already cleaned up the buffer? */
229 if (!buffer_jbd(bh)
230 || jh->b_transaction != commit_transaction
231 || jh->b_jlist != BJ_SyncData) {
232 jbd_unlock_bh_state(bh);
233 if (locked)
234 unlock_buffer(bh);
235 BUFFER_TRACE(bh, "already cleaned up");
236 put_bh(bh);
237 continue;
238 }
239 if (locked && test_clear_buffer_dirty(bh)) {
240 BUFFER_TRACE(bh, "needs writeout, adding to array");
241 wbuf[bufs++] = bh;
242 __jbd2_journal_file_buffer(jh, commit_transaction,
243 BJ_Locked);
244 jbd_unlock_bh_state(bh);
245 if (bufs == journal->j_wbufsize) {
246 spin_unlock(&journal->j_list_lock);
247 journal_do_submit_data(wbuf, bufs);
248 bufs = 0;
249 goto write_out_data;
250 }
251 }
252 else {
253 BUFFER_TRACE(bh, "writeout complete: unfile");
254 __jbd2_journal_unfile_buffer(jh);
255 jbd_unlock_bh_state(bh);
256 if (locked)
257 unlock_buffer(bh);
258 jbd2_journal_remove_journal_head(bh);
259 /* Once for our safety reference, once for
260 * jbd2_journal_remove_journal_head() */
261 put_bh(bh);
262 put_bh(bh);
263 }
264
265 if (lock_need_resched(&journal->j_list_lock)) {
266 spin_unlock(&journal->j_list_lock);
267 goto write_out_data;
268 }
269 }
270 spin_unlock(&journal->j_list_lock);
271 journal_do_submit_data(wbuf, bufs);
272}
273
274static inline void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
275 unsigned long long block)
276{
277 tag->t_blocknr = cpu_to_be32(block & (u32)~0);
278 if (tag_bytes > JBD_TAG_SIZE32)
279 tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
280}
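
write_tag_block() above stores a 64-bit journal block number as two 32-bit
on-disk words; the `(block >> 31) >> 1` idiom keeps the shift well-defined
even where the operand type is only 32 bits wide. A standalone sketch of the
split and reassembly, with byte-order conversion omitted and a hypothetical
block number:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        unsigned long long block = 0x123456789AULL;     /* hypothetical */
        uint32_t lo = (uint32_t)(block & 0xFFFFFFFFu);
        uint32_t hi = (uint32_t)((block >> 31) >> 1);   /* == block >> 32 */

        /* the two halves round-trip back to the original number */
        assert((((unsigned long long)hi << 32) | lo) == block);
        printf("lo=0x%08x hi=0x%08x\n", (unsigned)lo, (unsigned)hi);
        return 0;
}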
281
282/*
283 * jbd2_journal_commit_transaction
284 *
285 * The primary function for committing a transaction to the log. This
286 * function is called by the journal thread to begin a complete commit.
287 */
288void jbd2_journal_commit_transaction(journal_t *journal)
289{
290 transaction_t *commit_transaction;
291 struct journal_head *jh, *new_jh, *descriptor;
292 struct buffer_head **wbuf = journal->j_wbuf;
293 int bufs;
294 int flags;
295 int err;
296 unsigned long long blocknr;
297 char *tagp = NULL;
298 journal_header_t *header;
299 journal_block_tag_t *tag = NULL;
300 int space_left = 0;
301 int first_tag = 0;
302 int tag_flag;
303 int i;
304 int tag_bytes = journal_tag_bytes(journal);
305
306 /*
307 * First job: lock down the current transaction and wait for
308 * all outstanding updates to complete.
309 */
310
311#ifdef COMMIT_STATS
312 spin_lock(&journal->j_list_lock);
313 summarise_journal_usage(journal);
314 spin_unlock(&journal->j_list_lock);
315#endif
316
317 /* Do we need to erase the effects of a prior jbd2_journal_flush? */
318 if (journal->j_flags & JBD2_FLUSHED) {
319 jbd_debug(3, "super block updated\n");
320 jbd2_journal_update_superblock(journal, 1);
321 } else {
322 jbd_debug(3, "superblock not updated\n");
323 }
324
325 J_ASSERT(journal->j_running_transaction != NULL);
326 J_ASSERT(journal->j_committing_transaction == NULL);
327
328 commit_transaction = journal->j_running_transaction;
329 J_ASSERT(commit_transaction->t_state == T_RUNNING);
330
331 jbd_debug(1, "JBD: starting commit of transaction %d\n",
332 commit_transaction->t_tid);
333
334 spin_lock(&journal->j_state_lock);
335 commit_transaction->t_state = T_LOCKED;
336
337 spin_lock(&commit_transaction->t_handle_lock);
338 while (commit_transaction->t_updates) {
339 DEFINE_WAIT(wait);
340
341 prepare_to_wait(&journal->j_wait_updates, &wait,
342 TASK_UNINTERRUPTIBLE);
343 if (commit_transaction->t_updates) {
344 spin_unlock(&commit_transaction->t_handle_lock);
345 spin_unlock(&journal->j_state_lock);
346 schedule();
347 spin_lock(&journal->j_state_lock);
348 spin_lock(&commit_transaction->t_handle_lock);
349 }
350 finish_wait(&journal->j_wait_updates, &wait);
351 }
352 spin_unlock(&commit_transaction->t_handle_lock);
353
354 J_ASSERT (commit_transaction->t_outstanding_credits <=
355 journal->j_max_transaction_buffers);
356
357 /*
358 * First thing we are allowed to do is to discard any remaining
359 * BJ_Reserved buffers. Note, it is _not_ permissible to assume
360 * that there are no such buffers: if a large filesystem
361 * operation like a truncate needs to split itself over multiple
362 * transactions, then it may try to do a jbd2_journal_restart() while
363 * there are still BJ_Reserved buffers outstanding. These must
364 * be released cleanly from the current transaction.
365 *
366 * In this case, the filesystem must still reserve write access
367 * again before modifying the buffer in the new transaction, but
368 * we do not require it to remember exactly which old buffers it
369 * has reserved. This is consistent with the existing behaviour
370 * that multiple jbd2_journal_get_write_access() calls to the same
371 * buffer are perfectly permissible.
372 */
373 while (commit_transaction->t_reserved_list) {
374 jh = commit_transaction->t_reserved_list;
375 JBUFFER_TRACE(jh, "reserved, unused: refile");
376 /*
377 * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
378 * leave undo-committed data.
379 */
380 if (jh->b_committed_data) {
381 struct buffer_head *bh = jh2bh(jh);
382
383 jbd_lock_bh_state(bh);
384 jbd2_slab_free(jh->b_committed_data, bh->b_size);
385 jh->b_committed_data = NULL;
386 jbd_unlock_bh_state(bh);
387 }
388 jbd2_journal_refile_buffer(journal, jh);
389 }
390
391 /*
392 * Now try to drop any written-back buffers from the journal's
393 * checkpoint lists. We do this *before* commit because it potentially
394 * frees some memory
395 */
396 spin_lock(&journal->j_list_lock);
397 __jbd2_journal_clean_checkpoint_list(journal);
398 spin_unlock(&journal->j_list_lock);
399
400 jbd_debug (3, "JBD: commit phase 1\n");
401
402 /*
403 * Switch to a new revoke table.
404 */
405 jbd2_journal_switch_revoke_table(journal);
406
407 commit_transaction->t_state = T_FLUSH;
408 journal->j_committing_transaction = commit_transaction;
409 journal->j_running_transaction = NULL;
410 commit_transaction->t_log_start = journal->j_head;
411 wake_up(&journal->j_wait_transaction_locked);
412 spin_unlock(&journal->j_state_lock);
413
414 jbd_debug (3, "JBD: commit phase 2\n");
415
416 /*
417 * First, drop the modified flag: all accesses to the buffers
418 * will be tracked for a new transaction only -bzzz
419 */
420 spin_lock(&journal->j_list_lock);
421 if (commit_transaction->t_buffers) {
422 new_jh = jh = commit_transaction->t_buffers->b_tnext;
423 do {
424 J_ASSERT_JH(new_jh, new_jh->b_modified == 1 ||
425 new_jh->b_modified == 0);
426 new_jh->b_modified = 0;
427 new_jh = new_jh->b_tnext;
428 } while (new_jh != jh);
429 }
430 spin_unlock(&journal->j_list_lock);
431
432 /*
433 * Now start flushing things to disk, in the order they appear
434 * on the transaction lists. Data blocks go first.
435 */
436 err = 0;
437 journal_submit_data_buffers(journal, commit_transaction);
438
439 /*
440 * Wait for all previously submitted IO to complete.
441 */
442 spin_lock(&journal->j_list_lock);
443 while (commit_transaction->t_locked_list) {
444 struct buffer_head *bh;
445
446 jh = commit_transaction->t_locked_list->b_tprev;
447 bh = jh2bh(jh);
448 get_bh(bh);
449 if (buffer_locked(bh)) {
450 spin_unlock(&journal->j_list_lock);
451 wait_on_buffer(bh);
452 if (unlikely(!buffer_uptodate(bh)))
453 err = -EIO;
454 spin_lock(&journal->j_list_lock);
455 }
456 if (!inverted_lock(journal, bh)) {
457 put_bh(bh);
458 spin_lock(&journal->j_list_lock);
459 continue;
460 }
461 if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
462 __jbd2_journal_unfile_buffer(jh);
463 jbd_unlock_bh_state(bh);
464 jbd2_journal_remove_journal_head(bh);
465 put_bh(bh);
466 } else {
467 jbd_unlock_bh_state(bh);
468 }
469 put_bh(bh);
470 cond_resched_lock(&journal->j_list_lock);
471 }
472 spin_unlock(&journal->j_list_lock);
473
474 if (err)
475 __jbd2_journal_abort_hard(journal);
476
477 jbd2_journal_write_revoke_records(journal, commit_transaction);
478
479 jbd_debug(3, "JBD: commit phase 2\n");
480
481 /*
482 * If we found any dirty or locked buffers, then we should have
483 * looped back up to the write_out_data label. If there weren't
484 * any then journal_clean_data_list should have wiped the list
485 * clean by now, so check that it is in fact empty.
486 */
487 J_ASSERT (commit_transaction->t_sync_datalist == NULL);
488
489 jbd_debug (3, "JBD: commit phase 3\n");
490
491 /*
492 * Way to go: we have now written out all of the data for a
493 * transaction! Now comes the tricky part: we need to write out
494 * metadata. Loop over the transaction's entire buffer list:
495 */
496 commit_transaction->t_state = T_COMMIT;
497
498 descriptor = NULL;
499 bufs = 0;
500 while (commit_transaction->t_buffers) {
501
502 /* Find the next buffer to be journaled... */
503
504 jh = commit_transaction->t_buffers;
505
506 /* If we're in abort mode, we just un-journal the buffer and
507 release it for background writing. */
508
509 if (is_journal_aborted(journal)) {
510 JBUFFER_TRACE(jh, "journal is aborting: refile");
511 jbd2_journal_refile_buffer(journal, jh);
512 /* If that was the last one, we need to clean up
513 * any descriptor buffers which may have been
514 * already allocated, even if we are now
515 * aborting. */
516 if (!commit_transaction->t_buffers)
517 goto start_journal_io;
518 continue;
519 }
520
521 /* Make sure we have a descriptor block in which to
522 record the metadata buffer. */
523
524 if (!descriptor) {
525 struct buffer_head *bh;
526
527 J_ASSERT (bufs == 0);
528
529 jbd_debug(4, "JBD: get descriptor\n");
530
531 descriptor = jbd2_journal_get_descriptor_buffer(journal);
532 if (!descriptor) {
533 __jbd2_journal_abort_hard(journal);
534 continue;
535 }
536
537 bh = jh2bh(descriptor);
538 jbd_debug(4, "JBD: got buffer %llu (%p)\n",
539 (unsigned long long)bh->b_blocknr, bh->b_data);
540 header = (journal_header_t *)&bh->b_data[0];
541 header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
542 header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK);
543 header->h_sequence = cpu_to_be32(commit_transaction->t_tid);
544
545 tagp = &bh->b_data[sizeof(journal_header_t)];
546 space_left = bh->b_size - sizeof(journal_header_t);
547 first_tag = 1;
548 set_buffer_jwrite(bh);
549 set_buffer_dirty(bh);
550 wbuf[bufs++] = bh;
551
552 /* Record it so that we can wait for IO
553 completion later */
554 BUFFER_TRACE(bh, "ph3: file as descriptor");
555 jbd2_journal_file_buffer(descriptor, commit_transaction,
556 BJ_LogCtl);
557 }
558
559 /* Where is the buffer to be written? */
560
561 err = jbd2_journal_next_log_block(journal, &blocknr);
562 /* If the block mapping failed, just abandon the buffer
563 and repeat this loop: we'll fall into the
564 refile-on-abort condition above. */
565 if (err) {
566 __jbd2_journal_abort_hard(journal);
567 continue;
568 }
569
570 /*
571 * start_this_handle() uses t_outstanding_credits to determine
572 * the free space in the log, but this counter is also
573 * changed by jbd2_journal_next_log_block().
574 */
575 commit_transaction->t_outstanding_credits--;
576
577 /* Bump b_count to prevent truncate from stumbling over
578 the shadowed buffer! @@@ This can go if we ever get
579 rid of the BJ_IO/BJ_Shadow pairing of buffers. */
580 atomic_inc(&jh2bh(jh)->b_count);
581
582 /* Make a temporary IO buffer with which to write it out
583 (this will requeue both the metadata buffer and the
584 temporary IO buffer). new_bh goes on BJ_IO */
585
586 set_bit(BH_JWrite, &jh2bh(jh)->b_state);
587 /*
588 * akpm: jbd2_journal_write_metadata_buffer() sets
589 * new_bh->b_transaction to commit_transaction.
590 * We need to clean this up before we release new_bh
591 * (which is of type BJ_IO)
592 */
593 JBUFFER_TRACE(jh, "ph3: write metadata");
594 flags = jbd2_journal_write_metadata_buffer(commit_transaction,
595 jh, &new_jh, blocknr);
596 set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
597 wbuf[bufs++] = jh2bh(new_jh);
598
599 /* Record the new block's tag in the current descriptor
600 buffer */
601
602 tag_flag = 0;
603 if (flags & 1)
604 tag_flag |= JBD2_FLAG_ESCAPE;
605 if (!first_tag)
606 tag_flag |= JBD2_FLAG_SAME_UUID;
607
608 tag = (journal_block_tag_t *) tagp;
609 write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr);
610 tag->t_flags = cpu_to_be32(tag_flag);
611 tagp += tag_bytes;
612 space_left -= tag_bytes;
613
614 if (first_tag) {
615 memcpy (tagp, journal->j_uuid, 16);
616 tagp += 16;
617 space_left -= 16;
618 first_tag = 0;
619 }
620
621 /* If there's no more to do, or if the descriptor is full,
622 let the IO rip! */
623
624 if (bufs == journal->j_wbufsize ||
625 commit_transaction->t_buffers == NULL ||
626 space_left < tag_bytes + 16) {
627
628 jbd_debug(4, "JBD: Submit %d IOs\n", bufs);
629
630 /* Write an end-of-descriptor marker before
631 submitting the IOs. "tag" still points to
632 the last tag we set up. */
633
634 tag->t_flags |= cpu_to_be32(JBD2_FLAG_LAST_TAG);
635
636start_journal_io:
637 for (i = 0; i < bufs; i++) {
638 struct buffer_head *bh = wbuf[i];
639 lock_buffer(bh);
640 clear_buffer_dirty(bh);
641 set_buffer_uptodate(bh);
642 bh->b_end_io = journal_end_buffer_io_sync;
643 submit_bh(WRITE, bh);
644 }
645 cond_resched();
646
647 /* Force a new descriptor to be generated next
648 time round the loop. */
649 descriptor = NULL;
650 bufs = 0;
651 }
652 }
653
654 /* Lo and behold: we have just managed to send a transaction to
655 the log. Before we can commit it, wait for the IO so far to
656 complete. Control buffers being written are on the
657 transaction's t_log_list queue, and metadata buffers are on
658 the t_iobuf_list queue.
659
660 Wait for the buffers in reverse order. That way we are
661 less likely to be woken up until all IOs have completed, and
662 so we incur less scheduling load.
663 */
664
665 jbd_debug(3, "JBD: commit phase 4\n");
666
667 /*
668 * akpm: these are BJ_IO, and j_list_lock is not needed.
669 * See __journal_try_to_free_buffer.
670 */
671wait_for_iobuf:
672 while (commit_transaction->t_iobuf_list != NULL) {
673 struct buffer_head *bh;
674
675 jh = commit_transaction->t_iobuf_list->b_tprev;
676 bh = jh2bh(jh);
677 if (buffer_locked(bh)) {
678 wait_on_buffer(bh);
679 goto wait_for_iobuf;
680 }
681 if (cond_resched())
682 goto wait_for_iobuf;
683
684 if (unlikely(!buffer_uptodate(bh)))
685 err = -EIO;
686
687 clear_buffer_jwrite(bh);
688
689 JBUFFER_TRACE(jh, "ph4: unfile after journal write");
690 jbd2_journal_unfile_buffer(journal, jh);
691
692 /*
693 * ->t_iobuf_list should contain only dummy buffer_heads
694 * which were created by jbd2_journal_write_metadata_buffer().
695 */
696 BUFFER_TRACE(bh, "dumping temporary bh");
697 jbd2_journal_put_journal_head(jh);
698 __brelse(bh);
699 J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
700 free_buffer_head(bh);
701
702 /* We also have to unlock and free the corresponding
703 shadowed buffer */
704 jh = commit_transaction->t_shadow_list->b_tprev;
705 bh = jh2bh(jh);
706 clear_bit(BH_JWrite, &bh->b_state);
707 J_ASSERT_BH(bh, buffer_jbddirty(bh));
708
709 /* The metadata is now released for reuse, but we need
710 to remember it against this transaction so that when
711 we finally commit, we can do any checkpointing
712 required. */
713 JBUFFER_TRACE(jh, "file as BJ_Forget");
714 jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
715 /* Wake up any transactions which were waiting for this
716 IO to complete */
717 wake_up_bit(&bh->b_state, BH_Unshadow);
718 JBUFFER_TRACE(jh, "brelse shadowed buffer");
719 __brelse(bh);
720 }
721
722 J_ASSERT (commit_transaction->t_shadow_list == NULL);
723
724 jbd_debug(3, "JBD: commit phase 5\n");
725
726 /* Here we wait for the revoke record and descriptor record buffers */
727 wait_for_ctlbuf:
728 while (commit_transaction->t_log_list != NULL) {
729 struct buffer_head *bh;
730
731 jh = commit_transaction->t_log_list->b_tprev;
732 bh = jh2bh(jh);
733 if (buffer_locked(bh)) {
734 wait_on_buffer(bh);
735 goto wait_for_ctlbuf;
736 }
737 if (cond_resched())
738 goto wait_for_ctlbuf;
739
740 if (unlikely(!buffer_uptodate(bh)))
741 err = -EIO;
742
743 BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
744 clear_buffer_jwrite(bh);
745 jbd2_journal_unfile_buffer(journal, jh);
746 jbd2_journal_put_journal_head(jh);
747 __brelse(bh); /* One for getblk */
748 /* AKPM: bforget here */
749 }
750
751 jbd_debug(3, "JBD: commit phase 6\n");
752
753 if (journal_write_commit_record(journal, commit_transaction))
754 err = -EIO;
755
756 if (err)
757 __jbd2_journal_abort_hard(journal);
758
759 /* End of a transaction! Finally, we can do checkpoint
760 processing: any buffers committed as a result of this
761 transaction can be removed from any checkpoint list it was on
762 before. */
763
764 jbd_debug(3, "JBD: commit phase 7\n");
765
766 J_ASSERT(commit_transaction->t_sync_datalist == NULL);
767 J_ASSERT(commit_transaction->t_buffers == NULL);
768 J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
769 J_ASSERT(commit_transaction->t_iobuf_list == NULL);
770 J_ASSERT(commit_transaction->t_shadow_list == NULL);
771 J_ASSERT(commit_transaction->t_log_list == NULL);
772
773restart_loop:
774 /*
775 * As there are other places (journal_unmap_buffer()) adding buffers
776 * to this list we have to be careful and hold the j_list_lock.
777 */
778 spin_lock(&journal->j_list_lock);
779 while (commit_transaction->t_forget) {
780 transaction_t *cp_transaction;
781 struct buffer_head *bh;
782
783 jh = commit_transaction->t_forget;
784 spin_unlock(&journal->j_list_lock);
785 bh = jh2bh(jh);
786 jbd_lock_bh_state(bh);
787 J_ASSERT_JH(jh, jh->b_transaction == commit_transaction ||
788 jh->b_transaction == journal->j_running_transaction);
789
790 /*
791 * If there is undo-protected committed data against
792 * this buffer, then we can remove it now. If it is a
793 * buffer needing such protection, the old frozen_data
794 * field now points to a committed version of the
795 * buffer, so rotate that field to the new committed
796 * data.
797 *
798 * Otherwise, we can just throw away the frozen data now.
799 */
800 if (jh->b_committed_data) {
801 jbd2_slab_free(jh->b_committed_data, bh->b_size);
802 jh->b_committed_data = NULL;
803 if (jh->b_frozen_data) {
804 jh->b_committed_data = jh->b_frozen_data;
805 jh->b_frozen_data = NULL;
806 }
807 } else if (jh->b_frozen_data) {
808 jbd2_slab_free(jh->b_frozen_data, bh->b_size);
809 jh->b_frozen_data = NULL;
810 }
811
812 spin_lock(&journal->j_list_lock);
813 cp_transaction = jh->b_cp_transaction;
814 if (cp_transaction) {
815 JBUFFER_TRACE(jh, "remove from old cp transaction");
816 __jbd2_journal_remove_checkpoint(jh);
817 }
818
819 /* Only re-checkpoint the buffer_head if it is marked
820 * dirty. If the buffer was added to the BJ_Forget list
821 * by jbd2_journal_forget, it may no longer be dirty and
822 * there's no point in keeping a checkpoint record for
823 * it. */
824
825 /* A buffer which has been freed while still being
826 * journaled by a previous transaction may end up still
827 * being dirty here, but we want to avoid writing back
828 * that buffer in the future now that the last use has
829 * been committed. That's not only a performance gain,
830 * it also stops aliasing problems if the buffer is left
831 * behind for writeback and gets reallocated for another
832 * use in a different page. */
833 if (buffer_freed(bh)) {
834 clear_buffer_freed(bh);
835 clear_buffer_jbddirty(bh);
836 }
837
838 if (buffer_jbddirty(bh)) {
839 JBUFFER_TRACE(jh, "add to new checkpointing trans");
840 __jbd2_journal_insert_checkpoint(jh, commit_transaction);
841 JBUFFER_TRACE(jh, "refile for checkpoint writeback");
842 __jbd2_journal_refile_buffer(jh);
843 jbd_unlock_bh_state(bh);
844 } else {
845 J_ASSERT_BH(bh, !buffer_dirty(bh));
846 /* The buffer on BJ_Forget list and not jbddirty means
847 * it has been freed by this transaction and hence it
848 * could not have been reallocated until this
849 * transaction has committed. *BUT* it could be
850 * reallocated once we have written all the data to
851 * disk and before we process the buffer on BJ_Forget
852 * list. */
853 JBUFFER_TRACE(jh, "refile or unfile freed buffer");
854 __jbd2_journal_refile_buffer(jh);
855 if (!jh->b_transaction) {
856 jbd_unlock_bh_state(bh);
857 /* needs a brelse */
858 jbd2_journal_remove_journal_head(bh);
859 release_buffer_page(bh);
860 } else
861 jbd_unlock_bh_state(bh);
862 }
863 cond_resched_lock(&journal->j_list_lock);
864 }
865 spin_unlock(&journal->j_list_lock);
866 /*
867 * This is a bit sleazy. We borrow j_list_lock to protect
868 * journal->j_committing_transaction in __jbd2_journal_remove_checkpoint.
869 * Really, __jbd2_journal_remove_checkpoint should be using j_state_lock, but
870 * it's a bit of a hassle to hold that across __jbd2_journal_remove_checkpoint
871 */
872 spin_lock(&journal->j_state_lock);
873 spin_lock(&journal->j_list_lock);
874 /*
875 * Now recheck if some buffers did not get attached to the transaction
876 * while the lock was dropped...
877 */
878 if (commit_transaction->t_forget) {
879 spin_unlock(&journal->j_list_lock);
880 spin_unlock(&journal->j_state_lock);
881 goto restart_loop;
882 }
883
884 /* Done with this transaction! */
885
886 jbd_debug(3, "JBD: commit phase 8\n");
887
888 J_ASSERT(commit_transaction->t_state == T_COMMIT);
889
890 commit_transaction->t_state = T_FINISHED;
891 J_ASSERT(commit_transaction == journal->j_committing_transaction);
892 journal->j_commit_sequence = commit_transaction->t_tid;
893 journal->j_committing_transaction = NULL;
894 spin_unlock(&journal->j_state_lock);
895
896 if (commit_transaction->t_checkpoint_list == NULL) {
897 __jbd2_journal_drop_transaction(journal, commit_transaction);
898 } else {
899 if (journal->j_checkpoint_transactions == NULL) {
900 journal->j_checkpoint_transactions = commit_transaction;
901 commit_transaction->t_cpnext = commit_transaction;
902 commit_transaction->t_cpprev = commit_transaction;
903 } else {
904 commit_transaction->t_cpnext =
905 journal->j_checkpoint_transactions;
906 commit_transaction->t_cpprev =
907 commit_transaction->t_cpnext->t_cpprev;
908 commit_transaction->t_cpnext->t_cpprev =
909 commit_transaction;
910 commit_transaction->t_cpprev->t_cpnext =
911 commit_transaction;
912 }
913 }
914 spin_unlock(&journal->j_list_lock);
915
916 jbd_debug(1, "JBD: commit %d complete, head %d\n",
917 journal->j_commit_sequence, journal->j_tail_sequence);
918
919 wake_up(&journal->j_wait_done_commit);
920}
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
new file mode 100644
index 000000000000..c60f378b0f76
--- /dev/null
+++ b/fs/jbd2/journal.c
@@ -0,0 +1,2084 @@
1/*
2 * linux/fs/jbd2/journal.c
3 *
4 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
5 *
6 * Copyright 1998 Red Hat corp --- All Rights Reserved
7 *
8 * This file is part of the Linux kernel and is made available under
9 * the terms of the GNU General Public License, version 2, or at your
10 * option, any later version, incorporated herein by reference.
11 *
12 * Generic filesystem journal-writing code; part of the ext2fs
13 * journaling system.
14 *
15 * This file manages journals: areas of disk reserved for logging
16 * transactional updates. This includes the kernel journaling thread
17 * which is responsible for scheduling updates to the log.
18 *
19 * We do not actually manage the physical storage of the journal in this
20 * file: that is left to a per-journal policy function, which allows us
21 * to store the journal within a filesystem-specified area for ext2
22 * journaling (ext2 can use a reserved inode for storing the log).
23 */
24
25#include <linux/module.h>
26#include <linux/time.h>
27#include <linux/fs.h>
28#include <linux/jbd2.h>
29#include <linux/errno.h>
30#include <linux/slab.h>
31#include <linux/smp_lock.h>
32#include <linux/init.h>
33#include <linux/mm.h>
34#include <linux/suspend.h>
35#include <linux/pagemap.h>
36#include <linux/kthread.h>
37#include <linux/poison.h>
38#include <linux/proc_fs.h>
39
40#include <asm/uaccess.h>
41#include <asm/page.h>
42
43EXPORT_SYMBOL(jbd2_journal_start);
44EXPORT_SYMBOL(jbd2_journal_restart);
45EXPORT_SYMBOL(jbd2_journal_extend);
46EXPORT_SYMBOL(jbd2_journal_stop);
47EXPORT_SYMBOL(jbd2_journal_lock_updates);
48EXPORT_SYMBOL(jbd2_journal_unlock_updates);
49EXPORT_SYMBOL(jbd2_journal_get_write_access);
50EXPORT_SYMBOL(jbd2_journal_get_create_access);
51EXPORT_SYMBOL(jbd2_journal_get_undo_access);
52EXPORT_SYMBOL(jbd2_journal_dirty_data);
53EXPORT_SYMBOL(jbd2_journal_dirty_metadata);
54EXPORT_SYMBOL(jbd2_journal_release_buffer);
55EXPORT_SYMBOL(jbd2_journal_forget);
56#if 0
57EXPORT_SYMBOL(journal_sync_buffer);
58#endif
59EXPORT_SYMBOL(jbd2_journal_flush);
60EXPORT_SYMBOL(jbd2_journal_revoke);
61
62EXPORT_SYMBOL(jbd2_journal_init_dev);
63EXPORT_SYMBOL(jbd2_journal_init_inode);
64EXPORT_SYMBOL(jbd2_journal_update_format);
65EXPORT_SYMBOL(jbd2_journal_check_used_features);
66EXPORT_SYMBOL(jbd2_journal_check_available_features);
67EXPORT_SYMBOL(jbd2_journal_set_features);
68EXPORT_SYMBOL(jbd2_journal_create);
69EXPORT_SYMBOL(jbd2_journal_load);
70EXPORT_SYMBOL(jbd2_journal_destroy);
71EXPORT_SYMBOL(jbd2_journal_update_superblock);
72EXPORT_SYMBOL(jbd2_journal_abort);
73EXPORT_SYMBOL(jbd2_journal_errno);
74EXPORT_SYMBOL(jbd2_journal_ack_err);
75EXPORT_SYMBOL(jbd2_journal_clear_err);
76EXPORT_SYMBOL(jbd2_log_wait_commit);
77EXPORT_SYMBOL(jbd2_journal_start_commit);
78EXPORT_SYMBOL(jbd2_journal_force_commit_nested);
79EXPORT_SYMBOL(jbd2_journal_wipe);
80EXPORT_SYMBOL(jbd2_journal_blocks_per_page);
81EXPORT_SYMBOL(jbd2_journal_invalidatepage);
82EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers);
83EXPORT_SYMBOL(jbd2_journal_force_commit);
84
85static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
86static void __journal_abort_soft (journal_t *journal, int errno);
87static int jbd2_journal_create_jbd_slab(size_t slab_size);
88
89/*
90 * Helper function used to manage commit timeouts
91 */
92
93static void commit_timeout(unsigned long __data)
94{
95 struct task_struct * p = (struct task_struct *) __data;
96
97 wake_up_process(p);
98}
99
100/*
101 * kjournald2: The main thread function used to manage a logging device
102 * journal.
103 *
104 * This kernel thread is responsible for two things:
105 *
106 * 1) COMMIT: Every so often we need to commit the current state of the
107 * filesystem to disk. The journal thread is responsible for writing
108 * all of the metadata buffers to disk.
109 *
110 * 2) CHECKPOINT: We cannot reuse a used section of the log file until all
111 * of the data in that part of the log has been rewritten elsewhere on
112 * the disk. Flushing these old buffers to reclaim space in the log is
113 * known as checkpointing, and this thread is responsible for that job.
114 */
115
116static int kjournald2(void *arg)
117{
118 journal_t *journal = arg;
119 transaction_t *transaction;
120
121 /*
122 * Set up an interval timer which can be used to trigger a commit wakeup
123 * after the commit interval expires
124 */
125 setup_timer(&journal->j_commit_timer, commit_timeout,
126 (unsigned long)current);
127
128 /* Record that the journal thread is running */
129 journal->j_task = current;
130 wake_up(&journal->j_wait_done_commit);
131
132 printk(KERN_INFO "kjournald2 starting. Commit interval %ld seconds\n",
133 journal->j_commit_interval / HZ);
134
135 /*
136 * And now, wait forever for commit wakeup events.
137 */
138 spin_lock(&journal->j_state_lock);
139
140loop:
141 if (journal->j_flags & JBD2_UNMOUNT)
142 goto end_loop;
143
144 jbd_debug(1, "commit_sequence=%d, commit_request=%d\n",
145 journal->j_commit_sequence, journal->j_commit_request);
146
147 if (journal->j_commit_sequence != journal->j_commit_request) {
148 jbd_debug(1, "OK, requests differ\n");
149 spin_unlock(&journal->j_state_lock);
150 del_timer_sync(&journal->j_commit_timer);
151 jbd2_journal_commit_transaction(journal);
152 spin_lock(&journal->j_state_lock);
153 goto loop;
154 }
155
156 wake_up(&journal->j_wait_done_commit);
157 if (freezing(current)) {
158 /*
159 * The simpler the better. Flushing the journal isn't a
160 * good idea, because it depends on threads that may
161 * already be stopped.
162 */
163 jbd_debug(1, "Now suspending kjournald2\n");
164 spin_unlock(&journal->j_state_lock);
165 refrigerator();
166 spin_lock(&journal->j_state_lock);
167 } else {
168 /*
169 * We assume on resume that commits are already there,
170 * so we don't sleep
171 */
172 DEFINE_WAIT(wait);
173 int should_sleep = 1;
174
175 prepare_to_wait(&journal->j_wait_commit, &wait,
176 TASK_INTERRUPTIBLE);
177 if (journal->j_commit_sequence != journal->j_commit_request)
178 should_sleep = 0;
179 transaction = journal->j_running_transaction;
180 if (transaction && time_after_eq(jiffies,
181 transaction->t_expires))
182 should_sleep = 0;
183 if (journal->j_flags & JBD2_UNMOUNT)
184 should_sleep = 0;
185 if (should_sleep) {
186 spin_unlock(&journal->j_state_lock);
187 schedule();
188 spin_lock(&journal->j_state_lock);
189 }
190 finish_wait(&journal->j_wait_commit, &wait);
191 }
192
193 jbd_debug(1, "kjournald2 wakes\n");
194
195 /*
196 * Were we woken up by a commit wakeup event?
197 */
198 transaction = journal->j_running_transaction;
199 if (transaction && time_after_eq(jiffies, transaction->t_expires)) {
200 journal->j_commit_request = transaction->t_tid;
201 jbd_debug(1, "woke because of timeout\n");
202 }
203 goto loop;
204
205end_loop:
206 spin_unlock(&journal->j_state_lock);
207 del_timer_sync(&journal->j_commit_timer);
208 journal->j_task = NULL;
209 wake_up(&journal->j_wait_done_commit);
210 jbd_debug(1, "Journal thread exiting.\n");
211 return 0;
212}
213
214static void jbd2_journal_start_thread(journal_t *journal)
215{
216 kthread_run(kjournald2, journal, "kjournald2");
217 wait_event(journal->j_wait_done_commit, journal->j_task != 0);
218}
219
220static void journal_kill_thread(journal_t *journal)
221{
222 spin_lock(&journal->j_state_lock);
223 journal->j_flags |= JBD2_UNMOUNT;
224
225 while (journal->j_task) {
226 wake_up(&journal->j_wait_commit);
227 spin_unlock(&journal->j_state_lock);
228 wait_event(journal->j_wait_done_commit, journal->j_task == 0);
229 spin_lock(&journal->j_state_lock);
230 }
231 spin_unlock(&journal->j_state_lock);
232}
233
234/*
235 * jbd2_journal_write_metadata_buffer: write a metadata buffer to the journal.
236 *
237 * Writes a metadata buffer to a given disk block. The actual IO is not
238 * performed but a new buffer_head is constructed which labels the data
239 * to be written with the correct destination disk block.
240 *
241 * Any magic-number escaping which needs to be done will cause a
242 * copy-out here. If the buffer happens to start with the
243 * JBD2_MAGIC_NUMBER, then we can't write it to the log directly: the
244 * magic number is only written to the log for descriptor blocks. In
245 * this case, we copy the data and replace the first word with 0, and we
246 * return a result code which indicates that this buffer needs to be
247 * marked as an escaped buffer in the corresponding log descriptor
248 * block. The missing word can then be restored when the block is read
249 * during recovery.
250 *
251 * If the source buffer has already been modified by a new transaction
252 * since we took the last commit snapshot, we use the frozen copy of
253 * that data for IO. If we end up using the existing buffer_head's data
254 * for the write, then we *have* to lock the buffer to prevent anyone
255 * else from using and possibly modifying it while the IO is in
256 * progress.
257 *
258 * The new buffer_head to be used for IO is returned via *jh_out.
259 *
260 * We assume that the journal has already been locked in this function.
261 *
262 * Return value:
263 * <0: Error
264 * >=0: Finished OK
265 *
266 * On success:
267 * Bit 0 set == escape performed on the data
268 * Bit 1 set == buffer copy-out performed (kfree the data after IO)
269 */
270
271int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
272 struct journal_head *jh_in,
273 struct journal_head **jh_out,
274 unsigned long long blocknr)
275{
276 int need_copy_out = 0;
277 int done_copy_out = 0;
278 int do_escape = 0;
279 char *mapped_data;
280 struct buffer_head *new_bh;
281 struct journal_head *new_jh;
282 struct page *new_page;
283 unsigned int new_offset;
284 struct buffer_head *bh_in = jh2bh(jh_in);
285
286 /*
287 * The buffer really shouldn't be locked: only the current committing
288 * transaction is allowed to write it, so nobody else is allowed
289 * to do any IO.
290 *
291 * akpm: except if we're journalling data, and write() output is
292 * also part of a shared mapping, and another thread has
293 * decided to launch a writepage() against this buffer.
294 */
295 J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in));
296
297 new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL);
298
299 /*
300 * If a new transaction has already done a buffer copy-out, then
301 * we use that version of the data for the commit.
302 */
303 jbd_lock_bh_state(bh_in);
304repeat:
305 if (jh_in->b_frozen_data) {
306 done_copy_out = 1;
307 new_page = virt_to_page(jh_in->b_frozen_data);
308 new_offset = offset_in_page(jh_in->b_frozen_data);
309 } else {
310 new_page = jh2bh(jh_in)->b_page;
311 new_offset = offset_in_page(jh2bh(jh_in)->b_data);
312 }
313
314 mapped_data = kmap_atomic(new_page, KM_USER0);
315 /*
316 * Check for escaping
317 */
318 if (*((__be32 *)(mapped_data + new_offset)) ==
319 cpu_to_be32(JBD2_MAGIC_NUMBER)) {
320 need_copy_out = 1;
321 do_escape = 1;
322 }
323 kunmap_atomic(mapped_data, KM_USER0);
324
325 /*
326 * Do we need to do a data copy?
327 */
328 if (need_copy_out && !done_copy_out) {
329 char *tmp;
330
331 jbd_unlock_bh_state(bh_in);
332 tmp = jbd2_slab_alloc(bh_in->b_size, GFP_NOFS);
333 jbd_lock_bh_state(bh_in);
334 if (jh_in->b_frozen_data) {
335 jbd2_slab_free(tmp, bh_in->b_size);
336 goto repeat;
337 }
338
339 jh_in->b_frozen_data = tmp;
340 mapped_data = kmap_atomic(new_page, KM_USER0);
341 memcpy(tmp, mapped_data + new_offset, jh2bh(jh_in)->b_size);
342 kunmap_atomic(mapped_data, KM_USER0);
343
344 new_page = virt_to_page(tmp);
345 new_offset = offset_in_page(tmp);
346 done_copy_out = 1;
347 }
348
349 /*
350 * Did we need to do an escaping? Now we've done all the
351 * copying, we can finally do so.
352 */
353 if (do_escape) {
354 mapped_data = kmap_atomic(new_page, KM_USER0);
355 *((unsigned int *)(mapped_data + new_offset)) = 0;
356 kunmap_atomic(mapped_data, KM_USER0);
357 }
358
359 /* keep subsequent assertions sane */
360 new_bh->b_state = 0;
361 init_buffer(new_bh, NULL, NULL);
362 atomic_set(&new_bh->b_count, 1);
363 jbd_unlock_bh_state(bh_in);
364
365 new_jh = jbd2_journal_add_journal_head(new_bh); /* This sleeps */
366
367 set_bh_page(new_bh, new_page, new_offset);
368 new_jh->b_transaction = NULL;
369 new_bh->b_size = jh2bh(jh_in)->b_size;
370 new_bh->b_bdev = transaction->t_journal->j_dev;
371 new_bh->b_blocknr = blocknr;
372 set_buffer_mapped(new_bh);
373 set_buffer_dirty(new_bh);
374
375 *jh_out = new_jh;
376
377 /*
378 * The to-be-written buffer needs to get moved to the io queue,
379 * and the original buffer whose contents we are shadowing or
380 * copying is moved to the transaction's shadow queue.
381 */
382 JBUFFER_TRACE(jh_in, "file as BJ_Shadow");
383 jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow);
384 JBUFFER_TRACE(new_jh, "file as BJ_IO");
385 jbd2_journal_file_buffer(new_jh, transaction, BJ_IO);
386
387 return do_escape | (done_copy_out << 1);
388}
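
A minimal userspace sketch of the escaping rule described in the comment
above: a data block that happens to begin with the journal magic number has
its first word zeroed before being logged, and an escape flag is recorded so
recovery can restore it. The copy-out machinery and on-disk byte-order
handling are omitted; the buffer contents are hypothetical.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define SKETCH_MAGIC 0xC03B3998u        /* value of JBD2_MAGIC_NUMBER */

/* Returns 1 (caller would set JBD2_FLAG_ESCAPE in the tag) if the
 * block had to be escaped, 0 otherwise. */
static int escape_block(uint32_t *block)
{
        if (block[0] == SKETCH_MAGIC) {
                block[0] = 0;           /* restored from the tag on replay */
                return 1;
        }
        return 0;
}

int main(void)
{
        uint32_t block[4] = { SKETCH_MAGIC, 1, 2, 3 };

        assert(escape_block(block) == 1 && block[0] == 0);
        assert(escape_block(block) == 0);       /* already escaped */
        printf("escape handling ok\n");
        return 0;
}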
389
390/*
391 * Allocation code for the journal file. Manage the space left in the
392 * journal, so that we can begin checkpointing when appropriate.
393 */
394
395/*
396 * __jbd2_log_space_left: Return the number of free blocks left in the journal.
397 *
398 * Called with the journal already locked.
399 *
400 * Called under j_state_lock
401 */
402
403int __jbd2_log_space_left(journal_t *journal)
404{
405 int left = journal->j_free;
406
407 assert_spin_locked(&journal->j_state_lock);
408
409 /*
410 * Be pessimistic here about the number of those free blocks which
411 * might be required for log descriptor control blocks.
412 */
413
414#define MIN_LOG_RESERVED_BLOCKS 32 /* Allow for rounding errors */
415
416 left -= MIN_LOG_RESERVED_BLOCKS;
417
418 if (left <= 0)
419 return 0;
420 left -= (left >> 3);
421 return left;
422}
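
A worked standalone example of the pessimistic estimate above: subtract the
fixed descriptor reserve, then report only 7/8 of whatever remains. The
free-block counts below are hypothetical.

#include <assert.h>
#include <stdio.h>

static int log_space_left(int free)
{
        int left = free - 32;           /* MIN_LOG_RESERVED_BLOCKS */

        if (left <= 0)
                return 0;
        left -= left >> 3;              /* keep 7/8 of the remainder */
        return left;
}

int main(void)
{
        assert(log_space_left(1056) == 896);    /* 1024 - 1024/8 */
        assert(log_space_left(20) == 0);        /* below the reserve */
        printf("space estimate ok\n");
        return 0;
}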
423
424/*
425 * Called under j_state_lock. Returns true if a transaction was started.
426 */
427int __jbd2_log_start_commit(journal_t *journal, tid_t target)
428{
429 /*
430 * Are we already doing a recent enough commit?
431 */
432 if (!tid_geq(journal->j_commit_request, target)) {
433 /*
434 * We want a new commit: OK, mark the request and wake up the
435 * commit thread. We do _not_ do the commit ourselves.
436 */
437
438 journal->j_commit_request = target;
439 jbd_debug(1, "JBD: requesting commit %d/%d\n",
440 journal->j_commit_request,
441 journal->j_commit_sequence);
442 wake_up(&journal->j_wait_commit);
443 return 1;
444 }
445 return 0;
446}
447
448int jbd2_log_start_commit(journal_t *journal, tid_t tid)
449{
450 int ret;
451
452 spin_lock(&journal->j_state_lock);
453 ret = __jbd2_log_start_commit(journal, tid);
454 spin_unlock(&journal->j_state_lock);
455 return ret;
456}
457
458/*
459 * Force and wait upon a commit if the calling process is not within
460 * transaction. This is used for forcing out undo-protected data which contains
461 * bitmaps, when the fs is running out of space.
462 *
463 * We can only force the running transaction if we don't have an active handle;
464 * otherwise, we will deadlock.
465 *
466 * Returns true if a transaction was started.
467 */
468int jbd2_journal_force_commit_nested(journal_t *journal)
469{
470 transaction_t *transaction = NULL;
471 tid_t tid;
472
473 spin_lock(&journal->j_state_lock);
474 if (journal->j_running_transaction && !current->journal_info) {
475 transaction = journal->j_running_transaction;
476 __jbd2_log_start_commit(journal, transaction->t_tid);
477 } else if (journal->j_committing_transaction)
478 transaction = journal->j_committing_transaction;
479
480 if (!transaction) {
481 spin_unlock(&journal->j_state_lock);
482 return 0; /* Nothing to retry */
483 }
484
485 tid = transaction->t_tid;
486 spin_unlock(&journal->j_state_lock);
487 jbd2_log_wait_commit(journal, tid);
488 return 1;
489}
490
491/*
492 * Start a commit of the current running transaction (if any). Returns true
493 * if a transaction was started, and fills its tid in at *ptid
494 */
495int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid)
496{
497 int ret = 0;
498
499 spin_lock(&journal->j_state_lock);
500 if (journal->j_running_transaction) {
501 tid_t tid = journal->j_running_transaction->t_tid;
502
503 ret = __jbd2_log_start_commit(journal, tid);
504 if (ret && ptid)
505 *ptid = tid;
506 } else if (journal->j_committing_transaction && ptid) {
507 /*
508 * If ext3_write_super() recently started a commit, then we
509 * have to wait for completion of that transaction
510 */
511 *ptid = journal->j_committing_transaction->t_tid;
512 ret = 1;
513 }
514 spin_unlock(&journal->j_state_lock);
515 return ret;
516}
517
518/*
519 * Wait for a specified commit to complete.
520 * The caller may not hold the journal lock.
521 */
522int jbd2_log_wait_commit(journal_t *journal, tid_t tid)
523{
524 int err = 0;
525
526#ifdef CONFIG_JBD_DEBUG
527 spin_lock(&journal->j_state_lock);
528 if (!tid_geq(journal->j_commit_request, tid)) {
529 printk(KERN_EMERG
530 "%s: error: j_commit_request=%d, tid=%d\n",
531 __FUNCTION__, journal->j_commit_request, tid);
532 }
533 spin_unlock(&journal->j_state_lock);
534#endif
535 spin_lock(&journal->j_state_lock);
536 while (tid_gt(tid, journal->j_commit_sequence)) {
537 jbd_debug(1, "JBD: want %d, j_commit_sequence=%d\n",
538 tid, journal->j_commit_sequence);
539 wake_up(&journal->j_wait_commit);
540 spin_unlock(&journal->j_state_lock);
541 wait_event(journal->j_wait_done_commit,
542 !tid_gt(tid, journal->j_commit_sequence));
543 spin_lock(&journal->j_state_lock);
544 }
545 spin_unlock(&journal->j_state_lock);
546
547 if (unlikely(is_journal_aborted(journal))) {
548 printk(KERN_EMERG "journal commit I/O error\n");
549 err = -EIO;
550 }
551 return err;
552}
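
The tid_gt()/tid_geq() comparisons used above treat transaction ids as 32-bit
sequence numbers that may wrap, comparing them via the sign of the difference.
A hedged standalone sketch of that comparison; the helper and typedef names
are invented here, the real helpers live in the jbd2 headers.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

typedef uint32_t sketch_tid_t;

static int sketch_tid_gt(sketch_tid_t x, sketch_tid_t y)
{
        int32_t diff = (int32_t)(x - y);        /* wrap-safe signed distance */
        return diff > 0;
}

int main(void)
{
        assert(sketch_tid_gt(5, 3));
        assert(!sketch_tid_gt(3, 5));
        /* 2 counts as "after" 0xFFFFFFFF once the counter wraps */
        assert(sketch_tid_gt(2, 0xFFFFFFFFu));
        printf("wrap-safe tid compare ok\n");
        return 0;
}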
553
554/*
555 * Log buffer allocation routines:
556 */
557
558int jbd2_journal_next_log_block(journal_t *journal, unsigned long long *retp)
559{
560 unsigned long blocknr;
561
562 spin_lock(&journal->j_state_lock);
563 J_ASSERT(journal->j_free > 1);
564
565 blocknr = journal->j_head;
566 journal->j_head++;
567 journal->j_free--;
568 if (journal->j_head == journal->j_last)
569 journal->j_head = journal->j_first;
570 spin_unlock(&journal->j_state_lock);
571 return jbd2_journal_bmap(journal, blocknr, retp);
572}
573
574/*
575 * Conversion of logical to physical block numbers for the journal
576 *
577 * On external journals the journal blocks are identity-mapped, so
578 * this is a no-op. If needed, we can use j_blk_offset - everything is
579 * ready.
580 */
581int jbd2_journal_bmap(journal_t *journal, unsigned long blocknr,
582 unsigned long long *retp)
583{
584 int err = 0;
585 unsigned long long ret;
586
587 if (journal->j_inode) {
588 ret = bmap(journal->j_inode, blocknr);
589 if (ret)
590 *retp = ret;
591 else {
592 char b[BDEVNAME_SIZE];
593
594 printk(KERN_ALERT "%s: journal block not found "
595 "at offset %lu on %s\n",
596 __FUNCTION__,
597 blocknr,
598 bdevname(journal->j_dev, b));
599 err = -EIO;
600 __journal_abort_soft(journal, err);
601 }
602 } else {
603 *retp = blocknr; /* +journal->j_blk_offset */
604 }
605 return err;
606}
607
608/*
609 * We play buffer_head aliasing tricks to write data/metadata blocks to
610 * the journal without copying their contents, but for journal
611 * descriptor blocks we do need to generate bona fide buffers.
612 *
613 * After the caller of jbd2_journal_get_descriptor_buffer() has finished modifying
614 * the buffer's contents they really should run flush_dcache_page(bh->b_page).
615 * But we don't bother doing that, so there will be coherency problems with
616 * mmaps of blockdevs which hold live JBD-controlled filesystems.
617 */
618struct journal_head *jbd2_journal_get_descriptor_buffer(journal_t *journal)
619{
620 struct buffer_head *bh;
621 unsigned long long blocknr;
622 int err;
623
624 err = jbd2_journal_next_log_block(journal, &blocknr);
625
626 if (err)
627 return NULL;
628
629 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
630 lock_buffer(bh);
631 memset(bh->b_data, 0, journal->j_blocksize);
632 set_buffer_uptodate(bh);
633 unlock_buffer(bh);
634 BUFFER_TRACE(bh, "return this buffer");
635 return jbd2_journal_add_journal_head(bh);
636}
637
638/*
639 * Management for journal control blocks: functions to create and
640 * destroy journal_t structures, and to initialise and read existing
641 * journal blocks from disk. */
642
643/* First: create and set up a journal_t object in memory. We initialise
644 * very few fields yet: that has to wait until we have created the
645 * journal structures from scratch, or loaded them from disk. */
646
647static journal_t * journal_init_common (void)
648{
649 journal_t *journal;
650 int err;
651
652 journal = jbd_kmalloc(sizeof(*journal), GFP_KERNEL);
653 if (!journal)
654 goto fail;
655 memset(journal, 0, sizeof(*journal));
656
657 init_waitqueue_head(&journal->j_wait_transaction_locked);
658 init_waitqueue_head(&journal->j_wait_logspace);
659 init_waitqueue_head(&journal->j_wait_done_commit);
660 init_waitqueue_head(&journal->j_wait_checkpoint);
661 init_waitqueue_head(&journal->j_wait_commit);
662 init_waitqueue_head(&journal->j_wait_updates);
663 mutex_init(&journal->j_barrier);
664 mutex_init(&journal->j_checkpoint_mutex);
665 spin_lock_init(&journal->j_revoke_lock);
666 spin_lock_init(&journal->j_list_lock);
667 spin_lock_init(&journal->j_state_lock);
668
669 journal->j_commit_interval = (HZ * JBD_DEFAULT_MAX_COMMIT_AGE);
670
671 /* The journal is marked for error until we succeed with recovery! */
672 journal->j_flags = JBD2_ABORT;
673
674 /* Set up a default-sized revoke table for the new mount. */
675 err = jbd2_journal_init_revoke(journal, JOURNAL_REVOKE_DEFAULT_HASH);
676 if (err) {
677 kfree(journal);
678 goto fail;
679 }
680 return journal;
681fail:
682 return NULL;
683}
684
685/* jbd2_journal_init_dev and jbd2_journal_init_inode:
686 *
687 * Create a journal structure and assign it some fixed set of disk
688 * blocks. We don't actually touch those disk blocks yet, but we
689 * need to set up all of the mapping information to tell the journaling
690 * system where the journal blocks are.
691 *
692 */
693
694/**
695 * journal_t * jbd2_journal_init_dev() - creates and initialises a journal structure
696 * @bdev: Block device on which to create the journal
697 * @fs_dev: Device which holds the journalled filesystem for this journal.
698 * @start: Block number of the start of the journal.
699 * @len: Length of the journal in blocks.
700 * @blocksize: blocksize of journalling device
701 * @returns: a newly created journal_t *
702 *
703 * jbd2_journal_init_dev creates a journal which maps a fixed contiguous
704 * range of blocks on an arbitrary block device.
705 *
706 */
707journal_t * jbd2_journal_init_dev(struct block_device *bdev,
708 struct block_device *fs_dev,
709 unsigned long long start, int len, int blocksize)
710{
711 journal_t *journal = journal_init_common();
712 struct buffer_head *bh;
713 int n;
714
715 if (!journal)
716 return NULL;
717
718 /* journal descriptor can store up to n blocks -bzzz */
719 journal->j_blocksize = blocksize;
720 n = journal->j_blocksize / sizeof(journal_block_tag_t);
721 journal->j_wbufsize = n;
722 journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL);
723 if (!journal->j_wbuf) {
724 printk(KERN_ERR "%s: Can't allocate bhs for commit thread\n",
725 __FUNCTION__);
726 kfree(journal);
727 journal = NULL;
728 goto out;
729 }
730 journal->j_dev = bdev;
731 journal->j_fs_dev = fs_dev;
732 journal->j_blk_offset = start;
733 journal->j_maxlen = len;
734
735 bh = __getblk(journal->j_dev, start, journal->j_blocksize);
736 J_ASSERT(bh != NULL);
737 journal->j_sb_buffer = bh;
738 journal->j_superblock = (journal_superblock_t *)bh->b_data;
739out:
740 return journal;
741}
742
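/*
 * Usage sketch (not compiled). The device pointers and geometry below
 * are illustrative assumptions, not requirements of the API: 8192
 * journal blocks of 4K each, starting at block 1 of a dedicated
 * journal device jdev, for a filesystem living on fs_bdev.
 */
#if 0
 journal_t *j;

 j = jbd2_journal_init_dev(jdev, fs_bdev, 1, 8192, 4096);
 if (!j)
 return -ENOMEM;
#endif
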
743/**
744 * journal_t * jbd2_journal_init_inode () - creates a journal which maps to an inode.
745 * @inode: An inode to create the journal in
746 *
747 * jbd2_journal_init_inode creates a journal which maps an on-disk inode as
748 * the journal. The inode must exist already, must support bmap() and
749 * must have all data blocks preallocated.
750 */
751journal_t * jbd2_journal_init_inode (struct inode *inode)
752{
753 struct buffer_head *bh;
754 journal_t *journal = journal_init_common();
755 int err;
756 int n;
757 unsigned long long blocknr;
758
759 if (!journal)
760 return NULL;
761
762 journal->j_dev = journal->j_fs_dev = inode->i_sb->s_bdev;
763 journal->j_inode = inode;
764 jbd_debug(1,
765 "journal %p: inode %s/%ld, size %Ld, bits %d, blksize %ld\n",
766 journal, inode->i_sb->s_id, inode->i_ino,
767 (long long) inode->i_size,
768 inode->i_sb->s_blocksize_bits, inode->i_sb->s_blocksize);
769
770 journal->j_maxlen = inode->i_size >> inode->i_sb->s_blocksize_bits;
771 journal->j_blocksize = inode->i_sb->s_blocksize;
772
773 /* journal descriptor can store up to n blocks -bzzz */
774 n = journal->j_blocksize / sizeof(journal_block_tag_t);
775 journal->j_wbufsize = n;
776 journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL);
777 if (!journal->j_wbuf) {
778 printk(KERN_ERR "%s: Can't allocate bhs for commit thread\n",
779 __FUNCTION__);
780 kfree(journal);
781 return NULL;
782 }
783
784 err = jbd2_journal_bmap(journal, 0, &blocknr);
785 /* If that failed, give up */
786 if (err) {
787 printk(KERN_ERR "%s: Cannot locate journal superblock\n",
788 __FUNCTION__);
789 kfree(journal);
790 return NULL;
791 }
792
793 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
794 J_ASSERT(bh != NULL);
795 journal->j_sb_buffer = bh;
796 journal->j_superblock = (journal_superblock_t *)bh->b_data;
797
798 return journal;
799}
800
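/*
 * Usage sketch (not compiled): a filesystem such as ext4 passes in its
 * reserved journal inode. The journal_inode variable is assumed to be
 * a fully read, preallocated inode that supports bmap().
 */
#if 0
 journal_t *j = jbd2_journal_init_inode(journal_inode);

 if (!j)
 return -EINVAL;
#endif
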
801/*
802 * If the journal init or create aborts, we need to mark the journal
803 * superblock as being NULL to prevent the journal destroy from writing
804 * back a bogus superblock.
805 */
806static void journal_fail_superblock (journal_t *journal)
807{
808 struct buffer_head *bh = journal->j_sb_buffer;
809 brelse(bh);
810 journal->j_sb_buffer = NULL;
811}
812
813/*
814 * Given a journal_t structure, initialise the various fields for
815 * startup of a new journaling session. We use this both when creating
816 * a journal, and after recovering an old journal to reset it for
817 * subsequent use.
818 */
819
820static int journal_reset(journal_t *journal)
821{
822 journal_superblock_t *sb = journal->j_superblock;
823 unsigned long long first, last;
824
825 first = be32_to_cpu(sb->s_first);
826 last = be32_to_cpu(sb->s_maxlen);
827
828 journal->j_first = first;
829 journal->j_last = last;
830
831 journal->j_head = first;
832 journal->j_tail = first;
833 journal->j_free = last - first;
834
835 journal->j_tail_sequence = journal->j_transaction_sequence;
836 journal->j_commit_sequence = journal->j_transaction_sequence - 1;
837 journal->j_commit_request = journal->j_commit_sequence;
838
839 journal->j_max_transaction_buffers = journal->j_maxlen / 4;
840
841 /* Add the dynamic fields and write it to disk. */
842 jbd2_journal_update_superblock(journal, 1);
843 jbd2_journal_start_thread(journal);
844 return 0;
845}
846
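/*
 * Worked example of the reset arithmetic above (values assumed): with
 * s_first == 1 and s_maxlen == 8192, the log occupies blocks 1..8191,
 * j_free starts at 8191, and j_max_transaction_buffers becomes
 * 8192 / 4 == 2048 buffers per transaction.
 */
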
847/**
848 * int jbd2_journal_create() - Initialise the new journal file
849 * @journal: Journal to create. This structure must have been initialised
850 *
851 * Given a journal_t structure which tells us which disk blocks we can
852 * use, create a new journal superblock and initialise all of the
853 * journal fields from scratch.
854 **/
855int jbd2_journal_create(journal_t *journal)
856{
857 unsigned long long blocknr;
858 struct buffer_head *bh;
859 journal_superblock_t *sb;
860 int i, err;
861
862 if (journal->j_maxlen < JBD2_MIN_JOURNAL_BLOCKS) {
863 printk (KERN_ERR "Journal length (%d blocks) too short.\n",
864 journal->j_maxlen);
865 journal_fail_superblock(journal);
866 return -EINVAL;
867 }
868
869 if (journal->j_inode == NULL) {
870 /*
871 * We don't know what block to start at!
872 */
873 printk(KERN_EMERG
874 "%s: creation of journal on external device!\n",
875 __FUNCTION__);
876 BUG();
877 }
878
879 /* Zero out the entire journal on disk. We cannot afford to
880 have any blocks on disk beginning with JBD2_MAGIC_NUMBER. */
881 jbd_debug(1, "JBD: Zeroing out journal blocks...\n");
882 for (i = 0; i < journal->j_maxlen; i++) {
883 err = jbd2_journal_bmap(journal, i, &blocknr);
884 if (err)
885 return err;
886 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
887 lock_buffer(bh);
888 memset (bh->b_data, 0, journal->j_blocksize);
889 BUFFER_TRACE(bh, "marking dirty");
890 mark_buffer_dirty(bh);
891 BUFFER_TRACE(bh, "marking uptodate");
892 set_buffer_uptodate(bh);
893 unlock_buffer(bh);
894 __brelse(bh);
895 }
896
897 sync_blockdev(journal->j_dev);
898 jbd_debug(1, "JBD: journal cleared.\n");
899
900 /* OK, fill in the initial static fields in the new superblock */
901 sb = journal->j_superblock;
902
903 sb->s_header.h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
904 sb->s_header.h_blocktype = cpu_to_be32(JBD2_SUPERBLOCK_V2);
905
906 sb->s_blocksize = cpu_to_be32(journal->j_blocksize);
907 sb->s_maxlen = cpu_to_be32(journal->j_maxlen);
908 sb->s_first = cpu_to_be32(1);
909
910 journal->j_transaction_sequence = 1;
911
912 journal->j_flags &= ~JBD2_ABORT;
913 journal->j_format_version = 2;
914
915 return journal_reset(journal);
916}
917
918/**
919 * void jbd2_journal_update_superblock() - Update journal sb on disk.
920 * @journal: The journal to update.
921 * @wait: Set to '0' if you don't want to wait for IO completion.
922 *
923 * Update a journal's dynamic superblock fields and write it to disk,
924 * optionally waiting for the IO to complete.
925 */
926void jbd2_journal_update_superblock(journal_t *journal, int wait)
927{
928 journal_superblock_t *sb = journal->j_superblock;
929 struct buffer_head *bh = journal->j_sb_buffer;
930
931 /*
932 * As a special case, if the on-disk copy is already marked as needing
933 * no recovery (s_start == 0) and there are no outstanding transactions
934 * in the filesystem, then we can safely defer the superblock update
935 * until the next commit by setting JBD2_FLUSHED. This avoids
936 * attempting a write to a potentially read-only device.
937 */
938 if (sb->s_start == 0 && journal->j_tail_sequence ==
939 journal->j_transaction_sequence) {
940 jbd_debug(1,"JBD: Skipping superblock update on recovered sb "
941 "(start %ld, seq %d, errno %d)\n",
942 journal->j_tail, journal->j_tail_sequence,
943 journal->j_errno);
944 goto out;
945 }
946
947 spin_lock(&journal->j_state_lock);
948 jbd_debug(1,"JBD: updating superblock (start %ld, seq %d, errno %d)\n",
949 journal->j_tail, journal->j_tail_sequence, journal->j_errno);
950
951 sb->s_sequence = cpu_to_be32(journal->j_tail_sequence);
952 sb->s_start = cpu_to_be32(journal->j_tail);
953 sb->s_errno = cpu_to_be32(journal->j_errno);
954 spin_unlock(&journal->j_state_lock);
955
956 BUFFER_TRACE(bh, "marking dirty");
957 mark_buffer_dirty(bh);
958 if (wait)
959 sync_dirty_buffer(bh);
960 else
961 ll_rw_block(SWRITE, 1, &bh);
962
963out:
964 /* If we have just flushed the log (by marking s_start==0), then
965 * any future commit will have to be careful to update the
966 * superblock again to re-record the true start of the log. */
967
968 spin_lock(&journal->j_state_lock);
969 if (sb->s_start)
970 journal->j_flags &= ~JBD2_FLUSHED;
971 else
972 journal->j_flags |= JBD2_FLUSHED;
973 spin_unlock(&journal->j_state_lock);
974}
975
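/*
 * Usage sketch (not compiled): the wait flag simply selects between a
 * synchronous and an asynchronous superblock write.
 */
#if 0
 jbd2_journal_update_superblock(journal, 1); /* write and wait */
 jbd2_journal_update_superblock(journal, 0); /* write, don't wait */
#endif
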
976/*
977 * Read the superblock for a given journal, performing initial
978 * validation of the format.
979 */
980
981static int journal_get_superblock(journal_t *journal)
982{
983 struct buffer_head *bh;
984 journal_superblock_t *sb;
985 int err = -EIO;
986
987 bh = journal->j_sb_buffer;
988
989 J_ASSERT(bh != NULL);
990 if (!buffer_uptodate(bh)) {
991 ll_rw_block(READ, 1, &bh);
992 wait_on_buffer(bh);
993 if (!buffer_uptodate(bh)) {
994 printk (KERN_ERR
995 "JBD: IO error reading journal superblock\n");
996 goto out;
997 }
998 }
999
1000 sb = journal->j_superblock;
1001
1002 err = -EINVAL;
1003
1004 if (sb->s_header.h_magic != cpu_to_be32(JBD2_MAGIC_NUMBER) ||
1005 sb->s_blocksize != cpu_to_be32(journal->j_blocksize)) {
1006 printk(KERN_WARNING "JBD: no valid journal superblock found\n");
1007 goto out;
1008 }
1009
1010 switch(be32_to_cpu(sb->s_header.h_blocktype)) {
1011 case JBD2_SUPERBLOCK_V1:
1012 journal->j_format_version = 1;
1013 break;
1014 case JBD2_SUPERBLOCK_V2:
1015 journal->j_format_version = 2;
1016 break;
1017 default:
1018 printk(KERN_WARNING "JBD: unrecognised superblock format ID\n");
1019 goto out;
1020 }
1021
1022 if (be32_to_cpu(sb->s_maxlen) < journal->j_maxlen)
1023 journal->j_maxlen = be32_to_cpu(sb->s_maxlen);
1024 else if (be32_to_cpu(sb->s_maxlen) > journal->j_maxlen) {
1025 printk (KERN_WARNING "JBD: journal file too short\n");
1026 goto out;
1027 }
1028
1029 return 0;
1030
1031out:
1032 journal_fail_superblock(journal);
1033 return err;
1034}
1035
1036/*
1037 * Load the on-disk journal superblock and read the key fields into the
1038 * journal_t.
1039 */
1040
1041static int load_superblock(journal_t *journal)
1042{
1043 int err;
1044 journal_superblock_t *sb;
1045
1046 err = journal_get_superblock(journal);
1047 if (err)
1048 return err;
1049
1050 sb = journal->j_superblock;
1051
1052 journal->j_tail_sequence = be32_to_cpu(sb->s_sequence);
1053 journal->j_tail = be32_to_cpu(sb->s_start);
1054 journal->j_first = be32_to_cpu(sb->s_first);
1055 journal->j_last = be32_to_cpu(sb->s_maxlen);
1056 journal->j_errno = be32_to_cpu(sb->s_errno);
1057
1058 return 0;
1059}
1060
1061
1062/**
1063 * int jbd2_journal_load() - Read journal from disk.
1064 * @journal: Journal to act on.
1065 *
1066 * Given a journal_t structure which tells us which disk blocks contain
1067 * a journal, read the journal from disk to initialise the in-memory
1068 * structures.
1069 */
1070int jbd2_journal_load(journal_t *journal)
1071{
1072 int err;
1073 journal_superblock_t *sb;
1074
1075 err = load_superblock(journal);
1076 if (err)
1077 return err;
1078
1079 sb = journal->j_superblock;
1080 /* If this is a V2 superblock, then we have to check the
1081 * features flags on it. */
1082
1083 if (journal->j_format_version >= 2) {
1084 if ((sb->s_feature_ro_compat &
1085 ~cpu_to_be32(JBD2_KNOWN_ROCOMPAT_FEATURES)) ||
1086 (sb->s_feature_incompat &
1087 ~cpu_to_be32(JBD2_KNOWN_INCOMPAT_FEATURES))) {
1088 printk (KERN_WARNING
1089 "JBD: Unrecognised features on journal\n");
1090 return -EINVAL;
1091 }
1092 }
1093
1094 /*
1095 * Create a slab for this blocksize
1096 */
1097 err = jbd2_journal_create_jbd_slab(be32_to_cpu(sb->s_blocksize));
1098 if (err)
1099 return err;
1100
1101 /* Let the recovery code check whether it needs to recover any
1102 * data from the journal. */
1103 if (jbd2_journal_recover(journal))
1104 goto recovery_error;
1105
1106 /* OK, we've finished with the dynamic journal bits:
1107 * reinitialise the dynamic contents of the superblock in memory
1108 * and reset them on disk. */
1109 if (journal_reset(journal))
1110 goto recovery_error;
1111
1112 journal->j_flags &= ~JBD2_ABORT;
1113 journal->j_flags |= JBD2_LOADED;
1114 return 0;
1115
1116recovery_error:
1117 printk (KERN_WARNING "JBD: recovery failed\n");
1118 return -EIO;
1119}
1120
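/*
 * Mount-time sketch (not compiled): the usual sequence is to build the
 * journal_t and then load it, which replays the log if recovery is
 * needed. The journal_inode variable is assumed; error handling is
 * trimmed to the essentials.
 */
#if 0
 journal_t *j = jbd2_journal_init_inode(journal_inode);

 if (!j)
 return -EINVAL;
 if (jbd2_journal_load(j)) {
 jbd2_journal_destroy(j);
 return -EIO;
 }
#endif
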
1121/**
1122 * void jbd2_journal_destroy() - Release a journal_t structure.
1123 * @journal: Journal to act on.
1124 *
1125 * Release a journal_t structure once it is no longer in use by the
1126 * journaled object.
1127 */
1128void jbd2_journal_destroy(journal_t *journal)
1129{
1130 /* Wait for the commit thread to wake up and die. */
1131 journal_kill_thread(journal);
1132
1133 /* Force a final log commit */
1134 if (journal->j_running_transaction)
1135 jbd2_journal_commit_transaction(journal);
1136
1137 /* Force any old transactions to disk */
1138
1139 /* Totally anal locking here... */
1140 spin_lock(&journal->j_list_lock);
1141 while (journal->j_checkpoint_transactions != NULL) {
1142 spin_unlock(&journal->j_list_lock);
1143 jbd2_log_do_checkpoint(journal);
1144 spin_lock(&journal->j_list_lock);
1145 }
1146
1147 J_ASSERT(journal->j_running_transaction == NULL);
1148 J_ASSERT(journal->j_committing_transaction == NULL);
1149 J_ASSERT(journal->j_checkpoint_transactions == NULL);
1150 spin_unlock(&journal->j_list_lock);
1151
1152 /* We can now mark the journal as empty. */
1153 journal->j_tail = 0;
1154 journal->j_tail_sequence = ++journal->j_transaction_sequence;
1155 if (journal->j_sb_buffer) {
1156 jbd2_journal_update_superblock(journal, 1);
1157 brelse(journal->j_sb_buffer);
1158 }
1159
1160 if (journal->j_inode)
1161 iput(journal->j_inode);
1162 if (journal->j_revoke)
1163 jbd2_journal_destroy_revoke(journal);
1164 kfree(journal->j_wbuf);
1165 kfree(journal);
1166}
1167
1168
1169/**
1170 * int jbd2_journal_check_used_features () - Check if features specified are used.
1171 * @journal: Journal to check.
1172 * @compat: bitmask of compatible features
1173 * @ro: bitmask of features that force read-only mount
1174 * @incompat: bitmask of incompatible features
1175 *
1176 * Check whether the journal uses all of a given set of
1177 * features. Return true (non-zero) if it does.
1178 **/
1179
1180int jbd2_journal_check_used_features (journal_t *journal, unsigned long compat,
1181 unsigned long ro, unsigned long incompat)
1182{
1183 journal_superblock_t *sb;
1184
1185 if (!compat && !ro && !incompat)
1186 return 1;
1187 if (journal->j_format_version == 1)
1188 return 0;
1189
1190 sb = journal->j_superblock;
1191
1192 if (((be32_to_cpu(sb->s_feature_compat) & compat) == compat) &&
1193 ((be32_to_cpu(sb->s_feature_ro_compat) & ro) == ro) &&
1194 ((be32_to_cpu(sb->s_feature_incompat) & incompat) == incompat))
1195 return 1;
1196
1197 return 0;
1198}
1199
1200/**
1201 * int jbd2_journal_check_available_features() - Check feature set in journalling layer
1202 * @journal: Journal to check.
1203 * @compat: bitmask of compatible features
1204 * @ro: bitmask of features that force read-only mount
1205 * @incompat: bitmask of incompatible features
1206 *
1207 * Check whether the journaling code supports the use of
1208 * all of a given set of features on this journal. Return true
1209 * (non-zero) if it can. */
1210
1211int jbd2_journal_check_available_features (journal_t *journal, unsigned long compat,
1212 unsigned long ro, unsigned long incompat)
1213{
1214 journal_superblock_t *sb;
1215
1216 if (!compat && !ro && !incompat)
1217 return 1;
1218
1219 sb = journal->j_superblock;
1220
1221 /* We can support any known requested features iff the
1222 * superblock is in version 2. Otherwise we fail to support any
1223 * extended sb features. */
1224
1225 if (journal->j_format_version != 2)
1226 return 0;
1227
1228 if ((compat & JBD2_KNOWN_COMPAT_FEATURES) == compat &&
1229 (ro & JBD2_KNOWN_ROCOMPAT_FEATURES) == ro &&
1230 (incompat & JBD2_KNOWN_INCOMPAT_FEATURES) == incompat)
1231 return 1;
1232
1233 return 0;
1234}
1235
1236/**
1237 * int jbd2_journal_set_features () - Mark a given journal feature in the superblock
1238 * @journal: Journal to act on.
1239 * @compat: bitmask of compatible features
1240 * @ro: bitmask of features that force read-only mount
1241 * @incompat: bitmask of incompatible features
1242 *
1243 * Mark a given journal feature as present on the
1244 * superblock. Returns true if the requested features could be set.
1245 *
1246 */
1247
1248int jbd2_journal_set_features (journal_t *journal, unsigned long compat,
1249 unsigned long ro, unsigned long incompat)
1250{
1251 journal_superblock_t *sb;
1252
1253 if (jbd2_journal_check_used_features(journal, compat, ro, incompat))
1254 return 1;
1255
1256 if (!jbd2_journal_check_available_features(journal, compat, ro, incompat))
1257 return 0;
1258
1259 jbd_debug(1, "Setting new features 0x%lx/0x%lx/0x%lx\n",
1260 compat, ro, incompat);
1261
1262 sb = journal->j_superblock;
1263
1264 sb->s_feature_compat |= cpu_to_be32(compat);
1265 sb->s_feature_ro_compat |= cpu_to_be32(ro);
1266 sb->s_feature_incompat |= cpu_to_be32(incompat);
1267
1268 return 1;
1269}
1270
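/*
 * Usage sketch (not compiled): e.g. switching the journal to 64-bit
 * block numbers. A zero return means the journalling layer does not
 * support the requested bits.
 */
#if 0
 if (!jbd2_journal_set_features(journal, 0, 0,
 JBD2_FEATURE_INCOMPAT_64BIT))
 printk(KERN_ERR "cannot enable 64-bit journal feature\n");
#endif
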
1271
1272/**
1273 * int jbd2_journal_update_format () - Update on-disk journal structure.
1274 * @journal: Journal to act on.
1275 *
1276 * Given an initialised but unloaded journal struct, poke about in the
1277 * on-disk structure to update it to the most recent supported version.
1278 */
1279int jbd2_journal_update_format (journal_t *journal)
1280{
1281 journal_superblock_t *sb;
1282 int err;
1283
1284 err = journal_get_superblock(journal);
1285 if (err)
1286 return err;
1287
1288 sb = journal->j_superblock;
1289
1290 switch (be32_to_cpu(sb->s_header.h_blocktype)) {
1291 case JBD2_SUPERBLOCK_V2:
1292 return 0;
1293 case JBD2_SUPERBLOCK_V1:
1294 return journal_convert_superblock_v1(journal, sb);
1295 default:
1296 break;
1297 }
1298 return -EINVAL;
1299}
1300
1301static int journal_convert_superblock_v1(journal_t *journal,
1302 journal_superblock_t *sb)
1303{
1304 int offset, blocksize;
1305 struct buffer_head *bh;
1306
1307 printk(KERN_WARNING
1308 "JBD: Converting superblock from version 1 to 2.\n");
1309
1310 /* Pre-initialise new fields to zero */
1311 offset = ((char *) &(sb->s_feature_compat)) - ((char *) sb);
1312 blocksize = be32_to_cpu(sb->s_blocksize);
1313 memset(&sb->s_feature_compat, 0, blocksize-offset);
1314
1315 sb->s_nr_users = cpu_to_be32(1);
1316 sb->s_header.h_blocktype = cpu_to_be32(JBD2_SUPERBLOCK_V2);
1317 journal->j_format_version = 2;
1318
1319 bh = journal->j_sb_buffer;
1320 BUFFER_TRACE(bh, "marking dirty");
1321 mark_buffer_dirty(bh);
1322 sync_dirty_buffer(bh);
1323 return 0;
1324}
1325
1326
1327/**
1328 * int jbd2_journal_flush () - Flush journal
1329 * @journal: Journal to act on.
1330 *
1331 * Flush all data for a given journal to disk and empty the journal.
1332 * Filesystems can use this when remounting readonly to ensure that
1333 * recovery does not need to happen on remount.
1334 */
1335
1336int jbd2_journal_flush(journal_t *journal)
1337{
1338 int err = 0;
1339 transaction_t *transaction = NULL;
1340 unsigned long old_tail;
1341
1342 spin_lock(&journal->j_state_lock);
1343
1344 /* Force everything buffered to the log... */
1345 if (journal->j_running_transaction) {
1346 transaction = journal->j_running_transaction;
1347 __jbd2_log_start_commit(journal, transaction->t_tid);
1348 } else if (journal->j_committing_transaction)
1349 transaction = journal->j_committing_transaction;
1350
1351 /* Wait for the log commit to complete... */
1352 if (transaction) {
1353 tid_t tid = transaction->t_tid;
1354
1355 spin_unlock(&journal->j_state_lock);
1356 jbd2_log_wait_commit(journal, tid);
1357 } else {
1358 spin_unlock(&journal->j_state_lock);
1359 }
1360
1361 /* ...and flush everything in the log out to disk. */
1362 spin_lock(&journal->j_list_lock);
1363 while (!err && journal->j_checkpoint_transactions != NULL) {
1364 spin_unlock(&journal->j_list_lock);
1365 err = jbd2_log_do_checkpoint(journal);
1366 spin_lock(&journal->j_list_lock);
1367 }
1368 spin_unlock(&journal->j_list_lock);
1369 jbd2_cleanup_journal_tail(journal);
1370
1371 /* Finally, mark the journal as really needing no recovery.
1372 * This sets s_start==0 in the underlying superblock, which is
1373 * the magic code for a fully-recovered superblock. Any future
1374 * commits of data to the journal will restore the current
1375 * s_start value. */
1376 spin_lock(&journal->j_state_lock);
1377 old_tail = journal->j_tail;
1378 journal->j_tail = 0;
1379 spin_unlock(&journal->j_state_lock);
1380 jbd2_journal_update_superblock(journal, 1);
1381 spin_lock(&journal->j_state_lock);
1382 journal->j_tail = old_tail;
1383
1384 J_ASSERT(!journal->j_running_transaction);
1385 J_ASSERT(!journal->j_committing_transaction);
1386 J_ASSERT(!journal->j_checkpoint_transactions);
1387 J_ASSERT(journal->j_head == journal->j_tail);
1388 J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence);
1389 spin_unlock(&journal->j_state_lock);
1390 return err;
1391}
1392
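/*
 * Usage sketch (not compiled): a filesystem remounting read-only can
 * empty the log so that no recovery is needed if the machine dies
 * while it is read-only.
 */
#if 0
 int err = jbd2_journal_flush(journal);

 if (err < 0)
 printk(KERN_ERR "error %d flushing journal\n", err);
#endif
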
1393/**
1394 * int jbd2_journal_wipe() - Wipe journal contents
1395 * @journal: Journal to act on.
1396 * @write: flag (see below)
1397 *
1398 * Wipe out all of the contents of a journal, safely. This will produce
1399 * a warning if the journal contains any valid recovery information.
1400 * Must be called between journal_init_*() and jbd2_journal_load().
1401 *
1402 * If 'write' is non-zero, then we wipe out the journal on disk; otherwise
1403 * we merely suppress recovery.
1404 */
1405
1406int jbd2_journal_wipe(journal_t *journal, int write)
1407{
1408 journal_superblock_t *sb;
1409 int err = 0;
1410
1411 J_ASSERT (!(journal->j_flags & JBD2_LOADED));
1412
1413 err = load_superblock(journal);
1414 if (err)
1415 return err;
1416
1417 sb = journal->j_superblock;
1418
1419 if (!journal->j_tail)
1420 goto no_recovery;
1421
1422 printk (KERN_WARNING "JBD: %s recovery information on journal\n",
1423 write ? "Clearing" : "Ignoring");
1424
1425 err = jbd2_journal_skip_recovery(journal);
1426 if (write)
1427 jbd2_journal_update_superblock(journal, 1);
1428
1429 no_recovery:
1430 return err;
1431}
1432
1433/*
1434 * journal_dev_name: format a character string to describe on what
1435 * device this journal is present.
1436 */
1437
1438static const char *journal_dev_name(journal_t *journal, char *buffer)
1439{
1440 struct block_device *bdev;
1441
1442 if (journal->j_inode)
1443 bdev = journal->j_inode->i_sb->s_bdev;
1444 else
1445 bdev = journal->j_dev;
1446
1447 return bdevname(bdev, buffer);
1448}
1449
1450/*
1451 * Journal abort has very specific semantics; the full description
1452 * lives with jbd2_journal_abort() below.
1453 *
1454 * Two internal functions, which provide abort to the jbd layer
1455 * itself, are here.
1456 */
1457
1458/*
1459 * Quick version for internal journal use (doesn't lock the journal).
1460 * Aborts hard --- we mark the abort as occurred, but do _nothing_ else,
1461 * and don't attempt to make any other journal updates.
1462 */
1463void __jbd2_journal_abort_hard(journal_t *journal)
1464{
1465 transaction_t *transaction;
1466 char b[BDEVNAME_SIZE];
1467
1468 if (journal->j_flags & JBD2_ABORT)
1469 return;
1470
1471 printk(KERN_ERR "Aborting journal on device %s.\n",
1472 journal_dev_name(journal, b));
1473
1474 spin_lock(&journal->j_state_lock);
1475 journal->j_flags |= JBD2_ABORT;
1476 transaction = journal->j_running_transaction;
1477 if (transaction)
1478 __jbd2_log_start_commit(journal, transaction->t_tid);
1479 spin_unlock(&journal->j_state_lock);
1480}
1481
1482/* Soft abort: record the abort error status in the journal superblock,
1483 * but don't do any other IO. */
1484static void __journal_abort_soft (journal_t *journal, int errno)
1485{
1486 if (journal->j_flags & JBD2_ABORT)
1487 return;
1488
1489 if (!journal->j_errno)
1490 journal->j_errno = errno;
1491
1492 __jbd2_journal_abort_hard(journal);
1493
1494 if (errno)
1495 jbd2_journal_update_superblock(journal, 1);
1496}
1497
1498/**
1499 * void jbd2_journal_abort () - Shutdown the journal immediately.
1500 * @journal: the journal to shutdown.
1501 * @errno: an error number to record in the journal indicating
1502 * the reason for the shutdown.
1503 *
1504 * Perform a complete, immediate shutdown of the ENTIRE
1505 * journal (not of a single transaction). This operation cannot be
1506 * undone without closing and reopening the journal.
1507 *
1508 * The jbd2_journal_abort function is intended to support higher level error
1509 * recovery mechanisms such as the ext2/ext3 remount-readonly error
1510 * mode.
1511 *
1512 * Journal abort has very specific semantics. Any existing dirty,
1513 * unjournaled buffers in the main filesystem will still be written to
1514 * disk by bdflush, but the journaling mechanism will be suspended
1515 * immediately and no further transaction commits will be honoured.
1516 *
1517 * Any dirty, journaled buffers will be written back to disk without
1518 * hitting the journal. Atomicity cannot be guaranteed on an aborted
1519 * filesystem, but we _do_ attempt to leave as much data as possible
1520 * behind for fsck to use for cleanup.
1521 *
1522 * Any attempt to get a new transaction handle on a journal which is in
1523 * ABORT state will just result in an -EROFS error return. A
1524 * jbd2_journal_stop on an existing handle will return -EIO if we have
1525 * entered abort state during the update.
1526 *
1527 * Recursive transactions are not disturbed by journal abort until the
1528 * final jbd2_journal_stop, which will receive the -EIO error.
1529 *
1530 * Finally, the jbd2_journal_abort call allows the caller to supply an errno
1531 * which will be recorded (if possible) in the journal superblock. This
1532 * allows a client to record failure conditions in the middle of a
1533 * transaction without having to complete the transaction to record the
1534 * failure to disk. ext3_error, for example, now uses this
1535 * functionality.
1536 *
1537 * Errors which originate from within the journaling layer will NOT
1538 * supply an errno; a null errno implies that absolutely no further
1539 * writes are done to the journal (unless there are any already in
1540 * progress).
1541 *
1542 */
1543
1544void jbd2_journal_abort(journal_t *journal, int errno)
1545{
1546 __journal_abort_soft(journal, errno);
1547}
1548
1549/**
1550 * int jbd2_journal_errno () - returns the journal's error state.
1551 * @journal: journal to examine.
1552 *
1553 * This is the errno number set with jbd2_journal_abort(), the last
1554 * time the journal was mounted - if the journal was stopped
1555 * without calling abort this will be 0.
1556 *
1557 * If the journal has been aborted during this mount, -EROFS will
1558 * be returned.
1559 */
1560int jbd2_journal_errno(journal_t *journal)
1561{
1562 int err;
1563
1564 spin_lock(&journal->j_state_lock);
1565 if (journal->j_flags & JBD2_ABORT)
1566 err = -EROFS;
1567 else
1568 err = journal->j_errno;
1569 spin_unlock(&journal->j_state_lock);
1570 return err;
1571}
1572
1573/**
1574 * int jbd2_journal_clear_err () - clears the journal's error state
1575 * @journal: journal to act on.
1576 *
1577 * An error must be cleared or acked to take a FS out of readonly
1578 * mode.
1579 */
1580int jbd2_journal_clear_err(journal_t *journal)
1581{
1582 int err = 0;
1583
1584 spin_lock(&journal->j_state_lock);
1585 if (journal->j_flags & JBD2_ABORT)
1586 err = -EROFS;
1587 else
1588 journal->j_errno = 0;
1589 spin_unlock(&journal->j_state_lock);
1590 return err;
1591}
1592
1593/**
1594 * void jbd2_journal_ack_err() - Ack journal err.
1595 * @journal: journal to act on.
1596 *
1597 * An error must be cleared or acked to take a FS out of readonly
1598 * mode.
1599 */
1600void jbd2_journal_ack_err(journal_t *journal)
1601{
1602 spin_lock(&journal->j_state_lock);
1603 if (journal->j_errno)
1604 journal->j_flags |= JBD2_ACK_ERR;
1605 spin_unlock(&journal->j_state_lock);
1606}
1607
1608int jbd2_journal_blocks_per_page(struct inode *inode)
1609{
1610 return 1 << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
1611}
1612
1613/*
1614 * helper functions to deal with 32 or 64bit block numbers.
1615 */
1616size_t journal_tag_bytes(journal_t *journal)
1617{
1618 if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT))
1619 return JBD_TAG_SIZE64;
1620 else
1621 return JBD_TAG_SIZE32;
1622}
1623
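/*
 * Worked example for the helper above (sizes assumed): with 4096-byte
 * journal blocks and the 64-bit feature set, one descriptor block
 * carries (4096 - sizeof(journal_header_t)) / journal_tag_bytes(journal)
 * tags -- about 340 for 12-byte tags and a 12-byte header.
 */
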
1624/*
1625 * Simple support for retrying memory allocations. Introduced to help to
1626 * debug different VM deadlock avoidance strategies.
1627 */
1628void * __jbd2_kmalloc (const char *where, size_t size, gfp_t flags, int retry)
1629{
1630 return kmalloc(size, flags | (retry ? __GFP_NOFAIL : 0));
1631}
1632
1633/*
1634 * jbd slab management: create 1k, 2k, 4k, 8k slabs as needed
1635 * and allocate frozen and commit buffers from these slabs.
1636 *
1637 * The reason for doing this is to avoid SLAB_DEBUG, since it could
1638 * cause a bh to cross a page boundary.
1639 */
1640
1641#define JBD_MAX_SLABS 5
1642#define JBD_SLAB_INDEX(size) (size >> 11)
1643
1644static kmem_cache_t *jbd_slab[JBD_MAX_SLABS];
1645static const char *jbd_slab_names[JBD_MAX_SLABS] = {
1646 "jbd2_1k", "jbd2_2k", "jbd2_4k", NULL, "jbd2_8k"
1647};
1648
1649static void jbd2_journal_destroy_jbd_slabs(void)
1650{
1651 int i;
1652
1653 for (i = 0; i < JBD_MAX_SLABS; i++) {
1654 if (jbd_slab[i])
1655 kmem_cache_destroy(jbd_slab[i]);
1656 jbd_slab[i] = NULL;
1657 }
1658}
1659
1660static int jbd2_journal_create_jbd_slab(size_t slab_size)
1661{
1662 int i = JBD_SLAB_INDEX(slab_size);
1663
1664 BUG_ON(i >= JBD_MAX_SLABS);
1665
1666 /*
1667 * Check if we already have a slab created for this size
1668 */
1669 if (jbd_slab[i])
1670 return 0;
1671
1672 /*
1673 * Create a slab and force alignment to be same as slabsize -
1674 * this will make sure that allocations won't cross the page
1675 * boundary.
1676 */
1677 jbd_slab[i] = kmem_cache_create(jbd_slab_names[i],
1678 slab_size, slab_size, 0, NULL, NULL);
1679 if (!jbd_slab[i]) {
1680 printk(KERN_EMERG "JBD: no memory for jbd_slab cache\n");
1681 return -ENOMEM;
1682 }
1683 return 0;
1684}
1685
1686void * jbd2_slab_alloc(size_t size, gfp_t flags)
1687{
1688 int idx;
1689
1690 idx = JBD_SLAB_INDEX(size);
1691 BUG_ON(jbd_slab[idx] == NULL);
1692 return kmem_cache_alloc(jbd_slab[idx], flags | __GFP_NOFAIL);
1693}
1694
1695void jbd2_slab_free(void *ptr, size_t size)
1696{
1697 int idx;
1698
1699 idx = JBD_SLAB_INDEX(size);
1700 BUG_ON(jbd_slab[idx] == NULL);
1701 kmem_cache_free(jbd_slab[idx], ptr);
1702}
1703
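/*
 * Worked example of the JBD_SLAB_INDEX() mapping above: size >> 11
 * gives 0 for 1K, 1 for 2K, 2 for 4K and 4 for 8K blocks, which is
 * why jbd_slab_names[3] is NULL -- no blocksize maps to that slot.
 */
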
1704/*
1705 * Journal_head storage management
1706 */
1707static kmem_cache_t *jbd2_journal_head_cache;
1708#ifdef CONFIG_JBD_DEBUG
1709static atomic_t nr_journal_heads = ATOMIC_INIT(0);
1710#endif
1711
1712static int journal_init_jbd2_journal_head_cache(void)
1713{
1714 int retval;
1715
1716 J_ASSERT(jbd2_journal_head_cache == 0);
1717 jbd2_journal_head_cache = kmem_cache_create("jbd2_journal_head",
1718 sizeof(struct journal_head),
1719 0, /* offset */
1720 0, /* flags */
1721 NULL, /* ctor */
1722 NULL); /* dtor */
1723 retval = 0;
1724 if (jbd2_journal_head_cache == 0) {
1725 retval = -ENOMEM;
1726 printk(KERN_EMERG "JBD: no memory for journal_head cache\n");
1727 }
1728 return retval;
1729}
1730
1731static void jbd2_journal_destroy_jbd2_journal_head_cache(void)
1732{
1733 J_ASSERT(jbd2_journal_head_cache != NULL);
1734 kmem_cache_destroy(jbd2_journal_head_cache);
1735 jbd2_journal_head_cache = NULL;
1736}
1737
1738/*
1739 * journal_head splicing and dicing
1740 */
1741static struct journal_head *journal_alloc_journal_head(void)
1742{
1743 struct journal_head *ret;
1744 static unsigned long last_warning;
1745
1746#ifdef CONFIG_JBD_DEBUG
1747 atomic_inc(&nr_journal_heads);
1748#endif
1749 ret = kmem_cache_alloc(jbd2_journal_head_cache, GFP_NOFS);
1750 if (ret == 0) {
1751 jbd_debug(1, "out of memory for journal_head\n");
1752 if (time_after(jiffies, last_warning + 5*HZ)) {
1753 printk(KERN_NOTICE "ENOMEM in %s, retrying.\n",
1754 __FUNCTION__);
1755 last_warning = jiffies;
1756 }
1757 while (ret == 0) {
1758 yield();
1759 ret = kmem_cache_alloc(jbd2_journal_head_cache, GFP_NOFS);
1760 }
1761 }
1762 return ret;
1763}
1764
1765static void journal_free_journal_head(struct journal_head *jh)
1766{
1767#ifdef CONFIG_JBD_DEBUG
1768 atomic_dec(&nr_journal_heads);
1769 memset(jh, JBD_POISON_FREE, sizeof(*jh));
1770#endif
1771 kmem_cache_free(jbd2_journal_head_cache, jh);
1772}
1773
1774/*
1775 * A journal_head is attached to a buffer_head whenever JBD has an
1776 * interest in the buffer.
1777 *
1778 * Whenever a buffer has an attached journal_head, its ->b_state:BH_JBD bit
1779 * is set. This bit is tested in core kernel code where we need to take
1780 * JBD-specific actions. Testing the zeroness of ->b_private is not reliable
1781 * there.
1782 *
1783 * When a buffer has its BH_JBD bit set, its ->b_count is elevated by one.
1784 *
1785 * When a buffer has its BH_JBD bit set it is immune from being released by
1786 * core kernel code, mainly via ->b_count.
1787 *
1788 * A journal_head may be detached from its buffer_head when the journal_head's
1789 * b_transaction, b_cp_transaction and b_next_transaction pointers are NULL.
1790 * Various places in JBD call jbd2_journal_remove_journal_head() to indicate that the
1791 * journal_head can be dropped if needed.
1792 *
1793 * Various places in the kernel want to attach a journal_head to a buffer_head
1794 * _before_ attaching the journal_head to a transaction. To protect the
1795 * journal_head in this situation, jbd2_journal_add_journal_head elevates the
1796 * journal_head's b_jcount refcount by one. The caller must call
1797 * jbd2_journal_put_journal_head() to undo this.
1798 *
1799 * So the typical usage would be:
1800 *
1801 * (Attach a journal_head if needed. Increments b_jcount)
1802 * struct journal_head *jh = jbd2_journal_add_journal_head(bh);
1803 * ...
1804 * jh->b_transaction = xxx;
1805 * jbd2_journal_put_journal_head(jh);
1806 *
1807 * Now, the journal_head's b_jcount is zero, but it is safe from being released
1808 * because it has a non-zero b_transaction.
1809 */
1810
1811/*
1812 * Give a buffer_head a journal_head.
1813 *
1814 * Doesn't need the journal lock.
1815 * May sleep.
1816 */
1817struct journal_head *jbd2_journal_add_journal_head(struct buffer_head *bh)
1818{
1819 struct journal_head *jh;
1820 struct journal_head *new_jh = NULL;
1821
1822repeat:
1823 if (!buffer_jbd(bh)) {
1824 new_jh = journal_alloc_journal_head();
1825 memset(new_jh, 0, sizeof(*new_jh));
1826 }
1827
1828 jbd_lock_bh_journal_head(bh);
1829 if (buffer_jbd(bh)) {
1830 jh = bh2jh(bh);
1831 } else {
1832 J_ASSERT_BH(bh,
1833 (atomic_read(&bh->b_count) > 0) ||
1834 (bh->b_page && bh->b_page->mapping));
1835
1836 if (!new_jh) {
1837 jbd_unlock_bh_journal_head(bh);
1838 goto repeat;
1839 }
1840
1841 jh = new_jh;
1842 new_jh = NULL; /* We consumed it */
1843 set_buffer_jbd(bh);
1844 bh->b_private = jh;
1845 jh->b_bh = bh;
1846 get_bh(bh);
1847 BUFFER_TRACE(bh, "added journal_head");
1848 }
1849 jh->b_jcount++;
1850 jbd_unlock_bh_journal_head(bh);
1851 if (new_jh)
1852 journal_free_journal_head(new_jh);
1853 return bh->b_private;
1854}
1855
1856/*
1857 * Grab a ref against this buffer_head's journal_head. If it ended up not
1858 * having a journal_head, return NULL
1859 */
1860struct journal_head *jbd2_journal_grab_journal_head(struct buffer_head *bh)
1861{
1862 struct journal_head *jh = NULL;
1863
1864 jbd_lock_bh_journal_head(bh);
1865 if (buffer_jbd(bh)) {
1866 jh = bh2jh(bh);
1867 jh->b_jcount++;
1868 }
1869 jbd_unlock_bh_journal_head(bh);
1870 return jh;
1871}
1872
1873static void __journal_remove_journal_head(struct buffer_head *bh)
1874{
1875 struct journal_head *jh = bh2jh(bh);
1876
1877 J_ASSERT_JH(jh, jh->b_jcount >= 0);
1878
1879 get_bh(bh);
1880 if (jh->b_jcount == 0) {
1881 if (jh->b_transaction == NULL &&
1882 jh->b_next_transaction == NULL &&
1883 jh->b_cp_transaction == NULL) {
1884 J_ASSERT_JH(jh, jh->b_jlist == BJ_None);
1885 J_ASSERT_BH(bh, buffer_jbd(bh));
1886 J_ASSERT_BH(bh, jh2bh(jh) == bh);
1887 BUFFER_TRACE(bh, "remove journal_head");
1888 if (jh->b_frozen_data) {
1889 printk(KERN_WARNING "%s: freeing "
1890 "b_frozen_data\n",
1891 __FUNCTION__);
1892 jbd2_slab_free(jh->b_frozen_data, bh->b_size);
1893 }
1894 if (jh->b_committed_data) {
1895 printk(KERN_WARNING "%s: freeing "
1896 "b_committed_data\n",
1897 __FUNCTION__);
1898 jbd2_slab_free(jh->b_committed_data, bh->b_size);
1899 }
1900 bh->b_private = NULL;
1901 jh->b_bh = NULL; /* debug, really */
1902 clear_buffer_jbd(bh);
1903 __brelse(bh);
1904 journal_free_journal_head(jh);
1905 } else {
1906 BUFFER_TRACE(bh, "journal_head was locked");
1907 }
1908 }
1909}
1910
1911/*
1912 * jbd2_journal_remove_journal_head(): if the buffer isn't attached to a transaction
1913 * and has a zero b_jcount then remove and release its journal_head. If we did
1914 * see that the buffer is not used by any transaction we also "logically"
1915 * decrement ->b_count.
1916 *
1917 * We in fact take an additional increment on ->b_count as a convenience,
1918 * because the caller usually wants to do additional things with the bh
1919 * after calling here.
1920 * The caller of jbd2_journal_remove_journal_head() *must* run __brelse(bh) at some
1921 * time. Once the caller has run __brelse(), the buffer is eligible for
1922 * reaping by try_to_free_buffers().
1923 */
1924void jbd2_journal_remove_journal_head(struct buffer_head *bh)
1925{
1926 jbd_lock_bh_journal_head(bh);
1927 __journal_remove_journal_head(bh);
1928 jbd_unlock_bh_journal_head(bh);
1929}
1930
1931/*
1932 * Drop a reference on the passed journal_head. If it fell to zero then try to
1933 * release the journal_head from the buffer_head.
1934 */
1935void jbd2_journal_put_journal_head(struct journal_head *jh)
1936{
1937 struct buffer_head *bh = jh2bh(jh);
1938
1939 jbd_lock_bh_journal_head(bh);
1940 J_ASSERT_JH(jh, jh->b_jcount > 0);
1941 --jh->b_jcount;
1942 if (!jh->b_jcount && !jh->b_transaction) {
1943 __journal_remove_journal_head(bh);
1944 __brelse(bh);
1945 }
1946 jbd_unlock_bh_journal_head(bh);
1947}
1948
1949/*
1950 * /proc tunables
1951 */
1952#if defined(CONFIG_JBD_DEBUG)
1953int jbd2_journal_enable_debug;
1954EXPORT_SYMBOL(jbd2_journal_enable_debug);
1955#endif
1956
1957#if defined(CONFIG_JBD_DEBUG) && defined(CONFIG_PROC_FS)
1958
1959static struct proc_dir_entry *proc_jbd_debug;
1960
1961static int read_jbd_debug(char *page, char **start, off_t off,
1962 int count, int *eof, void *data)
1963{
1964 int ret;
1965
1966 ret = sprintf(page + off, "%d\n", jbd2_journal_enable_debug);
1967 *eof = 1;
1968 return ret;
1969}
1970
1971static int write_jbd_debug(struct file *file, const char __user *buffer,
1972 unsigned long count, void *data)
1973{
1974 char buf[32];
1975
1976 if (count > ARRAY_SIZE(buf) - 1)
1977 count = ARRAY_SIZE(buf) - 1;
1978 if (copy_from_user(buf, buffer, count))
1979 return -EFAULT;
1980 buf[ARRAY_SIZE(buf) - 1] = '\0';
1981 jbd2_journal_enable_debug = simple_strtoul(buf, NULL, 10);
1982 return count;
1983}
1984
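/*
 * Usage sketch: with CONFIG_JBD_DEBUG and procfs enabled, the debug
 * level can be driven from userspace, e.g.
 *
 *	echo 5 > /proc/sys/fs/jbd2-debug
 *	cat /proc/sys/fs/jbd2-debug
 *
 * (the path follows JBD_PROC_NAME below; the level value 5 is just an
 * illustrative assumption).
 */
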
1985#define JBD_PROC_NAME "sys/fs/jbd2-debug"
1986
1987static void __init create_jbd_proc_entry(void)
1988{
1989 proc_jbd_debug = create_proc_entry(JBD_PROC_NAME, 0644, NULL);
1990 if (proc_jbd_debug) {
1991 /* Why is this so hard? */
1992 proc_jbd_debug->read_proc = read_jbd_debug;
1993 proc_jbd_debug->write_proc = write_jbd_debug;
1994 }
1995}
1996
1997static void __exit jbd2_remove_jbd_proc_entry(void)
1998{
1999 if (proc_jbd_debug)
2000 remove_proc_entry(JBD_PROC_NAME, NULL);
2001}
2002
2003#else
2004
2005#define create_jbd_proc_entry() do {} while (0)
2006#define jbd2_remove_jbd_proc_entry() do {} while (0)
2007
2008#endif
2009
2010kmem_cache_t *jbd2_handle_cache;
2011
2012static int __init journal_init_handle_cache(void)
2013{
2014 jbd2_handle_cache = kmem_cache_create("jbd2_journal_handle",
2015 sizeof(handle_t),
2016 0, /* offset */
2017 0, /* flags */
2018 NULL, /* ctor */
2019 NULL); /* dtor */
2020 if (jbd2_handle_cache == NULL) {
2021 printk(KERN_EMERG "JBD: failed to create handle cache\n");
2022 return -ENOMEM;
2023 }
2024 return 0;
2025}
2026
2027static void jbd2_journal_destroy_handle_cache(void)
2028{
2029 if (jbd2_handle_cache)
2030 kmem_cache_destroy(jbd2_handle_cache);
2031}
2032
2033/*
2034 * Module startup and shutdown
2035 */
2036
2037static int __init journal_init_caches(void)
2038{
2039 int ret;
2040
2041 ret = jbd2_journal_init_revoke_caches();
2042 if (ret == 0)
2043 ret = journal_init_jbd2_journal_head_cache();
2044 if (ret == 0)
2045 ret = journal_init_handle_cache();
2046 return ret;
2047}
2048
2049static void jbd2_journal_destroy_caches(void)
2050{
2051 jbd2_journal_destroy_revoke_caches();
2052 jbd2_journal_destroy_jbd2_journal_head_cache();
2053 jbd2_journal_destroy_handle_cache();
2054 jbd2_journal_destroy_jbd_slabs();
2055}
2056
2057static int __init journal_init(void)
2058{
2059 int ret;
2060
2061 BUILD_BUG_ON(sizeof(struct journal_superblock_s) != 1024);
2062
2063 ret = journal_init_caches();
2064 if (ret != 0)
2065 jbd2_journal_destroy_caches();
2066 create_jbd_proc_entry();
2067 return ret;
2068}
2069
2070static void __exit journal_exit(void)
2071{
2072#ifdef CONFIG_JBD_DEBUG
2073 int n = atomic_read(&nr_journal_heads);
2074 if (n)
2075 printk(KERN_EMERG "JBD: leaked %d journal_heads!\n", n);
2076#endif
2077 jbd2_remove_jbd_proc_entry();
2078 jbd2_journal_destroy_caches();
2079}
2080
2081MODULE_LICENSE("GPL");
2082module_init(journal_init);
2083module_exit(journal_exit);
2084
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
new file mode 100644
index 000000000000..9f10acafaf70
--- /dev/null
+++ b/fs/jbd2/recovery.c
@@ -0,0 +1,609 @@
1/*
2 * linux/fs/jbd2/recovery.c
3 *
4 * Written by Stephen C. Tweedie <sct@redhat.com>, 1999
5 *
6 * Copyright 1999-2000 Red Hat Software --- All Rights Reserved
7 *
8 * This file is part of the Linux kernel and is made available under
9 * the terms of the GNU General Public License, version 2, or at your
10 * option, any later version, incorporated herein by reference.
11 *
12 * Journal recovery routines for the generic filesystem journaling code;
13 * part of the ext2fs journaling system.
14 */
15
16#ifndef __KERNEL__
17#include "jfs_user.h"
18#else
19#include <linux/time.h>
20#include <linux/fs.h>
21#include <linux/jbd2.h>
22#include <linux/errno.h>
23#include <linux/slab.h>
24#endif
25
26/*
27 * Maintain information about the progress of the recovery job, so that
28 * the different passes can carry information between them.
29 */
30struct recovery_info
31{
32 tid_t start_transaction;
33 tid_t end_transaction;
34
35 int nr_replays;
36 int nr_revokes;
37 int nr_revoke_hits;
38};
39
40enum passtype {PASS_SCAN, PASS_REVOKE, PASS_REPLAY};
41static int do_one_pass(journal_t *journal,
42 struct recovery_info *info, enum passtype pass);
43static int scan_revoke_records(journal_t *, struct buffer_head *,
44 tid_t, struct recovery_info *);
45
46#ifdef __KERNEL__
47
48/* Release readahead buffers after use */
49static void journal_brelse_array(struct buffer_head *b[], int n)
50{
51 while (--n >= 0)
52 brelse (b[n]);
53}
54
55
56/*
57 * When reading from the journal, we are going through the block device
58 * layer directly and so there is no readahead being done for us. We
59 * need to implement any readahead ourselves if we want it to happen at
60 * all. Recovery is basically one long sequential read, so make sure we
61 * do the IO in reasonably large chunks.
62 *
63 * This is not so critical that we need to be enormously clever about
64 * the readahead size, though. 128K is a purely arbitrary, good-enough
65 * fixed value.
66 */
67
68#define MAXBUF 8
69static int do_readahead(journal_t *journal, unsigned int start)
70{
71 int err;
72 unsigned int max, nbufs, next;
73 unsigned long long blocknr;
74 struct buffer_head *bh;
75
76 struct buffer_head * bufs[MAXBUF];
77
78 /* Do up to 128K of readahead */
79 max = start + (128 * 1024 / journal->j_blocksize);
80 if (max > journal->j_maxlen)
81 max = journal->j_maxlen;
82
83 /* Do the readahead itself. We'll submit MAXBUF buffer_heads at
84 * a time to the block device IO layer. */
85
86 nbufs = 0;
87
88 for (next = start; next < max; next++) {
89 err = jbd2_journal_bmap(journal, next, &blocknr);
90
91 if (err) {
92 printk (KERN_ERR "JBD: bad block at offset %u\n",
93 next);
94 goto failed;
95 }
96
97 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
98 if (!bh) {
99 err = -ENOMEM;
100 goto failed;
101 }
102
103 if (!buffer_uptodate(bh) && !buffer_locked(bh)) {
104 bufs[nbufs++] = bh;
105 if (nbufs == MAXBUF) {
106 ll_rw_block(READ, nbufs, bufs);
107 journal_brelse_array(bufs, nbufs);
108 nbufs = 0;
109 }
110 } else
111 brelse(bh);
112 }
113
114 if (nbufs)
115 ll_rw_block(READ, nbufs, bufs);
116 err = 0;
117
118failed:
119 if (nbufs)
120 journal_brelse_array(bufs, nbufs);
121 return err;
122}
123
124#endif /* __KERNEL__ */
125
126
127/*
128 * Read a block from the journal
129 */
130
131static int jread(struct buffer_head **bhp, journal_t *journal,
132 unsigned int offset)
133{
134 int err;
135 unsigned long long blocknr;
136 struct buffer_head *bh;
137
138 *bhp = NULL;
139
140 if (offset >= journal->j_maxlen) {
141 printk(KERN_ERR "JBD: corrupted journal superblock\n");
142 return -EIO;
143 }
144
145 err = jbd2_journal_bmap(journal, offset, &blocknr);
146
147 if (err) {
148 printk (KERN_ERR "JBD: bad block at offset %u\n",
149 offset);
150 return err;
151 }
152
153 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
154 if (!bh)
155 return -ENOMEM;
156
157 if (!buffer_uptodate(bh)) {
158 /* If this is a brand new buffer, start readahead.
159 Otherwise, we assume we are already reading it. */
160 if (!buffer_req(bh))
161 do_readahead(journal, offset);
162 wait_on_buffer(bh);
163 }
164
165 if (!buffer_uptodate(bh)) {
166 printk (KERN_ERR "JBD: Failed to read block at offset %u\n",
167 offset);
168 brelse(bh);
169 return -EIO;
170 }
171
172 *bhp = bh;
173 return 0;
174}
175
176
177/*
178 * Count the number of in-use tags in a journal descriptor block.
179 */
180
181static int count_tags(journal_t *journal, struct buffer_head *bh)
182{
183 char * tagp;
184 journal_block_tag_t * tag;
185 int nr = 0, size = journal->j_blocksize;
186 int tag_bytes = journal_tag_bytes(journal);
187
188 tagp = &bh->b_data[sizeof(journal_header_t)];
189
190 while ((tagp - bh->b_data + tag_bytes) <= size) {
191 tag = (journal_block_tag_t *) tagp;
192
193 nr++;
194 tagp += tag_bytes;
195 if (!(tag->t_flags & cpu_to_be32(JBD2_FLAG_SAME_UUID)))
196 tagp += 16;
197
198 if (tag->t_flags & cpu_to_be32(JBD2_FLAG_LAST_TAG))
199 break;
200 }
201
202 return nr;
203}
204
205
206/* Make sure we wrap around the log correctly! */
207#define wrap(journal, var) \
208do { \
209 if (var >= (journal)->j_last) \
210 var -= ((journal)->j_last - (journal)->j_first); \
211} while (0)
212
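/*
 * Worked example of wrap() above (geometry assumed): with j_first == 1
 * and j_last == 8192, a variable that reaches 8192 is pulled back by
 * 8191 to block 1, so the traversal never touches the superblock at
 * block 0 and never runs off the end of the log.
 */
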
213/**
214 * jbd2_journal_recover - recovers an on-disk journal
215 * @journal: the journal to recover
216 *
217 * The primary function for recovering the log contents when mounting a
218 * journaled device.
219 *
220 * Recovery is done in three passes. In the first pass, we look for the
221 * end of the log. In the second, we assemble the list of revoke
222 * blocks. In the third and final pass, we replay any un-revoked blocks
223 * in the log.
224 */
225int jbd2_journal_recover(journal_t *journal)
226{
227 int err;
228 journal_superblock_t * sb;
229
230 struct recovery_info info;
231
232 memset(&info, 0, sizeof(info));
233 sb = journal->j_superblock;
234
235 /*
236 * The journal superblock's s_start field (the current log head)
237 * is always zero if, and only if, the journal was cleanly
238 * unmounted.
239 */
240
241 if (!sb->s_start) {
242 jbd_debug(1, "No recovery required, last transaction %d\n",
243 be32_to_cpu(sb->s_sequence));
244 journal->j_transaction_sequence = be32_to_cpu(sb->s_sequence) + 1;
245 return 0;
246 }
247
248 err = do_one_pass(journal, &info, PASS_SCAN);
249 if (!err)
250 err = do_one_pass(journal, &info, PASS_REVOKE);
251 if (!err)
252 err = do_one_pass(journal, &info, PASS_REPLAY);
253
254 jbd_debug(0, "JBD: recovery, exit status %d, "
255 "recovered transactions %u to %u\n",
256 err, info.start_transaction, info.end_transaction);
257 jbd_debug(0, "JBD: Replayed %d and revoked %d/%d blocks\n",
258 info.nr_replays, info.nr_revoke_hits, info.nr_revokes);
259
260 /* Restart the log at the next transaction ID, thus invalidating
261 * any existing commit records in the log. */
262 journal->j_transaction_sequence = ++info.end_transaction;
263
264 jbd2_journal_clear_revoke(journal);
265 sync_blockdev(journal->j_fs_dev);
266 return err;
267}
268
269/**
270 * jbd2_journal_skip_recovery - Start journal and wipe existing records
271 * @journal: journal to startup
272 *
273 * Locate any valid recovery information from the journal and set up the
274 * journal structures in memory to ignore it (presumably because the
275 * caller has evidence that it is out of date).
276 * This function doesn't appear to be exported.
277 *
278 * We perform one pass over the journal to allow us to tell the user how
279 * much recovery information is being erased, and to let us initialise
280 * the journal transaction sequence numbers to the next unused ID.
281 */
282int jbd2_journal_skip_recovery(journal_t *journal)
283{
284 int err;
285 journal_superblock_t * sb;
286
287 struct recovery_info info;
288
289 memset (&info, 0, sizeof(info));
290 sb = journal->j_superblock;
291
292 err = do_one_pass(journal, &info, PASS_SCAN);
293
294 if (err) {
295 printk(KERN_ERR "JBD: error %d scanning journal\n", err);
296 ++journal->j_transaction_sequence;
297 } else {
298#ifdef CONFIG_JBD_DEBUG
299 int dropped = info.end_transaction - be32_to_cpu(sb->s_sequence);
300#endif
301 jbd_debug(0,
302 "JBD: ignoring %d transaction%s from the journal.\n",
303 dropped, (dropped == 1) ? "" : "s");
304 journal->j_transaction_sequence = ++info.end_transaction;
305 }
306
307 journal->j_tail = 0;
308 return err;
309}
310
311static inline unsigned long long read_tag_block(int tag_bytes, journal_block_tag_t *tag)
312{
313 unsigned long long block = be32_to_cpu(tag->t_blocknr);
314 if (tag_bytes > JBD_TAG_SIZE32)
315 block |= (u64)be32_to_cpu(tag->t_blocknr_high) << 32;
316 return block;
317}
318
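/*
 * Worked example for read_tag_block() above (values assumed): with
 * t_blocknr == cpu_to_be32(5) and t_blocknr_high == cpu_to_be32(1),
 * a 64-bit journal (tag_bytes > JBD_TAG_SIZE32) yields block
 * 0x100000005, while a 32-bit journal yields plain block 5.
 */
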
319static int do_one_pass(journal_t *journal,
320 struct recovery_info *info, enum passtype pass)
321{
322 unsigned int first_commit_ID, next_commit_ID;
323 unsigned long next_log_block;
324 int err, success = 0;
325 journal_superblock_t * sb;
326 journal_header_t * tmp;
327 struct buffer_head * bh;
328 unsigned int sequence;
329 int blocktype;
330 int tag_bytes = journal_tag_bytes(journal);
331
332 /* Precompute the maximum metadata descriptors in a descriptor block */
333 int MAX_BLOCKS_PER_DESC;
334 MAX_BLOCKS_PER_DESC = ((journal->j_blocksize-sizeof(journal_header_t))
335 / tag_bytes);
336
337 /*
338 * First thing is to establish what we expect to find in the log
339 * (in terms of transaction IDs), and where (in terms of log
340 * block offsets): query the superblock.
341 */
342
343 sb = journal->j_superblock;
344 next_commit_ID = be32_to_cpu(sb->s_sequence);
345 next_log_block = be32_to_cpu(sb->s_start);
346
347 first_commit_ID = next_commit_ID;
348 if (pass == PASS_SCAN)
349 info->start_transaction = first_commit_ID;
350
351 jbd_debug(1, "Starting recovery pass %d\n", pass);
352
353 /*
354 * Now we walk through the log, transaction by transaction,
355 * making sure that each transaction has a commit block in the
356 * expected place. Each complete transaction gets replayed back
357 * into the main filesystem.
358 */
359
360 while (1) {
361 int flags;
362 char * tagp;
363 journal_block_tag_t * tag;
364 struct buffer_head * obh;
365 struct buffer_head * nbh;
366
367 cond_resched(); /* We're under lock_kernel() */
368
369 /* If we already know where to stop the log traversal,
370 * check right now that we haven't gone past the end of
371 * the log. */
372
373 if (pass != PASS_SCAN)
374 if (tid_geq(next_commit_ID, info->end_transaction))
375 break;
376
377 jbd_debug(2, "Scanning for sequence ID %u at %lu/%lu\n",
378 next_commit_ID, next_log_block, journal->j_last);
379
380 /* Skip over each chunk of the transaction looking for
381 * either the next descriptor block or the final commit
382 * record. */
383
384 jbd_debug(3, "JBD: checking block %ld\n", next_log_block);
385 err = jread(&bh, journal, next_log_block);
386 if (err)
387 goto failed;
388
389 next_log_block++;
390 wrap(journal, next_log_block);
391
392 /* What kind of buffer is it?
393 *
394 * If it is a descriptor block, check that it has the
395 * expected sequence number. Otherwise, we're all done
396 * here. */
397
398 tmp = (journal_header_t *)bh->b_data;
399
400 if (tmp->h_magic != cpu_to_be32(JBD2_MAGIC_NUMBER)) {
401 brelse(bh);
402 break;
403 }
404
405 blocktype = be32_to_cpu(tmp->h_blocktype);
406 sequence = be32_to_cpu(tmp->h_sequence);
407 jbd_debug(3, "Found magic %d, sequence %d\n",
408 blocktype, sequence);
409
410 if (sequence != next_commit_ID) {
411 brelse(bh);
412 break;
413 }
414
415 /* OK, we have a valid descriptor block which matches
416 * all of the sequence number checks. What are we going
417 * to do with it? That depends on the pass... */
418
419 switch(blocktype) {
420 case JBD2_DESCRIPTOR_BLOCK:
421 /* If it is a valid descriptor block, replay it
422 * in pass REPLAY; otherwise, just skip over the
423 * blocks it describes. */
424 if (pass != PASS_REPLAY) {
425 next_log_block += count_tags(journal, bh);
426 wrap(journal, next_log_block);
427 brelse(bh);
428 continue;
429 }
430
431 /* A descriptor block: we can now write all of
432 * the data blocks. Yay, useful work is finally
433 * getting done here! */
434
435 tagp = &bh->b_data[sizeof(journal_header_t)];
436 while ((tagp - bh->b_data + tag_bytes)
437 <= journal->j_blocksize) {
438 unsigned long io_block;
439
440 tag = (journal_block_tag_t *) tagp;
441 flags = be32_to_cpu(tag->t_flags);
442
443 io_block = next_log_block++;
444 wrap(journal, next_log_block);
445 err = jread(&obh, journal, io_block);
446 if (err) {
447 /* Recover what we can, but
448 * report failure at the end. */
449 success = err;
450 printk (KERN_ERR
451 "JBD: IO error %d recovering "
452 "block %ld in log\n",
453 err, io_block);
454 } else {
455 unsigned long long blocknr;
456
457 J_ASSERT(obh != NULL);
458 blocknr = read_tag_block(tag_bytes,
459 tag);
460
461 /* If the block has been
462 * revoked, then we're all done
463 * here. */
464 if (jbd2_journal_test_revoke
465 (journal, blocknr,
466 next_commit_ID)) {
467 brelse(obh);
468 ++info->nr_revoke_hits;
469 goto skip_write;
470 }
471
472 /* Find a buffer for the new
473 * data being restored */
474 nbh = __getblk(journal->j_fs_dev,
475 blocknr,
476 journal->j_blocksize);
477 if (nbh == NULL) {
478 printk(KERN_ERR
479 "JBD: Out of memory "
480 "during recovery.\n");
481 err = -ENOMEM;
482 brelse(bh);
483 brelse(obh);
484 goto failed;
485 }
486
487 lock_buffer(nbh);
488 memcpy(nbh->b_data, obh->b_data,
489 journal->j_blocksize);
490 if (flags & JBD2_FLAG_ESCAPE) {
491 *((__be32 *)nbh->b_data) =
492 cpu_to_be32(JBD2_MAGIC_NUMBER);
493 }
494
495 BUFFER_TRACE(nbh, "marking dirty");
496 set_buffer_uptodate(nbh);
497 mark_buffer_dirty(nbh);
498 BUFFER_TRACE(nbh, "marking uptodate");
499 ++info->nr_replays;
500 /* ll_rw_block(WRITE, 1, &nbh); */
501 unlock_buffer(nbh);
502 brelse(obh);
503 brelse(nbh);
504 }
505
506 skip_write:
507 tagp += tag_bytes;
508 if (!(flags & JBD2_FLAG_SAME_UUID))
509 tagp += 16;
510
511 if (flags & JBD2_FLAG_LAST_TAG)
512 break;
513 }
514
515 brelse(bh);
516 continue;
517
518 case JBD2_COMMIT_BLOCK:
519 /* Found an expected commit block: not much to
520 * do other than move on to the next sequence
521 * number. */
522 brelse(bh);
523 next_commit_ID++;
524 continue;
525
526 case JBD2_REVOKE_BLOCK:
527 /* If we aren't in the REVOKE pass, then we can
528 * just skip over this block. */
529 if (pass != PASS_REVOKE) {
530 brelse(bh);
531 continue;
532 }
533
534 err = scan_revoke_records(journal, bh,
535 next_commit_ID, info);
536 brelse(bh);
537 if (err)
538 goto failed;
539 continue;
540
541 default:
542 jbd_debug(3, "Unrecognised magic %d, end of scan.\n",
543 blocktype);
544 brelse(bh);
545 goto done;
546 }
547 }
548
549 done:
550 /*
551 * We broke out of the log scan loop: either we came to the
552 * known end of the log or we found an unexpected block in the
553 * log. If the latter happened, then we know that the "current"
554 * transaction marks the end of the valid log.
555 */
556
557 if (pass == PASS_SCAN)
558 info->end_transaction = next_commit_ID;
559 else {
560 /* It's really bad news if different passes end up at
561 * different places (but possible due to IO errors). */
562 if (info->end_transaction != next_commit_ID) {
563 printk (KERN_ERR "JBD: recovery pass %d ended at "
564 "transaction %u, expected %u\n",
565 pass, next_commit_ID, info->end_transaction);
566 if (!success)
567 success = -EIO;
568 }
569 }
570
571 return success;
572
573 failed:
574 return err;
575}
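
/*
 * Editorial note (not part of the original source): the scan loop ending
 * above is run once per recovery pass, and the pass checks in its switch
 * statement give recovery its shape:
 *
 *	PASS_SCAN:   locate the end of the valid log (the last commit);
 *	PASS_REVOKE: build the revoke table from JBD2_REVOKE_BLOCKs;
 *	PASS_REPLAY: copy descriptor-listed blocks back into place,
 *	             unless jbd2_journal_test_revoke() reports a later
 *	             revoke that supersedes them.
 */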
576
577
578/* Scan a revoke record, marking all blocks mentioned as revoked. */
579
580static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
581 tid_t sequence, struct recovery_info *info)
582{
583 jbd2_journal_revoke_header_t *header;
584 int offset, max;
585 int record_len = 4;
586
587 header = (jbd2_journal_revoke_header_t *) bh->b_data;
588 offset = sizeof(jbd2_journal_revoke_header_t);
589 max = be32_to_cpu(header->r_count);
590
591 if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT))
592 record_len = 8;
593
594 while (offset + record_len <= max) {
595 unsigned long long blocknr;
596 int err;
597
598 if (record_len == 4)
599 blocknr = be32_to_cpu(* ((__be32 *) (bh->b_data+offset)));
600 else
601 blocknr = be64_to_cpu(* ((__be64 *) (bh->b_data+offset)));
602 offset += record_len;
603 err = jbd2_journal_set_revoke(journal, blocknr, sequence);
604 if (err)
605 return err;
606 ++info->nr_revokes;
607 }
608 return 0;
609}
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
new file mode 100644
index 000000000000..380d19917f37
--- /dev/null
+++ b/fs/jbd2/revoke.c
@@ -0,0 +1,712 @@
1/*
2 * linux/fs/jbd2/revoke.c
3 *
4 * Written by Stephen C. Tweedie <sct@redhat.com>, 2000
5 *
6 * Copyright 2000 Red Hat corp --- All Rights Reserved
7 *
8 * This file is part of the Linux kernel and is made available under
9 * the terms of the GNU General Public License, version 2, or at your
10 * option, any later version, incorporated herein by reference.
11 *
12 * Journal revoke routines for the generic filesystem journaling code;
13 * part of the ext2fs journaling system.
14 *
15 * Revoke is the mechanism used to prevent old log records for deleted
16 * metadata from being replayed on top of newer data using the same
17 * blocks. The revoke mechanism is used in two separate places:
18 *
19 * + Commit: during commit we write the entire list of the current
20 * transaction's revoked blocks to the journal
21 *
22 * + Recovery: during recovery we record the transaction ID of all
23 * revoked blocks. If there are multiple revoke records in the log
24 * for a single block, only the last one counts, and if there is a log
25 * entry for a block beyond the last revoke, then that log entry still
26 * gets replayed.
27 *
28 * We can get interactions between revokes and new log data within a
29 * single transaction:
30 *
31 * Block is revoked and then journaled:
32 * The desired end result is the journaling of the new block, so we
33 * cancel the revoke before the transaction commits.
34 *
35 * Block is journaled and then revoked:
36 * The revoke must take precedence over the write of the block, so we
37 * need either to cancel the journal entry or to write the revoke
38 * later in the log than the log block. In this case, we choose the
39 * latter: journaling a block cancels any revoke record for that block
40 * in the current transaction, so any revoke for that block in the
41 * transaction must have happened after the block was journaled and so
42 * the revoke must take precedence.
43 *
44 * Block is revoked and then written as data:
45 * The data write is allowed to succeed, but the revoke is _not_
46 * cancelled. We still need to prevent old log records from
47 * overwriting the new data. We don't even need to clear the revoke
48 * bit here.
49 *
50 * Revoke information on buffers is a tri-state value:
51 *
52 * RevokeValid clear: no cached revoke status, need to look it up
53 * RevokeValid set, Revoked clear:
54 * buffer has not been revoked, and cancel_revoke
55 * need do nothing.
56 * RevokeValid set, Revoked set:
57 * buffer has been revoked.
58 */
59
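/*
 * Editorial sketch (not in the original file): the tri-state above
 * collapses to a lookup of roughly this shape, assuming the
 * buffer_revokevalid()/buffer_revoked() test macros from jbd2.h and the
 * find_revoke_record() helper defined later in this file:
 *
 *	int revoked;
 *	if (buffer_revokevalid(bh))
 *		revoked = buffer_revoked(bh);	-- cached answer
 *	else
 *		revoked = find_revoke_record(journal,
 *					     bh->b_blocknr) != NULL;
 */
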
60#ifndef __KERNEL__
61#include "jfs_user.h"
62#else
63#include <linux/time.h>
64#include <linux/fs.h>
65#include <linux/jbd2.h>
66#include <linux/errno.h>
67#include <linux/slab.h>
68#include <linux/list.h>
69#include <linux/smp_lock.h>
70#include <linux/init.h>
71#endif
72
73static kmem_cache_t *jbd2_revoke_record_cache;
74static kmem_cache_t *jbd2_revoke_table_cache;
75
76/* Each revoke record represents one single revoked block. During
77 journal replay, this involves recording the transaction ID of the
78 last transaction to revoke this block. */
79
80struct jbd2_revoke_record_s
81{
82 struct list_head hash;
83 tid_t sequence; /* Used for recovery only */
84 unsigned long long blocknr;
85};
86
87
88/* The revoke table is just a simple hash table of revoke records. */
89struct jbd2_revoke_table_s
90{
91 /* It is conceivable that we might want a larger hash table
92 * for recovery. Must be a power of two. */
93 int hash_size;
94 int hash_shift;
95 struct list_head *hash_table;
96};
97
98
99#ifdef __KERNEL__
100static void write_one_revoke_record(journal_t *, transaction_t *,
101 struct journal_head **, int *,
102 struct jbd2_revoke_record_s *);
103static void flush_descriptor(journal_t *, struct journal_head *, int);
104#endif
105
106/* Utility functions to maintain the revoke table */
107
108/* Borrowed from buffer.c: this is a tried and tested block hash function */
109static inline int hash(journal_t *journal, unsigned long long block)
110{
111 struct jbd2_revoke_table_s *table = journal->j_revoke;
112 int hash_shift = table->hash_shift;
113 int hash = (int)block ^ (int)((block >> 31) >> 1);
114
115 return ((hash << (hash_shift - 6)) ^
116 (hash >> 13) ^
117 (hash << (hash_shift - 12))) & (table->hash_size - 1);
118}
119
120static int insert_revoke_hash(journal_t *journal, unsigned long long blocknr,
121 tid_t seq)
122{
123 struct list_head *hash_list;
124 struct jbd2_revoke_record_s *record;
125
126repeat:
127 record = kmem_cache_alloc(jbd2_revoke_record_cache, GFP_NOFS);
128 if (!record)
129 goto oom;
130
131 record->sequence = seq;
132 record->blocknr = blocknr;
133 hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)];
134 spin_lock(&journal->j_revoke_lock);
135 list_add(&record->hash, hash_list);
136 spin_unlock(&journal->j_revoke_lock);
137 return 0;
138
139oom:
140 if (!journal_oom_retry)
141 return -ENOMEM;
142 jbd_debug(1, "ENOMEM in %s, retrying\n", __FUNCTION__);
143 yield();
144 goto repeat;
145}
146
147/* Find a revoke record in the journal's hash table. */
148
149static struct jbd2_revoke_record_s *find_revoke_record(journal_t *journal,
150 unsigned long long blocknr)
151{
152 struct list_head *hash_list;
153 struct jbd2_revoke_record_s *record;
154
155 hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)];
156
157 spin_lock(&journal->j_revoke_lock);
158 record = (struct jbd2_revoke_record_s *) hash_list->next;
159 while (&(record->hash) != hash_list) {
160 if (record->blocknr == blocknr) {
161 spin_unlock(&journal->j_revoke_lock);
162 return record;
163 }
164 record = (struct jbd2_revoke_record_s *) record->hash.next;
165 }
166 spin_unlock(&journal->j_revoke_lock);
167 return NULL;
168}
169
170int __init jbd2_journal_init_revoke_caches(void)
171{
172 jbd2_revoke_record_cache = kmem_cache_create("jbd2_revoke_record",
173 sizeof(struct jbd2_revoke_record_s),
174 0, SLAB_HWCACHE_ALIGN, NULL, NULL);
175	if (!jbd2_revoke_record_cache)
176 return -ENOMEM;
177
178 jbd2_revoke_table_cache = kmem_cache_create("jbd2_revoke_table",
179 sizeof(struct jbd2_revoke_table_s),
180 0, 0, NULL, NULL);
181	if (!jbd2_revoke_table_cache) {
182 kmem_cache_destroy(jbd2_revoke_record_cache);
183 jbd2_revoke_record_cache = NULL;
184 return -ENOMEM;
185 }
186 return 0;
187}
188
189void jbd2_journal_destroy_revoke_caches(void)
190{
191 kmem_cache_destroy(jbd2_revoke_record_cache);
192 jbd2_revoke_record_cache = NULL;
193 kmem_cache_destroy(jbd2_revoke_table_cache);
194 jbd2_revoke_table_cache = NULL;
195}
196
197/* Initialise the revoke table for a given journal to a given size. */
198
199int jbd2_journal_init_revoke(journal_t *journal, int hash_size)
200{
201 int shift, tmp;
202
203 J_ASSERT (journal->j_revoke_table[0] == NULL);
204
205 shift = 0;
206 tmp = hash_size;
207 while((tmp >>= 1UL) != 0UL)
208 shift++;
209
210 journal->j_revoke_table[0] = kmem_cache_alloc(jbd2_revoke_table_cache, GFP_KERNEL);
211 if (!journal->j_revoke_table[0])
212 return -ENOMEM;
213 journal->j_revoke = journal->j_revoke_table[0];
214
215 /* Check that the hash_size is a power of two */
216 J_ASSERT ((hash_size & (hash_size-1)) == 0);
217
218 journal->j_revoke->hash_size = hash_size;
219
220 journal->j_revoke->hash_shift = shift;
221
222 journal->j_revoke->hash_table =
223 kmalloc(hash_size * sizeof(struct list_head), GFP_KERNEL);
224 if (!journal->j_revoke->hash_table) {
225 kmem_cache_free(jbd2_revoke_table_cache, journal->j_revoke_table[0]);
226 journal->j_revoke = NULL;
227 return -ENOMEM;
228 }
229
230 for (tmp = 0; tmp < hash_size; tmp++)
231 INIT_LIST_HEAD(&journal->j_revoke->hash_table[tmp]);
232
233 journal->j_revoke_table[1] = kmem_cache_alloc(jbd2_revoke_table_cache, GFP_KERNEL);
234 if (!journal->j_revoke_table[1]) {
235 kfree(journal->j_revoke_table[0]->hash_table);
236 kmem_cache_free(jbd2_revoke_table_cache, journal->j_revoke_table[0]);
237 return -ENOMEM;
238 }
239
240 journal->j_revoke = journal->j_revoke_table[1];
241
242 /* Check that the hash_size is a power of two */
243 J_ASSERT ((hash_size & (hash_size-1)) == 0);
244
245 journal->j_revoke->hash_size = hash_size;
246
247 journal->j_revoke->hash_shift = shift;
248
249 journal->j_revoke->hash_table =
250 kmalloc(hash_size * sizeof(struct list_head), GFP_KERNEL);
251 if (!journal->j_revoke->hash_table) {
252 kfree(journal->j_revoke_table[0]->hash_table);
253 kmem_cache_free(jbd2_revoke_table_cache, journal->j_revoke_table[0]);
254 kmem_cache_free(jbd2_revoke_table_cache, journal->j_revoke_table[1]);
255 journal->j_revoke = NULL;
256 return -ENOMEM;
257 }
258
259 for (tmp = 0; tmp < hash_size; tmp++)
260 INIT_LIST_HEAD(&journal->j_revoke->hash_table[tmp]);
261
262 spin_lock_init(&journal->j_revoke_lock);
263
264 return 0;
265}
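
/*
 * Editorial worked example (assumed default): journal setup customarily
 * passes a hash_size of 256, for which the shift loop above computes
 * shift = 8, and hash() then masks its result with
 * hash_size - 1 == 0xff so every bucket index stays inside the table.
 */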
266
267/* Destroy a journal's revoke table. The table must already be empty! */
268
269void jbd2_journal_destroy_revoke(journal_t *journal)
270{
271 struct jbd2_revoke_table_s *table;
272 struct list_head *hash_list;
273 int i;
274
275 table = journal->j_revoke_table[0];
276 if (!table)
277 return;
278
279 for (i=0; i<table->hash_size; i++) {
280 hash_list = &table->hash_table[i];
281 J_ASSERT (list_empty(hash_list));
282 }
283
284 kfree(table->hash_table);
285 kmem_cache_free(jbd2_revoke_table_cache, table);
286 journal->j_revoke = NULL;
287
288 table = journal->j_revoke_table[1];
289 if (!table)
290 return;
291
292 for (i=0; i<table->hash_size; i++) {
293 hash_list = &table->hash_table[i];
294 J_ASSERT (list_empty(hash_list));
295 }
296
297 kfree(table->hash_table);
298 kmem_cache_free(jbd2_revoke_table_cache, table);
299 journal->j_revoke = NULL;
300}
301
302
303#ifdef __KERNEL__
304
305/*
306 * jbd2_journal_revoke: revoke a given buffer_head from the journal. This
307 * prevents the block from being replayed during recovery if we take a
308 * crash after this current transaction commits. Any subsequent
309 * metadata writes of the buffer in this transaction cancel the
310 * revoke.
311 *
312 * Note that this call may block --- it is up to the caller to make
313 * sure that there are no further calls to journal_write_metadata
314 * before the revoke is complete. In ext3, this implies calling the
315 * revoke before clearing the block bitmap when we are deleting
316 * metadata.
317 *
318 * Revoke performs a jbd2_journal_forget on any buffer_head passed in as a
319 * parameter, but does _not_ forget the buffer_head if the bh was only
320 * found implicitly.
321 *
322 * bh_in may not be a journalled buffer - it may have come off
323 * the hash tables without an attached journal_head.
324 *
325 * If bh_in is non-NULL, jbd2_journal_revoke() will decrement its b_count
326 * by one.
327 */
328
329int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr,
330 struct buffer_head *bh_in)
331{
332 struct buffer_head *bh = NULL;
333 journal_t *journal;
334 struct block_device *bdev;
335 int err;
336
337 might_sleep();
338 if (bh_in)
339 BUFFER_TRACE(bh_in, "enter");
340
341 journal = handle->h_transaction->t_journal;
342 if (!jbd2_journal_set_features(journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)){
343 J_ASSERT (!"Cannot set revoke feature!");
344 return -EINVAL;
345 }
346
347 bdev = journal->j_fs_dev;
348 bh = bh_in;
349
350 if (!bh) {
351 bh = __find_get_block(bdev, blocknr, journal->j_blocksize);
352 if (bh)
353 BUFFER_TRACE(bh, "found on hash");
354 }
355#ifdef JBD_EXPENSIVE_CHECKING
356 else {
357 struct buffer_head *bh2;
358
359 /* If there is a different buffer_head lying around in
360 * memory anywhere... */
361 bh2 = __find_get_block(bdev, blocknr, journal->j_blocksize);
362 if (bh2) {
363 /* ... and it has RevokeValid status... */
364 if (bh2 != bh && buffer_revokevalid(bh2))
365 /* ...then it better be revoked too,
366 * since it's illegal to create a revoke
367 * record against a buffer_head which is
368 * not marked revoked --- that would
369 * risk missing a subsequent revoke
370 * cancel. */
371 J_ASSERT_BH(bh2, buffer_revoked(bh2));
372 put_bh(bh2);
373 }
374 }
375#endif
376
377 /* We really ought not ever to revoke twice in a row without
378 first having the revoke cancelled: it's illegal to free a
379 block twice without allocating it in between! */
380 if (bh) {
381 if (!J_EXPECT_BH(bh, !buffer_revoked(bh),
382 "inconsistent data on disk")) {
383 if (!bh_in)
384 brelse(bh);
385 return -EIO;
386 }
387 set_buffer_revoked(bh);
388 set_buffer_revokevalid(bh);
389 if (bh_in) {
390 BUFFER_TRACE(bh_in, "call jbd2_journal_forget");
391 jbd2_journal_forget(handle, bh_in);
392 } else {
393 BUFFER_TRACE(bh, "call brelse");
394 __brelse(bh);
395 }
396 }
397
398	jbd_debug(2, "insert revoke for block %llu, bh_in=%p\n", blocknr, bh_in);
399 err = insert_revoke_hash(journal, blocknr,
400 handle->h_transaction->t_tid);
401 BUFFER_TRACE(bh_in, "exit");
402 return err;
403}
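
/*
 * Editorial usage sketch (hypothetical filesystem-side pattern, error
 * handling elided): per the rules above, a filesystem deleting metadata
 * revokes the block before clearing its bit in the block bitmap:
 *
 *	err = jbd2_journal_revoke(handle, blocknr, bh);
 *	if (!err)
 *		err = fs_clear_block_bitmap(handle, blocknr);
 *
 * fs_clear_block_bitmap() is a hypothetical helper, not jbd2 API.
 */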
404
405/*
406 * Cancel an outstanding revoke. For use only internally by the
407 * journaling code (called from jbd2_journal_get_write_access).
408 *
409 * We trust buffer_revoked() on the buffer if the buffer is already
410 * being journaled: if there is no revoke pending on the buffer, then we
411 * don't do anything here.
412 *
413 * This would break if it were possible for a buffer to be revoked and
414 * discarded, and then reallocated within the same transaction. In such
415 * a case we would have lost the revoked bit, but when we arrived here
416 * the second time we would still have a pending revoke to cancel. So,
417 * do not trust the Revoked bit on buffers unless RevokeValid is also
418 * set.
419 *
420 * The caller must have the journal locked.
421 */
422int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
423{
424 struct jbd2_revoke_record_s *record;
425 journal_t *journal = handle->h_transaction->t_journal;
426 int need_cancel;
427 int did_revoke = 0; /* akpm: debug */
428 struct buffer_head *bh = jh2bh(jh);
429
430 jbd_debug(4, "journal_head %p, cancelling revoke\n", jh);
431
432 /* Is the existing Revoke bit valid? If so, we trust it, and
433 * only perform the full cancel if the revoke bit is set. If
434 * not, we can't trust the revoke bit, and we need to do the
435 * full search for a revoke record. */
436 if (test_set_buffer_revokevalid(bh)) {
437 need_cancel = test_clear_buffer_revoked(bh);
438 } else {
439 need_cancel = 1;
440 clear_buffer_revoked(bh);
441 }
442
443 if (need_cancel) {
444 record = find_revoke_record(journal, bh->b_blocknr);
445 if (record) {
446 jbd_debug(4, "cancelled existing revoke on "
447 "blocknr %llu\n", (unsigned long long)bh->b_blocknr);
448 spin_lock(&journal->j_revoke_lock);
449 list_del(&record->hash);
450 spin_unlock(&journal->j_revoke_lock);
451 kmem_cache_free(jbd2_revoke_record_cache, record);
452 did_revoke = 1;
453 }
454 }
455
456#ifdef JBD_EXPENSIVE_CHECKING
457 /* There better not be one left behind by now! */
458 record = find_revoke_record(journal, bh->b_blocknr);
459 J_ASSERT_JH(jh, record == NULL);
460#endif
461
462 /* Finally, have we just cleared revoke on an unhashed
463 * buffer_head? If so, we'd better make sure we clear the
464 * revoked status on any hashed alias too, otherwise the revoke
465 * state machine will get very upset later on. */
466 if (need_cancel) {
467 struct buffer_head *bh2;
468 bh2 = __find_get_block(bh->b_bdev, bh->b_blocknr, bh->b_size);
469 if (bh2) {
470 if (bh2 != bh)
471 clear_buffer_revoked(bh2);
472 __brelse(bh2);
473 }
474 }
475 return did_revoke;
476}
477
478/* jbd2_journal_switch_revoke_table: select j_revoke for the next
479 * transaction; we do not want to suspend any processing until all
480 * revokes are written. -bzzz
481 */
482void jbd2_journal_switch_revoke_table(journal_t *journal)
483{
484 int i;
485
486 if (journal->j_revoke == journal->j_revoke_table[0])
487 journal->j_revoke = journal->j_revoke_table[1];
488 else
489 journal->j_revoke = journal->j_revoke_table[0];
490
491 for (i = 0; i < journal->j_revoke->hash_size; i++)
492 INIT_LIST_HEAD(&journal->j_revoke->hash_table[i]);
493}
494
495/*
496 * Write revoke records to the journal for all entries in the current
497 * revoke hash, deleting the entries as we go.
498 *
499 * Called with the journal lock held.
500 */
501
502void jbd2_journal_write_revoke_records(journal_t *journal,
503 transaction_t *transaction)
504{
505 struct journal_head *descriptor;
506 struct jbd2_revoke_record_s *record;
507 struct jbd2_revoke_table_s *revoke;
508 struct list_head *hash_list;
509 int i, offset, count;
510
511 descriptor = NULL;
512 offset = 0;
513 count = 0;
514
515 /* select revoke table for committing transaction */
516 revoke = journal->j_revoke == journal->j_revoke_table[0] ?
517 journal->j_revoke_table[1] : journal->j_revoke_table[0];
518
519 for (i = 0; i < revoke->hash_size; i++) {
520 hash_list = &revoke->hash_table[i];
521
522 while (!list_empty(hash_list)) {
523 record = (struct jbd2_revoke_record_s *)
524 hash_list->next;
525 write_one_revoke_record(journal, transaction,
526 &descriptor, &offset,
527 record);
528 count++;
529 list_del(&record->hash);
530 kmem_cache_free(jbd2_revoke_record_cache, record);
531 }
532 }
533 if (descriptor)
534 flush_descriptor(journal, descriptor, offset);
535 jbd_debug(1, "Wrote %d revoke records\n", count);
536}
537
538/*
539 * Write out one revoke record. We need to create a new descriptor
540 * block if the old one is full or if we have not already created one.
541 */
542
543static void write_one_revoke_record(journal_t *journal,
544 transaction_t *transaction,
545 struct journal_head **descriptorp,
546 int *offsetp,
547 struct jbd2_revoke_record_s *record)
548{
549 struct journal_head *descriptor;
550 int offset;
551 journal_header_t *header;
552
553 /* If we are already aborting, this all becomes a noop. We
554 still need to go round the loop in
555 jbd2_journal_write_revoke_records in order to free all of the
556 revoke records: only the IO to the journal is omitted. */
557 if (is_journal_aborted(journal))
558 return;
559
560 descriptor = *descriptorp;
561 offset = *offsetp;
562
563 /* Make sure we have a descriptor with space left for the record */
564 if (descriptor) {
565 if (offset == journal->j_blocksize) {
566 flush_descriptor(journal, descriptor, offset);
567 descriptor = NULL;
568 }
569 }
570
571 if (!descriptor) {
572 descriptor = jbd2_journal_get_descriptor_buffer(journal);
573 if (!descriptor)
574 return;
575 header = (journal_header_t *) &jh2bh(descriptor)->b_data[0];
576 header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
577 header->h_blocktype = cpu_to_be32(JBD2_REVOKE_BLOCK);
578 header->h_sequence = cpu_to_be32(transaction->t_tid);
579
580 /* Record it so that we can wait for IO completion later */
581 JBUFFER_TRACE(descriptor, "file as BJ_LogCtl");
582 jbd2_journal_file_buffer(descriptor, transaction, BJ_LogCtl);
583
584 offset = sizeof(jbd2_journal_revoke_header_t);
585 *descriptorp = descriptor;
586 }
587
588 if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT)) {
589 * ((__be64 *)(&jh2bh(descriptor)->b_data[offset])) =
590 cpu_to_be64(record->blocknr);
591 offset += 8;
592
593 } else {
594 * ((__be32 *)(&jh2bh(descriptor)->b_data[offset])) =
595 cpu_to_be32(record->blocknr);
596 offset += 4;
597 }
598
599 *offsetp = offset;
600}
601
602/*
603 * Flush a revoke descriptor out to the journal. If we are aborting,
604 * this is a noop; otherwise we are generating a buffer which needs to
605 * be waited for during commit, so it has to go onto the appropriate
606 * journal buffer list.
607 */
608
609static void flush_descriptor(journal_t *journal,
610 struct journal_head *descriptor,
611 int offset)
612{
613 jbd2_journal_revoke_header_t *header;
614 struct buffer_head *bh = jh2bh(descriptor);
615
616 if (is_journal_aborted(journal)) {
617 put_bh(bh);
618 return;
619 }
620
621 header = (jbd2_journal_revoke_header_t *) jh2bh(descriptor)->b_data;
622 header->r_count = cpu_to_be32(offset);
623 set_buffer_jwrite(bh);
624 BUFFER_TRACE(bh, "write");
625 set_buffer_dirty(bh);
626 ll_rw_block(SWRITE, 1, &bh);
627}
628#endif
629
630/*
631 * Revoke support for recovery.
632 *
633 * Recovery needs to be able to:
634 *
635 * record all revoke records, including the tid of the latest instance
636 * of each revoke in the journal
637 *
638 * check whether a given block in a given transaction should be replayed
639 * (ie. has not been revoked by a revoke record in that or a subsequent
640 * transaction)
641 *
642 * empty the revoke table after recovery.
643 */
644
645/*
646 * First, setting revoke records. We create a new revoke record for
647 * every block ever revoked in the log as we scan it for recovery, and
648 * we update the existing records if we find multiple revokes for a
649 * single block.
650 */
651
652int jbd2_journal_set_revoke(journal_t *journal,
653 unsigned long long blocknr,
654 tid_t sequence)
655{
656 struct jbd2_revoke_record_s *record;
657
658 record = find_revoke_record(journal, blocknr);
659 if (record) {
660 /* If we have multiple occurrences, only record the
661 * latest sequence number in the hashed record */
662 if (tid_gt(sequence, record->sequence))
663 record->sequence = sequence;
664 return 0;
665 }
666 return insert_revoke_hash(journal, blocknr, sequence);
667}
668
669/*
670 * Test revoke records. For a given block referenced in the log, has
671 * that block been revoked? A revoke record with a given transaction
672 * sequence number revokes all blocks in that transaction and earlier
673 * ones, but later transactions still need to be replayed.
674 */
675
676int jbd2_journal_test_revoke(journal_t *journal,
677 unsigned long long blocknr,
678 tid_t sequence)
679{
680 struct jbd2_revoke_record_s *record;
681
682 record = find_revoke_record(journal, blocknr);
683 if (!record)
684 return 0;
685 if (tid_gt(sequence, record->sequence))
686 return 0;
687 return 1;
688}
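
/*
 * Editorial worked example (assumed numbers): if block 123 carries a
 * revoke record with sequence 12, then during replay:
 *
 *	jbd2_journal_test_revoke(journal, 123, 10) == 1   (skip: 10 <= 12)
 *	jbd2_journal_test_revoke(journal, 123, 12) == 1   (skip)
 *	jbd2_journal_test_revoke(journal, 123, 13) == 0   (replay: 13 > 12)
 */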
689
690/*
691 * Finally, once recovery is over, we need to clear the revoke table so
692 * that it can be reused by the running filesystem.
693 */
694
695void jbd2_journal_clear_revoke(journal_t *journal)
696{
697 int i;
698 struct list_head *hash_list;
699 struct jbd2_revoke_record_s *record;
700 struct jbd2_revoke_table_s *revoke;
701
702 revoke = journal->j_revoke;
703
704 for (i = 0; i < revoke->hash_size; i++) {
705 hash_list = &revoke->hash_table[i];
706 while (!list_empty(hash_list)) {
707 record = (struct jbd2_revoke_record_s*) hash_list->next;
708 list_del(&record->hash);
709 kmem_cache_free(jbd2_revoke_record_cache, record);
710 }
711 }
712}
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
new file mode 100644
index 000000000000..b6cf2be845a1
--- /dev/null
+++ b/fs/jbd2/transaction.c
@@ -0,0 +1,2081 @@
1/*
2 * linux/fs/jbd2/transaction.c
3 *
4 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
5 *
6 * Copyright 1998 Red Hat corp --- All Rights Reserved
7 *
8 * This file is part of the Linux kernel and is made available under
9 * the terms of the GNU General Public License, version 2, or at your
10 * option, any later version, incorporated herein by reference.
11 *
12 * Generic filesystem transaction handling code; part of the ext2fs
13 * journaling system.
14 *
15 * This file manages transactions (compound commits managed by the
16 * journaling code) and handles (individual atomic operations by the
17 * filesystem).
18 */
19
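/*
 * Editorial sketch (not in the original file): the handle lifecycle this
 * file implements, using the public API defined below (error handling
 * elided; jbd2_journal_stop() appears later in this file):
 *
 *	handle = jbd2_journal_start(journal, credits);
 *	err = jbd2_journal_get_write_access(handle, bh);
 *	... modify bh->b_data under the handle ...
 *	err = jbd2_journal_dirty_metadata(handle, bh);
 *	err = jbd2_journal_stop(handle);
 */
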
20#include <linux/time.h>
21#include <linux/fs.h>
22#include <linux/jbd2.h>
23#include <linux/errno.h>
24#include <linux/slab.h>
25#include <linux/timer.h>
26#include <linux/smp_lock.h>
27#include <linux/mm.h>
28#include <linux/highmem.h>
29
30/*
31 * jbd2_get_transaction: obtain a new transaction_t object.
32 *
33 * Simply allocate and initialise a new transaction. Create it in
34 * RUNNING state and add it to the current journal (which should not
35 * have an existing running transaction: we only make a new transaction
36 * once we have started to commit the old one).
37 *
38 * Preconditions:
39 * The journal MUST be locked. We don't perform atomic mallocs on the
40 * new transaction and we can't block without protecting against other
41 * processes trying to touch the journal while it is in transition.
42 *
43 * Called under j_state_lock
44 */
45
46static transaction_t *
47jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
48{
49 transaction->t_journal = journal;
50 transaction->t_state = T_RUNNING;
51 transaction->t_tid = journal->j_transaction_sequence++;
52 transaction->t_expires = jiffies + journal->j_commit_interval;
53 spin_lock_init(&transaction->t_handle_lock);
54
55 /* Set up the commit timer for the new transaction. */
56 journal->j_commit_timer.expires = transaction->t_expires;
57 add_timer(&journal->j_commit_timer);
58
59 J_ASSERT(journal->j_running_transaction == NULL);
60 journal->j_running_transaction = transaction;
61
62 return transaction;
63}
64
65/*
66 * Handle management.
67 *
68 * A handle_t is an object which represents a single atomic update to a
69 * filesystem, and which tracks all of the modifications which form part
70 * of that one update.
71 */
72
73/*
74 * start_this_handle: Given a handle, deal with any locking or stalling
75 * needed to make sure that there is enough journal space for the handle
76 * to begin. Attach the handle to a transaction and set up the
77 * transaction's buffer credits.
78 */
79
80static int start_this_handle(journal_t *journal, handle_t *handle)
81{
82 transaction_t *transaction;
83 int needed;
84 int nblocks = handle->h_buffer_credits;
85 transaction_t *new_transaction = NULL;
86 int ret = 0;
87
88 if (nblocks > journal->j_max_transaction_buffers) {
89 printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n",
90 current->comm, nblocks,
91 journal->j_max_transaction_buffers);
92 ret = -ENOSPC;
93 goto out;
94 }
95
96alloc_transaction:
97 if (!journal->j_running_transaction) {
98 new_transaction = jbd_kmalloc(sizeof(*new_transaction),
99 GFP_NOFS);
100 if (!new_transaction) {
101 ret = -ENOMEM;
102 goto out;
103 }
104 memset(new_transaction, 0, sizeof(*new_transaction));
105 }
106
107 jbd_debug(3, "New handle %p going live.\n", handle);
108
109repeat:
110
111 /*
112 * We need to hold j_state_lock until t_updates has been incremented,
113 * for proper journal barrier handling
114 */
115 spin_lock(&journal->j_state_lock);
116repeat_locked:
117 if (is_journal_aborted(journal) ||
118 (journal->j_errno != 0 && !(journal->j_flags & JBD2_ACK_ERR))) {
119 spin_unlock(&journal->j_state_lock);
120 ret = -EROFS;
121 goto out;
122 }
123
124 /* Wait on the journal's transaction barrier if necessary */
125 if (journal->j_barrier_count) {
126 spin_unlock(&journal->j_state_lock);
127 wait_event(journal->j_wait_transaction_locked,
128 journal->j_barrier_count == 0);
129 goto repeat;
130 }
131
132 if (!journal->j_running_transaction) {
133 if (!new_transaction) {
134 spin_unlock(&journal->j_state_lock);
135 goto alloc_transaction;
136 }
137 jbd2_get_transaction(journal, new_transaction);
138 new_transaction = NULL;
139 }
140
141 transaction = journal->j_running_transaction;
142
143 /*
144 * If the current transaction is locked down for commit, wait for the
145 * lock to be released.
146 */
147 if (transaction->t_state == T_LOCKED) {
148 DEFINE_WAIT(wait);
149
150 prepare_to_wait(&journal->j_wait_transaction_locked,
151 &wait, TASK_UNINTERRUPTIBLE);
152 spin_unlock(&journal->j_state_lock);
153 schedule();
154 finish_wait(&journal->j_wait_transaction_locked, &wait);
155 goto repeat;
156 }
157
158 /*
159 * If there is not enough space left in the log to write all potential
160 * buffers requested by this operation, we need to stall pending a log
161 * checkpoint to free some more log space.
162 */
163 spin_lock(&transaction->t_handle_lock);
164 needed = transaction->t_outstanding_credits + nblocks;
165
166 if (needed > journal->j_max_transaction_buffers) {
167 /*
168 * If the current transaction is already too large, then start
169 * to commit it: we can then go back and attach this handle to
170 * a new transaction.
171 */
172 DEFINE_WAIT(wait);
173
174 jbd_debug(2, "Handle %p starting new commit...\n", handle);
175 spin_unlock(&transaction->t_handle_lock);
176 prepare_to_wait(&journal->j_wait_transaction_locked, &wait,
177 TASK_UNINTERRUPTIBLE);
178 __jbd2_log_start_commit(journal, transaction->t_tid);
179 spin_unlock(&journal->j_state_lock);
180 schedule();
181 finish_wait(&journal->j_wait_transaction_locked, &wait);
182 goto repeat;
183 }
184
185 /*
186 * The commit code assumes that it can get enough log space
187 * without forcing a checkpoint. This is *critical* for
188 * correctness: a checkpoint of a buffer which is also
189 * associated with a committing transaction creates a deadlock,
190 * so commit simply cannot force through checkpoints.
191 *
192 * We must therefore ensure the necessary space in the journal
193 * *before* starting to dirty potentially checkpointed buffers
194 * in the new transaction.
195 *
196 * The worst part is, any transaction currently committing can
197 * reduce the free space arbitrarily. Be careful to account for
198 * those buffers when checkpointing.
199 */
200
201 /*
202 * @@@ AKPM: This seems rather over-defensive. We're giving commit
203 * a _lot_ of headroom: 1/4 of the journal plus the size of
204 * the committing transaction. Really, we only need to give it
205 * committing_transaction->t_outstanding_credits plus "enough" for
206 * the log control blocks.
207 * Also, this test is inconsistent with the matching one in
208 * jbd2_journal_extend().
209 */
210 if (__jbd2_log_space_left(journal) < jbd_space_needed(journal)) {
211 jbd_debug(2, "Handle %p waiting for checkpoint...\n", handle);
212 spin_unlock(&transaction->t_handle_lock);
213 __jbd2_log_wait_for_space(journal);
214 goto repeat_locked;
215 }
216
217 /* OK, account for the buffers that this operation expects to
218 * use and add the handle to the running transaction. */
219
220 handle->h_transaction = transaction;
221 transaction->t_outstanding_credits += nblocks;
222 transaction->t_updates++;
223 transaction->t_handle_count++;
224 jbd_debug(4, "Handle %p given %d credits (total %d, free %d)\n",
225 handle, nblocks, transaction->t_outstanding_credits,
226 __jbd2_log_space_left(journal));
227 spin_unlock(&transaction->t_handle_lock);
228 spin_unlock(&journal->j_state_lock);
229out:
230 if (unlikely(new_transaction)) /* It's usually NULL */
231 kfree(new_transaction);
232 return ret;
233}
234
235/* Allocate a new handle. This should probably be in a slab... */
236static handle_t *new_handle(int nblocks)
237{
238 handle_t *handle = jbd_alloc_handle(GFP_NOFS);
239 if (!handle)
240 return NULL;
241 memset(handle, 0, sizeof(*handle));
242 handle->h_buffer_credits = nblocks;
243 handle->h_ref = 1;
244
245 return handle;
246}
247
248/**
249 * handle_t *jbd2_journal_start() - Obtain a new handle.
250 * @journal: Journal to start transaction on.
251 * @nblocks: number of block buffers we might modify
252 *
253 * We make sure that the transaction can guarantee at least nblocks of
254 * modified buffers in the log. We block until the log can guarantee
255 * that much space.
256 *
257 * This function is visible to journal users (like ext3fs), so is not
258 * called with the journal already locked.
259 *
260 * Return a pointer to a newly allocated handle, or NULL on failure
261 */
262handle_t *jbd2_journal_start(journal_t *journal, int nblocks)
263{
264 handle_t *handle = journal_current_handle();
265 int err;
266
267 if (!journal)
268 return ERR_PTR(-EROFS);
269
270 if (handle) {
271 J_ASSERT(handle->h_transaction->t_journal == journal);
272 handle->h_ref++;
273 return handle;
274 }
275
276 handle = new_handle(nblocks);
277 if (!handle)
278 return ERR_PTR(-ENOMEM);
279
280 current->journal_info = handle;
281
282 err = start_this_handle(journal, handle);
283 if (err < 0) {
284 jbd_free_handle(handle);
285 current->journal_info = NULL;
286 handle = ERR_PTR(err);
287 }
288 return handle;
289}
290
291/**
292 * int jbd2_journal_extend() - extend buffer credits.
293 * @handle: handle to 'extend'
294 * @nblocks: nr blocks to try to extend by.
295 *
296 * Some transactions, such as large extends and truncates, can be done
297 * atomically all at once or in several stages. The operation requests
298 * a credit for a number of buffer modifications in advance, but can
299 * extend its credit if it needs more.
300 *
301 * jbd2_journal_extend tries to give the running handle more buffer credits.
302 * It does not guarantee the allocation - this is best-effort only.
303 * The calling process MUST be able to deal cleanly with a failure to
304 * extend here.
305 *
306 * Return 0 on success, non-zero otherwise:
307 *
308 * return code < 0 implies an error
309 * return code > 0 implies normal transaction-full status.
310 */
311int jbd2_journal_extend(handle_t *handle, int nblocks)
312{
313 transaction_t *transaction = handle->h_transaction;
314 journal_t *journal = transaction->t_journal;
315 int result;
316 int wanted;
317
318 result = -EIO;
319 if (is_handle_aborted(handle))
320 goto out;
321
322 result = 1;
323
324 spin_lock(&journal->j_state_lock);
325
326 /* Don't extend a locked-down transaction! */
327 if (handle->h_transaction->t_state != T_RUNNING) {
328 jbd_debug(3, "denied handle %p %d blocks: "
329 "transaction not running\n", handle, nblocks);
330 goto error_out;
331 }
332
333 spin_lock(&transaction->t_handle_lock);
334 wanted = transaction->t_outstanding_credits + nblocks;
335
336 if (wanted > journal->j_max_transaction_buffers) {
337 jbd_debug(3, "denied handle %p %d blocks: "
338 "transaction too large\n", handle, nblocks);
339 goto unlock;
340 }
341
342 if (wanted > __jbd2_log_space_left(journal)) {
343 jbd_debug(3, "denied handle %p %d blocks: "
344 "insufficient log space\n", handle, nblocks);
345 goto unlock;
346 }
347
348 handle->h_buffer_credits += nblocks;
349 transaction->t_outstanding_credits += nblocks;
350 result = 0;
351
352 jbd_debug(3, "extended handle %p by %d\n", handle, nblocks);
353unlock:
354 spin_unlock(&transaction->t_handle_lock);
355error_out:
356 spin_unlock(&journal->j_state_lock);
357out:
358 return result;
359}
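
/*
 * Editorial usage sketch (the customary caller-side pattern, error
 * handling elided): handle the tri-state return by falling back to
 * jbd2_journal_restart(), defined just below, when the running
 * transaction is full:
 *
 *	err = jbd2_journal_extend(handle, nblocks);
 *	if (err > 0)
 *		err = jbd2_journal_restart(handle, nblocks);
 *	if (err)
 *		goto fail;
 */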
360
361
362/**
363 * int jbd2_journal_restart() - restart a handle.
364 * @handle: handle to restart
365 * @nblocks: nr credits requested
366 *
367 * Restart a handle for a multi-transaction filesystem
368 * operation.
369 *
370 * If the jbd2_journal_extend() call above fails to grant new buffer credits
371 * to a running handle, a call to jbd2_journal_restart will commit the
372 * handle's transaction so far and reattach the handle to a new
373 * transaction capable of guaranteeing the requested number of
374 * credits.
375 */
376
377int jbd2_journal_restart(handle_t *handle, int nblocks)
378{
379 transaction_t *transaction = handle->h_transaction;
380 journal_t *journal = transaction->t_journal;
381 int ret;
382
383 /* If we've had an abort of any type, don't even think about
384 * actually doing the restart! */
385 if (is_handle_aborted(handle))
386 return 0;
387
388 /*
389 * First unlink the handle from its current transaction, and start the
390 * commit on that.
391 */
392 J_ASSERT(transaction->t_updates > 0);
393 J_ASSERT(journal_current_handle() == handle);
394
395 spin_lock(&journal->j_state_lock);
396 spin_lock(&transaction->t_handle_lock);
397 transaction->t_outstanding_credits -= handle->h_buffer_credits;
398 transaction->t_updates--;
399
400 if (!transaction->t_updates)
401 wake_up(&journal->j_wait_updates);
402 spin_unlock(&transaction->t_handle_lock);
403
404 jbd_debug(2, "restarting handle %p\n", handle);
405 __jbd2_log_start_commit(journal, transaction->t_tid);
406 spin_unlock(&journal->j_state_lock);
407
408 handle->h_buffer_credits = nblocks;
409 ret = start_this_handle(journal, handle);
410 return ret;
411}
412
413
414/**
415 * void jbd2_journal_lock_updates () - establish a transaction barrier.
416 * @journal: Journal to establish a barrier on.
417 *
418 * This locks out any further updates from being started, and blocks
419 * until all existing updates have completed, returning only once the
420 * journal is in a quiescent state with no updates running.
421 *
422 * The journal lock should not be held on entry.
423 */
424void jbd2_journal_lock_updates(journal_t *journal)
425{
426 DEFINE_WAIT(wait);
427
428 spin_lock(&journal->j_state_lock);
429 ++journal->j_barrier_count;
430
431 /* Wait until there are no running updates */
432 while (1) {
433 transaction_t *transaction = journal->j_running_transaction;
434
435 if (!transaction)
436 break;
437
438 spin_lock(&transaction->t_handle_lock);
439 if (!transaction->t_updates) {
440 spin_unlock(&transaction->t_handle_lock);
441 break;
442 }
443 prepare_to_wait(&journal->j_wait_updates, &wait,
444 TASK_UNINTERRUPTIBLE);
445 spin_unlock(&transaction->t_handle_lock);
446 spin_unlock(&journal->j_state_lock);
447 schedule();
448 finish_wait(&journal->j_wait_updates, &wait);
449 spin_lock(&journal->j_state_lock);
450 }
451 spin_unlock(&journal->j_state_lock);
452
453 /*
454 * We have now established a barrier against other normal updates, but
455 * we also need to barrier against other jbd2_journal_lock_updates() calls
456 * to make sure that we serialise special journal-locked operations
457 * too.
458 */
459 mutex_lock(&journal->j_barrier);
460}
461
462/**
463 * void jbd2_journal_unlock_updates (journal_t* journal) - release barrier
464 * @journal: Journal to release the barrier on.
465 *
466 * Release a transaction barrier obtained with jbd2_journal_lock_updates().
467 *
468 * Should be called without the journal lock held.
469 */
470void jbd2_journal_unlock_updates (journal_t *journal)
471{
472 J_ASSERT(journal->j_barrier_count != 0);
473
474 mutex_unlock(&journal->j_barrier);
475 spin_lock(&journal->j_state_lock);
476 --journal->j_barrier_count;
477 spin_unlock(&journal->j_state_lock);
478 wake_up(&journal->j_wait_transaction_locked);
479}
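
/*
 * Editorial sketch (assumed freeze-style caller): the barrier pair
 * brackets journal-wide operations, typically together with
 * jbd2_journal_flush() from journal.c:
 *
 *	jbd2_journal_lock_updates(journal);
 *	err = jbd2_journal_flush(journal);
 *	... operate with a quiescent journal ...
 *	jbd2_journal_unlock_updates(journal);
 */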
480
481/*
482 * Report any unexpected dirty buffers which turn up. Normally those
483 * indicate an error, but they can occur if the user is running (say)
484 * tune2fs to modify the live filesystem, so we need the option of
485 * continuing as gracefully as possible.
486 *
487 * The caller should already hold the journal lock and
488 * j_list_lock spinlock: most callers will need those anyway
489 * in order to probe the buffer's journaling state safely.
490 */
491static void jbd_unexpected_dirty_buffer(struct journal_head *jh)
492{
493 int jlist;
494
495 /* If this buffer is one which might reasonably be dirty
496 * --- ie. data, or not part of this journal --- then
497 * we're OK to leave it alone, but otherwise we need to
498 * move the dirty bit to the journal's own internal
499 * JBDDirty bit. */
500 jlist = jh->b_jlist;
501
502 if (jlist == BJ_Metadata || jlist == BJ_Reserved ||
503 jlist == BJ_Shadow || jlist == BJ_Forget) {
504 struct buffer_head *bh = jh2bh(jh);
505
506 if (test_clear_buffer_dirty(bh))
507 set_buffer_jbddirty(bh);
508 }
509}
510
511/*
512 * If the buffer is already part of the current transaction, then there
513 * is nothing we need to do. If it is already part of a prior
514 * transaction which we are still committing to disk, then we need to
515 * make sure that we do not overwrite the old copy: we do copy-out to
516 * preserve the copy going to disk. We also account the buffer against
517 * the handle's metadata buffer credits (unless the buffer is already
518 * part of the transaction, that is).
519 *
520 */
521static int
522do_get_write_access(handle_t *handle, struct journal_head *jh,
523 int force_copy)
524{
525 struct buffer_head *bh;
526 transaction_t *transaction;
527 journal_t *journal;
528 int error;
529 char *frozen_buffer = NULL;
530 int need_copy = 0;
531
532 if (is_handle_aborted(handle))
533 return -EROFS;
534
535 transaction = handle->h_transaction;
536 journal = transaction->t_journal;
537
538 jbd_debug(5, "buffer_head %p, force_copy %d\n", jh, force_copy);
539
540 JBUFFER_TRACE(jh, "entry");
541repeat:
542 bh = jh2bh(jh);
543
544 /* @@@ Need to check for errors here at some point. */
545
546 lock_buffer(bh);
547 jbd_lock_bh_state(bh);
548
549 /* We now hold the buffer lock so it is safe to query the buffer
550 * state. Is the buffer dirty?
551 *
552 * If so, there are two possibilities. The buffer may be
553 * non-journaled, and undergoing a quite legitimate writeback.
554 * Otherwise, it is journaled, and we don't expect dirty buffers
555 * in that state (the buffers should be marked JBDDirty
556 * instead.) So either the IO is being done under our own
557 * control and this is a bug, or it's a third party IO such as
558 * dump(8) (which may leave the buffer scheduled for read ---
559 * ie. locked but not dirty) or tune2fs (which may actually have
560 * the buffer dirtied, ugh.) */
561
562 if (buffer_dirty(bh)) {
563 /*
564 * First question: is this buffer already part of the current
565 * transaction or the existing committing transaction?
566 */
567 if (jh->b_transaction) {
568 J_ASSERT_JH(jh,
569 jh->b_transaction == transaction ||
570 jh->b_transaction ==
571 journal->j_committing_transaction);
572 if (jh->b_next_transaction)
573 J_ASSERT_JH(jh, jh->b_next_transaction ==
574 transaction);
575 }
576 /*
577 * In any case we need to clean the dirty flag and we must
578 * do it under the buffer lock to be sure we don't race
579 * with running write-out.
580 */
581 JBUFFER_TRACE(jh, "Unexpected dirty buffer");
582 jbd_unexpected_dirty_buffer(jh);
583 }
584
585 unlock_buffer(bh);
586
587 error = -EROFS;
588 if (is_handle_aborted(handle)) {
589 jbd_unlock_bh_state(bh);
590 goto out;
591 }
592 error = 0;
593
594 /*
595 * The buffer is already part of this transaction if b_transaction or
596 * b_next_transaction points to it
597 */
598 if (jh->b_transaction == transaction ||
599 jh->b_next_transaction == transaction)
600 goto done;
601
602 /*
603 * If there is already a copy-out version of this buffer, then we don't
604 * need to make another one
605 */
606 if (jh->b_frozen_data) {
607 JBUFFER_TRACE(jh, "has frozen data");
608 J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
609 jh->b_next_transaction = transaction;
610 goto done;
611 }
612
613 /* Is there data here we need to preserve? */
614
615 if (jh->b_transaction && jh->b_transaction != transaction) {
616 JBUFFER_TRACE(jh, "owned by older transaction");
617 J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
618 J_ASSERT_JH(jh, jh->b_transaction ==
619 journal->j_committing_transaction);
620
621 /* There is one case we have to be very careful about.
622 * If the committing transaction is currently writing
623 * this buffer out to disk and has NOT made a copy-out,
624 * then we cannot modify the buffer contents at all
625 * right now. The essence of copy-out is that it is the
626 * extra copy, not the primary copy, which gets
627 * journaled. If the primary copy is already going to
628 * disk then we cannot do copy-out here. */
629
630 if (jh->b_jlist == BJ_Shadow) {
631 DEFINE_WAIT_BIT(wait, &bh->b_state, BH_Unshadow);
632 wait_queue_head_t *wqh;
633
634 wqh = bit_waitqueue(&bh->b_state, BH_Unshadow);
635
636 JBUFFER_TRACE(jh, "on shadow: sleep");
637 jbd_unlock_bh_state(bh);
638 /* commit wakes up all shadow buffers after IO */
639 for ( ; ; ) {
640 prepare_to_wait(wqh, &wait.wait,
641 TASK_UNINTERRUPTIBLE);
642 if (jh->b_jlist != BJ_Shadow)
643 break;
644 schedule();
645 }
646 finish_wait(wqh, &wait.wait);
647 goto repeat;
648 }
649
650 /* Only do the copy if the currently-owning transaction
651 * still needs it. If it is on the Forget list, the
652 * committing transaction is past that stage. The
653 * buffer had better remain locked during the kmalloc,
654 * but that should be true --- we hold the journal lock
655 * still and the buffer is already on the BUF_JOURNAL
656 * list so won't be flushed.
657 *
658 * Subtle point, though: if this is a get_undo_access,
659 * then we will be relying on the frozen_data to contain
660 * the new value of the committed_data record after the
661 * transaction, so we HAVE to force the frozen_data copy
662 * in that case. */
663
664 if (jh->b_jlist != BJ_Forget || force_copy) {
665 JBUFFER_TRACE(jh, "generate frozen data");
666 if (!frozen_buffer) {
667 JBUFFER_TRACE(jh, "allocate memory for buffer");
668 jbd_unlock_bh_state(bh);
669 frozen_buffer =
670 jbd2_slab_alloc(jh2bh(jh)->b_size,
671 GFP_NOFS);
672 if (!frozen_buffer) {
673 printk(KERN_EMERG
674 "%s: OOM for frozen_buffer\n",
675 __FUNCTION__);
676 JBUFFER_TRACE(jh, "oom!");
677 error = -ENOMEM;
678 jbd_lock_bh_state(bh);
679 goto done;
680 }
681 goto repeat;
682 }
683 jh->b_frozen_data = frozen_buffer;
684 frozen_buffer = NULL;
685 need_copy = 1;
686 }
687 jh->b_next_transaction = transaction;
688 }
689
690
691 /*
692 * Finally, if the buffer is not journaled right now, we need to make
693 * sure it doesn't get written to disk before the caller actually
694 * commits the new data
695 */
696 if (!jh->b_transaction) {
697 JBUFFER_TRACE(jh, "no transaction");
698 J_ASSERT_JH(jh, !jh->b_next_transaction);
699 jh->b_transaction = transaction;
700 JBUFFER_TRACE(jh, "file as BJ_Reserved");
701 spin_lock(&journal->j_list_lock);
702 __jbd2_journal_file_buffer(jh, transaction, BJ_Reserved);
703 spin_unlock(&journal->j_list_lock);
704 }
705
706done:
707 if (need_copy) {
708 struct page *page;
709 int offset;
710 char *source;
711
712 J_EXPECT_JH(jh, buffer_uptodate(jh2bh(jh)),
713 "Possible IO failure.\n");
714 page = jh2bh(jh)->b_page;
715 offset = ((unsigned long) jh2bh(jh)->b_data) & ~PAGE_MASK;
716 source = kmap_atomic(page, KM_USER0);
717 memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size);
718 kunmap_atomic(source, KM_USER0);
719 }
720 jbd_unlock_bh_state(bh);
721
722 /*
723 * If we are about to journal a buffer, then any revoke pending on it is
724 * no longer valid
725 */
726 jbd2_journal_cancel_revoke(handle, jh);
727
728out:
729 if (unlikely(frozen_buffer)) /* It's usually NULL */
730 jbd2_slab_free(frozen_buffer, bh->b_size);
731
732 JBUFFER_TRACE(jh, "exit");
733 return error;
734}
735
736/**
737 * int jbd2_journal_get_write_access() - notify intent to modify a buffer for metadata (not data) update.
738 * @handle: transaction to add buffer modifications to
739 * @bh: bh to be used for metadata writes
740 *
741 *
742 * Returns an error code or 0 on success.
743 *
744 * In full data journalling mode the buffer may be of type BJ_SyncData,
745 * because we're write()ing a buffer which is also part of a shared mapping.
746 */
747
748int jbd2_journal_get_write_access(handle_t *handle, struct buffer_head *bh)
749{
750 struct journal_head *jh = jbd2_journal_add_journal_head(bh);
751 int rc;
752
753 /* We do not want to get caught playing with fields which the
754 * log thread also manipulates. Make sure that the buffer
755 * completes any outstanding IO before proceeding. */
756 rc = do_get_write_access(handle, jh, 0);
757 jbd2_journal_put_journal_head(jh);
758 return rc;
759}
760
761
762/*
763 * When the user wants to journal a newly created buffer_head
764 * (ie. getblk() returned a new buffer and we are going to populate it
765 * manually rather than reading off disk), then we need to keep the
766 * buffer_head locked until it has been completely filled with new
767 * data. In this case, we should be able to make the assertion that
768 * the bh is not already part of an existing transaction.
769 *
770 * The buffer should already be locked by the caller by this point.
771 * There is no lock ranking violation: it was a newly created,
772 * unlocked buffer beforehand. */
773
774/**
775 * int jbd2_journal_get_create_access () - notify intent to use newly created bh
776 * @handle: transaction to add the new buffer to
777 * @bh: new buffer.
778 *
779 * Call this if you create a new bh.
780 */
781int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh)
782{
783 transaction_t *transaction = handle->h_transaction;
784 journal_t *journal = transaction->t_journal;
785 struct journal_head *jh = jbd2_journal_add_journal_head(bh);
786 int err;
787
788 jbd_debug(5, "journal_head %p\n", jh);
789 err = -EROFS;
790 if (is_handle_aborted(handle))
791 goto out;
792 err = 0;
793
794 JBUFFER_TRACE(jh, "entry");
795 /*
796 * The buffer may already belong to this transaction due to pre-zeroing
797 * in the filesystem's new_block code. It may also be on the previous,
798 * committing transaction's lists, but it HAS to be in Forget state in
799 * that case: the transaction must have deleted the buffer for it to be
800 * reused here.
801 */
802 jbd_lock_bh_state(bh);
803 spin_lock(&journal->j_list_lock);
804 J_ASSERT_JH(jh, (jh->b_transaction == transaction ||
805 jh->b_transaction == NULL ||
806 (jh->b_transaction == journal->j_committing_transaction &&
807 jh->b_jlist == BJ_Forget)));
808
809 J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
810 J_ASSERT_JH(jh, buffer_locked(jh2bh(jh)));
811
812 if (jh->b_transaction == NULL) {
813 jh->b_transaction = transaction;
814 JBUFFER_TRACE(jh, "file as BJ_Reserved");
815 __jbd2_journal_file_buffer(jh, transaction, BJ_Reserved);
816 } else if (jh->b_transaction == journal->j_committing_transaction) {
817 JBUFFER_TRACE(jh, "set next transaction");
818 jh->b_next_transaction = transaction;
819 }
820 spin_unlock(&journal->j_list_lock);
821 jbd_unlock_bh_state(bh);
822
823 /*
824 * akpm: I added this. ext3_alloc_branch can pick up new indirect
825 * blocks which contain freed but then revoked metadata. We need
826 * to cancel the revoke in case we end up freeing it yet again
827 * and then reallocating it as data - this would cause a second revoke,
828 * which hits an assertion error.
829 */
830 JBUFFER_TRACE(jh, "cancelling revoke");
831 jbd2_journal_cancel_revoke(handle, jh);
832 jbd2_journal_put_journal_head(jh);
833out:
834 return err;
835}
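
/*
 * Editorial usage sketch (hypothetical allocation path, error handling
 * elided): a filesystem populating a freshly allocated block by hand
 * keeps the buffer locked across the call, as asserted above:
 *
 *	bh = sb_getblk(sb, blocknr);
 *	lock_buffer(bh);
 *	err = jbd2_journal_get_create_access(handle, bh);
 *	memset(bh->b_data, 0, bh->b_size);
 *	set_buffer_uptodate(bh);
 *	unlock_buffer(bh);
 *	err = jbd2_journal_dirty_metadata(handle, bh);
 */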
836
837/**
838 * int jbd2_journal_get_undo_access() - Notify intent to modify metadata with
839 * non-rewindable consequences
840 * @handle: transaction
841 * @bh: buffer to undo
842 *
843 *
844 * Sometimes there is a need to distinguish between metadata which has
845 * been committed to disk and that which has not. The ext3fs code uses
846 * this for freeing and allocating space, we have to make sure that we
847 * do not reuse freed space until the deallocation has been committed,
848 * since if we overwrote that space we would make the delete
849 * un-rewindable in case of a crash.
850 *
851 * To deal with that, jbd2_journal_get_undo_access requests write access to a
852 * buffer for parts of non-rewindable operations such as delete
853 * operations on the bitmaps. The journaling code must keep a copy of
854 * the buffer's contents prior to the undo_access call until such time
855 * as we know that the buffer has definitely been committed to disk.
856 *
857 * We never need to know which transaction the committed data is part
858 * of, buffers touched here are guaranteed to be dirtied later and so
859 * will be committed to a new transaction in due course, at which point
860 * we can discard the old committed data pointer.
861 *
862 * Returns error number or 0 on success.
863 */
864int jbd2_journal_get_undo_access(handle_t *handle, struct buffer_head *bh)
865{
866 int err;
867 struct journal_head *jh = jbd2_journal_add_journal_head(bh);
868 char *committed_data = NULL;
869
870 JBUFFER_TRACE(jh, "entry");
871
872 /*
873 * Do this first --- it can drop the journal lock, so we want to
874 * make sure that obtaining the committed_data is done
875 * atomically wrt. completion of any outstanding commits.
876 */
877 err = do_get_write_access(handle, jh, 1);
878 if (err)
879 goto out;
880
881repeat:
882 if (!jh->b_committed_data) {
883 committed_data = jbd2_slab_alloc(jh2bh(jh)->b_size, GFP_NOFS);
884 if (!committed_data) {
885 printk(KERN_EMERG "%s: No memory for committed data\n",
886 __FUNCTION__);
887 err = -ENOMEM;
888 goto out;
889 }
890 }
891
892 jbd_lock_bh_state(bh);
893 if (!jh->b_committed_data) {
894 /* Copy out the current buffer contents into the
895 * preserved, committed copy. */
896 JBUFFER_TRACE(jh, "generate b_committed data");
897 if (!committed_data) {
898 jbd_unlock_bh_state(bh);
899 goto repeat;
900 }
901
902 jh->b_committed_data = committed_data;
903 committed_data = NULL;
904 memcpy(jh->b_committed_data, bh->b_data, bh->b_size);
905 }
906 jbd_unlock_bh_state(bh);
907out:
908 jbd2_journal_put_journal_head(jh);
909 if (unlikely(committed_data))
910 jbd2_slab_free(committed_data, bh->b_size);
911 return err;
912}
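
/*
 * Editorial usage sketch (hypothetical bitmap-freeing caller, per the
 * comment above; error handling elided):
 *
 *	err = jbd2_journal_get_undo_access(handle, bitmap_bh);
 *	... clear the freed bits in bitmap_bh->b_data ...
 *	err = jbd2_journal_dirty_metadata(handle, bitmap_bh);
 *
 * The b_committed_data copy made above preserves the pre-deletion
 * bitmap until the transaction is safely on disk.
 */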
913
914/**
915 * int jbd2_journal_dirty_data() - mark a buffer as containing dirty data which
916 * needs to be flushed before we can commit the
917 * current transaction.
918 * @handle: transaction
919 * @bh: bufferhead to mark
920 *
921 * The buffer is placed on the transaction's data list and is marked as
922 * belonging to the transaction.
923 *
924 * Returns error number or 0 on success.
925 *
926 * jbd2_journal_dirty_data() can be called via page_launder->ext3_writepage
927 * by kswapd.
928 */
929int jbd2_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
930{
931 journal_t *journal = handle->h_transaction->t_journal;
932 int need_brelse = 0;
933 struct journal_head *jh;
934
935 if (is_handle_aborted(handle))
936 return 0;
937
938 jh = jbd2_journal_add_journal_head(bh);
939 JBUFFER_TRACE(jh, "entry");
940
941 /*
942 * The buffer could *already* be dirty. Writeout can start
943 * at any time.
944 */
945 jbd_debug(4, "jh: %p, tid:%d\n", jh, handle->h_transaction->t_tid);
946
947 /*
948 * What if the buffer is already part of a running transaction?
949 *
950 * There are two cases:
951 * 1) It is part of the current running transaction. Refile it,
952 * just in case we have allocated it as metadata, deallocated
953 * it, then reallocated it as data.
954 * 2) It is part of the previous, still-committing transaction.
955 * If all we want to do is to guarantee that the buffer will be
956 * written to disk before this new transaction commits, then
957 * being sure that the *previous* transaction has this same
958 * property is sufficient for us! Just leave it on its old
959 * transaction.
960 *
961 * In case (2), the buffer must not already exist as metadata
962 * --- that would violate write ordering (a transaction is free
963 * to write its data at any point, even before the previous
964 * committing transaction has committed). The caller must
965 * never, ever allow this to happen: there's nothing we can do
966 * about it in this layer.
967 */
968 jbd_lock_bh_state(bh);
969 spin_lock(&journal->j_list_lock);
970 if (jh->b_transaction) {
971 JBUFFER_TRACE(jh, "has transaction");
972 if (jh->b_transaction != handle->h_transaction) {
973 JBUFFER_TRACE(jh, "belongs to older transaction");
974 J_ASSERT_JH(jh, jh->b_transaction ==
975 journal->j_committing_transaction);
976
977 /* @@@ IS THIS TRUE ? */
978 /*
979 * Not any more. Scenario: someone does a write()
980 * in data=journal mode. The buffer's transaction has
981 * moved into commit. Then someone does another
982 * write() to the file. We do the frozen data copyout
983 * and set b_next_transaction to point to j_running_t.
984 * And while we're in that state, someone does a
985 * writepage() in an attempt to pageout the same area
986 * of the file via a shared mapping. At present that
987 * calls jbd2_journal_dirty_data(), and we get right here.
988 * It may be too late to journal the data. Simply
989 * falling through to the next test will suffice: the
990	 * data will be dirty and will be checkpointed. The
991 * ordering comments in the next comment block still
992 * apply.
993 */
994 //J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
995
996 /*
997 * If we're journalling data, and this buffer was
998 * subject to a write(), it could be metadata, forget
999 * or shadow against the committing transaction. Now,
1000 * someone has dirtied the same darn page via a mapping
1001 * and it is being writepage()'d.
1002 * We *could* just steal the page from commit, with some
1003 * fancy locking there. Instead, we just skip it -
1004 * don't tie the page's buffers to the new transaction
1005 * at all.
1006 * Implication: if we crash before the writepage() data
1007 * is written into the filesystem, recovery will replay
1008 * the write() data.
1009 */
1010 if (jh->b_jlist != BJ_None &&
1011 jh->b_jlist != BJ_SyncData &&
1012 jh->b_jlist != BJ_Locked) {
1013 JBUFFER_TRACE(jh, "Not stealing");
1014 goto no_journal;
1015 }
1016
1017 /*
1018 * This buffer may be undergoing writeout in commit. We
1019 * can't return from here and let the caller dirty it
1020 * again because that can cause the write-out loop in
1021 * commit to never terminate.
1022 */
1023 if (buffer_dirty(bh)) {
1024 get_bh(bh);
1025 spin_unlock(&journal->j_list_lock);
1026 jbd_unlock_bh_state(bh);
1027 need_brelse = 1;
1028 sync_dirty_buffer(bh);
1029 jbd_lock_bh_state(bh);
1030 spin_lock(&journal->j_list_lock);
1031 /* The buffer may become locked again at any
1032 time if it is redirtied */
1033 }
1034
1035 /* journal_clean_data_list() may have got there first */
1036 if (jh->b_transaction != NULL) {
1037 JBUFFER_TRACE(jh, "unfile from commit");
1038 __jbd2_journal_temp_unlink_buffer(jh);
1039 /* It still points to the committing
1040 * transaction; move it to this one so
1041 * that the refile assert checks are
1042 * happy. */
1043 jh->b_transaction = handle->h_transaction;
1044 }
1045 /* The buffer will be refiled below */
1046
1047 }
1048 /*
1049 * Special case --- the buffer might actually have been
1050 * allocated and then immediately deallocated in the previous,
1051 * committing transaction, so might still be left on that
1052 * transaction's metadata lists.
1053 */
1054 if (jh->b_jlist != BJ_SyncData && jh->b_jlist != BJ_Locked) {
1055 JBUFFER_TRACE(jh, "not on correct data list: unfile");
1056 J_ASSERT_JH(jh, jh->b_jlist != BJ_Shadow);
1057 __jbd2_journal_temp_unlink_buffer(jh);
1058 jh->b_transaction = handle->h_transaction;
1059 JBUFFER_TRACE(jh, "file as data");
1060 __jbd2_journal_file_buffer(jh, handle->h_transaction,
1061 BJ_SyncData);
1062 }
1063 } else {
1064 JBUFFER_TRACE(jh, "not on a transaction");
1065 __jbd2_journal_file_buffer(jh, handle->h_transaction, BJ_SyncData);
1066 }
1067no_journal:
1068 spin_unlock(&journal->j_list_lock);
1069 jbd_unlock_bh_state(bh);
1070 if (need_brelse) {
1071 BUFFER_TRACE(bh, "brelse");
1072 __brelse(bh);
1073 }
1074 JBUFFER_TRACE(jh, "exit");
1075 jbd2_journal_put_journal_head(jh);
1076 return 0;
1077}
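/*
 * Illustrative sketch, not part of the original source: a typical
 * ordered-mode caller walks a page's buffers and marks each one
 * dirty against the running handle.  The function name here is
 * hypothetical; ext3's writepage path performs the equivalent job.
 */
static int example_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
{
	/* Buffers never mapped, or already freed, need no journaling */
	if (!buffer_mapped(bh) || buffer_freed(bh))
		return 0;
	return jbd2_journal_dirty_data(handle, bh);
}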
1078
1079/**
1080 * int jbd2_journal_dirty_metadata() - mark a buffer as containing dirty metadata
1081 * @handle: transaction to add buffer to.
1082 * @bh: buffer to mark
1083 *
1084 * mark dirty metadata which needs to be journaled as part of the current
1085 * transaction.
1086 *
1087 * The buffer is placed on the transaction's metadata list and is marked
1088 * as belonging to the transaction.
1089 *
1090 * Returns error number or 0 on success.
1091 *
1092 * Special care needs to be taken if the buffer already belongs to the
1093 * current committing transaction (in which case we should have frozen
1094 * data present for that commit). In that case, we don't relink the
1095 * buffer: that only gets done when the old transaction finally
1096 * completes its commit.
1097 */
1098int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
1099{
1100 transaction_t *transaction = handle->h_transaction;
1101 journal_t *journal = transaction->t_journal;
1102 struct journal_head *jh = bh2jh(bh);
1103
1104 jbd_debug(5, "journal_head %p\n", jh);
1105 JBUFFER_TRACE(jh, "entry");
1106 if (is_handle_aborted(handle))
1107 goto out;
1108
1109 jbd_lock_bh_state(bh);
1110
1111 if (jh->b_modified == 0) {
1112 /*
1113	 * This buffer has been modified and is becoming part
1114	 * of the transaction. The credit accounting below only
1115	 * needs to be done once per transaction. -bzzz
1116 */
1117 jh->b_modified = 1;
1118 J_ASSERT_JH(jh, handle->h_buffer_credits > 0);
1119 handle->h_buffer_credits--;
1120 }
1121
1122 /*
1123 * fastpath, to avoid expensive locking. If this buffer is already
1124 * on the running transaction's metadata list there is nothing to do.
1125 * Nobody can take it off again because there is a handle open.
1126 * I _think_ we're OK here with SMP barriers - a mistaken decision will
1127 * result in this test being false, so we go in and take the locks.
1128 */
1129 if (jh->b_transaction == transaction && jh->b_jlist == BJ_Metadata) {
1130 JBUFFER_TRACE(jh, "fastpath");
1131 J_ASSERT_JH(jh, jh->b_transaction ==
1132 journal->j_running_transaction);
1133 goto out_unlock_bh;
1134 }
1135
1136 set_buffer_jbddirty(bh);
1137
1138 /*
1139 * Metadata already on the current transaction list doesn't
1140 * need to be filed. Metadata on another transaction's list must
1141 * be committing, and will be refiled once the commit completes:
1142 * leave it alone for now.
1143 */
1144 if (jh->b_transaction != transaction) {
1145 JBUFFER_TRACE(jh, "already on other transaction");
1146 J_ASSERT_JH(jh, jh->b_transaction ==
1147 journal->j_committing_transaction);
1148 J_ASSERT_JH(jh, jh->b_next_transaction == transaction);
1149 /* And this case is illegal: we can't reuse another
1150 * transaction's data buffer, ever. */
1151 goto out_unlock_bh;
1152 }
1153
1154 /* That test should have eliminated the following case: */
1155 J_ASSERT_JH(jh, jh->b_frozen_data == 0);
1156
1157 JBUFFER_TRACE(jh, "file as BJ_Metadata");
1158 spin_lock(&journal->j_list_lock);
1159 __jbd2_journal_file_buffer(jh, handle->h_transaction, BJ_Metadata);
1160 spin_unlock(&journal->j_list_lock);
1161out_unlock_bh:
1162 jbd_unlock_bh_state(bh);
1163out:
1164 JBUFFER_TRACE(jh, "exit");
1165 return 0;
1166}
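/*
 * Illustrative sketch, not part of the original source: the minimal
 * metadata-update sequence around jbd2_journal_dirty_metadata().
 * Assumes the caller already holds a reference on bh; the function
 * name is hypothetical.
 */
static int example_update_metadata(journal_t *journal, struct buffer_head *bh)
{
	handle_t *handle;
	int err;

	handle = jbd2_journal_start(journal, 1);	/* reserve one buffer credit */
	if (IS_ERR(handle))
		return PTR_ERR(handle);
	err = jbd2_journal_get_write_access(handle, bh);
	if (!err) {
		/* ... modify bh->b_data while holding write access ... */
		err = jbd2_journal_dirty_metadata(handle, bh);
	}
	jbd2_journal_stop(handle);
	return err;
}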
1167
1168/*
1169 * jbd2_journal_release_buffer: undo a get_write_access without any buffer
1170 * updates, if the update decided in the end that it didn't need access.
1171 *
1172 */
1173void
1174jbd2_journal_release_buffer(handle_t *handle, struct buffer_head *bh)
1175{
1176 BUFFER_TRACE(bh, "entry");
1177}
1178
1179/**
1180 * void jbd2_journal_forget() - bforget() for potentially-journaled buffers.
1181 * @handle: transaction handle
1182 * @bh: bh to 'forget'
1183 *
1184 * We can only do the bforget if there are no commits pending against the
1185 * buffer. If the buffer is dirty in the current running transaction we
1186 * can safely unlink it.
1187 *
1188 * bh may not be a journalled buffer at all - it may be a non-JBD
1189 * buffer which came off the hashtable. Check for this.
1190 *
1191 * Decrements bh->b_count by one.
1192 *
1193 * Allow this call even if the handle has aborted --- it may be part of
1194 * the caller's cleanup after an abort.
1195 */
1196int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
1197{
1198 transaction_t *transaction = handle->h_transaction;
1199 journal_t *journal = transaction->t_journal;
1200 struct journal_head *jh;
1201 int drop_reserve = 0;
1202 int err = 0;
1203
1204 BUFFER_TRACE(bh, "entry");
1205
1206 jbd_lock_bh_state(bh);
1207 spin_lock(&journal->j_list_lock);
1208
1209 if (!buffer_jbd(bh))
1210 goto not_jbd;
1211 jh = bh2jh(bh);
1212
1213 /* Critical error: attempting to delete a bitmap buffer, maybe?
1214 * Don't do any jbd operations, and return an error. */
1215 if (!J_EXPECT_JH(jh, !jh->b_committed_data,
1216 "inconsistent data on disk")) {
1217 err = -EIO;
1218 goto not_jbd;
1219 }
1220
1221 /*
1222 * The buffer's going from the transaction, we must drop
1223 * all references -bzzz
1224 */
1225 jh->b_modified = 0;
1226
1227 if (jh->b_transaction == handle->h_transaction) {
1228 J_ASSERT_JH(jh, !jh->b_frozen_data);
1229
1230 /* If we are forgetting a buffer which is already part
1231 * of this transaction, then we can just drop it from
1232 * the transaction immediately. */
1233 clear_buffer_dirty(bh);
1234 clear_buffer_jbddirty(bh);
1235
1236 JBUFFER_TRACE(jh, "belongs to current transaction: unfile");
1237
1238 drop_reserve = 1;
1239
1240 /*
1241 * We are no longer going to journal this buffer.
1242 * However, the commit of this transaction is still
1243 * important to the buffer: the delete that we are now
1244 * processing might obsolete an old log entry, so by
1245 * committing, we can satisfy the buffer's checkpoint.
1246 *
1247 * So, if we have a checkpoint on the buffer, we should
1248 * now refile the buffer on our BJ_Forget list so that
1249 * we know to remove the checkpoint after we commit.
1250 */
1251
1252 if (jh->b_cp_transaction) {
1253 __jbd2_journal_temp_unlink_buffer(jh);
1254 __jbd2_journal_file_buffer(jh, transaction, BJ_Forget);
1255 } else {
1256 __jbd2_journal_unfile_buffer(jh);
1257 jbd2_journal_remove_journal_head(bh);
1258 __brelse(bh);
1259 if (!buffer_jbd(bh)) {
1260 spin_unlock(&journal->j_list_lock);
1261 jbd_unlock_bh_state(bh);
1262 __bforget(bh);
1263 goto drop;
1264 }
1265 }
1266 } else if (jh->b_transaction) {
1267 J_ASSERT_JH(jh, (jh->b_transaction ==
1268 journal->j_committing_transaction));
1269 /* However, if the buffer is still owned by a prior
1270 * (committing) transaction, we can't drop it yet... */
1271 JBUFFER_TRACE(jh, "belongs to older transaction");
1272 /* ... but we CAN drop it from the new transaction if we
1273 * have also modified it since the original commit. */
1274
1275 if (jh->b_next_transaction) {
1276 J_ASSERT(jh->b_next_transaction == transaction);
1277 jh->b_next_transaction = NULL;
1278 drop_reserve = 1;
1279 }
1280 }
1281
1282not_jbd:
1283 spin_unlock(&journal->j_list_lock);
1284 jbd_unlock_bh_state(bh);
1285 __brelse(bh);
1286drop:
1287 if (drop_reserve) {
1288 /* no need to reserve log space for this block -bzzz */
1289 handle->h_buffer_credits++;
1290 }
1291 return err;
1292}
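/*
 * Illustrative note, not part of the original source: a filesystem
 * calls jbd2_journal_forget() in place of bforget() when it
 * deallocates a journaled block, cancelling any pending journaling
 * of the stale contents.  As documented above, the call consumes
 * the caller's reference on bh.
 */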
1293
1294/**
1295 * int jbd2_journal_stop() - complete a transaction
1296	 * @handle: transaction to complete.
1297 *
1298 * All done for a particular handle.
1299 *
1300 * There is not much action needed here. We just return any remaining
1301 * buffer credits to the transaction and remove the handle. The only
1302 * complication is that we need to start a commit operation if the
1303 * filesystem is marked for synchronous update.
1304 *
1305 * jbd2_journal_stop itself will not usually return an error, but it may
1306 * do so in unusual circumstances. In particular, expect it to
1307 * return -EIO if a jbd2_journal_abort has been executed since the
1308 * transaction began.
1309 */
1310int jbd2_journal_stop(handle_t *handle)
1311{
1312 transaction_t *transaction = handle->h_transaction;
1313 journal_t *journal = transaction->t_journal;
1314 int old_handle_count, err;
1315 pid_t pid;
1316
1317 J_ASSERT(journal_current_handle() == handle);
1318
1319 if (is_handle_aborted(handle))
1320 err = -EIO;
1321 else {
1322 J_ASSERT(transaction->t_updates > 0);
1323 err = 0;
1324 }
1325
1326 if (--handle->h_ref > 0) {
1327 jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1,
1328 handle->h_ref);
1329 return err;
1330 }
1331
1332 jbd_debug(4, "Handle %p going down\n", handle);
1333
1334 /*
1335 * Implement synchronous transaction batching. If the handle
1336 * was synchronous, don't force a commit immediately. Let's
1337 * yield and let another thread piggyback onto this transaction.
1338 * Keep doing that while new threads continue to arrive.
1339 * It doesn't cost much - we're about to run a commit and sleep
1340 * on IO anyway. Speeds up many-threaded, many-dir operations
1341 * by 30x or more...
1342 *
1343 * But don't do this if this process was the most recent one to
1344 * perform a synchronous write. We do this to detect the case where a
1345 * single process is doing a stream of sync writes. No point in waiting
1346 * for joiners in that case.
1347 */
1348 pid = current->pid;
1349 if (handle->h_sync && journal->j_last_sync_writer != pid) {
1350 journal->j_last_sync_writer = pid;
1351 do {
1352 old_handle_count = transaction->t_handle_count;
1353 schedule_timeout_uninterruptible(1);
1354 } while (old_handle_count != transaction->t_handle_count);
1355 }
1356
1357 current->journal_info = NULL;
1358 spin_lock(&journal->j_state_lock);
1359 spin_lock(&transaction->t_handle_lock);
1360 transaction->t_outstanding_credits -= handle->h_buffer_credits;
1361 transaction->t_updates--;
1362 if (!transaction->t_updates) {
1363 wake_up(&journal->j_wait_updates);
1364 if (journal->j_barrier_count)
1365 wake_up(&journal->j_wait_transaction_locked);
1366 }
1367
1368 /*
1369 * If the handle is marked SYNC, we need to set another commit
1370 * going! We also want to force a commit if the current
1371 * transaction is occupying too much of the log, or if the
1372 * transaction is too old now.
1373 */
1374 if (handle->h_sync ||
1375 transaction->t_outstanding_credits >
1376 journal->j_max_transaction_buffers ||
1377 time_after_eq(jiffies, transaction->t_expires)) {
1378 /* Do this even for aborted journals: an abort still
1379 * completes the commit thread, it just doesn't write
1380 * anything to disk. */
1381 tid_t tid = transaction->t_tid;
1382
1383 spin_unlock(&transaction->t_handle_lock);
1384 jbd_debug(2, "transaction too old, requesting commit for "
1385 "handle %p\n", handle);
1386 /* This is non-blocking */
1387 __jbd2_log_start_commit(journal, transaction->t_tid);
1388 spin_unlock(&journal->j_state_lock);
1389
1390 /*
1391 * Special case: JBD2_SYNC synchronous updates require us
1392 * to wait for the commit to complete.
1393 */
1394 if (handle->h_sync && !(current->flags & PF_MEMALLOC))
1395 err = jbd2_log_wait_commit(journal, tid);
1396 } else {
1397 spin_unlock(&transaction->t_handle_lock);
1398 spin_unlock(&journal->j_state_lock);
1399 }
1400
1401 jbd_free_handle(handle);
1402 return err;
1403}
1404
1405/**
 * int jbd2_journal_force_commit() - force any uncommitted transactions
1406 * @journal: journal to force
1407 *
1408 * For synchronous operations: force any uncommitted transactions
1409 * to disk. May seem kludgy, but it reuses all the handle batching
1410 * code in a very simple manner.
1411 */
1412int jbd2_journal_force_commit(journal_t *journal)
1413{
1414 handle_t *handle;
1415 int ret;
1416
1417 handle = jbd2_journal_start(journal, 1);
1418 if (IS_ERR(handle)) {
1419 ret = PTR_ERR(handle);
1420 } else {
1421 handle->h_sync = 1;
1422 ret = jbd2_journal_stop(handle);
1423 }
1424 return ret;
1425}
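/*
 * Illustrative note, not part of the original source: a sync(2)-style
 * path can flush all pending updates with a single call, assuming the
 * journal pointer is fetched from the filesystem's private superblock
 * info:
 *
 *	err = jbd2_journal_force_commit(sbi->s_journal);
 */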
1426
1427/*
1428 *
1429 * List management code snippets: various functions for manipulating the
1430 * transaction buffer lists.
1431 *
1432 */
1433
1434/*
1435 * Append a buffer to a transaction list, given the transaction's list head
1436 * pointer.
1437 *
1438 * j_list_lock is held.
1439 *
1440 * jbd_lock_bh_state(jh2bh(jh)) is held.
1441 */
1442
1443static inline void
1444__blist_add_buffer(struct journal_head **list, struct journal_head *jh)
1445{
1446 if (!*list) {
1447 jh->b_tnext = jh->b_tprev = jh;
1448 *list = jh;
1449 } else {
1450 /* Insert at the tail of the list to preserve order */
1451 struct journal_head *first = *list, *last = first->b_tprev;
1452 jh->b_tprev = last;
1453 jh->b_tnext = first;
1454 last->b_tnext = first->b_tprev = jh;
1455 }
1456}
1457
1458/*
1459 * Remove a buffer from a transaction list, given the transaction's list
1460 * head pointer.
1461 *
1462 * Called with j_list_lock held, and the journal may not be locked.
1463 *
1464 * jbd_lock_bh_state(jh2bh(jh)) is held.
1465 */
1466
1467static inline void
1468__blist_del_buffer(struct journal_head **list, struct journal_head *jh)
1469{
1470 if (*list == jh) {
1471 *list = jh->b_tnext;
1472 if (*list == jh)
1473 *list = NULL;
1474 }
1475 jh->b_tprev->b_tnext = jh->b_tnext;
1476 jh->b_tnext->b_tprev = jh->b_tprev;
1477}
1478
1479/*
1480 * Remove a buffer from the appropriate transaction list.
1481 *
1482 * Note that this function can *change* the value of
1483 * bh->b_transaction->t_sync_datalist, t_buffers, t_forget,
1484 * t_iobuf_list, t_shadow_list, t_log_list or t_reserved_list. If the caller
1485	 * is holding onto a copy of one of these pointers, it could go bad.
1486 * Generally the caller needs to re-read the pointer from the transaction_t.
1487 *
1488 * Called under j_list_lock. The journal may not be locked.
1489 */
1490void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh)
1491{
1492 struct journal_head **list = NULL;
1493 transaction_t *transaction;
1494 struct buffer_head *bh = jh2bh(jh);
1495
1496 J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
1497 transaction = jh->b_transaction;
1498 if (transaction)
1499 assert_spin_locked(&transaction->t_journal->j_list_lock);
1500
1501 J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
1502 if (jh->b_jlist != BJ_None)
1503 J_ASSERT_JH(jh, transaction != 0);
1504
1505 switch (jh->b_jlist) {
1506 case BJ_None:
1507 return;
1508 case BJ_SyncData:
1509 list = &transaction->t_sync_datalist;
1510 break;
1511 case BJ_Metadata:
1512 transaction->t_nr_buffers--;
1513 J_ASSERT_JH(jh, transaction->t_nr_buffers >= 0);
1514 list = &transaction->t_buffers;
1515 break;
1516 case BJ_Forget:
1517 list = &transaction->t_forget;
1518 break;
1519 case BJ_IO:
1520 list = &transaction->t_iobuf_list;
1521 break;
1522 case BJ_Shadow:
1523 list = &transaction->t_shadow_list;
1524 break;
1525 case BJ_LogCtl:
1526 list = &transaction->t_log_list;
1527 break;
1528 case BJ_Reserved:
1529 list = &transaction->t_reserved_list;
1530 break;
1531 case BJ_Locked:
1532 list = &transaction->t_locked_list;
1533 break;
1534 }
1535
1536 __blist_del_buffer(list, jh);
1537 jh->b_jlist = BJ_None;
1538 if (test_clear_buffer_jbddirty(bh))
1539 mark_buffer_dirty(bh); /* Expose it to the VM */
1540}
1541
1542void __jbd2_journal_unfile_buffer(struct journal_head *jh)
1543{
1544 __jbd2_journal_temp_unlink_buffer(jh);
1545 jh->b_transaction = NULL;
1546}
1547
1548void jbd2_journal_unfile_buffer(journal_t *journal, struct journal_head *jh)
1549{
1550 jbd_lock_bh_state(jh2bh(jh));
1551 spin_lock(&journal->j_list_lock);
1552 __jbd2_journal_unfile_buffer(jh);
1553 spin_unlock(&journal->j_list_lock);
1554 jbd_unlock_bh_state(jh2bh(jh));
1555}
1556
1557/*
1558 * Called from jbd2_journal_try_to_free_buffers().
1559 *
1560 * Called under jbd_lock_bh_state(bh)
1561 */
1562static void
1563__journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
1564{
1565 struct journal_head *jh;
1566
1567 jh = bh2jh(bh);
1568
1569 if (buffer_locked(bh) || buffer_dirty(bh))
1570 goto out;
1571
1572 if (jh->b_next_transaction != 0)
1573 goto out;
1574
1575 spin_lock(&journal->j_list_lock);
1576 if (jh->b_transaction != 0 && jh->b_cp_transaction == 0) {
1577 if (jh->b_jlist == BJ_SyncData || jh->b_jlist == BJ_Locked) {
1578 /* A written-back ordered data buffer */
1579 JBUFFER_TRACE(jh, "release data");
1580 __jbd2_journal_unfile_buffer(jh);
1581 jbd2_journal_remove_journal_head(bh);
1582 __brelse(bh);
1583 }
1584 } else if (jh->b_cp_transaction != 0 && jh->b_transaction == 0) {
1585 /* written-back checkpointed metadata buffer */
1586 if (jh->b_jlist == BJ_None) {
1587 JBUFFER_TRACE(jh, "remove from checkpoint list");
1588 __jbd2_journal_remove_checkpoint(jh);
1589 jbd2_journal_remove_journal_head(bh);
1590 __brelse(bh);
1591 }
1592 }
1593 spin_unlock(&journal->j_list_lock);
1594out:
1595 return;
1596}
1597
1598
1599/**
1600 * int jbd2_journal_try_to_free_buffers() - try to free page buffers.
1601 * @journal: journal for operation
1602	 * @page: the page to try to free
1603 * @unused_gfp_mask: unused
1604 *
1605 *
1606 * For all the buffers on this page,
1607 * if they are fully written out ordered data, move them onto BUF_CLEAN
1608 * so try_to_free_buffers() can reap them.
1609 *
1610 * This function returns non-zero if we wish try_to_free_buffers()
1611 * to be called. We do this if the page is releasable by try_to_free_buffers().
1612 * We also do it if the page has locked or dirty buffers and the caller wants
1613 * us to perform sync or async writeout.
1614 *
1615 * This complicates JBD locking somewhat. We aren't protected by the
1616 * BKL here. We wish to remove the buffer from its committing or
1617 * running transaction's ->t_datalist via __jbd2_journal_unfile_buffer.
1618 *
1619 * This may *change* the value of transaction_t->t_datalist, so anyone
1620 * who looks at t_datalist needs to lock against this function.
1621 *
1622 * Even worse, someone may be doing a jbd2_journal_dirty_data on this
1623 * buffer. So we need to lock against that. jbd2_journal_dirty_data()
1624 * will come out of the lock with the buffer dirty, which makes it
1625 * ineligible for release here.
1626 *
1627 * Who else is affected by this? hmm... Really the only contender
1628 * is do_get_write_access() - it could be looking at the buffer while
1629 * journal_try_to_free_buffer() is changing its state. But that
1630 * cannot happen because we never reallocate freed data as metadata
1631 * while the data is part of a transaction. Yes?
1632 */
1633int jbd2_journal_try_to_free_buffers(journal_t *journal,
1634 struct page *page, gfp_t unused_gfp_mask)
1635{
1636 struct buffer_head *head;
1637 struct buffer_head *bh;
1638 int ret = 0;
1639
1640 J_ASSERT(PageLocked(page));
1641
1642 head = page_buffers(page);
1643 bh = head;
1644 do {
1645 struct journal_head *jh;
1646
1647 /*
1648 * We take our own ref against the journal_head here to avoid
1649 * having to add tons of locking around each instance of
1650 * jbd2_journal_remove_journal_head() and jbd2_journal_put_journal_head().
1651 */
1652 jh = jbd2_journal_grab_journal_head(bh);
1653 if (!jh)
1654 continue;
1655
1656 jbd_lock_bh_state(bh);
1657 __journal_try_to_free_buffer(journal, bh);
1658 jbd2_journal_put_journal_head(jh);
1659 jbd_unlock_bh_state(bh);
1660 if (buffer_jbd(bh))
1661 goto busy;
1662 } while ((bh = bh->b_this_page) != head);
1663 ret = try_to_free_buffers(page);
1664busy:
1665 return ret;
1666}
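/*
 * Illustrative note, not part of the original source: this function is
 * the natural backend for a filesystem's ->releasepage address_space
 * operation, along the lines of (helper name hypothetical):
 *
 *	static int example_releasepage(struct page *page, gfp_t wait)
 *	{
 *		return jbd2_journal_try_to_free_buffers(
 *				example_page_journal(page), page, wait);
 *	}
 */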
1667
1668/*
1669 * This buffer is no longer needed. If it is on an older transaction's
1670 * checkpoint list we need to record it on this transaction's forget list
1671 * to pin this buffer (and hence its checkpointing transaction) down until
1672 * this transaction commits. If the buffer isn't on a checkpoint list, we
1673 * release it.
1674 * Returns non-zero if JBD no longer has an interest in the buffer.
1675 *
1676 * Called under j_list_lock.
1677 *
1678 * Called under jbd_lock_bh_state(bh).
1679 */
1680static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction)
1681{
1682 int may_free = 1;
1683 struct buffer_head *bh = jh2bh(jh);
1684
1685 __jbd2_journal_unfile_buffer(jh);
1686
1687 if (jh->b_cp_transaction) {
1688 JBUFFER_TRACE(jh, "on running+cp transaction");
1689 __jbd2_journal_file_buffer(jh, transaction, BJ_Forget);
1690 clear_buffer_jbddirty(bh);
1691 may_free = 0;
1692 } else {
1693 JBUFFER_TRACE(jh, "on running transaction");
1694 jbd2_journal_remove_journal_head(bh);
1695 __brelse(bh);
1696 }
1697 return may_free;
1698}
1699
1700/*
1701 * jbd2_journal_invalidatepage
1702 *
1703 * This code is tricky. It has a number of cases to deal with.
1704 *
1705 * There are two invariants which this code relies on:
1706 *
1707 * i_size must be updated on disk before we start calling invalidatepage on the
1708 * data.
1709 *
1710 * This is done in ext3 by defining an ext3_setattr method which
1711 * updates i_size before truncate gets going. By maintaining this
1712 * invariant, we can be sure that it is safe to throw away any buffers
1713 * attached to the current transaction: once the transaction commits,
1714 * we know that the data will not be needed.
1715 *
1716 * Note however that we can *not* throw away data belonging to the
1717 * previous, committing transaction!
1718 *
1719 * Any disk blocks which *are* part of the previous, committing
1720 * transaction (and which therefore cannot be discarded immediately) are
1721 * not going to be reused in the new running transaction
1722 *
1723 * The bitmap committed_data images guarantee this: any block which is
1724 * allocated in one transaction and removed in the next will be marked
1725 * as in-use in the committed_data bitmap, so cannot be reused until
1726 * the next transaction to delete the block commits. This means that
1727 * leaving committing buffers dirty is quite safe: the disk blocks
1728 * cannot be reallocated to a different file and so buffer aliasing is
1729 * not possible.
1730 *
1731 *
1732 * The above applies mainly to ordered data mode. In writeback mode we
1733 * don't make guarantees about the order in which data hits disk --- in
1734 * particular we don't guarantee that new dirty data is flushed before
1735 * transaction commit --- so it is always safe just to discard data
1736 * immediately in that mode. --sct
1737 */
1738
1739/*
1740 * The journal_unmap_buffer helper function returns zero if the buffer
1741 * concerned remains pinned as an anonymous buffer belonging to an older
1742 * transaction.
1743 *
1744 * We're outside-transaction here. Either or both of j_running_transaction
1745 * and j_committing_transaction may be NULL.
1746 */
1747static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
1748{
1749 transaction_t *transaction;
1750 struct journal_head *jh;
1751 int may_free = 1;
1752 int ret;
1753
1754 BUFFER_TRACE(bh, "entry");
1755
1756 /*
1757 * It is safe to proceed here without the j_list_lock because the
1758 * buffers cannot be stolen by try_to_free_buffers as long as we are
1759 * holding the page lock. --sct
1760 */
1761
1762 if (!buffer_jbd(bh))
1763 goto zap_buffer_unlocked;
1764
1765 spin_lock(&journal->j_state_lock);
1766 jbd_lock_bh_state(bh);
1767 spin_lock(&journal->j_list_lock);
1768
1769 jh = jbd2_journal_grab_journal_head(bh);
1770 if (!jh)
1771 goto zap_buffer_no_jh;
1772
1773 transaction = jh->b_transaction;
1774 if (transaction == NULL) {
1775 /* First case: not on any transaction. If it
1776 * has no checkpoint link, then we can zap it:
1777 * it's a writeback-mode buffer so we don't care
1778 * if it hits disk safely. */
1779 if (!jh->b_cp_transaction) {
1780 JBUFFER_TRACE(jh, "not on any transaction: zap");
1781 goto zap_buffer;
1782 }
1783
1784 if (!buffer_dirty(bh)) {
1785 /* bdflush has written it. We can drop it now */
1786 goto zap_buffer;
1787 }
1788
1789 /* OK, it must be in the journal but still not
1790 * written fully to disk: it's metadata or
1791 * journaled data... */
1792
1793 if (journal->j_running_transaction) {
1794 /* ... and once the current transaction has
1795 * committed, the buffer won't be needed any
1796 * longer. */
1797 JBUFFER_TRACE(jh, "checkpointed: add to BJ_Forget");
1798 ret = __dispose_buffer(jh,
1799 journal->j_running_transaction);
1800 jbd2_journal_put_journal_head(jh);
1801 spin_unlock(&journal->j_list_lock);
1802 jbd_unlock_bh_state(bh);
1803 spin_unlock(&journal->j_state_lock);
1804 return ret;
1805 } else {
1806 /* There is no currently-running transaction. So the
1807 * orphan record which we wrote for this file must have
1808 * passed into commit. We must attach this buffer to
1809 * the committing transaction, if it exists. */
1810 if (journal->j_committing_transaction) {
1811 JBUFFER_TRACE(jh, "give to committing trans");
1812 ret = __dispose_buffer(jh,
1813 journal->j_committing_transaction);
1814 jbd2_journal_put_journal_head(jh);
1815 spin_unlock(&journal->j_list_lock);
1816 jbd_unlock_bh_state(bh);
1817 spin_unlock(&journal->j_state_lock);
1818 return ret;
1819 } else {
1820 /* The orphan record's transaction has
1821 * committed. We can cleanse this buffer */
1822 clear_buffer_jbddirty(bh);
1823 goto zap_buffer;
1824 }
1825 }
1826 } else if (transaction == journal->j_committing_transaction) {
1827 if (jh->b_jlist == BJ_Locked) {
1828 /*
1829 * The buffer is on the committing transaction's locked
1830 * list. We have the buffer locked, so I/O has
1831 * completed. So we can nail the buffer now.
1832 */
1833 may_free = __dispose_buffer(jh, transaction);
1834 goto zap_buffer;
1835 }
1836 /*
1837 * If it is committing, we simply cannot touch it. We
1838	 * can remove its next_transaction pointer from the
1839 * running transaction if that is set, but nothing
1840 * else. */
1841 JBUFFER_TRACE(jh, "on committing transaction");
1842 set_buffer_freed(bh);
1843 if (jh->b_next_transaction) {
1844 J_ASSERT(jh->b_next_transaction ==
1845 journal->j_running_transaction);
1846 jh->b_next_transaction = NULL;
1847 }
1848 jbd2_journal_put_journal_head(jh);
1849 spin_unlock(&journal->j_list_lock);
1850 jbd_unlock_bh_state(bh);
1851 spin_unlock(&journal->j_state_lock);
1852 return 0;
1853 } else {
1854 /* Good, the buffer belongs to the running transaction.
1855 * We are writing our own transaction's data, not any
1856 * previous one's, so it is safe to throw it away
1857 * (remember that we expect the filesystem to have set
1858 * i_size already for this truncate so recovery will not
1859 * expose the disk blocks we are discarding here.) */
1860 J_ASSERT_JH(jh, transaction == journal->j_running_transaction);
1861 may_free = __dispose_buffer(jh, transaction);
1862 }
1863
1864zap_buffer:
1865 jbd2_journal_put_journal_head(jh);
1866zap_buffer_no_jh:
1867 spin_unlock(&journal->j_list_lock);
1868 jbd_unlock_bh_state(bh);
1869 spin_unlock(&journal->j_state_lock);
1870zap_buffer_unlocked:
1871 clear_buffer_dirty(bh);
1872 J_ASSERT_BH(bh, !buffer_jbddirty(bh));
1873 clear_buffer_mapped(bh);
1874 clear_buffer_req(bh);
1875 clear_buffer_new(bh);
1876 bh->b_bdev = NULL;
1877 return may_free;
1878}
1879
1880/**
1881 * void jbd2_journal_invalidatepage() - reap journaled buffers after a given offset
1882 * @journal: journal to use for flush...
1883 * @page: page to flush
1884 * @offset: length of page to invalidate.
1885 *
1886 * Reap page buffers containing data after offset in page.
1887 *
1888 */
1889void jbd2_journal_invalidatepage(journal_t *journal,
1890 struct page *page,
1891 unsigned long offset)
1892{
1893 struct buffer_head *head, *bh, *next;
1894 unsigned int curr_off = 0;
1895 int may_free = 1;
1896
1897 if (!PageLocked(page))
1898 BUG();
1899 if (!page_has_buffers(page))
1900 return;
1901
1902 /* We will potentially be playing with lists other than just the
1903 * data lists (especially for journaled data mode), so be
1904 * cautious in our locking. */
1905
1906 head = bh = page_buffers(page);
1907 do {
1908 unsigned int next_off = curr_off + bh->b_size;
1909 next = bh->b_this_page;
1910
1911 if (offset <= curr_off) {
1912 /* This block is wholly outside the truncation point */
1913 lock_buffer(bh);
1914 may_free &= journal_unmap_buffer(journal, bh);
1915 unlock_buffer(bh);
1916 }
1917 curr_off = next_off;
1918 bh = next;
1919
1920 } while (bh != head);
1921
1922 if (!offset) {
1923 if (may_free && try_to_free_buffers(page))
1924 J_ASSERT(!page_has_buffers(page));
1925 }
1926}
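/*
 * Illustrative note, not part of the original source: a filesystem
 * reaches this through its ->invalidatepage address_space operation,
 * along the lines of (helper name hypothetical):
 *
 *	static void example_invalidatepage(struct page *page, unsigned long offset)
 *	{
 *		jbd2_journal_invalidatepage(example_page_journal(page), page, offset);
 *	}
 */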
1927
1928/*
1929 * File a buffer on the given transaction list.
1930 */
1931void __jbd2_journal_file_buffer(struct journal_head *jh,
1932 transaction_t *transaction, int jlist)
1933{
1934 struct journal_head **list = NULL;
1935 int was_dirty = 0;
1936 struct buffer_head *bh = jh2bh(jh);
1937
1938 J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
1939 assert_spin_locked(&transaction->t_journal->j_list_lock);
1940
1941 J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
1942 J_ASSERT_JH(jh, jh->b_transaction == transaction ||
1943 jh->b_transaction == 0);
1944
1945 if (jh->b_transaction && jh->b_jlist == jlist)
1946 return;
1947
1948 /* The following list of buffer states needs to be consistent
1949 * with __jbd_unexpected_dirty_buffer()'s handling of dirty
1950 * state. */
1951
1952 if (jlist == BJ_Metadata || jlist == BJ_Reserved ||
1953 jlist == BJ_Shadow || jlist == BJ_Forget) {
1954 if (test_clear_buffer_dirty(bh) ||
1955 test_clear_buffer_jbddirty(bh))
1956 was_dirty = 1;
1957 }
1958
1959 if (jh->b_transaction)
1960 __jbd2_journal_temp_unlink_buffer(jh);
1961 jh->b_transaction = transaction;
1962
1963 switch (jlist) {
1964 case BJ_None:
1965 J_ASSERT_JH(jh, !jh->b_committed_data);
1966 J_ASSERT_JH(jh, !jh->b_frozen_data);
1967 return;
1968 case BJ_SyncData:
1969 list = &transaction->t_sync_datalist;
1970 break;
1971 case BJ_Metadata:
1972 transaction->t_nr_buffers++;
1973 list = &transaction->t_buffers;
1974 break;
1975 case BJ_Forget:
1976 list = &transaction->t_forget;
1977 break;
1978 case BJ_IO:
1979 list = &transaction->t_iobuf_list;
1980 break;
1981 case BJ_Shadow:
1982 list = &transaction->t_shadow_list;
1983 break;
1984 case BJ_LogCtl:
1985 list = &transaction->t_log_list;
1986 break;
1987 case BJ_Reserved:
1988 list = &transaction->t_reserved_list;
1989 break;
1990 case BJ_Locked:
1991 list = &transaction->t_locked_list;
1992 break;
1993 }
1994
1995 __blist_add_buffer(list, jh);
1996 jh->b_jlist = jlist;
1997
1998 if (was_dirty)
1999 set_buffer_jbddirty(bh);
2000}
2001
2002void jbd2_journal_file_buffer(struct journal_head *jh,
2003 transaction_t *transaction, int jlist)
2004{
2005 jbd_lock_bh_state(jh2bh(jh));
2006 spin_lock(&transaction->t_journal->j_list_lock);
2007 __jbd2_journal_file_buffer(jh, transaction, jlist);
2008 spin_unlock(&transaction->t_journal->j_list_lock);
2009 jbd_unlock_bh_state(jh2bh(jh));
2010}
2011
2012/*
2013 * Remove a buffer from its current buffer list in preparation for
2014 * dropping it from its current transaction entirely. If the buffer has
2015 * already started to be used by a subsequent transaction, refile the
2016 * buffer on that transaction's metadata list.
2017 *
2018 * Called under journal->j_list_lock
2019 *
2020 * Called under jbd_lock_bh_state(jh2bh(jh))
2021 */
2022void __jbd2_journal_refile_buffer(struct journal_head *jh)
2023{
2024 int was_dirty;
2025 struct buffer_head *bh = jh2bh(jh);
2026
2027 J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
2028 if (jh->b_transaction)
2029 assert_spin_locked(&jh->b_transaction->t_journal->j_list_lock);
2030
2031 /* If the buffer is now unused, just drop it. */
2032 if (jh->b_next_transaction == NULL) {
2033 __jbd2_journal_unfile_buffer(jh);
2034 return;
2035 }
2036
2037 /*
2038 * It has been modified by a later transaction: add it to the new
2039 * transaction's metadata list.
2040 */
2041
2042 was_dirty = test_clear_buffer_jbddirty(bh);
2043 __jbd2_journal_temp_unlink_buffer(jh);
2044 jh->b_transaction = jh->b_next_transaction;
2045 jh->b_next_transaction = NULL;
2046 __jbd2_journal_file_buffer(jh, jh->b_transaction,
2047 was_dirty ? BJ_Metadata : BJ_Reserved);
2048 J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING);
2049
2050 if (was_dirty)
2051 set_buffer_jbddirty(bh);
2052}
2053
2054/*
2055 * For the unlocked version of this call, also make sure that any
2056 * hanging journal_head is cleaned up if necessary.
2057 *
2058 * __jbd2_journal_refile_buffer is usually called as part of a single locked
2059 * operation on a buffer_head, in which the caller is probably going to
2060 * be hooking the journal_head onto other lists. In that case it is up
2061 * to the caller to remove the journal_head if necessary. For the
2062 * unlocked jbd2_journal_refile_buffer call, the caller isn't going to be
2063 * doing anything else to the buffer so we need to do the cleanup
2064 * ourselves to avoid a jh leak.
2065 *
2066 * *** The journal_head may be freed by this call! ***
2067 */
2068void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh)
2069{
2070 struct buffer_head *bh = jh2bh(jh);
2071
2072 jbd_lock_bh_state(bh);
2073 spin_lock(&journal->j_list_lock);
2074
2075 __jbd2_journal_refile_buffer(jh);
2076 jbd_unlock_bh_state(bh);
2077 jbd2_journal_remove_journal_head(bh);
2078
2079 spin_unlock(&journal->j_list_lock);
2080 __brelse(bh);
2081}
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index 6de374513c01..bc4b8106a490 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -334,10 +334,10 @@ static int __init init_jffs2_fs(void)
 	   which means just 'no padding', without the alignment
 	   thing. But GCC doesn't have that -- we have to just
 	   hope the structs are the right sizes, instead. */
-	BUG_ON(sizeof(struct jffs2_unknown_node) != 12);
-	BUG_ON(sizeof(struct jffs2_raw_dirent) != 40);
-	BUG_ON(sizeof(struct jffs2_raw_inode) != 68);
-	BUG_ON(sizeof(struct jffs2_raw_summary) != 32);
+	BUILD_BUG_ON(sizeof(struct jffs2_unknown_node) != 12);
+	BUILD_BUG_ON(sizeof(struct jffs2_raw_dirent) != 40);
+	BUILD_BUG_ON(sizeof(struct jffs2_raw_inode) != 68);
+	BUILD_BUG_ON(sizeof(struct jffs2_raw_summary) != 32);
 
 	printk(KERN_INFO "JFFS2 version 2.2."
 #ifdef CONFIG_JFFS2_FS_WRITEBUFFER
diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index 87e1d03e8267..b85a0ad2cfb6 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -100,12 +100,12 @@ int nlmclnt_block(struct nlm_wait *block, struct nlm_rqst *req, long timeout)
 /*
  * The server lockd has called us back to tell us the lock was granted
  */
-u32 nlmclnt_grant(const struct sockaddr_in *addr, const struct nlm_lock *lock)
+__be32 nlmclnt_grant(const struct sockaddr_in *addr, const struct nlm_lock *lock)
 {
 	const struct file_lock *fl = &lock->fl;
 	const struct nfs_fh *fh = &lock->fh;
 	struct nlm_wait	*block;
-	u32 res = nlm_lck_denied;
+	__be32 res = nlm_lck_denied;
 
 	/*
 	 * Look up blocked request based on arguments.
@@ -144,42 +144,12 @@ u32 nlmclnt_grant(const struct sockaddr_in *addr, const struct nlm_lock *lock)
  */
 
 /*
- * Someone has sent us an SM_NOTIFY. Ensure we bind to the new port number,
- * that we mark locks for reclaiming, and that we bump the pseudo NSM state.
- */
-static void nlmclnt_prepare_reclaim(struct nlm_host *host)
-{
-	down_write(&host->h_rwsem);
-	host->h_monitored = 0;
-	host->h_state++;
-	host->h_nextrebind = 0;
-	nlm_rebind_host(host);
-
-	/*
-	 * Mark the locks for reclaiming.
-	 */
-	list_splice_init(&host->h_granted, &host->h_reclaim);
-
-	dprintk("NLM: reclaiming locks for host %s\n", host->h_name);
-}
-
-static void nlmclnt_finish_reclaim(struct nlm_host *host)
-{
-	host->h_reclaiming = 0;
-	up_write(&host->h_rwsem);
-	dprintk("NLM: done reclaiming locks for host %s", host->h_name);
-}
-
-/*
  * Reclaim all locks on server host. We do this by spawning a separate
  * reclaimer thread.
  */
 void
-nlmclnt_recovery(struct nlm_host *host, u32 newstate)
+nlmclnt_recovery(struct nlm_host *host)
 {
-	if (host->h_nsmstate == newstate)
-		return;
-	host->h_nsmstate = newstate;
 	if (!host->h_reclaiming++) {
 		nlm_get_host(host);
 		__module_get(THIS_MODULE);
@@ -199,18 +169,30 @@ reclaimer(void *ptr)
 	daemonize("%s-reclaim", host->h_name);
 	allow_signal(SIGKILL);
 
+	down_write(&host->h_rwsem);
+
 	/* This one ensures that our parent doesn't terminate while the
 	 * reclaim is in progress */
 	lock_kernel();
 	lockd_up(0); /* note: this cannot fail as lockd is already running */
 
-	nlmclnt_prepare_reclaim(host);
-	/* First, reclaim all locks that have been marked. */
+	dprintk("lockd: reclaiming locks for host %s", host->h_name);
+
 restart:
 	nsmstate = host->h_nsmstate;
+
+	/* Force a portmap getport - the peer's lockd will
+	 * most likely end up on a different port.
+	 */
+	host->h_nextrebind = jiffies;
+	nlm_rebind_host(host);
+
+	/* First, reclaim all locks that have been granted. */
+	list_splice_init(&host->h_granted, &host->h_reclaim);
 	list_for_each_entry_safe(fl, next, &host->h_reclaim, fl_u.nfs_fl.list) {
 		list_del_init(&fl->fl_u.nfs_fl.list);
 
+		/* Why are we leaking memory here? --okir */
 		if (signalled())
 			continue;
 		if (nlmclnt_reclaim(host, fl) != 0)
@@ -218,11 +200,13 @@ restart:
 			list_add_tail(&fl->fl_u.nfs_fl.list, &host->h_granted);
 		if (host->h_nsmstate != nsmstate) {
 			/* Argh! The server rebooted again! */
-			list_splice_init(&host->h_granted, &host->h_reclaim);
 			goto restart;
 		}
 	}
-	nlmclnt_finish_reclaim(host);
+
+	host->h_reclaiming = 0;
+	up_write(&host->h_rwsem);
+	dprintk("NLM: done reclaiming locks for host %s", host->h_name);
 
 	/* Now, wake up all processes that sleep on a blocked lock */
 	list_for_each_entry(block, &nlm_blocked, b_list) {
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index 0116729cec5f..3d84f600b633 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -36,14 +36,14 @@ static const struct rpc_call_ops nlmclnt_cancel_ops;
 /*
  * Cookie counter for NLM requests
  */
-static u32	nlm_cookie = 0x1234;
+static atomic_t	nlm_cookie = ATOMIC_INIT(0x1234);
 
-static inline void nlmclnt_next_cookie(struct nlm_cookie *c)
+void nlmclnt_next_cookie(struct nlm_cookie *c)
 {
-	memcpy(c->data, &nlm_cookie, 4);
-	memset(c->data+4, 0, 4);
+	u32	cookie = atomic_inc_return(&nlm_cookie);
+
+	memcpy(c->data, &cookie, 4);
 	c->len=4;
-	nlm_cookie++;
 }
 
 static struct nlm_lockowner *nlm_get_lockowner(struct nlm_lockowner *lockowner)
@@ -153,6 +153,7 @@ nlmclnt_proc(struct inode *inode, int cmd, struct file_lock *fl)
 {
 	struct rpc_clnt	*client = NFS_CLIENT(inode);
 	struct sockaddr_in addr;
+	struct nfs_server *nfssrv = NFS_SERVER(inode);
 	struct nlm_host	*host;
 	struct nlm_rqst	*call;
 	sigset_t	oldset;
@@ -166,7 +167,9 @@ nlmclnt_proc(struct inode *inode, int cmd, struct file_lock *fl)
 	}
 
 	rpc_peeraddr(client, (struct sockaddr *) &addr, sizeof(addr));
-	host = nlmclnt_lookup_host(&addr, client->cl_xprt->prot, vers);
+	host = nlmclnt_lookup_host(&addr, client->cl_xprt->prot, vers,
+				   nfssrv->nfs_client->cl_hostname,
+				   strlen(nfssrv->nfs_client->cl_hostname));
 	if (host == NULL)
 		return -ENOLCK;
 
@@ -499,7 +502,7 @@ nlmclnt_lock(struct nlm_rqst *req, struct file_lock *fl)
 	unsigned char fl_flags = fl->fl_flags;
 	int status = -ENOLCK;
 
-	if (!host->h_monitored && nsm_monitor(host) < 0) {
+	if (nsm_monitor(host) < 0) {
 		printk(KERN_NOTICE "lockd: failed to monitor %s\n",
 			host->h_name);
 		goto out;
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index a0d0b58ce7a4..fb24a9730345 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -27,46 +27,60 @@
 #define NLM_HOST_EXPIRE		((nrhosts > NLM_HOST_MAX)? 300 * HZ : 120 * HZ)
 #define NLM_HOST_COLLECT	((nrhosts > NLM_HOST_MAX)? 120 * HZ :  60 * HZ)
 
-static struct nlm_host *	nlm_hosts[NLM_HOST_NRHASH];
+static struct hlist_head	nlm_hosts[NLM_HOST_NRHASH];
 static unsigned long		next_gc;
 static int			nrhosts;
 static DEFINE_MUTEX(nlm_host_mutex);
 
 
 static void			nlm_gc_hosts(void);
+static struct nsm_handle *	__nsm_find(const struct sockaddr_in *,
+					const char *, int, int);
 
 /*
  * Find an NLM server handle in the cache. If there is none, create it.
  */
 struct nlm_host *
-nlmclnt_lookup_host(struct sockaddr_in *sin, int proto, int version)
+nlmclnt_lookup_host(const struct sockaddr_in *sin, int proto, int version,
+			const char *hostname, int hostname_len)
 {
-	return nlm_lookup_host(0, sin, proto, version);
+	return nlm_lookup_host(0, sin, proto, version,
+			       hostname, hostname_len);
 }
 
 /*
  * Find an NLM client handle in the cache. If there is none, create it.
  */
 struct nlm_host *
-nlmsvc_lookup_host(struct svc_rqst *rqstp)
+nlmsvc_lookup_host(struct svc_rqst *rqstp,
+			const char *hostname, int hostname_len)
 {
 	return nlm_lookup_host(1, &rqstp->rq_addr,
-			       rqstp->rq_prot, rqstp->rq_vers);
+			       rqstp->rq_prot, rqstp->rq_vers,
+			       hostname, hostname_len);
 }
 
 /*
  * Common host lookup routine for server & client
  */
 struct nlm_host *
-nlm_lookup_host(int server, struct sockaddr_in *sin,
-					int proto, int version)
+nlm_lookup_host(int server, const struct sockaddr_in *sin,
+					int proto, int version,
+					const char *hostname,
+					int hostname_len)
 {
-	struct nlm_host	*host, **hp;
-	u32		addr;
+	struct hlist_head *chain;
+	struct hlist_node *pos;
+	struct nlm_host	*host;
+	struct nsm_handle *nsm = NULL;
 	int		hash;
 
-	dprintk("lockd: nlm_lookup_host(%08x, p=%d, v=%d)\n",
-			(unsigned)(sin? ntohl(sin->sin_addr.s_addr) : 0), proto, version);
+	dprintk("lockd: nlm_lookup_host(%u.%u.%u.%u, p=%d, v=%d, my role=%s, name=%.*s)\n",
+			NIPQUAD(sin->sin_addr.s_addr), proto, version,
+			server? "server" : "client",
+			hostname_len,
+			hostname? hostname : "<none>");
+
 
 	hash = NLM_ADDRHASH(sin->sin_addr.s_addr);
 
@@ -76,7 +90,22 @@ nlm_lookup_host(int server, struct sockaddr_in *sin,
 	if (time_after_eq(jiffies, next_gc))
 		nlm_gc_hosts();
 
-	for (hp = &nlm_hosts[hash]; (host = *hp) != 0; hp = &host->h_next) {
+	/* We may keep several nlm_host objects for a peer, because each
+	 * nlm_host is identified by
+	 * (address, protocol, version, server/client)
+	 * We could probably simplify this a little by putting all those
+	 * different NLM rpc_clients into one single nlm_host object.
+	 * This would allow us to have one nlm_host per address.
+	 */
+	chain = &nlm_hosts[hash];
+	hlist_for_each_entry(host, pos, chain, h_hash) {
+		if (!nlm_cmp_addr(&host->h_addr, sin))
+			continue;
+
+		/* See if we have an NSM handle for this client */
+		if (!nsm)
+			nsm = host->h_nsmhandle;
+
 		if (host->h_proto != proto)
 			continue;
 		if (host->h_version != version)
@@ -84,28 +113,30 @@ nlm_lookup_host(int server, struct sockaddr_in *sin,
 		if (host->h_server != server)
 			continue;
 
-		if (nlm_cmp_addr(&host->h_addr, sin)) {
-			if (hp != nlm_hosts + hash) {
-				*hp = host->h_next;
-				host->h_next = nlm_hosts[hash];
-				nlm_hosts[hash] = host;
-			}
-			nlm_get_host(host);
-			mutex_unlock(&nlm_host_mutex);
-			return host;
-		}
-	}
+		/* Move to head of hash chain. */
+		hlist_del(&host->h_hash);
+		hlist_add_head(&host->h_hash, chain);
 
-	/* Ooops, no host found, create it */
-	dprintk("lockd: creating host entry\n");
+		nlm_get_host(host);
+		goto out;
+	}
+	if (nsm)
+		atomic_inc(&nsm->sm_count);
 
-	host = kzalloc(sizeof(*host), GFP_KERNEL);
-	if (!host)
-		goto nohost;
+	host = NULL;
 
-	addr = sin->sin_addr.s_addr;
-	sprintf(host->h_name, "%u.%u.%u.%u", NIPQUAD(addr));
+	/* Sadly, the host isn't in our hash table yet. See if
+	 * we have an NSM handle for it. If not, create one.
+	 */
+	if (!nsm && !(nsm = nsm_find(sin, hostname, hostname_len)))
+		goto out;
 
+	host = kzalloc(sizeof(*host), GFP_KERNEL);
+	if (!host) {
+		nsm_release(nsm);
+		goto out;
+	}
+	host->h_name	   = nsm->sm_name;
 	host->h_addr       = *sin;
 	host->h_addr.sin_port = 0;	/* ouch! */
 	host->h_version    = version;
@@ -119,9 +150,9 @@ nlm_lookup_host(int server, struct sockaddr_in *sin,
 	init_rwsem(&host->h_rwsem);
 	host->h_state      = 0;			/* pseudo NSM state */
 	host->h_nsmstate   = 0;			/* real NSM state */
+	host->h_nsmhandle  = nsm;
 	host->h_server	   = server;
-	host->h_next       = nlm_hosts[hash];
-	nlm_hosts[hash]    = host;
+	hlist_add_head(&host->h_hash, chain);
 	INIT_LIST_HEAD(&host->h_lockowners);
 	spin_lock_init(&host->h_lock);
 	INIT_LIST_HEAD(&host->h_granted);
@@ -130,35 +161,39 @@ nlm_lookup_host(int server, struct sockaddr_in *sin,
 	if (++nrhosts > NLM_HOST_MAX)
 		next_gc = 0;
 
-nohost:
+out:
 	mutex_unlock(&nlm_host_mutex);
 	return host;
 }
 
-struct nlm_host *
-nlm_find_client(void)
+/*
+ * Destroy a host
+ */
+static void
+nlm_destroy_host(struct nlm_host *host)
 {
-	/* find a nlm_host for a client for which h_killed == 0.
-	 * and return it
+	struct rpc_clnt	*clnt;
+
+	BUG_ON(!list_empty(&host->h_lockowners));
+	BUG_ON(atomic_read(&host->h_count));
+
+	/*
+	 * Release NSM handle and unmonitor host.
 	 */
-	int hash;
-	mutex_lock(&nlm_host_mutex);
-	for (hash = 0 ; hash < NLM_HOST_NRHASH; hash++) {
-		struct nlm_host *host, **hp;
-		for (hp = &nlm_hosts[hash]; (host = *hp) != 0; hp = &host->h_next) {
-			if (host->h_server &&
-			    host->h_killed == 0) {
-				nlm_get_host(host);
-				mutex_unlock(&nlm_host_mutex);
-				return host;
-			}
+	nsm_unmonitor(host);
+
+	if ((clnt = host->h_rpcclnt) != NULL) {
+		if (atomic_read(&clnt->cl_users)) {
+			printk(KERN_WARNING
+				"lockd: active RPC handle\n");
+			clnt->cl_dead = 1;
+		} else {
+			rpc_destroy_client(host->h_rpcclnt);
 		}
 	}
-	mutex_unlock(&nlm_host_mutex);
-	return NULL;
+	kfree(host);
 }
 
-
 /*
  * Create the NLM RPC client for an NLM peer
  */
@@ -260,22 +295,82 @@ void nlm_release_host(struct nlm_host *host)
 }
 
 /*
+ * We were notified that the host indicated by address &sin
+ * has rebooted.
+ * Release all resources held by that peer.
+ */
+void nlm_host_rebooted(const struct sockaddr_in *sin,
+				const char *hostname, int hostname_len,
+				u32 new_state)
+{
+	struct hlist_head *chain;
+	struct hlist_node *pos;
+	struct nsm_handle *nsm;
+	struct nlm_host	*host;
+
+	dprintk("lockd: nlm_host_rebooted(%s, %u.%u.%u.%u)\n",
+			hostname, NIPQUAD(sin->sin_addr));
+
+	/* Find the NSM handle for this peer */
+	if (!(nsm = __nsm_find(sin, hostname, hostname_len, 0)))
+		return;
+
+	/* When reclaiming locks on this peer, make sure that
+	 * we set up a new notification */
+	nsm->sm_monitored = 0;
+
+	/* Mark all hosts tied to this NSM state as having rebooted.
+	 * We run the loop repeatedly, because we drop the host table
+	 * lock for this.
+	 * To avoid processing a host several times, we match the nsmstate.
+	 */
+again:	mutex_lock(&nlm_host_mutex);
+	for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) {
+		hlist_for_each_entry(host, pos, chain, h_hash) {
+			if (host->h_nsmhandle == nsm
+			 && host->h_nsmstate != new_state) {
+				host->h_nsmstate = new_state;
+				host->h_state++;
+
+				nlm_get_host(host);
+				mutex_unlock(&nlm_host_mutex);
+
+				if (host->h_server) {
+					/* We're server for this guy, just ditch
+					 * all the locks he held. */
+					nlmsvc_free_host_resources(host);
+				} else {
+					/* He's the server, initiate lock recovery. */
+					nlmclnt_recovery(host);
+				}
+
+				nlm_release_host(host);
+				goto again;
+			}
+		}
+	}
+
+	mutex_unlock(&nlm_host_mutex);
+}
+
+/*
  * Shut down the hosts module.
  * Note that this routine is called only at server shutdown time.
  */
 void
 nlm_shutdown_hosts(void)
 {
+	struct hlist_head *chain;
+	struct hlist_node *pos;
 	struct nlm_host	*host;
-	int		i;
 
 	dprintk("lockd: shutting down host module\n");
 	mutex_lock(&nlm_host_mutex);
 
 	/* First, make all hosts eligible for gc */
 	dprintk("lockd: nuking all hosts...\n");
-	for (i = 0; i < NLM_HOST_NRHASH; i++) {
-		for (host = nlm_hosts[i]; host; host = host->h_next)
+	for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) {
+		hlist_for_each_entry(host, pos, chain, h_hash)
 			host->h_expires = jiffies - 1;
 	}
 
@@ -287,8 +382,8 @@ nlm_shutdown_hosts(void)
 	if (nrhosts) {
 		printk(KERN_WARNING "lockd: couldn't shutdown host module!\n");
 		dprintk("lockd: %d hosts left:\n", nrhosts);
-		for (i = 0; i < NLM_HOST_NRHASH; i++) {
-			for (host = nlm_hosts[i]; host; host = host->h_next) {
+		for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) {
+			hlist_for_each_entry(host, pos, chain, h_hash) {
 				dprintk("       %s (cnt %d use %d exp %ld)\n",
 					host->h_name, atomic_read(&host->h_count),
 					host->h_inuse, host->h_expires);
@@ -305,45 +400,32 @@ nlm_shutdown_hosts(void)
 static void
 nlm_gc_hosts(void)
 {
-	struct nlm_host	**q, *host;
-	struct rpc_clnt	*clnt;
-	int		i;
+	struct hlist_head *chain;
+	struct hlist_node *pos, *next;
+	struct nlm_host	*host;
 
 	dprintk("lockd: host garbage collection\n");
-	for (i = 0; i < NLM_HOST_NRHASH; i++) {
-		for (host = nlm_hosts[i]; host; host = host->h_next)
+	for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) {
+		hlist_for_each_entry(host, pos, chain, h_hash)
 			host->h_inuse = 0;
 	}
 
 	/* Mark all hosts that hold locks, blocks or shares */
 	nlmsvc_mark_resources();
 
-	for (i = 0; i < NLM_HOST_NRHASH; i++) {
-		q = &nlm_hosts[i];
-		while ((host = *q) != NULL) {
+	for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) {
+		hlist_for_each_entry_safe(host, pos, next, chain, h_hash) {
 			if (atomic_read(&host->h_count) || host->h_inuse
 			 || time_before(jiffies, host->h_expires)) {
 				dprintk("nlm_gc_hosts skipping %s (cnt %d use %d exp %ld)\n",
 					host->h_name, atomic_read(&host->h_count),
 					host->h_inuse, host->h_expires);
-				q = &host->h_next;
 				continue;
 			}
 			dprintk("lockd: delete host %s\n", host->h_name);
-			*q = host->h_next;
-			/* Don't unmonitor hosts that have been invalidated */
-			if (host->h_monitored && !host->h_killed)
-				nsm_unmonitor(host);
-			if ((clnt = host->h_rpcclnt) != NULL) {
-				if (atomic_read(&clnt->cl_users)) {
-					printk(KERN_WARNING
-						"lockd: active RPC handle\n");
-					clnt->cl_dead = 1;
-				} else {
-					rpc_destroy_client(host->h_rpcclnt);
-				}
-			}
-			kfree(host);
+			hlist_del_init(&host->h_hash);
+
+			nlm_destroy_host(host);
 			nrhosts--;
 		}
 	}
@@ -351,3 +433,88 @@ nlm_gc_hosts(void)
351 next_gc = jiffies + NLM_HOST_COLLECT; 433 next_gc = jiffies + NLM_HOST_COLLECT;
352} 434}
353 435
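nlm_gc_hosts() above is a two-pass mark-and-sweep: pass one clears h_inuse on every host, nlmsvc_mark_resources() re-marks the hosts that still own locks, blocks or shares, and pass two unlinks whatever stayed unmarked. The switch to hlist_for_each_entry_safe matters because entries are deleted mid-walk; the safe variant caches the next pointer before the current node can be freed. A userspace model of that sweep over a plain singly linked chain (illustrative only):

#include <stdlib.h>

struct host {
	struct host *next;
	int refcount, inuse;
	long expires;
};

static void sweep(struct host **chain, long now)
{
	struct host **prevp = chain;
	struct host *h, *next;

	for (h = *chain; h != NULL; h = next) {
		next = h->next;			/* cache before h can be freed */
		if (h->refcount || h->inuse || now < h->expires) {
			prevp = &h->next;	/* still alive: step over it */
			continue;
		}
		*prevp = next;			/* unlink first, then free is safe */
		free(h);
	}
}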
436
437/*
438 * Manage NSM handles
439 */
440static LIST_HEAD(nsm_handles);
441static DEFINE_MUTEX(nsm_mutex);
442
443static struct nsm_handle *
444__nsm_find(const struct sockaddr_in *sin,
445 const char *hostname, int hostname_len,
446 int create)
447{
448 struct nsm_handle *nsm = NULL;
449 struct list_head *pos;
450
451 if (!sin)
452 return NULL;
453
454 if (hostname && memchr(hostname, '/', hostname_len) != NULL) {
455 if (printk_ratelimit()) {
456 printk(KERN_WARNING "Invalid hostname \"%.*s\" "
457 "in NFS lock request\n",
458 hostname_len, hostname);
459 }
460 return NULL;
461 }
462
463 mutex_lock(&nsm_mutex);
464 list_for_each(pos, &nsm_handles) {
465 nsm = list_entry(pos, struct nsm_handle, sm_link);
466
467 if (hostname && nsm_use_hostnames) {
468 if (strlen(nsm->sm_name) != hostname_len
469 || memcmp(nsm->sm_name, hostname, hostname_len))
470 continue;
471 } else if (!nlm_cmp_addr(&nsm->sm_addr, sin))
472 continue;
473 atomic_inc(&nsm->sm_count);
474 goto out;
475 }
476
477 if (!create) {
478 nsm = NULL;
479 goto out;
480 }
481
482 nsm = kzalloc(sizeof(*nsm) + hostname_len + 1, GFP_KERNEL);
483 if (nsm != NULL) {
484 nsm->sm_addr = *sin;
485 nsm->sm_name = (char *) (nsm + 1);
486 memcpy(nsm->sm_name, hostname, hostname_len);
487 nsm->sm_name[hostname_len] = '\0';
488 atomic_set(&nsm->sm_count, 1);
489
490 list_add(&nsm->sm_link, &nsm_handles);
491 }
492
493out:
494 mutex_unlock(&nsm_mutex);
495 return nsm;
496}
497
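__nsm_find() deduplicates handles under nsm_mutex, comparing by hostname when nsm_use_hostnames is set and by address otherwise. Note the allocation trick on the create path: kzalloc reserves sizeof(*nsm) + hostname_len + 1 bytes, so the name string lives in the same allocation as the handle and a single kfree() in nsm_release() frees both. The same trick in plain C (a sketch, not the kernel code):

#include <stdlib.h>
#include <string.h>

struct handle {
	char *name;	/* points just past the struct itself */
	int count;
};

static struct handle *handle_alloc(const char *hostname, size_t len)
{
	/* one allocation for struct + name + NUL, like "nsm + 1" above */
	struct handle *h = calloc(1, sizeof(*h) + len + 1);

	if (h) {
		h->name = (char *)(h + 1);
		memcpy(h->name, hostname, len);
		h->name[len] = '\0';
		h->count = 1;
	}
	return h;
}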
498struct nsm_handle *
499nsm_find(const struct sockaddr_in *sin, const char *hostname, int hostname_len)
500{
501 return __nsm_find(sin, hostname, hostname_len, 1);
502}
503
504/*
505 * Release an NSM handle
506 */
507void
508nsm_release(struct nsm_handle *nsm)
509{
510 if (!nsm)
511 return;
512 if (atomic_dec_and_test(&nsm->sm_count)) {
513 mutex_lock(&nsm_mutex);
514 if (atomic_read(&nsm->sm_count) == 0) {
515 list_del(&nsm->sm_link);
516 kfree(nsm);
517 }
518 mutex_unlock(&nsm_mutex);
519 }
520}
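nsm_release() is careful about a narrow race: the count can drop to zero while a concurrent __nsm_find() is about to take a new reference, so the handle is unlinked and freed only if the count is still zero once nsm_mutex is held. That stays correct because __nsm_find() bumps the count only while holding the same mutex. A userspace model of the pattern, assuming C11 atomics (illustrative names):

#include <pthread.h>
#include <stdatomic.h>
#include <stdlib.h>

struct handle { atomic_int count; };

static pthread_mutex_t handle_lock = PTHREAD_MUTEX_INITIALIZER;

static void handle_release(struct handle *h)
{
	if (!h)
		return;
	if (atomic_fetch_sub(&h->count, 1) == 1) {	/* dropped the last ref */
		pthread_mutex_lock(&handle_lock);
		/* A lookup may have re-taken a reference (under handle_lock)
		 * between our decrement and this point, so re-check. */
		if (atomic_load(&h->count) == 0)
			free(h);	/* would also list_del() in the kernel */
		pthread_mutex_unlock(&handle_lock);
	}
}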
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index a816b920d431..eb243edf8932 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -24,13 +24,13 @@ static struct rpc_program nsm_program;
24/* 24/*
25 * Local NSM state 25 * Local NSM state
26 */ 26 */
27u32 nsm_local_state; 27int nsm_local_state;
28 28
29/* 29/*
30 * Common procedure for SM_MON/SM_UNMON calls 30 * Common procedure for SM_MON/SM_UNMON calls
31 */ 31 */
32static int 32static int
33nsm_mon_unmon(struct nlm_host *host, u32 proc, struct nsm_res *res) 33nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res)
34{ 34{
35 struct rpc_clnt *clnt; 35 struct rpc_clnt *clnt;
36 int status; 36 int status;
@@ -46,10 +46,11 @@ nsm_mon_unmon(struct nlm_host *host, u32 proc, struct nsm_res *res)
46 goto out; 46 goto out;
47 } 47 }
48 48
49 args.addr = host->h_addr.sin_addr.s_addr; 49 memset(&args, 0, sizeof(args));
50 args.proto= (host->h_proto<<1) | host->h_server; 50 args.mon_name = nsm->sm_name;
51 args.addr = nsm->sm_addr.sin_addr.s_addr;
51 args.prog = NLM_PROGRAM; 52 args.prog = NLM_PROGRAM;
52 args.vers = host->h_version; 53 args.vers = 3;
53 args.proc = NLMPROC_NSM_NOTIFY; 54 args.proc = NLMPROC_NSM_NOTIFY;
54 memset(res, 0, sizeof(*res)); 55 memset(res, 0, sizeof(*res));
55 56
@@ -70,17 +71,22 @@ nsm_mon_unmon(struct nlm_host *host, u32 proc, struct nsm_res *res)
70int 71int
71nsm_monitor(struct nlm_host *host) 72nsm_monitor(struct nlm_host *host)
72{ 73{
74 struct nsm_handle *nsm = host->h_nsmhandle;
73 struct nsm_res res; 75 struct nsm_res res;
74 int status; 76 int status;
75 77
76 dprintk("lockd: nsm_monitor(%s)\n", host->h_name); 78 dprintk("lockd: nsm_monitor(%s)\n", host->h_name);
79 BUG_ON(nsm == NULL);
77 80
78 status = nsm_mon_unmon(host, SM_MON, &res); 81 if (nsm->sm_monitored)
82 return 0;
83
84 status = nsm_mon_unmon(nsm, SM_MON, &res);
79 85
80 if (status < 0 || res.status != 0) 86 if (status < 0 || res.status != 0)
81 printk(KERN_NOTICE "lockd: cannot monitor %s\n", host->h_name); 87 printk(KERN_NOTICE "lockd: cannot monitor %s\n", host->h_name);
82 else 88 else
83 host->h_monitored = 1; 89 nsm->sm_monitored = 1;
84 return status; 90 return status;
85} 91}
86 92
@@ -90,16 +96,26 @@ nsm_monitor(struct nlm_host *host)
90int 96int
91nsm_unmonitor(struct nlm_host *host) 97nsm_unmonitor(struct nlm_host *host)
92{ 98{
99 struct nsm_handle *nsm = host->h_nsmhandle;
93 struct nsm_res res; 100 struct nsm_res res;
94 int status; 101 int status = 0;
95 102
96 dprintk("lockd: nsm_unmonitor(%s)\n", host->h_name); 103 if (nsm == NULL)
97 104 return 0;
98 status = nsm_mon_unmon(host, SM_UNMON, &res); 105 host->h_nsmhandle = NULL;
99 if (status < 0) 106
100 printk(KERN_NOTICE "lockd: cannot unmonitor %s\n", host->h_name); 107 if (atomic_read(&nsm->sm_count) == 1
101 else 108 && nsm->sm_monitored && !nsm->sm_sticky) {
102 host->h_monitored = 0; 109 dprintk("lockd: nsm_unmonitor(%s)\n", host->h_name);
110
111 status = nsm_mon_unmon(nsm, SM_UNMON, &res);
112 if (status < 0)
113 printk(KERN_NOTICE "lockd: cannot unmonitor %s\n",
114 host->h_name);
115 else
116 nsm->sm_monitored = 0;
117 }
118 nsm_release(nsm);
103 return status; 119 return status;
104} 120}
105 121
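The monitoring state thus moves from the per-host h_monitored flag onto the shared nsm_handle: nsm_monitor() becomes idempotent (it returns early if sm_monitored is already set), and nsm_unmonitor() only sends SM_UNMON once the caller holds the last reference and sm_sticky has not pinned the registration. A compact restatement of that rule (field names follow the patch; the struct itself is a sketch):

struct nsm_handle_model {
	int sm_count;			/* references from nlm_host objects */
	unsigned sm_monitored : 1;	/* statd currently watches this peer */
	unsigned sm_sticky : 1;		/* keep the registration at last put */
};

/* SM_UNMON is only worth sending when nobody else still needs statd
 * to watch this peer and the handle is not pinned. */
static int should_unmonitor(const struct nsm_handle_model *nsm)
{
	return nsm->sm_count == 1 && nsm->sm_monitored && !nsm->sm_sticky;
}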
@@ -132,10 +148,10 @@ nsm_create(void)
132 * XDR functions for NSM. 148 * XDR functions for NSM.
133 */ 149 */
134 150
135static u32 * 151static __be32 *
136xdr_encode_common(struct rpc_rqst *rqstp, u32 *p, struct nsm_args *argp) 152xdr_encode_common(struct rpc_rqst *rqstp, __be32 *p, struct nsm_args *argp)
137{ 153{
138 char buffer[20]; 154 char buffer[20], *name;
139 155
140 /* 156 /*
141 * Use the dotted-quad IP address of the remote host as 157 * Use the dotted-quad IP address of the remote host as
@@ -143,8 +159,13 @@ xdr_encode_common(struct rpc_rqst *rqstp, u32 *p, struct nsm_args *argp)
143 * hostname first for whatever remote hostname it receives, 159 * hostname first for whatever remote hostname it receives,
144 * so this works alright. 160 * so this works alright.
145 */ 161 */
146 sprintf(buffer, "%u.%u.%u.%u", NIPQUAD(argp->addr)); 162 if (nsm_use_hostnames) {
147 if (!(p = xdr_encode_string(p, buffer)) 163 name = argp->mon_name;
164 } else {
165 sprintf(buffer, "%u.%u.%u.%u", NIPQUAD(argp->addr));
166 name = buffer;
167 }
168 if (!(p = xdr_encode_string(p, name))
148 || !(p = xdr_encode_string(p, utsname()->nodename))) 169 || !(p = xdr_encode_string(p, utsname()->nodename)))
149 return ERR_PTR(-EIO); 170 return ERR_PTR(-EIO);
150 *p++ = htonl(argp->prog); 171 *p++ = htonl(argp->prog);
@@ -155,21 +176,23 @@ xdr_encode_common(struct rpc_rqst *rqstp, u32 *p, struct nsm_args *argp)
155} 176}
156 177
157static int 178static int
158xdr_encode_mon(struct rpc_rqst *rqstp, u32 *p, struct nsm_args *argp) 179xdr_encode_mon(struct rpc_rqst *rqstp, __be32 *p, struct nsm_args *argp)
159{ 180{
160 p = xdr_encode_common(rqstp, p, argp); 181 p = xdr_encode_common(rqstp, p, argp);
161 if (IS_ERR(p)) 182 if (IS_ERR(p))
162 return PTR_ERR(p); 183 return PTR_ERR(p);
184
185 /* Surprise - there may even be room for an IPv6 address now */
163 *p++ = argp->addr; 186 *p++ = argp->addr;
164 *p++ = argp->vers; 187 *p++ = 0;
165 *p++ = argp->proto; 188 *p++ = 0;
166 *p++ = 0; 189 *p++ = 0;
167 rqstp->rq_slen = xdr_adjust_iovec(rqstp->rq_svec, p); 190 rqstp->rq_slen = xdr_adjust_iovec(rqstp->rq_svec, p);
168 return 0; 191 return 0;
169} 192}
170 193
171static int 194static int
172xdr_encode_unmon(struct rpc_rqst *rqstp, u32 *p, struct nsm_args *argp) 195xdr_encode_unmon(struct rpc_rqst *rqstp, __be32 *p, struct nsm_args *argp)
173{ 196{
174 p = xdr_encode_common(rqstp, p, argp); 197 p = xdr_encode_common(rqstp, p, argp);
175 if (IS_ERR(p)) 198 if (IS_ERR(p))
@@ -179,7 +202,7 @@ xdr_encode_unmon(struct rpc_rqst *rqstp, u32 *p, struct nsm_args *argp)
179} 202}
180 203
181static int 204static int
182xdr_decode_stat_res(struct rpc_rqst *rqstp, u32 *p, struct nsm_res *resp) 205xdr_decode_stat_res(struct rpc_rqst *rqstp, __be32 *p, struct nsm_res *resp)
183{ 206{
184 resp->status = ntohl(*p++); 207 resp->status = ntohl(*p++);
185 resp->state = ntohl(*p++); 208 resp->state = ntohl(*p++);
@@ -189,7 +212,7 @@ xdr_decode_stat_res(struct rpc_rqst *rqstp, u32 *p, struct nsm_res *resp)
189} 212}
190 213
191static int 214static int
192xdr_decode_stat(struct rpc_rqst *rqstp, u32 *p, struct nsm_res *resp) 215xdr_decode_stat(struct rpc_rqst *rqstp, __be32 *p, struct nsm_res *resp)
193{ 216{
194 resp->state = ntohl(*p++); 217 resp->state = ntohl(*p++);
195 return 0; 218 return 0;
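The XDR side of these calls is simple: two counted strings (the mon_name chosen above plus our own nodename), three 32-bit words for prog/vers/proc, and 16 bytes of opaque private data that now carry the address plus three zero words. An XDR string is a 4-byte big-endian length, the bytes, then zero padding to a 4-byte boundary; a hand-rolled userspace encoder for that piece (a sketch of what xdr_encode_string() produces on the wire, not its actual implementation):

#include <arpa/inet.h>
#include <stdint.h>
#include <string.h>

static uint32_t *xdr_put_string(uint32_t *p, const char *s)
{
	size_t len = strlen(s);

	*p++ = htonl((uint32_t)len);			/* big-endian length word */
	memcpy(p, s, len);				/* the bytes themselves */
	memset((char *)p + len, 0, (4 - len % 4) % 4);	/* pad to 4 bytes */
	return p + (len + 3) / 4;			/* advance past padded data */
}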
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 3cc369e5693f..634139232aaf 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -33,6 +33,7 @@
33#include <linux/sunrpc/svcsock.h> 33#include <linux/sunrpc/svcsock.h>
34#include <net/ip.h> 34#include <net/ip.h>
35#include <linux/lockd/lockd.h> 35#include <linux/lockd/lockd.h>
36#include <linux/lockd/sm_inter.h>
36#include <linux/nfs.h> 37#include <linux/nfs.h>
37 38
38#define NLMDBG_FACILITY NLMDBG_SVC 39#define NLMDBG_FACILITY NLMDBG_SVC
@@ -61,6 +62,7 @@ static DECLARE_WAIT_QUEUE_HEAD(lockd_exit);
61static unsigned long nlm_grace_period; 62static unsigned long nlm_grace_period;
62static unsigned long nlm_timeout = LOCKD_DFLT_TIMEO; 63static unsigned long nlm_timeout = LOCKD_DFLT_TIMEO;
63static int nlm_udpport, nlm_tcpport; 64static int nlm_udpport, nlm_tcpport;
65int nsm_use_hostnames = 0;
64 66
65/* 67/*
66 * Constants needed for the sysctl interface. 68 * Constants needed for the sysctl interface.
@@ -395,6 +397,22 @@ static ctl_table nlm_sysctls[] = {
395 .extra1 = (int *) &nlm_port_min, 397 .extra1 = (int *) &nlm_port_min,
396 .extra2 = (int *) &nlm_port_max, 398 .extra2 = (int *) &nlm_port_max,
397 }, 399 },
400 {
401 .ctl_name = CTL_UNNUMBERED,
402 .procname = "nsm_use_hostnames",
403 .data = &nsm_use_hostnames,
404 .maxlen = sizeof(int),
405 .mode = 0644,
406 .proc_handler = &proc_dointvec,
407 },
408 {
409 .ctl_name = CTL_UNNUMBERED,
410 .procname = "nsm_local_state",
411 .data = &nsm_local_state,
412 .maxlen = sizeof(int),
413 .mode = 0644,
414 .proc_handler = &proc_dointvec,
415 },
398 { .ctl_name = 0 } 416 { .ctl_name = 0 }
399}; 417};
400 418
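Both new knobs are plain integers served by proc_dointvec, so once lockd registers its table they can be flipped at run time through procfs; on kernels of this vintage the lockd sysctls live under /proc/sys/fs/nfs/ (treat the exact path as an assumption and verify on your system). A trivial userspace toggle:

#include <stdio.h>

int main(void)
{
	/* Path assumes the usual lockd sysctl directory. */
	FILE *f = fopen("/proc/sys/fs/nfs/nsm_use_hostnames", "w");

	if (!f) {
		perror("nsm_use_hostnames");
		return 1;
	}
	fputs("1\n", f);	/* match NSM handles by hostname */
	return fclose(f) ? 1 : 0;
}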
@@ -483,6 +501,7 @@ module_param_call(nlm_udpport, param_set_port, param_get_int,
483 &nlm_udpport, 0644); 501 &nlm_udpport, 0644);
484module_param_call(nlm_tcpport, param_set_port, param_get_int, 502module_param_call(nlm_tcpport, param_set_port, param_get_int,
485 &nlm_tcpport, 0644); 503 &nlm_tcpport, 0644);
504module_param(nsm_use_hostnames, bool, 0644);
486 505
487/* 506/*
488 * Initialising and terminating the module. 507 * Initialising and terminating the module.
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index a2dd9ccb9b32..0ce5c81ff507 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -24,22 +24,22 @@
24/* 24/*
25 * Obtain client and file from arguments 25 * Obtain client and file from arguments
26 */ 26 */
27static u32 27static __be32
28nlm4svc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp, 28nlm4svc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp,
29 struct nlm_host **hostp, struct nlm_file **filp) 29 struct nlm_host **hostp, struct nlm_file **filp)
30{ 30{
31 struct nlm_host *host = NULL; 31 struct nlm_host *host = NULL;
32 struct nlm_file *file = NULL; 32 struct nlm_file *file = NULL;
33 struct nlm_lock *lock = &argp->lock; 33 struct nlm_lock *lock = &argp->lock;
34 u32 error = 0; 34 __be32 error = 0;
35 35
36 /* nfsd callbacks must have been installed for this procedure */ 36 /* nfsd callbacks must have been installed for this procedure */
37 if (!nlmsvc_ops) 37 if (!nlmsvc_ops)
38 return nlm_lck_denied_nolocks; 38 return nlm_lck_denied_nolocks;
39 39
40 /* Obtain host handle */ 40 /* Obtain host handle */
41 if (!(host = nlmsvc_lookup_host(rqstp)) 41 if (!(host = nlmsvc_lookup_host(rqstp, lock->caller, lock->len))
42 || (argp->monitor && !host->h_monitored && nsm_monitor(host) < 0)) 42 || (argp->monitor && nsm_monitor(host) < 0))
43 goto no_locks; 43 goto no_locks;
44 *hostp = host; 44 *hostp = host;
45 45
@@ -68,7 +68,7 @@ no_locks:
68/* 68/*
69 * NULL: Test for presence of service 69 * NULL: Test for presence of service
70 */ 70 */
71static int 71static __be32
72nlm4svc_proc_null(struct svc_rqst *rqstp, void *argp, void *resp) 72nlm4svc_proc_null(struct svc_rqst *rqstp, void *argp, void *resp)
73{ 73{
74 dprintk("lockd: NULL called\n"); 74 dprintk("lockd: NULL called\n");
@@ -78,7 +78,7 @@ nlm4svc_proc_null(struct svc_rqst *rqstp, void *argp, void *resp)
78/* 78/*
79 * TEST: Check for conflicting lock 79 * TEST: Check for conflicting lock
80 */ 80 */
81static int 81static __be32
82nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp, 82nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp,
83 struct nlm_res *resp) 83 struct nlm_res *resp)
84{ 84{
@@ -96,7 +96,7 @@ nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp,
96 96
97 /* Obtain client and file */ 97 /* Obtain client and file */
98 if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file))) 98 if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file)))
99 return rpc_success; 99 return resp->status == nlm_drop_reply ? rpc_drop_reply : rpc_success;
100 100
101 /* Now check for conflicting locks */ 101 /* Now check for conflicting locks */
102 resp->status = nlmsvc_testlock(file, &argp->lock, &resp->lock); 102 resp->status = nlmsvc_testlock(file, &argp->lock, &resp->lock);
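This mapping recurs in every procedure below: when nlm4svc_retrieve_args() hands back nlm_drop_reply, the service function returns rpc_drop_reply, which tells the sunrpc layer to send no answer at all and let the client retransmit, instead of the old behaviour of always answering rpc_success with an error status inside the NLM reply. The shape of the test, isolated (a sketch; the enum values stand in for the kernel's constants):

#include <stdint.h>

enum disposition { RPC_SUCCESS, RPC_DROP_REPLY };

static enum disposition classify(uint32_t status, uint32_t nlm_drop_reply)
{
	/* Drop silently only for the one sentinel status; every other
	 * failure still travels inside a normal NLM reply. */
	return status == nlm_drop_reply ? RPC_DROP_REPLY : RPC_SUCCESS;
}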
@@ -107,7 +107,7 @@ nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp,
107 return rpc_success; 107 return rpc_success;
108} 108}
109 109
110static int 110static __be32
111nlm4svc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp, 111nlm4svc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
112 struct nlm_res *resp) 112 struct nlm_res *resp)
113{ 113{
@@ -126,7 +126,7 @@ nlm4svc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
126 126
127 /* Obtain client and file */ 127 /* Obtain client and file */
128 if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file))) 128 if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file)))
129 return rpc_success; 129 return resp->status == nlm_drop_reply ? rpc_drop_reply : rpc_success;
130 130
131#if 0 131#if 0
132 /* If supplied state doesn't match current state, we assume it's 132 /* If supplied state doesn't match current state, we assume it's
@@ -150,7 +150,7 @@ nlm4svc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
150 return rpc_success; 150 return rpc_success;
151} 151}
152 152
153static int 153static __be32
154nlm4svc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp, 154nlm4svc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp,
155 struct nlm_res *resp) 155 struct nlm_res *resp)
156{ 156{
@@ -169,7 +169,7 @@ nlm4svc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp,
169 169
170 /* Obtain client and file */ 170 /* Obtain client and file */
171 if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file))) 171 if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file)))
172 return rpc_success; 172 return resp->status == nlm_drop_reply ? rpc_drop_reply : rpc_success;
173 173
174 /* Try to cancel request. */ 174 /* Try to cancel request. */
175 resp->status = nlmsvc_cancel_blocked(file, &argp->lock); 175 resp->status = nlmsvc_cancel_blocked(file, &argp->lock);
@@ -183,7 +183,7 @@ nlm4svc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp,
183/* 183/*
184 * UNLOCK: release a lock 184 * UNLOCK: release a lock
185 */ 185 */
186static int 186static __be32
187nlm4svc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp, 187nlm4svc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp,
188 struct nlm_res *resp) 188 struct nlm_res *resp)
189{ 189{
@@ -202,7 +202,7 @@ nlm4svc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp,
202 202
203 /* Obtain client and file */ 203 /* Obtain client and file */
204 if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file))) 204 if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file)))
205 return rpc_success; 205 return resp->status == nlm_drop_reply ? rpc_drop_reply : rpc_success;
206 206
207 /* Now try to remove the lock */ 207 /* Now try to remove the lock */
208 resp->status = nlmsvc_unlock(file, &argp->lock); 208 resp->status = nlmsvc_unlock(file, &argp->lock);
@@ -217,7 +217,7 @@ nlm4svc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp,
217 * GRANTED: A server calls us to report that a process' lock request 217 * GRANTED: A server calls us to report that a process' lock request
218 * was granted 218 * was granted
219 */ 219 */
220static int 220static __be32
221nlm4svc_proc_granted(struct svc_rqst *rqstp, struct nlm_args *argp, 221nlm4svc_proc_granted(struct svc_rqst *rqstp, struct nlm_args *argp,
222 struct nlm_res *resp) 222 struct nlm_res *resp)
223{ 223{
@@ -253,14 +253,16 @@ static const struct rpc_call_ops nlm4svc_callback_ops = {
253 * because we send the callback before the reply proper. I hope this 253 * because we send the callback before the reply proper. I hope this
254 * doesn't break any clients. 254 * doesn't break any clients.
255 */ 255 */
256static int nlm4svc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_args *argp, 256static __be32 nlm4svc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_args *argp,
257 int (*func)(struct svc_rqst *, struct nlm_args *, struct nlm_res *)) 257 __be32 (*func)(struct svc_rqst *, struct nlm_args *, struct nlm_res *))
258{ 258{
259 struct nlm_host *host; 259 struct nlm_host *host;
260 struct nlm_rqst *call; 260 struct nlm_rqst *call;
261 int stat; 261 __be32 stat;
262 262
263 host = nlmsvc_lookup_host(rqstp); 263 host = nlmsvc_lookup_host(rqstp,
264 argp->lock.caller,
265 argp->lock.len);
264 if (host == NULL) 266 if (host == NULL)
265 return rpc_system_err; 267 return rpc_system_err;
266 268
@@ -280,35 +282,35 @@ static int nlm4svc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_args *a
280 return rpc_success; 282 return rpc_success;
281} 283}
282 284
283static int nlm4svc_proc_test_msg(struct svc_rqst *rqstp, struct nlm_args *argp, 285static __be32 nlm4svc_proc_test_msg(struct svc_rqst *rqstp, struct nlm_args *argp,
284 void *resp) 286 void *resp)
285{ 287{
286 dprintk("lockd: TEST_MSG called\n"); 288 dprintk("lockd: TEST_MSG called\n");
287 return nlm4svc_callback(rqstp, NLMPROC_TEST_RES, argp, nlm4svc_proc_test); 289 return nlm4svc_callback(rqstp, NLMPROC_TEST_RES, argp, nlm4svc_proc_test);
288} 290}
289 291
290static int nlm4svc_proc_lock_msg(struct svc_rqst *rqstp, struct nlm_args *argp, 292static __be32 nlm4svc_proc_lock_msg(struct svc_rqst *rqstp, struct nlm_args *argp,
291 void *resp) 293 void *resp)
292{ 294{
293 dprintk("lockd: LOCK_MSG called\n"); 295 dprintk("lockd: LOCK_MSG called\n");
294 return nlm4svc_callback(rqstp, NLMPROC_LOCK_RES, argp, nlm4svc_proc_lock); 296 return nlm4svc_callback(rqstp, NLMPROC_LOCK_RES, argp, nlm4svc_proc_lock);
295} 297}
296 298
297static int nlm4svc_proc_cancel_msg(struct svc_rqst *rqstp, struct nlm_args *argp, 299static __be32 nlm4svc_proc_cancel_msg(struct svc_rqst *rqstp, struct nlm_args *argp,
298 void *resp) 300 void *resp)
299{ 301{
300 dprintk("lockd: CANCEL_MSG called\n"); 302 dprintk("lockd: CANCEL_MSG called\n");
301 return nlm4svc_callback(rqstp, NLMPROC_CANCEL_RES, argp, nlm4svc_proc_cancel); 303 return nlm4svc_callback(rqstp, NLMPROC_CANCEL_RES, argp, nlm4svc_proc_cancel);
302} 304}
303 305
304static int nlm4svc_proc_unlock_msg(struct svc_rqst *rqstp, struct nlm_args *argp, 306static __be32 nlm4svc_proc_unlock_msg(struct svc_rqst *rqstp, struct nlm_args *argp,
305 void *resp) 307 void *resp)
306{ 308{
307 dprintk("lockd: UNLOCK_MSG called\n"); 309 dprintk("lockd: UNLOCK_MSG called\n");
308 return nlm4svc_callback(rqstp, NLMPROC_UNLOCK_RES, argp, nlm4svc_proc_unlock); 310 return nlm4svc_callback(rqstp, NLMPROC_UNLOCK_RES, argp, nlm4svc_proc_unlock);
309} 311}
310 312
311static int nlm4svc_proc_granted_msg(struct svc_rqst *rqstp, struct nlm_args *argp, 313static __be32 nlm4svc_proc_granted_msg(struct svc_rqst *rqstp, struct nlm_args *argp,
312 void *resp) 314 void *resp)
313{ 315{
314 dprintk("lockd: GRANTED_MSG called\n"); 316 dprintk("lockd: GRANTED_MSG called\n");
@@ -318,7 +320,7 @@ static int nlm4svc_proc_granted_msg(struct svc_rqst *rqstp, struct nlm_args *arg
318/* 320/*
319 * SHARE: create a DOS share or alter existing share. 321 * SHARE: create a DOS share or alter existing share.
320 */ 322 */
321static int 323static __be32
322nlm4svc_proc_share(struct svc_rqst *rqstp, struct nlm_args *argp, 324nlm4svc_proc_share(struct svc_rqst *rqstp, struct nlm_args *argp,
323 struct nlm_res *resp) 325 struct nlm_res *resp)
324{ 326{
@@ -337,7 +339,7 @@ nlm4svc_proc_share(struct svc_rqst *rqstp, struct nlm_args *argp,
337 339
338 /* Obtain client and file */ 340 /* Obtain client and file */
339 if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file))) 341 if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file)))
340 return rpc_success; 342 return resp->status == nlm_drop_reply ? rpc_drop_reply : rpc_success;
341 343
342 /* Now try to create the share */ 344 /* Now try to create the share */
343 resp->status = nlmsvc_share_file(host, file, argp); 345 resp->status = nlmsvc_share_file(host, file, argp);
@@ -351,7 +353,7 @@ nlm4svc_proc_share(struct svc_rqst *rqstp, struct nlm_args *argp,
351/* 353/*
352 * UNSHARE: Release a DOS share. 354 * UNSHARE: Release a DOS share.
353 */ 355 */
354static int 356static __be32
355nlm4svc_proc_unshare(struct svc_rqst *rqstp, struct nlm_args *argp, 357nlm4svc_proc_unshare(struct svc_rqst *rqstp, struct nlm_args *argp,
356 struct nlm_res *resp) 358 struct nlm_res *resp)
357{ 359{
@@ -370,7 +372,7 @@ nlm4svc_proc_unshare(struct svc_rqst *rqstp, struct nlm_args *argp,
370 372
371 /* Obtain client and file */ 373 /* Obtain client and file */
372 if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file))) 374 if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file)))
373 return rpc_success; 375 return resp->status == nlm_drop_reply ? rpc_drop_reply : rpc_success;
374 376
375 /* Now try to lock the file */ 377 /* Now try to lock the file */
376 resp->status = nlmsvc_unshare_file(host, file, argp); 378 resp->status = nlmsvc_unshare_file(host, file, argp);
@@ -384,7 +386,7 @@ nlm4svc_proc_unshare(struct svc_rqst *rqstp, struct nlm_args *argp,
384/* 386/*
385 * NM_LOCK: Create an unmonitored lock 387 * NM_LOCK: Create an unmonitored lock
386 */ 388 */
387static int 389static __be32
388nlm4svc_proc_nm_lock(struct svc_rqst *rqstp, struct nlm_args *argp, 390nlm4svc_proc_nm_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
389 struct nlm_res *resp) 391 struct nlm_res *resp)
390{ 392{
@@ -397,7 +399,7 @@ nlm4svc_proc_nm_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
397/* 399/*
398 * FREE_ALL: Release all locks and shares held by client 400 * FREE_ALL: Release all locks and shares held by client
399 */ 401 */
400static int 402static __be32
401nlm4svc_proc_free_all(struct svc_rqst *rqstp, struct nlm_args *argp, 403nlm4svc_proc_free_all(struct svc_rqst *rqstp, struct nlm_args *argp,
402 void *resp) 404 void *resp)
403{ 405{
@@ -415,15 +417,11 @@ nlm4svc_proc_free_all(struct svc_rqst *rqstp, struct nlm_args *argp,
415/* 417/*
416 * SM_NOTIFY: private callback from statd (not part of official NLM proto) 418 * SM_NOTIFY: private callback from statd (not part of official NLM proto)
417 */ 419 */
418static int 420static __be32
419nlm4svc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp, 421nlm4svc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp,
420 void *resp) 422 void *resp)
421{ 423{
422 struct sockaddr_in saddr = rqstp->rq_addr; 424 struct sockaddr_in saddr = rqstp->rq_addr;
423 int vers = argp->vers;
424 int prot = argp->proto >> 1;
425
426 struct nlm_host *host;
427 425
428 dprintk("lockd: SM_NOTIFY called\n"); 426 dprintk("lockd: SM_NOTIFY called\n");
429 if (saddr.sin_addr.s_addr != htonl(INADDR_LOOPBACK) 427 if (saddr.sin_addr.s_addr != htonl(INADDR_LOOPBACK)
@@ -438,28 +436,17 @@ nlm4svc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp,
438 /* Obtain the host pointer for this NFS server and try to 436 /* Obtain the host pointer for this NFS server and try to
439 * reclaim all locks we hold on this server. 437 * reclaim all locks we hold on this server.
440 */ 438 */
439 memset(&saddr, 0, sizeof(saddr));
441 saddr.sin_addr.s_addr = argp->addr; 440 saddr.sin_addr.s_addr = argp->addr;
441 nlm_host_rebooted(&saddr, argp->mon, argp->len, argp->state);
442 442
443 if ((argp->proto & 1)==0) {
444 if ((host = nlmclnt_lookup_host(&saddr, prot, vers)) != NULL) {
445 nlmclnt_recovery(host, argp->state);
446 nlm_release_host(host);
447 }
448 } else {
449 /* If we run on an NFS server, delete all locks held by the client */
450
451 if ((host = nlm_lookup_host(1, &saddr, prot, vers)) != NULL) {
452 nlmsvc_free_host_resources(host);
453 nlm_release_host(host);
454 }
455 }
456 return rpc_success; 443 return rpc_success;
457} 444}
458 445
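Worth noting in the rewritten SM_NOTIFY handler: saddr is now memset() before only the address is filled in, so the sockaddr_in passed to nlm_host_rebooted() carries no stale port, family or padding bytes from the loopback request that delivered the notification (the exact motivation is an inference from the code). Building such a lookup key in isolation (a sketch):

#include <netinet/in.h>
#include <stdint.h>
#include <string.h>

static struct sockaddr_in make_reboot_key(uint32_t s_addr)
{
	struct sockaddr_in sin;

	memset(&sin, 0, sizeof(sin));	/* no stale port/family/padding */
	sin.sin_addr.s_addr = s_addr;	/* only the address identifies the peer */
	return sin;
}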
459/* 446/*
460 * client sent a GRANTED_RES, let's remove the associated block 447 * client sent a GRANTED_RES, let's remove the associated block
461 */ 448 */
462static int 449static __be32
463nlm4svc_proc_granted_res(struct svc_rqst *rqstp, struct nlm_res *argp, 450nlm4svc_proc_granted_res(struct svc_rqst *rqstp, struct nlm_res *argp,
464 void *resp) 451 void *resp)
465{ 452{
@@ -468,7 +455,7 @@ nlm4svc_proc_granted_res(struct svc_rqst *rqstp, struct nlm_res *argp,
468 455
469 dprintk("lockd: GRANTED_RES called\n"); 456 dprintk("lockd: GRANTED_RES called\n");
470 457
471 nlmsvc_grant_reply(rqstp, &argp->cookie, argp->status); 458 nlmsvc_grant_reply(&argp->cookie, argp->status);
472 return rpc_success; 459 return rpc_success;
473} 460}
474 461
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 93c00ee7189d..7e219b938552 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -40,7 +40,7 @@
40 40
41static void nlmsvc_release_block(struct nlm_block *block); 41static void nlmsvc_release_block(struct nlm_block *block);
42static void nlmsvc_insert_block(struct nlm_block *block, unsigned long); 42static void nlmsvc_insert_block(struct nlm_block *block, unsigned long);
43static int nlmsvc_remove_block(struct nlm_block *block); 43static void nlmsvc_remove_block(struct nlm_block *block);
44 44
45static int nlmsvc_setgrantargs(struct nlm_rqst *call, struct nlm_lock *lock); 45static int nlmsvc_setgrantargs(struct nlm_rqst *call, struct nlm_lock *lock);
46static void nlmsvc_freegrantargs(struct nlm_rqst *call); 46static void nlmsvc_freegrantargs(struct nlm_rqst *call);
@@ -49,7 +49,7 @@ static const struct rpc_call_ops nlmsvc_grant_ops;
49/* 49/*
50 * The list of blocked locks to retry 50 * The list of blocked locks to retry
51 */ 51 */
52static struct nlm_block * nlm_blocked; 52static LIST_HEAD(nlm_blocked);
53 53
54/* 54/*
55 * Insert a blocked lock into the global list 55 * Insert a blocked lock into the global list
@@ -57,48 +57,44 @@ static struct nlm_block * nlm_blocked;
57static void 57static void
58nlmsvc_insert_block(struct nlm_block *block, unsigned long when) 58nlmsvc_insert_block(struct nlm_block *block, unsigned long when)
59{ 59{
60 struct nlm_block **bp, *b; 60 struct nlm_block *b;
61 struct list_head *pos;
61 62
62 dprintk("lockd: nlmsvc_insert_block(%p, %ld)\n", block, when); 63 dprintk("lockd: nlmsvc_insert_block(%p, %ld)\n", block, when);
63 kref_get(&block->b_count); 64 if (list_empty(&block->b_list)) {
64 if (block->b_queued) 65 kref_get(&block->b_count);
65 nlmsvc_remove_block(block); 66 } else {
66 bp = &nlm_blocked; 67 list_del_init(&block->b_list);
68 }
69
70 pos = &nlm_blocked;
67 if (when != NLM_NEVER) { 71 if (when != NLM_NEVER) {
68 if ((when += jiffies) == NLM_NEVER) 72 if ((when += jiffies) == NLM_NEVER)
69 when ++; 73 when ++;
70 while ((b = *bp) && time_before_eq(b->b_when,when) && b->b_when != NLM_NEVER) 74 list_for_each(pos, &nlm_blocked) {
71 bp = &b->b_next; 75 b = list_entry(pos, struct nlm_block, b_list);
72 } else 76 if (time_after(b->b_when,when) || b->b_when == NLM_NEVER)
73 while ((b = *bp) != 0) 77 break;
74 bp = &b->b_next; 78 }
79 /* On normal exit from the loop, pos == &nlm_blocked,
80 * so we will be adding to the end of the list - good
81 */
82 }
75 83
76 block->b_queued = 1; 84 list_add_tail(&block->b_list, pos);
77 block->b_when = when; 85 block->b_when = when;
78 block->b_next = b;
79 *bp = block;
80} 86}
81 87
82/* 88/*
83 * Remove a block from the global list 89 * Remove a block from the global list
84 */ 90 */
85static int 91static inline void
86nlmsvc_remove_block(struct nlm_block *block) 92nlmsvc_remove_block(struct nlm_block *block)
87{ 93{
88 struct nlm_block **bp, *b; 94 if (!list_empty(&block->b_list)) {
89 95 list_del_init(&block->b_list);
90 if (!block->b_queued) 96 nlmsvc_release_block(block);
91 return 1;
92 for (bp = &nlm_blocked; (b = *bp) != 0; bp = &b->b_next) {
93 if (b == block) {
94 *bp = block->b_next;
95 block->b_queued = 0;
96 nlmsvc_release_block(block);
97 return 1;
98 }
99 } 97 }
100
101 return 0;
102} 98}
103 99
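Two things changed shape here: nlm_blocked became a list_head, and an empty b_list now doubles as the old b_queued flag (hence list_empty() deciding whether a new reference is needed). The insertion keeps the list sorted by wake-up time with NLM_NEVER entries parked at the tail, so the retry loop only ever needs to look at the head. The same sorted insert over a bare circular list with a head sentinel (userspace model, not the kernel's list.h):

struct node { struct node *prev, *next; long when; };

/* head is a circular sentinel: head->next == head when the list is empty */
static void insert_sorted(struct node *head, struct node *b, long never)
{
	struct node *pos;

	for (pos = head->next; pos != head; pos = pos->next) {
		if (b->when != never &&
		    (pos->when > b->when || pos->when == never))
			break;	/* first entry that wakes up later than b */
	}
	/* link b just before pos; pos == head appends at the tail */
	b->prev = pos->prev;
	b->next = pos;
	pos->prev->next = b;
	pos->prev = b;
}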
104/* 100/*
@@ -107,14 +103,14 @@ nlmsvc_remove_block(struct nlm_block *block)
107static struct nlm_block * 103static struct nlm_block *
108nlmsvc_lookup_block(struct nlm_file *file, struct nlm_lock *lock) 104nlmsvc_lookup_block(struct nlm_file *file, struct nlm_lock *lock)
109{ 105{
110 struct nlm_block **head, *block; 106 struct nlm_block *block;
111 struct file_lock *fl; 107 struct file_lock *fl;
112 108
113 dprintk("lockd: nlmsvc_lookup_block f=%p pd=%d %Ld-%Ld ty=%d\n", 109 dprintk("lockd: nlmsvc_lookup_block f=%p pd=%d %Ld-%Ld ty=%d\n",
114 file, lock->fl.fl_pid, 110 file, lock->fl.fl_pid,
115 (long long)lock->fl.fl_start, 111 (long long)lock->fl.fl_start,
116 (long long)lock->fl.fl_end, lock->fl.fl_type); 112 (long long)lock->fl.fl_end, lock->fl.fl_type);
117 for (head = &nlm_blocked; (block = *head) != 0; head = &block->b_next) { 113 list_for_each_entry(block, &nlm_blocked, b_list) {
118 fl = &block->b_call->a_args.lock.fl; 114 fl = &block->b_call->a_args.lock.fl;
119 dprintk("lockd: check f=%p pd=%d %Ld-%Ld ty=%d cookie=%s\n", 115 dprintk("lockd: check f=%p pd=%d %Ld-%Ld ty=%d cookie=%s\n",
120 block->b_file, fl->fl_pid, 116 block->b_file, fl->fl_pid,
@@ -143,20 +139,20 @@ static inline int nlm_cookie_match(struct nlm_cookie *a, struct nlm_cookie *b)
143 * Find a block with a given NLM cookie. 139 * Find a block with a given NLM cookie.
144 */ 140 */
145static inline struct nlm_block * 141static inline struct nlm_block *
146nlmsvc_find_block(struct nlm_cookie *cookie, struct sockaddr_in *sin) 142nlmsvc_find_block(struct nlm_cookie *cookie)
147{ 143{
148 struct nlm_block *block; 144 struct nlm_block *block;
149 145
150 for (block = nlm_blocked; block; block = block->b_next) { 146 list_for_each_entry(block, &nlm_blocked, b_list) {
151 dprintk("cookie: head of blocked queue %p, block %p\n", 147 if (nlm_cookie_match(&block->b_call->a_args.cookie,cookie))
152 nlm_blocked, block); 148 goto found;
153 if (nlm_cookie_match(&block->b_call->a_args.cookie,cookie)
154 && nlm_cmp_addr(sin, &block->b_host->h_addr))
155 break;
156 } 149 }
157 150
158 if (block != NULL) 151 return NULL;
159 kref_get(&block->b_count); 152
153found:
154 dprintk("nlmsvc_find_block(%s): block=%p\n", nlmdbg_cookie2a(cookie), block);
155 kref_get(&block->b_count);
160 return block; 156 return block;
161} 157}
162 158
@@ -169,6 +165,11 @@ nlmsvc_find_block(struct nlm_cookie *cookie, struct sockaddr_in *sin)
169 * request, but (as I found out later) that's because some implementations 165 * request, but (as I found out later) that's because some implementations
170 * do just this. Never mind the standards committees, they support our 166 * do just this. Never mind the standards committees, they support our
171 * logging industries. 167 * logging industries.
168 *
169 * 10 years later: I hope we can safely ignore these old and broken
170 * clients by now. Let's fix this so we can uniquely identify an incoming
171 * GRANTED_RES message by cookie, without having to rely on the client's IP
172 * address. --okir
172 */ 173 */
173static inline struct nlm_block * 174static inline struct nlm_block *
174nlmsvc_create_block(struct svc_rqst *rqstp, struct nlm_file *file, 175nlmsvc_create_block(struct svc_rqst *rqstp, struct nlm_file *file,
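With blocks now keyed by a cookie the server generates itself (nlmclnt_next_cookie() in the hunk below) rather than by whatever cookie the client sent, GRANTED_RES matching no longer needs the client's IP address, which is what okir's comment above is alluding to. The essence of a monotonic server-side cookie (a sketch; the kernel version lives in lockd's client code and runs under its own locking):

#include <stdint.h>
#include <string.h>

struct cookie { unsigned char data[8]; unsigned int len; };

static void next_cookie(struct cookie *c)
{
	static uint32_t seq;	/* needs a lock if called concurrently */

	seq++;
	memcpy(c->data, &seq, sizeof(seq));	/* opaque on the wire */
	c->len = sizeof(seq);
}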
@@ -179,7 +180,7 @@ nlmsvc_create_block(struct svc_rqst *rqstp, struct nlm_file *file,
179 struct nlm_rqst *call = NULL; 180 struct nlm_rqst *call = NULL;
180 181
181 /* Create host handle for callback */ 182 /* Create host handle for callback */
182 host = nlmsvc_lookup_host(rqstp); 183 host = nlmsvc_lookup_host(rqstp, lock->caller, lock->len);
183 if (host == NULL) 184 if (host == NULL)
184 return NULL; 185 return NULL;
185 186
@@ -192,6 +193,8 @@ nlmsvc_create_block(struct svc_rqst *rqstp, struct nlm_file *file,
192 if (block == NULL) 193 if (block == NULL)
193 goto failed; 194 goto failed;
194 kref_init(&block->b_count); 195 kref_init(&block->b_count);
196 INIT_LIST_HEAD(&block->b_list);
197 INIT_LIST_HEAD(&block->b_flist);
195 198
196 if (!nlmsvc_setgrantargs(call, lock)) 199 if (!nlmsvc_setgrantargs(call, lock))
197 goto failed_free; 200 goto failed_free;
@@ -199,7 +202,7 @@ nlmsvc_create_block(struct svc_rqst *rqstp, struct nlm_file *file,
199 /* Set notifier function for VFS, and init args */ 202 /* Set notifier function for VFS, and init args */
200 call->a_args.lock.fl.fl_flags |= FL_SLEEP; 203 call->a_args.lock.fl.fl_flags |= FL_SLEEP;
201 call->a_args.lock.fl.fl_lmops = &nlmsvc_lock_operations; 204 call->a_args.lock.fl.fl_lmops = &nlmsvc_lock_operations;
202 call->a_args.cookie = *cookie; /* see above */ 205 nlmclnt_next_cookie(&call->a_args.cookie);
203 206
204 dprintk("lockd: created block %p...\n", block); 207 dprintk("lockd: created block %p...\n", block);
205 208
@@ -210,8 +213,7 @@ nlmsvc_create_block(struct svc_rqst *rqstp, struct nlm_file *file,
210 file->f_count++; 213 file->f_count++;
211 214
212 /* Add to file's list of blocks */ 215 /* Add to file's list of blocks */
213 block->b_fnext = file->f_blocks; 216 list_add(&block->b_flist, &file->f_blocks);
214 file->f_blocks = block;
215 217
216 /* Set up RPC arguments for callback */ 218 /* Set up RPC arguments for callback */
217 block->b_call = call; 219 block->b_call = call;
@@ -248,19 +250,13 @@ static void nlmsvc_free_block(struct kref *kref)
248{ 250{
249 struct nlm_block *block = container_of(kref, struct nlm_block, b_count); 251 struct nlm_block *block = container_of(kref, struct nlm_block, b_count);
250 struct nlm_file *file = block->b_file; 252 struct nlm_file *file = block->b_file;
251 struct nlm_block **bp;
252 253
253 dprintk("lockd: freeing block %p...\n", block); 254 dprintk("lockd: freeing block %p...\n", block);
254 255
255 down(&file->f_sema);
256 /* Remove block from file's list of blocks */ 256 /* Remove block from file's list of blocks */
257 for (bp = &file->f_blocks; *bp; bp = &(*bp)->b_fnext) { 257 mutex_lock(&file->f_mutex);
258 if (*bp == block) { 258 list_del_init(&block->b_flist);
259 *bp = block->b_fnext; 259 mutex_unlock(&file->f_mutex);
260 break;
261 }
262 }
263 up(&file->f_sema);
264 260
265 nlmsvc_freegrantargs(block->b_call); 261 nlmsvc_freegrantargs(block->b_call);
266 nlm_release_call(block->b_call); 262 nlm_release_call(block->b_call);
@@ -274,47 +270,32 @@ static void nlmsvc_release_block(struct nlm_block *block)
274 kref_put(&block->b_count, nlmsvc_free_block); 270 kref_put(&block->b_count, nlmsvc_free_block);
275} 271}
276 272
277static void nlmsvc_act_mark(struct nlm_host *host, struct nlm_file *file) 273/*
278{ 274 * Loop over all blocks and delete blocks held by
279 struct nlm_block *block; 275 * a matching host.
280 276 */
281 down(&file->f_sema); 277void nlmsvc_traverse_blocks(struct nlm_host *host,
282 for (block = file->f_blocks; block != NULL; block = block->b_fnext) 278 struct nlm_file *file,
283 block->b_host->h_inuse = 1; 279 nlm_host_match_fn_t match)
284 up(&file->f_sema);
285}
286
287static void nlmsvc_act_unlock(struct nlm_host *host, struct nlm_file *file)
288{ 280{
289 struct nlm_block *block; 281 struct nlm_block *block, *next;
290 282
291restart: 283restart:
292 down(&file->f_sema); 284 mutex_lock(&file->f_mutex);
293 for (block = file->f_blocks; block != NULL; block = block->b_fnext) { 285 list_for_each_entry_safe(block, next, &file->f_blocks, b_flist) {
294 if (host != NULL && host != block->b_host) 286 if (!match(block->b_host, host))
295 continue; 287 continue;
296 if (!block->b_queued) 288 /* Do not destroy blocks that are not on
289 * the global retry list - why? */
290 if (list_empty(&block->b_list))
297 continue; 291 continue;
298 kref_get(&block->b_count); 292 kref_get(&block->b_count);
299 up(&file->f_sema); 293 mutex_unlock(&file->f_mutex);
300 nlmsvc_unlink_block(block); 294 nlmsvc_unlink_block(block);
301 nlmsvc_release_block(block); 295 nlmsvc_release_block(block);
302 goto restart; 296 goto restart;
303 } 297 }
304 up(&file->f_sema); 298 mutex_unlock(&file->f_mutex);
305}
306
307/*
308 * Loop over all blocks and perform the action specified.
309 * (NLM_ACT_CHECK handled by nlmsvc_inspect_file).
310 */
311void
312nlmsvc_traverse_blocks(struct nlm_host *host, struct nlm_file *file, int action)
313{
314 if (action == NLM_ACT_MARK)
315 nlmsvc_act_mark(host, file);
316 else
317 nlmsvc_act_unlock(host, file);
318} 299}
319 300
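The NLM_ACT_MARK / NLM_ACT_UNLOCK action codes are gone: marking gets its own trivial loop elsewhere, and destruction is now parameterised by an nlm_host_match_fn_t, so one traversal can serve "everything for this host", "everything", and "everything whose host shares an NSM handle". The callback shape, spelled out (a sketch; names other than the typedef's intent are illustrative):

#include <stdbool.h>

struct host;	/* opaque here */
typedef bool (*host_match_fn_t)(struct host *candidate, struct host *ref);

static bool match_any(struct host *c, struct host *ref)
{
	(void)c; (void)ref;
	return true;		/* free_all: every block matches */
}

static bool match_exact(struct host *c, struct host *ref)
{
	return c == ref;	/* free only one host's blocks */
}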
320/* 301/*
@@ -353,13 +334,13 @@ static void nlmsvc_freegrantargs(struct nlm_rqst *call)
353 * Attempt to establish a lock, and if it can't be granted, block it 334 * Attempt to establish a lock, and if it can't be granted, block it
354 * if required. 335 * if required.
355 */ 336 */
356u32 337__be32
357nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, 338nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
358 struct nlm_lock *lock, int wait, struct nlm_cookie *cookie) 339 struct nlm_lock *lock, int wait, struct nlm_cookie *cookie)
359{ 340{
360 struct nlm_block *block, *newblock = NULL; 341 struct nlm_block *block, *newblock = NULL;
361 int error; 342 int error;
362 u32 ret; 343 __be32 ret;
363 344
364 dprintk("lockd: nlmsvc_lock(%s/%ld, ty=%d, pi=%d, %Ld-%Ld, bl=%d)\n", 345 dprintk("lockd: nlmsvc_lock(%s/%ld, ty=%d, pi=%d, %Ld-%Ld, bl=%d)\n",
365 file->f_file->f_dentry->d_inode->i_sb->s_id, 346 file->f_file->f_dentry->d_inode->i_sb->s_id,
@@ -373,7 +354,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
373 lock->fl.fl_flags &= ~FL_SLEEP; 354 lock->fl.fl_flags &= ~FL_SLEEP;
374again: 355again:
375 /* Lock file against concurrent access */ 356 /* Lock file against concurrent access */
376 down(&file->f_sema); 357 mutex_lock(&file->f_mutex);
377 /* Get existing block (in case client is busy-waiting) */ 358 /* Get existing block (in case client is busy-waiting) */
378 block = nlmsvc_lookup_block(file, lock); 359 block = nlmsvc_lookup_block(file, lock);
379 if (block == NULL) { 360 if (block == NULL) {
@@ -411,10 +392,10 @@ again:
411 392
412 /* If we don't have a block, create and initialize it. Then 393 /* If we don't have a block, create and initialize it. Then
413 * retry because we may have slept in kmalloc. */ 394 * retry because we may have slept in kmalloc. */
414 /* We have to release f_sema as nlmsvc_create_block may try 395 /* We have to release f_mutex as nlmsvc_create_block may try
415 * to claim it while doing host garbage collection */ 396 * to claim it while doing host garbage collection */
416 if (newblock == NULL) { 397 if (newblock == NULL) {
417 up(&file->f_sema); 398 mutex_unlock(&file->f_mutex);
418 dprintk("lockd: blocking on this lock (allocating).\n"); 399 dprintk("lockd: blocking on this lock (allocating).\n");
419 if (!(newblock = nlmsvc_create_block(rqstp, file, lock, cookie))) 400 if (!(newblock = nlmsvc_create_block(rqstp, file, lock, cookie)))
420 return nlm_lck_denied_nolocks; 401 return nlm_lck_denied_nolocks;
@@ -424,7 +405,7 @@ again:
424 /* Append to list of blocked */ 405 /* Append to list of blocked */
425 nlmsvc_insert_block(newblock, NLM_NEVER); 406 nlmsvc_insert_block(newblock, NLM_NEVER);
426out: 407out:
427 up(&file->f_sema); 408 mutex_unlock(&file->f_mutex);
428 nlmsvc_release_block(newblock); 409 nlmsvc_release_block(newblock);
429 nlmsvc_release_block(block); 410 nlmsvc_release_block(block);
430 dprintk("lockd: nlmsvc_lock returned %u\n", ret); 411 dprintk("lockd: nlmsvc_lock returned %u\n", ret);
@@ -434,7 +415,7 @@ out:
434/* 415/*
435 * Test for presence of a conflicting lock. 416 * Test for presence of a conflicting lock.
436 */ 417 */
437u32 418__be32
438nlmsvc_testlock(struct nlm_file *file, struct nlm_lock *lock, 419nlmsvc_testlock(struct nlm_file *file, struct nlm_lock *lock,
439 struct nlm_lock *conflock) 420 struct nlm_lock *conflock)
440{ 421{
@@ -451,6 +432,7 @@ nlmsvc_testlock(struct nlm_file *file, struct nlm_lock *lock,
451 (long long)conflock->fl.fl_start, 432 (long long)conflock->fl.fl_start,
452 (long long)conflock->fl.fl_end); 433 (long long)conflock->fl.fl_end);
453 conflock->caller = "somehost"; /* FIXME */ 434 conflock->caller = "somehost"; /* FIXME */
435 conflock->len = strlen(conflock->caller);
454 conflock->oh.len = 0; /* don't return OH info */ 436 conflock->oh.len = 0; /* don't return OH info */
455 conflock->svid = conflock->fl.fl_pid; 437 conflock->svid = conflock->fl.fl_pid;
456 return nlm_lck_denied; 438 return nlm_lck_denied;
@@ -466,7 +448,7 @@ nlmsvc_testlock(struct nlm_file *file, struct nlm_lock *lock,
466 * afterwards. In this case the block will still be there, and hence 448 * afterwards. In this case the block will still be there, and hence
467 * must be removed. 449 * must be removed.
468 */ 450 */
469u32 451__be32
470nlmsvc_unlock(struct nlm_file *file, struct nlm_lock *lock) 452nlmsvc_unlock(struct nlm_file *file, struct nlm_lock *lock)
471{ 453{
472 int error; 454 int error;
@@ -494,7 +476,7 @@ nlmsvc_unlock(struct nlm_file *file, struct nlm_lock *lock)
494 * be in progress. 476 * be in progress.
495 * The calling procedure must check whether the file can be closed. 477 * The calling procedure must check whether the file can be closed.
496 */ 478 */
497u32 479__be32
498nlmsvc_cancel_blocked(struct nlm_file *file, struct nlm_lock *lock) 480nlmsvc_cancel_blocked(struct nlm_file *file, struct nlm_lock *lock)
499{ 481{
500 struct nlm_block *block; 482 struct nlm_block *block;
@@ -507,9 +489,9 @@ nlmsvc_cancel_blocked(struct nlm_file *file, struct nlm_lock *lock)
507 (long long)lock->fl.fl_start, 489 (long long)lock->fl.fl_start,
508 (long long)lock->fl.fl_end); 490 (long long)lock->fl.fl_end);
509 491
510 down(&file->f_sema); 492 mutex_lock(&file->f_mutex);
511 block = nlmsvc_lookup_block(file, lock); 493 block = nlmsvc_lookup_block(file, lock);
512 up(&file->f_sema); 494 mutex_unlock(&file->f_mutex);
513 if (block != NULL) { 495 if (block != NULL) {
514 status = nlmsvc_unlink_block(block); 496 status = nlmsvc_unlink_block(block);
515 nlmsvc_release_block(block); 497 nlmsvc_release_block(block);
@@ -527,10 +509,10 @@ nlmsvc_cancel_blocked(struct nlm_file *file, struct nlm_lock *lock)
527static void 509static void
528nlmsvc_notify_blocked(struct file_lock *fl) 510nlmsvc_notify_blocked(struct file_lock *fl)
529{ 511{
530 struct nlm_block **bp, *block; 512 struct nlm_block *block;
531 513
532 dprintk("lockd: VFS unblock notification for block %p\n", fl); 514 dprintk("lockd: VFS unblock notification for block %p\n", fl);
533 for (bp = &nlm_blocked; (block = *bp) != 0; bp = &block->b_next) { 515 list_for_each_entry(block, &nlm_blocked, b_list) {
534 if (nlm_compare_locks(&block->b_call->a_args.lock.fl, fl)) { 516 if (nlm_compare_locks(&block->b_call->a_args.lock.fl, fl)) {
535 nlmsvc_insert_block(block, 0); 517 nlmsvc_insert_block(block, 0);
536 svc_wake_up(block->b_daemon); 518 svc_wake_up(block->b_daemon);
@@ -663,17 +645,14 @@ static const struct rpc_call_ops nlmsvc_grant_ops = {
663 * block. 645 * block.
664 */ 646 */
665void 647void
666nlmsvc_grant_reply(struct svc_rqst *rqstp, struct nlm_cookie *cookie, u32 status) 648nlmsvc_grant_reply(struct nlm_cookie *cookie, u32 status)
667{ 649{
668 struct nlm_block *block; 650 struct nlm_block *block;
669 struct nlm_file *file;
670 651
671 dprintk("grant_reply: looking for cookie %x, host (%08x), s=%d \n", 652 dprintk("grant_reply: looking for cookie %x, s=%d \n",
672 *(unsigned int *)(cookie->data), 653 *(unsigned int *)(cookie->data), status);
673 ntohl(rqstp->rq_addr.sin_addr.s_addr), status); 654 if (!(block = nlmsvc_find_block(cookie)))
674 if (!(block = nlmsvc_find_block(cookie, &rqstp->rq_addr)))
675 return; 655 return;
676 file = block->b_file;
677 656
678 if (block) { 657 if (block) {
679 if (status == NLM_LCK_DENIED_GRACE_PERIOD) { 658 if (status == NLM_LCK_DENIED_GRACE_PERIOD) {
@@ -696,16 +675,19 @@ nlmsvc_grant_reply(struct svc_rqst *rqstp, struct nlm_cookie *cookie, u32 status
696unsigned long 675unsigned long
697nlmsvc_retry_blocked(void) 676nlmsvc_retry_blocked(void)
698{ 677{
699 struct nlm_block *block; 678 unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
679 struct nlm_block *block;
680
681 while (!list_empty(&nlm_blocked)) {
682 block = list_entry(nlm_blocked.next, struct nlm_block, b_list);
700 683
701 dprintk("nlmsvc_retry_blocked(%p, when=%ld)\n",
702 nlm_blocked,
703 nlm_blocked? nlm_blocked->b_when : 0);
704 while ((block = nlm_blocked) != 0) {
705 if (block->b_when == NLM_NEVER) 684 if (block->b_when == NLM_NEVER)
706 break; 685 break;
707 if (time_after(block->b_when,jiffies)) 686 if (time_after(block->b_when,jiffies)) {
687 timeout = block->b_when - jiffies;
708 break; 688 break;
689 }
690
709 dprintk("nlmsvc_retry_blocked(%p, when=%ld)\n", 691 dprintk("nlmsvc_retry_blocked(%p, when=%ld)\n",
710 block, block->b_when); 692 block, block->b_when);
711 kref_get(&block->b_count); 693 kref_get(&block->b_count);
@@ -713,8 +695,5 @@ nlmsvc_retry_blocked(void)
713 nlmsvc_release_block(block); 695 nlmsvc_release_block(block);
714 } 696 }
715 697
716 if ((block = nlm_blocked) && block->b_when != NLM_NEVER) 698 return timeout;
717 return (block->b_when - jiffies);
718
719 return MAX_SCHEDULE_TIMEOUT;
720} 699}
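The list rewrite pays off here: because nlm_blocked is sorted, nlmsvc_retry_blocked() just pops expired entries off the head and can report exactly how long lockd's main loop may sleep before the next deadline, instead of the old rescan-and-guess over a hand-rolled chain. The control flow in miniature (userspace model; retry() is expected to unlink or requeue the entry, as the grant path does in the kernel):

#include <limits.h>

struct node { struct node *prev, *next; long when; };

static long retry_expired(struct node *head, long now, long never,
			  void (*retry)(struct node *))
{
	while (head->next != head) {
		struct node *b = head->next;	/* earliest deadline first */

		if (b->when == never)
			break;			/* only unexpirable entries left */
		if (b->when > now)
			return b->when - now;	/* sleep until this one fires */
		retry(b);	/* must unlink or requeue b, or we spin */
	}
	return LONG_MAX;	/* nothing pending: sleep indefinitely */
}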
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index dbb66a3b5cd9..32e99a6e8dca 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -22,8 +22,8 @@
22#define NLMDBG_FACILITY NLMDBG_CLIENT 22#define NLMDBG_FACILITY NLMDBG_CLIENT
23 23
24#ifdef CONFIG_LOCKD_V4 24#ifdef CONFIG_LOCKD_V4
25static u32 25static __be32
26cast_to_nlm(u32 status, u32 vers) 26cast_to_nlm(__be32 status, u32 vers)
27{ 27{
28 /* Note: status is assumed to be in network byte order !!! */ 28 /* Note: status is assumed to be in network byte order !!! */
29 if (vers != 4){ 29 if (vers != 4){
@@ -52,22 +52,22 @@ cast_to_nlm(u32 status, u32 vers)
52/* 52/*
53 * Obtain client and file from arguments 53 * Obtain client and file from arguments
54 */ 54 */
55static u32 55static __be32
56nlmsvc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp, 56nlmsvc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp,
57 struct nlm_host **hostp, struct nlm_file **filp) 57 struct nlm_host **hostp, struct nlm_file **filp)
58{ 58{
59 struct nlm_host *host = NULL; 59 struct nlm_host *host = NULL;
60 struct nlm_file *file = NULL; 60 struct nlm_file *file = NULL;
61 struct nlm_lock *lock = &argp->lock; 61 struct nlm_lock *lock = &argp->lock;
62 u32 error; 62 __be32 error = 0;
63 63
64 /* nfsd callbacks must have been installed for this procedure */ 64 /* nfsd callbacks must have been installed for this procedure */
65 if (!nlmsvc_ops) 65 if (!nlmsvc_ops)
66 return nlm_lck_denied_nolocks; 66 return nlm_lck_denied_nolocks;
67 67
68 /* Obtain host handle */ 68 /* Obtain host handle */
69 if (!(host = nlmsvc_lookup_host(rqstp)) 69 if (!(host = nlmsvc_lookup_host(rqstp, lock->caller, lock->len))
70 || (argp->monitor && !host->h_monitored && nsm_monitor(host) < 0)) 70 || (argp->monitor && nsm_monitor(host) < 0))
71 goto no_locks; 71 goto no_locks;
72 *hostp = host; 72 *hostp = host;
73 73
@@ -88,13 +88,15 @@ nlmsvc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp,
88no_locks: 88no_locks:
89 if (host) 89 if (host)
90 nlm_release_host(host); 90 nlm_release_host(host);
91 if (error)
92 return error;
91 return nlm_lck_denied_nolocks; 93 return nlm_lck_denied_nolocks;
92} 94}
93 95
94/* 96/*
95 * NULL: Test for presence of service 97 * NULL: Test for presence of service
96 */ 98 */
97static int 99static __be32
98nlmsvc_proc_null(struct svc_rqst *rqstp, void *argp, void *resp) 100nlmsvc_proc_null(struct svc_rqst *rqstp, void *argp, void *resp)
99{ 101{
100 dprintk("lockd: NULL called\n"); 102 dprintk("lockd: NULL called\n");
@@ -104,7 +106,7 @@ nlmsvc_proc_null(struct svc_rqst *rqstp, void *argp, void *resp)
104/* 106/*
105 * TEST: Check for conflicting lock 107 * TEST: Check for conflicting lock
106 */ 108 */
107static int 109static __be32
108nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp, 110nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp,
109 struct nlm_res *resp) 111 struct nlm_res *resp)
110{ 112{
@@ -122,7 +124,7 @@ nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp,
122 124
123 /* Obtain client and file */ 125 /* Obtain client and file */
124 if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file))) 126 if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file)))
125 return rpc_success; 127 return resp->status == nlm_drop_reply ? rpc_drop_reply : rpc_success;
126 128
127 /* Now check for conflicting locks */ 129 /* Now check for conflicting locks */
128 resp->status = cast_status(nlmsvc_testlock(file, &argp->lock, &resp->lock)); 130 resp->status = cast_status(nlmsvc_testlock(file, &argp->lock, &resp->lock));
@@ -134,7 +136,7 @@ nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp,
134 return rpc_success; 136 return rpc_success;
135} 137}
136 138
137static int 139static __be32
138nlmsvc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp, 140nlmsvc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
139 struct nlm_res *resp) 141 struct nlm_res *resp)
140{ 142{
@@ -153,7 +155,7 @@ nlmsvc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
153 155
154 /* Obtain client and file */ 156 /* Obtain client and file */
155 if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file))) 157 if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file)))
156 return rpc_success; 158 return resp->status == nlm_drop_reply ? rpc_drop_reply : rpc_success;
157 159
158#if 0 160#if 0
159 /* If supplied state doesn't match current state, we assume it's 161 /* If supplied state doesn't match current state, we assume it's
@@ -177,7 +179,7 @@ nlmsvc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
177 return rpc_success; 179 return rpc_success;
178} 180}
179 181
180static int 182static __be32
181nlmsvc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp, 183nlmsvc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp,
182 struct nlm_res *resp) 184 struct nlm_res *resp)
183{ 185{
@@ -196,7 +198,7 @@ nlmsvc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp,
196 198
197 /* Obtain client and file */ 199 /* Obtain client and file */
198 if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file))) 200 if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file)))
199 return rpc_success; 201 return resp->status == nlm_drop_reply ? rpc_drop_reply : rpc_success;
200 202
201 /* Try to cancel request. */ 203 /* Try to cancel request. */
202 resp->status = cast_status(nlmsvc_cancel_blocked(file, &argp->lock)); 204 resp->status = cast_status(nlmsvc_cancel_blocked(file, &argp->lock));
@@ -210,7 +212,7 @@ nlmsvc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp,
210/* 212/*
211 * UNLOCK: release a lock 213 * UNLOCK: release a lock
212 */ 214 */
213static int 215static __be32
214nlmsvc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp, 216nlmsvc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp,
215 struct nlm_res *resp) 217 struct nlm_res *resp)
216{ 218{
@@ -229,7 +231,7 @@ nlmsvc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp,
229 231
230 /* Obtain client and file */ 232 /* Obtain client and file */
231 if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file))) 233 if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file)))
232 return rpc_success; 234 return resp->status == nlm_drop_reply ? rpc_drop_reply : rpc_success;
233 235
234 /* Now try to remove the lock */ 236 /* Now try to remove the lock */
235 resp->status = cast_status(nlmsvc_unlock(file, &argp->lock)); 237 resp->status = cast_status(nlmsvc_unlock(file, &argp->lock));
@@ -244,7 +246,7 @@ nlmsvc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp,
244 * GRANTED: A server calls us to report that a process' lock request 246 * GRANTED: A server calls us to report that a process' lock request
245 * was granted 247 * was granted
246 */ 248 */
247static int 249static __be32
248nlmsvc_proc_granted(struct svc_rqst *rqstp, struct nlm_args *argp, 250nlmsvc_proc_granted(struct svc_rqst *rqstp, struct nlm_args *argp,
249 struct nlm_res *resp) 251 struct nlm_res *resp)
250{ 252{
@@ -280,14 +282,16 @@ static const struct rpc_call_ops nlmsvc_callback_ops = {
280 * because we send the callback before the reply proper. I hope this 282 * because we send the callback before the reply proper. I hope this
281 * doesn't break any clients. 283 * doesn't break any clients.
282 */ 284 */
283static int nlmsvc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_args *argp, 285static __be32 nlmsvc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_args *argp,
284 int (*func)(struct svc_rqst *, struct nlm_args *, struct nlm_res *)) 286 __be32 (*func)(struct svc_rqst *, struct nlm_args *, struct nlm_res *))
285{ 287{
286 struct nlm_host *host; 288 struct nlm_host *host;
287 struct nlm_rqst *call; 289 struct nlm_rqst *call;
288 int stat; 290 __be32 stat;
289 291
290 host = nlmsvc_lookup_host(rqstp); 292 host = nlmsvc_lookup_host(rqstp,
293 argp->lock.caller,
294 argp->lock.len);
291 if (host == NULL) 295 if (host == NULL)
292 return rpc_system_err; 296 return rpc_system_err;
293 297
@@ -307,28 +311,28 @@ static int nlmsvc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_args *ar
307 return rpc_success; 311 return rpc_success;
308} 312}
309 313
310static int nlmsvc_proc_test_msg(struct svc_rqst *rqstp, struct nlm_args *argp, 314static __be32 nlmsvc_proc_test_msg(struct svc_rqst *rqstp, struct nlm_args *argp,
311 void *resp) 315 void *resp)
312{ 316{
313 dprintk("lockd: TEST_MSG called\n"); 317 dprintk("lockd: TEST_MSG called\n");
314 return nlmsvc_callback(rqstp, NLMPROC_TEST_RES, argp, nlmsvc_proc_test); 318 return nlmsvc_callback(rqstp, NLMPROC_TEST_RES, argp, nlmsvc_proc_test);
315} 319}
316 320
317static int nlmsvc_proc_lock_msg(struct svc_rqst *rqstp, struct nlm_args *argp, 321static __be32 nlmsvc_proc_lock_msg(struct svc_rqst *rqstp, struct nlm_args *argp,
318 void *resp) 322 void *resp)
319{ 323{
320 dprintk("lockd: LOCK_MSG called\n"); 324 dprintk("lockd: LOCK_MSG called\n");
321 return nlmsvc_callback(rqstp, NLMPROC_LOCK_RES, argp, nlmsvc_proc_lock); 325 return nlmsvc_callback(rqstp, NLMPROC_LOCK_RES, argp, nlmsvc_proc_lock);
322} 326}
323 327
324static int nlmsvc_proc_cancel_msg(struct svc_rqst *rqstp, struct nlm_args *argp, 328static __be32 nlmsvc_proc_cancel_msg(struct svc_rqst *rqstp, struct nlm_args *argp,
325 void *resp) 329 void *resp)
326{ 330{
327 dprintk("lockd: CANCEL_MSG called\n"); 331 dprintk("lockd: CANCEL_MSG called\n");
328 return nlmsvc_callback(rqstp, NLMPROC_CANCEL_RES, argp, nlmsvc_proc_cancel); 332 return nlmsvc_callback(rqstp, NLMPROC_CANCEL_RES, argp, nlmsvc_proc_cancel);
329} 333}
330 334
331static int 335static __be32
332nlmsvc_proc_unlock_msg(struct svc_rqst *rqstp, struct nlm_args *argp, 336nlmsvc_proc_unlock_msg(struct svc_rqst *rqstp, struct nlm_args *argp,
333 void *resp) 337 void *resp)
334{ 338{
@@ -336,7 +340,7 @@ nlmsvc_proc_unlock_msg(struct svc_rqst *rqstp, struct nlm_args *argp,
336 return nlmsvc_callback(rqstp, NLMPROC_UNLOCK_RES, argp, nlmsvc_proc_unlock); 340 return nlmsvc_callback(rqstp, NLMPROC_UNLOCK_RES, argp, nlmsvc_proc_unlock);
337} 341}
338 342
339static int 343static __be32
340nlmsvc_proc_granted_msg(struct svc_rqst *rqstp, struct nlm_args *argp, 344nlmsvc_proc_granted_msg(struct svc_rqst *rqstp, struct nlm_args *argp,
341 void *resp) 345 void *resp)
342{ 346{
@@ -347,7 +351,7 @@ nlmsvc_proc_granted_msg(struct svc_rqst *rqstp, struct nlm_args *argp,
347/* 351/*
348 * SHARE: create a DOS share or alter existing share. 352 * SHARE: create a DOS share or alter existing share.
349 */ 353 */
350static int 354static __be32
351nlmsvc_proc_share(struct svc_rqst *rqstp, struct nlm_args *argp, 355nlmsvc_proc_share(struct svc_rqst *rqstp, struct nlm_args *argp,
352 struct nlm_res *resp) 356 struct nlm_res *resp)
353{ 357{
@@ -366,7 +370,7 @@ nlmsvc_proc_share(struct svc_rqst *rqstp, struct nlm_args *argp,
366 370
367 /* Obtain client and file */ 371 /* Obtain client and file */
368 if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file))) 372 if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file)))
369 	return rpc_success; 373 	return resp->status == nlm_drop_reply ? rpc_drop_reply : rpc_success;
370 374
371 /* Now try to create the share */ 375 /* Now try to create the share */
372 resp->status = cast_status(nlmsvc_share_file(host, file, argp)); 376 resp->status = cast_status(nlmsvc_share_file(host, file, argp));
@@ -380,7 +384,7 @@ nlmsvc_proc_share(struct svc_rqst *rqstp, struct nlm_args *argp,
380/* 384/*
381 * UNSHARE: Release a DOS share. 385 * UNSHARE: Release a DOS share.
382 */ 386 */
383static int 387static __be32
384nlmsvc_proc_unshare(struct svc_rqst *rqstp, struct nlm_args *argp, 388nlmsvc_proc_unshare(struct svc_rqst *rqstp, struct nlm_args *argp,
385 struct nlm_res *resp) 389 struct nlm_res *resp)
386{ 390{
@@ -399,7 +403,7 @@ nlmsvc_proc_unshare(struct svc_rqst *rqstp, struct nlm_args *argp,
399 403
400 /* Obtain client and file */ 404 /* Obtain client and file */
401 if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file))) 405 if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file)))
402 return rpc_success; 406 return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
403 407
404 /* Now try to unshare the file */ 408 /* Now try to unshare the file */
405 resp->status = cast_status(nlmsvc_unshare_file(host, file, argp)); 409 resp->status = cast_status(nlmsvc_unshare_file(host, file, argp));
@@ -413,7 +417,7 @@ nlmsvc_proc_unshare(struct svc_rqst *rqstp, struct nlm_args *argp,
413/* 417/*
414 * NM_LOCK: Create an unmonitored lock 418 * NM_LOCK: Create an unmonitored lock
415 */ 419 */
416static int 420static __be32
417nlmsvc_proc_nm_lock(struct svc_rqst *rqstp, struct nlm_args *argp, 421nlmsvc_proc_nm_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
418 struct nlm_res *resp) 422 struct nlm_res *resp)
419{ 423{
@@ -426,7 +430,7 @@ nlmsvc_proc_nm_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
426/* 430/*
427 * FREE_ALL: Release all locks and shares held by client 431 * FREE_ALL: Release all locks and shares held by client
428 */ 432 */
429static int 433static __be32
430nlmsvc_proc_free_all(struct svc_rqst *rqstp, struct nlm_args *argp, 434nlmsvc_proc_free_all(struct svc_rqst *rqstp, struct nlm_args *argp,
431 void *resp) 435 void *resp)
432{ 436{
@@ -444,14 +448,11 @@ nlmsvc_proc_free_all(struct svc_rqst *rqstp, struct nlm_args *argp,
444/* 448/*
445 * SM_NOTIFY: private callback from statd (not part of official NLM proto) 449 * SM_NOTIFY: private callback from statd (not part of official NLM proto)
446 */ 450 */
447static int 451static __be32
448nlmsvc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp, 452nlmsvc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp,
449 void *resp) 453 void *resp)
450{ 454{
451 struct sockaddr_in saddr = rqstp->rq_addr; 455 struct sockaddr_in saddr = rqstp->rq_addr;
452 int vers = argp->vers;
453 int prot = argp->proto >> 1;
454 struct nlm_host *host;
455 456
456 dprintk("lockd: SM_NOTIFY called\n"); 457 dprintk("lockd: SM_NOTIFY called\n");
457 if (saddr.sin_addr.s_addr != htonl(INADDR_LOOPBACK) 458 if (saddr.sin_addr.s_addr != htonl(INADDR_LOOPBACK)
@@ -466,19 +467,9 @@ nlmsvc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp,
466 /* Obtain the host pointer for this NFS server and try to 467 /* Obtain the host pointer for this NFS server and try to
467 * reclaim all locks we hold on this server. 468 * reclaim all locks we hold on this server.
468 */ 469 */
470 memset(&saddr, 0, sizeof(saddr));
469 saddr.sin_addr.s_addr = argp->addr; 471 saddr.sin_addr.s_addr = argp->addr;
470 if ((argp->proto & 1)==0) { 472 nlm_host_rebooted(&saddr, argp->mon, argp->len, argp->state);
471 if ((host = nlmclnt_lookup_host(&saddr, prot, vers)) != NULL) {
472 nlmclnt_recovery(host, argp->state);
473 nlm_release_host(host);
474 }
475 } else {
476 /* If we run on an NFS server, delete all locks held by the client */
477 if ((host = nlm_lookup_host(1, &saddr, prot, vers)) != NULL) {
478 nlmsvc_free_host_resources(host);
479 nlm_release_host(host);
480 }
481 }
482 473
483 return rpc_success; 474 return rpc_success;
484} 475}
@@ -486,7 +477,7 @@ nlmsvc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp,
486/* 477/*
487 * client sent a GRANTED_RES, let's remove the associated block 478 * client sent a GRANTED_RES, let's remove the associated block
488 */ 479 */
489static int 480static __be32
490nlmsvc_proc_granted_res(struct svc_rqst *rqstp, struct nlm_res *argp, 481nlmsvc_proc_granted_res(struct svc_rqst *rqstp, struct nlm_res *argp,
491 void *resp) 482 void *resp)
492{ 483{
@@ -495,7 +486,7 @@ nlmsvc_proc_granted_res(struct svc_rqst *rqstp, struct nlm_res *argp,
495 486
496 dprintk("lockd: GRANTED_RES called\n"); 487 dprintk("lockd: GRANTED_RES called\n");
497 488
498 nlmsvc_grant_reply(rqstp, &argp->cookie, argp->status); 489 nlmsvc_grant_reply(&argp->cookie, argp->status);
499 return rpc_success; 490 return rpc_success;
500} 491}
501 492
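
The recurring change in svcproc.c above is that every NLM service procedure now returns __be32 and, when nlmsvc_retrieve_args yields nlm_drop_reply, answers with rpc_drop_reply instead of rpc_success, so no RPC reply goes out while the argument lookup is still pending and the client simply retransmits. A minimal userspace sketch of that dispatch pattern follows; every name and status value in it is an illustrative stand-in, not the kernel's.

    /* Sketch of the "drop the RPC reply" pattern; illustrative names only. */
    #include <stdio.h>

    enum rpc_disposition { RPC_SUCCESS, RPC_DROP_REPLY };

    #define NLM_OK          0
    #define NLM_DROP_REPLY  99      /* stand-in sentinel: lookup still pending */

    static int retrieve_args(int lookup_pending)
    {
            /* Stand-in for nlmsvc_retrieve_args, which can report
             * nlm_drop_reply when the file lookup has not completed;
             * the caller must not answer the call at all. */
            return lookup_pending ? NLM_DROP_REPLY : NLM_OK;
    }

    static enum rpc_disposition proc_lock(int lookup_pending, int *status)
    {
            if ((*status = retrieve_args(lookup_pending)) != NLM_OK)
                    return *status == NLM_DROP_REPLY ? RPC_DROP_REPLY
                                                     : RPC_SUCCESS;
            /* ... perform the lock and set *status accordingly ... */
            return RPC_SUCCESS;
    }

    int main(void)
    {
            int status;

            if (proc_lock(1, &status) == RPC_DROP_REPLY)
                    printf("no reply sent; client will retransmit\n");
            return 0;
    }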
diff --git a/fs/lockd/svcshare.c b/fs/lockd/svcshare.c
index 27288c83da96..6220dc2a3f2c 100644
--- a/fs/lockd/svcshare.c
+++ b/fs/lockd/svcshare.c
@@ -23,7 +23,7 @@ nlm_cmp_owner(struct nlm_share *share, struct xdr_netobj *oh)
23 && !memcmp(share->s_owner.data, oh->data, oh->len); 23 && !memcmp(share->s_owner.data, oh->data, oh->len);
24} 24}
25 25
26u32 26__be32
27nlmsvc_share_file(struct nlm_host *host, struct nlm_file *file, 27nlmsvc_share_file(struct nlm_host *host, struct nlm_file *file,
28 struct nlm_args *argp) 28 struct nlm_args *argp)
29{ 29{
@@ -64,7 +64,7 @@ update:
64/* 64/*
65 * Delete a share. 65 * Delete a share.
66 */ 66 */
67u32 67__be32
68nlmsvc_unshare_file(struct nlm_host *host, struct nlm_file *file, 68nlmsvc_unshare_file(struct nlm_host *host, struct nlm_file *file,
69 struct nlm_args *argp) 69 struct nlm_args *argp)
70{ 70{
@@ -85,24 +85,20 @@ nlmsvc_unshare_file(struct nlm_host *host, struct nlm_file *file,
85} 85}
86 86
87/* 87/*
88 * Traverse all shares for a given file (and host). 88 * Traverse all shares for a given file, and delete
89 * NLM_ACT_CHECK is handled by nlmsvc_inspect_file. 89 * those owned by the given (type of) host
90 */ 90 */
91void 91void nlmsvc_traverse_shares(struct nlm_host *host, struct nlm_file *file,
92nlmsvc_traverse_shares(struct nlm_host *host, struct nlm_file *file, int action) 92 nlm_host_match_fn_t match)
93{ 93{
94 struct nlm_share *share, **shpp; 94 struct nlm_share *share, **shpp;
95 95
96 shpp = &file->f_shares; 96 shpp = &file->f_shares;
97 while ((share = *shpp) != NULL) { 97 while ((share = *shpp) != NULL) {
98 if (action == NLM_ACT_MARK) 98 if (match(share->s_host, host)) {
99 share->s_host->h_inuse = 1; 99 *shpp = share->s_next;
100 else if (action == NLM_ACT_UNLOCK) { 100 kfree(share);
101 if (host == NULL || host == share->s_host) { 101 continue;
102 *shpp = share->s_next;
103 kfree(share);
104 continue;
105 }
106 } 102 }
107 shpp = &share->s_next; 103 shpp = &share->s_next;
108 } 104 }
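
nlmsvc_traverse_shares now takes an nlm_host_match_fn_t predicate instead of the old NLM_ACT_* action codes, and unlinks matching entries with the classic pointer-to-pointer list walk. A self-contained sketch of the same idiom; the struct and function names are stand-ins, not lockd's.

    /* Predicate-driven unlink, the *shpp = share->s_next pattern above. */
    #include <stdio.h>
    #include <stdlib.h>

    struct share {
            int owner;
            struct share *next;
    };

    typedef int (*match_fn_t)(const struct share *, int key);

    static int same_owner(const struct share *s, int key)
    {
            return s->owner == key;
    }

    /* Walk with a pointer-to-pointer so a match can be unlinked in place
     * without tracking a separate "previous" node. */
    static void traverse_shares(struct share **head, int key, match_fn_t match)
    {
            struct share *s, **spp = head;

            while ((s = *spp) != NULL) {
                    if (match(s, key)) {
                            *spp = s->next;
                            free(s);
                            continue;
                    }
                    spp = &s->next;
            }
    }

    int main(void)
    {
            struct share *head = NULL;

            for (int i = 0; i < 4; i++) {
                    struct share *s = malloc(sizeof(*s));
                    s->owner = i % 2;
                    s->next = head;
                    head = s;
            }
            traverse_shares(&head, 1, same_owner);  /* drop owner 1's shares */
            for (struct share *s = head; s; s = s->next)
                    printf("owner %d survived\n", s->owner);
            return 0;
    }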
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index a92dd98f8401..e83024e16042 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -25,9 +25,9 @@
25/* 25/*
26 * Global file hash table 26 * Global file hash table
27 */ 27 */
28#define FILE_HASH_BITS 5 28#define FILE_HASH_BITS 7
29#define FILE_NRHASH (1<<FILE_HASH_BITS) 29#define FILE_NRHASH (1<<FILE_HASH_BITS)
30static struct nlm_file * nlm_files[FILE_NRHASH]; 30static struct hlist_head nlm_files[FILE_NRHASH];
31static DEFINE_MUTEX(nlm_file_mutex); 31static DEFINE_MUTEX(nlm_file_mutex);
32 32
33#ifdef NFSD_DEBUG 33#ifdef NFSD_DEBUG
@@ -78,13 +78,14 @@ static inline unsigned int file_hash(struct nfs_fh *f)
78 * This is not quite right, but for now, we assume the client performs 78 * This is not quite right, but for now, we assume the client performs
79 * the proper R/W checking. 79 * the proper R/W checking.
80 */ 80 */
81u32 81__be32
82nlm_lookup_file(struct svc_rqst *rqstp, struct nlm_file **result, 82nlm_lookup_file(struct svc_rqst *rqstp, struct nlm_file **result,
83 struct nfs_fh *f) 83 struct nfs_fh *f)
84{ 84{
85 struct hlist_node *pos;
85 struct nlm_file *file; 86 struct nlm_file *file;
86 unsigned int hash; 87 unsigned int hash;
87 u32 nfserr; 88 __be32 nfserr;
88 89
89 nlm_debug_print_fh("nlm_file_lookup", f); 90 nlm_debug_print_fh("nlm_file_lookup", f);
90 91
@@ -93,7 +94,7 @@ nlm_lookup_file(struct svc_rqst *rqstp, struct nlm_file **result,
93 /* Lock file table */ 94 /* Lock file table */
94 mutex_lock(&nlm_file_mutex); 95 mutex_lock(&nlm_file_mutex);
95 96
96 for (file = nlm_files[hash]; file; file = file->f_next) 97 hlist_for_each_entry(file, pos, &nlm_files[hash], f_list)
97 if (!nfs_compare_fh(&file->f_handle, f)) 98 if (!nfs_compare_fh(&file->f_handle, f))
98 goto found; 99 goto found;
99 100
@@ -105,8 +106,9 @@ nlm_lookup_file(struct svc_rqst *rqstp, struct nlm_file **result,
105 goto out_unlock; 106 goto out_unlock;
106 107
107 memcpy(&file->f_handle, f, sizeof(struct nfs_fh)); 108 memcpy(&file->f_handle, f, sizeof(struct nfs_fh));
108 file->f_hash = hash; 109 mutex_init(&file->f_mutex);
109 init_MUTEX(&file->f_sema); 110 INIT_HLIST_NODE(&file->f_list);
111 INIT_LIST_HEAD(&file->f_blocks);
110 112
111 /* Open the file. Note that this must not sleep for too long, else 113 /* Open the file. Note that this must not sleep for too long, else
112 * we would lock up lockd:-) So no NFS re-exports, folks. 114 * we would lock up lockd:-) So no NFS re-exports, folks.
@@ -115,12 +117,11 @@ nlm_lookup_file(struct svc_rqst *rqstp, struct nlm_file **result,
115 * the file. 117 * the file.
116 */ 118 */
117 if ((nfserr = nlmsvc_ops->fopen(rqstp, f, &file->f_file)) != 0) { 119 if ((nfserr = nlmsvc_ops->fopen(rqstp, f, &file->f_file)) != 0) {
118 dprintk("lockd: open failed (nfserr %d)\n", ntohl(nfserr)); 120 dprintk("lockd: open failed (error %d)\n", nfserr);
119 goto out_free; 121 goto out_free;
120 } 122 }
121 123
122 file->f_next = nlm_files[hash]; 124 hlist_add_head(&file->f_list, &nlm_files[hash]);
123 nlm_files[hash] = file;
124 125
125found: 126found:
126 dprintk("lockd: found file %p (count %d)\n", file, file->f_count); 127 dprintk("lockd: found file %p (count %d)\n", file, file->f_count);
@@ -134,12 +135,6 @@ out_unlock:
134 135
135out_free: 136out_free:
136 kfree(file); 137 kfree(file);
137#ifdef CONFIG_LOCKD_V4
138 if (nfserr == 1)
139 nfserr = nlm4_stale_fh;
140 else
141#endif
142 nfserr = nlm_lck_denied;
143 goto out_unlock; 138 goto out_unlock;
144} 139}
145 140
@@ -149,22 +144,14 @@ out_free:
149static inline void 144static inline void
150nlm_delete_file(struct nlm_file *file) 145nlm_delete_file(struct nlm_file *file)
151{ 146{
152 struct nlm_file **fp, *f;
153
154 nlm_debug_print_file("closing file", file); 147 nlm_debug_print_file("closing file", file);
155 148 if (!hlist_unhashed(&file->f_list)) {
156 fp = nlm_files + file->f_hash; 149 hlist_del(&file->f_list);
157 while ((f = *fp) != NULL) { 150 nlmsvc_ops->fclose(file->f_file);
158 if (f == file) { 151 kfree(file);
159 *fp = file->f_next; 152 } else {
160 nlmsvc_ops->fclose(file->f_file); 153 printk(KERN_WARNING "lockd: attempt to release unknown file!\n");
161 kfree(file);
162 return;
163 }
164 fp = &f->f_next;
165 } 154 }
166
167 printk(KERN_WARNING "lockd: attempt to release unknown file!\n");
168} 155}
169 156
170/* 157/*
@@ -172,7 +159,8 @@ nlm_delete_file(struct nlm_file *file)
172 * action. 159 * action.
173 */ 160 */
174static int 161static int
175nlm_traverse_locks(struct nlm_host *host, struct nlm_file *file, int action) 162nlm_traverse_locks(struct nlm_host *host, struct nlm_file *file,
163 nlm_host_match_fn_t match)
176{ 164{
177 struct inode *inode = nlmsvc_file_inode(file); 165 struct inode *inode = nlmsvc_file_inode(file);
178 struct file_lock *fl; 166 struct file_lock *fl;
@@ -186,17 +174,11 @@ again:
186 174
187 /* update current lock count */ 175 /* update current lock count */
188 file->f_locks++; 176 file->f_locks++;
177
189 lockhost = (struct nlm_host *) fl->fl_owner; 178 lockhost = (struct nlm_host *) fl->fl_owner;
190 if (action == NLM_ACT_MARK) 179 if (match(lockhost, host)) {
191 lockhost->h_inuse = 1;
192 else if (action == NLM_ACT_CHECK)
193 return 1;
194 else if (action == NLM_ACT_UNLOCK) {
195 struct file_lock lock = *fl; 180 struct file_lock lock = *fl;
196 181
197 if (host && lockhost != host)
198 continue;
199
200 lock.fl_type = F_UNLCK; 182 lock.fl_type = F_UNLCK;
201 lock.fl_start = 0; 183 lock.fl_start = 0;
202 lock.fl_end = OFFSET_MAX; 184 lock.fl_end = OFFSET_MAX;
@@ -213,53 +195,66 @@ again:
213} 195}
214 196
215/* 197/*
216 * Operate on a single file 198 * Inspect a single file
199 */
200static inline int
201nlm_inspect_file(struct nlm_host *host, struct nlm_file *file, nlm_host_match_fn_t match)
202{
203 nlmsvc_traverse_blocks(host, file, match);
204 nlmsvc_traverse_shares(host, file, match);
205 return nlm_traverse_locks(host, file, match);
206}
207
208/*
209 * Quick check whether there are still any locks, blocks or
210 * shares on a given file.
217 */ 211 */
218static inline int 212static inline int
219nlm_inspect_file(struct nlm_host *host, struct nlm_file *file, int action) 213nlm_file_inuse(struct nlm_file *file)
220{ 214{
221 if (action == NLM_ACT_CHECK) { 215 struct inode *inode = nlmsvc_file_inode(file);
222 /* Fast path for mark and sweep garbage collection */ 216 struct file_lock *fl;
223 if (file->f_count || file->f_blocks || file->f_shares) 217
218 if (file->f_count || !list_empty(&file->f_blocks) || file->f_shares)
219 return 1;
220
221 for (fl = inode->i_flock; fl; fl = fl->fl_next) {
222 if (fl->fl_lmops == &nlmsvc_lock_operations)
224 return 1; 223 return 1;
225 } else {
226 nlmsvc_traverse_blocks(host, file, action);
227 nlmsvc_traverse_shares(host, file, action);
228 } 224 }
229 return nlm_traverse_locks(host, file, action); 225 file->f_locks = 0;
226 return 0;
230} 227}
231 228
232/* 229/*
233 * Loop over all files in the file table. 230 * Loop over all files in the file table.
234 */ 231 */
235static int 232static int
236nlm_traverse_files(struct nlm_host *host, int action) 233nlm_traverse_files(struct nlm_host *host, nlm_host_match_fn_t match)
237{ 234{
238 struct nlm_file *file, **fp; 235 struct hlist_node *pos, *next;
236 struct nlm_file *file;
239 int i, ret = 0; 237 int i, ret = 0;
240 238
241 mutex_lock(&nlm_file_mutex); 239 mutex_lock(&nlm_file_mutex);
242 for (i = 0; i < FILE_NRHASH; i++) { 240 for (i = 0; i < FILE_NRHASH; i++) {
243 fp = nlm_files + i; 241 hlist_for_each_entry_safe(file, pos, next, &nlm_files[i], f_list) {
244 while ((file = *fp) != NULL) {
245 file->f_count++; 242 file->f_count++;
246 mutex_unlock(&nlm_file_mutex); 243 mutex_unlock(&nlm_file_mutex);
247 244
248 /* Traverse locks, blocks and shares of this file 245 /* Traverse locks, blocks and shares of this file
249 * and update file->f_locks count */ 246 * and update file->f_locks count */
250 if (nlm_inspect_file(host, file, action)) 247 if (nlm_inspect_file(host, file, match))
251 ret = 1; 248 ret = 1;
252 249
253 mutex_lock(&nlm_file_mutex); 250 mutex_lock(&nlm_file_mutex);
254 file->f_count--; 251 file->f_count--;
255 /* No more references to this file. Let go of it. */ 252 /* No more references to this file. Let go of it. */
256 if (!file->f_blocks && !file->f_locks 253 if (list_empty(&file->f_blocks) && !file->f_locks
257 && !file->f_shares && !file->f_count) { 254 && !file->f_shares && !file->f_count) {
258 *fp = file->f_next; 255 hlist_del(&file->f_list);
259 nlmsvc_ops->fclose(file->f_file); 256 nlmsvc_ops->fclose(file->f_file);
260 kfree(file); 257 kfree(file);
261 } else {
262 fp = &file->f_next;
263 } 258 }
264 } 259 }
265 } 260 }
@@ -286,23 +281,63 @@ nlm_release_file(struct nlm_file *file)
286 mutex_lock(&nlm_file_mutex); 281 mutex_lock(&nlm_file_mutex);
287 282
288 /* If there are no more locks etc, delete the file */ 283 /* If there are no more locks etc, delete the file */
289 if(--file->f_count == 0) { 284 if (--file->f_count == 0 && !nlm_file_inuse(file))
290 if(!nlm_inspect_file(NULL, file, NLM_ACT_CHECK)) 285 nlm_delete_file(file);
291 nlm_delete_file(file);
292 }
293 286
294 mutex_unlock(&nlm_file_mutex); 287 mutex_unlock(&nlm_file_mutex);
295} 288}
296 289
297/* 290/*
291 * Helper functions for resource traversal
292 *
293 * nlmsvc_mark_host:
294 * used by the garbage collector; simply sets h_inuse.
295 * Always returns 0.
296 *
297 * nlmsvc_same_host:
298 * returns 1 iff the two hosts match. Used to release
299 * all resources bound to a specific host.
300 *
301 * nlmsvc_is_client:
302 * returns 1 iff the host is a client.
303 *	Used by nlmsvc_invalidate_all.
304 */
305static int
306nlmsvc_mark_host(struct nlm_host *host, struct nlm_host *dummy)
307{
308 host->h_inuse = 1;
309 return 0;
310}
311
312static int
313nlmsvc_same_host(struct nlm_host *host, struct nlm_host *other)
314{
315 return host == other;
316}
317
318static int
319nlmsvc_is_client(struct nlm_host *host, struct nlm_host *dummy)
320{
321 if (host->h_server) {
322 /* we are destroying locks even though the client
323		 * hasn't asked us to, so don't unmonitor the
324 * client
325 */
326 if (host->h_nsmhandle)
327 host->h_nsmhandle->sm_sticky = 1;
328 return 1;
329 } else
330 return 0;
331}
332
333/*
298 * Mark all hosts that still hold resources 334 * Mark all hosts that still hold resources
299 */ 335 */
300void 336void
301nlmsvc_mark_resources(void) 337nlmsvc_mark_resources(void)
302{ 338{
303 dprintk("lockd: nlmsvc_mark_resources\n"); 339 dprintk("lockd: nlmsvc_mark_resources\n");
304 340 nlm_traverse_files(NULL, nlmsvc_mark_host);
305 nlm_traverse_files(NULL, NLM_ACT_MARK);
306} 341}
307 342
308/* 343/*
@@ -313,23 +348,25 @@ nlmsvc_free_host_resources(struct nlm_host *host)
313{ 348{
314 dprintk("lockd: nlmsvc_free_host_resources\n"); 349 dprintk("lockd: nlmsvc_free_host_resources\n");
315 350
316 if (nlm_traverse_files(host, NLM_ACT_UNLOCK)) 351 if (nlm_traverse_files(host, nlmsvc_same_host)) {
317 printk(KERN_WARNING 352 printk(KERN_WARNING
318 "lockd: couldn't remove all locks held by %s", 353 "lockd: couldn't remove all locks held by %s\n",
319 host->h_name); 354 host->h_name);
355 BUG();
356 }
320} 357}
321 358
322/* 359/*
323 * delete all hosts structs for clients 360 * Remove all locks held for clients
324 */ 361 */
325void 362void
326nlmsvc_invalidate_all(void) 363nlmsvc_invalidate_all(void)
327{ 364{
328 struct nlm_host *host; 365 /* Release all locks held by NFS clients.
329 while ((host = nlm_find_client()) != NULL) { 366 * Previously, the code would call
330 nlmsvc_free_host_resources(host); 367 * nlmsvc_free_host_resources for each client in
331 host->h_expires = 0; 368 * turn, which is about as inefficient as it gets.
332 host->h_killed = 1; 369 * Now we just do it once in nlm_traverse_files.
333 nlm_release_host(host); 370 */
334 } 371 nlm_traverse_files(NULL, nlmsvc_is_client);
335} 372}
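
The svcsubs.c hunks convert nlm_files from hand-rolled singly linked chains (f_hash plus f_next) to hlist buckets, which is why nlm_delete_file no longer has to rescan its chain: an hlist node carries a pprev back-pointer, so unlinking is O(1). A standalone re-implementation of just that idea, not the kernel's <linux/list.h>:

    /* Minimal hlist: pprev holds the address of whatever points at us. */
    #include <stdio.h>
    #include <stddef.h>

    struct hnode {
            struct hnode *next;
            struct hnode **pprev;
    };

    static void hlist_add_head(struct hnode *n, struct hnode **head)
    {
            n->next = *head;
            if (*head)
                    (*head)->pprev = &n->next;
            *head = n;
            n->pprev = head;
    }

    static void hlist_del(struct hnode *n)
    {
            *n->pprev = n->next;            /* O(1), no walk from the head */
            if (n->next)
                    n->next->pprev = n->pprev;
    }

    int main(void)
    {
            struct hnode a = {0}, b = {0};
            struct hnode *bucket = NULL;

            hlist_add_head(&a, &bucket);
            hlist_add_head(&b, &bucket);    /* bucket -> b -> a */
            hlist_del(&b);                  /* constant-time unlink */
            printf("head is %s\n", bucket == &a ? "a" : "b");
            return 0;
    }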
diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c
index 61c46facf257..b7c949256e5a 100644
--- a/fs/lockd/xdr.c
+++ b/fs/lockd/xdr.c
@@ -43,7 +43,7 @@ loff_t_to_s32(loff_t offset)
43/* 43/*
44 * XDR functions for basic NLM types 44 * XDR functions for basic NLM types
45 */ 45 */
46static u32 *nlm_decode_cookie(u32 *p, struct nlm_cookie *c) 46static __be32 *nlm_decode_cookie(__be32 *p, struct nlm_cookie *c)
47{ 47{
48 unsigned int len; 48 unsigned int len;
49 49
@@ -69,8 +69,8 @@ static u32 *nlm_decode_cookie(u32 *p, struct nlm_cookie *c)
69 return p; 69 return p;
70} 70}
71 71
72static inline u32 * 72static inline __be32 *
73nlm_encode_cookie(u32 *p, struct nlm_cookie *c) 73nlm_encode_cookie(__be32 *p, struct nlm_cookie *c)
74{ 74{
75 *p++ = htonl(c->len); 75 *p++ = htonl(c->len);
76 memcpy(p, c->data, c->len); 76 memcpy(p, c->data, c->len);
@@ -78,8 +78,8 @@ nlm_encode_cookie(u32 *p, struct nlm_cookie *c)
78 return p; 78 return p;
79} 79}
80 80
81static u32 * 81static __be32 *
82nlm_decode_fh(u32 *p, struct nfs_fh *f) 82nlm_decode_fh(__be32 *p, struct nfs_fh *f)
83{ 83{
84 unsigned int len; 84 unsigned int len;
85 85
@@ -95,8 +95,8 @@ nlm_decode_fh(u32 *p, struct nfs_fh *f)
95 return p + XDR_QUADLEN(NFS2_FHSIZE); 95 return p + XDR_QUADLEN(NFS2_FHSIZE);
96} 96}
97 97
98static inline u32 * 98static inline __be32 *
99nlm_encode_fh(u32 *p, struct nfs_fh *f) 99nlm_encode_fh(__be32 *p, struct nfs_fh *f)
100{ 100{
101 *p++ = htonl(NFS2_FHSIZE); 101 *p++ = htonl(NFS2_FHSIZE);
102 memcpy(p, f->data, NFS2_FHSIZE); 102 memcpy(p, f->data, NFS2_FHSIZE);
@@ -106,20 +106,20 @@ nlm_encode_fh(u32 *p, struct nfs_fh *f)
106/* 106/*
107 * Encode and decode owner handle 107 * Encode and decode owner handle
108 */ 108 */
109static inline u32 * 109static inline __be32 *
110nlm_decode_oh(u32 *p, struct xdr_netobj *oh) 110nlm_decode_oh(__be32 *p, struct xdr_netobj *oh)
111{ 111{
112 return xdr_decode_netobj(p, oh); 112 return xdr_decode_netobj(p, oh);
113} 113}
114 114
115static inline u32 * 115static inline __be32 *
116nlm_encode_oh(u32 *p, struct xdr_netobj *oh) 116nlm_encode_oh(__be32 *p, struct xdr_netobj *oh)
117{ 117{
118 return xdr_encode_netobj(p, oh); 118 return xdr_encode_netobj(p, oh);
119} 119}
120 120
121static u32 * 121static __be32 *
122nlm_decode_lock(u32 *p, struct nlm_lock *lock) 122nlm_decode_lock(__be32 *p, struct nlm_lock *lock)
123{ 123{
124 struct file_lock *fl = &lock->fl; 124 struct file_lock *fl = &lock->fl;
125 s32 start, len, end; 125 s32 start, len, end;
@@ -153,8 +153,8 @@ nlm_decode_lock(u32 *p, struct nlm_lock *lock)
153/* 153/*
154 * Encode a lock as part of an NLM call 154 * Encode a lock as part of an NLM call
155 */ 155 */
156static u32 * 156static __be32 *
157nlm_encode_lock(u32 *p, struct nlm_lock *lock) 157nlm_encode_lock(__be32 *p, struct nlm_lock *lock)
158{ 158{
159 struct file_lock *fl = &lock->fl; 159 struct file_lock *fl = &lock->fl;
160 __s32 start, len; 160 __s32 start, len;
@@ -184,8 +184,8 @@ nlm_encode_lock(u32 *p, struct nlm_lock *lock)
184/* 184/*
185 * Encode result of a TEST/TEST_MSG call 185 * Encode result of a TEST/TEST_MSG call
186 */ 186 */
187static u32 * 187static __be32 *
188nlm_encode_testres(u32 *p, struct nlm_res *resp) 188nlm_encode_testres(__be32 *p, struct nlm_res *resp)
189{ 189{
190 s32 start, len; 190 s32 start, len;
191 191
@@ -221,7 +221,7 @@ nlm_encode_testres(u32 *p, struct nlm_res *resp)
221 * First, the server side XDR functions 221 * First, the server side XDR functions
222 */ 222 */
223int 223int
224nlmsvc_decode_testargs(struct svc_rqst *rqstp, u32 *p, nlm_args *argp) 224nlmsvc_decode_testargs(struct svc_rqst *rqstp, __be32 *p, nlm_args *argp)
225{ 225{
226 u32 exclusive; 226 u32 exclusive;
227 227
@@ -238,7 +238,7 @@ nlmsvc_decode_testargs(struct svc_rqst *rqstp, u32 *p, nlm_args *argp)
238} 238}
239 239
240int 240int
241nlmsvc_encode_testres(struct svc_rqst *rqstp, u32 *p, struct nlm_res *resp) 241nlmsvc_encode_testres(struct svc_rqst *rqstp, __be32 *p, struct nlm_res *resp)
242{ 242{
243 if (!(p = nlm_encode_testres(p, resp))) 243 if (!(p = nlm_encode_testres(p, resp)))
244 return 0; 244 return 0;
@@ -246,7 +246,7 @@ nlmsvc_encode_testres(struct svc_rqst *rqstp, u32 *p, struct nlm_res *resp)
246} 246}
247 247
248int 248int
249nlmsvc_decode_lockargs(struct svc_rqst *rqstp, u32 *p, nlm_args *argp) 249nlmsvc_decode_lockargs(struct svc_rqst *rqstp, __be32 *p, nlm_args *argp)
250{ 250{
251 u32 exclusive; 251 u32 exclusive;
252 252
@@ -266,7 +266,7 @@ nlmsvc_decode_lockargs(struct svc_rqst *rqstp, u32 *p, nlm_args *argp)
266} 266}
267 267
268int 268int
269nlmsvc_decode_cancargs(struct svc_rqst *rqstp, u32 *p, nlm_args *argp) 269nlmsvc_decode_cancargs(struct svc_rqst *rqstp, __be32 *p, nlm_args *argp)
270{ 270{
271 u32 exclusive; 271 u32 exclusive;
272 272
@@ -282,7 +282,7 @@ nlmsvc_decode_cancargs(struct svc_rqst *rqstp, u32 *p, nlm_args *argp)
282} 282}
283 283
284int 284int
285nlmsvc_decode_unlockargs(struct svc_rqst *rqstp, u32 *p, nlm_args *argp) 285nlmsvc_decode_unlockargs(struct svc_rqst *rqstp, __be32 *p, nlm_args *argp)
286{ 286{
287 if (!(p = nlm_decode_cookie(p, &argp->cookie)) 287 if (!(p = nlm_decode_cookie(p, &argp->cookie))
288 || !(p = nlm_decode_lock(p, &argp->lock))) 288 || !(p = nlm_decode_lock(p, &argp->lock)))
@@ -292,7 +292,7 @@ nlmsvc_decode_unlockargs(struct svc_rqst *rqstp, u32 *p, nlm_args *argp)
292} 292}
293 293
294int 294int
295nlmsvc_decode_shareargs(struct svc_rqst *rqstp, u32 *p, nlm_args *argp) 295nlmsvc_decode_shareargs(struct svc_rqst *rqstp, __be32 *p, nlm_args *argp)
296{ 296{
297 struct nlm_lock *lock = &argp->lock; 297 struct nlm_lock *lock = &argp->lock;
298 298
@@ -313,7 +313,7 @@ nlmsvc_decode_shareargs(struct svc_rqst *rqstp, u32 *p, nlm_args *argp)
313} 313}
314 314
315int 315int
316nlmsvc_encode_shareres(struct svc_rqst *rqstp, u32 *p, struct nlm_res *resp) 316nlmsvc_encode_shareres(struct svc_rqst *rqstp, __be32 *p, struct nlm_res *resp)
317{ 317{
318 if (!(p = nlm_encode_cookie(p, &resp->cookie))) 318 if (!(p = nlm_encode_cookie(p, &resp->cookie)))
319 return 0; 319 return 0;
@@ -323,7 +323,7 @@ nlmsvc_encode_shareres(struct svc_rqst *rqstp, u32 *p, struct nlm_res *resp)
323} 323}
324 324
325int 325int
326nlmsvc_encode_res(struct svc_rqst *rqstp, u32 *p, struct nlm_res *resp) 326nlmsvc_encode_res(struct svc_rqst *rqstp, __be32 *p, struct nlm_res *resp)
327{ 327{
328 if (!(p = nlm_encode_cookie(p, &resp->cookie))) 328 if (!(p = nlm_encode_cookie(p, &resp->cookie)))
329 return 0; 329 return 0;
@@ -332,7 +332,7 @@ nlmsvc_encode_res(struct svc_rqst *rqstp, u32 *p, struct nlm_res *resp)
332} 332}
333 333
334int 334int
335nlmsvc_decode_notify(struct svc_rqst *rqstp, u32 *p, struct nlm_args *argp) 335nlmsvc_decode_notify(struct svc_rqst *rqstp, __be32 *p, struct nlm_args *argp)
336{ 336{
337 struct nlm_lock *lock = &argp->lock; 337 struct nlm_lock *lock = &argp->lock;
338 338
@@ -344,7 +344,7 @@ nlmsvc_decode_notify(struct svc_rqst *rqstp, u32 *p, struct nlm_args *argp)
344} 344}
345 345
346int 346int
347nlmsvc_decode_reboot(struct svc_rqst *rqstp, u32 *p, struct nlm_reboot *argp) 347nlmsvc_decode_reboot(struct svc_rqst *rqstp, __be32 *p, struct nlm_reboot *argp)
348{ 348{
349 if (!(p = xdr_decode_string_inplace(p, &argp->mon, &argp->len, SM_MAXSTRLEN))) 349 if (!(p = xdr_decode_string_inplace(p, &argp->mon, &argp->len, SM_MAXSTRLEN)))
350 return 0; 350 return 0;
@@ -357,7 +357,7 @@ nlmsvc_decode_reboot(struct svc_rqst *rqstp, u32 *p, struct nlm_reboot *argp)
357} 357}
358 358
359int 359int
360nlmsvc_decode_res(struct svc_rqst *rqstp, u32 *p, struct nlm_res *resp) 360nlmsvc_decode_res(struct svc_rqst *rqstp, __be32 *p, struct nlm_res *resp)
361{ 361{
362 if (!(p = nlm_decode_cookie(p, &resp->cookie))) 362 if (!(p = nlm_decode_cookie(p, &resp->cookie)))
363 return 0; 363 return 0;
@@ -366,13 +366,13 @@ nlmsvc_decode_res(struct svc_rqst *rqstp, u32 *p, struct nlm_res *resp)
366} 366}
367 367
368int 368int
369nlmsvc_decode_void(struct svc_rqst *rqstp, u32 *p, void *dummy) 369nlmsvc_decode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy)
370{ 370{
371 return xdr_argsize_check(rqstp, p); 371 return xdr_argsize_check(rqstp, p);
372} 372}
373 373
374int 374int
375nlmsvc_encode_void(struct svc_rqst *rqstp, u32 *p, void *dummy) 375nlmsvc_encode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy)
376{ 376{
377 return xdr_ressize_check(rqstp, p); 377 return xdr_ressize_check(rqstp, p);
378} 378}
@@ -389,7 +389,7 @@ nlmclt_decode_void(struct rpc_rqst *req, u32 *p, void *ptr)
389#endif 389#endif
390 390
391static int 391static int
392nlmclt_encode_testargs(struct rpc_rqst *req, u32 *p, nlm_args *argp) 392nlmclt_encode_testargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
393{ 393{
394 struct nlm_lock *lock = &argp->lock; 394 struct nlm_lock *lock = &argp->lock;
395 395
@@ -403,7 +403,7 @@ nlmclt_encode_testargs(struct rpc_rqst *req, u32 *p, nlm_args *argp)
403} 403}
404 404
405static int 405static int
406nlmclt_decode_testres(struct rpc_rqst *req, u32 *p, struct nlm_res *resp) 406nlmclt_decode_testres(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
407{ 407{
408 if (!(p = nlm_decode_cookie(p, &resp->cookie))) 408 if (!(p = nlm_decode_cookie(p, &resp->cookie)))
409 return -EIO; 409 return -EIO;
@@ -438,7 +438,7 @@ nlmclt_decode_testres(struct rpc_rqst *req, u32 *p, struct nlm_res *resp)
438 438
439 439
440static int 440static int
441nlmclt_encode_lockargs(struct rpc_rqst *req, u32 *p, nlm_args *argp) 441nlmclt_encode_lockargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
442{ 442{
443 struct nlm_lock *lock = &argp->lock; 443 struct nlm_lock *lock = &argp->lock;
444 444
@@ -455,7 +455,7 @@ nlmclt_encode_lockargs(struct rpc_rqst *req, u32 *p, nlm_args *argp)
455} 455}
456 456
457static int 457static int
458nlmclt_encode_cancargs(struct rpc_rqst *req, u32 *p, nlm_args *argp) 458nlmclt_encode_cancargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
459{ 459{
460 struct nlm_lock *lock = &argp->lock; 460 struct nlm_lock *lock = &argp->lock;
461 461
@@ -470,7 +470,7 @@ nlmclt_encode_cancargs(struct rpc_rqst *req, u32 *p, nlm_args *argp)
470} 470}
471 471
472static int 472static int
473nlmclt_encode_unlockargs(struct rpc_rqst *req, u32 *p, nlm_args *argp) 473nlmclt_encode_unlockargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
474{ 474{
475 struct nlm_lock *lock = &argp->lock; 475 struct nlm_lock *lock = &argp->lock;
476 476
@@ -483,7 +483,7 @@ nlmclt_encode_unlockargs(struct rpc_rqst *req, u32 *p, nlm_args *argp)
483} 483}
484 484
485static int 485static int
486nlmclt_encode_res(struct rpc_rqst *req, u32 *p, struct nlm_res *resp) 486nlmclt_encode_res(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
487{ 487{
488 if (!(p = nlm_encode_cookie(p, &resp->cookie))) 488 if (!(p = nlm_encode_cookie(p, &resp->cookie)))
489 return -EIO; 489 return -EIO;
@@ -493,7 +493,7 @@ nlmclt_encode_res(struct rpc_rqst *req, u32 *p, struct nlm_res *resp)
493} 493}
494 494
495static int 495static int
496nlmclt_encode_testres(struct rpc_rqst *req, u32 *p, struct nlm_res *resp) 496nlmclt_encode_testres(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
497{ 497{
498 if (!(p = nlm_encode_testres(p, resp))) 498 if (!(p = nlm_encode_testres(p, resp)))
499 return -EIO; 499 return -EIO;
@@ -502,7 +502,7 @@ nlmclt_encode_testres(struct rpc_rqst *req, u32 *p, struct nlm_res *resp)
502} 502}
503 503
504static int 504static int
505nlmclt_decode_res(struct rpc_rqst *req, u32 *p, struct nlm_res *resp) 505nlmclt_decode_res(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
506{ 506{
507 if (!(p = nlm_decode_cookie(p, &resp->cookie))) 507 if (!(p = nlm_decode_cookie(p, &resp->cookie)))
508 return -EIO; 508 return -EIO;
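
Nearly every xdr.c hunk retypes on-the-wire words from u32 to __be32, sparse's "bitwise" big-endian type: under make C=1, mixing a host-order value into a wire word becomes a static warning, while ordinary compilers see a plain integer. A hedged standalone sketch of the annotation follows; note the kernel's htonl() already returns __be32, and kernel code uses __force where a raw cast is genuinely intended, so the cast below is only needed in this userspace stand-in.

    /* What u32 -> __be32 buys: a distinct type for network-order words. */
    #include <stdio.h>
    #include <stdint.h>
    #include <arpa/inet.h>

    #ifdef __CHECKER__
    #define __bitwise __attribute__((bitwise))
    #else
    #define __bitwise
    #endif

    typedef uint32_t __bitwise be32;

    static be32 encode_len(uint32_t host_len)
    {
            return (be32)htonl(host_len);   /* explicit host -> wire step */
    }

    int main(void)
    {
            be32 wire = encode_len(24);
            /* "wire = 24;" would be flagged by sparse: host value in a be32 */
            printf("wire word: 0x%08x\n", (unsigned)wire);
            return 0;
    }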
diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c
index 36eb175ec335..f4c0b2b9f75a 100644
--- a/fs/lockd/xdr4.c
+++ b/fs/lockd/xdr4.c
@@ -44,8 +44,8 @@ loff_t_to_s64(loff_t offset)
44/* 44/*
45 * XDR functions for basic NLM types 45 * XDR functions for basic NLM types
46 */ 46 */
47static u32 * 47static __be32 *
48nlm4_decode_cookie(u32 *p, struct nlm_cookie *c) 48nlm4_decode_cookie(__be32 *p, struct nlm_cookie *c)
49{ 49{
50 unsigned int len; 50 unsigned int len;
51 51
@@ -71,8 +71,8 @@ nlm4_decode_cookie(u32 *p, struct nlm_cookie *c)
71 return p; 71 return p;
72} 72}
73 73
74static u32 * 74static __be32 *
75nlm4_encode_cookie(u32 *p, struct nlm_cookie *c) 75nlm4_encode_cookie(__be32 *p, struct nlm_cookie *c)
76{ 76{
77 *p++ = htonl(c->len); 77 *p++ = htonl(c->len);
78 memcpy(p, c->data, c->len); 78 memcpy(p, c->data, c->len);
@@ -80,8 +80,8 @@ nlm4_encode_cookie(u32 *p, struct nlm_cookie *c)
80 return p; 80 return p;
81} 81}
82 82
83static u32 * 83static __be32 *
84nlm4_decode_fh(u32 *p, struct nfs_fh *f) 84nlm4_decode_fh(__be32 *p, struct nfs_fh *f)
85{ 85{
86 memset(f->data, 0, sizeof(f->data)); 86 memset(f->data, 0, sizeof(f->data));
87 f->size = ntohl(*p++); 87 f->size = ntohl(*p++);
@@ -95,8 +95,8 @@ nlm4_decode_fh(u32 *p, struct nfs_fh *f)
95 return p + XDR_QUADLEN(f->size); 95 return p + XDR_QUADLEN(f->size);
96} 96}
97 97
98static u32 * 98static __be32 *
99nlm4_encode_fh(u32 *p, struct nfs_fh *f) 99nlm4_encode_fh(__be32 *p, struct nfs_fh *f)
100{ 100{
101 *p++ = htonl(f->size); 101 *p++ = htonl(f->size);
102 if (f->size) p[XDR_QUADLEN(f->size)-1] = 0; /* don't leak anything */ 102 if (f->size) p[XDR_QUADLEN(f->size)-1] = 0; /* don't leak anything */
@@ -107,20 +107,20 @@ nlm4_encode_fh(u32 *p, struct nfs_fh *f)
107/* 107/*
108 * Encode and decode owner handle 108 * Encode and decode owner handle
109 */ 109 */
110static u32 * 110static __be32 *
111nlm4_decode_oh(u32 *p, struct xdr_netobj *oh) 111nlm4_decode_oh(__be32 *p, struct xdr_netobj *oh)
112{ 112{
113 return xdr_decode_netobj(p, oh); 113 return xdr_decode_netobj(p, oh);
114} 114}
115 115
116static u32 * 116static __be32 *
117nlm4_encode_oh(u32 *p, struct xdr_netobj *oh) 117nlm4_encode_oh(__be32 *p, struct xdr_netobj *oh)
118{ 118{
119 return xdr_encode_netobj(p, oh); 119 return xdr_encode_netobj(p, oh);
120} 120}
121 121
122static u32 * 122static __be32 *
123nlm4_decode_lock(u32 *p, struct nlm_lock *lock) 123nlm4_decode_lock(__be32 *p, struct nlm_lock *lock)
124{ 124{
125 struct file_lock *fl = &lock->fl; 125 struct file_lock *fl = &lock->fl;
126 __s64 len, start, end; 126 __s64 len, start, end;
@@ -153,8 +153,8 @@ nlm4_decode_lock(u32 *p, struct nlm_lock *lock)
153/* 153/*
154 * Encode a lock as part of an NLM call 154 * Encode a lock as part of an NLM call
155 */ 155 */
156static u32 * 156static __be32 *
157nlm4_encode_lock(u32 *p, struct nlm_lock *lock) 157nlm4_encode_lock(__be32 *p, struct nlm_lock *lock)
158{ 158{
159 struct file_lock *fl = &lock->fl; 159 struct file_lock *fl = &lock->fl;
160 __s64 start, len; 160 __s64 start, len;
@@ -185,8 +185,8 @@ nlm4_encode_lock(u32 *p, struct nlm_lock *lock)
185/* 185/*
186 * Encode result of a TEST/TEST_MSG call 186 * Encode result of a TEST/TEST_MSG call
187 */ 187 */
188static u32 * 188static __be32 *
189nlm4_encode_testres(u32 *p, struct nlm_res *resp) 189nlm4_encode_testres(__be32 *p, struct nlm_res *resp)
190{ 190{
191 s64 start, len; 191 s64 start, len;
192 192
@@ -227,7 +227,7 @@ nlm4_encode_testres(u32 *p, struct nlm_res *resp)
227 * First, the server side XDR functions 227 * First, the server side XDR functions
228 */ 228 */
229int 229int
230nlm4svc_decode_testargs(struct svc_rqst *rqstp, u32 *p, nlm_args *argp) 230nlm4svc_decode_testargs(struct svc_rqst *rqstp, __be32 *p, nlm_args *argp)
231{ 231{
232 u32 exclusive; 232 u32 exclusive;
233 233
@@ -244,7 +244,7 @@ nlm4svc_decode_testargs(struct svc_rqst *rqstp, u32 *p, nlm_args *argp)
244} 244}
245 245
246int 246int
247nlm4svc_encode_testres(struct svc_rqst *rqstp, u32 *p, struct nlm_res *resp) 247nlm4svc_encode_testres(struct svc_rqst *rqstp, __be32 *p, struct nlm_res *resp)
248{ 248{
249 if (!(p = nlm4_encode_testres(p, resp))) 249 if (!(p = nlm4_encode_testres(p, resp)))
250 return 0; 250 return 0;
@@ -252,7 +252,7 @@ nlm4svc_encode_testres(struct svc_rqst *rqstp, u32 *p, struct nlm_res *resp)
252} 252}
253 253
254int 254int
255nlm4svc_decode_lockargs(struct svc_rqst *rqstp, u32 *p, nlm_args *argp) 255nlm4svc_decode_lockargs(struct svc_rqst *rqstp, __be32 *p, nlm_args *argp)
256{ 256{
257 u32 exclusive; 257 u32 exclusive;
258 258
@@ -272,7 +272,7 @@ nlm4svc_decode_lockargs(struct svc_rqst *rqstp, u32 *p, nlm_args *argp)
272} 272}
273 273
274int 274int
275nlm4svc_decode_cancargs(struct svc_rqst *rqstp, u32 *p, nlm_args *argp) 275nlm4svc_decode_cancargs(struct svc_rqst *rqstp, __be32 *p, nlm_args *argp)
276{ 276{
277 u32 exclusive; 277 u32 exclusive;
278 278
@@ -288,7 +288,7 @@ nlm4svc_decode_cancargs(struct svc_rqst *rqstp, u32 *p, nlm_args *argp)
288} 288}
289 289
290int 290int
291nlm4svc_decode_unlockargs(struct svc_rqst *rqstp, u32 *p, nlm_args *argp) 291nlm4svc_decode_unlockargs(struct svc_rqst *rqstp, __be32 *p, nlm_args *argp)
292{ 292{
293 if (!(p = nlm4_decode_cookie(p, &argp->cookie)) 293 if (!(p = nlm4_decode_cookie(p, &argp->cookie))
294 || !(p = nlm4_decode_lock(p, &argp->lock))) 294 || !(p = nlm4_decode_lock(p, &argp->lock)))
@@ -298,7 +298,7 @@ nlm4svc_decode_unlockargs(struct svc_rqst *rqstp, u32 *p, nlm_args *argp)
298} 298}
299 299
300int 300int
301nlm4svc_decode_shareargs(struct svc_rqst *rqstp, u32 *p, nlm_args *argp) 301nlm4svc_decode_shareargs(struct svc_rqst *rqstp, __be32 *p, nlm_args *argp)
302{ 302{
303 struct nlm_lock *lock = &argp->lock; 303 struct nlm_lock *lock = &argp->lock;
304 304
@@ -319,7 +319,7 @@ nlm4svc_decode_shareargs(struct svc_rqst *rqstp, u32 *p, nlm_args *argp)
319} 319}
320 320
321int 321int
322nlm4svc_encode_shareres(struct svc_rqst *rqstp, u32 *p, struct nlm_res *resp) 322nlm4svc_encode_shareres(struct svc_rqst *rqstp, __be32 *p, struct nlm_res *resp)
323{ 323{
324 if (!(p = nlm4_encode_cookie(p, &resp->cookie))) 324 if (!(p = nlm4_encode_cookie(p, &resp->cookie)))
325 return 0; 325 return 0;
@@ -329,7 +329,7 @@ nlm4svc_encode_shareres(struct svc_rqst *rqstp, u32 *p, struct nlm_res *resp)
329} 329}
330 330
331int 331int
332nlm4svc_encode_res(struct svc_rqst *rqstp, u32 *p, struct nlm_res *resp) 332nlm4svc_encode_res(struct svc_rqst *rqstp, __be32 *p, struct nlm_res *resp)
333{ 333{
334 if (!(p = nlm4_encode_cookie(p, &resp->cookie))) 334 if (!(p = nlm4_encode_cookie(p, &resp->cookie)))
335 return 0; 335 return 0;
@@ -338,7 +338,7 @@ nlm4svc_encode_res(struct svc_rqst *rqstp, u32 *p, struct nlm_res *resp)
338} 338}
339 339
340int 340int
341nlm4svc_decode_notify(struct svc_rqst *rqstp, u32 *p, struct nlm_args *argp) 341nlm4svc_decode_notify(struct svc_rqst *rqstp, __be32 *p, struct nlm_args *argp)
342{ 342{
343 struct nlm_lock *lock = &argp->lock; 343 struct nlm_lock *lock = &argp->lock;
344 344
@@ -350,7 +350,7 @@ nlm4svc_decode_notify(struct svc_rqst *rqstp, u32 *p, struct nlm_args *argp)
350} 350}
351 351
352int 352int
353nlm4svc_decode_reboot(struct svc_rqst *rqstp, u32 *p, struct nlm_reboot *argp) 353nlm4svc_decode_reboot(struct svc_rqst *rqstp, __be32 *p, struct nlm_reboot *argp)
354{ 354{
355 if (!(p = xdr_decode_string_inplace(p, &argp->mon, &argp->len, SM_MAXSTRLEN))) 355 if (!(p = xdr_decode_string_inplace(p, &argp->mon, &argp->len, SM_MAXSTRLEN)))
356 return 0; 356 return 0;
@@ -363,7 +363,7 @@ nlm4svc_decode_reboot(struct svc_rqst *rqstp, u32 *p, struct nlm_reboot *argp)
363} 363}
364 364
365int 365int
366nlm4svc_decode_res(struct svc_rqst *rqstp, u32 *p, struct nlm_res *resp) 366nlm4svc_decode_res(struct svc_rqst *rqstp, __be32 *p, struct nlm_res *resp)
367{ 367{
368 if (!(p = nlm4_decode_cookie(p, &resp->cookie))) 368 if (!(p = nlm4_decode_cookie(p, &resp->cookie)))
369 return 0; 369 return 0;
@@ -372,13 +372,13 @@ nlm4svc_decode_res(struct svc_rqst *rqstp, u32 *p, struct nlm_res *resp)
372} 372}
373 373
374int 374int
375nlm4svc_decode_void(struct svc_rqst *rqstp, u32 *p, void *dummy) 375nlm4svc_decode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy)
376{ 376{
377 return xdr_argsize_check(rqstp, p); 377 return xdr_argsize_check(rqstp, p);
378} 378}
379 379
380int 380int
381nlm4svc_encode_void(struct svc_rqst *rqstp, u32 *p, void *dummy) 381nlm4svc_encode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy)
382{ 382{
383 return xdr_ressize_check(rqstp, p); 383 return xdr_ressize_check(rqstp, p);
384} 384}
@@ -388,14 +388,14 @@ nlm4svc_encode_void(struct svc_rqst *rqstp, u32 *p, void *dummy)
388 */ 388 */
389#ifdef NLMCLNT_SUPPORT_SHARES 389#ifdef NLMCLNT_SUPPORT_SHARES
390static int 390static int
391nlm4clt_decode_void(struct rpc_rqst *req, u32 *p, void *ptr) 391nlm4clt_decode_void(struct rpc_rqst *req, __be32 *p, void *ptr)
392{ 392{
393 return 0; 393 return 0;
394} 394}
395#endif 395#endif
396 396
397static int 397static int
398nlm4clt_encode_testargs(struct rpc_rqst *req, u32 *p, nlm_args *argp) 398nlm4clt_encode_testargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
399{ 399{
400 struct nlm_lock *lock = &argp->lock; 400 struct nlm_lock *lock = &argp->lock;
401 401
@@ -409,7 +409,7 @@ nlm4clt_encode_testargs(struct rpc_rqst *req, u32 *p, nlm_args *argp)
409} 409}
410 410
411static int 411static int
412nlm4clt_decode_testres(struct rpc_rqst *req, u32 *p, struct nlm_res *resp) 412nlm4clt_decode_testres(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
413{ 413{
414 if (!(p = nlm4_decode_cookie(p, &resp->cookie))) 414 if (!(p = nlm4_decode_cookie(p, &resp->cookie)))
415 return -EIO; 415 return -EIO;
@@ -444,7 +444,7 @@ nlm4clt_decode_testres(struct rpc_rqst *req, u32 *p, struct nlm_res *resp)
444 444
445 445
446static int 446static int
447nlm4clt_encode_lockargs(struct rpc_rqst *req, u32 *p, nlm_args *argp) 447nlm4clt_encode_lockargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
448{ 448{
449 struct nlm_lock *lock = &argp->lock; 449 struct nlm_lock *lock = &argp->lock;
450 450
@@ -461,7 +461,7 @@ nlm4clt_encode_lockargs(struct rpc_rqst *req, u32 *p, nlm_args *argp)
461} 461}
462 462
463static int 463static int
464nlm4clt_encode_cancargs(struct rpc_rqst *req, u32 *p, nlm_args *argp) 464nlm4clt_encode_cancargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
465{ 465{
466 struct nlm_lock *lock = &argp->lock; 466 struct nlm_lock *lock = &argp->lock;
467 467
@@ -476,7 +476,7 @@ nlm4clt_encode_cancargs(struct rpc_rqst *req, u32 *p, nlm_args *argp)
476} 476}
477 477
478static int 478static int
479nlm4clt_encode_unlockargs(struct rpc_rqst *req, u32 *p, nlm_args *argp) 479nlm4clt_encode_unlockargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
480{ 480{
481 struct nlm_lock *lock = &argp->lock; 481 struct nlm_lock *lock = &argp->lock;
482 482
@@ -489,7 +489,7 @@ nlm4clt_encode_unlockargs(struct rpc_rqst *req, u32 *p, nlm_args *argp)
489} 489}
490 490
491static int 491static int
492nlm4clt_encode_res(struct rpc_rqst *req, u32 *p, struct nlm_res *resp) 492nlm4clt_encode_res(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
493{ 493{
494 if (!(p = nlm4_encode_cookie(p, &resp->cookie))) 494 if (!(p = nlm4_encode_cookie(p, &resp->cookie)))
495 return -EIO; 495 return -EIO;
@@ -499,7 +499,7 @@ nlm4clt_encode_res(struct rpc_rqst *req, u32 *p, struct nlm_res *resp)
499} 499}
500 500
501static int 501static int
502nlm4clt_encode_testres(struct rpc_rqst *req, u32 *p, struct nlm_res *resp) 502nlm4clt_encode_testres(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
503{ 503{
504 if (!(p = nlm4_encode_testres(p, resp))) 504 if (!(p = nlm4_encode_testres(p, resp)))
505 return -EIO; 505 return -EIO;
@@ -508,7 +508,7 @@ nlm4clt_encode_testres(struct rpc_rqst *req, u32 *p, struct nlm_res *resp)
508} 508}
509 509
510static int 510static int
511nlm4clt_decode_res(struct rpc_rqst *req, u32 *p, struct nlm_res *resp) 511nlm4clt_decode_res(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
512{ 512{
513 if (!(p = nlm4_decode_cookie(p, &resp->cookie))) 513 if (!(p = nlm4_decode_cookie(p, &resp->cookie)))
514 return -EIO; 514 return -EIO;
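
xdr4.c mirrors the same __be32 retyping for NLM version 4, whose locks carry s64 offsets rather than NLMv1's s32. On the wire a 64-bit XDR "hyper" is two big-endian 32-bit words, most significant word first; a small sketch, with an illustrative helper name:

    /* Encoding a 64-bit XDR hyper as two big-endian words. */
    #include <stdio.h>
    #include <stdint.h>
    #include <arpa/inet.h>

    static uint32_t *xdr_encode_hyper_sketch(uint32_t *p, uint64_t val)
    {
            *p++ = htonl((uint32_t)(val >> 32));        /* high word first */
            *p++ = htonl((uint32_t)(val & 0xffffffff));
            return p;
    }

    int main(void)
    {
            uint32_t buf[2];
            uint64_t offset = (1ULL << 40) | 0x1234;    /* needs NLMv4 */

            xdr_encode_hyper_sketch(buf, offset);
            printf("wire: %08x %08x\n", (unsigned)ntohl(buf[0]),
                   (unsigned)ntohl(buf[1]));
            return 0;
    }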
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index c11a4b9fb863..1e36bae4d0eb 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -149,12 +149,8 @@ static int minix_fill_super(struct super_block *s, void *data, int silent)
149 return -ENOMEM; 149 return -ENOMEM;
150 s->s_fs_info = sbi; 150 s->s_fs_info = sbi;
151 151
152 	/* N.B. These should be compile-time tests. 152 	BUILD_BUG_ON(32 != sizeof(struct minix_inode));
153 Unfortunately that is impossible. */ 153 BUILD_BUG_ON(64 != sizeof(struct minix2_inode));
154 if (32 != sizeof (struct minix_inode))
155 panic("bad V1 i-node size");
156 if (64 != sizeof(struct minix2_inode))
157 panic("bad V2 i-node size");
158 154
159 if (!sb_set_blocksize(s, BLOCK_SIZE)) 155 if (!sb_set_blocksize(s, BLOCK_SIZE))
160 goto out_bad_hblock; 156 goto out_bad_hblock;
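
The minix hunk replaces two mount-time panic() size checks with BUILD_BUG_ON, turning a wrong on-disk inode layout into a build failure instead of a runtime crash. The trick behind BUILD_BUG_ON of this vintage is a conditionally negative array size; a standalone sketch, with a stand-in struct:

    /* A true condition yields char[-1], so compilation fails. */
    #include <stdio.h>

    #define BUILD_BUG_ON(cond) ((void)sizeof(char[1 - 2 * !!(cond)]))

    struct minix_inode_sketch { char raw[32]; };    /* stand-in layout */

    int main(void)
    {
            BUILD_BUG_ON(sizeof(struct minix_inode_sketch) != 32);  /* ok */
            /* BUILD_BUG_ON(sizeof(struct minix_inode_sketch) != 64);
             * would not compile: the array size becomes -1. */
            printf("layout checks passed at build time\n");
            return 0;
    }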
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index a89ac84a8241..589d1eac55c1 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -726,7 +726,7 @@ outrel:
726 struct compat_ncp_privatedata_ioctl user32; 726 struct compat_ncp_privatedata_ioctl user32;
727 user32.len = user.len; 727 user32.len = user.len;
728 user32.data = (unsigned long) user.data; 728 user32.data = (unsigned long) user.data;
729 if (copy_to_user(&user32, argp, sizeof(user32))) 729 if (copy_to_user(argp, &user32, sizeof(user32)))
730 return -EFAULT; 730 return -EFAULT;
731 } else 731 } else
732#endif 732#endif
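
The ncpfs one-liner is a direction fix: copy_to_user() takes the user-space destination first and the kernel source second, memcpy-style, and the old code had them swapped, copying into the kernel struct. A userspace stand-in showing the convention; the names below are illustrative.

    /* copy_to_user semantics: dst first, returns bytes NOT copied. */
    #include <stdio.h>
    #include <string.h>

    struct compat_ioctl_sketch { unsigned int len; unsigned long data; };

    static unsigned long fake_copy_to_user(void *user_dst, const void *kern_src,
                                           unsigned long n)
    {
            memcpy(user_dst, kern_src, n);  /* kernel -> user direction */
            return 0;
    }

    int main(void)
    {
            struct compat_ioctl_sketch kern = { .len = 16, .data = 0xdeadbeef };
            struct compat_ioctl_sketch user = { 0 };

            if (fake_copy_to_user(&user, &kern, sizeof(user)))  /* dst, src */
                    return -1;
            printf("user copy: len=%u\n", user.len);
            return 0;
    }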
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index 5676163d26e8..db3d7919c601 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -31,10 +31,10 @@ struct cb_compound_hdr_arg {
31}; 31};
32 32
33struct cb_compound_hdr_res { 33struct cb_compound_hdr_res {
34 uint32_t *status; 34 __be32 *status;
35 int taglen; 35 int taglen;
36 const char *tag; 36 const char *tag;
37 uint32_t *nops; 37 __be32 *nops;
38}; 38};
39 39
40struct cb_getattrargs { 40struct cb_getattrargs {
@@ -44,7 +44,7 @@ struct cb_getattrargs {
44}; 44};
45 45
46struct cb_getattrres { 46struct cb_getattrres {
47 uint32_t status; 47 __be32 status;
48 uint32_t bitmap[2]; 48 uint32_t bitmap[2];
49 uint64_t size; 49 uint64_t size;
50 uint64_t change_attr; 50 uint64_t change_attr;
@@ -59,8 +59,8 @@ struct cb_recallargs {
59 uint32_t truncate; 59 uint32_t truncate;
60}; 60};
61 61
62extern unsigned nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res); 62extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res);
63extern unsigned nfs4_callback_recall(struct cb_recallargs *args, void *dummy); 63extern __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy);
64 64
65#ifdef CONFIG_NFS_V4 65#ifdef CONFIG_NFS_V4
66extern int nfs_callback_up(void); 66extern int nfs_callback_up(void);
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 97cf8f71451f..72e55d83756d 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -14,7 +14,7 @@
14 14
15#define NFSDBG_FACILITY NFSDBG_CALLBACK 15#define NFSDBG_FACILITY NFSDBG_CALLBACK
16 16
17unsigned nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res) 17__be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res)
18{ 18{
19 struct nfs_client *clp; 19 struct nfs_client *clp;
20 struct nfs_delegation *delegation; 20 struct nfs_delegation *delegation;
@@ -55,11 +55,11 @@ out:
55 return res->status; 55 return res->status;
56} 56}
57 57
58unsigned nfs4_callback_recall(struct cb_recallargs *args, void *dummy) 58__be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy)
59{ 59{
60 struct nfs_client *clp; 60 struct nfs_client *clp;
61 struct inode *inode; 61 struct inode *inode;
62 unsigned res; 62 __be32 res;
63 63
64 res = htonl(NFS4ERR_BADHANDLE); 64 res = htonl(NFS4ERR_BADHANDLE);
65 clp = nfs_find_client(args->addr, 4); 65 clp = nfs_find_client(args->addr, 4);
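
The callback results above were already built with htonl(NFS4ERR_...); retyping them from unsigned to __be32 records that fact in the type system. One nuance worth noting: only the zero test (NFS4_OK) is byte-order independent, so non-zero comparisons should stay in wire order. A tiny sketch with a stand-in constant; NFS4ERR_BADHANDLE is 10001 in RFC 3530, reused here purely for illustration.

    /* NFS4 callback status words live in network order. */
    #include <stdio.h>
    #include <stdint.h>
    #include <arpa/inet.h>

    #define NFS4_OK_SK           0
    #define NFS4ERR_BADHANDLE_SK 10001   /* stand-in numeric value */

    int main(void)
    {
            uint32_t res = htonl(NFS4ERR_BADHANDLE_SK);

            if (res != htonl(NFS4_OK_SK))   /* compare like with like */
                    printf("recall failed: err %u\n", (unsigned)ntohl(res));
            return 0;
    }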
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 29f932192054..f8ea1f51f590 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -22,9 +22,9 @@
22 22
23#define NFSDBG_FACILITY NFSDBG_CALLBACK 23#define NFSDBG_FACILITY NFSDBG_CALLBACK
24 24
25typedef unsigned (*callback_process_op_t)(void *, void *); 25typedef __be32 (*callback_process_op_t)(void *, void *);
26typedef unsigned (*callback_decode_arg_t)(struct svc_rqst *, struct xdr_stream *, void *); 26typedef __be32 (*callback_decode_arg_t)(struct svc_rqst *, struct xdr_stream *, void *);
27typedef unsigned (*callback_encode_res_t)(struct svc_rqst *, struct xdr_stream *, void *); 27typedef __be32 (*callback_encode_res_t)(struct svc_rqst *, struct xdr_stream *, void *);
28 28
29 29
30struct callback_op { 30struct callback_op {
@@ -36,24 +36,24 @@ struct callback_op {
36 36
37static struct callback_op callback_ops[]; 37static struct callback_op callback_ops[];
38 38
39static int nfs4_callback_null(struct svc_rqst *rqstp, void *argp, void *resp) 39static __be32 nfs4_callback_null(struct svc_rqst *rqstp, void *argp, void *resp)
40{ 40{
41 return htonl(NFS4_OK); 41 return htonl(NFS4_OK);
42} 42}
43 43
44static int nfs4_decode_void(struct svc_rqst *rqstp, uint32_t *p, void *dummy) 44static int nfs4_decode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy)
45{ 45{
46 return xdr_argsize_check(rqstp, p); 46 return xdr_argsize_check(rqstp, p);
47} 47}
48 48
49static int nfs4_encode_void(struct svc_rqst *rqstp, uint32_t *p, void *dummy) 49static int nfs4_encode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy)
50{ 50{
51 return xdr_ressize_check(rqstp, p); 51 return xdr_ressize_check(rqstp, p);
52} 52}
53 53
54static uint32_t *read_buf(struct xdr_stream *xdr, int nbytes) 54static __be32 *read_buf(struct xdr_stream *xdr, int nbytes)
55{ 55{
56 uint32_t *p; 56 __be32 *p;
57 57
58 p = xdr_inline_decode(xdr, nbytes); 58 p = xdr_inline_decode(xdr, nbytes);
59 if (unlikely(p == NULL)) 59 if (unlikely(p == NULL))
@@ -61,9 +61,9 @@ static uint32_t *read_buf(struct xdr_stream *xdr, int nbytes)
61 return p; 61 return p;
62} 62}
63 63
64static unsigned decode_string(struct xdr_stream *xdr, unsigned int *len, const char **str) 64static __be32 decode_string(struct xdr_stream *xdr, unsigned int *len, const char **str)
65{ 65{
66 uint32_t *p; 66 __be32 *p;
67 67
68 p = read_buf(xdr, 4); 68 p = read_buf(xdr, 4);
69 if (unlikely(p == NULL)) 69 if (unlikely(p == NULL))
@@ -81,9 +81,9 @@ static unsigned decode_string(struct xdr_stream *xdr, unsigned int *len, const c
81 return 0; 81 return 0;
82} 82}
83 83
84static unsigned decode_fh(struct xdr_stream *xdr, struct nfs_fh *fh) 84static __be32 decode_fh(struct xdr_stream *xdr, struct nfs_fh *fh)
85{ 85{
86 uint32_t *p; 86 __be32 *p;
87 87
88 p = read_buf(xdr, 4); 88 p = read_buf(xdr, 4);
89 if (unlikely(p == NULL)) 89 if (unlikely(p == NULL))
@@ -99,9 +99,9 @@ static unsigned decode_fh(struct xdr_stream *xdr, struct nfs_fh *fh)
99 return 0; 99 return 0;
100} 100}
101 101
102static unsigned decode_bitmap(struct xdr_stream *xdr, uint32_t *bitmap) 102static __be32 decode_bitmap(struct xdr_stream *xdr, uint32_t *bitmap)
103{ 103{
104 uint32_t *p; 104 __be32 *p;
105 unsigned int attrlen; 105 unsigned int attrlen;
106 106
107 p = read_buf(xdr, 4); 107 p = read_buf(xdr, 4);
@@ -118,9 +118,9 @@ static unsigned decode_bitmap(struct xdr_stream *xdr, uint32_t *bitmap)
118 return 0; 118 return 0;
119} 119}
120 120
121static unsigned decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) 121static __be32 decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
122{ 122{
123 uint32_t *p; 123 __be32 *p;
124 124
125 p = read_buf(xdr, 16); 125 p = read_buf(xdr, 16);
126 if (unlikely(p == NULL)) 126 if (unlikely(p == NULL))
@@ -129,11 +129,11 @@ static unsigned decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
129 return 0; 129 return 0;
130} 130}
131 131
132static unsigned decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound_hdr_arg *hdr) 132static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound_hdr_arg *hdr)
133{ 133{
134 uint32_t *p; 134 __be32 *p;
135 unsigned int minor_version; 135 unsigned int minor_version;
136 unsigned status; 136 __be32 status;
137 137
138 status = decode_string(xdr, &hdr->taglen, &hdr->tag); 138 status = decode_string(xdr, &hdr->taglen, &hdr->tag);
139 if (unlikely(status != 0)) 139 if (unlikely(status != 0))
@@ -159,9 +159,9 @@ static unsigned decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compou
159 return 0; 159 return 0;
160} 160}
161 161
162static unsigned decode_op_hdr(struct xdr_stream *xdr, unsigned int *op) 162static __be32 decode_op_hdr(struct xdr_stream *xdr, unsigned int *op)
163{ 163{
164 uint32_t *p; 164 __be32 *p;
165 p = read_buf(xdr, 4); 165 p = read_buf(xdr, 4);
166 if (unlikely(p == NULL)) 166 if (unlikely(p == NULL))
167 return htonl(NFS4ERR_RESOURCE); 167 return htonl(NFS4ERR_RESOURCE);
@@ -169,9 +169,9 @@ static unsigned decode_op_hdr(struct xdr_stream *xdr, unsigned int *op)
169 return 0; 169 return 0;
170} 170}
171 171
172static unsigned decode_getattr_args(struct svc_rqst *rqstp, struct xdr_stream *xdr, struct cb_getattrargs *args) 172static __be32 decode_getattr_args(struct svc_rqst *rqstp, struct xdr_stream *xdr, struct cb_getattrargs *args)
173{ 173{
174 unsigned status; 174 __be32 status;
175 175
176 status = decode_fh(xdr, &args->fh); 176 status = decode_fh(xdr, &args->fh);
177 if (unlikely(status != 0)) 177 if (unlikely(status != 0))
@@ -183,10 +183,10 @@ out:
183 return status; 183 return status;
184} 184}
185 185
186static unsigned decode_recall_args(struct svc_rqst *rqstp, struct xdr_stream *xdr, struct cb_recallargs *args) 186static __be32 decode_recall_args(struct svc_rqst *rqstp, struct xdr_stream *xdr, struct cb_recallargs *args)
187{ 187{
188 uint32_t *p; 188 __be32 *p;
189 unsigned status; 189 __be32 status;
190 190
191 args->addr = &rqstp->rq_addr; 191 args->addr = &rqstp->rq_addr;
192 status = decode_stateid(xdr, &args->stateid); 192 status = decode_stateid(xdr, &args->stateid);
@@ -204,9 +204,9 @@ out:
204 return status; 204 return status;
205} 205}
206 206
207static unsigned encode_string(struct xdr_stream *xdr, unsigned int len, const char *str) 207static __be32 encode_string(struct xdr_stream *xdr, unsigned int len, const char *str)
208{ 208{
209 uint32_t *p; 209 __be32 *p;
210 210
211 p = xdr_reserve_space(xdr, 4 + len); 211 p = xdr_reserve_space(xdr, 4 + len);
212 if (unlikely(p == NULL)) 212 if (unlikely(p == NULL))
@@ -217,10 +217,10 @@ static unsigned encode_string(struct xdr_stream *xdr, unsigned int len, const ch
217 217
218#define CB_SUPPORTED_ATTR0 (FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE) 218#define CB_SUPPORTED_ATTR0 (FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE)
219#define CB_SUPPORTED_ATTR1 (FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY) 219#define CB_SUPPORTED_ATTR1 (FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY)
220static unsigned encode_attr_bitmap(struct xdr_stream *xdr, const uint32_t *bitmap, uint32_t **savep) 220static __be32 encode_attr_bitmap(struct xdr_stream *xdr, const uint32_t *bitmap, __be32 **savep)
221{ 221{
222 uint32_t bm[2]; 222 __be32 bm[2];
223 uint32_t *p; 223 __be32 *p;
224 224
225 bm[0] = htonl(bitmap[0] & CB_SUPPORTED_ATTR0); 225 bm[0] = htonl(bitmap[0] & CB_SUPPORTED_ATTR0);
226 bm[1] = htonl(bitmap[1] & CB_SUPPORTED_ATTR1); 226 bm[1] = htonl(bitmap[1] & CB_SUPPORTED_ATTR1);
@@ -247,9 +247,9 @@ static unsigned encode_attr_bitmap(struct xdr_stream *xdr, const uint32_t *bitma
247 return 0; 247 return 0;
248} 248}
249 249
250static unsigned encode_attr_change(struct xdr_stream *xdr, const uint32_t *bitmap, uint64_t change) 250static __be32 encode_attr_change(struct xdr_stream *xdr, const uint32_t *bitmap, uint64_t change)
251{ 251{
252 uint32_t *p; 252 __be32 *p;
253 253
254 if (!(bitmap[0] & FATTR4_WORD0_CHANGE)) 254 if (!(bitmap[0] & FATTR4_WORD0_CHANGE))
255 return 0; 255 return 0;
@@ -260,9 +260,9 @@ static unsigned encode_attr_change(struct xdr_stream *xdr, const uint32_t *bitma
260 return 0; 260 return 0;
261} 261}
262 262
263static unsigned encode_attr_size(struct xdr_stream *xdr, const uint32_t *bitmap, uint64_t size) 263static __be32 encode_attr_size(struct xdr_stream *xdr, const uint32_t *bitmap, uint64_t size)
264{ 264{
265 uint32_t *p; 265 __be32 *p;
266 266
267 if (!(bitmap[0] & FATTR4_WORD0_SIZE)) 267 if (!(bitmap[0] & FATTR4_WORD0_SIZE))
268 return 0; 268 return 0;
@@ -273,9 +273,9 @@ static unsigned encode_attr_size(struct xdr_stream *xdr, const uint32_t *bitmap,
273 return 0; 273 return 0;
274} 274}
275 275
276static unsigned encode_attr_time(struct xdr_stream *xdr, const struct timespec *time) 276static __be32 encode_attr_time(struct xdr_stream *xdr, const struct timespec *time)
277{ 277{
278 uint32_t *p; 278 __be32 *p;
279 279
280 p = xdr_reserve_space(xdr, 12); 280 p = xdr_reserve_space(xdr, 12);
281 if (unlikely(p == 0)) 281 if (unlikely(p == 0))
@@ -285,23 +285,23 @@ static unsigned encode_attr_time(struct xdr_stream *xdr, const struct timespec *
285 return 0; 285 return 0;
286} 286}
287 287
288static unsigned encode_attr_ctime(struct xdr_stream *xdr, const uint32_t *bitmap, const struct timespec *time) 288static __be32 encode_attr_ctime(struct xdr_stream *xdr, const uint32_t *bitmap, const struct timespec *time)
289{ 289{
290 if (!(bitmap[1] & FATTR4_WORD1_TIME_METADATA)) 290 if (!(bitmap[1] & FATTR4_WORD1_TIME_METADATA))
291 return 0; 291 return 0;
292 return encode_attr_time(xdr,time); 292 return encode_attr_time(xdr,time);
293} 293}
294 294
295static unsigned encode_attr_mtime(struct xdr_stream *xdr, const uint32_t *bitmap, const struct timespec *time) 295static __be32 encode_attr_mtime(struct xdr_stream *xdr, const uint32_t *bitmap, const struct timespec *time)
296{ 296{
297 if (!(bitmap[1] & FATTR4_WORD1_TIME_MODIFY)) 297 if (!(bitmap[1] & FATTR4_WORD1_TIME_MODIFY))
298 return 0; 298 return 0;
299 return encode_attr_time(xdr,time); 299 return encode_attr_time(xdr,time);
300} 300}
301 301
302static unsigned encode_compound_hdr_res(struct xdr_stream *xdr, struct cb_compound_hdr_res *hdr) 302static __be32 encode_compound_hdr_res(struct xdr_stream *xdr, struct cb_compound_hdr_res *hdr)
303{ 303{
304 unsigned status; 304 __be32 status;
305 305
306 hdr->status = xdr_reserve_space(xdr, 4); 306 hdr->status = xdr_reserve_space(xdr, 4);
307 if (unlikely(hdr->status == NULL)) 307 if (unlikely(hdr->status == NULL))
@@ -315,9 +315,9 @@ static unsigned encode_compound_hdr_res(struct xdr_stream *xdr, struct cb_compou
315 return 0; 315 return 0;
316} 316}
317 317
318static unsigned encode_op_hdr(struct xdr_stream *xdr, uint32_t op, uint32_t res) 318static __be32 encode_op_hdr(struct xdr_stream *xdr, uint32_t op, __be32 res)
319{ 319{
320 uint32_t *p; 320 __be32 *p;
321 321
322 p = xdr_reserve_space(xdr, 8); 322 p = xdr_reserve_space(xdr, 8);
323 if (unlikely(p == NULL)) 323 if (unlikely(p == NULL))
@@ -327,10 +327,10 @@ static unsigned encode_op_hdr(struct xdr_stream *xdr, uint32_t op, uint32_t res)
327 return 0; 327 return 0;
328} 328}
329 329
330static unsigned encode_getattr_res(struct svc_rqst *rqstp, struct xdr_stream *xdr, const struct cb_getattrres *res) 330static __be32 encode_getattr_res(struct svc_rqst *rqstp, struct xdr_stream *xdr, const struct cb_getattrres *res)
331{ 331{
332 uint32_t *savep = NULL; 332 __be32 *savep = NULL;
333 unsigned status = res->status; 333 __be32 status = res->status;
334 334
335 if (unlikely(status != 0)) 335 if (unlikely(status != 0))
336 goto out; 336 goto out;
@@ -353,15 +353,15 @@ out:
353 return status; 353 return status;
354} 354}
355 355
356static unsigned process_op(struct svc_rqst *rqstp, 356static __be32 process_op(struct svc_rqst *rqstp,
357 struct xdr_stream *xdr_in, void *argp, 357 struct xdr_stream *xdr_in, void *argp,
358 struct xdr_stream *xdr_out, void *resp) 358 struct xdr_stream *xdr_out, void *resp)
359{ 359{
360 struct callback_op *op = &callback_ops[0]; 360 struct callback_op *op = &callback_ops[0];
361 unsigned int op_nr = OP_CB_ILLEGAL; 361 unsigned int op_nr = OP_CB_ILLEGAL;
362 unsigned int status = 0; 362 __be32 status = 0;
363 long maxlen; 363 long maxlen;
364 unsigned res; 364 __be32 res;
365 365
366 dprintk("%s: start\n", __FUNCTION__); 366 dprintk("%s: start\n", __FUNCTION__);
367 status = decode_op_hdr(xdr_in, &op_nr); 367 status = decode_op_hdr(xdr_in, &op_nr);
@@ -399,20 +399,20 @@ static unsigned process_op(struct svc_rqst *rqstp,
399/* 399/*
400 * Decode, process and encode a COMPOUND 400 * Decode, process and encode a COMPOUND
401 */ 401 */
402static int nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *resp) 402static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *resp)
403{ 403{
404 struct cb_compound_hdr_arg hdr_arg; 404 struct cb_compound_hdr_arg hdr_arg;
405 struct cb_compound_hdr_res hdr_res; 405 struct cb_compound_hdr_res hdr_res;
406 struct xdr_stream xdr_in, xdr_out; 406 struct xdr_stream xdr_in, xdr_out;
407 uint32_t *p; 407 __be32 *p;
408 unsigned int status; 408 __be32 status;
409 unsigned int nops = 1; 409 unsigned int nops = 1;
410 410
411 dprintk("%s: start\n", __FUNCTION__); 411 dprintk("%s: start\n", __FUNCTION__);
412 412
413 xdr_init_decode(&xdr_in, &rqstp->rq_arg, rqstp->rq_arg.head[0].iov_base); 413 xdr_init_decode(&xdr_in, &rqstp->rq_arg, rqstp->rq_arg.head[0].iov_base);
414 414
415 p = (uint32_t*)((char *)rqstp->rq_res.head[0].iov_base + rqstp->rq_res.head[0].iov_len); 415 p = (__be32*)((char *)rqstp->rq_res.head[0].iov_base + rqstp->rq_res.head[0].iov_len);
416 xdr_init_encode(&xdr_out, &rqstp->rq_res, p); 416 xdr_init_encode(&xdr_out, &rqstp->rq_res, p);
417 417
418 decode_compound_hdr_arg(&xdr_in, &hdr_arg); 418 decode_compound_hdr_arg(&xdr_in, &hdr_arg);
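The callback_xdr.c hunks above are one mechanical change: XDR status values and buffer pointers move from unsigned/uint32_t to __be32, the sparse-annotated big-endian type. The values were already in wire order (they are produced by htonl() and only ever compared against 0, which is byte-order invariant), so the generated code is identical; the gain is that sparse run with endian checking can now flag any place that mixes host-order and wire-order words. A minimal sketch of the idiom, assuming the usual kernel headers (decode_example is a made-up name, not part of the patch):

#include <linux/types.h>	/* __be32 */
#include <linux/nfs4.h>		/* NFS4ERR_RESOURCE */

/* A decoder in the same style as the ones above: the wire word is held
 * as __be32 and converted exactly once with ntohl(); the status travels
 * back as __be32, built with htonl(). */
static __be32 decode_example(const __be32 *p, unsigned int *op)
{
	if (p == NULL)
		return htonl(NFS4ERR_RESOURCE);	/* stays in wire order */
	*op = ntohl(*p);			/* the one host conversion */
	return 0;				/* 0 is order-invariant */
}

With plain unsigned, nothing distinguishes NFS4ERR_RESOURCE from htonl(NFS4ERR_RESOURCE); with __be32 the unconverted variant becomes a sparse warning instead of a silently byte-swapped wire value.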
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 8106f3b29e4a..5fea638743e4 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -10,7 +10,6 @@
10 */ 10 */
11 11
12 12
13#include <linux/config.h>
14#include <linux/module.h> 13#include <linux/module.h>
15#include <linux/init.h> 14#include <linux/init.h>
16 15
@@ -233,11 +232,15 @@ void nfs_put_client(struct nfs_client *clp)
233 * Find a client by address 232 * Find a client by address
234 * - caller must hold nfs_client_lock 233 * - caller must hold nfs_client_lock
235 */ 234 */
236static struct nfs_client *__nfs_find_client(const struct sockaddr_in *addr, int nfsversion) 235static struct nfs_client *__nfs_find_client(const struct sockaddr_in *addr, int nfsversion, int match_port)
237{ 236{
238 struct nfs_client *clp; 237 struct nfs_client *clp;
239 238
240 list_for_each_entry(clp, &nfs_client_list, cl_share_link) { 239 list_for_each_entry(clp, &nfs_client_list, cl_share_link) {
240 /* Don't match clients that failed to initialise properly */
241 if (clp->cl_cons_state < 0)
242 continue;
243
241 /* Different NFS versions cannot share the same nfs_client */ 244 /* Different NFS versions cannot share the same nfs_client */
242 if (clp->cl_nfsversion != nfsversion) 245 if (clp->cl_nfsversion != nfsversion)
243 continue; 246 continue;
@@ -246,7 +249,7 @@ static struct nfs_client *__nfs_find_client(const struct sockaddr_in *addr, int
246 sizeof(clp->cl_addr.sin_addr)) != 0) 249 sizeof(clp->cl_addr.sin_addr)) != 0)
247 continue; 250 continue;
248 251
249 if (clp->cl_addr.sin_port == addr->sin_port) 252 if (!match_port || clp->cl_addr.sin_port == addr->sin_port)
250 goto found; 253 goto found;
251 } 254 }
252 255
@@ -266,11 +269,12 @@ struct nfs_client *nfs_find_client(const struct sockaddr_in *addr, int nfsversio
266 struct nfs_client *clp; 269 struct nfs_client *clp;
267 270
268 spin_lock(&nfs_client_lock); 271 spin_lock(&nfs_client_lock);
269 clp = __nfs_find_client(addr, nfsversion); 272 clp = __nfs_find_client(addr, nfsversion, 0);
270 spin_unlock(&nfs_client_lock); 273 spin_unlock(&nfs_client_lock);
271 274 if (clp != NULL && clp->cl_cons_state != NFS_CS_READY) {
272 BUG_ON(clp && clp->cl_cons_state == 0); 275 nfs_put_client(clp);
273 276 clp = NULL;
277 }
274 return clp; 278 return clp;
275} 279}
276 280
@@ -293,7 +297,7 @@ static struct nfs_client *nfs_get_client(const char *hostname,
293 do { 297 do {
294 spin_lock(&nfs_client_lock); 298 spin_lock(&nfs_client_lock);
295 299
296 clp = __nfs_find_client(addr, nfsversion); 300 clp = __nfs_find_client(addr, nfsversion, 1);
297 if (clp) 301 if (clp)
298 goto found_client; 302 goto found_client;
299 if (new) 303 if (new)
@@ -323,25 +327,11 @@ found_client:
323 if (new) 327 if (new)
324 nfs_free_client(new); 328 nfs_free_client(new);
325 329
326 if (clp->cl_cons_state == NFS_CS_INITING) { 330 error = wait_event_interruptible(nfs_client_active_wq,
327 DECLARE_WAITQUEUE(myself, current); 331 clp->cl_cons_state != NFS_CS_INITING);
328 332 if (error < 0) {
329 add_wait_queue(&nfs_client_active_wq, &myself); 333 nfs_put_client(clp);
330 334 return ERR_PTR(-ERESTARTSYS);
331 for (;;) {
332 set_current_state(TASK_INTERRUPTIBLE);
333 if (signal_pending(current) ||
334 clp->cl_cons_state > NFS_CS_READY)
335 break;
336 schedule();
337 }
338
339 remove_wait_queue(&nfs_client_active_wq, &myself);
340
341 if (signal_pending(current)) {
342 nfs_put_client(clp);
343 return ERR_PTR(-ERESTARTSYS);
344 }
345 } 335 }
346 336
347 if (clp->cl_cons_state < NFS_CS_READY) { 337 if (clp->cl_cons_state < NFS_CS_READY) {
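The nfs_get_client() hunk swaps an open-coded wait loop for wait_event_interruptible(), which packages the same prepare/check/schedule cycle and reports -ERESTARTSYS itself when a signal arrives. Roughly what the call expands to (a simplified sketch; the real macro in <linux/wait.h> also deals with memory barriers and exclusive waiters):

/* Approximate shape of
 *	error = wait_event_interruptible(nfs_client_active_wq,
 *			clp->cl_cons_state != NFS_CS_INITING);
 */
long error = 0;
DEFINE_WAIT(wait);

for (;;) {
	prepare_to_wait(&nfs_client_active_wq, &wait, TASK_INTERRUPTIBLE);
	if (clp->cl_cons_state != NFS_CS_INITING)	/* the condition */
		break;
	if (signal_pending(current)) {
		error = -ERESTARTSYS;
		break;
	}
	schedule();
}
finish_wait(&nfs_client_active_wq, &wait);

Note that the wait condition is now "no longer initialising"; clients whose initialisation failed (negative cl_cons_state) are sorted out by the existing cl_cons_state < NFS_CS_READY check that follows the wait.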
@@ -864,6 +854,7 @@ error:
864 */ 854 */
865static int nfs4_init_client(struct nfs_client *clp, 855static int nfs4_init_client(struct nfs_client *clp,
866 int proto, int timeo, int retrans, 856 int proto, int timeo, int retrans,
857 const char *ip_addr,
867 rpc_authflavor_t authflavour) 858 rpc_authflavor_t authflavour)
868{ 859{
869 int error; 860 int error;
@@ -880,6 +871,7 @@ static int nfs4_init_client(struct nfs_client *clp,
880 error = nfs_create_rpc_client(clp, proto, timeo, retrans, authflavour); 871 error = nfs_create_rpc_client(clp, proto, timeo, retrans, authflavour);
881 if (error < 0) 872 if (error < 0)
882 goto error; 873 goto error;
874 memcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr));
883 875
884 error = nfs_idmap_new(clp); 876 error = nfs_idmap_new(clp);
885 if (error < 0) { 877 if (error < 0) {
@@ -903,6 +895,7 @@ error:
903 */ 895 */
904static int nfs4_set_client(struct nfs_server *server, 896static int nfs4_set_client(struct nfs_server *server,
905 const char *hostname, const struct sockaddr_in *addr, 897 const char *hostname, const struct sockaddr_in *addr,
898 const char *ip_addr,
906 rpc_authflavor_t authflavour, 899 rpc_authflavor_t authflavour,
907 int proto, int timeo, int retrans) 900 int proto, int timeo, int retrans)
908{ 901{
@@ -917,7 +910,7 @@ static int nfs4_set_client(struct nfs_server *server,
917 error = PTR_ERR(clp); 910 error = PTR_ERR(clp);
918 goto error; 911 goto error;
919 } 912 }
920 error = nfs4_init_client(clp, proto, timeo, retrans, authflavour); 913 error = nfs4_init_client(clp, proto, timeo, retrans, ip_addr, authflavour);
921 if (error < 0) 914 if (error < 0)
922 goto error_put; 915 goto error_put;
923 916
@@ -986,7 +979,7 @@ struct nfs_server *nfs4_create_server(const struct nfs4_mount_data *data,
986 return ERR_PTR(-ENOMEM); 979 return ERR_PTR(-ENOMEM);
987 980
988 /* Get a client record */ 981 /* Get a client record */
989 error = nfs4_set_client(server, hostname, addr, authflavour, 982 error = nfs4_set_client(server, hostname, addr, ip_addr, authflavour,
990 data->proto, data->timeo, data->retrans); 983 data->proto, data->timeo, data->retrans);
991 if (error < 0) 984 if (error < 0)
992 goto error; 985 goto error;
@@ -1056,6 +1049,7 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
1056 /* Get a client representation. 1049 /* Get a client representation.
1057 * Note: NFSv4 always uses TCP, */ 1050 * Note: NFSv4 always uses TCP, */
1058 error = nfs4_set_client(server, data->hostname, data->addr, 1051 error = nfs4_set_client(server, data->hostname, data->addr,
1052 parent_client->cl_ipaddr,
1059 data->authflavor, 1053 data->authflavor,
1060 parent_server->client->cl_xprt->prot, 1054 parent_server->client->cl_xprt->prot,
1061 parent_client->retrans_timeo, 1055 parent_client->retrans_timeo,
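The remaining client.c hunks thread a new ip_addr argument from the mount paths into nfs4_init_client(), which copies it into clp->cl_ipaddr for later use as the client's own callback address. A referral mount passes parent_client->cl_ipaddr down, so a submount advertises the same address as the parent it was spawned from. One detail worth noting: the memcpy() copies a full sizeof(clp->cl_ipaddr) bytes, which relies on the source buffer being at least that large; that clearly holds for parent_client->cl_ipaddr, which is the same array in another nfs_client.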
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 481f8892a919..4133ef5264e5 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -142,12 +142,12 @@ nfs_opendir(struct inode *inode, struct file *filp)
142 return res; 142 return res;
143} 143}
144 144
145typedef u32 * (*decode_dirent_t)(u32 *, struct nfs_entry *, int); 145typedef __be32 * (*decode_dirent_t)(__be32 *, struct nfs_entry *, int);
146typedef struct { 146typedef struct {
147 struct file *file; 147 struct file *file;
148 struct page *page; 148 struct page *page;
149 unsigned long page_index; 149 unsigned long page_index;
150 u32 *ptr; 150 __be32 *ptr;
151 u64 *dir_cookie; 151 u64 *dir_cookie;
152 loff_t current_index; 152 loff_t current_index;
153 struct nfs_entry *entry; 153 struct nfs_entry *entry;
@@ -203,8 +203,10 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page)
203 * Note: assumes we have exclusive access to this mapping either 203 * Note: assumes we have exclusive access to this mapping either
204 * through inode->i_mutex or some other mechanism. 204 * through inode->i_mutex or some other mechanism.
205 */ 205 */
206 if (page->index == 0) 206 if (page->index == 0 && invalidate_inode_pages2_range(inode->i_mapping, PAGE_CACHE_SIZE, -1) < 0) {
207 invalidate_inode_pages2_range(inode->i_mapping, PAGE_CACHE_SIZE, -1); 207 /* Should never happen */
208 nfs_zap_mapping(inode, inode->i_mapping);
209 }
208 unlock_page(page); 210 unlock_page(page);
209 return 0; 211 return 0;
210 error: 212 error:
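nfs_readdir_filler() now checks the result of invalidate_inode_pages2_range() when filling page 0 forces the stale readdir pages behind it to be dropped. The failure is not expected (the surrounding comment notes that the caller has exclusive access to the mapping), but if it ever happens the fallback is the new nfs_zap_mapping() helper from the inode.c hunk further down: the whole mapping is flagged invalid, so the stale pages are purged on the next revalidation instead of being silently served.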
@@ -218,7 +220,7 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page)
218static inline 220static inline
219int dir_decode(nfs_readdir_descriptor_t *desc) 221int dir_decode(nfs_readdir_descriptor_t *desc)
220{ 222{
221 u32 *p = desc->ptr; 223 __be32 *p = desc->ptr;
222 p = desc->decode(p, desc->entry, desc->plus); 224 p = desc->decode(p, desc->entry, desc->plus);
223 if (IS_ERR(p)) 225 if (IS_ERR(p))
224 return PTR_ERR(p); 226 return PTR_ERR(p);
@@ -1517,8 +1519,8 @@ static int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *sym
1517 pagevec_init(&lru_pvec, 0); 1519 pagevec_init(&lru_pvec, 0);
1518 if (!add_to_page_cache(page, dentry->d_inode->i_mapping, 0, 1520 if (!add_to_page_cache(page, dentry->d_inode->i_mapping, 0,
1519 GFP_KERNEL)) { 1521 GFP_KERNEL)) {
1520 if (!pagevec_add(&lru_pvec, page)) 1522 pagevec_add(&lru_pvec, page);
1521 __pagevec_lru_add(&lru_pvec); 1523 pagevec_lru_add(&lru_pvec);
1522 SetPageUptodate(page); 1524 SetPageUptodate(page);
1523 unlock_page(page); 1525 unlock_page(page);
1524 } else 1526 } else
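In nfs_symlink(), the LRU handling of the freshly filled symlink page changes. pagevec_add() returns the number of slots still free, so the old "if (!pagevec_add(...)) __pagevec_lru_add(...)" idiom only drained a completely full pagevec; with the single page queued here it never fired, leaving the page off the LRU. The replacement drains unconditionally through pagevec_lru_add(), which copes with a partially filled pagevec. The fragment in isolation (page being the symlink page already inserted into the page cache):

#include <linux/pagevec.h>

struct pagevec lru_pvec;

pagevec_init(&lru_pvec, 0);	/* 0: not a batch of cache-hot pages */
/*
 * The batching idiom
 *	if (!pagevec_add(&lru_pvec, page))
 *		__pagevec_lru_add(&lru_pvec);
 * only drains once PAGEVEC_SIZE pages are queued, so a lone page
 * never reached the LRU this way.  Queue it and drain explicitly:
 */
pagevec_add(&lru_pvec, page);
pagevec_lru_add(&lru_pvec);	/* flushes the pagevec even when partial */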
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 9f7f8b9ea1e2..bdfabf854a51 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -497,6 +497,7 @@ static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode
497 if (dreq->commit_data != NULL) 497 if (dreq->commit_data != NULL)
498 nfs_commit_free(dreq->commit_data); 498 nfs_commit_free(dreq->commit_data);
499 nfs_direct_free_writedata(dreq); 499 nfs_direct_free_writedata(dreq);
500 nfs_zap_mapping(inode, inode->i_mapping);
500 nfs_direct_complete(dreq); 501 nfs_direct_complete(dreq);
501 } 502 }
502} 503}
@@ -517,6 +518,7 @@ static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode
517{ 518{
518 nfs_end_data_update(inode); 519 nfs_end_data_update(inode);
519 nfs_direct_free_writedata(dreq); 520 nfs_direct_free_writedata(dreq);
521 nfs_zap_mapping(inode, inode->i_mapping);
520 nfs_direct_complete(dreq); 522 nfs_direct_complete(dreq);
521} 523}
522#endif 524#endif
@@ -532,10 +534,12 @@ static void nfs_direct_write_result(struct rpc_task *task, void *calldata)
532 534
533 spin_lock(&dreq->lock); 535 spin_lock(&dreq->lock);
534 536
535 if (likely(status >= 0)) 537 if (unlikely(status < 0)) {
536 dreq->count += data->res.count; 538 dreq->error = status;
537 else 539 goto out_unlock;
538 dreq->error = task->tk_status; 540 }
541
542 dreq->count += data->res.count;
539 543
540 if (data->res.verf->committed != NFS_FILE_SYNC) { 544 if (data->res.verf->committed != NFS_FILE_SYNC) {
541 switch (dreq->flags) { 545 switch (dreq->flags) {
@@ -550,7 +554,7 @@ static void nfs_direct_write_result(struct rpc_task *task, void *calldata)
550 } 554 }
551 } 555 }
552 } 556 }
553 557out_unlock:
554 spin_unlock(&dreq->lock); 558 spin_unlock(&dreq->lock);
555} 559}
556 560
@@ -828,17 +832,6 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
828 832
829 retval = nfs_direct_write(iocb, (unsigned long) buf, count, pos); 833 retval = nfs_direct_write(iocb, (unsigned long) buf, count, pos);
830 834
831 /*
832 * XXX: nfs_end_data_update() already ensures this file's
833 * cached data is subsequently invalidated. Do we really
834 * need to call invalidate_inode_pages2() again here?
835 *
836 * For aio writes, this invalidation will almost certainly
837 * occur before the writes complete. Kind of racey.
838 */
839 if (mapping->nrpages)
840 invalidate_inode_pages2(mapping);
841
842 if (retval > 0) 835 if (retval > 0)
843 iocb->ki_pos = pos + retval; 836 iocb->ki_pos = pos + retval;
844 837
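Taken together, the direct.c hunks move page-cache invalidation for O_DIRECT writes from submission to completion. The block removed from nfs_file_direct_write() ran invalidate_inode_pages2() while aio writes could still be in flight, exactly the race the deleted XXX comment complains about. Instead, both variants of nfs_direct_write_complete() now call nfs_zap_mapping(), so the mapping is flagged stale only once the data has reached the server and is purged lazily on the next revalidation. The reshuffle in nfs_direct_write_result() serves the same cleanup: an RPC error records dreq->error and jumps past the verifier bookkeeping via the new out_unlock label instead of falling through it.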
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index 76b08ae9ed82..20c6f39ea38a 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -9,7 +9,6 @@
9 * 2 of the License, or (at your option) any later version. 9 * 2 of the License, or (at your option) any later version.
10 */ 10 */
11 11
12#include <linux/config.h>
13#include <linux/module.h> 12#include <linux/module.h>
14#include <linux/init.h> 13#include <linux/init.h>
15 14
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index bc9376ca86cd..08cc4c5919ab 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -131,6 +131,15 @@ void nfs_zap_caches(struct inode *inode)
131 spin_unlock(&inode->i_lock); 131 spin_unlock(&inode->i_lock);
132} 132}
133 133
134void nfs_zap_mapping(struct inode *inode, struct address_space *mapping)
135{
136 if (mapping->nrpages != 0) {
137 spin_lock(&inode->i_lock);
138 NFS_I(inode)->cache_validity |= NFS_INO_INVALID_DATA;
139 spin_unlock(&inode->i_lock);
140 }
141}
142
134static void nfs_zap_acl_cache(struct inode *inode) 143static void nfs_zap_acl_cache(struct inode *inode)
135{ 144{
136 void (*clear_acl_cache)(struct inode *); 145 void (*clear_acl_cache)(struct inode *);
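nfs_zap_mapping() is deliberately small: take i_lock, set NFS_INO_INVALID_DATA, return. The actual page-cache purge is deferred to the next nfs_revalidate_mapping() call, which is what makes the helper safe in the direct-I/O completion and readdir paths above, where calling invalidate_inode_pages2() directly would either race with in-flight I/O or have nowhere to report failure. A usage sketch, assuming the structures from this file:

/* Producer: a completion path that knows the cached pages are stale
 * but must not purge them itself. */
nfs_zap_mapping(inode, inode->i_mapping);

/* Consumer (condensed from nfs_revalidate_mapping() below): the flag
 * is only cleared once the invalidation really succeeded. */
if (NFS_I(inode)->cache_validity & NFS_INO_INVALID_DATA) {
	ret = invalidate_inode_pages2(inode->i_mapping);
	if (ret == 0) {
		spin_lock(&inode->i_lock);
		NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_DATA;
		spin_unlock(&inode->i_lock);
	}
}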
@@ -574,7 +583,7 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
574 583
575 nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE); 584 nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE);
576 lock_kernel(); 585 lock_kernel();
577 if (!inode || is_bad_inode(inode)) 586 if (is_bad_inode(inode))
578 goto out_nowait; 587 goto out_nowait;
579 if (NFS_STALE(inode)) 588 if (NFS_STALE(inode))
580 goto out_nowait; 589 goto out_nowait;
@@ -671,13 +680,20 @@ int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
671 if ((nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE) 680 if ((nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE)
672 || nfs_attribute_timeout(inode)) 681 || nfs_attribute_timeout(inode))
673 ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode); 682 ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode);
683 if (ret < 0)
684 goto out;
674 685
675 if (nfsi->cache_validity & NFS_INO_INVALID_DATA) { 686 if (nfsi->cache_validity & NFS_INO_INVALID_DATA) {
676 nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE); 687 if (mapping->nrpages != 0) {
677 if (S_ISREG(inode->i_mode)) 688 if (S_ISREG(inode->i_mode)) {
678 nfs_sync_mapping(mapping); 689 ret = nfs_sync_mapping(mapping);
679 invalidate_inode_pages2(mapping); 690 if (ret < 0)
680 691 goto out;
692 }
693 ret = invalidate_inode_pages2(mapping);
694 if (ret < 0)
695 goto out;
696 }
681 spin_lock(&inode->i_lock); 697 spin_lock(&inode->i_lock);
682 nfsi->cache_validity &= ~NFS_INO_INVALID_DATA; 698 nfsi->cache_validity &= ~NFS_INO_INVALID_DATA;
683 if (S_ISDIR(inode->i_mode)) { 699 if (S_ISDIR(inode->i_mode)) {
@@ -687,10 +703,12 @@ int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
687 } 703 }
688 spin_unlock(&inode->i_lock); 704 spin_unlock(&inode->i_lock);
689 705
706 nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE);
690 dfprintk(PAGECACHE, "NFS: (%s/%Ld) data cache invalidated\n", 707 dfprintk(PAGECACHE, "NFS: (%s/%Ld) data cache invalidated\n",
691 inode->i_sb->s_id, 708 inode->i_sb->s_id,
692 (long long)NFS_FILEID(inode)); 709 (long long)NFS_FILEID(inode));
693 } 710 }
711out:
694 return ret; 712 return ret;
695} 713}
696 714
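The nfs_revalidate_mapping() hunks turn a best-effort routine into one that reports failure: errors from __nfs_revalidate_inode(), nfs_sync_mapping() and invalidate_inode_pages2() now short-circuit to the new out label, and NFS_INO_INVALID_DATA is cleared only after a successful purge, so a failed pass is simply retried on the next call. Condensed, the resulting control flow looks like this (a sketch; needs_reval stands in for the REVAL_PAGECACHE/attribute-timeout test):

int ret = 0;

if (needs_reval)
	ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode);
if (ret < 0)
	goto out;

if (nfsi->cache_validity & NFS_INO_INVALID_DATA) {
	if (mapping->nrpages != 0) {
		if (S_ISREG(inode->i_mode)) {
			ret = nfs_sync_mapping(mapping);	/* flush first */
			if (ret < 0)
				goto out;
		}
		ret = invalidate_inode_pages2(mapping);		/* then drop */
		if (ret < 0)
			goto out;
	}
	/* clear NFS_INO_INVALID_DATA under i_lock only at this point */
}
out:
	return ret;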
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index bea0b016bd70..d205466233f6 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -93,15 +93,15 @@ extern void nfs_destroy_directcache(void);
93/* nfs2xdr.c */ 93/* nfs2xdr.c */
94extern int nfs_stat_to_errno(int); 94extern int nfs_stat_to_errno(int);
95extern struct rpc_procinfo nfs_procedures[]; 95extern struct rpc_procinfo nfs_procedures[];
96extern u32 * nfs_decode_dirent(u32 *, struct nfs_entry *, int); 96extern __be32 * nfs_decode_dirent(__be32 *, struct nfs_entry *, int);
97 97
98/* nfs3xdr.c */ 98/* nfs3xdr.c */
99extern struct rpc_procinfo nfs3_procedures[]; 99extern struct rpc_procinfo nfs3_procedures[];
100extern u32 *nfs3_decode_dirent(u32 *, struct nfs_entry *, int); 100extern __be32 *nfs3_decode_dirent(__be32 *, struct nfs_entry *, int);
101 101
102/* nfs4xdr.c */ 102/* nfs4xdr.c */
103#ifdef CONFIG_NFS_V4 103#ifdef CONFIG_NFS_V4
104extern u32 *nfs4_decode_dirent(u32 *p, struct nfs_entry *entry, int plus); 104extern __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus);
105#endif 105#endif
106 106
107/* nfs4proc.c */ 107/* nfs4proc.c */
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index d507b021207f..f75fe72b4160 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -95,7 +95,7 @@ mnt_create(char *hostname, struct sockaddr_in *srvaddr, int version,
95 * XDR encode/decode functions for MOUNT 95 * XDR encode/decode functions for MOUNT
96 */ 96 */
97static int 97static int
98xdr_encode_dirpath(struct rpc_rqst *req, u32 *p, const char *path) 98xdr_encode_dirpath(struct rpc_rqst *req, __be32 *p, const char *path)
99{ 99{
100 p = xdr_encode_string(p, path); 100 p = xdr_encode_string(p, path);
101 101
@@ -104,7 +104,7 @@ xdr_encode_dirpath(struct rpc_rqst *req, u32 *p, const char *path)
104} 104}
105 105
106static int 106static int
107xdr_decode_fhstatus(struct rpc_rqst *req, u32 *p, struct mnt_fhstatus *res) 107xdr_decode_fhstatus(struct rpc_rqst *req, __be32 *p, struct mnt_fhstatus *res)
108{ 108{
109 struct nfs_fh *fh = res->fh; 109 struct nfs_fh *fh = res->fh;
110 110
@@ -116,7 +116,7 @@ xdr_decode_fhstatus(struct rpc_rqst *req, u32 *p, struct mnt_fhstatus *res)
116} 116}
117 117
118static int 118static int
119xdr_decode_fhstatus3(struct rpc_rqst *req, u32 *p, struct mnt_fhstatus *res) 119xdr_decode_fhstatus3(struct rpc_rqst *req, __be32 *p, struct mnt_fhstatus *res)
120{ 120{
121 struct nfs_fh *fh = res->fh; 121 struct nfs_fh *fh = res->fh;
122 122
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index 60408646176b..ec1114b33d89 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -7,8 +7,6 @@
7 * NFS namespace 7 * NFS namespace
8 */ 8 */
9 9
10#include <linux/config.h>
11
12#include <linux/dcache.h> 10#include <linux/dcache.h>
13#include <linux/mount.h> 11#include <linux/mount.h>
14#include <linux/namei.h> 12#include <linux/namei.h>
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index b49501fc0a79..3be4e72a0227 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -66,15 +66,15 @@
66/* 66/*
67 * Common NFS XDR functions as inlines 67 * Common NFS XDR functions as inlines
68 */ 68 */
69static inline u32 * 69static inline __be32 *
70xdr_encode_fhandle(u32 *p, struct nfs_fh *fhandle) 70xdr_encode_fhandle(__be32 *p, struct nfs_fh *fhandle)
71{ 71{
72 memcpy(p, fhandle->data, NFS2_FHSIZE); 72 memcpy(p, fhandle->data, NFS2_FHSIZE);
73 return p + XDR_QUADLEN(NFS2_FHSIZE); 73 return p + XDR_QUADLEN(NFS2_FHSIZE);
74} 74}
75 75
76static inline u32 * 76static inline __be32 *
77xdr_decode_fhandle(u32 *p, struct nfs_fh *fhandle) 77xdr_decode_fhandle(__be32 *p, struct nfs_fh *fhandle)
78{ 78{
79 /* NFSv2 handles have a fixed length */ 79 /* NFSv2 handles have a fixed length */
80 fhandle->size = NFS2_FHSIZE; 80 fhandle->size = NFS2_FHSIZE;
@@ -82,8 +82,8 @@ xdr_decode_fhandle(u32 *p, struct nfs_fh *fhandle)
82 return p + XDR_QUADLEN(NFS2_FHSIZE); 82 return p + XDR_QUADLEN(NFS2_FHSIZE);
83} 83}
84 84
85static inline u32* 85static inline __be32*
86xdr_encode_time(u32 *p, struct timespec *timep) 86xdr_encode_time(__be32 *p, struct timespec *timep)
87{ 87{
88 *p++ = htonl(timep->tv_sec); 88 *p++ = htonl(timep->tv_sec);
89 /* Convert nanoseconds into microseconds */ 89 /* Convert nanoseconds into microseconds */
@@ -91,8 +91,8 @@ xdr_encode_time(u32 *p, struct timespec *timep)
91 return p; 91 return p;
92} 92}
93 93
94static inline u32* 94static inline __be32*
95xdr_encode_current_server_time(u32 *p, struct timespec *timep) 95xdr_encode_current_server_time(__be32 *p, struct timespec *timep)
96{ 96{
97 /* 97 /*
98 * Passing the invalid value useconds=1000000 is a 98 * Passing the invalid value useconds=1000000 is a
@@ -108,8 +108,8 @@ xdr_encode_current_server_time(u32 *p, struct timespec *timep)
108 return p; 108 return p;
109} 109}
110 110
111static inline u32* 111static inline __be32*
112xdr_decode_time(u32 *p, struct timespec *timep) 112xdr_decode_time(__be32 *p, struct timespec *timep)
113{ 113{
114 timep->tv_sec = ntohl(*p++); 114 timep->tv_sec = ntohl(*p++);
115 /* Convert microseconds into nanoseconds */ 115 /* Convert microseconds into nanoseconds */
@@ -117,8 +117,8 @@ xdr_decode_time(u32 *p, struct timespec *timep)
117 return p; 117 return p;
118} 118}
119 119
120static u32 * 120static __be32 *
121xdr_decode_fattr(u32 *p, struct nfs_fattr *fattr) 121xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr)
122{ 122{
123 u32 rdev; 123 u32 rdev;
124 fattr->type = (enum nfs_ftype) ntohl(*p++); 124 fattr->type = (enum nfs_ftype) ntohl(*p++);
@@ -146,10 +146,10 @@ xdr_decode_fattr(u32 *p, struct nfs_fattr *fattr)
146 return p; 146 return p;
147} 147}
148 148
149static inline u32 * 149static inline __be32 *
150xdr_encode_sattr(u32 *p, struct iattr *attr) 150xdr_encode_sattr(__be32 *p, struct iattr *attr)
151{ 151{
152 const u32 not_set = __constant_htonl(0xFFFFFFFF); 152 const __be32 not_set = __constant_htonl(0xFFFFFFFF);
153 153
154 *p++ = (attr->ia_valid & ATTR_MODE) ? htonl(attr->ia_mode) : not_set; 154 *p++ = (attr->ia_valid & ATTR_MODE) ? htonl(attr->ia_mode) : not_set;
155 *p++ = (attr->ia_valid & ATTR_UID) ? htonl(attr->ia_uid) : not_set; 155 *p++ = (attr->ia_valid & ATTR_UID) ? htonl(attr->ia_uid) : not_set;
@@ -184,7 +184,7 @@ xdr_encode_sattr(u32 *p, struct iattr *attr)
184 * GETATTR, READLINK, STATFS 184 * GETATTR, READLINK, STATFS
185 */ 185 */
186static int 186static int
187nfs_xdr_fhandle(struct rpc_rqst *req, u32 *p, struct nfs_fh *fh) 187nfs_xdr_fhandle(struct rpc_rqst *req, __be32 *p, struct nfs_fh *fh)
188{ 188{
189 p = xdr_encode_fhandle(p, fh); 189 p = xdr_encode_fhandle(p, fh);
190 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); 190 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
@@ -195,7 +195,7 @@ nfs_xdr_fhandle(struct rpc_rqst *req, u32 *p, struct nfs_fh *fh)
195 * Encode SETATTR arguments 195 * Encode SETATTR arguments
196 */ 196 */
197static int 197static int
198nfs_xdr_sattrargs(struct rpc_rqst *req, u32 *p, struct nfs_sattrargs *args) 198nfs_xdr_sattrargs(struct rpc_rqst *req, __be32 *p, struct nfs_sattrargs *args)
199{ 199{
200 p = xdr_encode_fhandle(p, args->fh); 200 p = xdr_encode_fhandle(p, args->fh);
201 p = xdr_encode_sattr(p, args->sattr); 201 p = xdr_encode_sattr(p, args->sattr);
@@ -208,7 +208,7 @@ nfs_xdr_sattrargs(struct rpc_rqst *req, u32 *p, struct nfs_sattrargs *args)
208 * LOOKUP, REMOVE, RMDIR 208 * LOOKUP, REMOVE, RMDIR
209 */ 209 */
210static int 210static int
211nfs_xdr_diropargs(struct rpc_rqst *req, u32 *p, struct nfs_diropargs *args) 211nfs_xdr_diropargs(struct rpc_rqst *req, __be32 *p, struct nfs_diropargs *args)
212{ 212{
213 p = xdr_encode_fhandle(p, args->fh); 213 p = xdr_encode_fhandle(p, args->fh);
214 p = xdr_encode_array(p, args->name, args->len); 214 p = xdr_encode_array(p, args->name, args->len);
@@ -222,7 +222,7 @@ nfs_xdr_diropargs(struct rpc_rqst *req, u32 *p, struct nfs_diropargs *args)
222 * exactly to the page we want to fetch. 222 * exactly to the page we want to fetch.
223 */ 223 */
224static int 224static int
225nfs_xdr_readargs(struct rpc_rqst *req, u32 *p, struct nfs_readargs *args) 225nfs_xdr_readargs(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args)
226{ 226{
227 struct rpc_auth *auth = req->rq_task->tk_auth; 227 struct rpc_auth *auth = req->rq_task->tk_auth;
228 unsigned int replen; 228 unsigned int replen;
@@ -246,7 +246,7 @@ nfs_xdr_readargs(struct rpc_rqst *req, u32 *p, struct nfs_readargs *args)
246 * Decode READ reply 246 * Decode READ reply
247 */ 247 */
248static int 248static int
249nfs_xdr_readres(struct rpc_rqst *req, u32 *p, struct nfs_readres *res) 249nfs_xdr_readres(struct rpc_rqst *req, __be32 *p, struct nfs_readres *res)
250{ 250{
251 struct kvec *iov = req->rq_rcv_buf.head; 251 struct kvec *iov = req->rq_rcv_buf.head;
252 int status, count, recvd, hdrlen; 252 int status, count, recvd, hdrlen;
@@ -286,7 +286,7 @@ nfs_xdr_readres(struct rpc_rqst *req, u32 *p, struct nfs_readres *res)
286 * Write arguments. Splice the buffer to be written into the iovec. 286 * Write arguments. Splice the buffer to be written into the iovec.
287 */ 287 */
288static int 288static int
289nfs_xdr_writeargs(struct rpc_rqst *req, u32 *p, struct nfs_writeargs *args) 289nfs_xdr_writeargs(struct rpc_rqst *req, __be32 *p, struct nfs_writeargs *args)
290{ 290{
291 struct xdr_buf *sndbuf = &req->rq_snd_buf; 291 struct xdr_buf *sndbuf = &req->rq_snd_buf;
292 u32 offset = (u32)args->offset; 292 u32 offset = (u32)args->offset;
@@ -309,7 +309,7 @@ nfs_xdr_writeargs(struct rpc_rqst *req, u32 *p, struct nfs_writeargs *args)
309 * CREATE, MKDIR 309 * CREATE, MKDIR
310 */ 310 */
311static int 311static int
312nfs_xdr_createargs(struct rpc_rqst *req, u32 *p, struct nfs_createargs *args) 312nfs_xdr_createargs(struct rpc_rqst *req, __be32 *p, struct nfs_createargs *args)
313{ 313{
314 p = xdr_encode_fhandle(p, args->fh); 314 p = xdr_encode_fhandle(p, args->fh);
315 p = xdr_encode_array(p, args->name, args->len); 315 p = xdr_encode_array(p, args->name, args->len);
@@ -322,7 +322,7 @@ nfs_xdr_createargs(struct rpc_rqst *req, u32 *p, struct nfs_createargs *args)
322 * Encode RENAME arguments 322 * Encode RENAME arguments
323 */ 323 */
324static int 324static int
325nfs_xdr_renameargs(struct rpc_rqst *req, u32 *p, struct nfs_renameargs *args) 325nfs_xdr_renameargs(struct rpc_rqst *req, __be32 *p, struct nfs_renameargs *args)
326{ 326{
327 p = xdr_encode_fhandle(p, args->fromfh); 327 p = xdr_encode_fhandle(p, args->fromfh);
328 p = xdr_encode_array(p, args->fromname, args->fromlen); 328 p = xdr_encode_array(p, args->fromname, args->fromlen);
@@ -336,7 +336,7 @@ nfs_xdr_renameargs(struct rpc_rqst *req, u32 *p, struct nfs_renameargs *args)
336 * Encode LINK arguments 336 * Encode LINK arguments
337 */ 337 */
338static int 338static int
339nfs_xdr_linkargs(struct rpc_rqst *req, u32 *p, struct nfs_linkargs *args) 339nfs_xdr_linkargs(struct rpc_rqst *req, __be32 *p, struct nfs_linkargs *args)
340{ 340{
341 p = xdr_encode_fhandle(p, args->fromfh); 341 p = xdr_encode_fhandle(p, args->fromfh);
342 p = xdr_encode_fhandle(p, args->tofh); 342 p = xdr_encode_fhandle(p, args->tofh);
@@ -349,7 +349,7 @@ nfs_xdr_linkargs(struct rpc_rqst *req, u32 *p, struct nfs_linkargs *args)
349 * Encode SYMLINK arguments 349 * Encode SYMLINK arguments
350 */ 350 */
351static int 351static int
352nfs_xdr_symlinkargs(struct rpc_rqst *req, u32 *p, struct nfs_symlinkargs *args) 352nfs_xdr_symlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs_symlinkargs *args)
353{ 353{
354 struct xdr_buf *sndbuf = &req->rq_snd_buf; 354 struct xdr_buf *sndbuf = &req->rq_snd_buf;
355 size_t pad; 355 size_t pad;
@@ -378,7 +378,7 @@ nfs_xdr_symlinkargs(struct rpc_rqst *req, u32 *p, struct nfs_symlinkargs *args)
378 * Encode arguments to readdir call 378 * Encode arguments to readdir call
379 */ 379 */
380static int 380static int
381nfs_xdr_readdirargs(struct rpc_rqst *req, u32 *p, struct nfs_readdirargs *args) 381nfs_xdr_readdirargs(struct rpc_rqst *req, __be32 *p, struct nfs_readdirargs *args)
382{ 382{
383 struct rpc_task *task = req->rq_task; 383 struct rpc_task *task = req->rq_task;
384 struct rpc_auth *auth = task->tk_auth; 384 struct rpc_auth *auth = task->tk_auth;
@@ -404,7 +404,7 @@ nfs_xdr_readdirargs(struct rpc_rqst *req, u32 *p, struct nfs_readdirargs *args)
404 * from nfs_readdir for each entry. 404 * from nfs_readdir for each entry.
405 */ 405 */
406static int 406static int
407nfs_xdr_readdirres(struct rpc_rqst *req, u32 *p, void *dummy) 407nfs_xdr_readdirres(struct rpc_rqst *req, __be32 *p, void *dummy)
408{ 408{
409 struct xdr_buf *rcvbuf = &req->rq_rcv_buf; 409 struct xdr_buf *rcvbuf = &req->rq_rcv_buf;
410 struct kvec *iov = rcvbuf->head; 410 struct kvec *iov = rcvbuf->head;
@@ -412,7 +412,7 @@ nfs_xdr_readdirres(struct rpc_rqst *req, u32 *p, void *dummy)
412 int hdrlen, recvd; 412 int hdrlen, recvd;
413 int status, nr; 413 int status, nr;
414 unsigned int len, pglen; 414 unsigned int len, pglen;
415 u32 *end, *entry, *kaddr; 415 __be32 *end, *entry, *kaddr;
416 416
417 if ((status = ntohl(*p++))) 417 if ((status = ntohl(*p++)))
418 return -nfs_stat_to_errno(status); 418 return -nfs_stat_to_errno(status);
@@ -432,8 +432,8 @@ nfs_xdr_readdirres(struct rpc_rqst *req, u32 *p, void *dummy)
432 if (pglen > recvd) 432 if (pglen > recvd)
433 pglen = recvd; 433 pglen = recvd;
434 page = rcvbuf->pages; 434 page = rcvbuf->pages;
435 kaddr = p = (u32 *)kmap_atomic(*page, KM_USER0); 435 kaddr = p = kmap_atomic(*page, KM_USER0);
436 end = (u32 *)((char *)p + pglen); 436 end = (__be32 *)((char *)p + pglen);
437 entry = p; 437 entry = p;
438 for (nr = 0; *p++; nr++) { 438 for (nr = 0; *p++; nr++) {
439 if (p + 2 > end) 439 if (p + 2 > end)
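A small point in the readdir result decoders here and again in nfs3xdr.c: kmap_atomic() returns void *, so once p is declared __be32 * the old (u32 *) cast on the assignment simply disappears, and only end keeps a cast because of the byte-granular pointer arithmetic used to compute it.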
@@ -468,8 +468,8 @@ err_unmap:
468 goto out; 468 goto out;
469} 469}
470 470
471u32 * 471__be32 *
472nfs_decode_dirent(u32 *p, struct nfs_entry *entry, int plus) 472nfs_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus)
473{ 473{
474 if (!*p++) { 474 if (!*p++) {
475 if (!*p) 475 if (!*p)
@@ -496,7 +496,7 @@ nfs_decode_dirent(u32 *p, struct nfs_entry *entry, int plus)
496 * Decode simple status reply 496 * Decode simple status reply
497 */ 497 */
498static int 498static int
499nfs_xdr_stat(struct rpc_rqst *req, u32 *p, void *dummy) 499nfs_xdr_stat(struct rpc_rqst *req, __be32 *p, void *dummy)
500{ 500{
501 int status; 501 int status;
502 502
@@ -510,7 +510,7 @@ nfs_xdr_stat(struct rpc_rqst *req, u32 *p, void *dummy)
510 * GETATTR, SETATTR, WRITE 510 * GETATTR, SETATTR, WRITE
511 */ 511 */
512static int 512static int
513nfs_xdr_attrstat(struct rpc_rqst *req, u32 *p, struct nfs_fattr *fattr) 513nfs_xdr_attrstat(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr)
514{ 514{
515 int status; 515 int status;
516 516
@@ -525,7 +525,7 @@ nfs_xdr_attrstat(struct rpc_rqst *req, u32 *p, struct nfs_fattr *fattr)
525 * LOOKUP, CREATE, MKDIR 525 * LOOKUP, CREATE, MKDIR
526 */ 526 */
527static int 527static int
528nfs_xdr_diropres(struct rpc_rqst *req, u32 *p, struct nfs_diropok *res) 528nfs_xdr_diropres(struct rpc_rqst *req, __be32 *p, struct nfs_diropok *res)
529{ 529{
530 int status; 530 int status;
531 531
@@ -540,7 +540,7 @@ nfs_xdr_diropres(struct rpc_rqst *req, u32 *p, struct nfs_diropok *res)
540 * Encode READLINK args 540 * Encode READLINK args
541 */ 541 */
542static int 542static int
543nfs_xdr_readlinkargs(struct rpc_rqst *req, u32 *p, struct nfs_readlinkargs *args) 543nfs_xdr_readlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs_readlinkargs *args)
544{ 544{
545 struct rpc_auth *auth = req->rq_task->tk_auth; 545 struct rpc_auth *auth = req->rq_task->tk_auth;
546 unsigned int replen; 546 unsigned int replen;
@@ -558,7 +558,7 @@ nfs_xdr_readlinkargs(struct rpc_rqst *req, u32 *p, struct nfs_readlinkargs *args
558 * Decode READLINK reply 558 * Decode READLINK reply
559 */ 559 */
560static int 560static int
561nfs_xdr_readlinkres(struct rpc_rqst *req, u32 *p, void *dummy) 561nfs_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, void *dummy)
562{ 562{
563 struct xdr_buf *rcvbuf = &req->rq_rcv_buf; 563 struct xdr_buf *rcvbuf = &req->rq_rcv_buf;
564 struct kvec *iov = rcvbuf->head; 564 struct kvec *iov = rcvbuf->head;
@@ -601,7 +601,7 @@ nfs_xdr_readlinkres(struct rpc_rqst *req, u32 *p, void *dummy)
601 * Decode WRITE reply 601 * Decode WRITE reply
602 */ 602 */
603static int 603static int
604nfs_xdr_writeres(struct rpc_rqst *req, u32 *p, struct nfs_writeres *res) 604nfs_xdr_writeres(struct rpc_rqst *req, __be32 *p, struct nfs_writeres *res)
605{ 605{
606 res->verf->committed = NFS_FILE_SYNC; 606 res->verf->committed = NFS_FILE_SYNC;
607 return nfs_xdr_attrstat(req, p, res->fattr); 607 return nfs_xdr_attrstat(req, p, res->fattr);
@@ -611,7 +611,7 @@ nfs_xdr_writeres(struct rpc_rqst *req, u32 *p, struct nfs_writeres *res)
611 * Decode STATFS reply 611 * Decode STATFS reply
612 */ 612 */
613static int 613static int
614nfs_xdr_statfsres(struct rpc_rqst *req, u32 *p, struct nfs2_fsstat *res) 614nfs_xdr_statfsres(struct rpc_rqst *req, __be32 *p, struct nfs2_fsstat *res)
615{ 615{
616 int status; 616 int status;
617 617
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 3b234d4601e7..e5f128ffc32d 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -668,7 +668,7 @@ nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
668{ 668{
669 struct inode *dir = dentry->d_inode; 669 struct inode *dir = dentry->d_inode;
670 struct nfs_fattr dir_attr; 670 struct nfs_fattr dir_attr;
671 u32 *verf = NFS_COOKIEVERF(dir); 671 __be32 *verf = NFS_COOKIEVERF(dir);
672 struct nfs3_readdirargs arg = { 672 struct nfs3_readdirargs arg = {
673 .fh = NFS_FH(dir), 673 .fh = NFS_FH(dir),
674 .cookie = cookie, 674 .cookie = cookie,
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 16556fa4effb..0ace092d126f 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -105,14 +105,14 @@ static struct {
105/* 105/*
106 * Common NFS XDR functions as inlines 106 * Common NFS XDR functions as inlines
107 */ 107 */
108static inline u32 * 108static inline __be32 *
109xdr_encode_fhandle(u32 *p, struct nfs_fh *fh) 109xdr_encode_fhandle(__be32 *p, struct nfs_fh *fh)
110{ 110{
111 return xdr_encode_array(p, fh->data, fh->size); 111 return xdr_encode_array(p, fh->data, fh->size);
112} 112}
113 113
114static inline u32 * 114static inline __be32 *
115xdr_decode_fhandle(u32 *p, struct nfs_fh *fh) 115xdr_decode_fhandle(__be32 *p, struct nfs_fh *fh)
116{ 116{
117 if ((fh->size = ntohl(*p++)) <= NFS3_FHSIZE) { 117 if ((fh->size = ntohl(*p++)) <= NFS3_FHSIZE) {
118 memcpy(fh->data, p, fh->size); 118 memcpy(fh->data, p, fh->size);
@@ -124,24 +124,24 @@ xdr_decode_fhandle(u32 *p, struct nfs_fh *fh)
124/* 124/*
125 * Encode/decode time. 125 * Encode/decode time.
126 */ 126 */
127static inline u32 * 127static inline __be32 *
128xdr_encode_time3(u32 *p, struct timespec *timep) 128xdr_encode_time3(__be32 *p, struct timespec *timep)
129{ 129{
130 *p++ = htonl(timep->tv_sec); 130 *p++ = htonl(timep->tv_sec);
131 *p++ = htonl(timep->tv_nsec); 131 *p++ = htonl(timep->tv_nsec);
132 return p; 132 return p;
133} 133}
134 134
135static inline u32 * 135static inline __be32 *
136xdr_decode_time3(u32 *p, struct timespec *timep) 136xdr_decode_time3(__be32 *p, struct timespec *timep)
137{ 137{
138 timep->tv_sec = ntohl(*p++); 138 timep->tv_sec = ntohl(*p++);
139 timep->tv_nsec = ntohl(*p++); 139 timep->tv_nsec = ntohl(*p++);
140 return p; 140 return p;
141} 141}
142 142
143static u32 * 143static __be32 *
144xdr_decode_fattr(u32 *p, struct nfs_fattr *fattr) 144xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr)
145{ 145{
146 unsigned int type, major, minor; 146 unsigned int type, major, minor;
147 int fmode; 147 int fmode;
@@ -177,8 +177,8 @@ xdr_decode_fattr(u32 *p, struct nfs_fattr *fattr)
177 return p; 177 return p;
178} 178}
179 179
180static inline u32 * 180static inline __be32 *
181xdr_encode_sattr(u32 *p, struct iattr *attr) 181xdr_encode_sattr(__be32 *p, struct iattr *attr)
182{ 182{
183 if (attr->ia_valid & ATTR_MODE) { 183 if (attr->ia_valid & ATTR_MODE) {
184 *p++ = xdr_one; 184 *p++ = xdr_one;
@@ -223,8 +223,8 @@ xdr_encode_sattr(u32 *p, struct iattr *attr)
223 return p; 223 return p;
224} 224}
225 225
226static inline u32 * 226static inline __be32 *
227xdr_decode_wcc_attr(u32 *p, struct nfs_fattr *fattr) 227xdr_decode_wcc_attr(__be32 *p, struct nfs_fattr *fattr)
228{ 228{
229 p = xdr_decode_hyper(p, &fattr->pre_size); 229 p = xdr_decode_hyper(p, &fattr->pre_size);
230 p = xdr_decode_time3(p, &fattr->pre_mtime); 230 p = xdr_decode_time3(p, &fattr->pre_mtime);
@@ -233,16 +233,16 @@ xdr_decode_wcc_attr(u32 *p, struct nfs_fattr *fattr)
233 return p; 233 return p;
234} 234}
235 235
236static inline u32 * 236static inline __be32 *
237xdr_decode_post_op_attr(u32 *p, struct nfs_fattr *fattr) 237xdr_decode_post_op_attr(__be32 *p, struct nfs_fattr *fattr)
238{ 238{
239 if (*p++) 239 if (*p++)
240 p = xdr_decode_fattr(p, fattr); 240 p = xdr_decode_fattr(p, fattr);
241 return p; 241 return p;
242} 242}
243 243
244static inline u32 * 244static inline __be32 *
245xdr_decode_pre_op_attr(u32 *p, struct nfs_fattr *fattr) 245xdr_decode_pre_op_attr(__be32 *p, struct nfs_fattr *fattr)
246{ 246{
247 if (*p++) 247 if (*p++)
248 return xdr_decode_wcc_attr(p, fattr); 248 return xdr_decode_wcc_attr(p, fattr);
@@ -250,8 +250,8 @@ xdr_decode_pre_op_attr(u32 *p, struct nfs_fattr *fattr)
250} 250}
251 251
252 252
253static inline u32 * 253static inline __be32 *
254xdr_decode_wcc_data(u32 *p, struct nfs_fattr *fattr) 254xdr_decode_wcc_data(__be32 *p, struct nfs_fattr *fattr)
255{ 255{
256 p = xdr_decode_pre_op_attr(p, fattr); 256 p = xdr_decode_pre_op_attr(p, fattr);
257 return xdr_decode_post_op_attr(p, fattr); 257 return xdr_decode_post_op_attr(p, fattr);
@@ -265,7 +265,7 @@ xdr_decode_wcc_data(u32 *p, struct nfs_fattr *fattr)
265 * Encode file handle argument 265 * Encode file handle argument
266 */ 266 */
267static int 267static int
268nfs3_xdr_fhandle(struct rpc_rqst *req, u32 *p, struct nfs_fh *fh) 268nfs3_xdr_fhandle(struct rpc_rqst *req, __be32 *p, struct nfs_fh *fh)
269{ 269{
270 p = xdr_encode_fhandle(p, fh); 270 p = xdr_encode_fhandle(p, fh);
271 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); 271 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
@@ -276,7 +276,7 @@ nfs3_xdr_fhandle(struct rpc_rqst *req, u32 *p, struct nfs_fh *fh)
276 * Encode SETATTR arguments 276 * Encode SETATTR arguments
277 */ 277 */
278static int 278static int
279nfs3_xdr_sattrargs(struct rpc_rqst *req, u32 *p, struct nfs3_sattrargs *args) 279nfs3_xdr_sattrargs(struct rpc_rqst *req, __be32 *p, struct nfs3_sattrargs *args)
280{ 280{
281 p = xdr_encode_fhandle(p, args->fh); 281 p = xdr_encode_fhandle(p, args->fh);
282 p = xdr_encode_sattr(p, args->sattr); 282 p = xdr_encode_sattr(p, args->sattr);
@@ -291,7 +291,7 @@ nfs3_xdr_sattrargs(struct rpc_rqst *req, u32 *p, struct nfs3_sattrargs *args)
291 * Encode directory ops argument 291 * Encode directory ops argument
292 */ 292 */
293static int 293static int
294nfs3_xdr_diropargs(struct rpc_rqst *req, u32 *p, struct nfs3_diropargs *args) 294nfs3_xdr_diropargs(struct rpc_rqst *req, __be32 *p, struct nfs3_diropargs *args)
295{ 295{
296 p = xdr_encode_fhandle(p, args->fh); 296 p = xdr_encode_fhandle(p, args->fh);
297 p = xdr_encode_array(p, args->name, args->len); 297 p = xdr_encode_array(p, args->name, args->len);
@@ -303,7 +303,7 @@ nfs3_xdr_diropargs(struct rpc_rqst *req, u32 *p, struct nfs3_diropargs *args)
303 * Encode access() argument 303 * Encode access() argument
304 */ 304 */
305static int 305static int
306nfs3_xdr_accessargs(struct rpc_rqst *req, u32 *p, struct nfs3_accessargs *args) 306nfs3_xdr_accessargs(struct rpc_rqst *req, __be32 *p, struct nfs3_accessargs *args)
307{ 307{
308 p = xdr_encode_fhandle(p, args->fh); 308 p = xdr_encode_fhandle(p, args->fh);
309 *p++ = htonl(args->access); 309 *p++ = htonl(args->access);
@@ -317,7 +317,7 @@ nfs3_xdr_accessargs(struct rpc_rqst *req, u32 *p, struct nfs3_accessargs *args)
317 * exactly to the page we want to fetch. 317 * exactly to the page we want to fetch.
318 */ 318 */
319static int 319static int
320nfs3_xdr_readargs(struct rpc_rqst *req, u32 *p, struct nfs_readargs *args) 320nfs3_xdr_readargs(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args)
321{ 321{
322 struct rpc_auth *auth = req->rq_task->tk_auth; 322 struct rpc_auth *auth = req->rq_task->tk_auth;
323 unsigned int replen; 323 unsigned int replen;
@@ -339,7 +339,7 @@ nfs3_xdr_readargs(struct rpc_rqst *req, u32 *p, struct nfs_readargs *args)
339 * Write arguments. Splice the buffer to be written into the iovec. 339 * Write arguments. Splice the buffer to be written into the iovec.
340 */ 340 */
341static int 341static int
342nfs3_xdr_writeargs(struct rpc_rqst *req, u32 *p, struct nfs_writeargs *args) 342nfs3_xdr_writeargs(struct rpc_rqst *req, __be32 *p, struct nfs_writeargs *args)
343{ 343{
344 struct xdr_buf *sndbuf = &req->rq_snd_buf; 344 struct xdr_buf *sndbuf = &req->rq_snd_buf;
345 u32 count = args->count; 345 u32 count = args->count;
@@ -360,7 +360,7 @@ nfs3_xdr_writeargs(struct rpc_rqst *req, u32 *p, struct nfs_writeargs *args)
360 * Encode CREATE arguments 360 * Encode CREATE arguments
361 */ 361 */
362static int 362static int
363nfs3_xdr_createargs(struct rpc_rqst *req, u32 *p, struct nfs3_createargs *args) 363nfs3_xdr_createargs(struct rpc_rqst *req, __be32 *p, struct nfs3_createargs *args)
364{ 364{
365 p = xdr_encode_fhandle(p, args->fh); 365 p = xdr_encode_fhandle(p, args->fh);
366 p = xdr_encode_array(p, args->name, args->len); 366 p = xdr_encode_array(p, args->name, args->len);
@@ -380,7 +380,7 @@ nfs3_xdr_createargs(struct rpc_rqst *req, u32 *p, struct nfs3_createargs *args)
380 * Encode MKDIR arguments 380 * Encode MKDIR arguments
381 */ 381 */
382static int 382static int
383nfs3_xdr_mkdirargs(struct rpc_rqst *req, u32 *p, struct nfs3_mkdirargs *args) 383nfs3_xdr_mkdirargs(struct rpc_rqst *req, __be32 *p, struct nfs3_mkdirargs *args)
384{ 384{
385 p = xdr_encode_fhandle(p, args->fh); 385 p = xdr_encode_fhandle(p, args->fh);
386 p = xdr_encode_array(p, args->name, args->len); 386 p = xdr_encode_array(p, args->name, args->len);
@@ -393,7 +393,7 @@ nfs3_xdr_mkdirargs(struct rpc_rqst *req, u32 *p, struct nfs3_mkdirargs *args)
393 * Encode SYMLINK arguments 393 * Encode SYMLINK arguments
394 */ 394 */
395static int 395static int
396nfs3_xdr_symlinkargs(struct rpc_rqst *req, u32 *p, struct nfs3_symlinkargs *args) 396nfs3_xdr_symlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs3_symlinkargs *args)
397{ 397{
398 p = xdr_encode_fhandle(p, args->fromfh); 398 p = xdr_encode_fhandle(p, args->fromfh);
399 p = xdr_encode_array(p, args->fromname, args->fromlen); 399 p = xdr_encode_array(p, args->fromname, args->fromlen);
@@ -410,7 +410,7 @@ nfs3_xdr_symlinkargs(struct rpc_rqst *req, u32 *p, struct nfs3_symlinkargs *args
410 * Encode MKNOD arguments 410 * Encode MKNOD arguments
411 */ 411 */
412static int 412static int
413nfs3_xdr_mknodargs(struct rpc_rqst *req, u32 *p, struct nfs3_mknodargs *args) 413nfs3_xdr_mknodargs(struct rpc_rqst *req, __be32 *p, struct nfs3_mknodargs *args)
414{ 414{
415 p = xdr_encode_fhandle(p, args->fh); 415 p = xdr_encode_fhandle(p, args->fh);
416 p = xdr_encode_array(p, args->name, args->len); 416 p = xdr_encode_array(p, args->name, args->len);
@@ -429,7 +429,7 @@ nfs3_xdr_mknodargs(struct rpc_rqst *req, u32 *p, struct nfs3_mknodargs *args)
429 * Encode RENAME arguments 429 * Encode RENAME arguments
430 */ 430 */
431static int 431static int
432nfs3_xdr_renameargs(struct rpc_rqst *req, u32 *p, struct nfs3_renameargs *args) 432nfs3_xdr_renameargs(struct rpc_rqst *req, __be32 *p, struct nfs3_renameargs *args)
433{ 433{
434 p = xdr_encode_fhandle(p, args->fromfh); 434 p = xdr_encode_fhandle(p, args->fromfh);
435 p = xdr_encode_array(p, args->fromname, args->fromlen); 435 p = xdr_encode_array(p, args->fromname, args->fromlen);
@@ -443,7 +443,7 @@ nfs3_xdr_renameargs(struct rpc_rqst *req, u32 *p, struct nfs3_renameargs *args)
443 * Encode LINK arguments 443 * Encode LINK arguments
444 */ 444 */
445static int 445static int
446nfs3_xdr_linkargs(struct rpc_rqst *req, u32 *p, struct nfs3_linkargs *args) 446nfs3_xdr_linkargs(struct rpc_rqst *req, __be32 *p, struct nfs3_linkargs *args)
447{ 447{
448 p = xdr_encode_fhandle(p, args->fromfh); 448 p = xdr_encode_fhandle(p, args->fromfh);
449 p = xdr_encode_fhandle(p, args->tofh); 449 p = xdr_encode_fhandle(p, args->tofh);
@@ -456,7 +456,7 @@ nfs3_xdr_linkargs(struct rpc_rqst *req, u32 *p, struct nfs3_linkargs *args)
456 * Encode arguments to readdir call 456 * Encode arguments to readdir call
457 */ 457 */
458static int 458static int
459nfs3_xdr_readdirargs(struct rpc_rqst *req, u32 *p, struct nfs3_readdirargs *args) 459nfs3_xdr_readdirargs(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirargs *args)
460{ 460{
461 struct rpc_auth *auth = req->rq_task->tk_auth; 461 struct rpc_auth *auth = req->rq_task->tk_auth;
462 unsigned int replen; 462 unsigned int replen;
@@ -485,7 +485,7 @@ nfs3_xdr_readdirargs(struct rpc_rqst *req, u32 *p, struct nfs3_readdirargs *args
485 * We just check for syntactical correctness. 485 * We just check for syntactical correctness.
486 */ 486 */
487static int 487static int
488nfs3_xdr_readdirres(struct rpc_rqst *req, u32 *p, struct nfs3_readdirres *res) 488nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res)
489{ 489{
490 struct xdr_buf *rcvbuf = &req->rq_rcv_buf; 490 struct xdr_buf *rcvbuf = &req->rq_rcv_buf;
491 struct kvec *iov = rcvbuf->head; 491 struct kvec *iov = rcvbuf->head;
@@ -493,7 +493,7 @@ nfs3_xdr_readdirres(struct rpc_rqst *req, u32 *p, struct nfs3_readdirres *res)
493 int hdrlen, recvd; 493 int hdrlen, recvd;
494 int status, nr; 494 int status, nr;
495 unsigned int len, pglen; 495 unsigned int len, pglen;
496 u32 *entry, *end, *kaddr; 496 __be32 *entry, *end, *kaddr;
497 497
498 status = ntohl(*p++); 498 status = ntohl(*p++);
499 /* Decode post_op_attrs */ 499 /* Decode post_op_attrs */
@@ -523,8 +523,8 @@ nfs3_xdr_readdirres(struct rpc_rqst *req, u32 *p, struct nfs3_readdirres *res)
523 if (pglen > recvd) 523 if (pglen > recvd)
524 pglen = recvd; 524 pglen = recvd;
525 page = rcvbuf->pages; 525 page = rcvbuf->pages;
526 kaddr = p = (u32 *)kmap_atomic(*page, KM_USER0); 526 kaddr = p = kmap_atomic(*page, KM_USER0);
527 end = (u32 *)((char *)p + pglen); 527 end = (__be32 *)((char *)p + pglen);
528 entry = p; 528 entry = p;
529 for (nr = 0; *p++; nr++) { 529 for (nr = 0; *p++; nr++) {
530 if (p + 3 > end) 530 if (p + 3 > end)
@@ -583,8 +583,8 @@ err_unmap:
583 goto out; 583 goto out;
584} 584}
585 585
586u32 * 586__be32 *
587nfs3_decode_dirent(u32 *p, struct nfs_entry *entry, int plus) 587nfs3_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus)
588{ 588{
589 struct nfs_entry old = *entry; 589 struct nfs_entry old = *entry;
590 590
@@ -626,7 +626,7 @@ nfs3_decode_dirent(u32 *p, struct nfs_entry *entry, int plus)
626 * Encode COMMIT arguments 626 * Encode COMMIT arguments
627 */ 627 */
628static int 628static int
629nfs3_xdr_commitargs(struct rpc_rqst *req, u32 *p, struct nfs_writeargs *args) 629nfs3_xdr_commitargs(struct rpc_rqst *req, __be32 *p, struct nfs_writeargs *args)
630{ 630{
631 p = xdr_encode_fhandle(p, args->fh); 631 p = xdr_encode_fhandle(p, args->fh);
632 p = xdr_encode_hyper(p, args->offset); 632 p = xdr_encode_hyper(p, args->offset);
@@ -640,7 +640,7 @@ nfs3_xdr_commitargs(struct rpc_rqst *req, u32 *p, struct nfs_writeargs *args)
640 * Encode GETACL arguments 640 * Encode GETACL arguments
641 */ 641 */
642static int 642static int
643nfs3_xdr_getaclargs(struct rpc_rqst *req, u32 *p, 643nfs3_xdr_getaclargs(struct rpc_rqst *req, __be32 *p,
644 struct nfs3_getaclargs *args) 644 struct nfs3_getaclargs *args)
645{ 645{
646 struct rpc_auth *auth = req->rq_task->tk_auth; 646 struct rpc_auth *auth = req->rq_task->tk_auth;
@@ -664,7 +664,7 @@ nfs3_xdr_getaclargs(struct rpc_rqst *req, u32 *p,
664 * Encode SETACL arguments 664 * Encode SETACL arguments
665 */ 665 */
666static int 666static int
667nfs3_xdr_setaclargs(struct rpc_rqst *req, u32 *p, 667nfs3_xdr_setaclargs(struct rpc_rqst *req, __be32 *p,
668 struct nfs3_setaclargs *args) 668 struct nfs3_setaclargs *args)
669{ 669{
670 struct xdr_buf *buf = &req->rq_snd_buf; 670 struct xdr_buf *buf = &req->rq_snd_buf;
@@ -711,7 +711,7 @@ nfs3_xdr_setaclargs(struct rpc_rqst *req, u32 *p,
711 * Decode attrstat reply. 711 * Decode attrstat reply.
712 */ 712 */
713static int 713static int
714nfs3_xdr_attrstat(struct rpc_rqst *req, u32 *p, struct nfs_fattr *fattr) 714nfs3_xdr_attrstat(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr)
715{ 715{
716 int status; 716 int status;
717 717
@@ -726,7 +726,7 @@ nfs3_xdr_attrstat(struct rpc_rqst *req, u32 *p, struct nfs_fattr *fattr)
726 * SATTR, REMOVE, RMDIR 726 * SATTR, REMOVE, RMDIR
727 */ 727 */
728static int 728static int
729nfs3_xdr_wccstat(struct rpc_rqst *req, u32 *p, struct nfs_fattr *fattr) 729nfs3_xdr_wccstat(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr)
730{ 730{
731 int status; 731 int status;
732 732
@@ -740,7 +740,7 @@ nfs3_xdr_wccstat(struct rpc_rqst *req, u32 *p, struct nfs_fattr *fattr)
740 * Decode LOOKUP reply 740 * Decode LOOKUP reply
741 */ 741 */
742static int 742static int
743nfs3_xdr_lookupres(struct rpc_rqst *req, u32 *p, struct nfs3_diropres *res) 743nfs3_xdr_lookupres(struct rpc_rqst *req, __be32 *p, struct nfs3_diropres *res)
744{ 744{
745 int status; 745 int status;
746 746
@@ -759,7 +759,7 @@ nfs3_xdr_lookupres(struct rpc_rqst *req, u32 *p, struct nfs3_diropres *res)
759 * Decode ACCESS reply 759 * Decode ACCESS reply
760 */ 760 */
761static int 761static int
762nfs3_xdr_accessres(struct rpc_rqst *req, u32 *p, struct nfs3_accessres *res) 762nfs3_xdr_accessres(struct rpc_rqst *req, __be32 *p, struct nfs3_accessres *res)
763{ 763{
764 int status = ntohl(*p++); 764 int status = ntohl(*p++);
765 765
@@ -771,7 +771,7 @@ nfs3_xdr_accessres(struct rpc_rqst *req, u32 *p, struct nfs3_accessres *res)
771} 771}
772 772
773static int 773static int
774nfs3_xdr_readlinkargs(struct rpc_rqst *req, u32 *p, struct nfs3_readlinkargs *args) 774nfs3_xdr_readlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs3_readlinkargs *args)
775{ 775{
776 struct rpc_auth *auth = req->rq_task->tk_auth; 776 struct rpc_auth *auth = req->rq_task->tk_auth;
777 unsigned int replen; 777 unsigned int replen;
@@ -789,7 +789,7 @@ nfs3_xdr_readlinkargs(struct rpc_rqst *req, u32 *p, struct nfs3_readlinkargs *ar
789 * Decode READLINK reply 789 * Decode READLINK reply
790 */ 790 */
791static int 791static int
792nfs3_xdr_readlinkres(struct rpc_rqst *req, u32 *p, struct nfs_fattr *fattr) 792nfs3_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr)
793{ 793{
794 struct xdr_buf *rcvbuf = &req->rq_rcv_buf; 794 struct xdr_buf *rcvbuf = &req->rq_rcv_buf;
795 struct kvec *iov = rcvbuf->head; 795 struct kvec *iov = rcvbuf->head;
@@ -837,7 +837,7 @@ nfs3_xdr_readlinkres(struct rpc_rqst *req, u32 *p, struct nfs_fattr *fattr)
837 * Decode READ reply 837 * Decode READ reply
838 */ 838 */
839static int 839static int
840nfs3_xdr_readres(struct rpc_rqst *req, u32 *p, struct nfs_readres *res) 840nfs3_xdr_readres(struct rpc_rqst *req, __be32 *p, struct nfs_readres *res)
841{ 841{
842 struct kvec *iov = req->rq_rcv_buf.head; 842 struct kvec *iov = req->rq_rcv_buf.head;
843 int status, count, ocount, recvd, hdrlen; 843 int status, count, ocount, recvd, hdrlen;
@@ -888,7 +888,7 @@ nfs3_xdr_readres(struct rpc_rqst *req, u32 *p, struct nfs_readres *res)
888 * Decode WRITE response 888 * Decode WRITE response
889 */ 889 */
890static int 890static int
891nfs3_xdr_writeres(struct rpc_rqst *req, u32 *p, struct nfs_writeres *res) 891nfs3_xdr_writeres(struct rpc_rqst *req, __be32 *p, struct nfs_writeres *res)
892{ 892{
893 int status; 893 int status;
894 894
@@ -910,7 +910,7 @@ nfs3_xdr_writeres(struct rpc_rqst *req, u32 *p, struct nfs_writeres *res)
910 * Decode a CREATE response 910 * Decode a CREATE response
911 */ 911 */
912static int 912static int
913nfs3_xdr_createres(struct rpc_rqst *req, u32 *p, struct nfs3_diropres *res) 913nfs3_xdr_createres(struct rpc_rqst *req, __be32 *p, struct nfs3_diropres *res)
914{ 914{
915 int status; 915 int status;
916 916
@@ -937,7 +937,7 @@ nfs3_xdr_createres(struct rpc_rqst *req, u32 *p, struct nfs3_diropres *res)
937 * Decode RENAME reply 937 * Decode RENAME reply
938 */ 938 */
939static int 939static int
940nfs3_xdr_renameres(struct rpc_rqst *req, u32 *p, struct nfs3_renameres *res) 940nfs3_xdr_renameres(struct rpc_rqst *req, __be32 *p, struct nfs3_renameres *res)
941{ 941{
942 int status; 942 int status;
943 943
@@ -952,7 +952,7 @@ nfs3_xdr_renameres(struct rpc_rqst *req, u32 *p, struct nfs3_renameres *res)
952 * Decode LINK reply 952 * Decode LINK reply
953 */ 953 */
954static int 954static int
955nfs3_xdr_linkres(struct rpc_rqst *req, u32 *p, struct nfs3_linkres *res) 955nfs3_xdr_linkres(struct rpc_rqst *req, __be32 *p, struct nfs3_linkres *res)
956{ 956{
957 int status; 957 int status;
958 958
@@ -967,7 +967,7 @@ nfs3_xdr_linkres(struct rpc_rqst *req, u32 *p, struct nfs3_linkres *res)
967 * Decode FSSTAT reply 967 * Decode FSSTAT reply
968 */ 968 */
969static int 969static int
970nfs3_xdr_fsstatres(struct rpc_rqst *req, u32 *p, struct nfs_fsstat *res) 970nfs3_xdr_fsstatres(struct rpc_rqst *req, __be32 *p, struct nfs_fsstat *res)
971{ 971{
972 int status; 972 int status;
973 973
@@ -992,7 +992,7 @@ nfs3_xdr_fsstatres(struct rpc_rqst *req, u32 *p, struct nfs_fsstat *res)
992 * Decode FSINFO reply 992 * Decode FSINFO reply
993 */ 993 */
994static int 994static int
995nfs3_xdr_fsinfores(struct rpc_rqst *req, u32 *p, struct nfs_fsinfo *res) 995nfs3_xdr_fsinfores(struct rpc_rqst *req, __be32 *p, struct nfs_fsinfo *res)
996{ 996{
997 int status; 997 int status;
998 998
@@ -1020,7 +1020,7 @@ nfs3_xdr_fsinfores(struct rpc_rqst *req, u32 *p, struct nfs_fsinfo *res)
1020 * Decode PATHCONF reply 1020 * Decode PATHCONF reply
1021 */ 1021 */
1022static int 1022static int
1023nfs3_xdr_pathconfres(struct rpc_rqst *req, u32 *p, struct nfs_pathconf *res) 1023nfs3_xdr_pathconfres(struct rpc_rqst *req, __be32 *p, struct nfs_pathconf *res)
1024{ 1024{
1025 int status; 1025 int status;
1026 1026
@@ -1040,7 +1040,7 @@ nfs3_xdr_pathconfres(struct rpc_rqst *req, u32 *p, struct nfs_pathconf *res)
1040 * Decode COMMIT reply 1040 * Decode COMMIT reply
1041 */ 1041 */
1042static int 1042static int
1043nfs3_xdr_commitres(struct rpc_rqst *req, u32 *p, struct nfs_writeres *res) 1043nfs3_xdr_commitres(struct rpc_rqst *req, __be32 *p, struct nfs_writeres *res)
1044{ 1044{
1045 int status; 1045 int status;
1046 1046
@@ -1059,7 +1059,7 @@ nfs3_xdr_commitres(struct rpc_rqst *req, u32 *p, struct nfs_writeres *res)
1059 * Decode GETACL reply 1059 * Decode GETACL reply
1060 */ 1060 */
1061static int 1061static int
1062nfs3_xdr_getaclres(struct rpc_rqst *req, u32 *p, 1062nfs3_xdr_getaclres(struct rpc_rqst *req, __be32 *p,
1063 struct nfs3_getaclres *res) 1063 struct nfs3_getaclres *res)
1064{ 1064{
1065 struct xdr_buf *buf = &req->rq_rcv_buf; 1065 struct xdr_buf *buf = &req->rq_rcv_buf;
@@ -1091,7 +1091,7 @@ nfs3_xdr_getaclres(struct rpc_rqst *req, u32 *p,
1091 * Decode setacl reply. 1091 * Decode setacl reply.
1092 */ 1092 */
1093static int 1093static int
1094nfs3_xdr_setaclres(struct rpc_rqst *req, u32 *p, struct nfs_fattr *fattr) 1094nfs3_xdr_setaclres(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr)
1095{ 1095{
1096 int status = ntohl(*p++); 1096 int status = ntohl(*p++);
1097 1097
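
Everything in the fs/nfs/nfs3xdr.c hunks above is one mechanical sweep: each pointer that walks raw XDR words moves from u32 to __be32, the big-endian type that sparse can check. Under __CHECKER__, linux/types.h declares __be32 as a __bitwise restricted type, so mixing it with host-order integers without ntohl()/htonl() (or an explicit __force cast) draws a warning from sparse (make C=1), while plain gcc compiles the code exactly as before. A minimal sketch of what the annotation buys; the function names here are illustrative, not from the patch:

#include <linux/types.h>	/* u32, __be32 (__bitwise under __CHECKER__) */
#include <asm/byteorder.h>	/* ntohl(), htonl() */

static u32 good_count(__be32 *p)
{
	return ntohl(*p);	/* ntohl() takes __be32 and returns host order */
}

static u32 bad_count(__be32 *p)
{
	return *p;		/* sparse: restricted __be32 used as plain integer */
}

Since the two types are identical once __CHECKER__ is off, a tree-wide annotation patch like this one changes no generated code.
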
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 61095fe4b5ca..6f346677332d 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -212,7 +212,7 @@ extern void nfs_free_seqid(struct nfs_seqid *seqid);
212extern const nfs4_stateid zero_stateid; 212extern const nfs4_stateid zero_stateid;
213 213
214/* nfs4xdr.c */ 214/* nfs4xdr.c */
215extern uint32_t *nfs4_decode_dirent(uint32_t *p, struct nfs_entry *entry, int plus); 215extern __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus);
216extern struct rpc_procinfo nfs4_procedures[]; 216extern struct rpc_procinfo nfs4_procedures[];
217 217
218struct nfs4_mount_data; 218struct nfs4_mount_data;
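
The one-line prototype change above only builds because the definition in fs/nfs/nfs4xdr.c, and the function-pointer slot it is stored in, flip in the same commit; gcc rejects a definition whose parameter types disagree with a visible declaration. A sketch of the shared slot, assuming the struct nfs_rpc_ops layout of this era (other members elided):

struct nfs_rpc_ops {
	/* ... */
	__be32 *(*decode_dirent)(__be32 *, struct nfs_entry *, int plus);
	/* ... */
};

Because v2, v3 and v4 all populate the same slot, the header and every nfs*xdr.c file have to convert in lockstep, which is why this commit touches them together.
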
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index 24e47f3bbd17..b872779d7cd5 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -7,8 +7,6 @@
7 * NFSv4 namespace 7 * NFSv4 namespace
8 */ 8 */
9 9
10#include <linux/config.h>
11
12#include <linux/dcache.h> 10#include <linux/dcache.h>
13#include <linux/mount.h> 11#include <linux/mount.h>
14#include <linux/namei.h> 12#include <linux/namei.h>
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 47c7e6e3910d..8118036cc449 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -138,10 +138,10 @@ const u32 nfs4_fs_locations_bitmap[2] = {
138 | FATTR4_WORD1_MOUNTED_ON_FILEID 138 | FATTR4_WORD1_MOUNTED_ON_FILEID
139}; 139};
140 140
141static void nfs4_setup_readdir(u64 cookie, u32 *verifier, struct dentry *dentry, 141static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dentry,
142 struct nfs4_readdir_arg *readdir) 142 struct nfs4_readdir_arg *readdir)
143{ 143{
144 u32 *start, *p; 144 __be32 *start, *p;
145 145
146 BUG_ON(readdir->count < 80); 146 BUG_ON(readdir->count < 80);
147 if (cookie > 2) { 147 if (cookie > 2) {
@@ -162,7 +162,7 @@ static void nfs4_setup_readdir(u64 cookie, u32 *verifier, struct dentry *dentry,
162 * when talking to the server, we always send cookie 0 162 * when talking to the server, we always send cookie 0
163 * instead of 1 or 2. 163 * instead of 1 or 2.
164 */ 164 */
165 start = p = (u32 *)kmap_atomic(*readdir->pages, KM_USER0); 165 start = p = kmap_atomic(*readdir->pages, KM_USER0);
166 166
167 if (cookie == 0) { 167 if (cookie == 0) {
168 *p++ = xdr_one; /* next */ 168 *p++ = xdr_one; /* next */
@@ -1314,11 +1314,9 @@ nfs4_open_revalidate(struct inode *dir, struct dentry *dentry, int openflags, st
1314 case -EROFS: 1314 case -EROFS:
1315 lookup_instantiate_filp(nd, (struct dentry *)state, NULL); 1315 lookup_instantiate_filp(nd, (struct dentry *)state, NULL);
1316 return 1; 1316 return 1;
1317 case -ENOENT: 1317 default:
1318 if (dentry->d_inode == NULL) 1318 goto out_drop;
1319 return 1;
1320 } 1319 }
1321 goto out_drop;
1322 } 1320 }
1323 if (state->inode == dentry->d_inode) { 1321 if (state->inode == dentry->d_inode) {
1324 nfs4_intent_set_file(nd, dentry, state); 1322 nfs4_intent_set_file(nd, dentry, state);
@@ -2917,11 +2915,11 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, unsigned short po
2917 .rpc_resp = clp, 2915 .rpc_resp = clp,
2918 .rpc_cred = cred, 2916 .rpc_cred = cred,
2919 }; 2917 };
2920 u32 *p; 2918 __be32 *p;
2921 int loop = 0; 2919 int loop = 0;
2922 int status; 2920 int status;
2923 2921
2924 p = (u32*)sc_verifier.data; 2922 p = (__be32*)sc_verifier.data;
2925 *p++ = htonl((u32)clp->cl_boot_time.tv_sec); 2923 *p++ = htonl((u32)clp->cl_boot_time.tv_sec);
2926 *p = htonl((u32)clp->cl_boot_time.tv_nsec); 2924 *p = htonl((u32)clp->cl_boot_time.tv_nsec);
2927 2925
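
Two details in the fs/nfs/nfs4proc.c hunks above deserve a note. First, kmap_atomic() returns void *, which converts implicitly to any object pointer in C, so once start and p are __be32 * the old (u32 *) cast is simply dropped rather than rewritten. Second, the SETCLIENTID verifier is built by storing two big-endian words into an opaque 8-byte buffer, and htonl() returns __be32, so the cursor has to be __be32 * for sparse to stay quiet. A sketch of that pattern, mirroring the hunk above (the helper name is illustrative):

static void fill_boot_verifier(nfs4_verifier *verf, const struct timespec *boot)
{
	__be32 *p = (__be32 *)verf->data;

	*p++ = htonl((u32)boot->tv_sec);	/* bytes 0-3, big endian */
	*p = htonl((u32)boot->tv_nsec);		/* bytes 4-7, big endian */
}
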
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 3dd413f52da1..0cf3fa312a33 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -471,7 +471,7 @@ struct compound_hdr {
471 471
472static void encode_string(struct xdr_stream *xdr, unsigned int len, const char *str) 472static void encode_string(struct xdr_stream *xdr, unsigned int len, const char *str)
473{ 473{
474 uint32_t *p; 474 __be32 *p;
475 475
476 p = xdr_reserve_space(xdr, 4 + len); 476 p = xdr_reserve_space(xdr, 4 + len);
477 BUG_ON(p == NULL); 477 BUG_ON(p == NULL);
@@ -480,7 +480,7 @@ static void encode_string(struct xdr_stream *xdr, unsigned int len, const char *
480 480
481static int encode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr) 481static int encode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr)
482{ 482{
483 uint32_t *p; 483 __be32 *p;
484 484
485 dprintk("encode_compound: tag=%.*s\n", (int)hdr->taglen, hdr->tag); 485 dprintk("encode_compound: tag=%.*s\n", (int)hdr->taglen, hdr->tag);
486 BUG_ON(hdr->taglen > NFS4_MAXTAGLEN); 486 BUG_ON(hdr->taglen > NFS4_MAXTAGLEN);
@@ -494,7 +494,7 @@ static int encode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr)
494 494
495static void encode_nfs4_verifier(struct xdr_stream *xdr, const nfs4_verifier *verf) 495static void encode_nfs4_verifier(struct xdr_stream *xdr, const nfs4_verifier *verf)
496{ 496{
497 uint32_t *p; 497 __be32 *p;
498 498
499 p = xdr_reserve_space(xdr, NFS4_VERIFIER_SIZE); 499 p = xdr_reserve_space(xdr, NFS4_VERIFIER_SIZE);
500 BUG_ON(p == NULL); 500 BUG_ON(p == NULL);
@@ -507,8 +507,8 @@ static int encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const s
507 char owner_group[IDMAP_NAMESZ]; 507 char owner_group[IDMAP_NAMESZ];
508 int owner_namelen = 0; 508 int owner_namelen = 0;
509 int owner_grouplen = 0; 509 int owner_grouplen = 0;
510 uint32_t *p; 510 __be32 *p;
511 uint32_t *q; 511 __be32 *q;
512 int len; 512 int len;
513 uint32_t bmval0 = 0; 513 uint32_t bmval0 = 0;
514 uint32_t bmval1 = 0; 514 uint32_t bmval1 = 0;
@@ -630,7 +630,7 @@ static int encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const s
630 630
631static int encode_access(struct xdr_stream *xdr, u32 access) 631static int encode_access(struct xdr_stream *xdr, u32 access)
632{ 632{
633 uint32_t *p; 633 __be32 *p;
634 634
635 RESERVE_SPACE(8); 635 RESERVE_SPACE(8);
636 WRITE32(OP_ACCESS); 636 WRITE32(OP_ACCESS);
@@ -641,7 +641,7 @@ static int encode_access(struct xdr_stream *xdr, u32 access)
641 641
642static int encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg) 642static int encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg)
643{ 643{
644 uint32_t *p; 644 __be32 *p;
645 645
646 RESERVE_SPACE(8+sizeof(arg->stateid->data)); 646 RESERVE_SPACE(8+sizeof(arg->stateid->data));
647 WRITE32(OP_CLOSE); 647 WRITE32(OP_CLOSE);
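
encode_access() above and every encode_* helper below lean on the same two macros, which is why each needs its cursor retyped: RESERVE_SPACE() points p at freshly reserved words in the xdr_stream, and WRITE32() stores through it after byte-swapping. Their definitions in fs/nfs/nfs4xdr.c of this era are approximately:

#define RESERVE_SPACE(nbytes)	do {			\
	p = xdr_reserve_space(xdr, nbytes);		\
	BUG_ON(!p);					\
} while (0)
#define WRITE32(n)	*p++ = htonl(n)

htonl() yields __be32, so the store in WRITE32() only type-checks under sparse when the local p is declared __be32 *; that is the entire content of the hunks that follow.
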
@@ -653,7 +653,7 @@ static int encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg)
653 653
654static int encode_commit(struct xdr_stream *xdr, const struct nfs_writeargs *args) 654static int encode_commit(struct xdr_stream *xdr, const struct nfs_writeargs *args)
655{ 655{
656 uint32_t *p; 656 __be32 *p;
657 657
658 RESERVE_SPACE(16); 658 RESERVE_SPACE(16);
659 WRITE32(OP_COMMIT); 659 WRITE32(OP_COMMIT);
@@ -665,7 +665,7 @@ static int encode_commit(struct xdr_stream *xdr, const struct nfs_writeargs *arg
665 665
666static int encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *create) 666static int encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *create)
667{ 667{
668 uint32_t *p; 668 __be32 *p;
669 669
670 RESERVE_SPACE(8); 670 RESERVE_SPACE(8);
671 WRITE32(OP_CREATE); 671 WRITE32(OP_CREATE);
@@ -697,7 +697,7 @@ static int encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *c
697 697
698static int encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap) 698static int encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap)
699{ 699{
700 uint32_t *p; 700 __be32 *p;
701 701
702 RESERVE_SPACE(12); 702 RESERVE_SPACE(12);
703 WRITE32(OP_GETATTR); 703 WRITE32(OP_GETATTR);
@@ -708,7 +708,7 @@ static int encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap)
708 708
709static int encode_getattr_two(struct xdr_stream *xdr, uint32_t bm0, uint32_t bm1) 709static int encode_getattr_two(struct xdr_stream *xdr, uint32_t bm0, uint32_t bm1)
710{ 710{
711 uint32_t *p; 711 __be32 *p;
712 712
713 RESERVE_SPACE(16); 713 RESERVE_SPACE(16);
714 WRITE32(OP_GETATTR); 714 WRITE32(OP_GETATTR);
@@ -740,7 +740,7 @@ static int encode_fs_locations(struct xdr_stream *xdr, const u32* bitmask)
740 740
741static int encode_getfh(struct xdr_stream *xdr) 741static int encode_getfh(struct xdr_stream *xdr)
742{ 742{
743 uint32_t *p; 743 __be32 *p;
744 744
745 RESERVE_SPACE(4); 745 RESERVE_SPACE(4);
746 WRITE32(OP_GETFH); 746 WRITE32(OP_GETFH);
@@ -750,7 +750,7 @@ static int encode_getfh(struct xdr_stream *xdr)
750 750
751static int encode_link(struct xdr_stream *xdr, const struct qstr *name) 751static int encode_link(struct xdr_stream *xdr, const struct qstr *name)
752{ 752{
753 uint32_t *p; 753 __be32 *p;
754 754
755 RESERVE_SPACE(8 + name->len); 755 RESERVE_SPACE(8 + name->len);
756 WRITE32(OP_LINK); 756 WRITE32(OP_LINK);
@@ -780,7 +780,7 @@ static inline uint64_t nfs4_lock_length(struct file_lock *fl)
780 */ 780 */
781static int encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args) 781static int encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args)
782{ 782{
783 uint32_t *p; 783 __be32 *p;
784 784
785 RESERVE_SPACE(32); 785 RESERVE_SPACE(32);
786 WRITE32(OP_LOCK); 786 WRITE32(OP_LOCK);
@@ -809,7 +809,7 @@ static int encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args)
809 809
810static int encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *args) 810static int encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *args)
811{ 811{
812 uint32_t *p; 812 __be32 *p;
813 813
814 RESERVE_SPACE(40); 814 RESERVE_SPACE(40);
815 WRITE32(OP_LOCKT); 815 WRITE32(OP_LOCKT);
@@ -825,7 +825,7 @@ static int encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *arg
825 825
826static int encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *args) 826static int encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *args)
827{ 827{
828 uint32_t *p; 828 __be32 *p;
829 829
830 RESERVE_SPACE(44); 830 RESERVE_SPACE(44);
831 WRITE32(OP_LOCKU); 831 WRITE32(OP_LOCKU);
@@ -841,7 +841,7 @@ static int encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *arg
841static int encode_lookup(struct xdr_stream *xdr, const struct qstr *name) 841static int encode_lookup(struct xdr_stream *xdr, const struct qstr *name)
842{ 842{
843 int len = name->len; 843 int len = name->len;
844 uint32_t *p; 844 __be32 *p;
845 845
846 RESERVE_SPACE(8 + len); 846 RESERVE_SPACE(8 + len);
847 WRITE32(OP_LOOKUP); 847 WRITE32(OP_LOOKUP);
@@ -853,7 +853,7 @@ static int encode_lookup(struct xdr_stream *xdr, const struct qstr *name)
853 853
854static void encode_share_access(struct xdr_stream *xdr, int open_flags) 854static void encode_share_access(struct xdr_stream *xdr, int open_flags)
855{ 855{
856 uint32_t *p; 856 __be32 *p;
857 857
858 RESERVE_SPACE(8); 858 RESERVE_SPACE(8);
859 switch (open_flags & (FMODE_READ|FMODE_WRITE)) { 859 switch (open_flags & (FMODE_READ|FMODE_WRITE)) {
@@ -874,7 +874,7 @@ static void encode_share_access(struct xdr_stream *xdr, int open_flags)
874 874
875static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_openargs *arg) 875static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_openargs *arg)
876{ 876{
877 uint32_t *p; 877 __be32 *p;
878 /* 878 /*
879 * opcode 4, seqid 4, share_access 4, share_deny 4, clientid 8, ownerlen 4, 879 * opcode 4, seqid 4, share_access 4, share_deny 4, clientid 8, ownerlen 4,
880 * owner 4 = 32 880 * owner 4 = 32
@@ -891,7 +891,7 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena
891 891
892static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_openargs *arg) 892static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_openargs *arg)
893{ 893{
894 uint32_t *p; 894 __be32 *p;
895 895
896 RESERVE_SPACE(4); 896 RESERVE_SPACE(4);
897 switch(arg->open_flags & O_EXCL) { 897 switch(arg->open_flags & O_EXCL) {
@@ -907,7 +907,7 @@ static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_op
907 907
908static void encode_opentype(struct xdr_stream *xdr, const struct nfs_openargs *arg) 908static void encode_opentype(struct xdr_stream *xdr, const struct nfs_openargs *arg)
909{ 909{
910 uint32_t *p; 910 __be32 *p;
911 911
912 RESERVE_SPACE(4); 912 RESERVE_SPACE(4);
913 switch (arg->open_flags & O_CREAT) { 913 switch (arg->open_flags & O_CREAT) {
@@ -923,7 +923,7 @@ static void encode_opentype(struct xdr_stream *xdr, const struct nfs_openargs *a
923 923
924static inline void encode_delegation_type(struct xdr_stream *xdr, int delegation_type) 924static inline void encode_delegation_type(struct xdr_stream *xdr, int delegation_type)
925{ 925{
926 uint32_t *p; 926 __be32 *p;
927 927
928 RESERVE_SPACE(4); 928 RESERVE_SPACE(4);
929 switch (delegation_type) { 929 switch (delegation_type) {
@@ -943,7 +943,7 @@ static inline void encode_delegation_type(struct xdr_stream *xdr, int delegation
943 943
944static inline void encode_claim_null(struct xdr_stream *xdr, const struct qstr *name) 944static inline void encode_claim_null(struct xdr_stream *xdr, const struct qstr *name)
945{ 945{
946 uint32_t *p; 946 __be32 *p;
947 947
948 RESERVE_SPACE(4); 948 RESERVE_SPACE(4);
949 WRITE32(NFS4_OPEN_CLAIM_NULL); 949 WRITE32(NFS4_OPEN_CLAIM_NULL);
@@ -952,7 +952,7 @@ static inline void encode_claim_null(struct xdr_stream *xdr, const struct qstr *
952 952
953static inline void encode_claim_previous(struct xdr_stream *xdr, int type) 953static inline void encode_claim_previous(struct xdr_stream *xdr, int type)
954{ 954{
955 uint32_t *p; 955 __be32 *p;
956 956
957 RESERVE_SPACE(4); 957 RESERVE_SPACE(4);
958 WRITE32(NFS4_OPEN_CLAIM_PREVIOUS); 958 WRITE32(NFS4_OPEN_CLAIM_PREVIOUS);
@@ -961,7 +961,7 @@ static inline void encode_claim_previous(struct xdr_stream *xdr, int type)
961 961
962static inline void encode_claim_delegate_cur(struct xdr_stream *xdr, const struct qstr *name, const nfs4_stateid *stateid) 962static inline void encode_claim_delegate_cur(struct xdr_stream *xdr, const struct qstr *name, const nfs4_stateid *stateid)
963{ 963{
964 uint32_t *p; 964 __be32 *p;
965 965
966 RESERVE_SPACE(4+sizeof(stateid->data)); 966 RESERVE_SPACE(4+sizeof(stateid->data));
967 WRITE32(NFS4_OPEN_CLAIM_DELEGATE_CUR); 967 WRITE32(NFS4_OPEN_CLAIM_DELEGATE_CUR);
@@ -991,7 +991,7 @@ static int encode_open(struct xdr_stream *xdr, const struct nfs_openargs *arg)
991 991
992static int encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_confirmargs *arg) 992static int encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_confirmargs *arg)
993{ 993{
994 uint32_t *p; 994 __be32 *p;
995 995
996 RESERVE_SPACE(8+sizeof(arg->stateid->data)); 996 RESERVE_SPACE(8+sizeof(arg->stateid->data));
997 WRITE32(OP_OPEN_CONFIRM); 997 WRITE32(OP_OPEN_CONFIRM);
@@ -1003,7 +1003,7 @@ static int encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_con
1003 1003
1004static int encode_open_downgrade(struct xdr_stream *xdr, const struct nfs_closeargs *arg) 1004static int encode_open_downgrade(struct xdr_stream *xdr, const struct nfs_closeargs *arg)
1005{ 1005{
1006 uint32_t *p; 1006 __be32 *p;
1007 1007
1008 RESERVE_SPACE(8+sizeof(arg->stateid->data)); 1008 RESERVE_SPACE(8+sizeof(arg->stateid->data));
1009 WRITE32(OP_OPEN_DOWNGRADE); 1009 WRITE32(OP_OPEN_DOWNGRADE);
@@ -1017,7 +1017,7 @@ static int
1017encode_putfh(struct xdr_stream *xdr, const struct nfs_fh *fh) 1017encode_putfh(struct xdr_stream *xdr, const struct nfs_fh *fh)
1018{ 1018{
1019 int len = fh->size; 1019 int len = fh->size;
1020 uint32_t *p; 1020 __be32 *p;
1021 1021
1022 RESERVE_SPACE(8 + len); 1022 RESERVE_SPACE(8 + len);
1023 WRITE32(OP_PUTFH); 1023 WRITE32(OP_PUTFH);
@@ -1029,7 +1029,7 @@ encode_putfh(struct xdr_stream *xdr, const struct nfs_fh *fh)
1029 1029
1030static int encode_putrootfh(struct xdr_stream *xdr) 1030static int encode_putrootfh(struct xdr_stream *xdr)
1031{ 1031{
1032 uint32_t *p; 1032 __be32 *p;
1033 1033
1034 RESERVE_SPACE(4); 1034 RESERVE_SPACE(4);
1035 WRITE32(OP_PUTROOTFH); 1035 WRITE32(OP_PUTROOTFH);
@@ -1040,7 +1040,7 @@ static int encode_putrootfh(struct xdr_stream *xdr)
1040static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx) 1040static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx)
1041{ 1041{
1042 nfs4_stateid stateid; 1042 nfs4_stateid stateid;
1043 uint32_t *p; 1043 __be32 *p;
1044 1044
1045 RESERVE_SPACE(16); 1045 RESERVE_SPACE(16);
1046 if (ctx->state != NULL) { 1046 if (ctx->state != NULL) {
@@ -1052,7 +1052,7 @@ static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context
1052 1052
1053static int encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args) 1053static int encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args)
1054{ 1054{
1055 uint32_t *p; 1055 __be32 *p;
1056 1056
1057 RESERVE_SPACE(4); 1057 RESERVE_SPACE(4);
1058 WRITE32(OP_READ); 1058 WRITE32(OP_READ);
@@ -1074,7 +1074,7 @@ static int encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg
1074 FATTR4_WORD1_MOUNTED_ON_FILEID, 1074 FATTR4_WORD1_MOUNTED_ON_FILEID,
1075 }; 1075 };
1076 int replen; 1076 int replen;
1077 uint32_t *p; 1077 __be32 *p;
1078 1078
1079 RESERVE_SPACE(32+sizeof(nfs4_verifier)); 1079 RESERVE_SPACE(32+sizeof(nfs4_verifier));
1080 WRITE32(OP_READDIR); 1080 WRITE32(OP_READDIR);
@@ -1116,7 +1116,7 @@ static int encode_readlink(struct xdr_stream *xdr, const struct nfs4_readlink *r
1116{ 1116{
1117 struct rpc_auth *auth = req->rq_task->tk_auth; 1117 struct rpc_auth *auth = req->rq_task->tk_auth;
1118 unsigned int replen; 1118 unsigned int replen;
1119 uint32_t *p; 1119 __be32 *p;
1120 1120
1121 RESERVE_SPACE(4); 1121 RESERVE_SPACE(4);
1122 WRITE32(OP_READLINK); 1122 WRITE32(OP_READLINK);
@@ -1134,7 +1134,7 @@ static int encode_readlink(struct xdr_stream *xdr, const struct nfs4_readlink *r
1134 1134
1135static int encode_remove(struct xdr_stream *xdr, const struct qstr *name) 1135static int encode_remove(struct xdr_stream *xdr, const struct qstr *name)
1136{ 1136{
1137 uint32_t *p; 1137 __be32 *p;
1138 1138
1139 RESERVE_SPACE(8 + name->len); 1139 RESERVE_SPACE(8 + name->len);
1140 WRITE32(OP_REMOVE); 1140 WRITE32(OP_REMOVE);
@@ -1146,7 +1146,7 @@ static int encode_remove(struct xdr_stream *xdr, const struct qstr *name)
1146 1146
1147static int encode_rename(struct xdr_stream *xdr, const struct qstr *oldname, const struct qstr *newname) 1147static int encode_rename(struct xdr_stream *xdr, const struct qstr *oldname, const struct qstr *newname)
1148{ 1148{
1149 uint32_t *p; 1149 __be32 *p;
1150 1150
1151 RESERVE_SPACE(8 + oldname->len); 1151 RESERVE_SPACE(8 + oldname->len);
1152 WRITE32(OP_RENAME); 1152 WRITE32(OP_RENAME);
@@ -1162,7 +1162,7 @@ static int encode_rename(struct xdr_stream *xdr, const struct qstr *oldname, con
1162 1162
1163static int encode_renew(struct xdr_stream *xdr, const struct nfs_client *client_stateid) 1163static int encode_renew(struct xdr_stream *xdr, const struct nfs_client *client_stateid)
1164{ 1164{
1165 uint32_t *p; 1165 __be32 *p;
1166 1166
1167 RESERVE_SPACE(12); 1167 RESERVE_SPACE(12);
1168 WRITE32(OP_RENEW); 1168 WRITE32(OP_RENEW);
@@ -1174,7 +1174,7 @@ static int encode_renew(struct xdr_stream *xdr, const struct nfs_client *client_
1174static int 1174static int
1175encode_restorefh(struct xdr_stream *xdr) 1175encode_restorefh(struct xdr_stream *xdr)
1176{ 1176{
1177 uint32_t *p; 1177 __be32 *p;
1178 1178
1179 RESERVE_SPACE(4); 1179 RESERVE_SPACE(4);
1180 WRITE32(OP_RESTOREFH); 1180 WRITE32(OP_RESTOREFH);
@@ -1185,7 +1185,7 @@ encode_restorefh(struct xdr_stream *xdr)
1185static int 1185static int
1186encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg) 1186encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg)
1187{ 1187{
1188 uint32_t *p; 1188 __be32 *p;
1189 1189
1190 RESERVE_SPACE(4+sizeof(zero_stateid.data)); 1190 RESERVE_SPACE(4+sizeof(zero_stateid.data));
1191 WRITE32(OP_SETATTR); 1191 WRITE32(OP_SETATTR);
@@ -1204,7 +1204,7 @@ encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg)
1204static int 1204static int
1205encode_savefh(struct xdr_stream *xdr) 1205encode_savefh(struct xdr_stream *xdr)
1206{ 1206{
1207 uint32_t *p; 1207 __be32 *p;
1208 1208
1209 RESERVE_SPACE(4); 1209 RESERVE_SPACE(4);
1210 WRITE32(OP_SAVEFH); 1210 WRITE32(OP_SAVEFH);
@@ -1215,7 +1215,7 @@ encode_savefh(struct xdr_stream *xdr)
1215static int encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs *arg, const struct nfs_server *server) 1215static int encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs *arg, const struct nfs_server *server)
1216{ 1216{
1217 int status; 1217 int status;
1218 uint32_t *p; 1218 __be32 *p;
1219 1219
1220 RESERVE_SPACE(4+sizeof(arg->stateid.data)); 1220 RESERVE_SPACE(4+sizeof(arg->stateid.data));
1221 WRITE32(OP_SETATTR); 1221 WRITE32(OP_SETATTR);
@@ -1229,7 +1229,7 @@ static int encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs *
1229 1229
1230static int encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclientid *setclientid) 1230static int encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclientid *setclientid)
1231{ 1231{
1232 uint32_t *p; 1232 __be32 *p;
1233 1233
1234 RESERVE_SPACE(4 + sizeof(setclientid->sc_verifier->data)); 1234 RESERVE_SPACE(4 + sizeof(setclientid->sc_verifier->data));
1235 WRITE32(OP_SETCLIENTID); 1235 WRITE32(OP_SETCLIENTID);
@@ -1248,7 +1248,7 @@ static int encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclien
1248 1248
1249static int encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs_client *client_state) 1249static int encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs_client *client_state)
1250{ 1250{
1251 uint32_t *p; 1251 __be32 *p;
1252 1252
1253 RESERVE_SPACE(12 + sizeof(client_state->cl_confirm.data)); 1253 RESERVE_SPACE(12 + sizeof(client_state->cl_confirm.data));
1254 WRITE32(OP_SETCLIENTID_CONFIRM); 1254 WRITE32(OP_SETCLIENTID_CONFIRM);
@@ -1260,7 +1260,7 @@ static int encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs_c
1260 1260
1261static int encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *args) 1261static int encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *args)
1262{ 1262{
1263 uint32_t *p; 1263 __be32 *p;
1264 1264
1265 RESERVE_SPACE(4); 1265 RESERVE_SPACE(4);
1266 WRITE32(OP_WRITE); 1266 WRITE32(OP_WRITE);
@@ -1279,7 +1279,7 @@ static int encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *args
1279 1279
1280static int encode_delegreturn(struct xdr_stream *xdr, const nfs4_stateid *stateid) 1280static int encode_delegreturn(struct xdr_stream *xdr, const nfs4_stateid *stateid)
1281{ 1281{
1282 uint32_t *p; 1282 __be32 *p;
1283 1283
1284 RESERVE_SPACE(20); 1284 RESERVE_SPACE(20);
1285 1285
@@ -1295,7 +1295,7 @@ static int encode_delegreturn(struct xdr_stream *xdr, const nfs4_stateid *statei
1295/* 1295/*
1296 * Encode an ACCESS request 1296 * Encode an ACCESS request
1297 */ 1297 */
1298static int nfs4_xdr_enc_access(struct rpc_rqst *req, uint32_t *p, const struct nfs4_accessargs *args) 1298static int nfs4_xdr_enc_access(struct rpc_rqst *req, __be32 *p, const struct nfs4_accessargs *args)
1299{ 1299{
1300 struct xdr_stream xdr; 1300 struct xdr_stream xdr;
1301 struct compound_hdr hdr = { 1301 struct compound_hdr hdr = {
@@ -1313,7 +1313,7 @@ static int nfs4_xdr_enc_access(struct rpc_rqst *req, uint32_t *p, const struct n
1313/* 1313/*
1314 * Encode LOOKUP request 1314 * Encode LOOKUP request
1315 */ 1315 */
1316static int nfs4_xdr_enc_lookup(struct rpc_rqst *req, uint32_t *p, const struct nfs4_lookup_arg *args) 1316static int nfs4_xdr_enc_lookup(struct rpc_rqst *req, __be32 *p, const struct nfs4_lookup_arg *args)
1317{ 1317{
1318 struct xdr_stream xdr; 1318 struct xdr_stream xdr;
1319 struct compound_hdr hdr = { 1319 struct compound_hdr hdr = {
@@ -1337,7 +1337,7 @@ out:
1337/* 1337/*
1338 * Encode LOOKUP_ROOT request 1338 * Encode LOOKUP_ROOT request
1339 */ 1339 */
1340static int nfs4_xdr_enc_lookup_root(struct rpc_rqst *req, uint32_t *p, const struct nfs4_lookup_root_arg *args) 1340static int nfs4_xdr_enc_lookup_root(struct rpc_rqst *req, __be32 *p, const struct nfs4_lookup_root_arg *args)
1341{ 1341{
1342 struct xdr_stream xdr; 1342 struct xdr_stream xdr;
1343 struct compound_hdr hdr = { 1343 struct compound_hdr hdr = {
@@ -1358,7 +1358,7 @@ out:
1358/* 1358/*
1359 * Encode REMOVE request 1359 * Encode REMOVE request
1360 */ 1360 */
1361static int nfs4_xdr_enc_remove(struct rpc_rqst *req, uint32_t *p, const struct nfs4_remove_arg *args) 1361static int nfs4_xdr_enc_remove(struct rpc_rqst *req, __be32 *p, const struct nfs4_remove_arg *args)
1362{ 1362{
1363 struct xdr_stream xdr; 1363 struct xdr_stream xdr;
1364 struct compound_hdr hdr = { 1364 struct compound_hdr hdr = {
@@ -1380,7 +1380,7 @@ out:
1380/* 1380/*
1381 * Encode RENAME request 1381 * Encode RENAME request
1382 */ 1382 */
1383static int nfs4_xdr_enc_rename(struct rpc_rqst *req, uint32_t *p, const struct nfs4_rename_arg *args) 1383static int nfs4_xdr_enc_rename(struct rpc_rqst *req, __be32 *p, const struct nfs4_rename_arg *args)
1384{ 1384{
1385 struct xdr_stream xdr; 1385 struct xdr_stream xdr;
1386 struct compound_hdr hdr = { 1386 struct compound_hdr hdr = {
@@ -1410,7 +1410,7 @@ out:
1410/* 1410/*
1411 * Encode LINK request 1411 * Encode LINK request
1412 */ 1412 */
1413static int nfs4_xdr_enc_link(struct rpc_rqst *req, uint32_t *p, const struct nfs4_link_arg *args) 1413static int nfs4_xdr_enc_link(struct rpc_rqst *req, __be32 *p, const struct nfs4_link_arg *args)
1414{ 1414{
1415 struct xdr_stream xdr; 1415 struct xdr_stream xdr;
1416 struct compound_hdr hdr = { 1416 struct compound_hdr hdr = {
@@ -1440,7 +1440,7 @@ out:
1440/* 1440/*
1441 * Encode CREATE request 1441 * Encode CREATE request
1442 */ 1442 */
1443static int nfs4_xdr_enc_create(struct rpc_rqst *req, uint32_t *p, const struct nfs4_create_arg *args) 1443static int nfs4_xdr_enc_create(struct rpc_rqst *req, __be32 *p, const struct nfs4_create_arg *args)
1444{ 1444{
1445 struct xdr_stream xdr; 1445 struct xdr_stream xdr;
1446 struct compound_hdr hdr = { 1446 struct compound_hdr hdr = {
@@ -1470,7 +1470,7 @@ out:
1470/* 1470/*
1471 * Encode SYMLINK request 1471 * Encode SYMLINK request
1472 */ 1472 */
1473static int nfs4_xdr_enc_symlink(struct rpc_rqst *req, uint32_t *p, const struct nfs4_create_arg *args) 1473static int nfs4_xdr_enc_symlink(struct rpc_rqst *req, __be32 *p, const struct nfs4_create_arg *args)
1474{ 1474{
1475 return nfs4_xdr_enc_create(req, p, args); 1475 return nfs4_xdr_enc_create(req, p, args);
1476} 1476}
@@ -1478,7 +1478,7 @@ static int nfs4_xdr_enc_symlink(struct rpc_rqst *req, uint32_t *p, const struct
1478/* 1478/*
1479 * Encode GETATTR request 1479 * Encode GETATTR request
1480 */ 1480 */
1481static int nfs4_xdr_enc_getattr(struct rpc_rqst *req, uint32_t *p, const struct nfs4_getattr_arg *args) 1481static int nfs4_xdr_enc_getattr(struct rpc_rqst *req, __be32 *p, const struct nfs4_getattr_arg *args)
1482{ 1482{
1483 struct xdr_stream xdr; 1483 struct xdr_stream xdr;
1484 struct compound_hdr hdr = { 1484 struct compound_hdr hdr = {
@@ -1496,7 +1496,7 @@ static int nfs4_xdr_enc_getattr(struct rpc_rqst *req, uint32_t *p, const struct
1496/* 1496/*
1497 * Encode a CLOSE request 1497 * Encode a CLOSE request
1498 */ 1498 */
1499static int nfs4_xdr_enc_close(struct rpc_rqst *req, uint32_t *p, struct nfs_closeargs *args) 1499static int nfs4_xdr_enc_close(struct rpc_rqst *req, __be32 *p, struct nfs_closeargs *args)
1500{ 1500{
1501 struct xdr_stream xdr; 1501 struct xdr_stream xdr;
1502 struct compound_hdr hdr = { 1502 struct compound_hdr hdr = {
@@ -1520,7 +1520,7 @@ out:
1520/* 1520/*
1521 * Encode an OPEN request 1521 * Encode an OPEN request
1522 */ 1522 */
1523static int nfs4_xdr_enc_open(struct rpc_rqst *req, uint32_t *p, struct nfs_openargs *args) 1523static int nfs4_xdr_enc_open(struct rpc_rqst *req, __be32 *p, struct nfs_openargs *args)
1524{ 1524{
1525 struct xdr_stream xdr; 1525 struct xdr_stream xdr;
1526 struct compound_hdr hdr = { 1526 struct compound_hdr hdr = {
@@ -1556,7 +1556,7 @@ out:
1556/* 1556/*
1557 * Encode an OPEN_CONFIRM request 1557 * Encode an OPEN_CONFIRM request
1558 */ 1558 */
1559static int nfs4_xdr_enc_open_confirm(struct rpc_rqst *req, uint32_t *p, struct nfs_open_confirmargs *args) 1559static int nfs4_xdr_enc_open_confirm(struct rpc_rqst *req, __be32 *p, struct nfs_open_confirmargs *args)
1560{ 1560{
1561 struct xdr_stream xdr; 1561 struct xdr_stream xdr;
1562 struct compound_hdr hdr = { 1562 struct compound_hdr hdr = {
@@ -1577,7 +1577,7 @@ out:
1577/* 1577/*
1578 * Encode an OPEN request with no attributes. 1578 * Encode an OPEN request with no attributes.
1579 */ 1579 */
1580static int nfs4_xdr_enc_open_noattr(struct rpc_rqst *req, uint32_t *p, struct nfs_openargs *args) 1580static int nfs4_xdr_enc_open_noattr(struct rpc_rqst *req, __be32 *p, struct nfs_openargs *args)
1581{ 1581{
1582 struct xdr_stream xdr; 1582 struct xdr_stream xdr;
1583 struct compound_hdr hdr = { 1583 struct compound_hdr hdr = {
@@ -1601,7 +1601,7 @@ out:
1601/* 1601/*
1602 * Encode an OPEN_DOWNGRADE request 1602 * Encode an OPEN_DOWNGRADE request
1603 */ 1603 */
1604static int nfs4_xdr_enc_open_downgrade(struct rpc_rqst *req, uint32_t *p, struct nfs_closeargs *args) 1604static int nfs4_xdr_enc_open_downgrade(struct rpc_rqst *req, __be32 *p, struct nfs_closeargs *args)
1605{ 1605{
1606 struct xdr_stream xdr; 1606 struct xdr_stream xdr;
1607 struct compound_hdr hdr = { 1607 struct compound_hdr hdr = {
@@ -1625,7 +1625,7 @@ out:
1625/* 1625/*
1626 * Encode a LOCK request 1626 * Encode a LOCK request
1627 */ 1627 */
1628static int nfs4_xdr_enc_lock(struct rpc_rqst *req, uint32_t *p, struct nfs_lock_args *args) 1628static int nfs4_xdr_enc_lock(struct rpc_rqst *req, __be32 *p, struct nfs_lock_args *args)
1629{ 1629{
1630 struct xdr_stream xdr; 1630 struct xdr_stream xdr;
1631 struct compound_hdr hdr = { 1631 struct compound_hdr hdr = {
@@ -1646,7 +1646,7 @@ out:
1646/* 1646/*
1647 * Encode a LOCKT request 1647 * Encode a LOCKT request
1648 */ 1648 */
1649static int nfs4_xdr_enc_lockt(struct rpc_rqst *req, uint32_t *p, struct nfs_lockt_args *args) 1649static int nfs4_xdr_enc_lockt(struct rpc_rqst *req, __be32 *p, struct nfs_lockt_args *args)
1650{ 1650{
1651 struct xdr_stream xdr; 1651 struct xdr_stream xdr;
1652 struct compound_hdr hdr = { 1652 struct compound_hdr hdr = {
@@ -1667,7 +1667,7 @@ out:
1667/* 1667/*
1668 * Encode a LOCKU request 1668 * Encode a LOCKU request
1669 */ 1669 */
1670static int nfs4_xdr_enc_locku(struct rpc_rqst *req, uint32_t *p, struct nfs_locku_args *args) 1670static int nfs4_xdr_enc_locku(struct rpc_rqst *req, __be32 *p, struct nfs_locku_args *args)
1671{ 1671{
1672 struct xdr_stream xdr; 1672 struct xdr_stream xdr;
1673 struct compound_hdr hdr = { 1673 struct compound_hdr hdr = {
@@ -1688,7 +1688,7 @@ out:
1688/* 1688/*
1689 * Encode a READLINK request 1689 * Encode a READLINK request
1690 */ 1690 */
1691static int nfs4_xdr_enc_readlink(struct rpc_rqst *req, uint32_t *p, const struct nfs4_readlink *args) 1691static int nfs4_xdr_enc_readlink(struct rpc_rqst *req, __be32 *p, const struct nfs4_readlink *args)
1692{ 1692{
1693 struct xdr_stream xdr; 1693 struct xdr_stream xdr;
1694 struct compound_hdr hdr = { 1694 struct compound_hdr hdr = {
@@ -1709,7 +1709,7 @@ out:
1709/* 1709/*
1710 * Encode a READDIR request 1710 * Encode a READDIR request
1711 */ 1711 */
1712static int nfs4_xdr_enc_readdir(struct rpc_rqst *req, uint32_t *p, const struct nfs4_readdir_arg *args) 1712static int nfs4_xdr_enc_readdir(struct rpc_rqst *req, __be32 *p, const struct nfs4_readdir_arg *args)
1713{ 1713{
1714 struct xdr_stream xdr; 1714 struct xdr_stream xdr;
1715 struct compound_hdr hdr = { 1715 struct compound_hdr hdr = {
@@ -1730,7 +1730,7 @@ out:
1730/* 1730/*
1731 * Encode a READ request 1731 * Encode a READ request
1732 */ 1732 */
1733static int nfs4_xdr_enc_read(struct rpc_rqst *req, uint32_t *p, struct nfs_readargs *args) 1733static int nfs4_xdr_enc_read(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args)
1734{ 1734{
1735 struct rpc_auth *auth = req->rq_task->tk_auth; 1735 struct rpc_auth *auth = req->rq_task->tk_auth;
1736 struct xdr_stream xdr; 1736 struct xdr_stream xdr;
@@ -1762,7 +1762,7 @@ out:
1762/* 1762/*
1763 * Encode an SETATTR request 1763 * Encode an SETATTR request
1764 */ 1764 */
1765static int nfs4_xdr_enc_setattr(struct rpc_rqst *req, uint32_t *p, struct nfs_setattrargs *args) 1765static int nfs4_xdr_enc_setattr(struct rpc_rqst *req, __be32 *p, struct nfs_setattrargs *args)
1766 1766
1767{ 1767{
1768 struct xdr_stream xdr; 1768 struct xdr_stream xdr;
@@ -1788,7 +1788,7 @@ out:
1788 * Encode a GETACL request 1788 * Encode a GETACL request
1789 */ 1789 */
1790static int 1790static int
1791nfs4_xdr_enc_getacl(struct rpc_rqst *req, uint32_t *p, 1791nfs4_xdr_enc_getacl(struct rpc_rqst *req, __be32 *p,
1792 struct nfs_getaclargs *args) 1792 struct nfs_getaclargs *args)
1793{ 1793{
1794 struct xdr_stream xdr; 1794 struct xdr_stream xdr;
@@ -1815,7 +1815,7 @@ out:
1815/* 1815/*
1816 * Encode a WRITE request 1816 * Encode a WRITE request
1817 */ 1817 */
1818static int nfs4_xdr_enc_write(struct rpc_rqst *req, uint32_t *p, struct nfs_writeargs *args) 1818static int nfs4_xdr_enc_write(struct rpc_rqst *req, __be32 *p, struct nfs_writeargs *args)
1819{ 1819{
1820 struct xdr_stream xdr; 1820 struct xdr_stream xdr;
1821 struct compound_hdr hdr = { 1821 struct compound_hdr hdr = {
@@ -1839,7 +1839,7 @@ out:
1839/* 1839/*
1840 * a COMMIT request 1840 * a COMMIT request
1841 */ 1841 */
1842static int nfs4_xdr_enc_commit(struct rpc_rqst *req, uint32_t *p, struct nfs_writeargs *args) 1842static int nfs4_xdr_enc_commit(struct rpc_rqst *req, __be32 *p, struct nfs_writeargs *args)
1843{ 1843{
1844 struct xdr_stream xdr; 1844 struct xdr_stream xdr;
1845 struct compound_hdr hdr = { 1845 struct compound_hdr hdr = {
@@ -1863,7 +1863,7 @@ out:
1863/* 1863/*
1864 * FSINFO request 1864 * FSINFO request
1865 */ 1865 */
1866static int nfs4_xdr_enc_fsinfo(struct rpc_rqst *req, uint32_t *p, struct nfs4_fsinfo_arg *args) 1866static int nfs4_xdr_enc_fsinfo(struct rpc_rqst *req, __be32 *p, struct nfs4_fsinfo_arg *args)
1867{ 1867{
1868 struct xdr_stream xdr; 1868 struct xdr_stream xdr;
1869 struct compound_hdr hdr = { 1869 struct compound_hdr hdr = {
@@ -1882,7 +1882,7 @@ static int nfs4_xdr_enc_fsinfo(struct rpc_rqst *req, uint32_t *p, struct nfs4_fs
1882/* 1882/*
1883 * a PATHCONF request 1883 * a PATHCONF request
1884 */ 1884 */
1885static int nfs4_xdr_enc_pathconf(struct rpc_rqst *req, uint32_t *p, const struct nfs4_pathconf_arg *args) 1885static int nfs4_xdr_enc_pathconf(struct rpc_rqst *req, __be32 *p, const struct nfs4_pathconf_arg *args)
1886{ 1886{
1887 struct xdr_stream xdr; 1887 struct xdr_stream xdr;
1888 struct compound_hdr hdr = { 1888 struct compound_hdr hdr = {
@@ -1902,7 +1902,7 @@ static int nfs4_xdr_enc_pathconf(struct rpc_rqst *req, uint32_t *p, const struct
1902/* 1902/*
1903 * a STATFS request 1903 * a STATFS request
1904 */ 1904 */
1905static int nfs4_xdr_enc_statfs(struct rpc_rqst *req, uint32_t *p, const struct nfs4_statfs_arg *args) 1905static int nfs4_xdr_enc_statfs(struct rpc_rqst *req, __be32 *p, const struct nfs4_statfs_arg *args)
1906{ 1906{
1907 struct xdr_stream xdr; 1907 struct xdr_stream xdr;
1908 struct compound_hdr hdr = { 1908 struct compound_hdr hdr = {
@@ -1923,7 +1923,7 @@ static int nfs4_xdr_enc_statfs(struct rpc_rqst *req, uint32_t *p, const struct n
1923/* 1923/*
1924 * GETATTR_BITMAP request 1924 * GETATTR_BITMAP request
1925 */ 1925 */
1926static int nfs4_xdr_enc_server_caps(struct rpc_rqst *req, uint32_t *p, const struct nfs_fh *fhandle) 1926static int nfs4_xdr_enc_server_caps(struct rpc_rqst *req, __be32 *p, const struct nfs_fh *fhandle)
1927{ 1927{
1928 struct xdr_stream xdr; 1928 struct xdr_stream xdr;
1929 struct compound_hdr hdr = { 1929 struct compound_hdr hdr = {
@@ -1945,7 +1945,7 @@ static int nfs4_xdr_enc_server_caps(struct rpc_rqst *req, uint32_t *p, const str
1945/* 1945/*
1946 * a RENEW request 1946 * a RENEW request
1947 */ 1947 */
1948static int nfs4_xdr_enc_renew(struct rpc_rqst *req, uint32_t *p, struct nfs_client *clp) 1948static int nfs4_xdr_enc_renew(struct rpc_rqst *req, __be32 *p, struct nfs_client *clp)
1949{ 1949{
1950 struct xdr_stream xdr; 1950 struct xdr_stream xdr;
1951 struct compound_hdr hdr = { 1951 struct compound_hdr hdr = {
@@ -1960,7 +1960,7 @@ static int nfs4_xdr_enc_renew(struct rpc_rqst *req, uint32_t *p, struct nfs_clie
1960/* 1960/*
1961 * a SETCLIENTID request 1961 * a SETCLIENTID request
1962 */ 1962 */
1963static int nfs4_xdr_enc_setclientid(struct rpc_rqst *req, uint32_t *p, struct nfs4_setclientid *sc) 1963static int nfs4_xdr_enc_setclientid(struct rpc_rqst *req, __be32 *p, struct nfs4_setclientid *sc)
1964{ 1964{
1965 struct xdr_stream xdr; 1965 struct xdr_stream xdr;
1966 struct compound_hdr hdr = { 1966 struct compound_hdr hdr = {
@@ -1975,7 +1975,7 @@ static int nfs4_xdr_enc_setclientid(struct rpc_rqst *req, uint32_t *p, struct nf
1975/* 1975/*
1976 * a SETCLIENTID_CONFIRM request 1976 * a SETCLIENTID_CONFIRM request
1977 */ 1977 */
1978static int nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req, uint32_t *p, struct nfs_client *clp) 1978static int nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req, __be32 *p, struct nfs_client *clp)
1979{ 1979{
1980 struct xdr_stream xdr; 1980 struct xdr_stream xdr;
1981 struct compound_hdr hdr = { 1981 struct compound_hdr hdr = {
@@ -1997,7 +1997,7 @@ static int nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req, uint32_t *p, s
1997/* 1997/*
1998 * DELEGRETURN request 1998 * DELEGRETURN request
1999 */ 1999 */
2000static int nfs4_xdr_enc_delegreturn(struct rpc_rqst *req, uint32_t *p, const struct nfs4_delegreturnargs *args) 2000static int nfs4_xdr_enc_delegreturn(struct rpc_rqst *req, __be32 *p, const struct nfs4_delegreturnargs *args)
2001{ 2001{
2002 struct xdr_stream xdr; 2002 struct xdr_stream xdr;
2003 struct compound_hdr hdr = { 2003 struct compound_hdr hdr = {
@@ -2021,7 +2021,7 @@ out:
2021/* 2021/*
2022 * Encode FS_LOCATIONS request 2022 * Encode FS_LOCATIONS request
2023 */ 2023 */
2024static int nfs4_xdr_enc_fs_locations(struct rpc_rqst *req, uint32_t *p, struct nfs4_fs_locations_arg *args) 2024static int nfs4_xdr_enc_fs_locations(struct rpc_rqst *req, __be32 *p, struct nfs4_fs_locations_arg *args)
2025{ 2025{
2026 struct xdr_stream xdr; 2026 struct xdr_stream xdr;
2027 struct compound_hdr hdr = { 2027 struct compound_hdr hdr = {
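
The decode helpers, which begin here, mirror the encoders with their own macro pair: READ_BUF() advances p across nbytes of inlined reply data and bails out on a short reply, and READ32() converts one word back to host order. Roughly, with the macro's debug printk elided:

#define READ32(x)	(x) = ntohl(*p++)
#define READ_BUF(nbytes)	do {			\
	p = xdr_inline_decode(xdr, nbytes);		\
	if (unlikely(!p))				\
		return -EIO;	/* hit end of receive buffer */	\
} while (0)

ntohl() consumes a __be32, so each decode_attr_* helper swaps its scratch pointer to __be32 * while the decoded values (bitmaps, lengths, ids) stay plain uint32_t; that split is why declarations like "uint32_t bmlen, *p;" become two lines in the hunks below.
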
@@ -2086,7 +2086,7 @@ out:
2086 2086
2087static int decode_opaque_inline(struct xdr_stream *xdr, unsigned int *len, char **string) 2087static int decode_opaque_inline(struct xdr_stream *xdr, unsigned int *len, char **string)
2088{ 2088{
2089 uint32_t *p; 2089 __be32 *p;
2090 2090
2091 READ_BUF(4); 2091 READ_BUF(4);
2092 READ32(*len); 2092 READ32(*len);
@@ -2097,7 +2097,7 @@ static int decode_opaque_inline(struct xdr_stream *xdr, unsigned int *len, char
2097 2097
2098static int decode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr) 2098static int decode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr)
2099{ 2099{
2100 uint32_t *p; 2100 __be32 *p;
2101 2101
2102 READ_BUF(8); 2102 READ_BUF(8);
2103 READ32(hdr->status); 2103 READ32(hdr->status);
@@ -2112,7 +2112,7 @@ static int decode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr)
2112 2112
2113static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected) 2113static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected)
2114{ 2114{
2115 uint32_t *p; 2115 __be32 *p;
2116 uint32_t opnum; 2116 uint32_t opnum;
2117 int32_t nfserr; 2117 int32_t nfserr;
2118 2118
@@ -2134,7 +2134,7 @@ static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected)
2134/* Dummy routine */ 2134/* Dummy routine */
2135static int decode_ace(struct xdr_stream *xdr, void *ace, struct nfs_client *clp) 2135static int decode_ace(struct xdr_stream *xdr, void *ace, struct nfs_client *clp)
2136{ 2136{
2137 uint32_t *p; 2137 __be32 *p;
2138 unsigned int strlen; 2138 unsigned int strlen;
2139 char *str; 2139 char *str;
2140 2140
@@ -2144,7 +2144,8 @@ static int decode_ace(struct xdr_stream *xdr, void *ace, struct nfs_client *clp)
2144 2144
2145static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap) 2145static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap)
2146{ 2146{
2147 uint32_t bmlen, *p; 2147 uint32_t bmlen;
2148 __be32 *p;
2148 2149
2149 READ_BUF(4); 2150 READ_BUF(4);
2150 READ32(bmlen); 2151 READ32(bmlen);
@@ -2159,9 +2160,9 @@ static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap)
2159 return 0; 2160 return 0;
2160} 2161}
2161 2162
2162static inline int decode_attr_length(struct xdr_stream *xdr, uint32_t *attrlen, uint32_t **savep) 2163static inline int decode_attr_length(struct xdr_stream *xdr, uint32_t *attrlen, __be32 **savep)
2163{ 2164{
2164 uint32_t *p; 2165 __be32 *p;
2165 2166
2166 READ_BUF(4); 2167 READ_BUF(4);
2167 READ32(*attrlen); 2168 READ32(*attrlen);
@@ -2182,7 +2183,7 @@ static int decode_attr_supported(struct xdr_stream *xdr, uint32_t *bitmap, uint3
2182 2183
2183static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *type) 2184static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *type)
2184{ 2185{
2185 uint32_t *p; 2186 __be32 *p;
2186 2187
2187 *type = 0; 2188 *type = 0;
2188 if (unlikely(bitmap[0] & (FATTR4_WORD0_TYPE - 1U))) 2189 if (unlikely(bitmap[0] & (FATTR4_WORD0_TYPE - 1U)))
@@ -2202,7 +2203,7 @@ static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *
2202 2203
2203static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *change) 2204static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *change)
2204{ 2205{
2205 uint32_t *p; 2206 __be32 *p;
2206 2207
2207 *change = 0; 2208 *change = 0;
2208 if (unlikely(bitmap[0] & (FATTR4_WORD0_CHANGE - 1U))) 2209 if (unlikely(bitmap[0] & (FATTR4_WORD0_CHANGE - 1U)))
@@ -2219,7 +2220,7 @@ static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t
2219 2220
2220static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *size) 2221static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *size)
2221{ 2222{
2222 uint32_t *p; 2223 __be32 *p;
2223 2224
2224 *size = 0; 2225 *size = 0;
2225 if (unlikely(bitmap[0] & (FATTR4_WORD0_SIZE - 1U))) 2226 if (unlikely(bitmap[0] & (FATTR4_WORD0_SIZE - 1U)))
@@ -2235,7 +2236,7 @@ static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *
2235 2236
2236static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) 2237static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
2237{ 2238{
2238 uint32_t *p; 2239 __be32 *p;
2239 2240
2240 *res = 0; 2241 *res = 0;
2241 if (unlikely(bitmap[0] & (FATTR4_WORD0_LINK_SUPPORT - 1U))) 2242 if (unlikely(bitmap[0] & (FATTR4_WORD0_LINK_SUPPORT - 1U)))
@@ -2251,7 +2252,7 @@ static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, ui
2251 2252
2252static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) 2253static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
2253{ 2254{
2254 uint32_t *p; 2255 __be32 *p;
2255 2256
2256 *res = 0; 2257 *res = 0;
2257 if (unlikely(bitmap[0] & (FATTR4_WORD0_SYMLINK_SUPPORT - 1U))) 2258 if (unlikely(bitmap[0] & (FATTR4_WORD0_SYMLINK_SUPPORT - 1U)))
@@ -2267,7 +2268,7 @@ static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap,
2267 2268
2268static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fsid *fsid) 2269static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fsid *fsid)
2269{ 2270{
2270 uint32_t *p; 2271 __be32 *p;
2271 2272
2272 fsid->major = 0; 2273 fsid->major = 0;
2273 fsid->minor = 0; 2274 fsid->minor = 0;
@@ -2287,7 +2288,7 @@ static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs
2287 2288
2288static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) 2289static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
2289{ 2290{
2290 uint32_t *p; 2291 __be32 *p;
2291 2292
2292 *res = 60; 2293 *res = 60;
2293 if (unlikely(bitmap[0] & (FATTR4_WORD0_LEASE_TIME - 1U))) 2294 if (unlikely(bitmap[0] & (FATTR4_WORD0_LEASE_TIME - 1U)))
@@ -2303,7 +2304,7 @@ static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint
2303 2304
2304static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) 2305static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
2305{ 2306{
2306 uint32_t *p; 2307 __be32 *p;
2307 2308
2308 *res = ACL4_SUPPORT_ALLOW_ACL|ACL4_SUPPORT_DENY_ACL; 2309 *res = ACL4_SUPPORT_ALLOW_ACL|ACL4_SUPPORT_DENY_ACL;
2309 if (unlikely(bitmap[0] & (FATTR4_WORD0_ACLSUPPORT - 1U))) 2310 if (unlikely(bitmap[0] & (FATTR4_WORD0_ACLSUPPORT - 1U)))
@@ -2319,7 +2320,7 @@ static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint
2319 2320
2320static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid) 2321static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid)
2321{ 2322{
2322 uint32_t *p; 2323 __be32 *p;
2323 2324
2324 *fileid = 0; 2325 *fileid = 0;
2325 if (unlikely(bitmap[0] & (FATTR4_WORD0_FILEID - 1U))) 2326 if (unlikely(bitmap[0] & (FATTR4_WORD0_FILEID - 1U)))
@@ -2335,7 +2336,7 @@ static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t
2335 2336
2336static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid) 2337static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid)
2337{ 2338{
2338 uint32_t *p; 2339 __be32 *p;
2339 2340
2340 *fileid = 0; 2341 *fileid = 0;
2341 if (unlikely(bitmap[1] & (FATTR4_WORD1_MOUNTED_ON_FILEID - 1U))) 2342 if (unlikely(bitmap[1] & (FATTR4_WORD1_MOUNTED_ON_FILEID - 1U)))
@@ -2351,7 +2352,7 @@ static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitma
2351 2352
2352static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) 2353static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
2353{ 2354{
2354 uint32_t *p; 2355 __be32 *p;
2355 int status = 0; 2356 int status = 0;
2356 2357
2357 *res = 0; 2358 *res = 0;
@@ -2368,7 +2369,7 @@ static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uin
2368 2369
2369static int decode_attr_files_free(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) 2370static int decode_attr_files_free(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
2370{ 2371{
2371 uint32_t *p; 2372 __be32 *p;
2372 int status = 0; 2373 int status = 0;
2373 2374
2374 *res = 0; 2375 *res = 0;
@@ -2385,7 +2386,7 @@ static int decode_attr_files_free(struct xdr_stream *xdr, uint32_t *bitmap, uint
2385 2386
2386static int decode_attr_files_total(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) 2387static int decode_attr_files_total(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
2387{ 2388{
2388 uint32_t *p; 2389 __be32 *p;
2389 int status = 0; 2390 int status = 0;
2390 2391
2391 *res = 0; 2392 *res = 0;
@@ -2403,7 +2404,7 @@ static int decode_attr_files_total(struct xdr_stream *xdr, uint32_t *bitmap, uin
2403static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path) 2404static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path)
2404{ 2405{
2405 int n; 2406 int n;
2406 uint32_t *p; 2407 __be32 *p;
2407 int status = 0; 2408 int status = 0;
2408 2409
2409 READ_BUF(4); 2410 READ_BUF(4);
@@ -2448,7 +2449,7 @@ out_eio:
2448static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs4_fs_locations *res) 2449static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs4_fs_locations *res)
2449{ 2450{
2450 int n; 2451 int n;
2451 uint32_t *p; 2452 __be32 *p;
2452 int status = -EIO; 2453 int status = -EIO;
2453 2454
2454 if (unlikely(bitmap[0] & (FATTR4_WORD0_FS_LOCATIONS -1U))) 2455 if (unlikely(bitmap[0] & (FATTR4_WORD0_FS_LOCATIONS -1U)))
@@ -2512,7 +2513,7 @@ out_eio:
2512 2513
2513static int decode_attr_maxfilesize(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) 2514static int decode_attr_maxfilesize(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
2514{ 2515{
2515 uint32_t *p; 2516 __be32 *p;
2516 int status = 0; 2517 int status = 0;
2517 2518
2518 *res = 0; 2519 *res = 0;
@@ -2529,7 +2530,7 @@ static int decode_attr_maxfilesize(struct xdr_stream *xdr, uint32_t *bitmap, uin
2529 2530
2530static int decode_attr_maxlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *maxlink) 2531static int decode_attr_maxlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *maxlink)
2531{ 2532{
2532 uint32_t *p; 2533 __be32 *p;
2533 int status = 0; 2534 int status = 0;
2534 2535
2535 *maxlink = 1; 2536 *maxlink = 1;
@@ -2546,7 +2547,7 @@ static int decode_attr_maxlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_
2546 2547
2547static int decode_attr_maxname(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *maxname) 2548static int decode_attr_maxname(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *maxname)
2548{ 2549{
2549 uint32_t *p; 2550 __be32 *p;
 	int status = 0;

 	*maxname = 1024;
@@ -2563,7 +2564,7 @@ static int decode_attr_maxname(struct xdr_stream *xdr, uint32_t *bitmap, uint32_

 static int decode_attr_maxread(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
 {
-	uint32_t *p;
+	__be32 *p;
 	int status = 0;

 	*res = 1024;
@@ -2584,7 +2585,7 @@ static int decode_attr_maxread(struct xdr_stream *xdr, uint32_t *bitmap, uint32_

 static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
 {
-	uint32_t *p;
+	__be32 *p;
 	int status = 0;

 	*res = 1024;
@@ -2605,7 +2606,7 @@ static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32

 static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *mode)
 {
-	uint32_t *p;
+	__be32 *p;

 	*mode = 0;
 	if (unlikely(bitmap[1] & (FATTR4_WORD1_MODE - 1U)))
@@ -2622,7 +2623,7 @@ static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *

 static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *nlink)
 {
-	uint32_t *p;
+	__be32 *p;

 	*nlink = 1;
 	if (unlikely(bitmap[1] & (FATTR4_WORD1_NUMLINKS - 1U)))
@@ -2638,7 +2639,8 @@ static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t

 static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_client *clp, int32_t *uid)
 {
-	uint32_t len, *p;
+	uint32_t len;
+	__be32 *p;

 	*uid = -2;
 	if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER - 1U)))
@@ -2662,7 +2664,8 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nf

 static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_client *clp, int32_t *gid)
 {
-	uint32_t len, *p;
+	uint32_t len;
+	__be32 *p;

 	*gid = -2;
 	if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER_GROUP - 1U)))
@@ -2686,7 +2689,8 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nf

 static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rdev)
 {
-	uint32_t major = 0, minor = 0, *p;
+	uint32_t major = 0, minor = 0;
+	__be32 *p;

 	*rdev = MKDEV(0,0);
 	if (unlikely(bitmap[1] & (FATTR4_WORD1_RAWDEV - 1U)))
@@ -2708,7 +2712,7 @@ static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rde

 static int decode_attr_space_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
 {
-	uint32_t *p;
+	__be32 *p;
 	int status = 0;

 	*res = 0;
@@ -2725,7 +2729,7 @@ static int decode_attr_space_avail(struct xdr_stream *xdr, uint32_t *bitmap, uin

 static int decode_attr_space_free(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
 {
-	uint32_t *p;
+	__be32 *p;
 	int status = 0;

 	*res = 0;
@@ -2742,7 +2746,7 @@ static int decode_attr_space_free(struct xdr_stream *xdr, uint32_t *bitmap, uint

 static int decode_attr_space_total(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
 {
-	uint32_t *p;
+	__be32 *p;
 	int status = 0;

 	*res = 0;
@@ -2759,7 +2763,7 @@ static int decode_attr_space_total(struct xdr_stream *xdr, uint32_t *bitmap, uin

 static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *used)
 {
-	uint32_t *p;
+	__be32 *p;

 	*used = 0;
 	if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_USED - 1U)))
@@ -2776,7 +2780,7 @@ static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint

 static int decode_attr_time(struct xdr_stream *xdr, struct timespec *time)
 {
-	uint32_t *p;
+	__be32 *p;
 	uint64_t sec;
 	uint32_t nsec;

@@ -2836,7 +2840,7 @@ static int decode_attr_time_modify(struct xdr_stream *xdr, uint32_t *bitmap, str
 	return status;
 }

-static int verify_attr_len(struct xdr_stream *xdr, uint32_t *savep, uint32_t attrlen)
+static int verify_attr_len(struct xdr_stream *xdr, __be32 *savep, uint32_t attrlen)
 {
 	unsigned int attrwords = XDR_QUADLEN(attrlen);
 	unsigned int nwords = xdr->p - savep;
@@ -2854,7 +2858,7 @@ static int verify_attr_len(struct xdr_stream *xdr, uint32_t *savep, uint32_t att

 static int decode_change_info(struct xdr_stream *xdr, struct nfs4_change_info *cinfo)
 {
-	uint32_t *p;
+	__be32 *p;

 	READ_BUF(20);
 	READ32(cinfo->atomic);
@@ -2865,7 +2869,7 @@ static int decode_change_info(struct xdr_stream *xdr, struct nfs4_change_info *c

 static int decode_access(struct xdr_stream *xdr, struct nfs4_accessres *access)
 {
-	uint32_t *p;
+	__be32 *p;
 	uint32_t supp, acc;
 	int status;

@@ -2882,7 +2886,7 @@ static int decode_access(struct xdr_stream *xdr, struct nfs4_accessres *access)

 static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res)
 {
-	uint32_t *p;
+	__be32 *p;
 	int status;

 	status = decode_op_hdr(xdr, OP_CLOSE);
@@ -2895,7 +2899,7 @@ static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res)

 static int decode_commit(struct xdr_stream *xdr, struct nfs_writeres *res)
 {
-	uint32_t *p;
+	__be32 *p;
 	int status;

 	status = decode_op_hdr(xdr, OP_COMMIT);
@@ -2908,7 +2912,7 @@ static int decode_commit(struct xdr_stream *xdr, struct nfs_writeres *res)

 static int decode_create(struct xdr_stream *xdr, struct nfs4_change_info *cinfo)
 {
-	uint32_t *p;
+	__be32 *p;
 	uint32_t bmlen;
 	int status;

@@ -2925,7 +2929,7 @@ static int decode_create(struct xdr_stream *xdr, struct nfs4_change_info *cinfo)

 static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_res *res)
 {
-	uint32_t *savep;
+	__be32 *savep;
 	uint32_t attrlen,
 		 bitmap[2] = {0};
 	int status;
@@ -2952,7 +2956,7 @@ xdr_error:

 static int decode_statfs(struct xdr_stream *xdr, struct nfs_fsstat *fsstat)
 {
-	uint32_t *savep;
+	__be32 *savep;
 	uint32_t attrlen,
 		 bitmap[2] = {0};
 	int status;
@@ -2985,7 +2989,7 @@ xdr_error:

 static int decode_pathconf(struct xdr_stream *xdr, struct nfs_pathconf *pathconf)
 {
-	uint32_t *savep;
+	__be32 *savep;
 	uint32_t attrlen,
 		 bitmap[2] = {0};
 	int status;
@@ -3010,7 +3014,7 @@ xdr_error:

 static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, const struct nfs_server *server)
 {
-	uint32_t *savep;
+	__be32 *savep;
 	uint32_t attrlen,
 		 bitmap[2] = {0},
 		 type;
@@ -3079,7 +3083,7 @@ xdr_error:

 static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo)
 {
-	uint32_t *savep;
+	__be32 *savep;
 	uint32_t attrlen, bitmap[2];
 	int status;

@@ -3111,7 +3115,7 @@ xdr_error:

 static int decode_getfh(struct xdr_stream *xdr, struct nfs_fh *fh)
 {
-	uint32_t *p;
+	__be32 *p;
 	uint32_t len;
 	int status;

@@ -3147,7 +3151,7 @@ static int decode_link(struct xdr_stream *xdr, struct nfs4_change_info *cinfo)
 static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl)
 {
 	uint64_t offset, length, clientid;
-	uint32_t *p;
+	__be32 *p;
 	uint32_t namelen, type;

 	READ_BUF(32);
@@ -3172,7 +3176,7 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl)

 static int decode_lock(struct xdr_stream *xdr, struct nfs_lock_res *res)
 {
-	uint32_t *p;
+	__be32 *p;
 	int status;

 	status = decode_op_hdr(xdr, OP_LOCK);
@@ -3195,7 +3199,7 @@ static int decode_lockt(struct xdr_stream *xdr, struct nfs_lockt_res *res)

 static int decode_locku(struct xdr_stream *xdr, struct nfs_locku_res *res)
 {
-	uint32_t *p;
+	__be32 *p;
 	int status;

 	status = decode_op_hdr(xdr, OP_LOCKU);
@@ -3214,7 +3218,7 @@ static int decode_lookup(struct xdr_stream *xdr)
 /* This is too sick! */
 static int decode_space_limit(struct xdr_stream *xdr, u64 *maxsize)
 {
-	uint32_t *p;
+	__be32 *p;
 	uint32_t limit_type, nblocks, blocksize;

 	READ_BUF(12);
@@ -3233,7 +3237,7 @@ static int decode_space_limit(struct xdr_stream *xdr, u64 *maxsize)

 static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
 {
-	uint32_t *p;
+	__be32 *p;
 	uint32_t delegation_type;

 	READ_BUF(4);
@@ -3259,7 +3263,7 @@ static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res)

 static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res)
 {
-	uint32_t *p;
+	__be32 *p;
 	uint32_t bmlen;
 	int status;

@@ -3287,7 +3291,7 @@ xdr_error:

 static int decode_open_confirm(struct xdr_stream *xdr, struct nfs_open_confirmres *res)
 {
-	uint32_t *p;
+	__be32 *p;
 	int status;

 	status = decode_op_hdr(xdr, OP_OPEN_CONFIRM);
@@ -3300,7 +3304,7 @@ static int decode_open_confirm(struct xdr_stream *xdr, struct nfs_open_confirmre

 static int decode_open_downgrade(struct xdr_stream *xdr, struct nfs_closeres *res)
 {
-	uint32_t *p;
+	__be32 *p;
 	int status;

 	status = decode_op_hdr(xdr, OP_OPEN_DOWNGRADE);
@@ -3324,7 +3328,7 @@ static int decode_putrootfh(struct xdr_stream *xdr)
 static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs_readres *res)
 {
 	struct kvec *iov = req->rq_rcv_buf.head;
-	uint32_t *p;
+	__be32 *p;
 	uint32_t count, eof, recvd, hdrlen;
 	int status;

@@ -3354,7 +3358,7 @@ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct n
 	struct page *page = *rcvbuf->pages;
 	struct kvec *iov = rcvbuf->head;
 	unsigned int nr, pglen = rcvbuf->page_len;
-	uint32_t *end, *entry, *p, *kaddr;
+	__be32 *end, *entry, *p, *kaddr;
 	uint32_t len, attrlen, xlen;
 	int hdrlen, recvd, status;

@@ -3376,7 +3380,7 @@ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct n
 	xdr_read_pages(xdr, pglen);

 	BUG_ON(pglen + readdir->pgbase > PAGE_CACHE_SIZE);
-	kaddr = p = (uint32_t *) kmap_atomic(page, KM_USER0);
+	kaddr = p = kmap_atomic(page, KM_USER0);
 	end = p + ((pglen + readdir->pgbase) >> 2);
 	entry = p;
 	for (nr = 0; *p++; nr++) {
@@ -3428,7 +3432,7 @@ static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req)
 	struct xdr_buf *rcvbuf = &req->rq_rcv_buf;
 	struct kvec *iov = rcvbuf->head;
 	int hdrlen, len, recvd;
-	uint32_t *p;
+	__be32 *p;
 	char *kaddr;
 	int status;

@@ -3505,7 +3509,7 @@ decode_restorefh(struct xdr_stream *xdr)
 static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req,
 		size_t *acl_len)
 {
-	uint32_t *savep;
+	__be32 *savep;
 	uint32_t attrlen,
 		 bitmap[2] = {0};
 	struct kvec *iov = req->rq_rcv_buf.head;
@@ -3551,7 +3555,7 @@ decode_savefh(struct xdr_stream *xdr)

 static int decode_setattr(struct xdr_stream *xdr, struct nfs_setattrres *res)
 {
-	uint32_t *p;
+	__be32 *p;
 	uint32_t bmlen;
 	int status;

@@ -3567,7 +3571,7 @@ static int decode_setattr(struct xdr_stream *xdr, struct nfs_setattrres *res)

 static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp)
 {
-	uint32_t *p;
+	__be32 *p;
 	uint32_t opnum;
 	int32_t nfserr;

@@ -3610,7 +3614,7 @@ static int decode_setclientid_confirm(struct xdr_stream *xdr)

 static int decode_write(struct xdr_stream *xdr, struct nfs_writeres *res)
 {
-	uint32_t *p;
+	__be32 *p;
 	int status;

 	status = decode_op_hdr(xdr, OP_WRITE);
@@ -3632,7 +3636,7 @@ static int decode_delegreturn(struct xdr_stream *xdr)
 /*
  * Decode OPEN_DOWNGRADE response
  */
-static int nfs4_xdr_dec_open_downgrade(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_closeres *res)
+static int nfs4_xdr_dec_open_downgrade(struct rpc_rqst *rqstp, __be32 *p, struct nfs_closeres *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -3660,7 +3664,7 @@ out:
 /*
  * Decode ACCESS response
  */
-static int nfs4_xdr_dec_access(struct rpc_rqst *rqstp, uint32_t *p, struct nfs4_accessres *res)
+static int nfs4_xdr_dec_access(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_accessres *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -3678,7 +3682,7 @@ out:
 /*
  * Decode LOOKUP response
  */
-static int nfs4_xdr_dec_lookup(struct rpc_rqst *rqstp, uint32_t *p, struct nfs4_lookup_res *res)
+static int nfs4_xdr_dec_lookup(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_lookup_res *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -3701,7 +3705,7 @@ out:
 /*
  * Decode LOOKUP_ROOT response
  */
-static int nfs4_xdr_dec_lookup_root(struct rpc_rqst *rqstp, uint32_t *p, struct nfs4_lookup_res *res)
+static int nfs4_xdr_dec_lookup_root(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_lookup_res *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -3721,7 +3725,7 @@ out:
 /*
  * Decode REMOVE response
  */
-static int nfs4_xdr_dec_remove(struct rpc_rqst *rqstp, uint32_t *p, struct nfs4_remove_res *res)
+static int nfs4_xdr_dec_remove(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_remove_res *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -3742,7 +3746,7 @@ out:
 /*
  * Decode RENAME response
  */
-static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, uint32_t *p, struct nfs4_rename_res *res)
+static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_rename_res *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -3772,7 +3776,7 @@ out:
 /*
  * Decode LINK response
  */
-static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, uint32_t *p, struct nfs4_link_res *res)
+static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_link_res *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -3805,7 +3809,7 @@ out:
 /*
  * Decode CREATE response
  */
-static int nfs4_xdr_dec_create(struct rpc_rqst *rqstp, uint32_t *p, struct nfs4_create_res *res)
+static int nfs4_xdr_dec_create(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_create_res *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -3834,7 +3838,7 @@ out:
 /*
  * Decode SYMLINK response
  */
-static int nfs4_xdr_dec_symlink(struct rpc_rqst *rqstp, uint32_t *p, struct nfs4_create_res *res)
+static int nfs4_xdr_dec_symlink(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_create_res *res)
 {
 	return nfs4_xdr_dec_create(rqstp, p, res);
 }
@@ -3842,7 +3846,7 @@ static int nfs4_xdr_dec_symlink(struct rpc_rqst *rqstp, uint32_t *p, struct nfs4
 /*
  * Decode GETATTR response
  */
-static int nfs4_xdr_dec_getattr(struct rpc_rqst *rqstp, uint32_t *p, struct nfs4_getattr_res *res)
+static int nfs4_xdr_dec_getattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_getattr_res *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -3865,7 +3869,7 @@ out:
  * Encode an SETACL request
  */
 static int
-nfs4_xdr_enc_setacl(struct rpc_rqst *req, uint32_t *p, struct nfs_setaclargs *args)
+nfs4_xdr_enc_setacl(struct rpc_rqst *req, __be32 *p, struct nfs_setaclargs *args)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
@@ -3886,7 +3890,7 @@ out:
  * Decode SETACL response
  */
 static int
-nfs4_xdr_dec_setacl(struct rpc_rqst *rqstp, uint32_t *p, void *res)
+nfs4_xdr_dec_setacl(struct rpc_rqst *rqstp, __be32 *p, void *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -3908,7 +3912,7 @@ out:
  * Decode GETACL response
  */
 static int
-nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, uint32_t *p, size_t *acl_len)
+nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, __be32 *p, size_t *acl_len)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -3930,7 +3934,7 @@ out:
 /*
  * Decode CLOSE response
  */
-static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_closeres *res)
+static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, __be32 *p, struct nfs_closeres *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -3960,7 +3964,7 @@ out:
 /*
  * Decode OPEN response
  */
-static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_openres *res)
+static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, __be32 *p, struct nfs_openres *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -3994,7 +3998,7 @@ out:
 /*
  * Decode OPEN_CONFIRM response
  */
-static int nfs4_xdr_dec_open_confirm(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_open_confirmres *res)
+static int nfs4_xdr_dec_open_confirm(struct rpc_rqst *rqstp, __be32 *p, struct nfs_open_confirmres *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -4015,7 +4019,7 @@ out:
 /*
  * Decode OPEN response
  */
-static int nfs4_xdr_dec_open_noattr(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_openres *res)
+static int nfs4_xdr_dec_open_noattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs_openres *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -4039,7 +4043,7 @@ out:
 /*
  * Decode SETATTR response
  */
-static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_setattrres *res)
+static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs_setattrres *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -4065,7 +4069,7 @@ out:
 /*
  * Decode LOCK response
  */
-static int nfs4_xdr_dec_lock(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_lock_res *res)
+static int nfs4_xdr_dec_lock(struct rpc_rqst *rqstp, __be32 *p, struct nfs_lock_res *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -4086,7 +4090,7 @@ out:
 /*
  * Decode LOCKT response
  */
-static int nfs4_xdr_dec_lockt(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_lockt_res *res)
+static int nfs4_xdr_dec_lockt(struct rpc_rqst *rqstp, __be32 *p, struct nfs_lockt_res *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -4107,7 +4111,7 @@ out:
 /*
  * Decode LOCKU response
  */
-static int nfs4_xdr_dec_locku(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_locku_res *res)
+static int nfs4_xdr_dec_locku(struct rpc_rqst *rqstp, __be32 *p, struct nfs_locku_res *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -4128,7 +4132,7 @@ out:
 /*
  * Decode READLINK response
  */
-static int nfs4_xdr_dec_readlink(struct rpc_rqst *rqstp, uint32_t *p, void *res)
+static int nfs4_xdr_dec_readlink(struct rpc_rqst *rqstp, __be32 *p, void *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -4149,7 +4153,7 @@ out:
 /*
  * Decode READDIR response
  */
-static int nfs4_xdr_dec_readdir(struct rpc_rqst *rqstp, uint32_t *p, struct nfs4_readdir_res *res)
+static int nfs4_xdr_dec_readdir(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_readdir_res *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -4170,7 +4174,7 @@ out:
 /*
  * Decode Read response
  */
-static int nfs4_xdr_dec_read(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_readres *res)
+static int nfs4_xdr_dec_read(struct rpc_rqst *rqstp, __be32 *p, struct nfs_readres *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -4193,7 +4197,7 @@ out:
 /*
  * Decode WRITE response
  */
-static int nfs4_xdr_dec_write(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_writeres *res)
+static int nfs4_xdr_dec_write(struct rpc_rqst *rqstp, __be32 *p, struct nfs_writeres *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -4219,7 +4223,7 @@ out:
 /*
  * Decode COMMIT response
  */
-static int nfs4_xdr_dec_commit(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_writeres *res)
+static int nfs4_xdr_dec_commit(struct rpc_rqst *rqstp, __be32 *p, struct nfs_writeres *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -4243,7 +4247,7 @@ out:
 /*
  * FSINFO request
  */
-static int nfs4_xdr_dec_fsinfo(struct rpc_rqst *req, uint32_t *p, struct nfs_fsinfo *fsinfo)
+static int nfs4_xdr_dec_fsinfo(struct rpc_rqst *req, __be32 *p, struct nfs_fsinfo *fsinfo)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -4263,7 +4267,7 @@ static int nfs4_xdr_dec_fsinfo(struct rpc_rqst *req, uint32_t *p, struct nfs_fsi
 /*
  * PATHCONF request
 */
-static int nfs4_xdr_dec_pathconf(struct rpc_rqst *req, uint32_t *p, struct nfs_pathconf *pathconf)
+static int nfs4_xdr_dec_pathconf(struct rpc_rqst *req, __be32 *p, struct nfs_pathconf *pathconf)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -4281,7 +4285,7 @@ static int nfs4_xdr_dec_pathconf(struct rpc_rqst *req, uint32_t *p, struct nfs_p
 /*
  * STATFS request
 */
-static int nfs4_xdr_dec_statfs(struct rpc_rqst *req, uint32_t *p, struct nfs_fsstat *fsstat)
+static int nfs4_xdr_dec_statfs(struct rpc_rqst *req, __be32 *p, struct nfs_fsstat *fsstat)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -4299,7 +4303,7 @@ static int nfs4_xdr_dec_statfs(struct rpc_rqst *req, uint32_t *p, struct nfs_fss
 /*
  * GETATTR_BITMAP request
 */
-static int nfs4_xdr_dec_server_caps(struct rpc_rqst *req, uint32_t *p, struct nfs4_server_caps_res *res)
+static int nfs4_xdr_dec_server_caps(struct rpc_rqst *req, __be32 *p, struct nfs4_server_caps_res *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -4318,7 +4322,7 @@ out:
 /*
  * Decode RENEW response
 */
-static int nfs4_xdr_dec_renew(struct rpc_rqst *rqstp, uint32_t *p, void *dummy)
+static int nfs4_xdr_dec_renew(struct rpc_rqst *rqstp, __be32 *p, void *dummy)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -4334,7 +4338,7 @@ static int nfs4_xdr_dec_renew(struct rpc_rqst *rqstp, uint32_t *p, void *dummy)
 /*
  * a SETCLIENTID request
 */
-static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req, uint32_t *p,
+static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req, __be32 *p,
 		struct nfs_client *clp)
 {
 	struct xdr_stream xdr;
@@ -4353,7 +4357,7 @@ static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req, uint32_t *p,
 /*
  * a SETCLIENTID_CONFIRM request
 */
-static int nfs4_xdr_dec_setclientid_confirm(struct rpc_rqst *req, uint32_t *p, struct nfs_fsinfo *fsinfo)
+static int nfs4_xdr_dec_setclientid_confirm(struct rpc_rqst *req, __be32 *p, struct nfs_fsinfo *fsinfo)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -4375,7 +4379,7 @@ static int nfs4_xdr_dec_setclientid_confirm(struct rpc_rqst *req, uint32_t *p, s
 /*
  * DELEGRETURN request
 */
-static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp, uint32_t *p, struct nfs4_delegreturnres *res)
+static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_delegreturnres *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -4397,7 +4401,7 @@ out:
 /*
  * FS_LOCATIONS request
 */
-static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, uint32_t *p, struct nfs4_fs_locations *res)
+static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, __be32 *p, struct nfs4_fs_locations *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -4417,7 +4421,7 @@ out:
 	return status;
 }

-uint32_t *nfs4_decode_dirent(uint32_t *p, struct nfs_entry *entry, int plus)
+__be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus)
 {
 	uint32_t bitmap[2] = {0};
 	uint32_t len;
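The nfs4xdr.c hunks above are one mechanical change repeated: every pointer that walks the raw XDR reply buffer goes from uint32_t * to __be32 *. XDR words are big-endian on the wire, and __be32 is a sparse "bitwise" type, so once the pointers are annotated, a forgotten be32_to_cpu()/cpu_to_be32() conversion shows up as a warning under `make C=2` instead of as a silent bug on little-endian hosts. A stand-alone sketch of the decode pattern (user space, with ntohl standing in for be32_to_cpu; the buffer contents are invented, not from any real reply):

	#include <arpa/inet.h>	/* ntohl/htonl stand in for be32_to_cpu/cpu_to_be32 */
	#include <stdint.h>
	#include <stdio.h>

	typedef uint32_t be32;	/* in the kernel this is __be32, a sparse bitwise type */

	/* Decode one big-endian XDR word and advance the cursor -- the
	 * same shape as the READ_BUF()/READ32() macros in nfs4xdr.c. */
	static uint32_t read32(const be32 **pp)
	{
		uint32_t v = ntohl(**pp);	/* wire (big-endian) -> host order */
		(*pp)++;
		return v;
	}

	int main(void)
	{
		/* Two XDR words as they would appear on the wire. */
		be32 buf[2] = { htonl(0755), htonl(3) };
		const be32 *p = buf;

		uint32_t mode  = read32(&p);
		uint32_t nlink = read32(&p);
		printf("mode=%o nlink=%u\n", mode, nlink);	/* mode=755 nlink=3 */
		return 0;
	}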
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index 1d656a645199..8dfefe41a8da 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -69,7 +69,6 @@
  *	Fabian Frederick:	Option parser rebuilt (using parser lib)
  */

-#include <linux/config.h>
 #include <linux/types.h>
 #include <linux/string.h>
 #include <linux/kernel.h>
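Dropping #include <linux/config.h> here (and in super.c below) is part of a tree-wide cleanup: kbuild feeds the CONFIG_* macros to every compilation unit itself, roughly via `gcc -include include/linux/autoconf.h ...`, so the explicit include had become redundant. A minimal user-space illustration of the same command-line mechanism (the file name and flag are an assumption for the demo, not the kernel build line):

	/* config_demo.c -- build with:  gcc -DCONFIG_ROOT_NFS config_demo.c
	 * The macro arrives on the command line, much as kbuild supplies
	 * CONFIG_* symbols, so no header include is needed to see it. */
	#include <stdio.h>

	int main(void)
	{
	#ifdef CONFIG_ROOT_NFS
		puts("CONFIG_ROOT_NFS is set");
	#else
		puts("CONFIG_ROOT_NFS is not set");
	#endif
		return 0;
	}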
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index e8d40030cab4..28108c82b887 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -20,7 +20,6 @@
  * of another (see nfs_lookup())
  */

-#include <linux/config.h>
 #include <linux/module.h>
 #include <linux/init.h>

@@ -835,7 +834,7 @@ static int nfs4_get_sb(struct file_system_type *fs_type,
 	}
 	/* RFC3530: The default port for NFS is 2049 */
 	if (addr.sin_port == 0)
-		addr.sin_port = NFS_PORT;
+		addr.sin_port = htons(NFS_PORT);

 	/* Grab the authentication type */
 	authflavour = RPC_AUTH_UNIX;
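The one-liner above fixes a genuine byte-order bug: sin_port in struct sockaddr_in holds the port in network byte order, so assigning the host-order constant NFS_PORT (2049, 0x0801) directly would, on a little-endian machine, aim the mount at port 264 (0x0108) instead. A small demonstration:

	#include <arpa/inet.h>
	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		struct sockaddr_in addr;
		memset(&addr, 0, sizeof(addr));
		addr.sin_family = AF_INET;

		/* Correct: sin_port is defined to hold network byte order. */
		addr.sin_port = htons(2049);

		/* On a little-endian host, assigning 2049 directly would put
		 * 0x0801 in memory as bytes 01 08, which the network stack
		 * reads back big-endian as port 0x0108 == 264. */
		printf("wire port bytes: %02x %02x\n",
		       ((unsigned char *)&addr.sin_port)[0],
		       ((unsigned char *)&addr.sin_port)[1]);	/* 08 01 */
		return 0;
	}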
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index f6675d2c386c..883dd4a1c157 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -57,6 +57,8 @@
 #include <linux/nfs_fs.h>
 #include <linux/nfs_mount.h>
 #include <linux/nfs_page.h>
+#include <linux/backing-dev.h>
+
 #include <asm/uaccess.h>
 #include <linux/smp_lock.h>

@@ -395,7 +397,7 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
 out:
 	clear_bit(BDI_write_congested, &bdi->state);
 	wake_up_all(&nfs_write_congestion);
-	writeback_congestion_end();
+	congestion_end(WRITE);
 	return err;
 }

@@ -588,10 +590,10 @@ static void nfs_cancel_commit_list(struct list_head *head)

 	while(!list_empty(head)) {
 		req = nfs_list_entry(head->next);
+		dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
 		nfs_list_remove_request(req);
 		nfs_inode_remove_request(req);
-		dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
-		nfs_clear_page_writeback(req);
+		nfs_unlock_request(req);
 	}
 }

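Two things change in nfs_cancel_commit_list(): the superseded writeback_congestion_end() call becomes congestion_end(WRITE) (hence the new backing-dev.h include), and the NR_UNSTABLE_NFS decrement moves ahead of nfs_inode_remove_request(). The likely point of the reorder is that tearing the request down may drop the last reference to req->wb_page, so the page statistic has to be adjusted while the pointer is still known-good. A user-space sketch of that ordering rule (all types are simplified stand-ins, not the kernel API):

	#include <stdio.h>
	#include <stdlib.h>

	struct page { int stat_unstable; };
	struct nfs_req { struct page *wb_page; };

	static void dec_unstable(struct page *pg) { pg->stat_unstable--; }

	static void remove_request(struct nfs_req *req)
	{
		/* Tearing the request down may drop the last reference to
		 * req->wb_page; afterwards the pointer must not be used. */
		req->wb_page = NULL;
		free(req);
	}

	int main(void)
	{
		struct page pg = { .stat_unstable = 1 };
		struct nfs_req *req = malloc(sizeof(*req));
		req->wb_page = &pg;

		/* Correct order, as in the patched nfs_cancel_commit_list():
		 * touch wb_page first, then tear down the request. */
		dec_unstable(req->wb_page);
		remove_request(req);

		printf("unstable pages: %d\n", pg.stat_unstable);	/* 0 */
		return 0;
	}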
diff --git a/fs/nfs_common/nfsacl.c b/fs/nfs_common/nfsacl.c
index 0c2be8c0307d..c11f5375d7c1 100644
--- a/fs/nfs_common/nfsacl.c
+++ b/fs/nfs_common/nfsacl.c
@@ -46,7 +46,7 @@ xdr_nfsace_encode(struct xdr_array2_desc *desc, void *elem)
 {
 	struct nfsacl_encode_desc *nfsacl_desc =
 		(struct nfsacl_encode_desc *) desc;
-	u32 *p = (u32 *) elem;
+	__be32 *p = elem;

 	struct posix_acl_entry *entry =
 		&nfsacl_desc->acl->a_entries[nfsacl_desc->count++];
@@ -127,7 +127,7 @@ xdr_nfsace_decode(struct xdr_array2_desc *desc, void *elem)
 {
 	struct nfsacl_decode_desc *nfsacl_desc =
 		(struct nfsacl_decode_desc *) desc;
-	u32 *p = (u32 *) elem;
+	__be32 *p = elem;
 	struct posix_acl_entry *entry;

 	if (!nfsacl_desc->acl) {
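A side detail of the nfsacl.c hunks: the cast disappears along with the type change because elem is a void *, and in C a void pointer converts implicitly to any object-pointer type, so `__be32 *p = elem;` is both shorter and lets the compiler complain if elem ever stops being a pointer. A tiny illustration (be32 here is a plain typedef standing in for the kernel's __be32):

	#include <stdint.h>

	typedef uint32_t be32;	/* stand-in for the kernel's __be32 */

	/* In C (unlike C++), a void pointer converts implicitly to any
	 * object pointer type, so a cast on elem would be pure noise. */
	static be32 first_word(void *elem)
	{
		be32 *p = elem;		/* no cast needed */
		return p[0];
	}

	int main(void)
	{
		be32 buf[1] = { 42 };
		return first_word(buf) == 42 ? 0 : 1;
	}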
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index cfe141e5d759..f37df46d2eaa 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -319,12 +319,25 @@ svc_expkey_update(struct svc_expkey *new, struct svc_expkey *old)

 static struct cache_head *export_table[EXPORT_HASHMAX];

+static void nfsd4_fslocs_free(struct nfsd4_fs_locations *fsloc)
+{
+	int i;
+
+	for (i = 0; i < fsloc->locations_count; i++) {
+		kfree(fsloc->locations[i].path);
+		kfree(fsloc->locations[i].hosts);
+	}
+	kfree(fsloc->locations);
+}
+
 static void svc_export_put(struct kref *ref)
 {
 	struct svc_export *exp = container_of(ref, struct svc_export, h.ref);
 	dput(exp->ex_dentry);
 	mntput(exp->ex_mnt);
 	auth_domain_put(exp->ex_client);
+	kfree(exp->ex_path);
+	nfsd4_fslocs_free(&exp->ex_fslocs);
 	kfree(exp);
 }

@@ -386,6 +399,69 @@ static int check_export(struct inode *inode, int flags)

 }

+#ifdef CONFIG_NFSD_V4
+
+static int
+fsloc_parse(char **mesg, char *buf, struct nfsd4_fs_locations *fsloc)
+{
+	int len;
+	int migrated, i, err;
+
+	len = qword_get(mesg, buf, PAGE_SIZE);
+	if (len != 5 || memcmp(buf, "fsloc", 5))
+		return 0;
+
+	/* listsize */
+	err = get_int(mesg, &fsloc->locations_count);
+	if (err)
+		return err;
+	if (fsloc->locations_count > MAX_FS_LOCATIONS)
+		return -EINVAL;
+	if (fsloc->locations_count == 0)
+		return 0;
+
+	fsloc->locations = kzalloc(fsloc->locations_count
+			* sizeof(struct nfsd4_fs_location), GFP_KERNEL);
+	if (!fsloc->locations)
+		return -ENOMEM;
+	for (i=0; i < fsloc->locations_count; i++) {
+		/* colon separated host list */
+		err = -EINVAL;
+		len = qword_get(mesg, buf, PAGE_SIZE);
+		if (len <= 0)
+			goto out_free_all;
+		err = -ENOMEM;
+		fsloc->locations[i].hosts = kstrdup(buf, GFP_KERNEL);
+		if (!fsloc->locations[i].hosts)
+			goto out_free_all;
+		err = -EINVAL;
+		/* slash separated path component list */
+		len = qword_get(mesg, buf, PAGE_SIZE);
+		if (len <= 0)
+			goto out_free_all;
+		err = -ENOMEM;
+		fsloc->locations[i].path = kstrdup(buf, GFP_KERNEL);
+		if (!fsloc->locations[i].path)
+			goto out_free_all;
+	}
+	/* migrated */
+	err = get_int(mesg, &migrated);
+	if (err)
+		goto out_free_all;
+	err = -EINVAL;
+	if (migrated < 0 || migrated > 1)
+		goto out_free_all;
+	fsloc->migrated = migrated;
+	return 0;
+out_free_all:
+	nfsd4_fslocs_free(fsloc);
+	return err;
+}
+
+#else /* CONFIG_NFSD_V4 */
+static inline int fsloc_parse(char **mesg, char *buf, struct nfsd4_fs_locations *fsloc) { return 0; }
+#endif
+
 static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
 {
 	/* client path expiry [flags anonuid anongid fsid] */
@@ -398,6 +474,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
 	int an_int;

 	nd.dentry = NULL;
+	exp.ex_path = NULL;

 	if (mesg[mlen-1] != '\n')
 		return -EINVAL;
@@ -428,6 +505,10 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
 	exp.ex_client = dom;
 	exp.ex_mnt = nd.mnt;
 	exp.ex_dentry = nd.dentry;
+	exp.ex_path = kstrdup(buf, GFP_KERNEL);
+	err = -ENOMEM;
+	if (!exp.ex_path)
+		goto out;

 	/* expiry */
 	err = -EINVAL;
@@ -435,6 +516,11 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
 	if (exp.h.expiry_time == 0)
 		goto out;

+	/* fs locations */
+	exp.ex_fslocs.locations = NULL;
+	exp.ex_fslocs.locations_count = 0;
+	exp.ex_fslocs.migrated = 0;
+
 	/* flags */
 	err = get_int(&mesg, &an_int);
 	if (err == -ENOENT)
@@ -460,6 +546,10 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)

 		err = check_export(nd.dentry->d_inode, exp.ex_flags);
 		if (err) goto out;
+
+		err = fsloc_parse(&mesg, buf, &exp.ex_fslocs);
+		if (err)
+			goto out;
 	}

 	expp = svc_export_lookup(&exp);
@@ -473,6 +563,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
 	else
 		exp_put(expp);
 out:
+	kfree(exp.ex_path);
 	if (nd.dentry)
 		path_release(&nd);
 out_no_path:
@@ -482,7 +573,8 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
 	return err;
 }

-static void exp_flags(struct seq_file *m, int flag, int fsid, uid_t anonu, uid_t anong);
+static void exp_flags(struct seq_file *m, int flag, int fsid,
+		uid_t anonu, uid_t anong, struct nfsd4_fs_locations *fslocs);

 static int svc_export_show(struct seq_file *m,
 			   struct cache_detail *cd,
@@ -501,8 +593,8 @@ static int svc_export_show(struct seq_file *m,
 	seq_putc(m, '(');
 	if (test_bit(CACHE_VALID, &h->flags) &&
 	    !test_bit(CACHE_NEGATIVE, &h->flags))
 		exp_flags(m, exp->ex_flags, exp->ex_fsid,
-			  exp->ex_anon_uid, exp->ex_anon_gid);
+			  exp->ex_anon_uid, exp->ex_anon_gid, &exp->ex_fslocs);
 	seq_puts(m, ")\n");
 	return 0;
 }
@@ -524,6 +616,10 @@ static void svc_export_init(struct cache_head *cnew, struct cache_head *citem)
 	new->ex_client = item->ex_client;
 	new->ex_dentry = dget(item->ex_dentry);
 	new->ex_mnt = mntget(item->ex_mnt);
+	new->ex_path = NULL;
+	new->ex_fslocs.locations = NULL;
+	new->ex_fslocs.locations_count = 0;
+	new->ex_fslocs.migrated = 0;
 }

 static void export_update(struct cache_head *cnew, struct cache_head *citem)
@@ -535,6 +631,14 @@ static void export_update(struct cache_head *cnew, struct cache_head *citem)
 	new->ex_anon_uid = item->ex_anon_uid;
 	new->ex_anon_gid = item->ex_anon_gid;
 	new->ex_fsid = item->ex_fsid;
+	new->ex_path = item->ex_path;
+	item->ex_path = NULL;
+	new->ex_fslocs.locations = item->ex_fslocs.locations;
+	item->ex_fslocs.locations = NULL;
+	new->ex_fslocs.locations_count = item->ex_fslocs.locations_count;
+	item->ex_fslocs.locations_count = 0;
+	new->ex_fslocs.migrated = item->ex_fslocs.migrated;
+	item->ex_fslocs.migrated = 0;
 }

 static struct cache_head *svc_export_alloc(void)
@@ -1044,34 +1148,25 @@ exp_find(struct auth_domain *clp, int fsid_type, u32 *fsidv,
  * for a given NFSv4 client. The root is defined to be the
  * export point with fsid==0
  */
-int
+__be32
 exp_pseudoroot(struct auth_domain *clp, struct svc_fh *fhp,
 	       struct cache_req *creq)
 {
-	struct svc_expkey *fsid_key;
 	struct svc_export *exp;
-	int rv;
+	__be32 rv;
 	u32 fsidv[2];

 	mk_fsid_v1(fsidv, 0);

-	fsid_key = exp_find_key(clp, 1, fsidv, creq);
-	if (IS_ERR(fsid_key) && PTR_ERR(fsid_key) == -EAGAIN)
+	exp = exp_find(clp, 1, fsidv, creq);
+	if (IS_ERR(exp) && PTR_ERR(exp) == -EAGAIN)
 		return nfserr_dropit;
-	if (!fsid_key || IS_ERR(fsid_key))
-		return nfserr_perm;
-
-	exp = exp_get_by_name(clp, fsid_key->ek_mnt, fsid_key->ek_dentry, creq);
 	if (exp == NULL)
-		rv = nfserr_perm;
+		return nfserr_perm;
 	else if (IS_ERR(exp))
-		rv = nfserrno(PTR_ERR(exp));
-	else {
-		rv = fh_compose(fhp, exp,
-				fsid_key->ek_dentry, NULL);
-		exp_put(exp);
-	}
-	cache_put(&fsid_key->h, &svc_expkey_cache);
+		return nfserrno(PTR_ERR(exp));
+	rv = fh_compose(fhp, exp, exp->ex_dentry, NULL);
+	exp_put(exp);
 	return rv;
 }

@@ -1158,7 +1253,8 @@ static struct flags {
 	{ 0, {"", ""}}
 };

-static void exp_flags(struct seq_file *m, int flag, int fsid, uid_t anonu, uid_t anong)
+static void exp_flags(struct seq_file *m, int flag, int fsid,
+		uid_t anonu, uid_t anong, struct nfsd4_fs_locations *fsloc)
 {
 	int first = 0;
 	struct flags *flg;
@@ -1174,6 +1270,21 @@ static void exp_flags(struct seq_file *m, int flag, int fsid, uid_t anonu, uid_t
 		seq_printf(m, "%sanonuid=%d", first++?",":"", anonu);
 	if (anong != (gid_t)-2 && anong != (0x10000-2))
 		seq_printf(m, "%sanongid=%d", first++?",":"", anong);
+	if (fsloc && fsloc->locations_count > 0) {
+		char *loctype = (fsloc->migrated) ? "refer" : "replicas";
+		int i;
+
+		seq_printf(m, "%s%s=", first++?",":"", loctype);
+		seq_escape(m, fsloc->locations[0].path, ",;@ \t\n\\");
+		seq_putc(m, '@');
+		seq_escape(m, fsloc->locations[0].hosts, ",;@ \t\n\\");
+		for (i = 1; i < fsloc->locations_count; i++) {
+			seq_putc(m, ';');
+			seq_escape(m, fsloc->locations[i].path, ",;@ \t\n\\");
+			seq_putc(m, '@');
+			seq_escape(m, fsloc->locations[i].hosts, ",;@ \t\n\\");
+		}
+	}
 }

 static int e_show(struct seq_file *m, void *p)
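fsloc_parse() gives the export-cache upcall an optional trailing clause of the form `fsloc <count> (<hosts> <path>)×count <migrated>`, which exp_flags() then prints back as a `refer=` (migrated) or `replicas=` list of path@host entries joined by semicolons. The sketch below re-implements the same grammar in user space to show the token order; strtok_r stands in for the kernel's qword_get (which additionally handles quoting and %-escapes), and error handling is elided:

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	struct fs_location { char *hosts, *path; };

	int main(void)
	{
		char line[] = "fsloc 2 server1:server2 /export/a server3 /export/b 0";
		char *save = NULL, *tok = strtok_r(line, " ", &save);

		if (!tok || strcmp(tok, "fsloc"))
			return 0;			/* no locations given */

		int count = atoi(strtok_r(NULL, " ", &save));
		struct fs_location *locs = calloc(count, sizeof(*locs));

		for (int i = 0; i < count; i++) {
			/* colon separated host list, then the path */
			locs[i].hosts = strdup(strtok_r(NULL, " ", &save));
			locs[i].path  = strdup(strtok_r(NULL, " ", &save));
		}
		int migrated = atoi(strtok_r(NULL, " ", &save));

		printf("%s list, %d location(s):\n",
		       migrated ? "referral" : "replica", count);
		for (int i = 0; i < count; i++)
			printf("  %s@%s\n", locs[i].path, locs[i].hosts);
		return 0;
	}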
diff --git a/fs/nfsd/lockd.c b/fs/nfsd/lockd.c
index 7b889ff15ae6..11fdaf7721b4 100644
--- a/fs/nfsd/lockd.c
+++ b/fs/nfsd/lockd.c
@@ -25,7 +25,7 @@
 static u32
 nlm_fopen(struct svc_rqst *rqstp, struct nfs_fh *f, struct file **filp)
 {
-	u32 nfserr;
+	__be32 nfserr;
 	struct svc_fh fh;

 	/* must initialize before using! but maxsize doesn't matter */
@@ -39,18 +39,20 @@ nlm_fopen(struct svc_rqst *rqstp, struct nfs_fh *f, struct file **filp)
 	fh_put(&fh);
 	rqstp->rq_client = NULL;
 	exp_readunlock();
-	/* nlm and nfsd don't share error codes.
-	 * we invent: 0 = no error
-	 *            1 = stale file handle
-	 *            2 = other error
+	/* We return nlm error codes as nlm doesn't know
+	 * about nfsd, but nfsd does know about nlm..
 	 */
 	switch (nfserr) {
 	case nfs_ok:
 		return 0;
+	case nfserr_dropit:
+		return nlm_drop_reply;
+#ifdef CONFIG_LOCKD_V4
 	case nfserr_stale:
-		return 1;
+		return nlm4_stale_fh;
+#endif
 	default:
-		return 2;
+		return nlm_lck_denied;
 	}
 }

diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c
index fe56b38364cc..e3eca0816986 100644
--- a/fs/nfsd/nfs2acl.c
+++ b/fs/nfsd/nfs2acl.c
@@ -21,7 +21,7 @@
 /*
  * NULL call.
  */
-static int
+static __be32
 nfsacld_proc_null(struct svc_rqst *rqstp, void *argp, void *resp)
 {
 	return nfs_ok;
@@ -30,12 +30,12 @@ nfsacld_proc_null(struct svc_rqst *rqstp, void *argp, void *resp)
 /*
  * Get the Access and/or Default ACL of a file.
  */
-static int nfsacld_proc_getacl(struct svc_rqst * rqstp,
+static __be32 nfsacld_proc_getacl(struct svc_rqst * rqstp,
 		struct nfsd3_getaclargs *argp, struct nfsd3_getaclres *resp)
 {
 	svc_fh *fh;
 	struct posix_acl *acl;
-	int nfserr = 0;
+	__be32 nfserr = 0;

 	dprintk("nfsd: GETACL(2acl)   %s\n", SVCFH_fmt(&argp->fh));

@@ -97,12 +97,12 @@ fail:
 /*
  * Set the Access and/or Default ACL of a file.
  */
-static int nfsacld_proc_setacl(struct svc_rqst * rqstp,
+static __be32 nfsacld_proc_setacl(struct svc_rqst * rqstp,
 		struct nfsd3_setaclargs *argp,
 		struct nfsd_attrstat *resp)
 {
 	svc_fh *fh;
-	int nfserr = 0;
+	__be32 nfserr = 0;

 	dprintk("nfsd: SETACL(2acl)   %s\n", SVCFH_fmt(&argp->fh));

@@ -128,7 +128,7 @@ static int nfsacld_proc_setacl(struct svc_rqst * rqstp,
 /*
  * Check file attributes
  */
-static int nfsacld_proc_getattr(struct svc_rqst * rqstp,
+static __be32 nfsacld_proc_getattr(struct svc_rqst * rqstp,
 		struct nfsd_fhandle *argp, struct nfsd_attrstat *resp)
 {
 	dprintk("nfsd: GETATTR  %s\n", SVCFH_fmt(&argp->fh));
@@ -140,10 +140,10 @@ static int nfsacld_proc_getattr(struct svc_rqst * rqstp,
 /*
  * Check file access
  */
-static int nfsacld_proc_access(struct svc_rqst *rqstp, struct nfsd3_accessargs *argp,
+static __be32 nfsacld_proc_access(struct svc_rqst *rqstp, struct nfsd3_accessargs *argp,
 		struct nfsd3_accessres *resp)
 {
-	int nfserr;
+	__be32 nfserr;

 	dprintk("nfsd: ACCESS(2acl)   %s 0x%x\n",
 			SVCFH_fmt(&argp->fh),
@@ -158,7 +158,7 @@ static int nfsacld_proc_access(struct svc_rqst *rqstp, struct nfsd3_accessargs *
 /*
  * XDR decode functions
  */
-static int nfsaclsvc_decode_getaclargs(struct svc_rqst *rqstp, u32 *p,
+static int nfsaclsvc_decode_getaclargs(struct svc_rqst *rqstp, __be32 *p,
 		struct nfsd3_getaclargs *argp)
 {
 	if (!(p = nfs2svc_decode_fh(p, &argp->fh)))
@@ -169,7 +169,7 @@ static int nfsaclsvc_decode_getaclargs(struct svc_rqst *rqstp, u32 *p,
 }


-static int nfsaclsvc_decode_setaclargs(struct svc_rqst *rqstp, u32 *p,
+static int nfsaclsvc_decode_setaclargs(struct svc_rqst *rqstp, __be32 *p,
 		struct nfsd3_setaclargs *argp)
 {
 	struct kvec *head = rqstp->rq_arg.head;
@@ -194,7 +194,7 @@ static int nfsaclsvc_decode_setaclargs(struct svc_rqst *rqstp, u32 *p,
 	return (n > 0);
 }

-static int nfsaclsvc_decode_fhandleargs(struct svc_rqst *rqstp, u32 *p,
+static int nfsaclsvc_decode_fhandleargs(struct svc_rqst *rqstp, __be32 *p,
 		struct nfsd_fhandle *argp)
 {
 	if (!(p = nfs2svc_decode_fh(p, &argp->fh)))
@@ -202,7 +202,7 @@ static int nfsaclsvc_decode_fhandleargs(struct svc_rqst *rqstp, u32 *p,
 	return xdr_argsize_check(rqstp, p);
 }

-static int nfsaclsvc_decode_accessargs(struct svc_rqst *rqstp, u32 *p,
+static int nfsaclsvc_decode_accessargs(struct svc_rqst *rqstp, __be32 *p,
 		struct nfsd3_accessargs *argp)
 {
 	if (!(p = nfs2svc_decode_fh(p, &argp->fh)))
@@ -217,7 +217,7 @@ static int nfsaclsvc_decode_accessargs(struct svc_rqst *rqstp, u32 *p,
 */

 /* GETACL */
-static int nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, u32 *p,
+static int nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p,
 		struct nfsd3_getaclres *resp)
 {
 	struct dentry *dentry = resp->fh.fh_dentry;
@@ -241,7 +241,7 @@ static int nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, u32 *p,

 	rqstp->rq_res.page_len = w;
 	while (w > 0) {
-		if (!svc_take_res_page(rqstp))
+		if (!rqstp->rq_respages[rqstp->rq_resused++])
 			return 0;
 		w -= PAGE_SIZE;
 	}
@@ -259,7 +259,7 @@ static int nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, u32 *p,
 	return 1;
 }

-static int nfsaclsvc_encode_attrstatres(struct svc_rqst *rqstp, u32 *p,
+static int nfsaclsvc_encode_attrstatres(struct svc_rqst *rqstp, __be32 *p,
 		struct nfsd_attrstat *resp)
 {
 	p = nfs2svc_encode_fattr(rqstp, p, &resp->fh);
@@ -267,7 +267,7 @@ static int nfsaclsvc_encode_attrstatres(struct svc_rqst *rqstp, u32 *p,
 }

 /* ACCESS */
-static int nfsaclsvc_encode_accessres(struct svc_rqst *rqstp, u32 *p,
+static int nfsaclsvc_encode_accessres(struct svc_rqst *rqstp, __be32 *p,
 		struct nfsd3_accessres *resp)
 {
 	p = nfs2svc_encode_fattr(rqstp, p, &resp->fh);
@@ -278,7 +278,7 @@ static int nfsaclsvc_encode_accessres(struct svc_rqst *rqstp, u32 *p,
 /*
  * XDR release functions
  */
-static int nfsaclsvc_release_getacl(struct svc_rqst *rqstp, u32 *p,
+static int nfsaclsvc_release_getacl(struct svc_rqst *rqstp, __be32 *p,
 		struct nfsd3_getaclres *resp)
 {
 	fh_put(&resp->fh);
@@ -287,7 +287,7 @@ static int nfsaclsvc_release_getacl(struct svc_rqst *rqstp, u32 *p,
 	return 1;
 }

-static int nfsaclsvc_release_fhandle(struct svc_rqst *rqstp, u32 *p,
+static int nfsaclsvc_release_fhandle(struct svc_rqst *rqstp, __be32 *p,
 		struct nfsd_fhandle *resp)
 {
 	fh_put(&resp->fh);
@@ -333,4 +333,5 @@ struct svc_version nfsd_acl_version2 = {
 		.vs_proc	= nfsd_acl_procedures2,
 		.vs_dispatch	= nfsd_dispatch,
 		.vs_xdrsize	= NFS3_SVC_XDRSIZE,
+		.vs_hidden	= 1,
 };
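The new .vs_hidden flag on nfsd_acl_version2 (and on nfsd_acl_version3 below) marks the ACL sideband program as one that is still dispatched when a request arrives but is not announced via the portmapper. A simplified sketch of what such a flag buys, with invented stand-in types rather than the real svc_version machinery:

	#include <stdbool.h>
	#include <stdio.h>

	struct version { const char *name; bool hidden; };

	/* The RPC service loop would still dispatch hidden programs;
	 * only the external registration step is skipped. */
	static void register_with_portmap(const struct version *v)
	{
		if (v->hidden) {
			printf("%s: not registered (hidden)\n", v->name);
			return;
		}
		printf("%s: registered with portmapper\n", v->name);
	}

	int main(void)
	{
		struct version acl2 = { "nfsacl v2", true };
		struct version nfs3 = { "nfs v3", false };
		register_with_portmap(&acl2);
		register_with_portmap(&nfs3);
		return 0;
	}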
diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c
index 16e10c170aed..fcad2895ddb0 100644
--- a/fs/nfsd/nfs3acl.c
+++ b/fs/nfsd/nfs3acl.c
@@ -19,7 +19,7 @@
 /*
  * NULL call.
  */
-static int
+static __be32
 nfsd3_proc_null(struct svc_rqst *rqstp, void *argp, void *resp)
 {
 	return nfs_ok;
@@ -28,12 +28,12 @@ nfsd3_proc_null(struct svc_rqst *rqstp, void *argp, void *resp)
 /*
  * Get the Access and/or Default ACL of a file.
  */
-static int nfsd3_proc_getacl(struct svc_rqst * rqstp,
+static __be32 nfsd3_proc_getacl(struct svc_rqst * rqstp,
 		struct nfsd3_getaclargs *argp, struct nfsd3_getaclres *resp)
 {
 	svc_fh *fh;
 	struct posix_acl *acl;
-	int nfserr = 0;
+	__be32 nfserr = 0;

 	fh = fh_copy(&resp->fh, &argp->fh);
 	if ((nfserr = fh_verify(rqstp, &resp->fh, 0, MAY_NOP)))
@@ -93,12 +93,12 @@ fail:
 /*
  * Set the Access and/or Default ACL of a file.
  */
-static int nfsd3_proc_setacl(struct svc_rqst * rqstp,
+static __be32 nfsd3_proc_setacl(struct svc_rqst * rqstp,
 		struct nfsd3_setaclargs *argp,
 		struct nfsd3_attrstat *resp)
 {
 	svc_fh *fh;
-	int nfserr = 0;
+	__be32 nfserr = 0;

 	fh = fh_copy(&resp->fh, &argp->fh);
 	nfserr = fh_verify(rqstp, &resp->fh, 0, MAY_SATTR);
@@ -122,7 +122,7 @@ static int nfsd3_proc_setacl(struct svc_rqst * rqstp,
 /*
  * XDR decode functions
  */
-static int nfs3svc_decode_getaclargs(struct svc_rqst *rqstp, u32 *p,
+static int nfs3svc_decode_getaclargs(struct svc_rqst *rqstp, __be32 *p,
 		struct nfsd3_getaclargs *args)
 {
 	if (!(p = nfs3svc_decode_fh(p, &args->fh)))
@@ -133,7 +133,7 @@ static int nfs3svc_decode_getaclargs(struct svc_rqst *rqstp, u32 *p,
 }


-static int nfs3svc_decode_setaclargs(struct svc_rqst *rqstp, u32 *p,
+static int nfs3svc_decode_setaclargs(struct svc_rqst *rqstp, __be32 *p,
 		struct nfsd3_setaclargs *args)
 {
 	struct kvec *head = rqstp->rq_arg.head;
@@ -163,7 +163,7 @@ static int nfs3svc_decode_setaclargs(struct svc_rqst *rqstp, u32 *p,
 */

 /* GETACL */
-static int nfs3svc_encode_getaclres(struct svc_rqst *rqstp, u32 *p,
+static int nfs3svc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p,
 		struct nfsd3_getaclres *resp)
 {
 	struct dentry *dentry = resp->fh.fh_dentry;
@@ -185,7 +185,7 @@ static int nfs3svc_encode_getaclres(struct svc_rqst *rqstp, u32 *p,

 	rqstp->rq_res.page_len = w;
 	while (w > 0) {
-		if (!svc_take_res_page(rqstp))
+		if (!rqstp->rq_respages[rqstp->rq_resused++])
 			return 0;
 		w -= PAGE_SIZE;
 	}
@@ -208,7 +208,7 @@ static int nfs3svc_encode_getaclres(struct svc_rqst *rqstp, u32 *p,
 }

 /* SETACL */
-static int nfs3svc_encode_setaclres(struct svc_rqst *rqstp, u32 *p,
+static int nfs3svc_encode_setaclres(struct svc_rqst *rqstp, __be32 *p,
 		struct nfsd3_attrstat *resp)
 {
 	p = nfs3svc_encode_post_op_attr(rqstp, p, &resp->fh);
@@ -219,7 +219,7 @@ static int nfs3svc_encode_setaclres(struct svc_rqst *rqstp, u32 *p,
 /*
  * XDR release functions
  */
-static int nfs3svc_release_getacl(struct svc_rqst *rqstp, u32 *p,
+static int nfs3svc_release_getacl(struct svc_rqst *rqstp, __be32 *p,
 		struct nfsd3_getaclres *resp)
 {
 	fh_put(&resp->fh);
@@ -263,5 +263,6 @@ struct svc_version nfsd_acl_version3 = {
 		.vs_proc	= nfsd_acl_procedures3,
 		.vs_dispatch	= nfsd_dispatch,
 		.vs_xdrsize	= NFS3_SVC_XDRSIZE,
+		.vs_hidden	= 1,
 };

diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index f61142afea44..64db601c2bd2 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -43,7 +43,7 @@ static int nfs3_ftypes[] = {
 /*
  * NULL call.
  */
-static int
+static __be32
 nfsd3_proc_null(struct svc_rqst *rqstp, void *argp, void *resp)
 {
 	return nfs_ok;
@@ -52,11 +52,12 @@ nfsd3_proc_null(struct svc_rqst *rqstp, void *argp, void *resp)
 /*
  * Get a file's attributes
  */
-static int
+static __be32
 nfsd3_proc_getattr(struct svc_rqst *rqstp, struct nfsd_fhandle *argp,
 					   struct nfsd3_attrstat *resp)
 {
-	int err, nfserr;
+	int err;
+	__be32 nfserr;
 
 	dprintk("nfsd: GETATTR(3) %s\n",
 				SVCFH_fmt(&argp->fh));
@@ -76,11 +77,11 @@ nfsd3_proc_getattr(struct svc_rqst *rqstp, struct nfsd_fhandle *argp,
 /*
  * Set a file's attributes
  */
-static int
+static __be32
 nfsd3_proc_setattr(struct svc_rqst *rqstp, struct nfsd3_sattrargs *argp,
 					   struct nfsd3_attrstat *resp)
 {
-	int nfserr;
+	__be32 nfserr;
 
 	dprintk("nfsd: SETATTR(3) %s\n",
 				SVCFH_fmt(&argp->fh));
@@ -94,11 +95,11 @@ nfsd3_proc_setattr(struct svc_rqst *rqstp, struct nfsd3_sattrargs *argp,
 /*
  * Look up a path name component
  */
-static int
+static __be32
 nfsd3_proc_lookup(struct svc_rqst *rqstp, struct nfsd3_diropargs *argp,
 					  struct nfsd3_diropres *resp)
 {
-	int nfserr;
+	__be32 nfserr;
 
 	dprintk("nfsd: LOOKUP(3) %s %.*s\n",
 				SVCFH_fmt(&argp->fh),
@@ -118,11 +119,11 @@ nfsd3_proc_lookup(struct svc_rqst *rqstp, struct nfsd3_diropargs *argp,
 /*
  * Check file access
  */
-static int
+static __be32
 nfsd3_proc_access(struct svc_rqst *rqstp, struct nfsd3_accessargs *argp,
 					  struct nfsd3_accessres *resp)
 {
-	int nfserr;
+	__be32 nfserr;
 
 	dprintk("nfsd: ACCESS(3) %s 0x%x\n",
 				SVCFH_fmt(&argp->fh),
@@ -137,11 +138,11 @@ nfsd3_proc_access(struct svc_rqst *rqstp, struct nfsd3_accessargs *argp,
 /*
  * Read a symlink.
  */
-static int
+static __be32
 nfsd3_proc_readlink(struct svc_rqst *rqstp, struct nfsd3_readlinkargs *argp,
 					   struct nfsd3_readlinkres *resp)
 {
-	int nfserr;
+	__be32 nfserr;
 
 	dprintk("nfsd: READLINK(3) %s\n", SVCFH_fmt(&argp->fh));
 
@@ -155,11 +156,12 @@ nfsd3_proc_readlink(struct svc_rqst *rqstp, struct nfsd3_readlinkargs *argp,
 /*
  * Read a portion of a file.
  */
-static int
+static __be32
 nfsd3_proc_read(struct svc_rqst *rqstp, struct nfsd3_readargs *argp,
 				struct nfsd3_readres *resp)
 {
-	int nfserr;
+	__be32 nfserr;
+	u32 max_blocksize = svc_max_payload(rqstp);
 
 	dprintk("nfsd: READ(3) %s %lu bytes at %lu\n",
 				SVCFH_fmt(&argp->fh),
@@ -172,15 +174,15 @@ nfsd3_proc_read(struct svc_rqst *rqstp, struct nfsd3_readargs *argp,
 	 */
 
 	resp->count = argp->count;
-	if (NFSSVC_MAXBLKSIZE < resp->count)
-		resp->count = NFSSVC_MAXBLKSIZE;
+	if (max_blocksize < resp->count)
+		resp->count = max_blocksize;
 
 	svc_reserve(rqstp, ((1 + NFS3_POST_OP_ATTR_WORDS + 3)<<2) + resp->count +4);
 
 	fh_copy(&resp->fh, &argp->fh);
 	nfserr = nfsd_read(rqstp, &resp->fh, NULL,
 				  argp->offset,
-				  argp->vec, argp->vlen,
+				  rqstp->rq_vec, argp->vlen,
 				  &resp->count);
 	if (nfserr == 0) {
 		struct inode *inode = resp->fh.fh_dentry->d_inode;
@@ -194,11 +196,11 @@ nfsd3_proc_read(struct svc_rqst *rqstp, struct nfsd3_readargs *argp,
 /*
  * Write data to a file
 */
-static int
+static __be32
 nfsd3_proc_write(struct svc_rqst *rqstp, struct nfsd3_writeargs *argp,
 				struct nfsd3_writeres *resp)
 {
-	int nfserr;
+	__be32 nfserr;
 
 	dprintk("nfsd: WRITE(3) %s %d bytes at %ld%s\n",
 				SVCFH_fmt(&argp->fh),
@@ -210,7 +212,7 @@ nfsd3_proc_write(struct svc_rqst *rqstp, struct nfsd3_writeargs *argp,
 	resp->committed = argp->stable;
 	nfserr = nfsd_write(rqstp, &resp->fh, NULL,
 				   argp->offset,
-				   argp->vec, argp->vlen,
+				   rqstp->rq_vec, argp->vlen,
 				   argp->len,
 				   &resp->committed);
 	resp->count = argp->count;
@@ -222,13 +224,13 @@ nfsd3_proc_write(struct svc_rqst *rqstp, struct nfsd3_writeargs *argp,
  * At least in theory; we'll see how it fares in practice when the
  * first reports about SunOS compatibility problems start to pour in...
  */
-static int
+static __be32
 nfsd3_proc_create(struct svc_rqst *rqstp, struct nfsd3_createargs *argp,
 				struct nfsd3_diropres *resp)
 {
 	svc_fh *dirfhp, *newfhp = NULL;
 	struct iattr *attr;
-	u32 nfserr;
+	__be32 nfserr;
 
 	dprintk("nfsd: CREATE(3) %s %.*s\n",
 				SVCFH_fmt(&argp->fh),
@@ -264,11 +266,11 @@ nfsd3_proc_create(struct svc_rqst *rqstp, struct nfsd3_createargs *argp,
 /*
  * Make directory. This operation is not idempotent.
  */
-static int
+static __be32
 nfsd3_proc_mkdir(struct svc_rqst *rqstp, struct nfsd3_createargs *argp,
 				struct nfsd3_diropres *resp)
 {
-	int nfserr;
+	__be32 nfserr;
 
 	dprintk("nfsd: MKDIR(3) %s %.*s\n",
 				SVCFH_fmt(&argp->fh),
@@ -284,11 +286,11 @@ nfsd3_proc_mkdir(struct svc_rqst *rqstp, struct nfsd3_createargs *argp,
 	RETURN_STATUS(nfserr);
 }
 
-static int
+static __be32
 nfsd3_proc_symlink(struct svc_rqst *rqstp, struct nfsd3_symlinkargs *argp,
 				struct nfsd3_diropres *resp)
 {
-	int nfserr;
+	__be32 nfserr;
 
 	dprintk("nfsd: SYMLINK(3) %s %.*s -> %.*s\n",
 				SVCFH_fmt(&argp->ffh),
@@ -306,11 +308,12 @@ nfsd3_proc_symlink(struct svc_rqst *rqstp, struct nfsd3_symlinkargs *argp,
 /*
  * Make socket/fifo/device.
  */
-static int
+static __be32
 nfsd3_proc_mknod(struct svc_rqst *rqstp, struct nfsd3_mknodargs *argp,
 				struct nfsd3_diropres *resp)
 {
-	int nfserr, type;
+	__be32 nfserr;
+	int type;
 	dev_t rdev = 0;
 
 	dprintk("nfsd: MKNOD(3) %s %.*s\n",
@@ -342,11 +345,11 @@ nfsd3_proc_mknod(struct svc_rqst *rqstp, struct nfsd3_mknodargs *argp,
 /*
  * Remove file/fifo/socket etc.
  */
-static int
+static __be32
 nfsd3_proc_remove(struct svc_rqst *rqstp, struct nfsd3_diropargs *argp,
 				struct nfsd3_attrstat *resp)
 {
-	int nfserr;
+	__be32 nfserr;
 
 	dprintk("nfsd: REMOVE(3) %s %.*s\n",
 				SVCFH_fmt(&argp->fh),
@@ -362,11 +365,11 @@ nfsd3_proc_remove(struct svc_rqst *rqstp, struct nfsd3_diropargs *argp,
 /*
  * Remove a directory
  */
-static int
+static __be32
 nfsd3_proc_rmdir(struct svc_rqst *rqstp, struct nfsd3_diropargs *argp,
 				struct nfsd3_attrstat *resp)
 {
-	int nfserr;
+	__be32 nfserr;
 
 	dprintk("nfsd: RMDIR(3) %s %.*s\n",
 				SVCFH_fmt(&argp->fh),
@@ -378,11 +381,11 @@ nfsd3_proc_rmdir(struct svc_rqst *rqstp, struct nfsd3_diropargs *argp,
 	RETURN_STATUS(nfserr);
 }
 
-static int
+static __be32
 nfsd3_proc_rename(struct svc_rqst *rqstp, struct nfsd3_renameargs *argp,
 				struct nfsd3_renameres *resp)
 {
-	int nfserr;
+	__be32 nfserr;
 
 	dprintk("nfsd: RENAME(3) %s %.*s ->\n",
 				SVCFH_fmt(&argp->ffh),
@@ -400,11 +403,11 @@ nfsd3_proc_rename(struct svc_rqst *rqstp, struct nfsd3_renameargs *argp,
 	RETURN_STATUS(nfserr);
 }
 
-static int
+static __be32
 nfsd3_proc_link(struct svc_rqst *rqstp, struct nfsd3_linkargs *argp,
 				struct nfsd3_linkres *resp)
 {
-	int nfserr;
+	__be32 nfserr;
 
 	dprintk("nfsd: LINK(3) %s ->\n",
 				SVCFH_fmt(&argp->ffh));
@@ -423,11 +426,12 @@ nfsd3_proc_link(struct svc_rqst *rqstp, struct nfsd3_linkargs *argp,
 /*
  * Read a portion of a directory.
  */
-static int
+static __be32
 nfsd3_proc_readdir(struct svc_rqst *rqstp, struct nfsd3_readdirargs *argp,
 				struct nfsd3_readdirres *resp)
 {
-	int nfserr, count;
+	__be32 nfserr;
+	int count;
 
 	dprintk("nfsd: READDIR(3) %s %d bytes at %d\n",
 				SVCFH_fmt(&argp->fh),
@@ -458,11 +462,12 @@ nfsd3_proc_readdir(struct svc_rqst *rqstp, struct nfsd3_readdirargs *argp,
  * Read a portion of a directory, including file handles and attrs.
  * For now, we choose to ignore the dircount parameter.
  */
-static int
+static __be32
 nfsd3_proc_readdirplus(struct svc_rqst *rqstp, struct nfsd3_readdirargs *argp,
 				struct nfsd3_readdirres *resp)
 {
-	int nfserr, count = 0;
+	__be32 nfserr;
+	int count = 0;
 	loff_t offset;
 	int i;
 	caddr_t page_addr = NULL;
@@ -516,11 +521,11 @@ nfsd3_proc_readdirplus(struct svc_rqst *rqstp, struct nfsd3_readdirargs *argp,
 /*
  * Get file system stats
 */
-static int
+static __be32
 nfsd3_proc_fsstat(struct svc_rqst * rqstp, struct nfsd_fhandle *argp,
 					   struct nfsd3_fsstatres *resp)
 {
-	int nfserr;
+	__be32 nfserr;
 
 	dprintk("nfsd: FSSTAT(3) %s\n",
 				SVCFH_fmt(&argp->fh));
@@ -533,20 +538,21 @@ nfsd3_proc_fsstat(struct svc_rqst * rqstp, struct nfsd_fhandle *argp,
 /*
  * Get file system info
 */
-static int
+static __be32
 nfsd3_proc_fsinfo(struct svc_rqst * rqstp, struct nfsd_fhandle *argp,
 					   struct nfsd3_fsinfores *resp)
 {
-	int nfserr;
+	__be32 nfserr;
+	u32 max_blocksize = svc_max_payload(rqstp);
 
 	dprintk("nfsd: FSINFO(3) %s\n",
 				SVCFH_fmt(&argp->fh));
 
-	resp->f_rtmax = NFSSVC_MAXBLKSIZE;
-	resp->f_rtpref = NFSSVC_MAXBLKSIZE;
+	resp->f_rtmax = max_blocksize;
+	resp->f_rtpref = max_blocksize;
 	resp->f_rtmult = PAGE_SIZE;
-	resp->f_wtmax = NFSSVC_MAXBLKSIZE;
-	resp->f_wtpref = NFSSVC_MAXBLKSIZE;
+	resp->f_wtmax = max_blocksize;
+	resp->f_wtpref = max_blocksize;
 	resp->f_wtmult = PAGE_SIZE;
 	resp->f_dtpref = PAGE_SIZE;
 	resp->f_maxfilesize = ~(u32) 0;
@@ -574,11 +580,11 @@ nfsd3_proc_fsinfo(struct svc_rqst * rqstp, struct nfsd_fhandle *argp,
 /*
  * Get pathconf info for the specified file
 */
-static int
+static __be32
 nfsd3_proc_pathconf(struct svc_rqst * rqstp, struct nfsd_fhandle *argp,
 					    struct nfsd3_pathconfres *resp)
 {
-	int nfserr;
+	__be32 nfserr;
 
 	dprintk("nfsd: PATHCONF(3) %s\n",
 				SVCFH_fmt(&argp->fh));
@@ -617,11 +623,11 @@ nfsd3_proc_pathconf(struct svc_rqst * rqstp, struct nfsd_fhandle *argp,
 /*
  * Commit a file (range) to stable storage.
 */
-static int
+static __be32
 nfsd3_proc_commit(struct svc_rqst * rqstp, struct nfsd3_commitargs *argp,
 				struct nfsd3_commitres *resp)
 {
-	int nfserr;
+	__be32 nfserr;
 
 	dprintk("nfsd: COMMIT(3) %s %u@%Lu\n",
 				SVCFH_fmt(&argp->fh),
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 243d94b9653a..b4baca3053c3 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -42,23 +42,23 @@ static u32 nfs3_ftypes[] = {
 /*
  * XDR functions for basic NFS types
  */
-static inline u32 *
-encode_time3(u32 *p, struct timespec *time)
+static inline __be32 *
+encode_time3(__be32 *p, struct timespec *time)
 {
 	*p++ = htonl((u32) time->tv_sec); *p++ = htonl(time->tv_nsec);
 	return p;
 }
 
-static inline u32 *
-decode_time3(u32 *p, struct timespec *time)
+static inline __be32 *
+decode_time3(__be32 *p, struct timespec *time)
 {
 	time->tv_sec = ntohl(*p++);
 	time->tv_nsec = ntohl(*p++);
 	return p;
 }
 
-static inline u32 *
-decode_fh(u32 *p, struct svc_fh *fhp)
+static inline __be32 *
+decode_fh(__be32 *p, struct svc_fh *fhp)
 {
 	unsigned int size;
 	fh_init(fhp, NFS3_FHSIZE);
@@ -72,13 +72,13 @@ decode_fh(u32 *p, struct svc_fh *fhp)
 }
 
 /* Helper function for NFSv3 ACL code */
-u32 *nfs3svc_decode_fh(u32 *p, struct svc_fh *fhp)
+__be32 *nfs3svc_decode_fh(__be32 *p, struct svc_fh *fhp)
 {
 	return decode_fh(p, fhp);
 }
 
-static inline u32 *
-encode_fh(u32 *p, struct svc_fh *fhp)
+static inline __be32 *
+encode_fh(__be32 *p, struct svc_fh *fhp)
 {
 	unsigned int size = fhp->fh_handle.fh_size;
 	*p++ = htonl(size);
@@ -91,8 +91,8 @@ encode_fh(u32 *p, struct svc_fh *fhp)
  * Decode a file name and make sure that the path contains
  * no slashes or null bytes.
  */
-static inline u32 *
-decode_filename(u32 *p, char **namp, int *lenp)
+static inline __be32 *
+decode_filename(__be32 *p, char **namp, int *lenp)
 {
 	char *name;
 	int i;
@@ -107,8 +107,8 @@ decode_filename(u32 *p, char **namp, int *lenp)
 	return p;
 }
 
-static inline u32 *
-decode_sattr3(u32 *p, struct iattr *iap)
+static inline __be32 *
+decode_sattr3(__be32 *p, struct iattr *iap)
 {
 	u32 tmp;
 
@@ -153,8 +153,8 @@ decode_sattr3(u32 *p, struct iattr *iap)
 	return p;
 }
 
-static inline u32 *
-encode_fattr3(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp,
+static inline __be32 *
+encode_fattr3(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp,
 	      struct kstat *stat)
 {
 	struct dentry *dentry = fhp->fh_dentry;
@@ -186,8 +186,8 @@ encode_fattr3(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp,
 	return p;
 }
 
-static inline u32 *
-encode_saved_post_attr(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp)
+static inline __be32 *
+encode_saved_post_attr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp)
 {
 	struct inode *inode = fhp->fh_dentry->d_inode;
 
@@ -224,8 +224,8 @@ encode_saved_post_attr(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp)
  * The inode may be NULL if the call failed because of a stale file
  * handle. In this case, no attributes are returned.
 */
-static u32 *
-encode_post_op_attr(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp)
+static __be32 *
+encode_post_op_attr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp)
 {
 	struct dentry *dentry = fhp->fh_dentry;
 	if (dentry && dentry->d_inode != NULL) {
@@ -243,8 +243,8 @@ encode_post_op_attr(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp)
 }
 
 /* Helper for NFSv3 ACLs */
-u32 *
-nfs3svc_encode_post_op_attr(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp)
+__be32 *
+nfs3svc_encode_post_op_attr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp)
 {
 	return encode_post_op_attr(rqstp, p, fhp);
 }
@@ -252,8 +252,8 @@ nfs3svc_encode_post_op_attr(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp)
 /*
  * Enocde weak cache consistency data
 */
-static u32 *
-encode_wcc_data(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp)
+static __be32 *
+encode_wcc_data(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp)
 {
 	struct dentry *dentry = fhp->fh_dentry;
 
@@ -278,7 +278,7 @@ encode_wcc_data(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp)
  * XDR decode functions
 */
 int
-nfs3svc_decode_fhandle(struct svc_rqst *rqstp, u32 *p, struct nfsd_fhandle *args)
+nfs3svc_decode_fhandle(struct svc_rqst *rqstp, __be32 *p, struct nfsd_fhandle *args)
 {
 	if (!(p = decode_fh(p, &args->fh)))
 		return 0;
@@ -286,7 +286,7 @@ nfs3svc_decode_fhandle(struct svc_rqst *rqstp, u32 *p, struct nfsd_fhandle *args
 }
 
 int
-nfs3svc_decode_sattrargs(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_decode_sattrargs(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_sattrargs *args)
 {
 	if (!(p = decode_fh(p, &args->fh))
@@ -303,7 +303,7 @@ nfs3svc_decode_sattrargs(struct svc_rqst *rqstp, u32 *p,
 }
 
 int
-nfs3svc_decode_diropargs(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_decode_diropargs(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_diropargs *args)
 {
 	if (!(p = decode_fh(p, &args->fh))
@@ -314,7 +314,7 @@ nfs3svc_decode_diropargs(struct svc_rqst *rqstp, u32 *p,
 }
 
 int
-nfs3svc_decode_accessargs(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_decode_accessargs(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_accessargs *args)
 {
 	if (!(p = decode_fh(p, &args->fh)))
@@ -325,11 +325,12 @@ nfs3svc_decode_accessargs(struct svc_rqst *rqstp, u32 *p,
 }
 
 int
-nfs3svc_decode_readargs(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_decode_readargs(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_readargs *args)
 {
 	unsigned int len;
 	int v,pn;
+	u32 max_blocksize = svc_max_payload(rqstp);
 
 	if (!(p = decode_fh(p, &args->fh))
 	 || !(p = xdr_decode_hyper(p, &args->offset)))
@@ -337,17 +338,16 @@ nfs3svc_decode_readargs(struct svc_rqst *rqstp, u32 *p,
 
 	len = args->count = ntohl(*p++);
 
-	if (len > NFSSVC_MAXBLKSIZE)
-		len = NFSSVC_MAXBLKSIZE;
+	if (len > max_blocksize)
+		len = max_blocksize;
 
 	/* set up the kvec */
 	v=0;
 	while (len > 0) {
-		pn = rqstp->rq_resused;
-		svc_take_page(rqstp);
-		args->vec[v].iov_base = page_address(rqstp->rq_respages[pn]);
-		args->vec[v].iov_len = len < PAGE_SIZE? len : PAGE_SIZE;
-		len -= args->vec[v].iov_len;
+		pn = rqstp->rq_resused++;
+		rqstp->rq_vec[v].iov_base = page_address(rqstp->rq_respages[pn]);
+		rqstp->rq_vec[v].iov_len = len < PAGE_SIZE? len : PAGE_SIZE;
+		len -= rqstp->rq_vec[v].iov_len;
 		v++;
 	}
 	args->vlen = v;
@@ -355,10 +355,11 @@ nfs3svc_decode_readargs(struct svc_rqst *rqstp, u32 *p,
 }
 
 int
-nfs3svc_decode_writeargs(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_writeargs *args)
 {
 	unsigned int len, v, hdr;
+	u32 max_blocksize = svc_max_payload(rqstp);
 
 	if (!(p = decode_fh(p, &args->fh))
 	 || !(p = xdr_decode_hyper(p, &args->offset)))
@@ -373,26 +374,26 @@ nfs3svc_decode_writeargs(struct svc_rqst *rqstp, u32 *p,
 	    rqstp->rq_arg.len - hdr < len)
 		return 0;
 
-	args->vec[0].iov_base = (void*)p;
-	args->vec[0].iov_len = rqstp->rq_arg.head[0].iov_len - hdr;
+	rqstp->rq_vec[0].iov_base = (void*)p;
+	rqstp->rq_vec[0].iov_len = rqstp->rq_arg.head[0].iov_len - hdr;
 
-	if (len > NFSSVC_MAXBLKSIZE)
-		len = NFSSVC_MAXBLKSIZE;
+	if (len > max_blocksize)
+		len = max_blocksize;
 	v= 0;
-	while (len > args->vec[v].iov_len) {
-		len -= args->vec[v].iov_len;
+	while (len > rqstp->rq_vec[v].iov_len) {
+		len -= rqstp->rq_vec[v].iov_len;
 		v++;
-		args->vec[v].iov_base = page_address(rqstp->rq_argpages[v]);
-		args->vec[v].iov_len = PAGE_SIZE;
+		rqstp->rq_vec[v].iov_base = page_address(rqstp->rq_pages[v]);
+		rqstp->rq_vec[v].iov_len = PAGE_SIZE;
 	}
-	args->vec[v].iov_len = len;
+	rqstp->rq_vec[v].iov_len = len;
 	args->vlen = v+1;
 
-	return args->count == args->len && args->vec[0].iov_len > 0;
+	return args->count == args->len && rqstp->rq_vec[0].iov_len > 0;
 }
 
 int
-nfs3svc_decode_createargs(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_decode_createargs(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_createargs *args)
 {
 	if (!(p = decode_fh(p, &args->fh))
@@ -416,7 +417,7 @@ nfs3svc_decode_createargs(struct svc_rqst *rqstp, u32 *p,
 	return xdr_argsize_check(rqstp, p);
 }
 int
-nfs3svc_decode_mkdirargs(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_decode_mkdirargs(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_createargs *args)
 {
 	if (!(p = decode_fh(p, &args->fh))
@@ -428,7 +429,7 @@ nfs3svc_decode_mkdirargs(struct svc_rqst *rqstp, u32 *p,
 }
 
 int
-nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_symlinkargs *args)
 {
 	unsigned int len;
@@ -446,11 +447,11 @@ nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, u32 *p,
 * This page appears in the rq_res.pages list, but as pages_len is always
 * 0, it won't get in the way
 */
-	svc_take_page(rqstp);
 	len = ntohl(*p++);
 	if (len == 0 || len > NFS3_MAXPATHLEN || len >= PAGE_SIZE)
 		return 0;
-	args->tname = new = page_address(rqstp->rq_respages[rqstp->rq_resused-1]);
+	args->tname = new =
+		page_address(rqstp->rq_respages[rqstp->rq_resused++]);
 	args->tlen = len;
 	/* first copy and check from the first page */
 	old = (char*)p;
@@ -480,7 +481,7 @@ nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, u32 *p,
 }
 
 int
-nfs3svc_decode_mknodargs(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_decode_mknodargs(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_mknodargs *args)
 {
 	if (!(p = decode_fh(p, &args->fh))
@@ -504,7 +505,7 @@ nfs3svc_decode_mknodargs(struct svc_rqst *rqstp, u32 *p,
 }
 
 int
-nfs3svc_decode_renameargs(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_decode_renameargs(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_renameargs *args)
 {
 	if (!(p = decode_fh(p, &args->ffh))
@@ -517,19 +518,19 @@ nfs3svc_decode_renameargs(struct svc_rqst *rqstp, u32 *p,
 }
 
 int
-nfs3svc_decode_readlinkargs(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_decode_readlinkargs(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_readlinkargs *args)
 {
 	if (!(p = decode_fh(p, &args->fh)))
 		return 0;
-	svc_take_page(rqstp);
-	args->buffer = page_address(rqstp->rq_respages[rqstp->rq_resused-1]);
+	args->buffer =
+		page_address(rqstp->rq_respages[rqstp->rq_resused++]);
 
 	return xdr_argsize_check(rqstp, p);
 }
 
 int
-nfs3svc_decode_linkargs(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_decode_linkargs(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_linkargs *args)
 {
 	if (!(p = decode_fh(p, &args->ffh))
@@ -541,7 +542,7 @@ nfs3svc_decode_linkargs(struct svc_rqst *rqstp, u32 *p,
 }
 
 int
-nfs3svc_decode_readdirargs(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_readdirargs *args)
 {
 	if (!(p = decode_fh(p, &args->fh)))
@@ -554,17 +555,18 @@ nfs3svc_decode_readdirargs(struct svc_rqst *rqstp, u32 *p,
 	if (args->count > PAGE_SIZE)
 		args->count = PAGE_SIZE;
 
-	svc_take_page(rqstp);
-	args->buffer = page_address(rqstp->rq_respages[rqstp->rq_resused-1]);
+	args->buffer =
+		page_address(rqstp->rq_respages[rqstp->rq_resused++]);
 
 	return xdr_argsize_check(rqstp, p);
 }
 
 int
-nfs3svc_decode_readdirplusargs(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_decode_readdirplusargs(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_readdirargs *args)
 {
 	int len, pn;
+	u32 max_blocksize = svc_max_payload(rqstp);
 
 	if (!(p = decode_fh(p, &args->fh)))
 		return 0;
@@ -573,13 +575,12 @@ nfs3svc_decode_readdirplusargs(struct svc_rqst *rqstp, u32 *p,
 	args->dircount = ntohl(*p++);
 	args->count = ntohl(*p++);
 
-	len = (args->count > NFSSVC_MAXBLKSIZE) ? NFSSVC_MAXBLKSIZE :
+	len = (args->count > max_blocksize) ? max_blocksize :
 				  args->count;
 	args->count = len;
 
 	while (len > 0) {
-		pn = rqstp->rq_resused;
-		svc_take_page(rqstp);
+		pn = rqstp->rq_resused++;
 		if (!args->buffer)
 			args->buffer = page_address(rqstp->rq_respages[pn]);
 		len -= PAGE_SIZE;
@@ -589,7 +590,7 @@ nfs3svc_decode_readdirplusargs(struct svc_rqst *rqstp, u32 *p,
 }
 
 int
-nfs3svc_decode_commitargs(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_decode_commitargs(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_commitargs *args)
 {
 	if (!(p = decode_fh(p, &args->fh)))
@@ -608,14 +609,14 @@ nfs3svc_decode_commitargs(struct svc_rqst *rqstp, u32 *p,
 * will work properly.
 */
 int
-nfs3svc_encode_voidres(struct svc_rqst *rqstp, u32 *p, void *dummy)
+nfs3svc_encode_voidres(struct svc_rqst *rqstp, __be32 *p, void *dummy)
 {
 	return xdr_ressize_check(rqstp, p);
 }
 
 /* GETATTR */
 int
-nfs3svc_encode_attrstat(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_encode_attrstat(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_attrstat *resp)
 {
 	if (resp->status == 0)
@@ -625,7 +626,7 @@ nfs3svc_encode_attrstat(struct svc_rqst *rqstp, u32 *p,
 
 /* SETATTR, REMOVE, RMDIR */
 int
-nfs3svc_encode_wccstat(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_encode_wccstat(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_attrstat *resp)
 {
 	p = encode_wcc_data(rqstp, p, &resp->fh);
@@ -634,7 +635,7 @@ nfs3svc_encode_wccstat(struct svc_rqst *rqstp, u32 *p,
 
 /* LOOKUP */
 int
-nfs3svc_encode_diropres(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_encode_diropres(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_diropres *resp)
 {
 	if (resp->status == 0) {
@@ -647,7 +648,7 @@ nfs3svc_encode_diropres(struct svc_rqst *rqstp, u32 *p,
 
 /* ACCESS */
 int
-nfs3svc_encode_accessres(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_encode_accessres(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_accessres *resp)
 {
 	p = encode_post_op_attr(rqstp, p, &resp->fh);
@@ -658,7 +659,7 @@ nfs3svc_encode_accessres(struct svc_rqst *rqstp, u32 *p,
 
 /* READLINK */
 int
-nfs3svc_encode_readlinkres(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_encode_readlinkres(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_readlinkres *resp)
 {
 	p = encode_post_op_attr(rqstp, p, &resp->fh);
@@ -668,7 +669,6 @@ nfs3svc_encode_readlinkres(struct svc_rqst *rqstp, u32 *p,
 	rqstp->rq_res.page_len = resp->len;
 	if (resp->len & 3) {
 		/* need to pad the tail */
-		rqstp->rq_restailpage = 0;
 		rqstp->rq_res.tail[0].iov_base = p;
 		*p = 0;
 		rqstp->rq_res.tail[0].iov_len = 4 - (resp->len&3);
@@ -680,7 +680,7 @@ nfs3svc_encode_readlinkres(struct svc_rqst *rqstp, u32 *p,
 
 /* READ */
 int
-nfs3svc_encode_readres(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_encode_readres(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_readres *resp)
 {
 	p = encode_post_op_attr(rqstp, p, &resp->fh);
@@ -693,7 +693,6 @@ nfs3svc_encode_readres(struct svc_rqst *rqstp, u32 *p,
 	rqstp->rq_res.page_len = resp->count;
 	if (resp->count & 3) {
 		/* need to pad the tail */
-		rqstp->rq_restailpage = 0;
 		rqstp->rq_res.tail[0].iov_base = p;
 		*p = 0;
 		rqstp->rq_res.tail[0].iov_len = 4 - (resp->count & 3);
@@ -705,7 +704,7 @@ nfs3svc_encode_readres(struct svc_rqst *rqstp, u32 *p,
 
 /* WRITE */
 int
-nfs3svc_encode_writeres(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_encode_writeres(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_writeres *resp)
 {
 	p = encode_wcc_data(rqstp, p, &resp->fh);
@@ -720,7 +719,7 @@ nfs3svc_encode_writeres(struct svc_rqst *rqstp, u32 *p,
 
 /* CREATE, MKDIR, SYMLINK, MKNOD */
 int
-nfs3svc_encode_createres(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_encode_createres(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_diropres *resp)
 {
 	if (resp->status == 0) {
@@ -734,7 +733,7 @@ nfs3svc_encode_createres(struct svc_rqst *rqstp, u32 *p,
 
 /* RENAME */
 int
-nfs3svc_encode_renameres(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_encode_renameres(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_renameres *resp)
 {
 	p = encode_wcc_data(rqstp, p, &resp->ffh);
@@ -744,7 +743,7 @@ nfs3svc_encode_renameres(struct svc_rqst *rqstp, u32 *p,
 
 /* LINK */
 int
-nfs3svc_encode_linkres(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_encode_linkres(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_linkres *resp)
 {
 	p = encode_post_op_attr(rqstp, p, &resp->fh);
@@ -754,7 +753,7 @@ nfs3svc_encode_linkres(struct svc_rqst *rqstp, u32 *p,
 
 /* READDIR */
 int
-nfs3svc_encode_readdirres(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_encode_readdirres(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_readdirres *resp)
 {
 	p = encode_post_op_attr(rqstp, p, &resp->fh);
@@ -768,7 +767,6 @@ nfs3svc_encode_readdirres(struct svc_rqst *rqstp, u32 *p,
 	rqstp->rq_res.page_len = (resp->count) << 2;
 
 	/* add the 'tail' to the end of the 'head' page - page 0. */
-	rqstp->rq_restailpage = 0;
 	rqstp->rq_res.tail[0].iov_base = p;
 	*p++ = 0;		/* no more entries */
 	*p++ = htonl(resp->common.err == nfserr_eof);
@@ -778,8 +776,8 @@ nfs3svc_encode_readdirres(struct svc_rqst *rqstp, u32 *p,
 	return xdr_ressize_check(rqstp, p);
 }
 
-static inline u32 *
-encode_entry_baggage(struct nfsd3_readdirres *cd, u32 *p, const char *name,
+static inline __be32 *
+encode_entry_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name,
 	     int namlen, ino_t ino)
 {
 	*p++ = xdr_one;				 /* mark entry present */
@@ -792,8 +790,8 @@ encode_entry_baggage(struct nfsd3_readdirres *cd, u32 *p, const char *name,
 	return p;
 }
 
-static inline u32 *
-encode_entryplus_baggage(struct nfsd3_readdirres *cd, u32 *p,
+static inline __be32 *
+encode_entryplus_baggage(struct nfsd3_readdirres *cd, __be32 *p,
 		struct svc_fh *fhp)
 {
 	p = encode_post_op_attr(cd->rqstp, p, fhp);
@@ -855,7 +853,7 @@ encode_entry(struct readdir_cd *ccd, const char *name,
 {
 	struct nfsd3_readdirres *cd = container_of(ccd, struct nfsd3_readdirres,
 							common);
-	u32 *p = cd->buffer;
+	__be32 *p = cd->buffer;
 	caddr_t curr_page_addr = NULL;
 	int pn;		/* current page number */
 	int slen;	/* string (name) length */
@@ -921,7 +919,7 @@ encode_entry(struct readdir_cd *ccd, const char *name,
 	} else if (cd->rqstp->rq_respages[pn+1] != NULL) {
 		/* temporarily encode entry into next page, then move back to
 		 * current and next page in rq_respages[] */
-		u32 *p1, *tmp;
+		__be32 *p1, *tmp;
 		int len1, len2;
 
 		/* grab next page for temporary storage of entry */
@@ -1011,7 +1009,7 @@ nfs3svc_encode_entry_plus(struct readdir_cd *cd, const char *name,
 
 /* FSSTAT */
 int
-nfs3svc_encode_fsstatres(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_encode_fsstatres(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_fsstatres *resp)
 {
 	struct kstatfs *s = &resp->stats;
@@ -1033,7 +1031,7 @@ nfs3svc_encode_fsstatres(struct svc_rqst *rqstp, u32 *p,
 
 /* FSINFO */
 int
-nfs3svc_encode_fsinfores(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_encode_fsinfores(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_fsinfores *resp)
 {
 	*p++ = xdr_zero;	/* no post_op_attr */
@@ -1057,7 +1055,7 @@ nfs3svc_encode_fsinfores(struct svc_rqst *rqstp, u32 *p,
 
 /* PATHCONF */
 int
-nfs3svc_encode_pathconfres(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_encode_pathconfres(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_pathconfres *resp)
 {
 	*p++ = xdr_zero;	/* no post_op_attr */
@@ -1076,7 +1074,7 @@ nfs3svc_encode_pathconfres(struct svc_rqst *rqstp, u32 *p,
 
 /* COMMIT */
 int
-nfs3svc_encode_commitres(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_encode_commitres(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_commitres *resp)
 {
 	p = encode_wcc_data(rqstp, p, &resp->fh);
@@ -1092,7 +1090,7 @@ nfs3svc_encode_commitres(struct svc_rqst *rqstp, u32 *p,
  * XDR release functions
 */
 int
-nfs3svc_release_fhandle(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_release_fhandle(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_attrstat *resp)
 {
 	fh_put(&resp->fh);
@@ -1100,7 +1098,7 @@ nfs3svc_release_fhandle(struct svc_rqst *rqstp, u32 *p,
 }
 
 int
-nfs3svc_release_fhandle2(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_release_fhandle2(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_fhandle_pair *resp)
 {
 	fh_put(&resp->fh1);
diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c
index edb107e61b91..5d94555cdc83 100644
--- a/fs/nfsd/nfs4acl.c
+++ b/fs/nfsd/nfs4acl.c
@@ -63,6 +63,8 @@
 #define NFS4_INHERITANCE_FLAGS (NFS4_ACE_FILE_INHERIT_ACE \
 		| NFS4_ACE_DIRECTORY_INHERIT_ACE | NFS4_ACE_INHERIT_ONLY_ACE)
 
+#define NFS4_SUPPORTED_FLAGS (NFS4_INHERITANCE_FLAGS | NFS4_ACE_IDENTIFIER_GROUP)
+
 #define MASK_EQUAL(mask1, mask2) \
 	( ((mask1) & NFS4_ACE_MASK_ALL) == ((mask2) & NFS4_ACE_MASK_ALL) )
 
@@ -96,24 +98,26 @@ deny_mask(u32 allow_mask, unsigned int flags)
 /* XXX: modify functions to return NFS errors; they're only ever
  * used by nfs code, after all.... */
 
-static int
-mode_from_nfs4(u32 perm, unsigned short *mode, unsigned int flags)
+/* We only map from NFSv4 to POSIX ACLs when setting ACLs, when we err on the
+ * side of being more restrictive, so the mode bit mapping below is
+ * pessimistic. An optimistic version would be needed to handle DENY's,
+ * but we espect to coalesce all ALLOWs and DENYs before mapping to mode
+ * bits. */
+
+static void
+low_mode_from_nfs4(u32 perm, unsigned short *mode, unsigned int flags)
 {
-	u32 ignore = 0;
+	u32 write_mode = NFS4_WRITE_MODE;
 
-	if (!(flags & NFS4_ACL_DIR))
-		ignore |= NFS4_ACE_DELETE_CHILD; /* ignore it */
-	perm |= ignore;
+	if (flags & NFS4_ACL_DIR)
+		write_mode |= NFS4_ACE_DELETE_CHILD;
 	*mode = 0;
 	if ((perm & NFS4_READ_MODE) == NFS4_READ_MODE)
 		*mode |= ACL_READ;
-	if ((perm & NFS4_WRITE_MODE) == NFS4_WRITE_MODE)
+	if ((perm & write_mode) == write_mode)
 		*mode |= ACL_WRITE;
 	if ((perm & NFS4_EXECUTE_MODE) == NFS4_EXECUTE_MODE)
 		*mode |= ACL_EXECUTE;
-	if (!MASK_EQUAL(perm, ignore|mask_from_posix(*mode, flags)))
-		return -EINVAL;
-	return 0;
 }
 
 struct ace_container {
@@ -338,38 +342,6 @@ sort_pacl(struct posix_acl *pacl)
 	return;
 }
 
-static int
-write_pace(struct nfs4_ace *ace, struct posix_acl *pacl,
-		struct posix_acl_entry **pace, short tag, unsigned int flags)
-{
-	struct posix_acl_entry *this = *pace;
-
-	if (*pace == pacl->a_entries + pacl->a_count)
-		return -EINVAL; /* fell off the end */
-	(*pace)++;
-	this->e_tag = tag;
-	if (tag == ACL_USER_OBJ)
-		flags |= NFS4_ACL_OWNER;
-	if (mode_from_nfs4(ace->access_mask, &this->e_perm, flags))
-		return -EINVAL;
-	this->e_id = (tag == ACL_USER || tag == ACL_GROUP ?
-			ace->who : ACL_UNDEFINED_ID);
-	return 0;
-}
-
-static struct nfs4_ace *
-get_next_v4_ace(struct list_head **p, struct list_head *head)
-{
-	struct nfs4_ace *ace;
-
-	*p = (*p)->next;
-	if (*p == head)
-		return NULL;
-	ace = list_entry(*p, struct nfs4_ace, l_ace);
-
-	return ace;
-}
-
 int
 nfs4_acl_nfsv4_to_posix(struct nfs4_acl *acl, struct posix_acl **pacl,
 		struct posix_acl **dpacl, unsigned int flags)
@@ -385,42 +357,23 @@ nfs4_acl_nfsv4_to_posix(struct nfs4_acl *acl, struct posix_acl **pacl,
 		goto out;
 
 	error = nfs4_acl_split(acl, dacl);
-	if (error < 0)
+	if (error)
 		goto out_acl;
 
-	if (pacl != NULL) {
-		if (acl->naces == 0) {
-			error = -ENODATA;
-			goto try_dpacl;
-		}
-
-		*pacl = _nfsv4_to_posix_one(acl, flags);
-		if (IS_ERR(*pacl)) {
-			error = PTR_ERR(*pacl);
-			*pacl = NULL;
-			goto out_acl;
-		}
+	*pacl = _nfsv4_to_posix_one(acl, flags);
+	if (IS_ERR(*pacl)) {
+		error = PTR_ERR(*pacl);
+		*pacl = NULL;
+		goto out_acl;
 	}
 
-try_dpacl:
-	if (dpacl != NULL) {
-		if (dacl->naces == 0) {
-			if (pacl == NULL || *pacl == NULL)
-				error = -ENODATA;
-			goto out_acl;
-		}
-
-		error = 0;
-		*dpacl = _nfsv4_to_posix_one(dacl, flags);
-		if (IS_ERR(*dpacl)) {
-			error = PTR_ERR(*dpacl);
-			*dpacl = NULL;
-			goto out_acl;
-		}
+	*dpacl = _nfsv4_to_posix_one(dacl, flags);
+	if (IS_ERR(*dpacl)) {
+		error = PTR_ERR(*dpacl);
+		*dpacl = NULL;
 	}
-
 out_acl:
-	if (error && pacl) {
+	if (error) {
 		posix_acl_release(*pacl);
 		*pacl = NULL;
 	}
@@ -429,349 +382,311 @@ out:
429 return error; 382 return error;
430} 383}
431 384
385/*
386 * While processing the NFSv4 ACE, this maintains bitmasks representing
387 * which permission bits have been allowed and which denied to a given
388 * entity: */
389struct posix_ace_state {
390 u32 allow;
391 u32 deny;
392};
393
394struct posix_user_ace_state {
395 uid_t uid;
396 struct posix_ace_state perms;
397};
398
399struct posix_ace_state_array {
400 int n;
401 struct posix_user_ace_state aces[];
402};
403
404/*
405 * While processing the NFSv4 ACE, this maintains the partial permissions
406 * calculated so far: */
407
408struct posix_acl_state {
409 struct posix_ace_state owner;
410 struct posix_ace_state group;
411 struct posix_ace_state other;
412 struct posix_ace_state everyone;
413 struct posix_ace_state mask; /* Deny unused in this case */
414 struct posix_ace_state_array *users;
415 struct posix_ace_state_array *groups;
416};
417
432static int 418static int
433same_who(struct nfs4_ace *a, struct nfs4_ace *b) 419init_state(struct posix_acl_state *state, int cnt)
434{ 420{
435 return a->whotype == b->whotype && 421 int alloc;
436 (a->whotype != NFS4_ACL_WHO_NAMED || a->who == b->who); 422
423 memset(state, 0, sizeof(struct posix_acl_state));
424 /*
425 * In the worst case, each individual acl could be for a distinct
426 * named user or group, but we don't no which, so we allocate
427 * enough space for either:
428 */
429 alloc = sizeof(struct posix_ace_state_array)
430 + cnt*sizeof(struct posix_ace_state);
431 state->users = kzalloc(alloc, GFP_KERNEL);
432 if (!state->users)
433 return -ENOMEM;
434 state->groups = kzalloc(alloc, GFP_KERNEL);
435 if (!state->groups) {
436 kfree(state->users);
437 return -ENOMEM;
438 }
439 return 0;
437} 440}
438 441
439static int 442static void
440complementary_ace_pair(struct nfs4_ace *allow, struct nfs4_ace *deny, 443free_state(struct posix_acl_state *state) {
441 unsigned int flags) 444 kfree(state->users);
442{ 445 kfree(state->groups);
443 int ignore = 0;
444 if (!(flags & NFS4_ACL_DIR))
445 ignore |= NFS4_ACE_DELETE_CHILD;
446 return MASK_EQUAL(ignore|deny_mask(allow->access_mask, flags),
447 ignore|deny->access_mask) &&
448 allow->type == NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE &&
449 deny->type == NFS4_ACE_ACCESS_DENIED_ACE_TYPE &&
450 allow->flag == deny->flag &&
451 same_who(allow, deny);
452} 446}
453 447
454static inline int 448static inline void add_to_mask(struct posix_acl_state *state, struct posix_ace_state *astate)
455user_obj_from_v4(struct nfs4_acl *n4acl, struct list_head **p,
456 struct posix_acl *pacl, struct posix_acl_entry **pace,
457 unsigned int flags)
458{ 449{
459 int error = -EINVAL; 450 state->mask.allow |= astate->allow;
460 struct nfs4_ace *ace, *ace2;
461
462 ace = get_next_v4_ace(p, &n4acl->ace_head);
463 if (ace == NULL)
464 goto out;
465 if (ace2type(ace) != ACL_USER_OBJ)
466 goto out;
467 error = write_pace(ace, pacl, pace, ACL_USER_OBJ, flags);
468 if (error < 0)
469 goto out;
470 error = -EINVAL;
471 ace2 = get_next_v4_ace(p, &n4acl->ace_head);
472 if (ace2 == NULL)
473 goto out;
474 if (!complementary_ace_pair(ace, ace2, flags))
475 goto out;
476 error = 0;
477out:
478 return error;
479} 451}
480 452
481static inline int 453/*
482users_from_v4(struct nfs4_acl *n4acl, struct list_head **p, 454 * Certain bits (SYNCHRONIZE, DELETE, WRITE_OWNER, READ/WRITE_NAMED_ATTRS,
483 struct nfs4_ace **mask_ace, 455 * READ_ATTRIBUTES, READ_ACL) are currently unenforceable and don't translate
484 struct posix_acl *pacl, struct posix_acl_entry **pace, 456 * to traditional read/write/execute permissions.
485 unsigned int flags) 457 *
486{ 458 * It's problematic to reject acls that use certain mode bits, because it
487 int error = -EINVAL; 459 * places the burden on users to learn the rules about which bits one
488 struct nfs4_ace *ace, *ace2; 460 * particular server sets, without giving the user a lot of help--we return an
461 * error that could mean any number of different things. To make matters
462 * worse, the problematic bits might be introduced by some application that's
463 * automatically mapping from some other acl model.
464 *
465 * So wherever possible we accept anything, possibly erring on the side of
466 * denying more permissions than necessary.
467 *
468 * However we do reject *explicit* DENY's of a few bits representing
469 * permissions we could never deny:
470 */
489 471
490 ace = get_next_v4_ace(p, &n4acl->ace_head); 472static inline int check_deny(u32 mask, int isowner)
491 if (ace == NULL) 473{
492 goto out; 474 if (mask & (NFS4_ACE_READ_ATTRIBUTES | NFS4_ACE_READ_ACL))
493 while (ace2type(ace) == ACL_USER) { 475 return -EINVAL;
494 if (ace->type != NFS4_ACE_ACCESS_DENIED_ACE_TYPE) 476 if (!isowner)
495 goto out; 477 return 0;
496 if (*mask_ace && 478 if (mask & (NFS4_ACE_WRITE_ATTRIBUTES | NFS4_ACE_WRITE_ACL))
497 !MASK_EQUAL(ace->access_mask, (*mask_ace)->access_mask)) 479 return -EINVAL;
498 goto out; 480 return 0;
499 *mask_ace = ace;
500 ace = get_next_v4_ace(p, &n4acl->ace_head);
501 if (ace == NULL)
502 goto out;
503 if (ace->type != NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE)
504 goto out;
505 error = write_pace(ace, pacl, pace, ACL_USER, flags);
506 if (error < 0)
507 goto out;
508 error = -EINVAL;
509 ace2 = get_next_v4_ace(p, &n4acl->ace_head);
510 if (ace2 == NULL)
511 goto out;
512 if (!complementary_ace_pair(ace, ace2, flags))
513 goto out;
514 if ((*mask_ace)->flag != ace2->flag ||
515 !same_who(*mask_ace, ace2))
516 goto out;
517 ace = get_next_v4_ace(p, &n4acl->ace_head);
518 if (ace == NULL)
519 goto out;
520 }
521 error = 0;
522out:
523 return error;
524} 481}
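
check_deny() is the one place the conversion still refuses an ACL outright: an explicit DENY of bits the server could never enforce (reading attributes or the ACL, or -- for the owner -- writing attributes or the ACL) has to fail rather than be silently dropped. A minimal userspace restatement, simplified to one bit per class; the two flag values mirror the usual NFSv4 definitions but should be treated as assumptions here:

#include <stdio.h>

#define NFS4_ACE_READ_ATTRIBUTES 0x00000080
#define NFS4_ACE_WRITE_ACL       0x00010000

static int check_deny(unsigned int mask, int isowner)
{
        if (mask & NFS4_ACE_READ_ATTRIBUTES)
                return -1;      /* stat() can't be denied to anyone */
        if (isowner && (mask & NFS4_ACE_WRITE_ACL))
                return -1;      /* the owner can always set the ACL */
        return 0;
}

int main(void)
{
        printf("%d\n", check_deny(NFS4_ACE_READ_ATTRIBUTES, 0));  /* -1 */
        printf("%d\n", check_deny(NFS4_ACE_WRITE_ACL, 0));        /*  0 */
        printf("%d\n", check_deny(NFS4_ACE_WRITE_ACL, 1));        /* -1 */
        return 0;
}
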
525 482
526static inline int 483static struct posix_acl *
527group_obj_and_groups_from_v4(struct nfs4_acl *n4acl, struct list_head **p, 484posix_state_to_acl(struct posix_acl_state *state, unsigned int flags)
528 struct nfs4_ace **mask_ace,
529 struct posix_acl *pacl, struct posix_acl_entry **pace,
530 unsigned int flags)
531{ 485{
532 int error = -EINVAL; 486 struct posix_acl_entry *pace;
533 struct nfs4_ace *ace, *ace2; 487 struct posix_acl *pacl;
534 struct ace_container *ac; 488 int nace;
535 struct list_head group_l; 489 int i, error = 0;
536
537 INIT_LIST_HEAD(&group_l);
538 ace = list_entry(*p, struct nfs4_ace, l_ace);
539
540 /* group owner (mask and allow aces) */
541 490
542 if (pacl->a_count != 3) { 491 nace = 4 + state->users->n + state->groups->n;
543 /* then the group owner should be preceded by mask */ 492 pacl = posix_acl_alloc(nace, GFP_KERNEL);
544 if (ace->type != NFS4_ACE_ACCESS_DENIED_ACE_TYPE) 493 if (!pacl)
545 goto out; 494 return ERR_PTR(-ENOMEM);
546 if (*mask_ace &&
547 !MASK_EQUAL(ace->access_mask, (*mask_ace)->access_mask))
548 goto out;
549 *mask_ace = ace;
550 ace = get_next_v4_ace(p, &n4acl->ace_head);
551 if (ace == NULL)
552 goto out;
553 495
554 if ((*mask_ace)->flag != ace->flag || !same_who(*mask_ace, ace)) 496 pace = pacl->a_entries;
555 goto out; 497 pace->e_tag = ACL_USER_OBJ;
498 error = check_deny(state->owner.deny, 1);
499 if (error)
500 goto out_err;
501 low_mode_from_nfs4(state->owner.allow, &pace->e_perm, flags);
502 pace->e_id = ACL_UNDEFINED_ID;
503
504 for (i=0; i < state->users->n; i++) {
505 pace++;
506 pace->e_tag = ACL_USER;
507 error = check_deny(state->users->aces[i].perms.deny, 0);
508 if (error)
509 goto out_err;
510 low_mode_from_nfs4(state->users->aces[i].perms.allow,
511 &pace->e_perm, flags);
512 pace->e_id = state->users->aces[i].uid;
513 add_to_mask(state, &state->users->aces[i].perms);
556 } 514 }
557 515
558 if (ace2type(ace) != ACL_GROUP_OBJ) 516 pace++;
559 goto out; 517 pace->e_tag = ACL_GROUP_OBJ;
560 518 error = check_deny(state->group.deny, 0);
561 ac = kmalloc(sizeof(*ac), GFP_KERNEL); 519 if (error)
562 error = -ENOMEM; 520 goto out_err;
563 if (ac == NULL) 521 low_mode_from_nfs4(state->group.allow, &pace->e_perm, flags);
564 goto out; 522 pace->e_id = ACL_UNDEFINED_ID;
565 ac->ace = ace; 523 add_to_mask(state, &state->group);
566 list_add_tail(&ac->ace_l, &group_l); 524
567 525 for (i=0; i < state->groups->n; i++) {
568 error = -EINVAL; 526 pace++;
569 if (ace->type != NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE) 527 pace->e_tag = ACL_GROUP;
570 goto out; 528 error = check_deny(state->groups->aces[i].perms.deny, 0);
571 529 if (error)
572 error = write_pace(ace, pacl, pace, ACL_GROUP_OBJ, flags); 530 goto out_err;
573 if (error < 0) 531 low_mode_from_nfs4(state->groups->aces[i].perms.allow,
574 goto out; 532 &pace->e_perm, flags);
575 533 pace->e_id = state->groups->aces[i].uid;
576 error = -EINVAL; 534 add_to_mask(state, &state->groups->aces[i].perms);
577 ace = get_next_v4_ace(p, &n4acl->ace_head); 535 }
578 if (ace == NULL)
579 goto out;
580
581 /* groups (mask and allow aces) */
582
583 while (ace2type(ace) == ACL_GROUP) {
584 if (*mask_ace == NULL)
585 goto out;
586
587 if (ace->type != NFS4_ACE_ACCESS_DENIED_ACE_TYPE ||
588 !MASK_EQUAL(ace->access_mask, (*mask_ace)->access_mask))
589 goto out;
590 *mask_ace = ace;
591 536
592 ace = get_next_v4_ace(p, &n4acl->ace_head); 537 pace++;
593 if (ace == NULL) 538 pace->e_tag = ACL_MASK;
594 goto out; 539 low_mode_from_nfs4(state->mask.allow, &pace->e_perm, flags);
595 ac = kmalloc(sizeof(*ac), GFP_KERNEL); 540 pace->e_id = ACL_UNDEFINED_ID;
596 error = -ENOMEM;
597 if (ac == NULL)
598 goto out;
599 error = -EINVAL;
600 if (ace->type != NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE ||
601 !same_who(ace, *mask_ace))
602 goto out;
603 541
604 ac->ace = ace; 542 pace++;
605 list_add_tail(&ac->ace_l, &group_l); 543 pace->e_tag = ACL_OTHER;
544 error = check_deny(state->other.deny, 0);
545 if (error)
546 goto out_err;
547 low_mode_from_nfs4(state->other.allow, &pace->e_perm, flags);
548 pace->e_id = ACL_UNDEFINED_ID;
606 549
607 error = write_pace(ace, pacl, pace, ACL_GROUP, flags); 550 return pacl;
608 if (error < 0) 551out_err:
609 goto out; 552 posix_acl_release(pacl);
610 error = -EINVAL; 553 return ERR_PTR(error);
611 ace = get_next_v4_ace(p, &n4acl->ace_head); 554}
612 if (ace == NULL)
613 goto out;
614 }
615 555
616 /* group owner (deny ace) */ 556static inline void allow_bits(struct posix_ace_state *astate, u32 mask)
557{
558 /* Allow all bits in the mask not already denied: */
559 astate->allow |= mask & ~astate->deny;
560}
617 561
618 if (ace2type(ace) != ACL_GROUP_OBJ) 562static inline void deny_bits(struct posix_ace_state *astate, u32 mask)
619 goto out; 563{
620 ac = list_entry(group_l.next, struct ace_container, ace_l); 564 /* Deny all bits in the mask not already allowed: */
621 ace2 = ac->ace; 565 astate->deny |= mask & ~astate->allow;
622 if (!complementary_ace_pair(ace2, ace, flags)) 566}
623 goto out;
624 list_del(group_l.next);
625 kfree(ac);
626 567
627 /* groups (deny aces) */ 568static int find_uid(struct posix_acl_state *state, struct posix_ace_state_array *a, uid_t uid)
569{
570 int i;
628 571
629 while (!list_empty(&group_l)) { 572 for (i = 0; i < a->n; i++)
630 ace = get_next_v4_ace(p, &n4acl->ace_head); 573 if (a->aces[i].uid == uid)
631 if (ace == NULL) 574 return i;
632 goto out; 575 /* Not found: */
633 if (ace2type(ace) != ACL_GROUP) 576 a->n++;
634 goto out; 577 a->aces[i].uid = uid;
635 ac = list_entry(group_l.next, struct ace_container, ace_l); 578 a->aces[i].perms.allow = state->everyone.allow;
636 ace2 = ac->ace; 579 a->aces[i].perms.deny = state->everyone.deny;
637 if (!complementary_ace_pair(ace2, ace, flags))
638 goto out;
639 list_del(group_l.next);
640 kfree(ac);
641 }
642 580
643 ace = get_next_v4_ace(p, &n4acl->ace_head); 581 return i;
644 if (ace == NULL)
645 goto out;
646 if (ace2type(ace) != ACL_OTHER)
647 goto out;
648 error = 0;
649out:
650 while (!list_empty(&group_l)) {
651 ac = list_entry(group_l.next, struct ace_container, ace_l);
652 list_del(group_l.next);
653 kfree(ac);
654 }
655 return error;
656} 582}
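
find_uid() is a find-or-insert: the first ACE naming a given principal allocates a slot, and -- crucially -- the fresh slot starts from the EVERYONE@ bits accumulated so far, because any earlier EVERYONE@ ACE applied to that principal too. No bounds check is needed since init_state() sized the array at one slot per ACE. A compressed, runnable illustration:

#include <stdio.h>

struct perms { unsigned allow, deny; };
struct uace  { unsigned uid; struct perms perms; };

static struct uace aces[4];
static int n;
static struct perms everyone = { 04, 0 }; /* EVERYONE@ already ALLOWed READ */

static int find_uid(unsigned uid)
{
        int i;

        for (i = 0; i < n; i++)
                if (aces[i].uid == uid)
                        return i;
        n++;                            /* not found: append */
        aces[i].uid = uid;
        aces[i].perms = everyone;       /* inherit EVERYONE@ bits so far */
        return i;
}

int main(void)
{
        int i = find_uid(1000);

        printf("allow=%o deny=%o\n", aces[i].perms.allow, aces[i].perms.deny);
        return 0;                       /* prints allow=4 deny=0 */
}
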
657 583
658static inline int 584static void deny_bits_array(struct posix_ace_state_array *a, u32 mask)
659mask_from_v4(struct nfs4_acl *n4acl, struct list_head **p,
660 struct nfs4_ace **mask_ace,
661 struct posix_acl *pacl, struct posix_acl_entry **pace,
662 unsigned int flags)
663{ 585{
664 int error = -EINVAL; 586 int i;
665 struct nfs4_ace *ace;
666 587
667 ace = list_entry(*p, struct nfs4_ace, l_ace); 588 for (i=0; i < a->n; i++)
668 if (pacl->a_count != 3) { 589 deny_bits(&a->aces[i].perms, mask);
669 if (*mask_ace == NULL)
670 goto out;
671 (*mask_ace)->access_mask = deny_mask((*mask_ace)->access_mask, flags);
672 write_pace(*mask_ace, pacl, pace, ACL_MASK, flags);
673 }
674 error = 0;
675out:
676 return error;
677} 590}
678 591
679static inline int 592static void allow_bits_array(struct posix_ace_state_array *a, u32 mask)
680other_from_v4(struct nfs4_acl *n4acl, struct list_head **p,
681 struct posix_acl *pacl, struct posix_acl_entry **pace,
682 unsigned int flags)
683{ 593{
684 int error = -EINVAL; 594 int i;
685 struct nfs4_ace *ace, *ace2;
686 595
687 ace = list_entry(*p, struct nfs4_ace, l_ace); 596 for (i=0; i < a->n; i++)
688 if (ace->type != NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE) 597 allow_bits(&a->aces[i].perms, mask);
689 goto out;
690 error = write_pace(ace, pacl, pace, ACL_OTHER, flags);
691 if (error < 0)
692 goto out;
693 error = -EINVAL;
694 ace2 = get_next_v4_ace(p, &n4acl->ace_head);
695 if (ace2 == NULL)
696 goto out;
697 if (!complementary_ace_pair(ace, ace2, flags))
698 goto out;
699 error = 0;
700out:
701 return error;
702} 598}
703 599
704static int 600static void process_one_v4_ace(struct posix_acl_state *state,
705calculate_posix_ace_count(struct nfs4_acl *n4acl) 601 struct nfs4_ace *ace)
706{ 602{
707 if (n4acl->naces == 6) /* owner, owner group, and other only */ 603 u32 mask = ace->access_mask;
708 return 3; 604 int i;
709 else { /* Otherwise there must be a mask entry. */ 605
710 /* Also, the remaining entries are for named users and 606 switch (ace2type(ace)) {
711 * groups, and come in threes (mask, allow, deny): */ 607 case ACL_USER_OBJ:
712 if (n4acl->naces < 7) 608 if (ace->type == NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE) {
713 return -EINVAL; 609 allow_bits(&state->owner, mask);
714 if ((n4acl->naces - 7) % 3) 610 } else {
715 return -EINVAL; 611 deny_bits(&state->owner, mask);
716 return 4 + (n4acl->naces - 7)/3; 612 }
613 break;
614 case ACL_USER:
615 i = find_uid(state, state->users, ace->who);
616 if (ace->type == NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE) {
617 allow_bits(&state->users->aces[i].perms, mask);
618 } else {
619 deny_bits(&state->users->aces[i].perms, mask);
620 mask = state->users->aces[i].perms.deny;
621 deny_bits(&state->owner, mask);
622 }
623 break;
624 case ACL_GROUP_OBJ:
625 if (ace->type == NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE) {
626 allow_bits(&state->group, mask);
627 } else {
628 deny_bits(&state->group, mask);
629 mask = state->group.deny;
630 deny_bits(&state->owner, mask);
631 deny_bits(&state->everyone, mask);
632 deny_bits_array(state->users, mask);
633 deny_bits_array(state->groups, mask);
634 }
635 break;
636 case ACL_GROUP:
637 i = find_uid(state, state->groups, ace->who);
638 if (ace->type == NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE) {
639 allow_bits(&state->groups->aces[i].perms, mask);
640 } else {
641 deny_bits(&state->groups->aces[i].perms, mask);
642 mask = state->groups->aces[i].perms.deny;
643 deny_bits(&state->owner, mask);
644 deny_bits(&state->group, mask);
645 deny_bits(&state->everyone, mask);
646 deny_bits_array(state->users, mask);
647 deny_bits_array(state->groups, mask);
648 }
649 break;
650 case ACL_OTHER:
651 if (ace->type == NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE) {
652 allow_bits(&state->owner, mask);
653 allow_bits(&state->group, mask);
654 allow_bits(&state->other, mask);
655 allow_bits(&state->everyone, mask);
656 allow_bits_array(state->users, mask);
657 allow_bits_array(state->groups, mask);
658 } else {
659 deny_bits(&state->owner, mask);
660 deny_bits(&state->group, mask);
661 deny_bits(&state->other, mask);
662 deny_bits(&state->everyone, mask);
663 deny_bits_array(state->users, mask);
664 deny_bits_array(state->groups, mask);
665 }
717 } 666 }
718} 667}
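
allow_bits()/deny_bits() are how the new code emulates NFSv4's first-match-wins evaluation with plain accumulators: a bit only takes effect if no earlier ACE already claimed it, which is why process_one_v4_ace() can walk the list once, in order. A runnable demonstration of the order dependence:

#include <stdio.h>

struct ace_state { unsigned allow, deny; };

static void allow_bits(struct ace_state *s, unsigned mask)
{
        s->allow |= mask & ~s->deny;    /* only bits not already denied  */
}

static void deny_bits(struct ace_state *s, unsigned mask)
{
        s->deny |= mask & ~s->allow;    /* only bits not already allowed */
}

int main(void)
{
        struct ace_state s = { 0, 0 };

        deny_bits(&s, 02);              /* DENY WRITE comes first...      */
        allow_bits(&s, 06);             /* ...so a later ALLOW READ|WRITE */
        printf("allow=%o deny=%o\n", s.allow, s.deny); /* allow=4 deny=2 */
        return 0;
}

The DENY cases above additionally broadcast the newly-denied bits into the owner, EVERYONE@ and every named entry via deny_bits_array(), since a POSIX ACL has no way to express a denial that applies only to the (unknown) members of a group.
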
719 668
720
721static struct posix_acl * 669static struct posix_acl *
722_nfsv4_to_posix_one(struct nfs4_acl *n4acl, unsigned int flags) 670_nfsv4_to_posix_one(struct nfs4_acl *n4acl, unsigned int flags)
723{ 671{
672 struct posix_acl_state state;
724 struct posix_acl *pacl; 673 struct posix_acl *pacl;
725 int error = -EINVAL, nace = 0; 674 struct nfs4_ace *ace;
726 struct list_head *p; 675 int ret;
727 struct nfs4_ace *mask_ace = NULL;
728 struct posix_acl_entry *pace;
729
730 nace = calculate_posix_ace_count(n4acl);
731 if (nace < 0)
732 goto out_err;
733
734 pacl = posix_acl_alloc(nace, GFP_KERNEL);
735 error = -ENOMEM;
736 if (pacl == NULL)
737 goto out_err;
738
739 pace = &pacl->a_entries[0];
740 p = &n4acl->ace_head;
741
742 error = user_obj_from_v4(n4acl, &p, pacl, &pace, flags);
743 if (error)
744 goto out_acl;
745
746 error = users_from_v4(n4acl, &p, &mask_ace, pacl, &pace, flags);
747 if (error)
748 goto out_acl;
749 676
750 error = group_obj_and_groups_from_v4(n4acl, &p, &mask_ace, pacl, &pace, 677 ret = init_state(&state, n4acl->naces);
751 flags); 678 if (ret)
752 if (error) 679 return ERR_PTR(ret);
753 goto out_acl;
754 680
755 error = mask_from_v4(n4acl, &p, &mask_ace, pacl, &pace, flags); 681 list_for_each_entry(ace, &n4acl->ace_head, l_ace)
756 if (error) 682 process_one_v4_ace(&state, ace);
757 goto out_acl;
758 error = other_from_v4(n4acl, &p, pacl, &pace, flags);
759 if (error)
760 goto out_acl;
761 683
762 error = -EINVAL; 684 pacl = posix_state_to_acl(&state, flags);
763 if (p->next != &n4acl->ace_head)
764 goto out_acl;
765 if (pace != pacl->a_entries + pacl->a_count)
766 goto out_acl;
767 685
768 sort_pacl(pacl); 686 free_state(&state);
769 687
770 return pacl; 688 if (!IS_ERR(pacl))
771out_acl: 689 sort_pacl(pacl);
772 posix_acl_release(pacl);
773out_err:
774 pacl = ERR_PTR(error);
775 return pacl; 690 return pacl;
776} 691}
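
Taken together, the rewrite replaces the old single-pass parser -- which insisted the ACEs arrive in one canonical order (owner pair, named users, mask/group runs, other pair) and returned -EINVAL otherwise -- with an accumulate-then-emit pipeline that accepts ACEs in any order. sort_pacl() still runs afterwards because named entries are emitted in discovery order, not the sorted order POSIX ACLs require. Condensed from _nfsv4_to_posix_one() above:

init_state(&state, n4acl->naces);                 /* worst-case sizing   */
list_for_each_entry(ace, &n4acl->ace_head, l_ace)
        process_one_v4_ace(&state, ace);          /* pass 1: accumulate  */
pacl = posix_state_to_acl(&state, flags);         /* pass 2: emit + mask */
free_state(&state);
if (!IS_ERR(pacl))
        sort_pacl(pacl);                          /* canonical ordering  */
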
777 692
@@ -785,22 +700,41 @@ nfs4_acl_split(struct nfs4_acl *acl, struct nfs4_acl *dacl)
785 list_for_each_safe(h, n, &acl->ace_head) { 700 list_for_each_safe(h, n, &acl->ace_head) {
786 ace = list_entry(h, struct nfs4_ace, l_ace); 701 ace = list_entry(h, struct nfs4_ace, l_ace);
787 702
788 if ((ace->flag & NFS4_INHERITANCE_FLAGS) 703 if (ace->type != NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE &&
789 != NFS4_INHERITANCE_FLAGS) 704 ace->type != NFS4_ACE_ACCESS_DENIED_ACE_TYPE)
790 continue; 705 return -EINVAL;
791 706
792 error = nfs4_acl_add_ace(dacl, ace->type, ace->flag, 707 if (ace->flag & ~NFS4_SUPPORTED_FLAGS)
793 ace->access_mask, ace->whotype, ace->who); 708 return -EINVAL;
794 if (error < 0)
795 goto out;
796 709
797 list_del(h); 710 switch (ace->flag & NFS4_INHERITANCE_FLAGS) {
798 kfree(ace); 711 case 0:
799 acl->naces--; 712 /* Leave this ace in the effective acl: */
713 continue;
714 case NFS4_INHERITANCE_FLAGS:
715 /* Add this ace to the default acl and remove it
716 * from the effective acl: */
717 error = nfs4_acl_add_ace(dacl, ace->type, ace->flag,
718 ace->access_mask, ace->whotype, ace->who);
719 if (error)
720 return error;
721 list_del(h);
722 kfree(ace);
723 acl->naces--;
724 break;
725 case NFS4_INHERITANCE_FLAGS & ~NFS4_ACE_INHERIT_ONLY_ACE:
726 /* Add this ace to the default, but leave it in
727 * the effective acl as well: */
728 error = nfs4_acl_add_ace(dacl, ace->type, ace->flag,
729 ace->access_mask, ace->whotype, ace->who);
730 if (error)
731 return error;
732 break;
733 default:
734 return -EINVAL;
735 }
800 } 736 }
801 737 return 0;
802out:
803 return error;
804} 738}
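
The rewritten nfs4_acl_split() makes the inheritance handling explicit: an ACE's inheritance bits now select one of exactly three legal dispositions, and everything else is rejected up front instead of being silently skipped as before. A runnable sketch of the classification; the flag values mirror the usual NFSv4 definitions but are assumptions here:

#include <stdio.h>

#define FILE_INHERIT  0x00000001
#define DIR_INHERIT   0x00000002
#define INHERIT_ONLY  0x00000008
#define INHERITANCE   (FILE_INHERIT | DIR_INHERIT | INHERIT_ONLY)

static const char *classify(unsigned int flag)
{
        switch (flag & INHERITANCE) {
        case 0:
                return "effective acl only";
        case INHERITANCE:
                return "default acl only (moved)";
        case INHERITANCE & ~INHERIT_ONLY:
                return "both effective and default";
        default:
                return "rejected: -EINVAL";
        }
}

int main(void)
{
        printf("%s\n", classify(0));
        printf("%s\n", classify(INHERITANCE));
        printf("%s\n", classify(FILE_INHERIT | DIR_INHERIT));
        printf("%s\n", classify(FILE_INHERIT));   /* partial set: rejected */
        return 0;
}
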
805 739
806static short 740static short
@@ -930,23 +864,6 @@ nfs4_acl_write_who(int who, char *p)
930 return -1; 864 return -1;
931} 865}
932 866
933static inline int
934match_who(struct nfs4_ace *ace, uid_t owner, gid_t group, uid_t who)
935{
936 switch (ace->whotype) {
937 case NFS4_ACL_WHO_NAMED:
938 return who == ace->who;
939 case NFS4_ACL_WHO_OWNER:
940 return who == owner;
941 case NFS4_ACL_WHO_GROUP:
942 return who == group;
943 case NFS4_ACL_WHO_EVERYONE:
944 return 1;
945 default:
946 return 0;
947 }
948}
949
950EXPORT_SYMBOL(nfs4_acl_new); 867EXPORT_SYMBOL(nfs4_acl_new);
951EXPORT_SYMBOL(nfs4_acl_free); 868EXPORT_SYMBOL(nfs4_acl_free);
952EXPORT_SYMBOL(nfs4_acl_add_ace); 869EXPORT_SYMBOL(nfs4_acl_add_ace);
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index f6ca9fb3fc63..f57655a7a2b6 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -85,8 +85,8 @@ enum nfs_cb_opnum4 {
85/* 85/*
86* Generic encode routines from fs/nfs/nfs4xdr.c 86* Generic encode routines from fs/nfs/nfs4xdr.c
87*/ 87*/
88static inline u32 * 88static inline __be32 *
89xdr_writemem(u32 *p, const void *ptr, int nbytes) 89xdr_writemem(__be32 *p, const void *ptr, int nbytes)
90{ 90{
91 int tmp = XDR_QUADLEN(nbytes); 91 int tmp = XDR_QUADLEN(nbytes);
92 if (!tmp) 92 if (!tmp)
@@ -205,7 +205,7 @@ nfs_cb_stat_to_errno(int stat)
205static int 205static int
206encode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr) 206encode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr)
207{ 207{
208 u32 * p; 208 __be32 * p;
209 209
210 RESERVE_SPACE(16); 210 RESERVE_SPACE(16);
211 WRITE32(0); /* tag length is always 0 */ 211 WRITE32(0); /* tag length is always 0 */
@@ -218,7 +218,7 @@ encode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr)
218static int 218static int
219encode_cb_recall(struct xdr_stream *xdr, struct nfs4_cb_recall *cb_rec) 219encode_cb_recall(struct xdr_stream *xdr, struct nfs4_cb_recall *cb_rec)
220{ 220{
221 u32 *p; 221 __be32 *p;
222 int len = cb_rec->cbr_fhlen; 222 int len = cb_rec->cbr_fhlen;
223 223
224 RESERVE_SPACE(12+sizeof(cb_rec->cbr_stateid) + len); 224 RESERVE_SPACE(12+sizeof(cb_rec->cbr_stateid) + len);
@@ -231,7 +231,7 @@ encode_cb_recall(struct xdr_stream *xdr, struct nfs4_cb_recall *cb_rec)
231} 231}
232 232
233static int 233static int
234nfs4_xdr_enc_cb_null(struct rpc_rqst *req, u32 *p) 234nfs4_xdr_enc_cb_null(struct rpc_rqst *req, __be32 *p)
235{ 235{
236 struct xdr_stream xdrs, *xdr = &xdrs; 236 struct xdr_stream xdrs, *xdr = &xdrs;
237 237
@@ -241,7 +241,7 @@ nfs4_xdr_enc_cb_null(struct rpc_rqst *req, u32 *p)
241} 241}
242 242
243static int 243static int
244nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, u32 *p, struct nfs4_cb_recall *args) 244nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, __be32 *p, struct nfs4_cb_recall *args)
245{ 245{
246 struct xdr_stream xdr; 246 struct xdr_stream xdr;
247 struct nfs4_cb_compound_hdr hdr = { 247 struct nfs4_cb_compound_hdr hdr = {
@@ -257,7 +257,7 @@ nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, u32 *p, struct nfs4_cb_recall *args
257 257
258static int 258static int
259decode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr){ 259decode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr){
260 u32 *p; 260 __be32 *p;
261 261
262 READ_BUF(8); 262 READ_BUF(8);
263 READ32(hdr->status); 263 READ32(hdr->status);
@@ -272,7 +272,7 @@ decode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr)
272static int 272static int
273decode_cb_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected) 273decode_cb_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected)
274{ 274{
275 u32 *p; 275 __be32 *p;
276 u32 op; 276 u32 op;
277 int32_t nfserr; 277 int32_t nfserr;
278 278
@@ -291,13 +291,13 @@ decode_cb_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected)
291} 291}
292 292
293static int 293static int
294nfs4_xdr_dec_cb_null(struct rpc_rqst *req, u32 *p) 294nfs4_xdr_dec_cb_null(struct rpc_rqst *req, __be32 *p)
295{ 295{
296 return 0; 296 return 0;
297} 297}
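
The u32 -> __be32 churn through this callback XDR code is type annotation, not a behavior change: __be32 marks values that are big-endian on the wire, and sparse ("make C=2 CF=-D__CHECK_ENDIAN__") then flags any mixing with host-order integers that does not go through the byte-order helpers. In kernel context:

#include <linux/types.h>        /* u32, __be32 */
#include <asm/byteorder.h>      /* cpu_to_be32() and friends */

static void endian_example(void)
{
        u32 host = 1024;
        __be32 wire;

        wire = cpu_to_be32(host);       /* fine: explicit conversion      */
        /* wire = host; */              /* sparse: incorrect type in
                                           assignment (different base
                                           types)                         */
}
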
298 298
299static int 299static int
300nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, u32 *p) 300nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, __be32 *p)
301{ 301{
302 struct xdr_stream xdr; 302 struct xdr_stream xdr;
303 struct nfs4_cb_compound_hdr hdr; 303 struct nfs4_cb_compound_hdr hdr;
@@ -421,7 +421,7 @@ nfsd4_probe_callback(struct nfs4_client *clp)
421 421
422 /* Create RPC client */ 422 /* Create RPC client */
423 cb->cb_client = rpc_create(&args); 423 cb->cb_client = rpc_create(&args);
424 if (!cb->cb_client) { 424 if (IS_ERR(cb->cb_client)) {
425 dprintk("NFSD: couldn't create callback client\n"); 425 dprintk("NFSD: couldn't create callback client\n");
426 goto out_err; 426 goto out_err;
427 } 427 }
@@ -448,10 +448,10 @@ nfsd4_probe_callback(struct nfs4_client *clp)
448out_rpciod: 448out_rpciod:
449 atomic_dec(&clp->cl_count); 449 atomic_dec(&clp->cl_count);
450 rpciod_down(); 450 rpciod_down();
451 cb->cb_client = NULL;
452out_clnt: 451out_clnt:
453 rpc_shutdown_client(cb->cb_client); 452 rpc_shutdown_client(cb->cb_client);
454out_err: 453out_err:
454 cb->cb_client = NULL;
455 dprintk("NFSD: warning: no callback path to client %.*s\n", 455 dprintk("NFSD: warning: no callback path to client %.*s\n",
456 (int)clp->cl_name.len, clp->cl_name.data); 456 (int)clp->cl_name.len, clp->cl_name.data);
457} 457}
@@ -461,7 +461,7 @@ nfs4_cb_null(struct rpc_task *task, void *dummy)
461{ 461{
462 struct nfs4_client *clp = (struct nfs4_client *)task->tk_msg.rpc_argp; 462 struct nfs4_client *clp = (struct nfs4_client *)task->tk_msg.rpc_argp;
463 struct nfs4_callback *cb = &clp->cl_callback; 463 struct nfs4_callback *cb = &clp->cl_callback;
464 u32 addr = htonl(cb->cb_addr); 464 __be32 addr = htonl(cb->cb_addr);
465 465
466 dprintk("NFSD: nfs4_cb_null task->tk_status %d\n", task->tk_status); 466 dprintk("NFSD: nfs4_cb_null task->tk_status %d\n", task->tk_status);
467 467
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 15ded7a30a72..0a7bbdc4a10a 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -67,32 +67,32 @@ fh_dup2(struct svc_fh *dst, struct svc_fh *src)
67 *dst = *src; 67 *dst = *src;
68} 68}
69 69
70static int 70static __be32
71do_open_permission(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open) 71do_open_permission(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open, int accmode)
72{ 72{
73 int accmode, status; 73 __be32 status;
74 74
75 if (open->op_truncate && 75 if (open->op_truncate &&
76 !(open->op_share_access & NFS4_SHARE_ACCESS_WRITE)) 76 !(open->op_share_access & NFS4_SHARE_ACCESS_WRITE))
77 return nfserr_inval; 77 return nfserr_inval;
78 78
79 accmode = MAY_NOP;
80 if (open->op_share_access & NFS4_SHARE_ACCESS_READ) 79 if (open->op_share_access & NFS4_SHARE_ACCESS_READ)
81 accmode = MAY_READ; 80 accmode |= MAY_READ;
82 if (open->op_share_deny & NFS4_SHARE_ACCESS_WRITE) 81 if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE)
83 accmode |= (MAY_WRITE | MAY_TRUNC); 82 accmode |= (MAY_WRITE | MAY_TRUNC);
84 accmode |= MAY_OWNER_OVERRIDE; 83 if (open->op_share_deny & NFS4_SHARE_DENY_WRITE)
84 accmode |= MAY_WRITE;
85 85
86 status = fh_verify(rqstp, current_fh, S_IFREG, accmode); 86 status = fh_verify(rqstp, current_fh, S_IFREG, accmode);
87 87
88 return status; 88 return status;
89} 89}
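
Besides the type change, this hunk fixes a real bug: the old code tested op_share_deny where it meant op_share_access (and against the ACCESS constant rather than the DENY one), assigned instead of OR-ing MAY_READ, and granted MAY_OWNER_OVERRIDE unconditionally. The caller now supplies the base mode, so only reclaim opens get the owner override, and denying other writers correctly requires write permission yourself. The new mapping, restated compactly with placeholder constant values:

#define SHARE_ACCESS_READ  1
#define SHARE_ACCESS_WRITE 2
#define SHARE_DENY_WRITE   2
#define MAY_WRITE  0x01
#define MAY_READ   0x02
#define MAY_TRUNC  0x10

static int open_accmode(unsigned access, unsigned deny, int base)
{
        int accmode = base;     /* MAY_NOP from do_open_lookup(),
                                   MAY_OWNER_OVERRIDE from do_open_fhandle() */

        if (access & SHARE_ACCESS_READ)
                accmode |= MAY_READ;
        if (access & SHARE_ACCESS_WRITE)
                accmode |= MAY_WRITE | MAY_TRUNC;
        if (deny & SHARE_DENY_WRITE)    /* denying other writers needs
                                           write permission yourself */
                accmode |= MAY_WRITE;
        return accmode;
}
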
90 90
91static int 91static __be32
92do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open) 92do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open)
93{ 93{
94 struct svc_fh resfh; 94 struct svc_fh resfh;
95 int status; 95 __be32 status;
96 96
97 fh_init(&resfh, NFS4_FHSIZE); 97 fh_init(&resfh, NFS4_FHSIZE);
98 open->op_truncate = 0; 98 open->op_truncate = 0;
@@ -124,17 +124,17 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o
124 &resfh.fh_handle.fh_base, 124 &resfh.fh_handle.fh_base,
125 resfh.fh_handle.fh_size); 125 resfh.fh_handle.fh_size);
126 126
127 status = do_open_permission(rqstp, current_fh, open); 127 status = do_open_permission(rqstp, current_fh, open, MAY_NOP);
128 } 128 }
129 129
130 fh_put(&resfh); 130 fh_put(&resfh);
131 return status; 131 return status;
132} 132}
133 133
134static int 134static __be32
135do_open_fhandle(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open) 135do_open_fhandle(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open)
136{ 136{
137 int status; 137 __be32 status;
138 138
139 /* Only reclaims from previously confirmed clients are valid */ 139 /* Only reclaims from previously confirmed clients are valid */
140 if ((status = nfs4_check_open_reclaim(&open->op_clientid))) 140 if ((status = nfs4_check_open_reclaim(&open->op_clientid)))
@@ -155,16 +155,16 @@ do_open_fhandle(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_
155 open->op_truncate = (open->op_iattr.ia_valid & ATTR_SIZE) && 155 open->op_truncate = (open->op_iattr.ia_valid & ATTR_SIZE) &&
156 (open->op_iattr.ia_size == 0); 156 (open->op_iattr.ia_size == 0);
157 157
158 status = do_open_permission(rqstp, current_fh, open); 158 status = do_open_permission(rqstp, current_fh, open, MAY_OWNER_OVERRIDE);
159 159
160 return status; 160 return status;
161} 161}
162 162
163 163
164static inline int 164static inline __be32
165nfsd4_open(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open, struct nfs4_stateowner **replay_owner) 165nfsd4_open(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open, struct nfs4_stateowner **replay_owner)
166{ 166{
167 int status; 167 __be32 status;
168 dprintk("NFSD: nfsd4_open filename %.*s op_stateowner %p\n", 168 dprintk("NFSD: nfsd4_open filename %.*s op_stateowner %p\n",
169 (int)open->op_fname.len, open->op_fname.data, 169 (int)open->op_fname.len, open->op_fname.data,
170 open->op_stateowner); 170 open->op_stateowner);
@@ -177,7 +177,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open
177 177
178 /* check seqid for replay. set nfs4_owner */ 178 /* check seqid for replay. set nfs4_owner */
179 status = nfsd4_process_open1(open); 179 status = nfsd4_process_open1(open);
180 if (status == NFSERR_REPLAY_ME) { 180 if (status == nfserr_replay_me) {
181 struct nfs4_replay *rp = &open->op_stateowner->so_replay; 181 struct nfs4_replay *rp = &open->op_stateowner->so_replay;
182 fh_put(current_fh); 182 fh_put(current_fh);
183 current_fh->fh_handle.fh_size = rp->rp_openfh_len; 183 current_fh->fh_handle.fh_size = rp->rp_openfh_len;
@@ -188,7 +188,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open
188 dprintk("nfsd4_open: replay failed" 188 dprintk("nfsd4_open: replay failed"
189 " restoring previous filehandle\n"); 189 " restoring previous filehandle\n");
190 else 190 else
191 status = NFSERR_REPLAY_ME; 191 status = nfserr_replay_me;
192 } 192 }
193 if (status) 193 if (status)
194 goto out; 194 goto out;
@@ -261,7 +261,7 @@ out:
261/* 261/*
262 * filehandle-manipulating ops. 262 * filehandle-manipulating ops.
263 */ 263 */
264static inline int 264static inline __be32
265nfsd4_getfh(struct svc_fh *current_fh, struct svc_fh **getfh) 265nfsd4_getfh(struct svc_fh *current_fh, struct svc_fh **getfh)
266{ 266{
267 if (!current_fh->fh_dentry) 267 if (!current_fh->fh_dentry)
@@ -271,7 +271,7 @@ nfsd4_getfh(struct svc_fh *current_fh, struct svc_fh **getfh)
271 return nfs_ok; 271 return nfs_ok;
272} 272}
273 273
274static inline int 274static inline __be32
275nfsd4_putfh(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_putfh *putfh) 275nfsd4_putfh(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_putfh *putfh)
276{ 276{
277 fh_put(current_fh); 277 fh_put(current_fh);
@@ -280,10 +280,10 @@ nfsd4_putfh(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_putf
280 return fh_verify(rqstp, current_fh, 0, MAY_NOP); 280 return fh_verify(rqstp, current_fh, 0, MAY_NOP);
281} 281}
282 282
283static inline int 283static inline __be32
284nfsd4_putrootfh(struct svc_rqst *rqstp, struct svc_fh *current_fh) 284nfsd4_putrootfh(struct svc_rqst *rqstp, struct svc_fh *current_fh)
285{ 285{
286 int status; 286 __be32 status;
287 287
288 fh_put(current_fh); 288 fh_put(current_fh);
289 status = exp_pseudoroot(rqstp->rq_client, current_fh, 289 status = exp_pseudoroot(rqstp->rq_client, current_fh,
@@ -291,7 +291,7 @@ nfsd4_putrootfh(struct svc_rqst *rqstp, struct svc_fh *current_fh)
291 return status; 291 return status;
292} 292}
293 293
294static inline int 294static inline __be32
295nfsd4_restorefh(struct svc_fh *current_fh, struct svc_fh *save_fh) 295nfsd4_restorefh(struct svc_fh *current_fh, struct svc_fh *save_fh)
296{ 296{
297 if (!save_fh->fh_dentry) 297 if (!save_fh->fh_dentry)
@@ -301,7 +301,7 @@ nfsd4_restorefh(struct svc_fh *current_fh, struct svc_fh *save_fh)
301 return nfs_ok; 301 return nfs_ok;
302} 302}
303 303
304static inline int 304static inline __be32
305nfsd4_savefh(struct svc_fh *current_fh, struct svc_fh *save_fh) 305nfsd4_savefh(struct svc_fh *current_fh, struct svc_fh *save_fh)
306{ 306{
307 if (!current_fh->fh_dentry) 307 if (!current_fh->fh_dentry)
@@ -314,7 +314,7 @@ nfsd4_savefh(struct svc_fh *current_fh, struct svc_fh *save_fh)
314/* 314/*
315 * misc nfsv4 ops 315 * misc nfsv4 ops
316 */ 316 */
317static inline int 317static inline __be32
318nfsd4_access(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_access *access) 318nfsd4_access(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_access *access)
319{ 319{
320 if (access->ac_req_access & ~NFS3_ACCESS_FULL) 320 if (access->ac_req_access & ~NFS3_ACCESS_FULL)
@@ -324,10 +324,10 @@ nfsd4_access(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_acc
324 return nfsd_access(rqstp, current_fh, &access->ac_resp_access, &access->ac_supported); 324 return nfsd_access(rqstp, current_fh, &access->ac_resp_access, &access->ac_supported);
325} 325}
326 326
327static inline int 327static inline __be32
328nfsd4_commit(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_commit *commit) 328nfsd4_commit(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_commit *commit)
329{ 329{
330 int status; 330 __be32 status;
331 331
332 u32 *p = (u32 *)commit->co_verf.data; 332 u32 *p = (u32 *)commit->co_verf.data;
333 *p++ = nfssvc_boot.tv_sec; 333 *p++ = nfssvc_boot.tv_sec;
@@ -339,11 +339,11 @@ nfsd4_commit(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_com
339 return status; 339 return status;
340} 340}
341 341
342static int 342static __be32
343nfsd4_create(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_create *create) 343nfsd4_create(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_create *create)
344{ 344{
345 struct svc_fh resfh; 345 struct svc_fh resfh;
346 int status; 346 __be32 status;
347 dev_t rdev; 347 dev_t rdev;
348 348
349 fh_init(&resfh, NFS4_FHSIZE); 349 fh_init(&resfh, NFS4_FHSIZE);
@@ -423,10 +423,10 @@ nfsd4_create(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_cre
423 return status; 423 return status;
424} 424}
425 425
426static inline int 426static inline __be32
427nfsd4_getattr(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_getattr *getattr) 427nfsd4_getattr(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_getattr *getattr)
428{ 428{
429 int status; 429 __be32 status;
430 430
431 status = fh_verify(rqstp, current_fh, 0, MAY_NOP); 431 status = fh_verify(rqstp, current_fh, 0, MAY_NOP);
432 if (status) 432 if (status)
@@ -442,11 +442,11 @@ nfsd4_getattr(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_ge
442 return nfs_ok; 442 return nfs_ok;
443} 443}
444 444
445static inline int 445static inline __be32
446nfsd4_link(struct svc_rqst *rqstp, struct svc_fh *current_fh, 446nfsd4_link(struct svc_rqst *rqstp, struct svc_fh *current_fh,
447 struct svc_fh *save_fh, struct nfsd4_link *link) 447 struct svc_fh *save_fh, struct nfsd4_link *link)
448{ 448{
449 int status = nfserr_nofilehandle; 449 __be32 status = nfserr_nofilehandle;
450 450
451 if (!save_fh->fh_dentry) 451 if (!save_fh->fh_dentry)
452 return status; 452 return status;
@@ -456,11 +456,11 @@ nfsd4_link(struct svc_rqst *rqstp, struct svc_fh *current_fh,
456 return status; 456 return status;
457} 457}
458 458
459static int 459static __be32
460nfsd4_lookupp(struct svc_rqst *rqstp, struct svc_fh *current_fh) 460nfsd4_lookupp(struct svc_rqst *rqstp, struct svc_fh *current_fh)
461{ 461{
462 struct svc_fh tmp_fh; 462 struct svc_fh tmp_fh;
463 int ret; 463 __be32 ret;
464 464
465 fh_init(&tmp_fh, NFS4_FHSIZE); 465 fh_init(&tmp_fh, NFS4_FHSIZE);
466 if((ret = exp_pseudoroot(rqstp->rq_client, &tmp_fh, 466 if((ret = exp_pseudoroot(rqstp->rq_client, &tmp_fh,
@@ -474,16 +474,16 @@ nfsd4_lookupp(struct svc_rqst *rqstp, struct svc_fh *current_fh)
474 return nfsd_lookup(rqstp, current_fh, "..", 2, current_fh); 474 return nfsd_lookup(rqstp, current_fh, "..", 2, current_fh);
475} 475}
476 476
477static inline int 477static inline __be32
478nfsd4_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lookup *lookup) 478nfsd4_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lookup *lookup)
479{ 479{
480 return nfsd_lookup(rqstp, current_fh, lookup->lo_name, lookup->lo_len, current_fh); 480 return nfsd_lookup(rqstp, current_fh, lookup->lo_name, lookup->lo_len, current_fh);
481} 481}
482 482
483static inline int 483static inline __be32
484nfsd4_read(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_read *read) 484nfsd4_read(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_read *read)
485{ 485{
486 int status; 486 __be32 status;
487 487
488 /* no need to check permission - this will be done in nfsd_read() */ 488 /* no need to check permission - this will be done in nfsd_read() */
489 489
@@ -508,7 +508,7 @@ out:
508 return status; 508 return status;
509} 509}
510 510
511static inline int 511static inline __be32
512nfsd4_readdir(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_readdir *readdir) 512nfsd4_readdir(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_readdir *readdir)
513{ 513{
514 u64 cookie = readdir->rd_cookie; 514 u64 cookie = readdir->rd_cookie;
@@ -531,7 +531,7 @@ nfsd4_readdir(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_re
531 return nfs_ok; 531 return nfs_ok;
532} 532}
533 533
534static inline int 534static inline __be32
535nfsd4_readlink(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_readlink *readlink) 535nfsd4_readlink(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_readlink *readlink)
536{ 536{
537 readlink->rl_rqstp = rqstp; 537 readlink->rl_rqstp = rqstp;
@@ -539,10 +539,10 @@ nfsd4_readlink(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_r
539 return nfs_ok; 539 return nfs_ok;
540} 540}
541 541
542static inline int 542static inline __be32
543nfsd4_remove(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_remove *remove) 543nfsd4_remove(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_remove *remove)
544{ 544{
545 int status; 545 __be32 status;
546 546
547 if (nfs4_in_grace()) 547 if (nfs4_in_grace())
548 return nfserr_grace; 548 return nfserr_grace;
@@ -556,11 +556,11 @@ nfsd4_remove(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_rem
556 return status; 556 return status;
557} 557}
558 558
559static inline int 559static inline __be32
560nfsd4_rename(struct svc_rqst *rqstp, struct svc_fh *current_fh, 560nfsd4_rename(struct svc_rqst *rqstp, struct svc_fh *current_fh,
561 struct svc_fh *save_fh, struct nfsd4_rename *rename) 561 struct svc_fh *save_fh, struct nfsd4_rename *rename)
562{ 562{
563 int status = nfserr_nofilehandle; 563 __be32 status = nfserr_nofilehandle;
564 564
565 if (!save_fh->fh_dentry) 565 if (!save_fh->fh_dentry)
566 return status; 566 return status;
@@ -589,10 +589,10 @@ nfsd4_rename(struct svc_rqst *rqstp, struct svc_fh *current_fh,
589 return status; 589 return status;
590} 590}
591 591
592static inline int 592static inline __be32
593nfsd4_setattr(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_setattr *setattr) 593nfsd4_setattr(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_setattr *setattr)
594{ 594{
595 int status = nfs_ok; 595 __be32 status = nfs_ok;
596 596
597 if (setattr->sa_iattr.ia_valid & ATTR_SIZE) { 597 if (setattr->sa_iattr.ia_valid & ATTR_SIZE) {
598 nfs4_lock_state(); 598 nfs4_lock_state();
@@ -614,13 +614,13 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_se
614 return status; 614 return status;
615} 615}
616 616
617static inline int 617static inline __be32
618nfsd4_write(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_write *write) 618nfsd4_write(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_write *write)
619{ 619{
620 stateid_t *stateid = &write->wr_stateid; 620 stateid_t *stateid = &write->wr_stateid;
621 struct file *filp = NULL; 621 struct file *filp = NULL;
622 u32 *p; 622 u32 *p;
623 int status = nfs_ok; 623 __be32 status = nfs_ok;
624 624
625 /* no need to check permission - this will be done in nfsd_write() */ 625 /* no need to check permission - this will be done in nfsd_write() */
626 626
@@ -646,7 +646,7 @@ nfsd4_write(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_writ
646 *p++ = nfssvc_boot.tv_usec; 646 *p++ = nfssvc_boot.tv_usec;
647 647
648 status = nfsd_write(rqstp, current_fh, filp, write->wr_offset, 648 status = nfsd_write(rqstp, current_fh, filp, write->wr_offset,
649 write->wr_vec, write->wr_vlen, write->wr_buflen, 649 rqstp->rq_vec, write->wr_vlen, write->wr_buflen,
650 &write->wr_how_written); 650 &write->wr_how_written);
651 if (filp) 651 if (filp)
652 fput(filp); 652 fput(filp);
@@ -661,12 +661,12 @@ nfsd4_write(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_writ
661 * attributes matched. VERIFY is implemented by mapping NFSERR_SAME 661 * attributes matched. VERIFY is implemented by mapping NFSERR_SAME
662 * to NFS_OK after the call; NVERIFY by mapping NFSERR_NOT_SAME to NFS_OK. 662 * to NFS_OK after the call; NVERIFY by mapping NFSERR_NOT_SAME to NFS_OK.
663 */ 663 */
664static int 664static __be32
665nfsd4_verify(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_verify *verify) 665nfsd4_verify(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_verify *verify)
666{ 666{
667 u32 *buf, *p; 667 __be32 *buf, *p;
668 int count; 668 int count;
669 int status; 669 __be32 status;
670 670
671 status = fh_verify(rqstp, current_fh, 0, MAY_NOP); 671 status = fh_verify(rqstp, current_fh, 0, MAY_NOP);
672 if (status) 672 if (status)
@@ -715,7 +715,7 @@ out_kfree:
715/* 715/*
716 * NULL call. 716 * NULL call.
717 */ 717 */
718static int 718static __be32
719nfsd4_proc_null(struct svc_rqst *rqstp, void *argp, void *resp) 719nfsd4_proc_null(struct svc_rqst *rqstp, void *argp, void *resp)
720{ 720{
721 return nfs_ok; 721 return nfs_ok;
@@ -731,7 +731,7 @@ static inline void nfsd4_increment_op_stats(u32 opnum)
731/* 731/*
732 * COMPOUND call. 732 * COMPOUND call.
733 */ 733 */
734static int 734static __be32
735nfsd4_proc_compound(struct svc_rqst *rqstp, 735nfsd4_proc_compound(struct svc_rqst *rqstp,
736 struct nfsd4_compoundargs *args, 736 struct nfsd4_compoundargs *args,
737 struct nfsd4_compoundres *resp) 737 struct nfsd4_compoundres *resp)
@@ -741,7 +741,7 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
741 struct svc_fh *save_fh = NULL; 741 struct svc_fh *save_fh = NULL;
742 struct nfs4_stateowner *replay_owner = NULL; 742 struct nfs4_stateowner *replay_owner = NULL;
743 int slack_space; /* in words, not bytes! */ 743 int slack_space; /* in words, not bytes! */
744 int status; 744 __be32 status;
745 745
746 status = nfserr_resource; 746 status = nfserr_resource;
747 current_fh = kmalloc(sizeof(*current_fh), GFP_KERNEL); 747 current_fh = kmalloc(sizeof(*current_fh), GFP_KERNEL);
@@ -802,13 +802,29 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
802 * SETCLIENTID_CONFIRM, PUTFH and PUTROOTFH 802 * SETCLIENTID_CONFIRM, PUTFH and PUTROOTFH
803 * require a valid current filehandle 803 * require a valid current filehandle
804 */ 804 */
805 if ((!current_fh->fh_dentry) && 805 if (!current_fh->fh_dentry) {
806 !((op->opnum == OP_PUTFH) || (op->opnum == OP_PUTROOTFH) || 806 if (!((op->opnum == OP_PUTFH) ||
807 (op->opnum == OP_SETCLIENTID) || 807 (op->opnum == OP_PUTROOTFH) ||
808 (op->opnum == OP_SETCLIENTID_CONFIRM) || 808 (op->opnum == OP_SETCLIENTID) ||
809 (op->opnum == OP_RENEW) || (op->opnum == OP_RESTOREFH) || 809 (op->opnum == OP_SETCLIENTID_CONFIRM) ||
810 (op->opnum == OP_RELEASE_LOCKOWNER))) { 810 (op->opnum == OP_RENEW) ||
811 op->status = nfserr_nofilehandle; 811 (op->opnum == OP_RESTOREFH) ||
812 (op->opnum == OP_RELEASE_LOCKOWNER))) {
813 op->status = nfserr_nofilehandle;
814 goto encode_op;
815 }
816 }
817 /* Check must be done at start of each operation, except
818 * for GETATTR and ops not listed as returning NFS4ERR_MOVED
819 */
820 else if (current_fh->fh_export->ex_fslocs.migrated &&
821 !((op->opnum == OP_GETATTR) ||
822 (op->opnum == OP_PUTROOTFH) ||
823 (op->opnum == OP_PUTPUBFH) ||
824 (op->opnum == OP_RENEW) ||
825 (op->opnum == OP_SETCLIENTID) ||
826 (op->opnum == OP_RELEASE_LOCKOWNER))) {
827 op->status = nfserr_moved;
812 goto encode_op; 828 goto encode_op;
813 } 829 }
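
The restructured gate now runs two checks per operation instead of one: the existing no-filehandle whitelist, plus a new NFS4ERR_MOVED check for exports marked migrated, exempting only the operations defined not to return NFS4ERR_MOVED (and GETATTR, presumably so a client can still fetch fs_locations). Shape of the logic, with the open-coded opnum lists factored into hypothetical helpers purely for readability:

if (!current_fh->fh_dentry) {
        if (!op_ok_without_fh(op->opnum)) {     /* hypothetical helper:
                                                   PUTFH, PUTROOTFH,
                                                   SETCLIENTID{,_CONFIRM},
                                                   RENEW, RESTOREFH,
                                                   RELEASE_LOCKOWNER      */
                op->status = nfserr_nofilehandle;
                goto encode_op;
        }
} else if (current_fh->fh_export->ex_fslocs.migrated &&
           !op_ok_when_migrated(op->opnum)) {   /* hypothetical helper:
                                                   GETATTR, PUTROOTFH,
                                                   PUTPUBFH, RENEW,
                                                   SETCLIENTID,
                                                   RELEASE_LOCKOWNER      */
        op->status = nfserr_moved;
        goto encode_op;
}
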
814 switch (op->opnum) { 830 switch (op->opnum) {
@@ -921,7 +937,7 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
921 } 937 }
922 938
923encode_op: 939encode_op:
924 if (op->status == NFSERR_REPLAY_ME) { 940 if (op->status == nfserr_replay_me) {
925 op->replay = &replay_owner->so_replay; 941 op->replay = &replay_owner->so_replay;
926 nfsd4_encode_replay(resp, op); 942 nfsd4_encode_replay(resp, op);
927 status = op->status = op->replay->rp_status; 943 status = op->status = op->replay->rp_status;
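
The NFSERR_REPLAY_ME -> nfserr_replay_me renames here and in nfs4state.c belong to the same __be32 sweep: the replay sentinel is compared against __be32 status values, so it must be one, even though it is internal and never encoded for the client. Its likely definition -- the numeric value is carried over from the old macro and is an assumption here:

#define NFSERR_REPLAY_ME   11001                 /* host-order, historical */
#define nfserr_replay_me   cpu_to_be32(NFSERR_REPLAY_ME)
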
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 1cbd2e4ee122..e9d07704680e 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -83,13 +83,13 @@ md5_to_hex(char *out, char *md5)
83 *out = '\0'; 83 *out = '\0';
84} 84}
85 85
86int 86__be32
87nfs4_make_rec_clidname(char *dname, struct xdr_netobj *clname) 87nfs4_make_rec_clidname(char *dname, struct xdr_netobj *clname)
88{ 88{
89 struct xdr_netobj cksum; 89 struct xdr_netobj cksum;
90 struct hash_desc desc; 90 struct hash_desc desc;
91 struct scatterlist sg[1]; 91 struct scatterlist sg[1];
92 int status = nfserr_resource; 92 __be32 status = nfserr_resource;
93 93
94 dprintk("NFSD: nfs4_make_rec_clidname for %.*s\n", 94 dprintk("NFSD: nfs4_make_rec_clidname for %.*s\n",
95 clname->len, clname->data); 95 clname->len, clname->data);
@@ -193,7 +193,7 @@ nfsd4_build_dentrylist(void *arg, const char *name, int namlen,
193 struct dentry_list *child; 193 struct dentry_list *child;
194 194
195 if (name && isdotent(name, namlen)) 195 if (name && isdotent(name, namlen))
196 return nfs_ok; 196 return 0;
197 dentry = lookup_one_len(name, parent, namlen); 197 dentry = lookup_one_len(name, parent, namlen);
198 if (IS_ERR(dentry)) 198 if (IS_ERR(dentry))
199 return PTR_ERR(dentry); 199 return PTR_ERR(dentry);
@@ -333,14 +333,14 @@ purge_old(struct dentry *parent, struct dentry *child)
333 int status; 333 int status;
334 334
335 if (nfs4_has_reclaimed_state(child->d_name.name)) 335 if (nfs4_has_reclaimed_state(child->d_name.name))
336 return nfs_ok; 336 return 0;
337 337
338 status = nfsd4_clear_clid_dir(parent, child); 338 status = nfsd4_clear_clid_dir(parent, child);
339 if (status) 339 if (status)
340 printk("failed to remove client recovery directory %s\n", 340 printk("failed to remove client recovery directory %s\n",
341 child->d_name.name); 341 child->d_name.name);
342 /* Keep trying, success or failure: */ 342 /* Keep trying, success or failure: */
343 return nfs_ok; 343 return 0;
344} 344}
345 345
346void 346void
@@ -365,10 +365,10 @@ load_recdir(struct dentry *parent, struct dentry *child)
365 printk("nfsd4: illegal name %s in recovery directory\n", 365 printk("nfsd4: illegal name %s in recovery directory\n",
366 child->d_name.name); 366 child->d_name.name);
367 /* Keep trying; maybe the others are OK: */ 367 /* Keep trying; maybe the others are OK: */
368 return nfs_ok; 368 return 0;
369 } 369 }
370 nfs4_client_to_reclaim(child->d_name.name); 370 nfs4_client_to_reclaim(child->d_name.name);
371 return nfs_ok; 371 return 0;
372} 372}
373 373
374int 374int
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index ebcf226a9e4a..293b6495829f 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -710,10 +710,10 @@ out_err:
710 * as described above. 710 * as described above.
711 * 711 *
712 */ 712 */
713int 713__be32
714nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_setclientid *setclid) 714nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_setclientid *setclid)
715{ 715{
716 u32 ip_addr = rqstp->rq_addr.sin_addr.s_addr; 716 __be32 ip_addr = rqstp->rq_addr.sin_addr.s_addr;
717 struct xdr_netobj clname = { 717 struct xdr_netobj clname = {
718 .len = setclid->se_namelen, 718 .len = setclid->se_namelen,
719 .data = setclid->se_name, 719 .data = setclid->se_name,
@@ -721,7 +721,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_setclientid *setclid)
721 nfs4_verifier clverifier = setclid->se_verf; 721 nfs4_verifier clverifier = setclid->se_verf;
722 unsigned int strhashval; 722 unsigned int strhashval;
723 struct nfs4_client *conf, *unconf, *new; 723 struct nfs4_client *conf, *unconf, *new;
724 int status; 724 __be32 status;
725 char dname[HEXDIR_LEN]; 725 char dname[HEXDIR_LEN];
726 726
727 if (!check_name(clname)) 727 if (!check_name(clname))
@@ -875,14 +875,14 @@ out:
875 * 875 *
876 * NOTE: callback information will be processed here in a future patch 876 * NOTE: callback information will be processed here in a future patch
877 */ 877 */
878int 878__be32
879nfsd4_setclientid_confirm(struct svc_rqst *rqstp, struct nfsd4_setclientid_confirm *setclientid_confirm) 879nfsd4_setclientid_confirm(struct svc_rqst *rqstp, struct nfsd4_setclientid_confirm *setclientid_confirm)
880{ 880{
881 u32 ip_addr = rqstp->rq_addr.sin_addr.s_addr; 881 __be32 ip_addr = rqstp->rq_addr.sin_addr.s_addr;
882 struct nfs4_client *conf, *unconf; 882 struct nfs4_client *conf, *unconf;
883 nfs4_verifier confirm = setclientid_confirm->sc_confirm; 883 nfs4_verifier confirm = setclientid_confirm->sc_confirm;
884 clientid_t * clid = &setclientid_confirm->sc_clientid; 884 clientid_t * clid = &setclientid_confirm->sc_clientid;
885 int status; 885 __be32 status;
886 886
887 if (STALE_CLIENTID(clid)) 887 if (STALE_CLIENTID(clid))
888 return nfserr_stale_clientid; 888 return nfserr_stale_clientid;
@@ -1280,13 +1280,13 @@ test_share(struct nfs4_stateid *stp, struct nfsd4_open *open) {
1280 * Called to check deny when READ with all zero stateid or 1280 * Called to check deny when READ with all zero stateid or
1281 * WRITE with all zero or all one stateid 1281 * WRITE with all zero or all one stateid
1282 */ 1282 */
1283static int 1283static __be32
1284nfs4_share_conflict(struct svc_fh *current_fh, unsigned int deny_type) 1284nfs4_share_conflict(struct svc_fh *current_fh, unsigned int deny_type)
1285{ 1285{
1286 struct inode *ino = current_fh->fh_dentry->d_inode; 1286 struct inode *ino = current_fh->fh_dentry->d_inode;
1287 struct nfs4_file *fp; 1287 struct nfs4_file *fp;
1288 struct nfs4_stateid *stp; 1288 struct nfs4_stateid *stp;
1289 int ret; 1289 __be32 ret;
1290 1290
1291 dprintk("NFSD: nfs4_share_conflict\n"); 1291 dprintk("NFSD: nfs4_share_conflict\n");
1292 1292
@@ -1444,7 +1444,7 @@ static struct lock_manager_operations nfsd_lease_mng_ops = {
1444}; 1444};
1445 1445
1446 1446
1447int 1447__be32
1448nfsd4_process_open1(struct nfsd4_open *open) 1448nfsd4_process_open1(struct nfsd4_open *open)
1449{ 1449{
1450 clientid_t *clientid = &open->op_clientid; 1450 clientid_t *clientid = &open->op_clientid;
@@ -1477,7 +1477,7 @@ nfsd4_process_open1(struct nfsd4_open *open)
1477 } 1477 }
1478 if (open->op_seqid == sop->so_seqid - 1) { 1478 if (open->op_seqid == sop->so_seqid - 1) {
1479 if (sop->so_replay.rp_buflen) 1479 if (sop->so_replay.rp_buflen)
1480 return NFSERR_REPLAY_ME; 1480 return nfserr_replay_me;
1481 /* The original OPEN failed so spectacularly 1481 /* The original OPEN failed so spectacularly
1482 * that we don't even have replay data saved! 1482 * that we don't even have replay data saved!
1483 * Therefore, we have no choice but to continue 1483 * Therefore, we have no choice but to continue
@@ -1501,7 +1501,7 @@ renew:
1501 return nfs_ok; 1501 return nfs_ok;
1502} 1502}
1503 1503
1504static inline int 1504static inline __be32
1505nfs4_check_delegmode(struct nfs4_delegation *dp, int flags) 1505nfs4_check_delegmode(struct nfs4_delegation *dp, int flags)
1506{ 1506{
1507 if ((flags & WR_STATE) && (dp->dl_type == NFS4_OPEN_DELEGATE_READ)) 1507 if ((flags & WR_STATE) && (dp->dl_type == NFS4_OPEN_DELEGATE_READ))
@@ -1522,12 +1522,12 @@ find_delegation_file(struct nfs4_file *fp, stateid_t *stid)
1522 return NULL; 1522 return NULL;
1523} 1523}
1524 1524
1525static int 1525static __be32
1526nfs4_check_deleg(struct nfs4_file *fp, struct nfsd4_open *open, 1526nfs4_check_deleg(struct nfs4_file *fp, struct nfsd4_open *open,
1527 struct nfs4_delegation **dp) 1527 struct nfs4_delegation **dp)
1528{ 1528{
1529 int flags; 1529 int flags;
1530 int status = nfserr_bad_stateid; 1530 __be32 status = nfserr_bad_stateid;
1531 1531
1532 *dp = find_delegation_file(fp, &open->op_delegate_stateid); 1532 *dp = find_delegation_file(fp, &open->op_delegate_stateid);
1533 if (*dp == NULL) 1533 if (*dp == NULL)
@@ -1546,11 +1546,11 @@ out:
1546 return nfs_ok; 1546 return nfs_ok;
1547} 1547}
1548 1548
1549static int 1549static __be32
1550nfs4_check_open(struct nfs4_file *fp, struct nfsd4_open *open, struct nfs4_stateid **stpp) 1550nfs4_check_open(struct nfs4_file *fp, struct nfsd4_open *open, struct nfs4_stateid **stpp)
1551{ 1551{
1552 struct nfs4_stateid *local; 1552 struct nfs4_stateid *local;
1553 int status = nfserr_share_denied; 1553 __be32 status = nfserr_share_denied;
1554 struct nfs4_stateowner *sop = open->op_stateowner; 1554 struct nfs4_stateowner *sop = open->op_stateowner;
1555 1555
1556 list_for_each_entry(local, &fp->fi_stateids, st_perfile) { 1556 list_for_each_entry(local, &fp->fi_stateids, st_perfile) {
@@ -1575,7 +1575,7 @@ nfs4_alloc_stateid(void)
1575 return kmem_cache_alloc(stateid_slab, GFP_KERNEL); 1575 return kmem_cache_alloc(stateid_slab, GFP_KERNEL);
1576} 1576}
1577 1577
1578static int 1578static __be32
1579nfs4_new_open(struct svc_rqst *rqstp, struct nfs4_stateid **stpp, 1579nfs4_new_open(struct svc_rqst *rqstp, struct nfs4_stateid **stpp,
1580 struct nfs4_delegation *dp, 1580 struct nfs4_delegation *dp,
1581 struct svc_fh *cur_fh, int flags) 1581 struct svc_fh *cur_fh, int flags)
@@ -1590,7 +1590,7 @@ nfs4_new_open(struct svc_rqst *rqstp, struct nfs4_stateid **stpp,
1590 get_file(dp->dl_vfs_file); 1590 get_file(dp->dl_vfs_file);
1591 stp->st_vfs_file = dp->dl_vfs_file; 1591 stp->st_vfs_file = dp->dl_vfs_file;
1592 } else { 1592 } else {
1593 int status; 1593 __be32 status;
1594 status = nfsd_open(rqstp, cur_fh, S_IFREG, flags, 1594 status = nfsd_open(rqstp, cur_fh, S_IFREG, flags,
1595 &stp->st_vfs_file); 1595 &stp->st_vfs_file);
1596 if (status) { 1596 if (status) {
@@ -1604,7 +1604,7 @@ nfs4_new_open(struct svc_rqst *rqstp, struct nfs4_stateid **stpp,
1604 return 0; 1604 return 0;
1605} 1605}
1606 1606
1607static inline int 1607static inline __be32
1608nfsd4_truncate(struct svc_rqst *rqstp, struct svc_fh *fh, 1608nfsd4_truncate(struct svc_rqst *rqstp, struct svc_fh *fh,
1609 struct nfsd4_open *open) 1609 struct nfsd4_open *open)
1610{ 1610{
@@ -1619,22 +1619,22 @@ nfsd4_truncate(struct svc_rqst *rqstp, struct svc_fh *fh,
1619 return nfsd_setattr(rqstp, fh, &iattr, 0, (time_t)0); 1619 return nfsd_setattr(rqstp, fh, &iattr, 0, (time_t)0);
1620} 1620}
1621 1621
1622static int 1622static __be32
1623nfs4_upgrade_open(struct svc_rqst *rqstp, struct svc_fh *cur_fh, struct nfs4_stateid *stp, struct nfsd4_open *open) 1623nfs4_upgrade_open(struct svc_rqst *rqstp, struct svc_fh *cur_fh, struct nfs4_stateid *stp, struct nfsd4_open *open)
1624{ 1624{
1625 struct file *filp = stp->st_vfs_file; 1625 struct file *filp = stp->st_vfs_file;
1626 struct inode *inode = filp->f_dentry->d_inode; 1626 struct inode *inode = filp->f_dentry->d_inode;
1627 unsigned int share_access, new_writer; 1627 unsigned int share_access, new_writer;
1628 int status; 1628 __be32 status;
1629 1629
1630 set_access(&share_access, stp->st_access_bmap); 1630 set_access(&share_access, stp->st_access_bmap);
1631 new_writer = (~share_access) & open->op_share_access 1631 new_writer = (~share_access) & open->op_share_access
1632 & NFS4_SHARE_ACCESS_WRITE; 1632 & NFS4_SHARE_ACCESS_WRITE;
1633 1633
1634 if (new_writer) { 1634 if (new_writer) {
1635 status = get_write_access(inode); 1635 int err = get_write_access(inode);
1636 if (status) 1636 if (err)
1637 return nfserrno(status); 1637 return nfserrno(err);
1638 } 1638 }
1639 status = nfsd4_truncate(rqstp, cur_fh, open); 1639 status = nfsd4_truncate(rqstp, cur_fh, open);
1640 if (status) { 1640 if (status) {
@@ -1738,14 +1738,14 @@ out:
1738/* 1738/*
1739 * called with nfs4_lock_state() held. 1739 * called with nfs4_lock_state() held.
1740 */ 1740 */
1741int 1741__be32
1742nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open) 1742nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open)
1743{ 1743{
1744 struct nfs4_file *fp = NULL; 1744 struct nfs4_file *fp = NULL;
1745 struct inode *ino = current_fh->fh_dentry->d_inode; 1745 struct inode *ino = current_fh->fh_dentry->d_inode;
1746 struct nfs4_stateid *stp = NULL; 1746 struct nfs4_stateid *stp = NULL;
1747 struct nfs4_delegation *dp = NULL; 1747 struct nfs4_delegation *dp = NULL;
1748 int status; 1748 __be32 status;
1749 1749
1750 status = nfserr_inval; 1750 status = nfserr_inval;
1751 if (!access_valid(open->op_share_access) 1751 if (!access_valid(open->op_share_access)
@@ -1833,11 +1833,11 @@ static struct work_struct laundromat_work;
1833static void laundromat_main(void *); 1833static void laundromat_main(void *);
1834static DECLARE_WORK(laundromat_work, laundromat_main, NULL); 1834static DECLARE_WORK(laundromat_work, laundromat_main, NULL);
1835 1835
1836int 1836__be32
1837nfsd4_renew(clientid_t *clid) 1837nfsd4_renew(clientid_t *clid)
1838{ 1838{
1839 struct nfs4_client *clp; 1839 struct nfs4_client *clp;
1840 int status; 1840 __be32 status;
1841 1841
1842 nfs4_lock_state(); 1842 nfs4_lock_state();
1843 dprintk("process_renew(%08x/%08x): starting\n", 1843 dprintk("process_renew(%08x/%08x): starting\n",
@@ -1996,9 +1996,9 @@ access_permit_write(unsigned long access_bmap)
1996} 1996}
1997 1997
1998static 1998static
1999int nfs4_check_openmode(struct nfs4_stateid *stp, int flags) 1999__be32 nfs4_check_openmode(struct nfs4_stateid *stp, int flags)
2000{ 2000{
2001 int status = nfserr_openmode; 2001 __be32 status = nfserr_openmode;
2002 2002
2003 if ((flags & WR_STATE) && (!access_permit_write(stp->st_access_bmap))) 2003 if ((flags & WR_STATE) && (!access_permit_write(stp->st_access_bmap)))
2004 goto out; 2004 goto out;
@@ -2009,7 +2009,7 @@ out:
2009 return status; 2009 return status;
2010} 2010}
2011 2011
2012static inline int 2012static inline __be32
2013check_special_stateids(svc_fh *current_fh, stateid_t *stateid, int flags) 2013check_special_stateids(svc_fh *current_fh, stateid_t *stateid, int flags)
2014{ 2014{
2015 /* Trying to call delegreturn with a special stateid? Yuch: */ 2015 /* Trying to call delegreturn with a special stateid? Yuch: */
@@ -2043,14 +2043,14 @@ io_during_grace_disallowed(struct inode *inode, int flags)
2043/* 2043/*
2044* Checks for stateid operations 2044* Checks for stateid operations
2045*/ 2045*/
2046int 2046__be32
2047nfs4_preprocess_stateid_op(struct svc_fh *current_fh, stateid_t *stateid, int flags, struct file **filpp) 2047nfs4_preprocess_stateid_op(struct svc_fh *current_fh, stateid_t *stateid, int flags, struct file **filpp)
2048{ 2048{
2049 struct nfs4_stateid *stp = NULL; 2049 struct nfs4_stateid *stp = NULL;
2050 struct nfs4_delegation *dp = NULL; 2050 struct nfs4_delegation *dp = NULL;
2051 stateid_t *stidp; 2051 stateid_t *stidp;
2052 struct inode *ino = current_fh->fh_dentry->d_inode; 2052 struct inode *ino = current_fh->fh_dentry->d_inode;
2053 int status; 2053 __be32 status;
2054 2054
2055 dprintk("NFSD: preprocess_stateid_op: stateid = (%08x/%08x/%08x/%08x)\n", 2055 dprintk("NFSD: preprocess_stateid_op: stateid = (%08x/%08x/%08x/%08x)\n",
2056 stateid->si_boot, stateid->si_stateownerid, 2056 stateid->si_boot, stateid->si_stateownerid,
@@ -2125,7 +2125,7 @@ setlkflg (int type)
2125/* 2125/*
2126 * Checks for sequence id mutating operations. 2126 * Checks for sequence id mutating operations.
2127 */ 2127 */
2128static int 2128static __be32
2129nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *stateid, int flags, struct nfs4_stateowner **sopp, struct nfs4_stateid **stpp, struct nfsd4_lock *lock) 2129nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *stateid, int flags, struct nfs4_stateowner **sopp, struct nfs4_stateid **stpp, struct nfsd4_lock *lock)
2130{ 2130{
2131 struct nfs4_stateid *stp; 2131 struct nfs4_stateid *stp;
@@ -2169,7 +2169,7 @@ nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *statei
2169 clientid_t *lockclid = &lock->v.new.clientid; 2169 clientid_t *lockclid = &lock->v.new.clientid;
2170 struct nfs4_client *clp = sop->so_client; 2170 struct nfs4_client *clp = sop->so_client;
2171 int lkflg = 0; 2171 int lkflg = 0;
2172 int status; 2172 __be32 status;
2173 2173
2174 lkflg = setlkflg(lock->lk_type); 2174 lkflg = setlkflg(lock->lk_type);
2175 2175
@@ -2233,7 +2233,7 @@ check_replay:
2233 if (seqid == sop->so_seqid - 1) { 2233 if (seqid == sop->so_seqid - 1) {
2234 dprintk("NFSD: preprocess_seqid_op: retransmission?\n"); 2234 dprintk("NFSD: preprocess_seqid_op: retransmission?\n");
2235 /* indicate replay to calling function */ 2235 /* indicate replay to calling function */
2236 return NFSERR_REPLAY_ME; 2236 return nfserr_replay_me;
2237 } 2237 }
2238 printk("NFSD: preprocess_seqid_op: bad seqid (expected %d, got %d)\n", 2238 printk("NFSD: preprocess_seqid_op: bad seqid (expected %d, got %d)\n",
2239 sop->so_seqid, seqid); 2239 sop->so_seqid, seqid);
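The replay branch above encodes NFSv4's seqid discipline: a state-mutating operation from a given owner must carry the owner's last seqid plus one, and a request arriving with exactly the last seqid is taken as a retransmission whose cached reply should be resent. A minimal sketch of that decision, with illustrative status values standing in for the kernel's __be32 codes:

/* Sketch of the seqid replay check above; status values are
 * illustrative stand-ins for the kernel's __be32 codes.
 */
#include <stdio.h>
#include <stdint.h>

enum { nfs_ok = 0, nfserr_bad_seqid = 10026, nfserr_replay_me = 11001 };

static int check_seqid(uint32_t so_seqid, uint32_t seqid)
{
        if (seqid == so_seqid)
                return nfs_ok;              /* new request, in sequence */
        if (seqid == so_seqid - 1)
                return nfserr_replay_me;    /* retransmission: resend cached reply */
        return nfserr_bad_seqid;            /* out of sequence */
}

int main(void)
{
        printf("%d %d %d\n",
               check_seqid(7, 7), check_seqid(7, 6), check_seqid(7, 9));
        return 0;
}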
@@ -2241,10 +2241,10 @@ check_replay:
2241 return nfserr_bad_seqid; 2241 return nfserr_bad_seqid;
2242} 2242}
2243 2243
2244int 2244__be32
2245nfsd4_open_confirm(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open_confirm *oc, struct nfs4_stateowner **replay_owner) 2245nfsd4_open_confirm(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open_confirm *oc, struct nfs4_stateowner **replay_owner)
2246{ 2246{
2247 int status; 2247 __be32 status;
2248 struct nfs4_stateowner *sop; 2248 struct nfs4_stateowner *sop;
2249 struct nfs4_stateid *stp; 2249 struct nfs4_stateid *stp;
2250 2250
@@ -2310,10 +2310,10 @@ reset_union_bmap_deny(unsigned long deny, unsigned long *bmap)
2310 } 2310 }
2311} 2311}
2312 2312
2313int 2313__be32
2314nfsd4_open_downgrade(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open_downgrade *od, struct nfs4_stateowner **replay_owner) 2314nfsd4_open_downgrade(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open_downgrade *od, struct nfs4_stateowner **replay_owner)
2315{ 2315{
2316 int status; 2316 __be32 status;
2317 struct nfs4_stateid *stp; 2317 struct nfs4_stateid *stp;
2318 unsigned int share_access; 2318 unsigned int share_access;
2319 2319
@@ -2365,10 +2365,10 @@ out:
2365/* 2365/*
2366 * nfs4_unlock_state() called after encode 2366 * nfs4_unlock_state() called after encode
2367 */ 2367 */
2368int 2368__be32
2369nfsd4_close(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_close *close, struct nfs4_stateowner **replay_owner) 2369nfsd4_close(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_close *close, struct nfs4_stateowner **replay_owner)
2370{ 2370{
2371 int status; 2371 __be32 status;
2372 struct nfs4_stateid *stp; 2372 struct nfs4_stateid *stp;
2373 2373
2374 dprintk("NFSD: nfsd4_close on file %.*s\n", 2374 dprintk("NFSD: nfsd4_close on file %.*s\n",
@@ -2404,10 +2404,10 @@ out:
2404 return status; 2404 return status;
2405} 2405}
2406 2406
2407int 2407__be32
2408nfsd4_delegreturn(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_delegreturn *dr) 2408nfsd4_delegreturn(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_delegreturn *dr)
2409{ 2409{
2410 int status; 2410 __be32 status;
2411 2411
2412 if ((status = fh_verify(rqstp, current_fh, S_IFREG, 0))) 2412 if ((status = fh_verify(rqstp, current_fh, S_IFREG, 0)))
2413 goto out; 2413 goto out;
@@ -2635,7 +2635,7 @@ check_lock_length(u64 offset, u64 length)
2635/* 2635/*
2636 * LOCK operation 2636 * LOCK operation
2637 */ 2637 */
2638int 2638__be32
2639nfsd4_lock(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lock *lock, struct nfs4_stateowner **replay_owner) 2639nfsd4_lock(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lock *lock, struct nfs4_stateowner **replay_owner)
2640{ 2640{
2641 struct nfs4_stateowner *open_sop = NULL; 2641 struct nfs4_stateowner *open_sop = NULL;
@@ -2644,8 +2644,9 @@ nfsd4_lock(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lock
2644 struct file *filp; 2644 struct file *filp;
2645 struct file_lock file_lock; 2645 struct file_lock file_lock;
2646 struct file_lock conflock; 2646 struct file_lock conflock;
2647 int status = 0; 2647 __be32 status = 0;
2648 unsigned int strhashval; 2648 unsigned int strhashval;
2649 int err;
2649 2650
2650 dprintk("NFSD: nfsd4_lock: start=%Ld length=%Ld\n", 2651 dprintk("NFSD: nfsd4_lock: start=%Ld length=%Ld\n",
2651 (long long) lock->lk_offset, 2652 (long long) lock->lk_offset,
@@ -2758,13 +2759,14 @@ nfsd4_lock(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lock
2758 * locks_copy_lock: */ 2759 * locks_copy_lock: */
2759 conflock.fl_ops = NULL; 2760 conflock.fl_ops = NULL;
2760 conflock.fl_lmops = NULL; 2761 conflock.fl_lmops = NULL;
2761 status = posix_lock_file_conf(filp, &file_lock, &conflock); 2762 err = posix_lock_file_conf(filp, &file_lock, &conflock);
2762 dprintk("NFSD: nfsd4_lock: posix_lock_file_conf status %d\n",status); 2763 dprintk("NFSD: nfsd4_lock: posix_lock_file_conf status %d\n",status);
2763 switch (-status) { 2764 switch (-err) {
2764 case 0: /* success! */ 2765 case 0: /* success! */
2765 update_stateid(&lock_stp->st_stateid); 2766 update_stateid(&lock_stp->st_stateid);
2766 memcpy(&lock->lk_resp_stateid, &lock_stp->st_stateid, 2767 memcpy(&lock->lk_resp_stateid, &lock_stp->st_stateid,
2767 sizeof(stateid_t)); 2768 sizeof(stateid_t));
2769 status = 0;
2768 break; 2770 break;
2769 case (EAGAIN): /* conflock holds conflicting lock */ 2771 case (EAGAIN): /* conflock holds conflicting lock */
2770 status = nfserr_denied; 2772 status = nfserr_denied;
@@ -2775,7 +2777,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lock
2775 status = nfserr_deadlock; 2777 status = nfserr_deadlock;
2776 break; 2778 break;
2777 default: 2779 default:
2778 dprintk("NFSD: nfsd4_lock: posix_lock_file_conf() failed! status %d\n",status); 2780 dprintk("NFSD: nfsd4_lock: posix_lock_file_conf() failed! status %d\n",err);
2779 status = nfserr_resource; 2781 status = nfserr_resource;
2780 break; 2782 break;
2781 } 2783 }
@@ -2793,14 +2795,14 @@ out:
2793/* 2795/*
2794 * LOCKT operation 2796 * LOCKT operation
2795 */ 2797 */
2796int 2798__be32
2797nfsd4_lockt(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lockt *lockt) 2799nfsd4_lockt(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lockt *lockt)
2798{ 2800{
2799 struct inode *inode; 2801 struct inode *inode;
2800 struct file file; 2802 struct file file;
2801 struct file_lock file_lock; 2803 struct file_lock file_lock;
2802 struct file_lock conflock; 2804 struct file_lock conflock;
2803 int status; 2805 __be32 status;
2804 2806
2805 if (nfs4_in_grace()) 2807 if (nfs4_in_grace())
2806 return nfserr_grace; 2808 return nfserr_grace;
@@ -2873,13 +2875,14 @@ out:
2873 return status; 2875 return status;
2874} 2876}
2875 2877
2876int 2878__be32
2877nfsd4_locku(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_locku *locku, struct nfs4_stateowner **replay_owner) 2879nfsd4_locku(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_locku *locku, struct nfs4_stateowner **replay_owner)
2878{ 2880{
2879 struct nfs4_stateid *stp; 2881 struct nfs4_stateid *stp;
2880 struct file *filp = NULL; 2882 struct file *filp = NULL;
2881 struct file_lock file_lock; 2883 struct file_lock file_lock;
2882 int status; 2884 __be32 status;
2885 int err;
2883 2886
2884 dprintk("NFSD: nfsd4_locku: start=%Ld length=%Ld\n", 2887 dprintk("NFSD: nfsd4_locku: start=%Ld length=%Ld\n",
2885 (long long) locku->lu_offset, 2888 (long long) locku->lu_offset,
@@ -2917,8 +2920,8 @@ nfsd4_locku(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lock
2917 /* 2920 /*
2918 * Try to unlock the file in the VFS. 2921 * Try to unlock the file in the VFS.
2919 */ 2922 */
2920 status = posix_lock_file(filp, &file_lock); 2923 err = posix_lock_file(filp, &file_lock);
2921 if (status) { 2924 if (err) {
2922 dprintk("NFSD: nfs4_locku: posix_lock_file failed!\n"); 2925 dprintk("NFSD: nfs4_locku: posix_lock_file failed!\n");
2923 goto out_nfserr; 2926 goto out_nfserr;
2924 } 2927 }
@@ -2937,7 +2940,7 @@ out:
2937 return status; 2940 return status;
2938 2941
2939out_nfserr: 2942out_nfserr:
2940 status = nfserrno(status); 2943 status = nfserrno(err);
2941 goto out; 2944 goto out;
2942} 2945}
2943 2946
@@ -2965,7 +2968,7 @@ out:
2965 return status; 2968 return status;
2966} 2969}
2967 2970
2968int 2971__be32
2969nfsd4_release_lockowner(struct svc_rqst *rqstp, struct nfsd4_release_lockowner *rlockowner) 2972nfsd4_release_lockowner(struct svc_rqst *rqstp, struct nfsd4_release_lockowner *rlockowner)
2970{ 2973{
2971 clientid_t *clid = &rlockowner->rl_clientid; 2974 clientid_t *clid = &rlockowner->rl_clientid;
@@ -2974,7 +2977,7 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp, struct nfsd4_release_lockowner *
2974 struct xdr_netobj *owner = &rlockowner->rl_owner; 2977 struct xdr_netobj *owner = &rlockowner->rl_owner;
2975 struct list_head matches; 2978 struct list_head matches;
2976 int i; 2979 int i;
2977 int status; 2980 __be32 status;
2978 2981
2979 dprintk("nfsd4_release_lockowner clientid: (%08x/%08x):\n", 2982 dprintk("nfsd4_release_lockowner clientid: (%08x/%08x):\n",
2980 clid->cl_boot, clid->cl_id); 2983 clid->cl_boot, clid->cl_id);
@@ -3111,7 +3114,7 @@ nfs4_find_reclaim_client(clientid_t *clid)
3111/* 3114/*
3112* Called from OPEN. Look for clientid in reclaim list. 3115* Called from OPEN. Look for clientid in reclaim list.
3113*/ 3116*/
3114int 3117__be32
3115nfs4_check_open_reclaim(clientid_t *clid) 3118nfs4_check_open_reclaim(clientid_t *clid)
3116{ 3119{
3117 return nfs4_find_reclaim_client(clid) ? nfs_ok : nfserr_reclaim_bad; 3120 return nfs4_find_reclaim_client(clid) ? nfs_ok : nfserr_reclaim_bad;
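The conversions above all follow one pattern: host-endian errno values from the VFS stay in a plain int (err), on-the-wire NFSv4 status codes travel in a big-endian __be32 (status), and nfserrno() is the only bridge between the two, so sparse can flag any accidental mixing. A minimal userspace sketch of that split, assuming __be32 can be modelled as a uint32_t holding network-order bits; the harness and the VFS stub are hypothetical, not kernel code:

/* Minimal userspace sketch of the int/__be32 split applied above.
 * Assumptions: __be32 modelled as a uint32_t of network-order bits;
 * NFS4ERR_IO == 5 per RFC 3530; vfs_op_stub() is hypothetical.
 */
#include <stdio.h>
#include <errno.h>
#include <stdint.h>
#include <arpa/inet.h>

typedef uint32_t be32;              /* stand-in for the kernel's __be32 */

#define nfs_ok     ((be32)0)
#define nfserr_io  ((be32)htonl(5))

/* Host errno -> wire status: the single sanctioned conversion point. */
static be32 nfserrno(int host_err)
{
        switch (host_err) {
        case 0:    return nfs_ok;
        case -EIO: return nfserr_io;    /* the kernel passes negative errnos */
        default:   return nfserr_io;    /* table collapsed for the sketch */
        }
}

static int vfs_op_stub(void) { return -EIO; }   /* hypothetical VFS call */

int main(void)
{
        int err;        /* host-endian errno from the VFS layer */
        be32 status;    /* network-endian status handed to the XDR layer */

        err = vfs_op_stub();
        status = err ? nfserrno(err) : nfs_ok;
        printf("err=%d status=0x%08x (wire byte order)\n", err, (unsigned)status);
        return 0;
}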
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 5be00436b5b8..f3f239db04bb 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -60,8 +60,16 @@
60 60
61#define NFSDDBG_FACILITY NFSDDBG_XDR 61#define NFSDDBG_FACILITY NFSDDBG_XDR
62 62
63static int 63/*
64check_filename(char *str, int len, int err) 64 * As per the referral draft, the fsid for a referral MUST be different from the fsid of the containing
65 * directory, in order to indicate to the client that a filesystem boundary is present.
66 * We use a fixed fsid for a referral.
67 */
68#define NFS4_REFERRAL_FSID_MAJOR 0x8000000ULL
69#define NFS4_REFERRAL_FSID_MINOR 0x8000000ULL
70
71static __be32
72check_filename(char *str, int len, __be32 err)
65{ 73{
66 int i; 74 int i;
67 75
@@ -86,8 +94,8 @@ check_filename(char *str, int len, int err)
86 * consistent with the style used in NFSv2/v3... 94 * consistent with the style used in NFSv2/v3...
87 */ 95 */
88#define DECODE_HEAD \ 96#define DECODE_HEAD \
89 u32 *p; \ 97 __be32 *p; \
90 int status 98 __be32 status
91#define DECODE_TAIL \ 99#define DECODE_TAIL \
92 status = 0; \ 100 status = 0; \
93out: \ 101out: \
@@ -136,13 +144,13 @@ xdr_error: \
136 } \ 144 } \
137} while (0) 145} while (0)
138 146
139static u32 *read_buf(struct nfsd4_compoundargs *argp, int nbytes) 147static __be32 *read_buf(struct nfsd4_compoundargs *argp, int nbytes)
140{ 148{
141 /* We want more bytes than seem to be available. 149 /* We want more bytes than seem to be available.
142 * Maybe we need a new page, maybe we have just run out 150 * Maybe we need a new page, maybe we have just run out
143 */ 151 */
144 int avail = (char*)argp->end - (char*)argp->p; 152 int avail = (char*)argp->end - (char*)argp->p;
145 u32 *p; 153 __be32 *p;
146 if (avail + argp->pagelen < nbytes) 154 if (avail + argp->pagelen < nbytes)
147 return NULL; 155 return NULL;
148 if (avail + PAGE_SIZE < nbytes) /* need more than a page !! */ 156 if (avail + PAGE_SIZE < nbytes) /* need more than a page !! */
@@ -189,7 +197,7 @@ defer_free(struct nfsd4_compoundargs *argp,
189 return 0; 197 return 0;
190} 198}
191 199
192static char *savemem(struct nfsd4_compoundargs *argp, u32 *p, int nbytes) 200static char *savemem(struct nfsd4_compoundargs *argp, __be32 *p, int nbytes)
193{ 201{
194 void *new = NULL; 202 void *new = NULL;
195 if (p == argp->tmp) { 203 if (p == argp->tmp) {
@@ -209,7 +217,7 @@ static char *savemem(struct nfsd4_compoundargs *argp, u32 *p, int nbytes)
209} 217}
210 218
211 219
212static int 220static __be32
213nfsd4_decode_bitmap(struct nfsd4_compoundargs *argp, u32 *bmval) 221nfsd4_decode_bitmap(struct nfsd4_compoundargs *argp, u32 *bmval)
214{ 222{
215 u32 bmlen; 223 u32 bmlen;
@@ -232,13 +240,14 @@ nfsd4_decode_bitmap(struct nfsd4_compoundargs *argp, u32 *bmval)
232 DECODE_TAIL; 240 DECODE_TAIL;
233} 241}
234 242
235static int 243static __be32
236nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *iattr, 244nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *iattr,
237 struct nfs4_acl **acl) 245 struct nfs4_acl **acl)
238{ 246{
239 int expected_len, len = 0; 247 int expected_len, len = 0;
240 u32 dummy32; 248 u32 dummy32;
241 char *buf; 249 char *buf;
250 int host_err;
242 251
243 DECODE_HEAD; 252 DECODE_HEAD;
244 iattr->ia_valid = 0; 253 iattr->ia_valid = 0;
@@ -272,7 +281,7 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *ia
272 281
273 *acl = nfs4_acl_new(); 282 *acl = nfs4_acl_new();
274 if (*acl == NULL) { 283 if (*acl == NULL) {
275 status = -ENOMEM; 284 host_err = -ENOMEM;
276 goto out_nfserr; 285 goto out_nfserr;
277 } 286 }
278 defer_free(argp, (void (*)(const void *))nfs4_acl_free, *acl); 287 defer_free(argp, (void (*)(const void *))nfs4_acl_free, *acl);
@@ -287,20 +296,20 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *ia
287 len += XDR_QUADLEN(dummy32) << 2; 296 len += XDR_QUADLEN(dummy32) << 2;
288 READMEM(buf, dummy32); 297 READMEM(buf, dummy32);
289 ace.whotype = nfs4_acl_get_whotype(buf, dummy32); 298 ace.whotype = nfs4_acl_get_whotype(buf, dummy32);
290 status = 0; 299 host_err = 0;
291 if (ace.whotype != NFS4_ACL_WHO_NAMED) 300 if (ace.whotype != NFS4_ACL_WHO_NAMED)
292 ace.who = 0; 301 ace.who = 0;
293 else if (ace.flag & NFS4_ACE_IDENTIFIER_GROUP) 302 else if (ace.flag & NFS4_ACE_IDENTIFIER_GROUP)
294 status = nfsd_map_name_to_gid(argp->rqstp, 303 host_err = nfsd_map_name_to_gid(argp->rqstp,
295 buf, dummy32, &ace.who); 304 buf, dummy32, &ace.who);
296 else 305 else
297 status = nfsd_map_name_to_uid(argp->rqstp, 306 host_err = nfsd_map_name_to_uid(argp->rqstp,
298 buf, dummy32, &ace.who); 307 buf, dummy32, &ace.who);
299 if (status) 308 if (host_err)
300 goto out_nfserr; 309 goto out_nfserr;
301 status = nfs4_acl_add_ace(*acl, ace.type, ace.flag, 310 host_err = nfs4_acl_add_ace(*acl, ace.type, ace.flag,
302 ace.access_mask, ace.whotype, ace.who); 311 ace.access_mask, ace.whotype, ace.who);
303 if (status) 312 if (host_err)
304 goto out_nfserr; 313 goto out_nfserr;
305 } 314 }
306 } else 315 } else
@@ -319,7 +328,7 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *ia
319 READ_BUF(dummy32); 328 READ_BUF(dummy32);
320 len += (XDR_QUADLEN(dummy32) << 2); 329 len += (XDR_QUADLEN(dummy32) << 2);
321 READMEM(buf, dummy32); 330 READMEM(buf, dummy32);
322 if ((status = nfsd_map_name_to_uid(argp->rqstp, buf, dummy32, &iattr->ia_uid))) 331 if ((host_err = nfsd_map_name_to_uid(argp->rqstp, buf, dummy32, &iattr->ia_uid)))
323 goto out_nfserr; 332 goto out_nfserr;
324 iattr->ia_valid |= ATTR_UID; 333 iattr->ia_valid |= ATTR_UID;
325 } 334 }
@@ -330,7 +339,7 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *ia
330 READ_BUF(dummy32); 339 READ_BUF(dummy32);
331 len += (XDR_QUADLEN(dummy32) << 2); 340 len += (XDR_QUADLEN(dummy32) << 2);
332 READMEM(buf, dummy32); 341 READMEM(buf, dummy32);
333 if ((status = nfsd_map_name_to_gid(argp->rqstp, buf, dummy32, &iattr->ia_gid))) 342 if ((host_err = nfsd_map_name_to_gid(argp->rqstp, buf, dummy32, &iattr->ia_gid)))
334 goto out_nfserr; 343 goto out_nfserr;
335 iattr->ia_valid |= ATTR_GID; 344 iattr->ia_valid |= ATTR_GID;
336 } 345 }
@@ -406,11 +415,11 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *ia
406 DECODE_TAIL; 415 DECODE_TAIL;
407 416
408out_nfserr: 417out_nfserr:
409 status = nfserrno(status); 418 status = nfserrno(host_err);
410 goto out; 419 goto out;
411} 420}
412 421
413static int 422static __be32
414nfsd4_decode_access(struct nfsd4_compoundargs *argp, struct nfsd4_access *access) 423nfsd4_decode_access(struct nfsd4_compoundargs *argp, struct nfsd4_access *access)
415{ 424{
416 DECODE_HEAD; 425 DECODE_HEAD;
@@ -421,7 +430,7 @@ nfsd4_decode_access(struct nfsd4_compoundargs *argp, struct nfsd4_access *access
421 DECODE_TAIL; 430 DECODE_TAIL;
422} 431}
423 432
424static int 433static __be32
425nfsd4_decode_close(struct nfsd4_compoundargs *argp, struct nfsd4_close *close) 434nfsd4_decode_close(struct nfsd4_compoundargs *argp, struct nfsd4_close *close)
426{ 435{
427 DECODE_HEAD; 436 DECODE_HEAD;
@@ -436,7 +445,7 @@ nfsd4_decode_close(struct nfsd4_compoundargs *argp, struct nfsd4_close *close)
436} 445}
437 446
438 447
439static int 448static __be32
440nfsd4_decode_commit(struct nfsd4_compoundargs *argp, struct nfsd4_commit *commit) 449nfsd4_decode_commit(struct nfsd4_compoundargs *argp, struct nfsd4_commit *commit)
441{ 450{
442 DECODE_HEAD; 451 DECODE_HEAD;
@@ -448,7 +457,7 @@ nfsd4_decode_commit(struct nfsd4_compoundargs *argp, struct nfsd4_commit *commit
448 DECODE_TAIL; 457 DECODE_TAIL;
449} 458}
450 459
451static int 460static __be32
452nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create) 461nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create)
453{ 462{
454 DECODE_HEAD; 463 DECODE_HEAD;
@@ -488,7 +497,7 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create
488 DECODE_TAIL; 497 DECODE_TAIL;
489} 498}
490 499
491static inline int 500static inline __be32
492nfsd4_decode_delegreturn(struct nfsd4_compoundargs *argp, struct nfsd4_delegreturn *dr) 501nfsd4_decode_delegreturn(struct nfsd4_compoundargs *argp, struct nfsd4_delegreturn *dr)
493{ 502{
494 DECODE_HEAD; 503 DECODE_HEAD;
@@ -500,13 +509,13 @@ nfsd4_decode_delegreturn(struct nfsd4_compoundargs *argp, struct nfsd4_delegretu
500 DECODE_TAIL; 509 DECODE_TAIL;
501} 510}
502 511
503static inline int 512static inline __be32
504nfsd4_decode_getattr(struct nfsd4_compoundargs *argp, struct nfsd4_getattr *getattr) 513nfsd4_decode_getattr(struct nfsd4_compoundargs *argp, struct nfsd4_getattr *getattr)
505{ 514{
506 return nfsd4_decode_bitmap(argp, getattr->ga_bmval); 515 return nfsd4_decode_bitmap(argp, getattr->ga_bmval);
507} 516}
508 517
509static int 518static __be32
510nfsd4_decode_link(struct nfsd4_compoundargs *argp, struct nfsd4_link *link) 519nfsd4_decode_link(struct nfsd4_compoundargs *argp, struct nfsd4_link *link)
511{ 520{
512 DECODE_HEAD; 521 DECODE_HEAD;
@@ -521,7 +530,7 @@ nfsd4_decode_link(struct nfsd4_compoundargs *argp, struct nfsd4_link *link)
521 DECODE_TAIL; 530 DECODE_TAIL;
522} 531}
523 532
524static int 533static __be32
525nfsd4_decode_lock(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock) 534nfsd4_decode_lock(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock)
526{ 535{
527 DECODE_HEAD; 536 DECODE_HEAD;
@@ -560,7 +569,7 @@ nfsd4_decode_lock(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock)
560 DECODE_TAIL; 569 DECODE_TAIL;
561} 570}
562 571
563static int 572static __be32
564nfsd4_decode_lockt(struct nfsd4_compoundargs *argp, struct nfsd4_lockt *lockt) 573nfsd4_decode_lockt(struct nfsd4_compoundargs *argp, struct nfsd4_lockt *lockt)
565{ 574{
566 DECODE_HEAD; 575 DECODE_HEAD;
@@ -579,7 +588,7 @@ nfsd4_decode_lockt(struct nfsd4_compoundargs *argp, struct nfsd4_lockt *lockt)
579 DECODE_TAIL; 588 DECODE_TAIL;
580} 589}
581 590
582static int 591static __be32
583nfsd4_decode_locku(struct nfsd4_compoundargs *argp, struct nfsd4_locku *locku) 592nfsd4_decode_locku(struct nfsd4_compoundargs *argp, struct nfsd4_locku *locku)
584{ 593{
585 DECODE_HEAD; 594 DECODE_HEAD;
@@ -598,7 +607,7 @@ nfsd4_decode_locku(struct nfsd4_compoundargs *argp, struct nfsd4_locku *locku)
598 DECODE_TAIL; 607 DECODE_TAIL;
599} 608}
600 609
601static int 610static __be32
602nfsd4_decode_lookup(struct nfsd4_compoundargs *argp, struct nfsd4_lookup *lookup) 611nfsd4_decode_lookup(struct nfsd4_compoundargs *argp, struct nfsd4_lookup *lookup)
603{ 612{
604 DECODE_HEAD; 613 DECODE_HEAD;
@@ -613,7 +622,7 @@ nfsd4_decode_lookup(struct nfsd4_compoundargs *argp, struct nfsd4_lookup *lookup
613 DECODE_TAIL; 622 DECODE_TAIL;
614} 623}
615 624
616static int 625static __be32
617nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) 626nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
618{ 627{
619 DECODE_HEAD; 628 DECODE_HEAD;
@@ -691,7 +700,7 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
691 DECODE_TAIL; 700 DECODE_TAIL;
692} 701}
693 702
694static int 703static __be32
695nfsd4_decode_open_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_open_confirm *open_conf) 704nfsd4_decode_open_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_open_confirm *open_conf)
696{ 705{
697 DECODE_HEAD; 706 DECODE_HEAD;
@@ -705,7 +714,7 @@ nfsd4_decode_open_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_open_con
705 DECODE_TAIL; 714 DECODE_TAIL;
706} 715}
707 716
708static int 717static __be32
709nfsd4_decode_open_downgrade(struct nfsd4_compoundargs *argp, struct nfsd4_open_downgrade *open_down) 718nfsd4_decode_open_downgrade(struct nfsd4_compoundargs *argp, struct nfsd4_open_downgrade *open_down)
710{ 719{
711 DECODE_HEAD; 720 DECODE_HEAD;
@@ -721,7 +730,7 @@ nfsd4_decode_open_downgrade(struct nfsd4_compoundargs *argp, struct nfsd4_open_d
721 DECODE_TAIL; 730 DECODE_TAIL;
722} 731}
723 732
724static int 733static __be32
725nfsd4_decode_putfh(struct nfsd4_compoundargs *argp, struct nfsd4_putfh *putfh) 734nfsd4_decode_putfh(struct nfsd4_compoundargs *argp, struct nfsd4_putfh *putfh)
726{ 735{
727 DECODE_HEAD; 736 DECODE_HEAD;
@@ -736,7 +745,7 @@ nfsd4_decode_putfh(struct nfsd4_compoundargs *argp, struct nfsd4_putfh *putfh)
736 DECODE_TAIL; 745 DECODE_TAIL;
737} 746}
738 747
739static int 748static __be32
740nfsd4_decode_read(struct nfsd4_compoundargs *argp, struct nfsd4_read *read) 749nfsd4_decode_read(struct nfsd4_compoundargs *argp, struct nfsd4_read *read)
741{ 750{
742 DECODE_HEAD; 751 DECODE_HEAD;
@@ -750,7 +759,7 @@ nfsd4_decode_read(struct nfsd4_compoundargs *argp, struct nfsd4_read *read)
750 DECODE_TAIL; 759 DECODE_TAIL;
751} 760}
752 761
753static int 762static __be32
754nfsd4_decode_readdir(struct nfsd4_compoundargs *argp, struct nfsd4_readdir *readdir) 763nfsd4_decode_readdir(struct nfsd4_compoundargs *argp, struct nfsd4_readdir *readdir)
755{ 764{
756 DECODE_HEAD; 765 DECODE_HEAD;
@@ -766,7 +775,7 @@ nfsd4_decode_readdir(struct nfsd4_compoundargs *argp, struct nfsd4_readdir *read
766 DECODE_TAIL; 775 DECODE_TAIL;
767} 776}
768 777
769static int 778static __be32
770nfsd4_decode_remove(struct nfsd4_compoundargs *argp, struct nfsd4_remove *remove) 779nfsd4_decode_remove(struct nfsd4_compoundargs *argp, struct nfsd4_remove *remove)
771{ 780{
772 DECODE_HEAD; 781 DECODE_HEAD;
@@ -781,7 +790,7 @@ nfsd4_decode_remove(struct nfsd4_compoundargs *argp, struct nfsd4_remove *remove
781 DECODE_TAIL; 790 DECODE_TAIL;
782} 791}
783 792
784static int 793static __be32
785nfsd4_decode_rename(struct nfsd4_compoundargs *argp, struct nfsd4_rename *rename) 794nfsd4_decode_rename(struct nfsd4_compoundargs *argp, struct nfsd4_rename *rename)
786{ 795{
787 DECODE_HEAD; 796 DECODE_HEAD;
@@ -801,7 +810,7 @@ nfsd4_decode_rename(struct nfsd4_compoundargs *argp, struct nfsd4_rename *rename
801 DECODE_TAIL; 810 DECODE_TAIL;
802} 811}
803 812
804static int 813static __be32
805nfsd4_decode_renew(struct nfsd4_compoundargs *argp, clientid_t *clientid) 814nfsd4_decode_renew(struct nfsd4_compoundargs *argp, clientid_t *clientid)
806{ 815{
807 DECODE_HEAD; 816 DECODE_HEAD;
@@ -812,7 +821,7 @@ nfsd4_decode_renew(struct nfsd4_compoundargs *argp, clientid_t *clientid)
812 DECODE_TAIL; 821 DECODE_TAIL;
813} 822}
814 823
815static int 824static __be32
816nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *setattr) 825nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *setattr)
817{ 826{
818 DECODE_HEAD; 827 DECODE_HEAD;
@@ -826,7 +835,7 @@ nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *seta
826 DECODE_TAIL; 835 DECODE_TAIL;
827} 836}
828 837
829static int 838static __be32
830nfsd4_decode_setclientid(struct nfsd4_compoundargs *argp, struct nfsd4_setclientid *setclientid) 839nfsd4_decode_setclientid(struct nfsd4_compoundargs *argp, struct nfsd4_setclientid *setclientid)
831{ 840{
832 DECODE_HEAD; 841 DECODE_HEAD;
@@ -851,7 +860,7 @@ nfsd4_decode_setclientid(struct nfsd4_compoundargs *argp, struct nfsd4_setclient
851 DECODE_TAIL; 860 DECODE_TAIL;
852} 861}
853 862
854static int 863static __be32
855nfsd4_decode_setclientid_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_setclientid_confirm *scd_c) 864nfsd4_decode_setclientid_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_setclientid_confirm *scd_c)
856{ 865{
857 DECODE_HEAD; 866 DECODE_HEAD;
@@ -864,7 +873,7 @@ nfsd4_decode_setclientid_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_s
864} 873}
865 874
866/* Also used for NVERIFY */ 875/* Also used for NVERIFY */
867static int 876static __be32
868nfsd4_decode_verify(struct nfsd4_compoundargs *argp, struct nfsd4_verify *verify) 877nfsd4_decode_verify(struct nfsd4_compoundargs *argp, struct nfsd4_verify *verify)
869{ 878{
870#if 0 879#if 0
@@ -900,7 +909,7 @@ nfsd4_decode_verify(struct nfsd4_compoundargs *argp, struct nfsd4_verify *verify
900 DECODE_TAIL; 909 DECODE_TAIL;
901} 910}
902 911
903static int 912static __be32
904nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write) 913nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write)
905{ 914{
906 int avail; 915 int avail;
@@ -926,32 +935,32 @@ nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write)
926 printk(KERN_NOTICE "xdr error! (%s:%d)\n", __FILE__, __LINE__); 935 printk(KERN_NOTICE "xdr error! (%s:%d)\n", __FILE__, __LINE__);
927 goto xdr_error; 936 goto xdr_error;
928 } 937 }
929 write->wr_vec[0].iov_base = p; 938 argp->rqstp->rq_vec[0].iov_base = p;
930 write->wr_vec[0].iov_len = avail; 939 argp->rqstp->rq_vec[0].iov_len = avail;
931 v = 0; 940 v = 0;
932 len = write->wr_buflen; 941 len = write->wr_buflen;
933 while (len > write->wr_vec[v].iov_len) { 942 while (len > argp->rqstp->rq_vec[v].iov_len) {
934 len -= write->wr_vec[v].iov_len; 943 len -= argp->rqstp->rq_vec[v].iov_len;
935 v++; 944 v++;
936 write->wr_vec[v].iov_base = page_address(argp->pagelist[0]); 945 argp->rqstp->rq_vec[v].iov_base = page_address(argp->pagelist[0]);
937 argp->pagelist++; 946 argp->pagelist++;
938 if (argp->pagelen >= PAGE_SIZE) { 947 if (argp->pagelen >= PAGE_SIZE) {
939 write->wr_vec[v].iov_len = PAGE_SIZE; 948 argp->rqstp->rq_vec[v].iov_len = PAGE_SIZE;
940 argp->pagelen -= PAGE_SIZE; 949 argp->pagelen -= PAGE_SIZE;
941 } else { 950 } else {
942 write->wr_vec[v].iov_len = argp->pagelen; 951 argp->rqstp->rq_vec[v].iov_len = argp->pagelen;
943 argp->pagelen -= len; 952 argp->pagelen -= len;
944 } 953 }
945 } 954 }
946 argp->end = (u32*) (write->wr_vec[v].iov_base + write->wr_vec[v].iov_len); 955 argp->end = (__be32*) (argp->rqstp->rq_vec[v].iov_base + argp->rqstp->rq_vec[v].iov_len);
947 argp->p = (u32*) (write->wr_vec[v].iov_base + (XDR_QUADLEN(len) << 2)); 956 argp->p = (__be32*) (argp->rqstp->rq_vec[v].iov_base + (XDR_QUADLEN(len) << 2));
948 write->wr_vec[v].iov_len = len; 957 argp->rqstp->rq_vec[v].iov_len = len;
949 write->wr_vlen = v+1; 958 write->wr_vlen = v+1;
950 959
951 DECODE_TAIL; 960 DECODE_TAIL;
952} 961}
953 962
954static int 963static __be32
955nfsd4_decode_release_lockowner(struct nfsd4_compoundargs *argp, struct nfsd4_release_lockowner *rlockowner) 964nfsd4_decode_release_lockowner(struct nfsd4_compoundargs *argp, struct nfsd4_release_lockowner *rlockowner)
956{ 965{
957 DECODE_HEAD; 966 DECODE_HEAD;
@@ -965,7 +974,7 @@ nfsd4_decode_release_lockowner(struct nfsd4_compoundargs *argp, struct nfsd4_rel
965 DECODE_TAIL; 974 DECODE_TAIL;
966} 975}
967 976
968static int 977static __be32
969nfsd4_decode_compound(struct nfsd4_compoundargs *argp) 978nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
970{ 979{
971 DECODE_HEAD; 980 DECODE_HEAD;
@@ -1171,7 +1180,7 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
1171 * task to translate them into Linux-specific versions which are more 1180 * task to translate them into Linux-specific versions which are more
1172 * consistent with the style used in NFSv2/v3... 1181 * consistent with the style used in NFSv2/v3...
1173 */ 1182 */
1174#define ENCODE_HEAD u32 *p 1183#define ENCODE_HEAD __be32 *p
1175 1184
1176#define WRITE32(n) *p++ = htonl(n) 1185#define WRITE32(n) *p++ = htonl(n)
1177#define WRITE64(n) do { \ 1186#define WRITE64(n) do { \
@@ -1201,8 +1210,8 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
1201 * Header routine to setup seqid operation replay cache 1210 * Header routine to setup seqid operation replay cache
1202 */ 1211 */
1203#define ENCODE_SEQID_OP_HEAD \ 1212#define ENCODE_SEQID_OP_HEAD \
1204 u32 *p; \ 1213 __be32 *p; \
1205 u32 *save; \ 1214 __be32 *save; \
1206 \ 1215 \
1207 save = resp->p; 1216 save = resp->p;
1208 1217
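ENCODE_HEAD declares the cursor p over the __be32 response buffer, and WRITE32/WRITE64 advance it while storing network-order words; WRITE64's body is elided by the hunk above, but XDR's hyper encoding stores the high word first. A compilable userspace sketch of the same cursor pattern (buffer size and values are illustrative):

/* Userspace sketch of the ENCODE_HEAD/WRITE32/WRITE64 cursor pattern.
 * Assumption: WRITE64 emits the high word first, as XDR's big-endian
 * hyper encoding requires.
 */
#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>

#define WRITE32(n) (*p++ = htonl((uint32_t)(n)))
#define WRITE64(n) do {                          \
        *p++ = htonl((uint32_t)((n) >> 32));     \
        *p++ = htonl((uint32_t)(n));             \
} while (0)

int main(void)
{
        uint32_t buf[4];        /* stands in for the __be32 response page */
        uint32_t *p = buf;      /* the cursor that ENCODE_HEAD declares */

        WRITE32(2);                         /* e.g. a bitmap length */
        WRITE64(0x0000000100000002ULL);     /* e.g. a 64-bit changeid */
        printf("%ld words encoded, first word 0x%08x on the wire\n",
               (long)(p - buf), (unsigned)buf[0]);
        return 0;
}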
@@ -1223,6 +1232,120 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
1223 stateowner->so_replay.rp_buflen); \ 1232 stateowner->so_replay.rp_buflen); \
1224 } } while (0); 1233 } } while (0);
1225 1234
1235/* Encode the given string as an array of strings, with components
1236 * separated by @sep.
1237 */
1238static __be32 nfsd4_encode_components(char sep, char *components,
1239 __be32 **pp, int *buflen)
1240{
1241 __be32 *p = *pp;
1242 __be32 *countp = p;
1243 int strlen, count=0;
1244 char *str, *end;
1245
1246 dprintk("nfsd4_encode_components(%s)\n", components);
1247 if ((*buflen -= 4) < 0)
1248 return nfserr_resource;
1249 WRITE32(0); /* We will fill this in with @count later */
1250 end = str = components;
1251 while (*end) {
1252 for (; *end && (*end != sep); end++)
1253 ; /* Point to end of component */
1254 strlen = end - str;
1255 if (strlen) {
1256 if ((*buflen -= ((XDR_QUADLEN(strlen) << 2) + 4)) < 0)
1257 return nfserr_resource;
1258 WRITE32(strlen);
1259 WRITEMEM(str, strlen);
1260 count++;
1261 }
1262 else
1263 end++;
1264 str = end;
1265 }
1266 *pp = p;
1267 p = countp;
1268 WRITE32(count);
1269 return 0;
1270}
1271
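nfsd4_encode_components() walks the string once, emits every non-empty component as a counted XDR string, and backfills the leading count word after the walk. A userspace sketch of the same loop, where the hypothetical emit() helper stands in for the kernel's WRITE32/WRITEMEM into the response buffer:

/* Userspace sketch of nfsd4_encode_components(): split on @sep, emit
 * each non-empty component as a counted string, return the count that
 * the kernel backfills into the leading word. emit() is hypothetical;
 * the kernel writes via WRITE32/WRITEMEM instead.
 */
#include <stdio.h>
#include <string.h>

static void emit(const char *s, int len) { printf("  [%.*s]\n", len, s); }

static int encode_components(char sep, const char *components)
{
        const char *str = components, *end = components;
        int count = 0;

        while (*end) {
                while (*end && *end != sep)
                        end++;              /* point past the component */
                if (end != str) {
                        emit(str, (int)(end - str));
                        count++;
                } else {
                        end++;              /* skip an empty component */
                }
                str = end;
        }
        return count;
}

int main(void)
{
        printf("count=%d\n", encode_components('/', "/exports//vol0"));
        return 0;
}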
1272/*
1273 * encode a location element of a fs_locations structure
1274 */
1275static __be32 nfsd4_encode_fs_location4(struct nfsd4_fs_location *location,
1276 __be32 **pp, int *buflen)
1277{
1278 __be32 status;
1279 __be32 *p = *pp;
1280
1281 status = nfsd4_encode_components(':', location->hosts, &p, buflen);
1282 if (status)
1283 return status;
1284 status = nfsd4_encode_components('/', location->path, &p, buflen);
1285 if (status)
1286 return status;
1287 *pp = p;
1288 return 0;
1289}
1290
1291/*
1292 * Return the path to an export point in the pseudo filesystem namespace.
1293 * The returned string is safe to use as long as the caller holds a reference
1294 * to @exp.
1295 */
1296static char *nfsd4_path(struct svc_rqst *rqstp, struct svc_export *exp, __be32 *stat)
1297{
1298 struct svc_fh tmp_fh;
1299 char *path, *rootpath;
1300
1301 fh_init(&tmp_fh, NFS4_FHSIZE);
1302 *stat = exp_pseudoroot(rqstp->rq_client, &tmp_fh, &rqstp->rq_chandle);
1303 if (*stat)
1304 return NULL;
1305 rootpath = tmp_fh.fh_export->ex_path;
1306
1307 path = exp->ex_path;
1308
1309 if (strncmp(path, rootpath, strlen(rootpath))) {
1310 printk("nfsd: fs_locations failed;"
1311 "%s is not contained in %s\n", path, rootpath);
1312 *stat = nfserr_notsupp;
1313 return NULL;
1314 }
1315
1316 return path + strlen(rootpath);
1317}
1318
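nfsd4_path() accepts an export only when its path textually extends the pseudoroot's path, and hands back the suffix past the root. A small sketch of that check; the example strings are hypothetical (root "/exports", export "/exports/vol0" yields "/vol0"):

/* Sketch of the containment check in nfsd4_path(): the export path
 * must extend the pseudoroot path, and the suffix past the root is
 * returned. The example strings below are hypothetical.
 */
#include <stdio.h>
#include <string.h>

static const char *pseudo_path(const char *rootpath, const char *path)
{
        if (strncmp(path, rootpath, strlen(rootpath)))
                return NULL;        /* not contained in the pseudoroot */
        return path + strlen(rootpath);
}

int main(void)
{
        const char *p = pseudo_path("/exports", "/exports/vol0");

        printf("%s\n", p ? p : "(not contained)");  /* prints "/vol0" */
        return 0;
}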
1319/*
1320 * encode a fs_locations structure
1321 */
1322static __be32 nfsd4_encode_fs_locations(struct svc_rqst *rqstp,
1323 struct svc_export *exp,
1324 __be32 **pp, int *buflen)
1325{
1326 __be32 status;
1327 int i;
1328 __be32 *p = *pp;
1329 struct nfsd4_fs_locations *fslocs = &exp->ex_fslocs;
1330 char *root = nfsd4_path(rqstp, exp, &status);
1331
1332 if (status)
1333 return status;
1334 status = nfsd4_encode_components('/', root, &p, buflen);
1335 if (status)
1336 return status;
1337 if ((*buflen -= 4) < 0)
1338 return nfserr_resource;
1339 WRITE32(fslocs->locations_count);
1340 for (i=0; i<fslocs->locations_count; i++) {
1341 status = nfsd4_encode_fs_location4(&fslocs->locations[i],
1342 &p, buflen);
1343 if (status)
1344 return status;
1345 }
1346 *pp = p;
1347 return 0;
1348}
1226 1349
1227static u32 nfs4_ftypes[16] = { 1350static u32 nfs4_ftypes[16] = {
1228 NF4BAD, NF4FIFO, NF4CHR, NF4BAD, 1351 NF4BAD, NF4FIFO, NF4CHR, NF4BAD,
@@ -1231,9 +1354,9 @@ static u32 nfs4_ftypes[16] = {
1231 NF4SOCK, NF4BAD, NF4LNK, NF4BAD, 1354 NF4SOCK, NF4BAD, NF4LNK, NF4BAD,
1232}; 1355};
1233 1356
1234static int 1357static __be32
1235nfsd4_encode_name(struct svc_rqst *rqstp, int whotype, uid_t id, int group, 1358nfsd4_encode_name(struct svc_rqst *rqstp, int whotype, uid_t id, int group,
1236 u32 **p, int *buflen) 1359 __be32 **p, int *buflen)
1237{ 1360{
1238 int status; 1361 int status;
1239 1362
@@ -1253,25 +1376,44 @@ nfsd4_encode_name(struct svc_rqst *rqstp, int whotype, uid_t id, int group,
1253 return 0; 1376 return 0;
1254} 1377}
1255 1378
1256static inline int 1379static inline __be32
1257nfsd4_encode_user(struct svc_rqst *rqstp, uid_t uid, u32 **p, int *buflen) 1380nfsd4_encode_user(struct svc_rqst *rqstp, uid_t uid, __be32 **p, int *buflen)
1258{ 1381{
1259 return nfsd4_encode_name(rqstp, NFS4_ACL_WHO_NAMED, uid, 0, p, buflen); 1382 return nfsd4_encode_name(rqstp, NFS4_ACL_WHO_NAMED, uid, 0, p, buflen);
1260} 1383}
1261 1384
1262static inline int 1385static inline __be32
1263nfsd4_encode_group(struct svc_rqst *rqstp, uid_t gid, u32 **p, int *buflen) 1386nfsd4_encode_group(struct svc_rqst *rqstp, uid_t gid, __be32 **p, int *buflen)
1264{ 1387{
1265 return nfsd4_encode_name(rqstp, NFS4_ACL_WHO_NAMED, gid, 1, p, buflen); 1388 return nfsd4_encode_name(rqstp, NFS4_ACL_WHO_NAMED, gid, 1, p, buflen);
1266} 1389}
1267 1390
1268static inline int 1391static inline __be32
1269nfsd4_encode_aclname(struct svc_rqst *rqstp, int whotype, uid_t id, int group, 1392nfsd4_encode_aclname(struct svc_rqst *rqstp, int whotype, uid_t id, int group,
1270 u32 **p, int *buflen) 1393 __be32 **p, int *buflen)
1271{ 1394{
1272 return nfsd4_encode_name(rqstp, whotype, id, group, p, buflen); 1395 return nfsd4_encode_name(rqstp, whotype, id, group, p, buflen);
1273} 1396}
1274 1397
1398#define WORD0_ABSENT_FS_ATTRS (FATTR4_WORD0_FS_LOCATIONS | FATTR4_WORD0_FSID | \
1399 FATTR4_WORD0_RDATTR_ERROR)
1400#define WORD1_ABSENT_FS_ATTRS FATTR4_WORD1_MOUNTED_ON_FILEID
1401
1402static __be32 fattr_handle_absent_fs(u32 *bmval0, u32 *bmval1, u32 *rdattr_err)
1403{
1404 /* As per referral draft: */
1405 if (*bmval0 & ~WORD0_ABSENT_FS_ATTRS ||
1406 *bmval1 & ~WORD1_ABSENT_FS_ATTRS) {
1407 if (*bmval0 & FATTR4_WORD0_RDATTR_ERROR ||
1408 *bmval0 & FATTR4_WORD0_FS_LOCATIONS)
1409 *rdattr_err = NFSERR_MOVED;
1410 else
1411 return nfserr_moved;
1412 }
1413 *bmval0 &= WORD0_ABSENT_FS_ATTRS;
1414 *bmval1 &= WORD1_ABSENT_FS_ATTRS;
1415 return 0;
1416}
1275 1417
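fattr_handle_absent_fs() enforces the referral draft's rule quoted earlier: on an absent filesystem only fsid, fs_locations, rdattr_error (plus mounted_on_fileid in word 1) may be answered; any other requested attribute produces NFS4ERR_MOVED, delivered through rdattr_error when the client requested that attribute or fs_locations, and as the operation's status otherwise. A sketch of that policy with illustrative bit values:

/* Sketch of the absent-fs attribute policy above. Bit values are
 * illustrative, not the real FATTR4_WORD0_* masks; NFS4ERR_MOVED is
 * 10019 per RFC 3530.
 */
#include <stdio.h>
#include <stdint.h>

#define FSID          0x01u
#define RDATTR_ERROR  0x02u
#define FS_LOCATIONS  0x04u
#define ABSENT_OK     (FSID | RDATTR_ERROR | FS_LOCATIONS)
#define NFSERR_MOVED  10019

static int handle_absent_fs(uint32_t *bmval, uint32_t *rdattr_err)
{
        if (*bmval & ~ABSENT_OK) {
                if (*bmval & (RDATTR_ERROR | FS_LOCATIONS))
                        *rdattr_err = NFSERR_MOVED;  /* report per attribute */
                else
                        return NFSERR_MOVED;         /* fail the whole GETATTR */
        }
        *bmval &= ABSENT_OK;    /* never answer the disallowed attributes */
        return 0;
}

int main(void)
{
        uint32_t bm = FSID | RDATTR_ERROR | 0x80u, rdattr = 0;
        int status = handle_absent_fs(&bm, &rdattr);

        printf("status=%d bmval=0x%x rdattr_err=%u\n",
               status, (unsigned)bm, (unsigned)rdattr);
        return 0;
}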
1276/* 1418/*
1277 * Note: @fhp can be NULL; in this case, we might have to compose the filehandle 1419 * Note: @fhp can be NULL; in this case, we might have to compose the filehandle
@@ -1280,9 +1422,9 @@ nfsd4_encode_aclname(struct svc_rqst *rqstp, int whotype, uid_t id, int group,
1280 * @countp is the buffer size in _words_; upon successful return this becomes 1422 * @countp is the buffer size in _words_; upon successful return this becomes
1281 * replaced with the number of words written. 1423 * replaced with the number of words written.
1282 */ 1424 */
1283int 1425__be32
1284nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, 1426nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
1285 struct dentry *dentry, u32 *buffer, int *countp, u32 *bmval, 1427 struct dentry *dentry, __be32 *buffer, int *countp, u32 *bmval,
1286 struct svc_rqst *rqstp) 1428 struct svc_rqst *rqstp)
1287{ 1429{
1288 u32 bmval0 = bmval[0]; 1430 u32 bmval0 = bmval[0];
@@ -1291,11 +1433,13 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
1291 struct svc_fh tempfh; 1433 struct svc_fh tempfh;
1292 struct kstatfs statfs; 1434 struct kstatfs statfs;
1293 int buflen = *countp << 2; 1435 int buflen = *countp << 2;
1294 u32 *attrlenp; 1436 __be32 *attrlenp;
1295 u32 dummy; 1437 u32 dummy;
1296 u64 dummy64; 1438 u64 dummy64;
1297 u32 *p = buffer; 1439 u32 rdattr_err = 0;
1298 int status; 1440 __be32 *p = buffer;
1441 __be32 status;
1442 int err;
1299 int aclsupport = 0; 1443 int aclsupport = 0;
1300 struct nfs4_acl *acl = NULL; 1444 struct nfs4_acl *acl = NULL;
1301 1445
@@ -1303,14 +1447,20 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
1303 BUG_ON(bmval0 & ~NFSD_SUPPORTED_ATTRS_WORD0); 1447 BUG_ON(bmval0 & ~NFSD_SUPPORTED_ATTRS_WORD0);
1304 BUG_ON(bmval1 & ~NFSD_SUPPORTED_ATTRS_WORD1); 1448 BUG_ON(bmval1 & ~NFSD_SUPPORTED_ATTRS_WORD1);
1305 1449
1306 status = vfs_getattr(exp->ex_mnt, dentry, &stat); 1450 if (exp->ex_fslocs.migrated) {
1307 if (status) 1451 status = fattr_handle_absent_fs(&bmval0, &bmval1, &rdattr_err);
1452 if (status)
1453 goto out;
1454 }
1455
1456 err = vfs_getattr(exp->ex_mnt, dentry, &stat);
1457 if (err)
1308 goto out_nfserr; 1458 goto out_nfserr;
1309 if ((bmval0 & (FATTR4_WORD0_FILES_FREE | FATTR4_WORD0_FILES_TOTAL)) || 1459 if ((bmval0 & (FATTR4_WORD0_FILES_FREE | FATTR4_WORD0_FILES_TOTAL)) ||
1310 (bmval1 & (FATTR4_WORD1_SPACE_AVAIL | FATTR4_WORD1_SPACE_FREE | 1460 (bmval1 & (FATTR4_WORD1_SPACE_AVAIL | FATTR4_WORD1_SPACE_FREE |
1311 FATTR4_WORD1_SPACE_TOTAL))) { 1461 FATTR4_WORD1_SPACE_TOTAL))) {
1312 status = vfs_statfs(dentry, &statfs); 1462 err = vfs_statfs(dentry, &statfs);
1313 if (status) 1463 if (err)
1314 goto out_nfserr; 1464 goto out_nfserr;
1315 } 1465 }
1316 if ((bmval0 & (FATTR4_WORD0_FILEHANDLE | FATTR4_WORD0_FSID)) && !fhp) { 1466 if ((bmval0 & (FATTR4_WORD0_FILEHANDLE | FATTR4_WORD0_FSID)) && !fhp) {
@@ -1322,18 +1472,23 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
1322 } 1472 }
1323 if (bmval0 & (FATTR4_WORD0_ACL | FATTR4_WORD0_ACLSUPPORT 1473 if (bmval0 & (FATTR4_WORD0_ACL | FATTR4_WORD0_ACLSUPPORT
1324 | FATTR4_WORD0_SUPPORTED_ATTRS)) { 1474 | FATTR4_WORD0_SUPPORTED_ATTRS)) {
1325 status = nfsd4_get_nfs4_acl(rqstp, dentry, &acl); 1475 err = nfsd4_get_nfs4_acl(rqstp, dentry, &acl);
1326 aclsupport = (status == 0); 1476 aclsupport = (err == 0);
1327 if (bmval0 & FATTR4_WORD0_ACL) { 1477 if (bmval0 & FATTR4_WORD0_ACL) {
1328 if (status == -EOPNOTSUPP) 1478 if (err == -EOPNOTSUPP)
1329 bmval0 &= ~FATTR4_WORD0_ACL; 1479 bmval0 &= ~FATTR4_WORD0_ACL;
1330 else if (status == -EINVAL) { 1480 else if (err == -EINVAL) {
1331 status = nfserr_attrnotsupp; 1481 status = nfserr_attrnotsupp;
1332 goto out; 1482 goto out;
1333 } else if (status != 0) 1483 } else if (err != 0)
1334 goto out_nfserr; 1484 goto out_nfserr;
1335 } 1485 }
1336 } 1486 }
1487 if (bmval0 & FATTR4_WORD0_FS_LOCATIONS) {
1488 if (exp->ex_fslocs.locations == NULL) {
1489 bmval0 &= ~FATTR4_WORD0_FS_LOCATIONS;
1490 }
1491 }
1337 if ((buflen -= 16) < 0) 1492 if ((buflen -= 16) < 0)
1338 goto out_resource; 1493 goto out_resource;
1339 1494
@@ -1343,12 +1498,15 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
1343 attrlenp = p++; /* to be backfilled later */ 1498 attrlenp = p++; /* to be backfilled later */
1344 1499
1345 if (bmval0 & FATTR4_WORD0_SUPPORTED_ATTRS) { 1500 if (bmval0 & FATTR4_WORD0_SUPPORTED_ATTRS) {
1501 u32 word0 = NFSD_SUPPORTED_ATTRS_WORD0;
1346 if ((buflen -= 12) < 0) 1502 if ((buflen -= 12) < 0)
1347 goto out_resource; 1503 goto out_resource;
1504 if (!aclsupport)
1505 word0 &= ~FATTR4_WORD0_ACL;
1506 if (!exp->ex_fslocs.locations)
1507 word0 &= ~FATTR4_WORD0_FS_LOCATIONS;
1348 WRITE32(2); 1508 WRITE32(2);
1349 WRITE32(aclsupport ? 1509 WRITE32(word0);
1350 NFSD_SUPPORTED_ATTRS_WORD0 :
1351 NFSD_SUPPORTED_ATTRS_WORD0 & ~FATTR4_WORD0_ACL);
1352 WRITE32(NFSD_SUPPORTED_ATTRS_WORD1); 1510 WRITE32(NFSD_SUPPORTED_ATTRS_WORD1);
1353 } 1511 }
1354 if (bmval0 & FATTR4_WORD0_TYPE) { 1512 if (bmval0 & FATTR4_WORD0_TYPE) {
@@ -1402,7 +1560,10 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
1402 if (bmval0 & FATTR4_WORD0_FSID) { 1560 if (bmval0 & FATTR4_WORD0_FSID) {
1403 if ((buflen -= 16) < 0) 1561 if ((buflen -= 16) < 0)
1404 goto out_resource; 1562 goto out_resource;
1405 if (is_fsid(fhp, rqstp->rq_reffh)) { 1563 if (exp->ex_fslocs.migrated) {
1564 WRITE64(NFS4_REFERRAL_FSID_MAJOR);
1565 WRITE64(NFS4_REFERRAL_FSID_MINOR);
1566 } else if (is_fsid(fhp, rqstp->rq_reffh)) {
1406 WRITE64((u64)exp->ex_fsid); 1567 WRITE64((u64)exp->ex_fsid);
1407 WRITE64((u64)0); 1568 WRITE64((u64)0);
1408 } else { 1569 } else {
@@ -1425,7 +1586,7 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
1425 if (bmval0 & FATTR4_WORD0_RDATTR_ERROR) { 1586 if (bmval0 & FATTR4_WORD0_RDATTR_ERROR) {
1426 if ((buflen -= 4) < 0) 1587 if ((buflen -= 4) < 0)
1427 goto out_resource; 1588 goto out_resource;
1428 WRITE32(0); 1589 WRITE32(rdattr_err);
1429 } 1590 }
1430 if (bmval0 & FATTR4_WORD0_ACL) { 1591 if (bmval0 & FATTR4_WORD0_ACL) {
1431 struct nfs4_ace *ace; 1592 struct nfs4_ace *ace;
@@ -1513,6 +1674,13 @@ out_acl:
1513 goto out_resource; 1674 goto out_resource;
1514 WRITE64((u64) statfs.f_files); 1675 WRITE64((u64) statfs.f_files);
1515 } 1676 }
1677 if (bmval0 & FATTR4_WORD0_FS_LOCATIONS) {
1678 status = nfsd4_encode_fs_locations(rqstp, exp, &p, &buflen);
1679 if (status == nfserr_resource)
1680 goto out_resource;
1681 if (status)
1682 goto out;
1683 }
1516 if (bmval0 & FATTR4_WORD0_HOMOGENEOUS) { 1684 if (bmval0 & FATTR4_WORD0_HOMOGENEOUS) {
1517 if ((buflen -= 4) < 0) 1685 if ((buflen -= 4) < 0)
1518 goto out_resource; 1686 goto out_resource;
@@ -1536,12 +1704,12 @@ out_acl:
1536 if (bmval0 & FATTR4_WORD0_MAXREAD) { 1704 if (bmval0 & FATTR4_WORD0_MAXREAD) {
1537 if ((buflen -= 8) < 0) 1705 if ((buflen -= 8) < 0)
1538 goto out_resource; 1706 goto out_resource;
1539 WRITE64((u64) NFSSVC_MAXBLKSIZE); 1707 WRITE64((u64) svc_max_payload(rqstp));
1540 } 1708 }
1541 if (bmval0 & FATTR4_WORD0_MAXWRITE) { 1709 if (bmval0 & FATTR4_WORD0_MAXWRITE) {
1542 if ((buflen -= 8) < 0) 1710 if ((buflen -= 8) < 0)
1543 goto out_resource; 1711 goto out_resource;
1544 WRITE64((u64) NFSSVC_MAXBLKSIZE); 1712 WRITE64((u64) svc_max_payload(rqstp));
1545 } 1713 }
1546 if (bmval1 & FATTR4_WORD1_MODE) { 1714 if (bmval1 & FATTR4_WORD1_MODE) {
1547 if ((buflen -= 4) < 0) 1715 if ((buflen -= 4) < 0)
@@ -1652,7 +1820,7 @@ out:
1652 fh_put(&tempfh); 1820 fh_put(&tempfh);
1653 return status; 1821 return status;
1654out_nfserr: 1822out_nfserr:
1655 status = nfserrno(status); 1823 status = nfserrno(err);
1656 goto out; 1824 goto out;
1657out_resource: 1825out_resource:
1658 *countp = 0; 1826 *countp = 0;
@@ -1663,13 +1831,13 @@ out_serverfault:
1663 goto out; 1831 goto out;
1664} 1832}
1665 1833
1666static int 1834static __be32
1667nfsd4_encode_dirent_fattr(struct nfsd4_readdir *cd, 1835nfsd4_encode_dirent_fattr(struct nfsd4_readdir *cd,
1668 const char *name, int namlen, u32 *p, int *buflen) 1836 const char *name, int namlen, __be32 *p, int *buflen)
1669{ 1837{
1670 struct svc_export *exp = cd->rd_fhp->fh_export; 1838 struct svc_export *exp = cd->rd_fhp->fh_export;
1671 struct dentry *dentry; 1839 struct dentry *dentry;
1672 int nfserr; 1840 __be32 nfserr;
1673 1841
1674 dentry = lookup_one_len(name, cd->rd_fhp->fh_dentry, namlen); 1842 dentry = lookup_one_len(name, cd->rd_fhp->fh_dentry, namlen);
1675 if (IS_ERR(dentry)) 1843 if (IS_ERR(dentry))
@@ -1698,10 +1866,10 @@ out_put:
1698 return nfserr; 1866 return nfserr;
1699} 1867}
1700 1868
1701static u32 * 1869static __be32 *
1702nfsd4_encode_rdattr_error(u32 *p, int buflen, int nfserr) 1870nfsd4_encode_rdattr_error(__be32 *p, int buflen, __be32 nfserr)
1703{ 1871{
1704 u32 *attrlenp; 1872 __be32 *attrlenp;
1705 1873
1706 if (buflen < 6) 1874 if (buflen < 6)
1707 return NULL; 1875 return NULL;
@@ -1721,8 +1889,8 @@ nfsd4_encode_dirent(struct readdir_cd *ccd, const char *name, int namlen,
1721{ 1889{
1722 struct nfsd4_readdir *cd = container_of(ccd, struct nfsd4_readdir, common); 1890 struct nfsd4_readdir *cd = container_of(ccd, struct nfsd4_readdir, common);
1723 int buflen; 1891 int buflen;
1724 u32 *p = cd->buffer; 1892 __be32 *p = cd->buffer;
1725 int nfserr = nfserr_toosmall; 1893 __be32 nfserr = nfserr_toosmall;
1726 1894
1727 /* In nfsv4, "." and ".." never make it onto the wire.. */ 1895 /* In nfsv4, "." and ".." never make it onto the wire.. */
1728 if (name && isdotent(name, namlen)) { 1896 if (name && isdotent(name, namlen)) {
@@ -1778,7 +1946,7 @@ fail:
1778} 1946}
1779 1947
1780static void 1948static void
1781nfsd4_encode_access(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_access *access) 1949nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_access *access)
1782{ 1950{
1783 ENCODE_HEAD; 1951 ENCODE_HEAD;
1784 1952
@@ -1791,7 +1959,7 @@ nfsd4_encode_access(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_acc
1791} 1959}
1792 1960
1793static void 1961static void
1794nfsd4_encode_close(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_close *close) 1962nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_close *close)
1795{ 1963{
1796 ENCODE_SEQID_OP_HEAD; 1964 ENCODE_SEQID_OP_HEAD;
1797 1965
@@ -1806,7 +1974,7 @@ nfsd4_encode_close(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_clos
1806 1974
1807 1975
1808static void 1976static void
1809nfsd4_encode_commit(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_commit *commit) 1977nfsd4_encode_commit(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_commit *commit)
1810{ 1978{
1811 ENCODE_HEAD; 1979 ENCODE_HEAD;
1812 1980
@@ -1818,7 +1986,7 @@ nfsd4_encode_commit(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_com
1818} 1986}
1819 1987
1820static void 1988static void
1821nfsd4_encode_create(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_create *create) 1989nfsd4_encode_create(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_create *create)
1822{ 1990{
1823 ENCODE_HEAD; 1991 ENCODE_HEAD;
1824 1992
@@ -1832,8 +2000,8 @@ nfsd4_encode_create(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_cre
1832 } 2000 }
1833} 2001}
1834 2002
1835static int 2003static __be32
1836nfsd4_encode_getattr(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_getattr *getattr) 2004nfsd4_encode_getattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_getattr *getattr)
1837{ 2005{
1838 struct svc_fh *fhp = getattr->ga_fhp; 2006 struct svc_fh *fhp = getattr->ga_fhp;
1839 int buflen; 2007 int buflen;
@@ -1845,14 +2013,13 @@ nfsd4_encode_getattr(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_ge
1845 nfserr = nfsd4_encode_fattr(fhp, fhp->fh_export, fhp->fh_dentry, 2013 nfserr = nfsd4_encode_fattr(fhp, fhp->fh_export, fhp->fh_dentry,
1846 resp->p, &buflen, getattr->ga_bmval, 2014 resp->p, &buflen, getattr->ga_bmval,
1847 resp->rqstp); 2015 resp->rqstp);
1848
1849 if (!nfserr) 2016 if (!nfserr)
1850 resp->p += buflen; 2017 resp->p += buflen;
1851 return nfserr; 2018 return nfserr;
1852} 2019}
1853 2020
1854static void 2021static void
1855nfsd4_encode_getfh(struct nfsd4_compoundres *resp, int nfserr, struct svc_fh *fhp) 2022nfsd4_encode_getfh(struct nfsd4_compoundres *resp, __be32 nfserr, struct svc_fh *fhp)
1856{ 2023{
1857 unsigned int len; 2024 unsigned int len;
1858 ENCODE_HEAD; 2025 ENCODE_HEAD;
@@ -1892,7 +2059,7 @@ nfsd4_encode_lock_denied(struct nfsd4_compoundres *resp, struct nfsd4_lock_denie
1892} 2059}
1893 2060
1894static void 2061static void
1895nfsd4_encode_lock(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_lock *lock) 2062nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lock *lock)
1896{ 2063{
1897 ENCODE_SEQID_OP_HEAD; 2064 ENCODE_SEQID_OP_HEAD;
1898 2065
@@ -1908,14 +2075,14 @@ nfsd4_encode_lock(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_lock
1908} 2075}
1909 2076
1910static void 2077static void
1911nfsd4_encode_lockt(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_lockt *lockt) 2078nfsd4_encode_lockt(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lockt *lockt)
1912{ 2079{
1913 if (nfserr == nfserr_denied) 2080 if (nfserr == nfserr_denied)
1914 nfsd4_encode_lock_denied(resp, &lockt->lt_denied); 2081 nfsd4_encode_lock_denied(resp, &lockt->lt_denied);
1915} 2082}
1916 2083
1917static void 2084static void
1918nfsd4_encode_locku(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_locku *locku) 2085nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_locku *locku)
1919{ 2086{
1920 ENCODE_SEQID_OP_HEAD; 2087 ENCODE_SEQID_OP_HEAD;
1921 2088
@@ -1931,7 +2098,7 @@ nfsd4_encode_locku(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_lock
1931 2098
1932 2099
1933static void 2100static void
1934nfsd4_encode_link(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_link *link) 2101nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_link *link)
1935{ 2102{
1936 ENCODE_HEAD; 2103 ENCODE_HEAD;
1937 2104
@@ -1944,7 +2111,7 @@ nfsd4_encode_link(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_link
1944 2111
1945 2112
1946static void 2113static void
1947nfsd4_encode_open(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_open *open) 2114nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open *open)
1948{ 2115{
1949 ENCODE_SEQID_OP_HEAD; 2116 ENCODE_SEQID_OP_HEAD;
1950 2117
@@ -2009,7 +2176,7 @@ out:
2009} 2176}
2010 2177
2011static void 2178static void
2012nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_open_confirm *oc) 2179nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_confirm *oc)
2013{ 2180{
2014 ENCODE_SEQID_OP_HEAD; 2181 ENCODE_SEQID_OP_HEAD;
2015 2182
@@ -2024,7 +2191,7 @@ nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, int nfserr, struct nfs
2024} 2191}
2025 2192
2026static void 2193static void
2027nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_open_downgrade *od) 2194nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_downgrade *od)
2028{ 2195{
2029 ENCODE_SEQID_OP_HEAD; 2196 ENCODE_SEQID_OP_HEAD;
2030 2197
@@ -2038,8 +2205,9 @@ nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, int nfserr, struct n
2038 ENCODE_SEQID_OP_TAIL(od->od_stateowner); 2205 ENCODE_SEQID_OP_TAIL(od->od_stateowner);
2039} 2206}
2040 2207
2041static int 2208static __be32
2042nfsd4_encode_read(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_read *read) 2209nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
2210 struct nfsd4_read *read)
2043{ 2211{
2044 u32 eof; 2212 u32 eof;
2045 int v, pn; 2213 int v, pn;
@@ -2054,31 +2222,33 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_read
2054 2222
2055 RESERVE_SPACE(8); /* eof flag and byte count */ 2223 RESERVE_SPACE(8); /* eof flag and byte count */
2056 2224
2057 maxcount = NFSSVC_MAXBLKSIZE; 2225 maxcount = svc_max_payload(resp->rqstp);
2058 if (maxcount > read->rd_length) 2226 if (maxcount > read->rd_length)
2059 maxcount = read->rd_length; 2227 maxcount = read->rd_length;
2060 2228
2061 len = maxcount; 2229 len = maxcount;
2062 v = 0; 2230 v = 0;
2063 while (len > 0) { 2231 while (len > 0) {
2064 pn = resp->rqstp->rq_resused; 2232 pn = resp->rqstp->rq_resused++;
2065 svc_take_page(resp->rqstp); 2233 resp->rqstp->rq_vec[v].iov_base =
2066 read->rd_iov[v].iov_base = page_address(resp->rqstp->rq_respages[pn]); 2234 page_address(resp->rqstp->rq_respages[pn]);
2067 read->rd_iov[v].iov_len = len < PAGE_SIZE ? len : PAGE_SIZE; 2235 resp->rqstp->rq_vec[v].iov_len =
2236 len < PAGE_SIZE ? len : PAGE_SIZE;
2068 v++; 2237 v++;
2069 len -= PAGE_SIZE; 2238 len -= PAGE_SIZE;
2070 } 2239 }
2071 read->rd_vlen = v; 2240 read->rd_vlen = v;
2072 2241
2073 nfserr = nfsd_read(read->rd_rqstp, read->rd_fhp, read->rd_filp, 2242 nfserr = nfsd_read(read->rd_rqstp, read->rd_fhp, read->rd_filp,
2074 read->rd_offset, read->rd_iov, read->rd_vlen, 2243 read->rd_offset, resp->rqstp->rq_vec, read->rd_vlen,
2075 &maxcount); 2244 &maxcount);
2076 2245
2077 if (nfserr == nfserr_symlink) 2246 if (nfserr == nfserr_symlink)
2078 nfserr = nfserr_inval; 2247 nfserr = nfserr_inval;
2079 if (nfserr) 2248 if (nfserr)
2080 return nfserr; 2249 return nfserr;
2081 eof = (read->rd_offset + maxcount >= read->rd_fhp->fh_dentry->d_inode->i_size); 2250 eof = (read->rd_offset + maxcount >=
2251 read->rd_fhp->fh_dentry->d_inode->i_size);
2082 2252
2083 WRITE32(eof); 2253 WRITE32(eof);
2084 WRITE32(maxcount); 2254 WRITE32(maxcount);
@@ -2088,7 +2258,6 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_read
2088 resp->xbuf->page_len = maxcount; 2258 resp->xbuf->page_len = maxcount;
2089 2259
2090 /* Use rest of head for padding and remaining ops: */ 2260 /* Use rest of head for padding and remaining ops: */
2091 resp->rqstp->rq_restailpage = 0;
2092 resp->xbuf->tail[0].iov_base = p; 2261 resp->xbuf->tail[0].iov_base = p;
2093 resp->xbuf->tail[0].iov_len = 0; 2262 resp->xbuf->tail[0].iov_len = 0;
2094 if (maxcount&3) { 2263 if (maxcount&3) {
@@ -2101,8 +2270,8 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_read
 	return 0;
 }
 
-static int
-nfsd4_encode_readlink(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_readlink *readlink)
+static __be32
+nfsd4_encode_readlink(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_readlink *readlink)
 {
 	int maxcount;
 	char *page;
@@ -2113,8 +2282,7 @@ nfsd4_encode_readlink(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_r
 	if (resp->xbuf->page_len)
 		return nfserr_resource;
 
-	svc_take_page(resp->rqstp);
-	page = page_address(resp->rqstp->rq_respages[resp->rqstp->rq_resused-1]);
+	page = page_address(resp->rqstp->rq_respages[resp->rqstp->rq_resused++]);
 
 	maxcount = PAGE_SIZE;
 	RESERVE_SPACE(4);
@@ -2138,7 +2306,6 @@ nfsd4_encode_readlink(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_r
 	resp->xbuf->page_len = maxcount;
 
 	/* Use rest of head for padding and remaining ops: */
-	resp->rqstp->rq_restailpage = 0;
 	resp->xbuf->tail[0].iov_base = p;
 	resp->xbuf->tail[0].iov_len = 0;
 	if (maxcount&3) {
@@ -2151,12 +2318,12 @@ nfsd4_encode_readlink(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_r
 	return 0;
 }
 
-static int
-nfsd4_encode_readdir(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_readdir *readdir)
+static __be32
+nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_readdir *readdir)
 {
 	int maxcount;
 	loff_t offset;
-	u32 *page, *savep, *tailbase;
+	__be32 *page, *savep, *tailbase;
 	ENCODE_HEAD;
 
 	if (nfserr)
@@ -2189,8 +2356,7 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_re
 		goto err_no_verf;
 	}
 
-	svc_take_page(resp->rqstp);
-	page = page_address(resp->rqstp->rq_respages[resp->rqstp->rq_resused-1]);
+	page = page_address(resp->rqstp->rq_respages[resp->rqstp->rq_resused++]);
 	readdir->common.err = 0;
 	readdir->buflen = maxcount;
 	readdir->buffer = page;
@@ -2215,10 +2381,10 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_re
 	p = readdir->buffer;
 	*p++ = 0;	/* no more entries */
 	*p++ = htonl(readdir->common.err == nfserr_eof);
-	resp->xbuf->page_len = ((char*)p) - (char*)page_address(resp->rqstp->rq_respages[resp->rqstp->rq_resused-1]);
+	resp->xbuf->page_len = ((char*)p) - (char*)page_address(
+		resp->rqstp->rq_respages[resp->rqstp->rq_resused-1]);
 
 	/* Use rest of head for padding and remaining ops: */
-	resp->rqstp->rq_restailpage = 0;
 	resp->xbuf->tail[0].iov_base = tailbase;
 	resp->xbuf->tail[0].iov_len = 0;
 	resp->p = resp->xbuf->tail[0].iov_base;
@@ -2232,7 +2398,7 @@ err_no_verf:
 }
 
 static void
-nfsd4_encode_remove(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_remove *remove)
+nfsd4_encode_remove(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_remove *remove)
 {
 	ENCODE_HEAD;
 
@@ -2244,7 +2410,7 @@ nfsd4_encode_remove(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_rem
 }
 
 static void
-nfsd4_encode_rename(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_rename *rename)
+nfsd4_encode_rename(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_rename *rename)
 {
 	ENCODE_HEAD;
 
@@ -2261,7 +2427,7 @@ nfsd4_encode_rename(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_ren
  * regardless of the error status.
  */
 static void
-nfsd4_encode_setattr(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_setattr *setattr)
+nfsd4_encode_setattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_setattr *setattr)
 {
 	ENCODE_HEAD;
 
@@ -2280,7 +2446,7 @@ nfsd4_encode_setattr(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_se
 }
 
 static void
-nfsd4_encode_setclientid(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_setclientid *scd)
+nfsd4_encode_setclientid(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_setclientid *scd)
 {
 	ENCODE_HEAD;
 
@@ -2299,7 +2465,7 @@ nfsd4_encode_setclientid(struct nfsd4_compoundres *resp, int nfserr, struct nfsd
 }
 
 static void
-nfsd4_encode_write(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_write *write)
+nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_write *write)
 {
 	ENCODE_HEAD;
 
@@ -2315,7 +2481,7 @@ nfsd4_encode_write(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_writ
 void
 nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op)
 {
-	u32 *statp;
+	__be32 *statp;
 	ENCODE_HEAD;
 
 	RESERVE_SPACE(8);
@@ -2453,7 +2619,7 @@ nfsd4_encode_replay(struct nfsd4_compoundres *resp, struct nfsd4_op *op)
  */
 
 int
-nfs4svc_encode_voidres(struct svc_rqst *rqstp, u32 *p, void *dummy)
+nfs4svc_encode_voidres(struct svc_rqst *rqstp, __be32 *p, void *dummy)
 {
 	return xdr_ressize_check(rqstp, p);
 }
@@ -2475,9 +2641,9 @@ void nfsd4_release_compoundargs(struct nfsd4_compoundargs *args)
 }
 
 int
-nfs4svc_decode_compoundargs(struct svc_rqst *rqstp, u32 *p, struct nfsd4_compoundargs *args)
+nfs4svc_decode_compoundargs(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compoundargs *args)
 {
-	int status;
+	__be32 status;
 
 	args->p = p;
 	args->end = rqstp->rq_arg.head[0].iov_base + rqstp->rq_arg.head[0].iov_len;
@@ -2496,7 +2662,7 @@ nfs4svc_decode_compoundargs(struct svc_rqst *rqstp, u32 *p, struct nfsd4_compoun
 }
 
 int
-nfs4svc_encode_compoundres(struct svc_rqst *rqstp, u32 *p, struct nfsd4_compoundres *resp)
+nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compoundres *resp)
 {
 	/*
 	 * All that remains is to write the tag and operation count...
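
Both encoders above finish by padding the page data out to a 4-byte XDR boundary through the tail iovec: whenever maxcount&3 is non-zero, 4 - (maxcount&3) zero bytes follow the data. A standalone sketch of that alignment arithmetic (plain userspace C, not kernel code; the buffer and names are illustrative):

	#include <stdio.h>
	#include <string.h>

	/* Pad a payload of 'len' bytes out to the next 4-byte XDR boundary.
	 * Returns the number of pad bytes written (0..3), mirroring the
	 * 4 - (len & 3) computation in the readlink/readdir encoders. */
	static int xdr_pad(unsigned char *tail, int len)
	{
		int pad = 0;

		if (len & 3) {
			pad = 4 - (len & 3);
			memset(tail, 0, pad);	/* XDR pad bytes are zero */
		}
		return pad;
	}

	int main(void)
	{
		unsigned char tail[4];
		int len;

		for (len = 5; len <= 8; len++)
			printf("payload %d -> %d pad byte(s)\n",
			       len, xdr_pad(tail, len));
		return 0;
	}

Every variable-length XDR opaque is carried this way, which is why the same computation shows up again in the NFSv2 encoders in fs/nfsd/nfsxdr.c below.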
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index fdf7cf3dfadc..6100bbe27432 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -29,7 +29,7 @@
  */
 #define CACHESIZE		1024
 #define HASHSIZE		64
-#define REQHASH(xid)	((((xid) >> 24) ^ (xid)) & (HASHSIZE-1))
+#define REQHASH(xid)	(((((__force __u32)xid) >> 24) ^ ((__force __u32)xid)) & (HASHSIZE-1))
 
 static struct hlist_head *	hash_list;
 static struct list_head 	lru_head;
@@ -127,8 +127,8 @@ nfsd_cache_lookup(struct svc_rqst *rqstp, int type)
 	struct hlist_node	*hn;
 	struct hlist_head 	*rh;
 	struct svc_cacherep	*rp;
-	u32			xid = rqstp->rq_xid,
-				proto =  rqstp->rq_prot,
+	__be32			xid = rqstp->rq_xid;
+	u32			proto =  rqstp->rq_prot,
 				vers = rqstp->rq_vers,
 				proc = rqstp->rq_proc;
 	unsigned long		age;
@@ -258,7 +258,7 @@ found_entry:
  * In this case, nfsd_cache_update is called with statp == NULL.
  */
 void
-nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, u32 *statp)
+nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp)
 {
 	struct svc_cacherep	*rp;
 	struct kvec	*resv = &rqstp->rq_res.head[0], *cachv;
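
The __force casts in REQHASH() only tell sparse that stripping the byte-order annotation from the XID is intentional; the bucket arithmetic is unchanged. A userspace sketch of that hash, with sample XIDs invented for illustration:

	#include <stdio.h>
	#include <stdint.h>

	#define HASHSIZE	64
	#define REQHASH(xid)	((((xid) >> 24) ^ (xid)) & (HASHSIZE - 1))

	int main(void)
	{
		/* Many clients seed XIDs so most variation sits in one
		 * byte; XORing the top byte into the bottom keeps such
		 * streams from piling into a single bucket. */
		uint32_t xids[] = { 0x01000000, 0x02000000, 0x01000001 };
		unsigned int i;

		for (i = 0; i < sizeof(xids) / sizeof(xids[0]); i++)
			printf("xid %08x -> bucket %u\n", xids[i],
			       (unsigned)REQHASH(xids[i]));
		return 0;
	}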
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 5c6a477c20ec..39aed901514b 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -57,6 +57,7 @@ enum {
 	NFSD_Pool_Threads,
 	NFSD_Versions,
 	NFSD_Ports,
+	NFSD_MaxBlkSize,
 	/*
 	 * The below MUST come last.  Otherwise we leave a hole in nfsd_files[]
 	 * with !CONFIG_NFSD_V4 and simple_fill_super() goes oops
@@ -82,6 +83,7 @@ static ssize_t write_threads(struct file *file, char *buf, size_t size);
 static ssize_t write_pool_threads(struct file *file, char *buf, size_t size);
 static ssize_t write_versions(struct file *file, char *buf, size_t size);
 static ssize_t write_ports(struct file *file, char *buf, size_t size);
+static ssize_t write_maxblksize(struct file *file, char *buf, size_t size);
 #ifdef CONFIG_NFSD_V4
 static ssize_t write_leasetime(struct file *file, char *buf, size_t size);
 static ssize_t write_recoverydir(struct file *file, char *buf, size_t size);
@@ -100,6 +102,7 @@ static ssize_t (*write_op[])(struct file *, char *, size_t) = {
 	[NFSD_Pool_Threads] = write_pool_threads,
 	[NFSD_Versions] = write_versions,
 	[NFSD_Ports] = write_ports,
+	[NFSD_MaxBlkSize] = write_maxblksize,
 #ifdef CONFIG_NFSD_V4
 	[NFSD_Leasetime] = write_leasetime,
 	[NFSD_RecoveryDir] = write_recoverydir,
@@ -523,18 +526,20 @@ static ssize_t write_ports(struct file *file, char *buf, size_t size)
 		err = nfsd_create_serv();
 		if (!err) {
 			int proto = 0;
-			err = lockd_up(proto);
-			if (!err) {
-				err = svc_addsock(nfsd_serv, fd, buf, &proto);
-				if (err)
-					lockd_down();
+			err = svc_addsock(nfsd_serv, fd, buf, &proto);
+			if (err >= 0) {
+				err = lockd_up(proto);
+				if (err < 0)
+					svc_sock_names(buf+strlen(buf)+1, nfsd_serv, buf);
 			}
 			/* Decrease the count, but don't shutdown the
 			 * the service
 			 */
+			lock_kernel();
 			nfsd_serv->sv_nrthreads--;
+			unlock_kernel();
 		}
-		return err;
+		return err < 0 ? err : 0;
 	}
 	if (buf[0] == '-') {
 		char *toclose = kstrdup(buf+1, GFP_KERNEL);
@@ -545,12 +550,43 @@ static ssize_t write_ports(struct file *file, char *buf, size_t size)
 		if (nfsd_serv)
 			len = svc_sock_names(buf, nfsd_serv, toclose);
 		unlock_kernel();
+		if (len >= 0)
+			lockd_down();
 		kfree(toclose);
 		return len;
 	}
 	return -EINVAL;
 }
 
+int nfsd_max_blksize;
+
+static ssize_t write_maxblksize(struct file *file, char *buf, size_t size)
+{
+	char *mesg = buf;
+	if (size > 0) {
+		int bsize;
+		int rv = get_int(&mesg, &bsize);
+		if (rv)
+			return rv;
+		/* force bsize into allowed range and
+		 * required alignment.
+		 */
+		if (bsize < 1024)
+			bsize = 1024;
+		if (bsize > NFSSVC_MAXBLKSIZE)
+			bsize = NFSSVC_MAXBLKSIZE;
+		bsize &= ~(1024-1);
+		lock_kernel();
+		if (nfsd_serv && nfsd_serv->sv_nrthreads) {
+			unlock_kernel();
+			return -EBUSY;
+		}
+		nfsd_max_blksize = bsize;
+		unlock_kernel();
+	}
+	return sprintf(buf, "%d\n", nfsd_max_blksize);
+}
+
 #ifdef CONFIG_NFSD_V4
 extern time_t nfs4_leasetime(void);
 
@@ -616,6 +652,7 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
 		[NFSD_Pool_Threads] = {"pool_threads", &transaction_ops, S_IWUSR|S_IRUSR},
 		[NFSD_Versions] = {"versions", &transaction_ops, S_IWUSR|S_IRUSR},
 		[NFSD_Ports] = {"portlist", &transaction_ops, S_IWUSR|S_IRUGO},
+		[NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO},
 #ifdef CONFIG_NFSD_V4
 		[NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR|S_IRUSR},
 		[NFSD_RecoveryDir] = {"nfsv4recoverydir", &transaction_ops, S_IWUSR|S_IRUSR},
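
write_maxblksize() clamps the requested size into [1024, NFSSVC_MAXBLKSIZE] and then rounds down to a 1024-byte multiple with bsize &= ~(1024-1). A minimal sketch of that clamping (plain C; the 1MB ceiling here is an assumed stand-in for NFSSVC_MAXBLKSIZE):

	#include <stdio.h>

	#define MIN_BLKSIZE	1024
	#define MAX_BLKSIZE	(1024 * 1024)	/* illustrative ceiling */

	static int clamp_blksize(int bsize)
	{
		if (bsize < MIN_BLKSIZE)
			bsize = MIN_BLKSIZE;
		if (bsize > MAX_BLKSIZE)
			bsize = MAX_BLKSIZE;
		/* round down to the required 1K alignment */
		return bsize & ~(MIN_BLKSIZE - 1);
	}

	int main(void)
	{
		int requests[] = { 100, 1500, 4096, 9999999 };
		unsigned int i;

		for (i = 0; i < sizeof(requests) / sizeof(requests[0]); i++)
			printf("%d -> %d\n", requests[i],
			       clamp_blksize(requests[i]));
		return 0;
	}

The mask trick works because 1024 is a power of two, so ~(1024-1) clears exactly the low ten bits.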
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 501d83884530..727ab3bd450d 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -76,7 +76,7 @@ static int nfsd_acceptable(void *expv, struct dentry *dentry)
  * comment in the NFSv3 spec says this is incorrect (implementation notes for
  * the write call).
  */
-static inline int
+static inline __be32
 nfsd_mode_check(struct svc_rqst *rqstp, umode_t mode, int type)
 {
 	/* Type can be negative when creating hardlinks - not to a dir */
@@ -110,13 +110,13 @@ nfsd_mode_check(struct svc_rqst *rqstp, umode_t mode, int type)
  * This is only called at the start of an nfsproc call, so fhp points to
  * a svc_fh which is all 0 except for the over-the-wire file handle.
  */
-u32
+__be32
 fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access)
 {
 	struct knfsd_fh	*fh = &fhp->fh_handle;
 	struct svc_export *exp = NULL;
 	struct dentry	*dentry;
-	u32		error = 0;
+	__be32		error = 0;
 
 	dprintk("nfsd: fh_verify(%s)\n", SVCFH_fmt(fhp));
 
@@ -315,7 +315,7 @@ static inline void _fh_update_old(struct dentry *dentry,
 		fh->ofh_dirino = 0;
 }
 
-int
+__be32
 fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry, struct svc_fh *ref_fh)
 {
 	/* ref_fh is a reference file handle.
@@ -451,7 +451,7 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry, st
  * Update file handle information after changing a dentry.
  * This is only called by nfsd_create, nfsd_create_v3 and nfsd_proc_create
 */
-int
+__be32
 fh_update(struct svc_fh *fhp)
 {
 	struct dentry *dentry;
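
The payoff of retyping fh_verify(), fh_compose() and fh_update() is only visible under sparse: __be32 is a __bitwise type, so mixing it with host-order integers draws a warning instead of silently compiling. A compilable illustration of the annotation pattern (when sparse is not running, the attributes expand to nothing, which is roughly what the kernel's own headers do; the type and function names here are invented):

	#include <stdio.h>
	#include <stdint.h>
	#include <arpa/inet.h>

	#ifdef __CHECKER__			/* defined when sparse runs */
	#define __bitwise	__attribute__((bitwise))
	#define __force		__attribute__((force))
	#else
	#define __bitwise
	#define __force
	#endif

	typedef uint32_t __bitwise be32;	/* big-endian on the wire */

	static be32 make_status(uint32_t host_status)
	{
		/* the conversion point is explicit and greppable */
		return (__force be32)htonl(host_status);
	}

	int main(void)
	{
		be32 status = make_status(70);	/* an arbitrary status code */

		/* going back to host order again needs a visible cast */
		printf("host order: %u\n", ntohl((__force uint32_t)status));
		return 0;
	}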
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 06cd0db0f32b..ec983b777680 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -30,22 +30,22 @@ typedef struct svc_buf	svc_buf;
 #define NFSDDBG_FACILITY	NFSDDBG_PROC
 
 
-static int
+static __be32
 nfsd_proc_null(struct svc_rqst *rqstp, void *argp, void *resp)
 {
 	return nfs_ok;
 }
 
-static int
-nfsd_return_attrs(int err, struct nfsd_attrstat *resp)
+static __be32
+nfsd_return_attrs(__be32 err, struct nfsd_attrstat *resp)
 {
 	if (err) return err;
 	return nfserrno(vfs_getattr(resp->fh.fh_export->ex_mnt,
 				    resp->fh.fh_dentry,
 				    &resp->stat));
 }
-static int
-nfsd_return_dirop(int err, struct nfsd_diropres *resp)
+static __be32
+nfsd_return_dirop(__be32 err, struct nfsd_diropres *resp)
 {
 	if (err) return err;
 	return nfserrno(vfs_getattr(resp->fh.fh_export->ex_mnt,
@@ -56,11 +56,11 @@ nfsd_return_dirop(int err, struct nfsd_diropres *resp)
  * Get a file's attributes
  * N.B. After this call resp->fh needs an fh_put
  */
-static int
+static __be32
 nfsd_proc_getattr(struct svc_rqst *rqstp, struct nfsd_fhandle *argp,
 					  struct nfsd_attrstat *resp)
 {
-	int nfserr;
+	__be32 nfserr;
 	dprintk("nfsd: GETATTR %s\n", SVCFH_fmt(&argp->fh));
 
 	fh_copy(&resp->fh, &argp->fh);
@@ -72,11 +72,11 @@ nfsd_proc_getattr(struct svc_rqst *rqstp, struct nfsd_fhandle *argp,
  * Set a file's attributes
  * N.B. After this call resp->fh needs an fh_put
 */
-static int
+static __be32
 nfsd_proc_setattr(struct svc_rqst *rqstp, struct nfsd_sattrargs *argp,
 					  struct nfsd_attrstat *resp)
 {
-	int nfserr;
+	__be32 nfserr;
 	dprintk("nfsd: SETATTR %s, valid=%x, size=%ld\n",
 		SVCFH_fmt(&argp->fh),
 		argp->attrs.ia_valid, (long) argp->attrs.ia_size);
@@ -92,11 +92,11 @@ nfsd_proc_setattr(struct svc_rqst *rqstp, struct nfsd_sattrargs *argp,
  * doesn't exist yet.
 * N.B. After this call resp->fh needs an fh_put
 */
-static int
+static __be32
 nfsd_proc_lookup(struct svc_rqst *rqstp, struct nfsd_diropargs *argp,
 					 struct nfsd_diropres *resp)
 {
-	int nfserr;
+	__be32 nfserr;
 
 	dprintk("nfsd: LOOKUP %s %.*s\n",
 		SVCFH_fmt(&argp->fh), argp->len, argp->name);
@@ -112,11 +112,11 @@ nfsd_proc_lookup(struct svc_rqst *rqstp, struct nfsd_diropargs *argp,
 /*
 * Read a symlink.
 */
-static int
+static __be32
 nfsd_proc_readlink(struct svc_rqst *rqstp, struct nfsd_readlinkargs *argp,
 					   struct nfsd_readlinkres *resp)
 {
-	int nfserr;
+	__be32 nfserr;
 
 	dprintk("nfsd: READLINK %s\n", SVCFH_fmt(&argp->fh));
 
@@ -132,11 +132,11 @@ nfsd_proc_readlink(struct svc_rqst *rqstp, struct nfsd_readlinkargs *argp,
  * Read a portion of a file.
 * N.B. After this call resp->fh needs an fh_put
 */
-static int
+static __be32
 nfsd_proc_read(struct svc_rqst *rqstp, struct nfsd_readargs *argp,
 				       struct nfsd_readres *resp)
 {
-	int nfserr;
+	__be32 nfserr;
 
 	dprintk("nfsd: READ %s %d bytes at %d\n",
 		SVCFH_fmt(&argp->fh),
@@ -146,20 +146,20 @@ nfsd_proc_read(struct svc_rqst *rqstp, struct nfsd_readargs *argp,
 	 * status, 17 words for fattr, and 1 word for the byte count.
 	 */
 
-	if (NFSSVC_MAXBLKSIZE < argp->count) {
+	if (NFSSVC_MAXBLKSIZE_V2 < argp->count) {
 		printk(KERN_NOTICE
 			"oversized read request from %u.%u.%u.%u:%d (%d bytes)\n",
 				NIPQUAD(rqstp->rq_addr.sin_addr.s_addr),
 				ntohs(rqstp->rq_addr.sin_port),
 				argp->count);
-		argp->count = NFSSVC_MAXBLKSIZE;
+		argp->count = NFSSVC_MAXBLKSIZE_V2;
 	}
 	svc_reserve(rqstp, (19<<2) + argp->count + 4);
 
 	resp->count = argp->count;
 	nfserr = nfsd_read(rqstp, fh_copy(&resp->fh, &argp->fh), NULL,
 				  argp->offset,
-				  argp->vec, argp->vlen,
+				  rqstp->rq_vec, argp->vlen,
 				  &resp->count);
 
 	if (nfserr) return nfserr;
@@ -172,11 +172,11 @@ nfsd_proc_read(struct svc_rqst *rqstp, struct nfsd_readargs *argp,
  * Write data to a file
 * N.B. After this call resp->fh needs an fh_put
 */
-static int
+static __be32
 nfsd_proc_write(struct svc_rqst *rqstp, struct nfsd_writeargs *argp,
 					struct nfsd_attrstat *resp)
 {
-	int	nfserr;
+	__be32	nfserr;
 	int	stable = 1;
 
 	dprintk("nfsd: WRITE %s %d bytes at %d\n",
@@ -185,7 +185,7 @@ nfsd_proc_write(struct svc_rqst *rqstp, struct nfsd_writeargs *argp,
 
 	nfserr = nfsd_write(rqstp, fh_copy(&resp->fh, &argp->fh), NULL,
 				   argp->offset,
-				   argp->vec, argp->vlen,
+				   rqstp->rq_vec, argp->vlen,
 				   argp->len,
 				   &stable);
 	return nfsd_return_attrs(nfserr, resp);
@@ -197,7 +197,7 @@ nfsd_proc_write(struct svc_rqst *rqstp, struct nfsd_writeargs *argp,
  * and the actual create() call in compliance with VFS protocols.
 * N.B. After this call _both_ argp->fh and resp->fh need an fh_put
 */
-static int
+static __be32
 nfsd_proc_create(struct svc_rqst *rqstp, struct nfsd_createargs *argp,
 					 struct nfsd_diropres *resp)
 {
@@ -206,7 +206,8 @@ nfsd_proc_create(struct svc_rqst *rqstp, struct nfsd_createargs *argp,
 	struct iattr	*attr = &argp->attrs;
 	struct inode	*inode;
 	struct dentry	*dchild;
-	int		nfserr, type, mode;
+	int		type, mode;
+	__be32		nfserr;
 	dev_t		rdev = 0, wanted = new_decode_dev(attr->ia_size);
 
 	dprintk("nfsd: CREATE %s %.*s\n",
@@ -225,7 +226,7 @@ nfsd_proc_create(struct svc_rqst *rqstp, struct nfsd_createargs *argp,
 	nfserr = nfserr_exist;
 	if (isdotent(argp->name, argp->len))
 		goto done;
-	fh_lock(dirfhp);
+	fh_lock_nested(dirfhp, I_MUTEX_PARENT);
 	dchild = lookup_one_len(argp->name, dirfhp->fh_dentry, argp->len);
 	if (IS_ERR(dchild)) {
 		nfserr = nfserrno(PTR_ERR(dchild));
@@ -348,11 +349,11 @@ done:
 	return nfsd_return_dirop(nfserr, resp);
 }
 
-static int
+static __be32
 nfsd_proc_remove(struct svc_rqst *rqstp, struct nfsd_diropargs *argp,
 					 void *resp)
 {
-	int	nfserr;
+	__be32	nfserr;
 
 	dprintk("nfsd: REMOVE %s %.*s\n", SVCFH_fmt(&argp->fh),
 		argp->len, argp->name);
@@ -363,11 +364,11 @@ nfsd_proc_remove(struct svc_rqst *rqstp, struct nfsd_diropargs *argp,
 	return nfserr;
 }
 
-static int
+static __be32
 nfsd_proc_rename(struct svc_rqst *rqstp, struct nfsd_renameargs *argp,
 					 void *resp)
 {
-	int	nfserr;
+	__be32	nfserr;
 
 	dprintk("nfsd: RENAME %s %.*s -> \n",
 		SVCFH_fmt(&argp->ffh), argp->flen, argp->fname);
@@ -381,11 +382,11 @@ nfsd_proc_rename(struct svc_rqst *rqstp, struct nfsd_renameargs *argp,
 	return nfserr;
 }
 
-static int
+static __be32
 nfsd_proc_link(struct svc_rqst *rqstp, struct nfsd_linkargs *argp,
 				void *resp)
 {
-	int	nfserr;
+	__be32	nfserr;
 
 	dprintk("nfsd: LINK %s ->\n",
 		SVCFH_fmt(&argp->ffh));
@@ -401,12 +402,12 @@ nfsd_proc_link(struct svc_rqst *rqstp, struct nfsd_linkargs *argp,
 	return nfserr;
 }
 
-static int
+static __be32
 nfsd_proc_symlink(struct svc_rqst *rqstp, struct nfsd_symlinkargs *argp,
 					  void *resp)
 {
 	struct svc_fh	newfh;
-	int		nfserr;
+	__be32		nfserr;
 
 	dprintk("nfsd: SYMLINK %s %.*s -> %.*s\n",
 		SVCFH_fmt(&argp->ffh), argp->flen, argp->fname,
@@ -430,11 +431,11 @@ nfsd_proc_symlink(struct svc_rqst *rqstp, struct nfsd_symlinkargs *argp,
  * Make directory. This operation is not idempotent.
 * N.B. After this call resp->fh needs an fh_put
 */
-static int
+static __be32
 nfsd_proc_mkdir(struct svc_rqst *rqstp, struct nfsd_createargs *argp,
 					struct nfsd_diropres *resp)
 {
-	int	nfserr;
+	__be32	nfserr;
 
 	dprintk("nfsd: MKDIR %s %.*s\n", SVCFH_fmt(&argp->fh), argp->len, argp->name);
 
@@ -454,11 +455,11 @@ nfsd_proc_mkdir(struct svc_rqst *rqstp, struct nfsd_createargs *argp,
 /*
 * Remove a directory
 */
-static int
+static __be32
 nfsd_proc_rmdir(struct svc_rqst *rqstp, struct nfsd_diropargs *argp,
 					void *resp)
 {
-	int	nfserr;
+	__be32	nfserr;
 
 	dprintk("nfsd: RMDIR %s %.*s\n", SVCFH_fmt(&argp->fh), argp->len, argp->name);
 
@@ -470,11 +471,12 @@ nfsd_proc_rmdir(struct svc_rqst *rqstp, struct nfsd_diropargs *argp,
 /*
 * Read a portion of a directory.
 */
-static int
+static __be32
 nfsd_proc_readdir(struct svc_rqst *rqstp, struct nfsd_readdirargs *argp,
 					  struct nfsd_readdirres *resp)
 {
-	int		nfserr, count;
+	int		count;
+	__be32		nfserr;
 	loff_t		offset;
 
 	dprintk("nfsd: READDIR %s %d bytes at %d\n",
@@ -509,11 +511,11 @@ nfsd_proc_readdir(struct svc_rqst *rqstp, struct nfsd_readdirargs *argp,
 /*
 * Get file system info
 */
-static int
+static __be32
 nfsd_proc_statfs(struct svc_rqst * rqstp, struct nfsd_fhandle *argp,
 					  struct nfsd_statfsres *resp)
 {
-	int	nfserr;
+	__be32	nfserr;
 
 	dprintk("nfsd: STATFS %s\n", SVCFH_fmt(&argp->fh));
 
@@ -553,7 +555,7 @@ static struct svc_procedure nfsd_procedures2[18] = {
   PROC(none,	 void,		void,		none,		RC_NOCACHE, ST),
   PROC(lookup,	 diropargs,	diropres,	fhandle,	RC_NOCACHE, ST+FH+AT),
   PROC(readlink, readlinkargs,	readlinkres,	none,		RC_NOCACHE, ST+1+NFS_MAXPATHLEN/4),
-  PROC(read,	 readargs,	readres,	fhandle,	RC_NOCACHE, ST+AT+1+NFSSVC_MAXBLKSIZE/4),
+  PROC(read,	 readargs,	readres,	fhandle,	RC_NOCACHE, ST+AT+1+NFSSVC_MAXBLKSIZE_V2/4),
   PROC(none,	 void,		void,		none,		RC_NOCACHE, ST),
   PROC(write,	 writeargs,	attrstat,	fhandle,	RC_REPLBUFF, ST+AT),
   PROC(create,	 createargs,	diropres,	fhandle,	RC_REPLBUFF, ST+FH+AT),
@@ -579,11 +581,11 @@ struct svc_version nfsd_version2 = {
 /*
 * Map errnos to NFS errnos.
 */
-int
+__be32
 nfserrno (int errno)
 {
 	static struct {
-		int	nfserr;
+		__be32	nfserr;
 		int	syserr;
 	} nfs_errtbl[] = {
 		{ nfs_ok, 0 },
@@ -615,11 +617,10 @@ nfserrno (int errno)
 		{ nfserr_badname, -ESRCH },
 		{ nfserr_io, -ETXTBSY },
 		{ nfserr_notsupp, -EOPNOTSUPP },
-		{ -1, -EIO }
 	};
 	int	i;
 
-	for (i = 0; nfs_errtbl[i].nfserr != -1; i++) {
+	for (i = 0; i < ARRAY_SIZE(nfs_errtbl); i++) {
 		if (nfs_errtbl[i].syserr == errno)
 			return nfs_errtbl[i].nfserr;
 	}
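
Dropping the { -1, -EIO } sentinel from nfs_errtbl[] is only safe because the loop now bounds itself with ARRAY_SIZE() and the caller falls through to a default after the table. A reduced sketch of that table-driven mapping (toy status values, not the real network-order nfserr_* constants):

	#include <stdio.h>
	#include <errno.h>

	#define ARRAY_SIZE(a)	(sizeof(a) / sizeof((a)[0]))

	/* toy stand-ins for the NFS status codes */
	#define NFS_OK		0
	#define NFSERR_PERM	1
	#define NFSERR_NOENT	2
	#define NFSERR_IO	5

	static int nfs_status(int errno_val)
	{
		static const struct {
			int nfserr;
			int syserr;
		} errtbl[] = {
			{ NFS_OK,	0	},
			{ NFSERR_PERM,	-EPERM	},
			{ NFSERR_NOENT,	-ENOENT	},
		};
		unsigned int i;

		for (i = 0; i < ARRAY_SIZE(errtbl); i++)  /* no sentinel */
			if (errtbl[i].syserr == errno_val)
				return errtbl[i].nfserr;
		return NFSERR_IO;	/* default for unmapped errnos */
	}

	int main(void)
	{
		printf("-EPERM  -> %d\n", nfs_status(-EPERM));
		printf("-ENOSPC -> %d\n", nfs_status(-ENOSPC));
		return 0;
	}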
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 19443056ec30..0aaccb03bf76 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -198,9 +198,26 @@ int nfsd_create_serv(void)
 		unlock_kernel();
 		return 0;
 	}
+	if (nfsd_max_blksize == 0) {
+		/* choose a suitable default */
+		struct sysinfo i;
+		si_meminfo(&i);
+		/* Aim for 1/4096 of memory per thread
+		 * This gives 1MB on 4Gig machines
+		 * But only uses 32K on 128M machines.
+		 * Bottom out at 8K on 32M and smaller.
+		 * Of course, this is only a default.
+		 */
+		nfsd_max_blksize = NFSSVC_MAXBLKSIZE;
+		i.totalram <<= PAGE_SHIFT - 12;
+		while (nfsd_max_blksize > i.totalram &&
+		       nfsd_max_blksize >= 8*1024*2)
+			nfsd_max_blksize /= 2;
+	}
 
 	atomic_set(&nfsd_busy, 0);
-	nfsd_serv = svc_create_pooled(&nfsd_program, NFSD_BUFSIZE,
+	nfsd_serv = svc_create_pooled(&nfsd_program,
+				      nfsd_max_blksize,
 				      nfsd_last_thread,
 				      nfsd, SIG_NOCLEAN, THIS_MODULE);
 	if (nfsd_serv == NULL)
@@ -474,12 +491,12 @@ out:
 }
 
 int
-nfsd_dispatch(struct svc_rqst *rqstp, u32 *statp)
+nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
 {
 	struct svc_procedure	*proc;
 	kxdrproc_t		xdr;
-	u32			nfserr;
-	u32			*nfserrp;
+	__be32			nfserr;
+	__be32			*nfserrp;
 
 	dprintk("nfsd_dispatch: vers %d proc %d\n",
 				rqstp->rq_vers, rqstp->rq_proc);
@@ -498,7 +515,7 @@ nfsd_dispatch(struct svc_rqst *rqstp, u32 *statp)
 
 	/* Decode arguments */
 	xdr = proc->pc_decode;
-	if (xdr && !xdr(rqstp, (u32*)rqstp->rq_arg.head[0].iov_base,
+	if (xdr && !xdr(rqstp, (__be32*)rqstp->rq_arg.head[0].iov_base,
 			rqstp->rq_argp)) {
 		dprintk("nfsd: failed to decode arguments!\n");
 		nfsd_cache_update(rqstp, RC_NOCACHE, NULL);
@@ -511,7 +528,7 @@ nfsd_dispatch(struct svc_rqst *rqstp, u32 *statp)
 	 */
 	nfserrp = rqstp->rq_res.head[0].iov_base
 		+ rqstp->rq_res.head[0].iov_len;
-	rqstp->rq_res.head[0].iov_len += sizeof(u32);
+	rqstp->rq_res.head[0].iov_len += sizeof(__be32);
 
 	/* Now call the procedure handler, and encode NFS status. */
 	nfserr = proc->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp);
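
The default chosen in nfsd_create_serv() works in units: si_meminfo() reports totalram in pages, and totalram <<= PAGE_SHIFT - 12 turns that page count into RAM/4096 expressed in bytes, so comparing it against the byte-sized block size enforces the "1/4096 of memory" target. A userspace rendering of the same halving loop (the memory sizes are example inputs, and the 1MB start mirrors the patch's NFSSVC_MAXBLKSIZE):

	#include <stdio.h>

	#define MAXBLKSIZE	(1024 * 1024)	/* start at 1MB */

	static unsigned long long default_blksize(unsigned long long mem_bytes)
	{
		unsigned long long blksize = MAXBLKSIZE;
		unsigned long long limit = mem_bytes / 4096; /* 1/4096 of RAM */

		/* halve until within the limit, never dropping below 8K:
		 * the >= 16K guard allows exactly one halving from 16K */
		while (blksize > limit && blksize >= 8 * 1024 * 2)
			blksize /= 2;
		return blksize;
	}

	int main(void)
	{
		unsigned long long mems[] = {
			4ULL << 30,	/* 4GB  -> 1MB */
			128ULL << 20,	/* 128M -> 32K */
			32ULL << 20,	/* 32M  -> 8K  */
		};
		unsigned int i;

		for (i = 0; i < sizeof(mems) / sizeof(mems[0]); i++)
			printf("%12llu bytes RAM -> %llu byte blocks\n",
			       mems[i], default_blksize(mems[i]));
		return 0;
	}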
diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c
index 3f14a17eaa6e..56ebb1443e0e 100644
--- a/fs/nfsd/nfsxdr.c
+++ b/fs/nfsd/nfsxdr.c
@@ -37,8 +37,8 @@ static u32 nfs_ftypes[] = {
 /*
 * XDR functions for basic NFS types
 */
-static u32 *
-decode_fh(u32 *p, struct svc_fh *fhp)
+static __be32 *
+decode_fh(__be32 *p, struct svc_fh *fhp)
 {
 	fh_init(fhp, NFS_FHSIZE);
 	memcpy(&fhp->fh_handle.fh_base, p, NFS_FHSIZE);
@@ -50,13 +50,13 @@ decode_fh(u32 *p, struct svc_fh *fhp)
 }
 
 /* Helper function for NFSv2 ACL code */
-u32 *nfs2svc_decode_fh(u32 *p, struct svc_fh *fhp)
+__be32 *nfs2svc_decode_fh(__be32 *p, struct svc_fh *fhp)
 {
 	return decode_fh(p, fhp);
 }
 
-static inline u32 *
-encode_fh(u32 *p, struct svc_fh *fhp)
+static inline __be32 *
+encode_fh(__be32 *p, struct svc_fh *fhp)
 {
 	memcpy(p, &fhp->fh_handle.fh_base, NFS_FHSIZE);
 	return p + (NFS_FHSIZE>> 2);
@@ -66,8 +66,8 @@ encode_fh(u32 *p, struct svc_fh *fhp)
  * Decode a file name and make sure that the path contains
 * no slashes or null bytes.
 */
-static inline u32 *
-decode_filename(u32 *p, char **namp, int *lenp)
+static inline __be32 *
+decode_filename(__be32 *p, char **namp, int *lenp)
 {
 	char		*name;
 	int		i;
@@ -82,8 +82,8 @@ decode_filename(u32 *p, char **namp, int *lenp)
 	return p;
 }
 
-static inline u32 *
-decode_pathname(u32 *p, char **namp, int *lenp)
+static inline __be32 *
+decode_pathname(__be32 *p, char **namp, int *lenp)
 {
 	char		*name;
 	int		i;
@@ -98,8 +98,8 @@ decode_pathname(u32 *p, char **namp, int *lenp)
 	return p;
 }
 
-static inline u32 *
-decode_sattr(u32 *p, struct iattr *iap)
+static inline __be32 *
+decode_sattr(__be32 *p, struct iattr *iap)
 {
 	u32	tmp, tmp1;
 
@@ -151,8 +151,8 @@ decode_sattr(u32 *p, struct iattr *iap)
 	return p;
 }
 
-static u32 *
-encode_fattr(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp,
+static __be32 *
+encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp,
 	     struct kstat *stat)
 {
 	struct dentry	*dentry = fhp->fh_dentry;
@@ -195,7 +195,7 @@ encode_fattr(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp,
 }
 
 /* Helper function for NFSv2 ACL code */
-u32 *nfs2svc_encode_fattr(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp)
+__be32 *nfs2svc_encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp)
 {
 	struct kstat stat;
 	vfs_getattr(fhp->fh_export->ex_mnt, fhp->fh_dentry, &stat);
@@ -206,13 +206,13 @@ u32 *nfs2svc_encode_fattr(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp)
 * XDR decode functions
 */
 int
-nfssvc_decode_void(struct svc_rqst *rqstp, u32 *p, void *dummy)
+nfssvc_decode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy)
 {
 	return xdr_argsize_check(rqstp, p);
 }
 
 int
-nfssvc_decode_fhandle(struct svc_rqst *rqstp, u32 *p, struct nfsd_fhandle *args)
+nfssvc_decode_fhandle(struct svc_rqst *rqstp, __be32 *p, struct nfsd_fhandle *args)
 {
 	if (!(p = decode_fh(p, &args->fh)))
 		return 0;
@@ -220,7 +220,7 @@ nfssvc_decode_fhandle(struct svc_rqst *rqstp, u32 *p, struct nfsd_fhandle *args)
 }
 
 int
-nfssvc_decode_sattrargs(struct svc_rqst *rqstp, u32 *p,
+nfssvc_decode_sattrargs(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd_sattrargs *args)
 {
 	if (!(p = decode_fh(p, &args->fh))
@@ -231,7 +231,7 @@ nfssvc_decode_sattrargs(struct svc_rqst *rqstp, u32 *p,
 }
 
 int
-nfssvc_decode_diropargs(struct svc_rqst *rqstp, u32 *p,
+nfssvc_decode_diropargs(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd_diropargs *args)
 {
 	if (!(p = decode_fh(p, &args->fh))
@@ -242,7 +242,7 @@ nfssvc_decode_diropargs(struct svc_rqst *rqstp, u32 *p,
 }
 
 int
-nfssvc_decode_readargs(struct svc_rqst *rqstp, u32 *p,
+nfssvc_decode_readargs(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd_readargs *args)
 {
 	unsigned int len;
@@ -254,19 +254,18 @@ nfssvc_decode_readargs(struct svc_rqst *rqstp, u32 *p,
 	len = args->count     = ntohl(*p++);
 	p++; /* totalcount - unused */
 
-	if (len > NFSSVC_MAXBLKSIZE)
-		len = NFSSVC_MAXBLKSIZE;
+	if (len > NFSSVC_MAXBLKSIZE_V2)
+		len = NFSSVC_MAXBLKSIZE_V2;
 
 	/* set up somewhere to store response.
 	 * We take pages, put them on reslist and include in iovec
 	 */
 	v=0;
 	while (len > 0) {
-		pn=rqstp->rq_resused;
-		svc_take_page(rqstp);
-		args->vec[v].iov_base = page_address(rqstp->rq_respages[pn]);
-		args->vec[v].iov_len = len < PAGE_SIZE?len:PAGE_SIZE;
-		len -= args->vec[v].iov_len;
+		pn = rqstp->rq_resused++;
+		rqstp->rq_vec[v].iov_base = page_address(rqstp->rq_respages[pn]);
+		rqstp->rq_vec[v].iov_len = len < PAGE_SIZE?len:PAGE_SIZE;
+		len -= rqstp->rq_vec[v].iov_len;
 		v++;
 	}
 	args->vlen = v;
@@ -274,7 +273,7 @@ nfssvc_decode_readargs(struct svc_rqst *rqstp, u32 *p,
 }
 
 int
-nfssvc_decode_writeargs(struct svc_rqst *rqstp, u32 *p,
+nfssvc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd_writeargs *args)
 {
 	unsigned int len;
@@ -286,25 +285,25 @@ nfssvc_decode_writeargs(struct svc_rqst *rqstp, u32 *p,
 	args->offset = ntohl(*p++);	/* offset */
 	p++;				/* totalcount */
 	len = args->len = ntohl(*p++);
-	args->vec[0].iov_base = (void*)p;
-	args->vec[0].iov_len = rqstp->rq_arg.head[0].iov_len -
+	rqstp->rq_vec[0].iov_base = (void*)p;
+	rqstp->rq_vec[0].iov_len = rqstp->rq_arg.head[0].iov_len -
 		(((void*)p) - rqstp->rq_arg.head[0].iov_base);
-	if (len > NFSSVC_MAXBLKSIZE)
-		len = NFSSVC_MAXBLKSIZE;
+	if (len > NFSSVC_MAXBLKSIZE_V2)
+		len = NFSSVC_MAXBLKSIZE_V2;
 	v = 0;
-	while (len > args->vec[v].iov_len) {
-		len -= args->vec[v].iov_len;
+	while (len > rqstp->rq_vec[v].iov_len) {
+		len -= rqstp->rq_vec[v].iov_len;
 		v++;
-		args->vec[v].iov_base = page_address(rqstp->rq_argpages[v]);
-		args->vec[v].iov_len = PAGE_SIZE;
+		rqstp->rq_vec[v].iov_base = page_address(rqstp->rq_pages[v]);
+		rqstp->rq_vec[v].iov_len = PAGE_SIZE;
 	}
-	args->vec[v].iov_len = len;
+	rqstp->rq_vec[v].iov_len = len;
 	args->vlen = v+1;
-	return args->vec[0].iov_len > 0;
+	return rqstp->rq_vec[0].iov_len > 0;
 }
 
 int
-nfssvc_decode_createargs(struct svc_rqst *rqstp, u32 *p,
+nfssvc_decode_createargs(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd_createargs *args)
 {
 	if (!(p = decode_fh(p, &args->fh))
@@ -316,7 +315,7 @@ nfssvc_decode_createargs(struct svc_rqst *rqstp, u32 *p,
 }
 
 int
-nfssvc_decode_renameargs(struct svc_rqst *rqstp, u32 *p,
+nfssvc_decode_renameargs(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd_renameargs *args)
 {
 	if (!(p = decode_fh(p, &args->ffh))
@@ -329,18 +328,17 @@ nfssvc_decode_renameargs(struct svc_rqst *rqstp, u32 *p,
 }
 
 int
-nfssvc_decode_readlinkargs(struct svc_rqst *rqstp, u32 *p, struct nfsd_readlinkargs *args)
+nfssvc_decode_readlinkargs(struct svc_rqst *rqstp, __be32 *p, struct nfsd_readlinkargs *args)
 {
 	if (!(p = decode_fh(p, &args->fh)))
 		return 0;
-	svc_take_page(rqstp);
-	args->buffer = page_address(rqstp->rq_respages[rqstp->rq_resused-1]);
+	args->buffer = page_address(rqstp->rq_respages[rqstp->rq_resused++]);
 
 	return xdr_argsize_check(rqstp, p);
 }
 
 int
-nfssvc_decode_linkargs(struct svc_rqst *rqstp, u32 *p,
+nfssvc_decode_linkargs(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd_linkargs *args)
 {
 	if (!(p = decode_fh(p, &args->ffh))
@@ -352,7 +350,7 @@ nfssvc_decode_linkargs(struct svc_rqst *rqstp, u32 *p,
 }
 
 int
-nfssvc_decode_symlinkargs(struct svc_rqst *rqstp, u32 *p,
+nfssvc_decode_symlinkargs(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd_symlinkargs *args)
 {
 	if (!(p = decode_fh(p, &args->ffh))
@@ -365,7 +363,7 @@ nfssvc_decode_symlinkargs(struct svc_rqst *rqstp, u32 *p,
 }
 
 int
-nfssvc_decode_readdirargs(struct svc_rqst *rqstp, u32 *p,
+nfssvc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd_readdirargs *args)
 {
 	if (!(p = decode_fh(p, &args->fh)))
@@ -375,8 +373,7 @@ nfssvc_decode_readdirargs(struct svc_rqst *rqstp, u32 *p,
 	if (args->count > PAGE_SIZE)
 		args->count = PAGE_SIZE;
 
-	svc_take_page(rqstp);
-	args->buffer = page_address(rqstp->rq_respages[rqstp->rq_resused-1]);
+	args->buffer = page_address(rqstp->rq_respages[rqstp->rq_resused++]);
 
 	return xdr_argsize_check(rqstp, p);
 }
@@ -385,13 +382,13 @@ nfssvc_decode_readdirargs(struct svc_rqst *rqstp, u32 *p,
 * XDR encode functions
 */
 int
-nfssvc_encode_void(struct svc_rqst *rqstp, u32 *p, void *dummy)
+nfssvc_encode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy)
 {
 	return xdr_ressize_check(rqstp, p);
 }
 
 int
-nfssvc_encode_attrstat(struct svc_rqst *rqstp, u32 *p,
+nfssvc_encode_attrstat(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd_attrstat *resp)
 {
 	p = encode_fattr(rqstp, p, &resp->fh, &resp->stat);
@@ -399,7 +396,7 @@ nfssvc_encode_attrstat(struct svc_rqst *rqstp, u32 *p,
 }
 
 int
-nfssvc_encode_diropres(struct svc_rqst *rqstp, u32 *p,
+nfssvc_encode_diropres(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd_diropres *resp)
 {
 	p = encode_fh(p, &resp->fh);
@@ -408,7 +405,7 @@ nfssvc_encode_diropres(struct svc_rqst *rqstp, u32 *p,
 }
 
 int
-nfssvc_encode_readlinkres(struct svc_rqst *rqstp, u32 *p,
+nfssvc_encode_readlinkres(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd_readlinkres *resp)
 {
 	*p++ = htonl(resp->len);
@@ -416,7 +413,6 @@ nfssvc_encode_readlinkres(struct svc_rqst *rqstp, u32 *p,
 	rqstp->rq_res.page_len = resp->len;
 	if (resp->len & 3) {
 		/* need to pad the tail */
-		rqstp->rq_restailpage = 0;
 		rqstp->rq_res.tail[0].iov_base = p;
 		*p = 0;
 		rqstp->rq_res.tail[0].iov_len = 4 - (resp->len&3);
@@ -425,7 +421,7 @@ nfssvc_encode_readlinkres(struct svc_rqst *rqstp, u32 *p,
 }
 
 int
-nfssvc_encode_readres(struct svc_rqst *rqstp, u32 *p,
+nfssvc_encode_readres(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd_readres *resp)
 {
 	p = encode_fattr(rqstp, p, &resp->fh, &resp->stat);
@@ -436,7 +432,6 @@ nfssvc_encode_readres(struct svc_rqst *rqstp, u32 *p,
 	rqstp->rq_res.page_len = resp->count;
 	if (resp->count & 3) {
 		/* need to pad the tail */
-		rqstp->rq_restailpage = 0;
 		rqstp->rq_res.tail[0].iov_base = p;
 		*p = 0;
 		rqstp->rq_res.tail[0].iov_len = 4 - (resp->count&3);
@@ -445,7 +440,7 @@ nfssvc_encode_readres(struct svc_rqst *rqstp, u32 *p,
 }
 
 int
-nfssvc_encode_readdirres(struct svc_rqst *rqstp, u32 *p,
+nfssvc_encode_readdirres(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd_readdirres *resp)
 {
 	xdr_ressize_check(rqstp, p);
@@ -458,12 +453,12 @@ nfssvc_encode_readdirres(struct svc_rqst *rqstp, u32 *p,
 }
 
 int
-nfssvc_encode_statfsres(struct svc_rqst *rqstp, u32 *p,
+nfssvc_encode_statfsres(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd_statfsres *resp)
 {
 	struct kstatfs	*stat = &resp->stats;
 
-	*p++ = htonl(NFSSVC_MAXBLKSIZE);	/* max transfer size */
+	*p++ = htonl(NFSSVC_MAXBLKSIZE_V2);	/* max transfer size */
 	*p++ = htonl(stat->f_bsize);
 	*p++ = htonl(stat->f_blocks);
 	*p++ = htonl(stat->f_bfree);
@@ -476,7 +471,7 @@ nfssvc_encode_entry(struct readdir_cd *ccd, const char *name,
 		    int namlen, loff_t offset, ino_t ino, unsigned int d_type)
 {
 	struct nfsd_readdirres *cd = container_of(ccd, struct nfsd_readdirres, common);
-	u32	*p = cd->buffer;
+	__be32	*p = cd->buffer;
 	int	buflen, slen;
 
 	/*
@@ -502,7 +497,7 @@ nfssvc_encode_entry(struct readdir_cd *ccd, const char *name,
 	*p++ = htonl((u32) ino);	/* file id */
 	p    = xdr_encode_array(p, name, namlen);/* name length & name */
 	cd->offset = p;			/* remember pointer */
-	*p++ = ~(u32) 0;		/* offset of next entry */
+	*p++ = htonl(~0U);		/* offset of next entry */
 
 	cd->buflen = buflen;
 	cd->buffer = p;
@@ -514,7 +509,7 @@ nfssvc_encode_entry(struct readdir_cd *ccd, const char *name,
 * XDR release functions
 */
 int
-nfssvc_release_fhandle(struct svc_rqst *rqstp, u32 *p,
+nfssvc_release_fhandle(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd_fhandle *resp)
 {
 	fh_put(&resp->fh);
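
nfssvc_decode_readargs() above now fills rqstp->rq_vec directly, carving the requested length into page-sized segments. The carving itself is simple arithmetic; a standalone sketch under assumed values (the PAGE_SIZE, segment struct and limit are illustrative, not the kernel's types):

	#include <stdio.h>

	#define PAGE_SIZE	4096
	#define MAX_SEGS	8

	struct seg {
		unsigned long	len;	/* stand-in for a kvec's iov_len */
	};

	/* split 'len' bytes across page-sized segments; returns count */
	static int fill_vec(struct seg *vec, long len)
	{
		int v = 0;

		while (len > 0 && v < MAX_SEGS) {
			vec[v].len = len < PAGE_SIZE ? len : PAGE_SIZE;
			len -= vec[v].len;
			v++;
		}
		return v;
	}

	int main(void)
	{
		struct seg vec[MAX_SEGS];
		int n = fill_vec(vec, 10000), i;

		for (i = 0; i < n; i++)
			printf("seg %d: %lu bytes\n", i, vec[i].len);
		return 0;
	}

For 10000 bytes this yields segments of 4096, 4096 and 1808 bytes, which is exactly the shape the decoder hands to nfsd_read().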
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 443ebc52e382..f21e917bb8ed 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -54,6 +54,7 @@
 #include <linux/nfsd_idmap.h>
 #include <linux/security.h>
 #endif /* CONFIG_NFSD_V4 */
+#include <linux/jhash.h>
 
 #include <asm/uaccess.h>
 
@@ -81,10 +82,19 @@ struct raparms {
 	dev_t			p_dev;
 	int			p_set;
 	struct file_ra_state	p_ra;
+	unsigned int		p_hindex;
 };
 
+struct raparm_hbucket {
+	struct raparms		*pb_head;
+	spinlock_t		pb_lock;
+} ____cacheline_aligned_in_smp;
+
 static struct raparms *		raparml;
-static struct raparms *		raparm_cache;
+#define RAPARM_HASH_BITS	4
+#define RAPARM_HASH_SIZE	(1<<RAPARM_HASH_BITS)
+#define RAPARM_HASH_MASK	(RAPARM_HASH_SIZE-1)
+static struct raparm_hbucket	raparm_hash[RAPARM_HASH_SIZE];
 
 /*
 * Called from nfsd_lookup and encode_dirent. Check if we have crossed
@@ -100,7 +110,7 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp,
 	struct dentry *dentry = *dpp;
 	struct vfsmount *mnt = mntget(exp->ex_mnt);
 	struct dentry *mounts = dget(dentry);
-	int err = nfs_ok;
+	int err = 0;
 
 	while (follow_down(&mnt,&mounts)&&d_mountpoint(mounts));
 
@@ -138,14 +148,15 @@ out:
  * clients and is explicitly disallowed for NFSv3
  *      NeilBrown <neilb@cse.unsw.edu.au>
 */
-int
+__be32
 nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name,
 					int len, struct svc_fh *resfh)
 {
 	struct svc_export	*exp;
 	struct dentry		*dparent;
 	struct dentry		*dentry;
-	int			err;
+	__be32			err;
+	int			host_err;
 
 	dprintk("nfsd: nfsd_lookup(fh %s, %.*s)\n", SVCFH_fmt(fhp), len,name);
 
@@ -183,7 +194,7 @@ nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name,
 			exp2 = exp_parent(exp->ex_client, mnt, dentry,
 					  &rqstp->rq_chandle);
 			if (IS_ERR(exp2)) {
-				err = PTR_ERR(exp2);
+				host_err = PTR_ERR(exp2);
 				dput(dentry);
 				mntput(mnt);
 				goto out_nfserr;
@@ -200,14 +211,14 @@ nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name,
 	} else {
 		fh_lock(fhp);
 		dentry = lookup_one_len(name, dparent, len);
-		err = PTR_ERR(dentry);
+		host_err = PTR_ERR(dentry);
 		if (IS_ERR(dentry))
 			goto out_nfserr;
 		/*
 		 * check if we have crossed a mount point ...
 		 */
 		if (d_mountpoint(dentry)) {
-			if ((err = nfsd_cross_mnt(rqstp, &dentry, &exp))) {
+			if ((host_err = nfsd_cross_mnt(rqstp, &dentry, &exp))) {
 				dput(dentry);
 				goto out_nfserr;
 			}
@@ -226,7 +237,7 @@ out:
 	return err;
 
 out_nfserr:
-	err = nfserrno(err);
+	err = nfserrno(host_err);
 	goto out;
 }
 
@@ -234,7 +245,7 @@ out_nfserr:
  * Set various file attributes.
 * N.B. After this call fhp needs an fh_put
 */
-int
+__be32
 nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
 	     int check_guard, time_t guardtime)
 {
@@ -243,7 +254,8 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
 	int		accmode = MAY_SATTR;
 	int		ftype = 0;
 	int		imode;
-	int		err;
+	__be32		err;
+	int		host_err;
 	int		size_change = 0;
 
 	if (iap->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_SIZE))
@@ -309,19 +321,19 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
 		 * If we are changing the size of the file, then
 		 * we need to break all leases.
 		 */
-		err = break_lease(inode, FMODE_WRITE | O_NONBLOCK);
-		if (err == -EWOULDBLOCK)
-			err = -ETIMEDOUT;
-		if (err) /* ENOMEM or EWOULDBLOCK */
+		host_err = break_lease(inode, FMODE_WRITE | O_NONBLOCK);
+		if (host_err == -EWOULDBLOCK)
+			host_err = -ETIMEDOUT;
+		if (host_err) /* ENOMEM or EWOULDBLOCK */
 			goto out_nfserr;
 
-		err = get_write_access(inode);
-		if (err)
+		host_err = get_write_access(inode);
+		if (host_err)
 			goto out_nfserr;
 
 		size_change = 1;
-		err = locks_verify_truncate(inode, NULL, iap->ia_size);
-		if (err) {
+		host_err = locks_verify_truncate(inode, NULL, iap->ia_size);
+		if (host_err) {
 			put_write_access(inode);
 			goto out_nfserr;
 		}
@@ -347,8 +359,8 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
 	err = nfserr_notsync;
 	if (!check_guard || guardtime == inode->i_ctime.tv_sec) {
 		fh_lock(fhp);
-		err = notify_change(dentry, iap);
-		err = nfserrno(err);
+		host_err = notify_change(dentry, iap);
+		err = nfserrno(host_err);
 		fh_unlock(fhp);
 	}
 	if (size_change)
@@ -360,7 +372,7 @@ out:
 	return err;
 
 out_nfserr:
-	err = nfserrno(err);
+	err = nfserrno(host_err);
 	goto out;
 }
 
@@ -410,11 +422,12 @@ out:
 	return error;
 }
 
-int
+__be32
 nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp,
     struct nfs4_acl *acl)
 {
-	int error;
+	__be32 error;
+	int host_error;
 	struct dentry *dentry;
 	struct inode *inode;
 	struct posix_acl *pacl = NULL, *dpacl = NULL;
@@ -430,22 +443,20 @@ nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	if (S_ISDIR(inode->i_mode))
 		flags = NFS4_ACL_DIR;
 
-	error = nfs4_acl_nfsv4_to_posix(acl, &pacl, &dpacl, flags);
-	if (error == -EINVAL) {
+	host_error = nfs4_acl_nfsv4_to_posix(acl, &pacl, &dpacl, flags);
+	if (host_error == -EINVAL) {
 		error = nfserr_attrnotsupp;
 		goto out;
-	} else if (error < 0)
+	} else if (host_error < 0)
 		goto out_nfserr;
 
-	if (pacl) {
-		error = set_nfsv4_acl_one(dentry, pacl, POSIX_ACL_XATTR_ACCESS);
-		if (error < 0)
-			goto out_nfserr;
-	}
+	host_error = set_nfsv4_acl_one(dentry, pacl, POSIX_ACL_XATTR_ACCESS);
+	if (host_error < 0)
+		goto out_nfserr;
 
-	if (dpacl) {
-		error = set_nfsv4_acl_one(dentry, dpacl, POSIX_ACL_XATTR_DEFAULT);
-		if (error < 0)
+	if (S_ISDIR(inode->i_mode)) {
+		host_error = set_nfsv4_acl_one(dentry, dpacl, POSIX_ACL_XATTR_DEFAULT);
+		if (host_error < 0)
 			goto out_nfserr;
 	}
 
@@ -456,7 +467,7 @@ out:
 	posix_acl_release(dpacl);
 	return (error);
 out_nfserr:
-	error = nfserrno(error);
+	error = nfserrno(host_error);
 	goto out;
 }
 
@@ -563,14 +574,14 @@ static struct accessmap nfs3_anyaccess[] = {
     {	0,			0				}
 };
 
-int
+__be32
 nfsd_access(struct svc_rqst *rqstp, struct svc_fh *fhp, u32 *access, u32 *supported)
 {
 	struct accessmap	*map;
 	struct svc_export	*export;
 	struct dentry		*dentry;
 	u32			query, result = 0, sresult = 0;
-	unsigned int		error;
+	__be32			error;
 
 	error = fh_verify(rqstp, fhp, 0, MAY_NOP);
 	if (error)
@@ -590,7 +601,7 @@ nfsd_access(struct svc_rqst *rqstp, struct svc_fh *fhp, u32 *access, u32 *suppor
 	query = *access;
 	for (; map->access; map++) {
 		if (map->access & query) {
-			unsigned int err2;
+			__be32 err2;
 
 			sresult |= map->access;
 
@@ -629,13 +640,15 @@ nfsd_access(struct svc_rqst *rqstp, struct svc_fh *fhp, u32 *access, u32 *suppor
  * The access argument indicates the type of open (read/write/lock)
 * N.B. After this call fhp needs an fh_put
 */
-int
+__be32
 nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
 			int access, struct file **filp)
 {
 	struct dentry	*dentry;
 	struct inode	*inode;
-	int		flags = O_RDONLY|O_LARGEFILE, err;
+	int		flags = O_RDONLY|O_LARGEFILE;
+	__be32		err;
+	int		host_err;
 
 	/*
 	 * If we get here, then the client has already done an "open",
@@ -665,10 +678,10 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
 	 * Check to see if there are any leases on this file.
 	 * This may block while leases are broken.
 	 */
-	err = break_lease(inode, O_NONBLOCK | ((access & MAY_WRITE) ? FMODE_WRITE : 0));
-	if (err == -EWOULDBLOCK)
-		err = -ETIMEDOUT;
-	if (err) /* NOMEM or WOULDBLOCK */
+	host_err = break_lease(inode, O_NONBLOCK | ((access & MAY_WRITE) ? FMODE_WRITE : 0));
+	if (host_err == -EWOULDBLOCK)
+		host_err = -ETIMEDOUT;
+	if (host_err) /* NOMEM or WOULDBLOCK */
672 goto out_nfserr; 685 goto out_nfserr;
673 686
674 if (access & MAY_WRITE) { 687 if (access & MAY_WRITE) {
@@ -681,10 +694,9 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
681 } 694 }
682 *filp = dentry_open(dget(dentry), mntget(fhp->fh_export->ex_mnt), flags); 695 *filp = dentry_open(dget(dentry), mntget(fhp->fh_export->ex_mnt), flags);
683 if (IS_ERR(*filp)) 696 if (IS_ERR(*filp))
684 err = PTR_ERR(*filp); 697 host_err = PTR_ERR(*filp);
685out_nfserr: 698out_nfserr:
686 if (err) 699 err = nfserrno(host_err);
687 err = nfserrno(err);
688out: 700out:
689 return err; 701 return err;
690} 702}
@@ -743,16 +755,20 @@ nfsd_sync_dir(struct dentry *dp)
743 * Obtain the readahead parameters for the file 755 * Obtain the readahead parameters for the file
744 * specified by (dev, ino). 756 * specified by (dev, ino).
745 */ 757 */
746static DEFINE_SPINLOCK(ra_lock);
747 758
748static inline struct raparms * 759static inline struct raparms *
749nfsd_get_raparms(dev_t dev, ino_t ino) 760nfsd_get_raparms(dev_t dev, ino_t ino)
750{ 761{
751 struct raparms *ra, **rap, **frap = NULL; 762 struct raparms *ra, **rap, **frap = NULL;
752 int depth = 0; 763 int depth = 0;
764 unsigned int hash;
765 struct raparm_hbucket *rab;
753 766
754 spin_lock(&ra_lock); 767 hash = jhash_2words(dev, ino, 0xfeedbeef) & RAPARM_HASH_MASK;
755 for (rap = &raparm_cache; (ra = *rap); rap = &ra->p_next) { 768 rab = &raparm_hash[hash];
769
770 spin_lock(&rab->pb_lock);
771 for (rap = &rab->pb_head; (ra = *rap); rap = &ra->p_next) {
756 if (ra->p_ino == ino && ra->p_dev == dev) 772 if (ra->p_ino == ino && ra->p_dev == dev)
757 goto found; 773 goto found;
758 depth++; 774 depth++;
@@ -761,7 +777,7 @@ nfsd_get_raparms(dev_t dev, ino_t ino)
761 } 777 }
762 depth = nfsdstats.ra_size*11/10; 778 depth = nfsdstats.ra_size*11/10;
763 if (!frap) { 779 if (!frap) {
764 spin_unlock(&ra_lock); 780 spin_unlock(&rab->pb_lock);
765 return NULL; 781 return NULL;
766 } 782 }
767 rap = frap; 783 rap = frap;
@@ -769,15 +785,16 @@ nfsd_get_raparms(dev_t dev, ino_t ino)
769 ra->p_dev = dev; 785 ra->p_dev = dev;
770 ra->p_ino = ino; 786 ra->p_ino = ino;
771 ra->p_set = 0; 787 ra->p_set = 0;
788 ra->p_hindex = hash;
772found: 789found:
773 if (rap != &raparm_cache) { 790 if (rap != &rab->pb_head) {
774 *rap = ra->p_next; 791 *rap = ra->p_next;
775 ra->p_next = raparm_cache; 792 ra->p_next = rab->pb_head;
776 raparm_cache = ra; 793 rab->pb_head = ra;
777 } 794 }
778 ra->p_count++; 795 ra->p_count++;
779 nfsdstats.ra_depth[depth*10/nfsdstats.ra_size]++; 796 nfsdstats.ra_depth[depth*10/nfsdstats.ra_size]++;
780 spin_unlock(&ra_lock); 797 spin_unlock(&rab->pb_lock);
781 return ra; 798 return ra;
782} 799}
783 800
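nfsd_get_raparms() above trades one global list under ra_lock for RAPARM_HASH_SIZE buckets, each with its own head and spinlock, so lookups for different (dev, ino) pairs no longer serialize on a single lock. A hedged userspace sketch of the lookup side; the bucket count and hash function here are illustrative, not the kernel's jhash_2words().

#include <pthread.h>
#include <stddef.h>

#define RA_HASH_BITS 4
#define RA_HASH_SIZE (1 << RA_HASH_BITS)
#define RA_HASH_MASK (RA_HASH_SIZE - 1)

struct raparm {
	unsigned long p_dev, p_ino;
	unsigned int p_hindex;          /* bucket index, kept for the put side */
	struct raparm *p_next;
};

static struct ra_bucket {
	struct raparm *head;
	pthread_mutex_t lock;           /* one lock per bucket, not one global */
} ra_hash[RA_HASH_SIZE];

static void ra_init(void)
{
	for (int i = 0; i < RA_HASH_SIZE; i++) {
		ra_hash[i].head = NULL;
		pthread_mutex_init(&ra_hash[i].lock, NULL);
	}
}

static struct raparm *ra_lookup(unsigned long dev, unsigned long ino)
{
	unsigned int h = (unsigned int)(dev ^ ino * 2654435761u) & RA_HASH_MASK;
	struct ra_bucket *b = &ra_hash[h];
	struct raparm *ra;

	pthread_mutex_lock(&b->lock);   /* contention is now per bucket */
	for (ra = b->head; ra; ra = ra->p_next)
		if (ra->p_dev == dev && ra->p_ino == ino)
			break;
	pthread_mutex_unlock(&b->lock);
	return ra;
}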
@@ -791,36 +808,41 @@ nfsd_read_actor(read_descriptor_t *desc, struct page *page, unsigned long offset
791{ 808{
792 unsigned long count = desc->count; 809 unsigned long count = desc->count;
793 struct svc_rqst *rqstp = desc->arg.data; 810 struct svc_rqst *rqstp = desc->arg.data;
811 struct page **pp = rqstp->rq_respages + rqstp->rq_resused;
794 812
795 if (size > count) 813 if (size > count)
796 size = count; 814 size = count;
797 815
798 if (rqstp->rq_res.page_len == 0) { 816 if (rqstp->rq_res.page_len == 0) {
799 get_page(page); 817 get_page(page);
800 rqstp->rq_respages[rqstp->rq_resused++] = page; 818 put_page(*pp);
819 *pp = page;
820 rqstp->rq_resused++;
801 rqstp->rq_res.page_base = offset; 821 rqstp->rq_res.page_base = offset;
802 rqstp->rq_res.page_len = size; 822 rqstp->rq_res.page_len = size;
803 } else if (page != rqstp->rq_respages[rqstp->rq_resused-1]) { 823 } else if (page != pp[-1]) {
804 get_page(page); 824 get_page(page);
805 rqstp->rq_respages[rqstp->rq_resused++] = page; 825 put_page(*pp);
826 *pp = page;
827 rqstp->rq_resused++;
806 rqstp->rq_res.page_len += size; 828 rqstp->rq_res.page_len += size;
807 } else { 829 } else
808 rqstp->rq_res.page_len += size; 830 rqstp->rq_res.page_len += size;
809 }
810 831
811 desc->count = count - size; 832 desc->count = count - size;
812 desc->written += size; 833 desc->written += size;
813 return size; 834 return size;
814} 835}
815 836
816static int 837static __be32
817nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, 838nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
818 loff_t offset, struct kvec *vec, int vlen, unsigned long *count) 839 loff_t offset, struct kvec *vec, int vlen, unsigned long *count)
819{ 840{
820 struct inode *inode; 841 struct inode *inode;
821 struct raparms *ra; 842 struct raparms *ra;
822 mm_segment_t oldfs; 843 mm_segment_t oldfs;
823 int err; 844 __be32 err;
845 int host_err;
824 846
825 err = nfserr_perm; 847 err = nfserr_perm;
826 inode = file->f_dentry->d_inode; 848 inode = file->f_dentry->d_inode;
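In the nfsd_read_actor() hunk above, a response-page slot that may already hold a reserved page is replaced rather than appended: the incoming page is pinned with get_page() before the old occupant is dropped with put_page(). A toy refcount model of that swap, assuming (as the kernel code does for rq_respages) that the slot is never empty:

#include <assert.h>

struct page { int refcount; };

static void get_page(struct page *p) { p->refcount++; }
static void put_page(struct page *p) { assert(p->refcount > 0); p->refcount--; }

static void replace_slot(struct page **slot, struct page *new_page)
{
	get_page(new_page);     /* pin the incoming page first */
	put_page(*slot);        /* then release the page being displaced */
	*slot = new_page;
}

int main(void)
{
	struct page a = { .refcount = 1 }, b = { .refcount = 1 };
	struct page *slot = &a;

	replace_slot(&slot, &b);
	assert(slot == &b && a.refcount == 0 && b.refcount == 2);
	return 0;
}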
@@ -837,32 +859,33 @@ nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
837 file->f_ra = ra->p_ra; 859 file->f_ra = ra->p_ra;
838 860
839 if (file->f_op->sendfile && rqstp->rq_sendfile_ok) { 861 if (file->f_op->sendfile && rqstp->rq_sendfile_ok) {
840 svc_pushback_unused_pages(rqstp); 862 rqstp->rq_resused = 1;
841 err = file->f_op->sendfile(file, &offset, *count, 863 host_err = file->f_op->sendfile(file, &offset, *count,
842 nfsd_read_actor, rqstp); 864 nfsd_read_actor, rqstp);
843 } else { 865 } else {
844 oldfs = get_fs(); 866 oldfs = get_fs();
845 set_fs(KERNEL_DS); 867 set_fs(KERNEL_DS);
846 err = vfs_readv(file, (struct iovec __user *)vec, vlen, &offset); 868 host_err = vfs_readv(file, (struct iovec __user *)vec, vlen, &offset);
847 set_fs(oldfs); 869 set_fs(oldfs);
848 } 870 }
849 871
850 /* Write back readahead params */ 872 /* Write back readahead params */
851 if (ra) { 873 if (ra) {
852 spin_lock(&ra_lock); 874 struct raparm_hbucket *rab = &raparm_hash[ra->p_hindex];
875 spin_lock(&rab->pb_lock);
853 ra->p_ra = file->f_ra; 876 ra->p_ra = file->f_ra;
854 ra->p_set = 1; 877 ra->p_set = 1;
855 ra->p_count--; 878 ra->p_count--;
856 spin_unlock(&ra_lock); 879 spin_unlock(&rab->pb_lock);
857 } 880 }
858 881
859 if (err >= 0) { 882 if (host_err >= 0) {
860 nfsdstats.io_read += err; 883 nfsdstats.io_read += host_err;
861 *count = err; 884 *count = host_err;
862 err = 0; 885 err = 0;
863 fsnotify_access(file->f_dentry); 886 fsnotify_access(file->f_dentry);
864 } else 887 } else
865 err = nfserrno(err); 888 err = nfserrno(host_err);
866out: 889out:
867 return err; 890 return err;
868} 891}
@@ -877,7 +900,7 @@ static void kill_suid(struct dentry *dentry)
877 mutex_unlock(&dentry->d_inode->i_mutex); 900 mutex_unlock(&dentry->d_inode->i_mutex);
878} 901}
879 902
880static int 903static __be32
881nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, 904nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
882 loff_t offset, struct kvec *vec, int vlen, 905 loff_t offset, struct kvec *vec, int vlen,
883 unsigned long cnt, int *stablep) 906 unsigned long cnt, int *stablep)
@@ -886,7 +909,8 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
886 struct dentry *dentry; 909 struct dentry *dentry;
887 struct inode *inode; 910 struct inode *inode;
888 mm_segment_t oldfs; 911 mm_segment_t oldfs;
889 int err = 0; 912 __be32 err = 0;
913 int host_err;
890 int stable = *stablep; 914 int stable = *stablep;
891 915
892#ifdef MSNFS 916#ifdef MSNFS
@@ -922,18 +946,18 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
922 946
923 /* Write the data. */ 947 /* Write the data. */
924 oldfs = get_fs(); set_fs(KERNEL_DS); 948 oldfs = get_fs(); set_fs(KERNEL_DS);
925 err = vfs_writev(file, (struct iovec __user *)vec, vlen, &offset); 949 host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &offset);
926 set_fs(oldfs); 950 set_fs(oldfs);
927 if (err >= 0) { 951 if (host_err >= 0) {
928 nfsdstats.io_write += cnt; 952 nfsdstats.io_write += cnt;
929 fsnotify_modify(file->f_dentry); 953 fsnotify_modify(file->f_dentry);
930 } 954 }
931 955
932 /* clear setuid/setgid flag after write */ 956 /* clear setuid/setgid flag after write */
933 if (err >= 0 && (inode->i_mode & (S_ISUID | S_ISGID))) 957 if (host_err >= 0 && (inode->i_mode & (S_ISUID | S_ISGID)))
934 kill_suid(dentry); 958 kill_suid(dentry);
935 959
936 if (err >= 0 && stable) { 960 if (host_err >= 0 && stable) {
937 static ino_t last_ino; 961 static ino_t last_ino;
938 static dev_t last_dev; 962 static dev_t last_dev;
939 963
@@ -959,7 +983,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
959 983
960 if (inode->i_state & I_DIRTY) { 984 if (inode->i_state & I_DIRTY) {
961 dprintk("nfsd: write sync %d\n", current->pid); 985 dprintk("nfsd: write sync %d\n", current->pid);
962 err=nfsd_sync(file); 986 host_err=nfsd_sync(file);
963 } 987 }
964#if 0 988#if 0
965 wake_up(&inode->i_wait); 989 wake_up(&inode->i_wait);
@@ -969,11 +993,11 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
969 last_dev = inode->i_sb->s_dev; 993 last_dev = inode->i_sb->s_dev;
970 } 994 }
971 995
972 dprintk("nfsd: write complete err=%d\n", err); 996 dprintk("nfsd: write complete host_err=%d\n", host_err);
973 if (err >= 0) 997 if (host_err >= 0)
974 err = 0; 998 err = 0;
975 else 999 else
976 err = nfserrno(err); 1000 err = nfserrno(host_err);
977out: 1001out:
978 return err; 1002 return err;
979} 1003}
@@ -983,12 +1007,12 @@ out:
983 * on entry. On return, *count contains the number of bytes actually read. 1007 * on entry. On return, *count contains the number of bytes actually read.
984 * N.B. After this call fhp needs an fh_put 1008 * N.B. After this call fhp needs an fh_put
985 */ 1009 */
986int 1010__be32
987nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, 1011nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
988 loff_t offset, struct kvec *vec, int vlen, 1012 loff_t offset, struct kvec *vec, int vlen,
989 unsigned long *count) 1013 unsigned long *count)
990{ 1014{
991 int err; 1015 __be32 err;
992 1016
993 if (file) { 1017 if (file) {
994 err = nfsd_permission(fhp->fh_export, fhp->fh_dentry, 1018 err = nfsd_permission(fhp->fh_export, fhp->fh_dentry,
@@ -1012,12 +1036,12 @@ out:
1012 * The stable flag requests synchronous writes. 1036 * The stable flag requests synchronous writes.
1013 * N.B. After this call fhp needs an fh_put 1037 * N.B. After this call fhp needs an fh_put
1014 */ 1038 */
1015int 1039__be32
1016nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, 1040nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
1017 loff_t offset, struct kvec *vec, int vlen, unsigned long cnt, 1041 loff_t offset, struct kvec *vec, int vlen, unsigned long cnt,
1018 int *stablep) 1042 int *stablep)
1019{ 1043{
1020 int err = 0; 1044 __be32 err = 0;
1021 1045
1022 if (file) { 1046 if (file) {
1023 err = nfsd_permission(fhp->fh_export, fhp->fh_dentry, 1047 err = nfsd_permission(fhp->fh_export, fhp->fh_dentry,
@@ -1049,12 +1073,12 @@ out:
1049 * Unfortunately we cannot lock the file to make sure we return full WCC 1073 * Unfortunately we cannot lock the file to make sure we return full WCC
1050 * data to the client, as locking happens lower down in the filesystem. 1074 * data to the client, as locking happens lower down in the filesystem.
1051 */ 1075 */
1052int 1076__be32
1053nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, 1077nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp,
1054 loff_t offset, unsigned long count) 1078 loff_t offset, unsigned long count)
1055{ 1079{
1056 struct file *file; 1080 struct file *file;
1057 int err; 1081 __be32 err;
1058 1082
1059 if ((u64)count > ~(u64)offset) 1083 if ((u64)count > ~(u64)offset)
1060 return nfserr_inval; 1084 return nfserr_inval;
@@ -1082,14 +1106,15 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp,
1082 * 1106 *
1083 * N.B. Every call to nfsd_create needs an fh_put for _both_ fhp and resfhp 1107 * N.B. Every call to nfsd_create needs an fh_put for _both_ fhp and resfhp
1084 */ 1108 */
1085int 1109__be32
1086nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, 1110nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
1087 char *fname, int flen, struct iattr *iap, 1111 char *fname, int flen, struct iattr *iap,
1088 int type, dev_t rdev, struct svc_fh *resfhp) 1112 int type, dev_t rdev, struct svc_fh *resfhp)
1089{ 1113{
1090 struct dentry *dentry, *dchild = NULL; 1114 struct dentry *dentry, *dchild = NULL;
1091 struct inode *dirp; 1115 struct inode *dirp;
1092 int err; 1116 __be32 err;
1117 int host_err;
1093 1118
1094 err = nfserr_perm; 1119 err = nfserr_perm;
1095 if (!flen) 1120 if (!flen)
@@ -1116,7 +1141,7 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
1116 /* called from nfsd_proc_mkdir, or possibly nfsd3_proc_create */ 1141 /* called from nfsd_proc_mkdir, or possibly nfsd3_proc_create */
1117 fh_lock_nested(fhp, I_MUTEX_PARENT); 1142 fh_lock_nested(fhp, I_MUTEX_PARENT);
1118 dchild = lookup_one_len(fname, dentry, flen); 1143 dchild = lookup_one_len(fname, dentry, flen);
1119 err = PTR_ERR(dchild); 1144 host_err = PTR_ERR(dchild);
1120 if (IS_ERR(dchild)) 1145 if (IS_ERR(dchild))
1121 goto out_nfserr; 1146 goto out_nfserr;
1122 err = fh_compose(resfhp, fhp->fh_export, dchild, fhp); 1147 err = fh_compose(resfhp, fhp->fh_export, dchild, fhp);
@@ -1155,22 +1180,22 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
1155 err = nfserr_perm; 1180 err = nfserr_perm;
1156 switch (type) { 1181 switch (type) {
1157 case S_IFREG: 1182 case S_IFREG:
1158 err = vfs_create(dirp, dchild, iap->ia_mode, NULL); 1183 host_err = vfs_create(dirp, dchild, iap->ia_mode, NULL);
1159 break; 1184 break;
1160 case S_IFDIR: 1185 case S_IFDIR:
1161 err = vfs_mkdir(dirp, dchild, iap->ia_mode); 1186 host_err = vfs_mkdir(dirp, dchild, iap->ia_mode);
1162 break; 1187 break;
1163 case S_IFCHR: 1188 case S_IFCHR:
1164 case S_IFBLK: 1189 case S_IFBLK:
1165 case S_IFIFO: 1190 case S_IFIFO:
1166 case S_IFSOCK: 1191 case S_IFSOCK:
1167 err = vfs_mknod(dirp, dchild, iap->ia_mode, rdev); 1192 host_err = vfs_mknod(dirp, dchild, iap->ia_mode, rdev);
1168 break; 1193 break;
1169 default: 1194 default:
1170 printk("nfsd: bad file type %o in nfsd_create\n", type); 1195 printk("nfsd: bad file type %o in nfsd_create\n", type);
1171 err = -EINVAL; 1196 host_err = -EINVAL;
1172 } 1197 }
1173 if (err < 0) 1198 if (host_err < 0)
1174 goto out_nfserr; 1199 goto out_nfserr;
1175 1200
1176 if (EX_ISSYNC(fhp->fh_export)) { 1201 if (EX_ISSYNC(fhp->fh_export)) {
@@ -1185,7 +1210,7 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
1185 * directories via NFS. 1210 * directories via NFS.
1186 */ 1211 */
1187 if ((iap->ia_valid &= ~(ATTR_UID|ATTR_GID|ATTR_MODE)) != 0) { 1212 if ((iap->ia_valid &= ~(ATTR_UID|ATTR_GID|ATTR_MODE)) != 0) {
1188 int err2 = nfsd_setattr(rqstp, resfhp, iap, 0, (time_t)0); 1213 __be32 err2 = nfsd_setattr(rqstp, resfhp, iap, 0, (time_t)0);
1189 if (err2) 1214 if (err2)
1190 err = err2; 1215 err = err2;
1191 } 1216 }
@@ -1200,7 +1225,7 @@ out:
1200 return err; 1225 return err;
1201 1226
1202out_nfserr: 1227out_nfserr:
1203 err = nfserrno(err); 1228 err = nfserrno(host_err);
1204 goto out; 1229 goto out;
1205} 1230}
1206 1231
@@ -1208,7 +1233,7 @@ out_nfserr:
1208/* 1233/*
1209 * NFSv3 version of nfsd_create 1234 * NFSv3 version of nfsd_create
1210 */ 1235 */
1211int 1236__be32
1212nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp, 1237nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
1213 char *fname, int flen, struct iattr *iap, 1238 char *fname, int flen, struct iattr *iap,
1214 struct svc_fh *resfhp, int createmode, u32 *verifier, 1239 struct svc_fh *resfhp, int createmode, u32 *verifier,
@@ -1216,7 +1241,8 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
1216{ 1241{
1217 struct dentry *dentry, *dchild = NULL; 1242 struct dentry *dentry, *dchild = NULL;
1218 struct inode *dirp; 1243 struct inode *dirp;
1219 int err; 1244 __be32 err;
1245 int host_err;
1220 __u32 v_mtime=0, v_atime=0; 1246 __u32 v_mtime=0, v_atime=0;
1221 int v_mode=0; 1247 int v_mode=0;
1222 1248
@@ -1246,7 +1272,7 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
1246 * Compose the response file handle. 1272 * Compose the response file handle.
1247 */ 1273 */
1248 dchild = lookup_one_len(fname, dentry, flen); 1274 dchild = lookup_one_len(fname, dentry, flen);
1249 err = PTR_ERR(dchild); 1275 host_err = PTR_ERR(dchild);
1250 if (IS_ERR(dchild)) 1276 if (IS_ERR(dchild))
1251 goto out_nfserr; 1277 goto out_nfserr;
1252 1278
@@ -1302,8 +1328,8 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
1302 goto out; 1328 goto out;
1303 } 1329 }
1304 1330
1305 err = vfs_create(dirp, dchild, iap->ia_mode, NULL); 1331 host_err = vfs_create(dirp, dchild, iap->ia_mode, NULL);
1306 if (err < 0) 1332 if (host_err < 0)
1307 goto out_nfserr; 1333 goto out_nfserr;
1308 1334
1309 if (EX_ISSYNC(fhp->fh_export)) { 1335 if (EX_ISSYNC(fhp->fh_export)) {
@@ -1332,7 +1358,7 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
1332 */ 1358 */
1333 set_attr: 1359 set_attr:
1334 if ((iap->ia_valid &= ~(ATTR_UID|ATTR_GID)) != 0) { 1360 if ((iap->ia_valid &= ~(ATTR_UID|ATTR_GID)) != 0) {
1335 int err2 = nfsd_setattr(rqstp, resfhp, iap, 0, (time_t)0); 1361 __be32 err2 = nfsd_setattr(rqstp, resfhp, iap, 0, (time_t)0);
1336 if (err2) 1362 if (err2)
1337 err = err2; 1363 err = err2;
1338 } 1364 }
@@ -1350,7 +1376,7 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
1350 return err; 1376 return err;
1351 1377
1352 out_nfserr: 1378 out_nfserr:
1353 err = nfserrno(err); 1379 err = nfserrno(host_err);
1354 goto out; 1380 goto out;
1355} 1381}
1356#endif /* CONFIG_NFSD_V3 */ 1382#endif /* CONFIG_NFSD_V3 */
@@ -1360,13 +1386,14 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
1360 * fits into the buffer. On return, it contains the true length. 1386 * fits into the buffer. On return, it contains the true length.
1361 * N.B. After this call fhp needs an fh_put 1387 * N.B. After this call fhp needs an fh_put
1362 */ 1388 */
1363int 1389__be32
1364nfsd_readlink(struct svc_rqst *rqstp, struct svc_fh *fhp, char *buf, int *lenp) 1390nfsd_readlink(struct svc_rqst *rqstp, struct svc_fh *fhp, char *buf, int *lenp)
1365{ 1391{
1366 struct dentry *dentry; 1392 struct dentry *dentry;
1367 struct inode *inode; 1393 struct inode *inode;
1368 mm_segment_t oldfs; 1394 mm_segment_t oldfs;
1369 int err; 1395 __be32 err;
1396 int host_err;
1370 1397
1371 err = fh_verify(rqstp, fhp, S_IFLNK, MAY_NOP); 1398 err = fh_verify(rqstp, fhp, S_IFLNK, MAY_NOP);
1372 if (err) 1399 if (err)
@@ -1385,18 +1412,18 @@ nfsd_readlink(struct svc_rqst *rqstp, struct svc_fh *fhp, char *buf, int *lenp)
1385 */ 1412 */
1386 1413
1387 oldfs = get_fs(); set_fs(KERNEL_DS); 1414 oldfs = get_fs(); set_fs(KERNEL_DS);
1388 err = inode->i_op->readlink(dentry, buf, *lenp); 1415 host_err = inode->i_op->readlink(dentry, buf, *lenp);
1389 set_fs(oldfs); 1416 set_fs(oldfs);
1390 1417
1391 if (err < 0) 1418 if (host_err < 0)
1392 goto out_nfserr; 1419 goto out_nfserr;
1393 *lenp = err; 1420 *lenp = host_err;
1394 err = 0; 1421 err = 0;
1395out: 1422out:
1396 return err; 1423 return err;
1397 1424
1398out_nfserr: 1425out_nfserr:
1399 err = nfserrno(err); 1426 err = nfserrno(host_err);
1400 goto out; 1427 goto out;
1401} 1428}
1402 1429
@@ -1404,7 +1431,7 @@ out_nfserr:
1404 * Create a symlink and look up its inode 1431 * Create a symlink and look up its inode
1405 * N.B. After this call _both_ fhp and resfhp need an fh_put 1432 * N.B. After this call _both_ fhp and resfhp need an fh_put
1406 */ 1433 */
1407int 1434__be32
1408nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp, 1435nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp,
1409 char *fname, int flen, 1436 char *fname, int flen,
1410 char *path, int plen, 1437 char *path, int plen,
@@ -1412,7 +1439,8 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp,
1412 struct iattr *iap) 1439 struct iattr *iap)
1413{ 1440{
1414 struct dentry *dentry, *dnew; 1441 struct dentry *dentry, *dnew;
1415 int err, cerr; 1442 __be32 err, cerr;
1443 int host_err;
1416 umode_t mode; 1444 umode_t mode;
1417 1445
1418 err = nfserr_noent; 1446 err = nfserr_noent;
@@ -1428,7 +1456,7 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp,
1428 fh_lock(fhp); 1456 fh_lock(fhp);
1429 dentry = fhp->fh_dentry; 1457 dentry = fhp->fh_dentry;
1430 dnew = lookup_one_len(fname, dentry, flen); 1458 dnew = lookup_one_len(fname, dentry, flen);
1431 err = PTR_ERR(dnew); 1459 host_err = PTR_ERR(dnew);
1432 if (IS_ERR(dnew)) 1460 if (IS_ERR(dnew))
1433 goto out_nfserr; 1461 goto out_nfserr;
1434 1462
@@ -1440,21 +1468,21 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp,
1440 if (unlikely(path[plen] != 0)) { 1468 if (unlikely(path[plen] != 0)) {
1441 char *path_alloced = kmalloc(plen+1, GFP_KERNEL); 1469 char *path_alloced = kmalloc(plen+1, GFP_KERNEL);
1442 if (path_alloced == NULL) 1470 if (path_alloced == NULL)
1443 err = -ENOMEM; 1471 host_err = -ENOMEM;
1444 else { 1472 else {
1445 strncpy(path_alloced, path, plen); 1473 strncpy(path_alloced, path, plen);
1446 path_alloced[plen] = 0; 1474 path_alloced[plen] = 0;
1447 err = vfs_symlink(dentry->d_inode, dnew, path_alloced, mode); 1475 host_err = vfs_symlink(dentry->d_inode, dnew, path_alloced, mode);
1448 kfree(path_alloced); 1476 kfree(path_alloced);
1449 } 1477 }
1450 } else 1478 } else
1451 err = vfs_symlink(dentry->d_inode, dnew, path, mode); 1479 host_err = vfs_symlink(dentry->d_inode, dnew, path, mode);
1452 1480
1453 if (!err) 1481 if (!host_err) {
1454 if (EX_ISSYNC(fhp->fh_export)) 1482 if (EX_ISSYNC(fhp->fh_export))
1455 err = nfsd_sync_dir(dentry); 1483 host_err = nfsd_sync_dir(dentry);
1456 if (err) 1484 }
1457 err = nfserrno(err); 1485 err = nfserrno(host_err);
1458 fh_unlock(fhp); 1486 fh_unlock(fhp);
1459 1487
1460 cerr = fh_compose(resfhp, fhp->fh_export, dnew, fhp); 1488 cerr = fh_compose(resfhp, fhp->fh_export, dnew, fhp);
@@ -1464,7 +1492,7 @@ out:
1464 return err; 1492 return err;
1465 1493
1466out_nfserr: 1494out_nfserr:
1467 err = nfserrno(err); 1495 err = nfserrno(host_err);
1468 goto out; 1496 goto out;
1469} 1497}
1470 1498
@@ -1472,13 +1500,14 @@ out_nfserr:
1472 * Create a hardlink 1500 * Create a hardlink
1473 * N.B. After this call _both_ ffhp and tfhp need an fh_put 1501 * N.B. After this call _both_ ffhp and tfhp need an fh_put
1474 */ 1502 */
1475int 1503__be32
1476nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, 1504nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
1477 char *name, int len, struct svc_fh *tfhp) 1505 char *name, int len, struct svc_fh *tfhp)
1478{ 1506{
1479 struct dentry *ddir, *dnew, *dold; 1507 struct dentry *ddir, *dnew, *dold;
1480 struct inode *dirp, *dest; 1508 struct inode *dirp, *dest;
1481 int err; 1509 __be32 err;
1510 int host_err;
1482 1511
1483 err = fh_verify(rqstp, ffhp, S_IFDIR, MAY_CREATE); 1512 err = fh_verify(rqstp, ffhp, S_IFDIR, MAY_CREATE);
1484 if (err) 1513 if (err)
@@ -1499,24 +1528,25 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
1499 dirp = ddir->d_inode; 1528 dirp = ddir->d_inode;
1500 1529
1501 dnew = lookup_one_len(name, ddir, len); 1530 dnew = lookup_one_len(name, ddir, len);
1502 err = PTR_ERR(dnew); 1531 host_err = PTR_ERR(dnew);
1503 if (IS_ERR(dnew)) 1532 if (IS_ERR(dnew))
1504 goto out_nfserr; 1533 goto out_nfserr;
1505 1534
1506 dold = tfhp->fh_dentry; 1535 dold = tfhp->fh_dentry;
1507 dest = dold->d_inode; 1536 dest = dold->d_inode;
1508 1537
1509 err = vfs_link(dold, dirp, dnew); 1538 host_err = vfs_link(dold, dirp, dnew);
1510 if (!err) { 1539 if (!host_err) {
1511 if (EX_ISSYNC(ffhp->fh_export)) { 1540 if (EX_ISSYNC(ffhp->fh_export)) {
1512 err = nfserrno(nfsd_sync_dir(ddir)); 1541 err = nfserrno(nfsd_sync_dir(ddir));
1513 write_inode_now(dest, 1); 1542 write_inode_now(dest, 1);
1514 } 1543 }
1544 err = 0;
1515 } else { 1545 } else {
1516 if (err == -EXDEV && rqstp->rq_vers == 2) 1546 if (host_err == -EXDEV && rqstp->rq_vers == 2)
1517 err = nfserr_acces; 1547 err = nfserr_acces;
1518 else 1548 else
1519 err = nfserrno(err); 1549 err = nfserrno(host_err);
1520 } 1550 }
1521 1551
1522 dput(dnew); 1552 dput(dnew);
@@ -1526,7 +1556,7 @@ out:
1526 return err; 1556 return err;
1527 1557
1528out_nfserr: 1558out_nfserr:
1529 err = nfserrno(err); 1559 err = nfserrno(host_err);
1530 goto out_unlock; 1560 goto out_unlock;
1531} 1561}
1532 1562
@@ -1534,13 +1564,14 @@ out_nfserr:
1534 * Rename a file 1564 * Rename a file
1535 * N.B. After this call _both_ ffhp and tfhp need an fh_put 1565 * N.B. After this call _both_ ffhp and tfhp need an fh_put
1536 */ 1566 */
1537int 1567__be32
1538nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, 1568nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
1539 struct svc_fh *tfhp, char *tname, int tlen) 1569 struct svc_fh *tfhp, char *tname, int tlen)
1540{ 1570{
1541 struct dentry *fdentry, *tdentry, *odentry, *ndentry, *trap; 1571 struct dentry *fdentry, *tdentry, *odentry, *ndentry, *trap;
1542 struct inode *fdir, *tdir; 1572 struct inode *fdir, *tdir;
1543 int err; 1573 __be32 err;
1574 int host_err;
1544 1575
1545 err = fh_verify(rqstp, ffhp, S_IFDIR, MAY_REMOVE); 1576 err = fh_verify(rqstp, ffhp, S_IFDIR, MAY_REMOVE);
1546 if (err) 1577 if (err)
@@ -1571,22 +1602,22 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
1571 fill_pre_wcc(tfhp); 1602 fill_pre_wcc(tfhp);
1572 1603
1573 odentry = lookup_one_len(fname, fdentry, flen); 1604 odentry = lookup_one_len(fname, fdentry, flen);
1574 err = PTR_ERR(odentry); 1605 host_err = PTR_ERR(odentry);
1575 if (IS_ERR(odentry)) 1606 if (IS_ERR(odentry))
1576 goto out_nfserr; 1607 goto out_nfserr;
1577 1608
1578 err = -ENOENT; 1609 host_err = -ENOENT;
1579 if (!odentry->d_inode) 1610 if (!odentry->d_inode)
1580 goto out_dput_old; 1611 goto out_dput_old;
1581 err = -EINVAL; 1612 host_err = -EINVAL;
1582 if (odentry == trap) 1613 if (odentry == trap)
1583 goto out_dput_old; 1614 goto out_dput_old;
1584 1615
1585 ndentry = lookup_one_len(tname, tdentry, tlen); 1616 ndentry = lookup_one_len(tname, tdentry, tlen);
1586 err = PTR_ERR(ndentry); 1617 host_err = PTR_ERR(ndentry);
1587 if (IS_ERR(ndentry)) 1618 if (IS_ERR(ndentry))
1588 goto out_dput_old; 1619 goto out_dput_old;
1589 err = -ENOTEMPTY; 1620 host_err = -ENOTEMPTY;
1590 if (ndentry == trap) 1621 if (ndentry == trap)
1591 goto out_dput_new; 1622 goto out_dput_new;
1592 1623
@@ -1594,14 +1625,14 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
1594 if ((ffhp->fh_export->ex_flags & NFSEXP_MSNFS) && 1625 if ((ffhp->fh_export->ex_flags & NFSEXP_MSNFS) &&
1595 ((atomic_read(&odentry->d_count) > 1) 1626 ((atomic_read(&odentry->d_count) > 1)
1596 || (atomic_read(&ndentry->d_count) > 1))) { 1627 || (atomic_read(&ndentry->d_count) > 1))) {
1597 err = -EPERM; 1628 host_err = -EPERM;
1598 } else 1629 } else
1599#endif 1630#endif
1600 err = vfs_rename(fdir, odentry, tdir, ndentry); 1631 host_err = vfs_rename(fdir, odentry, tdir, ndentry);
1601 if (!err && EX_ISSYNC(tfhp->fh_export)) { 1632 if (!host_err && EX_ISSYNC(tfhp->fh_export)) {
1602 err = nfsd_sync_dir(tdentry); 1633 host_err = nfsd_sync_dir(tdentry);
1603 if (!err) 1634 if (!host_err)
1604 err = nfsd_sync_dir(fdentry); 1635 host_err = nfsd_sync_dir(fdentry);
1605 } 1636 }
1606 1637
1607 out_dput_new: 1638 out_dput_new:
@@ -1609,8 +1640,7 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
1609 out_dput_old: 1640 out_dput_old:
1610 dput(odentry); 1641 dput(odentry);
1611 out_nfserr: 1642 out_nfserr:
1612 if (err) 1643 err = nfserrno(host_err);
1613 err = nfserrno(err);
1614 1644
1615 /* we cannot rely on fh_unlock on the two filehandles, 1645
1616 * as that would do the wrong thing if the two directories 1646 * as that would do the wrong thing if the two directories
@@ -1629,13 +1659,14 @@ out:
1629 * Unlink a file or directory 1659 * Unlink a file or directory
1630 * N.B. After this call fhp needs an fh_put 1660 * N.B. After this call fhp needs an fh_put
1631 */ 1661 */
1632int 1662__be32
1633nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, 1663nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
1634 char *fname, int flen) 1664 char *fname, int flen)
1635{ 1665{
1636 struct dentry *dentry, *rdentry; 1666 struct dentry *dentry, *rdentry;
1637 struct inode *dirp; 1667 struct inode *dirp;
1638 int err; 1668 __be32 err;
1669 int host_err;
1639 1670
1640 err = nfserr_acces; 1671 err = nfserr_acces;
1641 if (!flen || isdotent(fname, flen)) 1672 if (!flen || isdotent(fname, flen))
@@ -1649,7 +1680,7 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
1649 dirp = dentry->d_inode; 1680 dirp = dentry->d_inode;
1650 1681
1651 rdentry = lookup_one_len(fname, dentry, flen); 1682 rdentry = lookup_one_len(fname, dentry, flen);
1652 err = PTR_ERR(rdentry); 1683 host_err = PTR_ERR(rdentry);
1653 if (IS_ERR(rdentry)) 1684 if (IS_ERR(rdentry))
1654 goto out_nfserr; 1685 goto out_nfserr;
1655 1686
@@ -1666,22 +1697,23 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
1666#ifdef MSNFS 1697#ifdef MSNFS
1667 if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) && 1698 if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) &&
1668 (atomic_read(&rdentry->d_count) > 1)) { 1699 (atomic_read(&rdentry->d_count) > 1)) {
1669 err = -EPERM; 1700 host_err = -EPERM;
1670 } else 1701 } else
1671#endif 1702#endif
1672 err = vfs_unlink(dirp, rdentry); 1703 host_err = vfs_unlink(dirp, rdentry);
1673 } else { /* It's RMDIR */ 1704 } else { /* It's RMDIR */
1674 err = vfs_rmdir(dirp, rdentry); 1705 host_err = vfs_rmdir(dirp, rdentry);
1675 } 1706 }
1676 1707
1677 dput(rdentry); 1708 dput(rdentry);
1678 1709
1679 if (err == 0 && 1710 if (host_err)
1680 EX_ISSYNC(fhp->fh_export)) 1711 goto out_nfserr;
1681 err = nfsd_sync_dir(dentry); 1712 if (EX_ISSYNC(fhp->fh_export))
1713 host_err = nfsd_sync_dir(dentry);
1682 1714
1683out_nfserr: 1715out_nfserr:
1684 err = nfserrno(err); 1716 err = nfserrno(host_err);
1685out: 1717out:
1686 return err; 1718 return err;
1687} 1719}
@@ -1690,11 +1722,12 @@ out:
1690 * Read entries from a directory. 1722 * Read entries from a directory.
1691 * The NFSv3/4 verifier we ignore for now. 1723 * The NFSv3/4 verifier we ignore for now.
1692 */ 1724 */
1693int 1725__be32
1694nfsd_readdir(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t *offsetp, 1726nfsd_readdir(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t *offsetp,
1695 struct readdir_cd *cdp, encode_dent_fn func) 1727 struct readdir_cd *cdp, encode_dent_fn func)
1696{ 1728{
1697 int err; 1729 __be32 err;
1730 int host_err;
1698 struct file *file; 1731 struct file *file;
1699 loff_t offset = *offsetp; 1732 loff_t offset = *offsetp;
1700 1733
@@ -1716,10 +1749,10 @@ nfsd_readdir(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t *offsetp,
1716 1749
1717 do { 1750 do {
1718 cdp->err = nfserr_eof; /* will be cleared on successful read */ 1751 cdp->err = nfserr_eof; /* will be cleared on successful read */
1719 err = vfs_readdir(file, (filldir_t) func, cdp); 1752 host_err = vfs_readdir(file, (filldir_t) func, cdp);
1720 } while (err >=0 && cdp->err == nfs_ok); 1753 } while (host_err >=0 && cdp->err == nfs_ok);
1721 if (err) 1754 if (host_err)
1722 err = nfserrno(err); 1755 err = nfserrno(host_err);
1723 else 1756 else
1724 err = cdp->err; 1757 err = cdp->err;
1725 *offsetp = vfs_llseek(file, 0, 1); 1758 *offsetp = vfs_llseek(file, 0, 1);
@@ -1736,10 +1769,10 @@ out:
1736 * Get file system stats 1769 * Get file system stats
1737 * N.B. After this call fhp needs an fh_put 1770 * N.B. After this call fhp needs an fh_put
1738 */ 1771 */
1739int 1772__be32
1740nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat) 1773nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat)
1741{ 1774{
1742 int err = fh_verify(rqstp, fhp, 0, MAY_NOP); 1775 __be32 err = fh_verify(rqstp, fhp, 0, MAY_NOP);
1743 if (!err && vfs_statfs(fhp->fh_dentry,stat)) 1776 if (!err && vfs_statfs(fhp->fh_dentry,stat))
1744 err = nfserr_io; 1777 err = nfserr_io;
1745 return err; 1778 return err;
@@ -1748,7 +1781,7 @@ nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat)
1748/* 1781/*
1749 * Check for a user's access permissions to this inode. 1782 * Check for a user's access permissions to this inode.
1750 */ 1783 */
1751int 1784__be32
1752nfsd_permission(struct svc_export *exp, struct dentry *dentry, int acc) 1785nfsd_permission(struct svc_export *exp, struct dentry *dentry, int acc)
1753{ 1786{
1754 struct inode *inode = dentry->d_inode; 1787 struct inode *inode = dentry->d_inode;
@@ -1829,11 +1862,11 @@ nfsd_permission(struct svc_export *exp, struct dentry *dentry, int acc)
1829void 1862void
1830nfsd_racache_shutdown(void) 1863nfsd_racache_shutdown(void)
1831{ 1864{
1832 if (!raparm_cache) 1865 if (!raparml)
1833 return; 1866 return;
1834 dprintk("nfsd: freeing readahead buffers.\n"); 1867 dprintk("nfsd: freeing readahead buffers.\n");
1835 kfree(raparml); 1868 kfree(raparml);
1836 raparm_cache = raparml = NULL; 1869 raparml = NULL;
1837} 1870}
1838/* 1871/*
1839 * Initialize readahead param cache 1872 * Initialize readahead param cache
@@ -1842,19 +1875,31 @@ int
1842nfsd_racache_init(int cache_size) 1875nfsd_racache_init(int cache_size)
1843{ 1876{
1844 int i; 1877 int i;
1878 int j = 0;
1879 int nperbucket;
1880
1845 1881
1846 if (raparm_cache) 1882 if (raparml)
1847 return 0; 1883 return 0;
1884 if (cache_size < 2*RAPARM_HASH_SIZE)
1885 cache_size = 2*RAPARM_HASH_SIZE;
1848 raparml = kmalloc(sizeof(struct raparms) * cache_size, GFP_KERNEL); 1886 raparml = kmalloc(sizeof(struct raparms) * cache_size, GFP_KERNEL);
1849 1887
1850 if (raparml != NULL) { 1888 if (raparml != NULL) {
1851 dprintk("nfsd: allocating %d readahead buffers.\n", 1889 dprintk("nfsd: allocating %d readahead buffers.\n",
1852 cache_size); 1890 cache_size);
1891 for (i = 0 ; i < RAPARM_HASH_SIZE ; i++) {
1892 raparm_hash[i].pb_head = NULL;
1893 spin_lock_init(&raparm_hash[i].pb_lock);
1894 }
1895 nperbucket = cache_size >> RAPARM_HASH_BITS;
1853 memset(raparml, 0, sizeof(struct raparms) * cache_size); 1896 memset(raparml, 0, sizeof(struct raparms) * cache_size);
1854 for (i = 0; i < cache_size - 1; i++) { 1897 for (i = 0; i < cache_size - 1; i++) {
1855 raparml[i].p_next = raparml + i + 1; 1898 if (i % nperbucket == 0)
1899 raparm_hash[j++].pb_head = raparml + i;
1900 if (i % nperbucket < nperbucket-1)
1901 raparml[i].p_next = raparml + i + 1;
1856 } 1902 }
1857 raparm_cache = raparml;
1858 } else { 1903 } else {
1859 printk(KERN_WARNING 1904 printk(KERN_WARNING
1860 "nfsd: Could not allocate memory read-ahead cache.\n"); 1905 "nfsd: Could not allocate memory read-ahead cache.\n");
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index e1fceb8aa32d..d11753c50bc1 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -152,14 +152,16 @@ static struct o2nm_node *o2nm_node_ip_tree_lookup(struct o2nm_cluster *cluster,
152 struct o2nm_node *node, *ret = NULL; 152 struct o2nm_node *node, *ret = NULL;
153 153
154 while (*p) { 154 while (*p) {
155 int cmp;
156
155 parent = *p; 157 parent = *p;
156 node = rb_entry(parent, struct o2nm_node, nd_ip_node); 158 node = rb_entry(parent, struct o2nm_node, nd_ip_node);
157 159
158 if (memcmp(&ip_needle, &node->nd_ipv4_address, 160 cmp = memcmp(&ip_needle, &node->nd_ipv4_address,
159 sizeof(ip_needle)) < 0) 161 sizeof(ip_needle));
162 if (cmp < 0)
160 p = &(*p)->rb_left; 163 p = &(*p)->rb_left;
161 else if (memcmp(&ip_needle, &node->nd_ipv4_address, 164 else if (cmp > 0)
162 sizeof(ip_needle)) > 0)
163 p = &(*p)->rb_right; 165 p = &(*p)->rb_right;
164 else { 166 else {
165 ret = node; 167 ret = node;
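The o2nm change above is a common descent-loop cleanup: evaluate memcmp() once per node and branch on the saved result, instead of running the same comparison twice. The same shape for a generic binary search tree:

#include <stddef.h>
#include <string.h>

struct node {
	unsigned char key[4];
	struct node *left, *right;
};

static struct node *tree_find(struct node *n, const unsigned char needle[4])
{
	while (n) {
		int cmp = memcmp(needle, n->key, 4);    /* evaluated once */

		if (cmp < 0)
			n = n->left;
		else if (cmp > 0)
			n = n->right;
		else
			return n;
	}
	return NULL;
}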
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index d9ba0a931a03..1be74c4e7814 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -30,6 +30,7 @@
30#include <linux/highmem.h> 30#include <linux/highmem.h>
31#include <linux/pagemap.h> 31#include <linux/pagemap.h>
32#include <linux/uio.h> 32#include <linux/uio.h>
33#include <linux/sched.h>
33 34
34#define MLOG_MASK_PREFIX ML_INODE 35#define MLOG_MASK_PREFIX ML_INODE
35#include <cluster/masklog.h> 36#include <cluster/masklog.h>
@@ -691,6 +692,12 @@ static int ocfs2_zero_extend(struct inode *inode,
691 } 692 }
692 693
693 start_off += sb->s_blocksize; 694 start_off += sb->s_blocksize;
695
696 /*
697 * Very large extends have the potential to lock up
698 * the cpu for extended periods of time.
699 */
700 cond_resched();
694 } 701 }
695 702
696out: 703out:
@@ -728,31 +735,36 @@ static int ocfs2_extend_file(struct inode *inode,
728 clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size) - 735 clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size) -
729 OCFS2_I(inode)->ip_clusters; 736 OCFS2_I(inode)->ip_clusters;
730 737
731 if (clusters_to_add) { 738 /*
732 /* 739 * protect the pages that ocfs2_zero_extend is going to be
733 * protect the pages that ocfs2_zero_extend is going to 740 * pulling into the page cache.. we do this before the
734 * be pulling into the page cache.. we do this before the 741 * metadata extend so that we don't get into the situation
735 * metadata extend so that we don't get into the situation 742 * where we've extended the metadata but can't get the data
736 * where we've extended the metadata but can't get the data 743 * lock to zero.
737 * lock to zero. 744 */
738 */ 745 ret = ocfs2_data_lock(inode, 1);
739 ret = ocfs2_data_lock(inode, 1); 746 if (ret < 0) {
740 if (ret < 0) { 747 mlog_errno(ret);
741 mlog_errno(ret); 748 goto out;
742 goto out; 749 }
743 }
744 750
751 if (clusters_to_add) {
745 ret = ocfs2_extend_allocation(inode, clusters_to_add); 752 ret = ocfs2_extend_allocation(inode, clusters_to_add);
746 if (ret < 0) { 753 if (ret < 0) {
747 mlog_errno(ret); 754 mlog_errno(ret);
748 goto out_unlock; 755 goto out_unlock;
749 } 756 }
757 }
750 758
751 ret = ocfs2_zero_extend(inode, (u64)new_i_size - tail_to_skip); 759 /*
752 if (ret < 0) { 760 * Call this even if we don't add any clusters to the tree. We
753 mlog_errno(ret); 761 * still need to zero the area between the old i_size and the
754 goto out_unlock; 762 * new i_size.
755 } 763 */
764 ret = ocfs2_zero_extend(inode, (u64)new_i_size - tail_to_skip);
765 if (ret < 0) {
766 mlog_errno(ret);
767 goto out_unlock;
756 } 768 }
757 769
758 if (!tail_to_skip) { 770 if (!tail_to_skip) {
@@ -764,8 +776,7 @@ static int ocfs2_extend_file(struct inode *inode,
764 } 776 }
765 777
766out_unlock: 778out_unlock:
767 if (clusters_to_add) /* this is the only case in which we lock */ 779 ocfs2_data_unlock(inode, 1);
768 ocfs2_data_unlock(inode, 1);
769 780
770out: 781out:
771 return ret; 782 return ret;
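The ocfs2_extend_file() restructuring above replaces "lock only when clusters_to_add" with "lock always, unlock always", so the zero-extend between the old and new i_size is covered even when no clusters are added, and the error paths no longer have to track whether the lock is held. The shape, with hypothetical stand-in functions:

static int lock_data(void)              { return 0; }
static void unlock_data(void)           { }
static int extend_allocation(int n)     { (void)n; return 0; }
static int zero_extend(void)            { return 0; }

static int extend_file(int clusters_to_add)
{
	int ret;

	ret = lock_data();              /* unconditional */
	if (ret < 0)
		return ret;

	if (clusters_to_add) {          /* only the allocation is optional */
		ret = extend_allocation(clusters_to_add);
		if (ret < 0)
			goto out_unlock;
	}

	ret = zero_extend();            /* runs even with zero new clusters */

out_unlock:
	unlock_data();                  /* always pairs with the lock above */
	return ret;
}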
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 259155f0eb2e..a57b751d4f40 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -1085,14 +1085,6 @@ static int ocfs2_rename(struct inode *old_dir,
1085 BUG(); 1085 BUG();
1086 } 1086 }
1087 1087
1088 if (atomic_read(&old_dentry->d_count) > 2) {
1089 shrink_dcache_parent(old_dentry);
1090 if (atomic_read(&old_dentry->d_count) > 2) {
1091 status = -EBUSY;
1092 goto bail;
1093 }
1094 }
1095
1096 /* Assume a directory hierarchy thusly: 1088 /* Assume a directory hierarchy thusly:
1097 * a/b/c 1089 * a/b/c
1098 * a/d 1090 * a/d
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 4c29cd7cc8e6..76b46ebbb10c 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -339,7 +339,7 @@ static unsigned long long ocfs2_max_file_offset(unsigned int blockshift)
339 339
340#if BITS_PER_LONG == 32 340#if BITS_PER_LONG == 32
341# if defined(CONFIG_LBD) 341# if defined(CONFIG_LBD)
342 BUG_ON(sizeof(sector_t) != 8); 342 BUILD_BUG_ON(sizeof(sector_t) != 8);
343 pagefactor = PAGE_CACHE_SIZE; 343 pagefactor = PAGE_CACHE_SIZE;
344 bitshift = BITS_PER_LONG; 344 bitshift = BITS_PER_LONG;
345# else 345# else
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 51c6a748df49..6fb4b6150d77 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -376,18 +376,48 @@ static char *make_block_name(struct gendisk *disk)
376 return name; 376 return name;
377} 377}
378 378
379static void disk_sysfs_symlinks(struct gendisk *disk) 379static int disk_sysfs_symlinks(struct gendisk *disk)
380{ 380{
381 struct device *target = get_device(disk->driverfs_dev); 381 struct device *target = get_device(disk->driverfs_dev);
382 int err;
383 char *disk_name = NULL;
384
382 if (target) { 385 if (target) {
383 char *disk_name = make_block_name(disk); 386 disk_name = make_block_name(disk);
384 sysfs_create_link(&disk->kobj,&target->kobj,"device"); 387 if (!disk_name) {
385 if (disk_name) { 388 err = -ENOMEM;
386 sysfs_create_link(&target->kobj,&disk->kobj,disk_name); 389 goto err_out;
387 kfree(disk_name);
388 } 390 }
391
392 err = sysfs_create_link(&disk->kobj, &target->kobj, "device");
393 if (err)
394 goto err_out_disk_name;
395
396 err = sysfs_create_link(&target->kobj, &disk->kobj, disk_name);
397 if (err)
398 goto err_out_dev_link;
389 } 399 }
390 sysfs_create_link(&disk->kobj, &block_subsys.kset.kobj, "subsystem"); 400
401 err = sysfs_create_link(&disk->kobj, &block_subsys.kset.kobj,
402 "subsystem");
403 if (err)
404 goto err_out_disk_name_lnk;
405
406 kfree(disk_name);
407
408 return 0;
409
410err_out_disk_name_lnk:
411 if (target) {
412 sysfs_remove_link(&target->kobj, disk_name);
413err_out_dev_link:
414 sysfs_remove_link(&disk->kobj, "device");
415err_out_disk_name:
416 kfree(disk_name);
417err_out:
418 put_device(target);
419 }
420 return err;
391} 421}
392 422
393/* Not exported, helper to add_disk(). */ 423/* Not exported, helper to add_disk(). */
@@ -406,7 +436,11 @@ void register_disk(struct gendisk *disk)
406 *s = '!'; 436 *s = '!';
407 if ((err = kobject_add(&disk->kobj))) 437 if ((err = kobject_add(&disk->kobj)))
408 return; 438 return;
409 disk_sysfs_symlinks(disk); 439 err = disk_sysfs_symlinks(disk);
440 if (err) {
441 kobject_del(&disk->kobj);
442 return;
443 }
410 disk_sysfs_add_subdirs(disk); 444 disk_sysfs_add_subdirs(disk);
411 445
412 /* No minors to use for partitions */ 446 /* No minors to use for partitions */
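disk_sysfs_symlinks() now reports failures instead of ignoring sysfs_create_link()'s return value, and register_disk() tears down the kobject on error. The unwind idiom it adopts, sketched with hypothetical step/undo pairs: each failure jumps to a label that rolls back exactly the steps already completed, in reverse order.

static int step_a(void)  { return 0; }
static int step_b(void)  { return 0; }
static int step_c(void)  { return 0; }
static void undo_a(void) { }
static void undo_b(void) { }

static int setup(void)
{
	int err;

	err = step_a();
	if (err)
		goto err_out;
	err = step_b();
	if (err)
		goto err_undo_a;
	err = step_c();
	if (err)
		goto err_undo_b;
	return 0;

err_undo_b:
	undo_b();
err_undo_a:
	undo_a();
err_out:
	return err;
}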
diff --git a/fs/partitions/msdos.c b/fs/partitions/msdos.c
index 4f8df71e49d3..8c7af1777819 100644
--- a/fs/partitions/msdos.c
+++ b/fs/partitions/msdos.c
@@ -32,13 +32,11 @@
32#include <asm/unaligned.h> 32#include <asm/unaligned.h>
33 33
34#define SYS_IND(p) (get_unaligned(&p->sys_ind)) 34#define SYS_IND(p) (get_unaligned(&p->sys_ind))
35#define NR_SECTS(p) ({ __typeof__(p->nr_sects) __a = \ 35#define NR_SECTS(p) ({ __le32 __a = get_unaligned(&p->nr_sects); \
36 get_unaligned(&p->nr_sects); \
37 le32_to_cpu(__a); \ 36 le32_to_cpu(__a); \
38 }) 37 })
39 38
40#define START_SECT(p) ({ __typeof__(p->start_sect) __a = \ 39#define START_SECT(p) ({ __le32 __a = get_unaligned(&p->start_sect); \
41 get_unaligned(&p->start_sect); \
42 le32_to_cpu(__a); \ 40 le32_to_cpu(__a); \
43 }) 41 })
44 42
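The msdos.c cleanup above collapses a __typeof__ dance into a single __le32 temporary: read the possibly unaligned on-disk field, then byte-swap it. A portable userspace equivalent of that read, with memcpy standing in for the kernel's get_unaligned() and an illustrative partition-entry layout:

#include <stdint.h>
#include <string.h>

static inline uint32_t get_unaligned_le32(const void *p)
{
	uint8_t b[4];

	memcpy(b, p, 4);        /* safe regardless of alignment */
	return (uint32_t)b[0] | (uint32_t)b[1] << 8 |
	       (uint32_t)b[2] << 16 | (uint32_t)b[3] << 24;
}

struct mbr_part {               /* hypothetical packed-layout stand-in */
	unsigned char start_sect[4];
	unsigned char nr_sects[4];
};

#define NR_SECTS(p)   get_unaligned_le32((p)->nr_sects)
#define START_SECT(p) get_unaligned_le32((p)->start_sect)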
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 82da55b5cffe..8df27401d292 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -72,6 +72,7 @@
72#include <linux/audit.h> 72#include <linux/audit.h>
73#include <linux/poll.h> 73#include <linux/poll.h>
74#include <linux/nsproxy.h> 74#include <linux/nsproxy.h>
75#include <linux/oom.h>
75#include "internal.h" 76#include "internal.h"
76 77
77/* NOTE: 78/* NOTE:
@@ -86,7 +87,7 @@
86 87
87 88
88/* Worst case buffer size needed for holding an integer. */ 89/* Worst case buffer size needed for holding an integer. */
89#define PROC_NUMBUF 10 90#define PROC_NUMBUF 13
90 91
91struct pid_entry { 92struct pid_entry {
92 int len; 93 int len;
@@ -689,7 +690,8 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
689 if (copy_from_user(buffer, buf, count)) 690 if (copy_from_user(buffer, buf, count))
690 return -EFAULT; 691 return -EFAULT;
691 oom_adjust = simple_strtol(buffer, &end, 0); 692 oom_adjust = simple_strtol(buffer, &end, 0);
692 if ((oom_adjust < -16 || oom_adjust > 15) && oom_adjust != OOM_DISABLE) 693 if ((oom_adjust < OOM_ADJUST_MIN || oom_adjust > OOM_ADJUST_MAX) &&
694 oom_adjust != OOM_DISABLE)
693 return -EINVAL; 695 return -EINVAL;
694 if (*end == '\n') 696 if (*end == '\n')
695 end++; 697 end++;
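The PROC_NUMBUF bump from 10 to 13 covers the true worst case for a signed 32-bit value: "-2147483648" is 11 characters, plus a trailing newline and the NUL terminator. A quick check of that arithmetic:

#include <limits.h>
#include <stdio.h>

#define PROC_NUMBUF 13

int main(void)
{
	char buf[PROC_NUMBUF];
	int n = snprintf(buf, sizeof(buf), "%d\n", INT_MIN);

	printf("needs %d bytes including NUL\n", n + 1);   /* prints 13 */
	return (n + 1 <= PROC_NUMBUF) ? 0 : 1;
}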
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 8d88e58ed5cc..93c43b676e59 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -647,7 +647,7 @@ static ssize_t write_sysrq_trigger(struct file *file, const char __user *buf,
647 647
648 if (get_user(c, buf)) 648 if (get_user(c, buf))
649 return -EFAULT; 649 return -EFAULT;
650 __handle_sysrq(c, NULL, NULL, 0); 650 __handle_sysrq(c, NULL, 0);
651 } 651 }
652 return count; 652 return count;
653} 653}
diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c
index 1bfae42117ca..e3d466a228d4 100644
--- a/fs/reiserfs/bitmap.c
+++ b/fs/reiserfs/bitmap.c
@@ -1304,8 +1304,8 @@ struct buffer_head *reiserfs_read_bitmap_block(struct super_block *sb,
1304 1304
1305 bh = sb_bread(sb, block); 1305 bh = sb_bread(sb, block);
1306 if (bh == NULL) 1306 if (bh == NULL)
1307 reiserfs_warning(sb, "sh-2029: %s: bitmap block (#%lu) " 1307 reiserfs_warning(sb, "sh-2029: %s: bitmap block (#%u) "
1308 "reading failed", __FUNCTION__, bh->b_blocknr); 1308 "reading failed", __FUNCTION__, block);
1309 else { 1309 else {
1310 if (buffer_locked(bh)) { 1310 if (buffer_locked(bh)) {
1311 PROC_INFO_INC(sb, scan_bitmap.wait); 1311 PROC_INFO_INC(sb, scan_bitmap.wait);
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index c093642fb983..b67ce9354048 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -2,7 +2,6 @@
2 * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README 2 * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
3 */ 3 */
4 4
5#include <linux/config.h>
6#include <linux/time.h> 5#include <linux/time.h>
7#include <linux/reiserfs_fs.h> 6#include <linux/reiserfs_fs.h>
8#include <linux/reiserfs_acl.h> 7#include <linux/reiserfs_acl.h>
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 7e5a2f5ebeb0..9c69bcacad22 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -1780,7 +1780,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
1780 err = -EDQUOT; 1780 err = -EDQUOT;
1781 goto out_end_trans; 1781 goto out_end_trans;
1782 } 1782 }
1783 if (!dir || !dir->i_nlink) { 1783 if (!dir->i_nlink) {
1784 err = -EPERM; 1784 err = -EPERM;
1785 goto out_bad_inode; 1785 goto out_bad_inode;
1786 } 1786 }
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index ad8cbc49883a..85ce23268302 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -53,6 +53,7 @@
53#include <linux/workqueue.h> 53#include <linux/workqueue.h>
54#include <linux/writeback.h> 54#include <linux/writeback.h>
55#include <linux/blkdev.h> 55#include <linux/blkdev.h>
56#include <linux/backing-dev.h>
56 57
57/* gets a struct reiserfs_journal_list * from a list head */ 58/* gets a struct reiserfs_journal_list * from a list head */
58#define JOURNAL_LIST_ENTRY(h) (list_entry((h), struct reiserfs_journal_list, \ 59#define JOURNAL_LIST_ENTRY(h) (list_entry((h), struct reiserfs_journal_list, \
@@ -970,7 +971,7 @@ int reiserfs_async_progress_wait(struct super_block *s)
970 DEFINE_WAIT(wait); 971 DEFINE_WAIT(wait);
971 struct reiserfs_journal *j = SB_JOURNAL(s); 972 struct reiserfs_journal *j = SB_JOURNAL(s);
972 if (atomic_read(&j->j_async_throttle)) 973 if (atomic_read(&j->j_async_throttle))
973 blk_congestion_wait(WRITE, HZ / 10); 974 congestion_wait(WRITE, HZ / 10);
974 return 0; 975 return 0;
975} 976}
976 977
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index c89aa2338191..9041802df832 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -430,20 +430,29 @@ int remove_save_link(struct inode *inode, int truncate)
430 return journal_end(&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT); 430 return journal_end(&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT);
431} 431}
432 432
433static void reiserfs_put_super(struct super_block *s) 433static void reiserfs_kill_sb(struct super_block *s)
434{ 434{
435 struct reiserfs_transaction_handle th; 435 if (REISERFS_SB(s)) {
436 th.t_trans_id = 0; 436 if (REISERFS_SB(s)->xattr_root) {
437 d_invalidate(REISERFS_SB(s)->xattr_root);
438 dput(REISERFS_SB(s)->xattr_root);
439 REISERFS_SB(s)->xattr_root = NULL;
440 }
437 441
438 if (REISERFS_SB(s)->xattr_root) { 442 if (REISERFS_SB(s)->priv_root) {
439 d_invalidate(REISERFS_SB(s)->xattr_root); 443 d_invalidate(REISERFS_SB(s)->priv_root);
440 dput(REISERFS_SB(s)->xattr_root); 444 dput(REISERFS_SB(s)->priv_root);
445 REISERFS_SB(s)->priv_root = NULL;
446 }
441 } 447 }
442 448
443 if (REISERFS_SB(s)->priv_root) { 449 kill_block_super(s);
444 d_invalidate(REISERFS_SB(s)->priv_root); 450}
445 dput(REISERFS_SB(s)->priv_root); 451
446 } 452static void reiserfs_put_super(struct super_block *s)
453{
454 struct reiserfs_transaction_handle th;
455 th.t_trans_id = 0;
447 456
448 /* change file system state to current state if it was mounted with read-write permissions */ 457 /* change file system state to current state if it was mounted with read-write permissions */
449 if (!(s->s_flags & MS_RDONLY)) { 458 if (!(s->s_flags & MS_RDONLY)) {
@@ -2156,7 +2165,7 @@ struct file_system_type reiserfs_fs_type = {
2156 .owner = THIS_MODULE, 2165 .owner = THIS_MODULE,
2157 .name = "reiserfs", 2166 .name = "reiserfs",
2158 .get_sb = get_super_block, 2167 .get_sb = get_super_block,
2159 .kill_sb = kill_block_super, 2168 .kill_sb = reiserfs_kill_sb,
2160 .fs_flags = FS_REQUIRES_DEV, 2169 .fs_flags = FS_REQUIRES_DEV,
2161}; 2170};
2162 2171
diff --git a/fs/splice.c b/fs/splice.c
index 13e92dd19fbb..a567010b62ac 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -607,7 +607,7 @@ find_page:
607 ret = -ENOMEM; 607 ret = -ENOMEM;
608 page = page_cache_alloc_cold(mapping); 608 page = page_cache_alloc_cold(mapping);
609 if (unlikely(!page)) 609 if (unlikely(!page))
610 goto out_nomem; 610 goto out_ret;
611 611
612 /* 612 /*
613 * This will also lock the page 613 * This will also lock the page
@@ -666,7 +666,7 @@ find_page:
666 if (sd->pos + this_len > isize) 666 if (sd->pos + this_len > isize)
667 vmtruncate(mapping->host, isize); 667 vmtruncate(mapping->host, isize);
668 668
669 goto out; 669 goto out_ret;
670 } 670 }
671 671
672 if (buf->page != page) { 672 if (buf->page != page) {
@@ -698,7 +698,7 @@ find_page:
698out: 698out:
699 page_cache_release(page); 699 page_cache_release(page);
700 unlock_page(page); 700 unlock_page(page);
701out_nomem: 701out_ret:
702 return ret; 702 return ret;
703} 703}
704 704
diff --git a/fs/super.c b/fs/super.c
index aec99ddbe53f..47e554c12e76 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -260,17 +260,17 @@ int fsync_super(struct super_block *sb)
260 * that need destruction out of superblock, call generic_shutdown_super() 260 * that need destruction out of superblock, call generic_shutdown_super()
261 * and release aforementioned objects. Note: dentries and inodes _are_ 261 * and release aforementioned objects. Note: dentries and inodes _are_
262 * taken care of and do not need specific handling. 262 * taken care of and do not need specific handling.
263 *
264 * Upon calling this function, the filesystem may no longer alter or
265 * rearrange the set of dentries belonging to this super_block, nor may it
266 * change the attachments of dentries to inodes.
263 */ 267 */
264void generic_shutdown_super(struct super_block *sb) 268void generic_shutdown_super(struct super_block *sb)
265{ 269{
266 struct dentry *root = sb->s_root;
267 struct super_operations *sop = sb->s_op; 270 struct super_operations *sop = sb->s_op;
268 271
269 if (root) { 272 if (sb->s_root) {
270 sb->s_root = NULL; 273 shrink_dcache_for_umount(sb);
271 shrink_dcache_parent(root);
272 shrink_dcache_sb(sb);
273 dput(root);
274 fsync_super(sb); 274 fsync_super(sb);
275 lock_super(sb); 275 lock_super(sb);
276 sb->s_flags &= ~MS_ACTIVE; 276 sb->s_flags &= ~MS_ACTIVE;
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 146f1dedec84..298303b5a716 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -483,17 +483,12 @@ int sysfs_update_file(struct kobject * kobj, const struct attribute * attr)
483 (victim->d_parent->d_inode == dir->d_inode)) { 483 (victim->d_parent->d_inode == dir->d_inode)) {
484 victim->d_inode->i_mtime = CURRENT_TIME; 484 victim->d_inode->i_mtime = CURRENT_TIME;
485 fsnotify_modify(victim); 485 fsnotify_modify(victim);
486
487 /**
488 * Drop reference from initial sysfs_get_dentry().
489 */
490 dput(victim);
491 res = 0; 486 res = 0;
492 } else 487 } else
493 d_drop(victim); 488 d_drop(victim);
494 489
495 /** 490 /**
496 * Drop the reference acquired from sysfs_get_dentry() above. 491 * Drop the reference acquired from lookup_one_len() above.
497 */ 492 */
498 dput(victim); 493 dput(victim);
499 } 494 }
diff --git a/fs/sysv/super.c b/fs/sysv/super.c
index 350cba5d6803..dc9e7dc07fb7 100644
--- a/fs/sysv/super.c
+++ b/fs/sysv/super.c
@@ -358,16 +358,11 @@ static int sysv_fill_super(struct super_block *sb, void *data, int silent)
 	unsigned long blocknr;
 	int size = 0, i;
 
-	if (1024 != sizeof (struct xenix_super_block))
-		panic("Xenix FS: bad superblock size");
-	if (512 != sizeof (struct sysv4_super_block))
-		panic("SystemV FS: bad superblock size");
-	if (512 != sizeof (struct sysv2_super_block))
-		panic("SystemV FS: bad superblock size");
-	if (500 != sizeof (struct coh_super_block))
-		panic("Coherent FS: bad superblock size");
-	if (64 != sizeof (struct sysv_inode))
-		panic("sysv fs: bad inode size");
+	BUILD_BUG_ON(1024 != sizeof (struct xenix_super_block));
+	BUILD_BUG_ON(512 != sizeof (struct sysv4_super_block));
+	BUILD_BUG_ON(512 != sizeof (struct sysv2_super_block));
+	BUILD_BUG_ON(500 != sizeof (struct coh_super_block));
+	BUILD_BUG_ON(64 != sizeof (struct sysv_inode));
 
 	sbi = kzalloc(sizeof(struct sysv_sb_info), GFP_KERNEL);
 	if (!sbi)
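The sysv hunk replaces run-time panic() checks of on-disk structure sizes with compile-time BUILD_BUG_ON() assertions, so a mis-sized structure now breaks the build instead of crashing at mount time. In kernels of this vintage the macro was built on the negative-array-size trick; a self-contained sketch (MY_BUILD_BUG_ON and the example struct are stand-ins, not the kernel's definitions):

    /* If the condition is true, the array size becomes negative and the
     * compiler rejects the translation unit, so layout bugs surface at
     * build time rather than at mount time. */
    #define MY_BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2 * !!(condition)]))

    struct xenix_super_block_example { char data[1024]; };	/* hypothetical stand-in */

    static void check_layout(void)
    {
    	MY_BUILD_BUG_ON(sizeof(struct xenix_super_block_example) != 1024);
    }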
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 1d3b5d2070e5..1aea6a4f9a4a 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -1621,9 +1621,10 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
 		goto error_out;
 	}
 
-	if (UDF_SB_PARTFLAGS(sb, UDF_SB_PARTITION(sb)) & UDF_PART_FLAG_READ_ONLY)
+	if (UDF_SB_PARTFLAGS(sb, UDF_SB_PARTITION(sb)) & UDF_PART_FLAG_READ_ONLY) {
 		printk("UDF-fs: Partition marked readonly; forcing readonly mount\n");
 		sb->s_flags |= MS_RDONLY;
+	}
 
 	if ( udf_find_fileset(sb, &fileset, &rootdir) )
 	{
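The udf hunk fixes a missing-braces bug: without the braces only the printk() was guarded by the if, so MS_RDONLY was ORed into s_flags on every mount, not just on read-only partitions. The pitfall in miniature (standalone C, hypothetical names):

    #include <stdio.h>

    /* Without braces, only the first indented statement is conditional,
     * despite what the indentation suggests. */
    int mount_flags_buggy(int part_is_readonly, int flags)
    {
    	if (part_is_readonly)
    		printf("forcing readonly mount\n");
    		flags |= 1;	/* BUG: runs unconditionally */
    	return flags;
    }

    int mount_flags_fixed(int part_is_readonly, int flags)
    {
    	if (part_is_readonly) {
    		printf("forcing readonly mount\n");
    		flags |= 1;	/* now only on read-only partitions */
    	}
    	return flags;
    }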
diff --git a/fs/ufs/util.c b/fs/ufs/util.c
index 22f820a9b15c..17437574f79c 100644
--- a/fs/ufs/util.c
+++ b/fs/ufs/util.c
@@ -184,14 +184,13 @@ void _ubh_memcpyubh_(struct ufs_sb_private_info * uspi,
 dev_t
 ufs_get_inode_dev(struct super_block *sb, struct ufs_inode_info *ufsi)
 {
-	__fs32 fs32;
+	__u32 fs32;
 	dev_t dev;
 
 	if ((UFS_SB(sb)->s_flags & UFS_ST_MASK) == UFS_ST_SUNx86)
-		fs32 = ufsi->i_u1.i_data[1];
+		fs32 = fs32_to_cpu(sb, ufsi->i_u1.i_data[1]);
 	else
-		fs32 = ufsi->i_u1.i_data[0];
-	fs32 = fs32_to_cpu(sb, fs32);
+		fs32 = fs32_to_cpu(sb, ufsi->i_u1.i_data[0]);
 	switch (UFS_SB(sb)->s_flags & UFS_ST_MASK) {
 	case UFS_ST_SUNx86:
 	case UFS_ST_SUN:
@@ -212,7 +211,7 @@ ufs_get_inode_dev(struct super_block *sb, struct ufs_inode_info *ufsi)
 void
 ufs_set_inode_dev(struct super_block *sb, struct ufs_inode_info *ufsi, dev_t dev)
 {
-	__fs32 fs32;
+	__u32 fs32;
 
 	switch (UFS_SB(sb)->s_flags & UFS_ST_MASK) {
 	case UFS_ST_SUNx86:
@@ -227,11 +226,10 @@ ufs_set_inode_dev(struct super_block *sb, struct ufs_inode_info *ufsi, dev_t dev
 		fs32 = old_encode_dev(dev);
 		break;
 	}
-	fs32 = cpu_to_fs32(sb, fs32);
 	if ((UFS_SB(sb)->s_flags & UFS_ST_MASK) == UFS_ST_SUNx86)
-		ufsi->i_u1.i_data[1] = fs32;
+		ufsi->i_u1.i_data[1] = cpu_to_fs32(sb, fs32);
 	else
-		ufsi->i_u1.i_data[0] = fs32;
+		ufsi->i_u1.i_data[0] = cpu_to_fs32(sb, fs32);
 }
 
 /**
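The ufs hunks restore the endianness discipline that sparse's __fs32 annotation enforces: on-disk values stay in disk byte order, local variables hold host-order __u32, and fs32_to_cpu()/cpu_to_fs32() are applied exactly at the load or store. A userspace analogue of the same discipline, using ntohl()/htonl() as stand-ins for the sb-dependent converters (the struct and field names are illustrative):

    #include <stdint.h>
    #include <arpa/inet.h>	/* ntohl/htonl stand in for fs32_to_cpu/cpu_to_fs32 */

    struct disk_inode {
    	uint32_t dev_be;	/* stored big-endian on disk (assumption) */
    };

    uint32_t get_dev(const struct disk_inode *di)
    {
    	uint32_t dev = ntohl(di->dev_be);	/* convert once, on load */
    	return dev;
    }

    void set_dev(struct disk_inode *di, uint32_t dev)
    {
    	di->dev_be = htonl(dev);		/* convert once, on store */
    }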
diff --git a/fs/xattr.c b/fs/xattr.c
index c32f15b5f60f..395635100f77 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -135,6 +135,26 @@ vfs_getxattr(struct dentry *dentry, char *name, void *value, size_t size)
 }
 EXPORT_SYMBOL_GPL(vfs_getxattr);
 
+ssize_t
+vfs_listxattr(struct dentry *d, char *list, size_t size)
+{
+	ssize_t error;
+
+	error = security_inode_listxattr(d);
+	if (error)
+		return error;
+	error = -EOPNOTSUPP;
+	if (d->d_inode->i_op && d->d_inode->i_op->listxattr) {
+		error = d->d_inode->i_op->listxattr(d, list, size);
+	} else {
+		error = security_inode_listsecurity(d->d_inode, list, size);
+		if (size && error > size)
+			error = -ERANGE;
+	}
+	return error;
+}
+EXPORT_SYMBOL_GPL(vfs_listxattr);
+
 int
 vfs_removexattr(struct dentry *dentry, char *name)
 {
@@ -346,17 +366,7 @@ listxattr(struct dentry *d, char __user *list, size_t size)
 		return -ENOMEM;
 	}
 
-	error = security_inode_listxattr(d);
-	if (error)
-		goto out;
-	error = -EOPNOTSUPP;
-	if (d->d_inode->i_op && d->d_inode->i_op->listxattr) {
-		error = d->d_inode->i_op->listxattr(d, klist, size);
-	} else {
-		error = security_inode_listsecurity(d->d_inode, klist, size);
-		if (size && error > size)
-			error = -ERANGE;
-	}
+	error = vfs_listxattr(d, klist, size);
 	if (error > 0) {
 		if (size && copy_to_user(list, klist, error))
 			error = -EFAULT;
@@ -365,7 +375,6 @@ listxattr(struct dentry *d, char __user *list, size_t size)
 		   than XATTR_LIST_MAX bytes. Not possible. */
 		error = -E2BIG;
 	}
-out:
 	kfree(klist);
 	return error;
 }
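The xattr hunks lift the body of listxattr() into a new exported vfs_listxattr(), so in-kernel users get the security_inode_listxattr() hook and the -EOPNOTSUPP/-ERANGE fallback without duplicating the logic. A hedged sketch of how a kernel caller might use the helper — dump_xattr_names() is illustrative, not part of the patch; passing a NULL buffer with size 0 follows the usual xattr convention of returning the required length:

    #include <linux/fs.h>
    #include <linux/xattr.h>
    #include <linux/slab.h>

    static ssize_t dump_xattr_names(struct dentry *dentry)
    {
    	char *names;
    	ssize_t len;

    	len = vfs_listxattr(dentry, NULL, 0);	/* size query */
    	if (len <= 0)
    		return len;

    	names = kmalloc(len, GFP_KERNEL);
    	if (!names)
    		return -ENOMEM;

    	len = vfs_listxattr(dentry, names, len);
    	/* names now holds a NUL-separated list of attribute names */
    	kfree(names);
    	return len;
    }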
diff --git a/fs/xfs/linux-2.6/kmem.c b/fs/xfs/linux-2.6/kmem.c
index d59737589815..004baf600611 100644
--- a/fs/xfs/linux-2.6/kmem.c
+++ b/fs/xfs/linux-2.6/kmem.c
@@ -21,6 +21,7 @@
 #include <linux/highmem.h>
 #include <linux/swap.h>
 #include <linux/blkdev.h>
+#include <linux/backing-dev.h>
 #include "time.h"
 #include "kmem.h"
 
@@ -53,7 +54,7 @@ kmem_alloc(size_t size, unsigned int __nocast flags)
 		printk(KERN_ERR "XFS: possible memory allocation "
 				"deadlock in %s (mode:0x%x)\n",
 				__FUNCTION__, lflags);
-		blk_congestion_wait(WRITE, HZ/50);
+		congestion_wait(WRITE, HZ/50);
 	} while (1);
 }
 
@@ -131,7 +132,7 @@ kmem_zone_alloc(kmem_zone_t *zone, unsigned int __nocast flags)
 		printk(KERN_ERR "XFS: possible memory allocation "
 				"deadlock in %s (mode:0x%x)\n",
 				__FUNCTION__, lflags);
-		blk_congestion_wait(WRITE, HZ/50);
+		congestion_wait(WRITE, HZ/50);
 	} while (1);
 }
 
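This diff and the following xfs_buf.c diff follow from the same API move: blk_congestion_wait() became congestion_wait() and migrated from the block layer to the backing-dev layer, hence the new <linux/backing-dev.h> includes. The call sites keep XFS's throttled retry loop; a minimal sketch of that pattern (my_alloc_retry() is illustrative, not part of the patch):

    #include <linux/kernel.h>
    #include <linux/fs.h>		/* WRITE */
    #include <linux/slab.h>
    #include <linux/backing-dev.h>	/* congestion_wait() */

    static void *my_alloc_retry(size_t size, gfp_t flags)
    {
    	void *ptr;

    	do {
    		ptr = kmalloc(size, flags);
    		if (ptr)
    			return ptr;
    		/* Back off until write congestion eases, then retry. */
    		congestion_wait(WRITE, HZ / 50);
    	} while (1);
    }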
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 9bbadafdcb00..db5f5a3608ca 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -30,6 +30,7 @@
 #include <linux/hash.h>
 #include <linux/kthread.h>
 #include <linux/migrate.h>
+#include <linux/backing-dev.h>
 #include "xfs_linux.h"
 
 STATIC kmem_zone_t *xfs_buf_zone;
@@ -395,7 +396,7 @@ _xfs_buf_lookup_pages(
 
 		XFS_STATS_INC(xb_page_retries);
 		xfsbufd_wakeup(0, gfp_mask);
-		blk_congestion_wait(WRITE, HZ/50);
+		congestion_wait(WRITE, HZ/50);
 		goto retry;
 	}
 