Merge rsync://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6

Conflicts: include/linux/kernel.h
author: Steven Whitehouse <swhiteho@redhat.com> 2006-07-03 10:25:08 -0400
committer: Steven Whitehouse <swhiteho@redhat.com> 2006-07-03 10:25:08 -0400
commit: 0a1340c185734a57fbf4775927966ad4a1347b02 (patch)
tree: d9ed8f0dd809a7c542a3356601125ea5b5aaa804 /fs
parent: af18ddb8864b096e3ed4732e2d4b21c956dcfe3a (diff)
parent: 29454dde27d8e340bb1987bad9aa504af7081eba (diff)
523 files changed, 18972 insertions, 19366 deletions
diff --git a/fs/9p/conv.c b/fs/9p/conv.c
index a767e05b60bf..1e898144eb7c 100644
--- a/fs/9p/conv.c
+++ b/fs/9p/conv.c
@@ -24,7 +24,6 @@
 *
 */
-#include <linux/config.h>
 #include <linux/module.h>
 #include <linux/errno.h>
 #include <linux/fs.h>
diff --git a/fs/9p/error.c b/fs/9p/error.c
index 981fe8ecd780..ae91555c1558 100644
--- a/fs/9p/error.c
+++ b/fs/9p/error.c
@@ -27,7 +27,6 @@
 *
 */
-#include <linux/config.h>
 #include <linux/module.h>
 #include <linux/list.h>
diff --git a/fs/9p/fcall.c b/fs/9p/fcall.c
index 6f2617820a4e..8556097fcda8 100644
--- a/fs/9p/fcall.c
+++ b/fs/9p/fcall.c
@@ -24,7 +24,6 @@
 *
 */
-#include <linux/config.h>
 #include <linux/module.h>
 #include <linux/errno.h>
 #include <linux/fs.h>
diff --git a/fs/9p/fcprint.c b/fs/9p/fcprint.c
index 583e827baebd..34b96114a28d 100644
--- a/fs/9p/fcprint.c
+++ b/fs/9p/fcprint.c
@@ -21,7 +21,6 @@
 *  Boston, MA  02111-1301  USA
 *
 */
-#include <linux/config.h>
 #include <linux/module.h>
 #include <linux/errno.h>
 #include <linux/fs.h>
diff --git a/fs/9p/fid.c b/fs/9p/fid.c
index b7608af07ce8..70492ccb4385 100644
--- a/fs/9p/fid.c
+++ b/fs/9p/fid.c
@@ -20,7 +20,6 @@
 *
 */
-#include <linux/config.h>
 #include <linux/module.h>
 #include <linux/errno.h>
 #include <linux/fs.h>
diff --git a/fs/9p/mux.c b/fs/9p/mux.c
index f4407eb276c7..90a79c784549 100644
--- a/fs/9p/mux.c
+++ b/fs/9p/mux.c
@@ -23,7 +23,6 @@
 *
 */
-#include <linux/config.h>
 #include <linux/module.h>
 #include <linux/errno.h>
 #include <linux/fs.h>
@@ -712,7 +711,7 @@ static void v9fs_read_work(void *a)
 * v9fs_send_request - send 9P request
 * The function can sleep until the request is scheduled for sending.
 * The function can be interrupted. Return from the function is not
- * a guarantee that the request is sent succesfully. Can return errors
+ * a guarantee that the request is sent successfully. Can return errors
 * that can be retrieved by PTR_ERR macros.
 *
 * @m: mux data
@@ -932,6 +931,8 @@ v9fs_mux_rpc(struct v9fs_mux_data *m, struct v9fs_fcall *tc,
                                        r.rcall || r.err);
                        } while (!r.rcall && !r.err && err==-ERESTARTSYS &&
                                m->trans->status==Connected && !m->err);
+                        err = -ERESTARTSYS;
                }
                sigpending = 1;
        }
diff --git a/fs/9p/trans_fd.c b/fs/9p/trans_fd.c
index 94e0a7fd9fc2..34d43355beb7 100644
--- a/fs/9p/trans_fd.c
+++ b/fs/9p/trans_fd.c
@@ -25,7 +25,6 @@
 *
 */
-#include <linux/config.h>
 #include <linux/in.h>
 #include <linux/module.h>
 #include <linux/net.h>
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index d37416eb5791..22f7ccd58d38 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -23,7 +23,6 @@
 *
 */
-#include <linux/config.h>
 #include <linux/module.h>
 #include <linux/errno.h>
 #include <linux/fs.h>
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index f867b8d3e973..450b0c1b385e 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -38,7 +38,7 @@
 */
 extern struct file_system_type v9fs_fs_type;
-extern struct address_space_operations v9fs_addr_operations;
+extern const struct address_space_operations v9fs_addr_operations;
 extern const struct file_operations v9fs_file_operations;
 extern const struct file_operations v9fs_dir_operations;
 extern struct dentry_operations v9fs_dentry_operations;
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index efda46fb64d9..9dfd259a70b4 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -31,7 +31,6 @@
 #include <linux/string.h>
 #include <linux/smp_lock.h>
 #include <linux/inet.h>
-#include <linux/version.h>
 #include <linux/pagemap.h>
 #include <linux/idr.h>
@@ -103,6 +102,6 @@ UnmapAndUnlock:
        return retval;
 }
-struct address_space_operations v9fs_addr_operations = {
+const struct address_space_operations v9fs_addr_operations = {
      .readpage = v9fs_vfs_readpage,
 };
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 1a8e46084f0e..c3c47eda7574 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -31,7 +31,6 @@
 #include <linux/string.h>
 #include <linux/smp_lock.h>
 #include <linux/inet.h>
-#include <linux/version.h>
 #include <linux/list.h>
 #include <asm/uaccess.h>
 #include <linux/idr.h>
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 2cb87ba4b1c1..2f580a197b8d 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -300,7 +300,7 @@ clunk_fid:
        fid = V9FS_NOFID;
 put_fid:
-        if (fid >= 0)
+        if (fid != V9FS_NOFID)
                v9fs_put_idpool(fid, &v9ses->fidpool);
        kfree(fcall);
@@ -530,9 +530,6 @@ error:
        if (vfid)
                v9fs_fid_destroy(vfid);
-        if (inode)
-                iput(inode);
        return err;
 }
@@ -1054,6 +1051,9 @@ static int v9fs_vfs_readlink(struct dentry *dentry, char __user * buffer,
        int ret;
        char *link = __getname();
+        if (unlikely(!link))
+                return -ENOMEM;
        if (buflen > PATH_MAX)
                buflen = PATH_MAX;
@@ -1171,9 +1171,6 @@ error:
        if (vfid)
                v9fs_fid_destroy(vfid);
-        if (inode)
-                iput(inode);
        return err;
 }
@@ -1227,6 +1224,9 @@ v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir,
        }
        name = __getname();
+        if (unlikely(!name))
+                return -ENOMEM;
        sprintf(name, "%d\n", oldfid->fid);
        retval = v9fs_vfs_mkspecial(dir, dentry, V9FS_DMLINK, name);
        __putname(name);
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 61c599b4a1e3..63320d4e15d2 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -25,7 +25,6 @@
 */
 #include <linux/kernel.h>
-#include <linux/config.h>
 #include <linux/module.h>
 #include <linux/errno.h>
 #include <linux/fs.h>
@@ -99,12 +98,13 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
 * @flags: mount flags
 * @dev_name: device name that was mounted
 * @data: mount options
+ * @mnt: mountpoint record to be instantiated
 *
 */
-static struct super_block *v9fs_get_sb(struct file_system_type
+static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
-                                       *fs_type, int flags,
+                       const char *dev_name, void *data,
-                                       const char *dev_name, void *data)
+                       struct vfsmount *mnt)
 {
        struct super_block *sb = NULL;
        struct v9fs_fcall *fcall = NULL;
@@ -123,17 +123,19 @@ static struct super_block *v9fs_get_sb(struct file_system_type
        v9ses = kzalloc(sizeof(struct v9fs_session_info), GFP_KERNEL);
        if (!v9ses)
-                return ERR_PTR(-ENOMEM);
+                return -ENOMEM;
        if ((newfid = v9fs_session_init(v9ses, dev_name, data)) < 0) {
                dprintk(DEBUG_ERROR, "problem initiating session\n");
-                sb = ERR_PTR(newfid);
+                retval = newfid;
                goto out_free_session;
        }
        sb = sget(fs_type, NULL, v9fs_set_super, v9ses);
-        if (IS_ERR(sb))
+        if (IS_ERR(sb)) {
+                retval = PTR_ERR(sb);
                goto out_close_session;
+        }
        v9fs_fill_super(sb, v9ses, flags);
        inode = v9fs_get_inode(sb, S_IFDIR | mode);
@@ -184,19 +186,19 @@ static struct super_block *v9fs_get_sb(struct file_system_type
                goto put_back_sb;
        }
-        return sb;
+        return simple_set_mnt(mnt, sb);
 out_close_session:
        v9fs_session_close(v9ses);
 out_free_session:
        kfree(v9ses);
-        return sb;
+        return retval;
 put_back_sb:
        /* deactivate_super calls v9fs_kill_super which will frees the rest */
        up_write(&sb->s_umount);
        deactivate_super(sb);
-        return ERR_PTR(retval);
+        return retval;
 }
 /**
@@ -253,11 +255,12 @@ static int v9fs_show_options(struct seq_file *m, struct vfsmount *mnt)
 }
 static void
-v9fs_umount_begin(struct super_block *sb)
+v9fs_umount_begin(struct vfsmount *vfsmnt, int flags)
 {
-        struct v9fs_session_info *v9ses = sb->s_fs_info;
+        struct v9fs_session_info *v9ses = vfsmnt->mnt_sb->s_fs_info;
-        v9fs_session_cancel(v9ses);
+        if (flags & MNT_FORCE)
+                v9fs_session_cancel(v9ses);
 }
 static struct super_operations v9fs_super_ops = {
diff --git a/fs/Kconfig b/fs/Kconfig
index 563a59e5e694..a7cd7db5e533 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -53,7 +53,7 @@ config EXT2_FS_SECURITY
 config EXT2_FS_XIP
        bool "Ext2 execute in place support"
-        depends on EXT2_FS
+        depends on EXT2_FS && MMU
        help
          Execute in place can be used on memory-backed block devices. If you
          enable this option, you can select to mount block devices which are
@@ -327,7 +327,7 @@ source "fs/gfs2/Kconfig"
 config OCFS2_FS
        tristate "OCFS2 file system support (EXPERIMENTAL)"
-        depends on NET && EXPERIMENTAL
+        depends on NET && SYSFS && EXPERIMENTAL
        select CONFIGFS_FS
        select JBD
        select CRC32
@@ -357,6 +357,16 @@ config OCFS2_FS
                  - POSIX ACLs
                  - readpages / writepages (not user visible)
+config OCFS2_DEBUG_MASKLOG
+        bool "OCFS2 logging support"
+        depends on OCFS2_FS
+        default y
+        help
+          The ocfs2 filesystem has an extensive logging system.  The system
+          allows selection of events to log via files in /sys/o2cb/logmask/.
+          This option will enlarge your kernel, but it allows debugging of
+          ocfs2 filesystem issues.
 config MINIX_FS
        tristate "Minix fs support"
        help
@@ -394,18 +404,30 @@ config INOTIFY
        bool "Inotify file change notification support"
        default y
        ---help---
-          Say Y here to enable inotify support and the associated system
+          Say Y here to enable inotify support.  Inotify is a file change
-          calls.  Inotify is a file change notification system and a
+          notification system and a replacement for dnotify.  Inotify fixes
-          replacement for dnotify.  Inotify fixes numerous shortcomings in
+          numerous shortcomings in dnotify and introduces several new features
-          dnotify and introduces several new features.  It allows monitoring
+          including multiple file events, one-shot support, and unmount
-          of both files and directories via a single open fd.  Other features
-          include multiple file events, one-shot support, and unmount
          notification.
          For more information, see Documentation/filesystems/inotify.txt
          If unsure, say Y.
+config INOTIFY_USER
+        bool "Inotify support for userspace"
+        depends on INOTIFY
+        default y
+        ---help---
+          Say Y here to enable inotify support for userspace, including the
+          associated system calls.  Inotify allows monitoring of both files and
+          directories via a single open fd.  Events are read from the file
+          descriptor, which is also select()- and poll()-able.
+          For more information, see Documentation/filesystems/inotify.txt
+          If unsure, say Y.
 config QUOTA
        bool "Quota support"
        help
@@ -765,7 +787,8 @@ endmenu
 menu "Pseudo filesystems"
 config PROC_FS
-        bool "/proc file system support"
+        bool "/proc file system support" if EMBEDDED
+        default y
        help
          This is a virtual file system providing information about the status
          of the system. "Virtual" means that it doesn't take up any space on
@@ -1102,6 +1125,44 @@ config JFFS2_SUMMARY
          If unsure, say 'N'.
+config JFFS2_FS_XATTR
+        bool "JFFS2 XATTR support (EXPERIMENTAL)"
+        depends on JFFS2_FS && EXPERIMENTAL
+        default n
+        help
+          Extended attributes are name:value pairs associated with inodes by
+          the kernel or by users (see the attr(5) manual page, or visit
+          <http://acl.bestbits.at/> for details).
+          
+          If unsure, say N.
+config JFFS2_FS_POSIX_ACL
+        bool "JFFS2 POSIX Access Control Lists"
+        depends on JFFS2_FS_XATTR
+        default y
+        select FS_POSIX_ACL
+        help
+          Posix Access Control Lists (ACLs) support permissions for users and
+          groups beyond the owner/group/world scheme.
+          
+          To learn more about Access Control Lists, visit the Posix ACLs for
+          Linux website <http://acl.bestbits.at/>.
+          
+          If you don't know what Access Control Lists are, say N.
+config JFFS2_FS_SECURITY
+        bool "JFFS2 Security Labels"
+        depends on JFFS2_FS_XATTR
+        default y
+        help
+          Security labels support alternative access control models
+          implemented by security modules like SELinux.  This option
+          enables an extended attribute handler for file security
+          labels in the jffs2 filesystem.
+          
+          If you are not using a security module that requires using
+          extended attributes for file security labels, say N.
 config JFFS2_COMPRESSION_OPTIONS
        bool "Advanced compression options for JFFS2"
        depends on JFFS2_FS
@@ -1321,11 +1382,19 @@ config UFS_FS
 config UFS_FS_WRITE
        bool "UFS file system write support (DANGEROUS)"
-        depends on UFS_FS && EXPERIMENTAL && BROKEN
+        depends on UFS_FS && EXPERIMENTAL
        help
          Say Y here if you want to try writing to UFS partitions. This is
          experimental, so you should back up your UFS partitions beforehand.
+config UFS_DEBUG
+        bool "UFS debugging"
+        depends on UFS_FS
+        help
+          If you are experiencing any problems with the UFS filesystem, say
+          Y here.  This will result in _many_ additional debugging messages being
+          written to the system log.
 endmenu
 menu "Network File Systems"
@@ -1432,7 +1501,12 @@ config NFSD
        select LOCKD
        select SUNRPC
        select EXPORTFS
-        select NFS_ACL_SUPPORT if NFSD_V3_ACL || NFSD_V2_ACL
+        select NFSD_V2_ACL if NFSD_V3_ACL
+        select NFS_ACL_SUPPORT if NFSD_V2_ACL
+        select NFSD_TCP if NFSD_V4
+        select CRYPTO_MD5 if NFSD_V4
+        select CRYPTO if NFSD_V4
+        select FS_POSIX_ACL if NFSD_V4
        help
          If you want your Linux box to act as an NFS *server*, so that other
          computers on your local network which support NFS can access certain
@@ -1470,7 +1544,6 @@ config NFSD_V3
 config NFSD_V3_ACL
        bool "Provide server support for the NFSv3 ACL protocol extension"
        depends on NFSD_V3
-        select NFSD_V2_ACL
        help
          Implement the NFSv3 ACL protocol extension for manipulating POSIX
          Access Control Lists on exported file systems. NFS clients should
@@ -1480,10 +1553,6 @@ config NFSD_V3_ACL
 config NFSD_V4
        bool "Provide NFSv4 server support (EXPERIMENTAL)"
        depends on NFSD_V3 && EXPERIMENTAL
-        select NFSD_TCP
-        select CRYPTO_MD5
-        select CRYPTO
-        select FS_POSIX_ACL
        help
          If you would like to include the NFSv4 server as well as the NFSv2
          and NFSv3 servers, say Y here.  This feature is experimental, and
@@ -1664,7 +1733,7 @@ config CIFS_STATS
          mounted by the cifs client to be displayed in /proc/fs/cifs/Stats
 config CIFS_STATS2
-        bool "CIFS extended statistics"
+        bool "Extended statistics"
        depends on CIFS_STATS
        help
          Enabling this option will allow more detailed statistics on SMB
@@ -1677,6 +1746,32 @@ config CIFS_STATS2
          Unless you are a developer or are doing network performance analysis
          or tuning, say N.
+config CIFS_WEAK_PW_HASH
+        bool "Support legacy servers which use weaker LANMAN security"
+        depends on CIFS
+        help
+          Modern CIFS servers including Samba and most Windows versions
+          (since 1997) support stronger NTLM (and even NTLMv2 and Kerberos)
+          security mechanisms. These hash the password more securely
+          than the mechanisms used in the older LANMAN version of the
+          SMB protocol needed to establish sessions with old SMB servers.
+          Enabling this option allows the cifs module to mount to older
+          LANMAN based servers such as OS/2 and Windows 95, but such
+          mounts may be less secure than mounts using NTLM or more recent
+          security mechanisms if you are on a public network.  Unless you
+          have a need to access old SMB servers (and are on a private 
+          network) you probably want to say N.  Even if this support
+          is enabled in the kernel build, it will not be used
+          automatically. At runtime LANMAN mounts are disabled but
+          can be set to required (or optional) either in
+          /proc/fs/cifs (see fs/cifs/README for more detail) or via an
+          option on the mount command. This support is disabled by 
+          default in order to reduce the possibility of a downgrade
+          attack.
+ 
+          If unsure, say N.
 config CIFS_XATTR
        bool "CIFS extended attributes"
        depends on CIFS
@@ -1705,6 +1800,16 @@ config CIFS_POSIX
          (such as Samba 3.10 and later) which can negotiate
          CIFS POSIX ACL support.  If unsure, say N.
+config CIFS_DEBUG2
+        bool "Enable additional CIFS debugging routines"
+        help
+           Enabling this option adds a few more debugging routines
+           to the cifs code which slightly increases the size of
+           the cifs module and can cause additional logging of debug
+           messages in some error paths, slowing performance. This
+           option can be turned off unless you are debugging
+           cifs problems.  If unsure, say N.
+           
 config CIFS_EXPERIMENTAL
          bool "CIFS Experimental Features (EXPERIMENTAL)"
          depends on CIFS && EXPERIMENTAL
@@ -1720,7 +1825,7 @@ config CIFS_EXPERIMENTAL
            If unsure, say N.
 config CIFS_UPCALL
-          bool "CIFS Kerberos/SPNEGO advanced session setup (EXPERIMENTAL)"
+          bool "Kerberos/SPNEGO advanced session setup (EXPERIMENTAL)"
          depends on CIFS_EXPERIMENTAL
          select CONNECTOR
          help
diff --git a/fs/Makefile b/fs/Makefile
index c731d2c0f409..64df11047ccc 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -13,6 +13,7 @@ obj-y :=	open.o read_write.o file_table.o buffer.o  bio.o super.o \
                ioprio.o pnode.o drop_caches.o splice.o sync.o
 obj-$(CONFIG_INOTIFY)           += inotify.o
+obj-$(CONFIG_INOTIFY_USER)      += inotify_user.o
 obj-$(CONFIG_EPOLL)             += eventpoll.o
 obj-$(CONFIG_COMPAT)            += compat.o compat_ioctl.o
@@ -66,7 +67,6 @@ obj-$(CONFIG_MSDOS_FS)		+= msdos/
 obj-$(CONFIG_VFAT_FS)           += vfat/
 obj-$(CONFIG_BFS_FS)            += bfs/
 obj-$(CONFIG_ISO9660_FS)        += isofs/
-obj-$(CONFIG_DEVFS_FS)          += devfs/
 obj-$(CONFIG_HFSPLUS_FS)        += hfsplus/ # Before hfs to find wrapped HFS+
 obj-$(CONFIG_HFS_FS)            += hfs/
 obj-$(CONFIG_VXFS_FS)           += freevxfs/
diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c
index 7b075fc397da..d3c7905b2ddc 100644
--- a/fs/adfs/dir.c
+++ b/fs/adfs/dir.c
@@ -9,7 +9,6 @@
 *
 *  Common directory handling for ADFS
 */
-#include <linux/config.h>
 #include <linux/errno.h>
 #include <linux/fs.h>
 #include <linux/adfs_fs.h>
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index a02802a30798..534f3eecc985 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -72,7 +72,7 @@ static sector_t _adfs_bmap(struct address_space *mapping, sector_t block)
        return generic_block_bmap(mapping, block, adfs_get_block);
 }
-static struct address_space_operations adfs_aops = {
+static const struct address_space_operations adfs_aops = {
        .readpage       = adfs_readpage,
        .writepage      = adfs_writepage,
        .sync_page      = block_sync_page,
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 252abda0d200..ba1c88af49fe 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -196,17 +196,17 @@ static int adfs_remount(struct super_block *sb, int *flags, char *data)
        return parse_options(sb, data);
 }
-static int adfs_statfs(struct super_block *sb, struct kstatfs *buf)
+static int adfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
-        struct adfs_sb_info *asb = ADFS_SB(sb);
+        struct adfs_sb_info *asb = ADFS_SB(dentry->d_sb);
        buf->f_type    = ADFS_SUPER_MAGIC;
        buf->f_namelen = asb->s_namelen;
-        buf->f_bsize   = sb->s_blocksize;
+        buf->f_bsize   = dentry->d_sb->s_blocksize;
        buf->f_blocks  = asb->s_size;
        buf->f_files   = asb->s_ids_per_zone * asb->s_map_size;
        buf->f_bavail  =
-        buf->f_bfree   = adfs_map_free(sb);
+        buf->f_bfree   = adfs_map_free(dentry->d_sb);
        buf->f_ffree   = (long)(buf->f_bfree * buf->f_files) / (long)buf->f_blocks;
        return 0;
@@ -470,10 +470,11 @@ error:
        return -EINVAL;
 }
-static struct super_block *adfs_get_sb(struct file_system_type *fs_type,
+static int adfs_get_sb(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data)
+        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-        return get_sb_bdev(fs_type, flags, dev_name, data, adfs_fill_super);
+        return get_sb_bdev(fs_type, flags, dev_name, data, adfs_fill_super,
+                           mnt);
 }
 static struct file_system_type adfs_fs_type = {
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index a43a876742b8..0ddd4cc0d1a0 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -195,9 +195,9 @@ extern struct inode_operations   affs_symlink_inode_operations;
 extern const struct file_operations      affs_file_operations;
 extern const struct file_operations      affs_file_operations_ofs;
 extern const struct file_operations      affs_dir_operations;
-extern struct address_space_operations   affs_symlink_aops;
+extern const struct address_space_operations     affs_symlink_aops;
-extern struct address_space_operations   affs_aops;
+extern const struct address_space_operations     affs_aops;
-extern struct address_space_operations   affs_aops_ofs;
+extern const struct address_space_operations     affs_aops_ofs;
 extern struct dentry_operations  affs_dentry_operations;
 extern struct dentry_operations  affs_dentry_operations_intl;
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 7076262af39b..3de8590e4f6a 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -406,7 +406,7 @@ static sector_t _affs_bmap(struct address_space *mapping, sector_t block)
 {
        return generic_block_bmap(mapping,block,affs_get_block);
 }
-struct address_space_operations affs_aops = {
+const struct address_space_operations affs_aops = {
        .readpage = affs_readpage,
        .writepage = affs_writepage,
        .sync_page = block_sync_page,
@@ -759,7 +759,7 @@ out:
        goto done;
 }
-struct address_space_operations affs_aops_ofs = {
+const struct address_space_operations affs_aops_ofs = {
        .readpage = affs_readpage_ofs,
        //.writepage = affs_writepage_ofs,
        //.sync_page = affs_sync_page_ofs,
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 4d7e5b19e5cd..5200f4938df0 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -18,7 +18,7 @@
 extern struct timezone sys_tz;
-static int affs_statfs(struct super_block *sb, struct kstatfs *buf);
+static int affs_statfs(struct dentry *dentry, struct kstatfs *buf);
 static int affs_remount (struct super_block *sb, int *flags, char *data);
 static void
@@ -271,6 +271,7 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
        int                      reserved;
        unsigned long            mount_flags;
        int                      tmp_flags;     /* fix remount prototype... */
+        u8                       sig[4];
        pr_debug("AFFS: read_super(%s)\n",data ? (const char *)data : "no options");
@@ -370,8 +371,9 @@ got_root:
                printk(KERN_ERR "AFFS: Cannot read boot block\n");
                goto out_error;
        }
-        chksum = be32_to_cpu(*(__be32 *)boot_bh->b_data);
+        memcpy(sig, boot_bh->b_data, 4);
        brelse(boot_bh);
+        chksum = be32_to_cpu(*(__be32 *)sig);
        /* Dircache filesystems are compatible with non-dircache ones
         * when reading. As long as they aren't supported, writing is
@@ -420,11 +422,11 @@ got_root:
        }
        if (mount_flags & SF_VERBOSE) {
-                chksum = cpu_to_be32(chksum);
+                u8 len = AFFS_ROOT_TAIL(sb, root_bh)->disk_name[0];
-                printk(KERN_NOTICE "AFFS: Mounting volume \"%*s\": Type=%.3s\\%c, Blocksize=%d\n",
+                printk(KERN_NOTICE "AFFS: Mounting volume \"%.*s\": Type=%.3s\\%c, Blocksize=%d\n",
-                        AFFS_ROOT_TAIL(sb, root_bh)->disk_name[0],
+                        len > 31 ? 31 : len,
                        AFFS_ROOT_TAIL(sb, root_bh)->disk_name + 1,
-                        (char *)&chksum,((char *)&chksum)[3] + '0',blocksize);
+                        sig, sig[3] + '0', blocksize);
        }
        sb->s_flags |= MS_NODEV | MS_NOSUID;
@@ -508,8 +510,9 @@ affs_remount(struct super_block *sb, int *flags, char *data)
 }
 static int
-affs_statfs(struct super_block *sb, struct kstatfs *buf)
+affs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
+        struct super_block *sb = dentry->d_sb;
        int              free;
        pr_debug("AFFS: statfs() partsize=%d, reserved=%d\n",AFFS_SB(sb)->s_partition_size,
@@ -524,10 +527,11 @@ affs_statfs(struct super_block *sb, struct kstatfs *buf)
        return 0;
 }
-static struct super_block *affs_get_sb(struct file_system_type *fs_type,
+static int affs_get_sb(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data)
+        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-        return get_sb_bdev(fs_type, flags, dev_name, data, affs_fill_super);
+        return get_sb_bdev(fs_type, flags, dev_name, data, affs_fill_super,
+                           mnt);
 }
 static struct file_system_type affs_fs_type = {
diff --git a/fs/affs/symlink.c b/fs/affs/symlink.c
index 426f0f094f23..f802256a5933 100644
--- a/fs/affs/symlink.c
+++ b/fs/affs/symlink.c
@@ -66,7 +66,7 @@ fail:
        return err;
 }
-struct address_space_operations affs_symlink_aops = {
+const struct address_space_operations affs_symlink_aops = {
        .readpage       = affs_symlink_readpage,
 };
diff --git a/fs/afs/cell.c b/fs/afs/cell.c
index 009a9ae88d61..bfc1fd22d5b1 100644
--- a/fs/afs/cell.c
+++ b/fs/afs/cell.c
@@ -413,8 +413,7 @@ int afs_server_find_by_peer(const struct rxrpc_peer *peer,
        /* we found it in the graveyard - resurrect it */
 found_dead_server:
-        list_del(&server->link);
+        list_move_tail(&server->link, &cell->sv_list);
-        list_add_tail(&server->link, &cell->sv_list);
        afs_get_server(server);
        afs_kafstimod_del_timer(&server->timeout);
        spin_unlock(&cell->sv_gylock);
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index a6dff6a4f204..2fc99877cb0d 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -185,9 +185,7 @@ static struct page *afs_dir_get_page(struct inode *dir, unsigned long index)
        _enter("{%lu},%lu", dir->i_ino, index);
-        page = read_cache_page(dir->i_mapping,index,
+        page = read_mapping_page(dir->i_mapping, index, NULL);
-                               (filler_t *) dir->i_mapping->a_ops->readpage,
-                               NULL);
        if (!IS_ERR(page)) {
                wait_on_page_locked(page);
                kmap(page);
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 7bb716887e29..67d6634101fd 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -35,7 +35,7 @@ struct inode_operations afs_file_inode_operations = {
        .getattr        = afs_inode_getattr,
 };
-struct address_space_operations afs_fs_aops = {
+const struct address_space_operations afs_fs_aops = {
        .readpage       = afs_file_readpage,
        .sync_page      = block_sync_page,
        .set_page_dirty = __set_page_dirty_nobuffers,
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 72febdf9a35a..e88b3b65ae49 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -69,7 +69,7 @@ extern const struct file_operations afs_dir_file_operations;
 /*
 * file.c
 */
-extern struct address_space_operations afs_fs_aops;
+extern const struct address_space_operations afs_fs_aops;
 extern struct inode_operations afs_file_inode_operations;
 #ifdef AFS_CACHING_SUPPORT
diff --git a/fs/afs/kafsasyncd.c b/fs/afs/kafsasyncd.c
index 7ac07d0d47b9..f09a794f248e 100644
--- a/fs/afs/kafsasyncd.c
+++ b/fs/afs/kafsasyncd.c
@@ -136,8 +136,7 @@ static int kafsasyncd(void *arg)
                        if (!list_empty(&kafsasyncd_async_attnq)) {
                                op = list_entry(kafsasyncd_async_attnq.next,
                                                struct afs_async_op, link);
-                                list_del(&op->link);
+                                list_move_tail(&op->link,
-                                list_add_tail(&op->link,
                                              &kafsasyncd_async_busyq);
                        }
@@ -204,8 +203,7 @@ void afs_kafsasyncd_begin_op(struct afs_async_op *op)
        init_waitqueue_entry(&op->waiter, kafsasyncd_task);
        add_wait_queue(&op->call->waitq, &op->waiter);
-        list_del(&op->link);
+        list_move_tail(&op->link, &kafsasyncd_async_busyq);
-        list_add_tail(&op->link, &kafsasyncd_async_busyq);
        spin_unlock(&kafsasyncd_async_lock);
@@ -223,8 +221,7 @@ void afs_kafsasyncd_attend_op(struct afs_async_op *op)
        spin_lock(&kafsasyncd_async_lock);
-        list_del(&op->link);
+        list_move_tail(&op->link, &kafsasyncd_async_attnq);
-        list_add_tail(&op->link, &kafsasyncd_async_attnq);
        spin_unlock(&kafsasyncd_async_lock);
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index 4e6eeb59b83c..99785a79d043 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -63,7 +63,6 @@ unsigned long afs_mntpt_expiry_timeout = 20;
 int afs_mntpt_check_symlink(struct afs_vnode *vnode)
 {
        struct page *page;
-        filler_t *filler;
        size_t size;
        char *buf;
        int ret;
@@ -71,10 +70,7 @@ int afs_mntpt_check_symlink(struct afs_vnode *vnode)
        _enter("{%u,%u}", vnode->fid.vnode, vnode->fid.unique);
        /* read the contents of the symlink into the pagecache */
-        filler = (filler_t *) AFS_VNODE_TO_I(vnode)->i_mapping->a_ops->readpage;
+        page = read_mapping_page(AFS_VNODE_TO_I(vnode)->i_mapping, 0, NULL);
-        page = read_cache_page(AFS_VNODE_TO_I(vnode)->i_mapping, 0,
-                               filler, NULL);
        if (IS_ERR(page)) {
                ret = PTR_ERR(page);
                goto out;
@@ -160,7 +156,6 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
        struct page *page = NULL;
        size_t size;
        char *buf, *devname = NULL, *options = NULL;
-        filler_t *filler;
        int ret;
        kenter("{%s}", mntpt->d_name.name);
@@ -182,9 +177,7 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
                goto error;
        /* read the contents of the AFS special symlink */
-        filler = (filler_t *)mntpt->d_inode->i_mapping->a_ops->readpage;
+        page = read_mapping_page(mntpt->d_inode->i_mapping, 0, NULL);
-        page = read_cache_page(mntpt->d_inode->i_mapping, 0, filler, NULL);
        if (IS_ERR(page)) {
                ret = PTR_ERR(page);
                goto error;
@@ -210,7 +203,7 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
        /* try and do the mount */
        kdebug("--- attempting mount %s -o %s ---", devname, options);
-        mnt = do_kern_mount("afs", 0, devname, options);
+        mnt = vfs_kern_mount(&afs_fs_type, 0, devname, options);
        kdebug("--- mount result %p ---", mnt);
        free_page((unsigned long) devname);
diff --git a/fs/afs/server.c b/fs/afs/server.c
index 62b093aa41c6..22afaae1a4ce 100644
--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -123,8 +123,7 @@ int afs_server_lookup(struct afs_cell *cell, const struct in_addr *addr,
 resurrect_server:
        _debug("resurrecting server");
-        list_del(&zombie->link);
+        list_move_tail(&zombie->link, &cell->sv_list);
-        list_add_tail(&zombie->link, &cell->sv_list);
        afs_get_server(zombie);
        afs_kafstimod_del_timer(&zombie->timeout);
        spin_unlock(&cell->sv_gylock);
@@ -168,8 +167,7 @@ void afs_put_server(struct afs_server *server)
        }
        spin_lock(&cell->sv_gylock);
-        list_del(&server->link);
+        list_move_tail(&server->link, &cell->sv_graveyard);
-        list_add_tail(&server->link, &cell->sv_graveyard);
        /* time out in 10 secs */
        afs_kafstimod_add_timer(&server->timeout, 10 * HZ);
diff --git a/fs/afs/super.c b/fs/afs/super.c
index 53c56e7231ab..67d1f5c819ec 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -38,9 +38,9 @@ struct afs_mount_params {
 static void afs_i_init_once(void *foo, kmem_cache_t *cachep,
                            unsigned long flags);
-static struct super_block *afs_get_sb(struct file_system_type *fs_type,
+static int afs_get_sb(struct file_system_type *fs_type,
-                                      int flags, const char *dev_name,
+                      int flags, const char *dev_name,
-                                      void *data);
+                      void *data, struct vfsmount *mnt);
 static struct inode *afs_alloc_inode(struct super_block *sb);
@@ -48,7 +48,7 @@ static void afs_put_super(struct super_block *sb);
 static void afs_destroy_inode(struct inode *inode);
-static struct file_system_type afs_fs_type = {
+struct file_system_type afs_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "afs",
        .get_sb         = afs_get_sb,
@@ -294,10 +294,11 @@ static int afs_fill_super(struct super_block *sb, void *data, int silent)
 * get an AFS superblock
 * - TODO: don't use get_sb_nodev(), but rather call sget() directly
 */
-static struct super_block *afs_get_sb(struct file_system_type *fs_type,
+static int afs_get_sb(struct file_system_type *fs_type,
-                                      int flags,
+                      int flags,
-                                      const char *dev_name,
+                      const char *dev_name,
-                                      void *options)
+                      void *options,
+                      struct vfsmount *mnt)
 {
        struct afs_mount_params params;
        struct super_block *sb;
@@ -311,7 +312,7 @@ static struct super_block *afs_get_sb(struct file_system_type *fs_type,
        ret = afscm_start();
        if (ret < 0) {
                _leave(" = %d", ret);
-                return ERR_PTR(ret);
+                return ret;
        }
        /* parse the options */
@@ -348,18 +349,19 @@ static struct super_block *afs_get_sb(struct file_system_type *fs_type,
                goto error;
        }
        sb->s_flags |= MS_ACTIVE;
+        simple_set_mnt(mnt, sb);
        afs_put_volume(params.volume);
        afs_put_cell(params.default_cell);
-        _leave(" = %p", sb);
+        _leave(" = 0 [%p]", sb);
-        return sb;
+        return 0;
 error:
        afs_put_volume(params.volume);
        afs_put_cell(params.default_cell);
        afscm_stop();
        _leave(" = %d", ret);
-        return ERR_PTR(ret);
+        return ret;
 } /* end afs_get_sb() */
 /*****************************************************************************/
diff --git a/fs/afs/super.h b/fs/afs/super.h
index ac11362f4e95..32de8cc6fae8 100644
--- a/fs/afs/super.h
+++ b/fs/afs/super.h
@@ -38,6 +38,8 @@ static inline struct afs_super_info *AFS_FS_S(struct super_block *sb)
        return sb->s_fs_info;
 }
+extern struct file_system_type afs_fs_type;
 #endif /* __KERNEL__ */
 #endif /* _LINUX_AFS_SUPER_H */
diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c
index eced20618ecc..331f730a1fb3 100644
--- a/fs/afs/vlocation.c
+++ b/fs/afs/vlocation.c
@@ -326,8 +326,7 @@ int afs_vlocation_lookup(struct afs_cell *cell,
        /* found in the graveyard - resurrect */
        _debug("found in graveyard");
        atomic_inc(&vlocation->usage);
-        list_del(&vlocation->link);
+        list_move_tail(&vlocation->link, &cell->vl_list);
-        list_add_tail(&vlocation->link, &cell->vl_list);
        spin_unlock(&cell->vl_gylock);
        afs_kafstimod_del_timer(&vlocation->timeout);
@@ -478,8 +477,7 @@ static void __afs_put_vlocation(struct afs_vlocation *vlocation)
        }
        /* move to graveyard queue */
-        list_del(&vlocation->link);
+        list_move_tail(&vlocation->link,&cell->vl_graveyard);
-        list_add_tail(&vlocation->link,&cell->vl_graveyard);
        /* remove from pending timeout queue (refcounted if actually being
         * updated) */
diff --git a/fs/afs/vnode.c b/fs/afs/vnode.c
index 9867fef3261d..cf62da5d7825 100644
--- a/fs/afs/vnode.c
+++ b/fs/afs/vnode.c
@@ -104,8 +104,7 @@ static void afs_vnode_finalise_status_update(struct afs_vnode *vnode,
                                        vnode->cb_expiry * HZ);
                spin_lock(&afs_cb_hash_lock);
-                list_del(&vnode->cb_hash_link);
+                list_move_tail(&vnode->cb_hash_link,
-                list_add_tail(&vnode->cb_hash_link,
                              &afs_cb_hash(server, &vnode->fid));
                spin_unlock(&afs_cb_hash_lock);
diff --git a/fs/aio.c b/fs/aio.c
index e41e932ba489..950630187acc 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -641,7 +641,7 @@ static inline int __queue_kicked_iocb(struct kiocb *iocb)
 *      invoked both for initial i/o submission and
 *      subsequent retries via the aio_kick_handler.
 *      Expects to be invoked with iocb->ki_ctx->lock
- *      already held. The lock is released and reaquired
+ *      already held. The lock is released and reacquired
 *      as needed during processing.
 *
 * Calls the iocb retry method (already setup for the
@@ -777,11 +777,11 @@ out:
 static int __aio_run_iocbs(struct kioctx *ctx)
 {
        struct kiocb *iocb;
-        LIST_HEAD(run_list);
+        struct list_head run_list;
        assert_spin_locked(&ctx->ctx_lock);
-        list_splice_init(&ctx->run_list, &run_list);
+        list_replace_init(&ctx->run_list, &run_list);
        while (!list_empty(&run_list)) {
                iocb = list_entry(run_list.next, struct kiocb,
                        ki_run_list);
diff --git a/fs/autofs/init.c b/fs/autofs/init.c
index b977ece69f0c..aca123752406 100644
--- a/fs/autofs/init.c
+++ b/fs/autofs/init.c
@@ -14,10 +14,10 @@
 #include <linux/init.h>
 #include "autofs_i.h"
-static struct super_block *autofs_get_sb(struct file_system_type *fs_type,
+static int autofs_get_sb(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data)
+        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-        return get_sb_nodev(fs_type, flags, data, autofs_fill_super);
+        return get_sb_nodev(fs_type, flags, data, autofs_fill_super, mnt);
 }
 static struct file_system_type autofs_fs_type = {
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index b8ce02607d66..8dbd44f10e9d 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -174,6 +174,12 @@ static int autofs4_tree_busy(struct vfsmount *mnt,
                        struct autofs_info *ino = autofs4_dentry_ino(p);
                        unsigned int ino_count = atomic_read(&ino->count);
+                        /*
+                         * Clean stale dentries below that have not been
+                         * invalidated after a mount fail during lookup
+                         */
+                        d_invalidate(p);
                        /* allow for dget above and top is already dgot */
                        if (p == top)
                                ino_count += 2;
@@ -370,8 +376,7 @@ next:
                DPRINTK("returning %p %.*s",
                        expired, (int)expired->d_name.len, expired->d_name.name);
                spin_lock(&dcache_lock);
-                list_del(&expired->d_parent->d_subdirs);
+                list_move(&expired->d_parent->d_subdirs, &expired->d_u.d_child);
-                list_add(&expired->d_parent->d_subdirs, &expired->d_u.d_child);
                spin_unlock(&dcache_lock);
                return expired;
        }
diff --git a/fs/autofs4/init.c b/fs/autofs4/init.c
index acecec8578ce..5d9193332bef 100644
--- a/fs/autofs4/init.c
+++ b/fs/autofs4/init.c
@@ -14,10 +14,10 @@
 #include <linux/init.h>
 #include "autofs_i.h"
-static struct super_block *autofs_get_sb(struct file_system_type *fs_type,
+static int autofs_get_sb(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data)
+        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-        return get_sb_nodev(fs_type, flags, data, autofs4_fill_super);
+        return get_sb_nodev(fs_type, flags, data, autofs4_fill_super, mnt);
 }
 static struct file_system_type autofs_fs_type = {
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 68ebd10f345d..fcaeead9696b 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -49,7 +49,7 @@ static int befs_nls2utf(struct super_block *sb, const char *in, int in_len,
                        char **out, int *out_len);
 static void befs_put_super(struct super_block *);
 static int befs_remount(struct super_block *, int *, char *);
-static int befs_statfs(struct super_block *, struct kstatfs *);
+static int befs_statfs(struct dentry *, struct kstatfs *);
 static int parse_options(char *, befs_mount_options *);
 static const struct super_operations befs_sops = {
@@ -73,7 +73,7 @@ static struct inode_operations befs_dir_inode_operations = {
        .lookup         = befs_lookup,
 };
-static struct address_space_operations befs_aops = {
+static const struct address_space_operations befs_aops = {
        .readpage       = befs_readpage,
        .sync_page      = block_sync_page,
        .bmap           = befs_bmap,
@@ -325,7 +325,7 @@ befs_read_inode(struct inode *inode)
        if (!bh) {
                befs_error(sb, "unable to read inode block - "
                           "inode = %lu", inode->i_ino);
-                goto unaquire_none;
+                goto unacquire_none;
        }
        raw_inode = (befs_inode *) bh->b_data;
@@ -334,7 +334,7 @@ befs_read_inode(struct inode *inode)
        if (befs_check_inode(sb, raw_inode, inode->i_ino) != BEFS_OK) {
                befs_error(sb, "Bad inode: %lu", inode->i_ino);
-                goto unaquire_bh;
+                goto unacquire_bh;
        }
        inode->i_mode = (umode_t) fs32_to_cpu(sb, raw_inode->mode);
@@ -402,17 +402,17 @@ befs_read_inode(struct inode *inode)
                befs_error(sb, "Inode %lu is not a regular file, "
                           "directory or symlink. THAT IS WRONG! BeFS has no "
                           "on disk special files", inode->i_ino);
-                goto unaquire_bh;
+                goto unacquire_bh;
        }
        brelse(bh);
        befs_debug(sb, "<--- befs_read_inode()");
        return;
-      unaquire_bh:
+      unacquire_bh:
        brelse(bh);
-      unaquire_none:
+      unacquire_none:
        make_bad_inode(inode);
        befs_debug(sb, "<--- befs_read_inode() - Bad inode");
        return;
@@ -761,14 +761,14 @@ befs_fill_super(struct super_block *sb, void *data, int silent)
                printk(KERN_ERR
                       "BeFS(%s): Unable to allocate memory for private "
                       "portion of superblock. Bailing.\n", sb->s_id);
-                goto unaquire_none;
+                goto unacquire_none;
        }
        befs_sb = BEFS_SB(sb);
        memset(befs_sb, 0, sizeof(befs_sb_info));
        if (!parse_options((char *) data, &befs_sb->mount_opts)) {
                befs_error(sb, "cannot parse mount options");
-                goto unaquire_priv_sbp;
+                goto unacquire_priv_sbp;
        }
        befs_debug(sb, "---> befs_fill_super()");
@@ -794,7 +794,7 @@ befs_fill_super(struct super_block *sb, void *data, int silent)
        if (!(bh = sb_bread(sb, sb_block))) {
                befs_error(sb, "unable to read superblock");
-                goto unaquire_priv_sbp;
+                goto unacquire_priv_sbp;
        }
        /* account for offset of super block on x86 */
@@ -809,20 +809,20 @@ befs_fill_super(struct super_block *sb, void *data, int silent)
        }
        if (befs_load_sb(sb, disk_sb) != BEFS_OK)
-                goto unaquire_bh;
+                goto unacquire_bh;
        befs_dump_super_block(sb, disk_sb);
        brelse(bh);
        if (befs_check_sb(sb) != BEFS_OK)
-                goto unaquire_priv_sbp;
+                goto unacquire_priv_sbp;
        if( befs_sb->num_blocks > ~((sector_t)0) ) {
                befs_error(sb, "blocks count: %Lu "
                        "is larger than the host can use",
                        befs_sb->num_blocks);
-                goto unaquire_priv_sbp;
+                goto unacquire_priv_sbp;
        }
        /*
@@ -838,7 +838,7 @@ befs_fill_super(struct super_block *sb, void *data, int silent)
        if (!sb->s_root) {
                iput(root);
                befs_error(sb, "get root inode failed");
-                goto unaquire_priv_sbp;
+                goto unacquire_priv_sbp;
        }
        /* load nls library */
@@ -860,13 +860,13 @@ befs_fill_super(struct super_block *sb, void *data, int silent)
        return 0;
 /*****************/
-      unaquire_bh:
+      unacquire_bh:
        brelse(bh);
-      unaquire_priv_sbp:
+      unacquire_priv_sbp:
        kfree(sb->s_fs_info);
-      unaquire_none:
+      unacquire_none:
        sb->s_fs_info = NULL;
        return -EINVAL;
 }
@@ -880,8 +880,9 @@ befs_remount(struct super_block *sb, int *flags, char *data)
 }
 static int
-befs_statfs(struct super_block *sb, struct kstatfs *buf)
+befs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
+        struct super_block *sb = dentry->d_sb;
        befs_debug(sb, "---> befs_statfs()");
@@ -899,11 +900,12 @@ befs_statfs(struct super_block *sb, struct kstatfs *buf)
        return 0;
 }
-static struct super_block *
+static int
 befs_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name,
-            void *data)
+            void *data, struct vfsmount *mnt)
 {
-        return get_sb_bdev(fs_type, flags, dev_name, data, befs_fill_super);
+        return get_sb_bdev(fs_type, flags, dev_name, data, befs_fill_super,
+                           mnt);
 }
 static struct file_system_type befs_fs_type = {
@@ -923,18 +925,18 @@ init_befs_fs(void)
        err = befs_init_inodecache();
        if (err)
-                goto unaquire_none;
+                goto unacquire_none;
        err = register_filesystem(&befs_fs_type);
        if (err)
-                goto unaquire_inodecache;
+                goto unacquire_inodecache;
        return 0;
-unaquire_inodecache:
+unacquire_inodecache:
        befs_destroy_inodecache();
-unaquire_none:
+unacquire_none:
        return err;
 }
diff --git a/fs/bfs/bfs.h b/fs/bfs/bfs.h
index 9d791004b21c..31973bbbf057 100644
--- a/fs/bfs/bfs.h
+++ b/fs/bfs/bfs.h
@@ -50,7 +50,7 @@ static inline struct bfs_inode_info *BFS_I(struct inode *inode)
 /* file.c */
 extern struct inode_operations bfs_file_inops;
 extern const struct file_operations bfs_file_operations;
-extern struct address_space_operations bfs_aops;
+extern const struct address_space_operations bfs_aops;
 /* dir.c */
 extern struct inode_operations bfs_dir_inops;
diff --git a/fs/bfs/file.c b/fs/bfs/file.c
index d83cd74a2e4e..3d5aca28a0a0 100644
--- a/fs/bfs/file.c
+++ b/fs/bfs/file.c
@@ -153,7 +153,7 @@ static sector_t bfs_bmap(struct address_space *mapping, sector_t block)
        return generic_block_bmap(mapping, block, bfs_get_block);
 }
-struct address_space_operations bfs_aops = {
+const struct address_space_operations bfs_aops = {
        .readpage       = bfs_readpage,
        .writepage      = bfs_writepage,
        .sync_page      = block_sync_page,
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index 55a7a78332f8..cf74f3d4d966 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -203,8 +203,9 @@ static void bfs_put_super(struct super_block *s)
        s->s_fs_info = NULL;
 }
-static int bfs_statfs(struct super_block *s, struct kstatfs *buf)
+static int bfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
+        struct super_block *s = dentry->d_sb;
        struct bfs_sb_info *info = BFS_SB(s);
        u64 id = huge_encode_dev(s->s_bdev->bd_dev);
        buf->f_type = BFS_MAGIC;
@@ -410,10 +411,10 @@ out:
        return -EINVAL;
 }
-static struct super_block *bfs_get_sb(struct file_system_type *fs_type,
+static int bfs_get_sb(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data)
+        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-        return get_sb_bdev(fs_type, flags, dev_name, data, bfs_fill_super);
+        return get_sb_bdev(fs_type, flags, dev_name, data, bfs_fill_super, mnt);
 }
 static struct file_system_type bfs_fs_type = {
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 537893a16014..d0434406eaeb 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -38,15 +38,13 @@
 #include <linux/security.h>
 #include <linux/syscalls.h>
 #include <linux/random.h>
+#include <linux/elf.h>
 #include <asm/uaccess.h>
 #include <asm/param.h>
 #include <asm/page.h>
-#include <linux/elf.h>
+static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs);
+static int load_elf_library(struct file *);
-static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs);
-static int load_elf_library(struct file*);
 static unsigned long elf_map (struct file *, unsigned long, struct elf_phdr *, int, int);
 extern int dump_fpu (struct pt_regs *, elf_fpregset_t *);
@@ -59,15 +57,15 @@ extern int dump_fpu (struct pt_regs *, elf_fpregset_t *);
 * don't even try.
 */
 #if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE)
-static int elf_core_dump(long signr, struct pt_regs * regs, struct file * file);
+static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file);
 #else
 #define elf_core_dump   NULL
 #endif
 #if ELF_EXEC_PAGESIZE > PAGE_SIZE
-# define ELF_MIN_ALIGN  ELF_EXEC_PAGESIZE
+#define ELF_MIN_ALIGN   ELF_EXEC_PAGESIZE
 #else
-# define ELF_MIN_ALIGN  PAGE_SIZE
+#define ELF_MIN_ALIGN   PAGE_SIZE
 #endif
 #ifndef ELF_CORE_EFLAGS
@@ -86,7 +84,7 @@ static struct linux_binfmt elf_format = {
                .min_coredump   = ELF_EXEC_PAGESIZE
 };
-#define BAD_ADDR(x)     ((unsigned long)(x) > TASK_SIZE)
+#define BAD_ADDR(x) ((unsigned long)(x) > TASK_SIZE)
 static int set_brk(unsigned long start, unsigned long end)
 {
@@ -104,13 +102,11 @@ static int set_brk(unsigned long start, unsigned long end)
        return 0;
 }
 /* We need to explicitly zero any fractional pages
   after the data section (i.e. bss).  This would
   contain the junk from the file that should not
-   be in memory */
+   be in memory
+ */
 static int padzero(unsigned long elf_bss)
 {
        unsigned long nbyte;
@@ -129,7 +125,9 @@ static int padzero(unsigned long elf_bss)
 #define STACK_ADD(sp, items) ((elf_addr_t __user *)(sp) + (items))
 #define STACK_ROUND(sp, items) \
        ((15 + (unsigned long) ((sp) + (items))) &~ 15UL)
-#define STACK_ALLOC(sp, len) ({ elf_addr_t __user *old_sp = (elf_addr_t __user *)sp; sp += len; old_sp; })
+#define STACK_ALLOC(sp, len) ({ \
+        elf_addr_t __user *old_sp = (elf_addr_t __user *)sp; sp += len; \
+        old_sp; })
 #else
 #define STACK_ADD(sp, items) ((elf_addr_t __user *)(sp) - (items))
 #define STACK_ROUND(sp, items) \
@@ -138,7 +136,7 @@ static int padzero(unsigned long elf_bss)
 #endif
 static int
-create_elf_tables(struct linux_binprm *bprm, struct elfhdr * exec,
+create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
                int interp_aout, unsigned long load_addr,
                unsigned long interp_load_addr)
 {
@@ -161,7 +159,6 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr * exec,
         * for userspace to get any other way, in others (i386) it is
         * merely difficult.
         */
        u_platform = NULL;
        if (k_platform) {
                size_t len = strlen(k_platform) + 1;
@@ -171,7 +168,7 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr * exec,
                 * evictions by the processes running on the same package. One
                 * thing we can do is to shuffle the initial stack for them.
                 */
-         
                p = arch_align_stack(p);
                u_platform = (elf_addr_t __user *)STACK_ALLOC(p, len);
@@ -180,9 +177,12 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr * exec,
        }
        /* Create the ELF interpreter info */
-        elf_info = (elf_addr_t *) current->mm->saved_auxv;
+        elf_info = (elf_addr_t *)current->mm->saved_auxv;
 #define NEW_AUX_ENT(id, val) \
-        do { elf_info[ei_index++] = id; elf_info[ei_index++] = val; } while (0)
+        do { \
+                elf_info[ei_index++] = id; \
+                elf_info[ei_index++] = val; \
+        } while (0)
 #ifdef ARCH_DLINFO
        /* 
@@ -195,21 +195,22 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr * exec,
        NEW_AUX_ENT(AT_PAGESZ, ELF_EXEC_PAGESIZE);
        NEW_AUX_ENT(AT_CLKTCK, CLOCKS_PER_SEC);
        NEW_AUX_ENT(AT_PHDR, load_addr + exec->e_phoff);
-        NEW_AUX_ENT(AT_PHENT, sizeof (struct elf_phdr));
+        NEW_AUX_ENT(AT_PHENT, sizeof(struct elf_phdr));
        NEW_AUX_ENT(AT_PHNUM, exec->e_phnum);
        NEW_AUX_ENT(AT_BASE, interp_load_addr);
        NEW_AUX_ENT(AT_FLAGS, 0);
        NEW_AUX_ENT(AT_ENTRY, exec->e_entry);
-        NEW_AUX_ENT(AT_UID, (elf_addr_t) tsk->uid);
+        NEW_AUX_ENT(AT_UID, tsk->uid);
-        NEW_AUX_ENT(AT_EUID, (elf_addr_t) tsk->euid);
+        NEW_AUX_ENT(AT_EUID, tsk->euid);
-        NEW_AUX_ENT(AT_GID, (elf_addr_t) tsk->gid);
+        NEW_AUX_ENT(AT_GID, tsk->gid);
-        NEW_AUX_ENT(AT_EGID, (elf_addr_t) tsk->egid);
+        NEW_AUX_ENT(AT_EGID, tsk->egid);
-        NEW_AUX_ENT(AT_SECURE, (elf_addr_t) security_bprm_secureexec(bprm));
+        NEW_AUX_ENT(AT_SECURE, security_bprm_secureexec(bprm));
        if (k_platform) {
-                NEW_AUX_ENT(AT_PLATFORM, (elf_addr_t)(unsigned long)u_platform);
+                NEW_AUX_ENT(AT_PLATFORM,
+                            (elf_addr_t)(unsigned long)u_platform);
        }
        if (bprm->interp_flags & BINPRM_FLAGS_EXECFD) {
-                NEW_AUX_ENT(AT_EXECFD, (elf_addr_t) bprm->interp_data);
+                NEW_AUX_ENT(AT_EXECFD, bprm->interp_data);
        }
 #undef NEW_AUX_ENT
        /* AT_NULL is zero; clear the rest too */
@@ -232,7 +233,7 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr * exec,
        /* Point sp at the lowest address on the stack */
 #ifdef CONFIG_STACK_GROWSUP
        sp = (elf_addr_t __user *)bprm->p - items - ei_index;
-        bprm->exec = (unsigned long) sp; /* XXX: PARISC HACK */
+        bprm->exec = (unsigned long)sp; /* XXX: PARISC HACK */
 #else
        sp = (elf_addr_t __user *)bprm->p;
 #endif
@@ -285,7 +286,7 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr * exec,
 #ifndef elf_map
 static unsigned long elf_map(struct file *filep, unsigned long addr,
-                        struct elf_phdr *eppnt, int prot, int type)
+                struct elf_phdr *eppnt, int prot, int type)
 {
        unsigned long map_addr;
        unsigned long pageoffset = ELF_PAGEOFFSET(eppnt->p_vaddr);
@@ -310,9 +311,8 @@ static unsigned long elf_map(struct file *filep, unsigned long addr,
   is only provided so that we can read a.out libraries that have
   an ELF header */
-static unsigned long load_elf_interp(struct elfhdr * interp_elf_ex,
+static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
-                                     struct file * interpreter,
+                struct file *interpreter, unsigned long *interp_load_addr)
-                                     unsigned long *interp_load_addr)
 {
        struct elf_phdr *elf_phdata;
        struct elf_phdr *eppnt;
@@ -342,15 +342,15 @@ static unsigned long load_elf_interp(struct elfhdr * interp_elf_ex,
                goto out;
        /* Now read in all of the header information */
        size = sizeof(struct elf_phdr) * interp_elf_ex->e_phnum;
        if (size > ELF_MIN_ALIGN)
                goto out;
-        elf_phdata = (struct elf_phdr *) kmalloc(size, GFP_KERNEL);
+        elf_phdata = kmalloc(size, GFP_KERNEL);
        if (!elf_phdata)
                goto out;
-        retval = kernel_read(interpreter,interp_elf_ex->e_phoff,(char *)elf_phdata,size);
+        retval = kernel_read(interpreter, interp_elf_ex->e_phoff,
+                             (char *)elf_phdata,size);
        error = -EIO;
        if (retval != size) {
                if (retval < 0)
@@ -359,58 +359,65 @@ static unsigned long load_elf_interp(struct elfhdr * interp_elf_ex,
        }
        eppnt = elf_phdata;
-        for (i=0; i<interp_elf_ex->e_phnum; i++, eppnt++) {
+        for (i = 0; i < interp_elf_ex->e_phnum; i++, eppnt++) {
-          if (eppnt->p_type == PT_LOAD) {
+                if (eppnt->p_type == PT_LOAD) {
-            int elf_type = MAP_PRIVATE | MAP_DENYWRITE;
+                        int elf_type = MAP_PRIVATE | MAP_DENYWRITE;
-            int elf_prot = 0;
+                        int elf_prot = 0;
-            unsigned long vaddr = 0;
+                        unsigned long vaddr = 0;
-            unsigned long k, map_addr;
+                        unsigned long k, map_addr;
-            if (eppnt->p_flags & PF_R) elf_prot =  PROT_READ;
+                        if (eppnt->p_flags & PF_R)
-            if (eppnt->p_flags & PF_W) elf_prot |= PROT_WRITE;
+                                elf_prot = PROT_READ;
-            if (eppnt->p_flags & PF_X) elf_prot |= PROT_EXEC;
+                        if (eppnt->p_flags & PF_W)
-            vaddr = eppnt->p_vaddr;
+                                elf_prot |= PROT_WRITE;
-            if (interp_elf_ex->e_type == ET_EXEC || load_addr_set)
+                        if (eppnt->p_flags & PF_X)
-                elf_type |= MAP_FIXED;
+                                elf_prot |= PROT_EXEC;
+                        vaddr = eppnt->p_vaddr;
-            map_addr = elf_map(interpreter, load_addr + vaddr, eppnt, elf_prot, elf_type);
+                        if (interp_elf_ex->e_type == ET_EXEC || load_addr_set)
-            error = map_addr;
+                                elf_type |= MAP_FIXED;
-            if (BAD_ADDR(map_addr))
-                goto out_close;
+                        map_addr = elf_map(interpreter, load_addr + vaddr,
+                                           eppnt, elf_prot, elf_type);
-            if (!load_addr_set && interp_elf_ex->e_type == ET_DYN) {
+                        error = map_addr;
-                load_addr = map_addr - ELF_PAGESTART(vaddr);
+                        if (BAD_ADDR(map_addr))
-                load_addr_set = 1;
+                                goto out_close;
-            }
+                        if (!load_addr_set &&
-            /*
+                            interp_elf_ex->e_type == ET_DYN) {
-             * Check to see if the section's size will overflow the
+                                load_addr = map_addr - ELF_PAGESTART(vaddr);
-             * allowed task size. Note that p_filesz must always be
+                                load_addr_set = 1;
-             * <= p_memsize so it is only necessary to check p_memsz.
+                        }
-             */
-            k = load_addr + eppnt->p_vaddr;
+                        /*
-            if (k > TASK_SIZE || eppnt->p_filesz > eppnt->p_memsz ||
+                         * Check to see if the section's size will overflow the
-                eppnt->p_memsz > TASK_SIZE || TASK_SIZE - eppnt->p_memsz < k) {
+                         * allowed task size. Note that p_filesz must always be
-                error = -ENOMEM;
+                         * <= p_memsz so it's only necessary to check p_memsz.
-                goto out_close;
+                         */
-            }
+                        k = load_addr + eppnt->p_vaddr;
+                        if (k > TASK_SIZE ||
-            /*
+                            eppnt->p_filesz > eppnt->p_memsz ||
-             * Find the end of the file mapping for this phdr, and keep
+                            eppnt->p_memsz > TASK_SIZE ||
-             * track of the largest address we see for this.
+                            TASK_SIZE - eppnt->p_memsz < k) {
-             */
+                                error = -ENOMEM;
-            k = load_addr + eppnt->p_vaddr + eppnt->p_filesz;
+                                goto out_close;
-            if (k > elf_bss)
+                        }
-                elf_bss = k;
+                        /*
-            /*
+                         * Find the end of the file mapping for this phdr, and
-             * Do the same thing for the memory mapping - between
+                         * keep track of the largest address we see for this.
-             * elf_bss and last_bss is the bss section.
+                         */
-             */
+                        k = load_addr + eppnt->p_vaddr + eppnt->p_filesz;
-            k = load_addr + eppnt->p_memsz + eppnt->p_vaddr;
+                        if (k > elf_bss)
-            if (k > last_bss)
+                                elf_bss = k;
-                last_bss = k;
-          }
+                        /*
+                         * Do the same thing for the memory mapping - between
+                         * elf_bss and last_bss is the bss section.
+                         */
+                        k = load_addr + eppnt->p_memsz + eppnt->p_vaddr;
+                        if (k > last_bss)
+                                last_bss = k;
+                }
        }
        /*
@@ -424,7 +431,8 @@ static unsigned long load_elf_interp(struct elfhdr * interp_elf_ex,
                goto out_close;
        }
-        elf_bss = ELF_PAGESTART(elf_bss + ELF_MIN_ALIGN - 1);   /* What we have mapped so far */
+        /* What we have mapped so far */
+        elf_bss = ELF_PAGESTART(elf_bss + ELF_MIN_ALIGN - 1);
        /* Map the last of the bss segment */
        if (last_bss > elf_bss) {
@@ -436,7 +444,7 @@ static unsigned long load_elf_interp(struct elfhdr * interp_elf_ex,
        }
        *interp_load_addr = load_addr;
-        error = ((unsigned long) interp_elf_ex->e_entry) + load_addr;
+        error = ((unsigned long)interp_elf_ex->e_entry) + load_addr;
 out_close:
        kfree(elf_phdata);
@@ -444,8 +452,8 @@ out:
        return error;
 }
-static unsigned long load_aout_interp(struct exec * interp_ex,
+static unsigned long load_aout_interp(struct exec *interp_ex,
-                             struct file * interpreter)
+                struct file *interpreter)
 {
        unsigned long text_data, elf_entry = ~0UL;
        char __user * addr;
@@ -464,7 +472,7 @@ static unsigned long load_aout_interp(struct exec * interp_ex,
        case ZMAGIC:
        case QMAGIC:
                offset = N_TXTOFF(*interp_ex);
-                addr = (char __user *) N_TXTADDR(*interp_ex);
+                addr = (char __user *)N_TXTADDR(*interp_ex);
                break;
        default:
                goto out;
@@ -480,7 +488,6 @@ static unsigned long load_aout_interp(struct exec * interp_ex,
        flush_icache_range((unsigned long)addr,
                           (unsigned long)addr + text_data);
        down_write(&current->mm->mmap_sem);     
        do_brk(ELF_PAGESTART(text_data + ELF_MIN_ALIGN - 1),
                interp_ex->a_bss);
@@ -519,7 +526,7 @@ static unsigned long randomize_stack_top(unsigned long stack_top)
 #endif
 }
-static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs)
+static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 {
        struct file *interpreter = NULL; /* to shut gcc up */
        unsigned long load_addr = 0, load_bias = 0;
@@ -528,7 +535,7 @@ static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs)
        unsigned int interpreter_type = INTERPRETER_NONE;
        unsigned char ibcs2_interpreter = 0;
        unsigned long error;
-        struct elf_phdr * elf_ppnt, *elf_phdata;
+        struct elf_phdr *elf_ppnt, *elf_phdata;
        unsigned long elf_bss, elf_brk;
        int elf_exec_fileno;
        int retval, i;
@@ -553,7 +560,7 @@ static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs)
        }
        
        /* Get the exec-header */
-        loc->elf_ex = *((struct elfhdr *) bprm->buf);
+        loc->elf_ex = *((struct elfhdr *)bprm->buf);
        retval = -ENOEXEC;
        /* First of all, some simple consistency checks */
@@ -568,7 +575,6 @@ static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs)
                goto out;
        /* Now read in all of the header information */
        if (loc->elf_ex.e_phentsize != sizeof(struct elf_phdr))
                goto out;
        if (loc->elf_ex.e_phnum < 1 ||
@@ -576,18 +582,19 @@ static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs)
                goto out;
        size = loc->elf_ex.e_phnum * sizeof(struct elf_phdr);
        retval = -ENOMEM;
-        elf_phdata = (struct elf_phdr *) kmalloc(size, GFP_KERNEL);
+        elf_phdata = kmalloc(size, GFP_KERNEL);
        if (!elf_phdata)
                goto out;
-        retval = kernel_read(bprm->file, loc->elf_ex.e_phoff, (char *) elf_phdata, size);
+        retval = kernel_read(bprm->file, loc->elf_ex.e_phoff,
+                             (char *)elf_phdata, size);
        if (retval != size) {
                if (retval >= 0)
                        retval = -EIO;
                goto out_free_ph;
        }
-        files = current->files;         /* Refcounted so ok */
+        files = current->files; /* Refcounted so ok */
        retval = unshare_files();
        if (retval < 0)
                goto out_free_ph;
@@ -598,7 +605,6 @@ static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs)
        /* exec will make our files private anyway, but for the a.out
           loader stuff we need to do it earlier */
        retval = get_unused_fd();
        if (retval < 0)
                goto out_free_fh;
@@ -620,7 +626,6 @@ static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs)
                         * shared libraries - for now assume that this
                         * is an a.out format binary
                         */
                        retval = -ENOEXEC;
                        if (elf_ppnt->p_filesz > PATH_MAX || 
                            elf_ppnt->p_filesz < 2)
@@ -628,13 +633,13 @@ static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs)
                        retval = -ENOMEM;
                        elf_interpreter = kmalloc(elf_ppnt->p_filesz,
-                                                           GFP_KERNEL);
+                                                  GFP_KERNEL);
                        if (!elf_interpreter)
                                goto out_free_file;
                        retval = kernel_read(bprm->file, elf_ppnt->p_offset,
-                                           elf_interpreter,
+                                             elf_interpreter,
-                                           elf_ppnt->p_filesz);
+                                             elf_ppnt->p_filesz);
                        if (retval != elf_ppnt->p_filesz) {
                                if (retval >= 0)
                                        retval = -EIO;
@@ -678,7 +683,8 @@ static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs)
                        retval = PTR_ERR(interpreter);
                        if (IS_ERR(interpreter))
                                goto out_free_interp;
-                        retval = kernel_read(interpreter, 0, bprm->buf, BINPRM_BUF_SIZE);
+                        retval = kernel_read(interpreter, 0, bprm->buf,
+                                             BINPRM_BUF_SIZE);
                        if (retval != BINPRM_BUF_SIZE) {
                                if (retval >= 0)
                                        retval = -EIO;
@@ -686,8 +692,8 @@ static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs)
                        }
                        /* Get the exec headers */
-                        loc->interp_ex = *((struct exec *) bprm->buf);
+                        loc->interp_ex = *((struct exec *)bprm->buf);
-                        loc->interp_elf_ex = *((struct elfhdr *) bprm->buf);
+                        loc->interp_elf_ex = *((struct elfhdr *)bprm->buf);
                        break;
                }
                elf_ppnt++;
@@ -739,7 +745,6 @@ static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs)
        /* OK, we are done with that, now set up the arg stuff,
           and then start this sucker up */
        if ((!bprm->sh_bang) && (interpreter_type == INTERPRETER_AOUT)) {
                char *passed_p = passed_fileno;
                sprintf(passed_fileno, "%d", elf_exec_fileno);
@@ -759,7 +764,6 @@ static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs)
        /* Discard our unneeded old files struct */
        if (files) {
-                steal_locks(files);
                put_files_struct(files);
                files = NULL;
        }
@@ -778,7 +782,7 @@ static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs)
        if (elf_read_implies_exec(loc->elf_ex, executable_stack))
                current->personality |= READ_IMPLIES_EXEC;
-        if ( !(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
+        if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
                current->flags |= PF_RANDOMIZE;
        arch_pick_mmap_layout(current->mm);
@@ -799,8 +803,8 @@ static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs)
           the correct location in memory.  At this point, we assume that
           the image should be loaded at fixed address, not at a variable
           address. */
+        for(i = 0, elf_ppnt = elf_phdata;
-        for(i = 0, elf_ppnt = elf_phdata; i < loc->elf_ex.e_phnum; i++, elf_ppnt++) {
+            i < loc->elf_ex.e_phnum; i++, elf_ppnt++) {
                int elf_prot = 0, elf_flags;
                unsigned long k, vaddr;
@@ -828,30 +832,35 @@ static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs)
                                                        load_bias, nbyte)) {
                                        /*
                                         * This bss-zeroing can fail if the ELF
-                                         * file specifies odd protections.  So
+                                         * file specifies odd protections. So
                                         * we don't check the return value
                                         */
                                }
                        }
                }
-                if (elf_ppnt->p_flags & PF_R) elf_prot |= PROT_READ;
+                if (elf_ppnt->p_flags & PF_R)
-                if (elf_ppnt->p_flags & PF_W) elf_prot |= PROT_WRITE;
+                        elf_prot |= PROT_READ;
-                if (elf_ppnt->p_flags & PF_X) elf_prot |= PROT_EXEC;
+                if (elf_ppnt->p_flags & PF_W)
+                        elf_prot |= PROT_WRITE;
+                if (elf_ppnt->p_flags & PF_X)
+                        elf_prot |= PROT_EXEC;
-                elf_flags = MAP_PRIVATE|MAP_DENYWRITE|MAP_EXECUTABLE;
+                elf_flags = MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE;
                vaddr = elf_ppnt->p_vaddr;
                if (loc->elf_ex.e_type == ET_EXEC || load_addr_set) {
                        elf_flags |= MAP_FIXED;
                } else if (loc->elf_ex.e_type == ET_DYN) {
-                        /* Try and get dynamic programs out of the way of the default mmap
+                        /* Try and get dynamic programs out of the way of the
-                           base, as well as whatever program they might try to exec.  This
+                         * default mmap base, as well as whatever program they
-                           is because the brk will follow the loader, and is not movable.  */
+                         * might try to exec.  This is because the brk will
+                         * follow the loader, and is not movable.  */
                        load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr);
                }
-                error = elf_map(bprm->file, load_bias + vaddr, elf_ppnt, elf_prot, elf_flags);
+                error = elf_map(bprm->file, load_bias + vaddr, elf_ppnt,
+                                elf_prot, elf_flags);
                if (BAD_ADDR(error)) {
                        send_sig(SIGKILL, current, 0);
                        goto out_free_dentry;
@@ -868,8 +877,10 @@ static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs)
                        }
                }
                k = elf_ppnt->p_vaddr;
-                if (k < start_code) start_code = k;
+                if (k < start_code)
-                if (start_data < k) start_data = k;
+                        start_code = k;
+                if (start_data < k)
+                        start_data = k;
                /*
                 * Check to see if the section's size will overflow the
@@ -879,7 +890,7 @@ static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs)
                if (k > TASK_SIZE || elf_ppnt->p_filesz > elf_ppnt->p_memsz ||
                    elf_ppnt->p_memsz > TASK_SIZE ||
                    TASK_SIZE - elf_ppnt->p_memsz < k) {
-                        /* set_brk can never work.  Avoid overflows.  */
+                        /* set_brk can never work. Avoid overflows. */
                        send_sig(SIGKILL, current, 0);
                        goto out_free_dentry;
                }
@@ -967,8 +978,9 @@ static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs)
        compute_creds(bprm);
        current->flags &= ~PF_FORKNOEXEC;
-        create_elf_tables(bprm, &loc->elf_ex, (interpreter_type == INTERPRETER_AOUT),
+        create_elf_tables(bprm, &loc->elf_ex,
-                        load_addr, interp_load_addr);
+                          (interpreter_type == INTERPRETER_AOUT),
+                          load_addr, interp_load_addr);
        /* N.B. passed_fileno might not be initialized? */
        if (interpreter_type == INTERPRETER_AOUT)
                current->mm->arg_start += strlen(passed_fileno) + 1;
@@ -982,7 +994,7 @@ static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs)
                /* Why this, you ask???  Well SVr4 maps page 0 as read-only,
                   and some applications "depend" upon this behavior.
                   Since we do not have the power to recompile these, we
-                   emulate the SVr4 behavior.  Sigh.  */
+                   emulate the SVr4 behavior. Sigh. */
                down_write(&current->mm->mmap_sem);
                error = do_mmap(NULL, 0, PAGE_SIZE, PROT_READ | PROT_EXEC,
                                MAP_FIXED | MAP_PRIVATE, 0);
@@ -1037,7 +1049,6 @@ out_free_ph:
 /* This is really simpleminded and specialized - we are loading an
   a.out library that is given an ELF header. */
 static int load_elf_library(struct file *file)
 {
        struct elf_phdr *elf_phdata;
@@ -1047,7 +1058,7 @@ static int load_elf_library(struct file *file)
        struct elfhdr elf_ex;
        error = -ENOEXEC;
-        retval = kernel_read(file, 0, (char *) &elf_ex, sizeof(elf_ex));
+        retval = kernel_read(file, 0, (char *)&elf_ex, sizeof(elf_ex));
        if (retval != sizeof(elf_ex))
                goto out;
@@ -1056,7 +1067,7 @@ static int load_elf_library(struct file *file)
        /* First of all, some simple consistency checks */
        if (elf_ex.e_type != ET_EXEC || elf_ex.e_phnum > 2 ||
-           !elf_check_arch(&elf_ex) || !file->f_op || !file->f_op->mmap)
+            !elf_check_arch(&elf_ex) || !file->f_op || !file->f_op->mmap)
                goto out;
        /* Now read in all of the header information */
@@ -1104,7 +1115,8 @@ static int load_elf_library(struct file *file)
                goto out_free_ph;
        }
-        len = ELF_PAGESTART(eppnt->p_filesz + eppnt->p_vaddr + ELF_MIN_ALIGN - 1);
+        len = ELF_PAGESTART(eppnt->p_filesz + eppnt->p_vaddr +
+                            ELF_MIN_ALIGN - 1);
        bss = eppnt->p_memsz + eppnt->p_vaddr;
        if (bss > len) {
                down_write(&current->mm->mmap_sem);
@@ -1163,7 +1175,7 @@ static int maydump(struct vm_area_struct *vma)
        if (vma->vm_flags & (VM_IO | VM_RESERVED))
                return 0;
-        /* Dump shared memory only if mapped from an anonymous file.  */
+        /* Dump shared memory only if mapped from an anonymous file. */
        if (vma->vm_flags & VM_SHARED)
                return vma->vm_file->f_dentry->d_inode->i_nlink == 0;
@@ -1174,7 +1186,7 @@ static int maydump(struct vm_area_struct *vma)
        return 1;
 }
-#define roundup(x, y)  ((((x)+((y)-1))/(y))*(y))
+#define roundup(x, y) ((((x) + ((y) - 1)) / (y)) * (y))
 /* An ELF note in memory */
 struct memelfnote
@@ -1277,11 +1289,11 @@ static void fill_note(struct memelfnote *note, const char *name, int type,
 }
 /*
- * fill up all the fields in prstatus from the given task struct, except registers
+ * fill up all the fields in prstatus from the given task struct, except
- * which need to be filled up separately.
+ * registers which need to be filled up separately.
 */
 static void fill_prstatus(struct elf_prstatus *prstatus,
-                        struct task_struct *p, long signr) 
+                struct task_struct *p, long signr)
 {
        prstatus->pr_info.si_signo = prstatus->pr_cursig = signr;
        prstatus->pr_sigpend = p->pending.signal.sig[0];
@@ -1366,8 +1378,8 @@ struct elf_thread_status
 /*
 * In order to add the specific thread information for the elf file format,
- * we need to keep a linked list of every threads pr_status and then
+ * we need to keep a linked list of every thread's pr_status and then create
- * create a single section for them in the final core file.
+ * a single section for them in the final core file.
 */
 static int elf_dump_thread_status(long signr, struct elf_thread_status *t)
 {
@@ -1378,19 +1390,23 @@ static int elf_dump_thread_status(long signr, struct elf_thread_status *t)
        fill_prstatus(&t->prstatus, p, signr);
        elf_core_copy_task_regs(p, &t->prstatus.pr_reg);        
        
-        fill_note(&t->notes[0], "CORE", NT_PRSTATUS, sizeof(t->prstatus), &(t->prstatus));
+        fill_note(&t->notes[0], "CORE", NT_PRSTATUS, sizeof(t->prstatus),
+                  &(t->prstatus));
        t->num_notes++;
        sz += notesize(&t->notes[0]);
-        if ((t->prstatus.pr_fpvalid = elf_core_copy_task_fpregs(p, NULL, &t->fpu))) {
+        if ((t->prstatus.pr_fpvalid = elf_core_copy_task_fpregs(p, NULL,
-                fill_note(&t->notes[1], "CORE", NT_PRFPREG, sizeof(t->fpu), &(t->fpu));
+                                                                &t->fpu))) {
+                fill_note(&t->notes[1], "CORE", NT_PRFPREG, sizeof(t->fpu),
+                          &(t->fpu));
                t->num_notes++;
                sz += notesize(&t->notes[1]);
        }
 #ifdef ELF_CORE_COPY_XFPREGS
        if (elf_core_copy_task_xfpregs(p, &t->xfpu)) {
-                fill_note(&t->notes[2], "LINUX", NT_PRXFPREG, sizeof(t->xfpu), &t->xfpu);
+                fill_note(&t->notes[2], "LINUX", NT_PRXFPREG, sizeof(t->xfpu),
+                          &t->xfpu);
                t->num_notes++;
                sz += notesize(&t->notes[2]);
        }
@@ -1405,7 +1421,7 @@ static int elf_dump_thread_status(long signr, struct elf_thread_status *t)
 * and then they are actually written out.  If we run out of core limit
 * we just truncate.
 */
-static int elf_core_dump(long signr, struct pt_regs * regs, struct file * file)
+static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file)
 {
 #define NUM_NOTES       6
        int has_dumped = 0;
@@ -1434,12 +1450,12 @@ static int elf_core_dump(long signr, struct pt_regs * regs, struct file * file)
        /*
         * We no longer stop all VM operations.
         * 
-         * This is because those proceses that could possibly change map_count or
+         * This is because those processes that could possibly change map_count
-         * the mmap / vma pages are now blocked in do_exit on current finishing
+         * or the mmap / vma pages are now blocked in do_exit on current
-         * this core dump.
+         * finishing this core dump.
         *
         * Only ptrace can touch these memory addresses, but it doesn't change
-         * the map_count or the pages allocated.  So no possibility of crashing
+         * the map_count or the pages allocated. So no possibility of crashing
         * exists while dumping the mm->vm_next areas to the core file.
         */
  
@@ -1501,7 +1517,7 @@ static int elf_core_dump(long signr, struct pt_regs * regs, struct file * file)
 #endif
        /* Set up header */
-        fill_elf_header(elf, segs+1);   /* including notes section */
+        fill_elf_header(elf, segs + 1); /* including notes section */
        has_dumped = 1;
        current->flags |= PF_DUMPCORE;
@@ -1511,24 +1527,24 @@ static int elf_core_dump(long signr, struct pt_regs * regs, struct file * file)
         * with info from their /proc.
         */
-        fill_note(notes +0, "CORE", NT_PRSTATUS, sizeof(*prstatus), prstatus);
+        fill_note(notes + 0, "CORE", NT_PRSTATUS, sizeof(*prstatus), prstatus);
-        
        fill_psinfo(psinfo, current->group_leader, current->mm);
-        fill_note(notes +1, "CORE", NT_PRPSINFO, sizeof(*psinfo), psinfo);
+        fill_note(notes + 1, "CORE", NT_PRPSINFO, sizeof(*psinfo), psinfo);
        
        numnote = 2;
-        auxv = (elf_addr_t *) current->mm->saved_auxv;
+        auxv = (elf_addr_t *)current->mm->saved_auxv;
        i = 0;
        do
                i += 2;
        while (auxv[i - 2] != AT_NULL);
        fill_note(&notes[numnote++], "CORE", NT_AUXV,
-                  i * sizeof (elf_addr_t), auxv);
+                  i * sizeof(elf_addr_t), auxv);
        /* Try to dump the FPU. */
-        if ((prstatus->pr_fpvalid = elf_core_copy_task_fpregs(current, regs, fpu)))
+        if ((prstatus->pr_fpvalid =
+             elf_core_copy_task_fpregs(current, regs, fpu)))
                fill_note(notes + numnote++,
                          "CORE", NT_PRFPREG, sizeof(*fpu), fpu);
 #ifdef ELF_CORE_COPY_XFPREGS
@@ -1577,8 +1593,10 @@ static int elf_core_dump(long signr, struct pt_regs * regs, struct file * file)
                phdr.p_memsz = sz;
                offset += phdr.p_filesz;
                phdr.p_flags = vma->vm_flags & VM_READ ? PF_R : 0;
-                if (vma->vm_flags & VM_WRITE) phdr.p_flags |= PF_W;
+                if (vma->vm_flags & VM_WRITE)
-                if (vma->vm_flags & VM_EXEC) phdr.p_flags |= PF_X;
+                        phdr.p_flags |= PF_W;
+                if (vma->vm_flags & VM_EXEC)
+                        phdr.p_flags |= PF_X;
                phdr.p_align = ELF_EXEC_PAGESIZE;
                DUMP_WRITE(&phdr, sizeof(phdr));
@@ -1595,7 +1613,9 @@ static int elf_core_dump(long signr, struct pt_regs * regs, struct file * file)
        /* write out the thread status notes section */
        list_for_each(t, &thread_list) {
-                struct elf_thread_status *tmp = list_entry(t, struct elf_thread_status, list);
+                struct elf_thread_status *tmp =
+                                list_entry(t, struct elf_thread_status, list);
                for (i = 0; i < tmp->num_notes; i++)
                        if (!writenote(&tmp->notes[i], file))
                                goto end_coredump;
@@ -1612,18 +1632,19 @@ static int elf_core_dump(long signr, struct pt_regs * regs, struct file * file)
                for (addr = vma->vm_start;
                     addr < vma->vm_end;
                     addr += PAGE_SIZE) {
-                        struct page* page;
+                        struct page *page;
                        struct vm_area_struct *vma;
                        if (get_user_pages(current, current->mm, addr, 1, 0, 1,
                                                &page, &vma) <= 0) {
-                                DUMP_SEEK (file->f_pos + PAGE_SIZE);
+                                DUMP_SEEK(file->f_pos + PAGE_SIZE);
                        } else {
                                if (page == ZERO_PAGE(addr)) {
-                                        DUMP_SEEK (file->f_pos + PAGE_SIZE);
+                                        DUMP_SEEK(file->f_pos + PAGE_SIZE);
                                } else {
                                        void *kaddr;
-                                        flush_cache_page(vma, addr, page_to_pfn(page));
+                                        flush_cache_page(vma, addr,
+                                                         page_to_pfn(page));
                                        kaddr = kmap(page);
                                        if ((size += PAGE_SIZE) > limit ||
                                            !dump_write(file, kaddr,
@@ -1645,7 +1666,8 @@ static int elf_core_dump(long signr, struct pt_regs * regs, struct file * file)
        if ((off_t)file->f_pos != offset) {
                /* Sanity check */
-                printk(KERN_WARNING "elf_core_dump: file->f_pos (%ld) != offset (%ld)\n",
+                printk(KERN_WARNING
+                       "elf_core_dump: file->f_pos (%ld) != offset (%ld)\n",
                       (off_t)file->f_pos, offset);
        }
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index a2e48c999c24..eba4e23b9ca0 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -435,9 +435,10 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm,
                                   struct elf_fdpic_params *interp_params)
 {
        unsigned long sp, csp, nitems;
-        elf_caddr_t *argv, *envp;
+        elf_caddr_t __user *argv, *envp;
        size_t platform_len = 0, len;
-        char *k_platform, *u_platform, *p;
+        char *k_platform;
+        char __user *u_platform, *p;
        long hwcap;
        int loop;
@@ -462,12 +463,11 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm,
        if (k_platform) {
                platform_len = strlen(k_platform) + 1;
                sp -= platform_len;
+                u_platform = (char __user *) sp;
                if (__copy_to_user(u_platform, k_platform, platform_len) != 0)
                        return -EFAULT;
        }
-        u_platform = (char *) sp;
 #if defined(__i386__) && defined(CONFIG_SMP)
        /* in some cases (e.g. Hyper-Threading), we want to avoid L1 evictions
         * by the processes running on the same package. One thing we can do
@@ -490,7 +490,7 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm,
        sp = (sp - len) & ~7UL;
        exec_params->map_addr = sp;
-        if (copy_to_user((void *) sp, exec_params->loadmap, len) != 0)
+        if (copy_to_user((void __user *) sp, exec_params->loadmap, len) != 0)
                return -EFAULT;
        current->mm->context.exec_fdpic_loadmap = (unsigned long) sp;
@@ -501,7 +501,7 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm,
                sp = (sp - len) & ~7UL;
                interp_params->map_addr = sp;
-                if (copy_to_user((void *) sp, interp_params->loadmap, len) != 0)
+                if (copy_to_user((void __user *) sp, interp_params->loadmap, len) != 0)
                        return -EFAULT;
                current->mm->context.interp_fdpic_loadmap = (unsigned long) sp;
@@ -527,7 +527,7 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm,
        /* put the ELF interpreter info on the stack */
 #define NEW_AUX_ENT(nr, id, val)                                                \
        do {                                                                    \
-                struct { unsigned long _id, _val; } *ent = (void *) csp;        \
+                struct { unsigned long _id, _val; } __user *ent = (void __user *) csp;  \
                __put_user((id), &ent[nr]._id);                                 \
                __put_user((val), &ent[nr]._val);                               \
        } while (0)
@@ -564,13 +564,13 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm,
        /* allocate room for argv[] and envv[] */
        csp -= (bprm->envc + 1) * sizeof(elf_caddr_t);
-        envp = (elf_caddr_t *) csp;
+        envp = (elf_caddr_t __user *) csp;
        csp -= (bprm->argc + 1) * sizeof(elf_caddr_t);
-        argv = (elf_caddr_t *) csp;
+        argv = (elf_caddr_t __user *) csp;
        /* stack argc */
        csp -= sizeof(unsigned long);
-        __put_user(bprm->argc, (unsigned long *) csp);
+        __put_user(bprm->argc, (unsigned long __user *) csp);
        BUG_ON(csp != sp);
@@ -581,7 +581,7 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm,
        current->mm->arg_start = current->mm->start_stack - (MAX_ARG_PAGES * PAGE_SIZE - bprm->p);
 #endif
-        p = (char *) current->mm->arg_start;
+        p = (char __user *) current->mm->arg_start;
        for (loop = bprm->argc; loop > 0; loop--) {
                __put_user((elf_caddr_t) p, argv++);
                len = strnlen_user(p, PAGE_SIZE * MAX_ARG_PAGES);
@@ -1025,7 +1025,7 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
                /* clear the bit between beginning of mapping and beginning of PT_LOAD */
                if (prot & PROT_WRITE && disp > 0) {
                        kdebug("clear[%d] ad=%lx sz=%lx", loop, maddr, disp);
-                        clear_user((void *) maddr, disp);
+                        clear_user((void __user *) maddr, disp);
                        maddr += disp;
                }
@@ -1059,7 +1059,7 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
                if (prot & PROT_WRITE && excess1 > 0) {
                        kdebug("clear[%d] ad=%lx sz=%lx",
                               loop, maddr + phdr->p_filesz, excess1);
-                        clear_user((void *) maddr + phdr->p_filesz, excess1);
+                        clear_user((void __user *) maddr + phdr->p_filesz, excess1);
                }
 #else
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index b1c902e319c1..a62fd4018a20 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -16,7 +16,6 @@
 */
 #include <linux/module.h>
-#include <linux/config.h>
 #include <linux/kernel.h>
 #include <linux/sched.h>
 #include <linux/mm.h>
@@ -510,7 +509,7 @@ static int load_flat_file(struct linux_binprm * bprm,
                }
                /* OK, This is the point of no return */
-                set_personality(PER_LINUX);
+                set_personality(PER_LINUX_32BIT);
        }
        /*
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index d73d75591a39..34ebbc191e46 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -55,6 +55,7 @@ typedef struct {
 } Node;
 static DEFINE_RWLOCK(entries_lock);
+static struct file_system_type bm_fs_type;
 static struct vfsmount *bm_mnt;
 static int entry_count;
@@ -203,7 +204,6 @@ static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs)
                goto _error;
        if (files) {
-                steal_locks(files);
                put_files_struct(files);
                files = NULL;
        }
@@ -638,7 +638,7 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer,
        if (!inode)
                goto out2;
-        err = simple_pin_fs("binfmt_misc", &bm_mnt, &entry_count);
+        err = simple_pin_fs(&bm_fs_type, &bm_mnt, &entry_count);
        if (err) {
                iput(inode);
                inode = NULL;
@@ -740,10 +740,10 @@ static int bm_fill_super(struct super_block * sb, void * data, int silent)
        return err;
 }
-static struct super_block *bm_get_sb(struct file_system_type *fs_type,
+static int bm_get_sb(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data)
+        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-        return get_sb_single(fs_type, flags, data, bm_fill_super);
+        return get_sb_single(fs_type, flags, data, bm_fill_super, mnt);
 }
 static struct linux_binfmt misc_format = {
diff --git a/fs/binfmt_som.c b/fs/binfmt_som.c
index 00a91dc25d16..32b5d625ce9c 100644
--- a/fs/binfmt_som.c
+++ b/fs/binfmt_som.c
@@ -32,7 +32,6 @@
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
-#include <linux/config.h>
 #include <linux/elf.h>
diff --git a/fs/block_dev.c b/fs/block_dev.c
index f5958f413bd1..9633a490dab0 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -5,14 +5,12 @@
 *  Copyright (C) 2001  Andrea Arcangeli <andrea@suse.de> SuSE
 */
-#include <linux/config.h>
 #include <linux/init.h>
 #include <linux/mm.h>
 #include <linux/fcntl.h>
 #include <linux/slab.h>
 #include <linux/kmod.h>
 #include <linux/major.h>
-#include <linux/devfs_fs_kernel.h>
 #include <linux/smp_lock.h>
 #include <linux/highmem.h>
 #include <linux/blkdev.h>
@@ -300,10 +298,10 @@ static struct super_operations bdev_sops = {
        .clear_inode = bdev_clear_inode,
 };
-static struct super_block *bd_get_sb(struct file_system_type *fs_type,
+static int bd_get_sb(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data)
+        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-        return get_sb_pseudo(fs_type, "bdev:", &bdev_sops, 0x62646576);
+        return get_sb_pseudo(fs_type, "bdev:", &bdev_sops, 0x62646576, mnt);
 }
 static struct file_system_type bd_type = {
@@ -414,21 +412,31 @@ EXPORT_SYMBOL(bdput);
 static struct block_device *bd_acquire(struct inode *inode)
 {
        struct block_device *bdev;
        spin_lock(&bdev_lock);
        bdev = inode->i_bdev;
-        if (bdev && igrab(bdev->bd_inode)) {
+        if (bdev) {
+                atomic_inc(&bdev->bd_inode->i_count);
                spin_unlock(&bdev_lock);
                return bdev;
        }
        spin_unlock(&bdev_lock);
        bdev = bdget(inode->i_rdev);
        if (bdev) {
                spin_lock(&bdev_lock);
-                if (inode->i_bdev)
+                if (!inode->i_bdev) {
-                        __bd_forget(inode);
+                        /*
-                inode->i_bdev = bdev;
+                         * We take an additional bd_inode->i_count for inode,
-                inode->i_mapping = bdev->bd_inode->i_mapping;
+                         * and it's released in clear_inode() of inode.
-                list_add(&inode->i_devices, &bdev->bd_inodes);
+                         * So, we can access it via ->i_mapping always
+                         * without igrab().
+                         */
+                        atomic_inc(&bdev->bd_inode->i_count);
+                        inode->i_bdev = bdev;
+                        inode->i_mapping = bdev->bd_inode->i_mapping;
+                        list_add(&inode->i_devices, &bdev->bd_inodes);
+                }
                spin_unlock(&bdev_lock);
        }
        return bdev;
@@ -438,10 +446,18 @@ static struct block_device *bd_acquire(struct inode *inode)
 void bd_forget(struct inode *inode)
 {
+        struct block_device *bdev = NULL;
        spin_lock(&bdev_lock);
-        if (inode->i_bdev)
+        if (inode->i_bdev) {
+                if (inode->i_sb != blockdev_superblock)
+                        bdev = inode->i_bdev;
                __bd_forget(inode);
+        }
        spin_unlock(&bdev_lock);
+        if (bdev)
+                iput(bdev->bd_inode);
 }
 int bd_claim(struct block_device *bdev, void *holder)
@@ -1077,7 +1093,7 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
        return blkdev_ioctl(file->f_mapping->host, file, cmd, arg);
 }
-struct address_space_operations def_blk_aops = {
+const struct address_space_operations def_blk_aops = {
        .readpage       = blkdev_readpage,
        .writepage      = blkdev_writepage,
        .sync_page      = block_sync_page,
diff --git a/fs/buffer.c b/fs/buffer.c
index 23f1f3a68077..3660dcb97591 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -18,7 +18,6 @@
 * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
 */
-#include <linux/config.h>
 #include <linux/kernel.h>
 #include <linux/syscalls.h>
 #include <linux/fs.h>
@@ -331,7 +330,6 @@ long do_fsync(struct file *file, int datasync)
                goto out;
        }
-        current->flags |= PF_SYNCWRITE;
        ret = filemap_fdatawrite(mapping);
        /*
@@ -346,7 +344,6 @@ long do_fsync(struct file *file, int datasync)
        err = filemap_fdatawait(mapping);
        if (!ret)
                ret = err;
-        current->flags &= ~PF_SYNCWRITE;
 out:
        return ret;
 }
@@ -566,7 +563,7 @@ still_busy:
 * Completion handler for block_write_full_page() - pages which are unlocked
 * during I/O, and which have PageWriteback cleared upon I/O completion.
 */
-void end_buffer_async_write(struct buffer_head *bh, int uptodate)
+static void end_buffer_async_write(struct buffer_head *bh, int uptodate)
 {
        char b[BDEVNAME_SIZE];
        unsigned long flags;
@@ -854,7 +851,7 @@ int __set_page_dirty_buffers(struct page *page)
                write_lock_irq(&mapping->tree_lock);
                if (page->mapping) {    /* Race with truncate? */
                        if (mapping_cap_account_dirty(mapping))
-                                inc_page_state(nr_dirty);
+                                __inc_zone_page_state(page, NR_FILE_DIRTY);
                        radix_tree_tag_set(&mapping->page_tree,
                                                page_index(page),
                                                PAGECACHE_TAG_DIRTY);
@@ -2600,7 +2597,7 @@ int nobh_truncate_page(struct address_space *mapping, loff_t from)
        unsigned offset = from & (PAGE_CACHE_SIZE-1);
        unsigned to;
        struct page *page;
-        struct address_space_operations *a_ops = mapping->a_ops;
+        const struct address_space_operations *a_ops = mapping->a_ops;
        char *kaddr;
        int ret = 0;
@@ -3168,7 +3165,6 @@ EXPORT_SYMBOL(block_sync_page);
 EXPORT_SYMBOL(block_truncate_page);
 EXPORT_SYMBOL(block_write_full_page);
 EXPORT_SYMBOL(cont_prepare_write);
-EXPORT_SYMBOL(end_buffer_async_write);
 EXPORT_SYMBOL(end_buffer_read_sync);
 EXPORT_SYMBOL(end_buffer_write_sync);
 EXPORT_SYMBOL(file_fsync);
diff --git a/fs/char_dev.c b/fs/char_dev.c
index f3418f7a6e9d..a4cbc6706ef0 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -4,7 +4,6 @@
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */
-#include <linux/config.h>
 #include <linux/init.h>
 #include <linux/fs.h>
 #include <linux/slab.h>
@@ -14,7 +13,6 @@
 #include <linux/errno.h>
 #include <linux/module.h>
 #include <linux/smp_lock.h>
-#include <linux/devfs_fs_kernel.h>
 #include <linux/seq_file.h>
 #include <linux/kobject.h>
diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index 7271bb0257f6..a61d17ed1827 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -1,9 +1,24 @@
+Version 1.44
+------------
+Rewritten sessionsetup support, including support for legacy SMB
+session setup needed for OS/2 and older servers such as Windows 95 and 98.
+Fix oops on ls to OS/2 servers.  Add support for level 1 FindFirst
+so we can do search (ls etc.) to OS/2.  Do not send NTCreateX
+or recent levels of FindFirst unless server says it supports NT SMBs
+(instead use legacy equivalents from LANMAN dialect). Fix to allow
+NTLMv2 authentication support (now can use stronger password hashing
+on mount if corresponding /proc/fs/cifs/SecurityFlags is set (0x4004)).
+Allow override of global cifs security flags on mount via "sec=" option(s).
 Version 1.43
 ------------
 POSIX locking to servers which support CIFS POSIX Extensions
 (disabled by default controlled by proc/fs/cifs/Experimental).
 Handle conversion of long share names (especially Asian languages)
-to Unicode during mount. 
+to Unicode during mount. Fix memory leak in sess struct on reconnect.
+Fix rare oops after acpi suspend.  Fix O_TRUNC opens to overwrite on
+cifs open which helps rare case when setpathinfo fails or server does
+not support it. 
 Version 1.42
 ------------
diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile
index 58c77254a23b..a26f26ed5a17 100644
--- a/fs/cifs/Makefile
+++ b/fs/cifs/Makefile
@@ -3,4 +3,4 @@
 #
 obj-$(CONFIG_CIFS) += cifs.o
-cifs-objs := cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o inode.o link.o misc.o netmisc.o smbdes.o smbencrypt.o transport.o asn1.o md4.o md5.o cifs_unicode.o nterr.o xattr.o cifsencrypt.o fcntl.o readdir.o ioctl.o ntlmssp.o
+cifs-objs := cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o inode.o link.o misc.o netmisc.o smbdes.o smbencrypt.o transport.o asn1.o md4.o md5.o cifs_unicode.o nterr.o xattr.o cifsencrypt.o fcntl.o readdir.o ioctl.o sess.o
diff --git a/fs/cifs/README b/fs/cifs/README
index 0355003f4f0a..7986d0d97ace 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -443,7 +443,10 @@ A partial list of the supported mount options follows:
                SFU does).  In the future the bottom 9 bits of the mode
                mode also will be emulated using queries of the security
                descriptor (ACL).
-sec             Security mode.  Allowed values are:
+ sign           Must use packet signing (helps avoid unwanted data modification
+                by intermediate systems in the route).  Note that signing
+                does not work with lanman or plaintext authentication.
+ sec            Security mode.  Allowed values are:
                        none    attempt to connection as a null user (no name)
                        krb5    Use Kerberos version 5 authentication
                        krb5i   Use Kerberos authentication and packet signing
@@ -453,6 +456,8 @@ sec		Security mode.  Allowed values are:
                                server requires signing also can be the default) 
                        ntlmv2  Use NTLMv2 password hashing      
                        ntlmv2i Use NTLMv2 password hashing with packet signing
+                        lanman  (if configured in kernel config) use older
+                                lanman hash
 The mount.cifs mount helper also accepts a few mount options before -o
 including:
@@ -485,14 +490,34 @@ PacketSigningEnabled	If set to one, cifs packet signing is enabled
                        it.  If set to two, cifs packet signing is
                        required even if the server considers packet
                        signing optional. (default 1)
+SecurityFlags           Flags which control security negotiation and
+                        also packet signing. Authentication (may/must)
+                        flags (e.g. for NTLM and/or NTLMv2) may be combined with
+                        the signing flags.  Specifying two different password
+                        hashing mechanisms (as "must use") on the other hand 
+                        does not make much sense. Default flags are 
+                                0x07007 
+                        (NTLM, NTLMv2 and packet signing allowed).  Maximum 
+                        allowable flags if you want to allow mounts to servers
+                        using weaker password hashes is 0x37037 (lanman,
+                        plaintext, ntlm, ntlmv2, signing allowed):
+ 
+                        may use packet signing                          0x00001
+                        must use packet signing                         0x01001
+                        may use NTLM (most common password hash)        0x00002
+                        must use NTLM                                   0x02002
+                        may use NTLMv2                                  0x00004
+                        must use NTLMv2                                 0x04004
+                        may use Kerberos security (not implemented yet) 0x00008
+                        must use Kerberos (not implemented yet)         0x08008
+                        may use lanman (weak) password hash             0x00010
+                        must use lanman password hash                   0x10010
+                        may use plaintext passwords                     0x00020
+                        must use plaintext passwords                    0x20020
+                        (reserved for future packet encryption)         0x00040
 cifsFYI                 If set to one, additional debug information is
                        logged to the system error log. (default 0)
-ExtendedSecurity        If set to one, SPNEGO session establishment
-                        is allowed which enables more advanced 
-                        secure CIFS session establishment (default 0)
-NTLMV2Enabled           If set to one, more secure password hashes
-                        are used when the server supports them and
-                        when kerberos is not negotiated (default 0)
 traceSMB                If set to one, debug information is logged to the
                        system error log with the start of smb requests
                        and responses (default 0)
diff --git a/fs/cifs/asn1.c b/fs/cifs/asn1.c
index 086ae8f4a207..2e75883b7f54 100644
--- a/fs/cifs/asn1.c
+++ b/fs/cifs/asn1.c
@@ -17,7 +17,6 @@
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
 */
-#include <linux/config.h>
 #include <linux/module.h>
 #include <linux/types.h>
 #include <linux/kernel.h>
@@ -467,7 +466,7 @@ decode_negTokenInit(unsigned char *security_blob, int length,
        asn1_open(&ctx, security_blob, length);
        if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
-                cFYI(1, ("Error decoding negTokenInit header "));
+                cFYI(1, ("Error decoding negTokenInit header"));
                return 0;
        } else if ((cls != ASN1_APL) || (con != ASN1_CON)
                   || (tag != ASN1_EOC)) {
@@ -495,7 +494,7 @@ decode_negTokenInit(unsigned char *security_blob, int length,
                }
                if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
-                        cFYI(1, ("Error decoding negTokenInit "));
+                        cFYI(1, ("Error decoding negTokenInit"));
                        return 0;
                } else if ((cls != ASN1_CTX) || (con != ASN1_CON)
                           || (tag != ASN1_EOC)) {
@@ -505,7 +504,7 @@ decode_negTokenInit(unsigned char *security_blob, int length,
                }
                if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
-                        cFYI(1, ("Error decoding negTokenInit "));
+                        cFYI(1, ("Error decoding negTokenInit"));
                        return 0;
                } else if ((cls != ASN1_UNI) || (con != ASN1_CON)
                           || (tag != ASN1_SEQ)) {
@@ -515,7 +514,7 @@ decode_negTokenInit(unsigned char *security_blob, int length,
                }
                if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
-                        cFYI(1, ("Error decoding 2nd part of negTokenInit "));
+                        cFYI(1, ("Error decoding 2nd part of negTokenInit"));
                        return 0;
                } else if ((cls != ASN1_CTX) || (con != ASN1_CON)
                           || (tag != ASN1_EOC)) {
@@ -527,7 +526,7 @@ decode_negTokenInit(unsigned char *security_blob, int length,
                if (asn1_header_decode
                    (&ctx, &sequence_end, &cls, &con, &tag) == 0) {
-                        cFYI(1, ("Error decoding 2nd part of negTokenInit "));
+                        cFYI(1, ("Error decoding 2nd part of negTokenInit"));
                        return 0;
                } else if ((cls != ASN1_UNI) || (con != ASN1_CON)
                           || (tag != ASN1_SEQ)) {
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index f4124a32bef8..96abeb738978 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -39,7 +39,7 @@ cifs_dump_mem(char *label, void *data, int length)
        char *charptr = data;
        char buf[10], line[80];
-        printk(KERN_DEBUG "%s: dump of %d bytes of data at 0x%p\n\n", 
+        printk(KERN_DEBUG "%s: dump of %d bytes of data at 0x%p\n", 
                label, length, data);
        for (i = 0; i < length; i += 16) {
                line[0] = 0;
@@ -57,6 +57,57 @@ cifs_dump_mem(char *label, void *data, int length)
        }
 }
+#ifdef CONFIG_CIFS_DEBUG2
+void cifs_dump_detail(struct smb_hdr * smb)
+{
+        cERROR(1,("Cmd: %d Err: 0x%x Flags: 0x%x Flgs2: 0x%x Mid: %d Pid: %d",
+                  smb->Command, smb->Status.CifsError,
+                  smb->Flags, smb->Flags2, smb->Mid, smb->Pid));
+        cERROR(1,("smb buf %p len %d", smb, smbCalcSize_LE(smb)));
+}
+void cifs_dump_mids(struct TCP_Server_Info * server)
+{
+        struct list_head *tmp;
+        struct mid_q_entry * mid_entry;
+        if(server == NULL)
+                return;
+        cERROR(1,("Dump pending requests:"));
+        spin_lock(&GlobalMid_Lock);
+        list_for_each(tmp, &server->pending_mid_q) {
+                mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
+                if(mid_entry) {
+                        cERROR(1,("State: %d Cmd: %d Pid: %d Tsk: %p Mid %d",
+                                mid_entry->midState,
+                                (int)mid_entry->command,
+                                mid_entry->pid,
+                                mid_entry->tsk,
+                                mid_entry->mid));
+#ifdef CONFIG_CIFS_STATS2
+                        cERROR(1,("IsLarge: %d buf: %p time rcv: %ld now: %ld",
+                                mid_entry->largeBuf,
+                                mid_entry->resp_buf,
+                                mid_entry->when_received,
+                                jiffies));
+#endif /* STATS2 */
+                        cERROR(1,("IsMult: %d IsEnd: %d", mid_entry->multiRsp,
+                                  mid_entry->multiEnd));
+                        if(mid_entry->resp_buf) {
+                                cifs_dump_detail(mid_entry->resp_buf);
+                                cifs_dump_mem("existing buf: ",
+                                        mid_entry->resp_buf,
+                                        62 /* fixme */);
+                        }
+                        
+                }
+        }
+        spin_unlock(&GlobalMid_Lock);
+}
+#endif /* CONFIG_CIFS_DEBUG2 */
 #ifdef CONFIG_PROC_FS
 static int
 cifs_debug_data_read(char *buf, char **beginBuffer, off_t offset,
@@ -73,7 +124,6 @@ cifs_debug_data_read(char *buf, char **beginBuffer, off_t offset,
        *beginBuffer = buf + offset;
-        
        length =
            sprintf(buf,
                    "Display Internal CIFS Data Structures for Debugging\n"
@@ -395,12 +445,12 @@ static read_proc_t traceSMB_read;
 static write_proc_t traceSMB_write;
 static read_proc_t multiuser_mount_read;
 static write_proc_t multiuser_mount_write;
-static read_proc_t extended_security_read;
+static read_proc_t security_flags_read;
-static write_proc_t extended_security_write;
+static write_proc_t security_flags_write;
-static read_proc_t ntlmv2_enabled_read;
+/* static read_proc_t ntlmv2_enabled_read;
 static write_proc_t ntlmv2_enabled_write;
 static read_proc_t packet_signing_enabled_read;
-static write_proc_t packet_signing_enabled_write;
+static write_proc_t packet_signing_enabled_write;*/
 static read_proc_t experimEnabled_read;
 static write_proc_t experimEnabled_write;
 static read_proc_t linuxExtensionsEnabled_read;
@@ -458,10 +508,10 @@ cifs_proc_init(void)
                pde->write_proc = multiuser_mount_write;
        pde =
-            create_proc_read_entry("ExtendedSecurity", 0, proc_fs_cifs,
+            create_proc_read_entry("SecurityFlags", 0, proc_fs_cifs,
-                                extended_security_read, NULL);
+                                security_flags_read, NULL);
        if (pde)
-                pde->write_proc = extended_security_write;
+                pde->write_proc = security_flags_write;
        pde =
        create_proc_read_entry("LookupCacheEnabled", 0, proc_fs_cifs,
@@ -469,7 +519,7 @@ cifs_proc_init(void)
        if (pde)
                pde->write_proc = lookupFlag_write;
-        pde =
+/*      pde =
            create_proc_read_entry("NTLMV2Enabled", 0, proc_fs_cifs,
                                ntlmv2_enabled_read, NULL);
        if (pde)
@@ -479,7 +529,7 @@ cifs_proc_init(void)
            create_proc_read_entry("PacketSigningEnabled", 0, proc_fs_cifs,
                                packet_signing_enabled_read, NULL);
        if (pde)
-                pde->write_proc = packet_signing_enabled_write;
+                pde->write_proc = packet_signing_enabled_write;*/
 }
 void
@@ -496,9 +546,9 @@ cifs_proc_clean(void)
 #endif
        remove_proc_entry("MultiuserMount", proc_fs_cifs);
        remove_proc_entry("OplockEnabled", proc_fs_cifs);
-        remove_proc_entry("NTLMV2Enabled",proc_fs_cifs);
+/*      remove_proc_entry("NTLMV2Enabled",proc_fs_cifs); */
-        remove_proc_entry("ExtendedSecurity",proc_fs_cifs);
+        remove_proc_entry("SecurityFlags",proc_fs_cifs);
-        remove_proc_entry("PacketSigningEnabled",proc_fs_cifs);
+/*      remove_proc_entry("PacketSigningEnabled",proc_fs_cifs); */
        remove_proc_entry("LinuxExtensionsEnabled",proc_fs_cifs);
        remove_proc_entry("Experimental",proc_fs_cifs);
        remove_proc_entry("LookupCacheEnabled",proc_fs_cifs);
@@ -782,12 +832,12 @@ multiuser_mount_write(struct file *file, const char __user *buffer,
 }
 static int
-extended_security_read(char *page, char **start, off_t off,
+security_flags_read(char *page, char **start, off_t off,
                       int count, int *eof, void *data)
 {
        int len;
-        len = sprintf(page, "%d\n", extended_security);
+        len = sprintf(page, "0x%x\n", extended_security);
        len -= off;
        *start = page + off;
@@ -803,24 +853,52 @@ extended_security_read(char *page, char **start, off_t off,
        return len;
 }
 static int
-extended_security_write(struct file *file, const char __user *buffer,
+security_flags_write(struct file *file, const char __user *buffer,
                        unsigned long count, void *data)
 {
+        unsigned int flags;
+        char flags_string[12];
        char c;
-        int rc;
-        rc = get_user(c, buffer);
+        if((count < 1) || (count > 11))
-        if (rc)
+                return -EINVAL;
-                return rc;
-        if (c == '0' || c == 'n' || c == 'N')
+        memset(flags_string, 0, 12);
-                extended_security = 0;
-        else if (c == '1' || c == 'y' || c == 'Y')
+        if(copy_from_user(flags_string, buffer, count))
-                extended_security = 1;
+                return -EFAULT;
+        if(count < 3) {
+                /* single char or single char followed by null */
+                c = flags_string[0];
+                if (c == '0' || c == 'n' || c == 'N')
+                        extended_security = CIFSSEC_DEF; /* default */
+                else if (c == '1' || c == 'y' || c == 'Y')
+                        extended_security = CIFSSEC_MAX;
+                return count;
+        }
+        /* else we have a number */
+        flags = simple_strtoul(flags_string, NULL, 0);
+        cFYI(1,("sec flags 0x%x", flags));
+        if(flags <= 0)  {
+                cERROR(1,("invalid security flags %s",flags_string));
+                return -EINVAL;
+        }
+        if(flags & ~CIFSSEC_MASK) {
+                cERROR(1,("attempt to set unsupported security flags 0x%x",
+                        flags & ~CIFSSEC_MASK));
+                return -EINVAL;
+        }
+        /* flags look ok - update the global security flags for cifs module */
+        extended_security = flags;
        return count;
 }
-static int
+/* static int
 ntlmv2_enabled_read(char *page, char **start, off_t off,
                       int count, int *eof, void *data)
 {
@@ -855,6 +933,8 @@ ntlmv2_enabled_write(struct file *file, const char __user *buffer,
                ntlmv2_support = 0;
        else if (c == '1' || c == 'y' || c == 'Y')
                ntlmv2_support = 1;
+        else if (c == '2')
+                ntlmv2_support = 2;
        return count;
 }
@@ -898,7 +978,7 @@ packet_signing_enabled_write(struct file *file, const char __user *buffer,
                sign_CIFS_PDUs = 2;
        return count;
-}
+} */
 #endif
diff --git a/fs/cifs/cifs_debug.h b/fs/cifs/cifs_debug.h
index 4304d9dcfb6c..c26cd0d2c6d5 100644
--- a/fs/cifs/cifs_debug.h
+++ b/fs/cifs/cifs_debug.h
@@ -24,6 +24,10 @@
 #define _H_CIFS_DEBUG
 void cifs_dump_mem(char *label, void *data, int length);
+#ifdef CONFIG_CIFS_DEBUG2
+void cifs_dump_detail(struct smb_hdr *);
+void cifs_dump_mids(struct TCP_Server_Info *);
+#endif
 extern int traceSMB;            /* flag which enables the function below */
 void dump_smb(struct smb_hdr *, int);
 #define CIFS_INFO       0x01
diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c
index d2b128255944..d2a8b2941fc2 100644
--- a/fs/cifs/cifs_unicode.c
+++ b/fs/cifs/cifs_unicode.c
@@ -22,6 +22,7 @@
 #include "cifs_unicode.h"
 #include "cifs_uniupr.h"
 #include "cifspdu.h"
+#include "cifsglob.h"
 #include "cifs_debug.h"
 /*
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index e7d63737e651..a89efaf78a26 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -26,6 +26,8 @@
 #include "md5.h"
 #include "cifs_unicode.h"
 #include "cifsproto.h"
+#include <linux/ctype.h>
+#include <linux/random.h>
 /* Calculate and return the CIFS signature based on the mac key and the smb pdu */
 /* the 16 byte signature must be allocated by the caller  */
@@ -35,6 +37,8 @@
 extern void mdfour(unsigned char *out, unsigned char *in, int n);
 extern void E_md4hash(const unsigned char *passwd, unsigned char *p16);
+extern void SMBencrypt(unsigned char *passwd, unsigned char *c8,
+                       unsigned char *p24);
        
 static int cifs_calculate_signature(const struct smb_hdr * cifs_pdu, 
                                    const char * key, char * signature)
@@ -45,7 +49,7 @@ static int cifs_calculate_signature(const struct smb_hdr * cifs_pdu,
                return -EINVAL;
        MD5Init(&context);
-        MD5Update(&context,key,CIFS_SESSION_KEY_SIZE+16);
+        MD5Update(&context,key,CIFS_SESS_KEY_SIZE+16);
        MD5Update(&context,cifs_pdu->Protocol,cifs_pdu->smb_buf_length);
        MD5Final(signature,&context);
        return 0;
@@ -90,7 +94,7 @@ static int cifs_calc_signature2(const struct kvec * iov, int n_vec,
                return -EINVAL;
        MD5Init(&context);
-        MD5Update(&context,key,CIFS_SESSION_KEY_SIZE+16);
+        MD5Update(&context,key,CIFS_SESS_KEY_SIZE+16);
        for(i=0;i<n_vec;i++) {
                if(iov[i].iov_base == NULL) {
                        cERROR(1,("null iovec entry"));
@@ -204,11 +208,12 @@ int cifs_calculate_mac_key(char * key, const char * rn, const char * password)
        E_md4hash(password, temp_key);
        mdfour(key,temp_key,16);
-        memcpy(key+16,rn, CIFS_SESSION_KEY_SIZE);
+        memcpy(key+16,rn, CIFS_SESS_KEY_SIZE);
        return 0;
 }
-int CalcNTLMv2_partial_mac_key(struct cifsSesInfo * ses, struct nls_table * nls_info)
+int CalcNTLMv2_partial_mac_key(struct cifsSesInfo * ses, 
+                                const struct nls_table * nls_info)
 {
        char temp_hash[16];
        struct HMACMD5Context ctx;
@@ -225,6 +230,8 @@ int CalcNTLMv2_partial_mac_key(struct cifsSesInfo * ses, struct nls_table * nls_
        user_name_len = strlen(ses->userName);
        if(user_name_len > MAX_USERNAME_SIZE)
                return -EINVAL;
+        if(ses->domainName == NULL)
+                return -EINVAL; /* BB should we use CIFS_LINUX_DOM */
        dom_name_len = strlen(ses->domainName);
        if(dom_name_len > MAX_USERNAME_SIZE)
                return -EINVAL;
@@ -259,16 +266,131 @@ int CalcNTLMv2_partial_mac_key(struct cifsSesInfo * ses, struct nls_table * nls_
        kfree(unicode_buf);
        return 0;
 }
-void CalcNTLMv2_response(const struct cifsSesInfo * ses,char * v2_session_response)
+#ifdef CONFIG_CIFS_WEAK_PW_HASH
+void calc_lanman_hash(struct cifsSesInfo * ses, char * lnm_session_key)
+{
+        int i;
+        char password_with_pad[CIFS_ENCPWD_SIZE];
+        if(ses->server == NULL)
+                return;
+        memset(password_with_pad, 0, CIFS_ENCPWD_SIZE);
+        strncpy(password_with_pad, ses->password, CIFS_ENCPWD_SIZE);
+        if((ses->server->secMode & SECMODE_PW_ENCRYPT) == 0)
+                if(extended_security & CIFSSEC_MAY_PLNTXT) {
+                        memcpy(lnm_session_key, password_with_pad, CIFS_ENCPWD_SIZE); 
+                        return;
+                }
+        /* calculate old style session key */
+        /* calling toupper is less broken than repeatedly
+        calling nls_toupper would be since that will never
+        work for UTF8, but neither handles multibyte code pages
+        but the only alternative would be converting to UCS-16 (Unicode)
+        (using a routine something like UniStrupr) then
+        uppercasing and then converting back from Unicode - which
+        would only be worth doing if we knew it were utf8. Basically
+        utf8 and other multibyte codepages each need their own strupper
+        function since a byte at a time will not work. */
+        for(i = 0; i < CIFS_ENCPWD_SIZE; i++) {
+                password_with_pad[i] = toupper(password_with_pad[i]);
+        }
+        SMBencrypt(password_with_pad, ses->server->cryptKey, lnm_session_key);
+        /* clear password before we return/free memory */
+        memset(password_with_pad, 0, CIFS_ENCPWD_SIZE);
+}
+#endif /* CIFS_WEAK_PW_HASH */
+static int calc_ntlmv2_hash(struct cifsSesInfo *ses, 
+                            const struct nls_table * nls_cp)
+{
+        int rc = 0;
+        int len;
+        char nt_hash[16];
+        struct HMACMD5Context * pctxt;
+        wchar_t * user;
+        wchar_t * domain;
+        pctxt = kmalloc(sizeof(struct HMACMD5Context), GFP_KERNEL);
+        if(pctxt == NULL)
+                return -ENOMEM;
+        /* calculate md4 hash of password */
+        E_md4hash(ses->password, nt_hash);
+        /* convert Domainname to unicode and uppercase */
+        hmac_md5_init_limK_to_64(nt_hash, 16, pctxt);
+        /* convert ses->userName to unicode and uppercase */
+        len = strlen(ses->userName);
+        user = kmalloc(2 + (len * 2), GFP_KERNEL);
+        if(user == NULL)
+                goto calc_exit_2;
+        len = cifs_strtoUCS(user, ses->userName, len, nls_cp);
+        UniStrupr(user);
+        hmac_md5_update((char *)user, 2*len, pctxt);
+        /* convert ses->domainName to unicode and uppercase */
+        if(ses->domainName) {
+                len = strlen(ses->domainName);
+                domain = kmalloc(2 + (len * 2), GFP_KERNEL);
+                if(domain == NULL)
+                        goto calc_exit_1;
+                len = cifs_strtoUCS(domain, ses->domainName, len, nls_cp);
+                UniStrupr(domain);
+                hmac_md5_update((char *)domain, 2*len, pctxt);
+        
+                kfree(domain);
+        }
+calc_exit_1:
+        kfree(user);
+calc_exit_2:
+        /* BB FIXME what about bytes 24 through 40 of the signing key? 
+           compare with the NTLM example */
+        hmac_md5_final(ses->server->mac_signing_key, pctxt);
+        return rc;
+}
+void setup_ntlmv2_rsp(struct cifsSesInfo * ses, char * resp_buf, 
+                      const struct nls_table * nls_cp)
+{
+        int rc;
+        struct ntlmv2_resp * buf = (struct ntlmv2_resp *)resp_buf;
+        buf->blob_signature = cpu_to_le32(0x00000101);
+        buf->reserved = 0;
+        buf->time = cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME));
+        get_random_bytes(&buf->client_chal, sizeof(buf->client_chal));
+        buf->reserved2 = 0;
+        buf->names[0].type = 0;
+        buf->names[0].length = 0;
+        /* calculate buf->ntlmv2_hash */
+        rc = calc_ntlmv2_hash(ses, nls_cp);
+        if(rc)
+                cERROR(1,("could not get v2 hash rc %d",rc));
+        CalcNTLMv2_response(ses, resp_buf);
+}
+void CalcNTLMv2_response(const struct cifsSesInfo * ses, char * v2_session_response)
 {
        struct HMACMD5Context context;
+        /* rest of v2 struct already generated */
        memcpy(v2_session_response + 8, ses->server->cryptKey,8);
-        /* gen_blob(v2_session_response + 16); */
        hmac_md5_init_limK_to_64(ses->server->mac_signing_key, 16, &context);
-        hmac_md5_update(ses->server->cryptKey,8,&context);
+        hmac_md5_update(v2_session_response+8, 
-/*      hmac_md5_update(v2_session_response+16)client thing,8,&context); */ /* BB fix */
+                        sizeof(struct ntlmv2_resp) - 8, &context);
        hmac_md5_final(v2_session_response,&context);
-        cifs_dump_mem("v2_sess_rsp: ", v2_session_response, 32); /* BB removeme BB */
+/*      cifs_dump_mem("v2_sess_rsp: ", v2_session_response, 32); */
 }
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index c262d8874ce9..c28ede599946 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -56,8 +56,8 @@ unsigned int experimEnabled = 0;
 unsigned int linuxExtEnabled = 1;
 unsigned int lookupCacheEnabled = 1;
 unsigned int multiuser_mount = 0;
-unsigned int extended_security = 0;
+unsigned int extended_security = CIFSSEC_DEF;
-unsigned int ntlmv2_support = 0;
+/* unsigned int ntlmv2_support = 0; */
 unsigned int sign_CIFS_PDUs = 1;
 extern struct task_struct * oplockThread; /* remove sparse warning */
 struct task_struct * oplockThread = NULL;
@@ -166,8 +166,9 @@ cifs_put_super(struct super_block *sb)
 }
 static int
-cifs_statfs(struct super_block *sb, struct kstatfs *buf)
+cifs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
+        struct super_block *sb = dentry->d_sb;
        int xid; 
        int rc = -EOPNOTSUPP;
        struct cifs_sb_info *cifs_sb;
@@ -402,12 +403,14 @@ static struct quotactl_ops cifs_quotactl_ops = {
 #endif
 #ifdef CONFIG_CIFS_EXPERIMENTAL
-static void cifs_umount_begin(struct super_block * sblock)
+static void cifs_umount_begin(struct vfsmount * vfsmnt, int flags)
 {
        struct cifs_sb_info *cifs_sb;
        struct cifsTconInfo * tcon;
-        cifs_sb = CIFS_SB(sblock);
+        if (!(flags & MNT_FORCE))
+                return;
+        cifs_sb = CIFS_SB(vfsmnt->mnt_sb);
        if(cifs_sb == NULL)
                return;
@@ -460,9 +463,9 @@ struct super_operations cifs_super_ops = {
        .remount_fs = cifs_remount,
 };
-static struct super_block *
+static int
 cifs_get_sb(struct file_system_type *fs_type,
-            int flags, const char *dev_name, void *data)
+            int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
        int rc;
        struct super_block *sb = sget(fs_type, NULL, set_anon_super, NULL);
@@ -470,7 +473,7 @@ cifs_get_sb(struct file_system_type *fs_type,
        cFYI(1, ("Devname: %s flags: %d ", dev_name, flags));
        if (IS_ERR(sb))
-                return sb;
+                return PTR_ERR(sb);
        sb->s_flags = flags;
@@ -478,10 +481,10 @@ cifs_get_sb(struct file_system_type *fs_type,
        if (rc) {
                up_write(&sb->s_umount);
                deactivate_super(sb);
-                return ERR_PTR(rc);
+                return rc;
        }
        sb->s_flags |= MS_ACTIVE;
-        return sb;
+        return simple_set_mnt(mnt, sb);
 }
 static ssize_t cifs_file_writev(struct file *file, const struct iovec *iov,
@@ -905,7 +908,7 @@ static int cifs_dnotify_thread(void * dummyarg)
        struct cifsSesInfo *ses;
        do {
-                if(try_to_freeze())
+                if (try_to_freeze())
                        continue;
                set_current_state(TASK_INTERRUPTIBLE);
                schedule_timeout(15*HZ);
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index c98755dca868..8f75c6f24701 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -32,7 +32,8 @@
 #define TRUE 1
 #endif
-extern struct address_space_operations cifs_addr_ops;
+extern const struct address_space_operations cifs_addr_ops;
+extern const struct address_space_operations cifs_addr_ops_smallbuf;
 /* Functions related to super block operations */
 extern struct super_operations cifs_super_ops;
@@ -74,7 +75,7 @@ extern ssize_t cifs_user_write(struct file *file, const char __user *write_data,
                         size_t write_size, loff_t * poffset);
 extern int cifs_lock(struct file *, int, struct file_lock *);
 extern int cifs_fsync(struct file *, struct dentry *, int);
-extern int cifs_flush(struct file *);
+extern int cifs_flush(struct file *, fl_owner_t id);
 extern int cifs_file_mmap(struct file * , struct vm_area_struct *);
 extern const struct file_operations cifs_dir_ops;
 extern int cifs_dir_open(struct inode *inode, struct file *file);
@@ -99,5 +100,5 @@ extern ssize_t	cifs_getxattr(struct dentry *, const char *, void *, size_t);
 extern ssize_t  cifs_listxattr(struct dentry *, char *, size_t);
 extern int cifs_ioctl (struct inode * inode, struct file * filep,
                       unsigned int command, unsigned long arg);
-#define CIFS_VERSION   "1.43"
+#define CIFS_VERSION   "1.44"
 #endif                          /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 006eb33bff5f..6d7cf5f3bc0b 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -88,7 +88,8 @@ enum statusEnum {
 };
 enum securityEnum {
-        NTLM = 0,               /* Legacy NTLM012 auth with NTLM hash */
+        LANMAN = 0,             /* Legacy LANMAN auth */
+        NTLM,                   /* Legacy NTLM012 auth with NTLM hash */
        NTLMv2,                 /* Legacy NTLM auth with NTLMv2 hash */
        RawNTLMSSP,             /* NTLMSSP without SPNEGO */
        NTLMSSP,                /* NTLMSSP via SPNEGO */
@@ -157,7 +158,7 @@ struct TCP_Server_Info {
        /* 16th byte of RFC1001 workstation name is always null */
        char workstation_RFC1001_name[SERVER_NAME_LEN_WITH_NULL];
        __u32 sequence_number; /* needed for CIFS PDU signature */
-        char mac_signing_key[CIFS_SESSION_KEY_SIZE + 16]; 
+        char mac_signing_key[CIFS_SESS_KEY_SIZE + 16]; 
 };
 /*
@@ -179,10 +180,13 @@ struct cifsUidInfo {
 struct cifsSesInfo {
        struct list_head cifsSessionList;
        struct semaphore sesSem;
+#if 0
        struct cifsUidInfo *uidInfo;    /* pointer to user info */
+#endif
        struct TCP_Server_Info *server; /* pointer to server info */
        atomic_t inUse; /* # of mounts (tree connections) on this ses */
        enum statusEnum status;
+        unsigned overrideSecFlg;  /* if non-zero override global sec flags */
        __u16 ipc_tid;          /* special tid for connection to IPC share */
        __u16 flags;
        char *serverOS;         /* name of operating system underlying server */
@@ -194,7 +198,7 @@ struct cifsSesInfo {
        char serverName[SERVER_NAME_LEN_WITH_NULL * 2]; /* BB make bigger for 
                                TCP names - will ipv6 and sctp addresses fit? */
        char userName[MAX_USERNAME_SIZE + 1];
-        char domainName[MAX_USERNAME_SIZE + 1];
+        char * domainName;
        char * password;
 };
 /* session flags */
@@ -209,12 +213,12 @@ struct cifsTconInfo {
        struct list_head openFileList;
        struct semaphore tconSem;
        struct cifsSesInfo *ses;        /* pointer to session associated with */
-        char treeName[MAX_TREE_SIZE + 1]; /* UNC name of resource (in ASCII not UTF) */
+        char treeName[MAX_TREE_SIZE + 1]; /* UNC name of resource in ASCII */
        char *nativeFileSystem;
        __u16 tid;              /* The 2 byte tree id */
        __u16 Flags;            /* optional support bits */
        enum statusEnum tidStatus;
-        atomic_t useCount;      /* how many mounts (explicit or implicit) to this share */
+        atomic_t useCount;      /* how many explicit/implicit mounts to share */
 #ifdef CONFIG_CIFS_STATS
        atomic_t num_smbs_sent;
        atomic_t num_writes;
@@ -254,7 +258,7 @@ struct cifsTconInfo {
        spinlock_t stat_lock;
 #endif /* CONFIG_CIFS_STATS */
        FILE_SYSTEM_DEVICE_INFO fsDevInfo;
-        FILE_SYSTEM_ATTRIBUTE_INFO fsAttrInfo;  /* ok if file system name truncated */
+        FILE_SYSTEM_ATTRIBUTE_INFO fsAttrInfo; /* ok if fs name truncated */
        FILE_SYSTEM_UNIX_INFO fsUnixInfo;
        unsigned retry:1;
        unsigned nocase:1;
@@ -305,7 +309,6 @@ struct cifsFileInfo {
        atomic_t wrtPending;   /* handle in use - defer close */
        struct semaphore fh_sem; /* prevents reopen race after dead ses*/
        char * search_resume_name; /* BB removeme BB */
-        unsigned int resume_name_length; /* BB removeme - field renamed and moved BB */
        struct cifs_search_info srch_inf;
 };
@@ -391,9 +394,9 @@ struct mid_q_entry {
        struct smb_hdr *resp_buf;       /* response buffer */
        int midState;   /* wish this were enum but can not pass to wait_event */
        __u8 command;   /* smb command code */
-        unsigned multiPart:1;   /* multiple responses to one SMB request */
        unsigned largeBuf:1;    /* if valid response, is pointer to large buf */
-        unsigned multiResp:1;   /* multiple trans2 responses for one request  */
+        unsigned multiRsp:1;   /* multiple trans2 responses for one request  */
+        unsigned multiEnd:1; /* both received */
 };
 struct oplock_q_entry {
@@ -430,15 +433,35 @@ struct dir_notify_req {
 #define   CIFS_LARGE_BUFFER     2
 #define   CIFS_IOVEC            4    /* array of response buffers */
-/* Type of session setup needed */
+/* Security Flags: indicate type of session setup needed */
-#define   CIFS_PLAINTEXT        0
+#define   CIFSSEC_MAY_SIGN      0x00001
-#define   CIFS_LANMAN           1
+#define   CIFSSEC_MAY_NTLM      0x00002
-#define   CIFS_NTLM             2
+#define   CIFSSEC_MAY_NTLMV2    0x00004
-#define   CIFS_NTLMSSP_NEG      3
+#define   CIFSSEC_MAY_KRB5      0x00008
-#define   CIFS_NTLMSSP_AUTH     4
+#ifdef CONFIG_CIFS_WEAK_PW_HASH
-#define   CIFS_SPNEGO_INIT      5
+#define   CIFSSEC_MAY_LANMAN    0x00010
-#define   CIFS_SPNEGO_TARG      6
+#define   CIFSSEC_MAY_PLNTXT    0x00020
+#endif /* weak passwords */
+#define   CIFSSEC_MAY_SEAL      0x00040 /* not supported yet */
+#define   CIFSSEC_MUST_SIGN     0x01001
+/* note that only one of the following can be set so the
+result of setting MUST flags more than once will be to
+require use of the stronger protocol */
+#define   CIFSSEC_MUST_NTLM     0x02002
+#define   CIFSSEC_MUST_NTLMV2   0x04004
+#define   CIFSSEC_MUST_KRB5     0x08008
+#ifdef CONFIG_CIFS_WEAK_PW_HASH
+#define   CIFSSEC_MUST_LANMAN   0x10010
+#define   CIFSSEC_MUST_PLNTXT   0x20020
+#define   CIFSSEC_MASK          0x37037 /* current flags supported if weak */
+#else     
+#define   CIFSSEC_MASK          0x07007 /* flags supported if no weak config */
+#endif /* WEAK_PW_HASH */
+#define   CIFSSEC_MUST_SEAL     0x40040 /* not supported yet */
+#define   CIFSSEC_DEF (CIFSSEC_MAY_SIGN | CIFSSEC_MAY_NTLM | CIFSSEC_MAY_NTLMV2)
+#define   CIFSSEC_MAX (CIFSSEC_MUST_SIGN | CIFSSEC_MUST_NTLMV2)
 /*
 *****************************************************************
 * All constants go here
@@ -500,16 +523,16 @@ GLOBAL_EXTERN rwlock_t GlobalSMBSeslock;  /* protects list inserts on 3 above */
 GLOBAL_EXTERN struct list_head GlobalOplock_Q;
 GLOBAL_EXTERN struct list_head GlobalDnotifyReqList; /* Outstanding dir notify requests */
-GLOBAL_EXTERN struct list_head GlobalDnotifyRsp_Q; /* Dir notify response queue */
+GLOBAL_EXTERN struct list_head GlobalDnotifyRsp_Q;/* DirNotify response queue */
 /*
 * Global transaction id (XID) information
 */
 GLOBAL_EXTERN unsigned int GlobalCurrentXid;    /* protected by GlobalMid_Sem */
-GLOBAL_EXTERN unsigned int GlobalTotalActiveXid;        /* prot by GlobalMid_Sem */
+GLOBAL_EXTERN unsigned int GlobalTotalActiveXid; /* prot by GlobalMid_Sem */
 GLOBAL_EXTERN unsigned int GlobalMaxActiveXid;  /* prot by GlobalMid_Sem */
-GLOBAL_EXTERN spinlock_t GlobalMid_Lock;  /* protects above and list operations */
+GLOBAL_EXTERN spinlock_t GlobalMid_Lock;  /* protects above & list operations */
-                                        /* on midQ entries */
+                                          /* on midQ entries */
 GLOBAL_EXTERN char Local_System_Name[15];
 /*
@@ -531,7 +554,7 @@ GLOBAL_EXTERN atomic_t smBufAllocCount;
 GLOBAL_EXTERN atomic_t midCount;
 /* Misc globals */
-GLOBAL_EXTERN unsigned int multiuser_mount;     /* if enabled allows new sessions
+GLOBAL_EXTERN unsigned int multiuser_mount; /* if enabled allows new sessions
                                to be established on existing mount if we
                                have the uid/password or Kerberos credential 
                                or equivalent for current user */
@@ -540,8 +563,8 @@ GLOBAL_EXTERN unsigned int experimEnabled;
 GLOBAL_EXTERN unsigned int lookupCacheEnabled;
 GLOBAL_EXTERN unsigned int extended_security;   /* if on, session setup sent 
                                with more secure ntlmssp2 challenge/resp */
-GLOBAL_EXTERN unsigned int ntlmv2_support;  /* better optional password hash */
 GLOBAL_EXTERN unsigned int sign_CIFS_PDUs;  /* enable smb packet signing */
+GLOBAL_EXTERN unsigned int secFlags;
 GLOBAL_EXTERN unsigned int linuxExtEnabled;/*enable Linux/Unix CIFS extensions*/
 GLOBAL_EXTERN unsigned int CIFSMaxBufSize;  /* max size not including hdr */
 GLOBAL_EXTERN unsigned int cifs_min_rcv;    /* min size of big ntwrk buf pool */
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index b2233ac05bd2..86239023545b 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -16,7 +16,7 @@
 *
 *   You should have received a copy of the GNU Lesser General Public License
 *   along with this library; if not, write to the Free Software
- *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */
 #ifndef _CIFSPDU_H
@@ -24,8 +24,14 @@
 #include <net/sock.h>
+#ifdef CONFIG_CIFS_WEAK_PW_HASH
+#define LANMAN_PROT 0
+#define CIFS_PROT   1
+#else
 #define CIFS_PROT   0
-#define BAD_PROT    CIFS_PROT+1
+#endif
+#define POSIX_PROT  CIFS_PROT+1
+#define BAD_PROT 0xFFFF
 /* SMB command codes */
 /* Some commands have minimal (wct=0,bcc=0), or uninteresting, responses
@@ -110,7 +116,7 @@
 /*
 * Size of the session key (crypto key encrypted with the password
 */
-#define CIFS_SESSION_KEY_SIZE (24)
+#define CIFS_SESS_KEY_SIZE (24)
 /*
 * Maximum user name length
@@ -400,6 +406,29 @@ typedef struct negotiate_req {
        unsigned char DialectsArray[1];
 } __attribute__((packed)) NEGOTIATE_REQ;
+/* Dialect index is 13 for LANMAN */
+typedef struct lanman_neg_rsp {
+        struct smb_hdr hdr;     /* wct = 13 */
+        __le16 DialectIndex;
+        __le16 SecurityMode;
+        __le16 MaxBufSize;
+        __le16 MaxMpxCount;
+        __le16 MaxNumberVcs;
+        __le16 RawMode;
+        __le32 SessionKey;
+        __le32 ServerTime;
+        __le16 ServerTimeZone;
+        __le16 EncryptionKeyLength;
+        __le16 Reserved;
+        __u16  ByteCount;
+        unsigned char EncryptionKey[1];
+} __attribute__((packed)) LANMAN_NEG_RSP;
+#define READ_RAW_ENABLE 1
+#define WRITE_RAW_ENABLE 2
+#define RAW_ENABLE (READ_RAW_ENABLE | WRITE_RAW_ENABLE)
 typedef struct negotiate_rsp {
        struct smb_hdr hdr;     /* wct = 17 */
        __le16 DialectIndex;
@@ -509,7 +538,7 @@ typedef union smb_com_session_setup_andx {
 /*      unsigned char  * NativeOS;      */
 /*      unsigned char  * NativeLanMan;  */
 /*      unsigned char  * PrimaryDomain; */
-        } __attribute__((packed)) resp;                 /* NTLM response format (with or without extended security */
+        } __attribute__((packed)) resp; /* NTLM response with or without extended sec*/
        struct {                /* request format */
                struct smb_hdr hdr;     /* wct = 10 */
@@ -520,8 +549,8 @@ typedef union smb_com_session_setup_andx {
                __le16 MaxMpxCount;
                __le16 VcNumber;
                __u32 SessionKey;
-                __le16 PassswordLength;
+                __le16 PasswordLength;
-                __u32 Reserved;
+                __u32 Reserved; /* encrypt key len and offset */
                __le16 ByteCount;
                unsigned char AccountPassword[1];       /* followed by */
                /* STRING AccountName */
@@ -543,6 +572,26 @@ typedef union smb_com_session_setup_andx {
        } __attribute__((packed)) old_resp; /* pre-NTLM (LANMAN2.1) response */
 } __attribute__((packed)) SESSION_SETUP_ANDX;
+/* format of NTLMv2 Response ie "case sensitive password" hash when NTLMv2 */
+struct ntlmssp2_name {
+        __le16 type;
+        __le16 length;
+/*      char   name[length]; */
+} __attribute__((packed));
+struct ntlmv2_resp {
+        char ntlmv2_hash[CIFS_ENCPWD_SIZE];
+        __le32 blob_signature;
+        __u32  reserved;
+        __le64  time;
+        __u64  client_chal; /* random */
+        __u32  reserved2;
+        struct ntlmssp2_name names[1];
+        /* array of name entries could follow ending in minimum 4 byte struct */
+} __attribute__((packed));
 #define CIFS_NETWORK_OPSYS "CIFS VFS Client for Linux"
 /* Capabilities bits (for NTLM SessSetup request) */
@@ -573,7 +622,9 @@ typedef struct smb_com_tconx_req {
 } __attribute__((packed)) TCONX_REQ;
 typedef struct smb_com_tconx_rsp {
-        struct smb_hdr hdr;     /* wct = 3 *//* note that Win2000 has sent wct=7 in some cases on responses. Four unspecified words followed OptionalSupport */
+        struct smb_hdr hdr;     /* wct = 3 note that Win2000 has sent wct = 7
+                                 in some cases on responses. Four unspecified
+                                 words followed OptionalSupport */
        __u8 AndXCommand;
        __u8 AndXReserved;
        __le16 AndXOffset;
@@ -1323,6 +1374,9 @@ struct smb_t2_rsp {
 #define SMB_FILE_MAXIMUM_INFO           0x40d
 /* Find File infolevels */
+#define SMB_FIND_FILE_INFO_STANDARD       0x001
+#define SMB_FIND_FILE_QUERY_EA_SIZE       0x002
+#define SMB_FIND_FILE_QUERY_EAS_FROM_LIST 0x003
 #define SMB_FIND_FILE_DIRECTORY_INFO      0x101
 #define SMB_FIND_FILE_FULL_DIRECTORY_INFO 0x102
 #define SMB_FIND_FILE_NAMES_INFO          0x103
@@ -1844,13 +1898,13 @@ typedef struct {
 typedef struct {
        __le32 DeviceType;
        __le32 DeviceCharacteristics;
-} __attribute__((packed)) FILE_SYSTEM_DEVICE_INFO;      /* device info, level 0x104 */
+} __attribute__((packed)) FILE_SYSTEM_DEVICE_INFO; /* device info level 0x104 */
 typedef struct {
        __le32 Attributes;
        __le32 MaxPathNameComponentLength;
        __le32 FileSystemNameLen;
-        char FileSystemName[52]; /* do not really need to save this - so potentially get only subset of name */
+        char FileSystemName[52]; /* do not have to save this - get subset? */
 } __attribute__((packed)) FILE_SYSTEM_ATTRIBUTE_INFO;
 /******************************************************************************/
@@ -1947,7 +2001,8 @@ typedef struct {
 struct file_allocation_info {
        __le64 AllocationSize; /* Note old Samba srvr rounds this up too much */
-} __attribute__((packed));      /* size used on disk, level 0x103 for set, 0x105 for query */
+} __attribute__((packed));      /* size used on disk, for level 0x103 for set,
+                                   0x105 for query */
 struct file_end_of_file_info {
        __le64 FileSize;                /* offset to end of file */
@@ -2054,7 +2109,7 @@ typedef struct {
        __le32 ExtFileAttributes;
        __le32 FileNameLength;
        char FileName[1];
-} __attribute__((packed)) FILE_DIRECTORY_INFO;   /* level 0x101 FF response data area */
+} __attribute__((packed)) FILE_DIRECTORY_INFO;   /* level 0x101 FF resp data */
 typedef struct {
        __le32 NextEntryOffset;
@@ -2069,7 +2124,7 @@ typedef struct {
        __le32 FileNameLength;
        __le32 EaSize; /* length of the xattrs */
        char FileName[1];
-} __attribute__((packed)) FILE_FULL_DIRECTORY_INFO;   /* level 0x102 FF response data area */
+} __attribute__((packed)) FILE_FULL_DIRECTORY_INFO; /* level 0x102 rsp data */
 typedef struct {
        __le32 NextEntryOffset;
@@ -2086,7 +2141,7 @@ typedef struct {
        __le32 Reserved;
        __u64 UniqueId; /* inode num - le since Samba puts ino in low 32 bit*/
        char FileName[1];
-} __attribute__((packed)) SEARCH_ID_FULL_DIR_INFO;   /* level 0x105 FF response data area */
+} __attribute__((packed)) SEARCH_ID_FULL_DIR_INFO; /* level 0x105 FF rsp data */
 typedef struct {
        __le32 NextEntryOffset;
@@ -2104,7 +2159,22 @@ typedef struct {
        __u8   Reserved;
        __u8   ShortName[12];
        char FileName[1];
-} __attribute__((packed)) FILE_BOTH_DIRECTORY_INFO;   /* level 0x104 FF response data area */
+} __attribute__((packed)) FILE_BOTH_DIRECTORY_INFO; /* level 0x104 FFrsp data */
+typedef struct {
+        __u32  ResumeKey;
+        __le16 CreationDate; /* SMB Date */
+        __le16 CreationTime; /* SMB Time */
+        __le16 LastAccessDate;
+        __le16 LastAccessTime;
+        __le16 LastWriteDate;
+        __le16 LastWriteTime;
+        __le32 DataSize; /* File Size (EOF) */
+        __le32 AllocationSize;
+        __le16 Attributes; /* verify not u32 */
+        __u8   FileNameLength;
+        char FileName[1];
+} __attribute__((packed)) FIND_FILE_STANDARD_INFO; /* level 0x1 FF resp data */
 struct win_dev {
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 310ea2f0e0bf..a5ddc62d6fe6 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -64,14 +64,12 @@ extern int map_smb_to_linux_error(struct smb_hdr *smb);
 extern void header_assemble(struct smb_hdr *, char /* command */ ,
                            const struct cifsTconInfo *, int /* length of
                            fixed section (word count) in two byte units */);
-#ifdef CONFIG_CIFS_EXPERIMENTAL
 extern int small_smb_init_no_tc(const int smb_cmd, const int wct,
                                struct cifsSesInfo *ses,
                                void ** request_buf);
 extern int CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses,
-                             const int stage, int * pNTLMv2_flg,
+                             const int stage, 
                             const struct nls_table *nls_cp);
-#endif
 extern __u16 GetNextMid(struct TCP_Server_Info *server);
 extern struct oplock_q_entry * AllocOplockQEntry(struct inode *, u16, 
                                                 struct cifsTconInfo *);
@@ -285,8 +283,14 @@ extern int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *,
 extern int cifs_verify_signature(struct smb_hdr *, const char * mac_key,
        __u32 expected_sequence_number);
 extern int cifs_calculate_mac_key(char * key,const char * rn,const char * pass);
-extern int CalcNTLMv2_partial_mac_key(struct cifsSesInfo *, struct nls_table *);
+extern int CalcNTLMv2_partial_mac_key(struct cifsSesInfo *, 
-extern void CalcNTLMv2_response(const struct cifsSesInfo *,char * );
+                        const struct nls_table *);
+extern void CalcNTLMv2_response(const struct cifsSesInfo *, char * );
+extern void setup_ntlmv2_rsp(struct cifsSesInfo *, char *, 
+                             const struct nls_table *);
+#ifdef CONFIG_CIFS_WEAK_PW_HASH
+extern void calc_lanman_hash(struct cifsSesInfo * ses, char * lnm_session_key);
+#endif /* CIFS_WEAK_PW_HASH */
 extern int CIFSSMBCopy(int xid,
                        struct cifsTconInfo *source_tcon,
                        const char *fromName,
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 925881e00ff2..19678c575dfc 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -44,8 +44,11 @@ static struct {
        int index;
        char *name;
 } protocols[] = {
+#ifdef CONFIG_CIFS_WEAK_PW_HASH
+        {LANMAN_PROT, "\2LM1.2X002"},
+#endif /* weak password hashing for legacy clients */
        {CIFS_PROT, "\2NT LM 0.12"}, 
-        {CIFS_PROT, "\2POSIX 2"},
+        {POSIX_PROT, "\2POSIX 2"},
        {BAD_PROT, "\2"}
 };
 #else
@@ -53,11 +56,29 @@ static struct {
        int index;
        char *name;
 } protocols[] = {
+#ifdef CONFIG_CIFS_WEAK_PW_HASH
+        {LANMAN_PROT, "\2LM1.2X002"},
+#endif /* weak password hashing for legacy clients */
        {CIFS_PROT, "\2NT LM 0.12"}, 
        {BAD_PROT, "\2"}
 };
 #endif
+/* define the number of elements in the cifs dialect array */
+#ifdef CONFIG_CIFS_POSIX
+#ifdef CONFIG_CIFS_WEAK_PW_HASH
+#define CIFS_NUM_PROT 3
+#else
+#define CIFS_NUM_PROT 2
+#endif /* CIFS_WEAK_PW_HASH */
+#else /* not posix */
+#ifdef CONFIG_CIFS_WEAK_PW_HASH
+#define CIFS_NUM_PROT 2
+#else
+#define CIFS_NUM_PROT 1
+#endif /* CONFIG_CIFS_WEAK_PW_HASH */
+#endif /* CIFS_POSIX */
 /* Mark as invalid, all open files on tree connections since they
   were closed when session to server was lost */
@@ -188,7 +209,6 @@ small_smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
        return rc;
 }
-#ifdef CONFIG_CIFS_EXPERIMENTAL  
 int
 small_smb_init_no_tc(const int smb_command, const int wct, 
                     struct cifsSesInfo *ses, void **request_buf)
@@ -214,7 +234,6 @@ small_smb_init_no_tc(const int smb_command, const int wct,
        return rc;
 }
-#endif  /* CONFIG_CIFS_EXPERIMENTAL */
 /* If the return code is zero, this function must fill in request_buf pointer */
 static int
@@ -322,7 +341,8 @@ smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
    /* potential retries of smb operations it turns out we can determine */
    /* from the mid flags when the request buffer can be resent without  */
    /* having to use a second distinct buffer for the response */
-        *response_buf = *request_buf; 
+        if(response_buf)
+                *response_buf = *request_buf; 
        header_assemble((struct smb_hdr *) *request_buf, smb_command, tcon,
                        wct /*wct */ );
@@ -373,8 +393,10 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
        NEGOTIATE_RSP *pSMBr;
        int rc = 0;
        int bytes_returned;
+        int i;
        struct TCP_Server_Info * server;
        u16 count;
+        unsigned int secFlags;
        if(ses->server)
                server = ses->server;
@@ -386,101 +408,200 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
                      (void **) &pSMB, (void **) &pSMBr);
        if (rc)
                return rc;
+        /* if any of auth flags (i.e. not sign or seal) are overridden use them */
+        if(ses->overrideSecFlg & (~(CIFSSEC_MUST_SIGN | CIFSSEC_MUST_SEAL)))
+                secFlags = ses->overrideSecFlg;
+        else /* if override flags set only sign/seal OR them with global auth */
+                secFlags = extended_security | ses->overrideSecFlg;
+        cFYI(1,("secFlags 0x%x",secFlags));
        pSMB->hdr.Mid = GetNextMid(server);
        pSMB->hdr.Flags2 |= SMBFLG2_UNICODE;
-        if (extended_security)
+        if((secFlags & CIFSSEC_MUST_KRB5) == CIFSSEC_MUST_KRB5)
                pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
+        
-        count = strlen(protocols[0].name) + 1;
+        count = 0;
-        strncpy(pSMB->DialectsArray, protocols[0].name, 30);    
+        for(i=0;i<CIFS_NUM_PROT;i++) {
-    /* null guaranteed to be at end of source and target buffers anyway */
+                strncpy(pSMB->DialectsArray+count, protocols[i].name, 16);
+                count += strlen(protocols[i].name) + 1;
+                /* null at end of source and target buffers anyway */
+        }
        pSMB->hdr.smb_buf_length += count;
        pSMB->ByteCount = cpu_to_le16(count);
        rc = SendReceive(xid, ses, (struct smb_hdr *) pSMB,
                         (struct smb_hdr *) pSMBr, &bytes_returned, 0);
-        if (rc == 0) {
+        if (rc != 0) 
-                server->secMode = pSMBr->SecurityMode;
+                goto neg_err_exit;
-                if((server->secMode & SECMODE_USER) == 0)
-                        cFYI(1,("share mode security"));
+        cFYI(1,("Dialect: %d", pSMBr->DialectIndex));
-                server->secType = NTLM; /* BB override default for
+        /* Check wct = 1 error case */
-                                           NTLMv2 or kerberos v5 */
+        if((pSMBr->hdr.WordCount < 13) || (pSMBr->DialectIndex == BAD_PROT)) {
-                /* one byte - no need to convert this or EncryptionKeyLen
+                /* core returns wct = 1, but we do not ask for core - otherwise
-                   from little endian */
+                small wct just comes when dialect index is -1 indicating we 
-                server->maxReq = le16_to_cpu(pSMBr->MaxMpxCount);
+                could not negotiate a common dialect */
-                /* probably no need to store and check maxvcs */
+                rc = -EOPNOTSUPP;
-                server->maxBuf =
+                goto neg_err_exit;
-                        min(le32_to_cpu(pSMBr->MaxBufferSize),
+#ifdef CONFIG_CIFS_WEAK_PW_HASH 
+        } else if((pSMBr->hdr.WordCount == 13)
+                        && (pSMBr->DialectIndex == LANMAN_PROT)) {
+                struct lanman_neg_rsp * rsp = (struct lanman_neg_rsp *)pSMBr;
+                if((secFlags & CIFSSEC_MAY_LANMAN) || 
+                        (secFlags & CIFSSEC_MAY_PLNTXT))
+                        server->secType = LANMAN;
+                else {
+                        cERROR(1, ("mount failed weak security disabled"
+                                   " in /proc/fs/cifs/SecurityFlags"));
+                        rc = -EOPNOTSUPP;
+                        goto neg_err_exit;
+                }       
+                server->secMode = (__u8)le16_to_cpu(rsp->SecurityMode);
+                server->maxReq = le16_to_cpu(rsp->MaxMpxCount);
+                server->maxBuf = min((__u32)le16_to_cpu(rsp->MaxBufSize),
+                                (__u32)CIFSMaxBufSize + MAX_CIFS_HDR_SIZE);
+                GETU32(server->sessid) = le32_to_cpu(rsp->SessionKey);
+                /* even though we do not use raw we might as well set this
+                accurately, in case we ever find a need for it */
+                if((le16_to_cpu(rsp->RawMode) & RAW_ENABLE) == RAW_ENABLE) {
+                        server->maxRw = 0xFF00;
+                        server->capabilities = CAP_MPX_MODE | CAP_RAW_MODE;
+                } else {
+                        server->maxRw = 0;/* we do not need to use raw anyway */
+                        server->capabilities = CAP_MPX_MODE;
+                }
+                server->timeZone = le16_to_cpu(rsp->ServerTimeZone);
+                /* BB get server time for time conversions and add
+                code to use it and timezone since this is not UTC */    
+                if (rsp->EncryptionKeyLength == CIFS_CRYPTO_KEY_SIZE) {
+                        memcpy(server->cryptKey, rsp->EncryptionKey,
+                                CIFS_CRYPTO_KEY_SIZE);
+                } else if (server->secMode & SECMODE_PW_ENCRYPT) {
+                        rc = -EIO; /* need cryptkey unless plain text */
+                        goto neg_err_exit;
+                }
+                cFYI(1,("LANMAN negotiated"));
+                /* we will not end up setting signing flags, since LANMAN had
+                no signing and the server did not return the flags set */
+                goto signing_check;
+#else /* weak security disabled */
+        } else if(pSMBr->hdr.WordCount == 13) {
+                cERROR(1,("mount failed, cifs module not built "
+                          "with CIFS_WEAK_PW_HASH support"));
+                        rc = -EOPNOTSUPP;
+#endif /* WEAK_PW_HASH */
+                goto neg_err_exit;
+        } else if(pSMBr->hdr.WordCount != 17) {
+                /* unknown wct */
+                rc = -EOPNOTSUPP;
+                goto neg_err_exit;
+        }
+        /* else wct == 17 NTLM */
+        server->secMode = pSMBr->SecurityMode;
+        if((server->secMode & SECMODE_USER) == 0)
+                cFYI(1,("share mode security"));
+        if((server->secMode & SECMODE_PW_ENCRYPT) == 0)
+#ifdef CONFIG_CIFS_WEAK_PW_HASH
+                if ((secFlags & CIFSSEC_MAY_PLNTXT) == 0)
+#endif /* CIFS_WEAK_PW_HASH */
+                        cERROR(1,("Server requests plain text password"
+                                  " but client support disabled"));
+        if((secFlags & CIFSSEC_MUST_NTLMV2) == CIFSSEC_MUST_NTLMV2)
+                server->secType = NTLMv2;
+        else if(secFlags & CIFSSEC_MAY_NTLM)
+                server->secType = NTLM;
+        else if(secFlags & CIFSSEC_MAY_NTLMV2)
+                server->secType = NTLMv2;
+        /* else krb5 ... any others ... */
+        /* SecurityMode is one byte, so no need to convert it or
+           EncryptionKeyLength from little endian */
+        server->maxReq = le16_to_cpu(pSMBr->MaxMpxCount);
+        /* probably no need to store and check maxvcs */
+        server->maxBuf = min(le32_to_cpu(pSMBr->MaxBufferSize),
                        (__u32) CIFSMaxBufSize + MAX_CIFS_HDR_SIZE);
-                server->maxRw = le32_to_cpu(pSMBr->MaxRawSize);
+        server->maxRw = le32_to_cpu(pSMBr->MaxRawSize);
-                cFYI(0, ("Max buf = %d", ses->server->maxBuf));
+        cFYI(0, ("Max buf = %d", ses->server->maxBuf));
-                GETU32(ses->server->sessid) = le32_to_cpu(pSMBr->SessionKey);
+        GETU32(ses->server->sessid) = le32_to_cpu(pSMBr->SessionKey);
-                server->capabilities = le32_to_cpu(pSMBr->Capabilities);
+        server->capabilities = le32_to_cpu(pSMBr->Capabilities);
-                server->timeZone = le16_to_cpu(pSMBr->ServerTimeZone);  
+        server->timeZone = le16_to_cpu(pSMBr->ServerTimeZone);  
-        /* BB with UTC do we ever need to be using srvr timezone? */
+        if (pSMBr->EncryptionKeyLength == CIFS_CRYPTO_KEY_SIZE) {
-                if (pSMBr->EncryptionKeyLength == CIFS_CRYPTO_KEY_SIZE) {
+                memcpy(server->cryptKey, pSMBr->u.EncryptionKey,
-                        memcpy(server->cryptKey, pSMBr->u.EncryptionKey,
+                       CIFS_CRYPTO_KEY_SIZE);
-                               CIFS_CRYPTO_KEY_SIZE);
+        } else if ((pSMBr->hdr.Flags2 & SMBFLG2_EXT_SEC)
-                } else if ((pSMBr->hdr.Flags2 & SMBFLG2_EXT_SEC)
+                        && (pSMBr->EncryptionKeyLength == 0)) {
-                           && (pSMBr->EncryptionKeyLength == 0)) {
+                /* decode security blob */
-                        /* decode security blob */
+        } else if (server->secMode & SECMODE_PW_ENCRYPT) {
-                } else
+                rc = -EIO; /* no crypt key only if plain text pwd */
-                        rc = -EIO;
+                goto neg_err_exit;
+        }
-                /* BB might be helpful to save off the domain of server here */
+        /* BB might be helpful to save off the domain of server here */
-                if ((pSMBr->hdr.Flags2 & SMBFLG2_EXT_SEC) && 
+        if ((pSMBr->hdr.Flags2 & SMBFLG2_EXT_SEC) && 
-                        (server->capabilities & CAP_EXTENDED_SECURITY)) {
+                (server->capabilities & CAP_EXTENDED_SECURITY)) {
-                        count = pSMBr->ByteCount;
+                count = pSMBr->ByteCount;
-                        if (count < 16)
+                if (count < 16)
-                                rc = -EIO;
+                        rc = -EIO;
-                        else if (count == 16) {
+                else if (count == 16) {
-                                server->secType = RawNTLMSSP;
+                        server->secType = RawNTLMSSP;
-                                if (server->socketUseCount.counter > 1) {
+                        if (server->socketUseCount.counter > 1) {
-                                        if (memcmp
+                                if (memcmp(server->server_GUID,
-                                                (server->server_GUID,
+                                           pSMBr->u.extended_response.
-                                                pSMBr->u.extended_response.
+                                           GUID, 16) != 0) {
-                                                GUID, 16) != 0) {
+                                        cFYI(1, ("server UID changed"));
-                                                cFYI(1, ("server UID changed"));
-                                                memcpy(server->
-                                                        server_GUID,
-                                                        pSMBr->u.
-                                                        extended_response.
-                                                        GUID, 16);
-                                        }
-                                } else
                                        memcpy(server->server_GUID,
-                                               pSMBr->u.extended_response.
+                                                pSMBr->u.extended_response.GUID,
-                                               GUID, 16);
+                                                16);
-                        } else {
-                                rc = decode_negTokenInit(pSMBr->u.
-                                                         extended_response.
-                                                         SecurityBlob,
-                                                         count - 16,
-                                                         &server->secType);
-                                if(rc == 1) {
-                                /* BB Need to fill struct for sessetup here */
-                                        rc = -EOPNOTSUPP;
-                                } else {
-                                        rc = -EINVAL;
                                }
+                        } else
+                                memcpy(server->server_GUID,
+                                       pSMBr->u.extended_response.GUID, 16);
+                } else {
+                        rc = decode_negTokenInit(pSMBr->u.extended_response.
+                                                 SecurityBlob,
+                                                 count - 16,
+                                                 &server->secType);
+                        if(rc == 1) {
+                        /* BB Need to fill struct for sessetup here */
+                                rc = -EOPNOTSUPP;
+                        } else {
+                                rc = -EINVAL;
                        }
-                } else
-                        server->capabilities &= ~CAP_EXTENDED_SECURITY;
-                if(sign_CIFS_PDUs == FALSE) {        
-                        if(server->secMode & SECMODE_SIGN_REQUIRED)
-                                cERROR(1,
-                                 ("Server requires /proc/fs/cifs/PacketSigningEnabled"));
-                        server->secMode &= ~(SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED);
-                } else if(sign_CIFS_PDUs == 1) {
-                        if((server->secMode & SECMODE_SIGN_REQUIRED) == 0)
-                                server->secMode &= ~(SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED);
                }
-                                
+        } else
+                server->capabilities &= ~CAP_EXTENDED_SECURITY;
+#ifdef CONFIG_CIFS_WEAK_PW_HASH
+signing_check:
+#endif
+        if(sign_CIFS_PDUs == FALSE) {        
+                if(server->secMode & SECMODE_SIGN_REQUIRED)
+                        cERROR(1,("Server requires "
+                                 "/proc/fs/cifs/PacketSigningEnabled to be on"));
+                server->secMode &= 
+                        ~(SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED);
+        } else if(sign_CIFS_PDUs == 1) {
+                if((server->secMode & SECMODE_SIGN_REQUIRED) == 0)
+                        server->secMode &= 
+                                ~(SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED);
+        } else if(sign_CIFS_PDUs == 2) {
+                if((server->secMode & 
+                        (SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED)) == 0) {
+                        cERROR(1,("signing required but server lacks support"));
+                }
        }
-        
+neg_err_exit:   
        cifs_buf_release(pSMB);
+        cFYI(1,("negprot rc %d",rc));
        return rc;
 }
@@ -2239,7 +2360,7 @@ CIFSSMBQueryReparseLinkInfo(const int xid, struct cifsTconInfo *tcon,
                        }
                        symlinkinfo[buflen] = 0; /* just in case so the caller
                                        does not go off the end of the buffer */
-                        cFYI(1,("readlink result - %s ",symlinkinfo));
+                        cFYI(1,("readlink result - %s",symlinkinfo));
                }
        }
 qreparse_out:
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index bae1479318d1..876eb9ef85fe 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -49,8 +49,6 @@
 static DECLARE_COMPLETION(cifsd_complete);
-extern void SMBencrypt(unsigned char *passwd, unsigned char *c8,
-                       unsigned char *p24);
 extern void SMBNTencrypt(unsigned char *passwd, unsigned char *c8,
                         unsigned char *p24);
@@ -70,6 +68,7 @@ struct smb_vol {
        gid_t linux_gid;
        mode_t file_mode;
        mode_t dir_mode;
+        unsigned secFlg;
        unsigned rw:1;
        unsigned retry:1;
        unsigned intr:1;
@@ -83,12 +82,7 @@ struct smb_vol {
        unsigned remap:1;   /* set to remap seven reserved chars in filenames */
        unsigned posix_paths:1;   /* unset to not ask for posix pathnames. */
        unsigned sfu_emul:1;
-        unsigned krb5:1;
-        unsigned ntlm:1;
-        unsigned ntlmv2:1;
        unsigned nullauth:1; /* attempt to authenticate with null user */
-        unsigned sign:1;
-        unsigned seal:1;     /* encrypt */
        unsigned nocase;     /* request case insensitive filenames */
        unsigned nobrl;      /* disable sending byte range locks to srv */
        unsigned int rsize;
@@ -369,21 +363,21 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server)
                        continue;
                if (bigbuf == NULL) {
                        bigbuf = cifs_buf_get();
-                        if(bigbuf == NULL) {
+                        if (!bigbuf) {
-                                cERROR(1,("No memory for large SMB response"));
+                                cERROR(1, ("No memory for large SMB response"));
                                msleep(3000);
                                /* retry will check if exiting */
                                continue;
                        }
-                } else if(isLargeBuf) {
+                } else if (isLargeBuf) {
-                        /* we are reusing a dirtry large buf, clear its start */
+                        /* we are reusing a dirty large buf, clear its start */
                        memset(bigbuf, 0, sizeof (struct smb_hdr));
                }
                if (smallbuf == NULL) {
                        smallbuf = cifs_small_buf_get();
-                        if(smallbuf == NULL) {
+                        if (!smallbuf) {
-                                cERROR(1,("No memory for SMB response"));
+                                cERROR(1, ("No memory for SMB response"));
                                msleep(1000);
                                /* retry will check if exiting */
                                continue;
@@ -403,12 +397,12 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server)
                    kernel_recvmsg(csocket, &smb_msg,
                                 &iov, 1, 4, 0 /* BB see socket.h flags */);
-                if(server->tcpStatus == CifsExiting) {
+                if (server->tcpStatus == CifsExiting) {
                        break;
                } else if (server->tcpStatus == CifsNeedReconnect) {
-                        cFYI(1,("Reconnect after server stopped responding"));
+                        cFYI(1, ("Reconnect after server stopped responding"));
                        cifs_reconnect(server);
-                        cFYI(1,("call to reconnect done"));
+                        cFYI(1, ("call to reconnect done"));
                        csocket = server->ssocket;
                        continue;
                } else if ((length == -ERESTARTSYS) || (length == -EAGAIN)) {
@@ -417,15 +411,15 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server)
                                tcpStatus CifsNeedReconnect if server hung */
                        continue;
                } else if (length <= 0) {
-                        if(server->tcpStatus == CifsNew) {
+                        if (server->tcpStatus == CifsNew) {
-                                cFYI(1,("tcp session abend after SMBnegprot"));
+                                cFYI(1, ("tcp session abend after SMBnegprot"));
                                /* some servers kill the TCP session rather than
                                   returning an SMB negprot error, in which
                                   case reconnecting here is not going to help,
                                   and so simply return error to mount */
                                break;
                        }
-                        if(length == -EINTR) { 
+                        if (!try_to_freeze() && (length == -EINTR)) {
                                cFYI(1,("cifsd thread killed"));
                                break;
                        }
@@ -585,9 +579,11 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server)
                                                /* merge response - fix up 1st*/
                                                if(coalesce_t2(smb_buffer, 
                                                        mid_entry->resp_buf)) {
+                                                        mid_entry->multiRsp = 1;
                                                        break;
                                                } else {
                                                        /* all parts received */
+                                                        mid_entry->multiEnd = 1;
                                                        goto multi_t2_fnd; 
                                                }
                                        } else {
@@ -632,9 +628,14 @@ multi_t2_fnd:
                        wake_up_process(task_to_wake);
                } else if ((is_valid_oplock_break(smb_buffer, server) == FALSE)
                    && (isMultiRsp == FALSE)) {                          
-                        cERROR(1, ("No task to wake, unknown frame rcvd!"));
+                        cERROR(1, ("No task to wake, unknown frame rcvd! NumMids %d", midCount.counter));
                        cifs_dump_mem("Received Data is: ",(char *)smb_buffer,
                                      sizeof(struct smb_hdr));
+#ifdef CONFIG_CIFS_DEBUG2
+                        cifs_dump_detail(smb_buffer);
+                        cifs_dump_mids(server);
+#endif /* CIFS_DEBUG2 */
+                        
                }
        } /* end while !EXITING */
@@ -784,7 +785,6 @@ cifs_parse_mount_options(char *options, const char *devname,struct smb_vol *vol)
        /* vol->retry default is 0 (i.e. "soft" limited retry not hard retry) */
        vol->rw = TRUE;
-        vol->ntlm = TRUE;
        /* default is always to request posix paths. */
        vol->posix_paths = 1;
@@ -915,30 +915,35 @@ cifs_parse_mount_options(char *options, const char *devname,struct smb_vol *vol)
                                cERROR(1,("no security value specified"));
                                continue;
                        } else if (strnicmp(value, "krb5i", 5) == 0) {
-                                vol->sign = 1;
+                                vol->secFlg |= CIFSSEC_MAY_KRB5 | 
-                                vol->krb5 = 1;
+                                        CIFSSEC_MUST_SIGN;
                        } else if (strnicmp(value, "krb5p", 5) == 0) {
-                                /* vol->seal = 1; 
+                                /* vol->secFlg |= CIFSSEC_MUST_SEAL | 
-                                   vol->krb5 = 1; */
+                                        CIFSSEC_MAY_KRB5; */ 
                                cERROR(1,("Krb5 cifs privacy not supported"));
                                return 1;
                        } else if (strnicmp(value, "krb5", 4) == 0) {
-                                vol->krb5 = 1;
+                                vol->secFlg |= CIFSSEC_MAY_KRB5;
                        } else if (strnicmp(value, "ntlmv2i", 7) == 0) {
-                                vol->ntlmv2 = 1;
+                                vol->secFlg |= CIFSSEC_MAY_NTLMV2 |
-                                vol->sign = 1;
+                                        CIFSSEC_MUST_SIGN;
                        } else if (strnicmp(value, "ntlmv2", 6) == 0) {
-                                vol->ntlmv2 = 1;
+                                vol->secFlg |= CIFSSEC_MAY_NTLMV2;
                        } else if (strnicmp(value, "ntlmi", 5) == 0) {
-                                vol->ntlm = 1;
+                                vol->secFlg |= CIFSSEC_MAY_NTLM |
-                                vol->sign = 1;
+                                        CIFSSEC_MUST_SIGN;
                        } else if (strnicmp(value, "ntlm", 4) == 0) {
                                /* ntlm is default so can be turned off too */
-                                vol->ntlm = 1;
+                                vol->secFlg |= CIFSSEC_MAY_NTLM;
                        } else if (strnicmp(value, "nontlm", 6) == 0) {
-                                vol->ntlm = 0;
+                                /* BB is there a better way to do this? */
+                                vol->secFlg |= CIFSSEC_MAY_NTLMV2;
+#ifdef CONFIG_CIFS_WEAK_PW_HASH
+                        } else if (strnicmp(value, "lanman", 6) == 0) {
+                                vol->secFlg |= CIFSSEC_MAY_LANMAN;
+#endif
                        } else if (strnicmp(value, "none", 4) == 0) {
-                                vol->nullauth = 1; 
+                                vol->nullauth = 1;
                        } else {
                                cERROR(1,("bad security option: %s", value));
                                return 1;
@@ -976,7 +981,7 @@ cifs_parse_mount_options(char *options, const char *devname,struct smb_vol *vol)
                        }
                        /* BB are there cases in which a comma can be valid in
                        a domain name and need special handling? */
-                        if (strnlen(value, 65) < 65) {
+                        if (strnlen(value, 256) < 256) {
                                vol->domainname = value;
                                cFYI(1, ("Domain name set"));
                        } else {
@@ -1168,6 +1173,10 @@ cifs_parse_mount_options(char *options, const char *devname,struct smb_vol *vol)
                        vol->no_psx_acl = 0;
                } else if (strnicmp(data, "noacl",5) == 0) {
                        vol->no_psx_acl = 1;
+                } else if (strnicmp(data, "sign",4) == 0) {
+                        vol->secFlg |= CIFSSEC_MUST_SIGN;
+/*              } else if (strnicmp(data, "seal",4) == 0) {
+                        vol->secFlg |= CIFSSEC_MUST_SEAL; */
                } else if (strnicmp(data, "direct",6) == 0) {
                        vol->direct_io = 1;
                } else if (strnicmp(data, "forcedirectio",13) == 0) {
@@ -1762,11 +1771,18 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
                        if (volume_info.username)
                                strncpy(pSesInfo->userName,
                                        volume_info.username,MAX_USERNAME_SIZE);
-                        if (volume_info.domainname)
+                        if (volume_info.domainname) {
-                                strncpy(pSesInfo->domainName,
+                                int len = strlen(volume_info.domainname);
-                                        volume_info.domainname,MAX_USERNAME_SIZE);
+                                pSesInfo->domainName = 
+                                        kmalloc(len + 1, GFP_KERNEL);
+                                if(pSesInfo->domainName)
+                                        strcpy(pSesInfo->domainName,
+                                                volume_info.domainname);
+                        }
                        pSesInfo->linux_uid = volume_info.linux_uid;
+                        pSesInfo->overrideSecFlg = volume_info.secFlg;
                        down(&pSesInfo->sesSem);
+                        /* BB FIXME need to pass vol->secFlgs BB */
                        rc = cifs_setup_session(xid,pSesInfo, cifs_sb->local_nls);
                        up(&pSesInfo->sesSem);
                        if(!rc)
@@ -1980,7 +1996,7 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 static int
 CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses,
-              char session_key[CIFS_SESSION_KEY_SIZE],
+              char session_key[CIFS_SESS_KEY_SIZE],
              const struct nls_table *nls_codepage)
 {
        struct smb_hdr *smb_buffer;
@@ -2038,15 +2054,15 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses,
        pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities);
        pSMB->req_no_secext.CaseInsensitivePasswordLength = 
-                cpu_to_le16(CIFS_SESSION_KEY_SIZE);
+                cpu_to_le16(CIFS_SESS_KEY_SIZE);
        pSMB->req_no_secext.CaseSensitivePasswordLength =
-            cpu_to_le16(CIFS_SESSION_KEY_SIZE);
+            cpu_to_le16(CIFS_SESS_KEY_SIZE);
        bcc_ptr = pByteArea(smb_buffer);
-        memcpy(bcc_ptr, (char *) session_key, CIFS_SESSION_KEY_SIZE);
+        memcpy(bcc_ptr, (char *) session_key, CIFS_SESS_KEY_SIZE);
-        bcc_ptr += CIFS_SESSION_KEY_SIZE;
+        bcc_ptr += CIFS_SESS_KEY_SIZE;
-        memcpy(bcc_ptr, (char *) session_key, CIFS_SESSION_KEY_SIZE);
+        memcpy(bcc_ptr, (char *) session_key, CIFS_SESS_KEY_SIZE);
-        bcc_ptr += CIFS_SESSION_KEY_SIZE;
+        bcc_ptr += CIFS_SESS_KEY_SIZE;
        if (ses->capabilities & CAP_UNICODE) {
                if ((long) bcc_ptr % 2) { /* must be word aligned for Unicode */
@@ -2054,7 +2070,7 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses,
                        bcc_ptr++;
                }
                if(user == NULL)
-                        bytes_returned = 0; /* skill null user */
+                        bytes_returned = 0; /* skip null user */
                else
                        bytes_returned =
                                cifs_strtoUCS((__le16 *) bcc_ptr, user, 100,
@@ -2162,8 +2178,7 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses,
                                if (remaining_words > 0) {
                                        len = UniStrnlen((wchar_t *)bcc_ptr,
                                                         remaining_words-1);
-                                        if(ses->serverNOS)
+                                        kfree(ses->serverNOS);
-                                                kfree(ses->serverNOS);
                                        ses->serverNOS = kzalloc(2 * (len + 1),GFP_KERNEL);
                                        if(ses->serverNOS == NULL)
                                                goto sesssetup_nomem;
@@ -2203,12 +2218,10 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses,
                                        /* if these kcallocs fail not much we
                                           can do, but better to not fail the
                                           sesssetup itself */
-                                        if(ses->serverDomain)
+                                        kfree(ses->serverDomain);
-                                                kfree(ses->serverDomain);
                                        ses->serverDomain =
                                            kzalloc(2, GFP_KERNEL);
-                                        if(ses->serverNOS)
+                                        kfree(ses->serverNOS);
-                                                kfree(ses->serverNOS);
                                        ses->serverNOS =
                                            kzalloc(2, GFP_KERNEL);
                                }
@@ -2217,8 +2230,7 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses,
                                if (((long) bcc_ptr + len) - (long)
                                    pByteArea(smb_buffer_response)
                                            <= BCC(smb_buffer_response)) {
-                                        if(ses->serverOS)
+                                        kfree(ses->serverOS);
-                                                kfree(ses->serverOS);
                                        ses->serverOS = kzalloc(len + 1,GFP_KERNEL);
                                        if(ses->serverOS == NULL)
                                                goto sesssetup_nomem;
@@ -2229,8 +2241,7 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses,
                                        bcc_ptr++;
                                        len = strnlen(bcc_ptr, 1024);
-                                        if(ses->serverNOS)
+                                        kfree(ses->serverNOS);
-                                                kfree(ses->serverNOS);
                                        ses->serverNOS = kzalloc(len + 1,GFP_KERNEL);
                                        if(ses->serverNOS == NULL)
                                                goto sesssetup_nomem;
@@ -2274,292 +2285,6 @@ sesssetup_nomem:	/* do not return an error on nomem for the info strings,
 }
 static int
-CIFSSpnegoSessSetup(unsigned int xid, struct cifsSesInfo *ses,
-                char *SecurityBlob,int SecurityBlobLength,
-                const struct nls_table *nls_codepage)
-{
-        struct smb_hdr *smb_buffer;
-        struct smb_hdr *smb_buffer_response;
-        SESSION_SETUP_ANDX *pSMB;
-        SESSION_SETUP_ANDX *pSMBr;
-        char *bcc_ptr;
-        char *user;
-        char *domain;
-        int rc = 0;
-        int remaining_words = 0;
-        int bytes_returned = 0;
-        int len;
-        __u32 capabilities;
-        __u16 count;
-        cFYI(1, ("In spnego sesssetup "));
-        if(ses == NULL)
-                return -EINVAL;
-        user = ses->userName;
-        domain = ses->domainName;
-        smb_buffer = cifs_buf_get();
-        if (smb_buffer == NULL) {
-                return -ENOMEM;
-        }
-        smb_buffer_response = smb_buffer;
-        pSMBr = pSMB = (SESSION_SETUP_ANDX *) smb_buffer;
-        /* send SMBsessionSetup here */
-        header_assemble(smb_buffer, SMB_COM_SESSION_SETUP_ANDX,
-                        NULL /* no tCon exists yet */ , 12 /* wct */ );
-        smb_buffer->Mid = GetNextMid(ses->server);
-        pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC;
-        pSMB->req.AndXCommand = 0xFF;
-        if(ses->server->maxBuf > 64*1024)
-                ses->server->maxBuf = (64*1023);
-        pSMB->req.MaxBufferSize = cpu_to_le16(ses->server->maxBuf);
-        pSMB->req.MaxMpxCount = cpu_to_le16(ses->server->maxReq);
-        if(ses->server->secMode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
-                smb_buffer->Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
-        capabilities = CAP_LARGE_FILES | CAP_NT_SMBS | CAP_LEVEL_II_OPLOCKS |
-            CAP_EXTENDED_SECURITY;
-        if (ses->capabilities & CAP_UNICODE) {
-                smb_buffer->Flags2 |= SMBFLG2_UNICODE;
-                capabilities |= CAP_UNICODE;
-        }
-        if (ses->capabilities & CAP_STATUS32) {
-                smb_buffer->Flags2 |= SMBFLG2_ERR_STATUS;
-                capabilities |= CAP_STATUS32;
-        }
-        if (ses->capabilities & CAP_DFS) {
-                smb_buffer->Flags2 |= SMBFLG2_DFS;
-                capabilities |= CAP_DFS;
-        }
-        pSMB->req.Capabilities = cpu_to_le32(capabilities);
-        pSMB->req.SecurityBlobLength = cpu_to_le16(SecurityBlobLength);
-        bcc_ptr = pByteArea(smb_buffer);
-        memcpy(bcc_ptr, SecurityBlob, SecurityBlobLength);
-        bcc_ptr += SecurityBlobLength;
-        if (ses->capabilities & CAP_UNICODE) {
-                if ((long) bcc_ptr % 2) {       /* must be word aligned for Unicode strings */
-                        *bcc_ptr = 0;
-                        bcc_ptr++;
-                }
-                bytes_returned =
-                    cifs_strtoUCS((__le16 *) bcc_ptr, user, 100, nls_codepage);
-                bcc_ptr += 2 * bytes_returned;  /* convert num of 16 bit words to bytes */
-                bcc_ptr += 2;   /* trailing null */
-                if (domain == NULL)
-                        bytes_returned =
-                            cifs_strtoUCS((__le16 *) bcc_ptr,
-                                          "CIFS_LINUX_DOM", 32, nls_codepage);
-                else
-                        bytes_returned =
-                            cifs_strtoUCS((__le16 *) bcc_ptr, domain, 64,
-                                          nls_codepage);
-                bcc_ptr += 2 * bytes_returned;
-                bcc_ptr += 2;
-                bytes_returned =
-                    cifs_strtoUCS((__le16 *) bcc_ptr, "Linux version ",
-                                  32, nls_codepage);
-                bcc_ptr += 2 * bytes_returned;
-                bytes_returned =
-                    cifs_strtoUCS((__le16 *) bcc_ptr, system_utsname.release, 32,
-                                  nls_codepage);
-                bcc_ptr += 2 * bytes_returned;
-                bcc_ptr += 2;
-                bytes_returned =
-                    cifs_strtoUCS((__le16 *) bcc_ptr, CIFS_NETWORK_OPSYS,
-                                  64, nls_codepage);
-                bcc_ptr += 2 * bytes_returned;
-                bcc_ptr += 2;
-        } else {
-                strncpy(bcc_ptr, user, 200);
-                bcc_ptr += strnlen(user, 200);
-                *bcc_ptr = 0;
-                bcc_ptr++;
-                if (domain == NULL) {
-                        strcpy(bcc_ptr, "CIFS_LINUX_DOM");
-                        bcc_ptr += strlen("CIFS_LINUX_DOM") + 1;
-                } else {
-                        strncpy(bcc_ptr, domain, 64);
-                        bcc_ptr += strnlen(domain, 64);
-                        *bcc_ptr = 0;
-                        bcc_ptr++;
-                }
-                strcpy(bcc_ptr, "Linux version ");
-                bcc_ptr += strlen("Linux version ");
-                strcpy(bcc_ptr, system_utsname.release);
-                bcc_ptr += strlen(system_utsname.release) + 1;
-                strcpy(bcc_ptr, CIFS_NETWORK_OPSYS);
-                bcc_ptr += strlen(CIFS_NETWORK_OPSYS) + 1;
-        }
-        count = (long) bcc_ptr - (long) pByteArea(smb_buffer);
-        smb_buffer->smb_buf_length += count;
-        pSMB->req.ByteCount = cpu_to_le16(count);
-        rc = SendReceive(xid, ses, smb_buffer, smb_buffer_response,
-                         &bytes_returned, 1);
-        if (rc) {
-/*    rc = map_smb_to_linux_error(smb_buffer_response);  *//* done in SendReceive now */
-        } else if ((smb_buffer_response->WordCount == 3)
-                   || (smb_buffer_response->WordCount == 4)) {
-                __u16 action = le16_to_cpu(pSMBr->resp.Action);
-                __u16 blob_len =
-                    le16_to_cpu(pSMBr->resp.SecurityBlobLength);
-                if (action & GUEST_LOGIN)
-                        cFYI(1, (" Guest login"));      /* BB do we want to set anything in SesInfo struct ? */
-                if (ses) {
-                        ses->Suid = smb_buffer_response->Uid;   /* UID left in wire format (le) */
-                        cFYI(1, ("UID = %d ", ses->Suid));
-                        bcc_ptr = pByteArea(smb_buffer_response);       /* response can have either 3 or 4 word count - Samba sends 3 */
-                        /* BB Fix below to make endian neutral !! */
-                        if ((pSMBr->resp.hdr.WordCount == 3)
-                            || ((pSMBr->resp.hdr.WordCount == 4)
-                                && (blob_len <
-                                    pSMBr->resp.ByteCount))) {
-                                if (pSMBr->resp.hdr.WordCount == 4) {
-                                        bcc_ptr +=
-                                            blob_len;
-                                        cFYI(1,
-                                             ("Security Blob Length %d ",
-                                              blob_len));
-                                }
-                                if (smb_buffer->Flags2 & SMBFLG2_UNICODE) {
-                                        if ((long) (bcc_ptr) % 2) {
-                                                remaining_words =
-                                                    (BCC(smb_buffer_response)
-                                                     - 1) / 2;
-                                                bcc_ptr++;      /* Unicode strings must be word aligned */
-                                        } else {
-                                                remaining_words =
-                                                    BCC
-                                                    (smb_buffer_response) / 2;
-                                        }
-                                        len =
-                                            UniStrnlen((wchar_t *) bcc_ptr,
-                                                       remaining_words - 1);
-/* We look for obvious messed up bcc or strings in response so we do not go off
-   the end since (at least) WIN2K and Windows XP have a major bug in not null
-   terminating last Unicode string in response  */
-                                        if(ses->serverOS)
-                                                kfree(ses->serverOS);
-                                        ses->serverOS =
-                                            kzalloc(2 * (len + 1), GFP_KERNEL);
-                                        cifs_strfromUCS_le(ses->serverOS,
-                                                           (__le16 *)
-                                                           bcc_ptr, len,
-                                                           nls_codepage);
-                                        bcc_ptr += 2 * (len + 1);
-                                        remaining_words -= len + 1;
-                                        ses->serverOS[2 * len] = 0;
-                                        ses->serverOS[1 + (2 * len)] = 0;
-                                        if (remaining_words > 0) {
-                                                len = UniStrnlen((wchar_t *)bcc_ptr,
-                                                                 remaining_words
-                                                                 - 1);
-                                                if(ses->serverNOS)
-                                                        kfree(ses->serverNOS);
-                                                ses->serverNOS =
-                                                    kzalloc(2 * (len + 1),
-                                                            GFP_KERNEL);
-                                                cifs_strfromUCS_le(ses->serverNOS,
-                                                                   (__le16 *)bcc_ptr,
-                                                                   len,
-                                                                   nls_codepage);
-                                                bcc_ptr += 2 * (len + 1);
-                                                ses->serverNOS[2 * len] = 0;
-                                                ses->serverNOS[1 + (2 * len)] = 0;
-                                                remaining_words -= len + 1;
-                                                if (remaining_words > 0) {
-                                                        len = UniStrnlen((wchar_t *) bcc_ptr, remaining_words); 
-                     /* last string not null terminated (e.g.Windows XP/2000) */
-                                                        if(ses->serverDomain)
-                                                                kfree(ses->serverDomain);
-                                                        ses->serverDomain = kzalloc(2*(len+1),GFP_KERNEL);
-                                                        cifs_strfromUCS_le(ses->serverDomain,
-                                                             (__le16 *)bcc_ptr, 
-                                                             len, nls_codepage);
-                                                        bcc_ptr += 2*(len+1);
-                                                        ses->serverDomain[2*len] = 0;
-                                                        ses->serverDomain[1+(2*len)] = 0;
-                                                } /* else no more room so create dummy domain string */
-                                                else {
-                                                        if(ses->serverDomain)
-                                                                kfree(ses->serverDomain);
-                                                        ses->serverDomain =
-                                                            kzalloc(2,GFP_KERNEL);
-                                                }
-                                        } else {/* no room use dummy domain&NOS */
-                                                if(ses->serverDomain)
-                                                        kfree(ses->serverDomain);
-                                                ses->serverDomain = kzalloc(2, GFP_KERNEL);
-                                                if(ses->serverNOS)
-                                                        kfree(ses->serverNOS);
-                                                ses->serverNOS = kzalloc(2, GFP_KERNEL);
-                                        }
-                                } else {        /* ASCII */
-                                        len = strnlen(bcc_ptr, 1024);
-                                        if (((long) bcc_ptr + len) - (long)
-                                            pByteArea(smb_buffer_response)
-                                            <= BCC(smb_buffer_response)) {
-                                                if(ses->serverOS)
-                                                        kfree(ses->serverOS);
-                                                ses->serverOS = kzalloc(len + 1, GFP_KERNEL);
-                                                strncpy(ses->serverOS, bcc_ptr, len);
-                                                bcc_ptr += len;
-                                                bcc_ptr[0] = 0; /* null terminate the string */
-                                                bcc_ptr++;
-                                                len = strnlen(bcc_ptr, 1024);
-                                                if(ses->serverNOS)
-                                                        kfree(ses->serverNOS);
-                                                ses->serverNOS = kzalloc(len + 1,GFP_KERNEL);
-                                                strncpy(ses->serverNOS, bcc_ptr, len);
-                                                bcc_ptr += len;
-                                                bcc_ptr[0] = 0;
-                                                bcc_ptr++;
-                                                len = strnlen(bcc_ptr, 1024);
-                                                if(ses->serverDomain)
-                                                        kfree(ses->serverDomain);
-                                                ses->serverDomain = kzalloc(len + 1, GFP_KERNEL);
-                                                strncpy(ses->serverDomain, bcc_ptr, len);
-                                                bcc_ptr += len;
-                                                bcc_ptr[0] = 0;
-                                                bcc_ptr++;
-                                        } else
-                                                cFYI(1,
-                                                     ("Variable field of length %d extends beyond end of smb ",
-                                                      len));
-                                }
-                        } else {
-                                cERROR(1,
-                                       (" Security Blob Length extends beyond end of SMB"));
-                        }
-                } else {
-                        cERROR(1, ("No session structure passed in."));
-                }
-        } else {
-                cERROR(1,
-                       (" Invalid Word count %d: ",
-                        smb_buffer_response->WordCount));
-                rc = -EIO;
-        }
-        if (smb_buffer)
-                cifs_buf_release(smb_buffer);
-        return rc;
-}
-static int
 CIFSNTLMSSPNegotiateSessSetup(unsigned int xid,
                              struct cifsSesInfo *ses, int * pNTLMv2_flag,
                              const struct nls_table *nls_codepage)
@@ -2635,8 +2360,8 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned int xid,
            /* NTLMSSP_NEGOTIATE_ALWAYS_SIGN | */ NTLMSSP_NEGOTIATE_128;
        if(sign_CIFS_PDUs)
                negotiate_flags |= NTLMSSP_NEGOTIATE_SIGN;
-        if(ntlmv2_support)
+/*      if(ntlmv2_support)
-                negotiate_flags |= NTLMSSP_NEGOTIATE_NTLMV2;
+                negotiate_flags |= NTLMSSP_NEGOTIATE_NTLMV2;*/
        /* setup pointers to domain name and workstation name */
        bcc_ptr += SecurityBlobLength;
@@ -2783,8 +2508,7 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned int xid,
                                                                 bcc_ptr,
                                                                 remaining_words
                                                                 - 1);
-                                                if(ses->serverNOS)
+                                                kfree(ses->serverNOS);
-                                                        kfree(ses->serverNOS);
                                                ses->serverNOS =
                                                    kzalloc(2 * (len + 1),
                                                            GFP_KERNEL);
@@ -2802,8 +2526,7 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned int xid,
                                                if (remaining_words > 0) {
                                                        len = UniStrnlen((wchar_t *) bcc_ptr, remaining_words); 
           /* last string is not always null terminated (for e.g. for Windows XP & 2000) */
-                                                        if(ses->serverDomain)
+                                                        kfree(ses->serverDomain);
-                                                                kfree(ses->serverDomain);
                                                        ses->serverDomain =
                                                            kzalloc(2 *
                                                                    (len +
@@ -2822,19 +2545,16 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned int xid,
                                                            = 0;
                                                } /* else no more room so create dummy domain string */
                                                else {
-                                                        if(ses->serverDomain)
+                                                        kfree(ses->serverDomain);
-                                                                kfree(ses->serverDomain);
                                                        ses->serverDomain =
                                                            kzalloc(2,
                                                                    GFP_KERNEL);
                                                }
                                        } else {        /* no room so create dummy domain and NOS string */
-                                                if(ses->serverDomain);
+                                                kfree(ses->serverDomain);
-                                                        kfree(ses->serverDomain);
                                                ses->serverDomain =
                                                    kzalloc(2, GFP_KERNEL);
-                                                if(ses->serverNOS)
+                                                kfree(ses->serverNOS);
-                                                        kfree(ses->serverNOS);
                                                ses->serverNOS =
                                                    kzalloc(2, GFP_KERNEL);
                                        }
@@ -2856,8 +2576,7 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned int xid,
                                                bcc_ptr++;
                                                len = strnlen(bcc_ptr, 1024);
-                                                if(ses->serverNOS)
+                                                kfree(ses->serverNOS);
-                                                        kfree(ses->serverNOS);
                                                ses->serverNOS =
                                                    kzalloc(len + 1,
                                                            GFP_KERNEL);
@@ -2867,8 +2586,7 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned int xid,
                                                bcc_ptr++;
                                                len = strnlen(bcc_ptr, 1024);
-                                                if(ses->serverDomain)
+                                                kfree(ses->serverDomain);
-                                                        kfree(ses->serverDomain);
                                                ses->serverDomain =
                                                    kzalloc(len + 1,
                                                            GFP_KERNEL);
@@ -2994,14 +2712,14 @@ CIFSNTLMSSPAuthSessSetup(unsigned int xid, struct cifsSesInfo *ses,
        SecurityBlob->LmChallengeResponse.Buffer = 0;
        SecurityBlob->NtChallengeResponse.Length =
-            cpu_to_le16(CIFS_SESSION_KEY_SIZE);
+            cpu_to_le16(CIFS_SESS_KEY_SIZE);
        SecurityBlob->NtChallengeResponse.MaximumLength =
-            cpu_to_le16(CIFS_SESSION_KEY_SIZE);
+            cpu_to_le16(CIFS_SESS_KEY_SIZE);
-        memcpy(bcc_ptr, ntlm_session_key, CIFS_SESSION_KEY_SIZE);
+        memcpy(bcc_ptr, ntlm_session_key, CIFS_SESS_KEY_SIZE);
        SecurityBlob->NtChallengeResponse.Buffer =
            cpu_to_le32(SecurityBlobLength);
-        SecurityBlobLength += CIFS_SESSION_KEY_SIZE;
+        SecurityBlobLength += CIFS_SESS_KEY_SIZE;
-        bcc_ptr += CIFS_SESSION_KEY_SIZE;
+        bcc_ptr += CIFS_SESS_KEY_SIZE;
        if (ses->capabilities & CAP_UNICODE) {
                if (domain == NULL) {
@@ -3190,8 +2908,7 @@ CIFSNTLMSSPAuthSessSetup(unsigned int xid, struct cifsSesInfo *ses,
                                                                 bcc_ptr,
                                                                 remaining_words
                                                                 - 1);
-                                                if(ses->serverNOS)
+                                                kfree(ses->serverNOS);
-                                                        kfree(ses->serverNOS);
                                                ses->serverNOS =
                                                    kzalloc(2 * (len + 1),
                                                            GFP_KERNEL);
@@ -3244,8 +2961,7 @@ CIFSNTLMSSPAuthSessSetup(unsigned int xid, struct cifsSesInfo *ses,
                                                if(ses->serverDomain)
                                                        kfree(ses->serverDomain);
                                                ses->serverDomain = kzalloc(2, GFP_KERNEL);
-                                                if(ses->serverNOS)
+                                                kfree(ses->serverNOS);
-                                                        kfree(ses->serverNOS);
                                                ses->serverNOS = kzalloc(2, GFP_KERNEL);
                                        }
                                } else {        /* ASCII */
@@ -3263,8 +2979,7 @@ CIFSNTLMSSPAuthSessSetup(unsigned int xid, struct cifsSesInfo *ses,
                                                bcc_ptr++;
                                                len = strnlen(bcc_ptr, 1024);
-                                                if(ses->serverNOS)
+                                                kfree(ses->serverNOS);
-                                                        kfree(ses->serverNOS);
                                                ses->serverNOS = kzalloc(len+1,GFP_KERNEL);
                                                strncpy(ses->serverNOS, bcc_ptr, len);  
                                                bcc_ptr += len;
@@ -3340,22 +3055,33 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
        bcc_ptr = &pSMB->Password[0];
        if((ses->server->secMode) & SECMODE_USER) {
                pSMB->PasswordLength = cpu_to_le16(1);  /* minimum */
+                *bcc_ptr = 0; /* password is null byte */
                bcc_ptr++;              /* skip password */
+                /* already aligned so no need to do it below */
        } else {
-                pSMB->PasswordLength = cpu_to_le16(CIFS_SESSION_KEY_SIZE);
+                pSMB->PasswordLength = cpu_to_le16(CIFS_SESS_KEY_SIZE);
                /* BB FIXME add code to fail this if NTLMv2 or Kerberos
                   specified as required (when that support is added to
                   the vfs in the future) as only NTLM or the much
-                   weaker LANMAN (which we do not send) is accepted
+                   weaker LANMAN (which we do not send by default) is accepted
                   by Samba (not sure whether other servers allow
                   NTLMv2 password here) */
+#ifdef CONFIG_CIFS_WEAK_PW_HASH
+                if((extended_security & CIFSSEC_MAY_LANMAN) && 
+                        (ses->server->secType == LANMAN))
+                        calc_lanman_hash(ses, bcc_ptr);
+                else
+#endif /* CONFIG_CIFS_WEAK_PW_HASH */
                SMBNTencrypt(ses->password,
                             ses->server->cryptKey,
                             bcc_ptr);
-                bcc_ptr += CIFS_SESSION_KEY_SIZE;
+                bcc_ptr += CIFS_SESS_KEY_SIZE;
-                *bcc_ptr = 0;
+                if(ses->capabilities & CAP_UNICODE) {
-                bcc_ptr++; /* align */
+                        /* must align unicode strings */
+                        *bcc_ptr = 0; /* null byte password */
+                        bcc_ptr++;
+                }
        }
        if(ses->server->secMode & 
@@ -3429,7 +3155,10 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
                        }
                        /* else do not bother copying these informational fields */
                }
-                tcon->Flags = le16_to_cpu(pSMBr->OptionalSupport);
+                if(smb_buffer_response->WordCount == 3)
+                        tcon->Flags = le16_to_cpu(pSMBr->OptionalSupport);
+                else
+                        tcon->Flags = 0;
                cFYI(1, ("Tcon flags: 0x%x ", tcon->Flags));
        } else if ((rc == 0) && tcon == NULL) {
        /* all we need to save for IPC$ connection */
@@ -3494,7 +3223,7 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo,
                                           struct nls_table * nls_info)
 {
        int rc = 0;
-        char ntlm_session_key[CIFS_SESSION_KEY_SIZE];
+        char ntlm_session_key[CIFS_SESS_KEY_SIZE];
        int ntlmv2_flag = FALSE;
        int first_time = 0;
@@ -3526,20 +3255,13 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo,
                        pSesInfo->server->secMode,
                        pSesInfo->server->capabilities,
                        pSesInfo->server->timeZone));
-#ifdef CONFIG_CIFS_EXPERIMENTAL
+                if(experimEnabled < 2)
-                if(experimEnabled > 1)
+                        rc = CIFS_SessSetup(xid, pSesInfo,
-                        rc = CIFS_SessSetup(xid, pSesInfo, CIFS_NTLM /* type */,
+                                            first_time, nls_info);
-                                            &ntlmv2_flag, nls_info);    
+                else if (extended_security
-                else
-#endif
-                if (extended_security
                                && (pSesInfo->capabilities & CAP_EXTENDED_SECURITY)
                                && (pSesInfo->server->secType == NTLMSSP)) {
-                        cFYI(1, ("New style sesssetup"));
+                        rc = -EOPNOTSUPP;
-                        rc = CIFSSpnegoSessSetup(xid, pSesInfo,
-                                NULL /* security blob */, 
-                                0 /* blob length */,
-                                nls_info);
                } else if (extended_security
                           && (pSesInfo->capabilities & CAP_EXTENDED_SECURITY)
                           && (pSesInfo->server->secType == RawNTLMSSP)) {
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 82315edc77d7..ba4cbe9b0684 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -113,7 +113,7 @@ cifs_bp_rename_retry:
        full_path[namelen+2] = 0;
 BB remove above eight lines BB */
-/* Inode operations in similar order to how they appear in the Linux file fs.h */
+/* Inode operations in similar order to how they appear in Linux file fs.h */
 int
 cifs_create(struct inode *inode, struct dentry *direntry, int mode,
@@ -178,11 +178,14 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
                FreeXid(xid);
                return -ENOMEM;
        }
+        if (cifs_sb->tcon->ses->capabilities & CAP_NT_SMBS) 
-        rc = CIFSSMBOpen(xid, pTcon, full_path, disposition,
+                rc = CIFSSMBOpen(xid, pTcon, full_path, disposition,
                         desiredAccess, CREATE_NOT_DIR,
                         &fileHandle, &oplock, buf, cifs_sb->local_nls,
                         cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+        else
+                rc = -EIO; /* no NT SMB support fall into legacy open below */
        if(rc == -EIO) {
                /* old server, retry the open legacy style */
                rc = SMBLegacyOpen(xid, pTcon, full_path, disposition,
@@ -191,7 +194,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
                        cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
        } 
        if (rc) {
-                cFYI(1, ("cifs_create returned 0x%x ", rc));
+                cFYI(1, ("cifs_create returned 0x%x", rc));
        } else {
                /* If Open reported that we actually created a file
                then we now have to set the mode if possible */
@@ -369,6 +372,10 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
                                         cifs_sb->mnt_cifs_flags & 
                                            CIFS_MOUNT_MAP_SPECIAL_CHR);
+                        /* BB FIXME - add handling for backlevel servers
+                           which need legacy open and check for all
+                           calls to SMBOpen for fallback to 
+                           SMBLegacyOpen */
                        if(!rc) {
                                /* BB Do not bother to decode buf since no
                                   local inode yet to put timestamps in,
diff --git a/fs/cifs/fcntl.c b/fs/cifs/fcntl.c
index 633a93811328..d91a3d44e9e3 100644
--- a/fs/cifs/fcntl.c
+++ b/fs/cifs/fcntl.c
@@ -91,14 +91,14 @@ int cifs_dir_notify(struct file * file, unsigned long arg)
        if(full_path == NULL) {
                rc = -ENOMEM;
        } else {
-                cERROR(1,("cifs dir notify on file %s with arg 0x%lx",full_path,arg)); /* BB removeme BB */
+                cFYI(1,("dir notify on file %s Arg 0x%lx",full_path,arg));
                rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_OPEN, 
                        GENERIC_READ | SYNCHRONIZE, 0 /* create options */,
                        &netfid, &oplock,NULL, cifs_sb->local_nls,
                        cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
                /* BB fixme - add this handle to a notify handle list */
                if(rc) {
-                        cERROR(1,("Could not open directory for notify"));  /* BB remove BB */
+                        cFYI(1,("Could not open directory for notify"));
                } else {
                        filter = convert_to_cifs_notify_flags(arg);
                        if(filter != 0) {
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index e2b4ce1dad66..944d2b9e092d 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -110,7 +110,6 @@ static inline int cifs_open_inode_helper(struct inode *inode, struct file *file,
                         &pCifsInode->openFileList);
        }
        write_unlock(&GlobalSMBSeslock);
-        write_unlock(&file->f_owner.lock);
        if (pCifsInode->clientCanCacheRead) {
                /* we have the inode open somewhere else
                   no need to discard cache data */
@@ -201,7 +200,7 @@ int cifs_open(struct inode *inode, struct file *file)
                } else {
                        if (file->f_flags & O_EXCL)
                                cERROR(1, ("could not find file instance for "
-                                           "new file %p ", file));
+                                           "new file %p", file));
                }
        }
@@ -260,10 +259,15 @@ int cifs_open(struct inode *inode, struct file *file)
                rc = -ENOMEM;
                goto out;
        }
-        rc = CIFSSMBOpen(xid, pTcon, full_path, disposition, desiredAccess,
-                         CREATE_NOT_DIR, &netfid, &oplock, buf,
+        if (cifs_sb->tcon->ses->capabilities & CAP_NT_SMBS)
+                rc = CIFSSMBOpen(xid, pTcon, full_path, disposition, 
+                         desiredAccess, CREATE_NOT_DIR, &netfid, &oplock, buf,
                         cifs_sb->local_nls, cifs_sb->mnt_cifs_flags
                                 & CIFS_MOUNT_MAP_SPECIAL_CHR);
+        else
+                rc = -EIO; /* no NT SMB support, fall into legacy open below */
        if (rc == -EIO) {
                /* Old server, try legacy style OpenX */
                rc = SMBLegacyOpen(xid, pTcon, full_path, disposition,
@@ -272,7 +276,7 @@ int cifs_open(struct inode *inode, struct file *file)
                                & CIFS_MOUNT_MAP_SPECIAL_CHR);
        }
        if (rc) {
-                cFYI(1, ("cifs_open returned 0x%x ", rc));
+                cFYI(1, ("cifs_open returned 0x%x", rc));
                goto out;
        }
        file->private_data =
@@ -282,7 +286,6 @@ int cifs_open(struct inode *inode, struct file *file)
                goto out;
        }
        pCifsFile = cifs_init_private(file->private_data, inode, file, netfid);
-        write_lock(&file->f_owner.lock);
        write_lock(&GlobalSMBSeslock);
        list_add(&pCifsFile->tlist, &pTcon->openFileList);
@@ -293,7 +296,6 @@ int cifs_open(struct inode *inode, struct file *file)
                                            &oplock, buf, full_path, xid);
        } else {
                write_unlock(&GlobalSMBSeslock);
-                write_unlock(&file->f_owner.lock);
        }
        if (oplock & CIFS_CREATE_ACTION) {           
@@ -322,7 +324,7 @@ out:
        return rc;
 }
-/* Try to reaquire byte range locks that were released when session */
+/* Try to reacquire byte range locks that were released when session */
 /* to server was lost */
 static int cifs_relock_file(struct cifsFileInfo *cifsFile)
 {
@@ -409,8 +411,8 @@ static int cifs_reopen_file(struct inode *inode, struct file *file,
                                CIFS_MOUNT_MAP_SPECIAL_CHR);
        if (rc) {
                up(&pCifsFile->fh_sem);
-                cFYI(1, ("cifs_open returned 0x%x ", rc));
+                cFYI(1, ("cifs_open returned 0x%x", rc));
-                cFYI(1, ("oplock: %d ", oplock));
+                cFYI(1, ("oplock: %d", oplock));
        } else {
                pCifsFile->netfid = netfid;
                pCifsFile->invalidHandle = FALSE;
@@ -472,7 +474,6 @@ int cifs_close(struct inode *inode, struct file *file)
        pTcon = cifs_sb->tcon;
        if (pSMBFile) {
                pSMBFile->closePend = TRUE;
-                write_lock(&file->f_owner.lock);
                if (pTcon) {
                        /* no sense reconnecting to close a file that is
                           already closed */
@@ -487,23 +488,18 @@ int cifs_close(struct inode *inode, struct file *file)
                                        the struct would be in each open file,
                                        but this should give enough time to 
                                        clear the socket */
-                                        write_unlock(&file->f_owner.lock);
                                        cERROR(1,("close with pending writes"));
                                        msleep(timeout);
-                                        write_lock(&file->f_owner.lock);
                                        timeout *= 4;
                                } 
-                                write_unlock(&file->f_owner.lock);
                                rc = CIFSSMBClose(xid, pTcon,
                                                  pSMBFile->netfid);
-                                write_lock(&file->f_owner.lock);
                        }
                }
                write_lock(&GlobalSMBSeslock);
                list_del(&pSMBFile->flist);
                list_del(&pSMBFile->tlist);
                write_unlock(&GlobalSMBSeslock);
-                write_unlock(&file->f_owner.lock);
                kfree(pSMBFile->search_resume_name);
                kfree(file->private_data);
                file->private_data = NULL;
@@ -531,7 +527,7 @@ int cifs_closedir(struct inode *inode, struct file *file)
            (struct cifsFileInfo *)file->private_data;
        char *ptmp;
-        cFYI(1, ("Closedir inode = 0x%p with ", inode));
+        cFYI(1, ("Closedir inode = 0x%p", inode));
        xid = GetXid();
@@ -605,7 +601,7 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
        }
        if (pfLock->fl_flags & FL_ACCESS)
                cFYI(1, ("Process suspended by mandatory locking - "
-                         "not implemented yet "));
+                         "not implemented yet"));
        if (pfLock->fl_flags & FL_LEASE)
                cFYI(1, ("Lease on file - not implemented yet"));
        if (pfLock->fl_flags & 
@@ -1079,9 +1075,9 @@ static int cifs_writepages(struct address_space *mapping,
        unsigned int bytes_written;
        struct cifs_sb_info *cifs_sb;
        int done = 0;
-        pgoff_t end = -1;
+        pgoff_t end;
        pgoff_t index;
-        int is_range = 0;
+        int range_whole = 0;
        struct kvec iov[32];
        int len;
        int n_iov = 0;
@@ -1122,16 +1118,14 @@ static int cifs_writepages(struct address_space *mapping,
        xid = GetXid();
        pagevec_init(&pvec, 0);
-        if (wbc->sync_mode == WB_SYNC_NONE)
+        if (wbc->range_cyclic) {
                index = mapping->writeback_index; /* Start from prev offset */
-        else {
+                end = -1;
-                index = 0;
+        } else {
-                scanned = 1;
+                index = wbc->range_start >> PAGE_CACHE_SHIFT;
-        }
+                end = wbc->range_end >> PAGE_CACHE_SHIFT;
-        if (wbc->start || wbc->end) {
+                if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
-                index = wbc->start >> PAGE_CACHE_SHIFT;
+                        range_whole = 1;
-                end = wbc->end >> PAGE_CACHE_SHIFT;
-                is_range = 1;
                scanned = 1;
        }
 retry:
@@ -1167,7 +1161,7 @@ retry:
                                break;
                        }
-                        if (unlikely(is_range) && (page->index > end)) {
+                        if (!wbc->range_cyclic && page->index > end) {
                                done = 1;
                                unlock_page(page);
                                break;
@@ -1271,7 +1265,7 @@ retry:
                index = 0;
                goto retry;
        }
-        if (!is_range)
+        if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
                mapping->writeback_index = index;
        FreeXid(xid);
@@ -1377,7 +1371,7 @@ int cifs_fsync(struct file *file, struct dentry *dentry, int datasync)
        xid = GetXid();
-        cFYI(1, ("Sync file - name: %s datasync: 0x%x ", 
+        cFYI(1, ("Sync file - name: %s datasync: 0x%x", 
                dentry->d_name.name, datasync));
        
        rc = filemap_fdatawrite(inode->i_mapping);
@@ -1406,7 +1400,7 @@ int cifs_fsync(struct file *file, struct dentry *dentry, int datasync)
 /*      fill in rpages then 
        result = cifs_pagein_inode(inode, index, rpages); */ /* BB finish */
-/*      cFYI(1, ("rpages is %d for sync page of Index %ld ", rpages, index));
+/*      cFYI(1, ("rpages is %d for sync page of Index %ld", rpages, index));
 #if 0
        if (rc < 0)
@@ -1419,7 +1413,7 @@ int cifs_fsync(struct file *file, struct dentry *dentry, int datasync)
 * As file closes, flush all cached write data for this inode checking
 * for write behind errors.
 */
-int cifs_flush(struct file *file)
+int cifs_flush(struct file *file, fl_owner_t id)
 {
        struct inode * inode = file->f_dentry->d_inode;
        int rc = 0;
@@ -1838,7 +1832,7 @@ static int cifs_readpage_worker(struct file *file, struct page *page,
        if (rc < 0)
                goto io_error;
        else
-                cFYI(1, ("Bytes read %d ",rc));
+                cFYI(1, ("Bytes read %d",rc));
                                                                                                                           
        file->f_dentry->d_inode->i_atime =
                current_fs_time(file->f_dentry->d_inode->i_sb);
@@ -1948,7 +1942,7 @@ static int cifs_prepare_write(struct file *file, struct page *page,
        return 0;
 }
-struct address_space_operations cifs_addr_ops = {
+const struct address_space_operations cifs_addr_ops = {
        .readpage = cifs_readpage,
        .readpages = cifs_readpages,
        .writepage = cifs_writepage,
@@ -1959,3 +1953,19 @@ struct address_space_operations cifs_addr_ops = {
        /* .sync_page = cifs_sync_page, */
        /* .direct_IO = */
 };
+/*
+ * cifs_readpages requires the server to support a buffer large enough to
+ * contain the header plus one complete page of data.  Otherwise, we need
+ * to leave cifs_readpages out of the address space operations.
+ */
+const struct address_space_operations cifs_addr_ops_smallbuf = {
+        .readpage = cifs_readpage,
+        .writepage = cifs_writepage,
+        .writepages = cifs_writepages,
+        .prepare_write = cifs_prepare_write,
+        .commit_write = cifs_commit_write,
+        .set_page_dirty = __set_page_dirty_nobuffers,
+        /* .sync_page = cifs_sync_page, */
+        /* .direct_IO = */
+};
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 4093764ef461..b88147c1dc27 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -41,7 +41,7 @@ int cifs_get_inode_info_unix(struct inode **pinode,
        char *tmp_path;
        pTcon = cifs_sb->tcon;
-        cFYI(1, ("Getting info on %s ", search_path));
+        cFYI(1, ("Getting info on %s", search_path));
        /* could have done a find first instead but this returns more info */
        rc = CIFSSMBUnixQPathInfo(xid, pTcon, search_path, &findData,
                                  cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
@@ -97,9 +97,9 @@ int cifs_get_inode_info_unix(struct inode **pinode,
                inode = *pinode;
                cifsInfo = CIFS_I(inode);
-                cFYI(1, ("Old time %ld ", cifsInfo->time));
+                cFYI(1, ("Old time %ld", cifsInfo->time));
                cifsInfo->time = jiffies;
-                cFYI(1, ("New time %ld ", cifsInfo->time));
+                cFYI(1, ("New time %ld", cifsInfo->time));
                /* this is ok to set on every inode revalidate */
                atomic_set(&cifsInfo->inUse,1);
@@ -180,11 +180,12 @@ int cifs_get_inode_info_unix(struct inode **pinode,
                        else /* not direct, send byte range locks */ 
                                inode->i_fop = &cifs_file_ops;
-                        inode->i_data.a_ops = &cifs_addr_ops;
                        /* check if server can support readpages */
                        if(pTcon->ses->server->maxBuf < 
-                            4096 + MAX_CIFS_HDR_SIZE)
+                            PAGE_CACHE_SIZE + MAX_CIFS_HDR_SIZE)
-                                inode->i_data.a_ops->readpages = NULL;
+                                inode->i_data.a_ops = &cifs_addr_ops_smallbuf;
+                        else
+                                inode->i_data.a_ops = &cifs_addr_ops;
                } else if (S_ISDIR(inode->i_mode)) {
                        cFYI(1, ("Directory inode"));
                        inode->i_op = &cifs_dir_inode_ops;
@@ -421,23 +422,23 @@ int cifs_get_inode_info(struct inode **pinode,
                inode = *pinode;
                cifsInfo = CIFS_I(inode);
                cifsInfo->cifsAttrs = attr;
-                cFYI(1, ("Old time %ld ", cifsInfo->time));
+                cFYI(1, ("Old time %ld", cifsInfo->time));
                cifsInfo->time = jiffies;
-                cFYI(1, ("New time %ld ", cifsInfo->time));
+                cFYI(1, ("New time %ld", cifsInfo->time));
                /* blksize needs to be multiple of two. So safer to default to
                blksize and blkbits set in superblock so 2**blkbits and blksize
                will match rather than setting to:
                (pTcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE) & 0xFFFFFE00;*/
-                /* Linux can not store file creation time unfortunately so we ignore it */
+                /* Linux can not store file creation time so ignore it */
                inode->i_atime =
                    cifs_NTtimeToUnix(le64_to_cpu(pfindData->LastAccessTime));
                inode->i_mtime =
                    cifs_NTtimeToUnix(le64_to_cpu(pfindData->LastWriteTime));
                inode->i_ctime =
                    cifs_NTtimeToUnix(le64_to_cpu(pfindData->ChangeTime));
-                cFYI(0, ("Attributes came in as 0x%x ", attr));
+                cFYI(0, ("Attributes came in as 0x%x", attr));
                /* set default mode. will override for dirs below */
                if (atomic_read(&cifsInfo->inUse) == 0)
@@ -519,10 +520,11 @@ int cifs_get_inode_info(struct inode **pinode,
                        else /* not direct, send byte range locks */
                                inode->i_fop = &cifs_file_ops;
-                        inode->i_data.a_ops = &cifs_addr_ops;
                        if(pTcon->ses->server->maxBuf < 
-                             4096 + MAX_CIFS_HDR_SIZE)
+                             PAGE_CACHE_SIZE + MAX_CIFS_HDR_SIZE)
-                                inode->i_data.a_ops->readpages = NULL;
+                                inode->i_data.a_ops = &cifs_addr_ops_smallbuf;
+                        else
+                                inode->i_data.a_ops = &cifs_addr_ops;
                } else if (S_ISDIR(inode->i_mode)) {
                        cFYI(1, ("Directory inode"));
                        inode->i_op = &cifs_dir_inode_ops;
@@ -731,7 +733,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
        rc = CIFSSMBMkDir(xid, pTcon, full_path, cifs_sb->local_nls,
                          cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
        if (rc) {
-                cFYI(1, ("cifs_mkdir returned 0x%x ", rc));
+                cFYI(1, ("cifs_mkdir returned 0x%x", rc));
                d_drop(direntry);
        } else {
                inode->i_nlink++;
@@ -798,7 +800,7 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)
        char *full_path = NULL;
        struct cifsInodeInfo *cifsInode;
-        cFYI(1, ("cifs_rmdir, inode = 0x%p with ", inode));
+        cFYI(1, ("cifs_rmdir, inode = 0x%p", inode));
        xid = GetXid();
@@ -1121,7 +1123,7 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs)
        xid = GetXid();
-        cFYI(1, ("In cifs_setattr, name = %s attrs->iavalid 0x%x ",
+        cFYI(1, ("setattr on file %s attrs->iavalid 0x%x",
                 direntry->d_name.name, attrs->ia_valid));
        cifs_sb = CIFS_SB(direntry->d_inode->i_sb);
@@ -1157,6 +1159,7 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs)
                   when the local oplock break takes longer to flush
                   writebehind data than the SMB timeout for the SetPathInfo
                   request would allow */
                open_file = find_writable_file(cifsInode);
                if (open_file) {
                        __u16 nfid = open_file->netfid;
@@ -1289,7 +1292,7 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs)
                it may be useful to Windows - but we do
                not want to set ctime unless some other
                timestamp is changing */
-                cFYI(1, ("CIFS - CTIME changed "));
+                cFYI(1, ("CIFS - CTIME changed"));
                time_buf.ChangeTime =
                    cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_ctime));
        } else
@@ -1356,7 +1359,7 @@ cifs_setattr_exit:
 void cifs_delete_inode(struct inode *inode)
 {
-        cFYI(1, ("In cifs_delete_inode, inode = 0x%p ", inode));
+        cFYI(1, ("In cifs_delete_inode, inode = 0x%p", inode));
        /* may have to add back in if and when safe distributed caching of
           directories added e.g. via FindNotify */
 }
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index 2ec99f833142..a57f5d6e6213 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -167,7 +167,7 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname)
                return -ENOMEM;
        }
-        cFYI(1, ("Full path: %s ", full_path));
+        cFYI(1, ("Full path: %s", full_path));
        cFYI(1, ("symname is %s", symname));
        /* BB what if DFS and this volume is on different share? BB */
@@ -186,8 +186,7 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname)
                                                 inode->i_sb,xid);
                if (rc != 0) {
-                        cFYI(1,
+                        cFYI(1, ("Create symlink ok, getinodeinfo fail rc = %d",
-                             ("Create symlink worked but get_inode_info failed with rc = %d ",
                              rc));
                } else {
                        if (pTcon->nocase)
@@ -289,7 +288,7 @@ cifs_readlink(struct dentry *direntry, char __user *pBuffer, int buflen)
                                        else {
                                                cFYI(1,("num referral: %d",num_referrals));
                                                if(referrals) {
-                                                        cFYI(1,("referral string: %s ",referrals));
+                                                        cFYI(1,("referral string: %s",referrals));
                                                        strncpy(tmpbuffer, referrals, len-1);                            
                                                }
                                        }
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index fafd056426e4..22c937e5884f 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -101,6 +101,7 @@ sesInfoFree(struct cifsSesInfo *buf_to_free)
        kfree(buf_to_free->serverDomain);
        kfree(buf_to_free->serverNOS);
        kfree(buf_to_free->password);
+        kfree(buf_to_free->domainName);
        kfree(buf_to_free);
 }
@@ -499,11 +500,12 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
                if(pSMBr->ByteCount > sizeof(struct file_notify_information)) {
                        data_offset = le32_to_cpu(pSMBr->DataOffset);
-                        pnotify = (struct file_notify_information *)((char *)&pSMBr->hdr.Protocol
+                        pnotify = (struct file_notify_information *)
-                                + data_offset);
+                                ((char *)&pSMBr->hdr.Protocol + data_offset);
-                        cFYI(1,("dnotify on %s with action: 0x%x",pnotify->FileName,
+                        cFYI(1,("dnotify on %s Action: 0x%x",pnotify->FileName,
                                pnotify->Action));  /* BB removeme BB */
-                     /*   cifs_dump_mem("Received notify Data is: ",buf,sizeof(struct smb_hdr)+60); */
+                     /*   cifs_dump_mem("Rcvd notify Data: ",buf,
+                                sizeof(struct smb_hdr)+60); */
                        return TRUE;
                }
                if(pSMBr->hdr.Status.CifsError) {
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index 5de74d216fdd..b66eff5dc624 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -84,11 +84,11 @@ static const struct smb_to_posix_error mapping_table_ERRDOS[] = {
 static const struct smb_to_posix_error mapping_table_ERRSRV[] = {
        {ERRerror, -EIO},
-        {ERRbadpw, -EPERM},
+        {ERRbadpw, -EACCES},  /* was EPERM */
        {ERRbadtype, -EREMOTE},
        {ERRaccess, -EACCES},
        {ERRinvtid, -ENXIO},
-        {ERRinvnetname, -ENODEV},
+        {ERRinvnetname, -ENXIO},
        {ERRinvdevice, -ENXIO},
        {ERRqfull, -ENOSPC},
        {ERRqtoobig, -ENOSPC},
diff --git a/fs/cifs/ntlmssp.c b/fs/cifs/ntlmssp.c
deleted file mode 100644
index 115359cc7a32..000000000000
--- a/fs/cifs/ntlmssp.c
+++ /dev/null
@@ -1,143 +0,0 @@
-/*
- *   fs/cifs/ntlmssp.h
- *
- *   Copyright (c) International Business Machines  Corp., 2006
- *   Author(s): Steve French (sfrench@us.ibm.com)
- *
- *   This library is free software; you can redistribute it and/or modify
- *   it under the terms of the GNU Lesser General Public License as published
- *   by the Free Software Foundation; either version 2.1 of the License, or
- *   (at your option) any later version.
- *
- *   This library is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
- *   the GNU Lesser General Public License for more details.
- *
- *   You should have received a copy of the GNU Lesser General Public License
- *   along with this library; if not, write to the Free Software
- *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-#include "cifspdu.h"
-#include "cifsglob.h"
-#include "cifsproto.h"
-#include "cifs_unicode.h"
-#include "cifs_debug.h"
-#include "ntlmssp.h"
-#include "nterr.h"
-#ifdef CONFIG_CIFS_EXPERIMENTAL
-static __u32 cifs_ssetup_hdr(struct cifsSesInfo *ses, SESSION_SETUP_ANDX *pSMB)
-{
-        __u32 capabilities = 0;
-        /* init fields common to all four types of SessSetup */
-        /* note that header is initialized to zero in header_assemble */
-        pSMB->req.AndXCommand = 0xFF;
-        pSMB->req.MaxBufferSize = cpu_to_le16(ses->server->maxBuf);
-        pSMB->req.MaxMpxCount = cpu_to_le16(ses->server->maxReq);
-        /* Now no need to set SMBFLG_CASELESS or obsolete CANONICAL PATH */
-        /* BB verify whether signing required on neg or just on auth frame 
-           (and NTLM case) */
-        capabilities = CAP_LARGE_FILES | CAP_NT_SMBS | CAP_LEVEL_II_OPLOCKS |
-                        CAP_LARGE_WRITE_X | CAP_LARGE_READ_X;
-        if(ses->server->secMode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
-                pSMB->req.hdr.Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
-        if (ses->capabilities & CAP_UNICODE) {
-                pSMB->req.hdr.Flags2 |= SMBFLG2_UNICODE;
-                capabilities |= CAP_UNICODE;
-        }
-        if (ses->capabilities & CAP_STATUS32) {
-                pSMB->req.hdr.Flags2 |= SMBFLG2_ERR_STATUS;
-                capabilities |= CAP_STATUS32;
-        }
-        if (ses->capabilities & CAP_DFS) {
-                pSMB->req.hdr.Flags2 |= SMBFLG2_DFS;
-                capabilities |= CAP_DFS;
-        }
-        /* BB check whether to init vcnum BB */
-        return capabilities;
-}
-int 
-CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, const int type,
-                  int * pNTLMv2_flg, const struct nls_table *nls_cp)
-{
-        int rc = 0;
-        int wct;
-        struct smb_hdr *smb_buffer;
-        char *bcc_ptr;
-        SESSION_SETUP_ANDX *pSMB;
-        __u32 capabilities;
-        if(ses == NULL)
-                return -EINVAL;
-        cFYI(1,("SStp type: %d",type));
-        if(type < CIFS_NTLM) {
-#ifndef CONFIG_CIFS_WEAK_PW_HASH
-                /* LANMAN and plaintext are less secure and off by default.
-                So we make this explicitly be turned on in kconfig (in the
-                build) and turned on at runtime (changed from the default)
-                in proc/fs/cifs or via mount parm.  Unfortunately this is
-                needed for old Win (e.g. Win95), some obscure NAS and OS/2 */
-                return -EOPNOTSUPP;
-#endif
-                wct = 10; /* lanman 2 style sessionsetup */
-        } else if(type < CIFS_NTLMSSP_NEG)
-                wct = 13; /* old style NTLM sessionsetup */
-        else /* same size for negotiate or auth, NTLMSSP or extended security */
-                wct = 12;
-        rc = small_smb_init_no_tc(SMB_COM_SESSION_SETUP_ANDX, wct, ses,
-                            (void **)&smb_buffer);
-        if(rc)
-                return rc;
-        pSMB = (SESSION_SETUP_ANDX *)smb_buffer;
-        capabilities = cifs_ssetup_hdr(ses, pSMB);
-        bcc_ptr = pByteArea(smb_buffer);
-        if(type > CIFS_NTLM) {
-                pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC;
-                capabilities |= CAP_EXTENDED_SECURITY;
-                pSMB->req.Capabilities = cpu_to_le32(capabilities);
-                /* BB set password lengths */
-        } else if(type < CIFS_NTLM) /* lanman */ {
-                /* no capabilities flags in old lanman negotiation */
-                /* pSMB->old_req.PasswordLength = */ /* BB fixme BB */
-        } else /* type CIFS_NTLM */ {
-                pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities);
-                pSMB->req_no_secext.CaseInsensitivePasswordLength =
-                        cpu_to_le16(CIFS_SESSION_KEY_SIZE);
-                pSMB->req_no_secext.CaseSensitivePasswordLength =
-                        cpu_to_le16(CIFS_SESSION_KEY_SIZE);
-        }
-        /* copy session key */
-        /* if Unicode, align strings to two byte boundary */
-        /* copy user name */ /* BB Do we need to special case null user name? */
-        /* copy domain name */
-        /* copy Linux version */
-        /* copy network operating system name */
-        /* update bcc and smb buffer length */
-/*      rc = SendReceive2(xid, ses, iov, num_iovecs, &resp_buf_type, 0); */
-        /* SMB request buf freed in SendReceive2 */
-        return rc;
-}
-#endif /* CONFIG_CIFS_EXPERIMENTAL */
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index b689c5035124..03bbcb377913 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -21,6 +21,7 @@
 *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */
 #include <linux/fs.h>
+#include <linux/pagemap.h>
 #include <linux/stat.h>
 #include <linux/smp_lock.h>
 #include "cifspdu.h"
@@ -31,8 +32,8 @@
 #include "cifs_fs_sb.h"
 #include "cifsfs.h"
-/* BB fixme - add debug wrappers around this function to disable it fixme BB */
+#ifdef CONFIG_CIFS_DEBUG2
-/* static void dump_cifs_file_struct(struct file *file, char *label)
+static void dump_cifs_file_struct(struct file *file, char *label)
 {
        struct cifsFileInfo * cf;
@@ -53,7 +54,8 @@
                }
                
        }
-} */
+}
+#endif /* DEBUG2 */
 /* Returns one if new inode created (which therefore needs to be hashed) */
 /* Might check in the future if inode number changed so we can rehash inode */
@@ -107,32 +109,52 @@ static int construct_dentry(struct qstr *qstring, struct file *file,
        return rc;
 }
-static void fill_in_inode(struct inode *tmp_inode,
+static void fill_in_inode(struct inode *tmp_inode, int new_buf_type,
-        FILE_DIRECTORY_INFO *pfindData, int *pobject_type, int isNewInode)
+                char * buf, int *pobject_type, int isNewInode)
 {
        loff_t local_size;
        struct timespec local_mtime;
        struct cifsInodeInfo *cifsInfo = CIFS_I(tmp_inode);
        struct cifs_sb_info *cifs_sb = CIFS_SB(tmp_inode->i_sb);
-        __u32 attr = le32_to_cpu(pfindData->ExtFileAttributes);
+        __u32 attr;
-        __u64 allocation_size = le64_to_cpu(pfindData->AllocationSize);
+        __u64 allocation_size;
-        __u64 end_of_file = le64_to_cpu(pfindData->EndOfFile);
+        __u64 end_of_file;
-        cifsInfo->cifsAttrs = attr;
-        cifsInfo->time = jiffies;
        /* save mtime and size */
        local_mtime = tmp_inode->i_mtime;
        local_size  = tmp_inode->i_size;
+        if(new_buf_type) {
+                FILE_DIRECTORY_INFO *pfindData = (FILE_DIRECTORY_INFO *)buf;
+                attr = le32_to_cpu(pfindData->ExtFileAttributes);
+                allocation_size = le64_to_cpu(pfindData->AllocationSize);
+                end_of_file = le64_to_cpu(pfindData->EndOfFile);
+                tmp_inode->i_atime =
+                      cifs_NTtimeToUnix(le64_to_cpu(pfindData->LastAccessTime));
+                tmp_inode->i_mtime =
+                      cifs_NTtimeToUnix(le64_to_cpu(pfindData->LastWriteTime));
+                tmp_inode->i_ctime =
+                      cifs_NTtimeToUnix(le64_to_cpu(pfindData->ChangeTime));
+        } else { /* legacy, OS2 and DOS style */
+                FIND_FILE_STANDARD_INFO * pfindData = 
+                        (FIND_FILE_STANDARD_INFO *)buf;
+                attr = le16_to_cpu(pfindData->Attributes);
+                allocation_size = le32_to_cpu(pfindData->AllocationSize);
+                end_of_file = le32_to_cpu(pfindData->DataSize);
+                tmp_inode->i_atime = CURRENT_TIME;
+                /* tmp_inode->i_mtime =  BB FIXME - add dos time handling
+                tmp_inode->i_ctime = 0;   BB FIXME */
+        }
        /* Linux can not store file creation time unfortunately so ignore it */
-        tmp_inode->i_atime =
-            cifs_NTtimeToUnix(le64_to_cpu(pfindData->LastAccessTime));
+        cifsInfo->cifsAttrs = attr;
-        tmp_inode->i_mtime =
+        cifsInfo->time = jiffies;
-            cifs_NTtimeToUnix(le64_to_cpu(pfindData->LastWriteTime));
-        tmp_inode->i_ctime =
-            cifs_NTtimeToUnix(le64_to_cpu(pfindData->ChangeTime));
        /* treat dos attribute of read-only as read-only mode bit e.g. 555? */
        /* 2767 perms - indicate mandatory locking */
                /* BB fill in uid and gid here? with help from winbind? 
@@ -215,11 +237,13 @@ static void fill_in_inode(struct inode *tmp_inode,
                else
                        tmp_inode->i_fop = &cifs_file_ops;
-                tmp_inode->i_data.a_ops = &cifs_addr_ops;
                if((cifs_sb->tcon) && (cifs_sb->tcon->ses) &&
                   (cifs_sb->tcon->ses->server->maxBuf <
-                        4096 + MAX_CIFS_HDR_SIZE))
+                        PAGE_CACHE_SIZE + MAX_CIFS_HDR_SIZE))
-                        tmp_inode->i_data.a_ops->readpages = NULL;
+                        tmp_inode->i_data.a_ops = &cifs_addr_ops_smallbuf;
+                else
+                        tmp_inode->i_data.a_ops = &cifs_addr_ops;
                if(isNewInode)
                        return; /* No sense invalidating pages for new inode
                                   since have not started caching readahead file
@@ -338,11 +362,12 @@ static void unix_fill_in_inode(struct inode *tmp_inode,
                else
                        tmp_inode->i_fop = &cifs_file_ops;
-                tmp_inode->i_data.a_ops = &cifs_addr_ops;
                if((cifs_sb->tcon) && (cifs_sb->tcon->ses) &&
                   (cifs_sb->tcon->ses->server->maxBuf < 
-                        4096 + MAX_CIFS_HDR_SIZE))
+                        PAGE_CACHE_SIZE + MAX_CIFS_HDR_SIZE))
-                        tmp_inode->i_data.a_ops->readpages = NULL;
+                        tmp_inode->i_data.a_ops = &cifs_addr_ops_smallbuf;
+                else
+                        tmp_inode->i_data.a_ops = &cifs_addr_ops;
                if(isNewInode)
                        return; /* No sense invalidating pages for new inode since we
@@ -415,7 +440,10 @@ static int initiate_cifs_search(const int xid, struct file *file)
 ffirst_retry:
        /* test for Unix extensions */
        if (pTcon->ses->capabilities & CAP_UNIX) {
-                cifsFile->srch_inf.info_level = SMB_FIND_FILE_UNIX; 
+                cifsFile->srch_inf.info_level = SMB_FIND_FILE_UNIX;
+        } else if ((pTcon->ses->capabilities & 
+                        (CAP_NT_SMBS | CAP_NT_FIND)) == 0) {
+                cifsFile->srch_inf.info_level = SMB_FIND_FILE_INFO_STANDARD;
        } else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) {
                cifsFile->srch_inf.info_level = SMB_FIND_FILE_ID_FULL_DIR_INFO;
        } else /* not srvinos - BB fixme add check for backlevel? */ {
@@ -451,12 +479,19 @@ static int cifs_unicode_bytelen(char *str)
        return len << 1;
 }
-static char *nxt_dir_entry(char *old_entry, char *end_of_smb)
+static char *nxt_dir_entry(char *old_entry, char *end_of_smb, int level)
 {
        char * new_entry;
        FILE_DIRECTORY_INFO * pDirInfo = (FILE_DIRECTORY_INFO *)old_entry;
-        new_entry = old_entry + le32_to_cpu(pDirInfo->NextEntryOffset);
+        if(level == SMB_FIND_FILE_INFO_STANDARD) {
+                FIND_FILE_STANDARD_INFO * pfData;
+                pfData = (FIND_FILE_STANDARD_INFO *)pDirInfo;
+                new_entry = old_entry + sizeof(FIND_FILE_STANDARD_INFO) +
+                                pfData->FileNameLength;
+        } else
+                new_entry = old_entry + le32_to_cpu(pDirInfo->NextEntryOffset);
        cFYI(1,("new entry %p old entry %p",new_entry,old_entry));
        /* validate that new_entry is not past end of SMB */
        if(new_entry >= end_of_smb) {
@@ -464,7 +499,10 @@ static char *nxt_dir_entry(char *old_entry, char *end_of_smb)
                      ("search entry %p began after end of SMB %p old entry %p",
                        new_entry, end_of_smb, old_entry)); 
                return NULL;
-        } else if (new_entry + sizeof(FILE_DIRECTORY_INFO) > end_of_smb) {
+        } else if(((level == SMB_FIND_FILE_INFO_STANDARD) &&
+                   (new_entry + sizeof(FIND_FILE_STANDARD_INFO) > end_of_smb)) ||
+                  ((level != SMB_FIND_FILE_INFO_STANDARD) &&
+                   (new_entry + sizeof(FILE_DIRECTORY_INFO) > end_of_smb)))  {
                cERROR(1,("search entry %p extends after end of SMB %p",
                        new_entry, end_of_smb));
                return NULL;
@@ -482,7 +520,7 @@ static int cifs_entry_is_dot(char *current_entry, struct cifsFileInfo *cfile)
        char * filename = NULL;
        int len = 0; 
-        if(cfile->srch_inf.info_level == 0x202) {
+        if(cfile->srch_inf.info_level == SMB_FIND_FILE_UNIX) {
                FILE_UNIX_INFO * pFindData = (FILE_UNIX_INFO *)current_entry;
                filename = &pFindData->FileName[0];
                if(cfile->srch_inf.unicode) {
@@ -491,26 +529,34 @@ static int cifs_entry_is_dot(char *current_entry, struct cifsFileInfo *cfile)
                        /* BB should we make this strnlen of PATH_MAX? */
                        len = strnlen(filename, 5);
                }
-        } else if(cfile->srch_inf.info_level == 0x101) {
+        } else if(cfile->srch_inf.info_level == SMB_FIND_FILE_DIRECTORY_INFO) {
                FILE_DIRECTORY_INFO * pFindData = 
                        (FILE_DIRECTORY_INFO *)current_entry;
                filename = &pFindData->FileName[0];
                len = le32_to_cpu(pFindData->FileNameLength);
-        } else if(cfile->srch_inf.info_level == 0x102) {
+        } else if(cfile->srch_inf.info_level == 
+                        SMB_FIND_FILE_FULL_DIRECTORY_INFO) {
                FILE_FULL_DIRECTORY_INFO * pFindData = 
                        (FILE_FULL_DIRECTORY_INFO *)current_entry;
                filename = &pFindData->FileName[0];
                len = le32_to_cpu(pFindData->FileNameLength);
-        } else if(cfile->srch_inf.info_level == 0x105) {
+        } else if(cfile->srch_inf.info_level ==
+                        SMB_FIND_FILE_ID_FULL_DIR_INFO) {
                SEARCH_ID_FULL_DIR_INFO * pFindData = 
                        (SEARCH_ID_FULL_DIR_INFO *)current_entry;
                filename = &pFindData->FileName[0];
                len = le32_to_cpu(pFindData->FileNameLength);
-        } else if(cfile->srch_inf.info_level == 0x104) {
+        } else if(cfile->srch_inf.info_level == 
+                        SMB_FIND_FILE_BOTH_DIRECTORY_INFO) {
                FILE_BOTH_DIRECTORY_INFO * pFindData = 
                        (FILE_BOTH_DIRECTORY_INFO *)current_entry;
                filename = &pFindData->FileName[0];
                len = le32_to_cpu(pFindData->FileNameLength);
+        } else if(cfile->srch_inf.info_level == SMB_FIND_FILE_INFO_STANDARD) {
+                FIND_FILE_STANDARD_INFO * pFindData =
+                        (FIND_FILE_STANDARD_INFO *)current_entry;
+                filename = &pFindData->FileName[0];
+                len = le32_to_cpu(pFindData->FileNameLength);
        } else {
                cFYI(1,("Unknown findfirst level %d",cfile->srch_inf.info_level));
        }
@@ -597,7 +643,9 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
        . and .. for the root of a drive and for those we need
        to start two entries earlier */
-/*      dump_cifs_file_struct(file, "In fce ");*/
+#ifdef CONFIG_CIFS_DEBUG2
+        dump_cifs_file_struct(file, "In fce ");
+#endif
        if(((index_to_find < cifsFile->srch_inf.index_of_last_entry) && 
             is_dir_changed(file)) || 
           (index_to_find < first_entry_in_buffer)) {
@@ -644,10 +692,12 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
                first_entry_in_buffer = cifsFile->srch_inf.index_of_last_entry
                                        - cifsFile->srch_inf.entries_in_buffer;
                pos_in_buf = index_to_find - first_entry_in_buffer;
-                cFYI(1,("found entry - pos_in_buf %d",pos_in_buf)); 
+                cFYI(1,("found entry - pos_in_buf %d",pos_in_buf));
                for(i=0;(i<(pos_in_buf)) && (current_entry != NULL);i++) {
                        /* go entry by entry figuring out which is first */
-                        current_entry = nxt_dir_entry(current_entry,end_of_smb);
+                        current_entry = nxt_dir_entry(current_entry,end_of_smb,
+                                                cifsFile->srch_inf.info_level);
                }
                if((current_entry == NULL) && (i < pos_in_buf)) {
                        /* BB fixme - check if we should flag this error */
@@ -674,7 +724,7 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
 /* inode num, inode type and filename returned */
 static int cifs_get_name_from_search_buf(struct qstr *pqst,
        char *current_entry, __u16 level, unsigned int unicode,
-        struct cifs_sb_info * cifs_sb, ino_t *pinum)
+        struct cifs_sb_info * cifs_sb, int max_len, ino_t *pinum)
 {
        int rc = 0;
        unsigned int len = 0;
@@ -718,10 +768,22 @@ static int cifs_get_name_from_search_buf(struct qstr *pqst,
                        (FILE_BOTH_DIRECTORY_INFO *)current_entry;
                filename = &pFindData->FileName[0];
                len = le32_to_cpu(pFindData->FileNameLength);
+        } else if(level == SMB_FIND_FILE_INFO_STANDARD) {
+                FIND_FILE_STANDARD_INFO * pFindData =
+                        (FIND_FILE_STANDARD_INFO *)current_entry;
+                filename = &pFindData->FileName[0];
+                /* one byte length, no name conversion */
+                len = (unsigned int)pFindData->FileNameLength;
        } else {
                cFYI(1,("Unknown findfirst level %d",level));
                return -EINVAL;
        }
+        if(len > max_len) {
+                cERROR(1,("bad search response length %d past smb end", len));
+                return -EINVAL;
+        }
        if(unicode) {
                /* BB fixme - test with long names */
                /* Note converted filename can be longer than in unicode */
@@ -741,7 +803,7 @@ static int cifs_get_name_from_search_buf(struct qstr *pqst,
 }
 static int cifs_filldir(char *pfindEntry, struct file *file,
-        filldir_t filldir, void *direntry, char *scratch_buf)
+        filldir_t filldir, void *direntry, char *scratch_buf, int max_len)
 {
        int rc = 0;
        struct qstr qstring;
@@ -777,6 +839,7 @@ static int cifs_filldir(char *pfindEntry, struct file *file,
        rc = cifs_get_name_from_search_buf(&qstring,pfindEntry,
                        pCifsF->srch_inf.info_level,
                        pCifsF->srch_inf.unicode,cifs_sb,
+                        max_len,
                        &inum /* returned */);
        if(rc)
@@ -798,13 +861,16 @@ static int cifs_filldir(char *pfindEntry, struct file *file,
        /* we pass in rc below, indicating whether it is a new inode,
           so we can figure out whether to invalidate the inode cached
           data if the file has changed */
-        if(pCifsF->srch_inf.info_level == SMB_FIND_FILE_UNIX) {
+        if(pCifsF->srch_inf.info_level == SMB_FIND_FILE_UNIX)
                unix_fill_in_inode(tmp_inode,
-                                   (FILE_UNIX_INFO *)pfindEntry,&obj_type, rc);
+                                   (FILE_UNIX_INFO *)pfindEntry,
-        } else {
+                                   &obj_type, rc);
-                fill_in_inode(tmp_inode,
+        else if(pCifsF->srch_inf.info_level == SMB_FIND_FILE_INFO_STANDARD)
-                              (FILE_DIRECTORY_INFO *)pfindEntry,&obj_type, rc);
+                fill_in_inode(tmp_inode, 0 /* old level 1 buffer type */,
-        }
+                                pfindEntry, &obj_type, rc);
+        else
+                fill_in_inode(tmp_inode, 1 /* NT */, pfindEntry, &obj_type, rc);
+        
        
        rc = filldir(direntry,qstring.name,qstring.len,file->f_pos,
                     tmp_inode->i_ino,obj_type);
@@ -864,6 +930,12 @@ static int cifs_save_resume_key(const char *current_entry,
                filename = &pFindData->FileName[0];
                len = le32_to_cpu(pFindData->FileNameLength);
                cifsFile->srch_inf.resume_key = pFindData->FileIndex;
+        } else if(level == SMB_FIND_FILE_INFO_STANDARD) {
+                FIND_FILE_STANDARD_INFO * pFindData =
+                        (FIND_FILE_STANDARD_INFO *)current_entry;
+                filename = &pFindData->FileName[0];
+                /* one byte length, no name conversion */
+                len = (unsigned int)pFindData->FileNameLength;
        } else {
                cFYI(1,("Unknown findfirst level %d",level));
                return -EINVAL;
@@ -884,6 +956,7 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
        int num_to_fill = 0;
        char * tmp_buf = NULL;
        char * end_of_smb;
+        int max_len;
        xid = GetXid();
@@ -909,7 +982,7 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
        case 1:
                if (filldir(direntry, "..", 2, file->f_pos,
                     file->f_dentry->d_parent->d_inode->i_ino, DT_DIR) < 0) {
-                        cERROR(1, ("Filldir for parent dir failed "));
+                        cERROR(1, ("Filldir for parent dir failed"));
                        rc = -ENOMEM;
                        break;
                }
@@ -959,10 +1032,11 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
                        goto rddir2_exit;
                }
                cFYI(1,("loop through %d times filling dir for net buf %p",
-                        num_to_fill,cifsFile->srch_inf.ntwrk_buf_start)); 
+                        num_to_fill,cifsFile->srch_inf.ntwrk_buf_start));
-                end_of_smb = cifsFile->srch_inf.ntwrk_buf_start +
+                max_len = smbCalcSize((struct smb_hdr *)
-                        smbCalcSize((struct smb_hdr *)
+                                cifsFile->srch_inf.ntwrk_buf_start);
-                                    cifsFile->srch_inf.ntwrk_buf_start);
+                end_of_smb = cifsFile->srch_inf.ntwrk_buf_start + max_len;
                /* To be safe - for UCS to UTF-8 with strings loaded
                with the rare long characters alloc more to account for
                such multibyte target UTF-8 characters. cifs_unicode.c,
@@ -977,17 +1051,19 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
                        }
                        /* if buggy server returns . and .. late do
                        we want to check for that here? */
-                        rc = cifs_filldir(current_entry, file, 
+                        rc = cifs_filldir(current_entry, file,
-                                        filldir, direntry,tmp_buf);
+                                        filldir, direntry, tmp_buf, max_len);
                        file->f_pos++;
-                        if(file->f_pos == cifsFile->srch_inf.index_of_last_entry) {
+                        if(file->f_pos == 
+                                cifsFile->srch_inf.index_of_last_entry) {
                                cFYI(1,("last entry in buf at pos %lld %s",
-                                        file->f_pos,tmp_buf)); /* BB removeme BB */
+                                        file->f_pos,tmp_buf));
                                cifs_save_resume_key(current_entry,cifsFile);
                                break;
                        } else 
-                                current_entry = nxt_dir_entry(current_entry,
+                                current_entry = 
-                                                              end_of_smb);
+                                        nxt_dir_entry(current_entry, end_of_smb,
+                                                cifsFile->srch_inf.info_level);
                }
                kfree(tmp_buf);
                break;
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
new file mode 100644
index 000000000000..7202d534ef0b
--- /dev/null
+++ b/fs/cifs/sess.c
@@ -0,0 +1,538 @@
+/*
+ *   fs/cifs/sess.c
+ *
+ *   SMB/CIFS session setup handling routines
+ *
+ *   Copyright (c) International Business Machines  Corp., 2006
+ *   Author(s): Steve French (sfrench@us.ibm.com)
+ *
+ *   This library is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation; either version 2.1 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This library is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License
+ *   along with this library; if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#include "cifspdu.h"
+#include "cifsglob.h"
+#include "cifsproto.h"
+#include "cifs_unicode.h"
+#include "cifs_debug.h"
+#include "ntlmssp.h"
+#include "nterr.h"
+#include <linux/utsname.h>
+extern void SMBNTencrypt(unsigned char *passwd, unsigned char *c8,
+                         unsigned char *p24);
+/*
+ * Set up the SessionSetupAndX request fields that are common to all four
+ * request flavours and work out the capability mask to advertise.
+ *
+ * Writes AndXCommand, MaxBufferSize, MaxMpxCount and Flags2 bits into pSMB
+ * and returns the capability bits; storing them in the request is left to
+ * the caller.  Note that the header is initialized to zero in
+ * header_assemble, so Flags2 only needs bits OR-ed in here.
+ */
+static __u32 cifs_ssetup_hdr(struct cifsSesInfo *ses, SESSION_SETUP_ANDX *pSMB)
+{
+        /* bits offered unconditionally */
+        __u32 caps = CAP_LARGE_FILES | CAP_NT_SMBS | CAP_LEVEL_II_OPLOCKS |
+                     CAP_LARGE_WRITE_X | CAP_LARGE_READ_X;
+        /* no further AndX command is chained after this one */
+        pSMB->req.AndXCommand = 0xFF;
+        pSMB->req.MaxMpxCount = cpu_to_le16(ses->server->maxReq);
+        pSMB->req.MaxBufferSize = cpu_to_le16(ses->server->maxBuf);
+        /* Now no need to set SMBFLG_CASELESS or obsolete CANONICAL PATH */
+        /* BB verify whether signing required on neg or just on auth frame
+           (and NTLM case) */
+        if ((ses->server->secMode & SECMODE_SIGN_REQUIRED) ||
+            (ses->server->secMode & SECMODE_SIGN_ENABLED))
+                pSMB->req.hdr.Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
+        /* mirror each optional session capability into the returned mask,
+           together with its Flags2 bit where one is set for it */
+        if (ses->capabilities & CAP_UNIX)
+                caps |= CAP_UNIX;
+        if (ses->capabilities & CAP_DFS) {
+                caps |= CAP_DFS;
+                pSMB->req.hdr.Flags2 |= SMBFLG2_DFS;
+        }
+        if (ses->capabilities & CAP_STATUS32) {
+                caps |= CAP_STATUS32;
+                pSMB->req.hdr.Flags2 |= SMBFLG2_ERR_STATUS;
+        }
+        if (ses->capabilities & CAP_UNICODE) {
+                caps |= CAP_UNICODE;
+                pSMB->req.hdr.Flags2 |= SMBFLG2_UNICODE;
+        }
+        /* BB check whether to init vcnum BB */
+        return caps;
+}
+/*
+ * Append the Unicode strings of a SessionSetupAndX request at *pbcc_area:
+ * user name, domain name, "Linux version <release>" and the network OS
+ * name, each followed by a two byte terminator.  On return *pbcc_area
+ * points just past the last terminator.
+ *
+ * The caller must word align *pbcc_area before the call and must supply a
+ * buffer big enough for all the strings; no bounds checking is done here.
+ */
+static void unicode_ssetup_strings(char ** pbcc_area, struct cifsSesInfo *ses,
+                            const struct nls_table * nls_cp)
+{
+        char * bcc_ptr = *pbcc_area;
+        int bytes_ret = 0;
+        /* BB FIXME add check that strings total less
+        than 335 or will need to send them as arrays */
+        /* unicode strings, must be word aligned before the call */
+        /* copy user */
+        if(ses->userName == NULL) {
+                /* null user mount: nothing gets converted, so store the
+                   empty string's terminator explicitly instead of skipping
+                   over two bytes of whatever the buffer already held */
+                *bcc_ptr = 0;
+                *(bcc_ptr + 1) = 0;
+        } else { /* 300 should be long enough for any conceivable user name */
+                bytes_ret = cifs_strtoUCS((__le16 *) bcc_ptr, ses->userName,
+                                          300, nls_cp);
+        }
+        /* the conversion result is scaled by two to get a byte offset;
+           presumably cifs_strtoUCS stores the terminator itself - the
+           extra two bytes below only step over it */
+        bcc_ptr += 2 * bytes_ret;
+        bcc_ptr += 2; /* account for null termination */
+        /* copy domain */
+        if(ses->domainName == NULL)
+                bytes_ret = cifs_strtoUCS((__le16 *) bcc_ptr,
+                                          "CIFS_LINUX_DOM", 32, nls_cp);
+        else
+                bytes_ret = cifs_strtoUCS((__le16 *) bcc_ptr, ses->domainName, 
+                                          256, nls_cp);
+        bcc_ptr += 2 * bytes_ret;
+        bcc_ptr += 2;  /* account for null terminator */
+        /* Copy OS version */
+        bytes_ret = cifs_strtoUCS((__le16 *)bcc_ptr, "Linux version ", 32,
+                                  nls_cp);
+        /* no terminator skipped here: the release string is appended
+           directly behind the prefix to form a single string */
+        bcc_ptr += 2 * bytes_ret;
+        bytes_ret = cifs_strtoUCS((__le16 *) bcc_ptr, system_utsname.release,
+                                  32, nls_cp);
+        bcc_ptr += 2 * bytes_ret;
+        bcc_ptr += 2; /* trailing null */
+        bytes_ret = cifs_strtoUCS((__le16 *) bcc_ptr, CIFS_NETWORK_OPSYS,
+                                  32, nls_cp);
+        bcc_ptr += 2 * bytes_ret;
+        bcc_ptr += 2; /* trailing null */
+        *pbcc_area = bcc_ptr;
+}
+/*
+ * Append the ASCII strings of a SessionSetupAndX request at *pbcc_area:
+ * user name, domain name, "Linux version <release>" and the network OS
+ * name, each NUL terminated.  On return *pbcc_area points just past the
+ * last terminator.
+ *
+ * A NULL ses->userName is sent as an empty string.  The caller must supply
+ * a buffer big enough for all the strings; nls_cp is not used here.
+ */
+static void ascii_ssetup_strings(char ** pbcc_area, struct cifsSesInfo *ses,
+                          const struct nls_table * nls_cp)
+{
+        char * bcc_ptr = *pbcc_area;
+        /* copy user */
+        /* BB what about null user mounts - check that we do this BB */
+        if(ses->userName != NULL) {
+                /* 300 should be long enough for any conceivable user name */
+                strncpy(bcc_ptr, ses->userName, 300);
+                /* BB improve check for overflow */
+                /* only measure the name when there is one: the length
+                   lookup must not touch a NULL user name */
+                bcc_ptr += strnlen(ses->userName, 300);
+        }
+        *bcc_ptr = 0;
+        bcc_ptr++; /* account for null termination */
+        /* copy domain */
+        if(ses->domainName == NULL) {
+                strcpy(bcc_ptr, "CIFS_LINUX_DOM");
+                bcc_ptr += 14;  /* strlen(CIFS_LINUX_DOM) */
+        } else {
+                strncpy(bcc_ptr, ses->domainName, 256); 
+                bcc_ptr += strnlen(ses->domainName, 256);
+        }
+        *bcc_ptr = 0;
+        bcc_ptr++;
+        /* BB check for overflow here */
+        /* prefix and release form one string: the release is copied over
+           the prefix's terminator */
+        strcpy(bcc_ptr, "Linux version ");
+        bcc_ptr += strlen("Linux version ");
+        strcpy(bcc_ptr, system_utsname.release);
+        bcc_ptr += strlen(system_utsname.release) + 1;
+        strcpy(bcc_ptr, CIFS_NETWORK_OPSYS);
+        bcc_ptr += strlen(CIFS_NETWORK_OPSYS) + 1;
+        *pbcc_area = bcc_ptr;
+}
+/*
+ * Pull the three Unicode strings (server OS, server network OS, server
+ * domain) out of the byte area of a SessionSetupAndX response and store
+ * converted copies in ses->serverOS, ses->serverNOS and ses->serverDomain,
+ * freeing any previous values.
+ *
+ * pbcc_area: in - start of the string area (it is not updated)
+ * bleft:     number of bytes remaining in the response at *pbcc_area
+ *
+ * Parsing stops quietly at the first string that is not terminated within
+ * the remaining data; strings decoded before that point are kept.  The
+ * function returns 0 in every case.
+ */
+static int decode_unicode_ssetup(char ** pbcc_area, int bleft, struct cifsSesInfo *ses,
+                            const struct nls_table * nls_cp)
+{
+        int rc = 0;
+        int words_left, len;
+        char * data = *pbcc_area;
+        cFYI(1,("bleft %d",bleft));
+        /* word align, if bytes remaining is not even */
+        if(bleft % 2) {
+                bleft--;
+                data++;
+        }
+        words_left = bleft / 2;
+        /* save off server operating system */
+        len = UniStrnlen((wchar_t *) data, words_left);
+/* We look for obvious messed up bcc or strings in response so we do not go off
+   the end since (at least) WIN2K and Windows XP have a major bug in not null
+   terminating last Unicode string in response  */
+        if(len >= words_left)
+                return rc;
+        if(ses->serverOS)
+                kfree(ses->serverOS);
+        /* UTF-8 string will not grow more than four times as big as UCS-16;
+           the extra two bytes leave room for a terminator, which also
+           keeps the buffer non-empty when len is zero */
+        ses->serverOS = kzalloc((4 * len) + 2, GFP_KERNEL);
+        if(ses->serverOS != NULL) {
+                cifs_strfromUCS_le(ses->serverOS, (__le16 *)data, len,
+                                   nls_cp);
+        }
+        data += 2 * (len + 1);
+        words_left -= len + 1;
+        /* save off server network operating system */
+        len = UniStrnlen((wchar_t *) data, words_left);
+        if(len >= words_left)
+                return rc;
+        if(ses->serverNOS)
+                kfree(ses->serverNOS);
+        /* same worst case sizing as for serverOS above */
+        ses->serverNOS = kzalloc((4 * len) + 2, GFP_KERNEL);
+        if(ses->serverNOS != NULL) {
+                cifs_strfromUCS_le(ses->serverNOS, (__le16 *)data, len,
+                                   nls_cp);
+                if(strncmp(ses->serverNOS, "NT LAN Manager 4",16) == 0) {
+                        cFYI(1,("NT4 server"));
+                        ses->flags |= CIFS_SES_NT4;
+                }
+        }
+        data += 2 * (len + 1);
+        words_left -= len + 1;
+        /* save off server domain */
+        len = UniStrnlen((wchar_t *) data, words_left);
+        /* the last string may legitimately run to the very end of the
+           data (see the note above), so len == words_left is accepted */
+        if(len > words_left)
+                return rc;
+        if(ses->serverDomain)
+                kfree(ses->serverDomain);
+        /* sized like the other two strings; this is never smaller than
+           the 2 * (len + 1) bytes the explicit terminator stores below
+           rely on */
+        ses->serverDomain = kzalloc((4 * len) + 2, GFP_KERNEL);
+        if(ses->serverDomain != NULL) {
+                cifs_strfromUCS_le(ses->serverDomain, (__le16 *)data, len,
+                                   nls_cp);
+                ses->serverDomain[2*len] = 0;
+                ses->serverDomain[(2*len) + 1] = 0;
+        }
+        data += 2 * (len + 1);
+        words_left -= len + 1;
+        cFYI(1,("words left: %d",words_left));
+        return rc;
+}
+/*
+ * Pull the three NUL terminated ASCII strings (server OS, server network
+ * OS, server domain) out of the byte area of a SessionSetupAndX response
+ * and store copies in ses->serverOS, ses->serverNOS and ses->serverDomain,
+ * freeing any previous values.
+ *
+ * pbcc_area: in - start of the string area (it is not updated)
+ * bleft:     number of bytes remaining in the response at *pbcc_area
+ *
+ * Parsing stops quietly at the first of the leading two strings that is
+ * not terminated within the remaining data; the final string is allowed
+ * to run to the end of the data.  nls_cp is not used.  The function
+ * returns 0 in every case.
+ */
+static int decode_ascii_ssetup(char ** pbcc_area, int bleft, struct cifsSesInfo *ses,
+                            const struct nls_table * nls_cp)
+{
+        int rc = 0;
+        int len;
+        char * bcc_ptr = *pbcc_area;
+        cFYI(1,("decode sessetup ascii. bleft %d", bleft));
+        /* server operating system */
+        len = strnlen(bcc_ptr, bleft);
+        if(len >= bleft)
+                return rc;
+        if(ses->serverOS)
+                kfree(ses->serverOS);
+        /* kzalloc of len + 1 keeps the copy NUL terminated */
+        ses->serverOS = kzalloc(len + 1, GFP_KERNEL);
+        if(ses->serverOS)
+                strncpy(ses->serverOS, bcc_ptr, len);
+        bcc_ptr += len + 1;
+        bleft -= len + 1;
+        /* server network operating system */
+        len = strnlen(bcc_ptr, bleft);
+        if(len >= bleft)
+                return rc;
+        if(ses->serverNOS)
+                kfree(ses->serverNOS);
+        ses->serverNOS = kzalloc(len + 1, GFP_KERNEL);
+        if(ses->serverNOS)
+                strncpy(ses->serverNOS, bcc_ptr, len);
+        bcc_ptr += len + 1;
+        bleft -= len + 1;
+        /* server domain */
+        len = strnlen(bcc_ptr, bleft);
+        if(len > bleft)
+                return rc;
+        if(ses->serverDomain)
+                kfree(ses->serverDomain);
+        ses->serverDomain = kzalloc(len + 1, GFP_KERNEL);
+        /* the domain goes into the buffer just allocated for it, not
+           into serverOS, whose buffer was sized for a different string */
+        if(ses->serverDomain)
+                strncpy(ses->serverDomain, bcc_ptr, len);
+        bcc_ptr += len + 1;
+        bleft -= len + 1;
+        cFYI(1,("ascii: bytes left %d",bleft));
+        return rc;
+}
+/*
+ * Send an SMB SessionSetupAndX request for ses and digest the response.
+ *
+ * The request goes out as two pieces: the fixed length part built in a
+ * small SMB buffer, and a separately allocated 2000 byte area that holds
+ * the authentication response plus the user/domain/OS strings.
+ *
+ * xid:        request id handed through to SendReceive2
+ * ses:        session being set up; ses->server->secType selects the
+ *             request flavour (LANMAN, NTLM, NTLMv2, or extended security)
+ * first_time: on the NTLM path, nonzero means the MAC signing key is
+ *             calculated as well
+ * nls_cp:     code page used for string conversion
+ *
+ * Returns 0 on success, otherwise a negative errno: -EINVAL (NULL ses or a
+ * security blob length larger than the response), -EOPNOTSUPP (LANMAN
+ * requested without CONFIG_CIFS_WEAK_PW_HASH), -ENOMEM, -EIO (unexpected
+ * response word count), or the error from small_smb_init_no_tc or
+ * SendReceive2.
+ */
+int 
+CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
+                const struct nls_table *nls_cp)
+{
+        int rc = 0;
+        int wct;
+        struct smb_hdr *smb_buf;
+        char *bcc_ptr;
+        char *str_area;
+        SESSION_SETUP_ANDX *pSMB;
+        __u32 capabilities;
+        int count;
+        int resp_buf_type = 0;
+        struct kvec iov[2];
+        enum securityEnum type;
+        __u16 action;
+        int bytes_remaining;
+        if(ses == NULL)
+                return -EINVAL;
+        type = ses->server->secType;
+        cFYI(1,("sess setup type %d",type));
+        if(type == LANMAN) {
+#ifndef CONFIG_CIFS_WEAK_PW_HASH
+                /* LANMAN and plaintext are less secure and off by default.
+                So we make this explicitly be turned on in kconfig (in the
+                build) and turned on at runtime (changed from the default)
+                in proc/fs/cifs or via mount parm.  Unfortunately this is
+                needed for old Win (e.g. Win95), some obscure NAS and OS/2 */
+                return -EOPNOTSUPP;
+#endif
+                wct = 10; /* lanman 2 style sessionsetup */
+        } else if((type == NTLM) || (type == NTLMv2)) { 
+                /* For NTLMv2 failures eventually may need to retry NTLM */
+                wct = 13; /* old style NTLM sessionsetup */
+        } else /* same size for negotiate or auth, NTLMSSP or extended security */
+                wct = 12;
+        rc = small_smb_init_no_tc(SMB_COM_SESSION_SETUP_ANDX, wct, ses,
+                            (void **)&smb_buf);
+        if(rc)
+                return rc;
+        pSMB = (SESSION_SETUP_ANDX *)smb_buf;
+        capabilities = cifs_ssetup_hdr(ses, pSMB);
+        /* we will send the SMB in two pieces,
+        a fixed length beginning part, and a
+        second part which will include the strings
+        and rest of bcc area, in order to avoid having
+        to do a large buffer 17K allocation */
+        iov[0].iov_base = (char *)pSMB;
+        iov[0].iov_len = smb_buf->smb_buf_length + 4;
+        /* 2000 big enough to fit max user, domain, NOS name etc. */
+        str_area = kmalloc(2000, GFP_KERNEL);
+        if(str_area == NULL) {
+                /* nothing has been sent yet, so the request buffer is
+                   still ours to give back */
+                cifs_small_buf_release(smb_buf);
+                return -ENOMEM;
+        }
+        bcc_ptr = str_area;
+        if(type == LANMAN) {
+#ifdef CONFIG_CIFS_WEAK_PW_HASH
+                char lnm_session_key[CIFS_SESS_KEY_SIZE];
+                /* no capabilities flags in old lanman negotiation */
+                pSMB->old_req.PasswordLength = CIFS_SESS_KEY_SIZE; 
+                /* BB calculate hash with password */
+                /* and copy into bcc */
+                calc_lanman_hash(ses, lnm_session_key);
+/* #ifdef CONFIG_CIFS_DEBUG2
+                cifs_dump_mem("cryptkey: ",ses->server->cryptKey,
+                        CIFS_SESS_KEY_SIZE);
+#endif */
+                memcpy(bcc_ptr, (char *)lnm_session_key, CIFS_SESS_KEY_SIZE);
+                bcc_ptr += CIFS_SESS_KEY_SIZE;
+                /* can not sign if LANMAN negotiated so no need
+                to calculate signing key? but what if server
+                changed to do higher than lanman dialect and
+                we reconnected would we ever calc signing_key? */
+                cFYI(1,("Negotiating LANMAN setting up strings"));
+                /* Unicode not allowed for LANMAN dialects */
+                ascii_ssetup_strings(&bcc_ptr, ses, nls_cp);
+#endif    
+        } else if (type == NTLM) {
+                char ntlm_session_key[CIFS_SESS_KEY_SIZE];
+                pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities);
+                pSMB->req_no_secext.CaseInsensitivePasswordLength =
+                        cpu_to_le16(CIFS_SESS_KEY_SIZE);
+                pSMB->req_no_secext.CaseSensitivePasswordLength =
+                        cpu_to_le16(CIFS_SESS_KEY_SIZE);
+                /* calculate session key */
+                SMBNTencrypt(ses->password, ses->server->cryptKey,
+                             ntlm_session_key);
+                if(first_time) /* should this be moved into common code 
+                                  with similar ntlmv2 path? */
+                        cifs_calculate_mac_key(ses->server->mac_signing_key,
+                                ntlm_session_key, ses->password);
+                /* copy session key: it is sent twice, once for each of
+                   the two password fields whose lengths were set above */
+                memcpy(bcc_ptr, (char *)ntlm_session_key,CIFS_SESS_KEY_SIZE);
+                bcc_ptr += CIFS_SESS_KEY_SIZE;
+                memcpy(bcc_ptr, (char *)ntlm_session_key,CIFS_SESS_KEY_SIZE);
+                bcc_ptr += CIFS_SESS_KEY_SIZE;
+                if(ses->capabilities & CAP_UNICODE) {
+                        /* unicode strings must be word aligned */
+                        if (iov[0].iov_len % 2) {
+                                *bcc_ptr = 0;
+                                bcc_ptr++;              
+                        }       
+                        unicode_ssetup_strings(&bcc_ptr, ses, nls_cp);
+                } else
+                        ascii_ssetup_strings(&bcc_ptr, ses, nls_cp);
+        } else if (type == NTLMv2) {
+                char * v2_sess_key = 
+                        kmalloc(sizeof(struct ntlmv2_resp), GFP_KERNEL);
+                /* BB FIXME change all users of v2_sess_key to
+                   struct ntlmv2_resp */
+                if(v2_sess_key == NULL) {
+                        cifs_small_buf_release(smb_buf);
+                        /* the string area was allocated above and must
+                           not be leaked on this early return */
+                        kfree(str_area);
+                        return -ENOMEM;
+                }
+                pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities);
+                /* LM2 password would be here if we supported it */
+                pSMB->req_no_secext.CaseInsensitivePasswordLength = 0;
+                /*      cpu_to_le16(LM2_SESS_KEY_SIZE); */
+                pSMB->req_no_secext.CaseSensitivePasswordLength =
+                        cpu_to_le16(sizeof(struct ntlmv2_resp));
+                /* calculate session key */
+                setup_ntlmv2_rsp(ses, v2_sess_key, nls_cp);
+                /* The signing key calculation is not implemented yet, so
+                   the first_time test stays disabled together with it;
+                   left active it would make the copy of the response
+                   below conditional on first_time */
+                /* if(first_time) */ /* should this be moved into common code
+                                  with similar ntlmv2 path? */
+                /*   cifs_calculate_ntlmv2_mac_key(ses->server->mac_signing_key,
+                                response BB FIXME, v2_sess_key); */
+                /* copy session key */
+        /*      memcpy(bcc_ptr, (char *)ntlm_session_key,LM2_SESS_KEY_SIZE);
+                bcc_ptr += LM2_SESS_KEY_SIZE; */
+                memcpy(bcc_ptr, (char *)v2_sess_key, sizeof(struct ntlmv2_resp));
+                bcc_ptr += sizeof(struct ntlmv2_resp);
+                kfree(v2_sess_key);
+                if(ses->capabilities & CAP_UNICODE) {
+                        /* unicode strings must be word aligned: pad only
+                           when the fixed part has odd length, as on the
+                           NTLM path */
+                        if(iov[0].iov_len % 2) {
+                                *bcc_ptr = 0;
+                                bcc_ptr++;
+                        }
+                        unicode_ssetup_strings(&bcc_ptr, ses, nls_cp);
+                } else
+                        ascii_ssetup_strings(&bcc_ptr, ses, nls_cp);
+        } else /* NTLMSSP or SPNEGO */ {
+                pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC;
+                capabilities |= CAP_EXTENDED_SECURITY;
+                pSMB->req.Capabilities = cpu_to_le32(capabilities);
+                /* BB set password lengths */
+        }
+        /* byte count is however far bcc_ptr moved through the string area */
+        count = (long) bcc_ptr - (long) str_area;
+        smb_buf->smb_buf_length += count;
+        BCC_LE(smb_buf) = cpu_to_le16(count);
+        iov[1].iov_base = str_area;
+        iov[1].iov_len = count; 
+        rc = SendReceive2(xid, ses, iov, 2 /* num_iovecs */, &resp_buf_type, 0);
+        /* SMB request buf freed in SendReceive2 */
+        cFYI(1,("ssetup rc from sendrecv2 is %d",rc));
+        if(rc)
+                goto ssetup_exit;
+        /* from here on iov[0] refers to the response */
+        pSMB = (SESSION_SETUP_ANDX *)iov[0].iov_base;
+        smb_buf = (struct smb_hdr *)iov[0].iov_base;
+        if((smb_buf->WordCount != 3) && (smb_buf->WordCount != 4)) {
+                rc = -EIO;
+                cERROR(1,("bad word count %d", smb_buf->WordCount));
+                goto ssetup_exit;
+        }
+        action = le16_to_cpu(pSMB->resp.Action);
+        if (action & GUEST_LOGIN)
+                cFYI(1, ("Guest login")); /* BB mark SesInfo struct? */
+        ses->Suid = smb_buf->Uid;   /* UID left in wire format (le) */
+        cFYI(1, ("UID = %d ", ses->Suid));
+        /* response can have either 3 or 4 word count - Samba sends 3 */
+        /* and lanman response is 3 */
+        bytes_remaining = BCC(smb_buf);
+        bcc_ptr = pByteArea(smb_buf);
+        if(smb_buf->WordCount == 4) {
+                __u16 blob_len;
+                blob_len = le16_to_cpu(pSMB->resp.SecurityBlobLength);
+                /* validate the length before stepping over the blob */
+                if(blob_len > bytes_remaining) {
+                        cERROR(1,("bad security blob length %d", blob_len));
+                        rc = -EINVAL;
+                        goto ssetup_exit;
+                }
+                bcc_ptr += blob_len;
+                bytes_remaining -= blob_len;
+        }       
+        /* BB check if Unicode and decode strings */
+        if(smb_buf->Flags2 & SMBFLG2_UNICODE)
+                rc = decode_unicode_ssetup(&bcc_ptr, bytes_remaining,
+                                                   ses, nls_cp);
+        else
+                rc = decode_ascii_ssetup(&bcc_ptr, bytes_remaining, ses,nls_cp);
+ssetup_exit:
+        kfree(str_area);
+        /* give back the response buffer, if SendReceive2 handed us one */
+        if(resp_buf_type == CIFS_SMALL_BUFFER) {
+                cFYI(1,("ssetup freeing small buf %p", iov[0].iov_base));
+                cifs_small_buf_release(iov[0].iov_base);
+        } else if(resp_buf_type == CIFS_LARGE_BUFFER)
+                cifs_buf_release(iov[0].iov_base);
+        return rc;
+}
diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c
index 6103bcdfb16d..f518c5e45035 100644
--- a/fs/cifs/smbencrypt.c
+++ b/fs/cifs/smbencrypt.c
@@ -30,6 +30,7 @@
 #include <linux/random.h>
 #include "cifs_unicode.h"
 #include "cifspdu.h"
+#include "cifsglob.h"
 #include "md5.h"
 #include "cifs_debug.h"
 #include "cifsencrypt.h"
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 3da80409466c..17ba329e2b3d 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -654,8 +654,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
        if (in_buf->smb_buf_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) {
                up(&ses->server->tcpSem);
-                cERROR(1,
+                cERROR(1, ("Illegal length, greater than maximum frame, %d",
-                       ("Illegal length, greater than maximum frame, %d ",
                        in_buf->smb_buf_length));
                DeleteMidQEntry(midQ);
                /* If not lock req, update # of requests on wire to server */
diff --git a/fs/coda/file.c b/fs/coda/file.c
index 7c2642431fa5..cc66c681bd11 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -164,7 +164,7 @@ int coda_open(struct inode *coda_inode, struct file *coda_file)
        return 0;
 }
-int coda_flush(struct file *coda_file)
+int coda_flush(struct file *coda_file, fl_owner_t id)
 {
        unsigned short flags = coda_file->f_flags & ~O_EXCL;
        unsigned short coda_flags = coda_flags_to_cflags(flags);
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index ada1a81df6bd..87f1dc8aa24b 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -36,7 +36,7 @@
 /* VFS super_block ops */
 static void coda_clear_inode(struct inode *);
 static void coda_put_super(struct super_block *);
-static int coda_statfs(struct super_block *sb, struct kstatfs *buf);
+static int coda_statfs(struct dentry *dentry, struct kstatfs *buf);
 static kmem_cache_t * coda_inode_cachep;
@@ -278,13 +278,13 @@ struct inode_operations coda_file_inode_operations = {
        .setattr        = coda_setattr,
 };
-static int coda_statfs(struct super_block *sb, struct kstatfs *buf)
+static int coda_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
        int error;
        
        lock_kernel();
-        error = venus_statfs(sb, buf);
+        error = venus_statfs(dentry, buf);
        unlock_kernel();
@@ -307,10 +307,10 @@ static int coda_statfs(struct super_block *sb, struct kstatfs *buf)
 /* init_coda: used by filesystems.c to register coda */
-static struct super_block *coda_get_sb(struct file_system_type *fs_type,
+static int coda_get_sb(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data)
+        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-        return get_sb_nodev(fs_type, flags, data, coda_fill_super);
+        return get_sb_nodev(fs_type, flags, data, coda_fill_super, mnt);
 }
 struct file_system_type coda_fs_type = {
diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index 6c6771db36da..803aacf0d49c 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -28,7 +28,6 @@
 #include <linux/delay.h>
 #include <linux/skbuff.h>
 #include <linux/proc_fs.h>
-#include <linux/devfs_fs_kernel.h>
 #include <linux/vmalloc.h>
 #include <linux/fs.h>
 #include <linux/file.h>
@@ -259,7 +258,7 @@ static ssize_t coda_psdev_read(struct file * file, char __user * buf,
        /* If request was not a signal, enqueue and don't free */
        if (!(req->uc_flags & REQ_ASYNC)) {
                req->uc_flags |= REQ_READ;
-                list_add(&(req->uc_chain), vcp->vc_processing.prev);
+                list_add_tail(&(req->uc_chain), &vcp->vc_processing);
                goto out;
        }
@@ -365,22 +364,12 @@ static int init_coda_psdev(void)
                err = PTR_ERR(coda_psdev_class);
                goto out_chrdev;
        }               
-        devfs_mk_dir ("coda");
+        for (i = 0; i < MAX_CODADEVS; i++)
-        for (i = 0; i < MAX_CODADEVS; i++) {
                class_device_create(coda_psdev_class, NULL,
                                MKDEV(CODA_PSDEV_MAJOR,i), NULL, "cfs%d", i);
-                err = devfs_mk_cdev(MKDEV(CODA_PSDEV_MAJOR, i),
-                                S_IFCHR|S_IRUSR|S_IWUSR, "coda/%d", i);
-                if (err)
-                        goto out_class;
-        }
        coda_sysctl_init();
        goto out;
-out_class:
-        for (i = 0; i < MAX_CODADEVS; i++) 
-                class_device_destroy(coda_psdev_class, MKDEV(CODA_PSDEV_MAJOR, i));
-        class_destroy(coda_psdev_class);
 out_chrdev:
        unregister_chrdev(CODA_PSDEV_MAJOR, "coda");
 out:
@@ -419,12 +408,9 @@ static int __init init_coda(void)
        }
        return 0;
 out:
-        for (i = 0; i < MAX_CODADEVS; i++) {
+        for (i = 0; i < MAX_CODADEVS; i++)
                class_device_destroy(coda_psdev_class, MKDEV(CODA_PSDEV_MAJOR, i));
-                devfs_remove("coda/%d", i);
-        }
        class_destroy(coda_psdev_class);
-        devfs_remove("coda");
        unregister_chrdev(CODA_PSDEV_MAJOR, "coda");
        coda_sysctl_clean();
 out1:
@@ -441,12 +427,9 @@ static void __exit exit_coda(void)
        if ( err != 0 ) {
                printk("coda: failed to unregister filesystem\n");
        }
-        for (i = 0; i < MAX_CODADEVS; i++) {
+        for (i = 0; i < MAX_CODADEVS; i++)
                class_device_destroy(coda_psdev_class, MKDEV(CODA_PSDEV_MAJOR, i));
-                devfs_remove("coda/%d", i);
-        }
        class_destroy(coda_psdev_class);
-        devfs_remove("coda");
        unregister_chrdev(CODA_PSDEV_MAJOR, "coda");
        coda_sysctl_clean();
        coda_destroy_inodecache();
diff --git a/fs/coda/symlink.c b/fs/coda/symlink.c
index b35e5bbd9c99..76e00a65a75b 100644
--- a/fs/coda/symlink.c
+++ b/fs/coda/symlink.c
@@ -50,6 +50,6 @@ fail:
        return error;
 }
-struct address_space_operations coda_symlink_aops = {
+const struct address_space_operations coda_symlink_aops = {
        .readpage       = coda_symlink_filler,
 };
diff --git a/fs/coda/sysctl.c b/fs/coda/sysctl.c
index f0b10757288f..1c82e9a7d7c8 100644
--- a/fs/coda/sysctl.c
+++ b/fs/coda/sysctl.c
@@ -11,7 +11,6 @@
 *
 */
-#include <linux/config.h>
 #include <linux/time.h>
 #include <linux/mm.h>
 #include <linux/sysctl.h>
diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c
index 1bae99650a91..a5b5e631ba61 100644
--- a/fs/coda/upcall.c
+++ b/fs/coda/upcall.c
@@ -611,7 +611,7 @@ int venus_pioctl(struct super_block *sb, struct CodaFid *fid,
        return error;
 }
-int venus_statfs(struct super_block *sb, struct kstatfs *sfs) 
+int venus_statfs(struct dentry *dentry, struct kstatfs *sfs)
 { 
        union inputArgs *inp;
        union outputArgs *outp;
@@ -620,7 +620,7 @@ int venus_statfs(struct super_block *sb, struct kstatfs *sfs)
        insize = max_t(unsigned int, INSIZE(statfs), OUTSIZE(statfs));
        UPARG(CODA_STATFS);
-        error = coda_upcall(coda_sbp(sb), insize, &outsize, inp);
+        error = coda_upcall(coda_sbp(dentry->d_sb), insize, &outsize, inp);
        
        if (!error) {
                sfs->f_blocks = outp->coda_statfs.stat.f_blocks;
@@ -725,7 +725,7 @@ static int coda_upcall(struct coda_sb_info *sbi,
        ((union inputArgs *)buffer)->ih.unique = req->uc_unique;
        /* Append msg to pending queue and poke Venus. */
-        list_add(&(req->uc_chain), vcommp->vc_pending.prev);
+        list_add_tail(&(req->uc_chain), &vcommp->vc_pending);
        
        wake_up_interruptible(&vcommp->vc_waitq);
        /* We can be interrupted while we wait for Venus to process
diff --git a/fs/compat.c b/fs/compat.c
index b1f64786a613..e31e9cf96647 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -55,6 +55,20 @@
 extern void sigset_from_compat(sigset_t *set, compat_sigset_t *compat);
+int compat_log = 1; /* gate for compat_printk(): zero suppresses its output */
+int compat_printk(const char *fmt, ...) /* printf-style; gated on compat_log */
+{
+        va_list ap;
+        int ret;
+        if (!compat_log) /* logging switched off: print nothing, report 0 */
+                return 0;
+        va_start(ap, fmt);
+        ret = vprintk(fmt, ap); /* forward format and varargs unchanged */
+        va_end(ap);
+        return ret; /* whatever vprintk() returned */
+}
 /*
 * Not all architectures have sys_utime, so implement this in terms
 * of sys_utimes.
@@ -197,7 +211,7 @@ asmlinkage long compat_sys_statfs(const char __user *path, struct compat_statfs
        error = user_path_walk(path, &nd);
        if (!error) {
                struct kstatfs tmp;
-                error = vfs_statfs(nd.dentry->d_inode->i_sb, &tmp);
+                error = vfs_statfs(nd.dentry, &tmp);
                if (!error)
                        error = put_compat_statfs(buf, &tmp);
                path_release(&nd);
@@ -215,7 +229,7 @@ asmlinkage long compat_sys_fstatfs(unsigned int fd, struct compat_statfs __user
        file = fget(fd);
        if (!file)
                goto out;
-        error = vfs_statfs(file->f_dentry->d_inode->i_sb, &tmp);
+        error = vfs_statfs(file->f_dentry, &tmp);
        if (!error)
                error = put_compat_statfs(buf, &tmp);
        fput(file);
@@ -265,7 +279,7 @@ asmlinkage long compat_sys_statfs64(const char __user *path, compat_size_t sz, s
        error = user_path_walk(path, &nd);
        if (!error) {
                struct kstatfs tmp;
-                error = vfs_statfs(nd.dentry->d_inode->i_sb, &tmp);
+                error = vfs_statfs(nd.dentry, &tmp);
                if (!error)
                        error = put_compat_statfs64(buf, &tmp);
                path_release(&nd);
@@ -286,7 +300,7 @@ asmlinkage long compat_sys_fstatfs64(unsigned int fd, compat_size_t sz, struct c
        file = fget(fd);
        if (!file)
                goto out;
-        error = vfs_statfs(file->f_dentry->d_inode->i_sb, &tmp);
+        error = vfs_statfs(file->f_dentry, &tmp);
        if (!error)
                error = put_compat_statfs64(buf, &tmp);
        fput(file);
@@ -359,7 +373,7 @@ static void compat_ioctl_error(struct file *filp, unsigned int fd,
        sprintf(buf,"'%c'", (cmd>>24) & 0x3f);
        if (!isprint(buf[1]))
                sprintf(buf, "%02x", buf[1]);
-        printk("ioctl32(%s:%d): Unknown cmd fd(%d) "
+        compat_printk("ioctl32(%s:%d): Unknown cmd fd(%d) "
                        "cmd(%08x){%s} arg(%08x) on %s\n",
                        current->comm, current->pid,
                        (int)fd, (unsigned int)cmd, buf,
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index d2c38875ab29..4063a9396977 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -10,7 +10,6 @@
 * ioctls.
 */
-#include <linux/config.h>
 #include <linux/types.h>
 #include <linux/compat.h>
 #include <linux/kernel.h>
@@ -44,7 +43,6 @@
 #include <linux/loop.h>
 #include <linux/auto_fs.h>
 #include <linux/auto_fs4.h>
-#include <linux/devfs_fs.h>
 #include <linux/tty.h>
 #include <linux/vt_kern.h>
 #include <linux/fb.h>
@@ -80,6 +78,7 @@
 #include <net/bluetooth/rfcomm.h>
 #include <linux/capi.h>
+#include <linux/gigaset_dev.h>
 #include <scsi/scsi.h>
 #include <scsi/scsi_ioctl.h>
@@ -205,38 +204,6 @@ static int do_ext3_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg)
        return sys_ioctl(fd, cmd, (unsigned long)compat_ptr(arg));
 }
-struct compat_dmx_event {
-        dmx_event_t     event;
-        compat_time_t   timeStamp;
-        union
-        {
-                dmx_scrambling_status_t scrambling;
-        } u;
-};
-static int do_dmx_get_event(unsigned int fd, unsigned int cmd, unsigned long arg)
-{
-        struct dmx_event kevent;
-        mm_segment_t old_fs = get_fs();
-        int err;
-        set_fs(KERNEL_DS);
-        err = sys_ioctl(fd, cmd, (unsigned long) &kevent);
-        set_fs(old_fs);
-        if (!err) {
-                struct compat_dmx_event __user *up = compat_ptr(arg);
-                err  = put_user(kevent.event, &up->event);
-                err |= put_user(kevent.timeStamp, &up->timeStamp);
-                err |= put_user(kevent.u.scrambling, &up->u.scrambling);
-                if (err)
-                        err = -EFAULT;
-        }
-        return err;
-}
 struct compat_video_event {
        int32_t         type;
        compat_time_t   timestamp;
@@ -2964,7 +2931,6 @@ HANDLE_IOCTL(NCP_IOC_SETPRIVATEDATA_32, do_ncp_setprivatedata)
 #endif
 /* dvb */
-HANDLE_IOCTL(DMX_GET_EVENT, do_dmx_get_event)
 HANDLE_IOCTL(VIDEO_GET_EVENT, do_video_get_event)
 HANDLE_IOCTL(VIDEO_STILLPICTURE, do_video_stillpicture)
 HANDLE_IOCTL(VIDEO_SET_SPU_PALETTE, do_video_set_spu_palette)
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 5f952187fc53..df025453dd97 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -211,7 +211,7 @@ static void remove_dir(struct dentry * d)
        struct configfs_dirent * sd;
        sd = d->d_fsdata;
-        list_del_init(&sd->s_sibling);
+        list_del_init(&sd->s_sibling);
        configfs_put(sd);
        if (d->d_inode)
                simple_rmdir(parent->d_inode,d);
@@ -330,7 +330,7 @@ static int configfs_detach_prep(struct dentry *dentry)
                        ret = configfs_detach_prep(sd->s_dentry);
                        if (!ret)
-                                continue;
+                                continue;
                } else
                        ret = -ENOTEMPTY;
@@ -931,7 +931,7 @@ int configfs_rename_dir(struct config_item * item, const char *new_name)
        new_dentry = lookup_one_len(new_name, parent, strlen(new_name));
        if (!IS_ERR(new_dentry)) {
-                if (!new_dentry->d_inode) {
+                if (!new_dentry->d_inode) {
                        error = config_item_set_name(item, "%s", new_name);
                        if (!error) {
                                d_add(new_dentry, NULL);
@@ -1009,8 +1009,7 @@ static int configfs_readdir(struct file * filp, void * dirent, filldir_t filldir
                        /* fallthrough */
                default:
                        if (filp->f_pos == 2) {
-                                list_del(q);
+                                list_move(q, &parent_sd->s_children);
-                                list_add(q, &parent_sd->s_children);
                        }
                        for (p=q->next; p!= &parent_sd->s_children; p=p->next) {
                                struct configfs_dirent *next;
@@ -1033,8 +1032,7 @@ static int configfs_readdir(struct file * filp, void * dirent, filldir_t filldir
                                                 dt_type(next)) < 0)
                                        return 0;
-                                list_del(q);
+                                list_move(q, p);
-                                list_add(q, p);
                                p = q;
                                filp->f_pos++;
                        }
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index c153bd9534cb..e14488ca6411 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -38,7 +38,7 @@
 extern struct super_block * configfs_sb;
-static struct address_space_operations configfs_aops = {
+static const struct address_space_operations configfs_aops = {
        .readpage       = simple_readpage,
        .prepare_write  = simple_prepare_write,
        .commit_write   = simple_commit_write
diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
index f920d30478e5..3e5fe843e1df 100644
--- a/fs/configfs/mount.c
+++ b/fs/configfs/mount.c
@@ -103,10 +103,10 @@ static int configfs_fill_super(struct super_block *sb, void *data, int silent)
        return 0;
 }
-static struct super_block *configfs_get_sb(struct file_system_type *fs_type,
+static int configfs_get_sb(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data)
+        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-        return get_sb_single(fs_type, flags, data, configfs_fill_super);
+        return get_sb_single(fs_type, flags, data, configfs_fill_super, mnt);
 }
 static struct file_system_type configfs_fs_type = {
@@ -118,7 +118,7 @@ static struct file_system_type configfs_fs_type = {
 int configfs_pin_fs(void)
 {
-        return simple_pin_fs("configfs", &configfs_mount,
+        return simple_pin_fs(&configfs_fs_type, &configfs_mount,
                             &configfs_mnt_count);
 }
diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c
index e5512e295cf2..fb65e0800a86 100644
--- a/fs/configfs/symlink.c
+++ b/fs/configfs/symlink.c
@@ -66,7 +66,7 @@ static void fill_item_path(struct config_item * item, char * buffer, int length)
 }
 static int create_link(struct config_item *parent_item,
-                       struct config_item *item,
+                       struct config_item *item,
                       struct dentry *dentry)
 {
        struct configfs_dirent *target_sd = item->ci_dentry->d_fsdata;
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 9efcc3a164e8..223c0431042d 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -30,7 +30,7 @@
 static struct super_operations cramfs_ops;
 static struct inode_operations cramfs_dir_inode_operations;
 static const struct file_operations cramfs_directory_operations;
-static struct address_space_operations cramfs_aops;
+static const struct address_space_operations cramfs_aops;
 static DEFINE_MUTEX(read_mutex);
@@ -181,9 +181,7 @@ static void *cramfs_read(struct super_block *sb, unsigned int offset, unsigned i
                struct page *page = NULL;
                if (blocknr + i < devsize) {
-                        page = read_cache_page(mapping, blocknr + i,
+                        page = read_mapping_page(mapping, blocknr + i, NULL);
-                                (filler_t *)mapping->a_ops->readpage,
-                                NULL);
                        /* synchronous error? */
                        if (IS_ERR(page))
                                page = NULL;
@@ -322,8 +320,10 @@ out:
        return -EINVAL;
 }
-static int cramfs_statfs(struct super_block *sb, struct kstatfs *buf)
+static int cramfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
+        struct super_block *sb = dentry->d_sb;
        buf->f_type = CRAMFS_MAGIC;
        buf->f_bsize = PAGE_CACHE_SIZE;
        buf->f_blocks = CRAMFS_SB(sb)->blocks;
@@ -501,7 +501,7 @@ static int cramfs_readpage(struct file *file, struct page * page)
        return 0;
 }
-static struct address_space_operations cramfs_aops = {
+static const struct address_space_operations cramfs_aops = {
        .readpage = cramfs_readpage
 };
@@ -528,10 +528,11 @@ static struct super_operations cramfs_ops = {
        .statfs         = cramfs_statfs,
 };
-static struct super_block *cramfs_get_sb(struct file_system_type *fs_type,
+static int cramfs_get_sb(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data)
+        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-        return get_sb_bdev(fs_type, flags, dev_name, data, cramfs_fill_super);
+        return get_sb_bdev(fs_type, flags, dev_name, data, cramfs_fill_super,
+                           mnt);
 }
 static struct file_system_type cramfs_fs_type = {
diff --git a/fs/dcache.c b/fs/dcache.c
index 940d188e5d14..c6e3535be192 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -14,7 +14,6 @@
 * the dcache entry is deleted or garbage collected.
 */
-#include <linux/config.h>
 #include <linux/syscalls.h>
 #include <linux/string.h>
 #include <linux/mm.h>
@@ -359,12 +358,13 @@ restart:
 }
 /*
- * Throw away a dentry - free the inode, dput the parent.
+ * Throw away a dentry - free the inode, dput the parent.  This requires that
- * This requires that the LRU list has already been
+ * the dentry has already been removed from the LRU list.
- * removed.
+ *
 * Called with dcache_lock, drops it and then regains.
+ * Called with dentry->d_lock held, drops it.
 */
-static inline void prune_one_dentry(struct dentry * dentry)
+static void prune_one_dentry(struct dentry * dentry)
 {
        struct dentry * parent;
@@ -382,6 +382,8 @@ static inline void prune_one_dentry(struct dentry * dentry)
 /**
 * prune_dcache - shrink the dcache
 * @count: number of entries to try and free
+ * @sb: if given, ignore dentries for other superblocks
+ *         which are being unmounted.
 *
 * Shrink the dcache. This is done when we need
 * more memory, or simply when we need to unmount
@@ -392,16 +394,29 @@ static inline void prune_one_dentry(struct dentry * dentry)
 * all the dentries are in use.
 */
 
-static void prune_dcache(int count)
+static void prune_dcache(int count, struct super_block *sb)
 {
        spin_lock(&dcache_lock);
        for (; count ; count--) {
                struct dentry *dentry;
                struct list_head *tmp;
+                struct rw_semaphore *s_umount;
                cond_resched_lock(&dcache_lock);
                tmp = dentry_unused.prev;
+                if (sb) {
+                        /* Try to find a dentry for this sb, but don't try
+                         * too hard, if they aren't near the tail they will
+                         * be moved down again soon
+                         */
+                        int skip = count;
+                        while (skip && tmp != &dentry_unused &&
+                            list_entry(tmp, struct dentry, d_lru)->d_sb != sb) {
+                                skip--;
+                                tmp = tmp->prev;
+                        }
+                }
                if (tmp == &dentry_unused)
                        break;
                list_del_init(tmp);
@@ -427,7 +442,45 @@ static void prune_dcache(int count)
                        spin_unlock(&dentry->d_lock);
                        continue;
                }
-                prune_one_dentry(dentry);
+                /*
+                 * If the dentry is not DCACHE_REFERENCED, it is time
+                 * to remove it from the dcache, provided the super block is
+                 * NULL (which means we are trying to reclaim memory)
+                 * or this dentry belongs to the same super block that
+                 * we want to shrink.
+                 */
+                /*
+                 * If this dentry is for "my" filesystem, then I can prune it
+                 * without taking the s_umount lock (I already hold it).
+                 */
+                if (sb && dentry->d_sb == sb) {
+                        prune_one_dentry(dentry);
+                        continue;
+                }
+                /*
+                 * ...otherwise we need to be sure this filesystem isn't being
+                 * unmounted, otherwise we could race with
+                 * generic_shutdown_super(), and end up holding a reference to
+                 * an inode while the filesystem is unmounted.
+                 * So we try to get s_umount, and make sure s_root isn't NULL.
+                 * (Take a local copy of s_umount to avoid a use-after-free of
+                 * `dentry').
+                 */
+                s_umount = &dentry->d_sb->s_umount;
+                if (down_read_trylock(s_umount)) {
+                        if (dentry->d_sb->s_root != NULL) {
+                                prune_one_dentry(dentry);
+                                up_read(s_umount);
+                                continue;
+                        }
+                        up_read(s_umount);
+                }
+                spin_unlock(&dentry->d_lock);
+                /* Cannot remove the first dentry, and it isn't appropriate
+                 * to move it to the head of the list, so give up, and try
+                 * later
+                 */
+                break;
        }
        spin_unlock(&dcache_lock);
 }
@@ -468,8 +521,7 @@ void shrink_dcache_sb(struct super_block * sb)
                dentry = list_entry(tmp, struct dentry, d_lru);
                if (dentry->d_sb != sb)
                        continue;
-                list_del(tmp);
+                list_move(tmp, &dentry_unused);
-                list_add(tmp, &dentry_unused);
        }
        /*
@@ -584,7 +636,7 @@ resume:
                 * of the unused list for prune_dcache
                 */
                if (!atomic_read(&dentry->d_count)) {
-                        list_add(&dentry->d_lru, dentry_unused.prev);
+                        list_add_tail(&dentry->d_lru, &dentry_unused);
                        dentry_stat.nr_unused++;
                        found++;
                }
@@ -630,46 +682,7 @@ void shrink_dcache_parent(struct dentry * parent)
        int found;
        while ((found = select_parent(parent)) != 0)
-                prune_dcache(found);
+                prune_dcache(found, parent->d_sb);
-}
-/**
- * shrink_dcache_anon - further prune the cache
- * @head: head of d_hash list of dentries to prune
- *
- * Prune the dentries that are anonymous
- *
- * parsing d_hash list does not hlist_for_each_entry_rcu() as it
- * done under dcache_lock.
- *
- */
-void shrink_dcache_anon(struct hlist_head *head)
-{
-        struct hlist_node *lp;
-        int found;
-        do {
-                found = 0;
-                spin_lock(&dcache_lock);
-                hlist_for_each(lp, head) {
-                        struct dentry *this = hlist_entry(lp, struct dentry, d_hash);
-                        if (!list_empty(&this->d_lru)) {
-                                dentry_stat.nr_unused--;
-                                list_del_init(&this->d_lru);
-                        }
-                        /* 
-                         * move only zero ref count dentries to the end 
-                         * of the unused list for prune_dcache
-                         */
-                        if (!atomic_read(&this->d_count)) {
-                                list_add_tail(&this->d_lru, &dentry_unused);
-                                dentry_stat.nr_unused++;
-                                found++;
-                        }
-                }
-                spin_unlock(&dcache_lock);
-                prune_dcache(found);
-        } while(found);
 }
 /*
@@ -689,7 +702,7 @@ static int shrink_dcache_memory(int nr, gfp_t gfp_mask)
        if (nr) {
                if (!(gfp_mask & __GFP_FS))
                        return -1;
-                prune_dcache(nr);
+                prune_dcache(nr, NULL);
        }
        return (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure;
 }
diff --git a/fs/dcookies.c b/fs/dcookies.c
index 8749339bf4f6..0c4b0674854b 100644
--- a/fs/dcookies.c
+++ b/fs/dcookies.c
@@ -12,7 +12,6 @@
 * to the pair and can be looked up from userspace.
 */
-#include <linux/config.h>
 #include <linux/syscalls.h>
 #include <linux/module.h>
 #include <linux/slab.h>
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 66a505422e5c..39640fd03458 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -13,7 +13,6 @@
 *
 */
-#include <linux/config.h>
 #include <linux/module.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index b55b4ea9a676..e8ae3042b806 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -16,7 +16,6 @@
 /* uncomment to get debug messages from the debug filesystem, ah the irony. */
 /* #define DEBUG */
-#include <linux/config.h>
 #include <linux/module.h>
 #include <linux/fs.h>
 #include <linux/mount.h>
@@ -111,11 +110,11 @@ static int debug_fill_super(struct super_block *sb, void *data, int silent)
        return simple_fill_super(sb, DEBUGFS_MAGIC, debug_files);
 }
-static struct super_block *debug_get_sb(struct file_system_type *fs_type,
+static int debug_get_sb(struct file_system_type *fs_type,
-                                        int flags, const char *dev_name,
+                        int flags, const char *dev_name,
-                                        void *data)
+                        void *data, struct vfsmount *mnt)
 {
-        return get_sb_single(fs_type, flags, data, debug_fill_super);
+        return get_sb_single(fs_type, flags, data, debug_fill_super, mnt);
 }
 static struct file_system_type debug_fs_type = {
@@ -199,7 +198,7 @@ struct dentry *debugfs_create_file(const char *name, mode_t mode,
        pr_debug("debugfs: creating file '%s'\n",name);
-        error = simple_pin_fs("debugfs", &debugfs_mount, &debugfs_mount_count);
+        error = simple_pin_fs(&debug_fs_type, &debugfs_mount, &debugfs_mount_count);
        if (error)
                goto exit;
diff --git a/fs/devfs/Makefile b/fs/devfs/Makefile
deleted file mode 100644
index 6dd8d1245e2c..000000000000
--- a/fs/devfs/Makefile
+++ /dev/null
@@ -1,8 +0,0 @@
-#
-# Makefile for the linux devfs-filesystem routines.
-#
-obj-$(CONFIG_DEVFS_FS) += devfs.o
-devfs-objs := base.o util.o
diff --git a/fs/devfs/base.c b/fs/devfs/base.c
deleted file mode 100644
index 52f5059c4f31..000000000000
--- a/fs/devfs/base.c
+++ /dev/null
@@ -1,2836 +0,0 @@
-/*  devfs (Device FileSystem) driver.
-    Copyright (C) 1998-2002  Richard Gooch
-    This library is free software; you can redistribute it and/or
-    modify it under the terms of the GNU Library General Public
-    License as published by the Free Software Foundation; either
-    version 2 of the License, or (at your option) any later version.
-    This library is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-    Library General Public License for more details.
-    You should have received a copy of the GNU Library General Public
-    License along with this library; if not, write to the Free
-    Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-    Richard Gooch may be reached by email at  rgooch@atnf.csiro.au
-    The postal address is:
-      Richard Gooch, c/o ATNF, P. O. Box 76, Epping, N.S.W., 2121, Australia.
-    ChangeLog
-    19980110   Richard Gooch <rgooch@atnf.csiro.au>
-               Original version.
-  v0.1
-    19980111   Richard Gooch <rgooch@atnf.csiro.au>
-               Created per-fs inode table rather than using inode->u.generic_ip
-  v0.2
-    19980111   Richard Gooch <rgooch@atnf.csiro.au>
-               Created .epoch inode which has a ctime of 0.
-               Fixed loss of named pipes when dentries lost.
-               Fixed loss of inode data when devfs_register() follows mknod().
-  v0.3
-    19980111   Richard Gooch <rgooch@atnf.csiro.au>
-               Fix for when compiling with CONFIG_KERNELD.
-    19980112   Richard Gooch <rgooch@atnf.csiro.au>
-               Fix for readdir() which sometimes didn't show entries.
-               Added <<tolerant>> option to <devfs_register>.
-  v0.4
-    19980113   Richard Gooch <rgooch@atnf.csiro.au>
-               Created <devfs_fill_file> function.
-  v0.5
-    19980115   Richard Gooch <rgooch@atnf.csiro.au>
-               Added subdirectory support. Major restructuring.
-    19980116   Richard Gooch <rgooch@atnf.csiro.au>
-               Fixed <find_by_dev> to not search major=0,minor=0.
-               Added symlink support.
-  v0.6
-    19980120   Richard Gooch <rgooch@atnf.csiro.au>
-               Created <devfs_mk_dir> function and support directory unregister
-    19980120   Richard Gooch <rgooch@atnf.csiro.au>
-               Auto-ownership uses real uid/gid rather than effective uid/gid.
-  v0.7
-    19980121   Richard Gooch <rgooch@atnf.csiro.au>
-               Supported creation of sockets.
-  v0.8
-    19980122   Richard Gooch <rgooch@atnf.csiro.au>
-               Added DEVFS_FL_HIDE_UNREG flag.
-               Interface change to <devfs_mk_symlink>.
-               Created <devfs_symlink> to support symlink(2).
-  v0.9
-    19980123   Richard Gooch <rgooch@atnf.csiro.au>
-               Added check to <devfs_fill_file> to check inode is in devfs.
-               Added optional traversal of symlinks.
-  v0.10
-    19980124   Richard Gooch <rgooch@atnf.csiro.au>
-               Created <devfs_get_flags> and <devfs_set_flags>.
-  v0.11
-    19980125   C. Scott Ananian <cananian@alumni.princeton.edu>
-               Created <devfs_find_handle>.
-    19980125   Richard Gooch <rgooch@atnf.csiro.au>
-               Allow removal of symlinks.
-  v0.12
-    19980125   Richard Gooch <rgooch@atnf.csiro.au>
-               Created <devfs_set_symlink_destination>.
-    19980126   Richard Gooch <rgooch@atnf.csiro.au>
-               Moved DEVFS_SUPER_MAGIC into header file.
-               Added DEVFS_FL_HIDE flag.
-               Created <devfs_get_maj_min>.
-               Created <devfs_get_handle_from_inode>.
-               Fixed minor bug in <find_by_dev>.
-    19980127   Richard Gooch <rgooch@atnf.csiro.au>
-               Changed interface to <find_by_dev>, <find_entry>,
-               <devfs_unregister>, <devfs_fill_file> and <devfs_find_handle>.
-               Fixed inode times when symlink created with symlink(2).
-  v0.13
-    19980129   C. Scott Ananian <cananian@alumni.princeton.edu>
-               Exported <devfs_set_symlink_destination>, <devfs_get_maj_min>
-               and <devfs_get_handle_from_inode>.
-    19980129   Richard Gooch <rgooch@atnf.csiro.au>
-               Created <devfs_unlink> to support unlink(2).
-  v0.14
-    19980129   Richard Gooch <rgooch@atnf.csiro.au>
-               Fixed kerneld support for entries in devfs subdirectories.
-    19980130   Richard Gooch <rgooch@atnf.csiro.au>
-               Bugfixes in <call_kerneld>.
-  v0.15
-    19980207   Richard Gooch <rgooch@atnf.csiro.au>
-               Call kerneld when looking up unregistered entries.
-  v0.16
-    19980326   Richard Gooch <rgooch@atnf.csiro.au>
-               Modified interface to <devfs_find_handle> for symlink traversal.
-  v0.17
-    19980331   Richard Gooch <rgooch@atnf.csiro.au>
-               Fixed persistence bug with device numbers for manually created
-               device files.
-               Fixed problem with recreating symlinks with different content.
-  v0.18
-    19980401   Richard Gooch <rgooch@atnf.csiro.au>
-               Changed to CONFIG_KMOD.
-               Hide entries which are manually unlinked.
-               Always invalidate devfs dentry cache when registering entries.
-               Created <devfs_rmdir> to support rmdir(2).
-               Ensure directories created by <devfs_mk_dir> are visible.
-  v0.19
-    19980402   Richard Gooch <rgooch@atnf.csiro.au>
-               Invalidate devfs dentry cache when making directories.
-               Invalidate devfs dentry cache when removing entries.
-               Fixed persistence bug with fifos.
-  v0.20
-    19980421   Richard Gooch <rgooch@atnf.csiro.au>
-               Print process command when debugging kerneld/kmod.
-               Added debugging for register/unregister/change operations.
-    19980422   Richard Gooch <rgooch@atnf.csiro.au>
-               Added "devfs=" boot options.
-  v0.21
-    19980426   Richard Gooch <rgooch@atnf.csiro.au>
-               No longer lock/unlock superblock in <devfs_put_super>.
-               Drop negative dentries when they are released.
-               Manage dcache more efficiently.
-  v0.22
-    19980427   Richard Gooch <rgooch@atnf.csiro.au>
-               Added DEVFS_FL_AUTO_DEVNUM flag.
-  v0.23
-    19980430   Richard Gooch <rgooch@atnf.csiro.au>
-               No longer set unnecessary methods.
-  v0.24
-    19980504   Richard Gooch <rgooch@atnf.csiro.au>
-               Added PID display to <call_kerneld> debugging message.
-               Added "after" debugging message to <call_kerneld>.
-    19980519   Richard Gooch <rgooch@atnf.csiro.au>
-               Added "diread" and "diwrite" boot options.
-    19980520   Richard Gooch <rgooch@atnf.csiro.au>
-               Fixed persistence problem with permissions.
-  v0.25
-    19980602   Richard Gooch <rgooch@atnf.csiro.au>
-               Support legacy device nodes.
-               Fixed bug where recreated inodes were hidden.
-  v0.26
-    19980602   Richard Gooch <rgooch@atnf.csiro.au>
-               Improved debugging in <get_vfs_inode>.
-    19980607   Richard Gooch <rgooch@atnf.csiro.au>
-               No longer free old dentries in <devfs_mk_dir>.
-               Free all dentries for a given entry when deleting inodes.
-  v0.27
-    19980627   Richard Gooch <rgooch@atnf.csiro.au>
-               Limit auto-device numbering to majors 128 to 239.
-  v0.28
-    19980629   Richard Gooch <rgooch@atnf.csiro.au>
-               Fixed inode times persistence problem.
-  v0.29
-    19980704   Richard Gooch <rgooch@atnf.csiro.au>
-               Fixed spelling in <devfs_readlink> debug.
-               Fixed bug in <devfs_setup> parsing "dilookup".
-  v0.30
-    19980705   Richard Gooch <rgooch@atnf.csiro.au>
-               Fixed devfs inode leak when manually recreating inodes.
-               Fixed permission persistence problem when recreating inodes.
-  v0.31
-    19980727   Richard Gooch <rgooch@atnf.csiro.au>
-               Removed harmless "unused variable" compiler warning.
-               Fixed modes for manually recreated device nodes.
-  v0.32
-    19980728   Richard Gooch <rgooch@atnf.csiro.au>
-               Added NULL devfs inode warning in <devfs_read_inode>.
-               Force all inode nlink values to 1.
-  v0.33
-    19980730   Richard Gooch <rgooch@atnf.csiro.au>
-               Added "dimknod" boot option.
-               Set inode nlink to 0 when freeing dentries.
-               Fixed modes for manually recreated symlinks.
-  v0.34
-    19980802   Richard Gooch <rgooch@atnf.csiro.au>
-               Fixed bugs in recreated directories and symlinks.
-  v0.35
-    19980806   Richard Gooch <rgooch@atnf.csiro.au>
-               Fixed bugs in recreated device nodes.
-    19980807   Richard Gooch <rgooch@atnf.csiro.au>
-               Fixed bug in currently unused <devfs_get_handle_from_inode>.
-               Defined new <devfs_handle_t> type.
-               Improved debugging when getting entries.
-               Fixed bug where directories could be emptied.
-  v0.36
-    19980809   Richard Gooch <rgooch@atnf.csiro.au>
-               Replaced dummy .epoch inode with .devfsd character device.
-    19980810   Richard Gooch <rgooch@atnf.csiro.au>
-               Implemented devfsd protocol revision 0.
-  v0.37
-    19980819   Richard Gooch <rgooch@atnf.csiro.au>
-               Added soothing message to warning in <devfs_d_iput>.
-  v0.38
-    19980829   Richard Gooch <rgooch@atnf.csiro.au>
-               Use GCC extensions for structure initialisations.
-               Implemented async open notification.
-               Incremented devfsd protocol revision to 1.
-  v0.39
-    19980908   Richard Gooch <rgooch@atnf.csiro.au>
-               Moved async open notification to end of <devfs_open>.
-  v0.40
-    19980910   Richard Gooch <rgooch@atnf.csiro.au>
-               Prepended "/dev/" to module load request.
-               Renamed <call_kerneld> to <call_kmod>.
-  v0.41
-    19980910   Richard Gooch <rgooch@atnf.csiro.au>
-               Fixed typo "AYSNC" -> "ASYNC".
-  v0.42
-    19980910   Richard Gooch <rgooch@atnf.csiro.au>
-               Added open flag for files.
-  v0.43
-    19980927   Richard Gooch <rgooch@atnf.csiro.au>
-               Set i_blocks=0 and i_blksize=1024 in <devfs_read_inode>.
-  v0.44
-    19981005   Richard Gooch <rgooch@atnf.csiro.au>
-               Added test for empty <<name>> in <devfs_find_handle>.
-               Renamed <generate_path> to <devfs_generate_path> and published.
-  v0.45
-    19981006   Richard Gooch <rgooch@atnf.csiro.au>
-               Created <devfs_get_fops>.
-  v0.46
-    19981007   Richard Gooch <rgooch@atnf.csiro.au>
-               Limit auto-device numbering to majors 144 to 239.
-  v0.47
-    19981010   Richard Gooch <rgooch@atnf.csiro.au>
-               Updated <devfs_follow_link> for VFS change in 2.1.125.
-  v0.48
-    19981022   Richard Gooch <rgooch@atnf.csiro.au>
-               Created DEVFS_ FL_COMPAT flag.
-  v0.49
-    19981023   Richard Gooch <rgooch@atnf.csiro.au>
-               Created "nocompat" boot option.
-  v0.50
-    19981025   Richard Gooch <rgooch@atnf.csiro.au>
-               Replaced "mount" boot option with "nomount".
-  v0.51
-    19981110   Richard Gooch <rgooch@atnf.csiro.au>
-               Created "only" boot option.
-  v0.52
-    19981112   Richard Gooch <rgooch@atnf.csiro.au>
-               Added DEVFS_FL_REMOVABLE flag.
-  v0.53
-    19981114   Richard Gooch <rgooch@atnf.csiro.au>
-               Only call <scan_dir_for_removable> on first call to
-               <devfs_readdir>.
-  v0.54
-    19981205   Richard Gooch <rgooch@atnf.csiro.au>
-               Updated <devfs_rmdir> for VFS change in 2.1.131.
-  v0.55
-    19981218   Richard Gooch <rgooch@atnf.csiro.au>
-               Created <devfs_mk_compat>.
-    19981220   Richard Gooch <rgooch@atnf.csiro.au>
-               Check for partitions on removable media in <devfs_lookup>.
-  v0.56
-    19990118   Richard Gooch <rgooch@atnf.csiro.au>
-               Added support for registering regular files.
-               Created <devfs_set_file_size>.
-               Update devfs inodes from entries if not changed through FS.
-  v0.57
-    19990124   Richard Gooch <rgooch@atnf.csiro.au>
-               Fixed <devfs_fill_file> to only initialise temporary inodes.
-               Trap for NULL fops in <devfs_register>.
-               Return -ENODEV in <devfs_fill_file> for non-driver inodes.
-  v0.58
-    19990126   Richard Gooch <rgooch@atnf.csiro.au>
-               Switched from PATH_MAX to DEVFS_PATHLEN.
-  v0.59
-    19990127   Richard Gooch <rgooch@atnf.csiro.au>
-               Created "nottycompat" boot option.
-  v0.60
-    19990318   Richard Gooch <rgooch@atnf.csiro.au>
-               Fixed <devfsd_read> to not overrun event buffer.
-  v0.61
-    19990329   Richard Gooch <rgooch@atnf.csiro.au>
-               Created <devfs_auto_unregister>.
-  v0.62
-    19990330   Richard Gooch <rgooch@atnf.csiro.au>
-               Don't return unregistred entries in <devfs_find_handle>.
-               Panic in <devfs_unregister> if entry unregistered.
-    19990401   Richard Gooch <rgooch@atnf.csiro.au>
-               Don't panic in <devfs_auto_unregister> for duplicates.
-  v0.63
-    19990402   Richard Gooch <rgooch@atnf.csiro.au>
-               Don't unregister already unregistered entries in <unregister>.
-  v0.64
-    19990510   Richard Gooch <rgooch@atnf.csiro.au>
-               Disable warning messages when unable to read partition table for
-               removable media.
-  v0.65
-    19990512   Richard Gooch <rgooch@atnf.csiro.au>
-               Updated <devfs_lookup> for VFS change in 2.3.1-pre1.
-               Created "oops-on-panic" boot option.
-               Improved debugging in <devfs_register> and <devfs_unregister>.
-  v0.66
-    19990519   Richard Gooch <rgooch@atnf.csiro.au>
-               Added documentation for some functions.
-    19990525   Richard Gooch <rgooch@atnf.csiro.au>
-               Removed "oops-on-panic" boot option: now always Oops.
-  v0.67
-    19990531   Richard Gooch <rgooch@atnf.csiro.au>
-               Improved debugging in <devfs_register>.
-  v0.68
-    19990604   Richard Gooch <rgooch@atnf.csiro.au>
-               Added "diunlink" and "nokmod" boot options.
-               Removed superfluous warning message in <devfs_d_iput>.
-  v0.69
-    19990611   Richard Gooch <rgooch@atnf.csiro.au>
-               Took account of change to <d_alloc_root>.
-  v0.70
-    19990614   Richard Gooch <rgooch@atnf.csiro.au>
-               Created separate event queue for each mounted devfs.
-               Removed <devfs_invalidate_dcache>.
-               Created new ioctl()s.
-               Incremented devfsd protocol revision to 3.
-               Fixed bug when re-creating directories: contents were lost.
-               Block access to inodes until devfsd updates permissions.
-    19990615   Richard Gooch <rgooch@atnf.csiro.au>
-               Support 2.2.x kernels.
-  v0.71
-    19990623   Richard Gooch <rgooch@atnf.csiro.au>
-               Switched to sending process uid/gid to devfsd.
-               Renamed <call_kmod> to <try_modload>.
-               Added DEVFSD_NOTIFY_LOOKUP event.
-    19990624   Richard Gooch <rgooch@atnf.csiro.au>
-               Added DEVFSD_NOTIFY_CHANGE event.
-               Incremented devfsd protocol revision to 4.
-  v0.72
-    19990713   Richard Gooch <rgooch@atnf.csiro.au>
-               Return EISDIR rather than EINVAL for read(2) on directories.
-  v0.73
-    19990809   Richard Gooch <rgooch@atnf.csiro.au>
-               Changed <devfs_setup> to new __init scheme.
-  v0.74
-    19990901   Richard Gooch <rgooch@atnf.csiro.au>
-               Changed remaining function declarations to new __init scheme.
-  v0.75
-    19991013   Richard Gooch <rgooch@atnf.csiro.au>
-               Created <devfs_get_info>, <devfs_set_info>,
-               <devfs_get_first_child> and <devfs_get_next_sibling>.
-               Added <<dir>> parameter to <devfs_register>, <devfs_mk_compat>,
-               <devfs_mk_dir> and <devfs_find_handle>.
-               Work sponsored by SGI.
-  v0.76
-    19991017   Richard Gooch <rgooch@atnf.csiro.au>
-               Allow multiple unregistrations.
-               Work sponsored by SGI.
-  v0.77
-    19991026   Richard Gooch <rgooch@atnf.csiro.au>
-               Added major and minor number to devfsd protocol.
-               Incremented devfsd protocol revision to 5.
-               Work sponsored by SGI.
-  v0.78
-    19991030   Richard Gooch <rgooch@atnf.csiro.au>
-               Support info pointer for all devfs entry types.
-               Added <<info>> parameter to <devfs_mk_dir> and
-               <devfs_mk_symlink>.
-               Work sponsored by SGI.
-  v0.79
-    19991031   Richard Gooch <rgooch@atnf.csiro.au>
-               Support "../" when searching devfs namespace.
-               Work sponsored by SGI.
-  v0.80
-    19991101   Richard Gooch <rgooch@atnf.csiro.au>
-               Created <devfs_get_unregister_slave>.
-               Work sponsored by SGI.
-  v0.81
-    19991103   Richard Gooch <rgooch@atnf.csiro.au>
-               Exported <devfs_get_parent>.
-               Work sponsored by SGI.
-  v0.82
-    19991104   Richard Gooch <rgooch@atnf.csiro.au>
-               Removed unused <devfs_set_symlink_destination>.
-    19991105   Richard Gooch <rgooch@atnf.csiro.au>
-               Do not hide entries from devfsd or children.
-               Removed DEVFS_ FL_TTY_COMPAT flag.
-               Removed "nottycompat" boot option.
-               Removed <devfs_mk_compat>.
-               Work sponsored by SGI.
-  v0.83
-    19991107   Richard Gooch <rgooch@atnf.csiro.au>
-               Added DEVFS_FL_WAIT flag.
-               Work sponsored by SGI.
-  v0.84
-    19991107   Richard Gooch <rgooch@atnf.csiro.au>
-               Support new "disc" naming scheme in <get_removable_partition>.
-               Allow NULL fops in <devfs_register>.
-               Work sponsored by SGI.
-  v0.85
-    19991110   Richard Gooch <rgooch@atnf.csiro.au>
-               Fall back to major table if NULL fops given to <devfs_register>.
-               Work sponsored by SGI.
-  v0.86
-    19991204   Richard Gooch <rgooch@atnf.csiro.au>
-               Support fifos when unregistering.
-               Work sponsored by SGI.
-  v0.87
-    19991209   Richard Gooch <rgooch@atnf.csiro.au>
-               Removed obsolete DEVFS_ FL_COMPAT and DEVFS_ FL_TOLERANT flags.
-               Work sponsored by SGI.
-  v0.88
-    19991214   Richard Gooch <rgooch@atnf.csiro.au>
-               Removed kmod support.
-               Work sponsored by SGI.
-  v0.89
-    19991216   Richard Gooch <rgooch@atnf.csiro.au>
-               Improved debugging in <get_vfs_inode>.
-               Ensure dentries created by devfsd will be cleaned up.
-               Work sponsored by SGI.
-  v0.90
-    19991223   Richard Gooch <rgooch@atnf.csiro.au>
-               Created <devfs_get_name>.
-               Work sponsored by SGI.
-  v0.91
-    20000203   Richard Gooch <rgooch@atnf.csiro.au>
-               Ported to kernel 2.3.42.
-               Removed <devfs_fill_file>.
-               Work sponsored by SGI.
-  v0.92
-    20000306   Richard Gooch <rgooch@atnf.csiro.au>
-               Added DEVFS_ FL_NO_PERSISTENCE flag.
-               Removed unnecessary call to <update_devfs_inode_from_entry> in
-               <devfs_readdir>.
-               Work sponsored by SGI.
-  v0.93
-    20000413   Richard Gooch <rgooch@atnf.csiro.au>
-               Set inode->i_size to correct size for symlinks.
-    20000414   Richard Gooch <rgooch@atnf.csiro.au>
-               Only give lookup() method to directories to comply with new VFS
-               assumptions.
-               Work sponsored by SGI.
-    20000415   Richard Gooch <rgooch@atnf.csiro.au>
-               Remove unnecessary tests in symlink methods.
-               Don't kill existing block ops in <devfs_read_inode>.
-               Work sponsored by SGI.
-  v0.94
-    20000424   Richard Gooch <rgooch@atnf.csiro.au>
-               Don't create missing directories in <devfs_find_handle>.
-               Work sponsored by SGI.
-  v0.95
-    20000430   Richard Gooch <rgooch@atnf.csiro.au>
-               Added CONFIG_DEVFS_MOUNT.
-               Work sponsored by SGI.
-  v0.96
-    20000608   Richard Gooch <rgooch@atnf.csiro.au>
-               Disabled multi-mount capability (use VFS bindings instead).
-               Work sponsored by SGI.
-  v0.97
-    20000610   Richard Gooch <rgooch@atnf.csiro.au>
-               Switched to FS_SINGLE to disable multi-mounts.
-    20000612   Richard Gooch <rgooch@atnf.csiro.au>
-               Removed module support.
-               Removed multi-mount code.
-               Removed compatibility macros: VFS has changed too much.
-               Work sponsored by SGI.
-  v0.98
-    20000614   Richard Gooch <rgooch@atnf.csiro.au>
-               Merged devfs inode into devfs entry.
-               Work sponsored by SGI.
-  v0.99
-    20000619   Richard Gooch <rgooch@atnf.csiro.au>
-               Removed dead code in <devfs_register> which used to call
-               <free_dentries>.
-               Work sponsored by SGI.
-  v0.100
-    20000621   Richard Gooch <rgooch@atnf.csiro.au>
-               Changed interface to <devfs_register>.
-               Work sponsored by SGI.
-  v0.101
-    20000622   Richard Gooch <rgooch@atnf.csiro.au>
-               Simplified interface to <devfs_mk_symlink> and <devfs_mk_dir>.
-               Simplified interface to <devfs_find_handle>.
-               Work sponsored by SGI.
-  v0.102
-    20010519   Richard Gooch <rgooch@atnf.csiro.au>
-               Ensure <devfs_generate_path> terminates string for root entry.
-               Exported <devfs_get_name> to modules.
-    20010520   Richard Gooch <rgooch@atnf.csiro.au>
-               Make <devfs_mk_symlink> send events to devfsd.
-               Cleaned up option processing in <devfs_setup>.
-    20010521   Richard Gooch <rgooch@atnf.csiro.au>
-               Fixed bugs in handling symlinks: could leak or cause Oops.
-    20010522   Richard Gooch <rgooch@atnf.csiro.au>
-               Cleaned up directory handling by separating fops.
-  v0.103
-    20010601   Richard Gooch <rgooch@atnf.csiro.au>
-               Fixed handling of inverted options in <devfs_setup>.
-  v0.104
-    20010604   Richard Gooch <rgooch@atnf.csiro.au>
-               Adjusted <try_modload> to account for <devfs_generate_path> fix.
-  v0.105
-    20010617   Richard Gooch <rgooch@atnf.csiro.au>
-               Answered question posed by Al Viro and removed his comments.
-               Moved setting of registered flag after other fields are changed.
-               Fixed race between <devfsd_close> and <devfsd_notify_one>.
-               Global VFS changes added bogus BKL to <devfsd_close>: removed.
-               Widened locking in <devfs_readlink> and <devfs_follow_link>.
-               Replaced <devfsd_read> stack usage with <devfsd_ioctl> kmalloc.
-               Simplified locking in <devfsd_ioctl> and fixed memory leak.
-  v0.106
-    20010709   Richard Gooch <rgooch@atnf.csiro.au>
-               Removed broken devnum allocation and use <devfs_alloc_devnum>.
-               Fixed old devnum leak by calling new <devfs_dealloc_devnum>.
-  v0.107
-    20010712   Richard Gooch <rgooch@atnf.csiro.au>
-               Fixed bug in <devfs_setup> which could hang boot process.
-  v0.108
-    20010730   Richard Gooch <rgooch@atnf.csiro.au>
-               Added DEVFSD_NOTIFY_DELETE event.
-    20010801   Richard Gooch <rgooch@atnf.csiro.au>
-               Removed #include <asm/segment.h>.
-  v0.109
-    20010807   Richard Gooch <rgooch@atnf.csiro.au>
-               Fixed inode table races by removing it and using
-               inode->u.generic_ip instead.
-               Moved <devfs_read_inode> into <get_vfs_inode>.
-               Moved <devfs_write_inode> into <devfs_notify_change>.
-  v0.110
-    20010808   Richard Gooch <rgooch@atnf.csiro.au>
-               Fixed race in <devfs_do_symlink> for uni-processor.
-  v0.111
-    20010818   Richard Gooch <rgooch@atnf.csiro.au>
-               Removed remnant of multi-mount support in <devfs_mknod>.
-               Removed unused DEVFS_FL_SHOW_UNREG flag.
-  v0.112
-    20010820   Richard Gooch <rgooch@atnf.csiro.au>
-               Removed nlink field from struct devfs_inode.
-  v0.113
-    20010823   Richard Gooch <rgooch@atnf.csiro.au>
-               Replaced BKL with global rwsem to protect symlink data (quick
-               and dirty hack).
-  v0.114
-    20010827   Richard Gooch <rgooch@atnf.csiro.au>
-               Replaced global rwsem for symlink with per-link refcount.
-  v0.115
-    20010919   Richard Gooch <rgooch@atnf.csiro.au>
-               Set inode->i_mapping->a_ops for block nodes in <get_vfs_inode>.
-  v0.116
-    20011008   Richard Gooch <rgooch@atnf.csiro.au>
-               Fixed overrun in <devfs_link> by removing function (not needed).
-    20011009   Richard Gooch <rgooch@atnf.csiro.au>
-               Fixed buffer underrun in <try_modload>.
-    20011029   Richard Gooch <rgooch@atnf.csiro.au>
-               Fixed race in <devfsd_ioctl> when setting event mask.
-    20011114   Richard Gooch <rgooch@atnf.csiro.au>
-               First release of new locking code.
-  v1.0
-    20011117   Richard Gooch <rgooch@atnf.csiro.au>
-               Discard temporary buffer, now use "%s" for dentry names.
-    20011118   Richard Gooch <rgooch@atnf.csiro.au>
-               Don't generate path in <try_modload>: use fake entry instead.
-               Use "existing" directory in <_devfs_make_parent_for_leaf>.
-    20011122   Richard Gooch <rgooch@atnf.csiro.au>
-               Use slab cache rather than fixed buffer for devfsd events.
-  v1.1
-    20011125   Richard Gooch <rgooch@atnf.csiro.au>
-               Send DEVFSD_NOTIFY_REGISTERED events in <devfs_mk_dir>.
-    20011127   Richard Gooch <rgooch@atnf.csiro.au>
-               Fixed locking bug in <devfs_d_revalidate_wait> due to typo.
-               Do not send CREATE, CHANGE, ASYNC_OPEN or DELETE events from
-               devfsd or children.
-  v1.2
-    20011202   Richard Gooch <rgooch@atnf.csiro.au>
-               Fixed bug in <devfsd_read>: was dereferencing freed pointer.
-  v1.3
-    20011203   Richard Gooch <rgooch@atnf.csiro.au>
-               Fixed bug in <devfsd_close>: was dereferencing freed pointer.
-               Added process group check for devfsd privileges.
-  v1.4
-    20011204   Richard Gooch <rgooch@atnf.csiro.au>
-               Use SLAB_ATOMIC in <devfsd_notify_de> from <devfs_d_delete>.
-  v1.5
-    20011211   Richard Gooch <rgooch@atnf.csiro.au>
-               Return old entry in <devfs_mk_dir> for 2.4.x kernels.
-    20011212   Richard Gooch <rgooch@atnf.csiro.au>
-               Increment refcount on module in <check_disc_changed>.
-    20011215   Richard Gooch <rgooch@atnf.csiro.au>
-               Created <devfs_get_handle> and exported <devfs_put>.
-               Increment refcount on module in <devfs_get_ops>.
-               Created <devfs_put_ops>.
-  v1.6
-    20011216   Richard Gooch <rgooch@atnf.csiro.au>
-               Added poisoning to <devfs_put>.
-               Improved debugging messages.
-  v1.7
-    20011221   Richard Gooch <rgooch@atnf.csiro.au>
-               Corrected (made useful) debugging message in <unregister>.
-               Moved <kmem_cache_create> in <mount_devfs_fs> to <init_devfs_fs>
-    20011224   Richard Gooch <rgooch@atnf.csiro.au>
-               Added magic number to guard against scribbling drivers.
-    20011226   Richard Gooch <rgooch@atnf.csiro.au>
-               Only return old entry in <devfs_mk_dir> if a directory.
-               Defined macros for error and debug messages.
-  v1.8
-    20020113   Richard Gooch <rgooch@atnf.csiro.au>
-               Fixed (rare, old) race in <devfs_lookup>.
-  v1.9
-    20020120   Richard Gooch <rgooch@atnf.csiro.au>
-               Fixed deadlock bug in <devfs_d_revalidate_wait>.
-               Tag VFS deletable in <devfs_mk_symlink> if handle ignored.
-  v1.10
-    20020129   Richard Gooch <rgooch@atnf.csiro.au>
-               Added KERN_* to remaining messages.
-               Cleaned up declaration of <stat_read>.
-  v1.11
-    20020219   Richard Gooch <rgooch@atnf.csiro.au>
-               Changed <devfs_rmdir> to allow later additions if not yet empty.
-  v1.12
-    20020406   Richard Gooch <rgooch@atnf.csiro.au>
-               Removed silently introduced calls to lock_kernel() and
-               unlock_kernel() due to recent VFS locking changes. BKL isn't
-               required in devfs.
-  v1.13
-    20020428   Richard Gooch <rgooch@atnf.csiro.au>
-               Removed 2.4.x compatibility code.
-  v1.14
-    20020510   Richard Gooch <rgooch@atnf.csiro.au>
-               Added BKL to <devfs_open> because drivers still need it.
-  v1.15
-    20020512   Richard Gooch <rgooch@atnf.csiro.au>
-               Protected <scan_dir_for_removable> and <get_removable_partition>
-               from changing directory contents.
-  v1.16
-    20020514   Richard Gooch <rgooch@atnf.csiro.au>
-               Minor cleanup of <scan_dir_for_removable>.
-  v1.17
-    20020721   Richard Gooch <rgooch@atnf.csiro.au>
-               Switched to ISO C structure field initialisers.
-               Switch to set_current_state() and move before add_wait_queue().
-    20020722   Richard Gooch <rgooch@atnf.csiro.au>
-               Fixed devfs entry leak in <devfs_readdir> when *readdir fails.
-  v1.18
-    20020725   Richard Gooch <rgooch@atnf.csiro.au>
-               Created <devfs_find_and_unregister>.
-  v1.19
-    20020728   Richard Gooch <rgooch@atnf.csiro.au>
-               Removed deprecated <devfs_find_handle>.
-  v1.20
-    20020820   Richard Gooch <rgooch@atnf.csiro.au>
-               Fixed module unload race in <devfs_open>.
-  v1.21
-    20021013   Richard Gooch <rgooch@atnf.csiro.au>
-               Removed DEVFS_ FL_AUTO_OWNER.
-               Switched lingering structure field initialiser to ISO C.
-               Added locking when updating FCB flags.
-  v1.22
-*/
-#include <linux/types.h>
-#include <linux/errno.h>
-#include <linux/time.h>
-#include <linux/tty.h>
-#include <linux/timer.h>
-#include <linux/config.h>
-#include <linux/kernel.h>
-#include <linux/wait.h>
-#include <linux/string.h>
-#include <linux/slab.h>
-#include <linux/ioport.h>
-#include <linux/delay.h>
-#include <linux/ctype.h>
-#include <linux/mm.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/devfs_fs.h>
-#include <linux/devfs_fs_kernel.h>
-#include <linux/smp_lock.h>
-#include <linux/smp.h>
-#include <linux/rwsem.h>
-#include <linux/sched.h>
-#include <linux/namei.h>
-#include <linux/bitops.h>
-#include <asm/uaccess.h>
-#include <asm/io.h>
-#include <asm/processor.h>
-#include <asm/system.h>
-#include <asm/pgtable.h>
-#include <asm/atomic.h>
-#define DEVFS_VERSION            "2004-01-31"
-#define DEVFS_NAME "devfs"
-#define FIRST_INODE 1
-#define STRING_LENGTH 256
-#define FAKE_BLOCK_SIZE 1024
-#define POISON_PTR ( *(void **) poison_array )
-#define MAGIC_VALUE 0x327db823
-#ifndef TRUE
-#  define TRUE 1
-#  define FALSE 0
-#endif
-#define MODE_DIR (S_IFDIR | S_IWUSR | S_IRUGO | S_IXUGO)
-#define DEBUG_NONE         0x0000000
-#define DEBUG_MODULE_LOAD  0x0000001
-#define DEBUG_REGISTER     0x0000002
-#define DEBUG_UNREGISTER   0x0000004
-#define DEBUG_FREE         0x0000008
-#define DEBUG_SET_FLAGS    0x0000010
-#define DEBUG_S_READ       0x0000100    /*  Break  */
-#define DEBUG_I_LOOKUP     0x0001000    /*  Break  */
-#define DEBUG_I_CREATE     0x0002000
-#define DEBUG_I_GET        0x0004000
-#define DEBUG_I_CHANGE     0x0008000
-#define DEBUG_I_UNLINK     0x0010000
-#define DEBUG_I_RLINK      0x0020000
-#define DEBUG_I_FLINK      0x0040000
-#define DEBUG_I_MKNOD      0x0080000
-#define DEBUG_F_READDIR    0x0100000    /*  Break  */
-#define DEBUG_D_DELETE     0x1000000    /*  Break  */
-#define DEBUG_D_RELEASE    0x2000000
-#define DEBUG_D_IPUT       0x4000000
-#define DEBUG_ALL          0xfffffff
-#define DEBUG_DISABLED     DEBUG_NONE
-#define OPTION_NONE             0x00
-#define OPTION_MOUNT            0x01
-#define PRINTK(format, args...) \
-   {printk (KERN_ERR "%s" format, __FUNCTION__ , ## args);}
-#define OOPS(format, args...) \
-   {printk (KERN_CRIT "%s" format, __FUNCTION__ , ## args); \
-    printk ("Forcing Oops\n"); \
-    BUG();}
-#ifdef CONFIG_DEVFS_DEBUG
-#  define VERIFY_ENTRY(de) \
-   {if ((de) && (de)->magic_number != MAGIC_VALUE) \
-        OOPS ("(%p): bad magic value: %x\n", (de), (de)->magic_number);}
-#  define WRITE_ENTRY_MAGIC(de,magic) (de)->magic_number = (magic)
-#  define DPRINTK(flag, format, args...) \
-   {if (devfs_debug & flag) \
-        printk (KERN_INFO "%s" format, __FUNCTION__ , ## args);}
-#else
-#  define VERIFY_ENTRY(de)
-#  define WRITE_ENTRY_MAGIC(de,magic)
-#  define DPRINTK(flag, format, args...)
-#endif
-typedef struct devfs_entry *devfs_handle_t;
-struct directory_type {
-        rwlock_t lock;          /*  Lock for searching(R)/updating(W)   */
-        struct devfs_entry *first;
-        struct devfs_entry *last;
-        unsigned char no_more_additions:1;
-};
-struct symlink_type {
-        unsigned int length;    /*  Not including the NULL-termimator       */
-        char *linkname;         /*  This is NULL-terminated                 */
-};
-struct devfs_inode {            /*  This structure is for "persistent" inode storage  */
-        struct dentry *dentry;
-        struct timespec atime;
-        struct timespec mtime;
-        struct timespec ctime;
-        unsigned int ino;       /*  Inode number as seen in the VFS         */
-        uid_t uid;
-        gid_t gid;
-};
-struct devfs_entry {
-#ifdef CONFIG_DEVFS_DEBUG
-        unsigned int magic_number;
-#endif
-        void *info;
-        atomic_t refcount;      /*  When this drops to zero, it's unused    */
-        union {
-                struct directory_type dir;
-                dev_t dev;
-                struct symlink_type symlink;
-                const char *name;       /*  Only used for (mode == 0)               */
-        } u;
-        struct devfs_entry *prev;       /*  Previous entry in the parent directory  */
-        struct devfs_entry *next;       /*  Next entry in the parent directory      */
-        struct devfs_entry *parent;     /*  The parent directory                    */
-        struct devfs_inode inode;
-        umode_t mode;
-        unsigned short namelen; /*  I think 64k+ filenames are a way off... */
-        unsigned char vfs:1;    /*  Whether the VFS may delete the entry   */
-        char name[1];           /*  This is just a dummy: the allocated array
-                                   is bigger. This is NULL-terminated      */
-};
-/*  The root of the device tree  */
-static struct devfs_entry *root_entry;
-struct devfsd_buf_entry {
-        struct devfs_entry *de; /*  The name is generated with this         */
-        unsigned short type;    /*  The type of event                       */
-        umode_t mode;
-        uid_t uid;
-        gid_t gid;
-        struct devfsd_buf_entry *next;
-};
-struct fs_info {                /*  This structure is for the mounted devfs  */
-        struct super_block *sb;
-        spinlock_t devfsd_buffer_lock;  /*  Lock when inserting/deleting events  */
-        struct devfsd_buf_entry *devfsd_first_event;
-        struct devfsd_buf_entry *devfsd_last_event;
-        volatile int devfsd_sleeping;
-        volatile struct task_struct *devfsd_task;
-        volatile pid_t devfsd_pgrp;
-        volatile struct file *devfsd_file;
-        struct devfsd_notify_struct *devfsd_info;
-        volatile unsigned long devfsd_event_mask;
-        atomic_t devfsd_overrun_count;
-        wait_queue_head_t devfsd_wait_queue;    /*  Wake devfsd on input       */
-        wait_queue_head_t revalidate_wait_queue;        /*  Wake when devfsd sleeps    */
-};
-static struct fs_info fs_info = {.devfsd_buffer_lock = SPIN_LOCK_UNLOCKED };
-static kmem_cache_t *devfsd_buf_cache;
-#ifdef CONFIG_DEVFS_DEBUG
-static unsigned int devfs_debug_init __initdata = DEBUG_NONE;
-static unsigned int devfs_debug = DEBUG_NONE;
-static DEFINE_SPINLOCK(stat_lock);
-static unsigned int stat_num_entries;
-static unsigned int stat_num_bytes;
-#endif
-static unsigned char poison_array[8] =
-    { 0x5a, 0x5a, 0x5a, 0x5a, 0x5a, 0x5a, 0x5a, 0x5a };
-#ifdef CONFIG_DEVFS_MOUNT
-static unsigned int boot_options = OPTION_MOUNT;
-#else
-static unsigned int boot_options = OPTION_NONE;
-#endif
-/*  Forward function declarations  */
-static devfs_handle_t _devfs_walk_path(struct devfs_entry *dir,
-                                       const char *name, int namelen,
-                                       int traverse_symlink);
-static ssize_t devfsd_read(struct file *file, char __user *buf, size_t len,
-                           loff_t * ppos);
-static int devfsd_ioctl(struct inode *inode, struct file *file,
-                        unsigned int cmd, unsigned long arg);
-static int devfsd_close(struct inode *inode, struct file *file);
-#ifdef CONFIG_DEVFS_DEBUG
-static ssize_t stat_read(struct file *file, char __user *buf, size_t len,
-                         loff_t * ppos);
-static const struct file_operations stat_fops = {
-        .open = nonseekable_open,
-        .read = stat_read,
-};
-#endif
-/*  Devfs daemon file operations  */
-static const struct file_operations devfsd_fops = {
-        .open = nonseekable_open,
-        .read = devfsd_read,
-        .ioctl = devfsd_ioctl,
-        .release = devfsd_close,
-};
-/*  Support functions follow  */
-/**
- *      devfs_get - Get a reference to a devfs entry.
- *      @de:  The devfs entry.
- */
-static struct devfs_entry *devfs_get(struct devfs_entry *de)
-{
-        VERIFY_ENTRY(de);
-        if (de)
-                atomic_inc(&de->refcount);
-        return de;
-}                               /*  End Function devfs_get  */
-/**
- *      devfs_put - Put (release) a reference to a devfs entry.
- *      @de:  The handle to the devfs entry.
- */
-static void devfs_put(devfs_handle_t de)
-{
-        if (!de)
-                return;
-        VERIFY_ENTRY(de);
-        if (de->info == POISON_PTR)
-                OOPS("(%p): poisoned pointer\n", de);
-        if (!atomic_dec_and_test(&de->refcount))
-                return;
-        if (de == root_entry)
-                OOPS("(%p): root entry being freed\n", de);
-        DPRINTK(DEBUG_FREE, "(%s): de: %p, parent: %p \"%s\"\n",
-                de->name, de, de->parent,
-                de->parent ? de->parent->name : "no parent");
-        if (S_ISLNK(de->mode))
-                kfree(de->u.symlink.linkname);
-        WRITE_ENTRY_MAGIC(de, 0);
-#ifdef CONFIG_DEVFS_DEBUG
-        spin_lock(&stat_lock);
-        --stat_num_entries;
-        stat_num_bytes -= sizeof *de + de->namelen;
-        if (S_ISLNK(de->mode))
-                stat_num_bytes -= de->u.symlink.length + 1;
-        spin_unlock(&stat_lock);
-#endif
-        de->info = POISON_PTR;
-        kfree(de);
-}                               /*  End Function devfs_put  */
-/**
- *      _devfs_search_dir - Search for a devfs entry in a directory.
- *      @dir:  The directory to search.
- *      @name:  The name of the entry to search for.
- *      @namelen:  The number of characters in @name.
- *
- *  Search for a devfs entry in a directory and returns a pointer to the entry
- *   on success, else %NULL. The directory must be locked already.
- *   An implicit devfs_get() is performed on the returned entry.
- */
-static struct devfs_entry *_devfs_search_dir(struct devfs_entry *dir,
-                                             const char *name,
-                                             unsigned int namelen)
-{
-        struct devfs_entry *curr;
-        if (!S_ISDIR(dir->mode)) {
-                PRINTK("(%s): not a directory\n", dir->name);
-                return NULL;
-        }
-        for (curr = dir->u.dir.first; curr != NULL; curr = curr->next) {
-                if (curr->namelen != namelen)
-                        continue;
-                if (memcmp(curr->name, name, namelen) == 0)
-                        break;
-                /*  Not found: try the next one  */
-        }
-        return devfs_get(curr);
-}                               /*  End Function _devfs_search_dir  */
-/**
- *      _devfs_alloc_entry - Allocate a devfs entry.
- *      @name:     the name of the entry
- *      @namelen:  the number of characters in @name
- *      @mode:     the mode for the entry
- *
- *  Allocate a devfs entry and returns a pointer to the entry on success, else
- *   %NULL.
- */
-static struct devfs_entry *_devfs_alloc_entry(const char *name,
-                                              unsigned int namelen,
-                                              umode_t mode)
-{
-        struct devfs_entry *new;
-        static unsigned long inode_counter = FIRST_INODE;
-        static DEFINE_SPINLOCK(counter_lock);
-        if (name && (namelen < 1))
-                namelen = strlen(name);
-        if ((new = kmalloc(sizeof *new + namelen, GFP_KERNEL)) == NULL)
-                return NULL;
-        memset(new, 0, sizeof *new + namelen);  /*  Will set '\0' on name  */
-        new->mode = mode;
-        if (S_ISDIR(mode))
-                rwlock_init(&new->u.dir.lock);
-        atomic_set(&new->refcount, 1);
-        spin_lock(&counter_lock);
-        new->inode.ino = inode_counter++;
-        spin_unlock(&counter_lock);
-        if (name)
-                memcpy(new->name, name, namelen);
-        new->namelen = namelen;
-        WRITE_ENTRY_MAGIC(new, MAGIC_VALUE);
-#ifdef CONFIG_DEVFS_DEBUG
-        spin_lock(&stat_lock);
-        ++stat_num_entries;
-        stat_num_bytes += sizeof *new + namelen;
-        spin_unlock(&stat_lock);
-#endif
-        return new;
-}                               /*  End Function _devfs_alloc_entry  */
-/**
- *      _devfs_append_entry - Append a devfs entry to a directory's child list.
- *      @dir:  The directory to add to.
- *      @de:  The devfs entry to append.
- *      @old_de: If an existing entry exists, it will be written here. This may
- *               be %NULL. An implicit devfs_get() is performed on this entry.
- *
- *  Append a devfs entry to a directory's list of children, checking first to
- *   see if an entry of the same name exists. The directory will be locked.
- *   The value 0 is returned on success, else a negative error code.
- *   On failure, an implicit devfs_put() is performed on %de.
- */
-static int _devfs_append_entry(devfs_handle_t dir, devfs_handle_t de,
-                               devfs_handle_t * old_de)
-{
-        int retval;
-        if (old_de)
-                *old_de = NULL;
-        if (!S_ISDIR(dir->mode)) {
-                PRINTK("(%s): dir: \"%s\" is not a directory\n", de->name,
-                       dir->name);
-                devfs_put(de);
-                return -ENOTDIR;
-        }
-        write_lock(&dir->u.dir.lock);
-        if (dir->u.dir.no_more_additions)
-                retval = -ENOENT;
-        else {
-                struct devfs_entry *old;
-                old = _devfs_search_dir(dir, de->name, de->namelen);
-                if (old_de)
-                        *old_de = old;
-                else
-                        devfs_put(old);
-                if (old == NULL) {
-                        de->parent = dir;
-                        de->prev = dir->u.dir.last;
-                        /*  Append to the directory's list of children  */
-                        if (dir->u.dir.first == NULL)
-                                dir->u.dir.first = de;
-                        else
-                                dir->u.dir.last->next = de;
-                        dir->u.dir.last = de;
-                        retval = 0;
-                } else
-                        retval = -EEXIST;
-        }
-        write_unlock(&dir->u.dir.lock);
-        if (retval)
-                devfs_put(de);
-        return retval;
-}                               /*  End Function _devfs_append_entry  */
-/**
- *      _devfs_get_root_entry - Get the root devfs entry.
- *
- *      Returns the root devfs entry on success, else %NULL.
- *
- *      TODO it must be called asynchronously due to the fact
- *      that devfs is initialized relatively late. Proper way
- *      is to remove module_init from init_devfs_fs and manually
- *      call it early enough during system init
- */
-static struct devfs_entry *_devfs_get_root_entry(void)
-{
-        struct devfs_entry *new;
-        static DEFINE_SPINLOCK(root_lock);
-        if (root_entry)
-                return root_entry;
-        new = _devfs_alloc_entry(NULL, 0, MODE_DIR);
-        if (new == NULL)
-                return NULL;
-        spin_lock(&root_lock);
-        if (root_entry) {
-                spin_unlock(&root_lock);
-                devfs_put(new);
-                return root_entry;
-        }
-        root_entry = new;
-        spin_unlock(&root_lock);
-        return root_entry;
-}                               /*  End Function _devfs_get_root_entry  */
-/**
- *      _devfs_descend - Descend down a tree using the next component name.
- *      @dir:  The directory to search.
- *      @name:  The component name to search for.
- *      @namelen:  The length of %name.
- *      @next_pos:  The position of the next '/' or '\0' is written here.
- *
- *  Descend into a directory, searching for a component. This function forms
- *   the core of a tree-walking algorithm. The directory will be locked.
- *   The devfs entry corresponding to the component is returned. If there is
- *   no matching entry, %NULL is returned.
- *   An implicit devfs_get() is performed on the returned entry.
- */
-static struct devfs_entry *_devfs_descend(struct devfs_entry *dir,
-                                          const char *name, int namelen,
-                                          int *next_pos)
-{
-        const char *stop, *ptr;
-        struct devfs_entry *entry;
-        if ((namelen >= 3) && (strncmp(name, "../", 3) == 0)) { /*  Special-case going to parent directory  */
-                *next_pos = 3;
-                return devfs_get(dir->parent);
-        }
-        stop = name + namelen;
-        /*  Search for a possible '/'  */
-        for (ptr = name; (ptr < stop) && (*ptr != '/'); ++ptr) ;
-        *next_pos = ptr - name;
-        read_lock(&dir->u.dir.lock);
-        entry = _devfs_search_dir(dir, name, *next_pos);
-        read_unlock(&dir->u.dir.lock);
-        return entry;
-}                               /*  End Function _devfs_descend  */
-static devfs_handle_t _devfs_make_parent_for_leaf(struct devfs_entry *dir,
-                                                  const char *name,
-                                                  int namelen, int *leaf_pos)
-{
-        int next_pos = 0;
-        if (dir == NULL)
-                dir = _devfs_get_root_entry();
-        if (dir == NULL)
-                return NULL;
-        devfs_get(dir);
-        /*  Search for possible trailing component and ignore it  */
-        for (--namelen; (namelen > 0) && (name[namelen] != '/'); --namelen) ;
-        *leaf_pos = (name[namelen] == '/') ? (namelen + 1) : 0;
-        for (; namelen > 0; name += next_pos, namelen -= next_pos) {
-                struct devfs_entry *de, *old = NULL;
-                if ((de =
-                     _devfs_descend(dir, name, namelen, &next_pos)) == NULL) {
-                        de = _devfs_alloc_entry(name, next_pos, MODE_DIR);
-                        devfs_get(de);
-                        if (!de || _devfs_append_entry(dir, de, &old)) {
-                                devfs_put(de);
-                                if (!old || !S_ISDIR(old->mode)) {
-                                        devfs_put(old);
-                                        devfs_put(dir);
-                                        return NULL;
-                                }
-                                de = old;       /*  Use the existing directory  */
-                        }
-                }
-                if (de == dir->parent) {
-                        devfs_put(dir);
-                        devfs_put(de);
-                        return NULL;
-                }
-                devfs_put(dir);
-                dir = de;
-                if (name[next_pos] == '/')
-                        ++next_pos;
-        }
-        return dir;
-}                               /*  End Function _devfs_make_parent_for_leaf  */
-static devfs_handle_t _devfs_prepare_leaf(devfs_handle_t * dir,
-                                          const char *name, umode_t mode)
-{
-        int namelen, leaf_pos;
-        struct devfs_entry *de;
-        namelen = strlen(name);
-        if ((*dir = _devfs_make_parent_for_leaf(*dir, name, namelen,
-                                                &leaf_pos)) == NULL) {
-                PRINTK("(%s): could not create parent path\n", name);
-                return NULL;
-        }
-        if ((de = _devfs_alloc_entry(name + leaf_pos, namelen - leaf_pos, mode))
-            == NULL) {
-                PRINTK("(%s): could not allocate entry\n", name);
-                devfs_put(*dir);
-                return NULL;
-        }
-        return de;
-}                               /*  End Function _devfs_prepare_leaf  */
-static devfs_handle_t _devfs_walk_path(struct devfs_entry *dir,
-                                       const char *name, int namelen,
-                                       int traverse_symlink)
-{
-        int next_pos = 0;
-        if (dir == NULL)
-                dir = _devfs_get_root_entry();
-        if (dir == NULL)
-                return NULL;
-        devfs_get(dir);
-        for (; namelen > 0; name += next_pos, namelen -= next_pos) {
-                struct devfs_entry *de, *link;
-                if (!S_ISDIR(dir->mode)) {
-                        devfs_put(dir);
-                        return NULL;
-                }
-                if ((de =
-                     _devfs_descend(dir, name, namelen, &next_pos)) == NULL) {
-                        devfs_put(dir);
-                        return NULL;
-                }
-                if (S_ISLNK(de->mode) && traverse_symlink) {    /*  Need to follow the link: this is a stack chomper  */
-                        /* FIXME what if it puts outside of mounted tree? */
-                        link = _devfs_walk_path(dir, de->u.symlink.linkname,
-                                                de->u.symlink.length, TRUE);
-                        devfs_put(de);
-                        if (!link) {
-                                devfs_put(dir);
-                                return NULL;
-                        }
-                        de = link;
-                }
-                devfs_put(dir);
-                dir = de;
-                if (name[next_pos] == '/')
-                        ++next_pos;
-        }
-        return dir;
-}                               /*  End Function _devfs_walk_path  */
-/**
- *      _devfs_find_entry - Find a devfs entry.
- *      @dir: The handle to the parent devfs directory entry. If this is %NULL the
- *              name is relative to the root of the devfs.
- *      @name: The name of the entry. This may be %NULL.
- *      @traverse_symlink: If %TRUE then symbolic links are traversed.
- *
- *      Returns the devfs_entry pointer on success, else %NULL. An implicit
- *      devfs_get() is performed.
- */
-static struct devfs_entry *_devfs_find_entry(devfs_handle_t dir,
-                                             const char *name,
-                                             int traverse_symlink)
-{
-        unsigned int namelen = strlen(name);
-        if (name[0] == '/') {
-                /*  Skip leading pathname component  */
-                if (namelen < 2) {
-                        PRINTK("(%s): too short\n", name);
-                        return NULL;
-                }
-                for (++name, --namelen; (*name != '/') && (namelen > 0);
-                     ++name, --namelen) ;
-                if (namelen < 2) {
-                        PRINTK("(%s): too short\n", name);
-                        return NULL;
-                }
-                ++name;
-                --namelen;
-        }
-        return _devfs_walk_path(dir, name, namelen, traverse_symlink);
-}                               /*  End Function _devfs_find_entry  */
-static struct devfs_entry *get_devfs_entry_from_vfs_inode(struct inode *inode)
-{
-        if (inode == NULL)
-                return NULL;
-        VERIFY_ENTRY((struct devfs_entry *)inode->u.generic_ip);
-        return inode->u.generic_ip;
-}                               /*  End Function get_devfs_entry_from_vfs_inode  */
-/**
- *      free_dentry - Free the dentry for a device entry and invalidate inode.
- *      @de: The entry.
- *
- *      This must only be called after the entry has been unhooked from its
- *       parent directory.
- */
-static void free_dentry(struct devfs_entry *de)
-{
-        struct dentry *dentry = de->inode.dentry;
-        if (!dentry)
-                return;
-        spin_lock(&dcache_lock);
-        dget_locked(dentry);
-        spin_unlock(&dcache_lock);
-        /*  Forcefully remove the inode  */
-        if (dentry->d_inode != NULL)
-                dentry->d_inode->i_nlink = 0;
-        d_drop(dentry);
-        dput(dentry);
-}                               /*  End Function free_dentry  */
-/**
- *      is_devfsd_or_child - Test if the current process is devfsd or one of its children.
- *      @fs_info: The filesystem information.
- *
- *      Returns %TRUE if devfsd or child, else %FALSE.
- */
-static int is_devfsd_or_child(struct fs_info *fs_info)
-{
-        struct task_struct *p = current;
-        if (p == fs_info->devfsd_task)
-                return (TRUE);
-        if (process_group(p) == fs_info->devfsd_pgrp)
-                return (TRUE);
-        read_lock(&tasklist_lock);
-        for (; p != &init_task; p = p->real_parent) {
-                if (p == fs_info->devfsd_task) {
-                        read_unlock(&tasklist_lock);
-                        return (TRUE);
-                }
-        }
-        read_unlock(&tasklist_lock);
-        return (FALSE);
-}                               /*  End Function is_devfsd_or_child  */
-/**
- *      devfsd_queue_empty - Test if devfsd has work pending in its event queue.
- *      @fs_info: The filesystem information.
- *
- *      Returns %TRUE if the queue is empty, else %FALSE.
- */
-static inline int devfsd_queue_empty(struct fs_info *fs_info)
-{
-        return (fs_info->devfsd_last_event) ? FALSE : TRUE;
-}                               /*  End Function devfsd_queue_empty  */
-/**
- *      wait_for_devfsd_finished - Wait for devfsd to finish processing its event queue.
- *      @fs_info: The filesystem information.
- *
- *      Returns %TRUE if no more waiting will be required, else %FALSE.
- */
-static int wait_for_devfsd_finished(struct fs_info *fs_info)
-{
-        DECLARE_WAITQUEUE(wait, current);
-        if (fs_info->devfsd_task == NULL)
-                return (TRUE);
-        if (devfsd_queue_empty(fs_info) && fs_info->devfsd_sleeping)
-                return TRUE;
-        if (is_devfsd_or_child(fs_info))
-                return (FALSE);
-        set_current_state(TASK_UNINTERRUPTIBLE);
-        add_wait_queue(&fs_info->revalidate_wait_queue, &wait);
-        if (!devfsd_queue_empty(fs_info) || !fs_info->devfsd_sleeping)
-                if (fs_info->devfsd_task)
-                        schedule();
-        remove_wait_queue(&fs_info->revalidate_wait_queue, &wait);
-        __set_current_state(TASK_RUNNING);
-        return (TRUE);
-}                               /*  End Function wait_for_devfsd_finished  */
-/**
- *      devfsd_notify_de - Notify the devfsd daemon of a change.
- *      @de: The devfs entry that has changed. This and all parent entries will
- *            have their reference counts incremented if the event was queued.
- *      @type: The type of change.
- *      @mode: The mode of the entry.
- *      @uid: The user ID.
- *      @gid: The group ID.
- *      @fs_info: The filesystem info.
- *
- *      Returns %TRUE if an event was queued and devfsd woken up, else %FALSE.
- */
-static int devfsd_notify_de(struct devfs_entry *de,
-                            unsigned short type, umode_t mode,
-                            uid_t uid, gid_t gid, struct fs_info *fs_info)
-{
-        struct devfsd_buf_entry *entry;
-        struct devfs_entry *curr;
-        if (!(fs_info->devfsd_event_mask & (1 << type)))
-                return (FALSE);
-        if ((entry = kmem_cache_alloc(devfsd_buf_cache, SLAB_KERNEL)) == NULL) {
-                atomic_inc(&fs_info->devfsd_overrun_count);
-                return (FALSE);
-        }
-        for (curr = de; curr != NULL; curr = curr->parent)
-                devfs_get(curr);
-        entry->de = de;
-        entry->type = type;
-        entry->mode = mode;
-        entry->uid = uid;
-        entry->gid = gid;
-        entry->next = NULL;
-        spin_lock(&fs_info->devfsd_buffer_lock);
-        if (!fs_info->devfsd_first_event)
-                fs_info->devfsd_first_event = entry;
-        if (fs_info->devfsd_last_event)
-                fs_info->devfsd_last_event->next = entry;
-        fs_info->devfsd_last_event = entry;
-        spin_unlock(&fs_info->devfsd_buffer_lock);
-        wake_up_interruptible(&fs_info->devfsd_wait_queue);
-        return (TRUE);
-}                               /*  End Function devfsd_notify_de  */
-/**
- *      devfsd_notify - Notify the devfsd daemon of a change.
- *      @de: The devfs entry that has changed.
- *      @type: The type of change event.
- *      @wait: If TRUE, the function waits for the daemon to finish processing
- *              the event.
- */
-static void devfsd_notify(struct devfs_entry *de, unsigned short type)
-{
-        devfsd_notify_de(de, type, de->mode, current->euid,
-                         current->egid, &fs_info);
-}
-static int devfs_mk_dev(dev_t dev, umode_t mode, const char *fmt, va_list args)
-{
-        struct devfs_entry *dir = NULL, *de;
-        char buf[64];
-        int error, n;
-        n = vsnprintf(buf, sizeof(buf), fmt, args);
-        if (n >= sizeof(buf) || !buf[0]) {
-                printk(KERN_WARNING "%s: invalid format string %s\n",
-                       __FUNCTION__, fmt);
-                return -EINVAL;
-        }
-        de = _devfs_prepare_leaf(&dir, buf, mode);
-        if (!de) {
-                printk(KERN_WARNING "%s: could not prepare leaf for %s\n",
-                       __FUNCTION__, buf);
-                return -ENOMEM; /* could be more accurate... */
-        }
-        de->u.dev = dev;
-        error = _devfs_append_entry(dir, de, NULL);
-        if (error) {
-                printk(KERN_WARNING "%s: could not append to parent for %s\n",
-                       __FUNCTION__, buf);
-                goto out;
-        }
-        devfsd_notify(de, DEVFSD_NOTIFY_REGISTERED);
-      out:
-        devfs_put(dir);
-        return error;
-}
-int devfs_mk_bdev(dev_t dev, umode_t mode, const char *fmt, ...)
-{
-        va_list args;
-        if (!S_ISBLK(mode)) {
-                printk(KERN_WARNING "%s: invalide mode (%u) for %s\n",
-                       __FUNCTION__, mode, fmt);
-                return -EINVAL;
-        }
-        va_start(args, fmt);
-        return devfs_mk_dev(dev, mode, fmt, args);
-}
-EXPORT_SYMBOL(devfs_mk_bdev);
-int devfs_mk_cdev(dev_t dev, umode_t mode, const char *fmt, ...)
-{
-        va_list args;
-        if (!S_ISCHR(mode)) {
-                printk(KERN_WARNING "%s: invalide mode (%u) for %s\n",
-                       __FUNCTION__, mode, fmt);
-                return -EINVAL;
-        }
-        va_start(args, fmt);
-        return devfs_mk_dev(dev, mode, fmt, args);
-}
-EXPORT_SYMBOL(devfs_mk_cdev);
-/**
- *      _devfs_unhook - Unhook a device entry from its parents list
- *      @de: The entry to unhook.
- *
- *      Returns %TRUE if the entry was unhooked, else %FALSE if it was
- *              previously unhooked.
- *      The caller must have a write lock on the parent directory.
- */
-static int _devfs_unhook(struct devfs_entry *de)
-{
-        struct devfs_entry *parent;
-        if (!de || (de->prev == de))
-                return FALSE;
-        parent = de->parent;
-        if (de->prev == NULL)
-                parent->u.dir.first = de->next;
-        else
-                de->prev->next = de->next;
-        if (de->next == NULL)
-                parent->u.dir.last = de->prev;
-        else
-                de->next->prev = de->prev;
-        de->prev = de;          /*  Indicate we're unhooked                      */
-        de->next = NULL;        /*  Force early termination for <devfs_readdir>  */
-        return TRUE;
-}                               /*  End Function _devfs_unhook  */
-/**
- *      _devfs_unregister - Unregister a device entry from its parent.
- *      @dir: The parent directory.
- *      @de: The entry to unregister.
- *
- *      The caller must have a write lock on the parent directory, which is
- *      unlocked by this function.
- */
-static void _devfs_unregister(struct devfs_entry *dir, struct devfs_entry *de)
-{
-        int unhooked = _devfs_unhook(de);
-        write_unlock(&dir->u.dir.lock);
-        if (!unhooked)
-                return;
-        devfs_get(dir);
-        devfsd_notify(de, DEVFSD_NOTIFY_UNREGISTERED);
-        free_dentry(de);
-        devfs_put(dir);
-        if (!S_ISDIR(de->mode))
-                return;
-        while (TRUE) {          /*  Recursively unregister: this is a stack chomper  */
-                struct devfs_entry *child;
-                write_lock(&de->u.dir.lock);
-                de->u.dir.no_more_additions = TRUE;
-                child = de->u.dir.first;
-                VERIFY_ENTRY(child);
-                _devfs_unregister(de, child);
-                if (!child)
-                        break;
-                DPRINTK(DEBUG_UNREGISTER, "(%s): child: %p  refcount: %d\n",
-                        child->name, child, atomic_read(&child->refcount));
-                devfs_put(child);
-        }
-}                               /*  End Function _devfs_unregister  */
-/*
- *  devfs_do_symlink - Build a symlink entry and hook it into the devfs tree.
- *  @dir: Starting directory handed to _devfs_prepare_leaf()
- *        (devfs_mk_symlink() passes NULL here).
- *  @name: Name of the new symlink entry.
- *  @link: NUL-terminated link target; a private copy is stored in the entry.
- *  @handle: If not NULL, set to NULL up front and to the new entry on success.
- *
- *  Returns 0 on success, else a negative error code.
- */
-static int devfs_do_symlink(devfs_handle_t dir, const char *name,
-                            const char *link, devfs_handle_t * handle)
-{
-        struct devfs_entry *entry;
-        unsigned int target_len;
-        char *target_copy;
-        int rc;
-        if (handle != NULL)
-                *handle = NULL;
-        if (name == NULL) {
-                PRINTK("(): NULL name pointer\n");
-                return -EINVAL;
-        }
-        if (link == NULL) {
-                PRINTK("(%s): NULL link pointer\n", name);
-                return -EINVAL;
-        }
-        /*  Take a private, terminated copy of the link target  */
-        target_len = strlen(link);
-        target_copy = kmalloc(target_len + 1, GFP_KERNEL);
-        if (target_copy == NULL)
-                return -ENOMEM;
-        memcpy(target_copy, link, target_len);
-        target_copy[target_len] = '\0';
-        entry = _devfs_prepare_leaf(&dir, name, S_IFLNK | S_IRUGO | S_IXUGO);
-        if (entry == NULL) {
-                PRINTK("(%s): could not prepare leaf\n", name);
-                kfree(target_copy);
-                return -ENOTDIR;
-        }
-        entry->u.symlink.length = target_len;
-        entry->u.symlink.linkname = target_copy;
-        entry->info = NULL;
-        rc = _devfs_append_entry(dir, entry, NULL);
-        if (rc != 0) {
-                PRINTK("(%s): could not append to parent, err: %d\n", name,
-                       rc);
-        }
-        /*  dir is released whether or not the append worked  */
-        devfs_put(dir);
-        if (rc != 0)
-                return rc;
-#ifdef CONFIG_DEVFS_DEBUG
-        spin_lock(&stat_lock);
-        stat_num_bytes += target_len + 1;
-        spin_unlock(&stat_lock);
-#endif
-        if (handle != NULL)
-                *handle = entry;
-        return 0;
-}                               /*  End Function devfs_do_symlink  */
-/**
- *      devfs_mk_symlink - Create a symbolic link in the devfs namespace.
- *      @from: The name of the new symlink entry.
- *      @to: The target the link points to.
- *
- *      The entry is created through devfs_do_symlink() with a NULL starting
- *      directory. On success its vfs flag is set and devfsd is told that the
- *      entry was registered.
- *
- *      Returns 0 on success, else a negative error code is returned.
- */
-int devfs_mk_symlink(const char *from, const char *to)
-{
-        devfs_handle_t link_entry;
-        int rc = devfs_do_symlink(NULL, from, to, &link_entry);
-        if (rc != 0)
-                return rc;
-        link_entry->vfs = TRUE;
-        devfsd_notify(link_entry, DEVFSD_NOTIFY_REGISTERED);
-        return 0;
-}
-/**
- *      devfs_mk_dir - Create a directory in the devfs namespace.
- *              new name is relative to the root of the devfs.
- *      @fmt: printf-style format for the name of the entry. The formatted
- *              name must be non-empty and shorter than 64 bytes.
- *
- *      Use of this function is optional. The devfs_register() function
- *      will automatically create intermediate directories as needed.
- *
- *      Returns 0 on success, including the case where a directory of that
- *      name already exists, else a negative error code is returned.
- */
-int devfs_mk_dir(const char *fmt, ...)
-{
-        /*  old starts out NULL so the devfs_put(old) below never sees garbage  */
-        struct devfs_entry *dir = NULL, *de = NULL, *old = NULL;
-        char buf[64];
-        va_list args;
-        int error, n;
-        va_start(args, fmt);
-        n = vsnprintf(buf, sizeof(buf), fmt, args);
-        va_end(args);           /*  Every va_start() needs a matching va_end()  */
-        if (n >= (int)sizeof(buf) || !buf[0]) {
-                printk(KERN_WARNING "%s: invalid argument.\n", __FUNCTION__);
-                return -EINVAL;
-        }
-        de = _devfs_prepare_leaf(&dir, buf, MODE_DIR);
-        if (!de) {
-                PRINTK("(%s): could not prepare leaf\n", buf);
-                return -EINVAL;
-        }
-        error = _devfs_append_entry(dir, de, &old);
-        if (error == -EEXIST && S_ISDIR(old->mode)) {
-                /*
-                 * devfs_mk_dir() of an already-existing directory will
-                 * return success.
-                 */
-                error = 0;
-                goto out_put;
-        } else if (error) {
-                PRINTK("(%s): could not append to dir: %p \"%s\"\n",
-                       buf, dir, dir->name);
-                devfs_put(old);
-                goto out_put;
-        }
-        devfsd_notify(de, DEVFSD_NOTIFY_REGISTERED);
-      out_put:
-        devfs_put(dir);
-        return error;
-}
-/**
- *      devfs_remove - Remove an entry from the devfs namespace.
- *      @fmt: printf-style format for the name of the entry to remove.
- *
- *      Nothing is done when the formatted name is empty or does not fit in
- *      the 64-byte name buffer. When no entry of that name is found an error
- *      is logged together with a stack dump.
- */
-void devfs_remove(const char *fmt, ...)
-{
-        char buf[64];
-        va_list args;
-        int n;
-        va_start(args, fmt);
-        n = vsnprintf(buf, sizeof(buf), fmt, args);
-        va_end(args);           /*  Every va_start() needs a matching va_end()  */
-        /*  Explicit form of the old signed/unsigned "n < sizeof(buf)" test  */
-        if (n >= 0 && (size_t)n < sizeof(buf) && buf[0]) {
-                devfs_handle_t de = _devfs_find_entry(NULL, buf, 0);
-                if (!de) {
-                        printk(KERN_ERR "%s: %s not found, cannot remove\n",
-                               __FUNCTION__, buf);
-                        dump_stack();
-                        return;
-                }
-                /*  _devfs_unregister() is entered with the parent's lock held  */
-                write_lock(&de->parent->u.dir.lock);
-                _devfs_unregister(de->parent, de);
-                /*  NOTE(review): presumably one put for the lookup reference
-                    and one for the registration itself - confirm  */
-                devfs_put(de);
-                devfs_put(de);
-        }
-}
-/**
- *      devfs_generate_path - Generate a pathname for an entry, relative to the devfs root.
- *      @de: The devfs entry.
- *      @path: The buffer to write the pathname to. The pathname is built
- *              right-to-left, so it and its '\0' terminator end up at the
- *              end of the buffer.
- *      @buflen: The length of the buffer.
- *
- *      The root entry itself contributes no component to the pathname.
- *
- *      Returns the offset in the buffer where the pathname starts on success,
- *      else a negative error code (-EINVAL for a NULL entry, -ENAMETOOLONG
- *      when the buffer is too small).
- */
-static int devfs_generate_path(devfs_handle_t de, char *path, int buflen)
-{
-        devfs_handle_t node;
-        int offset;
-#define NAMEOF(de) ( (de)->mode ? (de)->name : (de)->u.name )
-        if (de == NULL)
-                return -EINVAL;
-        VERIFY_ENTRY(de);
-        if (de->namelen >= buflen)
-                return -ENAMETOOLONG;   /*  Must be first       */
-        path[buflen - 1] = '\0';
-        if (de->parent == NULL)
-                return buflen - 1;      /*  Root: empty pathname  */
-        /*  Leaf component goes in last place, just before the terminator  */
-        offset = buflen - de->namelen - 1;
-        memcpy(path + offset, NAMEOF(de), de->namelen);
-        /*  Walk towards the root, prepending "<name>/" for each ancestor  */
-        node = de->parent;
-        while (node->parent != NULL) {
-                if (offset - node->namelen - 1 < 0)
-                        return -ENAMETOOLONG;
-                offset--;
-                path[offset] = '/';
-                offset -= node->namelen;
-                memcpy(path + offset, NAMEOF(node), node->namelen);
-                node = node->parent;
-        }
-        return offset;
-}                               /*  End Function devfs_generate_path  */
-/**
- *      devfs_setup - Process kernel boot options.
- *      @str: The boot options after the "devfs=".
- *
- *      The options form a comma-separated list. Each option may carry a "no"
- *      prefix, which clears the option's mask bits instead of setting them.
- *
- *      Returns 1 when parsing stops at the end of the string or at
- *      whitespace, and 0 when an option is unknown or is not followed by a
- *      comma.
- */
-static int __init devfs_setup(char *str)
-{
-        static struct {
-                char *name;
-                unsigned int mask;
-                unsigned int *opt;
-        } devfs_options_tab[] __initdata = {
-#ifdef CONFIG_DEVFS_DEBUG
-                {"dall", DEBUG_ALL, &devfs_debug_init},
-                {"dmod", DEBUG_MODULE_LOAD, &devfs_debug_init},
-                {"dreg", DEBUG_REGISTER, &devfs_debug_init},
-                {"dunreg", DEBUG_UNREGISTER, &devfs_debug_init},
-                {"dfree", DEBUG_FREE, &devfs_debug_init},
-                {"diget", DEBUG_I_GET, &devfs_debug_init},
-                {"dchange", DEBUG_SET_FLAGS, &devfs_debug_init},
-                {"dsread", DEBUG_S_READ, &devfs_debug_init},
-                {"dichange", DEBUG_I_CHANGE, &devfs_debug_init},
-                {"dimknod", DEBUG_I_MKNOD, &devfs_debug_init},
-                {"dilookup", DEBUG_I_LOOKUP, &devfs_debug_init},
-                {"diunlink", DEBUG_I_UNLINK, &devfs_debug_init},
-#endif                          /*  CONFIG_DEVFS_DEBUG  */
-                {"mount", OPTION_MOUNT, &boot_options},
-                {NULL, 0, NULL} /*  Terminator  */
-        };
-        for (;;) {
-                int idx, negate;
-                if (*str == '\0' || isspace(*str))
-                        return 1;
-                negate = (strncmp(str, "no", 2) == 0);
-                if (negate)
-                        str += 2;
-                /*  First table entry whose name prefixes the input wins  */
-                for (idx = 0; devfs_options_tab[idx].name != NULL; idx++) {
-                        int len = strlen(devfs_options_tab[idx].name);
-                        if (strncmp(str, devfs_options_tab[idx].name, len) != 0)
-                                continue;
-                        if (negate)
-                                *devfs_options_tab[idx].opt &=
-                                    ~devfs_options_tab[idx].mask;
-                        else
-                                *devfs_options_tab[idx].opt |=
-                                    devfs_options_tab[idx].mask;
-                        str += len;
-                        break;
-                }
-                if (devfs_options_tab[idx].name == NULL)
-                        return 0;       /*  Ran off the table: no match  */
-                if (*str != ',')
-                        return 0;       /*  No more options  */
-                ++str;
-        }
-}                               /*  End Function devfs_setup  */
-__setup("devfs=", devfs_setup);        /*  Hand the "devfs=" boot parameter to devfs_setup()  */
-EXPORT_SYMBOL(devfs_mk_dir);           /*  Exported for use outside this file  */
-EXPORT_SYMBOL(devfs_remove);           /*  Exported for use outside this file  */
-/**
- *      try_modload - Notify devfsd of an inode lookup by a non-devfsd process.
- *      @parent: The parent devfs entry.
- *      @fs_info: The filesystem info.
- *      @name: The device name.
- *      @namelen: The number of characters in @name.
- *      @buf: A working area that will be used. This must not go out of scope
- *            until devfsd is idle again.
- *
- *      @buf is filled in as a stand-in entry (refcount 1, pointing at @parent
- *      and @name) and passed to devfsd as a LOOKUP event.
- *
- *      Returns 0 on success (event was queued), else a negative error code.
- */
-static int try_modload(struct devfs_entry *parent, struct fs_info *fs_info,
-                       const char *name, unsigned namelen,
-                       struct devfs_entry *buf)
-{
-        /*  Nothing to do if LOOKUP events are masked off, or if the caller is
-            devfsd (or one of its children) itself  */
-        if (!(fs_info->devfsd_event_mask & (1 << DEVFSD_NOTIFY_LOOKUP)) ||
-            is_devfsd_or_child(fs_info))
-                return -ENOENT;
-        memset(buf, 0, sizeof *buf);
-        buf->u.name = name;
-        buf->namelen = namelen;
-        buf->parent = parent;
-        atomic_set(&buf->refcount, 1);
-        WRITE_ENTRY_MAGIC(buf, MAGIC_VALUE);
-        if (devfsd_notify_de(buf, DEVFSD_NOTIFY_LOOKUP, 0,
-                             current->euid, current->egid, fs_info))
-                return 0;       /*  Possible success: event has been queued  */
-        return -ENOENT;
-}                               /*  End Function try_modload  */
-/*  Superblock operations follow  */
-static struct inode_operations devfs_iops;     /*  Default i_op set by _devfs_get_vfs_inode()  */
-static struct inode_operations devfs_dir_iops; /*  i_op used for directories  */
-static const struct file_operations devfs_fops;        /*  Defined below; .open is devfs_open()  */
-static const struct file_operations devfs_dir_fops;    /*  i_fop used for directories  */
-static struct inode_operations devfs_symlink_iops;     /*  i_op used for symlinks  */
-/*
- *  devfs_notify_change - Apply an attribute change to a devfs inode.
- *  @dentry: The dentry whose inode is being changed.
- *  @iattr: The requested attribute changes.
- *
- *  After the generic checks and update succeed, the new mode, owner and
- *  timestamps are mirrored into the devfs entry. devfsd is sent a CHANGE
- *  event when the mode, uid or gid was part of the request and the caller is
- *  not devfsd or one of its children.
- *
- *  Returns 0 on success, -ENODEV if the inode has no devfs entry, else the
- *  error from inode_change_ok() or inode_setattr().
- */
-static int devfs_notify_change(struct dentry *dentry, struct iattr *iattr)
-{
-        struct inode *inode = dentry->d_inode;
-        struct fs_info *fs_info = inode->i_sb->s_fs_info;
-        struct devfs_entry *de = get_devfs_entry_from_vfs_inode(inode);
-        int rc;
-        if (de == NULL)
-                return -ENODEV;
-        rc = inode_change_ok(inode, iattr);
-        if (rc == 0)
-                rc = inode_setattr(inode, iattr);
-        if (rc != 0)
-                return rc;
-        DPRINTK(DEBUG_I_CHANGE, "(%d): VFS inode: %p  devfs_entry: %p\n",
-                (int)inode->i_ino, inode, de);
-        DPRINTK(DEBUG_I_CHANGE, "():   mode: 0%o  uid: %d  gid: %d\n",
-                (int)inode->i_mode, (int)inode->i_uid, (int)inode->i_gid);
-        /*  Inode is not on hash chains, thus must save permissions here rather
-           than in a write_inode() method  */
-        de->inode.ctime = inode->i_ctime;
-        de->inode.mtime = inode->i_mtime;
-        de->inode.atime = inode->i_atime;
-        de->inode.gid = inode->i_gid;
-        de->inode.uid = inode->i_uid;
-        de->mode = inode->i_mode;
-        if (!(iattr->ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID)))
-                return 0;       /*  Nothing devfsd cares about was changed  */
-        if (is_devfsd_or_child(fs_info))
-                return 0;
-        devfsd_notify_de(de, DEVFSD_NOTIFY_CHANGE, inode->i_mode,
-                         inode->i_uid, inode->i_gid, fs_info);
-        return 0;
-}                               /*  End Function devfs_notify_change  */
-static struct super_operations devfs_sops = {  /*  Superblock methods: no devfs-specific code, only generic helpers  */
-        .drop_inode = generic_delete_inode,
-        .statfs = simple_statfs,
-};
-/**
- *      _devfs_get_vfs_inode - Get a VFS inode.
- *      @sb: The super block.
- *      @de: The devfs inode.
- *      @dentry: The dentry to register with the devfs inode.
- *
- *      A fresh inode is allocated and filled in from @de. The inode and file
- *      operations are chosen from the file type in @de's mode.
- *
- *      Returns the inode on success, else %NULL (entry unhooked, allocation
- *      failure or unknown file type). An implicit devfs_get() is performed if
- *      the inode is created.
- */
-static struct inode *_devfs_get_vfs_inode(struct super_block *sb,
-                                          struct devfs_entry *de,
-                                          struct dentry *dentry)
-{
-        struct inode *inode;
-        if (de->prev == de)
-                return NULL;    /*  Quick check to see if unhooked  */
-        inode = new_inode(sb);
-        if (inode == NULL) {
-                PRINTK("(%s): new_inode() failed, de: %p\n", de->name, de);
-                return NULL;
-        }
-        if (de->parent == NULL) {
-                de->inode.dentry = dentry;      /*  Root: no locking needed  */
-        } else {
-                /*  Re-check under the parent's lock before recording the dentry  */
-                read_lock(&de->parent->u.dir.lock);
-                if (de->prev != de)
-                        de->inode.dentry = dentry;      /*      Not unhooked  */
-                read_unlock(&de->parent->u.dir.lock);
-        }
-        if (de->inode.dentry != dentry) {       /*  Must have been unhooked  */
-                iput(inode);
-                return NULL;
-        }
-        /* FIXME where is devfs_put? */
-        inode->u.generic_ip = devfs_get(de);
-        inode->i_ino = de->inode.ino;
-        DPRINTK(DEBUG_I_GET, "(%d): VFS inode: %p  devfs_entry: %p\n",
-                (int)inode->i_ino, inode, de);
-        inode->i_blksize = FAKE_BLOCK_SIZE;
-        inode->i_blocks = 0;
-        inode->i_op = &devfs_iops;      /*  Default; directories and symlinks override it  */
-        inode->i_mode = de->mode;
-        switch (de->mode & S_IFMT) {
-        case S_IFDIR:
-                inode->i_op = &devfs_dir_iops;
-                inode->i_fop = &devfs_dir_fops;
-                break;
-        case S_IFLNK:
-                inode->i_op = &devfs_symlink_iops;
-                inode->i_size = de->u.symlink.length;
-                break;
-        case S_IFCHR:
-        case S_IFBLK:
-                init_special_inode(inode, de->mode, de->u.dev);
-                break;
-        case S_IFIFO:
-        case S_IFSOCK:
-                init_special_inode(inode, de->mode, 0);
-                break;
-        default:
-                PRINTK("(%s): unknown mode %o de: %p\n",
-                       de->name, de->mode, de);
-                iput(inode);
-                devfs_put(de);
-                return NULL;
-        }
-        inode->i_ctime = de->inode.ctime;
-        inode->i_mtime = de->inode.mtime;
-        inode->i_atime = de->inode.atime;
-        inode->i_gid = de->inode.gid;
-        inode->i_uid = de->inode.uid;
-        DPRINTK(DEBUG_I_GET, "():   mode: 0%o  uid: %d  gid: %d\n",
-                (int)inode->i_mode, (int)inode->i_uid, (int)inode->i_gid);
-        return inode;
-}                               /*  End Function _devfs_get_vfs_inode  */
-/*  File operations for device entries follow
- *
- *  devfs_readdir - Feed the entries of a devfs directory to a filldir callback.
- *  @file: The open directory; file->f_pos selects where to resume.
- *  @dirent: Opaque cookie passed through to @filldir.
- *  @filldir: Callback invoked once per emitted entry.
- *
- *  Position 0 is "..", position 1 is "." and the children of the directory
- *  follow from position 2. A -EINVAL from @filldir ends the walk quietly; any
- *  other negative value from @filldir is returned as-is. Otherwise the return
- *  value is the number of entries stored by this call. A negative f_pos
- *  yields -EINVAL.
- */
-static int devfs_readdir(struct file *file, void *dirent, filldir_t filldir)
-{
-        int err, count;
-        int stored = 0;
-        struct fs_info *fs_info;
-        struct devfs_entry *parent, *de, *next = NULL;
-        struct inode *inode = file->f_dentry->d_inode;
-        fs_info = inode->i_sb->s_fs_info;
-        parent = get_devfs_entry_from_vfs_inode(file->f_dentry->d_inode);
-        if ((long)file->f_pos < 0)
-                return -EINVAL;
-        DPRINTK(DEBUG_F_READDIR, "(%s): fs_info: %p  pos: %ld\n",
-                parent->name, fs_info, (long)file->f_pos);
-        switch ((long)file->f_pos) {
-        case 0:
-                err = (*filldir) (dirent, "..", 2, file->f_pos,
-                                  parent_ino(file->f_dentry), DT_DIR);
-                if (err == -EINVAL)
-                        break;
-                if (err < 0)
-                        return err;
-                file->f_pos++;
-                ++stored;
-                /*  Fall through  */
-        case 1:
-                err =
-                    (*filldir) (dirent, ".", 1, file->f_pos, inode->i_ino,
-                                DT_DIR);
-                if (err == -EINVAL)
-                        break;
-                if (err < 0)
-                        return err;
-                file->f_pos++;
-                ++stored;
-                /*  Fall through  */
-        default:
-                /*  Skip entries  */
-                count = file->f_pos - 2;        /*  Children already consumed by earlier calls  */
-                read_lock(&parent->u.dir.lock);
-                for (de = parent->u.dir.first; de && (count > 0); de = de->next)
-                        --count;
-                devfs_get(de);  /*  Taken under the lock; de is NULL here if the list ran out  */
-                read_unlock(&parent->u.dir.lock);
-                /*  Now add all remaining entries  */
-                while (de) {
-                        err = (*filldir) (dirent, de->name, de->namelen,
-                                          file->f_pos, de->inode.ino,
-                                          de->mode >> 12);      /*  File-type bits of the mode, shifted down  */
-                        if (err < 0)
-                                devfs_put(de);  /*  Both exits below leave the loop, so release de now  */
-                        else {
-                                file->f_pos++;
-                                ++stored;
-                        }
-                        if (err == -EINVAL)
-                                break;
-                        if (err < 0)
-                                return err;
-                        read_lock(&parent->u.dir.lock);
-                        next = devfs_get(de->next);     /*  Get the successor before releasing de  */
-                        read_unlock(&parent->u.dir.lock);
-                        devfs_put(de);
-                        de = next;
-                }
-                break;
-        }
-        return stored;
-}                               /*  End Function devfs_readdir  */
-/*
- * Open devfs specific special files.
- *
- * The minor number of the inode selects the real file operations: minor 0 is
- * /dev/.devfsd and, with CONFIG_DEVFS_DEBUG, minor 1 is /dev/.stat. The
- * file's f_op is switched over to the selected operations and their open
- * method, if any, is called; on failure the previous f_op is put back.
- *
- * Returns 0 or the error from the selected open method, or -ENODEV for an
- * unknown minor or when the operations could not be obtained.
- */
-static int devfs_open(struct inode *inode, struct file *file)
-{
-        struct file_operations *old_fops, *new_fops;
-        int minor = MINOR(inode->i_rdev);
-        int err = 0;
-        if (minor == 0)         /* /dev/.devfsd */
-                new_fops = fops_get(&devfsd_fops);
-#ifdef CONFIG_DEVFS_DEBUG
-        else if (minor == 1)    /* /dev/.stat */
-                new_fops = fops_get(&stat_fops);
-#endif
-        else
-                return -ENODEV;
-        if (new_fops == NULL)
-                return -ENODEV;
-        old_fops = file->f_op;
-        file->f_op = new_fops;
-        if (new_fops->open)
-                err = new_fops->open(inode, file);
-        if (!err) {
-                /*  Switch-over succeeded: the old operations are released  */
-                fops_put(old_fops);
-                return err;
-        }
-        file->f_op = old_fops;
-        fops_put(new_fops);
-        return err;
-}                               /*  End Function devfs_open  */
-static const struct file_operations devfs_fops = {     /*  Only .open is set; devfs_open() swaps in the real operations  */
-        .open = devfs_open,
-};
-static const struct file_operations devfs_dir_fops = { /*  File operations installed on directory inodes  */
-        .read = generic_read_dir,
-        .readdir = devfs_readdir,
-};
-/*  Dentry operations for device entries follow  */
-/**
- *      devfs_d_release - Callback for when a dentry is freed.
- *      @dentry: The dentry.
- *
- *      Does nothing beyond emitting a DEBUG_D_RELEASE message that shows the
- *      dentry and its inode pointer.
- */
-static void devfs_d_release(struct dentry *dentry)
-{
-        DPRINTK(DEBUG_D_RELEASE, "(%p): inode: %p\n", dentry, dentry->d_inode);
-}                               /*  End Function devfs_d_release  */
-/**
- *      devfs_d_iput - Callback for when a dentry loses its inode.
- *      @dentry: The dentry.
- *      @inode: The inode.
- *
- *      Clears the dentry recorded in the devfs entry behind @inode, then
- *      releases the inode and calls devfs_put() on the entry. An OOPS message
- *      is raised if the entry recorded some other dentry.
- */
-static void devfs_d_iput(struct dentry *dentry, struct inode *inode)
-{
-        struct devfs_entry *entry = get_devfs_entry_from_vfs_inode(inode);
-        DPRINTK(DEBUG_D_IPUT,
-                "(%s): dentry: %p inode: %p de: %p de->dentry: %p\n",
-                entry->name, dentry, inode, entry, entry->inode.dentry);
-        if (entry->inode.dentry != NULL && entry->inode.dentry != dentry)
-                OOPS("(%s): de: %p dentry: %p de->dentry: %p\n",
-                     entry->name, entry, dentry, entry->inode.dentry);
-        entry->inode.dentry = NULL;
-        iput(inode);
-        devfs_put(entry);
-}                               /*  End Function devfs_d_iput  */
-static int devfs_d_delete(struct dentry *dentry);      /*  Defined below  */
-static struct dentry_operations devfs_dops = { /*  Normal dentry operations, set by devfs_lookup()  */
-        .d_delete = devfs_d_delete,
-        .d_release = devfs_d_release,
-        .d_iput = devfs_d_iput,
-};
-static int devfs_d_revalidate_wait(struct dentry *dentry, struct nameidata *);  /*  Defined below  */
-static struct dentry_operations devfs_wait_dops = {    /*  Same as devfs_dops plus a revalidate hook; used while a lookup is pending  */
-        .d_delete = devfs_d_delete,
-        .d_release = devfs_d_release,
-        .d_iput = devfs_d_iput,
-        .d_revalidate = devfs_d_revalidate_wait,
-};
-/**
- *      devfs_d_delete - Callback for when all files for a dentry are closed.
- *      @dentry: The dentry.
- *
- *      A dentry still carrying the waiting operations is switched back to the
- *      normal ones.
- *
- *      Returns 1 for a negative dentry (no inode), else 0.
- */
-static int devfs_d_delete(struct dentry *dentry)
-{
-        struct inode *backing_inode = dentry->d_inode;
-        if (dentry->d_op == &devfs_wait_dops)
-                dentry->d_op = &devfs_dops;
-        if (backing_inode != NULL)
-                return 0;
-        /*  Unhash dentry if negative (has no inode)  */
-        DPRINTK(DEBUG_D_DELETE, "(%p): dropping negative dentry\n",
-                dentry);
-        return 1;
-}                               /*  End Function devfs_d_delete  */
-struct devfs_lookup_struct {   /*  Per-lookup state that devfs_lookup() hangs off dentry->d_fsdata  */
-        devfs_handle_t de;      /*  Entry found for the name, or NULL if none yet  */
-        wait_queue_head_t wait_queue;   /*  Revalidators sleep here until devfs_lookup() wakes them  */
-};
-/* XXX: this doesn't handle the case where we got a negative dentry
-        but a devfs entry has been registered in the meanwhile
-
-   devfs_d_revalidate_wait - Revalidate a dentry while a lookup is pending.
-
-   devfsd (or a child of it) does not wait: if the dentry has no inode yet, it
-   looks up the entry and instantiates an inode for it. Any other caller
-   sleeps on the lookup's wait queue for as long as dentry->d_fsdata is still
-   set, dropping the directory's i_mutex around the sleep.
-
-   Always returns 1. */
-static int devfs_d_revalidate_wait(struct dentry *dentry, struct nameidata *nd)
-{
-        struct inode *dir = dentry->d_parent->d_inode;
-        struct fs_info *fs_info = dir->i_sb->s_fs_info;
-        devfs_handle_t parent = get_devfs_entry_from_vfs_inode(dir);
-        struct devfs_lookup_struct *lookup_info = dentry->d_fsdata;
-        DECLARE_WAITQUEUE(wait, current);
-        int need_lock;
-        /*
-         * FIXME HACK
-         *
-         * make sure that
-         *   d_instantiate always runs under lock
-         *   we release i_mutex lock before going to sleep
-         *
-         * unfortunately sometimes d_revalidate is called with
-         * and sometimes without i_mutex lock held. The following checks
-         * attempt to deduce when we need to add (and drop resp.) lock
-         * here. This relies on current (2.6.2) calling conventions:
-         *
-         *   lookup_hash is always run under i_mutex and is passing NULL
-         *   as nd
-         *
-         *   open(...,O_CREATE,...) calls _lookup_hash under i_mutex
-         *   and sets flags to LOOKUP_OPEN|LOOKUP_CREATE
-         *
-         *   all other invocations of ->d_revalidate seem to happen
-         *   outside of i_mutex
-         */
-        need_lock = nd &&
-            (!(nd->flags & LOOKUP_CREATE) || (nd->flags & LOOKUP_PARENT));
-        if (need_lock)
-                mutex_lock(&dir->i_mutex);
-        if (is_devfsd_or_child(fs_info)) {
-                devfs_handle_t de = lookup_info->de;    /*  NOTE(review): lookup_info is only NULL-checked further down - confirm it cannot be NULL here  */
-                struct inode *inode;
-                DPRINTK(DEBUG_I_LOOKUP,
-                        "(%s): dentry: %p inode: %p de: %p by: \"%s\"\n",
-                        dentry->d_name.name, dentry, dentry->d_inode, de,
-                        current->comm);
-                if (dentry->d_inode)
-                        goto out;       /*  Already instantiated  */
-                if (de == NULL) {
-                        read_lock(&parent->u.dir.lock);
-                        de = _devfs_search_dir(parent, dentry->d_name.name,
-                                               dentry->d_name.len);
-                        read_unlock(&parent->u.dir.lock);
-                        if (de == NULL)
-                                goto out;
-                        lookup_info->de = de;   /*  Recorded so devfs_lookup() picks it up after waking  */
-                }
-                /*  Create an inode, now that the driver information is available  */
-                inode = _devfs_get_vfs_inode(dir->i_sb, de, dentry);
-                if (!inode)
-                        goto out;
-                DPRINTK(DEBUG_I_LOOKUP,
-                        "(%s): new VFS inode(%u): %p de: %p by: \"%s\"\n",
-                        de->name, de->inode.ino, inode, de, current->comm);
-                d_instantiate(dentry, inode);
-                goto out;
-        }
-        if (lookup_info == NULL)
-                goto out;       /*  Early termination  */
-        read_lock(&parent->u.dir.lock);
-        if (dentry->d_fsdata) { /*  Lookup still pending: devfs_lookup() clears d_fsdata under the write lock  */
-                set_current_state(TASK_UNINTERRUPTIBLE);
-                add_wait_queue(&lookup_info->wait_queue, &wait);
-                read_unlock(&parent->u.dir.lock);
-                /* at this point it is always (hopefully) locked */
-                mutex_unlock(&dir->i_mutex);    /*  Done regardless of need_lock  */
-                schedule();
-                mutex_lock(&dir->i_mutex);
-                /*
-                 * This does not need nor should remove wait from wait_queue.
-                 * Wait queue head is never reused - nothing is ever added to it
-                 * after all waiters have been waked up and head itself disappears
-                 * very soon after it. Moreover it is local variable on stack that
-                 * is likely to have already disappeared so any reference to it
-                 * at this point is buggy.
-                 */
-        } else
-                read_unlock(&parent->u.dir.lock);
-      out:
-        if (need_lock)
-                mutex_unlock(&dir->i_mutex);
-        return 1;
-}                               /*  End Function devfs_d_revalidate_wait  */
-/*  Inode operations for device entries follow
- *
- *  devfs_lookup - Look up a name in a devfs directory.
- *  @dir: The directory inode; its i_mutex is dropped and re-taken in here.
- *  @dentry: The dentry being looked up.
- *  @nd: Not used by this function.
- *
- *  If the name has no entry yet, devfsd is asked to provide one via
- *  try_modload(). The dentry is hashed as negative with the waiting dentry
- *  operations, the function waits for devfsd to finish, and an inode is then
- *  instantiated if an entry exists by that time.
- *
- *  Returns NULL normally (the dentry may be left negative),
- *  ERR_PTR(-ENOENT) if @dir has no devfs entry, or ERR_PTR(-ENOMEM) if no
- *  inode could be obtained for an existing entry.
- *
- *  NOTE(review): on the early return after try_modload() fails,
- *  dentry->d_fsdata is left pointing at the on-stack lookup_info - confirm
- *  nothing reads it afterwards.
- */
-static struct dentry *devfs_lookup(struct inode *dir, struct dentry *dentry,
-                                   struct nameidata *nd)
-{
-        struct devfs_entry tmp; /*  Must stay in scope until devfsd idle again  */
-        struct devfs_lookup_struct lookup_info;
-        struct fs_info *fs_info = dir->i_sb->s_fs_info;
-        struct devfs_entry *parent, *de;
-        struct inode *inode;
-        struct dentry *retval = NULL;
-        /*  Set up the dentry operations before anything else, to ensure cleaning
-           up on any error  */
-        dentry->d_op = &devfs_dops;
-        /*  First try to get the devfs entry for this directory  */
-        parent = get_devfs_entry_from_vfs_inode(dir);
-        DPRINTK(DEBUG_I_LOOKUP, "(%s): dentry: %p parent: %p by: \"%s\"\n",
-                dentry->d_name.name, dentry, parent, current->comm);
-        if (parent == NULL)
-                return ERR_PTR(-ENOENT);
-        read_lock(&parent->u.dir.lock);
-        de = _devfs_search_dir(parent, dentry->d_name.name, dentry->d_name.len);
-        read_unlock(&parent->u.dir.lock);
-        lookup_info.de = de;
-        init_waitqueue_head(&lookup_info.wait_queue);
-        dentry->d_fsdata = &lookup_info;        /*  Read by devfs_d_revalidate_wait()  */
-        if (de == NULL) {       /*  Try with devfsd. For any kind of failure, leave a negative dentry
-                                   so someone else can deal with it (in the case where the sysadmin
-                                   does a mknod()). It's important to do this before hashing the
-                                   dentry, so that the devfsd queue is filled before revalidates
-                                   can start  */
-                if (try_modload(parent, fs_info, dentry->d_name.name, dentry->d_name.len, &tmp) < 0) {  /*  Lookup event was not queued to devfsd  */
-                        d_add(dentry, NULL);
-                        return NULL;
-                }
-        }
-        dentry->d_op = &devfs_wait_dops;        /*  Revalidations now go through devfs_d_revalidate_wait()  */
-        d_add(dentry, NULL);    /*  Open the floodgates  */
-        /*  Unlock directory semaphore, which will release any waiters. They
-           will get the hashed dentry, and may be forced to wait for
-           revalidation  */
-        mutex_unlock(&dir->i_mutex);
-        wait_for_devfsd_finished(fs_info);      /*  If I'm not devfsd, must wait  */
-        mutex_lock(&dir->i_mutex);      /*  Grab it again because them's the rules  */
-        de = lookup_info.de;    /*  May have been filled in by devfs_d_revalidate_wait() meanwhile  */
-        /*  If someone else has been so kind as to make the inode, we go home
-           early  */
-        if (dentry->d_inode)
-                goto out;
-        if (de == NULL) {
-                read_lock(&parent->u.dir.lock);
-                de = _devfs_search_dir(parent, dentry->d_name.name,
-                                       dentry->d_name.len);
-                read_unlock(&parent->u.dir.lock);
-                if (de == NULL)
-                        goto out;
-                /*  OK, there's an entry now, but no VFS inode yet  */
-        }
-        /*  Create an inode, now that the driver information is available  */
-        inode = _devfs_get_vfs_inode(dir->i_sb, de, dentry);
-        if (!inode) {
-                retval = ERR_PTR(-ENOMEM);
-                goto out;
-        }
-        DPRINTK(DEBUG_I_LOOKUP,
-                "(%s): new VFS inode(%u): %p de: %p by: \"%s\"\n", de->name,
-                de->inode.ino, inode, de, current->comm);
-        d_instantiate(dentry, inode);
-      out:
-        write_lock(&parent->u.dir.lock);
-        dentry->d_op = &devfs_dops;     /*  Back to the normal, non-waiting operations  */
-        dentry->d_fsdata = NULL;        /*  lookup_info is on this stack frame and is about to vanish  */
-        wake_up(&lookup_info.wait_queue);
-        write_unlock(&parent->u.dir.lock);
-        devfs_put(de);  /*  de can still be NULL when this is reached  */
-        return retval;
-}                               /*  End Function devfs_lookup  */
-/*
- *  devfs_unlink - Remove a non-directory entry that was created through the VFS.
- *  @dir: The directory inode containing the entry.
- *  @dentry: The dentry of the entry to remove.
- *
- *  Only entries whose vfs flag is set may be removed this way. devfsd is sent
- *  a DELETE event unless the caller is devfsd or one of its children.
- *
- *  Returns 0 on success, -ENOENT if there is no devfs entry or it was already
- *  unhooked, or -EPERM if the entry's vfs flag is not set.
- */
-static int devfs_unlink(struct inode *dir, struct dentry *dentry)
-{
-        struct inode *inode = dentry->d_inode;
-        struct fs_info *fs_info = dir->i_sb->s_fs_info;
-        struct devfs_entry *victim = get_devfs_entry_from_vfs_inode(inode);
-        int was_hooked;
-        DPRINTK(DEBUG_I_UNLINK, "(%s): de: %p\n", dentry->d_name.name, victim);
-        if (victim == NULL)
-                return -ENOENT;
-        if (!victim->vfs)
-                return -EPERM;
-        /*  Detach from the parent's list under the parent's write lock  */
-        write_lock(&victim->parent->u.dir.lock);
-        was_hooked = _devfs_unhook(victim);
-        write_unlock(&victim->parent->u.dir.lock);
-        if (was_hooked == 0)
-                return -ENOENT;
-        if (!is_devfsd_or_child(fs_info))
-                devfsd_notify_de(victim, DEVFSD_NOTIFY_DELETE, inode->i_mode,
-                                 inode->i_uid, inode->i_gid, fs_info);
-        free_dentry(victim);
-        devfs_put(victim);
-        return 0;
-}                               /*  End Function devfs_unlink  */
-/*
- *  devfs_symlink - Create a symlink in a devfs directory through the VFS.
- *  @dir: The directory inode the link is created in.
- *  @dentry: The dentry for the new link.
- *  @symname: The link target.
- *
- *  The new entry gets its vfs flag set and is owned by the caller's
- *  effective uid/gid. devfsd is sent a CREATE event unless the caller is
- *  devfsd or one of its children.
- *
- *  Returns 0 on success, -ENOENT if @dir has no devfs entry, -ENOMEM if no
- *  inode could be obtained, else the error from devfs_do_symlink().
- */
-static int devfs_symlink(struct inode *dir, struct dentry *dentry,
-                         const char *symname)
-{
-        struct fs_info *fs_info = dir->i_sb->s_fs_info;
-        struct devfs_entry *parent_de, *link_de;
-        struct inode *inode;
-        int rc;
-        /*  First try to get the devfs entry for this directory  */
-        parent_de = get_devfs_entry_from_vfs_inode(dir);
-        if (parent_de == NULL)
-                return -ENOENT;
-        rc = devfs_do_symlink(parent_de, dentry->d_name.name, symname,
-                              &link_de);
-        DPRINTK(DEBUG_DISABLED, "(%s): errcode from <devfs_do_symlink>: %d\n",
-                dentry->d_name.name, rc);
-        if (rc < 0)
-                return rc;
-        link_de->inode.gid = current->egid;
-        link_de->inode.uid = current->euid;
-        link_de->vfs = TRUE;
-        link_de->inode.atime = CURRENT_TIME;
-        link_de->inode.mtime = CURRENT_TIME;
-        link_de->inode.ctime = CURRENT_TIME;
-        inode = _devfs_get_vfs_inode(dir->i_sb, link_de, dentry);
-        if (inode == NULL)
-                return -ENOMEM;
-        DPRINTK(DEBUG_DISABLED, "(%s): new VFS inode(%u): %p  dentry: %p\n",
-                dentry->d_name.name, link_de->inode.ino, inode, dentry);
-        d_instantiate(dentry, inode);
-        if (is_devfsd_or_child(fs_info))
-                return 0;
-        devfsd_notify_de(link_de, DEVFSD_NOTIFY_CREATE, inode->i_mode,
-                         inode->i_uid, inode->i_gid, fs_info);
-        return 0;
-}                               /*  End Function devfs_symlink  */
-/*
- *  devfs_mkdir - Create a directory in a devfs directory through the VFS.
- *  @dir: The directory inode the new directory is created in.
- *  @dentry: The dentry for the new directory.
- *  @mode: Permission bits; the file-type part is forced to S_IFDIR.
- *
- *  The new entry gets its vfs flag set and is owned by the caller's
- *  effective uid/gid. devfsd is sent a CREATE event unless the caller is
- *  devfsd or one of its children.
- *
- *  Returns 0 on success, -ENOENT if @dir has no devfs entry, -ENOMEM if the
- *  entry or its inode could not be obtained, else the error from
- *  _devfs_append_entry().
- */
-static int devfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
-{
-        struct fs_info *fs_info = dir->i_sb->s_fs_info;
-        struct devfs_entry *parent_de, *new_de;
-        struct inode *inode;
-        int rc;
-        mode = (mode & ~S_IFMT) | S_IFDIR;      /*  VFS doesn't pass S_IFMT part  */
-        parent_de = get_devfs_entry_from_vfs_inode(dir);
-        if (parent_de == NULL)
-                return -ENOENT;
-        new_de = _devfs_alloc_entry(dentry->d_name.name, dentry->d_name.len,
-                                    mode);
-        if (new_de == NULL)
-                return -ENOMEM;
-        new_de->vfs = TRUE;
-        rc = _devfs_append_entry(parent_de, new_de, NULL);
-        if (rc != 0)
-                return rc;
-        new_de->inode.gid = current->egid;
-        new_de->inode.uid = current->euid;
-        new_de->inode.atime = CURRENT_TIME;
-        new_de->inode.mtime = CURRENT_TIME;
-        new_de->inode.ctime = CURRENT_TIME;
-        inode = _devfs_get_vfs_inode(dir->i_sb, new_de, dentry);
-        if (inode == NULL)
-                return -ENOMEM;
-        DPRINTK(DEBUG_DISABLED, "(%s): new VFS inode(%u): %p  dentry: %p\n",
-                dentry->d_name.name, new_de->inode.ino, inode, dentry);
-        d_instantiate(dentry, inode);
-        if (is_devfsd_or_child(fs_info))
-                return 0;
-        devfsd_notify_de(new_de, DEVFSD_NOTIFY_CREATE, inode->i_mode,
-                         inode->i_uid, inode->i_gid, fs_info);
-        return 0;
-}                               /*  End Function devfs_mkdir  */
-static int devfs_rmdir(struct inode *dir, struct dentry *dentry)
-{
-        int err = 0;
-        struct devfs_entry *de;
-        struct fs_info *fs_info = dir->i_sb->s_fs_info;
-        struct inode *inode = dentry->d_inode;
-        if (dir->i_sb->s_fs_info != inode->i_sb->s_fs_info)
-                return -EINVAL;
-        de = get_devfs_entry_from_vfs_inode(inode);
-        if (de == NULL)
-                return -ENOENT;
-        if (!S_ISDIR(de->mode))
-                return -ENOTDIR;
-        if (!de->vfs)
-                return -EPERM;
-        /*  First ensure the directory is empty and will stay that way  */
-        write_lock(&de->u.dir.lock);
-        if (de->u.dir.first)
-                err = -ENOTEMPTY;
-        else
-                de->u.dir.no_more_additions = TRUE;
-        write_unlock(&de->u.dir.lock);
-        if (err)
-                return err;
-        /*  Now unhook the directory from its parent  */
-        write_lock(&de->parent->u.dir.lock);
-        if (!_devfs_unhook(de))
-                err = -ENOENT;
-        write_unlock(&de->parent->u.dir.lock);
-        if (err)
-                return err;
-        if (!is_devfsd_or_child(fs_info))
-                devfsd_notify_de(de, DEVFSD_NOTIFY_DELETE, inode->i_mode,
-                                 inode->i_uid, inode->i_gid, fs_info);
-        free_dentry(de);
-        devfs_put(de);
-        return 0;
-}                               /*  End Function devfs_rmdir  */
-static int devfs_mknod(struct inode *dir, struct dentry *dentry, int mode,
-                       dev_t rdev)
-{
-        int err;
-        struct fs_info *fs_info = dir->i_sb->s_fs_info;
-        struct devfs_entry *parent, *de;
-        struct inode *inode;
-        DPRINTK(DEBUG_I_MKNOD, "(%s): mode: 0%o  dev: %u:%u\n",
-                dentry->d_name.name, mode, MAJOR(rdev), MINOR(rdev));
-        parent = get_devfs_entry_from_vfs_inode(dir);
-        if (parent == NULL)
-                return -ENOENT;
-        de = _devfs_alloc_entry(dentry->d_name.name, dentry->d_name.len, mode);
-        if (!de)
-                return -ENOMEM;
-        de->vfs = TRUE;
-        if (S_ISCHR(mode) || S_ISBLK(mode))
-                de->u.dev = rdev;
-        if ((err = _devfs_append_entry(parent, de, NULL)) != 0)
-                return err;
-        de->inode.uid = current->euid;
-        de->inode.gid = current->egid;
-        de->inode.atime = CURRENT_TIME;
-        de->inode.mtime = CURRENT_TIME;
-        de->inode.ctime = CURRENT_TIME;
-        if ((inode = _devfs_get_vfs_inode(dir->i_sb, de, dentry)) == NULL)
-                return -ENOMEM;
-        DPRINTK(DEBUG_I_MKNOD, ":   new VFS inode(%u): %p  dentry: %p\n",
-                de->inode.ino, inode, dentry);
-        d_instantiate(dentry, inode);
-        if (!is_devfsd_or_child(fs_info))
-                devfsd_notify_de(de, DEVFSD_NOTIFY_CREATE, inode->i_mode,
-                                 inode->i_uid, inode->i_gid, fs_info);
-        return 0;
-}                               /*  End Function devfs_mknod  */
-static void *devfs_follow_link(struct dentry *dentry, struct nameidata *nd)
-{
-        struct devfs_entry *p = get_devfs_entry_from_vfs_inode(dentry->d_inode);
-        nd_set_link(nd, p ? p->u.symlink.linkname : ERR_PTR(-ENODEV));
-        return NULL;
-}                               /*  End Function devfs_follow_link  */
-static struct inode_operations devfs_iops = {
-        .setattr = devfs_notify_change,
-};
-static struct inode_operations devfs_dir_iops = {
-        .lookup = devfs_lookup,
-        .unlink = devfs_unlink,
-        .symlink = devfs_symlink,
-        .mkdir = devfs_mkdir,
-        .rmdir = devfs_rmdir,
-        .mknod = devfs_mknod,
-        .setattr = devfs_notify_change,
-};
-static struct inode_operations devfs_symlink_iops = {
-        .readlink = generic_readlink,
-        .follow_link = devfs_follow_link,
-        .setattr = devfs_notify_change,
-};
-static int devfs_fill_super(struct super_block *sb, void *data, int silent)
-{
-        struct inode *root_inode = NULL;
-        if (_devfs_get_root_entry() == NULL)
-                goto out_no_root;
-        atomic_set(&fs_info.devfsd_overrun_count, 0);
-        init_waitqueue_head(&fs_info.devfsd_wait_queue);
-        init_waitqueue_head(&fs_info.revalidate_wait_queue);
-        fs_info.sb = sb;
-        sb->s_fs_info = &fs_info;
-        sb->s_blocksize = 1024;
-        sb->s_blocksize_bits = 10;
-        sb->s_magic = DEVFS_SUPER_MAGIC;
-        sb->s_op = &devfs_sops;
-        sb->s_time_gran = 1;
-        if ((root_inode = _devfs_get_vfs_inode(sb, root_entry, NULL)) == NULL)
-                goto out_no_root;
-        sb->s_root = d_alloc_root(root_inode);
-        if (!sb->s_root)
-                goto out_no_root;
-        DPRINTK(DEBUG_S_READ, "(): made devfs ptr: %p\n", sb->s_fs_info);
-        return 0;
-      out_no_root:
-        PRINTK("(): get root inode failed\n");
-        if (root_inode)
-                iput(root_inode);
-        return -EINVAL;
-}                               /*  End Function devfs_fill_super  */
-static struct super_block *devfs_get_sb(struct file_system_type *fs_type,
-                                        int flags, const char *dev_name,
-                                        void *data)
-{
-        return get_sb_single(fs_type, flags, data, devfs_fill_super);
-}
-static struct file_system_type devfs_fs_type = {
-        .name = DEVFS_NAME,
-        .get_sb = devfs_get_sb,
-        .kill_sb = kill_anon_super,
-};
-/*  File operations for devfsd follow  */
-static ssize_t devfsd_read(struct file *file, char __user *buf, size_t len,
-                           loff_t * ppos)
-{
-        int done = FALSE;
-        int ival;
-        loff_t pos, devname_offset, tlen, rpos;
-        devfs_handle_t de;
-        struct devfsd_buf_entry *entry;
-        struct fs_info *fs_info = file->f_dentry->d_inode->i_sb->s_fs_info;
-        struct devfsd_notify_struct *info = fs_info->devfsd_info;
-        DECLARE_WAITQUEUE(wait, current);
-        /*  Verify the task has grabbed the queue  */
-        if (fs_info->devfsd_task != current)
-                return -EPERM;
-        info->major = 0;
-        info->minor = 0;
-        /*  Block for a new entry  */
-        set_current_state(TASK_INTERRUPTIBLE);
-        add_wait_queue(&fs_info->devfsd_wait_queue, &wait);
-        while (devfsd_queue_empty(fs_info)) {
-                fs_info->devfsd_sleeping = TRUE;
-                wake_up(&fs_info->revalidate_wait_queue);
-                schedule();
-                fs_info->devfsd_sleeping = FALSE;
-                if (signal_pending(current)) {
-                        remove_wait_queue(&fs_info->devfsd_wait_queue, &wait);
-                        __set_current_state(TASK_RUNNING);
-                        return -EINTR;
-                }
-                set_current_state(TASK_INTERRUPTIBLE);
-        }
-        remove_wait_queue(&fs_info->devfsd_wait_queue, &wait);
-        __set_current_state(TASK_RUNNING);
-        /*  Now play with the data  */
-        ival = atomic_read(&fs_info->devfsd_overrun_count);
-        info->overrun_count = ival;
-        entry = fs_info->devfsd_first_event;
-        info->type = entry->type;
-        info->mode = entry->mode;
-        info->uid = entry->uid;
-        info->gid = entry->gid;
-        de = entry->de;
-        if (S_ISCHR(de->mode) || S_ISBLK(de->mode)) {
-                info->major = MAJOR(de->u.dev);
-                info->minor = MINOR(de->u.dev);
-        }
-        pos = devfs_generate_path(de, info->devname, DEVFS_PATHLEN);
-        if (pos < 0)
-                return pos;
-        info->namelen = DEVFS_PATHLEN - pos - 1;
-        if (info->mode == 0)
-                info->mode = de->mode;
-        devname_offset = info->devname - (char *)info;
-        rpos = *ppos;
-        if (rpos < devname_offset) {
-                /*  Copy parts of the header  */
-                tlen = devname_offset - rpos;
-                if (tlen > len)
-                        tlen = len;
-                if (copy_to_user(buf, (char *)info + rpos, tlen)) {
-                        return -EFAULT;
-                }
-                rpos += tlen;
-                buf += tlen;
-                len -= tlen;
-        }
-        if ((rpos >= devname_offset) && (len > 0)) {
-                /*  Copy the name  */
-                tlen = info->namelen + 1;
-                if (tlen > len)
-                        tlen = len;
-                else
-                        done = TRUE;
-                if (copy_to_user
-                    (buf, info->devname + pos + rpos - devname_offset, tlen)) {
-                        return -EFAULT;
-                }
-                rpos += tlen;
-        }
-        tlen = rpos - *ppos;
-        if (done) {
-                devfs_handle_t parent;
-                spin_lock(&fs_info->devfsd_buffer_lock);
-                fs_info->devfsd_first_event = entry->next;
-                if (entry->next == NULL)
-                        fs_info->devfsd_last_event = NULL;
-                spin_unlock(&fs_info->devfsd_buffer_lock);
-                for (; de != NULL; de = parent) {
-                        parent = de->parent;
-                        devfs_put(de);
-                }
-                kmem_cache_free(devfsd_buf_cache, entry);
-                if (ival > 0)
-                        atomic_sub(ival, &fs_info->devfsd_overrun_count);
-                *ppos = 0;
-        } else
-                *ppos = rpos;
-        return tlen;
-}                               /*  End Function devfsd_read  */
-static int devfsd_ioctl(struct inode *inode, struct file *file,
-                        unsigned int cmd, unsigned long arg)
-{
-        int ival;
-        struct fs_info *fs_info = inode->i_sb->s_fs_info;
-        switch (cmd) {
-        case DEVFSDIOC_GET_PROTO_REV:
-                ival = DEVFSD_PROTOCOL_REVISION_KERNEL;
-                if (copy_to_user((void __user *)arg, &ival, sizeof ival))
-                        return -EFAULT;
-                break;
-        case DEVFSDIOC_SET_EVENT_MASK:
-                /*  Ensure only one reader has access to the queue. This scheme will
-                   work even if the global kernel lock were to be removed, because it
-                   doesn't matter who gets in first, as long as only one gets it  */
-                if (fs_info->devfsd_task == NULL) {
-                        static DEFINE_SPINLOCK(lock);
-                        if (!spin_trylock(&lock))
-                                return -EBUSY;
-                        if (fs_info->devfsd_task != NULL) {     /*  We lost the race...  */
-                                spin_unlock(&lock);
-                                return -EBUSY;
-                        }
-                        fs_info->devfsd_task = current;
-                        spin_unlock(&lock);
-                        fs_info->devfsd_pgrp =
-                            (process_group(current) ==
-                             current->pid) ? process_group(current) : 0;
-                        fs_info->devfsd_file = file;
-                        fs_info->devfsd_info =
-                            kmalloc(sizeof *fs_info->devfsd_info, GFP_KERNEL);
-                        if (!fs_info->devfsd_info) {
-                                devfsd_close(inode, file);
-                                return -ENOMEM;
-                        }
-                } else if (fs_info->devfsd_task != current)
-                        return -EBUSY;
-                fs_info->devfsd_event_mask = arg;       /*  Let the masses come forth  */
-                break;
-        case DEVFSDIOC_RELEASE_EVENT_QUEUE:
-                if (fs_info->devfsd_file != file)
-                        return -EPERM;
-                return devfsd_close(inode, file);
-                /*break; */
-#ifdef CONFIG_DEVFS_DEBUG
-        case DEVFSDIOC_SET_DEBUG_MASK:
-                if (copy_from_user(&ival, (void __user *)arg, sizeof ival))
-                        return -EFAULT;
-                devfs_debug = ival;
-                break;
-#endif
-        default:
-                return -ENOIOCTLCMD;
-        }
-        return 0;
-}                               /*  End Function devfsd_ioctl  */
-static int devfsd_close(struct inode *inode, struct file *file)
-{
-        struct devfsd_buf_entry *entry, *next;
-        struct fs_info *fs_info = inode->i_sb->s_fs_info;
-        if (fs_info->devfsd_file != file)
-                return 0;
-        fs_info->devfsd_event_mask = 0;
-        fs_info->devfsd_file = NULL;
-        spin_lock(&fs_info->devfsd_buffer_lock);
-        entry = fs_info->devfsd_first_event;
-        fs_info->devfsd_first_event = NULL;
-        fs_info->devfsd_last_event = NULL;
-        kfree(fs_info->devfsd_info);
-        fs_info->devfsd_info = NULL;
-        spin_unlock(&fs_info->devfsd_buffer_lock);
-        fs_info->devfsd_pgrp = 0;
-        fs_info->devfsd_task = NULL;
-        wake_up(&fs_info->revalidate_wait_queue);
-        for (; entry; entry = next) {
-                next = entry->next;
-                kmem_cache_free(devfsd_buf_cache, entry);
-        }
-        return 0;
-}                               /*  End Function devfsd_close  */
-#ifdef CONFIG_DEVFS_DEBUG
-static ssize_t stat_read(struct file *file, char __user *buf, size_t len,
-                         loff_t * ppos)
-{
-        ssize_t num;
-        char txt[80];
-        num = sprintf(txt, "Number of entries: %u  number of bytes: %u\n",
-                      stat_num_entries, stat_num_bytes) + 1;
-        if (*ppos >= num)
-                return 0;
-        if (*ppos + len > num)
-                len = num - *ppos;
-        if (copy_to_user(buf, txt + *ppos, len))
-                return -EFAULT;
-        *ppos += len;
-        return len;
-}                               /*  End Function stat_read  */
-#endif
-static int __init init_devfs_fs(void)
-{
-        int err;
-        int major;
-        struct devfs_entry *devfsd;
-#ifdef CONFIG_DEVFS_DEBUG
-        struct devfs_entry *stat;
-#endif
-        if (_devfs_get_root_entry() == NULL)
-                return -ENOMEM;
-        printk(KERN_INFO "%s: %s Richard Gooch (rgooch@atnf.csiro.au)\n",
-               DEVFS_NAME, DEVFS_VERSION);
-        devfsd_buf_cache = kmem_cache_create("devfsd_event",
-                                             sizeof(struct devfsd_buf_entry),
-                                             0, 0, NULL, NULL);
-        if (!devfsd_buf_cache)
-                OOPS("(): unable to allocate event slab\n");
-#ifdef CONFIG_DEVFS_DEBUG
-        devfs_debug = devfs_debug_init;
-        printk(KERN_INFO "%s: devfs_debug: 0x%0x\n", DEVFS_NAME, devfs_debug);
-#endif
-        printk(KERN_INFO "%s: boot_options: 0x%0x\n", DEVFS_NAME, boot_options);
-        /* register special device for devfsd communication */
-        major = register_chrdev(0, "devfs", &devfs_fops);
-        if (major < 0)
-                return major;
-        /*  And create the entry for ".devfsd"  */
-        devfsd = _devfs_alloc_entry(".devfsd", 0, S_IFCHR | S_IRUSR | S_IWUSR);
-        if (devfsd == NULL)
-                return -ENOMEM;
-        devfsd->u.dev = MKDEV(major, 0);
-        _devfs_append_entry(root_entry, devfsd, NULL);
-#ifdef CONFIG_DEVFS_DEBUG
-        stat = _devfs_alloc_entry(".stat", 0, S_IFCHR | S_IRUGO);
-        if (stat == NULL)
-                return -ENOMEM;
-        stat->u.dev = MKDEV(major, 1);
-        _devfs_append_entry(root_entry, stat, NULL);
-#endif
-        err = register_filesystem(&devfs_fs_type);
-        return err;
-}                               /*  End Function init_devfs_fs  */
-void __init mount_devfs_fs(void)
-{
-        int err;
-        if (!(boot_options & OPTION_MOUNT))
-                return;
-        err = do_mount("none", "/dev", "devfs", 0, NULL);
-        if (err == 0)
-                printk(KERN_INFO "Mounted devfs on /dev\n");
-        else
-                PRINTK("(): unable to mount devfs, err: %d\n", err);
-}                               /*  End Function mount_devfs_fs  */
-module_init(init_devfs_fs)
diff --git a/fs/devfs/util.c b/fs/devfs/util.c
deleted file mode 100644
index db06d388c9ac..000000000000
--- a/fs/devfs/util.c
+++ /dev/null
@@ -1,97 +0,0 @@
-/*  devfs (Device FileSystem) utilities.
-    Copyright (C) 1999-2002  Richard Gooch
-    This library is free software; you can redistribute it and/or
-    modify it under the terms of the GNU Library General Public
-    License as published by the Free Software Foundation; either
-    version 2 of the License, or (at your option) any later version.
-    This library is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-    Library General Public License for more details.
-    You should have received a copy of the GNU Library General Public
-    License along with this library; if not, write to the Free
-    Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-    Richard Gooch may be reached by email at  rgooch@atnf.csiro.au
-    The postal address is:
-      Richard Gooch, c/o ATNF, P. O. Box 76, Epping, N.S.W., 2121, Australia.
-    ChangeLog
-    19991031   Richard Gooch <rgooch@atnf.csiro.au>
-               Created.
-    19991103   Richard Gooch <rgooch@atnf.csiro.au>
-               Created <_devfs_convert_name> and supported SCSI and IDE CD-ROMs
-    20000203   Richard Gooch <rgooch@atnf.csiro.au>
-               Changed operations pointer type to void *.
-    20000621   Richard Gooch <rgooch@atnf.csiro.au>
-               Changed interface to <devfs_register_series>.
-    20000622   Richard Gooch <rgooch@atnf.csiro.au>
-               Took account of interface change to <devfs_mk_symlink>.
-               Took account of interface change to <devfs_mk_dir>.
-    20010519   Richard Gooch <rgooch@atnf.csiro.au>
-               Documentation cleanup.
-    20010709   Richard Gooch <rgooch@atnf.csiro.au>
-               Created <devfs_*alloc_major> and <devfs_*alloc_devnum>.
-    20010710   Richard Gooch <rgooch@atnf.csiro.au>
-               Created <devfs_*alloc_unique_number>.
-    20010730   Richard Gooch <rgooch@atnf.csiro.au>
-               Documentation typo fix.
-    20010806   Richard Gooch <rgooch@atnf.csiro.au>
-               Made <block_semaphore> and <char_semaphore> private.
-    20010813   Richard Gooch <rgooch@atnf.csiro.au>
-               Fixed bug in <devfs_alloc_unique_number>: limited to 128 numbers
-    20010818   Richard Gooch <rgooch@atnf.csiro.au>
-               Updated major masks up to Linus' "no new majors" proclamation.
-               Block: were 126 now 122 free, char: were 26 now 19 free.
-    20020324   Richard Gooch <rgooch@atnf.csiro.au>
-               Fixed bug in <devfs_alloc_unique_number>: was clearing beyond
-               bitfield.
-    20020326   Richard Gooch <rgooch@atnf.csiro.au>
-               Fixed bitfield data type for <devfs_*alloc_devnum>.
-               Made major bitfield type and initialiser 64 bit safe.
-    20020413   Richard Gooch <rgooch@atnf.csiro.au>
-               Fixed shift warning on 64 bit machines.
-    20020428   Richard Gooch <rgooch@atnf.csiro.au>
-               Copied and used macro for error messages from fs/devfs/base.c 
-    20021013   Richard Gooch <rgooch@atnf.csiro.au>
-               Documentation fix.
-    20030101   Adam J. Richter <adam@yggdrasil.com>
-               Eliminate DEVFS_SPECIAL_{CHR,BLK}.  Use mode_t instead.
-    20030106   Christoph Hellwig <hch@infradead.org>
-               Rewrite devfs_{,de}alloc_devnum to look like C code.
-*/
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/devfs_fs_kernel.h>
-#include <linux/slab.h>
-#include <linux/vmalloc.h>
-#include <linux/genhd.h>
-#include <linux/bitops.h>
-int devfs_register_tape(const char *name)
-{
-        char tname[32], dest[64];
-        static unsigned int tape_counter;
-        unsigned int n = tape_counter++;
-        sprintf(dest, "../%s", name);
-        sprintf(tname, "tapes/tape%u", n);
-        devfs_mk_symlink(tname, dest);
-        return n;
-}
-EXPORT_SYMBOL(devfs_register_tape);
-void devfs_unregister_tape(int num)
-{
-        if (num >= 0)
-                devfs_remove("tapes/tape%u", num);
-}
-EXPORT_SYMBOL(devfs_unregister_tape);
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 14c5620b5cab..f7aef5bb584a 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -130,10 +130,10 @@ fail:
        return -ENOMEM;
 }
-static struct super_block *devpts_get_sb(struct file_system_type *fs_type,
+static int devpts_get_sb(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data)
+        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-        return get_sb_single(fs_type, flags, data, devpts_fill_super);
+        return get_sb_single(fs_type, flags, data, devpts_fill_super, mnt);
 }
 static struct file_system_type devpts_fs_type = {
diff --git a/fs/direct-io.c b/fs/direct-io.c
index b05d1b218776..538fb0418fba 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -162,7 +162,7 @@ static int dio_refill_pages(struct dio *dio)
                NULL);                          /* vmas */
        up_read(&current->mm->mmap_sem);
-        if (ret < 0 && dio->blocks_available && (dio->rw == WRITE)) {
+        if (ret < 0 && dio->blocks_available && (dio->rw & WRITE)) {
                struct page *page = ZERO_PAGE(dio->curr_user_address);
                /*
                 * A memory fault, but the filesystem has some outstanding
@@ -535,7 +535,7 @@ static int get_more_blocks(struct dio *dio)
                map_bh->b_state = 0;
                map_bh->b_size = fs_count << dio->inode->i_blkbits;
-                create = dio->rw == WRITE;
+                create = dio->rw & WRITE;
                if (dio->lock_type == DIO_LOCKING) {
                        if (dio->block_in_file < (i_size_read(dio->inode) >>
                                                        dio->blkbits))
@@ -867,7 +867,7 @@ do_holes:
                                loff_t i_size_aligned;
                                /* AKPM: eargh, -ENOTBLK is a hack */
-                                if (dio->rw == WRITE) {
+                                if (dio->rw & WRITE) {
                                        page_cache_release(page);
                                        return -ENOTBLK;
                                }
@@ -1045,7 +1045,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
                }
        } /* end iovec loop */
-        if (ret == -ENOTBLK && rw == WRITE) {
+        if (ret == -ENOTBLK && (rw & WRITE)) {
                /*
                 * The remaining part of the request will be
                 * be handled by buffered I/O when we return
@@ -1089,7 +1089,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
        if (dio->is_async) {
                int should_wait = 0;
-                if (dio->result < dio->size && rw == WRITE) {
+                if (dio->result < dio->size && (rw & WRITE)) {
                        dio->waiter = current;
                        should_wait = 1;
                }
@@ -1142,7 +1142,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
                        ret = transferred;
                /* We could have also come here on an AIO file extend */
-                if (!is_sync_kiocb(iocb) && rw == WRITE &&
+                if (!is_sync_kiocb(iocb) && (rw & WRITE) &&
                    ret >= 0 && dio->result == dio->size)
                        /*
                         * For AIO writes where we have completed the
@@ -1194,7 +1194,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
        int acquire_i_mutex = 0;
        if (rw & WRITE)
-                current->flags |= PF_SYNCWRITE;
+                rw = WRITE_SYNC;
        if (bdev)
                bdev_blkbits = blksize_bits(bdev_hardsect_size(bdev));
@@ -1270,7 +1270,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
         * even for AIO, we need to wait for i/o to complete before
         * returning in this case.
         */
-        dio->is_async = !is_sync_kiocb(iocb) && !((rw == WRITE) &&
+        dio->is_async = !is_sync_kiocb(iocb) && !((rw & WRITE) &&
                (end > i_size_read(inode)));
        retval = direct_io_worker(rw, iocb, inode, iov, offset,
@@ -1284,8 +1284,6 @@ out:
                mutex_unlock(&inode->i_mutex);
        else if (acquire_i_mutex)
                mutex_lock(&inode->i_mutex);
-        if (rw & WRITE)
-                current->flags &= ~PF_SYNCWRITE;
        return retval;
 }
 EXPORT_SYMBOL(__blockdev_direct_IO);
diff --git a/fs/dquot.c b/fs/dquot.c
index 81d87a413c68..0122a279106a 100644
--- a/fs/dquot.c
+++ b/fs/dquot.c
@@ -250,7 +250,7 @@ static inline struct dquot *find_dquot(unsigned int hashent, struct super_block
 /* Add a dquot to the tail of the free list */
 static inline void put_dquot_last(struct dquot *dquot)
 {
-        list_add(&dquot->dq_free, free_dquots.prev);
+        list_add_tail(&dquot->dq_free, &free_dquots);
        dqstats.free_dquots++;
 }
@@ -266,7 +266,7 @@ static inline void put_inuse(struct dquot *dquot)
 {
        /* We add to the back of inuse list so we don't have to restart
         * when traversing this list and we block */
-        list_add(&dquot->dq_inuse, inuse_list.prev);
+        list_add_tail(&dquot->dq_inuse, &inuse_list);
        dqstats.allocated_dquots++;
 }
diff --git a/fs/efs/inode.c b/fs/efs/inode.c
index 180607f9314d..174696f9bf14 100644
--- a/fs/efs/inode.c
+++ b/fs/efs/inode.c
@@ -21,7 +21,7 @@ static sector_t _efs_bmap(struct address_space *mapping, sector_t block)
 {
        return generic_block_bmap(mapping,block,efs_get_block);
 }
-static struct address_space_operations efs_aops = {
+static const struct address_space_operations efs_aops = {
        .readpage = efs_readpage,
        .sync_page = block_sync_page,
        .bmap = _efs_bmap
diff --git a/fs/efs/super.c b/fs/efs/super.c
index dff623e3ddbf..8ac2462ae5dd 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -15,13 +15,13 @@
 #include <linux/buffer_head.h>
 #include <linux/vfs.h>
-static int efs_statfs(struct super_block *s, struct kstatfs *buf);
+static int efs_statfs(struct dentry *dentry, struct kstatfs *buf);
 static int efs_fill_super(struct super_block *s, void *d, int silent);
-static struct super_block *efs_get_sb(struct file_system_type *fs_type,
+static int efs_get_sb(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data)
+        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-        return get_sb_bdev(fs_type, flags, dev_name, data, efs_fill_super);
+        return get_sb_bdev(fs_type, flags, dev_name, data, efs_fill_super, mnt);
 }
 static struct file_system_type efs_fs_type = {
@@ -322,8 +322,8 @@ out_no_fs:
        return -EINVAL;
 }
-static int efs_statfs(struct super_block *s, struct kstatfs *buf) {
+static int efs_statfs(struct dentry *dentry, struct kstatfs *buf) {
-        struct efs_sb_info *sb = SUPER_INFO(s);
+        struct efs_sb_info *sb = SUPER_INFO(dentry->d_sb);
        buf->f_type    = EFS_SUPER_MAGIC;       /* efs magic number */
        buf->f_bsize   = EFS_BLOCKSIZE;         /* blocksize */
diff --git a/fs/efs/symlink.c b/fs/efs/symlink.c
index 3d9a350e3e7f..e249cf733a6b 100644
--- a/fs/efs/symlink.c
+++ b/fs/efs/symlink.c
@@ -53,6 +53,6 @@ fail:
        return err;
 }
-struct address_space_operations efs_symlink_aops = {
+const struct address_space_operations efs_symlink_aops = {
        .readpage       = efs_symlink_readpage
 };
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 1b4491cdd115..9c677bbd0b08 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1,6 +1,6 @@
 /*
 *  fs/eventpoll.c ( Efficent event polling implementation )
- *  Copyright (C) 2001,...,2003  Davide Libenzi
+ *  Copyright (C) 2001,...,2006  Davide Libenzi
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
@@ -268,9 +268,9 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
                   int maxevents, long timeout);
 static int eventpollfs_delete_dentry(struct dentry *dentry);
 static struct inode *ep_eventpoll_inode(void);
-static struct super_block *eventpollfs_get_sb(struct file_system_type *fs_type,
+static int eventpollfs_get_sb(struct file_system_type *fs_type,
-                                              int flags, const char *dev_name,
+                              int flags, const char *dev_name,
-                                              void *data);
+                              void *data, struct vfsmount *mnt);
 /*
 * This semaphore is used to serialize ep_free() and eventpoll_release_file().
@@ -337,20 +337,20 @@ static inline int ep_cmp_ffd(struct epoll_filefd *p1,
 /* Special initialization for the rb-tree node to detect linkage */
 static inline void ep_rb_initnode(struct rb_node *n)
 {
-        n->rb_parent = n;
+        rb_set_parent(n, n);
 }
 /* Removes a node from the rb-tree and marks it for a fast is-linked check */
 static inline void ep_rb_erase(struct rb_node *n, struct rb_root *r)
 {
        rb_erase(n, r);
-        n->rb_parent = n;
+        rb_set_parent(n, n);
 }
 /* Fast check to verify that the item is linked to the main rb-tree */
 static inline int ep_rb_linked(struct rb_node *n)
 {
-        return n->rb_parent != n;
+        return rb_parent(n) != n;
 }
 /*
@@ -1004,7 +1004,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
                /* Notify waiting tasks that events are available */
                if (waitqueue_active(&ep->wq))
-                        wake_up(&ep->wq);
+                        __wake_up_locked(&ep->wq, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE);
                if (waitqueue_active(&ep->poll_wait))
                        pwake++;
        }
@@ -1083,7 +1083,8 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
                                /* Notify waiting tasks that events are available */
                                if (waitqueue_active(&ep->wq))
-                                        wake_up(&ep->wq);
+                                        __wake_up_locked(&ep->wq, TASK_UNINTERRUPTIBLE |
+                                                         TASK_INTERRUPTIBLE);
                                if (waitqueue_active(&ep->poll_wait))
                                        pwake++;
                        }
@@ -1260,7 +1261,8 @@ is_linked:
         * wait list.
         */
        if (waitqueue_active(&ep->wq))
-                wake_up(&ep->wq);
+                __wake_up_locked(&ep->wq, TASK_UNINTERRUPTIBLE |
+                                 TASK_INTERRUPTIBLE);
        if (waitqueue_active(&ep->poll_wait))
                pwake++;
@@ -1444,7 +1446,8 @@ static void ep_reinject_items(struct eventpoll *ep, struct list_head *txlist)
                 * wait list.
                 */
                if (waitqueue_active(&ep->wq))
-                        wake_up(&ep->wq);
+                        __wake_up_locked(&ep->wq, TASK_UNINTERRUPTIBLE |
+                                         TASK_INTERRUPTIBLE);
                if (waitqueue_active(&ep->poll_wait))
                        pwake++;
        }
@@ -1516,7 +1519,7 @@ retry:
                 * ep_poll_callback() when events will become available.
                 */
                init_waitqueue_entry(&wait, current);
-                add_wait_queue(&ep->wq, &wait);
+                __add_wait_queue(&ep->wq, &wait);
                for (;;) {
                        /*
@@ -1536,7 +1539,7 @@ retry:
                        jtimeout = schedule_timeout(jtimeout);
                        write_lock_irqsave(&ep->lock, flags);
                }
-                remove_wait_queue(&ep->wq, &wait);
+                __remove_wait_queue(&ep->wq, &wait);
                set_current_state(TASK_RUNNING);
        }
@@ -1595,11 +1598,12 @@ eexit_1:
 }
-static struct super_block *
+static int
 eventpollfs_get_sb(struct file_system_type *fs_type, int flags,
-                   const char *dev_name, void *data)
+                   const char *dev_name, void *data, struct vfsmount *mnt)
 {
-        return get_sb_pseudo(fs_type, "eventpoll:", NULL, EVENTPOLLFS_MAGIC);
+        return get_sb_pseudo(fs_type, "eventpoll:", NULL, EVENTPOLLFS_MAGIC,
+                             mnt);
 }
diff --git a/fs/exec.c b/fs/exec.c
index 3a79d97ac234..8344ba73a2a6 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -22,7 +22,6 @@
 * formats. 
 */
-#include <linux/config.h>
 #include <linux/slab.h>
 #include <linux/file.h>
 #include <linux/mman.h>
@@ -49,6 +48,7 @@
 #include <linux/rmap.h>
 #include <linux/acct.h>
 #include <linux/cn_proc.h>
+#include <linux/audit.h>
 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
@@ -665,8 +665,6 @@ static int de_thread(struct task_struct *tsk)
         * and to assume its PID:
         */
        if (!thread_group_leader(current)) {
-                struct dentry *proc_dentry1, *proc_dentry2;
                /*
                 * Wait for the thread group leader to be a zombie.
                 * It should already be zombie at this point, most
@@ -688,10 +686,6 @@ static int de_thread(struct task_struct *tsk)
                 */
                current->start_time = leader->start_time;
-                spin_lock(&leader->proc_lock);
-                spin_lock(&current->proc_lock);
-                proc_dentry1 = proc_pid_unhash(current);
-                proc_dentry2 = proc_pid_unhash(leader);
                write_lock_irq(&tasklist_lock);
                BUG_ON(leader->tgid != current->tgid);
@@ -712,7 +706,7 @@ static int de_thread(struct task_struct *tsk)
                attach_pid(current, PIDTYPE_PID,  current->pid);
                attach_pid(current, PIDTYPE_PGID, current->signal->pgrp);
                attach_pid(current, PIDTYPE_SID,  current->signal->session);
-                list_add_tail_rcu(&current->tasks, &init_task.tasks);
+                list_replace_rcu(&leader->tasks, &current->tasks);
                current->group_leader = current;
                leader->group_leader = current;
@@ -720,7 +714,6 @@ static int de_thread(struct task_struct *tsk)
                /* Reduce leader to a thread */
                detach_pid(leader, PIDTYPE_PGID);
                detach_pid(leader, PIDTYPE_SID);
-                list_del_init(&leader->tasks);
                current->exit_signal = SIGCHLD;
@@ -728,10 +721,6 @@ static int de_thread(struct task_struct *tsk)
                leader->exit_state = EXIT_DEAD;
                write_unlock_irq(&tasklist_lock);
-                spin_unlock(&leader->proc_lock);
-                spin_unlock(&current->proc_lock);
-                proc_pid_flush(proc_dentry1);
-                proc_pid_flush(proc_dentry2);
        }
        /*
@@ -865,7 +854,6 @@ int flush_old_exec(struct linux_binprm * bprm)
        bprm->mm = NULL;                /* We're using it now */
        /* This is the point of no return */
-        steal_locks(files);
        put_files_struct(files);
        current->sas_ss_sp = current->sas_ss_size = 0;
@@ -1085,6 +1073,11 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
        /* kernel module loader fixup */
        /* so we don't try to load run modprobe in kernel space. */
        set_fs(USER_DS);
+        retval = audit_bprm(bprm);
+        if (retval)
+                return retval;
        retval = -ENOENT;
        for (try=0; try<2; try++) {
                read_lock(&binfmt_lock);
@@ -1374,67 +1367,102 @@ static void format_corename(char *corename, const char *pattern, long signr)
        *out_ptr = 0;
 }
-static void zap_threads (struct mm_struct *mm)
+static void zap_process(struct task_struct *start)
 {
-        struct task_struct *g, *p;
+        struct task_struct *t;
-        struct task_struct *tsk = current;
-        struct completion *vfork_done = tsk->vfork_done;
-        int traced = 0;
-        /*
+        start->signal->flags = SIGNAL_GROUP_EXIT;
-         * Make sure nobody is waiting for us to release the VM,
+        start->signal->group_stop_count = 0;
-         * otherwise we can deadlock when we wait on each other
-         */
-        if (vfork_done) {
-                tsk->vfork_done = NULL;
-                complete(vfork_done);
-        }
-        read_lock(&tasklist_lock);
+        t = start;
-        do_each_thread(g,p)
+        do {
-                if (mm == p->mm && p != tsk) {
+                if (t != current && t->mm) {
-                        force_sig_specific(SIGKILL, p);
+                        t->mm->core_waiters++;
-                        mm->core_waiters++;
+                        sigaddset(&t->pending.signal, SIGKILL);
-                        if (unlikely(p->ptrace) &&
+                        signal_wake_up(t, 1);
-                            unlikely(p->parent->mm == mm))
-                                traced = 1;
                }
-        while_each_thread(g,p);
+        } while ((t = next_thread(t)) != start);
+}
-        read_unlock(&tasklist_lock);
+static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm,
+                                int exit_code)
+{
+        struct task_struct *g, *p;
+        unsigned long flags;
+        int err = -EAGAIN;
+        spin_lock_irq(&tsk->sighand->siglock);
+        if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT)) {
+                tsk->signal->group_exit_code = exit_code;
+                zap_process(tsk);
+                err = 0;
+        }
+        spin_unlock_irq(&tsk->sighand->siglock);
+        if (err)
+                return err;
-        if (unlikely(traced)) {
+        if (atomic_read(&mm->mm_users) == mm->core_waiters + 1)
-                /*
+                goto done;
-                 * We are zapping a thread and the thread it ptraces.
-                 * If the tracee went into a ptrace stop for exit tracing,
+        rcu_read_lock();
-                 * we could deadlock since the tracer is waiting for this
+        for_each_process(g) {
-                 * coredump to finish.  Detach them so they can both die.
+                if (g == tsk->group_leader)
-                 */
+                        continue;
-                write_lock_irq(&tasklist_lock);
-                do_each_thread(g,p) {
+                p = g;
-                        if (mm == p->mm && p != tsk &&
+                do {
-                            p->ptrace && p->parent->mm == mm) {
+                        if (p->mm) {
-                                __ptrace_detach(p, 0);
+                                if (p->mm == mm) {
+                                        /*
+                                         * p->sighand can't disappear, but
+                                         * may be changed by de_thread()
+                                         */
+                                        lock_task_sighand(p, &flags);
+                                        zap_process(p);
+                                        unlock_task_sighand(p, &flags);
+                                }
+                                break;
                        }
-                } while_each_thread(g,p);
+                } while ((p = next_thread(p)) != g);
-                write_unlock_irq(&tasklist_lock);
        }
+        rcu_read_unlock();
+done:
+        return mm->core_waiters;
 }
-static void coredump_wait(struct mm_struct *mm)
+static int coredump_wait(int exit_code)
 {
-        DECLARE_COMPLETION(startup_done);
+        struct task_struct *tsk = current;
+        struct mm_struct *mm = tsk->mm;
+        struct completion startup_done;
+        struct completion *vfork_done;
        int core_waiters;
+        init_completion(&mm->core_done);
+        init_completion(&startup_done);
        mm->core_startup_done = &startup_done;
-        zap_threads(mm);
+        core_waiters = zap_threads(tsk, mm, exit_code);
-        core_waiters = mm->core_waiters;
        up_write(&mm->mmap_sem);
+        if (unlikely(core_waiters < 0))
+                goto fail;
+        /*
+         * Make sure nobody is waiting for us to release the VM,
+         * otherwise we can deadlock when we wait on each other
+         */
+        vfork_done = tsk->vfork_done;
+        if (vfork_done) {
+                tsk->vfork_done = NULL;
+                complete(vfork_done);
+        }
        if (core_waiters)
                wait_for_completion(&startup_done);
+fail:
        BUG_ON(mm->core_waiters);
+        return core_waiters;
 }
 int do_coredump(long signr, int exit_code, struct pt_regs * regs)
@@ -1468,22 +1496,9 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs)
        }
        mm->dumpable = 0;
-        retval = -EAGAIN;
+        retval = coredump_wait(exit_code);
-        spin_lock_irq(&current->sighand->siglock);
+        if (retval < 0)
-        if (!(current->signal->flags & SIGNAL_GROUP_EXIT)) {
-                current->signal->flags = SIGNAL_GROUP_EXIT;
-                current->signal->group_exit_code = exit_code;
-                current->signal->group_stop_count = 0;
-                retval = 0;
-        }
-        spin_unlock_irq(&current->sighand->siglock);
-        if (retval) {
-                up_write(&mm->mmap_sem);
                goto fail;
-        }
-        init_completion(&mm->core_done);
-        coredump_wait(mm);
        /*
         * Clear any false indication of pending signals that might
diff --git a/fs/ext2/Makefile b/fs/ext2/Makefile
index c5d02da73bc3..e0b2b43c1fdb 100644
--- a/fs/ext2/Makefile
+++ b/fs/ext2/Makefile
@@ -4,7 +4,7 @@
 obj-$(CONFIG_EXT2_FS) += ext2.o
-ext2-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
+ext2-y := balloc.o dir.o file.o fsync.o ialloc.o inode.o \
          ioctl.o namei.o super.o symlink.o
 ext2-$(CONFIG_EXT2_FS_XATTR)     += xattr.o xattr_user.o xattr_trusted.o
diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c
index 2c00953d4b0b..d4870432ecfc 100644
--- a/fs/ext2/balloc.c
+++ b/fs/ext2/balloc.c
@@ -11,7 +11,6 @@
 *        David S. Miller (davem@caip.rutgers.edu), 1995
 */
-#include <linux/config.h>
 #include "ext2.h"
 #include <linux/quotaops.h>
 #include <linux/sched.h>
@@ -521,6 +520,26 @@ io_error:
        goto out_release;
 }
+#ifdef EXT2FS_DEBUG
+static int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0};
+unsigned long ext2_count_free (struct buffer_head * map, unsigned int numchars)
+{
+        unsigned int i;
+        unsigned long sum = 0;
+        if (!map)
+                return (0);
+        for (i = 0; i < numchars; i++)
+                sum += nibblemap[map->b_data[i] & 0xf] +
+                        nibblemap[(map->b_data[i] >> 4) & 0xf];
+        return (sum);
+}
+#endif  /*  EXT2FS_DEBUG  */
+/* Superblock must be locked */
 unsigned long ext2_count_free_blocks (struct super_block * sb)
 {
        struct ext2_group_desc * desc;
@@ -530,7 +549,6 @@ unsigned long ext2_count_free_blocks (struct super_block * sb)
        unsigned long bitmap_count, x;
        struct ext2_super_block *es;
-        lock_super (sb);
        es = EXT2_SB(sb)->s_es;
        desc_count = 0;
        bitmap_count = 0;
@@ -554,7 +572,6 @@ unsigned long ext2_count_free_blocks (struct super_block * sb)
        printk("ext2_count_free_blocks: stored = %lu, computed = %lu, %lu\n",
                (long)le32_to_cpu(es->s_free_blocks_count),
                desc_count, bitmap_count);
-        unlock_super (sb);
        return bitmap_count;
 #else
        for (i = 0; i < EXT2_SB(sb)->s_groups_count; i++) {
diff --git a/fs/ext2/bitmap.c b/fs/ext2/bitmap.c
deleted file mode 100644
index e9983a0dd396..000000000000
--- a/fs/ext2/bitmap.c
+++ /dev/null
@@ -1,32 +0,0 @@
-/*
- *  linux/fs/ext2/bitmap.c
- *
- * Copyright (C) 1992, 1993, 1994, 1995
- * Remy Card (card@masi.ibp.fr)
- * Laboratoire MASI - Institut Blaise Pascal
- * Universite Pierre et Marie Curie (Paris VI)
- */
-#ifdef EXT2FS_DEBUG
-#include <linux/buffer_head.h>
-#include "ext2.h"
-static int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0};
-unsigned long ext2_count_free (struct buffer_head * map, unsigned int numchars)
-{
-        unsigned int i;
-        unsigned long sum = 0;
-        
-        if (!map) 
-                return (0);
-        for (i = 0; i < numchars; i++)
-                sum += nibblemap[map->b_data[i] & 0xf] +
-                        nibblemap[(map->b_data[i] >> 4) & 0xf];
-        return (sum);
-}
-#endif  /*  EXT2FS_DEBUG  */
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index d672aa9f4061..92ea8265d7d5 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -159,8 +159,7 @@ fail:
 static struct page * ext2_get_page(struct inode *dir, unsigned long n)
 {
        struct address_space *mapping = dir->i_mapping;
-        struct page *page = read_cache_page(mapping, n,
+        struct page *page = read_mapping_page(mapping, n, NULL);
-                                (filler_t*)mapping->a_ops->readpage, NULL);
        if (!IS_ERR(page)) {
                wait_on_page_locked(page);
                kmap(page);
@@ -400,8 +399,7 @@ ino_t ext2_inode_by_name(struct inode * dir, struct dentry *dentry)
        de = ext2_find_entry (dir, dentry, &page);
        if (de) {
                res = le32_to_cpu(de->inode);
-                kunmap(page);
+                ext2_put_page(page);
-                page_cache_release(page);
        }
        return res;
 }
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 9f74a62be555..e65a019fc7a5 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -162,9 +162,9 @@ extern const struct file_operations ext2_file_operations;
 extern const struct file_operations ext2_xip_file_operations;
 /* inode.c */
-extern struct address_space_operations ext2_aops;
+extern const struct address_space_operations ext2_aops;
-extern struct address_space_operations ext2_aops_xip;
+extern const struct address_space_operations ext2_aops_xip;
-extern struct address_space_operations ext2_nobh_aops;
+extern const struct address_space_operations ext2_nobh_aops;
 /* namei.c */
 extern struct inode_operations ext2_dir_inode_operations;
diff --git a/fs/ext2/fsync.c b/fs/ext2/fsync.c
index c9c2e5ffa48e..7806b9e8155b 100644
--- a/fs/ext2/fsync.c
+++ b/fs/ext2/fsync.c
@@ -24,7 +24,7 @@
 #include "ext2.h"
 #include <linux/smp_lock.h>
-#include <linux/buffer_head.h>          /* for fsync_inode_buffers() */
+#include <linux/buffer_head.h>          /* for sync_mapping_buffers() */
 /*
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index e52765219e16..de85c61c58c5 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -12,7 +12,6 @@
 *        David S. Miller (davem@caip.rutgers.edu), 1995
 */
-#include <linux/config.h>
 #include <linux/quotaops.h>
 #include <linux/sched.h>
 #include <linux/backing-dev.h>
@@ -638,6 +637,7 @@ fail:
        return ERR_PTR(err);
 }
+/* Superblock must be locked */
 unsigned long ext2_count_free_inodes (struct super_block * sb)
 {
        struct ext2_group_desc *desc;
@@ -649,7 +649,6 @@ unsigned long ext2_count_free_inodes (struct super_block * sb)
        unsigned long bitmap_count = 0;
        struct buffer_head *bitmap_bh = NULL;
-        lock_super (sb);
        es = EXT2_SB(sb)->s_es;
        for (i = 0; i < EXT2_SB(sb)->s_groups_count; i++) {
                unsigned x;
@@ -672,7 +671,6 @@ unsigned long ext2_count_free_inodes (struct super_block * sb)
        printk("ext2_count_free_inodes: stored = %lu, computed = %lu, %lu\n",
                percpu_counter_read(&EXT2_SB(sb)->s_freeinodes_counter),
                desc_count, bitmap_count);
-        unlock_super(sb);
        return desc_count;
 #else
        for (i = 0; i < EXT2_SB(sb)->s_groups_count; i++) {
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 04af9c45dce2..fb4d3220eb8d 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -684,7 +684,7 @@ ext2_writepages(struct address_space *mapping, struct writeback_control *wbc)
        return mpage_writepages(mapping, wbc, ext2_get_block);
 }
-struct address_space_operations ext2_aops = {
+const struct address_space_operations ext2_aops = {
        .readpage               = ext2_readpage,
        .readpages              = ext2_readpages,
        .writepage              = ext2_writepage,
@@ -697,12 +697,12 @@ struct address_space_operations ext2_aops = {
        .migratepage            = buffer_migrate_page,
 };
-struct address_space_operations ext2_aops_xip = {
+const struct address_space_operations ext2_aops_xip = {
        .bmap                   = ext2_bmap,
        .get_xip_page           = ext2_get_xip_page,
 };
-struct address_space_operations ext2_nobh_aops = {
+const struct address_space_operations ext2_nobh_aops = {
        .readpage               = ext2_readpage,
        .readpages              = ext2_readpages,
        .writepage              = ext2_nobh_writepage,
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 7e30bae174ed..9f43879d6d68 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -16,7 +16,6 @@
 *        David S. Miller (davem@caip.rutgers.edu), 1995
 */
-#include <linux/config.h>
 #include <linux/module.h>
 #include <linux/string.h>
 #include <linux/fs.h>
@@ -39,7 +38,7 @@
 static void ext2_sync_super(struct super_block *sb,
                            struct ext2_super_block *es);
 static int ext2_remount (struct super_block * sb, int * flags, char * data);
-static int ext2_statfs (struct super_block * sb, struct kstatfs * buf);
+static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf);
 void ext2_error (struct super_block * sb, const char * function,
                 const char * fmt, ...)
@@ -834,9 +833,6 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
                printk ("EXT2-fs: not enough memory\n");
                goto failed_mount;
        }
-        percpu_counter_init(&sbi->s_freeblocks_counter);
-        percpu_counter_init(&sbi->s_freeinodes_counter);
-        percpu_counter_init(&sbi->s_dirs_counter);
        bgl_lock_init(&sbi->s_blockgroup_lock);
        sbi->s_debts = kmalloc(sbi->s_groups_count * sizeof(*sbi->s_debts),
                               GFP_KERNEL);
@@ -857,12 +853,18 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
        }
        if (!ext2_check_descriptors (sb)) {
                printk ("EXT2-fs: group descriptors corrupted!\n");
-                db_count = i;
                goto failed_mount2;
        }
        sbi->s_gdb_count = db_count;
        get_random_bytes(&sbi->s_next_generation, sizeof(u32));
        spin_lock_init(&sbi->s_next_gen_lock);
+        percpu_counter_init(&sbi->s_freeblocks_counter,
+                                ext2_count_free_blocks(sb));
+        percpu_counter_init(&sbi->s_freeinodes_counter,
+                                ext2_count_free_inodes(sb));
+        percpu_counter_init(&sbi->s_dirs_counter,
+                                ext2_count_dirs(sb));
        /*
         * set up enough so that it can read an inode
         */
@@ -874,24 +876,18 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
        if (!sb->s_root) {
                iput(root);
                printk(KERN_ERR "EXT2-fs: get root inode failed\n");
-                goto failed_mount2;
+                goto failed_mount3;
        }
        if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
                dput(sb->s_root);
                sb->s_root = NULL;
                printk(KERN_ERR "EXT2-fs: corrupt root inode, run e2fsck\n");
-                goto failed_mount2;
+                goto failed_mount3;
        }
        if (EXT2_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL))
                ext2_warning(sb, __FUNCTION__,
                        "mounting ext3 filesystem as ext2");
        ext2_setup_super (sb, es, sb->s_flags & MS_RDONLY);
-        percpu_counter_mod(&sbi->s_freeblocks_counter,
-                                ext2_count_free_blocks(sb));
-        percpu_counter_mod(&sbi->s_freeinodes_counter,
-                                ext2_count_free_inodes(sb));
-        percpu_counter_mod(&sbi->s_dirs_counter,
-                                ext2_count_dirs(sb));
        return 0;
 cantfind_ext2:
@@ -899,7 +895,10 @@ cantfind_ext2:
                printk("VFS: Can't find an ext2 filesystem on dev %s.\n",
                       sb->s_id);
        goto failed_mount;
+failed_mount3:
+        percpu_counter_destroy(&sbi->s_freeblocks_counter);
+        percpu_counter_destroy(&sbi->s_freeinodes_counter);
+        percpu_counter_destroy(&sbi->s_dirs_counter);
 failed_mount2:
        for (i = 0; i < db_count; i++)
                brelse(sbi->s_group_desc[i]);
@@ -1038,12 +1037,14 @@ restore_opts:
        return err;
 }
-static int ext2_statfs (struct super_block * sb, struct kstatfs * buf)
+static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf)
 {
+        struct super_block *sb = dentry->d_sb;
        struct ext2_sb_info *sbi = EXT2_SB(sb);
        unsigned long overhead;
        int i;
+        lock_super(sb);
        if (test_opt (sb, MINIX_DF))
                overhead = 0;
        else {
@@ -1084,13 +1085,14 @@ static int ext2_statfs (struct super_block * sb, struct kstatfs * buf)
        buf->f_files = le32_to_cpu(sbi->s_es->s_inodes_count);
        buf->f_ffree = ext2_count_free_inodes (sb);
        buf->f_namelen = EXT2_NAME_LEN;
+        unlock_super(sb);
        return 0;
 }
-static struct super_block *ext2_get_sb(struct file_system_type *fs_type,
+static int ext2_get_sb(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data)
+        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-        return get_sb_bdev(fs_type, flags, dev_name, data, ext2_fill_super);
+        return get_sb_bdev(fs_type, flags, dev_name, data, ext2_fill_super, mnt);
 }
 #ifdef CONFIG_QUOTA
diff --git a/fs/ext2/xattr.h b/fs/ext2/xattr.h
index 67cfeb66e897..bf8175b2ced9 100644
--- a/fs/ext2/xattr.h
+++ b/fs/ext2/xattr.h
@@ -6,7 +6,6 @@
  (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org>
 */
-#include <linux/config.h>
 #include <linux/init.h>
 #include <linux/xattr.h>
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index 77927d6938f6..a504a40d6d29 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -11,7 +11,6 @@
 *        David S. Miller (davem@caip.rutgers.edu), 1995
 */
-#include <linux/config.h>
 #include <linux/time.h>
 #include <linux/capability.h>
 #include <linux/fs.h>
@@ -163,20 +162,19 @@ restart:
 #endif
 static int
-goal_in_my_reservation(struct ext3_reserve_window *rsv, int goal,
+goal_in_my_reservation(struct ext3_reserve_window *rsv, ext3_grpblk_t grp_goal,
                        unsigned int group, struct super_block * sb)
 {
-        unsigned long group_first_block, group_last_block;
+        ext3_fsblk_t group_first_block, group_last_block;
-        group_first_block = le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block) +
+        group_first_block = ext3_group_first_block_no(sb, group);
-                                group * EXT3_BLOCKS_PER_GROUP(sb);
        group_last_block = group_first_block + EXT3_BLOCKS_PER_GROUP(sb) - 1;
        if ((rsv->_rsv_start > group_last_block) ||
            (rsv->_rsv_end < group_first_block))
                return 0;
-        if ((goal >= 0) && ((goal + group_first_block < rsv->_rsv_start)
+        if ((grp_goal >= 0) && ((grp_goal + group_first_block < rsv->_rsv_start)
-                || (goal + group_first_block > rsv->_rsv_end)))
+                || (grp_goal + group_first_block > rsv->_rsv_end)))
                return 0;
        return 1;
 }
@@ -187,7 +185,7 @@ goal_in_my_reservation(struct ext3_reserve_window *rsv, int goal,
 * Returns NULL if there are no windows or if all windows start after the goal.
 */
 static struct ext3_reserve_window_node *
-search_reserve_window(struct rb_root *root, unsigned long goal)
+search_reserve_window(struct rb_root *root, ext3_fsblk_t goal)
 {
        struct rb_node *n = root->rb_node;
        struct ext3_reserve_window_node *rsv;
@@ -223,7 +221,7 @@ void ext3_rsv_window_add(struct super_block *sb,
 {
        struct rb_root *root = &EXT3_SB(sb)->s_rsv_window_root;
        struct rb_node *node = &rsv->rsv_node;
-        unsigned int start = rsv->rsv_start;
+        ext3_fsblk_t start = rsv->rsv_start;
        struct rb_node ** p = &root->rb_node;
        struct rb_node * parent = NULL;
@@ -310,20 +308,20 @@ void ext3_discard_reservation(struct inode *inode)
 /* Free given blocks, update quota and i_blocks field */
 void ext3_free_blocks_sb(handle_t *handle, struct super_block *sb,
-                         unsigned long block, unsigned long count,
+                         ext3_fsblk_t block, unsigned long count,
-                         int *pdquot_freed_blocks)
+                         unsigned long *pdquot_freed_blocks)
 {
        struct buffer_head *bitmap_bh = NULL;
        struct buffer_head *gd_bh;
        unsigned long block_group;
-        unsigned long bit;
+        ext3_grpblk_t bit;
        unsigned long i;
        unsigned long overflow;
        struct ext3_group_desc * desc;
        struct ext3_super_block * es;
        struct ext3_sb_info *sbi;
        int err = 0, ret;
-        unsigned group_freed;
+        ext3_grpblk_t group_freed;
        *pdquot_freed_blocks = 0;
        sbi = EXT3_SB(sb);
@@ -333,7 +331,7 @@ void ext3_free_blocks_sb(handle_t *handle, struct super_block *sb,
            block + count > le32_to_cpu(es->s_blocks_count)) {
                ext3_error (sb, "ext3_free_blocks",
                            "Freeing blocks not in datazone - "
-                            "block = %lu, count = %lu", block, count);
+                            "block = "E3FSBLK", count = %lu", block, count);
                goto error_return;
        }
@@ -369,7 +367,7 @@ do_more:
                      sbi->s_itb_per_group))
                ext3_error (sb, "ext3_free_blocks",
                            "Freeing blocks in system zones - "
-                            "Block = %lu, count = %lu",
+                            "Block = "E3FSBLK", count = %lu",
                            block, count);
        /*
@@ -453,7 +451,8 @@ do_more:
                                                bit + i, bitmap_bh->b_data)) {
                        jbd_unlock_bh_state(bitmap_bh);
                        ext3_error(sb, __FUNCTION__,
-                                "bit already cleared for block %lu", block + i);
+                                "bit already cleared for block "E3FSBLK,
+                                 block + i);
                        jbd_lock_bh_state(bitmap_bh);
                        BUFFER_TRACE(bitmap_bh, "bit already cleared");
                } else {
@@ -493,10 +492,10 @@ error_return:
 /* Free given blocks, update quota and i_blocks field */
 void ext3_free_blocks(handle_t *handle, struct inode *inode,
-                        unsigned long block, unsigned long count)
+                        ext3_fsblk_t block, unsigned long count)
 {
        struct super_block * sb;
-        int dquot_freed_blocks;
+        unsigned long dquot_freed_blocks;
        sb = inode->i_sb;
        if (!sb) {
@@ -525,7 +524,7 @@ void ext3_free_blocks(handle_t *handle, struct inode *inode,
 * data-writes at some point, and disable it for metadata allocations or
 * sync-data inodes.
 */
-static int ext3_test_allocatable(int nr, struct buffer_head *bh)
+static int ext3_test_allocatable(ext3_grpblk_t nr, struct buffer_head *bh)
 {
        int ret;
        struct journal_head *jh = bh2jh(bh);
@@ -542,11 +541,11 @@ static int ext3_test_allocatable(int nr, struct buffer_head *bh)
        return ret;
 }
-static int
+static ext3_grpblk_t
-bitmap_search_next_usable_block(int start, struct buffer_head *bh,
+bitmap_search_next_usable_block(ext3_grpblk_t start, struct buffer_head *bh,
-                                        int maxblocks)
+                                        ext3_grpblk_t maxblocks)
 {
-        int next;
+        ext3_grpblk_t next;
        struct journal_head *jh = bh2jh(bh);
        /*
@@ -576,10 +575,11 @@ bitmap_search_next_usable_block(int start, struct buffer_head *bh,
 * the initial goal; then for a free byte somewhere in the bitmap; then
 * for any free bit in the bitmap.
 */
-static int
+static ext3_grpblk_t
-find_next_usable_block(int start, struct buffer_head *bh, int maxblocks)
+find_next_usable_block(ext3_grpblk_t start, struct buffer_head *bh,
+                        ext3_grpblk_t maxblocks)
 {
-        int here, next;
+        ext3_grpblk_t here, next;
        char *p, *r;
        if (start > 0) {
@@ -591,7 +591,7 @@ find_next_usable_block(int start, struct buffer_head *bh, int maxblocks)
                 * less than EXT3_BLOCKS_PER_GROUP. Aligning up to the
                 * next 64-bit boundary is simple..
                 */
-                int end_goal = (start + 63) & ~63;
+                ext3_grpblk_t end_goal = (start + 63) & ~63;
                if (end_goal > maxblocks)
                        end_goal = maxblocks;
                here = ext3_find_next_zero_bit(bh->b_data, end_goal, start);
@@ -628,7 +628,7 @@ find_next_usable_block(int start, struct buffer_head *bh, int maxblocks)
 * zero (failure).
 */
 static inline int
-claim_block(spinlock_t *lock, int block, struct buffer_head *bh)
+claim_block(spinlock_t *lock, ext3_grpblk_t block, struct buffer_head *bh)
 {
        struct journal_head *jh = bh2jh(bh);
        int ret;
@@ -651,19 +651,18 @@ claim_block(spinlock_t *lock, int block, struct buffer_head *bh)
 * new bitmap.  In that case we must release write access to the old one via
 * ext3_journal_release_buffer(), else we'll run out of credits.
 */
-static int
+static ext3_grpblk_t
 ext3_try_to_allocate(struct super_block *sb, handle_t *handle, int group,
-                        struct buffer_head *bitmap_bh, int goal,
+                        struct buffer_head *bitmap_bh, ext3_grpblk_t grp_goal,
                        unsigned long *count, struct ext3_reserve_window *my_rsv)
 {
-        int group_first_block, start, end;
+        ext3_fsblk_t group_first_block;
+        ext3_grpblk_t start, end;
        unsigned long num = 0;
        /* we do allocation within the reservation window if we have a window */
        if (my_rsv) {
-                group_first_block =
+                group_first_block = ext3_group_first_block_no(sb, group);
-                        le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block) +
-                        group * EXT3_BLOCKS_PER_GROUP(sb);
                if (my_rsv->_rsv_start >= group_first_block)
                        start = my_rsv->_rsv_start - group_first_block;
                else
@@ -673,13 +672,13 @@ ext3_try_to_allocate(struct super_block *sb, handle_t *handle, int group,
                if (end > EXT3_BLOCKS_PER_GROUP(sb))
                        /* reservation window crosses group boundary */
                        end = EXT3_BLOCKS_PER_GROUP(sb);
-                if ((start <= goal) && (goal < end))
+                if ((start <= grp_goal) && (grp_goal < end))
-                        start = goal;
+                        start = grp_goal;
                else
-                        goal = -1;
+                        grp_goal = -1;
        } else {
-                if (goal > 0)
+                if (grp_goal > 0)
-                        start = goal;
+                        start = grp_goal;
                else
                        start = 0;
                end = EXT3_BLOCKS_PER_GROUP(sb);
@@ -688,43 +687,43 @@ ext3_try_to_allocate(struct super_block *sb, handle_t *handle, int group,
        BUG_ON(start > EXT3_BLOCKS_PER_GROUP(sb));
 repeat:
-        if (goal < 0 || !ext3_test_allocatable(goal, bitmap_bh)) {
+        if (grp_goal < 0 || !ext3_test_allocatable(grp_goal, bitmap_bh)) {
-                goal = find_next_usable_block(start, bitmap_bh, end);
+                grp_goal = find_next_usable_block(start, bitmap_bh, end);
-                if (goal < 0)
+                if (grp_goal < 0)
                        goto fail_access;
                if (!my_rsv) {
                        int i;
-                        for (i = 0; i < 7 && goal > start &&
+                        for (i = 0; i < 7 && grp_goal > start &&
-                                        ext3_test_allocatable(goal - 1,
+                                        ext3_test_allocatable(grp_goal - 1,
                                                                bitmap_bh);
-                                        i++, goal--)
+                                        i++, grp_goal--)
                                ;
                }
        }
-        start = goal;
+        start = grp_goal;
-        if (!claim_block(sb_bgl_lock(EXT3_SB(sb), group), goal, bitmap_bh)) {
+        if (!claim_block(sb_bgl_lock(EXT3_SB(sb), group), grp_goal, bitmap_bh)) {
                /*
                 * The block was allocated by another thread, or it was
                 * allocated and then freed by another thread
                 */
                start++;
-                goal++;
+                grp_goal++;
                if (start >= end)
                        goto fail_access;
                goto repeat;
        }
        num++;
-        goal++;
+        grp_goal++;
-        while (num < *count && goal < end
+        while (num < *count && grp_goal < end
-                && ext3_test_allocatable(goal, bitmap_bh)
+                && ext3_test_allocatable(grp_goal, bitmap_bh)
-                && claim_block(sb_bgl_lock(EXT3_SB(sb), group), goal, bitmap_bh)) {
+                && claim_block(sb_bgl_lock(EXT3_SB(sb), group), grp_goal, bitmap_bh)) {
                num++;
-                goal++;
+                grp_goal++;
        }
        *count = num;
-        return goal - num;
+        return grp_goal - num;
 fail_access:
        *count = num;
        return -1;
@@ -766,12 +765,13 @@ fail_access:
 static int find_next_reservable_window(
                                struct ext3_reserve_window_node *search_head,
                                struct ext3_reserve_window_node *my_rsv,
-                                struct super_block * sb, int start_block,
+                                struct super_block * sb,
-                                int last_block)
+                                ext3_fsblk_t start_block,
+                                ext3_fsblk_t last_block)
 {
        struct rb_node *next;
        struct ext3_reserve_window_node *rsv, *prev;
-        int cur;
+        ext3_fsblk_t cur;
        int size = my_rsv->rsv_goal_size;
        /* TODO: make the start of the reservation window byte-aligned */
@@ -873,10 +873,10 @@ static int find_next_reservable_window(
 *
 *      @rsv: the reservation
 *
- *      @goal: The goal (group-relative).  It is where the search for a
+ *      @grp_goal: The goal (group-relative).  It is where the search for a
 *              free reservable space should start from.
- *              if we have a goal(goal >0 ), then start from there,
+ *              if we have a grp_goal(grp_goal >0 ), then start from there,
- *              no goal(goal = -1), we start from the first block
+ *              no grp_goal(grp_goal = -1), we start from the first block
 *              of the group.
 *
 *      @sb: the super block
@@ -885,25 +885,24 @@ static int find_next_reservable_window(
 *
 */
 static int alloc_new_reservation(struct ext3_reserve_window_node *my_rsv,
-                int goal, struct super_block *sb,
+                ext3_grpblk_t grp_goal, struct super_block *sb,
                unsigned int group, struct buffer_head *bitmap_bh)
 {
        struct ext3_reserve_window_node *search_head;
-        int group_first_block, group_end_block, start_block;
+        ext3_fsblk_t group_first_block, group_end_block, start_block;
-        int first_free_block;
+        ext3_grpblk_t first_free_block;
        struct rb_root *fs_rsv_root = &EXT3_SB(sb)->s_rsv_window_root;
        unsigned long size;
        int ret;
        spinlock_t *rsv_lock = &EXT3_SB(sb)->s_rsv_window_lock;
-        group_first_block = le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block) +
+        group_first_block = ext3_group_first_block_no(sb, group);
-                                group * EXT3_BLOCKS_PER_GROUP(sb);
        group_end_block = group_first_block + EXT3_BLOCKS_PER_GROUP(sb) - 1;
-        if (goal < 0)
+        if (grp_goal < 0)
                start_block = group_first_block;
        else
-                start_block = goal + group_first_block;
+                start_block = grp_goal + group_first_block;
        size = my_rsv->rsv_goal_size;
@@ -1057,14 +1056,15 @@ static void try_to_extend_reservation(struct ext3_reserve_window_node *my_rsv,
 * sorted double linked list should be fast.
 *
 */
-static int
+static ext3_grpblk_t
 ext3_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle,
                        unsigned int group, struct buffer_head *bitmap_bh,
-                        int goal, struct ext3_reserve_window_node * my_rsv,
+                        ext3_grpblk_t grp_goal,
+                        struct ext3_reserve_window_node * my_rsv,
                        unsigned long *count, int *errp)
 {
-        unsigned long group_first_block;
+        ext3_fsblk_t group_first_block;
-        int ret = 0;
+        ext3_grpblk_t ret = 0;
        int fatal;
        unsigned long num = *count;
@@ -1090,17 +1090,16 @@ ext3_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle,
         */
        if (my_rsv == NULL ) {
                ret = ext3_try_to_allocate(sb, handle, group, bitmap_bh,
-                                                goal, count, NULL);
+                                                grp_goal, count, NULL);
                goto out;
        }
        /*
-         * goal is a group relative block number (if there is a goal)
+         * grp_goal is a group relative block number (if there is a goal)
-         * 0 < goal < EXT3_BLOCKS_PER_GROUP(sb)
+         * 0 < grp_goal < EXT3_BLOCKS_PER_GROUP(sb)
         * first block is a filesystem wide block number
         * first block is the block number of the first block in this group
         */
-        group_first_block = le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block) +
+        group_first_block = ext3_group_first_block_no(sb, group);
-                        group * EXT3_BLOCKS_PER_GROUP(sb);
        /*
         * Basically we will allocate a new block from inode's reservation
@@ -1119,24 +1118,24 @@ ext3_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle,
         */
        while (1) {
                if (rsv_is_empty(&my_rsv->rsv_window) || (ret < 0) ||
-                        !goal_in_my_reservation(&my_rsv->rsv_window, goal, group, sb)) {
+                        !goal_in_my_reservation(&my_rsv->rsv_window, grp_goal, group, sb)) {
                        if (my_rsv->rsv_goal_size < *count)
                                my_rsv->rsv_goal_size = *count;
-                        ret = alloc_new_reservation(my_rsv, goal, sb,
+                        ret = alloc_new_reservation(my_rsv, grp_goal, sb,
                                                        group, bitmap_bh);
                        if (ret < 0)
                                break;                  /* failed */
-                        if (!goal_in_my_reservation(&my_rsv->rsv_window, goal, group, sb))
+                        if (!goal_in_my_reservation(&my_rsv->rsv_window, grp_goal, group, sb))
-                                goal = -1;
+                                grp_goal = -1;
-                } else if (goal > 0 && (my_rsv->rsv_end-goal+1) < *count)
+                } else if (grp_goal > 0 && (my_rsv->rsv_end-grp_goal+1) < *count)
                        try_to_extend_reservation(my_rsv, sb,
-                                        *count-my_rsv->rsv_end + goal - 1);
+                                        *count-my_rsv->rsv_end + grp_goal - 1);
                if ((my_rsv->rsv_start >= group_first_block + EXT3_BLOCKS_PER_GROUP(sb))
                    || (my_rsv->rsv_end < group_first_block))
                        BUG();
-                ret = ext3_try_to_allocate(sb, handle, group, bitmap_bh, goal,
+                ret = ext3_try_to_allocate(sb, handle, group, bitmap_bh, grp_goal,
                                           &num, &my_rsv->rsv_window);
                if (ret >= 0) {
                        my_rsv->rsv_alloc_hit += num;
@@ -1164,7 +1163,7 @@ out:
 static int ext3_has_free_blocks(struct ext3_sb_info *sbi)
 {
-        int free_blocks, root_blocks;
+        ext3_fsblk_t free_blocks, root_blocks;
        free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
        root_blocks = le32_to_cpu(sbi->s_es->s_r_blocks_count);
@@ -1200,19 +1199,20 @@ int ext3_should_retry_alloc(struct super_block *sb, int *retries)
 * bitmap, and then for any free bit if that fails.
 * This function also updates quota and i_blocks field.
 */
-int ext3_new_blocks(handle_t *handle, struct inode *inode,
+ext3_fsblk_t ext3_new_blocks(handle_t *handle, struct inode *inode,
-                        unsigned long goal, unsigned long *count, int *errp)
+                        ext3_fsblk_t goal, unsigned long *count, int *errp)
 {
        struct buffer_head *bitmap_bh = NULL;
        struct buffer_head *gdp_bh;
        int group_no;
        int goal_group;
-        int ret_block;
+        ext3_grpblk_t grp_target_blk;   /* blockgroup relative goal block */
+        ext3_grpblk_t grp_alloc_blk;    /* blockgroup-relative allocated block*/
+        ext3_fsblk_t ret_block;         /* filesystem-wide allocated block */
        int bgi;                        /* blockgroup iteration index */
-        int target_block;
        int fatal = 0, err;
        int performed_allocation = 0;
-        int free_blocks;
+        ext3_grpblk_t free_blocks;      /* number of free blocks in a group */
        struct super_block *sb;
        struct ext3_group_desc *gdp;
        struct ext3_super_block *es;
@@ -1285,16 +1285,17 @@ retry:
                my_rsv = NULL;
        if (free_blocks > 0) {
-                ret_block = ((goal - le32_to_cpu(es->s_first_data_block)) %
+                grp_target_blk = ((goal - le32_to_cpu(es->s_first_data_block)) %
                                EXT3_BLOCKS_PER_GROUP(sb));
                bitmap_bh = read_block_bitmap(sb, group_no);
                if (!bitmap_bh)
                        goto io_error;
-                ret_block = ext3_try_to_allocate_with_rsv(sb, handle, group_no,
+                grp_alloc_blk = ext3_try_to_allocate_with_rsv(sb, handle,
-                                        bitmap_bh, ret_block, my_rsv, &num, &fatal);
+                                        group_no, bitmap_bh, grp_target_blk,
+                                        my_rsv, &num, &fatal);
                if (fatal)
                        goto out;
-                if (ret_block >= 0)
+                if (grp_alloc_blk >= 0)
                        goto allocated;
        }
@@ -1327,11 +1328,15 @@ retry:
                bitmap_bh = read_block_bitmap(sb, group_no);
                if (!bitmap_bh)
                        goto io_error;
-                ret_block = ext3_try_to_allocate_with_rsv(sb, handle, group_no,
+                /*
-                                        bitmap_bh, -1, my_rsv, &num, &fatal);
+                 * try to allocate block(s) from this group, without a goal(-1).
+                 */
+                grp_alloc_blk = ext3_try_to_allocate_with_rsv(sb, handle,
+                                        group_no, bitmap_bh, -1, my_rsv,
+                                        &num, &fatal);
                if (fatal)
                        goto out;
-                if (ret_block >= 0) 
+                if (grp_alloc_blk >= 0)
                        goto allocated;
        }
        /*
@@ -1360,18 +1365,18 @@ allocated:
        if (fatal)
                goto out;
-        target_block = ret_block + group_no * EXT3_BLOCKS_PER_GROUP(sb)
+        ret_block = grp_alloc_blk + ext3_group_first_block_no(sb, group_no);
-                                + le32_to_cpu(es->s_first_data_block);
-        if (in_range(le32_to_cpu(gdp->bg_block_bitmap), target_block, num) ||
+        if (in_range(le32_to_cpu(gdp->bg_block_bitmap), ret_block, num) ||
-            in_range(le32_to_cpu(gdp->bg_inode_bitmap), target_block, num) ||
+            in_range(le32_to_cpu(gdp->bg_inode_bitmap), ret_block, num) ||
-            in_range(target_block, le32_to_cpu(gdp->bg_inode_table),
+            in_range(ret_block, le32_to_cpu(gdp->bg_inode_table),
                      EXT3_SB(sb)->s_itb_per_group) ||
-            in_range(target_block + num - 1, le32_to_cpu(gdp->bg_inode_table),
+            in_range(ret_block + num - 1, le32_to_cpu(gdp->bg_inode_table),
                      EXT3_SB(sb)->s_itb_per_group))
                ext3_error(sb, "ext3_new_block",
                            "Allocating block in system zone - "
-                            "blocks from %u, length %lu", target_block, num);
+                            "blocks from "E3FSBLK", length %lu",
+                             ret_block, num);
        performed_allocation = 1;
@@ -1380,7 +1385,7 @@ allocated:
                struct buffer_head *debug_bh;
                /* Record bitmap buffer state in the newly allocated block */
-                debug_bh = sb_find_get_block(sb, target_block);
+                debug_bh = sb_find_get_block(sb, ret_block);
                if (debug_bh) {
                        BUFFER_TRACE(debug_bh, "state when allocated");
                        BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap state");
@@ -1393,24 +1398,21 @@ allocated:
                int i;
                for (i = 0; i < num; i++) {
-                        if (ext3_test_bit(ret_block,
+                        if (ext3_test_bit(grp_alloc_blk+i,
                                        bh2jh(bitmap_bh)->b_committed_data)) {
                                printk("%s: block was unexpectedly set in "
                                        "b_committed_data\n", __FUNCTION__);
                        }
                }
        }
-        ext3_debug("found bit %d\n", ret_block);
+        ext3_debug("found bit %d\n", grp_alloc_blk);
        spin_unlock(sb_bgl_lock(sbi, group_no));
        jbd_unlock_bh_state(bitmap_bh);
 #endif
-        /* ret_block was blockgroup-relative.  Now it becomes fs-relative */
-        ret_block = target_block;
        if (ret_block + num - 1 >= le32_to_cpu(es->s_blocks_count)) {
                ext3_error(sb, "ext3_new_block",
-                            "block(%d) >= blocks count(%d) - "
+                            "block("E3FSBLK") >= blocks count(%d) - "
                            "block_group = %d, es == %p ", ret_block,
                        le32_to_cpu(es->s_blocks_count), group_no, es);
                goto out;
@@ -1421,7 +1423,7 @@ allocated:
         * list of some description.  We don't know in advance whether
         * the caller wants to use it as metadata or data.
         */
-        ext3_debug("allocating block %d. Goal hits %d of %d.\n",
+        ext3_debug("allocating block %lu. Goal hits %d of %d.\n",
                        ret_block, goal_hits, goal_attempts);
        spin_lock(sb_bgl_lock(sbi, group_no));
@@ -1461,23 +1463,24 @@ out:
        return 0;
 }
-int ext3_new_block(handle_t *handle, struct inode *inode,
+ext3_fsblk_t ext3_new_block(handle_t *handle, struct inode *inode,
-                        unsigned long goal, int *errp)
+                        ext3_fsblk_t goal, int *errp)
 {
        unsigned long count = 1;
        return ext3_new_blocks(handle, inode, goal, &count, errp);
 }
-unsigned long ext3_count_free_blocks(struct super_block *sb)
+ext3_fsblk_t ext3_count_free_blocks(struct super_block *sb)
 {
-        unsigned long desc_count;
+        ext3_fsblk_t desc_count;
        struct ext3_group_desc *gdp;
        int i;
        unsigned long ngroups = EXT3_SB(sb)->s_groups_count;
 #ifdef EXT3FS_DEBUG
        struct ext3_super_block *es;
-        unsigned long bitmap_count, x;
+        ext3_fsblk_t bitmap_count;
+        unsigned long x;
        struct buffer_head *bitmap_bh = NULL;
        es = EXT3_SB(sb)->s_es;
@@ -1502,8 +1505,10 @@ unsigned long ext3_count_free_blocks(struct super_block *sb)
                bitmap_count += x;
        }
        brelse(bitmap_bh);
-        printk("ext3_count_free_blocks: stored = %u, computed = %lu, %lu\n",
+        printk("ext3_count_free_blocks: stored = "E3FSBLK
-               le32_to_cpu(es->s_free_blocks_count), desc_count, bitmap_count);
+                ", computed = "E3FSBLK", "E3FSBLK"\n",
+               le32_to_cpu(es->s_free_blocks_count),
+                desc_count, bitmap_count);
        return bitmap_count;
 #else
        desc_count = 0;
@@ -1520,7 +1525,7 @@ unsigned long ext3_count_free_blocks(struct super_block *sb)
 }
 static inline int
-block_in_use(unsigned long block, struct super_block *sb, unsigned char *map)
+block_in_use(ext3_fsblk_t block, struct super_block *sb, unsigned char *map)
 {
        return ext3_test_bit ((block -
                le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block)) %
diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
index f37528ed222e..fbb0d4ed07d4 100644
--- a/fs/ext3/dir.c
+++ b/fs/ext3/dir.c
@@ -284,7 +284,7 @@ static void free_rb_tree_fname(struct rb_root *root)
                 * beginning of the loop and try to free the parent
                 * node.
                 */
-                parent = n->rb_parent;
+                parent = rb_parent(n);
                fname = rb_entry(n, struct fname, rb_hash);
                while (fname) {
                        struct fname * old = fname;
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index dc826464f313..36546ed36a14 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -262,9 +262,11 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent)
        int ngroups = sbi->s_groups_count;
        int inodes_per_group = EXT3_INODES_PER_GROUP(sb);
        int freei, avefreei;
-        int freeb, avefreeb;
+        ext3_fsblk_t freeb, avefreeb;
-        int blocks_per_dir, ndirs;
+        ext3_fsblk_t blocks_per_dir;
-        int max_debt, max_dirs, min_blocks, min_inodes;
+        int ndirs;
+        int max_debt, max_dirs, min_inodes;
+        ext3_grpblk_t min_blocks;
        int group = -1, i;
        struct ext3_group_desc *desc;
        struct buffer_head *bh;
@@ -307,7 +309,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent)
        min_inodes = avefreei - inodes_per_group / 4;
        min_blocks = avefreeb - EXT3_BLOCKS_PER_GROUP(sb) / 4;
-        max_debt = EXT3_BLOCKS_PER_GROUP(sb) / max(blocks_per_dir, BLOCK_COST);
+        max_debt = EXT3_BLOCKS_PER_GROUP(sb) / max(blocks_per_dir, (ext3_fsblk_t)BLOCK_COST);
        if (max_debt * INODE_COST > inodes_per_group)
                max_debt = inodes_per_group / INODE_COST;
        if (max_debt > 255)
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 2edd7eec88fd..f804d5e9d60c 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -62,7 +62,7 @@ static int ext3_inode_is_fast_symlink(struct inode *inode)
 * still needs to be revoked.
 */
 int ext3_forget(handle_t *handle, int is_metadata, struct inode *inode,
-                        struct buffer_head *bh, int blocknr)
+                        struct buffer_head *bh, ext3_fsblk_t blocknr)
 {
        int err;
@@ -407,13 +407,13 @@ no_block:
 *
 *      Caller must make sure that @ind is valid and will stay that way.
 */
-static unsigned long ext3_find_near(struct inode *inode, Indirect *ind)
+static ext3_fsblk_t ext3_find_near(struct inode *inode, Indirect *ind)
 {
        struct ext3_inode_info *ei = EXT3_I(inode);
        __le32 *start = ind->bh ? (__le32*) ind->bh->b_data : ei->i_data;
        __le32 *p;
-        unsigned long bg_start;
+        ext3_fsblk_t bg_start;
-        unsigned long colour;
+        ext3_grpblk_t colour;
        /* Try to find previous block */
        for (p = ind->p - 1; p >= start; p--) {
@@ -429,8 +429,7 @@ static unsigned long ext3_find_near(struct inode *inode, Indirect *ind)
         * It is going to be referred to from the inode itself? OK, just put it
         * into the same cylinder group then.
         */
-        bg_start = (ei->i_block_group * EXT3_BLOCKS_PER_GROUP(inode->i_sb)) +
+        bg_start = ext3_group_first_block_no(inode->i_sb, ei->i_block_group);
-                le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block);
        colour = (current->pid % 16) *
                        (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16);
        return bg_start + colour;
@@ -448,7 +447,7 @@ static unsigned long ext3_find_near(struct inode *inode, Indirect *ind)
 *      stores it in *@goal and returns zero.
 */
-static unsigned long ext3_find_goal(struct inode *inode, long block,
+static ext3_fsblk_t ext3_find_goal(struct inode *inode, long block,
                Indirect chain[4], Indirect *partial)
 {
        struct ext3_block_alloc_info *block_i;
@@ -516,13 +515,13 @@ static int ext3_blks_to_allocate(Indirect *branch, int k, unsigned long blks,
 *              direct blocks
 */
 static int ext3_alloc_blocks(handle_t *handle, struct inode *inode,
-                        unsigned long goal, int indirect_blks, int blks,
+                        ext3_fsblk_t goal, int indirect_blks, int blks,
-                        unsigned long long new_blocks[4], int *err)
+                        ext3_fsblk_t new_blocks[4], int *err)
 {
        int target, i;
        unsigned long count = 0;
        int index = 0;
-        unsigned long current_block = 0;
+        ext3_fsblk_t current_block = 0;
        int ret = 0;
        /*
@@ -592,7 +591,7 @@ failed_out:
 *      as described above and return 0.
 */
 static int ext3_alloc_branch(handle_t *handle, struct inode *inode,
-                        int indirect_blks, int *blks, unsigned long goal,
+                        int indirect_blks, int *blks, ext3_fsblk_t goal,
                        int *offsets, Indirect *branch)
 {
        int blocksize = inode->i_sb->s_blocksize;
@@ -600,8 +599,8 @@ static int ext3_alloc_branch(handle_t *handle, struct inode *inode,
        int err = 0;
        struct buffer_head *bh;
        int num;
-        unsigned long long new_blocks[4];
+        ext3_fsblk_t new_blocks[4];
-        unsigned long long current_block;
+        ext3_fsblk_t current_block;
        num = ext3_alloc_blocks(handle, inode, goal, indirect_blks,
                                *blks, new_blocks, &err);
@@ -688,7 +687,7 @@ static int ext3_splice_branch(handle_t *handle, struct inode *inode,
        int i;
        int err = 0;
        struct ext3_block_alloc_info *block_i;
-        unsigned long current_block;
+        ext3_fsblk_t current_block;
        block_i = EXT3_I(inode)->i_block_alloc_info;
        /*
@@ -795,13 +794,13 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
        int offsets[4];
        Indirect chain[4];
        Indirect *partial;
-        unsigned long goal;
+        ext3_fsblk_t goal;
        int indirect_blks;
        int blocks_to_boundary = 0;
        int depth;
        struct ext3_inode_info *ei = EXT3_I(inode);
        int count = 0;
-        unsigned long first_block = 0;
+        ext3_fsblk_t first_block = 0;
        J_ASSERT(handle != NULL || create == 0);
@@ -819,7 +818,7 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
                count++;
                /*map more blocks*/
                while (count < maxblocks && count <= blocks_to_boundary) {
-                        unsigned long blk;
+                        ext3_fsblk_t blk;
                        if (!verify_chain(chain, partial)) {
                                /*
@@ -1699,7 +1698,7 @@ static int ext3_journalled_set_page_dirty(struct page *page)
        return __set_page_dirty_nobuffers(page);
 }
-static struct address_space_operations ext3_ordered_aops = {
+static const struct address_space_operations ext3_ordered_aops = {
        .readpage       = ext3_readpage,
        .readpages      = ext3_readpages,
        .writepage      = ext3_ordered_writepage,
@@ -1713,7 +1712,7 @@ static struct address_space_operations ext3_ordered_aops = {
        .migratepage    = buffer_migrate_page,
 };
-static struct address_space_operations ext3_writeback_aops = {
+static const struct address_space_operations ext3_writeback_aops = {
        .readpage       = ext3_readpage,
        .readpages      = ext3_readpages,
        .writepage      = ext3_writeback_writepage,
@@ -1727,7 +1726,7 @@ static struct address_space_operations ext3_writeback_aops = {
        .migratepage    = buffer_migrate_page,
 };
-static struct address_space_operations ext3_journalled_aops = {
+static const struct address_space_operations ext3_journalled_aops = {
        .readpage       = ext3_readpage,
        .readpages      = ext3_readpages,
        .writepage      = ext3_journalled_writepage,
@@ -1759,7 +1758,7 @@ void ext3_set_aops(struct inode *inode)
 static int ext3_block_truncate_page(handle_t *handle, struct page *page,
                struct address_space *mapping, loff_t from)
 {
-        unsigned long index = from >> PAGE_CACHE_SHIFT;
+        ext3_fsblk_t index = from >> PAGE_CACHE_SHIFT;
        unsigned offset = from & (PAGE_CACHE_SIZE-1);
        unsigned blocksize, iblock, length, pos;
        struct inode *inode = mapping->host;
@@ -1960,7 +1959,7 @@ no_top:
 * than `count' because there can be holes in there.
 */
 static void ext3_clear_blocks(handle_t *handle, struct inode *inode,
-                struct buffer_head *bh, unsigned long block_to_free,
+                struct buffer_head *bh, ext3_fsblk_t block_to_free,
                unsigned long count, __le32 *first, __le32 *last)
 {
        __le32 *p;
@@ -2022,12 +2021,12 @@ static void ext3_free_data(handle_t *handle, struct inode *inode,
                           struct buffer_head *this_bh,
                           __le32 *first, __le32 *last)
 {
-        unsigned long block_to_free = 0;    /* Starting block # of a run */
+        ext3_fsblk_t block_to_free = 0;    /* Starting block # of a run */
        unsigned long count = 0;            /* Number of blocks in the run */ 
        __le32 *block_to_free_p = NULL;     /* Pointer into inode/ind
                                               corresponding to
                                               block_to_free */
-        unsigned long nr;                   /* Current block # */
+        ext3_fsblk_t nr;                    /* Current block # */
        __le32 *p;                          /* Pointer into inode/ind
                                               for current block */
        int err;
@@ -2089,7 +2088,7 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode,
                               struct buffer_head *parent_bh,
                               __le32 *first, __le32 *last, int depth)
 {
-        unsigned long nr;
+        ext3_fsblk_t nr;
        __le32 *p;
        if (is_handle_aborted(handle))
@@ -2113,7 +2112,7 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode,
                         */
                        if (!bh) {
                                ext3_error(inode->i_sb, "ext3_free_branches",
-                                           "Read failure, inode=%ld, block=%ld",
+                                           "Read failure, inode=%ld, block="E3FSBLK,
                                           inode->i_ino, nr);
                                continue;
                        }
@@ -2394,11 +2393,12 @@ out_stop:
        ext3_journal_stop(handle);
 }
-static unsigned long ext3_get_inode_block(struct super_block *sb,
+static ext3_fsblk_t ext3_get_inode_block(struct super_block *sb,
                unsigned long ino, struct ext3_iloc *iloc)
 {
        unsigned long desc, group_desc, block_group;
-        unsigned long offset, block;
+        unsigned long offset;
+        ext3_fsblk_t block;
        struct buffer_head *bh;
        struct ext3_group_desc * gdp;
@@ -2448,7 +2448,7 @@ static unsigned long ext3_get_inode_block(struct super_block *sb,
 static int __ext3_get_inode_loc(struct inode *inode,
                                struct ext3_iloc *iloc, int in_mem)
 {
-        unsigned long block;
+        ext3_fsblk_t block;
        struct buffer_head *bh;
        block = ext3_get_inode_block(inode->i_sb, inode->i_ino, iloc);
@@ -2459,7 +2459,8 @@ static int __ext3_get_inode_loc(struct inode *inode,
        if (!bh) {
                ext3_error (inode->i_sb, "ext3_get_inode_loc",
                                "unable to read inode block - "
-                                "inode=%lu, block=%lu", inode->i_ino, block);
+                                "inode=%lu, block="E3FSBLK,
+                                 inode->i_ino, block);
                return -EIO;
        }
        if (!buffer_uptodate(bh)) {
@@ -2540,7 +2541,7 @@ make_io:
                if (!buffer_uptodate(bh)) {
                        ext3_error(inode->i_sb, "ext3_get_inode_loc",
                                        "unable to read inode block - "
-                                        "inode=%lu, block=%lu",
+                                        "inode=%lu, block="E3FSBLK,
                                        inode->i_ino, block);
                        brelse(bh);
                        return -EIO;
diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c
index 8c22aa9a7fbb..3a6b012d120c 100644
--- a/fs/ext3/ioctl.c
+++ b/fs/ext3/ioctl.c
@@ -204,7 +204,7 @@ flags_err:
                return 0;
        }
        case EXT3_IOC_GROUP_EXTEND: {
-                unsigned long n_blocks_count;
+                ext3_fsblk_t n_blocks_count;
                struct super_block *sb = inode->i_sb;
                int err;
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index b8f5cd1e540d..d9176dba3698 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -1379,7 +1379,6 @@ static int ext3_add_entry (handle_t *handle, struct dentry *dentry,
        int     dx_fallback=0;
 #endif
        unsigned blocksize;
-        unsigned nlen, rlen;
        u32 block, blocks;
        sb = dir->i_sb;
@@ -1417,8 +1416,7 @@ static int ext3_add_entry (handle_t *handle, struct dentry *dentry,
                return retval;
        de = (struct ext3_dir_entry_2 *) bh->b_data;
        de->inode = 0;
-        de->rec_len = cpu_to_le16(rlen = blocksize);
+        de->rec_len = cpu_to_le16(blocksize);
-        nlen = 0;
        return add_dirent_to_buf(handle, dentry, inode, de, bh);
 }
diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c
index 34b39e9a1e5a..5e1337fd878a 100644
--- a/fs/ext3/resize.c
+++ b/fs/ext3/resize.c
@@ -8,7 +8,6 @@
 * This could probably be made into a module, because it is not often in use.
 */
-#include <linux/config.h>
 #define EXT3FS_DEBUG
@@ -28,16 +27,16 @@ static int verify_group_input(struct super_block *sb,
 {
        struct ext3_sb_info *sbi = EXT3_SB(sb);
        struct ext3_super_block *es = sbi->s_es;
-        unsigned start = le32_to_cpu(es->s_blocks_count);
+        ext3_fsblk_t start = le32_to_cpu(es->s_blocks_count);
-        unsigned end = start + input->blocks_count;
+        ext3_fsblk_t end = start + input->blocks_count;
        unsigned group = input->group;
-        unsigned itend = input->inode_table + sbi->s_itb_per_group;
+        ext3_fsblk_t itend = input->inode_table + sbi->s_itb_per_group;
        unsigned overhead = ext3_bg_has_super(sb, group) ?
                (1 + ext3_bg_num_gdb(sb, group) +
                 le16_to_cpu(es->s_reserved_gdt_blocks)) : 0;
-        unsigned metaend = start + overhead;
+        ext3_fsblk_t metaend = start + overhead;
        struct buffer_head *bh = NULL;
-        int free_blocks_count;
+        ext3_grpblk_t free_blocks_count;
        int err = -EINVAL;
        input->free_blocks_count = free_blocks_count =
@@ -64,7 +63,8 @@ static int verify_group_input(struct super_block *sb,
                ext3_warning(sb, __FUNCTION__, "Bad blocks count %u",
                             input->blocks_count);
        else if (!(bh = sb_bread(sb, end - 1)))
-                ext3_warning(sb, __FUNCTION__, "Cannot read last block (%u)",
+                ext3_warning(sb, __FUNCTION__,
+                             "Cannot read last block ("E3FSBLK")",
                             end - 1);
        else if (outside(input->block_bitmap, start, end))
                ext3_warning(sb, __FUNCTION__,
@@ -77,7 +77,7 @@ static int verify_group_input(struct super_block *sb,
        else if (outside(input->inode_table, start, end) ||
                 outside(itend - 1, start, end))
                ext3_warning(sb, __FUNCTION__,
-                             "Inode table not in group (blocks %u-%u)",
+                             "Inode table not in group (blocks %u-"E3FSBLK")",
                             input->inode_table, itend - 1);
        else if (input->inode_bitmap == input->block_bitmap)
                ext3_warning(sb, __FUNCTION__,
@@ -85,24 +85,27 @@ static int verify_group_input(struct super_block *sb,
                             input->block_bitmap);
        else if (inside(input->block_bitmap, input->inode_table, itend))
                ext3_warning(sb, __FUNCTION__,
-                             "Block bitmap (%u) in inode table (%u-%u)",
+                             "Block bitmap (%u) in inode table (%u-"E3FSBLK")",
                             input->block_bitmap, input->inode_table, itend-1);
        else if (inside(input->inode_bitmap, input->inode_table, itend))
                ext3_warning(sb, __FUNCTION__,
-                             "Inode bitmap (%u) in inode table (%u-%u)",
+                             "Inode bitmap (%u) in inode table (%u-"E3FSBLK")",
                             input->inode_bitmap, input->inode_table, itend-1);
        else if (inside(input->block_bitmap, start, metaend))
                ext3_warning(sb, __FUNCTION__,
-                             "Block bitmap (%u) in GDT table (%u-%u)",
+                             "Block bitmap (%u) in GDT table"
+                             " ("E3FSBLK"-"E3FSBLK")",
                             input->block_bitmap, start, metaend - 1);
        else if (inside(input->inode_bitmap, start, metaend))
                ext3_warning(sb, __FUNCTION__,
-                             "Inode bitmap (%u) in GDT table (%u-%u)",
+                             "Inode bitmap (%u) in GDT table"
+                             " ("E3FSBLK"-"E3FSBLK")",
                             input->inode_bitmap, start, metaend - 1);
        else if (inside(input->inode_table, start, metaend) ||
                 inside(itend - 1, start, metaend))
                ext3_warning(sb, __FUNCTION__,
-                             "Inode table (%u-%u) overlaps GDT table (%u-%u)",
+                             "Inode table (%u-"E3FSBLK") overlaps"
+                             "GDT table ("E3FSBLK"-"E3FSBLK")",
                             input->inode_table, itend - 1, start, metaend - 1);
        else
                err = 0;
@@ -112,7 +115,7 @@ static int verify_group_input(struct super_block *sb,
 }
 static struct buffer_head *bclean(handle_t *handle, struct super_block *sb,
-                                  unsigned long blk)
+                                  ext3_fsblk_t blk)
 {
        struct buffer_head *bh;
        int err;
@@ -163,15 +166,14 @@ static int setup_new_group_blocks(struct super_block *sb,
                                  struct ext3_new_group_data *input)
 {
        struct ext3_sb_info *sbi = EXT3_SB(sb);
-        unsigned long start = input->group * sbi->s_blocks_per_group +
+        ext3_fsblk_t start = ext3_group_first_block_no(sb, input->group);
-                le32_to_cpu(sbi->s_es->s_first_data_block);
        int reserved_gdb = ext3_bg_has_super(sb, input->group) ?
                le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) : 0;
        unsigned long gdblocks = ext3_bg_num_gdb(sb, input->group);
        struct buffer_head *bh;
        handle_t *handle;
-        unsigned long block;
+        ext3_fsblk_t block;
-        int bit;
+        ext3_grpblk_t bit;
        int i;
        int err = 0, err2;
@@ -328,7 +330,7 @@ static unsigned ext3_list_backups(struct super_block *sb, unsigned *three,
 static int verify_reserved_gdb(struct super_block *sb,
                               struct buffer_head *primary)
 {
-        const unsigned long blk = primary->b_blocknr;
+        const ext3_fsblk_t blk = primary->b_blocknr;
        const unsigned long end = EXT3_SB(sb)->s_groups_count;
        unsigned three = 1;
        unsigned five = 5;
@@ -340,7 +342,8 @@ static int verify_reserved_gdb(struct super_block *sb,
        while ((grp = ext3_list_backups(sb, &three, &five, &seven)) < end) {
                if (le32_to_cpu(*p++) != grp * EXT3_BLOCKS_PER_GROUP(sb) + blk){
                        ext3_warning(sb, __FUNCTION__,
-                                     "reserved GDT %ld missing grp %d (%ld)",
+                                     "reserved GDT "E3FSBLK
+                                     " missing grp %d ("E3FSBLK")",
                                     blk, grp,
                                     grp * EXT3_BLOCKS_PER_GROUP(sb) + blk);
                        return -EINVAL;
@@ -372,7 +375,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
        struct super_block *sb = inode->i_sb;
        struct ext3_super_block *es = EXT3_SB(sb)->s_es;
        unsigned long gdb_num = input->group / EXT3_DESC_PER_BLOCK(sb);
-        unsigned long gdblock = EXT3_SB(sb)->s_sbh->b_blocknr + 1 + gdb_num;
+        ext3_fsblk_t gdblock = EXT3_SB(sb)->s_sbh->b_blocknr + 1 + gdb_num;
        struct buffer_head **o_group_desc, **n_group_desc;
        struct buffer_head *dind;
        int gdbackups;
@@ -417,7 +420,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
        data = (__u32 *)dind->b_data;
        if (le32_to_cpu(data[gdb_num % EXT3_ADDR_PER_BLOCK(sb)]) != gdblock) {
                ext3_warning(sb, __FUNCTION__,
-                             "new group %u GDT block %lu not reserved",
+                             "new group %u GDT block "E3FSBLK" not reserved",
                             input->group, gdblock);
                err = -EINVAL;
                goto exit_dind;
@@ -515,7 +518,7 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
        struct buffer_head **primary;
        struct buffer_head *dind;
        struct ext3_iloc iloc;
-        unsigned long blk;
+        ext3_fsblk_t blk;
        __u32 *data, *end;
        int gdbackups = 0;
        int res, i;
@@ -540,7 +543,8 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
        for (res = 0; res < reserved_gdb; res++, blk++) {
                if (le32_to_cpu(*data) != blk) {
                        ext3_warning(sb, __FUNCTION__,
-                                     "reserved block %lu not at offset %ld",
+                                     "reserved block "E3FSBLK
+                                     " not at offset %ld",
                                     blk, (long)(data - (__u32 *)dind->b_data));
                        err = -EINVAL;
                        goto exit_bh;
@@ -902,15 +906,16 @@ exit_put:
 * GDT blocks are reserved to grow to the desired size.
 */
 int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
-                      unsigned long n_blocks_count)
+                      ext3_fsblk_t n_blocks_count)
 {
-        unsigned long o_blocks_count;
+        ext3_fsblk_t o_blocks_count;
        unsigned long o_groups_count;
-        unsigned long last;
+        ext3_grpblk_t last;
-        int add;
+        ext3_grpblk_t add;
        struct buffer_head * bh;
        handle_t *handle;
-        int err, freed_blocks;
+        int err;
+        unsigned long freed_blocks;
        /* We don't need to worry about locking wrt other resizers just
         * yet: we're going to revalidate es->s_blocks_count after
@@ -919,12 +924,22 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
        o_groups_count = EXT3_SB(sb)->s_groups_count;
        if (test_opt(sb, DEBUG))
-                printk(KERN_DEBUG "EXT3-fs: extending last group from %lu to %lu blocks\n",
+                printk(KERN_DEBUG "EXT3-fs: extending last group from "E3FSBLK" uto "E3FSBLK" blocks\n",
                       o_blocks_count, n_blocks_count);
        if (n_blocks_count == 0 || n_blocks_count == o_blocks_count)
                return 0;
+        if (n_blocks_count > (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) {
+                printk(KERN_ERR "EXT3-fs: filesystem on %s:"
+                        " too large to resize to %lu blocks safely\n",
+                        sb->s_id, n_blocks_count);
+                if (sizeof(sector_t) < 8)
+                        ext3_warning(sb, __FUNCTION__,
+                        "CONFIG_LBD not enabled\n");
+                return -EINVAL;
+        }
        if (n_blocks_count < o_blocks_count) {
                ext3_warning(sb, __FUNCTION__,
                             "can't shrink FS - resize aborted");
@@ -948,7 +963,8 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
        if (o_blocks_count + add < n_blocks_count)
                ext3_warning(sb, __FUNCTION__,
-                             "will only finish group (%lu blocks, %u new)",
+                             "will only finish group ("E3FSBLK
+                             " blocks, %u new)",
                             o_blocks_count + add, add);
        /* See if the device is actually as big as what was requested */
@@ -991,10 +1007,10 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
        ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
        sb->s_dirt = 1;
        unlock_super(sb);
-        ext3_debug("freeing blocks %ld through %ld\n", o_blocks_count,
+        ext3_debug("freeing blocks %lu through "E3FSBLK"\n", o_blocks_count,
                   o_blocks_count + add);
        ext3_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks);
-        ext3_debug("freed blocks %ld through %ld\n", o_blocks_count,
+        ext3_debug("freed blocks "E3FSBLK" through "E3FSBLK"\n", o_blocks_count,
                   o_blocks_count + add);
        if ((err = ext3_journal_stop(handle)))
                goto exit_put;
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index f8a5266ea1ff..f2dd71336612 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -16,7 +16,6 @@
 *        David S. Miller (davem@caip.rutgers.edu), 1995
 */
-#include <linux/config.h>
 #include <linux/module.h>
 #include <linux/string.h>
 #include <linux/fs.h>
@@ -58,7 +57,7 @@ static int ext3_sync_fs(struct super_block *sb, int wait);
 static const char *ext3_decode_error(struct super_block * sb, int errno,
                                     char nbuf[16]);
 static int ext3_remount (struct super_block * sb, int * flags, char * data);
-static int ext3_statfs (struct super_block * sb, struct kstatfs * buf);
+static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf);
 static void ext3_unlockfs(struct super_block *sb);
 static void ext3_write_super (struct super_block * sb);
 static void ext3_write_super_lockfs(struct super_block *sb);
@@ -499,20 +498,21 @@ static void ext3_clear_inode(struct inode *inode)
 {
        struct ext3_block_alloc_info *rsv = EXT3_I(inode)->i_block_alloc_info;
 #ifdef CONFIG_EXT3_FS_POSIX_ACL
-       if (EXT3_I(inode)->i_acl &&
+        if (EXT3_I(inode)->i_acl &&
-           EXT3_I(inode)->i_acl != EXT3_ACL_NOT_CACHED) {
+                        EXT3_I(inode)->i_acl != EXT3_ACL_NOT_CACHED) {
-               posix_acl_release(EXT3_I(inode)->i_acl);
+                posix_acl_release(EXT3_I(inode)->i_acl);
-               EXT3_I(inode)->i_acl = EXT3_ACL_NOT_CACHED;
+                EXT3_I(inode)->i_acl = EXT3_ACL_NOT_CACHED;
-       }
+        }
-       if (EXT3_I(inode)->i_default_acl &&
+        if (EXT3_I(inode)->i_default_acl &&
-           EXT3_I(inode)->i_default_acl != EXT3_ACL_NOT_CACHED) {
+                        EXT3_I(inode)->i_default_acl != EXT3_ACL_NOT_CACHED) {
-               posix_acl_release(EXT3_I(inode)->i_default_acl);
+                posix_acl_release(EXT3_I(inode)->i_default_acl);
-               EXT3_I(inode)->i_default_acl = EXT3_ACL_NOT_CACHED;
+                EXT3_I(inode)->i_default_acl = EXT3_ACL_NOT_CACHED;
-       }
+        }
 #endif
        ext3_discard_reservation(inode);
        EXT3_I(inode)->i_block_alloc_info = NULL;
-        kfree(rsv);
+        if (unlikely(rsv))
+                kfree(rsv);
 }
 static inline void ext3_show_quota_options(struct seq_file *seq, struct super_block *sb)
@@ -629,7 +629,7 @@ enum {
        Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro,
        Opt_nouid32, Opt_nocheck, Opt_debug, Opt_oldalloc, Opt_orlov,
        Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
-        Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh,
+        Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh,
        Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev,
        Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
        Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
@@ -665,6 +665,7 @@ static match_table_t tokens = {
        {Opt_noreservation, "noreservation"},
        {Opt_noload, "noload"},
        {Opt_nobh, "nobh"},
+        {Opt_bh, "bh"},
        {Opt_commit, "commit=%u"},
        {Opt_journal_update, "journal=update"},
        {Opt_journal_inum, "journal=%u"},
@@ -688,14 +689,15 @@ static match_table_t tokens = {
        {Opt_resize, "resize"},
 };
-static unsigned long get_sb_block(void **data)
+static ext3_fsblk_t get_sb_block(void **data)
 {
-        unsigned long   sb_block;
+        ext3_fsblk_t    sb_block;
        char            *options = (char *) *data;
        if (!options || strncmp(options, "sb=", 3) != 0)
                return 1;       /* Default location */
        options += 3;
+        /*todo: use simple_strtoll with >32bit ext3 */
        sb_block = simple_strtoul(options, &options, 0);
        if (*options && *options != ',') {
                printk("EXT3-fs: Invalid sb specification: %s\n",
@@ -710,7 +712,7 @@ static unsigned long get_sb_block(void **data)
 static int parse_options (char *options, struct super_block *sb,
                          unsigned long *inum, unsigned long *journal_devnum,
-                          unsigned long *n_blocks_count, int is_remount)
+                          ext3_fsblk_t *n_blocks_count, int is_remount)
 {
        struct ext3_sb_info *sbi = EXT3_SB(sb);
        char * p;
@@ -1012,6 +1014,9 @@ clear_qf_name:
                case Opt_nobh:
                        set_opt(sbi->s_mount_opt, NOBH);
                        break;
+                case Opt_bh:
+                        clear_opt(sbi->s_mount_opt, NOBH);
+                        break;
                default:
                        printk (KERN_ERR
                                "EXT3-fs: Unrecognized mount option \"%s\" "
@@ -1127,7 +1132,7 @@ static int ext3_setup_super(struct super_block *sb, struct ext3_super_block *es,
 static int ext3_check_descriptors (struct super_block * sb)
 {
        struct ext3_sb_info *sbi = EXT3_SB(sb);
-        unsigned long block = le32_to_cpu(sbi->s_es->s_first_data_block);
+        ext3_fsblk_t block = le32_to_cpu(sbi->s_es->s_first_data_block);
        struct ext3_group_desc * gdp = NULL;
        int desc_block = 0;
        int i;
@@ -1314,15 +1319,14 @@ static loff_t ext3_max_size(int bits)
        return res;
 }
-static unsigned long descriptor_loc(struct super_block *sb,
+static ext3_fsblk_t descriptor_loc(struct super_block *sb,
-                                    unsigned long logic_sb_block,
+                                    ext3_fsblk_t logic_sb_block,
                                    int nr)
 {
        struct ext3_sb_info *sbi = EXT3_SB(sb);
-        unsigned long bg, first_data_block, first_meta_bg;
+        unsigned long bg, first_meta_bg;
        int has_super = 0;
-        first_data_block = le32_to_cpu(sbi->s_es->s_first_data_block);
        first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg);
        if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_META_BG) ||
@@ -1331,7 +1335,7 @@ static unsigned long descriptor_loc(struct super_block *sb,
        bg = sbi->s_desc_per_block * nr;
        if (ext3_bg_has_super(sb, bg))
                has_super = 1;
-        return (first_data_block + has_super + (bg * sbi->s_blocks_per_group));
+        return (has_super + ext3_group_first_block_no(sb, bg));
 }
@@ -1340,9 +1344,9 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
        struct buffer_head * bh;
        struct ext3_super_block *es = NULL;
        struct ext3_sb_info *sbi;
-        unsigned long block;
+        ext3_fsblk_t block;
-        unsigned long sb_block = get_sb_block(&data);
+        ext3_fsblk_t sb_block = get_sb_block(&data);
-        unsigned long logic_sb_block;
+        ext3_fsblk_t logic_sb_block;
        unsigned long offset = 0;
        unsigned long journal_inum = 0;
        unsigned long journal_devnum = 0;
@@ -1564,6 +1568,16 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
                goto failed_mount;
        }
+        if (le32_to_cpu(es->s_blocks_count) >
+                    (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) {
+                printk(KERN_ERR "EXT3-fs: filesystem on %s:"
+                        " too large to mount safely\n", sb->s_id);
+                if (sizeof(sector_t) < 8)
+                        printk(KERN_WARNING "EXT3-fs: CONFIG_LBD not "
+                                        "enabled\n");
+                goto failed_mount;
+        }
        if (EXT3_BLOCKS_PER_GROUP(sb) == 0)
                goto cantfind_ext3;
        sbi->s_groups_count = (le32_to_cpu(es->s_blocks_count) -
@@ -1579,9 +1593,6 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
                goto failed_mount;
        }
-        percpu_counter_init(&sbi->s_freeblocks_counter);
-        percpu_counter_init(&sbi->s_freeinodes_counter);
-        percpu_counter_init(&sbi->s_dirs_counter);
        bgl_lock_init(&sbi->s_blockgroup_lock);
        for (i = 0; i < db_count; i++) {
@@ -1595,12 +1606,20 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
                }
        }
        if (!ext3_check_descriptors (sb)) {
-                printk (KERN_ERR "EXT3-fs: group descriptors corrupted !\n");
+                printk(KERN_ERR "EXT3-fs: group descriptors corrupted!\n");
                goto failed_mount2;
        }
        sbi->s_gdb_count = db_count;
        get_random_bytes(&sbi->s_next_generation, sizeof(u32));
        spin_lock_init(&sbi->s_next_gen_lock);
+        percpu_counter_init(&sbi->s_freeblocks_counter,
+                ext3_count_free_blocks(sb));
+        percpu_counter_init(&sbi->s_freeinodes_counter,
+                ext3_count_free_inodes(sb));
+        percpu_counter_init(&sbi->s_dirs_counter,
+                ext3_count_dirs(sb));
        /* per filesystem reservation list head & lock */
        spin_lock_init(&sbi->s_rsv_window_lock);
        sbi->s_rsv_window_root = RB_ROOT;
@@ -1639,16 +1658,16 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
        if (!test_opt(sb, NOLOAD) &&
            EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL)) {
                if (ext3_load_journal(sb, es, journal_devnum))
-                        goto failed_mount2;
+                        goto failed_mount3;
        } else if (journal_inum) {
                if (ext3_create_journal(sb, es, journal_inum))
-                        goto failed_mount2;
+                        goto failed_mount3;
        } else {
                if (!silent)
                        printk (KERN_ERR
                                "ext3: No journal on filesystem on %s\n",
                                sb->s_id);
-                goto failed_mount2;
+                goto failed_mount3;
        }
        /* We have now updated the journal if required, so we can
@@ -1671,7 +1690,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
                    (sbi->s_journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)) {
                        printk(KERN_ERR "EXT3-fs: Journal does not support "
                               "requested data journaling mode\n");
-                        goto failed_mount3;
+                        goto failed_mount4;
                }
        default:
                break;
@@ -1694,13 +1713,13 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
        if (!sb->s_root) {
                printk(KERN_ERR "EXT3-fs: get root inode failed\n");
                iput(root);
-                goto failed_mount3;
+                goto failed_mount4;
        }
        if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
                dput(sb->s_root);
                sb->s_root = NULL;
                printk(KERN_ERR "EXT3-fs: corrupt root inode, run e2fsck\n");
-                goto failed_mount3;
+                goto failed_mount4;
        }
        ext3_setup_super (sb, es, sb->s_flags & MS_RDONLY);
@@ -1723,13 +1742,6 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
                test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered":
                "writeback");
-        percpu_counter_mod(&sbi->s_freeblocks_counter,
-                ext3_count_free_blocks(sb));
-        percpu_counter_mod(&sbi->s_freeinodes_counter,
-                ext3_count_free_inodes(sb));
-        percpu_counter_mod(&sbi->s_dirs_counter,
-                ext3_count_dirs(sb));
        lock_kernel();
        return 0;
@@ -1739,8 +1751,12 @@ cantfind_ext3:
                       sb->s_id);
        goto failed_mount;
-failed_mount3:
+failed_mount4:
        journal_destroy(sbi->s_journal);
+failed_mount3:
+        percpu_counter_destroy(&sbi->s_freeblocks_counter);
+        percpu_counter_destroy(&sbi->s_freeinodes_counter);
+        percpu_counter_destroy(&sbi->s_dirs_counter);
 failed_mount2:
        for (i = 0; i < db_count; i++)
                brelse(sbi->s_group_desc[i]);
@@ -1827,10 +1843,10 @@ static journal_t *ext3_get_dev_journal(struct super_block *sb,
 {
        struct buffer_head * bh;
        journal_t *journal;
-        int start;
+        ext3_fsblk_t start;
-        int len;
+        ext3_fsblk_t len;
        int hblock, blocksize;
-        unsigned long sb_block;
+        ext3_fsblk_t sb_block;
        unsigned long offset;
        struct ext3_super_block * es;
        struct block_device *bdev;
@@ -2203,7 +2219,7 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
 {
        struct ext3_super_block * es;
        struct ext3_sb_info *sbi = EXT3_SB(sb);
-        unsigned long n_blocks_count = 0;
+        ext3_fsblk_t n_blocks_count = 0;
        unsigned long old_sb_flags;
        struct ext3_mount_options old_opts;
        int err;
@@ -2318,11 +2334,12 @@ restore_opts:
        return err;
 }
-static int ext3_statfs (struct super_block * sb, struct kstatfs * buf)
+static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf)
 {
+        struct super_block *sb = dentry->d_sb;
        struct ext3_sb_info *sbi = EXT3_SB(sb);
        struct ext3_super_block *es = sbi->s_es;
-        unsigned long overhead;
+        ext3_fsblk_t overhead;
        int i;
        if (test_opt (sb, MINIX_DF))
@@ -2646,10 +2663,10 @@ out:
 #endif
-static struct super_block *ext3_get_sb(struct file_system_type *fs_type,
+static int ext3_get_sb(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data)
+        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-        return get_sb_bdev(fs_type, flags, dev_name, data, ext3_fill_super);
+        return get_sb_bdev(fs_type, flags, dev_name, data, ext3_fill_super, mnt);
 }
 static struct file_system_type ext3_fs_type = {
diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c
index e8d60bf6b7df..a44a0562203a 100644
--- a/fs/ext3/xattr.c
+++ b/fs/ext3/xattr.c
@@ -225,7 +225,7 @@ ext3_xattr_block_get(struct inode *inode, int name_index, const char *name,
        error = -ENODATA;
        if (!EXT3_I(inode)->i_file_acl)
                goto cleanup;
-        ea_idebug(inode, "reading block %d", EXT3_I(inode)->i_file_acl);
+        ea_idebug(inode, "reading block %u", EXT3_I(inode)->i_file_acl);
        bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl);
        if (!bh)
                goto cleanup;
@@ -233,7 +233,7 @@ ext3_xattr_block_get(struct inode *inode, int name_index, const char *name,
                atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
        if (ext3_xattr_check_block(bh)) {
 bad_block:      ext3_error(inode->i_sb, __FUNCTION__,
-                           "inode %ld: bad block %d", inode->i_ino,
+                           "inode %ld: bad block "E3FSBLK, inode->i_ino,
                           EXT3_I(inode)->i_file_acl);
                error = -EIO;
                goto cleanup;
@@ -366,7 +366,7 @@ ext3_xattr_block_list(struct inode *inode, char *buffer, size_t buffer_size)
        error = 0;
        if (!EXT3_I(inode)->i_file_acl)
                goto cleanup;
-        ea_idebug(inode, "reading block %d", EXT3_I(inode)->i_file_acl);
+        ea_idebug(inode, "reading block %u", EXT3_I(inode)->i_file_acl);
        bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl);
        error = -EIO;
        if (!bh)
@@ -375,7 +375,7 @@ ext3_xattr_block_list(struct inode *inode, char *buffer, size_t buffer_size)
                atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
        if (ext3_xattr_check_block(bh)) {
                ext3_error(inode->i_sb, __FUNCTION__,
-                           "inode %ld: bad block %d", inode->i_ino,
+                           "inode %ld: bad block "E3FSBLK, inode->i_ino,
                           EXT3_I(inode)->i_file_acl);
                error = -EIO;
                goto cleanup;
@@ -647,7 +647,7 @@ ext3_xattr_block_find(struct inode *inode, struct ext3_xattr_info *i,
                        le32_to_cpu(BHDR(bs->bh)->h_refcount));
                if (ext3_xattr_check_block(bs->bh)) {
                        ext3_error(sb, __FUNCTION__,
-                                "inode %ld: bad block %d", inode->i_ino,
+                                "inode %ld: bad block "E3FSBLK, inode->i_ino,
                                EXT3_I(inode)->i_file_acl);
                        error = -EIO;
                        goto cleanup;
@@ -792,11 +792,12 @@ inserted:
                        get_bh(new_bh);
                } else {
                        /* We need to allocate a new block */
-                        int goal = le32_to_cpu(
+                        ext3_fsblk_t goal = le32_to_cpu(
                                        EXT3_SB(sb)->s_es->s_first_data_block) +
-                                EXT3_I(inode)->i_block_group *
+                                (ext3_fsblk_t)EXT3_I(inode)->i_block_group *
                                EXT3_BLOCKS_PER_GROUP(sb);
-                        int block = ext3_new_block(handle, inode, goal, &error);
+                        ext3_fsblk_t block = ext3_new_block(handle, inode,
+                                                        goal, &error);
                        if (error)
                                goto cleanup;
                        ea_idebug(inode, "creating block %d", block);
@@ -847,7 +848,7 @@ cleanup_dquot:
 bad_block:
        ext3_error(inode->i_sb, __FUNCTION__,
-                   "inode %ld: bad block %d", inode->i_ino,
+                   "inode %ld: bad block "E3FSBLK, inode->i_ino,
                   EXT3_I(inode)->i_file_acl);
        goto cleanup;
@@ -1076,14 +1077,14 @@ ext3_xattr_delete_inode(handle_t *handle, struct inode *inode)
        bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl);
        if (!bh) {
                ext3_error(inode->i_sb, __FUNCTION__,
-                        "inode %ld: block %d read error", inode->i_ino,
+                        "inode %ld: block "E3FSBLK" read error", inode->i_ino,
                        EXT3_I(inode)->i_file_acl);
                goto cleanup;
        }
        if (BHDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) ||
            BHDR(bh)->h_blocks != cpu_to_le32(1)) {
                ext3_error(inode->i_sb, __FUNCTION__,
-                        "inode %ld: bad block %d", inode->i_ino,
+                        "inode %ld: bad block "E3FSBLK, inode->i_ino,
                        EXT3_I(inode)->i_file_acl);
                goto cleanup;
        }
@@ -1210,11 +1211,11 @@ again:
                bh = sb_bread(inode->i_sb, ce->e_block);
                if (!bh) {
                        ext3_error(inode->i_sb, __FUNCTION__,
-                                "inode %ld: block %ld read error",
+                                "inode %ld: block %lu read error",
                                inode->i_ino, (unsigned long) ce->e_block);
                } else if (le32_to_cpu(BHDR(bh)->h_refcount) >=
                                EXT3_XATTR_REFCOUNT_MAX) {
-                        ea_idebug(inode, "block %ld refcount %d>=%d",
+                        ea_idebug(inode, "block %lu refcount %d>=%d",
                                  (unsigned long) ce->e_block,
                                  le32_to_cpu(BHDR(bh)->h_refcount),
                                          EXT3_XATTR_REFCOUNT_MAX);
diff --git a/fs/ext3/xattr.h b/fs/ext3/xattr.h
index 2ceae38f3d49..6b1ae1c6182c 100644
--- a/fs/ext3/xattr.h
+++ b/fs/ext3/xattr.h
@@ -6,7 +6,6 @@
  (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org>
 */
-#include <linux/config.h>
 #include <linux/xattr.h>
 /* Magic value in attribute blocks */
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index c1ce284f8a94..31b7174176ba 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -196,7 +196,7 @@ static sector_t _fat_bmap(struct address_space *mapping, sector_t block)
        return generic_block_bmap(mapping, block, fat_get_block);
 }
-static struct address_space_operations fat_aops = {
+static const struct address_space_operations fat_aops = {
        .readpage       = fat_readpage,
        .readpages      = fat_readpages,
        .writepage      = fat_writepage,
@@ -539,18 +539,18 @@ static int fat_remount(struct super_block *sb, int *flags, char *data)
        return 0;
 }
-static int fat_statfs(struct super_block *sb, struct kstatfs *buf)
+static int fat_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
-        struct msdos_sb_info *sbi = MSDOS_SB(sb);
+        struct msdos_sb_info *sbi = MSDOS_SB(dentry->d_sb);
        /* If the count of free cluster is still unknown, counts it here. */
        if (sbi->free_clusters == -1) {
-                int err = fat_count_free_clusters(sb);
+                int err = fat_count_free_clusters(dentry->d_sb);
                if (err)
                        return err;
        }
-        buf->f_type = sb->s_magic;
+        buf->f_type = dentry->d_sb->s_magic;
        buf->f_bsize = sbi->cluster_size;
        buf->f_blocks = sbi->max_cluster - FAT_START_ENT;
        buf->f_bfree = sbi->free_clusters;
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index 944652e9dde1..308f2b6b5026 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -210,4 +210,3 @@ int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs)
        return err;
 }
-EXPORT_SYMBOL_GPL(fat_sync_bhs);
diff --git a/fs/file_table.c b/fs/file_table.c
index bcea1998b4de..0131ba06e1ee 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -5,7 +5,6 @@
 *  Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu)
 */
-#include <linux/config.h>
 #include <linux/string.h>
 #include <linux/slab.h>
 #include <linux/file.h>
@@ -300,5 +299,5 @@ void __init files_init(unsigned long mempages)
        if (files_stat.max_files < NR_FILE)
                files_stat.max_files = NR_FILE;
        files_defer_init();
-        percpu_counter_init(&nr_files);
+        percpu_counter_init(&nr_files, 0);
 } 
diff --git a/fs/freevxfs/vxfs.h b/fs/freevxfs/vxfs.h
index 583bd78086d8..d35979a58743 100644
--- a/fs/freevxfs/vxfs.h
+++ b/fs/freevxfs/vxfs.h
@@ -159,11 +159,11 @@ struct vxfs_sb {
 * In core superblock filesystem private data for VxFS.
 */
 struct vxfs_sb_info {
-        struct vxfs_sb          *vsi_raw;       /* raw (on disk) supeblock */
+        struct vxfs_sb          *vsi_raw;       /* raw (on disk) superblock */
        struct buffer_head      *vsi_bp;        /* buffer for raw superblock*/
        struct inode            *vsi_fship;     /* fileset header inode */
        struct inode            *vsi_ilist;     /* inode list inode */
-        struct inode            *vsi_stilist;   /* structual inode list inode */
+        struct inode            *vsi_stilist;   /* structural inode list inode */
        u_long                  vsi_iext;       /* initial inode list */
        ino_t                   vsi_fshino;     /* fileset header inode */
        daddr_t                 vsi_oltext;     /* OLT extent */
diff --git a/fs/freevxfs/vxfs_fshead.c b/fs/freevxfs/vxfs_fshead.c
index 6dee109aeea4..78948b4b1894 100644
--- a/fs/freevxfs/vxfs_fshead.c
+++ b/fs/freevxfs/vxfs_fshead.c
@@ -112,7 +112,7 @@ vxfs_read_fshead(struct super_block *sbp)
        vip = vxfs_blkiget(sbp, infp->vsi_iext, infp->vsi_fshino);
        if (!vip) {
-                printk(KERN_ERR "vxfs: unabled to read fsh inode\n");
+                printk(KERN_ERR "vxfs: unable to read fsh inode\n");
                return -EINVAL;
        }
        if (!VXFS_ISFSH(vip)) {
@@ -129,13 +129,13 @@ vxfs_read_fshead(struct super_block *sbp)
        infp->vsi_fship = vxfs_get_fake_inode(sbp, vip);
        if (!infp->vsi_fship) {
-                printk(KERN_ERR "vxfs: unabled to get fsh inode\n");
+                printk(KERN_ERR "vxfs: unable to get fsh inode\n");
                goto out_free_fship;
        }
        sfp = vxfs_getfsh(infp->vsi_fship, 0);
        if (!sfp) {
-                printk(KERN_ERR "vxfs: unabled to get structural fsh\n");
+                printk(KERN_ERR "vxfs: unable to get structural fsh\n");
                goto out_iput_fship;
        } 
@@ -145,7 +145,7 @@ vxfs_read_fshead(struct super_block *sbp)
        pfp = vxfs_getfsh(infp->vsi_fship, 1);
        if (!pfp) {
-                printk(KERN_ERR "vxfs: unabled to get primary fsh\n");
+                printk(KERN_ERR "vxfs: unable to get primary fsh\n");
                goto out_free_sfp;
        }
@@ -159,7 +159,7 @@ vxfs_read_fshead(struct super_block *sbp)
        infp->vsi_stilist = vxfs_get_fake_inode(sbp, tip);
        if (!infp->vsi_stilist) {
-                printk(KERN_ERR "vxfs: unabled to get structual list inode\n");
+                printk(KERN_ERR "vxfs: unable to get structural list inode\n");
                kfree(tip);
                goto out_free_pfp;
        }
@@ -174,7 +174,7 @@ vxfs_read_fshead(struct super_block *sbp)
                goto out_iput_stilist;
        infp->vsi_ilist = vxfs_get_fake_inode(sbp, tip);
        if (!infp->vsi_ilist) {
-                printk(KERN_ERR "vxfs: unabled to get inode list inode\n");
+                printk(KERN_ERR "vxfs: unable to get inode list inode\n");
                kfree(tip);
                goto out_iput_stilist;
        }
diff --git a/fs/freevxfs/vxfs_immed.c b/fs/freevxfs/vxfs_immed.c
index 6f5df1700e95..4e25f3fbed86 100644
--- a/fs/freevxfs/vxfs_immed.c
+++ b/fs/freevxfs/vxfs_immed.c
@@ -56,7 +56,7 @@ struct inode_operations vxfs_immed_symlink_iops = {
 /*
 * Adress space operations for immed files and directories.
 */
-struct address_space_operations vxfs_immed_aops = {
+const struct address_space_operations vxfs_immed_aops = {
        .readpage =             vxfs_immed_readpage,
 };
diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c
index f544aae9169f..ca6a39714771 100644
--- a/fs/freevxfs/vxfs_inode.c
+++ b/fs/freevxfs/vxfs_inode.c
@@ -41,8 +41,8 @@
 #include "vxfs_extern.h"
-extern struct address_space_operations vxfs_aops;
+extern const struct address_space_operations vxfs_aops;
-extern struct address_space_operations vxfs_immed_aops;
+extern const struct address_space_operations vxfs_immed_aops;
 extern struct inode_operations vxfs_immed_symlink_iops;
@@ -295,7 +295,7 @@ vxfs_read_inode(struct inode *ip)
 {
        struct super_block              *sbp = ip->i_sb;
        struct vxfs_inode_info          *vip;
-        struct address_space_operations *aops;
+        const struct address_space_operations   *aops;
        ino_t                           ino = ip->i_ino;
        if (!(vip = __vxfs_iget(ino, VXFS_SBI(sbp)->vsi_ilist)))
diff --git a/fs/freevxfs/vxfs_subr.c b/fs/freevxfs/vxfs_subr.c
index 50aae77651b2..decac62efe57 100644
--- a/fs/freevxfs/vxfs_subr.c
+++ b/fs/freevxfs/vxfs_subr.c
@@ -42,7 +42,7 @@
 static int              vxfs_readpage(struct file *, struct page *);
 static sector_t         vxfs_bmap(struct address_space *, sector_t);
-struct address_space_operations vxfs_aops = {
+const struct address_space_operations vxfs_aops = {
        .readpage =             vxfs_readpage,
        .bmap =                 vxfs_bmap,
        .sync_page =            block_sync_page,
@@ -71,8 +71,7 @@ vxfs_get_page(struct address_space *mapping, u_long n)
 {
        struct page *                   pp;
-        pp = read_cache_page(mapping, n,
+        pp = read_mapping_page(mapping, n, NULL);
-                        (filler_t*)mapping->a_ops->readpage, NULL);
        if (!IS_ERR(pp)) {
                wait_on_page_locked(pp);
diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c
index b44c916d24a1..b74b791fc23b 100644
--- a/fs/freevxfs/vxfs_super.c
+++ b/fs/freevxfs/vxfs_super.c
@@ -40,6 +40,7 @@
 #include <linux/slab.h>
 #include <linux/stat.h>
 #include <linux/vfs.h>
+#include <linux/mount.h>
 #include "vxfs.h"
 #include "vxfs_extern.h"
@@ -55,7 +56,7 @@ MODULE_ALIAS("vxfs"); /* makes mount -t vxfs autoload the module */
 static void             vxfs_put_super(struct super_block *);
-static int              vxfs_statfs(struct super_block *, struct kstatfs *);
+static int              vxfs_statfs(struct dentry *, struct kstatfs *);
 static int              vxfs_remount(struct super_block *, int *, char *);
 static struct super_operations vxfs_super_ops = {
@@ -90,12 +91,12 @@ vxfs_put_super(struct super_block *sbp)
 /**
 * vxfs_statfs - get filesystem information
- * @sbp:        VFS superblock
+ * @dentry:     VFS dentry to locate superblock
 * @bufp:       output buffer
 *
 * Description:
 *   vxfs_statfs fills the statfs buffer @bufp with information
- *   about the filesystem described by @sbp.
+ *   about the filesystem described by @dentry.
 *
 * Returns:
 *   Zero.
@@ -107,12 +108,12 @@ vxfs_put_super(struct super_block *sbp)
 *   This is everything but complete...
 */
 static int
-vxfs_statfs(struct super_block *sbp, struct kstatfs *bufp)
+vxfs_statfs(struct dentry *dentry, struct kstatfs *bufp)
 {
-        struct vxfs_sb_info             *infp = VXFS_SBI(sbp);
+        struct vxfs_sb_info             *infp = VXFS_SBI(dentry->d_sb);
        bufp->f_type = VXFS_SUPER_MAGIC;
-        bufp->f_bsize = sbp->s_blocksize;
+        bufp->f_bsize = dentry->d_sb->s_blocksize;
        bufp->f_blocks = infp->vsi_raw->vs_dsize;
        bufp->f_bfree = infp->vsi_raw->vs_free;
        bufp->f_bavail = 0;
@@ -241,10 +242,11 @@ out:
 /*
 * The usual module blurb.
 */
-static struct super_block *vxfs_get_sb(struct file_system_type *fs_type,
+static int vxfs_get_sb(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data)
+        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-        return get_sb_bdev(fs_type, flags, dev_name, data, vxfs_fill_super);
+        return get_sb_bdev(fs_type, flags, dev_name, data, vxfs_fill_super,
+                           mnt);
 }
 static struct file_system_type vxfs_fs_type = {
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index f3fbe2d030f4..892643dc9af1 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -461,9 +461,11 @@ void sync_inodes_sb(struct super_block *sb, int wait)
 {
        struct writeback_control wbc = {
                .sync_mode      = wait ? WB_SYNC_ALL : WB_SYNC_HOLD,
+                .range_start    = 0,
+                .range_end      = LLONG_MAX,
        };
-        unsigned long nr_dirty = read_page_state(nr_dirty);
+        unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
-        unsigned long nr_unstable = read_page_state(nr_unstable);
+        unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
        wbc.nr_to_write = nr_dirty + nr_unstable +
                        (inodes_stat.nr_inodes - inodes_stat.nr_unused) +
@@ -559,6 +561,8 @@ int write_inode_now(struct inode *inode, int sync)
        struct writeback_control wbc = {
                .nr_to_write = LONG_MAX,
                .sync_mode = WB_SYNC_ALL,
+                .range_start = 0,
+                .range_end = LLONG_MAX,
        };
        if (!mapping_cap_writeback_dirty(inode->i_mapping))
@@ -619,7 +623,6 @@ int generic_osync_inode(struct inode *inode, struct address_space *mapping, int
        int need_write_inode_now = 0;
        int err2;
-        current->flags |= PF_SYNCWRITE;
        if (what & OSYNC_DATA)
                err = filemap_fdatawrite(mapping);
        if (what & (OSYNC_METADATA|OSYNC_DATA)) {
@@ -632,7 +635,6 @@ int generic_osync_inode(struct inode *inode, struct address_space *mapping, int
                if (!err)
                        err = err2;
        }
-        current->flags &= ~PF_SYNCWRITE;
        spin_lock(&inode_lock);
        if ((inode->i_state & I_DIRTY) &&
diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile
index c3e1f760cac9..72437065f6ad 100644
--- a/fs/fuse/Makefile
+++ b/fs/fuse/Makefile
@@ -4,4 +4,4 @@
 obj-$(CONFIG_FUSE_FS) += fuse.o
-fuse-objs := dev.o dir.o file.o inode.o
+fuse-objs := dev.o dir.o file.o inode.o control.o
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
new file mode 100644
index 000000000000..a3bce3a77253
--- /dev/null
+++ b/fs/fuse/control.c
@@ -0,0 +1,218 @@
+/*
+  FUSE: Filesystem in Userspace
+  Copyright (C) 2001-2006  Miklos Szeredi <miklos@szeredi.hu>
+  This program can be distributed under the terms of the GNU GPL.
+  See the file COPYING.
+*/
+#include "fuse_i.h"
+#include <linux/init.h>
+#include <linux/module.h>
+#define FUSE_CTL_SUPER_MAGIC 0x65735543
+/*
+ * This is non-NULL when the single instance of the control filesystem
+ * exists.  Protected by fuse_mutex
+ */
+static struct super_block *fuse_control_sb;
+static struct fuse_conn *fuse_ctl_file_conn_get(struct file *file)
+{
+        struct fuse_conn *fc;
+        mutex_lock(&fuse_mutex);
+        fc = file->f_dentry->d_inode->u.generic_ip;
+        if (fc)
+                fc = fuse_conn_get(fc);
+        mutex_unlock(&fuse_mutex);
+        return fc;
+}
+static ssize_t fuse_conn_abort_write(struct file *file, const char __user *buf,
+                                     size_t count, loff_t *ppos)
+{
+        struct fuse_conn *fc = fuse_ctl_file_conn_get(file);
+        if (fc) {
+                fuse_abort_conn(fc);
+                fuse_conn_put(fc);
+        }
+        return count;
+}
+static ssize_t fuse_conn_waiting_read(struct file *file, char __user *buf,
+                                      size_t len, loff_t *ppos)
+{
+        char tmp[32];
+        size_t size;
+        if (!*ppos) {
+                struct fuse_conn *fc = fuse_ctl_file_conn_get(file);
+                if (!fc)
+                        return 0;
+                file->private_data=(void *)(long)atomic_read(&fc->num_waiting);
+                fuse_conn_put(fc);
+        }
+        size = sprintf(tmp, "%ld\n", (long)file->private_data);
+        return simple_read_from_buffer(buf, len, ppos, tmp, size);
+}
+static const struct file_operations fuse_ctl_abort_ops = {
+        .open = nonseekable_open,
+        .write = fuse_conn_abort_write,
+};
+static const struct file_operations fuse_ctl_waiting_ops = {
+        .open = nonseekable_open,
+        .read = fuse_conn_waiting_read,
+};
+static struct dentry *fuse_ctl_add_dentry(struct dentry *parent,
+                                          struct fuse_conn *fc,
+                                          const char *name,
+                                          int mode, int nlink,
+                                          struct inode_operations *iop,
+                                          const struct file_operations *fop)
+{
+        struct dentry *dentry;
+        struct inode *inode;
+        BUG_ON(fc->ctl_ndents >= FUSE_CTL_NUM_DENTRIES);
+        dentry = d_alloc_name(parent, name);
+        if (!dentry)
+                return NULL;
+        fc->ctl_dentry[fc->ctl_ndents++] = dentry;
+        inode = new_inode(fuse_control_sb);
+        if (!inode)
+                return NULL;
+        inode->i_mode = mode;
+        inode->i_uid = fc->user_id;
+        inode->i_gid = fc->group_id;
+        inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+        /* setting ->i_op to NULL is not allowed */
+        if (iop)
+                inode->i_op = iop;
+        inode->i_fop = fop;
+        inode->i_nlink = nlink;
+        inode->u.generic_ip = fc;
+        d_add(dentry, inode);
+        return dentry;
+}
+/*
+ * Add a connection to the control filesystem (if it exists).  Caller
+ * must hold fuse_mutex
+ */
+int fuse_ctl_add_conn(struct fuse_conn *fc)
+{
+        struct dentry *parent;
+        char name[32];
+        if (!fuse_control_sb)
+                return 0;
+        parent = fuse_control_sb->s_root;
+        parent->d_inode->i_nlink++;
+        sprintf(name, "%llu", (unsigned long long) fc->id);
+        parent = fuse_ctl_add_dentry(parent, fc, name, S_IFDIR | 0500, 2,
+                                     &simple_dir_inode_operations,
+                                     &simple_dir_operations);
+        if (!parent)
+                goto err;
+        if (!fuse_ctl_add_dentry(parent, fc, "waiting", S_IFREG | 0400, 1,
+                                NULL, &fuse_ctl_waiting_ops) ||
+            !fuse_ctl_add_dentry(parent, fc, "abort", S_IFREG | 0200, 1,
+                                 NULL, &fuse_ctl_abort_ops))
+                goto err;
+        return 0;
+ err:
+        fuse_ctl_remove_conn(fc);
+        return -ENOMEM;
+}
+/*
+ * Remove a connection from the control filesystem (if it exists).
+ * Caller must hold fuse_mutex
+ */
+void fuse_ctl_remove_conn(struct fuse_conn *fc)
+{
+        int i;
+        if (!fuse_control_sb)
+                return;
+        for (i = fc->ctl_ndents - 1; i >= 0; i--) {
+                struct dentry *dentry = fc->ctl_dentry[i];
+                dentry->d_inode->u.generic_ip = NULL;
+                d_drop(dentry);
+                dput(dentry);
+        }
+        fuse_control_sb->s_root->d_inode->i_nlink--;
+}
+static int fuse_ctl_fill_super(struct super_block *sb, void *data, int silent)
+{
+        struct tree_descr empty_descr = {""};
+        struct fuse_conn *fc;
+        int err;
+        err = simple_fill_super(sb, FUSE_CTL_SUPER_MAGIC, &empty_descr);
+        if (err)
+                return err;
+        mutex_lock(&fuse_mutex);
+        BUG_ON(fuse_control_sb);
+        fuse_control_sb = sb;
+        list_for_each_entry(fc, &fuse_conn_list, entry) {
+                err = fuse_ctl_add_conn(fc);
+                if (err) {
+                        fuse_control_sb = NULL;
+                        mutex_unlock(&fuse_mutex);
+                        return err;
+                }
+        }
+        mutex_unlock(&fuse_mutex);
+        return 0;
+}
+static int fuse_ctl_get_sb(struct file_system_type *fs_type, int flags,
+                        const char *dev_name, void *raw_data,
+                        struct vfsmount *mnt)
+{
+        return get_sb_single(fs_type, flags, raw_data,
+                                fuse_ctl_fill_super, mnt);
+}
+static void fuse_ctl_kill_sb(struct super_block *sb)
+{
+        mutex_lock(&fuse_mutex);
+        fuse_control_sb = NULL;
+        mutex_unlock(&fuse_mutex);
+        kill_litter_super(sb);
+}
+static struct file_system_type fuse_ctl_fs_type = {
+        .owner          = THIS_MODULE,
+        .name           = "fusectl",
+        .get_sb         = fuse_ctl_get_sb,
+        .kill_sb        = fuse_ctl_kill_sb,
+};
+int __init fuse_ctl_init(void)
+{
+        return register_filesystem(&fuse_ctl_fs_type);
+}
+void fuse_ctl_cleanup(void)
+{
+        unregister_filesystem(&fuse_ctl_fs_type);
+}
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 104a62dadb94..1e2006caf158 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -34,6 +34,7 @@ static void fuse_request_init(struct fuse_req *req)
 {
        memset(req, 0, sizeof(*req));
        INIT_LIST_HEAD(&req->list);
+        INIT_LIST_HEAD(&req->intr_entry);
        init_waitqueue_head(&req->waitq);
        atomic_set(&req->count, 1);
 }
@@ -64,18 +65,6 @@ static void restore_sigs(sigset_t *oldset)
        sigprocmask(SIG_SETMASK, oldset, NULL);
 }
-/*
- * Reset request, so that it can be reused
- *
- * The caller must be _very_ careful to make sure, that it is holding
- * the only reference to req
- */
-void fuse_reset_request(struct fuse_req *req)
-{
-        BUG_ON(atomic_read(&req->count) != 1);
-        fuse_request_init(req);
-}
 static void __fuse_get_request(struct fuse_req *req)
 {
        atomic_inc(&req->count);
@@ -88,6 +77,13 @@ static void __fuse_put_request(struct fuse_req *req)
        atomic_dec(&req->count);
 }
+static void fuse_req_init_context(struct fuse_req *req)
+{
+        req->in.h.uid = current->fsuid;
+        req->in.h.gid = current->fsgid;
+        req->in.h.pid = current->pid;
+}
 struct fuse_req *fuse_get_req(struct fuse_conn *fc)
 {
        struct fuse_req *req;
@@ -103,14 +99,16 @@ struct fuse_req *fuse_get_req(struct fuse_conn *fc)
        if (intr)
                goto out;
+        err = -ENOTCONN;
+        if (!fc->connected)
+                goto out;
        req = fuse_request_alloc();
        err = -ENOMEM;
        if (!req)
                goto out;
-        req->in.h.uid = current->fsuid;
+        fuse_req_init_context(req);
-        req->in.h.gid = current->fsgid;
-        req->in.h.pid = current->pid;
        req->waiting = 1;
        return req;
@@ -119,142 +117,183 @@ struct fuse_req *fuse_get_req(struct fuse_conn *fc)
        return ERR_PTR(err);
 }
-void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req)
+/*
+ * Return request in fuse_file->reserved_req.  However that may
+ * currently be in use.  If that is the case, wait for it to become
+ * available.
+ */
+static struct fuse_req *get_reserved_req(struct fuse_conn *fc,
+                                         struct file *file)
 {
-        if (atomic_dec_and_test(&req->count)) {
+        struct fuse_req *req = NULL;
-                if (req->waiting)
+        struct fuse_file *ff = file->private_data;
-                        atomic_dec(&fc->num_waiting);
-                fuse_request_free(req);
+        do {
-        }
+                wait_event(fc->blocked_waitq, ff->reserved_req);
+                spin_lock(&fc->lock);
+                if (ff->reserved_req) {
+                        req = ff->reserved_req;
+                        ff->reserved_req = NULL;
+                        get_file(file);
+                        req->stolen_file = file;
+                }
+                spin_unlock(&fc->lock);
+        } while (!req);
+        return req;
 }
 /*
- * Called with sbput_sem held for read (request_end) or write
+ * Put stolen request back into fuse_file->reserved_req
- * (fuse_put_super).  By the time fuse_put_super() is finished, all
- * inodes belonging to background requests must be released, so the
- * iputs have to be done within the locked region.
 */
-void fuse_release_background(struct fuse_conn *fc, struct fuse_req *req)
+static void put_reserved_req(struct fuse_conn *fc, struct fuse_req *req)
 {
-        iput(req->inode);
+        struct file *file = req->stolen_file;
-        iput(req->inode2);
+        struct fuse_file *ff = file->private_data;
        spin_lock(&fc->lock);
-        list_del(&req->bg_entry);
+        fuse_request_init(req);
-        if (fc->num_background == FUSE_MAX_BACKGROUND) {
+        BUG_ON(ff->reserved_req);
-                fc->blocked = 0;
+        ff->reserved_req = req;
-                wake_up_all(&fc->blocked_waitq);
+        wake_up(&fc->blocked_waitq);
-        }
-        fc->num_background--;
        spin_unlock(&fc->lock);
+        fput(file);
 }
 /*
- * This function is called when a request is finished.  Either a reply
+ * Gets a request for a file operation, always succeeds
- * has arrived or it was interrupted (and not yet sent) or some error
- * occurred during communication with userspace, or the device file
- * was closed.  In case of a background request the reference to the
- * stored objects are released.  The requester thread is woken up (if
- * still waiting), the 'end' callback is called if given, else the
- * reference to the request is released
 *
- * Releasing extra reference for foreground requests must be done
+ * This is used for sending the FLUSH request, which must get to
- * within the same locked region as setting state to finished.  This
+ * userspace, due to POSIX locks which may need to be unlocked.
- * is because fuse_reset_request() may be called after request is
- * finished and it must be the sole possessor.  If request is
- * interrupted and put in the background, it will return with an error
- * and hence never be reset and reused.
 *
- * Called with fc->lock, unlocks it
+ * If allocation fails due to OOM, use the reserved request in
+ * fuse_file.
+ *
+ * This is very unlikely to deadlock accidentally, since the
+ * filesystem should not have its own file open.  If deadlock is
+ * intentional, it can still be broken by "aborting" the filesystem.
 */
-static void request_end(struct fuse_conn *fc, struct fuse_req *req)
+struct fuse_req *fuse_get_req_nofail(struct fuse_conn *fc, struct file *file)
 {
-        list_del(&req->list);
+        struct fuse_req *req;
-        req->state = FUSE_REQ_FINISHED;
-        if (!req->background) {
-                spin_unlock(&fc->lock);
-                wake_up(&req->waitq);
-                fuse_put_request(fc, req);
-        } else {
-                void (*end) (struct fuse_conn *, struct fuse_req *) = req->end;
-                req->end = NULL;
-                spin_unlock(&fc->lock);
-                down_read(&fc->sbput_sem);
-                if (fc->mounted)
-                        fuse_release_background(fc, req);
-                up_read(&fc->sbput_sem);
-                /* fput must go outside sbput_sem, otherwise it can deadlock */
+        atomic_inc(&fc->num_waiting);
-                if (req->file)
+        wait_event(fc->blocked_waitq, !fc->blocked);
-                        fput(req->file);
+        req = fuse_request_alloc();
+        if (!req)
+                req = get_reserved_req(fc, file);
-                if (end)
+        fuse_req_init_context(req);
-                        end(fc, req);
+        req->waiting = 1;
+        return req;
+}
+void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req)
+{
+        if (atomic_dec_and_test(&req->count)) {
+                if (req->waiting)
+                        atomic_dec(&fc->num_waiting);
+                if (req->stolen_file)
+                        put_reserved_req(fc, req);
                else
-                        fuse_put_request(fc, req);
+                        fuse_request_free(req);
        }
 }
 /*
- * Unfortunately request interruption not just solves the deadlock
+ * This function is called when a request is finished.  Either a reply
- * problem, it causes problems too.  These stem from the fact, that an
+ * has arrived or it was aborted (and not yet sent) or some error
- * interrupted request is continued to be processed in userspace,
+ * occurred during communication with userspace, or the device file
- * while all the locks and object references (inode and file) held
+ * was closed.  The requester thread is woken up (if still waiting),
- * during the operation are released.
+ * the 'end' callback is called if given, else the reference to the
- *
+ * request is released
- * To release the locks is exactly why there's a need to interrupt the
- * request, so there's not a lot that can be done about this, except
- * introduce additional locking in userspace.
- *
- * More important is to keep inode and file references until userspace
- * has replied, otherwise FORGET and RELEASE could be sent while the
- * inode/file is still used by the filesystem.
- *
- * For this reason the concept of "background" request is introduced.
- * An interrupted request is backgrounded if it has been already sent
- * to userspace.  Backgrounding involves getting an extra reference to
- * inode(s) or file used in the request, and adding the request to
- * fc->background list.  When a reply is received for a background
- * request, the object references are released, and the request is
- * removed from the list.  If the filesystem is unmounted while there
- * are still background requests, the list is walked and references
- * are released as if a reply was received.
 *
- * There's one more use for a background request.  The RELEASE message is
+ * Called with fc->lock, unlocks it
- * always sent as background, since it doesn't return an error or
- * data.
 */
-static void background_request(struct fuse_conn *fc, struct fuse_req *req)
+static void request_end(struct fuse_conn *fc, struct fuse_req *req)
-{
+{
-        req->background = 1;
+        void (*end) (struct fuse_conn *, struct fuse_req *) = req->end;
-        list_add(&req->bg_entry, &fc->background);
+        req->end = NULL;
-        fc->num_background++;
+        list_del(&req->list);
-        if (fc->num_background == FUSE_MAX_BACKGROUND)
+        list_del(&req->intr_entry);
-                fc->blocked = 1;
+        req->state = FUSE_REQ_FINISHED;
-        if (req->inode)
+        if (req->background) {
-                req->inode = igrab(req->inode);
+                if (fc->num_background == FUSE_MAX_BACKGROUND) {
-        if (req->inode2)
+                        fc->blocked = 0;
-                req->inode2 = igrab(req->inode2);
+                        wake_up_all(&fc->blocked_waitq);
+                }
+                fc->num_background--;
+        }
+        spin_unlock(&fc->lock);
+        dput(req->dentry);
+        mntput(req->vfsmount);
        if (req->file)
-                get_file(req->file);
+                fput(req->file);
+        wake_up(&req->waitq);
+        if (end)
+                end(fc, req);
+        else
+                fuse_put_request(fc, req);
 }
-/* Called with fc->lock held.  Releases, and then reacquires it. */
+static void wait_answer_interruptible(struct fuse_conn *fc,
-static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req)
+                                      struct fuse_req *req)
 {
-        sigset_t oldset;
+        if (signal_pending(current))
+                return;
        spin_unlock(&fc->lock);
-        block_sigs(&oldset);
        wait_event_interruptible(req->waitq, req->state == FUSE_REQ_FINISHED);
-        restore_sigs(&oldset);
        spin_lock(&fc->lock);
-        if (req->state == FUSE_REQ_FINISHED && !req->interrupted)
+}
-                return;
+static void queue_interrupt(struct fuse_conn *fc, struct fuse_req *req)
+{
+        list_add_tail(&req->intr_entry, &fc->interrupts);
+        wake_up(&fc->waitq);
+        kill_fasync(&fc->fasync, SIGIO, POLL_IN);
+}
+/* Called with fc->lock held.  Releases, and then reacquires it. */
+static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req)
+{
+        if (!fc->no_interrupt) {
+                /* Any signal may interrupt this */
+                wait_answer_interruptible(fc, req);
+                if (req->aborted)
+                        goto aborted;
+                if (req->state == FUSE_REQ_FINISHED)
+                        return;
-        if (!req->interrupted) {
-                req->out.h.error = -EINTR;
                req->interrupted = 1;
+                if (req->state == FUSE_REQ_SENT)
+                        queue_interrupt(fc, req);
+        }
+        if (req->force) {
+                spin_unlock(&fc->lock);
+                wait_event(req->waitq, req->state == FUSE_REQ_FINISHED);
+                spin_lock(&fc->lock);
+        } else {
+                sigset_t oldset;
+                /* Only fatal signals may interrupt this */
+                block_sigs(&oldset);
+                wait_answer_interruptible(fc, req);
+                restore_sigs(&oldset);
        }
+        if (req->aborted)
+                goto aborted;
+        if (req->state == FUSE_REQ_FINISHED)
+                return;
+        req->out.h.error = -EINTR;
+        req->aborted = 1;
+ aborted:
        if (req->locked) {
                /* This is uninterruptible sleep, because data is
                   being copied to/from the buffers of req.  During
@@ -268,8 +307,11 @@ static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req)
        if (req->state == FUSE_REQ_PENDING) {
                list_del(&req->list);
                __fuse_put_request(req);
-        } else if (req->state == FUSE_REQ_SENT)
+        } else if (req->state == FUSE_REQ_SENT) {
-                background_request(fc, req);
+                spin_unlock(&fc->lock);
+                wait_event(req->waitq, req->state == FUSE_REQ_FINISHED);
+                spin_lock(&fc->lock);
+        }
 }
 static unsigned len_args(unsigned numargs, struct fuse_arg *args)
@@ -283,13 +325,19 @@ static unsigned len_args(unsigned numargs, struct fuse_arg *args)
        return nbytes;
 }
+/*
+ * Advance the connection's request counter (fc->reqctr) and return the
+ * new value for use as a unique ID.  The result is never 0: when the
+ * increment wraps around to 0 the counter is bumped to 1.
+ */
+static u64 fuse_get_unique(struct fuse_conn *fc)
+ {
+        fc->reqctr++;
+        /* zero is special */
+        if (fc->reqctr == 0)
+                fc->reqctr = 1;
+        return fc->reqctr;
+}
 static void queue_request(struct fuse_conn *fc, struct fuse_req *req)
 {
-        fc->reqctr++;
+        req->in.h.unique = fuse_get_unique(fc);
-        /* zero is special */
-        if (fc->reqctr == 0)
-                fc->reqctr = 1;
-        req->in.h.unique = fc->reqctr;
        req->in.h.len = sizeof(struct fuse_in_header) +
                len_args(req->in.numargs, (struct fuse_arg *) req->in.args);
        list_add_tail(&req->list, &fc->pending);
@@ -302,9 +350,6 @@ static void queue_request(struct fuse_conn *fc, struct fuse_req *req)
        kill_fasync(&fc->fasync, SIGIO, POLL_IN);
 }
-/*
- * This can only be interrupted by a SIGKILL
- */
 void request_send(struct fuse_conn *fc, struct fuse_req *req)
 {
        req->isreply = 1;
@@ -327,8 +372,12 @@ void request_send(struct fuse_conn *fc, struct fuse_req *req)
 static void request_send_nowait(struct fuse_conn *fc, struct fuse_req *req)
 {
        spin_lock(&fc->lock);
-        background_request(fc, req);
        if (fc->connected) {
+                req->background = 1;
+                fc->num_background++;
+                if (fc->num_background == FUSE_MAX_BACKGROUND)
+                        fc->blocked = 1;
                queue_request(fc, req);
                spin_unlock(&fc->lock);
        } else {
@@ -352,14 +401,14 @@ void request_send_background(struct fuse_conn *fc, struct fuse_req *req)
 /*
 * Lock the request.  Up to the next unlock_request() there mustn't be
 * anything that could cause a page-fault.  If the request was already
- * interrupted bail out.
+ * aborted bail out.
 */
 static int lock_request(struct fuse_conn *fc, struct fuse_req *req)
 {
        int err = 0;
        if (req) {
                spin_lock(&fc->lock);
-                if (req->interrupted)
+                if (req->aborted)
                        err = -ENOENT;
                else
                        req->locked = 1;
@@ -369,7 +418,7 @@ static int lock_request(struct fuse_conn *fc, struct fuse_req *req)
 }
 /*
- * Unlock request.  If it was interrupted during being locked, the
+ * Unlock request.  If it was aborted while being locked, the
 * requester thread is currently waiting for it to be unlocked, so
 * wake it up.
 */
@@ -378,7 +427,7 @@ static void unlock_request(struct fuse_conn *fc, struct fuse_req *req)
        if (req) {
                spin_lock(&fc->lock);
                req->locked = 0;
-                if (req->interrupted)
+                if (req->aborted)
                        wake_up(&req->waitq);
                spin_unlock(&fc->lock);
        }
@@ -557,13 +606,18 @@ static int fuse_copy_args(struct fuse_copy_state *cs, unsigned numargs,
        return err;
 }
+/*
+ * Return non-zero if there is anything for the device reader to pick
+ * up: either a request on fc->pending or an entry on fc->interrupts.
+ */
+static int request_pending(struct fuse_conn *fc)
+{
+        return !list_empty(&fc->pending) || !list_empty(&fc->interrupts);
+}
 /* Wait until a request is available on the pending list */
 static void request_wait(struct fuse_conn *fc)
 {
        DECLARE_WAITQUEUE(wait, current);
        add_wait_queue_exclusive(&fc->waitq, &wait);
-        while (fc->connected && list_empty(&fc->pending)) {
+        while (fc->connected && !request_pending(fc)) {
                set_current_state(TASK_INTERRUPTIBLE);
                if (signal_pending(current))
                        break;
@@ -577,11 +631,50 @@ static void request_wait(struct fuse_conn *fc)
 }
 /*
+ * Transfer an interrupt request to userspace
+ *
+ * Unlike other requests this is assembled on demand, without a need
+ * to allocate a separate fuse_req structure.
+ *
+ * Called with fc->lock held, releases it
+ *
+ * The FUSE_INTERRUPT message gets its own unique ID, which is also
+ * remembered in req->intr_unique; the message argument carries the
+ * unique ID of the request being interrupted (req->in.h.unique).
+ *
+ * Returns the number of bytes produced (header plus argument) on
+ * success, -EINVAL if the supplied iovec is shorter than that, or the
+ * error returned by fuse_copy_one().
+ */
+static int fuse_read_interrupt(struct fuse_conn *fc, struct fuse_req *req,
+                               const struct iovec *iov, unsigned long nr_segs)
+{
+        struct fuse_copy_state cs;
+        struct fuse_in_header ih;
+        struct fuse_interrupt_in arg;
+        unsigned reqsize = sizeof(ih) + sizeof(arg);
+        int err;
+        /* Everything that touches fc or req is done before the unlock */
+        list_del_init(&req->intr_entry);
+        req->intr_unique = fuse_get_unique(fc);
+        memset(&ih, 0, sizeof(ih));
+        memset(&arg, 0, sizeof(arg));
+        ih.len = reqsize;
+        ih.opcode = FUSE_INTERRUPT;
+        ih.unique = req->intr_unique;
+        arg.unique = req->in.h.unique;
+        spin_unlock(&fc->lock);
+        /*
+         * NOTE(review): req has already been unlinked from
+         * fc->interrupts here, so a too-small buffer appears to drop
+         * this interrupt rather than leave it queued -- confirm that
+         * is intended.
+         */
+        if (iov_length(iov, nr_segs) < reqsize)
+                return -EINVAL;
+        /* Only the on-stack copies ih and arg are used from here on */
+        fuse_copy_init(&cs, fc, 1, NULL, iov, nr_segs);
+        err = fuse_copy_one(&cs, &ih, sizeof(ih));
+        if (!err)
+                err = fuse_copy_one(&cs, &arg, sizeof(arg));
+        fuse_copy_finish(&cs);
+        return err ? err : reqsize;
+}
+/*
 * Read a single request into the userspace filesystem's buffer.  This
 * function waits until a request is available, then removes it from
 * the pending list and copies request data to userspace buffer.  If
- * no reply is needed (FORGET) or request has been interrupted or
+ * no reply is needed (FORGET) or request has been aborted or there
- * there was an error during the copying then it's finished by calling
+ * was an error during the copying then it's finished by calling
 * request_end().  Otherwise add it to the processing list, and set
 * the 'sent' flag.
 */
@@ -601,7 +694,7 @@ static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov,
        spin_lock(&fc->lock);
        err = -EAGAIN;
        if ((file->f_flags & O_NONBLOCK) && fc->connected &&
-            list_empty(&fc->pending))
+            !request_pending(fc))
                goto err_unlock;
        request_wait(fc);
@@ -609,9 +702,15 @@ static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov,
        if (!fc->connected)
                goto err_unlock;
        err = -ERESTARTSYS;
-        if (list_empty(&fc->pending))
+        if (!request_pending(fc))
                goto err_unlock;
+        if (!list_empty(&fc->interrupts)) {
+                req = list_entry(fc->interrupts.next, struct fuse_req,
+                                 intr_entry);
+                return fuse_read_interrupt(fc, req, iov, nr_segs);
+        }
        req = list_entry(fc->pending.next, struct fuse_req, list);
        req->state = FUSE_REQ_READING;
        list_move(&req->list, &fc->io);
@@ -636,10 +735,10 @@ static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov,
        fuse_copy_finish(&cs);
        spin_lock(&fc->lock);
        req->locked = 0;
-        if (!err && req->interrupted)
+        if (!err && req->aborted)
                err = -ENOENT;
        if (err) {
-                if (!req->interrupted)
+                if (!req->aborted)
                        req->out.h.error = -EIO;
                request_end(fc, req);
                return err;
@@ -649,6 +748,8 @@ static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov,
        else {
                req->state = FUSE_REQ_SENT;
                list_move_tail(&req->list, &fc->processing);
+                if (req->interrupted)
+                        queue_interrupt(fc, req);
                spin_unlock(&fc->lock);
        }
        return reqsize;
@@ -675,7 +776,7 @@ static struct fuse_req *request_find(struct fuse_conn *fc, u64 unique)
        list_for_each(entry, &fc->processing) {
                struct fuse_req *req;
                req = list_entry(entry, struct fuse_req, list);
-                if (req->in.h.unique == unique)
+                if (req->in.h.unique == unique || req->intr_unique == unique)
                        return req;
        }
        return NULL;
@@ -741,17 +842,33 @@ static ssize_t fuse_dev_writev(struct file *file, const struct iovec *iov,
                goto err_unlock;
        req = request_find(fc, oh.unique);
-        err = -EINVAL;
        if (!req)
                goto err_unlock;
-        if (req->interrupted) {
+        if (req->aborted) {
                spin_unlock(&fc->lock);
                fuse_copy_finish(&cs);
                spin_lock(&fc->lock);
                request_end(fc, req);
                return -ENOENT;
        }
+        /* Is it an interrupt reply? */
+        if (req->intr_unique == oh.unique) {
+                err = -EINVAL;
+                if (nbytes != sizeof(struct fuse_out_header))
+                        goto err_unlock;
+                if (oh.error == -ENOSYS)
+                        fc->no_interrupt = 1;
+                else if (oh.error == -EAGAIN)
+                        queue_interrupt(fc, req);
+                spin_unlock(&fc->lock);
+                fuse_copy_finish(&cs);
+                return nbytes;
+        }
+        req->state = FUSE_REQ_WRITING;
        list_move(&req->list, &fc->io);
        req->out.h = oh;
        req->locked = 1;
@@ -764,9 +881,9 @@ static ssize_t fuse_dev_writev(struct file *file, const struct iovec *iov,
        spin_lock(&fc->lock);
        req->locked = 0;
        if (!err) {
-                if (req->interrupted)
+                if (req->aborted)
                        err = -ENOENT;
-        } else if (!req->interrupted)
+        } else if (!req->aborted)
                req->out.h.error = -EIO;
        request_end(fc, req);
@@ -800,7 +917,7 @@ static unsigned fuse_dev_poll(struct file *file, poll_table *wait)
        spin_lock(&fc->lock);
        if (!fc->connected)
                mask = POLLERR;
-        else if (!list_empty(&fc->pending))
+        else if (request_pending(fc))
                mask |= POLLIN | POLLRDNORM;
        spin_unlock(&fc->lock);
@@ -826,7 +943,7 @@ static void end_requests(struct fuse_conn *fc, struct list_head *head)
 /*
 * Abort requests under I/O
 *
- * The requests are set to interrupted and finished, and the request
+ * The requests are set to aborted and finished, and the request
 * waiter is woken up.  This will make request_wait_answer() wait
 * until the request is unlocked and then return.
 *
@@ -841,7 +958,7 @@ static void end_io_requests(struct fuse_conn *fc)
                        list_entry(fc->io.next, struct fuse_req, list);
                void (*end) (struct fuse_conn *, struct fuse_req *) = req->end;
-                req->interrupted = 1;
+                req->aborted = 1;
                req->out.h.error = -ECONNABORTED;
                req->state = FUSE_REQ_FINISHED;
                list_del_init(&req->list);
@@ -874,19 +991,20 @@ static void end_io_requests(struct fuse_conn *fc)
  * onto the pending list is prevented by fc->connected being false.
 *
 * Progression of requests under I/O to the processing list is
- * prevented by the req->interrupted flag being true for these
+ * prevented by the req->aborted flag being true for these requests.
- * requests.  For this reason requests on the io list must be aborted
+ * For this reason requests on the io list must be aborted first.
- * first.
 */
 void fuse_abort_conn(struct fuse_conn *fc)
 {
        spin_lock(&fc->lock);
        if (fc->connected) {
                fc->connected = 0;
+                fc->blocked = 0;
                end_io_requests(fc);
                end_requests(fc, &fc->pending);
                end_requests(fc, &fc->processing);
                wake_up_all(&fc->waitq);
+                wake_up_all(&fc->blocked_waitq);
                kill_fasync(&fc->fasync, SIGIO, POLL_IN);
        }
        spin_unlock(&fc->lock);
@@ -902,7 +1020,7 @@ static int fuse_dev_release(struct inode *inode, struct file *file)
                end_requests(fc, &fc->processing);
                spin_unlock(&fc->lock);
                fasync_helper(-1, file, 0, &fc->fasync);
-                kobject_put(&fc->kobj);
+                fuse_conn_put(fc);
        }
        return 0;
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 8d7546e832e8..72a74cde6de8 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1,6 +1,6 @@
 /*
  FUSE: Filesystem in Userspace
-  Copyright (C) 2001-2005  Miklos Szeredi <miklos@szeredi.hu>
+  Copyright (C) 2001-2006  Miklos Szeredi <miklos@szeredi.hu>
  This program can be distributed under the terms of the GNU GPL.
  See the file COPYING.
@@ -79,7 +79,6 @@ static void fuse_lookup_init(struct fuse_req *req, struct inode *dir,
 {
        req->in.h.opcode = FUSE_LOOKUP;
        req->in.h.nodeid = get_node_id(dir);
-        req->inode = dir;
        req->in.numargs = 1;
        req->in.args[0].size = entry->d_name.len + 1;
        req->in.args[0].value = entry->d_name.name;
@@ -225,6 +224,20 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
 }
 /*
+ * Synchronous release for the case when something goes wrong in CREATE_OPEN
+ *
+ * Builds a FUSE_RELEASE request for ff via fuse_release_fill(), sends
+ * it with request_send() and drops the request afterwards.
+ *
+ * fuse_release_fill() frees ff, so the caller must not touch ff after
+ * this returns.  req->force is set so that request_wait_answer() ends
+ * up in its plain wait_event() branch instead of the one that fatal
+ * signals can interrupt.
+ */
+static void fuse_sync_release(struct fuse_conn *fc, struct fuse_file *ff,
+                              u64 nodeid, int flags)
+{
+        struct fuse_req *req;
+        req = fuse_release_fill(ff, nodeid, flags, FUSE_RELEASE);
+        req->force = 1;
+        request_send(fc, req);
+        fuse_put_request(fc, req);
+}
+/*
 * Atomic create+open operation
 *
 * If the filesystem doesn't support this, then fall back to separate
@@ -237,6 +250,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
        struct inode *inode;
        struct fuse_conn *fc = get_fuse_conn(dir);
        struct fuse_req *req;
+        struct fuse_req *forget_req;
        struct fuse_open_in inarg;
        struct fuse_open_out outopen;
        struct fuse_entry_out outentry;
@@ -247,9 +261,14 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
        if (fc->no_create)
                return -ENOSYS;
+        forget_req = fuse_get_req(fc);
+        if (IS_ERR(forget_req))
+                return PTR_ERR(forget_req);
        req = fuse_get_req(fc);
+        err = PTR_ERR(req);
        if (IS_ERR(req))
-                return PTR_ERR(req);
+                goto out_put_forget_req;
        err = -ENOMEM;
        ff = fuse_file_alloc();
@@ -262,7 +281,6 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
        inarg.mode = mode;
        req->in.h.opcode = FUSE_CREATE;
        req->in.h.nodeid = get_node_id(dir);
-        req->inode = dir;
        req->in.numargs = 2;
        req->in.args[0].size = sizeof(inarg);
        req->in.args[0].value = &inarg;
@@ -285,25 +303,23 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
        if (!S_ISREG(outentry.attr.mode) || invalid_nodeid(outentry.nodeid))
                goto out_free_ff;
+        fuse_put_request(fc, req);
        inode = fuse_iget(dir->i_sb, outentry.nodeid, outentry.generation,
                          &outentry.attr);
-        err = -ENOMEM;
        if (!inode) {
                flags &= ~(O_CREAT | O_EXCL | O_TRUNC);
                ff->fh = outopen.fh;
-                /* Special release, with inode = NULL, this will
+                fuse_sync_release(fc, ff, outentry.nodeid, flags);
-                   trigger a 'forget' request when the release is
+                fuse_send_forget(fc, forget_req, outentry.nodeid, 1);
-                   complete */
+                return -ENOMEM;
-                fuse_send_release(fc, ff, outentry.nodeid, NULL, flags, 0);
-                goto out_put_request;
        }
-        fuse_put_request(fc, req);
+        fuse_put_request(fc, forget_req);
        d_instantiate(entry, inode);
        fuse_change_timeout(entry, &outentry);
        file = lookup_instantiate_filp(nd, entry, generic_file_open);
        if (IS_ERR(file)) {
                ff->fh = outopen.fh;
-                fuse_send_release(fc, ff, outentry.nodeid, inode, flags, 0);
+                fuse_sync_release(fc, ff, outentry.nodeid, flags);
                return PTR_ERR(file);
        }
        fuse_finish_open(inode, file, ff, &outopen);
@@ -313,6 +329,8 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
        fuse_file_free(ff);
 out_put_request:
        fuse_put_request(fc, req);
+ out_put_forget_req:
+        fuse_put_request(fc, forget_req);
        return err;
 }
@@ -328,7 +346,6 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req,
        int err;
        req->in.h.nodeid = get_node_id(dir);
-        req->inode = dir;
        req->out.numargs = 1;
        req->out.args[0].size = sizeof(outarg);
        req->out.args[0].value = &outarg;
@@ -448,7 +465,6 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry)
        req->in.h.opcode = FUSE_UNLINK;
        req->in.h.nodeid = get_node_id(dir);
-        req->inode = dir;
        req->in.numargs = 1;
        req->in.args[0].size = entry->d_name.len + 1;
        req->in.args[0].value = entry->d_name.name;
@@ -480,7 +496,6 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry)
        req->in.h.opcode = FUSE_RMDIR;
        req->in.h.nodeid = get_node_id(dir);
-        req->inode = dir;
        req->in.numargs = 1;
        req->in.args[0].size = entry->d_name.len + 1;
        req->in.args[0].value = entry->d_name.name;
@@ -510,8 +525,6 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent,
        inarg.newdir = get_node_id(newdir);
        req->in.h.opcode = FUSE_RENAME;
        req->in.h.nodeid = get_node_id(olddir);
-        req->inode = olddir;
-        req->inode2 = newdir;
        req->in.numargs = 3;
        req->in.args[0].size = sizeof(inarg);
        req->in.args[0].value = &inarg;
@@ -558,7 +571,6 @@ static int fuse_link(struct dentry *entry, struct inode *newdir,
        memset(&inarg, 0, sizeof(inarg));
        inarg.oldnodeid = get_node_id(inode);
        req->in.h.opcode = FUSE_LINK;
-        req->inode2 = inode;
        req->in.numargs = 2;
        req->in.args[0].size = sizeof(inarg);
        req->in.args[0].value = &inarg;
@@ -587,7 +599,6 @@ int fuse_do_getattr(struct inode *inode)
        req->in.h.opcode = FUSE_GETATTR;
        req->in.h.nodeid = get_node_id(inode);
-        req->inode = inode;
        req->out.numargs = 1;
        req->out.args[0].size = sizeof(arg);
        req->out.args[0].value = &arg;
@@ -679,7 +690,6 @@ static int fuse_access(struct inode *inode, int mask)
        inarg.mask = mask;
        req->in.h.opcode = FUSE_ACCESS;
        req->in.h.nodeid = get_node_id(inode);
-        req->inode = inode;
        req->in.numargs = 1;
        req->in.args[0].size = sizeof(inarg);
        req->in.args[0].value = &inarg;
@@ -820,7 +830,6 @@ static char *read_link(struct dentry *dentry)
        }
        req->in.h.opcode = FUSE_READLINK;
        req->in.h.nodeid = get_node_id(inode);
-        req->inode = inode;
        req->out.argvar = 1;
        req->out.numargs = 1;
        req->out.args[0].size = PAGE_SIZE - 1;
@@ -939,7 +948,6 @@ static int fuse_setattr(struct dentry *entry, struct iattr *attr)
        iattr_to_fattr(attr, &inarg);
        req->in.h.opcode = FUSE_SETATTR;
        req->in.h.nodeid = get_node_id(inode);
-        req->inode = inode;
        req->in.numargs = 1;
        req->in.args[0].size = sizeof(inarg);
        req->in.args[0].value = &inarg;
@@ -1002,7 +1010,6 @@ static int fuse_setxattr(struct dentry *entry, const char *name,
        inarg.flags = flags;
        req->in.h.opcode = FUSE_SETXATTR;
        req->in.h.nodeid = get_node_id(inode);
-        req->inode = inode;
        req->in.numargs = 3;
        req->in.args[0].size = sizeof(inarg);
        req->in.args[0].value = &inarg;
@@ -1041,7 +1048,6 @@ static ssize_t fuse_getxattr(struct dentry *entry, const char *name,
        inarg.size = size;
        req->in.h.opcode = FUSE_GETXATTR;
        req->in.h.nodeid = get_node_id(inode);
-        req->inode = inode;
        req->in.numargs = 2;
        req->in.args[0].size = sizeof(inarg);
        req->in.args[0].value = &inarg;
@@ -1091,7 +1097,6 @@ static ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size)
        inarg.size = size;
        req->in.h.opcode = FUSE_LISTXATTR;
        req->in.h.nodeid = get_node_id(inode);
-        req->inode = inode;
        req->in.numargs = 1;
        req->in.args[0].size = sizeof(inarg);
        req->in.args[0].value = &inarg;
@@ -1135,7 +1140,6 @@ static int fuse_removexattr(struct dentry *entry, const char *name)
        req->in.h.opcode = FUSE_REMOVEXATTR;
        req->in.h.nodeid = get_node_id(inode);
-        req->inode = inode;
        req->in.numargs = 1;
        req->in.args[0].size = strlen(name) + 1;
        req->in.args[0].value = name;
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index fc342cf7c2cc..63614ed16336 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -30,7 +30,6 @@ static int fuse_send_open(struct inode *inode, struct file *file, int isdir,
        inarg.flags = file->f_flags & ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
        req->in.h.opcode = isdir ? FUSE_OPENDIR : FUSE_OPEN;
        req->in.h.nodeid = get_node_id(inode);
-        req->inode = inode;
        req->in.numargs = 1;
        req->in.args[0].size = sizeof(inarg);
        req->in.args[0].value = &inarg;
@@ -49,8 +48,8 @@ struct fuse_file *fuse_file_alloc(void)
        struct fuse_file *ff;
        ff = kmalloc(sizeof(struct fuse_file), GFP_KERNEL);
        if (ff) {
-                ff->release_req = fuse_request_alloc();
+                ff->reserved_req = fuse_request_alloc();
-                if (!ff->release_req) {
+                if (!ff->reserved_req) {
                        kfree(ff);
                        ff = NULL;
                }
@@ -60,7 +59,7 @@ struct fuse_file *fuse_file_alloc(void)
 void fuse_file_free(struct fuse_file *ff)
 {
-        fuse_request_free(ff->release_req);
+        fuse_request_free(ff->reserved_req);
        kfree(ff);
 }
@@ -113,37 +112,22 @@ int fuse_open_common(struct inode *inode, struct file *file, int isdir)
        return err;
 }
-/* Special case for failed iget in CREATE */
+struct fuse_req *fuse_release_fill(struct fuse_file *ff, u64 nodeid, int flags,
-static void fuse_release_end(struct fuse_conn *fc, struct fuse_req *req)
+                                   int opcode)
 {
-        /* If called from end_io_requests(), req has more than one
+        struct fuse_req *req = ff->reserved_req;
-           reference and fuse_reset_request() cannot work */
-        if (fc->connected) {
-                u64 nodeid = req->in.h.nodeid;
-                fuse_reset_request(req);
-                fuse_send_forget(fc, req, nodeid, 1);
-        } else
-                fuse_put_request(fc, req);
-}
-void fuse_send_release(struct fuse_conn *fc, struct fuse_file *ff,
-                       u64 nodeid, struct inode *inode, int flags, int isdir)
-{
-        struct fuse_req * req = ff->release_req;
        struct fuse_release_in *inarg = &req->misc.release_in;
        inarg->fh = ff->fh;
        inarg->flags = flags;
-        req->in.h.opcode = isdir ? FUSE_RELEASEDIR : FUSE_RELEASE;
+        req->in.h.opcode = opcode;
        req->in.h.nodeid = nodeid;
-        req->inode = inode;
        req->in.numargs = 1;
        req->in.args[0].size = sizeof(struct fuse_release_in);
        req->in.args[0].value = inarg;
-        request_send_background(fc, req);
-        if (!inode)
-                req->end = fuse_release_end;
        kfree(ff);
+        return req;
 }
 int fuse_release_common(struct inode *inode, struct file *file, int isdir)
@@ -151,8 +135,15 @@ int fuse_release_common(struct inode *inode, struct file *file, int isdir)
        struct fuse_file *ff = file->private_data;
        if (ff) {
                struct fuse_conn *fc = get_fuse_conn(inode);
-                u64 nodeid = get_node_id(inode);
+                struct fuse_req *req;
-                fuse_send_release(fc, ff, nodeid, inode, file->f_flags, isdir);
+                req = fuse_release_fill(ff, get_node_id(inode), file->f_flags,
+                                        isdir ? FUSE_RELEASEDIR : FUSE_RELEASE);
+                /* Hold vfsmount and dentry until release is finished */
+                req->vfsmount = mntget(file->f_vfsmnt);
+                req->dentry = dget(file->f_dentry);
+                request_send_background(fc, req);
        }
        /* Return value is ignored by VFS */
@@ -169,7 +160,29 @@ static int fuse_release(struct inode *inode, struct file *file)
        return fuse_release_common(inode, file, 0);
 }
-static int fuse_flush(struct file *file)
+/*
+ * Scramble the ID space with XTEA, so that the value of the files_struct
+ * pointer is not exposed to userspace.
+ *
+ * The owner pointer is widened to 64 bits, split into two 32-bit
+ * halves and run through 32 rounds keyed by fc->scramble_key (words
+ * 0..3 are indexed); the scrambled halves are recombined into the
+ * returned u64.  The same id and key always give the same result.
+ */
+static u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id)
+{
+        u32 *k = fc->scramble_key;
+        u64 v = (unsigned long) id;
+        u32 v0 = v;
+        /*
+         * v is a u64, so this shift is well defined even where
+         * unsigned long is 32 bits wide; v1 simply starts out as 0.
+         */
+        u32 v1 = v >> 32;
+        u32 sum = 0;
+        int i;
+        for (i = 0; i < 32; i++) {
+                v0 += ((v1 << 4 ^ v1 >> 5) + v1) ^ (sum + k[sum & 3]);
+                sum += 0x9E3779B9;
+                v1 += ((v0 << 4 ^ v0 >> 5) + v0) ^ (sum + k[sum>>11 & 3]);
+        }
+        return (u64) v0 + ((u64) v1 << 32);
+}
+static int fuse_flush(struct file *file, fl_owner_t id)
 {
        struct inode *inode = file->f_dentry->d_inode;
        struct fuse_conn *fc = get_fuse_conn(inode);
@@ -184,19 +197,16 @@ static int fuse_flush(struct file *file)
        if (fc->no_flush)
                return 0;
-        req = fuse_get_req(fc);
+        req = fuse_get_req_nofail(fc, file);
-        if (IS_ERR(req))
-                return PTR_ERR(req);
        memset(&inarg, 0, sizeof(inarg));
        inarg.fh = ff->fh;
+        inarg.lock_owner = fuse_lock_owner_id(fc, id);
        req->in.h.opcode = FUSE_FLUSH;
        req->in.h.nodeid = get_node_id(inode);
-        req->inode = inode;
-        req->file = file;
        req->in.numargs = 1;
        req->in.args[0].size = sizeof(inarg);
        req->in.args[0].value = &inarg;
+        req->force = 1;
        request_send(fc, req);
        err = req->out.h.error;
        fuse_put_request(fc, req);
@@ -232,8 +242,6 @@ int fuse_fsync_common(struct file *file, struct dentry *de, int datasync,
        inarg.fsync_flags = datasync ? 1 : 0;
        req->in.h.opcode = isdir ? FUSE_FSYNCDIR : FUSE_FSYNC;
        req->in.h.nodeid = get_node_id(inode);
-        req->inode = inode;
-        req->file = file;
        req->in.numargs = 1;
        req->in.args[0].size = sizeof(inarg);
        req->in.args[0].value = &inarg;
@@ -266,8 +274,6 @@ void fuse_read_fill(struct fuse_req *req, struct file *file,
        inarg->size = count;
        req->in.h.opcode = opcode;
        req->in.h.nodeid = get_node_id(inode);
-        req->inode = inode;
-        req->file = file;
        req->in.numargs = 1;
        req->in.args[0].size = sizeof(struct fuse_read_in);
        req->in.args[0].value = inarg;
@@ -342,6 +348,8 @@ static void fuse_send_readpages(struct fuse_req *req, struct file *file,
        req->out.page_zeroing = 1;
        fuse_read_fill(req, file, inode, pos, count, FUSE_READ);
        if (fc->async_read) {
+                get_file(file);
+                req->file = file;
                req->end = fuse_readpages_end;
                request_send_background(fc, req);
        } else {
@@ -420,8 +428,6 @@ static size_t fuse_send_write(struct fuse_req *req, struct file *file,
        inarg.size = count;
        req->in.h.opcode = FUSE_WRITE;
        req->in.h.nodeid = get_node_id(inode);
-        req->inode = inode;
-        req->file = file;
        req->in.argpages = 1;
        req->in.numargs = 2;
        req->in.args[0].size = sizeof(struct fuse_write_in);
@@ -619,6 +625,126 @@ static int fuse_set_page_dirty(struct page *page)
        return 0;
 }
+/*
+ * Copy a lock description received in a fuse_file_lock into the
+ * kernel's struct file_lock.
+ *
+ * F_UNLCK: only fl->fl_type is updated.
+ * F_RDLCK/F_WRLCK: the range is validated (both ends within
+ * OFFSET_MAX, end not before start) and start, end and pid are copied
+ * before the type.
+ *
+ * Returns 0 on success, -EIO for an unknown type or an invalid range;
+ * fl is left untouched in the error case.
+ */
+static int convert_fuse_file_lock(const struct fuse_file_lock *ffl,
+                                  struct file_lock *fl)
+{
+        switch (ffl->type) {
+        case F_UNLCK:
+                break;
+        case F_RDLCK:
+        case F_WRLCK:
+                if (ffl->start > OFFSET_MAX || ffl->end > OFFSET_MAX ||
+                    ffl->end < ffl->start)
+                        return -EIO;
+                fl->fl_start = ffl->start;
+                fl->fl_end = ffl->end;
+                fl->fl_pid = ffl->pid;
+                break;
+        default:
+                return -EIO;
+        }
+        fl->fl_type = ffl->type;
+        return 0;
+}
+/*
+ * Prepare req as a lock request with the given opcode.
+ *
+ * The argument lives in req->misc.lk_in and is filled from the open
+ * file's handle (ff->fh), the scrambled lock owner, the range and type
+ * of fl, and the pid supplied by the caller.  The request header gets
+ * the opcode and the node ID of the file's inode.
+ */
+static void fuse_lk_fill(struct fuse_req *req, struct file *file,
+                         const struct file_lock *fl, int opcode, pid_t pid)
+{
+        struct inode *inode = file->f_dentry->d_inode;
+        struct fuse_conn *fc = get_fuse_conn(inode);
+        struct fuse_file *ff = file->private_data;
+        struct fuse_lk_in *arg = &req->misc.lk_in;
+        arg->fh = ff->fh;
+        /* The owner is passed through fuse_lock_owner_id(), not sent raw */
+        arg->owner = fuse_lock_owner_id(fc, fl->fl_owner);
+        arg->lk.start = fl->fl_start;
+        arg->lk.end = fl->fl_end;
+        arg->lk.type = fl->fl_type;
+        arg->lk.pid = pid;
+        req->in.h.opcode = opcode;
+        req->in.h.nodeid = get_node_id(inode);
+        req->in.numargs = 1;
+        req->in.args[0].size = sizeof(*arg);
+        req->in.args[0].value = arg;
+}
+/*
+ * Send a FUSE_GETLK request describing fl (with pid 0) and, if the
+ * request succeeds, store the lock returned in the reply back into fl
+ * via convert_fuse_file_lock().
+ *
+ * Returns 0 on success, the error from fuse_get_req(), the error
+ * reported in the reply header, or -EIO if the returned lock is
+ * malformed.
+ */
+static int fuse_getlk(struct file *file, struct file_lock *fl)
+{
+        struct inode *inode = file->f_dentry->d_inode;
+        struct fuse_conn *fc = get_fuse_conn(inode);
+        struct fuse_req *req;
+        struct fuse_lk_out outarg;
+        int err;
+        req = fuse_get_req(fc);
+        if (IS_ERR(req))
+                return PTR_ERR(req);
+        fuse_lk_fill(req, file, fl, FUSE_GETLK, 0);
+        req->out.numargs = 1;
+        req->out.args[0].size = sizeof(outarg);
+        req->out.args[0].value = &outarg;
+        request_send(fc, req);
+        err = req->out.h.error;
+        fuse_put_request(fc, req);
+        /* outarg is on our stack, so it is still valid after the put */
+        if (!err)
+                err = convert_fuse_file_lock(&outarg.lk, fl);
+        return err;
+}
+/*
+ * Send a lock or unlock request for fl.
+ *
+ * FUSE_SETLKW is used when fl has FL_SLEEP set, FUSE_SETLK otherwise.
+ * The pid sent is current->tgid, or 0 when fl is an unlock (F_UNLCK).
+ *
+ * Returns 0 without sending anything for FL_CLOSE locks; otherwise the
+ * error from fuse_get_req() or the error from the reply header, with
+ * -EINTR translated to -ERESTARTSYS.
+ */
+static int fuse_setlk(struct file *file, struct file_lock *fl)
+{
+        struct inode *inode = file->f_dentry->d_inode;
+        struct fuse_conn *fc = get_fuse_conn(inode);
+        struct fuse_req *req;
+        int opcode = (fl->fl_flags & FL_SLEEP) ? FUSE_SETLKW : FUSE_SETLK;
+        pid_t pid = fl->fl_type != F_UNLCK ? current->tgid : 0;
+        int err;
+        /* Unlock on close is handled by the flush method */
+        if (fl->fl_flags & FL_CLOSE)
+                return 0;
+        req = fuse_get_req(fc);
+        if (IS_ERR(req))
+                return PTR_ERR(req);
+        fuse_lk_fill(req, file, fl, opcode, pid);
+        request_send(fc, req);
+        err = req->out.h.error;
+        /* locking is restartable */
+        if (err == -EINTR)
+                err = -ERESTARTSYS;
+        fuse_put_request(fc, req);
+        return err;
+}
+/*
+ * The ->lock file operation for FUSE files.
+ *
+ * When fc->no_lock is set the request is served by the kernel's own
+ * POSIX lock helpers; otherwise it is forwarded to the filesystem via
+ * fuse_getlk() (for F_GETLK) or fuse_setlk() (for every other cmd).
+ *
+ * Returns 0 or a negative error code.
+ */
+static int fuse_file_lock(struct file *file, int cmd, struct file_lock *fl)
+{
+        struct inode *inode = file->f_dentry->d_inode;
+        struct fuse_conn *fc = get_fuse_conn(inode);
+        int err;
+        if (cmd == F_GETLK) {
+                if (fc->no_lock) {
+                        /*
+                         * A zero return presumably means no conflicting
+                         * lock was found; report the range as unlocked.
+                         */
+                        if (!posix_test_lock(file, fl, fl))
+                                fl->fl_type = F_UNLCK;
+                        err = 0;
+                } else
+                        err = fuse_getlk(file, fl);
+        } else {
+                if (fc->no_lock)
+                        err = posix_lock_file_wait(file, fl);
+                else
+                        err = fuse_setlk(file, fl);
+        }
+        return err;
+}
 static const struct file_operations fuse_file_operations = {
        .llseek         = generic_file_llseek,
        .read           = generic_file_read,
@@ -628,6 +754,7 @@ static const struct file_operations fuse_file_operations = {
        .flush          = fuse_flush,
        .release        = fuse_release,
        .fsync          = fuse_fsync,
+        .lock           = fuse_file_lock,
        .sendfile       = generic_file_sendfile,
 };
@@ -639,10 +766,11 @@ static const struct file_operations fuse_direct_io_file_operations = {
        .flush          = fuse_flush,
        .release        = fuse_release,
        .fsync          = fuse_fsync,
+        .lock           = fuse_file_lock,
        /* no mmap and sendfile */
 };
-static struct address_space_operations fuse_file_aops  = {
+static const struct address_space_operations fuse_file_aops  = {
        .readpage       = fuse_readpage,
        .prepare_write  = fuse_prepare_write,
        .commit_write   = fuse_commit_write,
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 0474202cb5dc..0dbf96621841 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -8,12 +8,13 @@
 #include <linux/fuse.h>
 #include <linux/fs.h>
+#include <linux/mount.h>
 #include <linux/wait.h>
 #include <linux/list.h>
 #include <linux/spinlock.h>
 #include <linux/mm.h>
 #include <linux/backing-dev.h>
-#include <asm/semaphore.h>
+#include <linux/mutex.h>
 /** Max number of pages that can be used in a single read request */
 #define FUSE_MAX_PAGES_PER_REQ 32
@@ -24,6 +25,9 @@
 /** It could be as large as PATH_MAX, but would that have any uses? */
 #define FUSE_NAME_MAX 1024
+/** Number of dentries for each connection in the control filesystem */
+#define FUSE_CTL_NUM_DENTRIES 3
 /** If the FUSE_DEFAULT_PERMISSIONS flag is given, the filesystem
    module will check permissions based on the file mode.  Otherwise no
    permission checking is done in the kernel */
@@ -33,6 +37,11 @@
    doing the mount will be allowed to access the filesystem */
 #define FUSE_ALLOW_OTHER         (1 << 1)
+/** List of active connections */
+extern struct list_head fuse_conn_list;
+/** Global mutex protecting fuse_conn_list and the control filesystem */
+extern struct mutex fuse_mutex;
 /** FUSE inode */
 struct fuse_inode {
@@ -56,7 +65,7 @@ struct fuse_inode {
 /** FUSE specific file data */
 struct fuse_file {
        /** Request reserved for flush and release */
-        struct fuse_req *release_req;
+        struct fuse_req *reserved_req;
        /** File handle used by userspace */
        u64 fh;
@@ -122,6 +131,7 @@ enum fuse_req_state {
        FUSE_REQ_PENDING,
        FUSE_REQ_READING,
        FUSE_REQ_SENT,
+        FUSE_REQ_WRITING,
        FUSE_REQ_FINISHED
 };
@@ -135,12 +145,15 @@ struct fuse_req {
            fuse_conn */
        struct list_head list;
-        /** Entry on the background list */
+        /** Entry on the interrupts list */
-        struct list_head bg_entry;
+        struct list_head intr_entry;
        /** refcount */
        atomic_t count;
+        /** Unique ID for the interrupt request */
+        u64 intr_unique;
        /*
         * The following bitfields are either set once before the
         * request is queued or setting/clearing them is protected by
@@ -150,12 +163,18 @@ struct fuse_req {
        /** True if the request has reply */
        unsigned isreply:1;
-        /** The request was interrupted */
+        /** Force sending of the request even if interrupted */
-        unsigned interrupted:1;
+        unsigned force:1;
+        /** The request was aborted */
+        unsigned aborted:1;
        /** Request is sent in the background */
        unsigned background:1;
+        /** The request has been interrupted */
+        unsigned interrupted:1;
        /** Data is being copied to/from the request */
        unsigned locked:1;
@@ -181,6 +200,7 @@ struct fuse_req {
                struct fuse_init_in init_in;
                struct fuse_init_out init_out;
                struct fuse_read_in read_in;
+                struct fuse_lk_in lk_in;
        } misc;
        /** page vector */
@@ -192,17 +212,20 @@ struct fuse_req {
        /** offset of data on first page */
        unsigned page_offset;
-        /** Inode used in the request */
-        struct inode *inode;
-        /** Second inode used in the request (or NULL) */
-        struct inode *inode2;
        /** File used in the request (or NULL) */
        struct file *file;
+        /** vfsmount used in release */
+        struct vfsmount *vfsmount;
+        /** dentry used in release */
+        struct dentry *dentry;
        /** Request completion callback */
        void (*end)(struct fuse_conn *, struct fuse_req *);
+        /** Request is stolen from fuse_file->reserved_req */
+        struct file *stolen_file;
 };
 /**
@@ -216,6 +239,9 @@ struct fuse_conn {
        /** Lock protecting accessess to  members of this structure */
        spinlock_t lock;
+        /** Refcount */
+        atomic_t count;
        /** The user id for this mount */
        uid_t user_id;
@@ -243,13 +269,12 @@ struct fuse_conn {
        /** The list of requests under I/O */
        struct list_head io;
-        /** Requests put in the background (RELEASE or any other
-            interrupted request) */
-        struct list_head background;
        /** Number of requests currently in the background */
        unsigned num_background;
+        /** Pending interrupts */
+        struct list_head interrupts;
        /** Flag indicating if connection is blocked.  This will be
            the case before the INIT reply is received, and if there
            are too many outstading backgrounds requests */
@@ -258,15 +283,9 @@ struct fuse_conn {
        /** waitq for blocked connection */
        wait_queue_head_t blocked_waitq;
-        /** RW semaphore for exclusion with fuse_put_super() */
-        struct rw_semaphore sbput_sem;
        /** The next unique request id */
        u64 reqctr;
-        /** Mount is active */
-        unsigned mounted;
        /** Connection established, cleared on umount, connection
            abort and device release */
        unsigned connected;
@@ -305,12 +324,18 @@ struct fuse_conn {
        /** Is removexattr not implemented by fs? */
        unsigned no_removexattr : 1;
+        /** Are file locking primitives not implemented by fs? */
+        unsigned no_lock : 1;
        /** Is access not implemented by fs? */
        unsigned no_access : 1;
        /** Is create not implemented by fs? */
        unsigned no_create : 1;
+        /** Is interrupt not implemented by fs? */
+        unsigned no_interrupt : 1;
        /** The number of requests waiting for completion */
        atomic_t num_waiting;
@@ -320,11 +345,23 @@ struct fuse_conn {
        /** Backing dev info */
        struct backing_dev_info bdi;
-        /** kobject */
+        /** Entry on the fuse_conn_list */
-        struct kobject kobj;
+        struct list_head entry;
+        /** Unique ID */
+        u64 id;
+        /** Dentries in the control filesystem */
+        struct dentry *ctl_dentry[FUSE_CTL_NUM_DENTRIES];
+        /** number of dentries used in the above array */
+        int ctl_ndents;
        /** O_ASYNC requests */
        struct fasync_struct *fasync;
+        /** Key for lock owner ID scrambling */
+        u32 scramble_key[4];
 };
 static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb)
@@ -337,11 +374,6 @@ static inline struct fuse_conn *get_fuse_conn(struct inode *inode)
        return get_fuse_conn_super(inode->i_sb);
 }
-static inline struct fuse_conn *get_fuse_conn_kobj(struct kobject *obj)
-{
-        return container_of(obj, struct fuse_conn, kobj);
-}
 static inline struct fuse_inode *get_fuse_inode(struct inode *inode)
 {
        return container_of(inode, struct fuse_inode, inode);
@@ -383,12 +415,9 @@ void fuse_file_free(struct fuse_file *ff);
 void fuse_finish_open(struct inode *inode, struct file *file,
                      struct fuse_file *ff, struct fuse_open_out *outarg);
-/**
+/** */
- * Send a RELEASE request
+struct fuse_req *fuse_release_fill(struct fuse_file *ff, u64 nodeid, int flags,
- */
+                                   int opcode);
-void fuse_send_release(struct fuse_conn *fc, struct fuse_file *ff,
-                       u64 nodeid, struct inode *inode, int flags, int isdir);
 /**
 * Send RELEASE or RELEASEDIR request
 */
@@ -435,6 +464,9 @@ int fuse_dev_init(void);
 */
 void fuse_dev_cleanup(void);
+int fuse_ctl_init(void);
+void fuse_ctl_cleanup(void);
 /**
 * Allocate a request
 */
@@ -446,14 +478,14 @@ struct fuse_req *fuse_request_alloc(void);
 void fuse_request_free(struct fuse_req *req);
 /**
- * Reinitialize a request, the preallocated flag is left unmodified
+ * Get a request, may fail with -ENOMEM
 */
-void fuse_reset_request(struct fuse_req *req);
+struct fuse_req *fuse_get_req(struct fuse_conn *fc);
 /**
- * Reserve a preallocated request
+ * Get a request for a file operation, always succeeds
 */
-struct fuse_req *fuse_get_req(struct fuse_conn *fc);
+struct fuse_req *fuse_get_req_nofail(struct fuse_conn *fc, struct file *file);
 /**
 * Decrement reference count of a request.  If count goes to zero free
@@ -476,11 +508,6 @@ void request_send_noreply(struct fuse_conn *fc, struct fuse_req *req);
 */
 void request_send_background(struct fuse_conn *fc, struct fuse_req *req);
-/**
- * Release inodes and file associated with background request
- */
-void fuse_release_background(struct fuse_conn *fc, struct fuse_req *req);
 /* Abort all requests */
 void fuse_abort_conn(struct fuse_conn *fc);
@@ -493,3 +520,23 @@ int fuse_do_getattr(struct inode *inode);
 * Invalidate inode attributes
 */
 void fuse_invalidate_attr(struct inode *inode);
+/**
+ * Acquire reference to fuse_conn
+ */
+struct fuse_conn *fuse_conn_get(struct fuse_conn *fc);
+/**
+ * Release reference to fuse_conn
+ */
+void fuse_conn_put(struct fuse_conn *fc);
+/**
+ * Add connection to control filesystem
+ */
+int fuse_ctl_add_conn(struct fuse_conn *fc);
+/**
+ * Remove connection from control filesystem
+ */
+void fuse_ctl_remove_conn(struct fuse_conn *fc);
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 7627022446b2..dcaaabd3b9c4 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -11,25 +11,20 @@
 #include <linux/pagemap.h>
 #include <linux/slab.h>
 #include <linux/file.h>
-#include <linux/mount.h>
 #include <linux/seq_file.h>
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/parser.h>
 #include <linux/statfs.h>
+#include <linux/random.h>
 MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>");
 MODULE_DESCRIPTION("Filesystem in Userspace");
 MODULE_LICENSE("GPL");
 static kmem_cache_t *fuse_inode_cachep;
-static struct subsystem connections_subsys;
+struct list_head fuse_conn_list;
+DEFINE_MUTEX(fuse_mutex);
-struct fuse_conn_attr {
-        struct attribute attr;
-        ssize_t (*show)(struct fuse_conn *, char *);
-        ssize_t (*store)(struct fuse_conn *, const char *, size_t);
-};
 #define FUSE_SUPER_MAGIC 0x65735546
@@ -104,6 +99,14 @@ static void fuse_clear_inode(struct inode *inode)
        }
 }
+/*
+ * ->remount_fs handler: refuse a remount whose flags include MS_MANDLOCK
+ * with -EINVAL, accept everything else.  fuse_fill_super() applies the same
+ * MS_MANDLOCK check at mount time.  @sb and @data are not used.
+ */
+static int fuse_remount_fs(struct super_block *sb, int *flags, char *data)
+{
+        if (*flags & MS_MANDLOCK)
+                return -EINVAL;
+        return 0;
+}
 void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr)
 {
        if (S_ISREG(inode->i_mode) && i_size_read(inode) != attr->size)
@@ -195,31 +198,29 @@ struct inode *fuse_iget(struct super_block *sb, unsigned long nodeid,
        return inode;
 }
+/*
+ * ->umount_begin handler: abort the connection only when the umount is
+ * forced (MNT_FORCE set in @flags); otherwise do nothing.
+ */
-static void fuse_umount_begin(struct super_block *sb)
+static void fuse_umount_begin(struct vfsmount *vfsmnt, int flags)
 {
-        fuse_abort_conn(get_fuse_conn_super(sb));
+        if (flags & MNT_FORCE)
+                fuse_abort_conn(get_fuse_conn_super(vfsmnt->mnt_sb));
 }
 static void fuse_put_super(struct super_block *sb)
 {
        struct fuse_conn *fc = get_fuse_conn_super(sb);
-        down_write(&fc->sbput_sem);
-        while (!list_empty(&fc->background))
-                fuse_release_background(fc,
-                                        list_entry(fc->background.next,
-                                                   struct fuse_req, bg_entry));
        spin_lock(&fc->lock);
-        fc->mounted = 0;
        fc->connected = 0;
+        fc->blocked = 0;
        spin_unlock(&fc->lock);
-        up_write(&fc->sbput_sem);
        /* Flush all readers on this fs */
        kill_fasync(&fc->fasync, SIGIO, POLL_IN);
        wake_up_all(&fc->waitq);
-        kobject_del(&fc->kobj);
+        wake_up_all(&fc->blocked_waitq);
-        kobject_put(&fc->kobj);
+        mutex_lock(&fuse_mutex);
+        list_del(&fc->entry);
+        fuse_ctl_remove_conn(fc);
+        mutex_unlock(&fuse_mutex);
+        fuse_conn_put(fc);
 }
 static void convert_fuse_statfs(struct kstatfs *stbuf, struct fuse_kstatfs *attr)
@@ -236,8 +237,9 @@ static void convert_fuse_statfs(struct kstatfs *stbuf, struct fuse_kstatfs *attr
        /* fsid is left zero */
 }
-static int fuse_statfs(struct super_block *sb, struct kstatfs *buf)
+static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
+        struct super_block *sb = dentry->d_sb;
        struct fuse_conn *fc = get_fuse_conn_super(sb);
        struct fuse_req *req;
        struct fuse_statfs_out outarg;
@@ -368,11 +370,6 @@ static int fuse_show_options(struct seq_file *m, struct vfsmount *mnt)
        return 0;
 }
-static void fuse_conn_release(struct kobject *kobj)
-{
-        kfree(get_fuse_conn_kobj(kobj));
-}
 static struct fuse_conn *new_conn(void)
 {
        struct fuse_conn *fc;
@@ -380,24 +377,35 @@ static struct fuse_conn *new_conn(void)
        fc = kzalloc(sizeof(*fc), GFP_KERNEL);
        if (fc) {
                spin_lock_init(&fc->lock);
+                atomic_set(&fc->count, 1);
                init_waitqueue_head(&fc->waitq);
                init_waitqueue_head(&fc->blocked_waitq);
                INIT_LIST_HEAD(&fc->pending);
                INIT_LIST_HEAD(&fc->processing);
                INIT_LIST_HEAD(&fc->io);
-                INIT_LIST_HEAD(&fc->background);
+                INIT_LIST_HEAD(&fc->interrupts);
-                init_rwsem(&fc->sbput_sem);
-                kobj_set_kset_s(fc, connections_subsys);
-                kobject_init(&fc->kobj);
                atomic_set(&fc->num_waiting, 0);
                fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
                fc->bdi.unplug_io_fn = default_unplug_io_fn;
                fc->reqctr = 0;
                fc->blocked = 1;
+                get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key));
        }
        return fc;
 }
+/*
+ * Drop one reference on @fc.  The structure is freed with kfree() when the
+ * count reaches zero, so a caller holding no other reference must not touch
+ * @fc afterwards.
+ */
+void fuse_conn_put(struct fuse_conn *fc)
+{
+        if (atomic_dec_and_test(&fc->count))
+                kfree(fc);
+}
+/*
+ * Take an additional reference on @fc.  Returns @fc so the call can be used
+ * directly in an expression, e.g. when storing the pointer.  The reference
+ * is released with fuse_conn_put().
+ */
+struct fuse_conn *fuse_conn_get(struct fuse_conn *fc)
+{
+        atomic_inc(&fc->count);
+        return fc;
+}
 static struct inode *get_root_inode(struct super_block *sb, unsigned mode)
 {
        struct fuse_attr attr;
@@ -413,6 +421,7 @@ static struct super_operations fuse_super_operations = {
        .destroy_inode  = fuse_destroy_inode,
        .read_inode     = fuse_read_inode,
        .clear_inode    = fuse_clear_inode,
+        .remount_fs     = fuse_remount_fs,
        .put_super      = fuse_put_super,
        .umount_begin   = fuse_umount_begin,
        .statfs         = fuse_statfs,
@@ -432,8 +441,12 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
                        ra_pages = arg->max_readahead / PAGE_CACHE_SIZE;
                        if (arg->flags & FUSE_ASYNC_READ)
                                fc->async_read = 1;
-                } else
+                        if (!(arg->flags & FUSE_POSIX_LOCKS))
+                                fc->no_lock = 1;
+                } else {
                        ra_pages = fc->max_read / PAGE_CACHE_SIZE;
+                        fc->no_lock = 1;
+                }
                fc->bdi.ra_pages = min(fc->bdi.ra_pages, ra_pages);
                fc->minor = arg->minor;
@@ -451,7 +464,7 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
        arg->major = FUSE_KERNEL_VERSION;
        arg->minor = FUSE_KERNEL_MINOR_VERSION;
        arg->max_readahead = fc->bdi.ra_pages * PAGE_CACHE_SIZE;
-        arg->flags |= FUSE_ASYNC_READ;
+        arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS;
        req->in.h.opcode = FUSE_INIT;
        req->in.numargs = 1;
        req->in.args[0].size = sizeof(*arg);
@@ -467,10 +480,9 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
        request_send_background(fc, req);
 }
+/*
+ * Return the next connection ID from a static counter that starts at 1.
+ * The counter is not protected here; fuse_fill_super() calls this with
+ * fuse_mutex held.
+ */
-static unsigned long long conn_id(void)
+static u64 conn_id(void)
 {
-        /* BKL is held for ->get_sb() */
+        static u64 ctr = 1;
-        static unsigned long long ctr = 1;
         return ctr++;
 }
@@ -484,6 +496,9 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
        struct fuse_req *init_req;
        int err;
+        if (sb->s_flags & MS_MANDLOCK)
+                return -EINVAL;
        if (!parse_fuse_opt((char *) data, &d))
                return -EINVAL;
@@ -527,25 +542,21 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
        if (!init_req)
                goto err_put_root;
-        err = kobject_set_name(&fc->kobj, "%llu", conn_id());
+        mutex_lock(&fuse_mutex);
-        if (err)
-                goto err_free_req;
-        err = kobject_add(&fc->kobj);
-        if (err)
-                goto err_free_req;
-        /* Setting file->private_data can't race with other mount()
-           instances, since BKL is held for ->get_sb() */
        err = -EINVAL;
        if (file->private_data)
-                goto err_kobject_del;
+                goto err_unlock;
+        fc->id = conn_id();
+        err = fuse_ctl_add_conn(fc);
+        if (err)
+                goto err_unlock;
+        list_add_tail(&fc->entry, &fuse_conn_list);
        sb->s_root = root_dentry;
-        fc->mounted = 1;
        fc->connected = 1;
-        kobject_get(&fc->kobj);
+        file->private_data = fuse_conn_get(fc);
-        file->private_data = fc;
+        mutex_unlock(&fuse_mutex);
        /*
         * atomic_dec_and_test() in fput() provides the necessary
         * memory barrier for file->private_data to be visible on all
@@ -557,23 +568,22 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
        return 0;
- err_kobject_del:
+ err_unlock:
-        kobject_del(&fc->kobj);
+        mutex_unlock(&fuse_mutex);
- err_free_req:
        fuse_request_free(init_req);
 err_put_root:
        dput(root_dentry);
 err:
        fput(file);
-        kobject_put(&fc->kobj);
+        fuse_conn_put(fc);
        return err;
 }
+/*
+ * Thin wrapper: obtains the superblock through get_sb_nodev() with
+ * fuse_fill_super() as the fill callback, passes @mnt through and returns
+ * the helper's int result.  @dev_name is not used.
+ */
-static struct super_block *fuse_get_sb(struct file_system_type *fs_type,
+static int fuse_get_sb(struct file_system_type *fs_type,
-                                       int flags, const char *dev_name,
+                       int flags, const char *dev_name,
-                                       void *raw_data)
+                       void *raw_data, struct vfsmount *mnt)
 {
-        return get_sb_nodev(fs_type, flags, raw_data, fuse_fill_super);
+        return get_sb_nodev(fs_type, flags, raw_data, fuse_fill_super, mnt);
 }
 static struct file_system_type fuse_fs_type = {
@@ -583,68 +593,8 @@ static struct file_system_type fuse_fs_type = {
        .kill_sb        = kill_anon_super,
 };
-static ssize_t fuse_conn_waiting_show(struct fuse_conn *fc, char *page)
-{
-        return sprintf(page, "%i\n", atomic_read(&fc->num_waiting));
-}
-static ssize_t fuse_conn_abort_store(struct fuse_conn *fc, const char *page,
-                                     size_t count)
-{
-        fuse_abort_conn(fc);
-        return count;
-}
-static struct fuse_conn_attr fuse_conn_waiting =
-        __ATTR(waiting, 0400, fuse_conn_waiting_show, NULL);
-static struct fuse_conn_attr fuse_conn_abort =
-        __ATTR(abort, 0600, NULL, fuse_conn_abort_store);
-static struct attribute *fuse_conn_attrs[] = {
-        &fuse_conn_waiting.attr,
-        &fuse_conn_abort.attr,
-        NULL,
-};
-static ssize_t fuse_conn_attr_show(struct kobject *kobj,
-                                   struct attribute *attr,
-                                   char *page)
-{
-        struct fuse_conn_attr *fca =
-                container_of(attr, struct fuse_conn_attr, attr);
-        if (fca->show)
-                return fca->show(get_fuse_conn_kobj(kobj), page);
-        else
-                return -EACCES;
-}
-static ssize_t fuse_conn_attr_store(struct kobject *kobj,
-                                    struct attribute *attr,
-                                    const char *page, size_t count)
-{
-        struct fuse_conn_attr *fca =
-                container_of(attr, struct fuse_conn_attr, attr);
-        if (fca->store)
-                return fca->store(get_fuse_conn_kobj(kobj), page, count);
-        else
-                return -EACCES;
-}
-static struct sysfs_ops fuse_conn_sysfs_ops = {
-        .show   = &fuse_conn_attr_show,
-        .store  = &fuse_conn_attr_store,
-};
-static struct kobj_type ktype_fuse_conn = {
-        .release        = fuse_conn_release,
-        .sysfs_ops      = &fuse_conn_sysfs_ops,
-        .default_attrs  = fuse_conn_attrs,
-};
 static decl_subsys(fuse, NULL, NULL);
-static decl_subsys(connections, &ktype_fuse_conn, NULL);
+static decl_subsys(connections, NULL, NULL);
 static void fuse_inode_init_once(void *foo, kmem_cache_t *cachep,
                                 unsigned long flags)
@@ -718,6 +668,7 @@ static int __init fuse_init(void)
        printk("fuse init (API version %i.%i)\n",
               FUSE_KERNEL_VERSION, FUSE_KERNEL_MINOR_VERSION);
+        INIT_LIST_HEAD(&fuse_conn_list);
        res = fuse_fs_init();
        if (res)
                goto err;
@@ -730,8 +681,14 @@ static int __init fuse_init(void)
        if (res)
                goto err_dev_cleanup;
+        res = fuse_ctl_init();
+        if (res)
+                goto err_sysfs_cleanup;
        return 0;
+ err_sysfs_cleanup:
+        fuse_sysfs_cleanup();
 err_dev_cleanup:
        fuse_dev_cleanup();
 err_fs_cleanup:
@@ -744,6 +701,7 @@ static void __exit fuse_exit(void)
 {
        printk(KERN_DEBUG "fuse exit\n");
+        fuse_ctl_cleanup();
        fuse_sysfs_cleanup();
        fuse_fs_cleanup();
        fuse_dev_cleanup();
diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c
index 1e44dcfe49c4..13231dd5ce66 100644
--- a/fs/hfs/bnode.c
+++ b/fs/hfs/bnode.c
@@ -280,7 +280,7 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid)
        block = off >> PAGE_CACHE_SHIFT;
        node->page_offset = off & ~PAGE_CACHE_MASK;
        for (i = 0; i < tree->pages_per_bnode; i++) {
-                page = read_cache_page(mapping, block++, (filler_t *)mapping->a_ops->readpage, NULL);
+                page = read_mapping_page(mapping, block++, NULL);
                if (IS_ERR(page))
                        goto fail;
                if (PageError(page)) {
diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c
index d20131ce4b95..400357994319 100644
--- a/fs/hfs/btree.c
+++ b/fs/hfs/btree.c
@@ -59,7 +59,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke
        unlock_new_inode(tree->inode);
        mapping = tree->inode->i_mapping;
-        page = read_cache_page(mapping, 0, (filler_t *)mapping->a_ops->readpage, NULL);
+        page = read_mapping_page(mapping, 0, NULL);
        if (IS_ERR(page))
                goto free_tree;
diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h
index 3ed8663a8db1..735332dfd1b8 100644
--- a/fs/hfs/hfs_fs.h
+++ b/fs/hfs/hfs_fs.h
@@ -182,8 +182,8 @@ extern void hfs_file_truncate(struct inode *);
 extern int hfs_get_block(struct inode *, sector_t, struct buffer_head *, int);
 /* inode.c */
-extern struct address_space_operations hfs_aops;
+extern const struct address_space_operations hfs_aops;
-extern struct address_space_operations hfs_btree_aops;
+extern const struct address_space_operations hfs_btree_aops;
 extern struct inode *hfs_new_inode(struct inode *, struct qstr *, int);
 extern void hfs_inode_write_fork(struct inode *, struct hfs_extent *, __be32 *, __be32 *);
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index 2d4ced22201b..315cf44a90b2 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -114,7 +114,7 @@ static int hfs_writepages(struct address_space *mapping,
        return mpage_writepages(mapping, wbc, hfs_get_block);
 }
-struct address_space_operations hfs_btree_aops = {
+const struct address_space_operations hfs_btree_aops = {
        .readpage       = hfs_readpage,
        .writepage      = hfs_writepage,
        .sync_page      = block_sync_page,
@@ -124,7 +124,7 @@ struct address_space_operations hfs_btree_aops = {
        .releasepage    = hfs_releasepage,
 };
-struct address_space_operations hfs_aops = {
+const struct address_space_operations hfs_aops = {
        .readpage       = hfs_readpage,
        .writepage      = hfs_writepage,
        .sync_page      = block_sync_page,
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 1181d116117d..34937ee83ab1 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -12,7 +12,6 @@
 * Based on the minix file system code, (C) 1991, 1992 by Linus Torvalds
 */
-#include <linux/config.h>
 #include <linux/module.h>
 #include <linux/blkdev.h>
 #include <linux/mount.h>
@@ -80,8 +79,10 @@ static void hfs_put_super(struct super_block *sb)
 *
 * changed f_files/f_ffree to reflect the fs_ablock/free_ablocks.
 */
-static int hfs_statfs(struct super_block *sb, struct kstatfs *buf)
+static int hfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
+        struct super_block *sb = dentry->d_sb;
        buf->f_type = HFS_SUPER_MAGIC;
        buf->f_bsize = sb->s_blocksize;
        buf->f_blocks = (u32)HFS_SB(sb)->fs_ablocks * HFS_SB(sb)->fs_div;
@@ -413,10 +414,11 @@ bail:
        return res;
 }
+/*
+ * Thin wrapper: obtains the superblock through get_sb_bdev() with
+ * hfs_fill_super() as the fill callback, passes @mnt through and returns
+ * the helper's int result.
+ */
-static struct super_block *hfs_get_sb(struct file_system_type *fs_type,
+static int hfs_get_sb(struct file_system_type *fs_type,
-                                      int flags, const char *dev_name, void *data)
+                      int flags, const char *dev_name, void *data,
+                      struct vfsmount *mnt)
 {
-        return get_sb_bdev(fs_type, flags, dev_name, data, hfs_fill_super);
+        return get_sb_bdev(fs_type, flags, dev_name, data, hfs_fill_super, mnt);
 }
 static struct file_system_type hfs_fs_type = {
diff --git a/fs/hfsplus/bitmap.c b/fs/hfsplus/bitmap.c
index 9fb51632303c..d128a25b74d2 100644
--- a/fs/hfsplus/bitmap.c
+++ b/fs/hfsplus/bitmap.c
@@ -31,8 +31,7 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size, u32 offset, u32 *ma
        dprint(DBG_BITMAP, "block_allocate: %u,%u,%u\n", size, offset, len);
        mutex_lock(&HFSPLUS_SB(sb).alloc_file->i_mutex);
        mapping = HFSPLUS_SB(sb).alloc_file->i_mapping;
-        page = read_cache_page(mapping, offset / PAGE_CACHE_BITS,
+        page = read_mapping_page(mapping, offset / PAGE_CACHE_BITS, NULL);
-                               (filler_t *)mapping->a_ops->readpage, NULL);
        pptr = kmap(page);
        curr = pptr + (offset & (PAGE_CACHE_BITS - 1)) / 32;
        i = offset % 32;
@@ -72,8 +71,8 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size, u32 offset, u32 *ma
                offset += PAGE_CACHE_BITS;
                if (offset >= size)
                        break;
-                page = read_cache_page(mapping, offset / PAGE_CACHE_BITS,
+                page = read_mapping_page(mapping, offset / PAGE_CACHE_BITS,
-                                       (filler_t *)mapping->a_ops->readpage, NULL);
+                                         NULL);
                curr = pptr = kmap(page);
                if ((size ^ offset) / PAGE_CACHE_BITS)
                        end = pptr + PAGE_CACHE_BITS / 32;
@@ -119,8 +118,8 @@ found:
                set_page_dirty(page);
                kunmap(page);
                offset += PAGE_CACHE_BITS;
-                page = read_cache_page(mapping, offset / PAGE_CACHE_BITS,
+                page = read_mapping_page(mapping, offset / PAGE_CACHE_BITS,
-                                       (filler_t *)mapping->a_ops->readpage, NULL);
+                                         NULL);
                pptr = kmap(page);
                curr = pptr;
                end = pptr + PAGE_CACHE_BITS / 32;
@@ -167,7 +166,7 @@ int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count)
        mutex_lock(&HFSPLUS_SB(sb).alloc_file->i_mutex);
        mapping = HFSPLUS_SB(sb).alloc_file->i_mapping;
        pnr = offset / PAGE_CACHE_BITS;
-        page = read_cache_page(mapping, pnr, (filler_t *)mapping->a_ops->readpage, NULL);
+        page = read_mapping_page(mapping, pnr, NULL);
        pptr = kmap(page);
        curr = pptr + (offset & (PAGE_CACHE_BITS - 1)) / 32;
        end = pptr + PAGE_CACHE_BITS / 32;
@@ -199,7 +198,7 @@ int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count)
                        break;
                set_page_dirty(page);
                kunmap(page);
-                page = read_cache_page(mapping, ++pnr, (filler_t *)mapping->a_ops->readpage, NULL);
+                page = read_mapping_page(mapping, ++pnr, NULL);
                pptr = kmap(page);
                curr = pptr;
                end = pptr + PAGE_CACHE_BITS / 32;
diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c
index 746abc9ecf70..77bf434da679 100644
--- a/fs/hfsplus/bnode.c
+++ b/fs/hfsplus/bnode.c
@@ -440,7 +440,7 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid)
        block = off >> PAGE_CACHE_SHIFT;
        node->page_offset = off & ~PAGE_CACHE_MASK;
        for (i = 0; i < tree->pages_per_bnode; block++, i++) {
-                page = read_cache_page(mapping, block, (filler_t *)mapping->a_ops->readpage, NULL);
+                page = read_mapping_page(mapping, block, NULL);
                if (IS_ERR(page))
                        goto fail;
                if (PageError(page)) {
diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c
index effa8991999c..cfc852fdd1b5 100644
--- a/fs/hfsplus/btree.c
+++ b/fs/hfsplus/btree.c
@@ -38,7 +38,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
                goto free_tree;
        mapping = tree->inode->i_mapping;
-        page = read_cache_page(mapping, 0, (filler_t *)mapping->a_ops->readpage, NULL);
+        page = read_mapping_page(mapping, 0, NULL);
        if (IS_ERR(page))
                goto free_tree;
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index 7ae393637a0c..8a1ca5ef7ada 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -323,8 +323,8 @@ int hfsplus_file_extend(struct inode *);
 void hfsplus_file_truncate(struct inode *);
 /* inode.c */
-extern struct address_space_operations hfsplus_aops;
+extern const struct address_space_operations hfsplus_aops;
-extern struct address_space_operations hfsplus_btree_aops;
+extern const struct address_space_operations hfsplus_btree_aops;
 void hfsplus_inode_read_fork(struct inode *, struct hfsplus_fork_raw *);
 void hfsplus_inode_write_fork(struct inode *, struct hfsplus_fork_raw *);
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index acf66dba3e01..924ecdef8091 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -109,7 +109,7 @@ static int hfsplus_writepages(struct address_space *mapping,
        return mpage_writepages(mapping, wbc, hfsplus_get_block);
 }
-struct address_space_operations hfsplus_btree_aops = {
+const struct address_space_operations hfsplus_btree_aops = {
        .readpage       = hfsplus_readpage,
        .writepage      = hfsplus_writepage,
        .sync_page      = block_sync_page,
@@ -119,7 +119,7 @@ struct address_space_operations hfsplus_btree_aops = {
        .releasepage    = hfsplus_releasepage,
 };
-struct address_space_operations hfsplus_aops = {
+const struct address_space_operations hfsplus_aops = {
        .readpage       = hfsplus_readpage,
        .writepage      = hfsplus_writepage,
        .sync_page      = block_sync_page,
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 7843f792a4b7..d279d5924f28 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -7,7 +7,6 @@
 *
 */
-#include <linux/config.h>
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/pagemap.h>
@@ -212,8 +211,10 @@ static void hfsplus_put_super(struct super_block *sb)
        sb->s_fs_info = NULL;
 }
-static int hfsplus_statfs(struct super_block *sb, struct kstatfs *buf)
+static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
+        struct super_block *sb = dentry->d_sb;
        buf->f_type = HFSPLUS_SUPER_MAGIC;
        buf->f_bsize = sb->s_blocksize;
        buf->f_blocks = HFSPLUS_SB(sb).total_blocks << HFSPLUS_SB(sb).fs_shift;
@@ -450,10 +451,12 @@ static void hfsplus_destroy_inode(struct inode *inode)
 #define HFSPLUS_INODE_SIZE      sizeof(struct hfsplus_inode_info)
-static struct super_block *hfsplus_get_sb(struct file_system_type *fs_type,
+static int hfsplus_get_sb(struct file_system_type *fs_type,
-                                          int flags, const char *dev_name, void *data)
+                          int flags, const char *dev_name, void *data,
+                          struct vfsmount *mnt)
 {
-        return get_sb_bdev(fs_type, flags, dev_name, data, hfsplus_fill_super);
+        return get_sb_bdev(fs_type, flags, dev_name, data, hfsplus_fill_super,
+                           mnt);
 }
 static struct file_system_type hfsplus_fs_type = {
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index bf0f8e16e433..b82e3d9c8790 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -54,7 +54,7 @@ static int append = 0;
 static struct inode_operations hostfs_iops;
 static struct inode_operations hostfs_dir_iops;
-static struct address_space_operations hostfs_link_aops;
+static const struct address_space_operations hostfs_link_aops;
 #ifndef MODULE
 static int __init hostfs_args(char *options, int *add)
@@ -239,7 +239,7 @@ static int read_inode(struct inode *ino)
        return(err);
 }
-int hostfs_statfs(struct super_block *sb, struct kstatfs *sf)
+int hostfs_statfs(struct dentry *dentry, struct kstatfs *sf)
 {
        /* do_statfs uses struct statfs64 internally, but the linux kernel
         * struct statfs still has 32-bit versions for most of these fields,
@@ -252,7 +252,7 @@ int hostfs_statfs(struct super_block *sb, struct kstatfs *sf)
        long long f_files;
        long long f_ffree;
-        err = do_statfs(HOSTFS_I(sb->s_root->d_inode)->host_filename,
+        err = do_statfs(HOSTFS_I(dentry->d_sb->s_root->d_inode)->host_filename,
                        &sf->f_bsize, &f_blocks, &f_bfree, &f_bavail, &f_files,
                        &f_ffree, &sf->f_fsid, sizeof(sf->f_fsid),
                        &sf->f_namelen, sf->f_spare);
@@ -518,7 +518,7 @@ int hostfs_commit_write(struct file *file, struct page *page, unsigned from,
        return(err);
 }
-static struct address_space_operations hostfs_aops = {
+static const struct address_space_operations hostfs_aops = {
        .writepage      = hostfs_writepage,
        .readpage       = hostfs_readpage,
        .set_page_dirty = __set_page_dirty_nobuffers,
@@ -935,7 +935,7 @@ int hostfs_link_readpage(struct file *file, struct page *page)
        return(err);
 }
-static struct address_space_operations hostfs_link_aops = {
+static const struct address_space_operations hostfs_link_aops = {
        .readpage       = hostfs_link_readpage,
 };
@@ -993,11 +993,11 @@ static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent)
        return(err);
 }
-static struct super_block *hostfs_read_sb(struct file_system_type *type,
+static int hostfs_read_sb(struct file_system_type *type,
-                                             int flags, const char *dev_name,
+                          int flags, const char *dev_name,
-                                             void *data)
+                          void *data, struct vfsmount *mnt)
 {
-        return(get_sb_nodev(type, flags, data, hostfs_fill_sb_common));
+        return get_sb_nodev(type, flags, data, hostfs_fill_sb_common, mnt);
 }
 static struct file_system_type hostfs_type = {
diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c
index d3b9fffe45a1..d9eb19b7b8ae 100644
--- a/fs/hpfs/file.c
+++ b/fs/hpfs/file.c
@@ -99,7 +99,7 @@ static sector_t _hpfs_bmap(struct address_space *mapping, sector_t block)
 {
        return generic_block_bmap(mapping,block,hpfs_get_block);
 }
-struct address_space_operations hpfs_aops = {
+const struct address_space_operations hpfs_aops = {
        .readpage = hpfs_readpage,
        .writepage = hpfs_writepage,
        .sync_page = block_sync_page,
diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h
index 29b7a3e55173..f687d54ed442 100644
--- a/fs/hpfs/hpfs_fn.h
+++ b/fs/hpfs/hpfs_fn.h
@@ -268,7 +268,7 @@ void hpfs_set_ea(struct inode *, struct fnode *, char *, char *, int);
 int hpfs_file_fsync(struct file *, struct dentry *, int);
 extern const struct file_operations hpfs_file_ops;
 extern struct inode_operations hpfs_file_iops;
-extern struct address_space_operations hpfs_aops;
+extern const struct address_space_operations hpfs_aops;
 /* inode.c */
@@ -304,7 +304,7 @@ void hpfs_decide_conv(struct inode *, unsigned char *, unsigned);
 /* namei.c */
 extern struct inode_operations hpfs_dir_iops;
-extern struct address_space_operations hpfs_symlink_aops;
+extern const struct address_space_operations hpfs_symlink_aops;
 static inline struct hpfs_inode_info *hpfs_i(struct inode *inode)
 {
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index a03abb12c610..59e7dc182a0c 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -538,7 +538,7 @@ fail:
        return err;
 }
-struct address_space_operations hpfs_symlink_aops = {
+const struct address_space_operations hpfs_symlink_aops = {
        .readpage       = hpfs_symlink_readpage
 };
        
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index d72d8c87c996..f798480a363f 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -135,8 +135,9 @@ static unsigned count_bitmaps(struct super_block *s)
        return count;
 }
-static int hpfs_statfs(struct super_block *s, struct kstatfs *buf)
+static int hpfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
+        struct super_block *s = dentry->d_sb;
        struct hpfs_sb_info *sbi = hpfs_sb(s);
        lock_kernel();
@@ -662,10 +663,11 @@ bail0:
        return -EINVAL;
 }
-static struct super_block *hpfs_get_sb(struct file_system_type *fs_type,
+static int hpfs_get_sb(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data)
+        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-        return get_sb_bdev(fs_type, flags, dev_name, data, hpfs_fill_super);
+        return get_sb_bdev(fs_type, flags, dev_name, data, hpfs_fill_super,
+                           mnt);
 }
 static struct file_system_type hpfs_fs_type = {
diff --git a/fs/hppfs/hppfs_kern.c b/fs/hppfs/hppfs_kern.c
index 5e6363be246f..3a9bdf58166f 100644
--- a/fs/hppfs/hppfs_kern.c
+++ b/fs/hppfs/hppfs_kern.c
@@ -616,7 +616,7 @@ static const struct file_operations hppfs_dir_fops = {
        .fsync          = hppfs_fsync,
 };
-static int hppfs_statfs(struct super_block *sb, struct kstatfs *sf)
+static int hppfs_statfs(struct dentry *dentry, struct kstatfs *sf)
 {
        sf->f_blocks = 0;
        sf->f_bfree = 0;
@@ -769,11 +769,11 @@ static int hppfs_fill_super(struct super_block *sb, void *d, int silent)
        return(err);
 }
-static struct super_block *hppfs_read_super(struct file_system_type *type,
+static int hppfs_read_super(struct file_system_type *type,
-                                             int flags, const char *dev_name,
+                            int flags, const char *dev_name,
-                                             void *data)
+                            void *data, struct vfsmount *mnt)
 {
-        return(get_sb_nodev(type, flags, data, hppfs_fill_super));
+        return get_sb_nodev(type, flags, data, hppfs_fill_super, mnt);
 }
 static struct file_system_type hppfs_type = {
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 3a5b4e923455..6449cb697967 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -34,7 +34,7 @@
 #define HUGETLBFS_MAGIC 0x958458f6
 static struct super_operations hugetlbfs_ops;
-static struct address_space_operations hugetlbfs_aops;
+static const struct address_space_operations hugetlbfs_aops;
 const struct file_operations hugetlbfs_file_operations;
 static struct inode_operations hugetlbfs_dir_inode_operations;
 static struct inode_operations hugetlbfs_inode_operations;
@@ -59,7 +59,6 @@ static void huge_pagevec_release(struct pagevec *pvec)
 static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 {
        struct inode *inode = file->f_dentry->d_inode;
-        struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
        loff_t len, vma_len;
        int ret;
@@ -87,9 +86,10 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
        if (!(vma->vm_flags & VM_WRITE) && len > inode->i_size)
                goto out;
-        if (vma->vm_flags & VM_MAYSHARE)
+        if (vma->vm_flags & VM_MAYSHARE &&
-                if (hugetlb_extend_reservation(info, len >> HPAGE_SHIFT) != 0)
+            hugetlb_reserve_pages(inode, vma->vm_pgoff >> (HPAGE_SHIFT-PAGE_SHIFT),
-                        goto out;
+                                  len >> HPAGE_SHIFT))
+                goto out;
        ret = 0;
        hugetlb_prefault_arch_hook(vma->vm_mm);
@@ -195,12 +195,8 @@ static void truncate_hugepages(struct inode *inode, loff_t lstart)
        const pgoff_t start = lstart >> HPAGE_SHIFT;
        struct pagevec pvec;
        pgoff_t next;
-        int i;
+        int i, freed = 0;
-        hugetlb_truncate_reservation(HUGETLBFS_I(inode),
-                                     lstart >> HPAGE_SHIFT);
-        if (!mapping->nrpages)
-                return;
        pagevec_init(&pvec, 0);
        next = start;
        while (1) {
@@ -221,10 +217,12 @@ static void truncate_hugepages(struct inode *inode, loff_t lstart)
                        truncate_huge_page(page);
                        unlock_page(page);
                        hugetlb_put_quota(mapping);
+                        freed++;
                }
                huge_pagevec_release(&pvec);
        }
        BUG_ON(!lstart && mapping->nrpages);
+        hugetlb_unreserve_pages(inode, start, freed);
 }
 static void hugetlbfs_delete_inode(struct inode *inode)
@@ -366,6 +364,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid,
                inode->i_mapping->a_ops = &hugetlbfs_aops;
                inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info;
                inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+                INIT_LIST_HEAD(&inode->i_mapping->private_list);
                info = HUGETLBFS_I(inode);
                mpol_shared_policy_init(&info->policy, MPOL_DEFAULT, NULL);
                switch (mode & S_IFMT) {
@@ -467,9 +466,9 @@ static int hugetlbfs_set_page_dirty(struct page *page)
        return 0;
 }
-static int hugetlbfs_statfs(struct super_block *sb, struct kstatfs *buf)
+static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
-        struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(sb);
+        struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb);
        buf->f_type = HUGETLBFS_MAGIC;
        buf->f_bsize = HPAGE_SIZE;
@@ -538,7 +537,6 @@ static struct inode *hugetlbfs_alloc_inode(struct super_block *sb)
                hugetlbfs_inc_free_inodes(sbinfo);
                return NULL;
        }
-        p->prereserved_hpages = 0;
        return &p->vfs_inode;
 }
@@ -549,7 +547,7 @@ static void hugetlbfs_destroy_inode(struct inode *inode)
        kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode));
 }
-static struct address_space_operations hugetlbfs_aops = {
+static const struct address_space_operations hugetlbfs_aops = {
        .readpage       = hugetlbfs_readpage,
        .prepare_write  = hugetlbfs_prepare_write,
        .commit_write   = hugetlbfs_commit_write,
@@ -723,10 +721,10 @@ void hugetlb_put_quota(struct address_space *mapping)
        }
 }
-static struct super_block *hugetlbfs_get_sb(struct file_system_type *fs_type,
+static int hugetlbfs_get_sb(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data)
+        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-        return get_sb_nodev(fs_type, flags, data, hugetlbfs_fill_super);
+        return get_sb_nodev(fs_type, flags, data, hugetlbfs_fill_super, mnt);
 }
 static struct file_system_type hugetlbfs_fs_type = {
@@ -781,8 +779,7 @@ struct file *hugetlb_zero_setup(size_t size)
                goto out_file;
        error = -ENOMEM;
-        if (hugetlb_extend_reservation(HUGETLBFS_I(inode),
+        if (hugetlb_reserve_pages(inode, 0, size >> HPAGE_SHIFT))
-                                       size >> HPAGE_SHIFT) != 0)
                goto out_inode;
        d_instantiate(dentry, inode);
diff --git a/fs/inode.c b/fs/inode.c
index 3a2446a27d2c..0bf9f0444a96 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -4,7 +4,6 @@
 * (C) 1997 Linus Torvalds
 */
-#include <linux/config.h>
 #include <linux/fs.h>
 #include <linux/mm.h>
 #include <linux/dcache.h>
@@ -102,7 +101,7 @@ static kmem_cache_t * inode_cachep __read_mostly;
 static struct inode *alloc_inode(struct super_block *sb)
 {
-        static struct address_space_operations empty_aops;
+        static const struct address_space_operations empty_aops;
        static struct inode_operations empty_iops;
        static const struct file_operations empty_fops;
        struct inode *inode;
@@ -452,15 +451,14 @@ static void prune_icache(int nr_to_scan)
                nr_pruned++;
        }
        inodes_stat.nr_unused -= nr_pruned;
+        if (current_is_kswapd())
+                __count_vm_events(KSWAPD_INODESTEAL, reap);
+        else
+                __count_vm_events(PGINODESTEAL, reap);
        spin_unlock(&inode_lock);
        dispose_list(&freeable);
        mutex_unlock(&iprune_mutex);
-        if (current_is_kswapd())
-                mod_page_state(kswapd_inodesteal, reap);
-        else
-                mod_page_state(pginodesteal, reap);
 }
 /*
diff --git a/fs/inotify.c b/fs/inotify.c
index 732ec4bd5774..723836a1f718 100644
--- a/fs/inotify.c
+++ b/fs/inotify.c
@@ -5,7 +5,10 @@
 *      John McCutchan  <ttb@tentacle.dhs.org>
 *      Robert Love     <rml@novell.com>
 *
+ * Kernel API added by: Amy Griffis <amy.griffis@hp.com>
+ *
 * Copyright (C) 2005 John McCutchan
+ * Copyright 2006 Hewlett-Packard Development Company, L.P.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the
@@ -20,35 +23,17 @@
 #include <linux/module.h>
 #include <linux/kernel.h>
-#include <linux/sched.h>
 #include <linux/spinlock.h>
 #include <linux/idr.h>
 #include <linux/slab.h>
 #include <linux/fs.h>
-#include <linux/file.h>
-#include <linux/mount.h>
-#include <linux/namei.h>
-#include <linux/poll.h>
 #include <linux/init.h>
 #include <linux/list.h>
 #include <linux/writeback.h>
 #include <linux/inotify.h>
-#include <linux/syscalls.h>
-#include <asm/ioctls.h>
 static atomic_t inotify_cookie;
-static kmem_cache_t *watch_cachep __read_mostly;
-static kmem_cache_t *event_cachep __read_mostly;
-static struct vfsmount *inotify_mnt __read_mostly;
-/* these are configurable via /proc/sys/fs/inotify/ */
-int inotify_max_user_instances __read_mostly;
-int inotify_max_user_watches __read_mostly;
-int inotify_max_queued_events __read_mostly;
 /*
 * Lock ordering:
 *
@@ -56,327 +41,108 @@ int inotify_max_queued_events __read_mostly;
 * iprune_mutex (synchronize shrink_icache_memory())
 *      inode_lock (protects the super_block->s_inodes list)
 *      inode->inotify_mutex (protects inode->inotify_watches and watches->i_list)
- *              inotify_dev->mutex (protects inotify_device and watches->d_list)
+ *              inotify_handle->mutex (protects inotify_handle and watches->h_list)
+ *
+ * The inode->inotify_mutex and inotify_handle->mutex are held during execution
+ * of a caller's event handler.  Thus, the caller must not hold any locks
+ * taken in their event handler while calling any of the published inotify
+ * interfaces.
 */
 /*
- * Lifetimes of the three main data structures--inotify_device, inode, and
+ * Lifetimes of the three main data structures--inotify_handle, inode, and
 * inotify_watch--are managed by reference count.
 *
- * inotify_device: Lifetime is from inotify_init() until release.  Additional
+ * inotify_handle: Lifetime is from inotify_init() to inotify_destroy().
- * references can bump the count via get_inotify_dev() and drop the count via
+ * Additional references can bump the count via get_inotify_handle() and drop
- * put_inotify_dev().
+ * the count via put_inotify_handle().
 *
- * inotify_watch: Lifetime is from create_watch() to destory_watch().
+ * inotify_watch: for inotify's purposes, lifetime is from inotify_add_watch()
- * Additional references can bump the count via get_inotify_watch() and drop
+ * to remove_watch_no_event().  Additional references can bump the count via
- * the count via put_inotify_watch().
+ * get_inotify_watch() and drop the count via put_inotify_watch().  The caller
+ * is responsible for the final put after receiving IN_IGNORED, or when using
+ * IN_ONESHOT after receiving the first event.  Inotify does the final put if
+ * inotify_destroy() is called.
 *
 * inode: Pinned so long as the inode is associated with a watch, from
- * create_watch() to put_inotify_watch().
+ * inotify_add_watch() to the final put_inotify_watch().
 */
 /*
- * struct inotify_device - represents an inotify instance
+ * struct inotify_handle - represents an inotify instance
 *
 * This structure is protected by the mutex 'mutex'.
 */
-struct inotify_device {
+struct inotify_handle {
-        wait_queue_head_t       wq;             /* wait queue for i/o */
        struct idr              idr;            /* idr mapping wd -> watch */
        struct mutex            mutex;          /* protects this bad boy */
-        struct list_head        events;         /* list of queued events */
        struct list_head        watches;        /* list of watches */
        atomic_t                count;          /* reference count */
-        struct user_struct      *user;          /* user who opened this dev */
-        unsigned int            queue_size;     /* size of the queue (bytes) */
-        unsigned int            event_count;    /* number of pending events */
-        unsigned int            max_events;     /* maximum number of events */
        u32                     last_wd;        /* the last wd allocated */
+        const struct inotify_operations *in_ops; /* inotify caller operations */
 };
-/*
+static inline void get_inotify_handle(struct inotify_handle *ih)
- * struct inotify_kernel_event - An inotify event, originating from a watch and
- * queued for user-space.  A list of these is attached to each instance of the
- * device.  In read(), this list is walked and all events that can fit in the
- * buffer are returned.
- *
- * Protected by dev->mutex of the device in which we are queued.
- */
-struct inotify_kernel_event {
-        struct inotify_event    event;  /* the user-space event */
-        struct list_head        list;   /* entry in inotify_device's list */
-        char                    *name;  /* filename, if any */
-};
-/*
- * struct inotify_watch - represents a watch request on a specific inode
- *
- * d_list is protected by dev->mutex of the associated watch->dev.
- * i_list and mask are protected by inode->inotify_mutex of the associated inode.
- * dev, inode, and wd are never written to once the watch is created.
- */
-struct inotify_watch {
-        struct list_head        d_list; /* entry in inotify_device's list */
-        struct list_head        i_list; /* entry in inode's list */
-        atomic_t                count;  /* reference count */
-        struct inotify_device   *dev;   /* associated device */
-        struct inode            *inode; /* associated inode */
-        s32                     wd;     /* watch descriptor */
-        u32                     mask;   /* event mask for this watch */
-};
-#ifdef CONFIG_SYSCTL
-#include <linux/sysctl.h>
-static int zero;
-ctl_table inotify_table[] = {
-        {
-                .ctl_name       = INOTIFY_MAX_USER_INSTANCES,
-                .procname       = "max_user_instances",
-                .data           = &inotify_max_user_instances,
-                .maxlen         = sizeof(int),
-                .mode           = 0644,
-                .proc_handler   = &proc_dointvec_minmax,
-                .strategy       = &sysctl_intvec,
-                .extra1         = &zero,
-        },
-        {
-                .ctl_name       = INOTIFY_MAX_USER_WATCHES,
-                .procname       = "max_user_watches",
-                .data           = &inotify_max_user_watches,
-                .maxlen         = sizeof(int),
-                .mode           = 0644,
-                .proc_handler   = &proc_dointvec_minmax,
-                .strategy       = &sysctl_intvec,
-                .extra1         = &zero, 
-        },
-        {
-                .ctl_name       = INOTIFY_MAX_QUEUED_EVENTS,
-                .procname       = "max_queued_events",
-                .data           = &inotify_max_queued_events,
-                .maxlen         = sizeof(int),
-                .mode           = 0644, 
-                .proc_handler   = &proc_dointvec_minmax,
-                .strategy       = &sysctl_intvec, 
-                .extra1         = &zero
-        },
-        { .ctl_name = 0 }
-};
-#endif /* CONFIG_SYSCTL */
-static inline void get_inotify_dev(struct inotify_device *dev)
 {
-        atomic_inc(&dev->count);
+        atomic_inc(&ih->count);
 }
-static inline void put_inotify_dev(struct inotify_device *dev)
+static inline void put_inotify_handle(struct inotify_handle *ih)
 {
-        if (atomic_dec_and_test(&dev->count)) {
+        if (atomic_dec_and_test(&ih->count)) {
-                atomic_dec(&dev->user->inotify_devs);
+                idr_destroy(&ih->idr);
-                free_uid(dev->user);
+                kfree(ih);
-                idr_destroy(&dev->idr);
-                kfree(dev);
        }
 }
-static inline void get_inotify_watch(struct inotify_watch *watch)
+/**
+ * get_inotify_watch - grab a reference to an inotify_watch
+ * @watch: watch to grab
+ */
+void get_inotify_watch(struct inotify_watch *watch)
 {
        atomic_inc(&watch->count);
 }
+EXPORT_SYMBOL_GPL(get_inotify_watch);
-/*
+/**
 * put_inotify_watch - decrements the ref count on a given watch.  cleans up
- * the watch and its references if the count reaches zero.
+ * watch references if the count reaches zero.  inotify_watch is freed by
+ * inotify callers via the destroy_watch() op.
+ * @watch: watch to release
 */
-static inline void put_inotify_watch(struct inotify_watch *watch)
+void put_inotify_watch(struct inotify_watch *watch)
 {
        if (atomic_dec_and_test(&watch->count)) {
-                put_inotify_dev(watch->dev);
+                struct inotify_handle *ih = watch->ih;
-                iput(watch->inode);
-                kmem_cache_free(watch_cachep, watch);
-        }
-}
-/*
- * kernel_event - create a new kernel event with the given parameters
- *
- * This function can sleep.
- */
-static struct inotify_kernel_event * kernel_event(s32 wd, u32 mask, u32 cookie,
-                                                  const char *name)
-{
-        struct inotify_kernel_event *kevent;
-        kevent = kmem_cache_alloc(event_cachep, GFP_KERNEL);
-        if (unlikely(!kevent))
-                return NULL;
-        /* we hand this out to user-space, so zero it just in case */
-        memset(&kevent->event, 0, sizeof(struct inotify_event));
-        kevent->event.wd = wd;
-        kevent->event.mask = mask;
-        kevent->event.cookie = cookie;
-        INIT_LIST_HEAD(&kevent->list);
-        if (name) {
-                size_t len, rem, event_size = sizeof(struct inotify_event);
-                /*
-                 * We need to pad the filename so as to properly align an
-                 * array of inotify_event structures.  Because the structure is
-                 * small and the common case is a small filename, we just round
-                 * up to the next multiple of the structure's sizeof.  This is
-                 * simple and safe for all architectures.
-                 */
-                len = strlen(name) + 1;
-                rem = event_size - len;
-                if (len > event_size) {
-                        rem = event_size - (len % event_size);
-                        if (len % event_size == 0)
-                                rem = 0;
-                }
-                kevent->name = kmalloc(len + rem, GFP_KERNEL);
-                if (unlikely(!kevent->name)) {
-                        kmem_cache_free(event_cachep, kevent);
-                        return NULL;
-                }
-                memcpy(kevent->name, name, len);
-                if (rem)
-                        memset(kevent->name + len, 0, rem);             
-                kevent->event.len = len + rem;
-        } else {
-                kevent->event.len = 0;
-                kevent->name = NULL;
-        }
-        return kevent;
-}
-/*
- * inotify_dev_get_event - return the next event in the given dev's queue
- *
- * Caller must hold dev->mutex.
- */
-static inline struct inotify_kernel_event *
-inotify_dev_get_event(struct inotify_device *dev)
-{
-        return list_entry(dev->events.next, struct inotify_kernel_event, list);
-}
-/*
- * inotify_dev_queue_event - add a new event to the given device
- *
- * Caller must hold dev->mutex.  Can sleep (calls kernel_event()).
- */
-static void inotify_dev_queue_event(struct inotify_device *dev,
-                                    struct inotify_watch *watch, u32 mask,
-                                    u32 cookie, const char *name)
-{
-        struct inotify_kernel_event *kevent, *last;
-        /* coalescing: drop this event if it is a dupe of the previous */
-        last = inotify_dev_get_event(dev);
-        if (last && last->event.mask == mask && last->event.wd == watch->wd &&
-                        last->event.cookie == cookie) {
-                const char *lastname = last->name;
-                if (!name && !lastname)
-                        return;
-                if (name && lastname && !strcmp(lastname, name))
-                        return;
-        }
-        /* the queue overflowed and we already sent the Q_OVERFLOW event */
-        if (unlikely(dev->event_count > dev->max_events))
-                return;
-        /* if the queue overflows, we need to notify user space */
-        if (unlikely(dev->event_count == dev->max_events))
-                kevent = kernel_event(-1, IN_Q_OVERFLOW, cookie, NULL);
-        else
-                kevent = kernel_event(watch->wd, mask, cookie, name);
-        if (unlikely(!kevent))
-                return;
-        /* queue the event and wake up anyone waiting */
-        dev->event_count++;
-        dev->queue_size += sizeof(struct inotify_event) + kevent->event.len;
-        list_add_tail(&kevent->list, &dev->events);
-        wake_up_interruptible(&dev->wq);
-}
-/*
- * remove_kevent - cleans up and ultimately frees the given kevent
- *
- * Caller must hold dev->mutex.
- */
-static void remove_kevent(struct inotify_device *dev,
-                          struct inotify_kernel_event *kevent)
-{
-        list_del(&kevent->list);
-        dev->event_count--;
-        dev->queue_size -= sizeof(struct inotify_event) + kevent->event.len;
-        kfree(kevent->name);
-        kmem_cache_free(event_cachep, kevent);
-}
-/*
+                iput(watch->inode);
- * inotify_dev_event_dequeue - destroy an event on the given device
+                ih->in_ops->destroy_watch(watch);
- *
+                put_inotify_handle(ih);
- * Caller must hold dev->mutex.
- */
-static void inotify_dev_event_dequeue(struct inotify_device *dev)
-{
-        if (!list_empty(&dev->events)) {
-                struct inotify_kernel_event *kevent;
-                kevent = inotify_dev_get_event(dev);
-                remove_kevent(dev, kevent);
        }
 }
+EXPORT_SYMBOL_GPL(put_inotify_watch);
 /*
- * inotify_dev_get_wd - returns the next WD for use by the given dev
+ * inotify_handle_get_wd - returns the next WD for use by the given handle
 *
- * Callers must hold dev->mutex.  This function can sleep.
+ * Callers must hold ih->mutex.  This function can sleep.
 */
-static int inotify_dev_get_wd(struct inotify_device *dev,
+static int inotify_handle_get_wd(struct inotify_handle *ih,
-                              struct inotify_watch *watch)
+                                 struct inotify_watch *watch)
 {
        int ret;
        do {
-                if (unlikely(!idr_pre_get(&dev->idr, GFP_KERNEL)))
+                if (unlikely(!idr_pre_get(&ih->idr, GFP_KERNEL)))
                        return -ENOSPC;
-                ret = idr_get_new_above(&dev->idr, watch, dev->last_wd+1, &watch->wd);
+                ret = idr_get_new_above(&ih->idr, watch, ih->last_wd+1, &watch->wd);
        } while (ret == -EAGAIN);
-        return ret;
+        if (likely(!ret))
-}
+                ih->last_wd = watch->wd;
-/*
+        return ret;
- * find_inode - resolve a user-given path to a specific inode and return a nd
- */
-static int find_inode(const char __user *dirname, struct nameidata *nd,
-                      unsigned flags)
-{
-        int error;
-        error = __user_walk(dirname, flags, nd);
-        if (error)
-                return error;
-        /* you can only watch an inode if you have read permissions on it */
-        error = vfs_permission(nd, MAY_READ);
-        if (error) 
-                path_release(nd);
-        return error;
 }
 /*
@@ -422,67 +188,18 @@ static void set_dentry_child_flags(struct inode *inode, int watched)
 }
 /*
- * create_watch - creates a watch on the given device.
+ * inode_find_handle - find the watch associated with the given inode and
- *
+ * handle
- * Callers must hold dev->mutex.  Calls inotify_dev_get_wd() so may sleep.
- * Both 'dev' and 'inode' (by way of nameidata) need to be pinned.
- */
-static struct inotify_watch *create_watch(struct inotify_device *dev,
-                                          u32 mask, struct inode *inode)
-{
-        struct inotify_watch *watch;
-        int ret;
-        if (atomic_read(&dev->user->inotify_watches) >=
-                        inotify_max_user_watches)
-                return ERR_PTR(-ENOSPC);
-        watch = kmem_cache_alloc(watch_cachep, GFP_KERNEL);
-        if (unlikely(!watch))
-                return ERR_PTR(-ENOMEM);
-        ret = inotify_dev_get_wd(dev, watch);
-        if (unlikely(ret)) {
-                kmem_cache_free(watch_cachep, watch);
-                return ERR_PTR(ret);
-        }
-        dev->last_wd = watch->wd;
-        watch->mask = mask;
-        atomic_set(&watch->count, 0);
-        INIT_LIST_HEAD(&watch->d_list);
-        INIT_LIST_HEAD(&watch->i_list);
-        /* save a reference to device and bump the count to make it official */
-        get_inotify_dev(dev);
-        watch->dev = dev;
-        /*
-         * Save a reference to the inode and bump the ref count to make it
-         * official.  We hold a reference to nameidata, which makes this safe.
-         */
-        watch->inode = igrab(inode);
-        /* bump our own count, corresponding to our entry in dev->watches */
-        get_inotify_watch(watch);
-        atomic_inc(&dev->user->inotify_watches);
-        return watch;
-}
-/*
- * inotify_find_dev - find the watch associated with the given inode and dev
 *
 * Callers must hold inode->inotify_mutex.
 */
-static struct inotify_watch *inode_find_dev(struct inode *inode,
+static struct inotify_watch *inode_find_handle(struct inode *inode,
-                                            struct inotify_device *dev)
+                                               struct inotify_handle *ih)
 {
        struct inotify_watch *watch;
        list_for_each_entry(watch, &inode->inotify_watches, i_list) {
-                if (watch->dev == dev)
+                if (watch->ih == ih)
                        return watch;
        }
@@ -490,40 +207,40 @@ static struct inotify_watch *inode_find_dev(struct inode *inode,
 }
 /*
- * remove_watch_no_event - remove_watch() without the IN_IGNORED event.
+ * remove_watch_no_event - remove watch without the IN_IGNORED event.
+ *
+ * Callers must hold both inode->inotify_mutex and ih->mutex.
 */
 static void remove_watch_no_event(struct inotify_watch *watch,
-                                  struct inotify_device *dev)
+                                  struct inotify_handle *ih)
 {
        list_del(&watch->i_list);
-        list_del(&watch->d_list);
+        list_del(&watch->h_list);
        if (!inotify_inode_watched(watch->inode))
                set_dentry_child_flags(watch->inode, 0);
-        atomic_dec(&dev->user->inotify_watches);
+        idr_remove(&ih->idr, watch->wd);
-        idr_remove(&dev->idr, watch->wd);
-        put_inotify_watch(watch);
 }
-/*
+/**
- * remove_watch - Remove a watch from both the device and the inode.  Sends
+ * inotify_remove_watch_locked - Remove a watch from both the handle and the
- * the IN_IGNORED event to the given device signifying that the inode is no
+ * inode.  Sends the IN_IGNORED event signifying that the inode is no longer
- * longer watched.
+ * watched.  May be invoked from a caller's event handler.
- *
+ * @ih: inotify handle associated with watch
- * Callers must hold both inode->inotify_mutex and dev->mutex.  We drop a
+ * @watch: watch to remove
- * reference to the inode before returning.
 *
- * The inode is not iput() so as to remain atomic.  If the inode needs to be
+ * Callers must hold both inode->inotify_mutex and ih->mutex.
- * iput(), the call returns one.  Otherwise, it returns zero.
 */
-static void remove_watch(struct inotify_watch *watch,struct inotify_device *dev)
+void inotify_remove_watch_locked(struct inotify_handle *ih,
+                                 struct inotify_watch *watch)
 {
-        inotify_dev_queue_event(dev, watch, IN_IGNORED, 0, NULL);
+        remove_watch_no_event(watch, ih);
-        remove_watch_no_event(watch, dev);
+        ih->in_ops->handle_event(watch, watch->wd, IN_IGNORED, 0, NULL, NULL);
 }
+EXPORT_SYMBOL_GPL(inotify_remove_watch_locked);
-/* Kernel API */
+/* Kernel API for producing events */
 /*
 * inotify_d_instantiate - instantiate dcache entry for inode
@@ -563,9 +280,10 @@ void inotify_d_move(struct dentry *entry)
 * @mask: event mask describing this event
 * @cookie: cookie for synchronization, or zero
 * @name: filename, if any
+ * @n_inode: inode associated with name
 */
 void inotify_inode_queue_event(struct inode *inode, u32 mask, u32 cookie,
-                               const char *name)
+                               const char *name, struct inode *n_inode)
 {
        struct inotify_watch *watch, *next;
@@ -576,14 +294,13 @@ void inotify_inode_queue_event(struct inode *inode, u32 mask, u32 cookie,
        list_for_each_entry_safe(watch, next, &inode->inotify_watches, i_list) {
                u32 watch_mask = watch->mask;
                if (watch_mask & mask) {
-                        struct inotify_device *dev = watch->dev;
+                        struct inotify_handle *ih= watch->ih;
-                        get_inotify_watch(watch);
+                        mutex_lock(&ih->mutex);
-                        mutex_lock(&dev->mutex);
-                        inotify_dev_queue_event(dev, watch, mask, cookie, name);
                        if (watch_mask & IN_ONESHOT)
-                                remove_watch_no_event(watch, dev);
+                                remove_watch_no_event(watch, ih);
-                        mutex_unlock(&dev->mutex);
+                        ih->in_ops->handle_event(watch, watch->wd, mask, cookie,
-                        put_inotify_watch(watch);
+                                                 name, n_inode);
+                        mutex_unlock(&ih->mutex);
                }
        }
        mutex_unlock(&inode->inotify_mutex);
@@ -613,7 +330,8 @@ void inotify_dentry_parent_queue_event(struct dentry *dentry, u32 mask,
        if (inotify_inode_watched(inode)) {
                dget(parent);
                spin_unlock(&dentry->d_lock);
-                inotify_inode_queue_event(inode, mask, cookie, name);
+                inotify_inode_queue_event(inode, mask, cookie, name,
+                                          dentry->d_inode);
                dput(parent);
        } else
                spin_unlock(&dentry->d_lock);
@@ -665,7 +383,7 @@ void inotify_unmount_inodes(struct list_head *list)
                need_iput_tmp = need_iput;
                need_iput = NULL;
-                /* In case the remove_watch() drops a reference. */
+                /* In case inotify_remove_watch_locked() drops a reference. */
                if (inode != need_iput_tmp)
                        __iget(inode);
                else
@@ -694,11 +412,12 @@ void inotify_unmount_inodes(struct list_head *list)
                mutex_lock(&inode->inotify_mutex);
                watches = &inode->inotify_watches;
                list_for_each_entry_safe(watch, next_w, watches, i_list) {
-                        struct inotify_device *dev = watch->dev;
+                        struct inotify_handle *ih= watch->ih;
-                        mutex_lock(&dev->mutex);
+                        mutex_lock(&ih->mutex);
-                        inotify_dev_queue_event(dev, watch, IN_UNMOUNT,0,NULL);
+                        ih->in_ops->handle_event(watch, watch->wd, IN_UNMOUNT, 0,
-                        remove_watch(watch, dev);
+                                                 NULL, NULL);
-                        mutex_unlock(&dev->mutex);
+                        inotify_remove_watch_locked(ih, watch);
+                        mutex_unlock(&ih->mutex);
                }
                mutex_unlock(&inode->inotify_mutex);
                iput(inode);            
@@ -718,432 +437,292 @@ void inotify_inode_is_dead(struct inode *inode)
        mutex_lock(&inode->inotify_mutex);
        list_for_each_entry_safe(watch, next, &inode->inotify_watches, i_list) {
-                struct inotify_device *dev = watch->dev;
+                struct inotify_handle *ih = watch->ih;
-                mutex_lock(&dev->mutex);
+                mutex_lock(&ih->mutex);
-                remove_watch(watch, dev);
+                inotify_remove_watch_locked(ih, watch);
-                mutex_unlock(&dev->mutex);
+                mutex_unlock(&ih->mutex);
        }
        mutex_unlock(&inode->inotify_mutex);
 }
 EXPORT_SYMBOL_GPL(inotify_inode_is_dead);
-/* Device Interface */
+/* Kernel Consumer API */
-static unsigned int inotify_poll(struct file *file, poll_table *wait)
+/**
+ * inotify_init - allocate and initialize an inotify instance
+ * @ops: caller's inotify operations
+ */
+struct inotify_handle *inotify_init(const struct inotify_operations *ops)
 {
-        struct inotify_device *dev = file->private_data;
+        struct inotify_handle *ih;
-        int ret = 0;
-        poll_wait(file, &dev->wq, wait);
+        ih = kmalloc(sizeof(struct inotify_handle), GFP_KERNEL);
-        mutex_lock(&dev->mutex);
+        if (unlikely(!ih))
-        if (!list_empty(&dev->events))
+                return ERR_PTR(-ENOMEM);
-                ret = POLLIN | POLLRDNORM;
-        mutex_unlock(&dev->mutex);
-        return ret;
+        idr_init(&ih->idr);
+        INIT_LIST_HEAD(&ih->watches);
+        mutex_init(&ih->mutex);
+        ih->last_wd = 0;
+        ih->in_ops = ops;
+        atomic_set(&ih->count, 0);
+        get_inotify_handle(ih);
+        return ih;
 }
+EXPORT_SYMBOL_GPL(inotify_init);
-static ssize_t inotify_read(struct file *file, char __user *buf,
+/**
-                            size_t count, loff_t *pos)
+ * inotify_init_watch - initialize an inotify watch
+ * @watch: watch to initialize
+ */
+void inotify_init_watch(struct inotify_watch *watch)
 {
-        size_t event_size = sizeof (struct inotify_event);
+        INIT_LIST_HEAD(&watch->h_list);
-        struct inotify_device *dev;
+        INIT_LIST_HEAD(&watch->i_list);
-        char __user *start;
+        atomic_set(&watch->count, 0);
-        int ret;
+        get_inotify_watch(watch); /* initial get */
-        DEFINE_WAIT(wait);
-        start = buf;
-        dev = file->private_data;
-        while (1) {
-                int events;
-                prepare_to_wait(&dev->wq, &wait, TASK_INTERRUPTIBLE);
-                mutex_lock(&dev->mutex);
-                events = !list_empty(&dev->events);
-                mutex_unlock(&dev->mutex);
-                if (events) {
-                        ret = 0;
-                        break;
-                }
-                if (file->f_flags & O_NONBLOCK) {
-                        ret = -EAGAIN;
-                        break;
-                }
-                if (signal_pending(current)) {
-                        ret = -EINTR;
-                        break;
-                }
-                schedule();
-        }
-        finish_wait(&dev->wq, &wait);
-        if (ret)
-                return ret;
-        mutex_lock(&dev->mutex);
-        while (1) {
-                struct inotify_kernel_event *kevent;
-                ret = buf - start;
-                if (list_empty(&dev->events))
-                        break;
-                kevent = inotify_dev_get_event(dev);
-                if (event_size + kevent->event.len > count)
-                        break;
-                if (copy_to_user(buf, &kevent->event, event_size)) {
-                        ret = -EFAULT;
-                        break;
-                }
-                buf += event_size;
-                count -= event_size;
-                if (kevent->name) {
-                        if (copy_to_user(buf, kevent->name, kevent->event.len)){
-                                ret = -EFAULT;
-                                break;
-                        }
-                        buf += kevent->event.len;
-                        count -= kevent->event.len;
-                }
-                remove_kevent(dev, kevent);
-        }
-        mutex_unlock(&dev->mutex);
-        return ret;
 }
+EXPORT_SYMBOL_GPL(inotify_init_watch);
-static int inotify_release(struct inode *ignored, struct file *file)
+/**
+ * inotify_destroy - clean up and destroy an inotify instance
+ * @ih: inotify handle
+ */
+void inotify_destroy(struct inotify_handle *ih)
 {
-        struct inotify_device *dev = file->private_data;
        /*
-         * Destroy all of the watches on this device.  Unfortunately, not very
+         * Destroy all of the watches for this handle. Unfortunately, not very
         * pretty.  We cannot do a simple iteration over the list, because we
         * do not know the inode until we iterate to the watch.  But we need to
-         * hold inode->inotify_mutex before dev->mutex.  The following works.
+         * hold inode->inotify_mutex before ih->mutex.  The following works.
         */
        while (1) {
                struct inotify_watch *watch;
                struct list_head *watches;
                struct inode *inode;
-                mutex_lock(&dev->mutex);
+                mutex_lock(&ih->mutex);
-                watches = &dev->watches;
+                watches = &ih->watches;
                if (list_empty(watches)) {
-                        mutex_unlock(&dev->mutex);
+                        mutex_unlock(&ih->mutex);
                        break;
                }
-                watch = list_entry(watches->next, struct inotify_watch, d_list);
+                watch = list_entry(watches->next, struct inotify_watch, h_list);
                get_inotify_watch(watch);
-                mutex_unlock(&dev->mutex);
+                mutex_unlock(&ih->mutex);
                inode = watch->inode;
                mutex_lock(&inode->inotify_mutex);
-                mutex_lock(&dev->mutex);
+                mutex_lock(&ih->mutex);
                /* make sure we didn't race with another list removal */
-                if (likely(idr_find(&dev->idr, watch->wd)))
+                if (likely(idr_find(&ih->idr, watch->wd))) {
-                        remove_watch_no_event(watch, dev);
+                        remove_watch_no_event(watch, ih);
+                        put_inotify_watch(watch);
+                }
-                mutex_unlock(&dev->mutex);
+                mutex_unlock(&ih->mutex);
                mutex_unlock(&inode->inotify_mutex);
                put_inotify_watch(watch);
        }
-        /* destroy all of the events on this device */
+        /* free this handle: the put matching the get in inotify_init() */
-        mutex_lock(&dev->mutex);
+        put_inotify_handle(ih);
-        while (!list_empty(&dev->events))
-                inotify_dev_event_dequeue(dev);
-        mutex_unlock(&dev->mutex);
-        /* free this device: the put matching the get in inotify_init() */
-        put_inotify_dev(dev);
-        return 0;
 }
+EXPORT_SYMBOL_GPL(inotify_destroy);
-/*
+/**
- * inotify_ignore - remove a given wd from this inotify instance.
+ * inotify_find_watch - find an existing watch for an (ih,inode) pair
+ * @ih: inotify handle
+ * @inode: inode to watch
+ * @watchp: set to the existing inotify_watch, if found; caller must put it
 *
- * Can sleep.
+ * Caller must pin given inode (via nameidata).
 */
-static int inotify_ignore(struct inotify_device *dev, s32 wd)
+s32 inotify_find_watch(struct inotify_handle *ih, struct inode *inode,
+                       struct inotify_watch **watchp)
 {
-        struct inotify_watch *watch;
+        struct inotify_watch *old;
-        struct inode *inode;
+        int ret = -ENOENT;
-        mutex_lock(&dev->mutex);
-        watch = idr_find(&dev->idr, wd);
-        if (unlikely(!watch)) {
-                mutex_unlock(&dev->mutex);
-                return -EINVAL;
-        }
-        get_inotify_watch(watch);
-        inode = watch->inode;
-        mutex_unlock(&dev->mutex);
        mutex_lock(&inode->inotify_mutex);
-        mutex_lock(&dev->mutex);
+        mutex_lock(&ih->mutex);
-        /* make sure that we did not race */
+        old = inode_find_handle(inode, ih);
-        if (likely(idr_find(&dev->idr, wd) == watch))
+        if (unlikely(old)) {
-                remove_watch(watch, dev);
+                get_inotify_watch(old); /* caller must put watch */
+                *watchp = old;
+                ret = old->wd;
+        }
-        mutex_unlock(&dev->mutex);
+        mutex_unlock(&ih->mutex);
        mutex_unlock(&inode->inotify_mutex);
-        put_inotify_watch(watch);
-        return 0;
+        return ret;
 }
+EXPORT_SYMBOL_GPL(inotify_find_watch);
-static long inotify_ioctl(struct file *file, unsigned int cmd,
+/**
-                          unsigned long arg)
+ * inotify_find_update_watch - find and update the mask of an existing watch
+ * @ih: inotify handle
+ * @inode: inode whose watch should be updated
+ * @mask: mask of events to watch
+ *
+ * Caller must pin given inode (via nameidata).
+ */
+s32 inotify_find_update_watch(struct inotify_handle *ih, struct inode *inode,
+                              u32 mask)
 {
-        struct inotify_device *dev;
+        struct inotify_watch *old;
-        void __user *p;
+        int mask_add = 0;
-        int ret = -ENOTTY;
+        int ret;
-        dev = file->private_data;
-        p = (void __user *) arg;
-        switch (cmd) {
-        case FIONREAD:
-                ret = put_user(dev->queue_size, (int __user *) p);
-                break;
-        }
-        return ret;
-}
-static const struct file_operations inotify_fops = {
+        if (mask & IN_MASK_ADD)
-        .poll           = inotify_poll,
+                mask_add = 1;
-        .read           = inotify_read,
-        .release        = inotify_release,
-        .unlocked_ioctl = inotify_ioctl,
-        .compat_ioctl   = inotify_ioctl,
-};
-asmlinkage long sys_inotify_init(void)
+        /* don't allow invalid bits: we don't want flags set */
-{
+        mask &= IN_ALL_EVENTS | IN_ONESHOT;
-        struct inotify_device *dev;
+        if (unlikely(!mask))
-        struct user_struct *user;
+                return -EINVAL;
-        struct file *filp;      
-        int fd, ret;
-        fd = get_unused_fd();
-        if (fd < 0)
-                return fd;
-        filp = get_empty_filp();
-        if (!filp) {
-                ret = -ENFILE;
-                goto out_put_fd;
-        }
-        user = get_uid(current->user);
+        mutex_lock(&inode->inotify_mutex);
-        if (unlikely(atomic_read(&user->inotify_devs) >=
+        mutex_lock(&ih->mutex);
-                        inotify_max_user_instances)) {
-                ret = -EMFILE;
-                goto out_free_uid;
-        }
-        dev = kmalloc(sizeof(struct inotify_device), GFP_KERNEL);
+        /*
-        if (unlikely(!dev)) {
+         * Handle the case of re-adding a watch on an (inode,ih) pair that we
-                ret = -ENOMEM;
+         * are already watching.  We just update the mask and return its wd.
-                goto out_free_uid;
+         */
+        old = inode_find_handle(inode, ih);
+        if (unlikely(!old)) {
+                ret = -ENOENT;
+                goto out;
        }
-        filp->f_op = &inotify_fops;
+        if (mask_add)
-        filp->f_vfsmnt = mntget(inotify_mnt);
+                old->mask |= mask;
-        filp->f_dentry = dget(inotify_mnt->mnt_root);
+        else
-        filp->f_mapping = filp->f_dentry->d_inode->i_mapping;
+                old->mask = mask;
-        filp->f_mode = FMODE_READ;
+        ret = old->wd;
-        filp->f_flags = O_RDONLY;
+out:
-        filp->private_data = dev;
+        mutex_unlock(&ih->mutex);
+        mutex_unlock(&inode->inotify_mutex);
-        idr_init(&dev->idr);
-        INIT_LIST_HEAD(&dev->events);
-        INIT_LIST_HEAD(&dev->watches);
-        init_waitqueue_head(&dev->wq);
-        mutex_init(&dev->mutex);
-        dev->event_count = 0;
-        dev->queue_size = 0;
-        dev->max_events = inotify_max_queued_events;
-        dev->user = user;
-        dev->last_wd = 0;
-        atomic_set(&dev->count, 0);
-        get_inotify_dev(dev);
-        atomic_inc(&user->inotify_devs);
-        fd_install(fd, filp);
-        return fd;
-out_free_uid:
-        free_uid(user);
-        put_filp(filp);
-out_put_fd:
-        put_unused_fd(fd);
        return ret;
 }
+EXPORT_SYMBOL_GPL(inotify_find_update_watch);
-asmlinkage long sys_inotify_add_watch(int fd, const char __user *path, u32 mask)
+/**
+ * inotify_add_watch - add a watch to an inotify instance
+ * @ih: inotify handle
+ * @watch: caller allocated watch structure
+ * @inode: inode to watch
+ * @mask: mask of events to watch
+ *
+ * Caller must pin given inode (via nameidata).
+ * Caller must ensure it only calls inotify_add_watch() once per watch.
+ * Calls inotify_handle_get_wd() so may sleep.
+ */
+s32 inotify_add_watch(struct inotify_handle *ih, struct inotify_watch *watch,
+                      struct inode *inode, u32 mask)
 {
-        struct inotify_watch *watch, *old;
+        int ret = 0;
-        struct inode *inode;
-        struct inotify_device *dev;
-        struct nameidata nd;
-        struct file *filp;
-        int ret, fput_needed;
-        int mask_add = 0;
-        unsigned flags = 0;
-        filp = fget_light(fd, &fput_needed);
-        if (unlikely(!filp))
-                return -EBADF;
-        /* verify that this is indeed an inotify instance */
-        if (unlikely(filp->f_op != &inotify_fops)) {
-                ret = -EINVAL;
-                goto fput_and_out;
-        }
-        if (!(mask & IN_DONT_FOLLOW))
-                flags |= LOOKUP_FOLLOW;
-        if (mask & IN_ONLYDIR)
-                flags |= LOOKUP_DIRECTORY;
-        ret = find_inode(path, &nd, flags);
-        if (unlikely(ret))
-                goto fput_and_out;
-        /* inode held in place by reference to nd; dev by fget on fd */
+        /* don't allow invalid bits: we don't want flags set */
-        inode = nd.dentry->d_inode;
+        mask &= IN_ALL_EVENTS | IN_ONESHOT;
-        dev = filp->private_data;
+        if (unlikely(!mask))
+                return -EINVAL;
+        watch->mask = mask;
        mutex_lock(&inode->inotify_mutex);
-        mutex_lock(&dev->mutex);
+        mutex_lock(&ih->mutex);
-        if (mask & IN_MASK_ADD)
-                mask_add = 1;
-        /* don't let user-space set invalid bits: we don't want flags set */
+        /* Initialize a new watch */
-        mask &= IN_ALL_EVENTS | IN_ONESHOT;
+        ret = inotify_handle_get_wd(ih, watch);
-        if (unlikely(!mask)) {
+        if (unlikely(ret))
-                ret = -EINVAL;
                goto out;
-        }
+        ret = watch->wd;
+        /* save a reference to handle and bump the count to make it official */
+        get_inotify_handle(ih);
+        watch->ih = ih;
        /*
-         * Handle the case of re-adding a watch on an (inode,dev) pair that we
+         * Save a reference to the inode and bump the ref count to make it
-         * are already watching.  We just update the mask and return its wd.
+         * official.  We hold a reference to nameidata, which makes this safe.
         */
-        old = inode_find_dev(inode, dev);
+        watch->inode = igrab(inode);
-        if (unlikely(old)) {
-                if (mask_add)
-                        old->mask |= mask;
-                else
-                        old->mask = mask;
-                ret = old->wd;
-                goto out;
-        }
-        watch = create_watch(dev, mask, inode);
-        if (unlikely(IS_ERR(watch))) {
-                ret = PTR_ERR(watch);
-                goto out;
-        }
        if (!inotify_inode_watched(inode))
                set_dentry_child_flags(inode, 1);
-        /* Add the watch to the device's and the inode's list */
+        /* Add the watch to the handle's and the inode's list */
-        list_add(&watch->d_list, &dev->watches);
+        list_add(&watch->h_list, &ih->watches);
        list_add(&watch->i_list, &inode->inotify_watches);
-        ret = watch->wd;
 out:
-        mutex_unlock(&dev->mutex);
+        mutex_unlock(&ih->mutex);
        mutex_unlock(&inode->inotify_mutex);
-        path_release(&nd);
-fput_and_out:
-        fput_light(filp, fput_needed);
        return ret;
 }
+EXPORT_SYMBOL_GPL(inotify_add_watch);
-asmlinkage long sys_inotify_rm_watch(int fd, u32 wd)
+/**
+ * inotify_rm_wd - remove a watch, given its wd, from an inotify instance
+ * @ih: inotify handle
+ * @wd: watch descriptor to remove
+ *
+ * Can sleep.
+ */
+int inotify_rm_wd(struct inotify_handle *ih, u32 wd)
 {
-        struct file *filp;
+        struct inotify_watch *watch;
-        struct inotify_device *dev;
+        struct inode *inode;
-        int ret, fput_needed;
-        filp = fget_light(fd, &fput_needed);
-        if (unlikely(!filp))
-                return -EBADF;
-        /* verify that this is indeed an inotify instance */
+        mutex_lock(&ih->mutex);
-        if (unlikely(filp->f_op != &inotify_fops)) {
+        watch = idr_find(&ih->idr, wd);
-                ret = -EINVAL;
+        if (unlikely(!watch)) {
-                goto out;
+                mutex_unlock(&ih->mutex);
+                return -EINVAL;
        }
+        get_inotify_watch(watch);
+        inode = watch->inode;
+        mutex_unlock(&ih->mutex);
-        dev = filp->private_data;
+        mutex_lock(&inode->inotify_mutex);
-        ret = inotify_ignore(dev, wd);
+        mutex_lock(&ih->mutex);
-out:
+        /* make sure that we did not race */
-        fput_light(filp, fput_needed);
+        if (likely(idr_find(&ih->idr, wd) == watch))
-        return ret;
+                inotify_remove_watch_locked(ih, watch);
+        mutex_unlock(&ih->mutex);
+        mutex_unlock(&inode->inotify_mutex);
+        put_inotify_watch(watch);
+        return 0;
 }
+EXPORT_SYMBOL_GPL(inotify_rm_wd);
-static struct super_block *
+/**
-inotify_get_sb(struct file_system_type *fs_type, int flags,
+ * inotify_rm_watch - remove a watch from an inotify instance
-               const char *dev_name, void *data)
+ * @ih: inotify handle
+ * @watch: watch to remove
+ *
+ * Can sleep.
+ */
+int inotify_rm_watch(struct inotify_handle *ih,
+                     struct inotify_watch *watch)
 {
-    return get_sb_pseudo(fs_type, "inotify", NULL, 0xBAD1DEA);
+        return inotify_rm_wd(ih, watch->wd);
 }
+EXPORT_SYMBOL_GPL(inotify_rm_watch);
-static struct file_system_type inotify_fs_type = {
-    .name           = "inotifyfs",
-    .get_sb         = inotify_get_sb,
-    .kill_sb        = kill_anon_super,
-};
 /*
- * inotify_setup - Our initialization function.  Note that we cannnot return
+ * inotify_setup - core initialization function
- * error because we have compiled-in VFS hooks.  So an (unlikely) failure here
- * must result in panic().
 */
 static int __init inotify_setup(void)
 {
-        int ret;
-        ret = register_filesystem(&inotify_fs_type);
-        if (unlikely(ret))
-                panic("inotify: register_filesystem returned %d!\n", ret);
-        inotify_mnt = kern_mount(&inotify_fs_type);
-        if (IS_ERR(inotify_mnt))
-                panic("inotify: kern_mount ret %ld!\n", PTR_ERR(inotify_mnt));
-        inotify_max_queued_events = 16384;
-        inotify_max_user_instances = 128;
-        inotify_max_user_watches = 8192;
        atomic_set(&inotify_cookie, 0);
-        watch_cachep = kmem_cache_create("inotify_watch_cache",
-                                         sizeof(struct inotify_watch),
-                                         0, SLAB_PANIC, NULL, NULL);
-        event_cachep = kmem_cache_create("inotify_event_cache",
-                                         sizeof(struct inotify_kernel_event),
-                                         0, SLAB_PANIC, NULL, NULL);
        return 0;
 }
diff --git a/fs/inotify_user.c b/fs/inotify_user.c
new file mode 100644
index 000000000000..f2386442adee
--- /dev/null
+++ b/fs/inotify_user.c
@@ -0,0 +1,719 @@
+/*
+ * fs/inotify_user.c - inotify support for userspace
+ *
+ * Authors:
+ *      John McCutchan  <ttb@tentacle.dhs.org>
+ *      Robert Love     <rml@novell.com>
+ *
+ * Copyright (C) 2005 John McCutchan
+ * Copyright 2006 Hewlett-Packard Development Company, L.P.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/poll.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/inotify.h>
+#include <linux/syscalls.h>
+#include <asm/ioctls.h>
+static kmem_cache_t *watch_cachep __read_mostly; /* slab of struct inotify_user_watch */
+static kmem_cache_t *event_cachep __read_mostly; /* slab of struct inotify_kernel_event */
+static struct vfsmount *inotify_mnt __read_mostly; /* kern_mount() of inotifyfs, set in inotify_user_setup() */
+/* these are configurable via /proc/sys/fs/inotify/ */
+int inotify_max_user_instances __read_mostly; /* limit checked in sys_inotify_init() */
+int inotify_max_user_watches __read_mostly; /* limit checked in create_watch() */
+int inotify_max_queued_events __read_mostly; /* copied into dev->max_events by sys_inotify_init() */
+/*
+ * Lock ordering:
+ *
+ * inotify_dev->up_mutex (ensures we don't re-add the same watch)
+ *      inode->inotify_mutex (protects inode's watch list)
+ *              inotify_handle->mutex (protects inotify_handle's watch list)
+ *                      inotify_dev->ev_mutex (protects device's event queue)
+ */
+/*
+ * Lifetimes of the main data structures:
+ *
+ * inotify_device: Lifetime is managed by reference count, from
+ * sys_inotify_init() until release.  Additional references can bump the count
+ * via get_inotify_dev() and drop the count via put_inotify_dev().
+ *
+ * inotify_user_watch: Lifetime is from create_watch() to the receipt of an
+ * IN_IGNORED event from inotify, or when using IN_ONESHOT, to receipt of the
+ * first event, or to inotify_destroy().
+ */
+/*
+ * struct inotify_device - represents an inotify instance
+ *
+ * There is no single lock covering the whole structure.  The event queue
+ * ('events', and the 'event_count'/'queue_size' accounting) is modified under
+ * 'ev_mutex'; watch additions and updates are serialized by 'up_mutex'; and
+ * 'count' is an atomic reference count manipulated only through
+ * get_inotify_dev() and put_inotify_dev().
+ */
+struct inotify_device {
+        wait_queue_head_t       wq;             /* wait queue for i/o */
+        struct mutex            ev_mutex;       /* protects event queue */
+        struct mutex            up_mutex;       /* synchronizes watch updates */
+        struct list_head        events;         /* list of queued events */
+        atomic_t                count;          /* reference count */
+        struct user_struct      *user;          /* user who opened this dev */
+        struct inotify_handle   *ih;            /* inotify handle */
+        unsigned int            queue_size;     /* size of the queue (bytes) */
+        unsigned int            event_count;    /* number of pending events */
+        unsigned int            max_events;     /* maximum number of events */
+};
+/*
+ * struct inotify_kernel_event - An inotify event, originating from a watch and
+ * queued for user-space.  A list of these is attached to each instance of the
+ * device.  In read(), this list is walked and all events that can fit in the
+ * buffer are returned.
+ *
+ * 'name' is either NULL (in which case event.len is 0) or a kmalloc()ed copy
+ * of the filename, zero-padded so that its allocated size equals event.len.
+ * Both the name buffer and the event itself are released by remove_kevent().
+ *
+ * Protected by dev->ev_mutex of the device in which we are queued.
+ */
+struct inotify_kernel_event {
+        struct inotify_event    event;  /* the user-space event */
+        struct list_head        list;   /* entry in inotify_device's list */
+        char                    *name;  /* filename, if any */
+};
+/*
+ * struct inotify_user_watch - our version of an inotify_watch, we add
+ * a reference to the associated inotify_device.
+ *
+ * The callbacks in this file are handed a pointer to the embedded 'wdata' and
+ * use container_of() to get back to the enclosing inotify_user_watch.
+ */
+struct inotify_user_watch {
+        struct inotify_device   *dev;   /* associated device */
+        struct inotify_watch    wdata;  /* inotify watch data */
+};
+#ifdef CONFIG_SYSCTL
+#include <linux/sysctl.h>
+static int zero; /* referenced through .extra1 by every entry below */
+ctl_table inotify_table[] = { /* one entry per tunable limit; not registered in this file */
+        {
+                .ctl_name       = INOTIFY_MAX_USER_INSTANCES,
+                .procname       = "max_user_instances",
+                .data           = &inotify_max_user_instances,
+                .maxlen         = sizeof(int),
+                .mode           = 0644,
+                .proc_handler   = &proc_dointvec_minmax,
+                .strategy       = &sysctl_intvec,
+                .extra1         = &zero, /* NOTE(review): presumably the minimum accepted value - confirm against proc_dointvec_minmax */
+        },
+        {
+                .ctl_name       = INOTIFY_MAX_USER_WATCHES,
+                .procname       = "max_user_watches",
+                .data           = &inotify_max_user_watches,
+                .maxlen         = sizeof(int),
+                .mode           = 0644,
+                .proc_handler   = &proc_dointvec_minmax,
+                .strategy       = &sysctl_intvec,
+                .extra1         = &zero,
+        },
+        {
+                .ctl_name       = INOTIFY_MAX_QUEUED_EVENTS,
+                .procname       = "max_queued_events",
+                .data           = &inotify_max_queued_events,
+                .maxlen         = sizeof(int),
+                .mode           = 0644,
+                .proc_handler   = &proc_dointvec_minmax,
+                .strategy       = &sysctl_intvec,
+                .extra1         = &zero
+        },
+        { .ctl_name = 0 } /* end-of-table marker */
+};
+#endif /* CONFIG_SYSCTL */
+/*
+ * get_inotify_dev - take an additional reference on the given device
+ */
+static inline void get_inotify_dev(struct inotify_device *dev)
+{
+        atomic_inc(&dev->count);
+}
+/*
+ * put_inotify_dev - drop a reference on the given device
+ *
+ * When the last reference goes away the owning user's instance count is
+ * decremented, the user reference is released and the device is freed.
+ */
+static inline void put_inotify_dev(struct inotify_device *dev)
+{
+        /* somebody else still holds a reference: nothing more to do */
+        if (!atomic_dec_and_test(&dev->count))
+                return;
+        atomic_dec(&dev->user->inotify_devs);
+        free_uid(dev->user);
+        kfree(dev);
+}
+/*
+ * free_inotify_user_watch - release a user watch and everything it pins
+ *
+ * Drops the per-user watch count and the device reference taken in
+ * create_watch(), then returns the watch to its slab cache.
+ */
+static void free_inotify_user_watch(struct inotify_watch *w)
+{
+        struct inotify_user_watch *uwatch =
+                container_of(w, struct inotify_user_watch, wdata);
+        struct inotify_device *owner = uwatch->dev;
+        atomic_dec(&owner->user->inotify_watches);
+        put_inotify_dev(owner);
+        kmem_cache_free(watch_cachep, uwatch);
+}
+/*
+ * kernel_event - allocate and fill in a new kernel event
+ *
+ * Returns the new event, or NULL if either the event or its name buffer
+ * could not be allocated.  When @name is given it is copied into a buffer
+ * whose size is rounded up to a multiple of sizeof(struct inotify_event),
+ * with the tail zero-filled, and event.len is set to that padded size.
+ *
+ * This function can sleep.
+ */
+static struct inotify_kernel_event * kernel_event(s32 wd, u32 mask, u32 cookie,
+                                                  const char *name)
+{
+        struct inotify_kernel_event *ev;
+        size_t unit = sizeof(struct inotify_event);
+        size_t namelen, pad;
+        ev = kmem_cache_alloc(event_cachep, GFP_KERNEL);
+        if (unlikely(!ev))
+                return NULL;
+        /* this is copied to user-space later, so start from all zeroes */
+        memset(&ev->event, 0, sizeof(struct inotify_event));
+        ev->event.wd = wd;
+        ev->event.mask = mask;
+        ev->event.cookie = cookie;
+        INIT_LIST_HEAD(&ev->list);
+        if (!name) {
+                ev->event.len = 0;
+                ev->name = NULL;
+                return ev;
+        }
+        /*
+         * Pad the filename (including its NUL) up to the next multiple of
+         * the event structure size so that an array of events stays
+         * properly aligned; no padding is needed when it already is one.
+         */
+        namelen = strlen(name) + 1;
+        pad = (unit - namelen % unit) % unit;
+        ev->name = kmalloc(namelen + pad, GFP_KERNEL);
+        if (unlikely(!ev->name)) {
+                kmem_cache_free(event_cachep, ev);
+                return NULL;
+        }
+        memcpy(ev->name, name, namelen);
+        if (pad)
+                memset(ev->name + namelen, 0, pad);
+        ev->event.len = namelen + pad;
+        return ev;
+}
+/*
+ * inotify_dev_get_event - return the next event in the given dev's queue
+ *
+ * Returns the entry at the head of dev->events, i.e. the oldest queued event
+ * (events are appended with list_add_tail()).  No emptiness check is made
+ * here: callers must test list_empty(&dev->events) first, otherwise the
+ * returned pointer is computed from the list head itself and does not refer
+ * to a valid event.
+ *
+ * Caller must hold dev->ev_mutex.
+ */
+static inline struct inotify_kernel_event *
+inotify_dev_get_event(struct inotify_device *dev)
+{
+        return list_entry(dev->events.next, struct inotify_kernel_event, list);
+}
+/*
+ * inotify_dev_queue_event - event handler registered with core inotify, adds
+ * a new event to the given device
+ *
+ * @w: core watch that fired; embedded in our struct inotify_user_watch
+ * @wd, @mask, @cookie, @name: contents of the event (@name may be NULL)
+ * @ignored: unused here
+ *
+ * The event is silently dropped if it duplicates the most recently queued
+ * event, if the queue has already overflowed, or if allocation fails.  When
+ * the queue is exactly full a single IN_Q_OVERFLOW event is queued instead.
+ *
+ * Can sleep (calls kernel_event()).
+ */
+static void inotify_dev_queue_event(struct inotify_watch *w, u32 wd, u32 mask,
+                                    u32 cookie, const char *name,
+                                    struct inode *ignored)
+{
+        struct inotify_user_watch *watch;
+        struct inotify_device *dev;
+        struct inotify_kernel_event *kevent, *last = NULL;
+        watch = container_of(w, struct inotify_user_watch, wdata);
+        dev = watch->dev;
+        mutex_lock(&dev->ev_mutex);
+        /* we can safely put the watch as we don't reference it while
+         * generating the event
+         */
+        if (mask & IN_IGNORED || mask & IN_ONESHOT)
+                put_inotify_watch(w); /* final put */
+        /*
+         * coalescing: drop this event if it is a dupe of the previous one.
+         * "Previous" is the tail of the queue (new events are appended), and
+         * there is nothing to compare against when the queue is empty;
+         * list_entry() on an empty list would hand back a bogus pointer.
+         */
+        if (!list_empty(&dev->events))
+                last = list_entry(dev->events.prev,
+                                  struct inotify_kernel_event, list);
+        if (last && last->event.mask == mask && last->event.wd == wd &&
+                        last->event.cookie == cookie) {
+                const char *lastname = last->name;
+                if (!name && !lastname)
+                        goto out;
+                if (name && lastname && !strcmp(lastname, name))
+                        goto out;
+        }
+        /* the queue overflowed and we already sent the Q_OVERFLOW event */
+        if (unlikely(dev->event_count > dev->max_events))
+                goto out;
+        /* if the queue overflows, we need to notify user space */
+        if (unlikely(dev->event_count == dev->max_events))
+                kevent = kernel_event(-1, IN_Q_OVERFLOW, cookie, NULL);
+        else
+                kevent = kernel_event(wd, mask, cookie, name);
+        if (unlikely(!kevent))
+                goto out;
+        /* queue the event and wake up anyone waiting */
+        dev->event_count++;
+        dev->queue_size += sizeof(struct inotify_event) + kevent->event.len;
+        list_add_tail(&kevent->list, &dev->events);
+        wake_up_interruptible(&dev->wq);
+out:
+        mutex_unlock(&dev->ev_mutex);
+}
+/*
+ * remove_kevent - unlink the given kevent from its device and free it
+ *
+ * Updates the device's event count and byte accounting, then releases the
+ * name buffer (if any) and the event itself.
+ *
+ * Caller must hold dev->ev_mutex.
+ */
+static void remove_kevent(struct inotify_device *dev,
+                          struct inotify_kernel_event *kevent)
+{
+        /* undo the accounting done when the event was queued */
+        dev->queue_size -= sizeof(struct inotify_event) + kevent->event.len;
+        dev->event_count--;
+        list_del(&kevent->list);
+        kfree(kevent->name);
+        kmem_cache_free(event_cachep, kevent);
+}
+/*
+ * inotify_dev_event_dequeue - destroy the oldest event on the given device
+ *
+ * Does nothing if the queue is empty.
+ *
+ * Caller must hold dev->ev_mutex.
+ */
+static void inotify_dev_event_dequeue(struct inotify_device *dev)
+{
+        if (list_empty(&dev->events))
+                return;
+        remove_kevent(dev, inotify_dev_get_event(dev));
+}
+/*
+ * find_inode - resolve a user-given path and fill in the caller's nameidata
+ *
+ * Returns 0 with @nd held on success.  On failure a negative error is
+ * returned and @nd is not held: either the walk itself failed, or the
+ * permission check failed and the path has been released again.
+ */
+static int find_inode(const char __user *dirname, struct nameidata *nd,
+                      unsigned flags)
+{
+        int error = __user_walk(dirname, flags, nd);
+        if (error)
+                return error;
+        /* you can only watch an inode if you have read permissions on it */
+        error = vfs_permission(nd, MAY_READ);
+        if (!error)
+                return 0;
+        path_release(nd);
+        return error;
+}
+/*
+ * create_watch - allocate a new user watch on @inode and hand it to the
+ * inotify core.
+ *
+ * Fails with -ENOSPC when the owning user is already at
+ * inotify_max_user_watches and with -ENOMEM when the slab allocation fails.
+ * Otherwise the result of inotify_add_watch() is returned; if that result is
+ * negative the watch is torn down again before returning.
+ *
+ * Callers must hold dev->up_mutex.
+ */
+static int create_watch(struct inotify_device *dev, struct inode *inode,
+                        u32 mask)
+{
+        struct inotify_user_watch *uwatch;
+        int wd;
+        if (atomic_read(&dev->user->inotify_watches) >=
+                        inotify_max_user_watches)
+                return -ENOSPC;
+        uwatch = kmem_cache_alloc(watch_cachep, GFP_KERNEL);
+        if (unlikely(!uwatch))
+                return -ENOMEM;
+        /* the watch pins its device and counts against the user's quota */
+        get_inotify_dev(dev);
+        uwatch->dev = dev;
+        atomic_inc(&dev->user->inotify_watches);
+        inotify_init_watch(&uwatch->wdata);
+        wd = inotify_add_watch(dev->ih, &uwatch->wdata, inode, mask);
+        if (wd >= 0)
+                return wd;
+        /* the core rejected it: undo the reference and quota taken above */
+        free_inotify_user_watch(&uwatch->wdata);
+        return wd;
+}
+/* Device Interface */
+/*
+ * inotify_poll - report the device as readable when events are queued
+ */
+static unsigned int inotify_poll(struct file *file, poll_table *wait)
+{
+        struct inotify_device *dev = file->private_data;
+        int ready;
+        poll_wait(file, &dev->wq, wait);
+        mutex_lock(&dev->ev_mutex);
+        ready = list_empty(&dev->events) ? 0 : (POLLIN | POLLRDNORM);
+        mutex_unlock(&dev->ev_mutex);
+        return ready;
+}
+static ssize_t inotify_read(struct file *file, char __user *buf,
+                            size_t count, loff_t *pos)
+{       /*
+         * inotify_read - copy queued events to user-space
+         *
+         * First waits until the queue is non-empty (or returns -EAGAIN for
+         * O_NONBLOCK files, -EINTR when a signal is pending), then copies as
+         * many whole events as fit into @count bytes.  Returns the number of
+         * bytes copied, which is 0 when not even the first event fits or the
+         * queue was emptied in between, or -EFAULT if a copy fails.  @pos is
+         * not used.
+         */
+        size_t event_size = sizeof (struct inotify_event);
+        struct inotify_device *dev;
+        char __user *start;
+        int ret;
+        DEFINE_WAIT(wait);
+        start = buf;
+        dev = file->private_data;
+        while (1) {
+                int events;
+                prepare_to_wait(&dev->wq, &wait, TASK_INTERRUPTIBLE); /* register before checking so a wake-up is not missed */
+                mutex_lock(&dev->ev_mutex);
+                events = !list_empty(&dev->events);
+                mutex_unlock(&dev->ev_mutex);
+                if (events) {
+                        ret = 0;
+                        break;
+                }
+                if (file->f_flags & O_NONBLOCK) {
+                        ret = -EAGAIN;
+                        break;
+                }
+                if (signal_pending(current)) {
+                        ret = -EINTR;
+                        break;
+                }
+                schedule();
+        }
+        finish_wait(&dev->wq, &wait);
+        if (ret)
+                return ret;
+        mutex_lock(&dev->ev_mutex); /* ev_mutex was dropped above, so the queue is re-checked below */
+        while (1) {
+                struct inotify_kernel_event *kevent;
+                ret = buf - start; /* bytes handed to user-space so far */
+                if (list_empty(&dev->events))
+                        break;
+                kevent = inotify_dev_get_event(dev);
+                if (event_size + kevent->event.len > count) /* header + name must fit as a whole */
+                        break;
+                if (copy_to_user(buf, &kevent->event, event_size)) {
+                        ret = -EFAULT; /* event stays queued: remove_kevent() is not reached */
+                        break;
+                }
+                buf += event_size;
+                count -= event_size;
+                if (kevent->name) {
+                        if (copy_to_user(buf, kevent->name, kevent->event.len)){
+                                ret = -EFAULT;
+                                break;
+                        }
+                        buf += kevent->event.len;
+                        count -= kevent->event.len;
+                }
+                remove_kevent(dev, kevent); /* fully delivered: unlink and free */
+        }
+        mutex_unlock(&dev->ev_mutex);
+        return ret;
+}
+/*
+ * inotify_release - tear down an inotify instance when its file is released
+ *
+ * Destroys the core handle, discards every event still queued, and drops
+ * the device reference taken in sys_inotify_init().
+ */
+static int inotify_release(struct inode *ignored, struct file *file)
+{
+        struct inotify_device *dev = file->private_data;
+        inotify_destroy(dev->ih);
+        /* throw away whatever is still sitting in the queue */
+        mutex_lock(&dev->ev_mutex);
+        while (!list_empty(&dev->events))
+                remove_kevent(dev, inotify_dev_get_event(dev));
+        mutex_unlock(&dev->ev_mutex);
+        /* free this device: the put matching the get in inotify_init() */
+        put_inotify_dev(dev);
+        return 0;
+}
+/*
+ * inotify_ioctl - ioctl handler for inotify instances
+ *
+ * Only FIONREAD is understood: it stores the device's queue_size through
+ * the user pointer in @arg.  Any other command yields -ENOTTY.
+ */
+static long inotify_ioctl(struct file *file, unsigned int cmd,
+                          unsigned long arg)
+{
+        struct inotify_device *dev = file->private_data;
+        void __user *p = (void __user *) arg;
+        if (cmd == FIONREAD)
+                return put_user(dev->queue_size, (int __user *) p);
+        return -ENOTTY;
+}
+static const struct file_operations inotify_fops = { /* also a type tag: the syscalls compare filp->f_op against it */
+        .poll           = inotify_poll,
+        .read           = inotify_read,
+        .release        = inotify_release,
+        .unlocked_ioctl = inotify_ioctl,
+        .compat_ioctl   = inotify_ioctl, /* same handler for compat callers */
+};
+static const struct inotify_operations inotify_user_ops = { /* callbacks passed to inotify_init() */
+        .handle_event   = inotify_dev_queue_event,
+        .destroy_watch  = free_inotify_user_watch,
+};
+asmlinkage long sys_inotify_init(void)
+{       /*
+         * sys_inotify_init - create a new inotify instance
+         *
+         * Returns a new file descriptor on success.  Fails with the error
+         * from get_unused_fd(), -ENFILE when no file structure is available,
+         * -EMFILE when the user is at inotify_max_user_instances, -ENOMEM
+         * when the device cannot be allocated, or the error reported by
+         * inotify_init().
+         */
+        struct inotify_device *dev;
+        struct inotify_handle *ih;
+        struct user_struct *user;
+        struct file *filp;
+        int fd, ret;
+        fd = get_unused_fd();
+        if (fd < 0)
+                return fd;
+        filp = get_empty_filp();
+        if (!filp) {
+                ret = -ENFILE;
+                goto out_put_fd;
+        }
+        user = get_uid(current->user); /* reference ends up in dev->user, or is dropped on error */
+        if (unlikely(atomic_read(&user->inotify_devs) >=
+                        inotify_max_user_instances)) {
+                ret = -EMFILE;
+                goto out_free_uid;
+        }
+        dev = kmalloc(sizeof(struct inotify_device), GFP_KERNEL);
+        if (unlikely(!dev)) {
+                ret = -ENOMEM;
+                goto out_free_uid;
+        }
+        ih = inotify_init(&inotify_user_ops);
+        if (unlikely(IS_ERR(ih))) {
+                ret = PTR_ERR(ih);
+                goto out_free_dev;
+        }
+        dev->ih = ih;
+        filp->f_op = &inotify_fops;
+        filp->f_vfsmnt = mntget(inotify_mnt);
+        filp->f_dentry = dget(inotify_mnt->mnt_root);
+        filp->f_mapping = filp->f_dentry->d_inode->i_mapping;
+        filp->f_mode = FMODE_READ;
+        filp->f_flags = O_RDONLY;
+        filp->private_data = dev;
+        INIT_LIST_HEAD(&dev->events);
+        init_waitqueue_head(&dev->wq);
+        mutex_init(&dev->ev_mutex);
+        mutex_init(&dev->up_mutex);
+        dev->event_count = 0;
+        dev->queue_size = 0;
+        dev->max_events = inotify_max_queued_events; /* limit is sampled once, at creation */
+        dev->user = user;
+        atomic_set(&dev->count, 0);
+        get_inotify_dev(dev); /* the file's own reference; dropped in inotify_release() */
+        atomic_inc(&user->inotify_devs);
+        fd_install(fd, filp); /* from here on the instance is visible to user-space */
+        return fd;
+out_free_dev:
+        kfree(dev);
+out_free_uid:
+        free_uid(user);
+        put_filp(filp); /* every path reaching this label also holds filp */
+out_put_fd:
+        put_unused_fd(fd);
+        return ret;
+}
+asmlinkage long sys_inotify_add_watch(int fd, const char __user *path, u32 mask)
+{       /*
+         * sys_inotify_add_watch - add a watch on @path to the inotify
+         * instance @fd, or update the mask of an existing one
+         *
+         * Returns -EBADF for an invalid descriptor, -EINVAL if @fd is not an
+         * inotify instance, the error from the path lookup or permission
+         * check, or otherwise whatever inotify_find_update_watch() or
+         * create_watch() returns (presumably the watch descriptor on
+         * success - confirm against core inotify).
+         */
+        struct inode *inode;
+        struct inotify_device *dev;
+        struct nameidata nd;
+        struct file *filp;
+        int ret, fput_needed;
+        unsigned flags = 0;
+        filp = fget_light(fd, &fput_needed);
+        if (unlikely(!filp))
+                return -EBADF;
+        /* verify that this is indeed an inotify instance */
+        if (unlikely(filp->f_op != &inotify_fops)) {
+                ret = -EINVAL;
+                goto fput_and_out;
+        }
+        if (!(mask & IN_DONT_FOLLOW))
+                flags |= LOOKUP_FOLLOW;
+        if (mask & IN_ONLYDIR)
+                flags |= LOOKUP_DIRECTORY;
+        ret = find_inode(path, &nd, flags);
+        if (unlikely(ret))
+                goto fput_and_out;
+        /* inode held in place by reference to nd; dev by fget on fd */
+        inode = nd.dentry->d_inode;
+        dev = filp->private_data;
+        mutex_lock(&dev->up_mutex); /* lookup and creation must be atomic w.r.t. other adders */
+        ret = inotify_find_update_watch(dev->ih, inode, mask);
+        if (ret == -ENOENT) /* no watch on this inode yet: create one */
+                ret = create_watch(dev, inode, mask);
+        mutex_unlock(&dev->up_mutex);
+        path_release(&nd);
+fput_and_out:
+        fput_light(filp, fput_needed);
+        return ret;
+}
+/*
+ * sys_inotify_rm_watch - remove watch descriptor @wd from the inotify
+ * instance @fd
+ *
+ * Returns -EBADF for an invalid descriptor, -EINVAL if @fd is not an inotify
+ * instance, and otherwise the result of inotify_rm_wd().
+ */
+asmlinkage long sys_inotify_rm_watch(int fd, u32 wd)
+{
+        int fput_needed;
+        int ret = -EINVAL;
+        struct file *filp = fget_light(fd, &fput_needed);
+        if (unlikely(!filp))
+                return -EBADF;
+        /* only proceed if this really is an inotify instance */
+        if (likely(filp->f_op == &inotify_fops)) {
+                struct inotify_device *dev = filp->private_data;
+                /* we free our watch data when we get IN_IGNORED */
+                ret = inotify_rm_wd(dev->ih, wd);
+        }
+        fput_light(filp, fput_needed);
+        return ret;
+}
+static int
+inotify_get_sb(struct file_system_type *fs_type, int flags,
+               const char *dev_name, void *data, struct vfsmount *mnt)
+{       /* flags, dev_name and data are not used; everything is delegated to get_sb_pseudo() */
+        return get_sb_pseudo(fs_type, "inotify", NULL, 0xBAD1DEA, mnt);
+}
+static struct file_system_type inotify_fs_type = { /* registered and kern_mount()ed in inotify_user_setup() */
+    .name           = "inotifyfs",
+    .get_sb         = inotify_get_sb,
+    .kill_sb        = kill_anon_super,
+};
+/*
+ * inotify_user_setup - initialization of the user-space inotify interface.
+ *
+ * We cannot return an error from here because the VFS hooks are compiled
+ * in, so an (unlikely) failure to register or mount inotifyfs results in
+ * panic().  The slab caches are created with SLAB_PANIC for the same reason.
+ */
+static int __init inotify_user_setup(void)
+{
+        int err;
+        /* default limits; tunable afterwards through sysctl */
+        inotify_max_queued_events = 16384;
+        inotify_max_user_instances = 128;
+        inotify_max_user_watches = 8192;
+        err = register_filesystem(&inotify_fs_type);
+        if (unlikely(err))
+                panic("inotify: register_filesystem returned %d!\n", err);
+        inotify_mnt = kern_mount(&inotify_fs_type);
+        if (IS_ERR(inotify_mnt))
+                panic("inotify: kern_mount ret %ld!\n", PTR_ERR(inotify_mnt));
+        watch_cachep = kmem_cache_create("inotify_watch_cache",
+                                         sizeof(struct inotify_user_watch),
+                                         0, SLAB_PANIC, NULL, NULL);
+        event_cachep = kmem_cache_create("inotify_event_cache",
+                                         sizeof(struct inotify_kernel_event),
+                                         0, SLAB_PANIC, NULL, NULL);
+        return 0;
+}
+module_init(inotify_user_setup);
diff --git a/fs/ioctl.c b/fs/ioctl.c
index f8aeec3ca10c..4b7660b09ac0 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -4,7 +4,6 @@
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */
-#include <linux/config.h>
 #include <linux/syscalls.h>
 #include <linux/mm.h>
 #include <linux/smp_lock.h>
diff --git a/fs/ioprio.c b/fs/ioprio.c
index ca77008146c0..93aa5715f224 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -24,15 +24,21 @@
 #include <linux/blkdev.h>
 #include <linux/capability.h>
 #include <linux/syscalls.h>
+#include <linux/security.h>
 static int set_task_ioprio(struct task_struct *task, int ioprio)
 {
+        int err;
        struct io_context *ioc;
        if (task->uid != current->euid &&
            task->uid != current->uid && !capable(CAP_SYS_NICE))
                return -EPERM;
+        err = security_task_setioprio(task, ioprio);
+        if (err)
+                return err;
        task_lock(task);
        task->ioprio = ioprio;
@@ -119,11 +125,24 @@ asmlinkage long sys_ioprio_set(int which, int who, int ioprio)
        return ret;
 }
+/*
+ * Return the I/O priority of @p, or the (non-zero) result of the security
+ * hook if it refuses the query.
+ */
+static int get_task_ioprio(struct task_struct *p)
+{
+        int err = security_task_getioprio(p);
+        if (err)
+                return err;
+        return p->ioprio;
+}
 asmlinkage long sys_ioprio_get(int which, int who)
 {
        struct task_struct *g, *p;
        struct user_struct *user;
        int ret = -ESRCH;
+        int tmpio;
        read_lock_irq(&tasklist_lock);
        switch (which) {
@@ -133,16 +152,19 @@ asmlinkage long sys_ioprio_get(int which, int who)
                        else
                                p = find_task_by_pid(who);
                        if (p)
-                                ret = p->ioprio;
+                                ret = get_task_ioprio(p);
                        break;
                case IOPRIO_WHO_PGRP:
                        if (!who)
                                who = process_group(current);
                        do_each_task_pid(who, PIDTYPE_PGID, p) {
+                                tmpio = get_task_ioprio(p);
+                                if (tmpio < 0)
+                                        continue;
                                if (ret == -ESRCH)
-                                        ret = p->ioprio;
+                                        ret = tmpio;
                                else
-                                        ret = ioprio_best(ret, p->ioprio);
+                                        ret = ioprio_best(ret, tmpio);
                        } while_each_task_pid(who, PIDTYPE_PGID, p);
                        break;
                case IOPRIO_WHO_USER:
@@ -157,10 +179,13 @@ asmlinkage long sys_ioprio_get(int which, int who)
                        do_each_thread(g, p) {
                                if (p->uid != user->uid)
                                        continue;
+                                tmpio = get_task_ioprio(p);
+                                if (tmpio < 0)
+                                        continue;
                                if (ret == -ESRCH)
-                                        ret = p->ioprio;
+                                        ret = tmpio;
                                else
-                                        ret = ioprio_best(ret, p->ioprio);
+                                        ret = ioprio_best(ret, tmpio);
                        } while_each_thread(g, p);
                        if (who)
diff --git a/fs/isofs/compress.c b/fs/isofs/compress.c
index 4917315db732..731816332b12 100644
--- a/fs/isofs/compress.c
+++ b/fs/isofs/compress.c
@@ -16,7 +16,6 @@
 * Transparent decompression of files on an iso9660 filesystem
 */
-#include <linux/config.h>
 #include <linux/module.h>
 #include <linux/init.h>
@@ -312,7 +311,7 @@ eio:
        return err;
 }
-struct address_space_operations zisofs_aops = {
+const struct address_space_operations zisofs_aops = {
        .readpage = zisofs_readpage,
        /* No sync_page operation supported? */
        /* No bmap operation supported */
diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c
index 5440ea292c69..27e276987fd2 100644
--- a/fs/isofs/dir.c
+++ b/fs/isofs/dir.c
@@ -10,7 +10,6 @@
 * 
 *  isofs directory handling functions
 */
-#include <linux/config.h>
 #include <linux/smp_lock.h>
 #include "isofs.h"
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 70adbb98bad1..14391361c886 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -11,7 +11,6 @@
 *      2004  Paul Serice - NFS Export Operations
 */
-#include <linux/config.h>
 #include <linux/init.h>
 #include <linux/module.h>
@@ -56,7 +55,7 @@ static void isofs_put_super(struct super_block *sb)
 }
 static void isofs_read_inode(struct inode *);
-static int isofs_statfs (struct super_block *, struct kstatfs *);
+static int isofs_statfs (struct dentry *, struct kstatfs *);
 static kmem_cache_t *isofs_inode_cachep;
@@ -901,8 +900,10 @@ out_freesbi:
        return -EINVAL;
 }
-static int isofs_statfs (struct super_block *sb, struct kstatfs *buf)
+static int isofs_statfs (struct dentry *dentry, struct kstatfs *buf)
 {
+        struct super_block *sb = dentry->d_sb;
        buf->f_type = ISOFS_SUPER_MAGIC;
        buf->f_bsize = sb->s_blocksize;
        buf->f_blocks = (ISOFS_SB(sb)->s_nzones
@@ -1052,7 +1053,7 @@ static sector_t _isofs_bmap(struct address_space *mapping, sector_t block)
        return generic_block_bmap(mapping,block,isofs_get_block);
 }
-static struct address_space_operations isofs_aops = {
+static const struct address_space_operations isofs_aops = {
        .readpage = isofs_readpage,
        .sync_page = block_sync_page,
        .bmap = _isofs_bmap
@@ -1399,10 +1400,11 @@ struct inode *isofs_iget(struct super_block *sb,
        return inode;
 }
-static struct super_block *isofs_get_sb(struct file_system_type *fs_type,
+static int isofs_get_sb(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data)
+        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-        return get_sb_bdev(fs_type, flags, dev_name, data, isofs_fill_super);
+        return get_sb_bdev(fs_type, flags, dev_name, data, isofs_fill_super,
+                           mnt);
 }
 static struct file_system_type iso9660_fs_type = {
diff --git a/fs/isofs/isofs.h b/fs/isofs/isofs.h
index b87ba066f5e7..e6308c8b5735 100644
--- a/fs/isofs/isofs.h
+++ b/fs/isofs/isofs.h
@@ -176,5 +176,5 @@ isofs_normalize_block_and_offset(struct iso_directory_record* de,
 extern struct inode_operations isofs_dir_inode_operations;
 extern const struct file_operations isofs_dir_operations;
-extern struct address_space_operations isofs_symlink_aops;
+extern const struct address_space_operations isofs_symlink_aops;
 extern struct export_operations isofs_export_ops;
diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c
index 4326cb47f8fa..f3a1db3098de 100644
--- a/fs/isofs/rock.c
+++ b/fs/isofs/rock.c
@@ -754,6 +754,6 @@ error:
        return -EIO;
 }
-struct address_space_operations isofs_symlink_aops = {
+const struct address_space_operations isofs_symlink_aops = {
        .readpage = rock_ridge_symlink_readpage
 };
diff --git a/fs/isofs/zisofs.h b/fs/isofs/zisofs.h
index d78485d101c2..273795709155 100644
--- a/fs/isofs/zisofs.h
+++ b/fs/isofs/zisofs.h
@@ -15,7 +15,7 @@
 */
 #ifdef CONFIG_ZISOFS
-extern struct address_space_operations zisofs_aops;
+extern const struct address_space_operations zisofs_aops;
 extern int __init zisofs_init(void);
 extern void zisofs_cleanup(void);
 #endif
diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c
index 3f5102b069db..47678a26c13b 100644
--- a/fs/jbd/checkpoint.c
+++ b/fs/jbd/checkpoint.c
@@ -24,29 +24,67 @@
 #include <linux/slab.h>
 /*
- * Unlink a buffer from a transaction.
+ * Unlink a buffer from a transaction checkpoint list.
 *
 * Called with j_list_lock held.
 */
+static inline void __buffer_unlink_first(struct journal_head *jh)
-static inline void __buffer_unlink(struct journal_head *jh)
 {
-        transaction_t *transaction;
+        transaction_t *transaction = jh->b_cp_transaction;
-        transaction = jh->b_cp_transaction;
-        jh->b_cp_transaction = NULL;
        jh->b_cpnext->b_cpprev = jh->b_cpprev;
        jh->b_cpprev->b_cpnext = jh->b_cpnext;
-        if (transaction->t_checkpoint_list == jh)
+        if (transaction->t_checkpoint_list == jh) {
                transaction->t_checkpoint_list = jh->b_cpnext;
-        if (transaction->t_checkpoint_list == jh)
+                if (transaction->t_checkpoint_list == jh)
-                transaction->t_checkpoint_list = NULL;
+                        transaction->t_checkpoint_list = NULL;
+        }
+}
+/*
+ * Unlink a buffer from whichever checkpoint list of its transaction it is
+ * on: the ring is unlinked and the checkpoint list head fixed up by
+ * __buffer_unlink_first(), the checkpoint io list head is fixed up here.
+ *
+ * Called with j_list_lock held.
+ */
+static inline void __buffer_unlink(struct journal_head *jh)
+{
+        transaction_t *t = jh->b_cp_transaction;
+        __buffer_unlink_first(jh);
+        if (t->t_checkpoint_io_list != jh)
+                return;
+        /* jh was the head of the io list: advance it, or empty the list */
+        t->t_checkpoint_io_list = jh->b_cpnext;
+        if (t->t_checkpoint_io_list == jh)
+                t->t_checkpoint_io_list = NULL;
+}
+/*
+ * Move a buffer from the checkpoint list to the checkpoint io list
+ *
+ * The buffer is first taken off the checkpoint list with
+ * __buffer_unlink_first() and then linked into the circular io list, where
+ * it becomes the new list head.  jh->b_cp_transaction is left unchanged.
+ *
+ * Called with j_list_lock held
+ */
+static inline void __buffer_relink_io(struct journal_head *jh)
+{
+        transaction_t *transaction = jh->b_cp_transaction;
+        __buffer_unlink_first(jh);
+        if (!transaction->t_checkpoint_io_list) {
+                jh->b_cpnext = jh->b_cpprev = jh; /* empty io list: jh forms a ring of one */
+        } else {
+                jh->b_cpnext = transaction->t_checkpoint_io_list; /* link in just before the current head */
+                jh->b_cpprev = transaction->t_checkpoint_io_list->b_cpprev;
+                jh->b_cpprev->b_cpnext = jh;
+                jh->b_cpnext->b_cpprev = jh;
+        }
+        transaction->t_checkpoint_io_list = jh;
+ }
 /*
 * Try to release a checkpointed buffer from its transaction.
- * Returns 1 if we released it.
+ * Returns 1 if we released it and 2 if we also released the
+ * whole transaction.
+ *
 * Requires j_list_lock
 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
 */
@@ -57,12 +95,11 @@ static int __try_to_free_cp_buf(struct journal_head *jh)
        if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh)) {
                JBUFFER_TRACE(jh, "remove from checkpoint list");
-                __journal_remove_checkpoint(jh);
+                ret = __journal_remove_checkpoint(jh) + 1;
                jbd_unlock_bh_state(bh);
                journal_remove_journal_head(bh);
                BUFFER_TRACE(bh, "release");
                __brelse(bh);
-                ret = 1;
        } else {
                jbd_unlock_bh_state(bh);
        }
@@ -117,83 +154,54 @@ static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh)
 }
 /*
- * Clean up a transaction's checkpoint list.
+ * Clean up transaction's list of buffers submitted for io.
- *
+ * We wait for any pending IO to complete and remove any clean
- * We wait for any pending IO to complete and make sure any clean
+ * buffers. Note that we take the buffers in the opposite ordering
- * buffers are removed from the transaction.
+ * from the one in which they were submitted for IO.
- *
- * Return 1 if we performed any actions which might have destroyed the
- * checkpoint.  (journal_remove_checkpoint() deletes the transaction when
- * the last checkpoint buffer is cleansed)
 *
 * Called with j_list_lock held.
 */
-static int __cleanup_transaction(journal_t *journal, transaction_t *transaction)
+static void __wait_cp_io(journal_t *journal, transaction_t *transaction)
 {
-        struct journal_head *jh, *next_jh, *last_jh;
+        struct journal_head *jh;
        struct buffer_head *bh;
-        int ret = 0;
+        tid_t this_tid;
+        int released = 0;
-        assert_spin_locked(&journal->j_list_lock);
-        jh = transaction->t_checkpoint_list;
+        this_tid = transaction->t_tid;
-        if (!jh)
+restart:
-                return 0;
+        /* Did somebody clean up the transaction in the meanwhile? */
+        if (journal->j_checkpoint_transactions != transaction ||
-        last_jh = jh->b_cpprev;
+                        transaction->t_tid != this_tid)
-        next_jh = jh;
+                return;
-        do {
+        while (!released && transaction->t_checkpoint_io_list) {
-                jh = next_jh;
+                jh = transaction->t_checkpoint_io_list;
                bh = jh2bh(jh);
+                if (!jbd_trylock_bh_state(bh)) {
+                        jbd_sync_bh(journal, bh);
+                        spin_lock(&journal->j_list_lock);
+                        goto restart;
+                }
                if (buffer_locked(bh)) {
                        atomic_inc(&bh->b_count);
                        spin_unlock(&journal->j_list_lock);
+                        jbd_unlock_bh_state(bh);
                        wait_on_buffer(bh);
                        /* the journal_head may have gone by now */
                        BUFFER_TRACE(bh, "brelse");
                        __brelse(bh);
-                        goto out_return_1;
+                        spin_lock(&journal->j_list_lock);
+                        goto restart;
                }
                /*
-                 * This is foul
+                 * Now in whatever state the buffer currently is, we know that
+                 * it has been written out and so we can drop it from the list
                 */
-                if (!jbd_trylock_bh_state(bh)) {
+                released = __journal_remove_checkpoint(jh);
-                        jbd_sync_bh(journal, bh);
+                jbd_unlock_bh_state(bh);
-                        goto out_return_1;
+                journal_remove_journal_head(bh);
-                }
+                __brelse(bh);
+        }
-                if (jh->b_transaction != NULL) {
-                        transaction_t *t = jh->b_transaction;
-                        tid_t tid = t->t_tid;
-                        spin_unlock(&journal->j_list_lock);
-                        jbd_unlock_bh_state(bh);
-                        log_start_commit(journal, tid);
-                        log_wait_commit(journal, tid);
-                        goto out_return_1;
-                }
-                /*
-                 * AKPM: I think the buffer_jbddirty test is redundant - it
-                 * shouldn't have NULL b_transaction?
-                 */
-                next_jh = jh->b_cpnext;
-                if (!buffer_dirty(bh) && !buffer_jbddirty(bh)) {
-                        BUFFER_TRACE(bh, "remove from checkpoint");
-                        __journal_remove_checkpoint(jh);
-                        jbd_unlock_bh_state(bh);
-                        journal_remove_journal_head(bh);
-                        __brelse(bh);
-                        ret = 1;
-                } else {
-                        jbd_unlock_bh_state(bh);
-                }
-        } while (jh != last_jh);
-        return ret;
-out_return_1:
-        spin_lock(&journal->j_list_lock);
-        return 1;
 }
 #define NR_BATCH        64
@@ -203,9 +211,7 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
 {
        int i;
-        spin_unlock(&journal->j_list_lock);
        ll_rw_block(SWRITE, *batch_count, bhs);
-        spin_lock(&journal->j_list_lock);
        for (i = 0; i < *batch_count; i++) {
                struct buffer_head *bh = bhs[i];
                clear_buffer_jwrite(bh);
@@ -221,19 +227,43 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
 * Return 1 if something happened which requires us to abort the current
 * scan of the checkpoint list.  
 *
- * Called with j_list_lock held.
+ * Called with j_list_lock held and drops it if 1 is returned
 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
 */
-static int __flush_buffer(journal_t *journal, struct journal_head *jh,
+static int __process_buffer(journal_t *journal, struct journal_head *jh,
-                        struct buffer_head **bhs, int *batch_count,
+                        struct buffer_head **bhs, int *batch_count)
-                        int *drop_count)
 {
        struct buffer_head *bh = jh2bh(jh);
        int ret = 0;
-        if (buffer_dirty(bh) && !buffer_locked(bh) && jh->b_jlist == BJ_None) {
+        if (buffer_locked(bh)) {
-                J_ASSERT_JH(jh, jh->b_transaction == NULL);
+                atomic_inc(&bh->b_count);
+                spin_unlock(&journal->j_list_lock);
+                jbd_unlock_bh_state(bh);
+                wait_on_buffer(bh);
+                /* the journal_head may have gone by now */
+                BUFFER_TRACE(bh, "brelse");
+                __brelse(bh);
+                ret = 1;
+        } else if (jh->b_transaction != NULL) {
+                transaction_t *t = jh->b_transaction;
+                tid_t tid = t->t_tid;
+                spin_unlock(&journal->j_list_lock);
+                jbd_unlock_bh_state(bh);
+                log_start_commit(journal, tid);
+                log_wait_commit(journal, tid);
+                ret = 1;
+        } else if (!buffer_dirty(bh)) {
+                J_ASSERT_JH(jh, !buffer_jbddirty(bh));
+                BUFFER_TRACE(bh, "remove from checkpoint");
+                __journal_remove_checkpoint(jh);
+                spin_unlock(&journal->j_list_lock);
+                jbd_unlock_bh_state(bh);
+                journal_remove_journal_head(bh);
+                __brelse(bh);
+                ret = 1;
+        } else {
                /*
                 * Important: we are about to write the buffer, and
                 * possibly block, while still holding the journal lock.
@@ -246,45 +276,30 @@ static int __flush_buffer(journal_t *journal, struct journal_head *jh,
                J_ASSERT_BH(bh, !buffer_jwrite(bh));
                set_buffer_jwrite(bh);
                bhs[*batch_count] = bh;
+                __buffer_relink_io(jh);
                jbd_unlock_bh_state(bh);
                (*batch_count)++;
                if (*batch_count == NR_BATCH) {
+                        spin_unlock(&journal->j_list_lock);
                        __flush_batch(journal, bhs, batch_count);
                        ret = 1;
                }
-        } else {
-                int last_buffer = 0;
-                if (jh->b_cpnext == jh) {
-                        /* We may be about to drop the transaction.  Tell the
-                         * caller that the lists have changed.
-                         */
-                        last_buffer = 1;
-                }
-                if (__try_to_free_cp_buf(jh)) {
-                        (*drop_count)++;
-                        ret = last_buffer;
-                }
        }
        return ret;
 }
 /*
- * Perform an actual checkpoint.  We don't write out only enough to
+ * Perform an actual checkpoint. We take the first transaction on the
- * satisfy the current blocked requests: rather we submit a reasonably
+ * list of transactions to be checkpointed and send all its buffers
- * sized chunk of the outstanding data to disk at once for
+ * to disk. We submit larger chunks of data at once.
- * efficiency.  __log_wait_for_space() will retry if we didn't free enough.
 * 
- * However, we _do_ take into account the amount requested so that once
- * the IO has been queued, we can return as soon as enough of it has
- * completed to disk.
- *
 * The journal should be locked before calling this function.
 */
 int log_do_checkpoint(journal_t *journal)
 {
+        transaction_t *transaction;
+        tid_t this_tid;
        int result;
-        int batch_count = 0;
-        struct buffer_head *bhs[NR_BATCH];
        jbd_debug(1, "Start checkpoint\n");
@@ -299,79 +314,68 @@ int log_do_checkpoint(journal_t *journal)
                return result;
        /*
-         * OK, we need to start writing disk blocks.  Try to free up a
+         * OK, we need to start writing disk blocks.  Take one transaction
-         * quarter of the log in a single checkpoint if we can.
+         * and write it.
         */
+        spin_lock(&journal->j_list_lock);
+        if (!journal->j_checkpoint_transactions)
+                goto out;
+        transaction = journal->j_checkpoint_transactions;
+        this_tid = transaction->t_tid;
+restart:
        /*
-         * AKPM: check this code.  I had a feeling a while back that it
+         * If someone cleaned up this transaction while we slept, we're
-         * degenerates into a busy loop at unmount time.
+         * done (maybe it's a new transaction, but it fell at the same
+         * address).
         */
-        spin_lock(&journal->j_list_lock);
+        if (journal->j_checkpoint_transactions == transaction &&
-        while (journal->j_checkpoint_transactions) {
+                        transaction->t_tid == this_tid) {
-                transaction_t *transaction;
+                int batch_count = 0;
-                struct journal_head *jh, *last_jh, *next_jh;
+                struct buffer_head *bhs[NR_BATCH];
-                int drop_count = 0;
+                struct journal_head *jh;
-                int cleanup_ret, retry = 0;
+                int retry = 0;
-                tid_t this_tid;
+                while (!retry && transaction->t_checkpoint_list) {
-                transaction = journal->j_checkpoint_transactions;
-                this_tid = transaction->t_tid;
-                jh = transaction->t_checkpoint_list;
-                last_jh = jh->b_cpprev;
-                next_jh = jh;
-                do {
                        struct buffer_head *bh;
-                        jh = next_jh;
+                        jh = transaction->t_checkpoint_list;
-                        next_jh = jh->b_cpnext;
                        bh = jh2bh(jh);
                        if (!jbd_trylock_bh_state(bh)) {
                                jbd_sync_bh(journal, bh);
-                                spin_lock(&journal->j_list_lock);
                                retry = 1;
                                break;
                        }
-                        retry = __flush_buffer(journal, jh, bhs, &batch_count, &drop_count);
+                        retry = __process_buffer(journal, jh, bhs,&batch_count);
-                        if (cond_resched_lock(&journal->j_list_lock)) {
+                        if (!retry && lock_need_resched(&journal->j_list_lock)){
+                                spin_unlock(&journal->j_list_lock);
                                retry = 1;
                                break;
                        }
-                } while (jh != last_jh && !retry);
+                }
                if (batch_count) {
+                        if (!retry) {
+                                spin_unlock(&journal->j_list_lock);
+                                retry = 1;
+                        }
                        __flush_batch(journal, bhs, &batch_count);
-                        retry = 1;
                }
+                if (retry) {
+                        spin_lock(&journal->j_list_lock);
+                        goto restart;
+                }
                /*
-                 * If someone cleaned up this transaction while we slept, we're
+                 * Now we have cleaned up the first transaction's checkpoint
-                 * done
+                 * list. Let's clean up the second one
-                 */
-                if (journal->j_checkpoint_transactions != transaction)
-                        break;
-                if (retry)
-                        continue;
-                /*
-                 * Maybe it's a new transaction, but it fell at the same
-                 * address
-                 */
-                if (transaction->t_tid != this_tid)
-                        continue;
-                /*
-                 * We have walked the whole transaction list without
-                 * finding anything to write to disk.  We had better be
-                 * able to make some progress or we are in trouble.
                 */
-                cleanup_ret = __cleanup_transaction(journal, transaction);
+                __wait_cp_io(journal, transaction);
-                J_ASSERT(drop_count != 0 || cleanup_ret != 0);
-                if (journal->j_checkpoint_transactions != transaction)
-                        break;
        }
+out:
        spin_unlock(&journal->j_list_lock);
        result = cleanup_journal_tail(journal);
        if (result < 0)
                return result;
        return 0;
 }
@@ -456,52 +460,98 @@ int cleanup_journal_tail(journal_t *journal)
 /* Checkpoint list management */
 /*
+ * journal_clean_one_cp_list
+ *
+ * Find all the written-back checkpoint buffers in the given list and release them.
+ *
+ * Called with the journal locked.
+ * Called with j_list_lock held.
+ * Returns number of buffers reaped (for debug)
+ */
+static int journal_clean_one_cp_list(struct journal_head *jh, int *released)
+{
+        struct journal_head *last_jh;
+        struct journal_head *next_jh = jh;
+        int ret, freed = 0;
+        *released = 0;
+        if (!jh)
+                return 0;
+        last_jh = jh->b_cpprev;
+        do {
+                jh = next_jh;
+                next_jh = jh->b_cpnext;
+                /* Use trylock because of the ranking */
+                if (jbd_trylock_bh_state(jh2bh(jh))) {
+                        ret = __try_to_free_cp_buf(jh);
+                        if (ret) {
+                                freed++;
+                                if (ret == 2) {
+                                        *released = 1;
+                                        return freed;
+                                }
+                        }
+                }
+                /*
+                 * This function only frees up some memory
+                 * if possible so we don't have an obligation
+                 * to finish processing. Bail out if preemption
+                 * requested:
+                 */
+                if (need_resched())
+                        return freed;
+        } while (jh != last_jh);
+        return freed;
+}
+/*
 * journal_clean_checkpoint_list
 *
 * Find all the written-back checkpoint buffers in the journal and release them.
 *
 * Called with the journal locked.
 * Called with j_list_lock held.
- * Returns number of bufers reaped (for debug)
+ * Returns number of buffers reaped (for debug)
 */
 int __journal_clean_checkpoint_list(journal_t *journal)
 {
        transaction_t *transaction, *last_transaction, *next_transaction;
        int ret = 0;
+        int released;
        transaction = journal->j_checkpoint_transactions;
-        if (transaction == 0)
+        if (!transaction)
                goto out;
        last_transaction = transaction->t_cpprev;
        next_transaction = transaction;
        do {
-                struct journal_head *jh;
                transaction = next_transaction;
                next_transaction = transaction->t_cpnext;
-                jh = transaction->t_checkpoint_list;
+                ret += journal_clean_one_cp_list(transaction->
-                if (jh) {
+                                t_checkpoint_list, &released);
-                        struct journal_head *last_jh = jh->b_cpprev;
+                /*
-                        struct journal_head *next_jh = jh;
+                 * This function only frees up some memory if possible so we
+                 * don't have an obligation to finish processing. Bail out if
-                        do {
+                 * preemption requested:
-                                jh = next_jh;
+                 */
-                                next_jh = jh->b_cpnext;
+                if (need_resched())
-                                /* Use trylock because of the ranknig */
+                        goto out;
-                                if (jbd_trylock_bh_state(jh2bh(jh)))
+                if (released)
-                                        ret += __try_to_free_cp_buf(jh);
+                        continue;
-                                /*
+                /*
-                                 * This function only frees up some memory
+                 * It is essential that we are as careful as in the case of
-                                 * if possible so we dont have an obligation
+                 * t_checkpoint_list with removing the buffer from the list as
-                                 * to finish processing. Bail out if preemption
+                 * we can possibly see not yet submitted buffers on io_list
-                                 * requested:
+                 */
-                                 */
+                ret += journal_clean_one_cp_list(transaction->
-                                if (need_resched())
+                                t_checkpoint_io_list, &released);
-                                        goto out;
+                if (need_resched())
-                        } while (jh != last_jh);
+                        goto out;
-                }
        } while (transaction != last_transaction);
 out:
        return ret;
@@ -516,18 +566,22 @@ out:
 * buffer updates committed in that transaction have safely been stored
 * elsewhere on disk.  To achieve this, all of the buffers in a
 * transaction need to be maintained on the transaction's checkpoint
- * list until they have been rewritten, at which point this function is
+ * lists until they have been rewritten, at which point this function is
 * called to remove the buffer from the existing transaction's
- * checkpoint list.
+ * checkpoint lists.
+ *
+ * The function returns 1 if it frees the transaction, 0 otherwise.
 *
 * This function is called with the journal locked.
 * This function is called with j_list_lock held.
+ * This function is called with jbd_lock_bh_state(jh2bh(jh))
 */
-void __journal_remove_checkpoint(struct journal_head *jh)
+int __journal_remove_checkpoint(struct journal_head *jh)
 {
        transaction_t *transaction;
        journal_t *journal;
+        int ret = 0;
        JBUFFER_TRACE(jh, "entry");
@@ -538,8 +592,10 @@ void __journal_remove_checkpoint(struct journal_head *jh)
        journal = transaction->t_journal;
        __buffer_unlink(jh);
+        jh->b_cp_transaction = NULL;
-        if (transaction->t_checkpoint_list != NULL)
+        if (transaction->t_checkpoint_list != NULL ||
+            transaction->t_checkpoint_io_list != NULL)
                goto out;
        JBUFFER_TRACE(jh, "transaction has no more buffers");
@@ -565,8 +621,10 @@ void __journal_remove_checkpoint(struct journal_head *jh)
        /* Just in case anybody was waiting for more transactions to be
           checkpointed... */
        wake_up(&journal->j_wait_logspace);
+        ret = 1;
 out:
        JBUFFER_TRACE(jh, "exit");
+        return ret;
 }
 /*
@@ -628,6 +686,7 @@ void __journal_drop_transaction(journal_t *journal, transaction_t *transaction)
        J_ASSERT(transaction->t_shadow_list == NULL);
        J_ASSERT(transaction->t_log_list == NULL);
        J_ASSERT(transaction->t_checkpoint_list == NULL);
+        J_ASSERT(transaction->t_checkpoint_io_list == NULL);
        J_ASSERT(transaction->t_updates == 0);
        J_ASSERT(journal->j_committing_transaction != transaction);
        J_ASSERT(journal->j_running_transaction != transaction);
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 002ad2bbc769..0971814c38b8 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -790,11 +790,22 @@ restart_loop:
                        jbd_unlock_bh_state(bh);
                } else {
                        J_ASSERT_BH(bh, !buffer_dirty(bh));
-                        J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
+                        /* The buffer on BJ_Forget list and not jbddirty means
-                        __journal_unfile_buffer(jh);
+                         * it has been freed by this transaction and hence it
-                        jbd_unlock_bh_state(bh);
+                         * could not have been reallocated until this
-                        journal_remove_journal_head(bh);  /* needs a brelse */
+                         * transaction has committed. *BUT* it could be
-                        release_buffer_page(bh);
+                         * reallocated once we have written all the data to
+                         * disk and before we process the buffer on BJ_Forget
+                         * list. */
+                        JBUFFER_TRACE(jh, "refile or unfile freed buffer");
+                        __journal_refile_buffer(jh);
+                        if (!jh->b_transaction) {
+                                jbd_unlock_bh_state(bh);
+                                 /* needs a brelse */
+                                journal_remove_journal_head(bh);
+                                release_buffer_page(bh);
+                        } else
+                                jbd_unlock_bh_state(bh);
                }
                cond_resched_lock(&journal->j_list_lock);
        }
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index 7f96b5cb6781..8c9b28dff119 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -34,6 +34,7 @@
 #include <linux/suspend.h>
 #include <linux/pagemap.h>
 #include <linux/kthread.h>
+#include <linux/poison.h>
 #include <linux/proc_fs.h>
 #include <asm/uaccess.h>
@@ -1675,7 +1676,7 @@ static void journal_free_journal_head(struct journal_head *jh)
 {
 #ifdef CONFIG_JBD_DEBUG
        atomic_dec(&nr_journal_heads);
-        memset(jh, 0x5b, sizeof(*jh));
+        memset(jh, JBD_POISON_FREE, sizeof(*jh));
 #endif
        kmem_cache_free(journal_head_cache, jh);
 }
diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c
index 80d7f53fd0a7..de5bafb4e853 100644
--- a/fs/jbd/recovery.c
+++ b/fs/jbd/recovery.c
@@ -531,6 +531,7 @@ static int do_one_pass(journal_t *journal,
                default:
                        jbd_debug(3, "Unrecognised magic %d, end of scan.\n",
                                  blocktype);
+                        brelse(bh);
                        goto done;
                }
        }
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index c609f5034fcd..508b2ea91f43 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -227,7 +227,8 @@ repeat_locked:
        spin_unlock(&transaction->t_handle_lock);
        spin_unlock(&journal->j_state_lock);
 out:
-        kfree(new_transaction);
+        if (unlikely(new_transaction))          /* It's usually NULL */
+                kfree(new_transaction);
        return ret;
 }
@@ -724,7 +725,8 @@ done:
        journal_cancel_revoke(handle, jh);
 out:
-        kfree(frozen_buffer);
+        if (unlikely(frozen_buffer))    /* It's usually NULL */
+                kfree(frozen_buffer);
        JBUFFER_TRACE(jh, "exit");
        return error;
@@ -903,7 +905,8 @@ repeat:
        jbd_unlock_bh_state(bh);
 out:
        journal_put_journal_head(jh);
-        kfree(committed_data);
+        if (unlikely(committed_data))
+                kfree(committed_data);
        return err;
 }
@@ -2038,7 +2041,8 @@ void __journal_refile_buffer(struct journal_head *jh)
        __journal_temp_unlink_buffer(jh);
        jh->b_transaction = jh->b_next_transaction;
        jh->b_next_transaction = NULL;
-        __journal_file_buffer(jh, jh->b_transaction, BJ_Metadata);
+        __journal_file_buffer(jh, jh->b_transaction,
+                                was_dirty ? BJ_Metadata : BJ_Reserved);
        J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING);
        if (was_dirty)
diff --git a/fs/jffs/inode-v23.c b/fs/jffs/inode-v23.c
index 020cc097c539..93068697a9bf 100644
--- a/fs/jffs/inode-v23.c
+++ b/fs/jffs/inode-v23.c
@@ -59,7 +59,7 @@ static const struct file_operations jffs_file_operations;
 static struct inode_operations jffs_file_inode_operations;
 static const struct file_operations jffs_dir_operations;
 static struct inode_operations jffs_dir_inode_operations;
-static struct address_space_operations jffs_address_operations;
+static const struct address_space_operations jffs_address_operations;
 kmem_cache_t     *node_cache = NULL;
 kmem_cache_t     *fm_cache = NULL;
@@ -377,9 +377,9 @@ jffs_new_inode(const struct inode * dir, struct jffs_raw_inode *raw_inode,
 /* Get statistics of the file system.  */
 static int
-jffs_statfs(struct super_block *sb, struct kstatfs *buf)
+jffs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
-        struct jffs_control *c = (struct jffs_control *) sb->s_fs_info;
+        struct jffs_control *c = (struct jffs_control *) dentry->d_sb->s_fs_info;
        struct jffs_fmcontrol *fmc;
        lock_kernel();
@@ -1614,7 +1614,7 @@ jffs_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
 } /* jffs_ioctl()  */
-static struct address_space_operations jffs_address_operations = {
+static const struct address_space_operations jffs_address_operations = {
        .readpage       = jffs_readpage,
        .prepare_write  = jffs_prepare_write,
        .commit_write   = jffs_commit_write,
@@ -1785,10 +1785,11 @@ static struct super_operations jffs_ops =
        .remount_fs     = jffs_remount,
 };
-static struct super_block *jffs_get_sb(struct file_system_type *fs_type,
+static int jffs_get_sb(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data)
+        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-        return get_sb_bdev(fs_type, flags, dev_name, data, jffs_fill_super);
+        return get_sb_bdev(fs_type, flags, dev_name, data, jffs_fill_super,
+                           mnt);
 }
 static struct file_system_type jffs_fs_type = {
diff --git a/fs/jffs/intrep.c b/fs/jffs/intrep.c
index 0ef207dfaf6f..9000f1effedf 100644
--- a/fs/jffs/intrep.c
+++ b/fs/jffs/intrep.c
@@ -55,7 +55,6 @@
 *
 */
-#include <linux/config.h>
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/jffs.h>
@@ -247,7 +246,7 @@ flash_safe_read(struct mtd_info *mtd, loff_t from,
        D3(printk(KERN_NOTICE "flash_safe_read(%p, %08x, %p, %08x)\n",
                  mtd, (unsigned int) from, buf, count));
-        res = MTD_READ(mtd, from, count, &retlen, buf);
+        res = mtd->read(mtd, from, count, &retlen, buf);
        if (retlen != count) {
                panic("Didn't read all bytes in flash_safe_read(). Returned %d\n", res);
        }
@@ -262,7 +261,7 @@ flash_read_u32(struct mtd_info *mtd, loff_t from)
        __u32 ret;
        int res;
-        res = MTD_READ(mtd, from, 4, &retlen, (unsigned char *)&ret);
+        res = mtd->read(mtd, from, 4, &retlen, (unsigned char *)&ret);
        if (retlen != 4) {
                printk("Didn't read all bytes in flash_read_u32(). Returned %d\n", res);
                return 0;
@@ -282,7 +281,7 @@ flash_safe_write(struct mtd_info *mtd, loff_t to,
        D3(printk(KERN_NOTICE "flash_safe_write(%p, %08x, %p, %08x)\n",
                  mtd, (unsigned int) to, buf, count));
-        res = MTD_WRITE(mtd, to, count, &retlen, buf);
+        res = mtd->write(mtd, to, count, &retlen, buf);
        if (retlen != count) {
                printk("Didn't write all bytes in flash_safe_write(). Returned %d\n", res);
        }
@@ -300,9 +299,9 @@ flash_safe_writev(struct mtd_info *mtd, const struct kvec *vecs,
        D3(printk(KERN_NOTICE "flash_safe_writev(%p, %08x, %p)\n",
                  mtd, (unsigned int) to, vecs));
-        
        if (mtd->writev) {
-                res = MTD_WRITEV(mtd, vecs, iovec_cnt, to, &retlen);
+                res = mtd->writev(mtd, vecs, iovec_cnt, to, &retlen);
                return res ? res : retlen;
        }
        /* Not implemented writev. Repeatedly use write - on the not so
@@ -312,7 +311,8 @@ flash_safe_writev(struct mtd_info *mtd, const struct kvec *vecs,
        retlen=0;
        for (i=0; !res && i<iovec_cnt; i++) {
-                res = MTD_WRITE(mtd, to, vecs[i].iov_len, &retlen_a, vecs[i].iov_base);
+                res = mtd->write(mtd, to, vecs[i].iov_len, &retlen_a,
+                                 vecs[i].iov_base);
                if (retlen_a != vecs[i].iov_len) {
                        printk("Didn't write all bytes in flash_safe_writev(). Returned %d\n", res);
                        if (i != iovec_cnt-1)
@@ -393,7 +393,7 @@ flash_erase_region(struct mtd_info *mtd, loff_t start,
        set_current_state(TASK_UNINTERRUPTIBLE);
        add_wait_queue(&wait_q, &wait);
-        if (MTD_ERASE(mtd, erase) < 0) {
+        if (mtd->erase(mtd, erase) < 0) {
                set_current_state(TASK_RUNNING);
                remove_wait_queue(&wait_q, &wait);
                kfree(erase);
diff --git a/fs/jffs/jffs_fm.h b/fs/jffs/jffs_fm.h
index c794d923df2a..9ee6ad29eff5 100644
--- a/fs/jffs/jffs_fm.h
+++ b/fs/jffs/jffs_fm.h
@@ -20,7 +20,6 @@
 #ifndef __LINUX_JFFS_FM_H__
 #define __LINUX_JFFS_FM_H__
-#include <linux/config.h>
 #include <linux/types.h>
 #include <linux/jffs.h>
 #include <linux/mtd/mtd.h>
diff --git a/fs/jffs2/Makefile b/fs/jffs2/Makefile
index 77dc5561a04e..7f28ee0bd132 100644
--- a/fs/jffs2/Makefile
+++ b/fs/jffs2/Makefile
@@ -12,6 +12,9 @@ jffs2-y	+= symlink.o build.o erase.o background.o fs.o writev.o
 jffs2-y += super.o debug.o
 jffs2-$(CONFIG_JFFS2_FS_WRITEBUFFER)    += wbuf.o
+jffs2-$(CONFIG_JFFS2_FS_XATTR)          += xattr.o xattr_trusted.o xattr_user.o
+jffs2-$(CONFIG_JFFS2_FS_SECURITY)       += security.o
+jffs2-$(CONFIG_JFFS2_FS_POSIX_ACL)      += acl.o
 jffs2-$(CONFIG_JFFS2_RUBIN)     += compr_rubin.o
 jffs2-$(CONFIG_JFFS2_RTIME)     += compr_rtime.o
 jffs2-$(CONFIG_JFFS2_ZLIB)      += compr_zlib.o
diff --git a/fs/jffs2/README.Locking b/fs/jffs2/README.Locking
index b7943439b6ec..c8f0bd64e53e 100644
--- a/fs/jffs2/README.Locking
+++ b/fs/jffs2/README.Locking
@@ -150,3 +150,24 @@ the buffer.
 Ordering constraints:
        Lock wbuf_sem last, after the alloc_sem or and f->sem.
+        c->xattr_sem
+        ------------
+This read/write semaphore protects against concurrent access to the
+xattr related objects which include stuff in superblock and ic->xref.
+On read-only paths, taking the write-semaphore excludes too much; the
+read-semaphore is sufficient. But you must hold the write-semaphore when
+updating, creating or deleting any xattr related object.
+Once xattr_sem is released, there is no assurance that those objects
+still exist. Thus, a sequence of operations often has to be retried
+when such an object must be updated while holding the read-semaphore.
+For example, do_jffs2_getxattr() first holds the read-semaphore to scan
+xref and xdatum. If it turns out to be necessary to load the name/value
+pair from the medium, it releases the read-semaphore and retries the
+process while holding the write-semaphore.
+Ordering constraints:
+        Lock xattr_sem last, after the alloc_sem.
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
new file mode 100644
index 000000000000..9c2077e7e081
--- /dev/null
+++ b/fs/jffs2/acl.c
@@ -0,0 +1,487 @@
+/*
+ * JFFS2 -- Journalling Flash File System, Version 2.
+ *
+ * Copyright (C) 2006  NEC Corporation
+ *
+ * Created by KaiGai Kohei <kaigai@ak.jp.nec.com>
+ *
+ * For licensing information, see the file 'LICENCE' in this directory.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/time.h>
+#include <linux/crc32.h>
+#include <linux/jffs2.h>
+#include <linux/xattr.h>
+#include <linux/posix_acl_xattr.h>
+#include <linux/mtd/mtd.h>
+#include "nodelist.h"
+/*
+ * jffs2_acl_size - byte length used for an ACL holding @count entries.
+ *
+ * The header is always counted.  Up to four entries are counted as
+ * jffs2_acl_entry_short; every entry beyond the fourth is counted as a
+ * full jffs2_acl_entry.
+ */
+static size_t jffs2_acl_size(int count)
+{
+        size_t len = sizeof(struct jffs2_acl_header);
+        if (count > 4) {
+                len += 4 * sizeof(struct jffs2_acl_entry_short);
+                len += (count - 4) * sizeof(struct jffs2_acl_entry);
+        } else {
+                len += count * sizeof(struct jffs2_acl_entry_short);
+        }
+        return len;
+}
+/*
+ * jffs2_acl_count - number of ACL entries encoded in a blob of @size bytes.
+ * @size: total blob length, including the jffs2_acl_header
+ *
+ * Inverse of jffs2_acl_size(): a payload smaller than four short entries
+ * must be a whole number of short entries; anything larger must be four
+ * short entries followed by a whole number of full entries.
+ *
+ * Returns the entry count, or -1 when @size cannot correspond to any
+ * valid layout (including a blob shorter than the header).
+ */
+static int jffs2_acl_count(size_t size)
+{
+        const size_t short_area = 4 * sizeof(struct jffs2_acl_entry_short);
+        size_t s;
+        /* Guard the unsigned subtraction below against wrap-around. */
+        if (size < sizeof(struct jffs2_acl_header))
+                return -1;
+        size -= sizeof(struct jffs2_acl_header);
+        /*
+         * size_t is unsigned, so a difference can never test as negative:
+         * compare against the short-entry area before subtracting it.
+         */
+        if (size < short_area) {
+                if (size % sizeof(struct jffs2_acl_entry_short))
+                        return -1;
+                return size / sizeof(struct jffs2_acl_entry_short);
+        }
+        s = size - short_area;
+        if (s % sizeof(struct jffs2_acl_entry))
+                return -1;
+        return s / sizeof(struct jffs2_acl_entry) + 4;
+}
+/*
+ * jffs2_acl_from_medium - decode an on-medium ACL blob into a posix_acl.
+ * @value: buffer starting with a jffs2_acl_header, or NULL
+ * @size:  number of bytes available at @value
+ *
+ * Returns NULL when @value is NULL or the blob encodes zero entries,
+ * ERR_PTR(-EINVAL) when the blob is shorter than the header, carries a
+ * version other than JFFS2_ACL_VERSION, has an unknown tag, overruns the
+ * buffer or does not end exactly at @value + @size, and ERR_PTR(-ENOMEM)
+ * when posix_acl_alloc() fails.
+ */
+static struct posix_acl *jffs2_acl_from_medium(void *value, size_t size)
+{
+        void *end = value + size;
+        struct jffs2_acl_header *header = value;
+        struct jffs2_acl_entry *entry;
+        struct posix_acl *acl;
+        uint32_t ver;
+        int i, count;
+        if (!value)
+                return NULL;
+        if (size < sizeof(struct jffs2_acl_header))
+                return ERR_PTR(-EINVAL);
+        ver = je32_to_cpu(header->a_version);
+        if (ver != JFFS2_ACL_VERSION) {
+                JFFS2_WARNING("Invalid ACL version. (=%u)\n", ver);
+                return ERR_PTR(-EINVAL);
+        }
+        /* Step over the header; the entries follow it. */
+        value += sizeof(struct jffs2_acl_header);
+        count = jffs2_acl_count(size);
+        if (count < 0)
+                return ERR_PTR(-EINVAL);
+        if (count == 0)
+                return NULL;
+        acl = posix_acl_alloc(count, GFP_KERNEL);
+        if (!acl)
+                return ERR_PTR(-ENOMEM);
+        for (i=0; i < count; i++) {
+                entry = value;
+                /* e_tag and e_perm exist in both entry layouts; check the short size first. */
+                if (value + sizeof(struct jffs2_acl_entry_short) > end)
+                        goto fail;
+                acl->a_entries[i].e_tag = je16_to_cpu(entry->e_tag);
+                acl->a_entries[i].e_perm = je16_to_cpu(entry->e_perm);
+                switch (acl->a_entries[i].e_tag) {
+                        /* These tags are stored as short entries without an id. */
+                        case ACL_USER_OBJ:
+                        case ACL_GROUP_OBJ:
+                        case ACL_MASK:
+                        case ACL_OTHER:
+                                value += sizeof(struct jffs2_acl_entry_short);
+                                acl->a_entries[i].e_id = ACL_UNDEFINED_ID;
+                                break;
+                        /* Full entries: verify e_id lies inside the buffer before reading it. */
+                        case ACL_USER:
+                        case ACL_GROUP:
+                                value += sizeof(struct jffs2_acl_entry);
+                                if (value > end)
+                                        goto fail;
+                                acl->a_entries[i].e_id = je32_to_cpu(entry->e_id);
+                                break;
+                        default:
+                                goto fail;
+                }
+        }
+        /* The entries must consume the blob exactly. */
+        if (value != end)
+                goto fail;
+        return acl;
+ fail:
+        posix_acl_release(acl);
+        return ERR_PTR(-EINVAL);
+}
+/*
+ * jffs2_acl_to_medium - encode a posix_acl into the on-medium layout.
+ * @acl:  ACL to encode
+ * @size: out parameter, set to jffs2_acl_size(acl->a_count) before the
+ *        allocation is attempted (so it is written even on failure)
+ *
+ * Returns a kmalloc()ed buffer beginning with a jffs2_acl_header,
+ * ERR_PTR(-ENOMEM) when the allocation fails, or ERR_PTR(-EINVAL) when
+ * an entry carries an unknown tag.  jffs2_set_acl() kfree()s the buffer
+ * after use.
+ */
+static void *jffs2_acl_to_medium(const struct posix_acl *acl, size_t *size)
+{
+        struct jffs2_acl_header *header;
+        struct jffs2_acl_entry *entry;
+        void *e;
+        size_t i;
+        *size = jffs2_acl_size(acl->a_count);
+        /* Allocate room for a full-sized entry per ACL entry. */
+        header = kmalloc(sizeof(*header) + acl->a_count * sizeof(*entry), GFP_KERNEL);
+        if (!header)
+                return ERR_PTR(-ENOMEM);
+        header->a_version = cpu_to_je32(JFFS2_ACL_VERSION);
+        /* e is the write cursor; it starts right behind the header. */
+        e = header + 1;
+        for (i=0; i < acl->a_count; i++) {
+                entry = e;
+                entry->e_tag = cpu_to_je16(acl->a_entries[i].e_tag);
+                entry->e_perm = cpu_to_je16(acl->a_entries[i].e_perm);
+                switch(acl->a_entries[i].e_tag) {
+                        /* Only these two tags store an id and advance by a full entry. */
+                        case ACL_USER:
+                        case ACL_GROUP:
+                                entry->e_id = cpu_to_je32(acl->a_entries[i].e_id);
+                                e += sizeof(struct jffs2_acl_entry);
+                                break;
+                        case ACL_USER_OBJ:
+                        case ACL_GROUP_OBJ:
+                        case ACL_MASK:
+                        case ACL_OTHER:
+                                e += sizeof(struct jffs2_acl_entry_short);
+                                break;
+                        default:
+                                goto fail;
+                }
+        }
+        return header;
+ fail:
+        kfree(header);
+        return ERR_PTR(-EINVAL);
+}
+/*
+ * Read the ACL cache slot *i_acl while holding inode->i_lock.
+ * Returns JFFS2_ACL_NOT_CACHED when the slot holds that sentinel,
+ * otherwise the result of posix_acl_dup() on the cached pointer.
+ */
+static struct posix_acl *jffs2_iget_acl(struct inode *inode, struct posix_acl **i_acl)
+{
+        struct posix_acl *cached;
+        spin_lock(&inode->i_lock);
+        if (*i_acl == JFFS2_ACL_NOT_CACHED)
+                cached = JFFS2_ACL_NOT_CACHED;
+        else
+                cached = posix_acl_dup(*i_acl);
+        spin_unlock(&inode->i_lock);
+        return cached;
+}
+/*
+ * Replace the ACL cache slot *i_acl with posix_acl_dup(@acl) while
+ * holding inode->i_lock.  A previous value other than the
+ * JFFS2_ACL_NOT_CACHED sentinel is released first.
+ */
+static void jffs2_iset_acl(struct inode *inode, struct posix_acl **i_acl, struct posix_acl *acl)
+{
+        struct posix_acl *previous;
+        spin_lock(&inode->i_lock);
+        previous = *i_acl;
+        if (previous != JFFS2_ACL_NOT_CACHED)
+                posix_acl_release(previous);
+        *i_acl = posix_acl_dup(acl);
+        spin_unlock(&inode->i_lock);
+}
+/*
+ * jffs2_get_acl - return the ACL of the given @type for @inode.
+ * @inode: inode whose ACL is wanted
+ * @type:  ACL_TYPE_ACCESS or ACL_TYPE_DEFAULT
+ *
+ * The per-inode cache slot is consulted first.  On a miss the xattr is
+ * read via do_jffs2_getxattr(), decoded with jffs2_acl_from_medium() and,
+ * unless decoding produced an error pointer, stored back in the cache.
+ *
+ * Returns the ACL, NULL when the xattr read reports -ENODATA or -ENOSYS
+ * (or the stored value decodes to no entries), ERR_PTR(-EINVAL) for an
+ * unknown @type, ERR_PTR(-ENOMEM) on allocation failure, or another
+ * ERR_PTR() carrying the error from the xattr read or the decoder.
+ */
+static struct posix_acl *jffs2_get_acl(struct inode *inode, int type)
+{
+        struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
+        struct posix_acl *acl;
+        char *value = NULL;
+        int rc, xprefix;
+        switch (type) {
+        case ACL_TYPE_ACCESS:
+                acl = jffs2_iget_acl(inode, &f->i_acl_access);
+                if (acl != JFFS2_ACL_NOT_CACHED)
+                        return acl;
+                xprefix = JFFS2_XPREFIX_ACL_ACCESS;
+                break;
+        case ACL_TYPE_DEFAULT:
+                acl = jffs2_iget_acl(inode, &f->i_acl_default);
+                if (acl != JFFS2_ACL_NOT_CACHED)
+                        return acl;
+                xprefix = JFFS2_XPREFIX_ACL_DEFAULT;
+                break;
+        default:
+                return ERR_PTR(-EINVAL);
+        }
+        /* Probe with a NULL buffer; a positive result is used as the buffer size. */
+        rc = do_jffs2_getxattr(inode, xprefix, "", NULL, 0);
+        if (rc > 0) {
+                value = kmalloc(rc, GFP_KERNEL);
+                if (!value)
+                        return ERR_PTR(-ENOMEM);
+                rc = do_jffs2_getxattr(inode, xprefix, "", value, rc);
+        }
+        if (rc > 0) {
+                acl = jffs2_acl_from_medium(value, rc);
+        } else if (rc == -ENODATA || rc == -ENOSYS) {
+                acl = NULL;
+        } else {
+                acl = ERR_PTR(rc);
+        }
+        /* kfree(NULL) is a no-op, so no NULL guard is needed. */
+        kfree(value);
+        if (!IS_ERR(acl)) {
+                switch (type) {
+                case ACL_TYPE_ACCESS:
+                        jffs2_iset_acl(inode, &f->i_acl_access, acl);
+                        break;
+                case ACL_TYPE_DEFAULT:
+                        jffs2_iset_acl(inode, &f->i_acl_default, acl);
+                        break;
+                }
+        }
+        return acl;
+}
+/*
+ * jffs2_set_acl - store (or, with a NULL @acl, remove) an ACL xattr.
+ * @inode: target inode
+ * @type:  ACL_TYPE_ACCESS or ACL_TYPE_DEFAULT
+ * @acl:   ACL to store, or NULL
+ *
+ * For ACL_TYPE_ACCESS the inode mode is updated from
+ * posix_acl_equiv_mode(); when that call returns 0 the ACL is treated
+ * as NULL and no value is written.  ACL_TYPE_DEFAULT is only accepted
+ * on directories.  On success the matching cache slot is updated.
+ *
+ * Returns 0 on success, -EOPNOTSUPP for symlinks, -EACCES when a default
+ * ACL is given for a non-directory, -EINVAL for an unknown @type, or the
+ * error from posix_acl_equiv_mode(), jffs2_acl_to_medium() or
+ * do_jffs2_setxattr().
+ */
+static int jffs2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
+{
+        struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
+        size_t size = 0;
+        char *value = NULL;
+        int rc, xprefix;
+        if (S_ISLNK(inode->i_mode))
+                return -EOPNOTSUPP;
+        switch (type) {
+        case ACL_TYPE_ACCESS:
+                xprefix = JFFS2_XPREFIX_ACL_ACCESS;
+                if (acl) {
+                        mode_t mode = inode->i_mode;
+                        rc = posix_acl_equiv_mode(acl, &mode);
+                        if (rc < 0)
+                                return rc;
+                        if (inode->i_mode != mode) {
+                                inode->i_mode = mode;
+                                jffs2_dirty_inode(inode);
+                        }
+                        if (rc == 0)
+                                acl = NULL;
+                }
+                break;
+        case ACL_TYPE_DEFAULT:
+                xprefix = JFFS2_XPREFIX_ACL_DEFAULT;
+                if (!S_ISDIR(inode->i_mode))
+                        return acl ? -EACCES : 0;
+                break;
+        default:
+                return -EINVAL;
+        }
+        if (acl) {
+                value = jffs2_acl_to_medium(acl, &size);
+                if (IS_ERR(value))
+                        return PTR_ERR(value);
+        }
+        rc = do_jffs2_setxattr(inode, xprefix, "", value, size, 0);
+        /* With no value to write, -ENODATA from the xattr layer is not an error. */
+        if (!value && rc == -ENODATA)
+                rc = 0;
+        /* kfree(NULL) is a no-op, so no NULL guard is needed. */
+        kfree(value);
+        if (!rc) {
+                switch(type) {
+                case ACL_TYPE_ACCESS:
+                        jffs2_iset_acl(inode, &f->i_acl_access, acl);
+                        break;
+                case ACL_TYPE_DEFAULT:
+                        jffs2_iset_acl(inode, &f->i_acl_default, acl);
+                        break;
+                }
+        }
+        return rc;
+}
+/*
+ * ACL check callback handed to generic_permission().
+ * Returns the error from jffs2_get_acl(), -EAGAIN when the inode has no
+ * access ACL, or otherwise the result of posix_acl_permission().
+ */
+static int jffs2_check_acl(struct inode *inode, int mask)
+{
+        struct posix_acl *access = jffs2_get_acl(inode, ACL_TYPE_ACCESS);
+        int verdict;
+        if (IS_ERR(access))
+                return PTR_ERR(access);
+        if (!access)
+                return -EAGAIN;
+        verdict = posix_acl_permission(inode, access, mask);
+        posix_acl_release(access);
+        return verdict;
+}
+/*
+ * jffs2_permission - permission hook for jffs2 inodes.
+ * Delegates to generic_permission() with jffs2_check_acl() as the ACL
+ * check callback; @nd is not used here.
+ */
+int jffs2_permission(struct inode *inode, int mask, struct nameidata *nd)
+{
+        return generic_permission(inode, mask, jffs2_check_acl);
+}
+/*
+ * jffs2_init_acl - set up the ACL state of a newly created @inode.
+ * @inode: the new inode
+ * @dir:   the directory it is created in
+ *
+ * Both ACL cache slots start out as JFFS2_ACL_NOT_CACHED.  Unless @inode
+ * is a symlink, the default ACL of @dir is looked up; when there is none
+ * the current umask is applied to the inode mode.  When there is one, a
+ * new directory stores it as its own default ACL, and a clone passed
+ * through posix_acl_create_masq() updates the inode mode and, if that
+ * call returns a positive value, is stored as the access ACL.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+int jffs2_init_acl(struct inode *inode, struct inode *dir)
+{
+        struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
+        struct posix_acl *acl = NULL, *clone;
+        mode_t mode;
+        int rc = 0;
+        f->i_acl_access = JFFS2_ACL_NOT_CACHED;
+        f->i_acl_default = JFFS2_ACL_NOT_CACHED;
+        if (!S_ISLNK(inode->i_mode)) {
+                acl = jffs2_get_acl(dir, ACL_TYPE_DEFAULT);
+                if (IS_ERR(acl))
+                        return PTR_ERR(acl);
+                if (!acl)
+                        inode->i_mode &= ~current->fs->umask;
+        }
+        if (acl) {
+                if (S_ISDIR(inode->i_mode)) {
+                        rc = jffs2_set_acl(inode, ACL_TYPE_DEFAULT, acl);
+                        if (rc)
+                                goto cleanup;
+                }
+                clone = posix_acl_clone(acl, GFP_KERNEL);
+                /* Preset the error so the bare goto below reports -ENOMEM. */
+                rc = -ENOMEM;
+                if (!clone)
+                        goto cleanup;
+                mode = inode->i_mode;
+                rc = posix_acl_create_masq(clone, &mode);
+                if (rc >= 0) {
+                        inode->i_mode = mode;
+                        if (rc > 0)
+                                rc = jffs2_set_acl(inode, ACL_TYPE_ACCESS, clone);
+                }
+                posix_acl_release(clone);
+        }
+ cleanup:
+        /* acl may still be NULL here (symlink, or no default ACL on dir). */
+        posix_acl_release(acl);
+        return rc;
+}
+/*
+ * jffs2_clear_acl - release both cached ACLs of @inode.
+ * Each slot holding a real ACL (neither NULL nor the
+ * JFFS2_ACL_NOT_CACHED sentinel) is released and reset to the sentinel.
+ */
+void jffs2_clear_acl(struct inode *inode)
+{
+        struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
+        struct posix_acl **slots[2] = { &f->i_acl_access, &f->i_acl_default };
+        int n;
+        for (n = 0; n < 2; n++) {
+                struct posix_acl **slot = slots[n];
+                if (!*slot || *slot == JFFS2_ACL_NOT_CACHED)
+                        continue;
+                posix_acl_release(*slot);
+                *slot = JFFS2_ACL_NOT_CACHED;
+        }
+}
+/*
+ * jffs2_acl_chmod - bring the access ACL in line with a changed mode.
+ *
+ * Returns -EOPNOTSUPP for symlinks, 0 when the inode has no access ACL,
+ * -ENOMEM when the ACL cannot be cloned, or otherwise the result of
+ * posix_acl_chmod_masq() followed by jffs2_set_acl().
+ */
+int jffs2_acl_chmod(struct inode *inode)
+{
+        struct posix_acl *current_acl, *copy;
+        int rc;
+        if (S_ISLNK(inode->i_mode))
+                return -EOPNOTSUPP;
+        current_acl = jffs2_get_acl(inode, ACL_TYPE_ACCESS);
+        if (IS_ERR(current_acl))
+                return PTR_ERR(current_acl);
+        if (!current_acl)
+                return 0;
+        copy = posix_acl_clone(current_acl, GFP_KERNEL);
+        posix_acl_release(current_acl);
+        if (!copy)
+                return -ENOMEM;
+        rc = posix_acl_chmod_masq(copy, inode->i_mode);
+        if (rc == 0)
+                rc = jffs2_set_acl(inode, ACL_TYPE_ACCESS, copy);
+        posix_acl_release(copy);
+        return rc;
+}
+/*
+ * listxattr hook for the access ACL: copies POSIX_ACL_XATTR_ACCESS
+ * (including its terminating NUL) into @list when a buffer is given and
+ * large enough, and always returns the number of bytes that name needs.
+ */
+static size_t jffs2_acl_access_listxattr(struct inode *inode, char *list, size_t list_size,
+                                         const char *name, size_t name_len)
+{
+        const int needed = sizeof(POSIX_ACL_XATTR_ACCESS);
+        if (!list || needed > list_size)
+                return needed;
+        strcpy(list, POSIX_ACL_XATTR_ACCESS);
+        return needed;
+}
+/*
+ * listxattr hook for the default ACL: copies POSIX_ACL_XATTR_DEFAULT
+ * (including its terminating NUL) into @list when a buffer is given and
+ * large enough, and always returns the number of bytes that name needs.
+ */
+static size_t jffs2_acl_default_listxattr(struct inode *inode, char *list, size_t list_size,
+                                          const char *name, size_t name_len)
+{
+        const int needed = sizeof(POSIX_ACL_XATTR_DEFAULT);
+        if (!list || needed > list_size)
+                return needed;
+        strcpy(list, POSIX_ACL_XATTR_DEFAULT);
+        return needed;
+}
+/*
+ * Common getxattr backend: fetch the ACL of @type and convert it with
+ * posix_acl_to_xattr().  Returns the error from jffs2_get_acl(),
+ * -ENODATA when the inode has no such ACL, or the conversion result.
+ */
+static int jffs2_acl_getxattr(struct inode *inode, int type, void *buffer, size_t size)
+{
+        struct posix_acl *acl = jffs2_get_acl(inode, type);
+        int rc;
+        if (IS_ERR(acl)) {
+                rc = PTR_ERR(acl);
+        } else if (!acl) {
+                rc = -ENODATA;
+        } else {
+                rc = posix_acl_to_xattr(acl, buffer, size);
+                posix_acl_release(acl);
+        }
+        return rc;
+}
+/* getxattr hook for the access ACL; any non-empty name suffix is rejected with -EINVAL. */
+static int jffs2_acl_access_getxattr(struct inode *inode, const char *name, void *buffer, size_t size)
+{
+        return name[0] == '\0'
+                ? jffs2_acl_getxattr(inode, ACL_TYPE_ACCESS, buffer, size)
+                : -EINVAL;
+}
+/* getxattr hook for the default ACL; any non-empty name suffix is rejected with -EINVAL. */
+static int jffs2_acl_default_getxattr(struct inode *inode, const char *name, void *buffer, size_t size)
+{
+        return name[0] == '\0'
+                ? jffs2_acl_getxattr(inode, ACL_TYPE_DEFAULT, buffer, size)
+                : -EINVAL;
+}
+/*
+ * Common setxattr backend.  The caller must own the inode or hold
+ * CAP_FOWNER, otherwise -EPERM is returned.  A non-NULL @value is parsed
+ * with posix_acl_from_xattr() and, if that yields an ACL, checked with
+ * posix_acl_valid() before it is handed to jffs2_set_acl(); a NULL
+ * @value passes a NULL ACL to jffs2_set_acl().
+ */
+static int jffs2_acl_setxattr(struct inode *inode, int type, const void *value, size_t size)
+{
+        struct posix_acl *acl = NULL;
+        int rc;
+        if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
+                return -EPERM;
+        if (value) {
+                acl = posix_acl_from_xattr(value, size);
+                if (IS_ERR(acl))
+                        return PTR_ERR(acl);
+        }
+        rc = acl ? posix_acl_valid(acl) : 0;
+        if (!rc)
+                rc = jffs2_set_acl(inode, type, acl);
+        posix_acl_release(acl);
+        return rc;
+}
+/* setxattr hook for the access ACL; @flags is unused and a non-empty name suffix yields -EINVAL. */
+static int jffs2_acl_access_setxattr(struct inode *inode, const char *name,
+                                     const void *buffer, size_t size, int flags)
+{
+        return name[0] == '\0'
+                ? jffs2_acl_setxattr(inode, ACL_TYPE_ACCESS, buffer, size)
+                : -EINVAL;
+}
+/* setxattr hook for the default ACL; @flags is unused and a non-empty name suffix yields -EINVAL. */
+static int jffs2_acl_default_setxattr(struct inode *inode, const char *name,
+                                      const void *buffer, size_t size, int flags)
+{
+        return name[0] == '\0'
+                ? jffs2_acl_setxattr(inode, ACL_TYPE_DEFAULT, buffer, size)
+                : -EINVAL;
+}
+/* xattr handler table for the POSIX_ACL_XATTR_ACCESS name (access ACL hooks). */
+struct xattr_handler jffs2_acl_access_xattr_handler = {
+        .prefix = POSIX_ACL_XATTR_ACCESS,
+        .list   = jffs2_acl_access_listxattr,
+        .get    = jffs2_acl_access_getxattr,
+        .set    = jffs2_acl_access_setxattr,
+};
+/* xattr handler table for the POSIX_ACL_XATTR_DEFAULT name (default ACL hooks). */
+struct xattr_handler jffs2_acl_default_xattr_handler = {
+        .prefix = POSIX_ACL_XATTR_DEFAULT,
+        .list   = jffs2_acl_default_listxattr,
+        .get    = jffs2_acl_default_getxattr,
+        .set    = jffs2_acl_default_setxattr,
+};
diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h
new file mode 100644
index 000000000000..8893bd1a6ba7
--- /dev/null
+++ b/fs/jffs2/acl.h
@@ -0,0 +1,45 @@
+/*
+ * JFFS2 -- Journalling Flash File System, Version 2.
+ *
+ * Copyright (C) 2006  NEC Corporation
+ *
+ * Created by KaiGai Kohei <kaigai@ak.jp.nec.com>
+ *
+ * For licensing information, see the file 'LICENCE' in this directory.
+ *
+ */
+/* Full ACL entry as stored on the medium; acl.c uses it for ACL_USER and ACL_GROUP, which carry an id. */
+struct jffs2_acl_entry {
+        jint16_t        e_tag;
+        jint16_t        e_perm;
+        jint32_t        e_id;
+};
+/* Short ACL entry as stored on the medium: same leading fields as jffs2_acl_entry, without e_id. */
+struct jffs2_acl_entry_short {
+        jint16_t        e_tag;
+        jint16_t        e_perm;
+};
+/* Header preceding the entries of a stored ACL; acl.c checks a_version against JFFS2_ACL_VERSION. */
+struct jffs2_acl_header {
+        jint32_t        a_version;
+};
+#ifdef CONFIG_JFFS2_FS_POSIX_ACL
+/* Sentinel kept in the per-inode ACL cache slots while no ACL has been loaded. */
+#define JFFS2_ACL_NOT_CACHED ((void *)-1)
+/* Entry points implemented in acl.c. */
+extern int jffs2_permission(struct inode *, int, struct nameidata *);
+extern int jffs2_acl_chmod(struct inode *);
+extern int jffs2_init_acl(struct inode *, struct inode *);
+extern void jffs2_clear_acl(struct inode *);
+extern struct xattr_handler jffs2_acl_access_xattr_handler;
+extern struct xattr_handler jffs2_acl_default_xattr_handler;
+#else
+/* ACL support compiled out: the entry points collapse to NULL, 0 or nothing. */
+#define jffs2_permission NULL
+#define jffs2_acl_chmod(inode)          (0)
+#define jffs2_init_acl(inode,dir)       (0)
+#define jffs2_clear_acl(inode)
+#endif  /* CONFIG_JFFS2_FS_POSIX_ACL */
diff --git a/fs/jffs2/build.c b/fs/jffs2/build.c
index 70f7a896c04a..02826967ab58 100644
--- a/fs/jffs2/build.c
+++ b/fs/jffs2/build.c
@@ -160,6 +160,7 @@ static int jffs2_build_filesystem(struct jffs2_sb_info *c)
                ic->scan_dents = NULL;
                cond_resched();
        }
+        jffs2_build_xattr_subsystem(c);
        c->flags &= ~JFFS2_SB_FLAG_BUILDING;
        dbg_fsbuild("FS build complete\n");
@@ -178,6 +179,7 @@ exit:
                                jffs2_free_full_dirent(fd);
                        }
                }
+                jffs2_clear_xattr_subsystem(c);
        }
        return ret;
diff --git a/fs/jffs2/compr.c b/fs/jffs2/compr.c
index e7944e665b9f..7001ba26c067 100644
--- a/fs/jffs2/compr.c
+++ b/fs/jffs2/compr.c
@@ -412,7 +412,7 @@ void jffs2_free_comprbuf(unsigned char *comprbuf, unsigned char *orig)
                kfree(comprbuf);
 }
-int jffs2_compressors_init(void)
+int __init jffs2_compressors_init(void)
 {
 /* Registering compressors */
 #ifdef CONFIG_JFFS2_ZLIB
diff --git a/fs/jffs2/compr.h b/fs/jffs2/compr.h
index a77e830d85c5..509b8b1c0811 100644
--- a/fs/jffs2/compr.h
+++ b/fs/jffs2/compr.h
@@ -23,8 +23,8 @@
 #include <linux/errno.h>
 #include <linux/fs.h>
 #include <linux/jffs2.h>
-#include <linux/jffs2_fs_i.h>
+#include "jffs2_fs_i.h"
-#include <linux/jffs2_fs_sb.h>
+#include "jffs2_fs_sb.h"
 #include "nodelist.h"
 #define JFFS2_RUBINMIPS_PRIORITY 10
diff --git a/fs/jffs2/compr_zlib.c b/fs/jffs2/compr_zlib.c
index 5c63e0cdcf4c..3681d0728ac7 100644
--- a/fs/jffs2/compr_zlib.c
+++ b/fs/jffs2/compr_zlib.c
@@ -15,7 +15,6 @@
 #error "The userspace support got too messy and was removed. Update your mkfs.jffs2"
 #endif
-#include <linux/config.h>
 #include <linux/kernel.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
diff --git a/fs/jffs2/debug.c b/fs/jffs2/debug.c
index 1fe17de713e8..72b4fc13a106 100644
--- a/fs/jffs2/debug.c
+++ b/fs/jffs2/debug.c
@@ -192,13 +192,13 @@ __jffs2_dbg_acct_paranoia_check_nolock(struct jffs2_sb_info *c,
                else
                        my_dirty_size += totlen;
-                if ((!ref2->next_phys) != (ref2 == jeb->last_node)) {
+                if ((!ref_next(ref2)) != (ref2 == jeb->last_node)) {
-                        JFFS2_ERROR("node_ref for node at %#08x (mem %p) has next_phys at %#08x (mem %p), last_node is at %#08x (mem %p).\n",
+                        JFFS2_ERROR("node_ref for node at %#08x (mem %p) has next at %#08x (mem %p), last_node is at %#08x (mem %p).\n",
-                                ref_offset(ref2), ref2, ref_offset(ref2->next_phys), ref2->next_phys,
+                                    ref_offset(ref2), ref2, ref_offset(ref_next(ref2)), ref_next(ref2),
-                                ref_offset(jeb->last_node), jeb->last_node);
+                                    ref_offset(jeb->last_node), jeb->last_node);
                        goto error;
                }
-                ref2 = ref2->next_phys;
+                ref2 = ref_next(ref2);
        }
        if (my_used_size != jeb->used_size) {
@@ -268,9 +268,9 @@ __jffs2_dbg_dump_node_refs_nolock(struct jffs2_sb_info *c,
        }
        printk(JFFS2_DBG);
-        for (ref = jeb->first_node; ; ref = ref->next_phys) {
+        for (ref = jeb->first_node; ; ref = ref_next(ref)) {
                printk("%#08x(%#x)", ref_offset(ref), ref->__totlen);
-                if (ref->next_phys)
+                if (ref_next(ref))
                        printk("->");
                else
                        break;
diff --git a/fs/jffs2/debug.h b/fs/jffs2/debug.h
index 162af6dfe292..3daf3bca0376 100644
--- a/fs/jffs2/debug.h
+++ b/fs/jffs2/debug.h
@@ -13,7 +13,6 @@
 #ifndef _JFFS2_DEBUG_H_
 #define _JFFS2_DEBUG_H_
-#include <linux/config.h>
 #ifndef CONFIG_JFFS2_FS_DEBUG
 #define CONFIG_JFFS2_FS_DEBUG 0
@@ -171,6 +170,12 @@
 #define dbg_memalloc(fmt, ...)
 #endif
+/* Watch the XATTR subsystem */
+#ifdef JFFS2_DBG_XATTR_MESSAGES
+#define dbg_xattr(fmt, ...)  JFFS2_DEBUG(fmt, ##__VA_ARGS__)
+#else
+#define dbg_xattr(fmt, ...)
+#endif 
 /* "Sanity" checks */
 void
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index 8bc7a5018e40..edd8371fc6a5 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -17,8 +17,8 @@
 #include <linux/fs.h>
 #include <linux/crc32.h>
 #include <linux/jffs2.h>
-#include <linux/jffs2_fs_i.h>
+#include "jffs2_fs_i.h"
-#include <linux/jffs2_fs_sb.h>
+#include "jffs2_fs_sb.h"
 #include <linux/time.h>
 #include "nodelist.h"
@@ -57,7 +57,12 @@ struct inode_operations jffs2_dir_inode_operations =
        .rmdir =        jffs2_rmdir,
        .mknod =        jffs2_mknod,
        .rename =       jffs2_rename,
+        .permission =   jffs2_permission,
        .setattr =      jffs2_setattr,
+        .setxattr =     jffs2_setxattr,
+        .getxattr =     jffs2_getxattr,
+        .listxattr =    jffs2_listxattr,
+        .removexattr =  jffs2_removexattr
 };
 /***********************************************************************/
@@ -78,6 +83,9 @@ static struct dentry *jffs2_lookup(struct inode *dir_i, struct dentry *target,
        D1(printk(KERN_DEBUG "jffs2_lookup()\n"));
+        if (target->d_name.len > JFFS2_MAX_NAME_LEN)
+                return ERR_PTR(-ENAMETOOLONG);
        dir_f = JFFS2_INODE_INFO(dir_i);
        c = JFFS2_SB_INFO(dir_i->i_sb);
@@ -206,12 +214,15 @@ static int jffs2_create(struct inode *dir_i, struct dentry *dentry, int mode,
        ret = jffs2_do_create(c, dir_f, f, ri,
                              dentry->d_name.name, dentry->d_name.len);
-        if (ret) {
+        if (ret)
-                make_bad_inode(inode);
+                goto fail;
-                iput(inode);
-                jffs2_free_raw_inode(ri);
+        ret = jffs2_init_security(inode, dir_i);
-                return ret;
+        if (ret)
-        }
+                goto fail;
+        ret = jffs2_init_acl(inode, dir_i);
+        if (ret)
+                goto fail;
        dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(ri->ctime));
@@ -221,6 +232,12 @@ static int jffs2_create(struct inode *dir_i, struct dentry *dentry, int mode,
        D1(printk(KERN_DEBUG "jffs2_create: Created ino #%lu with mode %o, nlink %d(%d). nrpages %ld\n",
                  inode->i_ino, inode->i_mode, inode->i_nlink, f->inocache->nlink, inode->i_mapping->nrpages));
        return 0;
+ fail:
+        make_bad_inode(inode);
+        iput(inode);
+        jffs2_free_raw_inode(ri);
+        return ret;
 }
 /***********************************************************************/
@@ -291,7 +308,7 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
        struct jffs2_full_dnode *fn;
        struct jffs2_full_dirent *fd;
        int namelen;
-        uint32_t alloclen, phys_ofs;
+        uint32_t alloclen;
        int ret, targetlen = strlen(target);
        /* FIXME: If you care. We'd need to use frags for the target
@@ -310,8 +327,8 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
         * Just the node will do for now, though
         */
        namelen = dentry->d_name.len;
-        ret = jffs2_reserve_space(c, sizeof(*ri) + targetlen, &phys_ofs, &alloclen,
+        ret = jffs2_reserve_space(c, sizeof(*ri) + targetlen, &alloclen,
-                                ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE);
+                                  ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE);
        if (ret) {
                jffs2_free_raw_inode(ri);
@@ -339,7 +356,7 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
        ri->data_crc = cpu_to_je32(crc32(0, target, targetlen));
        ri->node_crc = cpu_to_je32(crc32(0, ri, sizeof(*ri)-8));
-        fn = jffs2_write_dnode(c, f, ri, target, targetlen, phys_ofs, ALLOC_NORMAL);
+        fn = jffs2_write_dnode(c, f, ri, target, targetlen, ALLOC_NORMAL);
        jffs2_free_raw_inode(ri);
@@ -371,8 +388,20 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
        up(&f->sem);
        jffs2_complete_reservation(c);
-        ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &phys_ofs, &alloclen,
-                                ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen));
+        ret = jffs2_init_security(inode, dir_i);
+        if (ret) {
+                jffs2_clear_inode(inode);
+                return ret;
+        }
+        ret = jffs2_init_acl(inode, dir_i);
+        if (ret) {
+                jffs2_clear_inode(inode);
+                return ret;
+        }
+        ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen,
+                                  ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen));
        if (ret) {
                /* Eep. */
                jffs2_clear_inode(inode);
@@ -404,7 +433,7 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
        rd->node_crc = cpu_to_je32(crc32(0, rd, sizeof(*rd)-8));
        rd->name_crc = cpu_to_je32(crc32(0, dentry->d_name.name, namelen));
-        fd = jffs2_write_dirent(c, dir_f, rd, dentry->d_name.name, namelen, phys_ofs, ALLOC_NORMAL);
+        fd = jffs2_write_dirent(c, dir_f, rd, dentry->d_name.name, namelen, ALLOC_NORMAL);
        if (IS_ERR(fd)) {
                /* dirent failed to write. Delete the inode normally
@@ -442,7 +471,7 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode)
        struct jffs2_full_dnode *fn;
        struct jffs2_full_dirent *fd;
        int namelen;
-        uint32_t alloclen, phys_ofs;
+        uint32_t alloclen;
        int ret;
        mode |= S_IFDIR;
@@ -457,8 +486,8 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode)
         * Just the node will do for now, though
         */
        namelen = dentry->d_name.len;
-        ret = jffs2_reserve_space(c, sizeof(*ri), &phys_ofs, &alloclen, ALLOC_NORMAL,
+        ret = jffs2_reserve_space(c, sizeof(*ri), &alloclen, ALLOC_NORMAL,
-                                JFFS2_SUMMARY_INODE_SIZE);
+                                  JFFS2_SUMMARY_INODE_SIZE);
        if (ret) {
                jffs2_free_raw_inode(ri);
@@ -483,7 +512,7 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode)
        ri->data_crc = cpu_to_je32(0);
        ri->node_crc = cpu_to_je32(crc32(0, ri, sizeof(*ri)-8));
-        fn = jffs2_write_dnode(c, f, ri, NULL, 0, phys_ofs, ALLOC_NORMAL);
+        fn = jffs2_write_dnode(c, f, ri, NULL, 0, ALLOC_NORMAL);
        jffs2_free_raw_inode(ri);
@@ -501,8 +530,20 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode)
        up(&f->sem);
        jffs2_complete_reservation(c);
-        ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &phys_ofs, &alloclen,
-                                ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen));
+        ret = jffs2_init_security(inode, dir_i);
+        if (ret) {
+                jffs2_clear_inode(inode);
+                return ret;
+        }
+        ret = jffs2_init_acl(inode, dir_i);
+        if (ret) {
+                jffs2_clear_inode(inode);
+                return ret;
+        }
+        ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen,
+                                  ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen));
        if (ret) {
                /* Eep. */
                jffs2_clear_inode(inode);
@@ -534,7 +575,7 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode)
        rd->node_crc = cpu_to_je32(crc32(0, rd, sizeof(*rd)-8));
        rd->name_crc = cpu_to_je32(crc32(0, dentry->d_name.name, namelen));
-        fd = jffs2_write_dirent(c, dir_f, rd, dentry->d_name.name, namelen, phys_ofs, ALLOC_NORMAL);
+        fd = jffs2_write_dirent(c, dir_f, rd, dentry->d_name.name, namelen, ALLOC_NORMAL);
        if (IS_ERR(fd)) {
                /* dirent failed to write. Delete the inode normally
@@ -588,12 +629,12 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, int mode, de
        struct jffs2_full_dnode *fn;
        struct jffs2_full_dirent *fd;
        int namelen;
-        jint16_t dev;
+        union jffs2_device_node dev;
        int devlen = 0;
-        uint32_t alloclen, phys_ofs;
+        uint32_t alloclen;
        int ret;
-        if (!old_valid_dev(rdev))
+        if (!new_valid_dev(rdev))
                return -EINVAL;
        ri = jffs2_alloc_raw_inode();
@@ -602,17 +643,15 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, int mode, de
        c = JFFS2_SB_INFO(dir_i->i_sb);
-        if (S_ISBLK(mode) || S_ISCHR(mode)) {
+        if (S_ISBLK(mode) || S_ISCHR(mode))
-                dev = cpu_to_je16(old_encode_dev(rdev));
+                devlen = jffs2_encode_dev(&dev, rdev);
-                devlen = sizeof(dev);
-        }
        /* Try to reserve enough space for both node and dirent.
         * Just the node will do for now, though
         */
        namelen = dentry->d_name.len;
-        ret = jffs2_reserve_space(c, sizeof(*ri) + devlen, &phys_ofs, &alloclen,
+        ret = jffs2_reserve_space(c, sizeof(*ri) + devlen, &alloclen,
-                                ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE);
+                                  ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE);
        if (ret) {
                jffs2_free_raw_inode(ri);
@@ -639,7 +678,7 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, int mode, de
        ri->data_crc = cpu_to_je32(crc32(0, &dev, devlen));
        ri->node_crc = cpu_to_je32(crc32(0, ri, sizeof(*ri)-8));
-        fn = jffs2_write_dnode(c, f, ri, (char *)&dev, devlen, phys_ofs, ALLOC_NORMAL);
+        fn = jffs2_write_dnode(c, f, ri, (char *)&dev, devlen, ALLOC_NORMAL);
        jffs2_free_raw_inode(ri);
@@ -657,8 +696,20 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, int mode, de
        up(&f->sem);
        jffs2_complete_reservation(c);
-        ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &phys_ofs, &alloclen,
-                                ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen));
+        ret = jffs2_init_security(inode, dir_i);
+        if (ret) {
+                jffs2_clear_inode(inode);
+                return ret;
+        }
+        ret = jffs2_init_acl(inode, dir_i);
+        if (ret) {
+                jffs2_clear_inode(inode);
+                return ret;
+        }
+        ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen,
+                                  ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen));
        if (ret) {
                /* Eep. */
                jffs2_clear_inode(inode);
@@ -693,7 +744,7 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, int mode, de
        rd->node_crc = cpu_to_je32(crc32(0, rd, sizeof(*rd)-8));
        rd->name_crc = cpu_to_je32(crc32(0, dentry->d_name.name, namelen));
-        fd = jffs2_write_dirent(c, dir_f, rd, dentry->d_name.name, namelen, phys_ofs, ALLOC_NORMAL);
+        fd = jffs2_write_dirent(c, dir_f, rd, dentry->d_name.name, namelen, ALLOC_NORMAL);
        if (IS_ERR(fd)) {
                /* dirent failed to write. Delete the inode normally
diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c
index dad68fdffe9e..ad0121088dde 100644
--- a/fs/jffs2/erase.c
+++ b/fs/jffs2/erase.c
@@ -30,7 +30,6 @@ static void jffs2_erase_callback(struct erase_info *);
 #endif
 static void jffs2_erase_failed(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, uint32_t bad_offset);
 static void jffs2_erase_succeeded(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb);
-static void jffs2_free_all_node_refs(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb);
 static void jffs2_mark_erased_block(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb);
 static void jffs2_erase_block(struct jffs2_sb_info *c,
@@ -54,8 +53,7 @@ static void jffs2_erase_block(struct jffs2_sb_info *c,
        if (!instr) {
                printk(KERN_WARNING "kmalloc for struct erase_info in jffs2_erase_block failed. Refiling block for later\n");
                spin_lock(&c->erase_completion_lock);
-                list_del(&jeb->list);
+                list_move(&jeb->list, &c->erase_pending_list);
-                list_add(&jeb->list, &c->erase_pending_list);
                c->erasing_size -= c->sector_size;
                c->dirty_size += c->sector_size;
                jeb->dirty_size = c->sector_size;
@@ -87,8 +85,7 @@ static void jffs2_erase_block(struct jffs2_sb_info *c,
                /* Erase failed immediately. Refile it on the list */
                D1(printk(KERN_DEBUG "Erase at 0x%08x failed: %d. Refiling on erase_pending_list\n", jeb->offset, ret));
                spin_lock(&c->erase_completion_lock);
-                list_del(&jeb->list);
+                list_move(&jeb->list, &c->erase_pending_list);
-                list_add(&jeb->list, &c->erase_pending_list);
                c->erasing_size -= c->sector_size;
                c->dirty_size += c->sector_size;
                jeb->dirty_size = c->sector_size;
@@ -136,7 +133,7 @@ void jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count)
                        c->used_size -= jeb->used_size;
                        c->dirty_size -= jeb->dirty_size;
                        jeb->wasted_size = jeb->used_size = jeb->dirty_size = jeb->free_size = 0;
-                        jffs2_free_all_node_refs(c, jeb);
+                        jffs2_free_jeb_node_refs(c, jeb);
                        list_add(&jeb->list, &c->erasing_list);
                        spin_unlock(&c->erase_completion_lock);
@@ -162,8 +159,7 @@ static void jffs2_erase_succeeded(struct jffs2_sb_info *c, struct jffs2_eraseblo
 {
        D1(printk(KERN_DEBUG "Erase completed successfully at 0x%08x\n", jeb->offset));
        spin_lock(&c->erase_completion_lock);
-        list_del(&jeb->list);
+        list_move_tail(&jeb->list, &c->erase_complete_list);
-        list_add_tail(&jeb->list, &c->erase_complete_list);
        spin_unlock(&c->erase_completion_lock);
        /* Ensure that kupdated calls us again to mark them clean */
        jffs2_erase_pending_trigger(c);
@@ -179,8 +175,7 @@ static void jffs2_erase_failed(struct jffs2_sb_info *c, struct jffs2_eraseblock
                if (!jffs2_write_nand_badblock(c, jeb, bad_offset)) {
                        /* We'd like to give this block another try. */
                        spin_lock(&c->erase_completion_lock);
-                        list_del(&jeb->list);
+                        list_move(&jeb->list, &c->erase_pending_list);
-                        list_add(&jeb->list, &c->erase_pending_list);
                        c->erasing_size -= c->sector_size;
                        c->dirty_size += c->sector_size;
                        jeb->dirty_size = c->sector_size;
@@ -192,8 +187,7 @@ static void jffs2_erase_failed(struct jffs2_sb_info *c, struct jffs2_eraseblock
        spin_lock(&c->erase_completion_lock);
        c->erasing_size -= c->sector_size;
        c->bad_size += c->sector_size;
-        list_del(&jeb->list);
+        list_move(&jeb->list, &c->bad_list);
-        list_add(&jeb->list, &c->bad_list);
        c->nr_erasing_blocks--;
        spin_unlock(&c->erase_completion_lock);
        wake_up(&c->erase_wait);
@@ -254,7 +248,8 @@ static inline void jffs2_remove_node_refs_from_ino_list(struct jffs2_sb_info *c,
        /* PARANOIA */
        if (!ic) {
-                printk(KERN_WARNING "inode_cache not found in remove_node_refs()!!\n");
+                JFFS2_WARNING("inode_cache/xattr_datum/xattr_ref"
+                              " not found in remove_node_refs()!!\n");
                return;
        }
@@ -279,26 +274,42 @@ static inline void jffs2_remove_node_refs_from_ino_list(struct jffs2_sb_info *c,
                printk("\n");
        });
-        if (ic->nodes == (void *)ic && ic->nlink == 0)
+        switch (ic->class) {
-                jffs2_del_ino_cache(c, ic);
+#ifdef CONFIG_JFFS2_FS_XATTR
+                case RAWNODE_CLASS_XATTR_DATUM:
+                        jffs2_release_xattr_datum(c, (struct jffs2_xattr_datum *)ic);
+                        break;
+                case RAWNODE_CLASS_XATTR_REF:
+                        jffs2_release_xattr_ref(c, (struct jffs2_xattr_ref *)ic);
+                        break;
+#endif
+                default:
+                        if (ic->nodes == (void *)ic && ic->nlink == 0)
+                                jffs2_del_ino_cache(c, ic);
+        }
 }
-static void jffs2_free_all_node_refs(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb)
+void jffs2_free_jeb_node_refs(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb)
 {
-        struct jffs2_raw_node_ref *ref;
+        struct jffs2_raw_node_ref *block, *ref;
        D1(printk(KERN_DEBUG "Freeing all node refs for eraseblock offset 0x%08x\n", jeb->offset));
-        while(jeb->first_node) {
-                ref = jeb->first_node;
-                jeb->first_node = ref->next_phys;
-                /* Remove from the inode-list */
+        block = ref = jeb->first_node;
-                if (ref->next_in_ino)
+        while (ref) {
+                if (ref->flash_offset == REF_LINK_NODE) {
+                        ref = ref->next_in_ino;
+                        jffs2_free_refblock(block);
+                        block = ref;
+                        continue;
+                }
+                if (ref->flash_offset != REF_EMPTY_NODE && ref->next_in_ino)
                        jffs2_remove_node_refs_from_ino_list(c, ref, jeb);
                /* else it was a non-inode node or already removed, so don't bother */
-                jffs2_free_raw_node_ref(ref);
+                ref++;
        }
-        jeb->last_node = NULL;
+        jeb->first_node = jeb->last_node = NULL;
 }
 static int jffs2_block_check_erase(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, uint32_t *bad_offset)
@@ -351,7 +362,6 @@ fail:
 static void jffs2_mark_erased_block(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb)
 {
-        struct jffs2_raw_node_ref *marker_ref = NULL;
        size_t retlen;
        int ret;
        uint32_t bad_offset;
@@ -373,12 +383,8 @@ static void jffs2_mark_erased_block(struct jffs2_sb_info *c, struct jffs2_eraseb
                                goto filebad;
                }
-                jeb->first_node = jeb->last_node = NULL;
+                /* Everything else got zeroed before the erase */
                jeb->free_size = c->sector_size;
-                jeb->used_size = 0;
-                jeb->dirty_size = 0;
-                jeb->wasted_size = 0;
        } else {
                struct kvec vecs[1];
@@ -388,11 +394,7 @@ static void jffs2_mark_erased_block(struct jffs2_sb_info *c, struct jffs2_eraseb
                        .totlen =       cpu_to_je32(c->cleanmarker_size)
                };
-                marker_ref = jffs2_alloc_raw_node_ref();
+                jffs2_prealloc_raw_node_refs(c, jeb, 1);
-                if (!marker_ref) {
-                        printk(KERN_WARNING "Failed to allocate raw node ref for clean marker. Refiling\n");
-                        goto refile;
-                }
                marker.hdr_crc = cpu_to_je32(crc32(0, &marker, sizeof(struct jffs2_unknown_node)-4));
@@ -408,21 +410,13 @@ static void jffs2_mark_erased_block(struct jffs2_sb_info *c, struct jffs2_eraseb
                                printk(KERN_WARNING "Short write to newly-erased block at 0x%08x: Wanted %zd, got %zd\n",
                                       jeb->offset, sizeof(marker), retlen);
-                        jffs2_free_raw_node_ref(marker_ref);
                        goto filebad;
                }
-                marker_ref->next_in_ino = NULL;
+                /* Everything else got zeroed before the erase */
-                marker_ref->next_phys = NULL;
+                jeb->free_size = c->sector_size;
-                marker_ref->flash_offset = jeb->offset | REF_NORMAL;
+                /* FIXME Special case for cleanmarker in empty block */
-                marker_ref->__totlen = c->cleanmarker_size;
+                jffs2_link_node_ref(c, jeb, jeb->offset | REF_NORMAL, c->cleanmarker_size, NULL);
-                jeb->first_node = jeb->last_node = marker_ref;
-                jeb->free_size = c->sector_size - c->cleanmarker_size;
-                jeb->used_size = c->cleanmarker_size;
-                jeb->dirty_size = 0;
-                jeb->wasted_size = 0;
        }
        spin_lock(&c->erase_completion_lock);
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index 9f4171213e58..3ed6e3e120b6 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -54,10 +54,15 @@ const struct file_operations jffs2_file_operations =
 struct inode_operations jffs2_file_inode_operations =
 {
-        .setattr =      jffs2_setattr
+        .permission =   jffs2_permission,
+        .setattr =      jffs2_setattr,
+        .setxattr =     jffs2_setxattr,
+        .getxattr =     jffs2_getxattr,
+        .listxattr =    jffs2_listxattr,
+        .removexattr =  jffs2_removexattr
 };
-struct address_space_operations jffs2_file_address_operations =
+const struct address_space_operations jffs2_file_address_operations =
 {
        .readpage =     jffs2_readpage,
        .prepare_write =jffs2_prepare_write,
@@ -129,13 +134,13 @@ static int jffs2_prepare_write (struct file *filp, struct page *pg,
                struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb);
                struct jffs2_raw_inode ri;
                struct jffs2_full_dnode *fn;
-                uint32_t phys_ofs, alloc_len;
+                uint32_t alloc_len;
                D1(printk(KERN_DEBUG "Writing new hole frag 0x%x-0x%x between current EOF and new page\n",
                          (unsigned int)inode->i_size, pageofs));
-                ret = jffs2_reserve_space(c, sizeof(ri), &phys_ofs, &alloc_len,
+                ret = jffs2_reserve_space(c, sizeof(ri), &alloc_len,
-                                        ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE);
+                                          ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE);
                if (ret)
                        return ret;
@@ -161,7 +166,7 @@ static int jffs2_prepare_write (struct file *filp, struct page *pg,
                ri.node_crc = cpu_to_je32(crc32(0, &ri, sizeof(ri)-8));
                ri.data_crc = cpu_to_je32(0);
-                fn = jffs2_write_dnode(c, f, &ri, NULL, 0, phys_ofs, ALLOC_NORMAL);
+                fn = jffs2_write_dnode(c, f, &ri, NULL, 0, ALLOC_NORMAL);
                if (IS_ERR(fn)) {
                        ret = PTR_ERR(fn);
@@ -215,12 +220,20 @@ static int jffs2_commit_write (struct file *filp, struct page *pg,
        D1(printk(KERN_DEBUG "jffs2_commit_write(): ino #%lu, page at 0x%lx, range %d-%d, flags %lx\n",
                  inode->i_ino, pg->index << PAGE_CACHE_SHIFT, start, end, pg->flags));
-        if (!start && end == PAGE_CACHE_SIZE) {
+        if (end == PAGE_CACHE_SIZE) {
-                /* We need to avoid deadlock with page_cache_read() in
+                if (!start) {
-                   jffs2_garbage_collect_pass(). So we have to mark the
+                        /* We need to avoid deadlock with page_cache_read() in
-                   page up to date, to prevent page_cache_read() from
+                           jffs2_garbage_collect_pass(). So we have to mark the
-                   trying to re-lock it. */
+                           page up to date, to prevent page_cache_read() from
-                SetPageUptodate(pg);
+                           trying to re-lock it. */
+                        SetPageUptodate(pg);
+                } else {
+                        /* When writing out the end of a page, write out the 
+                           _whole_ page. This helps to reduce the number of
+                           nodes in files which have many short writes, like
+                           syslog files. */
+                        start = aligned_start = 0;
+                }
        }
        ri = jffs2_alloc_raw_inode();
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 09e5d10b8840..4780f82825d6 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -12,7 +12,6 @@
 */
 #include <linux/capability.h>
-#include <linux/config.h>
 #include <linux/kernel.h>
 #include <linux/sched.h>
 #include <linux/fs.h>
@@ -33,11 +32,11 @@ static int jffs2_do_setattr (struct inode *inode, struct iattr *iattr)
        struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
        struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb);
        struct jffs2_raw_inode *ri;
-        unsigned short dev;
+        union jffs2_device_node dev;
        unsigned char *mdata = NULL;
        int mdatalen = 0;
        unsigned int ivalid;
-        uint32_t phys_ofs, alloclen;
+        uint32_t alloclen;
        int ret;
        D1(printk(KERN_DEBUG "jffs2_setattr(): ino #%lu\n", inode->i_ino));
        ret = inode_change_ok(inode, iattr);
@@ -51,20 +50,24 @@ static int jffs2_do_setattr (struct inode *inode, struct iattr *iattr)
           it out again with the appropriate data attached */
        if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode)) {
                /* For these, we don't actually need to read the old node */
-                dev = old_encode_dev(inode->i_rdev);
+                mdatalen = jffs2_encode_dev(&dev, inode->i_rdev);
                mdata = (char *)&dev;
-                mdatalen = sizeof(dev);
                D1(printk(KERN_DEBUG "jffs2_setattr(): Writing %d bytes of kdev_t\n", mdatalen));
        } else if (S_ISLNK(inode->i_mode)) {
+                down(&f->sem);
                mdatalen = f->metadata->size;
                mdata = kmalloc(f->metadata->size, GFP_USER);
-                if (!mdata)
+                if (!mdata) {
+                        up(&f->sem);
                        return -ENOMEM;
+                }
                ret = jffs2_read_dnode(c, f, f->metadata, mdata, 0, mdatalen);
                if (ret) {
+                        up(&f->sem);
                        kfree(mdata);
                        return ret;
                }
+                up(&f->sem);
                D1(printk(KERN_DEBUG "jffs2_setattr(): Writing %d bytes of symlink target\n", mdatalen));
        }
@@ -75,8 +78,8 @@ static int jffs2_do_setattr (struct inode *inode, struct iattr *iattr)
                return -ENOMEM;
        }
-        ret = jffs2_reserve_space(c, sizeof(*ri) + mdatalen, &phys_ofs, &alloclen,
+        ret = jffs2_reserve_space(c, sizeof(*ri) + mdatalen, &alloclen,
-                                ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE);
+                                  ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE);
        if (ret) {
                jffs2_free_raw_inode(ri);
                if (S_ISLNK(inode->i_mode & S_IFMT))
@@ -127,7 +130,7 @@ static int jffs2_do_setattr (struct inode *inode, struct iattr *iattr)
        else
                ri->data_crc = cpu_to_je32(0);
-        new_metadata = jffs2_write_dnode(c, f, ri, mdata, mdatalen, phys_ofs, ALLOC_NORMAL);
+        new_metadata = jffs2_write_dnode(c, f, ri, mdata, mdatalen, ALLOC_NORMAL);
        if (S_ISLNK(inode->i_mode))
                kfree(mdata);
@@ -180,12 +183,17 @@ static int jffs2_do_setattr (struct inode *inode, struct iattr *iattr)
 int jffs2_setattr(struct dentry *dentry, struct iattr *iattr)
 {
-        return jffs2_do_setattr(dentry->d_inode, iattr);
+        int rc;
+        rc = jffs2_do_setattr(dentry->d_inode, iattr);
+        if (!rc && (iattr->ia_valid & ATTR_MODE))
+                rc = jffs2_acl_chmod(dentry->d_inode);
+        return rc;
 }
-int jffs2_statfs(struct super_block *sb, struct kstatfs *buf)
+int jffs2_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
-        struct jffs2_sb_info *c = JFFS2_SB_INFO(sb);
+        struct jffs2_sb_info *c = JFFS2_SB_INFO(dentry->d_sb);
        unsigned long avail;
        buf->f_type = JFFS2_SUPER_MAGIC;
@@ -218,7 +226,6 @@ void jffs2_clear_inode (struct inode *inode)
        struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
        D1(printk(KERN_DEBUG "jffs2_clear_inode(): ino #%lu mode %o\n", inode->i_ino, inode->i_mode));
        jffs2_do_clear_inode(c, f);
 }
@@ -227,6 +234,8 @@ void jffs2_read_inode (struct inode *inode)
        struct jffs2_inode_info *f;
        struct jffs2_sb_info *c;
        struct jffs2_raw_inode latest_node;
+        union jffs2_device_node jdev;
+        dev_t rdev = 0;
        int ret;
        D1(printk(KERN_DEBUG "jffs2_read_inode(): inode->i_ino == %lu\n", inode->i_ino));
@@ -258,7 +267,6 @@ void jffs2_read_inode (struct inode *inode)
        inode->i_blocks = (inode->i_size + 511) >> 9;
        switch (inode->i_mode & S_IFMT) {
-                jint16_t rdev;
        case S_IFLNK:
                inode->i_op = &jffs2_symlink_inode_operations;
@@ -292,8 +300,16 @@ void jffs2_read_inode (struct inode *inode)
        case S_IFBLK:
        case S_IFCHR:
                /* Read the device numbers from the media */
+                if (f->metadata->size != sizeof(jdev.old) &&
+                    f->metadata->size != sizeof(jdev.new)) {
+                        printk(KERN_NOTICE "Device node has strange size %d\n", f->metadata->size);
+                        up(&f->sem);
+                        jffs2_do_clear_inode(c, f);
+                        make_bad_inode(inode);
+                        return;
+                }
                D1(printk(KERN_DEBUG "Reading device numbers from flash\n"));
-                if (jffs2_read_dnode(c, f, f->metadata, (char *)&rdev, 0, sizeof(rdev)) < 0) {
+                if (jffs2_read_dnode(c, f, f->metadata, (char *)&jdev, 0, f->metadata->size) < 0) {
                        /* Eep */
                        printk(KERN_NOTICE "Read device numbers for inode %lu failed\n", (unsigned long)inode->i_ino);
                        up(&f->sem);
@@ -301,12 +317,15 @@ void jffs2_read_inode (struct inode *inode)
                        make_bad_inode(inode);
                        return;
                }
+                if (f->metadata->size == sizeof(jdev.old))
+                        rdev = old_decode_dev(je16_to_cpu(jdev.old));
+                else
+                        rdev = new_decode_dev(je32_to_cpu(jdev.new));
        case S_IFSOCK:
        case S_IFIFO:
                inode->i_op = &jffs2_file_inode_operations;
-                init_special_inode(inode, inode->i_mode,
+                init_special_inode(inode, inode->i_mode, rdev);
-                                   old_decode_dev((je16_to_cpu(rdev))));
                break;
        default:
@@ -492,6 +511,8 @@ int jffs2_do_fill_super(struct super_block *sb, void *data, int silent)
        }
        memset(c->inocache_list, 0, INOCACHE_HASHSIZE * sizeof(struct jffs2_inode_cache *));
+        jffs2_init_xattr_subsystem(c);
        if ((ret = jffs2_do_mount_fs(c)))
                goto out_inohash;
@@ -526,6 +547,7 @@ int jffs2_do_fill_super(struct super_block *sb, void *data, int silent)
        else
                kfree(c->blocks);
 out_inohash:
+        jffs2_clear_xattr_subsystem(c);
        kfree(c->inocache_list);
 out_wbuf:
        jffs2_flash_cleanup(c);
@@ -639,13 +661,6 @@ static int jffs2_flash_setup(struct jffs2_sb_info *c) {
                        return ret;
        }
-        /* add setups for other bizarre flashes here... */
-        if (jffs2_nor_ecc(c)) {
-                ret = jffs2_nor_ecc_flash_setup(c);
-                if (ret)
-                        return ret;
-        }
        /* and Dataflash */
        if (jffs2_dataflash(c)) {
                ret = jffs2_dataflash_setup(c);
@@ -669,11 +684,6 @@ void jffs2_flash_cleanup(struct jffs2_sb_info *c) {
                jffs2_nand_flash_cleanup(c);
        }
-        /* add cleanups for other bizarre flashes here... */
-        if (jffs2_nor_ecc(c)) {
-                jffs2_nor_ecc_flash_cleanup(c);
-        }
        /* and DataFlash */
        if (jffs2_dataflash(c)) {
                jffs2_dataflash_cleanup(c);
diff --git a/fs/jffs2/gc.c b/fs/jffs2/gc.c
index f9ffece453a3..daff3341ff92 100644
--- a/fs/jffs2/gc.c
+++ b/fs/jffs2/gc.c
@@ -125,6 +125,7 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
        struct jffs2_eraseblock *jeb;
        struct jffs2_raw_node_ref *raw;
        int ret = 0, inum, nlink;
+        int xattr = 0;
        if (down_interruptible(&c->alloc_sem))
                return -EINTR;
@@ -138,7 +139,7 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
                   the node CRCs etc. Do it now. */
                /* checked_ino is protected by the alloc_sem */
-                if (c->checked_ino > c->highest_ino) {
+                if (c->checked_ino > c->highest_ino && xattr) {
                        printk(KERN_CRIT "Checked all inodes but still 0x%x bytes of unchecked space?\n",
                               c->unchecked_size);
                        jffs2_dbg_dump_block_lists_nolock(c);
@@ -148,6 +149,9 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
                spin_unlock(&c->erase_completion_lock);
+                if (!xattr)
+                        xattr = jffs2_verify_xattr(c);
                spin_lock(&c->inocache_lock);
                ic = jffs2_get_ino_cache(c, c->checked_ino++);
@@ -161,6 +165,7 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
                        D1(printk(KERN_DEBUG "Skipping check of ino #%d with nlink zero\n",
                                  ic->ino));
                        spin_unlock(&c->inocache_lock);
+                        jffs2_xattr_delete_inode(c, ic);
                        continue;
                }
                switch(ic->state) {
@@ -181,6 +186,10 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
                           and trigger the BUG() above while we haven't yet
                           finished checking all its nodes */
                        D1(printk(KERN_DEBUG "Waiting for ino #%u to finish reading\n", ic->ino));
+                        /* We need to come back again for the _same_ inode. We've
+                         made no progress in this case, but that should be OK */
+                        c->checked_ino--;
                        up(&c->alloc_sem);
                        sleep_on_spinunlock(&c->inocache_wq, &c->inocache_lock);
                        return 0;
@@ -231,7 +240,7 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
        while(ref_obsolete(raw)) {
                D1(printk(KERN_DEBUG "Node at 0x%08x is obsolete... skipping\n", ref_offset(raw)));
-                raw = raw->next_phys;
+                raw = ref_next(raw);
                if (unlikely(!raw)) {
                        printk(KERN_WARNING "eep. End of raw list while still supposedly nodes to GC\n");
                        printk(KERN_WARNING "erase block at 0x%08x. free_size 0x%08x, dirty_size 0x%08x, used_size 0x%08x\n",
@@ -248,16 +257,36 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
        if (!raw->next_in_ino) {
                /* Inode-less node. Clean marker, snapshot or something like that */
-                /* FIXME: If it's something that needs to be copied, including something
-                   we don't grok that has JFFS2_NODETYPE_RWCOMPAT_COPY, we should do so */
                spin_unlock(&c->erase_completion_lock);
-                jffs2_mark_node_obsolete(c, raw);
+                if (ref_flags(raw) == REF_PRISTINE) {
+                        /* It's an unknown node with JFFS2_FEATURE_RWCOMPAT_COPY */
+                        jffs2_garbage_collect_pristine(c, NULL, raw);
+                } else {
+                        /* Just mark it obsolete */
+                        jffs2_mark_node_obsolete(c, raw);
+                }
                up(&c->alloc_sem);
                goto eraseit_lock;
        }
        ic = jffs2_raw_ref_to_ic(raw);
+#ifdef CONFIG_JFFS2_FS_XATTR
+        /* When 'ic' refers xattr_datum/xattr_ref, this node is GCed as xattr.
+         * We can decide whether this node is inode or xattr by ic->class.     */
+        if (ic->class == RAWNODE_CLASS_XATTR_DATUM
+            || ic->class == RAWNODE_CLASS_XATTR_REF) {
+                spin_unlock(&c->erase_completion_lock);
+                if (ic->class == RAWNODE_CLASS_XATTR_DATUM) {
+                        ret = jffs2_garbage_collect_xattr_datum(c, (struct jffs2_xattr_datum *)ic, raw);
+                } else {
+                        ret = jffs2_garbage_collect_xattr_ref(c, (struct jffs2_xattr_ref *)ic, raw);
+                }
+                goto release_sem;
+        }
+#endif
        /* We need to hold the inocache. Either the erase_completion_lock or
           the inocache_lock are sufficient; we trade down since the inocache_lock
           causes less contention. */
@@ -499,7 +528,6 @@ static int jffs2_garbage_collect_pristine(struct jffs2_sb_info *c,
                                          struct jffs2_raw_node_ref *raw)
 {
        union jffs2_node_union *node;
-        struct jffs2_raw_node_ref *nraw;
        size_t retlen;
        int ret;
        uint32_t phys_ofs, alloclen;
@@ -508,15 +536,16 @@ static int jffs2_garbage_collect_pristine(struct jffs2_sb_info *c,
        D1(printk(KERN_DEBUG "Going to GC REF_PRISTINE node at 0x%08x\n", ref_offset(raw)));
-        rawlen = ref_totlen(c, c->gcblock, raw);
+        alloclen = rawlen = ref_totlen(c, c->gcblock, raw);
        /* Ask for a small amount of space (or the totlen if smaller) because we
           don't want to force wastage of the end of a block if splitting would
           work. */
-        ret = jffs2_reserve_space_gc(c, min_t(uint32_t, sizeof(struct jffs2_raw_inode) +
+        if (ic && alloclen > sizeof(struct jffs2_raw_inode) + JFFS2_MIN_DATA_LEN)
-                                JFFS2_MIN_DATA_LEN, rawlen), &phys_ofs, &alloclen, rawlen);
+                alloclen = sizeof(struct jffs2_raw_inode) + JFFS2_MIN_DATA_LEN;
-                                /* this is not the exact summary size of it,
-                                        it is only an upper estimation */
+        ret = jffs2_reserve_space_gc(c, alloclen, &alloclen, rawlen);
+        /* 'rawlen' is not the exact summary size; it is only an upper estimation */
        if (ret)
                return ret;
@@ -580,22 +609,17 @@ static int jffs2_garbage_collect_pristine(struct jffs2_sb_info *c,
                }
                break;
        default:
-                printk(KERN_WARNING "Unknown node type for REF_PRISTINE node at 0x%08x: 0x%04x\n",
+                /* If it's inode-less, we don't _know_ what it is. Just copy it intact */
-                       ref_offset(raw), je16_to_cpu(node->u.nodetype));
+                if (ic) {
-                goto bail;
+                        printk(KERN_WARNING "Unknown node type for REF_PRISTINE node at 0x%08x: 0x%04x\n",
-        }
+                               ref_offset(raw), je16_to_cpu(node->u.nodetype));
+                        goto bail;
-        nraw = jffs2_alloc_raw_node_ref();
+                }
-        if (!nraw) {
-                ret = -ENOMEM;
-                goto out_node;
        }
        /* OK, all the CRCs are good; this node can just be copied as-is. */
 retry:
-        nraw->flash_offset = phys_ofs;
+        phys_ofs = write_ofs(c);
-        nraw->__totlen = rawlen;
-        nraw->next_phys = NULL;
        ret = jffs2_flash_write(c, phys_ofs, rawlen, &retlen, (char *)node);
@@ -603,17 +627,11 @@ static int jffs2_garbage_collect_pristine(struct jffs2_sb_info *c,
                printk(KERN_NOTICE "Write of %d bytes at 0x%08x failed. returned %d, retlen %zd\n",
                       rawlen, phys_ofs, ret, retlen);
                if (retlen) {
-                        /* Doesn't belong to any inode */
+                        jffs2_add_physical_node_ref(c, phys_ofs | REF_OBSOLETE, rawlen, NULL);
-                        nraw->next_in_ino = NULL;
-                        nraw->flash_offset |= REF_OBSOLETE;
-                        jffs2_add_physical_node_ref(c, nraw);
-                        jffs2_mark_node_obsolete(c, nraw);
                } else {
-                        printk(KERN_NOTICE "Not marking the space at 0x%08x as dirty because the flash driver returned retlen zero\n", nraw->flash_offset);
+                        printk(KERN_NOTICE "Not marking the space at 0x%08x as dirty because the flash driver returned retlen zero\n", phys_ofs);
-                        jffs2_free_raw_node_ref(nraw);
                }
-                if (!retried && (nraw = jffs2_alloc_raw_node_ref())) {
+                if (!retried) {
                        /* Try to reallocate space and retry */
                        uint32_t dummy;
                        struct jffs2_eraseblock *jeb = &c->blocks[phys_ofs / c->sector_size];
@@ -625,7 +643,7 @@ static int jffs2_garbage_collect_pristine(struct jffs2_sb_info *c,
                        jffs2_dbg_acct_sanity_check(c,jeb);
                        jffs2_dbg_acct_paranoia_check(c, jeb);
-                        ret = jffs2_reserve_space_gc(c, rawlen, &phys_ofs, &dummy, rawlen);
+                        ret = jffs2_reserve_space_gc(c, rawlen, &dummy, rawlen);
                                                /* this is not the exact summary size of it,
                                                        it is only an upper estimation */
@@ -638,25 +656,13 @@ static int jffs2_garbage_collect_pristine(struct jffs2_sb_info *c,
                                goto retry;
                        }
                        D1(printk(KERN_DEBUG "Failed to allocate space to retry failed write: %d!\n", ret));
-                        jffs2_free_raw_node_ref(nraw);
                }
-                jffs2_free_raw_node_ref(nraw);
                if (!ret)
                        ret = -EIO;
                goto out_node;
        }
-        nraw->flash_offset |= REF_PRISTINE;
+        jffs2_add_physical_node_ref(c, phys_ofs | REF_PRISTINE, rawlen, ic);
-        jffs2_add_physical_node_ref(c, nraw);
-        /* Link into per-inode list. This is safe because of the ic
-           state being INO_STATE_GC. Note that if we're doing this
-           for an inode which is in-core, the 'nraw' pointer is then
-           going to be fetched from ic->nodes by our caller. */
-        spin_lock(&c->erase_completion_lock);
-        nraw->next_in_ino = ic->nodes;
-        ic->nodes = nraw;
-        spin_unlock(&c->erase_completion_lock);
        jffs2_mark_node_obsolete(c, raw);
        D1(printk(KERN_DEBUG "WHEEE! GC REF_PRISTINE node at 0x%08x succeeded\n", ref_offset(raw)));
@@ -675,19 +681,16 @@ static int jffs2_garbage_collect_metadata(struct jffs2_sb_info *c, struct jffs2_
        struct jffs2_full_dnode *new_fn;
        struct jffs2_raw_inode ri;
        struct jffs2_node_frag *last_frag;
-        jint16_t dev;
+        union jffs2_device_node dev;
        char *mdata = NULL, mdatalen = 0;
-        uint32_t alloclen, phys_ofs, ilen;
+        uint32_t alloclen, ilen;
        int ret;
        if (S_ISBLK(JFFS2_F_I_MODE(f)) ||
            S_ISCHR(JFFS2_F_I_MODE(f)) ) {
                /* For these, we don't actually need to read the old node */
-                /* FIXME: for minor or major > 255. */
+                mdatalen = jffs2_encode_dev(&dev, JFFS2_F_I_RDEV(f));
-                dev = cpu_to_je16(((JFFS2_F_I_RDEV_MAJ(f) << 8) |
-                        JFFS2_F_I_RDEV_MIN(f)));
                mdata = (char *)&dev;
-                mdatalen = sizeof(dev);
                D1(printk(KERN_DEBUG "jffs2_garbage_collect_metadata(): Writing %d bytes of kdev_t\n", mdatalen));
        } else if (S_ISLNK(JFFS2_F_I_MODE(f))) {
                mdatalen = fn->size;
@@ -706,7 +709,7 @@ static int jffs2_garbage_collect_metadata(struct jffs2_sb_info *c, struct jffs2_
        }
-        ret = jffs2_reserve_space_gc(c, sizeof(ri) + mdatalen, &phys_ofs, &alloclen,
+        ret = jffs2_reserve_space_gc(c, sizeof(ri) + mdatalen, &alloclen,
                                JFFS2_SUMMARY_INODE_SIZE);
        if (ret) {
                printk(KERN_WARNING "jffs2_reserve_space_gc of %zd bytes for garbage_collect_metadata failed: %d\n",
@@ -744,7 +747,7 @@ static int jffs2_garbage_collect_metadata(struct jffs2_sb_info *c, struct jffs2_
        ri.node_crc = cpu_to_je32(crc32(0, &ri, sizeof(ri)-8));
        ri.data_crc = cpu_to_je32(crc32(0, mdata, mdatalen));
-        new_fn = jffs2_write_dnode(c, f, &ri, mdata, mdatalen, phys_ofs, ALLOC_GC);
+        new_fn = jffs2_write_dnode(c, f, &ri, mdata, mdatalen, ALLOC_GC);
        if (IS_ERR(new_fn)) {
                printk(KERN_WARNING "Error writing new dnode: %ld\n", PTR_ERR(new_fn));
@@ -765,7 +768,7 @@ static int jffs2_garbage_collect_dirent(struct jffs2_sb_info *c, struct jffs2_er
 {
        struct jffs2_full_dirent *new_fd;
        struct jffs2_raw_dirent rd;
-        uint32_t alloclen, phys_ofs;
+        uint32_t alloclen;
        int ret;
        rd.magic = cpu_to_je16(JFFS2_MAGIC_BITMASK);
@@ -787,14 +790,14 @@ static int jffs2_garbage_collect_dirent(struct jffs2_sb_info *c, struct jffs2_er
        rd.node_crc = cpu_to_je32(crc32(0, &rd, sizeof(rd)-8));
        rd.name_crc = cpu_to_je32(crc32(0, fd->name, rd.nsize));
-        ret = jffs2_reserve_space_gc(c, sizeof(rd)+rd.nsize, &phys_ofs, &alloclen,
+        ret = jffs2_reserve_space_gc(c, sizeof(rd)+rd.nsize, &alloclen,
                                JFFS2_SUMMARY_DIRENT_SIZE(rd.nsize));
        if (ret) {
                printk(KERN_WARNING "jffs2_reserve_space_gc of %zd bytes for garbage_collect_dirent failed: %d\n",
                       sizeof(rd)+rd.nsize, ret);
                return ret;
        }
-        new_fd = jffs2_write_dirent(c, f, &rd, fd->name, rd.nsize, phys_ofs, ALLOC_GC);
+        new_fd = jffs2_write_dirent(c, f, &rd, fd->name, rd.nsize, ALLOC_GC);
        if (IS_ERR(new_fd)) {
                printk(KERN_WARNING "jffs2_write_dirent in garbage_collect_dirent failed: %ld\n", PTR_ERR(new_fd));
@@ -922,7 +925,7 @@ static int jffs2_garbage_collect_hole(struct jffs2_sb_info *c, struct jffs2_eras
        struct jffs2_raw_inode ri;
        struct jffs2_node_frag *frag;
        struct jffs2_full_dnode *new_fn;
-        uint32_t alloclen, phys_ofs, ilen;
+        uint32_t alloclen, ilen;
        int ret;
        D1(printk(KERN_DEBUG "Writing replacement hole node for ino #%u from offset 0x%x to 0x%x\n",
@@ -1001,14 +1004,14 @@ static int jffs2_garbage_collect_hole(struct jffs2_sb_info *c, struct jffs2_eras
        ri.data_crc = cpu_to_je32(0);
        ri.node_crc = cpu_to_je32(crc32(0, &ri, sizeof(ri)-8));
-        ret = jffs2_reserve_space_gc(c, sizeof(ri), &phys_ofs, &alloclen,
+        ret = jffs2_reserve_space_gc(c, sizeof(ri), &alloclen,
-                                JFFS2_SUMMARY_INODE_SIZE);
+                                     JFFS2_SUMMARY_INODE_SIZE);
        if (ret) {
                printk(KERN_WARNING "jffs2_reserve_space_gc of %zd bytes for garbage_collect_hole failed: %d\n",
                       sizeof(ri), ret);
                return ret;
        }
-        new_fn = jffs2_write_dnode(c, f, &ri, NULL, 0, phys_ofs, ALLOC_GC);
+        new_fn = jffs2_write_dnode(c, f, &ri, NULL, 0, ALLOC_GC);
        if (IS_ERR(new_fn)) {
                printk(KERN_WARNING "Error writing new hole node: %ld\n", PTR_ERR(new_fn));
@@ -1070,7 +1073,7 @@ static int jffs2_garbage_collect_dnode(struct jffs2_sb_info *c, struct jffs2_era
 {
        struct jffs2_full_dnode *new_fn;
        struct jffs2_raw_inode ri;
-        uint32_t alloclen, phys_ofs, offset, orig_end, orig_start;
+        uint32_t alloclen, offset, orig_end, orig_start;
        int ret = 0;
        unsigned char *comprbuf = NULL, *writebuf;
        unsigned long pg;
@@ -1227,7 +1230,7 @@ static int jffs2_garbage_collect_dnode(struct jffs2_sb_info *c, struct jffs2_era
                uint32_t cdatalen;
                uint16_t comprtype = JFFS2_COMPR_NONE;
-                ret = jffs2_reserve_space_gc(c, sizeof(ri) + JFFS2_MIN_DATA_LEN, &phys_ofs,
+                ret = jffs2_reserve_space_gc(c, sizeof(ri) + JFFS2_MIN_DATA_LEN,
                                        &alloclen, JFFS2_SUMMARY_INODE_SIZE);
                if (ret) {
@@ -1264,7 +1267,7 @@ static int jffs2_garbage_collect_dnode(struct jffs2_sb_info *c, struct jffs2_era
                ri.node_crc = cpu_to_je32(crc32(0, &ri, sizeof(ri)-8));
                ri.data_crc = cpu_to_je32(crc32(0, comprbuf, cdatalen));
-                new_fn = jffs2_write_dnode(c, f, &ri, comprbuf, cdatalen, phys_ofs, ALLOC_GC);
+                new_fn = jffs2_write_dnode(c, f, &ri, comprbuf, cdatalen, ALLOC_GC);
                jffs2_free_comprbuf(comprbuf, writebuf);
diff --git a/fs/jffs2/histo.h b/fs/jffs2/histo.h
deleted file mode 100644
index 22a93a08210c..000000000000
--- a/fs/jffs2/histo.h
+++ /dev/null
@@ -1,3 +0,0 @@
-/* This file provides the bit-probabilities for the input file */
-#define BIT_DIVIDER 629
-static int bits[9] = { 179,167,183,165,159,198,178,119,}; /* ia32 .so files */
diff --git a/fs/jffs2/jffs2_fs_i.h b/fs/jffs2/jffs2_fs_i.h
new file mode 100644
index 000000000000..2e0cc8e00b85
--- /dev/null
+++ b/fs/jffs2/jffs2_fs_i.h
@@ -0,0 +1,55 @@
+/* $Id: jffs2_fs_i.h,v 1.19 2005/11/07 11:14:52 gleixner Exp $ */
+#ifndef _JFFS2_FS_I
+#define _JFFS2_FS_I
+#include <linux/version.h>
+#include <linux/rbtree.h>
+#include <linux/posix_acl.h>
+#include <asm/semaphore.h>
+/* In-core, per-inode JFFS2 state. */
+struct jffs2_inode_info {
+        /* We need an internal mutex similar to inode->i_mutex.
+           Unfortunately, we can't use the existing one, because
+           either the GC would deadlock, or we'd have to release it
+           before letting GC proceed. Or we'd have to put ugliness
+           into the GC code so it didn't attempt to obtain the i_mutex
+           for the inode(s) which are already locked */
+        struct semaphore sem;
+        /* The highest (datanode) version number used for this ino */
+        uint32_t highest_version;
+        /* List of data fragments which make up the file (kept as an rbtree) */
+        struct rb_root fragtree;
+        /* There may be one datanode which isn't referenced by any of the
+           above fragments, if it contains a metadata update but no actual
+           data - or if this is a directory inode */
+        /* This also holds the _only_ dnode for symlinks/device nodes,
+           etc. */
+        struct jffs2_full_dnode *metadata;
+        /* Directory entries */
+        struct jffs2_full_dirent *dents;
+        /* The target path if this is the inode of a symlink */
+        unsigned char *target;
+        /* Some stuff we just have to keep in-core at all times, for each inode. */
+        struct jffs2_inode_cache *inocache;
+        /* NOTE(review): meaning of the flags bits and of usercompr is not
+           visible in this header - confirm against their users */
+        uint16_t flags;
+        uint8_t usercompr;
+        /* The VFS inode is embedded here, except on eCos builds and on
+           kernels no newer than 2.5.2 */
+#if !defined (__ECOS)
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,5,2)
+        struct inode vfs_inode;
+#endif
+#endif
+        /* Only present when POSIX ACL support is configured; presumably the
+           cached access and default ACLs - confirm against acl.c */
+#ifdef CONFIG_JFFS2_FS_POSIX_ACL
+        struct posix_acl *i_acl_access;
+        struct posix_acl *i_acl_default;
+#endif
+};
+#endif /* _JFFS2_FS_I */
diff --git a/fs/jffs2/jffs2_fs_sb.h b/fs/jffs2/jffs2_fs_sb.h
new file mode 100644
index 000000000000..b98594992eed
--- /dev/null
+++ b/fs/jffs2/jffs2_fs_sb.h
@@ -0,0 +1,136 @@
+/* $Id: jffs2_fs_sb.h,v 1.54 2005/09/21 13:37:34 dedekind Exp $ */
+#ifndef _JFFS2_FS_SB
+#define _JFFS2_FS_SB
+#include <linux/types.h>
+#include <linux/spinlock.h>
+#include <linux/workqueue.h>
+#include <linux/completion.h>
+#include <asm/semaphore.h>
+#include <linux/timer.h>
+#include <linux/wait.h>
+#include <linux/list.h>
+#include <linux/rwsem.h>
+#define JFFS2_SB_FLAG_RO 1
+#define JFFS2_SB_FLAG_SCANNING 2 /* Flash scanning is in progress */
+#define JFFS2_SB_FLAG_BUILDING 4 /* File system building is in progress */
+struct jffs2_inodirty;
+/* A struct for the overall file system control.  Pointers to
+   jffs2_sb_info structs are named `c' in the source code.
+   Nee jffs_control
+*/
+struct jffs2_sb_info {
+        struct mtd_info *mtd;
+        uint32_t highest_ino;
+        uint32_t checked_ino;
+        /* NOTE(review): presumably a mask of the JFFS2_SB_FLAG_* bits - confirm */
+        unsigned int flags;
+        struct task_struct *gc_task;    /* GC task struct */
+        struct completion gc_thread_start; /* GC thread start completion */
+        struct completion gc_thread_exit; /* GC thread exit completion port */
+        struct semaphore alloc_sem;     /* Used to protect all the following
+                                           fields, and also to protect against
+                                           out-of-order writing of nodes. And GC. */
+        uint32_t cleanmarker_size;      /* Size of an _inline_ CLEANMARKER
+                                         (i.e. zero for OOB CLEANMARKER) */
+        uint32_t flash_size;
+        uint32_t used_size;
+        uint32_t dirty_size;
+        uint32_t wasted_size;
+        uint32_t free_size;
+        uint32_t erasing_size;
+        uint32_t bad_size;
+        uint32_t sector_size;
+        uint32_t unchecked_size;
+        uint32_t nr_free_blocks;
+        uint32_t nr_erasing_blocks;
+        /* Number of free blocks there must be before we... */
+        uint8_t resv_blocks_write;      /* ... allow a normal filesystem write */
+        uint8_t resv_blocks_deletion;   /* ... allow a normal filesystem deletion */
+        uint8_t resv_blocks_gctrigger;  /* ... wake up the GC thread */
+        uint8_t resv_blocks_gcbad;      /* ... pick a block from the bad_list to GC */
+        uint8_t resv_blocks_gcmerge;    /* ... merge pages when garbage collecting */
+        uint32_t nospc_dirty_size;
+        uint32_t nr_blocks;
+        struct jffs2_eraseblock *blocks;        /* The whole array of blocks. Used for getting blocks
+                                                 * from the offset (blocks[ofs / sector_size]) */
+        struct jffs2_eraseblock *nextblock;     /* The block we're currently filling */
+        struct jffs2_eraseblock *gcblock;       /* The block we're currently garbage-collecting */
+        struct list_head clean_list;            /* Blocks 100% full of clean data */
+        struct list_head very_dirty_list;       /* Blocks with lots of dirty space */
+        struct list_head dirty_list;            /* Blocks with some dirty space */
+        struct list_head erasable_list;         /* Blocks which are completely dirty, and need erasing */
+        struct list_head erasable_pending_wbuf_list;    /* Blocks which need erasing but only after the current wbuf is flushed */
+        struct list_head erasing_list;          /* Blocks which are currently erasing */
+        struct list_head erase_pending_list;    /* Blocks which need erasing now */
+        struct list_head erase_complete_list;   /* Blocks which are erased and need the clean marker written to them */
+        struct list_head free_list;             /* Blocks which are free and ready to be used */
+        struct list_head bad_list;              /* Bad blocks. */
+        struct list_head bad_used_list;         /* Bad blocks with valid data in. */
+        spinlock_t erase_completion_lock;       /* Protect free_list and erasing_list
+                                                   against erase completion handler */
+        wait_queue_head_t erase_wait;           /* For waiting for erases to complete */
+        wait_queue_head_t inocache_wq;
+        struct jffs2_inode_cache **inocache_list;
+        spinlock_t inocache_lock;
+        /* Sem to allow jffs2_garbage_collect_deletion_dirent to
+           drop the erase_completion_lock while it's holding a pointer
+           to an obsoleted node. I don't like this. Alternatives welcomed. */
+        struct semaphore erase_free_sem;
+        uint32_t wbuf_pagesize; /* 0 for NOR and other flashes with no wbuf */
+#ifdef CONFIG_JFFS2_FS_WRITEBUFFER
+        /* Write-behind buffer for NAND flash */
+        unsigned char *wbuf;
+        unsigned char *oobbuf;
+        uint32_t wbuf_ofs;
+        uint32_t wbuf_len;
+        struct jffs2_inodirty *wbuf_inodes;
+        struct rw_semaphore wbuf_sem;   /* Protects the write buffer */
+        /* Information about out-of-band area usage... */
+        struct nand_ecclayout *ecclayout;
+        uint32_t badblock_pos;
+        uint32_t fsdata_pos;
+        uint32_t fsdata_len;
+#endif
+        struct jffs2_summary *summary;          /* Summary information */
+        /* Extended-attribute state; these fields exist only when
+           CONFIG_JFFS2_FS_XATTR is set. XATTRINDEX_HASHSIZE is the number
+           of list heads in xattrindex[]. */
+#ifdef CONFIG_JFFS2_FS_XATTR
+#define XATTRINDEX_HASHSIZE     (57)
+        uint32_t highest_xid;
+        uint32_t highest_xseqno;
+        struct list_head xattrindex[XATTRINDEX_HASHSIZE];
+        struct list_head xattr_unchecked;
+        struct list_head xattr_dead_list;
+        struct jffs2_xattr_ref *xref_dead_list;
+        struct jffs2_xattr_ref *xref_temp;
+        struct rw_semaphore xattr_sem;
+        uint32_t xdatum_mem_usage;
+        uint32_t xdatum_mem_threshold;
+#endif
+        /* OS-private pointer for getting back to master superblock info */
+        void *os_priv;
+};
+#endif /* _JFFS2_FS_SB */
diff --git a/fs/jffs2/malloc.c b/fs/jffs2/malloc.c
index 036cbd11c004..8310c95478e9 100644
--- a/fs/jffs2/malloc.c
+++ b/fs/jffs2/malloc.c
@@ -26,6 +26,10 @@ static kmem_cache_t *tmp_dnode_info_slab;
 static kmem_cache_t *raw_node_ref_slab;
 static kmem_cache_t *node_frag_slab;
 static kmem_cache_t *inode_cache_slab;
+#ifdef CONFIG_JFFS2_FS_XATTR
+static kmem_cache_t *xattr_datum_cache;
+static kmem_cache_t *xattr_ref_cache;
+#endif
 int __init jffs2_create_slab_caches(void)
 {
@@ -53,8 +57,8 @@ int __init jffs2_create_slab_caches(void)
        if (!tmp_dnode_info_slab)
                goto err;
-        raw_node_ref_slab = kmem_cache_create("jffs2_raw_node_ref",
+        raw_node_ref_slab = kmem_cache_create("jffs2_refblock",
-                                              sizeof(struct jffs2_raw_node_ref),
+                                              sizeof(struct jffs2_raw_node_ref) * (REFS_PER_BLOCK + 1),
                                              0, 0, NULL, NULL);
        if (!raw_node_ref_slab)
                goto err;
@@ -68,8 +72,24 @@ int __init jffs2_create_slab_caches(void)
        inode_cache_slab = kmem_cache_create("jffs2_inode_cache",
                                             sizeof(struct jffs2_inode_cache),
                                             0, 0, NULL, NULL);
-        if (inode_cache_slab)
+        if (!inode_cache_slab)
-                return 0;
+                goto err;
+#ifdef CONFIG_JFFS2_FS_XATTR
+        xattr_datum_cache = kmem_cache_create("jffs2_xattr_datum",
+                                             sizeof(struct jffs2_xattr_datum),
+                                             0, 0, NULL, NULL);
+        if (!xattr_datum_cache)
+                goto err;
+        xattr_ref_cache = kmem_cache_create("jffs2_xattr_ref",
+                                           sizeof(struct jffs2_xattr_ref),
+                                           0, 0, NULL, NULL);
+        if (!xattr_ref_cache)
+                goto err;
+#endif
+        return 0;
 err:
        jffs2_destroy_slab_caches();
        return -ENOMEM;
@@ -91,6 +111,12 @@ void jffs2_destroy_slab_caches(void)
                kmem_cache_destroy(node_frag_slab);
        if(inode_cache_slab)
                kmem_cache_destroy(inode_cache_slab);
+#ifdef CONFIG_JFFS2_FS_XATTR
+        if (xattr_datum_cache)
+                kmem_cache_destroy(xattr_datum_cache);
+        if (xattr_ref_cache)
+                kmem_cache_destroy(xattr_ref_cache);
+#endif
 }
 struct jffs2_full_dirent *jffs2_alloc_full_dirent(int namesize)
@@ -164,15 +190,65 @@ void jffs2_free_tmp_dnode_info(struct jffs2_tmp_dnode_info *x)
        kmem_cache_free(tmp_dnode_info_slab, x);
 }
-struct jffs2_raw_node_ref *jffs2_alloc_raw_node_ref(void)
+/* Allocate one refblock from raw_node_ref_slab: an array of
+   REFS_PER_BLOCK + 1 raw node refs. The first REFS_PER_BLOCK slots are
+   marked REF_EMPTY_NODE; the final slot is marked REF_LINK_NODE with a
+   NULL next_in_ino, ready to be pointed at a following refblock.
+   Returns the first slot, or NULL if the slab allocation failed. */
+struct jffs2_raw_node_ref *jffs2_alloc_refblock(void)
 {
        struct jffs2_raw_node_ref *ret;
        ret = kmem_cache_alloc(raw_node_ref_slab, GFP_KERNEL);
-        dbg_memalloc("%p\n", ret);
+        if (ret) {
+                int i = 0;
+                /* Every usable slot starts out unassigned */
+                for (i=0; i < REFS_PER_BLOCK; i++) {
+                        ret[i].flash_offset = REF_EMPTY_NODE;
+                        ret[i].next_in_ino = NULL;
+                }
+                /* i == REFS_PER_BLOCK here: the extra trailing slot is the link */
+                ret[i].flash_offset = REF_LINK_NODE;
+                ret[i].next_in_ino = NULL;
+        }
        return ret;
 }
-void jffs2_free_raw_node_ref(struct jffs2_raw_node_ref *x)
+/* Make sure at least 'nr' unused raw node ref slots are available after
+   jeb->last_node, allocating and chaining additional refblocks where the
+   existing chain runs out. On success jeb->allocated_refs is set to 'nr'
+   and 0 is returned; -ENOMEM is returned if a refblock cannot be
+   allocated. */
+int jffs2_prealloc_raw_node_refs(struct jffs2_sb_info *c,
+                                 struct jffs2_eraseblock *jeb, int nr)
+{
+        /* 'link' is the pointer through which the current slot was reached,
+           so that a freshly allocated refblock can be hooked in there */
+        struct jffs2_raw_node_ref **link = &jeb->last_node;
+        struct jffs2_raw_node_ref *slot = *link;
+        int remaining;
+        dbg_memalloc("%d\n", nr);
+        dbg_memalloc("Reserving %d refs for block @0x%08x\n", nr, jeb->offset);
+        /* last_node may describe a real node already; begin counting after it */
+        if (slot && slot->flash_offset != REF_EMPTY_NODE)
+                slot++;
+        for (remaining = nr; remaining; ) {
+                if (!slot) {
+                        dbg_memalloc("Allocating new refblock linked from %p\n", link);
+                        slot = jffs2_alloc_refblock();
+                        *link = slot;
+                        if (!slot)
+                                return -ENOMEM;
+                }
+                if (slot->flash_offset != REF_LINK_NODE) {
+                        /* An ordinary slot: count it and move on */
+                        remaining--;
+                        slot++;
+                } else {
+                        /* End of this refblock: follow (or prepare to fill) the link */
+                        link = &slot->next_in_ino;
+                        slot = *link;
+                }
+        }
+        jeb->allocated_refs = nr;
+        dbg_memalloc("Reserved %d refs for block @0x%08x, last_node is %p (%08x,%p)\n",
+                  nr, jeb->offset, jeb->last_node, jeb->last_node->flash_offset,
+                  jeb->last_node->next_in_ino);
+        return 0;
+}
+void jffs2_free_refblock(struct jffs2_raw_node_ref *x)
 {
        dbg_memalloc("%p\n", x);
        kmem_cache_free(raw_node_ref_slab, x);
@@ -205,3 +281,42 @@ void jffs2_free_inode_cache(struct jffs2_inode_cache *x)
        dbg_memalloc("%p\n", x);
        kmem_cache_free(inode_cache_slab, x);
 }
+#ifdef CONFIG_JFFS2_FS_XATTR
+/* Allocate a jffs2_xattr_datum from xattr_datum_cache and initialise it:
+   the object is zeroed, tagged RAWNODE_CLASS_XATTR_DATUM, its 'node'
+   pointer is set to the object itself and its xindex list head is
+   initialised. Returns NULL if the slab allocation fails. */
+struct jffs2_xattr_datum *jffs2_alloc_xattr_datum(void)
+{
+        struct jffs2_xattr_datum *xd;
+        xd = kmem_cache_alloc(xattr_datum_cache, GFP_KERNEL);
+        dbg_memalloc("%p\n", xd);
+        /* The allocation may fail; never memset() or dereference NULL */
+        if (!xd)
+                return NULL;
+        memset(xd, 0, sizeof(struct jffs2_xattr_datum));
+        xd->class = RAWNODE_CLASS_XATTR_DATUM;
+        xd->node = (void *)xd;
+        INIT_LIST_HEAD(&xd->xindex);
+        return xd;
+}
+/* Log the pointer via dbg_memalloc() and return 'xd' to xattr_datum_cache. */
+void jffs2_free_xattr_datum(struct jffs2_xattr_datum *xd)
+{
+        dbg_memalloc("%p\n", xd);
+        kmem_cache_free(xattr_datum_cache, xd);
+}
+/* Allocate a jffs2_xattr_ref from xattr_ref_cache and initialise it:
+   the object is zeroed, tagged RAWNODE_CLASS_XATTR_REF and its 'node'
+   pointer is set to the object itself. Returns NULL if the slab
+   allocation fails. */
+struct jffs2_xattr_ref *jffs2_alloc_xattr_ref(void)
+{
+        struct jffs2_xattr_ref *ref;
+        ref = kmem_cache_alloc(xattr_ref_cache, GFP_KERNEL);
+        dbg_memalloc("%p\n", ref);
+        /* The allocation may fail; never memset() or dereference NULL */
+        if (!ref)
+                return NULL;
+        memset(ref, 0, sizeof(struct jffs2_xattr_ref));
+        ref->class = RAWNODE_CLASS_XATTR_REF;
+        ref->node = (void *)ref;
+        return ref;
+}
+/* Log the pointer via dbg_memalloc() and return 'ref' to xattr_ref_cache. */
+void jffs2_free_xattr_ref(struct jffs2_xattr_ref *ref)
+{
+        dbg_memalloc("%p\n", ref);
+        kmem_cache_free(xattr_ref_cache, ref);
+}
+#endif
diff --git a/fs/jffs2/nodelist.c b/fs/jffs2/nodelist.c
index 1d46677afd17..7675b33396c7 100644
--- a/fs/jffs2/nodelist.c
+++ b/fs/jffs2/nodelist.c
@@ -438,8 +438,7 @@ static int check_node_data(struct jffs2_sb_info *c, struct jffs2_tmp_dnode_info
        if (c->mtd->point) {
                err = c->mtd->point(c->mtd, ofs, len, &retlen, &buffer);
                if (!err && retlen < tn->csize) {
-                        JFFS2_WARNING("MTD point returned len too short: %zu "
+                        JFFS2_WARNING("MTD point returned len too short: %zu instead of %u.\n", retlen, tn->csize);
-                                        "instead of %u.\n", retlen, tn->csize);
                        c->mtd->unpoint(c->mtd, buffer, ofs, len);
                } else if (err)
                        JFFS2_WARNING("MTD point failed: error code %d.\n", err);
@@ -462,8 +461,7 @@ static int check_node_data(struct jffs2_sb_info *c, struct jffs2_tmp_dnode_info
                }
                if (retlen != len) {
-                        JFFS2_ERROR("short read at %#08x: %zd instead of %d.\n",
+                        JFFS2_ERROR("short read at %#08x: %zd instead of %d.\n", ofs, retlen, len);
-                                        ofs, retlen, len);
                        err = -EIO;
                        goto free_out;
                }
@@ -908,6 +906,9 @@ void jffs2_del_ino_cache(struct jffs2_sb_info *c, struct jffs2_inode_cache *old)
 {
        struct jffs2_inode_cache **prev;
+#ifdef CONFIG_JFFS2_FS_XATTR
+        BUG_ON(old->xref);
+#endif
        dbg_inocache("del %p (ino #%u)\n", old, old->ino);
        spin_lock(&c->inocache_lock);
@@ -940,6 +941,7 @@ void jffs2_free_ino_caches(struct jffs2_sb_info *c)
                this = c->inocache_list[i];
                while (this) {
                        next = this->next;
+                        jffs2_xattr_free_inode(c, this);
                        jffs2_free_inode_cache(this);
                        this = next;
                }
@@ -954,9 +956,13 @@ void jffs2_free_raw_node_refs(struct jffs2_sb_info *c)
        for (i=0; i<c->nr_blocks; i++) {
                this = c->blocks[i].first_node;
-                while(this) {
+                while (this) {
-                        next = this->next_phys;
+                        if (this[REFS_PER_BLOCK].flash_offset == REF_LINK_NODE)
-                        jffs2_free_raw_node_ref(this);
+                                next = this[REFS_PER_BLOCK].next_in_ino;
+                        else
+                                next = NULL;
+                        jffs2_free_refblock(this);
                        this = next;
                }
                c->blocks[i].first_node = c->blocks[i].last_node = NULL;
@@ -1047,3 +1053,169 @@ void jffs2_kill_fragtree(struct rb_root *root, struct jffs2_sb_info *c)
                cond_resched();
        }
 }
+/* Record a node of 'len' bytes at 'ofs' in eraseblock 'jeb', using the
+   next unused ref slot after jeb->last_node.
+   'ofs' is stored as the ref's flash_offset; its flag bits (as read back
+   by ref_flags()) select which size counters receive 'len'.
+   If 'ic' is non-NULL the ref is pushed onto the front of ic->nodes,
+   otherwise its next_in_ino is set to NULL.
+   BUGs if no ref has been reserved (jeb->allocated_refs is zero) or if
+   the node does not start where the block's used space currently ends.
+   Takes no locks itself. Returns the ref that was filled in. */
+struct jffs2_raw_node_ref *jffs2_link_node_ref(struct jffs2_sb_info *c,
+                                               struct jffs2_eraseblock *jeb,
+                                               uint32_t ofs, uint32_t len,
+                                               struct jffs2_inode_cache *ic)
+{
+        struct jffs2_raw_node_ref *ref;
+        /* Consume one of the slots reserved beforehand */
+        BUG_ON(!jeb->allocated_refs);
+        jeb->allocated_refs--;
+        ref = jeb->last_node;
+        dbg_noderef("Last node at %p is (%08x,%p)\n", ref, ref->flash_offset,
+                    ref->next_in_ino);
+        /* Walk forward to the first empty slot, hopping to the next
+           refblock whenever a link slot is reached */
+        while (ref->flash_offset != REF_EMPTY_NODE) {
+                if (ref->flash_offset == REF_LINK_NODE)
+                        ref = ref->next_in_ino;
+                else
+                        ref++;
+        }
+        dbg_noderef("New ref is %p (%08x becomes %08x,%p) len 0x%x\n", ref, 
+                    ref->flash_offset, ofs, ref->next_in_ino, len);
+        ref->flash_offset = ofs;
+        if (!jeb->first_node) {
+                /* First node in the block: it must sit at the block start */
+                jeb->first_node = ref;
+                BUG_ON(ref_offset(ref) != jeb->offset);
+        } else if (unlikely(ref_offset(ref) != jeb->offset + c->sector_size - jeb->free_size)) {
+                /* Not contiguous with the space already accounted for.
+                   jeb->last_node still refers to the previous node here. */
+                uint32_t last_len = ref_totlen(c, jeb, jeb->last_node);
+                JFFS2_ERROR("Adding new ref %p at (0x%08x-0x%08x) not immediately after previous (0x%08x-0x%08x)\n",
+                            ref, ref_offset(ref), ref_offset(ref)+len,
+                            ref_offset(jeb->last_node), 
+                            ref_offset(jeb->last_node)+last_len);
+                BUG();
+        }
+        jeb->last_node = ref;
+        if (ic) {
+                ref->next_in_ino = ic->nodes;
+                ic->nodes = ref;
+        } else {
+                ref->next_in_ino = NULL;
+        }
+        /* Charge 'len' to the counter matching the node's state, both
+           filesystem-wide and for this block */
+        switch(ref_flags(ref)) {
+        case REF_UNCHECKED:
+                c->unchecked_size += len;
+                jeb->unchecked_size += len;
+                break;
+        case REF_NORMAL:
+        case REF_PRISTINE:
+                c->used_size += len;
+                jeb->used_size += len;
+                break;
+        case REF_OBSOLETE:
+                c->dirty_size += len;
+                jeb->dirty_size += len;
+                break;
+        }
+        c->free_size -= len;
+        jeb->free_size -= len;
+#ifdef TEST_TOTLEN
+        /* Set (and test) __totlen field... for now */
+        /* Done after free_size is updated: ref_totlen() is called on the
+           new last node once the free space has been reduced */
+        ref->__totlen = len;
+        ref_totlen(c, jeb, ref);
+#endif
+        return ref;
+}
+/* Account 'size' bytes at the current end of used space in 'jeb' as dirty.
+   If the block's last node is already obsolete the space is folded into
+   it; otherwise a new obsolete ref is linked for it. A zero 'size' is a
+   no-op. Always returns 0.
+   No locking, no reservation of 'ref'. Do not use on a live file system */
+int jffs2_scan_dirty_space(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb,
+                           uint32_t size)
+{
+        struct jffs2_raw_node_ref *tail;
+        if (size == 0)
+                return 0;
+        if (unlikely(jeb->free_size < size)) {
+                printk(KERN_CRIT "Dirty space 0x%x larger then free_size 0x%x (wasted 0x%x)\n",
+                       size, jeb->free_size, jeb->wasted_size);
+                BUG();
+        }
+        tail = jeb->last_node;
+        /* REF_EMPTY_NODE is !obsolete, so an unused last_node slot also
+           takes the "link a new ref" path */
+        if (!tail || !ref_obsolete(tail)) {
+                uint32_t dirty_ofs;
+                dirty_ofs = (jeb->offset + c->sector_size - jeb->free_size) | REF_OBSOLETE;
+                jffs2_link_node_ref(c, jeb, dirty_ofs, size, NULL);
+                return 0;
+        }
+        /* Last node is already obsolete: just grow it */
+#ifdef TEST_TOTLEN
+        tail->__totlen += size;
+#endif
+        c->dirty_size += size;
+        jeb->dirty_size += size;
+        c->free_size -= size;
+        jeb->free_size -= size;
+        return 0;
+}
+/* Work out how many bytes 'ref' occupies: up to the start of the next ref
+   if there is one, otherwise up to the end of the used space in its
+   eraseblock. 'jeb' may be NULL, in which case the block is looked up
+   from the ref's flash offset. */
+static inline uint32_t __ref_totlen(struct jffs2_sb_info *c,
+                                    struct jffs2_eraseblock *jeb,
+                                    struct jffs2_raw_node_ref *ref)
+{
+        struct jffs2_raw_node_ref *following = ref_next(ref);
+        /* Usual case: this node ends where its successor starts */
+        if (following)
+                return ref_offset(following) - ref_offset(ref);
+        if (!jeb)
+                jeb = &c->blocks[ref->flash_offset / c->sector_size];
+        /* No successor, so this has to be the last node in the block */
+        if (unlikely(ref != jeb->last_node)) {
+                printk(KERN_CRIT "ref %p @0x%08x is not jeb->last_node (%p @0x%08x)\n",
+                       ref, ref_offset(ref), jeb->last_node, jeb->last_node?ref_offset(jeb->last_node):0);
+                BUG();
+        }
+        /* Last node in block: it runs up to the start of the free space */
+        return (jeb->offset + c->sector_size - jeb->free_size) - ref_offset(ref);
+}
+/* Return the total length of the node described by 'ref', as calculated
+   by __ref_totlen() from the neighbouring ref or the eraseblock's free
+   space. 'jeb' may be NULL.
+   When TEST_TOTLEN is defined the result is cross-checked against the
+   stored ref->__totlen; on a mismatch diagnostics are printed, a warning
+   is raised and the stored value is returned instead. */
+uint32_t __jffs2_ref_totlen(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb,
+                            struct jffs2_raw_node_ref *ref)
+{
+        uint32_t ret;
+        ret = __ref_totlen(c, jeb, ref);
+#ifdef TEST_TOTLEN
+        if (unlikely(ret != ref->__totlen)) {
+                struct jffs2_raw_node_ref *next = ref_next(ref);
+                if (!jeb)
+                        jeb = &c->blocks[ref->flash_offset / c->sector_size];
+                printk(KERN_CRIT "Totlen for ref at %p (0x%08x-0x%08x) miscalculated as 0x%x instead of %x\n",
+                       ref, ref_offset(ref), ref_offset(ref)+ref->__totlen,
+                       ret, ref->__totlen);
+                if (next) {
+                        /* Report the successor's own extent, so use its
+                           stored length rather than that of 'ref' */
+                        printk(KERN_CRIT "next %p (0x%08x-0x%08x)\n", next, ref_offset(next),
+                               ref_offset(next)+next->__totlen);
+                } else 
+                        printk(KERN_CRIT "No next ref. jeb->last_node is %p\n", jeb->last_node);
+                printk(KERN_CRIT "jeb->wasted_size %x, dirty_size %x, used_size %x, free_size %x\n", jeb->wasted_size, jeb->dirty_size, jeb->used_size, jeb->free_size);
+#if defined(JFFS2_DBG_DUMPS) || defined(JFFS2_DBG_PARANOIA_CHECKS)
+                __jffs2_dbg_dump_node_refs_nolock(c, jeb);
+#endif
+                WARN_ON(1);
+                ret = ref->__totlen;
+        }
+#endif /* TEST_TOTLEN */
+        return ret;
+}
diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h
index 23a67bb3052f..f752baa8d399 100644
--- a/fs/jffs2/nodelist.h
+++ b/fs/jffs2/nodelist.h
@@ -14,12 +14,13 @@
 #ifndef __JFFS2_NODELIST_H__
 #define __JFFS2_NODELIST_H__
-#include <linux/config.h>
 #include <linux/fs.h>
 #include <linux/types.h>
 #include <linux/jffs2.h>
-#include <linux/jffs2_fs_sb.h>
+#include "jffs2_fs_sb.h"
-#include <linux/jffs2_fs_i.h>
+#include "jffs2_fs_i.h"
+#include "xattr.h"
+#include "acl.h"
 #include "summary.h"
 #ifdef __ECOS
@@ -75,14 +76,50 @@
 struct jffs2_raw_node_ref
 {
        struct jffs2_raw_node_ref *next_in_ino; /* Points to the next raw_node_ref
-                for this inode. If this is the last, it points to the inode_cache
+                for this object. If this _is_ the last, it points to the inode_cache,
-                for this inode instead. The inode_cache will have NULL in the first
+                xattr_ref or xattr_datum instead. The common part of those structures
-                word so you know when you've got there :) */
+                has NULL in the first word. See jffs2_raw_ref_to_ic() below */
-        struct jffs2_raw_node_ref *next_phys;
        uint32_t flash_offset;
+        /* NOTE(review): TEST_TOTLEN is defined unconditionally right here, so
+           __totlen is always part of the structure and the cross-check in
+           __jffs2_ref_totlen() is always compiled in. */
+#define TEST_TOTLEN
+#ifdef TEST_TOTLEN
        uint32_t __totlen; /* This may die; use ref_totlen(c, jeb, ) below */
+#endif
 };
+/* Sentinel values for flash_offset. ref_next() follows next_in_ino when it
+   meets REF_LINK_NODE and reports the end of the chain on REF_EMPTY_NODE. */
+#define REF_LINK_NODE ((int32_t)-1)
+#define REF_EMPTY_NODE ((int32_t)-2)
+/* Use blocks of about 256 bytes */
+#define REFS_PER_BLOCK ((255/sizeof(struct jffs2_raw_node_ref))-1)
+/*
+ * Step to the ref that follows @ref in its block of refs. A REF_LINK_NODE
+ * entry redirects through next_in_ino to another block of refs; a NULL
+ * link or a REF_EMPTY_NODE entry means there is no next ref, and NULL is
+ * returned.
+ */
+static inline struct jffs2_raw_node_ref *ref_next(struct jffs2_raw_node_ref *ref)
+{
+        struct jffs2_raw_node_ref *next = ref + 1;
+        if (next->flash_offset == REF_LINK_NODE) {
+                /* Hop over to the chained block of refs, if there is one */
+                next = next->next_in_ino;
+                if (!next)
+                        return NULL;
+        }
+        /* An empty slot marks the end of the chain */
+        return (next->flash_offset == REF_EMPTY_NODE) ? NULL : next;
+}
+/*
+ * Follow the next_in_ino chain starting at @raw until reaching the entry
+ * whose next_in_ino is NULL, and return that entry cast to a
+ * struct jffs2_inode_cache pointer.
+ * NB. This can be a jffs2_xattr_datum or jffs2_xattr_ref and
+ * not actually a jffs2_inode_cache. Check ->class
+ */
+static inline struct jffs2_inode_cache *jffs2_raw_ref_to_ic(struct jffs2_raw_node_ref *raw)
+{
+        struct jffs2_raw_node_ref *cur;
+        for (cur = raw; cur->next_in_ino; cur = cur->next_in_ino)
+                ;
+        return (struct jffs2_inode_cache *)cur;
+}
        /* flash_offset & 3 always has to be zero, because nodes are
           always aligned at 4 bytes. So we have a couple of extra bits
           to play with, which indicate the node's status; see below: */
@@ -95,6 +132,11 @@ struct jffs2_raw_node_ref
 #define ref_obsolete(ref)       (((ref)->flash_offset & 3) == REF_OBSOLETE)
 #define mark_ref_normal(ref)    do { (ref)->flash_offset = ref_offset(ref) | REF_NORMAL; } while(0)
+/* NB: REF_PRISTINE for an inode-less node (ref->next_in_ino == NULL) indicates
+   it is an unknown node of type JFFS2_NODETYPE_RWCOMPAT_COPY, so it'll get
+   copied. If you need to do anything different to GC inode-less nodes, then
+   you need to modify gc.c accordingly. */
 /* For each inode in the filesystem, we need to keep a record of
   nlink, because it would be a PITA to scan the whole directory tree
   at read_inode() time to calculate it, and to keep sufficient information
@@ -103,15 +145,27 @@ struct jffs2_raw_node_ref
   a pointer to the first physical node which is part of this inode, too.
 */
 struct jffs2_inode_cache {
+        /* First part of structure is shared with other objects which
+           can terminate the raw node refs' next_in_ino list -- which
+           are currently struct jffs2_xattr_datum and struct jffs2_xattr_ref. */
        struct jffs2_full_dirent *scan_dents; /* Used during scan to hold
                temporary lists of dirents, and later must be set to
                NULL to mark the end of the raw_node_ref->next_in_ino
                chain. */
-        struct jffs2_inode_cache *next;
        struct jffs2_raw_node_ref *nodes;
+        uint8_t class;  /* One of RAWNODE_CLASS_*; identifies the kind of object */
+        /* end of shared structure */
+        uint8_t flags;  /* presumably INO_FLAGS_* bits -- TODO confirm */
+        uint16_t state; /* One of INO_STATE_* */
        uint32_t ino;
+        struct jffs2_inode_cache *next;
+#ifdef CONFIG_JFFS2_FS_XATTR
+        struct jffs2_xattr_ref *xref;
+#endif
        int nlink;
-        int state;
 };
 /* Inode states for 'state' above. We need the 'GC' state to prevent
@@ -125,8 +179,16 @@ struct jffs2_inode_cache {
 #define INO_STATE_READING       5       /* In read_inode() */
 #define INO_STATE_CLEARING      6       /* In clear_inode() */
+#define INO_FLAGS_XATTR_CHECKED 0x01    /* has no duplicate xattr_ref */
+#define RAWNODE_CLASS_INODE_CACHE       0
+#define RAWNODE_CLASS_XATTR_DATUM       1
+#define RAWNODE_CLASS_XATTR_REF         2
 #define INOCACHE_HASHSIZE 128
+/* Flash offset of the first free space in c->nextblock. Note that the
+   macro evaluates (c) several times and dereferences (c)->nextblock. */
+#define write_ofs(c) ((c)->nextblock->offset + (c)->sector_size - (c)->nextblock->free_size)
 /*
  Larger representation of a raw node, kept in-core only when the
  struct inode for this particular ino is instantiated.
@@ -192,6 +254,7 @@ struct jffs2_eraseblock
        uint32_t wasted_size;
        uint32_t free_size;     /* Note that sector_size - free_size
                                   is the address of the first free space */
+        uint32_t allocated_refs;
        struct jffs2_raw_node_ref *first_node;
        struct jffs2_raw_node_ref *last_node;
@@ -203,57 +266,7 @@ static inline int jffs2_blocks_use_vmalloc(struct jffs2_sb_info *c)
        return ((c->flash_size / c->sector_size) * sizeof (struct jffs2_eraseblock)) > (128 * 1024);
 }
-/* Calculate totlen from surrounding nodes or eraseblock */
+#define ref_totlen(a, b, c) __jffs2_ref_totlen((a), (b), (c))
-static inline uint32_t __ref_totlen(struct jffs2_sb_info *c,
-                                    struct jffs2_eraseblock *jeb,
-                                    struct jffs2_raw_node_ref *ref)
-{
-        uint32_t ref_end;
-        if (ref->next_phys)
-                ref_end = ref_offset(ref->next_phys);
-        else {
-                if (!jeb)
-                        jeb = &c->blocks[ref->flash_offset / c->sector_size];
-                /* Last node in block. Use free_space */
-                BUG_ON(ref != jeb->last_node);
-                ref_end = jeb->offset + c->sector_size - jeb->free_size;
-        }
-        return ref_end - ref_offset(ref);
-}
-static inline uint32_t ref_totlen(struct jffs2_sb_info *c,
-                                  struct jffs2_eraseblock *jeb,
-                                  struct jffs2_raw_node_ref *ref)
-{
-        uint32_t ret;
-#if CONFIG_JFFS2_FS_DEBUG > 0
-        if (jeb && jeb != &c->blocks[ref->flash_offset / c->sector_size]) {
-                printk(KERN_CRIT "ref_totlen called with wrong block -- at 0x%08x instead of 0x%08x; ref 0x%08x\n",
-                       jeb->offset, c->blocks[ref->flash_offset / c->sector_size].offset, ref_offset(ref));
-                BUG();
-        }
-#endif
-#if 1
-        ret = ref->__totlen;
-#else
-        /* This doesn't actually work yet */
-        ret = __ref_totlen(c, jeb, ref);
-        if (ret != ref->__totlen) {
-                printk(KERN_CRIT "Totlen for ref at %p (0x%08x-0x%08x) miscalculated as 0x%x instead of %x\n",
-                       ref, ref_offset(ref), ref_offset(ref)+ref->__totlen,
-                       ret, ref->__totlen);
-                if (!jeb)
-                        jeb = &c->blocks[ref->flash_offset / c->sector_size];
-                jffs2_dbg_dump_node_refs_nolock(c, jeb);
-                BUG();
-        }
-#endif
-        return ret;
-}
 #define ALLOC_NORMAL    0       /* Normal allocation */
 #define ALLOC_DELETION  1       /* Deletion node. Best to allow it */
@@ -268,13 +281,15 @@ static inline uint32_t ref_totlen(struct jffs2_sb_info *c,
 #define PAD(x) (((x)+3)&~3)
-static inline struct jffs2_inode_cache *jffs2_raw_ref_to_ic(struct jffs2_raw_node_ref *raw)
+/* Encode @rdev into @jdev: the 16-bit old format when old_valid_dev()
+   accepts it, the 32-bit new format otherwise. Returns the size in bytes
+   of the member of @jdev that was filled in. */
+static inline int jffs2_encode_dev(union jffs2_device_node *jdev, dev_t rdev)
 {
-        while(raw->next_in_ino) {
+        if (old_valid_dev(rdev)) {
-                raw = raw->next_in_ino;
+                jdev->old = cpu_to_je16(old_encode_dev(rdev));
+                return sizeof(jdev->old);
+        } else {
+                jdev->new = cpu_to_je32(new_encode_dev(rdev));
+                return sizeof(jdev->new);
        }
-        return ((struct jffs2_inode_cache *)raw);
 }
 static inline struct jffs2_node_frag *frag_first(struct rb_root *root)
@@ -299,7 +314,6 @@ static inline struct jffs2_node_frag *frag_last(struct rb_root *root)
        return rb_entry(node, struct jffs2_node_frag, rb);
 }
-#define rb_parent(rb) ((rb)->rb_parent)
 #define frag_next(frag) rb_entry(rb_next(&(frag)->rb), struct jffs2_node_frag, rb)
 #define frag_prev(frag) rb_entry(rb_prev(&(frag)->rb), struct jffs2_node_frag, rb)
 #define frag_parent(frag) rb_entry(rb_parent(&(frag)->rb), struct jffs2_node_frag, rb)
@@ -324,28 +338,44 @@ void jffs2_obsolete_node_frag(struct jffs2_sb_info *c, struct jffs2_node_frag *t
 int jffs2_add_full_dnode_to_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f, struct jffs2_full_dnode *fn);
 void jffs2_truncate_fragtree (struct jffs2_sb_info *c, struct rb_root *list, uint32_t size);
 int jffs2_add_older_frag_to_fragtree(struct jffs2_sb_info *c, struct jffs2_inode_info *f, struct jffs2_tmp_dnode_info *tn);
+struct jffs2_raw_node_ref *jffs2_link_node_ref(struct jffs2_sb_info *c,
+                                               struct jffs2_eraseblock *jeb,
+                                               uint32_t ofs, uint32_t len,
+                                               struct jffs2_inode_cache *ic);
+extern uint32_t __jffs2_ref_totlen(struct jffs2_sb_info *c,
+                                   struct jffs2_eraseblock *jeb,
+                                   struct jffs2_raw_node_ref *ref);
 /* nodemgmt.c */
 int jffs2_thread_should_wake(struct jffs2_sb_info *c);
-int jffs2_reserve_space(struct jffs2_sb_info *c, uint32_t minsize, uint32_t *ofs,
+int jffs2_reserve_space(struct jffs2_sb_info *c, uint32_t minsize,
                        uint32_t *len, int prio, uint32_t sumsize);
-int jffs2_reserve_space_gc(struct jffs2_sb_info *c, uint32_t minsize, uint32_t *ofs,
+int jffs2_reserve_space_gc(struct jffs2_sb_info *c, uint32_t minsize,
                        uint32_t *len, uint32_t sumsize);
-int jffs2_add_physical_node_ref(struct jffs2_sb_info *c, struct jffs2_raw_node_ref *new);
+struct jffs2_raw_node_ref *jffs2_add_physical_node_ref(struct jffs2_sb_info *c, 
+                                                       uint32_t ofs, uint32_t len,
+                                                       struct jffs2_inode_cache *ic);
 void jffs2_complete_reservation(struct jffs2_sb_info *c);
 void jffs2_mark_node_obsolete(struct jffs2_sb_info *c, struct jffs2_raw_node_ref *raw);
 /* write.c */
 int jffs2_do_new_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f, uint32_t mode, struct jffs2_raw_inode *ri);
-struct jffs2_full_dnode *jffs2_write_dnode(struct jffs2_sb_info *c, struct jffs2_inode_info *f, struct jffs2_raw_inode *ri, const unsigned char *data, uint32_t datalen, uint32_t flash_ofs, int alloc_mode);
+struct jffs2_full_dnode *jffs2_write_dnode(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
-struct jffs2_full_dirent *jffs2_write_dirent(struct jffs2_sb_info *c, struct jffs2_inode_info *f, struct jffs2_raw_dirent *rd, const unsigned char *name, uint32_t namelen, uint32_t flash_ofs, int alloc_mode);
+                                           struct jffs2_raw_inode *ri, const unsigned char *data,
+                                           uint32_t datalen, int alloc_mode);
+struct jffs2_full_dirent *jffs2_write_dirent(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
+                                             struct jffs2_raw_dirent *rd, const unsigned char *name,
+                                             uint32_t namelen, int alloc_mode);
 int jffs2_write_inode_range(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
                            struct jffs2_raw_inode *ri, unsigned char *buf,
                            uint32_t offset, uint32_t writelen, uint32_t *retlen);
-int jffs2_do_create(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, struct jffs2_inode_info *f, struct jffs2_raw_inode *ri, const char *name, int namelen);
+int jffs2_do_create(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, struct jffs2_inode_info *f,
-int jffs2_do_unlink(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, const char *name, int namelen, struct jffs2_inode_info *dead_f, uint32_t time);
+                    struct jffs2_raw_inode *ri, const char *name, int namelen);
-int jffs2_do_link (struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, uint32_t ino, uint8_t type, const char *name, int namelen, uint32_t time);
+int jffs2_do_unlink(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, const char *name,
+                    int namelen, struct jffs2_inode_info *dead_f, uint32_t time);
+int jffs2_do_link(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, uint32_t ino,
+                   uint8_t type, const char *name, int namelen, uint32_t time);
 /* readinode.c */
@@ -368,12 +398,19 @@ struct jffs2_raw_inode *jffs2_alloc_raw_inode(void);
 void jffs2_free_raw_inode(struct jffs2_raw_inode *);
 struct jffs2_tmp_dnode_info *jffs2_alloc_tmp_dnode_info(void);
 void jffs2_free_tmp_dnode_info(struct jffs2_tmp_dnode_info *);
-struct jffs2_raw_node_ref *jffs2_alloc_raw_node_ref(void);
+int jffs2_prealloc_raw_node_refs(struct jffs2_sb_info *c,
-void jffs2_free_raw_node_ref(struct jffs2_raw_node_ref *);
+                                 struct jffs2_eraseblock *jeb, int nr);
+void jffs2_free_refblock(struct jffs2_raw_node_ref *);
 struct jffs2_node_frag *jffs2_alloc_node_frag(void);
 void jffs2_free_node_frag(struct jffs2_node_frag *);
 struct jffs2_inode_cache *jffs2_alloc_inode_cache(void);
 void jffs2_free_inode_cache(struct jffs2_inode_cache *);
+#ifdef CONFIG_JFFS2_FS_XATTR
+struct jffs2_xattr_datum *jffs2_alloc_xattr_datum(void);
+void jffs2_free_xattr_datum(struct jffs2_xattr_datum *);
+struct jffs2_xattr_ref *jffs2_alloc_xattr_ref(void);
+void jffs2_free_xattr_ref(struct jffs2_xattr_ref *);
+#endif
 /* gc.c */
 int jffs2_garbage_collect_pass(struct jffs2_sb_info *c);
@@ -393,12 +430,14 @@ int jffs2_fill_scan_buf(struct jffs2_sb_info *c, void *buf,
                                uint32_t ofs, uint32_t len);
 struct jffs2_inode_cache *jffs2_scan_make_ino_cache(struct jffs2_sb_info *c, uint32_t ino);
 int jffs2_scan_classify_jeb(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb);
+int jffs2_scan_dirty_space(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, uint32_t size);
 /* build.c */
 int jffs2_do_mount_fs(struct jffs2_sb_info *c);
 /* erase.c */
 void jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count);
+void jffs2_free_jeb_node_refs(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb);
 #ifdef CONFIG_JFFS2_FS_WRITEBUFFER
 /* wbuf.c */
diff --git a/fs/jffs2/nodemgmt.c b/fs/jffs2/nodemgmt.c
index 49127a1f0458..d88376992ed9 100644
--- a/fs/jffs2/nodemgmt.c
+++ b/fs/jffs2/nodemgmt.c
@@ -23,13 +23,12 @@
 *      jffs2_reserve_space - request physical space to write nodes to flash
 *      @c: superblock info
 *      @minsize: Minimum acceptable size of allocation
- *      @ofs: Returned value of node offset
 *      @len: Returned value of allocation length
 *      @prio: Allocation type - ALLOC_{NORMAL,DELETION}
 *
 *      Requests a block of physical space on the flash. Returns zero for success
- *      and puts 'ofs' and 'len' into the appriopriate place, or returns -ENOSPC
+ *      and puts 'len' into the appropriate place, or returns -ENOSPC or other 
- *      or other error if appropriate.
+ *      error if appropriate. Doesn't return the offset, since that can be
+ *      derived from c->nextblock; see write_ofs().
 *
 *      If it returns zero, jffs2_reserve_space() also downs the per-filesystem
 *      allocation semaphore, to prevent more than one allocation from being
@@ -40,9 +39,9 @@
 */
 static int jffs2_do_reserve_space(struct jffs2_sb_info *c,  uint32_t minsize,
-                                        uint32_t *ofs, uint32_t *len, uint32_t sumsize);
+                                  uint32_t *len, uint32_t sumsize);
-int jffs2_reserve_space(struct jffs2_sb_info *c, uint32_t minsize, uint32_t *ofs,
+int jffs2_reserve_space(struct jffs2_sb_info *c, uint32_t minsize,
                        uint32_t *len, int prio, uint32_t sumsize)
 {
        int ret = -EAGAIN;
@@ -132,19 +131,21 @@ int jffs2_reserve_space(struct jffs2_sb_info *c, uint32_t minsize, uint32_t *ofs
                        spin_lock(&c->erase_completion_lock);
                }
-                ret = jffs2_do_reserve_space(c, minsize, ofs, len, sumsize);
+                ret = jffs2_do_reserve_space(c, minsize, len, sumsize);
                if (ret) {
                        D1(printk(KERN_DEBUG "jffs2_reserve_space: ret is %d\n", ret));
                }
        }
        spin_unlock(&c->erase_completion_lock);
+        if (!ret)
+                ret = jffs2_prealloc_raw_node_refs(c, c->nextblock, 1);
        if (ret)
                up(&c->alloc_sem);
        return ret;
 }
-int jffs2_reserve_space_gc(struct jffs2_sb_info *c, uint32_t minsize, uint32_t *ofs,
+int jffs2_reserve_space_gc(struct jffs2_sb_info *c, uint32_t minsize,
-                        uint32_t *len, uint32_t sumsize)
+                           uint32_t *len, uint32_t sumsize)
 {
        int ret = -EAGAIN;
        minsize = PAD(minsize);
@@ -153,12 +154,15 @@ int jffs2_reserve_space_gc(struct jffs2_sb_info *c, uint32_t minsize, uint32_t *
        spin_lock(&c->erase_completion_lock);
        while(ret == -EAGAIN) {
-                ret = jffs2_do_reserve_space(c, minsize, ofs, len, sumsize);
+                ret = jffs2_do_reserve_space(c, minsize, len, sumsize);
                if (ret) {
                        D1(printk(KERN_DEBUG "jffs2_reserve_space_gc: looping, ret is %d\n", ret));
                }
        }
        spin_unlock(&c->erase_completion_lock);
+        if (!ret)
+                ret = jffs2_prealloc_raw_node_refs(c, c->nextblock, 1);
        return ret;
 }
@@ -207,8 +211,7 @@ static int jffs2_find_nextblock(struct jffs2_sb_info *c)
                        struct jffs2_eraseblock *ejeb;
                        ejeb = list_entry(c->erasable_list.next, struct jffs2_eraseblock, list);
-                        list_del(&ejeb->list);
+                        list_move_tail(&ejeb->list, &c->erase_pending_list);
-                        list_add_tail(&ejeb->list, &c->erase_pending_list);
                        c->nr_erasing_blocks++;
                        jffs2_erase_pending_trigger(c);
                        D1(printk(KERN_DEBUG "jffs2_find_nextblock: Triggering erase of erasable block at 0x%08x\n",
@@ -259,10 +262,11 @@ static int jffs2_find_nextblock(struct jffs2_sb_info *c)
 }
 /* Called with alloc sem _and_ erase_completion_lock */
-static int jffs2_do_reserve_space(struct jffs2_sb_info *c, uint32_t minsize, uint32_t *ofs, uint32_t *len, uint32_t sumsize)
+static int jffs2_do_reserve_space(struct jffs2_sb_info *c, uint32_t minsize,
+                                  uint32_t *len, uint32_t sumsize)
 {
        struct jffs2_eraseblock *jeb = c->nextblock;
-        uint32_t reserved_size;                         /* for summary information at the end of the jeb */
+        uint32_t reserved_size;                         /* for summary information at the end of the jeb */
        int ret;
 restart:
@@ -312,6 +316,8 @@ static int jffs2_do_reserve_space(struct jffs2_sb_info *c, uint32_t minsize, uin
                }
        } else {
                if (jeb && minsize > jeb->free_size) {
+                        uint32_t waste;
                        /* Skip the end of this block and file it as having some dirty space */
                        /* If there's a pending write to it, flush now */
@@ -324,10 +330,26 @@ static int jffs2_do_reserve_space(struct jffs2_sb_info *c, uint32_t minsize, uin
                                goto restart;
                        }
-                        c->wasted_size += jeb->free_size;
+                        spin_unlock(&c->erase_completion_lock);
-                        c->free_size -= jeb->free_size;
-                        jeb->wasted_size += jeb->free_size;
+                        ret = jffs2_prealloc_raw_node_refs(c, jeb, 1);
-                        jeb->free_size = 0;
+                        /* Just lock it again and continue. Nothing much can change because
+                           we hold c->alloc_sem anyway. In fact, it's not entirely clear why
+                           we hold c->erase_completion_lock in the majority of this function...
+                           but that's a question for another (more caffeine-rich) day. */
+                        spin_lock(&c->erase_completion_lock);
+                        /* Only bail out once the lock is held again: this function is
+                           called with c->erase_completion_lock held and both callers
+                           spin_unlock() it unconditionally after we return. */
+                        if (ret)
+                                return ret;
+                        waste = jeb->free_size;
+                        jffs2_link_node_ref(c, jeb,
+                                            (jeb->offset + c->sector_size - waste) | REF_OBSOLETE,
+                                            waste, NULL);
+                        /* FIXME: that made it count as dirty. Convert to wasted */
+                        jeb->dirty_size -= waste;
+                        c->dirty_size -= waste;
+                        jeb->wasted_size += waste;
+                        c->wasted_size += waste;
                        jffs2_close_nextblock(c, jeb);
                        jeb = NULL;
@@ -349,7 +371,6 @@ static int jffs2_do_reserve_space(struct jffs2_sb_info *c, uint32_t minsize, uin
        }
        /* OK, jeb (==c->nextblock) is now pointing at a block which definitely has
           enough space */
-        *ofs = jeb->offset + (c->sector_size - jeb->free_size);
        *len = jeb->free_size - reserved_size;
        if (c->cleanmarker_size && jeb->used_size == c->cleanmarker_size &&
@@ -365,7 +386,8 @@ static int jffs2_do_reserve_space(struct jffs2_sb_info *c, uint32_t minsize, uin
                spin_lock(&c->erase_completion_lock);
        }
-        D1(printk(KERN_DEBUG "jffs2_do_reserve_space(): Giving 0x%x bytes at 0x%x\n", *len, *ofs));
+        D1(printk(KERN_DEBUG "jffs2_do_reserve_space(): Giving 0x%x bytes at 0x%x\n",
+                  *len, jeb->offset + (c->sector_size - jeb->free_size)));
        return 0;
 }
@@ -374,7 +396,6 @@ static int jffs2_do_reserve_space(struct jffs2_sb_info *c, uint32_t minsize, uin
 *      @c: superblock info
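- *      @new: new node reference to add
+ *      @ofs: flash offset of the node; the low two bits carry the REF_* flags
 *      @len: length of this physical node
+ *      @ic: handed on to jffs2_link_node_ref() along with @ofs and @len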
- *      @dirty: dirty flag for new node
 *
 *      Should only be used to report nodes for which space has been allocated
 *      by jffs2_reserve_space.
@@ -382,42 +403,30 @@ static int jffs2_do_reserve_space(struct jffs2_sb_info *c, uint32_t minsize, uin
 *      Must be called with the alloc_sem held.
 */
-int jffs2_add_physical_node_ref(struct jffs2_sb_info *c, struct jffs2_raw_node_ref *new)
+struct jffs2_raw_node_ref *jffs2_add_physical_node_ref(struct jffs2_sb_info *c,
+                                                       uint32_t ofs, uint32_t len,
+                                                       struct jffs2_inode_cache *ic)
 {
        struct jffs2_eraseblock *jeb;
-        uint32_t len;
+        struct jffs2_raw_node_ref *new;
-        jeb = &c->blocks[new->flash_offset / c->sector_size];
+        jeb = &c->blocks[ofs / c->sector_size];
-        len = ref_totlen(c, jeb, new);
-        D1(printk(KERN_DEBUG "jffs2_add_physical_node_ref(): Node at 0x%x(%d), size 0x%x\n", ref_offset(new), ref_flags(new), len));
+        D1(printk(KERN_DEBUG "jffs2_add_physical_node_ref(): Node at 0x%x(%d), size 0x%x\n",
+                  ofs & ~3, ofs & 3, len));
 #if 1
-        /* we could get some obsolete nodes after nextblock was refiled
+        /* Allow non-obsolete nodes only to be added at the end of c->nextblock, 
-           in wbuf.c */
+           if c->nextblock is set. Note that wbuf.c will file obsolete nodes
-        if ((c->nextblock || !ref_obsolete(new))
+           even after refiling c->nextblock */
-            &&(jeb != c->nextblock || ref_offset(new) != jeb->offset + (c->sector_size - jeb->free_size))) {
+        if ((c->nextblock || ((ofs & 3) != REF_OBSOLETE))
+            && (jeb != c->nextblock || (ofs & ~3) != jeb->offset + (c->sector_size - jeb->free_size))) {
                printk(KERN_WARNING "argh. node added in wrong place\n");
-                jffs2_free_raw_node_ref(new);
+                return ERR_PTR(-EINVAL);
-                return -EINVAL;
        }
 #endif
        spin_lock(&c->erase_completion_lock);
-        if (!jeb->first_node)
+        new = jffs2_link_node_ref(c, jeb, ofs, len, ic);
-                jeb->first_node = new;
-        if (jeb->last_node)
-                jeb->last_node->next_phys = new;
-        jeb->last_node = new;
-        jeb->free_size -= len;
-        c->free_size -= len;
-        if (ref_obsolete(new)) {
-                jeb->dirty_size += len;
-                c->dirty_size += len;
-        } else {
-                jeb->used_size += len;
-                c->used_size += len;
-        }
        if (!jeb->free_size && !jeb->dirty_size && !ISDIRTY(jeb->wasted_size)) {
                /* If it lives on the dirty_list, jffs2_reserve_space will put it there */
@@ -438,7 +447,7 @@ int jffs2_add_physical_node_ref(struct jffs2_sb_info *c, struct jffs2_raw_node_r
        spin_unlock(&c->erase_completion_lock);
-        return 0;
+        return new;
 }
@@ -470,8 +479,9 @@ void jffs2_mark_node_obsolete(struct jffs2_sb_info *c, struct jffs2_raw_node_ref
        struct jffs2_unknown_node n;
        int ret, addedsize;
        size_t retlen;
+        uint32_t freed_len;
-        if(!ref) {
+        if(unlikely(!ref)) {
                printk(KERN_NOTICE "EEEEEK. jffs2_mark_node_obsolete called with NULL node\n");
                return;
        }
@@ -499,32 +509,34 @@ void jffs2_mark_node_obsolete(struct jffs2_sb_info *c, struct jffs2_raw_node_ref
        spin_lock(&c->erase_completion_lock);
+        freed_len = ref_totlen(c, jeb, ref);
        if (ref_flags(ref) == REF_UNCHECKED) {
-                D1(if (unlikely(jeb->unchecked_size < ref_totlen(c, jeb, ref))) {
+                D1(if (unlikely(jeb->unchecked_size < freed_len)) {
                        printk(KERN_NOTICE "raw unchecked node of size 0x%08x freed from erase block %d at 0x%08x, but unchecked_size was already 0x%08x\n",
-                               ref_totlen(c, jeb, ref), blocknr, ref->flash_offset, jeb->used_size);
+                               freed_len, blocknr, ref->flash_offset, jeb->used_size);
                        BUG();
                })
-                D1(printk(KERN_DEBUG "Obsoleting previously unchecked node at 0x%08x of len %x: ", ref_offset(ref), ref_totlen(c, jeb, ref)));
+                D1(printk(KERN_DEBUG "Obsoleting previously unchecked node at 0x%08x of len %x: ", ref_offset(ref), freed_len));
-                jeb->unchecked_size -= ref_totlen(c, jeb, ref);
+                jeb->unchecked_size -= freed_len;
-                c->unchecked_size -= ref_totlen(c, jeb, ref);
+                c->unchecked_size -= freed_len;
        } else {
-                D1(if (unlikely(jeb->used_size < ref_totlen(c, jeb, ref))) {
+                D1(if (unlikely(jeb->used_size < freed_len)) {
                        printk(KERN_NOTICE "raw node of size 0x%08x freed from erase block %d at 0x%08x, but used_size was already 0x%08x\n",
-                               ref_totlen(c, jeb, ref), blocknr, ref->flash_offset, jeb->used_size);
+                               freed_len, blocknr, ref->flash_offset, jeb->used_size);
                        BUG();
                })
-                D1(printk(KERN_DEBUG "Obsoleting node at 0x%08x of len %#x: ", ref_offset(ref), ref_totlen(c, jeb, ref)));
+                D1(printk(KERN_DEBUG "Obsoleting node at 0x%08x of len %#x: ", ref_offset(ref), freed_len));
-                jeb->used_size -= ref_totlen(c, jeb, ref);
+                jeb->used_size -= freed_len;
-                c->used_size -= ref_totlen(c, jeb, ref);
+                c->used_size -= freed_len;
        }
        // Take care, that wasted size is taken into concern
-        if ((jeb->dirty_size || ISDIRTY(jeb->wasted_size + ref_totlen(c, jeb, ref))) && jeb != c->nextblock) {
+        if ((jeb->dirty_size || ISDIRTY(jeb->wasted_size + freed_len)) && jeb != c->nextblock) {
-                D1(printk(KERN_DEBUG "Dirtying\n"));
+                D1(printk("Dirtying\n"));
-                addedsize = ref_totlen(c, jeb, ref);
+                addedsize = freed_len;
-                jeb->dirty_size += ref_totlen(c, jeb, ref);
+                jeb->dirty_size += freed_len;
-                c->dirty_size += ref_totlen(c, jeb, ref);
+                c->dirty_size += freed_len;
                /* Convert wasted space to dirty, if not a bad block */
                if (jeb->wasted_size) {
@@ -543,10 +555,10 @@ void jffs2_mark_node_obsolete(struct jffs2_sb_info *c, struct jffs2_raw_node_ref
                        }
                }
        } else {
-                D1(printk(KERN_DEBUG "Wasting\n"));
+                D1(printk("Wasting\n"));
                addedsize = 0;
-                jeb->wasted_size += ref_totlen(c, jeb, ref);
+                jeb->wasted_size += freed_len;
-                c->wasted_size += ref_totlen(c, jeb, ref);
+                c->wasted_size += freed_len;
        }
        ref->flash_offset = ref_offset(ref) | REF_OBSOLETE;
@@ -622,7 +634,7 @@ void jffs2_mark_node_obsolete(struct jffs2_sb_info *c, struct jffs2_raw_node_ref
        /* The erase_free_sem is locked, and has been since before we marked the node obsolete
           and potentially put its eraseblock onto the erase_pending_list. Thus, we know that
           the block hasn't _already_ been erased, and that 'ref' itself hasn't been freed yet
-           by jffs2_free_all_node_refs() in erase.c. Which is nice. */
+           by jffs2_free_jeb_node_refs() in erase.c. Which is nice. */
        D1(printk(KERN_DEBUG "obliterating obsoleted node at 0x%08x\n", ref_offset(ref)));
        ret = jffs2_flash_read(c, ref_offset(ref), sizeof(n), &retlen, (char *)&n);
@@ -634,8 +646,8 @@ void jffs2_mark_node_obsolete(struct jffs2_sb_info *c, struct jffs2_raw_node_ref
                printk(KERN_WARNING "Short read from obsoleted node at 0x%08x: %zd\n", ref_offset(ref), retlen);
                goto out_erase_sem;
        }
-        if (PAD(je32_to_cpu(n.totlen)) != PAD(ref_totlen(c, jeb, ref))) {
+        if (PAD(je32_to_cpu(n.totlen)) != PAD(freed_len)) {
-                printk(KERN_WARNING "Node totlen on flash (0x%08x) != totlen from node ref (0x%08x)\n", je32_to_cpu(n.totlen), ref_totlen(c, jeb, ref));
+                printk(KERN_WARNING "Node totlen on flash (0x%08x) != totlen from node ref (0x%08x)\n", je32_to_cpu(n.totlen), freed_len);
                goto out_erase_sem;
        }
        if (!(je16_to_cpu(n.nodetype) & JFFS2_NODE_ACCURATE)) {
@@ -677,57 +689,23 @@ void jffs2_mark_node_obsolete(struct jffs2_sb_info *c, struct jffs2_raw_node_ref
                *p = ref->next_in_ino;
                ref->next_in_ino = NULL;
-                if (ic->nodes == (void *)ic && ic->nlink == 0)
+                switch (ic->class) {
-                        jffs2_del_ino_cache(c, ic);
+#ifdef CONFIG_JFFS2_FS_XATTR
+                        case RAWNODE_CLASS_XATTR_DATUM:
-                spin_unlock(&c->erase_completion_lock);
+                                jffs2_release_xattr_datum(c, (struct jffs2_xattr_datum *)ic);
-        }
+                                break;
+                        case RAWNODE_CLASS_XATTR_REF:
+                                jffs2_release_xattr_ref(c, (struct jffs2_xattr_ref *)ic);
-        /* Merge with the next node in the physical list, if there is one
+                                break;
-           and if it's also obsolete and if it doesn't belong to any inode */
+#endif
-        if (ref->next_phys && ref_obsolete(ref->next_phys) &&
+                        default:
-            !ref->next_phys->next_in_ino) {
+                                if (ic->nodes == (void *)ic && ic->nlink == 0)
-                struct jffs2_raw_node_ref *n = ref->next_phys;
+                                        jffs2_del_ino_cache(c, ic);
+                                break;
-                spin_lock(&c->erase_completion_lock);
-                ref->__totlen += n->__totlen;
-                ref->next_phys = n->next_phys;
-                if (jeb->last_node == n) jeb->last_node = ref;
-                if (jeb->gc_node == n) {
-                        /* gc will be happy continuing gc on this node */
-                        jeb->gc_node=ref;
                }
                spin_unlock(&c->erase_completion_lock);
-                jffs2_free_raw_node_ref(n);
        }
-        /* Also merge with the previous node in the list, if there is one
-           and that one is obsolete */
-        if (ref != jeb->first_node ) {
-                struct jffs2_raw_node_ref *p = jeb->first_node;
-                spin_lock(&c->erase_completion_lock);
-                while (p->next_phys != ref)
-                        p = p->next_phys;
-                if (ref_obsolete(p) && !ref->next_in_ino) {
-                        p->__totlen += ref->__totlen;
-                        if (jeb->last_node == ref) {
-                                jeb->last_node = p;
-                        }
-                        if (jeb->gc_node == ref) {
-                                /* gc will be happy continuing gc on this node */
-                                jeb->gc_node=p;
-                        }
-                        p->next_phys = ref->next_phys;
-                        jffs2_free_raw_node_ref(ref);
-                }
-                spin_unlock(&c->erase_completion_lock);
-        }
 out_erase_sem:
        up(&c->erase_free_sem);
 }
diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h
index d307cf548625..9f41fc01a371 100644
--- a/fs/jffs2/os-linux.h
+++ b/fs/jffs2/os-linux.h
@@ -31,9 +31,7 @@ struct kvec;
 #define JFFS2_F_I_MODE(f) (OFNI_EDONI_2SFFJ(f)->i_mode)
 #define JFFS2_F_I_UID(f) (OFNI_EDONI_2SFFJ(f)->i_uid)
 #define JFFS2_F_I_GID(f) (OFNI_EDONI_2SFFJ(f)->i_gid)
+#define JFFS2_F_I_RDEV(f) (OFNI_EDONI_2SFFJ(f)->i_rdev)
-#define JFFS2_F_I_RDEV_MIN(f) (iminor(OFNI_EDONI_2SFFJ(f)))
-#define JFFS2_F_I_RDEV_MAJ(f) (imajor(OFNI_EDONI_2SFFJ(f)))
 #define ITIME(sec) ((struct timespec){sec, 0})
 #define I_SEC(tv) ((tv).tv_sec)
@@ -60,6 +58,10 @@ static inline void jffs2_init_inode_info(struct jffs2_inode_info *f)
        f->target = NULL;
        f->flags = 0;
        f->usercompr = 0;
+#ifdef CONFIG_JFFS2_FS_POSIX_ACL
+        f->i_acl_access = JFFS2_ACL_NOT_CACHED;
+        f->i_acl_default = JFFS2_ACL_NOT_CACHED;
+#endif
 }
@@ -90,13 +92,10 @@ static inline void jffs2_init_inode_info(struct jffs2_inode_info *f)
 #define jffs2_flash_writev(a,b,c,d,e,f) jffs2_flash_direct_writev(a,b,c,d,e)
 #define jffs2_wbuf_timeout NULL
 #define jffs2_wbuf_process NULL
-#define jffs2_nor_ecc(c) (0)
 #define jffs2_dataflash(c) (0)
-#define jffs2_nor_wbuf_flash(c) (0)
-#define jffs2_nor_ecc_flash_setup(c) (0)
-#define jffs2_nor_ecc_flash_cleanup(c) do {} while (0)
 #define jffs2_dataflash_setup(c) (0)
 #define jffs2_dataflash_cleanup(c) do {} while (0)
+#define jffs2_nor_wbuf_flash(c) (0)
 #define jffs2_nor_wbuf_flash_setup(c) (0)
 #define jffs2_nor_wbuf_flash_cleanup(c) do {} while (0)
@@ -107,9 +106,7 @@ static inline void jffs2_init_inode_info(struct jffs2_inode_info *f)
 #ifdef CONFIG_JFFS2_SUMMARY
 #define jffs2_can_mark_obsolete(c) (0)
 #else
-#define jffs2_can_mark_obsolete(c) \
+#define jffs2_can_mark_obsolete(c) (c->mtd->flags & (MTD_BIT_WRITEABLE))
-  ((c->mtd->type == MTD_NORFLASH && !(c->mtd->flags & (MTD_ECC|MTD_PROGRAM_REGIONS))) || \
-   c->mtd->type == MTD_RAM)
 #endif
 #define jffs2_cleanmarker_oob(c) (c->mtd->type == MTD_NANDFLASH)
@@ -133,15 +130,11 @@ int jffs2_flush_wbuf_pad(struct jffs2_sb_info *c);
 int jffs2_nand_flash_setup(struct jffs2_sb_info *c);
 void jffs2_nand_flash_cleanup(struct jffs2_sb_info *c);
-#define jffs2_nor_ecc(c) (c->mtd->type == MTD_NORFLASH && (c->mtd->flags & MTD_ECC))
-int jffs2_nor_ecc_flash_setup(struct jffs2_sb_info *c);
-void jffs2_nor_ecc_flash_cleanup(struct jffs2_sb_info *c);
 #define jffs2_dataflash(c) (c->mtd->type == MTD_DATAFLASH)
 int jffs2_dataflash_setup(struct jffs2_sb_info *c);
 void jffs2_dataflash_cleanup(struct jffs2_sb_info *c);
-#define jffs2_nor_wbuf_flash(c) (c->mtd->type == MTD_NORFLASH && (c->mtd->flags & MTD_PROGRAM_REGIONS))
+#define jffs2_nor_wbuf_flash(c) (c->mtd->type == MTD_NORFLASH && ! (c->mtd->flags & MTD_BIT_WRITEABLE))
 int jffs2_nor_wbuf_flash_setup(struct jffs2_sb_info *c);
 void jffs2_nor_wbuf_flash_cleanup(struct jffs2_sb_info *c);
@@ -165,7 +158,7 @@ extern struct inode_operations jffs2_dir_inode_operations;
 /* file.c */
 extern const struct file_operations jffs2_file_operations;
 extern struct inode_operations jffs2_file_inode_operations;
-extern struct address_space_operations jffs2_file_address_operations;
+extern const struct address_space_operations jffs2_file_address_operations;
 int jffs2_fsync(struct file *, struct dentry *, int);
 int jffs2_do_readpage_unlock (struct inode *inode, struct page *pg);
@@ -182,7 +175,7 @@ void jffs2_clear_inode (struct inode *);
 void jffs2_dirty_inode(struct inode *inode);
 struct inode *jffs2_new_inode (struct inode *dir_i, int mode,
                               struct jffs2_raw_inode *ri);
-int jffs2_statfs (struct super_block *, struct kstatfs *);
+int jffs2_statfs (struct dentry *, struct kstatfs *);
 void jffs2_write_super (struct super_block *);
 int jffs2_remount_fs (struct super_block *, int *, char *);
 int jffs2_do_fill_super(struct super_block *sb, void *data, int silent);
diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c
index f1695642d0f7..cc1899268c43 100644
--- a/fs/jffs2/readinode.c
+++ b/fs/jffs2/readinode.c
@@ -66,7 +66,7 @@ static void jffs2_free_tmp_dnode_info_list(struct rb_root *list)
                        jffs2_free_full_dnode(tn->fn);
                        jffs2_free_tmp_dnode_info(tn);
-                        this = this->rb_parent;
+                        this = rb_parent(this);
                        if (!this)
                                break;
@@ -116,19 +116,42 @@ static inline int read_direntry(struct jffs2_sb_info *c, struct jffs2_raw_node_r
                                uint32_t *latest_mctime, uint32_t *mctime_ver)
 {
        struct jffs2_full_dirent *fd;
+        uint32_t crc;
-        /* The direntry nodes are checked during the flash scanning */
-        BUG_ON(ref_flags(ref) == REF_UNCHECKED);
        /* Obsoleted. This cannot happen, surely? dwmw2 20020308 */
        BUG_ON(ref_obsolete(ref));
-        /* Sanity check */
+        crc = crc32(0, rd, sizeof(*rd) - 8);
-        if (unlikely(PAD((rd->nsize + sizeof(*rd))) != PAD(je32_to_cpu(rd->totlen)))) {
+        if (unlikely(crc != je32_to_cpu(rd->node_crc))) {
-                JFFS2_ERROR("illegal nsize in node at %#08x: nsize %#02x, totlen %#04x\n",
+                JFFS2_NOTICE("header CRC failed on dirent node at %#08x: read %#08x, calculated %#08x\n",
-                       ref_offset(ref), rd->nsize, je32_to_cpu(rd->totlen));
+                             ref_offset(ref), je32_to_cpu(rd->node_crc), crc);
                return 1;
        }
+        /* If we've never checked the CRCs on this node, check them now */
+        if (ref_flags(ref) == REF_UNCHECKED) {
+                struct jffs2_eraseblock *jeb;
+                int len;
+                /* Sanity check */
+                if (unlikely(PAD((rd->nsize + sizeof(*rd))) != PAD(je32_to_cpu(rd->totlen)))) {
+                        JFFS2_ERROR("illegal nsize in node at %#08x: nsize %#02x, totlen %#04x\n",
+                                    ref_offset(ref), rd->nsize, je32_to_cpu(rd->totlen));
+                        return 1;
+                }
+                jeb = &c->blocks[ref->flash_offset / c->sector_size];
+                len = ref_totlen(c, jeb, ref);
+                spin_lock(&c->erase_completion_lock);
+                jeb->used_size += len;
+                jeb->unchecked_size -= len;
+                c->used_size += len;
+                c->unchecked_size -= len;
+                ref->flash_offset = ref_offset(ref) | REF_PRISTINE;
+                spin_unlock(&c->erase_completion_lock);
+        }
        fd = jffs2_alloc_full_dirent(rd->nsize + 1);
        if (unlikely(!fd))
                return -ENOMEM;
@@ -198,13 +221,21 @@ static inline int read_dnode(struct jffs2_sb_info *c, struct jffs2_raw_node_ref
        struct jffs2_tmp_dnode_info *tn;
        uint32_t len, csize;
        int ret = 1;
+        uint32_t crc;
        /* Obsoleted. This cannot happen, surely? dwmw2 20020308 */
        BUG_ON(ref_obsolete(ref));
+        crc = crc32(0, rd, sizeof(*rd) - 8);
+        if (unlikely(crc != je32_to_cpu(rd->node_crc))) {
+                JFFS2_NOTICE("node CRC failed on dnode at %#08x: read %#08x, calculated %#08x\n",
+                             ref_offset(ref), je32_to_cpu(rd->node_crc), crc);
+                return 1;
+        }
        tn = jffs2_alloc_tmp_dnode_info();
        if (!tn) {
-                JFFS2_ERROR("failed to allocate tn (%d bytes).\n", sizeof(*tn));
+                JFFS2_ERROR("failed to allocate tn (%zu bytes).\n", sizeof(*tn));
                return -ENOMEM;
        }
@@ -213,14 +244,6 @@ static inline int read_dnode(struct jffs2_sb_info *c, struct jffs2_raw_node_ref
        /* If we've never checked the CRCs on this node, check them now */
        if (ref_flags(ref) == REF_UNCHECKED) {
-                uint32_t crc;
-                crc = crc32(0, rd, sizeof(*rd) - 8);
-                if (unlikely(crc != je32_to_cpu(rd->node_crc))) {
-                        JFFS2_NOTICE("header CRC failed on node at %#08x: read %#08x, calculated %#08x\n",
-                                        ref_offset(ref), je32_to_cpu(rd->node_crc), crc);
-                        goto free_out;
-                }
                /* Sanity checks */
                if (unlikely(je32_to_cpu(rd->offset) > je32_to_cpu(rd->isize)) ||
@@ -343,7 +366,7 @@ free_out:
 * Helper function for jffs2_get_inode_nodes().
 * It is called every time an unknown node is found.
 *
- * Returns: 0 on succes;
+ * Returns: 0 on success;
 *          1 if the node should be marked obsolete;
 *          negative error code on failure.
 */
@@ -354,37 +377,30 @@ static inline int read_unknown(struct jffs2_sb_info *c, struct jffs2_raw_node_re
        un->nodetype = cpu_to_je16(JFFS2_NODE_ACCURATE | je16_to_cpu(un->nodetype));
-        if (crc32(0, un, sizeof(struct jffs2_unknown_node) - 4) != je32_to_cpu(un->hdr_crc)) {
+        switch(je16_to_cpu(un->nodetype) & JFFS2_COMPAT_MASK) {
-                /* Hmmm. This should have been caught at scan time. */
-                JFFS2_NOTICE("node header CRC failed at %#08x. But it must have been OK earlier.\n", ref_offset(ref));
-                jffs2_dbg_dump_node(c, ref_offset(ref));
-                return 1;
-        } else {
-                switch(je16_to_cpu(un->nodetype) & JFFS2_COMPAT_MASK) {
-                case JFFS2_FEATURE_INCOMPAT:
+        case JFFS2_FEATURE_INCOMPAT:
-                        JFFS2_ERROR("unknown INCOMPAT nodetype %#04X at %#08x\n",
+                JFFS2_ERROR("unknown INCOMPAT nodetype %#04X at %#08x\n",
-                                je16_to_cpu(un->nodetype), ref_offset(ref));
+                            je16_to_cpu(un->nodetype), ref_offset(ref));
-                        /* EEP */
+                /* EEP */
-                        BUG();
+                BUG();
-                        break;
+                break;
-                case JFFS2_FEATURE_ROCOMPAT:
+        case JFFS2_FEATURE_ROCOMPAT:
-                        JFFS2_ERROR("unknown ROCOMPAT nodetype %#04X at %#08x\n",
+                JFFS2_ERROR("unknown ROCOMPAT nodetype %#04X at %#08x\n",
-                                        je16_to_cpu(un->nodetype), ref_offset(ref));
+                            je16_to_cpu(un->nodetype), ref_offset(ref));
-                        BUG_ON(!(c->flags & JFFS2_SB_FLAG_RO));
+                BUG_ON(!(c->flags & JFFS2_SB_FLAG_RO));
-                        break;
+                break;
-                case JFFS2_FEATURE_RWCOMPAT_COPY:
+        case JFFS2_FEATURE_RWCOMPAT_COPY:
-                        JFFS2_NOTICE("unknown RWCOMPAT_COPY nodetype %#04X at %#08x\n",
+                JFFS2_NOTICE("unknown RWCOMPAT_COPY nodetype %#04X at %#08x\n",
-                                        je16_to_cpu(un->nodetype), ref_offset(ref));
+                             je16_to_cpu(un->nodetype), ref_offset(ref));
-                        break;
+                break;
-                case JFFS2_FEATURE_RWCOMPAT_DELETE:
+        case JFFS2_FEATURE_RWCOMPAT_DELETE:
-                        JFFS2_NOTICE("unknown RWCOMPAT_DELETE nodetype %#04X at %#08x\n",
+                JFFS2_NOTICE("unknown RWCOMPAT_DELETE nodetype %#04X at %#08x\n",
-                                        je16_to_cpu(un->nodetype), ref_offset(ref));
+                             je16_to_cpu(un->nodetype), ref_offset(ref));
-                        return 1;
+                return 1;
-                }
        }
        return 0;
@@ -434,7 +450,7 @@ static int read_more(struct jffs2_sb_info *c, struct jffs2_raw_node_ref *ref,
        }
        if (retlen < len) {
-                JFFS2_ERROR("short read at %#08x: %d instead of %d.\n",
+                JFFS2_ERROR("short read at %#08x: %zu instead of %d.\n",
                                offs, retlen, len);
                return -EIO;
        }
@@ -542,13 +558,25 @@ static int jffs2_get_inode_nodes(struct jffs2_sb_info *c, struct jffs2_inode_inf
                }
                if (retlen < len) {
-                        JFFS2_ERROR("short read at %#08x: %d instead of %d.\n", ref_offset(ref), retlen, len);
+                        JFFS2_ERROR("short read at %#08x: %zu instead of %d.\n", ref_offset(ref), retlen, len);
                        err = -EIO;
                        goto free_out;
                }
                node = (union jffs2_node_union *)bufstart;
+                /* No need to mask in the valid bit; it shouldn't be invalid */
+                if (je32_to_cpu(node->u.hdr_crc) != crc32(0, node, sizeof(node->u)-4)) {
+                        JFFS2_NOTICE("Node header CRC failed at %#08x. {%04x,%04x,%08x,%08x}\n",
+                                     ref_offset(ref), je16_to_cpu(node->u.magic),
+                                     je16_to_cpu(node->u.nodetype),
+                                     je32_to_cpu(node->u.totlen),
+                                     je32_to_cpu(node->u.hdr_crc));
+                        jffs2_dbg_dump_node(c, ref_offset(ref));
+                        jffs2_mark_node_obsolete(c, ref);
+                        goto cont;
+                }
                switch (je16_to_cpu(node->u.nodetype)) {
                case JFFS2_NODETYPE_DIRENT:
@@ -606,6 +634,7 @@ static int jffs2_get_inode_nodes(struct jffs2_sb_info *c, struct jffs2_inode_inf
                                goto free_out;
                }
+        cont:
                spin_lock(&c->erase_completion_lock);
        }
@@ -679,12 +708,12 @@ static int jffs2_do_read_inode_internal(struct jffs2_sb_info *c,
                        jffs2_mark_node_obsolete(c, fn->raw);
                BUG_ON(rb->rb_left);
-                if (rb->rb_parent && rb->rb_parent->rb_left == rb) {
+                if (rb_parent(rb) && rb_parent(rb)->rb_left == rb) {
                        /* We were then left-hand child of our parent. We need
                         * to move our own right-hand child into our place. */
                        repl_rb = rb->rb_right;
                        if (repl_rb)
-                                repl_rb->rb_parent = rb->rb_parent;
+                                rb_set_parent(repl_rb, rb_parent(rb));
                } else
                        repl_rb = NULL;
@@ -692,14 +721,14 @@ static int jffs2_do_read_inode_internal(struct jffs2_sb_info *c,
                /* Remove the spent tn from the tree; don't bother rebalancing
                 * but put our right-hand child in our own place. */
-                if (tn->rb.rb_parent) {
+                if (rb_parent(&tn->rb)) {
-                        if (tn->rb.rb_parent->rb_left == &tn->rb)
+                        if (rb_parent(&tn->rb)->rb_left == &tn->rb)
-                                tn->rb.rb_parent->rb_left = repl_rb;
+                                rb_parent(&tn->rb)->rb_left = repl_rb;
-                        else if (tn->rb.rb_parent->rb_right == &tn->rb)
+                        else if (rb_parent(&tn->rb)->rb_right == &tn->rb)
-                                tn->rb.rb_parent->rb_right = repl_rb;
+                                rb_parent(&tn->rb)->rb_right = repl_rb;
                        else BUG();
                } else if (tn->rb.rb_right)
-                        tn->rb.rb_right->rb_parent = NULL;
+                        rb_set_parent(tn->rb.rb_right, NULL);
                jffs2_free_tmp_dnode_info(tn);
                if (ret) {
@@ -939,6 +968,7 @@ void jffs2_do_clear_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f)
        struct jffs2_full_dirent *fd, *fds;
        int deleted;
+        jffs2_xattr_delete_inode(c, f->inocache);
        down(&f->sem);
        deleted = f->inocache && !f->inocache->nlink;
diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c
index cf55b221fc2b..2bfdc33752d3 100644
--- a/fs/jffs2/scan.c
+++ b/fs/jffs2/scan.c
@@ -65,6 +65,28 @@ static inline uint32_t EMPTY_SCAN_SIZE(uint32_t sector_size) {
                return DEFAULT_EMPTY_SCAN_SIZE;
 }
+static int file_dirty(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb)
+{
+        int ret;
+        if ((ret = jffs2_prealloc_raw_node_refs(c, jeb, 1)))
+                return ret;
+        if ((ret = jffs2_scan_dirty_space(c, jeb, jeb->free_size)))
+                return ret;
+        /* Turn wasted size into dirty, since we apparently
+           think it's recoverable now. */
+        jeb->dirty_size += jeb->wasted_size;
+        c->dirty_size += jeb->wasted_size;
+        c->wasted_size -= jeb->wasted_size;
+        jeb->wasted_size = 0;
+        if (VERYDIRTY(c, jeb->dirty_size)) {
+                list_add(&jeb->list, &c->very_dirty_list);
+        } else {
+                list_add(&jeb->list, &c->dirty_list);
+        }
+        return 0;
+}
 int jffs2_scan_medium(struct jffs2_sb_info *c)
 {
        int i, ret;
@@ -170,34 +192,20 @@ int jffs2_scan_medium(struct jffs2_sb_info *c)
                                        (!c->nextblock || c->nextblock->free_size < jeb->free_size)) {
                                /* Better candidate for the next writes to go to */
                                if (c->nextblock) {
-                                        c->nextblock->dirty_size += c->nextblock->free_size + c->nextblock->wasted_size;
+                                        ret = file_dirty(c, c->nextblock);
-                                        c->dirty_size += c->nextblock->free_size + c->nextblock->wasted_size;
+                                        if (ret)
-                                        c->free_size -= c->nextblock->free_size;
+                                                return ret;
-                                        c->wasted_size -= c->nextblock->wasted_size;
-                                        c->nextblock->free_size = c->nextblock->wasted_size = 0;
-                                        if (VERYDIRTY(c, c->nextblock->dirty_size)) {
-                                                list_add(&c->nextblock->list, &c->very_dirty_list);
-                                        } else {
-                                                list_add(&c->nextblock->list, &c->dirty_list);
-                                        }
                                        /* deleting summary information of the old nextblock */
                                        jffs2_sum_reset_collected(c->summary);
                                }
-                                /* update collected summary infromation for the current nextblock */
+                                /* update collected summary information for the current nextblock */
                                jffs2_sum_move_collected(c, s);
                                D1(printk(KERN_DEBUG "jffs2_scan_medium(): new nextblock = 0x%08x\n", jeb->offset));
                                c->nextblock = jeb;
                        } else {
-                                jeb->dirty_size += jeb->free_size + jeb->wasted_size;
+                                ret = file_dirty(c, jeb);
-                                c->dirty_size += jeb->free_size + jeb->wasted_size;
+                                if (ret)
-                                c->free_size -= jeb->free_size;
+                                        return ret;
-                                c->wasted_size -= jeb->wasted_size;
-                                jeb->free_size = jeb->wasted_size = 0;
-                                if (VERYDIRTY(c, jeb->dirty_size)) {
-                                        list_add(&jeb->list, &c->very_dirty_list);
-                                } else {
-                                        list_add(&jeb->list, &c->dirty_list);
-                                }
                        }
                        break;
@@ -222,9 +230,6 @@ int jffs2_scan_medium(struct jffs2_sb_info *c)
                }
        }
-        if (jffs2_sum_active() && s)
-                kfree(s);
        /* Nextblock dirty is always seen as wasted, because we cannot recycle it now */
        if (c->nextblock && (c->nextblock->dirty_size)) {
                c->nextblock->wasted_size += c->nextblock->dirty_size;
@@ -242,11 +247,8 @@ int jffs2_scan_medium(struct jffs2_sb_info *c)
                D1(printk(KERN_DEBUG "jffs2_scan_medium(): Skipping %d bytes in nextblock to ensure page alignment\n",
                          skip));
-                c->nextblock->wasted_size += skip;
+                jffs2_prealloc_raw_node_refs(c, c->nextblock, 1);
-                c->wasted_size += skip;
+                jffs2_scan_dirty_space(c, c->nextblock, skip);
-                c->nextblock->free_size -= skip;
-                c->free_size -= skip;
        }
 #endif
        if (c->nr_erasing_blocks) {
@@ -266,6 +268,9 @@ int jffs2_scan_medium(struct jffs2_sb_info *c)
        else
                c->mtd->unpoint(c->mtd, flashbuf, 0, c->mtd->size);
 #endif
+        if (s)
+                kfree(s);
        return ret;
 }
@@ -290,7 +295,7 @@ int jffs2_fill_scan_buf (struct jffs2_sb_info *c, void *buf,
 int jffs2_scan_classify_jeb(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb)
 {
        if ((jeb->used_size + jeb->unchecked_size) == PAD(c->cleanmarker_size) && !jeb->dirty_size
-                && (!jeb->first_node || !jeb->first_node->next_phys) )
+            && (!jeb->first_node || !ref_next(jeb->first_node)) )
                return BLK_STATE_CLEANMARKER;
        /* move blocks with max 4 byte dirty space to cleanlist */
@@ -306,11 +311,126 @@ int jffs2_scan_classify_jeb(struct jffs2_sb_info *c, struct jffs2_eraseblock *je
                return BLK_STATE_ALLDIRTY;
 }
+#ifdef CONFIG_JFFS2_FS_XATTR
+static int jffs2_scan_xattr_node(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb,
+                                 struct jffs2_raw_xattr *rx, uint32_t ofs,
+                                 struct jffs2_summary *s)
+{
+        struct jffs2_xattr_datum *xd;
+        uint32_t xid, version, totlen, crc;
+        int err;
+        crc = crc32(0, rx, sizeof(struct jffs2_raw_xattr) - 4);
+        if (crc != je32_to_cpu(rx->node_crc)) {
+                JFFS2_WARNING("node CRC failed at %#08x, read=%#08x, calc=%#08x\n",
+                              ofs, je32_to_cpu(rx->node_crc), crc);
+                if ((err = jffs2_scan_dirty_space(c, jeb, je32_to_cpu(rx->totlen))))
+                        return err;
+                return 0;
+        }
+        xid = je32_to_cpu(rx->xid);
+        version = je32_to_cpu(rx->version);
+        totlen = PAD(sizeof(struct jffs2_raw_xattr)
+                        + rx->name_len + 1 + je16_to_cpu(rx->value_len));
+        if (totlen != je32_to_cpu(rx->totlen)) {
+                JFFS2_WARNING("node length mismatch at %#08x, read=%u, calc=%u\n",
+                              ofs, je32_to_cpu(rx->totlen), totlen);
+                if ((err = jffs2_scan_dirty_space(c, jeb, je32_to_cpu(rx->totlen))))
+                        return err;
+                return 0;
+        }
+        xd = jffs2_setup_xattr_datum(c, xid, version);
+        if (IS_ERR(xd))
+                return PTR_ERR(xd);
+        if (xd->version > version) {
+                struct jffs2_raw_node_ref *raw
+                        = jffs2_link_node_ref(c, jeb, ofs | REF_PRISTINE, totlen, NULL);
+                raw->next_in_ino = xd->node->next_in_ino;
+                xd->node->next_in_ino = raw;
+        } else {
+                xd->version = version;
+                xd->xprefix = rx->xprefix;
+                xd->name_len = rx->name_len;
+                xd->value_len = je16_to_cpu(rx->value_len);
+                xd->data_crc = je32_to_cpu(rx->data_crc);
+                jffs2_link_node_ref(c, jeb, ofs | REF_PRISTINE, totlen, (void *)xd);
+        }
+        if (jffs2_sum_active())
+                jffs2_sum_add_xattr_mem(s, rx, ofs - jeb->offset);
+        dbg_xattr("scaning xdatum at %#08x (xid=%u, version=%u)\n",
+                  ofs, xd->xid, xd->version);
+        return 0;
+}
+static int jffs2_scan_xref_node(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb,
+                                struct jffs2_raw_xref *rr, uint32_t ofs,
+                                struct jffs2_summary *s)
+{
+        struct jffs2_xattr_ref *ref;
+        uint32_t crc;
+        int err;
+        crc = crc32(0, rr, sizeof(*rr) - 4);
+        if (crc != je32_to_cpu(rr->node_crc)) {
+                JFFS2_WARNING("node CRC failed at %#08x, read=%#08x, calc=%#08x\n",
+                              ofs, je32_to_cpu(rr->node_crc), crc);
+                if ((err = jffs2_scan_dirty_space(c, jeb, PAD(je32_to_cpu(rr->totlen)))))
+                        return err;
+                return 0;
+        }
+        if (PAD(sizeof(struct jffs2_raw_xref)) != je32_to_cpu(rr->totlen)) {
+                JFFS2_WARNING("node length mismatch at %#08x, read=%u, calc=%zd\n",
+                              ofs, je32_to_cpu(rr->totlen),
+                              PAD(sizeof(struct jffs2_raw_xref)));
+                if ((err = jffs2_scan_dirty_space(c, jeb, je32_to_cpu(rr->totlen))))
+                        return err;
+                return 0;
+        }
+        ref = jffs2_alloc_xattr_ref();
+        if (!ref)
+                return -ENOMEM;
+        /* BEFORE jffs2_build_xattr_subsystem() is called,
+         * and AFTER xattr_ref is marked as a dead xref,
+         * ref->xid is used to store 32bit xid, xd is not used
+         * ref->ino is used to store 32bit inode-number, ic is not used
+         * Those variables are declared as union, thus using those
+         * are exclusive. In a similar way, ref->next is temporarily
+         * used to chain all xattr_ref objects. It's re-chained to
+         * jffs2_inode_cache in jffs2_build_xattr_subsystem() correctly.
+         */
+        ref->ino = je32_to_cpu(rr->ino);
+        ref->xid = je32_to_cpu(rr->xid);
+        ref->xseqno = je32_to_cpu(rr->xseqno);
+        if (ref->xseqno > c->highest_xseqno)
+                c->highest_xseqno = (ref->xseqno & ~XREF_DELETE_MARKER);
+        ref->next = c->xref_temp;
+        c->xref_temp = ref;
+        jffs2_link_node_ref(c, jeb, ofs | REF_PRISTINE, PAD(je32_to_cpu(rr->totlen)), (void *)ref);
+        if (jffs2_sum_active())
+                jffs2_sum_add_xref_mem(s, rr, ofs - jeb->offset);
+        dbg_xattr("scan xref at %#08x (xid=%u, ino=%u)\n",
+                  ofs, ref->xid, ref->ino);
+        return 0;
+}
+#endif
+/* Called with 'buf_size == 0' if buf is in fact a pointer _directly_ into
+   the flash, XIP-style */
 static int jffs2_scan_eraseblock (struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb,
-                                unsigned char *buf, uint32_t buf_size, struct jffs2_summary *s) {
+                                  unsigned char *buf, uint32_t buf_size, struct jffs2_summary *s) {
        struct jffs2_unknown_node *node;
        struct jffs2_unknown_node crcnode;
-        struct jffs2_sum_marker *sm;
        uint32_t ofs, prevofs;
        uint32_t hdr_crc, buf_ofs, buf_len;
        int err;
@@ -344,44 +464,75 @@ static int jffs2_scan_eraseblock (struct jffs2_sb_info *c, struct jffs2_eraseblo
 #endif
        if (jffs2_sum_active()) {
-                sm = kmalloc(sizeof(struct jffs2_sum_marker), GFP_KERNEL);
+                struct jffs2_sum_marker *sm;
-                if (!sm) {
+                void *sumptr = NULL;
-                        return -ENOMEM;
+                uint32_t sumlen;
-                }
+              
+                if (!buf_size) {
-                err = jffs2_fill_scan_buf(c, (unsigned char *) sm, jeb->offset + c->sector_size -
+                        /* XIP case. Just look, point at the summary if it's there */
-                                        sizeof(struct jffs2_sum_marker), sizeof(struct jffs2_sum_marker));
+                        sm = (void *)buf + c->sector_size - sizeof(*sm);
-                if (err) {
+                        if (je32_to_cpu(sm->magic) == JFFS2_SUM_MAGIC) {
-                        kfree(sm);
+                                sumptr = buf + je32_to_cpu(sm->offset);
-                        return err;
+                                sumlen = c->sector_size - je32_to_cpu(sm->offset);
-                }
+                        }
+                } else {
-                if (je32_to_cpu(sm->magic) == JFFS2_SUM_MAGIC ) {
+                        /* If NAND flash, read a whole page of it. Else just the end */
-                        err = jffs2_sum_scan_sumnode(c, jeb, je32_to_cpu(sm->offset), &pseudo_random);
+                        if (c->wbuf_pagesize)
-                        if (err) {
+                                buf_len = c->wbuf_pagesize;
-                                kfree(sm);
+                        else
+                                buf_len = sizeof(*sm);
+                        /* Read as much as we want into the _end_ of the preallocated buffer */
+                        err = jffs2_fill_scan_buf(c, buf + buf_size - buf_len, 
+                                                  jeb->offset + c->sector_size - buf_len,
+                                                  buf_len);                             
+                        if (err)
                                return err;
+                        sm = (void *)buf + buf_size - sizeof(*sm);
+                        if (je32_to_cpu(sm->magic) == JFFS2_SUM_MAGIC) {
+                                sumlen = c->sector_size - je32_to_cpu(sm->offset);
+                                sumptr = buf + buf_size - sumlen;
+                                /* Now, make sure the summary itself is available */
+                                if (sumlen > buf_size) {
+                                        /* Need to kmalloc for this. */
+                                        sumptr = kmalloc(sumlen, GFP_KERNEL);
+                                        if (!sumptr)
+                                                return -ENOMEM;
+                                        memcpy(sumptr + sumlen - buf_len, buf + buf_size - buf_len, buf_len);
+                                }
+                                if (buf_len < sumlen) {
+                                        /* Need to read more so that the entire summary node is present */
+                                        err = jffs2_fill_scan_buf(c, sumptr, 
+                                                                  jeb->offset + c->sector_size - sumlen,
+                                                                  sumlen - buf_len);                            
+                                        if (err)
+                                                return err;
+                                }
                        }
                }
-                kfree(sm);
+                if (sumptr) {
+                        err = jffs2_sum_scan_sumnode(c, jeb, sumptr, sumlen, &pseudo_random);
-                ofs = jeb->offset;
+                        if (buf_size && sumlen > buf_size)
-                prevofs = jeb->offset - 1;
+                                kfree(sumptr);
+                        /* If it returns with a real error, bail. 
+                           If it returns positive, that's a block classification
+                           (i.e. BLK_STATE_xxx) so return that too.
+                           If it returns zero, fall through to full scan. */
+                        if (err)
+                                return err;
+                }
        }
        buf_ofs = jeb->offset;
        if (!buf_size) {
+                /* This is the XIP case -- we're reading _directly_ from the flash chip */
                buf_len = c->sector_size;
-                if (jffs2_sum_active()) {
-                        /* must reread because of summary test */
-                        err = jffs2_fill_scan_buf(c, buf, buf_ofs, buf_len);
-                        if (err)
-                                return err;
-                }
        } else {
                buf_len = EMPTY_SCAN_SIZE(c->sector_size);
                err = jffs2_fill_scan_buf(c, buf, buf_ofs, buf_len);
@@ -418,7 +569,10 @@ static int jffs2_scan_eraseblock (struct jffs2_sb_info *c, struct jffs2_eraseblo
        if (ofs) {
                D1(printk(KERN_DEBUG "Free space at %08x ends at %08x\n", jeb->offset,
                          jeb->offset + ofs));
-                DIRTY_SPACE(ofs);
+                if ((err = jffs2_prealloc_raw_node_refs(c, jeb, 1)))
+                        return err;
+                if ((err = jffs2_scan_dirty_space(c, jeb, ofs)))
+                        return err;
        }
        /* Now ofs is a complete physical flash offset as it always was... */
@@ -433,6 +587,11 @@ scan_more:
                jffs2_dbg_acct_paranoia_check_nolock(c, jeb);
+                /* Make sure there are node refs available for use */
+                err = jffs2_prealloc_raw_node_refs(c, jeb, 2);
+                if (err)
+                        return err;
                cond_resched();
                if (ofs & 3) {
@@ -442,7 +601,8 @@ scan_more:
                }
                if (ofs == prevofs) {
                        printk(KERN_WARNING "ofs 0x%08x has already been seen. Skipping\n", ofs);
-                        DIRTY_SPACE(4);
+                        if ((err = jffs2_scan_dirty_space(c, jeb, 4)))
+                                return err;
                        ofs += 4;
                        continue;
                }
@@ -451,7 +611,8 @@ scan_more:
                if (jeb->offset + c->sector_size < ofs + sizeof(*node)) {
                        D1(printk(KERN_DEBUG "Fewer than %zd bytes left to end of block. (%x+%x<%x+%zx) Not reading\n", sizeof(struct jffs2_unknown_node),
                                  jeb->offset, c->sector_size, ofs, sizeof(*node)));
-                        DIRTY_SPACE((jeb->offset + c->sector_size)-ofs);
+                        if ((err = jffs2_scan_dirty_space(c, jeb, (jeb->offset + c->sector_size)-ofs)))
+                                return err;
                        break;
                }
@@ -481,7 +642,8 @@ scan_more:
                                if (*(uint32_t *)(&buf[inbuf_ofs]) != 0xffffffff) {
                                        printk(KERN_WARNING "Empty flash at 0x%08x ends at 0x%08x\n",
                                               empty_start, ofs);
-                                        DIRTY_SPACE(ofs-empty_start);
+                                        if ((err = jffs2_scan_dirty_space(c, jeb, ofs-empty_start)))
+                                                return err;
                                        goto scan_more;
                                }
@@ -494,7 +656,7 @@ scan_more:
                        /* If we're only checking the beginning of a block with a cleanmarker,
                           bail now */
                        if (buf_ofs == jeb->offset && jeb->used_size == PAD(c->cleanmarker_size) &&
-                            c->cleanmarker_size && !jeb->dirty_size && !jeb->first_node->next_phys) {
+                            c->cleanmarker_size && !jeb->dirty_size && !ref_next(jeb->first_node)) {
                                D1(printk(KERN_DEBUG "%d bytes at start of block seems clean... assuming all clean\n", EMPTY_SCAN_SIZE(c->sector_size)));
                                return BLK_STATE_CLEANMARKER;
                        }
@@ -518,20 +680,23 @@ scan_more:
                if (ofs == jeb->offset && je16_to_cpu(node->magic) == KSAMTIB_CIGAM_2SFFJ) {
                        printk(KERN_WARNING "Magic bitmask is backwards at offset 0x%08x. Wrong endian filesystem?\n", ofs);
-                        DIRTY_SPACE(4);
+                        if ((err = jffs2_scan_dirty_space(c, jeb, 4)))
+                                return err;
                        ofs += 4;
                        continue;
                }
                if (je16_to_cpu(node->magic) == JFFS2_DIRTY_BITMASK) {
                        D1(printk(KERN_DEBUG "Dirty bitmask at 0x%08x\n", ofs));
-                        DIRTY_SPACE(4);
+                        if ((err = jffs2_scan_dirty_space(c, jeb, 4)))
+                                return err;
                        ofs += 4;
                        continue;
                }
                if (je16_to_cpu(node->magic) == JFFS2_OLD_MAGIC_BITMASK) {
                        printk(KERN_WARNING "Old JFFS2 bitmask found at 0x%08x\n", ofs);
                        printk(KERN_WARNING "You cannot use older JFFS2 filesystems with newer kernels\n");
-                        DIRTY_SPACE(4);
+                        if ((err = jffs2_scan_dirty_space(c, jeb, 4)))
+                                return err;
                        ofs += 4;
                        continue;
                }
@@ -540,7 +705,8 @@ scan_more:
                        noisy_printk(&noise, "jffs2_scan_eraseblock(): Magic bitmask 0x%04x not found at 0x%08x: 0x%04x instead\n",
                                     JFFS2_MAGIC_BITMASK, ofs,
                                     je16_to_cpu(node->magic));
-                        DIRTY_SPACE(4);
+                        if ((err = jffs2_scan_dirty_space(c, jeb, 4)))
+                                return err;
                        ofs += 4;
                        continue;
                }
@@ -557,7 +723,8 @@ scan_more:
                                     je32_to_cpu(node->totlen),
                                     je32_to_cpu(node->hdr_crc),
                                     hdr_crc);
-                        DIRTY_SPACE(4);
+                        if ((err = jffs2_scan_dirty_space(c, jeb, 4)))
+                                return err;
                        ofs += 4;
                        continue;
                }
@@ -568,7 +735,8 @@ scan_more:
                        printk(KERN_WARNING "Node at 0x%08x with length 0x%08x would run over the end of the erase block\n",
                               ofs, je32_to_cpu(node->totlen));
                        printk(KERN_WARNING "Perhaps the file system was created with the wrong erase size?\n");
-                        DIRTY_SPACE(4);
+                        if ((err = jffs2_scan_dirty_space(c, jeb, 4)))
+                                return err;
                        ofs += 4;
                        continue;
                }
@@ -576,7 +744,8 @@ scan_more:
                if (!(je16_to_cpu(node->nodetype) & JFFS2_NODE_ACCURATE)) {
                        /* Wheee. This is an obsoleted node */
                        D2(printk(KERN_DEBUG "Node at 0x%08x is obsolete. Skipping\n", ofs));
-                        DIRTY_SPACE(PAD(je32_to_cpu(node->totlen)));
+                        if ((err = jffs2_scan_dirty_space(c, jeb, PAD(je32_to_cpu(node->totlen)))))
+                                return err;
                        ofs += PAD(je32_to_cpu(node->totlen));
                        continue;
                }
@@ -614,30 +783,59 @@ scan_more:
                        ofs += PAD(je32_to_cpu(node->totlen));
                        break;
+#ifdef CONFIG_JFFS2_FS_XATTR
+                case JFFS2_NODETYPE_XATTR:
+                        if (buf_ofs + buf_len < ofs + je32_to_cpu(node->totlen)) {
+                                buf_len = min_t(uint32_t, buf_size, jeb->offset + c->sector_size - ofs);
+                                D1(printk(KERN_DEBUG "Fewer than %d bytes (xattr node)"
+                                          " left to end of buf. Reading 0x%x at 0x%08x\n",
+                                          je32_to_cpu(node->totlen), buf_len, ofs));
+                                err = jffs2_fill_scan_buf(c, buf, ofs, buf_len);
+                                if (err)
+                                        return err;
+                                buf_ofs = ofs;
+                                node = (void *)buf;
+                        }
+                        err = jffs2_scan_xattr_node(c, jeb, (void *)node, ofs, s);
+                        if (err)
+                                return err;
+                        ofs += PAD(je32_to_cpu(node->totlen));
+                        break;
+                case JFFS2_NODETYPE_XREF:
+                        if (buf_ofs + buf_len < ofs + je32_to_cpu(node->totlen)) {
+                                buf_len = min_t(uint32_t, buf_size, jeb->offset + c->sector_size - ofs);
+                                D1(printk(KERN_DEBUG "Fewer than %d bytes (xref node)"
+                                          " left to end of buf. Reading 0x%x at 0x%08x\n",
+                                          je32_to_cpu(node->totlen), buf_len, ofs));
+                                err = jffs2_fill_scan_buf(c, buf, ofs, buf_len);
+                                if (err)
+                                        return err;
+                                buf_ofs = ofs;
+                                node = (void *)buf;
+                        }
+                        err = jffs2_scan_xref_node(c, jeb, (void *)node, ofs, s);
+                        if (err)
+                                return err;
+                        ofs += PAD(je32_to_cpu(node->totlen));
+                        break;
+#endif  /* CONFIG_JFFS2_FS_XATTR */
                case JFFS2_NODETYPE_CLEANMARKER:
                        D1(printk(KERN_DEBUG "CLEANMARKER node found at 0x%08x\n", ofs));
                        if (je32_to_cpu(node->totlen) != c->cleanmarker_size) {
                                printk(KERN_NOTICE "CLEANMARKER node found at 0x%08x has totlen 0x%x != normal 0x%x\n",
                                       ofs, je32_to_cpu(node->totlen), c->cleanmarker_size);
-                                DIRTY_SPACE(PAD(sizeof(struct jffs2_unknown_node)));
+                                if ((err = jffs2_scan_dirty_space(c, jeb, PAD(sizeof(struct jffs2_unknown_node)))))
+                                        return err;
                                ofs += PAD(sizeof(struct jffs2_unknown_node));
                        } else if (jeb->first_node) {
                                printk(KERN_NOTICE "CLEANMARKER node found at 0x%08x, not first node in block (0x%08x)\n", ofs, jeb->offset);
-                                DIRTY_SPACE(PAD(sizeof(struct jffs2_unknown_node)));
+                                if ((err = jffs2_scan_dirty_space(c, jeb, PAD(sizeof(struct jffs2_unknown_node)))))
+                                        return err;
                                ofs += PAD(sizeof(struct jffs2_unknown_node));
                        } else {
-                                struct jffs2_raw_node_ref *marker_ref = jffs2_alloc_raw_node_ref();
+                                jffs2_link_node_ref(c, jeb, ofs | REF_NORMAL, c->cleanmarker_size, NULL);
-                                if (!marker_ref) {
-                                        printk(KERN_NOTICE "Failed to allocate node ref for clean marker\n");
-                                        return -ENOMEM;
-                                }
-                                marker_ref->next_in_ino = NULL;
-                                marker_ref->next_phys = NULL;
-                                marker_ref->flash_offset = ofs | REF_NORMAL;
-                                marker_ref->__totlen = c->cleanmarker_size;
-                                jeb->first_node = jeb->last_node = marker_ref;
-                                USED_SPACE(PAD(c->cleanmarker_size));
                                ofs += PAD(c->cleanmarker_size);
                        }
                        break;
@@ -645,7 +843,8 @@ scan_more:
                case JFFS2_NODETYPE_PADDING:
                        if (jffs2_sum_active())
                                jffs2_sum_add_padding_mem(s, je32_to_cpu(node->totlen));
-                        DIRTY_SPACE(PAD(je32_to_cpu(node->totlen)));
+                        if ((err = jffs2_scan_dirty_space(c, jeb, PAD(je32_to_cpu(node->totlen)))))
+                                return err;
                        ofs += PAD(je32_to_cpu(node->totlen));
                        break;
@@ -656,7 +855,8 @@ scan_more:
                                c->flags |= JFFS2_SB_FLAG_RO;
                                if (!(jffs2_is_readonly(c)))
                                        return -EROFS;
-                                DIRTY_SPACE(PAD(je32_to_cpu(node->totlen)));
+                                if ((err = jffs2_scan_dirty_space(c, jeb, PAD(je32_to_cpu(node->totlen)))))
+                                        return err;
                                ofs += PAD(je32_to_cpu(node->totlen));
                                break;
@@ -666,15 +866,21 @@ scan_more:
                        case JFFS2_FEATURE_RWCOMPAT_DELETE:
                                D1(printk(KERN_NOTICE "Unknown but compatible feature node (0x%04x) found at offset 0x%08x\n", je16_to_cpu(node->nodetype), ofs));
-                                DIRTY_SPACE(PAD(je32_to_cpu(node->totlen)));
+                                if ((err = jffs2_scan_dirty_space(c, jeb, PAD(je32_to_cpu(node->totlen)))))
+                                        return err;
                                ofs += PAD(je32_to_cpu(node->totlen));
                                break;
-                        case JFFS2_FEATURE_RWCOMPAT_COPY:
+                        case JFFS2_FEATURE_RWCOMPAT_COPY: {
                                D1(printk(KERN_NOTICE "Unknown but compatible feature node (0x%04x) found at offset 0x%08x\n", je16_to_cpu(node->nodetype), ofs));
-                                USED_SPACE(PAD(je32_to_cpu(node->totlen)));
+                                jffs2_link_node_ref(c, jeb, ofs | REF_PRISTINE, PAD(je32_to_cpu(node->totlen)), NULL);
+                                /* We can't summarise nodes we don't grok */
+                                jffs2_sum_disable_collecting(s);
                                ofs += PAD(je32_to_cpu(node->totlen));
                                break;
+                                }
                        }
                }
        }
@@ -687,9 +893,9 @@ scan_more:
                }
        }
-        D1(printk(KERN_DEBUG "Block at 0x%08x: free 0x%08x, dirty 0x%08x, unchecked 0x%08x, used 0x%08x\n", jeb->offset,
+        D1(printk(KERN_DEBUG "Block at 0x%08x: free 0x%08x, dirty 0x%08x, unchecked 0x%08x, used 0x%08x, wasted 0x%08x\n",
-                  jeb->free_size, jeb->dirty_size, jeb->unchecked_size, jeb->used_size));
+                  jeb->offset,jeb->free_size, jeb->dirty_size, jeb->unchecked_size, jeb->used_size, jeb->wasted_size));
+        
        /* mark_node_obsolete can add to wasted !! */
        if (jeb->wasted_size) {
                jeb->dirty_size += jeb->wasted_size;
@@ -730,9 +936,9 @@ struct jffs2_inode_cache *jffs2_scan_make_ino_cache(struct jffs2_sb_info *c, uin
 static int jffs2_scan_inode_node(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb,
                                 struct jffs2_raw_inode *ri, uint32_t ofs, struct jffs2_summary *s)
 {
-        struct jffs2_raw_node_ref *raw;
        struct jffs2_inode_cache *ic;
        uint32_t ino = je32_to_cpu(ri->ino);
+        int err;
        D1(printk(KERN_DEBUG "jffs2_scan_inode_node(): Node at 0x%08x\n", ofs));
@@ -745,12 +951,6 @@ static int jffs2_scan_inode_node(struct jffs2_sb_info *c, struct jffs2_erasebloc
           Which means that the _full_ amount of time to get to proper write mode with GC
           operational may actually be _longer_ than before. Sucks to be me. */
-        raw = jffs2_alloc_raw_node_ref();
-        if (!raw) {
-                printk(KERN_NOTICE "jffs2_scan_inode_node(): allocation of node reference failed\n");
-                return -ENOMEM;
-        }
        ic = jffs2_get_ino_cache(c, ino);
        if (!ic) {
                /* Inocache get failed. Either we read a bogus ino# or it's just genuinely the
@@ -762,30 +962,17 @@ static int jffs2_scan_inode_node(struct jffs2_sb_info *c, struct jffs2_erasebloc
                        printk(KERN_NOTICE "jffs2_scan_inode_node(): CRC failed on node at 0x%08x: Read 0x%08x, calculated 0x%08x\n",
                               ofs, je32_to_cpu(ri->node_crc), crc);
                        /* We believe totlen because the CRC on the node _header_ was OK, just the node itself failed. */
-                        DIRTY_SPACE(PAD(je32_to_cpu(ri->totlen)));
+                        if ((err = jffs2_scan_dirty_space(c, jeb, PAD(je32_to_cpu(ri->totlen)))))
-                        jffs2_free_raw_node_ref(raw);
+                                return err;
                        return 0;
                }
                ic = jffs2_scan_make_ino_cache(c, ino);
-                if (!ic) {
+                if (!ic)
-                        jffs2_free_raw_node_ref(raw);
                        return -ENOMEM;
-                }
        }
        /* Wheee. It worked */
+        jffs2_link_node_ref(c, jeb, ofs | REF_UNCHECKED, PAD(je32_to_cpu(ri->totlen)), ic);
-        raw->flash_offset = ofs | REF_UNCHECKED;
-        raw->__totlen = PAD(je32_to_cpu(ri->totlen));
-        raw->next_phys = NULL;
-        raw->next_in_ino = ic->nodes;
-        ic->nodes = raw;
-        if (!jeb->first_node)
-                jeb->first_node = raw;
-        if (jeb->last_node)
-                jeb->last_node->next_phys = raw;
-        jeb->last_node = raw;
        D1(printk(KERN_DEBUG "Node is ino #%u, version %d. Range 0x%x-0x%x\n",
                  je32_to_cpu(ri->ino), je32_to_cpu(ri->version),
@@ -794,8 +981,6 @@ static int jffs2_scan_inode_node(struct jffs2_sb_info *c, struct jffs2_erasebloc
        pseudo_random += je32_to_cpu(ri->version);
-        UNCHECKED_SPACE(PAD(je32_to_cpu(ri->totlen)));
        if (jffs2_sum_active()) {
                jffs2_sum_add_inode_mem(s, ri, ofs - jeb->offset);
        }
@@ -806,10 +991,10 @@ static int jffs2_scan_inode_node(struct jffs2_sb_info *c, struct jffs2_erasebloc
 static int jffs2_scan_dirent_node(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb,
                                  struct jffs2_raw_dirent *rd, uint32_t ofs, struct jffs2_summary *s)
 {
-        struct jffs2_raw_node_ref *raw;
        struct jffs2_full_dirent *fd;
        struct jffs2_inode_cache *ic;
        uint32_t crc;
+        int err;
        D1(printk(KERN_DEBUG "jffs2_scan_dirent_node(): Node at 0x%08x\n", ofs));
@@ -821,7 +1006,8 @@ static int jffs2_scan_dirent_node(struct jffs2_sb_info *c, struct jffs2_eraseblo
                printk(KERN_NOTICE "jffs2_scan_dirent_node(): Node CRC failed on node at 0x%08x: Read 0x%08x, calculated 0x%08x\n",
                       ofs, je32_to_cpu(rd->node_crc), crc);
                /* We believe totlen because the CRC on the node _header_ was OK, just the node itself failed. */
-                DIRTY_SPACE(PAD(je32_to_cpu(rd->totlen)));
+                if ((err = jffs2_scan_dirty_space(c, jeb, PAD(je32_to_cpu(rd->totlen)))))
+                        return err;
                return 0;
        }
@@ -842,40 +1028,23 @@ static int jffs2_scan_dirent_node(struct jffs2_sb_info *c, struct jffs2_eraseblo
                jffs2_free_full_dirent(fd);
                /* FIXME: Why do we believe totlen? */
                /* We believe totlen because the CRC on the node _header_ was OK, just the name failed. */
-                DIRTY_SPACE(PAD(je32_to_cpu(rd->totlen)));
+                if ((err = jffs2_scan_dirty_space(c, jeb, PAD(je32_to_cpu(rd->totlen)))))
+                        return err;
                return 0;
        }
-        raw = jffs2_alloc_raw_node_ref();
-        if (!raw) {
-                jffs2_free_full_dirent(fd);
-                printk(KERN_NOTICE "jffs2_scan_dirent_node(): allocation of node reference failed\n");
-                return -ENOMEM;
-        }
        ic = jffs2_scan_make_ino_cache(c, je32_to_cpu(rd->pino));
        if (!ic) {
                jffs2_free_full_dirent(fd);
-                jffs2_free_raw_node_ref(raw);
                return -ENOMEM;
        }
-        raw->__totlen = PAD(je32_to_cpu(rd->totlen));
+        fd->raw = jffs2_link_node_ref(c, jeb, ofs | REF_PRISTINE, PAD(je32_to_cpu(rd->totlen)), ic);
-        raw->flash_offset = ofs | REF_PRISTINE;
-        raw->next_phys = NULL;
-        raw->next_in_ino = ic->nodes;
-        ic->nodes = raw;
-        if (!jeb->first_node)
-                jeb->first_node = raw;
-        if (jeb->last_node)
-                jeb->last_node->next_phys = raw;
-        jeb->last_node = raw;
-        fd->raw = raw;
        fd->next = NULL;
        fd->version = je32_to_cpu(rd->version);
        fd->ino = je32_to_cpu(rd->ino);
        fd->nhash = full_name_hash(fd->name, rd->nsize);
        fd->type = rd->type;
-        USED_SPACE(PAD(je32_to_cpu(rd->totlen)));
        jffs2_add_fd_to_list(c, fd, &ic->scan_dents);
        if (jffs2_sum_active()) {
diff --git a/fs/jffs2/security.c b/fs/jffs2/security.c
new file mode 100644
index 000000000000..52a9894a6364
--- /dev/null
+++ b/fs/jffs2/security.c
@@ -0,0 +1,82 @@
+/*
+ * JFFS2 -- Journalling Flash File System, Version 2.
+ *
+ * Copyright (C) 2006  NEC Corporation
+ *
+ * Created by KaiGai Kohei <kaigai@ak.jp.nec.com>
+ *
+ * For licensing information, see the file 'LICENCE' in this directory.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/time.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/crc32.h>
+#include <linux/jffs2.h>
+#include <linux/xattr.h>
+#include <linux/mtd/mtd.h>
+#include <linux/security.h>
+#include "nodelist.h"
+/* ---- Initial Security Label Attachment -------------- */
+int jffs2_init_security(struct inode *inode, struct inode *dir)
+{
+        int rc;
+        size_t len;
+        void *value;
+        char *name;
+        rc = security_inode_init_security(inode, dir, &name, &value, &len);
+        if (rc) {
+                if (rc == -EOPNOTSUPP)
+                        return 0;
+                return rc;
+        }
+        rc = do_jffs2_setxattr(inode, JFFS2_XPREFIX_SECURITY, name, value, len, 0);
+        kfree(name);
+        kfree(value);
+        return rc;
+}
+/* ---- XATTR Handler for "security.*" ----------------- */
+static int jffs2_security_getxattr(struct inode *inode, const char *name,
+                                   void *buffer, size_t size)
+{
+        if (!strcmp(name, ""))
+                return -EINVAL;
+        return do_jffs2_getxattr(inode, JFFS2_XPREFIX_SECURITY, name, buffer, size);
+}
+static int jffs2_security_setxattr(struct inode *inode, const char *name, const void *buffer,
+                                   size_t size, int flags)
+{
+        if (!strcmp(name, ""))
+                return -EINVAL;
+        return do_jffs2_setxattr(inode, JFFS2_XPREFIX_SECURITY, name, buffer, size, flags);
+}
+static size_t jffs2_security_listxattr(struct inode *inode, char *list, size_t list_size,
+                                       const char *name, size_t name_len)
+{
+        size_t retlen = XATTR_SECURITY_PREFIX_LEN + name_len + 1;
+        if (list && retlen <= list_size) {
+                strcpy(list, XATTR_SECURITY_PREFIX);
+                strcpy(list + XATTR_SECURITY_PREFIX_LEN, name);
+        }
+        return retlen;
+}
+struct xattr_handler jffs2_security_xattr_handler = {
+        .prefix = XATTR_SECURITY_PREFIX,
+        .list = jffs2_security_listxattr,
+        .set = jffs2_security_setxattr,
+        .get = jffs2_security_getxattr
+};
diff --git a/fs/jffs2/summary.c b/fs/jffs2/summary.c
index fb9cec61fcf2..c19bd476e8ec 100644
--- a/fs/jffs2/summary.c
+++ b/fs/jffs2/summary.c
@@ -5,6 +5,7 @@
 *                     Zoltan Sogor <weth@inf.u-szeged.hu>,
 *                     Patrik Kluba <pajko@halom.u-szeged.hu>,
 *                     University of Szeged, Hungary
+ *               2006  KaiGai Kohei <kaigai@ak.jp.nec.com>
 *
 * For licensing information, see the file 'LICENCE' in this directory.
 *
@@ -42,7 +43,7 @@ int jffs2_sum_init(struct jffs2_sb_info *c)
                return -ENOMEM;
        }
-        dbg_summary("returned succesfully\n");
+        dbg_summary("returned successfully\n");
        return 0;
 }
@@ -81,6 +82,19 @@ static int jffs2_sum_add_mem(struct jffs2_summary *s, union jffs2_sum_mem *item)
                        dbg_summary("dirent (%u) added to summary\n",
                                                je32_to_cpu(item->d.ino));
                        break;
+#ifdef CONFIG_JFFS2_FS_XATTR
+                case JFFS2_NODETYPE_XATTR:
+                        s->sum_size += JFFS2_SUMMARY_XATTR_SIZE;
+                        s->sum_num++;
+                        dbg_summary("xattr (xid=%u, version=%u) added to summary\n",
+                                    je32_to_cpu(item->x.xid), je32_to_cpu(item->x.version));
+                        break;
+                case JFFS2_NODETYPE_XREF:
+                        s->sum_size += JFFS2_SUMMARY_XREF_SIZE;
+                        s->sum_num++;
+                        dbg_summary("xref added to summary\n");
+                        break;
+#endif
                default:
                        JFFS2_WARNING("UNKNOWN node type %u\n",
                                            je16_to_cpu(item->u.nodetype));
@@ -141,6 +155,40 @@ int jffs2_sum_add_dirent_mem(struct jffs2_summary *s, struct jffs2_raw_dirent *r
        return jffs2_sum_add_mem(s, (union jffs2_sum_mem *)temp);
 }
+#ifdef CONFIG_JFFS2_FS_XATTR
+int jffs2_sum_add_xattr_mem(struct jffs2_summary *s, struct jffs2_raw_xattr *rx, uint32_t ofs)
+{
+        struct jffs2_sum_xattr_mem *temp;
+        temp = kmalloc(sizeof(struct jffs2_sum_xattr_mem), GFP_KERNEL);
+        if (!temp)
+                return -ENOMEM;
+        temp->nodetype = rx->nodetype;
+        temp->xid = rx->xid;
+        temp->version = rx->version;
+        temp->offset = cpu_to_je32(ofs);
+        temp->totlen = rx->totlen;
+        temp->next = NULL;
+        return jffs2_sum_add_mem(s, (union jffs2_sum_mem *)temp);
+}
+int jffs2_sum_add_xref_mem(struct jffs2_summary *s, struct jffs2_raw_xref *rr, uint32_t ofs)
+{
+        struct jffs2_sum_xref_mem *temp;
+        temp = kmalloc(sizeof(struct jffs2_sum_xref_mem), GFP_KERNEL);
+        if (!temp)
+                return -ENOMEM;
+        temp->nodetype = rr->nodetype;
+        temp->offset = cpu_to_je32(ofs);
+        temp->next = NULL;
+        return jffs2_sum_add_mem(s, (union jffs2_sum_mem *)temp);
+}
+#endif
 /* Cleanup every collected summary information */
 static void jffs2_sum_clean_collected(struct jffs2_summary *s)
@@ -259,7 +307,34 @@ int jffs2_sum_add_kvec(struct jffs2_sb_info *c, const struct kvec *invecs,
                        return jffs2_sum_add_mem(c->summary, (union jffs2_sum_mem *)temp);
                }
+#ifdef CONFIG_JFFS2_FS_XATTR
+                case JFFS2_NODETYPE_XATTR: {
+                        struct jffs2_sum_xattr_mem *temp;
+                        temp = kmalloc(sizeof(struct jffs2_sum_xattr_mem), GFP_KERNEL);
+                        if (!temp)
+                                goto no_mem;
+                        temp->nodetype = node->x.nodetype;
+                        temp->xid = node->x.xid;
+                        temp->version = node->x.version;
+                        temp->totlen = node->x.totlen;
+                        temp->offset = cpu_to_je32(ofs);
+                        temp->next = NULL;
+                        return jffs2_sum_add_mem(c->summary, (union jffs2_sum_mem *)temp);
+                }
+                case JFFS2_NODETYPE_XREF: {
+                        struct jffs2_sum_xref_mem *temp;
+                        temp = kmalloc(sizeof(struct jffs2_sum_xref_mem), GFP_KERNEL);
+                        if (!temp)
+                                goto no_mem;
+                        temp->nodetype = node->r.nodetype;
+                        temp->offset = cpu_to_je32(ofs);
+                        temp->next = NULL;
+                        return jffs2_sum_add_mem(c->summary, (union jffs2_sum_mem *)temp);
+                }
+#endif
                case JFFS2_NODETYPE_PADDING:
                        dbg_summary("node PADDING\n");
                        c->summary->sum_padded += je32_to_cpu(node->u.totlen);
@@ -288,23 +363,41 @@ no_mem:
        return -ENOMEM;
 }
+/*
+ * Link a node reference for a summary entry into 'jeb'.
+ *
+ * 'ofs' is relative to the start of the eraseblock; the callers in this file
+ * OR a REF_* flag into it, so the low two bits are masked off before it is
+ * compared with the space already accounted for.  When the node starts
+ * beyond that point, the gap is handed to jffs2_scan_dirty_space() first
+ * (its return value is not checked here).
+ *
+ * Returns whatever jffs2_link_node_ref() returns.
+ */
+static struct jffs2_raw_node_ref *sum_link_node_ref(struct jffs2_sb_info *c,
+                                                    struct jffs2_eraseblock *jeb,
+                                                    uint32_t ofs, uint32_t len,
+                                                    struct jffs2_inode_cache *ic)
+{
+        uint32_t node_start = ofs & ~3;
+        /* The summary never records dirty space explicitly, so infer it from the gap */
+        if (c->sector_size - jeb->free_size < node_start)
+                jffs2_scan_dirty_space(c, jeb, node_start - (c->sector_size - jeb->free_size));
+        return jffs2_link_node_ref(c, jeb, jeb->offset + ofs, len, ic);
+}
 /* Process the stored summary information - helper function for jffs2_sum_scan_sumnode() */
 static int jffs2_sum_process_sum_data(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb,
                                struct jffs2_raw_summary *summary, uint32_t *pseudo_random)
 {
-        struct jffs2_raw_node_ref *raw;
        struct jffs2_inode_cache *ic;
        struct jffs2_full_dirent *fd;
        void *sp;
        int i, ino;
+        int err;
        sp = summary->sum;
        for (i=0; i<je32_to_cpu(summary->sum_num); i++) {
                dbg_summary("processing summary index %d\n", i);
+                /* Make sure there's a spare ref for dirty space */
+                err = jffs2_prealloc_raw_node_refs(c, jeb, 2);
+                if (err)
+                        return err;
                switch (je16_to_cpu(((struct jffs2_sum_unknown_flash *)sp)->nodetype)) {
                        case JFFS2_NODETYPE_INODE: {
                                struct jffs2_sum_inode_flash *spi;
@@ -312,38 +405,20 @@ static int jffs2_sum_process_sum_data(struct jffs2_sb_info *c, struct jffs2_eras
                                ino = je32_to_cpu(spi->inode);
-                                dbg_summary("Inode at 0x%08x\n",
+                                dbg_summary("Inode at 0x%08x-0x%08x\n",
-                                                        jeb->offset + je32_to_cpu(spi->offset));
+                                            jeb->offset + je32_to_cpu(spi->offset),
+                                            jeb->offset + je32_to_cpu(spi->offset) + je32_to_cpu(spi->totlen));
-                                raw = jffs2_alloc_raw_node_ref();
-                                if (!raw) {
-                                        JFFS2_NOTICE("allocation of node reference failed\n");
-                                        kfree(summary);
-                                        return -ENOMEM;
-                                }
                                ic = jffs2_scan_make_ino_cache(c, ino);
                                if (!ic) {
                                        JFFS2_NOTICE("scan_make_ino_cache failed\n");
-                                        jffs2_free_raw_node_ref(raw);
-                                        kfree(summary);
                                        return -ENOMEM;
                                }
-                                raw->flash_offset = (jeb->offset + je32_to_cpu(spi->offset)) | REF_UNCHECKED;
+                                sum_link_node_ref(c, jeb, je32_to_cpu(spi->offset) | REF_UNCHECKED,
-                                raw->__totlen = PAD(je32_to_cpu(spi->totlen));
+                                                  PAD(je32_to_cpu(spi->totlen)), ic);
-                                raw->next_phys = NULL;
-                                raw->next_in_ino = ic->nodes;
-                                ic->nodes = raw;
-                                if (!jeb->first_node)
-                                        jeb->first_node = raw;
-                                if (jeb->last_node)
-                                        jeb->last_node->next_phys = raw;
-                                jeb->last_node = raw;
-                                *pseudo_random += je32_to_cpu(spi->version);
-                                UNCHECKED_SPACE(PAD(je32_to_cpu(spi->totlen)));
+                                *pseudo_random += je32_to_cpu(spi->version);
                                sp += JFFS2_SUMMARY_INODE_SIZE;
@@ -354,52 +429,33 @@ static int jffs2_sum_process_sum_data(struct jffs2_sb_info *c, struct jffs2_eras
                                struct jffs2_sum_dirent_flash *spd;
                                spd = sp;
-                                dbg_summary("Dirent at 0x%08x\n",
+                                dbg_summary("Dirent at 0x%08x-0x%08x\n",
-                                                        jeb->offset + je32_to_cpu(spd->offset));
+                                            jeb->offset + je32_to_cpu(spd->offset),
+                                            jeb->offset + je32_to_cpu(spd->offset) + je32_to_cpu(spd->totlen));
                                fd = jffs2_alloc_full_dirent(spd->nsize+1);
-                                if (!fd) {
+                                if (!fd)
-                                        kfree(summary);
                                        return -ENOMEM;
-                                }
                                memcpy(&fd->name, spd->name, spd->nsize);
                                fd->name[spd->nsize] = 0;
-                                raw = jffs2_alloc_raw_node_ref();
-                                if (!raw) {
-                                        jffs2_free_full_dirent(fd);
-                                        JFFS2_NOTICE("allocation of node reference failed\n");
-                                        kfree(summary);
-                                        return -ENOMEM;
-                                }
                                ic = jffs2_scan_make_ino_cache(c, je32_to_cpu(spd->pino));
                                if (!ic) {
                                        jffs2_free_full_dirent(fd);
-                                        jffs2_free_raw_node_ref(raw);
-                                        kfree(summary);
                                        return -ENOMEM;
                                }
-                                raw->__totlen = PAD(je32_to_cpu(spd->totlen));
+                                fd->raw = sum_link_node_ref(c, jeb,  je32_to_cpu(spd->offset) | REF_UNCHECKED,
-                                raw->flash_offset = (jeb->offset + je32_to_cpu(spd->offset)) | REF_PRISTINE;
+                                                            PAD(je32_to_cpu(spd->totlen)), ic);
-                                raw->next_phys = NULL;
-                                raw->next_in_ino = ic->nodes;
-                                ic->nodes = raw;
-                                if (!jeb->first_node)
-                                        jeb->first_node = raw;
-                                if (jeb->last_node)
-                                        jeb->last_node->next_phys = raw;
-                                jeb->last_node = raw;
-                                fd->raw = raw;
                                fd->next = NULL;
                                fd->version = je32_to_cpu(spd->version);
                                fd->ino = je32_to_cpu(spd->ino);
                                fd->nhash = full_name_hash(fd->name, spd->nsize);
                                fd->type = spd->type;
-                                USED_SPACE(PAD(je32_to_cpu(spd->totlen)));
                                jffs2_add_fd_to_list(c, fd, &ic->scan_dents);
                                *pseudo_random += je32_to_cpu(spd->version);
@@ -408,48 +464,100 @@ static int jffs2_sum_process_sum_data(struct jffs2_sb_info *c, struct jffs2_eras
                                break;
                        }
+#ifdef CONFIG_JFFS2_FS_XATTR
+                        case JFFS2_NODETYPE_XATTR: {
+                                struct jffs2_xattr_datum *xd;
+                                struct jffs2_sum_xattr_flash *spx;
+                                spx = (struct jffs2_sum_xattr_flash *)sp;
+                                dbg_summary("xattr at %#08x-%#08x (xid=%u, version=%u)\n", 
+                                            jeb->offset + je32_to_cpu(spx->offset),
+                                            jeb->offset + je32_to_cpu(spx->offset) + je32_to_cpu(spx->totlen),
+                                            je32_to_cpu(spx->xid), je32_to_cpu(spx->version));
+                                xd = jffs2_setup_xattr_datum(c, je32_to_cpu(spx->xid),
+                                                                je32_to_cpu(spx->version));
+                                if (IS_ERR(xd))
+                                        return PTR_ERR(xd);
+                                if (xd->version > je32_to_cpu(spx->version)) {
+                                        /* node is not the newest one */
+                                        struct jffs2_raw_node_ref *raw
+                                                = sum_link_node_ref(c, jeb, je32_to_cpu(spx->offset) | REF_UNCHECKED,
+                                                                    PAD(je32_to_cpu(spx->totlen)), NULL);
+                                        raw->next_in_ino = xd->node->next_in_ino;
+                                        xd->node->next_in_ino = raw;
+                                } else {
+                                        xd->version = je32_to_cpu(spx->version);
+                                        sum_link_node_ref(c, jeb, je32_to_cpu(spx->offset) | REF_UNCHECKED,
+                                                          PAD(je32_to_cpu(spx->totlen)), (void *)xd);
+                                }
+                                *pseudo_random += je32_to_cpu(spx->xid);
+                                sp += JFFS2_SUMMARY_XATTR_SIZE;
+                                break;
+                        }
+                        case JFFS2_NODETYPE_XREF: {
+                                struct jffs2_xattr_ref *ref;
+                                struct jffs2_sum_xref_flash *spr;
+                                spr = (struct jffs2_sum_xref_flash *)sp;
+                                dbg_summary("xref at %#08x-%#08x\n",
+                                            jeb->offset + je32_to_cpu(spr->offset),
+                                            jeb->offset + je32_to_cpu(spr->offset) + 
+                                            (uint32_t)PAD(sizeof(struct jffs2_raw_xref)));
+                                ref = jffs2_alloc_xattr_ref();
+                                if (!ref) {
+                                        JFFS2_NOTICE("allocation of xattr_datum failed\n");
+                                        return -ENOMEM;
+                                }
+                                ref->next = c->xref_temp;
+                                c->xref_temp = ref;
+                                sum_link_node_ref(c, jeb, je32_to_cpu(spr->offset) | REF_UNCHECKED,
+                                                  PAD(sizeof(struct jffs2_raw_xref)), (void *)ref);
+                                *pseudo_random += ref->node->flash_offset;
+                                sp += JFFS2_SUMMARY_XREF_SIZE;
+                                break;
+                        }
+#endif
                        default : {
-                                JFFS2_WARNING("Unsupported node type found in summary! Exiting...");
+                                uint16_t nodetype = je16_to_cpu(((struct jffs2_sum_unknown_flash *)sp)->nodetype);
-                                kfree(summary);
+                                JFFS2_WARNING("Unsupported node type %x found in summary! Exiting...\n", nodetype);
-                                return -EIO;
+                                if ((nodetype & JFFS2_COMPAT_MASK) == JFFS2_FEATURE_INCOMPAT)
+                                        return -EIO;
+                                /* For compatible node types, just fall back to the full scan */
+                                c->wasted_size -= jeb->wasted_size;
+                                c->free_size += c->sector_size - jeb->free_size;
+                                c->used_size -= jeb->used_size;
+                                c->dirty_size -= jeb->dirty_size;
+                                jeb->wasted_size = jeb->used_size = jeb->dirty_size = 0;
+                                jeb->free_size = c->sector_size;
+                                jffs2_free_jeb_node_refs(c, jeb);
+                                return -ENOTRECOVERABLE;
                        }
                }
        }
-        kfree(summary);
        return 0;
 }
 /* Process the summary node - called from jffs2_scan_eraseblock() */
 int jffs2_sum_scan_sumnode(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb,
-                                uint32_t ofs, uint32_t *pseudo_random)
+                           struct jffs2_raw_summary *summary, uint32_t sumsize,
+                           uint32_t *pseudo_random)
 {
        struct jffs2_unknown_node crcnode;
-        struct jffs2_raw_node_ref *cache_ref;
+        int ret, ofs;
-        struct jffs2_raw_summary *summary;
-        int ret, sumsize;
        uint32_t crc;
-        sumsize = c->sector_size - ofs;
+        ofs = c->sector_size - sumsize;
-        ofs += jeb->offset;
        dbg_summary("summary found for 0x%08x at 0x%08x (0x%x bytes)\n",
-                                jeb->offset, ofs, sumsize);
+                    jeb->offset, jeb->offset + ofs, sumsize);
-        summary = kmalloc(sumsize, GFP_KERNEL);
-        if (!summary) {
-                return -ENOMEM;
-        }
-        ret = jffs2_fill_scan_buf(c, (unsigned char *)summary, ofs, sumsize);
-        if (ret) {
-                kfree(summary);
-                return ret;
-        }
        /* OK, now check for node validity and CRC */
        crcnode.magic = cpu_to_je16(JFFS2_MAGIC_BITMASK);
@@ -486,66 +594,49 @@ int jffs2_sum_scan_sumnode(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb
                dbg_summary("Summary : CLEANMARKER node \n");
+                ret = jffs2_prealloc_raw_node_refs(c, jeb, 1);
+                if (ret)
+                        return ret;
                if (je32_to_cpu(summary->cln_mkr) != c->cleanmarker_size) {
                        dbg_summary("CLEANMARKER node has totlen 0x%x != normal 0x%x\n",
                                je32_to_cpu(summary->cln_mkr), c->cleanmarker_size);
-                        UNCHECKED_SPACE(PAD(je32_to_cpu(summary->cln_mkr)));
+                        if ((ret = jffs2_scan_dirty_space(c, jeb, PAD(je32_to_cpu(summary->cln_mkr)))))
+                                return ret;
                } else if (jeb->first_node) {
                        dbg_summary("CLEANMARKER node not first node in block "
                                        "(0x%08x)\n", jeb->offset);
-                        UNCHECKED_SPACE(PAD(je32_to_cpu(summary->cln_mkr)));
+                        if ((ret = jffs2_scan_dirty_space(c, jeb, PAD(je32_to_cpu(summary->cln_mkr)))))
+                                return ret;
                } else {
-                        struct jffs2_raw_node_ref *marker_ref = jffs2_alloc_raw_node_ref();
+                        jffs2_link_node_ref(c, jeb, jeb->offset | REF_NORMAL,
+                                            je32_to_cpu(summary->cln_mkr), NULL);
-                        if (!marker_ref) {
-                                JFFS2_NOTICE("Failed to allocate node ref for clean marker\n");
-                                kfree(summary);
-                                return -ENOMEM;
-                        }
-                        marker_ref->next_in_ino = NULL;
-                        marker_ref->next_phys = NULL;
-                        marker_ref->flash_offset = jeb->offset | REF_NORMAL;
-                        marker_ref->__totlen = je32_to_cpu(summary->cln_mkr);
-                        jeb->first_node = jeb->last_node = marker_ref;
-                        USED_SPACE( PAD(je32_to_cpu(summary->cln_mkr)) );
                }
        }
-        if (je32_to_cpu(summary->padded)) {
-                DIRTY_SPACE(je32_to_cpu(summary->padded));
-        }
        ret = jffs2_sum_process_sum_data(c, jeb, summary, pseudo_random);
+        /* -ENOTRECOVERABLE isn't a fatal error -- it means we should do a full
+           scan of this eraseblock. So return zero */
+        if (ret == -ENOTRECOVERABLE)
+                return 0;
        if (ret)
-                return ret;
+                return ret;             /* real error */
        /* for PARANOIA_CHECK */
-        cache_ref = jffs2_alloc_raw_node_ref();
+        ret = jffs2_prealloc_raw_node_refs(c, jeb, 2);
+        if (ret)
-        if (!cache_ref) {
+                return ret;
-                JFFS2_NOTICE("Failed to allocate node ref for cache\n");
-                return -ENOMEM;
-        }
-        cache_ref->next_in_ino = NULL;
-        cache_ref->next_phys = NULL;
-        cache_ref->flash_offset = ofs | REF_NORMAL;
-        cache_ref->__totlen = sumsize;
-        if (!jeb->first_node)
-                jeb->first_node = cache_ref;
-        if (jeb->last_node)
-                jeb->last_node->next_phys = cache_ref;
-        jeb->last_node = cache_ref;
-        USED_SPACE(sumsize);
+        sum_link_node_ref(c, jeb, ofs | REF_NORMAL, sumsize, NULL);
-        jeb->wasted_size += jeb->free_size;
+        if (unlikely(jeb->free_size)) {
-        c->wasted_size += jeb->free_size;
+                JFFS2_WARNING("Free size 0x%x bytes in eraseblock @0x%08x with summary?\n",
-        c->free_size -= jeb->free_size;
+                              jeb->free_size, jeb->offset);
-        jeb->free_size = 0;
+                jeb->wasted_size += jeb->free_size;
+                c->wasted_size += jeb->free_size;
+                c->free_size -= jeb->free_size;
+                jeb->free_size = 0;
+        }
        return jffs2_scan_classify_jeb(c, jeb);
@@ -564,6 +655,7 @@ static int jffs2_sum_write_data(struct jffs2_sb_info *c, struct jffs2_eraseblock
        union jffs2_sum_mem *temp;
        struct jffs2_sum_marker *sm;
        struct kvec vecs[2];
+        uint32_t sum_ofs;
        void *wpage;
        int ret;
        size_t retlen;
@@ -581,16 +673,17 @@ static int jffs2_sum_write_data(struct jffs2_sb_info *c, struct jffs2_eraseblock
        wpage = c->summary->sum_buf;
        while (c->summary->sum_num) {
+                temp = c->summary->sum_list_head;
-                switch (je16_to_cpu(c->summary->sum_list_head->u.nodetype)) {
+                switch (je16_to_cpu(temp->u.nodetype)) {
                        case JFFS2_NODETYPE_INODE: {
                                struct jffs2_sum_inode_flash *sino_ptr = wpage;
-                                sino_ptr->nodetype = c->summary->sum_list_head->i.nodetype;
+                                sino_ptr->nodetype = temp->i.nodetype;
-                                sino_ptr->inode = c->summary->sum_list_head->i.inode;
+                                sino_ptr->inode = temp->i.inode;
-                                sino_ptr->version = c->summary->sum_list_head->i.version;
+                                sino_ptr->version = temp->i.version;
-                                sino_ptr->offset = c->summary->sum_list_head->i.offset;
+                                sino_ptr->offset = temp->i.offset;
-                                sino_ptr->totlen = c->summary->sum_list_head->i.totlen;
+                                sino_ptr->totlen = temp->i.totlen;
                                wpage += JFFS2_SUMMARY_INODE_SIZE;
@@ -600,30 +693,60 @@ static int jffs2_sum_write_data(struct jffs2_sb_info *c, struct jffs2_eraseblock
                        case JFFS2_NODETYPE_DIRENT: {
                                struct jffs2_sum_dirent_flash *sdrnt_ptr = wpage;
-                                sdrnt_ptr->nodetype = c->summary->sum_list_head->d.nodetype;
+                                sdrnt_ptr->nodetype = temp->d.nodetype;
-                                sdrnt_ptr->totlen = c->summary->sum_list_head->d.totlen;
+                                sdrnt_ptr->totlen = temp->d.totlen;
-                                sdrnt_ptr->offset = c->summary->sum_list_head->d.offset;
+                                sdrnt_ptr->offset = temp->d.offset;
-                                sdrnt_ptr->pino = c->summary->sum_list_head->d.pino;
+                                sdrnt_ptr->pino = temp->d.pino;
-                                sdrnt_ptr->version = c->summary->sum_list_head->d.version;
+                                sdrnt_ptr->version = temp->d.version;
-                                sdrnt_ptr->ino = c->summary->sum_list_head->d.ino;
+                                sdrnt_ptr->ino = temp->d.ino;
-                                sdrnt_ptr->nsize = c->summary->sum_list_head->d.nsize;
+                                sdrnt_ptr->nsize = temp->d.nsize;
-                                sdrnt_ptr->type = c->summary->sum_list_head->d.type;
+                                sdrnt_ptr->type = temp->d.type;
-                                memcpy(sdrnt_ptr->name, c->summary->sum_list_head->d.name,
+                                memcpy(sdrnt_ptr->name, temp->d.name,
-                                                        c->summary->sum_list_head->d.nsize);
+                                                        temp->d.nsize);
-                                wpage += JFFS2_SUMMARY_DIRENT_SIZE(c->summary->sum_list_head->d.nsize);
+                                wpage += JFFS2_SUMMARY_DIRENT_SIZE(temp->d.nsize);
                                break;
                        }
+#ifdef CONFIG_JFFS2_FS_XATTR
+                        case JFFS2_NODETYPE_XATTR: {
+                                struct jffs2_sum_xattr_flash *sxattr_ptr = wpage;
+                                temp = c->summary->sum_list_head;
+                                sxattr_ptr->nodetype = temp->x.nodetype;
+                                sxattr_ptr->xid = temp->x.xid;
+                                sxattr_ptr->version = temp->x.version;
+                                sxattr_ptr->offset = temp->x.offset;
+                                sxattr_ptr->totlen = temp->x.totlen;
+                                wpage += JFFS2_SUMMARY_XATTR_SIZE;
+                                break;
+                        }
+                        case JFFS2_NODETYPE_XREF: {
+                                struct jffs2_sum_xref_flash *sxref_ptr = wpage;
+                                temp = c->summary->sum_list_head;
+                                sxref_ptr->nodetype = temp->r.nodetype;
+                                sxref_ptr->offset = temp->r.offset;
+                                wpage += JFFS2_SUMMARY_XREF_SIZE;
+                                break;
+                        }
+#endif
                        default : {
-                                BUG();  /* unknown node in summary information */
+                                if ((je16_to_cpu(temp->u.nodetype) & JFFS2_COMPAT_MASK)
+                                    == JFFS2_FEATURE_RWCOMPAT_COPY) {
+                                        dbg_summary("Writing unknown RWCOMPAT_COPY node type %x\n",
+                                                    je16_to_cpu(temp->u.nodetype));
+                                        jffs2_sum_disable_collecting(c->summary);
+                                } else {
+                                        BUG();  /* unknown node in summary information */
+                                }
                        }
                }
-                temp = c->summary->sum_list_head;
+                c->summary->sum_list_head = temp->u.next;
-                c->summary->sum_list_head = c->summary->sum_list_head->u.next;
                kfree(temp);
                c->summary->sum_num--;
@@ -645,25 +768,34 @@ static int jffs2_sum_write_data(struct jffs2_sb_info *c, struct jffs2_eraseblock
        vecs[1].iov_base = c->summary->sum_buf;
        vecs[1].iov_len = datasize;
-        dbg_summary("JFFS2: writing out data to flash to pos : 0x%08x\n",
+        sum_ofs = jeb->offset + c->sector_size - jeb->free_size;
-                        jeb->offset + c->sector_size - jeb->free_size);
-        spin_unlock(&c->erase_completion_lock);
+        dbg_summary("JFFS2: writing out data to flash to pos : 0x%08x\n",
-        ret = jffs2_flash_writev(c, vecs, 2, jeb->offset + c->sector_size -
+                    sum_ofs);
-                                jeb->free_size, &retlen, 0);
-        spin_lock(&c->erase_completion_lock);
+        ret = jffs2_flash_writev(c, vecs, 2, sum_ofs, &retlen, 0);
        if (ret || (retlen != infosize)) {
-                JFFS2_WARNING("Write of %zd bytes at 0x%08x failed. returned %d, retlen %zd\n",
-                        infosize, jeb->offset + c->sector_size - jeb->free_size, ret, retlen);
+                JFFS2_WARNING("Write of %u bytes at 0x%08x failed. returned %d, retlen %zd\n",
+                              infosize, sum_ofs, ret, retlen);
+                if (retlen) {
+                        /* Waste remaining space */
+                        spin_lock(&c->erase_completion_lock);
+                        jffs2_link_node_ref(c, jeb, sum_ofs | REF_OBSOLETE, infosize, NULL);
+                        spin_unlock(&c->erase_completion_lock);
+                }
                c->summary->sum_size = JFFS2_SUMMARY_NOSUM_SIZE;
-                WASTED_SPACE(infosize);
-                return 1;
+                return 0;
        }
+        spin_lock(&c->erase_completion_lock);
+        jffs2_link_node_ref(c, jeb, sum_ofs | REF_NORMAL, infosize, NULL);
+        spin_unlock(&c->erase_completion_lock);
        return 0;
 }
@@ -671,13 +803,16 @@ static int jffs2_sum_write_data(struct jffs2_sb_info *c, struct jffs2_eraseblock
 int jffs2_sum_write_sumnode(struct jffs2_sb_info *c)
 {
-        struct jffs2_raw_node_ref *summary_ref;
+        int datasize, infosize, padsize;
-        int datasize, infosize, padsize, ret;
        struct jffs2_eraseblock *jeb;
+        int ret;
        dbg_summary("called\n");
+        spin_unlock(&c->erase_completion_lock);
        jeb = c->nextblock;
+        jffs2_prealloc_raw_node_refs(c, jeb, 1);
        if (!c->summary->sum_num || !c->summary->sum_list_head) {
                JFFS2_WARNING("Empty summary info!!!\n");
@@ -696,35 +831,11 @@ int jffs2_sum_write_sumnode(struct jffs2_sb_info *c)
                jffs2_sum_disable_collecting(c->summary);
                JFFS2_WARNING("Not enough space for summary, padsize = %d\n", padsize);
+                spin_lock(&c->erase_completion_lock);
                return 0;
        }
        ret = jffs2_sum_write_data(c, jeb, infosize, datasize, padsize);
-        if (ret)
-                return 0; /* can't write out summary, block is marked as NOSUM_SIZE */
-        /* for ACCT_PARANOIA_CHECK */
-        spin_unlock(&c->erase_completion_lock);
-        summary_ref = jffs2_alloc_raw_node_ref();
        spin_lock(&c->erase_completion_lock);
+        return ret;
-        if (!summary_ref) {
-                JFFS2_NOTICE("Failed to allocate node ref for summary\n");
-                return -ENOMEM;
-        }
-        summary_ref->next_in_ino = NULL;
-        summary_ref->next_phys = NULL;
-        summary_ref->flash_offset = (jeb->offset + c->sector_size - jeb->free_size) | REF_NORMAL;
-        summary_ref->__totlen = infosize;
-        if (!jeb->first_node)
-                jeb->first_node = summary_ref;
-        if (jeb->last_node)
-                jeb->last_node->next_phys = summary_ref;
-        jeb->last_node = summary_ref;
-        USED_SPACE(infosize);
-        return 0;
 }
diff --git a/fs/jffs2/summary.h b/fs/jffs2/summary.h
index b7a678be1709..6bf1f6aa4552 100644
--- a/fs/jffs2/summary.h
+++ b/fs/jffs2/summary.h
@@ -18,23 +18,6 @@
 #include <linux/uio.h>
 #include <linux/jffs2.h>
-#define DIRTY_SPACE(x) do { typeof(x) _x = (x); \
-                c->free_size -= _x; c->dirty_size += _x; \
-                jeb->free_size -= _x ; jeb->dirty_size += _x; \
-                }while(0)
-#define USED_SPACE(x) do { typeof(x) _x = (x); \
-                c->free_size -= _x; c->used_size += _x; \
-                jeb->free_size -= _x ; jeb->used_size += _x; \
-                }while(0)
-#define WASTED_SPACE(x) do { typeof(x) _x = (x); \
-                c->free_size -= _x; c->wasted_size += _x; \
-                jeb->free_size -= _x ; jeb->wasted_size += _x; \
-                }while(0)
-#define UNCHECKED_SPACE(x) do { typeof(x) _x = (x); \
-                c->free_size -= _x; c->unchecked_size += _x; \
-                jeb->free_size -= _x ; jeb->unchecked_size += _x; \
-                }while(0)
 #define BLK_STATE_ALLFF         0
 #define BLK_STATE_CLEAN         1
 #define BLK_STATE_PARTDIRTY     2
@@ -45,6 +28,8 @@
 #define JFFS2_SUMMARY_NOSUM_SIZE 0xffffffff
 #define JFFS2_SUMMARY_INODE_SIZE (sizeof(struct jffs2_sum_inode_flash))
 #define JFFS2_SUMMARY_DIRENT_SIZE(x) (sizeof(struct jffs2_sum_dirent_flash) + (x))
+#define JFFS2_SUMMARY_XATTR_SIZE (sizeof(struct jffs2_sum_xattr_flash))
+#define JFFS2_SUMMARY_XREF_SIZE (sizeof(struct jffs2_sum_xref_flash))
 /* Summary structures used on flash */
@@ -75,11 +60,28 @@ struct jffs2_sum_dirent_flash
        uint8_t name[0];        /* dirent name */
 } __attribute__((packed));
+struct jffs2_sum_xattr_flash   /* summary entry for one xattr node, as laid out in the summary buffer */
+{
+        jint16_t nodetype;      /* == JFFS2_NODETYPE_XATTR */
+        jint32_t xid;           /* xattr identifier */
+        jint32_t version;       /* version number */
+        jint32_t offset;        /* offset on jeb */
+        jint32_t totlen;        /* node length */
+} __attribute__((packed));
+struct jffs2_sum_xref_flash    /* summary entry for one xref node; carries no length field */
+{
+        jint16_t nodetype;      /* == JFFS2_NODETYPE_XREF */
+        jint32_t offset;        /* offset on jeb */
+} __attribute__((packed));
 union jffs2_sum_flash
 {
        struct jffs2_sum_unknown_flash u;
        struct jffs2_sum_inode_flash i;
        struct jffs2_sum_dirent_flash d;
+        struct jffs2_sum_xattr_flash x;
+        struct jffs2_sum_xref_flash r;
 };
 /* Summary structures used in the memory */
@@ -114,11 +116,30 @@ struct jffs2_sum_dirent_mem
        uint8_t name[0];        /* dirent name */
 } __attribute__((packed));
+struct jffs2_sum_xattr_mem      /* in-memory summary entry describing one xattr node */
+{
+        union jffs2_sum_mem *next;      /* pointer to another in-memory summary entry */
+        jint16_t nodetype;      /* node type tag; NOTE(review): presumably the same value as in jffs2_sum_xattr_flash - confirm */
+        jint32_t xid;           /* same field name and type as jffs2_sum_xattr_flash.xid */
+        jint32_t version;       /* same field name and type as jffs2_sum_xattr_flash.version */
+        jint32_t offset;        /* same field name and type as jffs2_sum_xattr_flash.offset */
+        jint32_t totlen;        /* same field name and type as jffs2_sum_xattr_flash.totlen */
+} __attribute__((packed));      /* packed: fields are laid out with no padding between them */
+struct jffs2_sum_xref_mem       /* in-memory summary entry describing one xref node */
+{
+        union jffs2_sum_mem *next;      /* pointer to another in-memory summary entry */
+        jint16_t nodetype;      /* node type tag; NOTE(review): presumably the same value as in jffs2_sum_xref_flash - confirm */
+        jint32_t offset;        /* same field name and type as jffs2_sum_xref_flash.offset */
+} __attribute__((packed));      /* packed: fields are laid out with no padding between them */
 union jffs2_sum_mem
 {
        struct jffs2_sum_unknown_mem u;
        struct jffs2_sum_inode_mem i;
        struct jffs2_sum_dirent_mem d;
+        struct jffs2_sum_xattr_mem x;          /* view of the entry as an in-memory xattr summary record */
+        struct jffs2_sum_xref_mem r;           /* view of the entry as an in-memory xref summary record */
 };
 /* Summary related information stored in superblock */
@@ -159,8 +180,11 @@ int jffs2_sum_write_sumnode(struct jffs2_sb_info *c);
 int jffs2_sum_add_padding_mem(struct jffs2_summary *s, uint32_t size);
 int jffs2_sum_add_inode_mem(struct jffs2_summary *s, struct jffs2_raw_inode *ri, uint32_t ofs);
 int jffs2_sum_add_dirent_mem(struct jffs2_summary *s, struct jffs2_raw_dirent *rd, uint32_t ofs);
+int jffs2_sum_add_xattr_mem(struct jffs2_summary *s, struct jffs2_raw_xattr *rx, uint32_t ofs);
+int jffs2_sum_add_xref_mem(struct jffs2_summary *s, struct jffs2_raw_xref *rr, uint32_t ofs);
 int jffs2_sum_scan_sumnode(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb,
-                        uint32_t ofs, uint32_t *pseudo_random);
+                           struct jffs2_raw_summary *summary, uint32_t sumlen,
+                           uint32_t *pseudo_random);
 #else                           /* SUMMARY DISABLED */
@@ -176,7 +200,9 @@ int jffs2_sum_scan_sumnode(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb
 #define jffs2_sum_add_padding_mem(a,b)
 #define jffs2_sum_add_inode_mem(a,b,c)
 #define jffs2_sum_add_dirent_mem(a,b,c)
-#define jffs2_sum_scan_sumnode(a,b,c,d) (0)
+#define jffs2_sum_add_xattr_mem(a,b,c)
+#define jffs2_sum_add_xref_mem(a,b,c)
+#define jffs2_sum_scan_sumnode(a,b,c,d,e) (0)
 #endif /* CONFIG_JFFS2_SUMMARY */
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index ffd8e84b22cc..68e3953419b4 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -11,7 +11,6 @@
 *
 */
-#include <linux/config.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/slab.h>
@@ -111,9 +110,10 @@ static int jffs2_sb_set(struct super_block *sb, void *data)
        return 0;
 }
-static struct super_block *jffs2_get_sb_mtd(struct file_system_type *fs_type,
+static int jffs2_get_sb_mtd(struct file_system_type *fs_type,
-                                              int flags, const char *dev_name,
+                            int flags, const char *dev_name,
-                                              void *data, struct mtd_info *mtd)
+                            void *data, struct mtd_info *mtd,
+                            struct vfsmount *mnt)
 {
        struct super_block *sb;
        struct jffs2_sb_info *c;
@@ -121,19 +121,20 @@ static struct super_block *jffs2_get_sb_mtd(struct file_system_type *fs_type,
        c = kmalloc(sizeof(*c), GFP_KERNEL);
        if (!c)
-                return ERR_PTR(-ENOMEM);
+                return -ENOMEM;
        memset(c, 0, sizeof(*c));
        c->mtd = mtd;
        sb = sget(fs_type, jffs2_sb_compare, jffs2_sb_set, c);
        if (IS_ERR(sb))
-                goto out_put;
+                goto out_error;
        if (sb->s_root) {
                /* New mountpoint for JFFS2 which is already mounted */
                D1(printk(KERN_DEBUG "jffs2_get_sb_mtd(): Device %d (\"%s\") is already mounted\n",
                          mtd->index, mtd->name));
+                ret = simple_set_mnt(mnt, sb);
                goto out_put;
        }
@@ -151,51 +152,57 @@ static struct super_block *jffs2_get_sb_mtd(struct file_system_type *fs_type,
        sb->s_op = &jffs2_super_operations;
        sb->s_flags = flags | MS_NOATIME;
+        sb->s_xattr = jffs2_xattr_handlers;
+#ifdef CONFIG_JFFS2_FS_POSIX_ACL
+        sb->s_flags |= MS_POSIXACL;
+#endif
        ret = jffs2_do_fill_super(sb, data, flags & MS_SILENT ? 1 : 0);
        if (ret) {
                /* Failure case... */
                up_write(&sb->s_umount);
                deactivate_super(sb);
-                return ERR_PTR(ret);
+                return ret;
        }
        sb->s_flags |= MS_ACTIVE;
-        return sb;
+        return simple_set_mnt(mnt, sb);
+out_error:
+        ret = PTR_ERR(sb);
 out_put:
        kfree(c);
        put_mtd_device(mtd);
-        return sb;
+        return ret;
 }
-static struct super_block *jffs2_get_sb_mtdnr(struct file_system_type *fs_type,
+static int jffs2_get_sb_mtdnr(struct file_system_type *fs_type,        /* now returns an int status instead of a super_block pointer */
-                                              int flags, const char *dev_name,
+                              int flags, const char *dev_name,
-                                              void *data, int mtdnr)
+                              void *data, int mtdnr,   /* mtdnr: MTD device number to look up */
+                              struct vfsmount *mnt)    /* new argument, forwarded unchanged to jffs2_get_sb_mtd() */
 {
        struct mtd_info *mtd;
        mtd = get_mtd_device(NULL, mtdnr);
        if (!mtd) {
                D1(printk(KERN_DEBUG "jffs2: MTD device #%u doesn't appear to exist\n", mtdnr));
-                return ERR_PTR(-EINVAL);
+                return -EINVAL;        /* plain negative errno, matching the new int return type */
        }
-        return jffs2_get_sb_mtd(fs_type, flags, dev_name, data, mtd);
+        return jffs2_get_sb_mtd(fs_type, flags, dev_name, data, mtd, mnt);     /* passes on the mtd obtained above together with mnt */
 }
-static struct super_block *jffs2_get_sb(struct file_system_type *fs_type,
+static int jffs2_get_sb(struct file_system_type *fs_type,
-                                        int flags, const char *dev_name,
+                        int flags, const char *dev_name,
-                                        void *data)
+                        void *data, struct vfsmount *mnt)
 {
        int err;
        struct nameidata nd;
        int mtdnr;
        if (!dev_name)
-                return ERR_PTR(-EINVAL);
+                return -EINVAL;
        D1(printk(KERN_DEBUG "jffs2_get_sb(): dev_name \"%s\"\n", dev_name));
@@ -217,7 +224,7 @@ static struct super_block *jffs2_get_sb(struct file_system_type *fs_type,
                                mtd = get_mtd_device(NULL, mtdnr);
                                if (mtd) {
                                        if (!strcmp(mtd->name, dev_name+4))
-                                                return jffs2_get_sb_mtd(fs_type, flags, dev_name, data, mtd);
+                                                return jffs2_get_sb_mtd(fs_type, flags, dev_name, data, mtd, mnt);
                                        put_mtd_device(mtd);
                                }
                        }
@@ -230,7 +237,7 @@ static struct super_block *jffs2_get_sb(struct file_system_type *fs_type,
                        if (!*endptr) {
                                /* It was a valid number */
                                D1(printk(KERN_DEBUG "jffs2_get_sb(): mtd%%d, mtdnr %d\n", mtdnr));
-                                return jffs2_get_sb_mtdnr(fs_type, flags, dev_name, data, mtdnr);
+                                return jffs2_get_sb_mtdnr(fs_type, flags, dev_name, data, mtdnr, mnt);
                        }
                }
        }
@@ -244,7 +251,7 @@ static struct super_block *jffs2_get_sb(struct file_system_type *fs_type,
                  err, nd.dentry->d_inode));
        if (err)
-                return ERR_PTR(err);
+                return err;
        err = -EINVAL;
@@ -266,11 +273,11 @@ static struct super_block *jffs2_get_sb(struct file_system_type *fs_type,
        mtdnr = iminor(nd.dentry->d_inode);
        path_release(&nd);
-        return jffs2_get_sb_mtdnr(fs_type, flags, dev_name, data, mtdnr);
+        return jffs2_get_sb_mtdnr(fs_type, flags, dev_name, data, mtdnr, mnt);
 out:
        path_release(&nd);
-        return ERR_PTR(err);
+        return err;
 }
 static void jffs2_put_super (struct super_block *sb)
@@ -293,6 +300,7 @@ static void jffs2_put_super (struct super_block *sb)
                kfree(c->blocks);
        jffs2_flash_cleanup(c);
        kfree(c->inocache_list);
+        jffs2_clear_xattr_subsystem(c);
        if (c->mtd->sync)
                c->mtd->sync(c->mtd);
@@ -320,6 +328,18 @@ static int __init init_jffs2_fs(void)
 {
        int ret;
+        /* Paranoia checks for on-medium structures. If we ask GCC
+           to pack them with __attribute__((packed)) then it _also_
+           assumes that they're not aligned -- so it emits crappy
+           code on some architectures. Ideally we want an attribute
+           which means just 'no padding', without the alignment
+           thing. But GCC doesn't have that -- we have to just
+           hope the structs are the right sizes, instead. */
+        BUG_ON(sizeof(struct jffs2_unknown_node) != 12);
+        BUG_ON(sizeof(struct jffs2_raw_dirent) != 40);
+        BUG_ON(sizeof(struct jffs2_raw_inode) != 68);
+        BUG_ON(sizeof(struct jffs2_raw_summary) != 32);
        printk(KERN_INFO "JFFS2 version 2.2."
 #ifdef CONFIG_JFFS2_FS_WRITEBUFFER
               " (NAND)"
@@ -327,7 +347,7 @@ static int __init init_jffs2_fs(void)
 #ifdef CONFIG_JFFS2_SUMMARY
               " (SUMMARY) "
 #endif
-               " (C) 2001-2003 Red Hat, Inc.\n");
+               " (C) 2001-2006 Red Hat, Inc.\n");
        jffs2_inode_cachep = kmem_cache_create("jffs2_i",
                                             sizeof(struct jffs2_inode_info),
diff --git a/fs/jffs2/symlink.c b/fs/jffs2/symlink.c
index d55754fe8925..fc211b6e9b03 100644
--- a/fs/jffs2/symlink.c
+++ b/fs/jffs2/symlink.c
@@ -24,7 +24,12 @@ struct inode_operations jffs2_symlink_inode_operations =
 {
        .readlink =     generic_readlink,
        .follow_link =  jffs2_follow_link,
-        .setattr =      jffs2_setattr
+        .permission =   jffs2_permission,
+        .setattr =      jffs2_setattr,
+        .setxattr =     jffs2_setxattr,
+        .getxattr =     jffs2_getxattr,
+        .listxattr =    jffs2_listxattr,
+        .removexattr =  jffs2_removexattr
 };
 static void *jffs2_follow_link(struct dentry *dentry, struct nameidata *nd)
diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index 4cebf0e57c46..b9b700730dfe 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -156,69 +156,130 @@ static void jffs2_block_refile(struct jffs2_sb_info *c, struct jffs2_eraseblock
                jffs2_erase_pending_trigger(c);
        }
-        /* Adjust its size counts accordingly */
+        if (!jffs2_prealloc_raw_node_refs(c, jeb, 1)) {
-        c->wasted_size += jeb->free_size;
+                uint32_t oldfree = jeb->free_size;
-        c->free_size -= jeb->free_size;
-        jeb->wasted_size += jeb->free_size;
+                jffs2_link_node_ref(c, jeb, 
-        jeb->free_size = 0;
+                                    (jeb->offset+c->sector_size-oldfree) | REF_OBSOLETE,
+                                    oldfree, NULL);
+                /* convert to wasted */
+                c->wasted_size += oldfree;
+                jeb->wasted_size += oldfree;
+                c->dirty_size -= oldfree;
+                jeb->dirty_size -= oldfree;
+        }
        jffs2_dbg_dump_block_lists_nolock(c);
        jffs2_dbg_acct_sanity_check_nolock(c,jeb);
        jffs2_dbg_acct_paranoia_check_nolock(c, jeb);
 }
+static struct jffs2_raw_node_ref **jffs2_incore_replace_raw(struct jffs2_sb_info *c,   /* Locate the in-core ->raw pointer slot that currently holds 'raw'; 'c' is not referenced in this body */
+                                                            struct jffs2_inode_info *f,        /* in-core inode whose metadata, fragtree and dents are searched */
+                                                            struct jffs2_raw_node_ref *raw,    /* the node ref being looked for */
+                                                            union jffs2_node_union *node)      /* node contents; u.magic, u.nodetype and i.offset are read */
+{      /* Returns the address of the matching ->raw field, or NULL for node types other than INODE/DIRENT */
+        struct jffs2_node_frag *frag;
+        struct jffs2_full_dirent *fd;
+        dbg_noderef("incore_replace_raw: node at %p is {%04x,%04x}\n",
+                    node, je16_to_cpu(node->u.magic), je16_to_cpu(node->u.nodetype));
+        BUG_ON(je16_to_cpu(node->u.magic) != 0x1985 &&         /* any magic other than 0x1985 or 0 is treated as fatal */
+               je16_to_cpu(node->u.magic) != 0);
+        switch (je16_to_cpu(node->u.nodetype)) {
+        case JFFS2_NODETYPE_INODE:
+                if (f->metadata && f->metadata->raw == raw) {          /* check the metadata node first, before walking the fragtree */
+                        dbg_noderef("Will replace ->raw in f->metadata at %p\n", f->metadata);
+                        return &f->metadata->raw;
+                }
+                frag = jffs2_lookup_node_frag(&f->fragtree, je32_to_cpu(node->i.offset));      /* start the search at the frag found for the node's data offset */
+                BUG_ON(!frag);
+                /* Find a frag which refers to the full_dnode we want to modify */
+                while (!frag->node || frag->node->raw != raw) {        /* skip frags with no node or with a different raw */
+                        frag = frag_next(frag);
+                        BUG_ON(!frag);         /* running out of frags without a match is fatal */
+                }
+                dbg_noderef("Will replace ->raw in full_dnode at %p\n", frag->node);
+                return &frag->node->raw;
+        case JFFS2_NODETYPE_DIRENT:
+                for (fd = f->dents; fd; fd = fd->next) {       /* linear scan of the inode's dirent list */
+                        if (fd->raw == raw) {
+                                dbg_noderef("Will replace ->raw in full_dirent at %p\n", fd);
+                                return &fd->raw;
+                        }
+                }
+                BUG();         /* no full_dirent in f->dents refers to raw */
+        default:
+                dbg_noderef("Don't care about replacing raw for nodetype %x\n",
+                            je16_to_cpu(node->u.nodetype));
+                break;
+        }
+        return NULL;           /* reached only via the default case above */
+}
 /* Recover from failure to write wbuf. Recover the nodes up to the
 * wbuf, not the one which we were starting to try to write. */
 static void jffs2_wbuf_recover(struct jffs2_sb_info *c)
 {
        struct jffs2_eraseblock *jeb, *new_jeb;
-        struct jffs2_raw_node_ref **first_raw, **raw;
+        struct jffs2_raw_node_ref *raw, *next, *first_raw = NULL;
        size_t retlen;
        int ret;
+        int nr_refile = 0;
        unsigned char *buf;
        uint32_t start, end, ofs, len;
-        spin_lock(&c->erase_completion_lock);
        jeb = &c->blocks[c->wbuf_ofs / c->sector_size];
+        spin_lock(&c->erase_completion_lock);
        jffs2_block_refile(c, jeb, REFILE_NOTEMPTY);
+        spin_unlock(&c->erase_completion_lock);
+        BUG_ON(!ref_obsolete(jeb->last_node));
        /* Find the first node to be recovered, by skipping over every
           node which ends before the wbuf starts, or which is obsolete. */
-        first_raw = &jeb->first_node;
+        for (next = raw = jeb->first_node; next; raw = next) {
-        while (*first_raw &&
+                next = ref_next(raw);
-               (ref_obsolete(*first_raw) ||
-                (ref_offset(*first_raw)+ref_totlen(c, jeb, *first_raw)) < c->wbuf_ofs)) {
+                if (ref_obsolete(raw) || 
-                D1(printk(KERN_DEBUG "Skipping node at 0x%08x(%d)-0x%08x which is either before 0x%08x or obsolete\n",
+                    (next && ref_offset(next) <= c->wbuf_ofs)) {
-                          ref_offset(*first_raw), ref_flags(*first_raw),
+                        dbg_noderef("Skipping node at 0x%08x(%d)-0x%08x which is either before 0x%08x or obsolete\n",
-                          (ref_offset(*first_raw) + ref_totlen(c, jeb, *first_raw)),
+                                    ref_offset(raw), ref_flags(raw),
-                          c->wbuf_ofs));
+                                    (ref_offset(raw) + ref_totlen(c, jeb, raw)),
-                first_raw = &(*first_raw)->next_phys;
+                                    c->wbuf_ofs);
+                        continue;
+                }
+                dbg_noderef("First node to be recovered is at 0x%08x(%d)-0x%08x\n",
+                            ref_offset(raw), ref_flags(raw),
+                            (ref_offset(raw) + ref_totlen(c, jeb, raw)));
+                first_raw = raw;
+                break;
        }
-        if (!*first_raw) {
+        if (!first_raw) {
                /* All nodes were obsolete. Nothing to recover. */
                D1(printk(KERN_DEBUG "No non-obsolete nodes to be recovered. Just filing block bad\n"));
-                spin_unlock(&c->erase_completion_lock);
+                c->wbuf_len = 0;
                return;
        }
-        start = ref_offset(*first_raw);
+        start = ref_offset(first_raw);
-        end = ref_offset(*first_raw) + ref_totlen(c, jeb, *first_raw);
+        end = ref_offset(jeb->last_node);
+        nr_refile = 1;
-        /* Find the last node to be recovered */
-        raw = first_raw;
-        while ((*raw)) {
-                if (!ref_obsolete(*raw))
-                        end = ref_offset(*raw) + ref_totlen(c, jeb, *raw);
-                raw = &(*raw)->next_phys;
+        /* Count the number of refs which need to be copied */
-        }
+        while ((raw = ref_next(raw)) != jeb->last_node)
-        spin_unlock(&c->erase_completion_lock);
+                nr_refile++;
-        D1(printk(KERN_DEBUG "wbuf recover %08x-%08x\n", start, end));
+        dbg_noderef("wbuf recover %08x-%08x (%d bytes in %d nodes)\n",
+                    start, end, end - start, nr_refile);
        buf = NULL;
        if (start < c->wbuf_ofs) {
@@ -233,28 +294,37 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c)
                }
                /* Do the read... */
-                if (jffs2_cleanmarker_oob(c))
+                ret = c->mtd->read(c->mtd, start, c->wbuf_ofs - start, &retlen, buf);
-                        ret = c->mtd->read_ecc(c->mtd, start, c->wbuf_ofs - start, &retlen, buf, NULL, c->oobinfo);
-                else
-                        ret = c->mtd->read(c->mtd, start, c->wbuf_ofs - start, &retlen, buf);
-                if (ret == -EBADMSG && retlen == c->wbuf_ofs - start) {
+                /* ECC recovered ? */
-                        /* ECC recovered */
+                if ((ret == -EUCLEAN || ret == -EBADMSG) &&
+                    (retlen == c->wbuf_ofs - start))
                        ret = 0;
-                }
                if (ret || retlen != c->wbuf_ofs - start) {
                        printk(KERN_CRIT "Old data are already lost in wbuf recovery. Data loss ensues.\n");
                        kfree(buf);
                        buf = NULL;
                read_failed:
-                        first_raw = &(*first_raw)->next_phys;
+                        first_raw = ref_next(first_raw);
+                        nr_refile--;
+                        while (first_raw && ref_obsolete(first_raw)) {
+                                first_raw = ref_next(first_raw);
+                                nr_refile--;
+                        }
                        /* If this was the only node to be recovered, give up */
-                        if (!(*first_raw))
+                        if (!first_raw) {
+                                c->wbuf_len = 0;
                                return;
+                        }
                        /* It wasn't. Go on and try to recover nodes complete in the wbuf */
-                        start = ref_offset(*first_raw);
+                        start = ref_offset(first_raw);
+                        dbg_noderef("wbuf now recover %08x-%08x (%d bytes in %d nodes)\n",
+                                    start, end, end - start, nr_refile);
                } else {
                        /* Read succeeded. Copy the remaining data from the wbuf */
                        memcpy(buf + (c->wbuf_ofs - start), c->wbuf, end - c->wbuf_ofs);
@@ -263,14 +333,23 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c)
        /* OK... we're to rewrite (end-start) bytes of data from first_raw onwards.
           Either 'buf' contains the data, or we find it in the wbuf */
        /* ... and get an allocation of space from a shiny new block instead */
-        ret = jffs2_reserve_space_gc(c, end-start, &ofs, &len, JFFS2_SUMMARY_NOSUM_SIZE);
+        ret = jffs2_reserve_space_gc(c, end-start, &len, JFFS2_SUMMARY_NOSUM_SIZE);
        if (ret) {
                printk(KERN_WARNING "Failed to allocate space for wbuf recovery. Data loss ensues.\n");
                kfree(buf);
                return;
        }
+        ret = jffs2_prealloc_raw_node_refs(c, c->nextblock, nr_refile);
+        if (ret) {
+                printk(KERN_WARNING "Failed to allocate node refs for wbuf recovery. Data loss ensues.\n");
+                kfree(buf);
+                return;
+        }
+        ofs = write_ofs(c);
        if (end-start >= c->wbuf_pagesize) {
                /* Need to do another write immediately, but it's possible
                   that this is just because the wbuf itself is completely
@@ -288,36 +367,22 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c)
                if (breakme++ == 20) {
                        printk(KERN_NOTICE "Faking write error at 0x%08x\n", ofs);
                        breakme = 0;
-                        c->mtd->write_ecc(c->mtd, ofs, towrite, &retlen,
+                        c->mtd->write(c->mtd, ofs, towrite, &retlen,
-                                          brokenbuf, NULL, c->oobinfo);
+                                      brokenbuf);
                        ret = -EIO;
                } else
 #endif
-                if (jffs2_cleanmarker_oob(c))
+                        ret = c->mtd->write(c->mtd, ofs, towrite, &retlen,
-                        ret = c->mtd->write_ecc(c->mtd, ofs, towrite, &retlen,
+                                            rewrite_buf);
-                                                rewrite_buf, NULL, c->oobinfo);
-                else
-                        ret = c->mtd->write(c->mtd, ofs, towrite, &retlen, rewrite_buf);
                if (ret || retlen != towrite) {
                        /* Argh. We tried. Really we did. */
                        printk(KERN_CRIT "Recovery of wbuf failed due to a second write error\n");
                        kfree(buf);
-                        if (retlen) {
+                        if (retlen)
-                                struct jffs2_raw_node_ref *raw2;
+                                jffs2_add_physical_node_ref(c, ofs | REF_OBSOLETE, ref_totlen(c, jeb, first_raw), NULL);
-                                raw2 = jffs2_alloc_raw_node_ref();
-                                if (!raw2)
-                                        return;
-                                raw2->flash_offset = ofs | REF_OBSOLETE;
-                                raw2->__totlen = ref_totlen(c, jeb, *first_raw);
-                                raw2->next_phys = NULL;
-                                raw2->next_in_ino = NULL;
-                                jffs2_add_physical_node_ref(c, raw2);
-                        }
                        return;
                }
                printk(KERN_NOTICE "Recovery of wbuf succeeded to %08x\n", ofs);
@@ -326,12 +391,10 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c)
                c->wbuf_ofs = ofs + towrite;
                memmove(c->wbuf, rewrite_buf + towrite, c->wbuf_len);
                /* Don't muck about with c->wbuf_inodes. False positives are harmless. */
-                kfree(buf);
        } else {
                /* OK, now we're left with the dregs in whichever buffer we're using */
                if (buf) {
                        memcpy(c->wbuf, buf, end-start);
-                        kfree(buf);
                } else {
                        memmove(c->wbuf, c->wbuf + (start - c->wbuf_ofs), end - start);
                }
@@ -343,62 +406,110 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c)
        new_jeb = &c->blocks[ofs / c->sector_size];
        spin_lock(&c->erase_completion_lock);
-        if (new_jeb->first_node) {
+        for (raw = first_raw; raw != jeb->last_node; raw = ref_next(raw)) {
-                /* Odd, but possible with ST flash later maybe */
+                uint32_t rawlen = ref_totlen(c, jeb, raw);
-                new_jeb->last_node->next_phys = *first_raw;
+                struct jffs2_inode_cache *ic;
-        } else {
+                struct jffs2_raw_node_ref *new_ref;
-                new_jeb->first_node = *first_raw;
+                struct jffs2_raw_node_ref **adjust_ref = NULL;
-        }
+                struct jffs2_inode_info *f = NULL;
-        raw = first_raw;
-        while (*raw) {
-                uint32_t rawlen = ref_totlen(c, jeb, *raw);
                D1(printk(KERN_DEBUG "Refiling block of %08x at %08x(%d) to %08x\n",
-                          rawlen, ref_offset(*raw), ref_flags(*raw), ofs));
+                          rawlen, ref_offset(raw), ref_flags(raw), ofs));
+                ic = jffs2_raw_ref_to_ic(raw);
+                /* Ick. This XATTR mess should be fixed shortly... */
+                if (ic && ic->class == RAWNODE_CLASS_XATTR_DATUM) {
+                        struct jffs2_xattr_datum *xd = (void *)ic;
+                        BUG_ON(xd->node != raw);
+                        adjust_ref = &xd->node;
+                        raw->next_in_ino = NULL;
+                        ic = NULL;
+                } else if (ic && ic->class == RAWNODE_CLASS_XATTR_REF) {
+                        struct jffs2_xattr_datum *xr = (void *)ic;
+                        BUG_ON(xr->node != raw);
+                        adjust_ref = &xr->node;
+                        raw->next_in_ino = NULL;
+                        ic = NULL;
+                } else if (ic && ic->class == RAWNODE_CLASS_INODE_CACHE) {
+                        struct jffs2_raw_node_ref **p = &ic->nodes;
+                        /* Remove the old node from the per-inode list */
+                        while (*p && *p != (void *)ic) {
+                                if (*p == raw) {
+                                        (*p) = (raw->next_in_ino);
+                                        raw->next_in_ino = NULL;
+                                        break;
+                                }
+                                p = &((*p)->next_in_ino);
+                        }
-                if (ref_obsolete(*raw)) {
+                        if (ic->state == INO_STATE_PRESENT && !ref_obsolete(raw)) {
-                        /* Shouldn't really happen much */
+                                /* If it's an in-core inode, then we have to adjust any
-                        new_jeb->dirty_size += rawlen;
+                                   full_dirent or full_dnode structure to point to the
-                        new_jeb->free_size -= rawlen;
+                                   new version instead of the old */
-                        c->dirty_size += rawlen;
+                                f = jffs2_gc_fetch_inode(c, ic->ino, ic->nlink);
-                } else {
+                                if (IS_ERR(f)) {
-                        new_jeb->used_size += rawlen;
+                                        /* Should never happen; it _must_ be present */
-                        new_jeb->free_size -= rawlen;
+                                        JFFS2_ERROR("Failed to iget() ino #%u, err %ld\n",
+                                                    ic->ino, PTR_ERR(f));
+                                        BUG();
+                                }
+                                /* We don't lock f->sem. There's a number of ways we could
+                                   end up in here with it already being locked, and nobody's
+                                   going to modify it on us anyway because we hold the
+                                   alloc_sem. We're only changing one ->raw pointer too,
+                                   which we can get away with without upsetting readers. */
+                                adjust_ref = jffs2_incore_replace_raw(c, f, raw,
+                                                                      (void *)(buf?:c->wbuf) + (ref_offset(raw) - start));
+                        } else if (unlikely(ic->state != INO_STATE_PRESENT &&
+                                            ic->state != INO_STATE_CHECKEDABSENT &&
+                                            ic->state != INO_STATE_GC)) {
+                                JFFS2_ERROR("Inode #%u is in strange state %d!\n", ic->ino, ic->state);
+                                BUG();
+                        }
+                }
+                new_ref = jffs2_link_node_ref(c, new_jeb, ofs | ref_flags(raw), rawlen, ic);
+                if (adjust_ref) {
+                        BUG_ON(*adjust_ref != raw);
+                        *adjust_ref = new_ref;
+                }
+                if (f)
+                        jffs2_gc_release_inode(c, f);
+                if (!ref_obsolete(raw)) {
                        jeb->dirty_size += rawlen;
                        jeb->used_size  -= rawlen;
                        c->dirty_size += rawlen;
+                        c->used_size -= rawlen;
+                        raw->flash_offset = ref_offset(raw) | REF_OBSOLETE;
+                        BUG_ON(raw->next_in_ino);
                }
-                c->free_size -= rawlen;
-                (*raw)->flash_offset = ofs | ref_flags(*raw);
                ofs += rawlen;
-                new_jeb->last_node = *raw;
-                raw = &(*raw)->next_phys;
        }
+        kfree(buf);
        /* Fix up the original jeb now it's on the bad_list */
-        *first_raw = NULL;
+        if (first_raw == jeb->first_node) {
-        if (first_raw == &jeb->first_node) {
-                jeb->last_node = NULL;
                D1(printk(KERN_DEBUG "Failing block at %08x is now empty. Moving to erase_pending_list\n", jeb->offset));
-                list_del(&jeb->list);
+                list_move(&jeb->list, &c->erase_pending_list);
-                list_add(&jeb->list, &c->erase_pending_list);
                c->nr_erasing_blocks++;
                jffs2_erase_pending_trigger(c);
        }
-        else
-                jeb->last_node = container_of(first_raw, struct jffs2_raw_node_ref, next_phys);
        jffs2_dbg_acct_sanity_check_nolock(c, jeb);
-        jffs2_dbg_acct_paranoia_check_nolock(c, jeb);
+        jffs2_dbg_acct_paranoia_check_nolock(c, jeb);
        jffs2_dbg_acct_sanity_check_nolock(c, new_jeb);
-        jffs2_dbg_acct_paranoia_check_nolock(c, new_jeb);
+        jffs2_dbg_acct_paranoia_check_nolock(c, new_jeb);
        spin_unlock(&c->erase_completion_lock);
-        D1(printk(KERN_DEBUG "wbuf recovery completed OK\n"));
+        D1(printk(KERN_DEBUG "wbuf recovery completed OK. wbuf_ofs 0x%08x, len 0x%x\n", c->wbuf_ofs, c->wbuf_len));
 }
 /* Meaning of pad argument:
@@ -412,6 +523,7 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c)
 static int __jffs2_flush_wbuf(struct jffs2_sb_info *c, int pad)
 {
+        struct jffs2_eraseblock *wbuf_jeb;
        int ret;
        size_t retlen;
@@ -429,6 +541,10 @@ static int __jffs2_flush_wbuf(struct jffs2_sb_info *c, int pad)
        if (!c->wbuf_len)       /* already checked c->wbuf above */
                return 0;
+        wbuf_jeb = &c->blocks[c->wbuf_ofs / c->sector_size];
+        if (jffs2_prealloc_raw_node_refs(c, wbuf_jeb, c->nextblock->allocated_refs + 1))
+                return -ENOMEM;
        /* claim remaining space on the page
           this happens, if we have a change to a new block,
           or if fsync forces us to flush the writebuffer.
@@ -458,15 +574,12 @@ static int __jffs2_flush_wbuf(struct jffs2_sb_info *c, int pad)
        if (breakme++ == 20) {
                printk(KERN_NOTICE "Faking write error at 0x%08x\n", c->wbuf_ofs);
                breakme = 0;
-                c->mtd->write_ecc(c->mtd, c->wbuf_ofs, c->wbuf_pagesize,
+                c->mtd->write(c->mtd, c->wbuf_ofs, c->wbuf_pagesize, &retlen,
-                                        &retlen, brokenbuf, NULL, c->oobinfo);
+                              brokenbuf);
                ret = -EIO;
        } else
 #endif
-        if (jffs2_cleanmarker_oob(c))
-                ret = c->mtd->write_ecc(c->mtd, c->wbuf_ofs, c->wbuf_pagesize, &retlen, c->wbuf, NULL, c->oobinfo);
-        else
                ret = c->mtd->write(c->mtd, c->wbuf_ofs, c->wbuf_pagesize, &retlen, c->wbuf);
        if (ret || retlen != c->wbuf_pagesize) {
@@ -483,32 +596,34 @@ static int __jffs2_flush_wbuf(struct jffs2_sb_info *c, int pad)
                return ret;
        }
-        spin_lock(&c->erase_completion_lock);
        /* Adjust free size of the block if we padded. */
        if (pad) {
-                struct jffs2_eraseblock *jeb;
+                uint32_t waste = c->wbuf_pagesize - c->wbuf_len;
-                jeb = &c->blocks[c->wbuf_ofs / c->sector_size];
                D1(printk(KERN_DEBUG "jffs2_flush_wbuf() adjusting free_size of %sblock at %08x\n",
-                          (jeb==c->nextblock)?"next":"", jeb->offset));
+                          (wbuf_jeb==c->nextblock)?"next":"", wbuf_jeb->offset));
                /* wbuf_pagesize - wbuf_len is the amount of space that's to be
                   padded. If there is less free space in the block than that,
                   something screwed up */
-                if (jeb->free_size < (c->wbuf_pagesize - c->wbuf_len)) {
+                if (wbuf_jeb->free_size < waste) {
                        printk(KERN_CRIT "jffs2_flush_wbuf(): Accounting error. wbuf at 0x%08x has 0x%03x bytes, 0x%03x left.\n",
-                               c->wbuf_ofs, c->wbuf_len, c->wbuf_pagesize-c->wbuf_len);
+                               c->wbuf_ofs, c->wbuf_len, waste);
                        printk(KERN_CRIT "jffs2_flush_wbuf(): But free_size for block at 0x%08x is only 0x%08x\n",
-                               jeb->offset, jeb->free_size);
+                               wbuf_jeb->offset, wbuf_jeb->free_size);
                        BUG();
                }
-                jeb->free_size -= (c->wbuf_pagesize - c->wbuf_len);
-                c->free_size -= (c->wbuf_pagesize - c->wbuf_len);
+                spin_lock(&c->erase_completion_lock);
-                jeb->wasted_size += (c->wbuf_pagesize - c->wbuf_len);
-                c->wasted_size += (c->wbuf_pagesize - c->wbuf_len);
+                jffs2_link_node_ref(c, wbuf_jeb, (c->wbuf_ofs + c->wbuf_len) | REF_OBSOLETE, waste, NULL);
-        }
+                /* FIXME: that made it count as dirty. Convert to wasted */
+                wbuf_jeb->dirty_size -= waste;
+                c->dirty_size -= waste;
+                wbuf_jeb->wasted_size += waste;
+                c->wasted_size += waste;
+        } else
+                spin_lock(&c->erase_completion_lock);
        /* Stick any now-obsoleted blocks on the erase_pending_list */
        jffs2_refile_wbuf_blocks(c);
@@ -603,20 +718,30 @@ int jffs2_flush_wbuf_pad(struct jffs2_sb_info *c)
        return ret;
 }
-int jffs2_flash_writev(struct jffs2_sb_info *c, const struct kvec *invecs, unsigned long count, loff_t to, size_t *retlen, uint32_t ino)
+/*
+ * Copy as much of buf as fits into the free tail of the write buffer and
+ * return the number of bytes taken.  Takes nothing (returns 0) when the
+ * write buffer is empty and len is at least one wbuf page.
+ */
+static size_t jffs2_fill_wbuf(struct jffs2_sb_info *c, const uint8_t *buf,
+                              size_t len)
+{
+        size_t room;
+        if (len && !c->wbuf_len && (len >= c->wbuf_pagesize))
+                return 0;
+        /* Clamp the copy to the space left in the current wbuf page */
+        room = c->wbuf_pagesize - c->wbuf_len;
+        if (room < len)
+                len = room;
+        memcpy(c->wbuf + c->wbuf_len, buf, len);
+        c->wbuf_len += (uint32_t) len;
+        return len;
+}
+int jffs2_flash_writev(struct jffs2_sb_info *c, const struct kvec *invecs,
+                       unsigned long count, loff_t to, size_t *retlen,
+                       uint32_t ino)
 {
-        struct kvec outvecs[3];
+        struct jffs2_eraseblock *jeb;
-        uint32_t totlen = 0;
+        size_t wbuf_retlen, donelen = 0;
-        uint32_t split_ofs = 0;
-        uint32_t old_totlen;
-        int ret, splitvec = -1;
-        int invec, outvec;
-        size_t wbuf_retlen;
-        unsigned char *wbuf_ptr;
-        size_t donelen = 0;
        uint32_t outvec_to = to;
+        int ret, invec;
-        /* If not NAND flash, don't bother */
+        /* If not writebuffered flash, don't bother */
        if (!jffs2_is_writebuffered(c))
                return jffs2_flash_direct_writev(c, invecs, count, to, retlen);
@@ -629,34 +754,22 @@ int jffs2_flash_writev(struct jffs2_sb_info *c, const struct kvec *invecs, unsig
                memset(c->wbuf,0xff,c->wbuf_pagesize);
        }
-        /* Fixup the wbuf if we are moving to a new eraseblock.  The checks below
+        /*
-           fail for ECC'd NOR because cleanmarker == 16, so a block starts at
+         * Sanity checks on target address.  It's permitted to write
-           xxx0010.  */
+         * at PAD(c->wbuf_len+c->wbuf_ofs), and it's permitted to
-        if (jffs2_nor_ecc(c)) {
+         * write at the beginning of a new erase block. Anything else,
-                if (((c->wbuf_ofs % c->sector_size) == 0) && !c->wbuf_len) {
+         * and you die.  New block starts at xxx000c (0-b = block
-                        c->wbuf_ofs = PAGE_DIV(to);
+         * header)
-                        c->wbuf_len = PAGE_MOD(to);
+         */
-                        memset(c->wbuf,0xff,c->wbuf_pagesize);
-                }
-        }
-        /* Sanity checks on target address.
-           It's permitted to write at PAD(c->wbuf_len+c->wbuf_ofs),
-           and it's permitted to write at the beginning of a new
-           erase block. Anything else, and you die.
-           New block starts at xxx000c (0-b = block header)
-        */
        if (SECTOR_ADDR(to) != SECTOR_ADDR(c->wbuf_ofs)) {
                /* It's a write to a new block */
                if (c->wbuf_len) {
-                        D1(printk(KERN_DEBUG "jffs2_flash_writev() to 0x%lx causes flush of wbuf at 0x%08x\n", (unsigned long)to, c->wbuf_ofs));
+                        D1(printk(KERN_DEBUG "jffs2_flash_writev() to 0x%lx "
+                                  "causes flush of wbuf at 0x%08x\n",
+                                  (unsigned long)to, c->wbuf_ofs));
                        ret = __jffs2_flush_wbuf(c, PAD_NOACCOUNT);
-                        if (ret) {
+                        if (ret)
-                                /* the underlying layer has to check wbuf_len to do the cleanup */
+                                goto outerr;
-                                D1(printk(KERN_WARNING "jffs2_flush_wbuf() called from jffs2_flash_writev() failed %d\n", ret));
-                                *retlen = 0;
-                                goto exit;
-                        }
                }
                /* set pointer to new block */
                c->wbuf_ofs = PAGE_DIV(to);
@@ -665,165 +778,70 @@ int jffs2_flash_writev(struct jffs2_sb_info *c, const struct kvec *invecs, unsig
        if (to != PAD(c->wbuf_ofs + c->wbuf_len)) {
                /* We're not writing immediately after the writebuffer. Bad. */
-                printk(KERN_CRIT "jffs2_flash_writev(): Non-contiguous write to %08lx\n", (unsigned long)to);
+                printk(KERN_CRIT "jffs2_flash_writev(): Non-contiguous write "
+                       "to %08lx\n", (unsigned long)to);
                if (c->wbuf_len)
                        printk(KERN_CRIT "wbuf was previously %08x-%08x\n",
-                                          c->wbuf_ofs, c->wbuf_ofs+c->wbuf_len);
+                               c->wbuf_ofs, c->wbuf_ofs+c->wbuf_len);
                BUG();
        }
-        /* Note outvecs[3] above. We know count is never greater than 2 */
+        /* adjust alignment offset */
-        if (count > 2) {
+        if (c->wbuf_len != PAGE_MOD(to)) {
-                printk(KERN_CRIT "jffs2_flash_writev(): count is %ld\n", count);
+                c->wbuf_len = PAGE_MOD(to);
-                BUG();
+                /* take care of alignment to next page */
-        }
+                if (!c->wbuf_len) {
+                        c->wbuf_len = c->wbuf_pagesize;
-        invec = 0;
+                        ret = __jffs2_flush_wbuf(c, NOPAD);
-        outvec = 0;
+                        if (ret)
+                                goto outerr;
-        /* Fill writebuffer first, if already in use */
-        if (c->wbuf_len) {
-                uint32_t invec_ofs = 0;
-                /* adjust alignment offset */
-                if (c->wbuf_len != PAGE_MOD(to)) {
-                        c->wbuf_len = PAGE_MOD(to);
-                        /* take care of alignment to next page */
-                        if (!c->wbuf_len)
-                                c->wbuf_len = c->wbuf_pagesize;
-                }
-                while(c->wbuf_len < c->wbuf_pagesize) {
-                        uint32_t thislen;
-                        if (invec == count)
-                                goto alldone;
-                        thislen = c->wbuf_pagesize - c->wbuf_len;
-                        if (thislen >= invecs[invec].iov_len)
-                                thislen = invecs[invec].iov_len;
-                        invec_ofs = thislen;
-                        memcpy(c->wbuf + c->wbuf_len, invecs[invec].iov_base, thislen);
-                        c->wbuf_len += thislen;
-                        donelen += thislen;
-                        /* Get next invec, if actual did not fill the buffer */
-                        if (c->wbuf_len < c->wbuf_pagesize)
-                                invec++;
-                }
-                /* write buffer is full, flush buffer */
-                ret = __jffs2_flush_wbuf(c, NOPAD);
-                if (ret) {
-                        /* the underlying layer has to check wbuf_len to do the cleanup */
-                        D1(printk(KERN_WARNING "jffs2_flush_wbuf() called from jffs2_flash_writev() failed %d\n", ret));
-                        /* Retlen zero to make sure our caller doesn't mark the space dirty.
-                           We've already done everything that's necessary */
-                        *retlen = 0;
-                        goto exit;
-                }
-                outvec_to += donelen;
-                c->wbuf_ofs = outvec_to;
-                /* All invecs done ? */
-                if (invec == count)
-                        goto alldone;
-                /* Set up the first outvec, containing the remainder of the
-                   invec we partially used */
-                if (invecs[invec].iov_len > invec_ofs) {
-                        outvecs[0].iov_base = invecs[invec].iov_base+invec_ofs;
-                        totlen = outvecs[0].iov_len = invecs[invec].iov_len-invec_ofs;
-                        if (totlen > c->wbuf_pagesize) {
-                                splitvec = outvec;
-                                split_ofs = outvecs[0].iov_len - PAGE_MOD(totlen);
-                        }
-                        outvec++;
-                }
-                invec++;
-        }
-        /* OK, now we've flushed the wbuf and the start of the bits
-           we have been asked to write, now to write the rest.... */
-        /* totlen holds the amount of data still to be written */
-        old_totlen = totlen;
-        for ( ; invec < count; invec++,outvec++ ) {
-                outvecs[outvec].iov_base = invecs[invec].iov_base;
-                totlen += outvecs[outvec].iov_len = invecs[invec].iov_len;
-                if (PAGE_DIV(totlen) != PAGE_DIV(old_totlen)) {
-                        splitvec = outvec;
-                        split_ofs = outvecs[outvec].iov_len - PAGE_MOD(totlen);
-                        old_totlen = totlen;
                }
        }
-        /* Now the outvecs array holds all the remaining data to write */
+        for (invec = 0; invec < count; invec++) {
-        /* Up to splitvec,split_ofs is to be written immediately. The rest
+                int vlen = invecs[invec].iov_len;
-           goes into the (now-empty) wbuf */
+                uint8_t *v = invecs[invec].iov_base;
-        if (splitvec != -1) {
-                uint32_t remainder;
-                remainder = outvecs[splitvec].iov_len - split_ofs;
-                outvecs[splitvec].iov_len = split_ofs;
-                /* We did cross a page boundary, so we write some now */
-                if (jffs2_cleanmarker_oob(c))
-                        ret = c->mtd->writev_ecc(c->mtd, outvecs, splitvec+1, outvec_to, &wbuf_retlen, NULL, c->oobinfo);
-                else
-                        ret = jffs2_flash_direct_writev(c, outvecs, splitvec+1, outvec_to, &wbuf_retlen);
-                if (ret < 0 || wbuf_retlen != PAGE_DIV(totlen)) {
+                wbuf_retlen = jffs2_fill_wbuf(c, v, vlen);
-                        /* At this point we have no problem,
-                           c->wbuf is empty. However refile nextblock to avoid
-                           writing again to same address.
-                        */
-                        struct jffs2_eraseblock *jeb;
-                        spin_lock(&c->erase_completion_lock);
+                if (c->wbuf_len == c->wbuf_pagesize) {
+                        ret = __jffs2_flush_wbuf(c, NOPAD);
-                        jeb = &c->blocks[outvec_to / c->sector_size];
+                        if (ret)
-                        jffs2_block_refile(c, jeb, REFILE_ANYWAY);
+                                goto outerr;
-                        *retlen = 0;
-                        spin_unlock(&c->erase_completion_lock);
-                        goto exit;
                }
+                vlen -= wbuf_retlen;
+                outvec_to += wbuf_retlen;
                donelen += wbuf_retlen;
-                c->wbuf_ofs = PAGE_DIV(outvec_to) + PAGE_DIV(totlen);
+                v += wbuf_retlen;
-                if (remainder) {
+                if (vlen >= c->wbuf_pagesize) {
-                        outvecs[splitvec].iov_base += split_ofs;
+                        ret = c->mtd->write(c->mtd, outvec_to, PAGE_DIV(vlen),
-                        outvecs[splitvec].iov_len = remainder;
+                                            &wbuf_retlen, v);
-                } else {
+                        if (ret < 0 || wbuf_retlen != PAGE_DIV(vlen))
-                        splitvec++;
+                                goto outfile;
+                        vlen -= wbuf_retlen;
+                        outvec_to += wbuf_retlen;
+                        c->wbuf_ofs = outvec_to;
+                        donelen += wbuf_retlen;
+                        v += wbuf_retlen;
                }
-        } else {
+                wbuf_retlen = jffs2_fill_wbuf(c, v, vlen);
-                splitvec = 0;
+                if (c->wbuf_len == c->wbuf_pagesize) {
-        }
+                        ret = __jffs2_flush_wbuf(c, NOPAD);
+                        if (ret)
-        /* Now splitvec points to the start of the bits we have to copy
+                                goto outerr;
-           into the wbuf */
+                }
-        wbuf_ptr = c->wbuf;
-        for ( ; splitvec < outvec; splitvec++) {
+                outvec_to += wbuf_retlen;
-                /* Don't copy the wbuf into itself */
+                donelen += wbuf_retlen;
-                if (outvecs[splitvec].iov_base == c->wbuf)
-                        continue;
-                memcpy(wbuf_ptr, outvecs[splitvec].iov_base, outvecs[splitvec].iov_len);
-                wbuf_ptr += outvecs[splitvec].iov_len;
-                donelen += outvecs[splitvec].iov_len;
        }
-        c->wbuf_len = wbuf_ptr - c->wbuf;
-        /* If there's a remainder in the wbuf and it's a non-GC write,
+        /*
-           remember that the wbuf affects this ino */
+         * If there's a remainder in the wbuf and it's a non-GC write,
-alldone:
+         * remember that the wbuf affects this ino
+         */
        *retlen = donelen;
        if (jffs2_sum_active()) {
@@ -836,8 +854,24 @@ alldone:
                jffs2_wbuf_dirties_inode(c, ino);
        ret = 0;
+        up_write(&c->wbuf_sem);
+        return ret;
-exit:
+outfile:
+        /*
+         * At this point we have no problem, c->wbuf is empty. However
+         * refile nextblock to avoid writing again to same address.
+         */
+        spin_lock(&c->erase_completion_lock);
+        jeb = &c->blocks[outvec_to / c->sector_size];
+        jffs2_block_refile(c, jeb, REFILE_ANYWAY);
+        spin_unlock(&c->erase_completion_lock);
+outerr:
+        *retlen = 0;
        up_write(&c->wbuf_sem);
        return ret;
 }
@@ -846,7 +880,8 @@ exit:
 *      This is the entry for flash write.
 *      Check whether we work on NAND flash; if so, build a kvec and write it via writev
 */
-int jffs2_flash_write(struct jffs2_sb_info *c, loff_t ofs, size_t len, size_t *retlen, const u_char *buf)
+int jffs2_flash_write(struct jffs2_sb_info *c, loff_t ofs, size_t len,
+                      size_t *retlen, const u_char *buf)
 {
        struct kvec vecs[1];
@@ -871,25 +906,23 @@ int jffs2_flash_read(struct jffs2_sb_info *c, loff_t ofs, size_t len, size_t *re
        /* Read flash */
        down_read(&c->wbuf_sem);
-        if (jffs2_cleanmarker_oob(c))
+        ret = c->mtd->read(c->mtd, ofs, len, retlen, buf);
-                ret = c->mtd->read_ecc(c->mtd, ofs, len, retlen, buf, NULL, c->oobinfo);
-        else
+        if ( (ret == -EBADMSG || ret == -EUCLEAN) && (*retlen == len) ) {
-                ret = c->mtd->read(c->mtd, ofs, len, retlen, buf);
+                if (ret == -EBADMSG)
+                        printk(KERN_WARNING "mtd->read(0x%zx bytes from 0x%llx)"
-        if ( (ret == -EBADMSG) && (*retlen == len) ) {
+                               " returned ECC error\n", len, ofs);
-                printk(KERN_WARNING "mtd->read(0x%zx bytes from 0x%llx) returned ECC error\n",
-                       len, ofs);
                /*
-                 * We have the raw data without ECC correction in the buffer, maybe
+                 * We have the raw data without ECC correction in the buffer,
-                 * we are lucky and all data or parts are correct. We check the node.
+                 * maybe we are lucky and all data or parts are correct. We
-                 * If data are corrupted node check will sort it out.
+                 * check the node.  If data are corrupted node check will sort
-                 * We keep this block, it will fail on write or erase and the we
+                 * it out.  We keep this block, it will fail on write or erase
-                 * mark it bad. Or should we do that now? But we should give him a chance.
+                 * and then we mark it bad. Or should we do that now? But we
-                 * Maybe we had a system crash or power loss before the ecc write or
+                 * should give him a chance.  Maybe we had a system crash or
-                 * a erase was completed.
+                 * power loss before the ECC write or an erase was completed.
                 * So we return success. :)
                 */
-                ret = 0;
+                ret = 0;
        }
        /* if no writebuffer available or write buffer empty, return */
@@ -911,7 +944,7 @@ int jffs2_flash_read(struct jffs2_sb_info *c, loff_t ofs, size_t len, size_t *re
                orbf = (c->wbuf_ofs - ofs);     /* offset in read buffer */
                if (orbf > len)                 /* is write beyond write buffer ? */
                        goto exit;
-                lwbf = len - orbf;              /* number of bytes to copy */
+                lwbf = len - orbf;              /* number of bytes to copy */
                if (lwbf > c->wbuf_len)
                        lwbf = c->wbuf_len;
        }
@@ -923,158 +956,159 @@ exit:
        return ret;
 }
+#define NR_OOB_SCAN_PAGES       4
 /*
- *      Check, if the out of band area is empty
+ * Check whether the out of band area is empty
+ *
+ * Reads NR_OOB_SCAN_PAGES * oobsize bytes of OOB data starting at
+ * jeb->offset into c->oobbuf and tests them for 0xFF.  With a non-zero
+ * mode, bytes fsdata_pos .. fsdata_pos + fsdata_len - 1 of the first
+ * oobsize bytes (the cleanmarker) are exempt from the test.
+ *
+ * Returns 0 if every checked byte is 0xFF, 1 if not, the read_oob()
+ * error code if the read fails, or -EIO on a short read.
 */
-int jffs2_check_oob_empty( struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, int mode)
+int jffs2_check_oob_empty(struct jffs2_sb_info *c,
+                          struct jffs2_eraseblock *jeb, int mode)
 {
-        unsigned char *buf;
+        int i, page, ret;
-        int     ret = 0;
+        int oobsize = c->mtd->oobsize;
-        int     i,len,page;
+        struct mtd_oob_ops ops;
-        size_t  retlen;
-        int     oob_size;
+        ops.len = NR_OOB_SCAN_PAGES * oobsize;
+        ops.ooblen = oobsize;
-        /* allocate a buffer for all oob data in this sector */
+        ops.oobbuf = c->oobbuf;
-        oob_size = c->mtd->oobsize;
+        ops.ooboffs = 0;
-        len = 4 * oob_size;
+        ops.datbuf = NULL;
-        buf = kmalloc(len, GFP_KERNEL);
+        ops.mode = MTD_OOB_PLACE;
-        if (!buf) {
-                printk(KERN_NOTICE "jffs2_check_oob_empty(): allocation of temporary data buffer for oob check failed\n");
+        /* Fetch the OOB bytes into c->oobbuf; no per-call allocation */
+        ret = c->mtd->read_oob(c->mtd, jeb->offset, &ops);
-                return -ENOMEM;
-        }
-        /*
-         * if mode = 0, we scan for a total empty oob area, else we have
-         * to take care of the cleanmarker in the first page of the block
-        */
-        ret = jffs2_flash_read_oob(c, jeb->offset, len , &retlen, buf);
        if (ret) {
-                D1(printk(KERN_WARNING "jffs2_check_oob_empty(): Read OOB failed %d for block at %08x\n", ret, jeb->offset));
+                D1(printk(KERN_WARNING "jffs2_check_oob_empty(): Read OOB "
-                goto out;
+                          "failed %d for block at %08x\n", ret, jeb->offset));
+                return ret;
        }
-        if (retlen < len) {
+        /* NOTE(review): ops.len is printed with %d below - confirm its type */
+        if (ops.retlen < ops.len) {
-                D1(printk(KERN_WARNING "jffs2_check_oob_empty(): Read OOB return short read "
+                D1(printk(KERN_WARNING "jffs2_check_oob_empty(): Read OOB "
-                          "(%zd bytes not %d) for block at %08x\n", retlen, len, jeb->offset));
+                          "returned short read (%zd bytes not %d) for block "
-                ret = -EIO;
+                          "at %08x\n", ops.retlen, ops.len, jeb->offset));
-                goto out;
+                return -EIO;
        }
        /* Special check for first page */
-        for(i = 0; i < oob_size ; i++) {
+        for(i = 0; i < oobsize ; i++) {
                /* Yeah, we know about the cleanmarker. */
                if (mode && i >= c->fsdata_pos &&
                    i < c->fsdata_pos + c->fsdata_len)
                        continue;
-                if (buf[i] != 0xFF) {
+                if (ops.oobbuf[i] != 0xFF) {
-                        D2(printk(KERN_DEBUG "Found %02x at %x in OOB for %08x\n",
+                        D2(printk(KERN_DEBUG "Found %02x at %x in OOB for "
-                                  buf[i], i, jeb->offset));
+                                  "%08x\n", ops.oobbuf[i], i, jeb->offset));
-                        ret = 1;
+                        return 1;
-                        goto out;
                }
        }
        /* we know, we are aligned :) */
+        /* Rest of the buffer, one long at a time: -1 means all bytes 0xFF */
-        for (page = oob_size; page < len; page += sizeof(long)) {
+        for (page = oobsize; page < ops.len; page += sizeof(long)) {
-                unsigned long dat = *(unsigned long *)(&buf[page]);
+                long dat = *(long *)(&ops.oobbuf[page]);
-                if(dat != -1) {
+                if(dat != -1)
-                        ret = 1;
+                        return 1;
-                        goto out;
-                }
        }
+        return 0;
-out:
-        kfree(buf);
-        return ret;
 }
 /*
-*       Scan for a valid cleanmarker and for bad blocks
+ * Scan for a valid cleanmarker and for bad blocks
+ *
+ * Returns 2 if block_isbad() reports the block at jeb->offset as bad,
+ * the read_oob() error code or -EIO (short read) if the OOB data cannot
+ * be read, 1 if the fsdata_len OOB bytes at fsdata_pos do not match a
+ * cleanmarker node, and 0 if they do.
-*       For virtual blocks (concatenated physical blocks) check the cleanmarker
+ */
-*       only in the first page of the first physical block, but scan for bad blocks in all
+int jffs2_check_nand_cleanmarker (struct jffs2_sb_info *c,
-*       physical blocks
+                                  struct jffs2_eraseblock *jeb)
-*/
-int jffs2_check_nand_cleanmarker (struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb)
 {
        struct jffs2_unknown_node n;
-        unsigned char buf[2 * NAND_MAX_OOBSIZE];
+        struct mtd_oob_ops ops;
-        unsigned char *p;
+        int oobsize = c->mtd->oobsize;
-        int ret, i, cnt, retval = 0;
+        unsigned char *p,*b;
-        size_t retlen, offset;
+        int i, ret;
-        int oob_size;
+        size_t offset = jeb->offset;
-        offset = jeb->offset;
+        /* Check first if the block is bad. */
-        oob_size = c->mtd->oobsize;
+        if (c->mtd->block_isbad(c->mtd, offset)) {
+                D1 (printk(KERN_WARNING "jffs2_check_nand_cleanmarker()"
-        /* Loop through the physical blocks */
+                           ": Bad block at %08x\n", jeb->offset));
-        for (cnt = 0; cnt < (c->sector_size / c->mtd->erasesize); cnt++) {
+                return 2;
-                /* Check first if the block is bad. */
+        }
-                if (c->mtd->block_isbad (c->mtd, offset)) {
-                        D1 (printk (KERN_WARNING "jffs2_check_nand_cleanmarker(): Bad block at %08x\n", jeb->offset));
-                        return 2;
-                }
-                /*
-                   *    We read oob data from page 0 and 1 of the block.
-                   *    page 0 contains cleanmarker and badblock info
-                   *    page 1 contains failure count of this block
-                 */
-                ret = c->mtd->read_oob (c->mtd, offset, oob_size << 1, &retlen, buf);
-                if (ret) {
+        /* Read oobsize bytes of OOB data at the block start into c->oobbuf */
+        ops.len = oobsize;
-                        D1 (printk (KERN_WARNING "jffs2_check_nand_cleanmarker(): Read OOB failed %d for block at %08x\n", ret, jeb->offset));
+        ops.ooblen = oobsize;
-                        return ret;
+        ops.oobbuf = c->oobbuf;
-                }
+        ops.ooboffs = 0;
-                if (retlen < (oob_size << 1)) {
+        ops.datbuf = NULL;
-                        D1 (printk (KERN_WARNING "jffs2_check_nand_cleanmarker(): Read OOB return short read (%zd bytes not %d) for block at %08x\n", retlen, oob_size << 1, jeb->offset));
+        ops.mode = MTD_OOB_PLACE;
-                        return -EIO;
-                }
-                /* Check cleanmarker only on the first physical block */
+        ret = c->mtd->read_oob(c->mtd, offset, &ops);
-                if (!cnt) {
+        if (ret) {
-                        n.magic = cpu_to_je16 (JFFS2_MAGIC_BITMASK);
+                D1 (printk(KERN_WARNING "jffs2_check_nand_cleanmarker(): "
-                        n.nodetype = cpu_to_je16 (JFFS2_NODETYPE_CLEANMARKER);
+                           "Read OOB failed %d for block at %08x\n",
-                        n.totlen = cpu_to_je32 (8);
+                           ret, jeb->offset));
-                        p = (unsigned char *) &n;
+                return ret;
+        }
-                        for (i = 0; i < c->fsdata_len; i++) {
+        if (ops.retlen < ops.len) {
-                                if (buf[c->fsdata_pos + i] != p[i]) {
+                D1 (printk (KERN_WARNING "jffs2_check_nand_cleanmarker(): "
-                                        retval = 1;
+                            "Read OOB return short read (%zd bytes not %d) "
-                                }
+                            "for block at %08x\n", ops.retlen, ops.len,
-                        }
+                            jeb->offset));
-                        D1(if (retval == 1) {
+                return -EIO;
-                                printk(KERN_WARNING "jffs2_check_nand_cleanmarker(): Cleanmarker node not detected in block at %08x\n", jeb->offset);
-                                printk(KERN_WARNING "OOB at %08x was ", offset);
-                                for (i=0; i < oob_size; i++) {
-                                        printk("%02x ", buf[i]);
-                                }
-                                printk("\n");
-                        })
-                }
-                offset += c->mtd->erasesize;
        }
-        return retval;
+        /* Build the expected cleanmarker and compare it byte by byte */
+        n.magic = cpu_to_je16 (JFFS2_MAGIC_BITMASK);
+        n.nodetype = cpu_to_je16 (JFFS2_NODETYPE_CLEANMARKER);
+        n.totlen = cpu_to_je32 (8);
+        p = (unsigned char *) &n;
+        b = c->oobbuf + c->fsdata_pos;
+        /* ret is 0 here (read succeeded); any mismatch turns it into 1 */
+        for (i = c->fsdata_len; i; i--) {
+                if (*b++ != *p++)
+                        ret = 1;
+        }
+        /* %08x takes jeb->offset; the size_t copy 'offset' needs %08zx */
+        D1(if (ret == 1) {
+                printk(KERN_WARNING "jffs2_check_nand_cleanmarker(): "
+                       "Cleanmarker node not detected in block at %08x\n",
+                       jeb->offset);
+                printk(KERN_WARNING "OOB at %08zx was ", offset);
+                for (i=0; i < oobsize; i++)
+                        printk("%02x ", c->oobbuf[i]);
+                printk("\n");
+        });
+        return ret;
 }
-int jffs2_write_nand_cleanmarker(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb)
+/*
+ * Write the first fsdata_len bytes of a cleanmarker node (magic,
+ * nodetype, totlen = 8) to the OOB area of the block at jeb->offset,
+ * at OOB offset fsdata_pos.
+ *
+ * Returns 0 on success, the write_oob() error code on failure, or -EIO
+ * if fewer bytes than requested were written.
+ */
+int jffs2_write_nand_cleanmarker(struct jffs2_sb_info *c,
+                                 struct jffs2_eraseblock *jeb)
 {
-        struct  jffs2_unknown_node n;
+        struct jffs2_unknown_node n;
-        int     ret;
+        int     ret;
-        size_t  retlen;
+        struct mtd_oob_ops ops;
        n.magic = cpu_to_je16(JFFS2_MAGIC_BITMASK);
        n.nodetype = cpu_to_je16(JFFS2_NODETYPE_CLEANMARKER);
        n.totlen = cpu_to_je32(8);
-        ret = jffs2_flash_write_oob(c, jeb->offset + c->fsdata_pos, c->fsdata_len, &retlen, (unsigned char *)&n);
+        /* OOB-only write: no data buffer, the node itself is the OOB payload */
+        ops.len = c->fsdata_len;
+        ops.ooblen = c->fsdata_len;
+        ops.oobbuf = (uint8_t *)&n;
+        ops.ooboffs = c->fsdata_pos;
+        ops.datbuf = NULL;
+        ops.mode = MTD_OOB_PLACE;
+        ret = c->mtd->write_oob(c->mtd, jeb->offset, &ops);
        if (ret) {
-                D1(printk(KERN_WARNING "jffs2_write_nand_cleanmarker(): Write failed for block at %08x: error %d\n", jeb->offset, ret));
+                D1(printk(KERN_WARNING "jffs2_write_nand_cleanmarker(): "
+                          "Write failed for block at %08x: error %d\n",
+                          jeb->offset, ret));
                return ret;
        }
-        if (retlen != c->fsdata_len) {
+        /* A short write is an error even though write_oob() returned 0 */
+        if (ops.retlen != ops.len) {
-                D1(printk(KERN_WARNING "jffs2_write_nand_cleanmarker(): Short write for block at %08x: %zd not %d\n", jeb->offset, retlen, c->fsdata_len));
+                D1(printk(KERN_WARNING "jffs2_write_nand_cleanmarker(): "
-                return ret;
+                          "Short write for block at %08x: %zd not %d\n",
+                          jeb->offset, ops.retlen, ops.len));
+                return -EIO;
        }
        return 0;
 }
@@ -1108,18 +1142,9 @@ int jffs2_write_nand_badblock(struct jffs2_sb_info *c, struct jffs2_eraseblock *
        return 1;
 }
-#define NAND_JFFS2_OOB16_FSDALEN        8
-static struct nand_oobinfo jffs2_oobinfo_docecc = {
-        .useecc = MTD_NANDECC_PLACE,
-        .eccbytes = 6,
-        .eccpos = {0,1,2,3,4,5}
-};
 static int jffs2_nand_set_oobinfo(struct jffs2_sb_info *c)
 {
-        struct nand_oobinfo *oinfo = &c->mtd->oobinfo;
+        struct nand_ecclayout *oinfo = c->mtd->ecclayout;
        /* Do this only, if we have an oob buffer */
        if (!c->mtd->oobsize)
@@ -1129,33 +1154,23 @@ static int jffs2_nand_set_oobinfo(struct jffs2_sb_info *c)
        c->cleanmarker_size = 0;
        /* Should we use autoplacement ? */
-        if (oinfo && oinfo->useecc == MTD_NANDECC_AUTOPLACE) {
+        if (!oinfo) {
-                D1(printk(KERN_DEBUG "JFFS2 using autoplace on NAND\n"));
+                D1(printk(KERN_DEBUG "JFFS2 on NAND. No autoplacment info found\n"));
-                /* Get the position of the free bytes */
+                return -EINVAL;
-                if (!oinfo->oobfree[0][1]) {
+        }
-                        printk (KERN_WARNING "jffs2_nand_set_oobinfo(): Eeep. Autoplacement selected and no empty space in oob\n");
-                        return -ENOSPC;
-                }
-                c->fsdata_pos = oinfo->oobfree[0][0];
-                c->fsdata_len = oinfo->oobfree[0][1];
-                if (c->fsdata_len > 8)
-                        c->fsdata_len = 8;
-        } else {
-                /* This is just a legacy fallback and should go away soon */
-                switch(c->mtd->ecctype) {
-                case MTD_ECC_RS_DiskOnChip:
-                        printk(KERN_WARNING "JFFS2 using DiskOnChip hardware ECC without autoplacement. Fix it!\n");
-                        c->oobinfo = &jffs2_oobinfo_docecc;
-                        c->fsdata_pos = 6;
-                        c->fsdata_len = NAND_JFFS2_OOB16_FSDALEN;
-                        c->badblock_pos = 15;
-                        break;
-                default:
+        D1(printk(KERN_DEBUG "JFFS2 using autoplace on NAND\n"));
-                        D1(printk(KERN_DEBUG "JFFS2 on NAND. No autoplacment info found\n"));
+        /* Get the position of the free bytes */
-                        return -EINVAL;
+        if (!oinfo->oobfree[0].length) {
-                }
+                printk (KERN_WARNING "jffs2_nand_set_oobinfo(): Eeep."
+                        " Autoplacement selected and no empty space in oob\n");
+                return -ENOSPC;
        }
+        c->fsdata_pos = oinfo->oobfree[0].offset;
+        c->fsdata_len = oinfo->oobfree[0].length;
+        if (c->fsdata_len > 8)
+                c->fsdata_len = 8;
        return 0;
 }
@@ -1165,13 +1180,17 @@ int jffs2_nand_flash_setup(struct jffs2_sb_info *c)
        /* Initialise write buffer */
        init_rwsem(&c->wbuf_sem);
-        c->wbuf_pagesize = c->mtd->oobblock;
+        c->wbuf_pagesize = c->mtd->writesize;
        c->wbuf_ofs = 0xFFFFFFFF;
        c->wbuf = kmalloc(c->wbuf_pagesize, GFP_KERNEL);
        if (!c->wbuf)
                return -ENOMEM;
+        c->oobbuf = kmalloc(NR_OOB_SCAN_PAGES * c->mtd->oobsize, GFP_KERNEL);
+        if (!c->oobbuf)
+                return -ENOMEM;
        res = jffs2_nand_set_oobinfo(c);
 #ifdef BREAKME
@@ -1189,6 +1208,7 @@ int jffs2_nand_flash_setup(struct jffs2_sb_info *c)
 void jffs2_nand_flash_cleanup(struct jffs2_sb_info *c)
 {
        kfree(c->wbuf);
+        kfree(c->oobbuf);
 }
 int jffs2_dataflash_setup(struct jffs2_sb_info *c) {
@@ -1236,33 +1256,14 @@ void jffs2_dataflash_cleanup(struct jffs2_sb_info *c) {
        kfree(c->wbuf);
 }
-int jffs2_nor_ecc_flash_setup(struct jffs2_sb_info *c) {
-        /* Cleanmarker is actually larger on the flashes */
-        c->cleanmarker_size = 16;
-        /* Initialize write buffer */
-        init_rwsem(&c->wbuf_sem);
-        c->wbuf_pagesize = c->mtd->eccsize;
-        c->wbuf_ofs = 0xFFFFFFFF;
-        c->wbuf = kmalloc(c->wbuf_pagesize, GFP_KERNEL);
-        if (!c->wbuf)
-                return -ENOMEM;
-        return 0;
-}
-void jffs2_nor_ecc_flash_cleanup(struct jffs2_sb_info *c) {
-        kfree(c->wbuf);
-}
 int jffs2_nor_wbuf_flash_setup(struct jffs2_sb_info *c) {
-        /* Cleanmarker currently occupies a whole programming region */
+        /* Cleanmarker currently occupies whole programming regions,
-        c->cleanmarker_size = MTD_PROGREGION_SIZE(c->mtd);
+         * either one or 2 for 8Byte STMicro flashes. */
+        c->cleanmarker_size = max(16u, c->mtd->writesize);
        /* Initialize write buffer */
        init_rwsem(&c->wbuf_sem);
-        c->wbuf_pagesize = MTD_PROGREGION_SIZE(c->mtd);
+        c->wbuf_pagesize = c->mtd->writesize;
        c->wbuf_ofs = 0xFFFFFFFF;
        c->wbuf = kmalloc(c->wbuf_pagesize, GFP_KERNEL);
diff --git a/fs/jffs2/write.c b/fs/jffs2/write.c
index 1342f0158e9b..67176792e138 100644
--- a/fs/jffs2/write.c
+++ b/fs/jffs2/write.c
@@ -37,7 +37,6 @@ int jffs2_do_new_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f, uint
        f->inocache->nodes = (struct jffs2_raw_node_ref *)f->inocache;
        f->inocache->state = INO_STATE_PRESENT;
        jffs2_add_ino_cache(c, f->inocache);
        D1(printk(KERN_DEBUG "jffs2_do_new_inode(): Assigned ino# %d\n", f->inocache->ino));
        ri->ino = cpu_to_je32(f->inocache->ino);
@@ -57,12 +56,14 @@ int jffs2_do_new_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f, uint
 /* jffs2_write_dnode - given a raw_inode, allocate a full_dnode for it,
   write it to the flash, link it into the existing inode/fragment list */
-struct jffs2_full_dnode *jffs2_write_dnode(struct jffs2_sb_info *c, struct jffs2_inode_info *f, struct jffs2_raw_inode *ri, const unsigned char *data, uint32_t datalen, uint32_t flash_ofs, int alloc_mode)
+struct jffs2_full_dnode *jffs2_write_dnode(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
+                                           struct jffs2_raw_inode *ri, const unsigned char *data,
+                                           uint32_t datalen, int alloc_mode)
 {
-        struct jffs2_raw_node_ref *raw;
        struct jffs2_full_dnode *fn;
        size_t retlen;
+        uint32_t flash_ofs;
        struct kvec vecs[2];
        int ret;
        int retried = 0;
@@ -78,34 +79,21 @@ struct jffs2_full_dnode *jffs2_write_dnode(struct jffs2_sb_info *c, struct jffs2
        vecs[1].iov_base = (unsigned char *)data;
        vecs[1].iov_len = datalen;
-        jffs2_dbg_prewrite_paranoia_check(c, flash_ofs, vecs[0].iov_len + vecs[1].iov_len);
        if (je32_to_cpu(ri->totlen) != sizeof(*ri) + datalen) {
                printk(KERN_WARNING "jffs2_write_dnode: ri->totlen (0x%08x) != sizeof(*ri) (0x%08zx) + datalen (0x%08x)\n", je32_to_cpu(ri->totlen), sizeof(*ri), datalen);
        }
-        raw = jffs2_alloc_raw_node_ref();
-        if (!raw)
-                return ERR_PTR(-ENOMEM);
        fn = jffs2_alloc_full_dnode();
-        if (!fn) {
+        if (!fn)
-                jffs2_free_raw_node_ref(raw);
                return ERR_PTR(-ENOMEM);
-        }
-        fn->ofs = je32_to_cpu(ri->offset);
-        fn->size = je32_to_cpu(ri->dsize);
-        fn->frags = 0;
        /* check number of valid vecs */
        if (!datalen || !data)
                cnt = 1;
 retry:
-        fn->raw = raw;
+        flash_ofs = write_ofs(c);
-        raw->flash_offset = flash_ofs;
+        jffs2_dbg_prewrite_paranoia_check(c, flash_ofs, vecs[0].iov_len + vecs[1].iov_len);
-        raw->__totlen = PAD(sizeof(*ri)+datalen);
-        raw->next_phys = NULL;
        if ((alloc_mode!=ALLOC_GC) && (je32_to_cpu(ri->version) < f->highest_version)) {
                BUG_ON(!retried);
@@ -125,22 +113,16 @@ struct jffs2_full_dnode *jffs2_write_dnode(struct jffs2_sb_info *c, struct jffs2
                /* Mark the space as dirtied */
                if (retlen) {
-                        /* Doesn't belong to any inode */
-                        raw->next_in_ino = NULL;
                        /* Don't change raw->size to match retlen. We may have
                           written the node header already, and only the data will
                           seem corrupted, in which case the scan would skip over
                           any node we write before the original intended end of
                           this node */
-                        raw->flash_offset |= REF_OBSOLETE;
+                        jffs2_add_physical_node_ref(c, flash_ofs | REF_OBSOLETE, PAD(sizeof(*ri)+datalen), NULL);
-                        jffs2_add_physical_node_ref(c, raw);
-                        jffs2_mark_node_obsolete(c, raw);
                } else {
-                        printk(KERN_NOTICE "Not marking the space at 0x%08x as dirty because the flash driver returned retlen zero\n", raw->flash_offset);
+                        printk(KERN_NOTICE "Not marking the space at 0x%08x as dirty because the flash driver returned retlen zero\n", flash_ofs);
-                        jffs2_free_raw_node_ref(raw);
                }
-                if (!retried && alloc_mode != ALLOC_NORETRY && (raw = jffs2_alloc_raw_node_ref())) {
+                if (!retried && alloc_mode != ALLOC_NORETRY) {
                        /* Try to reallocate space and retry */
                        uint32_t dummy;
                        struct jffs2_eraseblock *jeb = &c->blocks[flash_ofs / c->sector_size];
@@ -153,19 +135,20 @@ struct jffs2_full_dnode *jffs2_write_dnode(struct jffs2_sb_info *c, struct jffs2
                        jffs2_dbg_acct_paranoia_check(c, jeb);
                        if (alloc_mode == ALLOC_GC) {
-                                ret = jffs2_reserve_space_gc(c, sizeof(*ri) + datalen, &flash_ofs,
+                                ret = jffs2_reserve_space_gc(c, sizeof(*ri) + datalen, &dummy,
-                                                        &dummy, JFFS2_SUMMARY_INODE_SIZE);
+                                                             JFFS2_SUMMARY_INODE_SIZE);
                        } else {
                                /* Locking pain */
                                up(&f->sem);
                                jffs2_complete_reservation(c);
-                                ret = jffs2_reserve_space(c, sizeof(*ri) + datalen, &flash_ofs,
+                                ret = jffs2_reserve_space(c, sizeof(*ri) + datalen, &dummy,
-                                                        &dummy, alloc_mode, JFFS2_SUMMARY_INODE_SIZE);
+                                                          alloc_mode, JFFS2_SUMMARY_INODE_SIZE);
                                down(&f->sem);
                        }
                        if (!ret) {
+                                flash_ofs = write_ofs(c);
                                D1(printk(KERN_DEBUG "Allocated space at 0x%08x to retry failed write.\n", flash_ofs));
                                jffs2_dbg_acct_sanity_check(c,jeb);
@@ -174,7 +157,6 @@ struct jffs2_full_dnode *jffs2_write_dnode(struct jffs2_sb_info *c, struct jffs2
                                goto retry;
                        }
                        D1(printk(KERN_DEBUG "Failed to allocate space to retry failed write: %d!\n", ret));
-                        jffs2_free_raw_node_ref(raw);
                }
                /* Release the full_dnode which is now useless, and return */
                jffs2_free_full_dnode(fn);
@@ -188,20 +170,17 @@ struct jffs2_full_dnode *jffs2_write_dnode(struct jffs2_sb_info *c, struct jffs2
        if ((je32_to_cpu(ri->dsize) >= PAGE_CACHE_SIZE) ||
            ( ((je32_to_cpu(ri->offset)&(PAGE_CACHE_SIZE-1))==0) &&
              (je32_to_cpu(ri->dsize)+je32_to_cpu(ri->offset) ==  je32_to_cpu(ri->isize)))) {
-                raw->flash_offset |= REF_PRISTINE;
+                flash_ofs |= REF_PRISTINE;
        } else {
-                raw->flash_offset |= REF_NORMAL;
+                flash_ofs |= REF_NORMAL;
        }
-        jffs2_add_physical_node_ref(c, raw);
+        fn->raw = jffs2_add_physical_node_ref(c, flash_ofs, PAD(sizeof(*ri)+datalen), f->inocache);
+        fn->ofs = je32_to_cpu(ri->offset);
-        /* Link into per-inode list */
+        fn->size = je32_to_cpu(ri->dsize);
-        spin_lock(&c->erase_completion_lock);
+        fn->frags = 0;
-        raw->next_in_ino = f->inocache->nodes;
-        f->inocache->nodes = raw;
-        spin_unlock(&c->erase_completion_lock);
        D1(printk(KERN_DEBUG "jffs2_write_dnode wrote node at 0x%08x(%d) with dsize 0x%x, csize 0x%x, node_crc 0x%08x, data_crc 0x%08x, totlen 0x%08x\n",
-                  flash_ofs, ref_flags(raw), je32_to_cpu(ri->dsize),
+                  flash_ofs & ~3, flash_ofs & 3, je32_to_cpu(ri->dsize),
                  je32_to_cpu(ri->csize), je32_to_cpu(ri->node_crc),
                  je32_to_cpu(ri->data_crc), je32_to_cpu(ri->totlen)));
@@ -212,12 +191,14 @@ struct jffs2_full_dnode *jffs2_write_dnode(struct jffs2_sb_info *c, struct jffs2
        return fn;
 }
-struct jffs2_full_dirent *jffs2_write_dirent(struct jffs2_sb_info *c, struct jffs2_inode_info *f, struct jffs2_raw_dirent *rd, const unsigned char *name, uint32_t namelen, uint32_t flash_ofs, int alloc_mode)
+struct jffs2_full_dirent *jffs2_write_dirent(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
+                                             struct jffs2_raw_dirent *rd, const unsigned char *name,
+                                             uint32_t namelen, int alloc_mode)
 {
-        struct jffs2_raw_node_ref *raw;
        struct jffs2_full_dirent *fd;
        size_t retlen;
        struct kvec vecs[2];
+        uint32_t flash_ofs;
        int retried = 0;
        int ret;
@@ -228,26 +209,16 @@ struct jffs2_full_dirent *jffs2_write_dirent(struct jffs2_sb_info *c, struct jff
        D1(if(je32_to_cpu(rd->hdr_crc) != crc32(0, rd, sizeof(struct jffs2_unknown_node)-4)) {
                printk(KERN_CRIT "Eep. CRC not correct in jffs2_write_dirent()\n");
                BUG();
-        }
+           });
-           );
        vecs[0].iov_base = rd;
        vecs[0].iov_len = sizeof(*rd);
        vecs[1].iov_base = (unsigned char *)name;
        vecs[1].iov_len = namelen;
-        jffs2_dbg_prewrite_paranoia_check(c, flash_ofs, vecs[0].iov_len + vecs[1].iov_len);
-        raw = jffs2_alloc_raw_node_ref();
-        if (!raw)
-                return ERR_PTR(-ENOMEM);
        fd = jffs2_alloc_full_dirent(namelen+1);
-        if (!fd) {
+        if (!fd)
-                jffs2_free_raw_node_ref(raw);
                return ERR_PTR(-ENOMEM);
-        }
        fd->version = je32_to_cpu(rd->version);
        fd->ino = je32_to_cpu(rd->ino);
@@ -257,11 +228,9 @@ struct jffs2_full_dirent *jffs2_write_dirent(struct jffs2_sb_info *c, struct jff
        fd->name[namelen]=0;
 retry:
-        fd->raw = raw;
+        flash_ofs = write_ofs(c);
-        raw->flash_offset = flash_ofs;
+        jffs2_dbg_prewrite_paranoia_check(c, flash_ofs, vecs[0].iov_len + vecs[1].iov_len);
-        raw->__totlen = PAD(sizeof(*rd)+namelen);
-        raw->next_phys = NULL;
        if ((alloc_mode!=ALLOC_GC) && (je32_to_cpu(rd->version) < f->highest_version)) {
                BUG_ON(!retried);
@@ -280,15 +249,11 @@ struct jffs2_full_dirent *jffs2_write_dirent(struct jffs2_sb_info *c, struct jff
                               sizeof(*rd)+namelen, flash_ofs, ret, retlen);
                /* Mark the space as dirtied */
                if (retlen) {
-                        raw->next_in_ino = NULL;
+                        jffs2_add_physical_node_ref(c, flash_ofs | REF_OBSOLETE, PAD(sizeof(*rd)+namelen), NULL);
-                        raw->flash_offset |= REF_OBSOLETE;
-                        jffs2_add_physical_node_ref(c, raw);
-                        jffs2_mark_node_obsolete(c, raw);
                } else {
-                        printk(KERN_NOTICE "Not marking the space at 0x%08x as dirty because the flash driver returned retlen zero\n", raw->flash_offset);
+                        printk(KERN_NOTICE "Not marking the space at 0x%08x as dirty because the flash driver returned retlen zero\n", flash_ofs);
-                        jffs2_free_raw_node_ref(raw);
                }
-                if (!retried && (raw = jffs2_alloc_raw_node_ref())) {
+                if (!retried) {
                        /* Try to reallocate space and retry */
                        uint32_t dummy;
                        struct jffs2_eraseblock *jeb = &c->blocks[flash_ofs / c->sector_size];
@@ -301,39 +266,33 @@ struct jffs2_full_dirent *jffs2_write_dirent(struct jffs2_sb_info *c, struct jff
                        jffs2_dbg_acct_paranoia_check(c, jeb);
                        if (alloc_mode == ALLOC_GC) {
-                                ret = jffs2_reserve_space_gc(c, sizeof(*rd) + namelen, &flash_ofs,
+                                ret = jffs2_reserve_space_gc(c, sizeof(*rd) + namelen, &dummy,
-                                                        &dummy, JFFS2_SUMMARY_DIRENT_SIZE(namelen));
+                                                             JFFS2_SUMMARY_DIRENT_SIZE(namelen));
                        } else {
                                /* Locking pain */
                                up(&f->sem);
                                jffs2_complete_reservation(c);
-                                ret = jffs2_reserve_space(c, sizeof(*rd) + namelen, &flash_ofs,
+                                ret = jffs2_reserve_space(c, sizeof(*rd) + namelen, &dummy,
-                                                        &dummy, alloc_mode, JFFS2_SUMMARY_DIRENT_SIZE(namelen));
+                                                          alloc_mode, JFFS2_SUMMARY_DIRENT_SIZE(namelen));
                                down(&f->sem);
                        }
                        if (!ret) {
+                                flash_ofs = write_ofs(c);
                                D1(printk(KERN_DEBUG "Allocated space at 0x%08x to retry failed write.\n", flash_ofs));
                                jffs2_dbg_acct_sanity_check(c,jeb);
                                jffs2_dbg_acct_paranoia_check(c, jeb);
                                goto retry;
                        }
                        D1(printk(KERN_DEBUG "Failed to allocate space to retry failed write: %d!\n", ret));
-                        jffs2_free_raw_node_ref(raw);
                }
                /* Release the full_dnode which is now useless, and return */
                jffs2_free_full_dirent(fd);
                return ERR_PTR(ret?ret:-EIO);
        }
        /* Mark the space used */
-        raw->flash_offset |= REF_PRISTINE;
+        fd->raw = jffs2_add_physical_node_ref(c, flash_ofs | REF_PRISTINE, PAD(sizeof(*rd)+namelen), f->inocache);
-        jffs2_add_physical_node_ref(c, raw);
-        spin_lock(&c->erase_completion_lock);
-        raw->next_in_ino = f->inocache->nodes;
-        f->inocache->nodes = raw;
-        spin_unlock(&c->erase_completion_lock);
        if (retried) {
                jffs2_dbg_acct_sanity_check(c,NULL);
@@ -359,14 +318,14 @@ int jffs2_write_inode_range(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
                struct jffs2_full_dnode *fn;
                unsigned char *comprbuf = NULL;
                uint16_t comprtype = JFFS2_COMPR_NONE;
-                uint32_t phys_ofs, alloclen;
+                uint32_t alloclen;
                uint32_t datalen, cdatalen;
                int retried = 0;
        retry:
                D2(printk(KERN_DEBUG "jffs2_commit_write() loop: 0x%x to write to 0x%x\n", writelen, offset));
-                ret = jffs2_reserve_space(c, sizeof(*ri) + JFFS2_MIN_DATA_LEN, &phys_ofs,
+                ret = jffs2_reserve_space(c, sizeof(*ri) + JFFS2_MIN_DATA_LEN,
                                        &alloclen, ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE);
                if (ret) {
                        D1(printk(KERN_DEBUG "jffs2_reserve_space returned %d\n", ret));
@@ -394,7 +353,7 @@ int jffs2_write_inode_range(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
                ri->node_crc = cpu_to_je32(crc32(0, ri, sizeof(*ri)-8));
                ri->data_crc = cpu_to_je32(crc32(0, comprbuf, cdatalen));
-                fn = jffs2_write_dnode(c, f, ri, comprbuf, cdatalen, phys_ofs, ALLOC_NORETRY);
+                fn = jffs2_write_dnode(c, f, ri, comprbuf, cdatalen, ALLOC_NORETRY);
                jffs2_free_comprbuf(comprbuf, buf);
@@ -448,13 +407,13 @@ int jffs2_do_create(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, str
        struct jffs2_raw_dirent *rd;
        struct jffs2_full_dnode *fn;
        struct jffs2_full_dirent *fd;
-        uint32_t alloclen, phys_ofs;
+        uint32_t alloclen;
        int ret;
        /* Try to reserve enough space for both node and dirent.
         * Just the node will do for now, though
         */
-        ret = jffs2_reserve_space(c, sizeof(*ri), &phys_ofs, &alloclen, ALLOC_NORMAL,
+        ret = jffs2_reserve_space(c, sizeof(*ri), &alloclen, ALLOC_NORMAL,
                                JFFS2_SUMMARY_INODE_SIZE);
        D1(printk(KERN_DEBUG "jffs2_do_create(): reserved 0x%x bytes\n", alloclen));
        if (ret) {
@@ -465,7 +424,7 @@ int jffs2_do_create(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, str
        ri->data_crc = cpu_to_je32(0);
        ri->node_crc = cpu_to_je32(crc32(0, ri, sizeof(*ri)-8));
-        fn = jffs2_write_dnode(c, f, ri, NULL, 0, phys_ofs, ALLOC_NORMAL);
+        fn = jffs2_write_dnode(c, f, ri, NULL, 0, ALLOC_NORMAL);
        D1(printk(KERN_DEBUG "jffs2_do_create created file with mode 0x%x\n",
                  jemode_to_cpu(ri->mode)));
@@ -484,7 +443,7 @@ int jffs2_do_create(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, str
        up(&f->sem);
        jffs2_complete_reservation(c);
-        ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &phys_ofs, &alloclen,
+        ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen,
                                ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen));
        if (ret) {
@@ -516,7 +475,7 @@ int jffs2_do_create(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, str
        rd->node_crc = cpu_to_je32(crc32(0, rd, sizeof(*rd)-8));
        rd->name_crc = cpu_to_je32(crc32(0, name, namelen));
-        fd = jffs2_write_dirent(c, dir_f, rd, name, namelen, phys_ofs, ALLOC_NORMAL);
+        fd = jffs2_write_dirent(c, dir_f, rd, name, namelen, ALLOC_NORMAL);
        jffs2_free_raw_dirent(rd);
@@ -545,7 +504,7 @@ int jffs2_do_unlink(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f,
 {
        struct jffs2_raw_dirent *rd;
        struct jffs2_full_dirent *fd;
-        uint32_t alloclen, phys_ofs;
+        uint32_t alloclen;
        int ret;
        if (1 /* alternative branch needs testing */ ||
@@ -556,7 +515,7 @@ int jffs2_do_unlink(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f,
                if (!rd)
                        return -ENOMEM;
-                ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &phys_ofs, &alloclen,
+                ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen,
                                        ALLOC_DELETION, JFFS2_SUMMARY_DIRENT_SIZE(namelen));
                if (ret) {
                        jffs2_free_raw_dirent(rd);
@@ -580,7 +539,7 @@ int jffs2_do_unlink(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f,
                rd->node_crc = cpu_to_je32(crc32(0, rd, sizeof(*rd)-8));
                rd->name_crc = cpu_to_je32(crc32(0, name, namelen));
-                fd = jffs2_write_dirent(c, dir_f, rd, name, namelen, phys_ofs, ALLOC_DELETION);
+                fd = jffs2_write_dirent(c, dir_f, rd, name, namelen, ALLOC_DELETION);
                jffs2_free_raw_dirent(rd);
@@ -659,14 +618,14 @@ int jffs2_do_link (struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, uint
 {
        struct jffs2_raw_dirent *rd;
        struct jffs2_full_dirent *fd;
-        uint32_t alloclen, phys_ofs;
+        uint32_t alloclen;
        int ret;
        rd = jffs2_alloc_raw_dirent();
        if (!rd)
                return -ENOMEM;
-        ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &phys_ofs, &alloclen,
+        ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen,
                                ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen));
        if (ret) {
                jffs2_free_raw_dirent(rd);
@@ -692,7 +651,7 @@ int jffs2_do_link (struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, uint
        rd->node_crc = cpu_to_je32(crc32(0, rd, sizeof(*rd)-8));
        rd->name_crc = cpu_to_je32(crc32(0, name, namelen));
-        fd = jffs2_write_dirent(c, dir_f, rd, name, namelen, phys_ofs, ALLOC_NORMAL);
+        fd = jffs2_write_dirent(c, dir_f, rd, name, namelen, ALLOC_NORMAL);
        jffs2_free_raw_dirent(rd);
diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c
new file mode 100644
index 000000000000..18e66dbf23b4
--- /dev/null
+++ b/fs/jffs2/xattr.c
@@ -0,0 +1,1326 @@
+/*
+ * JFFS2 -- Journalling Flash File System, Version 2.
+ *
+ * Copyright (C) 2006  NEC Corporation
+ *
+ * Created by KaiGai Kohei <kaigai@ak.jp.nec.com>
+ *
+ * For licensing information, see the file 'LICENCE' in this directory.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/time.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/crc32.h>
+#include <linux/jffs2.h>
+#include <linux/xattr.h>
+#include <linux/mtd/mtd.h>
+#include "nodelist.h"
+/* -------- xdatum related functions ----------------
+ * xattr_datum_hashkey(xprefix, xname, xvalue, xsize)
+ *   is used to calculate xdatum hashkey. The remainder of hashkey modulo XATTRINDEX_HASHSIZE is
+ *   the index of the xattr name/value pair cache (c->xattrindex).
+ * is_xattr_datum_unchecked(c, xd)
+ *   returns 1, if xdatum contains any unchecked raw nodes. if all raw nodes are not
+ *   unchecked, it returns 0.
+ * unload_xattr_datum(c, xd)
+ *   is used to release xattr name/value pair and detach from c->xattrindex.
+ * reclaim_xattr_datum(c)
+ *   is used to reclaim xattr name/value pairs on the xattr name/value pair cache when
+ *   memory usage by cache is over c->xdatum_mem_threshold. Currently, this threshold
+ *   is hard coded as 32KiB.
+ * do_verify_xattr_datum(c, xd)
+ *   is used to load the xdatum information without name/value pair from the medium.
+ *   It's necessary once, because that information is not collected during the mounting
+ *   process when EBS is enabled.
+ *   0 will be returned on success. A negative return value means recoverable error, and
+ *   a positive return value means unrecoverable error. Thus, the caller must remove this
+ *   xdatum and xref when it returns a positive value.
+ * do_load_xattr_datum(c, xd)
+ *   is used to load name/value pair from the medium.
+ *   The meanings of return value is same as do_verify_xattr_datum().
+ * load_xattr_datum(c, xd)
+ *   is a wrapper of do_verify_xattr_datum() and do_load_xattr_datum().
+ *   If xd needs do_verify_xattr_datum() first, it is called before calling
+ *   do_load_xattr_datum(). The meaning of the return value is the same as for do_verify_xattr_datum().
+ * save_xattr_datum(c, xd)
+ *   is used to write xdatum to medium. xd->version will be incremented.
+ * create_xattr_datum(c, xprefix, xname, xvalue, xsize)
+ *   is used to create new xdatum and write to medium.
+ * delete_xattr_datum(c, xd)
+ *   is used to delete a xdatum. It marks xd JFFS2_XFLAGS_DEAD, and allows
+ *   GC to reclaim those physical nodes.
+ * -------------------------------------------------- */
+/* Build the xdatum hash key: XOR of the CRC32 of the NUL-terminated name and
+ * the CRC32 of the value (xsize bytes), both seeded with xprefix. */
+static uint32_t xattr_datum_hashkey(int xprefix, const char *xname, const char *xvalue, int xsize)
+{
+        int name_len = strlen(xname);
+        uint32_t name_crc = crc32(xprefix, xname, name_len);
+        uint32_t value_crc = crc32(xprefix, xvalue, xsize);
+        return name_crc ^ value_crc;
+}
+/* Walk the raw node refs chained from xd->node (the chain ends when it points
+ * back at xd itself) and report whether any of them is still REF_UNCHECKED.
+ * Returns 1 if so, 0 otherwise. The walk runs under erase_completion_lock. */
+static int is_xattr_datum_unchecked(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd)
+{
+        struct jffs2_raw_node_ref *ref;
+        int found = 0;
+        spin_lock(&c->erase_completion_lock);
+        ref = xd->node;
+        while (!found && ref != (void *)xd) {
+                if (ref_flags(ref) == REF_UNCHECKED)
+                        found = 1;
+                else
+                        ref = ref->next_in_ino;
+        }
+        spin_unlock(&c->erase_completion_lock);
+        return found;
+}
+/* Drop the cached name/value pair of xd: free xd->xname (if loaded), subtract
+ * its size from c->xdatum_mem_usage, unlink xd from its xindex list and clear
+ * the cache-related fields. */
+static void unload_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd)
+{
+        /* must be called under down_write(xattr_sem) */
+        D1(dbg_xattr("%s: xid=%u, version=%u\n", __FUNCTION__, xd->xid, xd->version));
+        if (xd->xname != NULL) {
+                kfree(xd->xname);
+                c->xdatum_mem_usage -= (xd->name_len + 1 + xd->value_len);
+        }
+        xd->xname = NULL;
+        xd->xvalue = NULL;
+        xd->hashkey = 0;
+        list_del_init(&xd->xindex);
+}
+/* Shrink the xattr name/value cache once c->xdatum_mem_usage has reached
+ * c->xdatum_mem_threshold. Buckets of c->xattrindex are visited starting at a
+ * position remembered across calls. HOT entries lose their HOT flag, entries
+ * that are neither HOT nor BIND are unloaded. The scan stops as soon as usage
+ * has dropped to 80% of its starting value, or after every bucket was seen. */
+static void reclaim_xattr_datum(struct jffs2_sb_info *c)
+{
+        /* must be called under down_write(xattr_sem) */
+        struct jffs2_xattr_datum *cur, *next;
+        uint32_t goal, start_usage;
+        static int index = 0;
+        int visited = 0;
+        if (c->xdatum_mem_usage < c->xdatum_mem_threshold)
+                return;
+        start_usage = c->xdatum_mem_usage;
+        goal = c->xdatum_mem_usage * 4 / 5; /* 20% reduction */
+        while (visited < XATTRINDEX_HASHSIZE) {
+                list_for_each_entry_safe(cur, next, &c->xattrindex[index], xindex) {
+                        if (cur->flags & JFFS2_XFLAGS_HOT)
+                                cur->flags &= ~JFFS2_XFLAGS_HOT;
+                        else if (!(cur->flags & JFFS2_XFLAGS_BIND))
+                                unload_xattr_datum(c, cur);
+                        if (c->xdatum_mem_usage <= goal)
+                                goto out;
+                }
+                index = (index+1) % XATTRINDEX_HASHSIZE;
+                visited++;
+        }
+ out:
+        JFFS2_NOTICE("xdatum_mem_usage from %u byte to %u byte (%u byte reclaimed)\n",
+                     start_usage, c->xdatum_mem_usage, start_usage - c->xdatum_mem_usage);
+}
+/* Verify the on-medium xattr node behind xd->node and fill in the xdatum
+ * header fields (xprefix, name_len, value_len, data_crc) from it.
+ *
+ * If xd->node is already REF_PRISTINE the medium is not read again and only
+ * the accounting/flag fix-up below is performed.
+ *
+ * Returns 0 on success, a negative errno when the flash read fails or is
+ * short (-EIO), and positive EIO when the node CRC or header is inconsistent;
+ * in the latter case xd is also flagged JFFS2_XFLAGS_INVALID. */
+static int do_verify_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd)
+{
+        /* must be called under down_write(xattr_sem) */
+        struct jffs2_eraseblock *jeb;
+        struct jffs2_raw_node_ref *raw;
+        struct jffs2_raw_xattr rx;
+        size_t readlen;
+        uint32_t crc, offset, totlen;
+        int rc;
+        spin_lock(&c->erase_completion_lock);
+        offset = ref_offset(xd->node);
+        if (ref_flags(xd->node) == REF_PRISTINE)
+                goto complete;  /* jumps with erase_completion_lock still held */
+        spin_unlock(&c->erase_completion_lock);
+        rc = jffs2_flash_read(c, offset, sizeof(rx), &readlen, (char *)&rx);
+        if (rc || readlen != sizeof(rx)) {
+                JFFS2_WARNING("jffs2_flash_read()=%d, req=%zu, read=%zu at %#08x\n",
+                              rc, sizeof(rx), readlen, offset);
+                return rc ? rc : -EIO;
+        }
+        /* node_crc covers the whole raw header except its trailing 4 bytes */
+        crc = crc32(0, &rx, sizeof(rx) - 4);
+        if (crc != je32_to_cpu(rx.node_crc)) {
+                /* report the field that was actually compared (node_crc, not hdr_crc) */
+                JFFS2_ERROR("node CRC failed at %#08x, read=%#08x, calc=%#08x\n",
+                            offset, je32_to_cpu(rx.node_crc), crc);
+                xd->flags |= JFFS2_XFLAGS_INVALID;
+                return EIO;
+        }
+        totlen = PAD(sizeof(rx) + rx.name_len + 1 + je16_to_cpu(rx.value_len));
+        if (je16_to_cpu(rx.magic) != JFFS2_MAGIC_BITMASK
+            || je16_to_cpu(rx.nodetype) != JFFS2_NODETYPE_XATTR
+            || je32_to_cpu(rx.totlen) != totlen
+            || je32_to_cpu(rx.xid) != xd->xid
+            || je32_to_cpu(rx.version) != xd->version) {
+                JFFS2_ERROR("inconsistent xdatum at %#08x, magic=%#04x/%#04x, "
+                            "nodetype=%#04x/%#04x, totlen=%u/%u, xid=%u/%u, version=%u/%u\n",
+                            offset, je16_to_cpu(rx.magic), JFFS2_MAGIC_BITMASK,
+                            je16_to_cpu(rx.nodetype), JFFS2_NODETYPE_XATTR,
+                            je32_to_cpu(rx.totlen), totlen,
+                            je32_to_cpu(rx.xid), xd->xid,
+                            je32_to_cpu(rx.version), xd->version);
+                xd->flags |= JFFS2_XFLAGS_INVALID;
+                return EIO;
+        }
+        xd->xprefix = rx.xprefix;
+        xd->name_len = rx.name_len;
+        xd->value_len = je16_to_cpu(rx.value_len);
+        xd->data_crc = je32_to_cpu(rx.data_crc);
+        spin_lock(&c->erase_completion_lock);
+ complete:
+        /* Move every still-unchecked ref of this xdatum from the unchecked to the
+         * used accounting, then mark xd->node PRISTINE and all other refs NORMAL. */
+        for (raw=xd->node; raw != (void *)xd; raw=raw->next_in_ino) {
+                jeb = &c->blocks[ref_offset(raw) / c->sector_size];
+                totlen = PAD(ref_totlen(c, jeb, raw));
+                if (ref_flags(raw) == REF_UNCHECKED) {
+                        c->unchecked_size -= totlen; c->used_size += totlen;
+                        jeb->unchecked_size -= totlen; jeb->used_size += totlen;
+                }
+                raw->flash_offset = ref_offset(raw) | ((xd->node==raw) ? REF_PRISTINE : REF_NORMAL);
+        }
+        spin_unlock(&c->erase_completion_lock);
+        /* unchecked xdatum is chained with c->xattr_unchecked */
+        list_del_init(&xd->xindex);
+        dbg_xattr("success on verfying xdatum (xid=%u, version=%u)\n",
+                  xd->xid, xd->version);
+        return 0;
+}
+static int do_load_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd)
+{
+        /* must be called under down_write(xattr_sem)
+         *
+         * Reads the name/value payload of a verified (REF_PRISTINE) xdatum from
+         * flash, checks it against xd->data_crc and links xd into c->xattrindex.
+         *   rc < 0 : -ENOMEM, or the flash read failed / came up short
+         *   rc = 0 : success
+         *   rc > 0 : payload CRC mismatch; xd is flagged JFFS2_XFLAGS_INVALID
+         */
+        char *data;
+        size_t readlen;
+        uint32_t crc, length;
+        int i, ret, retry = 0;
+        BUG_ON(ref_flags(xd->node) != REF_PRISTINE);
+        BUG_ON(!list_empty(&xd->xindex));
+ retry:
+        /* name, its terminating NUL and the value are kept in one buffer */
+        length = xd->name_len + 1 + xd->value_len;
+        data = kmalloc(length, GFP_KERNEL);
+        if (!data)
+                return -ENOMEM;
+        /* the payload follows the fixed-size raw xattr header on flash */
+        ret = jffs2_flash_read(c, ref_offset(xd->node)+sizeof(struct jffs2_raw_xattr),
+                               length, &readlen, data);
+        if (ret || length!=readlen) {
+                /* length is a uint32_t, hence %u rather than %d */
+                JFFS2_WARNING("jffs2_flash_read() returned %d, request=%u, readlen=%zu, at %#08x\n",
+                              ret, length, readlen, ref_offset(xd->node));
+                kfree(data);
+                return ret ? ret : -EIO;
+        }
+        data[xd->name_len] = '\0';
+        crc = crc32(0, data, length);
+        if (crc != xd->data_crc) {
+                /* this payload belongs to an XATTR node, not an XREF node */
+                JFFS2_WARNING("node CRC failed (JFFS2_NODETYPE_XATTR)"
+                              " at %#08x, read: 0x%08x calculated: 0x%08x\n",
+                              ref_offset(xd->node), xd->data_crc, crc);
+                kfree(data);
+                xd->flags |= JFFS2_XFLAGS_INVALID;
+                return EIO;
+        }
+        xd->flags |= JFFS2_XFLAGS_HOT;
+        xd->xname = data;
+        xd->xvalue = data + xd->name_len+1;
+        c->xdatum_mem_usage += length;
+        xd->hashkey = xattr_datum_hashkey(xd->xprefix, xd->xname, xd->xvalue, xd->value_len);
+        i = xd->hashkey % XATTRINDEX_HASHSIZE;
+        list_add(&xd->xindex, &c->xattrindex[i]);
+        if (!retry) {
+                /* If reclaim_xattr_datum() left xd->xname cleared, this very
+                 * xdatum was presumably unloaded again; load it one more time
+                 * (only once, guarded by 'retry'). */
+                retry = 1;
+                reclaim_xattr_datum(c);
+                if (!xd->xname)
+                        goto retry;
+        }
+        dbg_xattr("success on loading xdatum (xid=%u, xprefix=%u, xname='%s')\n",
+                  xd->xid, xd->xprefix, xd->xname);
+        return 0;
+}
+static int load_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd)
+{
+        /* Caller must hold xattr_sem for writing.
+         * Result convention:
+         *   negative : recoverable error, try again
+         *   zero     : success (xdatum is loaded)
+         *   positive : unrecoverable error, this node should be deleted
+         */
+        int err;
+        BUG_ON(xd->flags & JFFS2_XFLAGS_DEAD);
+        /* Name/value already cached: nothing to do. */
+        if (xd->xname)
+                return 0;
+        /* Node was flagged bad earlier: report the unrecoverable error. */
+        if (xd->flags & JFFS2_XFLAGS_INVALID)
+                return EIO;
+        /* An unchecked node has to pass verification before it is loaded. */
+        if (unlikely(is_xattr_datum_unchecked(c, xd))) {
+                err = do_verify_xattr_datum(c, xd);
+                if (err)
+                        return err;
+        }
+        return do_load_xattr_datum(c, xd);
+}
+static int save_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd)
+{
+        /* must be called under down_write(xattr_sem)
+         *
+         * Writes xd to flash at write_ofs(c) as a raw xattr header followed by
+         * the name/value buffer. Returns 0 on success; when the write fails or
+         * is short, returns the error from jffs2_flash_writev() or -EIO.
+         */
+        struct jffs2_raw_xattr rx;
+        struct kvec vecs[2];
+        size_t length;
+        int rc, totlen;
+        uint32_t phys_ofs = write_ofs(c);
+        BUG_ON(!xd->xname);
+        BUG_ON(xd->flags & (JFFS2_XFLAGS_DEAD|JFFS2_XFLAGS_INVALID));
+        vecs[0].iov_base = &rx;
+        vecs[0].iov_len = sizeof(rx);
+        vecs[1].iov_base = xd->xname; /* second vector starts at the name and spans name, NUL and value */
+        vecs[1].iov_len = xd->name_len + 1 + xd->value_len;
+        totlen = vecs[0].iov_len + vecs[1].iov_len;
+        /* Setup raw-xattr; the two CRCs below depend on the fields set before them */
+        memset(&rx, 0, sizeof(rx));
+        rx.magic = cpu_to_je16(JFFS2_MAGIC_BITMASK);
+        rx.nodetype = cpu_to_je16(JFFS2_NODETYPE_XATTR);
+        rx.totlen = cpu_to_je32(PAD(totlen));
+        rx.hdr_crc = cpu_to_je32(crc32(0, &rx, sizeof(struct jffs2_unknown_node) - 4)); /* CRC over the leading bytes of rx, already filled in above */
+        rx.xid = cpu_to_je32(xd->xid);
+        rx.version = cpu_to_je32(++xd->version); /* xd->version is bumped here and not rolled back if the write fails */
+        rx.xprefix = xd->xprefix;
+        rx.name_len = xd->name_len;
+        rx.value_len = cpu_to_je16(xd->value_len);
+        rx.data_crc = cpu_to_je32(crc32(0, vecs[1].iov_base, vecs[1].iov_len)); /* CRC over the name/value buffer */
+        rx.node_crc = cpu_to_je32(crc32(0, &rx, sizeof(struct jffs2_raw_xattr) - 4)); /* computed last: covers all of rx except its final 4 bytes */
+        rc = jffs2_flash_writev(c, vecs, 2, phys_ofs, &length, 0);
+        if (rc || totlen != length) {
+                JFFS2_WARNING("jffs2_flash_writev()=%d, req=%u, wrote=%zu, at %#08x\n",
+                              rc, totlen, length, phys_ofs);
+                rc = rc ? rc : -EIO;
+                if (length) /* some bytes were written: register the area as an obsolete node */
+                        jffs2_add_physical_node_ref(c, phys_ofs | REF_OBSOLETE, PAD(totlen), NULL);
+                return rc;
+        }
+        /* success: register the new node as REF_PRISTINE, associated with xd */
+        jffs2_add_physical_node_ref(c, phys_ofs | REF_PRISTINE, PAD(totlen), (void *)xd);
+        dbg_xattr("success on saving xdatum (xid=%u, version=%u, xprefix=%u, xname='%s')\n",
+                  xd->xid, xd->version, xd->xprefix, xd->xname);
+        return 0;
+}
+static struct jffs2_xattr_datum *create_xattr_datum(struct jffs2_sb_info *c,
+                                                    int xprefix, const char *xname,
+                                                    const char *xvalue, int xsize)
+{
+        /* Caller must hold xattr_sem for writing.
+         *
+         * Returns an already indexed xdatum with the same prefix/name/value
+         * (taking one more reference on it), or builds a new one, writes it to
+         * flash and adds it to the index. Failures come back as ERR_PTR().
+         */
+        struct jffs2_xattr_datum *xd;
+        struct list_head *bucket;
+        uint32_t hashkey, name_len;
+        char *buf;
+        int err;
+        /* Look for an identical name/value pair through the hash index */
+        hashkey = xattr_datum_hashkey(xprefix, xname, xvalue, xsize);
+        bucket = &c->xattrindex[hashkey % XATTRINDEX_HASHSIZE];
+        list_for_each_entry(xd, bucket, xindex) {
+                if (xd->hashkey != hashkey)
+                        continue;
+                if (xd->xprefix != xprefix)
+                        continue;
+                if (xd->value_len != xsize)
+                        continue;
+                if (strcmp(xd->xname, xname))
+                        continue;
+                if (memcmp(xd->xvalue, xvalue, xsize))
+                        continue;
+                atomic_inc(&xd->refcnt);
+                return xd;
+        }
+        /* No match: allocate a fresh xdatum plus its name/value buffer */
+        name_len = strlen(xname);
+        xd = jffs2_alloc_xattr_datum();
+        if (!xd)
+                return ERR_PTR(-ENOMEM);
+        buf = kmalloc(name_len + 1 + xsize, GFP_KERNEL);
+        if (!buf) {
+                jffs2_free_xattr_datum(xd);
+                return ERR_PTR(-ENOMEM);
+        }
+        /* Layout: name, terminating NUL, then the value bytes */
+        memcpy(buf, xname, name_len + 1);
+        memcpy(buf + name_len + 1, xvalue, xsize);
+        atomic_set(&xd->refcnt, 1);
+        xd->xid = ++c->highest_xid;
+        xd->flags |= JFFS2_XFLAGS_HOT;
+        xd->xprefix = xprefix;
+        xd->hashkey = hashkey;
+        xd->xname = buf;
+        xd->xvalue = buf + name_len + 1;
+        xd->name_len = name_len;
+        xd->value_len = xsize;
+        xd->data_crc = crc32(0, buf, xd->name_len + 1 + xd->value_len);
+        err = save_xattr_datum(c, xd);
+        if (err) {
+                kfree(xd->xname);
+                jffs2_free_xattr_datum(xd);
+                return ERR_PTR(err);
+        }
+        /* Written to flash: publish it in the hash index and account its memory */
+        list_add(&xd->xindex, bucket);
+        c->xdatum_mem_usage += (xd->name_len + 1 + xd->value_len);
+        reclaim_xattr_datum(c);
+        return xd;
+}
+static void delete_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd)
+{
+        /* must be called under down_write(xattr_sem)
+         *
+         * Retires an xdatum whose reference count has reached zero: its cached
+         * data is unloaded, it is flagged JFFS2_XFLAGS_DEAD, and it is either
+         * freed right away (when xd->node points back at xd itself) or queued
+         * on c->xattr_dead_list.
+         */
+        BUG_ON(atomic_read(&xd->refcnt));
+        unload_xattr_datum(c, xd);
+        xd->flags |= JFFS2_XFLAGS_DEAD;
+        /* Emit the debug message first: xd may be freed below, so its fields
+         * must not be read after the locked section. */
+        dbg_xattr("xdatum(xid=%u, version=%u) was removed.\n", xd->xid, xd->version);
+        spin_lock(&c->erase_completion_lock);
+        if (xd->node == (void *)xd) {
+                BUG_ON(!(xd->flags & JFFS2_XFLAGS_INVALID));
+                jffs2_free_xattr_datum(xd);
+        } else {
+                list_add(&xd->xindex, &c->xattr_dead_list);
+        }
+        spin_unlock(&c->erase_completion_lock);
+}
+/* -------- xref related functions ------------------
+ * verify_xattr_ref(c, ref)
+ *   is used to load xref information from medium. Because summary data does not
+ *   contain xid/ino, it's necessary to verify once while mounting process.
+ * save_xattr_ref(c, ref)
+ *   is used to write xref to medium. If the delete marker is set, it writes
+ *   a delete marker of xref into medium.
+ * create_xattr_ref(c, ic, xd)
+ *   is used to create a new xref and write to medium.
+ * delete_xattr_ref(c, ref)
+ *   is used to delete jffs2_xattr_ref. It marks xref XREF_DELETE_MARKER,
+ *   and allows GC to reclaim those physical nodes.
+ * jffs2_xattr_delete_inode(c, ic)
+ *   is called to remove xrefs related to obsolete inode when inode is unlinked.
+ * jffs2_xattr_free_inode(c, ic)
+ *   is called to release xattr related objects when unmounting. 
+ * check_xattr_ref_inode(c, ic)
+ *   is used to confirm inode does not have duplicate xattr name/value pair.
+ * -------------------------------------------------- */
+static int verify_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref)
+{ /* Checks the on-flash xref behind ref->node and fills ref->ino, ref->xid and
+   * ref->xseqno from it. Returns 0 on success, a negative value when the flash
+   * read fails or is short, and positive EIO when the CRC or header is wrong.
+   */
+        struct jffs2_eraseblock *jeb;
+        struct jffs2_raw_node_ref *raw;
+        struct jffs2_raw_xref rr;
+        size_t readlen;
+        uint32_t crc, offset, totlen;
+        int rc;
+        spin_lock(&c->erase_completion_lock);
+        if (ref_flags(ref->node) != REF_UNCHECKED)
+                goto complete; /* not unchecked: skip the flash read, only update the node flags */
+        offset = ref_offset(ref->node);
+        spin_unlock(&c->erase_completion_lock); /* the flash read below runs without the spinlock */
+        rc = jffs2_flash_read(c, offset, sizeof(rr), &readlen, (char *)&rr);
+        if (rc || sizeof(rr) != readlen) {
+                JFFS2_WARNING("jffs2_flash_read()=%d, req=%zu, read=%zu, at %#08x\n",
+                              rc, sizeof(rr), readlen, offset);
+                return rc ? rc : -EIO;
+        }
+        /* the CRC covers the raw xref except its last 4 bytes */
+        crc = crc32(0, &rr, sizeof(rr) - 4);
+        if (crc != je32_to_cpu(rr.node_crc)) {
+                JFFS2_ERROR("node CRC failed at %#08x, read=%#08x, calc=%#08x\n",
+                            offset, je32_to_cpu(rr.node_crc), crc);
+                return EIO;
+        }
+        if (je16_to_cpu(rr.magic) != JFFS2_MAGIC_BITMASK
+            || je16_to_cpu(rr.nodetype) != JFFS2_NODETYPE_XREF
+            || je32_to_cpu(rr.totlen) != PAD(sizeof(rr))) {
+                JFFS2_ERROR("inconsistent xref at %#08x, magic=%#04x/%#04x, "
+                            "nodetype=%#04x/%#04x, totlen=%u/%zu\n",
+                            offset, je16_to_cpu(rr.magic), JFFS2_MAGIC_BITMASK,
+                            je16_to_cpu(rr.nodetype), JFFS2_NODETYPE_XREF,
+                            je32_to_cpu(rr.totlen), PAD(sizeof(rr)));
+                return EIO;
+        }
+        ref->ino = je32_to_cpu(rr.ino);
+        ref->xid = je32_to_cpu(rr.xid);
+        ref->xseqno = je32_to_cpu(rr.xseqno);
+        if (ref->xseqno > c->highest_xseqno)
+                c->highest_xseqno = (ref->xseqno & ~XREF_DELETE_MARKER); /* stored without the delete-marker bit */
+        spin_lock(&c->erase_completion_lock);
+ complete:
+        for (raw=ref->node; raw != (void *)ref; raw=raw->next_in_ino) { /* walk the chain until next_in_ino points back at ref */
+                jeb = &c->blocks[ref_offset(raw) / c->sector_size];
+                totlen = PAD(ref_totlen(c, jeb, raw));
+                if (ref_flags(raw) == REF_UNCHECKED) { /* move this node's space from the unchecked to the used counters */
+                        c->unchecked_size -= totlen; c->used_size += totlen;
+                        jeb->unchecked_size -= totlen; jeb->used_size += totlen;
+                }
+                raw->flash_offset = ref_offset(raw) | ((ref->node==raw) ? REF_PRISTINE : REF_NORMAL); /* head node becomes PRISTINE, the others NORMAL */
+        }
+        spin_unlock(&c->erase_completion_lock);
+        dbg_xattr("success on verifying xref (ino=%u, xid=%u) at %#08x\n",
+                  ref->ino, ref->xid, ref_offset(ref->node));
+        return 0;
+}
+static int save_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref)
+{
+        /* must be called under down_write(xattr_sem)
+         *
+         * Writes a raw xref node for ref at write_ofs(c). For a dead ref the
+         * sequence number carries XREF_DELETE_MARKER and ino/xid are taken
+         * from ref->ino/ref->xid; otherwise they come from ref->ic and ref->xd.
+         * Returns 0 on success; when the write fails or is short, returns the
+         * error from jffs2_flash_write() or -EIO.
+         */
+        struct jffs2_raw_xref rr;
+        size_t length;
+        uint32_t xseqno, phys_ofs = write_ofs(c);
+        int ret;
+        rr.magic = cpu_to_je16(JFFS2_MAGIC_BITMASK);
+        rr.nodetype = cpu_to_je16(JFFS2_NODETYPE_XREF);
+        rr.totlen = cpu_to_je32(PAD(sizeof(rr)));
+        /* header CRC over the leading fields that were just filled in */
+        rr.hdr_crc = cpu_to_je32(crc32(0, &rr, sizeof(struct jffs2_unknown_node) - 4));
+        /* c->highest_xseqno is advanced even if the write below fails */
+        xseqno = (c->highest_xseqno += 2);
+        if (is_xattr_ref_dead(ref)) {
+                xseqno |= XREF_DELETE_MARKER;
+                rr.ino = cpu_to_je32(ref->ino);
+                rr.xid = cpu_to_je32(ref->xid);
+        } else {
+                rr.ino = cpu_to_je32(ref->ic->ino);
+                rr.xid = cpu_to_je32(ref->xd->xid);
+        }
+        rr.xseqno = cpu_to_je32(xseqno);
+        /* node CRC goes last: it covers everything except its own 4 bytes */
+        rr.node_crc = cpu_to_je32(crc32(0, &rr, sizeof(rr) - 4));
+        ret = jffs2_flash_write(c, phys_ofs, sizeof(rr), &length, (char *)&rr);
+        if (ret || sizeof(rr) != length) {
+                JFFS2_WARNING("jffs2_flash_write() returned %d, request=%zu, retlen=%zu, at %#08x\n",
+                              ret, sizeof(rr), length, phys_ofs);
+                ret = ret ? ret : -EIO;
+                /* some bytes were written: register the area as obsolete */
+                if (length)
+                        jffs2_add_physical_node_ref(c, phys_ofs | REF_OBSOLETE, PAD(sizeof(rr)), NULL);
+                return ret;
+        }
+        /* success */
+        ref->xseqno = xseqno;
+        jffs2_add_physical_node_ref(c, phys_ofs | REF_PRISTINE, PAD(sizeof(rr)), (void *)ref);
+        /* Report the values that were written instead of dereferencing
+         * ref->ic / ref->xd, which the dead-ref path above does not use. */
+        dbg_xattr("success on saving xref (ino=%u, xid=%u)\n",
+                  je32_to_cpu(rr.ino), je32_to_cpu(rr.xid));
+        return 0;
+}
+static struct jffs2_xattr_ref *create_xattr_ref(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic,
+                                                struct jffs2_xattr_datum *xd)
+{
+        /* Caller must hold xattr_sem for writing.
+         *
+         * Allocates an xref binding ic to xd, writes it to flash and pushes it
+         * on the head of ic->xref. Failures come back as ERR_PTR().
+         */
+        struct jffs2_xattr_ref *new_ref = jffs2_alloc_xattr_ref();
+        int err;
+        if (!new_ref)
+                return ERR_PTR(-ENOMEM);
+        new_ref->ic = ic;
+        new_ref->xd = xd;
+        err = save_xattr_ref(c, new_ref);
+        if (!err) {
+                /* Written successfully: chain it to the inode */
+                new_ref->next = ic->xref;
+                ic->xref = new_ref;
+                return new_ref;
+        }
+        jffs2_free_xattr_ref(new_ref);
+        return ERR_PTR(err);
+}
+static void delete_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref)
+{
+        /* must be called under down_write(xattr_sem)
+         *
+         * Marks ref as deleted, pushes it on c->xref_dead_list and drops its
+         * reference on the xdatum, deleting the xdatum when it was the last one.
+         * ref->next is overwritten here, so callers unlink ref from the inode's
+         * chain (or save ref->next) before calling.
+         */
+        struct jffs2_xattr_datum *xd;
+        xd = ref->xd; /* kept in a local: used after ref->xid is stored below */
+        ref->xseqno |= XREF_DELETE_MARKER;
+        ref->ino = ref->ic->ino; /* NOTE(review): ino/xid presumably share storage with ic/xd - confirm in the struct definition */
+        ref->xid = ref->xd->xid;
+        spin_lock(&c->erase_completion_lock);
+        ref->next = c->xref_dead_list;
+        c->xref_dead_list = ref;
+        spin_unlock(&c->erase_completion_lock);
+        dbg_xattr("xref(ino=%u, xid=%u, xseqno=%u) was removed.\n",
+                  ref->ino, ref->xid, ref->xseqno);
+        if (atomic_dec_and_test(&xd->refcnt)) /* last reference gone: retire the xdatum too */
+                delete_xattr_datum(c, xd);
+}
+void jffs2_xattr_delete_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic)
+{
+        /* Called from jffs2_clear_inode() when an inode is being removed.
+           Every XATTR reference held by a removed inode has to be deleted
+           with it. Nothing is done while the inode still has links. */
+        struct jffs2_xattr_ref *cur, *following;
+        if (!ic || ic->nlink > 0)
+                return;
+        down_write(&c->xattr_sem);
+        cur = ic->xref;
+        while (cur) {
+                /* delete_xattr_ref() rewrites cur->next, so fetch it first */
+                following = cur->next;
+                delete_xattr_ref(c, cur);
+                cur = following;
+        }
+        ic->xref = NULL;
+        up_write(&c->xattr_sem);
+}
+void jffs2_xattr_free_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic)
+{
+        /* Called from jffs2_free_ino_caches() while the FS is unmounted.
+           Releases the in-memory xrefs of the inode, and each xdatum whose
+           last reference goes away with them. */
+        struct jffs2_xattr_datum *datum;
+        struct jffs2_xattr_ref *cur, *following;
+        down_write(&c->xattr_sem);
+        cur = ic->xref;
+        while (cur) {
+                following = cur->next;
+                datum = cur->xd;
+                if (atomic_dec_and_test(&datum->refcnt)) {
+                        unload_xattr_datum(c, datum);
+                        jffs2_free_xattr_datum(datum);
+                }
+                jffs2_free_xattr_ref(cur);
+                cur = following;
+        }
+        ic->xref = NULL;
+        up_write(&c->xattr_sem);
+}
+static int check_xattr_ref_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic)
+{
+        /* Success of check_xattr_ref_inode() means that the inode (ic) does not
+         * have duplicate names. If two xrefs of the inode carry the same prefix
+         * and name, the one with the lower xseqno is removed.
+         * Returns 0 on success, or the negative error of load_xattr_datum().
+         */
+        struct jffs2_xattr_ref *ref, *cmp, **pref, **pcmp;
+        int rc = 0;
+        if (likely(ic->flags & INO_FLAGS_XATTR_CHECKED)) /* already checked once: nothing to do */
+                return 0;
+        down_write(&c->xattr_sem);
+ retry: /* the scan restarts from the head after every removal */
+        rc = 0;
+        for (ref=ic->xref, pref=&ic->xref; ref; pref=&ref->next, ref=ref->next) { /* pref: the link that points at ref, used to unlink it */
+                if (!ref->xd->xname) {
+                        rc = load_xattr_datum(c, ref->xd);
+                        if (unlikely(rc > 0)) { /* unrecoverable: unlink and delete this xref */
+                                *pref = ref->next;
+                                delete_xattr_ref(c, ref);
+                                goto retry;
+                        } else if (unlikely(rc < 0))
+                                goto out;
+                }
+                for (cmp=ref->next, pcmp=&ref->next; cmp; pcmp=&cmp->next, cmp=cmp->next) { /* compare ref with every later entry */
+                        if (!cmp->xd->xname) {
+                                ref->xd->flags |= JFFS2_XFLAGS_BIND; /* NOTE(review): presumably keeps ref->xd loaded while cmp->xd is loaded - confirm */
+                                rc = load_xattr_datum(c, cmp->xd);
+                                ref->xd->flags &= ~JFFS2_XFLAGS_BIND;
+                                if (unlikely(rc > 0)) {
+                                        *pcmp = cmp->next;
+                                        delete_xattr_ref(c, cmp);
+                                        goto retry;
+                                } else if (unlikely(rc < 0))
+                                        goto out;
+                        }
+                        if (ref->xd->xprefix == cmp->xd->xprefix
+                            && !strcmp(ref->xd->xname, cmp->xd->xname)) { /* same prefix and name: a duplicate */
+                                if (ref->xseqno > cmp->xseqno) { /* keep the entry with the higher xseqno */
+                                        *pcmp = cmp->next;
+                                        delete_xattr_ref(c, cmp);
+                                } else {
+                                        *pref = ref->next;
+                                        delete_xattr_ref(c, ref);
+                                }
+                                goto retry;
+                        }
+                }
+        }
+        ic->flags |= INO_FLAGS_XATTR_CHECKED; /* set only after a complete pass without removals */
+ out:
+        up_write(&c->xattr_sem);
+        return rc;
+}
+/* -------- xattr subsystem functions ---------------
+ * jffs2_init_xattr_subsystem(c)
+ *   is used to initialize semaphore and list_head, and some variables.
+ * jffs2_find_xattr_datum(c, xid)
+ *   is used to lookup xdatum while scanning process.
+ * jffs2_clear_xattr_subsystem(c)
+ *   is used to release any xattr related objects.
+ * jffs2_build_xattr_subsystem(c)
+ *   is used to associate xdatum and xref while super block building process.
+ * jffs2_setup_xattr_datum(c, xid, version)
+ *   is used to insert xdatum while scanning process.
+ * -------------------------------------------------- */
+void jffs2_init_xattr_subsystem(struct jffs2_sb_info *c)
+{
+        /* Put every xattr-related member of the superblock info into its
+           initial, empty state. */
+        int bucket = 0;
+        while (bucket < XATTRINDEX_HASHSIZE) {
+                INIT_LIST_HEAD(&c->xattrindex[bucket]);
+                bucket++;
+        }
+        INIT_LIST_HEAD(&c->xattr_unchecked);
+        INIT_LIST_HEAD(&c->xattr_dead_list);
+        c->xref_dead_list = NULL;
+        c->xref_temp = NULL;
+        init_rwsem(&c->xattr_sem);
+        /* Counters start from zero */
+        c->highest_xid = 0;
+        c->highest_xseqno = 0;
+        c->xdatum_mem_usage = 0;
+        /* Default threshold: 32KB */
+        c->xdatum_mem_threshold = 32 * 1024;
+}
+static struct jffs2_xattr_datum *jffs2_find_xattr_datum(struct jffs2_sb_info *c, uint32_t xid)
+{
+        /* Looks up an xdatum by xid; during scanning/building the index is
+           bucketed by xid. Returns NULL when no entry matches. */
+        struct jffs2_xattr_datum *xd;
+        int bucket = xid % XATTRINDEX_HASHSIZE;
+        /* Only valid while the scanning/building flags are set. */
+        BUG_ON(!(c->flags & (JFFS2_SB_FLAG_SCANNING|JFFS2_SB_FLAG_BUILDING)));
+        list_for_each_entry(xd, &c->xattrindex[bucket], xindex) {
+                if (xd->xid != xid)
+                        continue;
+                return xd;
+        }
+        return NULL;
+}
+void jffs2_clear_xattr_subsystem(struct jffs2_sb_info *c)
+{
+        /* Frees the xrefs left on the temporary and dead lists, then every
+           xdatum in the hash index (with its cached name/value buffer) and
+           on the dead list. */
+        struct jffs2_xattr_datum *xd, *xd_next;
+        struct jffs2_xattr_ref *cur, *following;
+        int bucket;
+        cur = c->xref_temp;
+        while (cur) {
+                following = cur->next;
+                jffs2_free_xattr_ref(cur);
+                cur = following;
+        }
+        cur = c->xref_dead_list;
+        while (cur) {
+                following = cur->next;
+                jffs2_free_xattr_ref(cur);
+                cur = following;
+        }
+        for (bucket = 0; bucket < XATTRINDEX_HASHSIZE; bucket++) {
+                list_for_each_entry_safe(xd, xd_next, &c->xattrindex[bucket], xindex) {
+                        list_del(&xd->xindex);
+                        /* kfree() accepts NULL, so no guard is required */
+                        kfree(xd->xname);
+                        jffs2_free_xattr_datum(xd);
+                }
+        }
+        list_for_each_entry_safe(xd, xd_next, &c->xattr_dead_list, xindex) {
+                list_del(&xd->xindex);
+                jffs2_free_xattr_datum(xd);
+        }
+}
+#define XREF_TMPHASH_SIZE       (128) /* bucket count of the temporary xref hash used in Phase.1 and Phase.2 */
+void jffs2_build_xattr_subsystem(struct jffs2_sb_info *c)
+{ /* Runs once while the superblock is being built (JFFS2_SB_FLAG_BUILDING):
+   * merges the xrefs collected on c->xref_temp, binds the live ones to their
+   * inode cache and xdatum, and moves orphan/unchecked xdatums out of the
+   * index onto c->xattr_unchecked.
+   */
+        struct jffs2_xattr_ref *ref, *_ref;
+        struct jffs2_xattr_ref *xref_tmphash[XREF_TMPHASH_SIZE];
+        struct jffs2_xattr_datum *xd, *_xd;
+        struct jffs2_inode_cache *ic;
+        struct jffs2_raw_node_ref *raw;
+        int i, xdatum_count = 0, xdatum_unchecked_count = 0, xref_count = 0;
+        int xdatum_orphan_count = 0, xref_orphan_count = 0, xref_dead_count = 0;
+        BUG_ON(!(c->flags & JFFS2_SB_FLAG_BUILDING));
+        /* Phase.1 : Merge same xref (same ino and xid) into one entry */
+        for (i=0; i < XREF_TMPHASH_SIZE; i++)
+                xref_tmphash[i] = NULL;
+        for (ref=c->xref_temp; ref; ref=_ref) {
+                struct jffs2_xattr_ref *tmp;
+                _ref = ref->next;
+                if (ref_flags(ref->node) != REF_PRISTINE) {
+                        if (verify_xattr_ref(c, ref)) { /* verification failed: obsolete the node and drop the xref */
+                                BUG_ON(ref->node->next_in_ino != (void *)ref);
+                                ref->node->next_in_ino = NULL;
+                                jffs2_mark_node_obsolete(c, ref->node);
+                                jffs2_free_xattr_ref(ref);
+                                continue;
+                        }
+                }
+                i = (ref->ino ^ ref->xid) % XREF_TMPHASH_SIZE;
+                for (tmp=xref_tmphash[i]; tmp; tmp=tmp->next) {
+                        if (tmp->ino == ref->ino && tmp->xid == ref->xid)
+                                break;
+                }
+                if (tmp) { /* an entry for this ino/xid exists: hand ref's node over to it */
+                        raw = ref->node;
+                        if (ref->xseqno > tmp->xseqno) { /* newer: its node becomes the head of tmp's chain */
+                                tmp->xseqno = ref->xseqno;
+                                raw->next_in_ino = tmp->node;
+                                tmp->node = raw;
+                        } else { /* older or equal: insert right behind the current head */
+                                raw->next_in_ino = tmp->node->next_in_ino;
+                                tmp->node->next_in_ino = raw;
+                        }
+                        jffs2_free_xattr_ref(ref);
+                        continue;
+                } else {
+                        ref->next = xref_tmphash[i];
+                        xref_tmphash[i] = ref;
+                }
+        }
+        c->xref_temp = NULL;
+        /* Phase.2 : Bind xref with inode_cache and xattr_datum */
+        for (i=0; i < XREF_TMPHASH_SIZE; i++) {
+                for (ref=xref_tmphash[i]; ref; ref=_ref) {
+                        xref_count++;
+                        _ref = ref->next;
+                        if (is_xattr_ref_dead(ref)) { /* dead xrefs go straight to the dead list */
+                                ref->next = c->xref_dead_list;
+                                c->xref_dead_list = ref;
+                                xref_dead_count++;
+                                continue;
+                        }
+                        /* At this point, ref->xid and ref->ino contain XID and inode number.
+                           ref->xd and ref->ic are not valid yet. */
+                        xd = jffs2_find_xattr_datum(c, ref->xid);
+                        ic = jffs2_get_ino_cache(c, ref->ino);
+                        if (!xd || !ic) { /* xdatum or inode cache missing: treat the xref as deleted */
+                                dbg_xattr("xref(ino=%u, xid=%u, xseqno=%u) is orphan.\n",
+                                          ref->ino, ref->xid, ref->xseqno);
+                                ref->xseqno |= XREF_DELETE_MARKER;
+                                ref->next = c->xref_dead_list;
+                                c->xref_dead_list = ref;
+                                xref_orphan_count++;
+                                continue;
+                        }
+                        ref->xd = xd;
+                        ref->ic = ic;
+                        atomic_inc(&xd->refcnt);
+                        ref->next = ic->xref;
+                        ic->xref = ref;
+                }
+        }
+        /* Phase.3 : Take every xdatum out of the index; orphan and unchecked ones are linked to xattr_unchecked */
+        for (i=0; i < XATTRINDEX_HASHSIZE; i++) {
+                list_for_each_entry_safe(xd, _xd, &c->xattrindex[i], xindex) {
+                        xdatum_count++;
+                        list_del_init(&xd->xindex);
+                        if (!atomic_read(&xd->refcnt)) { /* no xref points at it */
+                                dbg_xattr("xdatum(xid=%u, version=%u) is orphan.\n",
+                                          xd->xid, xd->version);
+                                xd->flags |= JFFS2_XFLAGS_DEAD;
+                                list_add(&xd->xindex, &c->xattr_unchecked);
+                                xdatum_orphan_count++;
+                                continue;
+                        }
+                        if (is_xattr_datum_unchecked(c, xd)) {
+                                dbg_xattr("unchecked xdatum(xid=%u, version=%u)\n",
+                                          xd->xid, xd->version);
+                                list_add(&xd->xindex, &c->xattr_unchecked);
+                                xdatum_unchecked_count++;
+                        }
+                }
+        }
+        /* build complete */
+        JFFS2_NOTICE("complete building xattr subsystem, %u of xdatum"
+                     " (%u unchecked, %u orphan) and "
+                     "%u of xref (%u dead, %u orphan) found.\n",
+                     xdatum_count, xdatum_unchecked_count, xdatum_orphan_count,
+                     xref_count, xref_dead_count, xref_orphan_count);
+}
+struct jffs2_xattr_datum *jffs2_setup_xattr_datum(struct jffs2_sb_info *c,
+                                                  uint32_t xid, uint32_t version)
+{
+        /* Returns the xdatum registered under xid, creating and indexing a new
+           one (with the given version) when none exists yet.
+           Returns ERR_PTR(-ENOMEM) if the allocation fails. */
+        struct jffs2_xattr_datum *xd = jffs2_find_xattr_datum(c, xid);
+        if (xd)
+                return xd;
+        xd = jffs2_alloc_xattr_datum();
+        if (!xd)
+                return ERR_PTR(-ENOMEM);
+        xd->xid = xid;
+        xd->version = version;
+        /* Keep track of the largest xid seen so far */
+        if (xd->xid > c->highest_xid)
+                c->highest_xid = xd->xid;
+        list_add_tail(&xd->xindex, &c->xattrindex[xid % XATTRINDEX_HASHSIZE]);
+        return xd;
+}
+/* -------- xattr subsystem functions ---------------
+ * xprefix_to_handler(xprefix)
+ *   is used to translate xprefix into xattr_handler.
+ * jffs2_listxattr(dentry, buffer, size)
+ *   is an implementation of listxattr handler on jffs2.
+ * do_jffs2_getxattr(inode, xprefix, xname, buffer, size)
+ *   is an implementation of getxattr handler on jffs2.
+ * do_jffs2_setxattr(inode, xprefix, xname, buffer, size, flags)
+ *   is an implementation of setxattr handler on jffs2.
+ * -------------------------------------------------- */
+struct xattr_handler *jffs2_xattr_handlers[] = { /* NULL-terminated table of the xattr handlers built into this configuration */
+        &jffs2_user_xattr_handler,
+#ifdef CONFIG_JFFS2_FS_SECURITY
+        &jffs2_security_xattr_handler,
+#endif
+#ifdef CONFIG_JFFS2_FS_POSIX_ACL
+        &jffs2_acl_access_xattr_handler,
+        &jffs2_acl_default_xattr_handler,
+#endif
+        &jffs2_trusted_xattr_handler,
+        NULL /* end-of-table marker */
+};
+static struct xattr_handler *xprefix_to_handler(int xprefix) {
+        /* Maps a JFFS2_XPREFIX_* value to its xattr_handler. Prefixes that are
+           unknown, or whose support is not configured in, yield NULL. */
+        switch (xprefix) {
+        case JFFS2_XPREFIX_USER:
+                return &jffs2_user_xattr_handler;
+#ifdef CONFIG_JFFS2_FS_SECURITY
+        case JFFS2_XPREFIX_SECURITY:
+                return &jffs2_security_xattr_handler;
+#endif
+#ifdef CONFIG_JFFS2_FS_POSIX_ACL
+        case JFFS2_XPREFIX_ACL_ACCESS:
+                return &jffs2_acl_access_xattr_handler;
+        case JFFS2_XPREFIX_ACL_DEFAULT:
+                return &jffs2_acl_default_xattr_handler;
+#endif
+        case JFFS2_XPREFIX_TRUSTED:
+                return &jffs2_trusted_xattr_handler;
+        default:
+                return NULL;
+        }
+}
+ssize_t jffs2_listxattr(struct dentry *dentry, char *buffer, size_t size)
+{ /* listxattr implementation: passes every xattr name of the inode to its
+   * handler's list() callback. Returns the sum of the values the handlers
+   * returned, or the first error from check_xattr_ref_inode(),
+   * load_xattr_datum() or a handler.
+   */
+        struct inode *inode = dentry->d_inode;
+        struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
+        struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb);
+        struct jffs2_inode_cache *ic = f->inocache;
+        struct jffs2_xattr_ref *ref, **pref;
+        struct jffs2_xattr_datum *xd;
+        struct xattr_handler *xhandle;
+        ssize_t len, rc;
+        int retry = 0; /* becomes 1 once xattr_sem is held for writing instead of reading */
+        rc = check_xattr_ref_inode(c, ic);
+        if (unlikely(rc))
+                return rc;
+        down_read(&c->xattr_sem);
+ retry:
+        len = 0; /* every pass starts over from the head of the list */
+        for (ref=ic->xref, pref=&ic->xref; ref; pref=&ref->next, ref=ref->next) {
+                BUG_ON(ref->ic != ic);
+                xd = ref->xd;
+                if (!xd->xname) {
+                        /* xdatum is uncached: loading it needs the write lock */
+                        if (!retry) {
+                                retry = 1;
+                                up_read(&c->xattr_sem); /* drop the read lock, take the write lock, then rescan */
+                                down_write(&c->xattr_sem);
+                                goto retry;
+                        } else {
+                                rc = load_xattr_datum(c, xd);
+                                if (unlikely(rc > 0)) { /* unrecoverable: unlink and delete this xref, then rescan */
+                                        *pref = ref->next;
+                                        delete_xattr_ref(c, ref);
+                                        goto retry;
+                                } else if (unlikely(rc < 0))
+                                        goto out;
+                        }
+                }
+                xhandle = xprefix_to_handler(xd->xprefix);
+                if (!xhandle) /* no handler for this prefix: leave the entry out */
+                        continue;
+                if (buffer) {
+                        rc = xhandle->list(inode, buffer+len, size-len, xd->xname, xd->name_len);
+                } else { /* no output buffer: the handler is called with NULL and size 0 */
+                        rc = xhandle->list(inode, NULL, 0, xd->xname, xd->name_len);
+                }
+                if (rc < 0)
+                        goto out;
+                len += rc;
+        }
+        rc = len;
+ out:
+        if (!retry) { /* release whichever side of xattr_sem is currently held */
+                up_read(&c->xattr_sem);
+        } else {
+                up_write(&c->xattr_sem);
+        }
+        return rc;
+}
+/*
+ * do_jffs2_getxattr(inode, xprefix, xname, buffer, size)
+ *   looks up the xattr (xprefix, xname) among the xrefs of the inode.
+ *   On a match the value length is returned; when buffer is non-NULL the
+ *   value is also copied into it, or -ERANGE is returned if size is smaller
+ *   than the value length.
+ *   Returns -ENODATA when no xref matches, or the error reported by
+ *   check_xattr_ref_inode() / load_xattr_datum().
+ */
+int do_jffs2_getxattr(struct inode *inode, int xprefix, const char *xname,
+                      char *buffer, size_t size)
+{
+        struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
+        struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb);
+        struct jffs2_inode_cache *ic = f->inocache;
+        struct jffs2_xattr_datum *xd;
+        struct jffs2_xattr_ref *ref, **pref;
+        int rc, retry = 0;
+        rc = check_xattr_ref_inode(c, ic);
+        if (unlikely(rc))
+                return rc;
+        /* start as a reader; upgraded to writer only if an xdatum must be loaded */
+        down_read(&c->xattr_sem);
+ retry:
+        for (ref=ic->xref, pref=&ic->xref; ref; pref=&ref->next, ref=ref->next) {
+                BUG_ON(ref->ic!=ic);
+                xd = ref->xd;
+                if (xd->xprefix != xprefix)
+                        continue;
+                if (!xd->xname) {
+                        /* xdatum is uncached */
+                        if (!retry) {
+                                /* swap the read lock for the write lock and rescan */
+                                retry = 1;
+                                up_read(&c->xattr_sem);
+                                down_write(&c->xattr_sem);
+                                goto retry;
+                        } else {
+                                rc = load_xattr_datum(c, xd);
+                                if (unlikely(rc > 0)) {
+                                        /* unlink and drop this xref, then rescan */
+                                        *pref = ref->next;
+                                        delete_xattr_ref(c, ref);
+                                        goto retry;
+                                } else if (unlikely(rc < 0)) {
+                                        goto out;
+                                }
+                        }
+                }
+                if (!strcmp(xname, xd->xname)) {
+                        rc = xd->value_len;
+                        /* buffer == NULL: the caller only wants the length */
+                        if (buffer) {
+                                if (size < rc) {
+                                        rc = -ERANGE;
+                                } else {
+                                        memcpy(buffer, xd->xvalue, rc);
+                                }
+                        }
+                        goto out;
+                }
+        }
+        rc = -ENODATA;
+ out:
+        /* release whichever side of xattr_sem is currently held */
+        if (!retry) {
+                up_read(&c->xattr_sem);
+        } else {
+                up_write(&c->xattr_sem);
+        }
+        return rc;
+}
+/*
+ * do_jffs2_setxattr(inode, xprefix, xname, buffer, size, flags)
+ *   creates, replaces or removes the xattr (xprefix, xname) of the inode.
+ *   buffer == NULL requests removal of an existing xattr.
+ *   Returns 0 on success, -EEXIST when XATTR_CREATE is set and the xattr
+ *   already exists, -ENODATA when the xattr does not exist and either
+ *   XATTR_REPLACE is set or removal was requested, or a negative error from
+ *   the space reservation / node writing helpers.
+ */
+int do_jffs2_setxattr(struct inode *inode, int xprefix, const char *xname,
+                      const char *buffer, size_t size, int flags)
+{
+        struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
+        struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb);
+        struct jffs2_inode_cache *ic = f->inocache;
+        struct jffs2_xattr_datum *xd;
+        struct jffs2_xattr_ref *ref, *newref, **pref;
+        uint32_t length, request;
+        int rc;
+        rc = check_xattr_ref_inode(c, ic);
+        if (unlikely(rc))
+                return rc;
+        /* reserve room for an xattr node holding the name, its NUL and the value */
+        request = PAD(sizeof(struct jffs2_raw_xattr) + strlen(xname) + 1 + size);
+        rc = jffs2_reserve_space(c, request, &length,
+                                 ALLOC_NORMAL, JFFS2_SUMMARY_XATTR_SIZE);
+        if (rc) {
+                JFFS2_WARNING("jffs2_reserve_space()=%d, request=%u\n", rc, request);
+                return rc;
+        }
+        /* Find existing xattr */
+        down_write(&c->xattr_sem);
+ retry:
+        for (ref=ic->xref, pref=&ic->xref; ref; pref=&ref->next, ref=ref->next) {
+                xd = ref->xd;
+                if (xd->xprefix != xprefix)
+                        continue;
+                if (!xd->xname) {
+                        /* xdatum is not loaded yet; the write lock is already held */
+                        rc = load_xattr_datum(c, xd);
+                        if (unlikely(rc > 0)) {
+                                /* unlink and drop this xref, then rescan */
+                                *pref = ref->next;
+                                delete_xattr_ref(c, ref);
+                                goto retry;
+                        } else if (unlikely(rc < 0))
+                                goto out;
+                }
+                if (!strcmp(xd->xname, xname)) {
+                        if (flags & XATTR_CREATE) {
+                                rc = -EEXIST;
+                                goto out;
+                        }
+                        if (!buffer) {
+                                /*
+                                 * Removal: write the xref out again with the
+                                 * delete marker set. ino/xid share storage
+                                 * with ic/xd (unions in struct
+                                 * jffs2_xattr_ref), so ic and xd are put back
+                                 * if saving fails.
+                                 */
+                                ref->ino = ic->ino;
+                                ref->xid = xd->xid;
+                                ref->xseqno |= XREF_DELETE_MARKER;
+                                rc = save_xattr_ref(c, ref);
+                                if (!rc) {
+                                        /* unlink from the inode and park on c->xref_dead_list */
+                                        *pref = ref->next;
+                                        spin_lock(&c->erase_completion_lock);
+                                        ref->next = c->xref_dead_list;
+                                        c->xref_dead_list = ref;
+                                        spin_unlock(&c->erase_completion_lock);
+                                        if (atomic_dec_and_test(&xd->refcnt))
+                                                delete_xattr_datum(c, xd);
+                                } else {
+                                        ref->ic = ic;
+                                        ref->xd = xd;
+                                        ref->xseqno &= ~XREF_DELETE_MARKER;
+                                }
+                                goto out;
+                        }
+                        /* replace: ref/pref keep pointing at the old xref */
+                        goto found;
+                }
+        }
+        /* not found */
+        if (flags & XATTR_REPLACE) {
+                rc = -ENODATA;
+                goto out;
+        }
+        if (!buffer) {
+                rc = -ENODATA;
+                goto out;
+        }
+ found:
+        /* reached with ref == NULL (new xattr) or ref == the xref being replaced */
+        xd = create_xattr_datum(c, xprefix, xname, buffer, size);
+        if (IS_ERR(xd)) {
+                rc = PTR_ERR(xd);
+                goto out;
+        }
+        /* drop the semaphore and the first reservation before reserving again */
+        up_write(&c->xattr_sem);
+        jffs2_complete_reservation(c);
+        /* create xattr_ref */
+        request = PAD(sizeof(struct jffs2_raw_xref));
+        rc = jffs2_reserve_space(c, request, &length,
+                                 ALLOC_NORMAL, JFFS2_SUMMARY_XREF_SIZE);
+        down_write(&c->xattr_sem);
+        if (rc) {
+                JFFS2_WARNING("jffs2_reserve_space()=%d, request=%u\n", rc, request);
+                if (atomic_dec_and_test(&xd->refcnt))
+                        delete_xattr_datum(c, xd);
+                up_write(&c->xattr_sem);
+                return rc;
+        }
+        /*
+         * NOTE(review): pref was computed before xattr_sem was released
+         * above; confirm the xref list cannot change while it is dropped.
+         */
+        if (ref)
+                *pref = ref->next;
+        newref = create_xattr_ref(c, ic, xd);
+        if (IS_ERR(newref)) {
+                /* put the old xref back (at the head of the list) and drop the new xdatum */
+                if (ref) {
+                        ref->next = ic->xref;
+                        ic->xref = ref;
+                }
+                rc = PTR_ERR(newref);
+                if (atomic_dec_and_test(&xd->refcnt))
+                        delete_xattr_datum(c, xd);
+        } else if (ref) {
+                delete_xattr_ref(c, ref);
+        }
+ out:
+        up_write(&c->xattr_sem);
+        jffs2_complete_reservation(c);
+        return rc;
+}
+/* -------- garbage collector functions -------------
+ * jffs2_garbage_collect_xattr_datum(c, xd, raw)
+ *   is used to move xdatum into new node.
+ * jffs2_garbage_collect_xattr_ref(c, ref, raw)
+ *   is used to move xref into new node.
+ * jffs2_verify_xattr(c)
+ *   is used to call do_verify_xattr_datum() before garbage collecting.
+ * jffs2_release_xattr_datum(c, xd)
+ *   is used to release an in-memory object of xdatum.
+ * jffs2_release_xattr_ref(c, ref)
+ *   is used to release an in-memory object of xref.
+ * -------------------------------------------------- */
+/*
+ * Moves the xdatum xd into a newly reserved node when raw is still its
+ * current node and xd is neither dead nor invalid. In every case that ends
+ * with rc == 0 (including "nothing to move") raw is marked obsolete.
+ * Returns 0 on success or a negative error from load_xattr_datum(),
+ * jffs2_reserve_space_gc() or save_xattr_datum().
+ */
+int jffs2_garbage_collect_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd,
+                                      struct jffs2_raw_node_ref *raw)
+{
+        uint32_t totlen, length, old_ofs;
+        int rc = 0;
+        down_write(&c->xattr_sem);
+        /* raw is not the current node of xd: nothing to rewrite */
+        if (xd->node != raw)
+                goto out;
+        if (xd->flags & (JFFS2_XFLAGS_DEAD|JFFS2_XFLAGS_INVALID))
+                goto out;
+        rc = load_xattr_datum(c, xd);
+        if (unlikely(rc)) {
+                /* a positive result is not an error for the caller */
+                rc = (rc > 0) ? 0 : rc;
+                goto out;
+        }
+        old_ofs = ref_offset(xd->node);
+        totlen = PAD(sizeof(struct jffs2_raw_xattr)
+                        + xd->name_len + 1 + xd->value_len);
+        rc = jffs2_reserve_space_gc(c, totlen, &length, JFFS2_SUMMARY_XATTR_SIZE);
+        if (rc) {
+                /* rc is already non-zero here, so it is returned unchanged */
+                JFFS2_WARNING("jffs2_reserve_space_gc()=%d, request=%u\n", rc, totlen);
+                goto out;
+        }
+        rc = save_xattr_datum(c, xd);
+        if (!rc)
+                dbg_xattr("xdatum (xid=%u, version=%u) GC'ed from %#08x to %08x\n",
+                          xd->xid, xd->version, old_ofs, ref_offset(xd->node));
+ out:
+        if (!rc)
+                jffs2_mark_node_obsolete(c, raw);
+        up_write(&c->xattr_sem);
+        return rc;
+}
+/*
+ * Moves the xref ref into a newly reserved node when raw is still its
+ * current node. A dead xref whose raw node is the last one in its chain
+ * (raw->next_in_ino points back at ref) is not rewritten. In every case
+ * that ends with rc == 0 raw is marked obsolete.
+ * Returns 0 on success or a negative error from jffs2_reserve_space_gc()
+ * or save_xattr_ref().
+ */
+int jffs2_garbage_collect_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref,
+                                    struct jffs2_raw_node_ref *raw)
+{
+        uint32_t totlen, length, old_ofs;
+        int rc = 0;
+        down_write(&c->xattr_sem);
+        BUG_ON(!ref->node);
+        /* raw is not the current node of ref: nothing to rewrite */
+        if (ref->node != raw)
+                goto out;
+        if (is_xattr_ref_dead(ref) && (raw->next_in_ino == (void *)ref))
+                goto out;
+        old_ofs = ref_offset(ref->node);
+        totlen = ref_totlen(c, c->gcblock, ref->node);
+        rc = jffs2_reserve_space_gc(c, totlen, &length, JFFS2_SUMMARY_XREF_SIZE);
+        if (rc) {
+                /* rc is already non-zero here, so it is returned unchanged */
+                JFFS2_WARNING("%s: jffs2_reserve_space_gc() = %d, request = %u\n",
+                              __FUNCTION__, rc, totlen);
+                goto out;
+        }
+        rc = save_xattr_ref(c, ref);
+        if (!rc)
+                dbg_xattr("xref (ino=%u, xid=%u) GC'ed from %#08x to %08x\n",
+                          ref->ic->ino, ref->xd->xid, old_ofs, ref_offset(ref->node));
+ out:
+        if (!rc)
+                jffs2_mark_node_obsolete(c, raw);
+        up_write(&c->xattr_sem);
+        return rc;
+}
+/*
+ * Runs do_verify_xattr_datum() on every xdatum queued on c->xattr_unchecked.
+ * An xdatum for which it returns a negative value stays on the list; every
+ * other one is taken off the list and its still-unchecked raw nodes are
+ * accounted as used space.
+ * Returns 1 when c->xattr_unchecked is empty afterwards, 0 otherwise.
+ */
+int jffs2_verify_xattr(struct jffs2_sb_info *c)
+{
+        struct jffs2_xattr_datum *xd, *_xd;
+        struct jffs2_eraseblock *jeb;
+        struct jffs2_raw_node_ref *raw;
+        uint32_t totlen;
+        int rc;
+        down_write(&c->xattr_sem);
+        list_for_each_entry_safe(xd, _xd, &c->xattr_unchecked, xindex) {
+                rc = do_verify_xattr_datum(c, xd);
+                if (rc < 0)
+                        continue;
+                list_del_init(&xd->xindex);
+                spin_lock(&c->erase_completion_lock);
+                /* walk the raw node chain; it ends with a pointer back to xd itself */
+                for (raw=xd->node; raw != (void *)xd; raw=raw->next_in_ino) {
+                        if (ref_flags(raw) != REF_UNCHECKED)
+                                continue;
+                        jeb = &c->blocks[ref_offset(raw) / c->sector_size];
+                        totlen = PAD(ref_totlen(c, jeb, raw));
+                        /* move the node's size from "unchecked" to "used", globally and per block */
+                        c->unchecked_size -= totlen; c->used_size += totlen;
+                        jeb->unchecked_size -= totlen; jeb->used_size += totlen;
+                        /* the current node becomes REF_PRISTINE, older ones REF_NORMAL */
+                        raw->flash_offset = ref_offset(raw)
+                                | ((xd->node == (void *)raw) ? REF_PRISTINE : REF_NORMAL);
+                }
+                if (xd->flags & JFFS2_XFLAGS_DEAD)
+                        list_add(&xd->xindex, &c->xattr_dead_list);
+                spin_unlock(&c->erase_completion_lock);
+        }
+        up_write(&c->xattr_sem);
+        return list_empty(&c->xattr_unchecked) ? 1 : 0;
+}
+/*
+ * Frees the in-memory xdatum once nothing refers to it any more: its
+ * reference count is zero and its node pointer points back at the xdatum
+ * itself. Otherwise the xdatum is left untouched.
+ */
+void jffs2_release_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd)
+{
+        /* must be called under spin_lock(&c->erase_completion_lock) */
+        if (!atomic_read(&xd->refcnt) && xd->node == (void *)xd) {
+                list_del(&xd->xindex);
+                jffs2_free_xattr_datum(xd);
+        }
+}
+/*
+ * Frees the in-memory xref once its node pointer points back at the xref
+ * itself, unlinking it from c->xref_dead_list first if it is queued there.
+ */
+void jffs2_release_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref)
+{
+        /* must be called under spin_lock(&c->erase_completion_lock) */
+        struct jffs2_xattr_ref **link;
+        if (ref->node != (void *)ref)
+                return;
+        /* follow the links of the dead list until the one pointing at ref is found */
+        for (link = &c->xref_dead_list; *link; link = &(*link)->next) {
+                if (*link == ref) {
+                        *link = ref->next;
+                        break;
+                }
+        }
+        jffs2_free_xattr_ref(ref);
+}
diff --git a/fs/jffs2/xattr.h b/fs/jffs2/xattr.h
new file mode 100644
index 000000000000..06a5c69dcf8b
--- /dev/null
+++ b/fs/jffs2/xattr.h
@@ -0,0 +1,129 @@
+/*
+ * JFFS2 -- Journalling Flash File System, Version 2.
+ *
+ * Copyright (C) 2006  NEC Corporation
+ *
+ * Created by KaiGai Kohei <kaigai@ak.jp.nec.com>
+ *
+ * For licensing information, see the file 'LICENCE' in this directory.
+ *
+ */
+#ifndef _JFFS2_FS_XATTR_H_
+#define _JFFS2_FS_XATTR_H_
+#include <linux/xattr.h>
+#include <linux/list.h>
+#define JFFS2_XFLAGS_HOT        (0x01)  /* This datum is HOT */
+#define JFFS2_XFLAGS_BIND       (0x02)  /* This datum is not reclaimed */
+#define JFFS2_XFLAGS_DEAD       (0x40)  /* This datum is already dead */
+#define JFFS2_XFLAGS_INVALID    (0x80)  /* This datum contains crc error */
+/* In-memory object of one xattr (name/value pair); shared by xrefs via refcnt. */
+struct jffs2_xattr_datum
+{
+        void *always_null;              /* NOTE(review): purpose not visible here -- confirm */
+        struct jffs2_raw_node_ref *node;        /* current raw node; the next_in_ino chain
+                                                 * is walked until it points back at this
+                                                 * object */
+        uint8_t class;
+        uint8_t flags;                  /* JFFS2_XFLAGS_* */
+        uint16_t xprefix;               /* see JFFS2_XPREFIX_* */
+        struct list_head xindex;        /* chained from c->xattrindex[n]; also used for
+                                         * c->xattr_unchecked and c->xattr_dead_list */
+        atomic_t refcnt;                /* # of xattr_ref refers this */
+        uint32_t xid;
+        uint32_t version;
+        uint32_t data_crc;
+        uint32_t hashkey;
+        char *xname;            /* XATTR name without prefix; NULL while uncached */
+        uint32_t name_len;      /* length of xname */
+        char *xvalue;           /* XATTR value */
+        uint32_t value_len;     /* length of xvalue */
+};
+struct jffs2_inode_cache;
+/* In-memory object binding one xdatum to one inode. */
+struct jffs2_xattr_ref
+{
+        void *always_null;              /* NOTE(review): purpose not visible here -- confirm */
+        struct jffs2_raw_node_ref *node;        /* current raw node of this xref */
+        uint8_t class;
+        uint8_t flags;          /* Currently unused */
+        u16 unused;
+        uint32_t xseqno;        /* XREF_DELETE_MARKER bit set: this xref is dead */
+        union {
+                struct jffs2_inode_cache *ic;   /* reference to jffs2_inode_cache */
+                uint32_t ino;                   /* only used in scanning/building  */
+        };
+        union {
+                struct jffs2_xattr_datum *xd;   /* reference to jffs2_xattr_datum */
+                uint32_t xid;                   /* only used in scanning/building */
+        };
+        struct jffs2_xattr_ref *next;           /* chained from ic->xref or c->xref_dead_list */
+};
+#define XREF_DELETE_MARKER      (0x00000001)
+/* Returns 1 when the delete marker is set in ref->xseqno, 0 otherwise. */
+static inline int is_xattr_ref_dead(struct jffs2_xattr_ref *ref)
+{
+        if (ref->xseqno & XREF_DELETE_MARKER)
+                return 1;
+        return 0;
+}
+#ifdef CONFIG_JFFS2_FS_XATTR
+extern void jffs2_init_xattr_subsystem(struct jffs2_sb_info *c);
+extern void jffs2_build_xattr_subsystem(struct jffs2_sb_info *c);
+extern void jffs2_clear_xattr_subsystem(struct jffs2_sb_info *c);
+extern struct jffs2_xattr_datum *jffs2_setup_xattr_datum(struct jffs2_sb_info *c,
+                                                  uint32_t xid, uint32_t version);
+extern void jffs2_xattr_delete_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic);
+extern void jffs2_xattr_free_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic);
+extern int jffs2_garbage_collect_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd,
+                                             struct jffs2_raw_node_ref *raw);
+extern int jffs2_garbage_collect_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref,
+                                           struct jffs2_raw_node_ref *raw);
+extern int jffs2_verify_xattr(struct jffs2_sb_info *c);
+extern void jffs2_release_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd);
+extern void jffs2_release_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref);
+extern int do_jffs2_getxattr(struct inode *inode, int xprefix, const char *xname,
+                             char *buffer, size_t size);
+extern int do_jffs2_setxattr(struct inode *inode, int xprefix, const char *xname,
+                             const char *buffer, size_t size, int flags);
+extern struct xattr_handler *jffs2_xattr_handlers[];
+extern struct xattr_handler jffs2_user_xattr_handler;
+extern struct xattr_handler jffs2_trusted_xattr_handler;
+extern ssize_t jffs2_listxattr(struct dentry *, char *, size_t);
+#define jffs2_getxattr          generic_getxattr
+#define jffs2_setxattr          generic_setxattr
+#define jffs2_removexattr       generic_removexattr
+#else
+#define jffs2_init_xattr_subsystem(c)
+#define jffs2_build_xattr_subsystem(c)
+#define jffs2_clear_xattr_subsystem(c)
+#define jffs2_xattr_delete_inode(c, ic)
+#define jffs2_xattr_free_inode(c, ic)
+#define jffs2_verify_xattr(c)                   (1)
+#define jffs2_xattr_handlers    NULL
+#define jffs2_listxattr         NULL
+#define jffs2_getxattr          NULL
+#define jffs2_setxattr          NULL
+#define jffs2_removexattr       NULL
+#endif /* CONFIG_JFFS2_FS_XATTR */
+#ifdef CONFIG_JFFS2_FS_SECURITY
+extern int jffs2_init_security(struct inode *inode, struct inode *dir);
+extern struct xattr_handler jffs2_security_xattr_handler;
+#else
+#define jffs2_init_security(inode,dir)  (0)
+#endif /* CONFIG_JFFS2_FS_SECURITY */
+#endif /* _JFFS2_FS_XATTR_H_ */
diff --git a/fs/jffs2/xattr_trusted.c b/fs/jffs2/xattr_trusted.c
new file mode 100644
index 000000000000..ed046e19dbfa
--- /dev/null
+++ b/fs/jffs2/xattr_trusted.c
@@ -0,0 +1,52 @@
+/*
+ * JFFS2 -- Journalling Flash File System, Version 2.
+ *
+ * Copyright (C) 2006  NEC Corporation
+ *
+ * Created by KaiGai Kohei <kaigai@ak.jp.nec.com>
+ *
+ * For licensing information, see the file 'LICENCE' in this directory.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/jffs2.h>
+#include <linux/xattr.h>
+#include <linux/mtd/mtd.h>
+#include "nodelist.h"
+/* "get" handler for the trusted prefix: rejects an empty name, then does the lookup. */
+static int jffs2_trusted_getxattr(struct inode *inode, const char *name,
+                                  void *buffer, size_t size)
+{
+        if (name[0] == '\0')
+                return -EINVAL;
+        return do_jffs2_getxattr(inode, JFFS2_XPREFIX_TRUSTED, name, buffer, size);
+}
+/* "set" handler for the trusted prefix: rejects an empty name, then stores the value. */
+static int jffs2_trusted_setxattr(struct inode *inode, const char *name, const void *buffer,
+                                  size_t size, int flags)
+{
+        if (name[0] == '\0')
+                return -EINVAL;
+        return do_jffs2_setxattr(inode, JFFS2_XPREFIX_TRUSTED, name, buffer, size, flags);
+}
+/*
+ * "list" handler for the trusted prefix. Returns the number of bytes needed
+ * for the prefixed name including its NUL; the name is written to list only
+ * when list is non-NULL and list_size is large enough.
+ */
+static size_t jffs2_trusted_listxattr(struct inode *inode, char *list, size_t list_size,
+                                      const char *name, size_t name_len)
+{
+        const size_t needed = XATTR_TRUSTED_PREFIX_LEN + name_len + 1;
+        if (!list || needed > list_size)
+                return needed;
+        strcpy(list, XATTR_TRUSTED_PREFIX);
+        strcpy(list + XATTR_TRUSTED_PREFIX_LEN, name);
+        return needed;
+}
+/* Handler table entry tying XATTR_TRUSTED_PREFIX to the three functions above. */
+struct xattr_handler jffs2_trusted_xattr_handler = {
+        .prefix = XATTR_TRUSTED_PREFIX,
+        .list = jffs2_trusted_listxattr,
+        .set = jffs2_trusted_setxattr,
+        .get = jffs2_trusted_getxattr
+};
diff --git a/fs/jffs2/xattr_user.c b/fs/jffs2/xattr_user.c
new file mode 100644
index 000000000000..2f8e9aa01ea0
--- /dev/null
+++ b/fs/jffs2/xattr_user.c
@@ -0,0 +1,52 @@
+/*
+ * JFFS2 -- Journalling Flash File System, Version 2.
+ *
+ * Copyright (C) 2006  NEC Corporation
+ *
+ * Created by KaiGai Kohei <kaigai@ak.jp.nec.com>
+ *
+ * For licensing information, see the file 'LICENCE' in this directory.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/jffs2.h>
+#include <linux/xattr.h>
+#include <linux/mtd/mtd.h>
+#include "nodelist.h"
+/* "get" handler for the user prefix: rejects an empty name, then does the lookup. */
+static int jffs2_user_getxattr(struct inode *inode, const char *name,
+                               void *buffer, size_t size)
+{
+        if (name[0] == '\0')
+                return -EINVAL;
+        return do_jffs2_getxattr(inode, JFFS2_XPREFIX_USER, name, buffer, size);
+}
+/* "set" handler for the user prefix: rejects an empty name, then stores the value. */
+static int jffs2_user_setxattr(struct inode *inode, const char *name, const void *buffer,
+                               size_t size, int flags)
+{
+        if (name[0] == '\0')
+                return -EINVAL;
+        return do_jffs2_setxattr(inode, JFFS2_XPREFIX_USER, name, buffer, size, flags);
+}
+/*
+ * "list" handler for the user prefix. Returns the number of bytes needed
+ * for the prefixed name including its NUL; the name is written to list only
+ * when list is non-NULL and list_size is large enough.
+ */
+static size_t jffs2_user_listxattr(struct inode *inode, char *list, size_t list_size,
+                                   const char *name, size_t name_len)
+{
+        const size_t needed = XATTR_USER_PREFIX_LEN + name_len + 1;
+        if (!list || needed > list_size)
+                return needed;
+        strcpy(list, XATTR_USER_PREFIX);
+        strcpy(list + XATTR_USER_PREFIX_LEN, name);
+        return needed;
+}
+/* Handler table entry tying XATTR_USER_PREFIX to the three functions above. */
+struct xattr_handler jffs2_user_xattr_handler = {
+        .prefix = XATTR_USER_PREFIX,
+        .list = jffs2_user_listxattr,
+        .set = jffs2_user_setxattr,
+        .get = jffs2_user_getxattr
+};
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index 04eb78f1252e..43e3f566aad6 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -305,7 +305,7 @@ static ssize_t jfs_direct_IO(int rw, struct kiocb *iocb,
                                offset, nr_segs, jfs_get_block, NULL);
 }
-struct address_space_operations jfs_aops = {
+const struct address_space_operations jfs_aops = {
        .readpage       = jfs_readpage,
        .readpages      = jfs_readpages,
        .writepage      = jfs_writepage,
diff --git a/fs/jfs/jfs_extent.c b/fs/jfs/jfs_extent.c
index 5549378358bf..4d52593a5fc6 100644
--- a/fs/jfs/jfs_extent.c
+++ b/fs/jfs/jfs_extent.c
@@ -126,7 +126,7 @@ extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, boolean_t abnr)
        /* allocate the disk blocks for the extent.  initially, extBalloc()
         * will try to allocate disk blocks for the requested size (xlen). 
-         * if this fails (xlen contigious free blocks not avaliable), it'll
+         * if this fails (xlen contiguous free blocks not avaliable), it'll
         * try to allocate a smaller number of blocks (producing a smaller
         * extent), with this smaller number of blocks consisting of the
         * requested number of blocks rounded down to the next smaller
@@ -493,7 +493,7 @@ int extFill(struct inode *ip, xad_t * xp)
 *
 *              initially, we will try to allocate disk blocks for the
 *              requested size (nblocks).  if this fails (nblocks 
- *              contigious free blocks not avaliable), we'll try to allocate
+ *              contiguous free blocks not avaliable), we'll try to allocate
 *              a smaller number of blocks (producing a smaller extent), with
 *              this smaller number of blocks consisting of the requested
 *              number of blocks rounded down to the next smaller power of 2
@@ -529,7 +529,7 @@ extBalloc(struct inode *ip, s64 hint, s64 * nblocks, s64 * blkno)
        /* get the number of blocks to initially attempt to allocate.
         * we'll first try the number of blocks requested unless this
-         * number is greater than the maximum number of contigious free
+         * number is greater than the maximum number of contiguous free
         * blocks in the map. in that case, we'll start off with the 
         * maximum free.
         */
@@ -586,7 +586,7 @@ extBalloc(struct inode *ip, s64 hint, s64 * nblocks, s64 * blkno)
 *              in place.  if this fails, we'll try to move the extent
 *              to a new set of blocks. if moving the extent, we initially
 *              will try to allocate disk blocks for the requested size
- *              (nnew).  if this fails  (nnew contigious free blocks not
+ *              (nnew).  if this fails  (nnew contiguous free blocks not
 *              avaliable), we'll try  to allocate a smaller number of
 *              blocks (producing a smaller extent), with this smaller
 *              number of blocks consisting of the requested number of
diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h
index c30072674464..b5c7da6190dc 100644
--- a/fs/jfs/jfs_inode.h
+++ b/fs/jfs/jfs_inode.h
@@ -33,7 +33,7 @@ extern void jfs_free_zero_link(struct inode *);
 extern struct dentry *jfs_get_parent(struct dentry *dentry);
 extern void jfs_set_inode_flags(struct inode *);
-extern struct address_space_operations jfs_aops;
+extern const struct address_space_operations jfs_aops;
 extern struct inode_operations jfs_dir_inode_operations;
 extern const struct file_operations jfs_dir_operations;
 extern struct inode_operations jfs_file_inode_operations;
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index 2b220dd6b4e7..e1e0a6e6ebdf 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -577,7 +577,7 @@ static void metapage_invalidatepage(struct page *page, unsigned long offset)
        metapage_releasepage(page, 0);
 }
-struct address_space_operations jfs_metapage_aops = {
+const struct address_space_operations jfs_metapage_aops = {
        .readpage       = metapage_readpage,
        .writepage      = metapage_writepage,
        .sync_page      = block_sync_page,
@@ -632,10 +632,9 @@ struct metapage *__get_metapage(struct inode *inode, unsigned long lblock,
                }
                SetPageUptodate(page);
        } else {
-                page = read_cache_page(mapping, page_index,
+                page = read_mapping_page(mapping, page_index, NULL);
-                            (filler_t *)mapping->a_ops->readpage, NULL);
                if (IS_ERR(page) || !PageUptodate(page)) {
-                        jfs_err("read_cache_page failed!");
+                        jfs_err("read_mapping_page failed!");
                        return NULL;
                }
                lock_page(page);
diff --git a/fs/jfs/jfs_metapage.h b/fs/jfs/jfs_metapage.h
index f0b7d3282b07..d17a3290f5aa 100644
--- a/fs/jfs/jfs_metapage.h
+++ b/fs/jfs/jfs_metapage.h
@@ -139,7 +139,7 @@ static inline void metapage_homeok(struct metapage *mp)
        put_metapage(mp);
 }
-extern struct address_space_operations jfs_metapage_aops;
+extern const struct address_space_operations jfs_metapage_aops;
 /*
 * This routines invalidate all pages for an extent.
diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c
index ac3d66948e8c..10c46231ce15 100644
--- a/fs/jfs/jfs_txnmgr.c
+++ b/fs/jfs/jfs_txnmgr.c
@@ -842,7 +842,7 @@ struct tlock *txLock(tid_t tid, struct inode *ip, struct metapage * mp,
        TXN_UNLOCK();
        release_metapage(mp);
        TXN_LOCK();
-        xtid = tlck->tid;       /* reaquire after dropping TXN_LOCK */
+        xtid = tlck->tid;       /* reacquire after dropping TXN_LOCK */
        jfs_info("txLock: in waitLock, tid = %d, xtid = %d, lid = %d",
                 tid, xtid, lid);
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index db6f41d6dd60..4f6cfebc82db 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -18,7 +18,6 @@
 */
 #include <linux/fs.h>
-#include <linux/config.h>
 #include <linux/module.h>
 #include <linux/parser.h>
 #include <linux/completion.h>
@@ -139,9 +138,9 @@ static void jfs_destroy_inode(struct inode *inode)
        kmem_cache_free(jfs_inode_cachep, ji);
 }
-static int jfs_statfs(struct super_block *sb, struct kstatfs *buf)
+static int jfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
-        struct jfs_sb_info *sbi = JFS_SBI(sb);
+        struct jfs_sb_info *sbi = JFS_SBI(dentry->d_sb);
        s64 maxinodes;
        struct inomap *imap = JFS_IP(sbi->ipimap)->i_imap;
@@ -565,10 +564,11 @@ static void jfs_unlockfs(struct super_block *sb)
        }
 }
-static struct super_block *jfs_get_sb(struct file_system_type *fs_type, 
+static int jfs_get_sb(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data)
+        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-        return get_sb_bdev(fs_type, flags, dev_name, data, jfs_fill_super);
+        return get_sb_bdev(fs_type, flags, dev_name, data, jfs_fill_super,
+                           mnt);
 }
 static int jfs_sync_fs(struct super_block *sb, int wait)
diff --git a/fs/libfs.c b/fs/libfs.c
index 7145ba7a48d0..ac02ea602c3d 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -20,9 +20,9 @@ int simple_getattr(struct vfsmount *mnt, struct dentry *dentry,
        return 0;
 }
-int simple_statfs(struct super_block *sb, struct kstatfs *buf)
+int simple_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
-        buf->f_type = sb->s_magic;
+        buf->f_type = dentry->d_sb->s_magic;
        buf->f_bsize = PAGE_CACHE_SIZE;
        buf->f_namelen = NAME_MAX;
        return 0;
@@ -149,10 +149,9 @@ int dcache_readdir(struct file * filp, void * dirent, filldir_t filldir)
                        /* fallthrough */
                default:
                        spin_lock(&dcache_lock);
-                        if (filp->f_pos == 2) {
+                        if (filp->f_pos == 2)
-                                list_del(q);
+                                list_move(q, &dentry->d_subdirs);
-                                list_add(q, &dentry->d_subdirs);
-                        }
                        for (p=q->next; p != &dentry->d_subdirs; p=p->next) {
                                struct dentry *next;
                                next = list_entry(p, struct dentry, d_u.d_child);
@@ -164,8 +163,7 @@ int dcache_readdir(struct file * filp, void * dirent, filldir_t filldir)
                                        return 0;
                                spin_lock(&dcache_lock);
                                /* next is still alive */
-                                list_del(q);
+                                list_move(q, p);
-                                list_add(q, p);
                                p = q;
                                filp->f_pos++;
                        }
@@ -196,9 +194,9 @@ struct inode_operations simple_dir_inode_operations = {
 * Common helper for pseudo-filesystems (sockfs, pipefs, bdev - stuff that
 * will never be mountable)
 */
-struct super_block *
+int get_sb_pseudo(struct file_system_type *fs_type, char *name,
-get_sb_pseudo(struct file_system_type *fs_type, char *name,
+        struct super_operations *ops, unsigned long magic,
-        struct super_operations *ops, unsigned long magic)
+        struct vfsmount *mnt)
 {
        struct super_block *s = sget(fs_type, NULL, set_anon_super, NULL);
        static struct super_operations default_ops = {.statfs = simple_statfs};
@@ -207,7 +205,7 @@ get_sb_pseudo(struct file_system_type *fs_type, char *name,
        struct qstr d_name = {.name = name, .len = strlen(name)};
        if (IS_ERR(s))
-                return s;
+                return PTR_ERR(s);
        s->s_flags = MS_NOUSER;
        s->s_maxbytes = ~0ULL;
@@ -232,12 +230,12 @@ get_sb_pseudo(struct file_system_type *fs_type, char *name,
        d_instantiate(dentry, root);
        s->s_root = dentry;
        s->s_flags |= MS_ACTIVE;
-        return s;
+        return simple_set_mnt(mnt, s);
 Enomem:
        up_write(&s->s_umount);
        deactivate_super(s);
-        return ERR_PTR(-ENOMEM);
+        return -ENOMEM;
 }
 int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
@@ -424,13 +422,13 @@ out:
 static DEFINE_SPINLOCK(pin_fs_lock);
-int simple_pin_fs(char *name, struct vfsmount **mount, int *count)
+int simple_pin_fs(struct file_system_type *type, struct vfsmount **mount, int *count)
 {
        struct vfsmount *mnt = NULL;
        spin_lock(&pin_fs_lock);
        if (unlikely(!*mount)) {
                spin_unlock(&pin_fs_lock);
-                mnt = do_kern_mount(name, 0, name, NULL);
+                mnt = vfs_kern_mount(type, 0, type->name, NULL);
                if (IS_ERR(mnt))
                        return PTR_ERR(mnt);
                spin_lock(&pin_fs_lock);
diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index bce744468708..52774feab93f 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -147,11 +147,10 @@ u32 nlmclnt_grant(const struct sockaddr_in *addr, const struct nlm_lock *lock)
 * Someone has sent us an SM_NOTIFY. Ensure we bind to the new port number,
 * that we mark locks for reclaiming, and that we bump the pseudo NSM state.
 */
-static inline
+static void nlmclnt_prepare_reclaim(struct nlm_host *host)
-void nlmclnt_prepare_reclaim(struct nlm_host *host, u32 newstate)
 {
+        down_write(&host->h_rwsem);
        host->h_monitored = 0;
-        host->h_nsmstate = newstate;
        host->h_state++;
        host->h_nextrebind = 0;
        nlm_rebind_host(host);
@@ -164,6 +163,13 @@ void nlmclnt_prepare_reclaim(struct nlm_host *host, u32 newstate)
        dprintk("NLM: reclaiming locks for host %s", host->h_name);
 }
+/*
+ * Counterpart of nlmclnt_prepare_reclaim(): reset the host's h_reclaiming
+ * counter and release h_rwsem, which nlmclnt_prepare_reclaim() acquired
+ * for writing.
+ */
+static void nlmclnt_finish_reclaim(struct nlm_host *host)
+{
+        host->h_reclaiming = 0;
+        up_write(&host->h_rwsem);
+        /* Terminate the message so it does not run into the next log line. */
+        dprintk("NLM: done reclaiming locks for host %s\n", host->h_name);
+}
 /*
 * Reclaim all locks on server host. We do this by spawning a separate
 * reclaimer thread.
@@ -171,12 +177,10 @@ void nlmclnt_prepare_reclaim(struct nlm_host *host, u32 newstate)
 void
 nlmclnt_recovery(struct nlm_host *host, u32 newstate)
 {
-        if (host->h_reclaiming++) {
+        if (host->h_nsmstate == newstate)
-                if (host->h_nsmstate == newstate)
+                return;
-                        return;
+        host->h_nsmstate = newstate;
-                nlmclnt_prepare_reclaim(host, newstate);
+        if (!host->h_reclaiming++) {
-        } else {
-                nlmclnt_prepare_reclaim(host, newstate);
                nlm_get_host(host);
                __module_get(THIS_MODULE);
                if (kernel_thread(reclaimer, host, CLONE_KERNEL) < 0)
@@ -190,6 +194,7 @@ reclaimer(void *ptr)
        struct nlm_host   *host = (struct nlm_host *) ptr;
        struct nlm_wait   *block;
        struct file_lock *fl, *next;
+        u32 nsmstate;
        daemonize("%s-reclaim", host->h_name);
        allow_signal(SIGKILL);
@@ -199,19 +204,25 @@ reclaimer(void *ptr)
        lock_kernel();
        lockd_up();
+        nlmclnt_prepare_reclaim(host);
        /* First, reclaim all locks that have been marked. */
 restart:
+        nsmstate = host->h_nsmstate;
        list_for_each_entry_safe(fl, next, &host->h_reclaim, fl_u.nfs_fl.list) {
                list_del_init(&fl->fl_u.nfs_fl.list);
                if (signalled())
                        continue;
-                if (nlmclnt_reclaim(host, fl) == 0)
+                if (nlmclnt_reclaim(host, fl) != 0)
-                        list_add_tail(&fl->fl_u.nfs_fl.list, &host->h_granted);
+                        continue;
-                goto restart;
+                list_add_tail(&fl->fl_u.nfs_fl.list, &host->h_granted);
+                if (host->h_nsmstate != nsmstate) {
+                        /* Argh! The server rebooted again! */
+                        list_splice_init(&host->h_granted, &host->h_reclaim);
+                        goto restart;
+                }
        }
+        nlmclnt_finish_reclaim(host);
-        host->h_reclaiming = 0;
        /* Now, wake up all processes that sleep on a blocked lock */
        list_for_each_entry(block, &nlm_blocked, b_list) {
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index f96e38155b5c..5980c45998cc 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -6,7 +6,6 @@
 * Copyright (C) 1996, Olaf Kirch <okir@monad.swb.de>
 */
-#include <linux/config.h>
 #include <linux/module.h>
 #include <linux/types.h>
 #include <linux/errno.h>
@@ -508,7 +507,10 @@ nlmclnt_lock(struct nlm_rqst *req, struct file_lock *fl)
        }
        block = nlmclnt_prepare_block(host, fl);
+again:
        for(;;) {
+                /* Reboot protection */
+                fl->fl_u.nfs_fl.state = host->h_state;
                status = nlmclnt_call(req, NLMPROC_LOCK);
                if (status < 0)
                        goto out_unblock;
@@ -531,10 +533,16 @@ nlmclnt_lock(struct nlm_rqst *req, struct file_lock *fl)
        }
        if (resp->status == NLM_LCK_GRANTED) {
-                fl->fl_u.nfs_fl.state = host->h_state;
+                down_read(&host->h_rwsem);
+                /* Check whether or not the server has rebooted */
+                if (fl->fl_u.nfs_fl.state != host->h_state) {
+                        up_read(&host->h_rwsem);
+                        goto again;
+                }
                fl->fl_flags |= FL_SLEEP;
                /* Ensure the resulting lock will get added to granted list */
                do_vfs_lock(fl);
+                up_read(&host->h_rwsem);
        }
        status = nlm_stat_to_errno(resp->status);
 out_unblock:
@@ -596,6 +604,7 @@ nlmclnt_reclaim(struct nlm_host *host, struct file_lock *fl)
 static int
 nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl)
 {
+        struct nlm_host *host = req->a_host;
        struct nlm_res  *resp = &req->a_res;
        int             status;
@@ -604,7 +613,9 @@ nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl)
         * request, or to deny it with NLM_LCK_DENIED_GRACE_PERIOD. In either
         * case, we want to unlock.
         */
+        down_read(&host->h_rwsem);
        do_vfs_lock(fl);
+        up_read(&host->h_rwsem);
        if (req->a_flags & RPC_TASK_ASYNC)
                return nlm_async_call(req, NLMPROC_UNLOCK, &nlmclnt_unlock_ops);
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index 729ac427d359..38b0e8a1aec0 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -112,11 +112,12 @@ nlm_lookup_host(int server, struct sockaddr_in *sin,
        host->h_version    = version;
        host->h_proto      = proto;
        host->h_rpcclnt    = NULL;
-        init_MUTEX(&host->h_sema);
+        mutex_init(&host->h_mutex);
        host->h_nextrebind = jiffies + NLM_HOST_REBIND;
        host->h_expires    = jiffies + NLM_HOST_EXPIRE;
        atomic_set(&host->h_count, 1);
        init_waitqueue_head(&host->h_gracewait);
+        init_rwsem(&host->h_rwsem);
        host->h_state      = 0;                 /* pseudo NSM state */
        host->h_nsmstate   = 0;                 /* real NSM state */
        host->h_server     = server;
@@ -172,7 +173,7 @@ nlm_bind_host(struct nlm_host *host)
                        (unsigned)ntohl(host->h_addr.sin_addr.s_addr));
        /* Lock host handle */
-        down(&host->h_sema);
+        mutex_lock(&host->h_mutex);
        /* If we've already created an RPC client, check whether
         * RPC rebind is required
@@ -204,12 +205,12 @@ nlm_bind_host(struct nlm_host *host)
                host->h_rpcclnt = clnt;
        }
-        up(&host->h_sema);
+        mutex_unlock(&host->h_mutex);
        return clnt;
 forgetit:
        printk("lockd: couldn't create RPC handle for %s\n", host->h_name);
-        up(&host->h_sema);
+        mutex_unlock(&host->h_mutex);
        return NULL;
 }
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index fd56c8872f34..9a991b52c647 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -12,7 +12,6 @@
 * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
 */
-#include <linux/config.h>
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/sysctl.h>
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 3ef739120dff..baf5ae513481 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -20,7 +20,6 @@
 * Copyright (C) 1996, Olaf Kirch <okir@monad.swb.de>
 */
-#include <linux/config.h>
 #include <linux/types.h>
 #include <linux/errno.h>
 #include <linux/kernel.h>
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index d210cf304e92..dbb66a3b5cd9 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -7,7 +7,6 @@
 * Copyright (C) 1996, Olaf Kirch <okir@monad.swb.de>
 */
-#include <linux/config.h>
 #include <linux/types.h>
 #include <linux/time.h>
 #include <linux/slab.h>
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index a570e5c8a930..2a4df9b3779a 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -6,7 +6,6 @@
 * Copyright (C) 1996, Olaf Kirch <okir@monad.swb.de>
 */
-#include <linux/config.h>
 #include <linux/types.h>
 #include <linux/string.h>
 #include <linux/time.h>
diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c
index f22a3764461a..033ea4ac2c30 100644
--- a/fs/lockd/xdr.c
+++ b/fs/lockd/xdr.c
@@ -6,7 +6,6 @@
 * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
 */
-#include <linux/config.h>
 #include <linux/types.h>
 #include <linux/sched.h>
 #include <linux/utsname.h>
diff --git a/fs/locks.c b/fs/locks.c
index ab61a8b54829..1ad29c9b6252 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -703,7 +703,7 @@ EXPORT_SYMBOL(posix_test_lock);
 * from a broken NFS client. But broken NFS clients have a lot more to
 * worry about than proper deadlock detection anyway... --okir
 */
-int posix_locks_deadlock(struct file_lock *caller_fl,
+static int posix_locks_deadlock(struct file_lock *caller_fl,
                                struct file_lock *block_fl)
 {
        struct list_head *tmp;
@@ -722,8 +722,6 @@ next_task:
        return 0;
 }
-EXPORT_SYMBOL(posix_locks_deadlock);
 /* Try to create a FLOCK lock on filp. We always insert new FLOCK locks
 * at the head of the list, but that's secret knowledge known only to
 * flock_lock_file and posix_lock_file.
@@ -794,7 +792,8 @@ out:
 static int __posix_lock_file_conf(struct inode *inode, struct file_lock *request, struct file_lock *conflock)
 {
        struct file_lock *fl;
-        struct file_lock *new_fl, *new_fl2;
+        struct file_lock *new_fl = NULL;
+        struct file_lock *new_fl2 = NULL;
        struct file_lock *left = NULL;
        struct file_lock *right = NULL;
        struct file_lock **before;
@@ -803,9 +802,15 @@ static int __posix_lock_file_conf(struct inode *inode, struct file_lock *request
        /*
         * We may need two file_lock structures for this operation,
         * so we get them in advance to avoid races.
+         *
+         * In some cases we can be sure that no new locks will be needed
         */
-        new_fl = locks_alloc_lock();
+        if (!(request->fl_flags & FL_ACCESS) &&
-        new_fl2 = locks_alloc_lock();
+            (request->fl_type != F_UNLCK ||
+             request->fl_start != 0 || request->fl_end != OFFSET_MAX)) {
+                new_fl = locks_alloc_lock();
+                new_fl2 = locks_alloc_lock();
+        }
        lock_kernel();
        if (request->fl_type != F_UNLCK) {
@@ -834,14 +839,7 @@ static int __posix_lock_file_conf(struct inode *inode, struct file_lock *request
        if (request->fl_flags & FL_ACCESS)
                goto out;
-        error = -ENOLCK; /* "no luck" */
-        if (!(new_fl && new_fl2))
-                goto out;
        /*
-         * We've allocated the new locks in advance, so there are no
-         * errors possible (and no blocking operations) from here on.
-         * 
         * Find the first old lock with the same owner as the new lock.
         */
        
@@ -938,10 +936,25 @@ static int __posix_lock_file_conf(struct inode *inode, struct file_lock *request
                before = &fl->fl_next;
        }
+        /*
+         * The above code only modifies existing locks in case of
+         * merging or replacing.  If new lock(s) need to be inserted
+         * all modifications are done below this point, so it is still
+         * safe to bail out here.
+         */
+        error = -ENOLCK; /* "no luck" */
+        if (right && left == right && !new_fl2)
+                goto out;
        error = 0;
        if (!added) {
                if (request->fl_type == F_UNLCK)
                        goto out;
+                if (!new_fl) {
+                        error = -ENOLCK;
+                        goto out;
+                }
                locks_copy_lock(new_fl, request);
                locks_insert_lock(before, new_fl);
                new_fl = NULL;
@@ -1881,19 +1894,18 @@ out:
 */
 void locks_remove_posix(struct file *filp, fl_owner_t owner)
 {
-        struct file_lock lock, **before;
+        struct file_lock lock;
        /*
         * If there are no locks held on this file, we don't need to call
         * posix_lock_file().  Another process could be setting a lock on this
         * file at the same time, but we wouldn't remove that lock anyway.
         */
-        before = &filp->f_dentry->d_inode->i_flock;
+        if (!filp->f_dentry->d_inode->i_flock)
-        if (*before == NULL)
                return;
        lock.fl_type = F_UNLCK;
-        lock.fl_flags = FL_POSIX;
+        lock.fl_flags = FL_POSIX | FL_CLOSE;
        lock.fl_start = 0;
        lock.fl_end = OFFSET_MAX;
        lock.fl_owner = owner;
@@ -1902,25 +1914,11 @@ void locks_remove_posix(struct file *filp, fl_owner_t owner)
        lock.fl_ops = NULL;
        lock.fl_lmops = NULL;
-        if (filp->f_op && filp->f_op->lock != NULL) {
+        if (filp->f_op && filp->f_op->lock != NULL)
                filp->f_op->lock(filp, F_SETLK, &lock);
-                goto out;
+        else
-        }
+                posix_lock_file(filp, &lock);
-        /* Can't use posix_lock_file here; we need to remove it no matter
-         * which pid we have.
-         */
-        lock_kernel();
-        while (*before != NULL) {
-                struct file_lock *fl = *before;
-                if (IS_POSIX(fl) && posix_same_owner(fl, &lock)) {
-                        locks_delete_lock(before);
-                        continue;
-                }
-                before = &fl->fl_next;
-        }
-        unlock_kernel();
-out:
        if (lock.fl_ops && lock.fl_ops->fl_release_private)
                lock.fl_ops->fl_release_private(&lock);
 }
@@ -2206,63 +2204,6 @@ int lock_may_write(struct inode *inode, loff_t start, unsigned long len)
 EXPORT_SYMBOL(lock_may_write);
-static inline void __steal_locks(struct file *file, fl_owner_t from)
-{
-        struct inode *inode = file->f_dentry->d_inode;
-        struct file_lock *fl = inode->i_flock;
-        while (fl) {
-                if (fl->fl_file == file && fl->fl_owner == from)
-                        fl->fl_owner = current->files;
-                fl = fl->fl_next;
-        }
-}
-/* When getting ready for executing a binary, we make sure that current
- * has a files_struct on its own. Before dropping the old files_struct,
- * we take over ownership of all locks for all file descriptors we own.
- * Note that we may accidentally steal a lock for a file that a sibling
- * has created since the unshare_files() call.
- */
-void steal_locks(fl_owner_t from)
-{
-        struct files_struct *files = current->files;
-        int i, j;
-        struct fdtable *fdt;
-        if (from == files)
-                return;
-        lock_kernel();
-        j = 0;
-        /*
-         * We are not taking a ref to the file structures, so
-         * we need to acquire ->file_lock.
-         */
-        spin_lock(&files->file_lock);
-        fdt = files_fdtable(files);
-        for (;;) {
-                unsigned long set;
-                i = j * __NFDBITS;
-                if (i >= fdt->max_fdset || i >= fdt->max_fds)
-                        break;
-                set = fdt->open_fds->fds_bits[j++];
-                while (set) {
-                        if (set & 1) {
-                                struct file *file = fdt->fd[i];
-                                if (file)
-                                        __steal_locks(file, from);
-                        }
-                        i++;
-                        set >>= 1;
-                }
-        }
-        spin_unlock(&files->file_lock);
-        unlock_kernel();
-}
-EXPORT_SYMBOL(steal_locks);
 static int __init filelock_init(void)
 {
        filelock_cache = kmem_cache_create("file_lock_cache",
diff --git a/fs/minix/dir.c b/fs/minix/dir.c
index 69224d1fe043..2b0a389d1987 100644
--- a/fs/minix/dir.c
+++ b/fs/minix/dir.c
@@ -60,8 +60,7 @@ static int dir_commit_chunk(struct page *page, unsigned from, unsigned to)
 static struct page * dir_get_page(struct inode *dir, unsigned long n)
 {
        struct address_space *mapping = dir->i_mapping;
-        struct page *page = read_cache_page(mapping, n,
+        struct page *page = read_mapping_page(mapping, n, NULL);
-                                (filler_t*)mapping->a_ops->readpage, NULL);
        if (!IS_ERR(page)) {
                wait_on_page_locked(page);
                kmap(page);
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 2dcccf1d1b7f..9ea91c5eeb7b 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -19,7 +19,7 @@
 static void minix_read_inode(struct inode * inode);
 static int minix_write_inode(struct inode * inode, int wait);
-static int minix_statfs(struct super_block *sb, struct kstatfs *buf);
+static int minix_statfs(struct dentry *dentry, struct kstatfs *buf);
 static int minix_remount (struct super_block * sb, int * flags, char * data);
 static void minix_delete_inode(struct inode *inode)
@@ -296,11 +296,11 @@ out_bad_sb:
        return -EINVAL;
 }
-static int minix_statfs(struct super_block *sb, struct kstatfs *buf)
+static int minix_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
-        struct minix_sb_info *sbi = minix_sb(sb);
+        struct minix_sb_info *sbi = minix_sb(dentry->d_sb);
-        buf->f_type = sb->s_magic;
+        buf->f_type = dentry->d_sb->s_magic;
-        buf->f_bsize = sb->s_blocksize;
+        buf->f_bsize = dentry->d_sb->s_blocksize;
        buf->f_blocks = (sbi->s_nzones - sbi->s_firstdatazone) << sbi->s_log_zone_size;
        buf->f_bfree = minix_count_free_blocks(sbi);
        buf->f_bavail = buf->f_bfree;
@@ -335,7 +335,7 @@ static sector_t minix_bmap(struct address_space *mapping, sector_t block)
 {
        return generic_block_bmap(mapping,block,minix_get_block);
 }
-static struct address_space_operations minix_aops = {
+static const struct address_space_operations minix_aops = {
        .readpage = minix_readpage,
        .writepage = minix_writepage,
        .sync_page = block_sync_page,
@@ -559,10 +559,11 @@ void minix_truncate(struct inode * inode)
                V2_minix_truncate(inode);
 }
-static struct super_block *minix_get_sb(struct file_system_type *fs_type,
+static int minix_get_sb(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data)
+        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-        return get_sb_bdev(fs_type, flags, dev_name, data, minix_fill_super);
+        return get_sb_bdev(fs_type, flags, dev_name, data, minix_fill_super,
+                           mnt);
 }
 static struct file_system_type minix_fs_type = {
diff --git a/fs/mpage.c b/fs/mpage.c
index 9bf2eb30e6f4..1e4598247d0b 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -707,9 +707,9 @@ mpage_writepages(struct address_space *mapping,
        struct pagevec pvec;
        int nr_pages;
        pgoff_t index;
-        pgoff_t end = -1;               /* Inclusive */
+        pgoff_t end;            /* Inclusive */
        int scanned = 0;
-        int is_range = 0;
+        int range_whole = 0;
        if (wbc->nonblocking && bdi_write_congested(bdi)) {
                wbc->encountered_congestion = 1;
@@ -721,16 +721,14 @@ mpage_writepages(struct address_space *mapping,
                writepage = mapping->a_ops->writepage;
        pagevec_init(&pvec, 0);
-        if (wbc->sync_mode == WB_SYNC_NONE) {
+        if (wbc->range_cyclic) {
                index = mapping->writeback_index; /* Start from prev offset */
+                end = -1;
        } else {
-                index = 0;                        /* whole-file sweep */
+                index = wbc->range_start >> PAGE_CACHE_SHIFT;
-                scanned = 1;
+                end = wbc->range_end >> PAGE_CACHE_SHIFT;
-        }
+                if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
-        if (wbc->start || wbc->end) {
+                        range_whole = 1;
-                index = wbc->start >> PAGE_CACHE_SHIFT;
-                end = wbc->end >> PAGE_CACHE_SHIFT;
-                is_range = 1;
                scanned = 1;
        }
 retry:
@@ -759,7 +757,7 @@ retry:
                                continue;
                        }
-                        if (unlikely(is_range) && page->index > end) {
+                        if (!wbc->range_cyclic && page->index > end) {
                                done = 1;
                                unlock_page(page);
                                continue;
@@ -810,7 +808,7 @@ retry:
                index = 0;
                goto retry;
        }
-        if (!is_range)
+        if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
                mapping->writeback_index = index;
        if (bio)
                mpage_bio_submit(WRITE, bio);
diff --git a/fs/msdos/namei.c b/fs/msdos/namei.c
index 5b76ccd19e3f..9e44158a7540 100644
--- a/fs/msdos/namei.c
+++ b/fs/msdos/namei.c
@@ -661,11 +661,12 @@ static int msdos_fill_super(struct super_block *sb, void *data, int silent)
        return 0;
 }
-static struct super_block *msdos_get_sb(struct file_system_type *fs_type,
+static int msdos_get_sb(struct file_system_type *fs_type,
-                                        int flags, const char *dev_name,
+                        int flags, const char *dev_name,
-                                        void *data)
+                        void *data, struct vfsmount *mnt)
 {
-        return get_sb_bdev(fs_type, flags, dev_name, data, msdos_fill_super);
+        return get_sb_bdev(fs_type, flags, dev_name, data, msdos_fill_super,
+                           mnt);
 }
 static struct file_system_type msdos_fs_type = {
diff --git a/fs/namei.c b/fs/namei.c
index d6e2ee251736..c784e8bb57a3 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1127,7 +1127,7 @@ out:
        if (likely(retval == 0)) {
                if (unlikely(current->audit_context && nd && nd->dentry &&
                                nd->dentry->d_inode))
-                audit_inode(name, nd->dentry->d_inode, flags);
+                audit_inode(name, nd->dentry->d_inode);
        }
 out_fail:
        return retval;
@@ -2243,14 +2243,16 @@ asmlinkage long sys_linkat(int olddfd, const char __user *oldname,
        int error;
        char * to;
-        if (flags != 0)
+        if ((flags & ~AT_SYMLINK_FOLLOW) != 0)
                return -EINVAL;
        to = getname(newname);
        if (IS_ERR(to))
                return PTR_ERR(to);
-        error = __user_walk_fd(olddfd, oldname, 0, &old_nd);
+        error = __user_walk_fd(olddfd, oldname,
+                               flags & AT_SYMLINK_FOLLOW ? LOOKUP_FOLLOW : 0,
+                               &old_nd);
        if (error)
                goto exit;
        error = do_path_lookup(newdfd, to, LOOKUP_PARENT, &nd);
@@ -2577,8 +2579,7 @@ static char *page_getlink(struct dentry * dentry, struct page **ppage)
 {
        struct page * page;
        struct address_space *mapping = dentry->d_inode->i_mapping;
-        page = read_cache_page(mapping, 0, (filler_t *)mapping->a_ops->readpage,
+        page = read_mapping_page(mapping, 0, NULL);
-                                NULL);
        if (IS_ERR(page))
                goto sync_fail;
        wait_on_page_locked(page);
diff --git a/fs/namespace.c b/fs/namespace.c
index bf478addb852..fa7ed6a9fc2d 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -8,7 +8,6 @@
 * Heavily rewritten.
 */
-#include <linux/config.h>
 #include <linux/syscalls.h>
 #include <linux/slab.h>
 #include <linux/sched.h>
@@ -86,6 +85,15 @@ struct vfsmount *alloc_vfsmnt(const char *name)
        return mnt;
 }
+/*
+ * Attach superblock @sb to mount @mnt: record @sb in mnt->mnt_sb and set
+ * mnt->mnt_root to the value returned by dget(sb->s_root).
+ *
+ * Always returns 0.
+ */
+int simple_set_mnt(struct vfsmount *mnt, struct super_block *sb)
+{
+        mnt->mnt_sb = sb;
+        mnt->mnt_root = dget(sb->s_root);
+        return 0;
+}
+EXPORT_SYMBOL(simple_set_mnt);
 void free_vfsmnt(struct vfsmount *mnt)
 {
        kfree(mnt->mnt_devname);
@@ -517,10 +525,8 @@ void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill)
 {
        struct vfsmount *p;
-        for (p = mnt; p; p = next_mnt(p, mnt)) {
+        for (p = mnt; p; p = next_mnt(p, mnt))
-                list_del(&p->mnt_hash);
+                list_move(&p->mnt_hash, kill);
-                list_add(&p->mnt_hash, kill);
-        }
        if (propagate)
                propagate_umount(kill);
@@ -576,8 +582,8 @@ static int do_umount(struct vfsmount *mnt, int flags)
         */
        lock_kernel();
-        if ((flags & MNT_FORCE) && sb->s_op->umount_begin)
+        if (sb->s_op->umount_begin)
-                sb->s_op->umount_begin(sb);
+                sb->s_op->umount_begin(mnt, flags);
        unlock_kernel();
        /*
@@ -1163,13 +1169,46 @@ static void expire_mount(struct vfsmount *mnt, struct list_head *mounts,
 }
 /*
+ * go through the vfsmounts we've just consigned to the graveyard to
+ * - check that they're still dead
+ * - delete the vfsmount from the appropriate namespace under lock
+ * - dispose of the corpse
+ *
+ * @graveyard: list of vfsmounts linked through their mnt_expire member
+ * @mounts:    list handed through unchanged to expire_mount()
+ *
+ * Both callers invoke this with vfsmount_lock held.  The lock is dropped
+ * around the namespace_sem section of each iteration and re-taken before
+ * the next one, so it is held again on return.
+ */
+static void expire_mount_list(struct list_head *graveyard, struct list_head *mounts)
+{
+        struct namespace *namespace;
+        struct vfsmount *mnt;
+        while (!list_empty(graveyard)) {
+                LIST_HEAD(umounts);
+                /* take the first graveyard entry and unlink it */
+                mnt = list_entry(graveyard->next, struct vfsmount, mnt_expire);
+                list_del_init(&mnt->mnt_expire);
+                /* don't do anything if the namespace is dead - all the
+                 * vfsmounts from it are going away anyway */
+                namespace = mnt->mnt_namespace;
+                if (!namespace || !namespace->root)
+                        continue;
+                get_namespace(namespace);
+                /* vfsmount_lock is released before namespace_sem is taken */
+                spin_unlock(&vfsmount_lock);
+                down_write(&namespace_sem);
+                expire_mount(mnt, mounts, &umounts);
+                up_write(&namespace_sem);
+                release_mounts(&umounts);
+                mntput(mnt);
+                put_namespace(namespace);
+                /* re-take vfsmount_lock for the next list_empty() check */
+                spin_lock(&vfsmount_lock);
+        }
+}
+/*
 * process a list of expirable mountpoints with the intent of discarding any
 * mountpoints that aren't in use and haven't been touched since last we came
 * here
 */
 void mark_mounts_for_expiry(struct list_head *mounts)
 {
-        struct namespace *namespace;
        struct vfsmount *mnt, *next;
        LIST_HEAD(graveyard);
@@ -1193,38 +1232,79 @@ void mark_mounts_for_expiry(struct list_head *mounts)
                list_move(&mnt->mnt_expire, &graveyard);
        }
-        /*
+        expire_mount_list(&graveyard, mounts);
-         * go through the vfsmounts we've just consigned to the graveyard to
-         * - check that they're still dead
-         * - delete the vfsmount from the appropriate namespace under lock
-         * - dispose of the corpse
-         */
-        while (!list_empty(&graveyard)) {
-                LIST_HEAD(umounts);
-                mnt = list_entry(graveyard.next, struct vfsmount, mnt_expire);
-                list_del_init(&mnt->mnt_expire);
-                /* don't do anything if the namespace is dead - all the
+        spin_unlock(&vfsmount_lock);
-                 * vfsmounts from it are going away anyway */
+}
-                namespace = mnt->mnt_namespace;
-                if (!namespace || !namespace->root)
+EXPORT_SYMBOL_GPL(mark_mounts_for_expiry);
+/*
+ * Ripoff of 'select_parent()'
+ *
+ * search the list of submounts for a given mountpoint, and move any
+ * shrinkable submounts to the 'graveyard' list.
+ */
+static int select_submounts(struct vfsmount *parent, struct list_head *graveyard)
+{
+        struct vfsmount *this_parent = parent;
+        struct list_head *next;
+        int found = 0;
+repeat:
+        next = this_parent->mnt_mounts.next;
+resume:
+        while (next != &this_parent->mnt_mounts) {
+                struct list_head *tmp = next;
+                struct vfsmount *mnt = list_entry(tmp, struct vfsmount, mnt_child);
+                next = tmp->next;
+                if (!(mnt->mnt_flags & MNT_SHRINKABLE))
                        continue;
-                get_namespace(namespace);
+                /*
+                 * Descend a level if this mount's mnt_mounts list is non-empty.
+                 */
+                if (!list_empty(&mnt->mnt_mounts)) {
+                        this_parent = mnt;
+                        goto repeat;
+                }
-                spin_unlock(&vfsmount_lock);
+                if (!propagate_mount_busy(mnt, 1)) {
-                down_write(&namespace_sem);
+                        mntget(mnt);
-                expire_mount(mnt, mounts, &umounts);
+                        list_move_tail(&mnt->mnt_expire, graveyard);
-                up_write(&namespace_sem);
+                        found++;
-                release_mounts(&umounts);
+                }
-                mntput(mnt);
-                put_namespace(namespace);
-                spin_lock(&vfsmount_lock);
        }
+        /*
+         * All done at this level ... ascend and resume the search
+         */
+        if (this_parent != parent) {
+                next = this_parent->mnt_child.next;
+                this_parent = this_parent->mnt_parent;
+                goto resume;
+        }
+        return found;
+}
+/*
+ * process a list of expirable mountpoints with the intent of discarding any
+ * submounts of a specific parent mountpoint
+ */
+void shrink_submounts(struct vfsmount *mountpoint, struct list_head *mounts)
+{
+        LIST_HEAD(graveyard);
+        int found;
+        spin_lock(&vfsmount_lock);
+        /* extract submounts of 'mountpoint' from the expiration list */
+        while ((found = select_submounts(mountpoint, &graveyard)) != 0)
+                expire_mount_list(&graveyard, mounts);
        spin_unlock(&vfsmount_lock);
 }
-EXPORT_SYMBOL_GPL(mark_mounts_for_expiry);
+EXPORT_SYMBOL_GPL(shrink_submounts);
 /*
 * Some copy_from_user() implementations do not return the exact number of
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index f0860c602d8b..b4ee89250e95 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -10,7 +10,6 @@
 *
 */
-#include <linux/config.h>
 #include <linux/time.h>
 #include <linux/errno.h>
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index a1f3e972c6ef..1ddf77b0b825 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -9,7 +9,6 @@
 *
 */
-#include <linux/config.h>
 #include <linux/module.h>
 #include <asm/system.h>
@@ -39,7 +38,7 @@
 static void ncp_delete_inode(struct inode *);
 static void ncp_put_super(struct super_block *);
-static int  ncp_statfs(struct super_block *, struct kstatfs *);
+static int  ncp_statfs(struct dentry *, struct kstatfs *);
 static kmem_cache_t * ncp_inode_cachep;
@@ -105,7 +104,7 @@ static struct super_operations ncp_sops =
 extern struct dentry_operations ncp_root_dentry_operations;
 #if defined(CONFIG_NCPFS_EXTRAS) || defined(CONFIG_NCPFS_NFS_NS)
-extern struct address_space_operations ncp_symlink_aops;
+extern const struct address_space_operations ncp_symlink_aops;
 extern int ncp_symlink(struct inode*, struct dentry*, const char*);
 #endif
@@ -724,13 +723,14 @@ static void ncp_put_super(struct super_block *sb)
        kfree(server);
 }
-static int ncp_statfs(struct super_block *sb, struct kstatfs *buf)
+static int ncp_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
        struct dentry* d;
        struct inode* i;
        struct ncp_inode_info* ni;
        struct ncp_server* s;
        struct ncp_volume_info vi;
+        struct super_block *sb = dentry->d_sb;
        int err;
        __u8 dh;
        
@@ -957,10 +957,10 @@ out:
        return result;
 }
-static struct super_block *ncp_get_sb(struct file_system_type *fs_type,
+static int ncp_get_sb(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data)
+        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-        return get_sb_nodev(fs_type, flags, data, ncp_fill_super);
+        return get_sb_nodev(fs_type, flags, data, ncp_fill_super, mnt);
 }
 static struct file_system_type ncp_fs_type = {
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index eb3813ad136f..42039fe0653c 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -7,7 +7,6 @@
 *
 */
-#include <linux/config.h>
 #include <asm/uaccess.h>
 #include <linux/capability.h>
diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c
index 52d60c3d8996..e7d5a3097fe6 100644
--- a/fs/ncpfs/mmap.c
+++ b/fs/ncpfs/mmap.c
@@ -93,7 +93,7 @@ static struct page* ncp_file_mmap_nopage(struct vm_area_struct *area,
         */
        if (type)
                *type = VM_FAULT_MAJOR;
-        inc_page_state(pgmajfault);
+        count_vm_event(PGMAJFAULT);
        return page;
 }
diff --git a/fs/ncpfs/ncplib_kernel.c b/fs/ncpfs/ncplib_kernel.c
index d9ebf6439f59..551e0bac7aac 100644
--- a/fs/ncpfs/ncplib_kernel.c
+++ b/fs/ncpfs/ncplib_kernel.c
@@ -10,7 +10,6 @@
 */
-#include <linux/config.h>
 #include "ncplib_kernel.h"
diff --git a/fs/ncpfs/ncplib_kernel.h b/fs/ncpfs/ncplib_kernel.h
index 799e5c2bec55..2441d1ab57dc 100644
--- a/fs/ncpfs/ncplib_kernel.h
+++ b/fs/ncpfs/ncplib_kernel.h
@@ -12,7 +12,6 @@
 #ifndef _NCPLIB_H
 #define _NCPLIB_H
-#include <linux/config.h>
 #include <linux/fs.h>
 #include <linux/types.h>
diff --git a/fs/ncpfs/ncpsign_kernel.c b/fs/ncpfs/ncpsign_kernel.c
index a6ec90cd8894..749a18d33599 100644
--- a/fs/ncpfs/ncpsign_kernel.c
+++ b/fs/ncpfs/ncpsign_kernel.c
@@ -5,7 +5,6 @@
 *
 */
-#include <linux/config.h>
 #ifdef CONFIG_NCPFS_PACKET_SIGNING
diff --git a/fs/ncpfs/sock.c b/fs/ncpfs/sock.c
index 8783eb7ec641..11c2b252ebed 100644
--- a/fs/ncpfs/sock.c
+++ b/fs/ncpfs/sock.c
@@ -8,7 +8,6 @@
 *
 */
-#include <linux/config.h>
 #include <linux/time.h>
 #include <linux/errno.h>
diff --git a/fs/ncpfs/symlink.c b/fs/ncpfs/symlink.c
index e935f1b34bc2..ca92c2406635 100644
--- a/fs/ncpfs/symlink.c
+++ b/fs/ncpfs/symlink.c
@@ -20,7 +20,6 @@
 *
 */
-#include <linux/config.h>
 #include <asm/uaccess.h>
@@ -99,7 +98,7 @@ fail:
 /*
 * symlinks can't do much...
 */
-struct address_space_operations ncp_symlink_aops = {
+const struct address_space_operations ncp_symlink_aops = {
        .readpage       = ncp_symlink_readpage,
 };
        
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index ec61fd56a1a9..0b572a0c1967 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -4,14 +4,16 @@
 obj-$(CONFIG_NFS_FS) += nfs.o
-nfs-y                   := dir.o file.o inode.o nfs2xdr.o pagelist.o \
+nfs-y                   := dir.o file.o inode.o super.o nfs2xdr.o pagelist.o \
-                           proc.o read.o symlink.o unlink.o write.o
+                           proc.o read.o symlink.o unlink.o write.o \
+                           namespace.o
 nfs-$(CONFIG_ROOT_NFS)  += nfsroot.o mount_clnt.o      
 nfs-$(CONFIG_NFS_V3)    += nfs3proc.o nfs3xdr.o
 nfs-$(CONFIG_NFS_V3_ACL)        += nfs3acl.o
 nfs-$(CONFIG_NFS_V4)    += nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o \
                           delegation.o idmap.o \
-                           callback.o callback_xdr.o callback_proc.o
+                           callback.o callback_xdr.o callback_proc.o \
+                           nfs4namespace.o
 nfs-$(CONFIG_NFS_DIRECTIO) += direct.o
 nfs-$(CONFIG_SYSCTL) += sysctl.o
 nfs-objs                := $(nfs-y)
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 90c95adc8c1b..fe0a6b8ac149 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -6,7 +6,6 @@
 * NFSv4 callback handling
 */
-#include <linux/config.h>
 #include <linux/completion.h>
 #include <linux/ip.h>
 #include <linux/module.h>
@@ -182,8 +181,6 @@ static int nfs_callback_authenticate(struct svc_rqst *rqstp)
 /*
 * Define NFS4 callback program
 */
-extern struct svc_version nfs4_callback_version1;
 static struct svc_version *nfs4_callback_version[] = {
        [1] = &nfs4_callback_version1,
 };
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 462cfceb50c5..7719483ecdfc 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -5,7 +5,6 @@
 *
 * NFSv4 callback procedures
 */
-#include <linux/config.h>
 #include <linux/nfs4.h>
 #include <linux/nfs_fs.h>
 #include "nfs4_fs.h"
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 05c38cf40b69..29f932192054 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -5,7 +5,6 @@
 *
 * NFSv4 callback encode/decode procedures
 */
-#include <linux/config.h>
 #include <linux/kernel.h>
 #include <linux/sunrpc/svc.h>
 #include <linux/nfs4.h>
@@ -202,7 +201,7 @@ static unsigned decode_recall_args(struct svc_rqst *rqstp, struct xdr_stream *xd
        status = decode_fh(xdr, &args->fh);
 out:
        dprintk("%s: exit with status = %d\n", __FUNCTION__, status);
-        return 0;
+        return status;
 }
 static unsigned encode_string(struct xdr_stream *xdr, unsigned int len, const char *str)
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index d3be923d4e43..9540a316c05e 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -6,7 +6,6 @@
 * NFS file delegation management
 *
 */
-#include <linux/config.h>
 #include <linux/completion.h>
 #include <linux/kthread.h>
 #include <linux/module.h>
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index cae74dd4c7f5..3ddda6f7ecc2 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -528,7 +528,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
        lock_kernel();
-        res = nfs_revalidate_inode(NFS_SERVER(inode), inode);
+        res = nfs_revalidate_mapping(inode, filp->f_mapping);
        if (res < 0) {
                unlock_kernel();
                return res;
@@ -868,6 +868,17 @@ int nfs_is_exclusive_create(struct inode *dir, struct nameidata *nd)
        return (nd->intent.open.flags & O_EXCL) != 0;
 }
+static inline int nfs_reval_fsid(struct inode *dir,
+                struct nfs_fh *fh, struct nfs_fattr *fattr)
+{
+        struct nfs_server *server = NFS_SERVER(dir);
+        if (!nfs_fsid_equal(&server->fsid, &fattr->fsid))
+                /* Revalidate fsid on root dir */
+                return __nfs_revalidate_inode(server, dir->i_sb->s_root->d_inode);
+        return 0;
+}
 static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd)
 {
        struct dentry *res;
@@ -900,6 +911,11 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru
                res = ERR_PTR(error);
                goto out_unlock;
        }
+        error = nfs_reval_fsid(dir, &fhandle, &fattr);
+        if (error < 0) {
+                res = ERR_PTR(error);
+                goto out_unlock;
+        }
        inode = nfs_fhget(dentry->d_sb, &fhandle, &fattr);
        res = (struct dentry *)inode;
        if (IS_ERR(res))
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 3c72b0c07283..4cdd1b499e35 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -38,7 +38,6 @@
 *
 */
-#include <linux/config.h>
 #include <linux/errno.h>
 #include <linux/sched.h>
 #include <linux/kernel.h>
@@ -892,7 +891,7 @@ out:
 * nfs_init_directcache - create a slab cache for nfs_direct_req structures
 *
 */
-int nfs_init_directcache(void)
+int __init nfs_init_directcache(void)
 {
        nfs_direct_cachep = kmem_cache_create("nfs_direct_cache",
                                                sizeof(struct nfs_direct_req),
@@ -906,7 +905,7 @@ int nfs_init_directcache(void)
 }
 /**
- * nfs_init_directcache - destroy the slab cache for nfs_direct_req structures
+ * nfs_destroy_directcache - destroy the slab cache for nfs_direct_req structures
 *
 */
 void nfs_destroy_directcache(void)
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index fade02c15e6e..cc2b874ad5a4 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -43,7 +43,7 @@ static int  nfs_file_mmap(struct file *, struct vm_area_struct *);
 static ssize_t nfs_file_sendfile(struct file *, loff_t *, size_t, read_actor_t, void *);
 static ssize_t nfs_file_read(struct kiocb *, char __user *, size_t, loff_t);
 static ssize_t nfs_file_write(struct kiocb *, const char __user *, size_t, loff_t);
-static int  nfs_file_flush(struct file *);
+static int  nfs_file_flush(struct file *, fl_owner_t id);
 static int  nfs_fsync(struct file *, struct dentry *dentry, int datasync);
 static int nfs_check_flags(int flags);
 static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl);
@@ -127,23 +127,6 @@ nfs_file_release(struct inode *inode, struct file *filp)
 }
 /**
- * nfs_revalidate_file - Revalidate the page cache & related metadata
- * @inode - pointer to inode struct
- * @file - pointer to file
- */
-static int nfs_revalidate_file(struct inode *inode, struct file *filp)
-{
-        struct nfs_inode *nfsi = NFS_I(inode);
-        int retval = 0;
-        if ((nfsi->cache_validity & (NFS_INO_REVAL_PAGECACHE|NFS_INO_INVALID_ATTR))
-                        || nfs_attribute_timeout(inode))
-                retval = __nfs_revalidate_inode(NFS_SERVER(inode), inode);
-        nfs_revalidate_mapping(inode, filp->f_mapping);
-        return 0;
-}
-/**
 * nfs_revalidate_size - Revalidate the file size
 * @inode - pointer to inode struct
 * @file - pointer to struct file
@@ -188,7 +171,7 @@ static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin)
 *
 */
 static int
-nfs_file_flush(struct file *file)
+nfs_file_flush(struct file *file, fl_owner_t id)
 {
        struct nfs_open_context *ctx = (struct nfs_open_context *)file->private_data;
        struct inode    *inode = file->f_dentry->d_inode;
@@ -228,7 +211,7 @@ nfs_file_read(struct kiocb *iocb, char __user * buf, size_t count, loff_t pos)
                dentry->d_parent->d_name.name, dentry->d_name.name,
                (unsigned long) count, (unsigned long) pos);
-        result = nfs_revalidate_file(inode, iocb->ki_filp);
+        result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping);
        nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, count);
        if (!result)
                result = generic_file_aio_read(iocb, buf, count, pos);
@@ -247,7 +230,7 @@ nfs_file_sendfile(struct file *filp, loff_t *ppos, size_t count,
                dentry->d_parent->d_name.name, dentry->d_name.name,
                (unsigned long) count, (unsigned long long) *ppos);
-        res = nfs_revalidate_file(inode, filp);
+        res = nfs_revalidate_mapping(inode, filp->f_mapping);
        if (!res)
                res = generic_file_sendfile(filp, ppos, count, actor, target);
        return res;
@@ -263,7 +246,7 @@ nfs_file_mmap(struct file * file, struct vm_area_struct * vma)
        dfprintk(VFS, "nfs: mmap(%s/%s)\n",
                dentry->d_parent->d_name.name, dentry->d_name.name);
-        status = nfs_revalidate_file(inode, file);
+        status = nfs_revalidate_mapping(inode, file->f_mapping);
        if (!status)
                status = generic_file_mmap(file, vma);
        return status;
@@ -320,7 +303,11 @@ static int nfs_commit_write(struct file *file, struct page *page, unsigned offse
 static void nfs_invalidate_page(struct page *page, unsigned long offset)
 {
-        /* FIXME: we really should cancel any unstarted writes on this page */
+        struct inode *inode = page->mapping->host;
+        /* Cancel any unstarted writes on this page */
+        if (offset == 0)
+                nfs_sync_inode_wait(inode, page->index, 1, FLUSH_INVALIDATE);
 }
 static int nfs_release_page(struct page *page, gfp_t gfp)
@@ -328,7 +315,7 @@ static int nfs_release_page(struct page *page, gfp_t gfp)
        return !nfs_wb_page(page->mapping->host, page);
 }
-struct address_space_operations nfs_file_aops = {
+const struct address_space_operations nfs_file_aops = {
        .readpage = nfs_readpage,
        .readpages = nfs_readpages,
        .set_page_dirty = __set_page_dirty_nobuffers,
@@ -373,7 +360,6 @@ nfs_file_write(struct kiocb *iocb, const char __user *buf, size_t count, loff_t
                if (result)
                        goto out;
        }
-        nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping);
        result = count;
        if (!count)
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index 3fab5b0cfc5a..b81e7ed3c902 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -47,7 +47,6 @@
 #include <linux/workqueue.h>
 #include <linux/sunrpc/rpc_pipe_fs.h>
-#include <linux/nfs_fs_sb.h>
 #include <linux/nfs_fs.h>
 #include <linux/nfs_idmap.h>
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index d0b991a92327..d349fb2245da 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -13,7 +13,6 @@
 *
 */
-#include <linux/config.h>
 #include <linux/module.h>
 #include <linux/init.h>
@@ -36,6 +35,8 @@
 #include <linux/mount.h>
 #include <linux/nfs_idmap.h>
 #include <linux/vfs.h>
+#include <linux/inet.h>
+#include <linux/nfs_xdr.h>
 #include <asm/system.h>
 #include <asm/uaccess.h>
@@ -44,89 +45,17 @@
 #include "callback.h"
 #include "delegation.h"
 #include "iostat.h"
+#include "internal.h"
 #define NFSDBG_FACILITY         NFSDBG_VFS
 #define NFS_PARANOIA 1
-/* Maximum number of readahead requests
- * FIXME: this should really be a sysctl so that users may tune it to suit
- *        their needs. People that do NFS over a slow network, might for
- *        instance want to reduce it to something closer to 1 for improved
- *        interactive response.
- */
-#define NFS_MAX_READAHEAD       (RPC_DEF_SLOT_TABLE - 1)
 static void nfs_invalidate_inode(struct inode *);
 static int nfs_update_inode(struct inode *, struct nfs_fattr *);
-static struct inode *nfs_alloc_inode(struct super_block *sb);
-static void nfs_destroy_inode(struct inode *);
-static int nfs_write_inode(struct inode *,int);
-static void nfs_delete_inode(struct inode *);
-static void nfs_clear_inode(struct inode *);
-static void nfs_umount_begin(struct super_block *);
-static int  nfs_statfs(struct super_block *, struct kstatfs *);
-static int  nfs_show_options(struct seq_file *, struct vfsmount *);
-static int  nfs_show_stats(struct seq_file *, struct vfsmount *);
 static void nfs_zap_acl_cache(struct inode *);
-static struct rpc_program       nfs_program;
+static kmem_cache_t * nfs_inode_cachep;
-static struct super_operations nfs_sops = { 
-        .alloc_inode    = nfs_alloc_inode,
-        .destroy_inode  = nfs_destroy_inode,
-        .write_inode    = nfs_write_inode,
-        .delete_inode   = nfs_delete_inode,
-        .statfs         = nfs_statfs,
-        .clear_inode    = nfs_clear_inode,
-        .umount_begin   = nfs_umount_begin,
-        .show_options   = nfs_show_options,
-        .show_stats     = nfs_show_stats,
-};
-/*
- * RPC cruft for NFS
- */
-static struct rpc_stat          nfs_rpcstat = {
-        .program                = &nfs_program
-};
-static struct rpc_version *     nfs_version[] = {
-        NULL,
-        NULL,
-        &nfs_version2,
-#if defined(CONFIG_NFS_V3)
-        &nfs_version3,
-#elif defined(CONFIG_NFS_V4)
-        NULL,
-#endif
-#if defined(CONFIG_NFS_V4)
-        &nfs_version4,
-#endif
-};
-static struct rpc_program       nfs_program = {
-        .name                   = "nfs",
-        .number                 = NFS_PROGRAM,
-        .nrvers                 = ARRAY_SIZE(nfs_version),
-        .version                = nfs_version,
-        .stats                  = &nfs_rpcstat,
-        .pipe_dir_name          = "/nfs",
-};
-#ifdef CONFIG_NFS_V3_ACL
-static struct rpc_stat          nfsacl_rpcstat = { &nfsacl_program };
-static struct rpc_version *     nfsacl_version[] = {
-        [3]                     = &nfsacl_version3,
-};
-struct rpc_program              nfsacl_program = {
-        .name =                 "nfsacl",
-        .number =               NFS_ACL_PROGRAM,
-        .nrvers =               ARRAY_SIZE(nfsacl_version),
-        .version =              nfsacl_version,
-        .stats =                &nfsacl_rpcstat,
-};
-#endif  /* CONFIG_NFS_V3_ACL */
 static inline unsigned long
 nfs_fattr_to_ino_t(struct nfs_fattr *fattr)
@@ -134,8 +63,7 @@ nfs_fattr_to_ino_t(struct nfs_fattr *fattr)
        return nfs_fileid_to_ino_t(fattr->fileid);
 }
-static int
+int nfs_write_inode(struct inode *inode, int sync)
-nfs_write_inode(struct inode *inode, int sync)
 {
        int flags = sync ? FLUSH_SYNC : 0;
        int ret;
@@ -146,31 +74,15 @@ nfs_write_inode(struct inode *inode, int sync)
        return 0;
 }
-static void
+void nfs_clear_inode(struct inode *inode)
-nfs_delete_inode(struct inode * inode)
 {
-        dprintk("NFS: delete_inode(%s/%ld)\n", inode->i_sb->s_id, inode->i_ino);
+        struct nfs_inode *nfsi = NFS_I(inode);
+        struct rpc_cred *cred;
-        truncate_inode_pages(&inode->i_data, 0);
-        nfs_wb_all(inode);
        /*
         * The following should never happen...
         */
-        if (nfs_have_writebacks(inode)) {
+        BUG_ON(nfs_have_writebacks(inode));
-                printk(KERN_ERR "nfs_delete_inode: inode %ld has pending RPC requests\n", inode->i_ino);
-        }
-        clear_inode(inode);
-}
-static void
-nfs_clear_inode(struct inode *inode)
-{
-        struct nfs_inode *nfsi = NFS_I(inode);
-        struct rpc_cred *cred;
-        nfs_wb_all(inode);
        BUG_ON (!list_empty(&nfsi->open_files));
        nfs_zap_acl_cache(inode);
        cred = nfsi->cache_access.cred;
@@ -179,554 +91,6 @@ nfs_clear_inode(struct inode *inode)
        BUG_ON(atomic_read(&nfsi->data_updates) != 0);
 }
-void
-nfs_umount_begin(struct super_block *sb)
-{
-        struct rpc_clnt *rpc = NFS_SB(sb)->client;
-        /* -EIO all pending I/O */
-        if (!IS_ERR(rpc))
-                rpc_killall_tasks(rpc);
-        rpc = NFS_SB(sb)->client_acl;
-        if (!IS_ERR(rpc))
-                rpc_killall_tasks(rpc);
-}
-static inline unsigned long
-nfs_block_bits(unsigned long bsize, unsigned char *nrbitsp)
-{
-        /* make sure blocksize is a power of two */
-        if ((bsize & (bsize - 1)) || nrbitsp) {
-                unsigned char   nrbits;
-                for (nrbits = 31; nrbits && !(bsize & (1 << nrbits)); nrbits--)
-                        ;
-                bsize = 1 << nrbits;
-                if (nrbitsp)
-                        *nrbitsp = nrbits;
-        }
-        return bsize;
-}
-/*
- * Calculate the number of 512byte blocks used.
- */
-static inline unsigned long
-nfs_calc_block_size(u64 tsize)
-{
-        loff_t used = (tsize + 511) >> 9;
-        return (used > ULONG_MAX) ? ULONG_MAX : used;
-}
-/*
- * Compute and set NFS server blocksize
- */
-static inline unsigned long
-nfs_block_size(unsigned long bsize, unsigned char *nrbitsp)
-{
-        if (bsize < NFS_MIN_FILE_IO_SIZE)
-                bsize = NFS_DEF_FILE_IO_SIZE;
-        else if (bsize >= NFS_MAX_FILE_IO_SIZE)
-                bsize = NFS_MAX_FILE_IO_SIZE;
-        return nfs_block_bits(bsize, nrbitsp);
-}
-/*
- * Obtain the root inode of the file system.
- */
-static struct inode *
-nfs_get_root(struct super_block *sb, struct nfs_fh *rootfh, struct nfs_fsinfo *fsinfo)
-{
-        struct nfs_server       *server = NFS_SB(sb);
-        int                     error;
-        error = server->rpc_ops->getroot(server, rootfh, fsinfo);
-        if (error < 0) {
-                dprintk("nfs_get_root: getattr error = %d\n", -error);
-                return ERR_PTR(error);
-        }
-        return nfs_fhget(sb, rootfh, fsinfo->fattr);
-}
-/*
- * Do NFS version-independent mount processing, and sanity checking
- */
-static int
-nfs_sb_init(struct super_block *sb, rpc_authflavor_t authflavor)
-{
-        struct nfs_server       *server;
-        struct inode            *root_inode;
-        struct nfs_fattr        fattr;
-        struct nfs_fsinfo       fsinfo = {
-                                        .fattr = &fattr,
-                                };
-        struct nfs_pathconf pathinfo = {
-                        .fattr = &fattr,
-        };
-        int no_root_error = 0;
-        unsigned long max_rpc_payload;
-        /* We probably want something more informative here */
-        snprintf(sb->s_id, sizeof(sb->s_id), "%x:%x", MAJOR(sb->s_dev), MINOR(sb->s_dev));
-        server = NFS_SB(sb);
-        sb->s_magic      = NFS_SUPER_MAGIC;
-        server->io_stats = nfs_alloc_iostats();
-        if (server->io_stats == NULL)
-                return -ENOMEM;
-        root_inode = nfs_get_root(sb, &server->fh, &fsinfo);
-        /* Did getting the root inode fail? */
-        if (IS_ERR(root_inode)) {
-                no_root_error = PTR_ERR(root_inode);
-                goto out_no_root;
-        }
-        sb->s_root = d_alloc_root(root_inode);
-        if (!sb->s_root) {
-                no_root_error = -ENOMEM;
-                goto out_no_root;
-        }
-        sb->s_root->d_op = server->rpc_ops->dentry_ops;
-        /* mount time stamp, in seconds */
-        server->mount_time = jiffies;
-        /* Get some general file system info */
-        if (server->namelen == 0 &&
-            server->rpc_ops->pathconf(server, &server->fh, &pathinfo) >= 0)
-                server->namelen = pathinfo.max_namelen;
-        /* Work out a lot of parameters */
-        if (server->rsize == 0)
-                server->rsize = nfs_block_size(fsinfo.rtpref, NULL);
-        if (server->wsize == 0)
-                server->wsize = nfs_block_size(fsinfo.wtpref, NULL);
-        if (fsinfo.rtmax >= 512 && server->rsize > fsinfo.rtmax)
-                server->rsize = nfs_block_size(fsinfo.rtmax, NULL);
-        if (fsinfo.wtmax >= 512 && server->wsize > fsinfo.wtmax)
-                server->wsize = nfs_block_size(fsinfo.wtmax, NULL);
-        max_rpc_payload = nfs_block_size(rpc_max_payload(server->client), NULL);
-        if (server->rsize > max_rpc_payload)
-                server->rsize = max_rpc_payload;
-        if (server->rsize > NFS_MAX_FILE_IO_SIZE)
-                server->rsize = NFS_MAX_FILE_IO_SIZE;
-        server->rpages = (server->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-        if (server->wsize > max_rpc_payload)
-                server->wsize = max_rpc_payload;
-        if (server->wsize > NFS_MAX_FILE_IO_SIZE)
-                server->wsize = NFS_MAX_FILE_IO_SIZE;
-        server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-        if (sb->s_blocksize == 0)
-                sb->s_blocksize = nfs_block_bits(server->wsize,
-                                                         &sb->s_blocksize_bits);
-        server->wtmult = nfs_block_bits(fsinfo.wtmult, NULL);
-        server->dtsize = nfs_block_size(fsinfo.dtpref, NULL);
-        if (server->dtsize > PAGE_CACHE_SIZE)
-                server->dtsize = PAGE_CACHE_SIZE;
-        if (server->dtsize > server->rsize)
-                server->dtsize = server->rsize;
-        if (server->flags & NFS_MOUNT_NOAC) {
-                server->acregmin = server->acregmax = 0;
-                server->acdirmin = server->acdirmax = 0;
-                sb->s_flags |= MS_SYNCHRONOUS;
-        }
-        server->backing_dev_info.ra_pages = server->rpages * NFS_MAX_READAHEAD;
-        sb->s_maxbytes = fsinfo.maxfilesize;
-        if (sb->s_maxbytes > MAX_LFS_FILESIZE) 
-                sb->s_maxbytes = MAX_LFS_FILESIZE; 
-        server->client->cl_intr = (server->flags & NFS_MOUNT_INTR) ? 1 : 0;
-        server->client->cl_softrtry = (server->flags & NFS_MOUNT_SOFT) ? 1 : 0;
-        /* We're airborne Set socket buffersize */
-        rpc_setbufsize(server->client, server->wsize + 100, server->rsize + 100);
-        return 0;
-        /* Yargs. It didn't work out. */
-out_no_root:
-        dprintk("nfs_sb_init: get root inode failed: errno %d\n", -no_root_error);
-        if (!IS_ERR(root_inode))
-                iput(root_inode);
-        return no_root_error;
-}
-static void nfs_init_timeout_values(struct rpc_timeout *to, int proto, unsigned int timeo, unsigned int retrans)
-{
-        to->to_initval = timeo * HZ / 10;
-        to->to_retries = retrans;
-        if (!to->to_retries)
-                to->to_retries = 2;
-        switch (proto) {
-        case IPPROTO_TCP:
-                if (!to->to_initval)
-                        to->to_initval = 60 * HZ;
-                if (to->to_initval > NFS_MAX_TCP_TIMEOUT)
-                        to->to_initval = NFS_MAX_TCP_TIMEOUT;
-                to->to_increment = to->to_initval;
-                to->to_maxval = to->to_initval + (to->to_increment * to->to_retries);
-                to->to_exponential = 0;
-                break;
-        case IPPROTO_UDP:
-        default:
-                if (!to->to_initval)
-                        to->to_initval = 11 * HZ / 10;
-                if (to->to_initval > NFS_MAX_UDP_TIMEOUT)
-                        to->to_initval = NFS_MAX_UDP_TIMEOUT;
-                to->to_maxval = NFS_MAX_UDP_TIMEOUT;
-                to->to_exponential = 1;
-                break;
-        }
-}
-/*
- * Create an RPC client handle.
- */
-static struct rpc_clnt *
-nfs_create_client(struct nfs_server *server, const struct nfs_mount_data *data)
-{
-        struct rpc_timeout      timeparms;
-        struct rpc_xprt         *xprt = NULL;
-        struct rpc_clnt         *clnt = NULL;
-        int                     proto = (data->flags & NFS_MOUNT_TCP) ? IPPROTO_TCP : IPPROTO_UDP;
-        nfs_init_timeout_values(&timeparms, proto, data->timeo, data->retrans);
-        server->retrans_timeo = timeparms.to_initval;
-        server->retrans_count = timeparms.to_retries;
-        /* create transport and client */
-        xprt = xprt_create_proto(proto, &server->addr, &timeparms);
-        if (IS_ERR(xprt)) {
-                dprintk("%s: cannot create RPC transport. Error = %ld\n",
-                                __FUNCTION__, PTR_ERR(xprt));
-                return (struct rpc_clnt *)xprt;
-        }
-        clnt = rpc_create_client(xprt, server->hostname, &nfs_program,
-                                 server->rpc_ops->version, data->pseudoflavor);
-        if (IS_ERR(clnt)) {
-                dprintk("%s: cannot create RPC client. Error = %ld\n",
-                                __FUNCTION__, PTR_ERR(xprt));
-                goto out_fail;
-        }
-        clnt->cl_intr     = 1;
-        clnt->cl_softrtry = 1;
-        return clnt;
-out_fail:
-        return clnt;
-}
-/*
- * The way this works is that the mount process passes a structure
- * in the data argument which contains the server's IP address
- * and the root file handle obtained from the server's mount
- * daemon. We stash these away in the private superblock fields.
- */
-static int
-nfs_fill_super(struct super_block *sb, struct nfs_mount_data *data, int silent)
-{
-        struct nfs_server       *server;
-        rpc_authflavor_t        authflavor;
-        server           = NFS_SB(sb);
-        sb->s_blocksize_bits = 0;
-        sb->s_blocksize = 0;
-        if (data->bsize)
-                sb->s_blocksize = nfs_block_size(data->bsize, &sb->s_blocksize_bits);
-        if (data->rsize)
-                server->rsize = nfs_block_size(data->rsize, NULL);
-        if (data->wsize)
-                server->wsize = nfs_block_size(data->wsize, NULL);
-        server->flags    = data->flags & NFS_MOUNT_FLAGMASK;
-        server->acregmin = data->acregmin*HZ;
-        server->acregmax = data->acregmax*HZ;
-        server->acdirmin = data->acdirmin*HZ;
-        server->acdirmax = data->acdirmax*HZ;
-        /* Start lockd here, before we might error out */
-        if (!(server->flags & NFS_MOUNT_NONLM))
-                lockd_up();
-        server->namelen  = data->namlen;
-        server->hostname = kmalloc(strlen(data->hostname) + 1, GFP_KERNEL);
-        if (!server->hostname)
-                return -ENOMEM;
-        strcpy(server->hostname, data->hostname);
-        /* Check NFS protocol revision and initialize RPC op vector
-         * and file handle pool. */
-#ifdef CONFIG_NFS_V3
-        if (server->flags & NFS_MOUNT_VER3) {
-                server->rpc_ops = &nfs_v3_clientops;
-                server->caps |= NFS_CAP_READDIRPLUS;
-        } else {
-                server->rpc_ops = &nfs_v2_clientops;
-        }
-#else
-        server->rpc_ops = &nfs_v2_clientops;
-#endif
-        /* Fill in pseudoflavor for mount version < 5 */
-        if (!(data->flags & NFS_MOUNT_SECFLAVOUR))
-                data->pseudoflavor = RPC_AUTH_UNIX;
-        authflavor = data->pseudoflavor;        /* save for sb_init() */
-        /* XXX maybe we want to add a server->pseudoflavor field */
-        /* Create RPC client handles */
-        server->client = nfs_create_client(server, data);
-        if (IS_ERR(server->client))
-                return PTR_ERR(server->client);
-        /* RFC 2623, sec 2.3.2 */
-        if (authflavor != RPC_AUTH_UNIX) {
-                struct rpc_auth *auth;
-                server->client_sys = rpc_clone_client(server->client);
-                if (IS_ERR(server->client_sys))
-                        return PTR_ERR(server->client_sys);
-                auth = rpcauth_create(RPC_AUTH_UNIX, server->client_sys);
-                if (IS_ERR(auth))
-                        return PTR_ERR(auth);
-        } else {
-                atomic_inc(&server->client->cl_count);
-                server->client_sys = server->client;
-        }
-        if (server->flags & NFS_MOUNT_VER3) {
-#ifdef CONFIG_NFS_V3_ACL
-                if (!(server->flags & NFS_MOUNT_NOACL)) {
-                        server->client_acl = rpc_bind_new_program(server->client, &nfsacl_program, 3);
-                        /* No errors! Assume that Sun nfsacls are supported */
-                        if (!IS_ERR(server->client_acl))
-                                server->caps |= NFS_CAP_ACLS;
-                }
-#else
-                server->flags &= ~NFS_MOUNT_NOACL;
-#endif /* CONFIG_NFS_V3_ACL */
-                /*
-                 * The VFS shouldn't apply the umask to mode bits. We will
-                 * do so ourselves when necessary.
-                 */
-                sb->s_flags |= MS_POSIXACL;
-                if (server->namelen == 0 || server->namelen > NFS3_MAXNAMLEN)
-                        server->namelen = NFS3_MAXNAMLEN;
-                sb->s_time_gran = 1;
-        } else {
-                if (server->namelen == 0 || server->namelen > NFS2_MAXNAMLEN)
-                        server->namelen = NFS2_MAXNAMLEN;
-        }
-        sb->s_op = &nfs_sops;
-        return nfs_sb_init(sb, authflavor);
-}
-static int
-nfs_statfs(struct super_block *sb, struct kstatfs *buf)
-{
-        struct nfs_server *server = NFS_SB(sb);
-        unsigned char blockbits;
-        unsigned long blockres;
-        struct nfs_fh *rootfh = NFS_FH(sb->s_root->d_inode);
-        struct nfs_fattr fattr;
-        struct nfs_fsstat res = {
-                        .fattr = &fattr,
-        };
-        int error;
-        lock_kernel();
-        error = server->rpc_ops->statfs(server, rootfh, &res);
-        buf->f_type = NFS_SUPER_MAGIC;
-        if (error < 0)
-                goto out_err;
-        /*
-         * Current versions of glibc do not correctly handle the
-         * case where f_frsize != f_bsize.  Eventually we want to
-         * report the value of wtmult in this field.
-         */
-        buf->f_frsize = sb->s_blocksize;
-        /*
-         * On most *nix systems, f_blocks, f_bfree, and f_bavail
-         * are reported in units of f_frsize.  Linux hasn't had
-         * an f_frsize field in its statfs struct until recently,
-         * thus historically Linux's sys_statfs reports these
-         * fields in units of f_bsize.
-         */
-        buf->f_bsize = sb->s_blocksize;
-        blockbits = sb->s_blocksize_bits;
-        blockres = (1 << blockbits) - 1;
-        buf->f_blocks = (res.tbytes + blockres) >> blockbits;
-        buf->f_bfree = (res.fbytes + blockres) >> blockbits;
-        buf->f_bavail = (res.abytes + blockres) >> blockbits;
-        buf->f_files = res.tfiles;
-        buf->f_ffree = res.afiles;
-        buf->f_namelen = server->namelen;
- out:
-        unlock_kernel();
-        return 0;
- out_err:
-        dprintk("%s: statfs error = %d\n", __FUNCTION__, -error);
-        buf->f_bsize = buf->f_blocks = buf->f_bfree = buf->f_bavail = -1;
-        goto out;
-}
-static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss, int showdefaults)
-{
-        static struct proc_nfs_info {
-                int flag;
-                char *str;
-                char *nostr;
-        } nfs_info[] = {
-                { NFS_MOUNT_SOFT, ",soft", ",hard" },
-                { NFS_MOUNT_INTR, ",intr", "" },
-                { NFS_MOUNT_NOCTO, ",nocto", "" },
-                { NFS_MOUNT_NOAC, ",noac", "" },
-                { NFS_MOUNT_NONLM, ",nolock", "" },
-                { NFS_MOUNT_NOACL, ",noacl", "" },
-                { 0, NULL, NULL }
-        };
-        struct proc_nfs_info *nfs_infop;
-        char buf[12];
-        char *proto;
-        seq_printf(m, ",vers=%d", nfss->rpc_ops->version);
-        seq_printf(m, ",rsize=%d", nfss->rsize);
-        seq_printf(m, ",wsize=%d", nfss->wsize);
-        if (nfss->acregmin != 3*HZ || showdefaults)
-                seq_printf(m, ",acregmin=%d", nfss->acregmin/HZ);
-        if (nfss->acregmax != 60*HZ || showdefaults)
-                seq_printf(m, ",acregmax=%d", nfss->acregmax/HZ);
-        if (nfss->acdirmin != 30*HZ || showdefaults)
-                seq_printf(m, ",acdirmin=%d", nfss->acdirmin/HZ);
-        if (nfss->acdirmax != 60*HZ || showdefaults)
-                seq_printf(m, ",acdirmax=%d", nfss->acdirmax/HZ);
-        for (nfs_infop = nfs_info; nfs_infop->flag; nfs_infop++) {
-                if (nfss->flags & nfs_infop->flag)
-                        seq_puts(m, nfs_infop->str);
-                else
-                        seq_puts(m, nfs_infop->nostr);
-        }
-        switch (nfss->client->cl_xprt->prot) {
-                case IPPROTO_TCP:
-                        proto = "tcp";
-                        break;
-                case IPPROTO_UDP:
-                        proto = "udp";
-                        break;
-                default:
-                        snprintf(buf, sizeof(buf), "%u", nfss->client->cl_xprt->prot);
-                        proto = buf;
-        }
-        seq_printf(m, ",proto=%s", proto);
-        seq_printf(m, ",timeo=%lu", 10U * nfss->retrans_timeo / HZ);
-        seq_printf(m, ",retrans=%u", nfss->retrans_count);
-}
-static int nfs_show_options(struct seq_file *m, struct vfsmount *mnt)
-{
-        struct nfs_server *nfss = NFS_SB(mnt->mnt_sb);
-        nfs_show_mount_options(m, nfss, 0);
-        seq_puts(m, ",addr=");
-        seq_escape(m, nfss->hostname, " \t\n\\");
-        return 0;
-}
-static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt)
-{
-        int i, cpu;
-        struct nfs_server *nfss = NFS_SB(mnt->mnt_sb);
-        struct rpc_auth *auth = nfss->client->cl_auth;
-        struct nfs_iostats totals = { };
-        seq_printf(m, "statvers=%s", NFS_IOSTAT_VERS);
-        /*
-         * Display all mount option settings
-         */
-        seq_printf(m, "\n\topts:\t");
-        seq_puts(m, mnt->mnt_sb->s_flags & MS_RDONLY ? "ro" : "rw");
-        seq_puts(m, mnt->mnt_sb->s_flags & MS_SYNCHRONOUS ? ",sync" : "");
-        seq_puts(m, mnt->mnt_sb->s_flags & MS_NOATIME ? ",noatime" : "");
-        seq_puts(m, mnt->mnt_sb->s_flags & MS_NODIRATIME ? ",nodiratime" : "");
-        nfs_show_mount_options(m, nfss, 1);
-        seq_printf(m, "\n\tage:\t%lu", (jiffies - nfss->mount_time) / HZ);
-        seq_printf(m, "\n\tcaps:\t");
-        seq_printf(m, "caps=0x%x", nfss->caps);
-        seq_printf(m, ",wtmult=%d", nfss->wtmult);
-        seq_printf(m, ",dtsize=%d", nfss->dtsize);
-        seq_printf(m, ",bsize=%d", nfss->bsize);
-        seq_printf(m, ",namelen=%d", nfss->namelen);
-#ifdef CONFIG_NFS_V4
-        if (nfss->rpc_ops->version == 4) {
-                seq_printf(m, "\n\tnfsv4:\t");
-                seq_printf(m, "bm0=0x%x", nfss->attr_bitmask[0]);
-                seq_printf(m, ",bm1=0x%x", nfss->attr_bitmask[1]);
-                seq_printf(m, ",acl=0x%x", nfss->acl_bitmask);
-        }
-#endif
-        /*
-         * Display security flavor in effect for this mount
-         */
-        seq_printf(m, "\n\tsec:\tflavor=%d", auth->au_ops->au_flavor);
-        if (auth->au_flavor)
-                seq_printf(m, ",pseudoflavor=%d", auth->au_flavor);
-        /*
-         * Display superblock I/O counters
-         */
-        for_each_possible_cpu(cpu) {
-                struct nfs_iostats *stats;
-                preempt_disable();
-                stats = per_cpu_ptr(nfss->io_stats, cpu);
-                for (i = 0; i < __NFSIOS_COUNTSMAX; i++)
-                        totals.events[i] += stats->events[i];
-                for (i = 0; i < __NFSIOS_BYTESMAX; i++)
-                        totals.bytes[i] += stats->bytes[i];
-                preempt_enable();
-        }
-        seq_printf(m, "\n\tevents:\t");
-        for (i = 0; i < __NFSIOS_COUNTSMAX; i++)
-                seq_printf(m, "%lu ", totals.events[i]);
-        seq_printf(m, "\n\tbytes:\t");
-        for (i = 0; i < __NFSIOS_BYTESMAX; i++)
-                seq_printf(m, "%Lu ", totals.bytes[i]);
-        seq_printf(m, "\n");
-        rpc_print_iostats(m, nfss->client);
-        return 0;
-}
 /**
 * nfs_sync_mapping - helper to flush all mmapped dirty data to disk
 */
@@ -889,6 +253,14 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
                        if (nfs_server_capable(inode, NFS_CAP_READDIRPLUS)
                            && fattr->size <= NFS_LIMIT_READDIRPLUS)
                                set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_FLAGS(inode));
+                        /* Deal with crossing mountpoints */
+                        if (!nfs_fsid_equal(&NFS_SB(sb)->fsid, &fattr->fsid)) {
+                                if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL)
+                                        inode->i_op = &nfs_referral_inode_operations;
+                                else
+                                        inode->i_op = &nfs_mountpoint_inode_operations;
+                                inode->i_fop = NULL;
+                        }
                } else if (S_ISLNK(inode->i_mode))
                        inode->i_op = &nfs_symlink_inode_operations;
                else
@@ -1207,6 +579,7 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
        dfprintk(PAGECACHE, "NFS: revalidating (%s/%Ld)\n",
                inode->i_sb->s_id, (long long)NFS_FILEID(inode));
+        nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE);
        lock_kernel();
        if (!inode || is_bad_inode(inode))
                goto out_nowait;
@@ -1220,7 +593,7 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
                status = -ESTALE;
                /* Do we trust the cached ESTALE? */
                if (NFS_ATTRTIMEO(inode) != 0) {
-                        if (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ATIME)) {
+                        if (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ATIME)) {
                                /* no */
                        } else
                                goto out;
@@ -1251,8 +624,6 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
        }
        spin_unlock(&inode->i_lock);
-        nfs_revalidate_mapping(inode, inode->i_mapping);
        if (nfsi->cache_validity & NFS_INO_INVALID_ACL)
                nfs_zap_acl_cache(inode);
@@ -1286,8 +657,7 @@ int nfs_attribute_timeout(struct inode *inode)
 */
 int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
 {
-        nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE);
+        if (!(NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATTR)
-        if (!(NFS_I(inode)->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA))
                        && !nfs_attribute_timeout(inode))
                return NFS_STALE(inode) ? -ESTALE : 0;
        return __nfs_revalidate_inode(server, inode);
@@ -1298,9 +668,16 @@ int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
 * @inode - pointer to host inode
 * @mapping - pointer to mapping
 */
-void nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
+int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
 {
        struct nfs_inode *nfsi = NFS_I(inode);
+        int ret = 0;
+        if (NFS_STALE(inode))
+                ret = -ESTALE;
+        if ((nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE)
+                        || nfs_attribute_timeout(inode))
+                ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode);
        if (nfsi->cache_validity & NFS_INO_INVALID_DATA) {
                nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE);
@@ -1321,6 +698,7 @@ void nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
                                inode->i_sb->s_id,
                                (long long)NFS_FILEID(inode));
        }
+        return ret;
 }
 /**
@@ -1360,12 +738,6 @@ static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 {
        struct nfs_inode *nfsi = NFS_I(inode);
-        if ((fattr->valid & NFS_ATTR_PRE_CHANGE) != 0
-                        && nfsi->change_attr == fattr->pre_change_attr) {
-                nfsi->change_attr = fattr->change_attr;
-                nfsi->cache_change_attribute = jiffies;
-        }
        /* If we have atomic WCC data, we may update some attributes */
        if ((fattr->valid & NFS_ATTR_WCC) != 0) {
                if (timespec_equal(&inode->i_ctime, &fattr->pre_ctime)) {
@@ -1399,9 +771,6 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
        int data_unstable;
-        if ((fattr->valid & NFS_ATTR_FATTR) == 0)
-                return 0;
        /* Has the inode gone and changed behind our back? */
        if (nfsi->fileid != fattr->fileid
                        || (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) {
@@ -1414,20 +783,13 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
        /* Do atomic weak cache consistency updates */
        nfs_wcc_update_inode(inode, fattr);
-        if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0) {
+        if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 &&
-                if (nfsi->change_attr == fattr->change_attr)
+                        nfsi->change_attr != fattr->change_attr)
-                        goto out;
+                nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
-                nfsi->cache_validity |= NFS_INO_INVALID_ATTR;
-                if (!data_unstable)
-                        nfsi->cache_validity |= NFS_INO_REVAL_PAGECACHE;
-        }
        /* Verify a few of the more important attributes */
-        if (!timespec_equal(&inode->i_mtime, &fattr->mtime)) {
+        if (!timespec_equal(&inode->i_mtime, &fattr->mtime))
-                nfsi->cache_validity |= NFS_INO_INVALID_ATTR;
+                nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
-                if (!data_unstable)
-                        nfsi->cache_validity |= NFS_INO_REVAL_PAGECACHE;
-        }
        cur_size = i_size_read(inode);
        new_isize = nfs_size_to_loff_t(fattr->size);
@@ -1444,7 +806,6 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
        if (inode->i_nlink != fattr->nlink)
                nfsi->cache_validity |= NFS_INO_INVALID_ATTR;
-out:
        if (!timespec_equal(&inode->i_atime, &fattr->atime))
                nfsi->cache_validity |= NFS_INO_INVALID_ATIME;
@@ -1470,7 +831,6 @@ int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr)
        if ((fattr->valid & NFS_ATTR_FATTR) == 0)
                return 0;
        spin_lock(&inode->i_lock);
-        nfsi->cache_validity &= ~NFS_INO_REVAL_PAGECACHE;
        if (time_after(fattr->time_start, nfsi->last_updated))
                status = nfs_update_inode(inode, fattr);
        else
@@ -1495,7 +855,7 @@ int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr)
        spin_lock(&inode->i_lock);
        if (unlikely((fattr->valid & NFS_ATTR_FATTR) == 0)) {
-                nfsi->cache_validity |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS;
+                nfsi->cache_validity |= NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
                goto out;
        }
        status = nfs_update_inode(inode, fattr);
@@ -1518,6 +878,7 @@ out:
 */
 static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 {
+        struct nfs_server *server;
        struct nfs_inode *nfsi = NFS_I(inode);
        loff_t cur_isize, new_isize;
        unsigned int    invalid = 0;
@@ -1527,9 +888,6 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
                        __FUNCTION__, inode->i_sb->s_id, inode->i_ino,
                        atomic_read(&inode->i_count), fattr->valid);
-        if ((fattr->valid & NFS_ATTR_FATTR) == 0)
-                return 0;
        if (nfsi->fileid != fattr->fileid)
                goto out_fileid;
@@ -1539,6 +897,12 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
        if ((inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT))
                goto out_changed;
+        server = NFS_SERVER(inode);
+        /* Update the fsid if and only if this is the root directory */
+        if (inode == inode->i_sb->s_root->d_inode
+                        && !nfs_fsid_equal(&server->fsid, &fattr->fsid))
+                server->fsid = fattr->fsid;
        /*
         * Update the read time so we don't revalidate too often.
         */
@@ -1548,7 +912,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
        /* Are we racing with known updates of the metadata on the server? */
        data_stable = nfs_verify_change_attribute(inode, fattr->time_start);
        if (data_stable)
-                nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ATIME);
+                nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE|NFS_INO_INVALID_ATIME);
        /* Do atomic weak cache consistency updates */
        nfs_wcc_update_inode(inode, fattr);
@@ -1612,15 +976,13 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
                inode->i_blksize = fattr->du.nfs2.blocksize;
        }
-        if ((fattr->valid & NFS_ATTR_FATTR_V4)) {
+        if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 &&
-                if (nfsi->change_attr != fattr->change_attr) {
+                        nfsi->change_attr != fattr->change_attr) {
-                        dprintk("NFS: change_attr change on server for file %s/%ld\n",
+                dprintk("NFS: change_attr change on server for file %s/%ld\n",
-                                        inode->i_sb->s_id, inode->i_ino);
+                                inode->i_sb->s_id, inode->i_ino);
-                        nfsi->change_attr = fattr->change_attr;
+                nfsi->change_attr = fattr->change_attr;
-                        invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
+                invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
-                        nfsi->cache_change_attribute = jiffies;
+                nfsi->cache_change_attribute = jiffies;
-                } else
-                        invalid &= ~(NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA);
        }
        /* Update attrtimeo value if we're out of the unstable period */
@@ -1668,190 +1030,15 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
        goto out_err;
 }
-/*
- * File system information
- */
-static int nfs_set_super(struct super_block *s, void *data)
-{
-        s->s_fs_info = data;
-        return set_anon_super(s, data);
-}
- 
-static int nfs_compare_super(struct super_block *sb, void *data)
-{
-        struct nfs_server *server = data;
-        struct nfs_server *old = NFS_SB(sb);
-        if (old->addr.sin_addr.s_addr != server->addr.sin_addr.s_addr)
-                return 0;
-        if (old->addr.sin_port != server->addr.sin_port)
-                return 0;
-        return !nfs_compare_fh(&old->fh, &server->fh);
-}
-static struct super_block *nfs_get_sb(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *raw_data)
-{
-        int error;
-        struct nfs_server *server = NULL;
-        struct super_block *s;
-        struct nfs_fh *root;
-        struct nfs_mount_data *data = raw_data;
-        s = ERR_PTR(-EINVAL);
-        if (data == NULL) {
-                dprintk("%s: missing data argument\n", __FUNCTION__);
-                goto out_err;
-        }
-        if (data->version <= 0 || data->version > NFS_MOUNT_VERSION) {
-                dprintk("%s: bad mount version\n", __FUNCTION__);
-                goto out_err;
-        }
-        switch (data->version) {
-                case 1:
-                        data->namlen = 0;
-                case 2:
-                        data->bsize  = 0;
-                case 3:
-                        if (data->flags & NFS_MOUNT_VER3) {
-                                dprintk("%s: mount structure version %d does not support NFSv3\n",
-                                                __FUNCTION__,
-                                                data->version);
-                                goto out_err;
-                        }
-                        data->root.size = NFS2_FHSIZE;
-                        memcpy(data->root.data, data->old_root.data, NFS2_FHSIZE);
-                case 4:
-                        if (data->flags & NFS_MOUNT_SECFLAVOUR) {
-                                dprintk("%s: mount structure version %d does not support strong security\n",
-                                                __FUNCTION__,
-                                                data->version);
-                                goto out_err;
-                        }
-                case 5:
-                        memset(data->context, 0, sizeof(data->context));
-        }
-#ifndef CONFIG_NFS_V3
-        /* If NFSv3 is not compiled in, return -EPROTONOSUPPORT */
-        s = ERR_PTR(-EPROTONOSUPPORT);
-        if (data->flags & NFS_MOUNT_VER3) {
-                dprintk("%s: NFSv3 not compiled into kernel\n", __FUNCTION__);
-                goto out_err;
-        }
-#endif /* CONFIG_NFS_V3 */
-        s = ERR_PTR(-ENOMEM);
-        server = kzalloc(sizeof(struct nfs_server), GFP_KERNEL);
-        if (!server)
-                goto out_err;
-        /* Zero out the NFS state stuff */
-        init_nfsv4_state(server);
-        server->client = server->client_sys = server->client_acl = ERR_PTR(-EINVAL);
-        root = &server->fh;
-        if (data->flags & NFS_MOUNT_VER3)
-                root->size = data->root.size;
-        else
-                root->size = NFS2_FHSIZE;
-        s = ERR_PTR(-EINVAL);
-        if (root->size > sizeof(root->data)) {
-                dprintk("%s: invalid root filehandle\n", __FUNCTION__);
-                goto out_err;
-        }
-        memcpy(root->data, data->root.data, root->size);
-        /* We now require that the mount process passes the remote address */
-        memcpy(&server->addr, &data->addr, sizeof(server->addr));
-        if (server->addr.sin_addr.s_addr == INADDR_ANY) {
-                dprintk("%s: mount program didn't pass remote address!\n",
-                                __FUNCTION__);
-                goto out_err;
-        }
-        /* Fire up rpciod if not yet running */
-        s = ERR_PTR(rpciod_up());
-        if (IS_ERR(s)) {
-                dprintk("%s: couldn't start rpciod! Error = %ld\n",
-                                __FUNCTION__, PTR_ERR(s));
-                goto out_err;
-        }
-        s = sget(fs_type, nfs_compare_super, nfs_set_super, server);
-        if (IS_ERR(s) || s->s_root)
-                goto out_rpciod_down;
-        s->s_flags = flags;
-        error = nfs_fill_super(s, data, flags & MS_SILENT ? 1 : 0);
-        if (error) {
-                up_write(&s->s_umount);
-                deactivate_super(s);
-                return ERR_PTR(error);
-        }
-        s->s_flags |= MS_ACTIVE;
-        return s;
-out_rpciod_down:
-        rpciod_down();
-out_err:
-        kfree(server);
-        return s;
-}
-static void nfs_kill_super(struct super_block *s)
-{
-        struct nfs_server *server = NFS_SB(s);
-        kill_anon_super(s);
-        if (!IS_ERR(server->client))
-                rpc_shutdown_client(server->client);
-        if (!IS_ERR(server->client_sys))
-                rpc_shutdown_client(server->client_sys);
-        if (!IS_ERR(server->client_acl))
-                rpc_shutdown_client(server->client_acl);
-        if (!(server->flags & NFS_MOUNT_NONLM))
-                lockd_down();   /* release rpc.lockd */
-        rpciod_down();          /* release rpciod */
-        nfs_free_iostats(server->io_stats);
-        kfree(server->hostname);
-        kfree(server);
-}
-static struct file_system_type nfs_fs_type = {
-        .owner          = THIS_MODULE,
-        .name           = "nfs",
-        .get_sb         = nfs_get_sb,
-        .kill_sb        = nfs_kill_super,
-        .fs_flags       = FS_ODD_RENAME|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
-};
 #ifdef CONFIG_NFS_V4
-static void nfs4_clear_inode(struct inode *);
-static struct super_operations nfs4_sops = { 
-        .alloc_inode    = nfs_alloc_inode,
-        .destroy_inode  = nfs_destroy_inode,
-        .write_inode    = nfs_write_inode,
-        .delete_inode   = nfs_delete_inode,
-        .statfs         = nfs_statfs,
-        .clear_inode    = nfs4_clear_inode,
-        .umount_begin   = nfs_umount_begin,
-        .show_options   = nfs_show_options,
-        .show_stats     = nfs_show_stats,
-};
 /*
 * Clean out any remaining NFSv4 state that might be left over due
 * to open() calls that passed nfs_atomic_lookup, but failed to call
 * nfs_open().
 */
-static void nfs4_clear_inode(struct inode *inode)
+void nfs4_clear_inode(struct inode *inode)
 {
        struct nfs_inode *nfsi = NFS_I(inode);
@@ -1875,357 +1062,9 @@ static void nfs4_clear_inode(struct inode *inode)
                nfs4_close_state(state, state->state);
        }
 }
-static int nfs4_fill_super(struct super_block *sb, struct nfs4_mount_data *data, int silent)
-{
-        struct nfs_server *server;
-        struct nfs4_client *clp = NULL;
-        struct rpc_xprt *xprt = NULL;
-        struct rpc_clnt *clnt = NULL;
-        struct rpc_timeout timeparms;
-        rpc_authflavor_t authflavour;
-        int err = -EIO;
-        sb->s_blocksize_bits = 0;
-        sb->s_blocksize = 0;
-        server = NFS_SB(sb);
-        if (data->rsize != 0)
-                server->rsize = nfs_block_size(data->rsize, NULL);
-        if (data->wsize != 0)
-                server->wsize = nfs_block_size(data->wsize, NULL);
-        server->flags = data->flags & NFS_MOUNT_FLAGMASK;
-        server->caps = NFS_CAP_ATOMIC_OPEN;
-        server->acregmin = data->acregmin*HZ;
-        server->acregmax = data->acregmax*HZ;
-        server->acdirmin = data->acdirmin*HZ;
-        server->acdirmax = data->acdirmax*HZ;
-        server->rpc_ops = &nfs_v4_clientops;
-        nfs_init_timeout_values(&timeparms, data->proto, data->timeo, data->retrans);
-        server->retrans_timeo = timeparms.to_initval;
-        server->retrans_count = timeparms.to_retries;
-        clp = nfs4_get_client(&server->addr.sin_addr);
-        if (!clp) {
-                dprintk("%s: failed to create NFS4 client.\n", __FUNCTION__);
-                return -EIO;
-        }
-        /* Now create transport and client */
-        authflavour = RPC_AUTH_UNIX;
-        if (data->auth_flavourlen != 0) {
-                if (data->auth_flavourlen != 1) {
-                        dprintk("%s: Invalid number of RPC auth flavours %d.\n",
-                                        __FUNCTION__, data->auth_flavourlen);
-                        err = -EINVAL;
-                        goto out_fail;
-                }
-                if (copy_from_user(&authflavour, data->auth_flavours, sizeof(authflavour))) {
-                        err = -EFAULT;
-                        goto out_fail;
-                }
-        }
-        down_write(&clp->cl_sem);
-        if (IS_ERR(clp->cl_rpcclient)) {
-                xprt = xprt_create_proto(data->proto, &server->addr, &timeparms);
-                if (IS_ERR(xprt)) {
-                        up_write(&clp->cl_sem);
-                        err = PTR_ERR(xprt);
-                        dprintk("%s: cannot create RPC transport. Error = %d\n",
-                                        __FUNCTION__, err);
-                        goto out_fail;
-                }
-                clnt = rpc_create_client(xprt, server->hostname, &nfs_program,
-                                server->rpc_ops->version, authflavour);
-                if (IS_ERR(clnt)) {
-                        up_write(&clp->cl_sem);
-                        err = PTR_ERR(clnt);
-                        dprintk("%s: cannot create RPC client. Error = %d\n",
-                                        __FUNCTION__, err);
-                        goto out_fail;
-                }
-                clnt->cl_intr     = 1;
-                clnt->cl_softrtry = 1;
-                clp->cl_rpcclient = clnt;
-                memcpy(clp->cl_ipaddr, server->ip_addr, sizeof(clp->cl_ipaddr));
-                nfs_idmap_new(clp);
-        }
-        list_add_tail(&server->nfs4_siblings, &clp->cl_superblocks);
-        clnt = rpc_clone_client(clp->cl_rpcclient);
-        if (!IS_ERR(clnt))
-                        server->nfs4_state = clp;
-        up_write(&clp->cl_sem);
-        clp = NULL;
-        if (IS_ERR(clnt)) {
-                err = PTR_ERR(clnt);
-                dprintk("%s: cannot create RPC client. Error = %d\n",
-                                __FUNCTION__, err);
-                return err;
-        }
-        server->client    = clnt;
-        if (server->nfs4_state->cl_idmap == NULL) {
-                dprintk("%s: failed to create idmapper.\n", __FUNCTION__);
-                return -ENOMEM;
-        }
-        if (clnt->cl_auth->au_flavor != authflavour) {
-                struct rpc_auth *auth;
-                auth = rpcauth_create(authflavour, clnt);
-                if (IS_ERR(auth)) {
-                        dprintk("%s: couldn't create credcache!\n", __FUNCTION__);
-                        return PTR_ERR(auth);
-                }
-        }
-        sb->s_time_gran = 1;
-        sb->s_op = &nfs4_sops;
-        err = nfs_sb_init(sb, authflavour);
-        if (err == 0)
-                return 0;
-out_fail:
-        if (clp)
-                nfs4_put_client(clp);
-        return err;
-}
-static int nfs4_compare_super(struct super_block *sb, void *data)
-{
-        struct nfs_server *server = data;
-        struct nfs_server *old = NFS_SB(sb);
-        if (strcmp(server->hostname, old->hostname) != 0)
-                return 0;
-        if (strcmp(server->mnt_path, old->mnt_path) != 0)
-                return 0;
-        return 1;
-}
-static void *
-nfs_copy_user_string(char *dst, struct nfs_string *src, int maxlen)
-{
-        void *p = NULL;
-        if (!src->len)
-                return ERR_PTR(-EINVAL);
-        if (src->len < maxlen)
-                maxlen = src->len;
-        if (dst == NULL) {
-                p = dst = kmalloc(maxlen + 1, GFP_KERNEL);
-                if (p == NULL)
-                        return ERR_PTR(-ENOMEM);
-        }
-        if (copy_from_user(dst, src->data, maxlen)) {
-                kfree(p);
-                return ERR_PTR(-EFAULT);
-        }
-        dst[maxlen] = '\0';
-        return dst;
-}
-static struct super_block *nfs4_get_sb(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *raw_data)
-{
-        int error;
-        struct nfs_server *server;
-        struct super_block *s;
-        struct nfs4_mount_data *data = raw_data;
-        void *p;
-        if (data == NULL) {
-                dprintk("%s: missing data argument\n", __FUNCTION__);
-                return ERR_PTR(-EINVAL);
-        }
-        if (data->version <= 0 || data->version > NFS4_MOUNT_VERSION) {
-                dprintk("%s: bad mount version\n", __FUNCTION__);
-                return ERR_PTR(-EINVAL);
-        }
-        server = kzalloc(sizeof(struct nfs_server), GFP_KERNEL);
-        if (!server)
-                return ERR_PTR(-ENOMEM);
-        /* Zero out the NFS state stuff */
-        init_nfsv4_state(server);
-        server->client = server->client_sys = server->client_acl = ERR_PTR(-EINVAL);
-        p = nfs_copy_user_string(NULL, &data->hostname, 256);
-        if (IS_ERR(p))
-                goto out_err;
-        server->hostname = p;
-        p = nfs_copy_user_string(NULL, &data->mnt_path, 1024);
-        if (IS_ERR(p))
-                goto out_err;
-        server->mnt_path = p;
-        p = nfs_copy_user_string(server->ip_addr, &data->client_addr,
-                        sizeof(server->ip_addr) - 1);
-        if (IS_ERR(p))
-                goto out_err;
-        /* We now require that the mount process passes the remote address */
-        if (data->host_addrlen != sizeof(server->addr)) {
-                s = ERR_PTR(-EINVAL);
-                goto out_free;
-        }
-        if (copy_from_user(&server->addr, data->host_addr, sizeof(server->addr))) {
-                s = ERR_PTR(-EFAULT);
-                goto out_free;
-        }
-        if (server->addr.sin_family != AF_INET ||
-            server->addr.sin_addr.s_addr == INADDR_ANY) {
-                dprintk("%s: mount program didn't pass remote IP address!\n",
-                                __FUNCTION__);
-                s = ERR_PTR(-EINVAL);
-                goto out_free;
-        }
-        /* Fire up rpciod if not yet running */
-        s = ERR_PTR(rpciod_up());
-        if (IS_ERR(s)) {
-                dprintk("%s: couldn't start rpciod! Error = %ld\n",
-                                __FUNCTION__, PTR_ERR(s));
-                goto out_free;
-        }
-        s = sget(fs_type, nfs4_compare_super, nfs_set_super, server);
-        if (IS_ERR(s) || s->s_root)
-                goto out_free;
-        s->s_flags = flags;
-        error = nfs4_fill_super(s, data, flags & MS_SILENT ? 1 : 0);
-        if (error) {
-                up_write(&s->s_umount);
-                deactivate_super(s);
-                return ERR_PTR(error);
-        }
-        s->s_flags |= MS_ACTIVE;
-        return s;
-out_err:
-        s = (struct super_block *)p;
-out_free:
-        kfree(server->mnt_path);
-        kfree(server->hostname);
-        kfree(server);
-        return s;
-}
-static void nfs4_kill_super(struct super_block *sb)
-{
-        struct nfs_server *server = NFS_SB(sb);
-        nfs_return_all_delegations(sb);
-        kill_anon_super(sb);
-        nfs4_renewd_prepare_shutdown(server);
-        if (server->client != NULL && !IS_ERR(server->client))
-                rpc_shutdown_client(server->client);
-        destroy_nfsv4_state(server);
-        rpciod_down();
-        nfs_free_iostats(server->io_stats);
-        kfree(server->hostname);
-        kfree(server);
-}
-static struct file_system_type nfs4_fs_type = {
-        .owner          = THIS_MODULE,
-        .name           = "nfs4",
-        .get_sb         = nfs4_get_sb,
-        .kill_sb        = nfs4_kill_super,
-        .fs_flags       = FS_ODD_RENAME|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
-};
-static const int nfs_set_port_min = 0;
-static const int nfs_set_port_max = 65535;
-static int param_set_port(const char *val, struct kernel_param *kp)
-{
-        char *endp;
-        int num = simple_strtol(val, &endp, 0);
-        if (endp == val || *endp || num < nfs_set_port_min || num > nfs_set_port_max)
-                return -EINVAL;
-        *((int *)kp->arg) = num;
-        return 0;
-}
-module_param_call(callback_tcpport, param_set_port, param_get_int,
-                 &nfs_callback_set_tcpport, 0644);
-static int param_set_idmap_timeout(const char *val, struct kernel_param *kp)
-{
-        char *endp;
-        int num = simple_strtol(val, &endp, 0);
-        int jif = num * HZ;
-        if (endp == val || *endp || num < 0 || jif < num)
-                return -EINVAL;
-        *((int *)kp->arg) = jif;
-        return 0;
-}
-module_param_call(idmap_cache_timeout, param_set_idmap_timeout, param_get_int,
-                 &nfs_idmap_cache_timeout, 0644);
-#define nfs4_init_once(nfsi) \
-        do { \
-                INIT_LIST_HEAD(&(nfsi)->open_states); \
-                nfsi->delegation = NULL; \
-                nfsi->delegation_state = 0; \
-                init_rwsem(&nfsi->rwsem); \
-        } while(0)
-static inline int register_nfs4fs(void)
-{
-        int ret;
-        ret = nfs_register_sysctl();
-        if (ret != 0)
-                return ret;
-        ret = register_filesystem(&nfs4_fs_type);
-        if (ret != 0)
-                nfs_unregister_sysctl();
-        return ret;
-}
-static inline void unregister_nfs4fs(void)
-{
-        unregister_filesystem(&nfs4_fs_type);
-        nfs_unregister_sysctl();
-}
-#else
-#define nfs4_init_once(nfsi) \
-        do { } while (0)
-#define register_nfs4fs() (0)
-#define unregister_nfs4fs()
 #endif
-extern int nfs_init_nfspagecache(void);
+struct inode *nfs_alloc_inode(struct super_block *sb)
-extern void nfs_destroy_nfspagecache(void);
-extern int nfs_init_readpagecache(void);
-extern void nfs_destroy_readpagecache(void);
-extern int nfs_init_writepagecache(void);
-extern void nfs_destroy_writepagecache(void);
-#ifdef CONFIG_NFS_DIRECTIO
-extern int nfs_init_directcache(void);
-extern void nfs_destroy_directcache(void);
-#endif
-static kmem_cache_t * nfs_inode_cachep;
-static struct inode *nfs_alloc_inode(struct super_block *sb)
 {
        struct nfs_inode *nfsi;
        nfsi = (struct nfs_inode *)kmem_cache_alloc(nfs_inode_cachep, SLAB_KERNEL);
@@ -2244,11 +1083,21 @@ static struct inode *nfs_alloc_inode(struct super_block *sb)
        return &nfsi->vfs_inode;
 }
-static void nfs_destroy_inode(struct inode *inode)
+void nfs_destroy_inode(struct inode *inode)
 {
        kmem_cache_free(nfs_inode_cachep, NFS_I(inode));
 }
+/*
+ * Initialise the NFSv4-only members of an nfs_inode.
+ * The body is compiled out when CONFIG_NFS_V4 is not set.
+ */
+static inline void nfs4_init_once(struct nfs_inode *nfsi)
+{
+#ifdef CONFIG_NFS_V4
+        init_rwsem(&nfsi->rwsem);
+        INIT_LIST_HEAD(&nfsi->open_states);
+        nfsi->delegation_state = 0;
+        nfsi->delegation = NULL;
+#endif
+}
 static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
 {
        struct nfs_inode *nfsi = (struct nfs_inode *) foo;
@@ -2269,7 +1118,7 @@ static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
        }
 }
 
-static int nfs_init_inodecache(void)
+static int __init nfs_init_inodecache(void)
 {
        nfs_inode_cachep = kmem_cache_create("nfs_inode_cache",
                                             sizeof(struct nfs_inode),
@@ -2311,29 +1160,22 @@ static int __init init_nfs_fs(void)
        if (err)
                goto out1;
-#ifdef CONFIG_NFS_DIRECTIO
        err = nfs_init_directcache();
        if (err)
                goto out0;
-#endif
 #ifdef CONFIG_PROC_FS
        rpc_proc_register(&nfs_rpcstat);
 #endif
-        err = register_filesystem(&nfs_fs_type);
+        if ((err = register_nfs_fs()) != 0)
-        if (err)
-                goto out;
-        if ((err = register_nfs4fs()) != 0)
                goto out;
        return 0;
 out:
 #ifdef CONFIG_PROC_FS
        rpc_proc_unregister("nfs");
 #endif
-#ifdef CONFIG_NFS_DIRECTIO
        nfs_destroy_directcache();
 out0:
-#endif
        nfs_destroy_writepagecache();
 out1:
        nfs_destroy_readpagecache();
@@ -2347,9 +1189,7 @@ out4:
 static void __exit exit_nfs_fs(void)
 {
-#ifdef CONFIG_NFS_DIRECTIO
        nfs_destroy_directcache();
-#endif
        nfs_destroy_writepagecache();
        nfs_destroy_readpagecache();
        nfs_destroy_inodecache();
@@ -2357,8 +1197,7 @@ static void __exit exit_nfs_fs(void)
 #ifdef CONFIG_PROC_FS
        rpc_proc_unregister("nfs");
 #endif
-        unregister_filesystem(&nfs_fs_type);
+        unregister_nfs_fs();
-        unregister_nfs4fs();
 }
 /* Not quite true; I just maintain it */
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
new file mode 100644
index 000000000000..e4f4e5def0fc
--- /dev/null
+++ b/fs/nfs/internal.h
@@ -0,0 +1,186 @@
+/*
+ * NFS internal definitions
+ */
+#include <linux/mount.h>
+/*
+ * Mount data handed to vfs_kern_mount() when NFS creates a submount or
+ * follows a referral; filled in by nfs_do_submount() and
+ * nfs_follow_referral().
+ */
+struct nfs_clone_mount {
+        const struct super_block *sb;   /* superblock of the parent mount */
+        const struct dentry *dentry;    /* dentry the new mount is set up on */
+        struct nfs_fh *fh;              /* submount: filehandle for the new root dentry */
+        struct nfs_fattr *fattr;        /* submount: attributes for the new root inode */
+        char *hostname;                 /* referral: server name string being tried */
+        char *mnt_path;                 /* referral: root path on the new server */
+        struct sockaddr_in *addr;       /* referral: IPv4 address/port of the new server */
+        rpc_authflavor_t authflavor;    /* referral: auth flavour copied from the parent's RPC client */
+};
+/* nfs4namespace.c */
+#ifdef CONFIG_NFS_V4
+extern struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentry *dentry);
+#else
+static inline
+struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentry *dentry)
+{
+        return ERR_PTR(-ENOENT);
+}
+#endif
+/* callback_xdr.c */
+extern struct svc_version nfs4_callback_version1;
+/* pagelist.c */
+extern int __init nfs_init_nfspagecache(void);
+extern void nfs_destroy_nfspagecache(void);
+extern int __init nfs_init_readpagecache(void);
+extern void nfs_destroy_readpagecache(void);
+extern int __init nfs_init_writepagecache(void);
+extern void nfs_destroy_writepagecache(void);
+#ifdef CONFIG_NFS_DIRECTIO
+extern int __init nfs_init_directcache(void);
+extern void nfs_destroy_directcache(void);
+#else
+#define nfs_init_directcache() (0)
+#define nfs_destroy_directcache() do {} while(0)
+#endif
+/* nfs2xdr.c */
+extern struct rpc_procinfo nfs_procedures[];
+extern u32 * nfs_decode_dirent(u32 *, struct nfs_entry *, int);
+/* nfs3xdr.c */
+extern struct rpc_procinfo nfs3_procedures[];
+extern u32 *nfs3_decode_dirent(u32 *, struct nfs_entry *, int);
+/* nfs4xdr.c */
+extern int nfs_stat_to_errno(int);
+extern u32 *nfs4_decode_dirent(u32 *p, struct nfs_entry *entry, int plus);
+/* nfs4proc.c */
+#ifdef CONFIG_NFS_V4
+extern struct rpc_procinfo nfs4_procedures[];
+extern int nfs4_proc_fs_locations(struct inode *dir, struct dentry *dentry,
+                                  struct nfs4_fs_locations *fs_locations,
+                                  struct page *page);
+#endif
+/* inode.c */
+extern struct inode *nfs_alloc_inode(struct super_block *sb);
+extern void nfs_destroy_inode(struct inode *);
+extern int nfs_write_inode(struct inode *,int);
+extern void nfs_clear_inode(struct inode *);
+#ifdef CONFIG_NFS_V4
+extern void nfs4_clear_inode(struct inode *);
+#endif
+/* super.c */
+extern struct file_system_type nfs_referral_nfs4_fs_type;
+extern struct file_system_type clone_nfs_fs_type;
+#ifdef CONFIG_NFS_V4
+extern struct file_system_type clone_nfs4_fs_type;
+#endif
+extern struct rpc_stat nfs_rpcstat;
+extern int __init register_nfs_fs(void);
+extern void __exit unregister_nfs_fs(void);
+/* namespace.c */
+extern char *nfs_path(const char *base, const struct dentry *dentry,
+                      char *buffer, ssize_t buflen);
+/*
+ * Build the path of @dentry as a string, prefixed with the mnt_path
+ * stored in its superblock's NFS server data.
+ * Evaluates to NULL when NFSv4 support is not configured.
+ */
+static inline char *
+nfs4_path(const struct dentry *dentry, char *buffer, ssize_t buflen)
+{
+#ifndef CONFIG_NFS_V4
+        return NULL;
+#else
+        const char *base = NFS_SB(dentry->d_sb)->mnt_path;
+        return nfs_path(base, dentry, buffer, buflen);
+#endif
+}
+/*
+ * Build the device name for @dentry as a string: the parent mount's
+ * device name followed by the path of @dentry.
+ */
+static inline char *nfs_devname(const struct vfsmount *mnt_parent,
+                         const struct dentry *dentry,
+                         char *buffer, ssize_t buflen)
+{
+        const char *parent_devname = mnt_parent->mnt_devname;
+        return nfs_path(parent_devname, dentry, buffer, buflen);
+}
+/*
+ * Determine the actual block size (and log2 thereof)
+ *
+ * @bsize: requested block size
+ * @nrbitsp: if non-NULL, receives log2 of the returned block size
+ *
+ * Returns @bsize rounded down to a power of two. A @bsize of 0 yields 1
+ * when the rounding path is taken (i.e. whenever @nrbitsp is non-NULL).
+ */
+static inline
+unsigned long nfs_block_bits(unsigned long bsize, unsigned char *nrbitsp)
+{
+        /* make sure blocksize is a power of two */
+        if ((bsize & (bsize - 1)) || nrbitsp) {
+                unsigned char   nrbits;
+                /*
+                 * Find the highest set bit. The mask must be unsigned long:
+                 * "1 << 31" shifts into the sign bit of an int and then
+                 * sign-extends when widened to unsigned long.
+                 */
+                for (nrbits = 8 * sizeof(bsize) - 1;
+                     nrbits && !(bsize & (1UL << nrbits)); nrbits--)
+                        ;
+                bsize = 1UL << nrbits;
+                if (nrbitsp)
+                        *nrbitsp = nrbits;
+        }
+        return bsize;
+}
+/*
+ * Calculate the number of 512byte blocks used.
+ *
+ * @tsize: size in bytes
+ *
+ * Returns @tsize rounded up to whole 512-byte blocks, capped at ULONG_MAX.
+ */
+static inline unsigned long nfs_calc_block_size(u64 tsize)
+{
+        /* round up without forming tsize + 511, which wraps near 2^64 */
+        u64 used = (tsize >> 9) + ((tsize & 511) ? 1 : 0);
+        return (used > ULONG_MAX) ? ULONG_MAX : used;
+}
+/*
+ * Compute and set NFS server blocksize
+ *
+ * Sizes below NFS_MIN_FILE_IO_SIZE are replaced by NFS_DEF_FILE_IO_SIZE,
+ * sizes at or above NFS_MAX_FILE_IO_SIZE are capped to it, and the result
+ * is passed through nfs_block_bits().
+ */
+static inline
+unsigned long nfs_block_size(unsigned long bsize, unsigned char *nrbitsp)
+{
+        if (bsize < NFS_MIN_FILE_IO_SIZE)
+                return nfs_block_bits(NFS_DEF_FILE_IO_SIZE, nrbitsp);
+        if (bsize >= NFS_MAX_FILE_IO_SIZE)
+                return nfs_block_bits(NFS_MAX_FILE_IO_SIZE, nrbitsp);
+        return nfs_block_bits(bsize, nrbitsp);
+}
+/*
+ * Determine the maximum file size for a superblock
+ *
+ * Stores @maxfilesize in sb->s_maxbytes, falling back to
+ * MAX_LFS_FILESIZE when the value is not positive or exceeds that limit.
+ */
+static inline
+void nfs_super_set_maxbytes(struct super_block *sb, __u64 maxfilesize)
+{
+        sb->s_maxbytes = (loff_t)maxfilesize;
+        /* value is usable as-is: nothing more to do */
+        if (sb->s_maxbytes > 0 && sb->s_maxbytes <= MAX_LFS_FILESIZE)
+                return;
+        sb->s_maxbytes = MAX_LFS_FILESIZE;
+}
+/*
+ * Check if the string represents a "valid" IPv4 address
+ *
+ * @buf: NUL-terminated string to check
+ *
+ * Returns 0 if @buf starts with four dot-separated decimal numbers, each
+ * in the range 0..255, and -EINVAL otherwise. Text following the fourth
+ * number is not examined.
+ */
+static inline int valid_ipaddr4(const char *buf)
+{
+        int rc, count, in[4];
+        rc = sscanf(buf, "%d.%d.%d.%d", &in[0], &in[1], &in[2], &in[3]);
+        if (rc != 4)
+                return -EINVAL;
+        for (count = 0; count < 4; count++) {
+                /* %d also accepts a sign, so bound each octet on both sides */
+                if (in[count] < 0 || in[count] > 255)
+                        return -EINVAL;
+        }
+        return 0;
+}
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
new file mode 100644
index 000000000000..19b98ca468eb
--- /dev/null
+++ b/fs/nfs/namespace.c
@@ -0,0 +1,229 @@
+/*
+ * linux/fs/nfs/namespace.c
+ *
+ * Copyright (C) 2005 Trond Myklebust <Trond.Myklebust@netapp.com>
+ *
+ * NFS namespace
+ */
+#include <linux/config.h>
+#include <linux/dcache.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/nfs_fs.h>
+#include <linux/string.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/vfs.h>
+#include "internal.h"
+#define NFSDBG_FACILITY         NFSDBG_VFS
+static void nfs_expire_automounts(void *list);
+LIST_HEAD(nfs_automount_list);
+static DECLARE_WORK(nfs_automount_task, nfs_expire_automounts, &nfs_automount_list);
+int nfs_mountpoint_expiry_timeout = 500 * HZ;
+/*
+ * nfs_path - reconstruct the path given an arbitrary dentry
+ * @base - arbitrary string to prepend to the path
+ * @dentry - pointer to dentry
+ * @buffer - result buffer
+ * @buflen - length of buffer
+ *
+ * Helper function for constructing the path from the
+ * root dentry to an arbitrary hashed dentry.
+ *
+ * This is mainly for use in figuring out the path on the
+ * server side when automounting on top of an existing partition.
+ *
+ * The string is assembled backwards from the end of @buffer. Returns a
+ * pointer to its first character (inside @buffer), or
+ * ERR_PTR(-ENAMETOOLONG) if the result does not fit.
+ */
+char *nfs_path(const char *base, const struct dentry *dentry,
+               char *buffer, ssize_t buflen)
+{
+        char *end = buffer+buflen;
+        int namelen;
+        /* need room for at least the terminating NUL */
+        if (buflen < 1)
+                goto Elong;
+        *--end = '\0';
+        buflen--;
+        spin_lock(&dcache_lock);
+        while (!IS_ROOT(dentry)) {
+                namelen = dentry->d_name.len;
+                buflen -= namelen + 1;
+                /* dcache_lock is held here, so it must be dropped on the way out */
+                if (buflen < 0)
+                        goto Elong_unlock;
+                end -= namelen;
+                memcpy(end, dentry->d_name.name, namelen);
+                *--end = '/';
+                dentry = dentry->d_parent;
+        }
+        spin_unlock(&dcache_lock);
+        namelen = strlen(base);
+        /* Strip off excess slashes in base string */
+        while (namelen > 0 && base[namelen - 1] == '/')
+                namelen--;
+        buflen -= namelen;
+        if (buflen < 0)
+                goto Elong;
+        end -= namelen;
+        memcpy(end, base, namelen);
+        return end;
+Elong_unlock:
+        spin_unlock(&dcache_lock);
+Elong:
+        return ERR_PTR(-ENAMETOOLONG);
+}
+/*
+ * nfs_follow_mountpoint - handle crossing a mountpoint on the server
+ * @dentry - dentry of mountpoint
+ * @nd - nameidata info
+ *
+ * When we encounter a mountpoint on the server, we want to set up
+ * a mountpoint on the client too, to prevent inode numbers from
+ * colliding, and to allow "df" to work properly.
+ * On NFSv4, we also want to allow for the fact that different
+ * filesystems may be migrated to different servers in a failover
+ * situation, and that different filesystems may want to use
+ * different security flavours.
+ *
+ * Returns ERR_PTR(err); err is 0 (so the result is NULL) when an
+ * existing mount was followed. On the error path the nameidata is
+ * released with path_release().
+ */
+static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
+{
+        struct vfsmount *mnt;
+        struct nfs_server *server = NFS_SERVER(dentry->d_inode);
+        struct dentry *parent;
+        struct nfs_fh fh;
+        struct nfs_fattr fattr;
+        int err;
+        BUG_ON(IS_ROOT(dentry));
+        dprintk("%s: enter\n", __FUNCTION__);
+        /* make nd->dentry hold a reference to the mountpoint dentry itself */
+        dput(nd->dentry);
+        nd->dentry = dget(dentry);
+        /* something is already mounted here: just walk down into it */
+        if (d_mountpoint(nd->dentry))
+                goto out_follow;
+        /* Look it up again */
+        parent = dget_parent(nd->dentry);
+        err = server->rpc_ops->lookup(parent->d_inode, &nd->dentry->d_name, &fh, &fattr);
+        dput(parent);
+        if (err != 0)
+                goto out_err;
+        /* referrals and ordinary submounts are set up by different helpers */
+        if (fattr.valid & NFS_ATTR_FATTR_V4_REFERRAL)
+                mnt = nfs_do_refmount(nd->mnt, nd->dentry);
+        else
+                mnt = nfs_do_submount(nd->mnt, nd->dentry, &fh, &fattr);
+        err = PTR_ERR(mnt);
+        if (IS_ERR(mnt))
+                goto out_err;
+        /* extra reference: dropped again below if do_add_mount() fails */
+        mntget(mnt);
+        err = do_add_mount(mnt, nd, nd->mnt->mnt_flags|MNT_SHRINKABLE, &nfs_automount_list);
+        if (err < 0) {
+                mntput(mnt);
+                /* NOTE(review): -EBUSY presumably means a mount already exists here -- confirm */
+                if (err == -EBUSY)
+                        goto out_follow;
+                goto out_err;
+        }
+        /* switch the nameidata over to the root of the new mount */
+        mntput(nd->mnt);
+        dput(nd->dentry);
+        nd->mnt = mnt;
+        nd->dentry = dget(mnt->mnt_root);
+        /* arm the work item that runs nfs_expire_automounts() */
+        schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout);
+out:
+        dprintk("%s: done, returned %d\n", __FUNCTION__, err);
+        return ERR_PTR(err);
+out_err:
+        path_release(nd);
+        goto out;
+out_follow:
+        /* descend through every mount stacked on this dentry */
+        while(d_mountpoint(nd->dentry) && follow_down(&nd->mnt, &nd->dentry))
+                ;
+        err = 0;
+        goto out;
+}
+/*
+ * Inode operations whose ->follow_link handler is nfs_follow_mountpoint(),
+ * which sets up the client-side mount on demand.
+ */
+struct inode_operations nfs_mountpoint_inode_operations = {
+        .follow_link    = nfs_follow_mountpoint,
+        .getattr        = nfs_getattr,
+};
+/* Referral variant: same ->follow_link handler, but no ->getattr method. */
+struct inode_operations nfs_referral_inode_operations = {
+        .follow_link    = nfs_follow_mountpoint,
+};
+/*
+ * Work handler: mark the mounts on the given list for expiry and re-arm
+ * the delayed work for as long as the list still has entries.
+ */
+static void nfs_expire_automounts(void *data)
+{
+        struct list_head *mounts = data;
+        mark_mounts_for_expiry(mounts);
+        if (list_empty(mounts))
+                return;
+        schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout);
+}
+/*
+ * Cancel the automount expiry work and flush the shared workqueue, but
+ * only once the automount list is empty.
+ */
+void nfs_release_automount_timer(void)
+{
+        if (!list_empty(&nfs_automount_list))
+                return;
+        cancel_delayed_work(&nfs_automount_task);
+        flush_scheduled_work();
+}
+/*
+ * Clone a mountpoint of the appropriate type
+ *
+ * @server: NFS server data of the parent mount; its protocol version
+ *          selects the filesystem type
+ * @devname: device name for the new mount
+ * @mountdata: clone data passed through to vfs_kern_mount()
+ *
+ * Returns the new vfsmount or an ERR_PTR() value; never NULL. Callers
+ * test the result with IS_ERR() only, so an unexpected protocol version
+ * is reported as -EINVAL rather than as a NULL mount.
+ */
+static struct vfsmount *nfs_do_clone_mount(struct nfs_server *server, char *devname,
+                                           struct nfs_clone_mount *mountdata)
+{
+#ifdef CONFIG_NFS_V4
+        struct vfsmount *mnt = ERR_PTR(-EINVAL);
+        switch (server->rpc_ops->version) {
+                case 2:
+                case 3:
+                        mnt = vfs_kern_mount(&clone_nfs_fs_type, 0, devname, mountdata);
+                        break;
+                case 4:
+                        mnt = vfs_kern_mount(&clone_nfs4_fs_type, 0, devname, mountdata);
+                        break;
+                default:
+                        /* unknown version: keep the -EINVAL set above */
+                        break;
+        }
+        return mnt;
+#else
+        return vfs_kern_mount(&clone_nfs_fs_type, 0, devname, mountdata);
+#endif
+}
+/**
+ * nfs_do_submount - set up mountpoint when crossing a filesystem boundary
+ * @mnt_parent - mountpoint of parent directory
+ * @dentry - dentry the submount is being set up on
+ * @fh - filehandle for new root dentry
+ * @fattr - attributes for new root inode
+ *
+ * Builds the device name in a temporary page and clones a mount from it.
+ * Returns the new vfsmount, or an ERR_PTR() value (-ENOMEM if the page
+ * cannot be allocated, or the error from building the device name or
+ * from the clone).
+ */
+struct vfsmount *nfs_do_submount(const struct vfsmount *mnt_parent,
+                const struct dentry *dentry, struct nfs_fh *fh,
+                struct nfs_fattr *fattr)
+{
+        struct nfs_clone_mount mountdata = {
+                .sb = mnt_parent->mnt_sb,
+                .dentry = dentry,
+                .fh = fh,
+                .fattr = fattr,
+        };
+        char *page = (char *) __get_free_page(GFP_USER);
+        struct vfsmount *mnt;
+        char *devname;
+        dprintk("%s: submounting on %s/%s\n", __FUNCTION__,
+                        dentry->d_parent->d_name.name,
+                        dentry->d_name.name);
+        if (page == NULL) {
+                mnt = ERR_PTR(-ENOMEM);
+                goto out;
+        }
+        devname = nfs_devname(mnt_parent, dentry, page, PAGE_SIZE);
+        if (IS_ERR(devname))
+                /* hand the encoded error straight back to the caller */
+                mnt = (struct vfsmount *)devname;
+        else
+                mnt = nfs_do_clone_mount(NFS_SB(mnt_parent->mnt_sb), devname, &mountdata);
+        free_page((unsigned long)page);
+out:
+        dprintk("%s: done\n", __FUNCTION__);
+        return mnt;
+}
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index f0015fa876e1..67391eef6b93 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -23,12 +23,11 @@
 #include <linux/nfs.h>
 #include <linux/nfs2.h>
 #include <linux/nfs_fs.h>
+#include "internal.h"
 #define NFSDBG_FACILITY         NFSDBG_XDR
 /* #define NFS_PARANOIA 1 */
-extern int                      nfs_stat_to_errno(int stat);
 /* Mapping from NFS error code to "errno" error code. */
 #define errno_NFSERR_IO         EIO
@@ -131,7 +130,8 @@ xdr_decode_fattr(u32 *p, struct nfs_fattr *fattr)
        fattr->du.nfs2.blocksize = ntohl(*p++);
        rdev = ntohl(*p++);
        fattr->du.nfs2.blocks = ntohl(*p++);
-        fattr->fsid_u.nfs3 = ntohl(*p++);
+        fattr->fsid.major = ntohl(*p++);
+        fattr->fsid.minor = 0;
        fattr->fileid = ntohl(*p++);
        p = xdr_decode_time(p, &fattr->atime);
        p = xdr_decode_time(p, &fattr->mtime);
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index 33287879bd23..7322da4d2055 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -172,8 +172,10 @@ static void nfs3_cache_acls(struct inode *inode, struct posix_acl *acl,
                inode->i_ino, acl, dfacl);
        spin_lock(&inode->i_lock);
        __nfs3_forget_cached_acls(NFS_I(inode));
-        nfsi->acl_access = posix_acl_dup(acl);
+        if (!IS_ERR(acl))
-        nfsi->acl_default = posix_acl_dup(dfacl);
+                nfsi->acl_access = posix_acl_dup(acl);
+        if (!IS_ERR(dfacl))
+                nfsi->acl_default = posix_acl_dup(dfacl);
        spin_unlock(&inode->i_lock);
 }
@@ -254,7 +256,9 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type)
                        res.acl_access = NULL;
                }
        }
-        nfs3_cache_acls(inode, res.acl_access, res.acl_default);
+        nfs3_cache_acls(inode,
+                (res.mask & NFS_ACL)   ? res.acl_access  : ERR_PTR(-EINVAL),
+                (res.mask & NFS_DFACL) ? res.acl_default : ERR_PTR(-EINVAL));
        switch(type) {
                case ACL_TYPE_ACCESS:
@@ -329,6 +333,7 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
        switch (status) {
                case 0:
                        status = nfs_refresh_inode(inode, &fattr);
+                        nfs3_cache_acls(inode, acl, dfacl);
                        break;
                case -EPFNOSUPPORT:
                case -EPROTONOSUPPORT:
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index cf186f0d2b3b..7143b1f82cea 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -20,11 +20,10 @@
 #include <linux/nfs_mount.h>
 #include "iostat.h"
+#include "internal.h"
 #define NFSDBG_FACILITY         NFSDBG_PROC
-extern struct rpc_procinfo nfs3_procedures[];
 /* A wrapper to handle the EJUKEBOX error message */
 static int
 nfs3_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags)
@@ -809,8 +808,6 @@ nfs3_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
        return status;
 }
-extern u32 *nfs3_decode_dirent(u32 *, struct nfs_entry *, int);
 static int nfs3_read_done(struct rpc_task *task, struct nfs_read_data *data)
 {
        if (nfs3_async_handle_jukebox(task, data->inode))
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index ec233619687e..0250269e9753 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -22,14 +22,13 @@
 #include <linux/nfs3.h>
 #include <linux/nfs_fs.h>
 #include <linux/nfsacl.h>
+#include "internal.h"
 #define NFSDBG_FACILITY         NFSDBG_XDR
 /* Mapping from NFS error code to "errno" error code. */
 #define errno_NFSERR_IO         EIO
-extern int                      nfs_stat_to_errno(int);
 /*
 * Declare the space requirements for NFS arguments and replies as
 * number of 32bit-words
@@ -166,7 +165,8 @@ xdr_decode_fattr(u32 *p, struct nfs_fattr *fattr)
        if (MAJOR(fattr->rdev) != major || MINOR(fattr->rdev) != minor)
                fattr->rdev = 0;
-        p = xdr_decode_hyper(p, &fattr->fsid_u.nfs3);
+        p = xdr_decode_hyper(p, &fattr->fsid.major);
+        fattr->fsid.minor = 0;
        p = xdr_decode_hyper(p, &fattr->fileid);
        p = xdr_decode_time3(p, &fattr->atime);
        p = xdr_decode_time3(p, &fattr->mtime);
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 0f5e4e7cddec..9a102860df37 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -217,6 +217,9 @@ extern int nfs4_proc_renew(struct nfs4_client *, struct rpc_cred *);
 extern int nfs4_do_close(struct inode *inode, struct nfs4_state *state);
 extern struct dentry *nfs4_atomic_open(struct inode *, struct dentry *, struct nameidata *);
 extern int nfs4_open_revalidate(struct inode *, struct dentry *, int, struct nameidata *);
+extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle);
+extern int nfs4_proc_fs_locations(struct inode *dir, struct dentry *dentry,
+                struct nfs4_fs_locations *fs_locations, struct page *page);
 extern struct nfs4_state_recovery_ops nfs4_reboot_recovery_ops;
 extern struct nfs4_state_recovery_ops nfs4_network_partition_recovery_ops;
@@ -225,6 +228,7 @@ extern const u32 nfs4_fattr_bitmap[2];
 extern const u32 nfs4_statfs_bitmap[2];
 extern const u32 nfs4_pathconf_bitmap[2];
 extern const u32 nfs4_fsinfo_bitmap[2];
+extern const u32 nfs4_fs_locations_bitmap[2];
 /* nfs4renewd.c */
 extern void nfs4_schedule_state_renewal(struct nfs4_client *);
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
new file mode 100644
index 000000000000..ea38d27b74e6
--- /dev/null
+++ b/fs/nfs/nfs4namespace.c
@@ -0,0 +1,201 @@
+/*
+ * linux/fs/nfs/nfs4namespace.c
+ *
+ * Copyright (C) 2005 Trond Myklebust <Trond.Myklebust@netapp.com>
+ *
+ * NFSv4 namespace
+ */
+#include <linux/config.h>
+#include <linux/dcache.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/nfs_fs.h>
+#include <linux/string.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/vfs.h>
+#include <linux/inet.h>
+#include "internal.h"
+#define NFSDBG_FACILITY         NFSDBG_VFS
+/*
+ * Flatten an nfs4_pathname into a string in which every component is
+ * preceded by '/'.
+ *
+ * The string is built backwards from the end of @buffer. Returns a
+ * pointer to its first character, or ERR_PTR(-ENAMETOOLONG) if the
+ * components do not fit in @buflen bytes.
+ */
+static inline char *nfs4_pathname_string(struct nfs4_pathname *pathname,
+                                         char *buffer, ssize_t buflen)
+{
+        char *pos = buffer + buflen;
+        int i = pathname->ncomponents;
+        *--pos = '\0';
+        buflen--;
+        /* walk the components last to first, prepending each one */
+        while (i-- > 0) {
+                struct nfs4_string *comp = &pathname->components[i];
+                buflen -= comp->len + 1;
+                if (buflen < 0)
+                        return ERR_PTR(-ENAMETOOLONG);
+                pos -= comp->len;
+                memcpy(pos, comp->data, comp->len);
+                *--pos = '/';
+        }
+        return pos;
+}
+/**
+ * nfs_follow_referral - set up mountpoint when hitting a referral on moved error
+ * @mnt_parent - mountpoint of parent directory
+ * @dentry - dentry of the referral
+ * @locations - fs_locations data obtained for the referral
+ *
+ * Checks that the fs_path in @locations is a prefix of the path of
+ * @dentry, then tries each server of each usable location in turn.
+ * Returns the first mount that succeeds, otherwise an ERR_PTR() value.
+ */
+static struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent,
+                                            const struct dentry *dentry,
+                                            struct nfs4_fs_locations *locations)
+{
+        struct vfsmount *mnt = ERR_PTR(-ENOENT);
+        struct nfs_clone_mount mountdata = {
+                .sb = mnt_parent->mnt_sb,
+                .dentry = dentry,
+                .authflavor = NFS_SB(mnt_parent->mnt_sb)->client->cl_auth->au_flavor,
+        };
+        char *page, *page2;
+        char *path, *fs_path;
+        char *devname;
+        int loc, s;
+        if (locations == NULL || locations->nlocations <= 0)
+                goto out;
+        dprintk("%s: referral at %s/%s\n", __FUNCTION__,
+                dentry->d_parent->d_name.name, dentry->d_name.name);
+        /* Ensure fs path is a prefix of current dentry path */
+        page = (char *) __get_free_page(GFP_USER);
+        if (page == NULL)
+                goto out;
+        page2 = (char *) __get_free_page(GFP_USER);
+        /* the first page is already allocated and must not be leaked */
+        if (page2 == NULL)
+                goto out_free_page;
+        path = nfs4_path(dentry, page, PAGE_SIZE);
+        if (IS_ERR(path))
+                goto out_free;
+        fs_path = nfs4_pathname_string(&locations->fs_path, page2, PAGE_SIZE);
+        if (IS_ERR(fs_path))
+                goto out_free;
+        if (strncmp(path, fs_path, strlen(fs_path)) != 0) {
+                dprintk("%s: path %s does not begin with fsroot %s\n", __FUNCTION__, path, fs_path);
+                goto out_free;
+        }
+        /* path is no longer needed, so its page is reused for the device name */
+        devname = nfs_devname(mnt_parent, dentry, page, PAGE_SIZE);
+        if (IS_ERR(devname)) {
+                mnt = (struct vfsmount *)devname;
+                goto out_free;
+        }
+        loc = 0;
+        while (loc < locations->nlocations && IS_ERR(mnt)) {
+                struct nfs4_fs_location *location = &locations->locations[loc];
+                char *mnt_path;
+                if (location == NULL || location->nservers <= 0 ||
+                    location->rootpath.ncomponents == 0) {
+                        loc++;
+                        continue;
+                }
+                /* fs_path is no longer needed, so page2 is reused for the root path */
+                mnt_path = nfs4_pathname_string(&location->rootpath, page2, PAGE_SIZE);
+                if (IS_ERR(mnt_path)) {
+                        loc++;
+                        continue;
+                }
+                mountdata.mnt_path = mnt_path;
+                s = 0;
+                while (s < location->nservers) {
+                        struct sockaddr_in addr = {};
+                        /* only servers given as dotted-quad IPv4 strings are tried */
+                        if (location->servers[s].len <= 0 ||
+                            valid_ipaddr4(location->servers[s].data) < 0) {
+                                s++;
+                                continue;
+                        }
+                        mountdata.hostname = location->servers[s].data;
+                        addr.sin_addr.s_addr = in_aton(mountdata.hostname);
+                        addr.sin_family = AF_INET;
+                        addr.sin_port = htons(NFS_PORT);
+                        mountdata.addr = &addr;
+                        mnt = vfs_kern_mount(&nfs_referral_nfs4_fs_type, 0, devname, &mountdata);
+                        if (!IS_ERR(mnt)) {
+                                break;
+                        }
+                        s++;
+                }
+                loc++;
+        }
+out_free:
+        free_page((unsigned long)page2);
+out_free_page:
+        free_page((unsigned long)page);
+out:
+        dprintk("%s: done\n", __FUNCTION__);
+        return mnt;
+}
+/*
+ * nfs_do_refmount - handle crossing a referral on server
+ * @mnt_parent - mountpoint of parent directory
+ * @dentry - dentry of referral
+ *
+ * Fetches the fs_locations for @dentry from the server and passes them
+ * to nfs_follow_referral(). Returns the resulting vfsmount, or an
+ * ERR_PTR() value (-ENOENT when allocation fails, the lookup fails, or
+ * the reply carries no locations or no fs_path components).
+ */
+struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentry *dentry)
+{
+        struct vfsmount *mnt = ERR_PTR(-ENOENT);
+        struct nfs4_fs_locations *fs_locations;
+        struct dentry *parent;
+        struct page *page;
+        int err;
+        /* BUG_ON(IS_ROOT(dentry)); */
+        dprintk("%s: enter\n", __FUNCTION__);
+        page = alloc_page(GFP_KERNEL);
+        if (page == NULL)
+                goto out;
+        fs_locations = kmalloc(sizeof(*fs_locations), GFP_KERNEL);
+        if (fs_locations != NULL) {
+                /* Get locations */
+                parent = dget_parent(dentry);
+                dprintk("%s: getting locations for %s/%s\n", __FUNCTION__, parent->d_name.name, dentry->d_name.name);
+                err = nfs4_proc_fs_locations(parent->d_inode, dentry, fs_locations, page);
+                dput(parent);
+                /* only follow a reply that actually carries locations and an fs_path */
+                if (err == 0 && fs_locations->nlocations > 0 &&
+                    fs_locations->fs_path.ncomponents > 0)
+                        mnt = nfs_follow_referral(mnt_parent, dentry, fs_locations);
+                kfree(fs_locations);
+        }
+        __free_page(page);
+out:
+        dprintk("%s: done\n", __FUNCTION__);
+        return mnt;
+}
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index d86c0db7b1e8..b4916b092194 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -65,8 +65,6 @@ static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *)
 static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry);
 static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, struct nfs4_exception *exception);
 static int nfs4_wait_clnt_recover(struct rpc_clnt *clnt, struct nfs4_client *clp);
-extern u32 *nfs4_decode_dirent(u32 *p, struct nfs_entry *entry, int plus);
-extern struct rpc_procinfo nfs4_procedures[];
 /* Prevent leaks of NFSv4 errors into userland */
 int nfs4_map_errors(int err)
@@ -121,6 +119,25 @@ const u32 nfs4_fsinfo_bitmap[2] = { FATTR4_WORD0_MAXFILESIZE
                        0
 };
+/*
+ * Two-word attribute bitmap that adds FATTR4_WORD0_FS_LOCATIONS to a set
+ * of ordinary file attributes (type, change, size, fsid, fileid, mode,
+ * link count, owner, group, rawdev, space used, timestamps and
+ * mounted-on fileid).
+ * NOTE(review): presumably requested when fetching referral attributes;
+ * confirm against the XDR encoder.
+ */
+const u32 nfs4_fs_locations_bitmap[2] = {
+        FATTR4_WORD0_TYPE
+        | FATTR4_WORD0_CHANGE
+        | FATTR4_WORD0_SIZE
+        | FATTR4_WORD0_FSID
+        | FATTR4_WORD0_FILEID
+        | FATTR4_WORD0_FS_LOCATIONS,
+        FATTR4_WORD1_MODE
+        | FATTR4_WORD1_NUMLINKS
+        | FATTR4_WORD1_OWNER
+        | FATTR4_WORD1_OWNER_GROUP
+        | FATTR4_WORD1_RAWDEV
+        | FATTR4_WORD1_SPACE_USED
+        | FATTR4_WORD1_TIME_ACCESS
+        | FATTR4_WORD1_TIME_METADATA
+        | FATTR4_WORD1_TIME_MODIFY
+        | FATTR4_WORD1_MOUNTED_ON_FILEID
+};
 static void nfs4_setup_readdir(u64 cookie, u32 *verifier, struct dentry *dentry,
                struct nfs4_readdir_arg *readdir)
 {
@@ -185,15 +202,15 @@ static void renew_lease(const struct nfs_server *server, unsigned long timestamp
        spin_unlock(&clp->cl_lock);
 }
-static void update_changeattr(struct inode *inode, struct nfs4_change_info *cinfo)
+static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo)
 {
-        struct nfs_inode *nfsi = NFS_I(inode);
+        struct nfs_inode *nfsi = NFS_I(dir);
-        spin_lock(&inode->i_lock);
+        spin_lock(&dir->i_lock);
-        nfsi->cache_validity |= NFS_INO_INVALID_ATTR;
+        nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE|NFS_INO_INVALID_DATA;
        if (cinfo->before == nfsi->change_attr && cinfo->atomic)
                nfsi->change_attr = cinfo->after;
-        spin_unlock(&inode->i_lock);
+        spin_unlock(&dir->i_lock);
 }
 struct nfs4_opendata {
@@ -1331,7 +1348,7 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
        return status;
 }
-static int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle)
+int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle)
 {
        struct nfs4_exception exception = { };
        int err;
@@ -1443,6 +1460,50 @@ out:
        return nfs4_map_errors(status);
 }
+/*
+ * Get locations and (maybe) other attributes of a referral.
+ * Note that we'll actually follow the referral later when
+ * we detect fsid mismatch in inode revalidation
+ *
+ * On success @fattr receives the attributes returned with the locations,
+ * flagged NFS_ATTR_FATTR_V4_REFERRAL (mode defaults to S_IFDIR when the
+ * server supplied none), and @fhandle is zeroed.
+ * Returns 0, -ENOMEM, -EIO if the fsid equals that of the parent server,
+ * or the status of the fs_locations call.
+ */
+static int nfs4_get_referral(struct inode *dir, struct qstr *name, struct nfs_fattr *fattr, struct nfs_fh *fhandle)
+{
+        struct nfs4_fs_locations *locations;
+        struct dentry dentry = {};
+        struct page *page;
+        int status;
+        page = alloc_page(GFP_KERNEL);
+        if (page == NULL)
+                return -ENOMEM;
+        locations = kmalloc(sizeof(struct nfs4_fs_locations), GFP_KERNEL);
+        if (locations == NULL) {
+                status = -ENOMEM;
+                goto out_free_page;
+        }
+        /* a stack dentry carrying only the name is enough for the call below */
+        dentry.d_name.name = name->name;
+        dentry.d_name.len = name->len;
+        status = nfs4_proc_fs_locations(dir, &dentry, locations, page);
+        if (status != 0)
+                goto out_free;
+        /* Make sure server returned a different fsid for the referral */
+        if (nfs_fsid_equal(&NFS_SERVER(dir)->fsid, &locations->fattr.fsid)) {
+                dprintk("%s: server did not return a different fsid for a referral at %s\n", __FUNCTION__, name->name);
+                status = -EIO;
+                goto out_free;
+        }
+        memcpy(fattr, &locations->fattr, sizeof(struct nfs_fattr));
+        fattr->valid |= NFS_ATTR_FATTR_V4_REFERRAL;
+        if (!fattr->mode)
+                fattr->mode = S_IFDIR;
+        memset(fhandle, 0, sizeof(struct nfs_fh));
+out_free:
+        kfree(locations);
+out_free_page:
+        __free_page(page);
+        return status;
+}
 static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr)
 {
        struct nfs4_getattr_arg args = {
@@ -1547,6 +1608,8 @@ static int _nfs4_proc_lookup(struct inode *dir, struct qstr *name,
        
        dprintk("NFS call  lookup %s\n", name->name);
        status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
+        if (status == -NFS4ERR_MOVED)
+                status = nfs4_get_referral(dir, name, fattr, fhandle);
        dprintk("NFS reply lookup: %d\n", status);
        return status;
 }
@@ -2008,7 +2071,7 @@ static int _nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *
        if (!status) {
                update_changeattr(dir, &res.cinfo);
                nfs_post_op_update_inode(dir, res.dir_attr);
-                nfs_refresh_inode(inode, res.fattr);
+                nfs_post_op_update_inode(inode, res.fattr);
        }
        return status;
@@ -3570,6 +3633,36 @@ ssize_t nfs4_listxattr(struct dentry *dentry, char *buf, size_t buflen)
        return len;
 }
+/*
+ * Issue a synchronous FS_LOCATIONS call for the entry dentry->d_name in
+ * directory dir.  The request asks for the FSID, FS_LOCATIONS and
+ * MOUNTED_ON_FILEID attributes; the reply is decoded into fs_locations,
+ * with page handed to the XDR layer through the call arguments.
+ * Returns the status of rpc_call_sync().
+ */
+int nfs4_proc_fs_locations(struct inode *dir, struct dentry *dentry,
+                struct nfs4_fs_locations *fs_locations, struct page *page)
+{
+        struct nfs_server *srv = NFS_SERVER(dir);
+        u32 wanted[2] = {
+                [0] = FATTR4_WORD0_FSID | FATTR4_WORD0_FS_LOCATIONS,
+                [1] = FATTR4_WORD1_MOUNTED_ON_FILEID,
+        };
+        struct nfs4_fs_locations_arg args = {
+                .dir_fh = NFS_FH(dir),
+                .name = &dentry->d_name,
+                .page = page,
+                .bitmask = wanted,
+        };
+        struct rpc_message msg = {
+                .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FS_LOCATIONS],
+                .rpc_argp = &args,
+                .rpc_resp = fs_locations,
+        };
+        int rc;
+        dprintk("%s: start\n", __FUNCTION__);
+        /* start from an empty result before the reply is decoded into it */
+        fs_locations->nlocations = 0;
+        fs_locations->server = srv;
+        fs_locations->fattr.valid = 0;
+        rc = rpc_call_sync(srv->client, &msg, 0);
+        dprintk("%s: returned status = %d\n", __FUNCTION__, rc);
+        return rc;
+}
 struct nfs4_state_recovery_ops nfs4_reboot_recovery_ops = {
        .recover_open   = nfs4_open_reclaim,
        .recover_lock   = nfs4_lock_reclaim,
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 96e5b82c153b..090a36b07a22 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -38,7 +38,6 @@
 * subsequent patch.
 */
-#include <linux/config.h>
 #include <linux/slab.h>
 #include <linux/smp_lock.h>
 #include <linux/nfs_fs.h>
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 7c5d70efe720..1750d996f49f 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -411,6 +411,15 @@ static int nfs_stat_to_errno(int);
 #define NFS4_dec_setacl_sz      (compound_decode_hdr_maxsz + \
                                decode_putfh_maxsz + \
                                op_decode_hdr_maxsz + nfs4_fattr_bitmap_maxsz)
+/*
+ * Size bounds for the FS_LOCATIONS request and reply, summed from the
+ * per-operation *_maxsz terms.
+ * NOTE(review): nfs4_xdr_enc_fs_locations() emits PUTFH + LOOKUP + GETATTR,
+ * but neither sum below contains a LOOKUP term - confirm whether the
+ * lookup sizes need to be added here.
+ */
+#define NFS4_enc_fs_locations_sz \
+                                (compound_encode_hdr_maxsz + \
+                                 encode_putfh_maxsz + \
+                                 encode_getattr_maxsz)
+#define NFS4_dec_fs_locations_sz \
+                                (compound_decode_hdr_maxsz + \
+                                 decode_putfh_maxsz + \
+                                 op_decode_hdr_maxsz + \
+                                 nfs4_fattr_bitmap_maxsz)
 static struct {
        unsigned int    mode;
@@ -722,6 +731,13 @@ static int encode_fsinfo(struct xdr_stream *xdr, const u32* bitmask)
                        bitmask[1] & nfs4_fsinfo_bitmap[1]);
 }
+/*
+ * Encode the caller's two-word attribute request through
+ * encode_getattr_two(), after masking each word with
+ * nfs4_fs_locations_bitmap.  Returns encode_getattr_two()'s result.
+ */
+static int encode_fs_locations(struct xdr_stream *xdr, const u32* bitmask)
+{
+        const u32 word0 = bitmask[0] & nfs4_fs_locations_bitmap[0];
+        const u32 word1 = bitmask[1] & nfs4_fs_locations_bitmap[1];
+        return encode_getattr_two(xdr, word0, word1);
+}
 static int encode_getfh(struct xdr_stream *xdr)
 {
        uint32_t *p;
@@ -2003,6 +2019,38 @@ out:
 }
 /*
+ * Encode FS_LOCATIONS request
+ */
+static int nfs4_xdr_enc_fs_locations(struct rpc_rqst *req, uint32_t *p, struct nfs4_fs_locations_arg *args)
+{
+        struct rpc_auth *auth = req->rq_task->tk_auth;
+        struct compound_hdr hdr = {
+                .nops = 3,
+        };
+        struct xdr_stream xdr;
+        int replen;
+        int status;
+        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+        encode_compound_hdr(&xdr, &hdr);
+        /* the three operations counted in hdr.nops: PUTFH, LOOKUP, GETATTR */
+        status = encode_putfh(&xdr, args->dir_fh);
+        if (status != 0)
+                return status;
+        status = encode_lookup(&xdr, args->name);
+        if (status != 0)
+                return status;
+        status = encode_fs_locations(&xdr, args->bitmask);
+        if (status != 0)
+                return status;
+        /* set up reply
+         *   toplevel_status + OP_PUTFH + status
+         *   + OP_LOOKUP + status + OP_GETATTR + status = 7
+         */
+        replen = (RPC_REPHDRSIZE + auth->au_rslack + 7) << 2;
+        /* everything past that header is received into the caller's page */
+        xdr_inline_pages(&req->rq_rcv_buf, replen, &args->page,
+                        0, PAGE_SIZE);
+        return status;
+}
+/*
 * START OF "GENERIC" DECODE ROUTINES.
 *   These may look a little ugly since they are imported from a "generic"
 * set of XDR encode/decode routines which are intended to be shared by
@@ -2036,7 +2084,7 @@ out:
        } \
 } while (0)
-static int decode_opaque_inline(struct xdr_stream *xdr, uint32_t *len, char **string)
+static int decode_opaque_inline(struct xdr_stream *xdr, unsigned int *len, char **string)
 {
        uint32_t *p;
@@ -2087,7 +2135,7 @@ static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected)
 static int decode_ace(struct xdr_stream *xdr, void *ace, struct nfs4_client *clp)
 {
        uint32_t *p;
-        uint32_t strlen;
+        unsigned int strlen;
        char *str;
        READ_BUF(12);
@@ -2217,7 +2265,7 @@ static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap,
        return 0;
 }
-static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs4_fsid *fsid)
+static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fsid *fsid)
 {
        uint32_t *p;
@@ -2285,6 +2333,22 @@ static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t
        return 0;
 }
+/*
+ * Decode the mounted_on_fileid attribute, if present in word 1 of the
+ * bitmap, into *fileid and clear its bit.  *fileid is 0 when the
+ * attribute is absent.  Returns 0, or -EIO if a lower-numbered bit of
+ * word 1 is still set.
+ */
+static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid)
+{
+        const uint32_t attr = FATTR4_WORD1_MOUNTED_ON_FILEID;
+        uint32_t *p;
+        *fileid = 0;
+        /* reject the reply if any bit below this attribute is still pending */
+        if (unlikely(bitmap[1] & (attr - 1U)))
+                return -EIO;
+        if (likely(bitmap[1] & attr)) {
+                READ_BUF(8);
+                READ64(*fileid);
+                bitmap[1] &= ~attr;
+        }
+        dprintk("%s: fileid=%Lu\n", __FUNCTION__, (unsigned long long)*fileid);
+        return 0;
+}
 static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
 {
        uint32_t *p;
@@ -2336,6 +2400,116 @@ static int decode_attr_files_total(struct xdr_stream *xdr, uint32_t *bitmap, uin
        return status;
 }
+/*
+ * Decode a pathname (a component count followed by that many opaque
+ * strings) into @path.  A count of zero is stored as a single empty
+ * component.  Component data pointers are whatever decode_opaque_inline()
+ * hands back; nothing here NUL-terminates them.
+ *
+ * Returns 0 on success, -EIO on a negative count, a count larger than
+ * NFS4_PATHNAME_MAXCOMPONENTS, or a component that fails to decode.
+ * (READ_BUF presumably also bails out on a short buffer - see its
+ * definition.)
+ */
+static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path)
+{
+        int n;
+        uint32_t *p;
+        int status = 0;
+        READ_BUF(4);
+        READ32(n);
+        if (n < 0)
+                goto out_eio;
+        if (n == 0)
+                goto root_path;
+        dprintk("path ");
+        /*
+         * Check the count before storing anything: testing it only after
+         * each component has been decoded lets component number
+         * NFS4_PATHNAME_MAXCOMPONENTS be written past the end of
+         * path->components[] (assumed to hold that many entries).
+         */
+        if (n > NFS4_PATHNAME_MAXCOMPONENTS) {
+                dprintk("cannot parse %d components in path\n", n);
+                goto out_eio;
+        }
+        for (path->ncomponents = 0; path->ncomponents < n; path->ncomponents++) {
+                struct nfs4_string *component = &path->components[path->ncomponents];
+                status = decode_opaque_inline(xdr, &component->len, &component->data);
+                if (unlikely(status != 0))
+                        goto out_eio;
+                /* inline opaque data carries a length, not a terminator */
+                dprintk("/%.*s", (int)component->len, component->data);
+        }
+out:
+        dprintk("\n");
+        return status;
+root_path:
+/* a root pathname is sent as a zero component4 */
+        path->ncomponents = 1;
+        path->components[0].len=0;
+        path->components[0].data=NULL;
+        dprintk("path /\n");
+        goto out;
+out_eio:
+        dprintk(" status %d", status);
+        status = -EIO;
+        goto out;
+}
+/*
+ * Decode the fs_locations attribute, if present in word 0 of the bitmap,
+ * into @res: the fs root pathname followed by a list of locations, each
+ * a list of server names plus a root pathname.
+ *
+ * At most NFS4_FS_LOCATION_MAXSERVERS servers are kept per location;
+ * surplus server names are decoded and discarded so the stream stays in
+ * step.  A reply with more than NFS4_FS_LOCATIONS_MAXENTRIES locations is
+ * rejected, since res->locations[] (assumed to hold that many entries)
+ * cannot store them.
+ *
+ * Returns 0 on success or when the attribute is absent, -EIO on a
+ * malformed or oversized reply, or decode_pathname()'s error for the
+ * fs root.
+ */
+static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs4_fs_locations *res)
+{
+        int n;
+        uint32_t *p;
+        int status = -EIO;
+        if (unlikely(bitmap[0] & (FATTR4_WORD0_FS_LOCATIONS -1U)))
+                goto out;
+        status = 0;
+        if (unlikely(!(bitmap[0] & FATTR4_WORD0_FS_LOCATIONS)))
+                goto out;
+        /* mark the attribute consumed, as the other attribute decoders do */
+        bitmap[0] &= ~FATTR4_WORD0_FS_LOCATIONS;
+        dprintk("%s: fsroot ", __FUNCTION__);
+        status = decode_pathname(xdr, &res->fs_path);
+        if (unlikely(status != 0))
+                goto out;
+        READ_BUF(4);
+        READ32(n);
+        if (n <= 0)
+                goto out_eio;
+        res->nlocations = 0;
+        while (res->nlocations < n) {
+                int m;
+                struct nfs4_fs_location *loc;
+                /* never form a pointer past the end of res->locations[] */
+                if (res->nlocations == NFS4_FS_LOCATIONS_MAXENTRIES) {
+                        dprintk("%s: cannot store %d locations\n", __FUNCTION__, n);
+                        goto out_eio;
+                }
+                loc = &res->locations[res->nlocations];
+                READ_BUF(4);
+                READ32(m);
+                if (m <= 0)
+                        goto out_eio;
+                loc->nservers = 0;
+                dprintk("%s: servers ", __FUNCTION__);
+                /* keep as many servers as loc->servers[] can hold */
+                while (loc->nservers < m && loc->nservers < NFS4_FS_LOCATION_MAXSERVERS) {
+                        struct nfs4_string *server = &loc->servers[loc->nservers];
+                        status = decode_opaque_inline(xdr, &server->len, &server->data);
+                        if (unlikely(status != 0))
+                                goto out_eio;
+                        /* inline opaque data carries a length, not a terminator */
+                        dprintk("%.*s ", (int)server->len, server->data);
+                        loc->nservers++;
+                }
+                if (loc->nservers < m) {
+                        int i;
+                        dprintk("%s: using first %d of %d servers returned for location %d\n", __FUNCTION__, NFS4_FS_LOCATION_MAXSERVERS, m, res->nlocations);
+                        /* consume the surplus names without storing them */
+                        for (i = loc->nservers; i < m; i++) {
+                                unsigned int len;
+                                char *data;
+                                status = decode_opaque_inline(xdr, &len, &data);
+                                if (unlikely(status != 0))
+                                        goto out_eio;
+                        }
+                }
+                status = decode_pathname(xdr, &loc->rootpath);
+                if (unlikely(status != 0))
+                        goto out_eio;
+                res->nlocations++;
+        }
+out:
+        dprintk("%s: fs_locations done, error = %d\n", __FUNCTION__, status);
+        return status;
+out_eio:
+        status = -EIO;
+        goto out;
+}
 static int decode_attr_maxfilesize(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
 {
        uint32_t *p;
@@ -2841,6 +3015,7 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, cons
                 bitmap[2] = {0},
                 type;
        int status, fmode = 0;
+        uint64_t fileid;
        if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
                goto xdr_error;
@@ -2863,10 +3038,14 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, cons
                goto xdr_error;
        if ((status = decode_attr_size(xdr, bitmap, &fattr->size)) != 0)
                goto xdr_error;
-        if ((status = decode_attr_fsid(xdr, bitmap, &fattr->fsid_u.nfs4)) != 0)
+        if ((status = decode_attr_fsid(xdr, bitmap, &fattr->fsid)) != 0)
                goto xdr_error;
        if ((status = decode_attr_fileid(xdr, bitmap, &fattr->fileid)) != 0)
                goto xdr_error;
+        if ((status = decode_attr_fs_locations(xdr, bitmap, container_of(fattr,
+                                                struct nfs4_fs_locations,
+                                                fattr))) != 0)
+                goto xdr_error;
        if ((status = decode_attr_mode(xdr, bitmap, &fattr->mode)) != 0)
                goto xdr_error;
        fattr->mode |= fmode;
@@ -2886,6 +3065,10 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, cons
                goto xdr_error;
        if ((status = decode_attr_time_modify(xdr, bitmap, &fattr->mtime)) != 0)
                goto xdr_error;
+        if ((status = decode_attr_mounted_on_fileid(xdr, bitmap, &fileid)) != 0)
+                goto xdr_error;
+        if (fattr->fileid == 0 && fileid != 0)
+                fattr->fileid = fileid;
        if ((status = verify_attr_len(xdr, savep, attrlen)) == 0)
                fattr->valid = NFS_ATTR_FATTR | NFS_ATTR_FATTR_V3 | NFS_ATTR_FATTR_V4;
 xdr_error:
@@ -3350,8 +3533,7 @@ static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req,
                                        attrlen, recvd);
                        return -EINVAL;
                }
-                if (attrlen <= *acl_len)
+                xdr_read_pages(xdr, attrlen);
-                        xdr_read_pages(xdr, attrlen);
                *acl_len = attrlen;
        } else
                status = -EOPNOTSUPP;
@@ -4211,6 +4393,29 @@ out:
        return status;
 }
+/*
+ * Decode an FS_LOCATIONS reply: compound header, PUTFH and LOOKUP
+ * results, then the attributes, which are decoded into res->fattr
+ * after switching the stream over to the page data.
+ */
+static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, uint32_t *p, struct nfs4_fs_locations *res)
+{
+        struct compound_hdr hdr;
+        struct xdr_stream xdr;
+        int status;
+        xdr_init_decode(&xdr, &req->rq_rcv_buf, p);
+        /* each step runs only if every earlier one succeeded */
+        status = decode_compound_hdr(&xdr, &hdr);
+        if (status == 0)
+                status = decode_putfh(&xdr);
+        if (status == 0)
+                status = decode_lookup(&xdr);
+        if (status != 0)
+                return status;
+        xdr_enter_page(&xdr, PAGE_SIZE);
+        return decode_getfattr(&xdr, &res->fattr, res->server);
+}
 uint32_t *nfs4_decode_dirent(uint32_t *p, struct nfs_entry *entry, int plus)
 {
        uint32_t bitmap[2] = {0};
@@ -4382,6 +4587,7 @@ struct rpc_procinfo	nfs4_procedures[] = {
  PROC(DELEGRETURN,     enc_delegreturn, dec_delegreturn),
  PROC(GETACL,          enc_getacl,     dec_getacl),
  PROC(SETACL,          enc_setacl,     dec_setacl),
+  PROC(FS_LOCATIONS,    enc_fs_locations, dec_fs_locations),
 };
 struct rpc_version              nfs_version4 = {
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 106aca388ebc..36e902a88ca1 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -9,7 +9,6 @@
 *
 */
-#include <linux/config.h>
 #include <linux/slab.h>
 #include <linux/file.h>
 #include <linux/sunrpc/clnt.h>
@@ -315,6 +314,7 @@ nfs_scan_lock_dirty(struct nfs_inode *nfsi, struct list_head *dst,
                                                req->wb_index, NFS_PAGE_TAG_DIRTY);
                                nfs_list_remove_request(req);
                                nfs_list_add_request(req, dst);
+                                dec_zone_page_state(req->wb_page, NR_FILE_DIRTY);
                                res++;
                        }
                }
@@ -325,6 +325,7 @@ out:
 /**
 * nfs_scan_list - Scan a list for matching requests
+ * @nfsi: NFS inode
 * @head: One of the NFS inode request lists
 * @dst: Destination list
 * @idx_start: lower bound of page->index to scan
@@ -336,14 +337,15 @@ out:
 * The requests are *not* checked to ensure that they form a contiguous set.
 * You must be holding the inode's req_lock when calling this function
 */
-int
+int nfs_scan_list(struct nfs_inode *nfsi, struct list_head *head,
-nfs_scan_list(struct list_head *head, struct list_head *dst,
+                struct list_head *dst, unsigned long idx_start,
-              unsigned long idx_start, unsigned int npages)
+                unsigned int npages)
 {
-        struct list_head        *pos, *tmp;
+        struct nfs_page *pgvec[NFS_SCAN_MAXENTRIES];
-        struct nfs_page         *req;
+        struct nfs_page *req;
-        unsigned long           idx_end;
+        unsigned long idx_end;
-        int                     res;
+        int found, i;
+        int res;
        res = 0;
        if (npages == 0)
@@ -351,25 +353,32 @@ nfs_scan_list(struct list_head *head, struct list_head *dst,
        else
                idx_end = idx_start + npages - 1;
-        list_for_each_safe(pos, tmp, head) {
+        for (;;) {
+                found = radix_tree_gang_lookup(&nfsi->nfs_page_tree,
-                req = nfs_list_entry(pos);
+                                (void **)&pgvec[0], idx_start,
+                                NFS_SCAN_MAXENTRIES);
-                if (req->wb_index < idx_start)
+                if (found <= 0)
-                        continue;
-                if (req->wb_index > idx_end)
                        break;
+                for (i = 0; i < found; i++) {
+                        req = pgvec[i];
+                        if (req->wb_index > idx_end)
+                                goto out;
+                        idx_start = req->wb_index + 1;
+                        if (req->wb_list_head != head)
+                                continue;
+                        if (nfs_set_page_writeback_locked(req)) {
+                                nfs_list_remove_request(req);
+                                nfs_list_add_request(req, dst);
+                                res++;
+                        }
+                }
-                if (!nfs_set_page_writeback_locked(req))
-                        continue;
-                nfs_list_remove_request(req);
-                nfs_list_add_request(req, dst);
-                res++;
        }
+out:
        return res;
 }
-int nfs_init_nfspagecache(void)
+int __init nfs_init_nfspagecache(void)
 {
        nfs_page_cachep = kmem_cache_create("nfs_page",
                                            sizeof(struct nfs_page),
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 9dd85cac2df0..b3899ea3229e 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -44,11 +44,10 @@
 #include <linux/nfs_page.h>
 #include <linux/lockd/bind.h>
 #include <linux/smp_lock.h>
+#include "internal.h"
 #define NFSDBG_FACILITY         NFSDBG_PROC
-extern struct rpc_procinfo nfs_procedures[];
 /*
 * Bare-bones access to getattr: this is for nfs_read_super.
 */
@@ -611,8 +610,6 @@ nfs_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
        return 0;
 }
-extern u32 * nfs_decode_dirent(u32 *, struct nfs_entry *, int);
 static int nfs_read_done(struct rpc_task *task, struct nfs_read_data *data)
 {
        if (task->tk_status >= 0) {
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 624ca7146b6b..52bf634260a1 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -15,7 +15,6 @@
 * within the RPC code when root squashing is suspected.
 */
-#include <linux/config.h>
 #include <linux/time.h>
 #include <linux/kernel.h>
 #include <linux/errno.h>
@@ -51,14 +50,11 @@ struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount)
        if (p) {
                memset(p, 0, sizeof(*p));
                INIT_LIST_HEAD(&p->pages);
-                if (pagecount < NFS_PAGEVEC_SIZE)
+                if (pagecount <= ARRAY_SIZE(p->page_array))
-                        p->pagevec = &p->page_array[0];
+                        p->pagevec = p->page_array;
                else {
-                        size_t size = ++pagecount * sizeof(struct page *);
+                        p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_NOFS);
-                        p->pagevec = kmalloc(size, GFP_NOFS);
+                        if (!p->pagevec) {
-                        if (p->pagevec) {
-                                memset(p->pagevec, 0, size);
-                        } else {
                                mempool_free(p, nfs_rdata_mempool);
                                p = NULL;
                        }
@@ -104,6 +100,28 @@ int nfs_return_empty_page(struct page *page)
        return 0;
 }
+/*
+ * After a read that hit EOF before filling the whole request, zero the
+ * part of the request (args.count - res.count bytes, starting right
+ * after the data that was returned) that the server did not fill.
+ * Does nothing unless res.eof is set and the read came up short.
+ */
+static void nfs_readpage_truncate_uninitialised_page(struct nfs_read_data *data)
+{
+        unsigned int remainder = data->args.count - data->res.count;
+        unsigned int base = data->args.pgbase + data->res.count;
+        unsigned int pglen;
+        struct page **pages;
+        if (data->res.eof == 0 || remainder == 0)
+                return;
+        /*
+         * Note: "remainder" can never be negative, since we check for
+         *      this in the XDR code.
+         */
+        pages = &data->args.pages[base >> PAGE_CACHE_SHIFT];
+        base &= ~PAGE_CACHE_MASK;
+        pglen = PAGE_CACHE_SIZE - base;
+        /*
+         * The unfilled tail may span several pages: clear the rest of the
+         * first one, then every following page up to the end of the request.
+         */
+        for (;;) {
+                if (remainder <= pglen) {
+                        memclear_highpage_flush(*pages, base, remainder);
+                        break;
+                }
+                memclear_highpage_flush(*pages, base, pglen);
+                pages++;
+                remainder -= pglen;
+                pglen = PAGE_CACHE_SIZE;
+                base = 0;
+        }
+}
 /*
 * Read a page synchronously.
 */
@@ -177,11 +195,9 @@ static int nfs_readpage_sync(struct nfs_open_context *ctx, struct inode *inode,
        NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATIME;
        spin_unlock(&inode->i_lock);
-        if (count)
+        nfs_readpage_truncate_uninitialised_page(rdata);
-                memclear_highpage_flush(page, rdata->args.pgbase, count);
+        if (rdata->res.eof || rdata->res.count == rdata->args.count)
-        SetPageUptodate(page);
+                SetPageUptodate(page);
-        if (PageError(page))
-                ClearPageError(page);
        result = 0;
 io_error:
@@ -436,20 +452,12 @@ static void nfs_readpage_result_partial(struct rpc_task *task, void *calldata)
        struct nfs_page *req = data->req;
        struct page *page = req->wb_page;
 
+        if (likely(task->tk_status >= 0))
+                nfs_readpage_truncate_uninitialised_page(data);
+        else
+                SetPageError(page);
        if (nfs_readpage_result(task, data) != 0)
                return;
-        if (task->tk_status >= 0) {
-                unsigned int request = data->args.count;
-                unsigned int result = data->res.count;
-                if (result < request) {
-                        memclear_highpage_flush(page,
-                                                data->args.pgbase + result,
-                                                request - result);
-                }
-        } else
-                SetPageError(page);
        if (atomic_dec_and_test(&req->wb_complete)) {
                if (!PageError(page))
                        SetPageUptodate(page);
@@ -462,6 +470,40 @@ static const struct rpc_call_ops nfs_read_partial_ops = {
        .rpc_release = nfs_readdata_release,
 };
+/*
+ * Mark as uptodate every page that the res.count bytes of returned data
+ * cover completely.  A trailing, partly covered page is marked only if
+ * the read reached EOF or returned everything that was requested.
+ */
+static void nfs_readpage_set_pages_uptodate(struct nfs_read_data *data)
+{
+        unsigned int offset = data->args.pgbase;
+        unsigned int left = data->res.count;
+        struct page **pagep;
+        if (unlikely(left == 0))
+                return;
+        pagep = &data->args.pages[offset >> PAGE_CACHE_SHIFT];
+        /* measure from the start of the first page touched */
+        left += offset & ~PAGE_CACHE_MASK;
+        while (left >= PAGE_CACHE_SIZE) {
+                SetPageUptodate(*pagep);
+                pagep++;
+                left -= PAGE_CACHE_SIZE;
+        }
+        if (left == 0)
+                return;
+        /*
+         * Was this an eof or a short read? If the latter, don't mark the page
+         * as uptodate yet.
+         */
+        if (data->res.eof || data->args.count == data->res.count)
+                SetPageUptodate(*pagep);
+}
+/*
+ * Flag every page spanned by the request (args.count bytes starting at
+ * args.pgbase) with SetPageError(), including a final page that the
+ * request covers only partly.
+ */
+static void nfs_readpage_set_pages_error(struct nfs_read_data *data)
+{
+        unsigned int count = data->args.count;
+        unsigned int base = data->args.pgbase;
+        struct page **pages;
+        pages = &data->args.pages[base >> PAGE_CACHE_SHIFT];
+        base &= ~PAGE_CACHE_MASK;
+        count += base;
+        for (;count >= PAGE_CACHE_SIZE; count -= PAGE_CACHE_SIZE, pages++)
+                SetPageError(*pages);
+        /*
+         * A request that ends inside a page (or never fills one) leaves a
+         * partial page that failed just like the others.
+         */
+        if (count != 0)
+                SetPageError(*pages);
+}
 /*
 * This is the callback from RPC telling us whether a reply was
 * received or some error occurred (timeout or socket shutdown).
@@ -469,27 +511,24 @@ static const struct rpc_call_ops nfs_read_partial_ops = {
 static void nfs_readpage_result_full(struct rpc_task *task, void *calldata)
 {
        struct nfs_read_data *data = calldata;
-        unsigned int count = data->res.count;
+        /*
+         * Note: nfs_readpage_result may change the values of
+         * data->args. In the multi-page case, we therefore need
+         * to ensure that we call nfs_readpage_set_pages_uptodate()
+         * (or nfs_readpage_set_pages_error()) before it.
+         */
+        if (likely(task->tk_status >= 0)) {
+                nfs_readpage_truncate_uninitialised_page(data);
+                nfs_readpage_set_pages_uptodate(data);
+        } else
+                nfs_readpage_set_pages_error(data);
        if (nfs_readpage_result(task, data) != 0)
                return;
        while (!list_empty(&data->pages)) {
                struct nfs_page *req = nfs_list_entry(data->pages.next);
-                struct page *page = req->wb_page;
-                nfs_list_remove_request(req);
-                if (task->tk_status >= 0) {
+                nfs_list_remove_request(req);
-                        if (count < PAGE_CACHE_SIZE) {
-                                if (count < req->wb_bytes)
-                                        memclear_highpage_flush(page,
-                                                        req->wb_pgbase + count,
-                                                        req->wb_bytes - count);
-                                count = 0;
-                        } else
-                                count -= PAGE_CACHE_SIZE;
-                        SetPageUptodate(page);
-                } else
-                        SetPageError(page);
                nfs_readpage_release(req);
        }
 }
@@ -654,7 +693,7 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
        return ret;
 }
-int nfs_init_readpagecache(void)
+int __init nfs_init_readpagecache(void)
 {
        nfs_rdata_cachep = kmem_cache_create("nfs_read_data",
                                             sizeof(struct nfs_read_data),
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
new file mode 100644
index 000000000000..e8a9bee74d9d
--- /dev/null
+++ b/fs/nfs/super.c
@@ -0,0 +1,1537 @@
+/*
+ *  linux/fs/nfs/super.c
+ *
+ *  Copyright (C) 1992  Rick Sladkey
+ *
+ *  nfs superblock handling functions
+ *
+ *  Modularised by Alan Cox <Alan.Cox@linux.org>, while hacking some
+ *  experimental NFS changes. Modularisation taken straight from SYS5 fs.
+ *
+ *  Change to nfs_read_super() to permit NFS mounts to multi-homed hosts.
+ *  J.S.Peatfield@damtp.cam.ac.uk
+ *
+ *  Split from inode.c by David Howells <dhowells@redhat.com>
+ *
+ */
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/time.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/unistd.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/stats.h>
+#include <linux/sunrpc/metrics.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_mount.h>
+#include <linux/nfs4_mount.h>
+#include <linux/lockd/bind.h>
+#include <linux/smp_lock.h>
+#include <linux/seq_file.h>
+#include <linux/mount.h>
+#include <linux/nfs_idmap.h>
+#include <linux/vfs.h>
+#include <linux/inet.h>
+#include <linux/nfs_xdr.h>
+#include <asm/system.h>
+#include <asm/uaccess.h>
+#include "nfs4_fs.h"
+#include "callback.h"
+#include "delegation.h"
+#include "iostat.h"
+#include "internal.h"
+#define NFSDBG_FACILITY         NFSDBG_VFS
+/* Maximum number of readahead requests
+ * FIXME: this should really be a sysctl so that users may tune it to suit
+ *        their needs. People that do NFS over a slow network, might for
+ *        instance want to reduce it to something closer to 1 for improved
+ *        interactive response.
+ */
+#define NFS_MAX_READAHEAD       (RPC_DEF_SLOT_TABLE - 1)
+/*
+ * RPC cruft for NFS
+ *
+ * nfs_version[] lists the RPC version descriptors offered by the NFS
+ * program.  Each descriptor sits at the index equal to its protocol
+ * version number, so slots 0 and 1 are NULL.  When NFSv4 is configured
+ * without NFSv3, a NULL placeholder keeps nfs_version4 in slot 4.
+ */
+static struct rpc_version * nfs_version[] = {
+        NULL,
+        NULL,
+        &nfs_version2,
+#if defined(CONFIG_NFS_V3)
+        &nfs_version3,
+#elif defined(CONFIG_NFS_V4)
+        NULL,
+#endif
+#if defined(CONFIG_NFS_V4)
+        &nfs_version4,
+#endif
+};
+static struct rpc_program nfs_program = {       /* RPC program descriptor passed to rpc_create_client() */
+        .name                   = "nfs",
+        .number                 = NFS_PROGRAM,
+        .nrvers                 = ARRAY_SIZE(nfs_version),     /* counts the NULL slots as well */
+        .version                = nfs_version,
+        .stats                  = &nfs_rpcstat,
+        .pipe_dir_name          = "/nfs",
+};
+struct rpc_stat nfs_rpcstat = {                 /* non-static; points back at nfs_program */
+        .program                = &nfs_program
+};
+#ifdef CONFIG_NFS_V3_ACL
+static struct rpc_stat          nfsacl_rpcstat = { &nfsacl_program };  /* stats record for the ACL side-band program */
+static struct rpc_version *     nfsacl_version[] = {                   /* only version 3 is populated */
+        [3]                     = &nfsacl_version3,
+};
+struct rpc_program              nfsacl_program = {     /* non-static RPC program descriptor for NFS ACLs */
+        .name =                 "nfsacl",
+        .number =               NFS_ACL_PROGRAM,
+        .nrvers =               ARRAY_SIZE(nfsacl_version),    /* 4: indices 0..3 */
+        .version =              nfsacl_version,
+        .stats =                &nfsacl_rpcstat,
+};
+#endif  /* CONFIG_NFS_V3_ACL */
+static void nfs_umount_begin(struct vfsmount *, int);
+static int  nfs_statfs(struct dentry *, struct kstatfs *);
+static int  nfs_show_options(struct seq_file *, struct vfsmount *);
+static int  nfs_show_stats(struct seq_file *, struct vfsmount *);
+static int nfs_get_sb(struct file_system_type *, int, const char *, void *, struct vfsmount *);
+static int nfs_clone_nfs_sb(struct file_system_type *fs_type,
+                int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
+static void nfs_kill_super(struct super_block *);
+static struct file_system_type nfs_fs_type = {         /* the "nfs" type registered by register_nfs_fs() */
+        .owner          = THIS_MODULE,
+        .name           = "nfs",
+        .get_sb         = nfs_get_sb,
+        .kill_sb        = nfs_kill_super,
+        .fs_flags       = FS_ODD_RENAME|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
+};
+struct file_system_type clone_nfs_fs_type = {          /* non-static variant of the "nfs" type */
+        .owner          = THIS_MODULE,
+        .name           = "nfs",
+        .get_sb         = nfs_clone_nfs_sb,             /* differs from nfs_fs_type only in this hook */
+        .kill_sb        = nfs_kill_super,
+        .fs_flags       = FS_ODD_RENAME|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
+};
+static struct super_operations nfs_sops = {            /* installed as sb->s_op by nfs_fill_super() */
+        .alloc_inode    = nfs_alloc_inode,
+        .destroy_inode  = nfs_destroy_inode,
+        .write_inode    = nfs_write_inode,
+        .statfs         = nfs_statfs,
+        .clear_inode    = nfs_clear_inode,
+        .umount_begin   = nfs_umount_begin,
+        .show_options   = nfs_show_options,
+        .show_stats     = nfs_show_stats,
+};
+#ifdef CONFIG_NFS_V4
+static int nfs4_get_sb(struct file_system_type *fs_type,
+        int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
+static int nfs_clone_nfs4_sb(struct file_system_type *fs_type,
+                int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
+static int nfs_referral_nfs4_sb(struct file_system_type *fs_type,
+                int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
+static void nfs4_kill_super(struct super_block *sb);
+static struct file_system_type nfs4_fs_type = {        /* the "nfs4" type registered by register_nfs_fs() */
+        .owner          = THIS_MODULE,
+        .name           = "nfs4",
+        .get_sb         = nfs4_get_sb,
+        .kill_sb        = nfs4_kill_super,
+        .fs_flags       = FS_ODD_RENAME|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
+};
+struct file_system_type clone_nfs4_fs_type = {         /* non-static; get_sb is the clone entry point */
+        .owner          = THIS_MODULE,
+        .name           = "nfs4",
+        .get_sb         = nfs_clone_nfs4_sb,
+        .kill_sb        = nfs4_kill_super,
+        .fs_flags       = FS_ODD_RENAME|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
+};
+struct file_system_type nfs_referral_nfs4_fs_type = {  /* non-static; get_sb is the referral entry point */
+        .owner          = THIS_MODULE,
+        .name           = "nfs4",
+        .get_sb         = nfs_referral_nfs4_sb,
+        .kill_sb        = nfs4_kill_super,
+        .fs_flags       = FS_ODD_RENAME|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
+};
+static struct super_operations nfs4_sops = {           /* same hooks as nfs_sops except clear_inode */
+        .alloc_inode    = nfs_alloc_inode,
+        .destroy_inode  = nfs_destroy_inode,
+        .write_inode    = nfs_write_inode,
+        .statfs         = nfs_statfs,
+        .clear_inode    = nfs4_clear_inode,
+        .umount_begin   = nfs_umount_begin,
+        .show_options   = nfs_show_options,
+        .show_stats     = nfs_show_stats,
+};
+#endif
+#ifdef CONFIG_NFS_V4
+static const int nfs_set_port_min = 0;
+static const int nfs_set_port_max = 65535;
+/*
+ * Module parameter setter for "callback_tcpport".
+ *
+ * Parses @val with simple_strtol() (base 0) and stores the result in the
+ * int that @kp->arg points to.  Returns 0 on success, or -EINVAL if @val
+ * is empty, has trailing characters, or lies outside
+ * [nfs_set_port_min, nfs_set_port_max].
+ */
+static int param_set_port(const char *val, struct kernel_param *kp)
+{
+        char *endp;
+        /*
+         * Keep the full long: narrowing to int before the range check
+         * would let an out-of-range value alias into the valid port range.
+         */
+        long num = simple_strtol(val, &endp, 0);
+        if (endp == val || *endp || num < nfs_set_port_min || num > nfs_set_port_max)
+                return -EINVAL;
+        *((int *)kp->arg) = (int)num;
+        return 0;
+}
+module_param_call(callback_tcpport, param_set_port, param_get_int,
+                 &nfs_callback_set_tcpport, 0644);
+#endif
+#ifdef CONFIG_NFS_V4
+/*
+ * Module parameter setter for "idmap_cache_timeout".
+ *
+ * @val is a non-negative number of seconds.  It is converted to jiffies
+ * (seconds * HZ) and stored in the int that @kp->arg points to.  Returns
+ * 0 on success, or -EINVAL if @val is empty, has trailing characters, is
+ * negative, or is so large that the jiffies value would not fit in an int.
+ */
+static int param_set_idmap_timeout(const char *val, struct kernel_param *kp)
+{
+        char *endp;
+        long num = simple_strtol(val, &endp, 0);
+        /*
+         * Bound the input before multiplying: comparing the product with
+         * num afterwards misses overflows that wrap to a larger value.
+         */
+        if (endp == val || *endp || num < 0 || num > INT_MAX / HZ)
+                return -EINVAL;
+        *((int *)kp->arg) = (int)num * HZ;
+        return 0;
+}
+module_param_call(idmap_cache_timeout, param_set_idmap_timeout, param_get_int,
+                 &nfs_idmap_cache_timeout, 0644);
+#endif
+/*
+ * Register the NFS filesystems
+ *
+ * Registers the "nfs" filesystem type and, when CONFIG_NFS_V4 is set, the
+ * NFS sysctls followed by the "nfs4" filesystem type.  If a later step
+ * fails, the earlier ones are undone in reverse order.
+ *
+ * Returns 0 on success or the negative value returned by the failing step.
+ */
+int __init register_nfs_fs(void)
+{
+        int ret;
+        ret = register_filesystem(&nfs_fs_type);
+        if (ret < 0)
+                goto error_0;
+#ifdef CONFIG_NFS_V4
+        ret = nfs_register_sysctl();
+        if (ret < 0)
+                goto error_1;
+        ret = register_filesystem(&nfs4_fs_type);
+        if (ret < 0)
+                goto error_2;
+#endif
+        return 0;
+#ifdef CONFIG_NFS_V4
+error_2:        /* nfs4 registration failed: drop the sysctls, then fall through */
+        nfs_unregister_sysctl();
+error_1:        /* sysctl registration failed: drop the "nfs" type */
+        unregister_filesystem(&nfs_fs_type);
+#endif
+error_0:
+        return ret;
+}
+/*
+ * Unregister the NFS filesystems
+ *
+ * Undoes register_nfs_fs() in reverse order: the "nfs4" type and the
+ * sysctls first (CONFIG_NFS_V4 only), then the "nfs" type.
+ */
+void __exit unregister_nfs_fs(void)
+{
+#ifdef CONFIG_NFS_V4
+        unregister_filesystem(&nfs4_fs_type);
+        nfs_unregister_sysctl();
+#endif
+        unregister_filesystem(&nfs_fs_type);
+}
+/*
+ * Deliver file system statistics to userspace
+ *
+ * Issues the per-version ->statfs() RPC op against the file handle of the
+ * superblock's root inode and converts the byte counts in the reply into
+ * units of the superblock block size, rounding up.
+ *
+ * Always returns 0.  If the RPC fails, the error is only logged through
+ * dprintk() and f_bsize, f_blocks, f_bfree and f_bavail are set to -1.
+ */
+static int nfs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+        struct super_block *sb = dentry->d_sb;
+        struct nfs_server *server = NFS_SB(sb);
+        unsigned char blockbits;
+        unsigned long blockres;
+        struct nfs_fh *rootfh = NFS_FH(sb->s_root->d_inode);
+        struct nfs_fattr fattr;
+        struct nfs_fsstat res = {
+                        .fattr = &fattr,
+        };
+        int error;
+        lock_kernel();  /* held across the RPC; released at the out: label */
+        error = server->rpc_ops->statfs(server, rootfh, &res);
+        buf->f_type = NFS_SUPER_MAGIC;  /* filled in even when the RPC failed */
+        if (error < 0)
+                goto out_err;
+        /*
+         * Current versions of glibc do not correctly handle the
+         * case where f_frsize != f_bsize.  Eventually we want to
+         * report the value of wtmult in this field.
+         */
+        buf->f_frsize = sb->s_blocksize;
+        /*
+         * On most *nix systems, f_blocks, f_bfree, and f_bavail
+         * are reported in units of f_frsize.  Linux hasn't had
+         * an f_frsize field in its statfs struct until recently,
+         * thus historically Linux's sys_statfs reports these
+         * fields in units of f_bsize.
+         */
+        buf->f_bsize = sb->s_blocksize;
+        blockbits = sb->s_blocksize_bits;
+        blockres = (1 << blockbits) - 1;        /* added before shifting so the division rounds up */
+        buf->f_blocks = (res.tbytes + blockres) >> blockbits;
+        buf->f_bfree = (res.fbytes + blockres) >> blockbits;
+        buf->f_bavail = (res.abytes + blockres) >> blockbits;
+        buf->f_files = res.tfiles;
+        buf->f_ffree = res.afiles;
+        buf->f_namelen = server->namelen;
+ out:
+        unlock_kernel();
+        return 0;
+ out_err:
+        dprintk("%s: statfs error = %d\n", __FUNCTION__, -error);
+        buf->f_bsize = buf->f_blocks = buf->f_bfree = buf->f_bavail = -1;
+        goto out;       /* jump back to share the unlock and return */
+}
+/*
+ * Translate an RPC security pseudoflavour into the name printed after
+ * "sec=" in the mount options.  Flavours missing from the table yield
+ * "unknown".
+ */
+static const char *nfs_pseudoflavour_to_name(rpc_authflavor_t flavour)
+{
+        static struct {
+                rpc_authflavor_t flavour;
+                const char *str;
+        } flavour_names[] = {
+                { RPC_AUTH_NULL, "null" },
+                { RPC_AUTH_UNIX, "sys" },
+                { RPC_AUTH_GSS_KRB5, "krb5" },
+                { RPC_AUTH_GSS_KRB5I, "krb5i" },
+                { RPC_AUTH_GSS_KRB5P, "krb5p" },
+                { RPC_AUTH_GSS_LKEY, "lkey" },
+                { RPC_AUTH_GSS_LKEYI, "lkeyi" },
+                { RPC_AUTH_GSS_LKEYP, "lkeyp" },
+                { RPC_AUTH_GSS_SPKM, "spkm" },
+                { RPC_AUTH_GSS_SPKMI, "spkmi" },
+                { RPC_AUTH_GSS_SPKMP, "spkmp" },
+                { -1, "unknown" }
+        };
+        int idx = 0;
+        /* Advance until a match or the { -1, "unknown" } terminator. */
+        while (flavour_names[idx].flavour != -1 &&
+               flavour_names[idx].flavour != flavour)
+                idx++;
+        return flavour_names[idx].str;
+}
+/*
+ * Describe the mount options in force on this server representation
+ *
+ * Appends a comma-led option list to @m.  vers, rsize, wsize, the flag
+ * options, proto, timeo, retrans and sec are always printed.  The four
+ * attribute-cache timeouts are printed only when they differ from 3, 60,
+ * 30 and 60 seconds respectively, unless @showdefaults is non-zero, in
+ * which case they are printed regardless.
+ */
+static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss, int showdefaults)
+{
+        static struct proc_nfs_info {
+                int flag;
+                char *str;      /* printed when the flag is set */
+                char *nostr;    /* printed when the flag is clear */
+        } nfs_info[] = {
+                { NFS_MOUNT_SOFT, ",soft", ",hard" },
+                { NFS_MOUNT_INTR, ",intr", "" },
+                { NFS_MOUNT_NOCTO, ",nocto", "" },
+                { NFS_MOUNT_NOAC, ",noac", "" },
+                { NFS_MOUNT_NONLM, ",nolock", "" },
+                { NFS_MOUNT_NOACL, ",noacl", "" },
+                { 0, NULL, NULL }
+        };
+        struct proc_nfs_info *nfs_infop;
+        char buf[12];   /* holds the decimal protocol number for unknown transports */
+        char *proto;
+        seq_printf(m, ",vers=%d", nfss->rpc_ops->version);
+        seq_printf(m, ",rsize=%d", nfss->rsize);
+        seq_printf(m, ",wsize=%d", nfss->wsize);
+        if (nfss->acregmin != 3*HZ || showdefaults)
+                seq_printf(m, ",acregmin=%d", nfss->acregmin/HZ);
+        if (nfss->acregmax != 60*HZ || showdefaults)
+                seq_printf(m, ",acregmax=%d", nfss->acregmax/HZ);
+        if (nfss->acdirmin != 30*HZ || showdefaults)
+                seq_printf(m, ",acdirmin=%d", nfss->acdirmin/HZ);
+        if (nfss->acdirmax != 60*HZ || showdefaults)
+                seq_printf(m, ",acdirmax=%d", nfss->acdirmax/HZ);
+        for (nfs_infop = nfs_info; nfs_infop->flag; nfs_infop++) {     /* stops at the zero-flag terminator */
+                if (nfss->flags & nfs_infop->flag)
+                        seq_puts(m, nfs_infop->str);
+                else
+                        seq_puts(m, nfs_infop->nostr);
+        }
+        switch (nfss->client->cl_xprt->prot) {
+                case IPPROTO_TCP:
+                        proto = "tcp";
+                        break;
+                case IPPROTO_UDP:
+                        proto = "udp";
+                        break;
+                default:
+                        snprintf(buf, sizeof(buf), "%u", nfss->client->cl_xprt->prot);
+                        proto = buf;
+        }
+        seq_printf(m, ",proto=%s", proto);
+        seq_printf(m, ",timeo=%lu", 10U * nfss->retrans_timeo / HZ);   /* jiffies back to tenths of a second */
+        seq_printf(m, ",retrans=%u", nfss->retrans_count);
+        seq_printf(m, ",sec=%s", nfs_pseudoflavour_to_name(nfss->client->cl_auth->au_flavor));
+}
+/*
+ * Emit the option string for one VFS mountpoint: the NFS mount options
+ * (attribute-cache defaults suppressed) followed by ",addr=" and the
+ * escaped server host name.  Always returns 0.
+ */
+static int nfs_show_options(struct seq_file *m, struct vfsmount *mnt)
+{
+        struct nfs_server *server = NFS_SB(mnt->mnt_sb);
+        nfs_show_mount_options(m, server, 0);
+        seq_puts(m, ",addr=");
+        /* whitespace and backslashes in the host name are escaped */
+        seq_escape(m, server->hostname, " \t\n\\");
+        return 0;
+}
+/*
+ * Present statistical information for this VFS mountpoint
+ *
+ * Writes, in order: the stats format version, the full mount option list
+ * (defaults included), the mount age in seconds, the server capability
+ * values, the NFSv4 attribute bitmasks (v4 mounts only), the security
+ * flavor, the event and byte counters summed over all possible CPUs, and
+ * finally the RPC client's own statistics.  Always returns 0.
+ */
+static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt)
+{
+        int i, cpu;
+        struct nfs_server *nfss = NFS_SB(mnt->mnt_sb);
+        struct rpc_auth *auth = nfss->client->cl_auth;
+        struct nfs_iostats totals = { };        /* zeroed accumulator for the per-cpu sums */
+        seq_printf(m, "statvers=%s", NFS_IOSTAT_VERS);
+        /*
+         * Display all mount option settings
+         */
+        seq_printf(m, "\n\topts:\t");
+        seq_puts(m, mnt->mnt_sb->s_flags & MS_RDONLY ? "ro" : "rw");
+        seq_puts(m, mnt->mnt_sb->s_flags & MS_SYNCHRONOUS ? ",sync" : "");
+        seq_puts(m, mnt->mnt_sb->s_flags & MS_NOATIME ? ",noatime" : "");
+        seq_puts(m, mnt->mnt_sb->s_flags & MS_NODIRATIME ? ",nodiratime" : "");
+        nfs_show_mount_options(m, nfss, 1);
+        seq_printf(m, "\n\tage:\t%lu", (jiffies - nfss->mount_time) / HZ);     /* mount_time is stored in jiffies */
+        seq_printf(m, "\n\tcaps:\t");
+        seq_printf(m, "caps=0x%x", nfss->caps);
+        seq_printf(m, ",wtmult=%d", nfss->wtmult);
+        seq_printf(m, ",dtsize=%d", nfss->dtsize);
+        seq_printf(m, ",bsize=%d", nfss->bsize);
+        seq_printf(m, ",namelen=%d", nfss->namelen);
+#ifdef CONFIG_NFS_V4
+        if (nfss->rpc_ops->version == 4) {
+                seq_printf(m, "\n\tnfsv4:\t");
+                seq_printf(m, "bm0=0x%x", nfss->attr_bitmask[0]);
+                seq_printf(m, ",bm1=0x%x", nfss->attr_bitmask[1]);
+                seq_printf(m, ",acl=0x%x", nfss->acl_bitmask);
+        }
+#endif
+        /*
+         * Display security flavor in effect for this mount
+         */
+        seq_printf(m, "\n\tsec:\tflavor=%d", auth->au_ops->au_flavor);
+        if (auth->au_flavor)
+                seq_printf(m, ",pseudoflavor=%d", auth->au_flavor);
+        /*
+         * Display superblock I/O counters
+         */
+        for_each_possible_cpu(cpu) {
+                struct nfs_iostats *stats;
+                preempt_disable();
+                stats = per_cpu_ptr(nfss->io_stats, cpu);
+                for (i = 0; i < __NFSIOS_COUNTSMAX; i++)
+                        totals.events[i] += stats->events[i];
+                for (i = 0; i < __NFSIOS_BYTESMAX; i++)
+                        totals.bytes[i] += stats->bytes[i];
+                preempt_enable();
+        }
+        seq_printf(m, "\n\tevents:\t");
+        for (i = 0; i < __NFSIOS_COUNTSMAX; i++)
+                seq_printf(m, "%lu ", totals.events[i]);
+        seq_printf(m, "\n\tbytes:\t");
+        for (i = 0; i < __NFSIOS_BYTESMAX; i++)
+                seq_printf(m, "%Lu ", totals.bytes[i]);
+        seq_printf(m, "\n");
+        rpc_print_iostats(m, nfss->client);
+        return 0;
+}
+/*
+ * Start of unmount: shrink the automounted submounts that were added in
+ * response to traversals and, for a forced unmount (MNT_FORCE), kill all
+ * RPC tasks on the main and ACL clients.
+ */
+static void nfs_umount_begin(struct vfsmount *vfsmnt, int flags)
+{
+        struct nfs_server *server;
+        struct rpc_clnt *clnt;
+        shrink_submounts(vfsmnt, &nfs_automount_list);
+        if (flags & MNT_FORCE) {
+                /* -EIO all pending I/O */
+                server = NFS_SB(vfsmnt->mnt_sb);
+                clnt = server->client;
+                if (!IS_ERR(clnt))
+                        rpc_killall_tasks(clnt);
+                clnt = server->client_acl;
+                if (!IS_ERR(clnt))
+                        rpc_killall_tasks(clnt);
+        }
+}
+/*
+ * Look up the root of the exported file system.
+ *
+ * Calls the per-version ->getroot() op; on success records the fsid from
+ * the returned attributes in the server record and returns the inode
+ * obtained from nfs_fhget().  On failure returns ERR_PTR() of the
+ * negative error reported by ->getroot().
+ */
+static struct inode *
+nfs_get_root(struct super_block *sb, struct nfs_fh *rootfh, struct nfs_fsinfo *fsinfo)
+{
+        struct nfs_server *server = NFS_SB(sb);
+        int status = server->rpc_ops->getroot(server, rootfh, fsinfo);
+        if (status >= 0) {
+                server->fsid = fsinfo->fattr->fsid;
+                return nfs_fhget(sb, rootfh, fsinfo->fattr);
+        }
+        dprintk("nfs_get_root: getattr error = %d\n", -status);
+        return ERR_PTR(status);
+}
+/*
+ * Do NFS version-independent mount processing, and sanity checking
+ *
+ * Allocates the I/O statistics, obtains the root inode and dentry, then
+ * derives rsize/wsize/dtsize, the page counts, the block size and the
+ * readahead window from the server's fsinfo reply, clamped by the RPC
+ * payload limit and NFS_MAX_FILE_IO_SIZE.
+ *
+ * Returns 0 on success, -ENOMEM if the statistics or root dentry cannot
+ * be allocated, or the error from nfs_get_root().  @authflavor is not
+ * referenced in this function body.
+ */
+static int
+nfs_sb_init(struct super_block *sb, rpc_authflavor_t authflavor)
+{
+        struct nfs_server       *server;
+        struct inode            *root_inode;
+        struct nfs_fattr        fattr;
+        struct nfs_fsinfo       fsinfo = {
+                                        .fattr = &fattr,
+                                };
+        struct nfs_pathconf pathinfo = {
+                        .fattr = &fattr,
+        };
+        int no_root_error = 0;
+        unsigned long max_rpc_payload;
+        /* We probably want something more informative here */
+        snprintf(sb->s_id, sizeof(sb->s_id), "%x:%x", MAJOR(sb->s_dev), MINOR(sb->s_dev));
+        server = NFS_SB(sb);
+        sb->s_magic      = NFS_SUPER_MAGIC;
+        server->io_stats = nfs_alloc_iostats();
+        if (server->io_stats == NULL)
+                return -ENOMEM;
+        root_inode = nfs_get_root(sb, &server->fh, &fsinfo);
+        /* Did getting the root inode fail? */
+        if (IS_ERR(root_inode)) {
+                no_root_error = PTR_ERR(root_inode);
+                goto out_no_root;
+        }
+        sb->s_root = d_alloc_root(root_inode);
+        if (!sb->s_root) {
+                no_root_error = -ENOMEM;
+                goto out_no_root;
+        }
+        sb->s_root->d_op = server->rpc_ops->dentry_ops;
+        /* mount time stamp, in jiffies */
+        server->mount_time = jiffies;
+        /* Get some general file system info */
+        if (server->namelen == 0 &&
+            server->rpc_ops->pathconf(server, &server->fh, &pathinfo) >= 0)
+                server->namelen = pathinfo.max_namelen;
+        /* Work out a lot of parameters */
+        if (server->rsize == 0)         /* not set at mount time: use the server's preferred size */
+                server->rsize = nfs_block_size(fsinfo.rtpref, NULL);
+        if (server->wsize == 0)
+                server->wsize = nfs_block_size(fsinfo.wtpref, NULL);
+        if (fsinfo.rtmax >= 512 && server->rsize > fsinfo.rtmax)
+                server->rsize = nfs_block_size(fsinfo.rtmax, NULL);
+        if (fsinfo.wtmax >= 512 && server->wsize > fsinfo.wtmax)
+                server->wsize = nfs_block_size(fsinfo.wtmax, NULL);
+        max_rpc_payload = nfs_block_size(rpc_max_payload(server->client), NULL);
+        if (server->rsize > max_rpc_payload)
+                server->rsize = max_rpc_payload;
+        if (server->rsize > NFS_MAX_FILE_IO_SIZE)
+                server->rsize = NFS_MAX_FILE_IO_SIZE;
+        server->rpages = (server->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;    /* rsize in pages, rounded up */
+        if (server->wsize > max_rpc_payload)
+                server->wsize = max_rpc_payload;
+        if (server->wsize > NFS_MAX_FILE_IO_SIZE)
+                server->wsize = NFS_MAX_FILE_IO_SIZE;
+        server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;    /* wsize in pages, rounded up */
+        if (sb->s_blocksize == 0)       /* no block size chosen yet: derive it from wsize */
+                sb->s_blocksize = nfs_block_bits(server->wsize,
+                                                         &sb->s_blocksize_bits);
+        server->wtmult = nfs_block_bits(fsinfo.wtmult, NULL);
+        server->dtsize = nfs_block_size(fsinfo.dtpref, NULL);
+        if (server->dtsize > PAGE_CACHE_SIZE)
+                server->dtsize = PAGE_CACHE_SIZE;
+        if (server->dtsize > server->rsize)
+                server->dtsize = server->rsize;
+        if (server->flags & NFS_MOUNT_NOAC) {   /* "noac": zero all attribute-cache timeouts, mount synchronous */
+                server->acregmin = server->acregmax = 0;
+                server->acdirmin = server->acdirmax = 0;
+                sb->s_flags |= MS_SYNCHRONOUS;
+        }
+        server->backing_dev_info.ra_pages = server->rpages * NFS_MAX_READAHEAD;
+        nfs_super_set_maxbytes(sb, fsinfo.maxfilesize);
+        server->client->cl_intr = (server->flags & NFS_MOUNT_INTR) ? 1 : 0;
+        server->client->cl_softrtry = (server->flags & NFS_MOUNT_SOFT) ? 1 : 0;
+        /* We're airborne Set socket buffersize */
+        rpc_setbufsize(server->client, server->wsize + 100, server->rsize + 100);
+        return 0;
+        /* Yargs. It didn't work out. */
+out_no_root:
+        dprintk("nfs_sb_init: get root inode failed: errno %d\n", -no_root_error);
+        if (!IS_ERR(root_inode))        /* reached with a valid inode when d_alloc_root() failed */
+                iput(root_inode);
+        return no_root_error;   /* NOTE(review): io_stats is not freed here - presumably done by the caller's teardown; confirm */
+}
+/*
+ * Fill in an RPC timeout descriptor from the mount parameters.
+ *
+ * @timeo is in tenths of a second and @retrans is a retry count.  A zero
+ * retry count becomes 2; a zero timeout becomes 60s for TCP and 1.1s for
+ * any other protocol.  The initial timeout is capped at
+ * NFS_MAX_TCP_TIMEOUT or NFS_MAX_UDP_TIMEOUT respectively.
+ */
+static void nfs_init_timeout_values(struct rpc_timeout *to, int proto, unsigned int timeo, unsigned int retrans)
+{
+        to->to_initval = timeo * HZ / 10;
+        to->to_retries = retrans;
+        if (to->to_retries == 0)
+                to->to_retries = 2;
+        if (proto == IPPROTO_TCP) {
+                /* TCP: fixed increment per retry, to_exponential cleared */
+                if (to->to_initval == 0)
+                        to->to_initval = 60 * HZ;
+                if (to->to_initval > NFS_MAX_TCP_TIMEOUT)
+                        to->to_initval = NFS_MAX_TCP_TIMEOUT;
+                to->to_increment = to->to_initval;
+                to->to_maxval = to->to_initval + (to->to_increment * to->to_retries);
+                to->to_exponential = 0;
+                return;
+        }
+        /* UDP and every other protocol: to_exponential set */
+        if (to->to_initval == 0)
+                to->to_initval = 11 * HZ / 10;
+        if (to->to_initval > NFS_MAX_UDP_TIMEOUT)
+                to->to_initval = NFS_MAX_UDP_TIMEOUT;
+        to->to_maxval = NFS_MAX_UDP_TIMEOUT;
+        to->to_exponential = 1;
+}
+/*
+ * Create an RPC client handle.
+ *
+ * Builds a transport to server->addr (TCP when NFS_MOUNT_TCP is set in
+ * data->flags, UDP otherwise) and creates an NFS RPC client on top of it.
+ * The timeout values chosen are recorded in server->retrans_timeo and
+ * server->retrans_count.
+ *
+ * Returns the new client, or an ERR_PTR() value if either the transport
+ * or the client could not be created.
+ */
+static struct rpc_clnt *
+nfs_create_client(struct nfs_server *server, const struct nfs_mount_data *data)
+{
+        struct rpc_timeout      timeparms;
+        struct rpc_xprt         *xprt = NULL;
+        struct rpc_clnt         *clnt = NULL;
+        int                     proto = (data->flags & NFS_MOUNT_TCP) ? IPPROTO_TCP : IPPROTO_UDP;
+        nfs_init_timeout_values(&timeparms, proto, data->timeo, data->retrans);
+        server->retrans_timeo = timeparms.to_initval;
+        server->retrans_count = timeparms.to_retries;
+        /* create transport and client */
+        xprt = xprt_create_proto(proto, &server->addr, &timeparms);
+        if (IS_ERR(xprt)) {
+                dprintk("%s: cannot create RPC transport. Error = %ld\n",
+                                __FUNCTION__, PTR_ERR(xprt));
+                return (struct rpc_clnt *)xprt;
+        }
+        clnt = rpc_create_client(xprt, server->hostname, &nfs_program,
+                                 server->rpc_ops->version, data->pseudoflavor);
+        if (IS_ERR(clnt)) {
+                /* Log the client's error code; xprt is a valid pointer here */
+                dprintk("%s: cannot create RPC client. Error = %ld\n",
+                                __FUNCTION__, PTR_ERR(clnt));
+                return clnt;
+        }
+        clnt->cl_intr     = 1;
+        clnt->cl_softrtry = 1;
+        return clnt;
+}
+/*
+ * Clone a server record
+ *
+ * Initialises the server record attached to @sb from the parent
+ * superblock data->sb: copies the superblock parameters, allocates fresh
+ * I/O statistics, clones the parent's RPC clients (the sys and ACL
+ * clients only if the parent has them) and sets up the root dentry from
+ * data->fh / data->fattr.
+ *
+ * Returns the server record on success, or an ERR_PTR() value on failure.
+ */
+static struct nfs_server *nfs_clone_server(struct super_block *sb, struct nfs_clone_mount *data)
+{
+        struct nfs_server *server = NFS_SB(sb);
+        struct nfs_server *parent = NFS_SB(data->sb);
+        struct inode *root_inode;
+        struct nfs_fsinfo fsinfo;
+        void *err = ERR_PTR(-ENOMEM);
+        sb->s_op = data->sb->s_op;
+        sb->s_blocksize = data->sb->s_blocksize;
+        sb->s_blocksize_bits = data->sb->s_blocksize_bits;
+        sb->s_maxbytes = data->sb->s_maxbytes;
+        server->client_sys = server->client_acl = ERR_PTR(-EINVAL);
+        server->io_stats = nfs_alloc_iostats();
+        if (server->io_stats == NULL)
+                goto out;
+        server->client = rpc_clone_client(parent->client);
+        if (IS_ERR((err = server->client)))
+                goto out;
+        if (!IS_ERR(parent->client_sys)) {
+                server->client_sys = rpc_clone_client(parent->client_sys);
+                if (IS_ERR((err = server->client_sys)))
+                        goto out;
+        }
+        if (!IS_ERR(parent->client_acl)) {
+                server->client_acl = rpc_clone_client(parent->client_acl);
+                if (IS_ERR((err = server->client_acl)))
+                        goto out;
+        }
+        root_inode = nfs_fhget(sb, data->fh, data->fattr);
+        if (IS_ERR(root_inode)) {
+                /* propagate the lookup error instead of dereferencing it */
+                err = root_inode;
+                goto out;
+        }
+        /*
+         * err currently holds a valid client pointer left over from the
+         * clone checks above; reset it so the remaining failure paths
+         * return a real error rather than that pointer.
+         */
+        err = ERR_PTR(-ENOMEM);
+        if (!root_inode)
+                goto out;
+        sb->s_root = d_alloc_root(root_inode);
+        if (!sb->s_root)
+                goto out_put_root;
+        fsinfo.fattr = data->fattr;
+        if (NFS_PROTO(root_inode)->fsinfo(server, data->fh, &fsinfo) == 0)
+                nfs_super_set_maxbytes(sb, fsinfo.maxfilesize);
+        sb->s_root->d_op = server->rpc_ops->dentry_ops;
+        sb->s_flags |= MS_ACTIVE;
+        return server;
+out_put_root:
+        iput(root_inode);
+out:
+        return err;
+}
+/*
+ * Copy an existing superblock and attach revised data
+ *
+ * Duplicates the parent's server record (with its own copy of the host
+ * name, taken from data->hostname when set), brings up rpciod, and asks
+ * @fill_sb for a superblock.  If that superblock already has a root, the
+ * duplicate is discarded and the superblock is attached to @mnt as is;
+ * otherwise @fill_server completes it first.
+ *
+ * Returns the result of simple_set_mnt() on success or a negative errno.
+ */
+static int nfs_clone_generic_sb(struct nfs_clone_mount *data,
+                struct super_block *(*fill_sb)(struct nfs_server *, struct nfs_clone_mount *),
+                struct nfs_server *(*fill_server)(struct super_block *, struct nfs_clone_mount *),
+                struct vfsmount *mnt)
+{
+        struct nfs_server *server;
+        struct nfs_server *parent = NFS_SB(data->sb);
+        struct super_block *sb = ERR_PTR(-EINVAL);
+        char *hostname;
+        int error = -ENOMEM;
+        int len;
+        server = kmalloc(sizeof(struct nfs_server), GFP_KERNEL);
+        if (server == NULL)
+                goto out_err;
+        memcpy(server, parent, sizeof(*server));        /* start from a byte copy of the parent record */
+        hostname = (data->hostname != NULL) ? data->hostname : parent->hostname;
+        len = strlen(hostname) + 1;
+        server->hostname = kmalloc(len, GFP_KERNEL);    /* replaces the pointer copied from the parent */
+        if (server->hostname == NULL)
+                goto free_server;
+        memcpy(server->hostname, hostname, len);
+        error = rpciod_up();
+        if (error != 0)
+                goto free_hostname;
+        sb = fill_sb(server, data);
+        if (IS_ERR(sb)) {
+                error = PTR_ERR(sb);
+                goto kill_rpciod;
+        }
+                
+        if (sb->s_root)         /* presumably an existing superblock was matched - reuse it */
+                goto out_rpciod_down;
+        server = fill_server(sb, data);
+        if (IS_ERR(server)) {
+                error = PTR_ERR(server);
+                goto out_deactivate;
+        }
+        return simple_set_mnt(mnt, sb);
+out_deactivate:         /* NOTE(review): server/hostname are not freed here - presumably by the kill_sb path; confirm */
+        up_write(&sb->s_umount);
+        deactivate_super(sb);
+        return error;
+out_rpciod_down:        /* not an error: drop our duplicate record and hand back the superblock */
+        rpciod_down();
+        kfree(server->hostname);
+        kfree(server);
+        return simple_set_mnt(mnt, sb);
+kill_rpciod:
+        rpciod_down();
+free_hostname:
+        kfree(server->hostname);
+free_server:
+        kfree(server);
+out_err:
+        return error;
+}
+/*
+ * Set up an NFS2/3 superblock
+ *
+ * The way this works is that the mount process passes a structure
+ * in the data argument which contains the server's IP address
+ * and the root file handle obtained from the server's mount
+ * daemon. We stash these away in the private superblock fields.
+ *
+ * Copies the mount parameters into the server record, starts lockd unless
+ * NFS_MOUNT_NONLM is set, selects the v2 or v3 operations vector, creates
+ * the RPC clients (main, AUTH_UNIX "sys" and optionally ACL) and finishes
+ * with nfs_sb_init().  Returns 0 or a negative errno.  @silent is not
+ * referenced in this function body.
+ *
+ * NOTE(review): the early error returns do not undo lockd_up() or release
+ * the host name / clients - presumably the caller's teardown does; confirm.
+ */
+static int
+nfs_fill_super(struct super_block *sb, struct nfs_mount_data *data, int silent)
+{
+        struct nfs_server       *server;
+        rpc_authflavor_t        authflavor;
+        server           = NFS_SB(sb);
+        sb->s_blocksize_bits = 0;
+        sb->s_blocksize = 0;
+        if (data->bsize)
+                sb->s_blocksize = nfs_block_size(data->bsize, &sb->s_blocksize_bits);
+        if (data->rsize)
+                server->rsize = nfs_block_size(data->rsize, NULL);
+        if (data->wsize)
+                server->wsize = nfs_block_size(data->wsize, NULL);
+        server->flags    = data->flags & NFS_MOUNT_FLAGMASK;
+        server->acregmin = data->acregmin*HZ;   /* mount data is in seconds; stored in jiffies */
+        server->acregmax = data->acregmax*HZ;
+        server->acdirmin = data->acdirmin*HZ;
+        server->acdirmax = data->acdirmax*HZ;
+        /* Start lockd here, before we might error out */
+        if (!(server->flags & NFS_MOUNT_NONLM))
+                lockd_up();     /* return value is not checked */
+        server->namelen  = data->namlen;
+        server->hostname = kmalloc(strlen(data->hostname) + 1, GFP_KERNEL);
+        if (!server->hostname)
+                return -ENOMEM;
+        strcpy(server->hostname, data->hostname);
+        /* Check NFS protocol revision and initialize RPC op vector
+         * and file handle pool. */
+#ifdef CONFIG_NFS_V3
+        if (server->flags & NFS_MOUNT_VER3) {
+                server->rpc_ops = &nfs_v3_clientops;
+                server->caps |= NFS_CAP_READDIRPLUS;
+        } else {
+                server->rpc_ops = &nfs_v2_clientops;
+        }
+#else
+        server->rpc_ops = &nfs_v2_clientops;
+#endif
+        /* Fill in pseudoflavor for mount version < 5 */
+        if (!(data->flags & NFS_MOUNT_SECFLAVOUR))
+                data->pseudoflavor = RPC_AUTH_UNIX;
+        authflavor = data->pseudoflavor;        /* save for sb_init() */
+        /* XXX maybe we want to add a server->pseudoflavor field */
+        /* Create RPC client handles */
+        server->client = nfs_create_client(server, data);
+        if (IS_ERR(server->client))
+                return PTR_ERR(server->client);
+        /* RFC 2623, sec 2.3.2 */
+        if (authflavor != RPC_AUTH_UNIX) {
+                struct rpc_auth *auth;
+                server->client_sys = rpc_clone_client(server->client);  /* separate client carrying AUTH_UNIX credentials */
+                if (IS_ERR(server->client_sys))
+                        return PTR_ERR(server->client_sys);
+                auth = rpcauth_create(RPC_AUTH_UNIX, server->client_sys);
+                if (IS_ERR(auth))
+                        return PTR_ERR(auth);
+        } else {
+                atomic_inc(&server->client->cl_count);  /* client_sys aliases client, so take an extra reference */
+                server->client_sys = server->client;
+        }
+        if (server->flags & NFS_MOUNT_VER3) {   /* NOTE(review): tested even when CONFIG_NFS_V3 is off and rpc_ops is the v2 vector */
+#ifdef CONFIG_NFS_V3_ACL
+                if (!(server->flags & NFS_MOUNT_NOACL)) {
+                        server->client_acl = rpc_bind_new_program(server->client, &nfsacl_program, 3);
+                        /* No errors! Assume that Sun nfsacls are supported */
+                        if (!IS_ERR(server->client_acl))
+                                server->caps |= NFS_CAP_ACLS;
+                }
+#else
+                server->flags &= ~NFS_MOUNT_NOACL;
+#endif /* CONFIG_NFS_V3_ACL */
+                /*
+                 * The VFS shouldn't apply the umask to mode bits. We will
+                 * do so ourselves when necessary.
+                 */
+                sb->s_flags |= MS_POSIXACL;
+                if (server->namelen == 0 || server->namelen > NFS3_MAXNAMLEN)
+                        server->namelen = NFS3_MAXNAMLEN;
+                sb->s_time_gran = 1;
+        } else {
+                if (server->namelen == 0 || server->namelen > NFS2_MAXNAMLEN)
+                        server->namelen = NFS2_MAXNAMLEN;
+        }
+        sb->s_op = &nfs_sops;
+        return nfs_sb_init(sb, authflavor);
+}
+/*
+ * "set" callback handed to sget(): record @data (the nfs_server built by the
+ * caller) as the superblock's private info, then return whatever
+ * set_anon_super() returns.
+ */
+static int nfs_set_super(struct super_block *s, void *data)
+{
+        int status;
+        s->s_fs_info = data;
+        status = set_anon_super(s, data);
+        return status;
+}
+/*
+ * "test" callback handed to sget(): decide whether the existing superblock
+ * @sb matches the nfs_server passed in @data.
+ *
+ * Returns 1 when the server address and port are equal and
+ * nfs_compare_fh() returns 0 for the two root file handles; 0 otherwise.
+ */
+static int nfs_compare_super(struct super_block *sb, void *data)
+{
+        struct nfs_server *wanted = data;
+        struct nfs_server *existing = NFS_SB(sb);
+        int same_host = existing->addr.sin_addr.s_addr == wanted->addr.sin_addr.s_addr;
+        int same_port = existing->addr.sin_port == wanted->addr.sin_port;
+        if (!same_host || !same_port)
+                return 0;
+        return nfs_compare_fh(&existing->fh, &wanted->fh) == 0;
+}
+/*
+ * Mount entry point for NFSv2/v3.
+ *
+ * Normalises older nfs_mount_data layouts, builds an nfs_server from the
+ * mount data, takes a reference on rpciod and asks sget() for a superblock
+ * matching that server.  A superblock that already has a root is reused
+ * as-is; a fresh one is populated through nfs_fill_super().
+ *
+ * Returns the result of simple_set_mnt() on success, or a negative errno.
+ */
+static int nfs_get_sb(struct file_system_type *fs_type,
+        int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt)
+{
+        struct nfs_mount_data *data = raw_data;
+        struct nfs_server *server = NULL;
+        struct super_block *s;
+        struct nfs_fh *root;
+        int error = -EINVAL;
+        if (!data) {
+                dprintk("%s: missing data argument\n", __FUNCTION__);
+                goto out_err_noserver;
+        }
+        if (data->version <= 0 || data->version > NFS_MOUNT_VERSION) {
+                dprintk("%s: bad mount version\n", __FUNCTION__);
+                goto out_err_noserver;
+        }
+        /*
+         * Each case fixes up (or rejects) what that layout version cannot
+         * express and then deliberately falls through to the newer ones.
+         */
+        switch (data->version) {
+                case 1:
+                        data->namlen = 0;
+                        /* fall through */
+                case 2:
+                        data->bsize  = 0;
+                        /* fall through */
+                case 3:
+                        if (data->flags & NFS_MOUNT_VER3) {
+                                dprintk("%s: mount structure version %d does not support NFSv3\n",
+                                                __FUNCTION__,
+                                                data->version);
+                                goto out_err_noserver;
+                        }
+                        data->root.size = NFS2_FHSIZE;
+                        memcpy(data->root.data, data->old_root.data, NFS2_FHSIZE);
+                        /* fall through */
+                case 4:
+                        if (data->flags & NFS_MOUNT_SECFLAVOUR) {
+                                dprintk("%s: mount structure version %d does not support strong security\n",
+                                                __FUNCTION__,
+                                                data->version);
+                                goto out_err_noserver;
+                        }
+                        /* fall through */
+                case 5:
+                        memset(data->context, 0, sizeof(data->context));
+        }
+#ifndef CONFIG_NFS_V3
+        /* If NFSv3 is not compiled in, return -EPROTONOSUPPORT */
+        error = -EPROTONOSUPPORT;
+        if (data->flags & NFS_MOUNT_VER3) {
+                dprintk("%s: NFSv3 not compiled into kernel\n", __FUNCTION__);
+                goto out_err_noserver;
+        }
+#endif /* CONFIG_NFS_V3 */
+        error = -ENOMEM;
+        server = kzalloc(sizeof(*server), GFP_KERNEL);
+        if (server == NULL)
+                goto out_err_noserver;
+        /* Zero out the NFS state stuff */
+        init_nfsv4_state(server);
+        /* Mark every RPC client handle as "not created yet" */
+        server->client_acl = ERR_PTR(-EINVAL);
+        server->client_sys = ERR_PTR(-EINVAL);
+        server->client = ERR_PTR(-EINVAL);
+        /* Copy the root file handle; only v3 carries an explicit size */
+        root = &server->fh;
+        root->size = (data->flags & NFS_MOUNT_VER3) ? data->root.size : NFS2_FHSIZE;
+        error = -EINVAL;
+        if (root->size > sizeof(root->data)) {
+                dprintk("%s: invalid root filehandle\n", __FUNCTION__);
+                goto out_err;
+        }
+        memcpy(root->data, data->root.data, root->size);
+        /* We now require that the mount process passes the remote address */
+        memcpy(&server->addr, &data->addr, sizeof(server->addr));
+        if (server->addr.sin_addr.s_addr == INADDR_ANY) {
+                dprintk("%s: mount program didn't pass remote address!\n",
+                                __FUNCTION__);
+                goto out_err;
+        }
+        /* Fire up rpciod if not yet running */
+        error = rpciod_up();
+        if (error < 0) {
+                dprintk("%s: couldn't start rpciod! Error = %d\n",
+                                __FUNCTION__, error);
+                goto out_err;
+        }
+        s = sget(fs_type, nfs_compare_super, nfs_set_super, server);
+        if (IS_ERR(s)) {
+                error = PTR_ERR(s);
+                goto out_err_rpciod;
+        }
+        if (s->s_root) {
+                /*
+                 * sget() returned a superblock that is already set up:
+                 * drop the rpciod reference taken above and the unused
+                 * nfs_server, then mount the existing superblock.
+                 */
+                rpciod_down();
+                kfree(server);
+                return simple_set_mnt(mnt, s);
+        }
+        s->s_flags = flags;
+        error = nfs_fill_super(s, data, (flags & MS_SILENT) != 0);
+        if (error) {
+                /*
+                 * NOTE(review): server is not freed on this path;
+                 * presumably it is released through nfs_kill_super() when
+                 * deactivate_super() drops the superblock - confirm.
+                 */
+                up_write(&s->s_umount);
+                deactivate_super(s);
+                return error;
+        }
+        s->s_flags |= MS_ACTIVE;
+        return simple_set_mnt(mnt, s);
+out_err_rpciod:
+        rpciod_down();
+out_err:
+        kfree(server);
+out_err_noserver:
+        return error;
+}
+/*
+ * Tear down an NFSv2/v3 superblock: kill the anonymous superblock, shut
+ * down whichever RPC clients were actually created (handles that still
+ * hold an ERR_PTR value are skipped), drop the lockd and rpciod references,
+ * and free the nfs_server together with its statistics and hostname.
+ */
+static void nfs_kill_super(struct super_block *s)
+{
+        struct nfs_server *srv = NFS_SB(s);
+        int uses_lockd = !(srv->flags & NFS_MOUNT_NONLM);
+        kill_anon_super(s);
+        if (!IS_ERR(srv->client))
+                rpc_shutdown_client(srv->client);
+        if (!IS_ERR(srv->client_sys))
+                rpc_shutdown_client(srv->client_sys);
+        if (!IS_ERR(srv->client_acl))
+                rpc_shutdown_client(srv->client_acl);
+        if (uses_lockd)
+                lockd_down();   /* release rpc.lockd */
+        rpciod_down();          /* release rpciod */
+        nfs_free_iostats(srv->io_stats);
+        kfree(srv->hostname);
+        kfree(srv);
+        nfs_release_automount_timer();
+}
+/*
+ * Find or create the superblock for a cloned NFSv2/v3 mount.
+ *
+ * Copies the fsid and file handle from @data into @server and asks sget()
+ * for a matching superblock.  lockd_up() is called only when sget()
+ * succeeded, the superblock has no root yet, and the mount does not have
+ * NFS_MOUNT_NONLM set.  Returns whatever sget() returned (possibly an
+ * ERR_PTR).
+ */
+static struct super_block *nfs_clone_sb(struct nfs_server *server, struct nfs_clone_mount *data)
+{
+        struct super_block *sb;
+        server->fsid = data->fattr->fsid;
+        nfs_copy_fh(&server->fh, data->fh);
+        sb = sget(&nfs_fs_type, nfs_compare_super, nfs_set_super, server);
+        if (IS_ERR(sb))
+                return sb;
+        if (sb->s_root != NULL)
+                return sb;
+        if (server->flags & NFS_MOUNT_NONLM)
+                return sb;
+        lockd_up();
+        return sb;
+}
+/*
+ * get_sb-style entry point for cloning an NFSv2/v3 mount: @raw_data is the
+ * struct nfs_clone_mount describing the clone; the work is delegated to
+ * nfs_clone_generic_sb() with the v2/v3 helpers.
+ */
+static int nfs_clone_nfs_sb(struct file_system_type *fs_type,
+                int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt)
+{
+        struct nfs_clone_mount *clone = raw_data;
+        int status = nfs_clone_generic_sb(clone, nfs_clone_sb, nfs_clone_server, mnt);
+        return status;
+}
+#ifdef CONFIG_NFS_V4
+/*
+ * Obtain an RPC client for an NFSv4 mount.
+ *
+ * Looks up the nfs4_client for the server address.  If that nfs4_client has
+ * no RPC client yet (cl_rpcclient holds an ERR_PTR), a transport and RPC
+ * client are created under cl_sem and stored in it.  The server is then
+ * added to the client's superblock list and a clone of the shared RPC
+ * client is returned, switched to @flavor when its auth flavour differs.
+ *
+ * Returns the cloned rpc_clnt, or an ERR_PTR on failure.
+ */
+static struct rpc_clnt *nfs4_create_client(struct nfs_server *server,
+        struct rpc_timeout *timeparms, int proto, rpc_authflavor_t flavor)
+{
+        struct nfs4_client *clp;
+        struct rpc_xprt *xprt = NULL;
+        struct rpc_clnt *clnt = NULL;
+        int err = -EIO;
+        clp = nfs4_get_client(&server->addr.sin_addr);
+        if (!clp) {
+                dprintk("%s: failed to create NFS4 client.\n", __FUNCTION__);
+                return ERR_PTR(err);
+        }
+        /* Now create transport and client */
+        down_write(&clp->cl_sem);
+        if (IS_ERR(clp->cl_rpcclient)) {
+                xprt = xprt_create_proto(proto, &server->addr, timeparms);
+                if (IS_ERR(xprt)) {
+                        up_write(&clp->cl_sem);
+                        err = PTR_ERR(xprt);
+                        dprintk("%s: cannot create RPC transport. Error = %d\n",
+                                        __FUNCTION__, err);
+                        goto out_fail;
+                }
+                /* Bind to a reserved port! */
+                xprt->resvport = 1;
+                clnt = rpc_create_client(xprt, server->hostname, &nfs_program,
+                                server->rpc_ops->version, flavor);
+                if (IS_ERR(clnt)) {
+                        up_write(&clp->cl_sem);
+                        err = PTR_ERR(clnt);
+                        dprintk("%s: cannot create RPC client. Error = %d\n",
+                                        __FUNCTION__, err);
+                        goto out_fail;
+                }
+                clnt->cl_intr     = 1;
+                clnt->cl_softrtry = 1;
+                clp->cl_rpcclient = clnt;
+                memcpy(clp->cl_ipaddr, server->ip_addr, sizeof(clp->cl_ipaddr));
+                nfs_idmap_new(clp);
+        }
+        list_add_tail(&server->nfs4_siblings, &clp->cl_superblocks);
+        clnt = rpc_clone_client(clp->cl_rpcclient);
+        if (!IS_ERR(clnt))
+                server->nfs4_state = clp;
+        up_write(&clp->cl_sem);
+        clp = NULL;
+        if (IS_ERR(clnt)) {
+                /* Report the clone's real error, not the initial -EIO */
+                err = PTR_ERR(clnt);
+                dprintk("%s: cannot create RPC client. Error = %d\n",
+                                __FUNCTION__, err);
+                return clnt;
+        }
+        if (server->nfs4_state->cl_idmap == NULL) {
+                /*
+                 * NOTE(review): the cloned clnt is not released on this
+                 * path - looks like a leak; confirm before changing.
+                 */
+                dprintk("%s: failed to create idmapper.\n", __FUNCTION__);
+                return ERR_PTR(-ENOMEM);
+        }
+        if (clnt->cl_auth->au_flavor != flavor) {
+                struct rpc_auth *auth;
+                auth = rpcauth_create(flavor, clnt);
+                if (IS_ERR(auth)) {
+                        /* NOTE(review): clnt is not released here either */
+                        dprintk("%s: couldn't create credcache!\n", __FUNCTION__);
+                        return (struct rpc_clnt *)auth;
+                }
+        }
+        return clnt;
+ out_fail:
+        if (clp)
+                nfs4_put_client(clp);
+        return ERR_PTR(err);
+}
+/*
+ * Set up an NFS4 superblock
+ *
+ * Copies the mount parameters from @data into the nfs_server attached to
+ * @sb, picks the RPC auth flavour (RPC_AUTH_UNIX unless exactly one flavour
+ * is supplied from user space), creates the RPC client and finishes with
+ * nfs_sb_init().  Returns 0 or a negative errno.  @silent is not used.
+ */
+static int nfs4_fill_super(struct super_block *sb, struct nfs4_mount_data *data, int silent)
+{
+        struct nfs_server *server = NFS_SB(sb);
+        struct rpc_timeout timeparms;
+        rpc_authflavor_t authflavour = RPC_AUTH_UNIX;
+        int err;
+        sb->s_blocksize_bits = 0;
+        sb->s_blocksize = 0;
+        if (data->rsize != 0)
+                server->rsize = nfs_block_size(data->rsize, NULL);
+        if (data->wsize != 0)
+                server->wsize = nfs_block_size(data->wsize, NULL);
+        server->flags = data->flags & NFS_MOUNT_FLAGMASK;
+        server->caps = NFS_CAP_ATOMIC_OPEN;
+        /* Attribute cache timeouts arrive in seconds; store them in jiffies */
+        server->acregmin = data->acregmin*HZ;
+        server->acregmax = data->acregmax*HZ;
+        server->acdirmin = data->acdirmin*HZ;
+        server->acdirmax = data->acdirmax*HZ;
+        server->rpc_ops = &nfs_v4_clientops;
+        nfs_init_timeout_values(&timeparms, data->proto, data->timeo, data->retrans);
+        server->retrans_timeo = timeparms.to_initval;
+        server->retrans_count = timeparms.to_retries;
+        /* Now create transport and client */
+        if (data->auth_flavourlen != 0) {
+                if (data->auth_flavourlen != 1) {
+                        dprintk("%s: Invalid number of RPC auth flavours %d.\n",
+                                        __FUNCTION__, data->auth_flavourlen);
+                        return -EINVAL;
+                }
+                if (copy_from_user(&authflavour, data->auth_flavours, sizeof(authflavour)))
+                        return -EFAULT;
+        }
+        server->client = nfs4_create_client(server, &timeparms, data->proto, authflavour);
+        if (IS_ERR(server->client)) {
+                err = PTR_ERR(server->client);
+                dprintk("%s: cannot create RPC client. Error = %d\n",
+                                __FUNCTION__, err);
+                return err;
+        }
+        sb->s_time_gran = 1;
+        sb->s_op = &nfs4_sops;
+        return nfs_sb_init(sb, authflavour);
+}
+/*
+ * "test" callback handed to sget() for NFSv4: the existing superblock @sb
+ * matches the nfs_server in @data only when both the hostname and the mount
+ * path strings are equal.  Returns 1 on a match, 0 otherwise.
+ */
+static int nfs4_compare_super(struct super_block *sb, void *data)
+{
+        struct nfs_server *wanted = data;
+        struct nfs_server *existing = NFS_SB(sb);
+        return strcmp(wanted->hostname, existing->hostname) == 0 &&
+               strcmp(wanted->mnt_path, existing->mnt_path) == 0;
+}
+/*
+ * Copy a user-space nfs_string into a NUL-terminated kernel buffer.
+ *
+ * @dst:    destination buffer, or NULL to have one kmalloc()ed here
+ * @src:    user string descriptor (length plus user pointer)
+ * @maxlen: maximum number of bytes to copy, excluding the terminating NUL
+ *
+ * At most min(src->len, @maxlen) bytes are copied.  Returns the destination
+ * buffer, or ERR_PTR(-EINVAL) for an empty string, ERR_PTR(-ENOMEM) when the
+ * allocation fails, ERR_PTR(-EFAULT) when the user copy fails.  A buffer
+ * allocated here is freed again on failure.
+ */
+static void *
+nfs_copy_user_string(char *dst, struct nfs_string *src, int maxlen)
+{
+        void *allocated = NULL;
+        if (src->len == 0)
+                return ERR_PTR(-EINVAL);
+        if (src->len < maxlen)
+                maxlen = src->len;
+        if (dst == NULL) {
+                allocated = kmalloc(maxlen + 1, GFP_KERNEL);
+                if (allocated == NULL)
+                        return ERR_PTR(-ENOMEM);
+                dst = allocated;
+        }
+        if (copy_from_user(dst, src->data, maxlen) != 0) {
+                /* kfree(NULL) is harmless when the caller supplied dst */
+                kfree(allocated);
+                return ERR_PTR(-EFAULT);
+        }
+        dst[maxlen] = '\0';
+        return dst;
+}
+/*
+ * Mount entry point for NFSv4.
+ *
+ * Builds an nfs_server from the user-supplied nfs4_mount_data (hostname,
+ * mount path, client address and server address are all copied in from
+ * user space), takes a reference on rpciod and asks sget() for a matching
+ * superblock.  A superblock that already has a root is reused as-is; a
+ * fresh one is populated through nfs4_fill_super().
+ *
+ * Returns the result of simple_set_mnt() on success, or a negative errno.
+ */
+static int nfs4_get_sb(struct file_system_type *fs_type,
+        int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt)
+{
+        int error;
+        struct nfs_server *server;
+        struct super_block *s;
+        struct nfs4_mount_data *data = raw_data;
+        void *p;
+        if (data == NULL) {
+                dprintk("%s: missing data argument\n", __FUNCTION__);
+                return -EINVAL;
+        }
+        if (data->version <= 0 || data->version > NFS4_MOUNT_VERSION) {
+                dprintk("%s: bad mount version\n", __FUNCTION__);
+                return -EINVAL;
+        }
+        server = kzalloc(sizeof(struct nfs_server), GFP_KERNEL);
+        if (!server)
+                return -ENOMEM;
+        /* Zero out the NFS state stuff */
+        init_nfsv4_state(server);
+        server->client = server->client_sys = server->client_acl = ERR_PTR(-EINVAL);
+        p = nfs_copy_user_string(NULL, &data->hostname, 256);
+        if (IS_ERR(p))
+                goto out_err;
+        server->hostname = p;
+        p = nfs_copy_user_string(NULL, &data->mnt_path, 1024);
+        if (IS_ERR(p))
+                goto out_err;
+        server->mnt_path = p;
+        p = nfs_copy_user_string(server->ip_addr, &data->client_addr,
+                        sizeof(server->ip_addr) - 1);
+        if (IS_ERR(p))
+                goto out_err;
+        /* We now require that the mount process passes the remote address */
+        if (data->host_addrlen != sizeof(server->addr)) {
+                error = -EINVAL;
+                goto out_free;
+        }
+        if (copy_from_user(&server->addr, data->host_addr, sizeof(server->addr))) {
+                error = -EFAULT;
+                goto out_free;
+        }
+        if (server->addr.sin_family != AF_INET ||
+            server->addr.sin_addr.s_addr == INADDR_ANY) {
+                dprintk("%s: mount program didn't pass remote IP address!\n",
+                                __FUNCTION__);
+                error = -EINVAL;
+                goto out_free;
+        }
+        /* Fire up rpciod if not yet running */
+        error = rpciod_up();
+        if (error < 0) {
+                dprintk("%s: couldn't start rpciod! Error = %d\n",
+                                __FUNCTION__, error);
+                goto out_free;
+        }
+        s = sget(fs_type, nfs4_compare_super, nfs_set_super, server);
+        if (IS_ERR(s)) {
+                error = PTR_ERR(s);
+                /* rpciod_up() succeeded above, so its reference must be dropped */
+                goto out_rpciod;
+        }
+        if (s->s_root) {
+                /*
+                 * An already set-up superblock is being reused: drop the
+                 * rpciod reference taken above (as nfs_get_sb() does) along
+                 * with the now unused nfs_server.
+                 */
+                rpciod_down();
+                kfree(server->mnt_path);
+                kfree(server->hostname);
+                kfree(server);
+                return simple_set_mnt(mnt, s);
+        }
+        s->s_flags = flags;
+        error = nfs4_fill_super(s, data, flags & MS_SILENT ? 1 : 0);
+        if (error) {
+                /*
+                 * NOTE(review): server and the rpciod reference are not
+                 * released here; presumably nfs4_kill_super() does that when
+                 * deactivate_super() drops the superblock - confirm.
+                 */
+                up_write(&s->s_umount);
+                deactivate_super(s);
+                return error;
+        }
+        s->s_flags |= MS_ACTIVE;
+        return simple_set_mnt(mnt, s);
+out_err:
+        error = PTR_ERR(p);
+        goto out_free;
+out_rpciod:
+        rpciod_down();
+out_free:
+        kfree(server->mnt_path);
+        kfree(server->hostname);
+        kfree(server);
+        return error;
+}
+/*
+ * Tear down an NFSv4 superblock: return all delegations, kill the anonymous
+ * superblock, prepare the lease renewal daemon for shutdown, shut down the
+ * RPC client if one was created, destroy the NFSv4 state, drop the rpciod
+ * reference and free the nfs_server with its statistics and hostname.
+ */
+static void nfs4_kill_super(struct super_block *sb)
+{
+        struct nfs_server *srv = NFS_SB(sb);
+        nfs_return_all_delegations(sb);
+        kill_anon_super(sb);
+        nfs4_renewd_prepare_shutdown(srv);
+        /* client may be NULL or an ERR_PTR if setup never got that far */
+        if (srv->client != NULL) {
+                if (!IS_ERR(srv->client))
+                        rpc_shutdown_client(srv->client);
+        }
+        destroy_nfsv4_state(srv);
+        rpciod_down();
+        nfs_free_iostats(srv->io_stats);
+        kfree(srv->hostname);
+        kfree(srv);
+        nfs_release_automount_timer();
+}
+/*
+ * Constructs the SERVER-side path
+ *
+ * Builds the path for @dentry with nfs4_path() in a temporary page and
+ * returns a kmalloc()ed copy of it, which the caller must kfree().
+ * Returns ERR_PTR(-ENOMEM) when either allocation fails, or the ERR_PTR
+ * produced by nfs4_path().
+ */
+static inline char *nfs4_dup_path(const struct dentry *dentry)
+{
+        char *page = (char *) __get_free_page(GFP_USER);
+        char *path;
+        /* The page allocation can fail; never hand a NULL buffer to nfs4_path() */
+        if (page == NULL)
+                return ERR_PTR(-ENOMEM);
+        path = nfs4_path(dentry, page, PAGE_SIZE);
+        if (!IS_ERR(path)) {
+                /* Bytes from the returned pointer to the end of the page */
+                int len = PAGE_SIZE + page - path;
+                char *tmp = path;
+                path = kmalloc(len, GFP_KERNEL);
+                if (path)
+                        memcpy(path, tmp, len);
+                else
+                        path = ERR_PTR(-ENOMEM);
+        }
+        free_page((unsigned long)page);
+        return path;
+}
+/*
+ * Find or create the superblock for a cloned NFSv4 mount.
+ *
+ * Copies the fsid and file handle from @data into @server, duplicates the
+ * server-side path of data->dentry into server->mnt_path and asks sget()
+ * for a matching superblock.  For a brand-new superblock the server
+ * capabilities are probed and the server is linked into its nfs4_client's
+ * superblock list, taking a reference on that client.
+ *
+ * Returns the superblock or an ERR_PTR.  When no new superblock results
+ * (error, or an existing one is reused) server->mnt_path is freed and reset
+ * to NULL.
+ */
+static struct super_block *nfs4_clone_sb(struct nfs_server *server, struct nfs_clone_mount *data)
+{
+        struct nfs4_client *clp = server->nfs4_state;
+        struct super_block *sb;
+        server->fsid = data->fattr->fsid;
+        nfs_copy_fh(&server->fh, data->fh);
+        server->mnt_path = nfs4_dup_path(data->dentry);
+        if (IS_ERR(server->mnt_path)) {
+                /* Pass the path error back in the superblock return slot */
+                sb = (struct super_block *)server->mnt_path;
+                server->mnt_path = NULL;
+                return sb;
+        }
+        sb = sget(&nfs4_fs_type, nfs4_compare_super, nfs_set_super, server);
+        if (IS_ERR(sb) || sb->s_root) {
+                kfree(server->mnt_path);
+                server->mnt_path = NULL;
+                return sb;
+        }
+        nfs4_server_capabilities(server, &server->fh);
+        down_write(&clp->cl_sem);
+        atomic_inc(&clp->cl_count);
+        list_add_tail(&server->nfs4_siblings, &clp->cl_superblocks);
+        up_write(&clp->cl_sem);
+        return sb;
+}
+/*
+ * get_sb-style entry point for cloning an NFSv4 mount: @raw_data is the
+ * struct nfs_clone_mount describing the clone; the work is delegated to
+ * nfs_clone_generic_sb() with the NFSv4 superblock helper.
+ */
+static int nfs_clone_nfs4_sb(struct file_system_type *fs_type,
+                int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt)
+{
+        struct nfs_clone_mount *clone = raw_data;
+        int status = nfs_clone_generic_sb(clone, nfs4_clone_sb, nfs_clone_server, mnt);
+        return status;
+}
+/*
+ * Find or create the superblock for an NFSv4 referral mount.
+ *
+ * Copies the mount path and server address from @data into @server and asks
+ * sget() for a matching superblock.  Returns the superblock or an ERR_PTR
+ * (-ENOMEM when the path copy cannot be allocated).  When no new superblock
+ * results, server->mnt_path is freed and left NULL.
+ */
+static struct super_block *nfs4_referral_sb(struct nfs_server *server, struct nfs_clone_mount *data)
+{
+        struct super_block *sb;
+        int len = strlen(data->mnt_path) + 1;   /* include the NUL */
+        server->mnt_path = kmalloc(len, GFP_KERNEL);
+        if (server->mnt_path == NULL)
+                return ERR_PTR(-ENOMEM);
+        memcpy(server->mnt_path, data->mnt_path, len);
+        memcpy(&server->addr, data->addr, sizeof(struct sockaddr_in));
+        sb = sget(&nfs4_fs_type, nfs4_compare_super, nfs_set_super, server);
+        if (!IS_ERR(sb) && !sb->s_root)
+                return sb;
+        kfree(server->mnt_path);
+        server->mnt_path = NULL;
+        return sb;
+}
+/*
+ * Finish setting up the nfs_server behind a referral superblock.
+ *
+ * Creates the RPC client over TCP with deliberately small timeout and retry
+ * values, installs the NFSv4 superblock operations and runs nfs_sb_init().
+ * Returns the server on success, or an ERR_PTR carrying the failure.
+ */
+static struct nfs_server *nfs4_referral_server(struct super_block *sb, struct nfs_clone_mount *data)
+{
+        struct nfs_server *server = NFS_SB(sb);
+        struct rpc_timeout timeparms;
+        const int proto = IPPROTO_TCP;
+        /* Since we are following a referral and there may be alternatives,
+           set the timeouts and retries to low values */
+        const int timeo = 2;
+        const int retrans = 1;
+        void *status;
+        nfs_init_timeout_values(&timeparms, proto, timeo, retrans);
+        server->client = nfs4_create_client(server, &timeparms, proto, data->authflavor);
+        if (IS_ERR(server->client))
+                return (struct nfs_server *)server->client;
+        sb->s_time_gran = 1;
+        sb->s_op = &nfs4_sops;
+        status = ERR_PTR(nfs_sb_init(sb, data->authflavor));
+        if (IS_ERR(status))
+                return (struct nfs_server *)status;
+        return server;
+}
+/*
+ * get_sb-style entry point for following an NFSv4 referral: @raw_data is
+ * the struct nfs_clone_mount describing the referral; the work is delegated
+ * to nfs_clone_generic_sb() with the referral helpers.
+ */
+static int nfs_referral_nfs4_sb(struct file_system_type *fs_type,
+                int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt)
+{
+        struct nfs_clone_mount *referral = raw_data;
+        int status = nfs_clone_generic_sb(referral, nfs4_referral_sb, nfs4_referral_server, mnt);
+        return status;
+}
+#endif
diff --git a/fs/nfs/symlink.c b/fs/nfs/symlink.c
index 18dc95b0b646..600bbe630abd 100644
--- a/fs/nfs/symlink.c
+++ b/fs/nfs/symlink.c
@@ -52,7 +52,7 @@ static void *nfs_follow_link(struct dentry *dentry, struct nameidata *nd)
 {
        struct inode *inode = dentry->d_inode;
        struct page *page;
-        void *err = ERR_PTR(nfs_revalidate_inode(NFS_SERVER(inode), inode));
+        void *err = ERR_PTR(nfs_revalidate_mapping(inode, inode->i_mapping));
        if (err)
                goto read_failed;
        page = read_cache_page(&inode->i_data, 0,
@@ -75,22 +75,13 @@ read_failed:
        return NULL;
 }
-static void nfs_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
-{
-        if (cookie) {
-                struct page *page = cookie;
-                kunmap(page);
-                page_cache_release(page);
-        }
-}
 /*
 * symlinks can't do much...
 */
 struct inode_operations nfs_symlink_inode_operations = {
        .readlink       = generic_readlink,
        .follow_link    = nfs_follow_link,
-        .put_link       = nfs_put_link,
+        .put_link       = page_put_link,
        .getattr        = nfs_getattr,
        .setattr        = nfs_setattr,
 };
diff --git a/fs/nfs/sysctl.c b/fs/nfs/sysctl.c
index 4c486eb867ca..2fe3403c2409 100644
--- a/fs/nfs/sysctl.c
+++ b/fs/nfs/sysctl.c
@@ -3,7 +3,6 @@
 *
 * Sysctl interface to NFS parameters
 */
-#include <linux/config.h>
 #include <linux/types.h>
 #include <linux/linkage.h>
 #include <linux/ctype.h>
@@ -12,6 +11,7 @@
 #include <linux/module.h>
 #include <linux/nfs4.h>
 #include <linux/nfs_idmap.h>
+#include <linux/nfs_fs.h>
 #include "callback.h"
@@ -46,6 +46,15 @@ static ctl_table nfs_cb_sysctls[] = {
                .strategy = &sysctl_jiffies,
        },
 #endif
+        {
+                .ctl_name       = CTL_UNNUMBERED,
+                .procname       = "nfs_mountpoint_timeout",
+                .data           = &nfs_mountpoint_expiry_timeout,
+                .maxlen         = sizeof(nfs_mountpoint_expiry_timeout),
+                .mode           = 0644,
+                .proc_handler   = &proc_dointvec_jiffies,
+                .strategy       = &sysctl_jiffies,
+        },
        { .ctl_name = 0 }
 };
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 4cfada2cc09f..bca5734ca9fb 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -46,7 +46,6 @@
 * Copyright (C) 1996, 1997, Olaf Kirch <okir@monad.swb.de>
 */
-#include <linux/config.h>
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/mm.h>
@@ -98,11 +97,10 @@ struct nfs_write_data *nfs_commit_alloc(unsigned int pagecount)
        if (p) {
                memset(p, 0, sizeof(*p));
                INIT_LIST_HEAD(&p->pages);
-                if (pagecount < NFS_PAGEVEC_SIZE)
+                if (pagecount <= ARRAY_SIZE(p->page_array))
-                        p->pagevec = &p->page_array[0];
+                        p->pagevec = p->page_array;
                else {
-                        size_t size = ++pagecount * sizeof(struct page *);
+                        p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_NOFS);
-                        p->pagevec = kzalloc(size, GFP_NOFS);
                        if (!p->pagevec) {
                                mempool_free(p, nfs_commit_mempool);
                                p = NULL;
@@ -126,14 +124,11 @@ struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount)
        if (p) {
                memset(p, 0, sizeof(*p));
                INIT_LIST_HEAD(&p->pages);
-                if (pagecount < NFS_PAGEVEC_SIZE)
+                if (pagecount <= ARRAY_SIZE(p->page_array))
-                        p->pagevec = &p->page_array[0];
+                        p->pagevec = p->page_array;
                else {
-                        size_t size = ++pagecount * sizeof(struct page *);
+                        p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_NOFS);
-                        p->pagevec = kmalloc(size, GFP_NOFS);
+                        if (!p->pagevec) {
-                        if (p->pagevec) {
-                                memset(p->pagevec, 0, size);
-                        } else {
                                mempool_free(p, nfs_wdata_mempool);
                                p = NULL;
                        }
@@ -501,7 +496,7 @@ nfs_mark_request_dirty(struct nfs_page *req)
        nfs_list_add_request(req, &nfsi->dirty);
        nfsi->ndirty++;
        spin_unlock(&nfsi->req_lock);
-        inc_page_state(nr_dirty);
+        inc_zone_page_state(req->wb_page, NR_FILE_DIRTY);
        mark_inode_dirty(inode);
 }
@@ -529,7 +524,7 @@ nfs_mark_request_commit(struct nfs_page *req)
        nfs_list_add_request(req, &nfsi->commit);
        nfsi->ncommit++;
        spin_unlock(&nfsi->req_lock);
-        inc_page_state(nr_unstable);
+        inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
        mark_inode_dirty(inode);
 }
 #endif
@@ -583,6 +578,17 @@ static int nfs_wait_on_requests(struct inode *inode, unsigned long idx_start, un
        return ret;
 }
+/*
+ * Drain @head: each request is taken off the list, removed from its inode
+ * and has its page writeback state cleared.  The list is empty on return.
+ */
+static void nfs_cancel_requests(struct list_head *head)
+{
+        while (!list_empty(head)) {
+                struct nfs_page *victim = nfs_list_entry(head->next);
+                nfs_list_remove_request(victim);
+                nfs_inode_remove_request(victim);
+                nfs_clear_page_writeback(victim);
+        }
+}
 /*
 * nfs_scan_dirty - Scan an inode for dirty requests
 * @inode: NFS inode to scan
@@ -602,7 +608,6 @@ nfs_scan_dirty(struct inode *inode, struct list_head *dst, unsigned long idx_sta
        if (nfsi->ndirty != 0) {
                res = nfs_scan_lock_dirty(nfsi, dst, idx_start, npages);
                nfsi->ndirty -= res;
-                sub_page_state(nr_dirty,res);
                if ((nfsi->ndirty == 0) != list_empty(&nfsi->dirty))
                        printk(KERN_ERR "NFS: desynchronized value of nfs_i.ndirty.\n");
        }
@@ -627,7 +632,7 @@ nfs_scan_commit(struct inode *inode, struct list_head *dst, unsigned long idx_st
        int res = 0;
        if (nfsi->ncommit != 0) {
-                res = nfs_scan_list(&nfsi->commit, dst, idx_start, npages);
+                res = nfs_scan_list(nfsi, &nfsi->commit, dst, idx_start, npages);
                nfsi->ncommit -= res;
                if ((nfsi->ncommit == 0) != list_empty(&nfsi->commit))
                        printk(KERN_ERR "NFS: desynchronized value of nfs_i.ncommit.\n");
@@ -1387,7 +1392,6 @@ static void nfs_commit_done(struct rpc_task *task, void *calldata)
 {
        struct nfs_write_data   *data = calldata;
        struct nfs_page         *req;
-        int res = 0;
        dprintk("NFS: %4d nfs_commit_done (status %d)\n",
                                task->tk_pid, task->tk_status);
@@ -1399,6 +1403,7 @@ static void nfs_commit_done(struct rpc_task *task, void *calldata)
        while (!list_empty(&data->pages)) {
                req = nfs_list_entry(data->pages.next);
                nfs_list_remove_request(req);
+                dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
                dprintk("NFS: commit (%s/%Ld %d@%Ld)",
                        req->wb_context->dentry->d_inode->i_sb->s_id,
@@ -1425,9 +1430,7 @@ static void nfs_commit_done(struct rpc_task *task, void *calldata)
                nfs_mark_request_dirty(req);
        next:
                nfs_clear_page_writeback(req);
-                res++;
        }
-        sub_page_state(nr_unstable,res);
 }
 static const struct rpc_call_ops nfs_commit_ops = {
@@ -1495,15 +1498,25 @@ int nfs_sync_inode_wait(struct inode *inode, unsigned long idx_start,
                pages = nfs_scan_dirty(inode, &head, idx_start, npages);
                if (pages != 0) {
                        spin_unlock(&nfsi->req_lock);
-                        ret = nfs_flush_list(inode, &head, pages, how);
+                        if (how & FLUSH_INVALIDATE)
+                                nfs_cancel_requests(&head);
+                        else
+                                ret = nfs_flush_list(inode, &head, pages, how);
                        spin_lock(&nfsi->req_lock);
                        continue;
                }
                if (nocommit)
                        break;
-                pages = nfs_scan_commit(inode, &head, 0, 0);
+                pages = nfs_scan_commit(inode, &head, idx_start, npages);
                if (pages == 0)
                        break;
+                if (how & FLUSH_INVALIDATE) {
+                        spin_unlock(&nfsi->req_lock);
+                        nfs_cancel_requests(&head);
+                        spin_lock(&nfsi->req_lock);
+                        continue;
+                }
+                pages += nfs_scan_commit(inode, &head, 0, 0);
                spin_unlock(&nfsi->req_lock);
                ret = nfs_commit_list(inode, &head, how);
                spin_lock(&nfsi->req_lock);
@@ -1512,7 +1525,7 @@ int nfs_sync_inode_wait(struct inode *inode, unsigned long idx_start,
        return ret;
 }
-int nfs_init_writepagecache(void)
+int __init nfs_init_writepagecache(void)
 {
        nfs_wdata_cachep = kmem_cache_create("nfs_write_data",
                                             sizeof(struct nfs_write_data),
diff --git a/fs/nfsctl.c b/fs/nfsctl.c
index a5a18d4aca40..c043136a82ca 100644
--- a/fs/nfsctl.c
+++ b/fs/nfsctl.c
@@ -4,7 +4,6 @@
 *      This should eventually move to userland.
 *
 */
-#include <linux/config.h>
 #include <linux/types.h>
 #include <linux/file.h>
 #include <linux/fs.h>
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 3eec30000f3f..01bc68c628ad 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -126,7 +126,7 @@ static int expkey_parse(struct cache_detail *cd, char *mesg, int mlen)
        if (*ep)
                goto out;
        dprintk("found fsidtype %d\n", fsidtype);
-        if (fsidtype > 2)
+        if (key_len(fsidtype)==0) /* invalid type */
                goto out;
        if ((len=qword_get(&mesg, buf, PAGE_SIZE)) <= 0)
                goto out;
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index dbaf3f93f328..54b37b1d2e3a 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -33,7 +33,6 @@
 *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
-#include <linux/config.h>
 #include <linux/module.h>
 #include <linux/list.h>
 #include <linux/inet.h>
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index 4b6aa60dfceb..bea6b9478114 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -34,7 +34,6 @@
 *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
-#include <linux/config.h>
 #include <linux/module.h>
 #include <linux/init.h>
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 96c7578cbe1e..9daa0b9feb8d 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -123,7 +123,7 @@ static void release_stateid(struct nfs4_stateid *stp, int flags);
 */
 /* recall_lock protects the del_recall_lru */
-static spinlock_t recall_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(recall_lock);
 static struct list_head del_recall_lru;
 static void
@@ -529,8 +529,7 @@ move_to_confirmed(struct nfs4_client *clp)
        dprintk("NFSD: move_to_confirm nfs4_client %p\n", clp);
        list_del_init(&clp->cl_strhash);
-        list_del_init(&clp->cl_idhash);
+        list_move(&clp->cl_idhash, &conf_id_hashtbl[idhashval]);
-        list_add(&clp->cl_idhash, &conf_id_hashtbl[idhashval]);
        strhashval = clientstr_hashval(clp->cl_recdir);
        list_add(&clp->cl_strhash, &conf_str_hashtbl[strhashval]);
        renew_client(clp);
@@ -1238,8 +1237,15 @@ find_file(struct inode *ino)
        return NULL;
 }
-#define TEST_ACCESS(x) ((x > 0 || x < 4)?1:0)
+static int access_valid(u32 x)
-#define TEST_DENY(x) ((x >= 0 || x < 5)?1:0)
+{
+        return (x > 0 && x < 4);
+}
+static int deny_valid(u32 x)
+{
+        return x < 5;   /* u32 is never negative, so only the upper bound needs checking */
+}
 static void
 set_access(unsigned int *access, unsigned long bmap) {
@@ -1746,7 +1752,8 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
        int status;
        status = nfserr_inval;
-        if (!TEST_ACCESS(open->op_share_access) || !TEST_DENY(open->op_share_deny))
+        if (!access_valid(open->op_share_access)
+                        || !deny_valid(open->op_share_deny))
                goto out;
        /*
         * Lookup file; if found, lookup stateid and check open request,
@@ -1783,10 +1790,10 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
        } else {
                /* Stateid was not found, this is a new OPEN */
                int flags = 0;
+                if (open->op_share_access & NFS4_SHARE_ACCESS_READ)
+                        flags |= MAY_READ;
                if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE)
-                        flags = MAY_WRITE;
+                        flags |= MAY_WRITE;
-                else
-                        flags = MAY_READ;
                status = nfs4_new_open(rqstp, &stp, dp, current_fh, flags);
                if (status)
                        goto out;
@@ -2071,16 +2078,12 @@ nfs4_preprocess_stateid_op(struct svc_fh *current_fh, stateid_t *stateid, int fl
        if (!stateid->si_fileid) { /* delegation stateid */
                if(!(dp = find_delegation_stateid(ino, stateid))) {
                        dprintk("NFSD: delegation stateid not found\n");
-                        if (nfs4_in_grace())
-                                status = nfserr_grace;
                        goto out;
                }
                stidp = &dp->dl_stateid;
        } else { /* open or lock stateid */
                if (!(stp = find_stateid(stateid, flags))) {
                        dprintk("NFSD: open or lock stateid not found\n");
-                        if (nfs4_in_grace())
-                                status = nfserr_grace;
                        goto out;
                }
                if ((flags & CHECK_FH) && nfs4_check_fh(current_fh, stp))
@@ -2253,8 +2256,9 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfs
                        (int)current_fh->fh_dentry->d_name.len,
                        current_fh->fh_dentry->d_name.name);
-        if ((status = fh_verify(rqstp, current_fh, S_IFREG, 0)))
+        status = fh_verify(rqstp, current_fh, S_IFREG, 0);
-                goto out;
+        if (status)
+                return status;
        nfs4_lock_state();
@@ -2321,7 +2325,8 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct n
                        (int)current_fh->fh_dentry->d_name.len,
                        current_fh->fh_dentry->d_name.name);
-        if (!TEST_ACCESS(od->od_share_access) || !TEST_DENY(od->od_share_deny))
+        if (!access_valid(od->od_share_access)
+                        || !deny_valid(od->od_share_deny))
                return nfserr_inval;
        nfs4_lock_state();
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index de3998f15f10..5446a0861d1d 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1310,7 +1310,7 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
        if ((bmval0 & (FATTR4_WORD0_FILES_FREE | FATTR4_WORD0_FILES_TOTAL)) ||
            (bmval1 & (FATTR4_WORD1_SPACE_AVAIL | FATTR4_WORD1_SPACE_FREE |
                       FATTR4_WORD1_SPACE_TOTAL))) {
-                status = vfs_statfs(dentry->d_inode->i_sb, &statfs);
+                status = vfs_statfs(dentry, &statfs);
                if (status)
                        goto out_nfserr;
        }
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index d852ebb538e3..fdf7cf3dfadc 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -103,8 +103,7 @@ nfsd_cache_shutdown(void)
 static void
 lru_put_end(struct svc_cacherep *rp)
 {
-        list_del(&rp->c_lru);
+        list_move_tail(&rp->c_lru, &lru_head);
-        list_add_tail(&rp->c_lru, &lru_head);
 }
 /*
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 3ef017b3b5bd..7046ac9cf97f 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -6,7 +6,6 @@
 * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
 */
-#include <linux/config.h>
 #include <linux/module.h>
 #include <linux/linkage.h>
@@ -494,10 +493,10 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
        return simple_fill_super(sb, 0x6e667364, nfsd_files);
 }
-static struct super_block *nfsd_get_sb(struct file_system_type *fs_type,
+static int nfsd_get_sb(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data)
+        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-        return get_sb_single(fs_type, flags, data, nfsd_fill_super);
+        return get_sb_single(fs_type, flags, data, nfsd_fill_super, mnt);
 }
 static struct file_system_type nfsd_fs_type = {
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 3f2ec2e6d06c..ecc439d2565f 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -187,13 +187,6 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access)
                        goto out;
                }
-                /* Set user creds for this exportpoint */
-                error = nfsd_setuser(rqstp, exp);
-                if (error) {
-                        error = nfserrno(error);
-                        goto out;
-                }
                /*
                 * Look up the dentry using the NFS file handle.
                 */
@@ -251,6 +244,14 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access)
        }
        cache_get(&exp->h);
+        /* Set user creds for this exportpoint; necessary even in the "just
+         * checking" case because this may be a filehandle that was created by
+         * fh_compose, and that is about to be used in another nfsv4 compound
+         * operation */
+        error = nfserrno(nfsd_setuser(rqstp, exp));
+        if (error)
+                goto out;
        error = nfsd_mode_check(rqstp, dentry->d_inode->i_mode, type);
        if (error)
                goto out;
@@ -312,8 +313,8 @@ int
 fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry, struct svc_fh *ref_fh)
 {
        /* ref_fh is a reference file handle.
-         * if it is non-null, then we should compose a filehandle which is
+         * if it is non-null and for the same filesystem, then we should compose
-         * of the same version, where possible.
+         * a filehandle which is of the same version, where possible.
         * Currently, that means that if ref_fh->fh_handle.fh_version == 0xca
         * Then create a 32byte filehandle using nfs_fhbase_old
         *
@@ -332,7 +333,7 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry, st
                parent->d_name.name, dentry->d_name.name,
                (inode ? inode->i_ino : 0));
-        if (ref_fh) {
+        if (ref_fh && ref_fh->fh_export == exp) {
                ref_fh_version = ref_fh->fh_handle.fh_version;
                if (ref_fh_version == 0xca)
                        ref_fh_fsid_type = 0;
@@ -461,7 +462,7 @@ fh_update(struct svc_fh *fhp)
        } else {
                int size;
                if (fhp->fh_handle.fh_fileid_type != 0)
-                        goto out_uptodate;
+                        goto out;
                datap = fhp->fh_handle.fh_auth+
                        fhp->fh_handle.fh_size/4 -1;
                size = (fhp->fh_maxsize - fhp->fh_handle.fh_size)/4;
@@ -481,10 +482,6 @@ out_negative:
        printk(KERN_ERR "fh_update: %s/%s still negative!\n",
                dentry->d_parent->d_name.name, dentry->d_name.name);
        goto out;
-out_uptodate:
-        printk(KERN_ERR "fh_update: %s/%s already up-to-date!\n",
-                dentry->d_parent->d_name.name, dentry->d_name.name);
-        goto out;
 }
 /*
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 3790727e5dfd..ec1decf29bab 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -8,7 +8,6 @@
 * Copyright (C) 1995, 1996, 1997 Olaf Kirch <okir@monad.swb.de>
 */
-#include <linux/config.h>
 #include <linux/module.h>
 #include <linux/time.h>
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 1d65f13f458c..c9e3b5a8fe07 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -16,7 +16,6 @@
 * Zerocpy NFS support (C) 2002 Hirokazu Takahashi <taka@valinux.co.jp>
 */
-#include <linux/config.h>
 #include <linux/string.h>
 #include <linux/time.h>
 #include <linux/errno.h>
@@ -673,7 +672,10 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
                goto out_nfserr;
        if (access & MAY_WRITE) {
-                flags = O_WRONLY|O_LARGEFILE;
+                if (access & MAY_READ)
+                        flags = O_RDWR|O_LARGEFILE;
+                else
+                        flags = O_WRONLY|O_LARGEFILE;
                DQUOT_INIT(inode);
        }
@@ -834,7 +836,7 @@ nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
        if (ra && ra->p_set)
                file->f_ra = ra->p_ra;
-        if (file->f_op->sendfile) {
+        if (file->f_op->sendfile && rqstp->rq_sendfile_ok) {
                svc_pushback_unused_pages(rqstp);
                err = file->f_op->sendfile(file, &offset, *count,
                                                 nfsd_read_actor, rqstp);
@@ -1517,14 +1519,15 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
                        err = nfserrno(err);
        }
-        fh_unlock(ffhp);
        dput(dnew);
+out_unlock:
+        fh_unlock(ffhp);
 out:
        return err;
 out_nfserr:
        err = nfserrno(err);
-        goto out;
+        goto out_unlock;
 }
 /*
@@ -1553,7 +1556,7 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
        tdir = tdentry->d_inode;
        err = (rqstp->rq_vers == 2) ? nfserr_acces : nfserr_xdev;
-        if (fdir->i_sb != tdir->i_sb)
+        if (ffhp->fh_export != tfhp->fh_export)
                goto out;
        err = nfserr_perm;
@@ -1737,7 +1740,7 @@ int
 nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat)
 {
        int err = fh_verify(rqstp, fhp, 0, MAY_NOP);
-        if (!err && vfs_statfs(fhp->fh_dentry->d_inode->i_sb,stat))
+        if (!err && vfs_statfs(fhp->fh_dentry,stat))
                err = nfserr_io;
        return err;
 }
diff --git a/fs/nls/nls_base.c b/fs/nls/nls_base.c
index a912debcd20b..9de6b495f112 100644
--- a/fs/nls/nls_base.c
+++ b/fs/nls/nls_base.c
@@ -10,7 +10,6 @@
 #include <linux/module.h>
 #include <linux/string.h>
-#include <linux/config.h>
 #include <linux/nls.h>
 #include <linux/kernel.h>
 #include <linux/errno.h>
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index 580412d330cb..bc579bfdfbd8 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -1544,7 +1544,7 @@ err_out:
 /**
 * ntfs_aops - general address space operations for inodes and attributes
 */
-struct address_space_operations ntfs_aops = {
+const struct address_space_operations ntfs_aops = {
        .readpage       = ntfs_readpage,        /* Fill page with data. */
        .sync_page      = block_sync_page,      /* Currently, just unplugs the
                                                   disk request queue. */
@@ -1560,7 +1560,7 @@ struct address_space_operations ntfs_aops = {
 * ntfs_mst_aops - general address space operations for mst protecteed inodes
 *                 and attributes
 */
-struct address_space_operations ntfs_mst_aops = {
+const struct address_space_operations ntfs_mst_aops = {
        .readpage       = ntfs_readpage,        /* Fill page with data. */
        .sync_page      = block_sync_page,      /* Currently, just unplugs the
                                                   disk request queue. */
diff --git a/fs/ntfs/aops.h b/fs/ntfs/aops.h
index 3b74e66ca2ff..325ce261a107 100644
--- a/fs/ntfs/aops.h
+++ b/fs/ntfs/aops.h
@@ -86,8 +86,7 @@ static inline void ntfs_unmap_page(struct page *page)
 static inline struct page *ntfs_map_page(struct address_space *mapping,
                unsigned long index)
 {
-        struct page *page = read_cache_page(mapping, index,
+        struct page *page = read_mapping_page(mapping, index, NULL);
-                        (filler_t*)mapping->a_ops->readpage, NULL);
        if (!IS_ERR(page)) {
                wait_on_page_locked(page);
diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c
index 1663f5c3c6aa..6708e1d68a9e 100644
--- a/fs/ntfs/attrib.c
+++ b/fs/ntfs/attrib.c
@@ -2529,8 +2529,7 @@ int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, const u8 val)
        end >>= PAGE_CACHE_SHIFT;
        /* If there is a first partial page, need to do it the slow way. */
        if (start_ofs) {
-                page = read_cache_page(mapping, idx,
+                page = read_mapping_page(mapping, idx, NULL);
-                                (filler_t*)mapping->a_ops->readpage, NULL);
                if (IS_ERR(page)) {
                        ntfs_error(vol->sb, "Failed to read first partial "
                                        "page (sync error, index 0x%lx).", idx);
@@ -2600,8 +2599,7 @@ int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, const u8 val)
        }
        /* If there is a last partial page, need to do it the slow way. */
        if (end_ofs) {
-                page = read_cache_page(mapping, idx,
+                page = read_mapping_page(mapping, idx, NULL);
-                                (filler_t*)mapping->a_ops->readpage, NULL);
                if (IS_ERR(page)) {
                        ntfs_error(vol->sb, "Failed to read last partial page "
                                        "(sync error, index 0x%lx).", idx);
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index c63a83e8da98..2e42c2dcae12 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -231,8 +231,7 @@ do_non_resident_extend:
                 * Read the page.  If the page is not present, this will zero
                 * the uninitialized regions for us.
                 */
-                page = read_cache_page(mapping, index,
+                page = read_mapping_page(mapping, index, NULL);
-                                (filler_t*)mapping->a_ops->readpage, NULL);
                if (IS_ERR(page)) {
                        err = PTR_ERR(page);
                        goto init_err_out;
@@ -1359,7 +1358,7 @@ err_out:
        goto out;
 }
-static size_t __ntfs_copy_from_user_iovec(char *vaddr,
+static size_t __ntfs_copy_from_user_iovec_inatomic(char *vaddr,
                const struct iovec *iov, size_t iov_ofs, size_t bytes)
 {
        size_t total = 0;
@@ -1377,10 +1376,6 @@ static size_t __ntfs_copy_from_user_iovec(char *vaddr,
                bytes -= len;
                vaddr += len;
                if (unlikely(left)) {
-                        /*
-                         * Zero the rest of the target like __copy_from_user().
-                         */
-                        memset(vaddr, 0, bytes);
                        total -= left;
                        break;
                }
@@ -1421,11 +1416,13 @@ static inline void ntfs_set_next_iovec(const struct iovec **iovp,
 * pages (out to offset + bytes), to emulate ntfs_copy_from_user()'s
 * single-segment behaviour.
 *
- * We call the same helper (__ntfs_copy_from_user_iovec()) both when atomic and
+ * We call the same helper (__ntfs_copy_from_user_iovec_inatomic()) both
- * when not atomic.  This is ok because __ntfs_copy_from_user_iovec() calls
+ * when atomic and when not atomic.  This is ok because
- * __copy_from_user_inatomic() and it is ok to call this when non-atomic.  In
+ * __ntfs_copy_from_user_iovec_inatomic() calls __copy_from_user_inatomic()
- * fact, the only difference between __copy_from_user_inatomic() and
+ * and it is ok to call this when non-atomic.
- * __copy_from_user() is that the latter calls might_sleep().  And on many
+ * In fact, the only difference between __copy_from_user_inatomic() and
+ * __copy_from_user() is that the latter calls might_sleep() and the former
+ * should not zero the tail of the buffer on error.  And on many
 * architectures __copy_from_user_inatomic() is just defined to
 * __copy_from_user() so it makes no difference at all on those architectures.
 */
@@ -1442,14 +1439,18 @@ static inline size_t ntfs_copy_from_user_iovec(struct page **pages,
                if (len > bytes)
                        len = bytes;
                kaddr = kmap_atomic(*pages, KM_USER0);
-                copied = __ntfs_copy_from_user_iovec(kaddr + ofs,
+                copied = __ntfs_copy_from_user_iovec_inatomic(kaddr + ofs,
                                *iov, *iov_ofs, len);
                kunmap_atomic(kaddr, KM_USER0);
                if (unlikely(copied != len)) {
                        /* Do it the slow way. */
                        kaddr = kmap(*pages);
-                        copied = __ntfs_copy_from_user_iovec(kaddr + ofs,
+                        copied = __ntfs_copy_from_user_iovec_inatomic(kaddr + ofs,
                                        *iov, *iov_ofs, len);
+                        /*
+                         * Zero the rest of the target like __copy_from_user().
+                         */
+                        memset(kaddr + ofs + copied, 0, len - copied);
                        kunmap(*pages);
                        if (unlikely(copied != len))
                                goto err_out;
@@ -1484,14 +1485,15 @@ static inline void ntfs_flush_dcache_pages(struct page **pages,
                unsigned nr_pages)
 {
        BUG_ON(!nr_pages);
+        /*
+         * Warning: Do not do the decrement at the same time as the call to
+         * flush_dcache_page() because it is a NULL macro on i386 and hence the
+         * decrement never happens so the loop never terminates.
+         */
        do {
-                /*
+                --nr_pages;
-                 * Warning: Do not do the decrement at the same time as the
-                 * call because flush_dcache_page() is a NULL macro on i386
-                 * and hence the decrement never happens.
-                 */
                flush_dcache_page(pages[nr_pages]);
-        } while (--nr_pages > 0);
+        } while (nr_pages > 0);
 }
 /**
diff --git a/fs/ntfs/ntfs.h b/fs/ntfs/ntfs.h
index bf7b3d7c0930..ddd3d503097c 100644
--- a/fs/ntfs/ntfs.h
+++ b/fs/ntfs/ntfs.h
@@ -57,8 +57,8 @@ extern struct kmem_cache *ntfs_attr_ctx_cache;
 extern struct kmem_cache *ntfs_index_ctx_cache;
 /* The various operations structs defined throughout the driver files. */
-extern struct address_space_operations ntfs_aops;
+extern const struct address_space_operations ntfs_aops;
-extern struct address_space_operations ntfs_mst_aops;
+extern const struct address_space_operations ntfs_mst_aops;
 extern const struct  file_operations ntfs_file_ops;
 extern struct inode_operations ntfs_file_inode_ops;
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 27833f6df49f..0e14acea3f8b 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -2601,10 +2601,10 @@ static unsigned long __get_nr_free_mft_records(ntfs_volume *vol,
 /**
 * ntfs_statfs - return information about mounted NTFS volume
- * @sb:         super block of mounted volume
+ * @dentry:     dentry from mounted volume
 * @sfs:        statfs structure in which to return the information
 *
- * Return information about the mounted NTFS volume @sb in the statfs structure
+ * Return information about the mounted NTFS volume containing @dentry in the statfs structure
 * pointed to by @sfs (this is initialized with zeros before ntfs_statfs is
 * called). We interpret the values to be correct of the moment in time at
 * which we are called. Most values are variable otherwise and this isn't just
@@ -2617,8 +2617,9 @@ static unsigned long __get_nr_free_mft_records(ntfs_volume *vol,
 *
 * Return 0 on success or -errno on error.
 */
-static int ntfs_statfs(struct super_block *sb, struct kstatfs *sfs)
+static int ntfs_statfs(struct dentry *dentry, struct kstatfs *sfs)
 {
+        struct super_block *sb = dentry->d_sb;
        s64 size;
        ntfs_volume *vol = NTFS_SB(sb);
        ntfs_inode *mft_ni = NTFS_I(vol->mft_ino);
@@ -3093,10 +3094,11 @@ struct kmem_cache *ntfs_index_ctx_cache;
 /* Driver wide mutex. */
 DEFINE_MUTEX(ntfs_lock);
-static struct super_block *ntfs_get_sb(struct file_system_type *fs_type,
+static int ntfs_get_sb(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data)
+        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-        return get_sb_bdev(fs_type, flags, dev_name, data, ntfs_fill_super);
+        return get_sb_bdev(fs_type, flags, dev_name, data, ntfs_fill_super,
+                           mnt);
 }
 static struct file_system_type ntfs_fs_type = {
diff --git a/fs/ntfs/sysctl.h b/fs/ntfs/sysctl.h
index c8064cae8f17..beda5bf96405 100644
--- a/fs/ntfs/sysctl.h
+++ b/fs/ntfs/sysctl.h
@@ -24,7 +24,6 @@
 #ifndef _LINUX_NTFS_SYSCTL_H
 #define _LINUX_NTFS_SYSCTL_H
-#include <linux/config.h>
 #if defined(DEBUG) && defined(CONFIG_SYSCTL)
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 47152bf9a7f2..f1d1c342ce01 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -558,16 +558,9 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
        u64 vbo_max; /* file offset, max_blocks from iblock */
        u64 p_blkno;
        int contig_blocks;
-        unsigned char blocksize_bits;
+        unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
        unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits;
-        if (!inode || !bh_result) {
-                mlog(ML_ERROR, "inode or bh_result is null\n");
-                return -EIO;
-        }
-        blocksize_bits = inode->i_sb->s_blocksize_bits;
        /* This function won't even be called if the request isn't all
         * nicely aligned and of the right size, so there's no need
         * for us to check any of that. */
@@ -666,7 +659,7 @@ out:
        return ret;
 }
-struct address_space_operations ocfs2_aops = {
+const struct address_space_operations ocfs2_aops = {
        .readpage       = ocfs2_readpage,
        .writepage      = ocfs2_writepage,
        .prepare_write  = ocfs2_prepare_write,
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 21f38accd039..504595d6cf65 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -54,7 +54,7 @@ static DECLARE_RWSEM(o2hb_callback_sem);
 * multiple hb threads are watching multiple regions.  A node is live
 * whenever any of the threads sees activity from the node in its region.
 */
-static spinlock_t o2hb_live_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(o2hb_live_lock);
 static struct list_head o2hb_live_slots[O2NM_MAX_NODES];
 static unsigned long o2hb_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
 static LIST_HEAD(o2hb_node_events);
@@ -517,6 +517,7 @@ static inline void o2hb_prepare_block(struct o2hb_region *reg,
        hb_block->hb_seq = cpu_to_le64(cputime);
        hb_block->hb_node = node_num;
        hb_block->hb_generation = cpu_to_le64(generation);
+        hb_block->hb_dead_ms = cpu_to_le32(o2hb_dead_threshold * O2HB_REGION_TIMEOUT_MS);
        /* This step must always happen last! */
        hb_block->hb_cksum = cpu_to_le32(o2hb_compute_block_crc_le(reg,
@@ -645,6 +646,8 @@ static int o2hb_check_slot(struct o2hb_region *reg,
        struct o2nm_node *node;
        struct o2hb_disk_heartbeat_block *hb_block = reg->hr_tmp_block;
        u64 cputime;
+        unsigned int dead_ms = o2hb_dead_threshold * O2HB_REGION_TIMEOUT_MS;
+        unsigned int slot_dead_ms;
        memcpy(hb_block, slot->ds_raw_block, reg->hr_block_bytes);
@@ -733,6 +736,23 @@ fire_callbacks:
                              &o2hb_live_slots[slot->ds_node_num]);
                slot->ds_equal_samples = 0;
+                /* We want to be sure that all nodes agree on the
+                 * number of milliseconds before a node will be
+                 * considered dead. The self-fencing timeout is
+                 * computed from this value, and a discrepancy might
+                 * result in heartbeat calling a node dead when it
+                 * hasn't self-fenced yet. */
+                slot_dead_ms = le32_to_cpu(hb_block->hb_dead_ms);
+                if (slot_dead_ms && slot_dead_ms != dead_ms) {
+                        /* TODO: Perhaps we can fail the region here. */
+                        mlog(ML_ERROR, "Node %d on device %s has a dead count "
+                             "of %u ms, but our count is %u ms.\n"
+                             "Please double check your configuration values "
+                             "for 'O2CB_HEARTBEAT_THRESHOLD'\n",
+                             slot->ds_node_num, reg->hr_dev_name, slot_dead_ms,
+                             dead_ms);
+                }
                goto out;
        }
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index 73edad782537..a42628ba9ddf 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -123,6 +123,17 @@
 #define MLOG_MASK_PREFIX 0
 #endif
+/*
+ * When logging is disabled, force the bit test to 0 for anything other
+ * than errors and notices, allowing gcc to remove the code completely.
+ * When enabled, allow all masks.
+ */
+#if defined(CONFIG_OCFS2_DEBUG_MASKLOG)
+#define ML_ALLOWED_BITS ~0
+#else
+#define ML_ALLOWED_BITS (ML_ERROR|ML_NOTICE)
+#endif
 #define MLOG_MAX_BITS 64
 struct mlog_bits {
@@ -187,7 +198,8 @@ extern struct mlog_bits mlog_and_bits, mlog_not_bits;
 #define mlog(mask, fmt, args...) do {                                   \
        u64 __m = MLOG_MASK_PREFIX | (mask);                            \
-        if (__mlog_test_u64(__m, mlog_and_bits) &&                      \
+        if ((__m & ML_ALLOWED_BITS) &&                                  \
+            __mlog_test_u64(__m, mlog_and_bits) &&                      \
            !__mlog_test_u64(__m, mlog_not_bits)) {                     \
                if (__m & ML_ERROR)                                     \
                        __mlog_printk(KERN_ERR, "ERROR: "fmt , ##args); \
@@ -204,6 +216,7 @@ extern struct mlog_bits mlog_and_bits, mlog_not_bits;
                mlog(ML_ERROR, "status = %lld\n", (long long)_st);      \
 } while (0)
+#if defined(CONFIG_OCFS2_DEBUG_MASKLOG)
 #define mlog_entry(fmt, args...) do {                                   \
        mlog(ML_ENTRY, "ENTRY:" fmt , ##args);                          \
 } while (0)
@@ -247,6 +260,13 @@ extern struct mlog_bits mlog_and_bits, mlog_not_bits;
 #define mlog_exit_void() do {                                           \
        mlog(ML_EXIT, "EXIT\n");                                        \
 } while (0)
+#else
+#define mlog_entry(...)  do { } while (0)
+#define mlog_entry_void(...)  do { } while (0)
+#define mlog_exit(...)  do { } while (0)
+#define mlog_exit_ptr(...)  do { } while (0)
+#define mlog_exit_void(...)  do { } while (0)
+#endif  /* defined(CONFIG_OCFS2_DEBUG_MASKLOG) */
 #define mlog_bug_on_msg(cond, fmt, args...) do {                        \
        if (cond) {                                                     \
diff --git a/fs/ocfs2/cluster/ocfs2_heartbeat.h b/fs/ocfs2/cluster/ocfs2_heartbeat.h
index 94096069cb43..3f4151da9709 100644
--- a/fs/ocfs2/cluster/ocfs2_heartbeat.h
+++ b/fs/ocfs2/cluster/ocfs2_heartbeat.h
@@ -32,6 +32,7 @@ struct o2hb_disk_heartbeat_block {
        __u8  hb_pad1[3];
        __le32 hb_cksum;
        __le64 hb_generation;
+        __le32 hb_dead_ms;
 };
 #endif /* _OCFS2_HEARTBEAT_H */
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 0f60cc0d3985..b650efa8c8be 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -108,7 +108,7 @@
            ##args);                                                    \
 } while (0)
-static rwlock_t o2net_handler_lock = RW_LOCK_UNLOCKED;
+static DEFINE_RWLOCK(o2net_handler_lock);
 static struct rb_root o2net_handler_tree = RB_ROOT;
 static struct o2net_node o2net_nodes[O2NM_MAX_NODES];
@@ -396,8 +396,8 @@ static void o2net_set_nn_state(struct o2net_node *nn,
        }
        if (was_valid && !valid) {
-                mlog(ML_NOTICE, "no longer connected to " SC_NODEF_FMT "\n",
+                printk(KERN_INFO "o2net: no longer connected to "
-                     SC_NODEF_ARGS(old_sc));
+                       SC_NODEF_FMT "\n", SC_NODEF_ARGS(old_sc));
                o2net_complete_nodes_nsw(nn);
        }
@@ -409,10 +409,10 @@ static void o2net_set_nn_state(struct o2net_node *nn,
                 * the only way to start connecting again is to down
                 * heartbeat and bring it back up. */
                cancel_delayed_work(&nn->nn_connect_expired);
-                mlog(ML_NOTICE, "%s " SC_NODEF_FMT "\n", 
+                printk(KERN_INFO "o2net: %s " SC_NODEF_FMT "\n",
-                     o2nm_this_node() > sc->sc_node->nd_num ?
+                       o2nm_this_node() > sc->sc_node->nd_num ?
-                        "connected to" : "accepted connection from",
+                                "connected to" : "accepted connection from",
-                     SC_NODEF_ARGS(sc));
+                       SC_NODEF_ARGS(sc));
        }
        /* trigger the connecting worker func as long as we're not valid,
@@ -1280,7 +1280,7 @@ static void o2net_idle_timer(unsigned long data)
        do_gettimeofday(&now);
-        mlog(ML_NOTICE, "connection to " SC_NODEF_FMT " has been idle for 10 "
+        printk(KERN_INFO "o2net: connection to " SC_NODEF_FMT " has been idle for 10 "
             "seconds, shutting it down.\n", SC_NODEF_ARGS(sc));
        mlog(ML_NOTICE, "here are some times that might help debug the "
             "situation: (tmr %ld.%ld now %ld.%ld dr %ld.%ld adv "
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index ae47f450792f..3d494d1a5f36 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -213,11 +213,9 @@ int ocfs2_find_files_on_disk(const char *name,
                             struct ocfs2_dir_entry **dirent)
 {
        int status = -ENOENT;
-        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-        mlog_entry("(osb=%p, parent=%llu, name='%.*s', blkno=%p, inode=%p)\n",
+        mlog_entry("(name=%.*s, blkno=%p, inode=%p, dirent_bh=%p, dirent=%p)\n",
-                   osb, (unsigned long long)OCFS2_I(inode)->ip_blkno,
+                   namelen, name, blkno, inode, dirent_bh, dirent);
-                   namelen, name, blkno, inode);
        *dirent_bh = ocfs2_find_entry(name, namelen, inode, dirent);
        if (!*dirent_bh || !*dirent) {
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
index 355593dd8ef8..42775e2bbe2c 100644
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -197,12 +197,14 @@ static void dlm_update_lvb(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
                                  lock->ml.node == dlm->node_num ? "master" :
                                  "remote");
                        memcpy(lksb->lvb, res->lvb, DLM_LVB_LEN);
-                } else if (lksb->flags & DLM_LKSB_PUT_LVB) {
-                        mlog(0, "setting lvb from lockres for %s node\n",
-                                  lock->ml.node == dlm->node_num ? "master" :
-                                  "remote");
-                        memcpy(res->lvb, lksb->lvb, DLM_LVB_LEN);
                }
+                /* Do nothing for lvb put requests - they should be done in
+                 * place when the lock is downconverted - otherwise we risk
+                 * racing gets and puts which could result in old lvb data
+                 * being propagated. We leave the put flag set and clear it
+                 * here. In the future we might want to clear it at the time
+                 * the put is actually done.
+                 */
                spin_unlock(&res->spinlock);
        }
@@ -381,8 +383,7 @@ do_ast:
        ret = DLM_NORMAL;
        if (past->type == DLM_AST) {
                /* do not alter lock refcount.  switching lists. */
-                list_del_init(&lock->list);
+                list_move_tail(&lock->list, &res->granted);
-                list_add_tail(&lock->list, &res->granted);
                mlog(0, "ast: adding to granted list... type=%d, "
                          "convert_type=%d\n", lock->ml.type, lock->ml.convert_type);
                if (lock->ml.convert_type != LKM_IVMODE) {
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index 88cc43df18f1..14530ee7e11d 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -37,7 +37,17 @@
 #define DLM_THREAD_SHUFFLE_INTERVAL    5     // flush everything every 5 passes
 #define DLM_THREAD_MS                  200   // flush at least every 200 ms
-#define DLM_HASH_BUCKETS     (PAGE_SIZE / sizeof(struct hlist_head))
+/*
+ * Lock resource hash table sizing.  The table targets
+ * DLM_HASH_SIZE_DEFAULT bytes of bucket heads, expressed as a whole
+ * number of pages and never fewer than one page.
+ */
+#define DLM_HASH_SIZE_DEFAULT   (1 << 14)
+#if DLM_HASH_SIZE_DEFAULT < PAGE_SIZE
+# define DLM_HASH_PAGES         1
+#else
+# define DLM_HASH_PAGES         (DLM_HASH_SIZE_DEFAULT / PAGE_SIZE)
+#endif
+/* Number of hlist_head bucket heads that fit in a single page. */
+#define DLM_BUCKETS_PER_PAGE    (PAGE_SIZE / sizeof(struct hlist_head))
+/* Total bucket count across all hash pages. */
+#define DLM_HASH_BUCKETS        (DLM_HASH_PAGES * DLM_BUCKETS_PER_PAGE)
+/* Intended to make it easier for us to switch out hash functions */
+#define dlm_lockid_hash(_n, _l) full_name_hash(_n, _l)
 enum dlm_ast_type {
        DLM_AST = 0,
@@ -61,7 +71,8 @@ static inline int dlm_is_recovery_lock(const char *lock_name, int name_len)
        return 0;
 }
-#define DLM_RECO_STATE_ACTIVE  0x0001
+#define DLM_RECO_STATE_ACTIVE    0x0001
+#define DLM_RECO_STATE_FINALIZE  0x0002
 struct dlm_recovery_ctxt
 {
@@ -85,7 +96,7 @@ enum dlm_ctxt_state {
 struct dlm_ctxt
 {
        struct list_head list;
-        struct hlist_head *lockres_hash;
+        struct hlist_head **lockres_hash;
        struct list_head dirty_list;
        struct list_head purge_list;
        struct list_head pending_asts;
@@ -120,6 +131,7 @@ struct dlm_ctxt
        struct o2hb_callback_func dlm_hb_down;
        struct task_struct *dlm_thread_task;
        struct task_struct *dlm_reco_thread_task;
+        struct workqueue_struct *dlm_worker;
        wait_queue_head_t dlm_thread_wq;
        wait_queue_head_t dlm_reco_thread_wq;
        wait_queue_head_t ast_wq;
@@ -132,6 +144,11 @@ struct dlm_ctxt
        struct list_head        dlm_eviction_callbacks;
 };
+/*
+ * Map bucket index @i onto the paged hash table: pick the page that
+ * holds the bucket (wrapped to DLM_HASH_PAGES) and return the address
+ * of the bucket head at the matching offset within that page.
+ */
+static inline struct hlist_head *dlm_lockres_hash(struct dlm_ctxt *dlm, unsigned i)
+{
+        unsigned long page = (i / DLM_BUCKETS_PER_PAGE) % DLM_HASH_PAGES;
+        unsigned long slot = i % DLM_BUCKETS_PER_PAGE;
+        return &dlm->lockres_hash[page][slot];
+}
 /* these keventd work queue items are for less-frequently
 * called functions that cannot be directly called from the
 * net message handlers for some reason, usually because
@@ -216,20 +233,29 @@ struct dlm_lock_resource
        /* WARNING: Please see the comment in dlm_init_lockres before
         * adding fields here. */
        struct hlist_node hash_node;
+        struct qstr lockname;
        struct kref      refs;
-        /* please keep these next 3 in this order
+        /*
-         * some funcs want to iterate over all lists */
+         * Please keep granted, converting, and blocked in this order,
+         * as some funcs want to iterate over all lists.
+         *
+         * All four lists are protected by the hash's reference.
+         */
        struct list_head granted;
        struct list_head converting;
        struct list_head blocked;
+        struct list_head purge;
+        /*
+         * These two lists require you to hold an additional reference
+         * while they are on the list.
+         */
        struct list_head dirty;
        struct list_head recovering; // dlm_recovery_ctxt.resources list
        /* unused lock resources have their last_used stamped and are
         * put on a list for the dlm thread to run. */
-        struct list_head purge;
        unsigned long    last_used;
        unsigned migration_pending:1;
@@ -238,7 +264,6 @@ struct dlm_lock_resource
        wait_queue_head_t wq;
        u8  owner;              //node which owns the lock resource, or unknown
        u16 state;
-        struct qstr lockname;
        char lvb[DLM_LVB_LEN];
 };
@@ -300,6 +325,15 @@ enum dlm_lockres_list {
        DLM_BLOCKED_LIST
 };
+/*
+ * Return 1 if all DLM_LVB_LEN bytes of @lvb are zero, 0 otherwise.
+ * The buffer is only read, so it is taken as a pointer to const;
+ * callers passing a plain char * are unaffected.
+ */
+static inline int dlm_lvb_is_empty(const char *lvb)
+{
+        int i;
+        for (i = 0; i < DLM_LVB_LEN; i++) {
+                if (lvb[i])
+                        return 0;
+        }
+        return 1;
+}
 static inline struct list_head *
 dlm_list_idx_to_ptr(struct dlm_lock_resource *res, enum dlm_lockres_list idx)
 {
@@ -609,7 +643,8 @@ struct dlm_finalize_reco
 {
        u8 node_idx;
        u8 dead_node;
-        __be16 pad1;
+        u8 flags;
+        u8 pad1;
        __be32 pad2;
 };
@@ -676,6 +711,7 @@ void dlm_wait_for_recovery(struct dlm_ctxt *dlm);
 void dlm_kick_recovery_thread(struct dlm_ctxt *dlm);
 int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node);
 int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout);
+int dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout);
 void dlm_put(struct dlm_ctxt *dlm);
 struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm);
@@ -687,14 +723,20 @@ void dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
                            struct dlm_lock_resource *res);
 void dlm_purge_lockres(struct dlm_ctxt *dlm,
                       struct dlm_lock_resource *lockres);
-void dlm_lockres_get(struct dlm_lock_resource *res);
+/*
+ * Take a reference on a lock resource by bumping res->refs.
+ * This is called on every lookup, so it might be worth inlining.
+ */
+static inline void dlm_lockres_get(struct dlm_lock_resource *res)
+{
+        kref_get(&res->refs);
+}
 void dlm_lockres_put(struct dlm_lock_resource *res);
 void __dlm_unhash_lockres(struct dlm_lock_resource *res);
 void __dlm_insert_lockres(struct dlm_ctxt *dlm,
                          struct dlm_lock_resource *res);
 struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm,
                                                const char *name,
-                                                unsigned int len);
+                                                unsigned int len,
+                                                unsigned int hash);
 struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm,
                                              const char *name,
                                              unsigned int len);
@@ -780,8 +822,6 @@ int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data);
 int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data);
 int dlm_do_master_requery(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
                          u8 nodenum, u8 *real_master);
-int dlm_lockres_master_requery(struct dlm_ctxt *dlm,
-                               struct dlm_lock_resource *res, u8 *real_master);
 int dlm_dispatch_assert_master(struct dlm_ctxt *dlm,
@@ -819,6 +859,7 @@ void dlm_clean_master_list(struct dlm_ctxt *dlm,
                           u8 dead_node);
 int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock);
+int __dlm_lockres_unused(struct dlm_lock_resource *res);
 static inline const char * dlm_lock_mode_name(int mode)
 {
diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c
index 8285228d9e37..c764dc8e40a2 100644
--- a/fs/ocfs2/dlm/dlmconvert.c
+++ b/fs/ocfs2/dlm/dlmconvert.c
@@ -214,6 +214,9 @@ grant:
        if (lock->ml.node == dlm->node_num)
                mlog(0, "doing in-place convert for nonlocal lock\n");
        lock->ml.type = type;
+        if (lock->lksb->flags & DLM_LKSB_PUT_LVB)
+                memcpy(res->lvb, lock->lksb->lvb, DLM_LVB_LEN);
        status = DLM_NORMAL;
        *call_ast = 1;
        goto unlock_exit;
@@ -231,8 +234,7 @@ switch_queues:
        lock->ml.convert_type = type;
        /* do not alter lock refcount.  switching lists. */
-        list_del_init(&lock->list);
+        list_move_tail(&lock->list, &res->converting);
-        list_add_tail(&lock->list, &res->converting);
 unlock_exit:
        spin_unlock(&lock->spinlock);
@@ -248,8 +250,7 @@ void dlm_revert_pending_convert(struct dlm_lock_resource *res,
                                struct dlm_lock *lock)
 {
        /* do not alter lock refcount.  switching lists. */
-        list_del_init(&lock->list);
+        list_move_tail(&lock->list, &res->granted);
-        list_add_tail(&lock->list, &res->granted);
        lock->ml.convert_type = LKM_IVMODE;
        lock->lksb->flags &= ~(DLM_LKSB_GET_LVB|DLM_LKSB_PUT_LVB);
 }
@@ -294,8 +295,7 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm,
        res->state |= DLM_LOCK_RES_IN_PROGRESS;
        /* move lock to local convert queue */
        /* do not alter lock refcount.  switching lists. */
-        list_del_init(&lock->list);
+        list_move_tail(&lock->list, &res->converting);
-        list_add_tail(&lock->list, &res->converting);
        lock->convert_pending = 1;
        lock->ml.convert_type = type;
@@ -464,6 +464,12 @@ int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data)
        }
        spin_lock(&res->spinlock);
+        status = __dlm_lockres_state_to_status(res);
+        if (status != DLM_NORMAL) {
+                spin_unlock(&res->spinlock);
+                dlm_error(status);
+                goto leave;
+        }
        list_for_each(iter, &res->granted) {
                lock = list_entry(iter, struct dlm_lock, list);
                if (lock->ml.cookie == cnv->cookie &&
@@ -473,6 +479,21 @@ int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data)
                }
                lock = NULL;
        }
+        if (!lock) {
+                __dlm_print_one_lock_resource(res);
+                list_for_each(iter, &res->granted) {
+                        lock = list_entry(iter, struct dlm_lock, list);
+                        if (lock->ml.node == cnv->node_idx) {
+                                mlog(ML_ERROR, "There is something here "
+                                     "for node %u, lock->ml.cookie=%llu, "
+                                     "cnv->cookie=%llu\n", cnv->node_idx,
+                                     (unsigned long long)lock->ml.cookie,
+                                     (unsigned long long)cnv->cookie);
+                                break;
+                        }
+                }
+                lock = NULL;
+        }
        spin_unlock(&res->spinlock);
        if (!lock) {
                status = DLM_IVLOCKID;
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index c7eae5d3324e..3f6c8d88f7af 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -37,10 +37,8 @@
 #include "dlmapi.h"
 #include "dlmcommon.h"
-#include "dlmdebug.h"
 #include "dlmdomain.h"
-#include "dlmdebug.h"
 #define MLOG_MASK_PREFIX ML_DLM
 #include "cluster/masklog.h"
@@ -120,6 +118,7 @@ void dlm_print_one_lock(struct dlm_lock *lockid)
 }
 EXPORT_SYMBOL_GPL(dlm_print_one_lock);
+#if 0
 void dlm_dump_lock_resources(struct dlm_ctxt *dlm)
 {
        struct dlm_lock_resource *res;
@@ -136,12 +135,13 @@ void dlm_dump_lock_resources(struct dlm_ctxt *dlm)
        spin_lock(&dlm->spinlock);
        for (i=0; i<DLM_HASH_BUCKETS; i++) {
-                bucket = &(dlm->lockres_hash[i]);
+                bucket = dlm_lockres_hash(dlm, i);
                hlist_for_each_entry(res, iter, bucket, hash_node)
                        dlm_print_one_lock_resource(res);
        }
        spin_unlock(&dlm->spinlock);
 }
+#endif  /*  0  */
 static const char *dlm_errnames[] = {
        [DLM_NORMAL] =                  "DLM_NORMAL",
diff --git a/fs/ocfs2/dlm/dlmdebug.h b/fs/ocfs2/dlm/dlmdebug.h
deleted file mode 100644
index 6858510c3ccd..000000000000
--- a/fs/ocfs2/dlm/dlmdebug.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * dlmdebug.h
- *
- * Copyright (C) 2004 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- *
- */
-#ifndef DLMDEBUG_H
-#define DLMDEBUG_H
-void dlm_dump_lock_resources(struct dlm_ctxt *dlm);
-#endif
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 8f3a9e3106fd..8d1065f8b3bd 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -41,7 +41,6 @@
 #include "dlmapi.h"
 #include "dlmcommon.h"
-#include "dlmdebug.h"
 #include "dlmdomain.h"
 #include "dlmver.h"
@@ -49,6 +48,33 @@
 #define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_DOMAIN)
 #include "cluster/masklog.h"
+/*
+ * Release a page vector: free the first @pages pages it points to,
+ * walking from the highest index down to 0, then free the vector.
+ */
+static void dlm_free_pagevec(void **vec, int pages)
+{
+        for (; pages != 0; pages--)
+                free_page((unsigned long)vec[pages - 1]);
+        kfree(vec);
+}
+/*
+ * Allocate a vector of @pages page pointers, each backed by one page
+ * obtained with GFP_KERNEL.  Returns the vector, or NULL on failure
+ * after releasing whatever had already been allocated.  The result is
+ * released with dlm_free_pagevec().
+ */
+static void **dlm_alloc_pagevec(int pages)
+{
+        void **vec = kmalloc(pages * sizeof(void *), GFP_KERNEL);
+        int i;
+        if (!vec)
+                return NULL;
+        for (i = 0; i < pages; i++)
+                if (!(vec[i] = (void *)__get_free_page(GFP_KERNEL)))
+                        goto out_free;
+        /* DLM_HASH_PAGES may expand to a plain int (the literal 1), so it
+         * must be cast to match the %lu conversion. */
+        mlog(0, "Allocated DLM hash pagevec; %d pages (%lu expected), %lu buckets per page\n",
+             pages, (unsigned long)DLM_HASH_PAGES,
+             (unsigned long)DLM_BUCKETS_PER_PAGE);
+        return vec;
+out_free:
+        /* only the first i pages were successfully allocated */
+        dlm_free_pagevec(vec, i);
+        return NULL;
+}
 /*
 *
 * spinlock lock ordering: if multiple locks are needed, obey this ordering:
@@ -62,7 +88,7 @@
 *
 */
-spinlock_t dlm_domain_lock = SPIN_LOCK_UNLOCKED;
+DEFINE_SPINLOCK(dlm_domain_lock);
 LIST_HEAD(dlm_domains);
 static DECLARE_WAIT_QUEUE_HEAD(dlm_domain_events);
@@ -90,8 +116,7 @@ void __dlm_insert_lockres(struct dlm_ctxt *dlm,
        assert_spin_locked(&dlm->spinlock);
        q = &res->lockname;
-        q->hash = full_name_hash(q->name, q->len);
+        bucket = dlm_lockres_hash(dlm, q->hash);
-        bucket = &(dlm->lockres_hash[q->hash % DLM_HASH_BUCKETS]);
        /* get a reference for our hashtable */
        dlm_lockres_get(res);
@@ -100,34 +125,32 @@ void __dlm_insert_lockres(struct dlm_ctxt *dlm,
 }
+/*
+ * Look up a lock resource by name in the bucket selected by @hash.
+ * The caller supplies the precomputed hash and must hold dlm->spinlock
+ * (asserted below).  Returns the matching resource with a reference
+ * taken via dlm_lockres_get(), or NULL if nothing in the bucket matches.
+ */
 struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm,
-                                         const char *name,
+                                                const char *name,
-                                         unsigned int len)
+                                                unsigned int len,
+                                                unsigned int hash)
 {
-        unsigned int hash;
-        struct hlist_node *iter;
-        struct dlm_lock_resource *tmpres=NULL;
        struct hlist_head *bucket;
+        struct hlist_node *list;
        mlog_entry("%.*s\n", len, name);
        assert_spin_locked(&dlm->spinlock);
-        hash = full_name_hash(name, len);
+        bucket = dlm_lockres_hash(dlm, hash);
-        bucket = &(dlm->lockres_hash[hash % DLM_HASH_BUCKETS]);
-        /* check for pre-existing lock */
-        hlist_for_each(iter, bucket) {
-                tmpres = hlist_entry(iter, struct dlm_lock_resource, hash_node);
-                if (tmpres->lockname.len == len &&
-                    memcmp(tmpres->lockname.name, name, len) == 0) {
-                        dlm_lockres_get(tmpres);
-                        break;
-                }
-                tmpres = NULL;
+        hlist_for_each(list, bucket) {
+                struct dlm_lock_resource *res = hlist_entry(list,
+                        struct dlm_lock_resource, hash_node);
+                /* cheap first-byte and length rejects before the memcmp */
+                if (res->lockname.name[0] != name[0])
+                        continue;
+                if (unlikely(res->lockname.len != len))
+                        continue;
+                /* NOTE(review): len is unsigned, so len - 1 wraps if len
+                 * is 0; assumes lock names are never empty - confirm. */
+                if (memcmp(res->lockname.name + 1, name + 1, len - 1))
+                        continue;
+                dlm_lockres_get(res);
+                return res;
        }
-        return tmpres;
+        return NULL;
 }
 struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm,
@@ -135,9 +158,10 @@ struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm,
                                    unsigned int len)
 {
        struct dlm_lock_resource *res;
+        unsigned int hash = dlm_lockid_hash(name, len);
        spin_lock(&dlm->spinlock);
-        res = __dlm_lookup_lockres(dlm, name, len);
+        res = __dlm_lookup_lockres(dlm, name, len, hash);
        spin_unlock(&dlm->spinlock);
        return res;
 }
@@ -194,7 +218,7 @@ static int dlm_wait_on_domain_helper(const char *domain)
 static void dlm_free_ctxt_mem(struct dlm_ctxt *dlm)
 {
        if (dlm->lockres_hash)
-                free_page((unsigned long) dlm->lockres_hash);
+                dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES);
        if (dlm->name)
                kfree(dlm->name);
@@ -278,11 +302,21 @@ int dlm_domain_fully_joined(struct dlm_ctxt *dlm)
        return ret;
 }
+/*
+ * Tear down the domain's workqueue if one exists: flush it, destroy
+ * it, and clear dlm->dlm_worker.  Does nothing when no workqueue is set.
+ */
+static void dlm_destroy_dlm_worker(struct dlm_ctxt *dlm)
+{
+        struct workqueue_struct *wq = dlm->dlm_worker;
+        if (!wq)
+                return;
+        flush_workqueue(wq);
+        destroy_workqueue(wq);
+        dlm->dlm_worker = NULL;
+}
 static void dlm_complete_dlm_shutdown(struct dlm_ctxt *dlm)
 {
        dlm_unregister_domain_handlers(dlm);
        dlm_complete_thread(dlm);
        dlm_complete_recovery_thread(dlm);
+        dlm_destroy_dlm_worker(dlm);
        /* We've left the domain. Now we can take ourselves out of the
         * list and allow the kref stuff to help us free the
@@ -304,8 +338,8 @@ static void dlm_migrate_all_locks(struct dlm_ctxt *dlm)
 restart:
        spin_lock(&dlm->spinlock);
        for (i = 0; i < DLM_HASH_BUCKETS; i++) {
-                while (!hlist_empty(&dlm->lockres_hash[i])) {
+                while (!hlist_empty(dlm_lockres_hash(dlm, i))) {
-                        res = hlist_entry(dlm->lockres_hash[i].first,
+                        res = hlist_entry(dlm_lockres_hash(dlm, i)->first,
                                          struct dlm_lock_resource, hash_node);
                        /* need reference when manually grabbing lockres */
                        dlm_lockres_get(res);
@@ -374,12 +408,13 @@ static void __dlm_print_nodes(struct dlm_ctxt *dlm)
        assert_spin_locked(&dlm->spinlock);
-        mlog(ML_NOTICE, "Nodes in my domain (\"%s\"):\n", dlm->name);
+        printk(KERN_INFO "ocfs2_dlm: Nodes in domain (\"%s\"): ", dlm->name);
        while ((node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES,
                                     node + 1)) < O2NM_MAX_NODES) {
-                mlog(ML_NOTICE, " node %d\n", node);
+                printk("%d ", node);
        }
+        printk("\n");
 }
 static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data)
@@ -395,7 +430,7 @@ static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data)
        node = exit_msg->node_idx;
-        mlog(0, "Node %u leaves domain %s\n", node, dlm->name);
+        printk(KERN_INFO "ocfs2_dlm: Node %u leaves domain %s\n", node, dlm->name);
        spin_lock(&dlm->spinlock);
        clear_bit(node, dlm->domain_map);
@@ -644,6 +679,8 @@ static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data)
                set_bit(assert->node_idx, dlm->domain_map);
                __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
+                printk(KERN_INFO "ocfs2_dlm: Node %u joins domain %s\n",
+                       assert->node_idx, dlm->name);
                __dlm_print_nodes(dlm);
                /* notify anything attached to the heartbeat events */
@@ -1126,6 +1163,13 @@ static int dlm_join_domain(struct dlm_ctxt *dlm)
                goto bail;
        }
+        dlm->dlm_worker = create_singlethread_workqueue("dlm_wq");
+        if (!dlm->dlm_worker) {
+                status = -ENOMEM;
+                mlog_errno(status);
+                goto bail;
+        }
        do {
                unsigned int backoff;
                status = dlm_try_to_join_domain(dlm);
@@ -1166,6 +1210,7 @@ bail:
                dlm_unregister_domain_handlers(dlm);
                dlm_complete_thread(dlm);
                dlm_complete_recovery_thread(dlm);
+                dlm_destroy_dlm_worker(dlm);
        }
        return status;
@@ -1191,7 +1236,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
                goto leave;
        }
-        dlm->lockres_hash = (struct hlist_head *) __get_free_page(GFP_KERNEL);
+        dlm->lockres_hash = (struct hlist_head **)dlm_alloc_pagevec(DLM_HASH_PAGES);
        if (!dlm->lockres_hash) {
                mlog_errno(-ENOMEM);
                kfree(dlm->name);
@@ -1200,8 +1245,8 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
                goto leave;
        }
-        for (i=0; i<DLM_HASH_BUCKETS; i++)
+        for (i = 0; i < DLM_HASH_BUCKETS; i++)
-                INIT_HLIST_HEAD(&dlm->lockres_hash[i]);
+                INIT_HLIST_HEAD(dlm_lockres_hash(dlm, i));
        strcpy(dlm->name, domain);
        dlm->key = key;
@@ -1231,6 +1276,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
        dlm->dlm_thread_task = NULL;
        dlm->dlm_reco_thread_task = NULL;
+        dlm->dlm_worker = NULL;
        init_waitqueue_head(&dlm->dlm_thread_wq);
        init_waitqueue_head(&dlm->dlm_reco_thread_wq);
        init_waitqueue_head(&dlm->reco.event);
diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c
index 7e88e24b3471..033ad1701232 100644
--- a/fs/ocfs2/dlm/dlmfs.c
+++ b/fs/ocfs2/dlm/dlmfs.c
@@ -116,7 +116,7 @@ static int dlmfs_file_open(struct inode *inode,
         * doesn't make sense for LVB writes. */
        file->f_flags &= ~O_APPEND;
-        fp = kmalloc(sizeof(*fp), GFP_KERNEL);
+        fp = kmalloc(sizeof(*fp), GFP_NOFS);
        if (!fp) {
                status = -ENOMEM;
                goto bail;
@@ -196,7 +196,7 @@ static ssize_t dlmfs_file_read(struct file *filp,
        else
                readlen = count - *ppos;
-        lvb_buf = kmalloc(readlen, GFP_KERNEL);
+        lvb_buf = kmalloc(readlen, GFP_NOFS);
        if (!lvb_buf)
                return -ENOMEM;
@@ -240,7 +240,7 @@ static ssize_t dlmfs_file_write(struct file *filp,
        else
                writelen = count - *ppos;
-        lvb_buf = kmalloc(writelen, GFP_KERNEL);
+        lvb_buf = kmalloc(writelen, GFP_NOFS);
        if (!lvb_buf)
                return -ENOMEM;
@@ -574,10 +574,10 @@ static struct inode_operations dlmfs_file_inode_operations = {
        .getattr        = simple_getattr,
 };
+/*
+ * Hand the request to get_sb_nodev() with dlmfs_fill_super as the fill
+ * callback and return its int status.  @dev_name is not used here.
+ */
-static struct super_block *dlmfs_get_sb(struct file_system_type *fs_type,
+static int dlmfs_get_sb(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data)
+        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-        return get_sb_nodev(fs_type, flags, data, dlmfs_fill_super);
+        return get_sb_nodev(fs_type, flags, data, dlmfs_fill_super, mnt);
 }
 static struct file_system_type dlmfs_fs_type = {
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index 6fea28318d6d..5ca57ec650c7 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -53,7 +53,7 @@
 #define MLOG_MASK_PREFIX ML_DLM
 #include "cluster/masklog.h"
-static spinlock_t dlm_cookie_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(dlm_cookie_lock);
 static u64 dlm_next_cookie = 1;
 static enum dlm_status dlm_send_remote_lock_request(struct dlm_ctxt *dlm,
@@ -201,6 +201,7 @@ static enum dlm_status dlmlock_remote(struct dlm_ctxt *dlm,
                                      struct dlm_lock *lock, int flags)
 {
        enum dlm_status status = DLM_DENIED;
+        int lockres_changed = 1;
        mlog_entry("type=%d\n", lock->ml.type);
        mlog(0, "lockres %.*s, flags = 0x%x\n", res->lockname.len,
@@ -226,8 +227,25 @@ static enum dlm_status dlmlock_remote(struct dlm_ctxt *dlm,
        res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
        lock->lock_pending = 0;
        if (status != DLM_NORMAL) {
-                if (status != DLM_NOTQUEUED)
+                if (status == DLM_RECOVERING &&
+                    dlm_is_recovery_lock(res->lockname.name,
+                                         res->lockname.len)) {
+                        /* recovery lock was mastered by dead node.
+                         * we need to have calc_usage shoot down this
+                         * lockres and completely remaster it. */
+                        mlog(0, "%s: recovery lock was owned by "
+                             "dead node %u, remaster it now.\n",
+                             dlm->name, res->owner);
+                } else if (status != DLM_NOTQUEUED) {
+                        /*
+                         * DO NOT call calc_usage, as this would unhash
+                         * the remote lockres before we ever get to use
+                         * it.  treat as if we never made any change to
+                         * the lockres.
+                         */
+                        lockres_changed = 0;
                        dlm_error(status);
+                }
                dlm_revert_pending_lock(res, lock);
                dlm_lock_put(lock);
        } else if (dlm_is_recovery_lock(res->lockname.name, 
@@ -239,12 +257,12 @@ static enum dlm_status dlmlock_remote(struct dlm_ctxt *dlm,
                mlog(0, "%s: $RECOVERY lock for this node (%u) is "
                     "mastered by %u; got lock, manually granting (no ast)\n",
                     dlm->name, dlm->node_num, res->owner);
-                list_del_init(&lock->list);
+                list_move_tail(&lock->list, &res->granted);
-                list_add_tail(&lock->list, &res->granted);
        }
        spin_unlock(&res->spinlock);
-        dlm_lockres_calc_usage(dlm, res);
+        if (lockres_changed)
+                dlm_lockres_calc_usage(dlm, res);
        wake_up(&res->wq);
        return status;
@@ -281,6 +299,14 @@ static enum dlm_status dlm_send_remote_lock_request(struct dlm_ctxt *dlm,
        if (tmpret >= 0) {
                // successfully sent and received
                ret = status;  // this is already a dlm_status
+                if (ret == DLM_REJECTED) {
+                        mlog(ML_ERROR, "%s:%.*s: BUG.  this is a stale lockres "
+                             "no longer owned by %u.  that node is coming back "
+                             "up currently.\n", dlm->name, create.namelen,
+                             create.name, res->owner);
+                        dlm_print_one_lock_resource(res);
+                        BUG();
+                }
        } else {
                mlog_errno(tmpret);
                if (dlm_is_host_down(tmpret)) {
@@ -382,13 +408,13 @@ struct dlm_lock * dlm_new_lock(int type, u8 node, u64 cookie,
        struct dlm_lock *lock;
        int kernel_allocated = 0;
-        lock = kcalloc(1, sizeof(*lock), GFP_KERNEL);
+        lock = kcalloc(1, sizeof(*lock), GFP_NOFS);
        if (!lock)
                return NULL;
        if (!lksb) {
                /* zero memory only if kernel-allocated */
-                lksb = kcalloc(1, sizeof(*lksb), GFP_KERNEL);
+                lksb = kcalloc(1, sizeof(*lksb), GFP_NOFS);
                if (!lksb) {
                        kfree(lock);
                        return NULL;
@@ -429,11 +455,16 @@ int dlm_create_lock_handler(struct o2net_msg *msg, u32 len, void *data)
        if (!dlm_grab(dlm))
                return DLM_REJECTED;
-        mlog_bug_on_msg(!dlm_domain_fully_joined(dlm),
-                        "Domain %s not fully joined!\n", dlm->name);
        name = create->name;
        namelen = create->namelen;
+        status = DLM_REJECTED;
+        if (!dlm_domain_fully_joined(dlm)) {
+                mlog(ML_ERROR, "Domain %s not fully joined, but node %u is "
+                     "sending a create_lock message for lock %.*s!\n",
+                     dlm->name, create->node_idx, namelen, name);
+                dlm_error(status);
+                goto leave;
+        }
        status = DLM_IVBUFLEN;
        if (namelen > DLM_LOCKID_NAME_MAX) {
@@ -669,18 +700,22 @@ retry_lock:
                        msleep(100);
                        /* no waiting for dlm_reco_thread */
                        if (recovery) {
-                                if (status == DLM_RECOVERING) {
+                                if (status != DLM_RECOVERING)
-                                        mlog(0, "%s: got RECOVERING "
+                                        goto retry_lock;
-                                             "for $REOCVERY lock, master "
-                                             "was %u\n", dlm->name, 
+                                mlog(0, "%s: got RECOVERING "
-                                             res->owner);
+                                     "for $RECOVERY lock, master "
-                                        dlm_wait_for_node_death(dlm, res->owner, 
+                                     "was %u\n", dlm->name,
-                                                        DLM_NODE_DEATH_WAIT_MAX);
+                                     res->owner);
-                                }
+                                /* wait to see the node go down, then
+                                 * drop down and allow the lockres to
+                                 * get cleaned up.  need to remaster. */
+                                dlm_wait_for_node_death(dlm, res->owner,
+                                                DLM_NODE_DEATH_WAIT_MAX);
                        } else {
                                dlm_wait_for_recovery(dlm);
+                                goto retry_lock;
                        }
-                        goto retry_lock;
                }
                if (status != DLM_NORMAL) {
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 940be4c13b1f..1b8346dd0572 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -47,7 +47,6 @@
 #include "dlmapi.h"
 #include "dlmcommon.h"
-#include "dlmdebug.h"
 #include "dlmdomain.h"
 #define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_MASTER)
@@ -74,6 +73,7 @@ struct dlm_master_list_entry
        wait_queue_head_t wq;
        atomic_t woken;
        struct kref mle_refs;
+        int inuse;
        unsigned long maybe_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
        unsigned long vote_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
        unsigned long response_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
@@ -127,18 +127,30 @@ static inline int dlm_mle_equal(struct dlm_ctxt *dlm,
        return 1;
 }
-#if 0
+#define dlm_print_nodemap(m)  _dlm_print_nodemap(m,#m)
-/* Code here is included but defined out as it aids debugging */
+static void _dlm_print_nodemap(unsigned long *map, const char *mapname) /* printk "<mapname>=[ a b ... ]" listing every bit set in map */
+{
+        int i;
+        printk("%s=[ ", mapname);
+        for (i=0; i<O2NM_MAX_NODES; i++) /* examine bit positions 0 .. O2NM_MAX_NODES-1 */
+                if (test_bit(i, map))
+                        printk("%d ", i); /* emit the index of each set bit */
+        printk("]"); /* closes the list; no newline is emitted here */
+}
-void dlm_print_one_mle(struct dlm_master_list_entry *mle)
+static void dlm_print_one_mle(struct dlm_master_list_entry *mle)
 {
-        int i = 0, refs;
+        int refs;
        char *type;
        char attached;
        u8 master;
        unsigned int namelen;
        const char *name;
        struct kref *k;
+        unsigned long *maybe = mle->maybe_map,
+                      *vote = mle->vote_map,
+                      *resp = mle->response_map,
+                      *node = mle->node_map;
        k = &mle->mle_refs;
        if (mle->type == DLM_MLE_BLOCK)
@@ -159,18 +171,29 @@ void dlm_print_one_mle(struct dlm_master_list_entry *mle)
                name = mle->u.res->lockname.name;
        }
-        mlog(ML_NOTICE, "  #%3d: %3s  %3d  %3u   %3u %c    (%d)%.*s\n",
+        mlog(ML_NOTICE, "%.*s: %3s refs=%3d mas=%3u new=%3u evt=%c inuse=%d ",
-                  i, type, refs, master, mle->new_master, attached,
+                  namelen, name, type, refs, master, mle->new_master, attached,
-                  namelen, namelen, name);
+                  mle->inuse);
+        dlm_print_nodemap(maybe);
+        printk(", ");
+        dlm_print_nodemap(vote);
+        printk(", ");
+        dlm_print_nodemap(resp);
+        printk(", ");
+        dlm_print_nodemap(node);
+        printk(", ");
+        printk("\n");
 }
+#if 0
+/* Code here is included but defined out as it aids debugging */
 static void dlm_dump_mles(struct dlm_ctxt *dlm)
 {
        struct dlm_master_list_entry *mle;
        struct list_head *iter;
        
        mlog(ML_NOTICE, "dumping all mles for domain %s:\n", dlm->name);
-        mlog(ML_NOTICE, "  ####: type refs owner new events? lockname nodemap votemap respmap maybemap\n");
        spin_lock(&dlm->master_lock);
        list_for_each(iter, &dlm->master_list) {
                mle = list_entry(iter, struct dlm_master_list_entry, list);
@@ -314,6 +337,31 @@ static inline void dlm_mle_detach_hb_events(struct dlm_ctxt *dlm,
        spin_unlock(&dlm->spinlock);
 }
+static void dlm_get_mle_inuse(struct dlm_master_list_entry *mle) /* bump mle->inuse and take a kref on the mle */
+{
+        struct dlm_ctxt *dlm;
+        dlm = mle->dlm;
+        assert_spin_locked(&dlm->spinlock); /* both locks must already be held on entry */
+        assert_spin_locked(&dlm->master_lock);
+        mle->inuse++; /* counted separately from mle_refs */
+        kref_get(&mle->mle_refs);
+}
+static void dlm_put_mle_inuse(struct dlm_master_list_entry *mle) /* undo dlm_get_mle_inuse(): drop inuse and the matching ref */
+{
+        struct dlm_ctxt *dlm;
+        dlm = mle->dlm;
+        spin_lock(&dlm->spinlock); /* unlike the get side, this takes both locks itself */
+        spin_lock(&dlm->master_lock);
+        mle->inuse--;
+        __dlm_put_mle(mle); /* asserts both locks are held, then kref_put()s mle_refs */
+        spin_unlock(&dlm->master_lock); /* release in reverse order of acquisition */
+        spin_unlock(&dlm->spinlock);
+}
 /* remove from list and free */
 static void __dlm_put_mle(struct dlm_master_list_entry *mle)
 {
@@ -322,9 +370,14 @@ static void __dlm_put_mle(struct dlm_master_list_entry *mle)
        assert_spin_locked(&dlm->spinlock);
        assert_spin_locked(&dlm->master_lock);
-        BUG_ON(!atomic_read(&mle->mle_refs.refcount));
+        if (!atomic_read(&mle->mle_refs.refcount)) {
+                /* this may or may not crash, but who cares.
-        kref_put(&mle->mle_refs, dlm_mle_release);
+                 * it's a BUG. */
+                mlog(ML_ERROR, "bad mle: %p\n", mle);
+                dlm_print_one_mle(mle);
+                BUG();
+        } else
+                kref_put(&mle->mle_refs, dlm_mle_release);
 }
@@ -367,6 +420,7 @@ static void dlm_init_mle(struct dlm_master_list_entry *mle,
        memset(mle->response_map, 0, sizeof(mle->response_map));
        mle->master = O2NM_MAX_NODES;
        mle->new_master = O2NM_MAX_NODES;
+        mle->inuse = 0;
        if (mle->type == DLM_MLE_MASTER) {
                BUG_ON(!res);
@@ -564,6 +618,28 @@ static void dlm_lockres_release(struct kref *kref)
        mlog(0, "destroying lockres %.*s\n", res->lockname.len,
             res->lockname.name);
+        if (!hlist_unhashed(&res->hash_node) ||
+            !list_empty(&res->granted) ||
+            !list_empty(&res->converting) ||
+            !list_empty(&res->blocked) ||
+            !list_empty(&res->dirty) ||
+            !list_empty(&res->recovering) ||
+            !list_empty(&res->purge)) {
+                mlog(ML_ERROR,
+                     "Going to BUG for resource %.*s."
+                     "  We're on a list! [%c%c%c%c%c%c%c]\n",
+                     res->lockname.len, res->lockname.name,
+                     !hlist_unhashed(&res->hash_node) ? 'H' : ' ',
+                     !list_empty(&res->granted) ? 'G' : ' ',
+                     !list_empty(&res->converting) ? 'C' : ' ',
+                     !list_empty(&res->blocked) ? 'B' : ' ',
+                     !list_empty(&res->dirty) ? 'D' : ' ',
+                     !list_empty(&res->recovering) ? 'R' : ' ',
+                     !list_empty(&res->purge) ? 'P' : ' ');
+                dlm_print_one_lock_resource(res);
+        }
        /* By the time we're ready to blow this guy away, we shouldn't
         * be on any lists. */
        BUG_ON(!hlist_unhashed(&res->hash_node));
@@ -579,11 +655,6 @@ static void dlm_lockres_release(struct kref *kref)
        kfree(res);
 }
-void dlm_lockres_get(struct dlm_lock_resource *res)
-{
-        kref_get(&res->refs);
-}
 void dlm_lockres_put(struct dlm_lock_resource *res)
 {
        kref_put(&res->refs, dlm_lockres_release);
@@ -603,7 +674,7 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm,
        memcpy(qname, name, namelen);
        res->lockname.len = namelen;
-        res->lockname.hash = full_name_hash(name, namelen);
+        res->lockname.hash = dlm_lockid_hash(name, namelen);
        init_waitqueue_head(&res->wq);
        spin_lock_init(&res->spinlock);
@@ -637,11 +708,11 @@ struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm,
 {
        struct dlm_lock_resource *res;
-        res = kmalloc(sizeof(struct dlm_lock_resource), GFP_KERNEL);
+        res = kmalloc(sizeof(struct dlm_lock_resource), GFP_NOFS);
        if (!res)
                return NULL;
-        res->lockname.name = kmalloc(namelen, GFP_KERNEL);
+        res->lockname.name = kmalloc(namelen, GFP_NOFS);
        if (!res->lockname.name) {
                kfree(res);
                return NULL;
@@ -677,19 +748,20 @@ struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm,
        int blocked = 0;
        int ret, nodenum;
        struct dlm_node_iter iter;
-        unsigned int namelen;
+        unsigned int namelen, hash;
        int tries = 0;
        int bit, wait_on_recovery = 0;
        BUG_ON(!lockid);
        namelen = strlen(lockid);
+        hash = dlm_lockid_hash(lockid, namelen);
        mlog(0, "get lockres %s (len %d)\n", lockid, namelen);
 lookup:
        spin_lock(&dlm->spinlock);
-        tmpres = __dlm_lookup_lockres(dlm, lockid, namelen);
+        tmpres = __dlm_lookup_lockres(dlm, lockid, namelen, hash);
        if (tmpres) {
                spin_unlock(&dlm->spinlock);
                mlog(0, "found in hash!\n");
@@ -704,7 +776,7 @@ lookup:
                mlog(0, "allocating a new resource\n");
                /* nothing found and we need to allocate one. */
                alloc_mle = (struct dlm_master_list_entry *)
-                        kmem_cache_alloc(dlm_mle_cache, GFP_KERNEL);
+                        kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
                if (!alloc_mle)
                        goto leave;
                res = dlm_new_lockres(dlm, lockid, namelen);
@@ -790,10 +862,11 @@ lookup:
         * if so, the creator of the BLOCK may try to put the last
         * ref at this time in the assert master handler, so we
         * need an extra one to keep from a bad ptr deref. */
-        dlm_get_mle(mle);
+        dlm_get_mle_inuse(mle);
        spin_unlock(&dlm->master_lock);
        spin_unlock(&dlm->spinlock);
+redo_request:
        while (wait_on_recovery) {
                /* any cluster changes that occurred after dropping the
                 * dlm spinlock would be detectable by a change on the mle,
@@ -812,7 +885,7 @@ lookup:
                } 
                dlm_kick_recovery_thread(dlm);
-                msleep(100);
+                msleep(1000);
                dlm_wait_for_recovery(dlm);
                spin_lock(&dlm->spinlock);
@@ -825,13 +898,15 @@ lookup:
                } else
                        wait_on_recovery = 0;
                spin_unlock(&dlm->spinlock);
+                if (wait_on_recovery)
+                        dlm_wait_for_node_recovery(dlm, bit, 10000);
        }
        /* must wait for lock to be mastered elsewhere */
        if (blocked)
                goto wait;
-redo_request:
        ret = -EINVAL;
        dlm_node_iter_init(mle->vote_map, &iter);
        while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
@@ -856,6 +931,7 @@ wait:
        /* keep going until the response map includes all nodes */
        ret = dlm_wait_for_lock_mastery(dlm, res, mle, &blocked);
        if (ret < 0) {
+                wait_on_recovery = 1;
                mlog(0, "%s:%.*s: node map changed, redo the "
                     "master request now, blocked=%d\n",
                     dlm->name, res->lockname.len,
@@ -866,7 +942,7 @@ wait:
                             dlm->name, res->lockname.len, 
                             res->lockname.name, blocked);
                        dlm_print_one_lock_resource(res);
-                        /* dlm_print_one_mle(mle); */
+                        dlm_print_one_mle(mle);
                        tries = 0;
                }
                goto redo_request;
@@ -880,7 +956,7 @@ wait:
        dlm_mle_detach_hb_events(dlm, mle);
        dlm_put_mle(mle);
        /* put the extra ref */
-        dlm_put_mle(mle);
+        dlm_put_mle_inuse(mle);
 wake_waiters:
        spin_lock(&res->spinlock);
@@ -921,12 +997,14 @@ recheck:
                spin_unlock(&res->spinlock);
                /* this will cause the master to re-assert across
                 * the whole cluster, freeing up mles */
-                ret = dlm_do_master_request(mle, res->owner);
+                if (res->owner != dlm->node_num) {
-                if (ret < 0) {
+                        ret = dlm_do_master_request(mle, res->owner);
-                        /* give recovery a chance to run */
+                        if (ret < 0) {
-                        mlog(ML_ERROR, "link to %u went down?: %d\n", res->owner, ret);
+                                /* give recovery a chance to run */
-                        msleep(500);
+                                mlog(ML_ERROR, "link to %u went down?: %d\n", res->owner, ret);
-                        goto recheck;
+                                msleep(500);
+                                goto recheck;
+                        }
                }
                ret = 0;
                goto leave;
@@ -962,6 +1040,12 @@ recheck:
                     "rechecking now\n", dlm->name, res->lockname.len,
                     res->lockname.name);
                goto recheck;
+        } else {
+                if (!voting_done) {
+                        mlog(0, "map not changed and voting not done "
+                             "for %s:%.*s\n", dlm->name, res->lockname.len,
+                             res->lockname.name);
+                }
        }
        if (m != O2NM_MAX_NODES) {
@@ -1129,18 +1213,6 @@ static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm,
                        set_bit(node, mle->vote_map);
                } else {
                        mlog(ML_ERROR, "node down! %d\n", node);
-                        /* if the node wasn't involved in mastery skip it,
-                         * but clear it out from the maps so that it will
-                         * not affect mastery of this lockres */
-                        clear_bit(node, mle->response_map);
-                        clear_bit(node, mle->vote_map);
-                        if (!test_bit(node, mle->maybe_map))
-                                goto next;
-                        /* if we're already blocked on lock mastery, and the
-                         * dead node wasn't the expected master, or there is
-                         * another node in the maybe_map, keep waiting */
                        if (blocked) {
                                int lowest = find_next_bit(mle->maybe_map,
                                                       O2NM_MAX_NODES, 0);
@@ -1148,54 +1220,53 @@ static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm,
                                /* act like it was never there */
                                clear_bit(node, mle->maybe_map);
-                                if (node != lowest)
+                                if (node == lowest) {
-                                        goto next;
+                                        mlog(0, "expected master %u died"
+                                            " while this node was blocked "
-                                mlog(ML_ERROR, "expected master %u died while "
+                                            "waiting on it!\n", node);
-                                     "this node was blocked waiting on it!\n",
+                                        lowest = find_next_bit(mle->maybe_map,
-                                     node);
+                                                        O2NM_MAX_NODES,
-                                lowest = find_next_bit(mle->maybe_map,
+                                                        lowest+1);
-                                                       O2NM_MAX_NODES,
+                                        if (lowest < O2NM_MAX_NODES) {
-                                                       lowest+1);
+                                                mlog(0, "%s:%.*s:still "
-                                if (lowest < O2NM_MAX_NODES) {
+                                                     "blocked. waiting on %u "
-                                        mlog(0, "still blocked. waiting "
+                                                     "now\n", dlm->name,
-                                             "on %u now\n", lowest);
+                                                     res->lockname.len,
-                                        goto next;
+                                                     res->lockname.name,
+                                                     lowest);
+                                        } else {
+                                                /* mle is an MLE_BLOCK, but
+                                                 * there is now nothing left to
+                                                 * block on.  we need to return
+                                                 * all the way back out and try
+                                                 * again with an MLE_MASTER.
+                                                 * dlm_do_local_recovery_cleanup
+                                                 * has already run, so the mle
+                                                 * refcount is ok */
+                                                mlog(0, "%s:%.*s: no "
+                                                     "longer blocking. try to "
+                                                     "master this here\n",
+                                                     dlm->name,
+                                                     res->lockname.len,
+                                                     res->lockname.name);
+                                                mle->type = DLM_MLE_MASTER;
+                                                mle->u.res = res;
+                                        }
                                }
-                                /* mle is an MLE_BLOCK, but there is now
-                                 * nothing left to block on.  we need to return
-                                 * all the way back out and try again with
-                                 * an MLE_MASTER. dlm_do_local_recovery_cleanup
-                                 * has already run, so the mle refcount is ok */
-                                mlog(0, "no longer blocking. we can "
-                                     "try to master this here\n");
-                                mle->type = DLM_MLE_MASTER;
-                                memset(mle->maybe_map, 0,
-                                       sizeof(mle->maybe_map));
-                                memset(mle->response_map, 0,
-                                       sizeof(mle->maybe_map));
-                                memcpy(mle->vote_map, mle->node_map,
-                                       sizeof(mle->node_map));
-                                mle->u.res = res;
-                                set_bit(dlm->node_num, mle->maybe_map);
-                                ret = -EAGAIN;
-                                goto next;
                        }
-                        clear_bit(node, mle->maybe_map);
+                        /* now blank out everything, as if we had never
-                        if (node > dlm->node_num)
+                         * contacted anyone */
-                                goto next;
+                        memset(mle->maybe_map, 0, sizeof(mle->maybe_map));
+                        memset(mle->response_map, 0, sizeof(mle->response_map));
-                        mlog(0, "dead node in map!\n");
+                        /* reset the vote_map to the current node_map */
-                        /* yuck. go back and re-contact all nodes
+                        memcpy(mle->vote_map, mle->node_map,
-                         * in the vote_map, removing this node. */
+                               sizeof(mle->node_map));
-                        memset(mle->response_map, 0,
+                        /* put myself into the maybe map */
-                               sizeof(mle->response_map));
+                        if (mle->type != DLM_MLE_BLOCK)
+                                set_bit(dlm->node_num, mle->maybe_map);
                }
                ret = -EAGAIN;
-next:
                node = dlm_bitmap_diff_iter_next(&bdi, &sc);
        }
        return ret;
@@ -1316,7 +1387,7 @@ int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data)
        struct dlm_master_request *request = (struct dlm_master_request *) msg->buf;
        struct dlm_master_list_entry *mle = NULL, *tmpmle = NULL;
        char *name;
-        unsigned int namelen;
+        unsigned int namelen, hash;
        int found, ret;
        int set_maybe;
        int dispatch_assert = 0;
@@ -1331,6 +1402,7 @@ int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data)
        name = request->name;
        namelen = request->namelen;
+        hash = dlm_lockid_hash(name, namelen);
        if (namelen > DLM_LOCKID_NAME_MAX) {
                response = DLM_IVBUFLEN;
@@ -1339,7 +1411,7 @@ int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data)
 way_up_top:
        spin_lock(&dlm->spinlock);
-        res = __dlm_lookup_lockres(dlm, name, namelen);
+        res = __dlm_lookup_lockres(dlm, name, namelen, hash);
        if (res) {
                spin_unlock(&dlm->spinlock);
@@ -1459,21 +1531,18 @@ way_up_top:
                        spin_unlock(&dlm->spinlock);
                        mle = (struct dlm_master_list_entry *)
-                                kmem_cache_alloc(dlm_mle_cache, GFP_KERNEL);
+                                kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
                        if (!mle) {
                                response = DLM_MASTER_RESP_ERROR;
                                mlog_errno(-ENOMEM);
                                goto send_response;
                        }
-                        spin_lock(&dlm->spinlock);
-                        dlm_init_mle(mle, DLM_MLE_BLOCK, dlm, NULL,
-                                         name, namelen);
-                        spin_unlock(&dlm->spinlock);
                        goto way_up_top;
                }
                // mlog(0, "this is second time thru, already allocated, "
                // "add the block.\n");
+                dlm_init_mle(mle, DLM_MLE_BLOCK, dlm, NULL, name, namelen);
                set_bit(request->node_idx, mle->maybe_map);
                list_add(&mle->list, &dlm->master_list);
                response = DLM_MASTER_RESP_NO;
@@ -1556,6 +1625,8 @@ again:
        dlm_node_iter_init(nodemap, &iter);
        while ((to = dlm_node_iter_next(&iter)) >= 0) {
                int r = 0;
+                struct dlm_master_list_entry *mle = NULL;
                mlog(0, "sending assert master to %d (%.*s)\n", to,
                     namelen, lockname);
                memset(&assert, 0, sizeof(assert));
@@ -1567,20 +1638,28 @@ again:
                tmpret = o2net_send_message(DLM_ASSERT_MASTER_MSG, dlm->key,
                                            &assert, sizeof(assert), to, &r);
                if (tmpret < 0) {
-                        mlog(ML_ERROR, "assert_master returned %d!\n", tmpret);
+                        mlog(0, "assert_master returned %d!\n", tmpret);
                        if (!dlm_is_host_down(tmpret)) {
-                                mlog(ML_ERROR, "unhandled error!\n");
+                                mlog(ML_ERROR, "unhandled error=%d!\n", tmpret);
                                BUG();
                        }
                        /* a node died.  finish out the rest of the nodes. */
-                        mlog(ML_ERROR, "link to %d went down!\n", to);
+                        mlog(0, "link to %d went down!\n", to);
                        /* any nonzero status return will do */
                        ret = tmpret;
                } else if (r < 0) {
                        /* ok, something horribly messed.  kill thyself. */
                        mlog(ML_ERROR,"during assert master of %.*s to %u, "
                             "got %d.\n", namelen, lockname, to, r);
-                        dlm_dump_lock_resources(dlm);
+                        spin_lock(&dlm->spinlock);
+                        spin_lock(&dlm->master_lock);
+                        if (dlm_find_mle(dlm, &mle, (char *)lockname,
+                                         namelen)) {
+                                dlm_print_one_mle(mle);
+                                __dlm_put_mle(mle);
+                        }
+                        spin_unlock(&dlm->master_lock);
+                        spin_unlock(&dlm->spinlock);
                        BUG();
                } else if (r == EAGAIN) {
                        mlog(0, "%.*s: node %u create mles on other "
@@ -1612,7 +1691,7 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data)
        struct dlm_assert_master *assert = (struct dlm_assert_master *)msg->buf;
        struct dlm_lock_resource *res = NULL;
        char *name;
-        unsigned int namelen;
+        unsigned int namelen, hash;
        u32 flags;
        int master_request = 0;
        int ret = 0;
@@ -1622,6 +1701,7 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data)
        name = assert->name;
        namelen = assert->namelen;
+        hash = dlm_lockid_hash(name, namelen);
        flags = be32_to_cpu(assert->flags);
        if (namelen > DLM_LOCKID_NAME_MAX) {
@@ -1646,7 +1726,7 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data)
                if (bit >= O2NM_MAX_NODES) {
                        /* not necessarily an error, though less likely.
                         * could be master just re-asserting. */
-                        mlog(ML_ERROR, "no bits set in the maybe_map, but %u "
+                        mlog(0, "no bits set in the maybe_map, but %u "
                             "is asserting! (%.*s)\n", assert->node_idx,
                             namelen, name);
                } else if (bit != assert->node_idx) {
@@ -1658,19 +1738,36 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data)
                                 * number winning the mastery will respond
                                 * YES to mastery requests, but this node
                                 * had no way of knowing.  let it pass. */
-                                mlog(ML_ERROR, "%u is the lowest node, "
+                                mlog(0, "%u is the lowest node, "
                                     "%u is asserting. (%.*s)  %u must "
                                     "have begun after %u won.\n", bit,
                                     assert->node_idx, namelen, name, bit,
                                     assert->node_idx);
                        }
                }
+                if (mle->type == DLM_MLE_MIGRATION) {
+                        if (flags & DLM_ASSERT_MASTER_MLE_CLEANUP) {
+                                mlog(0, "%s:%.*s: got cleanup assert"
+                                     " from %u for migration\n",
+                                     dlm->name, namelen, name,
+                                     assert->node_idx);
+                        } else if (!(flags & DLM_ASSERT_MASTER_FINISH_MIGRATION)) {
+                                mlog(0, "%s:%.*s: got unrelated assert"
+                                     " from %u for migration, ignoring\n",
+                                     dlm->name, namelen, name,
+                                     assert->node_idx);
+                                __dlm_put_mle(mle);
+                                spin_unlock(&dlm->master_lock);
+                                spin_unlock(&dlm->spinlock);
+                                goto done;
+                        }       
+                }
        }
        spin_unlock(&dlm->master_lock);
        /* ok everything checks out with the MLE
         * now check to see if there is a lockres */
-        res = __dlm_lookup_lockres(dlm, name, namelen);
+        res = __dlm_lookup_lockres(dlm, name, namelen, hash);
        if (res) {
                spin_lock(&res->spinlock);
                if (res->state & DLM_LOCK_RES_RECOVERING)  {
@@ -1679,7 +1776,8 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data)
                        goto kill;
                }
                if (!mle) {
-                        if (res->owner != assert->node_idx) {
+                        if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN &&
+                            res->owner != assert->node_idx) {
                                mlog(ML_ERROR, "assert_master from "
                                          "%u, but current owner is "
                                          "%u! (%.*s)\n",
@@ -1732,6 +1830,7 @@ ok:
        if (mle) {
                int extra_ref = 0;
                int nn = -1;
+                int rr, err = 0;
                
                spin_lock(&mle->spinlock);
                if (mle->type == DLM_MLE_BLOCK || mle->type == DLM_MLE_MIGRATION)
@@ -1751,27 +1850,64 @@ ok:
                wake_up(&mle->wq);
                spin_unlock(&mle->spinlock);
-                if (mle->type == DLM_MLE_MIGRATION && res) {
+                if (res) {
-                        mlog(0, "finishing off migration of lockres %.*s, "
-                             "from %u to %u\n",
-                               res->lockname.len, res->lockname.name,
-                               dlm->node_num, mle->new_master);
                        spin_lock(&res->spinlock);
-                        res->state &= ~DLM_LOCK_RES_MIGRATING;
+                        if (mle->type == DLM_MLE_MIGRATION) {
-                        dlm_change_lockres_owner(dlm, res, mle->new_master);
+                                mlog(0, "finishing off migration of lockres %.*s, "
-                        BUG_ON(res->state & DLM_LOCK_RES_DIRTY);
+                                        "from %u to %u\n",
+                                        res->lockname.len, res->lockname.name,
+                                        dlm->node_num, mle->new_master);
+                                res->state &= ~DLM_LOCK_RES_MIGRATING;
+                                dlm_change_lockres_owner(dlm, res, mle->new_master);
+                                BUG_ON(res->state & DLM_LOCK_RES_DIRTY);
+                        } else {
+                                dlm_change_lockres_owner(dlm, res, mle->master);
+                        }
                        spin_unlock(&res->spinlock);
                }
-                /* master is known, detach if not already detached */
-                dlm_mle_detach_hb_events(dlm, mle);
+                /* master is known, detach if not already detached.
-                dlm_put_mle(mle);
+                 * ensures that only one assert_master call will happen
-                
+                 * on this mle. */
+                spin_lock(&dlm->spinlock);
+                spin_lock(&dlm->master_lock);
+                rr = atomic_read(&mle->mle_refs.refcount);
+                if (mle->inuse > 0) {
+                        if (extra_ref && rr < 3)
+                                err = 1;
+                        else if (!extra_ref && rr < 2)
+                                err = 1;
+                } else {
+                        if (extra_ref && rr < 2)
+                                err = 1;
+                        else if (!extra_ref && rr < 1)
+                                err = 1;
+                }
+                if (err) {
+                        mlog(ML_ERROR, "%s:%.*s: got assert master from %u "
+                             "that will mess up this node, refs=%d, extra=%d, "
+                             "inuse=%d\n", dlm->name, namelen, name,
+                             assert->node_idx, rr, extra_ref, mle->inuse);
+                        dlm_print_one_mle(mle);
+                }
+                list_del_init(&mle->list);
+                __dlm_mle_detach_hb_events(dlm, mle);
+                __dlm_put_mle(mle);
                if (extra_ref) {
                        /* the assert master message now balances the extra
                         * ref given by the master / migration request message.
                         * if this is the last put, it will be removed
                         * from the list. */
-                        dlm_put_mle(mle);
+                        __dlm_put_mle(mle);
+                }
+                spin_unlock(&dlm->master_lock);
+                spin_unlock(&dlm->spinlock);
+        } else if (res) {
+                if (res->owner != assert->node_idx) {
+                        mlog(0, "assert_master from %u, but current "
+                             "owner is %u (%.*s), no mle\n", assert->node_idx,
+                             res->owner, namelen, name);
                }
        }
@@ -1788,12 +1924,12 @@ done:
 kill:
        /* kill the caller! */
+        mlog(ML_ERROR, "Bad message received from another node.  Dumping state "
+             "and killing the other node now!  This node is OK and can continue.\n");
+        __dlm_print_one_lock_resource(res);
        spin_unlock(&res->spinlock);
        spin_unlock(&dlm->spinlock);
        dlm_lockres_put(res);
-        mlog(ML_ERROR, "Bad message received from another node.  Dumping state "
-             "and killing the other node now!  This node is OK and can continue.\n");
-        dlm_dump_lock_resources(dlm);
        dlm_put(dlm);
        return -EINVAL;
 }
@@ -1803,7 +1939,7 @@ int dlm_dispatch_assert_master(struct dlm_ctxt *dlm,
                               int ignore_higher, u8 request_from, u32 flags)
 {
        struct dlm_work_item *item;
-        item = kcalloc(1, sizeof(*item), GFP_KERNEL);
+        item = kcalloc(1, sizeof(*item), GFP_NOFS);
        if (!item)
                return -ENOMEM;
@@ -1825,7 +1961,7 @@ int dlm_dispatch_assert_master(struct dlm_ctxt *dlm,
        list_add_tail(&item->list, &dlm->work_list);
        spin_unlock(&dlm->work_lock);
-        schedule_work(&dlm->dispatched_work);
+        queue_work(dlm->dlm_worker, &dlm->dispatched_work);
        return 0;
 }
@@ -1866,6 +2002,23 @@ static void dlm_assert_master_worker(struct dlm_work_item *item, void *data)
                }
        }
+        /*
+         * If we're migrating this lock to someone else, we are no
+         * longer allowed to assert our own mastery.  OTOH, we need to
+         * prevent migration from starting while we're still asserting
+         * our dominance.  The reserved ast delays migration.
+         */
+        spin_lock(&res->spinlock);
+        if (res->state & DLM_LOCK_RES_MIGRATING) {
+                mlog(0, "Someone asked us to assert mastery, but we're "
+                     "in the middle of migration.  Skipping assert, "
+                     "the new master will handle that.\n");
+                spin_unlock(&res->spinlock);
+                goto put;
+        } else
+                __dlm_lockres_reserve_ast(res);
+        spin_unlock(&res->spinlock);
        /* this call now finishes out the nodemap
         * even if one or more nodes die */
        mlog(0, "worker about to master %.*s here, this=%u\n",
@@ -1875,9 +2028,14 @@ static void dlm_assert_master_worker(struct dlm_work_item *item, void *data)
                                   nodemap, flags);
        if (ret < 0) {
                /* no need to restart, we are done */
-                mlog_errno(ret);
+                if (!dlm_is_host_down(ret))
+                        mlog_errno(ret);
        }
+        /* Ok, we've asserted ourselves.  Let's let migration start. */
+        dlm_lockres_release_ast(dlm, res);
+put:
        dlm_lockres_put(res);
        mlog(0, "finished with dlm_assert_master_worker\n");
@@ -1916,6 +2074,7 @@ static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm,
                                BUG();
                        /* host is down, so answer for that node would be
                         * DLM_LOCK_RES_OWNER_UNKNOWN.  continue. */
+                        ret = 0;
                }
                if (master != DLM_LOCK_RES_OWNER_UNKNOWN) {
@@ -2016,14 +2175,14 @@ int dlm_migrate_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
         */
        ret = -ENOMEM;
-        mres = (struct dlm_migratable_lockres *) __get_free_page(GFP_KERNEL);
+        mres = (struct dlm_migratable_lockres *) __get_free_page(GFP_NOFS);
        if (!mres) {
                mlog_errno(ret);
                goto leave;
        }
        mle = (struct dlm_master_list_entry *) kmem_cache_alloc(dlm_mle_cache,
-                                                                GFP_KERNEL);
+                                                                GFP_NOFS);
        if (!mle) {
                mlog_errno(ret);
                goto leave;
@@ -2117,7 +2276,7 @@ fail:
         * take both dlm->spinlock and dlm->master_lock */
        spin_lock(&dlm->spinlock);
        spin_lock(&dlm->master_lock);
-        dlm_get_mle(mle);
+        dlm_get_mle_inuse(mle);
        spin_unlock(&dlm->master_lock);
        spin_unlock(&dlm->spinlock);
@@ -2134,7 +2293,10 @@ fail:
                /* migration failed, detach and clean up mle */
                dlm_mle_detach_hb_events(dlm, mle);
                dlm_put_mle(mle);
-                dlm_put_mle(mle);
+                dlm_put_mle_inuse(mle);
+                spin_lock(&res->spinlock);
+                res->state &= ~DLM_LOCK_RES_MIGRATING;
+                spin_unlock(&res->spinlock);
                goto leave;
        }
@@ -2164,8 +2326,8 @@ fail:
                        /* avoid hang during shutdown when migrating lockres 
                         * to a node which also goes down */
                        if (dlm_is_node_dead(dlm, target)) {
-                                mlog(0, "%s:%.*s: expected migration target %u "
+                                mlog(0, "%s:%.*s: expected migration "
-                                     "is no longer up.  restarting.\n",
+                                     "target %u is no longer up, restarting\n",
                                     dlm->name, res->lockname.len,
                                     res->lockname.name, target);
                                ret = -ERESTARTSYS;
@@ -2175,7 +2337,10 @@ fail:
                        /* migration failed, detach and clean up mle */
                        dlm_mle_detach_hb_events(dlm, mle);
                        dlm_put_mle(mle);
-                        dlm_put_mle(mle);
+                        dlm_put_mle_inuse(mle);
+                        spin_lock(&res->spinlock);
+                        res->state &= ~DLM_LOCK_RES_MIGRATING;
+                        spin_unlock(&res->spinlock);
                        goto leave;
                }
                /* TODO: if node died: stop, clean up, return error */
@@ -2191,7 +2356,7 @@ fail:
        /* master is known, detach if not already detached */
        dlm_mle_detach_hb_events(dlm, mle);
-        dlm_put_mle(mle);
+        dlm_put_mle_inuse(mle);
        ret = 0;
        dlm_lockres_calc_usage(dlm, res);
@@ -2462,7 +2627,7 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data)
        struct dlm_migrate_request *migrate = (struct dlm_migrate_request *) msg->buf;
        struct dlm_master_list_entry *mle = NULL, *oldmle = NULL;
        const char *name;
-        unsigned int namelen;
+        unsigned int namelen, hash;
        int ret = 0;
        if (!dlm_grab(dlm))
@@ -2470,10 +2635,11 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data)
        name = migrate->name;
        namelen = migrate->namelen;
+        hash = dlm_lockid_hash(name, namelen);
        /* preallocate.. if this fails, abort */
        mle = (struct dlm_master_list_entry *) kmem_cache_alloc(dlm_mle_cache,
-                                                         GFP_KERNEL);
+                                                         GFP_NOFS);
        if (!mle) {
                ret = -ENOMEM;
@@ -2482,7 +2648,7 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data)
        /* check for pre-existing lock */
        spin_lock(&dlm->spinlock);
-        res = __dlm_lookup_lockres(dlm, name, namelen);
+        res = __dlm_lookup_lockres(dlm, name, namelen, hash);
        spin_lock(&dlm->master_lock);
        if (res) {
@@ -2580,6 +2746,7 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
                        /* remove it from the list so that only one
                         * mle will be found */
                        list_del_init(&tmp->list);
+                        __dlm_mle_detach_hb_events(dlm, mle);
                }
                spin_unlock(&tmp->spinlock);
        }
@@ -2601,6 +2768,7 @@ void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node)
        struct list_head *iter, *iter2;
        struct dlm_master_list_entry *mle;
        struct dlm_lock_resource *res;
+        unsigned int hash;
        mlog_entry("dlm=%s, dead node=%u\n", dlm->name, dead_node);
 top:
@@ -2640,7 +2808,7 @@ top:
                                 * may result in the mle being unlinked and
                                 * freed, but there may still be a process
                                 * waiting in the dlmlock path which is fine. */
-                                mlog(ML_ERROR, "node %u was expected master\n",
+                                mlog(0, "node %u was expected master\n",
                                     dead_node);
                                atomic_set(&mle->woken, 1);
                                spin_unlock(&mle->spinlock);
@@ -2673,19 +2841,21 @@ top:
                /* remove from the list early.  NOTE: unlinking
                 * list_head while in list_for_each_safe */
+                __dlm_mle_detach_hb_events(dlm, mle);
                spin_lock(&mle->spinlock);
                list_del_init(&mle->list);
                atomic_set(&mle->woken, 1);
                spin_unlock(&mle->spinlock);
                wake_up(&mle->wq);
-                mlog(0, "node %u died during migration from "
+                mlog(0, "%s: node %u died during migration from "
-                     "%u to %u!\n", dead_node,
+                     "%u to %u!\n", dlm->name, dead_node,
                     mle->master, mle->new_master);
                /* if there is a lockres associated with this
                 * mle, find it and set its owner to UNKNOWN */
+                hash = dlm_lockid_hash(mle->u.name.name, mle->u.name.len);
                res = __dlm_lookup_lockres(dlm, mle->u.name.name,
-                                        mle->u.name.len);
+                                           mle->u.name.len, hash);
                if (res) {
                        /* unfortunately if we hit this rare case, our
                         * lock ordering is messed.  we need to drop
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 805cbabac051..594745fab0b5 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -95,11 +95,14 @@ static void dlm_reco_unlock_ast(void *astdata, enum dlm_status st);
 static void dlm_request_all_locks_worker(struct dlm_work_item *item,
                                         void *data);
 static void dlm_mig_lockres_worker(struct dlm_work_item *item, void *data);
+static int dlm_lockres_master_requery(struct dlm_ctxt *dlm,
+                                      struct dlm_lock_resource *res,
+                                      u8 *real_master);
 static u64 dlm_get_next_mig_cookie(void);
-static spinlock_t dlm_reco_state_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(dlm_reco_state_lock);
-static spinlock_t dlm_mig_cookie_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(dlm_mig_cookie_lock);
 static u64 dlm_mig_cookie = 1;
 static u64 dlm_get_next_mig_cookie(void)
@@ -115,12 +118,37 @@ static u64 dlm_get_next_mig_cookie(void)
        return c;
 }
+static inline void dlm_set_reco_dead_node(struct dlm_ctxt *dlm,
+                                          u8 dead_node)
+{
+        assert_spin_locked(&dlm->spinlock);
+        if (dlm->reco.dead_node != dead_node)
+                mlog(0, "%s: changing dead_node from %u to %u\n",
+                     dlm->name, dlm->reco.dead_node, dead_node);
+        dlm->reco.dead_node = dead_node;
+}
+static inline void dlm_set_reco_master(struct dlm_ctxt *dlm,
+                                       u8 master)
+{
+        assert_spin_locked(&dlm->spinlock);
+        mlog(0, "%s: changing new_master from %u to %u\n",
+             dlm->name, dlm->reco.new_master, master);
+        dlm->reco.new_master = master;
+}
+static inline void __dlm_reset_recovery(struct dlm_ctxt *dlm)
+{
+        assert_spin_locked(&dlm->spinlock);
+        clear_bit(dlm->reco.dead_node, dlm->recovery_map);
+        dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM);
+        dlm_set_reco_master(dlm, O2NM_INVALID_NODE_NUM);
+}
 static inline void dlm_reset_recovery(struct dlm_ctxt *dlm)
 {
        spin_lock(&dlm->spinlock);
-        clear_bit(dlm->reco.dead_node, dlm->recovery_map);
+        __dlm_reset_recovery(dlm);
-        dlm->reco.dead_node = O2NM_INVALID_NODE_NUM;
-        dlm->reco.new_master = O2NM_INVALID_NODE_NUM;
        spin_unlock(&dlm->spinlock);
 }
@@ -132,12 +160,21 @@ void dlm_dispatch_work(void *data)
        struct list_head *iter, *iter2;
        struct dlm_work_item *item;
        dlm_workfunc_t *workfunc;
+        int tot=0;
+        if (!dlm_joined(dlm))
+                return;
        spin_lock(&dlm->work_lock);
        list_splice_init(&dlm->work_list, &tmp_list);
        spin_unlock(&dlm->work_lock);
        list_for_each_safe(iter, iter2, &tmp_list) {
+                tot++;
+        }
+        mlog(0, "%s: work thread has %d work items\n", dlm->name, tot);
+        list_for_each_safe(iter, iter2, &tmp_list) {
                item = list_entry(iter, struct dlm_work_item, list);
                workfunc = item->func;
                list_del_init(&item->list);
@@ -220,6 +257,52 @@ void dlm_complete_recovery_thread(struct dlm_ctxt *dlm)
 *
 */
+static void dlm_print_reco_node_status(struct dlm_ctxt *dlm)
+{
+        struct dlm_reco_node_data *ndata;
+        struct dlm_lock_resource *res;
+        mlog(ML_NOTICE, "%s(%d): recovery info, state=%s, dead=%u, master=%u\n",
+             dlm->name, dlm->dlm_reco_thread_task->pid,
+             dlm->reco.state & DLM_RECO_STATE_ACTIVE ? "ACTIVE" : "inactive",
+             dlm->reco.dead_node, dlm->reco.new_master);
+        list_for_each_entry(ndata, &dlm->reco.node_data, list) {
+                char *st = "unknown";
+                switch (ndata->state) {
+                        case DLM_RECO_NODE_DATA_INIT:
+                                st = "init";
+                                break;
+                        case DLM_RECO_NODE_DATA_REQUESTING:
+                                st = "requesting";
+                                break;
+                        case DLM_RECO_NODE_DATA_DEAD:
+                                st = "dead";
+                                break;
+                        case DLM_RECO_NODE_DATA_RECEIVING:
+                                st = "receiving";
+                                break;
+                        case DLM_RECO_NODE_DATA_REQUESTED:
+                                st = "requested";
+                                break;
+                        case DLM_RECO_NODE_DATA_DONE:
+                                st = "done";
+                                break;
+                        case DLM_RECO_NODE_DATA_FINALIZE_SENT:
+                                st = "finalize-sent";
+                                break;
+                        default:
+                                st = "bad";
+                                break;
+                }
+                mlog(ML_NOTICE, "%s: reco state, node %u, state=%s\n",
+                     dlm->name, ndata->node_num, st);
+        }
+        list_for_each_entry(res, &dlm->reco.resources, recovering) {
+                mlog(ML_NOTICE, "%s: lockres %.*s on recovering list\n",
+                     dlm->name, res->lockname.len, res->lockname.name);
+        }
+}
 #define DLM_RECO_THREAD_TIMEOUT_MS (5 * 1000)
@@ -267,11 +350,23 @@ int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node)
 {
        int dead;
        spin_lock(&dlm->spinlock);
-        dead = test_bit(node, dlm->domain_map);
+        dead = !test_bit(node, dlm->domain_map);
        spin_unlock(&dlm->spinlock);
        return dead;
 }
+/* returns true if node is not (or is no longer) set in
+ * dlm->recovery_map, i.e. no recovery is pending for it */
+static int dlm_is_node_recovered(struct dlm_ctxt *dlm, u8 node)
+{
+        int recovered;
+        spin_lock(&dlm->spinlock);
+        recovered = !test_bit(node, dlm->recovery_map);
+        spin_unlock(&dlm->spinlock);
+        return recovered;
+}
 int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout)
 {
        if (timeout) {
@@ -290,6 +385,24 @@ int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout)
        return 0;
 }
+int dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout)
+{
+        if (timeout) {
+                mlog(0, "%s: waiting %dms for notification of "
+                     "recovery of node %u\n", dlm->name, timeout, node);
+                wait_event_timeout(dlm->dlm_reco_thread_wq,
+                           dlm_is_node_recovered(dlm, node),
+                           msecs_to_jiffies(timeout));
+        } else {
+                mlog(0, "%s: waiting indefinitely for notification "
+                     "of recovery of node %u\n", dlm->name, node);
+                wait_event(dlm->dlm_reco_thread_wq,
+                           dlm_is_node_recovered(dlm, node));
+        }
+        /* for now, return 0 */
+        return 0;
+}
 /* callers of the top-level api calls (dlmlock/dlmunlock) should
 * block on the dlm->reco.event when recovery is in progress.
 * the dlm recovery thread will set this state when it begins
@@ -308,6 +421,13 @@ static int dlm_in_recovery(struct dlm_ctxt *dlm)
 void dlm_wait_for_recovery(struct dlm_ctxt *dlm)
 {
+        if (dlm_in_recovery(dlm)) {
+                mlog(0, "%s: reco thread %d in recovery: "
+                     "state=%d, master=%u, dead=%u\n",
+                     dlm->name, dlm->dlm_reco_thread_task->pid,
+                     dlm->reco.state, dlm->reco.new_master,
+                     dlm->reco.dead_node);
+        }
        wait_event(dlm->reco.event, !dlm_in_recovery(dlm));
 }
@@ -341,7 +461,7 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
                mlog(0, "new master %u died while recovering %u!\n",
                     dlm->reco.new_master, dlm->reco.dead_node);
                /* unset the new_master, leave dead_node */
-                dlm->reco.new_master = O2NM_INVALID_NODE_NUM;
+                dlm_set_reco_master(dlm, O2NM_INVALID_NODE_NUM);
        }
        /* select a target to recover */
@@ -350,14 +470,14 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
                bit = find_next_bit (dlm->recovery_map, O2NM_MAX_NODES+1, 0);
                if (bit >= O2NM_MAX_NODES || bit < 0)
-                        dlm->reco.dead_node = O2NM_INVALID_NODE_NUM;
+                        dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM);
                else
-                        dlm->reco.dead_node = bit;
+                        dlm_set_reco_dead_node(dlm, bit);
        } else if (!test_bit(dlm->reco.dead_node, dlm->recovery_map)) {
                /* BUG? */
                mlog(ML_ERROR, "dead_node %u no longer in recovery map!\n",
                     dlm->reco.dead_node);
-                dlm->reco.dead_node = O2NM_INVALID_NODE_NUM;
+                dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM);
        }
        if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) {
@@ -366,7 +486,8 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
                /* return to main thread loop and sleep. */
                return 0;
        }
-        mlog(0, "recovery thread found node %u in the recovery map!\n",
+        mlog(0, "%s(%d):recovery thread found node %u in the recovery map!\n",
+             dlm->name, dlm->dlm_reco_thread_task->pid,
             dlm->reco.dead_node);
        spin_unlock(&dlm->spinlock);
@@ -389,8 +510,8 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
                }
                mlog(0, "another node will master this recovery session.\n");
        }
-        mlog(0, "dlm=%s, new_master=%u, this node=%u, dead_node=%u\n",
+        mlog(0, "dlm=%s (%d), new_master=%u, this node=%u, dead_node=%u\n",
-             dlm->name, dlm->reco.new_master,
+             dlm->name, dlm->dlm_reco_thread_task->pid, dlm->reco.new_master,
             dlm->node_num, dlm->reco.dead_node);
        /* it is safe to start everything back up here
@@ -402,11 +523,13 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
        return 0;
 master_here:
-        mlog(0, "mastering recovery of %s:%u here(this=%u)!\n",
+        mlog(0, "(%d) mastering recovery of %s:%u here(this=%u)!\n",
+             dlm->dlm_reco_thread_task->pid,
             dlm->name, dlm->reco.dead_node, dlm->node_num);
        status = dlm_remaster_locks(dlm, dlm->reco.dead_node);
        if (status < 0) {
+                /* we should never hit this anymore */
                mlog(ML_ERROR, "error %d remastering locks for node %u, "
                     "retrying.\n", status, dlm->reco.dead_node);
                /* yield a bit to allow any final network messages
@@ -433,9 +556,16 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
        int destroy = 0;
        int pass = 0;
-        status = dlm_init_recovery_area(dlm, dead_node);
+        do {
-        if (status < 0)
+                /* we have become recovery master.  there is no escaping
-                goto leave;
+                 * this, so just keep trying until we get it. */
+                status = dlm_init_recovery_area(dlm, dead_node);
+                if (status < 0) {
+                        mlog(ML_ERROR, "%s: failed to alloc recovery area, "
+                             "retrying\n", dlm->name);
+                        msleep(1000);
+                }
+        } while (status != 0);
        /* safe to access the node data list without a lock, since this
         * process is the only one to change the list */
@@ -452,16 +582,36 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
                        continue;
                }
-                status = dlm_request_all_locks(dlm, ndata->node_num, dead_node);
+                do {
-                if (status < 0) {
+                        status = dlm_request_all_locks(dlm, ndata->node_num,
-                        mlog_errno(status);
+                                                       dead_node);
-                        if (dlm_is_host_down(status))
+                        if (status < 0) {
-                                ndata->state = DLM_RECO_NODE_DATA_DEAD;
+                                mlog_errno(status);
-                        else {
+                                if (dlm_is_host_down(status)) {
-                                destroy = 1;
+                                        /* node died, ignore it for recovery */
-                                goto leave;
+                                        status = 0;
+                                        ndata->state = DLM_RECO_NODE_DATA_DEAD;
+                                        /* wait for the domain map to catch up
+                                         * with the network state. */
+                                        wait_event_timeout(dlm->dlm_reco_thread_wq,
+                                                           dlm_is_node_dead(dlm,
+                                                                ndata->node_num),
+                                                           msecs_to_jiffies(1000));
+                                        mlog(0, "waited 1 sec for %u, "
+                                             "dead? %s\n", ndata->node_num,
+                                             dlm_is_node_dead(dlm, ndata->node_num) ?
+                                             "yes" : "no");
+                                } else {
+                                        /* -ENOMEM on the other node */
+                                        mlog(0, "%s: node %u returned "
+                                             "%d during recovery, retrying "
+                                             "after a short wait\n",
+                                             dlm->name, ndata->node_num,
+                                             status);
+                                        msleep(100);
+                                }
                        }
-                }
+                } while (status != 0);
                switch (ndata->state) {
                        case DLM_RECO_NODE_DATA_INIT:
@@ -473,10 +623,9 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
                                mlog(0, "node %u died after requesting "
                                     "recovery info for node %u\n",
                                     ndata->node_num, dead_node);
-                                // start all over
+                                /* fine.  don't need this node's info.
-                                destroy = 1;
+                                 * continue without it. */
-                                status = -EAGAIN;
+                                break;
-                                goto leave;
                        case DLM_RECO_NODE_DATA_REQUESTING:
                                ndata->state = DLM_RECO_NODE_DATA_REQUESTED;
                                mlog(0, "now receiving recovery data from "
@@ -520,35 +669,26 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
                                        BUG();
                                        break;
                                case DLM_RECO_NODE_DATA_DEAD:
-                                        mlog(ML_NOTICE, "node %u died after "
+                                        mlog(0, "node %u died after "
                                             "requesting recovery info for "
                                             "node %u\n", ndata->node_num,
                                             dead_node);
-                                        spin_unlock(&dlm_reco_state_lock);
+                                        break;
-                                        // start all over
-                                        destroy = 1;
-                                        status = -EAGAIN;
-                                        /* instead of spinning like crazy here,
-                                         * wait for the domain map to catch up
-                                         * with the network state.  otherwise this
-                                         * can be hit hundreds of times before
-                                         * the node is really seen as dead. */
-                                        wait_event_timeout(dlm->dlm_reco_thread_wq,
-                                                           dlm_is_node_dead(dlm,
-                                                                ndata->node_num),
-                                                           msecs_to_jiffies(1000));
-                                        mlog(0, "waited 1 sec for %u, "
-                                             "dead? %s\n", ndata->node_num,
-                                             dlm_is_node_dead(dlm, ndata->node_num) ?
-                                             "yes" : "no");
-                                        goto leave;
                                case DLM_RECO_NODE_DATA_RECEIVING:
                                case DLM_RECO_NODE_DATA_REQUESTED:
+                                        mlog(0, "%s: node %u still in state %s\n",
+                                             dlm->name, ndata->node_num,
+                                             ndata->state==DLM_RECO_NODE_DATA_RECEIVING ?
+                                             "receiving" : "requested");
                                        all_nodes_done = 0;
                                        break;
                                case DLM_RECO_NODE_DATA_DONE:
+                                        mlog(0, "%s: node %u state is done\n",
+                                             dlm->name, ndata->node_num);
                                        break;
                                case DLM_RECO_NODE_DATA_FINALIZE_SENT:
+                                        mlog(0, "%s: node %u state is finalize\n",
+                                             dlm->name, ndata->node_num);
                                        break;
                        }
                }
@@ -578,7 +718,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
                             jiffies, dlm->reco.dead_node,
                             dlm->node_num, dlm->reco.new_master);
                        destroy = 1;
-                        status = ret;
+                        status = 0;
                        /* rescan everything marked dirty along the way */
                        dlm_kick_thread(dlm, NULL);
                        break;
@@ -591,7 +731,6 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
        }
-leave:
        if (destroy)
                dlm_destroy_recovery_area(dlm, dead_node);
@@ -617,7 +756,7 @@ static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node)
                }
                BUG_ON(num == dead_node);
-                ndata = kcalloc(1, sizeof(*ndata), GFP_KERNEL);
+                ndata = kcalloc(1, sizeof(*ndata), GFP_NOFS);
                if (!ndata) {
                        dlm_destroy_recovery_area(dlm, dead_node);
                        return -ENOMEM;
@@ -691,16 +830,25 @@ int dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data)
        if (!dlm_grab(dlm))
                return -EINVAL;
+        if (lr->dead_node != dlm->reco.dead_node) {
+                mlog(ML_ERROR, "%s: node %u sent dead_node=%u, but local "
+                     "dead_node is %u\n", dlm->name, lr->node_idx,
+                     lr->dead_node, dlm->reco.dead_node);
+                dlm_print_reco_node_status(dlm);
+                /* this is a hack: -ENOMEM presumably makes the sender retry */
+                dlm_put(dlm);
+                return -ENOMEM;
+        }
        BUG_ON(lr->dead_node != dlm->reco.dead_node);
-        item = kcalloc(1, sizeof(*item), GFP_KERNEL);
+        item = kcalloc(1, sizeof(*item), GFP_NOFS);
        if (!item) {
                dlm_put(dlm);
                return -ENOMEM;
        }
        /* this will get freed by dlm_request_all_locks_worker */
-        buf = (char *) __get_free_page(GFP_KERNEL);
+        buf = (char *) __get_free_page(GFP_NOFS);
        if (!buf) {
                kfree(item);
                dlm_put(dlm);
@@ -715,7 +863,7 @@ int dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data)
        spin_lock(&dlm->work_lock);
        list_add_tail(&item->list, &dlm->work_list);
        spin_unlock(&dlm->work_lock);
-        schedule_work(&dlm->dispatched_work);
+        queue_work(dlm->dlm_worker, &dlm->dispatched_work);
        dlm_put(dlm);
        return 0;
@@ -730,32 +878,34 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data)
        struct list_head *iter;
        int ret;
        u8 dead_node, reco_master;
+        int skip_all_done = 0;
        dlm = item->dlm;
        dead_node = item->u.ral.dead_node;
        reco_master = item->u.ral.reco_master;
        mres = (struct dlm_migratable_lockres *)data;
+        mlog(0, "%s: recovery worker started, dead=%u, master=%u\n",
+             dlm->name, dead_node, reco_master);
        if (dead_node != dlm->reco.dead_node ||
            reco_master != dlm->reco.new_master) {
-                /* show extra debug info if the recovery state is messed */
+                /* worker could have been created before the recovery master
-                mlog(ML_ERROR, "%s: bad reco state: reco(dead=%u, master=%u), "
+                 * died.  if so, do not continue, but do not error. */
-                     "request(dead=%u, master=%u)\n",
+                if (dlm->reco.new_master == O2NM_INVALID_NODE_NUM) {
-                     dlm->name, dlm->reco.dead_node, dlm->reco.new_master,
+                        mlog(ML_NOTICE, "%s: will not send recovery state, "
-                     dead_node, reco_master);
+                             "recovery master %u died, thread=(dead=%u,mas=%u)"
-                mlog(ML_ERROR, "%s: name=%.*s master=%u locks=%u/%u flags=%u "
+                             " current=(dead=%u,mas=%u)\n", dlm->name,
-                     "entry[0]={c=%u:%llu,l=%u,f=%u,t=%d,ct=%d,hb=%d,n=%u}\n",
+                             reco_master, dead_node, reco_master,
-                     dlm->name, mres->lockname_len, mres->lockname, mres->master,
+                             dlm->reco.dead_node, dlm->reco.new_master);
-                     mres->num_locks, mres->total_locks, mres->flags,
+                } else {
-                     dlm_get_lock_cookie_node(mres->ml[0].cookie),
+                        mlog(ML_NOTICE, "%s: reco state invalid: reco(dead=%u, "
-                     dlm_get_lock_cookie_seq(mres->ml[0].cookie),
+                             "master=%u), request(dead=%u, master=%u)\n",
-                     mres->ml[0].list, mres->ml[0].flags,
+                             dlm->name, dlm->reco.dead_node,
-                     mres->ml[0].type, mres->ml[0].convert_type,
+                             dlm->reco.new_master, dead_node, reco_master);
-                     mres->ml[0].highest_blocked, mres->ml[0].node);
+                }
-                BUG();
+                goto leave;
        }
-        BUG_ON(dead_node != dlm->reco.dead_node);
-        BUG_ON(reco_master != dlm->reco.new_master);
        /* lock resources should have already been moved to the
         * dlm->reco.resources list.  now move items from that list
@@ -766,12 +916,20 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data)
        dlm_move_reco_locks_to_list(dlm, &resources, dead_node);
        /* now we can begin blasting lockreses without the dlm lock */
+        /* any errors returned will be due to the new_master dying,
+         * the dlm_reco_thread should detect this */
        list_for_each(iter, &resources) {
                res = list_entry (iter, struct dlm_lock_resource, recovering);
                ret = dlm_send_one_lockres(dlm, res, mres, reco_master,
                                        DLM_MRES_RECOVERY);
-                if (ret < 0)
+                if (ret < 0) {
-                        mlog_errno(ret);
+                        mlog(ML_ERROR, "%s: node %u went down while sending "
+                             "recovery state for dead node %u, ret=%d\n", dlm->name,
+                             reco_master, dead_node, ret);
+                        skip_all_done = 1;
+                        break;
+                }
        }
        /* move the resources back to the list */
@@ -779,10 +937,15 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data)
        list_splice_init(&resources, &dlm->reco.resources);
        spin_unlock(&dlm->spinlock);
-        ret = dlm_send_all_done_msg(dlm, dead_node, reco_master);
+        if (!skip_all_done) {
-        if (ret < 0)
+                ret = dlm_send_all_done_msg(dlm, dead_node, reco_master);
-                mlog_errno(ret);
+                if (ret < 0) {
+                        mlog(ML_ERROR, "%s: node %u went down while sending "
+                             "recovery all-done for dead node %u, ret=%d\n",
+                             dlm->name, reco_master, dead_node, ret);
+                }
+        }
+leave:
        free_page((unsigned long)data);
 }
@@ -801,8 +964,14 @@ static int dlm_send_all_done_msg(struct dlm_ctxt *dlm, u8 dead_node, u8 send_to)
        ret = o2net_send_message(DLM_RECO_DATA_DONE_MSG, dlm->key, &done_msg,
                                 sizeof(done_msg), send_to, &tmpret);
-        /* negative status is ignored by the caller */
+        if (ret < 0) {
-        if (ret >= 0)
+                if (!dlm_is_host_down(ret)) {
+                        mlog_errno(ret);
+                        mlog(ML_ERROR, "%s: unknown error sending data-done "
+                             "to %u\n", dlm->name, send_to);
+                        BUG();
+                }
+        } else
                ret = tmpret;
        return ret;
 }
@@ -822,7 +991,11 @@ int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data)
        mlog(0, "got DATA DONE: dead_node=%u, reco.dead_node=%u, "
             "node_idx=%u, this node=%u\n", done->dead_node,
             dlm->reco.dead_node, done->node_idx, dlm->node_num);
-        BUG_ON(done->dead_node != dlm->reco.dead_node);
+        mlog_bug_on_msg((done->dead_node != dlm->reco.dead_node),
+                        "Got DATA DONE: dead_node=%u, reco.dead_node=%u, "
+                        "node_idx=%u, this node=%u\n", done->dead_node,
+                        dlm->reco.dead_node, done->node_idx, dlm->node_num);
        spin_lock(&dlm_reco_state_lock);
        list_for_each(iter, &dlm->reco.node_data) {
@@ -905,13 +1078,11 @@ static void dlm_move_reco_locks_to_list(struct dlm_ctxt *dlm,
                        mlog(0, "found lockres owned by dead node while "
                                  "doing recovery for node %u. sending it.\n",
                                  dead_node);
-                        list_del_init(&res->recovering);
+                        list_move_tail(&res->recovering, list);
-                        list_add_tail(&res->recovering, list);
                } else if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
                        mlog(0, "found UNKNOWN owner while doing recovery "
                                  "for node %u. sending it.\n", dead_node);
-                        list_del_init(&res->recovering);
+                        list_move_tail(&res->recovering, list);
-                        list_add_tail(&res->recovering, list);
                }
        }
        spin_unlock(&dlm->spinlock);
@@ -1023,8 +1194,9 @@ static int dlm_add_lock_to_array(struct dlm_lock *lock,
                    ml->type == LKM_PRMODE) {
                        /* if it is already set, this had better be a PR
                         * and it has to match */
-                        if (mres->lvb[0] && (ml->type == LKM_EXMODE ||
+                        if (!dlm_lvb_is_empty(mres->lvb) &&
-                            memcmp(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN))) {
+                            (ml->type == LKM_EXMODE ||
+                             memcmp(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN))) {
                                mlog(ML_ERROR, "mismatched lvbs!\n");
                                __dlm_print_one_lock_resource(lock->lockres);
                                BUG();
@@ -1083,22 +1255,25 @@ int dlm_send_one_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
                         * we must send it immediately. */
                        ret = dlm_send_mig_lockres_msg(dlm, mres, send_to,
                                                       res, total_locks);
-                        if (ret < 0) {
+                        if (ret < 0)
-                                // TODO
+                                goto error;
-                                mlog(ML_ERROR, "dlm_send_mig_lockres_msg "
-                                     "returned %d, TODO\n", ret);
-                                BUG();
-                        }
                }
        }
        /* flush any remaining locks */
        ret = dlm_send_mig_lockres_msg(dlm, mres, send_to, res, total_locks);
-        if (ret < 0) {
+        if (ret < 0)
-                // TODO
+                goto error;
-                mlog(ML_ERROR, "dlm_send_mig_lockres_msg returned %d, "
+        return ret;
-                     "TODO\n", ret);
+error:
+        mlog(ML_ERROR, "%s: dlm_send_mig_lockres_msg returned %d\n",
+             dlm->name, ret);
+        if (!dlm_is_host_down(ret))
                BUG();
-        }
+        mlog(0, "%s: node %u went down while sending %s "
+             "lockres %.*s\n", dlm->name, send_to,
+             flags & DLM_MRES_RECOVERY ?  "recovery" : "migration",
+             res->lockname.len, res->lockname.name);
        return ret;
 }
@@ -1146,8 +1321,8 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data)
                mlog(0, "all done flag.  all lockres data received!\n");
        ret = -ENOMEM;
-        buf = kmalloc(be16_to_cpu(msg->data_len), GFP_KERNEL);
+        buf = kmalloc(be16_to_cpu(msg->data_len), GFP_NOFS);
-        item = kcalloc(1, sizeof(*item), GFP_KERNEL);
+        item = kcalloc(1, sizeof(*item), GFP_NOFS);
        if (!buf || !item)
                goto leave;
@@ -1238,7 +1413,7 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data)
        spin_lock(&dlm->work_lock);
        list_add_tail(&item->list, &dlm->work_list);
        spin_unlock(&dlm->work_lock);
-        schedule_work(&dlm->dispatched_work);
+        queue_work(dlm->dlm_worker, &dlm->dispatched_work);
 leave:
        dlm_put(dlm);
@@ -1312,8 +1487,9 @@ leave:
-int dlm_lockres_master_requery(struct dlm_ctxt *dlm,
+static int dlm_lockres_master_requery(struct dlm_ctxt *dlm,
-                               struct dlm_lock_resource *res, u8 *real_master)
+                                      struct dlm_lock_resource *res,
+                                      u8 *real_master)
 {
        struct dlm_node_iter iter;
        int nodenum;
@@ -1406,6 +1582,7 @@ int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data)
        struct dlm_ctxt *dlm = data;
        struct dlm_master_requery *req = (struct dlm_master_requery *)msg->buf;
        struct dlm_lock_resource *res = NULL;
+        unsigned int hash;
        int master = DLM_LOCK_RES_OWNER_UNKNOWN;
        u32 flags = DLM_ASSERT_MASTER_REQUERY;
@@ -1415,8 +1592,10 @@ int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data)
                return master;
        }
+        hash = dlm_lockid_hash(req->name, req->namelen);
        spin_lock(&dlm->spinlock);
-        res = __dlm_lookup_lockres(dlm, req->name, req->namelen);
+        res = __dlm_lookup_lockres(dlm, req->name, req->namelen, hash);
        if (res) {
                spin_lock(&res->spinlock);
                master = res->owner;
@@ -1483,7 +1662,7 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
        struct dlm_lock *newlock = NULL;
        struct dlm_lockstatus *lksb = NULL;
        int ret = 0;
-        int i;
+        int i, bad;
        struct list_head *iter;
        struct dlm_lock *lock = NULL;
@@ -1529,8 +1708,7 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
                        /* move the lock to its proper place */
                        /* do not alter lock refcount.  switching lists. */
-                        list_del_init(&lock->list);
+                        list_move_tail(&lock->list, queue);
-                        list_add_tail(&lock->list, queue);
                        spin_unlock(&res->spinlock);
                        mlog(0, "just reordered a local lock!\n");
@@ -1553,28 +1731,48 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
                }
                lksb->flags |= (ml->flags &
                                (DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB));
-                        
-                if (mres->lvb[0]) {
+                if (ml->type == LKM_NLMODE)
+                        goto skip_lvb;
+                if (!dlm_lvb_is_empty(mres->lvb)) {
                        if (lksb->flags & DLM_LKSB_PUT_LVB) {
                                /* other node was trying to update
                                 * lvb when node died.  recreate the
                                 * lksb with the updated lvb. */
                                memcpy(lksb->lvb, mres->lvb, DLM_LVB_LEN);
+                                /* the lock resource lvb update must happen
+                                 * NOW, before the spinlock is dropped.
+                                 * we no longer wait for the AST to update
+                                 * the lvb. */
+                                memcpy(res->lvb, mres->lvb, DLM_LVB_LEN);
                        } else {
                                /* otherwise, the node is sending its 
                                 * most recent valid lvb info */
                                BUG_ON(ml->type != LKM_EXMODE &&
                                       ml->type != LKM_PRMODE);
-                                if (res->lvb[0] && (ml->type == LKM_EXMODE ||
+                                if (!dlm_lvb_is_empty(res->lvb) &&
-                                    memcmp(res->lvb, mres->lvb, DLM_LVB_LEN))) {
+                                    (ml->type == LKM_EXMODE ||
-                                        mlog(ML_ERROR, "received bad lvb!\n");
+                                     memcmp(res->lvb, mres->lvb, DLM_LVB_LEN))) {
-                                        __dlm_print_one_lock_resource(res);
+                                        int i;
-                                        BUG();
+                                        mlog(ML_ERROR, "%s:%.*s: received bad "
+                                             "lvb! type=%d\n", dlm->name,
+                                             res->lockname.len,
+                                             res->lockname.name, ml->type);
+                                        printk("lockres lvb=[");
+                                        for (i=0; i<DLM_LVB_LEN; i++)
+                                                printk("%02x", res->lvb[i]);
+                                        printk("]\nmigrated lvb=[");
+                                        for (i=0; i<DLM_LVB_LEN; i++)
+                                                printk("%02x", mres->lvb[i]);
+                                        printk("]\n");
+                                        dlm_print_one_lock_resource(res);
+                                        BUG();
                                }
                                memcpy(res->lvb, mres->lvb, DLM_LVB_LEN);
                        }
                }
+skip_lvb:
                /* NOTE:
                 * wrt lock queue ordering and recovery:
@@ -1592,9 +1790,33 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
                 * relative to each other, but clearly *not*
                 * preserved relative to locks from other nodes.
                 */
+                bad = 0;
                spin_lock(&res->spinlock);
-                dlm_lock_get(newlock);
+                list_for_each_entry(lock, queue, list) {
-                list_add_tail(&newlock->list, queue);
+                        if (lock->ml.cookie == ml->cookie) {
+                                u64 c = lock->ml.cookie;
+                                mlog(ML_ERROR, "%s:%.*s: %u:%llu: lock already "
+                                     "exists on this lockres!\n", dlm->name,
+                                     res->lockname.len, res->lockname.name,
+                                     dlm_get_lock_cookie_node(c),
+                                     dlm_get_lock_cookie_seq(c));
+                                mlog(ML_NOTICE, "sent lock: type=%d, conv=%d, "
+                                     "node=%u, cookie=%u:%llu, queue=%d\n",
+                                     ml->type, ml->convert_type, ml->node,
+                                     dlm_get_lock_cookie_node(ml->cookie),
+                                     dlm_get_lock_cookie_seq(ml->cookie),
+                                     ml->list);
+                                __dlm_print_one_lock_resource(res);
+                                bad = 1;
+                                break;
+                        }
+                }
+                if (!bad) {
+                        dlm_lock_get(newlock);
+                        list_add_tail(&newlock->list, queue);
+                }
                spin_unlock(&res->spinlock);
        }
        mlog(0, "done running all the locks\n");
@@ -1618,8 +1840,14 @@ void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm,
        struct dlm_lock *lock;
        res->state |= DLM_LOCK_RES_RECOVERING;
-        if (!list_empty(&res->recovering))
+        if (!list_empty(&res->recovering)) {
+                mlog(0,
+                     "Recovering res %s:%.*s, is already on recovery list!\n",
+                     dlm->name, res->lockname.len, res->lockname.name);
                list_del_init(&res->recovering);
+        }
+        /* We need to hold a reference while on the recovery list */
+        dlm_lockres_get(res);
        list_add_tail(&res->recovering, &dlm->reco.resources);
        /* find any pending locks and put them back on proper list */
@@ -1708,9 +1936,11 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
                        spin_lock(&res->spinlock);
                        dlm_change_lockres_owner(dlm, res, new_master);
                        res->state &= ~DLM_LOCK_RES_RECOVERING;
-                        __dlm_dirty_lockres(dlm, res);
+                        if (!__dlm_lockres_unused(res))
+                                __dlm_dirty_lockres(dlm, res);
                        spin_unlock(&res->spinlock);
                        wake_up(&res->wq);
+                        dlm_lockres_put(res);
                }
        }
@@ -1719,7 +1949,7 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
         * the RECOVERING state and set the owner
         * if necessary */
        for (i = 0; i < DLM_HASH_BUCKETS; i++) {
-                bucket = &(dlm->lockres_hash[i]);
+                bucket = dlm_lockres_hash(dlm, i);
                hlist_for_each_entry(res, hash_iter, bucket, hash_node) {
                        if (res->state & DLM_LOCK_RES_RECOVERING) {
                                if (res->owner == dead_node) {
@@ -1743,11 +1973,13 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
                                             dlm->name, res->lockname.len,
                                             res->lockname.name, res->owner);
                                        list_del_init(&res->recovering);
+                                        dlm_lockres_put(res);
                                }
                                spin_lock(&res->spinlock);
                                dlm_change_lockres_owner(dlm, res, new_master);
                                res->state &= ~DLM_LOCK_RES_RECOVERING;
-                                __dlm_dirty_lockres(dlm, res);
+                                if (!__dlm_lockres_unused(res))
+                                        __dlm_dirty_lockres(dlm, res);
                                spin_unlock(&res->spinlock);
                                wake_up(&res->wq);
                        }
@@ -1884,7 +2116,7 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
         *    need to be fired as a result.
         */
        for (i = 0; i < DLM_HASH_BUCKETS; i++) {
-                bucket = &(dlm->lockres_hash[i]);
+                bucket = dlm_lockres_hash(dlm, i);
                hlist_for_each_entry(res, iter, bucket, hash_node) {
                        /* always prune any $RECOVERY entries for dead nodes,
                         * otherwise hangs can occur during later recovery */
@@ -1924,6 +2156,20 @@ static void __dlm_hb_node_down(struct dlm_ctxt *dlm, int idx)
 {
        assert_spin_locked(&dlm->spinlock);
+        if (dlm->reco.new_master == idx) {
+                mlog(0, "%s: recovery master %d just died\n",
+                     dlm->name, idx);
+                if (dlm->reco.state & DLM_RECO_STATE_FINALIZE) {
+                        /* finalize1 was reached, so it is safe to clear
+                         * the new_master and dead_node.  that recovery
+                         * is complete. */
+                        mlog(0, "%s: dead master %d had reached "
+                             "finalize1 state, clearing\n", dlm->name, idx);
+                        dlm->reco.state &= ~DLM_RECO_STATE_FINALIZE;
+                        __dlm_reset_recovery(dlm);
+                }
+        }
        /* check to see if the node is already considered dead */
        if (!test_bit(idx, dlm->live_nodes_map)) {
                mlog(0, "for domain %s, node %d is already dead. "
@@ -2087,7 +2333,7 @@ again:
                        /* set the new_master to this node */
                        spin_lock(&dlm->spinlock);
-                        dlm->reco.new_master = dlm->node_num;
+                        dlm_set_reco_master(dlm, dlm->node_num);
                        spin_unlock(&dlm->spinlock);
                }
@@ -2125,6 +2371,10 @@ again:
                mlog(0, "%s: reco master %u is ready to recover %u\n",
                     dlm->name, dlm->reco.new_master, dlm->reco.dead_node);
                status = -EEXIST;
+        } else if (ret == DLM_RECOVERING) {
+                mlog(0, "dlm=%s dlmlock says master node died (this=%u)\n",
+                     dlm->name, dlm->node_num);
+                goto again;
        } else {
                struct dlm_lock_resource *res;
@@ -2156,7 +2406,7 @@ static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node)
        mlog_entry("%u\n", dead_node);
-        mlog(0, "dead node is %u\n", dead_node);
+        mlog(0, "%s: dead node is %u\n", dlm->name, dead_node);
        spin_lock(&dlm->spinlock);
        dlm_node_iter_init(dlm->domain_map, &iter);
@@ -2214,6 +2464,14 @@ retry:
                         * another ENOMEM */
                        msleep(100);
                        goto retry;
+                } else if (ret == EAGAIN) {
+                        mlog(0, "%s: trying to start recovery of node "
+                             "%u, but node %u is waiting for last recovery "
+                             "to complete, backoff for a bit\n", dlm->name,
+                             dead_node, nodenum);
+                        /* TODO Look into replacing msleep with cond_resched() */
+                        msleep(100);
+                        goto retry;
                }
        }
@@ -2229,8 +2487,20 @@ int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data)
        if (!dlm_grab(dlm))
                return 0;
-        mlog(0, "node %u wants to recover node %u\n",
+        spin_lock(&dlm->spinlock);
-                  br->node_idx, br->dead_node);
+        if (dlm->reco.state & DLM_RECO_STATE_FINALIZE) {
+                mlog(0, "%s: node %u wants to recover node %u (%u:%u) "
+                     "but this node is in finalize state, waiting on finalize2\n",
+                     dlm->name, br->node_idx, br->dead_node,
+                     dlm->reco.dead_node, dlm->reco.new_master);
+                spin_unlock(&dlm->spinlock);
+                return EAGAIN;
+        }
+        spin_unlock(&dlm->spinlock);
+        mlog(0, "%s: node %u wants to recover node %u (%u:%u)\n",
+             dlm->name, br->node_idx, br->dead_node,
+             dlm->reco.dead_node, dlm->reco.new_master);
        dlm_fire_domain_eviction_callbacks(dlm, br->dead_node);
@@ -2252,8 +2522,8 @@ int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data)
                     "node %u changing it to %u\n", dlm->name, 
                     dlm->reco.dead_node, br->node_idx, br->dead_node);
        }
-        dlm->reco.new_master = br->node_idx;
+        dlm_set_reco_master(dlm, br->node_idx);
-        dlm->reco.dead_node = br->dead_node;
+        dlm_set_reco_dead_node(dlm, br->dead_node);
        if (!test_bit(br->dead_node, dlm->recovery_map)) {
                mlog(0, "recovery master %u sees %u as dead, but this "
                     "node has not yet.  marking %u as dead\n",
@@ -2272,10 +2542,16 @@ int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data)
        spin_unlock(&dlm->spinlock);
        dlm_kick_recovery_thread(dlm);
+        mlog(0, "%s: recovery started by node %u, for %u (%u:%u)\n",
+             dlm->name, br->node_idx, br->dead_node,
+             dlm->reco.dead_node, dlm->reco.new_master);
        dlm_put(dlm);
        return 0;
 }
+#define DLM_FINALIZE_STAGE2  0x01
 static int dlm_send_finalize_reco_message(struct dlm_ctxt *dlm)
 {
        int ret = 0;
@@ -2283,25 +2559,31 @@ static int dlm_send_finalize_reco_message(struct dlm_ctxt *dlm)
        struct dlm_node_iter iter;
        int nodenum;
        int status;
+        int stage = 1;
-        mlog(0, "finishing recovery for node %s:%u\n",
+        mlog(0, "finishing recovery for node %s:%u, "
-             dlm->name, dlm->reco.dead_node);
+             "stage %d\n", dlm->name, dlm->reco.dead_node, stage);
        spin_lock(&dlm->spinlock);
        dlm_node_iter_init(dlm->domain_map, &iter);
        spin_unlock(&dlm->spinlock);
+stage2:
        memset(&fr, 0, sizeof(fr));
        fr.node_idx = dlm->node_num;
        fr.dead_node = dlm->reco.dead_node;
+        if (stage == 2)
+                fr.flags |= DLM_FINALIZE_STAGE2;
        while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
                if (nodenum == dlm->node_num)
                        continue;
                ret = o2net_send_message(DLM_FINALIZE_RECO_MSG, dlm->key,
                                         &fr, sizeof(fr), nodenum, &status);
-                if (ret >= 0) {
+                if (ret >= 0)
                        ret = status;
+                if (ret < 0) {
+                        mlog_errno(ret);
                        if (dlm_is_host_down(ret)) {
                                /* this has no effect on this recovery 
                                 * session, so set the status to zero to 
@@ -2309,13 +2591,17 @@ static int dlm_send_finalize_reco_message(struct dlm_ctxt *dlm)
                                mlog(ML_ERROR, "node %u went down after this "
                                     "node finished recovery.\n", nodenum);
                                ret = 0;
+                                continue;
                        }
-                }
-                if (ret < 0) {
-                        mlog_errno(ret);
                        break;
                }
        }
+        if (stage == 1) {
+                /* reset the node_iter back to the top and send finalize2 */
+                iter.curnode = -1;
+                stage = 2;
+                goto stage2;
+        }
        return ret;
 }
@@ -2324,14 +2610,19 @@ int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data)
 {
        struct dlm_ctxt *dlm = data;
        struct dlm_finalize_reco *fr = (struct dlm_finalize_reco *)msg->buf;
+        int stage = 1;
        /* ok to return 0, domain has gone away */
        if (!dlm_grab(dlm))
                return 0;
-        mlog(0, "node %u finalizing recovery of node %u\n",
+        if (fr->flags & DLM_FINALIZE_STAGE2)
-             fr->node_idx, fr->dead_node);
+                stage = 2;
+        mlog(0, "%s: node %u finalizing recovery stage%d of "
+             "node %u (%u:%u)\n", dlm->name, fr->node_idx, stage,
+             fr->dead_node, dlm->reco.dead_node, dlm->reco.new_master);
+ 
        spin_lock(&dlm->spinlock);
        if (dlm->reco.new_master != fr->node_idx) {
@@ -2347,13 +2638,41 @@ int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data)
                BUG();
        }
-        dlm_finish_local_lockres_recovery(dlm, fr->dead_node, fr->node_idx);
+        switch (stage) {
+                case 1:
-        spin_unlock(&dlm->spinlock);
+                        dlm_finish_local_lockres_recovery(dlm, fr->dead_node, fr->node_idx);
+                        if (dlm->reco.state & DLM_RECO_STATE_FINALIZE) {
+                                mlog(ML_ERROR, "%s: received finalize1 from "
+                                     "new master %u for dead node %u, but "
+                                     "this node has already received it!\n",
+                                     dlm->name, fr->node_idx, fr->dead_node);
+                                dlm_print_reco_node_status(dlm);
+                                BUG();
+                        }
+                        dlm->reco.state |= DLM_RECO_STATE_FINALIZE;
+                        spin_unlock(&dlm->spinlock);
+                        break;
+                case 2:
+                        if (!(dlm->reco.state & DLM_RECO_STATE_FINALIZE)) {
+                                mlog(ML_ERROR, "%s: received finalize2 from "
+                                     "new master %u for dead node %u, but "
+                                     "this node did not have finalize1!\n",
+                                     dlm->name, fr->node_idx, fr->dead_node);
+                                dlm_print_reco_node_status(dlm);
+                                BUG();
+                        }
+                        dlm->reco.state &= ~DLM_RECO_STATE_FINALIZE;
+                        spin_unlock(&dlm->spinlock);
+                        dlm_reset_recovery(dlm);
+                        dlm_kick_recovery_thread(dlm);
+                        break;
+                default:
+                        BUG();
+        }
-        dlm_reset_recovery(dlm);
+        mlog(0, "%s: recovery done, reco master was %u, dead now %u, master now %u\n",
+             dlm->name, fr->node_idx, dlm->reco.dead_node, dlm->reco.new_master);
-        dlm_kick_recovery_thread(dlm);
        dlm_put(dlm);
        return 0;
 }
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 5be9d14f12cb..0c822f3ffb05 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -39,6 +39,7 @@
 #include <linux/inet.h>
 #include <linux/timer.h>
 #include <linux/kthread.h>
+#include <linux/delay.h>
 #include "cluster/heartbeat.h"
@@ -53,6 +54,8 @@
 #include "cluster/masklog.h"
 static int dlm_thread(void *data);
+static void dlm_purge_lockres_now(struct dlm_ctxt *dlm,
+                                  struct dlm_lock_resource *lockres);
 static void dlm_flush_asts(struct dlm_ctxt *dlm);
@@ -80,7 +83,7 @@ repeat:
 }
-static int __dlm_lockres_unused(struct dlm_lock_resource *res)
+int __dlm_lockres_unused(struct dlm_lock_resource *res)
 {
        if (list_empty(&res->granted) &&
            list_empty(&res->converting) &&
@@ -103,6 +106,20 @@ void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
        assert_spin_locked(&res->spinlock);
        if (__dlm_lockres_unused(res)){
+                /* For now, just keep any resource we master */
+                if (res->owner == dlm->node_num)
+                {
+                        if (!list_empty(&res->purge)) {
+                                mlog(0, "we master %s:%.*s, but it is on "
+                                     "the purge list.  Removing\n",
+                                     dlm->name, res->lockname.len,
+                                     res->lockname.name);
+                                list_del_init(&res->purge);
+                                dlm->purge_count--;
+                        }
+                        return;
+                }
                if (list_empty(&res->purge)) {
                        mlog(0, "putting lockres %.*s from purge list\n",
                             res->lockname.len, res->lockname.name);
@@ -110,10 +127,23 @@ void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
                        res->last_used = jiffies;
                        list_add_tail(&res->purge, &dlm->purge_list);
                        dlm->purge_count++;
+                        /* if this node is not the owner, there is
+                         * no way to keep track of who the owner could be.
+                         * unhash it to avoid serious problems. */
+                        if (res->owner != dlm->node_num) {
+                                mlog(0, "%s:%.*s: doing immediate "
+                                     "purge of lockres owned by %u\n",
+                                     dlm->name, res->lockname.len,
+                                     res->lockname.name, res->owner);
+                                dlm_purge_lockres_now(dlm, res);
+                        }
                }
        } else if (!list_empty(&res->purge)) {
-                mlog(0, "removing lockres %.*s from purge list\n",
+                mlog(0, "removing lockres %.*s from purge list, "
-                     res->lockname.len, res->lockname.name);
+                     "owner=%u\n", res->lockname.len, res->lockname.name,
+                     res->owner);
                list_del_init(&res->purge);
                dlm->purge_count--;
@@ -165,6 +195,7 @@ again:
        } else if (ret < 0) {
                mlog(ML_NOTICE, "lockres %.*s: migrate failed, retrying\n",
                     lockres->lockname.len, lockres->lockname.name);
+                msleep(100);
                goto again;
        }
@@ -178,6 +209,24 @@ finish:
        __dlm_unhash_lockres(lockres);
 }
+/* make an unused lockres go away immediately.
+ * as soon as the dlm spinlock is dropped, this lockres
+ * will not be found. kfree still happens on last put. */
+static void dlm_purge_lockres_now(struct dlm_ctxt *dlm,
+                                  struct dlm_lock_resource *lockres)
+{
+        assert_spin_locked(&dlm->spinlock);
+        assert_spin_locked(&lockres->spinlock);
+        BUG_ON(!__dlm_lockres_unused(lockres));
+        if (!list_empty(&lockres->purge)) {
+                list_del_init(&lockres->purge);
+                dlm->purge_count--;
+        }
+        __dlm_unhash_lockres(lockres);
+}
 static void dlm_run_purge_list(struct dlm_ctxt *dlm,
                               int purge_now)
 {
@@ -318,8 +367,7 @@ converting:
                target->ml.type = target->ml.convert_type;
                target->ml.convert_type = LKM_IVMODE;
-                list_del_init(&target->list);
+                list_move_tail(&target->list, &res->granted);
-                list_add_tail(&target->list, &res->granted);
                BUG_ON(!target->lksb);
                target->lksb->status = DLM_NORMAL;
@@ -380,8 +428,7 @@ blocked:
                     target->ml.type, target->ml.node);
                // target->ml.type is already correct
-                list_del_init(&target->list);
+                list_move_tail(&target->list, &res->granted);
-                list_add_tail(&target->list, &res->granted);
                BUG_ON(!target->lksb);
                target->lksb->status = DLM_NORMAL;
@@ -422,6 +469,8 @@ void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
        /* don't shuffle secondary queues */
        if ((res->owner == dlm->node_num) &&
            !(res->state & DLM_LOCK_RES_DIRTY)) {
+                /* ref for dirty_list */
+                dlm_lockres_get(res);
                list_add_tail(&res->dirty, &dlm->dirty_list);
                res->state |= DLM_LOCK_RES_DIRTY;
        }
@@ -606,6 +655,8 @@ static int dlm_thread(void *data)
                        list_del_init(&res->dirty);
                        spin_unlock(&res->spinlock);
                        spin_unlock(&dlm->spinlock);
+                        /* Drop dirty_list ref */
+                        dlm_lockres_put(res);
                        /* lockres can be re-dirtied/re-added to the
                         * dirty_list in this gap, but that is ok */
@@ -642,8 +693,9 @@ static int dlm_thread(void *data)
                         * spinlock and do NOT have the dlm lock.
                         * safe to reserve/queue asts and run the lists. */
-                        mlog(0, "calling dlm_shuffle_lists with dlm=%p, "
+                        mlog(0, "calling dlm_shuffle_lists with dlm=%s, "
-                             "res=%p\n", dlm, res);
+                             "res=%.*s\n", dlm->name,
+                             res->lockname.len, res->lockname.name);
                        /* called while holding lockres lock */
                        dlm_shuffle_lists(dlm, res);
@@ -657,6 +709,8 @@ in_progress:
                        /* if the lock was in-progress, stick
                         * it on the back of the list */
                        if (delay) {
+                                /* ref for dirty_list */
+                                dlm_lockres_get(res);
                                spin_lock(&res->spinlock);
                                list_add_tail(&res->dirty, &dlm->dirty_list);
                                res->state |= DLM_LOCK_RES_DIRTY;
@@ -677,7 +731,7 @@ in_progress:
                /* yield and continue right away if there is more work to do */
                if (!n) {
-                        yield();
+                        cond_resched();
                        continue;
                }
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
index 7b1a27542674..b0c3134f4f70 100644
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -271,8 +271,7 @@ void dlm_commit_pending_unlock(struct dlm_lock_resource *res,
 void dlm_commit_pending_cancel(struct dlm_lock_resource *res,
                               struct dlm_lock *lock)
 {
-        list_del_init(&lock->list);
+        list_move_tail(&lock->list, &res->granted);
-        list_add_tail(&lock->list, &res->granted);
        lock->ml.convert_type = LKM_IVMODE;
 }
@@ -319,6 +318,16 @@ static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm,
        mlog_entry("%.*s\n", res->lockname.len, res->lockname.name);
+        if (owner == dlm->node_num) {
+                /* ended up trying to contact ourself.  this means
+                 * that the lockres had been remote but became local
+                 * via a migration.  just retry it, now as local */
+                mlog(0, "%s:%.*s: this node became the master due to a "
+                     "migration, re-evaluate now\n", dlm->name,
+                     res->lockname.len, res->lockname.name);
+                return DLM_FORWARD;
+        }
        memset(&unlock, 0, sizeof(unlock));
        unlock.node_idx = dlm->node_num;
        unlock.flags = cpu_to_be32(flags);
diff --git a/fs/ocfs2/dlm/userdlm.c b/fs/ocfs2/dlm/userdlm.c
index 74ca4e5f9765..e641b084b343 100644
--- a/fs/ocfs2/dlm/userdlm.c
+++ b/fs/ocfs2/dlm/userdlm.c
@@ -672,7 +672,7 @@ struct dlm_ctxt *user_dlm_register_context(struct qstr *name)
        u32 dlm_key;
        char *domain;
-        domain = kmalloc(name->len + 1, GFP_KERNEL);
+        domain = kmalloc(name->len + 1, GFP_NOFS);
        if (!domain) {
                mlog_errno(-ENOMEM);
                return ERR_PTR(-ENOMEM);
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 64cd52860c87..762eb1fbb34d 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -242,7 +242,7 @@ static void ocfs2_build_lock_name(enum ocfs2_lock_type type,
        mlog_exit_void();
 }
-static spinlock_t ocfs2_dlm_tracking_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(ocfs2_dlm_tracking_lock);
 static void ocfs2_add_lockres_tracking(struct ocfs2_lock_res *res,
                                       struct ocfs2_dlm_debug *dlm_debug)
@@ -2071,8 +2071,7 @@ int ocfs2_dlm_init(struct ocfs2_super *osb)
        }
        /* launch vote thread */
-        osb->vote_task = kthread_run(ocfs2_vote_thread, osb, "ocfs2vote-%d",
+        osb->vote_task = kthread_run(ocfs2_vote_thread, osb, "ocfs2vote");
-                                     osb->osb_id);
        if (IS_ERR(osb->vote_task)) {
                status = PTR_ERR(osb->vote_task);
                osb->vote_task = NULL;
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index 1a5c69071df6..fcd4475d1f89 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -298,7 +298,7 @@ static int ocfs2_extent_map_find_leaf(struct inode *inode,
                ret = ocfs2_extent_map_insert(inode, rec,
                                              le16_to_cpu(el->l_tree_depth));
-                if (ret) {
+                if (ret && (ret != -EEXIST)) {
                        mlog_errno(ret);
                        goto out_free;
                }
@@ -427,6 +427,11 @@ static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em,
 /*
 * Simple rule: on any return code other than -EAGAIN, anything left
 * in the insert_context will be freed.
+ *
+ * Simple rule #2: A return code of -EEXIST from this function or
+ * its calls to ocfs2_extent_map_insert_entry() signifies that another
+ * thread beat us to the insert.  It is not an actual error, but it
+ * tells the caller we have no more work to do.
 */
 static int ocfs2_extent_map_try_insert(struct inode *inode,
                                       struct ocfs2_extent_rec *rec,
@@ -448,22 +453,32 @@ static int ocfs2_extent_map_try_insert(struct inode *inode,
                goto out_unlock;
        }
+        /* Since insert_entry failed, the map MUST have old_ent */
        old_ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos),
-                                          le32_to_cpu(rec->e_clusters), NULL,
+                                          le32_to_cpu(rec->e_clusters),
-                                          NULL);
+                                          NULL, NULL);
        BUG_ON(!old_ent);
-        ret = -EEXIST;
+        if (old_ent->e_tree_depth < tree_depth) {
-        if (old_ent->e_tree_depth < tree_depth)
+                /* Another thread beat us to the lower tree_depth */
+                ret = -EEXIST;
                goto out_unlock;
+        }
        if (old_ent->e_tree_depth == tree_depth) {
+                /*
+                 * Another thread beat us to this tree_depth.
+                 * Let's make sure we agree with that thread (the
+                 * extent_rec should be identical).
+                 */
                if (!memcmp(rec, &old_ent->e_rec,
                            sizeof(struct ocfs2_extent_rec)))
                        ret = 0;
+                else
+                        /* FIXME: Should this be ESRCH/EBADR??? */
+                        ret = -EEXIST;
-                /* FIXME: Should this be ESRCH/EBADR??? */
                goto out_unlock;
        }
@@ -599,7 +614,7 @@ static int ocfs2_extent_map_insert(struct inode *inode,
                                                  tree_depth, &ctxt);
        } while (ret == -EAGAIN);
-        if (ret < 0)
+        if ((ret < 0) && (ret != -EEXIST))
                mlog_errno(ret);
        if (ctxt.left_ent)
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 84c507961287..35140f6cf840 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -114,7 +114,7 @@ static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode)
 extern kmem_cache_t *ocfs2_inode_cache;
-extern struct address_space_operations ocfs2_aops;
+extern const struct address_space_operations ocfs2_aops;
 struct buffer_head *ocfs2_bread(struct inode *inode, int block,
                                int *err, int reada);
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index eebc3cfa6be8..f92bf1dd379a 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -49,7 +49,7 @@
 #include "buffer_head_io.h"
-spinlock_t trans_inc_lock = SPIN_LOCK_UNLOCKED;
+DEFINE_SPINLOCK(trans_inc_lock);
 static int ocfs2_force_read_journal(struct inode *inode);
 static int ocfs2_recover_node(struct ocfs2_super *osb,
@@ -222,8 +222,7 @@ void ocfs2_handle_add_inode(struct ocfs2_journal_handle *handle,
        BUG_ON(!list_empty(&OCFS2_I(inode)->ip_handle_list));
        OCFS2_I(inode)->ip_handle = handle;
-        list_del(&(OCFS2_I(inode)->ip_handle_list));
+        list_move_tail(&(OCFS2_I(inode)->ip_handle_list), &(handle->inode_list));
-        list_add_tail(&(OCFS2_I(inode)->ip_handle_list), &(handle->inode_list));
 }
 static void ocfs2_handle_unlock_inodes(struct ocfs2_journal_handle *handle)
@@ -785,8 +784,7 @@ int ocfs2_journal_load(struct ocfs2_journal *journal)
        }
        /* Launch the commit thread */
-        osb->commit_task = kthread_run(ocfs2_commit_thread, osb, "ocfs2cmt-%d",
+        osb->commit_task = kthread_run(ocfs2_commit_thread, osb, "ocfs2cmt");
-                                       osb->osb_id);
        if (IS_ERR(osb->commit_task)) {
                status = PTR_ERR(osb->commit_task);
                osb->commit_task = NULL;
@@ -1119,7 +1117,7 @@ void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num)
                goto out;
        osb->recovery_thread_task =  kthread_run(__ocfs2_recovery_thread, osb,
-                                                 "ocfs2rec-%d", osb->osb_id);
+                                                 "ocfs2rec");
        if (IS_ERR(osb->recovery_thread_task)) {
                mlog_errno((int)PTR_ERR(osb->recovery_thread_task));
                osb->recovery_thread_task = NULL;
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 843cf9ddefe8..83934e33e5b0 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -46,12 +46,12 @@ static struct page *ocfs2_nopage(struct vm_area_struct * area,
                                 unsigned long address,
                                 int *type)
 {
-        struct inode *inode = area->vm_file->f_dentry->d_inode;
        struct page *page = NOPAGE_SIGBUS;
        sigset_t blocked, oldset;
        int ret;
-        mlog_entry("(inode %lu, address %lu)\n", inode->i_ino, address);
+        mlog_entry("(area=%p, address=%lu, type=%p)\n", area, address,
+                   type);
        /* The best way to deal with signals in this path is
         * to block them upfront, rather than allowing the
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index da1093039c01..cd4a6f253d13 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -184,7 +184,6 @@ struct ocfs2_journal;
 struct ocfs2_journal_handle;
 struct ocfs2_super
 {
-        u32 osb_id;             /* id used by the proc interface */
        struct task_struct *commit_task;
        struct super_block *sb;
        struct inode *root_inode;
@@ -222,13 +221,11 @@ struct ocfs2_super
        unsigned long s_mount_opt;
        u16 max_slots;
-        u16 num_nodes;
        s16 node_num;
        s16 slot_num;
        int s_sectsize_bits;
        int s_clustersize;
        int s_clustersize_bits;
-        struct proc_dir_entry *proc_sub_dir; /* points to /proc/fs/ocfs2/<maj_min> */
        atomic_t vol_state;
        struct mutex recovery_lock;
@@ -294,7 +291,6 @@ struct ocfs2_super
 };
 #define OCFS2_SB(sb)        ((struct ocfs2_super *)(sb)->s_fs_info)
-#define OCFS2_MAX_OSB_ID             65536
 static inline int ocfs2_should_order_data(struct inode *inode)
 {
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index 871627961d6d..aa6f5aadedc4 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -264,7 +264,7 @@ int ocfs2_find_slot(struct ocfs2_super *osb)
        osb->slot_num = slot;
        spin_unlock(&si->si_lock);
-        mlog(ML_NOTICE, "taking node slot %d\n", osb->slot_num);
+        mlog(0, "taking node slot %d\n", osb->slot_num);
        status = ocfs2_update_disk_slots(osb, si);
        if (status < 0)
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 949b3dac30f1..382706a67ffd 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -68,13 +68,6 @@
 #include "buffer_head_io.h"
-/*
- * Globals
- */
-static spinlock_t ocfs2_globals_lock = SPIN_LOCK_UNLOCKED;
-static u32 osb_id;             /* Keeps track of next available OSB Id */
 static kmem_cache_t *ocfs2_inode_cachep = NULL;
 kmem_cache_t *ocfs2_lock_cache = NULL;
@@ -100,7 +93,7 @@ static int ocfs2_initialize_mem_caches(void);
 static void ocfs2_free_mem_caches(void);
 static void ocfs2_delete_osb(struct ocfs2_super *osb);
-static int ocfs2_statfs(struct super_block *sb, struct kstatfs *buf);
+static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf);
 static int ocfs2_sync_fs(struct super_block *sb, int wait);
@@ -642,10 +635,9 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
        ocfs2_complete_mount_recovery(osb);
-        printk("ocfs2: Mounting device (%u,%u) on (node %d, slot %d) with %s "
+        printk(KERN_INFO "ocfs2: Mounting device (%s) on (node %d, slot %d) "
-               "data mode.\n",
+               "with %s data mode.\n",
-               MAJOR(sb->s_dev), MINOR(sb->s_dev), osb->node_num,
+               osb->dev_str, osb->node_num, osb->slot_num,
-               osb->slot_num,
               osb->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK ? "writeback" :
               "ordered");
@@ -672,12 +664,14 @@ read_super_error:
        return status;
 }
-static struct super_block *ocfs2_get_sb(struct file_system_type *fs_type,
+static int ocfs2_get_sb(struct file_system_type *fs_type,
-                                        int flags,
+                        int flags,
-                                        const char *dev_name,
+                        const char *dev_name,
-                                        void *data)
+                        void *data,
+                        struct vfsmount *mnt)
 {
-        return get_sb_bdev(fs_type, flags, dev_name, data, ocfs2_fill_super);
+        return get_sb_bdev(fs_type, flags, dev_name, data, ocfs2_fill_super,
+                           mnt);
 }
 static struct file_system_type ocfs2_fs_type = {
@@ -798,10 +792,6 @@ static int __init ocfs2_init(void)
                goto leave;
        }
-        spin_lock(&ocfs2_globals_lock);
-        osb_id = 0;
-        spin_unlock(&ocfs2_globals_lock);
        ocfs2_debugfs_root = debugfs_create_dir("ocfs2", NULL);
        if (!ocfs2_debugfs_root) {
                status = -EFAULT;
@@ -855,7 +845,7 @@ static void ocfs2_put_super(struct super_block *sb)
        mlog_exit_void();
 }
-static int ocfs2_statfs(struct super_block *sb, struct kstatfs *buf)
+static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
        struct ocfs2_super *osb;
        u32 numbits, freebits;
@@ -864,9 +854,9 @@ static int ocfs2_statfs(struct super_block *sb, struct kstatfs *buf)
        struct buffer_head *bh = NULL;
        struct inode *inode = NULL;
-        mlog_entry("(%p, %p)\n", sb, buf);
+        mlog_entry("(%p, %p)\n", dentry->d_sb, buf);
-        osb = OCFS2_SB(sb);
+        osb = OCFS2_SB(dentry->d_sb);
        inode = ocfs2_get_system_file_inode(osb,
                                            GLOBAL_BITMAP_SYSTEM_INODE,
@@ -889,7 +879,7 @@ static int ocfs2_statfs(struct super_block *sb, struct kstatfs *buf)
        freebits = numbits - le32_to_cpu(bm_lock->id1.bitmap1.i_used);
        buf->f_type = OCFS2_SUPER_MAGIC;
-        buf->f_bsize = sb->s_blocksize;
+        buf->f_bsize = dentry->d_sb->s_blocksize;
        buf->f_namelen = OCFS2_MAX_FILENAME_LEN;
        buf->f_blocks = ((sector_t) numbits) *
                        (osb->s_clustersize >> osb->sb->s_blocksize_bits);
@@ -1018,7 +1008,7 @@ static int ocfs2_fill_local_node_info(struct ocfs2_super *osb)
                goto bail;
        }
-        mlog(ML_NOTICE, "I am node %d\n", osb->node_num);
+        mlog(0, "I am node %d\n", osb->node_num);
        status = 0;
 bail:
@@ -1189,8 +1179,8 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
        atomic_set(&osb->vol_state, VOLUME_DISMOUNTED);
-        printk("ocfs2: Unmounting device (%u,%u) on (node %d)\n",
+        printk(KERN_INFO "ocfs2: Unmounting device (%s) on (node %d)\n",
-               MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev), osb->node_num);
+               osb->dev_str, osb->node_num);
        ocfs2_delete_osb(osb);
        kfree(osb);
@@ -1210,8 +1200,6 @@ static int ocfs2_setup_osb_uuid(struct ocfs2_super *osb, const unsigned char *uu
        if (osb->uuid_str == NULL)
                return -ENOMEM;
-        memcpy(osb->uuid, uuid, OCFS2_VOL_UUID_LEN);
        for (i = 0, ptr = osb->uuid_str; i < OCFS2_VOL_UUID_LEN; i++) {
                /* print with null */
                ret = snprintf(ptr, 3, "%02X", uuid[i]);
@@ -1309,13 +1297,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
                goto bail;
        }
-        osb->uuid = kmalloc(OCFS2_VOL_UUID_LEN, GFP_KERNEL);
-        if (!osb->uuid) {
-                mlog(ML_ERROR, "unable to alloc uuid\n");
-                status = -ENOMEM;
-                goto bail;
-        }
        di = (struct ocfs2_dinode *)bh->b_data;
        osb->max_slots = le16_to_cpu(di->id2.i_super.s_max_slots);
@@ -1325,7 +1306,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
                status = -EINVAL;
                goto bail;
        }
-        mlog(ML_NOTICE, "max_slots for this device: %u\n", osb->max_slots);
+        mlog(0, "max_slots for this device: %u\n", osb->max_slots);
        init_waitqueue_head(&osb->osb_wipe_event);
        osb->osb_orphan_wipes = kcalloc(osb->max_slots,
@@ -1416,7 +1397,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
                goto bail;
        }
-        memcpy(&uuid_net_key, &osb->uuid[i], sizeof(osb->net_key));
+        memcpy(&uuid_net_key, di->id2.i_super.s_uuid, sizeof(uuid_net_key));
        osb->net_key = le32_to_cpu(uuid_net_key);
        strncpy(osb->vol_label, di->id2.i_super.s_label, 63);
@@ -1482,18 +1463,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
                goto bail;
        }
-        /*  Link this osb onto the global linked list of all osb structures. */
-        /*  The Global Link List is mainted for the whole driver . */
-        spin_lock(&ocfs2_globals_lock);
-        osb->osb_id = osb_id;
-        if (osb_id < OCFS2_MAX_OSB_ID)
-                osb_id++;
-        else {
-                mlog(ML_ERROR, "Too many volumes mounted\n");
-                status = -ENOMEM;
-        }
-        spin_unlock(&ocfs2_globals_lock);
 bail:
        mlog_exit(status);
        return status;
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c
index f6986bd79e75..c0f68aa6c175 100644
--- a/fs/ocfs2/symlink.c
+++ b/fs/ocfs2/symlink.c
@@ -64,8 +64,7 @@ static char *ocfs2_page_getlink(struct dentry * dentry,
 {
        struct page * page;
        struct address_space *mapping = dentry->d_inode->i_mapping;
-        page = read_cache_page(mapping, 0,
+        page = read_mapping_page(mapping, 0, NULL);
-                               (filler_t *)mapping->a_ops->readpage, NULL);
        if (IS_ERR(page))
                goto sync_fail;
        wait_on_page_locked(page);
@@ -155,7 +154,7 @@ static void *ocfs2_follow_link(struct dentry *dentry,
        }
        status = vfs_follow_link(nd, link);
-        if (status)
+        if (status && status != -ENOENT)
                mlog_errno(status);
 bail:
        if (page) {
diff --git a/fs/ocfs2/vote.c b/fs/ocfs2/vote.c
index ee42765a8553..cf70fe2075b8 100644
--- a/fs/ocfs2/vote.c
+++ b/fs/ocfs2/vote.c
@@ -988,9 +988,7 @@ int ocfs2_request_mount_vote(struct ocfs2_super *osb)
        }
 bail:
-        if (request)
+        kfree(request);
-                kfree(request);
        return status;
 }
@@ -1021,9 +1019,7 @@ int ocfs2_request_umount_vote(struct ocfs2_super *osb)
        }
 bail:
-        if (request)
+        kfree(request);
-                kfree(request);
        return status;
 }
diff --git a/fs/open.c b/fs/open.c
index 317b7c7f38a7..303f06d2a7b9 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -31,18 +31,18 @@
 #include <asm/unistd.h>
-int vfs_statfs(struct super_block *sb, struct kstatfs *buf)
+int vfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
        int retval = -ENODEV;
-        if (sb) {
+        if (dentry) {
                retval = -ENOSYS;
-                if (sb->s_op->statfs) {
+                if (dentry->d_sb->s_op->statfs) {
                        memset(buf, 0, sizeof(*buf));
-                        retval = security_sb_statfs(sb);
+                        retval = security_sb_statfs(dentry);
                        if (retval)
                                return retval;
-                        retval = sb->s_op->statfs(sb, buf);
+                        retval = dentry->d_sb->s_op->statfs(dentry, buf);
                        if (retval == 0 && buf->f_frsize == 0)
                                buf->f_frsize = buf->f_bsize;
                }
@@ -52,12 +52,12 @@ int vfs_statfs(struct super_block *sb, struct kstatfs *buf)
 EXPORT_SYMBOL(vfs_statfs);
-static int vfs_statfs_native(struct super_block *sb, struct statfs *buf)
+static int vfs_statfs_native(struct dentry *dentry, struct statfs *buf)
 {
        struct kstatfs st;
        int retval;
-        retval = vfs_statfs(sb, &st);
+        retval = vfs_statfs(dentry, &st);
        if (retval)
                return retval;
@@ -95,12 +95,12 @@ static int vfs_statfs_native(struct super_block *sb, struct statfs *buf)
        return 0;
 }
-static int vfs_statfs64(struct super_block *sb, struct statfs64 *buf)
+static int vfs_statfs64(struct dentry *dentry, struct statfs64 *buf)
 {
        struct kstatfs st;
        int retval;
-        retval = vfs_statfs(sb, &st);
+        retval = vfs_statfs(dentry, &st);
        if (retval)
                return retval;
@@ -130,7 +130,7 @@ asmlinkage long sys_statfs(const char __user * path, struct statfs __user * buf)
        error = user_path_walk(path, &nd);
        if (!error) {
                struct statfs tmp;
-                error = vfs_statfs_native(nd.dentry->d_inode->i_sb, &tmp);
+                error = vfs_statfs_native(nd.dentry, &tmp);
                if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
                        error = -EFAULT;
                path_release(&nd);
@@ -149,7 +149,7 @@ asmlinkage long sys_statfs64(const char __user *path, size_t sz, struct statfs64
        error = user_path_walk(path, &nd);
        if (!error) {
                struct statfs64 tmp;
-                error = vfs_statfs64(nd.dentry->d_inode->i_sb, &tmp);
+                error = vfs_statfs64(nd.dentry, &tmp);
                if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
                        error = -EFAULT;
                path_release(&nd);
@@ -168,7 +168,7 @@ asmlinkage long sys_fstatfs(unsigned int fd, struct statfs __user * buf)
        file = fget(fd);
        if (!file)
                goto out;
-        error = vfs_statfs_native(file->f_dentry->d_inode->i_sb, &tmp);
+        error = vfs_statfs_native(file->f_dentry, &tmp);
        if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
                error = -EFAULT;
        fput(file);
@@ -189,7 +189,7 @@ asmlinkage long sys_fstatfs64(unsigned int fd, size_t sz, struct statfs64 __user
        file = fget(fd);
        if (!file)
                goto out;
-        error = vfs_statfs64(file->f_dentry->d_inode->i_sb, &tmp);
+        error = vfs_statfs64(file->f_dentry, &tmp);
        if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
                error = -EFAULT;
        fput(file);
@@ -322,7 +322,7 @@ static long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
        error = locks_verify_truncate(inode, file, length);
        if (!error)
-                error = do_truncate(dentry, length, 0, file);
+                error = do_truncate(dentry, length, ATTR_MTIME|ATTR_CTIME, file);
 out_putf:
        fput(file);
 out:
@@ -633,7 +633,7 @@ asmlinkage long sys_fchmod(unsigned int fd, mode_t mode)
        dentry = file->f_dentry;
        inode = dentry->d_inode;
-        audit_inode(NULL, inode, 0);
+        audit_inode(NULL, inode);
        err = -EROFS;
        if (IS_RDONLY(inode))
@@ -786,7 +786,7 @@ asmlinkage long sys_fchown(unsigned int fd, uid_t user, gid_t group)
        if (file) {
                struct dentry * dentry;
                dentry = file->f_dentry;
-                audit_inode(NULL, dentry->d_inode, 0);
+                audit_inode(NULL, dentry->d_inode);
                error = chown_common(dentry, user, group);
                fput(file);
        }
@@ -1152,7 +1152,7 @@ int filp_close(struct file *filp, fl_owner_t id)
        }
        if (filp->f_op && filp->f_op->flush)
-                retval = filp->f_op->flush(filp);
+                retval = filp->f_op->flush(filp, id);
        dnotify_flush(filp, id);
        locks_remove_posix(filp, id);
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index 0f14276a2e51..93a56bd4a2b7 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -1,5 +1,4 @@
-/* $Id: inode.c,v 1.15 2001/11/12 09:43:39 davem Exp $
+/* inode.c: /proc/openprom handling routines
- * openpromfs.c: /proc/openprom handling routines
 *
 * Copyright (C) 1996-1999 Jakub Jelinek  (jakub@redhat.com)
 * Copyright (C) 1998      Eddie C. Dost  (ecd@skynet.be)
@@ -12,756 +11,245 @@
 #include <linux/openprom_fs.h>
 #include <linux/init.h>
 #include <linux/slab.h>
-#include <linux/smp_lock.h>
+#include <linux/seq_file.h>
 #include <asm/openprom.h>
 #include <asm/oplib.h>
+#include <asm/prom.h>
 #include <asm/uaccess.h>
-#define ALIASES_NNODES 64
+static DEFINE_MUTEX(op_mutex);
-typedef struct {
+#define OPENPROM_ROOT_INO       0
-        u16     parent;
-        u16     next;
+enum op_inode_type {
-        u16     child;
+        op_inode_node,
-        u16     first_prop;
+        op_inode_prop,
-        u32     node;
+};
-} openpromfs_node;
+union op_inode_data {
-typedef struct {
+        struct device_node      *node;
-#define OPP_STRING      0x10
+        struct property         *prop;
-#define OPP_STRINGLIST  0x20
+};
-#define OPP_BINARY      0x40
-#define OPP_HEXSTRING   0x80
-#define OPP_DIRTY       0x01
-#define OPP_QUOTED      0x02
-#define OPP_NOTQUOTED   0x04
-#define OPP_ASCIIZ      0x08
-        u32     flag;
-        u32     alloclen;
-        u32     len;
-        char    *value;
-        char    name[8];
-} openprom_property;
-static openpromfs_node *nodes;
-static int alloced;
-static u16 last_node;
-static u16 first_prop;
-static u16 options = 0xffff;
-static u16 aliases = 0xffff;
-static int aliases_nodes;
-static char *alias_names [ALIASES_NNODES];
-#define OPENPROM_ROOT_INO       16
-#define OPENPROM_FIRST_INO      OPENPROM_ROOT_INO
-#define NODE(ino) nodes[ino - OPENPROM_FIRST_INO]
-#define NODE2INO(node) (node + OPENPROM_FIRST_INO)
-#define NODEP2INO(no) (no + OPENPROM_FIRST_INO + last_node)
-static int openpromfs_create (struct inode *, struct dentry *, int, struct nameidata *);
-static int openpromfs_readdir(struct file *, void *, filldir_t);
-static struct dentry *openpromfs_lookup(struct inode *, struct dentry *dentry, struct nameidata *nd);
-static int openpromfs_unlink (struct inode *, struct dentry *dentry);
-static ssize_t nodenum_read(struct file *file, char __user *buf,
+struct op_inode_info {
-                            size_t count, loff_t *ppos)
+        struct inode            vfs_inode;
+        enum op_inode_type      type;
+        union op_inode_data     u;
+};
+static inline struct op_inode_info *OP_I(struct inode *inode)
 {
-        struct inode *inode = file->f_dentry->d_inode;
+        return container_of(inode, struct op_inode_info, vfs_inode);
-        char buffer[10];
-        
-        if (count < 0 || !inode->u.generic_ip)
-                return -EINVAL;
-        sprintf (buffer, "%8.8x\n", (u32)(long)(inode->u.generic_ip));
-        if (file->f_pos >= 9)
-                return 0;
-        if (count > 9 - file->f_pos)
-                count = 9 - file->f_pos;
-        if (copy_to_user(buf, buffer + file->f_pos, count))
-                return -EFAULT;
-        *ppos += count;
-        return count;
 }
-static ssize_t property_read(struct file *filp, char __user *buf,
+static int is_string(unsigned char *p, int len)
-                             size_t count, loff_t *ppos)
 {
-        struct inode *inode = filp->f_dentry->d_inode;
+        int i;
-        int i, j, k;
-        u32 node;
-        char *p, *s;
-        u32 *q;
-        openprom_property *op;
-        char buffer[64];
-        
-        if (!filp->private_data) {
-                node = nodes[(u16)((long)inode->u.generic_ip)].node;
-                i = ((u32)(long)inode->u.generic_ip) >> 16;
-                if ((u16)((long)inode->u.generic_ip) == aliases) {
-                        if (i >= aliases_nodes)
-                                p = NULL;
-                        else
-                                p = alias_names [i];
-                } else
-                        for (p = prom_firstprop (node, buffer);
-                             i && p && *p;
-                             p = prom_nextprop (node, p, buffer), i--)
-                                /* nothing */ ;
-                if (!p || !*p)
-                        return -EIO;
-                i = prom_getproplen (node, p);
-                if (i < 0) {
-                        if ((u16)((long)inode->u.generic_ip) == aliases)
-                                i = 0;
-                        else
-                                return -EIO;
-                }
-                k = i;
-                if (i < 64) i = 64;
-                filp->private_data = kmalloc (sizeof (openprom_property)
-                                              + (j = strlen (p)) + 2 * i,
-                                              GFP_KERNEL);
-                if (!filp->private_data)
-                        return -ENOMEM;
-                op = (openprom_property *)filp->private_data;
-                op->flag = 0;
-                op->alloclen = 2 * i;
-                strcpy (op->name, p);
-                op->value = (char *)(((unsigned long)(op->name + j + 4)) & ~3);
-                op->len = k;
-                if (k && prom_getproperty (node, p, op->value, i) < 0)
-                        return -EIO;
-                op->value [k] = 0;
-                if (k) {
-                        for (s = NULL, p = op->value; p < op->value + k; p++) {
-                                if ((*p >= ' ' && *p <= '~') || *p == '\n') {
-                                        op->flag |= OPP_STRING;
-                                        s = p;
-                                        continue;
-                                }
-                                if (p > op->value && !*p && s == p - 1) {
-                                        if (p < op->value + k - 1)
-                                                op->flag |= OPP_STRINGLIST;
-                                        else
-                                                op->flag |= OPP_ASCIIZ;
-                                        continue;
-                                }
-                                if (k == 1 && !*p) {
-                                        op->flag |= (OPP_STRING|OPP_ASCIIZ);
-                                        break;
-                                }
-                                op->flag &= ~(OPP_STRING|OPP_STRINGLIST);
-                                if (k & 3)
-                                        op->flag |= OPP_HEXSTRING;
-                                else
-                                        op->flag |= OPP_BINARY;
-                                break;
-                        }
-                        if (op->flag & OPP_STRINGLIST)
-                                op->flag &= ~(OPP_STRING);
-                        if (op->flag & OPP_ASCIIZ)
-                                op->len--;
-                }
-        } else
-                op = (openprom_property *)filp->private_data;
-        if (!count || !(op->len || (op->flag & OPP_ASCIIZ)))
-                return 0;
-        if (*ppos >= 0xffffff || count >= 0xffffff)
-                return -EINVAL;
-        if (op->flag & OPP_STRINGLIST) {
-                for (k = 0, p = op->value; p < op->value + op->len; p++)
-                        if (!*p)
-                                k++;
-                i = op->len + 4 * k + 3;
-        } else if (op->flag & OPP_STRING) {
-                i = op->len + 3;
-        } else if (op->flag & OPP_BINARY) {
-                i = (op->len * 9) >> 2;
-        } else {
-                i = (op->len << 1) + 1;
-        }
-        k = *ppos;
-        if (k >= i) return 0;
-        if (count > i - k) count = i - k;
-        if (op->flag & OPP_STRING) {
-                if (!k) {
-                        if (put_user('\'', buf))
-                                return -EFAULT;
-                        k++;
-                        count--;
-                }
-                if (k + count >= i - 2)
+        for (i = 0; i < len; i++) {
-                        j = i - 2 - k;
+                unsigned char val = p[i];
-                else
-                        j = count;
-                if (j >= 0) {
-                        if (copy_to_user(buf + k - *ppos,
-                                         op->value + k - 1, j))
-                                return -EFAULT;
-                        count -= j;
-                        k += j;
-                }
-                if (count) {
+                if ((i && !val) ||
-                        if (put_user('\'', &buf [k++ - *ppos]))
+                    (val >= ' ' && val <= '~'))
-                                return -EFAULT;
+                        continue;
-                }
-                if (count > 1) {
-                        if (put_user('\n', &buf [k++ - *ppos]))
-                                return -EFAULT;
-                }
-        } else if (op->flag & OPP_STRINGLIST) {
-                char *tmp;
-                tmp = kmalloc (i, GFP_KERNEL);
-                if (!tmp)
-                        return -ENOMEM;
-                s = tmp;
-                *s++ = '\'';
-                for (p = op->value; p < op->value + op->len; p++) {
-                        if (!*p) {
-                                strcpy(s, "' + '");
-                                s += 5;
-                                continue;
-                        }
-                        *s++ = *p;
-                }
-                strcpy(s, "'\n");
-                if (copy_to_user(buf, tmp + k, count))
-                        return -EFAULT;
-                kfree(tmp);
-                k += count;
-        } else if (op->flag & OPP_BINARY) {
-                char buffer[10];
-                u32 *first, *last;
-                int first_off, last_cnt;
-                first = ((u32 *)op->value) + k / 9;
-                first_off = k % 9;
-                last = ((u32 *)op->value) + (k + count - 1) / 9;
-                last_cnt = (k + count) % 9;
-                if (!last_cnt) last_cnt = 9;
-                if (first == last) {
-                        sprintf (buffer, "%08x.", *first);
-                        if (copy_to_user(buf, buffer + first_off,
-                                         last_cnt - first_off))
-                                return -EFAULT;
-                        buf += last_cnt - first_off;
-                } else {                
-                        for (q = first; q <= last; q++) {
-                                sprintf (buffer, "%08x.", *q);
-                                if (q == first) {
-                                        if (copy_to_user(buf, buffer + first_off,
-                                                         9 - first_off))
-                                                return -EFAULT;
-                                        buf += 9 - first_off;
-                                } else if (q == last) {
-                                        if (copy_to_user(buf, buffer, last_cnt))
-                                                return -EFAULT;
-                                        buf += last_cnt;
-                                } else {
-                                        if (copy_to_user(buf, buffer, 9))
-                                                return -EFAULT;
-                                        buf += 9;
-                                }
-                        }
-                }
-                if (last == (u32 *)(op->value + op->len - 4) && last_cnt == 9) {
+                return 0;
-                        if (put_user('\n', (buf - 1)))
+        }
-                                return -EFAULT;
-                }
-                k += count;
+        return 1;
+}
-        } else if (op->flag & OPP_HEXSTRING) {
+static int property_show(struct seq_file *f, void *v)
-                char buffer[3];
+{
+        struct property *prop = f->private;
+        void *pval;
+        int len;
-                if ((k < i - 1) && (k & 1)) {
+        len = prop->length;
-                        sprintf (buffer, "%02x",
+        pval = prop->value;
-                                 (unsigned char) *(op->value + (k >> 1)) & 0xff);
-                        if (put_user(buffer[1], &buf[k++ - *ppos]))
-                                return -EFAULT;
-                        count--;
-                }
-                for (; (count > 1) && (k < i - 1); k += 2) {
+        if (is_string(pval, len)) {
-                        sprintf (buffer, "%02x",
+                while (len > 0) {
-                                 (unsigned char) *(op->value + (k >> 1)) & 0xff);
+                        int n = strlen(pval);
-                        if (copy_to_user(buf + k - *ppos, buffer, 2))
-                                return -EFAULT;
-                        count -= 2;
-                }
-                if (count && (k < i - 1)) {
+                        seq_printf(f, "%s", (char *) pval);
-                        sprintf (buffer, "%02x",
-                                 (unsigned char) *(op->value + (k >> 1)) & 0xff);
-                        if (put_user(buffer[0], &buf[k++ - *ppos]))
-                                return -EFAULT;
-                        count--;
-                }
-                if (count) {
+                        /* Skip over the NULL byte too.  */
-                        if (put_user('\n', &buf [k++ - *ppos]))
+                        pval += n + 1;
-                                return -EFAULT;
+                        len -= n + 1;
-                }
-        }
-        count = k - *ppos;
-        *ppos = k;
-        return count;
-}
-static ssize_t property_write(struct file *filp, const char __user *buf,
+                        if (len > 0)
-                              size_t count, loff_t *ppos)
+                                seq_printf(f, " + ");
-{
-        int i, j, k;
-        char *p;
-        u32 *q;
-        void *b;
-        openprom_property *op;
-        
-        if (*ppos >= 0xffffff || count >= 0xffffff)
-                return -EINVAL;
-        if (!filp->private_data) {
-                i = property_read (filp, NULL, 0, NULL);
-                if (i)
-                        return i;
-        }
-        k = *ppos;
-        op = (openprom_property *)filp->private_data;
-        if (!(op->flag & OPP_STRING)) {
-                u32 *first, *last;
-                int first_off, last_cnt;
-                u32 mask, mask2;
-                char tmp [9];
-                int forcelen = 0;
-                
-                j = k % 9;
-                for (i = 0; i < count; i++, j++) {
-                        if (j == 9) j = 0;
-                        if (!j) {
-                                char ctmp;
-                                if (get_user(ctmp, &buf[i]))
-                                        return -EFAULT;
-                                if (ctmp != '.') {
-                                        if (ctmp != '\n') {
-                                                if (op->flag & OPP_BINARY)
-                                                        return -EINVAL;
-                                                else
-                                                        goto write_try_string;
-                                        } else {
-                                                count = i + 1;
-                                                forcelen = 1;
-                                                break;
-                                        }
-                                }
-                        } else {
-                                char ctmp;
-                                if (get_user(ctmp, &buf[i]))
-                                        return -EFAULT;
-                                if (ctmp < '0' || 
-                                    (ctmp > '9' && ctmp < 'A') ||
-                                    (ctmp > 'F' && ctmp < 'a') ||
-                                    ctmp > 'f') {
-                                        if (op->flag & OPP_BINARY)
-                                                return -EINVAL;
-                                        else
-                                                goto write_try_string;
-                                }
-                        }
-                }
-                op->flag |= OPP_BINARY;
-                tmp [8] = 0;
-                i = ((count + k + 8) / 9) << 2;
-                if (op->alloclen <= i) {
-                        b = kmalloc (sizeof (openprom_property) + 2 * i,
-                                     GFP_KERNEL);
-                        if (!b)
-                                return -ENOMEM;
-                        memcpy (b, filp->private_data,
-                                sizeof (openprom_property)
-                                + strlen (op->name) + op->alloclen);
-                        memset (((char *)b) + sizeof (openprom_property)
-                                + strlen (op->name) + op->alloclen, 
-                                0, 2 * i - op->alloclen);
-                        op = (openprom_property *)b;
-                        op->alloclen = 2*i;
-                        b = filp->private_data;
-                        filp->private_data = (void *)op;
-                        kfree (b);
                }
-                first = ((u32 *)op->value) + (k / 9);
+        } else {
-                first_off = k % 9;
+                if (len & 3) {
-                last = (u32 *)(op->value + i);
+                        while (len) {
-                last_cnt = (k + count) % 9;
+                                len--;
-                if (first + 1 == last) {
+                                if (len)
-                        memset (tmp, '0', 8);
+                                        seq_printf(f, "%02x.",
-                        if (copy_from_user(tmp + first_off, buf,
+                                                   *(unsigned char *) pval);
-                                           (count + first_off > 8) ?
+                                else
-                                           8 - first_off : count))
+                                        seq_printf(f, "%02x",
-                                return -EFAULT;
+                                                   *(unsigned char *) pval);
-                        mask = 0xffffffff;
+                                pval++;
-                        mask2 = 0xffffffff;
-                        for (j = 0; j < first_off; j++)
-                                mask >>= 1;
-                        for (j = 8 - count - first_off; j > 0; j--)
-                                mask2 <<= 1;
-                        mask &= mask2;
-                        if (mask) {
-                                *first &= ~mask;
-                                *first |= simple_strtoul (tmp, NULL, 16);
-                                op->flag |= OPP_DIRTY;
                        }
                } else {
-                        op->flag |= OPP_DIRTY;
+                        while (len >= 4) {
-                        for (q = first; q < last; q++) {
+                                len -= 4;
-                                if (q == first) {
-                                        if (first_off < 8) {
+                                if (len)
-                                                memset (tmp, '0', 8);
+                                        seq_printf(f, "%08x.",
-                                                if (copy_from_user(tmp + first_off,
+                                                   *(unsigned int *) pval);
-                                                                   buf,
+                                else
-                                                                   8 - first_off))
+                                        seq_printf(f, "%08x",
-                                                        return -EFAULT;
+                                                   *(unsigned int *) pval);
-                                                mask = 0xffffffff;
+                                pval += 4;
-                                                for (j = 0; j < first_off; j++)
-                                                        mask >>= 1;
-                                                *q &= ~mask;
-                                                *q |= simple_strtoul (tmp,NULL,16);
-                                        }
-                                        buf += 9;
-                                } else if ((q == last - 1) && last_cnt
-                                           && (last_cnt < 8)) {
-                                        memset (tmp, '0', 8);
-                                        if (copy_from_user(tmp, buf, last_cnt))
-                                                return -EFAULT;
-                                        mask = 0xffffffff;
-                                        for (j = 0; j < 8 - last_cnt; j++)
-                                                mask <<= 1;
-                                        *q &= ~mask;
-                                        *q |= simple_strtoul (tmp, NULL, 16);
-                                        buf += last_cnt;
-                                } else {
-                                        char tchars[17]; /* XXX yuck... */
-                                        if (copy_from_user(tchars, buf, 16))
-                                                return -EFAULT;
-                                        *q = simple_strtoul (tchars, NULL, 16);
-                                        buf += 9;
-                                }
-                        }
-                }
-                if (!forcelen) {
-                        if (op->len < i)
-                                op->len = i;
-                } else
-                        op->len = i;
-                *ppos += count;
-        }
-write_try_string:
-        if (!(op->flag & OPP_BINARY)) {
-                if (!(op->flag & (OPP_QUOTED | OPP_NOTQUOTED))) {
-                        char ctmp;
-                        /* No way, if somebody starts writing from the middle, 
-                         * we don't know whether he uses quotes around or not 
-                         */
-                        if (k > 0)
-                                return -EINVAL;
-                        if (get_user(ctmp, buf))
-                                return -EFAULT;
-                        if (ctmp == '\'') {
-                                op->flag |= OPP_QUOTED;
-                                buf++;
-                                count--;
-                                (*ppos)++;
-                                if (!count) {
-                                        op->flag |= OPP_STRING;
-                                        return 1;
-                                }
-                        } else
-                                op->flag |= OPP_NOTQUOTED;
-                }
-                op->flag |= OPP_STRING;
-                if (op->alloclen <= count + *ppos) {
-                        b = kmalloc (sizeof (openprom_property)
-                                     + 2 * (count + *ppos), GFP_KERNEL);
-                        if (!b)
-                                return -ENOMEM;
-                        memcpy (b, filp->private_data,
-                                sizeof (openprom_property)
-                                + strlen (op->name) + op->alloclen);
-                        memset (((char *)b) + sizeof (openprom_property)
-                                + strlen (op->name) + op->alloclen, 
-                                0, 2*(count - *ppos) - op->alloclen);
-                        op = (openprom_property *)b;
-                        op->alloclen = 2*(count + *ppos);
-                        b = filp->private_data;
-                        filp->private_data = (void *)op;
-                        kfree (b);
-                }
-                p = op->value + *ppos - ((op->flag & OPP_QUOTED) ? 1 : 0);
-                if (copy_from_user(p, buf, count))
-                        return -EFAULT;
-                op->flag |= OPP_DIRTY;
-                for (i = 0; i < count; i++, p++)
-                        if (*p == '\n') {
-                                *p = 0;
-                                break;
                        }
-                if (i < count) {
-                        op->len = p - op->value;
-                        *ppos += i + 1;
-                        if ((p > op->value) && (op->flag & OPP_QUOTED)
-                            && (*(p - 1) == '\''))
-                                op->len--;
-                } else {
-                        if (p - op->value > op->len)
-                                op->len = p - op->value;
-                        *ppos += count;
                }
        }
-        return *ppos - k;
+        seq_printf(f, "\n");
+        return 0;
 }
-int property_release (struct inode *inode, struct file *filp)
+static void *property_start(struct seq_file *f, loff_t *pos)
 {
-        openprom_property *op = (openprom_property *)filp->private_data;
+        if (*pos == 0)
-        int error;
+                return pos;
-        u32 node;
+        return NULL;
-        
+}
-        if (!op)
-                return 0;
+static void *property_next(struct seq_file *f, void *v, loff_t *pos)
-        lock_kernel();
+{
-        node = nodes[(u16)((long)inode->u.generic_ip)].node;
+        (*pos)++;
-        if ((u16)((long)inode->u.generic_ip) == aliases) {
+        return NULL;
-                if ((op->flag & OPP_DIRTY) && (op->flag & OPP_STRING)) {
+}
-                        char *p = op->name;
-                        int i = (op->value - op->name) - strlen (op->name) - 1;
+static void property_stop(struct seq_file *f, void *v)
-                        op->value [op->len] = 0;
+{
-                        *(op->value - 1) = ' ';
+        /* Nothing to do */
-                        if (i) {
+}
-                                for (p = op->value - i - 2; p >= op->name; p--)
-                                        p[i] = *p;
+static struct seq_operations property_op = {
-                                p = op->name + i;
+        .start          = property_start,
-                        }
+        .next           = property_next,
-                        memcpy (p - 8, "nvalias ", 8);
+        .stop           = property_stop,
-                        prom_feval (p - 8);
+        .show           = property_show
-                }
+};
-        } else if (op->flag & OPP_DIRTY) {
-                if (op->flag & OPP_STRING) {
+static int property_open(struct inode *inode, struct file *file)
-                        op->value [op->len] = 0;
+{
-                        error = prom_setprop (node, op->name,
+        struct op_inode_info *oi = OP_I(inode);
-                                              op->value, op->len + 1);
+        int ret;
-                        if (error <= 0)
-                                printk (KERN_WARNING "openpromfs: "
+        BUG_ON(oi->type != op_inode_prop);
-                                        "Couldn't write property %s\n",
-                                        op->name);
+        ret = seq_open(file, &property_op);
-                } else if ((op->flag & OPP_BINARY) || !op->len) {
+        if (!ret) {
-                        error = prom_setprop (node, op->name,
+                struct seq_file *m = file->private_data;
-                                              op->value, op->len);
+                m->private = oi->u.prop;
-                        if (error <= 0)
-                                printk (KERN_WARNING "openpromfs: "
-                                        "Couldn't write property %s\n",
-                                        op->name);
-                } else {
-                        printk (KERN_WARNING "openpromfs: "
-                                "Unknown property type of %s\n",
-                                op->name);
-                }
        }
-        unlock_kernel();
+        return ret;
-        kfree (filp->private_data);
-        return 0;
 }
 static const struct file_operations openpromfs_prop_ops = {
-        .read           = property_read,
+        .open           = property_open,
-        .write          = property_write,
+        .read           = seq_read,
-        .release        = property_release,
+        .llseek         = seq_lseek,
+        .release        = seq_release,
 };
-static const struct file_operations openpromfs_nodenum_ops = {
+static int openpromfs_readdir(struct file *, void *, filldir_t);
-        .read           = nodenum_read,
-};
 static const struct file_operations openprom_operations = {
        .read           = generic_read_dir,
        .readdir        = openpromfs_readdir,
 };
-static struct inode_operations openprom_alias_inode_operations = {
+static struct dentry *openpromfs_lookup(struct inode *, struct dentry *, struct nameidata *);
-        .create         = openpromfs_create,
-        .lookup         = openpromfs_lookup,
-        .unlink         = openpromfs_unlink,
-};
 static struct inode_operations openprom_inode_operations = {
        .lookup         = openpromfs_lookup,
 };
-static int lookup_children(u16 n, const char * name, int len)
+static struct dentry *openpromfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
-{
-        int ret;
-        u16 node;
-        for (; n != 0xffff; n = nodes[n].next) {
-                node = nodes[n].child;
-                if (node != 0xffff) {
-                        char buffer[128];
-                        int i;
-                        char *p;
-                        
-                        while (node != 0xffff) {
-                                if (prom_getname (nodes[node].node,
-                                                  buffer, 128) >= 0) {
-                                        i = strlen (buffer);
-                                        if ((len == i)
-                                            && !strncmp (buffer, name, len))
-                                                return NODE2INO(node);
-                                        p = strchr (buffer, '@');
-                                        if (p && (len == p - buffer)
-                                            && !strncmp (buffer, name, len))
-                                                return NODE2INO(node);
-                                }
-                                node = nodes[node].next;
-                        }
-                } else
-                        continue;
-                ret = lookup_children (nodes[n].child, name, len);
-                if (ret) return ret;
-        }
-        return 0;
-}
-static struct dentry *openpromfs_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd)
 {
-        int ino = 0;
+        struct op_inode_info *ent_oi, *oi = OP_I(dir);
-#define OPFSL_DIR       0
+        struct device_node *dp, *child;
-#define OPFSL_PROPERTY  1
+        struct property *prop;
-#define OPFSL_NODENUM   2
+        enum op_inode_type ent_type;
-        int type = 0;
+        union op_inode_data ent_data;
-        char buffer[128];
-        char *p;
        const char *name;
-        u32 n;
-        u16 dirnode;
-        unsigned int len;
-        int i;
        struct inode *inode;
-        char buffer2[64];
+        unsigned int ino;
+        int len;
        
-        inode = NULL;
+        BUG_ON(oi->type != op_inode_node);
+        dp = oi->u.node;
        name = dentry->d_name.name;
        len = dentry->d_name.len;
-        lock_kernel();
-        if (name [0] == '.' && len == 5 && !strncmp (name + 1, "node", 4)) {
+        mutex_lock(&op_mutex);
-                ino = NODEP2INO(NODE(dir->i_ino).first_prop);
-                type = OPFSL_NODENUM;
+        child = dp->child;
-        }
+        while (child) {
-        if (!ino) {
+                int n = strlen(child->path_component_name);
-                u16 node = NODE(dir->i_ino).child;
-                while (node != 0xffff) {
+                if (len == n &&
-                        if (prom_getname (nodes[node].node, buffer, 128) >= 0) {
+                    !strncmp(child->path_component_name, name, len)) {
-                                i = strlen (buffer);
+                        ent_type = op_inode_node;
-                                if (len == i && !strncmp (buffer, name, len)) {
+                        ent_data.node = child;
-                                        ino = NODE2INO(node);
+                        ino = child->unique_id;
-                                        type = OPFSL_DIR;
+                        goto found;
-                                        break;
-                                }
-                                p = strchr (buffer, '@');
-                                if (p && (len == p - buffer)
-                                    && !strncmp (buffer, name, len)) {
-                                        ino = NODE2INO(node);
-                                        type = OPFSL_DIR;
-                                        break;
-                                }
-                        }
-                        node = nodes[node].next;
-                }
-        }
-        n = NODE(dir->i_ino).node;
-        dirnode = dir->i_ino - OPENPROM_FIRST_INO;
-        if (!ino) {
-                int j = NODEP2INO(NODE(dir->i_ino).first_prop);
-                if (dirnode != aliases) {
-                        for (p = prom_firstprop (n, buffer2);
-                             p && *p;
-                             p = prom_nextprop (n, p, buffer2)) {
-                                j++;
-                                if ((len == strlen (p))
-                                    && !strncmp (p, name, len)) {
-                                        ino = j;
-                                        type = OPFSL_PROPERTY;
-                                        break;
-                                }
-                        }
-                } else {
-                        int k;
-                        for (k = 0; k < aliases_nodes; k++) {
-                                j++;
-                                if (alias_names [k]
-                                    && (len == strlen (alias_names [k]))
-                                    && !strncmp (alias_names [k], name, len)) {
-                                        ino = j;
-                                        type = OPFSL_PROPERTY;
-                                        break;
-                                }
-                        }
                }
+                child = child->sibling;
        }
-        if (!ino) {
-                ino = lookup_children (NODE(dir->i_ino).child, name, len);
+        prop = dp->properties;
-                if (ino)
+        while (prop) {
-                        type = OPFSL_DIR;
+                int n = strlen(prop->name);
-                else {
-                        unlock_kernel();
+                if (len == n && !strncmp(prop->name, name, len)) {
-                        return ERR_PTR(-ENOENT);
+                        ent_type = op_inode_prop;
+                        ent_data.prop = prop;
+                        ino = prop->unique_id;
+                        goto found;
                }
+                prop = prop->next;
        }
-        inode = iget (dir->i_sb, ino);
-        unlock_kernel();
+        mutex_unlock(&op_mutex);
+        return ERR_PTR(-ENOENT);
+found:
+        inode = iget(dir->i_sb, ino);
+        mutex_unlock(&op_mutex);
        if (!inode)
                return ERR_PTR(-EINVAL);
-        switch (type) {
+        ent_oi = OP_I(inode);
-        case OPFSL_DIR:
+        ent_oi->type = ent_type;
+        ent_oi->u = ent_data;
+        switch (ent_type) {
+        case op_inode_node:
                inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO;
-                if (ino == OPENPROM_FIRST_INO + aliases) {
+                inode->i_op = &openprom_inode_operations;
-                        inode->i_mode |= S_IWUSR;
-                        inode->i_op = &openprom_alias_inode_operations;
-                } else
-                        inode->i_op = &openprom_inode_operations;
                inode->i_fop = &openprom_operations;
                inode->i_nlink = 2;
                break;
-        case OPFSL_NODENUM:
+        case op_inode_prop:
-                inode->i_mode = S_IFREG | S_IRUGO;
+                if (!strcmp(dp->name, "options") && (len == 17) &&
-                inode->i_fop = &openpromfs_nodenum_ops;
+                    !strncmp (name, "security-password", 17))
-                inode->i_nlink = 1;
-                inode->u.generic_ip = (void *)(long)(n);
-                break;
-        case OPFSL_PROPERTY:
-                if ((dirnode == options) && (len == 17)
-                    && !strncmp (name, "security-password", 17))
                        inode->i_mode = S_IFREG | S_IRUSR | S_IWUSR;
-                else {
+                else
                        inode->i_mode = S_IFREG | S_IRUGO;
-                        if (dirnode == options || dirnode == aliases) {
-                                if (len != 4 || strncmp (name, "name", 4))
-                                        inode->i_mode |= S_IWUSR;
-                        }
-                }
                inode->i_fop = &openpromfs_prop_ops;
                inode->i_nlink = 1;
-                if (inode->i_size < 0)
+                inode->i_size = ent_oi->u.prop->length;
-                        inode->i_size = 0;
-                inode->u.generic_ip = (void *)(long)(((u16)dirnode) | 
-                        (((u16)(ino - NODEP2INO(NODE(dir->i_ino).first_prop) - 1)) << 16));
                break;
        }
@@ -775,237 +263,89 @@ static struct dentry *openpromfs_lookup(struct inode * dir, struct dentry *dentr
 static int openpromfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
 {
        struct inode *inode = filp->f_dentry->d_inode;
+        struct op_inode_info *oi = OP_I(inode);
+        struct device_node *dp = oi->u.node;
+        struct device_node *child;
+        struct property *prop;
        unsigned int ino;
-        u32 n;
+        int i;
-        int i, j;
-        char buffer[128];
+        mutex_lock(&op_mutex);
-        u16 node;
-        char *p;
-        char buffer2[64];
-        lock_kernel();
        
        ino = inode->i_ino;
        i = filp->f_pos;
        switch (i) {
        case 0:
-                if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0) goto out;
+                if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0)
+                        goto out;
                i++;
                filp->f_pos++;
                /* fall thru */
        case 1:
-                if (filldir(dirent, "..", 2, i, 
+                if (filldir(dirent, "..", 2, i,
-                        (NODE(ino).parent == 0xffff) ? 
+                            (dp->parent == NULL ?
-                        OPENPROM_ROOT_INO : NODE2INO(NODE(ino).parent), DT_DIR) < 0) 
+                             OPENPROM_ROOT_INO :
+                             dp->parent->unique_id), DT_DIR) < 0) 
                        goto out;
                i++;
                filp->f_pos++;
                /* fall thru */
        default:
                i -= 2;
-                node = NODE(ino).child;
-                while (i && node != 0xffff) {
+                /* First, the children nodes as directories.  */
-                        node = nodes[node].next;
+                child = dp->child;
+                while (i && child) {
+                        child = child->sibling;
                        i--;
                }
-                while (node != 0xffff) {
+                while (child) {
-                        if (prom_getname (nodes[node].node, buffer, 128) < 0)
+                        if (filldir(dirent,
-                                goto out;
+                                    child->path_component_name,
-                        if (filldir(dirent, buffer, strlen(buffer),
+                                    strlen(child->path_component_name),
-                                    filp->f_pos, NODE2INO(node), DT_DIR) < 0)
+                                    filp->f_pos, child->unique_id, DT_DIR) < 0)
                                goto out;
                        filp->f_pos++;
-                        node = nodes[node].next;
+                        child = child->sibling;
                }
-                j = NODEP2INO(NODE(ino).first_prop);
-                if (!i) {
+                /* Next, the properties as files.  */
-                        if (filldir(dirent, ".node", 5, filp->f_pos, j, DT_REG) < 0)
+                prop = dp->properties;
+                while (i && prop) {
+                        prop = prop->next;
+                        i--;
+                }
+                while (prop) {
+                        if (filldir(dirent, prop->name, strlen(prop->name),
+                                    filp->f_pos, prop->unique_id, DT_REG) < 0)
                                goto out;
                        filp->f_pos++;
-                } else
+                        prop = prop->next;
-                        i--;
-                n = NODE(ino).node;
-                if (ino == OPENPROM_FIRST_INO + aliases) {
-                        for (j++; i < aliases_nodes; i++, j++) {
-                                if (alias_names [i]) {
-                                        if (filldir (dirent, alias_names [i], 
-                                                strlen (alias_names [i]), 
-                                                filp->f_pos, j, DT_REG) < 0) goto out; 
-                                        filp->f_pos++;
-                                }
-                        }
-                } else {
-                        for (p = prom_firstprop (n, buffer2);
-                             p && *p;
-                             p = prom_nextprop (n, p, buffer2)) {
-                                j++;
-                                if (i) i--;
-                                else {
-                                        if (filldir(dirent, p, strlen(p),
-                                                    filp->f_pos, j, DT_REG) < 0)
-                                                goto out;
-                                        filp->f_pos++;
-                                }
-                        }
                }
        }
 out:
-        unlock_kernel();
+        mutex_unlock(&op_mutex);
-        return 0;
-}
-static int openpromfs_create (struct inode *dir, struct dentry *dentry, int mode,
-                struct nameidata *nd)
-{
-        char *p;
-        struct inode *inode;
-        
-        if (!dir)
-                return -ENOENT;
-        if (dentry->d_name.len > 256)
-                return -EINVAL;
-        p = kmalloc (dentry->d_name.len + 1, GFP_KERNEL);
-        if (!p)
-                return -ENOMEM;
-        strncpy (p, dentry->d_name.name, dentry->d_name.len);
-        p [dentry->d_name.len] = 0;
-        lock_kernel();
-        if (aliases_nodes == ALIASES_NNODES) {
-                kfree(p);
-                unlock_kernel();
-                return -EIO;
-        }
-        alias_names [aliases_nodes++] = p;
-        inode = iget (dir->i_sb,
-                        NODEP2INO(NODE(dir->i_ino).first_prop) + aliases_nodes);
-        if (!inode) {
-                unlock_kernel();
-                return -EINVAL;
-        }
-        inode->i_mode = S_IFREG | S_IRUGO | S_IWUSR;
-        inode->i_fop = &openpromfs_prop_ops;
-        inode->i_nlink = 1;
-        if (inode->i_size < 0) inode->i_size = 0;
-        inode->u.generic_ip = (void *)(long)(((u16)aliases) | 
-                        (((u16)(aliases_nodes - 1)) << 16));
-        unlock_kernel();
-        d_instantiate(dentry, inode);
        return 0;
 }
-static int openpromfs_unlink (struct inode *dir, struct dentry *dentry)
+static kmem_cache_t *op_inode_cachep;
-{
-        unsigned int len;
-        char *p;
-        const char *name;
-        int i;
-        
-        name = dentry->d_name.name;
-        len = dentry->d_name.len;
-        lock_kernel();
-        for (i = 0; i < aliases_nodes; i++)
-                if ((strlen (alias_names [i]) == len)
-                    && !strncmp (name, alias_names[i], len)) {
-                        char buffer[512];
-                        
-                        p = alias_names [i];
-                        alias_names [i] = NULL;
-                        kfree (p);
-                        strcpy (buffer, "nvunalias ");
-                        memcpy (buffer + 10, name, len);
-                        buffer [10 + len] = 0;
-                        prom_feval (buffer);
-                }
-        unlock_kernel();
-        return 0;
-}
-/* {{{ init section */
+static struct inode *openprom_alloc_inode(struct super_block *sb)
-static int __init check_space (u16 n)
 {
-        unsigned long pages;
+        struct op_inode_info *oi;
-        if ((1 << alloced) * PAGE_SIZE < (n + 2) * sizeof(openpromfs_node)) {
+        oi = kmem_cache_alloc(op_inode_cachep, SLAB_KERNEL);
-                pages = __get_free_pages (GFP_KERNEL, alloced + 1);
+        if (!oi)
-                if (!pages)
+                return NULL;
-                        return -1;
-                if (nodes) {
+        return &oi->vfs_inode;
-                        memcpy ((char *)pages, (char *)nodes,
-                                (1 << alloced) * PAGE_SIZE);
-                        free_pages ((unsigned long)nodes, alloced);
-                }
-                alloced++;
-                nodes = (openpromfs_node *)pages;
-        }
-        return 0;
 }
-static u16 __init get_nodes (u16 parent, u32 node)
+static void openprom_destroy_inode(struct inode *inode)
 {
-        char *p;
+        kmem_cache_free(op_inode_cachep, OP_I(inode));
-        u16 n = last_node++, i;
-        char buffer[64];
-        if (check_space (n) < 0)
-                return 0xffff;
-        nodes[n].parent = parent;
-        nodes[n].node = node;
-        nodes[n].next = 0xffff;
-        nodes[n].child = 0xffff;
-        nodes[n].first_prop = first_prop++;
-        if (!parent) {
-                char buffer[8];
-                int j;
-                
-                if ((j = prom_getproperty (node, "name", buffer, 8)) >= 0) {
-                    buffer[j] = 0;
-                    if (!strcmp (buffer, "options"))
-                        options = n;
-                    else if (!strcmp (buffer, "aliases"))
-                        aliases = n;
-                }
-        }
-        if (n != aliases)
-                for (p = prom_firstprop (node, buffer);
-                     p && p != (char *)-1 && *p;
-                     p = prom_nextprop (node, p, buffer))
-                        first_prop++;
-        else {
-                char *q;
-                for (p = prom_firstprop (node, buffer);
-                     p && p != (char *)-1 && *p;
-                     p = prom_nextprop (node, p, buffer)) {
-                        if (aliases_nodes == ALIASES_NNODES)
-                                break;
-                        for (i = 0; i < aliases_nodes; i++)
-                                if (!strcmp (p, alias_names [i]))
-                                        break;
-                        if (i < aliases_nodes)
-                                continue;
-                        q = kmalloc (strlen (p) + 1, GFP_KERNEL);
-                        if (!q)
-                                return 0xffff;
-                        strcpy (q, p);
-                        alias_names [aliases_nodes++] = q;
-                }
-                first_prop += ALIASES_NNODES;
-        }
-        node = prom_getchild (node);
-        if (node) {
-                parent = get_nodes (n, node);
-                if (parent == 0xffff)
-                        return 0xffff;
-                nodes[n].child = parent;
-                while ((node = prom_getsibling (node)) != 0) {
-                        i = get_nodes (n, node);
-                        if (i == 0xffff)
-                                return 0xffff;
-                        nodes[parent].next = i;
-                        parent = i;
-                }
-        }
-        return n;
 }
 static void openprom_read_inode(struct inode * inode)
@@ -1025,6 +365,8 @@ static int openprom_remount(struct super_block *sb, int *flags, char *data)
 }
 static struct super_operations openprom_sops = { 
+        .alloc_inode    = openprom_alloc_inode,
+        .destroy_inode  = openprom_destroy_inode,
        .read_inode     = openprom_read_inode,
        .statfs         = simple_statfs,
        .remount_fs     = openprom_remount,
@@ -1032,7 +374,8 @@ static struct super_operations openprom_sops = {
 static int openprom_fill_super(struct super_block *s, void *data, int silent)
 {
-        struct inode * root_inode;
+        struct inode *root_inode;
+        struct op_inode_info *oi;
        s->s_flags |= MS_NOATIME;
        s->s_blocksize = 1024;
@@ -1043,6 +386,11 @@ static int openprom_fill_super(struct super_block *s, void *data, int silent)
        root_inode = iget(s, OPENPROM_ROOT_INO);
        if (!root_inode)
                goto out_no_root;
+        oi = OP_I(root_inode);
+        oi->type = op_inode_node;
+        oi->u.node = of_find_node_by_path("/");
        s->s_root = d_alloc_root(root_inode);
        if (!s->s_root)
                goto out_no_root;
@@ -1054,10 +402,10 @@ out_no_root:
        return -ENOMEM;
 }
-static struct super_block *openprom_get_sb(struct file_system_type *fs_type,
+static int openprom_get_sb(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data)
+        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-        return get_sb_single(fs_type, flags, data, openprom_fill_super);
+        return get_sb_single(fs_type, flags, data, openprom_fill_super, mnt);
 }
 static struct file_system_type openprom_fs_type = {
@@ -1067,29 +415,39 @@ static struct file_system_type openprom_fs_type = {
        .kill_sb        = kill_anon_super,
 };
+static void op_inode_init_once(void *data, kmem_cache_t * cachep, unsigned long flags)
+{
+        struct op_inode_info *oi = (struct op_inode_info *) data;
+        if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
+            SLAB_CTOR_CONSTRUCTOR)
+                inode_init_once(&oi->vfs_inode);
+}
 static int __init init_openprom_fs(void)
 {
-        nodes = (openpromfs_node *)__get_free_pages(GFP_KERNEL, 0);
+        int err;
-        if (!nodes) {
-                printk (KERN_WARNING "openpromfs: can't get free page\n");
+        op_inode_cachep = kmem_cache_create("op_inode_cache",
-                return -EIO;
+                                            sizeof(struct op_inode_info),
-        }
+                                            0,
-        if (get_nodes (0xffff, prom_root_node) == 0xffff) {
+                                            (SLAB_RECLAIM_ACCOUNT |
-                printk (KERN_WARNING "openpromfs: couldn't setup tree\n");
+                                             SLAB_MEM_SPREAD),
-                return -EIO;
+                                            op_inode_init_once, NULL);
-        }
+        if (!op_inode_cachep)
-        nodes[last_node].first_prop = first_prop;
+                return -ENOMEM;
-        return register_filesystem(&openprom_fs_type);
+        err = register_filesystem(&openprom_fs_type);
+        if (err)
+                kmem_cache_destroy(op_inode_cachep);
+        return err;
 }
 static void __exit exit_openprom_fs(void)
 {
-        int i;
        unregister_filesystem(&openprom_fs_type);
-        free_pages ((unsigned long)nodes, alloced);
+        kmem_cache_destroy(op_inode_cachep);
-        for (i = 0; i < aliases_nodes; i++)
-                kfree (alias_names [i]);
-        nodes = NULL;
 }
 module_init(init_openprom_fs)
diff --git a/fs/partitions/Makefile b/fs/partitions/Makefile
index 42c7d3878ed0..d713ce6b3e12 100644
--- a/fs/partitions/Makefile
+++ b/fs/partitions/Makefile
@@ -4,7 +4,6 @@
 obj-y := check.o
-obj-$(CONFIG_DEVFS_FS) += devfs.o
 obj-$(CONFIG_ACORN_PARTITION) += acorn.o
 obj-$(CONFIG_AMIGA_PARTITION) += amiga.o
 obj-$(CONFIG_ATARI_PARTITION) += atari.o
diff --git a/fs/partitions/acorn.c b/fs/partitions/acorn.c
index c05085710fce..1bc9f372c7d4 100644
--- a/fs/partitions/acorn.c
+++ b/fs/partitions/acorn.c
@@ -12,7 +12,6 @@
 *  every single manufacturer of SCSI and IDE cards created their own
 *  method.
 */
-#include <linux/config.h>
 #include <linux/buffer_head.h>
 #include <linux/adfs_fs.h>
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 7ef1f094de91..839634026eb5 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -18,10 +18,8 @@
 #include <linux/fs.h>
 #include <linux/kmod.h>
 #include <linux/ctype.h>
-#include <linux/devfs_fs_kernel.h>
 #include "check.h"
-#include "devfs.h"
 #include "acorn.h"
 #include "amiga.h"
@@ -161,18 +159,11 @@ check_partition(struct gendisk *hd, struct block_device *bdev)
        if (!state)
                return NULL;
-#ifdef CONFIG_DEVFS_FS
+        disk_name(hd, 0, state->name);
-        if (hd->devfs_name[0] != '\0') {
+        printk(KERN_INFO " %s:", state->name);
-                printk(KERN_INFO " /dev/%s:", hd->devfs_name);
+        if (isdigit(state->name[strlen(state->name)-1]))
                sprintf(state->name, "p");
-        }
-#endif
-        else {
-                disk_name(hd, 0, state->name);
-                printk(KERN_INFO " %s:", state->name);
-                if (isdigit(state->name[strlen(state->name)-1]))
-                        sprintf(state->name, "p");
-        }
        state->limit = hd->minors;
        i = res = 0;
        while (!res && check_part[i]) {
@@ -328,7 +319,7 @@ void delete_partition(struct gendisk *disk, int part)
        p->nr_sects = 0;
        p->ios[0] = p->ios[1] = 0;
        p->sectors[0] = p->sectors[1] = 0;
-        devfs_remove("%s/part%d", disk->devfs_name, part);
+        sysfs_remove_link(&p->kobj, "subsystem");
        if (p->holder_dir)
                kobject_unregister(p->holder_dir);
        kobject_uevent(&p->kobj, KOBJ_REMOVE);
@@ -349,10 +340,6 @@ void add_partition(struct gendisk *disk, int part, sector_t start, sector_t len)
        p->nr_sects = len;
        p->partno = part;
-        devfs_mk_bdev(MKDEV(disk->major, disk->first_minor + part),
-                        S_IFBLK|S_IRUSR|S_IWUSR,
-                        "%s/part%d", disk->devfs_name, part);
        if (isdigit(disk->kobj.name[strlen(disk->kobj.name)-1]))
                snprintf(p->kobj.name,KOBJ_NAME_LEN,"%sp%d",disk->kobj.name,part);
        else
@@ -363,6 +350,7 @@ void add_partition(struct gendisk *disk, int part, sector_t start, sector_t len)
        kobject_add(&p->kobj);
        if (!disk->part_uevent_suppress)
                kobject_uevent(&p->kobj, KOBJ_ADD);
+        sysfs_create_link(&p->kobj, &block_subsys.kset.kobj, "subsystem");
        partition_sysfs_add_subdir(p);
        disk->part[part-1] = p;
 }
@@ -398,6 +386,7 @@ static void disk_sysfs_symlinks(struct gendisk *disk)
                        kfree(disk_name);
                }
        }
+        sysfs_create_link(&disk->kobj, &block_subsys.kset.kobj, "subsystem");
 }
 /* Not exported, helper to add_disk(). */
@@ -420,14 +409,8 @@ void register_disk(struct gendisk *disk)
        disk_sysfs_add_subdirs(disk);
        /* No minors to use for partitions */
-        if (disk->minors == 1) {
+        if (disk->minors == 1)
-                if (disk->devfs_name[0] != '\0')
-                        devfs_add_disk(disk);
                goto exit;
-        }
-        /* always add handle for the whole disk */
-        devfs_add_partitioned(disk);
        /* No such device (e.g., media were just removed) */
        if (!get_capacity(disk))
@@ -481,6 +464,10 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
                sector_t from = state->parts[p].from;
                if (!size)
                        continue;
+                if (from + size > get_capacity(disk)) {
+                        printk(" %s: p%d exceeds device capacity\n",
+                                disk->disk_name, p);
+                }
                add_partition(disk, p, from, size);
 #ifdef CONFIG_BLK_DEV_MD
                if (state->parts[p].flags)
@@ -496,8 +483,8 @@ unsigned char *read_dev_sector(struct block_device *bdev, sector_t n, Sector *p)
        struct address_space *mapping = bdev->bd_inode->i_mapping;
        struct page *page;
-        page = read_cache_page(mapping, (pgoff_t)(n >> (PAGE_CACHE_SHIFT-9)),
+        page = read_mapping_page(mapping, (pgoff_t)(n >> (PAGE_CACHE_SHIFT-9)),
-                        (filler_t *)mapping->a_ops->readpage, NULL);
+                                 NULL);
        if (!IS_ERR(page)) {
                wait_on_page_locked(page);
                if (!PageUptodate(page))
@@ -531,8 +518,6 @@ void del_gendisk(struct gendisk *disk)
        disk_stat_set_all(disk, 0);
        disk->stamp = 0;
-        devfs_remove_disk(disk);
        kobject_uevent(&disk->kobj, KOBJ_REMOVE);
        if (disk->holder_dir)
                kobject_unregister(disk->holder_dir);
@@ -548,5 +533,6 @@ void del_gendisk(struct gendisk *disk)
                put_device(disk->driverfs_dev);
                disk->driverfs_dev = NULL;
        }
+        sysfs_remove_link(&disk->kobj, "subsystem");
        kobject_del(&disk->kobj);
 }
diff --git a/fs/partitions/devfs.c b/fs/partitions/devfs.c
deleted file mode 100644
index 3f0a780c9cec..000000000000
--- a/fs/partitions/devfs.c
+++ /dev/null
@@ -1,130 +0,0 @@
-/*
- * This tries to keep block devices away from devfs as much as possible.
- */
-#include <linux/fs.h>
-#include <linux/devfs_fs_kernel.h>
-#include <linux/vmalloc.h>
-#include <linux/genhd.h>
-#include <linux/bitops.h>
-#include <linux/mutex.h>
-struct unique_numspace {
-        u32               num_free;          /*  Num free in bits       */
-        u32               length;            /*  Array length in bytes  */
-        unsigned long     *bits;
-        struct semaphore  mutex;
-};
-static DEFINE_MUTEX(numspace_mutex);
-static int expand_numspace(struct unique_numspace *s)
-{
-        u32 length;
-        void *bits;
-        if (s->length < 16)
-                length = 16;
-        else
-                length = s->length << 1;
-        bits = vmalloc(length);
-        if (!bits)
-                return -ENOMEM;
-        if (s->bits) {
-                memcpy(bits, s->bits, s->length);
-                vfree(s->bits);
-        }
-                
-        s->num_free = (length - s->length) << 3;
-        s->bits = bits;
-        memset(bits + s->length, 0, length - s->length);
-        s->length = length;
-        return 0;
-}
-static int alloc_unique_number(struct unique_numspace *s)
-{
-        int rval = 0;
-        mutex_lock(&numspace_mutex);
-        if (s->num_free < 1)
-                rval = expand_numspace(s);
-        if (!rval) {
-                rval = find_first_zero_bit(s->bits, s->length << 3);
-                --s->num_free;
-                __set_bit(rval, s->bits);
-        }
-        mutex_unlock(&numspace_mutex);
-        return rval;
-}
-static void dealloc_unique_number(struct unique_numspace *s, int number)
-{
-        int old_val;
-        if (number >= 0) {
-                mutex_lock(&numspace_mutex);
-                old_val = __test_and_clear_bit(number, s->bits);
-                if (old_val)
-                        ++s->num_free;
-                mutex_unlock(&numspace_mutex);
-        }
-}
-static struct unique_numspace disc_numspace;
-static struct unique_numspace cdrom_numspace;
-void devfs_add_partitioned(struct gendisk *disk)
-{
-        char dirname[64], symlink[16];
-        devfs_mk_dir(disk->devfs_name);
-        devfs_mk_bdev(MKDEV(disk->major, disk->first_minor),
-                        S_IFBLK|S_IRUSR|S_IWUSR,
-                        "%s/disc", disk->devfs_name);
-        disk->number = alloc_unique_number(&disc_numspace);
-        sprintf(symlink, "discs/disc%d", disk->number);
-        sprintf(dirname, "../%s", disk->devfs_name);
-        devfs_mk_symlink(symlink, dirname);
-}
-void devfs_add_disk(struct gendisk *disk)
-{
-        devfs_mk_bdev(MKDEV(disk->major, disk->first_minor),
-                        (disk->flags & GENHD_FL_CD) ?
-                                S_IFBLK|S_IRUGO|S_IWUGO :
-                                S_IFBLK|S_IRUSR|S_IWUSR,
-                        "%s", disk->devfs_name);
-        if (disk->flags & GENHD_FL_CD) {
-                char dirname[64], symlink[16];
-                disk->number = alloc_unique_number(&cdrom_numspace);
-                sprintf(symlink, "cdroms/cdrom%d", disk->number);
-                sprintf(dirname, "../%s", disk->devfs_name);
-                devfs_mk_symlink(symlink, dirname);
-        }
-}
-void devfs_remove_disk(struct gendisk *disk)
-{
-        if (disk->minors != 1) {
-                devfs_remove("discs/disc%d", disk->number);
-                dealloc_unique_number(&disc_numspace, disk->number);
-                devfs_remove("%s/disc", disk->devfs_name);
-        }
-        if (disk->flags & GENHD_FL_CD) {
-                devfs_remove("cdroms/cdrom%d", disk->number);
-                dealloc_unique_number(&cdrom_numspace, disk->number);
-        }
-        devfs_remove(disk->devfs_name);
-}
diff --git a/fs/partitions/devfs.h b/fs/partitions/devfs.h
deleted file mode 100644
index 176118b4e492..000000000000
--- a/fs/partitions/devfs.h
+++ /dev/null
@@ -1,10 +0,0 @@
-#ifdef CONFIG_DEVFS_FS
-void devfs_add_disk(struct gendisk *dev);
-void devfs_add_partitioned(struct gendisk *dev);
-void devfs_remove_disk(struct gendisk *dev);
-#else
-# define devfs_add_disk(disk)                   do { } while (0)
-# define devfs_add_partitioned(disk)            do { } while (0)
-# define devfs_remove_disk(disk)                do { } while (0)
-#endif
diff --git a/fs/partitions/efi.c b/fs/partitions/efi.c
index 0f5b017aebad..63730282ad81 100644
--- a/fs/partitions/efi.c
+++ b/fs/partitions/efi.c
@@ -91,7 +91,6 @@
 * - Code works, detects all the partitions.
 *
 ************************************************************/
-#include <linux/config.h>
 #include <linux/crc32.h>
 #include "check.h"
 #include "efi.h"
diff --git a/fs/partitions/efi.h b/fs/partitions/efi.h
index c44fb0561448..2cc89d0475bf 100644
--- a/fs/partitions/efi.h
+++ b/fs/partitions/efi.h
@@ -26,7 +26,6 @@
 #define FS_PART_EFI_H_INCLUDED
 #include <linux/types.h>
-#include <linux/config.h>
 #include <linux/fs.h>
 #include <linux/genhd.h>
 #include <linux/kernel.h>
diff --git a/fs/partitions/ibm.c b/fs/partitions/ibm.c
index 830c55d86ab1..d352a7381fed 100644
--- a/fs/partitions/ibm.c
+++ b/fs/partitions/ibm.c
@@ -6,7 +6,6 @@
 * (C) IBM Corporation, IBM Deutschland Entwicklung GmbH, 1999,2000
 */
-#include <linux/config.h>
 #include <linux/buffer_head.h>
 #include <linux/hdreg.h>
 #include <linux/slab.h>
diff --git a/fs/partitions/mac.c b/fs/partitions/mac.c
index 813292f21210..c0871002d00d 100644
--- a/fs/partitions/mac.c
+++ b/fs/partitions/mac.c
@@ -6,7 +6,6 @@
 *  Re-organised Feb 1998 Russell King
 */
-#include <linux/config.h>
 #include <linux/ctype.h>
 #include "check.h"
 #include "mac.h"
diff --git a/fs/partitions/msdos.c b/fs/partitions/msdos.c
index 9935d254186e..8f12587c3129 100644
--- a/fs/partitions/msdos.c
+++ b/fs/partitions/msdos.c
@@ -19,7 +19,6 @@
 *  Re-organised Feb 1998 Russell King
 */
-#include <linux/config.h>
 #include "check.h"
 #include "msdos.h"
diff --git a/fs/pipe.c b/fs/pipe.c
index 5acd8954aaa0..20352573e025 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -979,12 +979,11 @@ no_files:
 * any operations on the root directory. However, we need a non-trivial
 * d_name - pipe: will go nicely and kill the special-casing in procfs.
 */
+static int pipefs_get_sb(struct file_system_type *fs_type,
-static struct super_block *
+                         int flags, const char *dev_name, void *data,
-pipefs_get_sb(struct file_system_type *fs_type, int flags,
+                         struct vfsmount *mnt)
-              const char *dev_name, void *data)
 {
-        return get_sb_pseudo(fs_type, "pipe:", NULL, PIPEFS_MAGIC);
+        return get_sb_pseudo(fs_type, "pipe:", NULL, PIPEFS_MAGIC, mnt);
 }
 static struct file_system_type pipe_fs_type = {
diff --git a/fs/pnode.c b/fs/pnode.c
index 37b568ed0e05..da42ee61c1df 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -53,8 +53,7 @@ static int do_make_slave(struct vfsmount *mnt)
        if (master) {
                list_for_each_entry(slave_mnt, &mnt->mnt_slave_list, mnt_slave)
                        slave_mnt->mnt_master = master;
-                list_del(&mnt->mnt_slave);
+                list_move(&mnt->mnt_slave, &master->mnt_slave_list);
-                list_add(&mnt->mnt_slave, &master->mnt_slave_list);
                list_splice(&mnt->mnt_slave_list, master->mnt_slave_list.prev);
                INIT_LIST_HEAD(&mnt->mnt_slave_list);
        } else {
@@ -283,10 +282,8 @@ static void __propagate_umount(struct vfsmount *mnt)
                 * umount the child only if the child has no
                 * other children
                 */
-                if (child && list_empty(&child->mnt_mounts)) {
+                if (child && list_empty(&child->mnt_mounts))
-                        list_del(&child->mnt_hash);
+                        list_move_tail(&child->mnt_hash, &mnt->mnt_hash);
-                        list_add_tail(&child->mnt_hash, &mnt->mnt_hash);
-                }
        }
 }
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 7a76ad570230..7495d3e20775 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -52,7 +52,6 @@
 *                       :  base.c too.
 */
-#include <linux/config.h>
 #include <linux/types.h>
 #include <linux/errno.h>
 #include <linux/time.h>
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 6cc77dc3f3ff..243a94af0427 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -49,7 +49,6 @@
 #include <asm/uaccess.h>
-#include <linux/config.h>
 #include <linux/errno.h>
 #include <linux/time.h>
 #include <linux/proc_fs.h>
@@ -74,6 +73,16 @@
 #include <linux/poll.h>
 #include "internal.h"
+/* NOTE:
+ *      Implementing inode permission operations in /proc is almost
+ *      certainly an error.  Permission checks need to happen during
+ *      each system call not at open time.  The reason is that most of
+ *      what we wish to check for permissions in /proc varies at runtime.
+ *
+ *      The classic example of a problem is opening file descriptors
+ *      in /proc for a task before it execs a suid executable.
+ */
 /*
 * For hysterical raisins we keep the same inumbers as in the old procfs.
 * Feel free to change the macro below - just keep the range distinct from
@@ -121,6 +130,8 @@ enum pid_directory_inos {
        PROC_TGID_ATTR_PREV,
        PROC_TGID_ATTR_EXEC,
        PROC_TGID_ATTR_FSCREATE,
+        PROC_TGID_ATTR_KEYCREATE,
+        PROC_TGID_ATTR_SOCKCREATE,
 #endif
 #ifdef CONFIG_AUDITSYSCALL
        PROC_TGID_LOGINUID,
@@ -162,6 +173,8 @@ enum pid_directory_inos {
        PROC_TID_ATTR_PREV,
        PROC_TID_ATTR_EXEC,
        PROC_TID_ATTR_FSCREATE,
+        PROC_TID_ATTR_KEYCREATE,
+        PROC_TID_ATTR_SOCKCREATE,
 #endif
 #ifdef CONFIG_AUDITSYSCALL
        PROC_TID_LOGINUID,
@@ -173,6 +186,9 @@ enum pid_directory_inos {
        PROC_TID_FD_DIR = 0x8000,       /* 0x8000-0xffff */
 };
+/* Worst case buffer size needed for holding an integer. */
+#define PROC_NUMBUF 10
 struct pid_entry {
        int type;
        int len;
@@ -275,6 +291,8 @@ static struct pid_entry tgid_attr_stuff[] = {
        E(PROC_TGID_ATTR_PREV,     "prev",     S_IFREG|S_IRUGO),
        E(PROC_TGID_ATTR_EXEC,     "exec",     S_IFREG|S_IRUGO|S_IWUGO),
        E(PROC_TGID_ATTR_FSCREATE, "fscreate", S_IFREG|S_IRUGO|S_IWUGO),
+        E(PROC_TGID_ATTR_KEYCREATE, "keycreate", S_IFREG|S_IRUGO|S_IWUGO),
+        E(PROC_TGID_ATTR_SOCKCREATE, "sockcreate", S_IFREG|S_IRUGO|S_IWUGO),
        {0,0,NULL,0}
 };
 static struct pid_entry tid_attr_stuff[] = {
@@ -282,6 +300,8 @@ static struct pid_entry tid_attr_stuff[] = {
        E(PROC_TID_ATTR_PREV,      "prev",     S_IFREG|S_IRUGO),
        E(PROC_TID_ATTR_EXEC,      "exec",     S_IFREG|S_IRUGO|S_IWUGO),
        E(PROC_TID_ATTR_FSCREATE,  "fscreate", S_IFREG|S_IRUGO|S_IWUGO),
+        E(PROC_TID_ATTR_KEYCREATE, "keycreate", S_IFREG|S_IRUGO|S_IWUGO),
+        E(PROC_TID_ATTR_SOCKCREATE, "sockcreate", S_IFREG|S_IRUGO|S_IWUGO),
        {0,0,NULL,0}
 };
 #endif
@@ -290,12 +310,15 @@ static struct pid_entry tid_attr_stuff[] = {
 static int proc_fd_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt)
 {
-        struct task_struct *task = proc_task(inode);
+        struct task_struct *task = get_proc_task(inode);
-        struct files_struct *files;
+        struct files_struct *files = NULL;
        struct file *file;
-        int fd = proc_type(inode) - PROC_TID_FD_DIR;
+        int fd = proc_fd(inode);
-        files = get_files_struct(task);
+        if (task) {
+                files = get_files_struct(task);
+                put_task_struct(task);
+        }
        if (files) {
                /*
                 * We are not taking a ref to the file structure, so we must
@@ -327,29 +350,33 @@ static struct fs_struct *get_fs_struct(struct task_struct *task)
        return fs;
 }
-static int proc_cwd_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt)
+static int get_nr_threads(struct task_struct *tsk)
 {
-        struct fs_struct *fs = get_fs_struct(proc_task(inode));
+        /* Must be called with the rcu_read_lock held */
-        int result = -ENOENT;
+        unsigned long flags;
-        if (fs) {
+        int count = 0;
-                read_lock(&fs->lock);
-                *mnt = mntget(fs->pwdmnt);
+        if (lock_task_sighand(tsk, &flags)) {
-                *dentry = dget(fs->pwd);
+                count = atomic_read(&tsk->signal->count);
-                read_unlock(&fs->lock);
+                unlock_task_sighand(tsk, &flags);
-                result = 0;
-                put_fs_struct(fs);
        }
-        return result;
+        return count;
 }
-static int proc_root_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt)
+static int proc_cwd_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt)
 {
-        struct fs_struct *fs = get_fs_struct(proc_task(inode));
+        struct task_struct *task = get_proc_task(inode);
+        struct fs_struct *fs = NULL;
        int result = -ENOENT;
+        if (task) {
+                fs = get_fs_struct(task);
+                put_task_struct(task);
+        }
        if (fs) {
                read_lock(&fs->lock);
-                *mnt = mntget(fs->rootmnt);
+                *mnt = mntget(fs->pwdmnt);
-                *dentry = dget(fs->root);
+                *dentry = dget(fs->pwd);
                read_unlock(&fs->lock);
                result = 0;
                put_fs_struct(fs);
@@ -357,42 +384,16 @@ static int proc_root_link(struct inode *inode, struct dentry **dentry, struct vf
        return result;
 }
+static int proc_root_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt)
-/* Same as proc_root_link, but this addionally tries to get fs from other
- * threads in the group */
-static int proc_task_root_link(struct inode *inode, struct dentry **dentry,
-                                struct vfsmount **mnt)
 {
-        struct fs_struct *fs;
+        struct task_struct *task = get_proc_task(inode);
+        struct fs_struct *fs = NULL;
        int result = -ENOENT;
-        struct task_struct *leader = proc_task(inode);
-        task_lock(leader);
+        if (task) {
-        fs = leader->fs;
+                fs = get_fs_struct(task);
-        if (fs) {
+                put_task_struct(task);
-                atomic_inc(&fs->count);
-                task_unlock(leader);
-        } else {
-                /* Try to get fs from other threads */
-                task_unlock(leader);
-                read_lock(&tasklist_lock);
-                if (pid_alive(leader)) {
-                        struct task_struct *task = leader;
-                        while ((task = next_thread(task)) != leader) {
-                                task_lock(task);
-                                fs = task->fs;
-                                if (fs) {
-                                        atomic_inc(&fs->count);
-                                        task_unlock(task);
-                                        break;
-                                }
-                                task_unlock(task);
-                        }
-                }
-                read_unlock(&tasklist_lock);
        }
        if (fs) {
                read_lock(&fs->lock);
                *mnt = mntget(fs->rootmnt);
@@ -404,7 +405,6 @@ static int proc_task_root_link(struct inode *inode, struct dentry **dentry,
        return result;
 }
 #define MAY_PTRACE(task) \
        (task == current || \
        (task->parent == current && \
@@ -535,142 +535,22 @@ static int proc_oom_score(struct task_struct *task, char *buffer)
 /************************************************************************/
 /* permission checks */
+static int proc_fd_access_allowed(struct inode *inode)
-/* If the process being read is separated by chroot from the reading process,
- * don't let the reader access the threads.
- *
- * note: this does dput(root) and mntput(vfsmnt) on exit.
- */
-static int proc_check_chroot(struct dentry *root, struct vfsmount *vfsmnt)
 {
-        struct dentry *de, *base;
+        struct task_struct *task;
-        struct vfsmount *our_vfsmnt, *mnt;
+        int allowed = 0;
-        int res = 0;
+        /* Allow access to a task's file descriptors if it is us or we
+         * may use ptrace attach to the process and find out that
-        read_lock(&current->fs->lock);
+         * information.
-        our_vfsmnt = mntget(current->fs->rootmnt);
+         */
-        base = dget(current->fs->root);
+        task = get_proc_task(inode);
-        read_unlock(&current->fs->lock);
+        if (task) {
+                allowed = ptrace_may_attach(task);
-        spin_lock(&vfsmount_lock);
+                put_task_struct(task);
-        de = root;
-        mnt = vfsmnt;
-        while (mnt != our_vfsmnt) {
-                if (mnt == mnt->mnt_parent)
-                        goto out;
-                de = mnt->mnt_mountpoint;
-                mnt = mnt->mnt_parent;
-        }
-        if (!is_subdir(de, base))
-                goto out;
-        spin_unlock(&vfsmount_lock);
-exit:
-        dput(base);
-        mntput(our_vfsmnt);
-        dput(root);
-        mntput(vfsmnt);
-        return res;
-out:
-        spin_unlock(&vfsmount_lock);
-        res = -EACCES;
-        goto exit;
-}
-static int proc_check_root(struct inode *inode)
-{
-        struct dentry *root;
-        struct vfsmount *vfsmnt;
-        if (proc_root_link(inode, &root, &vfsmnt)) /* Ewww... */
-                return -ENOENT;
-        return proc_check_chroot(root, vfsmnt);
-}
-static int proc_permission(struct inode *inode, int mask, struct nameidata *nd)
-{
-        if (generic_permission(inode, mask, NULL) != 0)
-                return -EACCES;
-        return proc_check_root(inode);
-}
-static int proc_task_permission(struct inode *inode, int mask, struct nameidata *nd)
-{
-        struct dentry *root;
-        struct vfsmount *vfsmnt;
-        if (generic_permission(inode, mask, NULL) != 0)
-                return -EACCES;
-        if (proc_task_root_link(inode, &root, &vfsmnt))
-                return -ENOENT;
-        return proc_check_chroot(root, vfsmnt);
-}
-extern struct seq_operations proc_pid_maps_op;
-static int maps_open(struct inode *inode, struct file *file)
-{
-        struct task_struct *task = proc_task(inode);
-        int ret = seq_open(file, &proc_pid_maps_op);
-        if (!ret) {
-                struct seq_file *m = file->private_data;
-                m->private = task;
-        }
-        return ret;
-}
-static struct file_operations proc_maps_operations = {
-        .open           = maps_open,
-        .read           = seq_read,
-        .llseek         = seq_lseek,
-        .release        = seq_release,
-};
-#ifdef CONFIG_NUMA
-extern struct seq_operations proc_pid_numa_maps_op;
-static int numa_maps_open(struct inode *inode, struct file *file)
-{
-        struct task_struct *task = proc_task(inode);
-        int ret = seq_open(file, &proc_pid_numa_maps_op);
-        if (!ret) {
-                struct seq_file *m = file->private_data;
-                m->private = task;
-        }
-        return ret;
-}
-static struct file_operations proc_numa_maps_operations = {
-        .open           = numa_maps_open,
-        .read           = seq_read,
-        .llseek         = seq_lseek,
-        .release        = seq_release,
-};
-#endif
-#ifdef CONFIG_MMU
-extern struct seq_operations proc_pid_smaps_op;
-static int smaps_open(struct inode *inode, struct file *file)
-{
-        struct task_struct *task = proc_task(inode);
-        int ret = seq_open(file, &proc_pid_smaps_op);
-        if (!ret) {
-                struct seq_file *m = file->private_data;
-                m->private = task;
        }
-        return ret;
+        return allowed;
 }
-static struct file_operations proc_smaps_operations = {
-        .open           = smaps_open,
-        .read           = seq_read,
-        .llseek         = seq_lseek,
-        .release        = seq_release,
-};
-#endif
 extern struct seq_operations mounts_op;
 struct proc_mounts {
        struct seq_file m;
@@ -679,16 +559,19 @@ struct proc_mounts {
 static int mounts_open(struct inode *inode, struct file *file)
 {
-        struct task_struct *task = proc_task(inode);
+        struct task_struct *task = get_proc_task(inode);
-        struct namespace *namespace;
+        struct namespace *namespace = NULL;
        struct proc_mounts *p;
        int ret = -EINVAL;
-        task_lock(task);
+        if (task) {
-        namespace = task->namespace;
+                task_lock(task);
-        if (namespace)
+                namespace = task->namespace;
-                get_namespace(namespace);
+                if (namespace)
-        task_unlock(task);
+                        get_namespace(namespace);
+                task_unlock(task);
+                put_task_struct(task);
+        }
        if (namespace) {
                ret = -ENOMEM;
@@ -745,17 +628,21 @@ static struct file_operations proc_mounts_operations = {
 extern struct seq_operations mountstats_op;
 static int mountstats_open(struct inode *inode, struct file *file)
 {
-        struct task_struct *task = proc_task(inode);
        int ret = seq_open(file, &mountstats_op);
        if (!ret) {
                struct seq_file *m = file->private_data;
-                struct namespace *namespace;
+                struct namespace *namespace = NULL;
-                task_lock(task);
+                struct task_struct *task = get_proc_task(inode);
-                namespace = task->namespace;
-                if (namespace)
+                if (task) {
-                        get_namespace(namespace);
+                        task_lock(task);
-                task_unlock(task);
+                        namespace = task->namespace;
+                        if (namespace)
+                                get_namespace(namespace);
+                        task_unlock(task);
+                        put_task_struct(task);
+                }
                if (namespace)
                        m->private = namespace;
@@ -782,18 +669,27 @@ static ssize_t proc_info_read(struct file * file, char __user * buf,
        struct inode * inode = file->f_dentry->d_inode;
        unsigned long page;
        ssize_t length;
-        struct task_struct *task = proc_task(inode);
+        struct task_struct *task = get_proc_task(inode);
+        length = -ESRCH;
+        if (!task)
+                goto out_no_task;
        if (count > PROC_BLOCK_SIZE)
                count = PROC_BLOCK_SIZE;
+        length = -ENOMEM;
        if (!(page = __get_free_page(GFP_KERNEL)))
-                return -ENOMEM;
+                goto out;
        length = PROC_I(inode)->op.proc_read(task, (char*)page);
        if (length >= 0)
                length = simple_read_from_buffer(buf, count, ppos, (char *)page, length);
        free_page(page);
+out:
+        put_task_struct(task);
+out_no_task:
        return length;
 }
@@ -810,12 +706,15 @@ static int mem_open(struct inode* inode, struct file* file)
 static ssize_t mem_read(struct file * file, char __user * buf,
                        size_t count, loff_t *ppos)
 {
-        struct task_struct *task = proc_task(file->f_dentry->d_inode);
+        struct task_struct *task = get_proc_task(file->f_dentry->d_inode);
        char *page;
        unsigned long src = *ppos;
        int ret = -ESRCH;
        struct mm_struct *mm;
+        if (!task)
+                goto out_no_task;
        if (!MAY_PTRACE(task) || !ptrace_may_attach(task))
                goto out;
@@ -865,6 +764,8 @@ out_put:
 out_free:
        free_page((unsigned long) page);
 out:
+        put_task_struct(task);
+out_no_task:
        return ret;
 }
@@ -877,15 +778,20 @@ static ssize_t mem_write(struct file * file, const char * buf,
 {
        int copied = 0;
        char *page;
-        struct task_struct *task = proc_task(file->f_dentry->d_inode);
+        struct task_struct *task = get_proc_task(file->f_dentry->d_inode);
        unsigned long dst = *ppos;
+        copied = -ESRCH;
+        if (!task)
+                goto out_no_task;
        if (!MAY_PTRACE(task) || !ptrace_may_attach(task))
-                return -ESRCH;
+                goto out;
+        copied = -ENOMEM;
        page = (char *)__get_free_page(GFP_USER);
        if (!page)
-                return -ENOMEM;
+                goto out;
        while (count > 0) {
                int this_len, retval;
@@ -908,6 +814,9 @@ static ssize_t mem_write(struct file * file, const char * buf,
        }
        *ppos = dst;
        free_page((unsigned long) page);
+out:
+        put_task_struct(task);
+out_no_task:
        return copied;
 }
 #endif
@@ -938,13 +847,18 @@ static struct file_operations proc_mem_operations = {
 static ssize_t oom_adjust_read(struct file *file, char __user *buf,
                                size_t count, loff_t *ppos)
 {
-        struct task_struct *task = proc_task(file->f_dentry->d_inode);
+        struct task_struct *task = get_proc_task(file->f_dentry->d_inode);
-        char buffer[8];
+        char buffer[PROC_NUMBUF];
        size_t len;
-        int oom_adjust = task->oomkilladj;
+        int oom_adjust;
        loff_t __ppos = *ppos;
-        len = sprintf(buffer, "%i\n", oom_adjust);
+        if (!task)
+                return -ESRCH;
+        oom_adjust = task->oomkilladj;
+        put_task_struct(task);
+        len = snprintf(buffer, sizeof(buffer), "%i\n", oom_adjust);
        if (__ppos >= len)
                return 0;
        if (count > len-__ppos)
@@ -958,15 +872,15 @@ static ssize_t oom_adjust_read(struct file *file, char __user *buf,
 static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
                                size_t count, loff_t *ppos)
 {
-        struct task_struct *task = proc_task(file->f_dentry->d_inode);
+        struct task_struct *task;
-        char buffer[8], *end;
+        char buffer[PROC_NUMBUF], *end;
        int oom_adjust;
        if (!capable(CAP_SYS_RESOURCE))
                return -EPERM;
-        memset(buffer, 0, 8);
+        memset(buffer, 0, sizeof(buffer));
-        if (count > 6)
+        if (count > sizeof(buffer) - 1)
-                count = 6;
+                count = sizeof(buffer) - 1;
        if (copy_from_user(buffer, buf, count))
                return -EFAULT;
        oom_adjust = simple_strtol(buffer, &end, 0);
@@ -974,7 +888,11 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
                return -EINVAL;
        if (*end == '\n')
                end++;
+        task = get_proc_task(file->f_dentry->d_inode);
+        if (!task)
+                return -ESRCH;
        task->oomkilladj = oom_adjust;
+        put_task_struct(task);
        if (end - buffer == 0)
                return -EIO;
        return end - buffer;
@@ -985,22 +903,21 @@ static struct file_operations proc_oom_adjust_operations = {
        .write          = oom_adjust_write,
 };
-static struct inode_operations proc_mem_inode_operations = {
-        .permission     = proc_permission,
-};
 #ifdef CONFIG_AUDITSYSCALL
 #define TMPBUFLEN 21
 static ssize_t proc_loginuid_read(struct file * file, char __user * buf,
                                  size_t count, loff_t *ppos)
 {
        struct inode * inode = file->f_dentry->d_inode;
-        struct task_struct *task = proc_task(inode);
+        struct task_struct *task = get_proc_task(inode);
        ssize_t length;
        char tmpbuf[TMPBUFLEN];
+        if (!task)
+                return -ESRCH;
        length = scnprintf(tmpbuf, TMPBUFLEN, "%u",
                                audit_get_loginuid(task->audit_context));
+        put_task_struct(task);
        return simple_read_from_buffer(buf, count, ppos, tmpbuf, length);
 }
@@ -1010,17 +927,16 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
        struct inode * inode = file->f_dentry->d_inode;
        char *page, *tmp;
        ssize_t length;
-        struct task_struct *task = proc_task(inode);
        uid_t loginuid;
        if (!capable(CAP_AUDIT_CONTROL))
                return -EPERM;
-        if (current != task)
+        if (current != pid_task(proc_pid(inode), PIDTYPE_PID))
                return -EPERM;
-        if (count > PAGE_SIZE)
+        if (count >= PAGE_SIZE)
-                count = PAGE_SIZE;
+                count = PAGE_SIZE - 1;
        if (*ppos != 0) {
                /* No partial writes. */
@@ -1033,13 +949,14 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
        if (copy_from_user(page, buf, count))
                goto out_free_page;
+        page[count] = '\0';
        loginuid = simple_strtoul(page, &tmp, 10);
        if (tmp == page) {
                length = -EINVAL;
                goto out_free_page;
        }
-        length = audit_set_loginuid(task, loginuid);
+        length = audit_set_loginuid(current, loginuid);
        if (likely(length == 0))
                length = count;
@@ -1058,13 +975,16 @@ static struct file_operations proc_loginuid_operations = {
 static ssize_t seccomp_read(struct file *file, char __user *buf,
                            size_t count, loff_t *ppos)
 {
-        struct task_struct *tsk = proc_task(file->f_dentry->d_inode);
+        struct task_struct *tsk = get_proc_task(file->f_dentry->d_inode);
        char __buf[20];
        loff_t __ppos = *ppos;
        size_t len;
+        if (!tsk)
+                return -ESRCH;
        /* no need to print the trailing zero, so use only len */
        len = sprintf(__buf, "%u\n", tsk->seccomp.mode);
+        put_task_struct(tsk);
        if (__ppos >= len)
                return 0;
        if (count > len - __ppos)
@@ -1078,29 +998,43 @@ static ssize_t seccomp_read(struct file *file, char __user *buf,
 static ssize_t seccomp_write(struct file *file, const char __user *buf,
                             size_t count, loff_t *ppos)
 {
-        struct task_struct *tsk = proc_task(file->f_dentry->d_inode);
+        struct task_struct *tsk = get_proc_task(file->f_dentry->d_inode);
        char __buf[20], *end;
        unsigned int seccomp_mode;
+        ssize_t result;
+        result = -ESRCH;
+        if (!tsk)
+                goto out_no_task;
        /* can set it only once to be even more secure */
+        result = -EPERM;
        if (unlikely(tsk->seccomp.mode))
-                return -EPERM;
+                goto out;
+        result = -EFAULT;
        memset(__buf, 0, sizeof(__buf));
        count = min(count, sizeof(__buf) - 1);
        if (copy_from_user(__buf, buf, count))
-                return -EFAULT;
+                goto out;
        seccomp_mode = simple_strtoul(__buf, &end, 0);
        if (*end == '\n')
                end++;
+        result = -EINVAL;
        if (seccomp_mode && seccomp_mode <= NR_SECCOMP_MODES) {
                tsk->seccomp.mode = seccomp_mode;
                set_tsk_thread_flag(tsk, TIF_SECCOMP);
        } else
-                return -EINVAL;
+                goto out;
+        result = -EIO;
        if (unlikely(!(end - __buf)))
-                return -EIO;
+                goto out;
-        return end - __buf;
+        result = end - __buf;
+out:
+        put_task_struct(tsk);
+out_no_task:
+        return result;
 }
 static struct file_operations proc_seccomp_operations = {
@@ -1117,10 +1051,8 @@ static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd)
        /* We don't need a base pointer in the /proc filesystem */
        path_release(nd);
-        if (current->fsuid != inode->i_uid && !capable(CAP_DAC_OVERRIDE))
+        /* Are we allowed to snoop on the tasks file descriptors? */
-                goto out;
+        if (!proc_fd_access_allowed(inode))
-        error = proc_check_root(inode);
-        if (error)
                goto out;
        error = PROC_I(inode)->op.proc_get_link(inode, &nd->dentry, &nd->mnt);
@@ -1162,12 +1094,8 @@ static int proc_pid_readlink(struct dentry * dentry, char __user * buffer, int b
        struct dentry *de;
        struct vfsmount *mnt = NULL;
-        lock_kernel();
+        /* Are we allowed to snoop on the tasks file descriptors? */
+        if (!proc_fd_access_allowed(inode))
-        if (current->fsuid != inode->i_uid && !capable(CAP_DAC_OVERRIDE))
-                goto out;
-        error = proc_check_root(inode);
-        if (error)
                goto out;
        error = PROC_I(inode)->op.proc_get_link(inode, &de, &mnt);
@@ -1178,7 +1106,6 @@ static int proc_pid_readlink(struct dentry * dentry, char __user * buffer, int b
        dput(de);
        mntput(mnt);
 out:
-        unlock_kernel();
        return error;
 }
@@ -1187,21 +1114,20 @@ static struct inode_operations proc_pid_link_inode_operations = {
        .follow_link    = proc_pid_follow_link
 };
-#define NUMBUF 10
 static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir)
 {
-        struct inode *inode = filp->f_dentry->d_inode;
+        struct dentry *dentry = filp->f_dentry;
-        struct task_struct *p = proc_task(inode);
+        struct inode *inode = dentry->d_inode;
+        struct task_struct *p = get_proc_task(inode);
        unsigned int fd, tid, ino;
        int retval;
-        char buf[NUMBUF];
+        char buf[PROC_NUMBUF];
        struct files_struct * files;
        struct fdtable *fdt;
        retval = -ENOENT;
-        if (!pid_alive(p))
+        if (!p)
-                goto out;
+                goto out_no_task;
        retval = 0;
        tid = p->pid;
@@ -1212,7 +1138,7 @@ static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir)
                                goto out;
                        filp->f_pos++;
                case 1:
-                        ino = fake_ino(tid, PROC_TID_INO);
+                        ino = parent_ino(dentry);
                        if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0)
                                goto out;
                        filp->f_pos++;
@@ -1231,7 +1157,7 @@ static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir)
                                        continue;
                                rcu_read_unlock();
-                                j = NUMBUF;
+                                j = PROC_NUMBUF;
                                i = fd;
                                do {
                                        j--;
@@ -1240,7 +1166,7 @@ static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir)
                                } while (i);
                                ino = fake_ino(tid, PROC_TID_FD_DIR + fd);
-                                if (filldir(dirent, buf+j, NUMBUF-j, fd+2, ino, DT_LNK) < 0) {
+                                if (filldir(dirent, buf+j, PROC_NUMBUF-j, fd+2, ino, DT_LNK) < 0) {
                                        rcu_read_lock();
                                        break;
                                }
@@ -1250,6 +1176,8 @@ static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir)
                        put_files_struct(files);
        }
 out:
+        put_task_struct(p);
+out_no_task:
        return retval;
 }
@@ -1261,16 +1189,18 @@ static int proc_pident_readdir(struct file *filp,
        int pid;
        struct dentry *dentry = filp->f_dentry;
        struct inode *inode = dentry->d_inode;
+        struct task_struct *task = get_proc_task(inode);
        struct pid_entry *p;
        ino_t ino;
        int ret;
        ret = -ENOENT;
-        if (!pid_alive(proc_task(inode)))
+        if (!task)
                goto out;
        ret = 0;
-        pid = proc_task(inode)->pid;
+        pid = task->pid;
+        put_task_struct(task);
        i = filp->f_pos;
        switch (i) {
        case 0:
@@ -1353,22 +1283,19 @@ static struct inode *proc_pid_make_inode(struct super_block * sb, struct task_st
        /* Common stuff */
        ei = PROC_I(inode);
-        ei->task = NULL;
        inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
        inode->i_ino = fake_ino(task->pid, ino);
-        if (!pid_alive(task))
-                goto out_unlock;
        /*
         * grab the reference to task.
         */
-        get_task_struct(task);
+        ei->pid = get_pid(task->pids[PIDTYPE_PID].pid);
-        ei->task = task;
+        if (!ei->pid)
-        ei->type = ino;
+                goto out_unlock;
        inode->i_uid = 0;
        inode->i_gid = 0;
-        if (ino == PROC_TGID_INO || ino == PROC_TID_INO || task_dumpable(task)) {
+        if (task_dumpable(task)) {
                inode->i_uid = task->euid;
                inode->i_gid = task->egid;
        }
@@ -1378,7 +1305,6 @@ out:
        return inode;
 out_unlock:
-        ei->pde = NULL;
        iput(inode);
        return NULL;
 }
@@ -1392,13 +1318,21 @@ out_unlock:
 *
 * Rewrite the inode's ownerships here because the owning task may have
 * performed a setuid(), etc.
+ *
+ * Before the /proc/pid/status file was created the only way to read
+ * the effective uid of a /process was to stat /proc/pid.  Reading
+ * /proc/pid/status is slow enough that procps and other packages
+ * kept stating /proc/pid.  To keep the rules in /proc simple I have
+ * made this apply to all per process world readable and executable
+ * directories.
 */
 static int pid_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
        struct inode *inode = dentry->d_inode;
-        struct task_struct *task = proc_task(inode);
+        struct task_struct *task = get_proc_task(inode);
-        if (pid_alive(task)) {
+        if (task) {
-                if (proc_type(inode) == PROC_TGID_INO || proc_type(inode) == PROC_TID_INO || task_dumpable(task)) {
+                if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) ||
+                    task_dumpable(task)) {
                        inode->i_uid = task->euid;
                        inode->i_gid = task->egid;
                } else {
@@ -1406,59 +1340,75 @@ static int pid_revalidate(struct dentry *dentry, struct nameidata *nd)
                        inode->i_gid = 0;
                }
                security_task_to_inode(task, inode);
+                put_task_struct(task);
                return 1;
        }
        d_drop(dentry);
        return 0;
 }
+static int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
+{
+        struct inode *inode = dentry->d_inode;
+        struct task_struct *task;
+        generic_fillattr(inode, stat);
+        rcu_read_lock();
+        stat->uid = 0;
+        stat->gid = 0;
+        task = pid_task(proc_pid(inode), PIDTYPE_PID);
+        if (task) {
+                if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) ||
+                    task_dumpable(task)) {
+                        stat->uid = task->euid;
+                        stat->gid = task->egid;
+                }
+        }
+        rcu_read_unlock();
+        return 0;
+}
 static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
        struct inode *inode = dentry->d_inode;
-        struct task_struct *task = proc_task(inode);
+        struct task_struct *task = get_proc_task(inode);
-        int fd = proc_type(inode) - PROC_TID_FD_DIR;
+        int fd = proc_fd(inode);
        struct files_struct *files;
-        files = get_files_struct(task);
+        if (task) {
-        if (files) {
+                files = get_files_struct(task);
-                rcu_read_lock();
+                if (files) {
-                if (fcheck_files(files, fd)) {
+                        rcu_read_lock();
+                        if (fcheck_files(files, fd)) {
+                                rcu_read_unlock();
+                                put_files_struct(files);
+                                if (task_dumpable(task)) {
+                                        inode->i_uid = task->euid;
+                                        inode->i_gid = task->egid;
+                                } else {
+                                        inode->i_uid = 0;
+                                        inode->i_gid = 0;
+                                }
+                                security_task_to_inode(task, inode);
+                                put_task_struct(task);
+                                return 1;
+                        }
                        rcu_read_unlock();
                        put_files_struct(files);
-                        if (task_dumpable(task)) {
-                                inode->i_uid = task->euid;
-                                inode->i_gid = task->egid;
-                        } else {
-                                inode->i_uid = 0;
-                                inode->i_gid = 0;
-                        }
-                        security_task_to_inode(task, inode);
-                        return 1;
                }
-                rcu_read_unlock();
+                put_task_struct(task);
-                put_files_struct(files);
        }
        d_drop(dentry);
        return 0;
 }
-static void pid_base_iput(struct dentry *dentry, struct inode *inode)
-{
-        struct task_struct *task = proc_task(inode);
-        spin_lock(&task->proc_lock);
-        if (task->proc_dentry == dentry)
-                task->proc_dentry = NULL;
-        spin_unlock(&task->proc_lock);
-        iput(inode);
-}
 static int pid_delete_dentry(struct dentry * dentry)
 {
        /* Is the task we represent dead?
         * If so, then don't put the dentry on the lru list,
         * kill it immediately.
         */
-        return !pid_alive(proc_task(dentry->d_inode));
+        return !proc_pid(dentry->d_inode)->tasks[PIDTYPE_PID].first;
 }
 static struct dentry_operations tid_fd_dentry_operations =
@@ -1473,13 +1423,6 @@ static struct dentry_operations pid_dentry_operations =
        .d_delete       = pid_delete_dentry,
 };
-static struct dentry_operations pid_base_dentry_operations =
-{
-        .d_revalidate   = pid_revalidate,
-        .d_iput         = pid_base_iput,
-        .d_delete       = pid_delete_dentry,
-};
 /* Lookups */
 static unsigned name_to_int(struct dentry *dentry)
@@ -1507,22 +1450,24 @@ out:
 /* SMP-safe */
 static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry, struct nameidata *nd)
 {
-        struct task_struct *task = proc_task(dir);
+        struct task_struct *task = get_proc_task(dir);
        unsigned fd = name_to_int(dentry);
+        struct dentry *result = ERR_PTR(-ENOENT);
        struct file * file;
        struct files_struct * files;
        struct inode *inode;
        struct proc_inode *ei;
+        if (!task)
+                goto out_no_task;
        if (fd == ~0U)
                goto out;
-        if (!pid_alive(task))
-                goto out;
        inode = proc_pid_make_inode(dir->i_sb, task, PROC_TID_FD_DIR+fd);
        if (!inode)
                goto out;
        ei = PROC_I(inode);
+        ei->fd = fd;
        files = get_files_struct(task);
        if (!files)
                goto out_unlock;
@@ -1547,19 +1492,25 @@ static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry,
        ei->op.proc_get_link = proc_fd_link;
        dentry->d_op = &tid_fd_dentry_operations;
        d_add(dentry, inode);
-        return NULL;
+        /* Close the race of the process dying before we return the dentry */
+        if (tid_fd_revalidate(dentry, NULL))
+                result = NULL;
+out:
+        put_task_struct(task);
+out_no_task:
+        return result;
 out_unlock2:
        spin_unlock(&files->file_lock);
        put_files_struct(files);
 out_unlock:
        iput(inode);
-out:
+        goto out;
-        return ERR_PTR(-ENOENT);
 }
 static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldir);
 static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd);
+static int proc_task_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat);
 static struct file_operations proc_fd_operations = {
        .read           = generic_read_dir,
@@ -1576,12 +1527,11 @@ static struct file_operations proc_task_operations = {
 */
 static struct inode_operations proc_fd_inode_operations = {
        .lookup         = proc_lookupfd,
-        .permission     = proc_permission,
 };
 static struct inode_operations proc_task_inode_operations = {
        .lookup         = proc_task_lookup,
-        .permission     = proc_task_permission,
+        .getattr        = proc_task_getattr,
 };
 #ifdef CONFIG_SECURITY
@@ -1591,12 +1541,17 @@ static ssize_t proc_pid_attr_read(struct file * file, char __user * buf,
        struct inode * inode = file->f_dentry->d_inode;
        unsigned long page;
        ssize_t length;
-        struct task_struct *task = proc_task(inode);
+        struct task_struct *task = get_proc_task(inode);
+        length = -ESRCH;
+        if (!task)
+                goto out_no_task;
        if (count > PAGE_SIZE)
                count = PAGE_SIZE;
+        length = -ENOMEM;
        if (!(page = __get_free_page(GFP_KERNEL)))
-                return -ENOMEM;
+                goto out;
        length = security_getprocattr(task, 
                                      (char*)file->f_dentry->d_name.name, 
@@ -1604,6 +1559,9 @@ static ssize_t proc_pid_attr_read(struct file * file, char __user * buf,
        if (length >= 0)
                length = simple_read_from_buffer(buf, count, ppos, (char *)page, length);
        free_page(page);
+out:
+        put_task_struct(task);
+out_no_task:
        return length;
 }
@@ -1613,26 +1571,36 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf,
        struct inode * inode = file->f_dentry->d_inode;
        char *page; 
        ssize_t length; 
-        struct task_struct *task = proc_task(inode); 
+        struct task_struct *task = get_proc_task(inode);
+        length = -ESRCH;
+        if (!task)
+                goto out_no_task;
        if (count > PAGE_SIZE) 
                count = PAGE_SIZE; 
-        if (*ppos != 0) {
-                /* No partial writes. */
+        /* No partial writes. */
-                return -EINVAL;
+        length = -EINVAL;
-        }
+        if (*ppos != 0)
+                goto out;
+        length = -ENOMEM;
        page = (char*)__get_free_page(GFP_USER); 
        if (!page) 
-                return -ENOMEM;
+                goto out;
        length = -EFAULT; 
        if (copy_from_user(page, buf, count)) 
-                goto out;
+                goto out_free;
        length = security_setprocattr(task, 
                                      (char*)file->f_dentry->d_name.name, 
                                      (void*)page, count);
-out:
+out_free:
        free_page((unsigned long) page);
+out:
+        put_task_struct(task);
+out_no_task:
        return length;
 } 
@@ -1647,24 +1615,22 @@ static struct file_operations proc_tgid_attr_operations;
 static struct inode_operations proc_tgid_attr_inode_operations;
 #endif
-static int get_tid_list(int index, unsigned int *tids, struct inode *dir);
 /* SMP-safe */
 static struct dentry *proc_pident_lookup(struct inode *dir, 
                                         struct dentry *dentry,
                                         struct pid_entry *ents)
 {
        struct inode *inode;
-        int error;
+        struct dentry *error;
-        struct task_struct *task = proc_task(dir);
+        struct task_struct *task = get_proc_task(dir);
        struct pid_entry *p;
        struct proc_inode *ei;
-        error = -ENOENT;
+        error = ERR_PTR(-ENOENT);
        inode = NULL;
-        if (!pid_alive(task))
+        if (!task)
-                goto out;
+                goto out_no_task;
        for (p = ents; p->name; p++) {
                if (p->len != dentry->d_name.len)
@@ -1675,7 +1641,7 @@ static struct dentry *proc_pident_lookup(struct inode *dir,
        if (!p->name)
                goto out;
-        error = -EINVAL;
+        error = ERR_PTR(-EINVAL);
        inode = proc_pid_make_inode(dir->i_sb, task, p->type);
        if (!inode)
                goto out;
@@ -1688,7 +1654,7 @@ static struct dentry *proc_pident_lookup(struct inode *dir,
         */
        switch(p->type) {
                case PROC_TGID_TASK:
-                        inode->i_nlink = 2 + get_tid_list(2, NULL, dir);
+                        inode->i_nlink = 2;
                        inode->i_op = &proc_task_inode_operations;
                        inode->i_fop = &proc_task_operations;
                        break;
@@ -1758,7 +1724,6 @@ static struct dentry *proc_pident_lookup(struct inode *dir,
 #endif
                case PROC_TID_MEM:
                case PROC_TGID_MEM:
-                        inode->i_op = &proc_mem_inode_operations;
                        inode->i_fop = &proc_mem_operations;
                        break;
 #ifdef CONFIG_SECCOMP
@@ -1800,6 +1765,10 @@ static struct dentry *proc_pident_lookup(struct inode *dir,
                case PROC_TGID_ATTR_EXEC:
                case PROC_TID_ATTR_FSCREATE:
                case PROC_TGID_ATTR_FSCREATE:
+                case PROC_TID_ATTR_KEYCREATE:
+                case PROC_TGID_ATTR_KEYCREATE:
+                case PROC_TID_ATTR_SOCKCREATE:
+                case PROC_TGID_ATTR_SOCKCREATE:
                        inode->i_fop = &proc_pid_attr_operations;
                        break;
 #endif
@@ -1841,14 +1810,18 @@ static struct dentry *proc_pident_lookup(struct inode *dir,
                default:
                        printk("procfs: impossible type (%d)",p->type);
                        iput(inode);
-                        return ERR_PTR(-EINVAL);
+                        error = ERR_PTR(-EINVAL);
+                        goto out;
        }
        dentry->d_op = &pid_dentry_operations;
        d_add(dentry, inode);
-        return NULL;
+        /* Close the race of the process dying before we return the dentry */
+        if (pid_revalidate(dentry, NULL))
+                error = NULL;
 out:
-        return ERR_PTR(error);
+        put_task_struct(task);
+out_no_task:
+        return error;
 }
 static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd){
@@ -1871,10 +1844,12 @@ static struct file_operations proc_tid_base_operations = {
 static struct inode_operations proc_tgid_base_inode_operations = {
        .lookup         = proc_tgid_base_lookup,
+        .getattr        = pid_getattr,
 };
 static struct inode_operations proc_tid_base_inode_operations = {
        .lookup         = proc_tid_base_lookup,
+        .getattr        = pid_getattr,
 };
 #ifdef CONFIG_SECURITY
@@ -1916,10 +1891,12 @@ static struct dentry *proc_tid_attr_lookup(struct inode *dir,
 static struct inode_operations proc_tgid_attr_inode_operations = {
        .lookup         = proc_tgid_attr_lookup,
+        .getattr        = pid_getattr,
 };
 static struct inode_operations proc_tid_attr_inode_operations = {
        .lookup         = proc_tid_attr_lookup,
+        .getattr        = pid_getattr,
 };
 #endif
@@ -1929,14 +1906,14 @@ static struct inode_operations proc_tid_attr_inode_operations = {
 static int proc_self_readlink(struct dentry *dentry, char __user *buffer,
                              int buflen)
 {
-        char tmp[30];
+        char tmp[PROC_NUMBUF];
        sprintf(tmp, "%d", current->tgid);
        return vfs_readlink(dentry,buffer,buflen,tmp);
 }
 static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd)
 {
-        char tmp[30];
+        char tmp[PROC_NUMBUF];
        sprintf(tmp, "%d", current->tgid);
        return ERR_PTR(vfs_follow_link(nd,tmp));
 }       
@@ -1947,67 +1924,80 @@ static struct inode_operations proc_self_inode_operations = {
 };
 /**
- * proc_pid_unhash -  Unhash /proc/@pid entry from the dcache.
+ * proc_flush_task -  Remove dcache entries for @task from the /proc dcache.
- * @p: task that should be flushed.
+ *
+ * @task: task that should be flushed.
+ *
+ * Looks in the dcache for
+ * /proc/@pid
+ * /proc/@tgid/task/@pid
+ * if either directory is present flushes it and all of its children
+ * from the dcache.
 *
- * Drops the /proc/@pid dcache entry from the hash chains.
+ * It is safe and reasonable to cache /proc entries for a task until
+ * that task exits.  After that they just clog up the dcache with
+ * useless entries, possibly causing useful dcache entries to be
+ * flushed instead.  This routine is provided to flush those useless
+ * dcache entries at process exit time.
 *
- * Dropping /proc/@pid entries and detach_pid must be synchroneous,
+ * NOTE: This routine is just an optimization so it does not guarantee
- * otherwise e.g. /proc/@pid/exe might point to the wrong executable,
+ *       that no dcache entries will exist at process exit time it
- * if the pid value is immediately reused. This is enforced by
+ *       just makes it very unlikely that any will persist.
- * - caller must acquire spin_lock(p->proc_lock)
- * - must be called before detach_pid()
- * - proc_pid_lookup acquires proc_lock, and checks that
- *   the target is not dead by looking at the attach count
- *   of PIDTYPE_PID.
 */
+void proc_flush_task(struct task_struct *task)
-struct dentry *proc_pid_unhash(struct task_struct *p)
 {
-        struct dentry *proc_dentry;
+        struct dentry *dentry, *leader, *dir;
+        char buf[PROC_NUMBUF];
+        struct qstr name;
+        name.name = buf;
+        name.len = snprintf(buf, sizeof(buf), "%d", task->pid);
+        dentry = d_hash_and_lookup(proc_mnt->mnt_root, &name);
+        if (dentry) {
+                shrink_dcache_parent(dentry);
+                d_drop(dentry);
+                dput(dentry);
+        }
-        proc_dentry = p->proc_dentry;
+        if (thread_group_leader(task))
-        if (proc_dentry != NULL) {
+                goto out;
-                spin_lock(&dcache_lock);
+        name.name = buf;
-                spin_lock(&proc_dentry->d_lock);
+        name.len = snprintf(buf, sizeof(buf), "%d", task->tgid);
-                if (!d_unhashed(proc_dentry)) {
+        leader = d_hash_and_lookup(proc_mnt->mnt_root, &name);
-                        dget_locked(proc_dentry);
+        if (!leader)
-                        __d_drop(proc_dentry);
+                goto out;
-                        spin_unlock(&proc_dentry->d_lock);
-                } else {
-                        spin_unlock(&proc_dentry->d_lock);
-                        proc_dentry = NULL;
-                }
-                spin_unlock(&dcache_lock);
-        }
-        return proc_dentry;
-}
-/**
+        name.name = "task";
- * proc_pid_flush - recover memory used by stale /proc/@pid/x entries
+        name.len = strlen(name.name);
- * @proc_dentry: directoy to prune.
+        dir = d_hash_and_lookup(leader, &name);
- *
+        if (!dir)
- * Shrink the /proc directory that was used by the just killed thread.
+                goto out_put_leader;
- */
-        
+        name.name = buf;
-void proc_pid_flush(struct dentry *proc_dentry)
+        name.len = snprintf(buf, sizeof(buf), "%d", task->pid);
-{
+        dentry = d_hash_and_lookup(dir, &name);
-        might_sleep();
+        if (dentry) {
-        if(proc_dentry != NULL) {
+                shrink_dcache_parent(dentry);
-                shrink_dcache_parent(proc_dentry);
+                d_drop(dentry);
-                dput(proc_dentry);
+                dput(dentry);
        }
+        dput(dir);
+out_put_leader:
+        dput(leader);
+out:
+        return;
 }
 /* SMP-safe */
 struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd)
 {
+        struct dentry *result = ERR_PTR(-ENOENT);
        struct task_struct *task;
        struct inode *inode;
        struct proc_inode *ei;
        unsigned tgid;
-        int died;
        if (dentry->d_name.len == 4 && !memcmp(dentry->d_name.name,"self",4)) {
                inode = new_inode(dir->i_sb);
@@ -2028,21 +2018,18 @@ struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct
        if (tgid == ~0U)
                goto out;
-        read_lock(&tasklist_lock);
+        rcu_read_lock();
        task = find_task_by_pid(tgid);
        if (task)
                get_task_struct(task);
-        read_unlock(&tasklist_lock);
+        rcu_read_unlock();
        if (!task)
                goto out;
        inode = proc_pid_make_inode(dir->i_sb, task, PROC_TGID_INO);
+        if (!inode)
+                goto out_put_task;
-        if (!inode) {
-                put_task_struct(task);
-                goto out;
-        }
        inode->i_mode = S_IFDIR|S_IRUGO|S_IXUGO;
        inode->i_op = &proc_tgid_base_inode_operations;
        inode->i_fop = &proc_tgid_base_operations;
@@ -2053,45 +2040,40 @@ struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct
        inode->i_nlink = 4;
 #endif
-        dentry->d_op = &pid_base_dentry_operations;
+        dentry->d_op = &pid_dentry_operations;
-        died = 0;
        d_add(dentry, inode);
-        spin_lock(&task->proc_lock);
+        /* Close the race of the process dying before we return the dentry */
-        task->proc_dentry = dentry;
+        if (pid_revalidate(dentry, NULL))
-        if (!pid_alive(task)) {
+                result = NULL;
-                dentry = proc_pid_unhash(task);
-                died = 1;
-        }
-        spin_unlock(&task->proc_lock);
+out_put_task:
        put_task_struct(task);
-        if (died) {
-                proc_pid_flush(dentry);
-                goto out;
-        }
-        return NULL;
 out:
-        return ERR_PTR(-ENOENT);
+        return result;
 }
 /* SMP-safe */
 static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd)
 {
+        struct dentry *result = ERR_PTR(-ENOENT);
        struct task_struct *task;
-        struct task_struct *leader = proc_task(dir);
+        struct task_struct *leader = get_proc_task(dir);
        struct inode *inode;
        unsigned tid;
+        if (!leader)
+                goto out_no_task;
        tid = name_to_int(dentry);
        if (tid == ~0U)
                goto out;
-        read_lock(&tasklist_lock);
+        rcu_read_lock();
        task = find_task_by_pid(tid);
        if (task)
                get_task_struct(task);
-        read_unlock(&tasklist_lock);
+        rcu_read_unlock();
        if (!task)
                goto out;
        if (leader->tgid != task->tgid)
@@ -2112,101 +2094,95 @@ static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry
        inode->i_nlink = 3;
 #endif
-        dentry->d_op = &pid_base_dentry_operations;
+        dentry->d_op = &pid_dentry_operations;
        d_add(dentry, inode);
+        /* Close the race of the process dying before we return the dentry */
+        if (pid_revalidate(dentry, NULL))
+                result = NULL;
-        put_task_struct(task);
-        return NULL;
 out_drop_task:
        put_task_struct(task);
 out:
-        return ERR_PTR(-ENOENT);
+        put_task_struct(leader);
+out_no_task:
+        return result;
 }
-#define PROC_NUMBUF 10
-#define PROC_MAXPIDS 20
 /*
- * Get a few tgid's to return for filldir - we need to hold the
+ * Find the first tgid to return to user space.
- * tasklist lock while doing this, and we must release it before
+ *
- * we actually do the filldir itself, so we use a temp buffer..
+ * Usually this is just whatever follows &init_task, but if the user's
+ * buffer was too small to hold the full list or there was a seek into
+ * the middle of the directory we have more work to do.
+ *
+ * In the case of a short read we start with find_task_by_pid.
+ *
+ * In the case of a seek we start with &init_task and walk nr
+ * threads past it.
 */
-static int get_tgid_list(int index, unsigned long version, unsigned int *tgids)
+static struct task_struct *first_tgid(int tgid, unsigned int nr)
-{
+{
-        struct task_struct *p;
+        struct task_struct *pos;
-        int nr_tgids = 0;
+        rcu_read_lock();
+        if (tgid && nr) {
-        index--;
+                pos = find_task_by_pid(tgid);
-        read_lock(&tasklist_lock);
+                if (pos && thread_group_leader(pos))
-        p = NULL;
+                        goto found;
-        if (version) {
-                p = find_task_by_pid(version);
-                if (p && !thread_group_leader(p))
-                        p = NULL;
        }
+        /* If nr exceeds the number of processes get out quickly */
+        pos = NULL;
+        if (nr && nr >= nr_processes())
+                goto done;
-        if (p)
+        /* If we haven't found our starting place yet start with
-                index = 0;
+         * the init_task and walk nr tasks forward.
-        else
+         */
-                p = next_task(&init_task);
+        for (pos = next_task(&init_task); nr > 0; --nr) {
+                pos = next_task(pos);
-        for ( ; p != &init_task; p = next_task(p)) {
+                if (pos == &init_task) {
-                int tgid = p->pid;
+                        pos = NULL;
-                if (!pid_alive(p))
+                        goto done;
-                        continue;
+                }
-                if (--index >= 0)
-                        continue;
-                tgids[nr_tgids] = tgid;
-                nr_tgids++;
-                if (nr_tgids >= PROC_MAXPIDS)
-                        break;
        }
-        read_unlock(&tasklist_lock);
+found:
-        return nr_tgids;
+        get_task_struct(pos);
+done:
+        rcu_read_unlock();
+        return pos;
 }
 /*
- * Get a few tid's to return for filldir - we need to hold the
+ * Find the next task in the task list.
- * tasklist lock while doing this, and we must release it before
+ * Return NULL if we loop or there is any error.
- * we actually do the filldir itself, so we use a temp buffer..
+ *
+ * The reference to the input task_struct is released.
 */
-static int get_tid_list(int index, unsigned int *tids, struct inode *dir)
+static struct task_struct *next_tgid(struct task_struct *start)
-{
+{
-        struct task_struct *leader_task = proc_task(dir);
+        struct task_struct *pos;
-        struct task_struct *task = leader_task;
+        rcu_read_lock();
-        int nr_tids = 0;
+        pos = start;
+        if (pid_alive(start))
-        index -= 2;
+                pos = next_task(start);
-        read_lock(&tasklist_lock);
+        if (pid_alive(pos) && (pos != &init_task)) {
-        /*
+                get_task_struct(pos);
-         * The starting point task (leader_task) might be an already
+                goto done;
-         * unlinked task, which cannot be used to access the task-list
+        }
-         * via next_thread().
+        pos = NULL;
-         */
+done:
-        if (pid_alive(task)) do {
+        rcu_read_unlock();
-                int tid = task->pid;
+        put_task_struct(start);
+        return pos;
-                if (--index >= 0)
-                        continue;
-                if (tids != NULL)
-                        tids[nr_tids] = tid;
-                nr_tids++;
-                if (nr_tids >= PROC_MAXPIDS)
-                        break;
-        } while ((task = next_thread(task)) != leader_task);
-        read_unlock(&tasklist_lock);
-        return nr_tids;
 }
 /* for the /proc/ directory itself, after non-process stuff has been done */
 int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir)
 {
-        unsigned int tgid_array[PROC_MAXPIDS];
        char buf[PROC_NUMBUF];
        unsigned int nr = filp->f_pos - FIRST_PROCESS_ENTRY;
-        unsigned int nr_tgids, i;
+        struct task_struct *task;
-        int next_tgid;
+        int tgid;
        if (!nr) {
                ino_t ino = fake_ino(0,PROC_TGID_INO);
@@ -2215,63 +2191,116 @@ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir)
                filp->f_pos++;
                nr++;
        }
+        nr -= 1;
        /* f_version caches the tgid value that the last readdir call couldn't
         * return. lseek aka telldir automagically resets f_version to 0.
         */
-        next_tgid = filp->f_version;
+        tgid = filp->f_version;
        filp->f_version = 0;
-        for (;;) {
+        for (task = first_tgid(tgid, nr);
-                nr_tgids = get_tgid_list(nr, next_tgid, tgid_array);
+             task;
-                if (!nr_tgids) {
+             task = next_tgid(task), filp->f_pos++) {
-                        /* no more entries ! */
+                int len;
+                ino_t ino;
+                tgid = task->pid;
+                len = snprintf(buf, sizeof(buf), "%d", tgid);
+                ino = fake_ino(tgid, PROC_TGID_INO);
+                if (filldir(dirent, buf, len, filp->f_pos, ino, DT_DIR) < 0) {
+                        /* returning this tgid failed, save it as the first
+                         * pid for the next readdir call */
+                        filp->f_version = tgid;
+                        put_task_struct(task);
                        break;
                }
-                next_tgid = 0;
+        }
+        return 0;
+}
-                /* do not use the last found pid, reserve it for next_tgid */
+/*
-                if (nr_tgids == PROC_MAXPIDS) {
+ * Find the first tid of a thread group to return to user space.
-                        nr_tgids--;
+ *
-                        next_tgid = tgid_array[nr_tgids];
+ * Usually this is just the thread group leader, but if the user's
-                }
+ * buffer was too small or there was a seek into the middle of the
+ * directory we have more work to do.
+ *
+ * In the case of a short read we start with find_task_by_pid.
+ *
+ * In the case of a seek we start with the leader and walk nr
+ * threads past it.
+ */
+static struct task_struct *first_tid(struct task_struct *leader,
+                                        int tid, int nr)
+{
+        struct task_struct *pos;
-                for (i=0;i<nr_tgids;i++) {
+        rcu_read_lock();
-                        int tgid = tgid_array[i];
+        /* Attempt to start with the pid of a thread */
-                        ino_t ino = fake_ino(tgid,PROC_TGID_INO);
+        if (tid && (nr > 0)) {
-                        unsigned long j = PROC_NUMBUF;
+                pos = find_task_by_pid(tid);
+                if (pos && (pos->group_leader == leader))
+                        goto found;
+        }
-                        do
+        /* If nr exceeds the number of threads there is nothing to do */
-                                buf[--j] = '0' + (tgid % 10);
+        pos = NULL;
-                        while ((tgid /= 10) != 0);
+        if (nr && nr >= get_nr_threads(leader))
+                goto out;
-                        if (filldir(dirent, buf+j, PROC_NUMBUF-j, filp->f_pos, ino, DT_DIR) < 0) {
+        /* If we haven't found our starting place yet start
-                                /* returning this tgid failed, save it as the first
+         * with the leader and walk nr threads forward.
-                                 * pid for the next readir call */
+         */
-                                filp->f_version = tgid_array[i];
+        for (pos = leader; nr > 0; --nr) {
-                                goto out;
+                pos = next_thread(pos);
-                        }
+                if (pos == leader) {
-                        filp->f_pos++;
+                        pos = NULL;
-                        nr++;
+                        goto out;
                }
        }
+found:
+        get_task_struct(pos);
 out:
-        return 0;
+        rcu_read_unlock();
+        return pos;
+}
+/*
+ * Find the next thread in the thread list.
+ * Return NULL if there is an error or no next thread.
+ *
+ * The reference to the input task_struct is released.
+ */
+static struct task_struct *next_tid(struct task_struct *start)
+{
+        struct task_struct *pos = NULL;
+        rcu_read_lock();
+        if (pid_alive(start)) {
+                pos = next_thread(start);
+                if (thread_group_leader(pos))
+                        pos = NULL;
+                else
+                        get_task_struct(pos);
+        }
+        rcu_read_unlock();
+        put_task_struct(start);
+        return pos;
 }
 /* for the /proc/TGID/task/ directories */
 static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldir)
 {
-        unsigned int tid_array[PROC_MAXPIDS];
        char buf[PROC_NUMBUF];
-        unsigned int nr_tids, i;
        struct dentry *dentry = filp->f_dentry;
        struct inode *inode = dentry->d_inode;
+        struct task_struct *leader = get_proc_task(inode);
+        struct task_struct *task;
        int retval = -ENOENT;
        ino_t ino;
+        int tid;
        unsigned long pos = filp->f_pos;  /* avoiding "long long" filp->f_pos */
-        if (!pid_alive(proc_task(inode)))
+        if (!leader)
-                goto out;
+                goto out_no_task;
        retval = 0;
        switch (pos) {
@@ -2289,24 +2318,45 @@ static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldi
                /* fall through */
        }
-        nr_tids = get_tid_list(pos, tid_array, inode);
+        /* f_version caches the tgid value that the last readdir call couldn't
-        inode->i_nlink = pos + nr_tids;
+         * return. lseek aka telldir automagically resets f_version to 0.
+         */
-        for (i = 0; i < nr_tids; i++) {
+        tid = filp->f_version;
-                unsigned long j = PROC_NUMBUF;
+        filp->f_version = 0;
-                int tid = tid_array[i];
+        for (task = first_tid(leader, tid, pos - 2);
+             task;
-                ino = fake_ino(tid,PROC_TID_INO);
+             task = next_tid(task), pos++) {
+                int len;
-                do
+                tid = task->pid;
-                        buf[--j] = '0' + (tid % 10);
+                len = snprintf(buf, sizeof(buf), "%d", tid);
-                while ((tid /= 10) != 0);
+                ino = fake_ino(tid, PROC_TID_INO);
+                if (filldir(dirent, buf, len, pos, ino, DT_DIR) < 0) {
-                if (filldir(dirent, buf+j, PROC_NUMBUF-j, pos, ino, DT_DIR) < 0)
+                        /* returning this tid failed, save it as the first
+                         * tid for the next readdir call */
+                        filp->f_version = tid;
+                        put_task_struct(task);
                        break;
-                pos++;
+                }
        }
 out:
        filp->f_pos = pos;
+        put_task_struct(leader);
+out_no_task:
        return retval;
 }
+static int proc_task_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
+{
+        struct inode *inode = dentry->d_inode;
+        struct task_struct *p = get_proc_task(inode);
+        generic_fillattr(inode, stat);
+        if (p) {
+                rcu_read_lock();
+                stat->nlink += get_nr_threads(p);
+                rcu_read_unlock();
+                put_task_struct(p);
+        }
+        return 0;
+}
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 722b9c463111..6dcef089e18e 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -58,14 +58,11 @@ static void de_put(struct proc_dir_entry *de)
 static void proc_delete_inode(struct inode *inode)
 {
        struct proc_dir_entry *de;
-        struct task_struct *tsk;
        truncate_inode_pages(&inode->i_data, 0);
-        /* Let go of any associated process */
+        /* Stop tracking associated processes */
-        tsk = PROC_I(inode)->task;
+        put_pid(PROC_I(inode)->pid);
-        if (tsk)
-                put_task_struct(tsk);
        /* Let go of any associated proc directory entry */
        de = PROC_I(inode)->pde;
@@ -94,8 +91,8 @@ static struct inode *proc_alloc_inode(struct super_block *sb)
        ei = (struct proc_inode *)kmem_cache_alloc(proc_inode_cachep, SLAB_KERNEL);
        if (!ei)
                return NULL;
-        ei->task = NULL;
+        ei->pid = NULL;
-        ei->type = 0;
+        ei->fd = 0;
        ei->op.proc_get_link = NULL;
        ei->pde = NULL;
        inode = &ei->vfs_inode;
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 0502f17b860d..146a434ba944 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -37,16 +37,30 @@ extern int proc_tgid_stat(struct task_struct *, char *);
 extern int proc_pid_status(struct task_struct *, char *);
 extern int proc_pid_statm(struct task_struct *, char *);
+extern struct file_operations proc_maps_operations;
+extern struct file_operations proc_numa_maps_operations;
+extern struct file_operations proc_smaps_operations;
+extern struct file_operations proc_maps_operations;
+extern struct file_operations proc_numa_maps_operations;
+extern struct file_operations proc_smaps_operations;
 void free_proc_entry(struct proc_dir_entry *de);
 int proc_init_inodecache(void);
-static inline struct task_struct *proc_task(struct inode *inode)
+static inline struct pid *proc_pid(struct inode *inode)
+{
+        return PROC_I(inode)->pid;
+}
+static inline struct task_struct *get_proc_task(struct inode *inode)
 {
-        return PROC_I(inode)->task;
+        return get_pid_task(proc_pid(inode), PIDTYPE_PID);
 }
-static inline int proc_type(struct inode *inode)
+static inline int proc_fd(struct inode *inode)
 {
-        return PROC_I(inode)->type;
+        return PROC_I(inode)->fd;
 }
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 17f6e8fa1397..036d14d83627 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -9,7 +9,6 @@
 *      Safe accesses to vmalloc/direct-mapped discontiguous areas, Kanoj Sarcar <kanoj@sgi.com>
 */
-#include <linux/config.h>
 #include <linux/mm.h>
 #include <linux/proc_fs.h>
 #include <linux/user.h>
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 5c10ea157425..9f2cfc30f9cf 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -26,7 +26,6 @@
 #include <linux/mman.h>
 #include <linux/proc_fs.h>
 #include <linux/ioport.h>
-#include <linux/config.h>
 #include <linux/mm.h>
 #include <linux/mmzone.h>
 #include <linux/pagemap.h>
@@ -120,7 +119,6 @@ static int meminfo_read_proc(char *page, char **start, off_t off,
 {
        struct sysinfo i;
        int len;
-        struct page_state ps;
        unsigned long inactive;
        unsigned long active;
        unsigned long free;
@@ -129,7 +127,6 @@ static int meminfo_read_proc(char *page, char **start, off_t off,
        struct vmalloc_info vmi;
        long cached;
-        get_page_state(&ps);
        get_zone_counts(&active, &inactive, &free);
 /*
@@ -142,7 +139,8 @@ static int meminfo_read_proc(char *page, char **start, off_t off,
        allowed = ((totalram_pages - hugetlb_total_pages())
                * sysctl_overcommit_ratio / 100) + total_swap_pages;
-        cached = get_page_cache_size() - total_swapcache_pages - i.bufferram;
+        cached = global_page_state(NR_FILE_PAGES) -
+                        total_swapcache_pages - i.bufferram;
        if (cached < 0)
                cached = 0;
@@ -167,11 +165,14 @@ static int meminfo_read_proc(char *page, char **start, off_t off,
                "SwapFree:     %8lu kB\n"
                "Dirty:        %8lu kB\n"
                "Writeback:    %8lu kB\n"
+                "AnonPages:    %8lu kB\n"
                "Mapped:       %8lu kB\n"
                "Slab:         %8lu kB\n"
+                "PageTables:   %8lu kB\n"
+                "NFS Unstable: %8lu kB\n"
+                "Bounce:       %8lu kB\n"
                "CommitLimit:  %8lu kB\n"
                "Committed_AS: %8lu kB\n"
-                "PageTables:   %8lu kB\n"
                "VmallocTotal: %8lu kB\n"
                "VmallocUsed:  %8lu kB\n"
                "VmallocChunk: %8lu kB\n",
@@ -188,13 +189,16 @@ static int meminfo_read_proc(char *page, char **start, off_t off,
                K(i.freeram-i.freehigh),
                K(i.totalswap),
                K(i.freeswap),
-                K(ps.nr_dirty),
+                K(global_page_state(NR_FILE_DIRTY)),
-                K(ps.nr_writeback),
+                K(global_page_state(NR_WRITEBACK)),
-                K(ps.nr_mapped),
+                K(global_page_state(NR_ANON_PAGES)),
-                K(ps.nr_slab),
+                K(global_page_state(NR_FILE_MAPPED)),
+                K(global_page_state(NR_SLAB)),
+                K(global_page_state(NR_PAGETABLE)),
+                K(global_page_state(NR_UNSTABLE_NFS)),
+                K(global_page_state(NR_BOUNCE)),
                K(allowed),
                K(committed),
-                K(ps.nr_page_table_pages),
                (unsigned long)VMALLOC_TOTAL >> 10,
                vmi.used >> 10,
                vmi.largest_chunk >> 10
diff --git a/fs/proc/root.c b/fs/proc/root.c
index c3fd3611112f..8901c65caca8 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -12,7 +12,6 @@
 #include <linux/time.h>
 #include <linux/proc_fs.h>
 #include <linux/stat.h>
-#include <linux/config.h>
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/bitops.h>
@@ -26,10 +25,10 @@ struct proc_dir_entry *proc_net, *proc_net_stat, *proc_bus, *proc_root_fs, *proc
 struct proc_dir_entry *proc_sys_root;
 #endif
-static struct super_block *proc_get_sb(struct file_system_type *fs_type,
+static int proc_get_sb(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data)
+        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-        return get_sb_single(fs_type, flags, data, proc_fill_super);
+        return get_sb_single(fs_type, flags, data, proc_fill_super, mnt);
 }
 static struct file_system_type proc_fs_type = {
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 91b7c15ab373..0a163a4f7764 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -75,9 +75,13 @@ int proc_exe_link(struct inode *inode, struct dentry **dentry, struct vfsmount *
 {
        struct vm_area_struct * vma;
        int result = -ENOENT;
-        struct task_struct *task = proc_task(inode);
+        struct task_struct *task = get_proc_task(inode);
-        struct mm_struct * mm = get_task_mm(task);
+        struct mm_struct * mm = NULL;
+        if (task) {
+                mm = get_task_mm(task);
+                put_task_struct(task);
+        }
        if (!mm)
                goto out;
        down_read(&mm->mmap_sem);
@@ -118,9 +122,15 @@ struct mem_size_stats
        unsigned long private_dirty;
 };
+__attribute__((weak)) const char *arch_vma_name(struct vm_area_struct *vma)
+{
+        return NULL;
+}
 static int show_map_internal(struct seq_file *m, void *v, struct mem_size_stats *mss)
 {
-        struct task_struct *task = m->private;
+        struct proc_maps_private *priv = m->private;
+        struct task_struct *task = priv->task;
        struct vm_area_struct *vma = v;
        struct mm_struct *mm = vma->vm_mm;
        struct file *file = vma->vm_file;
@@ -153,22 +163,23 @@ static int show_map_internal(struct seq_file *m, void *v, struct mem_size_stats
                pad_len_spaces(m, len);
                seq_path(m, file->f_vfsmnt, file->f_dentry, "\n");
        } else {
-                if (mm) {
+                const char *name = arch_vma_name(vma);
-                        if (vma->vm_start <= mm->start_brk &&
+                if (!name) {
+                        if (mm) {
+                                if (vma->vm_start <= mm->start_brk &&
                                                vma->vm_end >= mm->brk) {
-                                pad_len_spaces(m, len);
+                                        name = "[heap]";
-                                seq_puts(m, "[heap]");
+                                } else if (vma->vm_start <= mm->start_stack &&
-                        } else {
+                                           vma->vm_end >= mm->start_stack) {
-                                if (vma->vm_start <= mm->start_stack &&
+                                        name = "[stack]";
-                                        vma->vm_end >= mm->start_stack) {
-                                        pad_len_spaces(m, len);
-                                        seq_puts(m, "[stack]");
                                }
+                        } else {
+                                name = "[vdso]";
                        }
-                } else {
+                }
+                if (name) {
                        pad_len_spaces(m, len);
-                        seq_puts(m, "[vdso]");
+                        seq_puts(m, name);
                }
        }
        seq_putc(m, '\n');
@@ -295,12 +306,16 @@ static int show_smap(struct seq_file *m, void *v)
 static void *m_start(struct seq_file *m, loff_t *pos)
 {
-        struct task_struct *task = m->private;
+        struct proc_maps_private *priv = m->private;
        unsigned long last_addr = m->version;
        struct mm_struct *mm;
-        struct vm_area_struct *vma, *tail_vma;
+        struct vm_area_struct *vma, *tail_vma = NULL;
        loff_t l = *pos;
+        /* Clear the per syscall fields in priv */
+        priv->task = NULL;
+        priv->tail_vma = NULL;
        /*
         * We remember last_addr rather than next_addr to hit with
         * mmap_cache most of the time. We have zero last_addr at
@@ -311,11 +326,15 @@ static void *m_start(struct seq_file *m, loff_t *pos)
        if (last_addr == -1UL)
                return NULL;
-        mm = get_task_mm(task);
+        priv->task = get_pid_task(priv->pid, PIDTYPE_PID);
+        if (!priv->task)
+                return NULL;
+        mm = get_task_mm(priv->task);
        if (!mm)
                return NULL;
-        tail_vma = get_gate_vma(task);
+        priv->tail_vma = tail_vma = get_gate_vma(priv->task);
        down_read(&mm->mmap_sem);
        /* Start with last addr hint */
@@ -350,11 +369,9 @@ out:
        return tail_vma;
 }
-static void m_stop(struct seq_file *m, void *v)
+static void vma_stop(struct proc_maps_private *priv, struct vm_area_struct *vma)
 {
-        struct task_struct *task = m->private;
+        if (vma && vma != priv->tail_vma) {
-        struct vm_area_struct *vma = v;
-        if (vma && vma != get_gate_vma(task)) {
                struct mm_struct *mm = vma->vm_mm;
                up_read(&mm->mmap_sem);
                mmput(mm);
@@ -363,38 +380,103 @@ static void m_stop(struct seq_file *m, void *v)
 static void *m_next(struct seq_file *m, void *v, loff_t *pos)
 {
-        struct task_struct *task = m->private;
+        struct proc_maps_private *priv = m->private;
        struct vm_area_struct *vma = v;
-        struct vm_area_struct *tail_vma = get_gate_vma(task);
+        struct vm_area_struct *tail_vma = priv->tail_vma;
        (*pos)++;
        if (vma && (vma != tail_vma) && vma->vm_next)
                return vma->vm_next;
-        m_stop(m, v);
+        vma_stop(priv, vma);
        return (vma != tail_vma)? tail_vma: NULL;
 }
-struct seq_operations proc_pid_maps_op = {
+static void m_stop(struct seq_file *m, void *v)
+{
+        struct proc_maps_private *priv = m->private;
+        struct vm_area_struct *vma = v;
+        vma_stop(priv, vma);
+        if (priv->task)
+                put_task_struct(priv->task);
+}
+static struct seq_operations proc_pid_maps_op = {
        .start  = m_start,
        .next   = m_next,
        .stop   = m_stop,
        .show   = show_map
 };
-struct seq_operations proc_pid_smaps_op = {
+static struct seq_operations proc_pid_smaps_op = {
        .start  = m_start,
        .next   = m_next,
        .stop   = m_stop,
        .show   = show_smap
 };
+static int do_maps_open(struct inode *inode, struct file *file,
+                        struct seq_operations *ops)
+{
+        struct proc_maps_private *priv;
+        int ret = -ENOMEM;
+        priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+        if (priv) {
+                priv->pid = proc_pid(inode);
+                ret = seq_open(file, ops);
+                if (!ret) {
+                        struct seq_file *m = file->private_data;
+                        m->private = priv;
+                } else {
+                        kfree(priv);
+                }
+        }
+        return ret;
+}
+static int maps_open(struct inode *inode, struct file *file)
+{
+        return do_maps_open(inode, file, &proc_pid_maps_op);
+}
+struct file_operations proc_maps_operations = {
+        .open           = maps_open,
+        .read           = seq_read,
+        .llseek         = seq_lseek,
+        .release        = seq_release_private,
+};
 #ifdef CONFIG_NUMA
 extern int show_numa_map(struct seq_file *m, void *v);
-struct seq_operations proc_pid_numa_maps_op = {
+static struct seq_operations proc_pid_numa_maps_op = {
        .start  = m_start,
        .next   = m_next,
        .stop   = m_stop,
        .show   = show_numa_map
 };
+static int numa_maps_open(struct inode *inode, struct file *file)
+{
+        return do_maps_open(inode, file, &proc_pid_numa_maps_op);
+}
+struct file_operations proc_numa_maps_operations = {
+        .open           = numa_maps_open,
+        .read           = seq_read,
+        .llseek         = seq_lseek,
+        .release        = seq_release_private,
+};
 #endif
+static int smaps_open(struct inode *inode, struct file *file)
+{
+        return do_maps_open(inode, file, &proc_pid_smaps_op);
+}
+struct file_operations proc_smaps_operations = {
+        .open           = smaps_open,
+        .read           = seq_read,
+        .llseek         = seq_lseek,
+        .release        = seq_release_private,
+};
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 8f68827ed10e..af69f28277b6 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -156,9 +156,28 @@ static void *m_next(struct seq_file *m, void *v, loff_t *pos)
 {
        return NULL;
 }
-struct seq_operations proc_pid_maps_op = {
+static struct seq_operations proc_pid_maps_op = {
        .start  = m_start,
        .next   = m_next,
        .stop   = m_stop,
        .show   = show_map
 };
+static int maps_open(struct inode *inode, struct file *file)
+{
+        int ret;
+        ret = seq_open(file, &proc_pid_maps_op);
+        if (!ret) {
+                struct seq_file *m = file->private_data;
+                m->private = NULL;
+        }
+        return ret;
+}
+struct file_operations proc_maps_operations = {
+        .open           = maps_open,
+        .read           = seq_read,
+        .llseek         = seq_lseek,
+        .release        = seq_release,
+};
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 20d4b2237fce..d96050728c43 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -7,7 +7,6 @@
 *
 */
-#include <linux/config.h>
 #include <linux/mm.h>
 #include <linux/proc_fs.h>
 #include <linux/user.h>
diff --git a/fs/qnx4/bitmap.c b/fs/qnx4/bitmap.c
index 46efbf52cbec..8425cf6e9624 100644
--- a/fs/qnx4/bitmap.c
+++ b/fs/qnx4/bitmap.c
@@ -13,7 +13,6 @@
 * 28-06-1998 by Frank Denis : qnx4_free_inode (to be fixed) .
 */
-#include <linux/config.h>
 #include <linux/time.h>
 #include <linux/fs.h>
 #include <linux/qnx4_fs.h>
diff --git a/fs/qnx4/dir.c b/fs/qnx4/dir.c
index 9031948fefd0..0d7103fa0df5 100644
--- a/fs/qnx4/dir.c
+++ b/fs/qnx4/dir.c
@@ -11,7 +11,6 @@
 * 20-06-1998 by Frank Denis : Linux 2.1.99+ & dcache support.
 */
-#include <linux/config.h>
 #include <linux/string.h>
 #include <linux/errno.h>
 #include <linux/fs.h>
diff --git a/fs/qnx4/fsync.c b/fs/qnx4/fsync.c
index df5bc75d5414..aa3b19544bee 100644
--- a/fs/qnx4/fsync.c
+++ b/fs/qnx4/fsync.c
@@ -10,7 +10,6 @@
 * 24-03-1998 by Richard Frowijn : first release.
 */
-#include <linux/config.h>
 #include <linux/errno.h>
 #include <linux/time.h>
 #include <linux/stat.h>
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index 2ecd46f85e9f..5a903491e697 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -12,7 +12,6 @@
 * 30-06-1998 by Frank Denis : first step to write inodes.
 */
-#include <linux/config.h>
 #include <linux/module.h>
 #include <linux/types.h>
 #include <linux/string.h>
@@ -128,7 +127,7 @@ static struct inode *qnx4_alloc_inode(struct super_block *sb);
 static void qnx4_destroy_inode(struct inode *inode);
 static void qnx4_read_inode(struct inode *);
 static int qnx4_remount(struct super_block *sb, int *flags, char *data);
-static int qnx4_statfs(struct super_block *, struct kstatfs *);
+static int qnx4_statfs(struct dentry *, struct kstatfs *);
 static struct super_operations qnx4_sops =
 {
@@ -282,8 +281,10 @@ unsigned long qnx4_block_map( struct inode *inode, long iblock )
        return block;
 }
-static int qnx4_statfs(struct super_block *sb, struct kstatfs *buf)
+static int qnx4_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
+        struct super_block *sb = dentry->d_sb;
        lock_kernel();
        buf->f_type    = sb->s_magic;
@@ -448,7 +449,7 @@ static sector_t qnx4_bmap(struct address_space *mapping, sector_t block)
 {
        return generic_block_bmap(mapping,block,qnx4_get_block);
 }
-static struct address_space_operations qnx4_aops = {
+static const struct address_space_operations qnx4_aops = {
        .readpage       = qnx4_readpage,
        .writepage      = qnx4_writepage,
        .sync_page      = block_sync_page,
@@ -561,10 +562,11 @@ static void destroy_inodecache(void)
                       "qnx4_inode_cache: not all structures were freed\n");
 }
-static struct super_block *qnx4_get_sb(struct file_system_type *fs_type,
+static int qnx4_get_sb(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data)
+        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-        return get_sb_bdev(fs_type, flags, dev_name, data, qnx4_fill_super);
+        return get_sb_bdev(fs_type, flags, dev_name, data, qnx4_fill_super,
+                           mnt);
 }
 static struct file_system_type qnx4_fs_type = {
diff --git a/fs/qnx4/namei.c b/fs/qnx4/namei.c
index 4af4951d7f54..c3d83f67154a 100644
--- a/fs/qnx4/namei.c
+++ b/fs/qnx4/namei.c
@@ -12,7 +12,6 @@
 * 04-07-1998 by Frank Denis : first step for rmdir/unlink.
 */
-#include <linux/config.h>
 #include <linux/time.h>
 #include <linux/fs.h>
 #include <linux/qnx4_fs.h>
diff --git a/fs/qnx4/truncate.c b/fs/qnx4/truncate.c
index 86563ec01b39..6437c1c3d1dd 100644
--- a/fs/qnx4/truncate.c
+++ b/fs/qnx4/truncate.c
@@ -10,7 +10,6 @@
 * 30-06-1998 by Frank DENIS : ugly filler.
 */
-#include <linux/config.h>
 #include <linux/types.h>
 #include <linux/errno.h>
 #include <linux/fs.h>
diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c
index 00a933eb820c..86f14cacf641 100644
--- a/fs/ramfs/file-mmu.c
+++ b/fs/ramfs/file-mmu.c
@@ -26,7 +26,7 @@
 #include <linux/fs.h>
-struct address_space_operations ramfs_aops = {
+const struct address_space_operations ramfs_aops = {
        .readpage       = simple_readpage,
        .prepare_write  = simple_prepare_write,
        .commit_write   = simple_commit_write
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index f443a84b98a5..99fffc9e1bfd 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -27,7 +27,7 @@
 static int ramfs_nommu_setattr(struct dentry *, struct iattr *);
-struct address_space_operations ramfs_aops = {
+const struct address_space_operations ramfs_aops = {
        .readpage               = simple_readpage,
        .prepare_write          = simple_prepare_write,
        .commit_write           = simple_commit_write
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index 14bd2246fb6d..b9677335cc8d 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -185,16 +185,17 @@ static int ramfs_fill_super(struct super_block * sb, void * data, int silent)
        return 0;
 }
-struct super_block *ramfs_get_sb(struct file_system_type *fs_type,
+int ramfs_get_sb(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data)
+        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-        return get_sb_nodev(fs_type, flags, data, ramfs_fill_super);
+        return get_sb_nodev(fs_type, flags, data, ramfs_fill_super, mnt);
 }
-static struct super_block *rootfs_get_sb(struct file_system_type *fs_type,
+static int rootfs_get_sb(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data)
+        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-        return get_sb_nodev(fs_type, flags|MS_NOUSER, data, ramfs_fill_super);
+        return get_sb_nodev(fs_type, flags|MS_NOUSER, data, ramfs_fill_super,
+                            mnt);
 }
 static struct file_system_type ramfs_fs_type = {
diff --git a/fs/ramfs/internal.h b/fs/ramfs/internal.h
index 313237631b49..c2bb58e74653 100644
--- a/fs/ramfs/internal.h
+++ b/fs/ramfs/internal.h
@@ -10,6 +10,6 @@
 */
-extern struct address_space_operations ramfs_aops;
+extern const struct address_space_operations ramfs_aops;
 extern const struct file_operations ramfs_file_operations;
 extern struct inode_operations ramfs_file_inode_operations;
diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c
index 909f71e9a30f..4a7dbdee1b6d 100644
--- a/fs/reiserfs/bitmap.c
+++ b/fs/reiserfs/bitmap.c
@@ -3,7 +3,6 @@
 */
 /* Reiserfs block (de)allocator, bitmap-based. */
-#include <linux/config.h>
 #include <linux/time.h>
 #include <linux/reiserfs_fs.h>
 #include <linux/errno.h>
diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c
index 973c819f8033..9aabcc0ccd2d 100644
--- a/fs/reiserfs/dir.c
+++ b/fs/reiserfs/dir.c
@@ -2,7 +2,6 @@
 * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
 */
-#include <linux/config.h>
 #include <linux/string.h>
 #include <linux/errno.h>
 #include <linux/fs.h>
diff --git a/fs/reiserfs/do_balan.c b/fs/reiserfs/do_balan.c
index b2264ba3cc56..fba304e64de8 100644
--- a/fs/reiserfs/do_balan.c
+++ b/fs/reiserfs/do_balan.c
@@ -15,7 +15,6 @@
 **
 **/
-#include <linux/config.h>
 #include <asm/uaccess.h>
 #include <linux/time.h>
 #include <linux/reiserfs_fs.h>
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index cf6e1cf40351..752cea12e30f 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -1560,12 +1560,6 @@ static ssize_t reiserfs_file_write(struct file *file,	/* the file we are going t
        return res;
 }
-static ssize_t reiserfs_aio_write(struct kiocb *iocb, const char __user * buf,
-                                  size_t count, loff_t pos)
-{
-        return generic_file_aio_write(iocb, buf, count, pos);
-}
 const struct file_operations reiserfs_file_operations = {
        .read = generic_file_read,
        .write = reiserfs_file_write,
@@ -1575,7 +1569,7 @@ const struct file_operations reiserfs_file_operations = {
        .fsync = reiserfs_sync_file,
        .sendfile = generic_file_sendfile,
        .aio_read = generic_file_aio_read,
-        .aio_write = reiserfs_aio_write,
+        .aio_write = generic_file_aio_write,
        .splice_read = generic_file_splice_read,
        .splice_write = generic_file_splice_write,
 };
diff --git a/fs/reiserfs/fix_node.c b/fs/reiserfs/fix_node.c
index 5600d3d60cf7..6d0e554daa9d 100644
--- a/fs/reiserfs/fix_node.c
+++ b/fs/reiserfs/fix_node.c
@@ -34,7 +34,6 @@
 ** 
 **/
-#include <linux/config.h>
 #include <linux/time.h>
 #include <linux/string.h>
 #include <linux/reiserfs_fs.h>
diff --git a/fs/reiserfs/ibalance.c b/fs/reiserfs/ibalance.c
index 6c5a726fd34b..de391a82b999 100644
--- a/fs/reiserfs/ibalance.c
+++ b/fs/reiserfs/ibalance.c
@@ -2,7 +2,6 @@
 * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
 */
-#include <linux/config.h>
 #include <asm/uaccess.h>
 #include <linux/string.h>
 #include <linux/time.h>
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 9857e50f85e7..12dfdcfbee3d 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -2,7 +2,6 @@
 * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
 */
-#include <linux/config.h>
 #include <linux/time.h>
 #include <linux/fs.h>
 #include <linux/reiserfs_fs.h>
@@ -2933,6 +2932,11 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
                        }
                        if (error)
                                goto out;
+                        /*
+                         * file size is changed, ctime and mtime are
+                         * to be updated
+                         */
+                        attr->ia_valid |= (ATTR_MTIME | ATTR_CTIME);
                }
        }
@@ -2996,7 +3000,7 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
        return error;
 }
-struct address_space_operations reiserfs_address_space_operations = {
+const struct address_space_operations reiserfs_address_space_operations = {
        .writepage = reiserfs_writepage,
        .readpage = reiserfs_readpage,
        .readpages = reiserfs_readpages,
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 1b73529b8099..9b3672d69367 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -34,7 +34,6 @@
 **                      from within kupdate, it will ignore the immediate flag
 */
-#include <linux/config.h>
 #include <asm/uaccess.h>
 #include <asm/system.h>
@@ -834,8 +833,7 @@ static int write_ordered_buffers(spinlock_t * lock,
                get_bh(bh);
                if (test_set_buffer_locked(bh)) {
                        if (!buffer_dirty(bh)) {
-                                list_del_init(&jh->list);
+                                list_move(&jh->list, &tmp);
-                                list_add(&jh->list, &tmp);
                                goto loop_next;
                        }
                        spin_unlock(lock);
@@ -855,8 +853,7 @@ static int write_ordered_buffers(spinlock_t * lock,
                        ret = -EIO;
                }
                if (buffer_dirty(bh)) {
-                        list_del_init(&jh->list);
+                        list_move(&jh->list, &tmp);
-                        list_add(&jh->list, &tmp);
                        add_to_chunk(&chunk, bh, lock, write_ordered_chunk);
                } else {
                        reiserfs_free_jh(bh);
diff --git a/fs/reiserfs/lbalance.c b/fs/reiserfs/lbalance.c
index 2533c1f64aba..281f8061ac58 100644
--- a/fs/reiserfs/lbalance.c
+++ b/fs/reiserfs/lbalance.c
@@ -2,7 +2,6 @@
 * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
 */
-#include <linux/config.h>
 #include <asm/uaccess.h>
 #include <linux/string.h>
 #include <linux/time.h>
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index 284f7852de8b..c61710e49c62 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -11,7 +11,6 @@
 * NO WARRANTY
 */
-#include <linux/config.h>
 #include <linux/time.h>
 #include <linux/bitops.h>
 #include <linux/reiserfs_fs.h>
diff --git a/fs/reiserfs/objectid.c b/fs/reiserfs/objectid.c
index f62590aa9c95..65feba4deb69 100644
--- a/fs/reiserfs/objectid.c
+++ b/fs/reiserfs/objectid.c
@@ -2,7 +2,6 @@
 * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
 */
-#include <linux/config.h>
 #include <linux/string.h>
 #include <linux/random.h>
 #include <linux/time.h>
diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c
index 27bd3a1df2ad..bc808a91eeaa 100644
--- a/fs/reiserfs/prints.c
+++ b/fs/reiserfs/prints.c
@@ -2,7 +2,6 @@
 * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
 */
-#include <linux/config.h>
 #include <linux/time.h>
 #include <linux/fs.h>
 #include <linux/reiserfs_fs.h>
diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c
index 731688e1cfe3..5d8a8cfebc70 100644
--- a/fs/reiserfs/procfs.c
+++ b/fs/reiserfs/procfs.c
@@ -10,7 +10,6 @@
 /* $Id: procfs.c,v 1.1.8.2 2001/07/15 17:08:42 god Exp $ */
-#include <linux/config.h>
 #include <linux/module.h>
 #include <linux/time.h>
 #include <linux/seq_file.h>
diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c
index d2b25e1ba6e9..8b9b13127136 100644
--- a/fs/reiserfs/stree.c
+++ b/fs/reiserfs/stree.c
@@ -49,7 +49,6 @@
 * reiserfs_insert_item
 */
-#include <linux/config.h>
 #include <linux/time.h>
 #include <linux/string.h>
 #include <linux/pagemap.h>
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index cae2abbc0c71..28eb3c886034 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -11,7 +11,6 @@
 * NO WARRANTY
 */
-#include <linux/config.h>
 #include <linux/module.h>
 #include <linux/vmalloc.h>
 #include <linux/time.h>
@@ -60,7 +59,7 @@ static int is_any_reiserfs_magic_string(struct reiserfs_super_block *rs)
 }
 static int reiserfs_remount(struct super_block *s, int *flags, char *data);
-static int reiserfs_statfs(struct super_block *s, struct kstatfs *buf);
+static int reiserfs_statfs(struct dentry *dentry, struct kstatfs *buf);
 static int reiserfs_sync_fs(struct super_block *s, int wait)
 {
@@ -1938,15 +1937,15 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
        return errval;
 }
-static int reiserfs_statfs(struct super_block *s, struct kstatfs *buf)
+static int reiserfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
-        struct reiserfs_super_block *rs = SB_DISK_SUPER_BLOCK(s);
+        struct reiserfs_super_block *rs = SB_DISK_SUPER_BLOCK(dentry->d_sb);
        buf->f_namelen = (REISERFS_MAX_NAME(s->s_blocksize));
        buf->f_bfree = sb_free_blocks(rs);
        buf->f_bavail = buf->f_bfree;
        buf->f_blocks = sb_block_count(rs) - sb_bmap_nr(rs) - 1;
-        buf->f_bsize = s->s_blocksize;
+        buf->f_bsize = dentry->d_sb->s_blocksize;
        /* changed to accommodate gcc folks. */
        buf->f_type = REISERFS_SUPER_MAGIC;
        return 0;
@@ -2249,11 +2248,12 @@ static ssize_t reiserfs_quota_write(struct super_block *sb, int type,
 #endif
-static struct super_block *get_super_block(struct file_system_type *fs_type,
+static int get_super_block(struct file_system_type *fs_type,
-                                           int flags, const char *dev_name,
+                           int flags, const char *dev_name,
-                                           void *data)
+                           void *data, struct vfsmount *mnt)
 {
-        return get_sb_bdev(fs_type, flags, dev_name, data, reiserfs_fill_super);
+        return get_sb_bdev(fs_type, flags, dev_name, data, reiserfs_fill_super,
+                           mnt);
 }
 static int __init init_reiserfs_fs(void)
diff --git a/fs/reiserfs/tail_conversion.c b/fs/reiserfs/tail_conversion.c
index 196e971c03c9..36f108fc1cf5 100644
--- a/fs/reiserfs/tail_conversion.c
+++ b/fs/reiserfs/tail_conversion.c
@@ -2,7 +2,6 @@
 * Copyright 1999 Hans Reiser, see reiserfs/README for licensing and copyright details
 */
-#include <linux/config.h>
 #include <linux/time.h>
 #include <linux/pagemap.h>
 #include <linux/buffer_head.h>
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index ffb79c48c5bf..39fedaa88a0c 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -452,8 +452,7 @@ static struct page *reiserfs_get_page(struct inode *dir, unsigned long n)
        /* We can deadlock if we try to free dentries,
           and an unlink/rmdir has just occured - GFP_NOFS avoids this */
        mapping_set_gfp_mask(mapping, GFP_NOFS);
-        page = read_cache_page(mapping, n,
+        page = read_mapping_page(mapping, n, NULL);
-                               (filler_t *) mapping->a_ops->readpage, NULL);
        if (!IS_ERR(page)) {
                wait_on_page_locked(page);
                kmap(page);
diff --git a/fs/romfs/inode.c b/fs/romfs/inode.c
index 9b9eda7b335c..22eed61ebf69 100644
--- a/fs/romfs/inode.c
+++ b/fs/romfs/inode.c
@@ -179,12 +179,12 @@ outnobh:
 /* That's simple too. */
 static int
-romfs_statfs(struct super_block *sb, struct kstatfs *buf)
+romfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
        buf->f_type = ROMFS_MAGIC;
        buf->f_bsize = ROMBSIZE;
        buf->f_bfree = buf->f_bavail = buf->f_ffree;
-        buf->f_blocks = (romfs_maxsize(sb)+ROMBSIZE-1)>>ROMBSBITS;
+        buf->f_blocks = (romfs_maxsize(dentry->d_sb)+ROMBSIZE-1)>>ROMBSBITS;
        buf->f_namelen = ROMFS_MAXFN;
        return 0;
 }
@@ -459,7 +459,7 @@ err_out:
 /* Mapping from our types to the kernel */
-static struct address_space_operations romfs_aops = {
+static const struct address_space_operations romfs_aops = {
        .readpage = romfs_readpage
 };
@@ -607,10 +607,11 @@ static struct super_operations romfs_ops = {
        .remount_fs     = romfs_remount,
 };
-static struct super_block *romfs_get_sb(struct file_system_type *fs_type,
+static int romfs_get_sb(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data)
+        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-        return get_sb_bdev(fs_type, flags, dev_name, data, romfs_fill_super);
+        return get_sb_bdev(fs_type, flags, dev_name, data, romfs_fill_super,
+                           mnt);
 }
 static struct file_system_type romfs_fs_type = {
diff --git a/fs/select.c b/fs/select.c
index a8109baa5e46..33b72ba0f86f 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -546,37 +546,38 @@ struct poll_list {
 #define POLLFD_PER_PAGE  ((PAGE_SIZE-sizeof(struct poll_list)) / sizeof(struct pollfd))
-static void do_pollfd(unsigned int num, struct pollfd * fdpage,
+/*
-        poll_table ** pwait, int *count)
+ * Fish for pollable events on the pollfd->fd file descriptor. We're only
+ * interested in events matching the pollfd->events mask, and the result
+ * matching that mask is both recorded in pollfd->revents and returned. The
+ * pwait poll_table will be used by the fd-provided poll handler for waiting,
+ * if non-NULL.
+ */
+static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait)
 {
-        int i;
+        unsigned int mask;
+        int fd;
-        for (i = 0; i < num; i++) {
-                int fd;
+        mask = 0;
-                unsigned int mask;
+        fd = pollfd->fd;
-                struct pollfd *fdp;
+        if (fd >= 0) {
+                int fput_needed;
-                mask = 0;
+                struct file * file;
-                fdp = fdpage+i;
-                fd = fdp->fd;
+                file = fget_light(fd, &fput_needed);
-                if (fd >= 0) {
+                mask = POLLNVAL;
-                        int fput_needed;
+                if (file != NULL) {
-                        struct file * file = fget_light(fd, &fput_needed);
+                        mask = DEFAULT_POLLMASK;
-                        mask = POLLNVAL;
+                        if (file->f_op && file->f_op->poll)
-                        if (file != NULL) {
+                                mask = file->f_op->poll(file, pwait);
-                                mask = DEFAULT_POLLMASK;
+                        /* Mask out unneeded events. */
-                                if (file->f_op && file->f_op->poll)
+                        mask &= pollfd->events | POLLERR | POLLHUP;
-                                        mask = file->f_op->poll(file, *pwait);
+                        fput_light(file, fput_needed);
-                                mask &= fdp->events | POLLERR | POLLHUP;
-                                fput_light(file, fput_needed);
-                        }
-                        if (mask) {
-                                *pwait = NULL;
-                                (*count)++;
-                        }
                }
-                fdp->revents = mask;
        }
+        pollfd->revents = mask;
+        return mask;
 }
 static int do_poll(unsigned int nfds,  struct poll_list *list,
@@ -594,11 +595,29 @@ static int do_poll(unsigned int nfds,  struct poll_list *list,
                long __timeout;
                set_current_state(TASK_INTERRUPTIBLE);
-                walk = list;
+                for (walk = list; walk != NULL; walk = walk->next) {
-                while(walk != NULL) {
+                        struct pollfd * pfd, * pfd_end;
-                        do_pollfd( walk->len, walk->entries, &pt, &count);
-                        walk = walk->next;
+                        pfd = walk->entries;
+                        pfd_end = pfd + walk->len;
+                        for (; pfd != pfd_end; pfd++) {
+                                /*
+                                 * Fish for events. If we found one, record it
+                                 * and kill the poll_table, so we don't
+                                 * needlessly register any other waiters after
+                                 * this. They'll get immediately deregistered
+                                 * when we break out and return.
+                                 */
+                                if (do_pollfd(pfd, pt)) {
+                                        count++;
+                                        pt = NULL;
+                                }
+                        }
                }
+                /*
+                 * All waiters have already been registered, so don't provide
+                 * a poll_table to them on the next loop iteration.
+                 */
                pt = NULL;
                if (count || !*timeout || signal_pending(current))
                        break;
@@ -727,9 +746,9 @@ out_fds:
 asmlinkage long sys_poll(struct pollfd __user *ufds, unsigned int nfds,
                        long timeout_msecs)
 {
-        s64 timeout_jiffies = 0;
+        s64 timeout_jiffies;
-        if (timeout_msecs) {
+        if (timeout_msecs > 0) {
 #if HZ > 1000
                /* We can only overflow if HZ > 1000 */
                if (timeout_msecs / 1000 > (s64)0x7fffffffffffffffULL / (s64)HZ)
@@ -737,6 +756,9 @@ asmlinkage long sys_poll(struct pollfd __user *ufds, unsigned int nfds,
                else
 #endif
                        timeout_jiffies = msecs_to_jiffies(timeout_msecs);
+        } else {
+                /* Infinite (< 0) or no (0) timeout */
+                timeout_jiffies = timeout_msecs;
        }
        return do_sys_poll(ufds, nfds, &timeout_jiffies);
diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c
index ed9a24d19d7d..dae67048baba 100644
--- a/fs/smbfs/file.c
+++ b/fs/smbfs/file.c
@@ -306,7 +306,7 @@ static int smb_commit_write(struct file *file, struct page *page,
        return status;
 }
-struct address_space_operations smb_file_aops = {
+const struct address_space_operations smb_file_aops = {
        .readpage = smb_readpage,
        .writepage = smb_writepage,
        .prepare_write = smb_prepare_write,
diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c
index fdeabc0a34f7..a1ed657c3c84 100644
--- a/fs/smbfs/inode.c
+++ b/fs/smbfs/inode.c
@@ -7,7 +7,6 @@
 *  Please add a note about your changes to smbfs in the ChangeLog file.
 */
-#include <linux/config.h>
 #include <linux/module.h>
 #include <linux/time.h>
 #include <linux/kernel.h>
@@ -48,7 +47,7 @@
 static void smb_delete_inode(struct inode *);
 static void smb_put_super(struct super_block *);
-static int  smb_statfs(struct super_block *, struct kstatfs *);
+static int  smb_statfs(struct dentry *, struct kstatfs *);
 static int  smb_show_options(struct seq_file *, struct vfsmount *);
 static kmem_cache_t *smb_inode_cachep;
@@ -641,13 +640,13 @@ out_no_server:
 }
 static int
-smb_statfs(struct super_block *sb, struct kstatfs *buf)
+smb_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
        int result;
        
        lock_kernel();
-        result = smb_proc_dskattr(sb, buf);
+        result = smb_proc_dskattr(dentry, buf);
        unlock_kernel();
@@ -782,10 +781,10 @@ out:
        return error;
 }
-static struct super_block *smb_get_sb(struct file_system_type *fs_type,
+static int smb_get_sb(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data)
+        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-        return get_sb_nodev(fs_type, flags, data, smb_fill_super);
+        return get_sb_nodev(fs_type, flags, data, smb_fill_super, mnt);
 }
 static struct file_system_type smb_fs_type = {
diff --git a/fs/smbfs/proc.c b/fs/smbfs/proc.c
index b1b878b81730..c3495059889d 100644
--- a/fs/smbfs/proc.c
+++ b/fs/smbfs/proc.c
@@ -3226,9 +3226,9 @@ smb_proc_settime(struct dentry *dentry, struct smb_fattr *fattr)
 }
 int
-smb_proc_dskattr(struct super_block *sb, struct kstatfs *attr)
+smb_proc_dskattr(struct dentry *dentry, struct kstatfs *attr)
 {
-        struct smb_sb_info *server = SMB_SB(sb);
+        struct smb_sb_info *server = SMB_SB(dentry->d_sb);
        int result;
        char *p;
        long unit;
diff --git a/fs/smbfs/proto.h b/fs/smbfs/proto.h
index 47664597e6b1..34fb462b2379 100644
--- a/fs/smbfs/proto.h
+++ b/fs/smbfs/proto.h
@@ -29,7 +29,7 @@ extern int smb_proc_getattr(struct dentry *dir, struct smb_fattr *fattr);
 extern int smb_proc_setattr(struct dentry *dir, struct smb_fattr *fattr);
 extern int smb_proc_setattr_unix(struct dentry *d, struct iattr *attr, unsigned int major, unsigned int minor);
 extern int smb_proc_settime(struct dentry *dentry, struct smb_fattr *fattr);
-extern int smb_proc_dskattr(struct super_block *sb, struct kstatfs *attr);
+extern int smb_proc_dskattr(struct dentry *dentry, struct kstatfs *attr);
 extern int smb_proc_read_link(struct smb_sb_info *server, struct dentry *d, char *buffer, int len);
 extern int smb_proc_symlink(struct smb_sb_info *server, struct dentry *d, const char *oldpath);
 extern int smb_proc_link(struct smb_sb_info *server, struct dentry *dentry, struct dentry *new_dentry);
@@ -63,7 +63,7 @@ extern int smb_revalidate_inode(struct dentry *dentry);
 extern int smb_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat);
 extern int smb_notify_change(struct dentry *dentry, struct iattr *attr);
 /* file.c */
-extern struct address_space_operations smb_file_aops;
+extern const struct address_space_operations smb_file_aops;
 extern const struct file_operations smb_file_operations;
 extern struct inode_operations smb_file_inode_operations;
 /* ioctl.c */
diff --git a/fs/smbfs/request.c b/fs/smbfs/request.c
index c71dd2760d32..c8e96195b96e 100644
--- a/fs/smbfs/request.c
+++ b/fs/smbfs/request.c
@@ -400,8 +400,7 @@ static int smb_request_send_req(struct smb_request *req)
        if (!(req->rq_flags & SMB_REQ_TRANSMITTED))
                goto out;
-        list_del_init(&req->rq_queue);
+        list_move_tail(&req->rq_queue, &server->recvq);
-        list_add_tail(&req->rq_queue, &server->recvq);
        result = 1;
 out:
        return result;
@@ -435,8 +434,7 @@ int smb_request_send_server(struct smb_sb_info *server)
        result = smb_request_send_req(req);
        if (result < 0) {
                server->conn_error = result;
-                list_del_init(&req->rq_queue);
+                list_move(&req->rq_queue, &server->xmitq);
-                list_add(&req->rq_queue, &server->xmitq);
                result = -EIO;
                goto out;
        }
diff --git a/fs/smbfs/smbiod.c b/fs/smbfs/smbiod.c
index 481a97a423fa..e67540441288 100644
--- a/fs/smbfs/smbiod.c
+++ b/fs/smbfs/smbiod.c
@@ -5,7 +5,6 @@
 *  Copyright (C) 2001, Urban Widmark
 */
-#include <linux/config.h>
 #include <linux/sched.h>
 #include <linux/kernel.h>
@@ -20,6 +19,7 @@
 #include <linux/smp_lock.h>
 #include <linux/module.h>
 #include <linux/net.h>
+#include <linux/kthread.h>
 #include <net/ip.h>
 #include <linux/smb_fs.h>
@@ -40,7 +40,7 @@ enum smbiod_state {
 };
 static enum smbiod_state smbiod_state = SMBIOD_DEAD;
-static pid_t smbiod_pid;
+static struct task_struct *smbiod_thread;
 static DECLARE_WAIT_QUEUE_HEAD(smbiod_wait);
 static LIST_HEAD(smb_servers);
 static DEFINE_SPINLOCK(servers_lock);
@@ -67,20 +67,29 @@ void smbiod_wake_up(void)
 */
 static int smbiod_start(void)
 {
-        pid_t pid;
+        struct task_struct *tsk;
+        int err = 0;
        if (smbiod_state != SMBIOD_DEAD)
                return 0;
        smbiod_state = SMBIOD_STARTING;
        __module_get(THIS_MODULE);
        spin_unlock(&servers_lock);
-        pid = kernel_thread(smbiod, NULL, 0);
+        tsk = kthread_run(smbiod, NULL, "smbiod");
-        if (pid < 0)
+        if (IS_ERR(tsk)) {
+                err = PTR_ERR(tsk);
                module_put(THIS_MODULE);
+        }
        spin_lock(&servers_lock);
-        smbiod_state = pid < 0 ? SMBIOD_DEAD : SMBIOD_RUNNING;
+        if (err < 0) {
-        smbiod_pid = pid;
+                smbiod_state = SMBIOD_DEAD;
-        return pid;
+                smbiod_thread = NULL;
+        } else {
+                smbiod_state = SMBIOD_RUNNING;
+                smbiod_thread = tsk;
+        }
+        return err;
 }
 /*
@@ -183,8 +192,7 @@ int smbiod_retry(struct smb_sb_info *server)
                if (req->rq_flags & SMB_REQ_RETRY) {
                        /* must move the request to the xmitq */
                        VERBOSE("retrying request %p on recvq\n", req);
-                        list_del(&req->rq_queue);
+                        list_move(&req->rq_queue, &server->xmitq);
-                        list_add(&req->rq_queue, &server->xmitq);
                        continue;
                }
 #endif
@@ -290,8 +298,6 @@ out:
 */
 static int smbiod(void *unused)
 {
-        daemonize("smbiod");
        allow_signal(SIGKILL);
        VERBOSE("SMB Kernel thread starting (%d) ...\n", current->pid);
diff --git a/fs/splice.c b/fs/splice.c
index a285fd746dc0..05fd2787be98 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -55,31 +55,43 @@ static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe,
                                     struct pipe_buffer *buf)
 {
        struct page *page = buf->page;
-        struct address_space *mapping = page_mapping(page);
+        struct address_space *mapping;
        lock_page(page);
-        WARN_ON(!PageUptodate(page));
+        mapping = page_mapping(page);
+        if (mapping) {
+                WARN_ON(!PageUptodate(page));
-        /*
+                /*
-         * At least for ext2 with nobh option, we need to wait on writeback
+                 * At least for ext2 with nobh option, we need to wait on
-         * completing on this page, since we'll remove it from the pagecache.
+                 * writeback completing on this page, since we'll remove it
-         * Otherwise truncate wont wait on the page, allowing the disk
+                 * from the pagecache.  Otherwise truncate wont wait on the
-         * blocks to be reused by someone else before we actually wrote our
+                 * page, allowing the disk blocks to be reused by someone else
-         * data to them. fs corruption ensues.
+                 * before we actually wrote our data to them. fs corruption
-         */
+                 * ensues.
-        wait_on_page_writeback(page);
+                 */
+                wait_on_page_writeback(page);
-        if (PagePrivate(page))
+                if (PagePrivate(page))
-                try_to_release_page(page, mapping_gfp_mask(mapping));
+                        try_to_release_page(page, mapping_gfp_mask(mapping));
-        if (!remove_mapping(mapping, page)) {
+                /*
-                unlock_page(page);
+                 * If we succeeded in removing the mapping, set LRU flag
-                return 1;
+                 * and return good.
+                 */
+                if (remove_mapping(mapping, page)) {
+                        buf->flags |= PIPE_BUF_FLAG_LRU;
+                        return 0;
+                }
        }
-        buf->flags |= PIPE_BUF_FLAG_LRU;
+        /*
-        return 0;
+         * Raced with truncate or failed to remove page from current
+         * address space, unlock and return failure.
+         */
+        unlock_page(page);
+        return 1;
 }
 static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe,
diff --git a/fs/stat.c b/fs/stat.c
index 0f282face322..3a44dcf97da2 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -4,7 +4,6 @@
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */
-#include <linux/config.h>
 #include <linux/module.h>
 #include <linux/mm.h>
 #include <linux/errno.h>
diff --git a/fs/super.c b/fs/super.c
index a66f66bb8049..9b780c42d845 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -20,7 +20,6 @@
 *  Heavily rewritten for 'one fs - one tree' dcache architecture. AV, Mar 2000
 */
-#include <linux/config.h>
 #include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/init.h>
@@ -231,7 +230,7 @@ void generic_shutdown_super(struct super_block *sb)
        if (root) {
                sb->s_root = NULL;
                shrink_dcache_parent(root);
-                shrink_dcache_anon(&sb->s_anon);
+                shrink_dcache_sb(sb);
                dput(root);
                fsync_super(sb);
                lock_super(sb);
@@ -486,7 +485,7 @@ asmlinkage long sys_ustat(unsigned dev, struct ustat __user * ubuf)
        s = user_get_super(new_decode_dev(dev));
        if (s == NULL)
                goto out;
-        err = vfs_statfs(s, &sbuf);
+        err = vfs_statfs(s->s_root, &sbuf);
        drop_super(s);
        if (err)
                goto out;
@@ -676,9 +675,10 @@ static void bdev_uevent(struct block_device *bdev, enum kobject_action action)
        }
 }
-struct super_block *get_sb_bdev(struct file_system_type *fs_type,
+int get_sb_bdev(struct file_system_type *fs_type,
        int flags, const char *dev_name, void *data,
-        int (*fill_super)(struct super_block *, void *, int))
+        int (*fill_super)(struct super_block *, void *, int),
+        struct vfsmount *mnt)
 {
        struct block_device *bdev;
        struct super_block *s;
@@ -686,7 +686,7 @@ struct super_block *get_sb_bdev(struct file_system_type *fs_type,
        bdev = open_bdev_excl(dev_name, flags, fs_type);
        if (IS_ERR(bdev))
-                return (struct super_block *)bdev;
+                return PTR_ERR(bdev);
        /*
         * once the super is inserted into the list by sget, s_umount
@@ -697,15 +697,17 @@ struct super_block *get_sb_bdev(struct file_system_type *fs_type,
        s = sget(fs_type, test_bdev_super, set_bdev_super, bdev);
        mutex_unlock(&bdev->bd_mount_mutex);
        if (IS_ERR(s))
-                goto out;
+                goto error_s;
        if (s->s_root) {
                if ((flags ^ s->s_flags) & MS_RDONLY) {
                        up_write(&s->s_umount);
                        deactivate_super(s);
-                        s = ERR_PTR(-EBUSY);
+                        error = -EBUSY;
+                        goto error_bdev;
                }
-                goto out;
+                close_bdev_excl(bdev);
        } else {
                char b[BDEVNAME_SIZE];
@@ -716,18 +718,21 @@ struct super_block *get_sb_bdev(struct file_system_type *fs_type,
                if (error) {
                        up_write(&s->s_umount);
                        deactivate_super(s);
-                        s = ERR_PTR(error);
+                        goto error;
-                } else {
-                        s->s_flags |= MS_ACTIVE;
-                        bdev_uevent(bdev, KOBJ_MOUNT);
                }
+                s->s_flags |= MS_ACTIVE;
+                bdev_uevent(bdev, KOBJ_MOUNT);
        }
-        return s;
+        return simple_set_mnt(mnt, s);
-out:
+error_s:
+        error = PTR_ERR(s);
+error_bdev:
        close_bdev_excl(bdev);
-        return s;
+error:
+        return error;
 }
 EXPORT_SYMBOL(get_sb_bdev);
@@ -744,15 +749,16 @@ void kill_block_super(struct super_block *sb)
 EXPORT_SYMBOL(kill_block_super);
-struct super_block *get_sb_nodev(struct file_system_type *fs_type,
+int get_sb_nodev(struct file_system_type *fs_type,
        int flags, void *data,
-        int (*fill_super)(struct super_block *, void *, int))
+        int (*fill_super)(struct super_block *, void *, int),
+        struct vfsmount *mnt)
 {
        int error;
        struct super_block *s = sget(fs_type, NULL, set_anon_super, NULL);
        if (IS_ERR(s))
-                return s;
+                return PTR_ERR(s);
        s->s_flags = flags;
@@ -760,10 +766,10 @@ struct super_block *get_sb_nodev(struct file_system_type *fs_type,
        if (error) {
                up_write(&s->s_umount);
                deactivate_super(s);
-                return ERR_PTR(error);
+                return error;
        }
        s->s_flags |= MS_ACTIVE;
-        return s;
+        return simple_set_mnt(mnt, s);
 }
 EXPORT_SYMBOL(get_sb_nodev);
@@ -773,94 +779,100 @@ static int compare_single(struct super_block *s, void *p)
        return 1;
 }
-struct super_block *get_sb_single(struct file_system_type *fs_type,
+int get_sb_single(struct file_system_type *fs_type,
        int flags, void *data,
-        int (*fill_super)(struct super_block *, void *, int))
+        int (*fill_super)(struct super_block *, void *, int),
+        struct vfsmount *mnt)
 {
        struct super_block *s;
        int error;
        s = sget(fs_type, compare_single, set_anon_super, NULL);
        if (IS_ERR(s))
-                return s;
+                return PTR_ERR(s);
        if (!s->s_root) {
                s->s_flags = flags;
                error = fill_super(s, data, flags & MS_SILENT ? 1 : 0);
                if (error) {
                        up_write(&s->s_umount);
                        deactivate_super(s);
-                        return ERR_PTR(error);
+                        return error;
                }
                s->s_flags |= MS_ACTIVE;
        }
        do_remount_sb(s, flags, data, 0);
-        return s;
+        return simple_set_mnt(mnt, s);
 }
 EXPORT_SYMBOL(get_sb_single);
 struct vfsmount *
-do_kern_mount(const char *fstype, int flags, const char *name, void *data)
+vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data)
 {
-        struct file_system_type *type = get_fs_type(fstype);
-        struct super_block *sb = ERR_PTR(-ENOMEM);
        struct vfsmount *mnt;
-        int error;
        char *secdata = NULL;
+        int error;
        if (!type)
                return ERR_PTR(-ENODEV);
+        error = -ENOMEM;
        mnt = alloc_vfsmnt(name);
        if (!mnt)
                goto out;
        if (data) {
                secdata = alloc_secdata();
-                if (!secdata) {
+                if (!secdata)
-                        sb = ERR_PTR(-ENOMEM);
                        goto out_mnt;
-                }
                error = security_sb_copy_data(type, data, secdata);
-                if (error) {
+                if (error)
-                        sb = ERR_PTR(error);
                        goto out_free_secdata;
-                }
        }
-        sb = type->get_sb(type, flags, name, data);
+        error = type->get_sb(type, flags, name, data, mnt);
-        if (IS_ERR(sb))
+        if (error < 0)
                goto out_free_secdata;
-        error = security_sb_kern_mount(sb, secdata);
+        error = security_sb_kern_mount(mnt->mnt_sb, secdata);
        if (error)
                goto out_sb;
-        mnt->mnt_sb = sb;
-        mnt->mnt_root = dget(sb->s_root);
+        mnt->mnt_mountpoint = mnt->mnt_root;
-        mnt->mnt_mountpoint = sb->s_root;
        mnt->mnt_parent = mnt;
-        up_write(&sb->s_umount);
+        up_write(&mnt->mnt_sb->s_umount);
        free_secdata(secdata);
-        put_filesystem(type);
        return mnt;
 out_sb:
-        up_write(&sb->s_umount);
+        dput(mnt->mnt_root);
-        deactivate_super(sb);
+        up_write(&mnt->mnt_sb->s_umount);
-        sb = ERR_PTR(error);
+        deactivate_super(mnt->mnt_sb);
 out_free_secdata:
        free_secdata(secdata);
 out_mnt:
        free_vfsmnt(mnt);
 out:
-        put_filesystem(type);
+        return ERR_PTR(error);
-        return (struct vfsmount *)sb;
 }
-EXPORT_SYMBOL_GPL(do_kern_mount);
+EXPORT_SYMBOL_GPL(vfs_kern_mount);
+struct vfsmount *
+do_kern_mount(const char *fstype, int flags, const char *name, void *data)
+{
+        struct file_system_type *type = get_fs_type(fstype);
+        struct vfsmount *mnt;
+        if (!type)
+                return ERR_PTR(-ENODEV);
+        mnt = vfs_kern_mount(type, flags, name, data);
+        put_filesystem(type);
+        return mnt;
+}
 struct vfsmount *kern_mount(struct file_system_type *type)
 {
-        return do_kern_mount(type->name, 0, type->name, NULL);
+        return vfs_kern_mount(type, 0, type->name, NULL);
 }
 EXPORT_SYMBOL(kern_mount);
diff --git a/fs/sync.c b/fs/sync.c
index aab5ffe77e9f..955aef04da28 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -100,7 +100,7 @@ asmlinkage long sys_sync_file_range(int fd, loff_t offset, loff_t nbytes,
        }
        if (nbytes == 0)
-                endbyte = -1;
+                endbyte = LLONG_MAX;
        else
                endbyte--;              /* inclusive */
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 610b5bdbe75b..61c42430cba3 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -430,10 +430,9 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
                        i++;
                        /* fallthrough */
                default:
-                        if (filp->f_pos == 2) {
+                        if (filp->f_pos == 2)
-                                list_del(q);
+                                list_move(q, &parent_sd->s_children);
-                                list_add(q, &parent_sd->s_children);
-                        }
                        for (p=q->next; p!= &parent_sd->s_children; p=p->next) {
                                struct sysfs_dirent *next;
                                const char * name;
@@ -455,8 +454,7 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
                                                 dt_type(next)) < 0)
                                        return 0;
-                                list_del(q);
+                                list_move(q, p);
-                                list_add(q, p);
                                p = q;
                                filp->f_pos++;
                        }
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index f0b347bd12ca..5e0e31cc46f5 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -16,7 +16,7 @@
 extern struct super_block * sysfs_sb;
-static struct address_space_operations sysfs_aops = {
+static const struct address_space_operations sysfs_aops = {
        .readpage       = simple_readpage,
        .prepare_write  = simple_prepare_write,
        .commit_write   = simple_commit_write
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index f1117e885bd6..40190c489271 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -66,10 +66,10 @@ static int sysfs_fill_super(struct super_block *sb, void *data, int silent)
        return 0;
 }
-static struct super_block *sysfs_get_sb(struct file_system_type *fs_type,
+static int sysfs_get_sb(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data)
+        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-        return get_sb_single(fs_type, flags, data, sysfs_fill_super);
+        return get_sb_single(fs_type, flags, data, sysfs_fill_super, mnt);
 }
 static struct file_system_type sysfs_fs_type = {
diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c
index d7074341ee87..f2bef962d309 100644
--- a/fs/sysv/dir.c
+++ b/fs/sysv/dir.c
@@ -53,8 +53,7 @@ static int dir_commit_chunk(struct page *page, unsigned from, unsigned to)
 static struct page * dir_get_page(struct inode *dir, unsigned long n)
 {
        struct address_space *mapping = dir->i_mapping;
-        struct page *page = read_cache_page(mapping, n,
+        struct page *page = read_mapping_page(mapping, n, NULL);
-                                (filler_t*)mapping->a_ops->readpage, NULL);
        if (!IS_ERR(page)) {
                wait_on_page_locked(page);
                kmap(page);
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index 3ff89cc5833a..58b2d22142ba 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -85,8 +85,9 @@ static void sysv_put_super(struct super_block *sb)
        kfree(sbi);
 }
-static int sysv_statfs(struct super_block *sb, struct kstatfs *buf)
+static int sysv_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
+        struct super_block *sb = dentry->d_sb;
        struct sysv_sb_info *sbi = SYSV_SB(sb);
        buf->f_type = sb->s_magic;
diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c
index 86f5f8d43d0f..f2bcccd1d6fc 100644
--- a/fs/sysv/itree.c
+++ b/fs/sysv/itree.c
@@ -465,7 +465,7 @@ static sector_t sysv_bmap(struct address_space *mapping, sector_t block)
 {
        return generic_block_bmap(mapping,block,get_block);
 }
-struct address_space_operations sysv_aops = {
+const struct address_space_operations sysv_aops = {
        .readpage = sysv_readpage,
        .writepage = sysv_writepage,
        .sync_page = block_sync_page,
diff --git a/fs/sysv/super.c b/fs/sysv/super.c
index e92b991e6dda..876639b93321 100644
--- a/fs/sysv/super.c
+++ b/fs/sysv/super.c
@@ -506,16 +506,17 @@ failed:
 /* Every kernel module contains stuff like this. */
-static struct super_block *sysv_get_sb(struct file_system_type *fs_type,
+static int sysv_get_sb(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data)
+        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-        return get_sb_bdev(fs_type, flags, dev_name, data, sysv_fill_super);
+        return get_sb_bdev(fs_type, flags, dev_name, data, sysv_fill_super,
+                           mnt);
 }
-static struct super_block *v7_get_sb(struct file_system_type *fs_type,
+static int v7_get_sb(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data)
+        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-        return get_sb_bdev(fs_type, flags, dev_name, data, v7_fill_super);
+        return get_sb_bdev(fs_type, flags, dev_name, data, v7_fill_super, mnt);
 }
 static struct file_system_type sysv_fs_type = {
diff --git a/fs/sysv/sysv.h b/fs/sysv/sysv.h
index 393a480e4deb..9dcc82120935 100644
--- a/fs/sysv/sysv.h
+++ b/fs/sysv/sysv.h
@@ -161,7 +161,7 @@ extern struct inode_operations sysv_dir_inode_operations;
 extern struct inode_operations sysv_fast_symlink_inode_operations;
 extern const struct file_operations sysv_file_operations;
 extern const struct file_operations sysv_dir_operations;
-extern struct address_space_operations sysv_aops;
+extern const struct address_space_operations sysv_aops;
 extern struct super_operations sysv_sops;
 extern struct dentry_operations sysv_dentry_operations;
diff --git a/fs/udf/file.c b/fs/udf/file.c
index e34b00e303f1..a59e5f33daf6 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -95,7 +95,7 @@ static int udf_adinicb_commit_write(struct file *file, struct page *page, unsign
        return 0;
 }
-struct address_space_operations udf_adinicb_aops = {
+const struct address_space_operations udf_adinicb_aops = {
        .readpage               = udf_adinicb_readpage,
        .writepage              = udf_adinicb_writepage,
        .sync_page              = block_sync_page,
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 2983afd5e7fd..605f5111b6d8 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -132,7 +132,7 @@ static sector_t udf_bmap(struct address_space *mapping, sector_t block)
        return generic_block_bmap(mapping,block,udf_get_block);
 }
-struct address_space_operations udf_aops = {
+const struct address_space_operations udf_aops = {
        .readpage               = udf_readpage,
        .writepage              = udf_writepage,
        .sync_page              = block_sync_page,
diff --git a/fs/udf/super.c b/fs/udf/super.c
index e45789fe38e8..4df822c881b6 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -40,7 +40,6 @@
 #include "udfdecl.h"    
-#include <linux/config.h>
 #include <linux/blkdev.h>
 #include <linux/slab.h>
 #include <linux/kernel.h>
@@ -91,13 +90,13 @@ static void udf_load_partdesc(struct super_block *, struct buffer_head *);
 static void udf_open_lvid(struct super_block *);
 static void udf_close_lvid(struct super_block *);
 static unsigned int udf_count_free(struct super_block *);
-static int udf_statfs(struct super_block *, struct kstatfs *);
+static int udf_statfs(struct dentry *, struct kstatfs *);
 /* UDF filesystem type */
-static struct super_block *udf_get_sb(struct file_system_type *fs_type,
+static int udf_get_sb(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data)
+        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-        return get_sb_bdev(fs_type, flags, dev_name, data, udf_fill_super);
+        return get_sb_bdev(fs_type, flags, dev_name, data, udf_fill_super, mnt);
 }
 static struct file_system_type udf_fstype = {
@@ -1779,8 +1778,10 @@ udf_put_super(struct super_block *sb)
 *      Written, tested, and released.
 */
 static int
-udf_statfs(struct super_block *sb, struct kstatfs *buf)
+udf_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
+        struct super_block *sb = dentry->d_sb;
        buf->f_type = UDF_SUPER_MAGIC;
        buf->f_bsize = sb->s_blocksize;
        buf->f_blocks = UDF_SB_PARTLEN(sb, UDF_SB_PARTITION(sb));
diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c
index 674bb40edc83..ba068a786563 100644
--- a/fs/udf/symlink.c
+++ b/fs/udf/symlink.c
@@ -113,6 +113,6 @@ out:
 /*
 * symlinks can't do much...
 */
-struct address_space_operations udf_symlink_aops = {
+const struct address_space_operations udf_symlink_aops = {
        .readpage               = udf_symlink_filler,
 };
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index 023e19ba5a2e..1033b7cf2939 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -6,7 +6,6 @@
 #include "osta_udf.h"
 #include <linux/fs.h>
-#include <linux/config.h>
 #include <linux/types.h>
 #include <linux/udf_fs_i.h>
 #include <linux/udf_fs_sb.h>
@@ -47,9 +46,9 @@ extern struct inode_operations udf_dir_inode_operations;
 extern const struct file_operations udf_dir_operations;
 extern struct inode_operations udf_file_inode_operations;
 extern const struct file_operations udf_file_operations;
-extern struct address_space_operations udf_aops;
+extern const struct address_space_operations udf_aops;
-extern struct address_space_operations udf_adinicb_aops;
+extern const struct address_space_operations udf_adinicb_aops;
-extern struct address_space_operations udf_symlink_aops;
+extern const struct address_space_operations udf_symlink_aops;
 struct udf_fileident_bh
 {
diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index 3ada9dcf55b8..b01804baa120 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -21,14 +21,6 @@
 #include "swab.h"
 #include "util.h"
-#undef UFS_BALLOC_DEBUG
-#ifdef UFS_BALLOC_DEBUG
-#define UFSD(x) printk("(%s, %d), %s:", __FILE__, __LINE__, __FUNCTION__); printk x;
-#else
-#define UFSD(x)
-#endif
 static unsigned ufs_add_fragments (struct inode *, unsigned, unsigned, unsigned, int *);
 static unsigned ufs_alloc_fragments (struct inode *, unsigned, unsigned, unsigned, int *);
 static unsigned ufs_alloccg_block (struct inode *, struct ufs_cg_private_info *, unsigned, int *);
@@ -39,7 +31,8 @@ static void ufs_clusteracct(struct super_block *, struct ufs_cg_private_info *,
 /*
 * Free 'count' fragments from fragment number 'fragment'
 */
-void ufs_free_fragments (struct inode * inode, unsigned fragment, unsigned count) {
+void ufs_free_fragments(struct inode *inode, unsigned fragment, unsigned count)
+{
        struct super_block * sb;
        struct ufs_sb_private_info * uspi;
        struct ufs_super_block_first * usb1;
@@ -51,7 +44,7 @@ void ufs_free_fragments (struct inode * inode, unsigned fragment, unsigned count
        uspi = UFS_SB(sb)->s_uspi;
        usb1 = ubh_get_usb_first(uspi);
        
-        UFSD(("ENTER, fragment %u, count %u\n", fragment, count))
+        UFSD("ENTER, fragment %u, count %u\n", fragment, count);
        
        if (ufs_fragnum(fragment) + count > uspi->s_fpg)
                ufs_error (sb, "ufs_free_fragments", "internal error");
@@ -68,7 +61,7 @@ void ufs_free_fragments (struct inode * inode, unsigned fragment, unsigned count
        ucpi = ufs_load_cylinder (sb, cgno);
        if (!ucpi) 
                goto failed;
-        ucg = ubh_get_ucg (UCPI_UBH);
+        ucg = ubh_get_ucg (UCPI_UBH(ucpi));
        if (!ufs_cg_chkmagic(sb, ucg)) {
                ufs_panic (sb, "ufs_free_fragments", "internal error, bad magic number on cg %u", cgno);
                goto failed;
@@ -76,11 +69,11 @@ void ufs_free_fragments (struct inode * inode, unsigned fragment, unsigned count
        end_bit = bit + count;
        bbase = ufs_blknum (bit);
-        blkmap = ubh_blkmap (UCPI_UBH, ucpi->c_freeoff, bbase);
+        blkmap = ubh_blkmap (UCPI_UBH(ucpi), ucpi->c_freeoff, bbase);
        ufs_fragacct (sb, blkmap, ucg->cg_frsum, -1);
        for (i = bit; i < end_bit; i++) {
-                if (ubh_isclr (UCPI_UBH, ucpi->c_freeoff, i))
+                if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_freeoff, i))
-                        ubh_setbit (UCPI_UBH, ucpi->c_freeoff, i);
+                        ubh_setbit (UCPI_UBH(ucpi), ucpi->c_freeoff, i);
                else 
                        ufs_error (sb, "ufs_free_fragments",
                                   "bit already cleared for fragment %u", i);
@@ -90,51 +83,52 @@ void ufs_free_fragments (struct inode * inode, unsigned fragment, unsigned count
        
        fs32_add(sb, &ucg->cg_cs.cs_nffree, count);
-        fs32_add(sb, &usb1->fs_cstotal.cs_nffree, count);
+        uspi->cs_total.cs_nffree += count;
        fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, count);
-        blkmap = ubh_blkmap (UCPI_UBH, ucpi->c_freeoff, bbase);
+        blkmap = ubh_blkmap (UCPI_UBH(ucpi), ucpi->c_freeoff, bbase);
        ufs_fragacct(sb, blkmap, ucg->cg_frsum, 1);
        /*
         * Trying to reassemble free fragments into block
         */
        blkno = ufs_fragstoblks (bbase);
-        if (ubh_isblockset(UCPI_UBH, ucpi->c_freeoff, blkno)) {
+        if (ubh_isblockset(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno)) {
                fs32_sub(sb, &ucg->cg_cs.cs_nffree, uspi->s_fpb);
-                fs32_sub(sb, &usb1->fs_cstotal.cs_nffree, uspi->s_fpb);
+                uspi->cs_total.cs_nffree -= uspi->s_fpb;
                fs32_sub(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, uspi->s_fpb);
                if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD)
                        ufs_clusteracct (sb, ucpi, blkno, 1);
                fs32_add(sb, &ucg->cg_cs.cs_nbfree, 1);
-                fs32_add(sb, &usb1->fs_cstotal.cs_nbfree, 1);
+                uspi->cs_total.cs_nbfree++;
                fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nbfree, 1);
                cylno = ufs_cbtocylno (bbase);
                fs16_add(sb, &ubh_cg_blks(ucpi, cylno, ufs_cbtorpos(bbase)), 1);
                fs32_add(sb, &ubh_cg_blktot(ucpi, cylno), 1);
        }
        
-        ubh_mark_buffer_dirty (USPI_UBH);
+        ubh_mark_buffer_dirty (USPI_UBH(uspi));
-        ubh_mark_buffer_dirty (UCPI_UBH);
+        ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
        if (sb->s_flags & MS_SYNCHRONOUS) {
-                ubh_ll_rw_block (SWRITE, 1, (struct ufs_buffer_head **)&ucpi);
+                ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi));
-                ubh_wait_on_buffer (UCPI_UBH);
+                ubh_wait_on_buffer (UCPI_UBH(ucpi));
        }
        sb->s_dirt = 1;
        
        unlock_super (sb);
-        UFSD(("EXIT\n"))
+        UFSD("EXIT\n");
        return;
 failed:
        unlock_super (sb);
-        UFSD(("EXIT (FAILED)\n"))
+        UFSD("EXIT (FAILED)\n");
        return;
 }
 /*
 * Free 'count' fragments from fragment number 'fragment' (free whole blocks)
 */
-void ufs_free_blocks (struct inode * inode, unsigned fragment, unsigned count) {
+void ufs_free_blocks(struct inode *inode, unsigned fragment, unsigned count)
+{
        struct super_block * sb;
        struct ufs_sb_private_info * uspi;
        struct ufs_super_block_first * usb1;
@@ -146,7 +140,7 @@ void ufs_free_blocks (struct inode * inode, unsigned fragment, unsigned count) {
        uspi = UFS_SB(sb)->s_uspi;
        usb1 = ubh_get_usb_first(uspi);
-        UFSD(("ENTER, fragment %u, count %u\n", fragment, count))
+        UFSD("ENTER, fragment %u, count %u\n", fragment, count);
        
        if ((fragment & uspi->s_fpbmask) || (count & uspi->s_fpbmask)) {
                ufs_error (sb, "ufs_free_blocks", "internal error, "
@@ -162,7 +156,7 @@ do_more:
        bit = ufs_dtogd (fragment);
        if (cgno >= uspi->s_ncg) {
                ufs_panic (sb, "ufs_free_blocks", "freeing blocks are outside device");
-                goto failed;
+                goto failed_unlock;
        }
        end_bit = bit + count;
        if (end_bit > uspi->s_fpg) {
@@ -173,36 +167,36 @@ do_more:
        ucpi = ufs_load_cylinder (sb, cgno);
        if (!ucpi) 
-                goto failed;
+                goto failed_unlock;
-        ucg = ubh_get_ucg (UCPI_UBH);
+        ucg = ubh_get_ucg (UCPI_UBH(ucpi));
        if (!ufs_cg_chkmagic(sb, ucg)) {
                ufs_panic (sb, "ufs_free_blocks", "internal error, bad magic number on cg %u", cgno);
-                goto failed;
+                goto failed_unlock;
        }
        for (i = bit; i < end_bit; i += uspi->s_fpb) {
                blkno = ufs_fragstoblks(i);
-                if (ubh_isblockset(UCPI_UBH, ucpi->c_freeoff, blkno)) {
+                if (ubh_isblockset(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno)) {
                        ufs_error(sb, "ufs_free_blocks", "freeing free fragment");
                }
-                ubh_setblock(UCPI_UBH, ucpi->c_freeoff, blkno);
+                ubh_setblock(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno);
                if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD)
                        ufs_clusteracct (sb, ucpi, blkno, 1);
                DQUOT_FREE_BLOCK(inode, uspi->s_fpb);
                fs32_add(sb, &ucg->cg_cs.cs_nbfree, 1);
-                fs32_add(sb, &usb1->fs_cstotal.cs_nbfree, 1);
+                uspi->cs_total.cs_nbfree++;
                fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nbfree, 1);
                cylno = ufs_cbtocylno(i);
                fs16_add(sb, &ubh_cg_blks(ucpi, cylno, ufs_cbtorpos(i)), 1);
                fs32_add(sb, &ubh_cg_blktot(ucpi, cylno), 1);
        }
-        ubh_mark_buffer_dirty (USPI_UBH);
+        ubh_mark_buffer_dirty (USPI_UBH(uspi));
-        ubh_mark_buffer_dirty (UCPI_UBH);
+        ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
        if (sb->s_flags & MS_SYNCHRONOUS) {
-                ubh_ll_rw_block (SWRITE, 1, (struct ufs_buffer_head **)&ucpi);
+                ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi));
-                ubh_wait_on_buffer (UCPI_UBH);
+                ubh_wait_on_buffer (UCPI_UBH(ucpi));
        }
        if (overflow) {
@@ -213,38 +207,83 @@ do_more:
        sb->s_dirt = 1;
        unlock_super (sb);
-        UFSD(("EXIT\n"))
+        UFSD("EXIT\n");
        return;
-failed:
+failed_unlock:
        unlock_super (sb);
-        UFSD(("EXIT (FAILED)\n"))
+failed:
+        UFSD("EXIT (FAILED)\n");
        return;
 }
+/*
+ * Modify the inode page cache in the following way:
+ * have - blocks with b_blocknr equal to oldb...oldb+count-1
+ * get - blocks with b_blocknr equal to newb...newb+count-1
+ * We also assume that the blocks oldb...oldb+count-1 are
+ * situated at the end of the file.
+ *
+ * We can come here from ufs_writepage or ufs_prepare_write;
+ * locked_page is an argument of these functions, so it is already locked.
+ */
+static void ufs_change_blocknr(struct inode *inode, unsigned int baseblk,
+                               unsigned int count, unsigned int oldb,
+                               unsigned int newb, struct page *locked_page)
+{
+        unsigned int blk_per_page = 1 << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+        struct address_space *mapping = inode->i_mapping;
+        pgoff_t index, cur_index = locked_page->index;
+        unsigned int i, j;
+        struct page *page;
+        struct buffer_head *head, *bh;
+        UFSD("ENTER, ino %lu, count %u, oldb %u, newb %u\n",
+              inode->i_ino, count, oldb, newb);
+        BUG_ON(!PageLocked(locked_page));
+        for (i = 0; i < count; i += blk_per_page) {
+                index = (baseblk+i) >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
+                if (likely(cur_index != index)) {
+                        page = ufs_get_locked_page(mapping, index);
+                        if (IS_ERR(page))
+                                continue;
+                } else
+                        page = locked_page;
+                j = i;
+                head = page_buffers(page);
+                bh = head;
+                do {
+                        if (likely(bh->b_blocknr == j + oldb && j < count)) {
+                                unmap_underlying_metadata(bh->b_bdev,
+                                                          bh->b_blocknr);
+                                bh->b_blocknr = newb + j++;
+                                mark_buffer_dirty(bh);
+                        }
+                        bh = bh->b_this_page;
+                } while (bh != head);
-#define NULLIFY_FRAGMENTS \
+                set_page_dirty(page);
-        for (i = oldcount; i < newcount; i++) { \
-                bh = sb_getblk(sb, result + i); \
+                if (likely(cur_index != index))
-                memset (bh->b_data, 0, sb->s_blocksize); \
+                        ufs_put_locked_page(page);
-                set_buffer_uptodate(bh); \
+        }
-                mark_buffer_dirty (bh); \
+        UFSD("EXIT\n");
-                if (IS_SYNC(inode)) \
+}
-                        sync_dirty_buffer(bh); \
-                brelse (bh); \
-        }
-unsigned ufs_new_fragments (struct inode * inode, __fs32 * p, unsigned fragment,
+unsigned ufs_new_fragments(struct inode * inode, __fs32 * p, unsigned fragment,
-        unsigned goal, unsigned count, int * err )
+                           unsigned goal, unsigned count, int * err, struct page *locked_page)
 {
        struct super_block * sb;
        struct ufs_sb_private_info * uspi;
        struct ufs_super_block_first * usb1;
-        struct buffer_head * bh;
+        unsigned cgno, oldcount, newcount, tmp, request, result;
-        unsigned cgno, oldcount, newcount, tmp, request, i, result;
        
-        UFSD(("ENTER, ino %lu, fragment %u, goal %u, count %u\n", inode->i_ino, fragment, goal, count))
+        UFSD("ENTER, ino %lu, fragment %u, goal %u, count %u\n", inode->i_ino, fragment, goal, count);
        
        sb = inode->i_sb;
        uspi = UFS_SB(sb)->s_uspi;
@@ -273,14 +312,14 @@ unsigned ufs_new_fragments (struct inode * inode, __fs32 * p, unsigned fragment,
                        return (unsigned)-1;
                }
                if (fragment < UFS_I(inode)->i_lastfrag) {
-                        UFSD(("EXIT (ALREADY ALLOCATED)\n"))
+                        UFSD("EXIT (ALREADY ALLOCATED)\n");
                        unlock_super (sb);
                        return 0;
                }
        }
        else {
                if (tmp) {
-                        UFSD(("EXIT (ALREADY ALLOCATED)\n"))
+                        UFSD("EXIT (ALREADY ALLOCATED)\n");
                        unlock_super(sb);
                        return 0;
                }
@@ -289,9 +328,9 @@ unsigned ufs_new_fragments (struct inode * inode, __fs32 * p, unsigned fragment,
        /*
         * There is not enough space for user on the device
         */
-        if (!capable(CAP_SYS_RESOURCE) && ufs_freespace(usb1, UFS_MINFREE) <= 0) {
+        if (!capable(CAP_SYS_RESOURCE) && ufs_freespace(uspi, UFS_MINFREE) <= 0) {
                unlock_super (sb);
-                UFSD(("EXIT (FAILED)\n"))
+                UFSD("EXIT (FAILED)\n");
                return 0;
        }
@@ -310,12 +349,10 @@ unsigned ufs_new_fragments (struct inode * inode, __fs32 * p, unsigned fragment,
                if (result) {
                        *p = cpu_to_fs32(sb, result);
                        *err = 0;
-                        inode->i_blocks += count << uspi->s_nspfshift;
                        UFS_I(inode)->i_lastfrag = max_t(u32, UFS_I(inode)->i_lastfrag, fragment + count);
-                        NULLIFY_FRAGMENTS
                }
                unlock_super(sb);
-                UFSD(("EXIT, result %u\n", result))
+                UFSD("EXIT, result %u\n", result);
                return result;
        }
@@ -325,11 +362,9 @@ unsigned ufs_new_fragments (struct inode * inode, __fs32 * p, unsigned fragment,
        result = ufs_add_fragments (inode, tmp, oldcount, newcount, err);
        if (result) {
                *err = 0;
-                inode->i_blocks += count << uspi->s_nspfshift;
                UFS_I(inode)->i_lastfrag = max_t(u32, UFS_I(inode)->i_lastfrag, fragment + count);
-                NULLIFY_FRAGMENTS
                unlock_super(sb);
-                UFSD(("EXIT, result %u\n", result))
+                UFSD("EXIT, result %u\n", result);
                return result;
        }
@@ -339,8 +374,8 @@ unsigned ufs_new_fragments (struct inode * inode, __fs32 * p, unsigned fragment,
        switch (fs32_to_cpu(sb, usb1->fs_optim)) {
            case UFS_OPTSPACE:
                request = newcount;
-                if (uspi->s_minfree < 5 || fs32_to_cpu(sb, usb1->fs_cstotal.cs_nffree) 
+                if (uspi->s_minfree < 5 || uspi->cs_total.cs_nffree
-                    > uspi->s_dsize * uspi->s_minfree / (2 * 100) )
+                    > uspi->s_dsize * uspi->s_minfree / (2 * 100))
                        break;
                usb1->fs_optim = cpu_to_fs32(sb, UFS_OPTTIME);
                break;
@@ -349,7 +384,7 @@ unsigned ufs_new_fragments (struct inode * inode, __fs32 * p, unsigned fragment,
        
            case UFS_OPTTIME:
                request = uspi->s_fpb;
-                if (fs32_to_cpu(sb, usb1->fs_cstotal.cs_nffree) < uspi->s_dsize *
+                if (uspi->cs_total.cs_nffree < uspi->s_dsize *
                    (uspi->s_minfree - 2) / 100)
                        break;
                usb1->fs_optim = cpu_to_fs32(sb, UFS_OPTTIME);
@@ -357,39 +392,22 @@ unsigned ufs_new_fragments (struct inode * inode, __fs32 * p, unsigned fragment,
        }
        result = ufs_alloc_fragments (inode, cgno, goal, request, err);
        if (result) {
-                for (i = 0; i < oldcount; i++) {
+                ufs_change_blocknr(inode, fragment - oldcount, oldcount, tmp,
-                        bh = sb_bread(sb, tmp + i);
+                                   result, locked_page);
-                        if(bh)
-                        {
-                                clear_buffer_dirty(bh);
-                                bh->b_blocknr = result + i;
-                                mark_buffer_dirty (bh);
-                                if (IS_SYNC(inode))
-                                        sync_dirty_buffer(bh);
-                                brelse (bh);
-                        }
-                        else
-                        {
-                                printk(KERN_ERR "ufs_new_fragments: bread fail\n");
-                                unlock_super(sb);
-                                return 0;
-                        }
-                }
                *p = cpu_to_fs32(sb, result);
                *err = 0;
-                inode->i_blocks += count << uspi->s_nspfshift;
                UFS_I(inode)->i_lastfrag = max_t(u32, UFS_I(inode)->i_lastfrag, fragment + count);
-                NULLIFY_FRAGMENTS
                unlock_super(sb);
                if (newcount < request)
                        ufs_free_fragments (inode, result + newcount, request - newcount);
                ufs_free_fragments (inode, tmp, oldcount);
-                UFSD(("EXIT, result %u\n", result))
+                UFSD("EXIT, result %u\n", result);
                return result;
        }
        unlock_super(sb);
-        UFSD(("EXIT (FAILED)\n"))
+        UFSD("EXIT (FAILED)\n");
        return 0;
 }               
@@ -404,7 +422,7 @@ ufs_add_fragments (struct inode * inode, unsigned fragment,
        struct ufs_cylinder_group * ucg;
        unsigned cgno, fragno, fragoff, count, fragsize, i;
        
-        UFSD(("ENTER, fragment %u, oldcount %u, newcount %u\n", fragment, oldcount, newcount))
+        UFSD("ENTER, fragment %u, oldcount %u, newcount %u\n", fragment, oldcount, newcount);
        
        sb = inode->i_sb;
        uspi = UFS_SB(sb)->s_uspi;
@@ -419,7 +437,7 @@ ufs_add_fragments (struct inode * inode, unsigned fragment,
        ucpi = ufs_load_cylinder (sb, cgno);
        if (!ucpi)
                return 0;
-        ucg = ubh_get_ucg (UCPI_UBH);
+        ucg = ubh_get_ucg (UCPI_UBH(ucpi));
        if (!ufs_cg_chkmagic(sb, ucg)) {
                ufs_panic (sb, "ufs_add_fragments",
                        "internal error, bad magic number on cg %u", cgno);
@@ -429,14 +447,14 @@ ufs_add_fragments (struct inode * inode, unsigned fragment,
        fragno = ufs_dtogd (fragment);
        fragoff = ufs_fragnum (fragno);
        for (i = oldcount; i < newcount; i++)
-                if (ubh_isclr (UCPI_UBH, ucpi->c_freeoff, fragno + i))
+                if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_freeoff, fragno + i))
                        return 0;
        /*
         * Block can be extended
         */
        ucg->cg_time = cpu_to_fs32(sb, get_seconds());
        for (i = newcount; i < (uspi->s_fpb - fragoff); i++)
-                if (ubh_isclr (UCPI_UBH, ucpi->c_freeoff, fragno + i))
+                if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_freeoff, fragno + i))
                        break;
        fragsize = i - oldcount;
        if (!fs32_to_cpu(sb, ucg->cg_frsum[fragsize]))
@@ -446,7 +464,7 @@ ufs_add_fragments (struct inode * inode, unsigned fragment,
        if (fragsize != count)
                fs32_add(sb, &ucg->cg_frsum[fragsize - count], 1);
        for (i = oldcount; i < newcount; i++)
-                ubh_clrbit (UCPI_UBH, ucpi->c_freeoff, fragno + i);
+                ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_freeoff, fragno + i);
        if(DQUOT_ALLOC_BLOCK(inode, count)) {
                *err = -EDQUOT;
                return 0;
@@ -454,17 +472,17 @@ ufs_add_fragments (struct inode * inode, unsigned fragment,
        fs32_sub(sb, &ucg->cg_cs.cs_nffree, count);
        fs32_sub(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, count);
-        fs32_sub(sb, &usb1->fs_cstotal.cs_nffree, count);
+        uspi->cs_total.cs_nffree -= count;
        
-        ubh_mark_buffer_dirty (USPI_UBH);
+        ubh_mark_buffer_dirty (USPI_UBH(uspi));
-        ubh_mark_buffer_dirty (UCPI_UBH);
+        ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
        if (sb->s_flags & MS_SYNCHRONOUS) {
-                ubh_ll_rw_block (SWRITE, 1, (struct ufs_buffer_head **)&ucpi);
+                ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi));
-                ubh_wait_on_buffer (UCPI_UBH);
+                ubh_wait_on_buffer (UCPI_UBH(ucpi));
        }
        sb->s_dirt = 1;
-        UFSD(("EXIT, fragment %u\n", fragment))
+        UFSD("EXIT, fragment %u\n", fragment);
        
        return fragment;
 }
@@ -487,7 +505,7 @@ static unsigned ufs_alloc_fragments (struct inode * inode, unsigned cgno,
        struct ufs_cylinder_group * ucg;
        unsigned oldcg, i, j, k, result, allocsize;
        
-        UFSD(("ENTER, ino %lu, cgno %u, goal %u, count %u\n", inode->i_ino, cgno, goal, count))
+        UFSD("ENTER, ino %lu, cgno %u, goal %u, count %u\n", inode->i_ino, cgno, goal, count);
        sb = inode->i_sb;
        uspi = UFS_SB(sb)->s_uspi;
@@ -521,14 +539,14 @@ static unsigned ufs_alloc_fragments (struct inode * inode, unsigned cgno,
                UFS_TEST_FREE_SPACE_CG
        }
        
-        UFSD(("EXIT (FAILED)\n"))
+        UFSD("EXIT (FAILED)\n");
        return 0;
 cg_found:
        ucpi = ufs_load_cylinder (sb, cgno);
        if (!ucpi)
                return 0;
-        ucg = ubh_get_ucg (UCPI_UBH);
+        ucg = ubh_get_ucg (UCPI_UBH(ucpi));
        if (!ufs_cg_chkmagic(sb, ucg)) 
                ufs_panic (sb, "ufs_alloc_fragments",
                        "internal error, bad magic number on cg %u", cgno);
@@ -551,12 +569,12 @@ cg_found:
                        return 0;
                goal = ufs_dtogd (result);
                for (i = count; i < uspi->s_fpb; i++)
-                        ubh_setbit (UCPI_UBH, ucpi->c_freeoff, goal + i);
+                        ubh_setbit (UCPI_UBH(ucpi), ucpi->c_freeoff, goal + i);
                i = uspi->s_fpb - count;
                DQUOT_FREE_BLOCK(inode, i);
                fs32_add(sb, &ucg->cg_cs.cs_nffree, i);
-                fs32_add(sb, &usb1->fs_cstotal.cs_nffree, i);
+                uspi->cs_total.cs_nffree += i;
                fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, i);
                fs32_add(sb, &ucg->cg_frsum[i], 1);
                goto succed;
@@ -570,10 +588,10 @@ cg_found:
                return 0;
        }
        for (i = 0; i < count; i++)
-                ubh_clrbit (UCPI_UBH, ucpi->c_freeoff, result + i);
+                ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_freeoff, result + i);
        
        fs32_sub(sb, &ucg->cg_cs.cs_nffree, count);
-        fs32_sub(sb, &usb1->fs_cstotal.cs_nffree, count);
+        uspi->cs_total.cs_nffree -= count;
        fs32_sub(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, count);
        fs32_sub(sb, &ucg->cg_frsum[allocsize], 1);
@@ -581,16 +599,16 @@ cg_found:
                fs32_add(sb, &ucg->cg_frsum[allocsize - count], 1);
 succed:
-        ubh_mark_buffer_dirty (USPI_UBH);
+        ubh_mark_buffer_dirty (USPI_UBH(uspi));
-        ubh_mark_buffer_dirty (UCPI_UBH);
+        ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
        if (sb->s_flags & MS_SYNCHRONOUS) {
-                ubh_ll_rw_block (SWRITE, 1, (struct ufs_buffer_head **)&ucpi);
+                ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi));
-                ubh_wait_on_buffer (UCPI_UBH);
+                ubh_wait_on_buffer (UCPI_UBH(ucpi));
        }
        sb->s_dirt = 1;
        result += cgno * uspi->s_fpg;
-        UFSD(("EXIT3, result %u\n", result))
+        UFSD("EXIT3, result %u\n", result);
        return result;
 }
@@ -603,12 +621,12 @@ static unsigned ufs_alloccg_block (struct inode * inode,
        struct ufs_cylinder_group * ucg;
        unsigned result, cylno, blkno;
-        UFSD(("ENTER, goal %u\n", goal))
+        UFSD("ENTER, goal %u\n", goal);
        sb = inode->i_sb;
        uspi = UFS_SB(sb)->s_uspi;
        usb1 = ubh_get_usb_first(uspi);
-        ucg = ubh_get_ucg(UCPI_UBH);
+        ucg = ubh_get_ucg(UCPI_UBH(ucpi));
        if (goal == 0) {
                goal = ucpi->c_rotor;
@@ -620,7 +638,7 @@ static unsigned ufs_alloccg_block (struct inode * inode,
        /*
         * If the requested block is available, use it.
         */
-        if (ubh_isblockset(UCPI_UBH, ucpi->c_freeoff, ufs_fragstoblks(goal))) {
+        if (ubh_isblockset(UCPI_UBH(ucpi), ucpi->c_freeoff, ufs_fragstoblks(goal))) {
                result = goal;
                goto gotit;
        }
@@ -632,7 +650,7 @@ norot:
        ucpi->c_rotor = result;
 gotit:
        blkno = ufs_fragstoblks(result);
-        ubh_clrblock (UCPI_UBH, ucpi->c_freeoff, blkno);
+        ubh_clrblock (UCPI_UBH(ucpi), ucpi->c_freeoff, blkno);
        if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD)
                ufs_clusteracct (sb, ucpi, blkno, -1);
        if(DQUOT_ALLOC_BLOCK(inode, uspi->s_fpb)) {
@@ -641,31 +659,76 @@ gotit:
        }
        fs32_sub(sb, &ucg->cg_cs.cs_nbfree, 1);
-        fs32_sub(sb, &usb1->fs_cstotal.cs_nbfree, 1);
+        uspi->cs_total.cs_nbfree--;
        fs32_sub(sb, &UFS_SB(sb)->fs_cs(ucpi->c_cgx).cs_nbfree, 1);
        cylno = ufs_cbtocylno(result);
        fs16_sub(sb, &ubh_cg_blks(ucpi, cylno, ufs_cbtorpos(result)), 1);
        fs32_sub(sb, &ubh_cg_blktot(ucpi, cylno), 1);
        
-        UFSD(("EXIT, result %u\n", result))
+        UFSD("EXIT, result %u\n", result);
        return result;
 }
-static unsigned ufs_bitmap_search (struct super_block * sb,
+static unsigned ubh_scanc(struct ufs_sb_private_info *uspi,
-        struct ufs_cg_private_info * ucpi, unsigned goal, unsigned count)
+                          struct ufs_buffer_head *ubh,
+                          unsigned begin, unsigned size,
+                          unsigned char *table, unsigned char mask)
 {
-        struct ufs_sb_private_info * uspi;
+        unsigned rest, offset;
-        struct ufs_super_block_first * usb1;
+        unsigned char *cp;
-        struct ufs_cylinder_group * ucg;
+        
-        unsigned start, length, location, result;
-        unsigned possition, fragsize, blockmap, mask;
+        offset = begin & ~uspi->s_fmask;
-        
+        begin >>= uspi->s_fshift;
-        UFSD(("ENTER, cg %u, goal %u, count %u\n", ucpi->c_cgx, goal, count))
+        for (;;) {
+                if ((offset + size) < uspi->s_fsize)
+                        rest = size;
+                else
+                        rest = uspi->s_fsize - offset;
+                size -= rest;
+                cp = ubh->bh[begin]->b_data + offset;
+                while ((table[*cp++] & mask) == 0 && --rest)
+                        ;
+                if (rest || !size)
+                        break;
+                begin++;
+                offset = 0;
+        }
+        return (size + rest);
+}
+/*
+ * Find a block of the specified size in the specified cylinder group.
+ * @sb: pointer to super block
+ * @ucpi: pointer to cylinder group info
+ * @goal: block near which we want to find a new one
+ * @count: specified size
+ */
+static unsigned ufs_bitmap_search(struct super_block *sb,
+                                  struct ufs_cg_private_info *ucpi,
+                                  unsigned goal, unsigned count)
+{
+        /*
+         * Bit patterns for identifying fragments in the block map
+         * used as ((map & mask_arr) == want_arr)
+         */
+        static const int mask_arr[9] = {
+                0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f, 0xff, 0x1ff, 0x3ff
+        };
+        static const int want_arr[9] = {
+                0x0, 0x2, 0x6, 0xe, 0x1e, 0x3e, 0x7e, 0xfe, 0x1fe
+        };
+        struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
+        struct ufs_super_block_first *usb1;
+        struct ufs_cylinder_group *ucg;
+        unsigned start, length, loc, result;
+        unsigned pos, want, blockmap, mask, end;
+        UFSD("ENTER, cg %u, goal %u, count %u\n", ucpi->c_cgx, goal, count);
-        uspi = UFS_SB(sb)->s_uspi;
        usb1 = ubh_get_usb_first (uspi);
-        ucg = ubh_get_ucg(UCPI_UBH);
+        ucg = ubh_get_ucg(UCPI_UBH(ucpi));
        if (goal)
                start = ufs_dtogd(goal) >> 3;
@@ -673,53 +736,50 @@ static unsigned ufs_bitmap_search (struct super_block * sb,
                start = ucpi->c_frotor >> 3;
                
        length = ((uspi->s_fpg + 7) >> 3) - start;
-        location = ubh_scanc(UCPI_UBH, ucpi->c_freeoff + start, length,
+        loc = ubh_scanc(uspi, UCPI_UBH(ucpi), ucpi->c_freeoff + start, length,
                (uspi->s_fpb == 8) ? ufs_fragtable_8fpb : ufs_fragtable_other,
                1 << (count - 1 + (uspi->s_fpb & 7))); 
-        if (location == 0) {
+        if (loc == 0) {
                length = start + 1;
-                location = ubh_scanc(UCPI_UBH, ucpi->c_freeoff, length, 
+                loc = ubh_scanc(uspi, UCPI_UBH(ucpi), ucpi->c_freeoff, length,
-                        (uspi->s_fpb == 8) ? ufs_fragtable_8fpb : ufs_fragtable_other,
+                                (uspi->s_fpb == 8) ? ufs_fragtable_8fpb :
-                        1 << (count - 1 + (uspi->s_fpb & 7)));
+                                ufs_fragtable_other,
-                if (location == 0) {
+                                1 << (count - 1 + (uspi->s_fpb & 7)));
-                        ufs_error (sb, "ufs_bitmap_search",
+                if (loc == 0) {
-                        "bitmap corrupted on cg %u, start %u, length %u, count %u, freeoff %u\n",
+                        ufs_error(sb, "ufs_bitmap_search",
-                        ucpi->c_cgx, start, length, count, ucpi->c_freeoff);
+                                  "bitmap corrupted on cg %u, start %u,"
+                                  " length %u, count %u, freeoff %u\n",
+                                  ucpi->c_cgx, start, length, count,
+                                  ucpi->c_freeoff);
                        return (unsigned)-1;
                }
                start = 0;
        }
-        result = (start + length - location) << 3;
+        result = (start + length - loc) << 3;
        ucpi->c_frotor = result;
        /*
         * found the byte in the map
         */
-        blockmap = ubh_blkmap(UCPI_UBH, ucpi->c_freeoff, result);
-        fragsize = 0;
+        for (end = result + 8; result < end; result += uspi->s_fpb) {
-        for (possition = 0, mask = 1; possition < 8; possition++, mask <<= 1) {
+                blockmap = ubh_blkmap(UCPI_UBH(ucpi), ucpi->c_freeoff, result);
-                if (blockmap & mask) {
+                blockmap <<= 1;
-                        if (!(possition & uspi->s_fpbmask))
+                mask = mask_arr[count];
-                                fragsize = 1;
+                want = want_arr[count];
-                        else 
+                for (pos = 0; pos <= uspi->s_fpb - count; pos++) {
-                                fragsize++;
+                        if ((blockmap & mask) == want) {
-                }
+                                UFSD("EXIT, result %u\n", result);
-                else {
+                                return result + pos;
-                        if (fragsize == count) {
+                        }
-                                result += possition - count;
+                        mask <<= 1;
-                                UFSD(("EXIT, result %u\n", result))
+                        want <<= 1;
-                                return result;
+                }
-                        }
+        }
-                        fragsize = 0;
-                }
+        ufs_error(sb, "ufs_bitmap_search", "block not in map on cg %u\n",
-        }
+                  ucpi->c_cgx);
-        if (fragsize == count) {
+        UFSD("EXIT (FAILED)\n");
-                result += possition - count;
-                UFSD(("EXIT, result %u\n", result))
-                return result;
-        }
-        ufs_error (sb, "ufs_bitmap_search", "block not in map on cg %u\n", ucpi->c_cgx);
-        UFSD(("EXIT (FAILED)\n"))
        return (unsigned)-1;
 }
@@ -734,9 +794,9 @@ static void ufs_clusteracct(struct super_block * sb,
                return;
        if (cnt > 0)
-                ubh_setbit(UCPI_UBH, ucpi->c_clusteroff, blkno);
+                ubh_setbit(UCPI_UBH(ucpi), ucpi->c_clusteroff, blkno);
        else
-                ubh_clrbit(UCPI_UBH, ucpi->c_clusteroff, blkno);
+                ubh_clrbit(UCPI_UBH(ucpi), ucpi->c_clusteroff, blkno);
        /*
         * Find the size of the cluster going forward.
@@ -745,7 +805,7 @@ static void ufs_clusteracct(struct super_block * sb,
        end = start + uspi->s_contigsumsize;
        if ( end >= ucpi->c_nclusterblks)
                end = ucpi->c_nclusterblks;
-        i = ubh_find_next_zero_bit (UCPI_UBH, ucpi->c_clusteroff, end, start);
+        i = ubh_find_next_zero_bit (UCPI_UBH(ucpi), ucpi->c_clusteroff, end, start);
        if (i > end)
                i = end;
        forw = i - start;
@@ -757,7 +817,7 @@ static void ufs_clusteracct(struct super_block * sb,
        end = start - uspi->s_contigsumsize;
        if (end < 0 ) 
                end = -1;
-        i = ubh_find_last_zero_bit (UCPI_UBH, ucpi->c_clusteroff, start, end);
+        i = ubh_find_last_zero_bit (UCPI_UBH(ucpi), ucpi->c_clusteroff, start, end);
        if ( i < end) 
                i = end;
        back = start - i;
@@ -769,11 +829,11 @@ static void ufs_clusteracct(struct super_block * sb,
        i = back + forw + 1;
        if (i > uspi->s_contigsumsize)
                i = uspi->s_contigsumsize;
-        fs32_add(sb, (__fs32*)ubh_get_addr(UCPI_UBH, ucpi->c_clustersumoff + (i << 2)), cnt);
+        fs32_add(sb, (__fs32*)ubh_get_addr(UCPI_UBH(ucpi), ucpi->c_clustersumoff + (i << 2)), cnt);
        if (back > 0)
-                fs32_sub(sb, (__fs32*)ubh_get_addr(UCPI_UBH, ucpi->c_clustersumoff + (back << 2)), cnt);
+                fs32_sub(sb, (__fs32*)ubh_get_addr(UCPI_UBH(ucpi), ucpi->c_clustersumoff + (back << 2)), cnt);
        if (forw > 0)
-                fs32_sub(sb, (__fs32*)ubh_get_addr(UCPI_UBH, ucpi->c_clustersumoff + (forw << 2)), cnt);
+                fs32_sub(sb, (__fs32*)ubh_get_addr(UCPI_UBH(ucpi), ucpi->c_clustersumoff + (forw << 2)), cnt);
 }
diff --git a/fs/ufs/cylinder.c b/fs/ufs/cylinder.c
index 14abb8b835f7..09c39e5e6386 100644
--- a/fs/ufs/cylinder.c
+++ b/fs/ufs/cylinder.c
@@ -20,15 +20,6 @@
 #include "swab.h"
 #include "util.h"
-#undef UFS_CYLINDER_DEBUG
-#ifdef UFS_CYLINDER_DEBUG
-#define UFSD(x) printk("(%s, %d), %s:", __FILE__, __LINE__, __FUNCTION__); printk x;
-#else
-#define UFSD(x)
-#endif
 /*
 * Read cylinder group into cache. The memory space for ufs_cg_private_info
 * structure is already allocated during ufs_read_super.
@@ -42,19 +33,19 @@ static void ufs_read_cylinder (struct super_block * sb,
        struct ufs_cylinder_group * ucg;
        unsigned i, j;
-        UFSD(("ENTER, cgno %u, bitmap_nr %u\n", cgno, bitmap_nr))
+        UFSD("ENTER, cgno %u, bitmap_nr %u\n", cgno, bitmap_nr);
        uspi = sbi->s_uspi;
        ucpi = sbi->s_ucpi[bitmap_nr];
        ucg = (struct ufs_cylinder_group *)sbi->s_ucg[cgno]->b_data;
-        UCPI_UBH->fragment = ufs_cgcmin(cgno);
+        UCPI_UBH(ucpi)->fragment = ufs_cgcmin(cgno);
-        UCPI_UBH->count = uspi->s_cgsize >> sb->s_blocksize_bits;
+        UCPI_UBH(ucpi)->count = uspi->s_cgsize >> sb->s_blocksize_bits;
        /*
         * We have already the first fragment of cylinder group block in buffer
         */
-        UCPI_UBH->bh[0] = sbi->s_ucg[cgno];
+        UCPI_UBH(ucpi)->bh[0] = sbi->s_ucg[cgno];
-        for (i = 1; i < UCPI_UBH->count; i++)
+        for (i = 1; i < UCPI_UBH(ucpi)->count; i++)
-                if (!(UCPI_UBH->bh[i] = sb_bread(sb, UCPI_UBH->fragment + i)))
+                if (!(UCPI_UBH(ucpi)->bh[i] = sb_bread(sb, UCPI_UBH(ucpi)->fragment + i)))
                        goto failed;
        sbi->s_cgno[bitmap_nr] = cgno;
                        
@@ -73,7 +64,7 @@ static void ufs_read_cylinder (struct super_block * sb,
        ucpi->c_clustersumoff = fs32_to_cpu(sb, ucg->cg_u.cg_44.cg_clustersumoff);
        ucpi->c_clusteroff = fs32_to_cpu(sb, ucg->cg_u.cg_44.cg_clusteroff);
        ucpi->c_nclusterblks = fs32_to_cpu(sb, ucg->cg_u.cg_44.cg_nclusterblks);
-        UFSD(("EXIT\n"))
+        UFSD("EXIT\n");
        return; 
        
 failed:
@@ -95,15 +86,15 @@ void ufs_put_cylinder (struct super_block * sb, unsigned bitmap_nr)
        struct ufs_cylinder_group * ucg;
        unsigned i;
-        UFSD(("ENTER, bitmap_nr %u\n", bitmap_nr))
+        UFSD("ENTER, bitmap_nr %u\n", bitmap_nr);
        uspi = sbi->s_uspi;
        if (sbi->s_cgno[bitmap_nr] == UFS_CGNO_EMPTY) {
-                UFSD(("EXIT\n"))
+                UFSD("EXIT\n");
                return;
        }
        ucpi = sbi->s_ucpi[bitmap_nr];
-        ucg = ubh_get_ucg(UCPI_UBH);
+        ucg = ubh_get_ucg(UCPI_UBH(ucpi));
        if (uspi->s_ncg > UFS_MAX_GROUP_LOADED && bitmap_nr >= sbi->s_cg_loaded) {
                ufs_panic (sb, "ufs_put_cylinder", "internal error");
@@ -116,13 +107,13 @@ void ufs_put_cylinder (struct super_block * sb, unsigned bitmap_nr)
        ucg->cg_rotor = cpu_to_fs32(sb, ucpi->c_rotor);
        ucg->cg_frotor = cpu_to_fs32(sb, ucpi->c_frotor);
        ucg->cg_irotor = cpu_to_fs32(sb, ucpi->c_irotor);
-        ubh_mark_buffer_dirty (UCPI_UBH);
+        ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
-        for (i = 1; i < UCPI_UBH->count; i++) {
+        for (i = 1; i < UCPI_UBH(ucpi)->count; i++) {
-                brelse (UCPI_UBH->bh[i]);
+                brelse (UCPI_UBH(ucpi)->bh[i]);
        }
        sbi->s_cgno[bitmap_nr] = UFS_CGNO_EMPTY;
-        UFSD(("EXIT\n"))
+        UFSD("EXIT\n");
 }
 /*
@@ -139,7 +130,7 @@ struct ufs_cg_private_info * ufs_load_cylinder (
        struct ufs_cg_private_info * ucpi;
        unsigned cg, i, j;
-        UFSD(("ENTER, cgno %u\n", cgno))
+        UFSD("ENTER, cgno %u\n", cgno);
        uspi = sbi->s_uspi;
        if (cgno >= uspi->s_ncg) {
@@ -150,7 +141,7 @@ struct ufs_cg_private_info * ufs_load_cylinder (
         * Cylinder group number cg it in cache and it was last used
         */
        if (sbi->s_cgno[0] == cgno) {
-                UFSD(("EXIT\n"))
+                UFSD("EXIT\n");
                return sbi->s_ucpi[0];
        }
        /*
@@ -160,16 +151,16 @@ struct ufs_cg_private_info * ufs_load_cylinder (
                if (sbi->s_cgno[cgno] != UFS_CGNO_EMPTY) {
                        if (sbi->s_cgno[cgno] != cgno) {
                                ufs_panic (sb, "ufs_load_cylinder", "internal error, wrong number of cg in cache");
-                                UFSD(("EXIT (FAILED)\n"))
+                                UFSD("EXIT (FAILED)\n");
                                return NULL;
                        }
                        else {
-                                UFSD(("EXIT\n"))
+                                UFSD("EXIT\n");
                                return sbi->s_ucpi[cgno];
                        }
                } else {
                        ufs_read_cylinder (sb, cgno, cgno);
-                        UFSD(("EXIT\n"))
+                        UFSD("EXIT\n");
                        return sbi->s_ucpi[cgno];
                }
        }
@@ -204,6 +195,6 @@ struct ufs_cg_private_info * ufs_load_cylinder (
                sbi->s_ucpi[0] = ucpi;
                ufs_read_cylinder (sb, cgno, 0);
        }
-        UFSD(("EXIT\n"))
+        UFSD("EXIT\n");
        return sbi->s_ucpi[0];
 }
diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c
index 1a561202d3f4..7f0a0aa63584 100644
--- a/fs/ufs/dir.c
+++ b/fs/ufs/dir.c
@@ -11,31 +11,20 @@
 * 4.4BSD (FreeBSD) support added on February 1st 1998 by
 * Niels Kristian Bech Jensen <nkbj@image.dk> partially based
 * on code by Martin von Loewis <martin@mira.isdn.cs.tu-berlin.de>.
+ *
+ * Migration to usage of "page cache" on May 2006 by
+ * Evgeniy Dushistov <dushistov@mail.ru> based on ext2 code base.
 */
 #include <linux/time.h>
 #include <linux/fs.h>
 #include <linux/ufs_fs.h>
 #include <linux/smp_lock.h>
-#include <linux/buffer_head.h>
 #include <linux/sched.h>
 #include "swab.h"
 #include "util.h"
-#undef UFS_DIR_DEBUG
-#ifdef UFS_DIR_DEBUG
-#define UFSD(x) printk("(%s, %d), %s: ", __FILE__, __LINE__, __FUNCTION__); printk x;
-#else
-#define UFSD(x)
-#endif
-static int
-ufs_check_dir_entry (const char *, struct inode *, struct ufs_dir_entry *,
-                     struct buffer_head *, unsigned long);
 /*
 * NOTE! unlike strncmp, ufs_match returns 1 for success, 0 for failure.
 *
@@ -51,495 +40,541 @@ static inline int ufs_match(struct super_block *sb, int len,
        return !memcmp(name, de->d_name, len);
 }
-/*
+static int ufs_commit_chunk(struct page *page, unsigned from, unsigned to)
- * This is blatantly stolen from ext2fs
- */
-static int
-ufs_readdir (struct file * filp, void * dirent, filldir_t filldir)
 {
-        struct inode *inode = filp->f_dentry->d_inode;
+        struct inode *dir = page->mapping->host;
-        int error = 0;
+        int err = 0;
-        unsigned long offset, lblk;
+        dir->i_version++;
-        int i, stored;
+        page->mapping->a_ops->commit_write(NULL, page, from, to);
-        struct buffer_head * bh;
+        if (IS_DIRSYNC(dir))
-        struct ufs_dir_entry * de;
+                err = write_one_page(page, 1);
-        struct super_block * sb;
+        else
-        int de_reclen;
+                unlock_page(page);
-        unsigned flags;
+        return err;
-        u64     blk= 0L;
+}
-        lock_kernel();
-        sb = inode->i_sb;
-        flags = UFS_SB(sb)->s_flags;
-        UFSD(("ENTER, ino %lu  f_pos %lu\n", inode->i_ino, (unsigned long) filp->f_pos))
-        stored = 0;
-        bh = NULL;
-        offset = filp->f_pos & (sb->s_blocksize - 1);
-        while (!error && !stored && filp->f_pos < inode->i_size) {
-                lblk = (filp->f_pos) >> sb->s_blocksize_bits;
-                blk = ufs_frag_map(inode, lblk);
-                if (!blk || !(bh = sb_bread(sb, blk))) {
-                        /* XXX - error - skip to the next block */
-                        printk("ufs_readdir: "
-                               "dir inode %lu has a hole at offset %lu\n",
-                               inode->i_ino, (unsigned long int)filp->f_pos);
-                        filp->f_pos += sb->s_blocksize - offset;
-                        continue;
-                }
-revalidate:
-                /* If the dir block has changed since the last call to
-                 * readdir(2), then we might be pointing to an invalid
-                 * dirent right now.  Scan from the start of the block
-                 * to make sure. */
-                if (filp->f_version != inode->i_version) {
-                        for (i = 0; i < sb->s_blocksize && i < offset; ) {
-                                de = (struct ufs_dir_entry *)(bh->b_data + i);
-                                /* It's too expensive to do a full
-                                 * dirent test each time round this
-                                 * loop, but we do have to test at
-                                 * least that it is non-zero.  A
-                                 * failure will be detected in the
-                                 * dirent test below. */
-                                de_reclen = fs16_to_cpu(sb, de->d_reclen);
-                                if (de_reclen < 1)
-                                        break;
-                                i += de_reclen;
-                        }
-                        offset = i;
-                        filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1))
-                                | offset;
-                        filp->f_version = inode->i_version;
-                }
-                while (!error && filp->f_pos < inode->i_size
+static inline void ufs_put_page(struct page *page)
-                       && offset < sb->s_blocksize) {
+{
-                        de = (struct ufs_dir_entry *) (bh->b_data + offset);
+        kunmap(page);
-                        /* XXX - put in a real ufs_check_dir_entry() */
+        page_cache_release(page);
-                        if ((de->d_reclen == 0) || (ufs_get_de_namlen(sb, de) == 0)) {
+}
-                                filp->f_pos = (filp->f_pos &
-                                              (sb->s_blocksize - 1)) +
-                                               sb->s_blocksize;
-                                brelse(bh);
-                                unlock_kernel();
-                                return stored;
-                        }
-                        if (!ufs_check_dir_entry ("ufs_readdir", inode, de,
-                                                   bh, offset)) {
-                                /* On error, skip the f_pos to the
-                                   next block. */
-                                filp->f_pos = (filp->f_pos |
-                                              (sb->s_blocksize - 1)) +
-                                               1;
-                                brelse (bh);
-                                unlock_kernel();
-                                return stored;
-                        }
-                        offset += fs16_to_cpu(sb, de->d_reclen);
-                        if (de->d_ino) {
-                                /* We might block in the next section
-                                 * if the data destination is
-                                 * currently swapped out.  So, use a
-                                 * version stamp to detect whether or
-                                 * not the directory has been modified
-                                 * during the copy operation. */
-                                unsigned long version = filp->f_version;
-                                unsigned char d_type = DT_UNKNOWN;
-                                UFSD(("filldir(%s,%u)\n", de->d_name,
+static inline unsigned long ufs_dir_pages(struct inode *inode)
-                                                        fs32_to_cpu(sb, de->d_ino)))
+{
-                                UFSD(("namlen %u\n", ufs_get_de_namlen(sb, de)))
+        return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT;
+}
-                                if ((flags & UFS_DE_MASK) == UFS_DE_44BSD)
+ino_t ufs_inode_by_name(struct inode *dir, struct dentry *dentry)
-                                        d_type = de->d_u.d_44.d_type;
+{
-                                error = filldir(dirent, de->d_name,
+        ino_t res = 0;
-                                                ufs_get_de_namlen(sb, de), filp->f_pos,
+        struct ufs_dir_entry *de;
-                                                fs32_to_cpu(sb, de->d_ino), d_type);
+        struct page *page;
-                                if (error)
+        
-                                        break;
+        de = ufs_find_entry(dir, dentry, &page);
-                                if (version != filp->f_version)
+        if (de) {
-                                        goto revalidate;
+                res = fs32_to_cpu(dir->i_sb, de->d_ino);
-                                stored ++;
+                ufs_put_page(page);
-                        }
-                        filp->f_pos += fs16_to_cpu(sb, de->d_reclen);
-                }
-                offset = 0;
-                brelse (bh);
        }
-        unlock_kernel();
+        return res;
-        return 0;
 }
-/*
- * define how far ahead to read directories while searching them.
- */
-#define NAMEI_RA_CHUNKS  2
-#define NAMEI_RA_BLOCKS  4
-#define NAMEI_RA_SIZE        (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
-#define NAMEI_RA_INDEX(c,b)  (((c) * NAMEI_RA_BLOCKS) + (b))
-/*
+/* Releases the page */
- *      ufs_find_entry()
+void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de,
- *
+                  struct page *page, struct inode *inode)
- * finds an entry in the specified directory with the wanted name. It
- * returns the cache buffer in which the entry was found, and the entry
- * itself (as a parameter - res_bh). It does NOT read the inode of the
- * entry - you'll have to do that yourself if you want to.
- */
-struct ufs_dir_entry * ufs_find_entry (struct dentry *dentry,
-        struct buffer_head ** res_bh)
 {
-        struct super_block * sb;
+        unsigned from = (char *) de - (char *) page_address(page);
-        struct buffer_head * bh_use[NAMEI_RA_SIZE];
+        unsigned to = from + fs16_to_cpu(dir->i_sb, de->d_reclen);
-        struct buffer_head * bh_read[NAMEI_RA_SIZE];
+        int err;
-        unsigned long offset;
-        int block, toread, i, err;
-        struct inode *dir = dentry->d_parent->d_inode;
-        const char *name = dentry->d_name.name;
-        int namelen = dentry->d_name.len;
-        UFSD(("ENTER, dir_ino %lu, name %s, namlen %u\n", dir->i_ino, name, namelen))
+        lock_page(page);
-        
+        err = page->mapping->a_ops->prepare_write(NULL, page, from, to);
-        *res_bh = NULL;
+        BUG_ON(err);
-        
+        de->d_ino = cpu_to_fs32(dir->i_sb, inode->i_ino);
-        sb = dir->i_sb;
+        ufs_set_de_type(dir->i_sb, de, inode->i_mode);
-        
+        err = ufs_commit_chunk(page, from, to);
-        if (namelen > UFS_MAXNAMLEN)
+        ufs_put_page(page);
-                return NULL;
+        dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
+        mark_inode_dirty(dir);
+}
-        memset (bh_use, 0, sizeof (bh_use));
-        toread = 0;
-        for (block = 0; block < NAMEI_RA_SIZE; ++block) {
-                struct buffer_head * bh;
-                if ((block << sb->s_blocksize_bits) >= dir->i_size)
+static void ufs_check_page(struct page *page)
-                        break;
+{
-                bh = ufs_getfrag (dir, block, 0, &err);
+        struct inode *dir = page->mapping->host;
-                bh_use[block] = bh;
+        struct super_block *sb = dir->i_sb;
-                if (bh && !buffer_uptodate(bh))
+        char *kaddr = page_address(page);
-                        bh_read[toread++] = bh;
+        unsigned offs, rec_len;
+        unsigned limit = PAGE_CACHE_SIZE;
+        struct ufs_dir_entry *p;
+        char *error;
+        if ((dir->i_size >> PAGE_CACHE_SHIFT) == page->index) {
+                limit = dir->i_size & ~PAGE_CACHE_MASK;
+                if (limit & (UFS_SECTOR_SIZE - 1))
+                        goto Ebadsize;
+                if (!limit)
+                        goto out;
        }
+        for (offs = 0; offs <= limit - UFS_DIR_REC_LEN(1); offs += rec_len) {
+                p = (struct ufs_dir_entry *)(kaddr + offs);
+                rec_len = fs16_to_cpu(sb, p->d_reclen);
+                if (rec_len < UFS_DIR_REC_LEN(1))
+                        goto Eshort;
+                if (rec_len & 3)
+                        goto Ealign;
+                if (rec_len < UFS_DIR_REC_LEN(ufs_get_de_namlen(sb, p)))
+                        goto Enamelen;
+                if (((offs + rec_len - 1) ^ offs) & ~(UFS_SECTOR_SIZE-1))
+                        goto Espan;
+                if (fs32_to_cpu(sb, p->d_ino) > (UFS_SB(sb)->s_uspi->s_ipg *
+                                                  UFS_SB(sb)->s_uspi->s_ncg))
+                        goto Einumber;
+        }
+        if (offs != limit)
+                goto Eend;
+out:
+        SetPageChecked(page);
+        return;
+        /* Too bad, we had an error */
+Ebadsize:
+        ufs_error(sb, "ufs_check_page",
+                  "size of directory #%lu is not a multiple of chunk size",
+                  dir->i_ino
+        );
+        goto fail;
+Eshort:
+        error = "rec_len is smaller than minimal";
+        goto bad_entry;
+Ealign:
+        error = "unaligned directory entry";
+        goto bad_entry;
+Enamelen:
+        error = "rec_len is too small for name_len";
+        goto bad_entry;
+Espan:
+        error = "directory entry across blocks";
+        goto bad_entry;
+Einumber:
+        error = "inode out of bounds";
+bad_entry:
+        ufs_error (sb, "ufs_check_page", "bad entry in directory #%lu: %s - "
+                   "offset=%lu, rec_len=%d, name_len=%d",
+                   dir->i_ino, error, (page->index<<PAGE_CACHE_SHIFT)+offs,
+                   rec_len, ufs_get_de_namlen(sb, p));
+        goto fail;
+Eend:
+        p = (struct ufs_dir_entry *)(kaddr + offs);
+        ufs_error (sb, "ext2_check_page",
+                   "entry in directory #%lu spans the page boundary"
+                   "offset=%lu",
+                   dir->i_ino, (page->index<<PAGE_CACHE_SHIFT)+offs);
+fail:
+        SetPageChecked(page);
+        SetPageError(page);
+}
-        for (block = 0, offset = 0; offset < dir->i_size; block++) {
+static struct page *ufs_get_page(struct inode *dir, unsigned long n)
-                struct buffer_head * bh;
+{
-                struct ufs_dir_entry * de;
+        struct address_space *mapping = dir->i_mapping;
-                char * dlimit;
+        struct page *page = read_cache_page(mapping, n,
+                                (filler_t*)mapping->a_ops->readpage, NULL);
-                if ((block % NAMEI_RA_BLOCKS) == 0 && toread) {
+        if (!IS_ERR(page)) {
-                        ll_rw_block (READ, toread, bh_read);
+                wait_on_page_locked(page);
-                        toread = 0;
+                kmap(page);
-                }
+                if (!PageUptodate(page))
-                bh = bh_use[block % NAMEI_RA_SIZE];
+                        goto fail;
-                if (!bh) {
+                if (!PageChecked(page))
-                        ufs_error (sb, "ufs_find_entry", 
+                        ufs_check_page(page);
-                                "directory #%lu contains a hole at offset %lu",
+                if (PageError(page))
-                                dir->i_ino, offset);
+                        goto fail;
-                        offset += sb->s_blocksize;
-                        continue;
-                }
-                wait_on_buffer (bh);
-                if (!buffer_uptodate(bh)) {
-                        /*
-                         * read error: all bets are off
-                         */
-                        break;
-                }
-                de = (struct ufs_dir_entry *) bh->b_data;
-                dlimit = bh->b_data + sb->s_blocksize;
-                while ((char *) de < dlimit && offset < dir->i_size) {
-                        /* this code is executed quadratically often */
-                        /* do minimal checking by hand */
-                        int de_len;
-                        if ((char *) de + namelen <= dlimit &&
-                            ufs_match(sb, namelen, name, de)) {
-                                /* found a match -
-                                just to be sure, do a full check */
-                                if (!ufs_check_dir_entry("ufs_find_entry",
-                                    dir, de, bh, offset))
-                                        goto failed;
-                                for (i = 0; i < NAMEI_RA_SIZE; ++i) {
-                                        if (bh_use[i] != bh)
-                                                brelse (bh_use[i]);
-                                }
-                                *res_bh = bh;
-                                return de;
-                        }
-                        /* prevent looping on a bad block */
-                        de_len = fs16_to_cpu(sb, de->d_reclen);
-                        if (de_len <= 0)
-                                goto failed;
-                        offset += de_len;
-                        de = (struct ufs_dir_entry *) ((char *) de + de_len);
-                }
-                brelse (bh);
-                if (((block + NAMEI_RA_SIZE) << sb->s_blocksize_bits ) >=
-                    dir->i_size)
-                        bh = NULL;
-                else
-                        bh = ufs_getfrag (dir, block + NAMEI_RA_SIZE, 0, &err);
-                bh_use[block % NAMEI_RA_SIZE] = bh;
-                if (bh && !buffer_uptodate(bh))
-                        bh_read[toread++] = bh;
        }
+        return page;
-failed:
+fail:
-        for (i = 0; i < NAMEI_RA_SIZE; ++i) brelse (bh_use[i]);
+        ufs_put_page(page);
-        UFSD(("EXIT\n"))
+        return ERR_PTR(-EIO);
-        return NULL;
 }
-static int
-ufs_check_dir_entry (const char *function, struct inode *dir,
-                     struct ufs_dir_entry *de, struct buffer_head *bh,
-                     unsigned long offset)
+/*
+ * Number of valid directory bytes in page `page_nr' of `inode':
+ * i_size minus the page's starting offset, limited to PAGE_CACHE_SIZE.
+ */
+static unsigned
+ufs_last_byte(struct inode *inode, unsigned long page_nr)
 {
-        struct super_block *sb = dir->i_sb;
-        const char *error_msg = NULL;
-        int rlen = fs16_to_cpu(sb, de->d_reclen);
-        if (rlen < UFS_DIR_REC_LEN(1))
-                error_msg = "reclen is smaller than minimal";
-        else if (rlen % 4 != 0)
-                error_msg = "reclen % 4 != 0";
-        else if (rlen < UFS_DIR_REC_LEN(ufs_get_de_namlen(sb, de)))
-                error_msg = "reclen is too small for namlen";
-        else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize)
-                error_msg = "directory entry across blocks";
-        else if (fs32_to_cpu(sb, de->d_ino) > (UFS_SB(sb)->s_uspi->s_ipg *
-                                      UFS_SB(sb)->s_uspi->s_ncg))
-                error_msg = "inode out of bounds";
-        if (error_msg != NULL)
-                ufs_error (sb, function, "bad entry in directory #%lu, size %Lu: %s - "
-                            "offset=%lu, inode=%lu, reclen=%d, namlen=%d",
-                            dir->i_ino, dir->i_size, error_msg, offset,
-                            (unsigned long)fs32_to_cpu(sb, de->d_ino),
-                            rlen, ufs_get_de_namlen(sb, de));
-        
-        return (error_msg == NULL ? 1 : 0);
+        unsigned valid = inode->i_size;
+        valid -= page_nr << PAGE_CACHE_SHIFT;
+        return valid > PAGE_CACHE_SIZE ? PAGE_CACHE_SIZE : valid;
 }
-struct ufs_dir_entry *ufs_dotdot(struct inode *dir, struct buffer_head **p)
+/*
+ * Step from entry `p' to the entry that follows it, using the record
+ * length stored in `p' (decoded with fs16_to_cpu()).
+ */
+static inline struct ufs_dir_entry *
+ufs_next_entry(struct super_block *sb, struct ufs_dir_entry *p)
 {
-        int err;
-        struct buffer_head *bh = ufs_bread (dir, 0, 0, &err);
-        struct ufs_dir_entry *res = NULL;
-        if (bh) {
-                res = (struct ufs_dir_entry *) bh->b_data;
-                res = (struct ufs_dir_entry *)((char *)res +
-                        fs16_to_cpu(dir->i_sb, res->d_reclen));
-        }
-        *p = bh;
-        return res;
+        unsigned reclen = fs16_to_cpu(sb, p->d_reclen);
+        return (struct ufs_dir_entry *)((char *)p + reclen);
 }
-ino_t ufs_inode_by_name(struct inode * dir, struct dentry *dentry)
+/*
+ * Return the second entry of page 0 of `dir' (the entry following the
+ * first one) and hand the page back through `*p'.  When page 0 cannot
+ * be obtained, NULL is returned and `*p' is set to NULL so the caller
+ * never sees an indeterminate page pointer.
+ */
+struct ufs_dir_entry *ufs_dotdot(struct inode *dir, struct page **p)
 {
-        ino_t res = 0;
-        struct ufs_dir_entry * de;
-        struct buffer_head *bh;
-        de = ufs_find_entry (dentry, &bh);
-        if (de) {
-                res = fs32_to_cpu(dir->i_sb, de->d_ino);
-                brelse(bh);
+        struct page *page = ufs_get_page(dir, 0);
+        struct ufs_dir_entry *de = NULL;
+        /* Define the out-parameter on the failure path as well. */
+        *p = NULL;
+        if (!IS_ERR(page)) {
+                de = ufs_next_entry(dir->i_sb,
+                                    (struct ufs_dir_entry *)page_address(page));
+                *p = page;
         }
-        return res;
+        return de;
 }
-void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de,
-                struct buffer_head *bh, struct inode *inode)
+/*
+ *      ufs_find_entry()
+ *
+ * finds an entry in the specified directory with the wanted name. It
+ * returns the entry itself and, through res_page, the page in which the
+ * entry was found. Page is returned mapped and unlocked.
+ * Entry is guaranteed to be valid.
+ *
+ * The scan starts at the page cached in i_dir_start_lookup and wraps
+ * around; pages that cannot be read are skipped. NULL is returned, with
+ * *res_page set to NULL, when nothing matches, when the directory has no
+ * pages, when the name is longer than UFS_MAXNAMLEN, or when a
+ * zero-length entry is encountered.
+ */
+struct ufs_dir_entry *ufs_find_entry(struct inode *dir, struct dentry *dentry,
+                                     struct page **res_page)
 {
-        dir->i_version++;
-        de->d_ino = cpu_to_fs32(dir->i_sb, inode->i_ino);
-        mark_buffer_dirty(bh);
-        if (IS_DIRSYNC(dir))
-                sync_dirty_buffer(bh);
-        brelse (bh);
+        struct super_block *sb = dir->i_sb;
+        const char *name = dentry->d_name.name;
+        int namelen = dentry->d_name.len;
+        unsigned reclen = UFS_DIR_REC_LEN(namelen);
+        unsigned long start, n;
+        unsigned long npages = ufs_dir_pages(dir);
+        struct page *page = NULL;
+        struct ufs_inode_info *ui = UFS_I(dir);
+        struct ufs_dir_entry *de;
+        UFSD("ENTER, dir_ino %lu, name %s, namlen %u\n", dir->i_ino, name, namelen);
+        /* Define the out-parameter before any early exit can be taken. */
+        *res_page = NULL;
+        if (npages == 0 || namelen > UFS_MAXNAMLEN)
+                goto out;
+        /* OFFSET_CACHE */
+        start = ui->i_dir_start_lookup;
+        if (start >= npages)
+                start = 0;
+        n = start;
+        do {
+                char *kaddr;
+                page = ufs_get_page(dir, n);
+                if (!IS_ERR(page)) {
+                        kaddr = page_address(page);
+                        de = (struct ufs_dir_entry *) kaddr;
+                        /* last position where an entry of this name length still fits */
+                        kaddr += ufs_last_byte(dir, n) - reclen;
+                        while ((char *) de <= kaddr) {
+                                if (de->d_reclen == 0) {
+                                        ufs_error(dir->i_sb, __FUNCTION__,
+                                                  "zero-length directory entry");
+                                        ufs_put_page(page);
+                                        goto out;
+                                }
+                                if (ufs_match(sb, namelen, name, de))
+                                        goto found;
+                                de = ufs_next_entry(sb, de);
+                        }
+                        ufs_put_page(page);
+                }
+                if (++n >= npages)
+                        n = 0;
+        } while (n != start);
+out:
+        return NULL;
+found:
+        *res_page = page;
+        /* remember where the hit was so the next lookup starts there */
+        ui->i_dir_start_lookup = n;
+        return de;
 }
 /*
- *      ufs_add_entry()
+ *      Parent is locked.
+ *
+ * Adds an entry for `inode', named by `dentry', to the parent directory
+ * of `dentry'.  Scans the directory pages for a free or splittable slot
+ * and, on reaching i_size, starts a new UFS_SECTOR_SIZE record there.
+ * Returns -EEXIST if the name is already present, -EIO on a zero-length
+ * entry, the error from ufs_get_page() or prepare_write(), and otherwise
+ * the result of ufs_commit_chunk().
- *
- * adds a file entry to the specified directory, using the same
- * semantics as ufs_find_entry(). It returns NULL if it failed.
 */
 int ufs_add_link(struct dentry *dentry, struct inode *inode)
 {
-        struct super_block * sb;
-        struct ufs_sb_private_info * uspi;
-        unsigned long offset;
-        unsigned fragoff;
-        unsigned short rec_len;
-        struct buffer_head * bh;
-        struct ufs_dir_entry * de, * de1;
        struct inode *dir = dentry->d_parent->d_inode;
        const char *name = dentry->d_name.name;
        int namelen = dentry->d_name.len;
+        struct super_block *sb = dir->i_sb;
+        unsigned reclen = UFS_DIR_REC_LEN(namelen);
+        unsigned short rec_len, name_len;
+        struct page *page = NULL;
+        struct ufs_dir_entry *de;
+        unsigned long npages = ufs_dir_pages(dir);
+        unsigned long n;
+        char *kaddr;
+        unsigned from, to;
        int err;
-        UFSD(("ENTER, name %s, namelen %u\n", name, namelen))
+        UFSD("ENTER, name %s, namelen %u\n", name, namelen);
-        
-        sb = dir->i_sb;
+        /*
-        uspi = UFS_SB(sb)->s_uspi;
+         * We take care of directory expansion in the same loop.
+         * This code plays outside i_size, so it locks the page
-        if (!namelen)
+         * to protect that region.
-                return -EINVAL;
+         */
-        bh = ufs_bread (dir, 0, 0, &err);
+        for (n = 0; n <= npages; n++) {
-        if (!bh)
+                char *dir_end;
-                return err;
-        rec_len = UFS_DIR_REC_LEN(namelen);
+                page = ufs_get_page(dir, n);
-        offset = 0;
+                err = PTR_ERR(page);
-        de = (struct ufs_dir_entry *) bh->b_data;
+                if (IS_ERR(page))
-        while (1) {
+                        goto out;
-                if ((char *)de >= UFS_SECTOR_SIZE + bh->b_data) {
+                lock_page(page);
-                        fragoff = offset & ~uspi->s_fmask;
+                kaddr = page_address(page);
-                        if (fragoff != 0 && fragoff != UFS_SECTOR_SIZE)
+                dir_end = kaddr + ufs_last_byte(dir, n);
-                                ufs_error (sb, "ufs_add_entry", "internal error"
+                de = (struct ufs_dir_entry *)kaddr;
-                                        " fragoff %u", fragoff);
+                kaddr += PAGE_CACHE_SIZE - reclen;
-                        if (!fragoff) {
+                while ((char *)de <= kaddr) {
-                                brelse (bh);
+                        if ((char *)de == dir_end) {
-                                bh = ufs_bread (dir, offset >> sb->s_blocksize_bits, 1, &err);
+                                /* We hit i_size */
-                                if (!bh)
+                                name_len = 0;
-                                        return err;
+                                rec_len = UFS_SECTOR_SIZE;
-                        }
-                        if (dir->i_size <= offset) {
-                                if (dir->i_size == 0) {
-                                        brelse(bh);
-                                        return -ENOENT;
-                                }
-                                de = (struct ufs_dir_entry *) (bh->b_data + fragoff);
-                                de->d_ino = 0;
                                de->d_reclen = cpu_to_fs16(sb, UFS_SECTOR_SIZE);
-                                ufs_set_de_namlen(sb, de, 0);
+                                de->d_ino = 0;
-                                dir->i_size = offset + UFS_SECTOR_SIZE;
+                                goto got_it;
-                                mark_inode_dirty(dir);
-                        } else {
-                                de = (struct ufs_dir_entry *) bh->b_data;
                        }
+                        if (de->d_reclen == 0) {
+                                ufs_error(dir->i_sb, __FUNCTION__,
+                                          "zero-length directory entry");
+                                err = -EIO;
+                                goto out_unlock;
+                        }
+                        err = -EEXIST;
+                        if (ufs_match(sb, namelen, name, de))
+                                goto out_unlock;
+                        name_len = UFS_DIR_REC_LEN(ufs_get_de_namlen(sb, de));
+                        rec_len = fs16_to_cpu(sb, de->d_reclen);
+                        /* unused slot (d_ino == 0) whose record is large enough */
+                        if (!de->d_ino && rec_len >= reclen)
+                                goto got_it;
+                        /* used slot with enough room left after its own name */
+                        if (rec_len >= name_len + reclen)
+                                goto got_it;
+                        de = (struct ufs_dir_entry *) ((char *) de + rec_len);
                }
-                if (!ufs_check_dir_entry ("ufs_add_entry", dir, de, bh, offset)) {
+                unlock_page(page);
-                        brelse (bh);
+                ufs_put_page(page);
-                        return -ENOENT;
-                }
-                if (ufs_match(sb, namelen, name, de)) {
-                        brelse (bh);
-                        return -EEXIST;
-                }
-                if (de->d_ino == 0 && fs16_to_cpu(sb, de->d_reclen) >= rec_len)
-                        break;
-                        
-                if (fs16_to_cpu(sb, de->d_reclen) >=
-                     UFS_DIR_REC_LEN(ufs_get_de_namlen(sb, de)) + rec_len)
-                        break;
-                offset += fs16_to_cpu(sb, de->d_reclen);
-                de = (struct ufs_dir_entry *) ((char *) de + fs16_to_cpu(sb, de->d_reclen));
        }
+        /* NOTE(review): expected to be unreachable -- the loop should always leave via a goto */
+        BUG();
+        return -EINVAL;
+got_it:
+        /* [from, to) is the byte range of the chosen record within the page */
+        from = (char*)de - (char*)page_address(page);
+        to = from + rec_len;
+        err = page->mapping->a_ops->prepare_write(NULL, page, from, to);
+        if (err)
+                goto out_unlock;
+        /* Record is in use: trim it to name_len bytes and take the remainder. */
        if (de->d_ino) {
-                de1 = (struct ufs_dir_entry *) ((char *) de +
+                struct ufs_dir_entry *de1 =
-                        UFS_DIR_REC_LEN(ufs_get_de_namlen(sb, de)));
+                        (struct ufs_dir_entry *) ((char *) de + name_len);
-                de1->d_reclen =
+                de1->d_reclen = cpu_to_fs16(sb, rec_len - name_len);
-                        cpu_to_fs16(sb, fs16_to_cpu(sb, de->d_reclen) -
+                de->d_reclen = cpu_to_fs16(sb, name_len);
-                                UFS_DIR_REC_LEN(ufs_get_de_namlen(sb, de)));
-                de->d_reclen =
-                        cpu_to_fs16(sb, UFS_DIR_REC_LEN(ufs_get_de_namlen(sb, de)));
                de = de1;
        }
-        de->d_ino = 0;
        ufs_set_de_namlen(sb, de, namelen);
-        memcpy (de->d_name, name, namelen + 1);
+        memcpy(de->d_name, name, namelen + 1);
        de->d_ino = cpu_to_fs32(sb, inode->i_ino);
        ufs_set_de_type(sb, de, inode->i_mode);
-        mark_buffer_dirty(bh);
-        if (IS_DIRSYNC(dir))
+        err = ufs_commit_chunk(page, from, to);
-                sync_dirty_buffer(bh);
-        brelse (bh);
        dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
-        dir->i_version++;
        mark_inode_dirty(dir);
+        /* OFFSET_CACHE */
+out_put:
+        ufs_put_page(page);
+out:
+        return err;
+out_unlock:
+        unlock_page(page);
+        goto out_put;
+}
-        UFSD(("EXIT\n"))
+/*
+ * Re-synchronise `offset' with an entry boundary: walk the entries from
+ * the start of the chunk containing `offset' (offset & mask) and return
+ * the offset of the first entry at or beyond it.  The walk also stops
+ * early at a zero-length entry.
+ */
+static inline unsigned
+ufs_validate_entry(struct super_block *sb, char *base,
+                   unsigned offset, unsigned mask)
+{
+        char *target = base + offset;
+        char *cur = base + (offset & mask);
+        while (cur < target) {
+                struct ufs_dir_entry *entry = (struct ufs_dir_entry *)cur;
+                if (entry->d_reclen == 0)
+                        break;
+                cur = (char *)ufs_next_entry(sb, entry);
+        }
+        return cur - base;
+}
+/*
+ * This is blatantly stolen from ext2fs
+ *
+ * Feeds the entries of the directory behind `filp' to `filldir',
+ * starting at filp->f_pos and advancing f_pos past each entry consumed.
+ * Returns 0, or -EIO when a page cannot be read or a zero-length entry
+ * is found.
+ */
+static int
+ufs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+{
+        loff_t pos = filp->f_pos;
+        struct inode *inode = filp->f_dentry->d_inode;
+        struct super_block *sb = inode->i_sb;
+        unsigned int offset = pos & ~PAGE_CACHE_MASK;
+        unsigned long n = pos >> PAGE_CACHE_SHIFT;
+        unsigned long npages = ufs_dir_pages(inode);
+        unsigned chunk_mask = ~(UFS_SECTOR_SIZE - 1);
+        int need_revalidate = filp->f_version != inode->i_version;
+        unsigned flags = UFS_SB(sb)->s_flags;
+        UFSD("BEGIN\n");
+        /* nothing to read once fewer than UFS_DIR_REC_LEN(1) bytes remain before i_size */
+        if (pos > inode->i_size - UFS_DIR_REC_LEN(1))
+                return 0;
+        for ( ; n < npages; n++, offset = 0) {
+                char *kaddr, *limit;
+                struct ufs_dir_entry *de;
+                struct page *page = ufs_get_page(inode, n);
+                if (IS_ERR(page)) {
+                        ufs_error(sb, __FUNCTION__,
+                                  "bad page in #%lu",
+                                  inode->i_ino);
+                        /* advance f_pos by the remainder of this page */
+                        filp->f_pos += PAGE_CACHE_SIZE - offset;
+                        return -EIO;
+                }
+                kaddr = page_address(page);
+                /* f_version differs from i_version: re-align offset to an entry boundary */
+                if (unlikely(need_revalidate)) {
+                        if (offset) {
+                                offset = ufs_validate_entry(sb, kaddr, offset, chunk_mask);
+                                filp->f_pos = (n<<PAGE_CACHE_SHIFT) + offset;
+                        }
+                        filp->f_version = inode->i_version;
+                        need_revalidate = 0;
+                }
+                de = (struct ufs_dir_entry *)(kaddr+offset);
+                /* last address at which UFS_DIR_REC_LEN(1) bytes still fit in the valid part of the page */
+                limit = kaddr + ufs_last_byte(inode, n) - UFS_DIR_REC_LEN(1);
+                for ( ;(char*)de <= limit; de = ufs_next_entry(sb, de)) {
+                        if (de->d_reclen == 0) {
+                                ufs_error(sb, __FUNCTION__,
+                                        "zero-length directory entry");
+                                ufs_put_page(page);
+                                return -EIO;
+                        }
+                        /* entries with d_ino == 0 are not reported, only stepped over */
+                        if (de->d_ino) {
+                                int over;
+                                unsigned char d_type = DT_UNKNOWN;
+                                offset = (char *)de - kaddr;
+                                UFSD("filldir(%s,%u)\n", de->d_name,
+                                      fs32_to_cpu(sb, de->d_ino));
+                                UFSD("namlen %u\n", ufs_get_de_namlen(sb, de));
+                                if ((flags & UFS_DE_MASK) == UFS_DE_44BSD)
+                                        d_type = de->d_u.d_44.d_type;
+                                over = filldir(dirent, de->d_name,
+                                               ufs_get_de_namlen(sb, de),
+                                                (n<<PAGE_CACHE_SHIFT) | offset,
+                                               fs32_to_cpu(sb, de->d_ino), d_type);
+                                /* non-zero from filldir(): stop without advancing f_pos past this entry */
+                                if (over) {
+                                        ufs_put_page(page);
+                                        return 0;
+                                }
+                        }
+                        filp->f_pos += fs16_to_cpu(sb, de->d_reclen);
+                }
+                ufs_put_page(page);
+        }
        return 0;
 }
 /*
 * ufs_delete_entry deletes a directory entry by merging it with the
 * previous entry.
+ *
+ * `dir' is the entry to remove and is addressed relative to `page'.
+ * ufs_put_page() is called on `page' before returning on every path.
+ * Returns -EIO on a zero-length entry, otherwise the result of
+ * ufs_commit_chunk().
 */
-int ufs_delete_entry (struct inode * inode, struct ufs_dir_entry * dir,
+int ufs_delete_entry(struct inode *inode, struct ufs_dir_entry *dir,
-        struct buffer_head * bh )
+                     struct page * page)
-        
 {
-        struct super_block * sb;
+        struct super_block *sb = inode->i_sb;
-        struct ufs_dir_entry * de, * pde;
+        struct address_space *mapping = page->mapping;
-        unsigned i;
+        char *kaddr = page_address(page);
-        
+        unsigned from = ((char*)dir - kaddr) & ~(UFS_SECTOR_SIZE - 1);
-        UFSD(("ENTER\n"))
+        unsigned to = ((char*)dir - kaddr) + fs16_to_cpu(sb, dir->d_reclen);
+        struct ufs_dir_entry *pde = NULL;
+        struct ufs_dir_entry *de = (struct ufs_dir_entry *) (kaddr + from);
+        int err;
-        sb = inode->i_sb;
+        UFSD("ENTER\n");
-        i = 0;
-        pde = NULL;
+        UFSD("ino %u, reclen %u, namlen %u, name %s\n",
-        de = (struct ufs_dir_entry *) bh->b_data;
+              fs32_to_cpu(sb, de->d_ino),
-        
+              fs16_to_cpu(sb, de->d_reclen),
-        UFSD(("ino %u, reclen %u, namlen %u, name %s\n",
+              ufs_get_de_namlen(sb, de), de->d_name);
-                fs32_to_cpu(sb, de->d_ino),
-                fs16_to_cpu(sb, de->d_reclen),
+        /* walk from the start of the UFS_SECTOR_SIZE chunk to find the entry preceding `dir', if any */
+        while ((char*)de < (char*)dir) {
-                ufs_get_de_namlen(sb, de), de->d_name))
+                if (de->d_reclen == 0) {
+                        ufs_error(inode->i_sb, __FUNCTION__,
-        while (i < bh->b_size) {
+                                  "zero-length directory entry");
-                if (!ufs_check_dir_entry ("ufs_delete_entry", inode, de, bh, i)) {
+                        err = -EIO;
-                        brelse(bh);
+                        goto out;
-                        return -EIO;
-                }
-                if (de == dir)  {
-                        if (pde)
-                                fs16_add(sb, &pde->d_reclen,
-                                        fs16_to_cpu(sb, dir->d_reclen));
-                        dir->d_ino = 0;
-                        inode->i_version++;
-                        inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC;
-                        mark_inode_dirty(inode);
-                        mark_buffer_dirty(bh);
-                        if (IS_DIRSYNC(inode))
-                                sync_dirty_buffer(bh);
-                        brelse(bh);
-                        UFSD(("EXIT\n"))
-                        return 0;
                }
-                i += fs16_to_cpu(sb, de->d_reclen);
+                pde = de;
-                if (i == UFS_SECTOR_SIZE) pde = NULL;
+                de = ufs_next_entry(sb, de);
-                else pde = de;
-                de = (struct ufs_dir_entry *)
-                    ((char *) de + fs16_to_cpu(sb, de->d_reclen));
-                if (i == UFS_SECTOR_SIZE && de->d_reclen == 0)
-                        break;
        }
-        UFSD(("EXIT\n"))
+        if (pde)
-        brelse(bh);
+                from = (char*)pde - (char*)page_address(page);
-        return -ENOENT;
+        lock_page(page);
+        err = mapping->a_ops->prepare_write(NULL, page, from, to);
+        BUG_ON(err);
+        /* stretch the previous record so that it also covers the deleted one */
+        if (pde)
+                pde->d_reclen = cpu_to_fs16(sb, to-from);
+        /* d_ino == 0 marks the slot itself as unused */
+        dir->d_ino = 0;
+        err = ufs_commit_chunk(page, from, to);
+        inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC;
+        mark_inode_dirty(inode);
+out:
+        ufs_put_page(page);
+        UFSD("EXIT\n");
+        return err;
 }
+/*
+ * Set up page 0 of the new directory `inode'; the visible code prepares
+ * and commits one UFS_SECTOR_SIZE chunk.  Returns -ENOMEM when the page
+ * cannot be grabbed, the prepare_write() error, or the result of
+ * ufs_commit_chunk().
+ * NOTE(review): part of the body lies outside the hunks shown here.
+ */
 int ufs_make_empty(struct inode * inode, struct inode *dir)
 {
        struct super_block * sb = dir->i_sb;
-        struct buffer_head * dir_block;
+        struct address_space *mapping = inode->i_mapping;
+        struct page *page = grab_cache_page(mapping, 0);
        struct ufs_dir_entry * de;
+        char *base;
        int err;
-        dir_block = ufs_bread (inode, 0, 1, &err);
+        if (!page)
-        if (!dir_block)
+                return -ENOMEM;
-                return err;
+        kmap(page);
+        err = mapping->a_ops->prepare_write(NULL, page, 0, UFS_SECTOR_SIZE);
+        if (err) {
+                /* unlock here; the shared exit below only unmaps and releases */
+                unlock_page(page);
+                goto fail;
+        }
+        base = (char*)page_address(page);
+        memset(base, 0, PAGE_CACHE_SIZE);
+        de = (struct ufs_dir_entry *) base;
-        inode->i_blocks = sb->s_blocksize / UFS_SECTOR_SIZE;
-        de = (struct ufs_dir_entry *) dir_block->b_data;
        de->d_ino = cpu_to_fs32(sb, inode->i_ino);
        ufs_set_de_type(sb, de, inode->i_mode);
        ufs_set_de_namlen(sb, de, 1);
@@ -552,72 +587,65 @@ int ufs_make_empty(struct inode * inode, struct inode *dir)
        de->d_reclen = cpu_to_fs16(sb, UFS_SECTOR_SIZE - UFS_DIR_REC_LEN(1));
        ufs_set_de_namlen(sb, de, 2);
        strcpy (de->d_name, "..");
-        mark_buffer_dirty(dir_block);
-        brelse (dir_block);
+        err = ufs_commit_chunk(page, 0, UFS_SECTOR_SIZE);
-        mark_inode_dirty(inode);
+fail:
-        return 0;
+        kunmap(page);
+        page_cache_release(page);
+        return err;
 }
 /*
 * routine to check that the specified directory is empty (for rmdir)
+ *
+ * Returns 1 when every in-use entry is "." (whose inode number must be
+ * the directory's own) or "..", and 0 as soon as any other in-use entry
+ * or a zero-length entry is seen.  Pages that cannot be read are skipped.
 */
-int ufs_empty_dir (struct inode * inode)
+int ufs_empty_dir(struct inode * inode)
 {
-        struct super_block * sb;
+        struct super_block *sb = inode->i_sb;
-        unsigned long offset;
+        struct page *page = NULL;
-        struct buffer_head * bh;
+        unsigned long i, npages = ufs_dir_pages(inode);
-        struct ufs_dir_entry * de, * de1;
-        int err;
+        for (i = 0; i < npages; i++) {
-        
+                char *kaddr;
-        sb = inode->i_sb;
+                struct ufs_dir_entry *de;
+                page = ufs_get_page(inode, i);
-        if (inode->i_size < UFS_DIR_REC_LEN(1) + UFS_DIR_REC_LEN(2) ||
-            !(bh = ufs_bread (inode, 0, 0, &err))) {
+                if (IS_ERR(page))
-                ufs_warning (inode->i_sb, "empty_dir",
+                        continue;
-                              "bad directory (dir #%lu) - no data block",
-                              inode->i_ino);
+                kaddr = page_address(page);
-                return 1;
+                de = (struct ufs_dir_entry *)kaddr;
-        }
+                kaddr += ufs_last_byte(inode, i) - UFS_DIR_REC_LEN(1);
-        de = (struct ufs_dir_entry *) bh->b_data;
-        de1 = (struct ufs_dir_entry *)
+                while ((char *)de <= kaddr) {
-                ((char *)de + fs16_to_cpu(sb, de->d_reclen));
+                        if (de->d_reclen == 0) {
-        if (fs32_to_cpu(sb, de->d_ino) != inode->i_ino || de1->d_ino == 0 ||
+                                ufs_error(inode->i_sb, __FUNCTION__,
-             strcmp (".", de->d_name) || strcmp ("..", de1->d_name)) {
+                                        "zero-length directory entry: "
-                ufs_warning (inode->i_sb, "empty_dir",
+                                        "kaddr=%p, de=%p\n", kaddr, de);
-                              "bad directory (dir #%lu) - no `.' or `..'",
+                                goto not_empty;
-                              inode->i_ino);
-                return 1;
-        }
-        offset = fs16_to_cpu(sb, de->d_reclen) + fs16_to_cpu(sb, de1->d_reclen);
-        de = (struct ufs_dir_entry *)
-                ((char *)de1 + fs16_to_cpu(sb, de1->d_reclen));
-        while (offset < inode->i_size ) {
-                if (!bh || (void *) de >= (void *) (bh->b_data + sb->s_blocksize)) {
-                        brelse (bh);
-                        bh = ufs_bread (inode, offset >> sb->s_blocksize_bits, 1, &err);
-                        if (!bh) {
-                                ufs_error (sb, "empty_dir",
-                                            "directory #%lu contains a hole at offset %lu",
-                                            inode->i_ino, offset);
-                                offset += sb->s_blocksize;
-                                continue;
                        }
-                        de = (struct ufs_dir_entry *) bh->b_data;
+                        if (de->d_ino) {
-                }
+                                u16 namelen=ufs_get_de_namlen(sb, de);
-                if (!ufs_check_dir_entry ("empty_dir", inode, de, bh, offset)) {
+                                /* check for . and .. */
-                        brelse (bh);
+                                if (de->d_name[0] != '.')
-                        return 1;
+                                        goto not_empty;
-                }
+                                if (namelen > 2)
-                if (de->d_ino) {
+                                        goto not_empty;
-                        brelse (bh);
+                                if (namelen < 2) {
-                        return 0;
+                                        /* a one-character name must refer to this directory itself */
+                                        if (inode->i_ino !=
+                                            fs32_to_cpu(sb, de->d_ino))
+                                                goto not_empty;
+                                } else if (de->d_name[1] != '.')
+                                        goto not_empty;
+                        }
+                        de = ufs_next_entry(sb, de);
                }
-                offset += fs16_to_cpu(sb, de->d_reclen);
+                ufs_put_page(page);
-                de = (struct ufs_dir_entry *)
-                        ((char *)de + fs16_to_cpu(sb, de->d_reclen));
        }
-        brelse (bh);
        return 1;
+not_empty:
+        ufs_put_page(page);
+        return 0;
 }
 const struct file_operations ufs_dir_operations = {
diff --git a/fs/ufs/file.c b/fs/ufs/file.c
index 312fd3f86313..a9c6e5f04fae 100644
--- a/fs/ufs/file.c
+++ b/fs/ufs/file.c
@@ -25,6 +25,26 @@
 #include <linux/fs.h>
 #include <linux/ufs_fs.h>
+#include <linux/buffer_head.h>  /* for sync_mapping_buffers() */
+/*
+ * fsync handler: run sync_mapping_buffers() on the inode's mapping, then
+ * call ufs_sync_inode() if the inode has I_DIRTY state (for datasync,
+ * only when I_DIRTY_DATASYNC is set).  The first error seen is returned.
+ */
+static int ufs_sync_file(struct file *file, struct dentry *dentry, int datasync)
+{
+        struct inode *inode = dentry->d_inode;
+        int ret = sync_mapping_buffers(inode->i_mapping);
+        if ((inode->i_state & I_DIRTY) &&
+            (!datasync || (inode->i_state & I_DIRTY_DATASYNC))) {
+                int err = ufs_sync_inode(inode);
+                if (ret == 0)
+                        ret = err;
+        }
+        return ret;
+}
 /*
 * We have mostly NULL's here: the current defaults are ok for
@@ -37,9 +57,6 @@ const struct file_operations ufs_file_operations = {
        .write          = generic_file_write,
        .mmap           = generic_file_mmap,
        .open           = generic_file_open,
+        .fsync          = ufs_sync_file,
        .sendfile       = generic_file_sendfile,
 };
-struct inode_operations ufs_file_inode_operations = {
-        .truncate       = ufs_truncate,
-};
diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c
index c7a47ed4f430..9501dcd3b213 100644
--- a/fs/ufs/ialloc.c
+++ b/fs/ufs/ialloc.c
@@ -34,14 +34,6 @@
 #include "swab.h"
 #include "util.h"
-#undef UFS_IALLOC_DEBUG
-#ifdef UFS_IALLOC_DEBUG
-#define UFSD(x) printk("(%s, %d), %s: ", __FILE__, __LINE__, __FUNCTION__); printk x;
-#else
-#define UFSD(x)
-#endif
 /*
 * NOTE! When we get the inode, we're the only people
 * that have access to it, and as such there are no
@@ -68,7 +60,7 @@ void ufs_free_inode (struct inode * inode)
        int is_directory;
        unsigned ino, cg, bit;
        
-        UFSD(("ENTER, ino %lu\n", inode->i_ino))
+        UFSD("ENTER, ino %lu\n", inode->i_ino);
        sb = inode->i_sb;
        uspi = UFS_SB(sb)->s_uspi;
@@ -91,7 +83,7 @@ void ufs_free_inode (struct inode * inode)
                unlock_super (sb);
                return;
        }
-        ucg = ubh_get_ucg(UCPI_UBH);
+        ucg = ubh_get_ucg(UCPI_UBH(ucpi));
        if (!ufs_cg_chkmagic(sb, ucg))
                ufs_panic (sb, "ufs_free_fragments", "internal error, bad cg magic number");
@@ -104,33 +96,33 @@ void ufs_free_inode (struct inode * inode)
        clear_inode (inode);
-        if (ubh_isclr (UCPI_UBH, ucpi->c_iusedoff, bit))
+        if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_iusedoff, bit))
                ufs_error(sb, "ufs_free_inode", "bit already cleared for inode %u", ino);
        else {
-                ubh_clrbit (UCPI_UBH, ucpi->c_iusedoff, bit);
+                ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_iusedoff, bit);
                if (ino < ucpi->c_irotor)
                        ucpi->c_irotor = ino;
                fs32_add(sb, &ucg->cg_cs.cs_nifree, 1);
-                fs32_add(sb, &usb1->fs_cstotal.cs_nifree, 1);
+                uspi->cs_total.cs_nifree++;
                fs32_add(sb, &UFS_SB(sb)->fs_cs(cg).cs_nifree, 1);
                if (is_directory) {
                        fs32_sub(sb, &ucg->cg_cs.cs_ndir, 1);
-                        fs32_sub(sb, &usb1->fs_cstotal.cs_ndir, 1);
+                        uspi->cs_total.cs_ndir--;
                        fs32_sub(sb, &UFS_SB(sb)->fs_cs(cg).cs_ndir, 1);
                }
        }
-        ubh_mark_buffer_dirty (USPI_UBH);
+        ubh_mark_buffer_dirty (USPI_UBH(uspi));
-        ubh_mark_buffer_dirty (UCPI_UBH);
+        ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
        if (sb->s_flags & MS_SYNCHRONOUS) {
-                ubh_ll_rw_block (SWRITE, 1, (struct ufs_buffer_head **) &ucpi);
+                ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi));
-                ubh_wait_on_buffer (UCPI_UBH);
+                ubh_wait_on_buffer (UCPI_UBH(ucpi));
        }
        
        sb->s_dirt = 1;
        unlock_super (sb);
-        UFSD(("EXIT\n"))
+        UFSD("EXIT\n");
 }
 /*
@@ -155,7 +147,7 @@ struct inode * ufs_new_inode(struct inode * dir, int mode)
        unsigned cg, bit, i, j, start;
        struct ufs_inode_info *ufsi;
-        UFSD(("ENTER\n"))
+        UFSD("ENTER\n");
        
        /* Cannot create files in a deleted directory */
        if (!dir || !dir->i_nlink)
@@ -213,43 +205,43 @@ cg_found:
        ucpi = ufs_load_cylinder (sb, cg);
        if (!ucpi)
                goto failed;
-        ucg = ubh_get_ucg(UCPI_UBH);
+        ucg = ubh_get_ucg(UCPI_UBH(ucpi));
        if (!ufs_cg_chkmagic(sb, ucg)) 
                ufs_panic (sb, "ufs_new_inode", "internal error, bad cg magic number");
        start = ucpi->c_irotor;
-        bit = ubh_find_next_zero_bit (UCPI_UBH, ucpi->c_iusedoff, uspi->s_ipg, start);
+        bit = ubh_find_next_zero_bit (UCPI_UBH(ucpi), ucpi->c_iusedoff, uspi->s_ipg, start);
        if (!(bit < uspi->s_ipg)) {
-                bit = ubh_find_first_zero_bit (UCPI_UBH, ucpi->c_iusedoff, start);
+                bit = ubh_find_first_zero_bit (UCPI_UBH(ucpi), ucpi->c_iusedoff, start);
                if (!(bit < start)) {
                        ufs_error (sb, "ufs_new_inode",
                            "cylinder group %u corrupted - error in inode bitmap\n", cg);
                        goto failed;
                }
        }
-        UFSD(("start = %u, bit = %u, ipg = %u\n", start, bit, uspi->s_ipg))
+        UFSD("start = %u, bit = %u, ipg = %u\n", start, bit, uspi->s_ipg);
-        if (ubh_isclr (UCPI_UBH, ucpi->c_iusedoff, bit))
+        if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_iusedoff, bit))
-                ubh_setbit (UCPI_UBH, ucpi->c_iusedoff, bit);
+                ubh_setbit (UCPI_UBH(ucpi), ucpi->c_iusedoff, bit);
        else {
                ufs_panic (sb, "ufs_new_inode", "internal error");
                goto failed;
        }
        
        fs32_sub(sb, &ucg->cg_cs.cs_nifree, 1);
-        fs32_sub(sb, &usb1->fs_cstotal.cs_nifree, 1);
+        uspi->cs_total.cs_nifree--;
        fs32_sub(sb, &sbi->fs_cs(cg).cs_nifree, 1);
        
        if (S_ISDIR(mode)) {
                fs32_add(sb, &ucg->cg_cs.cs_ndir, 1);
-                fs32_add(sb, &usb1->fs_cstotal.cs_ndir, 1);
+                uspi->cs_total.cs_ndir++;
                fs32_add(sb, &sbi->fs_cs(cg).cs_ndir, 1);
        }
-        ubh_mark_buffer_dirty (USPI_UBH);
+        ubh_mark_buffer_dirty (USPI_UBH(uspi));
-        ubh_mark_buffer_dirty (UCPI_UBH);
+        ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
        if (sb->s_flags & MS_SYNCHRONOUS) {
-                ubh_ll_rw_block (SWRITE, 1, (struct ufs_buffer_head **) &ucpi);
+                ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi));
-                ubh_wait_on_buffer (UCPI_UBH);
+                ubh_wait_on_buffer (UCPI_UBH(ucpi));
        }
        sb->s_dirt = 1;
@@ -272,6 +264,7 @@ cg_found:
        ufsi->i_shadow = 0;
        ufsi->i_osync = 0;
        ufsi->i_oeftflag = 0;
+        ufsi->i_dir_start_lookup = 0;
        memset(&ufsi->i_u1, 0, sizeof(ufsi->i_u1));
        insert_inode_hash(inode);
@@ -287,14 +280,14 @@ cg_found:
                return ERR_PTR(-EDQUOT);
        }
-        UFSD(("allocating inode %lu\n", inode->i_ino))
+        UFSD("allocating inode %lu\n", inode->i_ino);
-        UFSD(("EXIT\n"))
+        UFSD("EXIT\n");
        return inode;
 failed:
        unlock_super (sb);
        make_bad_inode(inode);
        iput (inode);
-        UFSD(("EXIT (FAILED)\n"))
+        UFSD("EXIT (FAILED)\n");
        return ERR_PTR(-ENOSPC);
 }
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 3c3f62ce2ad9..e7c8615beb65 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -41,14 +41,7 @@
 #include "swab.h"
 #include "util.h"
-#undef UFS_INODE_DEBUG
+static u64 ufs_frag_map(struct inode *inode, sector_t frag);
-#undef UFS_INODE_DEBUG_MORE
-#ifdef UFS_INODE_DEBUG
-#define UFSD(x) printk("(%s, %d), %s: ", __FILE__, __LINE__, __FUNCTION__); printk x;
-#else
-#define UFSD(x)
-#endif
 static int ufs_block_to_path(struct inode *inode, sector_t i_block, sector_t offsets[4])
 {
@@ -61,7 +54,7 @@ static int ufs_block_to_path(struct inode *inode, sector_t i_block, sector_t off
        int n = 0;
-        UFSD(("ptrs=uspi->s_apb = %d,double_blocks=%ld \n",ptrs,double_blocks));
+        UFSD("ptrs=uspi->s_apb = %d,double_blocks=%ld \n",ptrs,double_blocks);
        if (i_block < 0) {
                ufs_warning(inode->i_sb, "ufs_block_to_path", "block < 0");
        } else if (i_block < direct_blocks) {
@@ -89,7 +82,7 @@ static int ufs_block_to_path(struct inode *inode, sector_t i_block, sector_t off
 * the begining of the filesystem.
 */
-u64  ufs_frag_map(struct inode *inode, sector_t frag)
+static u64 ufs_frag_map(struct inode *inode, sector_t frag)
 {
        struct ufs_inode_info *ufsi = UFS_I(inode);
        struct super_block *sb = inode->i_sb;
@@ -104,8 +97,10 @@ u64  ufs_frag_map(struct inode *inode, sector_t frag)
        unsigned flags = UFS_SB(sb)->s_flags;
        u64 temp = 0L;
-        UFSD((": frag = %llu  depth = %d\n", (unsigned long long)frag, depth));
+        UFSD(": frag = %llu  depth = %d\n", (unsigned long long)frag, depth);
-        UFSD((": uspi->s_fpbshift = %d ,uspi->s_apbmask = %x, mask=%llx\n",uspi->s_fpbshift,uspi->s_apbmask,mask));
+        UFSD(": uspi->s_fpbshift = %d ,uspi->s_apbmask = %x, mask=%llx\n",
+                uspi->s_fpbshift, uspi->s_apbmask,
+                (unsigned long long)mask);
        if (depth == 0)
                return 0;
@@ -161,26 +156,64 @@ out:
        return ret;
 }
-static struct buffer_head * ufs_inode_getfrag (struct inode *inode,
+static void ufs_clear_frag(struct inode *inode, struct buffer_head *bh)
-        unsigned int fragment, unsigned int new_fragment,
+{
-        unsigned int required, int *err, int metadata, long *phys, int *new)
+        lock_buffer(bh);
+        memset(bh->b_data, 0, inode->i_sb->s_blocksize);
+        set_buffer_uptodate(bh);
+        mark_buffer_dirty(bh);
+        unlock_buffer(bh);
+        if (IS_SYNC(inode))
+                sync_dirty_buffer(bh);
+}
+static struct buffer_head *
+ufs_clear_frags(struct inode *inode, sector_t beg,
+                unsigned int n)
+{
+        struct buffer_head *res, *bh;
+        sector_t end = beg + n;
+        res = sb_getblk(inode->i_sb, beg);
+        ufs_clear_frag(inode, res);
+        for (++beg; beg < end; ++beg) {
+                bh = sb_getblk(inode->i_sb, beg);
+                ufs_clear_frag(inode, bh);
+                brelse(bh);
+        }
+        return res;
+}
+/**
+ * ufs_inode_getfrag() - allocate new fragment(s)
+ * @inode: pointer to inode
+ * @fragment: number of the `fragment' which holds the pointer
+ *   to the newly allocated fragment(s)
+ * @new_fragment: number of the newly allocated fragment(s)
+ * @required: how many fragment(s) we require
+ * @err: we set it if something goes wrong
+ * @phys: pointer to where we save the physical number of the newly allocated
+ *   fragments, NULL if we are not allocating data (indirect blocks, for example)
+ * @new: we set it if we allocate a new block
+ * @locked_page: passed on to ufs_new_fragments()
+ */
+static struct buffer_head *
+ufs_inode_getfrag(struct inode *inode, unsigned int fragment,
+                  sector_t new_fragment, unsigned int required, int *err,
+                  long *phys, int *new, struct page *locked_page)
 {
        struct ufs_inode_info *ufsi = UFS_I(inode);
-        struct super_block * sb;
+        struct super_block *sb = inode->i_sb;
-        struct ufs_sb_private_info * uspi;
+        struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
        struct buffer_head * result;
        unsigned block, blockoff, lastfrag, lastblock, lastblockoff;
        unsigned tmp, goal;
        __fs32 * p, * p2;
-        unsigned flags = 0;
-        UFSD(("ENTER, ino %lu, fragment %u, new_fragment %u, required %u\n",
+        UFSD("ENTER, ino %lu, fragment %u, new_fragment %llu, required %u, "
-                inode->i_ino, fragment, new_fragment, required))         
+             "metadata %d\n", inode->i_ino, fragment,
+             (unsigned long long)new_fragment, required, !phys);
-        sb = inode->i_sb;
-        uspi = UFS_SB(sb)->s_uspi;
-        flags = UFS_SB(sb)->s_flags;
        /* TODO : to be done for write support
        if ( (flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2)
             goto ufs2;
@@ -195,16 +228,16 @@ repeat:
        tmp = fs32_to_cpu(sb, *p);
        lastfrag = ufsi->i_lastfrag;
        if (tmp && fragment < lastfrag) {
-                if (metadata) {
+                if (!phys) {
                        result = sb_getblk(sb, uspi->s_sbbase + tmp + blockoff);
                        if (tmp == fs32_to_cpu(sb, *p)) {
-                                UFSD(("EXIT, result %u\n", tmp + blockoff))
+                                UFSD("EXIT, result %u\n", tmp + blockoff);
                                return result;
                        }
                        brelse (result);
                        goto repeat;
                } else {
-                        *phys = tmp;
+                        *phys = tmp + blockoff;
                        return NULL;
                }
        }
@@ -221,7 +254,8 @@ repeat:
                if (lastblockoff) {
                        p2 = ufsi->i_u1.i_data + lastblock;
                        tmp = ufs_new_fragments (inode, p2, lastfrag, 
-                                fs32_to_cpu(sb, *p2), uspi->s_fpb - lastblockoff, err);
+                                                 fs32_to_cpu(sb, *p2), uspi->s_fpb - lastblockoff,
+                                                 err, locked_page);
                        if (!tmp) {
                                if (lastfrag != ufsi->i_lastfrag)
                                        goto repeat;
@@ -233,14 +267,16 @@ repeat:
                }
                goal = fs32_to_cpu(sb, ufsi->i_u1.i_data[lastblock]) + uspi->s_fpb;
                tmp = ufs_new_fragments (inode, p, fragment - blockoff, 
-                        goal, required + blockoff, err);
+                                         goal, required + blockoff,
+                                         err, locked_page);
        }
        /*
         * We will extend last allocated block
         */
        else if (lastblock == block) {
-                tmp = ufs_new_fragments (inode, p, fragment - (blockoff - lastblockoff),
+                tmp = ufs_new_fragments(inode, p, fragment - (blockoff - lastblockoff),
-                        fs32_to_cpu(sb, *p), required +  (blockoff - lastblockoff), err);
+                                        fs32_to_cpu(sb, *p), required +  (blockoff - lastblockoff),
+                                        err, locked_page);
        }
        /*
         * We will allocate new block before last allocated block
@@ -248,8 +284,8 @@ repeat:
        else /* (lastblock > block) */ {
                if (lastblock && (tmp = fs32_to_cpu(sb, ufsi->i_u1.i_data[lastblock-1])))
                        goal = tmp + uspi->s_fpb;
-                tmp = ufs_new_fragments (inode, p, fragment - blockoff, 
+                tmp = ufs_new_fragments(inode, p, fragment - blockoff,
-                        goal, uspi->s_fpb, err);
+                                        goal, uspi->s_fpb, err, locked_page);
        }
        if (!tmp) {
                if ((!blockoff && *p) || 
@@ -259,14 +295,10 @@ repeat:
                return NULL;
        }
-        /* The nullification of framgents done in ufs/balloc.c is
+        if (!phys) {
-         * something I don't have the stomache to move into here right
+                result = ufs_clear_frags(inode, tmp + blockoff, required);
-         * now. -DaveM
-         */
-        if (metadata) {
-                result = sb_getblk(inode->i_sb, tmp + blockoff);
        } else {
-                *phys = tmp;
+                *phys = tmp + blockoff;
                result = NULL;
                *err = 0;
                *new = 1;
@@ -276,7 +308,7 @@ repeat:
        if (IS_SYNC(inode))
                ufs_sync_inode (inode);
        mark_inode_dirty(inode);
-        UFSD(("EXIT, result %u\n", tmp + blockoff))
+        UFSD("EXIT, result %u\n", tmp + blockoff);
        return result;
     /* This part : To be implemented ....
@@ -295,22 +327,35 @@ repeat2:
     */
 }
-static struct buffer_head * ufs_block_getfrag (struct inode *inode,
+/**
-        struct buffer_head *bh, unsigned int fragment, unsigned int new_fragment, 
+ * ufs_inode_getblock() - allocate new block
-        unsigned int blocksize, int * err, int metadata, long *phys, int *new)
+ * @inode: pointer to inode
+ * @bh: pointer to the block which holds the "pointer" to the newly allocated block
+ * @fragment: number of the `fragment' which holds the pointer
+ *   to the newly allocated block
+ * @new_fragment: number of the newly allocated fragment
+ *   (the block will hold this fragment and uspi->s_fpb-1 more)
+ * @err: see ufs_inode_getfrag()
+ * @phys: see ufs_inode_getfrag()
+ * @new: see ufs_inode_getfrag()
+ * @locked_page: see ufs_inode_getfrag()
+ */
+static struct buffer_head *
+ufs_inode_getblock(struct inode *inode, struct buffer_head *bh,
+                  unsigned int fragment, sector_t new_fragment, int *err,
+                  long *phys, int *new, struct page *locked_page)
 {
-        struct super_block * sb;
+        struct super_block *sb = inode->i_sb;
-        struct ufs_sb_private_info * uspi;
+        struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
        struct buffer_head * result;
        unsigned tmp, goal, block, blockoff;
        __fs32 * p;
-        sb = inode->i_sb;
-        uspi = UFS_SB(sb)->s_uspi;
        block = ufs_fragstoblks (fragment);
        blockoff = ufs_fragnum (fragment);
-        UFSD(("ENTER, ino %lu, fragment %u, new_fragment %u\n", inode->i_ino, fragment, new_fragment))  
+        UFSD("ENTER, ino %lu, fragment %u, new_fragment %llu, metadata %d\n",
+             inode->i_ino, fragment, (unsigned long long)new_fragment, !phys);
        result = NULL;
        if (!bh)
@@ -326,14 +371,14 @@ static struct buffer_head * ufs_block_getfrag (struct inode *inode,
 repeat:
        tmp = fs32_to_cpu(sb, *p);
        if (tmp) {
-                if (metadata) {
+                if (!phys) {
                        result = sb_getblk(sb, uspi->s_sbbase + tmp + blockoff);
                        if (tmp == fs32_to_cpu(sb, *p))
                                goto out;
                        brelse (result);
                        goto repeat;
                } else {
-                        *phys = tmp;
+                        *phys = tmp + blockoff;
                        goto out;
                }
        }
@@ -342,21 +387,19 @@ repeat:
                goal = tmp + uspi->s_fpb;
        else
                goal = bh->b_blocknr + uspi->s_fpb;
-        tmp = ufs_new_fragments (inode, p, ufs_blknum(new_fragment), goal, uspi->s_fpb, err);
+        tmp = ufs_new_fragments(inode, p, ufs_blknum(new_fragment), goal,
+                                uspi->s_fpb, err, locked_page);
        if (!tmp) {
                if (fs32_to_cpu(sb, *p))
                        goto repeat;
                goto out;
        }               
-        /* The nullification of framgents done in ufs/balloc.c is
-         * something I don't have the stomache to move into here right
+        if (!phys) {
-         * now. -DaveM
+                result = ufs_clear_frags(inode, tmp + blockoff, uspi->s_fpb);
-         */
-        if (metadata) {
-                result = sb_getblk(sb, tmp + blockoff);
        } else {
-                *phys = tmp;
+                *phys = tmp + blockoff;
                *new = 1;
        }
@@ -365,18 +408,19 @@ repeat:
                sync_dirty_buffer(bh);
        inode->i_ctime = CURRENT_TIME_SEC;
        mark_inode_dirty(inode);
-        UFSD(("result %u\n", tmp + blockoff));
+        UFSD("result %u\n", tmp + blockoff);
 out:
        brelse (bh);
-        UFSD(("EXIT\n"));
+        UFSD("EXIT\n");
        return result;
 }
-/*
+/**
- * This function gets the block which contains the fragment.
+ * ufs_getfrag_block() - `get_block_t' function, interface between UFS and
+ * readpage, writepage and so on
 */
-int ufs_getfrag_block (struct inode *inode, sector_t fragment, struct buffer_head *bh_result, int create)
+int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buffer_head *bh_result, int create)
 {
        struct super_block * sb = inode->i_sb;
        struct ufs_sb_private_info * uspi = UFS_SB(sb)->s_uspi;
@@ -387,7 +431,7 @@ int ufs_getfrag_block (struct inode *inode, sector_t fragment, struct buffer_hea
        
        if (!create) {
                phys64 = ufs_frag_map(inode, fragment);
-                UFSD(("phys64 = %llu \n",phys64));
+                UFSD("phys64 = %llu\n", (unsigned long long)phys64);
                if (phys64)
                        map_bh(bh_result, sb, phys64);
                return 0;
@@ -402,7 +446,7 @@ int ufs_getfrag_block (struct inode *inode, sector_t fragment, struct buffer_hea
        lock_kernel();
-        UFSD(("ENTER, ino %lu, fragment %llu\n", inode->i_ino, (unsigned long long)fragment))
+        UFSD("ENTER, ino %lu, fragment %llu\n", inode->i_ino, (unsigned long long)fragment);
        if (fragment < 0)
                goto abort_negative;
        if (fragment >
@@ -418,15 +462,15 @@ int ufs_getfrag_block (struct inode *inode, sector_t fragment, struct buffer_hea
         * it much more readable:
         */
 #define GET_INODE_DATABLOCK(x) \
-                ufs_inode_getfrag(inode, x, fragment, 1, &err, 0, &phys, &new)
+        ufs_inode_getfrag(inode, x, fragment, 1, &err, &phys, &new, bh_result->b_page)
 #define GET_INODE_PTR(x) \
-                ufs_inode_getfrag(inode, x, fragment, uspi->s_fpb, &err, 1, NULL, NULL)
+        ufs_inode_getfrag(inode, x, fragment, uspi->s_fpb, &err, NULL, NULL, bh_result->b_page)
 #define GET_INDIRECT_DATABLOCK(x) \
-                ufs_block_getfrag(inode, bh, x, fragment, sb->s_blocksize, \
+        ufs_inode_getblock(inode, bh, x, fragment,      \
-                                  &err, 0, &phys, &new);
+                          &err, &phys, &new, bh_result->b_page);
 #define GET_INDIRECT_PTR(x) \
-                ufs_block_getfrag(inode, bh, x, fragment, sb->s_blocksize, \
+        ufs_inode_getblock(inode, bh, x, fragment,      \
-                                  &err, 1, NULL, NULL);
+                          &err, NULL, NULL, bh_result->b_page);
        if (ptr < UFS_NDIR_FRAGMENT) {
                bh = GET_INODE_DATABLOCK(ptr);
@@ -474,8 +518,9 @@ abort_too_big:
        goto abort;
 }
-struct buffer_head *ufs_getfrag(struct inode *inode, unsigned int fragment,
+static struct buffer_head *ufs_getfrag(struct inode *inode,
-                                int create, int *err)
+                                       unsigned int fragment,
+                                       int create, int *err)
 {
        struct buffer_head dummy;
        int error;
@@ -502,7 +547,7 @@ struct buffer_head * ufs_bread (struct inode * inode, unsigned fragment,
 {
        struct buffer_head * bh;
-        UFSD(("ENTER, ino %lu, fragment %u\n", inode->i_ino, fragment))
+        UFSD("ENTER, ino %lu, fragment %u\n", inode->i_ino, fragment);
        bh = ufs_getfrag (inode, fragment, create, err);
        if (!bh || buffer_uptodate(bh))                 
                return bh;
@@ -531,7 +576,7 @@ static sector_t ufs_bmap(struct address_space *mapping, sector_t block)
 {
        return generic_block_bmap(mapping,block,ufs_getfrag_block);
 }
-struct address_space_operations ufs_aops = {
+const struct address_space_operations ufs_aops = {
        .readpage = ufs_readpage,
        .writepage = ufs_writepage,
        .sync_page = block_sync_page,
@@ -540,39 +585,34 @@ struct address_space_operations ufs_aops = {
        .bmap = ufs_bmap
 };
-void ufs_read_inode (struct inode * inode)
+static void ufs_set_inode_ops(struct inode *inode)
+{
+        if (S_ISREG(inode->i_mode)) {
+                inode->i_op = &ufs_file_inode_operations;
+                inode->i_fop = &ufs_file_operations;
+                inode->i_mapping->a_ops = &ufs_aops;
+        } else if (S_ISDIR(inode->i_mode)) {
+                inode->i_op = &ufs_dir_inode_operations;
+                inode->i_fop = &ufs_dir_operations;
+                inode->i_mapping->a_ops = &ufs_aops;
+        } else if (S_ISLNK(inode->i_mode)) {
+                if (!inode->i_blocks)
+                        inode->i_op = &ufs_fast_symlink_inode_operations;
+                else {
+                        inode->i_op = &page_symlink_inode_operations;
+                        inode->i_mapping->a_ops = &ufs_aops;
+                }
+        } else
+                init_special_inode(inode, inode->i_mode,
+                                   ufs_get_inode_dev(inode->i_sb, UFS_I(inode)));
+}
+static void ufs1_read_inode(struct inode *inode, struct ufs_inode *ufs_inode)
 {
        struct ufs_inode_info *ufsi = UFS_I(inode);
-        struct super_block * sb;
+        struct super_block *sb = inode->i_sb;
-        struct ufs_sb_private_info * uspi;
-        struct ufs_inode * ufs_inode;   
-        struct ufs2_inode *ufs2_inode;
-        struct buffer_head * bh;
        mode_t mode;
        unsigned i;
-        unsigned flags;
-        
-        UFSD(("ENTER, ino %lu\n", inode->i_ino))
-        
-        sb = inode->i_sb;
-        uspi = UFS_SB(sb)->s_uspi;
-        flags = UFS_SB(sb)->s_flags;
-        if (inode->i_ino < UFS_ROOTINO || 
-            inode->i_ino > (uspi->s_ncg * uspi->s_ipg)) {
-                ufs_warning (sb, "ufs_read_inode", "bad inode number (%lu)\n", inode->i_ino);
-                goto bad_inode;
-        }
-        
-        bh = sb_bread(sb, uspi->s_sbbase + ufs_inotofsba(inode->i_ino));
-        if (!bh) {
-                ufs_warning (sb, "ufs_read_inode", "unable to read inode %lu\n", inode->i_ino);
-                goto bad_inode;
-        }
-        if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2)
-                goto ufs2_inode;
-        ufs_inode = (struct ufs_inode *) (bh->b_data + sizeof(struct ufs_inode) * ufs_inotofsbo(inode->i_ino));
        /*
         * Copy data to the in-core inode.
@@ -596,56 +636,29 @@ void ufs_read_inode (struct inode * inode)
        inode->i_atime.tv_nsec = 0;
        inode->i_ctime.tv_nsec = 0;
        inode->i_blocks = fs32_to_cpu(sb, ufs_inode->ui_blocks);
-        inode->i_blksize = PAGE_SIZE;   /* This is the optimal IO size (for stat) */
-        inode->i_version++;
        ufsi->i_flags = fs32_to_cpu(sb, ufs_inode->ui_flags);
        ufsi->i_gen = fs32_to_cpu(sb, ufs_inode->ui_gen);
        ufsi->i_shadow = fs32_to_cpu(sb, ufs_inode->ui_u3.ui_sun.ui_shadow);
        ufsi->i_oeftflag = fs32_to_cpu(sb, ufs_inode->ui_u3.ui_sun.ui_oeftflag);
-        ufsi->i_lastfrag = (inode->i_size + uspi->s_fsize - 1) >> uspi->s_fshift;
        
        if (S_ISCHR(mode) || S_ISBLK(mode) || inode->i_blocks) {
                for (i = 0; i < (UFS_NDADDR + UFS_NINDIR); i++)
                        ufsi->i_u1.i_data[i] = ufs_inode->ui_u2.ui_addr.ui_db[i];
-        }
+        } else {
-        else {
                for (i = 0; i < (UFS_NDADDR + UFS_NINDIR) * 4; i++)
                        ufsi->i_u1.i_symlink[i] = ufs_inode->ui_u2.ui_symlink[i];
        }
-        ufsi->i_osync = 0;
+}
-        if (S_ISREG(inode->i_mode)) {
-                inode->i_op = &ufs_file_inode_operations;
-                inode->i_fop = &ufs_file_operations;
-                inode->i_mapping->a_ops = &ufs_aops;
-        } else if (S_ISDIR(inode->i_mode)) {
-                inode->i_op = &ufs_dir_inode_operations;
-                inode->i_fop = &ufs_dir_operations;
-        } else if (S_ISLNK(inode->i_mode)) {
-                if (!inode->i_blocks)
-                        inode->i_op = &ufs_fast_symlink_inode_operations;
-                else {
-                        inode->i_op = &page_symlink_inode_operations;
-                        inode->i_mapping->a_ops = &ufs_aops;
-                }
-        } else
-                init_special_inode(inode, inode->i_mode,
-                        ufs_get_inode_dev(sb, ufsi));
-        brelse (bh);
-        UFSD(("EXIT\n"))
-        return;
-bad_inode:
-        make_bad_inode(inode);
-        return;
-ufs2_inode :
-        UFSD(("Reading ufs2 inode, ino %lu\n", inode->i_ino))
-        ufs2_inode = (struct ufs2_inode *)(bh->b_data + sizeof(struct ufs2_inode) * ufs_inotofsbo(inode->i_ino));
+static void ufs2_read_inode(struct inode *inode, struct ufs2_inode *ufs2_inode)
+{
+        struct ufs_inode_info *ufsi = UFS_I(inode);
+        struct super_block *sb = inode->i_sb;
+        mode_t mode;
+        unsigned i;
+        UFSD("Reading ufs2 inode, ino %lu\n", inode->i_ino);
        /*
         * Copy data to the in-core inode.
         */
@@ -668,50 +681,75 @@ ufs2_inode :
        inode->i_atime.tv_nsec = 0;
        inode->i_ctime.tv_nsec = 0;
        inode->i_blocks = fs64_to_cpu(sb, ufs2_inode->ui_blocks);
-        inode->i_blksize = PAGE_SIZE; /*This is the optimal IO size(for stat)*/
-        inode->i_version++;
        ufsi->i_flags = fs32_to_cpu(sb, ufs2_inode->ui_flags);
        ufsi->i_gen = fs32_to_cpu(sb, ufs2_inode->ui_gen);
        /*
        ufsi->i_shadow = fs32_to_cpu(sb, ufs_inode->ui_u3.ui_sun.ui_shadow);
        ufsi->i_oeftflag = fs32_to_cpu(sb, ufs_inode->ui_u3.ui_sun.ui_oeftflag);
        */
-        ufsi->i_lastfrag= (inode->i_size + uspi->s_fsize- 1) >> uspi->s_fshift;
        if (S_ISCHR(mode) || S_ISBLK(mode) || inode->i_blocks) {
                for (i = 0; i < (UFS_NDADDR + UFS_NINDIR); i++)
                        ufsi->i_u1.u2_i_data[i] =
                                ufs2_inode->ui_u2.ui_addr.ui_db[i];
-        }
+        } else {
-        else {
                for (i = 0; i < (UFS_NDADDR + UFS_NINDIR) * 4; i++)
                        ufsi->i_u1.i_symlink[i] = ufs2_inode->ui_u2.ui_symlink[i];
        }
+}
+void ufs_read_inode(struct inode * inode)
+{
+        struct ufs_inode_info *ufsi = UFS_I(inode);
+        struct super_block * sb;
+        struct ufs_sb_private_info * uspi;
+        struct buffer_head * bh;
+        UFSD("ENTER, ino %lu\n", inode->i_ino);
+        sb = inode->i_sb;
+        uspi = UFS_SB(sb)->s_uspi;
+        if (inode->i_ino < UFS_ROOTINO ||
+            inode->i_ino > (uspi->s_ncg * uspi->s_ipg)) {
+                ufs_warning(sb, "ufs_read_inode", "bad inode number (%lu)\n",
+                            inode->i_ino);
+                goto bad_inode;
+        }
+        bh = sb_bread(sb, uspi->s_sbbase + ufs_inotofsba(inode->i_ino));
+        if (!bh) {
+                ufs_warning(sb, "ufs_read_inode", "unable to read inode %lu\n",
+                            inode->i_ino);
+                goto bad_inode;
+        }
+        if ((UFS_SB(sb)->s_flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) {
+                struct ufs2_inode *ufs2_inode = (struct ufs2_inode *)bh->b_data;
+                ufs2_read_inode(inode,
+                                ufs2_inode + ufs_inotofsbo(inode->i_ino));
+        } else {
+                struct ufs_inode *ufs_inode = (struct ufs_inode *)bh->b_data;
+                ufs1_read_inode(inode, ufs_inode + ufs_inotofsbo(inode->i_ino));
+        }
+        inode->i_blksize = PAGE_SIZE;/*This is the optimal IO size (for stat)*/
+        inode->i_version++;
+        ufsi->i_lastfrag =
+                (inode->i_size + uspi->s_fsize - 1) >> uspi->s_fshift;
+        ufsi->i_dir_start_lookup = 0;
        ufsi->i_osync = 0;
-        if (S_ISREG(inode->i_mode)) {
+        ufs_set_inode_ops(inode);
-                inode->i_op = &ufs_file_inode_operations;
-                inode->i_fop = &ufs_file_operations;
-                inode->i_mapping->a_ops = &ufs_aops;
-        } else if (S_ISDIR(inode->i_mode)) {
-                inode->i_op = &ufs_dir_inode_operations;
-                inode->i_fop = &ufs_dir_operations;
-        } else if (S_ISLNK(inode->i_mode)) {
-                if (!inode->i_blocks)
-                        inode->i_op = &ufs_fast_symlink_inode_operations;
-                else {
-                        inode->i_op = &page_symlink_inode_operations;
-                        inode->i_mapping->a_ops = &ufs_aops;
-                }
-        } else   /* TODO  : here ...*/
-                init_special_inode(inode, inode->i_mode,
-                        ufs_get_inode_dev(sb, ufsi));
        brelse(bh);
-        UFSD(("EXIT\n"))
+        UFSD("EXIT\n");
        return;
+bad_inode:
+        make_bad_inode(inode);
 }
 static int ufs_update_inode(struct inode * inode, int do_sync)
@@ -724,7 +762,7 @@ static int ufs_update_inode(struct inode * inode, int do_sync)
        unsigned i;
        unsigned flags;
-        UFSD(("ENTER, ino %lu\n", inode->i_ino))
+        UFSD("ENTER, ino %lu\n", inode->i_ino);
        sb = inode->i_sb;
        uspi = UFS_SB(sb)->s_uspi;
@@ -785,7 +823,7 @@ static int ufs_update_inode(struct inode * inode, int do_sync)
                sync_dirty_buffer(bh);
        brelse (bh);
        
-        UFSD(("EXIT\n"))
+        UFSD("EXIT\n");
        return 0;
 }
@@ -805,14 +843,17 @@ int ufs_sync_inode (struct inode *inode)
 void ufs_delete_inode (struct inode * inode)
 {
+        loff_t old_i_size;
        truncate_inode_pages(&inode->i_data, 0);
        /*UFS_I(inode)->i_dtime = CURRENT_TIME;*/
        lock_kernel();
        mark_inode_dirty(inode);
        ufs_update_inode(inode, IS_SYNC(inode));
+        old_i_size = inode->i_size;
        inode->i_size = 0;
-        if (inode->i_blocks)
+        if (inode->i_blocks && ufs_truncate(inode, old_i_size))
-                ufs_truncate (inode);
+                ufs_warning(inode->i_sb, __FUNCTION__, "ufs_truncate failed\n");
        ufs_free_inode (inode);
        unlock_kernel();
 }
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index 8d5f98a01c74..abd5f23a426d 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -1,6 +1,9 @@
 /*
 * linux/fs/ufs/namei.c
 *
+ * Migration to usage of "page cache" on May 2006 by
+ * Evgeniy Dushistov <dushistov@mail.ru> based on ext2 code base.
+ *
 * Copyright (C) 1998
 * Daniel Pirkl <daniel.pirkl@email.cz>
 * Charles University, Faculty of Mathematics and Physics
@@ -28,21 +31,9 @@
 #include <linux/fs.h>
 #include <linux/ufs_fs.h>
 #include <linux/smp_lock.h>
-#include <linux/buffer_head.h>
 #include "swab.h"       /* will go away - see comment in mknod() */
 #include "util.h"
-/*
-#undef UFS_NAMEI_DEBUG
-*/
-#define UFS_NAMEI_DEBUG
-#ifdef UFS_NAMEI_DEBUG
-#define UFSD(x) printk("(%s, %d), %s: ", __FILE__, __LINE__, __FUNCTION__); printk x;
-#else
-#define UFSD(x)
-#endif
 static inline int ufs_add_nondir(struct dentry *dentry, struct inode *inode)
 {
        int err = ufs_add_link(dentry, inode);
@@ -88,8 +79,13 @@ static struct dentry *ufs_lookup(struct inode * dir, struct dentry *dentry, stru
 static int ufs_create (struct inode * dir, struct dentry * dentry, int mode,
                struct nameidata *nd)
 {
-        struct inode * inode = ufs_new_inode(dir, mode);
+        struct inode *inode;
-        int err = PTR_ERR(inode);
+        int err;
+        UFSD("BEGIN\n");
+        inode = ufs_new_inode(dir, mode);
+        err = PTR_ERR(inode);
        if (!IS_ERR(inode)) {
                inode->i_op = &ufs_file_inode_operations;
                inode->i_fop = &ufs_file_operations;
@@ -99,6 +95,7 @@ static int ufs_create (struct inode * dir, struct dentry * dentry, int mode,
                err = ufs_add_nondir(dentry, inode);
                unlock_kernel();
        }
+        UFSD("END: err=%d\n", err);
        return err;
 }
@@ -205,6 +202,7 @@ static int ufs_mkdir(struct inode * dir, struct dentry * dentry, int mode)
        inode->i_op = &ufs_dir_inode_operations;
        inode->i_fop = &ufs_dir_operations;
+        inode->i_mapping->a_ops = &ufs_aops;
        inode_inc_link_count(inode);
@@ -231,19 +229,18 @@ out_dir:
        goto out;
 }
-static int ufs_unlink(struct inode * dir, struct dentry *dentry)
+static int ufs_unlink(struct inode *dir, struct dentry *dentry)
 {
        struct inode * inode = dentry->d_inode;
-        struct buffer_head * bh;
+        struct ufs_dir_entry *de;
-        struct ufs_dir_entry * de;
+        struct page *page;
        int err = -ENOENT;
-        lock_kernel();
+        de = ufs_find_entry(dir, dentry, &page);
-        de = ufs_find_entry (dentry, &bh);
        if (!de)
                goto out;
-        err = ufs_delete_entry (dir, de, bh);
+        err = ufs_delete_entry(dir, de, page);
        if (err)
                goto out;
@@ -251,7 +248,6 @@ static int ufs_unlink(struct inode * dir, struct dentry *dentry)
        inode_dec_link_count(inode);
        err = 0;
 out:
-        unlock_kernel();
        return err;
 }
@@ -273,42 +269,42 @@ static int ufs_rmdir (struct inode * dir, struct dentry *dentry)
        return err;
 }
-static int ufs_rename (struct inode * old_dir, struct dentry * old_dentry,
+static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry,
-        struct inode * new_dir, struct dentry * new_dentry )
+                      struct inode *new_dir, struct dentry *new_dentry)
 {
        struct inode *old_inode = old_dentry->d_inode;
        struct inode *new_inode = new_dentry->d_inode;
-        struct buffer_head *dir_bh = NULL;
+        struct page *dir_page = NULL;
-        struct ufs_dir_entry *dir_de = NULL;
+        struct ufs_dir_entry * dir_de = NULL;
-        struct buffer_head *old_bh;
+        struct page *old_page;
        struct ufs_dir_entry *old_de;
        int err = -ENOENT;
-        lock_kernel();
+        old_de = ufs_find_entry(old_dir, old_dentry, &old_page);
-        old_de = ufs_find_entry (old_dentry, &old_bh);
        if (!old_de)
                goto out;
        if (S_ISDIR(old_inode->i_mode)) {
                err = -EIO;
-                dir_de = ufs_dotdot(old_inode, &dir_bh);
+                dir_de = ufs_dotdot(old_inode, &dir_page);
                if (!dir_de)
                        goto out_old;
        }
        if (new_inode) {
-                struct buffer_head *new_bh;
+                struct page *new_page;
                struct ufs_dir_entry *new_de;
                err = -ENOTEMPTY;
-                if (dir_de && !ufs_empty_dir (new_inode))
+                if (dir_de && !ufs_empty_dir(new_inode))
                        goto out_dir;
                err = -ENOENT;
-                new_de = ufs_find_entry (new_dentry, &new_bh);
+                new_de = ufs_find_entry(new_dir, new_dentry, &new_page);
                if (!new_de)
                        goto out_dir;
                inode_inc_link_count(old_inode);
-                ufs_set_link(new_dir, new_de, new_bh, old_inode);
+                ufs_set_link(new_dir, new_de, new_page, old_inode);
                new_inode->i_ctime = CURRENT_TIME_SEC;
                if (dir_de)
                        new_inode->i_nlink--;
@@ -329,24 +325,32 @@ static int ufs_rename (struct inode * old_dir, struct dentry * old_dentry,
                        inode_inc_link_count(new_dir);
        }
-        ufs_delete_entry (old_dir, old_de, old_bh);
+        /*
+         * Like most other Unix systems, set the ctime for inodes on a
+         * rename.
+         * inode_dec_link_count() will mark the inode dirty.
+         */
+        old_inode->i_ctime = CURRENT_TIME_SEC;
+        ufs_delete_entry(old_dir, old_de, old_page);
        inode_dec_link_count(old_inode);
        if (dir_de) {
-                ufs_set_link(old_inode, dir_de, dir_bh, new_dir);
+                ufs_set_link(old_inode, dir_de, dir_page, new_dir);
                inode_dec_link_count(old_dir);
        }
-        unlock_kernel();
        return 0;
 out_dir:
-        if (dir_de)
+        if (dir_de) {
-                brelse(dir_bh);
+                kunmap(dir_page);
+                page_cache_release(dir_page);
+        }
 out_old:
-        brelse (old_bh);
+        kunmap(old_page);
+        page_cache_release(old_page);
 out:
-        unlock_kernel();
        return err;
 }
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index db98a4c71e63..19a99726e58d 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -64,7 +64,6 @@
 */
-#include <linux/config.h>
 #include <linux/module.h>
 #include <linux/bitops.h>
@@ -90,95 +89,84 @@
 #include "swab.h"
 #include "util.h"
-#undef UFS_SUPER_DEBUG
+#ifdef CONFIG_UFS_DEBUG
-#undef UFS_SUPER_DEBUG_MORE
-#undef UFS_SUPER_DEBUG_MORE
-#ifdef UFS_SUPER_DEBUG
-#define UFSD(x) printk("(%s, %d), %s: ", __FILE__, __LINE__, __FUNCTION__); printk x;
-#else
-#define UFSD(x)
-#endif
-#ifdef UFS_SUPER_DEBUG_MORE
 /*
 * Print contents of ufs_super_block, useful for debugging
 */
-void ufs_print_super_stuff(struct super_block *sb,
+static void ufs_print_super_stuff(struct super_block *sb, unsigned flags,
-        struct ufs_super_block_first * usb1,
+                                  struct ufs_super_block_first *usb1,
-        struct ufs_super_block_second * usb2, 
+                                  struct ufs_super_block_second *usb2,
-        struct ufs_super_block_third * usb3)
+                                  struct ufs_super_block_third *usb3)
 {
        printk("ufs_print_super_stuff\n");
-        printk("size of usb:     %u\n", sizeof(struct ufs_super_block));
+        printk("  magic:     0x%x\n", fs32_to_cpu(sb, usb3->fs_magic));
-        printk("  magic:         0x%x\n", fs32_to_cpu(sb, usb3->fs_magic));
+        if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) {
-        printk("  sblkno:        %u\n", fs32_to_cpu(sb, usb1->fs_sblkno));
+                printk("  fs_size:   %llu\n", (unsigned long long)
-        printk("  cblkno:        %u\n", fs32_to_cpu(sb, usb1->fs_cblkno));
+                       fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_size));
-        printk("  iblkno:        %u\n", fs32_to_cpu(sb, usb1->fs_iblkno));
+                printk("  fs_dsize:  %llu\n", (unsigned long long)
-        printk("  dblkno:        %u\n", fs32_to_cpu(sb, usb1->fs_dblkno));
+                       fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_dsize));
-        printk("  cgoffset:      %u\n", fs32_to_cpu(sb, usb1->fs_cgoffset));
+                printk("  bsize:         %u\n",
-        printk("  ~cgmask:       0x%x\n", ~fs32_to_cpu(sb, usb1->fs_cgmask));
+                       fs32_to_cpu(sb, usb1->fs_bsize));
-        printk("  size:          %u\n", fs32_to_cpu(sb, usb1->fs_size));
+                printk("  fsize:         %u\n",
-        printk("  dsize:         %u\n", fs32_to_cpu(sb, usb1->fs_dsize));
+                       fs32_to_cpu(sb, usb1->fs_fsize));
-        printk("  ncg:           %u\n", fs32_to_cpu(sb, usb1->fs_ncg));
+                printk("  fs_volname:  %s\n", usb2->fs_un.fs_u2.fs_volname);
-        printk("  bsize:         %u\n", fs32_to_cpu(sb, usb1->fs_bsize));
+                printk("  fs_sblockloc: %llu\n", (unsigned long long)
-        printk("  fsize:         %u\n", fs32_to_cpu(sb, usb1->fs_fsize));
+                       fs64_to_cpu(sb, usb2->fs_un.fs_u2.fs_sblockloc));
-        printk("  frag:          %u\n", fs32_to_cpu(sb, usb1->fs_frag));
+                printk("  cs_ndir(No of dirs):  %llu\n", (unsigned long long)
-        printk("  fragshift:     %u\n", fs32_to_cpu(sb, usb1->fs_fragshift));
+                       fs64_to_cpu(sb, usb2->fs_un.fs_u2.cs_ndir));
-        printk("  ~fmask:        %u\n", ~fs32_to_cpu(sb, usb1->fs_fmask));
+                printk("  cs_nbfree(No of free blocks):  %llu\n",
-        printk("  fshift:        %u\n", fs32_to_cpu(sb, usb1->fs_fshift));
+                       (unsigned long long)
-        printk("  sbsize:        %u\n", fs32_to_cpu(sb, usb1->fs_sbsize));
+                       fs64_to_cpu(sb, usb2->fs_un.fs_u2.cs_nbfree));
-        printk("  spc:           %u\n", fs32_to_cpu(sb, usb1->fs_spc));
+        } else {
-        printk("  cpg:           %u\n", fs32_to_cpu(sb, usb1->fs_cpg));
+                printk(" sblkno:      %u\n", fs32_to_cpu(sb, usb1->fs_sblkno));
-        printk("  ipg:           %u\n", fs32_to_cpu(sb, usb1->fs_ipg));
+                printk(" cblkno:      %u\n", fs32_to_cpu(sb, usb1->fs_cblkno));
-        printk("  fpg:           %u\n", fs32_to_cpu(sb, usb1->fs_fpg));
+                printk(" iblkno:      %u\n", fs32_to_cpu(sb, usb1->fs_iblkno));
-        printk("  csaddr:        %u\n", fs32_to_cpu(sb, usb1->fs_csaddr));
+                printk(" dblkno:      %u\n", fs32_to_cpu(sb, usb1->fs_dblkno));
-        printk("  cssize:        %u\n", fs32_to_cpu(sb, usb1->fs_cssize));
+                printk(" cgoffset:    %u\n",
-        printk("  cgsize:        %u\n", fs32_to_cpu(sb, usb1->fs_cgsize));
+                       fs32_to_cpu(sb, usb1->fs_cgoffset));
-        printk("  fstodb:        %u\n", fs32_to_cpu(sb, usb1->fs_fsbtodb));
+                printk(" ~cgmask:     0x%x\n",
-        printk("  contigsumsize: %d\n", fs32_to_cpu(sb, usb3->fs_u2.fs_44.fs_contigsumsize));
+                       ~fs32_to_cpu(sb, usb1->fs_cgmask));
-        printk("  postblformat:  %u\n", fs32_to_cpu(sb, usb3->fs_postblformat));
+                printk(" size:        %u\n", fs32_to_cpu(sb, usb1->fs_size));
-        printk("  nrpos:         %u\n", fs32_to_cpu(sb, usb3->fs_nrpos));
+                printk(" dsize:       %u\n", fs32_to_cpu(sb, usb1->fs_dsize));
-        printk("  ndir           %u\n", fs32_to_cpu(sb, usb1->fs_cstotal.cs_ndir));
+                printk(" ncg:         %u\n", fs32_to_cpu(sb, usb1->fs_ncg));
-        printk("  nifree         %u\n", fs32_to_cpu(sb, usb1->fs_cstotal.cs_nifree));
+                printk(" bsize:       %u\n", fs32_to_cpu(sb, usb1->fs_bsize));
-        printk("  nbfree         %u\n", fs32_to_cpu(sb, usb1->fs_cstotal.cs_nbfree));
+                printk(" fsize:       %u\n", fs32_to_cpu(sb, usb1->fs_fsize));
-        printk("  nffree         %u\n", fs32_to_cpu(sb, usb1->fs_cstotal.cs_nffree));
+                printk(" frag:        %u\n", fs32_to_cpu(sb, usb1->fs_frag));
-        printk("\n");
+                printk(" fragshift:   %u\n",
-}
+                       fs32_to_cpu(sb, usb1->fs_fragshift));
+                printk(" ~fmask:      %u\n", ~fs32_to_cpu(sb, usb1->fs_fmask));
-/*
+                printk(" fshift:      %u\n", fs32_to_cpu(sb, usb1->fs_fshift));
- * Print contents of ufs2 ufs_super_block, useful for debugging
+                printk(" sbsize:      %u\n", fs32_to_cpu(sb, usb1->fs_sbsize));
- */
+                printk(" spc:         %u\n", fs32_to_cpu(sb, usb1->fs_spc));
-void ufs2_print_super_stuff(
+                printk(" cpg:         %u\n", fs32_to_cpu(sb, usb1->fs_cpg));
-     struct super_block *sb,
+                printk(" ipg:         %u\n", fs32_to_cpu(sb, usb1->fs_ipg));
-      struct ufs_super_block *usb)
+                printk(" fpg:         %u\n", fs32_to_cpu(sb, usb1->fs_fpg));
-{
+                printk(" csaddr:      %u\n", fs32_to_cpu(sb, usb1->fs_csaddr));
-        printk("ufs_print_super_stuff\n");
+                printk(" cssize:      %u\n", fs32_to_cpu(sb, usb1->fs_cssize));
-        printk("size of usb:     %u\n", sizeof(struct ufs_super_block));
+                printk(" cgsize:      %u\n", fs32_to_cpu(sb, usb1->fs_cgsize));
-        printk("  magic:         0x%x\n", fs32_to_cpu(sb, usb->fs_magic));
+                printk(" fstodb:      %u\n",
-        printk("  fs_size:   %u\n",fs64_to_cpu(sb, usb->fs_u11.fs_u2.fs_size));
+                       fs32_to_cpu(sb, usb1->fs_fsbtodb));
-        printk("  fs_dsize:  %u\n",fs64_to_cpu(sb, usb->fs_u11.fs_u2.fs_dsize));
+                printk(" nrpos:       %u\n", fs32_to_cpu(sb, usb3->fs_nrpos));
-        printk("  bsize:         %u\n", fs32_to_cpu(usb, usb->fs_bsize));
+                printk(" ndir         %u\n",
-        printk("  fsize:         %u\n", fs32_to_cpu(usb, usb->fs_fsize));
+                       fs32_to_cpu(sb, usb1->fs_cstotal.cs_ndir));
-        printk("  fs_volname:  %s\n", usb->fs_u11.fs_u2.fs_volname);
+                printk(" nifree       %u\n",
-        printk("  fs_fsmnt:  %s\n", usb->fs_u11.fs_u2.fs_fsmnt);
+                       fs32_to_cpu(sb, usb1->fs_cstotal.cs_nifree));
-        printk("  fs_sblockloc: %u\n",fs64_to_cpu(sb,
+                printk(" nbfree       %u\n",
-                        usb->fs_u11.fs_u2.fs_sblockloc));
+                       fs32_to_cpu(sb, usb1->fs_cstotal.cs_nbfree));
-        printk("  cs_ndir(No of dirs):  %u\n",fs64_to_cpu(sb,
+                printk(" nffree       %u\n",
-                        usb->fs_u11.fs_u2.fs_cstotal.cs_ndir));
+                       fs32_to_cpu(sb, usb1->fs_cstotal.cs_nffree));
-        printk("  cs_nbfree(No of free blocks):  %u\n",fs64_to_cpu(sb,
+        }
-                        usb->fs_u11.fs_u2.fs_cstotal.cs_nbfree));
        printk("\n");
 }
 /*
 * Print contents of ufs_cylinder_group, useful for debugging
 */
-void ufs_print_cylinder_stuff(struct super_block *sb, struct ufs_cylinder_group *cg)
+static void ufs_print_cylinder_stuff(struct super_block *sb,
+                                     struct ufs_cylinder_group *cg)
 {
        printk("\nufs_print_cylinder_stuff\n");
-        printk("size of ucg: %u\n", sizeof(struct ufs_cylinder_group));
+        printk("size of ucg: %zu\n", sizeof(struct ufs_cylinder_group));
        printk("  magic:        %x\n", fs32_to_cpu(sb, cg->cg_magic));
        printk("  time:         %u\n", fs32_to_cpu(sb, cg->cg_time));
        printk("  cgx:          %u\n", fs32_to_cpu(sb, cg->cg_cgx));
@@ -202,12 +190,18 @@ void ufs_print_cylinder_stuff(struct super_block *sb, struct ufs_cylinder_group
        printk("  iuseoff:      %u\n", fs32_to_cpu(sb, cg->cg_iusedoff));
        printk("  freeoff:      %u\n", fs32_to_cpu(sb, cg->cg_freeoff));
        printk("  nextfreeoff:  %u\n", fs32_to_cpu(sb, cg->cg_nextfreeoff));
-        printk("  clustersumoff %u\n", fs32_to_cpu(sb, cg->cg_u.cg_44.cg_clustersumoff));
+        printk("  clustersumoff %u\n",
-        printk("  clusteroff    %u\n", fs32_to_cpu(sb, cg->cg_u.cg_44.cg_clusteroff));
+               fs32_to_cpu(sb, cg->cg_u.cg_44.cg_clustersumoff));
-        printk("  nclusterblks  %u\n", fs32_to_cpu(sb, cg->cg_u.cg_44.cg_nclusterblks));
+        printk("  clusteroff    %u\n",
+               fs32_to_cpu(sb, cg->cg_u.cg_44.cg_clusteroff));
+        printk("  nclusterblks  %u\n",
+               fs32_to_cpu(sb, cg->cg_u.cg_44.cg_nclusterblks));
        printk("\n");
 }
-#endif /* UFS_SUPER_DEBUG_MORE */
+#else
+#  define ufs_print_super_stuff(sb, flags, usb1, usb2, usb3) /**/
+#  define ufs_print_cylinder_stuff(sb, cg) /**/
+#endif /* CONFIG_UFS_DEBUG */
 static struct super_operations ufs_super_ops;
@@ -225,7 +219,7 @@ void ufs_error (struct super_block * sb, const char * function,
        
        if (!(sb->s_flags & MS_RDONLY)) {
                usb1->fs_clean = UFS_FSBAD;
-                ubh_mark_buffer_dirty(USPI_UBH);
+                ubh_mark_buffer_dirty(USPI_UBH(uspi));
                sb->s_dirt = 1;
                sb->s_flags |= MS_RDONLY;
        }
@@ -257,7 +251,7 @@ void ufs_panic (struct super_block * sb, const char * function,
        
        if (!(sb->s_flags & MS_RDONLY)) {
                usb1->fs_clean = UFS_FSBAD;
-                ubh_mark_buffer_dirty(USPI_UBH);
+                ubh_mark_buffer_dirty(USPI_UBH(uspi));
                sb->s_dirt = 1;
        }
        va_start (args, fmt);
@@ -309,7 +303,7 @@ static int ufs_parse_options (char * options, unsigned * mount_options)
 {
        char * p;
        
-        UFSD(("ENTER\n"))
+        UFSD("ENTER\n");
        
        if (!options)
                return 1;
@@ -386,27 +380,57 @@ static int ufs_parse_options (char * options, unsigned * mount_options)
 }
 /*
+ * Different types of UFS hold fs_cstotal in different
+ * places, and use different data structures for it.
+ * To make things simpler we just copy fs_cstotal to ufs_sb_private_info
+ */
+static void ufs_setup_cstotal(struct super_block *sb)
+{
+        struct ufs_sb_info *sbi = UFS_SB(sb);
+        struct ufs_sb_private_info *uspi = sbi->s_uspi;
+        struct ufs_super_block_first *usb1;
+        struct ufs_super_block_second *usb2;
+        struct ufs_super_block_third *usb3;
+        unsigned mtype = sbi->s_mount_opt & UFS_MOUNT_UFSTYPE;
+        UFSD("ENTER, mtype=%u\n", mtype);
+        usb1 = ubh_get_usb_first(uspi);
+        usb2 = ubh_get_usb_second(uspi);
+        usb3 = ubh_get_usb_third(uspi);
+        if ((mtype == UFS_MOUNT_UFSTYPE_44BSD &&
+             (usb1->fs_flags & UFS_FLAGS_UPDATED)) ||
+            mtype == UFS_MOUNT_UFSTYPE_UFS2) {
+                /* 64-bit statistics are read from usb2/usb3 here, not from usb1->fs_cstotal */
+                uspi->cs_total.cs_ndir = fs64_to_cpu(sb, usb2->fs_un.fs_u2.cs_ndir);
+                uspi->cs_total.cs_nbfree = fs64_to_cpu(sb, usb2->fs_un.fs_u2.cs_nbfree);
+                uspi->cs_total.cs_nifree = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.cs_nifree);
+                uspi->cs_total.cs_nffree = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.cs_nffree);
+        } else {
+                uspi->cs_total.cs_ndir = fs32_to_cpu(sb, usb1->fs_cstotal.cs_ndir);
+                uspi->cs_total.cs_nbfree = fs32_to_cpu(sb, usb1->fs_cstotal.cs_nbfree);
+                uspi->cs_total.cs_nifree = fs32_to_cpu(sb, usb1->fs_cstotal.cs_nifree);
+                uspi->cs_total.cs_nffree = fs32_to_cpu(sb, usb1->fs_cstotal.cs_nffree);
+        }
+        UFSD("EXIT\n");
+}
+/*
 * Read on-disk structures associated with cylinder groups
 */
-static int ufs_read_cylinder_structures (struct super_block *sb)
+static int ufs_read_cylinder_structures(struct super_block *sb)
 {
-        struct ufs_sb_info * sbi = UFS_SB(sb);
+        struct ufs_sb_info *sbi = UFS_SB(sb);
-        struct ufs_sb_private_info * uspi;
+        struct ufs_sb_private_info *uspi = sbi->s_uspi;
-        struct ufs_super_block *usb;
+        unsigned flags = sbi->s_flags;
        struct ufs_buffer_head * ubh;
        unsigned char * base, * space;
        unsigned size, blks, i;
-        unsigned flags = 0;
+        struct ufs_super_block_third *usb3;
-        
-        UFSD(("ENTER\n"))
-        
-        uspi = sbi->s_uspi;
-        usb  = (struct ufs_super_block *)
+        UFSD("ENTER\n");
-                ((struct ufs_buffer_head *)uspi)->bh[0]->b_data;
-        flags = UFS_SB(sb)->s_flags;
+        usb3 = ubh_get_usb_third(uspi);
-        
        /*
         * Read cs structures from (usually) first data block
         * on the device. 
@@ -424,7 +448,7 @@ static int ufs_read_cylinder_structures (struct super_block *sb)
                if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) 
                        ubh = ubh_bread(sb,
-                                fs64_to_cpu(sb, usb->fs_u11.fs_u2.fs_csaddr) + i, size);
+                                fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_csaddr) + i, size);
                else 
                        ubh = ubh_bread(sb, uspi->s_csaddr + i, size);
                
@@ -451,14 +475,13 @@ static int ufs_read_cylinder_structures (struct super_block *sb)
                sbi->s_cgno[i] = UFS_CGNO_EMPTY;
        }
        for (i = 0; i < uspi->s_ncg; i++) {
-                UFSD(("read cg %u\n", i))
+                UFSD("read cg %u\n", i);
                if (!(sbi->s_ucg[i] = sb_bread(sb, ufs_cgcmin(i))))
                        goto failed;
                if (!ufs_cg_chkmagic (sb, (struct ufs_cylinder_group *) sbi->s_ucg[i]->b_data))
                        goto failed;
-#ifdef UFS_SUPER_DEBUG_MORE
                ufs_print_cylinder_stuff(sb, (struct ufs_cylinder_group *) sbi->s_ucg[i]->b_data);
-#endif
        }
        for (i = 0; i < UFS_MAX_GROUP_LOADED; i++) {
                if (!(sbi->s_ucpi[i] = kmalloc (sizeof(struct ufs_cg_private_info), GFP_KERNEL)))
@@ -466,7 +489,7 @@ static int ufs_read_cylinder_structures (struct super_block *sb)
                sbi->s_cgno[i] = UFS_CGNO_EMPTY;
        }
        sbi->s_cg_loaded = 0;
-        UFSD(("EXIT\n"))
+        UFSD("EXIT\n");
        return 1;
 failed:
@@ -479,26 +502,69 @@ failed:
                for (i = 0; i < UFS_MAX_GROUP_LOADED; i++)
                        kfree (sbi->s_ucpi[i]);
        }
-        UFSD(("EXIT (FAILED)\n"))
+        UFSD("EXIT (FAILED)\n");
        return 0;
 }
 /*
- * Put on-disk structures associated with cylinder groups and 
+ * Sync our internal copy of fs_cstotal with disk
- * write them back to disk
 */
-static void ufs_put_cylinder_structures (struct super_block *sb)
+static void ufs_put_cstotal(struct super_block *sb)
 {
-        struct ufs_sb_info * sbi = UFS_SB(sb);
+        unsigned mtype = UFS_SB(sb)->s_mount_opt & UFS_MOUNT_UFSTYPE;
-        struct ufs_sb_private_info * uspi;
+        struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
+        struct ufs_super_block_first *usb1;
+        struct ufs_super_block_second *usb2;
+        struct ufs_super_block_third *usb3;
+        UFSD("ENTER\n");
+        usb1 = ubh_get_usb_first(uspi);
+        usb2 = ubh_get_usb_second(uspi);
+        usb3 = ubh_get_usb_third(uspi);
+        if ((mtype == UFS_MOUNT_UFSTYPE_44BSD &&
+             (usb1->fs_flags & UFS_FLAGS_UPDATED)) ||
+            mtype == UFS_MOUNT_UFSTYPE_UFS2) {
+                /* statistics are kept in a different place than usual */
+                usb2->fs_un.fs_u2.cs_ndir =
+                        cpu_to_fs64(sb, uspi->cs_total.cs_ndir);
+                usb2->fs_un.fs_u2.cs_nbfree =
+                        cpu_to_fs64(sb, uspi->cs_total.cs_nbfree);
+                usb3->fs_un1.fs_u2.cs_nifree =
+                        cpu_to_fs64(sb, uspi->cs_total.cs_nifree);
+                usb3->fs_un1.fs_u2.cs_nffree =
+                        cpu_to_fs64(sb, uspi->cs_total.cs_nffree);
+        } else {
+                usb1->fs_cstotal.cs_ndir =
+                        cpu_to_fs32(sb, uspi->cs_total.cs_ndir);
+                usb1->fs_cstotal.cs_nbfree =
+                        cpu_to_fs32(sb, uspi->cs_total.cs_nbfree);
+                usb1->fs_cstotal.cs_nifree =
+                        cpu_to_fs32(sb, uspi->cs_total.cs_nifree);
+                usb1->fs_cstotal.cs_nffree =
+                        cpu_to_fs32(sb, uspi->cs_total.cs_nffree);
+        }
+        ubh_mark_buffer_dirty(USPI_UBH(uspi));
+        UFSD("EXIT\n");
+}
+/**
+ * ufs_put_super_internal() - put on-disk internal structures
+ * @sb: pointer to super_block structure
+ * Put on-disk structures associated with cylinder groups
+ * and write them back to disk, also update cs_total on disk
+ */
+static void ufs_put_super_internal(struct super_block *sb)
+{
+        struct ufs_sb_info *sbi = UFS_SB(sb);
+        struct ufs_sb_private_info *uspi = sbi->s_uspi;
        struct ufs_buffer_head * ubh;
        unsigned char * base, * space;
        unsigned blks, size, i;
-        
-        UFSD(("ENTER\n"))
-        
-        uspi = sbi->s_uspi;
+        
+        UFSD("ENTER\n");
+        ufs_put_cstotal(sb);
        size = uspi->s_cssize;
        blks = (size + uspi->s_fsize - 1) >> uspi->s_fshift;
        base = space = (char*) sbi->s_csp;
@@ -523,7 +589,7 @@ static void ufs_put_cylinder_structures (struct super_block *sb)
                brelse (sbi->s_ucg[i]);
        kfree (sbi->s_ucg);
        kfree (base);
-        UFSD(("EXIT\n"))
+        UFSD("EXIT\n");
 }
 static int ufs_fill_super(struct super_block *sb, void *data, int silent)
@@ -533,7 +599,6 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
        struct ufs_super_block_first * usb1;
        struct ufs_super_block_second * usb2;
        struct ufs_super_block_third * usb3;
-        struct ufs_super_block *usb;
        struct ufs_buffer_head * ubh;   
        struct inode *inode;
        unsigned block_size, super_block_size;
@@ -544,7 +609,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
        ubh = NULL;
        flags = 0;
        
-        UFSD(("ENTER\n"))
+        UFSD("ENTER\n");
                
        sbi = kmalloc(sizeof(struct ufs_sb_info), GFP_KERNEL);
        if (!sbi)
@@ -552,7 +617,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
        sb->s_fs_info = sbi;
        memset(sbi, 0, sizeof(struct ufs_sb_info));
-        UFSD(("flag %u\n", (int)(sb->s_flags & MS_RDONLY)))
+        UFSD("flag %u\n", (int)(sb->s_flags & MS_RDONLY));
        
 #ifndef CONFIG_UFS_FS_WRITE
        if (!(sb->s_flags & MS_RDONLY)) {
@@ -593,7 +658,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
           the rules */
        switch (sbi->s_mount_opt & UFS_MOUNT_UFSTYPE) {
        case UFS_MOUNT_UFSTYPE_44BSD:
-                UFSD(("ufstype=44bsd\n"))
+                UFSD("ufstype=44bsd\n");
                uspi->s_fsize = block_size = 512;
                uspi->s_fmask = ~(512 - 1);
                uspi->s_fshift = 9;
@@ -602,7 +667,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
                flags |= UFS_DE_44BSD | UFS_UID_44BSD | UFS_ST_44BSD | UFS_CG_44BSD;
                break;
        case UFS_MOUNT_UFSTYPE_UFS2:
-                UFSD(("ufstype=ufs2\n"));
+                UFSD("ufstype=ufs2\n");
                super_block_offset=SBLOCK_UFS2;
                uspi->s_fsize = block_size = 512;
                uspi->s_fmask = ~(512 - 1);
@@ -617,7 +682,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
                break;
                
        case UFS_MOUNT_UFSTYPE_SUN:
-                UFSD(("ufstype=sun\n"))
+                UFSD("ufstype=sun\n");
                uspi->s_fsize = block_size = 1024;
                uspi->s_fmask = ~(1024 - 1);
                uspi->s_fshift = 10;
@@ -628,7 +693,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
                break;
        case UFS_MOUNT_UFSTYPE_SUNx86:
-                UFSD(("ufstype=sunx86\n"))
+                UFSD("ufstype=sunx86\n");
                uspi->s_fsize = block_size = 1024;
                uspi->s_fmask = ~(1024 - 1);
                uspi->s_fshift = 10;
@@ -639,7 +704,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
                break;
        case UFS_MOUNT_UFSTYPE_OLD:
-                UFSD(("ufstype=old\n"))
+                UFSD("ufstype=old\n");
                uspi->s_fsize = block_size = 1024;
                uspi->s_fmask = ~(1024 - 1);
                uspi->s_fshift = 10;
@@ -654,7 +719,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
                break;
        
        case UFS_MOUNT_UFSTYPE_NEXTSTEP:
-                UFSD(("ufstype=nextstep\n"))
+                UFSD("ufstype=nextstep\n");
                uspi->s_fsize = block_size = 1024;
                uspi->s_fmask = ~(1024 - 1);
                uspi->s_fshift = 10;
@@ -669,7 +734,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
                break;
        
        case UFS_MOUNT_UFSTYPE_NEXTSTEP_CD:
-                UFSD(("ufstype=nextstep-cd\n"))
+                UFSD("ufstype=nextstep-cd\n");
                uspi->s_fsize = block_size = 2048;
                uspi->s_fmask = ~(2048 - 1);
                uspi->s_fshift = 11;
@@ -684,7 +749,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
                break;
        
        case UFS_MOUNT_UFSTYPE_OPENSTEP:
-                UFSD(("ufstype=openstep\n"))
+                UFSD("ufstype=openstep\n");
                uspi->s_fsize = block_size = 1024;
                uspi->s_fmask = ~(1024 - 1);
                uspi->s_fshift = 10;
@@ -699,7 +764,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
                break;
        
        case UFS_MOUNT_UFSTYPE_HP:
-                UFSD(("ufstype=hp\n"))
+                UFSD("ufstype=hp\n");
                uspi->s_fsize = block_size = 1024;
                uspi->s_fmask = ~(1024 - 1);
                uspi->s_fshift = 10;
@@ -737,8 +802,6 @@ again:
        usb1 = ubh_get_usb_first(uspi);
        usb2 = ubh_get_usb_second(uspi);
        usb3 = ubh_get_usb_third(uspi);
-        usb  = (struct ufs_super_block *)
-                ((struct ufs_buffer_head *)uspi)->bh[0]->b_data ;
        /*
         * Check ufs magic number
@@ -820,16 +883,12 @@ magic_found:
                ubh = NULL;
                block_size = uspi->s_fsize;
                super_block_size = uspi->s_sbsize;
-                UFSD(("another value of block_size or super_block_size %u, %u\n", block_size, super_block_size))
+                UFSD("another value of block_size or super_block_size %u, %u\n", block_size, super_block_size);
                goto again;
        }
-#ifdef UFS_SUPER_DEBUG_MORE
-        if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2)
+        ufs_print_super_stuff(sb, flags, usb1, usb2, usb3);
-                ufs2_print_super_stuff(sb,usb);
-        else
-                ufs_print_super_stuff(sb, usb1, usb2, usb3);
-#endif
        /*
         * Check, if file system was correctly unmounted.
@@ -842,13 +901,13 @@ magic_found:
          (ufs_get_fs_state(sb, usb1, usb3) == (UFS_FSOK - fs32_to_cpu(sb, usb1->fs_time))))) {
                switch(usb1->fs_clean) {
                case UFS_FSCLEAN:
-                        UFSD(("fs is clean\n"))
+                        UFSD("fs is clean\n");
                        break;
                case UFS_FSSTABLE:
-                        UFSD(("fs is stable\n"))
+                        UFSD("fs is stable\n");
                        break;
                case UFS_FSOSF1:
-                        UFSD(("fs is DEC OSF/1\n"))
+                        UFSD("fs is DEC OSF/1\n");
                        break;
                case UFS_FSACTIVE:
                        printk("ufs_read_super: fs is active\n");
@@ -863,8 +922,7 @@ magic_found:
                        sb->s_flags |= MS_RDONLY;
                        break;
                }
-        }
+        } else {
-        else {
                printk("ufs_read_super: fs needs fsck\n");
                sb->s_flags |= MS_RDONLY;
        }
@@ -884,10 +942,9 @@ magic_found:
        uspi->s_cgmask = fs32_to_cpu(sb, usb1->fs_cgmask);
        if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) {
-                uspi->s_u2_size  = fs64_to_cpu(sb, usb->fs_u11.fs_u2.fs_size);
+                uspi->s_u2_size  = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_size);
-                uspi->s_u2_dsize = fs64_to_cpu(sb, usb->fs_u11.fs_u2.fs_dsize);
+                uspi->s_u2_dsize = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_dsize);
-        }
+        } else {
-        else {
                uspi->s_size  =  fs32_to_cpu(sb, usb1->fs_size);
                uspi->s_dsize =  fs32_to_cpu(sb, usb1->fs_dsize);
        }
@@ -901,8 +958,8 @@ magic_found:
        uspi->s_fmask = fs32_to_cpu(sb, usb1->fs_fmask);
        uspi->s_bshift = fs32_to_cpu(sb, usb1->fs_bshift);
        uspi->s_fshift = fs32_to_cpu(sb, usb1->fs_fshift);
-        UFSD(("uspi->s_bshift = %d,uspi->s_fshift = %d", uspi->s_bshift,
+        UFSD("uspi->s_bshift = %d,uspi->s_fshift = %d", uspi->s_bshift,
-                uspi->s_fshift));
+                uspi->s_fshift);
        uspi->s_fpbshift = fs32_to_cpu(sb, usb1->fs_fragshift);
        uspi->s_fsbtodb = fs32_to_cpu(sb, usb1->fs_fsbtodb);
        /* s_sbsize already set */
@@ -922,8 +979,8 @@ magic_found:
        uspi->s_spc = fs32_to_cpu(sb, usb1->fs_spc);
        uspi->s_ipg = fs32_to_cpu(sb, usb1->fs_ipg);
        uspi->s_fpg = fs32_to_cpu(sb, usb1->fs_fpg);
-        uspi->s_cpc = fs32_to_cpu(sb, usb2->fs_cpc);
+        uspi->s_cpc = fs32_to_cpu(sb, usb2->fs_un.fs_u1.fs_cpc);
-        uspi->s_contigsumsize = fs32_to_cpu(sb, usb3->fs_u2.fs_44.fs_contigsumsize);
+        uspi->s_contigsumsize = fs32_to_cpu(sb, usb3->fs_un2.fs_44.fs_contigsumsize);
        uspi->s_qbmask = ufs_get_fs_qbmask(sb, usb3);
        uspi->s_qfmask = ufs_get_fs_qfmask(sb, usb3);
        uspi->s_postblformat = fs32_to_cpu(sb, usb3->fs_postblformat);
@@ -935,12 +992,11 @@ magic_found:
          * Compute other frequently used values
         */
        uspi->s_fpbmask = uspi->s_fpb - 1;
-        if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) {
+        if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2)
                uspi->s_apbshift = uspi->s_bshift - 3;
-        }
+        else
-        else {
                uspi->s_apbshift = uspi->s_bshift - 2;
-        }
        uspi->s_2apbshift = uspi->s_apbshift * 2;
        uspi->s_3apbshift = uspi->s_apbshift * 3;
        uspi->s_apb = 1 << uspi->s_apbshift;
@@ -956,7 +1012,7 @@ magic_found:
        if ((sbi->s_mount_opt & UFS_MOUNT_UFSTYPE) ==
            UFS_MOUNT_UFSTYPE_44BSD)
                uspi->s_maxsymlinklen =
-                    fs32_to_cpu(sb, usb3->fs_u2.fs_44.fs_maxsymlinklen);
+                    fs32_to_cpu(sb, usb3->fs_un2.fs_44.fs_maxsymlinklen);
        
        sbi->s_flags = flags;
@@ -967,7 +1023,7 @@ magic_found:
        if (!sb->s_root)
                goto dalloc_failed;
+        ufs_setup_cstotal(sb);
        /*
         * Read cylinder group structures
         */
@@ -975,7 +1031,7 @@ magic_found:
                if (!ufs_read_cylinder_structures(sb))
                        goto failed;
-        UFSD(("EXIT\n"))
+        UFSD("EXIT\n");
        return 0;
 dalloc_failed:
@@ -986,15 +1042,16 @@ failed:
        kfree (uspi);
        kfree(sbi);
        sb->s_fs_info = NULL;
-        UFSD(("EXIT (FAILED)\n"))
+        UFSD("EXIT (FAILED)\n");
        return -EINVAL;
 failed_nomem:
-        UFSD(("EXIT (NOMEM)\n"))
+        UFSD("EXIT (NOMEM)\n");
        return -ENOMEM;
 }
-static void ufs_write_super (struct super_block *sb) {
+static void ufs_write_super(struct super_block *sb)
+{
        struct ufs_sb_private_info * uspi;
        struct ufs_super_block_first * usb1;
        struct ufs_super_block_third * usb3;
@@ -1002,7 +1059,7 @@ static void ufs_write_super (struct super_block *sb) {
        lock_kernel();
-        UFSD(("ENTER\n"))
+        UFSD("ENTER\n");
        flags = UFS_SB(sb)->s_flags;
        uspi = UFS_SB(sb)->s_uspi;
        usb1 = ubh_get_usb_first(uspi);
@@ -1014,26 +1071,27 @@ static void ufs_write_super (struct super_block *sb) {
                  || (flags & UFS_ST_MASK) == UFS_ST_SUNx86)
                        ufs_set_fs_state(sb, usb1, usb3,
                                        UFS_FSOK - fs32_to_cpu(sb, usb1->fs_time));
-                ubh_mark_buffer_dirty (USPI_UBH);
+                ufs_put_cstotal(sb);
        }
        sb->s_dirt = 0;
-        UFSD(("EXIT\n"))
+        UFSD("EXIT\n");
        unlock_kernel();
 }
-static void ufs_put_super (struct super_block *sb)
+static void ufs_put_super(struct super_block *sb)
 {
        struct ufs_sb_info * sbi = UFS_SB(sb);
                
-        UFSD(("ENTER\n"))
+        UFSD("ENTER\n");
        if (!(sb->s_flags & MS_RDONLY))
-                ufs_put_cylinder_structures (sb);
+                ufs_put_super_internal(sb);
        
        ubh_brelse_uspi (sbi->s_uspi);
        kfree (sbi->s_uspi);
        kfree (sbi);
        sb->s_fs_info = NULL;
+        UFSD("EXIT\n");
        return;
 }
@@ -1062,8 +1120,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
                return -EINVAL;
        if (!(new_mount_opt & UFS_MOUNT_UFSTYPE)) {
                new_mount_opt |= ufstype;
-        }
+        } else if ((new_mount_opt & UFS_MOUNT_UFSTYPE) != ufstype) {
-        else if ((new_mount_opt & UFS_MOUNT_UFSTYPE) != ufstype) {
                printk("ufstype can't be changed during remount\n");
                return -EINVAL;
        }
@@ -1077,20 +1134,19 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
          * fs was mounted as rw, remounting ro
         */
        if (*mount_flags & MS_RDONLY) {
-                ufs_put_cylinder_structures(sb);
+                ufs_put_super_internal(sb);
                usb1->fs_time = cpu_to_fs32(sb, get_seconds());
                if ((flags & UFS_ST_MASK) == UFS_ST_SUN
                  || (flags & UFS_ST_MASK) == UFS_ST_SUNx86) 
                        ufs_set_fs_state(sb, usb1, usb3,
                                UFS_FSOK - fs32_to_cpu(sb, usb1->fs_time));
-                ubh_mark_buffer_dirty (USPI_UBH);
+                ubh_mark_buffer_dirty (USPI_UBH(uspi));
                sb->s_dirt = 0;
                sb->s_flags |= MS_RDONLY;
-        }
+        } else {
        /*
         * fs was mounted as ro, remounting rw
         */
-        else {
 #ifndef CONFIG_UFS_FS_WRITE
                printk("ufs was compiled with read-only support, "
                "can't be mounted as read-write\n");
@@ -1102,7 +1158,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
                        printk("this ufstype is read-only supported\n");
                        return -EINVAL;
                }
-                if (!ufs_read_cylinder_structures (sb)) {
+                if (!ufs_read_cylinder_structures(sb)) {
                        printk("failed during remounting\n");
                        return -EPERM;
                }
@@ -1113,36 +1169,31 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
        return 0;
 }
-static int ufs_statfs (struct super_block *sb, struct kstatfs *buf)
+static int ufs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
-        struct ufs_sb_private_info * uspi;
+        struct super_block *sb = dentry->d_sb;
-        struct ufs_super_block_first * usb1;
+        struct ufs_sb_private_info *uspi= UFS_SB(sb)->s_uspi;
-        struct ufs_super_block * usb;
+        unsigned  flags = UFS_SB(sb)->s_flags;
-        unsigned  flags = 0;
+        struct ufs_super_block_first *usb1;
+        struct ufs_super_block_second *usb2;
+        struct ufs_super_block_third *usb3;
        lock_kernel();
-        uspi = UFS_SB(sb)->s_uspi;
+        usb1 = ubh_get_usb_first(uspi);
-        usb1 = ubh_get_usb_first (uspi);
+        usb2 = ubh_get_usb_second(uspi);
-        usb  = (struct ufs_super_block *)
+        usb3 = ubh_get_usb_third(uspi);
-                ((struct ufs_buffer_head *)uspi)->bh[0]->b_data ;
        
-        flags = UFS_SB(sb)->s_flags;
        if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) {
                buf->f_type = UFS2_MAGIC;
-                buf->f_blocks = fs64_to_cpu(sb, usb->fs_u11.fs_u2.fs_dsize);
+                buf->f_blocks = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_dsize);
-                buf->f_bfree = ufs_blkstofrags(fs64_to_cpu(sb, usb->fs_u11.fs_u2.fs_cstotal.cs_nbfree)) +
+        } else {
-                        fs64_to_cpu(sb, usb->fs_u11.fs_u2.fs_cstotal.cs_nffree);
-                buf->f_ffree = fs64_to_cpu(sb,
-                        usb->fs_u11.fs_u2.fs_cstotal.cs_nifree);
-        }
-        else {
                buf->f_type = UFS_MAGIC;
                buf->f_blocks = uspi->s_dsize;
-                buf->f_bfree = ufs_blkstofrags(fs32_to_cpu(sb, usb1->fs_cstotal.cs_nbfree)) +
-                        fs32_to_cpu(sb, usb1->fs_cstotal.cs_nffree);
-                buf->f_ffree = fs32_to_cpu(sb, usb1->fs_cstotal.cs_nifree);
        }
+        buf->f_bfree = ufs_blkstofrags(uspi->cs_total.cs_nbfree) +
+                uspi->cs_total.cs_nffree;
+        buf->f_ffree = uspi->cs_total.cs_nifree;
        buf->f_bsize = sb->s_blocksize;
        buf->f_bavail = (buf->f_bfree > (((long)buf->f_blocks / 100) * uspi->s_minfree))
                ? (buf->f_bfree - (((long)buf->f_blocks / 100) * uspi->s_minfree)) : 0;
@@ -1311,10 +1362,10 @@ out:
 #endif
-static struct super_block *ufs_get_sb(struct file_system_type *fs_type,
+static int ufs_get_sb(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data)
+        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-        return get_sb_bdev(fs_type, flags, dev_name, data, ufs_fill_super);
+        return get_sb_bdev(fs_type, flags, dev_name, data, ufs_fill_super, mnt);
 }
 static struct file_system_type ufs_fs_type = {
diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c
index 02e86291ef8a..c9b55872079b 100644
--- a/fs/ufs/truncate.c
+++ b/fs/ufs/truncate.c
@@ -49,14 +49,6 @@
 #include "swab.h"
 #include "util.h"
-#undef UFS_TRUNCATE_DEBUG
-#ifdef UFS_TRUNCATE_DEBUG
-#define UFSD(x) printk("(%s, %d), %s: ", __FILE__, __LINE__, __FUNCTION__); printk x;
-#else
-#define UFSD(x)
-#endif
- 
 /*
 * Secure deletion currently doesn't work. It interacts very badly
 * with buffers shared with memory mappings, and for that reason
@@ -82,7 +74,7 @@ static int ufs_trunc_direct (struct inode * inode)
        unsigned i, tmp;
        int retry;
        
-        UFSD(("ENTER\n"))
+        UFSD("ENTER\n");
        sb = inode->i_sb;
        uspi = UFS_SB(sb)->s_uspi;
@@ -105,7 +97,7 @@ static int ufs_trunc_direct (struct inode * inode)
                block2 = ufs_fragstoblks (frag3);
        }
-        UFSD(("frag1 %u, frag2 %u, block1 %u, block2 %u, frag3 %u, frag4 %u\n", frag1, frag2, block1, block2, frag3, frag4))
+        UFSD("frag1 %u, frag2 %u, block1 %u, block2 %u, frag3 %u, frag4 %u\n", frag1, frag2, block1, block2, frag3, frag4);
        if (frag1 >= frag2)
                goto next1;             
@@ -120,9 +112,8 @@ static int ufs_trunc_direct (struct inode * inode)
        frag1 = ufs_fragnum (frag1);
        frag2 = ufs_fragnum (frag2);
-        inode->i_blocks -= (frag2-frag1) << uspi->s_nspfshift;
-        mark_inode_dirty(inode);
        ufs_free_fragments (inode, tmp + frag1, frag2 - frag1);
+        mark_inode_dirty(inode);
        frag_to_free = tmp + frag1;
 next1:
@@ -136,8 +127,7 @@ next1:
                        continue;
                *p = 0;
-                inode->i_blocks -= uspi->s_nspb;
-                mark_inode_dirty(inode);
                if (free_count == 0) {
                        frag_to_free = tmp;
                        free_count = uspi->s_fpb;
@@ -148,6 +138,7 @@ next1:
                        frag_to_free = tmp;
                        free_count = uspi->s_fpb;
                }
+                mark_inode_dirty(inode);
        }
        
        if (free_count > 0)
@@ -166,12 +157,12 @@ next1:
        frag4 = ufs_fragnum (frag4);
        *p = 0;
-        inode->i_blocks -= frag4 << uspi->s_nspfshift;
-        mark_inode_dirty(inode);
        ufs_free_fragments (inode, tmp, frag4);
+        mark_inode_dirty(inode);
 next3:
-        UFSD(("EXIT\n"))
+        UFSD("EXIT\n");
        return retry;
 }
@@ -186,7 +177,7 @@ static int ufs_trunc_indirect (struct inode * inode, unsigned offset, __fs32 *p)
        unsigned frag_to_free, free_count;
        int retry;
-        UFSD(("ENTER\n"))
+        UFSD("ENTER\n");
                
        sb = inode->i_sb;
        uspi = UFS_SB(sb)->s_uspi;
@@ -227,7 +218,7 @@ static int ufs_trunc_indirect (struct inode * inode, unsigned offset, __fs32 *p)
                        frag_to_free = tmp;
                        free_count = uspi->s_fpb;
                }
-                inode->i_blocks -= uspi->s_nspb;
                mark_inode_dirty(inode);
        }
@@ -238,26 +229,21 @@ static int ufs_trunc_indirect (struct inode * inode, unsigned offset, __fs32 *p)
                if (*ubh_get_addr32(ind_ubh,i))
                        break;
        if (i >= uspi->s_apb) {
-                if (ubh_max_bcount(ind_ubh) != 1) {
+                tmp = fs32_to_cpu(sb, *p);
-                        retry = 1;
+                *p = 0;
-                }
-                else {
+                ufs_free_blocks (inode, tmp, uspi->s_fpb);
-                        tmp = fs32_to_cpu(sb, *p);
+                mark_inode_dirty(inode);
-                        *p = 0;
+                ubh_bforget(ind_ubh);
-                        inode->i_blocks -= uspi->s_nspb;
+                ind_ubh = NULL;
-                        mark_inode_dirty(inode);
-                        ufs_free_blocks (inode, tmp, uspi->s_fpb);
-                        ubh_bforget(ind_ubh);
-                        ind_ubh = NULL;
-                }
        }
        if (IS_SYNC(inode) && ind_ubh && ubh_buffer_dirty(ind_ubh)) {
-                ubh_ll_rw_block (SWRITE, 1, &ind_ubh);
+                ubh_ll_rw_block(SWRITE, ind_ubh);
                ubh_wait_on_buffer (ind_ubh);
        }
        ubh_brelse (ind_ubh);
        
-        UFSD(("EXIT\n"))
+        UFSD("EXIT\n");
        
        return retry;
 }
@@ -271,7 +257,7 @@ static int ufs_trunc_dindirect (struct inode *inode, unsigned offset, __fs32 *p)
        __fs32 * dind;
        int retry = 0;
        
-        UFSD(("ENTER\n"))
+        UFSD("ENTER\n");
        
        sb = inode->i_sb;
        uspi = UFS_SB(sb)->s_uspi;
@@ -306,25 +292,21 @@ static int ufs_trunc_dindirect (struct inode *inode, unsigned offset, __fs32 *p)
                if (*ubh_get_addr32 (dind_bh, i))
                        break;
        if (i >= uspi->s_apb) {
-                if (ubh_max_bcount(dind_bh) != 1)
+                tmp = fs32_to_cpu(sb, *p);
-                        retry = 1;
+                *p = 0;
-                else {
-                        tmp = fs32_to_cpu(sb, *p);
+                ufs_free_blocks(inode, tmp, uspi->s_fpb);
-                        *p = 0;
+                mark_inode_dirty(inode);
-                        inode->i_blocks -= uspi->s_nspb;
+                ubh_bforget(dind_bh);
-                        mark_inode_dirty(inode);
+                dind_bh = NULL;
-                        ufs_free_blocks (inode, tmp, uspi->s_fpb);
-                        ubh_bforget(dind_bh);
-                        dind_bh = NULL;
-                }
        }
        if (IS_SYNC(inode) && dind_bh && ubh_buffer_dirty(dind_bh)) {
-                ubh_ll_rw_block (SWRITE, 1, &dind_bh);
+                ubh_ll_rw_block(SWRITE, dind_bh);
                ubh_wait_on_buffer (dind_bh);
        }
        ubh_brelse (dind_bh);
        
-        UFSD(("EXIT\n"))
+        UFSD("EXIT\n");
        
        return retry;
 }
@@ -339,7 +321,7 @@ static int ufs_trunc_tindirect (struct inode * inode)
        __fs32 * tind, * p;
        int retry;
        
-        UFSD(("ENTER\n"))
+        UFSD("ENTER\n");
        sb = inode->i_sb;
        uspi = UFS_SB(sb)->s_uspi;
@@ -370,45 +352,114 @@ static int ufs_trunc_tindirect (struct inode * inode)
                if (*ubh_get_addr32 (tind_bh, i))
                        break;
        if (i >= uspi->s_apb) {
-                if (ubh_max_bcount(tind_bh) != 1)
+                tmp = fs32_to_cpu(sb, *p);
-                        retry = 1;
+                *p = 0;
-                else {
-                        tmp = fs32_to_cpu(sb, *p);
+                ufs_free_blocks(inode, tmp, uspi->s_fpb);
-                        *p = 0;
+                mark_inode_dirty(inode);
-                        inode->i_blocks -= uspi->s_nspb;
+                ubh_bforget(tind_bh);
-                        mark_inode_dirty(inode);
+                tind_bh = NULL;
-                        ufs_free_blocks (inode, tmp, uspi->s_fpb);
-                        ubh_bforget(tind_bh);
-                        tind_bh = NULL;
-                }
        }
        if (IS_SYNC(inode) && tind_bh && ubh_buffer_dirty(tind_bh)) {
-                ubh_ll_rw_block (SWRITE, 1, &tind_bh);
+                ubh_ll_rw_block(SWRITE, tind_bh);
                ubh_wait_on_buffer (tind_bh);
        }
        ubh_brelse (tind_bh);
        
-        UFSD(("EXIT\n"))
+        UFSD("EXIT\n");
        return retry;
 }
-                
-void ufs_truncate (struct inode * inode)
+/*
+ * Make sure the fragment holding the last byte of the file (as given by
+ * i_size) is mapped: if its buffer is not mapped yet, ufs_getfrag_block()
+ * is called for it.
+ *
+ * Returns 0 on success (including the empty-file case), -EIO if the page
+ * cannot be obtained, or the error returned by ufs_getfrag_block().
+ */
+static int ufs_alloc_lastblock(struct inode *inode)
 {
+        int err = 0;
+        struct address_space *mapping = inode->i_mapping;
+        struct ufs_sb_private_info *uspi = UFS_SB(inode->i_sb)->s_uspi;
         struct ufs_inode_info *ufsi = UFS_I(inode);
-        struct super_block * sb;
+        unsigned lastfrag, i, end;
-        struct ufs_sb_private_info * uspi;
+        struct page *lastpage;
-        int retry;
+        struct buffer_head *bh;
+        /*
+         * number of fragments needed for i_size bytes
+         * (rounded up, assuming s_fsize == 1 << s_fshift)
+         */
+        lastfrag = (i_size_read(inode) + uspi->s_fsize - 1) >> uspi->s_fshift;
+        if (!lastfrag) {
+                /* empty file: there is no last fragment to map */
+                ufsi->i_lastfrag = 0;
+                goto out;
+        }
+        /* turn the fragment count into the index of the last fragment */
+        lastfrag--;
+        /* get (locked) the page that contains the last fragment */
+        lastpage = ufs_get_locked_page(mapping, lastfrag >>
+                                       (PAGE_CACHE_SHIFT - inode->i_blkbits));
+       if (IS_ERR(lastpage)) {
+               err = -EIO;
+               goto out;
+       }
+       /* position of the last fragment's buffer within the page */
+       end = lastfrag & ((1 << (PAGE_CACHE_SHIFT - inode->i_blkbits)) - 1);
+       bh = page_buffers(lastpage);
+       /* walk the page's buffer list up to that buffer */
+       for (i = 0; i < end; ++i)
+               bh = bh->b_this_page;
+       if (!buffer_mapped(bh)) {
+               /* last argument is presumably the "create" flag - confirm */
+               err = ufs_getfrag_block(inode, lastfrag, bh, 1);
+               if (unlikely(err))
+                       goto out_unlock;
+               if (buffer_new(bh)) {
+                       clear_buffer_new(bh);
+                       unmap_underlying_metadata(bh->b_bdev,
+                                                 bh->b_blocknr);
+                       /*
+                        * we do not zero the fragment: if it was
+                        * mapped to a hole, it already contains zeroes
+                        */
+                       set_buffer_uptodate(bh);
+                       mark_buffer_dirty(bh);
+                       set_page_dirty(lastpage);
+               }
+       }
+out_unlock:
+       ufs_put_locked_page(lastpage);
+out:
+       return err;
+}
+int ufs_truncate(struct inode *inode, loff_t old_i_size)
+{
+        struct ufs_inode_info *ufsi = UFS_I(inode);
+        struct super_block *sb = inode->i_sb;
+        struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
+        int retry, err = 0;
        
-        UFSD(("ENTER\n"))
+        UFSD("ENTER\n");
-        sb = inode->i_sb;
-        uspi = UFS_SB(sb)->s_uspi;
-        if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)))
+        if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
-                return;
+              S_ISLNK(inode->i_mode)))
+                return -EINVAL;
        if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
-                return;
+                return -EPERM;
+        if (inode->i_size > old_i_size) {
+                /*
+                 * when expanding the file, first of all take care of
+                 * allocating the block for the last byte
+                 */
+                err = ufs_alloc_lastblock(inode);
+                if (err) {
+                        i_size_write(inode, old_i_size);
+                        goto out;
+                }
+                /*
+                 * we are done: the file is being expanded, so there are
+                 * no blocks to free and no page to zero
+                 */
+                lock_kernel();
+                goto almost_end;
+        }
-        block_truncate_page(inode->i_mapping,   inode->i_size, ufs_getfrag_block);
+        block_truncate_page(inode->i_mapping, inode->i_size, ufs_getfrag_block);
        lock_kernel();
        while (1) {
@@ -426,9 +477,58 @@ void ufs_truncate (struct inode * inode)
                yield();
        }
+        if (inode->i_size < old_i_size) {
+                /*
+                 * now we should have enough space
+                 * to allocate block for last byte
+                 */
+                err = ufs_alloc_lastblock(inode);
+                if (err)
+                        /*
+                         * apparently there is still no space,
+                         * although the file has already been truncated
+                         */
+                        inode->i_size = (ufsi->i_lastfrag - 1) * uspi->s_fsize;
+        }
+almost_end:
        inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
-        ufsi->i_lastfrag = DIRECT_FRAGMENT;
        unlock_kernel();
        mark_inode_dirty(inode);
-        UFSD(("EXIT\n"))
+out:
+        UFSD("EXIT: err %d\n", err);
+        return err;
 }
+/*
+ * We don't define `inode->i_op->truncate'; instead ufs_truncate() is
+ * called from here, because inside `truncate':
+ * - there is no way to know the old size
+ * - there is no way to inform the user about an error, if one happens
+ *
+ * Returns 0 on success or the first error reported by inode_change_ok(),
+ * vmtruncate(), ufs_truncate() or inode_setattr().
+ */
+static int ufs_setattr(struct dentry *dentry, struct iattr *attr)
+{
+        struct inode *inode = dentry->d_inode;
+        unsigned int ia_valid = attr->ia_valid;
+        int error;
+        /* refuse the change early if inode_change_ok() rejects it */
+        error = inode_change_ok(inode, attr);
+        if (error)
+                return error;
+        /* size change: truncate here, where the old size is still known */
+        if (ia_valid & ATTR_SIZE &&
+            attr->ia_size != i_size_read(inode)) {
+                /* remember the size before vmtruncate() is called */
+                loff_t old_i_size = inode->i_size;
+                error = vmtruncate(inode, attr->ia_size);
+                if (error)
+                        return error;
+                error = ufs_truncate(inode, old_i_size);
+                if (error)
+                        return error;
+        }
+        /* apply the attributes through the generic helper */
+        return inode_setattr(inode, attr);
+}
+/* inode operations table; only .setattr is provided (ufs_setattr) */
+struct inode_operations ufs_file_inode_operations = {
+        .setattr = ufs_setattr,
+};
diff --git a/fs/ufs/util.c b/fs/ufs/util.c
index 59acc8f073ac..337cf2c46d10 100644
--- a/fs/ufs/util.c
+++ b/fs/ufs/util.c
@@ -14,15 +14,6 @@
 #include "swab.h"
 #include "util.h"
-#undef UFS_UTILS_DEBUG
-#ifdef UFS_UTILS_DEBUG
-#define UFSD(x) printk("(%s, %d), %s: ", __FILE__, __LINE__, __FUNCTION__); printk x;
-#else
-#define UFSD(x)
-#endif
 struct ufs_buffer_head * _ubh_bread_ (struct ufs_sb_private_info * uspi,
        struct super_block *sb, u64 fragment, u64 size)
 {
@@ -63,17 +54,17 @@ struct ufs_buffer_head * ubh_bread_uspi (struct ufs_sb_private_info * uspi,
        count = size >> uspi->s_fshift;
        if (count <= 0 || count > UFS_MAXFRAG)
                return NULL;
-        USPI_UBH->fragment = fragment;
+        USPI_UBH(uspi)->fragment = fragment;
-        USPI_UBH->count = count;
+        USPI_UBH(uspi)->count = count;
        for (i = 0; i < count; i++)
-                if (!(USPI_UBH->bh[i] = sb_bread(sb, fragment + i)))
+                if (!(USPI_UBH(uspi)->bh[i] = sb_bread(sb, fragment + i)))
                        goto failed;
        for (; i < UFS_MAXFRAG; i++)
-                USPI_UBH->bh[i] = NULL;
+                USPI_UBH(uspi)->bh[i] = NULL;
-        return USPI_UBH;
+        return USPI_UBH(uspi);
 failed:
        for (j = 0; j < i; j++)
-                brelse (USPI_UBH->bh[j]);
+                brelse (USPI_UBH(uspi)->bh[j]);
        return NULL;
 }
@@ -90,11 +81,11 @@ void ubh_brelse (struct ufs_buffer_head * ubh)
 void ubh_brelse_uspi (struct ufs_sb_private_info * uspi)
 {
        unsigned i;
-        if (!USPI_UBH)
+        if (!USPI_UBH(uspi))
                return;
-        for ( i = 0; i < USPI_UBH->count; i++ ) {
+        for ( i = 0; i < USPI_UBH(uspi)->count; i++ ) {
-                brelse (USPI_UBH->bh[i]);
+                brelse (USPI_UBH(uspi)->bh[i]);
-                USPI_UBH->bh[i] = NULL;
+                USPI_UBH(uspi)->bh[i] = NULL;
        }
 }
@@ -121,13 +112,12 @@ void ubh_mark_buffer_uptodate (struct ufs_buffer_head * ubh, int flag)
        }
 }
-void ubh_ll_rw_block (int rw, unsigned nr, struct ufs_buffer_head * ubh[])
+void ubh_ll_rw_block(int rw, struct ufs_buffer_head *ubh)
 {
-        unsigned i;
        if (!ubh)
                return;
-        for ( i = 0; i < nr; i++ )
-                ll_rw_block (rw, ubh[i]->count, ubh[i]->bh);
+        ll_rw_block(rw, ubh->count, ubh->bh);
 }
 void ubh_wait_on_buffer (struct ufs_buffer_head * ubh)
@@ -139,18 +129,6 @@ void ubh_wait_on_buffer (struct ufs_buffer_head * ubh)
                wait_on_buffer (ubh->bh[i]);
 }
-unsigned ubh_max_bcount (struct ufs_buffer_head * ubh)
-{
-        unsigned i;
-        unsigned max = 0;
-        if (!ubh)
-                return 0;
-        for ( i = 0; i < ubh->count; i++ ) 
-                if ( atomic_read(&ubh->bh[i]->b_count) > max )
-                        max = atomic_read(&ubh->bh[i]->b_count);
-        return max;
-}
 void ubh_bforget (struct ufs_buffer_head * ubh)
 {
        unsigned i;
@@ -255,3 +233,57 @@ ufs_set_inode_dev(struct super_block *sb, struct ufs_inode_info *ufsi, dev_t dev
        else
                ufsi->i_u1.i_data[0] = fs32;
 }
+/**
+ * ufs_get_locked_page() - locate, pin and lock a pagecache page; if it does
+ * not exist, read it from disk.
+ * @mapping: the address_space to search
+ * @index: the page index
+ *
+ * Locates the desired pagecache page (reading it in if it is not cached),
+ * locks it, increments its reference count and returns its address.
+ *
+ * Returns the locked page, or an ERR_PTR() value if the page could not be
+ * read. The lookup is retried when the page found has lost its mapping or
+ * has no buffers attached.
+ */
+struct page *ufs_get_locked_page(struct address_space *mapping,
+                                 pgoff_t index)
+{
+        struct page *page;
+try_again:
+        page = find_lock_page(mapping, index);
+        if (!page) {
+                /* not cached: read it in through the mapping's readpage */
+                page = read_cache_page(mapping, index,
+                                       (filler_t*)mapping->a_ops->readpage,
+                                       NULL);
+                if (IS_ERR(page)) {
+                        /* hand the ERR_PTR from read_cache_page() back */
+                        printk(KERN_ERR "ufs_get_locked_page: "
+                               "read_cache_page error: ino %lu, index: %lu\n",
+                               mapping->host->i_ino, index);
+                        goto out;
+                }
+                lock_page(page);
+                if (!PageUptodate(page) || PageError(page)) {
+                        /* the read did not complete successfully */
+                        unlock_page(page);
+                        page_cache_release(page);
+                        printk(KERN_ERR "ufs_get_locked_page: "
+                               "can not read page: ino %lu, index: %lu\n",
+                               mapping->host->i_ino, index);
+                        page = ERR_PTR(-EIO);
+                        goto out;
+                }
+        }
+        if (unlikely(!page->mapping || !page_has_buffers(page))) {
+                /* we really need these buffers: drop the page and retry */
+                unlock_page(page);
+                page_cache_release(page);
+                goto try_again;
+        }
+out:
+        return page;
+}
diff --git a/fs/ufs/util.h b/fs/ufs/util.h
index 48d6d9bcc157..28fce6c239b5 100644
--- a/fs/ufs/util.h
+++ b/fs/ufs/util.h
@@ -17,10 +17,16 @@
 #define in_range(b,first,len)   ((b)>=(first)&&(b)<(first)+(len))
 /*
- * macros used for retyping
+ * functions used for retyping
 */
-#define UCPI_UBH ((struct ufs_buffer_head *)ucpi)
+static inline struct ufs_buffer_head *UCPI_UBH(struct ufs_cg_private_info *cpi)
-#define USPI_UBH ((struct ufs_buffer_head *)uspi)
+{
+        return &cpi->c_ubh;
+}
+static inline struct ufs_buffer_head *USPI_UBH(struct ufs_sb_private_info *spi)
+{
+        return &spi->s_ubh;
+}
@@ -33,12 +39,12 @@ ufs_get_fs_state(struct super_block *sb, struct ufs_super_block_first *usb1,
 {
        switch (UFS_SB(sb)->s_flags & UFS_ST_MASK) {
        case UFS_ST_SUN:
-                return fs32_to_cpu(sb, usb3->fs_u2.fs_sun.fs_state);
+                return fs32_to_cpu(sb, usb3->fs_un2.fs_sun.fs_state);
        case UFS_ST_SUNx86:
                return fs32_to_cpu(sb, usb1->fs_u1.fs_sunx86.fs_state);
        case UFS_ST_44BSD:
        default:
-                return fs32_to_cpu(sb, usb3->fs_u2.fs_44.fs_state);
+                return fs32_to_cpu(sb, usb3->fs_un2.fs_44.fs_state);
        }
 }
@@ -48,13 +54,13 @@ ufs_set_fs_state(struct super_block *sb, struct ufs_super_block_first *usb1,
 {
        switch (UFS_SB(sb)->s_flags & UFS_ST_MASK) {
        case UFS_ST_SUN:
-                usb3->fs_u2.fs_sun.fs_state = cpu_to_fs32(sb, value);
+                usb3->fs_un2.fs_sun.fs_state = cpu_to_fs32(sb, value);
                break;
        case UFS_ST_SUNx86:
                usb1->fs_u1.fs_sunx86.fs_state = cpu_to_fs32(sb, value);
                break;
        case UFS_ST_44BSD:
-                usb3->fs_u2.fs_44.fs_state = cpu_to_fs32(sb, value);
+                usb3->fs_un2.fs_44.fs_state = cpu_to_fs32(sb, value);
                break;
        }
 }
@@ -64,7 +70,7 @@ ufs_get_fs_npsect(struct super_block *sb, struct ufs_super_block_first *usb1,
                  struct ufs_super_block_third *usb3)
 {
        if ((UFS_SB(sb)->s_flags & UFS_ST_MASK) == UFS_ST_SUNx86)
-                return fs32_to_cpu(sb, usb3->fs_u2.fs_sunx86.fs_npsect);
+                return fs32_to_cpu(sb, usb3->fs_un2.fs_sunx86.fs_npsect);
        else
                return fs32_to_cpu(sb, usb1->fs_u1.fs_sun.fs_npsect);
 }
@@ -76,16 +82,16 @@ ufs_get_fs_qbmask(struct super_block *sb, struct ufs_super_block_third *usb3)
        switch (UFS_SB(sb)->s_flags & UFS_ST_MASK) {
        case UFS_ST_SUN:
-                ((__fs32 *)&tmp)[0] = usb3->fs_u2.fs_sun.fs_qbmask[0];
+                ((__fs32 *)&tmp)[0] = usb3->fs_un2.fs_sun.fs_qbmask[0];
-                ((__fs32 *)&tmp)[1] = usb3->fs_u2.fs_sun.fs_qbmask[1];
+                ((__fs32 *)&tmp)[1] = usb3->fs_un2.fs_sun.fs_qbmask[1];
                break;
        case UFS_ST_SUNx86:
-                ((__fs32 *)&tmp)[0] = usb3->fs_u2.fs_sunx86.fs_qbmask[0];
+                ((__fs32 *)&tmp)[0] = usb3->fs_un2.fs_sunx86.fs_qbmask[0];
-                ((__fs32 *)&tmp)[1] = usb3->fs_u2.fs_sunx86.fs_qbmask[1];
+                ((__fs32 *)&tmp)[1] = usb3->fs_un2.fs_sunx86.fs_qbmask[1];
                break;
        case UFS_ST_44BSD:
-                ((__fs32 *)&tmp)[0] = usb3->fs_u2.fs_44.fs_qbmask[0];
+                ((__fs32 *)&tmp)[0] = usb3->fs_un2.fs_44.fs_qbmask[0];
-                ((__fs32 *)&tmp)[1] = usb3->fs_u2.fs_44.fs_qbmask[1];
+                ((__fs32 *)&tmp)[1] = usb3->fs_un2.fs_44.fs_qbmask[1];
                break;
        }
@@ -99,16 +105,16 @@ ufs_get_fs_qfmask(struct super_block *sb, struct ufs_super_block_third *usb3)
        switch (UFS_SB(sb)->s_flags & UFS_ST_MASK) {
        case UFS_ST_SUN:
-                ((__fs32 *)&tmp)[0] = usb3->fs_u2.fs_sun.fs_qfmask[0];
+                ((__fs32 *)&tmp)[0] = usb3->fs_un2.fs_sun.fs_qfmask[0];
-                ((__fs32 *)&tmp)[1] = usb3->fs_u2.fs_sun.fs_qfmask[1];
+                ((__fs32 *)&tmp)[1] = usb3->fs_un2.fs_sun.fs_qfmask[1];
                break;
        case UFS_ST_SUNx86:
-                ((__fs32 *)&tmp)[0] = usb3->fs_u2.fs_sunx86.fs_qfmask[0];
+                ((__fs32 *)&tmp)[0] = usb3->fs_un2.fs_sunx86.fs_qfmask[0];
-                ((__fs32 *)&tmp)[1] = usb3->fs_u2.fs_sunx86.fs_qfmask[1];
+                ((__fs32 *)&tmp)[1] = usb3->fs_un2.fs_sunx86.fs_qfmask[1];
                break;
        case UFS_ST_44BSD:
-                ((__fs32 *)&tmp)[0] = usb3->fs_u2.fs_44.fs_qfmask[0];
+                ((__fs32 *)&tmp)[0] = usb3->fs_un2.fs_44.fs_qfmask[0];
-                ((__fs32 *)&tmp)[1] = usb3->fs_u2.fs_44.fs_qfmask[1];
+                ((__fs32 *)&tmp)[1] = usb3->fs_un2.fs_44.fs_qfmask[1];
                break;
        }
@@ -236,9 +242,8 @@ extern void ubh_brelse (struct ufs_buffer_head *);
 extern void ubh_brelse_uspi (struct ufs_sb_private_info *);
 extern void ubh_mark_buffer_dirty (struct ufs_buffer_head *);
 extern void ubh_mark_buffer_uptodate (struct ufs_buffer_head *, int);
-extern void ubh_ll_rw_block (int, unsigned, struct ufs_buffer_head **);
+extern void ubh_ll_rw_block(int, struct ufs_buffer_head *);
 extern void ubh_wait_on_buffer (struct ufs_buffer_head *);
-extern unsigned ubh_max_bcount (struct ufs_buffer_head *);
 extern void ubh_bforget (struct ufs_buffer_head *);
 extern int  ubh_buffer_dirty (struct ufs_buffer_head *);
 #define ubh_ubhcpymem(mem,ubh,size) _ubh_ubhcpymem_(uspi,mem,ubh,size)
@@ -246,6 +251,14 @@ extern void _ubh_ubhcpymem_(struct ufs_sb_private_info *, unsigned char *, struc
 #define ubh_memcpyubh(ubh,mem,size) _ubh_memcpyubh_(uspi,ubh,mem,size)
 extern void _ubh_memcpyubh_(struct ufs_sb_private_info *, struct ufs_buffer_head *, unsigned char *, unsigned);
+/* These functions work with page cache pages */
+extern struct page *ufs_get_locked_page(struct address_space *mapping,
+                                        pgoff_t index);
+/*
+ * Release a page obtained from ufs_get_locked_page(): unlock it and
+ * call page_cache_release() on it.
+ */
+static inline void ufs_put_locked_page(struct page *page)
+{
+       unlock_page(page);
+       page_cache_release(page);
+}
 /*
@@ -297,40 +310,26 @@ static inline void *get_usb_offset(struct ufs_sb_private_info *uspi,
 #define ubh_blkmap(ubh,begin,bit) \
        ((*ubh_get_addr(ubh, (begin) + ((bit) >> 3)) >> ((bit) & 7)) & (0xff >> (UFS_MAXFRAG - uspi->s_fpb)))
-/*
- * Macros for access to superblock array structures
- */
-#define ubh_postbl(ubh,cylno,i) \
-        ((uspi->s_postblformat != UFS_DYNAMICPOSTBLFMT) \
-        ? (*(__s16*)(ubh_get_addr(ubh, \
-        (unsigned)(&((struct ufs_super_block *)0)->fs_opostbl) \
-        + (((cylno) * 16 + (i)) << 1) ) )) \
-        : (*(__s16*)(ubh_get_addr(ubh, \
-        uspi->s_postbloff + (((cylno) * uspi->s_nrpos + (i)) << 1) ))))
-#define ubh_rotbl(ubh,i) \
-        ((uspi->s_postblformat != UFS_DYNAMICPOSTBLFMT) \
-        ? (*(__u8*)(ubh_get_addr(ubh, \
-        (unsigned)(&((struct ufs_super_block *)0)->fs_space) + (i)))) \
-        : (*(__u8*)(ubh_get_addr(ubh, uspi->s_rotbloff + (i)))))
 /*
 * Determine the number of available frags given a
 * percentage to hold in reserve.
 */
-#define ufs_freespace(usb, percentreserved) \
+static inline u64
-        (ufs_blkstofrags(fs32_to_cpu(sb, (usb)->fs_cstotal.cs_nbfree)) + \
+ufs_freespace(struct ufs_sb_private_info *uspi, int percentreserved)
-        fs32_to_cpu(sb, (usb)->fs_cstotal.cs_nffree) - (uspi->s_dsize * (percentreserved) / 100))
+{
+        return ufs_blkstofrags(uspi->cs_total.cs_nbfree) +
+                uspi->cs_total.cs_nffree -
+                (uspi->s_dsize * (percentreserved) / 100);
+}
 /*
 * Macros to access cylinder group array structures
 */
 #define ubh_cg_blktot(ucpi,cylno) \
-        (*((__fs32*)ubh_get_addr(UCPI_UBH, (ucpi)->c_btotoff + ((cylno) << 2))))
+        (*((__fs32*)ubh_get_addr(UCPI_UBH(ucpi), (ucpi)->c_btotoff + ((cylno) << 2))))
 #define ubh_cg_blks(ucpi,cylno,rpos) \
-        (*((__fs16*)ubh_get_addr(UCPI_UBH, \
+        (*((__fs16*)ubh_get_addr(UCPI_UBH(ucpi), \
        (ucpi)->c_boff + (((cylno) * uspi->s_nrpos + (rpos)) << 1 ))))
 /*
@@ -508,29 +507,3 @@ static inline void ufs_fragacct (struct super_block * sb, unsigned blockmap,
        if (fragsize > 0 && fragsize < uspi->s_fpb)
                fs32_add(sb, &fraglist[fragsize], cnt);
 }
-#define ubh_scanc(ubh,begin,size,table,mask) _ubh_scanc_(uspi,ubh,begin,size,table,mask)
-static inline unsigned _ubh_scanc_(struct ufs_sb_private_info * uspi, struct ufs_buffer_head * ubh, 
-        unsigned begin, unsigned size, unsigned char * table, unsigned char mask)
-{
-        unsigned rest, offset;
-        unsigned char * cp;
-        
-        offset = begin & ~uspi->s_fmask;
-        begin >>= uspi->s_fshift;
-        for (;;) {
-                if ((offset + size) < uspi->s_fsize)
-                        rest = size;
-                else
-                        rest = uspi->s_fsize - offset;
-                size -= rest;
-                cp = ubh->bh[begin]->b_data + offset;
-                while ((table[*cp++] & mask) == 0 && --rest);
-                if (rest || !size)
-                        break;
-                begin++;
-                offset = 0;
-        }
-        return (size + rest);
-}
diff --git a/fs/vfat/namei.c b/fs/vfat/namei.c
index a56cec3be5f0..9a8f48bae956 100644
--- a/fs/vfat/namei.c
+++ b/fs/vfat/namei.c
@@ -1023,11 +1023,12 @@ static int vfat_fill_super(struct super_block *sb, void *data, int silent)
        return 0;
 }
-static struct super_block *vfat_get_sb(struct file_system_type *fs_type,
+static int vfat_get_sb(struct file_system_type *fs_type,
-                                       int flags, const char *dev_name,
+                       int flags, const char *dev_name,
-                                       void *data)
+                       void *data, struct vfsmount *mnt)
 {
-        return get_sb_bdev(fs_type, flags, dev_name, data, vfat_fill_super);
+        return get_sb_bdev(fs_type, flags, dev_name, data, vfat_fill_super,
+                           mnt);
 }
 static struct file_system_type vfat_fs_type = {
diff --git a/fs/xattr.c b/fs/xattr.c
index e416190f5e9c..c32f15b5f60f 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -242,7 +242,7 @@ sys_fsetxattr(int fd, char __user *name, void __user *value,
        if (!f)
                return error;
        dentry = f->f_dentry;
-        audit_inode(NULL, dentry->d_inode, 0);
+        audit_inode(NULL, dentry->d_inode);
        error = setxattr(dentry, name, value, size, flags);
        fput(f);
        return error;
@@ -469,7 +469,7 @@ sys_fremovexattr(int fd, char __user *name)
        if (!f)
                return error;
        dentry = f->f_dentry;
-        audit_inode(NULL, dentry->d_inode, 0);
+        audit_inode(NULL, dentry->d_inode);
        error = removexattr(dentry, name);
        fput(f);
        return error;
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index bac27d66151d..26b364c9d62c 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -1,6 +1,5 @@
 config XFS_FS
        tristate "XFS filesystem support"
-        select EXPORTFS if NFSD!=n
        help
          XFS is a high performance journaling filesystem which originated
          on the SGI IRIX platform.  It is completely multi-threaded, can
@@ -18,11 +17,6 @@ config XFS_FS
          system of your root partition is compiled as a module, you'll need
          to use an initial ramdisk (initrd) to boot.
-config XFS_EXPORT
-        bool
-        depends on XFS_FS && EXPORTFS
-        default y
 config XFS_QUOTA
        bool "XFS Quota support"
        depends on XFS_FS
@@ -65,18 +59,19 @@ config XFS_POSIX_ACL
          If you don't know what Access Control Lists are, say N.
 config XFS_RT
-        bool "XFS Realtime support (EXPERIMENTAL)"
+        bool "XFS Realtime subvolume support"
-        depends on XFS_FS && EXPERIMENTAL
+        depends on XFS_FS
        help
          If you say Y here you will be able to mount and use XFS filesystems
-          which contain a realtime subvolume. The realtime subvolume is a
+          which contain a realtime subvolume.  The realtime subvolume is a
-          separate area of disk space where only file data is stored. The
+          separate area of disk space where only file data is stored.  It was
-          realtime subvolume is designed to provide very deterministic
+          originally designed to provide deterministic data rates suitable
-          data rates suitable for media streaming applications.
+          for media streaming applications, but is also useful as a generic
+          mechanism for ensuring data and metadata/log I/Os are completely
-          See the xfs man page in section 5 for a bit more information.
+          separated.  Regular file I/Os are isolated to a separate device
+          from all other requests, and this can be done quite transparently
+          to applications via the inherit-realtime directory inode flag.
-          This feature is unsupported at this time, is not yet fully
+          See the xfs man page in section 5 for additional information.
-          functional, and may cause serious problems.
          If unsure, say N.
diff --git a/fs/xfs/Makefile-linux-2.6 b/fs/xfs/Makefile-linux-2.6
index 5d73eaa1971f..9e7f85986d0d 100644
--- a/fs/xfs/Makefile-linux-2.6
+++ b/fs/xfs/Makefile-linux-2.6
@@ -59,7 +59,6 @@ xfs-$(CONFIG_XFS_POSIX_ACL)	+= xfs_acl.o
 xfs-$(CONFIG_PROC_FS)           += $(XFS_LINUX)/xfs_stats.o
 xfs-$(CONFIG_SYSCTL)            += $(XFS_LINUX)/xfs_sysctl.o
 xfs-$(CONFIG_COMPAT)            += $(XFS_LINUX)/xfs_ioctl32.o
-xfs-$(CONFIG_XFS_EXPORT)        += $(XFS_LINUX)/xfs_export.o
 xfs-y                           += xfs_alloc.o \
@@ -73,14 +72,12 @@ xfs-y				+= xfs_alloc.o \
                                   xfs_btree.o \
                                   xfs_buf_item.o \
                                   xfs_da_btree.o \
-                                   xfs_dir.o \
                                   xfs_dir2.o \
                                   xfs_dir2_block.o \
                                   xfs_dir2_data.o \
                                   xfs_dir2_leaf.o \
                                   xfs_dir2_node.o \
                                   xfs_dir2_sf.o \
-                                   xfs_dir_leaf.o \
                                   xfs_error.o \
                                   xfs_extfree_item.o \
                                   xfs_fsops.o \
@@ -117,6 +114,7 @@ xfs-y				+= $(addprefix $(XFS_LINUX)/, \
                                   kmem.o \
                                   xfs_aops.o \
                                   xfs_buf.o \
+                                   xfs_export.o \
                                   xfs_file.o \
                                   xfs_fs_subr.o \
                                   xfs_globals.o \
diff --git a/fs/xfs/linux-2.6/kmem.h b/fs/xfs/linux-2.6/kmem.h
index 2cfd33d4d8aa..939bd84bc7ee 100644
--- a/fs/xfs/linux-2.6/kmem.h
+++ b/fs/xfs/linux-2.6/kmem.h
@@ -23,42 +23,6 @@
 #include <linux/mm.h>
 /*
- * Process flags handling
- */
-#define PFLAGS_TEST_NOIO()              (current->flags & PF_NOIO)
-#define PFLAGS_TEST_FSTRANS()           (current->flags & PF_FSTRANS)
-#define PFLAGS_SET_NOIO() do {          \
-        current->flags |= PF_NOIO;      \
-} while (0)
-#define PFLAGS_CLEAR_NOIO() do {        \
-        current->flags &= ~PF_NOIO;     \
-} while (0)
-/* these could be nested, so we save state */
-#define PFLAGS_SET_FSTRANS(STATEP) do { \
-        *(STATEP) = current->flags;     \
-        current->flags |= PF_FSTRANS;   \
-} while (0)
-#define PFLAGS_CLEAR_FSTRANS(STATEP) do { \
-        *(STATEP) = current->flags;     \
-        current->flags &= ~PF_FSTRANS;  \
-} while (0)
-/* Restore the PF_FSTRANS state to what was saved in STATEP */
-#define PFLAGS_RESTORE_FSTRANS(STATEP) do {                     \
-        current->flags = ((current->flags & ~PF_FSTRANS) |      \
-                          (*(STATEP) & PF_FSTRANS));            \
-} while (0)
-#define PFLAGS_DUP(OSTATEP, NSTATEP) do { \
-        *(NSTATEP) = *(OSTATEP);        \
-} while (0)
-/*
 * General memory allocation interfaces
 */
@@ -83,7 +47,7 @@ kmem_flags_convert(unsigned int __nocast flags)
                lflags = GFP_ATOMIC | __GFP_NOWARN;
        } else {
                lflags = GFP_KERNEL | __GFP_NOWARN;
-                if (PFLAGS_TEST_FSTRANS() || (flags & KM_NOFS))
+                if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS))
                        lflags &= ~__GFP_FS;
        }
        return lflags;
diff --git a/fs/xfs/linux-2.6/mrlock.h b/fs/xfs/linux-2.6/mrlock.h
index 1b262b790d9c..32e1ce0f04c9 100644
--- a/fs/xfs/linux-2.6/mrlock.h
+++ b/fs/xfs/linux-2.6/mrlock.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
 * All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
@@ -28,7 +28,7 @@ typedef struct {
 } mrlock_t;
 #define mrinit(mrp, name)       \
-        ( (mrp)->mr_writer = 0, init_rwsem(&(mrp)->mr_lock) )
+        do { (mrp)->mr_writer = 0; init_rwsem(&(mrp)->mr_lock); } while (0)
 #define mrlock_init(mrp, t,n,s) mrinit(mrp, n)
 #define mrfree(mrp)             do { } while (0)
 #define mraccess(mrp)           mraccessf(mrp, 0)
diff --git a/fs/xfs/linux-2.6/sema.h b/fs/xfs/linux-2.6/sema.h
index 194a84490bd1..b25090094cca 100644
--- a/fs/xfs/linux-2.6/sema.h
+++ b/fs/xfs/linux-2.6/sema.h
@@ -34,20 +34,21 @@ typedef struct semaphore sema_t;
 #define initnsema(sp, val, name)        sema_init(sp, val)
 #define psema(sp, b)                    down(sp)
 #define vsema(sp)                       up(sp)
-#define valusema(sp)                    (atomic_read(&(sp)->count))
+#define freesema(sema)                  do { } while (0)
-#define freesema(sema)
+static inline int issemalocked(sema_t *sp)
+{
+        return down_trylock(sp) || (up(sp), 0);
+}
 /*
 * Map cpsema (try to get the sema) to down_trylock. We need to switch
 * the return values since cpsema returns 1 (acquired) 0 (failed) and
 * down_trylock returns the reverse 0 (acquired) 1 (failed).
 */
+static inline int cpsema(sema_t *sp)
-#define cpsema(sp)                      (down_trylock(sp) ? 0 : 1)
+{
+        return down_trylock(sp) ? 0 : 1;
-/*
+}
- * Didn't do cvsema(sp). Not sure how to map this to up/down/...
- * It does a vsema if the values is < 0 other wise nothing.
- */
 #endif /* __XFS_SUPPORT_SEMA_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 4d191ef39b67..c40f81ba9b13 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -21,7 +21,6 @@
 #include "xfs_inum.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_trans.h"
 #include "xfs_dmapi.h"
@@ -29,7 +28,6 @@
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
@@ -76,7 +74,7 @@ xfs_page_trace(
        int             mask)
 {
        xfs_inode_t     *ip;
-        vnode_t         *vp = vn_from_inode(inode);
+        bhv_vnode_t     *vp = vn_from_inode(inode);
        loff_t          isize = i_size_read(inode);
        loff_t          offset = page_offset(page);
        int             delalloc = -1, unmapped = -1, unwritten = -1;
@@ -136,9 +134,10 @@ xfs_destroy_ioend(
        for (bh = ioend->io_buffer_head; bh; bh = next) {
                next = bh->b_private;
-                bh->b_end_io(bh, ioend->io_uptodate);
+                bh->b_end_io(bh, !ioend->io_error);
        }
+        if (unlikely(ioend->io_error))
+                vn_ioerror(ioend->io_vnode, ioend->io_error, __FILE__,__LINE__);
        vn_iowake(ioend->io_vnode);
        mempool_free(ioend, xfs_ioend_pool);
 }
@@ -180,13 +179,12 @@ xfs_end_bio_unwritten(
        void                    *data)
 {
        xfs_ioend_t             *ioend = data;
-        vnode_t                 *vp = ioend->io_vnode;
+        bhv_vnode_t             *vp = ioend->io_vnode;
        xfs_off_t               offset = ioend->io_offset;
        size_t                  size = ioend->io_size;
-        int                     error;
-        if (ioend->io_uptodate)
+        if (likely(!ioend->io_error))
-                VOP_BMAP(vp, offset, size, BMAPI_UNWRITTEN, NULL, NULL, error);
+                bhv_vop_bmap(vp, offset, size, BMAPI_UNWRITTEN, NULL, NULL);
        xfs_destroy_ioend(ioend);
 }
@@ -211,7 +209,7 @@ xfs_alloc_ioend(
         * all the I/O from calling the completion routine too early.
         */
        atomic_set(&ioend->io_remaining, 1);
-        ioend->io_uptodate = 1; /* cleared if any I/O fails */
+        ioend->io_error = 0;
        ioend->io_list = NULL;
        ioend->io_type = type;
        ioend->io_vnode = vn_from_inode(inode);
@@ -239,10 +237,10 @@ xfs_map_blocks(
        xfs_iomap_t             *mapp,
        int                     flags)
 {
-        vnode_t                 *vp = vn_from_inode(inode);
+        bhv_vnode_t             *vp = vn_from_inode(inode);
        int                     error, nmaps = 1;
-        VOP_BMAP(vp, offset, count, flags, mapp, &nmaps, error);
+        error = bhv_vop_bmap(vp, offset, count, flags, mapp, &nmaps);
        if (!error && (flags & (BMAPI_WRITE|BMAPI_ALLOCATE)))
                VMODIFY(vp);
        return -error;
@@ -271,16 +269,14 @@ xfs_end_bio(
        if (bio->bi_size)
                return 1;
-        ASSERT(ioend);
        ASSERT(atomic_read(&bio->bi_cnt) >= 1);
+        ioend->io_error = test_bit(BIO_UPTODATE, &bio->bi_flags) ? 0 : error;
        /* Toss bio and pass work off to an xfsdatad thread */
-        if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
-                ioend->io_uptodate = 0;
        bio->bi_private = NULL;
        bio->bi_end_io = NULL;
        bio_put(bio);
        xfs_finish_ioend(ioend);
        return 0;
 }
@@ -1127,7 +1123,7 @@ xfs_vm_writepage(
         * then mark the page dirty again and leave the page
         * as is.
         */
-        if (PFLAGS_TEST_FSTRANS() && need_trans)
+        if (current_test_flags(PF_FSTRANS) && need_trans)
                goto out_fail;
        /*
@@ -1158,6 +1154,18 @@ out_unlock:
        return error;
 }
+STATIC int
+xfs_vm_writepages(
+        struct address_space    *mapping,
+        struct writeback_control *wbc)
+{
+        struct bhv_vnode        *vp = vn_from_inode(mapping->host);
+        if (VN_TRUNC(vp))
+                VUNTRUNCATE(vp);
+        return generic_writepages(mapping, wbc);
+}
 /*
 * Called to move a page into cleanable state - and from there
 * to be released. Possibly the page is already clean. We always
@@ -1204,7 +1212,7 @@ xfs_vm_releasepage(
        /* If we are already inside a transaction or the thread cannot
         * do I/O, we cannot release this page.
         */
-        if (PFLAGS_TEST_FSTRANS())
+        if (current_test_flags(PF_FSTRANS))
                return 0;
        /*
@@ -1231,7 +1239,7 @@ __xfs_get_blocks(
        int                     direct,
        bmapi_flags_t           flags)
 {
-        vnode_t                 *vp = vn_from_inode(inode);
+        bhv_vnode_t             *vp = vn_from_inode(inode);
        xfs_iomap_t             iomap;
        xfs_off_t               offset;
        ssize_t                 size;
@@ -1241,8 +1249,8 @@ __xfs_get_blocks(
        offset = (xfs_off_t)iblock << inode->i_blkbits;
        ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));
        size = bh_result->b_size;
-        VOP_BMAP(vp, offset, size,
+        error = bhv_vop_bmap(vp, offset, size,
-                create ? flags : BMAPI_READ, &iomap, &niomap, error);
+                             create ? flags : BMAPI_READ, &iomap, &niomap);
        if (error)
                return -error;
        if (niomap == 0)
@@ -1370,13 +1378,13 @@ xfs_vm_direct_IO(
 {
        struct file     *file = iocb->ki_filp;
        struct inode    *inode = file->f_mapping->host;
-        vnode_t         *vp = vn_from_inode(inode);
+        bhv_vnode_t     *vp = vn_from_inode(inode);
        xfs_iomap_t     iomap;
        int             maps = 1;
        int             error;
        ssize_t         ret;
-        VOP_BMAP(vp, offset, 0, BMAPI_DEVICE, &iomap, &maps, error);
+        error = bhv_vop_bmap(vp, offset, 0, BMAPI_DEVICE, &iomap, &maps);
        if (error)
                return -error;
@@ -1409,14 +1417,12 @@ xfs_vm_bmap(
        sector_t                block)
 {
        struct inode            *inode = (struct inode *)mapping->host;
-        vnode_t                 *vp = vn_from_inode(inode);
+        bhv_vnode_t             *vp = vn_from_inode(inode);
-        int                     error;
        vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
+        bhv_vop_rwlock(vp, VRWLOCK_READ);
-        VOP_RWLOCK(vp, VRWLOCK_READ);
+        bhv_vop_flush_pages(vp, (xfs_off_t)0, -1, 0, FI_REMAPF);
-        VOP_FLUSH_PAGES(vp, (xfs_off_t)0, -1, 0, FI_REMAPF, error);
+        bhv_vop_rwunlock(vp, VRWLOCK_READ);
-        VOP_RWUNLOCK(vp, VRWLOCK_READ);
        return generic_block_bmap(mapping, block, xfs_get_blocks);
 }
@@ -1448,10 +1454,11 @@ xfs_vm_invalidatepage(
        block_invalidatepage(page, offset);
 }
-struct address_space_operations xfs_address_space_operations = {
+const struct address_space_operations xfs_address_space_operations = {
        .readpage               = xfs_vm_readpage,
        .readpages              = xfs_vm_readpages,
        .writepage              = xfs_vm_writepage,
+        .writepages             = xfs_vm_writepages,
        .sync_page              = block_sync_page,
        .releasepage            = xfs_vm_releasepage,
        .invalidatepage         = xfs_vm_invalidatepage,
diff --git a/fs/xfs/linux-2.6/xfs_aops.h b/fs/xfs/linux-2.6/xfs_aops.h
index 60716543c68b..2244e516b66a 100644
--- a/fs/xfs/linux-2.6/xfs_aops.h
+++ b/fs/xfs/linux-2.6/xfs_aops.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2005 Silicon Graphics, Inc.
+ * Copyright (c) 2005-2006 Silicon Graphics, Inc.
 * All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
@@ -30,9 +30,9 @@ typedef void (*xfs_ioend_func_t)(void *);
 typedef struct xfs_ioend {
        struct xfs_ioend        *io_list;       /* next ioend in chain */
        unsigned int            io_type;        /* delalloc / unwritten */
-        unsigned int            io_uptodate;    /* I/O status register */
+        int                     io_error;       /* I/O error code */
        atomic_t                io_remaining;   /* hold count */
-        struct vnode            *io_vnode;      /* file being written to */
+        struct bhv_vnode        *io_vnode;      /* file being written to */
        struct buffer_head      *io_buffer_head;/* buffer linked list head */
        struct buffer_head      *io_buffer_tail;/* buffer linked list tail */
        size_t                  io_size;        /* size of the extent */
@@ -40,7 +40,7 @@ typedef struct xfs_ioend {
        struct work_struct      io_work;        /* xfsdatad work queue */
 } xfs_ioend_t;
-extern struct address_space_operations xfs_address_space_operations;
+extern const struct address_space_operations xfs_address_space_operations;
 extern int xfs_get_blocks(struct inode *, sector_t, struct buffer_head *, int);
-#endif /* __XFS_IOPS_H__ */
+#endif /* __XFS_AOPS_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 26fed0756f01..2af528dcfb04 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -1520,7 +1520,7 @@ xfs_mapping_buftarg(
        struct backing_dev_info *bdi;
        struct inode            *inode;
        struct address_space    *mapping;
-        static struct address_space_operations mapping_aops = {
+        static const struct address_space_operations mapping_aops = {
                .sync_page = block_sync_page,
                .migratepage = fail_migrate_page,
        };
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index 4dd6592d5a4c..ceda3a2859d2 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -18,7 +18,6 @@
 #ifndef __XFS_BUF_H__
 #define __XFS_BUF_H__
-#include <linux/config.h>
 #include <linux/list.h>
 #include <linux/types.h>
 #include <linux/spinlock.h>
diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c
index b768ea910bbe..5fb75d9151f2 100644
--- a/fs/xfs/linux-2.6/xfs_export.c
+++ b/fs/xfs/linux-2.6/xfs_export.c
@@ -21,7 +21,6 @@
 #include "xfs_log.h"
 #include "xfs_trans.h"
 #include "xfs_sb.h"
-#include "xfs_dir.h"
 #include "xfs_mount.h"
 #include "xfs_export.h"
@@ -97,7 +96,7 @@ xfs_fs_encode_fh(
        int                     len;
        int                     is64 = 0;
 #if XFS_BIG_INUMS
-        vfs_t                   *vfs = vfs_from_sb(inode->i_sb);
+        bhv_vfs_t               *vfs = vfs_from_sb(inode->i_sb);
        if (!(vfs->vfs_flag & VFS_32BITINODES)) {
                /* filesystem may contain 64bit inode numbers */
@@ -136,13 +135,13 @@ xfs_fs_get_dentry(
        struct super_block      *sb,
        void                    *data)
 {
-        vnode_t                 *vp;
+        bhv_vnode_t             *vp;
        struct inode            *inode;
        struct dentry           *result;
-        vfs_t                   *vfsp = vfs_from_sb(sb);
+        bhv_vfs_t               *vfsp = vfs_from_sb(sb);
        int                     error;
-        VFS_VGET(vfsp, &vp, (fid_t *)data, error);
+        error = bhv_vfs_vget(vfsp, &vp, (fid_t *)data);
        if (error || vp == NULL)
                return ERR_PTR(-ESTALE) ;
@@ -160,12 +159,12 @@ xfs_fs_get_parent(
        struct dentry           *child)
 {
        int                     error;
-        vnode_t                 *vp, *cvp;
+        bhv_vnode_t             *vp, *cvp;
        struct dentry           *parent;
        cvp = NULL;
        vp = vn_from_inode(child->d_inode);
-        VOP_LOOKUP(vp, &dotdot, &cvp, 0, NULL, NULL, error);
+        error = bhv_vop_lookup(vp, &dotdot, &cvp, 0, NULL, NULL);
        if (unlikely(error))
                return ERR_PTR(-error);
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index c847416f6d10..3d4f6dff2113 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -21,7 +21,6 @@
 #include "xfs_inum.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_trans.h"
 #include "xfs_dmapi.h"
@@ -32,7 +31,6 @@
 #include "xfs_alloc.h"
 #include "xfs_btree.h"
 #include "xfs_attr_sf.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
@@ -58,15 +56,12 @@ __xfs_file_read(
 {
        struct iovec            iov = {buf, count};
        struct file             *file = iocb->ki_filp;
-        vnode_t                 *vp = vn_from_inode(file->f_dentry->d_inode);
+        bhv_vnode_t             *vp = vn_from_inode(file->f_dentry->d_inode);
-        ssize_t                 rval;
        BUG_ON(iocb->ki_pos != pos);
        if (unlikely(file->f_flags & O_DIRECT))
                ioflags |= IO_ISDIRECT;
-        VOP_READ(vp, iocb, &iov, 1, &iocb->ki_pos, ioflags, NULL, rval);
+        return bhv_vop_read(vp, iocb, &iov, 1, &iocb->ki_pos, ioflags, NULL);
-        return rval;
 }
 STATIC ssize_t
@@ -100,15 +95,12 @@ __xfs_file_write(
        struct iovec    iov = {(void __user *)buf, count};
        struct file     *file = iocb->ki_filp;
        struct inode    *inode = file->f_mapping->host;
-        vnode_t         *vp = vn_from_inode(inode);
+        bhv_vnode_t     *vp = vn_from_inode(inode);
-        ssize_t         rval;
        BUG_ON(iocb->ki_pos != pos);
        if (unlikely(file->f_flags & O_DIRECT))
                ioflags |= IO_ISDIRECT;
+        return bhv_vop_write(vp, iocb, &iov, 1, &iocb->ki_pos, ioflags, NULL);
-        VOP_WRITE(vp, iocb, &iov, 1, &iocb->ki_pos, ioflags, NULL, rval);
-        return rval;
 }
 STATIC ssize_t
@@ -140,7 +132,7 @@ __xfs_file_readv(
        loff_t                  *ppos)
 {
        struct inode    *inode = file->f_mapping->host;
-        vnode_t         *vp = vn_from_inode(inode);
+        bhv_vnode_t     *vp = vn_from_inode(inode);
        struct kiocb    kiocb;
        ssize_t         rval;
@@ -149,7 +141,8 @@ __xfs_file_readv(
        if (unlikely(file->f_flags & O_DIRECT))
                ioflags |= IO_ISDIRECT;
-        VOP_READ(vp, &kiocb, iov, nr_segs, &kiocb.ki_pos, ioflags, NULL, rval);
+        rval = bhv_vop_read(vp, &kiocb, iov, nr_segs,
+                                &kiocb.ki_pos, ioflags, NULL);
        *ppos = kiocb.ki_pos;
        return rval;
@@ -184,7 +177,7 @@ __xfs_file_writev(
        loff_t                  *ppos)
 {
        struct inode    *inode = file->f_mapping->host;
-        vnode_t         *vp = vn_from_inode(inode);
+        bhv_vnode_t     *vp = vn_from_inode(inode);
        struct kiocb    kiocb;
        ssize_t         rval;
@@ -193,7 +186,8 @@ __xfs_file_writev(
        if (unlikely(file->f_flags & O_DIRECT))
                ioflags |= IO_ISDIRECT;
-        VOP_WRITE(vp, &kiocb, iov, nr_segs, &kiocb.ki_pos, ioflags, NULL, rval);
+        rval = bhv_vop_write(vp, &kiocb, iov, nr_segs,
+                                 &kiocb.ki_pos, ioflags, NULL);
        *ppos = kiocb.ki_pos;
        return rval;
@@ -227,11 +221,8 @@ xfs_file_sendfile(
        read_actor_t            actor,
        void                    *target)
 {
-        vnode_t                 *vp = vn_from_inode(filp->f_dentry->d_inode);
+        return bhv_vop_sendfile(vn_from_inode(filp->f_dentry->d_inode),
-        ssize_t                 rval;
+                                filp, pos, 0, count, actor, target, NULL);
-        VOP_SENDFILE(vp, filp, pos, 0, count, actor, target, NULL, rval);
-        return rval;
 }
 STATIC ssize_t
@@ -242,11 +233,8 @@ xfs_file_sendfile_invis(
        read_actor_t            actor,
        void                    *target)
 {
-        vnode_t                 *vp = vn_from_inode(filp->f_dentry->d_inode);
+        return bhv_vop_sendfile(vn_from_inode(filp->f_dentry->d_inode),
-        ssize_t                 rval;
+                                filp, pos, IO_INVIS, count, actor, target, NULL);
-        VOP_SENDFILE(vp, filp, pos, IO_INVIS, count, actor, target, NULL, rval);
-        return rval;
 }
 STATIC ssize_t
@@ -257,11 +245,8 @@ xfs_file_splice_read(
        size_t                  len,
        unsigned int            flags)
 {
-        vnode_t                 *vp = vn_from_inode(infilp->f_dentry->d_inode);
+        return bhv_vop_splice_read(vn_from_inode(infilp->f_dentry->d_inode),
-        ssize_t                 rval;
+                                   infilp, ppos, pipe, len, flags, 0, NULL);
-        VOP_SPLICE_READ(vp, infilp, ppos, pipe, len, flags, 0, NULL, rval);
-        return rval;
 }
 STATIC ssize_t
@@ -272,11 +257,9 @@ xfs_file_splice_read_invis(
        size_t                  len,
        unsigned int            flags)
 {
-        vnode_t                 *vp = vn_from_inode(infilp->f_dentry->d_inode);
+        return bhv_vop_splice_read(vn_from_inode(infilp->f_dentry->d_inode),
-        ssize_t                 rval;
+                                   infilp, ppos, pipe, len, flags, IO_INVIS,
+                                   NULL);
-        VOP_SPLICE_READ(vp, infilp, ppos, pipe, len, flags, IO_INVIS, NULL, rval);
-        return rval;
 }
 STATIC ssize_t
@@ -287,11 +270,8 @@ xfs_file_splice_write(
        size_t                  len,
        unsigned int            flags)
 {
-        vnode_t                 *vp = vn_from_inode(outfilp->f_dentry->d_inode);
+        return bhv_vop_splice_write(vn_from_inode(outfilp->f_dentry->d_inode),
-        ssize_t                 rval;
+                                    pipe, outfilp, ppos, len, flags, 0, NULL);
-        VOP_SPLICE_WRITE(vp, pipe, outfilp, ppos, len, flags, 0, NULL, rval);
-        return rval;
 }
 STATIC ssize_t
@@ -302,11 +282,9 @@ xfs_file_splice_write_invis(
        size_t                  len,
        unsigned int            flags)
 {
-        vnode_t                 *vp = vn_from_inode(outfilp->f_dentry->d_inode);
+        return bhv_vop_splice_write(vn_from_inode(outfilp->f_dentry->d_inode),
-        ssize_t                 rval;
+                                    pipe, outfilp, ppos, len, flags, IO_INVIS,
+                                    NULL);
-        VOP_SPLICE_WRITE(vp, pipe, outfilp, ppos, len, flags, IO_INVIS, NULL, rval);
-        return rval;
 }
 STATIC int
@@ -314,13 +292,18 @@ xfs_file_open(
        struct inode    *inode,
        struct file     *filp)
 {
-        vnode_t         *vp = vn_from_inode(inode);
-        int             error;
        if (!(filp->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS)
                return -EFBIG;
-        VOP_OPEN(vp, NULL, error);
+        return -bhv_vop_open(vn_from_inode(inode), NULL);
-        return -error;
+}
+STATIC int
+xfs_file_close(
+        struct file     *filp,
+        fl_owner_t      id)
+{
+        return -bhv_vop_close(vn_from_inode(filp->f_dentry->d_inode), 0,
+                                file_count(filp) > 1 ? L_FALSE : L_TRUE, NULL);
 }
 STATIC int
@@ -328,12 +311,11 @@ xfs_file_release(
        struct inode    *inode,
        struct file     *filp)
 {
-        vnode_t         *vp = vn_from_inode(inode);
+        bhv_vnode_t     *vp = vn_from_inode(inode);
-        int             error = 0;
        if (vp)
-                VOP_RELEASE(vp, error);
+                return -bhv_vop_release(vp);
-        return -error;
+        return 0;
 }
 STATIC int
@@ -342,15 +324,14 @@ xfs_file_fsync(
        struct dentry   *dentry,
        int             datasync)
 {
-        struct inode    *inode = dentry->d_inode;
+        bhv_vnode_t     *vp = vn_from_inode(dentry->d_inode);
-        vnode_t         *vp = vn_from_inode(inode);
-        int             error;
        int             flags = FSYNC_WAIT;
        if (datasync)
                flags |= FSYNC_DATA;
-        VOP_FSYNC(vp, flags, NULL, (xfs_off_t)0, (xfs_off_t)-1, error);
+        if (VN_TRUNC(vp))
-        return -error;
+                VUNTRUNCATE(vp);
+        return -bhv_vop_fsync(vp, flags, NULL, (xfs_off_t)0, (xfs_off_t)-1);
 }
 #ifdef CONFIG_XFS_DMAPI
@@ -361,16 +342,11 @@ xfs_vm_nopage(
        int                     *type)
 {
        struct inode    *inode = area->vm_file->f_dentry->d_inode;
-        vnode_t         *vp = vn_from_inode(inode);
+        bhv_vnode_t     *vp = vn_from_inode(inode);
-        xfs_mount_t     *mp = XFS_VFSTOM(vp->v_vfsp);
-        int             error;
        ASSERT_ALWAYS(vp->v_vfsp->vfs_flag & VFS_DMI);
+        if (XFS_SEND_MMAP(XFS_VFSTOM(vp->v_vfsp), area, 0))
-        error = XFS_SEND_MMAP(mp, area, 0);
-        if (error)
                return NULL;
        return filemap_nopage(area, address, type);
 }
 #endif /* CONFIG_XFS_DMAPI */
@@ -382,7 +358,7 @@ xfs_file_readdir(
        filldir_t       filldir)
 {
        int             error = 0;
-        vnode_t         *vp = vn_from_inode(filp->f_dentry->d_inode);
+        bhv_vnode_t     *vp = vn_from_inode(filp->f_dentry->d_inode);
        uio_t           uio;
        iovec_t         iov;
        int             eof = 0;
@@ -417,7 +393,7 @@ xfs_file_readdir(
                start_offset = uio.uio_offset;
-                VOP_READDIR(vp, &uio, NULL, &eof, error);
+                error = bhv_vop_readdir(vp, &uio, NULL, &eof);
                if ((uio.uio_offset == start_offset) || error) {
                        size = 0;
                        break;
@@ -456,38 +432,28 @@ xfs_file_mmap(
        struct file     *filp,
        struct vm_area_struct *vma)
 {
-        struct inode    *ip = filp->f_dentry->d_inode;
-        vnode_t         *vp = vn_from_inode(ip);
-        vattr_t         vattr;
-        int             error;
        vma->vm_ops = &xfs_file_vm_ops;
 #ifdef CONFIG_XFS_DMAPI
-        if (vp->v_vfsp->vfs_flag & VFS_DMI) {
+        if (vn_from_inode(filp->f_dentry->d_inode)->v_vfsp->vfs_flag & VFS_DMI)
                vma->vm_ops = &xfs_dmapi_file_vm_ops;
-        }
 #endif /* CONFIG_XFS_DMAPI */
-        vattr.va_mask = XFS_AT_UPDATIME;
+        file_accessed(filp);
-        VOP_SETATTR(vp, &vattr, XFS_AT_UPDATIME, NULL, error);
-        if (likely(!error))
-                __vn_revalidate(vp, &vattr);    /* update flags */
        return 0;
 }
 STATIC long
 xfs_file_ioctl(
        struct file     *filp,
        unsigned int    cmd,
-        unsigned long   arg)
+        unsigned long   p)
 {
        int             error;
        struct inode    *inode = filp->f_dentry->d_inode;
-        vnode_t         *vp = vn_from_inode(inode);
+        bhv_vnode_t     *vp = vn_from_inode(inode);
-        VOP_IOCTL(vp, inode, filp, 0, cmd, (void __user *)arg, error);
+        error = bhv_vop_ioctl(vp, inode, filp, 0, cmd, (void __user *)p);
        VMODIFY(vp);
        /* NOTE:  some of the ioctl's return positive #'s as a
@@ -503,13 +469,13 @@ STATIC long
 xfs_file_ioctl_invis(
        struct file     *filp,
        unsigned int    cmd,
-        unsigned long   arg)
+        unsigned long   p)
 {
-        struct inode    *inode = filp->f_dentry->d_inode;
-        vnode_t         *vp = vn_from_inode(inode);
        int             error;
+        struct inode    *inode = filp->f_dentry->d_inode;
+        bhv_vnode_t     *vp = vn_from_inode(inode);
-        VOP_IOCTL(vp, inode, filp, IO_INVIS, cmd, (void __user *)arg, error);
+        error = bhv_vop_ioctl(vp, inode, filp, IO_INVIS, cmd, (void __user *)p);
        VMODIFY(vp);
        /* NOTE:  some of the ioctl's return positive #'s as a
@@ -528,7 +494,7 @@ xfs_vm_mprotect(
        struct vm_area_struct *vma,
        unsigned int    newflags)
 {
-        vnode_t         *vp = vn_from_inode(vma->vm_file->f_dentry->d_inode);
+        bhv_vnode_t     *vp = vn_from_inode(vma->vm_file->f_dentry->d_inode);
        int             error = 0;
        if (vp->v_vfsp->vfs_flag & VFS_DMI) {
@@ -554,24 +520,19 @@ STATIC int
 xfs_file_open_exec(
        struct inode    *inode)
 {
-        vnode_t         *vp = vn_from_inode(inode);
+        bhv_vnode_t     *vp = vn_from_inode(inode);
-        xfs_mount_t     *mp = XFS_VFSTOM(vp->v_vfsp);
-        int             error = 0;
-        xfs_inode_t     *ip;
-        if (vp->v_vfsp->vfs_flag & VFS_DMI) {
+        if (unlikely(vp->v_vfsp->vfs_flag & VFS_DMI)) {
-                ip = xfs_vtoi(vp);
+                xfs_mount_t     *mp = XFS_VFSTOM(vp->v_vfsp);
-                if (!ip) {
+                xfs_inode_t     *ip = xfs_vtoi(vp);
-                        error = -EINVAL;
-                        goto open_exec_out;
+                if (!ip)
-                }
+                        return -EINVAL;
-                if (DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_READ)) {
+                if (DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_READ))
-                        error = -XFS_SEND_DATA(mp, DM_EVENT_READ, vp,
+                        return -XFS_SEND_DATA(mp, DM_EVENT_READ, vp,
                                               0, 0, 0, NULL);
-                }
        }
-open_exec_out:
+        return 0;
-        return error;
 }
 #endif /* HAVE_FOP_OPEN_EXEC */
@@ -592,6 +553,7 @@ const struct file_operations xfs_file_operations = {
 #endif
        .mmap           = xfs_file_mmap,
        .open           = xfs_file_open,
+        .flush          = xfs_file_close,
        .release        = xfs_file_release,
        .fsync          = xfs_file_fsync,
 #ifdef HAVE_FOP_OPEN_EXEC
@@ -616,6 +578,7 @@ const struct file_operations xfs_invis_file_operations = {
 #endif
        .mmap           = xfs_file_mmap,
        .open           = xfs_file_open,
+        .flush          = xfs_file_close,
        .release        = xfs_file_release,
        .fsync          = xfs_file_fsync,
 };
diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.c b/fs/xfs/linux-2.6/xfs_fs_subr.c
index 575f2a790f31..dc0562828e76 100644
--- a/fs/xfs/linux-2.6/xfs_fs_subr.c
+++ b/fs/xfs/linux-2.6/xfs_fs_subr.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2000-2002,2005-2006 Silicon Graphics, Inc.
 * All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
@@ -15,40 +15,12 @@
 * along with this program; if not, write the Free Software Foundation,
 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */
 #include "xfs.h"
-/*
+int  fs_noerr(void) { return 0; }
- * Stub for no-op vnode operations that return error status.
+int  fs_nosys(void) { return ENOSYS; }
- */
+void fs_noval(void) { return; }
-int
-fs_noerr(void)
-{
-        return 0;
-}
-/*
- * Operation unsupported under this file system.
- */
-int
-fs_nosys(void)
-{
-        return ENOSYS;
-}
-/*
- * Stub for inactive, strategy, and read/write lock/unlock.  Does nothing.
- */
-/* ARGSUSED */
-void
-fs_noval(void)
-{
-}
-/*
- * vnode pcache layer for vnode_tosspages.
- * 'last' parameter unused but left in for IRIX compatibility
- */
 void
 fs_tosspages(
        bhv_desc_t      *bdp,
@@ -56,18 +28,13 @@ fs_tosspages(
        xfs_off_t       last,
        int             fiopt)
 {
-        vnode_t         *vp = BHV_TO_VNODE(bdp);
+        bhv_vnode_t     *vp = BHV_TO_VNODE(bdp);
        struct inode    *ip = vn_to_inode(vp);
        if (VN_CACHED(vp))
                truncate_inode_pages(ip->i_mapping, first);
 }
-/*
- * vnode pcache layer for vnode_flushinval_pages.
- * 'last' parameter unused but left in for IRIX compatibility
- */
 void
 fs_flushinval_pages(
        bhv_desc_t      *bdp,
@@ -75,20 +42,17 @@ fs_flushinval_pages(
        xfs_off_t       last,
        int             fiopt)
 {
-        vnode_t         *vp = BHV_TO_VNODE(bdp);
+        bhv_vnode_t     *vp = BHV_TO_VNODE(bdp);
        struct inode    *ip = vn_to_inode(vp);
        if (VN_CACHED(vp)) {
+                if (VN_TRUNC(vp))
+                        VUNTRUNCATE(vp);
                filemap_write_and_wait(ip->i_mapping);
                truncate_inode_pages(ip->i_mapping, first);
        }
 }
-/*
- * vnode pcache layer for vnode_flush_pages.
- * 'last' parameter unused but left in for IRIX compatibility
- */
 int
 fs_flush_pages(
        bhv_desc_t      *bdp,
@@ -97,15 +61,16 @@ fs_flush_pages(
        uint64_t        flags,
        int             fiopt)
 {
-        vnode_t         *vp = BHV_TO_VNODE(bdp);
+        bhv_vnode_t     *vp = BHV_TO_VNODE(bdp);
        struct inode    *ip = vn_to_inode(vp);
-        if (VN_CACHED(vp)) {
+        if (VN_DIRTY(vp)) {
+                if (VN_TRUNC(vp))
+                        VUNTRUNCATE(vp);
                filemap_fdatawrite(ip->i_mapping);
                if (flags & XFS_B_ASYNC)
                        return 0;
                filemap_fdatawait(ip->i_mapping);
        }
        return 0;
 }
diff --git a/fs/xfs/linux-2.6/xfs_globals.c b/fs/xfs/linux-2.6/xfs_globals.c
index 6e8085f34635..6c162c3dde7e 100644
--- a/fs/xfs/linux-2.6/xfs_globals.c
+++ b/fs/xfs/linux-2.6/xfs_globals.c
@@ -45,6 +45,7 @@ xfs_param_t xfs_params = {
        .xfs_buf_age    = {     1*100,          15*100,         7200*100},
        .inherit_nosym  = {     0,              0,              1       },
        .rotorstep      = {     1,              1,              255     },
+        .inherit_nodfrg = {     0,              1,              1       },
 };
 /*
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index 84478491609b..6e52a5dd38d8 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -23,7 +23,6 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_alloc.h"
 #include "xfs_dmapi.h"
@@ -31,7 +30,6 @@
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_dinode.h"
@@ -78,7 +76,7 @@ xfs_find_handle(
        xfs_handle_t            handle;
        xfs_fsop_handlereq_t    hreq;
        struct inode            *inode;
-        struct vnode            *vp;
+        bhv_vnode_t             *vp;
        if (copy_from_user(&hreq, arg, sizeof(hreq)))
                return -XFS_ERROR(EFAULT);
@@ -192,7 +190,7 @@ xfs_vget_fsop_handlereq(
        xfs_mount_t             *mp,
        struct inode            *parinode,      /* parent inode pointer    */
        xfs_fsop_handlereq_t    *hreq,
-        vnode_t                 **vp,
+        bhv_vnode_t             **vp,
        struct inode            **inode)
 {
        void                    __user *hanp;
@@ -202,7 +200,7 @@ xfs_vget_fsop_handlereq(
        xfs_handle_t            handle;
        xfs_inode_t             *ip;
        struct inode            *inodep;
-        vnode_t                 *vpp;
+        bhv_vnode_t             *vpp;
        xfs_ino_t               ino;
        __u32                   igen;
        int                     error;
@@ -277,7 +275,7 @@ xfs_open_by_handle(
        struct file             *filp;
        struct inode            *inode;
        struct dentry           *dentry;
-        vnode_t                 *vp;
+        bhv_vnode_t             *vp;
        xfs_fsop_handlereq_t    hreq;
        if (!capable(CAP_SYS_ADMIN))
@@ -362,7 +360,7 @@ xfs_readlink_by_handle(
        struct uio              auio;
        struct inode            *inode;
        xfs_fsop_handlereq_t    hreq;
-        vnode_t                 *vp;
+        bhv_vnode_t             *vp;
        __u32                   olen;
        if (!capable(CAP_SYS_ADMIN))
@@ -393,9 +391,11 @@ xfs_readlink_by_handle(
        auio.uio_segflg = UIO_USERSPACE;
        auio.uio_resid  = olen;
-        VOP_READLINK(vp, &auio, IO_INVIS, NULL, error);
+        error = bhv_vop_readlink(vp, &auio, IO_INVIS, NULL);
        VN_RELE(vp);
+        if (error)
+                return -error;
        return (olen - auio.uio_resid);
 }
@@ -411,7 +411,7 @@ xfs_fssetdm_by_handle(
        xfs_fsop_setdm_handlereq_t dmhreq;
        struct inode            *inode;
        bhv_desc_t              *bdp;
-        vnode_t                 *vp;
+        bhv_vnode_t             *vp;
        if (!capable(CAP_MKNOD))
                return -XFS_ERROR(EPERM);
@@ -452,7 +452,7 @@ xfs_attrlist_by_handle(
        attrlist_cursor_kern_t  *cursor;
        xfs_fsop_attrlist_handlereq_t al_hreq;
        struct inode            *inode;
-        vnode_t                 *vp;
+        bhv_vnode_t             *vp;
        char                    *kbuf;
        if (!capable(CAP_SYS_ADMIN))
@@ -472,8 +472,8 @@ xfs_attrlist_by_handle(
                goto out_vn_rele;
        cursor = (attrlist_cursor_kern_t *)&al_hreq.pos;
-        VOP_ATTR_LIST(vp, kbuf, al_hreq.buflen, al_hreq.flags,
+        error = bhv_vop_attr_list(vp, kbuf, al_hreq.buflen, al_hreq.flags,
-                        cursor, NULL, error);
+                                        cursor, NULL);
        if (error)
                goto out_kfree;
@@ -490,7 +490,7 @@ xfs_attrlist_by_handle(
 STATIC int
 xfs_attrmulti_attr_get(
-        struct vnode            *vp,
+        bhv_vnode_t             *vp,
        char                    *name,
        char                    __user *ubuf,
        __uint32_t              *len,
@@ -505,7 +505,7 @@ xfs_attrmulti_attr_get(
        if (!kbuf)
                return ENOMEM;
-        VOP_ATTR_GET(vp, name, kbuf, len, flags, NULL, error);
+        error = bhv_vop_attr_get(vp, name, kbuf, len, flags, NULL);
        if (error)
                goto out_kfree;
@@ -519,7 +519,7 @@ xfs_attrmulti_attr_get(
 STATIC int
 xfs_attrmulti_attr_set(
-        struct vnode            *vp,
+        bhv_vnode_t             *vp,
        char                    *name,
        const char              __user *ubuf,
        __uint32_t              len,
@@ -542,7 +542,7 @@ xfs_attrmulti_attr_set(
        if (copy_from_user(kbuf, ubuf, len))
                goto out_kfree;
                        
-        VOP_ATTR_SET(vp, name, kbuf, len, flags, NULL, error);
+        error = bhv_vop_attr_set(vp, name, kbuf, len, flags, NULL);
 out_kfree:
        kfree(kbuf);
@@ -551,20 +551,15 @@ xfs_attrmulti_attr_set(
 STATIC int
 xfs_attrmulti_attr_remove(
-        struct vnode            *vp,
+        bhv_vnode_t             *vp,
        char                    *name,
        __uint32_t              flags)
 {
-        int                     error;
        if (IS_RDONLY(&vp->v_inode))
                return -EROFS;
        if (IS_IMMUTABLE(&vp->v_inode) || IS_APPEND(&vp->v_inode))
                return EPERM;
+        return bhv_vop_attr_remove(vp, name, flags, NULL);
-        VOP_ATTR_REMOVE(vp, name, flags, NULL, error);
-        return error;
 }
 STATIC int
@@ -578,7 +573,7 @@ xfs_attrmulti_by_handle(
        xfs_attr_multiop_t      *ops;
        xfs_fsop_attrmulti_handlereq_t am_hreq;
        struct inode            *inode;
-        vnode_t                 *vp;
+        bhv_vnode_t             *vp;
        unsigned int            i, size;
        char                    *attr_name;
@@ -658,7 +653,7 @@ xfs_attrmulti_by_handle(
 STATIC int
 xfs_ioc_space(
        bhv_desc_t              *bdp,
-        vnode_t                 *vp,
+        bhv_vnode_t             *vp,
        struct file             *filp,
        int                     flags,
        unsigned int            cmd,
@@ -682,7 +677,7 @@ xfs_ioc_fsgeometry(
 STATIC int
 xfs_ioc_xattr(
-        vnode_t                 *vp,
+        bhv_vnode_t             *vp,
        xfs_inode_t             *ip,
        struct file             *filp,
        unsigned int            cmd,
@@ -711,7 +706,7 @@ xfs_ioctl(
        void                    __user *arg)
 {
        int                     error;
-        vnode_t                 *vp;
+        bhv_vnode_t             *vp;
        xfs_inode_t             *ip;
        xfs_mount_t             *mp;
@@ -962,7 +957,7 @@ xfs_ioctl(
 STATIC int
 xfs_ioc_space(
        bhv_desc_t              *bdp,
-        vnode_t                 *vp,
+        bhv_vnode_t             *vp,
        struct file             *filp,
        int                     ioflags,
        unsigned int            cmd,
@@ -1153,14 +1148,14 @@ xfs_di2lxflags(
 STATIC int
 xfs_ioc_xattr(
-        vnode_t                 *vp,
+        bhv_vnode_t             *vp,
        xfs_inode_t             *ip,
        struct file             *filp,
        unsigned int            cmd,
        void                    __user *arg)
 {
        struct fsxattr          fa;
-        struct vattr            *vattr;
+        struct bhv_vattr        *vattr;
        int                     error = 0;
        int                     attr_flags;
        unsigned int            flags;
@@ -1173,7 +1168,7 @@ xfs_ioc_xattr(
        case XFS_IOC_FSGETXATTR: {
                vattr->va_mask = XFS_AT_XFLAGS | XFS_AT_EXTSIZE | \
                                 XFS_AT_NEXTENTS | XFS_AT_PROJID;
-                VOP_GETATTR(vp, vattr, 0, NULL, error);
+                error = bhv_vop_getattr(vp, vattr, 0, NULL);
                if (unlikely(error)) {
                        error = -error;
                        break;
@@ -1206,7 +1201,7 @@ xfs_ioc_xattr(
                vattr->va_extsize = fa.fsx_extsize;
                vattr->va_projid  = fa.fsx_projid;
-                VOP_SETATTR(vp, vattr, attr_flags, NULL, error);
+                error = bhv_vop_setattr(vp, vattr, attr_flags, NULL);
                if (likely(!error))
                        __vn_revalidate(vp, vattr);     /* update flags */
                error = -error;
@@ -1216,7 +1211,7 @@ xfs_ioc_xattr(
        case XFS_IOC_FSGETXATTRA: {
                vattr->va_mask = XFS_AT_XFLAGS | XFS_AT_EXTSIZE | \
                                 XFS_AT_ANEXTENTS | XFS_AT_PROJID;
-                VOP_GETATTR(vp, vattr, 0, NULL, error);
+                error = bhv_vop_getattr(vp, vattr, 0, NULL);
                if (unlikely(error)) {
                        error = -error;
                        break;
@@ -1262,7 +1257,7 @@ xfs_ioc_xattr(
                vattr->va_xflags = xfs_merge_ioc_xflags(flags,
                                                        xfs_ip2xflags(ip));
-                VOP_SETATTR(vp, vattr, attr_flags, NULL, error);
+                error = bhv_vop_setattr(vp, vattr, attr_flags, NULL);
                if (likely(!error))
                        __vn_revalidate(vp, vattr);     /* update flags */
                error = -error;
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c
index 251bfe451a3f..270db0f3861d 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.c
@@ -15,7 +15,6 @@
 * along with this program; if not, write the Free Software Foundation,
 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */
-#include <linux/config.h>
 #include <linux/compat.h>
 #include <linux/init.h>
 #include <linux/ioctl.h>
@@ -114,7 +113,7 @@ xfs_compat_ioctl(
        unsigned long   arg)
 {
        struct inode    *inode = file->f_dentry->d_inode;
-        vnode_t         *vp = vn_from_inode(inode);
+        bhv_vnode_t     *vp = vn_from_inode(inode);
        int             error;
        switch (cmd) {
@@ -193,7 +192,7 @@ xfs_compat_ioctl(
                return -ENOIOCTLCMD;
        }
-        VOP_IOCTL(vp, inode, file, mode, cmd, (void __user *)arg, error);
+        error = bhv_vop_ioctl(vp, inode, file, mode, cmd, (void __user *)arg);
        VMODIFY(vp);
        return error;
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 2e2e275c786f..d9180020de63 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -23,7 +23,6 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_alloc.h"
 #include "xfs_dmapi.h"
@@ -32,7 +31,6 @@
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
@@ -61,7 +59,7 @@
 */
 xfs_inode_t *
 xfs_vtoi(
-        struct vnode    *vp)
+        bhv_vnode_t     *vp)
 {
        bhv_desc_t      *bdp;
@@ -80,7 +78,7 @@ void
 xfs_synchronize_atime(
        xfs_inode_t     *ip)
 {
-        vnode_t         *vp;
+        bhv_vnode_t     *vp;
        vp = XFS_ITOV_NULL(ip);
        if (vp) {
@@ -200,14 +198,10 @@ xfs_ichgtime_fast(
 STATIC void
 xfs_validate_fields(
        struct inode    *ip,
-        struct vattr    *vattr)
+        bhv_vattr_t     *vattr)
 {
-        vnode_t         *vp = vn_from_inode(ip);
-        int             error;
        vattr->va_mask = XFS_AT_NLINK|XFS_AT_SIZE|XFS_AT_NBLOCKS;
-        VOP_GETATTR(vp, vattr, ATTR_LAZY, NULL, error);
+        if (!bhv_vop_getattr(vn_from_inode(ip), vattr, ATTR_LAZY, NULL)) {
-        if (likely(!error)) {
                ip->i_nlink = vattr->va_nlink;
                ip->i_blocks = vattr->va_nblocks;
@@ -225,7 +219,7 @@ xfs_validate_fields(
 */
 STATIC int
 xfs_init_security(
-        struct vnode    *vp,
+        bhv_vnode_t     *vp,
        struct inode    *dir)
 {
        struct inode    *ip = vn_to_inode(vp);
@@ -241,7 +235,7 @@ xfs_init_security(
                return -error;
        }
-        VOP_ATTR_SET(vp, name, value, length, ATTR_SECURE, NULL, error);
+        error = bhv_vop_attr_set(vp, name, value, length, ATTR_SECURE, NULL);
        if (!error)
                VMODIFY(vp);
@@ -264,13 +258,12 @@ xfs_has_fs_struct(struct task_struct *task)
 STATIC inline void
 xfs_cleanup_inode(
-        vnode_t         *dvp,
+        bhv_vnode_t     *dvp,
-        vnode_t         *vp,
+        bhv_vnode_t     *vp,
        struct dentry   *dentry,
        int             mode)
 {
        struct dentry   teardown = {};
-        int             error;
        /* Oh, the horror.
         * If we can't add the ACL or we fail in
@@ -281,9 +274,9 @@ xfs_cleanup_inode(
        teardown.d_name = dentry->d_name;
        if (S_ISDIR(mode))
-                VOP_RMDIR(dvp, &teardown, NULL, error);
+                bhv_vop_rmdir(dvp, &teardown, NULL);
        else
-                VOP_REMOVE(dvp, &teardown, NULL, error);
+                bhv_vop_remove(dvp, &teardown, NULL);
        VN_RELE(vp);
 }
@@ -295,8 +288,8 @@ xfs_vn_mknod(
        dev_t           rdev)
 {
        struct inode    *ip;
-        vattr_t         vattr = { 0 };
+        bhv_vattr_t     vattr = { 0 };
-        vnode_t         *vp = NULL, *dvp = vn_from_inode(dir);
+        bhv_vnode_t     *vp = NULL, *dvp = vn_from_inode(dir);
        xfs_acl_t       *default_acl = NULL;
        attrexists_t    test_default_acl = _ACL_DEFAULT_EXISTS;
        int             error;
@@ -330,10 +323,10 @@ xfs_vn_mknod(
                vattr.va_mask |= XFS_AT_RDEV;
                /*FALLTHROUGH*/
        case S_IFREG:
-                VOP_CREATE(dvp, dentry, &vattr, &vp, NULL, error);
+                error = bhv_vop_create(dvp, dentry, &vattr, &vp, NULL);
                break;
        case S_IFDIR:
-                VOP_MKDIR(dvp, dentry, &vattr, &vp, NULL, error);
+                error = bhv_vop_mkdir(dvp, dentry, &vattr, &vp, NULL);
                break;
        default:
                error = EINVAL;
@@ -396,14 +389,14 @@ xfs_vn_lookup(
        struct dentry   *dentry,
        struct nameidata *nd)
 {
-        struct vnode    *vp = vn_from_inode(dir), *cvp;
+        bhv_vnode_t     *vp = vn_from_inode(dir), *cvp;
        int             error;
        if (dentry->d_name.len >= MAXNAMELEN)
                return ERR_PTR(-ENAMETOOLONG);
-        VOP_LOOKUP(vp, dentry, &cvp, 0, NULL, NULL, error);
+        error = bhv_vop_lookup(vp, dentry, &cvp, 0, NULL, NULL);
-        if (error) {
+        if (unlikely(error)) {
                if (unlikely(error != ENOENT))
                        return ERR_PTR(-error);
                d_add(dentry, NULL);
@@ -420,22 +413,21 @@ xfs_vn_link(
        struct dentry   *dentry)
 {
        struct inode    *ip;    /* inode of guy being linked to */
-        vnode_t         *tdvp;  /* target directory for new name/link */
+        bhv_vnode_t     *tdvp;  /* target directory for new name/link */
-        vnode_t         *vp;    /* vp of name being linked */
+        bhv_vnode_t     *vp;    /* vp of name being linked */
-        vattr_t         vattr;
+        bhv_vattr_t     vattr;
        int             error;
        ip = old_dentry->d_inode;       /* inode being linked to */
-        if (S_ISDIR(ip->i_mode))
-                return -EPERM;
        tdvp = vn_from_inode(dir);
        vp = vn_from_inode(ip);
-        VOP_LINK(tdvp, vp, dentry, NULL, error);
+        VN_HOLD(vp);
-        if (likely(!error)) {
+        error = bhv_vop_link(tdvp, vp, dentry, NULL);
+        if (unlikely(error)) {
+                VN_RELE(vp);
+        } else {
                VMODIFY(tdvp);
-                VN_HOLD(vp);
                xfs_validate_fields(ip, &vattr);
                d_instantiate(dentry, ip);
        }
@@ -448,14 +440,14 @@ xfs_vn_unlink(
        struct dentry   *dentry)
 {
        struct inode    *inode;
-        vnode_t         *dvp;   /* directory containing name to remove */
+        bhv_vnode_t     *dvp;   /* directory containing name to remove */
-        vattr_t         vattr;
+        bhv_vattr_t     vattr;
        int             error;
        inode = dentry->d_inode;
        dvp = vn_from_inode(dir);
-        VOP_REMOVE(dvp, dentry, NULL, error);
+        error = bhv_vop_remove(dvp, dentry, NULL);
        if (likely(!error)) {
                xfs_validate_fields(dir, &vattr);       /* size needs update */
                xfs_validate_fields(inode, &vattr);
@@ -470,27 +462,26 @@ xfs_vn_symlink(
        const char      *symname)
 {
        struct inode    *ip;
-        vattr_t         vattr = { 0 };
+        bhv_vattr_t     va = { 0 };
-        vnode_t         *dvp;   /* directory containing name of symlink */
+        bhv_vnode_t     *dvp;   /* directory containing name of symlink */
-        vnode_t         *cvp;   /* used to lookup symlink to put in dentry */
+        bhv_vnode_t     *cvp;   /* used to lookup symlink to put in dentry */
        int             error;
        dvp = vn_from_inode(dir);
        cvp = NULL;
-        vattr.va_mode = S_IFLNK |
+        va.va_mode = S_IFLNK |
                (irix_symlink_mode ? 0777 & ~current->fs->umask : S_IRWXUGO);
-        vattr.va_mask = XFS_AT_TYPE|XFS_AT_MODE;
+        va.va_mask = XFS_AT_TYPE|XFS_AT_MODE;
-        error = 0;
+        error = bhv_vop_symlink(dvp, dentry, &va, (char *)symname, &cvp, NULL);
-        VOP_SYMLINK(dvp, dentry, &vattr, (char *)symname, &cvp, NULL, error);
        if (likely(!error && cvp)) {
                error = xfs_init_security(cvp, dir);
                if (likely(!error)) {
                        ip = vn_to_inode(cvp);
                        d_instantiate(dentry, ip);
-                        xfs_validate_fields(dir, &vattr);
+                        xfs_validate_fields(dir, &va);
-                        xfs_validate_fields(ip, &vattr);
+                        xfs_validate_fields(ip, &va);
                } else {
                        xfs_cleanup_inode(dvp, cvp, dentry, 0);
                }
@@ -504,11 +495,11 @@ xfs_vn_rmdir(
        struct dentry   *dentry)
 {
        struct inode    *inode = dentry->d_inode;
-        vnode_t         *dvp = vn_from_inode(dir);
+        bhv_vnode_t     *dvp = vn_from_inode(dir);
-        vattr_t         vattr;
+        bhv_vattr_t     vattr;
        int             error;
-        VOP_RMDIR(dvp, dentry, NULL, error);
+        error = bhv_vop_rmdir(dvp, dentry, NULL);
        if (likely(!error)) {
                xfs_validate_fields(inode, &vattr);
                xfs_validate_fields(dir, &vattr);
@@ -524,15 +515,15 @@ xfs_vn_rename(
        struct dentry   *ndentry)
 {
        struct inode    *new_inode = ndentry->d_inode;
-        vnode_t         *fvp;   /* from directory */
+        bhv_vnode_t     *fvp;   /* from directory */
-        vnode_t         *tvp;   /* target directory */
+        bhv_vnode_t     *tvp;   /* target directory */
-        vattr_t         vattr;
+        bhv_vattr_t     vattr;
        int             error;
        fvp = vn_from_inode(odir);
        tvp = vn_from_inode(ndir);
-        VOP_RENAME(fvp, odentry, tvp, ndentry, NULL, error);
+        error = bhv_vop_rename(fvp, odentry, tvp, ndentry, NULL);
        if (likely(!error)) {
                if (new_inode)
                        xfs_validate_fields(new_inode, &vattr);
@@ -553,7 +544,7 @@ xfs_vn_follow_link(
        struct dentry           *dentry,
        struct nameidata        *nd)
 {
-        vnode_t                 *vp;
+        bhv_vnode_t             *vp;
        uio_t                   *uio;
        iovec_t                 iov;
        int                     error;
@@ -586,8 +577,8 @@ xfs_vn_follow_link(
        uio->uio_resid = MAXPATHLEN;
        uio->uio_iovcnt = 1;
-        VOP_READLINK(vp, uio, 0, NULL, error);
+        error = bhv_vop_readlink(vp, uio, 0, NULL);
-        if (error) {
+        if (unlikely(error)) {
                kfree(link);
                link = ERR_PTR(-error);
        } else {
@@ -618,12 +609,7 @@ xfs_vn_permission(
        int             mode,
        struct nameidata *nd)
 {
-        vnode_t         *vp = vn_from_inode(inode);
+        return -bhv_vop_access(vn_from_inode(inode), mode << 6, NULL);
-        int             error;
-        mode <<= 6;             /* convert from linux to vnode access bits */
-        VOP_ACCESS(vp, mode, NULL, error);
-        return -error;
 }
 #else
 #define xfs_vn_permission NULL
@@ -636,14 +622,14 @@ xfs_vn_getattr(
        struct kstat    *stat)
 {
        struct inode    *inode = dentry->d_inode;
-        vnode_t         *vp = vn_from_inode(inode);
+        bhv_vnode_t     *vp = vn_from_inode(inode);
        int             error = 0;
        if (unlikely(vp->v_flag & VMODIFIED))
                error = vn_revalidate(vp);
        if (!error)
                generic_fillattr(inode, stat);
-        return 0;
+        return -error;
 }
 STATIC int
@@ -653,8 +639,8 @@ xfs_vn_setattr(
 {
        struct inode    *inode = dentry->d_inode;
        unsigned int    ia_valid = attr->ia_valid;
-        vnode_t         *vp = vn_from_inode(inode);
+        bhv_vnode_t     *vp = vn_from_inode(inode);
-        vattr_t         vattr = { 0 };
+        bhv_vattr_t     vattr = { 0 };
        int             flags = 0;
        int             error;
@@ -697,7 +683,7 @@ xfs_vn_setattr(
                flags |= ATTR_NONBLOCK;
 #endif
-        VOP_SETATTR(vp, &vattr, flags, NULL, error);
+        error = bhv_vop_setattr(vp, &vattr, flags, NULL);
        if (likely(!error))
                __vn_revalidate(vp, &vattr);
        return -error;
@@ -718,7 +704,7 @@ xfs_vn_setxattr(
        size_t          size,
        int             flags)
 {
-        vnode_t         *vp = vn_from_inode(dentry->d_inode);
+        bhv_vnode_t     *vp = vn_from_inode(dentry->d_inode);
        char            *attr = (char *)name;
        attrnames_t     *namesp;
        int             xflags = 0;
@@ -748,7 +734,7 @@ xfs_vn_getxattr(
        void            *data,
        size_t          size)
 {
-        vnode_t         *vp = vn_from_inode(dentry->d_inode);
+        bhv_vnode_t     *vp = vn_from_inode(dentry->d_inode);
        char            *attr = (char *)name;
        attrnames_t     *namesp;
        int             xflags = 0;
@@ -777,7 +763,7 @@ xfs_vn_listxattr(
        char                    *data,
        size_t                  size)
 {
-        vnode_t                 *vp = vn_from_inode(dentry->d_inode);
+        bhv_vnode_t             *vp = vn_from_inode(dentry->d_inode);
        int                     error, xflags = ATTR_KERNAMELS;
        ssize_t                 result;
@@ -796,7 +782,7 @@ xfs_vn_removexattr(
        struct dentry   *dentry,
        const char      *name)
 {
-        vnode_t         *vp = vn_from_inode(dentry->d_inode);
+        bhv_vnode_t     *vp = vn_from_inode(dentry->d_inode);
        char            *attr = (char *)name;
        attrnames_t     *namesp;
        int             xflags = 0;
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index e9fe43d74768..8c021dc57d1f 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -19,7 +19,6 @@
 #define __XFS_LINUX__
 #include <linux/types.h>
-#include <linux/config.h>
 /*
 * Some types are conditional depending on the target system.
@@ -134,14 +133,19 @@ BUFFER_FNS(PrivateStart, unwritten);
 #define xfs_buf_age_centisecs   xfs_params.xfs_buf_age.val
 #define xfs_inherit_nosymlinks  xfs_params.inherit_nosym.val
 #define xfs_rotorstep           xfs_params.rotorstep.val
+#define xfs_inherit_nodefrag    xfs_params.inherit_nodfrg.val
-#ifndef raw_smp_processor_id
+#define current_cpu()           (raw_smp_processor_id())
-#define raw_smp_processor_id()  smp_processor_id()
-#endif
-#define current_cpu()           raw_smp_processor_id()
 #define current_pid()           (current->pid)
 #define current_fsuid(cred)     (current->fsuid)
 #define current_fsgid(cred)     (current->fsgid)
+#define current_test_flags(f)   (current->flags & (f))
+#define current_set_flags_nested(sp, f)         \
+                (*(sp) = current->flags, current->flags |= (f))
+#define current_clear_flags_nested(sp, f)       \
+                (*(sp) = current->flags, current->flags &= ~(f))
+#define current_restore_flags_nested(sp, f)     \
+                (current->flags = ((current->flags & ~(f)) | (*(sp) & (f))))
 #define NBPP            PAGE_SIZE
 #define DPPSHFT         (PAGE_SHIFT - 9)
@@ -187,25 +191,9 @@ BUFFER_FNS(PrivateStart, unwritten);
 /* bytes to clicks */
 #define btoc(x)         (((__psunsigned_t)(x)+(NBPC-1))>>BPCSHIFT)
-#ifndef ENOATTR
 #define ENOATTR         ENODATA         /* Attribute not found */
-#endif
+#define EWRONGFS        EINVAL          /* Mount with wrong filesystem type */
+#define EFSCORRUPTED    EUCLEAN         /* Filesystem is corrupted */
-/* Note: EWRONGFS never visible outside the kernel */
-#define EWRONGFS        EINVAL          /* Mount with wrong filesystem type */
-/*
- * XXX EFSCORRUPTED needs a real value in errno.h. asm-i386/errno.h won't
- *     return codes out of its known range in errno.
- * XXX Also note: needs to be < 1000 and fairly unique on Linux (mustn't
- *     conflict with any code we use already or any code a driver may use)
- * XXX Some options (currently we do #2):
- *      1/ New error code ["Filesystem is corrupted", _after_ glibc updated]
- *      2/ 990 ["Unknown error 990"]
- *      3/ EUCLEAN ["Structure needs cleaning"]
- *      4/ Convert EFSCORRUPTED to EIO [just prior to return into userspace]
- */
-#define EFSCORRUPTED    990             /* Filesystem is corrupted */
 #define SYNCHRONIZE()   barrier()
 #define __return_address __builtin_return_address(0)
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
index 67efe3308980..5d9cfd91ad08 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -23,7 +23,6 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_alloc.h"
 #include "xfs_dmapi.h"
@@ -32,7 +31,6 @@
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
@@ -206,7 +204,7 @@ xfs_read(
        xfs_fsize_t             n;
        xfs_inode_t             *ip;
        xfs_mount_t             *mp;
-        vnode_t                 *vp;
+        bhv_vnode_t             *vp;
        unsigned long           seg;
        ip = XFS_BHVTOI(bdp);
@@ -258,7 +256,7 @@ xfs_read(
        if (DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_READ) &&
            !(ioflags & IO_INVIS)) {
-                vrwlock_t locktype = VRWLOCK_READ;
+                bhv_vrwlock_t locktype = VRWLOCK_READ;
                int dmflags = FILP_DELAY_FLAG(file) | DM_SEM_FLAG_RD(ioflags);
                ret = -XFS_SEND_DATA(mp, DM_EVENT_READ,
@@ -271,7 +269,7 @@ xfs_read(
        }
        if (unlikely((ioflags & IO_ISDIRECT) && VN_CACHED(vp)))
-                VOP_FLUSHINVAL_PAGES(vp, ctooff(offtoct(*offset)),
+                bhv_vop_flushinval_pages(vp, ctooff(offtoct(*offset)),
                                                -1, FI_REMAPF_LOCKED);
        xfs_rw_enter_trace(XFS_READ_ENTER, &ip->i_iocore,
@@ -313,7 +311,7 @@ xfs_sendfile(
        if (DM_EVENT_ENABLED(BHV_TO_VNODE(bdp)->v_vfsp, ip, DM_EVENT_READ) &&
            (!(ioflags & IO_INVIS))) {
-                vrwlock_t locktype = VRWLOCK_READ;
+                bhv_vrwlock_t locktype = VRWLOCK_READ;
                int error;
                error = XFS_SEND_DATA(mp, DM_EVENT_READ, BHV_TO_VNODE(bdp),
@@ -357,7 +355,7 @@ xfs_splice_read(
        if (DM_EVENT_ENABLED(BHV_TO_VNODE(bdp)->v_vfsp, ip, DM_EVENT_READ) &&
            (!(ioflags & IO_INVIS))) {
-                vrwlock_t locktype = VRWLOCK_READ;
+                bhv_vrwlock_t locktype = VRWLOCK_READ;
                int error;
                error = XFS_SEND_DATA(mp, DM_EVENT_READ, BHV_TO_VNODE(bdp),
@@ -401,7 +399,7 @@ xfs_splice_write(
        if (DM_EVENT_ENABLED(BHV_TO_VNODE(bdp)->v_vfsp, ip, DM_EVENT_WRITE) &&
            (!(ioflags & IO_INVIS))) {
-                vrwlock_t locktype = VRWLOCK_WRITE;
+                bhv_vrwlock_t locktype = VRWLOCK_WRITE;
                int error;
                error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, BHV_TO_VNODE(bdp),
@@ -458,7 +456,7 @@ xfs_zero_last_block(
        last_fsb = XFS_B_TO_FSBT(mp, isize);
        nimaps = 1;
        error = XFS_BMAPI(mp, NULL, io, last_fsb, 1, 0, NULL, 0, &imap,
-                          &nimaps, NULL);
+                          &nimaps, NULL, NULL);
        if (error) {
                return error;
        }
@@ -499,7 +497,7 @@ xfs_zero_last_block(
 int                                     /* error (positive) */
 xfs_zero_eof(
-        vnode_t         *vp,
+        bhv_vnode_t     *vp,
        xfs_iocore_t    *io,
        xfs_off_t       offset,         /* starting I/O offset */
        xfs_fsize_t     isize,          /* current inode size */
@@ -510,7 +508,6 @@ xfs_zero_eof(
        xfs_fileoff_t   end_zero_fsb;
        xfs_fileoff_t   zero_count_fsb;
        xfs_fileoff_t   last_fsb;
-        xfs_extlen_t    buf_len_fsb;
        xfs_mount_t     *mp = io->io_mount;
        int             nimaps;
        int             error = 0;
@@ -556,7 +553,7 @@ xfs_zero_eof(
                nimaps = 1;
                zero_count_fsb = end_zero_fsb - start_zero_fsb + 1;
                error = XFS_BMAPI(mp, NULL, io, start_zero_fsb, zero_count_fsb,
-                                  0, NULL, 0, &imap, &nimaps, NULL);
+                                  0, NULL, 0, &imap, &nimaps, NULL, NULL);
                if (error) {
                        ASSERT(ismrlocked(io->io_lock, MR_UPDATE));
                        ASSERT(ismrlocked(io->io_iolock, MR_UPDATE));
@@ -579,16 +576,7 @@ xfs_zero_eof(
                }
                /*
-                 * There are blocks in the range requested.
+                 * There are blocks we need to zero.
-                 * Zero them a single write at a time.  We actually
-                 * don't zero the entire range returned if it is
-                 * too big and simply loop around to get the rest.
-                 * That is not the most efficient thing to do, but it
-                 * is simple and this path should not be exercised often.
-                 */
-                buf_len_fsb = XFS_FILBLKS_MIN(imap.br_blockcount,
-                                              mp->m_writeio_blocks << 8);
-                /*
                 * Drop the inode lock while we're doing the I/O.
                 * We'll still have the iolock to protect us.
                 */
@@ -596,14 +584,13 @@ xfs_zero_eof(
                error = xfs_iozero(ip,
                                   XFS_FSB_TO_B(mp, start_zero_fsb),
-                                   XFS_FSB_TO_B(mp, buf_len_fsb),
+                                   XFS_FSB_TO_B(mp, imap.br_blockcount),
                                   end_size);
                if (error) {
                        goto out_lock;
                }
-                start_zero_fsb = imap.br_startoff + buf_len_fsb;
+                start_zero_fsb = imap.br_startoff + imap.br_blockcount;
                ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
                XFS_ILOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD);
@@ -637,11 +624,11 @@ xfs_write(
        ssize_t                 ret = 0, error = 0;
        xfs_fsize_t             isize, new_size;
        xfs_iocore_t            *io;
-        vnode_t                 *vp;
+        bhv_vnode_t             *vp;
        unsigned long           seg;
        int                     iolock;
        int                     eventsent = 0;
-        vrwlock_t               locktype;
+        bhv_vrwlock_t           locktype;
        size_t                  ocount = 0, count;
        loff_t                  pos;
        int                     need_i_mutex = 1, need_flush = 0;
@@ -679,11 +666,11 @@ xfs_write(
        io = &xip->i_iocore;
        mp = io->io_mount;
+        vfs_wait_for_freeze(vp->v_vfsp, SB_FREEZE_WRITE);
        if (XFS_FORCED_SHUTDOWN(mp))
                return -EIO;
-        fs_check_frozen(vp->v_vfsp, SB_FREEZE_WRITE);
        if (ioflags & IO_ISDIRECT) {
                xfs_buftarg_t   *target =
                        (xip->i_d.di_flags & XFS_DIFLAG_REALTIME) ?
@@ -814,7 +801,7 @@ retry:
                if (need_flush) {
                        xfs_inval_cached_trace(io, pos, -1,
                                        ctooff(offtoct(pos)), -1);
-                        VOP_FLUSHINVAL_PAGES(vp, ctooff(offtoct(pos)),
+                        bhv_vop_flushinval_pages(vp, ctooff(offtoct(pos)),
                                        -1, FI_REMAPF_LOCKED);
                }
@@ -903,79 +890,9 @@ retry:
        /* Handle various SYNC-type writes */
        if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) {
-                /*
+                error = xfs_write_sync_logforce(mp, xip);
-                 * If we're treating this as O_DSYNC and we have not updated the
+                if (error)
-                 * size, force the log.
+                        goto out_unlock_internal;
-                 */
-                if (!(mp->m_flags & XFS_MOUNT_OSYNCISOSYNC) &&
-                    !(xip->i_update_size)) {
-                        xfs_inode_log_item_t    *iip = xip->i_itemp;
-                        /*
-                         * If an allocation transaction occurred
-                         * without extending the size, then we have to force
-                         * the log up the proper point to ensure that the
-                         * allocation is permanent.  We can't count on
-                         * the fact that buffered writes lock out direct I/O
-                         * writes - the direct I/O write could have extended
-                         * the size nontransactionally, then finished before
-                         * we started.  xfs_write_file will think that the file
-                         * didn't grow but the update isn't safe unless the
-                         * size change is logged.
-                         *
-                         * Force the log if we've committed a transaction
-                         * against the inode or if someone else has and
-                         * the commit record hasn't gone to disk (e.g.
-                         * the inode is pinned).  This guarantees that
-                         * all changes affecting the inode are permanent
-                         * when we return.
-                         */
-                        if (iip && iip->ili_last_lsn) {
-                                xfs_log_force(mp, iip->ili_last_lsn,
-                                                XFS_LOG_FORCE | XFS_LOG_SYNC);
-                        } else if (xfs_ipincount(xip) > 0) {
-                                xfs_log_force(mp, (xfs_lsn_t)0,
-                                                XFS_LOG_FORCE | XFS_LOG_SYNC);
-                        }
-                } else {
-                        xfs_trans_t     *tp;
-                        /*
-                         * O_SYNC or O_DSYNC _with_ a size update are handled
-                         * the same way.
-                         *
-                         * If the write was synchronous then we need to make
-                         * sure that the inode modification time is permanent.
-                         * We'll have updated the timestamp above, so here
-                         * we use a synchronous transaction to log the inode.
-                         * It's not fast, but it's necessary.
-                         *
-                         * If this a dsync write and the size got changed
-                         * non-transactionally, then we need to ensure that
-                         * the size change gets logged in a synchronous
-                         * transaction.
-                         */
-                        tp = xfs_trans_alloc(mp, XFS_TRANS_WRITE_SYNC);
-                        if ((error = xfs_trans_reserve(tp, 0,
-                                                      XFS_SWRITE_LOG_RES(mp),
-                                                      0, 0, 0))) {
-                                /* Transaction reserve failed */
-                                xfs_trans_cancel(tp, 0);
-                        } else {
-                                /* Transaction reserve successful */
-                                xfs_ilock(xip, XFS_ILOCK_EXCL);
-                                xfs_trans_ijoin(tp, xip, XFS_ILOCK_EXCL);
-                                xfs_trans_ihold(tp, xip);
-                                xfs_trans_log_inode(tp, xip, XFS_ILOG_CORE);
-                                xfs_trans_set_sync(tp);
-                                error = xfs_trans_commit(tp, 0, NULL);
-                                xfs_iunlock(xip, XFS_ILOCK_EXCL);
-                        }
-                        if (error)
-                                goto out_unlock_internal;
-                }
                xfs_rwunlock(bdp, locktype);
                if (need_i_mutex)
diff --git a/fs/xfs/linux-2.6/xfs_lrw.h b/fs/xfs/linux-2.6/xfs_lrw.h
index 8f4539952350..c77e62efb742 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.h
+++ b/fs/xfs/linux-2.6/xfs_lrw.h
@@ -18,8 +18,8 @@
 #ifndef __XFS_LRW_H__
 #define __XFS_LRW_H__
-struct vnode;
 struct bhv_desc;
+struct bhv_vnode;
 struct xfs_mount;
 struct xfs_iocore;
 struct xfs_inode;
@@ -49,7 +49,7 @@ struct xfs_iomap;
 #define XFS_CTRUNC4             14
 #define XFS_CTRUNC5             15
 #define XFS_CTRUNC6             16
-#define XFS_BUNMAPI             17
+#define XFS_BUNMAP              17
 #define XFS_INVAL_CACHED        18
 #define XFS_DIORD_ENTER         19
 #define XFS_DIOWR_ENTER         20
@@ -82,7 +82,7 @@ extern int xfsbdstrat(struct xfs_mount *, struct xfs_buf *);
 extern int xfs_bdstrat_cb(struct xfs_buf *);
 extern int xfs_dev_is_read_only(struct xfs_mount *, char *);
-extern int xfs_zero_eof(struct vnode *, struct xfs_iocore *, xfs_off_t,
+extern int xfs_zero_eof(struct bhv_vnode *, struct xfs_iocore *, xfs_off_t,
                                xfs_fsize_t, xfs_fsize_t);
 extern ssize_t xfs_read(struct bhv_desc *, struct kiocb *,
                                const struct iovec *, unsigned int,
diff --git a/fs/xfs/linux-2.6/xfs_stats.c b/fs/xfs/linux-2.6/xfs_stats.c
index 1f0589a05eca..e480b6102051 100644
--- a/fs/xfs/linux-2.6/xfs_stats.c
+++ b/fs/xfs/linux-2.6/xfs_stats.c
@@ -62,7 +62,7 @@ xfs_read_xfsstats(
                while (j < xstats[i].endpoint) {
                        val = 0;
                        /* sum over all cpus */
-                        for_each_cpu(c)
+                        for_each_possible_cpu(c)
                                val += *(((__u32*)&per_cpu(xfsstats, c) + j));
                        len += sprintf(buffer + len, " %u", val);
                        j++;
@@ -70,7 +70,7 @@ xfs_read_xfsstats(
                buffer[len++] = '\n';
        }
        /* extra precision counters */
-        for_each_cpu(i) {
+        for_each_possible_cpu(i) {
                xs_xstrat_bytes += per_cpu(xfsstats, i).xs_xstrat_bytes;
                xs_write_bytes += per_cpu(xfsstats, i).xs_write_bytes;
                xs_read_bytes += per_cpu(xfsstats, i).xs_read_bytes;
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 68f4793e8a11..9bdef9d51900 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
 * All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
@@ -23,7 +23,6 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_alloc.h"
 #include "xfs_dmapi.h"
@@ -32,7 +31,6 @@
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
@@ -151,7 +149,7 @@ xfs_set_inodeops(
 STATIC __inline__ void
 xfs_revalidate_inode(
        xfs_mount_t             *mp,
-        vnode_t                 *vp,
+        bhv_vnode_t             *vp,
        xfs_inode_t             *ip)
 {
        struct inode            *inode = vn_to_inode(vp);
@@ -206,7 +204,7 @@ xfs_revalidate_inode(
 void
 xfs_initialize_vnode(
        bhv_desc_t              *bdp,
-        vnode_t                 *vp,
+        bhv_vnode_t             *vp,
        bhv_desc_t              *inode_bhv,
        int                     unlock)
 {
@@ -336,7 +334,7 @@ STATIC struct inode *
 xfs_fs_alloc_inode(
        struct super_block      *sb)
 {
-        vnode_t                 *vp;
+        bhv_vnode_t             *vp;
        vp = kmem_zone_alloc(xfs_vnode_zone, KM_SLEEP);
        if (unlikely(!vp))
@@ -359,13 +357,13 @@ xfs_fs_inode_init_once(
 {
        if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
                      SLAB_CTOR_CONSTRUCTOR)
-                inode_init_once(vn_to_inode((vnode_t *)vnode));
+                inode_init_once(vn_to_inode((bhv_vnode_t *)vnode));
 }
 STATIC int
 xfs_init_zones(void)
 {
-        xfs_vnode_zone = kmem_zone_init_flags(sizeof(vnode_t), "xfs_vnode_t",
+        xfs_vnode_zone = kmem_zone_init_flags(sizeof(bhv_vnode_t), "xfs_vnode",
                                        KM_ZONE_HWALIGN | KM_ZONE_RECLAIM |
                                        KM_ZONE_SPREAD,
                                        xfs_fs_inode_init_once);
@@ -409,22 +407,17 @@ xfs_fs_write_inode(
        struct inode            *inode,
        int                     sync)
 {
-        vnode_t                 *vp = vn_from_inode(inode);
+        bhv_vnode_t             *vp = vn_from_inode(inode);
        int                     error = 0, flags = FLUSH_INODE;
        if (vp) {
                vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
                if (sync)
                        flags |= FLUSH_SYNC;
-                VOP_IFLUSH(vp, flags, error);
+                error = bhv_vop_iflush(vp, flags);
-                if (error == EAGAIN) {
+                if (error == EAGAIN)
-                        if (sync)
+                        error = sync? bhv_vop_iflush(vp, flags | FLUSH_LOG) : 0;
-                                VOP_IFLUSH(vp, flags | FLUSH_LOG, error);
-                        else
-                                error = 0;
-                }
        }
        return -error;
 }
@@ -432,8 +425,7 @@ STATIC void
 xfs_fs_clear_inode(
        struct inode            *inode)
 {
-        vnode_t                 *vp = vn_from_inode(inode);
+        bhv_vnode_t             *vp = vn_from_inode(inode);
-        int                     error, cache;
        vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
@@ -446,20 +438,18 @@ xfs_fs_clear_inode(
         * This can happen because xfs_iget_core calls xfs_idestroy if we
         * find an inode with di_mode == 0 but without IGET_CREATE set.
         */
-        if (vp->v_fbhv)
+        if (VNHEAD(vp))
-                VOP_INACTIVE(vp, NULL, cache);
+                bhv_vop_inactive(vp, NULL);
        VN_LOCK(vp);
        vp->v_flag &= ~VMODIFIED;
        VN_UNLOCK(vp, 0);
-        if (vp->v_fbhv) {
+        if (VNHEAD(vp))
-                VOP_RECLAIM(vp, error);
+                if (bhv_vop_reclaim(vp))
-                if (error)
+                        panic("%s: cannot reclaim 0x%p\n", __FUNCTION__, vp);
-                        panic("vn_purge: cannot reclaim");
-        }
-        ASSERT(vp->v_fbhv == NULL);
+        ASSERT(VNHEAD(vp) == NULL);
 #ifdef XFS_VNODE_TRACE
        ktrace_free(vp->v_trace);
@@ -475,13 +465,13 @@ xfs_fs_clear_inode(
 */
 STATIC void
 xfs_syncd_queue_work(
-        struct vfs      *vfs,
+        struct bhv_vfs  *vfs,
        void            *data,
-        void            (*syncer)(vfs_t *, void *))
+        void            (*syncer)(bhv_vfs_t *, void *))
 {
-        vfs_sync_work_t *work;
+        struct bhv_vfs_sync_work *work;
-        work = kmem_alloc(sizeof(struct vfs_sync_work), KM_SLEEP);
+        work = kmem_alloc(sizeof(struct bhv_vfs_sync_work), KM_SLEEP);
        INIT_LIST_HEAD(&work->w_list);
        work->w_syncer = syncer;
        work->w_data = data;
@@ -500,7 +490,7 @@ xfs_syncd_queue_work(
 */
 STATIC void
 xfs_flush_inode_work(
-        vfs_t           *vfs,
+        bhv_vfs_t       *vfs,
        void            *inode)
 {
        filemap_flush(((struct inode *)inode)->i_mapping);
@@ -512,7 +502,7 @@ xfs_flush_inode(
        xfs_inode_t     *ip)
 {
        struct inode    *inode = vn_to_inode(XFS_ITOV(ip));
-        struct vfs      *vfs = XFS_MTOVFS(ip->i_mount);
+        struct bhv_vfs  *vfs = XFS_MTOVFS(ip->i_mount);
        igrab(inode);
        xfs_syncd_queue_work(vfs, inode, xfs_flush_inode_work);
@@ -525,7 +515,7 @@ xfs_flush_inode(
 */
 STATIC void
 xfs_flush_device_work(
-        vfs_t           *vfs,
+        bhv_vfs_t       *vfs,
        void            *inode)
 {
        sync_blockdev(vfs->vfs_super->s_bdev);
@@ -537,7 +527,7 @@ xfs_flush_device(
        xfs_inode_t     *ip)
 {
        struct inode    *inode = vn_to_inode(XFS_ITOV(ip));
-        struct vfs      *vfs = XFS_MTOVFS(ip->i_mount);
+        struct bhv_vfs  *vfs = XFS_MTOVFS(ip->i_mount);
        igrab(inode);
        xfs_syncd_queue_work(vfs, inode, xfs_flush_device_work);
@@ -545,16 +535,16 @@ xfs_flush_device(
        xfs_log_force(ip->i_mount, (xfs_lsn_t)0, XFS_LOG_FORCE|XFS_LOG_SYNC);
 }
-#define SYNCD_FLAGS     (SYNC_FSDATA|SYNC_BDFLUSH|SYNC_ATTR|SYNC_REFCACHE)
 STATIC void
 vfs_sync_worker(
-        vfs_t           *vfsp,
+        bhv_vfs_t       *vfsp,
        void            *unused)
 {
        int             error;
        if (!(vfsp->vfs_flag & VFS_RDONLY))
-                VFS_SYNC(vfsp, SYNCD_FLAGS, NULL, error);
+                error = bhv_vfs_sync(vfsp, SYNC_FSDATA | SYNC_BDFLUSH | \
+                                        SYNC_ATTR | SYNC_REFCACHE, NULL);
        vfsp->vfs_sync_seq++;
        wmb();
        wake_up(&vfsp->vfs_wait_single_sync_task);
@@ -565,8 +555,8 @@ xfssyncd(
        void                    *arg)
 {
        long                    timeleft;
-        vfs_t                   *vfsp = (vfs_t *) arg;
+        bhv_vfs_t               *vfsp = (bhv_vfs_t *) arg;
-        struct vfs_sync_work    *work, *n;
+        bhv_vfs_sync_work_t     *work, *n;
        LIST_HEAD               (tmp);
        timeleft = xfs_syncd_centisecs * msecs_to_jiffies(10);
@@ -600,7 +590,7 @@ xfssyncd(
                        list_del(&work->w_list);
                        if (work == &vfsp->vfs_sync_work)
                                continue;
-                        kmem_free(work, sizeof(struct vfs_sync_work));
+                        kmem_free(work, sizeof(struct bhv_vfs_sync_work));
                }
        }
@@ -609,7 +599,7 @@ xfssyncd(
 STATIC int
 xfs_fs_start_syncd(
-        vfs_t                   *vfsp)
+        bhv_vfs_t               *vfsp)
 {
        vfsp->vfs_sync_work.w_syncer = vfs_sync_worker;
        vfsp->vfs_sync_work.w_vfs = vfsp;
@@ -621,7 +611,7 @@ xfs_fs_start_syncd(
 STATIC void
 xfs_fs_stop_syncd(
-        vfs_t                   *vfsp)
+        bhv_vfs_t               *vfsp)
 {
        kthread_stop(vfsp->vfs_sync_task);
 }
@@ -630,35 +620,26 @@ STATIC void
 xfs_fs_put_super(
        struct super_block      *sb)
 {
-        vfs_t                   *vfsp = vfs_from_sb(sb);
+        bhv_vfs_t               *vfsp = vfs_from_sb(sb);
        int                     error;
        xfs_fs_stop_syncd(vfsp);
-        VFS_SYNC(vfsp, SYNC_ATTR|SYNC_DELWRI, NULL, error);
+        bhv_vfs_sync(vfsp, SYNC_ATTR | SYNC_DELWRI, NULL);
-        if (!error)
+        error = bhv_vfs_unmount(vfsp, 0, NULL);
-                VFS_UNMOUNT(vfsp, 0, NULL, error);
        if (error) {
-                printk("XFS unmount got error %d\n", error);
+                printk("XFS: unmount got error=%d\n", error);
-                printk("%s: vfsp/0x%p left dangling!\n", __FUNCTION__, vfsp);
+                printk("%s: vfs=0x%p left dangling!\n", __FUNCTION__, vfsp);
-                return;
+        } else {
+                vfs_deallocate(vfsp);
        }
-        vfs_deallocate(vfsp);
 }
 STATIC void
 xfs_fs_write_super(
        struct super_block      *sb)
 {
-        vfs_t                   *vfsp = vfs_from_sb(sb);
+        if (!(sb->s_flags & MS_RDONLY))
-        int                     error;
+                bhv_vfs_sync(vfs_from_sb(sb), SYNC_FSDATA, NULL);
-        if (sb->s_flags & MS_RDONLY) {
-                sb->s_dirt = 0; /* paranoia */
-                return;
-        }
-        /* Push the log and superblock a little */
-        VFS_SYNC(vfsp, SYNC_FSDATA, NULL, error);
        sb->s_dirt = 0;
 }
@@ -667,16 +648,16 @@ xfs_fs_sync_super(
        struct super_block      *sb,
        int                     wait)
 {
-        vfs_t           *vfsp = vfs_from_sb(sb);
+        bhv_vfs_t               *vfsp = vfs_from_sb(sb);
-        int             error;
+        int                     error;
-        int             flags = SYNC_FSDATA;
+        int                     flags;
        if (unlikely(sb->s_frozen == SB_FREEZE_WRITE))
                flags = SYNC_QUIESCE;
        else
                flags = SYNC_FSDATA | (wait ? SYNC_WAIT : 0);
-        VFS_SYNC(vfsp, flags, NULL, error);
+        error = bhv_vfs_sync(vfsp, flags, NULL);
        sb->s_dirt = 0;
        if (unlikely(laptop_mode)) {
@@ -703,14 +684,11 @@ xfs_fs_sync_super(
 STATIC int
 xfs_fs_statfs(
-        struct super_block      *sb,
+        struct dentry           *dentry,
        struct kstatfs          *statp)
 {
-        vfs_t                   *vfsp = vfs_from_sb(sb);
+        return -bhv_vfs_statvfs(vfs_from_sb(dentry->d_sb), statp,
-        int                     error;
+                                vn_from_inode(dentry->d_inode));
-        VFS_STATVFS(vfsp, statp, NULL, error);
-        return -error;
 }
 STATIC int
@@ -719,13 +697,13 @@ xfs_fs_remount(
        int                     *flags,
        char                    *options)
 {
-        vfs_t                   *vfsp = vfs_from_sb(sb);
+        bhv_vfs_t               *vfsp = vfs_from_sb(sb);
        struct xfs_mount_args   *args = xfs_args_allocate(sb, 0);
        int                     error;
-        VFS_PARSEARGS(vfsp, options, args, 1, error);
+        error = bhv_vfs_parseargs(vfsp, options, args, 1);
        if (!error)
-                VFS_MNTUPDATE(vfsp, flags, args, error);
+                error = bhv_vfs_mntupdate(vfsp, flags, args);
        kmem_free(args, sizeof(*args));
        return -error;
 }
@@ -734,7 +712,7 @@ STATIC void
 xfs_fs_lockfs(
        struct super_block      *sb)
 {
-        VFS_FREEZE(vfs_from_sb(sb));
+        bhv_vfs_freeze(vfs_from_sb(sb));
 }
 STATIC int
@@ -742,11 +720,7 @@ xfs_fs_show_options(
        struct seq_file         *m,
        struct vfsmount         *mnt)
 {
-        struct vfs              *vfsp = vfs_from_sb(mnt->mnt_sb);
+        return -bhv_vfs_showargs(vfs_from_sb(mnt->mnt_sb), m);
-        int                     error;
-        VFS_SHOWARGS(vfsp, m, error);
-        return error;
 }
 STATIC int
@@ -754,11 +728,7 @@ xfs_fs_quotasync(
        struct super_block      *sb,
        int                     type)
 {
-        struct vfs              *vfsp = vfs_from_sb(sb);
+        return -bhv_vfs_quotactl(vfs_from_sb(sb), Q_XQUOTASYNC, 0, NULL);
-        int                     error;
-        VFS_QUOTACTL(vfsp, Q_XQUOTASYNC, 0, (caddr_t)NULL, error);
-        return -error;
 }
 STATIC int
@@ -766,11 +736,7 @@ xfs_fs_getxstate(
        struct super_block      *sb,
        struct fs_quota_stat    *fqs)
 {
-        struct vfs              *vfsp = vfs_from_sb(sb);
+        return -bhv_vfs_quotactl(vfs_from_sb(sb), Q_XGETQSTAT, 0, (caddr_t)fqs);
-        int                     error;
-        VFS_QUOTACTL(vfsp, Q_XGETQSTAT, 0, (caddr_t)fqs, error);
-        return -error;
 }
 STATIC int
@@ -779,11 +745,7 @@ xfs_fs_setxstate(
        unsigned int            flags,
        int                     op)
 {
-        struct vfs              *vfsp = vfs_from_sb(sb);
+        return -bhv_vfs_quotactl(vfs_from_sb(sb), op, 0, (caddr_t)&flags);
-        int                     error;
-        VFS_QUOTACTL(vfsp, op, 0, (caddr_t)&flags, error);
-        return -error;
 }
 STATIC int
@@ -793,13 +755,10 @@ xfs_fs_getxquota(
        qid_t                   id,
        struct fs_disk_quota    *fdq)
 {
-        struct vfs              *vfsp = vfs_from_sb(sb);
+        return -bhv_vfs_quotactl(vfs_from_sb(sb),
-        int                     error, getmode;
+                                 (type == USRQUOTA) ? Q_XGETQUOTA :
+                                  ((type == GRPQUOTA) ? Q_XGETGQUOTA :
-        getmode = (type == USRQUOTA) ? Q_XGETQUOTA :
+                                   Q_XGETPQUOTA), id, (caddr_t)fdq);
-                 ((type == GRPQUOTA) ? Q_XGETGQUOTA : Q_XGETPQUOTA);
-        VFS_QUOTACTL(vfsp, getmode, id, (caddr_t)fdq, error);
-        return -error;
 }
 STATIC int
@@ -809,13 +768,10 @@ xfs_fs_setxquota(
        qid_t                   id,
        struct fs_disk_quota    *fdq)
 {
-        struct vfs              *vfsp = vfs_from_sb(sb);
+        return -bhv_vfs_quotactl(vfs_from_sb(sb),
-        int                     error, setmode;
+                                 (type == USRQUOTA) ? Q_XSETQLIM :
+                                  ((type == GRPQUOTA) ? Q_XSETGQLIM :
-        setmode = (type == USRQUOTA) ? Q_XSETQLIM :
+                                   Q_XSETPQLIM), id, (caddr_t)fdq);
-                 ((type == GRPQUOTA) ? Q_XSETGQLIM : Q_XSETPQLIM);
-        VFS_QUOTACTL(vfsp, setmode, id, (caddr_t)fdq, error);
-        return -error;
 }
 STATIC int
@@ -824,34 +780,32 @@ xfs_fs_fill_super(
        void                    *data,
        int                     silent)
 {
-        vnode_t                 *rootvp;
+        struct bhv_vnode        *rootvp;
-        struct vfs              *vfsp = vfs_allocate(sb);
+        struct bhv_vfs          *vfsp = vfs_allocate(sb);
        struct xfs_mount_args   *args = xfs_args_allocate(sb, silent);
        struct kstatfs          statvfs;
-        int                     error, error2;
+        int                     error;
        bhv_insert_all_vfsops(vfsp);
-        VFS_PARSEARGS(vfsp, (char *)data, args, 0, error);
+        error = bhv_vfs_parseargs(vfsp, (char *)data, args, 0);
        if (error) {
                bhv_remove_all_vfsops(vfsp, 1);
                goto fail_vfsop;
        }
        sb_min_blocksize(sb, BBSIZE);
-#ifdef CONFIG_XFS_EXPORT
        sb->s_export_op = &xfs_export_operations;
-#endif
        sb->s_qcop = &xfs_quotactl_operations;
        sb->s_op = &xfs_super_operations;
-        VFS_MOUNT(vfsp, args, NULL, error);
+        error = bhv_vfs_mount(vfsp, args, NULL);
        if (error) {
                bhv_remove_all_vfsops(vfsp, 1);
                goto fail_vfsop;
        }
-        VFS_STATVFS(vfsp, &statvfs, NULL, error);
+        error = bhv_vfs_statvfs(vfsp, &statvfs, NULL);
        if (error)
                goto fail_unmount;
@@ -863,7 +817,7 @@ xfs_fs_fill_super(
        sb->s_time_gran = 1;
        set_posix_acl_flag(sb);
-        VFS_ROOT(vfsp, &rootvp, error);
+        error = bhv_vfs_root(vfsp, &rootvp);
        if (error)
                goto fail_unmount;
@@ -892,7 +846,7 @@ fail_vnrele:
        }
 fail_unmount:
-        VFS_UNMOUNT(vfsp, 0, NULL, error2);
+        bhv_vfs_unmount(vfsp, 0, NULL);
 fail_vfsop:
        vfs_deallocate(vfsp);
@@ -900,14 +854,16 @@ fail_vfsop:
        return -error;
 }
-STATIC struct super_block *
+STATIC int
 xfs_fs_get_sb(
        struct file_system_type *fs_type,
        int                     flags,
        const char              *dev_name,
-        void                    *data)
+        void                    *data,
+        struct vfsmount         *mnt)
 {
-        return get_sb_bdev(fs_type, flags, dev_name, data, xfs_fs_fill_super);
+        return get_sb_bdev(fs_type, flags, dev_name, data, xfs_fs_fill_super,
+                           mnt);
 }
 STATIC struct super_operations xfs_super_operations = {
diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h
index 376b96cb513a..33dd1ca13245 100644
--- a/fs/xfs/linux-2.6/xfs_super.h
+++ b/fs/xfs/linux-2.6/xfs_super.h
@@ -105,7 +105,7 @@ struct block_device;
 extern __uint64_t xfs_max_file_offset(unsigned int);
-extern void xfs_initialize_vnode(bhv_desc_t *, vnode_t *, bhv_desc_t *, int);
+extern void xfs_initialize_vnode(bhv_desc_t *, bhv_vnode_t *, bhv_desc_t *, int);
 extern void xfs_flush_inode(struct xfs_inode *);
 extern void xfs_flush_device(struct xfs_inode *);
diff --git a/fs/xfs/linux-2.6/xfs_sysctl.c b/fs/xfs/linux-2.6/xfs_sysctl.c
index 7079cc837210..af246532fbfb 100644
--- a/fs/xfs/linux-2.6/xfs_sysctl.c
+++ b/fs/xfs/linux-2.6/xfs_sysctl.c
@@ -38,7 +38,7 @@ xfs_stats_clear_proc_handler(
        if (!ret && write && *valp) {
                printk("XFS Clearing xfsstats\n");
-                for_each_cpu(c) {
+                for_each_possible_cpu(c) {
                        preempt_disable();
                        /* save vn_active, it's a universal truth! */
                        vn_active = per_cpu(xfsstats, c).vn_active;
@@ -120,6 +120,11 @@ STATIC ctl_table xfs_table[] = {
        &sysctl_intvec, NULL,
        &xfs_params.rotorstep.min, &xfs_params.rotorstep.max},
+        {XFS_INHERIT_NODFRG, "inherit_nodefrag", &xfs_params.inherit_nodfrg.val,
+        sizeof(int), 0644, NULL, &proc_dointvec_minmax,
+        &sysctl_intvec, NULL,
+        &xfs_params.inherit_nodfrg.min, &xfs_params.inherit_nodfrg.max},
        /* please keep this the last entry */
 #ifdef CONFIG_PROC_FS
        {XFS_STATS_CLEAR, "stats_clear", &xfs_params.stats_clear.val,
diff --git a/fs/xfs/linux-2.6/xfs_sysctl.h b/fs/xfs/linux-2.6/xfs_sysctl.h
index bc8c11f13722..a631fb8cc5ac 100644
--- a/fs/xfs/linux-2.6/xfs_sysctl.h
+++ b/fs/xfs/linux-2.6/xfs_sysctl.h
@@ -46,6 +46,7 @@ typedef struct xfs_param {
        xfs_sysctl_val_t xfs_buf_age;   /* Metadata buffer age before flush. */
        xfs_sysctl_val_t inherit_nosym; /* Inherit the "nosymlinks" flag. */
        xfs_sysctl_val_t rotorstep;     /* inode32 AG rotoring control knob */
+        xfs_sysctl_val_t inherit_nodfrg;/* Inherit the "nodefrag" inode flag. */
 } xfs_param_t;
 /*
@@ -84,6 +85,7 @@ enum {
        /* XFS_IO_BYPASS = 18 */
        XFS_INHERIT_NOSYM = 19,
        XFS_ROTORSTEP = 20,
+        XFS_INHERIT_NODFRG = 21,
 };
 extern xfs_param_t      xfs_params;
diff --git a/fs/xfs/linux-2.6/xfs_vfs.c b/fs/xfs/linux-2.6/xfs_vfs.c
index 6f7c9f7a8624..6145e8bd0be2 100644
--- a/fs/xfs/linux-2.6/xfs_vfs.c
+++ b/fs/xfs/linux-2.6/xfs_vfs.c
@@ -23,7 +23,6 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_imap.h"
 #include "xfs_alloc.h"
@@ -104,7 +103,7 @@ vfs_mntupdate(
 int
 vfs_root(
        struct bhv_desc         *bdp,
-        struct vnode            **vpp)
+        struct bhv_vnode        **vpp)
 {
        struct bhv_desc         *next = bdp;
@@ -117,15 +116,15 @@ vfs_root(
 int
 vfs_statvfs(
        struct bhv_desc         *bdp,
-        xfs_statfs_t            *sp,
+        bhv_statvfs_t           *statp,
-        struct vnode            *vp)
+        struct bhv_vnode        *vp)
 {
        struct bhv_desc         *next = bdp;
        ASSERT(next);
        while (! (bhvtovfsops(next))->vfs_statvfs)
                next = BHV_NEXT(next);
-        return ((*bhvtovfsops(next)->vfs_statvfs)(next, sp, vp));
+        return ((*bhvtovfsops(next)->vfs_statvfs)(next, statp, vp));
 }
 int
@@ -145,7 +144,7 @@ vfs_sync(
 int
 vfs_vget(
        struct bhv_desc         *bdp,
-        struct vnode            **vpp,
+        struct bhv_vnode        **vpp,
        struct fid              *fidp)
 {
        struct bhv_desc         *next = bdp;
@@ -187,7 +186,7 @@ vfs_quotactl(
 void
 vfs_init_vnode(
        struct bhv_desc         *bdp,
-        struct vnode            *vp,
+        struct bhv_vnode        *vp,
        struct bhv_desc         *bp,
        int                     unlock)
 {
@@ -226,13 +225,13 @@ vfs_freeze(
        ((*bhvtovfsops(next)->vfs_freeze)(next));
 }
-vfs_t *
+bhv_vfs_t *
 vfs_allocate(
        struct super_block      *sb)
 {
-        struct vfs              *vfsp;
+        struct bhv_vfs          *vfsp;
-        vfsp = kmem_zalloc(sizeof(vfs_t), KM_SLEEP);
+        vfsp = kmem_zalloc(sizeof(bhv_vfs_t), KM_SLEEP);
        bhv_head_init(VFS_BHVHEAD(vfsp), "vfs");
        INIT_LIST_HEAD(&vfsp->vfs_sync_list);
        spin_lock_init(&vfsp->vfs_sync_lock);
@@ -247,25 +246,25 @@ vfs_allocate(
        return vfsp;
 }
-vfs_t *
+bhv_vfs_t *
 vfs_from_sb(
        struct super_block      *sb)
 {
-        return (vfs_t *)sb->s_fs_info;
+        return (bhv_vfs_t *)sb->s_fs_info;
 }
 void
 vfs_deallocate(
-        struct vfs              *vfsp)
+        struct bhv_vfs          *vfsp)
 {
        bhv_head_destroy(VFS_BHVHEAD(vfsp));
-        kmem_free(vfsp, sizeof(vfs_t));
+        kmem_free(vfsp, sizeof(bhv_vfs_t));
 }
 void
 vfs_insertops(
-        struct vfs              *vfsp,
+        struct bhv_vfs          *vfsp,
-        struct bhv_vfsops       *vfsops)
+        struct bhv_module_vfsops *vfsops)
 {
        struct bhv_desc         *bdp;
@@ -276,9 +275,9 @@ vfs_insertops(
 void
 vfs_insertbhv(
-        struct vfs              *vfsp,
+        struct bhv_vfs          *vfsp,
        struct bhv_desc         *bdp,
-        struct vfsops           *vfsops,
+        struct bhv_vfsops       *vfsops,
        void                    *mount)
 {
        bhv_desc_init(bdp, mount, vfsp, vfsops);
@@ -287,7 +286,7 @@ vfs_insertbhv(
 void
 bhv_remove_vfsops(
-        struct vfs              *vfsp,
+        struct bhv_vfs          *vfsp,
        int                     pos)
 {
        struct bhv_desc         *bhv;
@@ -301,7 +300,7 @@ bhv_remove_vfsops(
 void
 bhv_remove_all_vfsops(
-        struct vfs              *vfsp,
+        struct bhv_vfs          *vfsp,
        int                     freebase)
 {
        struct xfs_mount        *mp;
@@ -317,7 +316,7 @@ bhv_remove_all_vfsops(
 void
 bhv_insert_all_vfsops(
-        struct vfs              *vfsp)
+        struct bhv_vfs          *vfsp)
 {
        struct xfs_mount        *mp;
diff --git a/fs/xfs/linux-2.6/xfs_vfs.h b/fs/xfs/linux-2.6/xfs_vfs.h
index 841200c03092..91fc2c4b3353 100644
--- a/fs/xfs/linux-2.6/xfs_vfs.h
+++ b/fs/xfs/linux-2.6/xfs_vfs.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
 * All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
@@ -21,42 +21,40 @@
 #include <linux/vfs.h>
 #include "xfs_fs.h"
+struct bhv_vfs;
+struct bhv_vnode;
 struct fid;
-struct vfs;
 struct cred;
-struct vnode;
-struct kstatfs;
 struct seq_file;
 struct super_block;
 struct xfs_mount_args;
-typedef struct kstatfs xfs_statfs_t;
+typedef struct kstatfs  bhv_statvfs_t;
-typedef struct vfs_sync_work {
+typedef struct bhv_vfs_sync_work {
        struct list_head        w_list;
-        struct vfs              *w_vfs;
+        struct bhv_vfs          *w_vfs;
        void                    *w_data;        /* syncer routine argument */
-        void                    (*w_syncer)(struct vfs *, void *);
+        void                    (*w_syncer)(struct bhv_vfs *, void *);
-} vfs_sync_work_t;
+} bhv_vfs_sync_work_t;
-typedef struct vfs {
+typedef struct bhv_vfs {
        u_int                   vfs_flag;       /* flags */
        xfs_fsid_t              vfs_fsid;       /* file system ID */
        xfs_fsid_t              *vfs_altfsid;   /* An ID fixed for life of FS */
        bhv_head_t              vfs_bh;         /* head of vfs behavior chain */
        struct super_block      *vfs_super;     /* generic superblock pointer */
        struct task_struct      *vfs_sync_task; /* generalised sync thread */
-        vfs_sync_work_t         vfs_sync_work;  /* work item for VFS_SYNC */
+        bhv_vfs_sync_work_t     vfs_sync_work;  /* work item for VFS_SYNC */
        struct list_head        vfs_sync_list;  /* sync thread work item list */
        spinlock_t              vfs_sync_lock;  /* work item list lock */
-        int                     vfs_sync_seq;   /* sync thread generation no. */
+        int                     vfs_sync_seq;   /* sync thread generation no. */
        wait_queue_head_t       vfs_wait_single_sync_task;
-} vfs_t;
+} bhv_vfs_t;
-#define vfs_fbhv                vfs_bh.bh_first /* 1st on vfs behavior chain */
-#define bhvtovfs(bdp)           ( (struct vfs *)BHV_VOBJ(bdp) )
+#define bhvtovfs(bdp)           ( (struct bhv_vfs *)BHV_VOBJ(bdp) )
-#define bhvtovfsops(bdp)        ( (struct vfsops *)BHV_OPS(bdp) )
+#define bhvtovfsops(bdp)        ( (struct bhv_vfsops *)BHV_OPS(bdp) )
 #define VFS_BHVHEAD(vfs)        ( &(vfs)->vfs_bh )
 #define VFS_REMOVEBHV(vfs, bdp) ( bhv_remove(VFS_BHVHEAD(vfs), bdp) )
@@ -71,7 +69,7 @@ typedef enum {
        VFS_BHV_QM,             /* quota manager */
        VFS_BHV_IO,             /* IO path */
        VFS_BHV_END             /* housekeeping end-of-range */
-} vfs_bhv_t;
+} bhv_vfs_type_t;
 #define VFS_POSITION_XFS        (BHV_POSITION_BASE)
 #define VFS_POSITION_DM         (VFS_POSITION_BASE+10)
@@ -81,8 +79,9 @@ typedef enum {
 #define VFS_RDONLY              0x0001  /* read-only vfs */
 #define VFS_GRPID               0x0002  /* group-ID assigned from directory */
 #define VFS_DMI                 0x0004  /* filesystem has the DMI enabled */
-#define VFS_32BITINODES         0x0008  /* do not use inums above 32 bits */
+#define VFS_UMOUNT              0x0008  /* unmount in progress */
-#define VFS_END                 0x0008  /* max flag */
+#define VFS_32BITINODES         0x0010  /* do not use inums above 32 bits */
+#define VFS_END                 0x0010  /* max flag */
 #define SYNC_ATTR               0x0001  /* sync attributes */
 #define SYNC_CLOSE              0x0002  /* close file system down */
@@ -92,7 +91,14 @@ typedef enum {
 #define SYNC_FSDATA             0x0020  /* flush fs data (e.g. superblocks) */
 #define SYNC_REFCACHE           0x0040  /* prune some of the nfs ref cache */
 #define SYNC_REMOUNT            0x0080  /* remount readonly, no dummy LRs */
-#define SYNC_QUIESCE            0x0100  /* quiesce filesystem for a snapshot */
+#define SYNC_QUIESCE            0x0100  /* quiesce filesystem for a snapshot */
+#define SHUTDOWN_META_IO_ERROR  0x0001  /* write attempt to metadata failed */
+#define SHUTDOWN_LOG_IO_ERROR   0x0002  /* write attempt to the log failed */
+#define SHUTDOWN_FORCE_UMOUNT   0x0004  /* shutdown from a forced unmount */
+#define SHUTDOWN_CORRUPT_INCORE 0x0008  /* corrupt in-memory data structures */
+#define SHUTDOWN_REMOTE_REQ     0x0010  /* shutdown came from remote cell */
+#define SHUTDOWN_DEVICE_REQ     0x0020  /* failed all paths to the device */
 typedef int     (*vfs_mount_t)(bhv_desc_t *,
                                struct xfs_mount_args *, struct cred *);
@@ -102,18 +108,19 @@ typedef	int	(*vfs_showargs_t)(bhv_desc_t *, struct seq_file *);
 typedef int     (*vfs_unmount_t)(bhv_desc_t *, int, struct cred *);
 typedef int     (*vfs_mntupdate_t)(bhv_desc_t *, int *,
                                struct xfs_mount_args *);
-typedef int     (*vfs_root_t)(bhv_desc_t *, struct vnode **);
+typedef int     (*vfs_root_t)(bhv_desc_t *, struct bhv_vnode **);
-typedef int     (*vfs_statvfs_t)(bhv_desc_t *, xfs_statfs_t *, struct vnode *);
+typedef int     (*vfs_statvfs_t)(bhv_desc_t *, bhv_statvfs_t *,
+                                struct bhv_vnode *);
 typedef int     (*vfs_sync_t)(bhv_desc_t *, int, struct cred *);
-typedef int     (*vfs_vget_t)(bhv_desc_t *, struct vnode **, struct fid *);
+typedef int     (*vfs_vget_t)(bhv_desc_t *, struct bhv_vnode **, struct fid *);
 typedef int     (*vfs_dmapiops_t)(bhv_desc_t *, caddr_t);
 typedef int     (*vfs_quotactl_t)(bhv_desc_t *, int, int, caddr_t);
 typedef void    (*vfs_init_vnode_t)(bhv_desc_t *,
-                                struct vnode *, bhv_desc_t *, int);
+                                struct bhv_vnode *, bhv_desc_t *, int);
 typedef void    (*vfs_force_shutdown_t)(bhv_desc_t *, int, char *, int);
 typedef void    (*vfs_freeze_t)(bhv_desc_t *);
-typedef struct vfsops {
+typedef struct bhv_vfsops {
        bhv_position_t          vf_position;    /* behavior chain position */
        vfs_mount_t             vfs_mount;      /* mount file system */
        vfs_parseargs_t         vfs_parseargs;  /* parse mount options */
@@ -129,82 +136,82 @@ typedef struct vfsops {
        vfs_init_vnode_t        vfs_init_vnode; /* initialize a new vnode */
        vfs_force_shutdown_t    vfs_force_shutdown;     /* crash and burn */
        vfs_freeze_t            vfs_freeze;     /* freeze fs for snapshot */
-} vfsops_t;
+} bhv_vfsops_t;
 /*
- * VFS's.  Operates on vfs structure pointers (starts at bhv head).
+ * Virtual filesystem operations, operating from head bhv.
 */
-#define VHEAD(v)                        ((v)->vfs_fbhv)
+#define VFSHEAD(v)                      ((v)->vfs_bh.bh_first)
-#define VFS_MOUNT(v, ma,cr, rv)         ((rv) = vfs_mount(VHEAD(v), ma,cr))
+#define bhv_vfs_mount(v, ma,cr)         vfs_mount(VFSHEAD(v), ma,cr)
-#define VFS_PARSEARGS(v, o,ma,f, rv)    ((rv) = vfs_parseargs(VHEAD(v), o,ma,f))
+#define bhv_vfs_parseargs(v, o,ma,f)    vfs_parseargs(VFSHEAD(v), o,ma,f)
-#define VFS_SHOWARGS(v, m, rv)          ((rv) = vfs_showargs(VHEAD(v), m))
+#define bhv_vfs_showargs(v, m)          vfs_showargs(VFSHEAD(v), m)
-#define VFS_UNMOUNT(v, f, cr, rv)       ((rv) = vfs_unmount(VHEAD(v), f,cr))
+#define bhv_vfs_unmount(v, f,cr)        vfs_unmount(VFSHEAD(v), f,cr)
-#define VFS_MNTUPDATE(v, fl, args, rv)  ((rv) = vfs_mntupdate(VHEAD(v), fl, args))
+#define bhv_vfs_mntupdate(v, fl,args)   vfs_mntupdate(VFSHEAD(v), fl,args)
-#define VFS_ROOT(v, vpp, rv)            ((rv) = vfs_root(VHEAD(v), vpp))
+#define bhv_vfs_root(v, vpp)            vfs_root(VFSHEAD(v), vpp)
-#define VFS_STATVFS(v, sp,vp, rv)       ((rv) = vfs_statvfs(VHEAD(v), sp,vp))
+#define bhv_vfs_statvfs(v, sp,vp)       vfs_statvfs(VFSHEAD(v), sp,vp)
-#define VFS_SYNC(v, flag,cr, rv)        ((rv) = vfs_sync(VHEAD(v), flag,cr))
+#define bhv_vfs_sync(v, flag,cr)        vfs_sync(VFSHEAD(v), flag,cr)
-#define VFS_VGET(v, vpp,fidp, rv)       ((rv) = vfs_vget(VHEAD(v), vpp,fidp))
+#define bhv_vfs_vget(v, vpp,fidp)       vfs_vget(VFSHEAD(v), vpp,fidp)
-#define VFS_DMAPIOPS(v, p, rv)          ((rv) = vfs_dmapiops(VHEAD(v), p))
+#define bhv_vfs_dmapiops(v, p)          vfs_dmapiops(VFSHEAD(v), p)
-#define VFS_QUOTACTL(v, c,id,p, rv)     ((rv) = vfs_quotactl(VHEAD(v), c,id,p))
+#define bhv_vfs_quotactl(v, c,id,p)     vfs_quotactl(VFSHEAD(v), c,id,p)
-#define VFS_INIT_VNODE(v, vp,b,ul)      ( vfs_init_vnode(VHEAD(v), vp,b,ul) )
+#define bhv_vfs_init_vnode(v, vp,b,ul)  vfs_init_vnode(VFSHEAD(v), vp,b,ul)
-#define VFS_FORCE_SHUTDOWN(v, fl,f,l)   ( vfs_force_shutdown(VHEAD(v), fl,f,l) )
+#define bhv_vfs_force_shutdown(v,u,f,l) vfs_force_shutdown(VFSHEAD(v), u,f,l)
-#define VFS_FREEZE(v)                   ( vfs_freeze(VHEAD(v)) )
+#define bhv_vfs_freeze(v)               vfs_freeze(VFSHEAD(v))
 /*
- * PVFS's.  Operates on behavior descriptor pointers.
+ * Virtual filesystem operations, operating from next bhv.
 */
-#define PVFS_MOUNT(b, ma,cr, rv)        ((rv) = vfs_mount(b, ma,cr))
+#define bhv_next_vfs_mount(b, ma,cr)            vfs_mount(b, ma,cr)
-#define PVFS_PARSEARGS(b, o,ma,f, rv)   ((rv) = vfs_parseargs(b, o,ma,f))
+#define bhv_next_vfs_parseargs(b, o,ma,f)       vfs_parseargs(b, o,ma,f)
-#define PVFS_SHOWARGS(b, m, rv)         ((rv) = vfs_showargs(b, m))
+#define bhv_next_vfs_showargs(b, m)             vfs_showargs(b, m)
-#define PVFS_UNMOUNT(b, f,cr, rv)       ((rv) = vfs_unmount(b, f,cr))
+#define bhv_next_vfs_unmount(b, f,cr)           vfs_unmount(b, f,cr)
-#define PVFS_MNTUPDATE(b, fl, args, rv) ((rv) = vfs_mntupdate(b, fl, args))
+#define bhv_next_vfs_mntupdate(b, fl,args)      vfs_mntupdate(b, fl, args)
-#define PVFS_ROOT(b, vpp, rv)           ((rv) = vfs_root(b, vpp))
+#define bhv_next_vfs_root(b, vpp)               vfs_root(b, vpp)
-#define PVFS_STATVFS(b, sp,vp, rv)      ((rv) = vfs_statvfs(b, sp,vp))
+#define bhv_next_vfs_statvfs(b, sp,vp)          vfs_statvfs(b, sp,vp)
-#define PVFS_SYNC(b, flag,cr, rv)       ((rv) = vfs_sync(b, flag,cr))
+#define bhv_next_vfs_sync(b, flag,cr)           vfs_sync(b, flag,cr)
-#define PVFS_VGET(b, vpp,fidp, rv)      ((rv) = vfs_vget(b, vpp,fidp))
+#define bhv_next_vfs_vget(b, vpp,fidp)          vfs_vget(b, vpp,fidp)
-#define PVFS_DMAPIOPS(b, p, rv)         ((rv) = vfs_dmapiops(b, p))
+#define bhv_next_vfs_dmapiops(b, p)             vfs_dmapiops(b, p)
-#define PVFS_QUOTACTL(b, c,id,p, rv)    ((rv) = vfs_quotactl(b, c,id,p))
+#define bhv_next_vfs_quotactl(b, c,id,p)        vfs_quotactl(b, c,id,p)
-#define PVFS_INIT_VNODE(b, vp,b2,ul)    ( vfs_init_vnode(b, vp,b2,ul) )
+#define bhv_next_vfs_init_vnode(b, vp,b2,ul)    vfs_init_vnode(b, vp,b2,ul)
-#define PVFS_FORCE_SHUTDOWN(b, fl,f,l)  ( vfs_force_shutdown(b, fl,f,l) )
+#define bhv_next_force_shutdown(b, fl,f,l)      vfs_force_shutdown(b, fl,f,l)
-#define PVFS_FREEZE(b)                  ( vfs_freeze(b) )
+#define bhv_next_vfs_freeze(b)                  vfs_freeze(b)
 extern int vfs_mount(bhv_desc_t *, struct xfs_mount_args *, struct cred *);
 extern int vfs_parseargs(bhv_desc_t *, char *, struct xfs_mount_args *, int);
 extern int vfs_showargs(bhv_desc_t *, struct seq_file *);
 extern int vfs_unmount(bhv_desc_t *, int, struct cred *);
 extern int vfs_mntupdate(bhv_desc_t *, int *, struct xfs_mount_args *);
-extern int vfs_root(bhv_desc_t *, struct vnode **);
+extern int vfs_root(bhv_desc_t *, struct bhv_vnode **);
-extern int vfs_statvfs(bhv_desc_t *, xfs_statfs_t *, struct vnode *);
+extern int vfs_statvfs(bhv_desc_t *, bhv_statvfs_t *, struct bhv_vnode *);
 extern int vfs_sync(bhv_desc_t *, int, struct cred *);
-extern int vfs_vget(bhv_desc_t *, struct vnode **, struct fid *);
+extern int vfs_vget(bhv_desc_t *, struct bhv_vnode **, struct fid *);
 extern int vfs_dmapiops(bhv_desc_t *, caddr_t);
 extern int vfs_quotactl(bhv_desc_t *, int, int, caddr_t);
-extern void vfs_init_vnode(bhv_desc_t *, struct vnode *, bhv_desc_t *, int);
+extern void vfs_init_vnode(bhv_desc_t *, struct bhv_vnode *, bhv_desc_t *, int);
 extern void vfs_force_shutdown(bhv_desc_t *, int, char *, int);
 extern void vfs_freeze(bhv_desc_t *);
-typedef struct bhv_vfsops {
+#define vfs_test_for_freeze(vfs)        ((vfs)->vfs_super->s_frozen)
-        struct vfsops           bhv_common;
+#define vfs_wait_for_freeze(vfs,l)      vfs_check_frozen((vfs)->vfs_super, (l))
+ 
+typedef struct bhv_module_vfsops {
+        struct bhv_vfsops       bhv_common;
        void *                  bhv_custom;
-} bhv_vfsops_t;
+} bhv_module_vfsops_t;
-#define vfs_bhv_lookup(v, id)   ( bhv_lookup_range(&(v)->vfs_bh, (id), (id)) )
+#define vfs_bhv_lookup(v, id)   (bhv_lookup_range(&(v)->vfs_bh, (id), (id)))
-#define vfs_bhv_custom(b)       ( ((bhv_vfsops_t *)BHV_OPS(b))->bhv_custom )
+#define vfs_bhv_custom(b)       (((bhv_module_vfsops_t*)BHV_OPS(b))->bhv_custom)
-#define vfs_bhv_set_custom(b,o) ( (b)->bhv_custom = (void *)(o))
+#define vfs_bhv_set_custom(b,o) ((b)->bhv_custom = (void *)(o))
-#define vfs_bhv_clr_custom(b)   ( (b)->bhv_custom = NULL )
+#define vfs_bhv_clr_custom(b)   ((b)->bhv_custom = NULL)
-extern vfs_t *vfs_allocate(struct super_block *);
+extern bhv_vfs_t *vfs_allocate(struct super_block *);
-extern vfs_t *vfs_from_sb(struct super_block *);
+extern bhv_vfs_t *vfs_from_sb(struct super_block *);
-extern void vfs_deallocate(vfs_t *);
+extern void vfs_deallocate(bhv_vfs_t *);
-extern void vfs_insertops(vfs_t *, bhv_vfsops_t *);
+extern void vfs_insertbhv(bhv_vfs_t *, bhv_desc_t *, bhv_vfsops_t *, void *);
-extern void vfs_insertbhv(vfs_t *, bhv_desc_t *, vfsops_t *, void *);
-extern void bhv_insert_all_vfsops(struct vfs *);
+extern void vfs_insertops(bhv_vfs_t *, bhv_module_vfsops_t *);
-extern void bhv_remove_all_vfsops(struct vfs *, int);
-extern void bhv_remove_vfsops(struct vfs *, int);
-#define fs_frozen(vfsp)         ((vfsp)->vfs_super->s_frozen)
+extern void bhv_insert_all_vfsops(struct bhv_vfs *);
-#define fs_check_frozen(vfsp, level) \
+extern void bhv_remove_all_vfsops(struct bhv_vfs *, int);
-        vfs_check_frozen(vfsp->vfs_super, level);
+extern void bhv_remove_vfsops(struct bhv_vfs *, int);
 #endif  /* __XFS_VFS_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_vnode.c b/fs/xfs/linux-2.6/xfs_vnode.c
index d27c25b27ccd..6628d96b6fd6 100644
--- a/fs/xfs/linux-2.6/xfs_vnode.c
+++ b/fs/xfs/linux-2.6/xfs_vnode.c
@@ -39,7 +39,7 @@ vn_init(void)
 void
 vn_iowait(
-        struct vnode    *vp)
+        bhv_vnode_t     *vp)
 {
        wait_queue_head_t *wq = vptosync(vp);
@@ -48,17 +48,33 @@ vn_iowait(
 void
 vn_iowake(
-        struct vnode    *vp)
+        bhv_vnode_t     *vp)
 {
        if (atomic_dec_and_test(&vp->v_iocount))
                wake_up(vptosync(vp));
 }
-struct vnode *
+/*
+ * Volume managers supporting multiple paths can send back ENODEV when the
+ * final path disappears.  In this case continuing to fill the page cache
+ * with dirty data which cannot be written out is evil, so prevent that.
+ */
+void
+vn_ioerror(
+        bhv_vnode_t     *vp,
+        int             error,
+        char            *f,
+        int             l)
+{
+        if (unlikely(error == -ENODEV))
+                bhv_vfs_force_shutdown(vp->v_vfsp, SHUTDOWN_DEVICE_REQ, f, l);
+}
+bhv_vnode_t *
 vn_initialize(
        struct inode    *inode)
 {
-        struct vnode    *vp = vn_from_inode(inode);
+        bhv_vnode_t     *vp = vn_from_inode(inode);
        XFS_STATS_INC(vn_active);
        XFS_STATS_INC(vn_alloc);
@@ -94,8 +110,8 @@ vn_initialize(
 */
 void
 vn_revalidate_core(
-        struct vnode    *vp,
+        bhv_vnode_t     *vp,
-        vattr_t         *vap)
+        bhv_vattr_t     *vap)
 {
        struct inode    *inode = vn_to_inode(vp);
@@ -130,14 +146,14 @@ vn_revalidate_core(
 */
 int
 __vn_revalidate(
-        struct vnode    *vp,
+        bhv_vnode_t     *vp,
-        struct vattr    *vattr)
+        bhv_vattr_t     *vattr)
 {
        int             error;
        vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
        vattr->va_mask = XFS_AT_STAT | XFS_AT_XFLAGS;
-        VOP_GETATTR(vp, vattr, 0, NULL, error);
+        error = bhv_vop_getattr(vp, vattr, 0, NULL);
        if (likely(!error)) {
                vn_revalidate_core(vp, vattr);
                VUNMODIFY(vp);
@@ -147,9 +163,9 @@ __vn_revalidate(
 int
 vn_revalidate(
-        struct vnode    *vp)
+        bhv_vnode_t     *vp)
 {
-        vattr_t         vattr;
+        bhv_vattr_t     vattr;
        return __vn_revalidate(vp, &vattr);
 }
@@ -157,9 +173,9 @@ vn_revalidate(
 /*
 * Add a reference to a referenced vnode.
 */
-struct vnode *
+bhv_vnode_t *
 vn_hold(
-        struct vnode    *vp)
+        bhv_vnode_t     *vp)
 {
        struct inode    *inode;
@@ -192,31 +208,31 @@ vn_hold(
 * Vnode tracing code.
 */
 void
-vn_trace_entry(vnode_t *vp, const char *func, inst_t *ra)
+vn_trace_entry(bhv_vnode_t *vp, const char *func, inst_t *ra)
 {
        KTRACE_ENTER(vp, VNODE_KTRACE_ENTRY, func, 0, ra);
 }
 void
-vn_trace_exit(vnode_t *vp, const char *func, inst_t *ra)
+vn_trace_exit(bhv_vnode_t *vp, const char *func, inst_t *ra)
 {
        KTRACE_ENTER(vp, VNODE_KTRACE_EXIT, func, 0, ra);
 }
 void
-vn_trace_hold(vnode_t *vp, char *file, int line, inst_t *ra)
+vn_trace_hold(bhv_vnode_t *vp, char *file, int line, inst_t *ra)
 {
        KTRACE_ENTER(vp, VNODE_KTRACE_HOLD, file, line, ra);
 }
 void
-vn_trace_ref(vnode_t *vp, char *file, int line, inst_t *ra)
+vn_trace_ref(bhv_vnode_t *vp, char *file, int line, inst_t *ra)
 {
        KTRACE_ENTER(vp, VNODE_KTRACE_REF, file, line, ra);
 }
 void
-vn_trace_rele(vnode_t *vp, char *file, int line, inst_t *ra)
+vn_trace_rele(bhv_vnode_t *vp, char *file, int line, inst_t *ra)
 {
        KTRACE_ENTER(vp, VNODE_KTRACE_RELE, file, line, ra);
 }
diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h
index 2a8e16c22353..c42b3221b20c 100644
--- a/fs/xfs/linux-2.6/xfs_vnode.h
+++ b/fs/xfs/linux-2.6/xfs_vnode.h
@@ -14,57 +14,35 @@
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write the Free Software Foundation,
 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- *
- * Portions Copyright (c) 1989, 1993
- *      The Regents of the University of California.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- * 3. Neither the name of the University nor the names of its contributors
- *    may be used to endorse or promote products derived from this software
- *    without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
 */
 #ifndef __XFS_VNODE_H__
 #define __XFS_VNODE_H__
 struct uio;
 struct file;
-struct vattr;
+struct bhv_vfs;
+struct bhv_vattr;
 struct xfs_iomap;
 struct attrlist_cursor_kern;
+typedef struct dentry   bhv_vname_t;
+typedef __u64           bhv_vnumber_t;
-typedef xfs_ino_t vnumber_t;
+typedef enum bhv_vflags {
-typedef struct dentry vname_t;
+        VMODIFIED       = 0x08, /* XFS inode state possibly differs */
-typedef bhv_head_t vn_bhv_head_t;
+                                /* to the Linux inode state. */
+        VTRUNCATED      = 0x40, /* truncated down so flush-on-close */
+} bhv_vflags_t;
 /*
 * MP locking protocols:
 *      v_flag, v_vfsp                          VN_LOCK/VN_UNLOCK
 */
-typedef struct vnode {
+typedef struct bhv_vnode {
-        __u32           v_flag;                 /* vnode flags (see below) */
+        bhv_vflags_t    v_flag;                 /* vnode flags (see above) */
-        struct vfs      *v_vfsp;                /* ptr to containing VFS */
+        bhv_vfs_t       *v_vfsp;                /* ptr to containing VFS */
-        vnumber_t       v_number;               /* in-core vnode number */
+        bhv_vnumber_t   v_number;               /* in-core vnode number */
-        vn_bhv_head_t   v_bh;                   /* behavior head */
+        bhv_head_t      v_bh;                   /* behavior head */
        spinlock_t      v_lock;                 /* VN_LOCK/VN_UNLOCK */
        atomic_t        v_iocount;              /* outstanding I/O count */
 #ifdef XFS_VNODE_TRACE
@@ -72,7 +50,7 @@ typedef struct vnode {
 #endif
        struct inode    v_inode;                /* Linux inode */
        /* inode MUST be last */
-} vnode_t;
+} bhv_vnode_t;
 #define VN_ISLNK(vp)    S_ISLNK((vp)->v_inode.i_mode)
 #define VN_ISREG(vp)    S_ISREG((vp)->v_inode.i_mode)
@@ -80,9 +58,6 @@ typedef struct vnode {
 #define VN_ISCHR(vp)    S_ISCHR((vp)->v_inode.i_mode)
 #define VN_ISBLK(vp)    S_ISBLK((vp)->v_inode.i_mode)
-#define v_fbhv                  v_bh.bh_first          /* first behavior */
-#define v_fops                  v_bh.bh_first->bd_ops  /* first behavior ops */
 #define VNODE_POSITION_BASE     BHV_POSITION_BASE       /* chain bottom */
 #define VNODE_POSITION_TOP      BHV_POSITION_TOP        /* chain top */
 #define VNODE_POSITION_INVALID  BHV_POSITION_INVALID    /* invalid pos. num */
@@ -104,8 +79,8 @@ typedef enum {
 /*
 * Macros for dealing with the behavior descriptor inside of the vnode.
 */
-#define BHV_TO_VNODE(bdp)       ((vnode_t *)BHV_VOBJ(bdp))
+#define BHV_TO_VNODE(bdp)       ((bhv_vnode_t *)BHV_VOBJ(bdp))
-#define BHV_TO_VNODE_NULL(bdp)  ((vnode_t *)BHV_VOBJNULL(bdp))
+#define BHV_TO_VNODE_NULL(bdp)  ((bhv_vnode_t *)BHV_VOBJNULL(bdp))
 #define VN_BHV_HEAD(vp)                 ((bhv_head_t *)(&((vp)->v_bh)))
 #define vn_bhv_head_init(bhp,name)      bhv_head_init(bhp,name)
@@ -116,35 +91,29 @@ typedef enum {
 /*
 * Vnode to Linux inode mapping.
 */
-static inline struct vnode *vn_from_inode(struct inode *inode)
+static inline struct bhv_vnode *vn_from_inode(struct inode *inode)
 {
-        return (vnode_t *)list_entry(inode, vnode_t, v_inode);
+        return container_of(inode, bhv_vnode_t, v_inode);
 }
-static inline struct inode *vn_to_inode(struct vnode *vnode)
+static inline struct inode *vn_to_inode(struct bhv_vnode *vnode)
 {
        return &vnode->v_inode;
 }
 /*
- * Vnode flags.
+ * Values for the flags parameter of vop_rwlock and vop_rwunlock.
- */
-#define VMODIFIED              0x8      /* XFS inode state possibly differs */
-                                        /* to the Linux inode state.    */
-/*
- * Values for the VOP_RWLOCK and VOP_RWUNLOCK flags parameter.
 */
-typedef enum vrwlock {
+typedef enum bhv_vrwlock {
        VRWLOCK_NONE,
        VRWLOCK_READ,
        VRWLOCK_WRITE,
        VRWLOCK_WRITE_DIRECT,
        VRWLOCK_TRY_READ,
        VRWLOCK_TRY_WRITE
-} vrwlock_t;
+} bhv_vrwlock_t;
 /*
- * Return values for VOP_INACTIVE.  A return value of
+ * Return values for bhv_vop_inactive.  A return value of
 * VN_INACTIVE_NOCACHE implies that the file system behavior
 * has disassociated its state and bhv_desc_t from the vnode.
 */
@@ -152,18 +121,20 @@ typedef enum vrwlock {
 #define VN_INACTIVE_NOCACHE     1
 /*
- * Values for the cmd code given to VOP_VNODE_CHANGE.
+ * Values for the cmd code given to vop_vnode_change.
 */
-typedef enum vchange {
+typedef enum bhv_vchange {
        VCHANGE_FLAGS_FRLOCKS           = 0,
        VCHANGE_FLAGS_ENF_LOCKING       = 1,
        VCHANGE_FLAGS_TRUNCATED         = 2,
        VCHANGE_FLAGS_PAGE_DIRTY        = 3,
        VCHANGE_FLAGS_IOEXCL_COUNT      = 4
-} vchange_t;
+} bhv_vchange_t;
+typedef enum { L_FALSE, L_TRUE } lastclose_t;
 typedef int     (*vop_open_t)(bhv_desc_t *, struct cred *);
+typedef int     (*vop_close_t)(bhv_desc_t *, int, lastclose_t, struct cred *);
 typedef ssize_t (*vop_read_t)(bhv_desc_t *, struct kiocb *,
                                const struct iovec *, unsigned int,
                                loff_t *, int, struct cred *);
@@ -181,27 +152,27 @@ typedef ssize_t (*vop_splice_write_t)(bhv_desc_t *, struct pipe_inode_info *,
                                struct cred *);
 typedef int     (*vop_ioctl_t)(bhv_desc_t *, struct inode *, struct file *,
                                int, unsigned int, void __user *);
-typedef int     (*vop_getattr_t)(bhv_desc_t *, struct vattr *, int,
+typedef int     (*vop_getattr_t)(bhv_desc_t *, struct bhv_vattr *, int,
                                struct cred *);
-typedef int     (*vop_setattr_t)(bhv_desc_t *, struct vattr *, int,
+typedef int     (*vop_setattr_t)(bhv_desc_t *, struct bhv_vattr *, int,
                                struct cred *);
 typedef int     (*vop_access_t)(bhv_desc_t *, int, struct cred *);
-typedef int     (*vop_lookup_t)(bhv_desc_t *, vname_t *, vnode_t **,
+typedef int     (*vop_lookup_t)(bhv_desc_t *, bhv_vname_t *, bhv_vnode_t **,
-                                int, vnode_t *, struct cred *);
+                                int, bhv_vnode_t *, struct cred *);
-typedef int     (*vop_create_t)(bhv_desc_t *, vname_t *, struct vattr *,
+typedef int     (*vop_create_t)(bhv_desc_t *, bhv_vname_t *, struct bhv_vattr *,
-                                vnode_t **, struct cred *);
+                                bhv_vnode_t **, struct cred *);
-typedef int     (*vop_remove_t)(bhv_desc_t *, vname_t *, struct cred *);
+typedef int     (*vop_remove_t)(bhv_desc_t *, bhv_vname_t *, struct cred *);
-typedef int     (*vop_link_t)(bhv_desc_t *, vnode_t *, vname_t *,
+typedef int     (*vop_link_t)(bhv_desc_t *, bhv_vnode_t *, bhv_vname_t *,
-                                struct cred *);
-typedef int     (*vop_rename_t)(bhv_desc_t *, vname_t *, vnode_t *, vname_t *,
                                struct cred *);
-typedef int     (*vop_mkdir_t)(bhv_desc_t *, vname_t *, struct vattr *,
+typedef int     (*vop_rename_t)(bhv_desc_t *, bhv_vname_t *, bhv_vnode_t *,
-                                vnode_t **, struct cred *);
+                                bhv_vname_t *, struct cred *);
-typedef int     (*vop_rmdir_t)(bhv_desc_t *, vname_t *, struct cred *);
+typedef int     (*vop_mkdir_t)(bhv_desc_t *, bhv_vname_t *, struct bhv_vattr *,
+                                bhv_vnode_t **, struct cred *);
+typedef int     (*vop_rmdir_t)(bhv_desc_t *, bhv_vname_t *, struct cred *);
 typedef int     (*vop_readdir_t)(bhv_desc_t *, struct uio *, struct cred *,
                                int *);
-typedef int     (*vop_symlink_t)(bhv_desc_t *, vname_t *, struct vattr *,
+typedef int     (*vop_symlink_t)(bhv_desc_t *, bhv_vname_t *, struct bhv_vattr*,
-                                char *, vnode_t **, struct cred *);
+                                char *, bhv_vnode_t **, struct cred *);
 typedef int     (*vop_readlink_t)(bhv_desc_t *, struct uio *, int,
                                struct cred *);
 typedef int     (*vop_fsync_t)(bhv_desc_t *, int, struct cred *,
@@ -209,8 +180,8 @@ typedef int	(*vop_fsync_t)(bhv_desc_t *, int, struct cred *,
 typedef int     (*vop_inactive_t)(bhv_desc_t *, struct cred *);
 typedef int     (*vop_fid2_t)(bhv_desc_t *, struct fid *);
 typedef int     (*vop_release_t)(bhv_desc_t *);
-typedef int     (*vop_rwlock_t)(bhv_desc_t *, vrwlock_t);
+typedef int     (*vop_rwlock_t)(bhv_desc_t *, bhv_vrwlock_t);
-typedef void    (*vop_rwunlock_t)(bhv_desc_t *, vrwlock_t);
+typedef void    (*vop_rwunlock_t)(bhv_desc_t *, bhv_vrwlock_t);
 typedef int     (*vop_bmap_t)(bhv_desc_t *, xfs_off_t, ssize_t, int,
                                struct xfs_iomap *, int *);
 typedef int     (*vop_reclaim_t)(bhv_desc_t *);
@@ -222,8 +193,8 @@ typedef	int	(*vop_attr_remove_t)(bhv_desc_t *, const char *,
                                int, struct cred *);
 typedef int     (*vop_attr_list_t)(bhv_desc_t *, char *, int, int,
                                struct attrlist_cursor_kern *, struct cred *);
-typedef void    (*vop_link_removed_t)(bhv_desc_t *, vnode_t *, int);
+typedef void    (*vop_link_removed_t)(bhv_desc_t *, bhv_vnode_t *, int);
-typedef void    (*vop_vnode_change_t)(bhv_desc_t *, vchange_t, __psint_t);
+typedef void    (*vop_vnode_change_t)(bhv_desc_t *, bhv_vchange_t, __psint_t);
 typedef void    (*vop_ptossvp_t)(bhv_desc_t *, xfs_off_t, xfs_off_t, int);
 typedef void    (*vop_pflushinvalvp_t)(bhv_desc_t *, xfs_off_t, xfs_off_t, int);
 typedef int     (*vop_pflushvp_t)(bhv_desc_t *, xfs_off_t, xfs_off_t,
@@ -231,9 +202,10 @@ typedef int	(*vop_pflushvp_t)(bhv_desc_t *, xfs_off_t, xfs_off_t,
 typedef int     (*vop_iflush_t)(bhv_desc_t *, int);
-typedef struct vnodeops {
+typedef struct bhv_vnodeops {
        bhv_position_t  vn_position;    /* position within behavior chain */
        vop_open_t              vop_open;
+        vop_close_t             vop_close;
        vop_read_t              vop_read;
        vop_write_t             vop_write;
        vop_sendfile_t          vop_sendfile;
@@ -271,103 +243,80 @@ typedef struct vnodeops {
        vop_pflushvp_t          vop_flush_pages;
        vop_release_t           vop_release;
        vop_iflush_t            vop_iflush;
-} vnodeops_t;
+} bhv_vnodeops_t;
 /*
- * VOP's.
+ * Virtual node operations, invoked via the vnode's head (first) behavior.
- */
-#define _VOP_(op, vp)   (*((vnodeops_t *)(vp)->v_fops)->op)
-#define VOP_READ(vp,file,iov,segs,offset,ioflags,cr,rv)                 \
-        rv = _VOP_(vop_read, vp)((vp)->v_fbhv,file,iov,segs,offset,ioflags,cr)
-#define VOP_WRITE(vp,file,iov,segs,offset,ioflags,cr,rv)                \
-        rv = _VOP_(vop_write, vp)((vp)->v_fbhv,file,iov,segs,offset,ioflags,cr)
-#define VOP_SENDFILE(vp,f,off,ioflags,cnt,act,targ,cr,rv)               \
-        rv = _VOP_(vop_sendfile, vp)((vp)->v_fbhv,f,off,ioflags,cnt,act,targ,cr)
-#define VOP_SPLICE_READ(vp,f,o,pipe,cnt,fl,iofl,cr,rv)                  \
-        rv = _VOP_(vop_splice_read, vp)((vp)->v_fbhv,f,o,pipe,cnt,fl,iofl,cr)
-#define VOP_SPLICE_WRITE(vp,f,o,pipe,cnt,fl,iofl,cr,rv)                 \
-        rv = _VOP_(vop_splice_write, vp)((vp)->v_fbhv,f,o,pipe,cnt,fl,iofl,cr)
-#define VOP_BMAP(vp,of,sz,rw,b,n,rv)                                    \
-        rv = _VOP_(vop_bmap, vp)((vp)->v_fbhv,of,sz,rw,b,n)
-#define VOP_OPEN(vp, cr, rv)                                            \
-        rv = _VOP_(vop_open, vp)((vp)->v_fbhv, cr)
-#define VOP_GETATTR(vp, vap, f, cr, rv)                                 \
-        rv = _VOP_(vop_getattr, vp)((vp)->v_fbhv, vap, f, cr)
-#define VOP_SETATTR(vp, vap, f, cr, rv)                                 \
-        rv = _VOP_(vop_setattr, vp)((vp)->v_fbhv, vap, f, cr)
-#define VOP_ACCESS(vp, mode, cr, rv)                                    \
-        rv = _VOP_(vop_access, vp)((vp)->v_fbhv, mode, cr)
-#define VOP_LOOKUP(vp,d,vpp,f,rdir,cr,rv)                               \
-        rv = _VOP_(vop_lookup, vp)((vp)->v_fbhv,d,vpp,f,rdir,cr)
-#define VOP_CREATE(dvp,d,vap,vpp,cr,rv)                                 \
-        rv = _VOP_(vop_create, dvp)((dvp)->v_fbhv,d,vap,vpp,cr)
-#define VOP_REMOVE(dvp,d,cr,rv)                                         \
-        rv = _VOP_(vop_remove, dvp)((dvp)->v_fbhv,d,cr)
-#define VOP_LINK(tdvp,fvp,d,cr,rv)                                      \
-        rv = _VOP_(vop_link, tdvp)((tdvp)->v_fbhv,fvp,d,cr)
-#define VOP_RENAME(fvp,fnm,tdvp,tnm,cr,rv)                              \
-        rv = _VOP_(vop_rename, fvp)((fvp)->v_fbhv,fnm,tdvp,tnm,cr)
-#define VOP_MKDIR(dp,d,vap,vpp,cr,rv)                                   \
-        rv = _VOP_(vop_mkdir, dp)((dp)->v_fbhv,d,vap,vpp,cr)
-#define VOP_RMDIR(dp,d,cr,rv)                                           \
-        rv = _VOP_(vop_rmdir, dp)((dp)->v_fbhv,d,cr)
-#define VOP_READDIR(vp,uiop,cr,eofp,rv)                                 \
-        rv = _VOP_(vop_readdir, vp)((vp)->v_fbhv,uiop,cr,eofp)
-#define VOP_SYMLINK(dvp,d,vap,tnm,vpp,cr,rv)                            \
-        rv = _VOP_(vop_symlink, dvp) ((dvp)->v_fbhv,d,vap,tnm,vpp,cr)
-#define VOP_READLINK(vp,uiop,fl,cr,rv)                                  \
-        rv = _VOP_(vop_readlink, vp)((vp)->v_fbhv,uiop,fl,cr)
-#define VOP_FSYNC(vp,f,cr,b,e,rv)                                       \
-        rv = _VOP_(vop_fsync, vp)((vp)->v_fbhv,f,cr,b,e)
-#define VOP_INACTIVE(vp, cr, rv)                                        \
-        rv = _VOP_(vop_inactive, vp)((vp)->v_fbhv, cr)
-#define VOP_RELEASE(vp, rv)                                             \
-        rv = _VOP_(vop_release, vp)((vp)->v_fbhv)
-#define VOP_FID2(vp, fidp, rv)                                          \
-        rv = _VOP_(vop_fid2, vp)((vp)->v_fbhv, fidp)
-#define VOP_RWLOCK(vp,i)                                                \
-        (void)_VOP_(vop_rwlock, vp)((vp)->v_fbhv, i)
-#define VOP_RWLOCK_TRY(vp,i)                                            \
-        _VOP_(vop_rwlock, vp)((vp)->v_fbhv, i)
-#define VOP_RWUNLOCK(vp,i)                                              \
-        (void)_VOP_(vop_rwunlock, vp)((vp)->v_fbhv, i)
-#define VOP_FRLOCK(vp,c,fl,flags,offset,fr,rv)                          \
-        rv = _VOP_(vop_frlock, vp)((vp)->v_fbhv,c,fl,flags,offset,fr)
-#define VOP_RECLAIM(vp, rv)                                             \
-        rv = _VOP_(vop_reclaim, vp)((vp)->v_fbhv)
-#define VOP_ATTR_GET(vp, name, val, vallenp, fl, cred, rv)              \
-        rv = _VOP_(vop_attr_get, vp)((vp)->v_fbhv,name,val,vallenp,fl,cred)
-#define VOP_ATTR_SET(vp, name, val, vallen, fl, cred, rv)               \
-        rv = _VOP_(vop_attr_set, vp)((vp)->v_fbhv,name,val,vallen,fl,cred)
-#define VOP_ATTR_REMOVE(vp, name, flags, cred, rv)                      \
-        rv = _VOP_(vop_attr_remove, vp)((vp)->v_fbhv,name,flags,cred)
-#define VOP_ATTR_LIST(vp, buf, buflen, fl, cursor, cred, rv)            \
-        rv = _VOP_(vop_attr_list, vp)((vp)->v_fbhv,buf,buflen,fl,cursor,cred)
-#define VOP_LINK_REMOVED(vp, dvp, linkzero)                             \
-        (void)_VOP_(vop_link_removed, vp)((vp)->v_fbhv, dvp, linkzero)
-#define VOP_VNODE_CHANGE(vp, cmd, val)                                  \
-        (void)_VOP_(vop_vnode_change, vp)((vp)->v_fbhv,cmd,val)
-/*
- * These are page cache functions that now go thru VOPs.
- * 'last' parameter is unused and left in for IRIX compatibility
 */
-#define VOP_TOSS_PAGES(vp, first, last, fiopt)                          \
+#define VNHEAD(vp)      ((vp)->v_bh.bh_first)
-        _VOP_(vop_tosspages, vp)((vp)->v_fbhv,first, last, fiopt)
+#define VOP(op, vp)     (*((bhv_vnodeops_t *)VNHEAD(vp)->bd_ops)->op)
-/*
+#define bhv_vop_open(vp, cr)            VOP(vop_open, vp)(VNHEAD(vp),cr)
- * 'last' parameter is unused and left in for IRIX compatibility
+#define bhv_vop_close(vp, f,last,cr)    VOP(vop_close, vp)(VNHEAD(vp),f,last,cr)
- */
+#define bhv_vop_read(vp,file,iov,segs,offset,ioflags,cr)                \
-#define VOP_FLUSHINVAL_PAGES(vp, first, last, fiopt)                    \
+                VOP(vop_read, vp)(VNHEAD(vp),file,iov,segs,offset,ioflags,cr)
-        _VOP_(vop_flushinval_pages, vp)((vp)->v_fbhv,first,last,fiopt)
+#define bhv_vop_write(vp,file,iov,segs,offset,ioflags,cr)               \
-/*
+                VOP(vop_write, vp)(VNHEAD(vp),file,iov,segs,offset,ioflags,cr)
- * 'last' parameter is unused and left in for IRIX compatibility
+#define bhv_vop_sendfile(vp,f,off,ioflags,cnt,act,targ,cr)              \
- */
+                VOP(vop_sendfile, vp)(VNHEAD(vp),f,off,ioflags,cnt,act,targ,cr)
-#define VOP_FLUSH_PAGES(vp, first, last, flags, fiopt, rv)              \
+#define bhv_vop_splice_read(vp,f,o,pipe,cnt,fl,iofl,cr)                 \
-        rv = _VOP_(vop_flush_pages, vp)((vp)->v_fbhv,first,last,flags,fiopt)
+                VOP(vop_splice_read, vp)(VNHEAD(vp),f,o,pipe,cnt,fl,iofl,cr)
-#define VOP_IOCTL(vp, inode, filp, fl, cmd, arg, rv)                    \
+#define bhv_vop_splice_write(vp,f,o,pipe,cnt,fl,iofl,cr)                \
-        rv = _VOP_(vop_ioctl, vp)((vp)->v_fbhv,inode,filp,fl,cmd,arg)
+                VOP(vop_splice_write, vp)(VNHEAD(vp),f,o,pipe,cnt,fl,iofl,cr)
-#define VOP_IFLUSH(vp, flags, rv)                                       \
+#define bhv_vop_bmap(vp,of,sz,rw,b,n)                                   \
-        rv = _VOP_(vop_iflush, vp)((vp)->v_fbhv, flags)
+                VOP(vop_bmap, vp)(VNHEAD(vp),of,sz,rw,b,n)
+#define bhv_vop_getattr(vp, vap,f,cr)                                   \
+                VOP(vop_getattr, vp)(VNHEAD(vp), vap,f,cr)
+#define bhv_vop_setattr(vp, vap,f,cr)                                   \
+                VOP(vop_setattr, vp)(VNHEAD(vp), vap,f,cr)
+#define bhv_vop_access(vp, mode,cr)     VOP(vop_access, vp)(VNHEAD(vp), mode,cr)
+#define bhv_vop_lookup(vp,d,vpp,f,rdir,cr)                              \
+                VOP(vop_lookup, vp)(VNHEAD(vp),d,vpp,f,rdir,cr)
+#define bhv_vop_create(dvp,d,vap,vpp,cr)                                \
+                VOP(vop_create, dvp)(VNHEAD(dvp),d,vap,vpp,cr)
+#define bhv_vop_remove(dvp,d,cr)        VOP(vop_remove, dvp)(VNHEAD(dvp),d,cr)
+#define bhv_vop_link(dvp,fvp,d,cr)      VOP(vop_link, dvp)(VNHEAD(dvp),fvp,d,cr)
+#define bhv_vop_rename(fvp,fnm,tdvp,tnm,cr)                             \
+                VOP(vop_rename, fvp)(VNHEAD(fvp),fnm,tdvp,tnm,cr)
+#define bhv_vop_mkdir(dp,d,vap,vpp,cr)                                  \
+                VOP(vop_mkdir, dp)(VNHEAD(dp),d,vap,vpp,cr)
+#define bhv_vop_rmdir(dp,d,cr)          VOP(vop_rmdir, dp)(VNHEAD(dp),d,cr)
+#define bhv_vop_readdir(vp,uiop,cr,eofp)                                \
+                VOP(vop_readdir, vp)(VNHEAD(vp),uiop,cr,eofp)
+#define bhv_vop_symlink(dvp,d,vap,tnm,vpp,cr)                           \
+                VOP(vop_symlink, dvp)(VNHEAD(dvp),d,vap,tnm,vpp,cr)
+#define bhv_vop_readlink(vp,uiop,fl,cr)                                 \
+                VOP(vop_readlink, vp)(VNHEAD(vp),uiop,fl,cr)
+#define bhv_vop_fsync(vp,f,cr,b,e)      VOP(vop_fsync, vp)(VNHEAD(vp),f,cr,b,e)
+#define bhv_vop_inactive(vp,cr)         VOP(vop_inactive, vp)(VNHEAD(vp),cr)
+#define bhv_vop_release(vp)             VOP(vop_release, vp)(VNHEAD(vp))
+#define bhv_vop_fid2(vp,fidp)           VOP(vop_fid2, vp)(VNHEAD(vp),fidp)
+#define bhv_vop_rwlock(vp,i)            VOP(vop_rwlock, vp)(VNHEAD(vp),i)
+#define bhv_vop_rwlock_try(vp,i)        VOP(vop_rwlock, vp)(VNHEAD(vp),i)
+#define bhv_vop_rwunlock(vp,i)          VOP(vop_rwunlock, vp)(VNHEAD(vp),i)
+#define bhv_vop_frlock(vp,c,fl,flags,offset,fr)                         \
+                VOP(vop_frlock, vp)(VNHEAD(vp),c,fl,flags,offset,fr)
+#define bhv_vop_reclaim(vp)             VOP(vop_reclaim, vp)(VNHEAD(vp))
+#define bhv_vop_attr_get(vp, name, val, vallenp, fl, cred)              \
+                VOP(vop_attr_get, vp)(VNHEAD(vp),name,val,vallenp,fl,cred)
+#define bhv_vop_attr_set(vp, name, val, vallen, fl, cred)               \
+                VOP(vop_attr_set, vp)(VNHEAD(vp),name,val,vallen,fl,cred)
+#define bhv_vop_attr_remove(vp, name, flags, cred)                      \
+                VOP(vop_attr_remove, vp)(VNHEAD(vp),name,flags,cred)
+#define bhv_vop_attr_list(vp, buf, buflen, fl, cursor, cred)            \
+                VOP(vop_attr_list, vp)(VNHEAD(vp),buf,buflen,fl,cursor,cred)
+#define bhv_vop_link_removed(vp, dvp, linkzero)                         \
+                VOP(vop_link_removed, vp)(VNHEAD(vp), dvp, linkzero)
+#define bhv_vop_vnode_change(vp, cmd, val)                              \
+                VOP(vop_vnode_change, vp)(VNHEAD(vp), cmd, val)
+#define bhv_vop_toss_pages(vp, first, last, fiopt)                      \
+                VOP(vop_tosspages, vp)(VNHEAD(vp), first, last, fiopt)
+#define bhv_vop_flushinval_pages(vp, first, last, fiopt)                \
+                VOP(vop_flushinval_pages, vp)(VNHEAD(vp),first,last,fiopt)
+#define bhv_vop_flush_pages(vp, first, last, flags, fiopt)              \
+                VOP(vop_flush_pages, vp)(VNHEAD(vp),first,last,flags,fiopt)
+#define bhv_vop_ioctl(vp, inode, filp, fl, cmd, arg)                    \
+                VOP(vop_ioctl, vp)(VNHEAD(vp),inode,filp,fl,cmd,arg)
+#define bhv_vop_iflush(vp, flags)       VOP(vop_iflush, vp)(VNHEAD(vp), flags)
 /*
 * Flags for read/write calls - same values as IRIX
@@ -377,7 +326,7 @@ typedef struct vnodeops {
 #define IO_INVIS        0x00020         /* don't update inode timestamps */
 /*
- * Flags for VOP_IFLUSH call
+ * Flags for vop_iflush call
 */
 #define FLUSH_SYNC              1       /* wait for flush to complete   */
 #define FLUSH_INODE             2       /* flush the inode itself       */
@@ -385,8 +334,7 @@ typedef struct vnodeops {
                                         * this inode out to disk       */
 /*
- * Flush/Invalidate options for VOP_TOSS_PAGES, VOP_FLUSHINVAL_PAGES and
+ * Flush/Invalidate options for bhv_vop_{toss,flush,flushinval}_pages.
- *      VOP_FLUSH_PAGES.
 */
 #define FI_NONE                 0       /* none */
 #define FI_REMAPF               1       /* Do a remapf prior to the operation */
@@ -398,7 +346,7 @@ typedef struct vnodeops {
 * Vnode attributes.  va_mask indicates those attributes the caller
 * wants to set or extract.
 */
-typedef struct vattr {
+typedef struct bhv_vattr {
        int             va_mask;        /* bit-mask of attributes present */
        mode_t          va_mode;        /* file access mode and type */
        xfs_nlink_t     va_nlink;       /* number of references to file */
@@ -418,7 +366,7 @@ typedef struct vattr {
        u_long          va_nextents;    /* number of extents in file */
        u_long          va_anextents;   /* number of attr extents in file */
        prid_t          va_projid;      /* project id */
-} vattr_t;
+} bhv_vattr_t;
 /*
 * setattr or getattr attributes
@@ -492,29 +440,17 @@ typedef struct vattr {
        (VN_ISREG(vp) && ((mode) & (VSGID|(VEXEC>>3))) == VSGID)
 extern void     vn_init(void);
-extern vnode_t  *vn_initialize(struct inode *);
+extern bhv_vnode_t      *vn_initialize(struct inode *);
+extern int      vn_revalidate(struct bhv_vnode *);
-/*
+extern int      __vn_revalidate(struct bhv_vnode *, bhv_vattr_t *);
- * vnode_map structures _must_ match vn_epoch and vnode structure sizes.
+extern void     vn_revalidate_core(struct bhv_vnode *, bhv_vattr_t *);
- */
-typedef struct vnode_map {
-        vfs_t           *v_vfsp;
-        vnumber_t       v_number;               /* in-core vnode number */
-        xfs_ino_t       v_ino;                  /* inode #      */
-} vmap_t;
-#define VMAP(vp, vmap)  {(vmap).v_vfsp   = (vp)->v_vfsp,        \
-                         (vmap).v_number = (vp)->v_number,      \
-                         (vmap).v_ino    = (vp)->v_inode.i_ino; }
-extern int      vn_revalidate(struct vnode *);
+extern void     vn_iowait(struct bhv_vnode *vp);
-extern int      __vn_revalidate(struct vnode *, vattr_t *);
+extern void     vn_iowake(struct bhv_vnode *vp);
-extern void     vn_revalidate_core(struct vnode *, vattr_t *);
-extern void     vn_iowait(struct vnode *vp);
+extern void     vn_ioerror(struct bhv_vnode *vp, int error, char *f, int l);
-extern void     vn_iowake(struct vnode *vp);
-static inline int vn_count(struct vnode *vp)
+static inline int vn_count(struct bhv_vnode *vp)
 {
        return atomic_read(&vn_to_inode(vp)->i_count);
 }
@@ -522,7 +458,7 @@ static inline int vn_count(struct vnode *vp)
 /*
 * Vnode reference counting functions (and macros for compatibility).
 */
-extern vnode_t  *vn_hold(struct vnode *);
+extern bhv_vnode_t      *vn_hold(struct bhv_vnode *);
 #if defined(XFS_VNODE_TRACE)
 #define VN_HOLD(vp)             \
@@ -536,7 +472,7 @@ extern vnode_t	*vn_hold(struct vnode *);
 #define VN_RELE(vp)             (iput(vn_to_inode(vp)))
 #endif
-static inline struct vnode *vn_grab(struct vnode *vp)
+static inline struct bhv_vnode *vn_grab(struct bhv_vnode *vp)
 {
        struct inode *inode = igrab(vn_to_inode(vp));
        return inode ? vn_from_inode(inode) : NULL;
@@ -554,32 +490,39 @@ static inline struct vnode *vn_grab(struct vnode *vp)
 */
 #define VN_LOCK(vp)             mutex_spinlock(&(vp)->v_lock)
 #define VN_UNLOCK(vp, s)        mutex_spinunlock(&(vp)->v_lock, s)
-#define VN_FLAGSET(vp,b)        vn_flagset(vp,b)
-#define VN_FLAGCLR(vp,b)        vn_flagclr(vp,b)
-static __inline__ void vn_flagset(struct vnode *vp, uint flag)
+static __inline__ void vn_flagset(struct bhv_vnode *vp, uint flag)
 {
        spin_lock(&vp->v_lock);
        vp->v_flag |= flag;
        spin_unlock(&vp->v_lock);
 }
-static __inline__ void vn_flagclr(struct vnode *vp, uint flag)
+static __inline__ uint vn_flagclr(struct bhv_vnode *vp, uint flag)
 {
+        uint    cleared;
        spin_lock(&vp->v_lock);
+        cleared = (vp->v_flag & flag);
        vp->v_flag &= ~flag;
        spin_unlock(&vp->v_lock);
+        return cleared;
 }
+#define VMODIFY(vp)     vn_flagset(vp, VMODIFIED)
+#define VUNMODIFY(vp)   vn_flagclr(vp, VMODIFIED)
+#define VTRUNCATE(vp)   vn_flagset(vp, VTRUNCATED)
+#define VUNTRUNCATE(vp) vn_flagclr(vp, VTRUNCATED)
 /*
 * Dealing with bad inodes
 */
-static inline void vn_mark_bad(struct vnode *vp)
+static inline void vn_mark_bad(struct bhv_vnode *vp)
 {
        make_bad_inode(vn_to_inode(vp));
 }
-static inline int VN_BAD(struct vnode *vp)
+static inline int VN_BAD(struct bhv_vnode *vp)
 {
        return is_bad_inode(vn_to_inode(vp));
 }
@@ -587,18 +530,18 @@ static inline int VN_BAD(struct vnode *vp)
 /*
 * Extracting atime values in various formats
 */
-static inline void vn_atime_to_bstime(struct vnode *vp, xfs_bstime_t *bs_atime)
+static inline void vn_atime_to_bstime(bhv_vnode_t *vp, xfs_bstime_t *bs_atime)
 {
        bs_atime->tv_sec = vp->v_inode.i_atime.tv_sec;
        bs_atime->tv_nsec = vp->v_inode.i_atime.tv_nsec;
 }
-static inline void vn_atime_to_timespec(struct vnode *vp, struct timespec *ts)
+static inline void vn_atime_to_timespec(bhv_vnode_t *vp, struct timespec *ts)
 {
        *ts = vp->v_inode.i_atime;
 }
-static inline void vn_atime_to_time_t(struct vnode *vp, time_t *tt)
+static inline void vn_atime_to_time_t(bhv_vnode_t *vp, time_t *tt)
 {
        *tt = vp->v_inode.i_atime.tv_sec;
 }
@@ -610,11 +553,10 @@ static inline void vn_atime_to_time_t(struct vnode *vp, time_t *tt)
 #define VN_CACHED(vp)   (vn_to_inode(vp)->i_mapping->nrpages)
 #define VN_DIRTY(vp)    mapping_tagged(vn_to_inode(vp)->i_mapping, \
                                        PAGECACHE_TAG_DIRTY)
-#define VMODIFY(vp)     VN_FLAGSET(vp, VMODIFIED)
+#define VN_TRUNC(vp)    ((vp)->v_flag & VTRUNCATED)
-#define VUNMODIFY(vp)   VN_FLAGCLR(vp, VMODIFIED)
 /*
- * Flags to VOP_SETATTR/VOP_GETATTR.
+ * Flags to vop_setattr/getattr.
 */
 #define ATTR_UTIME      0x01    /* non-default utime(2) request */
 #define ATTR_DMI        0x08    /* invocation from a DMI function */
@@ -624,7 +566,7 @@ static inline void vn_atime_to_time_t(struct vnode *vp, time_t *tt)
 #define ATTR_NOSIZETOK  0x400   /* Don't get the SIZE token */
 /*
- * Flags to VOP_FSYNC and VOP_RECLAIM.
+ * Flags to vop_fsync/reclaim.
 */
 #define FSYNC_NOWAIT    0       /* asynchronous flush */
 #define FSYNC_WAIT      0x1     /* synchronous fsync or forced reclaim */
@@ -643,11 +585,11 @@ static inline void vn_atime_to_time_t(struct vnode *vp, time_t *tt)
 #define VNODE_KTRACE_REF        4
 #define VNODE_KTRACE_RELE       5
-extern void vn_trace_entry(struct vnode *, const char *, inst_t *);
+extern void vn_trace_entry(struct bhv_vnode *, const char *, inst_t *);
-extern void vn_trace_exit(struct vnode *, const char *, inst_t *);
+extern void vn_trace_exit(struct bhv_vnode *, const char *, inst_t *);
-extern void vn_trace_hold(struct vnode *, char *, int, inst_t *);
+extern void vn_trace_hold(struct bhv_vnode *, char *, int, inst_t *);
-extern void vn_trace_ref(struct vnode *, char *, int, inst_t *);
+extern void vn_trace_ref(struct bhv_vnode *, char *, int, inst_t *);
-extern void vn_trace_rele(struct vnode *, char *, int, inst_t *);
+extern void vn_trace_rele(struct bhv_vnode *, char *, int, inst_t *);
 #define VN_TRACE(vp)            \
        vn_trace_ref(vp, __FILE__, __LINE__, (inst_t *)__return_address)
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index 772ac48329ea..3aa771531856 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -23,7 +23,6 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_alloc.h"
 #include "xfs_dmapi.h"
@@ -32,7 +31,6 @@
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
@@ -444,7 +442,7 @@ xfs_qm_dqalloc(
                              XFS_BMAPI_METADATA | XFS_BMAPI_WRITE,
                              &firstblock,
                              XFS_QM_DQALLOC_SPACE_RES(mp),
-                              &map, &nmaps, &flist))) {
+                              &map, &nmaps, &flist, NULL))) {
                goto error0;
        }
        ASSERT(map.br_blockcount == XFS_DQUOT_CLUSTER_SIZE_FSB);
@@ -559,7 +557,7 @@ xfs_qm_dqtobp(
                error = xfs_bmapi(NULL, quotip, dqp->q_fileoffset,
                                  XFS_DQUOT_CLUSTER_SIZE_FSB,
                                  XFS_BMAPI_METADATA,
-                                  NULL, 0, &map, &nmaps, NULL);
+                                  NULL, 0, &map, &nmaps, NULL, NULL);
                xfs_iunlock(quotip, XFS_ILOCK_SHARED);
                if (error)
@@ -1261,7 +1259,7 @@ xfs_qm_dqflush(
        if (xfs_qm_dqcheck(&dqp->q_core, be32_to_cpu(ddqp->d_id),
                           0, XFS_QMOPT_DOWARN, "dqflush (incore copy)")) {
-                xfs_force_shutdown(dqp->q_mount, XFS_CORRUPT_INCORE);
+                xfs_force_shutdown(dqp->q_mount, SHUTDOWN_CORRUPT_INCORE);
                return XFS_ERROR(EIO);
        }
diff --git a/fs/xfs/quota/xfs_dquot.h b/fs/xfs/quota/xfs_dquot.h
index c0c629663a5c..78d3ab95c5fd 100644
--- a/fs/xfs/quota/xfs_dquot.h
+++ b/fs/xfs/quota/xfs_dquot.h
@@ -119,7 +119,7 @@ XFS_DQ_IS_LOCKED(xfs_dquot_t *dqp)
 */
 #define xfs_dqflock(dqp)         { psema(&((dqp)->q_flock), PINOD | PRECALC);\
                                   (dqp)->dq_flags |= XFS_DQ_FLOCKED; }
-#define xfs_dqfunlock(dqp)       { ASSERT(valusema(&((dqp)->q_flock)) <= 0); \
+#define xfs_dqfunlock(dqp)       { ASSERT(issemalocked(&((dqp)->q_flock))); \
                                   vsema(&((dqp)->q_flock)); \
                                   (dqp)->dq_flags &= ~(XFS_DQ_FLOCKED); }
@@ -128,7 +128,7 @@ XFS_DQ_IS_LOCKED(xfs_dquot_t *dqp)
 #define XFS_DQ_PINUNLOCK(dqp, s)   mutex_spinunlock( \
                                     &(XFS_DQ_TO_QINF(dqp)->qi_pinlock), s)
-#define XFS_DQ_IS_FLUSH_LOCKED(dqp) (valusema(&((dqp)->q_flock)) <= 0)
+#define XFS_DQ_IS_FLUSH_LOCKED(dqp) (issemalocked(&((dqp)->q_flock)))
 #define XFS_DQ_IS_ON_FREELIST(dqp)  ((dqp)->dq_flnext != (dqp))
 #define XFS_DQ_IS_DIRTY(dqp)    ((dqp)->dq_flags & XFS_DQ_DIRTY)
 #define XFS_QM_ISUDQ(dqp)       ((dqp)->dq_flags & XFS_DQ_USER)
diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c
index 546f48af882a..5b2dcc58b244 100644
--- a/fs/xfs/quota/xfs_dquot_item.c
+++ b/fs/xfs/quota/xfs_dquot_item.c
@@ -23,7 +23,6 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_alloc.h"
 #include "xfs_dmapi.h"
@@ -32,7 +31,6 @@
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
@@ -248,7 +246,7 @@ xfs_qm_dquot_logitem_pushbuf(
         * inode flush completed and the inode was taken off the AIL.
         * So, just get out.
         */
-        if ((valusema(&(dqp->q_flock)) > 0)  ||
+        if (!issemalocked(&(dqp->q_flock))  ||
            ((qip->qli_item.li_flags & XFS_LI_IN_AIL) == 0)) {
                qip->qli_pushbuf_flag = 0;
                xfs_dqunlock(dqp);
@@ -261,7 +259,7 @@ xfs_qm_dquot_logitem_pushbuf(
        if (bp != NULL) {
                if (XFS_BUF_ISDELAYWRITE(bp)) {
                        dopush = ((qip->qli_item.li_flags & XFS_LI_IN_AIL) &&
-                                  (valusema(&(dqp->q_flock)) <= 0));
+                                  issemalocked(&(dqp->q_flock)));
                        qip->qli_pushbuf_flag = 0;
                        xfs_dqunlock(dqp);
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 7fb5eca9bd50..e23e45535c48 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -24,7 +24,6 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_alloc.h"
 #include "xfs_dmapi.h"
@@ -33,7 +32,6 @@
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
@@ -1603,7 +1601,7 @@ xfs_qm_dqiterate(
                                  maxlblkcnt - lblkno,
                                  XFS_BMAPI_METADATA,
                                  NULL,
-                                  0, map, &nmaps, NULL);
+                                  0, map, &nmaps, NULL, NULL);
                xfs_iunlock(qip, XFS_ILOCK_SHARED);
                if (error)
                        break;
@@ -1905,9 +1903,7 @@ xfs_qm_quotacheck(
                 */
                if ((error = xfs_bulkstat(mp, &lastino, &count,
                                     xfs_qm_dqusage_adjust, NULL,
-                                     structsz, NULL,
+                                     structsz, NULL, BULKSTAT_FG_IGET, &done)))
-                                     BULKSTAT_FG_IGET|BULKSTAT_FG_VFSLOCKED,
-                                     &done)))
                        break;
        } while (! done);
diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c
index 6838b36d95a9..e95e99f7168f 100644
--- a/fs/xfs/quota/xfs_qm_bhv.c
+++ b/fs/xfs/quota/xfs_qm_bhv.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
 * All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
@@ -24,7 +24,6 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_alloc.h"
 #include "xfs_dmapi.h"
@@ -33,7 +32,6 @@
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
@@ -129,7 +127,7 @@ xfs_qm_parseargs(
                return XFS_ERROR(EINVAL);
        }
-        PVFS_PARSEARGS(BHV_NEXT(bhv), options, args, update, error);
+        error = bhv_next_vfs_parseargs(BHV_NEXT(bhv), options, args, update);
        if (!error && !referenced)
                bhv_remove_vfsops(bhvtovfs(bhv), VFS_POSITION_QM);
        return error;
@@ -140,9 +138,8 @@ xfs_qm_showargs(
        struct bhv_desc         *bhv,
        struct seq_file         *m)
 {
-        struct vfs              *vfsp = bhvtovfs(bhv);
+        struct bhv_vfs          *vfsp = bhvtovfs(bhv);
        struct xfs_mount        *mp = XFS_VFSTOM(vfsp);
-        int                     error;
        if (mp->m_qflags & XFS_UQUOTA_ACCT) {
                (mp->m_qflags & XFS_UQUOTA_ENFD) ?
@@ -165,8 +162,7 @@ xfs_qm_showargs(
        if (!(mp->m_qflags & XFS_ALL_QUOTA_ACCT))
                seq_puts(m, "," MNTOPT_NOQUOTA);
-        PVFS_SHOWARGS(BHV_NEXT(bhv), m, error);
+        return bhv_next_vfs_showargs(BHV_NEXT(bhv), m);
-        return error;
 }
 STATIC int
@@ -175,14 +171,67 @@ xfs_qm_mount(
        struct xfs_mount_args   *args,
        struct cred             *cr)
 {
-        struct vfs              *vfsp = bhvtovfs(bhv);
+        struct bhv_vfs          *vfsp = bhvtovfs(bhv);
        struct xfs_mount        *mp = XFS_VFSTOM(vfsp);
-        int                     error;
        if (args->flags & (XFSMNT_UQUOTA | XFSMNT_GQUOTA | XFSMNT_PQUOTA))
                xfs_qm_mount_quotainit(mp, args->flags);
-        PVFS_MOUNT(BHV_NEXT(bhv), args, cr, error);
+        return bhv_next_vfs_mount(BHV_NEXT(bhv), args, cr);
-        return error;
+}
+/*
+ * Directory tree accounting is implemented using project quotas, where
+ * the project identifier is inherited from parent directories.
+ * A statvfs (df, etc.) of a directory that is using project quota should
+ * return a statvfs of the project, not the entire filesystem.
+ * This makes such trees appear as if they are filesystems in themselves.
+ *
+ * The next behavior in the chain fills in statp first; the totals and
+ * free counts are then clamped to the project's block/inode limits when
+ * the vnode's inode has XFS_DIFLAG_PROJINHERIT set and project quota
+ * accounting and enforcement are both enabled.  Returns the error from
+ * the next behavior, otherwise 0 (a failed dquot lookup is not treated
+ * as an error; the unclamped filesystem-wide values are returned).
+ */
+STATIC int
+xfs_qm_statvfs(
+        struct bhv_desc         *bhv,
+        bhv_statvfs_t           *statp,
+        struct bhv_vnode        *vnode)
+{
+        xfs_mount_t             *mp;
+        xfs_inode_t             *ip;
+        xfs_dquot_t             *dqp;
+        xfs_disk_dquot_t        *dp;
+        __uint64_t              limit;
+        int                     error;
+        error = bhv_next_vfs_statvfs(BHV_NEXT(bhv), statp, vnode);
+        if (error || !vnode)
+                return error;
+        mp = XFS_BHVTOM(bhv);
+        ip = xfs_vtoi(vnode);
+        if (!(ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT))
+                return 0;
+        if (!(mp->m_qflags & XFS_PQUOTA_ACCT))
+                return 0;
+        if (!(mp->m_qflags & XFS_OQUOTA_ENFD))
+                return 0;
+        if (xfs_qm_dqget(mp, NULL, ip->i_d.di_projid, XFS_DQ_PROJ, 0, &dqp))
+                return 0;
+        /*
+         * NOTE(review): q_core is an xfs_disk_dquot_t; if its fields are
+         * stored in on-disk byte order they need converting before the
+         * comparisons below -- confirm against the structure definition.
+         */
+        dp = &dqp->q_core;
+        /* Prefer the soft limit; fall back to the hard limit if unset. */
+        limit = dp->d_blk_softlimit ? dp->d_blk_softlimit : dp->d_blk_hardlimit;
+        if (limit && statp->f_blocks > limit) {
+                statp->f_blocks = limit;
+                statp->f_bfree = (statp->f_blocks > dp->d_bcount) ?
+                                        (statp->f_blocks - dp->d_bcount) : 0;
+        }
+        limit = dp->d_ino_softlimit ? dp->d_ino_softlimit : dp->d_ino_hardlimit;
+        if (limit && statp->f_files > limit) {
+                statp->f_files = limit;
+                /*
+                 * Free inodes are the clamped total minus the project's
+                 * usage, mirroring the block calculation above; using the
+                 * filesystem-wide f_ffree here could exceed f_files.
+                 */
+                statp->f_ffree = (statp->f_files > dp->d_icount) ?
+                                        (statp->f_files - dp->d_icount) : 0;
+        }
+        xfs_qm_dqput(dqp);
+        return 0;
 }
 STATIC int
@@ -191,7 +240,7 @@ xfs_qm_syncall(
        int                     flags,
        cred_t                  *credp)
 {
-        struct vfs              *vfsp = bhvtovfs(bhv);
+        struct bhv_vfs          *vfsp = bhvtovfs(bhv);
        struct xfs_mount        *mp = XFS_VFSTOM(vfsp);
        int                     error;
@@ -210,8 +259,7 @@ xfs_qm_syncall(
                        }
                }
        }
-        PVFS_SYNC(BHV_NEXT(bhv), flags, credp, error);
+        return bhv_next_vfs_sync(BHV_NEXT(bhv), flags, credp);
-        return error;
 }
 STATIC int
@@ -346,11 +394,12 @@ STATIC struct xfs_qmops xfs_qmcore_xfs = {
        .xfs_dqtrxops           = &xfs_trans_dquot_ops,
 };
-struct bhv_vfsops xfs_qmops = { {
+struct bhv_module_vfsops xfs_qmops = { {
        BHV_IDENTITY_INIT(VFS_BHV_QM, VFS_POSITION_QM),
        .vfs_parseargs          = xfs_qm_parseargs,
        .vfs_showargs           = xfs_qm_showargs,
        .vfs_mount              = xfs_qm_mount,
+        .vfs_statvfs            = xfs_qm_statvfs,
        .vfs_sync               = xfs_qm_syncall,
        .vfs_quotactl           = xfs_qm_quotactl, },
 };
diff --git a/fs/xfs/quota/xfs_qm_stats.c b/fs/xfs/quota/xfs_qm_stats.c
index 0570f7733550..6f858fb81a36 100644
--- a/fs/xfs/quota/xfs_qm_stats.c
+++ b/fs/xfs/quota/xfs_qm_stats.c
@@ -23,7 +23,6 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_alloc.h"
 #include "xfs_dmapi.h"
@@ -32,7 +31,6 @@
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index c55db463bbf2..ed620c4d1594 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -26,7 +26,6 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_alloc.h"
 #include "xfs_dmapi.h"
@@ -35,7 +34,6 @@
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
@@ -91,8 +89,8 @@ xfs_qm_quotactl(
        xfs_caddr_t     addr)
 {
        xfs_mount_t     *mp;
+        bhv_vfs_t       *vfsp;
        int             error;
-        struct vfs      *vfsp;
        vfsp = bhvtovfs(bdp);
        mp = XFS_VFSTOM(vfsp);
@@ -1035,7 +1033,7 @@ xfs_qm_dqrele_all_inodes(
 {
        xfs_inode_t     *ip, *topino;
        uint            ireclaims;
-        vnode_t         *vp;
+        bhv_vnode_t     *vp;
        boolean_t       vnode_refd;
        ASSERT(mp->m_quotainfo);
diff --git a/fs/xfs/quota/xfs_trans_dquot.c b/fs/xfs/quota/xfs_trans_dquot.c
index 9168918db252..0242e9666e8e 100644
--- a/fs/xfs/quota/xfs_trans_dquot.c
+++ b/fs/xfs/quota/xfs_trans_dquot.c
@@ -23,7 +23,6 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_alloc.h"
 #include "xfs_dmapi.h"
@@ -33,7 +32,6 @@
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
 #include "xfs_attr_sf.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
diff --git a/fs/xfs/support/debug.c b/fs/xfs/support/debug.c
index b08b3d9345b7..36fbeccdc722 100644
--- a/fs/xfs/support/debug.c
+++ b/fs/xfs/support/debug.c
@@ -47,7 +47,7 @@ cmn_err(register int level, char *fmt, ...)
        va_start(ap, fmt);
        if (*fmt == '!') fp++;
        len = vsprintf(message, fp, ap);
-        if (message[len-1] != '\n')
+        if (level != CE_DEBUG && message[len-1] != '\n')
                strcat(message, "\n");
        printk("%s%s", err_level[level], message);
        va_end(ap);
@@ -68,7 +68,7 @@ icmn_err(register int level, char *fmt, va_list ap)
                level = XFS_MAX_ERR_LEVEL;
        spin_lock_irqsave(&xfs_err_lock,flags);
        len = vsprintf(message, fmt, ap);
-        if (message[len-1] != '\n')
+        if (level != CE_DEBUG && message[len-1] != '\n')
                strcat(message, "\n");
        spin_unlock_irqrestore(&xfs_err_lock,flags);
        printk("%s%s", err_level[level], message);
diff --git a/fs/xfs/support/debug.h b/fs/xfs/support/debug.h
index e3bf58112e7e..4f54dca662a8 100644
--- a/fs/xfs/support/debug.h
+++ b/fs/xfs/support/debug.h
@@ -33,9 +33,6 @@ extern void cmn_err(int, char *, ...)
        __attribute__ ((format (printf, 2, 3)));
 extern void assfail(char *expr, char *f, int l);
-#define prdev(fmt,targ,args...) \
-        printk("Device %s - " fmt "\n", XFS_BUFTARG_NAME(targ), ## args)
 #define ASSERT_ALWAYS(expr)     \
        (unlikely((expr) != 0) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index 2539af34eb63..4b0cb474be4c 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -21,12 +21,10 @@
 #include "xfs_bit.h"
 #include "xfs_inum.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
@@ -39,15 +37,15 @@
 #include <linux/capability.h>
 #include <linux/posix_acl_xattr.h>
-STATIC int      xfs_acl_setmode(vnode_t *, xfs_acl_t *, int *);
+STATIC int      xfs_acl_setmode(bhv_vnode_t *, xfs_acl_t *, int *);
 STATIC void     xfs_acl_filter_mode(mode_t, xfs_acl_t *);
 STATIC void     xfs_acl_get_endian(xfs_acl_t *);
 STATIC int      xfs_acl_access(uid_t, gid_t, xfs_acl_t *, mode_t, cred_t *);
 STATIC int      xfs_acl_invalid(xfs_acl_t *);
 STATIC void     xfs_acl_sync_mode(mode_t, xfs_acl_t *);
-STATIC void     xfs_acl_get_attr(vnode_t *, xfs_acl_t *, int, int, int *);
+STATIC void     xfs_acl_get_attr(bhv_vnode_t *, xfs_acl_t *, int, int, int *);
-STATIC void     xfs_acl_set_attr(vnode_t *, xfs_acl_t *, int, int *);
+STATIC void     xfs_acl_set_attr(bhv_vnode_t *, xfs_acl_t *, int, int *);
-STATIC int      xfs_acl_allow_set(vnode_t *, int);
+STATIC int      xfs_acl_allow_set(bhv_vnode_t *, int);
 kmem_zone_t *xfs_acl_zone;
@@ -57,7 +55,7 @@ kmem_zone_t *xfs_acl_zone;
 */
 int
 xfs_acl_vhasacl_access(
-        vnode_t         *vp)
+        bhv_vnode_t     *vp)
 {
        int             error;
@@ -70,7 +68,7 @@ xfs_acl_vhasacl_access(
 */
 int
 xfs_acl_vhasacl_default(
-        vnode_t         *vp)
+        bhv_vnode_t     *vp)
 {
        int             error;
@@ -209,7 +207,7 @@ posix_acl_xfs_to_xattr(
 int
 xfs_acl_vget(
-        vnode_t         *vp,
+        bhv_vnode_t     *vp,
        void            *acl,
        size_t          size,
        int             kind)
@@ -241,10 +239,10 @@ xfs_acl_vget(
                        goto out;
                }
                if (kind == _ACL_TYPE_ACCESS) {
-                        vattr_t va;
+                        bhv_vattr_t     va;
                        va.va_mask = XFS_AT_MODE;
-                        VOP_GETATTR(vp, &va, 0, sys_cred, error);
+                        error = bhv_vop_getattr(vp, &va, 0, sys_cred);
                        if (error)
                                goto out;
                        xfs_acl_sync_mode(va.va_mode, xfs_acl);
@@ -260,7 +258,7 @@ out:
 int
 xfs_acl_vremove(
-        vnode_t         *vp,
+        bhv_vnode_t     *vp,
        int             kind)
 {
        int             error;
@@ -268,9 +266,9 @@ xfs_acl_vremove(
        VN_HOLD(vp);
        error = xfs_acl_allow_set(vp, kind);
        if (!error) {
-                VOP_ATTR_REMOVE(vp, kind == _ACL_TYPE_DEFAULT?
+                error = bhv_vop_attr_remove(vp, kind == _ACL_TYPE_DEFAULT?
-                                SGI_ACL_DEFAULT: SGI_ACL_FILE,
+                                                SGI_ACL_DEFAULT: SGI_ACL_FILE,
-                                ATTR_ROOT, sys_cred, error);
+                                                ATTR_ROOT, sys_cred);
                if (error == ENOATTR)
                        error = 0;      /* 'scool */
        }
@@ -280,7 +278,7 @@ xfs_acl_vremove(
 int
 xfs_acl_vset(
-        vnode_t                 *vp,
+        bhv_vnode_t             *vp,
        void                    *acl,
        size_t                  size,
        int                     kind)
@@ -370,10 +368,10 @@ xfs_acl_iaccess(
 STATIC int
 xfs_acl_allow_set(
-        vnode_t         *vp,
+        bhv_vnode_t     *vp,
        int             kind)
 {
-        vattr_t         va;
+        bhv_vattr_t     va;
        int             error;
        if (vp->v_inode.i_flags & (S_IMMUTABLE|S_APPEND))
@@ -383,7 +381,7 @@ xfs_acl_allow_set(
        if (vp->v_vfsp->vfs_flag & VFS_RDONLY)
                return EROFS;
        va.va_mask = XFS_AT_UID;
-        VOP_GETATTR(vp, &va, 0, NULL, error);
+        error = bhv_vop_getattr(vp, &va, 0, NULL);
        if (error)
                return error;
        if (va.va_uid != current->fsuid && !capable(CAP_FOWNER))
@@ -606,7 +604,7 @@ xfs_acl_get_endian(
 */
 STATIC void
 xfs_acl_get_attr(
-        vnode_t         *vp,
+        bhv_vnode_t     *vp,
        xfs_acl_t       *aclp,
        int             kind,
        int             flags,
@@ -616,9 +614,9 @@ xfs_acl_get_attr(
        ASSERT((flags & ATTR_KERNOVAL) ? (aclp == NULL) : 1);
        flags |= ATTR_ROOT;
-        VOP_ATTR_GET(vp,
+        *error = bhv_vop_attr_get(vp, kind == _ACL_TYPE_ACCESS ?
-                kind == _ACL_TYPE_ACCESS ? SGI_ACL_FILE : SGI_ACL_DEFAULT,
+                                        SGI_ACL_FILE : SGI_ACL_DEFAULT,
-                (char *)aclp, &len, flags, sys_cred, *error);
+                                        (char *)aclp, &len, flags, sys_cred);
        if (*error || (flags & ATTR_KERNOVAL))
                return;
        xfs_acl_get_endian(aclp);
@@ -629,7 +627,7 @@ xfs_acl_get_attr(
 */
 STATIC void
 xfs_acl_set_attr(
-        vnode_t         *vp,
+        bhv_vnode_t     *vp,
        xfs_acl_t       *aclp,
        int             kind,
        int             *error)
@@ -654,19 +652,19 @@ xfs_acl_set_attr(
                INT_SET(newace->ae_perm, ARCH_CONVERT, ace->ae_perm);
        }
        INT_SET(newacl->acl_cnt, ARCH_CONVERT, aclp->acl_cnt);
-        VOP_ATTR_SET(vp,
+        *error = bhv_vop_attr_set(vp, kind == _ACL_TYPE_ACCESS ?
-                kind == _ACL_TYPE_ACCESS ? SGI_ACL_FILE: SGI_ACL_DEFAULT,
+                                SGI_ACL_FILE: SGI_ACL_DEFAULT,
-                (char *)newacl, len, ATTR_ROOT, sys_cred, *error);
+                                (char *)newacl, len, ATTR_ROOT, sys_cred);
        _ACL_FREE(newacl);
 }
 int
 xfs_acl_vtoacl(
-        vnode_t         *vp,
+        bhv_vnode_t     *vp,
        xfs_acl_t       *access_acl,
        xfs_acl_t       *default_acl)
 {
-        vattr_t         va;
+        bhv_vattr_t     va;
        int             error = 0;
        if (access_acl) {
@@ -678,7 +676,7 @@ xfs_acl_vtoacl(
                if (!error) {
                        /* Got the ACL, need the mode... */
                        va.va_mask = XFS_AT_MODE;
-                        VOP_GETATTR(vp, &va, 0, sys_cred, error);
+                        error = bhv_vop_getattr(vp, &va, 0, sys_cred);
                }
                if (error)
@@ -701,8 +699,8 @@ xfs_acl_vtoacl(
 */
 int
 xfs_acl_inherit(
-        vnode_t         *vp,
+        bhv_vnode_t     *vp,
-        vattr_t         *vap,
+        bhv_vattr_t     *vap,
        xfs_acl_t       *pdaclp)
 {
        xfs_acl_t       *cacl;
@@ -757,11 +755,11 @@ xfs_acl_inherit(
 */
 STATIC int
 xfs_acl_setmode(
-        vnode_t         *vp,
+        bhv_vnode_t     *vp,
        xfs_acl_t       *acl,
        int             *basicperms)
 {
-        vattr_t         va;
+        bhv_vattr_t     va;
        xfs_acl_entry_t *ap;
        xfs_acl_entry_t *gap = NULL;
        int             i, error, nomask = 1;
@@ -776,7 +774,7 @@ xfs_acl_setmode(
         * mode.  The m:: bits take precedence over the g:: bits.
         */
        va.va_mask = XFS_AT_MODE;
-        VOP_GETATTR(vp, &va, 0, sys_cred, error);
+        error = bhv_vop_getattr(vp, &va, 0, sys_cred);
        if (error)
                return error;
@@ -810,8 +808,7 @@ xfs_acl_setmode(
        if (gap && nomask)
                va.va_mode |= gap->ae_perm << 3;
-        VOP_SETATTR(vp, &va, 0, sys_cred, error);
+        return bhv_vop_setattr(vp, &va, 0, sys_cred);
-        return error;
 }
 /*
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h
index 538d0d65b04c..f853cf1a6270 100644
--- a/fs/xfs/xfs_acl.h
+++ b/fs/xfs/xfs_acl.h
@@ -50,7 +50,7 @@ typedef struct xfs_acl {
 #ifdef CONFIG_XFS_POSIX_ACL
 struct vattr;
-struct vnode;
+struct bhv_vnode;
 struct xfs_inode;
 extern struct kmem_zone *xfs_acl_zone;
@@ -58,14 +58,14 @@ extern struct kmem_zone *xfs_acl_zone;
                (zone) = kmem_zone_init(sizeof(xfs_acl_t), (name))
 #define xfs_acl_zone_destroy(zone)      kmem_zone_destroy(zone)
-extern int xfs_acl_inherit(struct vnode *, struct vattr *, xfs_acl_t *);
+extern int xfs_acl_inherit(struct bhv_vnode *, struct bhv_vattr *, xfs_acl_t *);
 extern int xfs_acl_iaccess(struct xfs_inode *, mode_t, cred_t *);
-extern int xfs_acl_vtoacl(struct vnode *, xfs_acl_t *, xfs_acl_t *);
+extern int xfs_acl_vtoacl(struct bhv_vnode *, xfs_acl_t *, xfs_acl_t *);
-extern int xfs_acl_vhasacl_access(struct vnode *);
+extern int xfs_acl_vhasacl_access(struct bhv_vnode *);
-extern int xfs_acl_vhasacl_default(struct vnode *);
+extern int xfs_acl_vhasacl_default(struct bhv_vnode *);
-extern int xfs_acl_vset(struct vnode *, void *, size_t, int);
+extern int xfs_acl_vset(struct bhv_vnode *, void *, size_t, int);
-extern int xfs_acl_vget(struct vnode *, void *, size_t, int);
+extern int xfs_acl_vget(struct bhv_vnode *, void *, size_t, int);
-extern int xfs_acl_vremove(struct vnode *vp, int);
+extern int xfs_acl_vremove(struct bhv_vnode *, int);
 #define _ACL_TYPE_ACCESS        1
 #define _ACL_TYPE_DEFAULT       2
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 8558226281c4..eef6763f3a67 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -24,14 +24,12 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
@@ -1862,7 +1860,7 @@ xfs_alloc_fix_freelist(
                (pag->pagf_longest - delta) :
                (pag->pagf_flcount > 0 || pag->pagf_longest > 0);
        if (args->minlen + args->alignment + args->minalignslop - 1 > longest ||
-            (args->minleft &&
+            (!(flags & XFS_ALLOC_FLAG_FREEING) &&
             (int)(pag->pagf_freeblks + pag->pagf_flcount -
                   need - args->total) <
             (int)args->minleft)) {
@@ -1898,7 +1896,7 @@ xfs_alloc_fix_freelist(
        longest = (longest > delta) ? (longest - delta) :
                (be32_to_cpu(agf->agf_flcount) > 0 || longest > 0);
        if (args->minlen + args->alignment + args->minalignslop - 1 > longest ||
-             (args->minleft &&
+             (!(flags & XFS_ALLOC_FLAG_FREEING) &&
                (int)(be32_to_cpu(agf->agf_freeblks) +
                   be32_to_cpu(agf->agf_flcount) - need - args->total) <
             (int)args->minleft)) {
@@ -1951,8 +1949,14 @@ xfs_alloc_fix_freelist(
                 * the restrictions correctly.  Can happen for free calls
                 * on a completely full ag.
                 */
-                if (targs.agbno == NULLAGBLOCK)
+                if (targs.agbno == NULLAGBLOCK) {
+                        if (!(flags & XFS_ALLOC_FLAG_FREEING)) {
+                                xfs_trans_brelse(tp, agflbp);
+                                args->agbp = NULL;
+                                return 0;
+                        }
                        break;
+                }
                /*
                 * Put each allocated block on the list.
                 */
@@ -2360,8 +2364,19 @@ xfs_alloc_vextent(
                        if (args->agno == sagno &&
                            type == XFS_ALLOCTYPE_START_BNO)
                                args->type = XFS_ALLOCTYPE_THIS_AG;
-                        if (++(args->agno) == mp->m_sb.sb_agcount)
+                        /*
-                                args->agno = 0;
+                        * For the first allocation, we can try any AG to get
+                        * space.  However, if we already have allocated a
+                        * block, we don't want to try AGs whose number is below
+                        * sagno. Otherwise, we may end up with out-of-order
+                        * locking of AGF, which might cause deadlock.
+                        */
+                        if (++(args->agno) == mp->m_sb.sb_agcount) {
+                                if (args->firstblock != NULLFSBLOCK)
+                                        args->agno = sagno;
+                                else
+                                        args->agno = 0;
+                        }
                        /*
                         * Reached the starting a.g., must either be done
                         * or switch to non-trylock mode.
@@ -2443,7 +2458,7 @@ xfs_free_extent(
        args.minlen = args.minleft = args.minalignslop = 0;
        down_read(&args.mp->m_peraglock);
        args.pag = &args.mp->m_perag[args.agno];
-        if ((error = xfs_alloc_fix_freelist(&args, 0)))
+        if ((error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING)))
                goto error0;
 #ifdef DEBUG
        ASSERT(args.agbp != NULL);
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
index 2d1f8928b267..650591f999ae 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/xfs_alloc.h
@@ -41,6 +41,7 @@ typedef enum xfs_alloctype
 * Flags for xfs_alloc_fix_freelist.
 */
 #define XFS_ALLOC_FLAG_TRYLOCK  0x00000001  /* use trylock for buffer locking */
+#define XFS_ALLOC_FLAG_FREEING  0x00000002  /* indicate caller is freeing extents*/
 /*
 * Argument structure for xfs_alloc routines.
@@ -70,6 +71,7 @@ typedef struct xfs_alloc_arg {
        char            wasfromfl;      /* set if allocation is from freelist */
        char            isfl;           /* set if is freelist blocks - !acctg */
        char            userdata;       /* set if this is user data */
+        xfs_fsblock_t   firstblock;     /* io first block allocated */
 } xfs_alloc_arg_t;
 /*
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
index a1d92da86ccd..7446556e8021 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -24,14 +24,12 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index b6e1e02bbb28..1a2101043275 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -27,7 +27,6 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
@@ -35,7 +34,6 @@
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
@@ -1910,7 +1908,7 @@ xfs_attr_rmtval_get(xfs_da_args_t *args)
                error = xfs_bmapi(args->trans, args->dp, (xfs_fileoff_t)lblkno,
                                  args->rmtblkcnt,
                                  XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
-                                  NULL, 0, map, &nmap, NULL);
+                                  NULL, 0, map, &nmap, NULL, NULL);
                if (error)
                        return(error);
                ASSERT(nmap >= 1);
@@ -1988,7 +1986,7 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
                                  XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA |
                                                        XFS_BMAPI_WRITE,
                                  args->firstblock, args->total, &map, &nmap,
-                                  args->flist);
+                                  args->flist, NULL);
                if (!error) {
                        error = xfs_bmap_finish(&args->trans, args->flist,
                                                *args->firstblock, &committed);
@@ -2039,7 +2037,8 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
                error = xfs_bmapi(NULL, dp, (xfs_fileoff_t)lblkno,
                                  args->rmtblkcnt,
                                  XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
-                                  args->firstblock, 0, &map, &nmap, NULL);
+                                  args->firstblock, 0, &map, &nmap,
+                                  NULL, NULL);
                if (error) {
                        return(error);
                }
@@ -2104,7 +2103,7 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args)
                                        args->rmtblkcnt,
                                        XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
                                        args->firstblock, 0, &map, &nmap,
-                                        args->flist);
+                                        args->flist, NULL);
                if (error) {
                        return(error);
                }
@@ -2142,7 +2141,8 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args)
                XFS_BMAP_INIT(args->flist, args->firstblock);
                error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt,
                                    XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
-                                    1, args->firstblock, args->flist, &done);
+                                    1, args->firstblock, args->flist,
+                                    NULL, &done);
                if (!error) {
                        error = xfs_bmap_finish(&args->trans, args->flist,
                                                *args->firstblock, &committed);
@@ -2322,56 +2322,56 @@ xfs_attr_trace_enter(int type, char *where,
 STATIC int
 posix_acl_access_set(
-        vnode_t *vp, char *name, void *data, size_t size, int xflags)
+        bhv_vnode_t *vp, char *name, void *data, size_t size, int xflags)
 {
        return xfs_acl_vset(vp, data, size, _ACL_TYPE_ACCESS);
 }
 STATIC int
 posix_acl_access_remove(
-        struct vnode *vp, char *name, int xflags)
+        bhv_vnode_t *vp, char *name, int xflags)
 {
        return xfs_acl_vremove(vp, _ACL_TYPE_ACCESS);
 }
 STATIC int
 posix_acl_access_get(
-        vnode_t *vp, char *name, void *data, size_t size, int xflags)
+        bhv_vnode_t *vp, char *name, void *data, size_t size, int xflags)
 {
        return xfs_acl_vget(vp, data, size, _ACL_TYPE_ACCESS);
 }
 STATIC int
 posix_acl_access_exists(
-        vnode_t *vp)
+        bhv_vnode_t *vp)
 {
        return xfs_acl_vhasacl_access(vp);
 }
 STATIC int
 posix_acl_default_set(
-        vnode_t *vp, char *name, void *data, size_t size, int xflags)
+        bhv_vnode_t *vp, char *name, void *data, size_t size, int xflags)
 {
        return xfs_acl_vset(vp, data, size, _ACL_TYPE_DEFAULT);
 }
 STATIC int
 posix_acl_default_get(
-        vnode_t *vp, char *name, void *data, size_t size, int xflags)
+        bhv_vnode_t *vp, char *name, void *data, size_t size, int xflags)
 {
        return xfs_acl_vget(vp, data, size, _ACL_TYPE_DEFAULT);
 }
 STATIC int
 posix_acl_default_remove(
-        struct vnode *vp, char *name, int xflags)
+        bhv_vnode_t *vp, char *name, int xflags)
 {
        return xfs_acl_vremove(vp, _ACL_TYPE_DEFAULT);
 }
 STATIC int
 posix_acl_default_exists(
-        vnode_t *vp)
+        bhv_vnode_t *vp)
 {
        return xfs_acl_vhasacl_default(vp);
 }
@@ -2404,21 +2404,18 @@ STATIC struct attrnames *attr_system_names[] =
 STATIC int
 attr_generic_set(
-        struct vnode *vp, char *name, void *data, size_t size, int xflags)
+        bhv_vnode_t *vp, char *name, void *data, size_t size, int xflags)
 {
-        int     error;
+        return -bhv_vop_attr_set(vp, name, data, size, xflags, NULL);
-        VOP_ATTR_SET(vp, name, data, size, xflags, NULL, error);
-        return -error;
 }
 STATIC int
 attr_generic_get(
-        struct vnode *vp, char *name, void *data, size_t size, int xflags)
+        bhv_vnode_t *vp, char *name, void *data, size_t size, int xflags)
 {
        int     error, asize = size;
-        VOP_ATTR_GET(vp, name, data, &asize, xflags, NULL, error);
+        error = bhv_vop_attr_get(vp, name, data, &asize, xflags, NULL);
        if (!error)
                return asize;
        return -error;
@@ -2426,12 +2423,9 @@ attr_generic_get(
 STATIC int
 attr_generic_remove(
-        struct vnode *vp, char *name, int xflags)
+        bhv_vnode_t *vp, char *name, int xflags)
 {
-        int     error;
+        return -bhv_vop_attr_remove(vp, name, xflags, NULL);
-        VOP_ATTR_REMOVE(vp, name, xflags, NULL, error);
-        return -error;
 }
 STATIC int
@@ -2459,7 +2453,7 @@ attr_generic_listadd(
 STATIC int
 attr_system_list(
-        struct vnode            *vp,
+        bhv_vnode_t             *vp,
        void                    *data,
        size_t                  size,
        ssize_t                 *result)
@@ -2481,12 +2475,12 @@ attr_system_list(
 int
 attr_generic_list(
-        struct vnode *vp, void *data, size_t size, int xflags, ssize_t *result)
+        bhv_vnode_t *vp, void *data, size_t size, int xflags, ssize_t *result)
 {
        attrlist_cursor_kern_t  cursor = { 0 };
        int                     error;
-        VOP_ATTR_LIST(vp, data, size, xflags, &cursor, NULL, error);
+        error = bhv_vop_attr_list(vp, data, size, xflags, &cursor, NULL);
        if (error > 0)
                return -error;
        *result = -error;
@@ -2514,7 +2508,7 @@ attr_lookup_namespace(
 */
 STATIC int
 attr_user_capable(
-        struct vnode    *vp,
+        bhv_vnode_t     *vp,
        cred_t          *cred)
 {
        struct inode    *inode = vn_to_inode(vp);
@@ -2532,7 +2526,7 @@ attr_user_capable(
 STATIC int
 attr_trusted_capable(
-        struct vnode    *vp,
+        bhv_vnode_t     *vp,
        cred_t          *cred)
 {
        struct inode    *inode = vn_to_inode(vp);
@@ -2546,7 +2540,7 @@ attr_trusted_capable(
 STATIC int
 attr_secure_capable(
-        struct vnode    *vp,
+        bhv_vnode_t     *vp,
        cred_t          *cred)
 {
        return -ENOSECURITY;
@@ -2554,7 +2548,7 @@ attr_secure_capable(
 STATIC int
 attr_system_set(
-        struct vnode *vp, char *name, void *data, size_t size, int xflags)
+        bhv_vnode_t *vp, char *name, void *data, size_t size, int xflags)
 {
        attrnames_t     *namesp;
        int             error;
@@ -2573,7 +2567,7 @@ attr_system_set(
 STATIC int
 attr_system_get(
-        struct vnode *vp, char *name, void *data, size_t size, int xflags)
+        bhv_vnode_t *vp, char *name, void *data, size_t size, int xflags)
 {
        attrnames_t     *namesp;
@@ -2585,7 +2579,7 @@ attr_system_get(
 STATIC int
 attr_system_remove(
-        struct vnode *vp, char *name, int xflags)
+        bhv_vnode_t *vp, char *name, int xflags)
 {
        attrnames_t     *namesp;
diff --git a/fs/xfs/xfs_attr.h b/fs/xfs/xfs_attr.h
index b2c7b9fcded3..981633f6c077 100644
--- a/fs/xfs/xfs_attr.h
+++ b/fs/xfs/xfs_attr.h
@@ -36,13 +36,13 @@
 *========================================================================*/
 struct cred;
-struct vnode;
+struct bhv_vnode;
-typedef int (*attrset_t)(struct vnode *, char *, void *, size_t, int);
+typedef int (*attrset_t)(struct bhv_vnode *, char *, void *, size_t, int);
-typedef int (*attrget_t)(struct vnode *, char *, void *, size_t, int);
+typedef int (*attrget_t)(struct bhv_vnode *, char *, void *, size_t, int);
-typedef int (*attrremove_t)(struct vnode *, char *, int);
+typedef int (*attrremove_t)(struct bhv_vnode *, char *, int);
-typedef int (*attrexists_t)(struct vnode *);
+typedef int (*attrexists_t)(struct bhv_vnode *);
-typedef int (*attrcapable_t)(struct vnode *, struct cred *);
+typedef int (*attrcapable_t)(struct bhv_vnode *, struct cred *);
 typedef struct attrnames {
        char *          attr_name;
@@ -63,7 +63,7 @@ extern struct attrnames attr_trusted;
 extern struct attrnames *attr_namespaces[ATTR_NAMECOUNT];
 extern attrnames_t *attr_lookup_namespace(char *, attrnames_t **, int);
-extern int attr_generic_list(struct vnode *, void *, size_t, int, ssize_t *);
+extern int attr_generic_list(struct bhv_vnode *, void *, size_t, int, ssize_t *);
 #define ATTR_DONTFOLLOW 0x0001  /* -- unused, from IRIX -- */
 #define ATTR_ROOT       0x0002  /* use attrs in root (trusted) namespace */
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index 9462be86aa14..9455051f0120 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -24,7 +24,6 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
@@ -34,7 +33,6 @@
 #include "xfs_ialloc_btree.h"
 #include "xfs_alloc.h"
 #include "xfs_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
@@ -2990,7 +2988,7 @@ xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp,
                nmap = 1;
                error = xfs_bmapi(*trans, dp, (xfs_fileoff_t)tblkno, tblkcnt,
                                        XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
-                                        NULL, 0, &map, &nmap, NULL);
+                                        NULL, 0, &map, &nmap, NULL, NULL);
                if (error) {
                        return(error);
                }
diff --git a/fs/xfs/xfs_behavior.h b/fs/xfs/xfs_behavior.h
index 1d8ff103201c..6e6e56fb352d 100644
--- a/fs/xfs/xfs_behavior.h
+++ b/fs/xfs/xfs_behavior.h
@@ -78,15 +78,12 @@
 *
 */
-struct bhv_head_lock;
 /*
 * Behavior head.  Head of the chain of behaviors.
 * Contained within each virtualized object data structure.
 */
 typedef struct bhv_head {
        struct bhv_desc *bh_first;      /* first behavior in chain */
-        struct bhv_head_lock *bh_lockp; /* pointer to lock info struct */
 } bhv_head_t;
 /*
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 26939d364bc4..3a6137539064 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
 * All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
@@ -24,13 +24,11 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_da_btree.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
@@ -40,13 +38,15 @@
 #include "xfs_mount.h"
 #include "xfs_ialloc.h"
 #include "xfs_itable.h"
+#include "xfs_dir2_data.h"
+#include "xfs_dir2_leaf.h"
+#include "xfs_dir2_block.h"
 #include "xfs_inode_item.h"
 #include "xfs_extfree_item.h"
 #include "xfs_alloc.h"
 #include "xfs_bmap.h"
 #include "xfs_rtalloc.h"
 #include "xfs_error.h"
-#include "xfs_dir_leaf.h"
 #include "xfs_attr_leaf.h"
 #include "xfs_rw.h"
 #include "xfs_quota.h"
@@ -101,6 +101,7 @@ xfs_bmap_add_extent(
        xfs_fsblock_t           *first, /* pointer to firstblock variable */
        xfs_bmap_free_t         *flist, /* list of extents to be freed */
        int                     *logflagsp, /* inode logging flags */
+        xfs_extdelta_t          *delta, /* Change made to incore extents */
        int                     whichfork, /* data or attr fork */
        int                     rsvd);  /* OK to allocate reserved blocks */
@@ -118,6 +119,7 @@ xfs_bmap_add_extent_delay_real(
        xfs_fsblock_t           *first, /* pointer to firstblock variable */
        xfs_bmap_free_t         *flist, /* list of extents to be freed */
        int                     *logflagsp, /* inode logging flags */
+        xfs_extdelta_t          *delta, /* Change made to incore extents */
        int                     rsvd);  /* OK to allocate reserved blocks */
 /*
@@ -131,6 +133,7 @@ xfs_bmap_add_extent_hole_delay(
        xfs_btree_cur_t         *cur,   /* if null, not a btree */
        xfs_bmbt_irec_t         *new,   /* new data to add to file extents */
        int                     *logflagsp,/* inode logging flags */
+        xfs_extdelta_t          *delta, /* Change made to incore extents */
        int                     rsvd);  /* OK to allocate reserved blocks */
 /*
@@ -144,6 +147,7 @@ xfs_bmap_add_extent_hole_real(
        xfs_btree_cur_t         *cur,   /* if null, not a btree */
        xfs_bmbt_irec_t         *new,   /* new data to add to file extents */
        int                     *logflagsp, /* inode logging flags */
+        xfs_extdelta_t          *delta, /* Change made to incore extents */
        int                     whichfork); /* data or attr fork */
 /*
@@ -156,7 +160,8 @@ xfs_bmap_add_extent_unwritten_real(
        xfs_extnum_t            idx,    /* extent number to update/insert */
        xfs_btree_cur_t         **curp, /* if *curp is null, not a btree */
        xfs_bmbt_irec_t         *new,   /* new data to add to file extents */
-        int                     *logflagsp); /* inode logging flags */
+        int                     *logflagsp, /* inode logging flags */
+        xfs_extdelta_t          *delta); /* Change made to incore extents */
 /*
 * xfs_bmap_alloc is called by xfs_bmapi to allocate an extent for a file.
@@ -203,6 +208,7 @@ xfs_bmap_del_extent(
        xfs_btree_cur_t         *cur,   /* if null, not a btree */
        xfs_bmbt_irec_t         *new,   /* new data to add to file extents */
        int                     *logflagsp,/* inode logging flags */
+        xfs_extdelta_t          *delta, /* Change made to incore extents */
        int                     whichfork, /* data or attr fork */
        int                     rsvd);   /* OK to allocate reserved blocks */
@@ -510,7 +516,7 @@ xfs_bmap_add_attrfork_local(
                dargs.total = mp->m_dirblkfsbs;
                dargs.whichfork = XFS_DATA_FORK;
                dargs.trans = tp;
-                error = XFS_DIR_SHORTFORM_TO_SINGLE(mp, &dargs);
+                error = xfs_dir2_sf_to_block(&dargs);
        } else
                error = xfs_bmap_local_to_extents(tp, ip, firstblock, 1, flags,
                        XFS_DATA_FORK);
@@ -530,6 +536,7 @@ xfs_bmap_add_extent(
        xfs_fsblock_t           *first, /* pointer to firstblock variable */
        xfs_bmap_free_t         *flist, /* list of extents to be freed */
        int                     *logflagsp, /* inode logging flags */
+        xfs_extdelta_t          *delta, /* Change made to incore extents */
        int                     whichfork, /* data or attr fork */
        int                     rsvd)   /* OK to use reserved data blocks */
 {
@@ -567,6 +574,15 @@ xfs_bmap_add_extent(
                        logflags = XFS_ILOG_CORE | XFS_ILOG_FEXT(whichfork);
                } else
                        logflags = 0;
+                /* DELTA: single new extent */
+                if (delta) {
+                        if (delta->xed_startoff > new->br_startoff)
+                                delta->xed_startoff = new->br_startoff;
+                        if (delta->xed_blockcount <
+                                        new->br_startoff + new->br_blockcount)
+                                delta->xed_blockcount = new->br_startoff +
+                                                new->br_blockcount;
+                }
        }
        /*
         * Any kind of new delayed allocation goes here.
@@ -576,7 +592,7 @@ xfs_bmap_add_extent(
                        ASSERT((cur->bc_private.b.flags &
                                XFS_BTCUR_BPRV_WASDEL) == 0);
                if ((error = xfs_bmap_add_extent_hole_delay(ip, idx, cur, new,
-                                &logflags, rsvd)))
+                                &logflags, delta, rsvd)))
                        goto done;
        }
        /*
@@ -587,7 +603,7 @@ xfs_bmap_add_extent(
                        ASSERT((cur->bc_private.b.flags &
                                XFS_BTCUR_BPRV_WASDEL) == 0);
                if ((error = xfs_bmap_add_extent_hole_real(ip, idx, cur, new,
-                                &logflags, whichfork)))
+                                &logflags, delta, whichfork)))
                        goto done;
        } else {
                xfs_bmbt_irec_t prev;   /* old extent at offset idx */
@@ -612,17 +628,17 @@ xfs_bmap_add_extent(
                                                XFS_BTCUR_BPRV_WASDEL);
                                if ((error = xfs_bmap_add_extent_delay_real(ip,
                                        idx, &cur, new, &da_new, first, flist,
-                                        &logflags, rsvd)))
+                                        &logflags, delta, rsvd)))
                                        goto done;
                        } else if (new->br_state == XFS_EXT_NORM) {
                                ASSERT(new->br_state == XFS_EXT_NORM);
                                if ((error = xfs_bmap_add_extent_unwritten_real(
-                                        ip, idx, &cur, new, &logflags)))
+                                        ip, idx, &cur, new, &logflags, delta)))
                                        goto done;
                        } else {
                                ASSERT(new->br_state == XFS_EXT_UNWRITTEN);
                                if ((error = xfs_bmap_add_extent_unwritten_real(
-                                        ip, idx, &cur, new, &logflags)))
+                                        ip, idx, &cur, new, &logflags, delta)))
                                        goto done;
                        }
                        ASSERT(*curp == cur || *curp == NULL);
@@ -635,7 +651,7 @@ xfs_bmap_add_extent(
                                ASSERT((cur->bc_private.b.flags &
                                        XFS_BTCUR_BPRV_WASDEL) == 0);
                        if ((error = xfs_bmap_add_extent_hole_real(ip, idx, cur,
-                                        new, &logflags, whichfork)))
+                                        new, &logflags, delta, whichfork)))
                                goto done;
                }
        }
@@ -700,6 +716,7 @@ xfs_bmap_add_extent_delay_real(
        xfs_fsblock_t           *first, /* pointer to firstblock variable */
        xfs_bmap_free_t         *flist, /* list of extents to be freed */
        int                     *logflagsp, /* inode logging flags */
+        xfs_extdelta_t          *delta, /* Change made to incore extents */
        int                     rsvd)   /* OK to use reserved data block allocation */
 {
        xfs_btree_cur_t         *cur;   /* btree cursor */
@@ -716,8 +733,8 @@ xfs_bmap_add_extent_delay_real(
                                        /* left is 0, right is 1, prev is 2 */
        int                     rval=0; /* return value (logging flags) */
        int                     state = 0;/* state bits, accessed thru macros */
-        xfs_filblks_t           temp;   /* value for dnew calculations */
+        xfs_filblks_t           temp=0; /* value for dnew calculations */
-        xfs_filblks_t           temp2;  /* value for dnew calculations */
+        xfs_filblks_t           temp2=0;/* value for dnew calculations */
        int                     tmp_rval;       /* partial logging flags */
        enum {                          /* bit number definitions for state */
                LEFT_CONTIG,    RIGHT_CONTIG,
@@ -839,6 +856,11 @@ xfs_bmap_add_extent_delay_real(
                                goto done;
                }
                *dnew = 0;
+                /* DELTA: Three in-core extents are replaced by one. */
+                temp = LEFT.br_startoff;
+                temp2 = LEFT.br_blockcount +
+                        PREV.br_blockcount +
+                        RIGHT.br_blockcount;
                break;
        case MASK3(LEFT_FILLING, RIGHT_FILLING, LEFT_CONTIG):
@@ -872,6 +894,10 @@ xfs_bmap_add_extent_delay_real(
                                goto done;
                }
                *dnew = 0;
+                /* DELTA: Two in-core extents are replaced by one. */
+                temp = LEFT.br_startoff;
+                temp2 = LEFT.br_blockcount +
+                        PREV.br_blockcount;
                break;
        case MASK3(LEFT_FILLING, RIGHT_FILLING, RIGHT_CONTIG):
@@ -906,6 +932,10 @@ xfs_bmap_add_extent_delay_real(
                                goto done;
                }
                *dnew = 0;
+                /* DELTA: Two in-core extents are replaced by one. */
+                temp = PREV.br_startoff;
+                temp2 = PREV.br_blockcount +
+                        RIGHT.br_blockcount;
                break;
        case MASK2(LEFT_FILLING, RIGHT_FILLING):
@@ -936,6 +966,9 @@ xfs_bmap_add_extent_delay_real(
                        ASSERT(i == 1);
                }
                *dnew = 0;
+                /* DELTA: The in-core extent described by new changed type. */
+                temp = new->br_startoff;
+                temp2 = new->br_blockcount;
                break;
        case MASK2(LEFT_FILLING, LEFT_CONTIG):
@@ -978,6 +1011,10 @@ xfs_bmap_add_extent_delay_real(
                xfs_bmap_trace_post_update(fname, "LF|LC", ip, idx,
                        XFS_DATA_FORK);
                *dnew = temp;
+                /* DELTA: The boundary between two in-core extents moved. */
+                temp = LEFT.br_startoff;
+                temp2 = LEFT.br_blockcount +
+                        PREV.br_blockcount;
                break;
        case MASK(LEFT_FILLING):
@@ -1025,6 +1062,9 @@ xfs_bmap_add_extent_delay_real(
                xfs_bmap_trace_post_update(fname, "LF", ip, idx + 1,
                        XFS_DATA_FORK);
                *dnew = temp;
+                /* DELTA: One in-core extent is split in two. */
+                temp = PREV.br_startoff;
+                temp2 = PREV.br_blockcount;
                break;
        case MASK2(RIGHT_FILLING, RIGHT_CONTIG):
@@ -1067,6 +1107,10 @@ xfs_bmap_add_extent_delay_real(
                xfs_bmap_trace_post_update(fname, "RF|RC", ip, idx,
                        XFS_DATA_FORK);
                *dnew = temp;
+                /* DELTA: The boundary between two in-core extents moved. */
+                temp = PREV.br_startoff;
+                temp2 = PREV.br_blockcount +
+                        RIGHT.br_blockcount;
                break;
        case MASK(RIGHT_FILLING):
@@ -1112,6 +1156,9 @@ xfs_bmap_add_extent_delay_real(
                xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp));
                xfs_bmap_trace_post_update(fname, "RF", ip, idx, XFS_DATA_FORK);
                *dnew = temp;
+                /* DELTA: One in-core extent is split in two. */
+                temp = PREV.br_startoff;
+                temp2 = PREV.br_blockcount;
                break;
        case 0:
@@ -1194,6 +1241,9 @@ xfs_bmap_add_extent_delay_real(
                xfs_bmap_trace_post_update(fname, "0", ip, idx + 2,
                        XFS_DATA_FORK);
                *dnew = temp + temp2;
+                /* DELTA: One in-core extent is split in three. */
+                temp = PREV.br_startoff;
+                temp2 = PREV.br_blockcount;
                break;
        case MASK3(LEFT_FILLING, LEFT_CONTIG, RIGHT_CONTIG):
@@ -1209,6 +1259,13 @@ xfs_bmap_add_extent_delay_real(
                ASSERT(0);
        }
        *curp = cur;
+        if (delta) {
+                temp2 += temp;
+                if (delta->xed_startoff > temp)
+                        delta->xed_startoff = temp;
+                if (delta->xed_blockcount < temp2)
+                        delta->xed_blockcount = temp2;
+        }
 done:
        *logflagsp = rval;
        return error;
@@ -1235,7 +1292,8 @@ xfs_bmap_add_extent_unwritten_real(
        xfs_extnum_t            idx,    /* extent number to update/insert */
        xfs_btree_cur_t         **curp, /* if *curp is null, not a btree */
        xfs_bmbt_irec_t         *new,   /* new data to add to file extents */
-        int                     *logflagsp) /* inode logging flags */
+        int                     *logflagsp, /* inode logging flags */
+        xfs_extdelta_t          *delta) /* Change made to incore extents */
 {
        xfs_btree_cur_t         *cur;   /* btree cursor */
        xfs_bmbt_rec_t          *ep;    /* extent entry for idx */
@@ -1252,6 +1310,8 @@ xfs_bmap_add_extent_unwritten_real(
                                        /* left is 0, right is 1, prev is 2 */
        int                     rval=0; /* return value (logging flags) */
        int                     state = 0;/* state bits, accessed thru macros */
+        xfs_filblks_t           temp=0;
+        xfs_filblks_t           temp2=0;
        enum {                          /* bit number definitions for state */
                LEFT_CONTIG,    RIGHT_CONTIG,
                LEFT_FILLING,   RIGHT_FILLING,
@@ -1380,6 +1440,11 @@ xfs_bmap_add_extent_unwritten_real(
                                RIGHT.br_blockcount, LEFT.br_state)))
                                goto done;
                }
+                /* DELTA: Three in-core extents are replaced by one. */
+                temp = LEFT.br_startoff;
+                temp2 = LEFT.br_blockcount +
+                        PREV.br_blockcount +
+                        RIGHT.br_blockcount;
                break;
        case MASK3(LEFT_FILLING, RIGHT_FILLING, LEFT_CONTIG):
@@ -1419,6 +1484,10 @@ xfs_bmap_add_extent_unwritten_real(
                                LEFT.br_state)))
                                goto done;
                }
+                /* DELTA: Two in-core extents are replaced by one. */
+                temp = LEFT.br_startoff;
+                temp2 = LEFT.br_blockcount +
+                        PREV.br_blockcount;
                break;
        case MASK3(LEFT_FILLING, RIGHT_FILLING, RIGHT_CONTIG):
@@ -1459,6 +1528,10 @@ xfs_bmap_add_extent_unwritten_real(
                                newext)))
                                goto done;
                }
+                /* DELTA: Two in-core extents are replaced by one. */
+                temp = PREV.br_startoff;
+                temp2 = PREV.br_blockcount +
+                        RIGHT.br_blockcount;
                break;
        case MASK2(LEFT_FILLING, RIGHT_FILLING):
@@ -1487,6 +1560,9 @@ xfs_bmap_add_extent_unwritten_real(
                                newext)))
                                goto done;
                }
+                /* DELTA: The in-core extent described by new changed type. */
+                temp = new->br_startoff;
+                temp2 = new->br_blockcount;
                break;
        case MASK2(LEFT_FILLING, LEFT_CONTIG):
@@ -1534,6 +1610,10 @@ xfs_bmap_add_extent_unwritten_real(
                                LEFT.br_state))
                                goto done;
                }
+                /* DELTA: The boundary between two in-core extents moved. */
+                temp = LEFT.br_startoff;
+                temp2 = LEFT.br_blockcount +
+                        PREV.br_blockcount;
                break;
        case MASK(LEFT_FILLING):
@@ -1574,6 +1654,9 @@ xfs_bmap_add_extent_unwritten_real(
                                goto done;
                        ASSERT(i == 1);
                }
+                /* DELTA: One in-core extent is split in two. */
+                temp = PREV.br_startoff;
+                temp2 = PREV.br_blockcount;
                break;
        case MASK2(RIGHT_FILLING, RIGHT_CONTIG):
@@ -1617,6 +1700,10 @@ xfs_bmap_add_extent_unwritten_real(
                                newext)))
                                goto done;
                }
+                /* DELTA: The boundary between two in-core extents moved. */
+                temp = PREV.br_startoff;
+                temp2 = PREV.br_blockcount +
+                        RIGHT.br_blockcount;
                break;
        case MASK(RIGHT_FILLING):
@@ -1657,6 +1744,9 @@ xfs_bmap_add_extent_unwritten_real(
                                goto done;
                        ASSERT(i == 1);
                }
+                /* DELTA: One in-core extent is split in two. */
+                temp = PREV.br_startoff;
+                temp2 = PREV.br_blockcount;
                break;
        case 0:
@@ -1710,6 +1800,9 @@ xfs_bmap_add_extent_unwritten_real(
                                goto done;
                        ASSERT(i == 1);
                }
+                /* DELTA: One in-core extent is split in three. */
+                temp = PREV.br_startoff;
+                temp2 = PREV.br_blockcount;
                break;
        case MASK3(LEFT_FILLING, LEFT_CONTIG, RIGHT_CONTIG):
@@ -1725,6 +1818,13 @@ xfs_bmap_add_extent_unwritten_real(
                ASSERT(0);
        }
        *curp = cur;
+        if (delta) {
+                temp2 += temp;
+                if (delta->xed_startoff > temp)
+                        delta->xed_startoff = temp;
+                if (delta->xed_blockcount < temp2)
+                        delta->xed_blockcount = temp2;
+        }
 done:
        *logflagsp = rval;
        return error;
@@ -1753,6 +1853,7 @@ xfs_bmap_add_extent_hole_delay(
        xfs_btree_cur_t         *cur,   /* if null, not a btree */
        xfs_bmbt_irec_t         *new,   /* new data to add to file extents */
        int                     *logflagsp, /* inode logging flags */
+        xfs_extdelta_t          *delta, /* Change made to incore extents */
        int                     rsvd)           /* OK to allocate reserved blocks */
 {
        xfs_bmbt_rec_t          *ep;    /* extent record for idx */
@@ -1765,7 +1866,8 @@ xfs_bmap_add_extent_hole_delay(
        xfs_filblks_t           oldlen=0;       /* old indirect size */
        xfs_bmbt_irec_t         right;  /* right neighbor extent entry */
        int                     state;  /* state bits, accessed thru macros */
-        xfs_filblks_t           temp;   /* temp for indirect calculations */
+        xfs_filblks_t           temp=0; /* temp for indirect calculations */
+        xfs_filblks_t           temp2=0;
        enum {                          /* bit number definitions for state */
                LEFT_CONTIG,    RIGHT_CONTIG,
                LEFT_DELAY,     RIGHT_DELAY,
@@ -1844,6 +1946,9 @@ xfs_bmap_add_extent_hole_delay(
                        XFS_DATA_FORK);
                xfs_iext_remove(ifp, idx, 1);
                ip->i_df.if_lastex = idx - 1;
+                /* DELTA: Two in-core extents were replaced by one. */
+                temp2 = temp;
+                temp = left.br_startoff;
                break;
        case MASK(LEFT_CONTIG):
@@ -1864,6 +1969,9 @@ xfs_bmap_add_extent_hole_delay(
                xfs_bmap_trace_post_update(fname, "LC", ip, idx - 1,
                        XFS_DATA_FORK);
                ip->i_df.if_lastex = idx - 1;
+                /* DELTA: One in-core extent grew into a hole. */
+                temp2 = temp;
+                temp = left.br_startoff;
                break;
        case MASK(RIGHT_CONTIG):
@@ -1881,6 +1989,9 @@ xfs_bmap_add_extent_hole_delay(
                        NULLSTARTBLOCK((int)newlen), temp, right.br_state);
                xfs_bmap_trace_post_update(fname, "RC", ip, idx, XFS_DATA_FORK);
                ip->i_df.if_lastex = idx;
+                /* DELTA: One in-core extent grew into a hole. */
+                temp2 = temp;
+                temp = new->br_startoff;
                break;
        case 0:
@@ -1894,6 +2005,9 @@ xfs_bmap_add_extent_hole_delay(
                        XFS_DATA_FORK);
                xfs_iext_insert(ifp, idx, 1, new);
                ip->i_df.if_lastex = idx;
+                /* DELTA: A new in-core extent was added in a hole. */
+                temp2 = new->br_blockcount;
+                temp = new->br_startoff;
                break;
        }
        if (oldlen != newlen) {
@@ -1904,6 +2018,13 @@ xfs_bmap_add_extent_hole_delay(
                 * Nothing to do for disk quota accounting here.
                 */
        }
+        if (delta) {
+                temp2 += temp;
+                if (delta->xed_startoff > temp)
+                        delta->xed_startoff = temp;
+                if (delta->xed_blockcount < temp2)
+                        delta->xed_blockcount = temp2;
+        }
        *logflagsp = 0;
        return 0;
 #undef  MASK
@@ -1925,6 +2046,7 @@ xfs_bmap_add_extent_hole_real(
        xfs_btree_cur_t         *cur,   /* if null, not a btree */
        xfs_bmbt_irec_t         *new,   /* new data to add to file extents */
        int                     *logflagsp, /* inode logging flags */
+        xfs_extdelta_t          *delta, /* Change made to incore extents */
        int                     whichfork) /* data or attr fork */
 {
        xfs_bmbt_rec_t          *ep;    /* pointer to extent entry ins. point */
@@ -1936,7 +2058,10 @@ xfs_bmap_add_extent_hole_real(
        xfs_ifork_t             *ifp;   /* inode fork pointer */
        xfs_bmbt_irec_t         left;   /* left neighbor extent entry */
        xfs_bmbt_irec_t         right;  /* right neighbor extent entry */
+        int                     rval=0; /* return value (logging flags) */
        int                     state;  /* state bits, accessed thru macros */
+        xfs_filblks_t           temp=0;
+        xfs_filblks_t           temp2=0;
        enum {                          /* bit number definitions for state */
                LEFT_CONTIG,    RIGHT_CONTIG,
                LEFT_DELAY,     RIGHT_DELAY,
@@ -1993,6 +2118,7 @@ xfs_bmap_add_extent_hole_real(
                 left.br_blockcount + new->br_blockcount +
                     right.br_blockcount <= MAXEXTLEN));
+        error = 0;
        /*
         * Select which case we're in here, and implement it.
         */
@@ -2018,25 +2144,35 @@ xfs_bmap_add_extent_hole_real(
                XFS_IFORK_NEXT_SET(ip, whichfork,
                        XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
                if (cur == NULL) {
-                        *logflagsp = XFS_ILOG_CORE | XFS_ILOG_FEXT(whichfork);
+                        rval = XFS_ILOG_CORE | XFS_ILOG_FEXT(whichfork);
-                        return 0;
+                } else {
+                        rval = XFS_ILOG_CORE;
+                        if ((error = xfs_bmbt_lookup_eq(cur,
+                                        right.br_startoff,
+                                        right.br_startblock,
+                                        right.br_blockcount, &i)))
+                                goto done;
+                        ASSERT(i == 1);
+                        if ((error = xfs_bmbt_delete(cur, &i)))
+                                goto done;
+                        ASSERT(i == 1);
+                        if ((error = xfs_bmbt_decrement(cur, 0, &i)))
+                                goto done;
+                        ASSERT(i == 1);
+                        if ((error = xfs_bmbt_update(cur, left.br_startoff,
+                                        left.br_startblock,
+                                        left.br_blockcount +
+                                                new->br_blockcount +
+                                                right.br_blockcount,
+                                        left.br_state)))
+                                goto done;
                }
-                *logflagsp = XFS_ILOG_CORE;
+                /* DELTA: Two in-core extents were replaced by one. */
-                if ((error = xfs_bmbt_lookup_eq(cur, right.br_startoff,
+                temp = left.br_startoff;
-                                right.br_startblock, right.br_blockcount, &i)))
+                temp2 = left.br_blockcount +
-                        return error;
+                        new->br_blockcount +
-                ASSERT(i == 1);
+                        right.br_blockcount;
-                if ((error = xfs_bmbt_delete(cur, &i)))
+                break;
-                        return error;
-                ASSERT(i == 1);
-                if ((error = xfs_bmbt_decrement(cur, 0, &i)))
-                        return error;
-                ASSERT(i == 1);
-                error = xfs_bmbt_update(cur, left.br_startoff,
-                                left.br_startblock,
-                                left.br_blockcount + new->br_blockcount +
-                                right.br_blockcount, left.br_state);
-                return error;
        case MASK(LEFT_CONTIG):
                /*
@@ -2050,19 +2186,27 @@ xfs_bmap_add_extent_hole_real(
                xfs_bmap_trace_post_update(fname, "LC", ip, idx - 1, whichfork);
                ifp->if_lastex = idx - 1;
                if (cur == NULL) {
-                        *logflagsp = XFS_ILOG_FEXT(whichfork);
+                        rval = XFS_ILOG_FEXT(whichfork);
-                        return 0;
+                } else {
+                        rval = 0;
+                        if ((error = xfs_bmbt_lookup_eq(cur,
+                                        left.br_startoff,
+                                        left.br_startblock,
+                                        left.br_blockcount, &i)))
+                                goto done;
+                        ASSERT(i == 1);
+                        if ((error = xfs_bmbt_update(cur, left.br_startoff,
+                                        left.br_startblock,
+                                        left.br_blockcount +
+                                                new->br_blockcount,
+                                        left.br_state)))
+                                goto done;
                }
-                *logflagsp = 0;
+                /* DELTA: One in-core extent grew. */
-                if ((error = xfs_bmbt_lookup_eq(cur, left.br_startoff,
+                temp = left.br_startoff;
-                                left.br_startblock, left.br_blockcount, &i)))
+                temp2 = left.br_blockcount +
-                        return error;
+                        new->br_blockcount;
-                ASSERT(i == 1);
+                break;
-                error = xfs_bmbt_update(cur, left.br_startoff,
-                                left.br_startblock,
-                                left.br_blockcount + new->br_blockcount,
-                                left.br_state);
-                return error;
        case MASK(RIGHT_CONTIG):
                /*
@@ -2077,19 +2221,27 @@ xfs_bmap_add_extent_hole_real(
                xfs_bmap_trace_post_update(fname, "RC", ip, idx, whichfork);
                ifp->if_lastex = idx;
                if (cur == NULL) {
-                        *logflagsp = XFS_ILOG_FEXT(whichfork);
+                        rval = XFS_ILOG_FEXT(whichfork);
-                        return 0;
+                } else {
+                        rval = 0;
+                        if ((error = xfs_bmbt_lookup_eq(cur,
+                                        right.br_startoff,
+                                        right.br_startblock,
+                                        right.br_blockcount, &i)))
+                                goto done;
+                        ASSERT(i == 1);
+                        if ((error = xfs_bmbt_update(cur, new->br_startoff,
+                                        new->br_startblock,
+                                        new->br_blockcount +
+                                                right.br_blockcount,
+                                        right.br_state)))
+                                goto done;
                }
-                *logflagsp = 0;
+                /* DELTA: One in-core extent grew. */
-                if ((error = xfs_bmbt_lookup_eq(cur, right.br_startoff,
+                temp = new->br_startoff;
-                                right.br_startblock, right.br_blockcount, &i)))
+                temp2 = new->br_blockcount +
-                        return error;
+                        right.br_blockcount;
-                ASSERT(i == 1);
+                break;
-                error = xfs_bmbt_update(cur, new->br_startoff,
-                                new->br_startblock,
-                                new->br_blockcount + right.br_blockcount,
-                                right.br_state);
-                return error;
        case 0:
                /*
@@ -2104,29 +2256,41 @@ xfs_bmap_add_extent_hole_real(
                XFS_IFORK_NEXT_SET(ip, whichfork,
                        XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
                if (cur == NULL) {
-                        *logflagsp = XFS_ILOG_CORE | XFS_ILOG_FEXT(whichfork);
+                        rval = XFS_ILOG_CORE | XFS_ILOG_FEXT(whichfork);
-                        return 0;
+                } else {
+                        rval = XFS_ILOG_CORE;
+                        if ((error = xfs_bmbt_lookup_eq(cur,
+                                        new->br_startoff,
+                                        new->br_startblock,
+                                        new->br_blockcount, &i)))
+                                goto done;
+                        ASSERT(i == 0);
+                        cur->bc_rec.b.br_state = new->br_state;
+                        if ((error = xfs_bmbt_insert(cur, &i)))
+                                goto done;
+                        ASSERT(i == 1);
                }
-                *logflagsp = XFS_ILOG_CORE;
+                /* DELTA: A new extent was added in a hole. */
-                if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff,
+                temp = new->br_startoff;
-                                new->br_startblock, new->br_blockcount, &i)))
+                temp2 = new->br_blockcount;
-                        return error;
+                break;
-                ASSERT(i == 0);
+        }
-                cur->bc_rec.b.br_state = new->br_state;
+        if (delta) {
-                if ((error = xfs_bmbt_insert(cur, &i)))
+                temp2 += temp;
-                        return error;
+                if (delta->xed_startoff > temp)
-                ASSERT(i == 1);
+                        delta->xed_startoff = temp;
-                return 0;
+                if (delta->xed_blockcount < temp2)
+                        delta->xed_blockcount = temp2;
        }
+done:
+        *logflagsp = rval;
+        return error;
 #undef  MASK
 #undef  MASK2
 #undef  STATE_SET
 #undef  STATE_TEST
 #undef  STATE_SET_TEST
 #undef  SWITCH_STATE
-        /* NOTREACHED */
-        ASSERT(0);
-        return 0; /* keep gcc quite */
 }
 /*
@@ -2598,6 +2762,7 @@ xfs_bmap_btalloc(
        args.mp = mp;
        args.fsbno = ap->rval;
        args.maxlen = MIN(ap->alen, mp->m_sb.sb_agblocks);
+        args.firstblock = ap->firstblock;
        blen = 0;
        if (nullfb) {
                args.type = XFS_ALLOCTYPE_START_BNO;
@@ -2657,7 +2822,7 @@ xfs_bmap_btalloc(
                else
                        args.minlen = ap->alen;
        } else if (ap->low) {
-                args.type = XFS_ALLOCTYPE_FIRST_AG;
+                args.type = XFS_ALLOCTYPE_START_BNO;
                args.total = args.minlen = ap->minlen;
        } else {
                args.type = XFS_ALLOCTYPE_NEAR_BNO;
@@ -2669,7 +2834,7 @@ xfs_bmap_btalloc(
                args.prod = ap->ip->i_d.di_extsize;
                if ((args.mod = (xfs_extlen_t)do_mod(ap->off, args.prod)))
                        args.mod = (xfs_extlen_t)(args.prod - args.mod);
-        } else if (unlikely(mp->m_sb.sb_blocksize >= NBPP)) {
+        } else if (mp->m_sb.sb_blocksize >= NBPP) {
                args.prod = 1;
                args.mod = 0;
        } else {
@@ -2885,6 +3050,7 @@ xfs_bmap_del_extent(
        xfs_btree_cur_t         *cur,   /* if null, not a btree */
        xfs_bmbt_irec_t         *del,   /* data to remove from extents */
        int                     *logflagsp, /* inode logging flags */
+        xfs_extdelta_t          *delta, /* Change made to incore extents */
        int                     whichfork, /* data or attr fork */
        int                     rsvd)   /* OK to allocate reserved blocks */
 {
@@ -3193,6 +3359,14 @@ xfs_bmap_del_extent(
        if (da_old > da_new)
                xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS, (int)(da_old - da_new),
                        rsvd);
+        if (delta) {
+                /* DELTA: report the original extent. */
+                if (delta->xed_startoff > got.br_startoff)
+                        delta->xed_startoff = got.br_startoff;
+                if (delta->xed_blockcount < got.br_startoff+got.br_blockcount)
+                        delta->xed_blockcount = got.br_startoff +
+                                                        got.br_blockcount;
+        }
 done:
        *logflagsp = flags;
        return error;
@@ -3279,6 +3453,7 @@ xfs_bmap_extents_to_btree(
        XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_BTREE);
        args.tp = tp;
        args.mp = mp;
+        args.firstblock = *firstblock;
        if (*firstblock == NULLFSBLOCK) {
                args.type = XFS_ALLOCTYPE_START_BNO;
                args.fsbno = XFS_INO_TO_FSB(mp, ip->i_ino);
@@ -3414,6 +3589,7 @@ xfs_bmap_local_to_extents(
                args.tp = tp;
                args.mp = ip->i_mount;
+                args.firstblock = *firstblock;
                ASSERT((ifp->if_flags &
                        (XFS_IFINLINE|XFS_IFEXTENTS|XFS_IFEXTIREC)) == XFS_IFINLINE);
                /*
@@ -3753,7 +3929,7 @@ xfs_bunmap_trace(
        if (ip->i_rwtrace == NULL)
                return;
        ktrace_enter(ip->i_rwtrace,
-                (void *)(__psint_t)XFS_BUNMAPI,
+                (void *)(__psint_t)XFS_BUNMAP,
                (void *)ip,
                (void *)(__psint_t)((ip->i_d.di_size >> 32) & 0xffffffff),
                (void *)(__psint_t)(ip->i_d.di_size & 0xffffffff),
@@ -4087,8 +4263,8 @@ xfs_bmap_finish(
                        if (!XFS_FORCED_SHUTDOWN(mp))
                                xfs_force_shutdown(mp,
                                                   (error == EFSCORRUPTED) ?
-                                                   XFS_CORRUPT_INCORE :
+                                                   SHUTDOWN_CORRUPT_INCORE :
-                                                   XFS_METADATA_IO_ERROR);
+                                                   SHUTDOWN_META_IO_ERROR);
                        return error;
                }
                xfs_trans_log_efd_extent(ntp, efd, free->xbfi_startblock,
@@ -4538,7 +4714,8 @@ xfs_bmapi(
        xfs_extlen_t    total,          /* total blocks needed */
        xfs_bmbt_irec_t *mval,          /* output: map values */
        int             *nmap,          /* i/o: mval size/count */
-        xfs_bmap_free_t *flist)         /* i/o: list extents to free */
+        xfs_bmap_free_t *flist,         /* i/o: list extents to free */
+        xfs_extdelta_t  *delta)         /* o: change made to incore extents */
 {
        xfs_fsblock_t   abno;           /* allocated block number */
        xfs_extlen_t    alen;           /* allocated extent length */
@@ -4650,6 +4827,10 @@ xfs_bmapi(
        end = bno + len;
        obno = bno;
        bma.ip = NULL;
+        if (delta) {
+                delta->xed_startoff = NULLFILEOFF;
+                delta->xed_blockcount = 0;
+        }
        while (bno < end && n < *nmap) {
                /*
                 * Reading past eof, act as though there's a hole
@@ -4886,8 +5067,8 @@ xfs_bmapi(
                                        got.br_state = XFS_EXT_UNWRITTEN;
                        }
                        error = xfs_bmap_add_extent(ip, lastx, &cur, &got,
-                                firstblock, flist, &tmp_logflags, whichfork,
+                                firstblock, flist, &tmp_logflags, delta,
-                                (flags & XFS_BMAPI_RSVBLOCKS));
+                                whichfork, (flags & XFS_BMAPI_RSVBLOCKS));
                        logflags |= tmp_logflags;
                        if (error)
                                goto error0;
@@ -4983,8 +5164,8 @@ xfs_bmapi(
                        }
                        mval->br_state = XFS_EXT_NORM;
                        error = xfs_bmap_add_extent(ip, lastx, &cur, mval,
-                                firstblock, flist, &tmp_logflags, whichfork,
+                                firstblock, flist, &tmp_logflags, delta,
-                                (flags & XFS_BMAPI_RSVBLOCKS));
+                                whichfork, (flags & XFS_BMAPI_RSVBLOCKS));
                        logflags |= tmp_logflags;
                        if (error)
                                goto error0;
@@ -5073,7 +5254,14 @@ xfs_bmapi(
        ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE ||
               XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max);
        error = 0;
+        if (delta && delta->xed_startoff != NULLFILEOFF) {
+                /* A change was actually made.
+                 * Note that delta->xed_blockcount is an offset at this
+                 * point and needs to be converted to a block count.
+                 */
+                ASSERT(delta->xed_blockcount > delta->xed_startoff);
+                delta->xed_blockcount -= delta->xed_startoff;
+        }
 error0:
        /*
         * Log everything.  Do this after conversion, there's no point in
@@ -5185,6 +5373,8 @@ xfs_bunmapi(
        xfs_fsblock_t           *firstblock,    /* first allocated block
                                                   controls a.g. for allocs */
        xfs_bmap_free_t         *flist,         /* i/o: list extents to free */
+        xfs_extdelta_t          *delta,         /* o: change made to incore
+                                                   extents */
        int                     *done)          /* set if not done yet */
 {
        xfs_btree_cur_t         *cur;           /* bmap btree cursor */
@@ -5242,6 +5432,10 @@ xfs_bunmapi(
        bno = start + len - 1;
        ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got,
                &prev);
+        if (delta) {
+                delta->xed_startoff = NULLFILEOFF;
+                delta->xed_blockcount = 0;
+        }
        /*
         * Check to see if the given block number is past the end of the
         * file, back up to the last block if so...
@@ -5340,7 +5534,8 @@ xfs_bunmapi(
                        }
                        del.br_state = XFS_EXT_UNWRITTEN;
                        error = xfs_bmap_add_extent(ip, lastx, &cur, &del,
-                                firstblock, flist, &logflags, XFS_DATA_FORK, 0);
+                                firstblock, flist, &logflags, delta,
+                                XFS_DATA_FORK, 0);
                        if (error)
                                goto error0;
                        goto nodelete;
@@ -5394,7 +5589,7 @@ xfs_bunmapi(
                                prev.br_state = XFS_EXT_UNWRITTEN;
                                error = xfs_bmap_add_extent(ip, lastx - 1, &cur,
                                        &prev, firstblock, flist, &logflags,
-                                        XFS_DATA_FORK, 0);
+                                        delta, XFS_DATA_FORK, 0);
                                if (error)
                                        goto error0;
                                goto nodelete;
@@ -5403,7 +5598,7 @@ xfs_bunmapi(
                                del.br_state = XFS_EXT_UNWRITTEN;
                                error = xfs_bmap_add_extent(ip, lastx, &cur,
                                        &del, firstblock, flist, &logflags,
-                                        XFS_DATA_FORK, 0);
+                                        delta, XFS_DATA_FORK, 0);
                                if (error)
                                        goto error0;
                                goto nodelete;
@@ -5456,7 +5651,7 @@ xfs_bunmapi(
                        goto error0;
                }
                error = xfs_bmap_del_extent(ip, tp, lastx, flist, cur, &del,
-                        &tmp_logflags, whichfork, rsvd);
+                                &tmp_logflags, delta, whichfork, rsvd);
                logflags |= tmp_logflags;
                if (error)
                        goto error0;
@@ -5513,6 +5708,14 @@ nodelete:
        ASSERT(ifp->if_ext_max ==
               XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
        error = 0;
+        if (delta && delta->xed_startoff != NULLFILEOFF) {
+                /* A change was actually made.
+                 * Note that delta->xed_blockcount is an offset at this
+                 * point and needs to be converted to a block count.
+                 */
+                ASSERT(delta->xed_blockcount > delta->xed_startoff);
+                delta->xed_blockcount -= delta->xed_startoff;
+        }
 error0:
        /*
         * Log everything.  Do this after conversion, there's no point in
@@ -5556,7 +5759,7 @@ xfs_getbmap(
        __int64_t               fixlen;         /* length for -1 case */
        int                     i;              /* extent number */
        xfs_inode_t             *ip;            /* xfs incore inode pointer */
-        vnode_t                 *vp;            /* corresponding vnode */
+        bhv_vnode_t             *vp;            /* corresponding vnode */
        int                     lock;           /* lock state */
        xfs_bmbt_irec_t         *map;           /* buffer for user's data */
        xfs_mount_t             *mp;            /* file system mount point */
@@ -5653,7 +5856,7 @@ xfs_getbmap(
        if (whichfork == XFS_DATA_FORK && ip->i_delayed_blks) {
                /* xfs_fsize_t last_byte = xfs_file_last_byte(ip); */
-                VOP_FLUSH_PAGES(vp, (xfs_off_t)0, -1, 0, FI_REMAPF, error);
+                error = bhv_vop_flush_pages(vp, (xfs_off_t)0, -1, 0, FI_REMAPF);
        }
        ASSERT(whichfork == XFS_ATTR_FORK || ip->i_delayed_blks == 0);
@@ -5689,7 +5892,8 @@ xfs_getbmap(
                nmap = (nexleft > subnex) ? subnex : nexleft;
                error = xfs_bmapi(NULL, ip, XFS_BB_TO_FSBT(mp, bmv->bmv_offset),
                                  XFS_BB_TO_FSB(mp, bmv->bmv_length),
-                                  bmapi_flags, NULL, 0, map, &nmap, NULL);
+                                  bmapi_flags, NULL, 0, map, &nmap,
+                                  NULL, NULL);
                if (error)
                        goto unlock_and_return;
                ASSERT(nmap <= subnex);
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index 8e0d73d9ccc4..80e93409b78d 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
 * All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
@@ -26,6 +26,20 @@ struct xfs_mount;
 struct xfs_trans;
 /*
+ * DELTA: describe a change to the in-core extent list.
+ *
+ * Internally the use of xed_blockcount is somewhat funky.
+ * xed_blockcount contains an offset much of the time because this
+ * makes merging changes easier.  (xfs_fileoff_t and xfs_filblks_t are
+ * the same underlying type).
+ */
+typedef struct xfs_extdelta
+{
+        xfs_fileoff_t           xed_startoff;   /* offset of range */
+        xfs_filblks_t           xed_blockcount; /* blocks in range */
+} xfs_extdelta_t;
+/*
 * List of extents to be free "later".
 * The list is kept sorted on xbf_startblock.
 */
@@ -275,7 +289,9 @@ xfs_bmapi(
        xfs_extlen_t            total,          /* total blocks needed */
        struct xfs_bmbt_irec    *mval,          /* output: map values */
        int                     *nmap,          /* i/o: mval size/count */
-        xfs_bmap_free_t         *flist);        /* i/o: list extents to free */
+        xfs_bmap_free_t         *flist,         /* i/o: list extents to free */
+        xfs_extdelta_t          *delta);        /* o: change made to incore
+                                                   extents */
 /*
 * Map file blocks to filesystem blocks, simple version.
@@ -309,6 +325,8 @@ xfs_bunmapi(
        xfs_fsblock_t           *firstblock,    /* first allocated block
                                                   controls a.g. for allocs */
        xfs_bmap_free_t         *flist,         /* i/o: list extents to free */
+        xfs_extdelta_t          *delta,         /* o: change made to incore
+                                                   extents */
        int                     *done);         /* set if not done yet */
 /*
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index bea44709afbe..18fb7385d719 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -24,14 +24,12 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
@@ -1569,12 +1567,11 @@ xfs_bmbt_split(
        lbno = XFS_DADDR_TO_FSB(args.mp, XFS_BUF_ADDR(lbp));
        left = XFS_BUF_TO_BMBT_BLOCK(lbp);
        args.fsbno = cur->bc_private.b.firstblock;
+        args.firstblock = args.fsbno;
        if (args.fsbno == NULLFSBLOCK) {
                args.fsbno = lbno;
                args.type = XFS_ALLOCTYPE_START_BNO;
-        } else if (cur->bc_private.b.flist->xbf_low)
+        } else
-                args.type = XFS_ALLOCTYPE_FIRST_AG;
-        else
                args.type = XFS_ALLOCTYPE_NEAR_BNO;
        args.mod = args.minleft = args.alignment = args.total = args.isfl =
                args.userdata = args.minalignslop = 0;
@@ -2356,6 +2353,7 @@ xfs_bmbt_newroot(
                args.userdata = args.minalignslop = 0;
        args.minlen = args.maxlen = args.prod = 1;
        args.wasdel = cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL;
+        args.firstblock = args.fsbno;
        if (args.fsbno == NULLFSBLOCK) {
 #ifdef DEBUG
                if ((error = xfs_btree_check_lptr(cur, INT_GET(*pp, ARCH_CONVERT), level))) {
@@ -2365,9 +2363,7 @@ xfs_bmbt_newroot(
 #endif
                args.fsbno = INT_GET(*pp, ARCH_CONVERT);
                args.type = XFS_ALLOCTYPE_START_BNO;
-        } else if (args.wasdel)
+        } else
-                args.type = XFS_ALLOCTYPE_FIRST_AG;
-        else
                args.type = XFS_ALLOCTYPE_NEAR_BNO;
        if ((error = xfs_alloc_vextent(&args))) {
                XFS_BMBT_TRACE_CURSOR(cur, ERROR);
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index 52d5d095fc35..ee2255bd6562 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -24,14 +24,12 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 5fed15682dda..a4aa53974f76 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -23,7 +23,6 @@
 #include "xfs_inum.h"
 #include "xfs_trans.h"
 #include "xfs_sb.h"
-#include "xfs_dir.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_buf_item.h"
@@ -1030,9 +1029,9 @@ xfs_buf_iodone_callbacks(
                if ((XFS_BUF_TARGET(bp) != lasttarg) ||
                    (time_after(jiffies, (lasttime + 5*HZ)))) {
                        lasttime = jiffies;
-                        prdev("XFS write error in file system meta-data "
+                        cmn_err(CE_ALERT, "Device %s, XFS metadata write error"
-                              "block 0x%llx in %s",
+                                        " block 0x%llx in %s",
-                              XFS_BUF_TARGET(bp),
+                                XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)),
                              (__uint64_t)XFS_BUF_ADDR(bp), mp->m_fsname);
                }
                lasttarg = XFS_BUF_TARGET(bp);
@@ -1108,7 +1107,7 @@ xfs_buf_error_relse(
        XFS_BUF_ERROR(bp,0);
        xfs_buftrace("BUF_ERROR_RELSE", bp);
        if (! XFS_FORCED_SHUTDOWN(mp))
-                xfs_force_shutdown(mp, XFS_METADATA_IO_ERROR);
+                xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
        /*
         * We have to unpin the pinned buffers so do the
         * callbacks.
diff --git a/fs/xfs/xfs_cap.h b/fs/xfs/xfs_cap.h
index d0035c6e9514..7a0e482dd436 100644
--- a/fs/xfs/xfs_cap.h
+++ b/fs/xfs/xfs_cap.h
@@ -49,12 +49,12 @@ typedef struct xfs_cap_set {
 #include <linux/posix_cap_xattr.h>
-struct vnode;
+struct bhv_vnode;
-extern int xfs_cap_vhascap(struct vnode *);
+extern int xfs_cap_vhascap(struct bhv_vnode *);
-extern int xfs_cap_vset(struct vnode *, void *, size_t);
+extern int xfs_cap_vset(struct bhv_vnode *, void *, size_t);
-extern int xfs_cap_vget(struct vnode *, void *, size_t);
+extern int xfs_cap_vget(struct bhv_vnode *, void *, size_t);
-extern int xfs_cap_vremove(struct vnode *vp);
+extern int xfs_cap_vremove(struct bhv_vnode *);
 #define _CAP_EXISTS             xfs_cap_vhascap
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index 8988b9051175..32ab61d17ace 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -24,7 +24,6 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
@@ -32,7 +31,6 @@
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
@@ -43,7 +41,6 @@
 #include "xfs_bmap.h"
 #include "xfs_attr.h"
 #include "xfs_attr_leaf.h"
-#include "xfs_dir_leaf.h"
 #include "xfs_dir2_data.h"
 #include "xfs_dir2_leaf.h"
 #include "xfs_dir2_block.h"
@@ -159,7 +156,7 @@ xfs_da_split(xfs_da_state_t *state)
        max = state->path.active - 1;
        ASSERT((max >= 0) && (max < XFS_DA_NODE_MAXDEPTH));
        ASSERT(state->path.blk[max].magic == XFS_ATTR_LEAF_MAGIC ||
-               state->path.blk[max].magic == XFS_DIRX_LEAF_MAGIC(state->mp));
+               state->path.blk[max].magic == XFS_DIR2_LEAFN_MAGIC);
        addblk = &state->path.blk[max];         /* initial dummy value */
        for (i = max; (i >= 0) && addblk; state->path.active--, i--) {
@@ -199,38 +196,7 @@ xfs_da_split(xfs_da_state_t *state)
                                return(error);  /* GROT: attr inconsistent */
                        addblk = newblk;
                        break;
-                case XFS_DIR_LEAF_MAGIC:
-                        ASSERT(XFS_DIR_IS_V1(state->mp));
-                        error = xfs_dir_leaf_split(state, oldblk, newblk);
-                        if ((error != 0) && (error != ENOSPC)) {
-                                return(error);  /* GROT: dir is inconsistent */
-                        }
-                        if (!error) {
-                                addblk = newblk;
-                                break;
-                        }
-                        /*
-                         * Entry wouldn't fit, split the leaf again.
-                         */
-                        state->extravalid = 1;
-                        if (state->inleaf) {
-                                state->extraafter = 0;  /* before newblk */
-                                error = xfs_dir_leaf_split(state, oldblk,
-                                                           &state->extrablk);
-                                if (error)
-                                        return(error);  /* GROT: dir incon. */
-                                addblk = newblk;
-                        } else {
-                                state->extraafter = 1;  /* after newblk */
-                                error = xfs_dir_leaf_split(state, newblk,
-                                                           &state->extrablk);
-                                if (error)
-                                        return(error);  /* GROT: dir incon. */
-                                addblk = newblk;
-                        }
-                        break;
                case XFS_DIR2_LEAFN_MAGIC:
-                        ASSERT(XFS_DIR_IS_V2(state->mp));
                        error = xfs_dir2_leafn_split(state, oldblk, newblk);
                        if (error)
                                return error;
@@ -363,7 +329,6 @@ xfs_da_root_split(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
                size = (int)((char *)&oldroot->btree[be16_to_cpu(oldroot->hdr.count)] -
                             (char *)oldroot);
        } else {
-                ASSERT(XFS_DIR_IS_V2(mp));
                ASSERT(be16_to_cpu(oldroot->hdr.info.magic) == XFS_DIR2_LEAFN_MAGIC);
                leaf = (xfs_dir2_leaf_t *)oldroot;
                size = (int)((char *)&leaf->ents[be16_to_cpu(leaf->hdr.count)] -
@@ -379,8 +344,7 @@ xfs_da_root_split(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
         * Set up the new root node.
         */
        error = xfs_da_node_create(args,
-                args->whichfork == XFS_DATA_FORK &&
+                (args->whichfork == XFS_DATA_FORK) ? mp->m_dirleafblk : 0,
-                XFS_DIR_IS_V2(mp) ? mp->m_dirleafblk : 0,
                be16_to_cpu(node->hdr.level) + 1, &bp, args->whichfork);
        if (error)
                return(error);
@@ -427,10 +391,9 @@ xfs_da_node_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
        ASSERT(be16_to_cpu(node->hdr.info.magic) == XFS_DA_NODE_MAGIC);
        /*
-         * With V2 the extra block is data or freespace.
+         * With V2 dirs the extra block is data or freespace.
         */
-        useextra = state->extravalid && (XFS_DIR_IS_V1(state->mp) ||
+        useextra = state->extravalid && state->args->whichfork == XFS_ATTR_FORK;
-                        state->args->whichfork == XFS_ATTR_FORK);
        newcount = 1 + useextra;
        /*
         * Do we have to split the node?
@@ -624,7 +587,7 @@ xfs_da_node_add(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
        ASSERT(be16_to_cpu(node->hdr.info.magic) == XFS_DA_NODE_MAGIC);
        ASSERT((oldblk->index >= 0) && (oldblk->index <= be16_to_cpu(node->hdr.count)));
        ASSERT(newblk->blkno != 0);
-        if (state->args->whichfork == XFS_DATA_FORK && XFS_DIR_IS_V2(mp))
+        if (state->args->whichfork == XFS_DATA_FORK)
                ASSERT(newblk->blkno >= mp->m_dirleafblk &&
                       newblk->blkno < mp->m_dirfreeblk);
@@ -670,7 +633,7 @@ xfs_da_join(xfs_da_state_t *state)
        save_blk = &state->altpath.blk[ state->path.active-1 ];
        ASSERT(state->path.blk[0].magic == XFS_DA_NODE_MAGIC);
        ASSERT(drop_blk->magic == XFS_ATTR_LEAF_MAGIC ||
-               drop_blk->magic == XFS_DIRX_LEAF_MAGIC(state->mp));
+               drop_blk->magic == XFS_DIR2_LEAFN_MAGIC);
        /*
         * Walk back up the tree joining/deallocating as necessary.
@@ -693,17 +656,7 @@ xfs_da_join(xfs_da_state_t *state)
                                return(0);
                        xfs_attr_leaf_unbalance(state, drop_blk, save_blk);
                        break;
-                case XFS_DIR_LEAF_MAGIC:
-                        ASSERT(XFS_DIR_IS_V1(state->mp));
-                        error = xfs_dir_leaf_toosmall(state, &action);
-                        if (error)
-                                return(error);
-                        if (action == 0)
-                                return(0);
-                        xfs_dir_leaf_unbalance(state, drop_blk, save_blk);
-                        break;
                case XFS_DIR2_LEAFN_MAGIC:
-                        ASSERT(XFS_DIR_IS_V2(state->mp));
                        error = xfs_dir2_leafn_toosmall(state, &action);
                        if (error)
                                return error;
@@ -790,7 +743,7 @@ xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk)
        ASSERT(bp != NULL);
        blkinfo = bp->data;
        if (be16_to_cpu(oldroot->hdr.level) == 1) {
-                ASSERT(be16_to_cpu(blkinfo->magic) == XFS_DIRX_LEAF_MAGIC(state->mp) ||
+                ASSERT(be16_to_cpu(blkinfo->magic) == XFS_DIR2_LEAFN_MAGIC ||
                       be16_to_cpu(blkinfo->magic) == XFS_ATTR_LEAF_MAGIC);
        } else {
                ASSERT(be16_to_cpu(blkinfo->magic) == XFS_DA_NODE_MAGIC);
@@ -951,14 +904,7 @@ xfs_da_fixhashpath(xfs_da_state_t *state, xfs_da_state_path_t *path)
                if (count == 0)
                        return;
                break;
-        case XFS_DIR_LEAF_MAGIC:
-                ASSERT(XFS_DIR_IS_V1(state->mp));
-                lasthash = xfs_dir_leaf_lasthash(blk->bp, &count);
-                if (count == 0)
-                        return;
-                break;
        case XFS_DIR2_LEAFN_MAGIC:
-                ASSERT(XFS_DIR_IS_V2(state->mp));
                lasthash = xfs_dir2_leafn_lasthash(blk->bp, &count);
                if (count == 0)
                        return;
@@ -1117,10 +1063,7 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result)
         * Descend thru the B-tree searching each level for the right
         * node to use, until the right hashval is found.
         */
-        if (args->whichfork == XFS_DATA_FORK && XFS_DIR_IS_V2(state->mp))
+        blkno = (args->whichfork == XFS_DATA_FORK)? state->mp->m_dirleafblk : 0;
-                blkno = state->mp->m_dirleafblk;
-        else
-                blkno = 0;
        for (blk = &state->path.blk[0], state->path.active = 1;
                         state->path.active <= XFS_DA_NODE_MAXDEPTH;
                         blk++, state->path.active++) {
@@ -1137,7 +1080,7 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result)
                }
                curr = blk->bp->data;
                ASSERT(be16_to_cpu(curr->magic) == XFS_DA_NODE_MAGIC ||
-                       be16_to_cpu(curr->magic) == XFS_DIRX_LEAF_MAGIC(state->mp) ||
+                       be16_to_cpu(curr->magic) == XFS_DIR2_LEAFN_MAGIC ||
                       be16_to_cpu(curr->magic) == XFS_ATTR_LEAF_MAGIC);
                /*
@@ -1190,16 +1133,10 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result)
                                blk->index = probe;
                                blkno = be32_to_cpu(btree->before);
                        }
-                }
+                } else if (be16_to_cpu(curr->magic) == XFS_ATTR_LEAF_MAGIC) {
-                else if (be16_to_cpu(curr->magic) == XFS_ATTR_LEAF_MAGIC) {
                        blk->hashval = xfs_attr_leaf_lasthash(blk->bp, NULL);
                        break;
-                }
+                } else if (be16_to_cpu(curr->magic) == XFS_DIR2_LEAFN_MAGIC) {
-                else if (be16_to_cpu(curr->magic) == XFS_DIR_LEAF_MAGIC) {
-                        blk->hashval = xfs_dir_leaf_lasthash(blk->bp, NULL);
-                        break;
-                }
-                else if (be16_to_cpu(curr->magic) == XFS_DIR2_LEAFN_MAGIC) {
                        blk->hashval = xfs_dir2_leafn_lasthash(blk->bp, NULL);
                        break;
                }
@@ -1212,12 +1149,7 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result)
         * next leaf and keep searching.
         */
        for (;;) {
-                if (blk->magic == XFS_DIR_LEAF_MAGIC) {
+                if (blk->magic == XFS_DIR2_LEAFN_MAGIC) {
-                        ASSERT(XFS_DIR_IS_V1(state->mp));
-                        retval = xfs_dir_leaf_lookup_int(blk->bp, args,
-                                                                  &blk->index);
-                } else if (blk->magic == XFS_DIR2_LEAFN_MAGIC) {
-                        ASSERT(XFS_DIR_IS_V2(state->mp));
                        retval = xfs_dir2_leafn_lookup_int(blk->bp, args,
                                                        &blk->index, state);
                }
@@ -1270,7 +1202,7 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
        old_info = old_blk->bp->data;
        new_info = new_blk->bp->data;
        ASSERT(old_blk->magic == XFS_DA_NODE_MAGIC ||
-               old_blk->magic == XFS_DIRX_LEAF_MAGIC(state->mp) ||
+               old_blk->magic == XFS_DIR2_LEAFN_MAGIC ||
               old_blk->magic == XFS_ATTR_LEAF_MAGIC);
        ASSERT(old_blk->magic == be16_to_cpu(old_info->magic));
        ASSERT(new_blk->magic == be16_to_cpu(new_info->magic));
@@ -1280,12 +1212,7 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
        case XFS_ATTR_LEAF_MAGIC:
                before = xfs_attr_leaf_order(old_blk->bp, new_blk->bp);
                break;
-        case XFS_DIR_LEAF_MAGIC:
-                ASSERT(XFS_DIR_IS_V1(state->mp));
-                before = xfs_dir_leaf_order(old_blk->bp, new_blk->bp);
-                break;
        case XFS_DIR2_LEAFN_MAGIC:
-                ASSERT(XFS_DIR_IS_V2(state->mp));
                before = xfs_dir2_leafn_order(old_blk->bp, new_blk->bp);
                break;
        case XFS_DA_NODE_MAGIC:
@@ -1404,7 +1331,7 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
        save_info = save_blk->bp->data;
        drop_info = drop_blk->bp->data;
        ASSERT(save_blk->magic == XFS_DA_NODE_MAGIC ||
-               save_blk->magic == XFS_DIRX_LEAF_MAGIC(state->mp) ||
+               save_blk->magic == XFS_DIR2_LEAFN_MAGIC ||
               save_blk->magic == XFS_ATTR_LEAF_MAGIC);
        ASSERT(save_blk->magic == be16_to_cpu(save_info->magic));
        ASSERT(drop_blk->magic == be16_to_cpu(drop_info->magic));
@@ -1529,7 +1456,7 @@ xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path,
                ASSERT(blk->bp != NULL);
                info = blk->bp->data;
                ASSERT(be16_to_cpu(info->magic) == XFS_DA_NODE_MAGIC ||
-                       be16_to_cpu(info->magic) == XFS_DIRX_LEAF_MAGIC(state->mp) ||
+                       be16_to_cpu(info->magic) == XFS_DIR2_LEAFN_MAGIC ||
                       be16_to_cpu(info->magic) == XFS_ATTR_LEAF_MAGIC);
                blk->magic = be16_to_cpu(info->magic);
                if (blk->magic == XFS_DA_NODE_MAGIC) {
@@ -1548,20 +1475,13 @@ xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path,
                                blk->hashval = xfs_attr_leaf_lasthash(blk->bp,
                                                                      NULL);
                                break;
-                        case XFS_DIR_LEAF_MAGIC:
-                                ASSERT(XFS_DIR_IS_V1(state->mp));
-                                blk->hashval = xfs_dir_leaf_lasthash(blk->bp,
-                                                                     NULL);
-                                break;
                        case XFS_DIR2_LEAFN_MAGIC:
-                                ASSERT(XFS_DIR_IS_V2(state->mp));
                                blk->hashval = xfs_dir2_leafn_lasthash(blk->bp,
                                                                       NULL);
                                break;
                        default:
                                ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC ||
-                                       blk->magic ==
+                                       blk->magic == XFS_DIR2_LEAFN_MAGIC);
-                                       XFS_DIRX_LEAF_MAGIC(state->mp));
                                break;
                        }
                }
@@ -1620,7 +1540,6 @@ xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno)
        xfs_bmbt_irec_t *mapp;
        xfs_inode_t *dp;
        int nmap, error, w, count, c, got, i, mapi;
-        xfs_fsize_t size;
        xfs_trans_t *tp;
        xfs_mount_t *mp;
@@ -1631,7 +1550,7 @@ xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno)
        /*
         * For new directories adjust the file offset and block count.
         */
-        if (w == XFS_DATA_FORK && XFS_DIR_IS_V2(mp)) {
+        if (w == XFS_DATA_FORK) {
                bno = mp->m_dirleafblk;
                count = mp->m_dirblkfsbs;
        } else {
@@ -1641,10 +1560,9 @@ xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno)
        /*
         * Find a spot in the file space to put the new block.
         */
-        if ((error = xfs_bmap_first_unused(tp, dp, count, &bno, w))) {
+        if ((error = xfs_bmap_first_unused(tp, dp, count, &bno, w)))
                return error;
-        }
+        if (w == XFS_DATA_FORK)
-        if (w == XFS_DATA_FORK && XFS_DIR_IS_V2(mp))
                ASSERT(bno >= mp->m_dirleafblk && bno < mp->m_dirfreeblk);
        /*
         * Try mapping it in one filesystem block.
@@ -1655,7 +1573,7 @@ xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno)
                        XFS_BMAPI_AFLAG(w)|XFS_BMAPI_WRITE|XFS_BMAPI_METADATA|
                        XFS_BMAPI_CONTIG,
                        args->firstblock, args->total, &map, &nmap,
-                        args->flist))) {
+                        args->flist, NULL))) {
                return error;
        }
        ASSERT(nmap <= 1);
@@ -1676,7 +1594,8 @@ xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno)
                                        XFS_BMAPI_AFLAG(w)|XFS_BMAPI_WRITE|
                                        XFS_BMAPI_METADATA,
                                        args->firstblock, args->total,
-                                        &mapp[mapi], &nmap, args->flist))) {
+                                        &mapp[mapi], &nmap, args->flist,
+                                        NULL))) {
                                kmem_free(mapp, sizeof(*mapp) * count);
                                return error;
                        }
@@ -1705,19 +1624,6 @@ xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno)
        if (mapp != &map)
                kmem_free(mapp, sizeof(*mapp) * count);
        *new_blkno = (xfs_dablk_t)bno;
-        /*
-         * For version 1 directories, adjust the file size if it changed.
-         */
-        if (w == XFS_DATA_FORK && XFS_DIR_IS_V1(mp)) {
-                ASSERT(mapi == 1);
-                if ((error = xfs_bmap_last_offset(tp, dp, &bno, w)))
-                        return error;
-                size = XFS_FSB_TO_B(mp, bno);
-                if (size != dp->i_d.di_size) {
-                        dp->i_d.di_size = size;
-                        xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
-                }
-        }
        return 0;
 }
@@ -1742,7 +1648,6 @@ xfs_da_swap_lastblock(xfs_da_args_t *args, xfs_dablk_t *dead_blknop,
        int error, w, entno, level, dead_level;
        xfs_da_blkinfo_t *dead_info, *sib_info;
        xfs_da_intnode_t *par_node, *dead_node;
-        xfs_dir_leafblock_t *dead_leaf;
        xfs_dir2_leaf_t *dead_leaf2;
        xfs_dahash_t dead_hash;
@@ -1753,11 +1658,8 @@ xfs_da_swap_lastblock(xfs_da_args_t *args, xfs_dablk_t *dead_blknop,
        w = args->whichfork;
        ASSERT(w == XFS_DATA_FORK);
        mp = ip->i_mount;
-        if (XFS_DIR_IS_V2(mp)) {
+        lastoff = mp->m_dirfreeblk;
-                lastoff = mp->m_dirfreeblk;
+        error = xfs_bmap_last_before(tp, ip, &lastoff, w);
-                error = xfs_bmap_last_before(tp, ip, &lastoff, w);
-        } else
-                error = xfs_bmap_last_offset(tp, ip, &lastoff, w);
        if (error)
                return error;
        if (unlikely(lastoff == 0)) {
@@ -1780,14 +1682,7 @@ xfs_da_swap_lastblock(xfs_da_args_t *args, xfs_dablk_t *dead_blknop,
        /*
         * Get values from the moved block.
         */
-        if (be16_to_cpu(dead_info->magic) == XFS_DIR_LEAF_MAGIC) {
+        if (be16_to_cpu(dead_info->magic) == XFS_DIR2_LEAFN_MAGIC) {
-                ASSERT(XFS_DIR_IS_V1(mp));
-                dead_leaf = (xfs_dir_leafblock_t *)dead_info;
-                dead_level = 0;
-                dead_hash =
-                        INT_GET(dead_leaf->entries[INT_GET(dead_leaf->hdr.count, ARCH_CONVERT) - 1].hashval, ARCH_CONVERT);
-        } else if (be16_to_cpu(dead_info->magic) == XFS_DIR2_LEAFN_MAGIC) {
-                ASSERT(XFS_DIR_IS_V2(mp));
                dead_leaf2 = (xfs_dir2_leaf_t *)dead_info;
                dead_level = 0;
                dead_hash = be32_to_cpu(dead_leaf2->ents[be16_to_cpu(dead_leaf2->hdr.count) - 1].hashval);
@@ -1842,7 +1737,7 @@ xfs_da_swap_lastblock(xfs_da_args_t *args, xfs_dablk_t *dead_blknop,
                xfs_da_buf_done(sib_buf);
                sib_buf = NULL;
        }
-        par_blkno = XFS_DIR_IS_V1(mp) ? 0 : mp->m_dirleafblk;
+        par_blkno = mp->m_dirleafblk;
        level = -1;
        /*
         * Walk down the tree looking for the parent of the moved block.
@@ -1941,8 +1836,6 @@ xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
 {
        xfs_inode_t *dp;
        int done, error, w, count;
-        xfs_fileoff_t bno;
-        xfs_fsize_t size;
        xfs_trans_t *tp;
        xfs_mount_t *mp;
@@ -1950,7 +1843,7 @@ xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
        w = args->whichfork;
        tp = args->trans;
        mp = dp->i_mount;
-        if (w == XFS_DATA_FORK && XFS_DIR_IS_V2(mp))
+        if (w == XFS_DATA_FORK)
                count = mp->m_dirblkfsbs;
        else
                count = 1;
@@ -1961,34 +1854,17 @@ xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
                 */
                if ((error = xfs_bunmapi(tp, dp, dead_blkno, count,
                                XFS_BMAPI_AFLAG(w)|XFS_BMAPI_METADATA,
-                                0, args->firstblock, args->flist,
+                                0, args->firstblock, args->flist, NULL,
                                &done)) == ENOSPC) {
                        if (w != XFS_DATA_FORK)
-                                goto done;
+                                break;
                        if ((error = xfs_da_swap_lastblock(args, &dead_blkno,
                                        &dead_buf)))
-                                goto done;
+                                break;
-                } else if (error)
+                } else {
-                        goto done;
-                else
                        break;
-        }
-        ASSERT(done);
-        xfs_da_binval(tp, dead_buf);
-        /*
-         * Adjust the directory size for version 1.
-         */
-        if (w == XFS_DATA_FORK && XFS_DIR_IS_V1(mp)) {
-                if ((error = xfs_bmap_last_offset(tp, dp, &bno, w)))
-                        return error;
-                size = XFS_FSB_TO_B(dp->i_mount, bno);
-                if (size != dp->i_d.di_size) {
-                        dp->i_d.di_size = size;
-                        xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
                }
        }
-        return 0;
-done:
        xfs_da_binval(tp, dead_buf);
        return error;
 }
@@ -2049,10 +1925,7 @@ xfs_da_do_buf(
        xfs_dabuf_t     *rbp;
        mp = dp->i_mount;
-        if (whichfork == XFS_DATA_FORK && XFS_DIR_IS_V2(mp))
+        nfsb = (whichfork == XFS_DATA_FORK) ? mp->m_dirblkfsbs : 1;
-                nfsb = mp->m_dirblkfsbs;
-        else
-                nfsb = 1;
        mappedbno = *mappedbnop;
        /*
         * Caller doesn't have a mapping.  -2 means don't complain
@@ -2086,7 +1959,7 @@ xfs_da_do_buf(
                                        nfsb,
                                        XFS_BMAPI_METADATA |
                                                XFS_BMAPI_AFLAG(whichfork),
-                                        NULL, 0, mapp, &nmap, NULL)))
+                                        NULL, 0, mapp, &nmap, NULL, NULL)))
                                goto exit0;
                }
        } else {
@@ -2198,7 +2071,6 @@ xfs_da_do_buf(
                magic1 = be32_to_cpu(data->hdr.magic);
                if (unlikely(
                    XFS_TEST_ERROR((magic != XFS_DA_NODE_MAGIC) &&
-                                   (magic != XFS_DIR_LEAF_MAGIC) &&
                                   (magic != XFS_ATTR_LEAF_MAGIC) &&
                                   (magic != XFS_DIR2_LEAF1_MAGIC) &&
                                   (magic != XFS_DIR2_LEAFN_MAGIC) &&
diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h
index 243a730d5ec8..4ab865ec8b82 100644
--- a/fs/xfs/xfs_da_btree.h
+++ b/fs/xfs/xfs_da_btree.h
@@ -36,14 +36,10 @@ struct zone;
 * level in the Btree, and to identify which type of block this is.
 */
 #define XFS_DA_NODE_MAGIC       0xfebe  /* magic number: non-leaf blocks */
-#define XFS_DIR_LEAF_MAGIC      0xfeeb  /* magic number: directory leaf blks */
 #define XFS_ATTR_LEAF_MAGIC     0xfbee  /* magic number: attribute leaf blks */
 #define XFS_DIR2_LEAF1_MAGIC    0xd2f1  /* magic number: v2 dirlf single blks */
 #define XFS_DIR2_LEAFN_MAGIC    0xd2ff  /* magic number: v2 dirlf multi blks */
-#define XFS_DIRX_LEAF_MAGIC(mp) \
-        (XFS_DIR_IS_V1(mp) ? XFS_DIR_LEAF_MAGIC : XFS_DIR2_LEAFN_MAGIC)
 typedef struct xfs_da_blkinfo {
        __be32          forw;                   /* previous block in list */
        __be32          back;                   /* following block in list */
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index 4968a6358e61..80562b60fb95 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
 * All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
@@ -24,14 +24,12 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
@@ -54,24 +52,14 @@ xfs_swapext(
        xfs_swapext_t   __user *sxu)
 {
        xfs_swapext_t   *sxp;
-        xfs_inode_t     *ip=NULL, *tip=NULL, *ips[2];
+        xfs_inode_t     *ip=NULL, *tip=NULL;
-        xfs_trans_t     *tp;
        xfs_mount_t     *mp;
-        xfs_bstat_t     *sbp;
        struct file     *fp = NULL, *tfp = NULL;
-        vnode_t         *vp, *tvp;
+        bhv_vnode_t     *vp, *tvp;
-        static uint     lock_flags = XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL;
-        int             ilf_fields, tilf_fields;
        int             error = 0;
-        xfs_ifork_t     *tempifp, *ifp, *tifp;
-        __uint64_t      tmp;
-        int             aforkblks = 0;
-        int             taforkblks = 0;
-        char            locked = 0;
        sxp = kmem_alloc(sizeof(xfs_swapext_t), KM_MAYFAIL);
-        tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL);
+        if (!sxp) {
-        if (!sxp || !tempifp) {
                error = XFS_ERROR(ENOMEM);
                goto error0;
        }
@@ -118,14 +106,56 @@ xfs_swapext(
        mp = ip->i_mount;
-        sbp = &sxp->sx_stat;
        if (XFS_FORCED_SHUTDOWN(mp)) {
                error =  XFS_ERROR(EIO);
                goto error0;
        }
-        locked = 1;
+        error = XFS_SWAP_EXTENTS(mp, &ip->i_iocore, &tip->i_iocore, sxp);
+ error0:
+        if (fp != NULL)
+                fput(fp);
+        if (tfp != NULL)
+                fput(tfp);
+        if (sxp != NULL)
+                kmem_free(sxp, sizeof(xfs_swapext_t));
+        return error;
+}
+int
+xfs_swap_extents(
+        xfs_inode_t     *ip,
+        xfs_inode_t     *tip,
+        xfs_swapext_t   *sxp)
+{
+        xfs_mount_t     *mp;
+        xfs_inode_t     *ips[2];
+        xfs_trans_t     *tp;
+        xfs_bstat_t     *sbp = &sxp->sx_stat;
+        bhv_vnode_t     *vp, *tvp;
+        xfs_ifork_t     *tempifp, *ifp, *tifp;
+        int             ilf_fields, tilf_fields;
+        static uint     lock_flags = XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL;
+        int             error = 0;
+        int             aforkblks = 0;
+        int             taforkblks = 0;
+        __uint64_t      tmp;
+        char            locked = 0;
+        mp = ip->i_mount;
+        tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL);
+        if (!tempifp) {
+                error = XFS_ERROR(ENOMEM);
+                goto error0;
+        }
+        sbp = &sxp->sx_stat;
+        vp = XFS_ITOV(ip);
+        tvp = XFS_ITOV(tip);
        /* Lock in i_ino order */
        if (ip->i_ino < tip->i_ino) {
@@ -137,6 +167,7 @@ xfs_swapext(
        }
        xfs_lock_inodes(ips, 2, 0, lock_flags);
+        locked = 1;
        /* Check permissions */
        error = xfs_iaccess(ip, S_IWUSR, NULL);
@@ -169,7 +200,7 @@ xfs_swapext(
        if (VN_CACHED(tvp) != 0) {
                xfs_inval_cached_trace(&tip->i_iocore, 0, -1, 0, -1);
-                VOP_FLUSHINVAL_PAGES(tvp, 0, -1, FI_REMAPF_LOCKED);
+                bhv_vop_flushinval_pages(tvp, 0, -1, FI_REMAPF_LOCKED);
        }
        /* Verify O_DIRECT for ftmp */
@@ -214,7 +245,7 @@ xfs_swapext(
        /* We need to fail if the file is memory mapped.  Once we have tossed
         * all existing pages, the page fault will have no option
         * but to go to the filesystem for pages. By making the page fault call
-         * VOP_READ (or write in the case of autogrow) they block on the iolock
+         * vop_read (or write in the case of autogrow) they block on the iolock
         * until we have switched the extents.
         */
        if (VN_MAPPED(vp)) {
@@ -233,7 +264,7 @@ xfs_swapext(
         * fields change.
         */
-        VOP_TOSS_PAGES(vp, 0, -1, FI_REMAPF);
+        bhv_vop_toss_pages(vp, 0, -1, FI_REMAPF);
        tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT);
        if ((error = xfs_trans_reserve(tp, 0,
@@ -360,16 +391,7 @@ xfs_swapext(
                xfs_iunlock(ip,  lock_flags);
                xfs_iunlock(tip, lock_flags);
        }
-        if (fp != NULL)
-                fput(fp);
-        if (tfp != NULL)
-                fput(tfp);
-        if (sxp != NULL)
-                kmem_free(sxp, sizeof(xfs_swapext_t));
        if (tempifp != NULL)
                kmem_free(tempifp, sizeof(xfs_ifork_t));
        return error;
 }
diff --git a/fs/xfs/xfs_dfrag.h b/fs/xfs/xfs_dfrag.h
index f678559abc45..da178205be68 100644
--- a/fs/xfs/xfs_dfrag.h
+++ b/fs/xfs/xfs_dfrag.h
@@ -48,6 +48,9 @@ typedef struct xfs_swapext
 */
 int     xfs_swapext(struct xfs_swapext __user *sx);
+int     xfs_swap_extents(struct xfs_inode *ip, struct xfs_inode *tip,
+                struct xfs_swapext *sxp);
 #endif  /* __KERNEL__ */
 #endif  /* __XFS_DFRAG_H__ */
diff --git a/fs/xfs/xfs_dinode.h b/fs/xfs/xfs_dinode.h
index 79d0d9e1fbab..b33826961c45 100644
--- a/fs/xfs/xfs_dinode.h
+++ b/fs/xfs/xfs_dinode.h
@@ -85,7 +85,6 @@ typedef struct xfs_dinode
        union {
                xfs_bmdr_block_t di_bmbt;       /* btree root block */
                xfs_bmbt_rec_32_t di_bmx[1];    /* extent list */
-                xfs_dir_shortform_t di_dirsf;   /* shortform directory */
                xfs_dir2_sf_t   di_dir2sf;      /* shortform directory v2 */
                char            di_c[1];        /* local contents */
                xfs_dev_t       di_dev;         /* device for S_IFCHR/S_IFBLK */
@@ -257,6 +256,7 @@ typedef enum xfs_dinode_fmt
 #define XFS_DIFLAG_NOSYMLINKS_BIT   10  /* disallow symlink creation */
 #define XFS_DIFLAG_EXTSIZE_BIT      11  /* inode extent size allocator hint */
 #define XFS_DIFLAG_EXTSZINHERIT_BIT 12  /* inherit inode extent size */
+#define XFS_DIFLAG_NODEFRAG_BIT     13  /* do not reorganize/defragment */
 #define XFS_DIFLAG_REALTIME      (1 << XFS_DIFLAG_REALTIME_BIT)
 #define XFS_DIFLAG_PREALLOC      (1 << XFS_DIFLAG_PREALLOC_BIT)
 #define XFS_DIFLAG_NEWRTBM       (1 << XFS_DIFLAG_NEWRTBM_BIT)
@@ -270,12 +270,13 @@ typedef enum xfs_dinode_fmt
 #define XFS_DIFLAG_NOSYMLINKS    (1 << XFS_DIFLAG_NOSYMLINKS_BIT)
 #define XFS_DIFLAG_EXTSIZE       (1 << XFS_DIFLAG_EXTSIZE_BIT)
 #define XFS_DIFLAG_EXTSZINHERIT  (1 << XFS_DIFLAG_EXTSZINHERIT_BIT)
+#define XFS_DIFLAG_NODEFRAG      (1 << XFS_DIFLAG_NODEFRAG_BIT)
 #define XFS_DIFLAG_ANY \
        (XFS_DIFLAG_REALTIME | XFS_DIFLAG_PREALLOC | XFS_DIFLAG_NEWRTBM | \
         XFS_DIFLAG_IMMUTABLE | XFS_DIFLAG_APPEND | XFS_DIFLAG_SYNC | \
         XFS_DIFLAG_NOATIME | XFS_DIFLAG_NODUMP | XFS_DIFLAG_RTINHERIT | \
         XFS_DIFLAG_PROJINHERIT | XFS_DIFLAG_NOSYMLINKS | XFS_DIFLAG_EXTSIZE | \
-         XFS_DIFLAG_EXTSZINHERIT)
+         XFS_DIFLAG_EXTSZINHERIT | XFS_DIFLAG_NODEFRAG)
 #endif  /* __XFS_DINODE_H__ */
diff --git a/fs/xfs/xfs_dir.c b/fs/xfs/xfs_dir.c
deleted file mode 100644
index 9cc702a839a3..000000000000
--- a/fs/xfs/xfs_dir.c
+++ /dev/null
@@ -1,1217 +0,0 @@
-/*
- * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_dir.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
-#include "xfs_mount.h"
-#include "xfs_da_btree.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_alloc.h"
-#include "xfs_btree.h"
-#include "xfs_dir_sf.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dinode.h"
-#include "xfs_inode.h"
-#include "xfs_bmap.h"
-#include "xfs_dir_leaf.h"
-#include "xfs_error.h"
-/*
- * xfs_dir.c
- *
- * Provide the external interfaces to manage directories.
- */
-/*========================================================================
- * Function prototypes for the kernel.
- *========================================================================*/
-/*
- * Functions for the dirops interfaces.
- */
-static void     xfs_dir_mount(struct xfs_mount *mp);
-static int      xfs_dir_isempty(struct xfs_inode *dp);
-static int      xfs_dir_init(struct xfs_trans *trans,
-                             struct xfs_inode *dir,
-                             struct xfs_inode *parent_dir);
-static int      xfs_dir_createname(struct xfs_trans *trans,
-                                   struct xfs_inode *dp,
-                                   char *name_string,
-                                   int name_len,
-                                   xfs_ino_t inode_number,
-                                   xfs_fsblock_t *firstblock,
-                                   xfs_bmap_free_t *flist,
-                                   xfs_extlen_t total);
-static int      xfs_dir_lookup(struct xfs_trans *tp,
-                               struct xfs_inode *dp,
-                               char *name_string,
-                               int name_length,
-                               xfs_ino_t *inode_number);
-static int      xfs_dir_removename(struct xfs_trans *trans,
-                                   struct xfs_inode *dp,
-                                   char *name_string,
-                                   int name_length,
-                                   xfs_ino_t ino,
-                                   xfs_fsblock_t *firstblock,
-                                   xfs_bmap_free_t *flist,
-                                   xfs_extlen_t total);
-static int      xfs_dir_getdents(struct xfs_trans *tp,
-                                 struct xfs_inode *dp,
-                                 struct uio *uiop,
-                                 int *eofp);
-static int      xfs_dir_replace(struct xfs_trans *tp,
-                                struct xfs_inode *dp,
-                                char *name_string,
-                                int name_length,
-                                xfs_ino_t inode_number,
-                                xfs_fsblock_t *firstblock,
-                                xfs_bmap_free_t *flist,
-                                xfs_extlen_t total);
-static int      xfs_dir_canenter(struct xfs_trans *tp,
-                                 struct xfs_inode *dp,
-                                 char *name_string,
-                                 int name_length);
-static int      xfs_dir_shortform_validate_ondisk(xfs_mount_t *mp,
-                                                  xfs_dinode_t *dip);
-xfs_dirops_t xfsv1_dirops = {
-        .xd_mount                       = xfs_dir_mount,
-        .xd_isempty                     = xfs_dir_isempty,
-        .xd_init                        = xfs_dir_init,
-        .xd_createname                  = xfs_dir_createname,
-        .xd_lookup                      = xfs_dir_lookup,
-        .xd_removename                  = xfs_dir_removename,
-        .xd_getdents                    = xfs_dir_getdents,
-        .xd_replace                     = xfs_dir_replace,
-        .xd_canenter                    = xfs_dir_canenter,
-        .xd_shortform_validate_ondisk   = xfs_dir_shortform_validate_ondisk,
-        .xd_shortform_to_single         = xfs_dir_shortform_to_leaf,
-};
-/*
- * Internal routines when dirsize == XFS_LBSIZE(mp).
- */
-STATIC int xfs_dir_leaf_lookup(xfs_da_args_t *args);
-STATIC int xfs_dir_leaf_removename(xfs_da_args_t *args, int *number_entries,
-                                                 int *total_namebytes);
-STATIC int xfs_dir_leaf_getdents(xfs_trans_t *trans, xfs_inode_t *dp,
-                                             uio_t *uio, int *eofp,
-                                             xfs_dirent_t *dbp,
-                                             xfs_dir_put_t put);
-STATIC int xfs_dir_leaf_replace(xfs_da_args_t *args);
-/*
- * Internal routines when dirsize > XFS_LBSIZE(mp).
- */
-STATIC int xfs_dir_node_addname(xfs_da_args_t *args);
-STATIC int xfs_dir_node_lookup(xfs_da_args_t *args);
-STATIC int xfs_dir_node_removename(xfs_da_args_t *args);
-STATIC int xfs_dir_node_getdents(xfs_trans_t *trans, xfs_inode_t *dp,
-                                             uio_t *uio, int *eofp,
-                                             xfs_dirent_t *dbp,
-                                             xfs_dir_put_t put);
-STATIC int xfs_dir_node_replace(xfs_da_args_t *args);
-#if defined(XFS_DIR_TRACE)
-ktrace_t *xfs_dir_trace_buf;
-#endif
-/*========================================================================
- * Overall external interface routines.
- *========================================================================*/
-xfs_dahash_t    xfs_dir_hash_dot, xfs_dir_hash_dotdot;
-/*
- * One-time startup routine called from xfs_init().
- */
-void
-xfs_dir_startup(void)
-{
-        xfs_dir_hash_dot = xfs_da_hashname(".", 1);
-        xfs_dir_hash_dotdot = xfs_da_hashname("..", 2);
-}
-/*
- * Initialize directory-related fields in the mount structure.
- */
-static void
-xfs_dir_mount(xfs_mount_t *mp)
-{
-        uint shortcount, leafcount, count;
-        mp->m_dirversion = 1;
-        if (!(mp->m_flags & XFS_MOUNT_ATTR2)) {
-                shortcount = (mp->m_attroffset -
-                                (uint)sizeof(xfs_dir_sf_hdr_t)) /
-                                 (uint)sizeof(xfs_dir_sf_entry_t);
-                leafcount = (XFS_LBSIZE(mp) -
-                                (uint)sizeof(xfs_dir_leaf_hdr_t)) /
-                                 ((uint)sizeof(xfs_dir_leaf_entry_t) +
-                                  (uint)sizeof(xfs_dir_leaf_name_t));
-        } else {
-                shortcount = (XFS_BMDR_SPACE_CALC(MINABTPTRS) -
-                              (uint)sizeof(xfs_dir_sf_hdr_t)) /
-                               (uint)sizeof(xfs_dir_sf_entry_t);
-                leafcount = (XFS_LBSIZE(mp) -
-                            (uint)sizeof(xfs_dir_leaf_hdr_t)) /
-                             ((uint)sizeof(xfs_dir_leaf_entry_t) +
-                              (uint)sizeof(xfs_dir_leaf_name_t));
-        }
-        count = shortcount > leafcount ? shortcount : leafcount;
-        mp->m_dircook_elog = xfs_da_log2_roundup(count + 1);
-        ASSERT(mp->m_dircook_elog <= mp->m_sb.sb_blocklog);
-        mp->m_dir_node_ents = mp->m_attr_node_ents =
-                (XFS_LBSIZE(mp) - (uint)sizeof(xfs_da_node_hdr_t)) /
-                (uint)sizeof(xfs_da_node_entry_t);
-        mp->m_dir_magicpct = (XFS_LBSIZE(mp) * 37) / 100;
-        mp->m_dirblksize = mp->m_sb.sb_blocksize;
-        mp->m_dirblkfsbs = 1;
-}
-/*
- * Return 1 if directory contains only "." and "..".
- */
-static int
-xfs_dir_isempty(xfs_inode_t *dp)
-{
-        xfs_dir_sf_hdr_t *hdr;
-        ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
-        if (dp->i_d.di_size == 0)
-                return(1);
-        if (dp->i_d.di_size > XFS_IFORK_DSIZE(dp))
-                return(0);
-        hdr = (xfs_dir_sf_hdr_t *)dp->i_df.if_u1.if_data;
-        return(hdr->count == 0);
-}
-/*
- * Initialize a directory with its "." and ".." entries.
- */
-static int
-xfs_dir_init(xfs_trans_t *trans, xfs_inode_t *dir, xfs_inode_t *parent_dir)
-{
-        xfs_da_args_t args;
-        int error;
-        memset((char *)&args, 0, sizeof(args));
-        args.dp = dir;
-        args.trans = trans;
-        ASSERT((dir->i_d.di_mode & S_IFMT) == S_IFDIR);
-        if ((error = xfs_dir_ino_validate(trans->t_mountp, parent_dir->i_ino)))
-                return error;
-        return(xfs_dir_shortform_create(&args, parent_dir->i_ino));
-}
-/*
- * Generic handler routine to add a name to a directory.
- * Transitions directory from shortform to Btree as necessary.
- */
-static int                                                      /* error */
-xfs_dir_createname(xfs_trans_t *trans, xfs_inode_t *dp, char *name,
-                   int namelen, xfs_ino_t inum, xfs_fsblock_t *firstblock,
-                   xfs_bmap_free_t *flist, xfs_extlen_t total)
-{
-        xfs_da_args_t args;
-        int retval, newsize, done;
-        ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
-        if ((retval = xfs_dir_ino_validate(trans->t_mountp, inum)))
-                return (retval);
-        XFS_STATS_INC(xs_dir_create);
-        /*
-         * Fill in the arg structure for this request.
-         */
-        args.name = name;
-        args.namelen = namelen;
-        args.hashval = xfs_da_hashname(name, namelen);
-        args.inumber = inum;
-        args.dp = dp;
-        args.firstblock = firstblock;
-        args.flist = flist;
-        args.total = total;
-        args.whichfork = XFS_DATA_FORK;
-        args.trans = trans;
-        args.justcheck = 0;
-        args.addname = args.oknoent = 1;
-        /*
-         * Decide on what work routines to call based on the inode size.
-         */
-        done = 0;
-        if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
-                newsize = XFS_DIR_SF_ENTSIZE_BYNAME(args.namelen);
-                if ((dp->i_d.di_size + newsize) <= XFS_IFORK_DSIZE(dp)) {
-                        retval = xfs_dir_shortform_addname(&args);
-                        done = 1;
-                } else {
-                        if (total == 0)
-                                return XFS_ERROR(ENOSPC);
-                        retval = xfs_dir_shortform_to_leaf(&args);
-                        done = retval != 0;
-                }
-        }
-        if (!done && xfs_bmap_one_block(dp, XFS_DATA_FORK)) {
-                retval = xfs_dir_leaf_addname(&args);
-                done = retval != ENOSPC;
-                if (!done) {
-                        if (total == 0)
-                                return XFS_ERROR(ENOSPC);
-                        retval = xfs_dir_leaf_to_node(&args);
-                        done = retval != 0;
-                }
-        }
-        if (!done) {
-                retval = xfs_dir_node_addname(&args);
-        }
-        return(retval);
-}
-/*
- * Generic handler routine to check if a name can be added to a directory,
- * without adding any blocks to the directory.
- */
-static int                                                      /* error */
-xfs_dir_canenter(xfs_trans_t *trans, xfs_inode_t *dp, char *name, int namelen)
-{
-        xfs_da_args_t args;
-        int retval, newsize;
-        ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
-        /*
-         * Fill in the arg structure for this request.
-         */
-        args.name = name;
-        args.namelen = namelen;
-        args.hashval = xfs_da_hashname(name, namelen);
-        args.inumber = 0;
-        args.dp = dp;
-        args.firstblock = NULL;
-        args.flist = NULL;
-        args.total = 0;
-        args.whichfork = XFS_DATA_FORK;
-        args.trans = trans;
-        args.justcheck = args.addname = args.oknoent = 1;
-        /*
-         * Decide on what work routines to call based on the inode size.
-         */
-        if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
-                newsize = XFS_DIR_SF_ENTSIZE_BYNAME(args.namelen);
-                if ((dp->i_d.di_size + newsize) <= XFS_IFORK_DSIZE(dp))
-                        retval = 0;
-                else
-                        retval = XFS_ERROR(ENOSPC);
-        } else if (xfs_bmap_one_block(dp, XFS_DATA_FORK)) {
-                retval = xfs_dir_leaf_addname(&args);
-        } else {
-                retval = xfs_dir_node_addname(&args);
-        }
-        return(retval);
-}
-/*
- * Generic handler routine to remove a name from a directory.
- * Transitions directory from Btree to shortform as necessary.
- */
-static int                                                      /* error */
-xfs_dir_removename(xfs_trans_t *trans, xfs_inode_t *dp, char *name,
-                   int namelen, xfs_ino_t ino, xfs_fsblock_t *firstblock,
-                   xfs_bmap_free_t *flist, xfs_extlen_t total)
-{
-        xfs_da_args_t args;
-        int count, totallen, newsize, retval;
-        ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
-        XFS_STATS_INC(xs_dir_remove);
-        /*
-         * Fill in the arg structure for this request.
-         */
-        args.name = name;
-        args.namelen = namelen;
-        args.hashval = xfs_da_hashname(name, namelen);
-        args.inumber = ino;
-        args.dp = dp;
-        args.firstblock = firstblock;
-        args.flist = flist;
-        args.total = total;
-        args.whichfork = XFS_DATA_FORK;
-        args.trans = trans;
-        args.justcheck = args.addname = args.oknoent = 0;
-        /*
-         * Decide on what work routines to call based on the inode size.
-         */
-        if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
-                retval = xfs_dir_shortform_removename(&args);
-        } else if (xfs_bmap_one_block(dp, XFS_DATA_FORK)) {
-                retval = xfs_dir_leaf_removename(&args, &count, &totallen);
-                if (retval == 0) {
-                        newsize = XFS_DIR_SF_ALLFIT(count, totallen);
-                        if (newsize <= XFS_IFORK_DSIZE(dp)) {
-                                retval = xfs_dir_leaf_to_shortform(&args);
-                        }
-                }
-        } else {
-                retval = xfs_dir_node_removename(&args);
-        }
-        return(retval);
-}
-static int                                                      /* error */
-xfs_dir_lookup(xfs_trans_t *trans, xfs_inode_t *dp, char *name, int namelen,
-                                   xfs_ino_t *inum)
-{
-        xfs_da_args_t args;
-        int retval;
-        ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
-        XFS_STATS_INC(xs_dir_lookup);
-        /*
-         * Fill in the arg structure for this request.
-         */
-        args.name = name;
-        args.namelen = namelen;
-        args.hashval = xfs_da_hashname(name, namelen);
-        args.inumber = 0;
-        args.dp = dp;
-        args.firstblock = NULL;
-        args.flist = NULL;
-        args.total = 0;
-        args.whichfork = XFS_DATA_FORK;
-        args.trans = trans;
-        args.justcheck = args.addname = 0;
-        args.oknoent = 1;
-        /*
-         * Decide on what work routines to call based on the inode size.
-         */
-        if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
-                retval = xfs_dir_shortform_lookup(&args);
-        } else if (xfs_bmap_one_block(dp, XFS_DATA_FORK)) {
-                retval = xfs_dir_leaf_lookup(&args);
-        } else {
-                retval = xfs_dir_node_lookup(&args);
-        }
-        if (retval == EEXIST)
-                retval = 0;
-        *inum = args.inumber;
-        return(retval);
-}
-/*
- * Implement readdir.
- */
-static int                                                      /* error */
-xfs_dir_getdents(xfs_trans_t *trans, xfs_inode_t *dp, uio_t *uio, int *eofp)
-{
-        xfs_dirent_t *dbp;
-        int  alignment, retval;
-        xfs_dir_put_t put;
-        XFS_STATS_INC(xs_dir_getdents);
-        ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
-        /*
-         * If our caller has given us a single contiguous memory buffer,
-         * just work directly within that buffer.  If it's in user memory,
-         * lock it down first.
-         */
-        alignment = sizeof(xfs_off_t) - 1;
-        if ((uio->uio_iovcnt == 1) &&
-            (((__psint_t)uio->uio_iov[0].iov_base & alignment) == 0) &&
-            ((uio->uio_iov[0].iov_len & alignment) == 0)) {
-                dbp = NULL;
-                put = xfs_dir_put_dirent64_direct;
-        } else {
-                dbp = kmem_alloc(sizeof(*dbp) + MAXNAMELEN, KM_SLEEP);
-                put = xfs_dir_put_dirent64_uio;
-        }
-        /*
-         * Decide on what work routines to call based on the inode size.
-         */
-        *eofp = 0;
-        if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
-                retval = xfs_dir_shortform_getdents(dp, uio, eofp, dbp, put);
-        } else if (xfs_bmap_one_block(dp, XFS_DATA_FORK)) {
-                retval = xfs_dir_leaf_getdents(trans, dp, uio, eofp, dbp, put);
-        } else {
-                retval = xfs_dir_node_getdents(trans, dp, uio, eofp, dbp, put);
-        }
-        if (dbp != NULL)
-                kmem_free(dbp, sizeof(*dbp) + MAXNAMELEN);
-        return(retval);
-}
-static int                                                      /* error */
-xfs_dir_replace(xfs_trans_t *trans, xfs_inode_t *dp, char *name, int namelen,
-                                    xfs_ino_t inum, xfs_fsblock_t *firstblock,
-                                    xfs_bmap_free_t *flist, xfs_extlen_t total)
-{
-        xfs_da_args_t args;
-        int retval;
-        ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
-        if ((retval = xfs_dir_ino_validate(trans->t_mountp, inum)))
-                return retval;
-        /*
-         * Fill in the arg structure for this request.
-         */
-        args.name = name;
-        args.namelen = namelen;
-        args.hashval = xfs_da_hashname(name, namelen);
-        args.inumber = inum;
-        args.dp = dp;
-        args.firstblock = firstblock;
-        args.flist = flist;
-        args.total = total;
-        args.whichfork = XFS_DATA_FORK;
-        args.trans = trans;
-        args.justcheck = args.addname = args.oknoent = 0;
-        /*
-         * Decide on what work routines to call based on the inode size.
-         */
-        if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
-                retval = xfs_dir_shortform_replace(&args);
-        } else if (xfs_bmap_one_block(dp, XFS_DATA_FORK)) {
-                retval = xfs_dir_leaf_replace(&args);
-        } else {
-                retval = xfs_dir_node_replace(&args);
-        }
-        return(retval);
-}
-static int
-xfs_dir_shortform_validate_ondisk(xfs_mount_t *mp, xfs_dinode_t *dp)
-{
-        xfs_ino_t               ino;
-        int                     namelen_sum;
-        int                     count;
-        xfs_dir_shortform_t     *sf;
-        xfs_dir_sf_entry_t      *sfe;
-        int                     i;
-        if ((INT_GET(dp->di_core.di_mode, ARCH_CONVERT) & S_IFMT) != S_IFDIR) {
-                return 0;
-        }
-        if (INT_GET(dp->di_core.di_format, ARCH_CONVERT) != XFS_DINODE_FMT_LOCAL) {
-                return 0;
-        }
-        if (INT_GET(dp->di_core.di_size, ARCH_CONVERT) < sizeof(sf->hdr)) {
-                xfs_fs_cmn_err(CE_WARN, mp, "Invalid shortform size: dp 0x%p",
-                        dp);
-                return 1;
-        }
-        sf = (xfs_dir_shortform_t *)(&dp->di_u.di_dirsf);
-        ino = XFS_GET_DIR_INO8(sf->hdr.parent);
-        if (xfs_dir_ino_validate(mp, ino))
-                return 1;
-        count = sf->hdr.count;
-        if ((count < 0) || ((count * 10) > XFS_LITINO(mp))) {
-                xfs_fs_cmn_err(CE_WARN, mp,
-                        "Invalid shortform count: dp 0x%p", dp);
-                return(1);
-        }
-        if (count == 0) {
-                return 0;
-        }
-        namelen_sum = 0;
-        sfe = &sf->list[0];
-        for (i = sf->hdr.count - 1; i >= 0; i--) {
-                ino = XFS_GET_DIR_INO8(sfe->inumber);
-                xfs_dir_ino_validate(mp, ino);
-                if (sfe->namelen >= XFS_LITINO(mp)) {
-                        xfs_fs_cmn_err(CE_WARN, mp,
-                                "Invalid shortform namelen: dp 0x%p", dp);
-                        return 1;
-                }
-                namelen_sum += sfe->namelen;
-                sfe = XFS_DIR_SF_NEXTENTRY(sfe);
-        }
-        if (namelen_sum >= XFS_LITINO(mp)) {
-                xfs_fs_cmn_err(CE_WARN, mp,
-                        "Invalid shortform namelen: dp 0x%p", dp);
-                return 1;
-        }
-        return 0;
-}
-/*========================================================================
- * External routines when dirsize == XFS_LBSIZE(dp->i_mount).
- *========================================================================*/
-/*
- * Add a name to the leaf directory structure
- * This is the external routine.
- */
-int
-xfs_dir_leaf_addname(xfs_da_args_t *args)
-{
-        int index, retval;
-        xfs_dabuf_t *bp;
-        retval = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp,
-                                              XFS_DATA_FORK);
-        if (retval)
-                return(retval);
-        ASSERT(bp != NULL);
-        retval = xfs_dir_leaf_lookup_int(bp, args, &index);
-        if (retval == ENOENT)
-                retval = xfs_dir_leaf_add(bp, args, index);
-        xfs_da_buf_done(bp);
-        return(retval);
-}
-/*
- * Remove a name from the leaf directory structure
- * This is the external routine.
- */
-STATIC int
-xfs_dir_leaf_removename(xfs_da_args_t *args, int *count, int *totallen)
-{
-        xfs_dir_leafblock_t *leaf;
-        int index, retval;
-        xfs_dabuf_t *bp;
-        retval = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp,
-                                              XFS_DATA_FORK);
-        if (retval)
-                return(retval);
-        ASSERT(bp != NULL);
-        leaf = bp->data;
-        ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR_LEAF_MAGIC);
-        retval = xfs_dir_leaf_lookup_int(bp, args, &index);
-        if (retval == EEXIST) {
-                (void)xfs_dir_leaf_remove(args->trans, bp, index);
-                *count = INT_GET(leaf->hdr.count, ARCH_CONVERT);
-                *totallen = INT_GET(leaf->hdr.namebytes, ARCH_CONVERT);
-                retval = 0;
-        }
-        xfs_da_buf_done(bp);
-        return(retval);
-}
-/*
- * Look up a name in a leaf directory structure.
- * This is the external routine.
- */
-STATIC int
-xfs_dir_leaf_lookup(xfs_da_args_t *args)
-{
-        int index, retval;
-        xfs_dabuf_t *bp;
-        retval = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp,
-                                              XFS_DATA_FORK);
-        if (retval)
-                return(retval);
-        ASSERT(bp != NULL);
-        retval = xfs_dir_leaf_lookup_int(bp, args, &index);
-        xfs_da_brelse(args->trans, bp);
-        return(retval);
-}
-/*
- * Copy out directory entries for getdents(), for leaf directories.
- */
-STATIC int
-xfs_dir_leaf_getdents(xfs_trans_t *trans, xfs_inode_t *dp, uio_t *uio,
-                                  int *eofp, xfs_dirent_t *dbp, xfs_dir_put_t put)
-{
-        xfs_dabuf_t *bp;
-        int retval, eob;
-        retval = xfs_da_read_buf(dp->i_transp, dp, 0, -1, &bp, XFS_DATA_FORK);
-        if (retval)
-                return(retval);
-        ASSERT(bp != NULL);
-        retval = xfs_dir_leaf_getdents_int(bp, dp, 0, uio, &eob, dbp, put, -1);
-        xfs_da_brelse(trans, bp);
-        *eofp = (eob == 0);
-        return(retval);
-}
-/*
- * Look up a name in a leaf directory structure, replace the inode number.
- * This is the external routine.
- */
-STATIC int
-xfs_dir_leaf_replace(xfs_da_args_t *args)
-{
-        int index, retval;
-        xfs_dabuf_t *bp;
-        xfs_ino_t inum;
-        xfs_dir_leafblock_t *leaf;
-        xfs_dir_leaf_entry_t *entry;
-        xfs_dir_leaf_name_t *namest;
-        inum = args->inumber;
-        retval = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp,
-                                              XFS_DATA_FORK);
-        if (retval)
-                return(retval);
-        ASSERT(bp != NULL);
-        retval = xfs_dir_leaf_lookup_int(bp, args, &index);
-        if (retval == EEXIST) {
-                leaf = bp->data;
-                entry = &leaf->entries[index];
-                namest = XFS_DIR_LEAF_NAMESTRUCT(leaf, INT_GET(entry->nameidx, ARCH_CONVERT));
-                /* XXX - replace assert? */
-                XFS_DIR_SF_PUT_DIRINO(&inum, &namest->inumber);
-                xfs_da_log_buf(args->trans, bp,
-                    XFS_DA_LOGRANGE(leaf, namest, sizeof(namest->inumber)));
-                xfs_da_buf_done(bp);
-                retval = 0;
-        } else
-                xfs_da_brelse(args->trans, bp);
-        return(retval);
-}
-/*========================================================================
- * External routines when dirsize > XFS_LBSIZE(mp).
- *========================================================================*/
-/*
- * Add a name to a Btree-format directory.
- *
- * This will involve walking down the Btree, and may involve splitting
- * leaf nodes and even splitting intermediate nodes up to and including
- * the root node (a special case of an intermediate node).
- */
-STATIC int
-xfs_dir_node_addname(xfs_da_args_t *args)
-{
-        xfs_da_state_t *state;
-        xfs_da_state_blk_t *blk;
-        int retval, error;
-        /*
-         * Fill in bucket of arguments/results/context to carry around.
-         */
-        state = xfs_da_state_alloc();
-        state->args = args;
-        state->mp = args->dp->i_mount;
-        state->blocksize = state->mp->m_sb.sb_blocksize;
-        state->node_ents = state->mp->m_dir_node_ents;
-        /*
-         * Search to see if name already exists, and get back a pointer
-         * to where it should go.
-         */
-        error = xfs_da_node_lookup_int(state, &retval);
-        if (error)
-                retval = error;
-        if (retval != ENOENT)
-                goto error;
-        blk = &state->path.blk[ state->path.active-1 ];
-        ASSERT(blk->magic == XFS_DIR_LEAF_MAGIC);
-        retval = xfs_dir_leaf_add(blk->bp, args, blk->index);
-        if (retval == 0) {
-                /*
-                 * Addition succeeded, update Btree hashvals.
-                 */
-                if (!args->justcheck)
-                        xfs_da_fixhashpath(state, &state->path);
-        } else {
-                /*
-                 * Addition failed, split as many Btree elements as required.
-                 */
-                if (args->total == 0) {
-                        ASSERT(retval == ENOSPC);
-                        goto error;
-                }
-                retval = xfs_da_split(state);
-        }
-error:
-        xfs_da_state_free(state);
-        return(retval);
-}
-/*
- * Remove a name from a B-tree directory.
- *
- * This will involve walking down the Btree, and may involve joining
- * leaf nodes and even joining intermediate nodes up to and including
- * the root node (a special case of an intermediate node).
- */
-STATIC int
-xfs_dir_node_removename(xfs_da_args_t *args)
-{
-        xfs_da_state_t *state;
-        xfs_da_state_blk_t *blk;
-        int retval, error;
-        state = xfs_da_state_alloc();
-        state->args = args;
-        state->mp = args->dp->i_mount;
-        state->blocksize = state->mp->m_sb.sb_blocksize;
-        state->node_ents = state->mp->m_dir_node_ents;
-        /*
-         * Search to see if name exists, and get back a pointer to it.
-         */
-        error = xfs_da_node_lookup_int(state, &retval);
-        if (error)
-                retval = error;
-        if (retval != EEXIST) {
-                xfs_da_state_free(state);
-                return(retval);
-        }
-        /*
-         * Remove the name and update the hashvals in the tree.
-         */
-        blk = &state->path.blk[ state->path.active-1 ];
-        ASSERT(blk->magic == XFS_DIR_LEAF_MAGIC);
-        retval = xfs_dir_leaf_remove(args->trans, blk->bp, blk->index);
-        xfs_da_fixhashpath(state, &state->path);
-        /*
-         * Check to see if the tree needs to be collapsed.
-         */
-        error = 0;
-        if (retval) {
-                error = xfs_da_join(state);
-        }
-        xfs_da_state_free(state);
-        if (error)
-                return(error);
-        return(0);
-}
-/*
- * Look up a filename in a int directory.
- * Use an internal routine to actually do all the work.
- */
-STATIC int
-xfs_dir_node_lookup(xfs_da_args_t *args)
-{
-        xfs_da_state_t *state;
-        int retval, error, i;
-        state = xfs_da_state_alloc();
-        state->args = args;
-        state->mp = args->dp->i_mount;
-        state->blocksize = state->mp->m_sb.sb_blocksize;
-        state->node_ents = state->mp->m_dir_node_ents;
-        /*
-         * Search to see if name exists,
-         * and get back a pointer to it.
-         */
-        error = xfs_da_node_lookup_int(state, &retval);
-        if (error) {
-                retval = error;
-        }
-        /*
-         * If not in a transaction, we have to release all the buffers.
-         */
-        for (i = 0; i < state->path.active; i++) {
-                xfs_da_brelse(args->trans, state->path.blk[i].bp);
-                state->path.blk[i].bp = NULL;
-        }
-        xfs_da_state_free(state);
-        return(retval);
-}
-STATIC int
-xfs_dir_node_getdents(xfs_trans_t *trans, xfs_inode_t *dp, uio_t *uio,
-                                  int *eofp, xfs_dirent_t *dbp, xfs_dir_put_t put)
-{
-        xfs_da_intnode_t *node;
-        xfs_da_node_entry_t *btree;
-        xfs_dir_leafblock_t *leaf = NULL;
-        xfs_dablk_t bno, nextbno;
-        xfs_dahash_t cookhash;
-        xfs_mount_t *mp;
-        int error, eob, i;
-        xfs_dabuf_t *bp;
-        xfs_daddr_t nextda;
-        /*
-         * Pick up our context.
-         */
-        mp = dp->i_mount;
-        bp = NULL;
-        bno = XFS_DA_COOKIE_BNO(mp, uio->uio_offset);
-        cookhash = XFS_DA_COOKIE_HASH(mp, uio->uio_offset);
-        xfs_dir_trace_g_du("node: start", dp, uio);
-        /*
-         * Re-find our place, even if we're confused about what our place is.
-         *
-         * First we check the block number from the magic cookie, it is a
-         * cache of where we ended last time.  If we find a leaf block, and
-         * the starting hashval in that block is less than our desired
-         * hashval, then we run with it.
-         */
-        if (bno > 0) {
-                error = xfs_da_read_buf(trans, dp, bno, -2, &bp, XFS_DATA_FORK);
-                if ((error != 0) && (error != EFSCORRUPTED))
-                        return(error);
-                if (bp)
-                        leaf = bp->data;
-                if (bp && be16_to_cpu(leaf->hdr.info.magic) != XFS_DIR_LEAF_MAGIC) {
-                        xfs_dir_trace_g_dub("node: block not a leaf",
-                                                   dp, uio, bno);
-                        xfs_da_brelse(trans, bp);
-                        bp = NULL;
-                }
-                if (bp && INT_GET(leaf->entries[0].hashval, ARCH_CONVERT) > cookhash) {
-                        xfs_dir_trace_g_dub("node: leaf hash too large",
-                                                   dp, uio, bno);
-                        xfs_da_brelse(trans, bp);
-                        bp = NULL;
-                }
-                if (bp &&
-                    cookhash > INT_GET(leaf->entries[INT_GET(leaf->hdr.count, ARCH_CONVERT) - 1].hashval, ARCH_CONVERT)) {
-                        xfs_dir_trace_g_dub("node: leaf hash too small",
-                                                   dp, uio, bno);
-                        xfs_da_brelse(trans, bp);
-                        bp = NULL;
-                }
-        }
-        /*
-         * If we did not find a leaf block from the blockno in the cookie,
-         * or we there was no blockno in the cookie (eg: first time thru),
-         * the we start at the top of the Btree and re-find our hashval.
-         */
-        if (bp == NULL) {
-                xfs_dir_trace_g_du("node: start at root" , dp, uio);
-                bno = 0;
-                for (;;) {
-                        error = xfs_da_read_buf(trans, dp, bno, -1, &bp,
-                                                       XFS_DATA_FORK);
-                        if (error)
-                                return(error);
-                        if (bp == NULL)
-                                return(XFS_ERROR(EFSCORRUPTED));
-                        node = bp->data;
-                        if (be16_to_cpu(node->hdr.info.magic) != XFS_DA_NODE_MAGIC)
-                                break;
-                        btree = &node->btree[0];
-                        xfs_dir_trace_g_dun("node: node detail", dp, uio, node);
-                        for (i = 0; i < be16_to_cpu(node->hdr.count); btree++, i++) {
-                                if (be32_to_cpu(btree->hashval) >= cookhash) {
-                                        bno = be32_to_cpu(btree->before);
-                                        break;
-                                }
-                        }
-                        if (i == be16_to_cpu(node->hdr.count)) {
-                                xfs_da_brelse(trans, bp);
-                                xfs_dir_trace_g_du("node: hash beyond EOF",
-                                                          dp, uio);
-                                uio->uio_offset = XFS_DA_MAKE_COOKIE(mp, 0, 0,
-                                                             XFS_DA_MAXHASH);
-                                *eofp = 1;
-                                return(0);
-                        }
-                        xfs_dir_trace_g_dub("node: going to block",
-                                                   dp, uio, bno);
-                        xfs_da_brelse(trans, bp);
-                }
-        }
-        ASSERT(cookhash != XFS_DA_MAXHASH);
-        /*
-         * We've dropped down to the (first) leaf block that contains the
-         * hashval we are interested in.  Continue rolling upward thru the
-         * leaf blocks until we fill up our buffer.
-         */
-        for (;;) {
-                leaf = bp->data;
-                if (unlikely(be16_to_cpu(leaf->hdr.info.magic) != XFS_DIR_LEAF_MAGIC)) {
-                        xfs_dir_trace_g_dul("node: not a leaf", dp, uio, leaf);
-                        xfs_da_brelse(trans, bp);
-                        XFS_CORRUPTION_ERROR("xfs_dir_node_getdents(1)",
-                                             XFS_ERRLEVEL_LOW, mp, leaf);
-                        return XFS_ERROR(EFSCORRUPTED);
-                }
-                xfs_dir_trace_g_dul("node: leaf detail", dp, uio, leaf);
-                if ((nextbno = be32_to_cpu(leaf->hdr.info.forw))) {
-                        nextda = xfs_da_reada_buf(trans, dp, nextbno,
-                                                  XFS_DATA_FORK);
-                } else
-                        nextda = -1;
-                error = xfs_dir_leaf_getdents_int(bp, dp, bno, uio, &eob, dbp,
-                                                  put, nextda);
-                xfs_da_brelse(trans, bp);
-                bno = nextbno;
-                if (eob) {
-                        xfs_dir_trace_g_dub("node: E-O-B", dp, uio, bno);
-                        *eofp = 0;
-                        return(error);
-                }
-                if (bno == 0)
-                        break;
-                error = xfs_da_read_buf(trans, dp, bno, nextda, &bp,
-                                        XFS_DATA_FORK);
-                if (error)
-                        return(error);
-                if (unlikely(bp == NULL)) {
-                        XFS_ERROR_REPORT("xfs_dir_node_getdents(2)",
-                                         XFS_ERRLEVEL_LOW, mp);
-                        return(XFS_ERROR(EFSCORRUPTED));
-                }
-        }
-        *eofp = 1;
-        xfs_dir_trace_g_du("node: E-O-F", dp, uio);
-        return(0);
-}
-/*
- * Look up a filename in an int directory, replace the inode number.
- * Use an internal routine to actually do the lookup.
- */
-STATIC int
-xfs_dir_node_replace(xfs_da_args_t *args)
-{
-        xfs_da_state_t *state;
-        xfs_da_state_blk_t *blk;
-        xfs_dir_leafblock_t *leaf;
-        xfs_dir_leaf_entry_t *entry;
-        xfs_dir_leaf_name_t *namest;
-        xfs_ino_t inum;
-        int retval, error, i;
-        xfs_dabuf_t *bp;
-        state = xfs_da_state_alloc();
-        state->args = args;
-        state->mp = args->dp->i_mount;
-        state->blocksize = state->mp->m_sb.sb_blocksize;
-        state->node_ents = state->mp->m_dir_node_ents;
-        inum = args->inumber;
-        /*
-         * Search to see if name exists,
-         * and get back a pointer to it.
-         */
-        error = xfs_da_node_lookup_int(state, &retval);
-        if (error) {
-                retval = error;
-        }
-        if (retval == EEXIST) {
-                blk = &state->path.blk[state->path.active - 1];
-                ASSERT(blk->magic == XFS_DIR_LEAF_MAGIC);
-                bp = blk->bp;
-                leaf = bp->data;
-                entry = &leaf->entries[blk->index];
-                namest = XFS_DIR_LEAF_NAMESTRUCT(leaf, INT_GET(entry->nameidx, ARCH_CONVERT));
-                /* XXX - replace assert ? */
-                XFS_DIR_SF_PUT_DIRINO(&inum, &namest->inumber);
-                xfs_da_log_buf(args->trans, bp,
-                    XFS_DA_LOGRANGE(leaf, namest, sizeof(namest->inumber)));
-                xfs_da_buf_done(bp);
-                blk->bp = NULL;
-                retval = 0;
-        } else {
-                i = state->path.active - 1;
-                xfs_da_brelse(args->trans, state->path.blk[i].bp);
-                state->path.blk[i].bp = NULL;
-        }
-        for (i = 0; i < state->path.active - 1; i++) {
-                xfs_da_brelse(args->trans, state->path.blk[i].bp);
-                state->path.blk[i].bp = NULL;
-        }
-        xfs_da_state_free(state);
-        return(retval);
-}
-#if defined(XFS_DIR_TRACE)
-/*
- * Add a trace buffer entry for an inode and a uio.
- */
-void
-xfs_dir_trace_g_du(char *where, xfs_inode_t *dp, uio_t *uio)
-{
-        xfs_dir_trace_enter(XFS_DIR_KTRACE_G_DU, where,
-                     (void *)dp, (void *)dp->i_mount,
-                     (void *)((unsigned long)(uio->uio_offset >> 32)),
-                     (void *)((unsigned long)(uio->uio_offset & 0xFFFFFFFF)),
-                     (void *)(unsigned long)uio->uio_resid,
-                     NULL, NULL, NULL, NULL, NULL, NULL, NULL);
-}
-/*
- * Add a trace buffer entry for an inode and a uio.
- */
-void
-xfs_dir_trace_g_dub(char *where, xfs_inode_t *dp, uio_t *uio, xfs_dablk_t bno)
-{
-        xfs_dir_trace_enter(XFS_DIR_KTRACE_G_DUB, where,
-                     (void *)dp, (void *)dp->i_mount,
-                     (void *)((unsigned long)(uio->uio_offset >> 32)),
-                     (void *)((unsigned long)(uio->uio_offset & 0xFFFFFFFF)),
-                     (void *)(unsigned long)uio->uio_resid,
-                     (void *)(unsigned long)bno,
-                     NULL, NULL, NULL, NULL, NULL, NULL);
-}
-/*
- * Add a trace buffer entry for an inode and a uio.
- */
-void
-xfs_dir_trace_g_dun(char *where, xfs_inode_t *dp, uio_t *uio,
-                        xfs_da_intnode_t *node)
-{
-        int     last = be16_to_cpu(node->hdr.count) - 1;
-        xfs_dir_trace_enter(XFS_DIR_KTRACE_G_DUN, where,
-                     (void *)dp, (void *)dp->i_mount,
-                     (void *)((unsigned long)(uio->uio_offset >> 32)),
-                     (void *)((unsigned long)(uio->uio_offset & 0xFFFFFFFF)),
-                     (void *)(unsigned long)uio->uio_resid,
-                     (void *)(unsigned long)be32_to_cpu(node->hdr.info.forw),
-                     (void *)(unsigned long)
-                        be16_to_cpu(node->hdr.count),
-                     (void *)(unsigned long)
-                        be32_to_cpu(node->btree[0].hashval),
-                     (void *)(unsigned long)
-                        be32_to_cpu(node->btree[last].hashval),
-                     NULL, NULL, NULL);
-}
-/*
- * Add a trace buffer entry for an inode and a uio.
- */
-void
-xfs_dir_trace_g_dul(char *where, xfs_inode_t *dp, uio_t *uio,
-                        xfs_dir_leafblock_t *leaf)
-{
-        int     last = INT_GET(leaf->hdr.count, ARCH_CONVERT) - 1;
-        xfs_dir_trace_enter(XFS_DIR_KTRACE_G_DUL, where,
-                     (void *)dp, (void *)dp->i_mount,
-                     (void *)((unsigned long)(uio->uio_offset >> 32)),
-                     (void *)((unsigned long)(uio->uio_offset & 0xFFFFFFFF)),
-                     (void *)(unsigned long)uio->uio_resid,
-                     (void *)(unsigned long)be32_to_cpu(leaf->hdr.info.forw),
-                     (void *)(unsigned long)
-                        INT_GET(leaf->hdr.count, ARCH_CONVERT),
-                     (void *)(unsigned long)
-                        INT_GET(leaf->entries[0].hashval, ARCH_CONVERT),
-                     (void *)(unsigned long)
-                        INT_GET(leaf->entries[last].hashval, ARCH_CONVERT),
-                     NULL, NULL, NULL);
-}
-/*
- * Add a trace buffer entry for an inode and a uio.
- */
-void
-xfs_dir_trace_g_due(char *where, xfs_inode_t *dp, uio_t *uio,
-                        xfs_dir_leaf_entry_t *entry)
-{
-        xfs_dir_trace_enter(XFS_DIR_KTRACE_G_DUE, where,
-                     (void *)dp, (void *)dp->i_mount,
-                     (void *)((unsigned long)(uio->uio_offset >> 32)),
-                     (void *)((unsigned long)(uio->uio_offset & 0xFFFFFFFF)),
-                     (void *)(unsigned long)uio->uio_resid,
-                     (void *)(unsigned long)
-                        INT_GET(entry->hashval, ARCH_CONVERT),
-                     NULL, NULL, NULL, NULL, NULL, NULL);
-}
-/*
- * Add a trace buffer entry for an inode and a uio.
- */
-void
-xfs_dir_trace_g_duc(char *where, xfs_inode_t *dp, uio_t *uio, xfs_off_t cookie)
-{
-        xfs_dir_trace_enter(XFS_DIR_KTRACE_G_DUC, where,
-                     (void *)dp, (void *)dp->i_mount,
-                     (void *)((unsigned long)(uio->uio_offset >> 32)),
-                     (void *)((unsigned long)(uio->uio_offset & 0xFFFFFFFF)),
-                     (void *)(unsigned long)uio->uio_resid,
-                     (void *)((unsigned long)(cookie >> 32)),
-                     (void *)((unsigned long)(cookie & 0xFFFFFFFF)),
-                     NULL, NULL, NULL, NULL, NULL);
-}
-/*
- * Add a trace buffer entry for the arguments given to the routine,
- * generic form.
- */
-void
-xfs_dir_trace_enter(int type, char *where,
-                        void * a0, void * a1,
-                        void * a2, void * a3,
-                        void * a4, void * a5,
-                        void * a6, void * a7,
-                        void * a8, void * a9,
-                        void * a10, void * a11)
-{
-        ASSERT(xfs_dir_trace_buf);
-        ktrace_enter(xfs_dir_trace_buf, (void *)(unsigned long)type,
-                                        (void *)where,
-                                        (void *)a0, (void *)a1, (void *)a2,
-                                        (void *)a3, (void *)a4, (void *)a5,
-                                        (void *)a6, (void *)a7, (void *)a8,
-                                        (void *)a9, (void *)a10, (void *)a11,
-                                        NULL, NULL);
-}
-#endif  /* XFS_DIR_TRACE */
diff --git a/fs/xfs/xfs_dir.h b/fs/xfs/xfs_dir.h
deleted file mode 100644
index 8cc8afb9f6c0..000000000000
--- a/fs/xfs/xfs_dir.h
+++ /dev/null
@@ -1,142 +0,0 @@
-/*
- * Copyright (c) 2000,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_DIR_H__
-#define __XFS_DIR_H__
-/*
- * Large directories are structured around Btrees where all the data
- * elements are in the leaf nodes.  Filenames are hashed into an int,
- * then that int is used as the index into the Btree.  Since the hashval
- * of a filename may not be unique, we may have duplicate keys.  The
- * internal links in the Btree are logical block offsets into the file.
- *
- * Small directories use a different format and are packed as tightly
- * as possible so as to fit into the literal area of the inode.
- */
-/*========================================================================
- * Function prototypes for the kernel.
- *========================================================================*/
-struct uio;
-struct xfs_bmap_free;
-struct xfs_da_args;
-struct xfs_dinode;
-struct xfs_inode;
-struct xfs_mount;
-struct xfs_trans;
-/*
- * Directory function types.
- * Put in structures (xfs_dirops_t) for v1 and v2 directories.
- */
-typedef void    (*xfs_dir_mount_t)(struct xfs_mount *mp);
-typedef int     (*xfs_dir_isempty_t)(struct xfs_inode *dp);
-typedef int     (*xfs_dir_init_t)(struct xfs_trans *tp,
-                                  struct xfs_inode *dp,
-                                  struct xfs_inode *pdp);
-typedef int     (*xfs_dir_createname_t)(struct xfs_trans *tp,
-                                        struct xfs_inode *dp,
-                                        char *name,
-                                        int namelen,
-                                        xfs_ino_t inum,
-                                        xfs_fsblock_t *first,
-                                        struct xfs_bmap_free *flist,
-                                        xfs_extlen_t total);
-typedef int     (*xfs_dir_lookup_t)(struct xfs_trans *tp,
-                                    struct xfs_inode *dp,
-                                    char *name,
-                                    int namelen,
-                                    xfs_ino_t *inum);
-typedef int     (*xfs_dir_removename_t)(struct xfs_trans *tp,
-                                        struct xfs_inode *dp,
-                                        char *name,
-                                        int namelen,
-                                        xfs_ino_t ino,
-                                        xfs_fsblock_t *first,
-                                        struct xfs_bmap_free *flist,
-                                        xfs_extlen_t total);
-typedef int     (*xfs_dir_getdents_t)(struct xfs_trans *tp,
-                                      struct xfs_inode *dp,
-                                      struct uio *uio,
-                                      int *eofp);
-typedef int     (*xfs_dir_replace_t)(struct xfs_trans *tp,
-                                     struct xfs_inode *dp,
-                                     char *name,
-                                     int namelen,
-                                     xfs_ino_t inum,
-                                     xfs_fsblock_t *first,
-                                     struct xfs_bmap_free *flist,
-                                     xfs_extlen_t total);
-typedef int     (*xfs_dir_canenter_t)(struct xfs_trans *tp,
-                                      struct xfs_inode *dp,
-                                      char *name,
-                                      int namelen);
-typedef int     (*xfs_dir_shortform_validate_ondisk_t)(struct xfs_mount *mp,
-                                                       struct xfs_dinode *dip);
-typedef int     (*xfs_dir_shortform_to_single_t)(struct xfs_da_args *args);
-typedef struct xfs_dirops {
-        xfs_dir_mount_t                         xd_mount;
-        xfs_dir_isempty_t                       xd_isempty;
-        xfs_dir_init_t                          xd_init;
-        xfs_dir_createname_t                    xd_createname;
-        xfs_dir_lookup_t                        xd_lookup;
-        xfs_dir_removename_t                    xd_removename;
-        xfs_dir_getdents_t                      xd_getdents;
-        xfs_dir_replace_t                       xd_replace;
-        xfs_dir_canenter_t                      xd_canenter;
-        xfs_dir_shortform_validate_ondisk_t     xd_shortform_validate_ondisk;
-        xfs_dir_shortform_to_single_t           xd_shortform_to_single;
-} xfs_dirops_t;
-/*
- * Overall external interface routines.
- */
-void    xfs_dir_startup(void);  /* called exactly once */
-#define XFS_DIR_MOUNT(mp)       \
-        ((mp)->m_dirops.xd_mount(mp))
-#define XFS_DIR_ISEMPTY(mp,dp)  \
-        ((mp)->m_dirops.xd_isempty(dp))
-#define XFS_DIR_INIT(mp,tp,dp,pdp)      \
-        ((mp)->m_dirops.xd_init(tp,dp,pdp))
-#define XFS_DIR_CREATENAME(mp,tp,dp,name,namelen,inum,first,flist,total) \
-        ((mp)->m_dirops.xd_createname(tp,dp,name,namelen,inum,first,flist,\
-                                      total))
-#define XFS_DIR_LOOKUP(mp,tp,dp,name,namelen,inum)      \
-        ((mp)->m_dirops.xd_lookup(tp,dp,name,namelen,inum))
-#define XFS_DIR_REMOVENAME(mp,tp,dp,name,namelen,ino,first,flist,total) \
-        ((mp)->m_dirops.xd_removename(tp,dp,name,namelen,ino,first,flist,total))
-#define XFS_DIR_GETDENTS(mp,tp,dp,uio,eofp)     \
-        ((mp)->m_dirops.xd_getdents(tp,dp,uio,eofp))
-#define XFS_DIR_REPLACE(mp,tp,dp,name,namelen,inum,first,flist,total)   \
-        ((mp)->m_dirops.xd_replace(tp,dp,name,namelen,inum,first,flist,total))
-#define XFS_DIR_CANENTER(mp,tp,dp,name,namelen) \
-        ((mp)->m_dirops.xd_canenter(tp,dp,name,namelen))
-#define XFS_DIR_SHORTFORM_VALIDATE_ONDISK(mp,dip)       \
-        ((mp)->m_dirops.xd_shortform_validate_ondisk(mp,dip))
-#define XFS_DIR_SHORTFORM_TO_SINGLE(mp,args)    \
-        ((mp)->m_dirops.xd_shortform_to_single(args))
-#define XFS_DIR_IS_V1(mp)       ((mp)->m_dirversion == 1)
-#define XFS_DIR_IS_V2(mp)       ((mp)->m_dirversion == 2)
-extern xfs_dirops_t xfsv1_dirops;
-extern xfs_dirops_t xfsv2_dirops;
-#endif  /* __XFS_DIR_H__ */
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c
index 022c8398ab62..8edbe1adb95b 100644
--- a/fs/xfs/xfs_dir2.c
+++ b/fs/xfs/xfs_dir2.c
@@ -24,21 +24,18 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_da_btree.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_inode_item.h"
 #include "xfs_bmap.h"
-#include "xfs_dir_leaf.h"
 #include "xfs_dir2_data.h"
 #include "xfs_dir2_leaf.h"
 #include "xfs_dir2_block.h"
@@ -46,69 +43,14 @@
 #include "xfs_dir2_trace.h"
 #include "xfs_error.h"
-/*
- * Declarations for interface routines.
- */
-static void     xfs_dir2_mount(xfs_mount_t *mp);
-static int      xfs_dir2_isempty(xfs_inode_t *dp);
-static int      xfs_dir2_init(xfs_trans_t *tp, xfs_inode_t *dp,
-                              xfs_inode_t *pdp);
-static int      xfs_dir2_createname(xfs_trans_t *tp, xfs_inode_t *dp,
-                                    char *name, int namelen, xfs_ino_t inum,
-                                    xfs_fsblock_t *first,
-                                    xfs_bmap_free_t *flist, xfs_extlen_t total);
-static int      xfs_dir2_lookup(xfs_trans_t *tp, xfs_inode_t *dp, char *name,
-                                int namelen, xfs_ino_t *inum);
-static int      xfs_dir2_removename(xfs_trans_t *tp, xfs_inode_t *dp,
-                                    char *name, int namelen, xfs_ino_t ino,
-                                    xfs_fsblock_t *first,
-                                    xfs_bmap_free_t *flist, xfs_extlen_t total);
-static int      xfs_dir2_getdents(xfs_trans_t *tp, xfs_inode_t *dp, uio_t *uio,
-                                  int *eofp);
-static int      xfs_dir2_replace(xfs_trans_t *tp, xfs_inode_t *dp, char *name,
-                                 int namelen, xfs_ino_t inum,
-                                 xfs_fsblock_t *first, xfs_bmap_free_t *flist,
-                                 xfs_extlen_t total);
-static int      xfs_dir2_canenter(xfs_trans_t *tp, xfs_inode_t *dp, char *name,
-                                  int namelen);
-static int      xfs_dir2_shortform_validate_ondisk(xfs_mount_t *mp,
-                                                   xfs_dinode_t *dip);
-/*
- * Utility routine declarations.
- */
 static int      xfs_dir2_put_dirent64_direct(xfs_dir2_put_args_t *pa);
 static int      xfs_dir2_put_dirent64_uio(xfs_dir2_put_args_t *pa);
-/*
+void
- * Directory operations vector.
+xfs_dir_mount(
- */
+        xfs_mount_t     *mp)
-xfs_dirops_t    xfsv2_dirops = {
-        .xd_mount                       = xfs_dir2_mount,
-        .xd_isempty                     = xfs_dir2_isempty,
-        .xd_init                        = xfs_dir2_init,
-        .xd_createname                  = xfs_dir2_createname,
-        .xd_lookup                      = xfs_dir2_lookup,
-        .xd_removename                  = xfs_dir2_removename,
-        .xd_getdents                    = xfs_dir2_getdents,
-        .xd_replace                     = xfs_dir2_replace,
-        .xd_canenter                    = xfs_dir2_canenter,
-        .xd_shortform_validate_ondisk   = xfs_dir2_shortform_validate_ondisk,
-        .xd_shortform_to_single         = xfs_dir2_sf_to_block,
-};
-/*
- * Interface routines.
- */
-/*
- * Initialize directory-related fields in the mount structure.
- */
-static void
-xfs_dir2_mount(
-        xfs_mount_t     *mp)            /* filesystem mount point */
 {
-        mp->m_dirversion = 2;
+        ASSERT(XFS_SB_VERSION_HASDIRV2(&mp->m_sb));
        ASSERT((1 << (mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog)) <=
               XFS_MAX_BLOCKSIZE);
        mp->m_dirblksize = 1 << (mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog);
@@ -128,19 +70,15 @@ xfs_dir2_mount(
 /*
 * Return 1 if directory contains only "." and "..".
 */
-static int                              /* return code */
+int
-xfs_dir2_isempty(
+xfs_dir_isempty(
-        xfs_inode_t     *dp)            /* incore inode structure */
+        xfs_inode_t     *dp)
 {
-        xfs_dir2_sf_t   *sfp;           /* shortform directory structure */
+        xfs_dir2_sf_t   *sfp;
        ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
-        /*
+        if (dp->i_d.di_size == 0)       /* might happen during shutdown. */
-         * Might happen during shutdown.
-         */
-        if (dp->i_d.di_size == 0) {
                return 1;
-        }
        if (dp->i_d.di_size > XFS_IFORK_DSIZE(dp))
                return 0;
        sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
@@ -148,53 +86,83 @@ xfs_dir2_isempty(
 }
 /*
+ * Validate a given inode number.
+ */
+int
+xfs_dir_ino_validate(
+        xfs_mount_t     *mp,
+        xfs_ino_t       ino)
+{
+        xfs_agblock_t   agblkno;
+        xfs_agino_t     agino;
+        xfs_agnumber_t  agno;
+        int             ino_ok;
+        int             ioff;
+        agno = XFS_INO_TO_AGNO(mp, ino);
+        agblkno = XFS_INO_TO_AGBNO(mp, ino);
+        ioff = XFS_INO_TO_OFFSET(mp, ino);
+        agino = XFS_OFFBNO_TO_AGINO(mp, agblkno, ioff);
+        ino_ok =
+                agno < mp->m_sb.sb_agcount &&
+                agblkno < mp->m_sb.sb_agblocks &&
+                agblkno != 0 &&
+                ioff < (1 << mp->m_sb.sb_inopblog) &&
+                XFS_AGINO_TO_INO(mp, agno, agino) == ino;
+        if (unlikely(XFS_TEST_ERROR(!ino_ok, mp, XFS_ERRTAG_DIR_INO_VALIDATE,
+                        XFS_RANDOM_DIR_INO_VALIDATE))) {
+                xfs_fs_cmn_err(CE_WARN, mp, "Invalid inode number 0x%Lx",
+                                (unsigned long long) ino);
+                XFS_ERROR_REPORT("xfs_dir_ino_validate", XFS_ERRLEVEL_LOW, mp);
+                return XFS_ERROR(EFSCORRUPTED);
+        }
+        return 0;
+}
+/*
 * Initialize a directory with its "." and ".." entries.
 */
-static int                              /* error */
+int
-xfs_dir2_init(
+xfs_dir_init(
-        xfs_trans_t     *tp,            /* transaction pointer */
+        xfs_trans_t     *tp,
-        xfs_inode_t     *dp,            /* incore directory inode */
+        xfs_inode_t     *dp,
-        xfs_inode_t     *pdp)           /* incore parent directory inode */
+        xfs_inode_t     *pdp)
 {
-        xfs_da_args_t   args;           /* operation arguments */
+        xfs_da_args_t   args;
-        int             error;          /* error return value */
+        int             error;
        memset((char *)&args, 0, sizeof(args));
        args.dp = dp;
        args.trans = tp;
        ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
-        if ((error = xfs_dir_ino_validate(tp->t_mountp, pdp->i_ino))) {
+        if ((error = xfs_dir_ino_validate(tp->t_mountp, pdp->i_ino)))
                return error;
-        }
        return xfs_dir2_sf_create(&args, pdp->i_ino);
 }
 /*
  Enter a name in a directory.
 */
-static int                                      /* error */
+int
-xfs_dir2_createname(
+xfs_dir_createname(
-        xfs_trans_t             *tp,            /* transaction pointer */
+        xfs_trans_t             *tp,
-        xfs_inode_t             *dp,            /* incore directory inode */
+        xfs_inode_t             *dp,
-        char                    *name,          /* new entry name */
+        char                    *name,
-        int                     namelen,        /* new entry name length */
+        int                     namelen,
        xfs_ino_t               inum,           /* new entry inode number */
        xfs_fsblock_t           *first,         /* bmap's firstblock */
        xfs_bmap_free_t         *flist,         /* bmap's freeblock list */
        xfs_extlen_t            total)          /* bmap's total block count */
 {
-        xfs_da_args_t           args;           /* operation arguments */
+        xfs_da_args_t           args;
-        int                     rval;           /* return value */
+        int                     rval;
        int                     v;              /* type-checking value */
        ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
-        if ((rval = xfs_dir_ino_validate(tp->t_mountp, inum))) {
+        if ((rval = xfs_dir_ino_validate(tp->t_mountp, inum)))
                return rval;
-        }
        XFS_STATS_INC(xs_dir_create);
-        /*
-         * Fill in the arg structure for this request.
-         */
        args.name = name;
        args.namelen = namelen;
        args.hashval = xfs_da_hashname(name, namelen);
@@ -207,18 +175,16 @@ xfs_dir2_createname(
        args.trans = tp;
        args.justcheck = 0;
        args.addname = args.oknoent = 1;
-        /*
-         * Decide on what work routines to call based on the inode size.
-         */
        if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
                rval = xfs_dir2_sf_addname(&args);
-        else if ((rval = xfs_dir2_isblock(tp, dp, &v))) {
+        else if ((rval = xfs_dir2_isblock(tp, dp, &v)))
                return rval;
-        } else if (v)
+        else if (v)
                rval = xfs_dir2_block_addname(&args);
-        else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) {
+        else if ((rval = xfs_dir2_isleaf(tp, dp, &v)))
                return rval;
-        } else if (v)
+        else if (v)
                rval = xfs_dir2_leaf_addname(&args);
        else
                rval = xfs_dir2_node_addname(&args);
@@ -228,24 +194,21 @@ xfs_dir2_createname(
 /*
 * Lookup a name in a directory, give back the inode number.
 */
-static int                              /* error */
+int
-xfs_dir2_lookup(
+xfs_dir_lookup(
-        xfs_trans_t     *tp,            /* transaction pointer */
+        xfs_trans_t     *tp,
-        xfs_inode_t     *dp,            /* incore directory inode */
+        xfs_inode_t     *dp,
-        char            *name,          /* lookup name */
+        char            *name,
-        int             namelen,        /* lookup name length */
+        int             namelen,
        xfs_ino_t       *inum)          /* out: inode number */
 {
-        xfs_da_args_t   args;           /* operation arguments */
+        xfs_da_args_t   args;
-        int             rval;           /* return value */
+        int             rval;
        int             v;              /* type-checking value */
        ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
        XFS_STATS_INC(xs_dir_lookup);
-        /*
-         * Fill in the arg structure for this request.
-         */
        args.name = name;
        args.namelen = namelen;
        args.hashval = xfs_da_hashname(name, namelen);
@@ -258,18 +221,16 @@ xfs_dir2_lookup(
        args.trans = tp;
        args.justcheck = args.addname = 0;
        args.oknoent = 1;
-        /*
-         * Decide on what work routines to call based on the inode size.
-         */
        if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
                rval = xfs_dir2_sf_lookup(&args);
-        else if ((rval = xfs_dir2_isblock(tp, dp, &v))) {
+        else if ((rval = xfs_dir2_isblock(tp, dp, &v)))
                return rval;
-        } else if (v)
+        else if (v)
                rval = xfs_dir2_block_lookup(&args);
-        else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) {
+        else if ((rval = xfs_dir2_isleaf(tp, dp, &v)))
                return rval;
-        } else if (v)
+        else if (v)
                rval = xfs_dir2_leaf_lookup(&args);
        else
                rval = xfs_dir2_node_lookup(&args);
@@ -283,26 +244,24 @@ xfs_dir2_lookup(
 /*
 * Remove an entry from a directory.
 */
-static int                              /* error */
+int
-xfs_dir2_removename(
+xfs_dir_removename(
-        xfs_trans_t     *tp,            /* transaction pointer */
+        xfs_trans_t     *tp,
-        xfs_inode_t     *dp,            /* incore directory inode */
+        xfs_inode_t     *dp,
-        char            *name,          /* name of entry to remove */
+        char            *name,
-        int             namelen,        /* name length of entry to remove */
+        int             namelen,
-        xfs_ino_t       ino,            /* inode number of entry to remove */
+        xfs_ino_t       ino,
        xfs_fsblock_t   *first,         /* bmap's firstblock */
        xfs_bmap_free_t *flist,         /* bmap's freeblock list */
        xfs_extlen_t    total)          /* bmap's total block count */
 {
-        xfs_da_args_t   args;           /* operation arguments */
+        xfs_da_args_t   args;
-        int             rval;           /* return value */
+        int             rval;
        int             v;              /* type-checking value */
        ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
        XFS_STATS_INC(xs_dir_remove);
-        /*
-         * Fill in the arg structure for this request.
-         */
        args.name = name;
        args.namelen = namelen;
        args.hashval = xfs_da_hashname(name, namelen);
@@ -314,18 +273,16 @@ xfs_dir2_removename(
        args.whichfork = XFS_DATA_FORK;
        args.trans = tp;
        args.justcheck = args.addname = args.oknoent = 0;
-        /*
-         * Decide on what work routines to call based on the inode size.
-         */
        if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
                rval = xfs_dir2_sf_removename(&args);
-        else if ((rval = xfs_dir2_isblock(tp, dp, &v))) {
+        else if ((rval = xfs_dir2_isblock(tp, dp, &v)))
                return rval;
-        } else if (v)
+        else if (v)
                rval = xfs_dir2_block_removename(&args);
-        else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) {
+        else if ((rval = xfs_dir2_isleaf(tp, dp, &v)))
                return rval;
-        } else if (v)
+        else if (v)
                rval = xfs_dir2_leaf_removename(&args);
        else
                rval = xfs_dir2_node_removename(&args);
@@ -335,10 +292,10 @@ xfs_dir2_removename(
 /*
 * Read a directory.
 */
-static int                              /* error */
+int
-xfs_dir2_getdents(
+xfs_dir_getdents(
-        xfs_trans_t     *tp,            /* transaction pointer */
+        xfs_trans_t     *tp,
-        xfs_inode_t     *dp,            /* incore directory inode */
+        xfs_inode_t     *dp,
        uio_t           *uio,           /* caller's buffer control */
        int             *eofp)          /* out: eof reached */
 {
@@ -367,14 +324,11 @@ xfs_dir2_getdents(
        }
        *eofp = 0;
-        /*
-         * Decide on what work routines to call based on the inode size.
-         */
        if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
                rval = xfs_dir2_sf_getdents(dp, uio, eofp, dbp, put);
-        else if ((rval = xfs_dir2_isblock(tp, dp, &v))) {
+        else if ((rval = xfs_dir2_isblock(tp, dp, &v)))
                ;
-        } else if (v)
+        else if (v)
                rval = xfs_dir2_block_getdents(tp, dp, uio, eofp, dbp, put);
        else
                rval = xfs_dir2_leaf_getdents(tp, dp, uio, eofp, dbp, put);
@@ -386,29 +340,26 @@ xfs_dir2_getdents(
 /*
 * Replace the inode number of a directory entry.
 */
-static int                              /* error */
+int
-xfs_dir2_replace(
+xfs_dir_replace(
-        xfs_trans_t     *tp,            /* transaction pointer */
+        xfs_trans_t     *tp,
-        xfs_inode_t     *dp,            /* incore directory inode */
+        xfs_inode_t     *dp,
        char            *name,          /* name of entry to replace */
-        int             namelen,        /* name length of entry to replace */
+        int             namelen,
        xfs_ino_t       inum,           /* new inode number */
        xfs_fsblock_t   *first,         /* bmap's firstblock */
        xfs_bmap_free_t *flist,         /* bmap's freeblock list */
        xfs_extlen_t    total)          /* bmap's total block count */
 {
-        xfs_da_args_t   args;           /* operation arguments */
+        xfs_da_args_t   args;
-        int             rval;           /* return value */
+        int             rval;
        int             v;              /* type-checking value */
        ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
-        if ((rval = xfs_dir_ino_validate(tp->t_mountp, inum))) {
+        if ((rval = xfs_dir_ino_validate(tp->t_mountp, inum)))
                return rval;
-        }
-        /*
-         * Fill in the arg structure for this request.
-         */
        args.name = name;
        args.namelen = namelen;
        args.hashval = xfs_da_hashname(name, namelen);
@@ -420,18 +371,16 @@ xfs_dir2_replace(
        args.whichfork = XFS_DATA_FORK;
        args.trans = tp;
        args.justcheck = args.addname = args.oknoent = 0;
-        /*
-         * Decide on what work routines to call based on the inode size.
-         */
        if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
                rval = xfs_dir2_sf_replace(&args);
-        else if ((rval = xfs_dir2_isblock(tp, dp, &v))) {
+        else if ((rval = xfs_dir2_isblock(tp, dp, &v)))
                return rval;
-        } else if (v)
+        else if (v)
                rval = xfs_dir2_block_replace(&args);
-        else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) {
+        else if ((rval = xfs_dir2_isleaf(tp, dp, &v)))
                return rval;
-        } else if (v)
+        else if (v)
                rval = xfs_dir2_leaf_replace(&args);
        else
                rval = xfs_dir2_node_replace(&args);
@@ -441,21 +390,19 @@ xfs_dir2_replace(
 /*
 * See if this entry can be added to the directory without allocating space.
 */
-static int                              /* error */
+int
-xfs_dir2_canenter(
+xfs_dir_canenter(
-        xfs_trans_t     *tp,            /* transaction pointer */
+        xfs_trans_t     *tp,
-        xfs_inode_t     *dp,            /* incore directory inode */
+        xfs_inode_t     *dp,
        char            *name,          /* name of entry to add */
-        int             namelen)        /* name length of entry to add */
+        int             namelen)
 {
-        xfs_da_args_t   args;           /* operation arguments */
+        xfs_da_args_t   args;
-        int             rval;           /* return value */
+        int             rval;
        int             v;              /* type-checking value */
        ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
-        /*
-         * Fill in the arg structure for this request.
-         */
        args.name = name;
        args.namelen = namelen;
        args.hashval = xfs_da_hashname(name, namelen);
@@ -467,18 +414,16 @@ xfs_dir2_canenter(
        args.whichfork = XFS_DATA_FORK;
        args.trans = tp;
        args.justcheck = args.addname = args.oknoent = 1;
-        /*
-         * Decide on what work routines to call based on the inode size.
-         */
        if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
                rval = xfs_dir2_sf_addname(&args);
-        else if ((rval = xfs_dir2_isblock(tp, dp, &v))) {
+        else if ((rval = xfs_dir2_isblock(tp, dp, &v)))
                return rval;
-        } else if (v)
+        else if (v)
                rval = xfs_dir2_block_addname(&args);
-        else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) {
+        else if ((rval = xfs_dir2_isleaf(tp, dp, &v)))
                return rval;
-        } else if (v)
+        else if (v)
                rval = xfs_dir2_leaf_addname(&args);
        else
                rval = xfs_dir2_node_addname(&args);
@@ -486,19 +431,6 @@ xfs_dir2_canenter(
 }
 /*
- * Dummy routine for shortform inode validation.
- * Can't really do this.
- */
-/* ARGSUSED */
-static int                              /* error */
-xfs_dir2_shortform_validate_ondisk(
-        xfs_mount_t     *mp,            /* filesystem mount point */
-        xfs_dinode_t    *dip)           /* ondisk inode */
-{
-        return 0;
-}
-/*
 * Utility routines.
 */
@@ -507,24 +439,24 @@ xfs_dir2_shortform_validate_ondisk(
 * This routine is for data and free blocks, not leaf/node blocks
 * which are handled by xfs_da_grow_inode.
 */
-int                                     /* error */
+int
 xfs_dir2_grow_inode(
-        xfs_da_args_t   *args,          /* operation arguments */
+        xfs_da_args_t   *args,
        int             space,          /* v2 dir's space XFS_DIR2_xxx_SPACE */
        xfs_dir2_db_t   *dbp)           /* out: block number added */
 {
        xfs_fileoff_t   bno;            /* directory offset of new block */
        int             count;          /* count of filesystem blocks */
        xfs_inode_t     *dp;            /* incore directory inode */
-        int             error;          /* error return value */
+        int             error;
        int             got;            /* blocks actually mapped */
-        int             i;              /* temp mapping index */
+        int             i;
        xfs_bmbt_irec_t map;            /* single structure for bmap */
        int             mapi;           /* mapping index */
        xfs_bmbt_irec_t *mapp;          /* bmap mapping structure(s) */
-        xfs_mount_t     *mp;            /* filesystem mount point */
+        xfs_mount_t     *mp;
        int             nmap;           /* number of bmap entries */
-        xfs_trans_t     *tp;            /* transaction pointer */
+        xfs_trans_t     *tp;
        xfs_dir2_trace_args_s("grow_inode", args, space);
        dp = args->dp;
@@ -538,9 +470,8 @@ xfs_dir2_grow_inode(
        /*
         * Find the first hole for our block.
         */
-        if ((error = xfs_bmap_first_unused(tp, dp, count, &bno, XFS_DATA_FORK))) {
+        if ((error = xfs_bmap_first_unused(tp, dp, count, &bno, XFS_DATA_FORK)))
                return error;
-        }
        nmap = 1;
        ASSERT(args->firstblock != NULL);
        /*
@@ -549,13 +480,9 @@ xfs_dir2_grow_inode(
        if ((error = xfs_bmapi(tp, dp, bno, count,
                        XFS_BMAPI_WRITE|XFS_BMAPI_METADATA|XFS_BMAPI_CONTIG,
                        args->firstblock, args->total, &map, &nmap,
-                        args->flist))) {
+                        args->flist, NULL)))
                return error;
-        }
        ASSERT(nmap <= 1);
-        /*
-         * Got it in 1.
-         */
        if (nmap == 1) {
                mapp = &map;
                mapi = 1;
@@ -585,7 +512,8 @@ xfs_dir2_grow_inode(
                        if ((error = xfs_bmapi(tp, dp, b, c,
                                        XFS_BMAPI_WRITE|XFS_BMAPI_METADATA,
                                        args->firstblock, args->total,
-                                        &mapp[mapi], &nmap, args->flist))) {
+                                        &mapp[mapi], &nmap, args->flist,
+                                        NULL))) {
                                kmem_free(mapp, sizeof(*mapp) * count);
                                return error;
                        }
@@ -645,20 +573,19 @@ xfs_dir2_grow_inode(
 /*
 * See if the directory is a single-block form directory.
 */
-int                                     /* error */
+int
 xfs_dir2_isblock(
-        xfs_trans_t     *tp,            /* transaction pointer */
+        xfs_trans_t     *tp,
-        xfs_inode_t     *dp,            /* incore directory inode */
+        xfs_inode_t     *dp,
        int             *vp)            /* out: 1 is block, 0 is not block */
 {
        xfs_fileoff_t   last;           /* last file offset */
-        xfs_mount_t     *mp;            /* filesystem mount point */
+        xfs_mount_t     *mp;
-        int             rval;           /* return value */
+        int             rval;
        mp = dp->i_mount;
-        if ((rval = xfs_bmap_last_offset(tp, dp, &last, XFS_DATA_FORK))) {
+        if ((rval = xfs_bmap_last_offset(tp, dp, &last, XFS_DATA_FORK)))
                return rval;
-        }
        rval = XFS_FSB_TO_B(mp, last) == mp->m_dirblksize;
        ASSERT(rval == 0 || dp->i_d.di_size == mp->m_dirblksize);
        *vp = rval;
@@ -668,20 +595,19 @@ xfs_dir2_isblock(
 /*
 * See if the directory is a single-leaf form directory.
 */
-int                                     /* error */
+int
 xfs_dir2_isleaf(
-        xfs_trans_t     *tp,            /* transaction pointer */
+        xfs_trans_t     *tp,
-        xfs_inode_t     *dp,            /* incore directory inode */
+        xfs_inode_t     *dp,
        int             *vp)            /* out: 1 is leaf, 0 is not leaf */
 {
        xfs_fileoff_t   last;           /* last file offset */
-        xfs_mount_t     *mp;            /* filesystem mount point */
+        xfs_mount_t     *mp;
-        int             rval;           /* return value */
+        int             rval;
        mp = dp->i_mount;
-        if ((rval = xfs_bmap_last_offset(tp, dp, &last, XFS_DATA_FORK))) {
+        if ((rval = xfs_bmap_last_offset(tp, dp, &last, XFS_DATA_FORK)))
                return rval;
-        }
        *vp = last == mp->m_dirleafblk + (1 << mp->m_sb.sb_dirblklog);
        return 0;
 }
@@ -689,9 +615,9 @@ xfs_dir2_isleaf(
 /*
 * Getdents put routine for 64-bit ABI, direct form.
 */
-static int                                      /* error */
+static int
 xfs_dir2_put_dirent64_direct(
-        xfs_dir2_put_args_t     *pa)            /* argument bundle */
+        xfs_dir2_put_args_t     *pa)
 {
        xfs_dirent_t            *idbp;          /* dirent pointer */
        iovec_t                 *iovp;          /* io vector */
@@ -726,9 +652,9 @@ xfs_dir2_put_dirent64_direct(
 /*
 * Getdents put routine for 64-bit ABI, uio form.
 */
-static int                                      /* error */
+static int
 xfs_dir2_put_dirent64_uio(
-        xfs_dir2_put_args_t     *pa)            /* argument bundle */
+        xfs_dir2_put_args_t     *pa)
 {
        xfs_dirent_t            *idbp;          /* dirent pointer */
        int                     namelen;        /* entry name length */
@@ -764,17 +690,17 @@ xfs_dir2_put_dirent64_uio(
 */
 int
 xfs_dir2_shrink_inode(
-        xfs_da_args_t   *args,          /* operation arguments */
+        xfs_da_args_t   *args,
-        xfs_dir2_db_t   db,             /* directory block number */
+        xfs_dir2_db_t   db,
-        xfs_dabuf_t     *bp)            /* block's buffer */
+        xfs_dabuf_t     *bp)
 {
        xfs_fileoff_t   bno;            /* directory file offset */
        xfs_dablk_t     da;             /* directory file offset */
        int             done;           /* bunmap is finished */
-        xfs_inode_t     *dp;            /* incore directory inode */
+        xfs_inode_t     *dp;
-        int             error;          /* error return value */
+        int             error;
-        xfs_mount_t     *mp;            /* filesystem mount point */
+        xfs_mount_t     *mp;
-        xfs_trans_t     *tp;            /* transaction pointer */
+        xfs_trans_t     *tp;
        xfs_dir2_trace_args_db("shrink_inode", args, db, bp);
        dp = args->dp;
@@ -786,7 +712,7 @@ xfs_dir2_shrink_inode(
         */
        if ((error = xfs_bunmapi(tp, dp, da, mp->m_dirblkfsbs,
                        XFS_BMAPI_METADATA, 0, args->firstblock, args->flist,
-                        &done))) {
+                        NULL, &done))) {
                /*
                 * ENOSPC actually can happen if we're in a removename with
                 * no space reservation, and the resulting block removal
diff --git a/fs/xfs/xfs_dir2.h b/fs/xfs/xfs_dir2.h
index 7dd364b1e038..86560b6f794c 100644
--- a/fs/xfs/xfs_dir2.h
+++ b/fs/xfs/xfs_dir2.h
@@ -22,7 +22,9 @@ struct uio;
 struct xfs_dabuf;
 struct xfs_da_args;
 struct xfs_dir2_put_args;
+struct xfs_bmap_free;
 struct xfs_inode;
+struct xfs_mount;
 struct xfs_trans;
 /*
@@ -73,7 +75,35 @@ typedef struct xfs_dir2_put_args {
 } xfs_dir2_put_args_t;
 /*
- * Other interfaces used by the rest of the dir v2 code.
+ * Generic directory interface routines
+ */
+extern void xfs_dir_startup(void);
+extern void xfs_dir_mount(struct xfs_mount *mp);
+extern int xfs_dir_isempty(struct xfs_inode *dp);
+extern int xfs_dir_init(struct xfs_trans *tp, struct xfs_inode *dp,
+                                struct xfs_inode *pdp);
+extern int xfs_dir_createname(struct xfs_trans *tp, struct xfs_inode *dp,
+                                char *name, int namelen, xfs_ino_t inum,
+                                xfs_fsblock_t *first,
+                                struct xfs_bmap_free *flist, xfs_extlen_t tot);
+extern int xfs_dir_lookup(struct xfs_trans *tp, struct xfs_inode *dp,
+                                char *name, int namelen, xfs_ino_t *inum);
+extern int xfs_dir_removename(struct xfs_trans *tp, struct xfs_inode *dp,
+                                char *name, int namelen, xfs_ino_t ino,
+                                xfs_fsblock_t *first,
+                                struct xfs_bmap_free *flist, xfs_extlen_t tot);
+extern int xfs_dir_getdents(struct xfs_trans *tp, struct xfs_inode *dp,
+                                uio_t *uio, int *eofp);
+extern int xfs_dir_replace(struct xfs_trans *tp, struct xfs_inode *dp,
+                                char *name, int namelen, xfs_ino_t inum,
+                                xfs_fsblock_t *first,
+                                struct xfs_bmap_free *flist, xfs_extlen_t tot);
+extern int xfs_dir_canenter(struct xfs_trans *tp, struct xfs_inode *dp,
+                                char *name, int namelen);
+extern int xfs_dir_ino_validate(struct xfs_mount *mp, xfs_ino_t ino);
+/*
+ * Utility routines for v2 directories.
 */
 extern int xfs_dir2_grow_inode(struct xfs_da_args *args, int space,
                                xfs_dir2_db_t *dbp);
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
index 972ded595476..9d7438bba30d 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/xfs_dir2_block.c
@@ -22,19 +22,16 @@
 #include "xfs_inum.h"
 #include "xfs_trans.h"
 #include "xfs_sb.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_da_btree.h"
 #include "xfs_bmap_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_inode_item.h"
-#include "xfs_dir_leaf.h"
 #include "xfs_dir2_data.h"
 #include "xfs_dir2_leaf.h"
 #include "xfs_dir2_block.h"
@@ -51,6 +48,18 @@ static int xfs_dir2_block_lookup_int(xfs_da_args_t *args, xfs_dabuf_t **bpp,
                                     int *entno);
 static int xfs_dir2_block_sort(const void *a, const void *b);
+static xfs_dahash_t xfs_dir_hash_dot, xfs_dir_hash_dotdot;
+/*
+ * One-time startup routine called from xfs_init().
+ */
+void
+xfs_dir_startup(void)
+{
+        xfs_dir_hash_dot = xfs_da_hashname(".", 1);
+        xfs_dir_hash_dotdot = xfs_da_hashname("..", 2);
+}
 /*
 * Add an entry to a block directory.
 */
@@ -400,7 +409,7 @@ xfs_dir2_block_addname(
        /*
         * Create the new data entry.
         */
-        INT_SET(dep->inumber, ARCH_CONVERT, args->inumber);
+        dep->inumber = cpu_to_be64(args->inumber);
        dep->namelen = args->namelen;
        memcpy(dep->name, args->name, args->namelen);
        tagp = XFS_DIR2_DATA_ENTRY_TAG_P(dep);
@@ -508,7 +517,7 @@ xfs_dir2_block_getdents(
                p.cook = XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk,
                                                    ptr - (char *)block);
-                p.ino = INT_GET(dep->inumber, ARCH_CONVERT);
+                p.ino = be64_to_cpu(dep->inumber);
 #if XFS_BIG_INUMS
                p.ino += mp->m_inoadd;
 #endif
@@ -626,7 +635,7 @@ xfs_dir2_block_lookup(
        /*
         * Fill in inode number, release the block.
         */
-        args->inumber = INT_GET(dep->inumber, ARCH_CONVERT);
+        args->inumber = be64_to_cpu(dep->inumber);
        xfs_da_brelse(args->trans, bp);
        return XFS_ERROR(EEXIST);
 }
@@ -844,11 +853,11 @@ xfs_dir2_block_replace(
         */
        dep = (xfs_dir2_data_entry_t *)
              ((char *)block + XFS_DIR2_DATAPTR_TO_OFF(mp, be32_to_cpu(blp[ent].address)));
-        ASSERT(INT_GET(dep->inumber, ARCH_CONVERT) != args->inumber);
+        ASSERT(be64_to_cpu(dep->inumber) != args->inumber);
        /*
         * Change the inode number to the new value.
         */
-        INT_SET(dep->inumber, ARCH_CONVERT, args->inumber);
+        dep->inumber = cpu_to_be64(args->inumber);
        xfs_dir2_data_log_entry(args->trans, bp, dep);
        xfs_dir2_data_check(dp, bp);
        xfs_da_buf_done(bp);
@@ -1130,7 +1139,7 @@ xfs_dir2_sf_to_block(
         */
        dep = (xfs_dir2_data_entry_t *)
              ((char *)block + XFS_DIR2_DATA_DOT_OFFSET);
-        INT_SET(dep->inumber, ARCH_CONVERT, dp->i_ino);
+        dep->inumber = cpu_to_be64(dp->i_ino);
        dep->namelen = 1;
        dep->name[0] = '.';
        tagp = XFS_DIR2_DATA_ENTRY_TAG_P(dep);
@@ -1144,7 +1153,7 @@ xfs_dir2_sf_to_block(
         */
        dep = (xfs_dir2_data_entry_t *)
                ((char *)block + XFS_DIR2_DATA_DOTDOT_OFFSET);
-        INT_SET(dep->inumber, ARCH_CONVERT, XFS_DIR2_SF_GET_INUMBER(sfp, &sfp->hdr.parent));
+        dep->inumber = cpu_to_be64(XFS_DIR2_SF_GET_INUMBER(sfp, &sfp->hdr.parent));
        dep->namelen = 2;
        dep->name[0] = dep->name[1] = '.';
        tagp = XFS_DIR2_DATA_ENTRY_TAG_P(dep);
@@ -1193,7 +1202,7 @@ xfs_dir2_sf_to_block(
                 * Copy a real entry.
                 */
                dep = (xfs_dir2_data_entry_t *)((char *)block + newoffset);
-                INT_SET(dep->inumber, ARCH_CONVERT, XFS_DIR2_SF_GET_INUMBER(sfp,
+                dep->inumber = cpu_to_be64(XFS_DIR2_SF_GET_INUMBER(sfp,
                                XFS_DIR2_SF_INUMBERP(sfep)));
                dep->namelen = sfep->namelen;
                memcpy(dep->name, sfep->name, dep->namelen);
diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c
index bb3d03ff002b..f7c799217072 100644
--- a/fs/xfs/xfs_dir2_data.c
+++ b/fs/xfs/xfs_dir2_data.c
@@ -22,18 +22,15 @@
 #include "xfs_inum.h"
 #include "xfs_trans.h"
 #include "xfs_sb.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_da_btree.h"
 #include "xfs_bmap_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
-#include "xfs_dir_leaf.h"
 #include "xfs_dir2_data.h"
 #include "xfs_dir2_leaf.h"
 #include "xfs_dir2_block.h"
@@ -133,7 +130,7 @@ xfs_dir2_data_check(
                 */
                dep = (xfs_dir2_data_entry_t *)p;
                ASSERT(dep->namelen != 0);
-                ASSERT(xfs_dir_ino_validate(mp, INT_GET(dep->inumber, ARCH_CONVERT)) == 0);
+                ASSERT(xfs_dir_ino_validate(mp, be64_to_cpu(dep->inumber)) == 0);
                ASSERT(be16_to_cpu(*XFS_DIR2_DATA_ENTRY_TAG_P(dep)) ==
                       (char *)dep - (char *)d);
                count++;
diff --git a/fs/xfs/xfs_dir2_data.h b/fs/xfs/xfs_dir2_data.h
index 0847cbb53e17..a6ae2d21c40a 100644
--- a/fs/xfs/xfs_dir2_data.h
+++ b/fs/xfs/xfs_dir2_data.h
@@ -85,11 +85,11 @@ typedef struct xfs_dir2_data_hdr {
 * Tag appears as the last 2 bytes.
 */
 typedef struct xfs_dir2_data_entry {
-        xfs_ino_t               inumber;        /* inode number */
+        __be64                  inumber;        /* inode number */
-        __uint8_t               namelen;        /* name length */
+        __u8                    namelen;        /* name length */
-        __uint8_t               name[1];        /* name bytes, no null */
+        __u8                    name[1];        /* name bytes, no null */
                                                /* variable offset */
-        xfs_dir2_data_off_t     tag;            /* starting offset of us */
+        __be16                  tag;            /* starting offset of us */
 } xfs_dir2_data_entry_t;
 /*
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index 0f5e2f2ce6ec..b1cf1fbf423d 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -24,14 +24,12 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_da_btree.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_attr_sf.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
@@ -407,7 +405,7 @@ xfs_dir2_leaf_addname(
         * Initialize our new entry (at last).
         */
        dep = (xfs_dir2_data_entry_t *)dup;
-        INT_SET(dep->inumber, ARCH_CONVERT, args->inumber);
+        dep->inumber = cpu_to_be64(args->inumber);
        dep->namelen = args->namelen;
        memcpy(dep->name, args->name, dep->namelen);
        tagp = XFS_DIR2_DATA_ENTRY_TAG_P(dep);
@@ -884,7 +882,7 @@ xfs_dir2_leaf_getdents(
                                        XFS_DIR2_BYTE_TO_DA(mp,
                                                XFS_DIR2_LEAF_OFFSET) - map_off,
                                        XFS_BMAPI_METADATA, NULL, 0,
-                                        &map[map_valid], &nmap, NULL);
+                                        &map[map_valid], &nmap, NULL, NULL);
                                /*
                                 * Don't know if we should ignore this or
                                 * try to return an error.
@@ -1098,7 +1096,7 @@ xfs_dir2_leaf_getdents(
                p->cook = XFS_DIR2_BYTE_TO_DATAPTR(mp, curoff + length);
-                p->ino = INT_GET(dep->inumber, ARCH_CONVERT);
+                p->ino = be64_to_cpu(dep->inumber);
 #if XFS_BIG_INUMS
                p->ino += mp->m_inoadd;
 #endif
@@ -1319,7 +1317,7 @@ xfs_dir2_leaf_lookup(
        /*
         * Return the found inode number.
         */
-        args->inumber = INT_GET(dep->inumber, ARCH_CONVERT);
+        args->inumber = be64_to_cpu(dep->inumber);
        xfs_da_brelse(tp, dbp);
        xfs_da_brelse(tp, lbp);
        return XFS_ERROR(EEXIST);
@@ -1606,11 +1604,11 @@ xfs_dir2_leaf_replace(
        dep = (xfs_dir2_data_entry_t *)
              ((char *)dbp->data +
               XFS_DIR2_DATAPTR_TO_OFF(dp->i_mount, be32_to_cpu(lep->address)));
-        ASSERT(args->inumber != INT_GET(dep->inumber, ARCH_CONVERT));
+        ASSERT(args->inumber != be64_to_cpu(dep->inumber));
        /*
         * Put the new inode number in, log it.
         */
-        INT_SET(dep->inumber, ARCH_CONVERT, args->inumber);
+        dep->inumber = cpu_to_be64(args->inumber);
        tp = args->trans;
        xfs_dir2_data_log_entry(tp, dbp, dep);
        xfs_da_buf_done(dbp);
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c
index ac511ab9c52d..9ca71719b683 100644
--- a/fs/xfs/xfs_dir2_node.c
+++ b/fs/xfs/xfs_dir2_node.c
@@ -22,13 +22,11 @@
 #include "xfs_inum.h"
 #include "xfs_trans.h"
 #include "xfs_sb.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_da_btree.h"
 #include "xfs_bmap_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
@@ -505,7 +503,6 @@ xfs_dir2_leafn_lookup_int(
                                                        XFS_DATA_FORK))) {
                                                return error;
                                        }
-                                        curfdb = newfdb;
                                        free = curbp->data;
                                        ASSERT(be32_to_cpu(free->hdr.magic) ==
                                               XFS_DIR2_FREE_MAGIC);
@@ -527,8 +524,11 @@ xfs_dir2_leafn_lookup_int(
                                if (unlikely(be16_to_cpu(free->bests[fi]) == NULLDATAOFF)) {
                                        XFS_ERROR_REPORT("xfs_dir2_leafn_lookup_int",
                                                         XFS_ERRLEVEL_LOW, mp);
+                                        if (curfdb != newfdb)
+                                                xfs_da_brelse(tp, curbp);
                                        return XFS_ERROR(EFSCORRUPTED);
                                }
+                                curfdb = newfdb;
                                if (be16_to_cpu(free->bests[fi]) >= length) {
                                        *indexp = index;
                                        state->extravalid = 1;
@@ -580,7 +580,7 @@ xfs_dir2_leafn_lookup_int(
                        if (dep->namelen == args->namelen &&
                            dep->name[0] == args->name[0] &&
                            memcmp(dep->name, args->name, args->namelen) == 0) {
-                                args->inumber = INT_GET(dep->inumber, ARCH_CONVERT);
+                                args->inumber = be64_to_cpu(dep->inumber);
                                *indexp = index;
                                state->extravalid = 1;
                                state->extrablk.bp = curbp;
@@ -970,7 +970,7 @@ xfs_dir2_leafn_remove(
                        /*
                         * One less used entry in the free table.
                         */
-                        free->hdr.nused = cpu_to_be32(-1);
+                        be32_add(&free->hdr.nused, -1);
                        xfs_dir2_free_log_header(tp, fbp);
                        /*
                         * If this was the last entry in the table, we can
@@ -1695,7 +1695,7 @@ xfs_dir2_node_addname_int(
         * Fill in the new entry and log it.
         */
        dep = (xfs_dir2_data_entry_t *)dup;
-        INT_SET(dep->inumber, ARCH_CONVERT, args->inumber);
+        dep->inumber = cpu_to_be64(args->inumber);
        dep->namelen = args->namelen;
        memcpy(dep->name, args->name, dep->namelen);
        tagp = XFS_DIR2_DATA_ENTRY_TAG_P(dep);
@@ -1905,11 +1905,11 @@ xfs_dir2_node_replace(
                dep = (xfs_dir2_data_entry_t *)
                      ((char *)data +
                       XFS_DIR2_DATAPTR_TO_OFF(state->mp, be32_to_cpu(lep->address)));
-                ASSERT(inum != INT_GET(dep->inumber, ARCH_CONVERT));
+                ASSERT(inum != be64_to_cpu(dep->inumber));
                /*
                 * Fill in the new inode number and log the entry.
                 */
-                INT_SET(dep->inumber, ARCH_CONVERT, inum);
+                dep->inumber = cpu_to_be64(inum);
                xfs_dir2_data_log_entry(args->trans, state->extrablk.bp, dep);
                rval = 0;
        }
diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c
index d98a41d1fe63..0cd77b17bf92 100644
--- a/fs/xfs/xfs_dir2_sf.c
+++ b/fs/xfs/xfs_dir2_sf.c
@@ -22,19 +22,16 @@
 #include "xfs_inum.h"
 #include "xfs_trans.h"
 #include "xfs_sb.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_da_btree.h"
 #include "xfs_bmap_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_inode_item.h"
-#include "xfs_dir_leaf.h"
 #include "xfs_error.h"
 #include "xfs_dir2_data.h"
 #include "xfs_dir2_leaf.h"
@@ -117,13 +114,13 @@ xfs_dir2_block_sfsize(
                        dep->name[0] == '.' && dep->name[1] == '.';
 #if XFS_BIG_INUMS
                if (!isdot)
-                        i8count += INT_GET(dep->inumber, ARCH_CONVERT) > XFS_DIR2_MAX_SHORT_INUM;
+                        i8count += be64_to_cpu(dep->inumber) > XFS_DIR2_MAX_SHORT_INUM;
 #endif
                if (!isdot && !isdotdot) {
                        count++;
                        namelen += dep->namelen;
                } else if (isdotdot)
-                        parent = INT_GET(dep->inumber, ARCH_CONVERT);
+                        parent = be64_to_cpu(dep->inumber);
                /*
                 * Calculate the new size, see if we should give up yet.
                 */
@@ -229,13 +226,13 @@ xfs_dir2_block_to_sf(
                 * Skip .
                 */
                if (dep->namelen == 1 && dep->name[0] == '.')
-                        ASSERT(INT_GET(dep->inumber, ARCH_CONVERT) == dp->i_ino);
+                        ASSERT(be64_to_cpu(dep->inumber) == dp->i_ino);
                /*
                 * Skip .., but make sure the inode number is right.
                 */
                else if (dep->namelen == 2 &&
                         dep->name[0] == '.' && dep->name[1] == '.')
-                        ASSERT(INT_GET(dep->inumber, ARCH_CONVERT) ==
+                        ASSERT(be64_to_cpu(dep->inumber) ==
                               XFS_DIR2_SF_GET_INUMBER(sfp, &sfp->hdr.parent));
                /*
                 * Normal entry, copy it into shortform.
@@ -246,7 +243,7 @@ xfs_dir2_block_to_sf(
                                (xfs_dir2_data_aoff_t)
                                ((char *)dep - (char *)block));
                        memcpy(sfep->name, dep->name, dep->namelen);
-                        temp=INT_GET(dep->inumber, ARCH_CONVERT);
+                        temp = be64_to_cpu(dep->inumber);
                        XFS_DIR2_SF_PUT_INUMBER(sfp, &temp,
                                XFS_DIR2_SF_INUMBERP(sfep));
                        sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep);
diff --git a/fs/xfs/xfs_dir2_trace.c b/fs/xfs/xfs_dir2_trace.c
index c626943b4112..f3fb2ffd6f5c 100644
--- a/fs/xfs/xfs_dir2_trace.c
+++ b/fs/xfs/xfs_dir2_trace.c
@@ -19,11 +19,9 @@
 #include "xfs_fs.h"
 #include "xfs_types.h"
 #include "xfs_inum.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_da_btree.h"
 #include "xfs_bmap_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
diff --git a/fs/xfs/xfs_dir_leaf.c b/fs/xfs/xfs_dir_leaf.c
deleted file mode 100644
index 6d711869262f..000000000000
--- a/fs/xfs/xfs_dir_leaf.c
+++ /dev/null
@@ -1,2213 +0,0 @@
-/*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_dir.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
-#include "xfs_mount.h"
-#include "xfs_da_btree.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dinode.h"
-#include "xfs_inode.h"
-#include "xfs_inode_item.h"
-#include "xfs_alloc.h"
-#include "xfs_btree.h"
-#include "xfs_bmap.h"
-#include "xfs_dir_leaf.h"
-#include "xfs_error.h"
-/*
- * xfs_dir_leaf.c
- *
- * Routines to implement leaf blocks of directories as Btrees of hashed names.
- */
-/*========================================================================
- * Function prototypes for the kernel.
- *========================================================================*/
-/*
- * Routines used for growing the Btree.
- */
-STATIC void xfs_dir_leaf_add_work(xfs_dabuf_t *leaf_buffer, xfs_da_args_t *args,
-                                              int insertion_index,
-                                              int freemap_index);
-STATIC int xfs_dir_leaf_compact(xfs_trans_t *trans, xfs_dabuf_t *leaf_buffer,
-                                            int musthave, int justcheck);
-STATIC void xfs_dir_leaf_rebalance(xfs_da_state_t *state,
-                                                  xfs_da_state_blk_t *blk1,
-                                                  xfs_da_state_blk_t *blk2);
-STATIC int xfs_dir_leaf_figure_balance(xfs_da_state_t *state,
-                                          xfs_da_state_blk_t *leaf_blk_1,
-                                          xfs_da_state_blk_t *leaf_blk_2,
-                                          int *number_entries_in_blk1,
-                                          int *number_namebytes_in_blk1);
-STATIC int xfs_dir_leaf_create(struct xfs_da_args *args,
-                                xfs_dablk_t which_block,
-                                struct xfs_dabuf **bpp);
-/*
- * Utility routines.
- */
-STATIC void xfs_dir_leaf_moveents(xfs_dir_leafblock_t *src_leaf,
-                                              int src_start,
-                                              xfs_dir_leafblock_t *dst_leaf,
-                                              int dst_start, int move_count,
-                                              xfs_mount_t *mp);
-/*========================================================================
- * External routines when dirsize < XFS_IFORK_DSIZE(dp).
- *========================================================================*/
-/*
- * Validate a given inode number.
- */
-int
-xfs_dir_ino_validate(xfs_mount_t *mp, xfs_ino_t ino)
-{
-        xfs_agblock_t   agblkno;
-        xfs_agino_t     agino;
-        xfs_agnumber_t  agno;
-        int             ino_ok;
-        int             ioff;
-        agno = XFS_INO_TO_AGNO(mp, ino);
-        agblkno = XFS_INO_TO_AGBNO(mp, ino);
-        ioff = XFS_INO_TO_OFFSET(mp, ino);
-        agino = XFS_OFFBNO_TO_AGINO(mp, agblkno, ioff);
-        ino_ok =
-                agno < mp->m_sb.sb_agcount &&
-                agblkno < mp->m_sb.sb_agblocks &&
-                agblkno != 0 &&
-                ioff < (1 << mp->m_sb.sb_inopblog) &&
-                XFS_AGINO_TO_INO(mp, agno, agino) == ino;
-        if (unlikely(XFS_TEST_ERROR(!ino_ok, mp, XFS_ERRTAG_DIR_INO_VALIDATE,
-                        XFS_RANDOM_DIR_INO_VALIDATE))) {
-                xfs_fs_cmn_err(CE_WARN, mp, "Invalid inode number 0x%Lx",
-                                (unsigned long long) ino);
-                XFS_ERROR_REPORT("xfs_dir_ino_validate", XFS_ERRLEVEL_LOW, mp);
-                return XFS_ERROR(EFSCORRUPTED);
-        }
-        return 0;
-}
-/*
- * Create the initial contents of a shortform directory.
- */
-int
-xfs_dir_shortform_create(xfs_da_args_t *args, xfs_ino_t parent)
-{
-        xfs_dir_sf_hdr_t *hdr;
-        xfs_inode_t *dp;
-        dp = args->dp;
-        ASSERT(dp != NULL);
-        ASSERT(dp->i_d.di_size == 0);
-        if (dp->i_d.di_format == XFS_DINODE_FMT_EXTENTS) {
-                dp->i_df.if_flags &= ~XFS_IFEXTENTS;    /* just in case */
-                dp->i_d.di_format = XFS_DINODE_FMT_LOCAL;
-                xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE);
-                dp->i_df.if_flags |= XFS_IFINLINE;
-        }
-        ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
-        ASSERT(dp->i_df.if_bytes == 0);
-        xfs_idata_realloc(dp, sizeof(*hdr), XFS_DATA_FORK);
-        hdr = (xfs_dir_sf_hdr_t *)dp->i_df.if_u1.if_data;
-        XFS_DIR_SF_PUT_DIRINO(&parent, &hdr->parent);
-        hdr->count = 0;
-        dp->i_d.di_size = sizeof(*hdr);
-        xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
-        return 0;
-}
-/*
- * Add a name to the shortform directory structure.
- * Overflow from the inode has already been checked for.
- */
-int
-xfs_dir_shortform_addname(xfs_da_args_t *args)
-{
-        xfs_dir_shortform_t *sf;
-        xfs_dir_sf_entry_t *sfe;
-        int i, offset, size;
-        xfs_inode_t *dp;
-        dp = args->dp;
-        ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
-        /*
-         * Catch the case where the conversion from shortform to leaf
-         * failed part way through.
-         */
-        if (dp->i_d.di_size < sizeof(xfs_dir_sf_hdr_t)) {
-                ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
-                return XFS_ERROR(EIO);
-        }
-        ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
-        ASSERT(dp->i_df.if_u1.if_data != NULL);
-        sf = (xfs_dir_shortform_t *)dp->i_df.if_u1.if_data;
-        sfe = &sf->list[0];
-        for (i = sf->hdr.count-1; i >= 0; i--) {
-                if (sfe->namelen == args->namelen &&
-                    args->name[0] == sfe->name[0] &&
-                    memcmp(args->name, sfe->name, args->namelen) == 0)
-                        return XFS_ERROR(EEXIST);
-                sfe = XFS_DIR_SF_NEXTENTRY(sfe);
-        }
-        offset = (int)((char *)sfe - (char *)sf);
-        size = XFS_DIR_SF_ENTSIZE_BYNAME(args->namelen);
-        xfs_idata_realloc(dp, size, XFS_DATA_FORK);
-        sf = (xfs_dir_shortform_t *)dp->i_df.if_u1.if_data;
-        sfe = (xfs_dir_sf_entry_t *)((char *)sf + offset);
-        XFS_DIR_SF_PUT_DIRINO(&args->inumber, &sfe->inumber);
-        sfe->namelen = args->namelen;
-        memcpy(sfe->name, args->name, sfe->namelen);
-        sf->hdr.count++;
-        dp->i_d.di_size += size;
-        xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
-        return 0;
-}
-/*
- * Remove a name from the shortform directory structure.
- */
-int
-xfs_dir_shortform_removename(xfs_da_args_t *args)
-{
-        xfs_dir_shortform_t *sf;
-        xfs_dir_sf_entry_t *sfe;
-        int base, size = 0, i;
-        xfs_inode_t *dp;
-        dp = args->dp;
-        ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
-        /*
-         * Catch the case where the conversion from shortform to leaf
-         * failed part way through.
-         */
-        if (dp->i_d.di_size < sizeof(xfs_dir_sf_hdr_t)) {
-                ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
-                return XFS_ERROR(EIO);
-        }
-        ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
-        ASSERT(dp->i_df.if_u1.if_data != NULL);
-        base = sizeof(xfs_dir_sf_hdr_t);
-        sf = (xfs_dir_shortform_t *)dp->i_df.if_u1.if_data;
-        sfe = &sf->list[0];
-        for (i = sf->hdr.count-1; i >= 0; i--) {
-                size = XFS_DIR_SF_ENTSIZE_BYENTRY(sfe);
-                if (sfe->namelen == args->namelen &&
-                    sfe->name[0] == args->name[0] &&
-                    memcmp(sfe->name, args->name, args->namelen) == 0)
-                        break;
-                base += size;
-                sfe = XFS_DIR_SF_NEXTENTRY(sfe);
-        }
-        if (i < 0) {
-                ASSERT(args->oknoent);
-                return XFS_ERROR(ENOENT);
-        }
-        if ((base + size) != dp->i_d.di_size) {
-                memmove(&((char *)sf)[base], &((char *)sf)[base+size],
-                                              dp->i_d.di_size - (base+size));
-        }
-        sf->hdr.count--;
-        xfs_idata_realloc(dp, -size, XFS_DATA_FORK);
-        dp->i_d.di_size -= size;
-        xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
-        return 0;
-}
-/*
- * Look up a name in a shortform directory structure.
- */
-int
-xfs_dir_shortform_lookup(xfs_da_args_t *args)
-{
-        xfs_dir_shortform_t *sf;
-        xfs_dir_sf_entry_t *sfe;
-        int i;
-        xfs_inode_t *dp;
-        dp = args->dp;
-        ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
-        /*
-         * Catch the case where the conversion from shortform to leaf
-         * failed part way through.
-         */
-        if (dp->i_d.di_size < sizeof(xfs_dir_sf_hdr_t)) {
-                ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
-                return XFS_ERROR(EIO);
-        }
-        ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
-        ASSERT(dp->i_df.if_u1.if_data != NULL);
-        sf = (xfs_dir_shortform_t *)dp->i_df.if_u1.if_data;
-        if (args->namelen == 2 &&
-            args->name[0] == '.' && args->name[1] == '.') {
-                XFS_DIR_SF_GET_DIRINO(&sf->hdr.parent, &args->inumber);
-                return(XFS_ERROR(EEXIST));
-        }
-        if (args->namelen == 1 && args->name[0] == '.') {
-                args->inumber = dp->i_ino;
-                return(XFS_ERROR(EEXIST));
-        }
-        sfe = &sf->list[0];
-        for (i = sf->hdr.count-1; i >= 0; i--) {
-                if (sfe->namelen == args->namelen &&
-                    sfe->name[0] == args->name[0] &&
-                    memcmp(args->name, sfe->name, args->namelen) == 0) {
-                        XFS_DIR_SF_GET_DIRINO(&sfe->inumber, &args->inumber);
-                        return(XFS_ERROR(EEXIST));
-                }
-                sfe = XFS_DIR_SF_NEXTENTRY(sfe);
-        }
-        ASSERT(args->oknoent);
-        return(XFS_ERROR(ENOENT));
-}
-/*
- * Convert from using the shortform to the leaf.
- */
-int
-xfs_dir_shortform_to_leaf(xfs_da_args_t *iargs)
-{
-        xfs_inode_t *dp;
-        xfs_dir_shortform_t *sf;
-        xfs_dir_sf_entry_t *sfe;
-        xfs_da_args_t args;
-        xfs_ino_t inumber;
-        char *tmpbuffer;
-        int retval, i, size;
-        xfs_dablk_t blkno;
-        xfs_dabuf_t *bp;
-        dp = iargs->dp;
-        /*
-         * Catch the case where the conversion from shortform to leaf
-         * failed part way through.
-         */
-        if (dp->i_d.di_size < sizeof(xfs_dir_sf_hdr_t)) {
-                ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
-                return XFS_ERROR(EIO);
-        }
-        ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
-        ASSERT(dp->i_df.if_u1.if_data != NULL);
-        size = dp->i_df.if_bytes;
-        tmpbuffer = kmem_alloc(size, KM_SLEEP);
-        ASSERT(tmpbuffer != NULL);
-        memcpy(tmpbuffer, dp->i_df.if_u1.if_data, size);
-        sf = (xfs_dir_shortform_t *)tmpbuffer;
-        XFS_DIR_SF_GET_DIRINO(&sf->hdr.parent, &inumber);
-        xfs_idata_realloc(dp, -size, XFS_DATA_FORK);
-        dp->i_d.di_size = 0;
-        xfs_trans_log_inode(iargs->trans, dp, XFS_ILOG_CORE);
-        retval = xfs_da_grow_inode(iargs, &blkno);
-        if (retval)
-                goto out;
-        ASSERT(blkno == 0);
-        retval = xfs_dir_leaf_create(iargs, blkno, &bp);
-        if (retval)
-                goto out;
-        xfs_da_buf_done(bp);
-        args.name = ".";
-        args.namelen = 1;
-        args.hashval = xfs_dir_hash_dot;
-        args.inumber = dp->i_ino;
-        args.dp = dp;
-        args.firstblock = iargs->firstblock;
-        args.flist = iargs->flist;
-        args.total = iargs->total;
-        args.whichfork = XFS_DATA_FORK;
-        args.trans = iargs->trans;
-        args.justcheck = 0;
-        args.addname = args.oknoent = 1;
-        retval = xfs_dir_leaf_addname(&args);
-        if (retval)
-                goto out;
-        args.name = "..";
-        args.namelen = 2;
-        args.hashval = xfs_dir_hash_dotdot;
-        args.inumber = inumber;
-        retval = xfs_dir_leaf_addname(&args);
-        if (retval)
-                goto out;
-        sfe = &sf->list[0];
-        for (i = 0; i < sf->hdr.count; i++) {
-                args.name = (char *)(sfe->name);
-                args.namelen = sfe->namelen;
-                args.hashval = xfs_da_hashname((char *)(sfe->name),
-                                               sfe->namelen);
-                XFS_DIR_SF_GET_DIRINO(&sfe->inumber, &args.inumber);
-                retval = xfs_dir_leaf_addname(&args);
-                if (retval)
-                        goto out;
-                sfe = XFS_DIR_SF_NEXTENTRY(sfe);
-        }
-        retval = 0;
-out:
-        kmem_free(tmpbuffer, size);
-        return retval;
-}
-STATIC int
-xfs_dir_shortform_compare(const void *a, const void *b)
-{
-        xfs_dir_sf_sort_t *sa, *sb;
-        sa = (xfs_dir_sf_sort_t *)a;
-        sb = (xfs_dir_sf_sort_t *)b;
-        if (sa->hash < sb->hash)
-                return -1;
-        else if (sa->hash > sb->hash)
-                return 1;
-        else
-                return sa->entno - sb->entno;
-}
-/*
- * Copy out directory entries for getdents(), for shortform directories.
- */
-/*ARGSUSED*/
-int
-xfs_dir_shortform_getdents(xfs_inode_t *dp, uio_t *uio, int *eofp,
-                                       xfs_dirent_t *dbp, xfs_dir_put_t put)
-{
-        xfs_dir_shortform_t *sf;
-        xfs_dir_sf_entry_t *sfe;
-        int retval, i, sbsize, nsbuf, lastresid=0, want_entno;
-        xfs_mount_t *mp;
-        xfs_dahash_t cookhash, hash;
-        xfs_dir_put_args_t p;
-        xfs_dir_sf_sort_t *sbuf, *sbp;
-        mp = dp->i_mount;
-        sf = (xfs_dir_shortform_t *)dp->i_df.if_u1.if_data;
-        cookhash = XFS_DA_COOKIE_HASH(mp, uio->uio_offset);
-        want_entno = XFS_DA_COOKIE_ENTRY(mp, uio->uio_offset);
-        nsbuf = sf->hdr.count + 2;
-        sbsize = (nsbuf + 1) * sizeof(*sbuf);
-        sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP);
-        xfs_dir_trace_g_du("sf: start", dp, uio);
-        /*
-         * Collect all the entries into the buffer.
-         * Entry 0 is .
-         */
-        sbp->entno = 0;
-        sbp->seqno = 0;
-        sbp->hash = xfs_dir_hash_dot;
-        sbp->ino = dp->i_ino;
-        sbp->name = ".";
-        sbp->namelen = 1;
-        sbp++;
-        /*
-         * Entry 1 is ..
-         */
-        sbp->entno = 1;
-        sbp->seqno = 0;
-        sbp->hash = xfs_dir_hash_dotdot;
-        sbp->ino = XFS_GET_DIR_INO8(sf->hdr.parent);
-        sbp->name = "..";
-        sbp->namelen = 2;
-        sbp++;
-        /*
-         * Scan the directory data for the rest of the entries.
-         */
-        for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) {
-                if (unlikely(
-                    ((char *)sfe < (char *)sf) ||
-                    ((char *)sfe >= ((char *)sf + dp->i_df.if_bytes)))) {
-                        xfs_dir_trace_g_du("sf: corrupted", dp, uio);
-                        XFS_CORRUPTION_ERROR("xfs_dir_shortform_getdents",
-                                             XFS_ERRLEVEL_LOW, mp, sfe);
-                        kmem_free(sbuf, sbsize);
-                        return XFS_ERROR(EFSCORRUPTED);
-                }
-                sbp->entno = i + 2;
-                sbp->seqno = 0;
-                sbp->hash = xfs_da_hashname((char *)sfe->name, sfe->namelen);
-                sbp->ino = XFS_GET_DIR_INO8(sfe->inumber);
-                sbp->name = (char *)sfe->name;
-                sbp->namelen = sfe->namelen;
-                sfe = XFS_DIR_SF_NEXTENTRY(sfe);
-                sbp++;
-        }
-        /*
-         * Sort the entries on hash then entno.
-         */
-        xfs_sort(sbuf, nsbuf, sizeof(*sbuf), xfs_dir_shortform_compare);
-        /*
-         * Stuff in last entry.
-         */
-        sbp->entno = nsbuf;
-        sbp->hash = XFS_DA_MAXHASH;
-        sbp->seqno = 0;
-        /*
-         * Figure out the sequence numbers in case there's a hash duplicate.
-         */
-        for (hash = sbuf->hash, sbp = sbuf + 1;
-                                sbp < &sbuf[nsbuf + 1]; sbp++) {
-                if (sbp->hash == hash)
-                        sbp->seqno = sbp[-1].seqno + 1;
-                else
-                        hash = sbp->hash;
-        }
-        /*
-         * Set up put routine.
-         */
-        p.dbp = dbp;
-        p.put = put;
-        p.uio = uio;
-        /*
-         * Find our place.
-         */
-        for (sbp = sbuf; sbp < &sbuf[nsbuf + 1]; sbp++) {
-                if (sbp->hash > cookhash ||
-                    (sbp->hash == cookhash && sbp->seqno >= want_entno))
-                        break;
-        }
-        /*
-         * Did we fail to find anything?  We stop at the last entry,
-         * the one we put maxhash into.
-         */
-        if (sbp == &sbuf[nsbuf]) {
-                kmem_free(sbuf, sbsize);
-                xfs_dir_trace_g_du("sf: hash beyond end", dp, uio);
-                uio->uio_offset = XFS_DA_MAKE_COOKIE(mp, 0, 0, XFS_DA_MAXHASH);
-                *eofp = 1;
-                return 0;
-        }
-        /*
-         * Loop putting entries into the user buffer.
-         */
-        while (sbp < &sbuf[nsbuf]) {
-                /*
-                 * Save the first resid in a run of equal-hashval entries
-                 * so that we can back them out if they don't all fit.
-                 */
-                if (sbp->seqno == 0 || sbp == sbuf)
-                        lastresid = uio->uio_resid;
-                XFS_PUT_COOKIE(p.cook, mp, 0, sbp[1].seqno, sbp[1].hash);
-                p.ino = sbp->ino;
-#if XFS_BIG_INUMS
-                p.ino += mp->m_inoadd;
-#endif
-                p.name = sbp->name;
-                p.namelen = sbp->namelen;
-                retval = p.put(&p);
-                if (!p.done) {
-                        uio->uio_offset =
-                                XFS_DA_MAKE_COOKIE(mp, 0, 0, sbp->hash);
-                        kmem_free(sbuf, sbsize);
-                        uio->uio_resid = lastresid;
-                        xfs_dir_trace_g_du("sf: E-O-B", dp, uio);
-                        return retval;
-                }
-                sbp++;
-        }
-        kmem_free(sbuf, sbsize);
-        uio->uio_offset = p.cook.o;
-        *eofp = 1;
-        xfs_dir_trace_g_du("sf: E-O-F", dp, uio);
-        return 0;
-}
-/*
- * Look up a name in a shortform directory structure, replace the inode number.
- */
-int
-xfs_dir_shortform_replace(xfs_da_args_t *args)
-{
-        xfs_dir_shortform_t *sf;
-        xfs_dir_sf_entry_t *sfe;
-        xfs_inode_t *dp;
-        int i;
-        dp = args->dp;
-        ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
-        /*
-         * Catch the case where the conversion from shortform to leaf
-         * failed part way through.
-         */
-        if (dp->i_d.di_size < sizeof(xfs_dir_sf_hdr_t)) {
-                ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
-                return XFS_ERROR(EIO);
-        }
-        ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
-        ASSERT(dp->i_df.if_u1.if_data != NULL);
-        sf = (xfs_dir_shortform_t *)dp->i_df.if_u1.if_data;
-        if (args->namelen == 2 &&
-            args->name[0] == '.' && args->name[1] == '.') {
-                /* XXX - replace assert? */
-                XFS_DIR_SF_PUT_DIRINO(&args->inumber, &sf->hdr.parent);
-                xfs_trans_log_inode(args->trans, dp, XFS_ILOG_DDATA);
-                return 0;
-        }
-        ASSERT(args->namelen != 1 || args->name[0] != '.');
-        sfe = &sf->list[0];
-        for (i = sf->hdr.count-1; i >= 0; i--) {
-                if (sfe->namelen == args->namelen &&
-                    sfe->name[0] == args->name[0] &&
-                    memcmp(args->name, sfe->name, args->namelen) == 0) {
-                        ASSERT(memcmp((char *)&args->inumber,
-                                (char *)&sfe->inumber, sizeof(xfs_ino_t)));
-                        XFS_DIR_SF_PUT_DIRINO(&args->inumber, &sfe->inumber);
-                        xfs_trans_log_inode(args->trans, dp, XFS_ILOG_DDATA);
-                        return 0;
-                }
-                sfe = XFS_DIR_SF_NEXTENTRY(sfe);
-        }
-        ASSERT(args->oknoent);
-        return XFS_ERROR(ENOENT);
-}
-/*
- * Convert a leaf directory to shortform structure
- */
-int
-xfs_dir_leaf_to_shortform(xfs_da_args_t *iargs)
-{
-        xfs_dir_leafblock_t *leaf;
-        xfs_dir_leaf_hdr_t *hdr;
-        xfs_dir_leaf_entry_t *entry;
-        xfs_dir_leaf_name_t *namest;
-        xfs_da_args_t args;
-        xfs_inode_t *dp;
-        xfs_ino_t parent = 0;
-        char *tmpbuffer;
-        int retval, i;
-        xfs_dabuf_t *bp;
-        dp = iargs->dp;
-        tmpbuffer = kmem_alloc(XFS_LBSIZE(dp->i_mount), KM_SLEEP);
-        ASSERT(tmpbuffer != NULL);
-        retval = xfs_da_read_buf(iargs->trans, iargs->dp, 0, -1, &bp,
-                                               XFS_DATA_FORK);
-        if (retval)
-                goto out;
-        ASSERT(bp != NULL);
-        memcpy(tmpbuffer, bp->data, XFS_LBSIZE(dp->i_mount));
-        leaf = (xfs_dir_leafblock_t *)tmpbuffer;
-        ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR_LEAF_MAGIC);
-        memset(bp->data, 0, XFS_LBSIZE(dp->i_mount));
-        /*
-         * Find and special case the parent inode number
-         */
-        hdr = &leaf->hdr;
-        entry = &leaf->entries[0];
-        for (i = INT_GET(hdr->count, ARCH_CONVERT)-1; i >= 0; entry++, i--) {
-                namest = XFS_DIR_LEAF_NAMESTRUCT(leaf, INT_GET(entry->nameidx, ARCH_CONVERT));
-                if ((entry->namelen == 2) &&
-                    (namest->name[0] == '.') &&
-                    (namest->name[1] == '.')) {
-                        XFS_DIR_SF_GET_DIRINO(&namest->inumber, &parent);
-                        entry->nameidx = 0;
-                } else if ((entry->namelen == 1) && (namest->name[0] == '.')) {
-                        entry->nameidx = 0;
-                }
-        }
-        retval = xfs_da_shrink_inode(iargs, 0, bp);
-        if (retval)
-                goto out;
-        retval = xfs_dir_shortform_create(iargs, parent);
-        if (retval)
-                goto out;
-        /*
-         * Copy the rest of the filenames
-         */
-        entry = &leaf->entries[0];
-        args.dp = dp;
-        args.firstblock = iargs->firstblock;
-        args.flist = iargs->flist;
-        args.total = iargs->total;
-        args.whichfork = XFS_DATA_FORK;
-        args.trans = iargs->trans;
-        args.justcheck = 0;
-        args.addname = args.oknoent = 1;
-        for (i = 0; i < INT_GET(hdr->count, ARCH_CONVERT); entry++, i++) {
-                if (!entry->nameidx)
-                        continue;
-                namest = XFS_DIR_LEAF_NAMESTRUCT(leaf, INT_GET(entry->nameidx, ARCH_CONVERT));
-                args.name = (char *)(namest->name);
-                args.namelen = entry->namelen;
-                args.hashval = INT_GET(entry->hashval, ARCH_CONVERT);
-                XFS_DIR_SF_GET_DIRINO(&namest->inumber, &args.inumber);
-                xfs_dir_shortform_addname(&args);
-        }
-out:
-        kmem_free(tmpbuffer, XFS_LBSIZE(dp->i_mount));
-        return retval;
-}
-/*
- * Convert from using a single leaf to a root node and a leaf.
- */
-int
-xfs_dir_leaf_to_node(xfs_da_args_t *args)
-{
-        xfs_dir_leafblock_t *leaf;
-        xfs_da_intnode_t *node;
-        xfs_inode_t *dp;
-        xfs_dabuf_t *bp1, *bp2;
-        xfs_dablk_t blkno;
-        int retval;
-        dp = args->dp;
-        retval = xfs_da_grow_inode(args, &blkno);
-        ASSERT(blkno == 1);
-        if (retval)
-                return retval;
-        retval = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp1,
-                                              XFS_DATA_FORK);
-        if (retval)
-                return retval;
-        ASSERT(bp1 != NULL);
-        retval = xfs_da_get_buf(args->trans, args->dp, 1, -1, &bp2,
-                                             XFS_DATA_FORK);
-        if (retval) {
-                xfs_da_buf_done(bp1);
-                return retval;
-        }
-        ASSERT(bp2 != NULL);
-        memcpy(bp2->data, bp1->data, XFS_LBSIZE(dp->i_mount));
-        xfs_da_buf_done(bp1);
-        xfs_da_log_buf(args->trans, bp2, 0, XFS_LBSIZE(dp->i_mount) - 1);
-        /*
-         * Set up the new root node.
-         */
-        retval = xfs_da_node_create(args, 0, 1, &bp1, XFS_DATA_FORK);
-        if (retval) {
-                xfs_da_buf_done(bp2);
-                return retval;
-        }
-        node = bp1->data;
-        leaf = bp2->data;
-        ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR_LEAF_MAGIC);
-        node->btree[0].hashval = cpu_to_be32(
-                INT_GET(leaf->entries[
-                        INT_GET(leaf->hdr.count, ARCH_CONVERT)-1].hashval, ARCH_CONVERT));
-        xfs_da_buf_done(bp2);
-        node->btree[0].before = cpu_to_be32(blkno);
-        node->hdr.count = cpu_to_be16(1);
-        xfs_da_log_buf(args->trans, bp1,
-                XFS_DA_LOGRANGE(node, &node->btree[0], sizeof(node->btree[0])));
-        xfs_da_buf_done(bp1);
-        return retval;
-}
-/*========================================================================
- * Routines used for growing the Btree.
- *========================================================================*/
-/*
- * Create the initial contents of a leaf directory
- * or a leaf in a node directory.
- */
-STATIC int
-xfs_dir_leaf_create(xfs_da_args_t *args, xfs_dablk_t blkno, xfs_dabuf_t **bpp)
-{
-        xfs_dir_leafblock_t *leaf;
-        xfs_dir_leaf_hdr_t *hdr;
-        xfs_inode_t *dp;
-        xfs_dabuf_t *bp;
-        int retval;
-        dp = args->dp;
-        ASSERT(dp != NULL);
-        retval = xfs_da_get_buf(args->trans, dp, blkno, -1, &bp, XFS_DATA_FORK);
-        if (retval)
-                return retval;
-        ASSERT(bp != NULL);
-        leaf = bp->data;
-        memset((char *)leaf, 0, XFS_LBSIZE(dp->i_mount));
-        hdr = &leaf->hdr;
-        hdr->info.magic = cpu_to_be16(XFS_DIR_LEAF_MAGIC);
-        INT_SET(hdr->firstused, ARCH_CONVERT, XFS_LBSIZE(dp->i_mount));
-        if (!hdr->firstused)
-                INT_SET(hdr->firstused, ARCH_CONVERT, XFS_LBSIZE(dp->i_mount) - 1);
-        INT_SET(hdr->freemap[0].base, ARCH_CONVERT, sizeof(xfs_dir_leaf_hdr_t));
-        INT_SET(hdr->freemap[0].size, ARCH_CONVERT, INT_GET(hdr->firstused, ARCH_CONVERT) - INT_GET(hdr->freemap[0].base, ARCH_CONVERT));
-        xfs_da_log_buf(args->trans, bp, 0, XFS_LBSIZE(dp->i_mount) - 1);
-        *bpp = bp;
-        return 0;
-}
-/*
- * Split the leaf node, rebalance, then add the new entry.
- */
-int
-xfs_dir_leaf_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
-                                  xfs_da_state_blk_t *newblk)
-{
-        xfs_dablk_t blkno;
-        xfs_da_args_t *args;
-        int error;
-        /*
-         * Allocate space for a new leaf node.
-         */
-        args = state->args;
-        ASSERT(args != NULL);
-        ASSERT(oldblk->magic == XFS_DIR_LEAF_MAGIC);
-        error = xfs_da_grow_inode(args, &blkno);
-        if (error)
-                return error;
-        error = xfs_dir_leaf_create(args, blkno, &newblk->bp);
-        if (error)
-                return error;
-        newblk->blkno = blkno;
-        newblk->magic = XFS_DIR_LEAF_MAGIC;
-        /*
-         * Rebalance the entries across the two leaves.
-         */
-        xfs_dir_leaf_rebalance(state, oldblk, newblk);
-        error = xfs_da_blk_link(state, oldblk, newblk);
-        if (error)
-                return error;
-        /*
-         * Insert the new entry in the correct block.
-         */
-        if (state->inleaf) {
-                error = xfs_dir_leaf_add(oldblk->bp, args, oldblk->index);
-        } else {
-                error = xfs_dir_leaf_add(newblk->bp, args, newblk->index);
-        }
-        /*
-         * Update last hashval in each block since we added the name.
-         */
-        oldblk->hashval = xfs_dir_leaf_lasthash(oldblk->bp, NULL);
-        newblk->hashval = xfs_dir_leaf_lasthash(newblk->bp, NULL);
-        return error;
-}
-/*
- * Add a name to the leaf directory structure.
- *
- * Must take into account fragmented leaves and leaves where spacemap has
- * lost some freespace information (ie: holes).
- */
-int
-xfs_dir_leaf_add(xfs_dabuf_t *bp, xfs_da_args_t *args, int index)
-{
-        xfs_dir_leafblock_t *leaf;
-        xfs_dir_leaf_hdr_t *hdr;
-        xfs_dir_leaf_map_t *map;
-        int tablesize, entsize, sum, i, tmp, error;
-        leaf = bp->data;
-        ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR_LEAF_MAGIC);
-        ASSERT((index >= 0) && (index <= INT_GET(leaf->hdr.count, ARCH_CONVERT)));
-        hdr = &leaf->hdr;
-        entsize = XFS_DIR_LEAF_ENTSIZE_BYNAME(args->namelen);
-        /*
-         * Search through freemap for first-fit on new name length.
-         * (may need to figure in size of entry struct too)
-         */
-        tablesize = (INT_GET(hdr->count, ARCH_CONVERT) + 1) * (uint)sizeof(xfs_dir_leaf_entry_t)
-                        + (uint)sizeof(xfs_dir_leaf_hdr_t);
-        map = &hdr->freemap[XFS_DIR_LEAF_MAPSIZE-1];
-        for (sum = 0, i = XFS_DIR_LEAF_MAPSIZE-1; i >= 0; map--, i--) {
-                if (tablesize > INT_GET(hdr->firstused, ARCH_CONVERT)) {
-                        sum += INT_GET(map->size, ARCH_CONVERT);
-                        continue;
-                }
-                if (!map->size)
-                        continue;       /* no space in this map */
-                tmp = entsize;
-                if (INT_GET(map->base, ARCH_CONVERT) < INT_GET(hdr->firstused, ARCH_CONVERT))
-                        tmp += (uint)sizeof(xfs_dir_leaf_entry_t);
-                if (INT_GET(map->size, ARCH_CONVERT) >= tmp) {
-                        if (!args->justcheck)
-                                xfs_dir_leaf_add_work(bp, args, index, i);
-                        return 0;
-                }
-                sum += INT_GET(map->size, ARCH_CONVERT);
-        }
-        /*
-         * If there are no holes in the address space of the block,
-         * and we don't have enough freespace, then compaction will do us
-         * no good and we should just give up.
-         */
-        if (!hdr->holes && (sum < entsize))
-                return XFS_ERROR(ENOSPC);
-        /*
-         * Compact the entries to coalesce free space.
-         * Pass the justcheck flag so the checking pass can return
-         * an error, without changing anything, if it won't fit.
-         */
-        error = xfs_dir_leaf_compact(args->trans, bp,
-                        args->total == 0 ?
-                                entsize +
-                                (uint)sizeof(xfs_dir_leaf_entry_t) : 0,
-                        args->justcheck);
-        if (error)
-                return error;
-        /*
-         * After compaction, the block is guaranteed to have only one
-         * free region, in freemap[0].  If it is not big enough, give up.
-         */
-        if (INT_GET(hdr->freemap[0].size, ARCH_CONVERT) <
-            (entsize + (uint)sizeof(xfs_dir_leaf_entry_t)))
-                return XFS_ERROR(ENOSPC);
-        if (!args->justcheck)
-                xfs_dir_leaf_add_work(bp, args, index, 0);
-        return 0;
-}
-/*
- * Add a name to a leaf directory structure.
- */
-STATIC void
-xfs_dir_leaf_add_work(xfs_dabuf_t *bp, xfs_da_args_t *args, int index,
-                      int mapindex)
-{
-        xfs_dir_leafblock_t *leaf;
-        xfs_dir_leaf_hdr_t *hdr;
-        xfs_dir_leaf_entry_t *entry;
-        xfs_dir_leaf_name_t *namest;
-        xfs_dir_leaf_map_t *map;
-        /* REFERENCED */
-        xfs_mount_t *mp;
-        int tmp, i;
-        leaf = bp->data;
-        ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR_LEAF_MAGIC);
-        hdr = &leaf->hdr;
-        ASSERT((mapindex >= 0) && (mapindex < XFS_DIR_LEAF_MAPSIZE));
-        ASSERT((index >= 0) && (index <= INT_GET(hdr->count, ARCH_CONVERT)));
-        /*
-         * Force open some space in the entry array and fill it in.
-         */
-        entry = &leaf->entries[index];
-        if (index < INT_GET(hdr->count, ARCH_CONVERT)) {
-                tmp  = INT_GET(hdr->count, ARCH_CONVERT) - index;
-                tmp *= (uint)sizeof(xfs_dir_leaf_entry_t);
-                memmove(entry + 1, entry, tmp);
-                xfs_da_log_buf(args->trans, bp,
-                    XFS_DA_LOGRANGE(leaf, entry, tmp + (uint)sizeof(*entry)));
-        }
-        INT_MOD(hdr->count, ARCH_CONVERT, +1);
-        /*
-         * Allocate space for the new string (at the end of the run).
-         */
-        map = &hdr->freemap[mapindex];
-        mp = args->trans->t_mountp;
-        ASSERT(INT_GET(map->base, ARCH_CONVERT) < XFS_LBSIZE(mp));
-        ASSERT(INT_GET(map->size, ARCH_CONVERT) >= XFS_DIR_LEAF_ENTSIZE_BYNAME(args->namelen));
-        ASSERT(INT_GET(map->size, ARCH_CONVERT) < XFS_LBSIZE(mp));
-        INT_MOD(map->size, ARCH_CONVERT, -(XFS_DIR_LEAF_ENTSIZE_BYNAME(args->namelen)));
-        INT_SET(entry->nameidx, ARCH_CONVERT, INT_GET(map->base, ARCH_CONVERT) + INT_GET(map->size, ARCH_CONVERT));
-        INT_SET(entry->hashval, ARCH_CONVERT, args->hashval);
-        entry->namelen = args->namelen;
-        xfs_da_log_buf(args->trans, bp,
-            XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry)));
-        /*
-         * Copy the string and inode number into the new space.
-         */
-        namest = XFS_DIR_LEAF_NAMESTRUCT(leaf, INT_GET(entry->nameidx, ARCH_CONVERT));
-        XFS_DIR_SF_PUT_DIRINO(&args->inumber, &namest->inumber);
-        memcpy(namest->name, args->name, args->namelen);
-        xfs_da_log_buf(args->trans, bp,
-            XFS_DA_LOGRANGE(leaf, namest, XFS_DIR_LEAF_ENTSIZE_BYENTRY(entry)));
-        /*
-         * Update the control info for this leaf node
-         */
-        if (INT_GET(entry->nameidx, ARCH_CONVERT) < INT_GET(hdr->firstused, ARCH_CONVERT))
-                INT_COPY(hdr->firstused, entry->nameidx, ARCH_CONVERT);
-        ASSERT(INT_GET(hdr->firstused, ARCH_CONVERT) >= ((INT_GET(hdr->count, ARCH_CONVERT)*sizeof(*entry))+sizeof(*hdr)));
-        tmp = (INT_GET(hdr->count, ARCH_CONVERT)-1) * (uint)sizeof(xfs_dir_leaf_entry_t)
-                        + (uint)sizeof(xfs_dir_leaf_hdr_t);
-        map = &hdr->freemap[0];
-        for (i = 0; i < XFS_DIR_LEAF_MAPSIZE; map++, i++) {
-                if (INT_GET(map->base, ARCH_CONVERT) == tmp) {
-                        INT_MOD(map->base, ARCH_CONVERT, (uint)sizeof(xfs_dir_leaf_entry_t));
-                        INT_MOD(map->size, ARCH_CONVERT, -((uint)sizeof(xfs_dir_leaf_entry_t)));
-                }
-        }
-        INT_MOD(hdr->namebytes, ARCH_CONVERT, args->namelen);
-        xfs_da_log_buf(args->trans, bp,
-                XFS_DA_LOGRANGE(leaf, hdr, sizeof(*hdr)));
-}
-/*
- * Garbage collect a leaf directory block by copying it to a new buffer.
- */
-STATIC int
-xfs_dir_leaf_compact(xfs_trans_t *trans, xfs_dabuf_t *bp, int musthave,
-                     int justcheck)
-{
-        xfs_dir_leafblock_t *leaf_s, *leaf_d;
-        xfs_dir_leaf_hdr_t *hdr_s, *hdr_d;
-        xfs_mount_t *mp;
-        char *tmpbuffer;
-        char *tmpbuffer2=NULL;
-        int rval;
-        int lbsize;
-        mp = trans->t_mountp;
-        lbsize = XFS_LBSIZE(mp);
-        tmpbuffer = kmem_alloc(lbsize, KM_SLEEP);
-        ASSERT(tmpbuffer != NULL);
-        memcpy(tmpbuffer, bp->data, lbsize);
-        /*
-         * Make a second copy in case xfs_dir_leaf_moveents()
-         * below destroys the original.
-         */
-        if (musthave || justcheck) {
-                tmpbuffer2 = kmem_alloc(lbsize, KM_SLEEP);
-                memcpy(tmpbuffer2, bp->data, lbsize);
-        }
-        memset(bp->data, 0, lbsize);
-        /*
-         * Copy basic information
-         */
-        leaf_s = (xfs_dir_leafblock_t *)tmpbuffer;
-        leaf_d = bp->data;
-        hdr_s = &leaf_s->hdr;
-        hdr_d = &leaf_d->hdr;
-        hdr_d->info = hdr_s->info;      /* struct copy */
-        INT_SET(hdr_d->firstused, ARCH_CONVERT, lbsize);
-        if (!hdr_d->firstused)
-                INT_SET(hdr_d->firstused, ARCH_CONVERT, lbsize - 1);
-        hdr_d->namebytes = 0;
-        hdr_d->count = 0;
-        hdr_d->holes = 0;
-        INT_SET(hdr_d->freemap[0].base, ARCH_CONVERT, sizeof(xfs_dir_leaf_hdr_t));
-        INT_SET(hdr_d->freemap[0].size, ARCH_CONVERT, INT_GET(hdr_d->firstused, ARCH_CONVERT) - INT_GET(hdr_d->freemap[0].base, ARCH_CONVERT));
-        /*
-         * Copy all entry's in the same (sorted) order,
-         * but allocate filenames packed and in sequence.
-         * This changes the source (leaf_s) as well.
-         */
-        xfs_dir_leaf_moveents(leaf_s, 0, leaf_d, 0, (int)INT_GET(hdr_s->count, ARCH_CONVERT), mp);
-        if (musthave && INT_GET(hdr_d->freemap[0].size, ARCH_CONVERT) < musthave)
-                rval = XFS_ERROR(ENOSPC);
-        else
-                rval = 0;
-        if (justcheck || rval == ENOSPC) {
-                ASSERT(tmpbuffer2);
-                memcpy(bp->data, tmpbuffer2, lbsize);
-        } else {
-                xfs_da_log_buf(trans, bp, 0, lbsize - 1);
-        }
-        kmem_free(tmpbuffer, lbsize);
-        if (musthave || justcheck)
-                kmem_free(tmpbuffer2, lbsize);
-        return rval;
-}
-/*
- * Redistribute the directory entries between two leaf nodes,
- * taking into account the size of the new entry.
- *
- * NOTE: if new block is empty, then it will get the upper half of old block.
- *
- * The split point comes from xfs_dir_leaf_figure_balance(); its result is
- * stored in state->inleaf (inverted if the two blocks had to be swapped
- * into order).  Entries are then moved in whichever direction is needed,
- * compacting the destination first if it looks tight, and both buffers
- * are logged in full.  On return each block's hashval field holds the
- * hash value of the last entry in that block.
- */
-STATIC void
-xfs_dir_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
-                                      xfs_da_state_blk_t *blk2)
-{
-        xfs_da_state_blk_t *tmp_blk;
-        xfs_dir_leafblock_t *leaf1, *leaf2;
-        xfs_dir_leaf_hdr_t *hdr1, *hdr2;
-        int count, totallen, max, space, swap;
-        /*
-         * Set up environment.
-         */
-        ASSERT(blk1->magic == XFS_DIR_LEAF_MAGIC);
-        ASSERT(blk2->magic == XFS_DIR_LEAF_MAGIC);
-        leaf1 = blk1->bp->data;
-        leaf2 = blk2->bp->data;
-        ASSERT(be16_to_cpu(leaf1->hdr.info.magic) == XFS_DIR_LEAF_MAGIC);
-        ASSERT(be16_to_cpu(leaf2->hdr.info.magic) == XFS_DIR_LEAF_MAGIC);
-        /*
-         * Check ordering of blocks, reverse if it makes things simpler.
-         * After a swap the local blk1/leaf1 refer to the caller's second
-         * block and blk2/leaf2 to the caller's first; "swap" remembers
-         * this so that the inleaf answer can be inverted below.
-         */
-        swap = 0;
-        if (xfs_dir_leaf_order(blk1->bp, blk2->bp)) {
-                tmp_blk = blk1;
-                blk1 = blk2;
-                blk2 = tmp_blk;
-                leaf1 = blk1->bp->data;
-                leaf2 = blk2->bp->data;
-                swap = 1;
-        }
-        hdr1 = &leaf1->hdr;
-        hdr2 = &leaf2->hdr;
-        /*
-         * Examine entries until we reduce the absolute difference in
-         * byte usage between the two blocks to a minimum.  Then get
-         * the direction to copy and the number of elements to move.
-         */
-        state->inleaf = xfs_dir_leaf_figure_balance(state, blk1, blk2,
-                                                           &count, &totallen);
-        if (swap)
-                state->inleaf = !state->inleaf;
-        /*
-         * Move any entries required from leaf to leaf:
-         * "count" is the number of existing entries that should end up
-         * in leaf1.  If leaf1 currently holds more, its high entries go
-         * to leaf2; if it holds fewer, the low entries of leaf2 are
-         * pulled into leaf1; if the counts match nothing is moved.
-         */
-        if (count < INT_GET(hdr1->count, ARCH_CONVERT)) {
-                /*
-                 * Figure the total bytes to be added to the destination leaf.
-                 */
-                count = INT_GET(hdr1->count, ARCH_CONVERT) - count;     /* number entries being moved */
-                space  = INT_GET(hdr1->namebytes, ARCH_CONVERT) - totallen;
-                space += count * ((uint)sizeof(xfs_dir_leaf_name_t)-1);
-                space += count * (uint)sizeof(xfs_dir_leaf_entry_t);
-                /*
-                 * leaf2 is the destination, compact it if it looks tight.
-                 * "max" is the gap between the end of leaf2's entry table
-                 * and its firstused offset.
-                 */
-                max  = INT_GET(hdr2->firstused, ARCH_CONVERT) - (uint)sizeof(xfs_dir_leaf_hdr_t);
-                max -= INT_GET(hdr2->count, ARCH_CONVERT) * (uint)sizeof(xfs_dir_leaf_entry_t);
-                if (space > max) {
-                        xfs_dir_leaf_compact(state->args->trans, blk2->bp,
-                                                                 0, 0);
-                }
-                /*
-                 * Move high entries from leaf1 to low end of leaf2.
-                 */
-                xfs_dir_leaf_moveents(leaf1, INT_GET(hdr1->count, ARCH_CONVERT) - count,
-                                             leaf2, 0, count, state->mp);
-                xfs_da_log_buf(state->args->trans, blk1->bp, 0,
-                                                   state->blocksize-1);
-                xfs_da_log_buf(state->args->trans, blk2->bp, 0,
-                                                   state->blocksize-1);
-        } else if (count > INT_GET(hdr1->count, ARCH_CONVERT)) {
-                /*
-                 * Figure the total bytes to be added to the destination leaf.
-                 */
-                count -= INT_GET(hdr1->count, ARCH_CONVERT);            /* number entries being moved */
-                space  = totallen - INT_GET(hdr1->namebytes, ARCH_CONVERT);
-                space += count * ((uint)sizeof(xfs_dir_leaf_name_t)-1);
-                space += count * (uint)sizeof(xfs_dir_leaf_entry_t);
-                /*
-                 * leaf1 is the destination, compact it if it looks tight.
-                 * "max" is the gap between the end of leaf1's entry table
-                 * and its firstused offset.
-                 */
-                max  = INT_GET(hdr1->firstused, ARCH_CONVERT) - (uint)sizeof(xfs_dir_leaf_hdr_t);
-                max -= INT_GET(hdr1->count, ARCH_CONVERT) * (uint)sizeof(xfs_dir_leaf_entry_t);
-                if (space > max) {
-                        xfs_dir_leaf_compact(state->args->trans, blk1->bp,
-                                                                 0, 0);
-                }
-                /*
-                 * Move low entries from leaf2 to high end of leaf1.
-                 */
-                xfs_dir_leaf_moveents(leaf2, 0, leaf1, (int)INT_GET(hdr1->count, ARCH_CONVERT),
-                                             count, state->mp);
-                xfs_da_log_buf(state->args->trans, blk1->bp, 0,
-                                                   state->blocksize-1);
-                xfs_da_log_buf(state->args->trans, blk2->bp, 0,
-                                                   state->blocksize-1);
-        }
-        /*
-         * Copy out last hashval in each block for B-tree code.
-         */
-        blk1->hashval = INT_GET(leaf1->entries[ INT_GET(leaf1->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT);
-        blk2->hashval = INT_GET(leaf2->entries[ INT_GET(leaf2->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT);
-        /*
-         * Adjust the expected index for insertion.
-         * GROT: this doesn't work unless blk2 was originally empty.
-         * When the new entry does not belong in (the possibly swapped)
-         * blk1, its index is rebased past the entries left in leaf1.
-         */
-        if (!state->inleaf) {
-                blk2->index = blk1->index - INT_GET(leaf1->hdr.count, ARCH_CONVERT);
-        }
-}
-/*
- * Examine entries until we reduce the absolute difference in
- * byte usage between the two blocks to a minimum.
- * GROT: Is this really necessary?  With other than a 512 byte blocksize,
- * GROT: there will always be enough room in either block for a new entry.
- * GROT: Do a double-split for this case?
- *
- * Entries are walked in order through blk1 and then blk2; the new entry
- * (state->args->namelen bytes of name, at position blk1->index) is
- * accounted for when the walk reaches that position.
- *
- * On return *countarg is the number of existing entries walked before the
- * split point and *namebytesarg is the byte total accumulated for them
- * with the fixed per-entry overhead (and the new entry's contribution, if
- * it was counted) subtracted back out.  The return value is nonzero if the
- * new entry was accounted for before the split point, zero otherwise.
- */
-STATIC int
-xfs_dir_leaf_figure_balance(xfs_da_state_t *state,
-                                           xfs_da_state_blk_t *blk1,
-                                           xfs_da_state_blk_t *blk2,
-                                           int *countarg, int *namebytesarg)
-{
-        xfs_dir_leafblock_t *leaf1, *leaf2;
-        xfs_dir_leaf_hdr_t *hdr1, *hdr2;
-        xfs_dir_leaf_entry_t *entry;
-        int count, max, totallen, half;
-        int lastdelta, foundit, tmp;
-        /*
-         * Set up environment.
-         */
-        leaf1 = blk1->bp->data;
-        leaf2 = blk2->bp->data;
-        hdr1 = &leaf1->hdr;
-        hdr2 = &leaf2->hdr;
-        foundit = 0;
-        totallen = 0;
-        /*
-         * Examine entries until we reduce the absolute difference in
-         * byte usage between the two blocks to a minimum.
-         *
-         * "max" is the combined entry count of both blocks and "half" is
-         * half of the estimated bytes needed for all of them plus the
-         * new entry.  "lastdelta" starts at the block size as the
-         * initial worst-case distance from "half".
-         *
-         * NOTE(review): the fixed per-entry overhead used for "half"
-         * (and subtracted again after the loop) is
-         * sizeof(*entry) + sizeof(xfs_dir_leaf_entry_t) - 1, whereas
-         * xfs_dir_leaf_rebalance() and xfs_dir_leaf_toosmall() size an
-         * entry as sizeof(xfs_dir_leaf_entry_t) +
-         * sizeof(xfs_dir_leaf_name_t) - 1 -- confirm the two agree.
-         */
-        max = INT_GET(hdr1->count, ARCH_CONVERT) + INT_GET(hdr2->count, ARCH_CONVERT);
-        half  = (max+1) * (uint)(sizeof(*entry)+sizeof(xfs_dir_leaf_entry_t)-1);
-        half += INT_GET(hdr1->namebytes, ARCH_CONVERT) + INT_GET(hdr2->namebytes, ARCH_CONVERT) + state->args->namelen;
-        half /= 2;
-        lastdelta = state->blocksize;
-        entry = &leaf1->entries[0];
-        for (count = 0; count < max; entry++, count++) {
-#define XFS_DIR_ABS(A)  (((A) < 0) ? -(A) : (A))
-                /*
-                 * The new entry is in the first block, account for it.
-                 * Stop here if adding it would move the running total
-                 * further from "half" than the previous step was.
-                 */
-                if (count == blk1->index) {
-                        tmp = totallen + (uint)sizeof(*entry)
-                                + XFS_DIR_LEAF_ENTSIZE_BYNAME(state->args->namelen);
-                        if (XFS_DIR_ABS(half - tmp) > lastdelta)
-                                break;
-                        lastdelta = XFS_DIR_ABS(half - tmp);
-                        totallen = tmp;
-                        foundit = 1;
-                }
-                /*
-                 * Wrap around into the second block if necessary.
-                 * From here on "leaf1" and "entry" walk leaf2's entries.
-                 */
-                if (count == INT_GET(hdr1->count, ARCH_CONVERT)) {
-                        leaf1 = leaf2;
-                        entry = &leaf1->entries[0];
-                }
-                /*
-                 * Figure out if next leaf entry would be too much.
-                 */
-                tmp = totallen + (uint)sizeof(*entry)
-                                + XFS_DIR_LEAF_ENTSIZE_BYENTRY(entry);
-                if (XFS_DIR_ABS(half - tmp) > lastdelta)
-                        break;
-                lastdelta = XFS_DIR_ABS(half - tmp);
-                totallen = tmp;
-#undef XFS_DIR_ABS
-        }
-        /*
-         * Calculate the number of namebytes that will end up in lower block.
-         * If new entry not in lower block, fix up the count.
-         */
-        totallen -=
-                count * (uint)(sizeof(*entry)+sizeof(xfs_dir_leaf_entry_t)-1);
-        if (foundit) {
-                totallen -= (sizeof(*entry)+sizeof(xfs_dir_leaf_entry_t)-1) +
-                            state->args->namelen;
-        }
-        *countarg = count;
-        *namebytesarg = totallen;
-        return foundit;
-}
-/*========================================================================
- * Routines used for shrinking the Btree.
- *========================================================================*/
-/*
- * Check a leaf block and its neighbors to see if the block should be
- * collapsed into one or the other neighbor.  Always keep the block
- * with the smaller block number.
- *
- * The verdict is stored in *action; the function's own return value is
- * 0, or the error reported by xfs_da_path_shift()/xfs_da_read_buf():
- * If the current block is over 50% full, don't try to join it, *action = 0.
- * If the block is empty, fill in the state structure and set *action = 2.
- * If it can be collapsed, fill in the state structure and set *action = 1.
- * If nothing can be done, *action = 0.
- */
-int
-xfs_dir_leaf_toosmall(xfs_da_state_t *state, int *action)
-{
-        xfs_dir_leafblock_t *leaf;
-        xfs_da_state_blk_t *blk;
-        xfs_da_blkinfo_t *info;
-        int count, bytes, forward, error, retval, i;
-        xfs_dablk_t blkno;
-        xfs_dabuf_t *bp;
-        /*
-         * Check for the degenerate case of the block being over 50% full.
-         * If so, it's not worth even looking to see if we might be able
-         * to coalesce with a sibling.
-         * The block examined is the last active one on state->path.
-         */
-        blk = &state->path.blk[ state->path.active-1 ];
-        info = blk->bp->data;
-        ASSERT(be16_to_cpu(info->magic) == XFS_DIR_LEAF_MAGIC);
-        leaf = (xfs_dir_leafblock_t *)info;
-        count = INT_GET(leaf->hdr.count, ARCH_CONVERT);
-        bytes = (uint)sizeof(xfs_dir_leaf_hdr_t) +
-                count * (uint)sizeof(xfs_dir_leaf_entry_t) +
-                count * ((uint)sizeof(xfs_dir_leaf_name_t)-1) +
-                INT_GET(leaf->hdr.namebytes, ARCH_CONVERT);
-        if (bytes > (state->blocksize >> 1)) {
-                *action = 0;    /* blk over 50%, don't try to join */
-                return 0;
-        }
-        /*
-         * Check for the degenerate case of the block being empty.
-         * If the block is empty, we'll simply delete it, no need to
-         * coalesce it with a sibling block.  We choose (arbitrarily)
-         * to merge with the forward block unless it is NULL.
-         */
-        if (count == 0) {
-                /*
-                 * Make altpath point to the block we want to keep and
-                 * path point to the block we want to drop (this one).
-                 * A nonzero retval from the path shift means there is
-                 * nothing to join with, so *action is 0 instead of 2.
-                 */
-                forward = (info->forw != 0);
-                memcpy(&state->altpath, &state->path, sizeof(state->path));
-                error = xfs_da_path_shift(state, &state->altpath, forward,
-                                                 0, &retval);
-                if (error)
-                        return error;
-                if (retval) {
-                        *action = 0;
-                } else {
-                        *action = 2;
-                }
-                return 0;
-        }
-        /*
-         * Examine each sibling block to see if we can coalesce with
-         * at least 25% free space to spare.  We need to figure out
-         * whether to merge with the forward or the backward block.
-         * We prefer coalescing with the lower numbered sibling so as
-         * to shrink a directory over time.
-         *
-         * For each candidate, "bytes" starts at 75% of the block size
-         * and has the name bytes of both blocks, the per-entry overhead
-         * of the combined entry count and one header subtracted; a
-         * non-negative remainder means the merge fits.  A sibling that
-         * does not fit is released; the one that fits stays held in bp
-         * when the loop breaks.
-         */
-        forward = (be32_to_cpu(info->forw) < be32_to_cpu(info->back));  /* start with smaller blk num */
-        for (i = 0; i < 2; forward = !forward, i++) {
-                if (forward)
-                        blkno = be32_to_cpu(info->forw);
-                else
-                        blkno = be32_to_cpu(info->back);
-                if (blkno == 0)
-                        continue;
-                error = xfs_da_read_buf(state->args->trans, state->args->dp,
-                                                            blkno, -1, &bp,
-                                                            XFS_DATA_FORK);
-                if (error)
-                        return error;
-                ASSERT(bp != NULL);
-                leaf = (xfs_dir_leafblock_t *)info;     /* the current block again */
-                count  = INT_GET(leaf->hdr.count, ARCH_CONVERT);
-                bytes  = state->blocksize - (state->blocksize>>2);
-                bytes -= INT_GET(leaf->hdr.namebytes, ARCH_CONVERT);
-                leaf = bp->data;                        /* now the sibling */
-                ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR_LEAF_MAGIC);
-                count += INT_GET(leaf->hdr.count, ARCH_CONVERT);
-                bytes -= INT_GET(leaf->hdr.namebytes, ARCH_CONVERT);
-                bytes -= count * ((uint)sizeof(xfs_dir_leaf_name_t) - 1);
-                bytes -= count * (uint)sizeof(xfs_dir_leaf_entry_t);
-                bytes -= (uint)sizeof(xfs_dir_leaf_hdr_t);
-                if (bytes >= 0)
-                        break;  /* fits with at least 25% to spare */
-                xfs_da_brelse(state->args->trans, bp);
-        }
-        if (i >= 2) {
-                *action = 0;    /* neither sibling was usable */
-                return 0;
-        }
-        xfs_da_buf_done(bp);
-        /*
-         * Make altpath point to the block we want to keep (the lower
-         * numbered block) and path point to the block we want to drop.
-         * A nonzero retval from the path shift cancels the join
-         * (*action = 0); otherwise *action = 1.
-         */
-        memcpy(&state->altpath, &state->path, sizeof(state->path));
-        if (blkno < blk->blkno) {
-                error = xfs_da_path_shift(state, &state->altpath, forward,
-                                                 0, &retval);
-        } else {
-                error = xfs_da_path_shift(state, &state->path, forward,
-                                                 0, &retval);
-        }
-        if (error)
-                return error;
-        if (retval) {
-                *action = 0;
-        } else {
-                *action = 1;
-        }
-        return 0;
-}
-/*
- * Remove a name from the leaf directory structure.
- *
- * trans is the transaction used to log the modified ranges of bp, bp is
- * the buffer holding the leaf block, and index is the position in
- * leaf->entries[] of the entry to remove (ASSERTed to lie in [0, count)).
- *
- * Return 1 if leaf is less than 37% full, 0 if >= 37% full.
- * If two leaves are 37% full, when combined they will leave 25% free.
- */
-int
-xfs_dir_leaf_remove(xfs_trans_t *trans, xfs_dabuf_t *bp, int index)
-{
-        xfs_dir_leafblock_t *leaf;
-        xfs_dir_leaf_hdr_t *hdr;
-        xfs_dir_leaf_map_t *map;
-        xfs_dir_leaf_entry_t *entry;
-        xfs_dir_leaf_name_t *namest;
-        int before, after, smallest, entsize;
-        int tablesize, tmp, i;
-        xfs_mount_t *mp;
-        leaf = bp->data;
-        ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR_LEAF_MAGIC);
-        hdr = &leaf->hdr;
-        mp = trans->t_mountp;
-        ASSERT((INT_GET(hdr->count, ARCH_CONVERT) > 0) && (INT_GET(hdr->count, ARCH_CONVERT) < (XFS_LBSIZE(mp)/8)));
-        ASSERT((index >= 0) && (index < INT_GET(hdr->count, ARCH_CONVERT)));
-        ASSERT(INT_GET(hdr->firstused, ARCH_CONVERT) >= ((INT_GET(hdr->count, ARCH_CONVERT)*sizeof(*entry))+sizeof(*hdr)));
-        entry = &leaf->entries[index];
-        ASSERT(INT_GET(entry->nameidx, ARCH_CONVERT) >= INT_GET(hdr->firstused, ARCH_CONVERT));
-        ASSERT(INT_GET(entry->nameidx, ARCH_CONVERT) < XFS_LBSIZE(mp));
-        /*
-         * Scan through free region table:
-         *    check for adjacency of free'd entry with an existing one,
-         *    find smallest free region in case we need to replace it,
-         *    adjust any map that borders the entry table,
-         *
-         * "before"/"after" record the freemap slot that ends exactly at,
-         * or starts exactly after, the name being freed (-1 if none);
-         * "tmp" tracks the smallest region size seen so far and
-         * "smallest" the slot holding it.
-         */
-        tablesize = INT_GET(hdr->count, ARCH_CONVERT) * (uint)sizeof(xfs_dir_leaf_entry_t)
-                        + (uint)sizeof(xfs_dir_leaf_hdr_t);
-        map = &hdr->freemap[0];
-        tmp = INT_GET(map->size, ARCH_CONVERT);
-        before = after = -1;
-        smallest = XFS_DIR_LEAF_MAPSIZE - 1;
-        entsize = XFS_DIR_LEAF_ENTSIZE_BYENTRY(entry);
-        for (i = 0; i < XFS_DIR_LEAF_MAPSIZE; map++, i++) {
-                ASSERT(INT_GET(map->base, ARCH_CONVERT) < XFS_LBSIZE(mp));
-                ASSERT(INT_GET(map->size, ARCH_CONVERT) < XFS_LBSIZE(mp));
-                if (INT_GET(map->base, ARCH_CONVERT) == tablesize) {
-                        INT_MOD(map->base, ARCH_CONVERT, -((uint)sizeof(xfs_dir_leaf_entry_t)));
-                        INT_MOD(map->size, ARCH_CONVERT, (uint)sizeof(xfs_dir_leaf_entry_t));
-                }
-                if ((INT_GET(map->base, ARCH_CONVERT) + INT_GET(map->size, ARCH_CONVERT)) == INT_GET(entry->nameidx, ARCH_CONVERT)) {
-                        before = i;
-                } else if (INT_GET(map->base, ARCH_CONVERT) == (INT_GET(entry->nameidx, ARCH_CONVERT) + entsize)) {
-                        after = i;
-                } else if (INT_GET(map->size, ARCH_CONVERT) < tmp) {
-                        tmp = INT_GET(map->size, ARCH_CONVERT);
-                        smallest = i;
-                }
-        }
-        /*
-         * Coalesce adjacent freemap regions,
-         * or replace the smallest region.
-         * With neighbours on both sides the "before" region absorbs the
-         * freed name and the "after" region, which is then cleared.
-         */
-        if ((before >= 0) || (after >= 0)) {
-                if ((before >= 0) && (after >= 0)) {
-                        map = &hdr->freemap[before];
-                        INT_MOD(map->size, ARCH_CONVERT, entsize);
-                        INT_MOD(map->size, ARCH_CONVERT, INT_GET(hdr->freemap[after].size, ARCH_CONVERT));
-                        hdr->freemap[after].base = 0;
-                        hdr->freemap[after].size = 0;
-                } else if (before >= 0) {
-                        map = &hdr->freemap[before];
-                        INT_MOD(map->size, ARCH_CONVERT, entsize);
-                } else {
-                        map = &hdr->freemap[after];
-                        INT_COPY(map->base, entry->nameidx, ARCH_CONVERT);
-                        INT_MOD(map->size, ARCH_CONVERT, entsize);
-                }
-        } else {
-                /*
-                 * Replace smallest region (if it is smaller than free'd entry)
-                 */
-                map = &hdr->freemap[smallest];
-                if (INT_GET(map->size, ARCH_CONVERT) < entsize) {
-                        INT_COPY(map->base, entry->nameidx, ARCH_CONVERT);
-                        INT_SET(map->size, ARCH_CONVERT, entsize);
-                }
-        }
-        /*
-         * Did we remove the first entry?
-         * That is, did the removed name start at hdr->firstused?
-         * "smallest" is reused from here on as that yes/no flag.
-         */
-        if (INT_GET(entry->nameidx, ARCH_CONVERT) == INT_GET(hdr->firstused, ARCH_CONVERT))
-                smallest = 1;
-        else
-                smallest = 0;
-        /*
-         * Compress the remaining entries and zero out the removed stuff.
-         *
-         * NOTE(review): the memmove length is (count - index) entries
-         * starting at entry + 1, which reaches one slot past the last
-         * valid entry; the stale last slot is zeroed afterwards --
-         * confirm the extra slot read is intended.
-         */
-        namest = XFS_DIR_LEAF_NAMESTRUCT(leaf, INT_GET(entry->nameidx, ARCH_CONVERT));
-        memset((char *)namest, 0, entsize);
-        xfs_da_log_buf(trans, bp, XFS_DA_LOGRANGE(leaf, namest, entsize));
-        INT_MOD(hdr->namebytes, ARCH_CONVERT, -(entry->namelen));
-        tmp = (INT_GET(hdr->count, ARCH_CONVERT) - index) * (uint)sizeof(xfs_dir_leaf_entry_t);
-        memmove(entry, entry + 1, tmp);
-        INT_MOD(hdr->count, ARCH_CONVERT, -1);
-        xfs_da_log_buf(trans, bp,
-            XFS_DA_LOGRANGE(leaf, entry, tmp + (uint)sizeof(*entry)));
-        entry = &leaf->entries[INT_GET(hdr->count, ARCH_CONVERT)];
-        memset((char *)entry, 0, sizeof(xfs_dir_leaf_entry_t));
-        /*
-         * If we removed the first entry, re-find the first used byte
-         * in the name area.  Note that if the entry was the "firstused",
-         * then we don't have a "hole" in our block resulting from
-         * removing the name.
-         * If the new value stores as zero, tmp - 1 is stored instead.
-         */
-        if (smallest) {
-                tmp = XFS_LBSIZE(mp);
-                entry = &leaf->entries[0];
-                for (i = INT_GET(hdr->count, ARCH_CONVERT)-1; i >= 0; entry++, i--) {
-                        ASSERT(INT_GET(entry->nameidx, ARCH_CONVERT) >= INT_GET(hdr->firstused, ARCH_CONVERT));
-                        ASSERT(INT_GET(entry->nameidx, ARCH_CONVERT) < XFS_LBSIZE(mp));
-                        if (INT_GET(entry->nameidx, ARCH_CONVERT) < tmp)
-                                tmp = INT_GET(entry->nameidx, ARCH_CONVERT);
-                }
-                INT_SET(hdr->firstused, ARCH_CONVERT, tmp);
-                if (!hdr->firstused)
-                        INT_SET(hdr->firstused, ARCH_CONVERT, tmp - 1);
-        } else {
-                hdr->holes = 1;         /* mark as needing compaction */
-        }
-        xfs_da_log_buf(trans, bp, XFS_DA_LOGRANGE(leaf, hdr, sizeof(*hdr)));
-        /*
-         * Check if the leaf's used bytes have dropped below
-         * mp->m_dir_magicpct (the 37% threshold described in the header
-         * comment); caller may want to "join" the leaf with a sibling
-         * if so.
-         */
-        tmp  = (uint)sizeof(xfs_dir_leaf_hdr_t);
-        tmp += INT_GET(leaf->hdr.count, ARCH_CONVERT) * (uint)sizeof(xfs_dir_leaf_entry_t);
-        tmp += INT_GET(leaf->hdr.count, ARCH_CONVERT) * ((uint)sizeof(xfs_dir_leaf_name_t) - 1);
-        tmp += INT_GET(leaf->hdr.namebytes, ARCH_CONVERT);
-        if (tmp < mp->m_dir_magicpct)
-                return 1;                       /* leaf is < 37% full */
-        return 0;
-}
-/*
- * Move all the directory entries from drop_leaf into save_leaf.
- *
- * state supplies the mount, block size and transaction; drop_blk is the
- * block being emptied and save_blk the block that receives its entries.
- * Both must carry the directory leaf magic (ASSERTed).
- *
- * On return drop_blk->hashval holds the hash of the last entry the
- * dropped block had before the move, save_blk->hashval holds the hash of
- * the last entry of the merged block, and the whole save buffer has been
- * logged.
- */
-void
-xfs_dir_leaf_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
-                                      xfs_da_state_blk_t *save_blk)
-{
-        xfs_dir_leafblock_t *drop_leaf, *save_leaf, *tmp_leaf;
-        xfs_dir_leaf_hdr_t *drop_hdr, *save_hdr, *tmp_hdr;
-        xfs_mount_t *mp;
-        char *tmpbuffer;
-        /*
-         * Set up environment.
-         */
-        mp = state->mp;
-        ASSERT(drop_blk->magic == XFS_DIR_LEAF_MAGIC);
-        ASSERT(save_blk->magic == XFS_DIR_LEAF_MAGIC);
-        drop_leaf = drop_blk->bp->data;
-        save_leaf = save_blk->bp->data;
-        ASSERT(be16_to_cpu(drop_leaf->hdr.info.magic) == XFS_DIR_LEAF_MAGIC);
-        ASSERT(be16_to_cpu(save_leaf->hdr.info.magic) == XFS_DIR_LEAF_MAGIC);
-        drop_hdr = &drop_leaf->hdr;
-        save_hdr = &save_leaf->hdr;
-        /*
-         * Save last hashval from dying block for later Btree fixup.
-         * The entry count is read through INT_GET() like every other
-         * access to hdr.count in this file, so that the array index is
-         * the converted value rather than the raw stored field.
-         */
-        drop_blk->hashval = INT_GET(drop_leaf->entries[ INT_GET(drop_hdr->count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT);
-        /*
-         * Check if we need a temp buffer, or can we do it in place.
-         * Note that we don't check "leaf" for holes because we will
-         * always be dropping it, toosmall() decided that for us already.
-         */
-        if (save_hdr->holes == 0) {
-                /*
-                 * dest leaf has no holes, so we add there.  May need
-                 * to make some room in the entry array.
-                 * Depending on the block ordering the dropped entries
-                 * go in front of, or after, the saved block's entries.
-                 */
-                if (xfs_dir_leaf_order(save_blk->bp, drop_blk->bp)) {
-                        xfs_dir_leaf_moveents(drop_leaf, 0, save_leaf, 0,
-                                                 (int)INT_GET(drop_hdr->count, ARCH_CONVERT), mp);
-                } else {
-                        xfs_dir_leaf_moveents(drop_leaf, 0,
-                                              save_leaf, INT_GET(save_hdr->count, ARCH_CONVERT),
-                                              (int)INT_GET(drop_hdr->count, ARCH_CONVERT), mp);
-                }
-        } else {
-                /*
-                 * Destination has holes, so we make a temporary copy
-                 * of the leaf and add them both to that.
-                 * The temporary leaf starts zeroed with the saved
-                 * block's info header, no entries, and firstused at the
-                 * end of the block (block size - 1 if the block size
-                 * stores as zero in the firstused field).
-                 */
-                tmpbuffer = kmem_alloc(state->blocksize, KM_SLEEP);
-                ASSERT(tmpbuffer != NULL);
-                memset(tmpbuffer, 0, state->blocksize);
-                tmp_leaf = (xfs_dir_leafblock_t *)tmpbuffer;
-                tmp_hdr = &tmp_leaf->hdr;
-                tmp_hdr->info = save_hdr->info; /* struct copy */
-                tmp_hdr->count = 0;
-                INT_SET(tmp_hdr->firstused, ARCH_CONVERT, state->blocksize);
-                if (!tmp_hdr->firstused)
-                        INT_SET(tmp_hdr->firstused, ARCH_CONVERT, state->blocksize - 1);
-                tmp_hdr->namebytes = 0;
-                if (xfs_dir_leaf_order(save_blk->bp, drop_blk->bp)) {
-                        xfs_dir_leaf_moveents(drop_leaf, 0, tmp_leaf, 0,
-                                                 (int)INT_GET(drop_hdr->count, ARCH_CONVERT), mp);
-                        xfs_dir_leaf_moveents(save_leaf, 0,
-                                              tmp_leaf, INT_GET(tmp_leaf->hdr.count, ARCH_CONVERT),
-                                              (int)INT_GET(save_hdr->count, ARCH_CONVERT), mp);
-                } else {
-                        xfs_dir_leaf_moveents(save_leaf, 0, tmp_leaf, 0,
-                                                 (int)INT_GET(save_hdr->count, ARCH_CONVERT), mp);
-                        xfs_dir_leaf_moveents(drop_leaf, 0,
-                                              tmp_leaf, INT_GET(tmp_leaf->hdr.count, ARCH_CONVERT),
-                                              (int)INT_GET(drop_hdr->count, ARCH_CONVERT), mp);
-                }
-                /* The rebuilt image replaces the saved block wholesale. */
-                memcpy(save_leaf, tmp_leaf, state->blocksize);
-                kmem_free(tmpbuffer, state->blocksize);
-        }
-        xfs_da_log_buf(state->args->trans, save_blk->bp, 0,
-                                           state->blocksize - 1);
-        /*
-         * Copy out last hashval in each block for B-tree code.
-         */
-        save_blk->hashval = INT_GET(save_leaf->entries[ INT_GET(save_leaf->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT);
-}
-/*========================================================================
- * Routines used for finding things in the Btree.
- *========================================================================*/
-/*
- * Search a single leaf directory block for args->name, using the
- * caller-supplied buffer.
- *
- * Several entries may share one hash value; only the entries of this
- * leaf are examined, so the Btree code is responsible for continuing
- * into neighbouring leaves.
- *
- * *index receives the entries[] slot of the match, or the slot in front
- * of which the name would have to be inserted when there is no match.
- *
- * args->inumber is written only when the name is actually found.
- * Returns EEXIST on a match and ENOENT otherwise.
- */
-int
-xfs_dir_leaf_lookup_int(xfs_dabuf_t *bp, xfs_da_args_t *args, int *index)
-{
-        xfs_dir_leafblock_t *leaf = bp->data;
-        xfs_dir_leaf_entry_t *entry;
-        xfs_dir_leaf_name_t *namest;
-        xfs_dahash_t hashval = args->hashval;
-        int nents = INT_GET(leaf->hdr.count, ARCH_CONVERT);
-        int probe, span;
-        ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR_LEAF_MAGIC);
-        ASSERT(nents < (XFS_LBSIZE(args->dp->i_mount)/8));
-        /*
-         * Coarse binary search on the hash value; it stops once the
-         * span is down to 4 or less (so small blocks never enter it)
-         * or as soon as an equal hash is hit.
-         */
-        span = nents / 2;
-        probe = span;
-        entry = &leaf->entries[probe];
-        while (span > 4) {
-                span /= 2;
-                if (INT_GET(entry->hashval, ARCH_CONVERT) < hashval)
-                        probe += span;
-                else if (INT_GET(entry->hashval, ARCH_CONVERT) > hashval)
-                        probe -= span;
-                else
-                        break;
-                entry = &leaf->entries[probe];
-        }
-        ASSERT((probe >= 0) && ((nents == 0) || (probe < nents)));
-        ASSERT((span <= 4) || (INT_GET(entry->hashval, ARCH_CONVERT) == hashval));
-        /*
-         * Hash values may repeat: back up while the hash is not below
-         * the target, then step forward to the first entry whose hash
-         * is not below it.
-         */
-        for (; (probe > 0) && (INT_GET(entry->hashval, ARCH_CONVERT) >= hashval); probe--)
-                entry--;
-        for (; (probe < nents) && (INT_GET(entry->hashval, ARCH_CONVERT) < hashval); probe++)
-                entry++;
-        if ((probe == nents) || (INT_GET(entry->hashval, ARCH_CONVERT) != hashval)) {
-                *index = probe;
-                ASSERT(args->oknoent);
-                return XFS_ERROR(ENOENT);
-        }
-        /*
-         * Walk every entry carrying the target hash and compare names:
-         * length first, then first character, then the full name.
-         */
-        for (; (probe < nents) && (INT_GET(entry->hashval, ARCH_CONVERT) == hashval);
-                   entry++, probe++) {
-                namest = XFS_DIR_LEAF_NAMESTRUCT(leaf, INT_GET(entry->nameidx, ARCH_CONVERT));
-                if (entry->namelen != args->namelen ||
-                    namest->name[0] != args->name[0] ||
-                    memcmp(args->name, namest->name, args->namelen) != 0)
-                        continue;
-                XFS_DIR_SF_GET_DIRINO(&namest->inumber, &args->inumber);
-                *index = probe;
-                return XFS_ERROR(EEXIST);
-        }
-        *index = probe;
-        ASSERT(probe == nents || args->oknoent);
-        return XFS_ERROR(ENOENT);
-}
-/*========================================================================
- * Utility routines.
- *========================================================================*/
-/*
- * Move the indicated entries from one leaf to another.
- * NOTE: this routine modifies both source and destination leaves.
- */
-/* ARGSUSED */
-STATIC void
-xfs_dir_leaf_moveents(xfs_dir_leafblock_t *leaf_s, int start_s,
-                      xfs_dir_leafblock_t *leaf_d, int start_d,
-                      int count, xfs_mount_t *mp)
-{
-        xfs_dir_leaf_hdr_t *hdr_s, *hdr_d;
-        xfs_dir_leaf_entry_t *entry_s, *entry_d;
-        int tmp, i;
-        /*
-         * Check for nothing to do.
-         */
-        if (count == 0)
-                return;
-        /*
-         * Set up environment.
-         */
-        ASSERT(be16_to_cpu(leaf_s->hdr.info.magic) == XFS_DIR_LEAF_MAGIC);
-        ASSERT(be16_to_cpu(leaf_d->hdr.info.magic) == XFS_DIR_LEAF_MAGIC);
-        hdr_s = &leaf_s->hdr;
-        hdr_d = &leaf_d->hdr;
-        ASSERT((INT_GET(hdr_s->count, ARCH_CONVERT) > 0) && (INT_GET(hdr_s->count, ARCH_CONVERT) < (XFS_LBSIZE(mp)/8)));
-        ASSERT(INT_GET(hdr_s->firstused, ARCH_CONVERT) >=
-                ((INT_GET(hdr_s->count, ARCH_CONVERT)*sizeof(*entry_s))+sizeof(*hdr_s)));
-        ASSERT(INT_GET(hdr_d->count, ARCH_CONVERT) < (XFS_LBSIZE(mp)/8));
-        ASSERT(INT_GET(hdr_d->firstused, ARCH_CONVERT) >=
-                ((INT_GET(hdr_d->count, ARCH_CONVERT)*sizeof(*entry_d))+sizeof(*hdr_d)));
-        ASSERT(start_s < INT_GET(hdr_s->count, ARCH_CONVERT));
-        ASSERT(start_d <= INT_GET(hdr_d->count, ARCH_CONVERT));
-        ASSERT(count <= INT_GET(hdr_s->count, ARCH_CONVERT));
-        /*
-         * Move the entries in the destination leaf up to make a hole?
-         */
-        if (start_d < INT_GET(hdr_d->count, ARCH_CONVERT)) {
-                tmp  = INT_GET(hdr_d->count, ARCH_CONVERT) - start_d;
-                tmp *= (uint)sizeof(xfs_dir_leaf_entry_t);
-                entry_s = &leaf_d->entries[start_d];
-                entry_d = &leaf_d->entries[start_d + count];
-                memcpy(entry_d, entry_s, tmp);
-        }
-        /*
-         * Copy all entry's in the same (sorted) order,
-         * but allocate filenames packed and in sequence.
-         */
-        entry_s = &leaf_s->entries[start_s];
-        entry_d = &leaf_d->entries[start_d];
-        for (i = 0; i < count; entry_s++, entry_d++, i++) {
-                ASSERT(INT_GET(entry_s->nameidx, ARCH_CONVERT) >= INT_GET(hdr_s->firstused, ARCH_CONVERT));
-                tmp = XFS_DIR_LEAF_ENTSIZE_BYENTRY(entry_s);
-                INT_MOD(hdr_d->firstused, ARCH_CONVERT, -(tmp));
-                entry_d->hashval = entry_s->hashval; /* INT_: direct copy */
-                INT_COPY(entry_d->nameidx, hdr_d->firstused, ARCH_CONVERT);
-                entry_d->namelen = entry_s->namelen;
-                ASSERT(INT_GET(entry_d->nameidx, ARCH_CONVERT) + tmp <= XFS_LBSIZE(mp));
-                memcpy(XFS_DIR_LEAF_NAMESTRUCT(leaf_d, INT_GET(entry_d->nameidx, ARCH_CONVERT)),
-                       XFS_DIR_LEAF_NAMESTRUCT(leaf_s, INT_GET(entry_s->nameidx, ARCH_CONVERT)), tmp);
-                ASSERT(INT_GET(entry_s->nameidx, ARCH_CONVERT) + tmp <= XFS_LBSIZE(mp));
-                memset((char *)XFS_DIR_LEAF_NAMESTRUCT(leaf_s, INT_GET(entry_s->nameidx, ARCH_CONVERT)),
-                      0, tmp);
-                INT_MOD(hdr_s->namebytes, ARCH_CONVERT, -(entry_d->namelen));
-                INT_MOD(hdr_d->namebytes, ARCH_CONVERT, entry_d->namelen);
-                INT_MOD(hdr_s->count, ARCH_CONVERT, -1);
-                INT_MOD(hdr_d->count, ARCH_CONVERT, +1);
-                tmp  = INT_GET(hdr_d->count, ARCH_CONVERT) * (uint)sizeof(xfs_dir_leaf_entry_t)
-                                + (uint)sizeof(xfs_dir_leaf_hdr_t);
-                ASSERT(INT_GET(hdr_d->firstused, ARCH_CONVERT) >= tmp);
-        }
-        /*
-         * Zero out the entries we just copied.
-         */
-        if (start_s == INT_GET(hdr_s->count, ARCH_CONVERT)) {
-                tmp = count * (uint)sizeof(xfs_dir_leaf_entry_t);
-                entry_s = &leaf_s->entries[start_s];
-                ASSERT((char *)entry_s + tmp <= (char *)leaf_s + XFS_LBSIZE(mp));
-                memset((char *)entry_s, 0, tmp);
-        } else {
-                /*
-                 * Move the remaining entries down to fill the hole,
-                 * then zero the entries at the top.
-                 */
-                tmp  = INT_GET(hdr_s->count, ARCH_CONVERT) - count;
-                tmp *= (uint)sizeof(xfs_dir_leaf_entry_t);
-                entry_s = &leaf_s->entries[start_s + count];
-                entry_d = &leaf_s->entries[start_s];
-                memcpy(entry_d, entry_s, tmp);
-                tmp = count * (uint)sizeof(xfs_dir_leaf_entry_t);
-                entry_s = &leaf_s->entries[INT_GET(hdr_s->count, ARCH_CONVERT)];
-                ASSERT((char *)entry_s + tmp <= (char *)leaf_s + XFS_LBSIZE(mp));
-                memset((char *)entry_s, 0, tmp);
-        }
-        /*
-         * Fill in the freemap information
-         */
-        INT_SET(hdr_d->freemap[0].base, ARCH_CONVERT, (uint)sizeof(xfs_dir_leaf_hdr_t));
-        INT_MOD(hdr_d->freemap[0].base, ARCH_CONVERT, INT_GET(hdr_d->count, ARCH_CONVERT) * (uint)sizeof(xfs_dir_leaf_entry_t));
-        INT_SET(hdr_d->freemap[0].size, ARCH_CONVERT, INT_GET(hdr_d->firstused, ARCH_CONVERT) - INT_GET(hdr_d->freemap[0].base, ARCH_CONVERT));
-        INT_SET(hdr_d->freemap[1].base, ARCH_CONVERT, (hdr_d->freemap[2].base = 0));
-        INT_SET(hdr_d->freemap[1].size, ARCH_CONVERT, (hdr_d->freemap[2].size = 0));
-        hdr_s->holes = 1;       /* leaf may not be compact */
-}
-/*
- * Compare two leaf blocks "order".
- */
-int
-xfs_dir_leaf_order(xfs_dabuf_t *leaf1_bp, xfs_dabuf_t *leaf2_bp)
-{
-        xfs_dir_leafblock_t *leaf1, *leaf2;
-        leaf1 = leaf1_bp->data;
-        leaf2 = leaf2_bp->data;
-        ASSERT((be16_to_cpu(leaf1->hdr.info.magic) == XFS_DIR_LEAF_MAGIC) &&
-               (be16_to_cpu(leaf2->hdr.info.magic) == XFS_DIR_LEAF_MAGIC));
-        if ((INT_GET(leaf1->hdr.count, ARCH_CONVERT) > 0) && (INT_GET(leaf2->hdr.count, ARCH_CONVERT) > 0) &&
-            ((INT_GET(leaf2->entries[ 0 ].hashval, ARCH_CONVERT) <
-              INT_GET(leaf1->entries[ 0 ].hashval, ARCH_CONVERT)) ||
-             (INT_GET(leaf2->entries[ INT_GET(leaf2->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT) <
-              INT_GET(leaf1->entries[ INT_GET(leaf1->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT)))) {
-                return 1;
-        }
-        return 0;
-}
-/*
- * Pick up the last hashvalue from a leaf block.
- */
-xfs_dahash_t
-xfs_dir_leaf_lasthash(xfs_dabuf_t *bp, int *count)
-{
-        xfs_dir_leafblock_t *leaf;
-        leaf = bp->data;
-        ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR_LEAF_MAGIC);
-        if (count)
-                *count = INT_GET(leaf->hdr.count, ARCH_CONVERT);
-        if (!leaf->hdr.count)
-                return(0);
-        return(INT_GET(leaf->entries[ INT_GET(leaf->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT));
-}
-/*
- * Copy out directory entries for getdents(), for leaf directories.
- */
-int
-xfs_dir_leaf_getdents_int(
-        xfs_dabuf_t     *bp,
-        xfs_inode_t     *dp,
-        xfs_dablk_t     bno,
-        uio_t           *uio,
-        int             *eobp,
-        xfs_dirent_t    *dbp,
-        xfs_dir_put_t   put,
-        xfs_daddr_t             nextda)
-{
-        xfs_dir_leafblock_t     *leaf;
-        xfs_dir_leaf_entry_t    *entry;
-        xfs_dir_leaf_name_t     *namest;
-        int                     entno, want_entno, i, nextentno;
-        xfs_mount_t             *mp;
-        xfs_dahash_t            cookhash;
-        xfs_dahash_t            nexthash = 0;
-#if (BITS_PER_LONG == 32)
-        xfs_dahash_t            lasthash = XFS_DA_MAXHASH;
-#endif
-        xfs_dir_put_args_t      p;
-        mp = dp->i_mount;
-        leaf = bp->data;
-        if (be16_to_cpu(leaf->hdr.info.magic) != XFS_DIR_LEAF_MAGIC) {
-                *eobp = 1;
-                return XFS_ERROR(ENOENT);       /* XXX wrong code */
-        }
-        want_entno = XFS_DA_COOKIE_ENTRY(mp, uio->uio_offset);
-        cookhash = XFS_DA_COOKIE_HASH(mp, uio->uio_offset);
-        xfs_dir_trace_g_dul("leaf: start", dp, uio, leaf);
-        /*
-         * Re-find our place.
-         */
-        for (i = entno = 0, entry = &leaf->entries[0];
-                     i < INT_GET(leaf->hdr.count, ARCH_CONVERT);
-                             entry++, i++) {
-                namest = XFS_DIR_LEAF_NAMESTRUCT(leaf,
-                                    INT_GET(entry->nameidx, ARCH_CONVERT));
-                if (unlikely(
-                    ((char *)namest < (char *)leaf) ||
-                    ((char *)namest >= (char *)leaf + XFS_LBSIZE(mp)))) {
-                        XFS_CORRUPTION_ERROR("xfs_dir_leaf_getdents_int(1)",
-                                             XFS_ERRLEVEL_LOW, mp, leaf);
-                        xfs_dir_trace_g_du("leaf: corrupted", dp, uio);
-                        return XFS_ERROR(EFSCORRUPTED);
-                }
-                if (INT_GET(entry->hashval, ARCH_CONVERT) >= cookhash) {
-                        if (   entno < want_entno
-                            && INT_GET(entry->hashval, ARCH_CONVERT)
-                                                        == cookhash) {
-                                /*
-                                 * Trying to get to a particular offset in a
-                                 * run of equal-hashval entries.
-                                 */
-                                entno++;
-                        } else if (   want_entno > 0
-                                   && entno == want_entno
-                                   && INT_GET(entry->hashval, ARCH_CONVERT)
-                                                        == cookhash) {
-                                break;
-                        } else {
-                                entno = 0;
-                                break;
-                        }
-                }
-        }
-        if (i == INT_GET(leaf->hdr.count, ARCH_CONVERT)) {
-                xfs_dir_trace_g_du("leaf: hash not found", dp, uio);
-                if (!leaf->hdr.info.forw)
-                        uio->uio_offset =
-                                XFS_DA_MAKE_COOKIE(mp, 0, 0, XFS_DA_MAXHASH);
-                /*
-                 * Don't set uio_offset if there's another block:
-                 * the node code will be setting uio_offset anyway.
-                 */
-                *eobp = 0;
-                return 0;
-        }
-        xfs_dir_trace_g_due("leaf: hash found", dp, uio, entry);
-        p.dbp = dbp;
-        p.put = put;
-        p.uio = uio;
-        /*
-         * We're synchronized, start copying entries out to the user.
-         */
-        for (; entno >= 0 && i < INT_GET(leaf->hdr.count, ARCH_CONVERT);
-                             entry++, i++, (entno = nextentno)) {
-                int lastresid=0, retval;
-                xfs_dircook_t lastoffset;
-                xfs_dahash_t thishash;
-                /*
-                 * Check for a damaged directory leaf block and pick up
-                 * the inode number from this entry.
-                 */
-                namest = XFS_DIR_LEAF_NAMESTRUCT(leaf,
-                                    INT_GET(entry->nameidx, ARCH_CONVERT));
-                if (unlikely(
-                    ((char *)namest < (char *)leaf) ||
-                    ((char *)namest >= (char *)leaf + XFS_LBSIZE(mp)))) {
-                        XFS_CORRUPTION_ERROR("xfs_dir_leaf_getdents_int(2)",
-                                             XFS_ERRLEVEL_LOW, mp, leaf);
-                        xfs_dir_trace_g_du("leaf: corrupted", dp, uio);
-                        return XFS_ERROR(EFSCORRUPTED);
-                }
-                xfs_dir_trace_g_duc("leaf: middle cookie  ",
-                                                   dp, uio, p.cook.o);
-                if (i < (INT_GET(leaf->hdr.count, ARCH_CONVERT) - 1)) {
-                        nexthash = INT_GET(entry[1].hashval, ARCH_CONVERT);
-                        if (nexthash == INT_GET(entry->hashval, ARCH_CONVERT))
-                                nextentno = entno + 1;
-                        else
-                                nextentno = 0;
-                        XFS_PUT_COOKIE(p.cook, mp, bno, nextentno, nexthash);
-                        xfs_dir_trace_g_duc("leaf: middle cookie  ",
-                                                   dp, uio, p.cook.o);
-                } else if ((thishash = be32_to_cpu(leaf->hdr.info.forw))) {
-                        xfs_dabuf_t *bp2;
-                        xfs_dir_leafblock_t *leaf2;
-                        ASSERT(nextda != -1);
-                        retval = xfs_da_read_buf(dp->i_transp, dp, thishash,
-                                                 nextda, &bp2, XFS_DATA_FORK);
-                        if (retval)
-                                return retval;
-                        ASSERT(bp2 != NULL);
-                        leaf2 = bp2->data;
-                        if (unlikely(
-                               (be16_to_cpu(leaf2->hdr.info.magic)
-                                                != XFS_DIR_LEAF_MAGIC)
-                            || (be32_to_cpu(leaf2->hdr.info.back)
-                                                != bno))) {     /* GROT */
-                                XFS_CORRUPTION_ERROR("xfs_dir_leaf_getdents_int(3)",
-                                                     XFS_ERRLEVEL_LOW, mp,
-                                                     leaf2);
-                                xfs_da_brelse(dp->i_transp, bp2);
-                                return XFS_ERROR(EFSCORRUPTED);
-                        }
-                        nexthash = INT_GET(leaf2->entries[0].hashval,
-                                                                ARCH_CONVERT);
-                        nextentno = -1;
-                        XFS_PUT_COOKIE(p.cook, mp, thishash, 0, nexthash);
-                        xfs_da_brelse(dp->i_transp, bp2);
-                        xfs_dir_trace_g_duc("leaf: next blk cookie",
-                                                   dp, uio, p.cook.o);
-                } else {
-                        nextentno = -1;
-                        XFS_PUT_COOKIE(p.cook, mp, 0, 0, XFS_DA_MAXHASH);
-                }
-                /*
-                 * Save off the cookie so we can fall back should the
-                 * 'put' into the outgoing buffer fail.  To handle a run
-                 * of equal-hashvals, the off_t structure on 64bit
-                 * builds has entno built into the cookie to ID the
-                 * entry.  On 32bit builds, we only have space for the
-                 * hashval so we can't ID specific entries within a group
-                 * of same hashval entries.   For this, lastoffset is set
-                 * to the first in the run of equal hashvals so we don't
-                 * include any entries unless we can include all entries
-                 * that share the same hashval.  Hopefully the buffer
-                 * provided is big enough to handle it (see pv763517).
-                 */
-#if (BITS_PER_LONG == 32)
-                if ((thishash = INT_GET(entry->hashval, ARCH_CONVERT))
-                                                                != lasthash) {
-                        XFS_PUT_COOKIE(lastoffset, mp, bno, entno, thishash);
-                        lastresid = uio->uio_resid;
-                        lasthash = thishash;
-                } else {
-                        xfs_dir_trace_g_duc("leaf: DUP COOKIES, skipped",
-                                                   dp, uio, p.cook.o);
-                }
-#else
-                thishash = INT_GET(entry->hashval, ARCH_CONVERT);
-                XFS_PUT_COOKIE(lastoffset, mp, bno, entno, thishash);
-                lastresid = uio->uio_resid;
-#endif /* BITS_PER_LONG == 32 */
-                /*
-                 * Put the current entry into the outgoing buffer.  If we fail
-                 * then restore the UIO to the first entry in the current
-                 * run of equal-hashval entries (probably 1 entry long).
-                 */
-                p.ino = XFS_GET_DIR_INO8(namest->inumber);
-#if XFS_BIG_INUMS
-                p.ino += mp->m_inoadd;
-#endif
-                p.name = (char *)namest->name;
-                p.namelen = entry->namelen;
-                retval = p.put(&p);
-                if (!p.done) {
-                        uio->uio_offset = lastoffset.o;
-                        uio->uio_resid = lastresid;
-                        *eobp = 1;
-                        xfs_dir_trace_g_du("leaf: E-O-B", dp, uio);
-                        return retval;
-                }
-        }
-        uio->uio_offset = p.cook.o;
-        *eobp = 0;
-        xfs_dir_trace_g_du("leaf: E-O-F", dp, uio);
-        return 0;
-}
-/*
- * Format a dirent64 structure and copy it out to the user's buffer.
- */
-int
-xfs_dir_put_dirent64_direct(xfs_dir_put_args_t *pa)
-{
-        iovec_t *iovp;
-        int reclen, namelen;
-        xfs_dirent_t *idbp;
-        uio_t *uio;
-        namelen = pa->namelen;
-        reclen = DIRENTSIZE(namelen);
-        uio = pa->uio;
-        if (reclen > uio->uio_resid) {
-                pa->done = 0;
-                return 0;
-        }
-        iovp = uio->uio_iov;
-        idbp = (xfs_dirent_t *)iovp->iov_base;
-        iovp->iov_base = (char *)idbp + reclen;
-        iovp->iov_len -= reclen;
-        uio->uio_resid -= reclen;
-        idbp->d_reclen = reclen;
-        idbp->d_ino = pa->ino;
-        idbp->d_off = pa->cook.o;
-        idbp->d_name[namelen] = '\0';
-        pa->done = 1;
-        memcpy(idbp->d_name, pa->name, namelen);
-        return 0;
-}
-/*
- * Format a dirent64 structure and copy it out to the user's buffer.
- */
-int
-xfs_dir_put_dirent64_uio(xfs_dir_put_args_t *pa)
-{
-        int             retval, reclen, namelen;
-        xfs_dirent_t    *idbp;
-        uio_t           *uio;
-        namelen = pa->namelen;
-        reclen = DIRENTSIZE(namelen);
-        uio = pa->uio;
-        if (reclen > uio->uio_resid) {
-                pa->done = 0;
-                return 0;
-        }
-        idbp = pa->dbp;
-        idbp->d_reclen = reclen;
-        idbp->d_ino = pa->ino;
-        idbp->d_off = pa->cook.o;
-        idbp->d_name[namelen] = '\0';
-        memcpy(idbp->d_name, pa->name, namelen);
-        retval = uio_read((caddr_t)idbp, reclen, uio);
-        pa->done = (retval == 0);
-        return retval;
-}
diff --git a/fs/xfs/xfs_dir_leaf.h b/fs/xfs/xfs_dir_leaf.h
deleted file mode 100644
index eb8cd9a4667f..000000000000
--- a/fs/xfs/xfs_dir_leaf.h
+++ /dev/null
@@ -1,231 +0,0 @@
-/*
- * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_DIR_LEAF_H__
-#define __XFS_DIR_LEAF_H__
-/*
- * Directory layout, internal structure, access macros, etc.
- *
- * Large directories are structured around Btrees where all the data
- * elements are in the leaf nodes.  Filenames are hashed into an int,
- * then that int is used as the index into the Btree.  Since the hashval
- * of a filename may not be unique, we may have duplicate keys.  The
- * internal links in the Btree are logical block offsets into the file.
- */
-struct uio;
-struct xfs_bmap_free;
-struct xfs_dabuf;
-struct xfs_da_args;
-struct xfs_da_state;
-struct xfs_da_state_blk;
-struct xfs_dir_put_args;
-struct xfs_inode;
-struct xfs_mount;
-struct xfs_trans;
-/*========================================================================
- * Directory Structure when equal to XFS_LBSIZE(mp) bytes.
- *========================================================================*/
-/*
- * This is the structure of the leaf nodes in the Btree.
- *
- * Struct leaf_entry's are packed from the top.  Names grow from the bottom
- * but are not packed.  The freemap contains run-length-encoded entries
- * for the free bytes after the leaf_entry's, but only the N largest such,
- * smaller runs are dropped.  When the freemap doesn't show enough space
- * for an allocation, we compact the namelist area and try again.  If we
- * still don't have enough space, then we have to split the block.
- *
- * Since we have duplicate hash keys, for each key that matches, compare
- * the actual string.  The root and intermediate node search always takes
- * the first-in-the-block key match found, so we should only have to work
- * "forw"ard.  If none matches, continue with the "forw"ard leaf nodes
- * until the hash key changes or the filename is found.
- *
- * The parent directory and the self-pointer are explicitly represented
- * (ie: there are entries for "." and "..").
- *
- * Note that the count being a __uint16_t limits us to something like a
- * blocksize of 1.3MB in the face of worst case (short) filenames.
- */
-#define XFS_DIR_LEAF_MAPSIZE    3       /* how many freespace slots */
-typedef struct xfs_dir_leaf_map {       /* RLE map of free bytes */
-        __uint16_t      base;           /* base of free region */
-        __uint16_t      size;           /* run length of free region */
-} xfs_dir_leaf_map_t;
-typedef struct xfs_dir_leaf_hdr {       /* constant-structure header block */
-        xfs_da_blkinfo_t info;          /* block type, links, etc. */
-        __uint16_t      count;          /* count of active leaf_entry's */
-        __uint16_t      namebytes;      /* num bytes of name strings stored */
-        __uint16_t      firstused;      /* first used byte in name area */
-        __uint8_t       holes;          /* != 0 if blk needs compaction */
-        __uint8_t       pad1;
-        xfs_dir_leaf_map_t freemap[XFS_DIR_LEAF_MAPSIZE];
-} xfs_dir_leaf_hdr_t;
-typedef struct xfs_dir_leaf_entry {     /* sorted on key, not name */
-        xfs_dahash_t    hashval;        /* hash value of name */
-        __uint16_t      nameidx;        /* index into buffer of name */
-        __uint8_t       namelen;        /* length of name string */
-        __uint8_t       pad2;
-} xfs_dir_leaf_entry_t;
-typedef struct xfs_dir_leaf_name {
-        xfs_dir_ino_t   inumber;        /* inode number for this key */
-        __uint8_t       name[1];        /* name string itself */
-} xfs_dir_leaf_name_t;
-typedef struct xfs_dir_leafblock {
-        xfs_dir_leaf_hdr_t      hdr;    /* constant-structure header block */
-        xfs_dir_leaf_entry_t    entries[1];     /* var sized array */
-        xfs_dir_leaf_name_t     namelist[1];    /* grows from bottom of buf */
-} xfs_dir_leafblock_t;
-/*
- * Length of name for which a 512-byte block filesystem
- * can get a double split.
- */
-#define XFS_DIR_LEAF_CAN_DOUBLE_SPLIT_LEN       \
-        (512 - (uint)sizeof(xfs_dir_leaf_hdr_t) - \
-         (uint)sizeof(xfs_dir_leaf_entry_t) * 2 - \
-         (uint)sizeof(xfs_dir_leaf_name_t) * 2 - (MAXNAMELEN - 2) + 1 + 1)
-typedef int (*xfs_dir_put_t)(struct xfs_dir_put_args *pa);
-typedef union {
-        xfs_off_t               o;              /* offset (cookie) */
-        /*
-         * Watch the order here (endian-ness dependent).
-         */
-        struct {
-#ifndef XFS_NATIVE_HOST
-                xfs_dahash_t    h;      /* hash value */
-                __uint32_t      be;     /* block and entry */
-#else
-                __uint32_t      be;     /* block and entry */
-                xfs_dahash_t    h;      /* hash value */
-#endif /* XFS_NATIVE_HOST */
-        } s;
-} xfs_dircook_t;
-#define XFS_PUT_COOKIE(c,mp,bno,entry,hash)     \
-        ((c).s.be = XFS_DA_MAKE_BNOENTRY(mp, bno, entry), (c).s.h = (hash))
-typedef struct xfs_dir_put_args {
-        xfs_dircook_t   cook;           /* cookie of (next) entry */
-        xfs_intino_t    ino;            /* inode number */
-        struct xfs_dirent *dbp;         /* buffer pointer */
-        char            *name;          /* directory entry name */
-        int             namelen;        /* length of name */
-        int             done;           /* output: set if value was stored */
-        xfs_dir_put_t   put;            /* put function ptr (i/o) */
-        struct uio      *uio;           /* uio control structure */
-} xfs_dir_put_args_t;
-#define XFS_DIR_LEAF_ENTSIZE_BYNAME(len)        \
-        xfs_dir_leaf_entsize_byname(len)
-static inline int xfs_dir_leaf_entsize_byname(int len)
-{
-        return (uint)sizeof(xfs_dir_leaf_name_t)-1 + len;
-}
-#define XFS_DIR_LEAF_ENTSIZE_BYENTRY(entry)     \
-        xfs_dir_leaf_entsize_byentry(entry)
-static inline int xfs_dir_leaf_entsize_byentry(xfs_dir_leaf_entry_t *entry)
-{
-        return (uint)sizeof(xfs_dir_leaf_name_t)-1 + (entry)->namelen;
-}
-#define XFS_DIR_LEAF_NAMESTRUCT(leafp,offset)   \
-        xfs_dir_leaf_namestruct(leafp,offset)
-static inline xfs_dir_leaf_name_t *
-xfs_dir_leaf_namestruct(xfs_dir_leafblock_t *leafp, int offset)
-{
-        return (xfs_dir_leaf_name_t *)&((char *)(leafp))[offset];
-}
-/*========================================================================
- * Function prototypes for the kernel.
- *========================================================================*/
-/*
- * Internal routines when dirsize < XFS_LITINO(mp).
- */
-int xfs_dir_shortform_create(struct xfs_da_args *args, xfs_ino_t parent);
-int xfs_dir_shortform_addname(struct xfs_da_args *args);
-int xfs_dir_shortform_lookup(struct xfs_da_args *args);
-int xfs_dir_shortform_to_leaf(struct xfs_da_args *args);
-int xfs_dir_shortform_removename(struct xfs_da_args *args);
-int xfs_dir_shortform_getdents(struct xfs_inode *dp, struct uio *uio, int *eofp,
-                               struct xfs_dirent *dbp, xfs_dir_put_t put);
-int xfs_dir_shortform_replace(struct xfs_da_args *args);
-/*
- * Internal routines when dirsize == XFS_LBSIZE(mp).
- */
-int xfs_dir_leaf_to_node(struct xfs_da_args *args);
-int xfs_dir_leaf_to_shortform(struct xfs_da_args *args);
-/*
- * Routines used for growing the Btree.
- */
-int     xfs_dir_leaf_split(struct xfs_da_state *state,
-                                  struct xfs_da_state_blk *oldblk,
-                                  struct xfs_da_state_blk *newblk);
-int     xfs_dir_leaf_add(struct xfs_dabuf *leaf_buffer,
-                                struct xfs_da_args *args, int insertion_index);
-int     xfs_dir_leaf_addname(struct xfs_da_args *args);
-int     xfs_dir_leaf_lookup_int(struct xfs_dabuf *leaf_buffer,
-                                       struct xfs_da_args *args,
-                                       int *index_found_at);
-int     xfs_dir_leaf_remove(struct xfs_trans *trans,
-                                   struct xfs_dabuf *leaf_buffer,
-                                   int index_to_remove);
-int     xfs_dir_leaf_getdents_int(struct xfs_dabuf *bp, struct xfs_inode *dp,
-                                         xfs_dablk_t bno, struct uio *uio,
-                                         int *eobp, struct xfs_dirent *dbp,
-                                         xfs_dir_put_t put, xfs_daddr_t nextda);
-/*
- * Routines used for shrinking the Btree.
- */
-int     xfs_dir_leaf_toosmall(struct xfs_da_state *state, int *retval);
-void    xfs_dir_leaf_unbalance(struct xfs_da_state *state,
-                                             struct xfs_da_state_blk *drop_blk,
-                                             struct xfs_da_state_blk *save_blk);
-/*
- * Utility routines.
- */
-uint    xfs_dir_leaf_lasthash(struct xfs_dabuf *bp, int *count);
-int     xfs_dir_leaf_order(struct xfs_dabuf *leaf1_bp,
-                                  struct xfs_dabuf *leaf2_bp);
-int     xfs_dir_put_dirent64_direct(xfs_dir_put_args_t *pa);
-int     xfs_dir_put_dirent64_uio(xfs_dir_put_args_t *pa);
-int     xfs_dir_ino_validate(struct xfs_mount *mp, xfs_ino_t ino);
-/*
- * Global data.
- */
-extern xfs_dahash_t     xfs_dir_hash_dot, xfs_dir_hash_dotdot;
-#endif /* __XFS_DIR_LEAF_H__ */
diff --git a/fs/xfs/xfs_dir_sf.h b/fs/xfs/xfs_dir_sf.h
deleted file mode 100644
index 5b20b4d3f57d..000000000000
--- a/fs/xfs/xfs_dir_sf.h
+++ /dev/null
@@ -1,155 +0,0 @@
-/*
- * Copyright (c) 2000,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_DIR_SF_H__
-#define __XFS_DIR_SF_H__
-/*
- * Directory layout when stored internal to an inode.
- *
- * Small directories are packed as tightly as possible so as to
- * fit into the literal area of the inode.
- */
-typedef struct { __uint8_t i[sizeof(xfs_ino_t)]; } xfs_dir_ino_t;
-/*
- * The parent directory has a dedicated field, and the self-pointer must
- * be calculated on the fly.
- *
- * Entries are packed toward the top as tight as possible.  The header
- * and the elements must be memcpy'd out into a work area to get correct
- * alignment for the inode number fields.
- */
-typedef struct xfs_dir_sf_hdr {         /* constant-structure header block */
-        xfs_dir_ino_t   parent;         /* parent dir inode number */
-        __uint8_t       count;          /* count of active entries */
-} xfs_dir_sf_hdr_t;
-typedef struct xfs_dir_sf_entry {
-        xfs_dir_ino_t   inumber;        /* referenced inode number */
-        __uint8_t       namelen;        /* actual length of name (no NULL) */
-        __uint8_t       name[1];        /* name */
-} xfs_dir_sf_entry_t;
-typedef struct xfs_dir_shortform {
-        xfs_dir_sf_hdr_t        hdr;
-        xfs_dir_sf_entry_t      list[1];        /* variable sized array */
-} xfs_dir_shortform_t;
-/*
- * We generate this then sort it, so that readdirs are returned in
- * hash-order.  Else seekdir won't work.
- */
-typedef struct xfs_dir_sf_sort {
-        __uint8_t       entno;          /* .=0, ..=1, else entry# + 2 */
-        __uint8_t       seqno;          /* sequence # with same hash value */
-        __uint8_t       namelen;        /* length of name value (no null) */
-        xfs_dahash_t    hash;           /* this entry's hash value */
-        xfs_intino_t    ino;            /* this entry's inode number */
-        char            *name;          /* name value, pointer into buffer */
-} xfs_dir_sf_sort_t;
-#define XFS_DIR_SF_GET_DIRINO(from,to)  xfs_dir_sf_get_dirino(from, to)
-static inline void xfs_dir_sf_get_dirino(xfs_dir_ino_t *from, xfs_ino_t *to)
-{
-        *(to) = XFS_GET_DIR_INO8(*from);
-}
-#define XFS_DIR_SF_PUT_DIRINO(from,to)  xfs_dir_sf_put_dirino(from, to)
-static inline void xfs_dir_sf_put_dirino(xfs_ino_t *from, xfs_dir_ino_t *to)
-{
-        XFS_PUT_DIR_INO8(*(from), *(to));
-}
-#define XFS_DIR_SF_ENTSIZE_BYNAME(len)  xfs_dir_sf_entsize_byname(len)
-static inline int xfs_dir_sf_entsize_byname(int len)
-{
-        return (uint)sizeof(xfs_dir_sf_entry_t)-1 + (len);
-}
-#define XFS_DIR_SF_ENTSIZE_BYENTRY(sfep)        xfs_dir_sf_entsize_byentry(sfep)
-static inline int xfs_dir_sf_entsize_byentry(xfs_dir_sf_entry_t *sfep)
-{
-        return (uint)sizeof(xfs_dir_sf_entry_t)-1 + (sfep)->namelen;
-}
-#define XFS_DIR_SF_NEXTENTRY(sfep)              xfs_dir_sf_nextentry(sfep)
-static inline xfs_dir_sf_entry_t *xfs_dir_sf_nextentry(xfs_dir_sf_entry_t *sfep)
-{
-        return (xfs_dir_sf_entry_t *) \
-                ((char *)(sfep) + XFS_DIR_SF_ENTSIZE_BYENTRY(sfep));
-}
-#define XFS_DIR_SF_ALLFIT(count,totallen)       \
-        xfs_dir_sf_allfit(count,totallen)
-static inline int xfs_dir_sf_allfit(int count, int totallen)
-{
-        return ((uint)sizeof(xfs_dir_sf_hdr_t) + \
-               ((uint)sizeof(xfs_dir_sf_entry_t)-1)*(count) + (totallen));
-}
-#if defined(XFS_DIR_TRACE)
-/*
- * Kernel tracing support for directories.
- */
-struct uio;
-struct xfs_inode;
-struct xfs_da_intnode;
-struct xfs_dinode;
-struct xfs_dir_leafblock;
-struct xfs_dir_leaf_entry;
-#define XFS_DIR_TRACE_SIZE      4096    /* size of global trace buffer */
-extern ktrace_t *xfs_dir_trace_buf;
-/*
- * Trace record types.
- */
-#define XFS_DIR_KTRACE_G_DU     1       /* dp, uio */
-#define XFS_DIR_KTRACE_G_DUB    2       /* dp, uio, bno */
-#define XFS_DIR_KTRACE_G_DUN    3       /* dp, uio, node */
-#define XFS_DIR_KTRACE_G_DUL    4       /* dp, uio, leaf */
-#define XFS_DIR_KTRACE_G_DUE    5       /* dp, uio, leaf entry */
-#define XFS_DIR_KTRACE_G_DUC    6       /* dp, uio, cookie */
-void xfs_dir_trace_g_du(char *where, struct xfs_inode *dp, struct uio *uio);
-void xfs_dir_trace_g_dub(char *where, struct xfs_inode *dp, struct uio *uio,
-                              xfs_dablk_t bno);
-void xfs_dir_trace_g_dun(char *where, struct xfs_inode *dp, struct uio *uio,
-                              struct xfs_da_intnode *node);
-void xfs_dir_trace_g_dul(char *where, struct xfs_inode *dp, struct uio *uio,
-                              struct xfs_dir_leafblock *leaf);
-void xfs_dir_trace_g_due(char *where, struct xfs_inode *dp, struct uio *uio,
-                              struct xfs_dir_leaf_entry *entry);
-void xfs_dir_trace_g_duc(char *where, struct xfs_inode *dp, struct uio *uio,
-                              xfs_off_t cookie);
-void xfs_dir_trace_enter(int type, char *where,
-                             void *a0, void *a1, void *a2, void *a3,
-                             void *a4, void *a5, void *a6, void *a7,
-                             void *a8, void *a9, void *a10, void *a11);
-#else
-#define xfs_dir_trace_g_du(w,d,u)
-#define xfs_dir_trace_g_dub(w,d,u,b)
-#define xfs_dir_trace_g_dun(w,d,u,n)
-#define xfs_dir_trace_g_dul(w,d,u,l)
-#define xfs_dir_trace_g_due(w,d,u,e)
-#define xfs_dir_trace_g_duc(w,d,u,c)
-#endif /* XFS_DIR_TRACE */
-#endif  /* __XFS_DIR_SF_H__ */
diff --git a/fs/xfs/xfs_dmapi.h b/fs/xfs/xfs_dmapi.h
index 00b1540f8108..4e7865ad6f0e 100644
--- a/fs/xfs/xfs_dmapi.h
+++ b/fs/xfs/xfs_dmapi.h
@@ -189,6 +189,6 @@ typedef enum {
 #define AT_DELAY_FLAG(f) ((f&ATTR_NONBLOCK) ? DM_FLAGS_NDELAY : 0)
-extern struct bhv_vfsops xfs_dmops;
+extern struct bhv_module_vfsops xfs_dmops;
 #endif  /* __XFS_DMAPI_H__ */
diff --git a/fs/xfs/xfs_dmops.c b/fs/xfs/xfs_dmops.c
index 629795b3b3d5..1e4a35ddf7f9 100644
--- a/fs/xfs/xfs_dmops.c
+++ b/fs/xfs/xfs_dmops.c
@@ -23,7 +23,6 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 2a21c5024017..b95681b03d81 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -22,12 +22,10 @@
 #include "xfs_inum.h"
 #include "xfs_trans.h"
 #include "xfs_sb.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index f19282ec8549..6cf6d8769b97 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -23,7 +23,6 @@
 #include "xfs_trans.h"
 #include "xfs_buf_item.h"
 #include "xfs_sb.h"
-#include "xfs_dir.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_trans_priv.h"
@@ -294,6 +293,62 @@ xfs_efi_init(xfs_mount_t	*mp,
 }
 /*
+ * Copy an EFI format buffer from the given buf into the destination
+ * EFI format structure.
+ * The given buffer can be in 32 bit or 64 bit form (which have different
+ * padding), one of which will be the native format for this kernel.
+ * The formats are converted if necessary.
+ */
+int
+xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt)
+{
+        xfs_efi_log_format_t *src_efi_fmt = (xfs_efi_log_format_t *)buf->i_addr; /* buf contents viewed as the native layout */
+        uint i;
+        uint len = sizeof(xfs_efi_log_format_t) + 
+                (src_efi_fmt->efi_nextents - 1) * sizeof(xfs_extent_t);   /* expected size in the native layout; efi_extents[1] already holds one extent */
+        uint len32 = sizeof(xfs_efi_log_format_32_t) + 
+                (src_efi_fmt->efi_nextents - 1) * sizeof(xfs_extent_32_t);   /* expected size in the packed 32 bit layout */
+        uint len64 = sizeof(xfs_efi_log_format_64_t) + 
+                (src_efi_fmt->efi_nextents - 1) * sizeof(xfs_extent_64_t);   /* expected size in the padded 64 bit layout */
+        /* NOTE(review): efi_nextents is read before i_len is checked, and (nextents - 1) is unsigned so it would wrap for 0 - presumably callers validate the record first; confirm */
+        if (buf->i_len == len) { /* length matches the native layout: copy verbatim */
+                memcpy((char *)dst_efi_fmt, (char*)src_efi_fmt, len);
+                return 0;
+        } else if (buf->i_len == len32) { /* 32 bit layout: convert field by field */
+                xfs_efi_log_format_32_t *src_efi_fmt_32 =
+                        (xfs_efi_log_format_32_t *)buf->i_addr;
+                dst_efi_fmt->efi_type     = src_efi_fmt_32->efi_type;
+                dst_efi_fmt->efi_size     = src_efi_fmt_32->efi_size;
+                dst_efi_fmt->efi_nextents = src_efi_fmt_32->efi_nextents;
+                dst_efi_fmt->efi_id       = src_efi_fmt_32->efi_id;
+                for (i = 0; i < dst_efi_fmt->efi_nextents; i++) { /* copy each extent into the native extent array */
+                        dst_efi_fmt->efi_extents[i].ext_start =
+                                src_efi_fmt_32->efi_extents[i].ext_start;
+                        dst_efi_fmt->efi_extents[i].ext_len =
+                                src_efi_fmt_32->efi_extents[i].ext_len;
+                }
+                return 0;
+        } else if (buf->i_len == len64) { /* 64 bit layout: same conversion, ext_pad is not copied */
+                xfs_efi_log_format_64_t *src_efi_fmt_64 =
+                        (xfs_efi_log_format_64_t *)buf->i_addr;
+                dst_efi_fmt->efi_type     = src_efi_fmt_64->efi_type;
+                dst_efi_fmt->efi_size     = src_efi_fmt_64->efi_size;
+                dst_efi_fmt->efi_nextents = src_efi_fmt_64->efi_nextents;
+                dst_efi_fmt->efi_id       = src_efi_fmt_64->efi_id;
+                for (i = 0; i < dst_efi_fmt->efi_nextents; i++) {
+                        dst_efi_fmt->efi_extents[i].ext_start =
+                                src_efi_fmt_64->efi_extents[i].ext_start;
+                        dst_efi_fmt->efi_extents[i].ext_len =
+                                src_efi_fmt_64->efi_extents[i].ext_len;
+                }
+                return 0;
+        }
+        return EFSCORRUPTED; /* i_len matches none of the three expected sizes */
+}
+/*
 * This is called by the efd item code below to release references to
 * the given efi item.  Each efd calls this with the number of
 * extents that it has logged, and when the sum of these reaches
diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h
index 5bf681708fec..0ea45edaab03 100644
--- a/fs/xfs/xfs_extfree_item.h
+++ b/fs/xfs/xfs_extfree_item.h
@@ -27,6 +27,24 @@ typedef struct xfs_extent {
 } xfs_extent_t;
 /*
+ * Since an xfs_extent_t has types (start:64, len: 32)
+ * there are different alignments on 32 bit and 64 bit kernels.
+ * So we provide the different variants for use by a
+ * conversion routine.
+ */
+typedef struct xfs_extent_32 { /* extent variant for the 32 bit kernel layout */
+        xfs_dfsbno_t    ext_start;
+        xfs_extlen_t    ext_len;
+} __attribute__((packed)) xfs_extent_32_t; /* packed: no padding after ext_len */
+typedef struct xfs_extent_64 { /* extent variant for the 64 bit kernel layout */
+        xfs_dfsbno_t    ext_start;
+        xfs_extlen_t    ext_len;
+        __uint32_t      ext_pad; /* explicit 32 bit pad after ext_len; presumably mirrors 64 bit alignment padding - confirm */
+} xfs_extent_64_t;
+/*
 * This is the structure used to lay out an efi log item in the
 * log.  The efi_extents field is a variable size array whose
 * size is given by efi_nextents.
@@ -39,6 +57,22 @@ typedef struct xfs_efi_log_format {
        xfs_extent_t            efi_extents[1]; /* array of extents to free */
 } xfs_efi_log_format_t;
+typedef struct xfs_efi_log_format_32 { /* efi log format built on the packed xfs_extent_32_t */
+        unsigned short          efi_type;       /* efi log item type */
+        unsigned short          efi_size;       /* size of this item */
+        uint                    efi_nextents;   /* # extents to free */
+        __uint64_t              efi_id;         /* efi identifier */
+        xfs_extent_32_t         efi_extents[1]; /* array of extents to free */
+} __attribute__((packed)) xfs_efi_log_format_32_t;
+typedef struct xfs_efi_log_format_64 { /* efi log format built on the padded xfs_extent_64_t */
+        unsigned short          efi_type;       /* efi log item type */
+        unsigned short          efi_size;       /* size of this item */
+        uint                    efi_nextents;   /* # extents to free */
+        __uint64_t              efi_id;         /* efi identifier */
+        xfs_extent_64_t         efi_extents[1]; /* array of extents to free */
+} xfs_efi_log_format_64_t;
 /*
 * This is the structure used to lay out an efd log item in the
 * log.  The efd_extents array is a variable size array whose
@@ -52,6 +86,22 @@ typedef struct xfs_efd_log_format {
        xfs_extent_t            efd_extents[1]; /* array of extents freed */
 } xfs_efd_log_format_t;
+typedef struct xfs_efd_log_format_32 { /* efd log format built on the packed xfs_extent_32_t */
+        unsigned short          efd_type;       /* efd log item type */
+        unsigned short          efd_size;       /* size of this item */
+        uint                    efd_nextents;   /* # of extents freed */
+        __uint64_t              efd_efi_id;     /* id of corresponding efi */
+        xfs_extent_32_t         efd_extents[1]; /* array of extents freed */
+} __attribute__((packed)) xfs_efd_log_format_32_t;
+typedef struct xfs_efd_log_format_64 { /* efd log format built on the padded xfs_extent_64_t */
+        unsigned short          efd_type;       /* efd log item type */
+        unsigned short          efd_size;       /* size of this item */
+        uint                    efd_nextents;   /* # of extents freed */
+        __uint64_t              efd_efi_id;     /* id of corresponding efi */
+        xfs_extent_64_t         efd_extents[1]; /* array of extents freed */
+} xfs_efd_log_format_64_t;
 #ifdef __KERNEL__
@@ -103,7 +153,8 @@ extern struct kmem_zone	*xfs_efd_zone;
 xfs_efi_log_item_t      *xfs_efi_init(struct xfs_mount *, uint);
 xfs_efd_log_item_t      *xfs_efd_init(struct xfs_mount *, xfs_efi_log_item_t *,
                                      uint);
+int                     xfs_efi_copy_format(xfs_log_iovec_t *buf,
+                                            xfs_efi_log_format_t *dst_efi_fmt);
 void                    xfs_efi_item_free(xfs_efi_log_item_t *);
 #endif  /* __KERNEL__ */
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
index 14010f1fa82f..0f0ad1535951 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/xfs_fs.h
@@ -67,14 +67,15 @@ struct fsxattr {
 #define XFS_XFLAG_NOSYMLINKS    0x00000400      /* disallow symlink creation */
 #define XFS_XFLAG_EXTSIZE       0x00000800      /* extent size allocator hint */
 #define XFS_XFLAG_EXTSZINHERIT  0x00001000      /* inherit inode extent size */
+#define XFS_XFLAG_NODEFRAG      0x00002000      /* do not defragment */
 #define XFS_XFLAG_HASATTR       0x80000000      /* no DIFLAG for this   */
 /*
 * Structure for XFS_IOC_GETBMAP.
 * On input, fill in bmv_offset and bmv_length of the first structure
- * to indicate the area of interest in the file, and bmv_entry with the
+ * to indicate the area of interest in the file, and bmv_entries with
- * number of array elements given.  The first structure is updated on
+ * the number of array elements given back.  The first structure is
- * return to give the offset and length for the next call.
+ * updated on return to give the offset and length for the next call.
 */
 #ifndef HAVE_GETBMAP
 struct getbmap {
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index dfa3527b20a7..077629bab532 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -24,14 +24,12 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
@@ -542,14 +540,13 @@ xfs_reserve_blocks(
 }
 void
-xfs_fs_log_dummy(xfs_mount_t *mp)
+xfs_fs_log_dummy(
+        xfs_mount_t     *mp)
 {
-        xfs_trans_t *tp;
+        xfs_trans_t     *tp;
-        xfs_inode_t *ip;
+        xfs_inode_t     *ip;
        tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1);
-        atomic_inc(&mp->m_active_trans);
        if (xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0)) {
                xfs_trans_cancel(tp, 0);
                return;
@@ -574,21 +571,22 @@ xfs_fs_goingdown(
 {
        switch (inflags) {
        case XFS_FSOP_GOING_FLAGS_DEFAULT: {
-                struct vfs *vfsp = XFS_MTOVFS(mp);
+                struct bhv_vfs *vfsp = XFS_MTOVFS(mp);
                struct super_block *sb = freeze_bdev(vfsp->vfs_super->s_bdev);
                if (sb && !IS_ERR(sb)) {
-                        xfs_force_shutdown(mp, XFS_FORCE_UMOUNT);
+                        xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT);
                        thaw_bdev(sb->s_bdev, sb);
                }
        
                break;
        }
        case XFS_FSOP_GOING_FLAGS_LOGFLUSH:
-                xfs_force_shutdown(mp, XFS_FORCE_UMOUNT);
+                xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT);
                break;
        case XFS_FSOP_GOING_FLAGS_NOLOGFLUSH:
-                xfs_force_shutdown(mp, XFS_FORCE_UMOUNT|XFS_LOG_IO_ERROR);
+                xfs_force_shutdown(mp,
+                                SHUTDOWN_FORCE_UMOUNT | SHUTDOWN_LOG_IO_ERROR);
                break;
        default:
                return XFS_ERROR(EINVAL);
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index deddbd03c166..33164a85aa9d 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -24,14 +24,12 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
@@ -1174,6 +1172,9 @@ xfs_dilocate(
        if (agno >= mp->m_sb.sb_agcount || agbno >= mp->m_sb.sb_agblocks ||
            ino != XFS_AGINO_TO_INO(mp, agno, agino)) {
 #ifdef DEBUG
+                /* no diagnostics for bulkstat, ino comes from userspace */
+                if (flags & XFS_IMAP_BULKSTAT)
+                        return XFS_ERROR(EINVAL);
                if (agno >= mp->m_sb.sb_agcount) {
                        xfs_fs_cmn_err(CE_ALERT, mp,
                                        "xfs_dilocate: agno (%d) >= "
diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c
index 60c65683462d..616eeeb6953e 100644
--- a/fs/xfs/xfs_ialloc_btree.c
+++ b/fs/xfs/xfs_ialloc_btree.c
@@ -24,14 +24,12 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index b53854325266..0724df7fabb7 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -24,14 +24,12 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
@@ -186,7 +184,7 @@ xfs_ihash_promote(
 */
 STATIC int
 xfs_iget_core(
-        vnode_t         *vp,
+        bhv_vnode_t     *vp,
        xfs_mount_t     *mp,
        xfs_trans_t     *tp,
        xfs_ino_t       ino,
@@ -198,7 +196,7 @@ xfs_iget_core(
        xfs_ihash_t     *ih;
        xfs_inode_t     *ip;
        xfs_inode_t     *iq;
-        vnode_t         *inode_vp;
+        bhv_vnode_t     *inode_vp;
        ulong           version;
        int             error;
        /* REFERENCED */
@@ -468,7 +466,7 @@ finish_inode:
         * If we have a real type for an on-disk inode, we can set ops(&unlock)
         * now.  If it's a new inode being created, xfs_ialloc will handle it.
         */
-        VFS_INIT_VNODE(XFS_MTOVFS(mp), vp, XFS_ITOBHV(ip), 1);
+        bhv_vfs_init_vnode(XFS_MTOVFS(mp), vp, XFS_ITOBHV(ip), 1);
        return 0;
 }
@@ -489,7 +487,7 @@ xfs_iget(
        xfs_daddr_t     bno)
 {
        struct inode    *inode;
-        vnode_t         *vp = NULL;
+        bhv_vnode_t     *vp = NULL;
        int             error;
        XFS_STATS_INC(xs_ig_attempts);
@@ -543,7 +541,7 @@ retry:
 void
 xfs_inode_lock_init(
        xfs_inode_t     *ip,
-        vnode_t         *vp)
+        bhv_vnode_t     *vp)
 {
        mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER,
                     "xfsino", (long)vp->v_number);
@@ -603,12 +601,10 @@ void
 xfs_iput(xfs_inode_t    *ip,
         uint           lock_flags)
 {
-        vnode_t *vp = XFS_ITOV(ip);
+        bhv_vnode_t     *vp = XFS_ITOV(ip);
        vn_trace_entry(vp, "xfs_iput", (inst_t *)__return_address);
        xfs_iunlock(ip, lock_flags);
        VN_RELE(vp);
 }
@@ -619,7 +615,7 @@ void
 xfs_iput_new(xfs_inode_t        *ip,
             uint               lock_flags)
 {
-        vnode_t         *vp = XFS_ITOV(ip);
+        bhv_vnode_t     *vp = XFS_ITOV(ip);
        struct inode    *inode = vn_to_inode(vp);
        vn_trace_entry(vp, "xfs_iput_new", (inst_t *)__return_address);
@@ -645,7 +641,7 @@ xfs_iput_new(xfs_inode_t	*ip,
 void
 xfs_ireclaim(xfs_inode_t *ip)
 {
-        vnode_t         *vp;
+        bhv_vnode_t     *vp;
        /*
         * Remove from old hash list and mount list.
@@ -1033,6 +1029,6 @@ xfs_iflock_nowait(xfs_inode_t *ip)
 void
 xfs_ifunlock(xfs_inode_t *ip)
 {
-        ASSERT(valusema(&(ip->i_flock)) <= 0);
+        ASSERT(issemalocked(&(ip->i_flock)));
        vsema(&(ip->i_flock));
 }
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 94b60dd03801..86c1bf0bba9e 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
 * All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
@@ -26,14 +26,12 @@
 #include "xfs_trans_priv.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
@@ -256,13 +254,11 @@ xfs_itobp(
        xfs_daddr_t     bno,
        uint            imap_flags)
 {
+        xfs_imap_t      imap;
        xfs_buf_t       *bp;
        int             error;
-        xfs_imap_t      imap;
-#ifdef __KERNEL__
        int             i;
        int             ni;
-#endif
        if (ip->i_blkno == (xfs_daddr_t)0) {
                /*
@@ -319,7 +315,6 @@ xfs_itobp(
         */
        error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap.im_blkno,
                                   (int)imap.im_len, XFS_BUF_LOCK, &bp);
        if (error) {
 #ifdef DEBUG
                xfs_fs_cmn_err(CE_ALERT, mp, "xfs_itobp: "
@@ -330,17 +325,21 @@ xfs_itobp(
 #endif /* DEBUG */
                return error;
        }
-#ifdef __KERNEL__
        /*
         * Validate the magic number and version of every inode in the buffer
         * (if DEBUG kernel) or the first inode in the buffer, otherwise.
+         * No validation is done here in userspace (xfs_repair).
         */
-#ifdef DEBUG
+#if !defined(__KERNEL__)
+        ni = 0;
+#elif defined(DEBUG)
        ni = (imap_flags & XFS_IMAP_BULKSTAT) ? 0 :
                (BBTOB(imap.im_len) >> mp->m_sb.sb_inodelog);
-#else
+#else   /* usual case */
        ni = (imap_flags & XFS_IMAP_BULKSTAT) ? 0 : 1;
 #endif
        for (i = 0; i < ni; i++) {
                int             di_ok;
                xfs_dinode_t    *dip;
@@ -352,8 +351,11 @@ xfs_itobp(
                if (unlikely(XFS_TEST_ERROR(!di_ok, mp, XFS_ERRTAG_ITOBP_INOTOBP,
                                 XFS_RANDOM_ITOBP_INOTOBP))) {
 #ifdef DEBUG
-                        prdev("bad inode magic/vsn daddr %lld #%d (magic=%x)",
+                        if (!(imap_flags & XFS_IMAP_BULKSTAT))
-                                mp->m_ddev_targp,
+                                cmn_err(CE_ALERT,
+                                        "Device %s - bad inode magic/vsn "
+                                        "daddr %lld #%d (magic=%x)",
+                                XFS_BUFTARG_NAME(mp->m_ddev_targp),
                                (unsigned long long)imap.im_blkno, i,
                                INT_GET(dip->di_core.di_magic, ARCH_CONVERT));
 #endif
@@ -363,7 +365,6 @@ xfs_itobp(
                        return XFS_ERROR(EFSCORRUPTED);
                }
        }
-#endif  /* __KERNEL__ */
        xfs_inobp_check(mp, bp);
@@ -782,7 +783,6 @@ xfs_xlate_dinode_core(
 STATIC uint
 _xfs_dic2xflags(
-        xfs_dinode_core_t       *dic,
        __uint16_t              di_flags)
 {
        uint                    flags = 0;
@@ -812,6 +812,8 @@ _xfs_dic2xflags(
                        flags |= XFS_XFLAG_EXTSIZE;
                if (di_flags & XFS_DIFLAG_EXTSZINHERIT)
                        flags |= XFS_XFLAG_EXTSZINHERIT;
+                if (di_flags & XFS_DIFLAG_NODEFRAG)
+                        flags |= XFS_XFLAG_NODEFRAG;
        }
        return flags;
@@ -823,16 +825,16 @@ xfs_ip2xflags(
 {
        xfs_dinode_core_t       *dic = &ip->i_d;
-        return _xfs_dic2xflags(dic, dic->di_flags) |
+        return _xfs_dic2xflags(dic->di_flags) |
-                (XFS_CFORK_Q(dic) ? XFS_XFLAG_HASATTR : 0);
+                                (XFS_CFORK_Q(dic) ? XFS_XFLAG_HASATTR : 0);
 }
 uint
 xfs_dic2xflags(
        xfs_dinode_core_t       *dic)
 {
-        return _xfs_dic2xflags(dic, INT_GET(dic->di_flags, ARCH_CONVERT)) |
+        return _xfs_dic2xflags(INT_GET(dic->di_flags, ARCH_CONVERT)) |
-                (XFS_CFORK_Q_DISK(dic) ? XFS_XFLAG_HASATTR : 0);
+                                (XFS_CFORK_Q_DISK(dic) ? XFS_XFLAG_HASATTR : 0);
 }
 /*
@@ -1083,7 +1085,7 @@ xfs_ialloc(
 {
        xfs_ino_t       ino;
        xfs_inode_t     *ip;
-        vnode_t         *vp;
+        bhv_vnode_t     *vp;
        uint            flags;
        int             error;
@@ -1221,6 +1223,9 @@ xfs_ialloc(
                                di_flags |= XFS_DIFLAG_NOSYMLINKS;
                        if (pip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
                                di_flags |= XFS_DIFLAG_PROJINHERIT;
+                        if ((pip->i_d.di_flags & XFS_DIFLAG_NODEFRAG) &&
+                            xfs_inherit_nodefrag)
+                                di_flags |= XFS_DIFLAG_NODEFRAG;
                        ip->i_d.di_flags |= di_flags;
                }
                /* FALLTHROUGH */
@@ -1244,8 +1249,8 @@ xfs_ialloc(
         */
        xfs_trans_log_inode(tp, ip, flags);
-        /* now that we have an i_mode  we can set Linux inode ops (& unlock) */
+        /* now that we have an i_mode we can set up inode ops and unlock */
-        VFS_INIT_VNODE(XFS_MTOVFS(tp->t_mountp), vp, XFS_ITOBHV(ip), 1);
+        bhv_vfs_init_vnode(XFS_MTOVFS(tp->t_mountp), vp, XFS_ITOBHV(ip), 1);
        *ipp = ip;
        return 0;
@@ -1285,7 +1290,7 @@ xfs_isize_check(
                                       (xfs_ufsize_t)XFS_MAXIOFFSET(mp)) -
                          map_first),
                         XFS_BMAPI_ENTIRE, NULL, 0, imaps, &nimaps,
-                         NULL))
+                         NULL, NULL))
            return;
        ASSERT(nimaps == 1);
        ASSERT(imaps[0].br_startblock == HOLESTARTBLOCK);
@@ -1421,7 +1426,7 @@ xfs_itruncate_start(
        xfs_fsize_t     last_byte;
        xfs_off_t       toss_start;
        xfs_mount_t     *mp;
-        vnode_t         *vp;
+        bhv_vnode_t     *vp;
        ASSERT(ismrlocked(&ip->i_iolock, MR_UPDATE) != 0);
        ASSERT((new_size == 0) || (new_size <= ip->i_d.di_size));
@@ -1434,9 +1439,9 @@ xfs_itruncate_start(
        vn_iowait(vp);  /* wait for the completion of any pending DIOs */
        
        /*
-         * Call VOP_TOSS_PAGES() or VOP_FLUSHINVAL_PAGES() to get rid of pages and buffers
+         * Call toss_pages or flushinval_pages to get rid of pages
         * overlapping the region being removed.  We have to use
-         * the less efficient VOP_FLUSHINVAL_PAGES() in the case that the
+         * the less efficient flushinval_pages in the case that the
         * caller may not be able to finish the truncate without
         * dropping the inode's I/O lock.  Make sure
         * to catch any pages brought in by buffers overlapping
@@ -1445,10 +1450,10 @@ xfs_itruncate_start(
         * so that we don't toss things on the same block as
         * new_size but before it.
         *
-         * Before calling VOP_TOSS_PAGES() or VOP_FLUSHINVAL_PAGES(), make sure to
+         * Before calling toss_pages or flushinval_pages, make sure to
         * call remapf() over the same region if the file is mapped.
         * This frees up mapped file references to the pages in the
-         * given range and for the VOP_FLUSHINVAL_PAGES() case it ensures
+         * given range and for the flushinval_pages case it ensures
         * that we get the latest mapped changes flushed out.
         */
        toss_start = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size);
@@ -1466,9 +1471,9 @@ xfs_itruncate_start(
                         last_byte);
        if (last_byte > toss_start) {
                if (flags & XFS_ITRUNC_DEFINITE) {
-                        VOP_TOSS_PAGES(vp, toss_start, -1, FI_REMAPF_LOCKED);
+                        bhv_vop_toss_pages(vp, toss_start, -1, FI_REMAPF_LOCKED);
                } else {
-                        VOP_FLUSHINVAL_PAGES(vp, toss_start, -1, FI_REMAPF_LOCKED);
+                        bhv_vop_flushinval_pages(vp, toss_start, -1, FI_REMAPF_LOCKED);
                }
        }
@@ -1666,12 +1671,13 @@ xfs_itruncate_finish(
                 * runs.
                 */
                XFS_BMAP_INIT(&free_list, &first_block);
-                error = xfs_bunmapi(ntp, ip, first_unmap_block,
+                error = XFS_BUNMAPI(mp, ntp, &ip->i_iocore,
-                                    unmap_len,
+                                    first_unmap_block, unmap_len,
                                    XFS_BMAPI_AFLAG(fork) |
                                      (sync ? 0 : XFS_BMAPI_ASYNC),
                                    XFS_ITRUNC_MAX_EXTENTS,
-                                    &first_block, &free_list, &done);
+                                    &first_block, &free_list,
+                                    NULL, &done);
                if (error) {
                        /*
                         * If the bunmapi call encounters an error,
@@ -1955,9 +1961,9 @@ xfs_iunlink_remove(
        xfs_agino_t     agino;
        xfs_agino_t     next_agino;
        xfs_buf_t       *last_ibp;
-        xfs_dinode_t    *last_dip;
+        xfs_dinode_t    *last_dip = NULL;
        short           bucket_index;
-        int             offset, last_offset;
+        int             offset, last_offset = 0;
        int             error;
        int             agi_ok;
@@ -2745,13 +2751,14 @@ xfs_iunpin(
                 * the inode to become unpinned.
                 */
                if (!(ip->i_flags & (XFS_IRECLAIM|XFS_IRECLAIMABLE))) {
-                        vnode_t *vp = XFS_ITOV_NULL(ip);
+                        bhv_vnode_t     *vp = XFS_ITOV_NULL(ip);
                        /* make sync come back and flush this inode */
                        if (vp) {
                                struct inode    *inode = vn_to_inode(vp);
-                                if (!(inode->i_state & I_NEW))
+                                if (!(inode->i_state &
+                                                (I_NEW|I_FREEING|I_CLEAR)))
                                        mark_inode_dirty_sync(inode);
                        }
                }
@@ -2916,13 +2923,6 @@ xfs_iflush_fork(
                        ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork));
                        memcpy(cp, ifp->if_u1.if_data, ifp->if_bytes);
                }
-                if (whichfork == XFS_DATA_FORK) {
-                        if (unlikely(XFS_DIR_SHORTFORM_VALIDATE_ONDISK(mp, dip))) {
-                                XFS_ERROR_REPORT("xfs_iflush_fork",
-                                                 XFS_ERRLEVEL_LOW, mp);
-                                return XFS_ERROR(EFSCORRUPTED);
-                        }
-                }
                break;
        case XFS_DINODE_FMT_EXTENTS:
@@ -3006,7 +3006,7 @@ xfs_iflush(
        XFS_STATS_INC(xs_iflush_count);
        ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE|MR_ACCESS));
-        ASSERT(valusema(&ip->i_flock) <= 0);
+        ASSERT(issemalocked(&(ip->i_flock)));
        ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
               ip->i_d.di_nextents > ip->i_df.if_ext_max);
@@ -3199,7 +3199,7 @@ xfs_iflush(
 corrupt_out:
        xfs_buf_relse(bp);
-        xfs_force_shutdown(mp, XFS_CORRUPT_INCORE);
+        xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
        xfs_iflush_abort(ip);
        /*
         * Unlocks the flush lock
@@ -3221,7 +3221,7 @@ cluster_corrupt_out:
                xfs_buf_relse(bp);
        }
-        xfs_force_shutdown(mp, XFS_CORRUPT_INCORE);
+        xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
        if(!bufwasdelwri)  {
                /*
@@ -3264,7 +3264,7 @@ xfs_iflush_int(
        SPLDECL(s);
        ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE|MR_ACCESS));
-        ASSERT(valusema(&ip->i_flock) <= 0);
+        ASSERT(issemalocked(&(ip->i_flock)));
        ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
               ip->i_d.di_nextents > ip->i_df.if_ext_max);
@@ -3504,7 +3504,7 @@ xfs_iflush_all(
        xfs_mount_t     *mp)
 {
        xfs_inode_t     *ip;
-        vnode_t         *vp;
+        bhv_vnode_t     *vp;
 again:
        XFS_MOUNT_ILOCK(mp);
@@ -4180,7 +4180,7 @@ xfs_iext_direct_to_inline(
         */
        memcpy(ifp->if_u2.if_inline_ext, ifp->if_u1.if_extents,
                nextents * sizeof(xfs_bmbt_rec_t));
-        kmem_free(ifp->if_u1.if_extents, KM_SLEEP);
+        kmem_free(ifp->if_u1.if_extents, ifp->if_real_bytes);
        ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
        ifp->if_real_bytes = 0;
 }
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 3b544db1790b..d10b76ed1e5b 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -102,9 +102,9 @@ typedef struct xfs_ifork {
 #ifdef __KERNEL__
 struct bhv_desc;
+struct bhv_vnode;
 struct cred;
 struct ktrace;
-struct vnode;
 struct xfs_buf;
 struct xfs_bmap_free;
 struct xfs_bmbt_irec;
@@ -400,7 +400,7 @@ void		xfs_chash_init(struct xfs_mount *);
 void            xfs_chash_free(struct xfs_mount *);
 xfs_inode_t     *xfs_inode_incore(struct xfs_mount *, xfs_ino_t,
                                  struct xfs_trans *);
-void            xfs_inode_lock_init(xfs_inode_t *, struct vnode *);
+void            xfs_inode_lock_init(xfs_inode_t *, struct bhv_vnode *);
 int             xfs_iget(struct xfs_mount *, struct xfs_trans *, xfs_ino_t,
                         uint, uint, xfs_inode_t **, xfs_daddr_t);
 void            xfs_iput(xfs_inode_t *, uint);
@@ -461,7 +461,7 @@ void		xfs_ichgtime(xfs_inode_t *, int);
 xfs_fsize_t     xfs_file_last_byte(xfs_inode_t *);
 void            xfs_lock_inodes(xfs_inode_t **, int, int, uint);
-xfs_inode_t     *xfs_vtoi(struct vnode *vp);
+xfs_inode_t     *xfs_vtoi(struct bhv_vnode *vp);
 void            xfs_synchronize_atime(xfs_inode_t *);
@@ -509,7 +509,6 @@ extern struct kmem_zone	*xfs_chashlist_zone;
 extern struct kmem_zone *xfs_ifork_zone;
 extern struct kmem_zone *xfs_inode_zone;
 extern struct kmem_zone *xfs_ili_zone;
-extern struct vnodeops  xfs_vnodeops;
 #endif  /* __KERNEL__ */
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 7497a481b2f5..f8e80d8e7237 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -25,7 +25,6 @@
 #include "xfs_buf_item.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
@@ -33,7 +32,6 @@
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
@@ -794,7 +792,7 @@ xfs_inode_item_pushbuf(
         * inode flush completed and the inode was taken off the AIL.
         * So, just get out.
         */
-        if ((valusema(&(ip->i_flock)) > 0)  ||
+        if (!issemalocked(&(ip->i_flock)) ||
            ((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0)) {
                iip->ili_pushbuf_flag = 0;
                xfs_iunlock(ip, XFS_ILOCK_SHARED);
@@ -816,7 +814,7 @@ xfs_inode_item_pushbuf(
                         * If not, we can flush it async.
                         */
                        dopush = ((iip->ili_item.li_flags & XFS_LI_IN_AIL) &&
-                                  (valusema(&(ip->i_flock)) <= 0));
+                                  issemalocked(&(ip->i_flock)));
                        iip->ili_pushbuf_flag = 0;
                        xfs_iunlock(ip, XFS_ILOCK_SHARED);
                        xfs_buftrace("INODE ITEM PUSH", bp);
@@ -864,7 +862,7 @@ xfs_inode_item_push(
        ip = iip->ili_inode;
        ASSERT(ismrlocked(&(ip->i_lock), MR_ACCESS));
-        ASSERT(valusema(&(ip->i_flock)) <= 0);
+        ASSERT(issemalocked(&(ip->i_flock)));
        /*
         * Since we were able to lock the inode's flush lock and
         * we found it on the AIL, the inode must be dirty.  This
@@ -1084,3 +1082,52 @@ xfs_istale_done(
 {
        xfs_iflush_abort(iip->ili_inode);
 }
+/*
+ * convert an xfs_inode_log_format struct from either 32 or 64 bit versions
+ * (which can have different field alignments) to the native version
+ */
+int
+xfs_inode_item_format_convert(
+        xfs_log_iovec_t         *buf,
+        xfs_inode_log_format_t  *in_f)
+{
+        if (buf->i_len == sizeof(xfs_inode_log_format_32_t)) {
+                xfs_inode_log_format_32_t *in_f32;
+                in_f32 = (xfs_inode_log_format_32_t *)buf->i_addr;
+                in_f->ilf_type = in_f32->ilf_type;
+                in_f->ilf_size = in_f32->ilf_size;
+                in_f->ilf_fields = in_f32->ilf_fields;
+                in_f->ilf_asize = in_f32->ilf_asize;
+                in_f->ilf_dsize = in_f32->ilf_dsize;
+                in_f->ilf_ino = in_f32->ilf_ino;
+                /* copy biggest field of ilf_u */
+                memcpy(in_f->ilf_u.ilfu_uuid.__u_bits,
+                       in_f32->ilf_u.ilfu_uuid.__u_bits,
+                       sizeof(uuid_t));
+                in_f->ilf_blkno = in_f32->ilf_blkno;
+                in_f->ilf_len = in_f32->ilf_len;
+                in_f->ilf_boffset = in_f32->ilf_boffset;
+                return 0;
+        } else if (buf->i_len == sizeof(xfs_inode_log_format_64_t)){
+                xfs_inode_log_format_64_t *in_f64;
+                in_f64 = (xfs_inode_log_format_64_t *)buf->i_addr;
+                in_f->ilf_type = in_f64->ilf_type;
+                in_f->ilf_size = in_f64->ilf_size;
+                in_f->ilf_fields = in_f64->ilf_fields;
+                in_f->ilf_asize = in_f64->ilf_asize;
+                in_f->ilf_dsize = in_f64->ilf_dsize;
+                in_f->ilf_ino = in_f64->ilf_ino;
+                /* copy biggest field of ilf_u */
+                memcpy(in_f->ilf_u.ilfu_uuid.__u_bits,
+                       in_f64->ilf_u.ilfu_uuid.__u_bits,
+                       sizeof(uuid_t));
+                in_f->ilf_blkno = in_f64->ilf_blkno;
+                in_f->ilf_len = in_f64->ilf_len;
+                in_f->ilf_boffset = in_f64->ilf_boffset;
+                return 0;
+        }
+        return EFSCORRUPTED;
+}
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index c5dbf93b6661..5db6cd1b4cf3 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -23,25 +23,6 @@
 * log.  The size of the inline data/extents/b-tree root to be logged
 * (if any) is indicated in the ilf_dsize field.  Changes to this structure
 * must be added on to the end.
- *
- * Convention for naming inode log item versions :  The current version
- * is always named XFS_LI_INODE.  When an inode log item gets superseded,
- * add the latest version of IRIX that will generate logs with that item
- * to the version name.
- *
- * -Version 1 of this structure (XFS_LI_5_3_INODE) included up to the first
- *      union (ilf_u) field.  This was released with IRIX 5.3-XFS.
- * -Version 2 of this structure (XFS_LI_6_1_INODE) is currently the entire
- *      structure.  This was released with IRIX 6.0.1-XFS and IRIX 6.1.
- * -Version 3 of this structure (XFS_LI_INODE) is the same as version 2
- *      so a new structure definition wasn't necessary.  However, we had
- *      to add a new type because the inode cluster size changed from 4K
- *      to 8K and the version number had to be rev'ved to keep older kernels
- *      from trying to recover logs with the 8K buffers in them.  The logging
- *      code can handle recovery on different-sized clusters now so hopefully
- *      this'll be the last time we need to change the inode log item just
- *      for a change in the inode cluster size.  This new version was
- *      released with IRIX 6.2.
 */
 typedef struct xfs_inode_log_format {
        unsigned short          ilf_type;       /* inode log item type */
@@ -59,18 +40,38 @@ typedef struct xfs_inode_log_format {
        int                     ilf_boffset;    /* off of inode in buffer */
 } xfs_inode_log_format_t;
-/* Initial version shipped with IRIX 5.3-XFS */
+typedef struct xfs_inode_log_format_32 {
-typedef struct xfs_inode_log_format_v1 {
+        unsigned short          ilf_type;       /* 16: inode log item type */
-        unsigned short          ilf_type;       /* inode log item type */
+        unsigned short          ilf_size;       /* 16: size of this item */
-        unsigned short          ilf_size;       /* size of this item */
+        uint                    ilf_fields;     /* 32: flags for fields logged */
-        uint                    ilf_fields;     /* flags for fields logged */
+        ushort                  ilf_asize;      /* 32: size of attr d/ext/root */
-        uint                    ilf_dsize;      /* size of data/ext/root */
+        ushort                  ilf_dsize;      /* 32: size of data/ext/root */
-        xfs_ino_t               ilf_ino;        /* inode number */
+        xfs_ino_t               ilf_ino;        /* 64: inode number */
        union {
-                xfs_dev_t       ilfu_rdev;      /* rdev value for dev inode*/
+                xfs_dev_t       ilfu_rdev;      /* 32: rdev value for dev inode*/
-                uuid_t          ilfu_uuid;      /* mount point value */
+                uuid_t          ilfu_uuid;      /* 128: mount point value */
+        } ilf_u;
+        __int64_t               ilf_blkno;      /* 64: blkno of inode buffer */
+        int                     ilf_len;        /* 32: len of inode buffer */
+        int                     ilf_boffset;    /* 32: off of inode in buffer */
+} __attribute__((packed)) xfs_inode_log_format_32_t;
+typedef struct xfs_inode_log_format_64 {
+        unsigned short          ilf_type;       /* 16: inode log item type */
+        unsigned short          ilf_size;       /* 16: size of this item */
+        uint                    ilf_fields;     /* 32: flags for fields logged */
+        ushort                  ilf_asize;      /* 32: size of attr d/ext/root */
+        ushort                  ilf_dsize;      /* 32: size of data/ext/root */
+        __uint32_t              ilf_pad;        /* 32: pad for 64 bit boundary */
+        xfs_ino_t               ilf_ino;        /* 64: inode number */
+        union {
+                xfs_dev_t       ilfu_rdev;      /* 32: rdev value for dev inode*/
+                uuid_t          ilfu_uuid;      /* 128: mount point value */
        } ilf_u;
-} xfs_inode_log_format_t_v1;
+        __int64_t               ilf_blkno;      /* 64: blkno of inode buffer */
+        int                     ilf_len;        /* 32: len of inode buffer */
+        int                     ilf_boffset;    /* 32: off of inode in buffer */
+} xfs_inode_log_format_64_t;
 /*
 * Flags for xfs_trans_log_inode flags field.
@@ -172,6 +173,8 @@ extern void xfs_inode_item_destroy(struct xfs_inode *);
 extern void xfs_iflush_done(struct xfs_buf *, xfs_inode_log_item_t *);
 extern void xfs_istale_done(struct xfs_buf *, xfs_inode_log_item_t *);
 extern void xfs_iflush_abort(struct xfs_inode *);
+extern int xfs_inode_item_format_convert(xfs_log_iovec_t *,
+                                         xfs_inode_log_format_t *);
 #endif  /* __KERNEL__ */
diff --git a/fs/xfs/xfs_iocore.c b/fs/xfs/xfs_iocore.c
index a07815661a8c..06d710c9ce4b 100644
--- a/fs/xfs/xfs_iocore.c
+++ b/fs/xfs/xfs_iocore.c
@@ -24,14 +24,13 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
+#include "xfs_dfrag.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
@@ -58,7 +57,7 @@ xfs_size_fn(
 STATIC int
 xfs_ioinit(
-        struct vfs              *vfsp,
+        struct bhv_vfs          *vfsp,
        struct xfs_mount_args   *mntargs,
        int                     flags)
 {
@@ -68,6 +67,7 @@ xfs_ioinit(
 xfs_ioops_t     xfs_iocore_xfs = {
        .xfs_ioinit             = (xfs_ioinit_t) xfs_ioinit,
        .xfs_bmapi_func         = (xfs_bmapi_t) xfs_bmapi,
+        .xfs_bunmapi_func       = (xfs_bunmapi_t) xfs_bunmapi,
        .xfs_bmap_eof_func      = (xfs_bmap_eof_t) xfs_bmap_eof,
        .xfs_iomap_write_direct =
                        (xfs_iomap_write_direct_t) xfs_iomap_write_direct,
@@ -84,6 +84,7 @@ xfs_ioops_t	xfs_iocore_xfs = {
        .xfs_unlock             = (xfs_unlk_t) xfs_iunlock,
        .xfs_size_func          = (xfs_size_t) xfs_size_fn,
        .xfs_iodone             = (xfs_iodone_t) fs_noerr,
+        .xfs_swap_extents_func  = (xfs_swap_extents_t) xfs_swap_extents,
 };
 void
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index d5dfedcb8922..f1949c16df15 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
 * All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
@@ -23,7 +23,6 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_alloc.h"
 #include "xfs_dmapi.h"
@@ -32,7 +31,6 @@
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
@@ -252,7 +250,7 @@ xfs_iomap(
        error = XFS_BMAPI(mp, NULL, io, offset_fsb,
                        (xfs_filblks_t)(end_fsb - offset_fsb),
                        bmapi_flags,  NULL, 0, &imap,
-                        &nimaps, NULL);
+                        &nimaps, NULL, NULL);
        if (error)
                goto out;
@@ -519,8 +517,8 @@ xfs_iomap_write_direct(
         */
        XFS_BMAP_INIT(&free_list, &firstfsb);
        nimaps = 1;
-        error = xfs_bmapi(tp, ip, offset_fsb, count_fsb,
+        error = XFS_BMAPI(mp, tp, io, offset_fsb, count_fsb, bmapi_flag,
-                bmapi_flag, &firstfsb, 0, &imap, &nimaps, &free_list);
+                &firstfsb, 0, &imap, &nimaps, &free_list, NULL);
        if (error)
                goto error0;
@@ -610,8 +608,8 @@ xfs_iomap_eof_want_preallocate(
        while (count_fsb > 0) {
                imaps = nimaps;
                firstblock = NULLFSBLOCK;
-                error = XFS_BMAPI(mp, NULL, io, start_fsb, count_fsb,
+                error = XFS_BMAPI(mp, NULL, io, start_fsb, count_fsb, 0,
-                                  0, &firstblock, 0, imap, &imaps, NULL);
+                                  &firstblock, 0, imap, &imaps, NULL, NULL);
                if (error)
                        return error;
                for (n = 0; n < imaps; n++) {
@@ -695,11 +693,11 @@ retry:
        nimaps = XFS_WRITE_IMAPS;
        firstblock = NULLFSBLOCK;
-        error = xfs_bmapi(NULL, ip, offset_fsb,
+        error = XFS_BMAPI(mp, NULL, io, offset_fsb,
                          (xfs_filblks_t)(last_fsb - offset_fsb),
                          XFS_BMAPI_DELAY | XFS_BMAPI_WRITE |
                          XFS_BMAPI_ENTIRE, &firstblock, 1, imap,
-                          &nimaps, NULL);
+                          &nimaps, NULL, NULL);
        if (error && (error != ENOSPC))
                return XFS_ERROR(error);
@@ -832,9 +830,9 @@ xfs_iomap_write_allocate(
                        }
                        /* Go get the actual blocks */
-                        error = xfs_bmapi(tp, ip, map_start_fsb, count_fsb,
+                        error = XFS_BMAPI(mp, tp, io, map_start_fsb, count_fsb,
                                        XFS_BMAPI_WRITE, &first_block, 1,
-                                        imap, &nimaps, &free_list);
+                                        imap, &nimaps, &free_list, NULL);
                        if (error)
                                goto trans_cancel;
@@ -955,9 +953,9 @@ xfs_iomap_write_unwritten(
                 */
                XFS_BMAP_INIT(&free_list, &firstfsb);
                nimaps = 1;
-                error = xfs_bmapi(tp, ip, offset_fsb, count_fsb,
+                error = XFS_BMAPI(mp, tp, io, offset_fsb, count_fsb,
                                  XFS_BMAPI_WRITE|XFS_BMAPI_CONVERT, &firstfsb,
-                                  1, &imap, &nimaps, &free_list);
+                                  1, &imap, &nimaps, &free_list, NULL);
                if (error)
                        goto error_on_bmapi_transaction;
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 94068d014f27..46249e4d1fea 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -24,14 +24,12 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
@@ -41,11 +39,6 @@
 #include "xfs_error.h"
 #include "xfs_btree.h"
-#ifndef HAVE_USERACC
-#define useracc(ubuffer, size, flags, foo) (0)
-#define unuseracc(ubuffer, size, flags)
-#endif
 STATIC int
 xfs_bulkstat_one_iget(
        xfs_mount_t     *mp,            /* mount point for filesystem */
@@ -56,7 +49,7 @@ xfs_bulkstat_one_iget(
 {
        xfs_dinode_core_t *dic;         /* dinode core info pointer */
        xfs_inode_t     *ip;            /* incore inode pointer */
-        vnode_t         *vp;
+        bhv_vnode_t     *vp;
        int             error;
        error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_SHARED, &ip, bno);
@@ -336,15 +329,6 @@ xfs_bulkstat(
        nimask = ~(nicluster - 1);
        nbcluster = nicluster >> mp->m_sb.sb_inopblog;
        /*
-         * Lock down the user's buffer. If a buffer was not sent, as in the case
-         * disk quota code calls here, we skip this.
-         */
-        if (ubuffer &&
-            (error = useracc(ubuffer, ubcount * statstruct_size,
-                        (B_READ|B_PHYS), NULL))) {
-                return error;
-        }
-        /*
         * Allocate a page-sized buffer for inode btree records.
         * We could try allocating something smaller, but for normal
         * calls we'll always (potentially) need the whole page.
@@ -650,8 +634,6 @@ xfs_bulkstat(
         * Done, we're either out of filesystem or space to put the data.
         */
        kmem_free(irbuf, NBPC);
-        if (ubuffer)
-                unuseracc(ubuffer, ubcount * statstruct_size, (B_READ|B_PHYS));
        *ubcountp = ubelem;
        if (agno >= mp->m_sb.sb_agcount) {
                /*
diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h
index 11eb4e1b18c4..be5f12e07d22 100644
--- a/fs/xfs/xfs_itable.h
+++ b/fs/xfs/xfs_itable.h
@@ -45,7 +45,6 @@ typedef int (*bulkstat_one_pf)(struct xfs_mount	*mp,
 */
 #define BULKSTAT_FG_IGET        0x1     /* Go through the buffer cache */
 #define BULKSTAT_FG_QUICK       0x2     /* No iget, walk the dinode cluster */
-#define BULKSTAT_FG_VFSLOCKED   0x4     /* Already have vfs lock */
 /*
 * Return stat information in bulk (by-inode) for the filesystem.
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 32e841d2f26d..e730328636c3 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -24,7 +24,6 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
@@ -36,7 +35,6 @@
 #include "xfs_ialloc_btree.h"
 #include "xfs_log_recover.h"
 #include "xfs_trans_priv.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
@@ -402,7 +400,7 @@ xfs_log_release_iclog(xfs_mount_t *mp,
        xlog_in_core_t    *iclog = (xlog_in_core_t *)iclog_hndl;
        if (xlog_state_release_iclog(log, iclog)) {
-                xfs_force_shutdown(mp, XFS_LOG_IO_ERROR);
+                xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
                return EIO;
        }
@@ -498,9 +496,8 @@ xfs_log_mount(xfs_mount_t	*mp,
         * just worked.
         */
        if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) {
-                int     error;
+                bhv_vfs_t       *vfsp = XFS_MTOVFS(mp);
-                vfs_t   *vfsp = XFS_MTOVFS(mp);
+                int             error, readonly = (vfsp->vfs_flag & VFS_RDONLY);
-                int     readonly = (vfsp->vfs_flag & VFS_RDONLY);
                if (readonly)
                        vfsp->vfs_flag &= ~VFS_RDONLY;
@@ -726,7 +723,7 @@ xfs_log_write(xfs_mount_t *	mp,
                return XFS_ERROR(EIO);
        if ((error = xlog_write(mp, reg, nentries, tic, start_lsn, NULL, 0))) {
-                xfs_force_shutdown(mp, XFS_LOG_IO_ERROR);
+                xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
        }
        return error;
 }       /* xfs_log_write */
@@ -816,9 +813,9 @@ xfs_log_need_covered(xfs_mount_t *mp)
        SPLDECL(s);
        int             needed = 0, gen;
        xlog_t          *log = mp->m_log;
-        vfs_t           *vfsp = XFS_MTOVFS(mp);
+        bhv_vfs_t       *vfsp = XFS_MTOVFS(mp);
-        if (fs_frozen(vfsp) || XFS_FORCED_SHUTDOWN(mp) ||
+        if (vfs_test_for_freeze(vfsp) || XFS_FORCED_SHUTDOWN(mp) ||
            (vfsp->vfs_flag & VFS_RDONLY))
                return 0;
@@ -956,7 +953,7 @@ xlog_iodone(xfs_buf_t *bp)
                        XFS_ERRTAG_IODONE_IOERR, XFS_RANDOM_IODONE_IOERR)) {
                xfs_ioerror_alert("xlog_iodone", l->l_mp, bp, XFS_BUF_ADDR(bp));
                XFS_BUF_STALE(bp);
-                xfs_force_shutdown(l->l_mp, XFS_LOG_IO_ERROR);
+                xfs_force_shutdown(l->l_mp, SHUTDOWN_LOG_IO_ERROR);
                /*
                 * This flag will be propagated to the trans-committed
                 * callback routines to let them know that the log-commit
@@ -1261,7 +1258,7 @@ xlog_commit_record(xfs_mount_t  *mp,
        ASSERT_ALWAYS(iclog);
        if ((error = xlog_write(mp, reg, 1, ticket, commitlsnp,
                               iclog, XLOG_COMMIT_TRANS))) {
-                xfs_force_shutdown(mp, XFS_LOG_IO_ERROR);
+                xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
        }
        return error;
 }       /* xlog_commit_record */
@@ -1743,10 +1740,10 @@ xlog_write(xfs_mount_t *	mp,
           xlog_in_core_t       **commit_iclog,
           uint                 flags)
 {
-    xlog_t           *log    = mp->m_log;
+    xlog_t           *log = mp->m_log;
    xlog_ticket_t    *ticket = (xlog_ticket_t *)tic;
+    xlog_in_core_t   *iclog = NULL;  /* ptr to current in-core log */
    xlog_op_header_t *logop_head;    /* ptr to log operation header */
-    xlog_in_core_t   *iclog;         /* ptr to current in-core log */
    __psint_t        ptr;            /* copy address into data region */
    int              len;            /* # xlog_write() bytes 2 still copy */
    int              index;          /* region index currently copying */
@@ -1790,7 +1787,7 @@ xlog_write(xfs_mount_t *	mp,
        xfs_cmn_err(XFS_PTAG_LOGRES, CE_ALERT, mp,
                "xfs_log_write: reservation ran out. Need to up reservation");
        /* If we did not panic, shutdown the filesystem */
-        xfs_force_shutdown(mp, XFS_CORRUPT_INCORE);
+        xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
 #endif
    } else
        ticket->t_curr_res -= len;
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 1f0016b0b4ec..3cb678e3a132 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
 * All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
@@ -24,7 +24,6 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
@@ -32,7 +31,6 @@
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
@@ -193,14 +191,14 @@ xlog_header_check_dump(
 {
        int                     b;
-        printk("%s:  SB : uuid = ", __FUNCTION__);
+        cmn_err(CE_DEBUG, "%s:  SB : uuid = ", __FUNCTION__);
        for (b = 0; b < 16; b++)
-                printk("%02x",((unsigned char *)&mp->m_sb.sb_uuid)[b]);
+                cmn_err(CE_DEBUG, "%02x", ((uchar_t *)&mp->m_sb.sb_uuid)[b]);
-        printk(", fmt = %d\n", XLOG_FMT);
+        cmn_err(CE_DEBUG, ", fmt = %d\n", XLOG_FMT);
-        printk("    log : uuid = ");
+        cmn_err(CE_DEBUG, "    log : uuid = ");
        for (b = 0; b < 16; b++)
-                printk("%02x",((unsigned char *)&head->h_fs_uuid)[b]);
+                cmn_err(CE_DEBUG, "%02x",((uchar_t *)&head->h_fs_uuid)[b]);
-        printk(", fmt = %d\n", INT_GET(head->h_fmt, ARCH_CONVERT));
+        cmn_err(CE_DEBUG, ", fmt = %d\n", INT_GET(head->h_fmt, ARCH_CONVERT));
 }
 #else
 #define xlog_header_check_dump(mp, head)
@@ -282,7 +280,7 @@ xlog_recover_iodone(
                mp = XFS_BUF_FSPRIVATE(bp, xfs_mount_t *);
                xfs_ioerror_alert("xlog_recover_iodone",
                                  mp, bp, XFS_BUF_ADDR(bp));
-                xfs_force_shutdown(mp, XFS_METADATA_IO_ERROR);
+                xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
        }
        XFS_BUF_SET_FSPRIVATE(bp, NULL);
        XFS_BUF_CLR_IODONE_FUNC(bp);
@@ -992,6 +990,8 @@ xlog_find_zeroed(
        xfs_daddr_t     num_scan_bblks;
        int             error, log_bbnum = log->l_logBBsize;
+        *blk_no = 0;
        /* check totally zeroed log */
        bp = xlog_get_bp(log, 1);
        if (!bp)
@@ -1889,7 +1889,7 @@ xlog_recover_do_inode_buffer(
                buffer_nextp = (xfs_agino_t *)xfs_buf_offset(bp,
                                              next_unlinked_offset);
-                INT_SET(*buffer_nextp, ARCH_CONVERT, *logged_nextp);
+                *buffer_nextp = *logged_nextp;
        }
        return 0;
@@ -2292,12 +2292,22 @@ xlog_recover_do_inode_trans(
        int                     attr_index;
        uint                    fields;
        xfs_dinode_core_t       *dicp;
+        int                     need_free = 0;
        if (pass == XLOG_RECOVER_PASS1) {
                return 0;
        }
-        in_f = (xfs_inode_log_format_t *)item->ri_buf[0].i_addr;
+        if (item->ri_buf[0].i_len == sizeof(xfs_inode_log_format_t)) {
+                in_f = (xfs_inode_log_format_t *)item->ri_buf[0].i_addr;
+        } else {
+                in_f = (xfs_inode_log_format_t *)kmem_alloc(
+                        sizeof(xfs_inode_log_format_t), KM_SLEEP);
+                need_free = 1;
+                error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f);
+                if (error)
+                        goto error;
+        }
        ino = in_f->ilf_ino;
        mp = log->l_mp;
        if (ITEM_TYPE(item) == XFS_LI_INODE) {
@@ -2323,8 +2333,10 @@ xlog_recover_do_inode_trans(
         * Inode buffers can be freed, look out for it,
         * and do not replay the inode.
         */
-        if (xlog_check_buffer_cancelled(log, imap.im_blkno, imap.im_len, 0))
+        if (xlog_check_buffer_cancelled(log, imap.im_blkno, imap.im_len, 0)) {
-                return 0;
+                error = 0;
+                goto error;
+        }
        bp = xfs_buf_read_flags(mp->m_ddev_targp, imap.im_blkno, imap.im_len,
                                                                XFS_BUF_LOCK);
@@ -2333,7 +2345,7 @@ xlog_recover_do_inode_trans(
                                  bp, imap.im_blkno);
                error = XFS_BUF_GETERROR(bp);
                xfs_buf_relse(bp);
-                return error;
+                goto error;
        }
        error = 0;
        ASSERT(in_f->ilf_fields & XFS_ILOG_CORE);
@@ -2350,7 +2362,8 @@ xlog_recover_do_inode_trans(
                        dip, bp, ino);
                XFS_ERROR_REPORT("xlog_recover_do_inode_trans(1)",
                                 XFS_ERRLEVEL_LOW, mp);
-                return XFS_ERROR(EFSCORRUPTED);
+                error = EFSCORRUPTED;
+                goto error;
        }
        dicp = (xfs_dinode_core_t*)(item->ri_buf[1].i_addr);
        if (unlikely(dicp->di_magic != XFS_DINODE_MAGIC)) {
@@ -2360,7 +2373,8 @@ xlog_recover_do_inode_trans(
                        item, ino);
                XFS_ERROR_REPORT("xlog_recover_do_inode_trans(2)",
                                 XFS_ERRLEVEL_LOW, mp);
-                return XFS_ERROR(EFSCORRUPTED);
+                error = EFSCORRUPTED;
+                goto error;
        }
        /* Skip replay when the on disk inode is newer than the log one */
@@ -2376,7 +2390,8 @@ xlog_recover_do_inode_trans(
                        /* do nothing */
                } else {
                        xfs_buf_relse(bp);
-                        return 0;
+                        error = 0;
+                        goto error;
                }
        }
        /* Take the opportunity to reset the flush iteration count */
@@ -2391,7 +2406,8 @@ xlog_recover_do_inode_trans(
                        xfs_fs_cmn_err(CE_ALERT, mp,
                                "xfs_inode_recover: Bad regular inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
                                item, dip, bp, ino);
-                        return XFS_ERROR(EFSCORRUPTED);
+                        error = EFSCORRUPTED;
+                        goto error;
                }
        } else if (unlikely((dicp->di_mode & S_IFMT) == S_IFDIR)) {
                if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
@@ -2403,7 +2419,8 @@ xlog_recover_do_inode_trans(
                        xfs_fs_cmn_err(CE_ALERT, mp,
                                "xfs_inode_recover: Bad dir inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
                                item, dip, bp, ino);
-                        return XFS_ERROR(EFSCORRUPTED);
+                        error = EFSCORRUPTED;
+                        goto error;
                }
        }
        if (unlikely(dicp->di_nextents + dicp->di_anextents > dicp->di_nblocks)){
@@ -2415,7 +2432,8 @@ xlog_recover_do_inode_trans(
                        item, dip, bp, ino,
                        dicp->di_nextents + dicp->di_anextents,
                        dicp->di_nblocks);
-                return XFS_ERROR(EFSCORRUPTED);
+                error = EFSCORRUPTED;
+                goto error;
        }
        if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) {
                XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(6)",
@@ -2424,7 +2442,8 @@ xlog_recover_do_inode_trans(
                xfs_fs_cmn_err(CE_ALERT, mp,
                        "xfs_inode_recover: Bad inode log rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, forkoff 0x%x",
                        item, dip, bp, ino, dicp->di_forkoff);
-                return XFS_ERROR(EFSCORRUPTED);
+                error = EFSCORRUPTED;
+                goto error;
        }
        if (unlikely(item->ri_buf[1].i_len > sizeof(xfs_dinode_core_t))) {
                XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(7)",
@@ -2433,7 +2452,8 @@ xlog_recover_do_inode_trans(
                xfs_fs_cmn_err(CE_ALERT, mp,
                        "xfs_inode_recover: Bad inode log record length %d, rec ptr 0x%p",
                        item->ri_buf[1].i_len, item);
-                return XFS_ERROR(EFSCORRUPTED);
+                error = EFSCORRUPTED;
+                goto error;
        }
        /* The core is in in-core format */
@@ -2521,7 +2541,8 @@ xlog_recover_do_inode_trans(
                        xlog_warn("XFS: xlog_recover_do_inode_trans: Invalid flag");
                        ASSERT(0);
                        xfs_buf_relse(bp);
-                        return XFS_ERROR(EIO);
+                        error = EIO;
+                        goto error;
                }
        }
@@ -2537,7 +2558,10 @@ write_inode_buffer:
                error = xfs_bwrite(mp, bp);
        }
-        return (error);
+error:
+        if (need_free)
+                kmem_free(in_f, sizeof(*in_f));
+        return XFS_ERROR(error);
 }
 /*
@@ -2674,32 +2698,32 @@ xlog_recover_do_dquot_trans(
 * structure into it, and adds the efi to the AIL with the given
 * LSN.
 */
-STATIC void
+STATIC int
 xlog_recover_do_efi_trans(
        xlog_t                  *log,
        xlog_recover_item_t     *item,
        xfs_lsn_t               lsn,
        int                     pass)
 {
+        int                     error;
        xfs_mount_t             *mp;
        xfs_efi_log_item_t      *efip;
        xfs_efi_log_format_t    *efi_formatp;
        SPLDECL(s);
        if (pass == XLOG_RECOVER_PASS1) {
-                return;
+                return 0;
        }
        efi_formatp = (xfs_efi_log_format_t *)item->ri_buf[0].i_addr;
-        ASSERT(item->ri_buf[0].i_len ==
-               (sizeof(xfs_efi_log_format_t) +
-                ((efi_formatp->efi_nextents - 1) * sizeof(xfs_extent_t))));
        mp = log->l_mp;
        efip = xfs_efi_init(mp, efi_formatp->efi_nextents);
-        memcpy((char *)&(efip->efi_format), (char *)efi_formatp,
+        if ((error = xfs_efi_copy_format(&(item->ri_buf[0]),
-              sizeof(xfs_efi_log_format_t) +
+                                         &(efip->efi_format)))) {
-              ((efi_formatp->efi_nextents - 1) * sizeof(xfs_extent_t)));
+                xfs_efi_item_free(efip);
+                return error;
+        }
        efip->efi_next_extent = efi_formatp->efi_nextents;
        efip->efi_flags |= XFS_EFI_COMMITTED;
@@ -2708,6 +2732,7 @@ xlog_recover_do_efi_trans(
         * xfs_trans_update_ail() drops the AIL lock.
         */
        xfs_trans_update_ail(mp, (xfs_log_item_t *)efip, lsn, s);
+        return 0;
 }
@@ -2738,9 +2763,10 @@ xlog_recover_do_efd_trans(
        }
        efd_formatp = (xfs_efd_log_format_t *)item->ri_buf[0].i_addr;
-        ASSERT(item->ri_buf[0].i_len ==
+        ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) +
-               (sizeof(xfs_efd_log_format_t) +
+                ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) ||
-                ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_t))));
+               (item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_64_t) +
+                ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_64_t)))));
        efi_id = efd_formatp->efd_efi_id;
        /*
@@ -2810,15 +2836,14 @@ xlog_recover_do_trans(
                        if  ((error = xlog_recover_do_buffer_trans(log, item,
                                                                 pass)))
                                break;
-                } else if ((ITEM_TYPE(item) == XFS_LI_INODE) ||
+                } else if ((ITEM_TYPE(item) == XFS_LI_INODE)) {
-                           (ITEM_TYPE(item) == XFS_LI_6_1_INODE) ||
-                           (ITEM_TYPE(item) == XFS_LI_5_3_INODE)) {
                        if ((error = xlog_recover_do_inode_trans(log, item,
                                                                pass)))
                                break;
                } else if (ITEM_TYPE(item) == XFS_LI_EFI) {
-                        xlog_recover_do_efi_trans(log, item, trans->r_lsn,
+                        if ((error = xlog_recover_do_efi_trans(log, item, trans->r_lsn,
-                                                  pass);
+                                                  pass)))
+                                break;
                } else if (ITEM_TYPE(item) == XFS_LI_EFD) {
                        xlog_recover_do_efd_trans(log, item, pass);
                } else if (ITEM_TYPE(item) == XFS_LI_DQUOT) {
@@ -3419,13 +3444,13 @@ xlog_unpack_data_checksum(
            if (rhead->h_chksum ||
                ((log->l_flags & XLOG_CHKSUM_MISMATCH) == 0)) {
                    cmn_err(CE_DEBUG,
-                        "XFS: LogR chksum mismatch: was (0x%x) is (0x%x)",
+                        "XFS: LogR chksum mismatch: was (0x%x) is (0x%x)\n",
                            INT_GET(rhead->h_chksum, ARCH_CONVERT), chksum);
                    cmn_err(CE_DEBUG,
 "XFS: Disregard message if filesystem was created with non-DEBUG kernel");
                    if (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb)) {
                            cmn_err(CE_DEBUG,
-                                "XFS: LogR this is a LogV2 filesystem");
+                                "XFS: LogR this is a LogV2 filesystem\n");
                    }
                    log->l_flags |= XLOG_CHKSUM_MISMATCH;
            }
@@ -3798,7 +3823,7 @@ xlog_do_log_recovery(
        error = xlog_do_recovery_pass(log, head_blk, tail_blk,
                                      XLOG_RECOVER_PASS2);
 #ifdef DEBUG
-        {
+        if (!error) {
                int     i;
                for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
@@ -3974,7 +3999,7 @@ xlog_recover_finish(
                log->l_flags &= ~XLOG_RECOVERY_NEEDED;
        } else {
                cmn_err(CE_DEBUG,
-                        "!Ending clean XFS mount for filesystem: %s",
+                        "!Ending clean XFS mount for filesystem: %s\n",
                        log->l_mp->m_fsname);
        }
        return 0;
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index c0b1c2906880..4be5c0b2d296 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -24,14 +24,12 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
@@ -196,7 +194,7 @@ xfs_mount_free(
                kmem_free(mp->m_logname, strlen(mp->m_logname) + 1);
        if (remove_bhv) {
-                struct vfs      *vfsp = XFS_MTOVFS(mp);
+                struct bhv_vfs  *vfsp = XFS_MTOVFS(mp);
                bhv_remove_all_vfsops(vfsp, 0);
                VFS_REMOVEBHV(vfsp, &mp->m_bhv);
@@ -337,7 +335,7 @@ xfs_mount_validate_sb(
 xfs_agnumber_t
 xfs_initialize_perag(
-        struct vfs      *vfs,
+        bhv_vfs_t       *vfs,
        xfs_mount_t     *mp,
        xfs_agnumber_t  agcount)
 {
@@ -651,14 +649,14 @@ xfs_mount_common(xfs_mount_t *mp, xfs_sb_t *sbp)
 */
 int
 xfs_mountfs(
-        vfs_t           *vfsp,
+        bhv_vfs_t       *vfsp,
        xfs_mount_t     *mp,
        int             mfsi_flags)
 {
        xfs_buf_t       *bp;
        xfs_sb_t        *sbp = &(mp->m_sb);
        xfs_inode_t     *rip;
-        vnode_t         *rvp = NULL;
+        bhv_vnode_t     *rvp = NULL;
        int             readio_log, writeio_log;
        xfs_daddr_t     d;
        __uint64_t      ret64;
@@ -934,18 +932,7 @@ xfs_mountfs(
        vfsp->vfs_altfsid = (xfs_fsid_t *)mp->m_fixedfsid;
        mp->m_dmevmask = 0;     /* not persistent; set after each mount */
-        /*
+        xfs_dir_mount(mp);
-         * Select the right directory manager.
-         */
-        mp->m_dirops =
-                XFS_SB_VERSION_HASDIRV2(&mp->m_sb) ?
-                        xfsv2_dirops :
-                        xfsv1_dirops;
-        /*
-         * Initialize directory manager's entries.
-         */
-        XFS_DIR_MOUNT(mp);
        /*
         * Initialize the attribute manager's entries.
@@ -1006,8 +993,9 @@ xfs_mountfs(
        if (unlikely((rip->i_d.di_mode & S_IFMT) != S_IFDIR)) {
                cmn_err(CE_WARN, "XFS: corrupted root inode");
-                prdev("Root inode %llu is not a directory",
+                cmn_err(CE_WARN, "Device %s - root %llu is not a directory",
-                      mp->m_ddev_targp, (unsigned long long)rip->i_ino);
+                        XFS_BUFTARG_NAME(mp->m_ddev_targp),
+                        (unsigned long long)rip->i_ino);
                xfs_iunlock(rip, XFS_ILOCK_EXCL);
                XFS_ERROR_REPORT("xfs_mountfs_int(2)", XFS_ERRLEVEL_LOW,
                                 mp);
@@ -1094,7 +1082,7 @@ xfs_mountfs(
 int
 xfs_unmountfs(xfs_mount_t *mp, struct cred *cr)
 {
-        struct vfs      *vfsp = XFS_MTOVFS(mp);
+        struct bhv_vfs  *vfsp = XFS_MTOVFS(mp);
 #if defined(DEBUG) || defined(INDUCE_IO_ERROR)
        int64_t         fsid;
 #endif
@@ -1254,6 +1242,26 @@ xfs_mod_sb(xfs_trans_t *tp, __int64_t fields)
        xfs_trans_log_buf(tp, bp, first, last);
 }
+/*
+ * In order to avoid ENOSPC-related deadlock caused by
+ * out-of-order locking of AGF buffer (PV 947395), we place
+ * constraints on the relationship among actual allocations for
+ * data blocks, freelist blocks, and potential file data bmap
+ * btree blocks. However, these restrictions may result in no
+ * actual space allocated for a delayed extent, for example, a data
+ * block in a certain AG is allocated but there is no additional
+ * block for the additional bmap btree block due to a split of the
+ * bmap btree of the file. The result of this may lead to an
+ * infinite loop in xfssyncd when the file gets flushed to disk and
+ * all delayed extents need to be actually allocated. To get around
+ * this, we explicitly set aside a few blocks which will not be
+ * reserved in delayed allocation. Considering that the minimum number of
+ * needed freelist blocks is 4 fsbs and a potential split of the file's bmap
+ * btree requires 1 fsb, we set the number of set-aside blocks to 8.
+*/
+#define SET_ASIDE_BLOCKS 8
 /*
 * xfs_mod_incore_sb_unlocked() is a utility routine common used to apply
 * a delta to a specified field in the in-core superblock.  Simply
@@ -1298,7 +1306,7 @@ xfs_mod_incore_sb_unlocked(xfs_mount_t *mp, xfs_sb_field_t field,
                return 0;
        case XFS_SBS_FDBLOCKS:
-                lcounter = (long long)mp->m_sb.sb_fdblocks;
+                lcounter = (long long)mp->m_sb.sb_fdblocks - SET_ASIDE_BLOCKS;
                res_used = (long long)(mp->m_resblks - mp->m_resblks_avail);
                if (delta > 0) {                /* Putting blocks back */
@@ -1332,7 +1340,7 @@ xfs_mod_incore_sb_unlocked(xfs_mount_t *mp, xfs_sb_field_t field,
                        }
                }
-                mp->m_sb.sb_fdblocks = lcounter;
+                mp->m_sb.sb_fdblocks = lcounter + SET_ASIDE_BLOCKS;
                return 0;
        case XFS_SBS_FREXTENTS:
                lcounter = (long long)mp->m_sb.sb_frextents;
@@ -1713,15 +1721,14 @@ xfs_mount_log_sbunit(
 * is present to prevent thrashing).
 */
+#ifdef CONFIG_HOTPLUG_CPU
 /*
 * hot-plug CPU notifier support.
 *
- * We cannot use the hotcpu_register() function because it does
+ * We need a notifier per filesystem as we need to be able to identify
- * not allow notifier instances. We need a notifier per filesystem
+ * the filesystem to balance the counters out. This is achieved by
- * as we need to be able to identify the filesystem to balance
+ * having a notifier block embedded in the xfs_mount_t and doing pointer
- * the counters out. This is achieved by having a notifier block
+ * magic to get the mount pointer from the notifier block address.
- * embedded in the xfs_mount_t and doing pointer magic to get the
- * mount pointer from the notifier block address.
 */
 STATIC int
 xfs_icsb_cpu_notify(
@@ -1771,6 +1778,7 @@ xfs_icsb_cpu_notify(
        return NOTIFY_OK;
 }
+#endif /* CONFIG_HOTPLUG_CPU */
 int
 xfs_icsb_init_counters(
@@ -1783,9 +1791,11 @@ xfs_icsb_init_counters(
        if (mp->m_sb_cnts == NULL)
                return -ENOMEM;
+#ifdef CONFIG_HOTPLUG_CPU
        mp->m_icsb_notifier.notifier_call = xfs_icsb_cpu_notify;
        mp->m_icsb_notifier.priority = 0;
-        register_cpu_notifier(&mp->m_icsb_notifier);
+        register_hotcpu_notifier(&mp->m_icsb_notifier);
+#endif /* CONFIG_HOTPLUG_CPU */
        for_each_online_cpu(i) {
                cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i);
@@ -1804,7 +1814,7 @@ xfs_icsb_destroy_counters(
        xfs_mount_t     *mp)
 {
        if (mp->m_sb_cnts) {
-                unregister_cpu_notifier(&mp->m_icsb_notifier);
+                unregister_hotcpu_notifier(&mp->m_icsb_notifier);
                free_percpu(mp->m_sb_cnts);
        }
 }
@@ -2018,7 +2028,7 @@ xfs_icsb_balance_counter(
        xfs_sb_field_t  field,
        int             flags)
 {
-        uint64_t        count, resid = 0;
+        uint64_t        count, resid;
        int             weight = num_online_cpus();
        int             s;
@@ -2050,6 +2060,7 @@ xfs_icsb_balance_counter(
                break;
        default:
                BUG();
+                count = resid = 0;      /* quiet, gcc */
                break;
        }
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 668ad23fd37c..b2bd4be4200a 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -53,8 +53,8 @@ typedef struct xfs_trans_reservations {
 #else
 struct cred;
 struct log;
-struct vfs;
+struct bhv_vfs;
-struct vnode;
+struct bhv_vnode;
 struct xfs_mount_args;
 struct xfs_ihash;
 struct xfs_chash;
@@ -63,9 +63,11 @@ struct xfs_perag;
 struct xfs_iocore;
 struct xfs_bmbt_irec;
 struct xfs_bmap_free;
+struct xfs_extdelta;
+struct xfs_swapext;
-extern struct vfsops xfs_vfsops;
+extern struct bhv_vfsops xfs_vfsops;
-extern struct vnodeops xfs_vnodeops;
+extern struct bhv_vnodeops xfs_vnodeops;
 #define AIL_LOCK_T              lock_t
 #define AIL_LOCKINIT(x,y)       spinlock_init(x,y)
@@ -78,15 +80,15 @@ extern struct vnodeops xfs_vnodeops;
 * Prototypes and functions for the Data Migration subsystem.
 */
-typedef int     (*xfs_send_data_t)(int, struct vnode *,
+typedef int     (*xfs_send_data_t)(int, struct bhv_vnode *,
-                        xfs_off_t, size_t, int, vrwlock_t *);
+                        xfs_off_t, size_t, int, bhv_vrwlock_t *);
 typedef int     (*xfs_send_mmap_t)(struct vm_area_struct *, uint);
-typedef int     (*xfs_send_destroy_t)(struct vnode *, dm_right_t);
+typedef int     (*xfs_send_destroy_t)(struct bhv_vnode *, dm_right_t);
-typedef int     (*xfs_send_namesp_t)(dm_eventtype_t, struct vfs *,
+typedef int     (*xfs_send_namesp_t)(dm_eventtype_t, struct bhv_vfs *,
-                        struct vnode *,
+                        struct bhv_vnode *,
-                        dm_right_t, struct vnode *, dm_right_t,
+                        dm_right_t, struct bhv_vnode *, dm_right_t,
                        char *, char *, mode_t, int, int);
-typedef void    (*xfs_send_unmount_t)(struct vfs *, struct vnode *,
+typedef void    (*xfs_send_unmount_t)(struct bhv_vfs *, struct bhv_vnode *,
                        dm_right_t, mode_t, int, int);
 typedef struct xfs_dmops {
@@ -188,13 +190,18 @@ typedef struct xfs_qmops {
 * Prototypes and functions for I/O core modularization.
 */
-typedef int             (*xfs_ioinit_t)(struct vfs *,
+typedef int             (*xfs_ioinit_t)(struct bhv_vfs *,
                                struct xfs_mount_args *, int);
 typedef int             (*xfs_bmapi_t)(struct xfs_trans *, void *,
                                xfs_fileoff_t, xfs_filblks_t, int,
                                xfs_fsblock_t *, xfs_extlen_t,
                                struct xfs_bmbt_irec *, int *,
-                                struct xfs_bmap_free *);
+                                struct xfs_bmap_free *, struct xfs_extdelta *);
+typedef int             (*xfs_bunmapi_t)(struct xfs_trans *,
+                                void *, xfs_fileoff_t,
+                                xfs_filblks_t, int, xfs_extnum_t,
+                                xfs_fsblock_t *, struct xfs_bmap_free *,
+                                struct xfs_extdelta *, int *);
 typedef int             (*xfs_bmap_eof_t)(void *, xfs_fileoff_t, int, int *);
 typedef int             (*xfs_iomap_write_direct_t)(
                                void *, xfs_off_t, size_t, int,
@@ -213,11 +220,14 @@ typedef void		(*xfs_lock_demote_t)(void *, uint);
 typedef int             (*xfs_lock_nowait_t)(void *, uint);
 typedef void            (*xfs_unlk_t)(void *, unsigned int);
 typedef xfs_fsize_t     (*xfs_size_t)(void *);
-typedef xfs_fsize_t     (*xfs_iodone_t)(struct vfs *);
+typedef xfs_fsize_t     (*xfs_iodone_t)(struct bhv_vfs *);
+typedef int             (*xfs_swap_extents_t)(void *, void *,
+                                struct xfs_swapext*);
 typedef struct xfs_ioops {
        xfs_ioinit_t                    xfs_ioinit;
        xfs_bmapi_t                     xfs_bmapi_func;
+        xfs_bunmapi_t                   xfs_bunmapi_func;
        xfs_bmap_eof_t                  xfs_bmap_eof_func;
        xfs_iomap_write_direct_t        xfs_iomap_write_direct;
        xfs_iomap_write_delay_t         xfs_iomap_write_delay;
@@ -230,13 +240,17 @@ typedef struct xfs_ioops {
        xfs_unlk_t                      xfs_unlock;
        xfs_size_t                      xfs_size_func;
        xfs_iodone_t                    xfs_iodone;
+        xfs_swap_extents_t              xfs_swap_extents_func;
 } xfs_ioops_t;
 #define XFS_IOINIT(vfsp, args, flags) \
        (*(mp)->m_io_ops.xfs_ioinit)(vfsp, args, flags)
-#define XFS_BMAPI(mp, trans,io,bno,len,f,first,tot,mval,nmap,flist)     \
+#define XFS_BMAPI(mp, trans,io,bno,len,f,first,tot,mval,nmap,flist,delta) \
        (*(mp)->m_io_ops.xfs_bmapi_func) \
-                (trans,(io)->io_obj,bno,len,f,first,tot,mval,nmap,flist)
+                (trans,(io)->io_obj,bno,len,f,first,tot,mval,nmap,flist,delta)
+#define XFS_BUNMAPI(mp, trans,io,bno,len,f,nexts,first,flist,delta,done) \
+        (*(mp)->m_io_ops.xfs_bunmapi_func) \
+                (trans,(io)->io_obj,bno,len,f,nexts,first,flist,delta,done)
 #define XFS_BMAP_EOF(mp, io, endoff, whichfork, eof) \
        (*(mp)->m_io_ops.xfs_bmap_eof_func) \
                ((io)->io_obj, endoff, whichfork, eof)
@@ -266,6 +280,9 @@ typedef struct xfs_ioops {
        (*(mp)->m_io_ops.xfs_size_func)((io)->io_obj)
 #define XFS_IODONE(vfsp) \
        (*(mp)->m_io_ops.xfs_iodone)(vfsp)
+#define XFS_SWAP_EXTENTS(mp, io, tio, sxp) \
+        (*(mp)->m_io_ops.xfs_swap_extents_func) \
+                ((io)->io_obj, (tio)->io_obj, sxp)
 #ifdef HAVE_PERCPU_SB
@@ -386,8 +403,6 @@ typedef struct xfs_mount {
        __uint8_t               m_inode_quiesce;/* call quiesce on new inodes.
                                                   field governed by m_ilock */
        __uint8_t               m_sectbb_log;   /* sectlog - BBSHIFT */
-        __uint8_t               m_dirversion;   /* 1 or 2 */
-        xfs_dirops_t            m_dirops;       /* table of dir funcs */
        int                     m_dirblksize;   /* directory block sz--bytes */
        int                     m_dirblkfsbs;   /* directory block sz--fsbs */
        xfs_dablk_t             m_dirdatablk;   /* blockno of dir data v2 */
@@ -494,16 +509,7 @@ xfs_preferred_iosize(xfs_mount_t *mp)
 #define XFS_FORCED_SHUTDOWN(mp) ((mp)->m_flags & XFS_MOUNT_FS_SHUTDOWN)
 #define xfs_force_shutdown(m,f) \
-        VFS_FORCE_SHUTDOWN((XFS_MTOVFS(m)), f, __FILE__, __LINE__)
+        bhv_vfs_force_shutdown((XFS_MTOVFS(m)), f, __FILE__, __LINE__)
-/*
- * Flags sent to xfs_force_shutdown.
- */
-#define XFS_METADATA_IO_ERROR   0x1
-#define XFS_LOG_IO_ERROR        0x2
-#define XFS_FORCE_UMOUNT        0x4
-#define XFS_CORRUPT_INCORE      0x8     /* Corrupt in-memory data structures */
-#define XFS_SHUTDOWN_REMOTE_REQ 0x10    /* Shutdown came from remote cell */
 /*
 * Flags for xfs_mountfs
@@ -521,7 +527,7 @@ xfs_preferred_iosize(xfs_mount_t *mp)
 * Macros for getting from mount to vfs and back.
 */
 #define XFS_MTOVFS(mp)          xfs_mtovfs(mp)
-static inline struct vfs *xfs_mtovfs(xfs_mount_t *mp)
+static inline struct bhv_vfs *xfs_mtovfs(xfs_mount_t *mp)
 {
        return bhvtovfs(&mp->m_bhv);
 }
@@ -533,7 +539,7 @@ static inline xfs_mount_t *xfs_bhvtom(bhv_desc_t *bdp)
 }
 #define XFS_VFSTOM(vfs) xfs_vfstom(vfs)
-static inline xfs_mount_t *xfs_vfstom(vfs_t *vfs)
+static inline xfs_mount_t *xfs_vfstom(bhv_vfs_t *vfs)
 {
        return XFS_BHVTOM(bhv_lookup(VFS_BHVHEAD(vfs), &xfs_vfsops));
 }
@@ -571,7 +577,7 @@ typedef struct xfs_mod_sb {
 extern xfs_mount_t *xfs_mount_init(void);
 extern void     xfs_mod_sb(xfs_trans_t *, __int64_t);
 extern void     xfs_mount_free(xfs_mount_t *mp, int remove_bhv);
-extern int      xfs_mountfs(struct vfs *, xfs_mount_t *mp, int);
+extern int      xfs_mountfs(struct bhv_vfs *, xfs_mount_t *mp, int);
 extern void     xfs_mountfs_check_barriers(xfs_mount_t *mp);
 extern int      xfs_unmountfs(xfs_mount_t *, struct cred *);
@@ -589,7 +595,7 @@ extern void	xfs_freesb(xfs_mount_t *);
 extern void     xfs_do_force_shutdown(bhv_desc_t *, int, char *, int);
 extern int      xfs_syncsub(xfs_mount_t *, int, int, int *);
 extern int      xfs_sync_inodes(xfs_mount_t *, int, int, int *);
-extern xfs_agnumber_t   xfs_initialize_perag(struct vfs *, xfs_mount_t *,
+extern xfs_agnumber_t   xfs_initialize_perag(struct bhv_vfs *, xfs_mount_t *,
                                                xfs_agnumber_t);
 extern void     xfs_xlatesb(void *, struct xfs_sb *, int, __int64_t);
diff --git a/fs/xfs/xfs_qmops.c b/fs/xfs/xfs_qmops.c
index 1408a32eef88..320d63ff9ca2 100644
--- a/fs/xfs/xfs_qmops.c
+++ b/fs/xfs/xfs_qmops.c
@@ -23,7 +23,6 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index 7fbef974bce6..acb853b33ebb 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -365,7 +365,7 @@ typedef struct xfs_dqtrxops {
 extern int xfs_qm_dqcheck(xfs_disk_dquot_t *, xfs_dqid_t, uint, uint, char *);
 extern int xfs_mount_reset_sbqflags(struct xfs_mount *);
-extern struct bhv_vfsops xfs_qmops;
+extern struct bhv_module_vfsops xfs_qmops;
 #endif  /* __KERNEL__ */
diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c
index 1f148762eb28..d98171deaa1c 100644
--- a/fs/xfs/xfs_rename.c
+++ b/fs/xfs/xfs_rename.c
@@ -22,13 +22,11 @@
 #include "xfs_inum.h"
 #include "xfs_trans.h"
 #include "xfs_sb.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_da_btree.h"
 #include "xfs_bmap_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
@@ -40,7 +38,6 @@
 #include "xfs_refcache.h"
 #include "xfs_utils.h"
 #include "xfs_trans_space.h"
-#include "xfs_dir_leaf.h"
 /*
@@ -87,8 +84,8 @@ STATIC int
 xfs_lock_for_rename(
        xfs_inode_t     *dp1,   /* old (source) directory inode */
        xfs_inode_t     *dp2,   /* new (target) directory inode */
-        vname_t         *vname1,/* old entry name */
+        bhv_vname_t     *vname1,/* old entry name */
-        vname_t         *vname2,/* new entry name */
+        bhv_vname_t     *vname2,/* new entry name */
        xfs_inode_t     **ipp1, /* inode of old entry */
        xfs_inode_t     **ipp2, /* inode of new entry, if it
                                   already exists, NULL otherwise. */
@@ -225,9 +222,9 @@ xfs_lock_for_rename(
 int
 xfs_rename(
        bhv_desc_t      *src_dir_bdp,
-        vname_t         *src_vname,
+        bhv_vname_t     *src_vname,
-        vnode_t         *target_dir_vp,
+        bhv_vnode_t     *target_dir_vp,
-        vname_t         *target_vname,
+        bhv_vname_t     *target_vname,
        cred_t          *credp)
 {
        xfs_trans_t     *tp;
@@ -242,7 +239,7 @@ xfs_rename(
        int             committed;
        xfs_inode_t     *inodes[4];
        int             target_ip_dropped = 0;  /* dropped target_ip link? */
-        vnode_t         *src_dir_vp;
+        bhv_vnode_t     *src_dir_vp;
        int             spaceres;
        int             target_link_zero = 0;
        int             num_inodes;
@@ -398,34 +395,29 @@ xfs_rename(
                 * fit before actually inserting it.
                 */
                if (spaceres == 0 &&
-                    (error = XFS_DIR_CANENTER(mp, tp, target_dp, target_name,
+                    (error = xfs_dir_canenter(tp, target_dp, target_name,
-                                target_namelen))) {
+                                                target_namelen)))
                        goto error_return;
-                }
                /*
                 * If target does not exist and the rename crosses
                 * directories, adjust the target directory link count
                 * to account for the ".." reference from the new entry.
                 */
-                error = XFS_DIR_CREATENAME(mp, tp, target_dp, target_name,
+                error = xfs_dir_createname(tp, target_dp, target_name,
                                           target_namelen, src_ip->i_ino,
                                           &first_block, &free_list, spaceres);
-                if (error == ENOSPC) {
+                if (error == ENOSPC)
                        goto error_return;
-                }
+                if (error)
-                if (error) {
                        goto abort_return;
-                }
                xfs_ichgtime(target_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
                if (new_parent && src_is_directory) {
                        error = xfs_bumplink(tp, target_dp);
-                        if (error) {
+                        if (error)
                                goto abort_return;
-                        }
                }
        } else { /* target_ip != NULL */
                /*
                 * If target exists and it's a directory, check that both
                 * target and source are directories and that target can be
@@ -435,7 +427,7 @@ xfs_rename(
                        /*
                         * Make sure target dir is empty.
                         */
-                        if (!(XFS_DIR_ISEMPTY(target_ip->i_mount, target_ip)) ||
+                        if (!(xfs_dir_isempty(target_ip)) ||
                            (target_ip->i_d.di_nlink > 2)) {
                                error = XFS_ERROR(EEXIST);
                                goto error_return;
@@ -451,12 +443,11 @@ xfs_rename(
                 * In case there is already an entry with the same
                 * name at the destination directory, remove it first.
                 */
-                error = XFS_DIR_REPLACE(mp, tp, target_dp, target_name,
+                error = xfs_dir_replace(tp, target_dp, target_name,
-                        target_namelen, src_ip->i_ino, &first_block,
+                                        target_namelen, src_ip->i_ino,
-                        &free_list, spaceres);
+                                        &first_block, &free_list, spaceres);
-                if (error) {
+                if (error)
                        goto abort_return;
-                }
                xfs_ichgtime(target_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
                /*
@@ -464,9 +455,8 @@ xfs_rename(
                 * dir no longer points to it.
                 */
                error = xfs_droplink(tp, target_ip);
-                if (error) {
+                if (error)
                        goto abort_return;
-                }
                target_ip_dropped = 1;
                if (src_is_directory) {
@@ -474,9 +464,8 @@ xfs_rename(
                         * Drop the link from the old "." entry.
                         */
                        error = xfs_droplink(tp, target_ip);
-                        if (error) {
+                        if (error)
                                goto abort_return;
-                        }
                }
                /* Do this test while we still hold the locks */
@@ -488,18 +477,15 @@ xfs_rename(
         * Remove the source.
         */
        if (new_parent && src_is_directory) {
                /*
                 * Rewrite the ".." entry to point to the new
                 * directory.
                 */
-                error = XFS_DIR_REPLACE(mp, tp, src_ip, "..", 2,
+                error = xfs_dir_replace(tp, src_ip, "..", 2, target_dp->i_ino,
-                                        target_dp->i_ino, &first_block,
+                                        &first_block, &free_list, spaceres);
-                                        &free_list, spaceres);
                ASSERT(error != EEXIST);
-                if (error) {
+                if (error)
                        goto abort_return;
-                }
                xfs_ichgtime(src_ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
        } else {
@@ -527,16 +513,14 @@ xfs_rename(
                 * entry that's moved no longer points to it.
                 */
                error = xfs_droplink(tp, src_dp);
-                if (error) {
+                if (error)
                        goto abort_return;
-                }
        }
-        error = XFS_DIR_REMOVENAME(mp, tp, src_dp, src_name, src_namelen,
+        error = xfs_dir_removename(tp, src_dp, src_name, src_namelen,
                        src_ip->i_ino, &first_block, &free_list, spaceres);
-        if (error) {
+        if (error)
                goto abort_return;
-        }
        xfs_ichgtime(src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
        /*
@@ -609,7 +593,7 @@ xfs_rename(
         * Let interposed file systems know about removed links.
         */
        if (target_ip_dropped) {
-                VOP_LINK_REMOVED(XFS_ITOV(target_ip), target_dir_vp,
+                bhv_vop_link_removed(XFS_ITOV(target_ip), target_dir_vp,
                                        target_link_zero);
                IRELE(target_ip);
        }
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 5b413946b1c5..5a0b678956e0 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -24,14 +24,12 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
@@ -141,7 +139,7 @@ xfs_growfs_rt_alloc(
                cancelflags |= XFS_TRANS_ABORT;
                error = xfs_bmapi(tp, ip, oblocks, nblocks - oblocks,
                        XFS_BMAPI_WRITE | XFS_BMAPI_METADATA, &firstblock,
-                        resblks, &map, &nmap, &flist);
+                        resblks, &map, &nmap, &flist, NULL);
                if (!error && nmap < 1)
                        error = XFS_ERROR(ENOSPC);
                if (error)
@@ -1931,7 +1929,7 @@ xfs_growfs_rt(
        /*
         * Initial error checking.
         */
-        if (mp->m_rtdev_targp || mp->m_rbmip == NULL ||
+        if (mp->m_rtdev_targp == NULL || mp->m_rbmip == NULL ||
            (nrblocks = in->newblocks) <= sbp->sb_rblocks ||
            (sbp->sb_rblocks && (in->extsize != sbp->sb_rextsize)))
                return XFS_ERROR(EINVAL);
@@ -2404,10 +2402,10 @@ xfs_rtprint_range(
 {
        xfs_extlen_t    i;              /* block number in the extent */
-        printk("%Ld: ", (long long)start);
+        cmn_err(CE_DEBUG, "%Ld: ", (long long)start);
        for (i = 0; i < len; i++)
-                printk("%d", xfs_rtcheck_bit(mp, tp, start + i, 1));
+                cmn_err(CE_DEBUG, "%d", xfs_rtcheck_bit(mp, tp, start + i, 1));
-        printk("\n");
+        cmn_err(CE_DEBUG, "\n");
 }
 /*
@@ -2431,17 +2429,17 @@ xfs_rtprint_summary(
                        (void)xfs_rtget_summary(mp, tp, l, i, &sumbp, &sb, &c);
                        if (c) {
                                if (!p) {
-                                        printk("%Ld-%Ld:", 1LL << l,
+                                        cmn_err(CE_DEBUG, "%Ld-%Ld:", 1LL << l,
                                                XFS_RTMIN((1LL << l) +
                                                          ((1LL << l) - 1LL),
                                                         mp->m_sb.sb_rextents));
                                        p = 1;
                                }
-                                printk(" %Ld:%d", (long long)i, c);
+                                cmn_err(CE_DEBUG, " %Ld:%d", (long long)i, c);
                        }
                }
                if (p)
-                        printk("\n");
+                        cmn_err(CE_DEBUG, "\n");
        }
        if (sumbp)
                xfs_trans_brelse(tp, sumbp);
diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c
index a59c102cf214..defb2febaaf5 100644
--- a/fs/xfs/xfs_rw.c
+++ b/fs/xfs/xfs_rw.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
 * All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
@@ -24,14 +24,12 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
@@ -92,6 +90,90 @@ xfs_write_clear_setuid(
 }
 /*
+ * Handle logging requirements of various synchronous types of write.
+ */
+int
+xfs_write_sync_logforce(
+        xfs_mount_t     *mp,
+        xfs_inode_t     *ip)
+{
+        int             error = 0;
+        /*
+         * If we're treating this as O_DSYNC and we have not updated the
+         * size, force the log.
+         */
+        if (!(mp->m_flags & XFS_MOUNT_OSYNCISOSYNC) &&
+            !(ip->i_update_size)) {
+                xfs_inode_log_item_t    *iip = ip->i_itemp;
+                /*
+                 * If an allocation transaction occurred
+                 * without extending the size, then we have to force
+                 * the log up to the proper point to ensure that the
+                 * allocation is permanent.  We can't count on
+                 * the fact that buffered writes lock out direct I/O
+                 * writes - the direct I/O write could have extended
+                 * the size nontransactionally, then finished before
+                 * we started.  xfs_write_file will think that the file
+                 * didn't grow but the update isn't safe unless the
+                 * size change is logged.
+                 *
+                 * Force the log if we've committed a transaction
+                 * against the inode or if someone else has and
+                 * the commit record hasn't gone to disk (e.g.
+                 * the inode is pinned).  This guarantees that
+                 * all changes affecting the inode are permanent
+                 * when we return.
+                 */
+                if (iip && iip->ili_last_lsn) {
+                        xfs_log_force(mp, iip->ili_last_lsn,
+                                        XFS_LOG_FORCE | XFS_LOG_SYNC);
+                } else if (xfs_ipincount(ip) > 0) {
+                        xfs_log_force(mp, (xfs_lsn_t)0,
+                                        XFS_LOG_FORCE | XFS_LOG_SYNC);
+                }
+        } else {
+                xfs_trans_t     *tp;
+                /*
+                 * O_SYNC or O_DSYNC _with_ a size update are handled
+                 * the same way.
+                 *
+                 * If the write was synchronous then we need to make
+                 * sure that the inode modification time is permanent.
+                 * We'll have updated the timestamp above, so here
+                 * we use a synchronous transaction to log the inode.
+                 * It's not fast, but it's necessary.
+                 *
+                 * If this is a dsync write and the size got changed
+                 * non-transactionally, then we need to ensure that
+                 * the size change gets logged in a synchronous
+                 * transaction.
+                 */
+                tp = xfs_trans_alloc(mp, XFS_TRANS_WRITE_SYNC);
+                if ((error = xfs_trans_reserve(tp, 0,
+                                                XFS_SWRITE_LOG_RES(mp),
+                                                0, 0, 0))) {
+                        /* Transaction reserve failed */
+                        xfs_trans_cancel(tp, 0);
+                } else {
+                        /* Transaction reserve successful */
+                        xfs_ilock(ip, XFS_ILOCK_EXCL);
+                        xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+                        xfs_trans_ihold(tp, ip);
+                        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+                        xfs_trans_set_sync(tp);
+                        error = xfs_trans_commit(tp, 0, NULL);
+                        xfs_iunlock(ip, XFS_ILOCK_EXCL);
+                }
+        }
+        return error;
+}
+/*
 * Force a shutdown of the filesystem instantly while keeping
 * the filesystem consistent. We don't do an unmount here; just shutdown
 * the shop, make sure that absolutely nothing persistent happens to
@@ -109,12 +191,12 @@ xfs_do_force_shutdown(
        xfs_mount_t     *mp;
        mp = XFS_BHVTOM(bdp);
-        logerror = flags & XFS_LOG_IO_ERROR;
+        logerror = flags & SHUTDOWN_LOG_IO_ERROR;
-        if (!(flags & XFS_FORCE_UMOUNT)) {
+        if (!(flags & SHUTDOWN_FORCE_UMOUNT)) {
-                cmn_err(CE_NOTE,
+                cmn_err(CE_NOTE, "xfs_force_shutdown(%s,0x%x) called from "
-                "xfs_force_shutdown(%s,0x%x) called from line %d of file %s.  Return address = 0x%p",
+                                 "line %d of file %s.  Return address = 0x%p",
-                        mp->m_fsname,flags,lnnum,fname,__return_address);
+                        mp->m_fsname, flags, lnnum, fname, __return_address);
        }
        /*
         * No need to duplicate efforts.
@@ -125,33 +207,37 @@ xfs_do_force_shutdown(
        /*
         * This flags XFS_MOUNT_FS_SHUTDOWN, makes sure that we don't
         * queue up anybody new on the log reservations, and wakes up
-         * everybody who's sleeping on log reservations and tells
+         * everybody who's sleeping on log reservations to tell them
-         * them the bad news.
+         * the bad news.
         */
        if (xfs_log_force_umount(mp, logerror))
                return;
-        if (flags & XFS_CORRUPT_INCORE) {
+        if (flags & SHUTDOWN_CORRUPT_INCORE) {
                xfs_cmn_err(XFS_PTAG_SHUTDOWN_CORRUPT, CE_ALERT, mp,
    "Corruption of in-memory data detected.  Shutting down filesystem: %s",
                        mp->m_fsname);
                if (XFS_ERRLEVEL_HIGH <= xfs_error_level) {
                        xfs_stack_trace();
                }
-        } else if (!(flags & XFS_FORCE_UMOUNT)) {
+        } else if (!(flags & SHUTDOWN_FORCE_UMOUNT)) {
                if (logerror) {
                        xfs_cmn_err(XFS_PTAG_SHUTDOWN_LOGERROR, CE_ALERT, mp,
-                        "Log I/O Error Detected.  Shutting down filesystem: %s",
+                "Log I/O Error Detected.  Shutting down filesystem: %s",
+                                mp->m_fsname);
+                } else if (flags & SHUTDOWN_DEVICE_REQ) {
+                        xfs_cmn_err(XFS_PTAG_SHUTDOWN_IOERROR, CE_ALERT, mp,
+                "All device paths lost.  Shutting down filesystem: %s",
                                mp->m_fsname);
-                } else if (!(flags & XFS_SHUTDOWN_REMOTE_REQ)) {
+                } else if (!(flags & SHUTDOWN_REMOTE_REQ)) {
                        xfs_cmn_err(XFS_PTAG_SHUTDOWN_IOERROR, CE_ALERT, mp,
-                                "I/O Error Detected.  Shutting down filesystem: %s",
+                "I/O Error Detected.  Shutting down filesystem: %s",
                                mp->m_fsname);
                }
        }
-        if (!(flags & XFS_FORCE_UMOUNT)) {
+        if (!(flags & SHUTDOWN_FORCE_UMOUNT)) {
-                cmn_err(CE_ALERT,
+                cmn_err(CE_ALERT, "Please umount the filesystem, "
-                "Please umount the filesystem, and rectify the problem(s)");
+                                  "and rectify the problem(s)");
        }
 }
@@ -335,7 +421,7 @@ xfs_bwrite(
                 * from bwrite and we could be tracing a buffer that has
                 * been reused.
                 */
-                xfs_force_shutdown(mp, XFS_METADATA_IO_ERROR);
+                xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
        }
        return (error);
 }
diff --git a/fs/xfs/xfs_rw.h b/fs/xfs/xfs_rw.h
index e63795644478..188b296ff50c 100644
--- a/fs/xfs/xfs_rw.h
+++ b/fs/xfs/xfs_rw.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
 * All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
@@ -75,6 +75,7 @@ xfs_fsb_to_db_io(struct xfs_iocore *io, xfs_fsblock_t fsb)
 * Prototypes for functions in xfs_rw.c.
 */
 extern int xfs_write_clear_setuid(struct xfs_inode *ip);
+extern int xfs_write_sync_logforce(struct xfs_mount *mp, struct xfs_inode *ip);
 extern int xfs_bwrite(struct xfs_mount *mp, struct xfs_buf *bp);
 extern int xfs_bioerror(struct xfs_buf *bp);
 extern int xfs_bioerror_relse(struct xfs_buf *bp);
@@ -87,9 +88,10 @@ extern void xfs_ioerror_alert(char *func, struct xfs_mount *mp,
 /*
 * Prototypes for functions in xfs_vnodeops.c.
 */
-extern int xfs_rwlock(bhv_desc_t *bdp, vrwlock_t write_lock);
+extern int xfs_rwlock(bhv_desc_t *bdp, bhv_vrwlock_t write_lock);
-extern void xfs_rwunlock(bhv_desc_t *bdp, vrwlock_t write_lock);
+extern void xfs_rwunlock(bhv_desc_t *bdp, bhv_vrwlock_t write_lock);
-extern int xfs_setattr(bhv_desc_t *bdp, vattr_t *vap, int flags, cred_t *credp);
+extern int xfs_setattr(bhv_desc_t *, bhv_vattr_t *vap, int flags,
+                       cred_t *credp);
 extern int xfs_change_file_space(bhv_desc_t *bdp, int cmd, xfs_flock64_t *bf,
                                 xfs_off_t offset, cred_t *credp, int flags);
 extern int xfs_set_dmattrs(bhv_desc_t *bdp, u_int evmask, u_int16_t state,
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 8d056cef5d1f..ee2721e0de4d 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -24,7 +24,6 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
@@ -33,7 +32,6 @@
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
@@ -236,11 +234,8 @@ xfs_trans_alloc(
        xfs_mount_t     *mp,
        uint            type)
 {
-        fs_check_frozen(XFS_MTOVFS(mp), SB_FREEZE_TRANS);
+        vfs_wait_for_freeze(XFS_MTOVFS(mp), SB_FREEZE_TRANS);
-        atomic_inc(&mp->m_active_trans);
+        return _xfs_trans_alloc(mp, type);
-        return (_xfs_trans_alloc(mp, type));
 }
 xfs_trans_t *
@@ -250,12 +245,9 @@ _xfs_trans_alloc(
 {
        xfs_trans_t     *tp;
-        ASSERT(xfs_trans_zone != NULL);
+        atomic_inc(&mp->m_active_trans);
-        tp = kmem_zone_zalloc(xfs_trans_zone, KM_SLEEP);
-        /*
+        tp = kmem_zone_zalloc(xfs_trans_zone, KM_SLEEP);
-         * Initialize the transaction structure.
-         */
        tp->t_magic = XFS_TRANS_MAGIC;
        tp->t_type = type;
        tp->t_mountp = mp;
@@ -263,8 +255,7 @@ _xfs_trans_alloc(
        tp->t_busy_free = XFS_LBC_NUM_SLOTS;
        XFS_LIC_INIT(&(tp->t_items));
        XFS_LBC_INIT(&(tp->t_busy));
+        return tp;
-        return (tp);
 }
 /*
@@ -303,7 +294,7 @@ xfs_trans_dup(
        tp->t_blk_res = tp->t_blk_res_used;
        ntp->t_rtx_res = tp->t_rtx_res - tp->t_rtx_res_used;
        tp->t_rtx_res = tp->t_rtx_res_used;
-        PFLAGS_DUP(&tp->t_pflags, &ntp->t_pflags);
+        ntp->t_pflags = tp->t_pflags;
        XFS_TRANS_DUP_DQINFO(tp->t_mountp, tp, ntp);
@@ -335,14 +326,11 @@ xfs_trans_reserve(
        uint            logcount)
 {
        int             log_flags;
-        int             error;
+        int             error = 0;
-        int     rsvd;
+        int             rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
-        error = 0;
-        rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
        /* Mark this thread as being in a transaction */
-        PFLAGS_SET_FSTRANS(&tp->t_pflags);
+        current_set_flags_nested(&tp->t_pflags, PF_FSTRANS);
        /*
         * Attempt to reserve the needed disk blocks by decrementing
@@ -353,7 +341,7 @@ xfs_trans_reserve(
                error = xfs_mod_incore_sb(tp->t_mountp, XFS_SBS_FDBLOCKS,
                                          -blocks, rsvd);
                if (error != 0) {
-                        PFLAGS_RESTORE_FSTRANS(&tp->t_pflags);
+                        current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
                        return (XFS_ERROR(ENOSPC));
                }
                tp->t_blk_res += blocks;
@@ -426,9 +414,9 @@ undo_blocks:
                tp->t_blk_res = 0;
        }
-        PFLAGS_RESTORE_FSTRANS(&tp->t_pflags);
+        current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
-        return (error);
+        return error;
 }
@@ -819,7 +807,7 @@ shut_us_down:
                        if (commit_lsn == -1 && !shutdown)
                                shutdown = XFS_ERROR(EIO);
                }
-                PFLAGS_RESTORE_FSTRANS(&tp->t_pflags);
+                current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
                xfs_trans_free_items(tp, shutdown? XFS_TRANS_ABORT : 0);
                xfs_trans_free_busy(tp);
                xfs_trans_free(tp);
@@ -846,7 +834,7 @@ shut_us_down:
         */
        nvec = xfs_trans_count_vecs(tp);
        if (nvec == 0) {
-                xfs_force_shutdown(mp, XFS_LOG_IO_ERROR);
+                xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
                goto shut_us_down;
        } else if (nvec <= XFS_TRANS_LOGVEC_COUNT) {
                log_vector = log_vector_fast;
@@ -884,7 +872,7 @@ shut_us_down:
         * had pinned, clean up, free trans structure, and return error.
         */
        if (error || commit_lsn == -1) {
-                PFLAGS_RESTORE_FSTRANS(&tp->t_pflags);
+                current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
                xfs_trans_uncommit(tp, flags|XFS_TRANS_ABORT);
                return XFS_ERROR(EIO);
        }
@@ -926,7 +914,7 @@ shut_us_down:
        /*
         * Mark this thread as no longer being in a transaction
         */
-        PFLAGS_RESTORE_FSTRANS(&tp->t_pflags);
+        current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
        /*
         * Once all the items of the transaction have been copied
@@ -1148,7 +1136,7 @@ xfs_trans_cancel(
         */
        if ((tp->t_flags & XFS_TRANS_DIRTY) && !XFS_FORCED_SHUTDOWN(mp)) {
                XFS_ERROR_REPORT("xfs_trans_cancel", XFS_ERRLEVEL_LOW, mp);
-                xfs_force_shutdown(mp, XFS_CORRUPT_INCORE);
+                xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
        }
 #ifdef DEBUG
        if (!(flags & XFS_TRANS_ABORT)) {
@@ -1182,7 +1170,7 @@ xfs_trans_cancel(
        }
        /* mark this thread as no longer being in a transaction */
-        PFLAGS_RESTORE_FSTRANS(&tp->t_pflags);
+        current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
        xfs_trans_free_items(tp, flags);
        xfs_trans_free_busy(tp);
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 100d9a4b38ee..9dc88b380608 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -338,8 +338,6 @@ typedef void (*xfs_trans_callback_t)(struct xfs_trans *, void *);
 typedef struct xfs_trans {
        unsigned int            t_magic;        /* magic number */
        xfs_log_callback_t      t_logcb;        /* log callback struct */
-        struct xfs_trans        *t_forw;        /* async list pointers */
-        struct xfs_trans        *t_back;        /* async list pointers */
        unsigned int            t_type;         /* transaction type */
        unsigned int            t_log_res;      /* amt of log space resvd */
        unsigned int            t_log_count;    /* count for perm log res */
@@ -364,9 +362,11 @@ typedef struct xfs_trans {
        long                    t_res_fdblocks_delta; /* on-disk only chg */
        long                    t_frextents_delta;/* superblock freextents chg*/
        long                    t_res_frextents_delta; /* on-disk only chg */
+#ifdef DEBUG
        long                    t_ag_freeblks_delta; /* debugging counter */
        long                    t_ag_flist_delta; /* debugging counter */
        long                    t_ag_btree_delta; /* debugging counter */
+#endif
        long                    t_dblocks_delta;/* superblock dblocks change */
        long                    t_agcount_delta;/* superblock agcount change */
        long                    t_imaxpct_delta;/* superblock imaxpct change */
@@ -805,12 +805,9 @@ typedef struct xfs_trans {
        ((mp)->m_sb.sb_inodesize + \
         (mp)->m_sb.sb_sectsize * 2 + \
         (mp)->m_dirblksize + \
-         (XFS_DIR_IS_V1(mp) ? 0 : \
+         XFS_FSB_TO_B(mp, (XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1)) + \
-            XFS_FSB_TO_B(mp, (XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1))) + \
         XFS_ALLOCFREE_LOG_RES(mp, 1) + \
-         (128 * (4 + \
+         (128 * (4 + (XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1) + \
-                 (XFS_DIR_IS_V1(mp) ? 0 : \
-                         XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1) + \
                 XFS_ALLOCFREE_LOG_COUNT(mp, 1))))
 #define XFS_ADDAFORK_LOG_RES(mp)        ((mp)->m_reservations.tr_addafork)
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 19ab24af1c1c..558c87ff0c41 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -22,7 +22,6 @@
 #include "xfs_inum.h"
 #include "xfs_trans.h"
 #include "xfs_sb.h"
-#include "xfs_dir.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_trans_priv.h"
@@ -363,9 +362,10 @@ xfs_trans_delete_ail(
                        AIL_UNLOCK(mp, s);
                else {
                        xfs_cmn_err(XFS_PTAG_AILDELETE, CE_ALERT, mp,
-                                "xfs_trans_delete_ail: attempting to delete a log item that is not in the AIL");
+                "%s: attempting to delete a log item that is not in the AIL",
+                                        __FUNCTION__);
                        AIL_UNLOCK(mp, s);
-                        xfs_force_shutdown(mp, XFS_CORRUPT_INCORE);
+                        xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
                }
        }
 }
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index c74c31ebc81c..60b6b898022b 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -24,14 +24,12 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
@@ -320,7 +318,7 @@ xfs_trans_read_buf(
                        if (xfs_error_target == target) {
                                if (((xfs_req_num++) % xfs_error_mod) == 0) {
                                        xfs_buf_relse(bp);
-                                        printk("Returning error!\n");
+                                        cmn_err(CE_DEBUG, "Returning error!\n");
                                        return XFS_ERROR(EIO);
                                }
                        }
@@ -369,7 +367,7 @@ xfs_trans_read_buf(
                                 */
                                if (tp->t_flags & XFS_TRANS_DIRTY)
                                        xfs_force_shutdown(tp->t_mountp,
-                                                           XFS_METADATA_IO_ERROR);
+                                                        SHUTDOWN_META_IO_ERROR);
                                return error;
                        }
                }
@@ -414,7 +412,7 @@ xfs_trans_read_buf(
                xfs_ioerror_alert("xfs_trans_read_buf", mp,
                                  bp, blkno);
                if (tp->t_flags & XFS_TRANS_DIRTY)
-                        xfs_force_shutdown(tp->t_mountp, XFS_METADATA_IO_ERROR);
+                        xfs_force_shutdown(tp->t_mountp, SHUTDOWN_META_IO_ERROR);
                xfs_buf_relse(bp);
                return error;
        }
@@ -423,9 +421,9 @@ xfs_trans_read_buf(
                if (xfs_error_target == target) {
                        if (((xfs_req_num++) % xfs_error_mod) == 0) {
                                xfs_force_shutdown(tp->t_mountp,
-                                                   XFS_METADATA_IO_ERROR);
+                                                   SHUTDOWN_META_IO_ERROR);
                                xfs_buf_relse(bp);
-                                printk("Returning error in trans!\n");
+                                cmn_err(CE_DEBUG, "Returning trans error!\n");
                                return XFS_ERROR(EIO);
                        }
                }
diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c
index 7d7d627f25df..b290270dd4a6 100644
--- a/fs/xfs/xfs_trans_extfree.c
+++ b/fs/xfs/xfs_trans_extfree.c
@@ -22,7 +22,6 @@
 #include "xfs_inum.h"
 #include "xfs_trans.h"
 #include "xfs_sb.h"
-#include "xfs_dir.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_trans_priv.h"
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
index 7c5894d59f81..b8db1d5cde5a 100644
--- a/fs/xfs/xfs_trans_inode.c
+++ b/fs/xfs/xfs_trans_inode.c
@@ -24,14 +24,12 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
diff --git a/fs/xfs/xfs_trans_item.c b/fs/xfs/xfs_trans_item.c
index 1117d600d741..2912aac07c7b 100644
--- a/fs/xfs/xfs_trans_item.c
+++ b/fs/xfs/xfs_trans_item.c
@@ -493,7 +493,7 @@ xfs_trans_add_busy(xfs_trans_t *tp, xfs_agnumber_t ag, xfs_extlen_t idx)
                                break;
                        } else {
                                /* out-of-order vacancy */
-                                printk("OOO vacancy lbcp 0x%p\n", lbcp);
+                                cmn_err(CE_DEBUG, "OOO vacancy lbcp 0x%p\n", lbcp);
                                ASSERT(0);
                        }
                }
diff --git a/fs/xfs/xfs_trans_space.h b/fs/xfs/xfs_trans_space.h
index 7fe3792b18df..4ea2e5074bdd 100644
--- a/fs/xfs/xfs_trans_space.h
+++ b/fs/xfs/xfs_trans_space.h
@@ -30,8 +30,7 @@
          XFS_EXTENTADD_SPACE_RES(mp,w))
 #define XFS_DAENTER_1B(mp,w)    ((w) == XFS_DATA_FORK ? (mp)->m_dirblkfsbs : 1)
 #define XFS_DAENTER_DBS(mp,w)   \
-        (XFS_DA_NODE_MAXDEPTH + \
+        (XFS_DA_NODE_MAXDEPTH + (((w) == XFS_DATA_FORK) ? 2 : 0))
-         ((XFS_DIR_IS_V2(mp) && (w) == XFS_DATA_FORK) ? 2 : 0))
 #define XFS_DAENTER_BLOCKS(mp,w)        \
        (XFS_DAENTER_1B(mp,w) * XFS_DAENTER_DBS(mp,w))
 #define XFS_DAENTER_BMAP1B(mp,w)        \
@@ -41,10 +40,7 @@
 #define XFS_DAENTER_SPACE_RES(mp,w)     \
        (XFS_DAENTER_BLOCKS(mp,w) + XFS_DAENTER_BMAPS(mp,w))
 #define XFS_DAREMOVE_SPACE_RES(mp,w)    XFS_DAENTER_BMAPS(mp,w)
-#define XFS_DIRENTER_MAX_SPLIT(mp,nl)   \
+#define XFS_DIRENTER_MAX_SPLIT(mp,nl)   1
-        (((mp)->m_sb.sb_blocksize == 512 && \
-          XFS_DIR_IS_V1(mp) && \
-          (nl) >= XFS_DIR_LEAF_CAN_DOUBLE_SPLIT_LEN) ? 2 : 1)
 #define XFS_DIRENTER_SPACE_RES(mp,nl)   \
        (XFS_DAENTER_SPACE_RES(mp, XFS_DATA_FORK) * \
         XFS_DIRENTER_MAX_SPLIT(mp,nl))
@@ -57,8 +53,7 @@
 * Space reservation values for various transactions.
 */
 #define XFS_ADDAFORK_SPACE_RES(mp)      \
-        ((mp)->m_dirblkfsbs + \
+        ((mp)->m_dirblkfsbs + XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK))
-         (XFS_DIR_IS_V1(mp) ? 0 : XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK)))
 #define XFS_ATTRRM_SPACE_RES(mp)        \
        XFS_DAREMOVE_SPACE_RES(mp, XFS_ATTR_FORK)
 /* This macro is not used - see inline code in xfs_attr_set */
diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c
index 34654ec6ae10..9014d7e44488 100644
--- a/fs/xfs/xfs_utils.c
+++ b/fs/xfs/xfs_utils.c
@@ -24,12 +24,10 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
@@ -51,10 +49,10 @@
 */
 int
 xfs_get_dir_entry(
-        vname_t         *dentry,
+        bhv_vname_t     *dentry,
        xfs_inode_t     **ipp)
 {
-        vnode_t         *vp;
+        bhv_vnode_t     *vp;
        vp = VNAME_TO_VNODE(dentry);
@@ -69,11 +67,11 @@ int
 xfs_dir_lookup_int(
        bhv_desc_t      *dir_bdp,
        uint            lock_mode,
-        vname_t         *dentry,
+        bhv_vname_t     *dentry,
        xfs_ino_t       *inum,
        xfs_inode_t     **ipp)
 {
-        vnode_t         *dir_vp;
+        bhv_vnode_t     *dir_vp;
        xfs_inode_t     *dp;
        int             error;
@@ -82,8 +80,7 @@ xfs_dir_lookup_int(
        dp = XFS_BHVTOI(dir_bdp);
-        error = XFS_DIR_LOOKUP(dp->i_mount, NULL, dp,
+        error = xfs_dir_lookup(NULL, dp, VNAME(dentry), VNAMELEN(dentry), inum);
-                                VNAME(dentry), VNAMELEN(dentry), inum);
        if (!error) {
                /*
                 * Unlock the directory. We do this because we can't
diff --git a/fs/xfs/xfs_utils.h b/fs/xfs/xfs_utils.h
index 472661a3b6d8..fe953e98afa7 100644
--- a/fs/xfs/xfs_utils.h
+++ b/fs/xfs/xfs_utils.h
@@ -23,9 +23,10 @@
 #define ITRACE(ip)      vn_trace_ref(XFS_ITOV(ip), __FILE__, __LINE__, \
                                (inst_t *)__return_address)
-extern int xfs_rename (bhv_desc_t *, vname_t *, vnode_t *, vname_t *, cred_t *);
+extern int xfs_rename (bhv_desc_t *, bhv_vname_t *, bhv_vnode_t *,
-extern int xfs_get_dir_entry (vname_t *, xfs_inode_t **);
+                        bhv_vname_t *, cred_t *);
-extern int xfs_dir_lookup_int (bhv_desc_t *, uint, vname_t *, xfs_ino_t *,
+extern int xfs_get_dir_entry (bhv_vname_t *, xfs_inode_t **);
+extern int xfs_dir_lookup_int (bhv_desc_t *, uint, bhv_vname_t *, xfs_ino_t *,
                                xfs_inode_t **);
 extern int xfs_truncate_file (xfs_mount_t *, xfs_inode_t *);
 extern int xfs_dir_ialloc (xfs_trans_t **, xfs_inode_t *, mode_t, xfs_nlink_t,
diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c
index 36ea1b2094f2..6c96391f3f1a 100644
--- a/fs/xfs/xfs_vfsops.c
+++ b/fs/xfs/xfs_vfsops.c
@@ -24,7 +24,6 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
@@ -32,7 +31,6 @@
 #include "xfs_bmap_btree.h"
 #include "xfs_ialloc_btree.h"
 #include "xfs_alloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
@@ -131,9 +129,6 @@ xfs_init(void)
 #ifdef XFS_BMBT_TRACE
        xfs_bmbt_trace_buf = ktrace_alloc(XFS_BMBT_TRACE_SIZE, KM_SLEEP);
 #endif
-#ifdef XFS_DIR_TRACE
-        xfs_dir_trace_buf = ktrace_alloc(XFS_DIR_TRACE_SIZE, KM_SLEEP);
-#endif
 #ifdef XFS_ATTR_TRACE
        xfs_attr_trace_buf = ktrace_alloc(XFS_ATTR_TRACE_SIZE, KM_SLEEP);
 #endif
@@ -177,9 +172,6 @@ xfs_cleanup(void)
 #ifdef XFS_ATTR_TRACE
        ktrace_free(xfs_attr_trace_buf);
 #endif
-#ifdef XFS_DIR_TRACE
-        ktrace_free(xfs_dir_trace_buf);
-#endif
 #ifdef XFS_BMBT_TRACE
        ktrace_free(xfs_bmbt_trace_buf);
 #endif
@@ -212,7 +204,7 @@ xfs_cleanup(void)
 */
 STATIC int
 xfs_start_flags(
-        struct vfs              *vfs,
+        struct bhv_vfs          *vfs,
        struct xfs_mount_args   *ap,
        struct xfs_mount        *mp)
 {
@@ -337,7 +329,7 @@ xfs_start_flags(
 */
 STATIC int
 xfs_finish_flags(
-        struct vfs              *vfs,
+        struct bhv_vfs          *vfs,
        struct xfs_mount_args   *ap,
        struct xfs_mount        *mp)
 {
@@ -423,7 +415,7 @@ xfs_mount(
        struct xfs_mount_args   *args,
        cred_t                  *credp)
 {
-        struct vfs              *vfsp = bhvtovfs(bhvp);
+        struct bhv_vfs          *vfsp = bhvtovfs(bhvp);
        struct bhv_desc         *p;
        struct xfs_mount        *mp = XFS_BHVTOM(bhvp);
        struct block_device     *ddev, *logdev, *rtdev;
@@ -552,10 +544,10 @@ xfs_unmount(
        int             flags,
        cred_t          *credp)
 {
-        struct vfs      *vfsp = bhvtovfs(bdp);
+        bhv_vfs_t       *vfsp = bhvtovfs(bdp);
        xfs_mount_t     *mp = XFS_BHVTOM(bdp);
        xfs_inode_t     *rip;
-        vnode_t         *rvp;
+        bhv_vnode_t     *rvp;
        int             unmount_event_wanted = 0;
        int             unmount_event_flags = 0;
        int             xfs_unmountfs_needed = 0;
@@ -665,9 +657,8 @@ xfs_mntupdate(
        int                             *flags,
        struct xfs_mount_args           *args)
 {
-        struct vfs      *vfsp = bhvtovfs(bdp);
+        bhv_vfs_t       *vfsp = bhvtovfs(bdp);
        xfs_mount_t     *mp = XFS_BHVTOM(bdp);
-        int             error;
        if (!(*flags & MS_RDONLY)) {                    /* rw/ro -> rw */
                if (vfsp->vfs_flag & VFS_RDONLY)
@@ -679,7 +670,7 @@ xfs_mntupdate(
                        mp->m_flags &= ~XFS_MOUNT_BARRIER;
                }
        } else if (!(vfsp->vfs_flag & VFS_RDONLY)) {    /* rw -> ro */
-                VFS_SYNC(vfsp, SYNC_FSDATA|SYNC_BDFLUSH|SYNC_ATTR, NULL, error);
+                bhv_vfs_sync(vfsp, SYNC_FSDATA|SYNC_BDFLUSH|SYNC_ATTR, NULL);
                xfs_quiesce_fs(mp);
                xfs_log_unmount_write(mp);
                xfs_unmountfs_writesb(mp);
@@ -702,7 +693,7 @@ xfs_unmount_flush(
        xfs_inode_t     *rip = mp->m_rootip;
        xfs_inode_t     *rbmip;
        xfs_inode_t     *rsumip = NULL;
-        vnode_t         *rvp = XFS_ITOV(rip);
+        bhv_vnode_t     *rvp = XFS_ITOV(rip);
        int             error;
        xfs_ilock(rip, XFS_ILOCK_EXCL);
@@ -781,9 +772,9 @@ fscorrupt_out2:
 STATIC int
 xfs_root(
        bhv_desc_t      *bdp,
-        vnode_t         **vpp)
+        bhv_vnode_t     **vpp)
 {
-        vnode_t         *vp;
+        bhv_vnode_t     *vp;
        vp = XFS_ITOV((XFS_BHVTOM(bdp))->m_rootip);
        VN_HOLD(vp);
@@ -801,8 +792,8 @@ xfs_root(
 STATIC int
 xfs_statvfs(
        bhv_desc_t      *bdp,
-        xfs_statfs_t    *statp,
+        bhv_statvfs_t   *statp,
-        vnode_t         *vp)
+        bhv_vnode_t     *vp)
 {
        __uint64_t      fakeinos;
        xfs_extlen_t    lsize;
@@ -900,7 +891,7 @@ xfs_sync(
 /*
 * xfs sync routine for internal use
 *
- * This routine supports all of the flags defined for the generic VFS_SYNC
+ * This routine supports all of the flags defined for the generic vfs_sync
 * interface as explained above under xfs_sync.  In the interests of not
 * changing interfaces within the 6.5 family, additional internally-
 * required functions are specified within a separate xflags parameter,
@@ -917,7 +908,7 @@ xfs_sync_inodes(
        xfs_inode_t     *ip = NULL;
        xfs_inode_t     *ip_next;
        xfs_buf_t       *bp;
-        vnode_t         *vp = NULL;
+        bhv_vnode_t     *vp = NULL;
        int             error;
        int             last_error;
        uint64_t        fflag;
@@ -1156,9 +1147,9 @@ xfs_sync_inodes(
                        xfs_iunlock(ip, XFS_ILOCK_SHARED);
                        if (XFS_FORCED_SHUTDOWN(mp)) {
-                                VOP_TOSS_PAGES(vp, 0, -1, FI_REMAPF);
+                                bhv_vop_toss_pages(vp, 0, -1, FI_REMAPF);
                        } else {
-                                VOP_FLUSHINVAL_PAGES(vp, 0, -1, FI_REMAPF);
+                                bhv_vop_flushinval_pages(vp, 0, -1, FI_REMAPF);
                        }
                        xfs_ilock(ip, XFS_ILOCK_SHARED);
@@ -1178,8 +1169,8 @@ xfs_sync_inodes(
                                 * across calls to the buffer cache.
                                 */
                                xfs_iunlock(ip, XFS_ILOCK_SHARED);
-                                VOP_FLUSH_PAGES(vp, (xfs_off_t)0, -1,
+                                error = bhv_vop_flush_pages(vp, (xfs_off_t)0,
-                                                        fflag, FI_NONE, error);
+                                                        -1, fflag, FI_NONE);
                                xfs_ilock(ip, XFS_ILOCK_SHARED);
                        }
@@ -1231,9 +1222,7 @@ xfs_sync_inodes(
                                                 * marker and free it.
                                                 */
                                                XFS_MOUNT_ILOCK(mp);
                                                IPOINTER_REMOVE(ip, mp);
                                                XFS_MOUNT_IUNLOCK(mp);
                                                ASSERT(!(lock_flags &
@@ -1421,7 +1410,7 @@ xfs_sync_inodes(
 /*
 * xfs sync routine for internal use
 *
- * This routine supports all of the flags defined for the generic VFS_SYNC
+ * This routine supports all of the flags defined for the generic vfs_sync
 * interface as explained above under xfs_sync.  In the interests of not
 * changing interfaces within the 6.5 family, additional internally-
 * required functions are specified within a separate xflags parameter,
@@ -1574,7 +1563,7 @@ xfs_syncsub(
 STATIC int
 xfs_vget(
        bhv_desc_t      *bdp,
-        vnode_t         **vpp,
+        bhv_vnode_t     **vpp,
        fid_t           *fidp)
 {
        xfs_mount_t     *mp = XFS_BHVTOM(bdp);
@@ -1657,10 +1646,10 @@ xfs_vget(
 #define MNTOPT_NOATTR2  "noattr2"       /* do not use attr2 attribute format */
 STATIC unsigned long
-suffix_strtoul(const char *cp, char **endp, unsigned int base)
+suffix_strtoul(char *s, char **endp, unsigned int base)
 {
        int     last, shift_left_factor = 0;
-        char    *value = (char *)cp;
+        char    *value = s;
        last = strlen(value) - 1;
        if (value[last] == 'K' || value[last] == 'k') {
@@ -1676,7 +1665,7 @@ suffix_strtoul(const char *cp, char **endp, unsigned int base)
                value[last] = '\0';
        }
-        return simple_strtoul(cp, endp, base) << shift_left_factor;
+        return simple_strtoul((const char *)s, endp, base) << shift_left_factor;
 }
 STATIC int
@@ -1686,7 +1675,7 @@ xfs_parseargs(
        struct xfs_mount_args   *args,
        int                     update)
 {
-        struct vfs              *vfsp = bhvtovfs(bhv);
+        bhv_vfs_t               *vfsp = bhvtovfs(bhv);
        char                    *this_char, *value, *eov;
        int                     dsunit, dswidth, vol_dsunit, vol_dswidth;
        int                     iosize;
@@ -1708,42 +1697,48 @@ xfs_parseargs(
                if (!strcmp(this_char, MNTOPT_LOGBUFS)) {
                        if (!value || !*value) {
-                                printk("XFS: %s option requires an argument\n",
+                                cmn_err(CE_WARN,
+                                        "XFS: %s option requires an argument",
                                        this_char);
                                return EINVAL;
                        }
                        args->logbufs = simple_strtoul(value, &eov, 10);
                } else if (!strcmp(this_char, MNTOPT_LOGBSIZE)) {
                        if (!value || !*value) {
-                                printk("XFS: %s option requires an argument\n",
+                                cmn_err(CE_WARN,
+                                        "XFS: %s option requires an argument",
                                        this_char);
                                return EINVAL;
                        }
                        args->logbufsize = suffix_strtoul(value, &eov, 10);
                } else if (!strcmp(this_char, MNTOPT_LOGDEV)) {
                        if (!value || !*value) {
-                                printk("XFS: %s option requires an argument\n",
+                                cmn_err(CE_WARN,
+                                        "XFS: %s option requires an argument",
                                        this_char);
                                return EINVAL;
                        }
                        strncpy(args->logname, value, MAXNAMELEN);
                } else if (!strcmp(this_char, MNTOPT_MTPT)) {
                        if (!value || !*value) {
-                                printk("XFS: %s option requires an argument\n",
+                                cmn_err(CE_WARN,
+                                        "XFS: %s option requires an argument",
                                        this_char);
                                return EINVAL;
                        }
                        strncpy(args->mtpt, value, MAXNAMELEN);
                } else if (!strcmp(this_char, MNTOPT_RTDEV)) {
                        if (!value || !*value) {
-                                printk("XFS: %s option requires an argument\n",
+                                cmn_err(CE_WARN,
+                                        "XFS: %s option requires an argument",
                                        this_char);
                                return EINVAL;
                        }
                        strncpy(args->rtname, value, MAXNAMELEN);
                } else if (!strcmp(this_char, MNTOPT_BIOSIZE)) {
                        if (!value || !*value) {
-                                printk("XFS: %s option requires an argument\n",
+                                cmn_err(CE_WARN,
+                                        "XFS: %s option requires an argument",
                                        this_char);
                                return EINVAL;
                        }
@@ -1752,7 +1747,8 @@ xfs_parseargs(
                        args->iosizelog = (uint8_t) iosize;
                } else if (!strcmp(this_char, MNTOPT_ALLOCSIZE)) {
                        if (!value || !*value) {
-                                printk("XFS: %s option requires an argument\n",
+                                cmn_err(CE_WARN,
+                                        "XFS: %s option requires an argument",
                                        this_char);
                                return EINVAL;
                        }
@@ -1761,7 +1757,8 @@ xfs_parseargs(
                        args->iosizelog = ffs(iosize) - 1;
                } else if (!strcmp(this_char, MNTOPT_IHASHSIZE)) {
                        if (!value || !*value) {
-                                printk("XFS: %s option requires an argument\n",
+                                cmn_err(CE_WARN,
+                                        "XFS: %s option requires an argument",
                                        this_char);
                                return EINVAL;
                        }
@@ -1782,7 +1779,8 @@ xfs_parseargs(
                } else if (!strcmp(this_char, MNTOPT_INO64)) {
                        args->flags |= XFSMNT_INO64;
 #if !XFS_BIG_INUMS
-                        printk("XFS: %s option not allowed on this system\n",
+                        cmn_err(CE_WARN,
+                                "XFS: %s option not allowed on this system",
                                this_char);
                        return EINVAL;
 #endif
@@ -1792,14 +1790,16 @@ xfs_parseargs(
                        args->flags |= XFSMNT_SWALLOC;
                } else if (!strcmp(this_char, MNTOPT_SUNIT)) {
                        if (!value || !*value) {
-                                printk("XFS: %s option requires an argument\n",
+                                cmn_err(CE_WARN,
+                                        "XFS: %s option requires an argument",
                                        this_char);
                                return EINVAL;
                        }
                        dsunit = simple_strtoul(value, &eov, 10);
                } else if (!strcmp(this_char, MNTOPT_SWIDTH)) {
                        if (!value || !*value) {
-                                printk("XFS: %s option requires an argument\n",
+                                cmn_err(CE_WARN,
+                                        "XFS: %s option requires an argument",
                                        this_char);
                                return EINVAL;
                        }
@@ -1807,7 +1807,8 @@ xfs_parseargs(
                } else if (!strcmp(this_char, MNTOPT_64BITINODE)) {
                        args->flags &= ~XFSMNT_32BITINODES;
 #if !XFS_BIG_INUMS
-                        printk("XFS: %s option not allowed on this system\n",
+                        cmn_err(CE_WARN,
+                                "XFS: %s option not allowed on this system",
                                this_char);
                        return EINVAL;
 #endif
@@ -1831,36 +1832,41 @@ xfs_parseargs(
                        args->flags &= ~XFSMNT_ATTR2;
                } else if (!strcmp(this_char, "osyncisdsync")) {
                        /* no-op, this is now the default */
-printk("XFS: osyncisdsync is now the default, option is deprecated.\n");
+                        cmn_err(CE_WARN,
+        "XFS: osyncisdsync is now the default, option is deprecated.");
                } else if (!strcmp(this_char, "irixsgid")) {
-printk("XFS: irixsgid is now a sysctl(2) variable, option is deprecated.\n");
+                        cmn_err(CE_WARN,
+        "XFS: irixsgid is now a sysctl(2) variable, option is deprecated.");
                } else {
-                        printk("XFS: unknown mount option [%s].\n", this_char);
+                        cmn_err(CE_WARN,
+                                "XFS: unknown mount option [%s].", this_char);
                        return EINVAL;
                }
        }
        if (args->flags & XFSMNT_NORECOVERY) {
                if ((vfsp->vfs_flag & VFS_RDONLY) == 0) {
-                        printk("XFS: no-recovery mounts must be read-only.\n");
+                        cmn_err(CE_WARN,
+                                "XFS: no-recovery mounts must be read-only.");
                        return EINVAL;
                }
        }
        if ((args->flags & XFSMNT_NOALIGN) && (dsunit || dswidth)) {
-                printk(
+                cmn_err(CE_WARN,
-        "XFS: sunit and swidth options incompatible with the noalign option\n");
+        "XFS: sunit and swidth options incompatible with the noalign option");
                return EINVAL;
        }
        if ((dsunit && !dswidth) || (!dsunit && dswidth)) {
-                printk("XFS: sunit and swidth must be specified together\n");
+                cmn_err(CE_WARN,
+                        "XFS: sunit and swidth must be specified together");
                return EINVAL;
        }
        if (dsunit && (dswidth % dsunit != 0)) {
-                printk(
+                cmn_err(CE_WARN,
-        "XFS: stripe width (%d) must be a multiple of the stripe unit (%d)\n",
+        "XFS: stripe width (%d) must be a multiple of the stripe unit (%d)",
                        dswidth, dsunit);
                return EINVAL;
        }
@@ -1907,7 +1913,7 @@ xfs_showargs(
        };
        struct proc_xfs_info    *xfs_infop;
        struct xfs_mount        *mp = XFS_BHVTOM(bhv);
-        struct vfs              *vfsp = XFS_MTOVFS(mp);
+        struct bhv_vfs          *vfsp = XFS_MTOVFS(mp);
        for (xfs_infop = xfs_info; xfs_infop->flag; xfs_infop++) {
                if (mp->m_flags & xfs_infop->flag)
@@ -1967,7 +1973,7 @@ xfs_freeze(
 }
-vfsops_t xfs_vfsops = {
+bhv_vfsops_t xfs_vfsops = {
        BHV_IDENTITY_INIT(VFS_BHV_XFS,VFS_POSITION_XFS),
        .vfs_parseargs          = xfs_parseargs,
        .vfs_showargs           = xfs_showargs,
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 7027ae68ee38..23cfa5837728 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
 * All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
@@ -16,8 +16,6 @@
 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */
-#include <linux/capability.h>
 #include "xfs.h"
 #include "xfs_fs.h"
 #include "xfs_types.h"
@@ -27,7 +25,6 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
@@ -35,13 +32,11 @@
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_inode_item.h"
-#include "xfs_dir_leaf.h"
 #include "xfs_itable.h"
 #include "xfs_btree.h"
 #include "xfs_ialloc.h"
@@ -58,32 +53,14 @@
 #include "xfs_log_priv.h"
 #include "xfs_mac.h"
-/*
- * The maximum pathlen is 1024 bytes. Since the minimum file system
- * blocksize is 512 bytes, we can get a max of 2 extents back from
- * bmapi.
- */
-#define SYMLINK_MAPS 2
-/*
- * For xfs, we check that the file isn't too big to be opened by this kernel.
- * No other open action is required for regular files.  Devices are handled
- * through the specfs file system, pipes through fifofs.  Device and
- * fifo vnodes are "wrapped" by specfs and fifofs vnodes, respectively,
- * when a new vnode is first looked up or created.
- */
 STATIC int
 xfs_open(
        bhv_desc_t      *bdp,
        cred_t          *credp)
 {
        int             mode;
-        vnode_t         *vp;
+        bhv_vnode_t     *vp = BHV_TO_VNODE(bdp);
-        xfs_inode_t     *ip;
+        xfs_inode_t     *ip = XFS_BHVTOI(bdp);
-        vp = BHV_TO_VNODE(bdp);
-        ip = XFS_BHVTOI(bdp);
        if (XFS_FORCED_SHUTDOWN(ip->i_mount))
                return XFS_ERROR(EIO);
@@ -101,6 +78,35 @@ xfs_open(
        return 0;
 }
+STATIC int
+xfs_close(
+        bhv_desc_t      *bdp,
+        int             flags,
+        lastclose_t     lastclose,
+        cred_t          *credp)
+{
+        bhv_vnode_t     *vp = BHV_TO_VNODE(bdp);
+        xfs_inode_t     *ip = XFS_BHVTOI(bdp);
+        if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+                return XFS_ERROR(EIO);
+        if (lastclose != L_TRUE || !VN_ISREG(vp))
+                return 0;
+        /*
+         * If we previously truncated this file and removed old data in
+         * the process, we want to initiate "early" writeout on the last
+         * close.  This is an attempt to combat the notorious NULL files
+         * problem which is particularly noticeable from a truncate down,
+         * buffered (re-)write (delalloc), followed by a crash.  What we
+         * are effectively doing here is significantly reducing the time
+         * window where we'd otherwise be exposed to that problem.
+         */
+        if (VUNTRUNCATE(vp) && VN_DIRTY(vp) && ip->i_delayed_blks > 0)
+                return bhv_vop_flush_pages(vp, 0, -1, XFS_B_ASYNC, FI_NONE);
+        return 0;
+}
 /*
 * xfs_getattr
@@ -108,13 +114,13 @@ xfs_open(
 STATIC int
 xfs_getattr(
        bhv_desc_t      *bdp,
-        vattr_t         *vap,
+        bhv_vattr_t     *vap,
        int             flags,
        cred_t          *credp)
 {
        xfs_inode_t     *ip;
        xfs_mount_t     *mp;
-        vnode_t         *vp;
+        bhv_vnode_t     *vp;
        vp  = BHV_TO_VNODE(bdp);
        vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
@@ -241,7 +247,7 @@ xfs_getattr(
 int
 xfs_setattr(
        bhv_desc_t              *bdp,
-        vattr_t                 *vap,
+        bhv_vattr_t             *vap,
        int                     flags,
        cred_t                  *credp)
 {
@@ -255,7 +261,7 @@ xfs_setattr(
        uid_t                   uid=0, iuid=0;
        gid_t                   gid=0, igid=0;
        int                     timeflags = 0;
-        vnode_t                 *vp;
+        bhv_vnode_t             *vp;
        xfs_prid_t              projid=0, iprojid=0;
        int                     mandlock_before, mandlock_after;
        struct xfs_dquot        *udqp, *gdqp, *olddquot1, *olddquot2;
@@ -347,7 +353,6 @@ xfs_setattr(
         */
        tp = NULL;
        lock_flags = XFS_ILOCK_EXCL;
-        ASSERT(flags & ATTR_NOLOCK ? flags & ATTR_DMI : 1);
        if (flags & ATTR_NOLOCK)
                need_iolock = 0;
        if (!(mask & XFS_AT_SIZE)) {
@@ -666,9 +671,17 @@ xfs_setattr(
                                            ((ip->i_d.di_nlink != 0 ||
                                              !(mp->m_flags & XFS_MOUNT_WSYNC))
                                             ? 1 : 0));
-                        if (code) {
+                        if (code)
                                goto abort_return;
-                        }
+                        /*
+                         * Truncated "down", so we're removing references
+                         * to old data here - if we now delay flushing for
+                         * a long time, we expose ourselves unduly to the
+                         * notorious NULL files problem.  So, we mark this
+                         * vnode and flush it when the file is closed, and
+                         * do not wait the usual (long) time for writeout.
+                         */
+                        VTRUNCATE(vp);
                }
                /*
                 * Have to do this even if the file's size doesn't change.
@@ -800,6 +813,8 @@ xfs_setattr(
                                di_flags |= XFS_DIFLAG_NODUMP;
                        if (vap->va_xflags & XFS_XFLAG_PROJINHERIT)
                                di_flags |= XFS_DIFLAG_PROJINHERIT;
+                        if (vap->va_xflags & XFS_XFLAG_NODEFRAG)
+                                di_flags |= XFS_DIFLAG_NODEFRAG;
                        if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) {
                                if (vap->va_xflags & XFS_XFLAG_RTINHERIT)
                                        di_flags |= XFS_DIFLAG_RTINHERIT;
@@ -869,7 +884,7 @@ xfs_setattr(
         */
        mandlock_after = MANDLOCK(vp, ip->i_d.di_mode);
        if (mandlock_before != mandlock_after) {
-                VOP_VNODE_CHANGE(vp, VCHANGE_FLAGS_ENF_LOCKING,
+                bhv_vop_vnode_change(vp, VCHANGE_FLAGS_ENF_LOCKING,
                                 mandlock_after);
        }
@@ -936,6 +951,13 @@ xfs_access(
 /*
+ * The maximum pathlen is 1024 bytes. Since the minimum file system
+ * blocksize is 512 bytes, we can get a max of 2 extents back from
+ * bmapi.
+ */
+#define SYMLINK_MAPS 2
+/*
 * xfs_readlink
 *
 */
@@ -950,7 +972,7 @@ xfs_readlink(
        int             count;
        xfs_off_t       offset;
        int             pathlen;
-        vnode_t         *vp;
+        bhv_vnode_t     *vp;
        int             error = 0;
        xfs_mount_t     *mp;
        int             nmaps;
@@ -1000,7 +1022,7 @@ xfs_readlink(
                nmaps = SYMLINK_MAPS;
                error = xfs_bmapi(NULL, ip, 0, XFS_B_TO_FSB(mp, pathlen),
-                                  0, NULL, 0, mval, &nmaps, NULL);
+                                  0, NULL, 0, mval, &nmaps, NULL, NULL);
                if (error) {
                        goto error_return;
@@ -1208,8 +1230,8 @@ xfs_inactive_free_eofblocks(
        nimaps = 1;
        xfs_ilock(ip, XFS_ILOCK_SHARED);
-        error = xfs_bmapi(NULL, ip, end_fsb, map_len, 0,
+        error = XFS_BMAPI(mp, NULL, &ip->i_iocore, end_fsb, map_len, 0,
-                          NULL, 0, &imap, &nimaps, NULL);
+                          NULL, 0, &imap, &nimaps, NULL, NULL);
        xfs_iunlock(ip, XFS_ILOCK_SHARED);
        if (!error && (nimaps != 0) &&
@@ -1338,7 +1360,7 @@ xfs_inactive_symlink_rmt(
        nmaps = ARRAY_SIZE(mval);
        if ((error = xfs_bmapi(tp, ip, 0, XFS_B_TO_FSB(mp, size),
                        XFS_BMAPI_METADATA, &first_block, 0, mval, &nmaps,
-                        &free_list)))
+                        &free_list, NULL)))
                goto error0;
        /*
         * Invalidate the block(s).
@@ -1353,7 +1375,7 @@ xfs_inactive_symlink_rmt(
         * Unmap the dead block(s) to the free_list.
         */
        if ((error = xfs_bunmapi(tp, ip, 0, size, XFS_BMAPI_METADATA, nmaps,
-                        &first_block, &free_list, &done)))
+                        &first_block, &free_list, NULL, &done)))
                goto error1;
        ASSERT(done);
        /*
@@ -1469,9 +1491,6 @@ xfs_inactive_symlink_local(
        return 0;
 }
-/*
- *
- */
 STATIC int
 xfs_inactive_attrs(
        xfs_inode_t     *ip,
@@ -1524,16 +1543,16 @@ xfs_release(
        bhv_desc_t      *bdp)
 {
        xfs_inode_t     *ip;
-        vnode_t         *vp;
+        bhv_vnode_t     *vp;
        xfs_mount_t     *mp;
        int             error;
        vp = BHV_TO_VNODE(bdp);
        ip = XFS_BHVTOI(bdp);
+        mp = ip->i_mount;
-        if (!VN_ISREG(vp) || (ip->i_d.di_mode == 0)) {
+        if (!VN_ISREG(vp) || (ip->i_d.di_mode == 0))
                return 0;
-        }
        /* If this is a read-only mount, don't do this (would generate I/O) */
        if (vp->v_vfsp->vfs_flag & VFS_RDONLY)
@@ -1545,8 +1564,6 @@ xfs_release(
                return 0;
 #endif
-        mp = ip->i_mount;
        if (ip->i_d.di_nlink != 0) {
                if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
                     ((ip->i_d.di_size > 0) || (VN_CACHED(vp) > 0 ||
@@ -1579,8 +1596,8 @@ xfs_inactive(
        cred_t          *credp)
 {
        xfs_inode_t     *ip;
-        vnode_t         *vp;
+        bhv_vnode_t     *vp;
-        xfs_bmap_free_t free_list; 
+        xfs_bmap_free_t free_list;
        xfs_fsblock_t   first_block;
        int             committed;
        xfs_trans_t     *tp;
@@ -1760,7 +1777,7 @@ xfs_inactive(
                        cmn_err(CE_NOTE,
                "xfs_inactive:  xfs_ifree() returned an error = %d on %s",
                                error, mp->m_fsname);
-                        xfs_force_shutdown(mp, XFS_METADATA_IO_ERROR);
+                        xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
                }
                xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
        } else {
@@ -1795,17 +1812,17 @@ xfs_inactive(
 STATIC int
 xfs_lookup(
        bhv_desc_t              *dir_bdp,
-        vname_t                 *dentry,
+        bhv_vname_t             *dentry,
-        vnode_t                 **vpp,
+        bhv_vnode_t             **vpp,
        int                     flags,
-        vnode_t                 *rdir,
+        bhv_vnode_t             *rdir,
        cred_t                  *credp)
 {
        xfs_inode_t             *dp, *ip;
        xfs_ino_t               e_inum;
        int                     error;
        uint                    lock_mode;
-        vnode_t                 *dir_vp;
+        bhv_vnode_t             *dir_vp;
        dir_vp = BHV_TO_VNODE(dir_bdp);
        vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);
@@ -1832,15 +1849,15 @@ xfs_lookup(
 STATIC int
 xfs_create(
        bhv_desc_t              *dir_bdp,
-        vname_t                 *dentry,
+        bhv_vname_t             *dentry,
-        vattr_t                 *vap,
+        bhv_vattr_t             *vap,
-        vnode_t                 **vpp,
+        bhv_vnode_t             **vpp,
        cred_t                  *credp)
 {
        char                    *name = VNAME(dentry);
-        vnode_t                 *dir_vp;
+        bhv_vnode_t             *dir_vp;
        xfs_inode_t             *dp, *ip;
-        vnode_t                 *vp=NULL;
+        bhv_vnode_t             *vp = NULL;
        xfs_trans_t             *tp;
        xfs_mount_t             *mp;
        xfs_dev_t               rdev;
@@ -1938,8 +1955,7 @@ xfs_create(
        if (error)
                goto error_return;
-        if (resblks == 0 &&
+        if (resblks == 0 && (error = xfs_dir_canenter(tp, dp, name, namelen)))
-            (error = XFS_DIR_CANENTER(mp, tp, dp, name, namelen)))
                goto error_return;
        rdev = (vap->va_mask & XFS_AT_RDEV) ? vap->va_rdev : 0;
        error = xfs_dir_ialloc(&tp, dp, vap->va_mode, 1,
@@ -1970,9 +1986,9 @@ xfs_create(
        xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
        dp_joined_to_trans = B_TRUE;
-        error = XFS_DIR_CREATENAME(mp, tp, dp, name, namelen, ip->i_ino,
+        error = xfs_dir_createname(tp, dp, name, namelen, ip->i_ino,
-                &first_block, &free_list,
+                                        &first_block, &free_list, resblks ?
-                resblks ? resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
+                                        resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
        if (error) {
                ASSERT(error != ENOSPC);
                goto abort_return;
@@ -2026,7 +2042,7 @@ xfs_create(
         * Propagate the fact that the vnode changed after the
         * xfs_inode locks have been released.
         */
-        VOP_VNODE_CHANGE(vp, VCHANGE_FLAGS_TRUNCATED, 3);
+        bhv_vop_vnode_change(vp, VCHANGE_FLAGS_TRUNCATED, 3);
        *vpp = vp;
@@ -2107,7 +2123,7 @@ int xfs_rm_attempts;
 STATIC int
 xfs_lock_dir_and_entry(
        xfs_inode_t     *dp,
-        vname_t         *dentry,
+        bhv_vname_t     *dentry,
        xfs_inode_t     *ip)    /* inode of entry 'name' */
 {
        int             attempts;
@@ -2321,10 +2337,10 @@ int remove_which_error_return = 0;
 STATIC int
 xfs_remove(
        bhv_desc_t              *dir_bdp,
-        vname_t                 *dentry,
+        bhv_vname_t             *dentry,
        cred_t                  *credp)
 {
-        vnode_t                 *dir_vp;
+        bhv_vnode_t             *dir_vp;
        char                    *name = VNAME(dentry);
        xfs_inode_t             *dp, *ip;
        xfs_trans_t             *tp = NULL;
@@ -2448,8 +2464,8 @@ xfs_remove(
         * Entry must exist since we did a lookup in xfs_lock_dir_and_entry.
         */
        XFS_BMAP_INIT(&free_list, &first_block);
-        error = XFS_DIR_REMOVENAME(mp, tp, dp, name, namelen, ip->i_ino,
+        error = xfs_dir_removename(tp, dp, name, namelen, ip->i_ino,
-                &first_block, &free_list, 0);
+                                        &first_block, &free_list, 0);
        if (error) {
                ASSERT(error != ENOENT);
                REMOVE_DEBUG_TRACE(__LINE__);
@@ -2511,7 +2527,7 @@ xfs_remove(
        /*
         * Let interposed file systems know about removed links.
         */
-        VOP_LINK_REMOVED(XFS_ITOV(ip), dir_vp, link_zero);
+        bhv_vop_link_removed(XFS_ITOV(ip), dir_vp, link_zero);
        IRELE(ip);
@@ -2564,8 +2580,8 @@ xfs_remove(
 STATIC int
 xfs_link(
        bhv_desc_t              *target_dir_bdp,
-        vnode_t                 *src_vp,
+        bhv_vnode_t             *src_vp,
-        vname_t                 *dentry,
+        bhv_vname_t             *dentry,
        cred_t                  *credp)
 {
        xfs_inode_t             *tdp, *sip;
@@ -2577,7 +2593,7 @@ xfs_link(
        xfs_fsblock_t           first_block;
        int                     cancel_flags;
        int                     committed;
-        vnode_t                 *target_dir_vp;
+        bhv_vnode_t             *target_dir_vp;
        int                     resblks;
        char                    *target_name = VNAME(dentry);
        int                     target_namelen;
@@ -2587,8 +2603,7 @@ xfs_link(
        vn_trace_entry(src_vp, __FUNCTION__, (inst_t *)__return_address);
        target_namelen = VNAMELEN(dentry);
-        if (VN_ISDIR(src_vp))
+        ASSERT(!VN_ISDIR(src_vp));
-                return XFS_ERROR(EPERM);
        sip = xfs_vtoi(src_vp);
        tdp = XFS_BHVTOI(target_dir_bdp);
@@ -2668,13 +2683,12 @@ xfs_link(
        }
        if (resblks == 0 &&
-            (error = XFS_DIR_CANENTER(mp, tp, tdp, target_name,
+            (error = xfs_dir_canenter(tp, tdp, target_name, target_namelen)))
-                        target_namelen)))
                goto error_return;
        XFS_BMAP_INIT(&free_list, &first_block);
-        error = XFS_DIR_CREATENAME(mp, tp, tdp, target_name, target_namelen,
+        error = xfs_dir_createname(tp, tdp, target_name, target_namelen,
                                   sip->i_ino, &first_block, &free_list,
                                   resblks);
        if (error)
@@ -2684,9 +2698,8 @@ xfs_link(
        xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
        error = xfs_bumplink(tp, sip);
-        if (error) {
+        if (error)
                goto abort_return;
-        }
        /*
         * If this is a synchronous mount, make sure that the
@@ -2704,9 +2717,8 @@ xfs_link(
        }
        error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
-        if (error) {
+        if (error)
                goto std_return;
-        }
        /* Fall through to std_return with error = 0. */
 std_return:
@@ -2727,6 +2739,8 @@ std_return:
        xfs_trans_cancel(tp, cancel_flags);
        goto std_return;
 }
 /*
 * xfs_mkdir
 *
@@ -2734,15 +2748,15 @@ std_return:
 STATIC int
 xfs_mkdir(
        bhv_desc_t              *dir_bdp,
-        vname_t                 *dentry,
+        bhv_vname_t             *dentry,
-        vattr_t                 *vap,
+        bhv_vattr_t             *vap,
-        vnode_t                 **vpp,
+        bhv_vnode_t             **vpp,
        cred_t                  *credp)
 {
        char                    *dir_name = VNAME(dentry);
        xfs_inode_t             *dp;
        xfs_inode_t             *cdp;   /* inode of created dir */
-        vnode_t                 *cvp;   /* vnode of created dir */
+        bhv_vnode_t             *cvp;   /* vnode of created dir */
        xfs_trans_t             *tp;
        xfs_mount_t             *mp;
        int                     cancel_flags;
@@ -2750,7 +2764,7 @@ xfs_mkdir(
        int                     committed;
        xfs_bmap_free_t         free_list;
        xfs_fsblock_t           first_block;
-        vnode_t                 *dir_vp;
+        bhv_vnode_t             *dir_vp;
        boolean_t               dp_joined_to_trans;
        boolean_t               created = B_FALSE;
        int                     dm_event_sent = 0;
@@ -2840,7 +2854,7 @@ xfs_mkdir(
                goto error_return;
        if (resblks == 0 &&
-            (error = XFS_DIR_CANENTER(mp, tp, dp, dir_name, dir_namelen)))
+            (error = xfs_dir_canenter(tp, dp, dir_name, dir_namelen)))
                goto error_return;
        /*
         * create the directory inode.
@@ -2867,9 +2881,9 @@ xfs_mkdir(
        XFS_BMAP_INIT(&free_list, &first_block);
-        error = XFS_DIR_CREATENAME(mp, tp, dp, dir_name, dir_namelen,
+        error = xfs_dir_createname(tp, dp, dir_name, dir_namelen, cdp->i_ino,
-                        cdp->i_ino, &first_block, &free_list,
+                                   &first_block, &free_list, resblks ?
-                        resblks ? resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
+                                   resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
        if (error) {
                ASSERT(error != ENOSPC);
                goto error1;
@@ -2883,16 +2897,14 @@ xfs_mkdir(
         */
        dp->i_gen++;
-        error = XFS_DIR_INIT(mp, tp, cdp, dp);
+        error = xfs_dir_init(tp, cdp, dp);
-        if (error) {
+        if (error)
                goto error2;
-        }
        cdp->i_gen = 1;
        error = xfs_bumplink(tp, dp);
-        if (error) {
+        if (error)
                goto error2;
-        }
        cvp = XFS_ITOV(cdp);
@@ -2969,7 +2981,7 @@ std_return:
 STATIC int
 xfs_rmdir(
        bhv_desc_t              *dir_bdp,
-        vname_t                 *dentry,
+        bhv_vname_t             *dentry,
        cred_t                  *credp)
 {
        char                    *name = VNAME(dentry);
@@ -2982,7 +2994,7 @@ xfs_rmdir(
        xfs_fsblock_t           first_block;
        int                     cancel_flags;
        int                     committed;
-        vnode_t                 *dir_vp;
+        bhv_vnode_t             *dir_vp;
        int                     dm_di_mode = 0;
        int                     last_cdp_link;
        int                     namelen;
@@ -3101,16 +3113,15 @@ xfs_rmdir(
                error = XFS_ERROR(ENOTEMPTY);
                goto error_return;
        }
-        if (!XFS_DIR_ISEMPTY(mp, cdp)) {
+        if (!xfs_dir_isempty(cdp)) {
                error = XFS_ERROR(ENOTEMPTY);
                goto error_return;
        }
-        error = XFS_DIR_REMOVENAME(mp, tp, dp, name, namelen, cdp->i_ino,
+        error = xfs_dir_removename(tp, dp, name, namelen, cdp->i_ino,
-                &first_block, &free_list, resblks);
+                                        &first_block, &free_list, resblks);
-        if (error) {
+        if (error)
                goto error1;
-        }
        xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
@@ -3181,7 +3192,7 @@ xfs_rmdir(
        /*
         * Let interposed file systems know about removed links.
         */
-        VOP_LINK_REMOVED(XFS_ITOV(cdp), dir_vp, last_cdp_link);
+        bhv_vop_link_removed(XFS_ITOV(cdp), dir_vp, last_cdp_link);
        IRELE(cdp);
@@ -3209,8 +3220,6 @@ xfs_rmdir(
 /*
- * xfs_readdir
- *
 * Read dp's entries starting at uiop->uio_offset and translate them into
 * bufsize bytes worth of struct dirents starting at bufbase.
 */
@@ -3230,28 +3239,23 @@ xfs_readdir(
                                               (inst_t *)__return_address);
        dp = XFS_BHVTOI(dir_bdp);
-        if (XFS_FORCED_SHUTDOWN(dp->i_mount)) {
+        if (XFS_FORCED_SHUTDOWN(dp->i_mount))
                return XFS_ERROR(EIO);
-        }
        lock_mode = xfs_ilock_map_shared(dp);
-        error = XFS_DIR_GETDENTS(dp->i_mount, tp, dp, uiop, eofp);
+        error = xfs_dir_getdents(tp, dp, uiop, eofp);
        xfs_iunlock_map_shared(dp, lock_mode);
        return error;
 }
-/*
- * xfs_symlink
- *
- */
 STATIC int
 xfs_symlink(
        bhv_desc_t              *dir_bdp,
-        vname_t                 *dentry,
+        bhv_vname_t             *dentry,
-        vattr_t                 *vap,
+        bhv_vattr_t             *vap,
        char                    *target_path,
-        vnode_t                 **vpp,
+        bhv_vnode_t             **vpp,
        cred_t                  *credp)
 {
        xfs_trans_t             *tp;
@@ -3263,7 +3267,7 @@ xfs_symlink(
        xfs_bmap_free_t         free_list;
        xfs_fsblock_t           first_block;
        boolean_t               dp_joined_to_trans;
-        vnode_t                 *dir_vp;
+        bhv_vnode_t             *dir_vp;
        uint                    cancel_flags;
        int                     committed;
        xfs_fileoff_t           first_fsb;
@@ -3308,7 +3312,7 @@ xfs_symlink(
                int len, total;
                char *path;
-                for(total = 0, path = target_path; total < pathlen;) {
+                for (total = 0, path = target_path; total < pathlen;) {
                        /*
                         * Skip any slashes.
                         */
@@ -3402,7 +3406,7 @@ xfs_symlink(
         * Check for ability to enter directory entry, if no space reserved.
         */
        if (resblks == 0 &&
-            (error = XFS_DIR_CANENTER(mp, tp, dp, link_name, link_namelen)))
+            (error = xfs_dir_canenter(tp, dp, link_name, link_namelen)))
                goto error_return;
        /*
         * Initialize the bmap freelist prior to calling either
@@ -3457,7 +3461,7 @@ xfs_symlink(
                error = xfs_bmapi(tp, ip, first_fsb, fs_blocks,
                                  XFS_BMAPI_WRITE | XFS_BMAPI_METADATA,
                                  &first_block, resblks, mval, &nmaps,
-                                  &free_list);
+                                  &free_list, NULL);
                if (error) {
                        goto error1;
                }
@@ -3489,11 +3493,10 @@ xfs_symlink(
        /*
         * Create the directory entry for the symlink.
         */
-        error = XFS_DIR_CREATENAME(mp, tp, dp, link_name, link_namelen,
+        error = xfs_dir_createname(tp, dp, link_name, link_namelen, ip->i_ino,
-                        ip->i_ino, &first_block, &free_list, resblks);
+                                   &first_block, &free_list, resblks);
-        if (error) {
+        if (error)
                goto error1;
-        }
        xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
        xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
@@ -3541,7 +3544,7 @@ std_return:
        }
        if (!error) {
-                vnode_t *vp;
+                bhv_vnode_t *vp;
                ASSERT(ip);
                vp = XFS_ITOV(ip);
@@ -3606,10 +3609,10 @@ xfs_fid2(
 int
 xfs_rwlock(
        bhv_desc_t      *bdp,
-        vrwlock_t       locktype)
+        bhv_vrwlock_t   locktype)
 {
        xfs_inode_t     *ip;
-        vnode_t         *vp;
+        bhv_vnode_t     *vp;
        vp = BHV_TO_VNODE(bdp);
        if (VN_ISDIR(vp))
@@ -3637,10 +3640,10 @@ xfs_rwlock(
 void
 xfs_rwunlock(
        bhv_desc_t      *bdp,
-        vrwlock_t       locktype)
+        bhv_vrwlock_t   locktype)
 {
        xfs_inode_t     *ip;
-        vnode_t         *vp;
+        bhv_vnode_t     *vp;
        vp = BHV_TO_VNODE(bdp);
        if (VN_ISDIR(vp))
@@ -3744,7 +3747,6 @@ xfs_inode_flush(
        return error;
 }
 int
 xfs_set_dmattrs (
        bhv_desc_t      *bdp,
@@ -3785,16 +3787,12 @@ xfs_set_dmattrs (
        return error;
 }
-/*
- * xfs_reclaim
- */
 STATIC int
 xfs_reclaim(
        bhv_desc_t      *bdp)
 {
        xfs_inode_t     *ip;
-        vnode_t         *vp;
+        bhv_vnode_t     *vp;
        vp = BHV_TO_VNODE(bdp);
        ip = XFS_BHVTOI(bdp);
@@ -3849,7 +3847,7 @@ xfs_finish_reclaim(
        int             sync_mode)
 {
        xfs_ihash_t     *ih = ip->i_hash;
-        vnode_t         *vp = XFS_ITOV_NULL(ip);
+        bhv_vnode_t     *vp = XFS_ITOV_NULL(ip);
        int             error;
        if (vp && VN_BAD(vp))
@@ -4116,10 +4114,10 @@ retry:
                 * Issue the xfs_bmapi() call to allocate the blocks
                 */
                XFS_BMAP_INIT(&free_list, &firstfsb);
-                error = xfs_bmapi(tp, ip, startoffset_fsb,
+                error = XFS_BMAPI(mp, tp, &ip->i_iocore, startoffset_fsb,
                                  allocatesize_fsb, bmapi_flag,
                                  &firstfsb, 0, imapp, &nimaps,
-                                  &free_list);
+                                  &free_list, NULL);
                if (error) {
                        goto error0;
                }
@@ -4199,8 +4197,8 @@ xfs_zero_remaining_bytes(
        for (offset = startoff; offset <= endoff; offset = lastoffset + 1) {
                offset_fsb = XFS_B_TO_FSBT(mp, offset);
                nimap = 1;
-                error = xfs_bmapi(NULL, ip, offset_fsb, 1, 0, NULL, 0, &imap,
+                error = XFS_BMAPI(mp, NULL, &ip->i_iocore, offset_fsb, 1, 0,
-                        &nimap, NULL);
+                        NULL, 0, &imap, &nimap, NULL, NULL);
                if (error || nimap < 1)
                        break;
                ASSERT(imap.br_blockcount >= 1);
@@ -4259,7 +4257,7 @@ xfs_free_file_space(
        xfs_off_t               len,
        int                     attr_flags)
 {
-        vnode_t                 *vp;
+        bhv_vnode_t             *vp;
        int                     committed;
        int                     done;
        xfs_off_t               end_dmi_offset;
@@ -4308,7 +4306,6 @@ xfs_free_file_space(
                        return error;
        }
-        ASSERT(attr_flags & ATTR_NOLOCK ? attr_flags & ATTR_DMI : 1);
        if (attr_flags & ATTR_NOLOCK)
                need_iolock = 0;
        if (need_iolock) {
@@ -4326,7 +4323,7 @@ xfs_free_file_space(
        if (VN_CACHED(vp) != 0) {
                xfs_inval_cached_trace(&ip->i_iocore, ioffset, -1,
                                ctooff(offtoct(ioffset)), -1);
-                VOP_FLUSHINVAL_PAGES(vp, ctooff(offtoct(ioffset)),
+                bhv_vop_flushinval_pages(vp, ctooff(offtoct(ioffset)),
                                -1, FI_REMAPF_LOCKED);
        }
@@ -4338,8 +4335,8 @@ xfs_free_file_space(
         */
        if (rt && !XFS_SB_VERSION_HASEXTFLGBIT(&mp->m_sb)) {
                nimap = 1;
-                error = xfs_bmapi(NULL, ip, startoffset_fsb, 1, 0, NULL, 0,
+                error = XFS_BMAPI(mp, NULL, &ip->i_iocore, startoffset_fsb,
-                        &imap, &nimap, NULL);
+                        1, 0, NULL, 0, &imap, &nimap, NULL, NULL);
                if (error)
                        goto out_unlock_iolock;
                ASSERT(nimap == 0 || nimap == 1);
@@ -4353,8 +4350,8 @@ xfs_free_file_space(
                                startoffset_fsb += mp->m_sb.sb_rextsize - mod;
                }
                nimap = 1;
-                error = xfs_bmapi(NULL, ip, endoffset_fsb - 1, 1, 0, NULL, 0,
+                error = XFS_BMAPI(mp, NULL, &ip->i_iocore, endoffset_fsb - 1,
-                        &imap, &nimap, NULL);
+                        1, 0, NULL, 0, &imap, &nimap, NULL, NULL);
                if (error)
                        goto out_unlock_iolock;
                ASSERT(nimap == 0 || nimap == 1);
@@ -4426,9 +4423,9 @@ xfs_free_file_space(
                 * issue the bunmapi() call to free the blocks
                 */
                XFS_BMAP_INIT(&free_list, &firstfsb);
-                error = xfs_bunmapi(tp, ip, startoffset_fsb,
+                error = XFS_BUNMAPI(mp, tp, &ip->i_iocore, startoffset_fsb,
                                  endoffset_fsb - startoffset_fsb,
-                                  0, 2, &firstfsb, &free_list, &done);
+                                  0, 2, &firstfsb, &free_list, NULL, &done);
                if (error) {
                        goto error0;
                }
@@ -4488,8 +4485,8 @@ xfs_change_file_space(
        xfs_off_t       startoffset;
        xfs_off_t       llen;
        xfs_trans_t     *tp;
-        vattr_t         va;
+        bhv_vattr_t     va;
-        vnode_t         *vp;
+        bhv_vnode_t     *vp;
        vp = BHV_TO_VNODE(bdp);
        vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
@@ -4642,9 +4639,10 @@ xfs_change_file_space(
        return error;
 }
-vnodeops_t xfs_vnodeops = {
+bhv_vnodeops_t xfs_vnodeops = {
        BHV_IDENTITY_INIT(VN_BHV_XFS,VNODE_POSITION_XFS),
        .vop_open               = xfs_open,
+        .vop_close              = xfs_close,
        .vop_read               = xfs_read,
 #ifdef HAVE_SENDFILE
        .vop_sendfile           = xfs_sendfile,
author	Steven Whitehouse <swhiteho@redhat.com>	2006-07-03 10:25:08 -0400
committer	Steven Whitehouse <swhiteho@redhat.com>	2006-07-03 10:25:08 -0400
commit	0a1340c185734a57fbf4775927966ad4a1347b02 (patch)
tree	d9ed8f0dd809a7c542a3356601125ea5b5aaa804 /fs
parent	af18ddb8864b096e3ed4732e2d4b21c956dcfe3a (diff)
parent	29454dde27d8e340bb1987bad9aa504af7081eba (diff)