Diffstat (limited to 'fs')
-rw-r--r--  fs/9p/vfs_inode.c | 2
-rw-r--r--  fs/Kconfig | 25
-rw-r--r--  fs/Kconfig.binfmt | 2
-rw-r--r--  fs/adfs/super.c | 3
-rw-r--r--  fs/affs/super.c | 3
-rw-r--r--  fs/afs/Makefile | 7
-rw-r--r--  fs/afs/afs.h | 146
-rw-r--r--  fs/afs/afs_cm.h | 32
-rw-r--r--  fs/afs/afs_fs.h | 48
-rw-r--r--  fs/afs/afs_vl.h (renamed from fs/afs/vlclient.h) | 49
-rw-r--r--  fs/afs/cache.c | 256
-rw-r--r--  fs/afs/cache.h | 12
-rw-r--r--  fs/afs/callback.c | 509
-rw-r--r--  fs/afs/cell.c | 471
-rw-r--r--  fs/afs/cell.h | 78
-rw-r--r--  fs/afs/cmservice.c | 927
-rw-r--r--  fs/afs/cmservice.h | 29
-rw-r--r--  fs/afs/dir.c | 855
-rw-r--r--  fs/afs/errors.h | 34
-rw-r--r--  fs/afs/file.c | 124
-rw-r--r--  fs/afs/fsclient.c | 1529
-rw-r--r--  fs/afs/fsclient.h | 54
-rw-r--r--  fs/afs/inode.c | 248
-rw-r--r--  fs/afs/internal.h | 754
-rw-r--r--  fs/afs/kafsasyncd.c | 255
-rw-r--r--  fs/afs/kafsasyncd.h | 52
-rw-r--r--  fs/afs/kafstimod.c | 205
-rw-r--r--  fs/afs/kafstimod.h | 49
-rw-r--r--  fs/afs/main.c | 262
-rw-r--r--  fs/afs/misc.c | 38
-rw-r--r--  fs/afs/mntpt.c | 150
-rw-r--r--  fs/afs/mount.h | 23
-rw-r--r--  fs/afs/netdevices.c | 68
-rw-r--r--  fs/afs/proc.c | 230
-rw-r--r--  fs/afs/rxrpc.c | 782
-rw-r--r--  fs/afs/security.c | 356
-rw-r--r--  fs/afs/server.c | 647
-rw-r--r--  fs/afs/server.h | 102
-rw-r--r--  fs/afs/super.c | 395
-rw-r--r--  fs/afs/super.h | 45
-rw-r--r--  fs/afs/transport.h | 21
-rw-r--r--  fs/afs/types.h | 125
-rw-r--r--  fs/afs/vlclient.c | 737
-rw-r--r--  fs/afs/vlocation.c | 1224
-rw-r--r--  fs/afs/vnode.c | 731
-rw-r--r--  fs/afs/vnode.h | 94
-rw-r--r--  fs/afs/volume.c | 290
-rw-r--r--  fs/afs/volume.h | 140
-rw-r--r--  fs/aio.c | 6
-rw-r--r--  fs/befs/linuxvfs.c | 3
-rw-r--r--  fs/bfs/inode.c | 3
-rw-r--r--  fs/bio.c | 44
-rw-r--r--  fs/block_dev.c | 12
-rw-r--r--  fs/buffer.c | 24
-rw-r--r--  fs/cifs/CHANGES | 22
-rw-r--r--  fs/cifs/README | 43
-rw-r--r--  fs/cifs/TODO | 69
-rw-r--r--  fs/cifs/cifs_fs_sb.h | 14
-rw-r--r--  fs/cifs/cifs_unicode.c | 4
-rw-r--r--  fs/cifs/cifsfs.c | 82
-rw-r--r--  fs/cifs/cifsfs.h | 2
-rw-r--r--  fs/cifs/cifsglob.h | 2
-rw-r--r--  fs/cifs/cifspdu.h | 32
-rw-r--r--  fs/cifs/cifsproto.h | 7
-rw-r--r--  fs/cifs/cifssmb.c | 130
-rw-r--r--  fs/cifs/connect.c | 140
-rw-r--r--  fs/cifs/dir.c | 2
-rw-r--r--  fs/cifs/file.c | 129
-rw-r--r--  fs/cifs/inode.c | 282
-rw-r--r--  fs/cifs/netmisc.c | 24
-rw-r--r--  fs/cifs/readdir.c | 161
-rw-r--r--  fs/coda/inode.c | 3
-rw-r--r--  fs/compat.c | 5
-rw-r--r--  fs/compat_ioctl.c | 37
-rw-r--r--  fs/configfs/mount.c | 2
-rw-r--r--  fs/cramfs/inode.c | 3
-rw-r--r--  fs/dcache.c | 8
-rw-r--r--  fs/debugfs/file.c | 42
-rw-r--r--  fs/debugfs/inode.c | 2
-rw-r--r--  fs/dlm/Kconfig | 31
-rw-r--r--  fs/dlm/Makefile | 6
-rw-r--r--  fs/dlm/ast.c | 1
-rw-r--r--  fs/dlm/config.c | 10
-rw-r--r--  fs/dlm/config.h | 3
-rw-r--r--  fs/dlm/dlm_internal.h | 11
-rw-r--r--  fs/dlm/lock.c | 955
-rw-r--r--  fs/dlm/lock.h | 2
-rw-r--r--  fs/dlm/lockspace.c | 6
-rw-r--r--  fs/dlm/lowcomms-sctp.c | 1210
-rw-r--r--  fs/dlm/lowcomms.c (renamed from fs/dlm/lowcomms-tcp.c) | 788
-rw-r--r--  fs/dlm/user.c | 163
-rw-r--r--  fs/dquot.c | 4
-rw-r--r--  fs/ecryptfs/main.c | 15
-rw-r--r--  fs/ecryptfs/mmap.c | 11
-rw-r--r--  fs/ecryptfs/netlink.c | 6
-rw-r--r--  fs/efs/super.c | 3
-rw-r--r--  fs/exec.c | 18
-rw-r--r--  fs/ext2/dir.c | 3
-rw-r--r--  fs/ext2/super.c | 3
-rw-r--r--  fs/ext3/super.c | 7
-rw-r--r--  fs/ext4/super.c | 7
-rw-r--r--  fs/fat/cache.c | 3
-rw-r--r--  fs/fat/inode.c | 3
-rw-r--r--  fs/freevxfs/vxfs_subr.c | 3
-rw-r--r--  fs/fuse/inode.c | 7
-rw-r--r--  fs/gfs2/dir.c | 38
-rw-r--r--  fs/gfs2/glock.c | 619
-rw-r--r--  fs/gfs2/glock.h | 8
-rw-r--r--  fs/gfs2/incore.h | 14
-rw-r--r--  fs/gfs2/locking/dlm/lock.c | 14
-rw-r--r--  fs/gfs2/locking/dlm/lock_dlm.h | 3
-rw-r--r--  fs/gfs2/locking/dlm/sysfs.c | 2
-rw-r--r--  fs/gfs2/lops.c | 20
-rw-r--r--  fs/gfs2/main.c | 10
-rw-r--r--  fs/gfs2/mount.c | 239
-rw-r--r--  fs/gfs2/ops_address.c | 21
-rw-r--r--  fs/gfs2/ops_fstype.c | 4
-rw-r--r--  fs/gfs2/ops_super.c | 28
-rw-r--r--  fs/gfs2/rgrp.c | 12
-rw-r--r--  fs/gfs2/sys.c | 2
-rw-r--r--  fs/hfs/super.c | 2
-rw-r--r--  fs/hfsplus/super.c | 2
-rw-r--r--  fs/hpfs/super.c | 3
-rw-r--r--  fs/hugetlbfs/inode.c | 20
-rw-r--r--  fs/inode.c | 3
-rw-r--r--  fs/isofs/inode.c | 3
-rw-r--r--  fs/jffs2/LICENCE | 7
-rw-r--r--  fs/jffs2/Makefile | 1
-rw-r--r--  fs/jffs2/README.Locking | 1
-rw-r--r--  fs/jffs2/TODO | 3
-rw-r--r--  fs/jffs2/acl.c | 3
-rw-r--r--  fs/jffs2/acl.h | 3
-rw-r--r--  fs/jffs2/background.c | 4
-rw-r--r--  fs/jffs2/build.c | 4
-rw-r--r--  fs/jffs2/compr.c | 144
-rw-r--r--  fs/jffs2/compr.h | 17
-rw-r--r--  fs/jffs2/compr_rtime.c | 3
-rw-r--r--  fs/jffs2/compr_rubin.c | 81
-rw-r--r--  fs/jffs2/compr_rubin.h | 21
-rw-r--r--  fs/jffs2/compr_zlib.c | 4
-rw-r--r--  fs/jffs2/comprtest.c | 307
-rw-r--r--  fs/jffs2/debug.c | 5
-rw-r--r--  fs/jffs2/debug.h | 5
-rw-r--r--  fs/jffs2/dir.c | 4
-rw-r--r--  fs/jffs2/erase.c | 6
-rw-r--r--  fs/jffs2/file.c | 4
-rw-r--r--  fs/jffs2/fs.c | 16
-rw-r--r--  fs/jffs2/gc.c | 7
-rw-r--r--  fs/jffs2/ioctl.c | 4
-rw-r--r--  fs/jffs2/jffs2_fs_i.h | 11
-rw-r--r--  fs/jffs2/jffs2_fs_sb.h | 11
-rw-r--r--  fs/jffs2/malloc.c | 4
-rw-r--r--  fs/jffs2/nodelist.c | 482
-rw-r--r--  fs/jffs2/nodelist.h | 40
-rw-r--r--  fs/jffs2/nodemgmt.c | 9
-rw-r--r--  fs/jffs2/os-linux.h | 10
-rw-r--r--  fs/jffs2/pushpull.h | 72
-rw-r--r--  fs/jffs2/read.c | 4
-rw-r--r--  fs/jffs2/readinode.c | 851
-rw-r--r--  fs/jffs2/scan.c | 62
-rw-r--r--  fs/jffs2/security.c | 3
-rw-r--r--  fs/jffs2/summary.c | 12
-rw-r--r--  fs/jffs2/summary.h | 10
-rw-r--r--  fs/jffs2/super.c | 9
-rw-r--r--  fs/jffs2/symlink.c | 5
-rw-r--r--  fs/jffs2/wbuf.c | 39
-rw-r--r--  fs/jffs2/write.c | 7
-rw-r--r--  fs/jffs2/writev.c | 4
-rw-r--r--  fs/jffs2/xattr.c | 3
-rw-r--r--  fs/jffs2/xattr.h | 3
-rw-r--r--  fs/jffs2/xattr_trusted.c | 3
-rw-r--r--  fs/jffs2/xattr_user.c | 3
-rw-r--r--  fs/jfs/jfs_metapage.c | 3
-rw-r--r--  fs/jfs/super.c | 3
-rw-r--r--  fs/lockd/mon.c | 10
-rw-r--r--  fs/lockd/xdr.c | 20
-rw-r--r--  fs/lockd/xdr4.c | 24
-rw-r--r--  fs/locks.c | 3
-rw-r--r--  fs/minix/dir.c | 1
-rw-r--r--  fs/minix/inode.c | 3
-rw-r--r--  fs/namei.c | 84
-rw-r--r--  fs/ncpfs/inode.c | 3
-rw-r--r--  fs/nfs/client.c | 3
-rw-r--r--  fs/nfs/dir.c | 25
-rw-r--r--  fs/nfs/direct.c | 5
-rw-r--r--  fs/nfs/inode.c | 3
-rw-r--r--  fs/nfs/internal.h | 12
-rw-r--r--  fs/nfs/mount_clnt.c | 7
-rw-r--r--  fs/nfs/nfs2xdr.c | 7
-rw-r--r--  fs/nfs/nfs3xdr.c | 13
-rw-r--r--  fs/nfs/nfs4proc.c | 3
-rw-r--r--  fs/nfs/nfs4xdr.c | 7
-rw-r--r--  fs/nfs/nfsroot.c | 2
-rw-r--r--  fs/nfs/pagelist.c | 242
-rw-r--r--  fs/nfs/read.c | 92
-rw-r--r--  fs/nfs/super.c | 10
-rw-r--r--  fs/nfs/symlink.c | 6
-rw-r--r--  fs/nfs/write.c | 421
-rw-r--r--  fs/nfsd/nfs4callback.c | 7
-rw-r--r--  fs/ntfs/aops.h | 3
-rw-r--r--  fs/ntfs/attrib.c | 18
-rw-r--r--  fs/ntfs/file.c | 3
-rw-r--r--  fs/ntfs/super.c | 33
-rw-r--r--  fs/ocfs2/alloc.c | 3043
-rw-r--r--  fs/ocfs2/alloc.h | 27
-rw-r--r--  fs/ocfs2/aops.c | 1014
-rw-r--r--  fs/ocfs2/aops.h | 77
-rw-r--r--  fs/ocfs2/cluster/heartbeat.c | 2
-rw-r--r--  fs/ocfs2/cluster/masklog.c | 4
-rw-r--r--  fs/ocfs2/cluster/masklog.h | 2
-rw-r--r--  fs/ocfs2/cluster/quorum.c | 5
-rw-r--r--  fs/ocfs2/cluster/sys.c | 7
-rw-r--r--  fs/ocfs2/cluster/tcp.c | 10
-rw-r--r--  fs/ocfs2/cluster/tcp_internal.h | 5
-rw-r--r--  fs/ocfs2/dir.c | 22
-rw-r--r--  fs/ocfs2/dlm/dlmast.c | 12
-rw-r--r--  fs/ocfs2/dlm/dlmdomain.c | 5
-rw-r--r--  fs/ocfs2/dlm/dlmfs.c | 3
-rw-r--r--  fs/ocfs2/dlm/dlmrecovery.c | 6
-rw-r--r--  fs/ocfs2/dlm/dlmthread.c | 2
-rw-r--r--  fs/ocfs2/dlmglue.c | 197
-rw-r--r--  fs/ocfs2/dlmglue.h | 10
-rw-r--r--  fs/ocfs2/export.c | 6
-rw-r--r--  fs/ocfs2/extent_map.c | 1233
-rw-r--r--  fs/ocfs2/extent_map.h | 39
-rw-r--r--  fs/ocfs2/file.c | 654
-rw-r--r--  fs/ocfs2/file.h | 10
-rw-r--r--  fs/ocfs2/inode.c | 228
-rw-r--r--  fs/ocfs2/inode.h | 24
-rw-r--r--  fs/ocfs2/ioctl.c | 24
-rw-r--r--  fs/ocfs2/ioctl.h | 1
-rw-r--r--  fs/ocfs2/journal.c | 31
-rw-r--r--  fs/ocfs2/journal.h | 2
-rw-r--r--  fs/ocfs2/mmap.c | 7
-rw-r--r--  fs/ocfs2/namei.c | 28
-rw-r--r--  fs/ocfs2/ocfs2.h | 67
-rw-r--r--  fs/ocfs2/ocfs2_fs.h | 33
-rw-r--r--  fs/ocfs2/ocfs2_lockid.h | 5
-rw-r--r--  fs/ocfs2/slot_map.c | 2
-rw-r--r--  fs/ocfs2/suballoc.c | 13
-rw-r--r--  fs/ocfs2/super.c | 12
-rw-r--r--  fs/ocfs2/symlink.c | 7
-rw-r--r--  fs/ocfs2/vote.c | 289
-rw-r--r--  fs/ocfs2/vote.h | 3
-rw-r--r--  fs/openpromfs/inode.c | 3
-rw-r--r--  fs/partitions/acorn.c | 2
-rw-r--r--  fs/partitions/check.c | 9
-rw-r--r--  fs/proc/base.c | 36
-rw-r--r--  fs/proc/inode.c | 3
-rw-r--r--  fs/proc/proc_devtree.c | 2
-rw-r--r--  fs/proc/proc_misc.c | 2
-rw-r--r--  fs/proc/task_mmu.c | 134
-rw-r--r--  fs/proc/vmcore.c | 2
-rw-r--r--  fs/qnx4/inode.c | 3
-rw-r--r--  fs/reiserfs/super.c | 3
-rw-r--r--  fs/reiserfs/xattr.c | 96
-rw-r--r--  fs/romfs/inode.c | 3
-rw-r--r--  fs/smbfs/inode.c | 3
-rw-r--r--  fs/super.c | 12
-rw-r--r--  fs/sync.c | 8
-rw-r--r--  fs/sysfs/file.c | 25
-rw-r--r--  fs/sysfs/group.c | 6
-rw-r--r--  fs/sysv/dir.c | 10
-rw-r--r--  fs/sysv/inode.c | 3
-rw-r--r--  fs/udf/super.c | 4
-rw-r--r--  fs/ufs/dir.c | 6
-rw-r--r--  fs/ufs/inode.c | 29
-rw-r--r--  fs/ufs/super.c | 3
-rw-r--r--  fs/ufs/util.c | 6
-rw-r--r--  fs/xfs/linux-2.6/xfs_super.c | 3
270 files changed, 16638 insertions, 13239 deletions
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 124a085d1f2e..b01b0a457932 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -415,7 +415,7 @@ static int v9fs_remove(struct inode *dir, struct dentry *file, int rmdir)
 	file_inode = file->d_inode;
 	sb = file_inode->i_sb;
 	v9ses = v9fs_inode2v9ses(file_inode);
-	v9fid = v9fs_fid_lookup(file);
+	v9fid = v9fs_fid_clone(file);
 	if(IS_ERR(v9fid))
 		return PTR_ERR(v9fid);
 
diff --git a/fs/Kconfig b/fs/Kconfig
index 3c4886b849f5..8ea7b04c661f 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -1734,6 +1734,18 @@ config SUNRPC
 config SUNRPC_GSS
 	tristate
 
+config SUNRPC_BIND34
+	bool "Support for rpcbind versions 3 & 4 (EXPERIMENTAL)"
+	depends on SUNRPC && EXPERIMENTAL
+	help
+	  Provides kernel support for querying rpcbind servers via versions 3
+	  and 4 of the rpcbind protocol.  The kernel automatically falls back
+	  to version 2 if a remote rpcbind service does not support versions
+	  3 or 4.
+
+	  If unsure, say N to get traditional behavior (version 2 rpcbind
+	  requests only).
+
 config RPCSEC_GSS_KRB5
 	tristate "Secure RPC: Kerberos V mechanism (EXPERIMENTAL)"
 	depends on SUNRPC && EXPERIMENTAL
@@ -2019,7 +2031,7 @@ config CODA_FS_OLD_API
 config AFS_FS
 	tristate "Andrew File System support (AFS) (EXPERIMENTAL)"
 	depends on INET && EXPERIMENTAL
-	select RXRPC
+	select AF_RXRPC
 	help
 	  If you say Y here, you will get an experimental Andrew File System
 	  driver. It currently only supports unsecured read-only AFS access.
@@ -2028,8 +2040,15 @@ config AFS_FS
 
 	  If unsure, say N.
 
-config RXRPC
-	tristate
+config AFS_DEBUG
+	bool "AFS dynamic debugging"
+	depends on AFS_FS
+	help
+	  Say Y here to make runtime controllable debugging messages appear.
+
+	  See <file:Documentation/filesystems/afs.txt> for more information.
+
+	  If unsure, say N.
 
 config 9P_FS
 	tristate "Plan 9 Resource Sharing Support (9P2000) (Experimental)"
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index f3d3d81eb7e9..74c64409ddbc 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -26,7 +26,7 @@ config BINFMT_ELF
 config BINFMT_ELF_FDPIC
 	bool "Kernel support for FDPIC ELF binaries"
 	default y
-	depends on FRV
+	depends on (FRV || BLACKFIN)
 	help
 	  ELF FDPIC binaries are based on ELF, but allow the individual load
 	  segments of a binary to be located in memory independently of each
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 2e5f2c8371ee..30c296508497 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -232,8 +232,7 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flags)
 {
 	struct adfs_inode_info *ei = (struct adfs_inode_info *) foo;
 
-	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
-	    SLAB_CTOR_CONSTRUCTOR)
+	if (flags & SLAB_CTOR_CONSTRUCTOR)
 		inode_init_once(&ei->vfs_inode);
 }
 
diff --git a/fs/affs/super.c b/fs/affs/super.c
index c3986a1911b0..beff7d21e6e2 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -87,8 +87,7 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flags)
 {
 	struct affs_inode_info *ei = (struct affs_inode_info *) foo;
 
-	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
-	    SLAB_CTOR_CONSTRUCTOR) {
+	if (flags & SLAB_CTOR_CONSTRUCTOR) {
 		init_MUTEX(&ei->i_link_lock);
 		init_MUTEX(&ei->i_ext_lock);
 		inode_init_once(&ei->vfs_inode);
diff --git a/fs/afs/Makefile b/fs/afs/Makefile
index 4029c9da4b86..cf83e5d63512 100644
--- a/fs/afs/Makefile
+++ b/fs/afs/Makefile
@@ -2,8 +2,6 @@
 # Makefile for Red Hat Linux AFS client.
 #
 
-#CFLAGS += -finstrument-functions
-
 kafs-objs := \
 	callback.o \
 	cell.o \
@@ -12,14 +10,15 @@ kafs-objs := \
 	file.o \
 	fsclient.o \
 	inode.o \
-	kafsasyncd.o \
-	kafstimod.o \
 	main.o \
 	misc.o \
 	mntpt.o \
 	proc.o \
+	rxrpc.o \
+	security.o \
 	server.o \
 	super.o \
+	netdevices.o \
 	vlclient.o \
 	vlocation.o \
 	vnode.o \
diff --git a/fs/afs/afs.h b/fs/afs/afs.h
new file mode 100644
index 000000000000..52d0752265b8
--- /dev/null
+++ b/fs/afs/afs.h
@@ -0,0 +1,146 @@
+/* AFS common types
+ *
+ * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#ifndef AFS_H
+#define AFS_H
+
+#include <linux/in.h>
+
+#define AFS_MAXCELLNAME 64	/* maximum length of a cell name */
+#define AFS_MAXVOLNAME 64	/* maximum length of a volume name */
+
+typedef unsigned afs_volid_t;
+typedef unsigned afs_vnodeid_t;
+typedef unsigned long long afs_dataversion_t;
+
+typedef enum {
+	AFSVL_RWVOL,		/* read/write volume */
+	AFSVL_ROVOL,		/* read-only volume */
+	AFSVL_BACKVOL,		/* backup volume */
+} __attribute__((packed)) afs_voltype_t;
+
+typedef enum {
+	AFS_FTYPE_INVALID = 0,
+	AFS_FTYPE_FILE = 1,
+	AFS_FTYPE_DIR = 2,
+	AFS_FTYPE_SYMLINK = 3,
+} afs_file_type_t;
+
+/*
+ * AFS file identifier
+ */
+struct afs_fid {
+	afs_volid_t vid;	/* volume ID */
+	afs_vnodeid_t vnode;	/* file index within volume */
+	unsigned unique;	/* unique ID number (file index version) */
+};
+
+/*
+ * AFS callback notification
+ */
+typedef enum {
+	AFSCM_CB_UNTYPED = 0,	/* no type set on CB break */
+	AFSCM_CB_EXCLUSIVE = 1,	/* CB exclusive to CM [not implemented] */
+	AFSCM_CB_SHARED = 2,	/* CB shared by other CM's */
+	AFSCM_CB_DROPPED = 3,	/* CB promise cancelled by file server */
+} afs_callback_type_t;
+
+struct afs_callback {
+	struct afs_fid fid;		/* file identifier */
+	unsigned version;		/* callback version */
+	unsigned expiry;		/* time at which expires */
+	afs_callback_type_t type;	/* type of callback */
+};
+
+#define AFSCBMAX 50	/* maximum callbacks transferred per bulk op */
+
+/*
+ * AFS volume information
+ */
+struct afs_volume_info {
+	afs_volid_t vid;		/* volume ID */
+	afs_voltype_t type;		/* type of this volume */
+	afs_volid_t type_vids[5];	/* volume ID's for possible types for this vol */
+
+	/* list of fileservers serving this volume */
+	size_t nservers;		/* number of entries used in servers[] */
+	struct {
+		struct in_addr addr;	/* fileserver address */
+	} servers[8];
+};
+
+/*
+ * AFS security ACE access mask
+ */
+typedef u32 afs_access_t;
+#define AFS_ACE_READ		0x00000001U /* - permission to read a file/dir */
+#define AFS_ACE_WRITE		0x00000002U /* - permission to write/chmod a file */
+#define AFS_ACE_INSERT		0x00000004U /* - permission to create dirent in a dir */
+#define AFS_ACE_LOOKUP		0x00000008U /* - permission to lookup a file/dir in a dir */
+#define AFS_ACE_DELETE		0x00000010U /* - permission to delete a dirent from a dir */
+#define AFS_ACE_LOCK		0x00000020U /* - permission to lock a file */
+#define AFS_ACE_ADMINISTER	0x00000040U /* - permission to change ACL */
+#define AFS_ACE_USER_A		0x01000000U /* - 'A' user-defined permission */
+#define AFS_ACE_USER_B		0x02000000U /* - 'B' user-defined permission */
+#define AFS_ACE_USER_C		0x04000000U /* - 'C' user-defined permission */
+#define AFS_ACE_USER_D		0x08000000U /* - 'D' user-defined permission */
+#define AFS_ACE_USER_E		0x10000000U /* - 'E' user-defined permission */
+#define AFS_ACE_USER_F		0x20000000U /* - 'F' user-defined permission */
+#define AFS_ACE_USER_G		0x40000000U /* - 'G' user-defined permission */
+#define AFS_ACE_USER_H		0x80000000U /* - 'H' user-defined permission */
+
+/*
+ * AFS file status information
+ */
+struct afs_file_status {
+	unsigned if_version;	/* interface version */
+#define AFS_FSTATUS_VERSION 1
+
+	afs_file_type_t type;	/* file type */
+	unsigned nlink;		/* link count */
+	u64 size;		/* file size */
+	afs_dataversion_t data_version;	/* current data version */
+	u32 author;		/* author ID */
+	u32 owner;		/* owner ID */
+	u32 group;		/* group ID */
+	afs_access_t caller_access;	/* access rights for authenticated caller */
+	afs_access_t anon_access;	/* access rights for unauthenticated caller */
+	umode_t mode;		/* UNIX mode */
+	struct afs_fid parent;	/* parent dir ID for non-dirs only */
+	time_t mtime_client;	/* last time client changed data */
+	time_t mtime_server;	/* last time server changed data */
+};
+
+/*
+ * AFS file status change request
+ */
+struct afs_store_status {
+	u32 mask;		/* which bits of the struct are set */
+	u32 mtime_client;	/* last time client changed data */
+	u32 owner;		/* owner ID */
+	u32 group;		/* group ID */
+	umode_t mode;		/* UNIX mode */
+};
+
+#define AFS_SET_MTIME		0x01 /* set the mtime */
+#define AFS_SET_OWNER		0x02 /* set the owner ID */
+#define AFS_SET_GROUP		0x04 /* set the group ID (unsupported?) */
+#define AFS_SET_MODE		0x08 /* set the UNIX mode */
+#define AFS_SET_SEG_SIZE	0x10 /* set the segment size (unsupported) */
+
+/*
+ * AFS volume synchronisation information
+ */
+struct afs_volsync {
+	time_t creation;	/* volume creation time */
+};
+
+#endif /* AFS_H */
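
A note on the types above: struct afs_fid is the key the reworked code indexes vnodes by, and the rbtree walk added in fs/afs/callback.c later in this diff compares vid, then vnode, then unique. That ordering can be factored as a standalone comparator; a minimal sketch assuming only the struct definition above (afs_fid_cmp is a hypothetical helper, not something this patch adds):

/* Sketch: total ordering over struct afs_fid, equivalent to the
 * vid -> vnode -> unique comparison chain in afs_break_one_callback(). */
static int afs_fid_cmp(const struct afs_fid *a, const struct afs_fid *b)
{
	if (a->vid != b->vid)
		return a->vid < b->vid ? -1 : 1;
	if (a->vnode != b->vnode)
		return a->vnode < b->vnode ? -1 : 1;
	if (a->unique != b->unique)
		return a->unique < b->unique ? -1 : 1;
	return 0;
}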
diff --git a/fs/afs/afs_cm.h b/fs/afs/afs_cm.h
new file mode 100644
index 000000000000..7b4d4fab4c80
--- /dev/null
+++ b/fs/afs/afs_cm.h
@@ -0,0 +1,32 @@
+/* AFS Cache Manager definitions
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#ifndef AFS_CM_H
+#define AFS_CM_H
+
+#define AFS_CM_PORT	7001 /* AFS file server port */
+#define CM_SERVICE	1    /* AFS File Service ID */
+
+enum AFS_CM_Operations {
+	CBCallBack		= 204,	/* break callback promises */
+	CBInitCallBackState	= 205,	/* initialise callback state */
+	CBProbe			= 206,	/* probe client */
+	CBGetLock		= 207,	/* get contents of CM lock table */
+	CBGetCE			= 208,	/* get cache file description */
+	CBGetXStatsVersion	= 209,	/* get version of extended statistics */
+	CBGetXStats		= 210,	/* get contents of extended statistics data */
+	CBInitCallBackState3	= 213,	/* initialise callback state, version 3 */
+	CBGetCapabilities	= 65538, /* get client capabilities */
+};
+
+#define AFS_CAP_ERROR_TRANSLATION	0x1
+
+#endif /* AFS_CM_H */
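
The operation IDs above are what the cache-manager service demultiplexes incoming calls on; the rewritten fs/afs/cmservice.c in this patch does the real routing. A hedged sketch of that dispatch shape only (the afs_handle_*() names are invented stubs for illustration, and errno.h stands in for kernel error codes):

#include <errno.h>

/* Hypothetical stubs, standing in for the real handlers in cmservice.c. */
static int afs_handle_callback(void) { return 0; }
static int afs_handle_init_cb_state(void) { return 0; }
static int afs_handle_probe(void) { return 0; }

/* Sketch: route a cache-manager call by its operation ID. */
static int afs_cm_dispatch(unsigned operation_id)
{
	switch (operation_id) {
	case CBCallBack:		return afs_handle_callback();
	case CBInitCallBackState:
	case CBInitCallBackState3:	return afs_handle_init_cb_state();
	case CBProbe:			return afs_handle_probe();
	default:			return -EOPNOTSUPP; /* not handled */
	}
}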
diff --git a/fs/afs/afs_fs.h b/fs/afs/afs_fs.h
new file mode 100644
index 000000000000..89e0d1650a72
--- /dev/null
+++ b/fs/afs/afs_fs.h
@@ -0,0 +1,48 @@
+/* AFS File Service definitions
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#ifndef AFS_FS_H
+#define AFS_FS_H
+
+#define AFS_FS_PORT	7000 /* AFS file server port */
+#define FS_SERVICE	1    /* AFS File Service ID */
+
+enum AFS_FS_Operations {
+	FSFETCHDATA		= 130,	/* AFS Fetch file data */
+	FSFETCHSTATUS		= 132,	/* AFS Fetch file status */
+	FSREMOVEFILE		= 136,	/* AFS Remove a file */
+	FSCREATEFILE		= 137,	/* AFS Create a file */
+	FSRENAME		= 138,	/* AFS Rename or move a file or directory */
+	FSSYMLINK		= 139,	/* AFS Create a symbolic link */
+	FSLINK			= 140,	/* AFS Create a hard link */
+	FSMAKEDIR		= 141,	/* AFS Create a directory */
+	FSREMOVEDIR		= 142,	/* AFS Remove a directory */
+	FSGIVEUPCALLBACKS	= 147,	/* AFS Discard callback promises */
+	FSGETVOLUMEINFO		= 148,	/* AFS Get root volume information */
+	FSGETROOTVOLUME		= 151,	/* AFS Get root volume name */
+	FSLOOKUP		= 161,	/* AFS lookup file in directory */
+};
+
+enum AFS_FS_Errors {
+	VSALVAGE	= 101,	/* volume needs salvaging */
+	VNOVNODE	= 102,	/* no such file/dir (vnode) */
+	VNOVOL		= 103,	/* no such volume or volume unavailable */
+	VVOLEXISTS	= 104,	/* volume name already exists */
+	VNOSERVICE	= 105,	/* volume not currently in service */
+	VOFFLINE	= 106,	/* volume is currently offline (more info available [VVL-spec]) */
+	VONLINE		= 107,	/* volume is already online */
+	VDISKFULL	= 108,	/* disk partition is full */
+	VOVERQUOTA	= 109,	/* volume's maximum quota exceeded */
+	VBUSY		= 110,	/* volume is temporarily unavailable */
+	VMOVED		= 111,	/* volume moved to new server - ask this FS where */
+};
+
+#endif /* AFS_FS_H */
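
VSALVAGE through VMOVED are abort codes sent back by the fileserver; before they reach the VFS they have to be folded down to errnos (the diffstat shows fs/afs/misc.c changing, which is where that translation lives). The mapping below is illustrative only, a guess at the flavour rather than a copy of the patch's table:

#include <errno.h>

/* Sketch: reduce a few AFS_FS_Errors abort codes to plausible errnos.
 * The authoritative table is in fs/afs/misc.c, not reproduced here. */
static int afs_abort_to_error_sketch(int abort_code)
{
	switch (abort_code) {
	case VNOVNODE:   return -ENOENT;	/* vnode has gone */
	case VNOVOL:     return -ENOMEDIUM;	/* volume unavailable */
	case VDISKFULL:  return -ENOSPC;	/* partition full */
	case VOVERQUOTA: return -EDQUOT;	/* quota exceeded */
	case VBUSY:      return -EBUSY;		/* temporarily unavailable */
	default:         return -EREMOTEIO;	/* unrecognised abort code */
	}
}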
diff --git a/fs/afs/vlclient.h b/fs/afs/afs_vl.h
index e3d601179c46..8bbefe009ed4 100644
--- a/fs/afs/vlclient.h
+++ b/fs/afs/afs_vl.h
@@ -1,6 +1,6 @@
-/* vlclient.h: Volume Location Service client interface
+/* AFS Volume Location Service client interface
  *
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
  *
  * This program is free software; you can redistribute it and/or
@@ -9,10 +9,19 @@
  * 2 of the License, or (at your option) any later version.
  */
 
-#ifndef _LINUX_AFS_VLCLIENT_H
-#define _LINUX_AFS_VLCLIENT_H
+#ifndef AFS_VL_H
+#define AFS_VL_H
 
-#include "types.h"
+#include "afs.h"
+
+#define AFS_VL_PORT	7003 /* volume location service port */
+#define VL_SERVICE	52   /* RxRPC service ID for the Volume Location service */
+
+enum AFSVL_Operations {
+	VLGETENTRYBYID		= 503,	/* AFS Get Cache Entry By ID operation ID */
+	VLGETENTRYBYNAME	= 504,	/* AFS Get Cache Entry By Name operation ID */
+	VLPROBE			= 514,	/* AFS Probe Volume Location Service operation ID */
+};
 
 enum AFSVL_Errors {
 	AFSVL_IDEXIST = 363520,		/* Volume Id entry exists in vl database */
@@ -40,14 +49,16 @@ enum AFSVL_Errors {
 	AFSVL_BADVOLOPER = 363542,	/* Bad volume operation code */
 	AFSVL_BADRELLOCKTYPE = 363543,	/* Bad release lock type */
 	AFSVL_RERELEASE = 363544,	/* Status report: last release was aborted */
 	AFSVL_BADSERVERFLAG = 363545,	/* Invalid replication site server flag */
 	AFSVL_PERM = 363546,		/* No permission access */
 	AFSVL_NOMEM = 363547,		/* malloc/realloc failed to alloc enough memory */
 };
 
-/* maps to "struct vldbentry" in vvl-spec.pdf */
+/*
+ * maps to "struct vldbentry" in vvl-spec.pdf
+ */
 struct afs_vldbentry {
-	char name[65];		/* name of volume (including NUL char) */
+	char name[65];		/* name of volume (with NUL char) */
 	afs_voltype_t type;	/* volume type */
 	unsigned num_servers;	/* num servers that hold instances of this vol */
 	unsigned clone_id;	/* cloning ID */
@@ -68,26 +79,6 @@ struct afs_vldbentry {
 #define AFS_VLSF_RWVOL   0x0004	/* this server holds a R/W instance of the volume */
 #define AFS_VLSF_BACKVOL 0x0008	/* this server holds a backup instance of the volume */
 	} servers[8];
-
 };
 
-/* look up a volume location database entry by name */
-extern int afs_rxvl_get_entry_by_name(struct afs_server *server,
-				      const char *volname,
-				      unsigned volnamesz,
-				      struct afs_cache_vlocation *entry);
-
-/* look up a volume location database entry by ID */
-extern int afs_rxvl_get_entry_by_id(struct afs_server *server,
-				    afs_volid_t volid,
-				    afs_voltype_t voltype,
-				    struct afs_cache_vlocation *entry);
-
-extern int afs_rxvl_get_entry_by_id_async(struct afs_async_op *op,
-					  afs_volid_t volid,
-					  afs_voltype_t voltype);
-
-extern int afs_rxvl_get_entry_by_id_async2(struct afs_async_op *op,
-					   struct afs_cache_vlocation *entry);
-
-#endif /* _LINUX_AFS_VLCLIENT_H */
+#endif /* AFS_VL_H */
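
Each servers[] slot in struct afs_vldbentry carries AFS_VLSF_* site flags saying which volume instances that fileserver holds. Assuming the per-server flag word is named flags (the member itself falls outside the hunk shown above, so that name is a guess), picking the first read/write site would look like:

/* Sketch: find the first server advertising a R/W instance.  Assumes a
 * 'flags' member in the servers[] element, which this hunk doesn't show. */
static int first_rw_server(const struct afs_vldbentry *vldb)
{
	unsigned i;

	for (i = 0; i < vldb->num_servers; i++)
		if (vldb->servers[i].flags & AFS_VLSF_RWVOL)
			return (int) i;
	return -1; /* no R/W site registered */
}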
diff --git a/fs/afs/cache.c b/fs/afs/cache.c
new file mode 100644
index 000000000000..de0d7de69edc
--- /dev/null
+++ b/fs/afs/cache.c
@@ -0,0 +1,256 @@
+/* AFS caching stuff
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#ifdef AFS_CACHING_SUPPORT
+static cachefs_match_val_t afs_cell_cache_match(void *target,
+						const void *entry);
+static void afs_cell_cache_update(void *source, void *entry);
+
+struct cachefs_index_def afs_cache_cell_index_def = {
+	.name		= "cell_ix",
+	.data_size	= sizeof(struct afs_cache_cell),
+	.keys[0]	= { CACHEFS_INDEX_KEYS_ASCIIZ, 64 },
+	.match		= afs_cell_cache_match,
+	.update		= afs_cell_cache_update,
+};
+#endif
+
+/*
+ * match a cell record obtained from the cache
+ */
+#ifdef AFS_CACHING_SUPPORT
+static cachefs_match_val_t afs_cell_cache_match(void *target,
+						const void *entry)
+{
+	const struct afs_cache_cell *ccell = entry;
+	struct afs_cell *cell = target;
+
+	_enter("{%s},{%s}", ccell->name, cell->name);
+
+	if (strncmp(ccell->name, cell->name, sizeof(ccell->name)) == 0) {
+		_leave(" = SUCCESS");
+		return CACHEFS_MATCH_SUCCESS;
+	}
+
+	_leave(" = FAILED");
+	return CACHEFS_MATCH_FAILED;
+}
+#endif
+
+/*
+ * update a cell record in the cache
+ */
+#ifdef AFS_CACHING_SUPPORT
+static void afs_cell_cache_update(void *source, void *entry)
+{
+	struct afs_cache_cell *ccell = entry;
+	struct afs_cell *cell = source;
+
+	_enter("%p,%p", source, entry);
+
+	strncpy(ccell->name, cell->name, sizeof(ccell->name));
+
+	memcpy(ccell->vl_servers,
+	       cell->vl_addrs,
+	       min(sizeof(ccell->vl_servers), sizeof(cell->vl_addrs)));
+
+}
+#endif
+
+#ifdef AFS_CACHING_SUPPORT
+static cachefs_match_val_t afs_vlocation_cache_match(void *target,
+						     const void *entry);
+static void afs_vlocation_cache_update(void *source, void *entry);
+
+struct cachefs_index_def afs_vlocation_cache_index_def = {
+	.name		= "vldb",
+	.data_size	= sizeof(struct afs_cache_vlocation),
+	.keys[0]	= { CACHEFS_INDEX_KEYS_ASCIIZ, 64 },
+	.match		= afs_vlocation_cache_match,
+	.update		= afs_vlocation_cache_update,
+};
+#endif
+
+/*
+ * match a VLDB record stored in the cache
+ * - may also load target from entry
+ */
+#ifdef AFS_CACHING_SUPPORT
+static cachefs_match_val_t afs_vlocation_cache_match(void *target,
+						     const void *entry)
+{
+	const struct afs_cache_vlocation *vldb = entry;
+	struct afs_vlocation *vlocation = target;
+
+	_enter("{%s},{%s}", vlocation->vldb.name, vldb->name);
+
+	if (strncmp(vlocation->vldb.name, vldb->name, sizeof(vldb->name)) == 0
+	    ) {
+		if (!vlocation->valid ||
+		    vlocation->vldb.rtime == vldb->rtime
+		    ) {
+			vlocation->vldb = *vldb;
+			vlocation->valid = 1;
+			_leave(" = SUCCESS [c->m]");
+			return CACHEFS_MATCH_SUCCESS;
+		} else if (memcmp(&vlocation->vldb, vldb, sizeof(*vldb)) != 0) {
+			/* delete if VIDs for this name differ */
+			if (memcmp(&vlocation->vldb.vid,
+				   &vldb->vid,
+				   sizeof(vldb->vid)) != 0) {
+				_leave(" = DELETE");
+				return CACHEFS_MATCH_SUCCESS_DELETE;
+			}
+
+			_leave(" = UPDATE");
+			return CACHEFS_MATCH_SUCCESS_UPDATE;
+		} else {
+			_leave(" = SUCCESS");
+			return CACHEFS_MATCH_SUCCESS;
+		}
+	}
+
+	_leave(" = FAILED");
+	return CACHEFS_MATCH_FAILED;
+}
+#endif
+
+/*
+ * update a VLDB record stored in the cache
+ */
+#ifdef AFS_CACHING_SUPPORT
+static void afs_vlocation_cache_update(void *source, void *entry)
+{
+	struct afs_cache_vlocation *vldb = entry;
+	struct afs_vlocation *vlocation = source;
+
+	_enter("");
+
+	*vldb = vlocation->vldb;
+}
+#endif
+
+#ifdef AFS_CACHING_SUPPORT
+static cachefs_match_val_t afs_volume_cache_match(void *target,
+						  const void *entry);
+static void afs_volume_cache_update(void *source, void *entry);
+
+struct cachefs_index_def afs_volume_cache_index_def = {
+	.name		= "volume",
+	.data_size	= sizeof(struct afs_cache_vhash),
+	.keys[0]	= { CACHEFS_INDEX_KEYS_BIN, 1 },
+	.keys[1]	= { CACHEFS_INDEX_KEYS_BIN, 1 },
+	.match		= afs_volume_cache_match,
+	.update		= afs_volume_cache_update,
+};
+#endif
+
+/*
+ * match a volume hash record stored in the cache
+ */
+#ifdef AFS_CACHING_SUPPORT
+static cachefs_match_val_t afs_volume_cache_match(void *target,
+						  const void *entry)
+{
+	const struct afs_cache_vhash *vhash = entry;
+	struct afs_volume *volume = target;
+
+	_enter("{%u},{%u}", volume->type, vhash->vtype);
+
+	if (volume->type == vhash->vtype) {
+		_leave(" = SUCCESS");
+		return CACHEFS_MATCH_SUCCESS;
+	}
+
+	_leave(" = FAILED");
+	return CACHEFS_MATCH_FAILED;
+}
+#endif
+
+/*
+ * update a volume hash record stored in the cache
+ */
+#ifdef AFS_CACHING_SUPPORT
+static void afs_volume_cache_update(void *source, void *entry)
+{
+	struct afs_cache_vhash *vhash = entry;
+	struct afs_volume *volume = source;
+
+	_enter("");
+
+	vhash->vtype = volume->type;
+}
+#endif
+
+#ifdef AFS_CACHING_SUPPORT
+static cachefs_match_val_t afs_vnode_cache_match(void *target,
+						 const void *entry);
+static void afs_vnode_cache_update(void *source, void *entry);
+
+struct cachefs_index_def afs_vnode_cache_index_def = {
+	.name		= "vnode",
+	.data_size	= sizeof(struct afs_cache_vnode),
+	.keys[0]	= { CACHEFS_INDEX_KEYS_BIN, 4 },
+	.match		= afs_vnode_cache_match,
+	.update		= afs_vnode_cache_update,
+};
+#endif
+
+/*
+ * match a vnode record stored in the cache
+ */
+#ifdef AFS_CACHING_SUPPORT
+static cachefs_match_val_t afs_vnode_cache_match(void *target,
+						 const void *entry)
+{
+	const struct afs_cache_vnode *cvnode = entry;
+	struct afs_vnode *vnode = target;
+
+	_enter("{%x,%x,%Lx},{%x,%x,%Lx}",
+	       vnode->fid.vnode,
+	       vnode->fid.unique,
+	       vnode->status.version,
+	       cvnode->vnode_id,
+	       cvnode->vnode_unique,
+	       cvnode->data_version);
+
+	if (vnode->fid.vnode != cvnode->vnode_id) {
+		_leave(" = FAILED");
+		return CACHEFS_MATCH_FAILED;
+	}
+
+	if (vnode->fid.unique != cvnode->vnode_unique ||
+	    vnode->status.version != cvnode->data_version) {
+		_leave(" = DELETE");
+		return CACHEFS_MATCH_SUCCESS_DELETE;
+	}
+
+	_leave(" = SUCCESS");
+	return CACHEFS_MATCH_SUCCESS;
+}
+#endif
+
+/*
+ * update a vnode record stored in the cache
+ */
+#ifdef AFS_CACHING_SUPPORT
+static void afs_vnode_cache_update(void *source, void *entry)
+{
+	struct afs_cache_vnode *cvnode = entry;
+	struct afs_vnode *vnode = source;
+
+	_enter("");
+
+	cvnode->vnode_id = vnode->fid.vnode;
+	cvnode->vnode_unique = vnode->fid.unique;
+	cvnode->data_version = vnode->status.version;
+}
+#endif
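
afs_vnode_cache_match() above encodes a small decision table: a different vnode ID is a plain miss, a reused uniquifier or changed data version means the cached entry is stale and must be discarded, and only an exact triple match is usable. The same logic restated as a standalone predicate (a sketch for clarity, not code from the patch):

/* Sketch: the verdict logic of afs_vnode_cache_match(), restated. */
enum cache_verdict { CACHE_MISS, CACHE_HIT, CACHE_HIT_STALE };

static enum cache_verdict vnode_cache_verdict(unsigned id, unsigned uniq,
					      unsigned long long version,
					      unsigned c_id, unsigned c_uniq,
					      unsigned long long c_version)
{
	if (id != c_id)
		return CACHE_MISS;	/* different file entirely */
	if (uniq != c_uniq || version != c_version)
		return CACHE_HIT_STALE;	/* fid slot reused or data changed */
	return CACHE_HIT;		/* cached data still valid */
}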
diff --git a/fs/afs/cache.h b/fs/afs/cache.h
index 9eb7722b34d5..36a3642cf90e 100644
--- a/fs/afs/cache.h
+++ b/fs/afs/cache.h
@@ -1,4 +1,4 @@
-/* cache.h: AFS local cache management interface
+/* AFS local cache management interface
  *
  * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
@@ -9,8 +9,8 @@
  * 2 of the License, or (at your option) any later version.
  */
 
-#ifndef _LINUX_AFS_CACHE_H
-#define _LINUX_AFS_CACHE_H
+#ifndef AFS_CACHE_H
+#define AFS_CACHE_H
 
 #undef AFS_CACHING_SUPPORT
 
@@ -20,8 +20,4 @@
 #endif
 #include "types.h"
 
-#ifdef __KERNEL__
-
-#endif /* __KERNEL__ */
-
-#endif /* _LINUX_AFS_CACHE_H */
+#endif /* AFS_CACHE_H */
diff --git a/fs/afs/callback.c b/fs/afs/callback.c
index 9cb206e9d4be..9bdbf36a9aa9 100644
--- a/fs/afs/callback.c
+++ b/fs/afs/callback.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2002 Red Hat, Inc. All rights reserved.
+ * Copyright (c) 2002, 2007 Red Hat, Inc. All rights reserved.
  *
  * This software may be freely redistributed under the terms of the
  * GNU General Public License.
@@ -16,85 +16,187 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/init.h>
-#include "server.h"
-#include "vnode.h"
+#include <linux/circ_buf.h>
 #include "internal.h"
-#include "cmservice.h"
 
-/*****************************************************************************/
+unsigned afs_vnode_update_timeout = 10;
+
+#define afs_breakring_space(server) \
+	CIRC_SPACE((server)->cb_break_head, (server)->cb_break_tail, \
+		   ARRAY_SIZE((server)->cb_break))
+
+//static void afs_callback_updater(struct work_struct *);
+
+static struct workqueue_struct *afs_callback_update_worker;
+
 /*
  * allow the fileserver to request callback state (re-)initialisation
  */
-int SRXAFSCM_InitCallBackState(struct afs_server *server)
+void afs_init_callback_state(struct afs_server *server)
 {
-	struct list_head callbacks;
+	struct afs_vnode *vnode;
 
-	_enter("%p", server);
+	_enter("{%p}", server);
 
-	INIT_LIST_HEAD(&callbacks);
-
-	/* transfer the callback list from the server to a temp holding area */
 	spin_lock(&server->cb_lock);
 
-	list_add(&callbacks, &server->cb_promises);
-	list_del_init(&server->cb_promises);
+	/* kill all the promises on record from this server */
+	while (!RB_EMPTY_ROOT(&server->cb_promises)) {
+		vnode = rb_entry(server->cb_promises.rb_node,
+				 struct afs_vnode, cb_promise);
+		_debug("UNPROMISE { vid=%x vn=%u uq=%u}",
+		       vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique);
+		rb_erase(&vnode->cb_promise, &server->cb_promises);
+		vnode->cb_promised = false;
+	}
 
-	/* munch our way through the list, grabbing the inode, dropping all the
-	 * locks and regetting them in the right order
-	 */
-	while (!list_empty(&callbacks)) {
-		struct afs_vnode *vnode;
-		struct inode *inode;
+	spin_unlock(&server->cb_lock);
+	_leave("");
+}
 
-		vnode = list_entry(callbacks.next, struct afs_vnode, cb_link);
-		list_del_init(&vnode->cb_link);
+/*
+ * handle the data invalidation side of a callback being broken
+ */
+void afs_broken_callback_work(struct work_struct *work)
+{
+	struct afs_vnode *vnode =
+		container_of(work, struct afs_vnode, cb_broken_work);
 
-		/* try and grab the inode - may fail */
-		inode = igrab(AFS_VNODE_TO_I(vnode));
-		if (inode) {
-			int release = 0;
+	_enter("");
 
-			spin_unlock(&server->cb_lock);
-			spin_lock(&vnode->lock);
+	if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
+		return;
 
-			if (vnode->cb_server == server) {
-				vnode->cb_server = NULL;
-				afs_kafstimod_del_timer(&vnode->cb_timeout);
-				spin_lock(&afs_cb_hash_lock);
-				list_del_init(&vnode->cb_hash_link);
-				spin_unlock(&afs_cb_hash_lock);
-				release = 1;
-			}
+	/* we're only interested in dealing with a broken callback on *this*
+	 * vnode and only if no-one else has dealt with it yet */
+	if (!mutex_trylock(&vnode->validate_lock))
+		return; /* someone else is dealing with it */
 
-			spin_unlock(&vnode->lock);
+	if (test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags)) {
+		if (S_ISDIR(vnode->vfs_inode.i_mode))
+			afs_clear_permits(vnode);
 
-			iput(inode);
-			afs_put_server(server);
+		if (afs_vnode_fetch_status(vnode, NULL, NULL) < 0)
+			goto out;
 
-			spin_lock(&server->cb_lock);
+		if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
+			goto out;
+
+		/* if the vnode's data version number changed then its contents
+		 * are different */
+		if (test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags)) {
+			_debug("zap data {%x:%u}",
+			       vnode->fid.vid, vnode->fid.vnode);
+			invalidate_remote_inode(&vnode->vfs_inode);
 		}
 	}
 
-	spin_unlock(&server->cb_lock);
+out:
+	mutex_unlock(&vnode->validate_lock);
 
-	_leave(" = 0");
-	return 0;
-} /* end SRXAFSCM_InitCallBackState() */
+	/* avoid the potential race whereby the mutex_trylock() in this
+	 * function happens again between the clear_bit() and the
+	 * mutex_unlock() */
+	if (test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags)) {
+		_debug("requeue");
+		queue_work(afs_callback_update_worker, &vnode->cb_broken_work);
+	}
+	_leave("");
+}
+
+/*
+ * actually break a callback
+ */
+static void afs_break_callback(struct afs_server *server,
+			       struct afs_vnode *vnode)
+{
+	_enter("");
+
+	set_bit(AFS_VNODE_CB_BROKEN, &vnode->flags);
+
+	if (vnode->cb_promised) {
+		spin_lock(&vnode->lock);
+
+		_debug("break callback");
+
+		spin_lock(&server->cb_lock);
+		if (vnode->cb_promised) {
+			rb_erase(&vnode->cb_promise, &server->cb_promises);
+			vnode->cb_promised = false;
+		}
+		spin_unlock(&server->cb_lock);
+
+		queue_work(afs_callback_update_worker, &vnode->cb_broken_work);
+		spin_unlock(&vnode->lock);
+	}
+}
+
+/*
+ * allow the fileserver to explicitly break one callback
+ * - happens when
+ *   - the backing file is changed
+ *   - a lock is released
+ */
+static void afs_break_one_callback(struct afs_server *server,
+				   struct afs_fid *fid)
+{
+	struct afs_vnode *vnode;
+	struct rb_node *p;
+
+	_debug("find");
+	spin_lock(&server->fs_lock);
+	p = server->fs_vnodes.rb_node;
+	while (p) {
+		vnode = rb_entry(p, struct afs_vnode, server_rb);
+		if (fid->vid < vnode->fid.vid)
+			p = p->rb_left;
+		else if (fid->vid > vnode->fid.vid)
+			p = p->rb_right;
+		else if (fid->vnode < vnode->fid.vnode)
+			p = p->rb_left;
+		else if (fid->vnode > vnode->fid.vnode)
+			p = p->rb_right;
+		else if (fid->unique < vnode->fid.unique)
+			p = p->rb_left;
+		else if (fid->unique > vnode->fid.unique)
+			p = p->rb_right;
+		else
+			goto found;
+	}
+
+	/* not found so we just ignore it (it may have moved to another
+	 * server) */
+not_available:
+	_debug("not avail");
+	spin_unlock(&server->fs_lock);
+	_leave("");
+	return;
+
+found:
+	_debug("found");
+	ASSERTCMP(server, ==, vnode->server);
+
+	if (!igrab(AFS_VNODE_TO_I(vnode)))
+		goto not_available;
+	spin_unlock(&server->fs_lock);
+
+	afs_break_callback(server, vnode);
+	iput(&vnode->vfs_inode);
+	_leave("");
+}
 
-/*****************************************************************************/
 /*
  * allow the fileserver to break callback promises
  */
-int SRXAFSCM_CallBack(struct afs_server *server, size_t count,
-		      struct afs_callback callbacks[])
+void afs_break_callbacks(struct afs_server *server, size_t count,
+			 struct afs_callback callbacks[])
 {
-	_enter("%p,%u,", server, count);
+	_enter("%p,%zu,", server, count);
 
-	for (; count > 0; callbacks++, count--) {
-		struct afs_vnode *vnode = NULL;
-		struct inode *inode = NULL;
-		int valid = 0;
+	ASSERT(server != NULL);
+	ASSERTCMP(count, <=, AFSCBMAX);
 
+	for (; count > 0; callbacks++, count--) {
 		_debug("- Fid { vl=%08x n=%u u=%u } CB { v=%u x=%u t=%u }",
 		       callbacks->fid.vid,
 		       callbacks->fid.vnode,
@@ -103,67 +205,270 @@ int SRXAFSCM_CallBack(struct afs_server *server, size_t count,
 		       callbacks->expiry,
 		       callbacks->type
 		       );
+		afs_break_one_callback(server, &callbacks->fid);
+	}
 
-		/* find the inode for this fid */
-		spin_lock(&afs_cb_hash_lock);
+	_leave("");
+	return;
+}
 
-		list_for_each_entry(vnode,
-				    &afs_cb_hash(server, &callbacks->fid),
-				    cb_hash_link) {
-			if (memcmp(&vnode->fid, &callbacks->fid,
-				   sizeof(struct afs_fid)) != 0)
-				continue;
+/*
+ * record the callback for breaking
+ * - the caller must hold server->cb_lock
+ */
+static void afs_do_give_up_callback(struct afs_server *server,
+				    struct afs_vnode *vnode)
+{
+	struct afs_callback *cb;
 
-			/* right vnode, but is it same server? */
-			if (vnode->cb_server != server)
-				break; /* no */
+	_enter("%p,%p", server, vnode);
 
-			/* try and nail the inode down */
-			inode = igrab(AFS_VNODE_TO_I(vnode));
-			break;
+	cb = &server->cb_break[server->cb_break_head];
+	cb->fid = vnode->fid;
+	cb->version = vnode->cb_version;
+	cb->expiry = vnode->cb_expiry;
+	cb->type = vnode->cb_type;
+	smp_wmb();
+	server->cb_break_head =
+		(server->cb_break_head + 1) &
+		(ARRAY_SIZE(server->cb_break) - 1);
+
+	/* defer the breaking of callbacks to try and collect as many as
+	 * possible to ship in one operation */
+	switch (atomic_inc_return(&server->cb_break_n)) {
+	case 1 ... AFSCBMAX - 1:
+		queue_delayed_work(afs_callback_update_worker,
+				   &server->cb_break_work, HZ * 2);
+		break;
+	case AFSCBMAX:
+		afs_flush_callback_breaks(server);
+		break;
+	default:
+		break;
+	}
+
+	ASSERT(server->cb_promises.rb_node != NULL);
+	rb_erase(&vnode->cb_promise, &server->cb_promises);
+	vnode->cb_promised = false;
+	_leave("");
+}
+
+/*
+ * discard the callback on a deleted item
+ */
+void afs_discard_callback_on_delete(struct afs_vnode *vnode)
+{
+	struct afs_server *server = vnode->server;
+
+	_enter("%d", vnode->cb_promised);
+
+	if (!vnode->cb_promised) {
+		_leave(" [not promised]");
+		return;
+	}
+
+	ASSERT(server != NULL);
+
+	spin_lock(&server->cb_lock);
+	if (vnode->cb_promised) {
+		ASSERT(server->cb_promises.rb_node != NULL);
+		rb_erase(&vnode->cb_promise, &server->cb_promises);
+		vnode->cb_promised = false;
+	}
+	spin_unlock(&server->cb_lock);
+	_leave("");
+}
+
+/*
+ * give up the callback registered for a vnode on the file server when the
+ * inode is being cleared
+ */
+void afs_give_up_callback(struct afs_vnode *vnode)
+{
+	struct afs_server *server = vnode->server;
+
+	DECLARE_WAITQUEUE(myself, current);
+
+	_enter("%d", vnode->cb_promised);
+
+	_debug("GIVE UP INODE %p", &vnode->vfs_inode);
+
+	if (!vnode->cb_promised) {
+		_leave(" [not promised]");
+		return;
+	}
+
+	ASSERT(server != NULL);
+
+	spin_lock(&server->cb_lock);
+	if (vnode->cb_promised && afs_breakring_space(server) == 0) {
+		add_wait_queue(&server->cb_break_waitq, &myself);
+		for (;;) {
+			set_current_state(TASK_UNINTERRUPTIBLE);
+			if (!vnode->cb_promised ||
+			    afs_breakring_space(server) != 0)
+				break;
+			spin_unlock(&server->cb_lock);
+			schedule();
+			spin_lock(&server->cb_lock);
 		}
+		remove_wait_queue(&server->cb_break_waitq, &myself);
+		__set_current_state(TASK_RUNNING);
+	}
+
+	/* of course, it's always possible for the server to break this vnode's
+	 * callback first... */
+	if (vnode->cb_promised)
+		afs_do_give_up_callback(server, vnode);
+
+	spin_unlock(&server->cb_lock);
+	_leave("");
+}
+
+/*
+ * dispatch a deferred give up callbacks operation
+ */
+void afs_dispatch_give_up_callbacks(struct work_struct *work)
+{
+	struct afs_server *server =
+		container_of(work, struct afs_server, cb_break_work.work);
+
+	_enter("");
+
+	/* tell the fileserver to discard the callback promises it has
+	 * - in the event of ENOMEM or some other error, we just forget that we
+	 *   had callbacks entirely, and the server will call us later to break
+	 *   them
+	 */
+	afs_fs_give_up_callbacks(server, &afs_async_call);
+}
+
+/*
+ * flush the outstanding callback breaks on a server
+ */
+void afs_flush_callback_breaks(struct afs_server *server)
+{
+	cancel_delayed_work(&server->cb_break_work);
+	queue_delayed_work(afs_callback_update_worker,
+			   &server->cb_break_work, 0);
+}
 
-		spin_unlock(&afs_cb_hash_lock);
-
-		if (inode) {
-			/* we've found the record for this vnode */
-			spin_lock(&vnode->lock);
-			if (vnode->cb_server == server) {
-				/* the callback _is_ on the calling server */
-				vnode->cb_server = NULL;
-				valid = 1;
-
-				afs_kafstimod_del_timer(&vnode->cb_timeout);
-				vnode->flags |= AFS_VNODE_CHANGED;
-
-				spin_lock(&server->cb_lock);
-				list_del_init(&vnode->cb_link);
-				spin_unlock(&server->cb_lock);
-
-				spin_lock(&afs_cb_hash_lock);
-				list_del_init(&vnode->cb_hash_link);
-				spin_unlock(&afs_cb_hash_lock);
-			}
-			spin_unlock(&vnode->lock);
-
-			if (valid) {
-				invalidate_remote_inode(inode);
-				afs_put_server(server);
-			}
-			iput(inode);
+#if 0
+/*
+ * update a bunch of callbacks
+ */
+static void afs_callback_updater(struct work_struct *work)
+{
+	struct afs_server *server;
+	struct afs_vnode *vnode, *xvnode;
+	time_t now;
+	long timeout;
+	int ret;
+
+	server = container_of(work, struct afs_server, updater);
+
+	_enter("");
+
+	now = get_seconds();
+
+	/* find the first vnode to update */
+	spin_lock(&server->cb_lock);
+	for (;;) {
+		if (RB_EMPTY_ROOT(&server->cb_promises)) {
+			spin_unlock(&server->cb_lock);
+			_leave(" [nothing]");
+			return;
 		}
+
+		vnode = rb_entry(rb_first(&server->cb_promises),
+				 struct afs_vnode, cb_promise);
+		if (atomic_read(&vnode->usage) > 0)
+			break;
+		rb_erase(&vnode->cb_promise, &server->cb_promises);
+		vnode->cb_promised = false;
 	}
 
-	_leave(" = 0");
-	return 0;
-} /* end SRXAFSCM_CallBack() */
+	timeout = vnode->update_at - now;
+	if (timeout > 0) {
+		queue_delayed_work(afs_vnode_update_worker,
+				   &afs_vnode_update, timeout * HZ);
+		spin_unlock(&server->cb_lock);
+		_leave(" [nothing]");
+		return;
+	}
+
+	list_del_init(&vnode->update);
+	atomic_inc(&vnode->usage);
+	spin_unlock(&server->cb_lock);
+
+	/* we can now perform the update */
+	_debug("update %s", vnode->vldb.name);
+	vnode->state = AFS_VL_UPDATING;
+	vnode->upd_rej_cnt = 0;
+	vnode->upd_busy_cnt = 0;
+
+	ret = afs_vnode_update_record(vl, &vldb);
+	switch (ret) {
+	case 0:
+		afs_vnode_apply_update(vl, &vldb);
+		vnode->state = AFS_VL_UPDATING;
+		break;
+	case -ENOMEDIUM:
+		vnode->state = AFS_VL_VOLUME_DELETED;
+		break;
+	default:
+		vnode->state = AFS_VL_UNCERTAIN;
+		break;
+	}
+
+	/* and then reschedule */
+	_debug("reschedule");
+	vnode->update_at = get_seconds() + afs_vnode_update_timeout;
+
+	spin_lock(&server->cb_lock);
+
+	if (!list_empty(&server->cb_promises)) {
+		/* next update in 10 minutes, but wait at least 1 second more
+		 * than the newest record already queued so that we don't spam
+		 * the VL server suddenly with lots of requests
+		 */
+		xvnode = list_entry(server->cb_promises.prev,
+				    struct afs_vnode, update);
+		if (vnode->update_at <= xvnode->update_at)
+			vnode->update_at = xvnode->update_at + 1;
+		xvnode = list_entry(server->cb_promises.next,
+				    struct afs_vnode, update);
+		timeout = xvnode->update_at - now;
+		if (timeout < 0)
+			timeout = 0;
+	} else {
+		timeout = afs_vnode_update_timeout;
+	}
+
+	list_add_tail(&vnode->update, &server->cb_promises);
+
+	_debug("timeout %ld", timeout);
+	queue_delayed_work(afs_vnode_update_worker,
+			   &afs_vnode_update, timeout * HZ);
+	spin_unlock(&server->cb_lock);
+	afs_put_vnode(vl);
+}
+#endif
+
+/*
+ * initialise the callback update process
+ */
+int __init afs_callback_update_init(void)
+{
+	afs_callback_update_worker =
+		create_singlethread_workqueue("kafs_callbackd");
+	return afs_callback_update_worker ? 0 : -ENOMEM;
+}
 
-/*****************************************************************************/
 /*
- * allow the fileserver to see if the cache manager is still alive
+ * shut down the callback update process
  */
-int SRXAFSCM_Probe(struct afs_server *server)
+void afs_callback_update_kill(void)
 {
-	_debug("SRXAFSCM_Probe(%p)\n", server);
-	return 0;
-} /* end SRXAFSCM_Probe() */
+	destroy_workqueue(afs_callback_update_worker);
+}
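
The give-up path above treats server->cb_break as a power-of-two ring: afs_do_give_up_callback() advances cb_break_head with a mask, and afs_give_up_callback() sleeps while CIRC_SPACE() is zero. The macros from <linux/circ_buf.h> are small enough to show in isolation; a self-contained userspace sketch (the ring size of 64 is an assumption standing in for ARRAY_SIZE(server->cb_break)):

#include <stdio.h>

/* The two macros as defined in <linux/circ_buf.h>; size must be a
 * power of two so the wrap is a cheap mask. */
#define CIRC_CNT(head, tail, size)   (((head) - (tail)) & ((size) - 1))
#define CIRC_SPACE(head, tail, size) CIRC_CNT((tail), ((head) + 1), (size))

int main(void)
{
	unsigned int head = 0, tail = 0;
	const unsigned int size = 64; /* assumed ring size */

	/* fill until no space: one slot always stays empty, so that
	 * head == tail unambiguously means "ring empty" */
	while (CIRC_SPACE(head, tail, size) > 0)
		head = (head + 1) & (size - 1);

	printf("%u of %u slots used\n", CIRC_CNT(head, tail, size), size);
	return 0;
}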
diff --git a/fs/afs/cell.c b/fs/afs/cell.c
index 1fc578372759..9b1311a1df51 100644
--- a/fs/afs/cell.c
+++ b/fs/afs/cell.c
@@ -1,4 +1,4 @@
-/* cell.c: AFS cell and server record management
+/* AFS cell and server record management
  *
  * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
@@ -11,15 +11,9 @@
 
 #include <linux/module.h>
 #include <linux/slab.h>
-#include <rxrpc/peer.h>
-#include <rxrpc/connection.h>
-#include "volume.h"
-#include "cell.h"
-#include "server.h"
-#include "transport.h"
-#include "vlclient.h"
-#include "kafstimod.h"
-#include "super.h"
+#include <linux/key.h>
+#include <linux/ctype.h>
+#include <keys/rxrpc-type.h>
 #include "internal.h"
 
 DECLARE_RWSEM(afs_proc_cells_sem);
@@ -28,66 +22,47 @@ LIST_HEAD(afs_proc_cells);
 static struct list_head afs_cells = LIST_HEAD_INIT(afs_cells);
 static DEFINE_RWLOCK(afs_cells_lock);
 static DECLARE_RWSEM(afs_cells_sem); /* add/remove serialisation */
+static DECLARE_WAIT_QUEUE_HEAD(afs_cells_freeable_wq);
 static struct afs_cell *afs_cell_root;
 
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_cell_cache_match(void *target,
-						const void *entry);
-static void afs_cell_cache_update(void *source, void *entry);
-
-struct cachefs_index_def afs_cache_cell_index_def = {
-	.name		= "cell_ix",
-	.data_size	= sizeof(struct afs_cache_cell),
-	.keys[0]	= { CACHEFS_INDEX_KEYS_ASCIIZ, 64 },
-	.match		= afs_cell_cache_match,
-	.update		= afs_cell_cache_update,
-};
-#endif
-
-/*****************************************************************************/
 /*
- * create a cell record
- * - "name" is the name of the cell
- * - "vllist" is a colon separated list of IP addresses in "a.b.c.d" format
+ * allocate a cell record and fill in its name, VL server address list and
+ * allocate an anonymous key
  */
-int afs_cell_create(const char *name, char *vllist, struct afs_cell **_cell)
+static struct afs_cell *afs_cell_alloc(const char *name, char *vllist)
 {
 	struct afs_cell *cell;
-	char *next;
+	size_t namelen;
+	char keyname[4 + AFS_MAXCELLNAME + 1], *cp, *dp, *next;
 	int ret;
 
-	_enter("%s", name);
+	_enter("%s,%s", name, vllist);
 
 	BUG_ON(!name); /* TODO: want to look up "this cell" in the cache */
 
+	namelen = strlen(name);
+	if (namelen > AFS_MAXCELLNAME)
+		return ERR_PTR(-ENAMETOOLONG);
+
 	/* allocate and initialise a cell record */
-	cell = kmalloc(sizeof(struct afs_cell) + strlen(name) + 1, GFP_KERNEL);
+	cell = kzalloc(sizeof(struct afs_cell) + namelen + 1, GFP_KERNEL);
 	if (!cell) {
 		_leave(" = -ENOMEM");
-		return -ENOMEM;
+		return ERR_PTR(-ENOMEM);
 	}
 
-	down_write(&afs_cells_sem);
-
-	memset(cell, 0, sizeof(struct afs_cell));
-	atomic_set(&cell->usage, 0);
+	memcpy(cell->name, name, namelen);
+	cell->name[namelen] = 0;
 
+	atomic_set(&cell->usage, 1);
 	INIT_LIST_HEAD(&cell->link);
-
-	rwlock_init(&cell->sv_lock);
-	INIT_LIST_HEAD(&cell->sv_list);
-	INIT_LIST_HEAD(&cell->sv_graveyard);
-	spin_lock_init(&cell->sv_gylock);
-
+	rwlock_init(&cell->servers_lock);
+	INIT_LIST_HEAD(&cell->servers);
 	init_rwsem(&cell->vl_sem);
 	INIT_LIST_HEAD(&cell->vl_list);
-	INIT_LIST_HEAD(&cell->vl_graveyard);
-	spin_lock_init(&cell->vl_gylock);
-
-	strcpy(cell->name,name);
+	spin_lock_init(&cell->vl_lock);
 
 	/* fill in the VL server list from the rest of the string */
-	ret = -EINVAL;
 	do {
 		unsigned a, b, c, d;
 
@@ -96,20 +71,75 @@ int afs_cell_create(const char *name, char *vllist, struct afs_cell **_cell)
96 *next++ = 0; 71 *next++ = 0;
97 72
98 if (sscanf(vllist, "%u.%u.%u.%u", &a, &b, &c, &d) != 4) 73 if (sscanf(vllist, "%u.%u.%u.%u", &a, &b, &c, &d) != 4)
99 goto badaddr; 74 goto bad_address;
100 75
101 if (a > 255 || b > 255 || c > 255 || d > 255) 76 if (a > 255 || b > 255 || c > 255 || d > 255)
102 goto badaddr; 77 goto bad_address;
103 78
104 cell->vl_addrs[cell->vl_naddrs++].s_addr = 79 cell->vl_addrs[cell->vl_naddrs++].s_addr =
105 htonl((a << 24) | (b << 16) | (c << 8) | d); 80 htonl((a << 24) | (b << 16) | (c << 8) | d);
106 81
107 if (cell->vl_naddrs >= AFS_CELL_MAX_ADDRS) 82 } while (cell->vl_naddrs < AFS_CELL_MAX_ADDRS && (vllist = next));
108 break; 83
84 /* create a key to represent an anonymous user */
85 memcpy(keyname, "afs@", 4);
86 dp = keyname + 4;
87 cp = cell->name;
88 do {
89 *dp++ = toupper(*cp);
90 } while (*cp++);
91 cell->anonymous_key = key_alloc(&key_type_rxrpc, keyname, 0, 0, current,
92 KEY_POS_SEARCH, KEY_ALLOC_NOT_IN_QUOTA);
93 if (IS_ERR(cell->anonymous_key)) {
94 _debug("no key");
95 ret = PTR_ERR(cell->anonymous_key);
96 goto error;
97 }
98
99 ret = key_instantiate_and_link(cell->anonymous_key, NULL, 0,
100 NULL, NULL);
101 if (ret < 0) {
102 _debug("instantiate failed");
103 goto error;
104 }
105
106 _debug("anon key %p{%x}",
107 cell->anonymous_key, key_serial(cell->anonymous_key));
108
109 _leave(" = %p", cell);
110 return cell;
111
112bad_address:
113 printk(KERN_ERR "kAFS: bad VL server IP address\n");
114 ret = -EINVAL;
115error:
116 key_put(cell->anonymous_key);
117 kfree(cell);
118 _leave(" = %d", ret);
119 return ERR_PTR(ret);
120}
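
[Editor's sketch] The loop above takes the VL server list apart with strchr()/sscanf(), splitting on colons and range-checking each dotted quad. A minimal userspace sketch of the same parsing, assuming a writable input string and a plain uint32_t result array (names here are illustrative, not from the patch):

	#include <arpa/inet.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	#define MAX_ADDRS 15

	/* parse up to MAX_ADDRS colon-separated "a.b.c.d" entries in place;
	 * returns the number parsed, or -1 on a malformed entry */
	static int parse_vllist(char *vllist, uint32_t addrs[MAX_ADDRS])
	{
		unsigned a, b, c, d;
		char *next;
		int n = 0;

		do {
			next = strchr(vllist, ':');
			if (next)
				*next++ = 0;	/* terminate this entry */

			if (sscanf(vllist, "%u.%u.%u.%u", &a, &b, &c, &d) != 4)
				return -1;
			if (a > 255 || b > 255 || c > 255 || d > 255)
				return -1;

			/* store in network byte order, as the kernel code does */
			addrs[n++] = htonl((a << 24) | (b << 16) | (c << 8) | d);
		} while (n < MAX_ADDRS && (vllist = next));

		return n;
	}
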
121
122/*
123 * create a cell record
124 * - "name" is the name of the cell
125 * - "vllist" is a colon separated list of IP addresses in "a.b.c.d" format
126 */
127struct afs_cell *afs_cell_create(const char *name, char *vllist)
128{
129 struct afs_cell *cell;
130 int ret;
131
132 _enter("%s,%s", name, vllist);
109 133
110 } while(vllist = next, vllist); 134 cell = afs_cell_alloc(name, vllist);
135 if (IS_ERR(cell)) {
136 _leave(" = %ld", PTR_ERR(cell));
137 return cell;
138 }
139
140 down_write(&afs_cells_sem);
111 141
112 /* add a proc dir for this cell */ 142 /* add a proc directory for this cell */
113 ret = afs_proc_cell_setup(cell); 143 ret = afs_proc_cell_setup(cell);
114 if (ret < 0) 144 if (ret < 0)
115 goto error; 145 goto error;
@@ -130,31 +160,28 @@ int afs_cell_create(const char *name, char *vllist, struct afs_cell **_cell)
130 down_write(&afs_proc_cells_sem); 160 down_write(&afs_proc_cells_sem);
131 list_add_tail(&cell->proc_link, &afs_proc_cells); 161 list_add_tail(&cell->proc_link, &afs_proc_cells);
132 up_write(&afs_proc_cells_sem); 162 up_write(&afs_proc_cells_sem);
133
134 *_cell = cell;
135 up_write(&afs_cells_sem); 163 up_write(&afs_cells_sem);
136 164
137 _leave(" = 0 (%p)", cell); 165 _leave(" = %p", cell);
138 return 0; 166 return cell;
139 167
140 badaddr: 168error:
141 printk(KERN_ERR "kAFS: bad VL server IP address: '%s'\n", vllist);
142 error:
143 up_write(&afs_cells_sem); 169 up_write(&afs_cells_sem);
170 key_put(cell->anonymous_key);
144 kfree(cell); 171 kfree(cell);
145 _leave(" = %d", ret); 172 _leave(" = %d", ret);
146 return ret; 173 return ERR_PTR(ret);
147} /* end afs_cell_create() */ 174}
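
[Editor's sketch] A recurring change in this patch is the switch from returning 0/-errno through an output parameter to returning the object itself with the errno encoded in the pointer. A minimal kernel-style sketch of that ERR_PTR() idiom (the foo names are illustrative):

	#include <linux/err.h>
	#include <linux/slab.h>

	struct foo { int x; };

	static struct foo *foo_create(int x)
	{
		struct foo *f = kzalloc(sizeof(*f), GFP_KERNEL);

		if (!f)
			return ERR_PTR(-ENOMEM);  /* errno encoded in the pointer */
		f->x = x;
		return f;
	}

	/* caller side: test with IS_ERR(), decode with PTR_ERR() */
	static int foo_user(void)
	{
		struct foo *f = foo_create(1);

		if (IS_ERR(f))
			return PTR_ERR(f);
		kfree(f);
		return 0;
	}

This removes the possibility of the out-parameter and the return code disagreeing, which is why afs_cell_create() and afs_cell_lookup() are both converted.
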
148 175
149/*****************************************************************************/
150/* 176/*
151 * initialise the cell database from module parameters 177 * set the root cell information
178 * - can be called with a module parameter string
179 * - can be called from a write to /proc/fs/afs/rootcell
152 */ 180 */
153int afs_cell_init(char *rootcell) 181int afs_cell_init(char *rootcell)
154{ 182{
155 struct afs_cell *old_root, *new_root; 183 struct afs_cell *old_root, *new_root;
156 char *cp; 184 char *cp;
157 int ret;
158 185
159 _enter(""); 186 _enter("");
160 187
@@ -162,82 +189,60 @@ int afs_cell_init(char *rootcell)
162 /* module is loaded with no parameters, or built statically. 189 /* module is loaded with no parameters, or built statically.
163 * - in the future we might initialize cell DB here. 190 * - in the future we might initialize cell DB here.
164 */ 191 */
165 _leave(" = 0 (but no root)"); 192 _leave(" = 0 [no root]");
166 return 0; 193 return 0;
167 } 194 }
168 195
169 cp = strchr(rootcell, ':'); 196 cp = strchr(rootcell, ':');
170 if (!cp) { 197 if (!cp) {
171 printk(KERN_ERR "kAFS: no VL server IP addresses specified\n"); 198 printk(KERN_ERR "kAFS: no VL server IP addresses specified\n");
172 _leave(" = %d (no colon)", -EINVAL); 199 _leave(" = -EINVAL");
173 return -EINVAL; 200 return -EINVAL;
174 } 201 }
175 202
176 /* allocate a cell record for the root cell */ 203 /* allocate a cell record for the root cell */
177 *cp++ = 0; 204 *cp++ = 0;
178 ret = afs_cell_create(rootcell, cp, &new_root); 205 new_root = afs_cell_create(rootcell, cp);
179 if (ret < 0) { 206 if (IS_ERR(new_root)) {
180 _leave(" = %d", ret); 207 _leave(" = %ld", PTR_ERR(new_root));
181 return ret; 208 return PTR_ERR(new_root);
182 } 209 }
183 210
184 /* as afs_put_cell() takes locks by itself, we have to do 211 /* install the new cell */
185 * a little gymnastics to be race-free.
186 */
187 afs_get_cell(new_root);
188
189 write_lock(&afs_cells_lock); 212 write_lock(&afs_cells_lock);
190 while (afs_cell_root) { 213 old_root = afs_cell_root;
191 old_root = afs_cell_root;
192 afs_cell_root = NULL;
193 write_unlock(&afs_cells_lock);
194 afs_put_cell(old_root);
195 write_lock(&afs_cells_lock);
196 }
197 afs_cell_root = new_root; 214 afs_cell_root = new_root;
198 write_unlock(&afs_cells_lock); 215 write_unlock(&afs_cells_lock);
216 afs_put_cell(old_root);
199 217
200 _leave(" = %d", ret); 218 _leave(" = 0");
201 return ret; 219 return 0;
202 220}
203} /* end afs_cell_init() */
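
[Editor's sketch] afs_cell_init() cuts the "cellname:vlserverlist" string at the first colon, in place, before handing the two halves to afs_cell_create(); it can then install the result with a plain pointer swap under afs_cells_lock, because afs_put_cell() on the old root is only called after the lock is dropped. The split, in sketch form, assuming a writable buffer:

	#include <string.h>

	/* split "name:vllist" in place; returns 0 and sets *vllist,
	 * or -1 if no colon is present */
	static int split_rootcell(char *rootcell, char **vllist)
	{
		char *cp = strchr(rootcell, ':');

		if (!cp)
			return -1;	/* no VL server addresses given */
		*cp++ = 0;		/* NUL-terminate the cell name */
		*vllist = cp;		/* remainder is the address list */
		return 0;
	}
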
204 221
205/*****************************************************************************/
206/* 222/*
207 * lookup a cell record 223 * lookup a cell record
208 */ 224 */
209int afs_cell_lookup(const char *name, unsigned namesz, struct afs_cell **_cell) 225struct afs_cell *afs_cell_lookup(const char *name, unsigned namesz)
210{ 226{
211 struct afs_cell *cell; 227 struct afs_cell *cell;
212 int ret;
213 228
214 _enter("\"%*.*s\",", namesz, namesz, name ? name : ""); 229 _enter("\"%*.*s\",", namesz, namesz, name ? name : "");
215 230
216 *_cell = NULL; 231 down_read(&afs_cells_sem);
232 read_lock(&afs_cells_lock);
217 233
218 if (name) { 234 if (name) {
219 /* if the cell was named, look for it in the cell record list */ 235 /* if the cell was named, look for it in the cell record list */
220 ret = -ENOENT;
221 cell = NULL;
222 read_lock(&afs_cells_lock);
223
224 list_for_each_entry(cell, &afs_cells, link) { 236 list_for_each_entry(cell, &afs_cells, link) {
225 if (strncmp(cell->name, name, namesz) == 0) { 237 if (strncmp(cell->name, name, namesz) == 0) {
226 afs_get_cell(cell); 238 afs_get_cell(cell);
227 goto found; 239 goto found;
228 } 240 }
229 } 241 }
230 cell = NULL; 242 cell = ERR_PTR(-ENOENT);
231 found: 243 found:
232 244 ;
233 read_unlock(&afs_cells_lock); 245 } else {
234
235 if (cell)
236 ret = 0;
237 }
238 else {
239 read_lock(&afs_cells_lock);
240
241 cell = afs_cell_root; 246 cell = afs_cell_root;
242 if (!cell) { 247 if (!cell) {
243 /* this should not happen unless user tries to mount 248 /* this should not happen unless user tries to mount
@@ -246,44 +251,35 @@ int afs_cell_lookup(const char *name, unsigned namesz, struct afs_cell **_cell)
246 * ENOENT might be "more appropriate" but they happen 251 * ENOENT might be "more appropriate" but they happen
247 * for other reasons. 252 * for other reasons.
248 */ 253 */
249 ret = -EDESTADDRREQ; 254 cell = ERR_PTR(-EDESTADDRREQ);
250 } 255 } else {
251 else {
252 afs_get_cell(cell); 256 afs_get_cell(cell);
253 ret = 0;
254 } 257 }
255 258
256 read_unlock(&afs_cells_lock);
257 } 259 }
258 260
259 *_cell = cell; 261 read_unlock(&afs_cells_lock);
260 _leave(" = %d (%p)", ret, cell); 262 up_read(&afs_cells_sem);
261 return ret; 263 _leave(" = %p", cell);
262 264 return cell;
263} /* end afs_cell_lookup() */ 265}
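
[Editor's sketch] The lookup bumps the usage count while still inside the read-locked list walk, so a racing afs_put_cell() cannot free the cell between the name match and the reference grab. Condensed to its essentials (types and names illustrative, not buildable standalone):

	#include <linux/atomic.h>
	#include <linux/err.h>
	#include <linux/list.h>
	#include <linux/spinlock.h>
	#include <linux/string.h>

	struct cell {
		atomic_t		usage;
		struct list_head	link;
		char			name[];
	};

	static LIST_HEAD(cells);
	static DEFINE_RWLOCK(cells_lock);

	/* sketch: take the reference before dropping the list lock */
	static struct cell *cell_lookup(const char *name, size_t len)
	{
		struct cell *c, *ret = ERR_PTR(-ENOENT);

		read_lock(&cells_lock);
		list_for_each_entry(c, &cells, link) {
			if (strncmp(c->name, name, len) == 0) {
				atomic_inc(&c->usage);	/* ref taken under lock */
				ret = c;
				break;
			}
		}
		read_unlock(&cells_lock);
		return ret;
	}
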
264 266
265/*****************************************************************************/
266/* 267/*
267 * try and get a cell record 268 * try and get a cell record
268 */ 269 */
269struct afs_cell *afs_get_cell_maybe(struct afs_cell **_cell) 270struct afs_cell *afs_get_cell_maybe(struct afs_cell *cell)
270{ 271{
271 struct afs_cell *cell;
272
273 write_lock(&afs_cells_lock); 272 write_lock(&afs_cells_lock);
274 273
275 cell = *_cell;
276 if (cell && !list_empty(&cell->link)) 274 if (cell && !list_empty(&cell->link))
277 afs_get_cell(cell); 275 afs_get_cell(cell);
278 else 276 else
279 cell = NULL; 277 cell = NULL;
280 278
281 write_unlock(&afs_cells_lock); 279 write_unlock(&afs_cells_lock);
282
283 return cell; 280 return cell;
284} /* end afs_get_cell_maybe() */ 281}
285 282
286/*****************************************************************************/
287/* 283/*
288 * destroy a cell record 284 * destroy a cell record
289 */ 285 */
@@ -294,8 +290,7 @@ void afs_put_cell(struct afs_cell *cell)
294 290
295 _enter("%p{%d,%s}", cell, atomic_read(&cell->usage), cell->name); 291 _enter("%p{%d,%s}", cell, atomic_read(&cell->usage), cell->name);
296 292
297 /* sanity check */ 293 ASSERTCMP(atomic_read(&cell->usage), >, 0);
298 BUG_ON(atomic_read(&cell->usage) <= 0);
299 294
300 /* to prevent a race, the decrement and the dequeue must be effectively 295 /* to prevent a race, the decrement and the dequeue must be effectively
301 * atomic */ 296 * atomic */
@@ -307,36 +302,49 @@ void afs_put_cell(struct afs_cell *cell)
307 return; 302 return;
308 } 303 }
309 304
305 ASSERT(list_empty(&cell->servers));
306 ASSERT(list_empty(&cell->vl_list));
307
310 write_unlock(&afs_cells_lock); 308 write_unlock(&afs_cells_lock);
311 309
312 BUG_ON(!list_empty(&cell->sv_list)); 310 wake_up(&afs_cells_freeable_wq);
313 BUG_ON(!list_empty(&cell->sv_graveyard));
314 BUG_ON(!list_empty(&cell->vl_list));
315 BUG_ON(!list_empty(&cell->vl_graveyard));
316 311
317 _leave(" [unused]"); 312 _leave(" [unused]");
318} /* end afs_put_cell() */ 313}
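
[Editor's sketch] afs_put_cell() keeps the decrement and the final-reference check under the cells write lock, so afs_get_cell_maybe() can never resurrect a cell whose count has already reached zero; the wake_up() afterwards unblocks a destroyer sleeping in afs_cell_destroy(). The shape of the pattern, reduced (same illustrative types as the lookup sketch above):

	static DECLARE_WAIT_QUEUE_HEAD(cells_freeable_wq);

	static void cell_put(struct cell *cell)
	{
		write_lock(&cells_lock);
		if (!atomic_dec_and_test(&cell->usage)) {
			write_unlock(&cells_lock);	/* still in use */
			return;
		}
		/* count reached zero under the lock: no revival possible */
		write_unlock(&cells_lock);
		wake_up(&cells_freeable_wq);		/* see cell_destroy() */
	}
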
319 314
320/*****************************************************************************/
321/* 315/*
322 * destroy a cell record 316 * destroy a cell record
317 * - must be called with the afs_cells_sem write-locked
318 * - cell->link should have been broken by the caller
323 */ 319 */
324static void afs_cell_destroy(struct afs_cell *cell) 320static void afs_cell_destroy(struct afs_cell *cell)
325{ 321{
326 _enter("%p{%d,%s}", cell, atomic_read(&cell->usage), cell->name); 322 _enter("%p{%d,%s}", cell, atomic_read(&cell->usage), cell->name);
327 323
328 /* to prevent a race, the decrement and the dequeue must be effectively 324 ASSERTCMP(atomic_read(&cell->usage), >=, 0);
329 * atomic */ 325 ASSERT(list_empty(&cell->link));
330 write_lock(&afs_cells_lock);
331 326
332 /* sanity check */ 327 /* wait for everyone to stop using the cell */
333 BUG_ON(atomic_read(&cell->usage) != 0); 328 if (atomic_read(&cell->usage) > 0) {
329 DECLARE_WAITQUEUE(myself, current);
334 330
335 list_del_init(&cell->link); 331 _debug("wait for cell %s", cell->name);
332 set_current_state(TASK_UNINTERRUPTIBLE);
333 add_wait_queue(&afs_cells_freeable_wq, &myself);
336 334
337 write_unlock(&afs_cells_lock); 335 while (atomic_read(&cell->usage) > 0) {
336 schedule();
337 set_current_state(TASK_UNINTERRUPTIBLE);
338 }
338 339
339 down_write(&afs_cells_sem); 340 remove_wait_queue(&afs_cells_freeable_wq, &myself);
341 set_current_state(TASK_RUNNING);
342 }
343
344 _debug("cell dead");
345 ASSERTCMP(atomic_read(&cell->usage), ==, 0);
346 ASSERT(list_empty(&cell->servers));
347 ASSERT(list_empty(&cell->vl_list));
340 348
341 afs_proc_cell_remove(cell); 349 afs_proc_cell_remove(cell);
342 350
@@ -348,104 +356,26 @@ static void afs_cell_destroy(struct afs_cell *cell)
348 cachefs_relinquish_cookie(cell->cache, 0); 356 cachefs_relinquish_cookie(cell->cache, 0);
349#endif 357#endif
350 358
351 up_write(&afs_cells_sem); 359 key_put(cell->anonymous_key);
352
353 BUG_ON(!list_empty(&cell->sv_list));
354 BUG_ON(!list_empty(&cell->sv_graveyard));
355 BUG_ON(!list_empty(&cell->vl_list));
356 BUG_ON(!list_empty(&cell->vl_graveyard));
357
358 /* finish cleaning up the cell */
359 kfree(cell); 360 kfree(cell);
360 361
361 _leave(" [destroyed]"); 362 _leave(" [destroyed]");
362} /* end afs_cell_destroy() */ 363}
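
[Editor's sketch] The destroy path now sleeps on afs_cells_freeable_wq until the last reference goes away, using the classic open-coded wait loop; setting the task state before re-testing the condition is what prevents a lost wakeup between the check and the schedule(). In sketch form:

	/* sketch: block until the usage count drains to zero */
	static void wait_for_cell_unused(struct cell *cell)
	{
		DECLARE_WAITQUEUE(myself, current);

		set_current_state(TASK_UNINTERRUPTIBLE);
		add_wait_queue(&cells_freeable_wq, &myself);

		while (atomic_read(&cell->usage) > 0) {
			schedule();	/* woken by cell_put() hitting zero */
			set_current_state(TASK_UNINTERRUPTIBLE);
		}

		remove_wait_queue(&cells_freeable_wq, &myself);
		set_current_state(TASK_RUNNING);
	}
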
363
364/*****************************************************************************/
365/*
366 * lookup the server record corresponding to an Rx RPC peer
367 */
368int afs_server_find_by_peer(const struct rxrpc_peer *peer,
369 struct afs_server **_server)
370{
371 struct afs_server *server;
372 struct afs_cell *cell;
373
374 _enter("%p{a=%08x},", peer, ntohl(peer->addr.s_addr));
375
376 /* search the cell list */
377 read_lock(&afs_cells_lock);
378
379 list_for_each_entry(cell, &afs_cells, link) {
380
381 _debug("? cell %s",cell->name);
382
383 write_lock(&cell->sv_lock);
384
385 /* check the active list */
386 list_for_each_entry(server, &cell->sv_list, link) {
387 _debug("?? server %08x", ntohl(server->addr.s_addr));
388
389 if (memcmp(&server->addr, &peer->addr,
390 sizeof(struct in_addr)) == 0)
391 goto found_server;
392 }
393 364
394 /* check the inactive list */
395 spin_lock(&cell->sv_gylock);
396 list_for_each_entry(server, &cell->sv_graveyard, link) {
397 _debug("?? dead server %08x",
398 ntohl(server->addr.s_addr));
399
400 if (memcmp(&server->addr, &peer->addr,
401 sizeof(struct in_addr)) == 0)
402 goto found_dead_server;
403 }
404 spin_unlock(&cell->sv_gylock);
405
406 write_unlock(&cell->sv_lock);
407 }
408 read_unlock(&afs_cells_lock);
409
410 _leave(" = -ENOENT");
411 return -ENOENT;
412
413 /* we found it in the graveyard - resurrect it */
414 found_dead_server:
415 list_move_tail(&server->link, &cell->sv_list);
416 afs_get_server(server);
417 afs_kafstimod_del_timer(&server->timeout);
418 spin_unlock(&cell->sv_gylock);
419 goto success;
420
421 /* we found it - increment its ref count and return it */
422 found_server:
423 afs_get_server(server);
424
425 success:
426 write_unlock(&cell->sv_lock);
427 read_unlock(&afs_cells_lock);
428
429 *_server = server;
430 _leave(" = 0 (s=%p c=%p)", server, cell);
431 return 0;
432
433} /* end afs_server_find_by_peer() */
434
435/*****************************************************************************/
436/* 365/*
437 * purge in-memory cell database on module unload or afs_init() failure 366 * purge in-memory cell database on module unload or afs_init() failure
438 * - the timeout daemon is stopped before calling this 367 * - the timeout daemon is stopped before calling this
439 */ 368 */
440void afs_cell_purge(void) 369void afs_cell_purge(void)
441{ 370{
442 struct afs_vlocation *vlocation;
443 struct afs_cell *cell; 371 struct afs_cell *cell;
444 372
445 _enter(""); 373 _enter("");
446 374
447 afs_put_cell(afs_cell_root); 375 afs_put_cell(afs_cell_root);
448 376
377 down_write(&afs_cells_sem);
378
449 while (!list_empty(&afs_cells)) { 379 while (!list_empty(&afs_cells)) {
450 cell = NULL; 380 cell = NULL;
451 381
@@ -464,104 +394,11 @@ void afs_cell_purge(void)
464 _debug("PURGING CELL %s (%d)", 394 _debug("PURGING CELL %s (%d)",
465 cell->name, atomic_read(&cell->usage)); 395 cell->name, atomic_read(&cell->usage));
466 396
467 BUG_ON(!list_empty(&cell->sv_list));
468 BUG_ON(!list_empty(&cell->vl_list));
469
470 /* purge the cell's VL graveyard list */
471 _debug(" - clearing VL graveyard");
472
473 spin_lock(&cell->vl_gylock);
474
475 while (!list_empty(&cell->vl_graveyard)) {
476 vlocation = list_entry(cell->vl_graveyard.next,
477 struct afs_vlocation,
478 link);
479 list_del_init(&vlocation->link);
480
481 afs_kafstimod_del_timer(&vlocation->timeout);
482
483 spin_unlock(&cell->vl_gylock);
484
485 afs_vlocation_do_timeout(vlocation);
486 /* TODO: race if move to use krxtimod instead
487 * of kafstimod */
488
489 spin_lock(&cell->vl_gylock);
490 }
491
492 spin_unlock(&cell->vl_gylock);
493
494 /* purge the cell's server graveyard list */
495 _debug(" - clearing server graveyard");
496
497 spin_lock(&cell->sv_gylock);
498
499 while (!list_empty(&cell->sv_graveyard)) {
500 struct afs_server *server;
501
502 server = list_entry(cell->sv_graveyard.next,
503 struct afs_server, link);
504 list_del_init(&server->link);
505
506 afs_kafstimod_del_timer(&server->timeout);
507
508 spin_unlock(&cell->sv_gylock);
509
510 afs_server_do_timeout(server);
511
512 spin_lock(&cell->sv_gylock);
513 }
514
515 spin_unlock(&cell->sv_gylock);
516
517 /* now the cell should be left with no references */ 397 /* now the cell should be left with no references */
518 afs_cell_destroy(cell); 398 afs_cell_destroy(cell);
519 } 399 }
520 } 400 }
521 401
402 up_write(&afs_cells_sem);
522 _leave(""); 403 _leave("");
523} /* end afs_cell_purge() */ 404}
524
525/*****************************************************************************/
526/*
527 * match a cell record obtained from the cache
528 */
529#ifdef AFS_CACHING_SUPPORT
530static cachefs_match_val_t afs_cell_cache_match(void *target,
531 const void *entry)
532{
533 const struct afs_cache_cell *ccell = entry;
534 struct afs_cell *cell = target;
535
536 _enter("{%s},{%s}", ccell->name, cell->name);
537
538 if (strncmp(ccell->name, cell->name, sizeof(ccell->name)) == 0) {
539 _leave(" = SUCCESS");
540 return CACHEFS_MATCH_SUCCESS;
541 }
542
543 _leave(" = FAILED");
544 return CACHEFS_MATCH_FAILED;
545} /* end afs_cell_cache_match() */
546#endif
547
548/*****************************************************************************/
549/*
550 * update a cell record in the cache
551 */
552#ifdef AFS_CACHING_SUPPORT
553static void afs_cell_cache_update(void *source, void *entry)
554{
555 struct afs_cache_cell *ccell = entry;
556 struct afs_cell *cell = source;
557
558 _enter("%p,%p", source, entry);
559
560 strncpy(ccell->name, cell->name, sizeof(ccell->name));
561
562 memcpy(ccell->vl_servers,
563 cell->vl_addrs,
564 min(sizeof(ccell->vl_servers), sizeof(cell->vl_addrs)));
565
566} /* end afs_cell_cache_update() */
567#endif
diff --git a/fs/afs/cell.h b/fs/afs/cell.h
deleted file mode 100644
index 48349108fb00..000000000000
--- a/fs/afs/cell.h
+++ /dev/null
@@ -1,78 +0,0 @@
1/* cell.h: AFS cell record
2 *
3 * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#ifndef _LINUX_AFS_CELL_H
13#define _LINUX_AFS_CELL_H
14
15#include "types.h"
16#include "cache.h"
17
18#define AFS_CELL_MAX_ADDRS 15
19
20extern volatile int afs_cells_being_purged; /* T when cells are being purged by rmmod */
21
22/*****************************************************************************/
23/*
24 * entry in the cached cell catalogue
25 */
26struct afs_cache_cell
27{
28 char name[64]; /* cell name (padded with NULs) */
29 struct in_addr vl_servers[15]; /* cached cell VL servers */
30};
31
32/*****************************************************************************/
33/*
34 * AFS cell record
35 */
36struct afs_cell
37{
38 atomic_t usage;
39 struct list_head link; /* main cell list link */
40 struct list_head proc_link; /* /proc cell list link */
41 struct proc_dir_entry *proc_dir; /* /proc dir for this cell */
42#ifdef AFS_CACHING_SUPPORT
43 struct cachefs_cookie *cache; /* caching cookie */
44#endif
45
46 /* server record management */
47 rwlock_t sv_lock; /* active server list lock */
48 struct list_head sv_list; /* active server list */
49 struct list_head sv_graveyard; /* inactive server list */
50 spinlock_t sv_gylock; /* inactive server list lock */
51
52 /* volume location record management */
53 struct rw_semaphore vl_sem; /* volume management serialisation semaphore */
54 struct list_head vl_list; /* cell's active VL record list */
55 struct list_head vl_graveyard; /* cell's inactive VL record list */
56 spinlock_t vl_gylock; /* graveyard lock */
57 unsigned short vl_naddrs; /* number of VL servers in addr list */
58 unsigned short vl_curr_svix; /* current server index */
59 struct in_addr vl_addrs[AFS_CELL_MAX_ADDRS]; /* cell VL server addresses */
60
61 char name[0]; /* cell name - must go last */
62};
63
64extern int afs_cell_init(char *rootcell);
65
66extern int afs_cell_create(const char *name, char *vllist, struct afs_cell **_cell);
67
68extern int afs_cell_lookup(const char *name, unsigned nmsize, struct afs_cell **_cell);
69
70#define afs_get_cell(C) do { atomic_inc(&(C)->usage); } while(0)
71
72extern struct afs_cell *afs_get_cell_maybe(struct afs_cell **_cell);
73
74extern void afs_put_cell(struct afs_cell *cell);
75
76extern void afs_cell_purge(void);
77
78#endif /* _LINUX_AFS_CELL_H */
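
[Editor's sketch] Note the char name[0] member closing struct afs_cell above: the name travels in the same allocation as the record, which is why afs_cell_alloc() sizes its kzalloc() as sizeof(struct afs_cell) + namelen + 1. A standalone sketch of the idiom (modern C spells the member name[]):

	#include <stdlib.h>
	#include <string.h>

	struct record {
		int usage;
		char name[];	/* flexible array member, must be last */
	};

	static struct record *record_alloc(const char *name)
	{
		size_t namelen = strlen(name);
		struct record *r = calloc(1, sizeof(*r) + namelen + 1);

		if (!r)
			return NULL;
		memcpy(r->name, name, namelen);
		r->name[namelen] = 0;	/* explicit NUL, as the patch does */
		return r;
	}
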
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index 3d097fddcb7a..d5b2ad6575bc 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -1,4 +1,4 @@
1/* cmservice.c: AFS Cache Manager Service 1/* AFS Cache Manager Service
2 * 2 *
3 * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved. 3 * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com) 4 * Written by David Howells (dhowells@redhat.com)
@@ -12,641 +12,464 @@
12#include <linux/module.h> 12#include <linux/module.h>
13#include <linux/init.h> 13#include <linux/init.h>
14#include <linux/sched.h> 14#include <linux/sched.h>
15#include <linux/completion.h> 15#include <linux/ip.h>
16#include "server.h"
17#include "cell.h"
18#include "transport.h"
19#include <rxrpc/rxrpc.h>
20#include <rxrpc/transport.h>
21#include <rxrpc/connection.h>
22#include <rxrpc/call.h>
23#include "cmservice.h"
24#include "internal.h" 16#include "internal.h"
17#include "afs_cm.h"
25 18
26static unsigned afscm_usage; /* AFS cache manager usage count */ 19struct workqueue_struct *afs_cm_workqueue;
27static struct rw_semaphore afscm_sem; /* AFS cache manager start/stop semaphore */
28
29static int afscm_new_call(struct rxrpc_call *call);
30static void afscm_attention(struct rxrpc_call *call);
31static void afscm_error(struct rxrpc_call *call);
32static void afscm_aemap(struct rxrpc_call *call);
33
34static void _SRXAFSCM_CallBack(struct rxrpc_call *call);
35static void _SRXAFSCM_InitCallBackState(struct rxrpc_call *call);
36static void _SRXAFSCM_Probe(struct rxrpc_call *call);
37
38typedef void (*_SRXAFSCM_xxxx_t)(struct rxrpc_call *call);
39
40static const struct rxrpc_operation AFSCM_ops[] = {
41 {
42 .id = 204,
43 .asize = RXRPC_APP_MARK_EOF,
44 .name = "CallBack",
45 .user = _SRXAFSCM_CallBack,
46 },
47 {
48 .id = 205,
49 .asize = RXRPC_APP_MARK_EOF,
50 .name = "InitCallBackState",
51 .user = _SRXAFSCM_InitCallBackState,
52 },
53 {
54 .id = 206,
55 .asize = RXRPC_APP_MARK_EOF,
56 .name = "Probe",
57 .user = _SRXAFSCM_Probe,
58 },
59#if 0
60 {
61 .id = 207,
62 .asize = RXRPC_APP_MARK_EOF,
63 .name = "GetLock",
64 .user = _SRXAFSCM_GetLock,
65 },
66 {
67 .id = 208,
68 .asize = RXRPC_APP_MARK_EOF,
69 .name = "GetCE",
70 .user = _SRXAFSCM_GetCE,
71 },
72 {
73 .id = 209,
74 .asize = RXRPC_APP_MARK_EOF,
75 .name = "GetXStatsVersion",
76 .user = _SRXAFSCM_GetXStatsVersion,
77 },
78 {
79 .id = 210,
80 .asize = RXRPC_APP_MARK_EOF,
81 .name = "GetXStats",
82 .user = _SRXAFSCM_GetXStats,
83 }
84#endif
85};
86 20
87static struct rxrpc_service AFSCM_service = { 21static int afs_deliver_cb_init_call_back_state(struct afs_call *,
88 .name = "AFS/CM", 22 struct sk_buff *, bool);
89 .owner = THIS_MODULE, 23static int afs_deliver_cb_init_call_back_state3(struct afs_call *,
90 .link = LIST_HEAD_INIT(AFSCM_service.link), 24 struct sk_buff *, bool);
91 .new_call = afscm_new_call, 25static int afs_deliver_cb_probe(struct afs_call *, struct sk_buff *, bool);
92 .service_id = 1, 26static int afs_deliver_cb_callback(struct afs_call *, struct sk_buff *, bool);
93 .attn_func = afscm_attention, 27static int afs_deliver_cb_get_capabilities(struct afs_call *, struct sk_buff *,
94 .error_func = afscm_error, 28 bool);
95 .aemap_func = afscm_aemap, 29static void afs_cm_destructor(struct afs_call *);
96 .ops_begin = &AFSCM_ops[0],
97 .ops_end = &AFSCM_ops[ARRAY_SIZE(AFSCM_ops)],
98};
99 30
100static DECLARE_COMPLETION(kafscmd_alive);
101static DECLARE_COMPLETION(kafscmd_dead);
102static DECLARE_WAIT_QUEUE_HEAD(kafscmd_sleepq);
103static LIST_HEAD(kafscmd_attention_list);
104static LIST_HEAD(afscm_calls);
105static DEFINE_SPINLOCK(afscm_calls_lock);
106static DEFINE_SPINLOCK(kafscmd_attention_lock);
107static int kafscmd_die;
108
109/*****************************************************************************/
110/* 31/*
111 * AFS Cache Manager kernel thread 32 * CB.CallBack operation type
112 */ 33 */
113static int kafscmd(void *arg) 34static const struct afs_call_type afs_SRXCBCallBack = {
114{ 35 .name = "CB.CallBack",
115 DECLARE_WAITQUEUE(myself, current); 36 .deliver = afs_deliver_cb_callback,
116 37 .abort_to_error = afs_abort_to_error,
117 struct rxrpc_call *call; 38 .destructor = afs_cm_destructor,
118 _SRXAFSCM_xxxx_t func; 39};
119 int die;
120
121 printk(KERN_INFO "kAFS: Started kafscmd %d\n", current->pid);
122
123 daemonize("kafscmd");
124
125 complete(&kafscmd_alive);
126
127 /* loop around looking for things to attend to */
128 do {
129 if (list_empty(&kafscmd_attention_list)) {
130 set_current_state(TASK_INTERRUPTIBLE);
131 add_wait_queue(&kafscmd_sleepq, &myself);
132
133 for (;;) {
134 set_current_state(TASK_INTERRUPTIBLE);
135 if (!list_empty(&kafscmd_attention_list) ||
136 signal_pending(current) ||
137 kafscmd_die)
138 break;
139
140 schedule();
141 }
142
143 remove_wait_queue(&kafscmd_sleepq, &myself);
144 set_current_state(TASK_RUNNING);
145 }
146
147 die = kafscmd_die;
148
149 /* dequeue the next call requiring attention */
150 call = NULL;
151 spin_lock(&kafscmd_attention_lock);
152
153 if (!list_empty(&kafscmd_attention_list)) {
154 call = list_entry(kafscmd_attention_list.next,
155 struct rxrpc_call,
156 app_attn_link);
157 list_del_init(&call->app_attn_link);
158 die = 0;
159 }
160
161 spin_unlock(&kafscmd_attention_lock);
162
163 if (call) {
164 /* act upon it */
165 _debug("@@@ Begin Attend Call %p", call);
166
167 func = call->app_user;
168 if (func)
169 func(call);
170
171 rxrpc_put_call(call);
172
173 _debug("@@@ End Attend Call %p", call);
174 }
175
176 } while(!die);
177
178 /* and that's all */
179 complete_and_exit(&kafscmd_dead, 0);
180
181} /* end kafscmd() */
182 40
183/*****************************************************************************/
184/* 41/*
185 * handle a call coming in to the cache manager 42 * CB.InitCallBackState operation type
186 * - if I want to keep the call, I must increment its usage count
187 * - the return value will be negated and passed back in an abort packet if
188 * non-zero
189 * - serialised by virtue of there only being one krxiod
190 */ 43 */
191static int afscm_new_call(struct rxrpc_call *call) 44static const struct afs_call_type afs_SRXCBInitCallBackState = {
192{ 45 .name = "CB.InitCallBackState",
193 _enter("%p{cid=%u u=%d}", 46 .deliver = afs_deliver_cb_init_call_back_state,
194 call, ntohl(call->call_id), atomic_read(&call->usage)); 47 .abort_to_error = afs_abort_to_error,
195 48 .destructor = afs_cm_destructor,
196 rxrpc_get_call(call); 49};
197
198 /* add to my current call list */
199 spin_lock(&afscm_calls_lock);
200 list_add(&call->app_link,&afscm_calls);
201 spin_unlock(&afscm_calls_lock);
202
203 _leave(" = 0");
204 return 0;
205
206} /* end afscm_new_call() */
207 50
208/*****************************************************************************/
209/* 51/*
210 * queue on the kafscmd queue for attention 52 * CB.InitCallBackState3 operation type
211 */ 53 */
212static void afscm_attention(struct rxrpc_call *call) 54static const struct afs_call_type afs_SRXCBInitCallBackState3 = {
213{ 55 .name = "CB.InitCallBackState3",
214 _enter("%p{cid=%u u=%d}", 56 .deliver = afs_deliver_cb_init_call_back_state3,
215 call, ntohl(call->call_id), atomic_read(&call->usage)); 57 .abort_to_error = afs_abort_to_error,
216 58 .destructor = afs_cm_destructor,
217 spin_lock(&kafscmd_attention_lock); 59};
218
219 if (list_empty(&call->app_attn_link)) {
220 list_add_tail(&call->app_attn_link, &kafscmd_attention_list);
221 rxrpc_get_call(call);
222 }
223
224 spin_unlock(&kafscmd_attention_lock);
225
226 wake_up(&kafscmd_sleepq);
227
228 _leave(" {u=%d}", atomic_read(&call->usage));
229} /* end afscm_attention() */
230 60
231/*****************************************************************************/
232/* 61/*
233 * handle my call being aborted 62 * CB.Probe operation type
234 * - clean up, dequeue and put my ref to the call
235 */ 63 */
236static void afscm_error(struct rxrpc_call *call) 64static const struct afs_call_type afs_SRXCBProbe = {
237{ 65 .name = "CB.Probe",
238 int removed; 66 .deliver = afs_deliver_cb_probe,
239 67 .abort_to_error = afs_abort_to_error,
240 _enter("%p{est=%s ac=%u er=%d}", 68 .destructor = afs_cm_destructor,
241 call, 69};
242 rxrpc_call_error_states[call->app_err_state],
243 call->app_abort_code,
244 call->app_errno);
245
246 spin_lock(&kafscmd_attention_lock);
247
248 if (list_empty(&call->app_attn_link)) {
249 list_add_tail(&call->app_attn_link, &kafscmd_attention_list);
250 rxrpc_get_call(call);
251 }
252
253 spin_unlock(&kafscmd_attention_lock);
254
255 removed = 0;
256 spin_lock(&afscm_calls_lock);
257 if (!list_empty(&call->app_link)) {
258 list_del_init(&call->app_link);
259 removed = 1;
260 }
261 spin_unlock(&afscm_calls_lock);
262
263 if (removed)
264 rxrpc_put_call(call);
265
266 wake_up(&kafscmd_sleepq);
267 70
268 _leave(""); 71/*
269} /* end afscm_error() */ 72 * CB.GetCapabilities operation type
73 */
74static const struct afs_call_type afs_SRXCBGetCapabilites = {
75 .name = "CB.GetCapabilities",
76 .deliver = afs_deliver_cb_get_capabilities,
77 .abort_to_error = afs_abort_to_error,
78 .destructor = afs_cm_destructor,
79};
270 80
271/*****************************************************************************/
272/* 81/*
273 * map afs abort codes to/from Linux error codes 82 * route an incoming cache manager call
274 * - called with call->lock held 83 * - return T if supported, F if not
275 */ 84 */
276static void afscm_aemap(struct rxrpc_call *call) 85bool afs_cm_incoming_call(struct afs_call *call)
277{ 86{
278 switch (call->app_err_state) { 87 u32 operation_id = ntohl(call->operation_ID);
279 case RXRPC_ESTATE_LOCAL_ABORT: 88
280 call->app_abort_code = -call->app_errno; 89 _enter("{CB.OP %u}", operation_id);
281 break; 90
282 case RXRPC_ESTATE_PEER_ABORT: 91 switch (operation_id) {
283 call->app_errno = -ECONNABORTED; 92 case CBCallBack:
284 break; 93 call->type = &afs_SRXCBCallBack;
94 return true;
95 case CBInitCallBackState:
96 call->type = &afs_SRXCBInitCallBackState;
97 return true;
98 case CBInitCallBackState3:
99 call->type = &afs_SRXCBInitCallBackState3;
100 return true;
101 case CBProbe:
102 call->type = &afs_SRXCBProbe;
103 return true;
104 case CBGetCapabilities:
105 call->type = &afs_SRXCBGetCapabilites;
106 return true;
285 default: 107 default:
286 break; 108 return false;
287 } 109 }
288} /* end afscm_aemap() */ 110}
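
[Editor's sketch] The rewritten cache manager replaces the kafscmd daemon and its rxrpc_operation table with per-operation afs_call_type descriptors, selected by a switch on the wire operation ID. A reduced sketch of that dispatch shape (op number 206 is CB.Probe's ID, visible in the old AFSCM_ops table above; everything else illustrative):

	#include <stdbool.h>

	struct call;

	struct call_type {
		const char *name;
		int (*deliver)(struct call *call);
	};

	struct call {
		const struct call_type *type;
	};

	static int deliver_probe(struct call *call)
	{
		return 0;	/* nothing to unmarshall for a probe */
	}

	static const struct call_type srv_probe = {
		.name	 = "CB.Probe",
		.deliver = deliver_probe,
	};

	/* route an incoming call by operation ID; false if unsupported */
	static bool incoming_call(struct call *call, unsigned op)
	{
		switch (op) {
		case 206:	/* CB.Probe */
			call->type = &srv_probe;
			return true;
		default:
			return false;
		}
	}

Keeping the handlers in const type objects means the generic rxrpc glue can drive delivery, abort mapping and destruction without knowing which operation it is running.
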
289 111
290/*****************************************************************************/
291/* 112/*
292 * start the cache manager service if not already started 113 * clean up a cache manager call
293 */ 114 */
294int afscm_start(void) 115static void afs_cm_destructor(struct afs_call *call)
295{ 116{
296 int ret; 117 _enter("");
297
298 down_write(&afscm_sem);
299 if (!afscm_usage) {
300 ret = kernel_thread(kafscmd, NULL, 0);
301 if (ret < 0)
302 goto out;
303
304 wait_for_completion(&kafscmd_alive);
305
306 ret = rxrpc_add_service(afs_transport, &AFSCM_service);
307 if (ret < 0)
308 goto kill;
309
310 afs_kafstimod_add_timer(&afs_mntpt_expiry_timer,
311 afs_mntpt_expiry_timeout * HZ);
312 }
313
314 afscm_usage++;
315 up_write(&afscm_sem);
316
317 return 0;
318
319 kill:
320 kafscmd_die = 1;
321 wake_up(&kafscmd_sleepq);
322 wait_for_completion(&kafscmd_dead);
323
324 out:
325 up_write(&afscm_sem);
326 return ret;
327 118
328} /* end afscm_start() */ 119 afs_put_server(call->server);
120 call->server = NULL;
121 kfree(call->buffer);
122 call->buffer = NULL;
123}
329 124
330/*****************************************************************************/
331/* 125/*
332 * stop the cache manager service 126 * allow the fileserver to see if the cache manager is still alive
333 */ 127 */
334void afscm_stop(void) 128static void SRXAFSCB_CallBack(struct work_struct *work)
335{ 129{
336 struct rxrpc_call *call; 130 struct afs_call *call = container_of(work, struct afs_call, work);
337 131
338 down_write(&afscm_sem); 132 _enter("");
339 133
340 BUG_ON(afscm_usage == 0); 134 /* be sure to send the reply *before* attempting to spam the AFS server
341 afscm_usage--; 135 * with FSFetchStatus requests on the vnodes with broken callbacks lest
136 * the AFS server get into a vicious cycle of trying to break further
137 * callbacks because it hadn't received completion of the CBCallBack op
138 * yet */
139 afs_send_empty_reply(call);
342 140
343 if (afscm_usage == 0) { 141 afs_break_callbacks(call->server, call->count, call->request);
344 /* don't want more incoming calls */ 142 _leave("");
345 rxrpc_del_service(afs_transport, &AFSCM_service); 143}
346
347 /* abort any calls I've still got open (the afscm_error() will
348 * dequeue them) */
349 spin_lock(&afscm_calls_lock);
350 while (!list_empty(&afscm_calls)) {
351 call = list_entry(afscm_calls.next,
352 struct rxrpc_call,
353 app_link);
354 144
355 list_del_init(&call->app_link); 145/*
356 rxrpc_get_call(call); 146 * deliver request data to a CB.CallBack call
357 spin_unlock(&afscm_calls_lock); 147 */
148static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb,
149 bool last)
150{
151 struct afs_callback *cb;
152 struct afs_server *server;
153 struct in_addr addr;
154 __be32 *bp;
155 u32 tmp;
156 int ret, loop;
157
158 _enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
159
160 switch (call->unmarshall) {
161 case 0:
162 call->offset = 0;
163 call->unmarshall++;
164
165 /* extract the FID array and its count in two steps */
166 case 1:
167 _debug("extract FID count");
168 ret = afs_extract_data(call, skb, last, &call->tmp, 4);
169 switch (ret) {
170 case 0: break;
171 case -EAGAIN: return 0;
172 default: return ret;
173 }
358 174
359 rxrpc_call_abort(call, -ESRCH); /* abort, dequeue and 175 call->count = ntohl(call->tmp);
360 * put */ 176 _debug("FID count: %u", call->count);
177 if (call->count > AFSCBMAX)
178 return -EBADMSG;
179
180 call->buffer = kmalloc(call->count * 3 * 4, GFP_KERNEL);
181 if (!call->buffer)
182 return -ENOMEM;
183 call->offset = 0;
184 call->unmarshall++;
185
186 case 2:
187 _debug("extract FID array");
188 ret = afs_extract_data(call, skb, last, call->buffer,
189 call->count * 3 * 4);
190 switch (ret) {
191 case 0: break;
192 case -EAGAIN: return 0;
193 default: return ret;
194 }
361 195
362 _debug("nuking active call %08x.%d", 196 _debug("unmarshall FID array");
363 ntohl(call->conn->conn_id), 197 call->request = kcalloc(call->count,
364 ntohl(call->call_id)); 198 sizeof(struct afs_callback),
365 rxrpc_put_call(call); 199 GFP_KERNEL);
366 rxrpc_put_call(call); 200 if (!call->request)
201 return -ENOMEM;
202
203 cb = call->request;
204 bp = call->buffer;
205 for (loop = call->count; loop > 0; loop--, cb++) {
206 cb->fid.vid = ntohl(*bp++);
207 cb->fid.vnode = ntohl(*bp++);
208 cb->fid.unique = ntohl(*bp++);
209 cb->type = AFSCM_CB_UNTYPED;
210 }
367 211
368 spin_lock(&afscm_calls_lock); 212 call->offset = 0;
213 call->unmarshall++;
214
215 /* extract the callback array and its count in two steps */
216 case 3:
217 _debug("extract CB count");
218 ret = afs_extract_data(call, skb, last, &call->tmp, 4);
219 switch (ret) {
220 case 0: break;
221 case -EAGAIN: return 0;
222 default: return ret;
369 } 223 }
370 spin_unlock(&afscm_calls_lock);
371 224
372 /* get rid of my daemon */ 225 tmp = ntohl(call->tmp);
373 kafscmd_die = 1; 226 _debug("CB count: %u", tmp);
374 wake_up(&kafscmd_sleepq); 227 if (tmp != call->count && tmp != 0)
375 wait_for_completion(&kafscmd_dead); 228 return -EBADMSG;
229 call->offset = 0;
230 call->unmarshall++;
231 if (tmp == 0)
232 goto empty_cb_array;
233
234 case 4:
235 _debug("extract CB array");
236 ret = afs_extract_data(call, skb, last, call->request,
237 call->count * 3 * 4);
238 switch (ret) {
239 case 0: break;
240 case -EAGAIN: return 0;
241 default: return ret;
242 }
376 243
377 /* dispose of any calls waiting for attention */ 244 _debug("unmarshall CB array");
378 spin_lock(&kafscmd_attention_lock); 245 cb = call->request;
379 while (!list_empty(&kafscmd_attention_list)) { 246 bp = call->buffer;
380 call = list_entry(kafscmd_attention_list.next, 247 for (loop = call->count; loop > 0; loop--, cb++) {
381 struct rxrpc_call, 248 cb->version = ntohl(*bp++);
382 app_attn_link); 249 cb->expiry = ntohl(*bp++);
250 cb->type = ntohl(*bp++);
251 }
383 252
384 list_del_init(&call->app_attn_link); 253 empty_cb_array:
385 spin_unlock(&kafscmd_attention_lock); 254 call->offset = 0;
255 call->unmarshall++;
386 256
387 rxrpc_put_call(call); 257 case 5:
258 _debug("trailer");
259 if (skb->len != 0)
260 return -EBADMSG;
261 break;
262 }
388 263
389 spin_lock(&kafscmd_attention_lock); 264 if (!last)
390 } 265 return 0;
391 spin_unlock(&kafscmd_attention_lock);
392 266
393 afs_kafstimod_del_timer(&afs_mntpt_expiry_timer); 267 call->state = AFS_CALL_REPLYING;
394 }
395 268
396 up_write(&afscm_sem); 269 /* we'll need the file server record as that tells us which set of
270 * vnodes to operate upon */
271 memcpy(&addr, &ip_hdr(skb)->saddr, 4);
272 server = afs_find_server(&addr);
273 if (!server)
274 return -ENOTCONN;
275 call->server = server;
397 276
398} /* end afscm_stop() */ 277 INIT_WORK(&call->work, SRXAFSCB_CallBack);
278 schedule_work(&call->work);
279 return 0;
280}
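
[Editor's sketch] afs_deliver_cb_callback() is re-entered each time another skb arrives, so call->unmarshall records which field is being extracted and call->offset how much of it has been copied so far. A stripped-down userspace sketch of the same resumable extraction, assuming a 4-byte count followed by count 12-byte records (byte-order conversion omitted):

	#include <stddef.h>
	#include <string.h>

	enum { GET_COUNT, GET_ARRAY, DONE };

	struct ctx {
		int phase;		/* resumption point (call->unmarshall) */
		size_t offset;		/* bytes of the current field received */
		unsigned count;		/* record count */
		unsigned char buf[50 * 12];
	};

	/* copy more of a field; returns 1 when complete, 0 if short */
	static int extract(struct ctx *c, void *field, size_t want,
			   const unsigned char **data, size_t *len)
	{
		size_t chunk = want - c->offset;

		if (chunk > *len)
			chunk = *len;
		memcpy((unsigned char *)field + c->offset, *data, chunk);
		c->offset += chunk;
		*data += chunk;
		*len -= chunk;
		return c->offset == want;
	}

	/* returns 1 when fully unmarshalled, 0 to wait, -1 on bad message */
	static int deliver(struct ctx *c, const unsigned char *data, size_t len)
	{
		switch (c->phase) {
		case GET_COUNT:
			if (!extract(c, &c->count, 4, &data, &len))
				return 0;	/* wait for the next packet */
			if (c->count > 50)	/* cf. the AFSCBMAX check */
				return -1;
			c->offset = 0;
			c->phase = GET_ARRAY;
			/* fall through */
		case GET_ARRAY:
			if (!extract(c, c->buf, c->count * 12, &data, &len))
				return 0;
			c->phase = DONE;
		}
		return 1;
	}
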
399 281
400/*****************************************************************************/
401/* 282/*
402 * handle the fileserver breaking a set of callbacks 283 * allow the fileserver to request callback state (re-)initialisation
403 */ 284 */
404static void _SRXAFSCM_CallBack(struct rxrpc_call *call) 285static void SRXAFSCB_InitCallBackState(struct work_struct *work)
405{ 286{
406 struct afs_server *server; 287 struct afs_call *call = container_of(work, struct afs_call, work);
407 size_t count, qty, tmp;
408 int ret = 0, removed;
409
410 _enter("%p{acs=%s}", call, rxrpc_call_states[call->app_call_state]);
411
412 server = afs_server_get_from_peer(call->conn->peer);
413
414 switch (call->app_call_state) {
415 /* we've received the last packet
416 * - drain all the data from the call and send the reply
417 */
418 case RXRPC_CSTATE_SRVR_GOT_ARGS:
419 ret = -EBADMSG;
420 qty = call->app_ready_qty;
421 if (qty < 8 || qty > 50 * (6 * 4) + 8)
422 break;
423
424 {
425 struct afs_callback *cb, *pcb;
426 int loop;
427 __be32 *fp, *bp;
428
429 fp = rxrpc_call_alloc_scratch(call, qty);
430
431 /* drag the entire argument block out to the scratch
432 * space */
433 ret = rxrpc_call_read_data(call, fp, qty, 0);
434 if (ret < 0)
435 break;
436
437 /* and unmarshall the parameter block */
438 ret = -EBADMSG;
439 count = ntohl(*fp++);
440 if (count>AFSCBMAX ||
441 (count * (3 * 4) + 8 != qty &&
442 count * (6 * 4) + 8 != qty))
443 break;
444
445 bp = fp + count*3;
446 tmp = ntohl(*bp++);
447 if (tmp > 0 && tmp != count)
448 break;
449 if (tmp == 0)
450 bp = NULL;
451
452 pcb = cb = rxrpc_call_alloc_scratch_s(
453 call, struct afs_callback);
454
455 for (loop = count - 1; loop >= 0; loop--) {
456 pcb->fid.vid = ntohl(*fp++);
457 pcb->fid.vnode = ntohl(*fp++);
458 pcb->fid.unique = ntohl(*fp++);
459 if (bp) {
460 pcb->version = ntohl(*bp++);
461 pcb->expiry = ntohl(*bp++);
462 pcb->type = ntohl(*bp++);
463 }
464 else {
465 pcb->version = 0;
466 pcb->expiry = 0;
467 pcb->type = AFSCM_CB_UNTYPED;
468 }
469 pcb++;
470 }
471
472 /* invoke the actual service routine */
473 ret = SRXAFSCM_CallBack(server, count, cb);
474 if (ret < 0)
475 break;
476 }
477 288
478 /* send the reply */ 289 _enter("{%p}", call->server);
479 ret = rxrpc_call_write_data(call, 0, NULL, RXRPC_LAST_PACKET,
480 GFP_KERNEL, 0, &count);
481 if (ret < 0)
482 break;
483 break;
484
485 /* operation complete */
486 case RXRPC_CSTATE_COMPLETE:
487 call->app_user = NULL;
488 removed = 0;
489 spin_lock(&afscm_calls_lock);
490 if (!list_empty(&call->app_link)) {
491 list_del_init(&call->app_link);
492 removed = 1;
493 }
494 spin_unlock(&afscm_calls_lock);
495 290
496 if (removed) 291 afs_init_callback_state(call->server);
497 rxrpc_put_call(call); 292 afs_send_empty_reply(call);
498 break; 293 _leave("");
294}
499 295
500 /* operation terminated on error */ 296/*
501 case RXRPC_CSTATE_ERROR: 297 * deliver request data to a CB.InitCallBackState call
502 call->app_user = NULL; 298 */
503 break; 299static int afs_deliver_cb_init_call_back_state(struct afs_call *call,
300 struct sk_buff *skb,
301 bool last)
302{
303 struct afs_server *server;
304 struct in_addr addr;
504 305
505 default: 306 _enter(",{%u},%d", skb->len, last);
506 break;
507 }
508 307
509 if (ret < 0) 308 if (skb->len > 0)
510 rxrpc_call_abort(call, ret); 309 return -EBADMSG;
310 if (!last)
311 return 0;
511 312
512 afs_put_server(server); 313 /* no unmarshalling required */
314 call->state = AFS_CALL_REPLYING;
513 315
514 _leave(" = %d", ret); 316 /* we'll need the file server record as that tells us which set of
317 * vnodes to operate upon */
318 memcpy(&addr, &ip_hdr(skb)->saddr, 4);
319 server = afs_find_server(&addr);
320 if (!server)
321 return -ENOTCONN;
322 call->server = server;
515 323
516} /* end _SRXAFSCM_CallBack() */ 324 INIT_WORK(&call->work, SRXAFSCB_InitCallBackState);
325 schedule_work(&call->work);
326 return 0;
327}
517 328
518/*****************************************************************************/
519/* 329/*
520 * handle the fileserver asking us to initialise our callback state 330 * deliver request data to a CB.InitCallBackState3 call
521 */ 331 */
522static void _SRXAFSCM_InitCallBackState(struct rxrpc_call *call) 332static int afs_deliver_cb_init_call_back_state3(struct afs_call *call,
333 struct sk_buff *skb,
334 bool last)
523{ 335{
524 struct afs_server *server; 336 struct afs_server *server;
525 size_t count; 337 struct in_addr addr;
526 int ret = 0, removed;
527 338
528 _enter("%p{acs=%s}", call, rxrpc_call_states[call->app_call_state]); 339 _enter(",{%u},%d", skb->len, last);
529 340
530 server = afs_server_get_from_peer(call->conn->peer); 341 if (!last)
342 return 0;
531 343
532 switch (call->app_call_state) { 344 /* no unmarshalling required */
533 /* we've received the last packet - drain all the data from the 345 call->state = AFS_CALL_REPLYING;
534 * call */
535 case RXRPC_CSTATE_SRVR_GOT_ARGS:
536 /* shouldn't be any args */
537 ret = -EBADMSG;
538 break;
539
540 /* send the reply when asked for it */
541 case RXRPC_CSTATE_SRVR_SND_REPLY:
542 /* invoke the actual service routine */
543 ret = SRXAFSCM_InitCallBackState(server);
544 if (ret < 0)
545 break;
546
547 ret = rxrpc_call_write_data(call, 0, NULL, RXRPC_LAST_PACKET,
548 GFP_KERNEL, 0, &count);
549 if (ret < 0)
550 break;
551 break;
552 346
553 /* operation complete */ 347 /* we'll need the file server record as that tells us which set of
554 case RXRPC_CSTATE_COMPLETE: 348 * vnodes to operate upon */
555 call->app_user = NULL; 349 memcpy(&addr, &ip_hdr(skb)->saddr, 4);
556 removed = 0; 350 server = afs_find_server(&addr);
557 spin_lock(&afscm_calls_lock); 351 if (!server)
558 if (!list_empty(&call->app_link)) { 352 return -ENOTCONN;
559 list_del_init(&call->app_link); 353 call->server = server;
560 removed = 1;
561 }
562 spin_unlock(&afscm_calls_lock);
563 354
564 if (removed) 355 INIT_WORK(&call->work, SRXAFSCB_InitCallBackState);
565 rxrpc_put_call(call); 356 schedule_work(&call->work);
566 break; 357 return 0;
567 358}
568 /* operation terminated on error */
569 case RXRPC_CSTATE_ERROR:
570 call->app_user = NULL;
571 break;
572
573 default:
574 break;
575 }
576
577 if (ret < 0)
578 rxrpc_call_abort(call, ret);
579
580 afs_put_server(server);
581 359
582 _leave(" = %d", ret); 360/*
361 * allow the fileserver to see if the cache manager is still alive
362 */
363static void SRXAFSCB_Probe(struct work_struct *work)
364{
365 struct afs_call *call = container_of(work, struct afs_call, work);
583 366
584} /* end _SRXAFSCM_InitCallBackState() */ 367 _enter("");
368 afs_send_empty_reply(call);
369 _leave("");
370}
585 371
586/*****************************************************************************/
587/* 372/*
588 * handle a probe from a fileserver 373 * deliver request data to a CB.Probe call
589 */ 374 */
590static void _SRXAFSCM_Probe(struct rxrpc_call *call) 375static int afs_deliver_cb_probe(struct afs_call *call, struct sk_buff *skb,
376 bool last)
591{ 377{
592 struct afs_server *server; 378 _enter(",{%u},%d", skb->len, last);
593 size_t count;
594 int ret = 0, removed;
595
596 _enter("%p{acs=%s}", call, rxrpc_call_states[call->app_call_state]);
597 379
598 server = afs_server_get_from_peer(call->conn->peer); 380 if (skb->len > 0)
381 return -EBADMSG;
382 if (!last)
383 return 0;
599 384
600 switch (call->app_call_state) { 385 /* no unmarshalling required */
601 /* we've received the last packet - drain all the data from the 386 call->state = AFS_CALL_REPLYING;
602 * call */
603 case RXRPC_CSTATE_SRVR_GOT_ARGS:
604 /* shouldn't be any args */
605 ret = -EBADMSG;
606 break;
607 387
608 /* send the reply when asked for it */ 388 INIT_WORK(&call->work, SRXAFSCB_Probe);
609 case RXRPC_CSTATE_SRVR_SND_REPLY: 389 schedule_work(&call->work);
610 /* invoke the actual service routine */ 390 return 0;
611 ret = SRXAFSCM_Probe(server); 391}
612 if (ret < 0)
613 break;
614
615 ret = rxrpc_call_write_data(call, 0, NULL, RXRPC_LAST_PACKET,
616 GFP_KERNEL, 0, &count);
617 if (ret < 0)
618 break;
619 break;
620 392
621 /* operation complete */ 393/*
622 case RXRPC_CSTATE_COMPLETE: 394 * allow the fileserver to ask about the cache manager's capabilities
623 call->app_user = NULL; 395 */
624 removed = 0; 396static void SRXAFSCB_GetCapabilities(struct work_struct *work)
625 spin_lock(&afscm_calls_lock); 397{
626 if (!list_empty(&call->app_link)) { 398 struct afs_interface *ifs;
627 list_del_init(&call->app_link); 399 struct afs_call *call = container_of(work, struct afs_call, work);
628 removed = 1; 400 int loop, nifs;
401
402 struct {
403 struct /* InterfaceAddr */ {
404 __be32 nifs;
405 __be32 uuid[11];
406 __be32 ifaddr[32];
407 __be32 netmask[32];
408 __be32 mtu[32];
409 } ia;
410 struct /* Capabilities */ {
411 __be32 capcount;
412 __be32 caps[1];
413 } cap;
414 } reply;
415
416 _enter("");
417
418 nifs = 0;
419 ifs = kcalloc(32, sizeof(*ifs), GFP_KERNEL);
420 if (ifs) {
421 nifs = afs_get_ipv4_interfaces(ifs, 32, false);
422 if (nifs < 0) {
423 kfree(ifs);
424 ifs = NULL;
425 nifs = 0;
629 } 426 }
630 spin_unlock(&afscm_calls_lock); 427 }
631 428
632 if (removed) 429 memset(&reply, 0, sizeof(reply));
633 rxrpc_put_call(call); 430 reply.ia.nifs = htonl(nifs);
634 break; 431
432 reply.ia.uuid[0] = htonl(afs_uuid.time_low);
433 reply.ia.uuid[1] = htonl(afs_uuid.time_mid);
434 reply.ia.uuid[2] = htonl(afs_uuid.time_hi_and_version);
435 reply.ia.uuid[3] = htonl((s8) afs_uuid.clock_seq_hi_and_reserved);
436 reply.ia.uuid[4] = htonl((s8) afs_uuid.clock_seq_low);
437 for (loop = 0; loop < 6; loop++)
438 reply.ia.uuid[loop + 5] = htonl((s8) afs_uuid.node[loop]);
439
440 if (ifs) {
441 for (loop = 0; loop < nifs; loop++) {
442 reply.ia.ifaddr[loop] = ifs[loop].address.s_addr;
443 reply.ia.netmask[loop] = ifs[loop].netmask.s_addr;
444 reply.ia.mtu[loop] = htonl(ifs[loop].mtu);
445 }
446 kfree(ifs);
447 }
635 448
636 /* operation terminated on error */ 449 reply.cap.capcount = htonl(1);
637 case RXRPC_CSTATE_ERROR: 450 reply.cap.caps[0] = htonl(AFS_CAP_ERROR_TRANSLATION);
638 call->app_user = NULL; 451 afs_send_simple_reply(call, &reply, sizeof(reply));
639 break;
640 452
641 default: 453 _leave("");
642 break; 454}
643 }
644 455
645 if (ret < 0) 456/*
646 rxrpc_call_abort(call, ret); 457 * deliver request data to a CB.GetCapabilities call
458 */
459static int afs_deliver_cb_get_capabilities(struct afs_call *call,
460 struct sk_buff *skb, bool last)
461{
462 _enter(",{%u},%d", skb->len, last);
647 463
648 afs_put_server(server); 464 if (skb->len > 0)
465 return -EBADMSG;
466 if (!last)
467 return 0;
649 468
650 _leave(" = %d", ret); 469 /* no unmarshalling required */
470 call->state = AFS_CALL_REPLYING;
651 471
652} /* end _SRXAFSCM_Probe() */ 472 INIT_WORK(&call->work, SRXAFSCB_GetCapabilities);
473 schedule_work(&call->work);
474 return 0;
475}
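
[Editor's sketch] SRXAFSCB_GetCapabilities() flattens the UUID into eleven 32-bit big-endian XDR elements, sign-extending the single-byte fields just as the (s8) casts above do. A standalone sketch of that marshalling, assuming a plain POSIX environment:

	#include <arpa/inet.h>
	#include <stdint.h>

	struct uuid {
		uint32_t time_low;
		uint16_t time_mid;
		uint16_t time_hi_and_version;
		int8_t   clock_seq_hi_and_reserved;
		int8_t   clock_seq_low;
		int8_t   node[6];
	};

	/* wire image: one big-endian 32-bit XDR element per field */
	static void marshal_uuid(const struct uuid *u, uint32_t wire[11])
	{
		int i;

		wire[0] = htonl(u->time_low);
		wire[1] = htonl(u->time_mid);
		wire[2] = htonl(u->time_hi_and_version);
		wire[3] = htonl((int8_t)u->clock_seq_hi_and_reserved);
		wire[4] = htonl((int8_t)u->clock_seq_low);
		for (i = 0; i < 6; i++)	/* sign-extended, as the (s8) casts do */
			wire[i + 5] = htonl((int8_t)u->node[i]);
	}
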
diff --git a/fs/afs/cmservice.h b/fs/afs/cmservice.h
deleted file mode 100644
index af8d4d689cb2..000000000000
--- a/fs/afs/cmservice.h
+++ /dev/null
@@ -1,29 +0,0 @@
1/* cmservice.h: AFS Cache Manager Service declarations
2 *
3 * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#ifndef _LINUX_AFS_CMSERVICE_H
13#define _LINUX_AFS_CMSERVICE_H
14
15#include <rxrpc/transport.h>
16#include "types.h"
17
18/* cache manager start/stop */
19extern int afscm_start(void);
20extern void afscm_stop(void);
21
22/* cache manager server functions */
23extern int SRXAFSCM_InitCallBackState(struct afs_server *server);
24extern int SRXAFSCM_CallBack(struct afs_server *server,
25 size_t count,
26 struct afs_callback callbacks[]);
27extern int SRXAFSCM_Probe(struct afs_server *server);
28
29#endif /* _LINUX_AFS_CMSERVICE_H */
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index b6dc2ebe47a8..0c1e902f17a3 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -15,45 +15,53 @@
15#include <linux/slab.h> 15#include <linux/slab.h>
16#include <linux/fs.h> 16#include <linux/fs.h>
17#include <linux/pagemap.h> 17#include <linux/pagemap.h>
18#include <linux/smp_lock.h> 18#include <linux/ctype.h>
19#include "vnode.h"
20#include "volume.h"
21#include <rxrpc/call.h>
22#include "super.h"
23#include "internal.h" 19#include "internal.h"
24 20
25static struct dentry *afs_dir_lookup(struct inode *dir, struct dentry *dentry, 21static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
26 struct nameidata *nd); 22 struct nameidata *nd);
27static int afs_dir_open(struct inode *inode, struct file *file); 23static int afs_dir_open(struct inode *inode, struct file *file);
28static int afs_dir_readdir(struct file *file, void *dirent, filldir_t filldir); 24static int afs_readdir(struct file *file, void *dirent, filldir_t filldir);
29static int afs_d_revalidate(struct dentry *dentry, struct nameidata *nd); 25static int afs_d_revalidate(struct dentry *dentry, struct nameidata *nd);
30static int afs_d_delete(struct dentry *dentry); 26static int afs_d_delete(struct dentry *dentry);
31static int afs_dir_lookup_filldir(void *_cookie, const char *name, int nlen, 27static void afs_d_release(struct dentry *dentry);
28static int afs_lookup_filldir(void *_cookie, const char *name, int nlen,
32 loff_t fpos, u64 ino, unsigned dtype); 29 loff_t fpos, u64 ino, unsigned dtype);
30static int afs_create(struct inode *dir, struct dentry *dentry, int mode,
31 struct nameidata *nd);
32static int afs_mkdir(struct inode *dir, struct dentry *dentry, int mode);
33static int afs_rmdir(struct inode *dir, struct dentry *dentry);
34static int afs_unlink(struct inode *dir, struct dentry *dentry);
35static int afs_link(struct dentry *from, struct inode *dir,
36 struct dentry *dentry);
37static int afs_symlink(struct inode *dir, struct dentry *dentry,
38 const char *content);
39static int afs_rename(struct inode *old_dir, struct dentry *old_dentry,
40 struct inode *new_dir, struct dentry *new_dentry);
33 41
34const struct file_operations afs_dir_file_operations = { 42const struct file_operations afs_dir_file_operations = {
35 .open = afs_dir_open, 43 .open = afs_dir_open,
36 .readdir = afs_dir_readdir, 44 .release = afs_release,
45 .readdir = afs_readdir,
37}; 46};
38 47
39const struct inode_operations afs_dir_inode_operations = { 48const struct inode_operations afs_dir_inode_operations = {
40 .lookup = afs_dir_lookup, 49 .create = afs_create,
50 .lookup = afs_lookup,
51 .link = afs_link,
52 .unlink = afs_unlink,
53 .symlink = afs_symlink,
54 .mkdir = afs_mkdir,
55 .rmdir = afs_rmdir,
56 .rename = afs_rename,
57 .permission = afs_permission,
41 .getattr = afs_inode_getattr, 58 .getattr = afs_inode_getattr,
42#if 0 /* TODO */
43 .create = afs_dir_create,
44 .link = afs_dir_link,
45 .unlink = afs_dir_unlink,
46 .symlink = afs_dir_symlink,
47 .mkdir = afs_dir_mkdir,
48 .rmdir = afs_dir_rmdir,
49 .mknod = afs_dir_mknod,
50 .rename = afs_dir_rename,
51#endif
52}; 59};
53 60
54static struct dentry_operations afs_fs_dentry_operations = { 61static struct dentry_operations afs_fs_dentry_operations = {
55 .d_revalidate = afs_d_revalidate, 62 .d_revalidate = afs_d_revalidate,
56 .d_delete = afs_d_delete, 63 .d_delete = afs_d_delete,
64 .d_release = afs_d_release,
57}; 65};
58 66
59#define AFS_DIR_HASHTBL_SIZE 128 67#define AFS_DIR_HASHTBL_SIZE 128
@@ -105,14 +113,13 @@ struct afs_dir_page {
105 union afs_dir_block blocks[PAGE_SIZE / sizeof(union afs_dir_block)]; 113 union afs_dir_block blocks[PAGE_SIZE / sizeof(union afs_dir_block)];
106}; 114};
107 115
108struct afs_dir_lookup_cookie { 116struct afs_lookup_cookie {
109 struct afs_fid fid; 117 struct afs_fid fid;
110 const char *name; 118 const char *name;
111 size_t nlen; 119 size_t nlen;
112 int found; 120 int found;
113}; 121};
114 122
115/*****************************************************************************/
116/* 123/*
117 * check that a directory page is valid 124 * check that a directory page is valid
118 */ 125 */
@@ -128,9 +135,10 @@ static inline void afs_dir_check_page(struct inode *dir, struct page *page)
128 if (qty == 0) 135 if (qty == 0)
129 goto error; 136 goto error;
130 137
131 if (page->index==0 && qty!=ntohs(dbuf->blocks[0].pagehdr.npages)) { 138 if (page->index == 0 && qty != ntohs(dbuf->blocks[0].pagehdr.npages)) {
132 printk("kAFS: %s(%lu): wrong number of dir blocks %d!=%hu\n", 139 printk("kAFS: %s(%lu): wrong number of dir blocks %d!=%hu\n",
133 __FUNCTION__,dir->i_ino,qty,ntohs(dbuf->blocks[0].pagehdr.npages)); 140 __FUNCTION__, dir->i_ino, qty,
141 ntohs(dbuf->blocks[0].pagehdr.npages));
134 goto error; 142 goto error;
135 } 143 }
136#endif 144#endif
@@ -157,13 +165,11 @@ static inline void afs_dir_check_page(struct inode *dir, struct page *page)
157 SetPageChecked(page); 165 SetPageChecked(page);
158 return; 166 return;
159 167
160 error: 168error:
161 SetPageChecked(page); 169 SetPageChecked(page);
162 SetPageError(page); 170 SetPageError(page);
171}
163 172
164} /* end afs_dir_check_page() */
165
166/*****************************************************************************/
167/* 173/*
168 * discard a page cached in the pagecache 174 * discard a page cached in the pagecache
169 */ 175 */
@@ -171,25 +177,24 @@ static inline void afs_dir_put_page(struct page *page)
171{ 177{
172 kunmap(page); 178 kunmap(page);
173 page_cache_release(page); 179 page_cache_release(page);
180}
174 181
175} /* end afs_dir_put_page() */
176
177/*****************************************************************************/
178/* 182/*
179 * get a page into the pagecache 183 * get a page into the pagecache
180 */ 184 */
181static struct page *afs_dir_get_page(struct inode *dir, unsigned long index) 185static struct page *afs_dir_get_page(struct inode *dir, unsigned long index,
186 struct key *key)
182{ 187{
183 struct page *page; 188 struct page *page;
189 struct file file = {
190 .private_data = key,
191 };
184 192
185 _enter("{%lu},%lu", dir->i_ino, index); 193 _enter("{%lu},%lu", dir->i_ino, index);
186 194
187 page = read_mapping_page(dir->i_mapping, index, NULL); 195 page = read_mapping_page(dir->i_mapping, index, &file);
188 if (!IS_ERR(page)) { 196 if (!IS_ERR(page)) {
189 wait_on_page_locked(page);
190 kmap(page); 197 kmap(page);
191 if (!PageUptodate(page))
192 goto fail;
193 if (!PageChecked(page)) 198 if (!PageChecked(page))
194 afs_dir_check_page(dir, page); 199 afs_dir_check_page(dir, page);
195 if (PageError(page)) 200 if (PageError(page))
@@ -197,12 +202,12 @@ static struct page *afs_dir_get_page(struct inode *dir, unsigned long index)
197 } 202 }
198 return page; 203 return page;
199 204
200 fail: 205fail:
201 afs_dir_put_page(page); 206 afs_dir_put_page(page);
207 _leave(" = -EIO");
202 return ERR_PTR(-EIO); 208 return ERR_PTR(-EIO);
203} /* end afs_dir_get_page() */ 209}
204 210
205/*****************************************************************************/
206/* 211/*
207 * open an AFS directory file 212 * open an AFS directory file
208 */ 213 */
@@ -213,15 +218,12 @@ static int afs_dir_open(struct inode *inode, struct file *file)
213 BUILD_BUG_ON(sizeof(union afs_dir_block) != 2048); 218 BUILD_BUG_ON(sizeof(union afs_dir_block) != 2048);
214 BUILD_BUG_ON(sizeof(union afs_dirent) != 32); 219 BUILD_BUG_ON(sizeof(union afs_dirent) != 32);
215 220
216 if (AFS_FS_I(inode)->flags & AFS_VNODE_DELETED) 221 if (test_bit(AFS_VNODE_DELETED, &AFS_FS_I(inode)->flags))
217 return -ENOENT; 222 return -ENOENT;
218 223
219 _leave(" = 0"); 224 return afs_open(inode, file);
220 return 0; 225}
221 226
222} /* end afs_dir_open() */
223
224/*****************************************************************************/
225/* 227/*
226 * deal with one block in an AFS directory 228 * deal with one block in an AFS directory
227 */ 229 */
@@ -250,7 +252,7 @@ static int afs_dir_iterate_block(unsigned *fpos,
250 /* skip entries marked unused in the bitmap */ 252 /* skip entries marked unused in the bitmap */
251 if (!(block->pagehdr.bitmap[offset / 8] & 253 if (!(block->pagehdr.bitmap[offset / 8] &
252 (1 << (offset % 8)))) { 254 (1 << (offset % 8)))) {
253 _debug("ENT[%Zu.%u]: unused\n", 255 _debug("ENT[%Zu.%u]: unused",
254 blkoff / sizeof(union afs_dir_block), offset); 256 blkoff / sizeof(union afs_dir_block), offset);
255 if (offset >= curr) 257 if (offset >= curr)
256 *fpos = blkoff + 258 *fpos = blkoff +
@@ -264,7 +266,7 @@ static int afs_dir_iterate_block(unsigned *fpos,
264 sizeof(*block) - 266 sizeof(*block) -
265 offset * sizeof(union afs_dirent)); 267 offset * sizeof(union afs_dirent));
266 268
267 _debug("ENT[%Zu.%u]: %s %Zu \"%s\"\n", 269 _debug("ENT[%Zu.%u]: %s %Zu \"%s\"",
268 blkoff / sizeof(union afs_dir_block), offset, 270 blkoff / sizeof(union afs_dir_block), offset,
269 (offset < curr ? "skip" : "fill"), 271 (offset < curr ? "skip" : "fill"),
270 nlen, dire->u.name); 272 nlen, dire->u.name);
@@ -274,7 +276,7 @@ static int afs_dir_iterate_block(unsigned *fpos,
274 if (next >= AFS_DIRENT_PER_BLOCK) { 276 if (next >= AFS_DIRENT_PER_BLOCK) {
275 _debug("ENT[%Zu.%u]:" 277 _debug("ENT[%Zu.%u]:"
276 " %u travelled beyond end dir block" 278 " %u travelled beyond end dir block"
277 " (len %u/%Zu)\n", 279 " (len %u/%Zu)",
278 blkoff / sizeof(union afs_dir_block), 280 blkoff / sizeof(union afs_dir_block),
279 offset, next, tmp, nlen); 281 offset, next, tmp, nlen);
280 return -EIO; 282 return -EIO;
@@ -282,13 +284,13 @@ static int afs_dir_iterate_block(unsigned *fpos,
282 if (!(block->pagehdr.bitmap[next / 8] & 284 if (!(block->pagehdr.bitmap[next / 8] &
283 (1 << (next % 8)))) { 285 (1 << (next % 8)))) {
284 _debug("ENT[%Zu.%u]:" 286 _debug("ENT[%Zu.%u]:"
285 " %u unmarked extension (len %u/%Zu)\n", 287 " %u unmarked extension (len %u/%Zu)",
286 blkoff / sizeof(union afs_dir_block), 288 blkoff / sizeof(union afs_dir_block),
287 offset, next, tmp, nlen); 289 offset, next, tmp, nlen);
288 return -EIO; 290 return -EIO;
289 } 291 }
290 292
291 _debug("ENT[%Zu.%u]: ext %u/%Zu\n", 293 _debug("ENT[%Zu.%u]: ext %u/%Zu",
292 blkoff / sizeof(union afs_dir_block), 294 blkoff / sizeof(union afs_dir_block),
293 next, tmp, nlen); 295 next, tmp, nlen);
294 next++; 296 next++;
@@ -304,7 +306,7 @@ static int afs_dir_iterate_block(unsigned *fpos,
304 nlen, 306 nlen,
305 blkoff + offset * sizeof(union afs_dirent), 307 blkoff + offset * sizeof(union afs_dirent),
306 ntohl(dire->u.vnode), 308 ntohl(dire->u.vnode),
307 filldir == afs_dir_lookup_filldir ? 309 filldir == afs_lookup_filldir ?
308 ntohl(dire->u.unique) : DT_UNKNOWN); 310 ntohl(dire->u.unique) : DT_UNKNOWN);
309 if (ret < 0) { 311 if (ret < 0) {
310 _leave(" = 0 [full]"); 312 _leave(" = 0 [full]");
@@ -316,16 +318,15 @@ static int afs_dir_iterate_block(unsigned *fpos,
316 318
317 _leave(" = 1 [more]"); 319 _leave(" = 1 [more]");
318 return 1; 320 return 1;
319} /* end afs_dir_iterate_block() */ 321}
320 322
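Both branches above consult the block header's allocation bitmap, which carries one bit per 32-byte dirent slot. The test in isolation (kernel types assumed):

	static bool afs_dirent_slot_used(const u8 *bitmap, unsigned int slot)
	{
		/* bit N of the bitmap marks dirent slot N as in use */
		return bitmap[slot / 8] & (1 << (slot % 8));
	}
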
321/*****************************************************************************/
322/* 323/*
323 * read an AFS directory 324 * iterate through the data blob that lists the contents of an AFS directory
324 */ 325 */
325static int afs_dir_iterate(struct inode *dir, unsigned *fpos, void *cookie, 326static int afs_dir_iterate(struct inode *dir, unsigned *fpos, void *cookie,
326 filldir_t filldir) 327 filldir_t filldir, struct key *key)
327{ 328{
328 union afs_dir_block *dblock; 329 union afs_dir_block *dblock;
329 struct afs_dir_page *dbuf; 330 struct afs_dir_page *dbuf;
330 struct page *page; 331 struct page *page;
331 unsigned blkoff, limit; 332 unsigned blkoff, limit;
@@ -333,7 +334,7 @@ static int afs_dir_iterate(struct inode *dir, unsigned *fpos, void *cookie,
333 334
334 _enter("{%lu},%u,,", dir->i_ino, *fpos); 335 _enter("{%lu},%u,,", dir->i_ino, *fpos);
335 336
336 if (AFS_FS_I(dir)->flags & AFS_VNODE_DELETED) { 337 if (test_bit(AFS_VNODE_DELETED, &AFS_FS_I(dir)->flags)) {
337 _leave(" = -ESTALE"); 338 _leave(" = -ESTALE");
338 return -ESTALE; 339 return -ESTALE;
339 } 340 }
@@ -348,7 +349,7 @@ static int afs_dir_iterate(struct inode *dir, unsigned *fpos, void *cookie,
348 blkoff = *fpos & ~(sizeof(union afs_dir_block) - 1); 349 blkoff = *fpos & ~(sizeof(union afs_dir_block) - 1);
349 350
350 /* fetch the appropriate page from the directory */ 351 /* fetch the appropriate page from the directory */
351 page = afs_dir_get_page(dir, blkoff / PAGE_SIZE); 352 page = afs_dir_get_page(dir, blkoff / PAGE_SIZE, key);
352 if (IS_ERR(page)) { 353 if (IS_ERR(page)) {
353 ret = PTR_ERR(page); 354 ret = PTR_ERR(page);
354 break; 355 break;
@@ -377,43 +378,50 @@ static int afs_dir_iterate(struct inode *dir, unsigned *fpos, void *cookie,
377 ret = 0; 378 ret = 0;
378 } 379 }
379 380
380 out: 381out:
381 _leave(" = %d", ret); 382 _leave(" = %d", ret);
382 return ret; 383 return ret;
383} /* end afs_dir_iterate() */ 384}
384 385
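Because union afs_dir_block is 2048 bytes and union afs_dirent 32 bytes (the BUILD_BUG_ONs in afs_dir_open() pin both), the file position driven through this loop encodes a (block, slot) pair. A sketch of the decomposition matching the masking above:

	static void afs_fpos_decompose(unsigned int fpos,
				       unsigned int *blkoff, unsigned int *slot)
	{
		/* block start: fpos with the low 11 bits cleared (2048 = 2^11) */
		*blkoff = fpos & ~2047u;
		/* the remainder indexes the 32-byte dirent slots in that block */
		*slot = (fpos & 2047u) / 32;
	}
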
385/*****************************************************************************/
386/* 386/*
387 * read an AFS directory 387 * read an AFS directory
388 */ 388 */
389static int afs_dir_readdir(struct file *file, void *cookie, filldir_t filldir) 389static int afs_readdir(struct file *file, void *cookie, filldir_t filldir)
390{ 390{
391 unsigned fpos; 391 unsigned fpos;
392 int ret; 392 int ret;
393 393
394 _enter("{%Ld,{%lu}}", file->f_pos, file->f_path.dentry->d_inode->i_ino); 394 _enter("{%Ld,{%lu}}",
395 file->f_pos, file->f_path.dentry->d_inode->i_ino);
396
397 ASSERT(file->private_data != NULL);
395 398
396 fpos = file->f_pos; 399 fpos = file->f_pos;
397 ret = afs_dir_iterate(file->f_path.dentry->d_inode, &fpos, cookie, filldir); 400 ret = afs_dir_iterate(file->f_path.dentry->d_inode, &fpos,
401 cookie, filldir, file->private_data);
398 file->f_pos = fpos; 402 file->f_pos = fpos;
399 403
400 _leave(" = %d", ret); 404 _leave(" = %d", ret);
401 return ret; 405 return ret;
402} /* end afs_dir_readdir() */ 406}
403 407
404/*****************************************************************************/
405/* 408/*
406 * search the directory for a name 409 * search the directory for a name
407 * - if afs_dir_iterate_block() spots this function, it'll pass the FID 410 * - if afs_dir_iterate_block() spots this function, it'll pass the FID
408 * uniquifier through dtype 411 * uniquifier through dtype
409 */ 412 */
410static int afs_dir_lookup_filldir(void *_cookie, const char *name, int nlen, 413static int afs_lookup_filldir(void *_cookie, const char *name, int nlen,
411 loff_t fpos, u64 ino, unsigned dtype) 414 loff_t fpos, u64 ino, unsigned dtype)
412{ 415{
413 struct afs_dir_lookup_cookie *cookie = _cookie; 416 struct afs_lookup_cookie *cookie = _cookie;
417
418 _enter("{%s,%Zu},%s,%u,,%llu,%u",
419 cookie->name, cookie->nlen, name, nlen,
420 (unsigned long long) ino, dtype);
414 421
415 _enter("{%s,%Zu},%s,%u,,%lu,%u", 422 /* insanity checks first */
416 cookie->name, cookie->nlen, name, nlen, ino, dtype); 423 BUILD_BUG_ON(sizeof(union afs_dir_block) != 2048);
424 BUILD_BUG_ON(sizeof(union afs_dirent) != 32);
417 425
418 if (cookie->nlen != nlen || memcmp(cookie->name, name, nlen) != 0) { 426 if (cookie->nlen != nlen || memcmp(cookie->name, name, nlen) != 0) {
419 _leave(" = 0 [no]"); 427 _leave(" = 0 [no]");
@@ -426,216 +434,254 @@ static int afs_dir_lookup_filldir(void *_cookie, const char *name, int nlen,
426 434
427 _leave(" = -1 [found]"); 435 _leave(" = -1 [found]");
428 return -1; 436 return -1;
429} /* end afs_dir_lookup_filldir() */ 437}
430 438
431/*****************************************************************************/
432/* 439/*
433 * look up an entry in a directory 440 * do a lookup in a directory
441 * - just returns the FID the dentry name maps to if found
434 */ 442 */
435static struct dentry *afs_dir_lookup(struct inode *dir, struct dentry *dentry, 443static int afs_do_lookup(struct inode *dir, struct dentry *dentry,
436 struct nameidata *nd) 444 struct afs_fid *fid, struct key *key)
437{ 445{
438 struct afs_dir_lookup_cookie cookie; 446 struct afs_lookup_cookie cookie;
439 struct afs_super_info *as; 447 struct afs_super_info *as;
448 unsigned fpos;
449 int ret;
450
451 _enter("{%lu},%p{%s},", dir->i_ino, dentry, dentry->d_name.name);
452
453 as = dir->i_sb->s_fs_info;
454
455 /* search the directory */
456 cookie.name = dentry->d_name.name;
457 cookie.nlen = dentry->d_name.len;
458 cookie.fid.vid = as->volume->vid;
459 cookie.found = 0;
460
461 fpos = 0;
462 ret = afs_dir_iterate(dir, &fpos, &cookie, afs_lookup_filldir,
463 key);
464 if (ret < 0) {
465 _leave(" = %d [iter]", ret);
466 return ret;
467 }
468
469 ret = -ENOENT;
470 if (!cookie.found) {
471 _leave(" = -ENOENT [not found]");
472 return -ENOENT;
473 }
474
475 *fid = cookie.fid;
476 _leave(" = 0 { vn=%u u=%u }", fid->vnode, fid->unique);
477 return 0;
478}
479
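afs_do_lookup() drives the directory iterator with a private cookie that afs_lookup_filldir() completes on a name match, returning -1 to stop the walk early. Reconstructed from its uses in this diff (the authoritative definition lives in the AFS internal headers):

	struct afs_lookup_cookie {
		const char	*name;	/* name being sought */
		size_t		nlen;	/* length of that name */
		struct afs_fid	fid;	/* vid preset by the caller; vnode and
					 * unique filled in on a match */
		int		found;	/* non-zero once a match is seen */
	};
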
480/*
481 * look up an entry in a directory
482 */
483static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
484 struct nameidata *nd)
485{
440 struct afs_vnode *vnode; 486 struct afs_vnode *vnode;
487 struct afs_fid fid;
441 struct inode *inode; 488 struct inode *inode;
442 unsigned fpos; 489 struct key *key;
443 int ret; 490 int ret;
444 491
445 _enter("{%lu},%p{%s}", dir->i_ino, dentry, dentry->d_name.name); 492 vnode = AFS_FS_I(dir);
446 493
447 /* insanity checks first */ 494 _enter("{%x:%d},%p{%s},",
448 BUILD_BUG_ON(sizeof(union afs_dir_block) != 2048); 495 vnode->fid.vid, vnode->fid.vnode, dentry, dentry->d_name.name);
449 BUILD_BUG_ON(sizeof(union afs_dirent) != 32); 496
497 ASSERTCMP(dentry->d_inode, ==, NULL);
450 498
451 if (dentry->d_name.len > 255) { 499 if (dentry->d_name.len > 255) {
452 _leave(" = -ENAMETOOLONG"); 500 _leave(" = -ENAMETOOLONG");
453 return ERR_PTR(-ENAMETOOLONG); 501 return ERR_PTR(-ENAMETOOLONG);
454 } 502 }
455 503
456 vnode = AFS_FS_I(dir); 504 if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) {
457 if (vnode->flags & AFS_VNODE_DELETED) {
458 _leave(" = -ESTALE"); 505 _leave(" = -ESTALE");
459 return ERR_PTR(-ESTALE); 506 return ERR_PTR(-ESTALE);
460 } 507 }
461 508
462 as = dir->i_sb->s_fs_info; 509 key = afs_request_key(vnode->volume->cell);
463 510 if (IS_ERR(key)) {
464 /* search the directory */ 511 _leave(" = %ld [key]", PTR_ERR(key));
465 cookie.name = dentry->d_name.name; 512 return ERR_PTR(PTR_ERR(key));
466 cookie.nlen = dentry->d_name.len; 513 }
467 cookie.fid.vid = as->volume->vid;
468 cookie.found = 0;
469 514
470 fpos = 0; 515 ret = afs_validate(vnode, key);
471 ret = afs_dir_iterate(dir, &fpos, &cookie, afs_dir_lookup_filldir);
472 if (ret < 0) { 516 if (ret < 0) {
473 _leave(" = %d", ret); 517 key_put(key);
518 _leave(" = %d [val]", ret);
474 return ERR_PTR(ret); 519 return ERR_PTR(ret);
475 } 520 }
476 521
477 ret = -ENOENT; 522 ret = afs_do_lookup(dir, dentry, &fid, key);
478 if (!cookie.found) { 523 if (ret < 0) {
479 _leave(" = %d", ret); 524 key_put(key);
525 if (ret == -ENOENT) {
526 d_add(dentry, NULL);
527 _leave(" = NULL [negative]");
528 return NULL;
529 }
530 _leave(" = %d [do]", ret);
480 return ERR_PTR(ret); 531 return ERR_PTR(ret);
481 } 532 }
533 dentry->d_fsdata = (void *)(unsigned long) vnode->status.data_version;
482 534
483 /* instantiate the dentry */ 535 /* instantiate the dentry */
484 ret = afs_iget(dir->i_sb, &cookie.fid, &inode); 536 inode = afs_iget(dir->i_sb, key, &fid, NULL, NULL);
485 if (ret < 0) { 537 key_put(key);
486 _leave(" = %d", ret); 538 if (IS_ERR(inode)) {
487 return ERR_PTR(ret); 539 _leave(" = %ld", PTR_ERR(inode));
540 return ERR_PTR(PTR_ERR(inode));
488 } 541 }
489 542
490 dentry->d_op = &afs_fs_dentry_operations; 543 dentry->d_op = &afs_fs_dentry_operations;
491 dentry->d_fsdata = (void *) (unsigned long) vnode->status.version;
492 544
493 d_add(dentry, inode); 545 d_add(dentry, inode);
494 _leave(" = 0 { vn=%u u=%u } -> { ino=%lu v=%lu }", 546 _leave(" = 0 { vn=%u u=%u } -> { ino=%lu v=%lu }",
495 cookie.fid.vnode, 547 fid.vnode,
496 cookie.fid.unique, 548 fid.unique,
497 dentry->d_inode->i_ino, 549 dentry->d_inode->i_ino,
498 dentry->d_inode->i_version); 550 dentry->d_inode->i_version);
499 551
500 return NULL; 552 return NULL;
501} /* end afs_dir_lookup() */ 553}
502 554
503/*****************************************************************************/
504/* 555/*
505 * check that a dentry lookup hit has found a valid entry 556 * check that a dentry lookup hit has found a valid entry
506 * - NOTE! the hit can be a negative hit too, so we can't assume we have an 557 * - NOTE! the hit can be a negative hit too, so we can't assume we have an
507 * inode 558 * inode
508 * (derived from nfs_lookup_revalidate)
509 */ 559 */
510static int afs_d_revalidate(struct dentry *dentry, struct nameidata *nd) 560static int afs_d_revalidate(struct dentry *dentry, struct nameidata *nd)
511{ 561{
512 struct afs_dir_lookup_cookie cookie; 562 struct afs_vnode *vnode, *dir;
563 struct afs_fid fid;
513 struct dentry *parent; 564 struct dentry *parent;
514 struct inode *inode, *dir; 565 struct key *key;
515 unsigned fpos; 566 void *dir_version;
516 int ret; 567 int ret;
517 568
518 _enter("{sb=%p n=%s},", dentry->d_sb, dentry->d_name.name); 569 vnode = AFS_FS_I(dentry->d_inode);
519 570
520 /* lock down the parent dentry so we can peer at it */ 571 if (dentry->d_inode)
521 parent = dget_parent(dentry->d_parent); 572 _enter("{v={%x:%u} n=%s fl=%lx},",
573 vnode->fid.vid, vnode->fid.vnode, dentry->d_name.name,
574 vnode->flags);
575 else
576 _enter("{neg n=%s}", dentry->d_name.name);
522 577
523 dir = parent->d_inode; 578 key = afs_request_key(AFS_FS_S(dentry->d_sb)->volume->cell);
524 inode = dentry->d_inode; 579 if (IS_ERR(key))
580 key = NULL;
525 581
526 /* handle a negative dentry */ 582 /* lock down the parent dentry so we can peer at it */
527 if (!inode) 583 parent = dget_parent(dentry);
584 if (!parent->d_inode)
528 goto out_bad; 585 goto out_bad;
529 586
530 /* handle a bad inode */ 587 dir = AFS_FS_I(parent->d_inode);
531 if (is_bad_inode(inode)) {
532 printk("kAFS: afs_d_revalidate: %s/%s has bad inode\n",
533 dentry->d_parent->d_name.name, dentry->d_name.name);
534 goto out_bad;
535 }
536 588
537 /* force a full look up if the parent directory changed since last the 589 /* validate the parent directory */
538 * server was consulted 590 if (test_bit(AFS_VNODE_MODIFIED, &dir->flags))
539 * - otherwise this inode must still exist, even if the inode details 591 afs_validate(dir, key);
540 * themselves have changed
541 */
542 if (AFS_FS_I(dir)->flags & AFS_VNODE_CHANGED)
543 afs_vnode_fetch_status(AFS_FS_I(dir));
544 592
545 if (AFS_FS_I(dir)->flags & AFS_VNODE_DELETED) { 593 if (test_bit(AFS_VNODE_DELETED, &dir->flags)) {
546 _debug("%s: parent dir deleted", dentry->d_name.name); 594 _debug("%s: parent dir deleted", dentry->d_name.name);
547 goto out_bad; 595 goto out_bad;
548 } 596 }
549 597
550 if (AFS_FS_I(inode)->flags & AFS_VNODE_DELETED) { 598 dir_version = (void *) (unsigned long) dir->status.data_version;
551 _debug("%s: file already deleted", dentry->d_name.name); 599 if (dentry->d_fsdata == dir_version)
552 goto out_bad; 600 goto out_valid; /* the dir contents are unchanged */
553 }
554
555 if ((unsigned long) dentry->d_fsdata !=
556 (unsigned long) AFS_FS_I(dir)->status.version) {
557 _debug("%s: parent changed %lu -> %u",
558 dentry->d_name.name,
559 (unsigned long) dentry->d_fsdata,
560 (unsigned) AFS_FS_I(dir)->status.version);
561 601
562 /* search the directory for this vnode */ 602 _debug("dir modified");
563 cookie.name = dentry->d_name.name;
564 cookie.nlen = dentry->d_name.len;
565 cookie.fid.vid = AFS_FS_I(inode)->volume->vid;
566 cookie.found = 0;
567 603
568 fpos = 0; 604 /* search the directory for this vnode */
569 ret = afs_dir_iterate(dir, &fpos, &cookie, 605 ret = afs_do_lookup(&dir->vfs_inode, dentry, &fid, key);
570 afs_dir_lookup_filldir); 606 switch (ret) {
571 if (ret < 0) { 607 case 0:
572 _debug("failed to iterate dir %s: %d", 608 /* the filename maps to something */
573 parent->d_name.name, ret); 609 if (!dentry->d_inode)
610 goto out_bad;
611 if (is_bad_inode(dentry->d_inode)) {
612 printk("kAFS: afs_d_revalidate: %s/%s has bad inode\n",
613 parent->d_name.name, dentry->d_name.name);
574 goto out_bad; 614 goto out_bad;
575 }
576
577 if (!cookie.found) {
578 _debug("%s: dirent not found", dentry->d_name.name);
579 goto not_found;
580 } 615 }
581 616
582 /* if the vnode ID has changed, then the dirent points to a 617 /* if the vnode ID has changed, then the dirent points to a
583 * different file */ 618 * different file */
584 if (cookie.fid.vnode != AFS_FS_I(inode)->fid.vnode) { 619 if (fid.vnode != vnode->fid.vnode) {
585 _debug("%s: dirent changed", dentry->d_name.name); 620 _debug("%s: dirent changed [%u != %u]",
621 dentry->d_name.name, fid.vnode,
622 vnode->fid.vnode);
586 goto not_found; 623 goto not_found;
587 } 624 }
588 625
589 /* if the vnode ID uniquifier has changed, then the file has 626 * been deleted and replaced, and the original vnode ID has
590 * been deleted */ 627 * been deleted and replaced, and the original vnode ID has
591 if (cookie.fid.unique != AFS_FS_I(inode)->fid.unique) { 628 * been reused */
629 if (fid.unique != vnode->fid.unique) {
592 _debug("%s: file deleted (uq %u -> %u I:%lu)", 630 _debug("%s: file deleted (uq %u -> %u I:%lu)",
593 dentry->d_name.name, 631 dentry->d_name.name, fid.unique,
594 cookie.fid.unique, 632 vnode->fid.unique, dentry->d_inode->i_version);
595 AFS_FS_I(inode)->fid.unique, 633 spin_lock(&vnode->lock);
596 inode->i_version); 634 set_bit(AFS_VNODE_DELETED, &vnode->flags);
597 spin_lock(&AFS_FS_I(inode)->lock); 635 spin_unlock(&vnode->lock);
598 AFS_FS_I(inode)->flags |= AFS_VNODE_DELETED; 636 goto not_found;
599 spin_unlock(&AFS_FS_I(inode)->lock);
600 invalidate_remote_inode(inode);
601 goto out_bad;
602 } 637 }
638 goto out_valid;
603 639
604 dentry->d_fsdata = 640 case -ENOENT:
605 (void *) (unsigned long) AFS_FS_I(dir)->status.version; 641 /* the filename is unknown */
642 _debug("%s: dirent not found", dentry->d_name.name);
643 if (dentry->d_inode)
644 goto not_found;
645 goto out_valid;
646
647 default:
648 _debug("failed to iterate dir %s: %d",
649 parent->d_name.name, ret);
650 goto out_bad;
606 } 651 }
607 652
608 out_valid: 653out_valid:
654 dentry->d_fsdata = dir_version;
655out_skip:
609 dput(parent); 656 dput(parent);
657 key_put(key);
610 _leave(" = 1 [valid]"); 658 _leave(" = 1 [valid]");
611 return 1; 659 return 1;
612 660
613 /* the dirent, if it exists, now points to a different vnode */ 661 /* the dirent, if it exists, now points to a different vnode */
614 not_found: 662not_found:
615 spin_lock(&dentry->d_lock); 663 spin_lock(&dentry->d_lock);
616 dentry->d_flags |= DCACHE_NFSFS_RENAMED; 664 dentry->d_flags |= DCACHE_NFSFS_RENAMED;
617 spin_unlock(&dentry->d_lock); 665 spin_unlock(&dentry->d_lock);
618 666
619 out_bad: 667out_bad:
620 if (inode) { 668 if (dentry->d_inode) {
621 /* don't unhash if we have submounts */ 669 /* don't unhash if we have submounts */
622 if (have_submounts(dentry)) 670 if (have_submounts(dentry))
623 goto out_valid; 671 goto out_skip;
624 } 672 }
625 673
626 shrink_dcache_parent(dentry);
627
628 _debug("dropping dentry %s/%s", 674 _debug("dropping dentry %s/%s",
629 dentry->d_parent->d_name.name, dentry->d_name.name); 675 parent->d_name.name, dentry->d_name.name);
676 shrink_dcache_parent(dentry);
630 d_drop(dentry); 677 d_drop(dentry);
631
632 dput(parent); 678 dput(parent);
679 key_put(key);
633 680
634 _leave(" = 0 [bad]"); 681 _leave(" = 0 [bad]");
635 return 0; 682 return 0;
636} /* end afs_d_revalidate() */ 683}
637 684
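The fast path through afs_d_revalidate() rests on one invariant: a directory's data version moves only when its contents change, so a dentry stamped with the version seen at lookup time needs no further checking while the stamp still matches. The check in isolation:

	static bool afs_dentry_fresh(const struct dentry *dentry,
				     const struct afs_vnode *dir)
	{
		/* d_fsdata was set to the directory's data version when the
		 * dentry was looked up or last revalidated */
		void *v = (void *)(unsigned long) dir->status.data_version;

		return dentry->d_fsdata == v;
	}
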
638/*****************************************************************************/
639/* 685/*
640 * allow the VFS to enquire as to whether a dentry should be unhashed (mustn't 686 * allow the VFS to enquire as to whether a dentry should be unhashed (mustn't
641 * sleep) 687 * sleep)
@@ -649,15 +695,444 @@ static int afs_d_delete(struct dentry *dentry)
649 if (dentry->d_flags & DCACHE_NFSFS_RENAMED) 695 if (dentry->d_flags & DCACHE_NFSFS_RENAMED)
650 goto zap; 696 goto zap;
651 697
652 if (dentry->d_inode) { 698 if (dentry->d_inode &&
653 if (AFS_FS_I(dentry->d_inode)->flags & AFS_VNODE_DELETED) 699 test_bit(AFS_VNODE_DELETED, &AFS_FS_I(dentry->d_inode)->flags))
654 goto zap; 700 goto zap;
655 }
656 701
657 _leave(" = 0 [keep]"); 702 _leave(" = 0 [keep]");
658 return 0; 703 return 0;
659 704
660 zap: 705zap:
661 _leave(" = 1 [zap]"); 706 _leave(" = 1 [zap]");
662 return 1; 707 return 1;
663} /* end afs_d_delete() */ 708}
709
710/*
711 * handle dentry release
712 */
713static void afs_d_release(struct dentry *dentry)
714{
715 _enter("%s", dentry->d_name.name);
716}
717
718/*
719 * create a directory on an AFS filesystem
720 */
721static int afs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
722{
723 struct afs_file_status status;
724 struct afs_callback cb;
725 struct afs_server *server;
726 struct afs_vnode *dvnode, *vnode;
727 struct afs_fid fid;
728 struct inode *inode;
729 struct key *key;
730 int ret;
731
732 dvnode = AFS_FS_I(dir);
733
734 _enter("{%x:%d},{%s},%o",
735 dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name, mode);
736
737 ret = -ENAMETOOLONG;
738 if (dentry->d_name.len > 255)
739 goto error;
740
741 key = afs_request_key(dvnode->volume->cell);
742 if (IS_ERR(key)) {
743 ret = PTR_ERR(key);
744 goto error;
745 }
746
747 mode |= S_IFDIR;
748 ret = afs_vnode_create(dvnode, key, dentry->d_name.name,
749 mode, &fid, &status, &cb, &server);
750 if (ret < 0)
751 goto mkdir_error;
752
753 inode = afs_iget(dir->i_sb, key, &fid, &status, &cb);
754 if (IS_ERR(inode)) {
755 /* ENOMEM at a really inconvenient time - just abandon the new
756 * directory on the server */
757 ret = PTR_ERR(inode);
758 goto iget_error;
759 }
760
761 /* apply the status report we've got for the new vnode */
762 vnode = AFS_FS_I(inode);
763 spin_lock(&vnode->lock);
764 vnode->update_cnt++;
765 spin_unlock(&vnode->lock);
766 afs_vnode_finalise_status_update(vnode, server);
767 afs_put_server(server);
768
769 d_instantiate(dentry, inode);
770 if (d_unhashed(dentry)) {
771 _debug("not hashed");
772 d_rehash(dentry);
773 }
774 key_put(key);
775 _leave(" = 0");
776 return 0;
777
778iget_error:
779 afs_put_server(server);
780mkdir_error:
781 key_put(key);
782error:
783 d_drop(dentry);
784 _leave(" = %d", ret);
785 return ret;
786}
787
788/*
789 * remove a directory from an AFS filesystem
790 */
791static int afs_rmdir(struct inode *dir, struct dentry *dentry)
792{
793 struct afs_vnode *dvnode, *vnode;
794 struct key *key;
795 int ret;
796
797 dvnode = AFS_FS_I(dir);
798
799 _enter("{%x:%d},{%s}",
800 dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name);
801
802 ret = -ENAMETOOLONG;
803 if (dentry->d_name.len > 255)
804 goto error;
805
806 key = afs_request_key(dvnode->volume->cell);
807 if (IS_ERR(key)) {
808 ret = PTR_ERR(key);
809 goto error;
810 }
811
812 ret = afs_vnode_remove(dvnode, key, dentry->d_name.name, true);
813 if (ret < 0)
814 goto rmdir_error;
815
816 if (dentry->d_inode) {
817 vnode = AFS_FS_I(dentry->d_inode);
818 clear_nlink(&vnode->vfs_inode);
819 set_bit(AFS_VNODE_DELETED, &vnode->flags);
820 afs_discard_callback_on_delete(vnode);
821 }
822
823 key_put(key);
824 _leave(" = 0");
825 return 0;
826
827rmdir_error:
828 key_put(key);
829error:
830 _leave(" = %d", ret);
831 return ret;
832}
833
834/*
835 * remove a file from an AFS filesystem
836 */
837static int afs_unlink(struct inode *dir, struct dentry *dentry)
838{
839 struct afs_vnode *dvnode, *vnode;
840 struct key *key;
841 int ret;
842
843 dvnode = AFS_FS_I(dir);
844
845 _enter("{%x:%d},{%s}",
846 dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name);
847
848 ret = -ENAMETOOLONG;
849 if (dentry->d_name.len > 255)
850 goto error;
851
852 key = afs_request_key(dvnode->volume->cell);
853 if (IS_ERR(key)) {
854 ret = PTR_ERR(key);
855 goto error;
856 }
857
858 if (dentry->d_inode) {
859 vnode = AFS_FS_I(dentry->d_inode);
860
861 /* make sure we have a callback promise on the victim */
862 ret = afs_validate(vnode, key);
863 if (ret < 0)
864 goto error;
865 }
866
867 ret = afs_vnode_remove(dvnode, key, dentry->d_name.name, false);
868 if (ret < 0)
869 goto remove_error;
870
871 if (dentry->d_inode) {
 872 /* if the file wasn't deleted because it still had hard links, the
 873 * fileserver will break the callback promise on the file - if
 874 * it had one - before it returns to us; if the file was deleted,
 875 * it won't
876 *
877 * however, if we didn't have a callback promise outstanding,
878 * or it was outstanding on a different server, then it won't
879 * break it either...
880 */
881 vnode = AFS_FS_I(dentry->d_inode);
882 if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
883 _debug("AFS_VNODE_DELETED");
884 if (test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags))
885 _debug("AFS_VNODE_CB_BROKEN");
886 set_bit(AFS_VNODE_CB_BROKEN, &vnode->flags);
887 ret = afs_validate(vnode, key);
888 _debug("nlink %d [val %d]", vnode->vfs_inode.i_nlink, ret);
889 }
890
891 key_put(key);
892 _leave(" = 0");
893 return 0;
894
895remove_error:
896 key_put(key);
897error:
898 _leave(" = %d", ret);
899 return ret;
900}
901
902/*
903 * create a regular file on an AFS filesystem
904 */
905static int afs_create(struct inode *dir, struct dentry *dentry, int mode,
906 struct nameidata *nd)
907{
908 struct afs_file_status status;
909 struct afs_callback cb;
910 struct afs_server *server;
911 struct afs_vnode *dvnode, *vnode;
912 struct afs_fid fid;
913 struct inode *inode;
914 struct key *key;
915 int ret;
916
917 dvnode = AFS_FS_I(dir);
918
919 _enter("{%x:%d},{%s},%o,",
920 dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name, mode);
921
922 ret = -ENAMETOOLONG;
923 if (dentry->d_name.len > 255)
924 goto error;
925
926 key = afs_request_key(dvnode->volume->cell);
927 if (IS_ERR(key)) {
928 ret = PTR_ERR(key);
929 goto error;
930 }
931
932 mode |= S_IFREG;
933 ret = afs_vnode_create(dvnode, key, dentry->d_name.name,
934 mode, &fid, &status, &cb, &server);
935 if (ret < 0)
936 goto create_error;
937
938 inode = afs_iget(dir->i_sb, key, &fid, &status, &cb);
939 if (IS_ERR(inode)) {
940 /* ENOMEM at a really inconvenient time - just abandon the new
 941 * file on the server */
942 ret = PTR_ERR(inode);
943 goto iget_error;
944 }
945
946 /* apply the status report we've got for the new vnode */
947 vnode = AFS_FS_I(inode);
948 spin_lock(&vnode->lock);
949 vnode->update_cnt++;
950 spin_unlock(&vnode->lock);
951 afs_vnode_finalise_status_update(vnode, server);
952 afs_put_server(server);
953
954 d_instantiate(dentry, inode);
955 if (d_unhashed(dentry)) {
956 _debug("not hashed");
957 d_rehash(dentry);
958 }
959 key_put(key);
960 _leave(" = 0");
961 return 0;
962
963iget_error:
964 afs_put_server(server);
965create_error:
966 key_put(key);
967error:
968 d_drop(dentry);
969 _leave(" = %d", ret);
970 return ret;
971}
972
973/*
974 * create a hard link between files in an AFS filesystem
975 */
976static int afs_link(struct dentry *from, struct inode *dir,
977 struct dentry *dentry)
978{
979 struct afs_vnode *dvnode, *vnode;
980 struct key *key;
981 int ret;
982
983 vnode = AFS_FS_I(from->d_inode);
984 dvnode = AFS_FS_I(dir);
985
986 _enter("{%x:%d},{%x:%d},{%s}",
987 vnode->fid.vid, vnode->fid.vnode,
988 dvnode->fid.vid, dvnode->fid.vnode,
989 dentry->d_name.name);
990
991 ret = -ENAMETOOLONG;
992 if (dentry->d_name.len > 255)
993 goto error;
994
995 key = afs_request_key(dvnode->volume->cell);
996 if (IS_ERR(key)) {
997 ret = PTR_ERR(key);
998 goto error;
999 }
1000
1001 ret = afs_vnode_link(dvnode, vnode, key, dentry->d_name.name);
1002 if (ret < 0)
1003 goto link_error;
1004
1005 atomic_inc(&vnode->vfs_inode.i_count);
1006 d_instantiate(dentry, &vnode->vfs_inode);
1007 key_put(key);
1008 _leave(" = 0");
1009 return 0;
1010
1011link_error:
1012 key_put(key);
1013error:
1014 d_drop(dentry);
1015 _leave(" = %d", ret);
1016 return ret;
1017}
1018
1019/*
1020 * create a symlink in an AFS filesystem
1021 */
1022static int afs_symlink(struct inode *dir, struct dentry *dentry,
1023 const char *content)
1024{
1025 struct afs_file_status status;
1026 struct afs_server *server;
1027 struct afs_vnode *dvnode, *vnode;
1028 struct afs_fid fid;
1029 struct inode *inode;
1030 struct key *key;
1031 int ret;
1032
1033 dvnode = AFS_FS_I(dir);
1034
1035 _enter("{%x:%d},{%s},%s",
1036 dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name,
1037 content);
1038
1039 ret = -ENAMETOOLONG;
1040 if (dentry->d_name.len > 255)
1041 goto error;
1042
1043 ret = -EINVAL;
1044 if (strlen(content) > 1023)
1045 goto error;
1046
1047 key = afs_request_key(dvnode->volume->cell);
1048 if (IS_ERR(key)) {
1049 ret = PTR_ERR(key);
1050 goto error;
1051 }
1052
1053 ret = afs_vnode_symlink(dvnode, key, dentry->d_name.name, content,
1054 &fid, &status, &server);
1055 if (ret < 0)
1056 goto create_error;
1057
1058 inode = afs_iget(dir->i_sb, key, &fid, &status, NULL);
1059 if (IS_ERR(inode)) {
1060 /* ENOMEM at a really inconvenient time - just abandon the new
1061 * symlink on the server */
1062 ret = PTR_ERR(inode);
1063 goto iget_error;
1064 }
1065
1066 /* apply the status report we've got for the new vnode */
1067 vnode = AFS_FS_I(inode);
1068 spin_lock(&vnode->lock);
1069 vnode->update_cnt++;
1070 spin_unlock(&vnode->lock);
1071 afs_vnode_finalise_status_update(vnode, server);
1072 afs_put_server(server);
1073
1074 d_instantiate(dentry, inode);
1075 if (d_unhashed(dentry)) {
1076 _debug("not hashed");
1077 d_rehash(dentry);
1078 }
1079 key_put(key);
1080 _leave(" = 0");
1081 return 0;
1082
1083iget_error:
1084 afs_put_server(server);
1085create_error:
1086 key_put(key);
1087error:
1088 d_drop(dentry);
1089 _leave(" = %d", ret);
1090 return ret;
1091}
1092
1093/*
1094 * rename a file in an AFS filesystem and/or move it between directories
1095 */
1096static int afs_rename(struct inode *old_dir, struct dentry *old_dentry,
1097 struct inode *new_dir, struct dentry *new_dentry)
1098{
1099 struct afs_vnode *orig_dvnode, *new_dvnode, *vnode;
1100 struct key *key;
1101 int ret;
1102
1103 vnode = AFS_FS_I(old_dentry->d_inode);
1104 orig_dvnode = AFS_FS_I(old_dir);
1105 new_dvnode = AFS_FS_I(new_dir);
1106
1107 _enter("{%x:%d},{%x:%d},{%x:%d},{%s}",
1108 orig_dvnode->fid.vid, orig_dvnode->fid.vnode,
1109 vnode->fid.vid, vnode->fid.vnode,
1110 new_dvnode->fid.vid, new_dvnode->fid.vnode,
1111 new_dentry->d_name.name);
1112
1113 ret = -ENAMETOOLONG;
1114 if (new_dentry->d_name.len > 255)
1115 goto error;
1116
1117 key = afs_request_key(orig_dvnode->volume->cell);
1118 if (IS_ERR(key)) {
1119 ret = PTR_ERR(key);
1120 goto error;
1121 }
1122
1123 ret = afs_vnode_rename(orig_dvnode, new_dvnode, key,
1124 old_dentry->d_name.name,
1125 new_dentry->d_name.name);
1126 if (ret < 0)
1127 goto rename_error;
1128 key_put(key);
1129 _leave(" = 0");
1130 return 0;
1131
1132rename_error:
1133 key_put(key);
1134error:
1135 d_drop(new_dentry);
1136 _leave(" = %d", ret);
1137 return ret;
1138}
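afs_mkdir(), afs_create(), afs_link(), afs_symlink() and afs_rename() above all share one shape: obtain a key for the cell, ask the vnode layer to perform the operation against the server, then release the key on every exit path and drop the dentry on failure. Condensed to a skeleton - afs_vnode_operation() is a hypothetical stand-in for the afs_vnode_*() call each op makes:

	static int afs_dir_op_skeleton(struct afs_vnode *dvnode,
				       struct dentry *dentry)
	{
		struct key *key;
		int ret;

		key = afs_request_key(dvnode->volume->cell);
		if (IS_ERR(key))
			return PTR_ERR(key);

		/* stand-in for afs_vnode_create/remove/link/symlink/rename */
		ret = afs_vnode_operation(dvnode, key, dentry->d_name.name);

		key_put(key);		/* balanced on success and failure */
		if (ret < 0)
			d_drop(dentry);
		return ret;
	}
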
diff --git a/fs/afs/errors.h b/fs/afs/errors.h
deleted file mode 100644
index 574d94ac8d05..000000000000
--- a/fs/afs/errors.h
+++ /dev/null
@@ -1,34 +0,0 @@
1/* errors.h: AFS abort/error codes
2 *
3 * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#ifndef _LINUX_AFS_ERRORS_H
13#define _LINUX_AFS_ERRORS_H
14
15#include "types.h"
16
17/* file server abort codes */
18typedef enum {
19 VSALVAGE = 101, /* volume needs salvaging */
20 VNOVNODE = 102, /* no such file/dir (vnode) */
21 VNOVOL = 103, /* no such volume or volume unavailable */
22 VVOLEXISTS = 104, /* volume name already exists */
23 VNOSERVICE = 105, /* volume not currently in service */
24 VOFFLINE = 106, /* volume is currently offline (more info available [VVL-spec]) */
25 VONLINE = 107, /* volume is already online */
26 VDISKFULL = 108, /* disk partition is full */
27 VOVERQUOTA = 109, /* volume's maximum quota exceeded */
28 VBUSY = 110, /* volume is temporarily unavailable */
29 VMOVED = 111, /* volume moved to new server - ask this FS where */
30} afs_rxfs_abort_t;
31
32extern int afs_abort_to_error(int abortcode);
33
34#endif /* _LINUX_AFS_ERRORS_H */
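afs_abort_to_error(), declared above, is what turns these wire abort codes into errnos for the VFS; its table lives in fs/afs/misc.c and is not part of this hunk. A plausible sketch of its shape - the individual mappings here are assumptions, not quotations:

	int afs_abort_to_error_sketch(int abortcode)
	{
		switch (abortcode) {
		case VNOVNODE:		return -ENOENT;	/* assumed */
		case VDISKFULL:		return -ENOSPC;	/* assumed */
		case VOVERQUOTA:	return -EDQUOT;	/* assumed */
		default:		return -EIO;	/* assumed fallback */
		}
	}
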
diff --git a/fs/afs/file.c b/fs/afs/file.c
index b17634541f67..ae256498f4f7 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -1,6 +1,6 @@
1/* file.c: AFS filesystem file handling 1/* AFS filesystem file handling
2 * 2 *
3 * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved. 3 * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com) 4 * Written by David Howells (dhowells@redhat.com)
5 * 5 *
6 * This program is free software; you can redistribute it and/or 6 * This program is free software; you can redistribute it and/or
@@ -15,22 +15,25 @@
15#include <linux/slab.h> 15#include <linux/slab.h>
16#include <linux/fs.h> 16#include <linux/fs.h>
17#include <linux/pagemap.h> 17#include <linux/pagemap.h>
18#include "volume.h"
19#include "vnode.h"
20#include <rxrpc/call.h>
21#include "internal.h" 18#include "internal.h"
22 19
23#if 0
24static int afs_file_open(struct inode *inode, struct file *file);
25static int afs_file_release(struct inode *inode, struct file *file);
26#endif
27
28static int afs_file_readpage(struct file *file, struct page *page); 20static int afs_file_readpage(struct file *file, struct page *page);
29static void afs_file_invalidatepage(struct page *page, unsigned long offset); 21static void afs_file_invalidatepage(struct page *page, unsigned long offset);
30static int afs_file_releasepage(struct page *page, gfp_t gfp_flags); 22static int afs_file_releasepage(struct page *page, gfp_t gfp_flags);
31 23
24const struct file_operations afs_file_operations = {
25 .open = afs_open,
26 .release = afs_release,
27 .llseek = generic_file_llseek,
28 .read = do_sync_read,
29 .aio_read = generic_file_aio_read,
30 .mmap = generic_file_readonly_mmap,
31 .sendfile = generic_file_sendfile,
32};
33
32const struct inode_operations afs_file_inode_operations = { 34const struct inode_operations afs_file_inode_operations = {
33 .getattr = afs_inode_getattr, 35 .getattr = afs_inode_getattr,
36 .permission = afs_permission,
34}; 37};
35 38
36const struct address_space_operations afs_fs_aops = { 39const struct address_space_operations afs_fs_aops = {
@@ -40,7 +43,48 @@ const struct address_space_operations afs_fs_aops = {
40 .invalidatepage = afs_file_invalidatepage, 43 .invalidatepage = afs_file_invalidatepage,
41}; 44};
42 45
43/*****************************************************************************/ 46/*
47 * open an AFS file or directory and attach a key to it
48 */
49int afs_open(struct inode *inode, struct file *file)
50{
51 struct afs_vnode *vnode = AFS_FS_I(inode);
52 struct key *key;
53 int ret;
54
55 _enter("{%x:%x},", vnode->fid.vid, vnode->fid.vnode);
56
57 key = afs_request_key(vnode->volume->cell);
58 if (IS_ERR(key)) {
59 _leave(" = %ld [key]", PTR_ERR(key));
60 return PTR_ERR(key);
61 }
62
63 ret = afs_validate(vnode, key);
64 if (ret < 0) {
65 _leave(" = %d [val]", ret);
66 return ret;
67 }
68
69 file->private_data = key;
70 _leave(" = 0");
71 return 0;
72}
73
74/*
75 * release an AFS file or directory and discard its key
76 */
77int afs_release(struct inode *inode, struct file *file)
78{
79 struct afs_vnode *vnode = AFS_FS_I(inode);
80
81 _enter("{%x:%x},", vnode->fid.vid, vnode->fid.vnode);
82
83 key_put(file->private_data);
84 _leave(" = 0");
85 return 0;
86}
87
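One wrinkle in afs_open() as shown: the afs_validate() failure branch returns without dropping the reference taken by afs_request_key(). A sketch of the same function with the reference balanced, using only the helpers already in this diff:

	int afs_open_balanced(struct inode *inode, struct file *file)
	{
		struct afs_vnode *vnode = AFS_FS_I(inode);
		struct key *key;
		int ret;

		key = afs_request_key(vnode->volume->cell);
		if (IS_ERR(key))
			return PTR_ERR(key);

		ret = afs_validate(vnode, key);
		if (ret < 0) {
			key_put(key);	/* drop the reference on failure too */
			return ret;
		}

		file->private_data = key;	/* released in afs_release() */
		return 0;
	}
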
44/* 88/*
45 * deal with notification that a page was read from the cache 89 * deal with notification that a page was read from the cache
46 */ 90 */
@@ -58,10 +102,9 @@ static void afs_file_readpage_read_complete(void *cookie_data,
58 SetPageUptodate(page); 102 SetPageUptodate(page);
59 unlock_page(page); 103 unlock_page(page);
60 104
61} /* end afs_file_readpage_read_complete() */ 105}
62#endif 106#endif
63 107
64/*****************************************************************************/
65/* 108/*
66 * deal with notification that a page was written to the cache 109 * deal with notification that a page was written to the cache
67 */ 110 */
@@ -74,41 +117,38 @@ static void afs_file_readpage_write_complete(void *cookie_data,
74 _enter("%p,%p,%p,%d", cookie_data, page, data, error); 117 _enter("%p,%p,%p,%d", cookie_data, page, data, error);
75 118
76 unlock_page(page); 119 unlock_page(page);
77 120}
78} /* end afs_file_readpage_write_complete() */
79#endif 121#endif
80 122
81/*****************************************************************************/
82/* 123/*
83 * AFS read page from file (or symlink) 124 * AFS read page from file (or symlink)
84 */ 125 */
85static int afs_file_readpage(struct file *file, struct page *page) 126static int afs_file_readpage(struct file *file, struct page *page)
86{ 127{
87 struct afs_rxfs_fetch_descriptor desc;
88#ifdef AFS_CACHING_SUPPORT
89 struct cachefs_page *pageio;
90#endif
91 struct afs_vnode *vnode; 128 struct afs_vnode *vnode;
92 struct inode *inode; 129 struct inode *inode;
130 struct key *key;
131 size_t len;
132 off_t offset;
93 int ret; 133 int ret;
94 134
95 inode = page->mapping->host; 135 inode = page->mapping->host;
96 136
97 _enter("{%lu},{%lu}", inode->i_ino, page->index); 137 ASSERT(file != NULL);
138 key = file->private_data;
139 ASSERT(key != NULL);
140
141 _enter("{%x},{%lu},{%lu}", key_serial(key), inode->i_ino, page->index);
98 142
99 vnode = AFS_FS_I(inode); 143 vnode = AFS_FS_I(inode);
100 144
101 BUG_ON(!PageLocked(page)); 145 BUG_ON(!PageLocked(page));
102 146
103 ret = -ESTALE; 147 ret = -ESTALE;
104 if (vnode->flags & AFS_VNODE_DELETED) 148 if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
105 goto error; 149 goto error;
106 150
107#ifdef AFS_CACHING_SUPPORT 151#ifdef AFS_CACHING_SUPPORT
108 ret = cachefs_page_get_private(page, &pageio, GFP_NOIO);
109 if (ret < 0)
110 goto error;
111
112 /* is it cached? */ 152 /* is it cached? */
113 ret = cachefs_read_or_alloc_page(vnode->cache, 153 ret = cachefs_read_or_alloc_page(vnode->cache,
114 page, 154 page,
@@ -132,26 +172,19 @@ static int afs_file_readpage(struct file *file, struct page *page)
132 case -ENOBUFS: 172 case -ENOBUFS:
133 case -ENODATA: 173 case -ENODATA:
134 default: 174 default:
135 desc.fid = vnode->fid; 175 offset = page->index << PAGE_CACHE_SHIFT;
136 desc.offset = page->index << PAGE_CACHE_SHIFT; 176 len = min_t(size_t, i_size_read(inode) - offset, PAGE_SIZE);
137 desc.size = min((size_t) (inode->i_size - desc.offset),
138 (size_t) PAGE_SIZE);
139 desc.buffer = kmap(page);
140
141 clear_page(desc.buffer);
142 177
143 /* read the contents of the file from the server into the 178 /* read the contents of the file from the server into the
144 * page */ 179 * page */
145 ret = afs_vnode_fetch_data(vnode, &desc); 180 ret = afs_vnode_fetch_data(vnode, key, offset, len, page);
146 kunmap(page);
147 if (ret < 0) { 181 if (ret < 0) {
148 if (ret==-ENOENT) { 182 if (ret == -ENOENT) {
149 _debug("got NOENT from server" 183 _debug("got NOENT from server"
150 " - marking file deleted and stale"); 184 " - marking file deleted and stale");
151 vnode->flags |= AFS_VNODE_DELETED; 185 set_bit(AFS_VNODE_DELETED, &vnode->flags);
152 ret = -ESTALE; 186 ret = -ESTALE;
153 } 187 }
154
155#ifdef AFS_CACHING_SUPPORT 188#ifdef AFS_CACHING_SUPPORT
156 cachefs_uncache_page(vnode->cache, page); 189 cachefs_uncache_page(vnode->cache, page);
157#endif 190#endif
@@ -178,16 +211,13 @@ static int afs_file_readpage(struct file *file, struct page *page)
178 _leave(" = 0"); 211 _leave(" = 0");
179 return 0; 212 return 0;
180 213
181 error: 214error:
182 SetPageError(page); 215 SetPageError(page);
183 unlock_page(page); 216 unlock_page(page);
184
185 _leave(" = %d", ret); 217 _leave(" = %d", ret);
186 return ret; 218 return ret;
219}
187 220
188} /* end afs_file_readpage() */
189
190/*****************************************************************************/
191/* 221/*
192 * get a page cookie for the specified page 222 * get a page cookie for the specified page
193 */ 223 */
@@ -202,10 +232,9 @@ int afs_cache_get_page_cookie(struct page *page,
202 232
203 _leave(" = %d", ret); 233 _leave(" = %d", ret);
204 return ret; 234 return ret;
205} /* end afs_cache_get_page_cookie() */ 235}
206#endif 236#endif
207 237
208/*****************************************************************************/
209/* 238/*
210 * invalidate part or all of a page 239 * invalidate part or all of a page
211 */ 240 */
@@ -240,9 +269,8 @@ static void afs_file_invalidatepage(struct page *page, unsigned long offset)
240 } 269 }
241 270
242 _leave(" = %d", ret); 271 _leave(" = %d", ret);
243} /* end afs_file_invalidatepage() */ 272}
244 273
245/*****************************************************************************/
246/* 274/*
247 * release a page and cleanup its private data 275 * release a page and cleanup its private data
248 */ 276 */
@@ -267,4 +295,4 @@ static int afs_file_releasepage(struct page *page, gfp_t gfp_flags)
267 295
268 _leave(" = 0"); 296 _leave(" = 0");
269 return 0; 297 return 0;
270} /* end afs_file_releasepage() */ 298}
diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index 61bc371532ab..e54e6c2ad343 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -1,6 +1,6 @@
1/* fsclient.c: AFS File Server client stubs 1/* AFS File Server client stubs
2 * 2 *
3 * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved. 3 * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com) 4 * Written by David Howells (dhowells@redhat.com)
5 * 5 *
6 * This program is free software; you can redistribute it and/or 6 * This program is free software; you can redistribute it and/or
@@ -11,827 +11,928 @@
11 11
12#include <linux/init.h> 12#include <linux/init.h>
13#include <linux/sched.h> 13#include <linux/sched.h>
14#include <rxrpc/rxrpc.h> 14#include <linux/circ_buf.h>
15#include <rxrpc/transport.h>
16#include <rxrpc/connection.h>
17#include <rxrpc/call.h>
18#include "fsclient.h"
19#include "cmservice.h"
20#include "vnode.h"
21#include "server.h"
22#include "errors.h"
23#include "internal.h" 15#include "internal.h"
16#include "afs_fs.h"
24 17
25#define FSFETCHSTATUS 132 /* AFS Fetch file status */
26#define FSFETCHDATA 130 /* AFS Fetch file data */
27#define FSGIVEUPCALLBACKS 147 /* AFS Discard callback promises */
28#define FSGETVOLUMEINFO 148 /* AFS Get root volume information */
29#define FSGETROOTVOLUME 151 /* AFS Get root volume name */
30#define FSLOOKUP 161 /* AFS lookup file in directory */
31
32/*****************************************************************************/
33/* 18/*
34 * map afs abort codes to/from Linux error codes 19 * decode an AFSFid block
35 * - called with call->lock held
36 */ 20 */
37static void afs_rxfs_aemap(struct rxrpc_call *call) 21static void xdr_decode_AFSFid(const __be32 **_bp, struct afs_fid *fid)
38{ 22{
39 switch (call->app_err_state) { 23 const __be32 *bp = *_bp;
40 case RXRPC_ESTATE_LOCAL_ABORT: 24
41 call->app_abort_code = -call->app_errno; 25 fid->vid = ntohl(*bp++);
42 break; 26 fid->vnode = ntohl(*bp++);
43 case RXRPC_ESTATE_PEER_ABORT: 27 fid->unique = ntohl(*bp++);
44 call->app_errno = afs_abort_to_error(call->app_abort_code); 28 *_bp = bp;
45 break; 29}
46 default:
47 break;
48 }
49} /* end afs_rxfs_aemap() */
50 30
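All of the xdr_decode_*() helpers that follow share one convention: the caller holds a cursor into the reply buffer and each helper advances it past exactly the words it consumed, so decoders compose by plain sequencing. Usage sketch:

	static void decode_reply_sketch(struct afs_call *call,
					struct afs_fid *fid)
	{
		const __be32 *bp = call->buffer;

		xdr_decode_AFSFid(&bp, fid);
		/* bp now points at whatever follows the FID on the wire */
	}
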
51/*****************************************************************************/
52/* 31/*
53 * get the root volume name from a fileserver 32 * decode an AFSFetchStatus block
54 * - this operation doesn't seem to work correctly in OpenAFS server 1.2.2
55 */ 33 */
56#if 0 34static void xdr_decode_AFSFetchStatus(const __be32 **_bp,
57int afs_rxfs_get_root_volume(struct afs_server *server, 35 struct afs_file_status *status,
58 char *buf, size_t *buflen) 36 struct afs_vnode *vnode)
59{ 37{
60 struct rxrpc_connection *conn; 38 const __be32 *bp = *_bp;
61 struct rxrpc_call *call; 39 umode_t mode;
62 struct kvec piov[2]; 40 u64 data_version, size;
63 size_t sent; 41 u32 changed = 0; /* becomes non-zero if ctime-type changes seen */
64 int ret; 42
65 u32 param[1]; 43#define EXTRACT(DST) \
44 do { \
45 u32 x = ntohl(*bp++); \
46 changed |= DST - x; \
47 DST = x; \
48 } while (0)
49
50 status->if_version = ntohl(*bp++);
51 EXTRACT(status->type);
52 EXTRACT(status->nlink);
53 size = ntohl(*bp++);
54 data_version = ntohl(*bp++);
55 EXTRACT(status->author);
56 EXTRACT(status->owner);
57 EXTRACT(status->caller_access); /* call ticket dependent */
58 EXTRACT(status->anon_access);
59 EXTRACT(status->mode);
60 EXTRACT(status->parent.vnode);
61 EXTRACT(status->parent.unique);
62 bp++; /* seg size */
63 status->mtime_client = ntohl(*bp++);
64 status->mtime_server = ntohl(*bp++);
65 EXTRACT(status->group);
66 bp++; /* sync counter */
67 data_version |= (u64) ntohl(*bp++) << 32;
68 bp++; /* lock count */
69 size |= (u64) ntohl(*bp++) << 32;
70 bp++; /* spare 4 */
71 *_bp = bp;
72
73 if (size != status->size) {
74 status->size = size;
75 changed |= true;
76 }
77 status->mode &= S_IALLUGO;
78
79 _debug("vnode time %lx, %lx",
80 status->mtime_client, status->mtime_server);
81
82 if (vnode) {
83 status->parent.vid = vnode->fid.vid;
84 if (changed && !test_bit(AFS_VNODE_UNSET, &vnode->flags)) {
85 _debug("vnode changed");
86 i_size_write(&vnode->vfs_inode, size);
87 vnode->vfs_inode.i_uid = status->owner;
88 vnode->vfs_inode.i_gid = status->group;
89 vnode->vfs_inode.i_version = vnode->fid.unique;
90 vnode->vfs_inode.i_nlink = status->nlink;
91
92 mode = vnode->vfs_inode.i_mode;
93 mode &= ~S_IALLUGO;
94 mode |= status->mode;
95 barrier();
96 vnode->vfs_inode.i_mode = mode;
97 }
66 98
67 DECLARE_WAITQUEUE(myself, current); 99 vnode->vfs_inode.i_ctime.tv_sec = status->mtime_server;
100 vnode->vfs_inode.i_mtime = vnode->vfs_inode.i_ctime;
101 vnode->vfs_inode.i_atime = vnode->vfs_inode.i_ctime;
102 }
68 103
69 kenter("%p,%p,%u",server, buf, *buflen); 104 if (status->data_version != data_version) {
105 status->data_version = data_version;
106 if (vnode && !test_bit(AFS_VNODE_UNSET, &vnode->flags)) {
107 _debug("vnode modified %llx on {%x:%u}",
108 (unsigned long long) data_version,
109 vnode->fid.vid, vnode->fid.vnode);
110 set_bit(AFS_VNODE_MODIFIED, &vnode->flags);
111 set_bit(AFS_VNODE_ZAP_DATA, &vnode->flags);
112 }
113 }
114}
70 115
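The EXTRACT() macro above doubles as a branch-free change detector: subtracting the wire value from the cached one ORs a non-zero residue into 'changed' whenever any field differs, and a single test at the end decides whether the inode needs refreshing. The trick in isolation (XOR would serve equally well):

	static u32 extract_u32(u32 *cached, __be32 wire, u32 changed)
	{
		u32 x = ntohl(wire);

		changed |= *cached - x;	/* non-zero iff the field moved */
		*cached = x;
		return changed;
	}
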
71 /* get hold of the fileserver connection */ 116/*
72 ret = afs_server_get_fsconn(server, &conn); 117 * decode an AFSCallBack block
73 if (ret < 0) 118 */
74 goto out; 119static void xdr_decode_AFSCallBack(const __be32 **_bp, struct afs_vnode *vnode)
120{
121 const __be32 *bp = *_bp;
75 122
76 /* create a call through that connection */ 123 vnode->cb_version = ntohl(*bp++);
77 ret = rxrpc_create_call(conn, NULL, NULL, afs_rxfs_aemap, &call); 124 vnode->cb_expiry = ntohl(*bp++);
78 if (ret < 0) { 125 vnode->cb_type = ntohl(*bp++);
79 printk("kAFS: Unable to create call: %d\n", ret); 126 vnode->cb_expires = vnode->cb_expiry + get_seconds();
80 goto out_put_conn; 127 *_bp = bp;
81 } 128}
82 call->app_opcode = FSGETROOTVOLUME;
83 129
84 /* we want to get event notifications from the call */ 130static void xdr_decode_AFSCallBack_raw(const __be32 **_bp,
85 add_wait_queue(&call->waitq, &myself); 131 struct afs_callback *cb)
132{
133 const __be32 *bp = *_bp;
86 134
87 /* marshall the parameters */ 135 cb->version = ntohl(*bp++);
88 param[0] = htonl(FSGETROOTVOLUME); 136 cb->expiry = ntohl(*bp++);
89 137 cb->type = ntohl(*bp++);
90 piov[0].iov_len = sizeof(param); 138 *_bp = bp;
91 piov[0].iov_base = param; 139}
92
93 /* send the parameters to the server */
94 ret = rxrpc_call_write_data(call, 1, piov, RXRPC_LAST_PACKET, GFP_NOFS,
95 0, &sent);
96 if (ret < 0)
97 goto abort;
98
99 /* wait for the reply to completely arrive */
100 for (;;) {
101 set_current_state(TASK_INTERRUPTIBLE);
102 if (call->app_call_state != RXRPC_CSTATE_CLNT_RCV_REPLY ||
103 signal_pending(current))
104 break;
105 schedule();
106 }
107 set_current_state(TASK_RUNNING);
108 140
109 ret = -EINTR; 141/*
110 if (signal_pending(current)) 142 * decode an AFSVolSync block
111 goto abort; 143 */
144static void xdr_decode_AFSVolSync(const __be32 **_bp,
145 struct afs_volsync *volsync)
146{
147 const __be32 *bp = *_bp;
112 148
113 switch (call->app_call_state) { 149 volsync->creation = ntohl(*bp++);
114 case RXRPC_CSTATE_ERROR: 150 bp++; /* spare2 */
115 ret = call->app_errno; 151 bp++; /* spare3 */
116 kdebug("Got Error: %d", ret); 152 bp++; /* spare4 */
117 goto out_unwait; 153 bp++; /* spare5 */
154 bp++; /* spare6 */
155 *_bp = bp;
156}
118 157
119 case RXRPC_CSTATE_CLNT_GOT_REPLY: 158/*
120 /* read the reply */ 159 * deliver reply data to an FS.FetchStatus
121 kdebug("Got Reply: qty=%d", call->app_ready_qty); 160 */
161static int afs_deliver_fs_fetch_status(struct afs_call *call,
162 struct sk_buff *skb, bool last)
163{
164 struct afs_vnode *vnode = call->reply;
165 const __be32 *bp;
122 166
123 ret = -EBADMSG; 167 _enter(",,%u", last);
124 if (call->app_ready_qty <= 4)
125 goto abort;
126 168
127 ret = rxrpc_call_read_data(call, NULL, call->app_ready_qty, 0); 169 afs_transfer_reply(call, skb);
128 if (ret < 0) 170 if (!last)
129 goto abort; 171 return 0;
130 172
131#if 0 173 if (call->reply_size != call->reply_max)
132 /* unmarshall the reply */ 174 return -EBADMSG;
133 bp = buffer;
134 for (loop = 0; loop < 65; loop++)
135 entry->name[loop] = ntohl(*bp++);
136 entry->name[64] = 0;
137 175
138 entry->type = ntohl(*bp++); 176 /* unmarshall the reply once we've received all of it */
139 entry->num_servers = ntohl(*bp++); 177 bp = call->buffer;
178 xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode);
179 xdr_decode_AFSCallBack(&bp, vnode);
180 if (call->reply2)
181 xdr_decode_AFSVolSync(&bp, call->reply2);
140 182
141 for (loop = 0; loop < 8; loop++) 183 _leave(" = 0 [done]");
142 entry->servers[loop].addr.s_addr = *bp++; 184 return 0;
185}
143 186
144 for (loop = 0; loop < 8; loop++) 187/*
145 entry->servers[loop].partition = ntohl(*bp++); 188 * FS.FetchStatus operation type
189 */
190static const struct afs_call_type afs_RXFSFetchStatus = {
191 .name = "FS.FetchStatus",
192 .deliver = afs_deliver_fs_fetch_status,
193 .abort_to_error = afs_abort_to_error,
194 .destructor = afs_flat_call_destructor,
195};
146 196
147 for (loop = 0; loop < 8; loop++) 197/*
148 entry->servers[loop].flags = ntohl(*bp++); 198 * fetch the status information for a file
199 */
200int afs_fs_fetch_file_status(struct afs_server *server,
201 struct key *key,
202 struct afs_vnode *vnode,
203 struct afs_volsync *volsync,
204 const struct afs_wait_mode *wait_mode)
205{
206 struct afs_call *call;
207 __be32 *bp;
149 208
150 for (loop = 0; loop < 3; loop++) 209 _enter(",%x,{%x:%d},,",
151 entry->volume_ids[loop] = ntohl(*bp++); 210 key_serial(key), vnode->fid.vid, vnode->fid.vnode);
152 211
153 entry->clone_id = ntohl(*bp++); 212 call = afs_alloc_flat_call(&afs_RXFSFetchStatus, 16, (21 + 3 + 6) * 4);
154 entry->flags = ntohl(*bp); 213 if (!call)
155#endif 214 return -ENOMEM;
156 215
157 /* success */ 216 call->key = key;
158 ret = 0; 217 call->reply = vnode;
159 goto out_unwait; 218 call->reply2 = volsync;
219 call->service_id = FS_SERVICE;
220 call->port = htons(AFS_FS_PORT);
160 221
161 default: 222 /* marshall the parameters */
162 BUG(); 223 bp = call->request;
163 } 224 bp[0] = htonl(FSFETCHSTATUS);
225 bp[1] = htonl(vnode->fid.vid);
226 bp[2] = htonl(vnode->fid.vnode);
227 bp[3] = htonl(vnode->fid.unique);
228
229 return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
230}
164 231
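The buffer sizes passed to afs_alloc_flat_call() above fall straight out of the wire format: the request is four words (the opcode plus a three-word AFSFid), hence 16 bytes, and the reply is an AFSFetchStatus (21 words, exactly what xdr_decode_AFSFetchStatus() consumes) followed by an AFSCallBack (3 words) and an AFSVolSync (6 words), hence (21 + 3 + 6) * 4 = 120 bytes - the same 120 bytes the deleted rxrpc code read into its scratch buffer below.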
165 abort:
166 set_current_state(TASK_UNINTERRUPTIBLE);
167 rxrpc_call_abort(call, ret);
168 schedule();
169 out_unwait:
170 set_current_state(TASK_RUNNING);
171 remove_wait_queue(&call->waitq, &myself);
172 rxrpc_put_call(call);
173 out_put_conn:
174 afs_server_release_fsconn(server, conn);
175 out:
176 kleave("");
177 return ret;
178} /* end afs_rxfs_get_root_volume() */
179#endif
180
181/*****************************************************************************/
182/* 232/*
183 * get information about a volume 233 * deliver reply data to an FS.FetchData
184 */ 234 */
185#if 0 235static int afs_deliver_fs_fetch_data(struct afs_call *call,
186int afs_rxfs_get_volume_info(struct afs_server *server, 236 struct sk_buff *skb, bool last)
187 const char *name,
188 struct afs_volume_info *vinfo)
189{ 237{
190 struct rxrpc_connection *conn; 238 struct afs_vnode *vnode = call->reply;
191 struct rxrpc_call *call; 239 const __be32 *bp;
192 struct kvec piov[3]; 240 struct page *page;
193 size_t sent; 241 void *buffer;
194 int ret; 242 int ret;
195 u32 param[2], *bp, zero;
196 243
197 DECLARE_WAITQUEUE(myself, current); 244 _enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
245
246 switch (call->unmarshall) {
247 case 0:
248 call->offset = 0;
249 call->unmarshall++;
250
251 /* extract the returned data length */
252 case 1:
253 _debug("extract data length");
254 ret = afs_extract_data(call, skb, last, &call->tmp, 4);
255 switch (ret) {
256 case 0: break;
257 case -EAGAIN: return 0;
258 default: return ret;
259 }
-
-	_enter("%p,%s,%p", server, name, vinfo);
-
-	/* get hold of the fileserver connection */
-	ret = afs_server_get_fsconn(server, &conn);
-	if (ret < 0)
-		goto out;
-
-	/* create a call through that connection */
-	ret = rxrpc_create_call(conn, NULL, NULL, afs_rxfs_aemap, &call);
-	if (ret < 0) {
-		printk("kAFS: Unable to create call: %d\n", ret);
-		goto out_put_conn;
-	}
-	call->app_opcode = FSGETVOLUMEINFO;
-
-	/* we want to get event notifications from the call */
-	add_wait_queue(&call->waitq, &myself);
-
-	/* marshall the parameters */
-	piov[1].iov_len = strlen(name);
-	piov[1].iov_base = (char *) name;
-
-	zero = 0;
-	piov[2].iov_len = (4 - (piov[1].iov_len & 3)) & 3;
-	piov[2].iov_base = &zero;
-
-	param[0] = htonl(FSGETVOLUMEINFO);
-	param[1] = htonl(piov[1].iov_len);
-
-	piov[0].iov_len = sizeof(param);
-	piov[0].iov_base = param;
-
-	/* send the parameters to the server */
-	ret = rxrpc_call_write_data(call, 3, piov, RXRPC_LAST_PACKET, GFP_NOFS,
-				    0, &sent);
-	if (ret < 0)
-		goto abort;
-
-	/* wait for the reply to completely arrive */
-	bp = rxrpc_call_alloc_scratch(call, 64);
-
-	ret = rxrpc_call_read_data(call, bp, 64,
-				   RXRPC_CALL_READ_BLOCK |
-				   RXRPC_CALL_READ_ALL);
-	if (ret < 0) {
-		if (ret == -ECONNABORTED) {
-			ret = call->app_errno;
-			goto out_unwait;
-		}
-		goto abort;
-	}
-
-	/* unmarshall the reply */
-	vinfo->vid = ntohl(*bp++);
-	vinfo->type = ntohl(*bp++);
-
-	vinfo->type_vids[0] = ntohl(*bp++);
-	vinfo->type_vids[1] = ntohl(*bp++);
-	vinfo->type_vids[2] = ntohl(*bp++);
-	vinfo->type_vids[3] = ntohl(*bp++);
-	vinfo->type_vids[4] = ntohl(*bp++);
-
-	vinfo->nservers = ntohl(*bp++);
-	vinfo->servers[0].addr.s_addr = *bp++;
-	vinfo->servers[1].addr.s_addr = *bp++;
-	vinfo->servers[2].addr.s_addr = *bp++;
-	vinfo->servers[3].addr.s_addr = *bp++;
-	vinfo->servers[4].addr.s_addr = *bp++;
-	vinfo->servers[5].addr.s_addr = *bp++;
-	vinfo->servers[6].addr.s_addr = *bp++;
-	vinfo->servers[7].addr.s_addr = *bp++;
-
-	ret = -EBADMSG;
-	if (vinfo->nservers > 8)
-		goto abort;
-
-	/* success */
-	ret = 0;
-
- out_unwait:
-	set_current_state(TASK_RUNNING);
-	remove_wait_queue(&call->waitq, &myself);
-	rxrpc_put_call(call);
- out_put_conn:
-	afs_server_release_fsconn(server, conn);
- out:
-	_leave("");
-	return ret;
-
- abort:
-	set_current_state(TASK_UNINTERRUPTIBLE);
-	rxrpc_call_abort(call, ret);
-	schedule();
-	goto out_unwait;
-
-} /* end afs_rxfs_get_volume_info() */
-#endif
-
+
+		call->count = ntohl(call->tmp);
+		_debug("DATA length: %u", call->count);
+		if (call->count > PAGE_SIZE)
+			return -EBADMSG;
+		call->offset = 0;
+		call->unmarshall++;
+
+		if (call->count < PAGE_SIZE) {
+			page = call->reply3;
+			buffer = kmap_atomic(page, KM_USER0);
+			memset(buffer + PAGE_SIZE - call->count, 0,
+			       call->count);
+			kunmap_atomic(buffer, KM_USER0);
+		}
+
+		/* extract the returned data */
+	case 2:
+		_debug("extract data");
+		page = call->reply3;
+		buffer = kmap_atomic(page, KM_USER0);
+		ret = afs_extract_data(call, skb, last, buffer, call->count);
+		kunmap_atomic(buffer, KM_USER0);
+		switch (ret) {
+		case 0:		break;
+		case -EAGAIN:	return 0;
+		default:	return ret;
+		}
+
+		call->offset = 0;
+		call->unmarshall++;
+
+		/* extract the metadata */
+	case 3:
+		ret = afs_extract_data(call, skb, last, call->buffer,
+				       (21 + 3 + 6) * 4);
+		switch (ret) {
+		case 0:		break;
+		case -EAGAIN:	return 0;
+		default:	return ret;
+		}
+
+		bp = call->buffer;
+		xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode);
+		xdr_decode_AFSCallBack(&bp, vnode);
+		if (call->reply2)
+			xdr_decode_AFSVolSync(&bp, call->reply2);
+
+		call->offset = 0;
+		call->unmarshall++;
+
+	case 4:
+		_debug("trailer");
+		if (skb->len != 0)
+			return -EBADMSG;
+		break;
+	}
+
+	if (!last)
+		return 0;
+
+	_leave(" = 0 [done]");
+	return 0;
+}
+
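The new deliver routine above is a resumable state machine: each received packet advances call->unmarshall, and a short read returns to the caller until more data arrives. The following is a minimal, standalone userspace sketch of the same accumulate-and-advance pattern; the struct and function names (model_call, extract) are invented for illustration and are not the kernel code.

#include <errno.h>
#include <stdio.h>
#include <string.h>

struct model_call {
	unsigned unmarshall;	/* current unmarshalling phase */
	size_t offset;		/* bytes gathered so far in this phase */
	unsigned char buf[16];	/* staging area for the current item */
};

/* gather up to "want" bytes for the current phase; -EAGAIN = need more */
static int extract(struct model_call *call, const unsigned char **data,
		   size_t *len, size_t want)
{
	size_t n = want - call->offset;

	if (n > *len)
		n = *len;
	memcpy(call->buf + call->offset, *data, n);
	call->offset += n;
	*data += n;
	*len -= n;
	return call->offset < want ? -EAGAIN : 0;
}

int main(void)
{
	static const unsigned char pkt1[] = { 0, 0, 0, 4, 'a', 'b' };
	static const unsigned char pkt2[] = { 'c', 'd' };
	struct model_call call = { .unmarshall = 1, .offset = 0 };
	const unsigned char *p = pkt1;
	size_t n = sizeof(pkt1);

	while (call.unmarshall < 3) {	/* two 4-byte phases in this demo */
		if (extract(&call, &p, &n, 4) == -EAGAIN) {
			/* out of data: the "next packet" arrives; in this
			 * demo pkt2 supplies everything still needed */
			p = pkt2;
			n = sizeof(pkt2);
			continue;
		}
		printf("phase %u complete\n", call.unmarshall);
		call.unmarshall++;
		call.offset = 0;
	}
	return 0;
}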
-/*****************************************************************************/
-/*
- * fetch the status information for a file
- */
-int afs_rxfs_fetch_file_status(struct afs_server *server,
-			       struct afs_vnode *vnode,
-			       struct afs_volsync *volsync)
-{
-	struct afs_server_callslot callslot;
-	struct rxrpc_call *call;
-	struct kvec piov[1];
-	size_t sent;
-	int ret;
-	__be32 *bp;
-
-	DECLARE_WAITQUEUE(myself, current);
-
-	_enter("%p,{%u,%u,%u}",
-	       server, vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique);
-
-	/* get hold of the fileserver connection */
-	ret = afs_server_request_callslot(server, &callslot);
-	if (ret < 0)
-		goto out;
-
-	/* create a call through that connection */
-	ret = rxrpc_create_call(callslot.conn, NULL, NULL, afs_rxfs_aemap,
-				&call);
-	if (ret < 0) {
-		printk("kAFS: Unable to create call: %d\n", ret);
-		goto out_put_conn;
-	}
-	call->app_opcode = FSFETCHSTATUS;
-
-	/* we want to get event notifications from the call */
-	add_wait_queue(&call->waitq, &myself);
-
-	/* marshall the parameters */
-	bp = rxrpc_call_alloc_scratch(call, 16);
-	bp[0] = htonl(FSFETCHSTATUS);
-	bp[1] = htonl(vnode->fid.vid);
-	bp[2] = htonl(vnode->fid.vnode);
-	bp[3] = htonl(vnode->fid.unique);
-
-	piov[0].iov_len = 16;
-	piov[0].iov_base = bp;
+/*
+ * FS.FetchData operation type
+ */
+static const struct afs_call_type afs_RXFSFetchData = {
+	.name		= "FS.FetchData",
+	.deliver	= afs_deliver_fs_fetch_data,
+	.abort_to_error	= afs_abort_to_error,
+	.destructor	= afs_flat_call_destructor,
+};
+
+/*
+ * fetch data from a file
+ */
+int afs_fs_fetch_data(struct afs_server *server,
+		      struct key *key,
+		      struct afs_vnode *vnode,
+		      off_t offset, size_t length,
+		      struct page *buffer,
+		      const struct afs_wait_mode *wait_mode)
+{
+	struct afs_call *call;
+	__be32 *bp;
+
+	_enter("");
+
+	call = afs_alloc_flat_call(&afs_RXFSFetchData, 24, (21 + 3 + 6) * 4);
+	if (!call)
+		return -ENOMEM;
+
+	call->key = key;
+	call->reply = vnode;
+	call->reply2 = NULL; /* volsync */
+	call->reply3 = buffer;
+	call->service_id = FS_SERVICE;
+	call->port = htons(AFS_FS_PORT);
+
+	/* marshall the parameters */
+	bp = call->request;
+	bp[0] = htonl(FSFETCHDATA);
+	bp[1] = htonl(vnode->fid.vid);
+	bp[2] = htonl(vnode->fid.vnode);
+	bp[3] = htonl(vnode->fid.unique);
+	bp[4] = htonl(offset);
+	bp[5] = htonl(length);
+
+	return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
+}
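The request side of such a flat call is just a run of 32-bit big-endian words: the opcode, the FID, then the operation's arguments. A standalone userspace sketch of the same marshalling follows; the FSFETCHDATA opcode value is an assumption taken from the AFS protocol tables, and the local buffer stands in for call->request.

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

#define FSFETCHDATA 130		/* assumed RXFS opcode; illustrative only */

struct fid { uint32_t vid, vnode, unique; };

/* write the six-word FS.FetchData request; returns its size in bytes */
static size_t marshal_fetch_data(uint32_t *bp, const struct fid *fid,
				 uint32_t offset, uint32_t length)
{
	uint32_t *start = bp;

	*bp++ = htonl(FSFETCHDATA);
	*bp++ = htonl(fid->vid);
	*bp++ = htonl(fid->vnode);
	*bp++ = htonl(fid->unique);
	*bp++ = htonl(offset);
	*bp++ = htonl(length);
	return (bp - start) * sizeof(uint32_t);	/* 24, matching the call */
}

int main(void)
{
	uint32_t req[6];
	struct fid fid = { .vid = 1, .vnode = 2, .unique = 3 };

	printf("request size = %zu bytes\n",
	       marshal_fetch_data(req, &fid, 0, 4096));
	return 0;
}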
-
-	/* send the parameters to the server */
-	ret = rxrpc_call_write_data(call, 1, piov, RXRPC_LAST_PACKET, GFP_NOFS,
-				    0, &sent);
-	if (ret < 0)
-		goto abort;
-
-	/* wait for the reply to completely arrive */
-	bp = rxrpc_call_alloc_scratch(call, 120);
-
-	ret = rxrpc_call_read_data(call, bp, 120,
-				   RXRPC_CALL_READ_BLOCK |
-				   RXRPC_CALL_READ_ALL);
-	if (ret < 0) {
-		if (ret == -ECONNABORTED) {
-			ret = call->app_errno;
-			goto out_unwait;
-		}
-		goto abort;
-	}
-
-	/* unmarshall the reply */
-	vnode->status.if_version = ntohl(*bp++);
-	vnode->status.type = ntohl(*bp++);
-	vnode->status.nlink = ntohl(*bp++);
-	vnode->status.size = ntohl(*bp++);
-	vnode->status.version = ntohl(*bp++);
-	vnode->status.author = ntohl(*bp++);
-	vnode->status.owner = ntohl(*bp++);
-	vnode->status.caller_access = ntohl(*bp++);
-	vnode->status.anon_access = ntohl(*bp++);
-	vnode->status.mode = ntohl(*bp++);
-	vnode->status.parent.vid = vnode->fid.vid;
-	vnode->status.parent.vnode = ntohl(*bp++);
-	vnode->status.parent.unique = ntohl(*bp++);
-	bp++; /* seg size */
-	vnode->status.mtime_client = ntohl(*bp++);
-	vnode->status.mtime_server = ntohl(*bp++);
-	bp++; /* group */
-	bp++; /* sync counter */
-	vnode->status.version |= ((unsigned long long) ntohl(*bp++)) << 32;
-	bp++; /* spare2 */
-	bp++; /* spare3 */
-	bp++; /* spare4 */
-
-	vnode->cb_version = ntohl(*bp++);
-	vnode->cb_expiry = ntohl(*bp++);
-	vnode->cb_type = ntohl(*bp++);
-
+
+/*
+ * deliver reply data to an FS.GiveUpCallBacks
+ */
+static int afs_deliver_fs_give_up_callbacks(struct afs_call *call,
+					    struct sk_buff *skb, bool last)
+{
+	_enter(",{%u},%d", skb->len, last);
+
+	if (skb->len > 0)
+		return -EBADMSG; /* shouldn't be any reply data */
+	return 0;
+}
+
-	if (volsync) {
-		volsync->creation = ntohl(*bp++);
-		bp++; /* spare2 */
-		bp++; /* spare3 */
-		bp++; /* spare4 */
-		bp++; /* spare5 */
-		bp++; /* spare6 */
-	}
-
-	/* success */
-	ret = 0;
-
- out_unwait:
-	set_current_state(TASK_RUNNING);
-	remove_wait_queue(&call->waitq, &myself);
-	rxrpc_put_call(call);
- out_put_conn:
-	afs_server_release_callslot(server, &callslot);
- out:
-	_leave("");
-	return ret;
-
- abort:
-	set_current_state(TASK_UNINTERRUPTIBLE);
-	rxrpc_call_abort(call, ret);
-	schedule();
-	goto out_unwait;
-} /* end afs_rxfs_fetch_file_status() */
-
-/*****************************************************************************/
-/*
- * fetch the contents of a file or directory
- */
-int afs_rxfs_fetch_file_data(struct afs_server *server,
-			     struct afs_vnode *vnode,
-			     struct afs_rxfs_fetch_descriptor *desc,
-			     struct afs_volsync *volsync)
-{
-	struct afs_server_callslot callslot;
-	struct rxrpc_call *call;
-	struct kvec piov[1];
-	size_t sent;
-	int ret;
-	__be32 *bp;
-
-	DECLARE_WAITQUEUE(myself, current);
-
-	_enter("%p,{fid={%u,%u,%u},sz=%Zu,of=%lu}",
-	       server,
-	       desc->fid.vid,
-	       desc->fid.vnode,
-	       desc->fid.unique,
-	       desc->size,
-	       desc->offset);
-
-	/* get hold of the fileserver connection */
-	ret = afs_server_request_callslot(server, &callslot);
-	if (ret < 0)
-		goto out;
-
-	/* create a call through that connection */
-	ret = rxrpc_create_call(callslot.conn, NULL, NULL, afs_rxfs_aemap, &call);
-	if (ret < 0) {
-		printk("kAFS: Unable to create call: %d\n", ret);
-		goto out_put_conn;
-	}
-	call->app_opcode = FSFETCHDATA;
-
-	/* we want to get event notifications from the call */
-	add_wait_queue(&call->waitq, &myself);
-
-	/* marshall the parameters */
-	bp = rxrpc_call_alloc_scratch(call, 24);
-	bp[0] = htonl(FSFETCHDATA);
-	bp[1] = htonl(desc->fid.vid);
-	bp[2] = htonl(desc->fid.vnode);
-	bp[3] = htonl(desc->fid.unique);
-	bp[4] = htonl(desc->offset);
-	bp[5] = htonl(desc->size);
-
-	piov[0].iov_len = 24;
-	piov[0].iov_base = bp;
-
-	/* send the parameters to the server */
-	ret = rxrpc_call_write_data(call, 1, piov, RXRPC_LAST_PACKET, GFP_NOFS,
-				    0, &sent);
-	if (ret < 0)
-		goto abort;
-
-	/* wait for the data count to arrive */
-	ret = rxrpc_call_read_data(call, bp, 4, RXRPC_CALL_READ_BLOCK);
-	if (ret < 0)
-		goto read_failed;
-
-	desc->actual = ntohl(bp[0]);
-	if (desc->actual != desc->size) {
-		ret = -EBADMSG;
-		goto abort;
-	}
-
-	/* call the app to read the actual data */
-	rxrpc_call_reset_scratch(call);
-
-	ret = rxrpc_call_read_data(call, desc->buffer, desc->actual,
-				   RXRPC_CALL_READ_BLOCK);
-	if (ret < 0)
-		goto read_failed;
-
-	/* wait for the rest of the reply to completely arrive */
-	rxrpc_call_reset_scratch(call);
-	bp = rxrpc_call_alloc_scratch(call, 120);
-
-	ret = rxrpc_call_read_data(call, bp, 120,
-				   RXRPC_CALL_READ_BLOCK |
-				   RXRPC_CALL_READ_ALL);
-	if (ret < 0)
-		goto read_failed;
-
-	/* unmarshall the reply */
-	vnode->status.if_version = ntohl(*bp++);
-	vnode->status.type = ntohl(*bp++);
-	vnode->status.nlink = ntohl(*bp++);
-	vnode->status.size = ntohl(*bp++);
-	vnode->status.version = ntohl(*bp++);
-	vnode->status.author = ntohl(*bp++);
-	vnode->status.owner = ntohl(*bp++);
-	vnode->status.caller_access = ntohl(*bp++);
-	vnode->status.anon_access = ntohl(*bp++);
-	vnode->status.mode = ntohl(*bp++);
-	vnode->status.parent.vid = desc->fid.vid;
-	vnode->status.parent.vnode = ntohl(*bp++);
-	vnode->status.parent.unique = ntohl(*bp++);
-	bp++; /* seg size */
-	vnode->status.mtime_client = ntohl(*bp++);
-	vnode->status.mtime_server = ntohl(*bp++);
-	bp++; /* group */
-	bp++; /* sync counter */
-	vnode->status.version |= ((unsigned long long) ntohl(*bp++)) << 32;
-	bp++; /* spare2 */
-	bp++; /* spare3 */
-	bp++; /* spare4 */
-
-	vnode->cb_version = ntohl(*bp++);
-	vnode->cb_expiry = ntohl(*bp++);
+
+/*
+ * FS.GiveUpCallBacks operation type
+ */
+static const struct afs_call_type afs_RXFSGiveUpCallBacks = {
+	.name		= "FS.GiveUpCallBacks",
+	.deliver	= afs_deliver_fs_give_up_callbacks,
+	.abort_to_error	= afs_abort_to_error,
+	.destructor	= afs_flat_call_destructor,
+};
+
+/*
+ * give up a set of callbacks
+ * - the callbacks are held in the server->cb_break ring
+ */
+int afs_fs_give_up_callbacks(struct afs_server *server,
+			     const struct afs_wait_mode *wait_mode)
+{
+	struct afs_call *call;
+	size_t ncallbacks;
+	__be32 *bp, *tp;
+	int loop;
+
+	ncallbacks = CIRC_CNT(server->cb_break_head, server->cb_break_tail,
+			      ARRAY_SIZE(server->cb_break));
+
+	_enter("{%zu},", ncallbacks);
+
+	if (ncallbacks == 0)
+		return 0;
+	if (ncallbacks > AFSCBMAX)
+		ncallbacks = AFSCBMAX;
+
+	_debug("break %zu callbacks", ncallbacks);
+
+	call = afs_alloc_flat_call(&afs_RXFSGiveUpCallBacks,
+				   12 + ncallbacks * 6 * 4, 0);
+	if (!call)
+		return -ENOMEM;
+
+	call->service_id = FS_SERVICE;
+	call->port = htons(AFS_FS_PORT);
+
+	/* marshall the parameters */
+	bp = call->request;
+	tp = bp + 2 + ncallbacks * 3;
+	*bp++ = htonl(FSGIVEUPCALLBACKS);
+	*bp++ = htonl(ncallbacks);
+	*tp++ = htonl(ncallbacks);
+
+	atomic_sub(ncallbacks, &server->cb_break_n);
+	for (loop = ncallbacks; loop > 0; loop--) {
+		struct afs_callback *cb =
+			&server->cb_break[server->cb_break_tail];
+
+		*bp++ = htonl(cb->fid.vid);
+		*bp++ = htonl(cb->fid.vnode);
+		*bp++ = htonl(cb->fid.unique);
+		*tp++ = htonl(cb->version);
+		*tp++ = htonl(cb->expiry);
+		*tp++ = htonl(cb->type);
+		smp_mb();
+		server->cb_break_tail =
+			(server->cb_break_tail + 1) &
+			(ARRAY_SIZE(server->cb_break) - 1);
+	}
+
+	ASSERT(ncallbacks > 0);
+	wake_up_nr(&server->cb_break_waitq, ncallbacks);
+
+	return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
+}
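The cb_break ring drained here is a power-of-two circular buffer counted and masked exactly as above. The following standalone userspace sketch models the consumer side; the CIRC_CNT macro is redefined locally with the same formula as the kernel's <linux/circ_buf.h>, and the batch cap plays the role AFSCBMAX plays in the call.

#include <stdio.h>

#define RING_SIZE 64	/* must be a power of two for the mask to work */
#define CIRC_CNT(head, tail, size) (((head) - (tail)) & ((size) - 1))

struct ring {
	int slots[RING_SIZE];
	unsigned head;		/* producer writes here */
	unsigned tail;		/* consumer reads here */
};

/* drain up to "max" entries, advancing the tail with a mask, not a modulo */
static unsigned consume_batch(struct ring *r, unsigned max)
{
	unsigned n = CIRC_CNT(r->head, r->tail, RING_SIZE);

	if (n > max)
		n = max;	/* cap the batch, as AFSCBMAX caps one RPC */
	for (unsigned i = 0; i < n; i++) {
		printf("giving up callback %d\n",
		       r->slots[r->tail & (RING_SIZE - 1)]);
		r->tail = (r->tail + 1) & (RING_SIZE - 1);
	}
	return n;
}

int main(void)
{
	struct ring r = { .head = 0, .tail = 0 };

	for (int i = 0; i < 5; i++)
		r.slots[r.head++ & (RING_SIZE - 1)] = i;
	r.head &= RING_SIZE - 1;	/* keep the index masked, like the u8 ring heads */
	consume_batch(&r, 50);
	return 0;
}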
-	vnode->cb_type = ntohl(*bp++);
-
-	if (volsync) {
-		volsync->creation = ntohl(*bp++);
-		bp++; /* spare2 */
-		bp++; /* spare3 */
-		bp++; /* spare4 */
-		bp++; /* spare5 */
-		bp++; /* spare6 */
-	}
-
-	/* success */
-	ret = 0;
-
- out_unwait:
-	set_current_state(TASK_RUNNING);
-	remove_wait_queue(&call->waitq,&myself);
-	rxrpc_put_call(call);
- out_put_conn:
-	afs_server_release_callslot(server, &callslot);
- out:
-	_leave(" = %d", ret);
-	return ret;
-
- read_failed:
-	if (ret == -ECONNABORTED) {
-		ret = call->app_errno;
-		goto out_unwait;
-	}
-
- abort:
-	set_current_state(TASK_UNINTERRUPTIBLE);
-	rxrpc_call_abort(call, ret);
-	schedule();
-	goto out_unwait;
-
-} /* end afs_rxfs_fetch_file_data() */
-
+
+/*
+ * deliver reply data to an FS.CreateFile or an FS.MakeDir
+ */
+static int afs_deliver_fs_create_vnode(struct afs_call *call,
+				       struct sk_buff *skb, bool last)
+{
+	struct afs_vnode *vnode = call->reply;
+	const __be32 *bp;
+
+	_enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
+
+	afs_transfer_reply(call, skb);
+	if (!last)
+		return 0;
+
+	if (call->reply_size != call->reply_max)
+		return -EBADMSG;
+
+	/* unmarshall the reply once we've received all of it */
+	bp = call->buffer;
+	xdr_decode_AFSFid(&bp, call->reply2);
+	xdr_decode_AFSFetchStatus(&bp, call->reply3, NULL);
+	xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode);
+	xdr_decode_AFSCallBack_raw(&bp, call->reply4);
+	/* xdr_decode_AFSVolSync(&bp, call->replyX); */
+
+	_leave(" = 0 [done]");
+	return 0;
+}
+
+/*
+ * FS.CreateFile and FS.MakeDir operation type
+ */
+static const struct afs_call_type afs_RXFSCreateXXXX = {
+	.name		= "FS.CreateXXXX",
+	.deliver	= afs_deliver_fs_create_vnode,
+	.abort_to_error	= afs_abort_to_error,
+	.destructor	= afs_flat_call_destructor,
+};
+
-/*****************************************************************************/
-/*
- * ask the AFS fileserver to discard a callback request on a file
- */
-int afs_rxfs_give_up_callback(struct afs_server *server,
-			      struct afs_vnode *vnode)
-{
-	struct afs_server_callslot callslot;
-	struct rxrpc_call *call;
-	struct kvec piov[1];
-	size_t sent;
-	int ret;
-	__be32 *bp;
-
-	DECLARE_WAITQUEUE(myself, current);
-
-	_enter("%p,{%u,%u,%u}",
-	       server, vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique);
-
-	/* get hold of the fileserver connection */
-	ret = afs_server_request_callslot(server, &callslot);
-	if (ret < 0)
-		goto out;
-
-	/* create a call through that connection */
-	ret = rxrpc_create_call(callslot.conn, NULL, NULL, afs_rxfs_aemap, &call);
-	if (ret < 0) {
-		printk("kAFS: Unable to create call: %d\n", ret);
-		goto out_put_conn;
-	}
-	call->app_opcode = FSGIVEUPCALLBACKS;
-
-	/* we want to get event notifications from the call */
-	add_wait_queue(&call->waitq, &myself);
-
+/*
+ * create a file or make a directory
+ */
+int afs_fs_create(struct afs_server *server,
+		  struct key *key,
+		  struct afs_vnode *vnode,
+		  const char *name,
+		  umode_t mode,
+		  struct afs_fid *newfid,
+		  struct afs_file_status *newstatus,
+		  struct afs_callback *newcb,
+		  const struct afs_wait_mode *wait_mode)
+{
+	struct afs_call *call;
+	size_t namesz, reqsz, padsz;
+	__be32 *bp;
+
+	_enter("");
+
+	namesz = strlen(name);
+	padsz = (4 - (namesz & 3)) & 3;
+	reqsz = (5 * 4) + namesz + padsz + (6 * 4);
+
+	call = afs_alloc_flat_call(&afs_RXFSCreateXXXX, reqsz,
+				   (3 + 21 + 21 + 3 + 6) * 4);
+	if (!call)
+		return -ENOMEM;
+
+	call->key = key;
+	call->reply = vnode;
+	call->reply2 = newfid;
+	call->reply3 = newstatus;
+	call->reply4 = newcb;
+	call->service_id = FS_SERVICE;
+	call->port = htons(AFS_FS_PORT);
+
+	/* marshall the parameters */
+	bp = call->request;
+	*bp++ = htonl(S_ISDIR(mode) ? FSMAKEDIR : FSCREATEFILE);
+	*bp++ = htonl(vnode->fid.vid);
+	*bp++ = htonl(vnode->fid.vnode);
+	*bp++ = htonl(vnode->fid.unique);
+	*bp++ = htonl(namesz);
+	memcpy(bp, name, namesz);
+	bp = (void *) bp + namesz;
+	if (padsz > 0) {
+		memset(bp, 0, padsz);
+		bp = (void *) bp + padsz;
+	}
+	*bp++ = htonl(AFS_SET_MODE);
+	*bp++ = 0; /* mtime */
+	*bp++ = 0; /* owner */
+	*bp++ = 0; /* group */
+	*bp++ = htonl(mode & S_IALLUGO); /* unix mode */
+	*bp++ = 0; /* segment size */
+
+	return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
+}
+
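Every name put on the wire above is padded with NUL bytes to the next 4-byte boundary, which is why each marshaller computes padsz = (4 - (namesz & 3)) & 3 before sizing the request. A standalone check of that arithmetic:

#include <stdio.h>
#include <string.h>

/* bytes of NUL padding needed to round a name up to a 4-byte boundary */
static size_t xdr_padsz(size_t namesz)
{
	return (4 - (namesz & 3)) & 3;
}

int main(void)
{
	const char *names[] = { "a", "ab", "abc", "abcd" };

	for (int i = 0; i < 4; i++) {
		size_t n = strlen(names[i]);
		/* "a" needs 3 pad bytes; "abcd" needs none */
		printf("%-4s namesz=%zu padsz=%zu\n",
		       names[i], n, xdr_padsz(n));
	}
	return 0;
}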
-	/* marshall the parameters */
-	bp = rxrpc_call_alloc_scratch(call, (1 + 4 + 4) * 4);
-
-	piov[0].iov_len = (1 + 4 + 4) * 4;
-	piov[0].iov_base = bp;
-
-	*bp++ = htonl(FSGIVEUPCALLBACKS);
-	*bp++ = htonl(1);
+/*
+ * deliver reply data to an FS.RemoveFile or FS.RemoveDir
+ */
+static int afs_deliver_fs_remove(struct afs_call *call,
+				 struct sk_buff *skb, bool last)
+{
+	struct afs_vnode *vnode = call->reply;
+	const __be32 *bp;
+
+	_enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
+
+	afs_transfer_reply(call, skb);
+	if (!last)
+		return 0;
+
+	if (call->reply_size != call->reply_max)
+		return -EBADMSG;
+
+	/* unmarshall the reply once we've received all of it */
+	bp = call->buffer;
+	xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode);
+	/* xdr_decode_AFSVolSync(&bp, call->replyX); */
+
+	_leave(" = 0 [done]");
+	return 0;
+}
+
+/*
+ * FS.RemoveDir/FS.RemoveFile operation type
+ */
+static const struct afs_call_type afs_RXFSRemoveXXXX = {
+	.name		= "FS.RemoveXXXX",
+	.deliver	= afs_deliver_fs_remove,
+	.abort_to_error	= afs_abort_to_error,
+	.destructor	= afs_flat_call_destructor,
+};
+
-	*bp++ = htonl(vnode->fid.vid);
-	*bp++ = htonl(vnode->fid.vnode);
-	*bp++ = htonl(vnode->fid.unique);
-	*bp++ = htonl(1);
-	*bp++ = htonl(vnode->cb_version);
-	*bp++ = htonl(vnode->cb_expiry);
-	*bp++ = htonl(vnode->cb_type);
-
-	/* send the parameters to the server */
-	ret = rxrpc_call_write_data(call, 1, piov, RXRPC_LAST_PACKET, GFP_NOFS,
-				    0, &sent);
-	if (ret < 0)
-		goto abort;
-
-	/* wait for the reply to completely arrive */
-	for (;;) {
-		set_current_state(TASK_INTERRUPTIBLE);
-		if (call->app_call_state != RXRPC_CSTATE_CLNT_RCV_REPLY ||
-		    signal_pending(current))
-			break;
-		schedule();
-	}
-	set_current_state(TASK_RUNNING);
-
-	ret = -EINTR;
-	if (signal_pending(current))
-		goto abort;
-
+/*
+ * remove a file or directory
+ */
+int afs_fs_remove(struct afs_server *server,
+		  struct key *key,
+		  struct afs_vnode *vnode,
+		  const char *name,
+		  bool isdir,
+		  const struct afs_wait_mode *wait_mode)
+{
+	struct afs_call *call;
+	size_t namesz, reqsz, padsz;
+	__be32 *bp;
+
+	_enter("");
+
+	namesz = strlen(name);
+	padsz = (4 - (namesz & 3)) & 3;
+	reqsz = (5 * 4) + namesz + padsz;
+
+	call = afs_alloc_flat_call(&afs_RXFSRemoveXXXX, reqsz, (21 + 6) * 4);
+	if (!call)
+		return -ENOMEM;
+
+	call->key = key;
+	call->reply = vnode;
+	call->service_id = FS_SERVICE;
+	call->port = htons(AFS_FS_PORT);
+
+	/* marshall the parameters */
+	bp = call->request;
+	*bp++ = htonl(isdir ? FSREMOVEDIR : FSREMOVEFILE);
+	*bp++ = htonl(vnode->fid.vid);
+	*bp++ = htonl(vnode->fid.vnode);
+	*bp++ = htonl(vnode->fid.unique);
+	*bp++ = htonl(namesz);
+	memcpy(bp, name, namesz);
+	bp = (void *) bp + namesz;
+	if (padsz > 0) {
+		memset(bp, 0, padsz);
+		bp = (void *) bp + padsz;
+	}
+
+	return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
+}
+
-	switch (call->app_call_state) {
-	case RXRPC_CSTATE_ERROR:
-		ret = call->app_errno;
-		goto out_unwait;
-
-	case RXRPC_CSTATE_CLNT_GOT_REPLY:
-		ret = 0;
-		goto out_unwait;
-
-	default:
-		BUG();
-	}
-
+/*
+ * deliver reply data to an FS.Link
+ */
+static int afs_deliver_fs_link(struct afs_call *call,
+			       struct sk_buff *skb, bool last)
+{
+	struct afs_vnode *dvnode = call->reply, *vnode = call->reply2;
+	const __be32 *bp;
+
+	_enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
+
+	afs_transfer_reply(call, skb);
+	if (!last)
+		return 0;
+
+	if (call->reply_size != call->reply_max)
+		return -EBADMSG;
+
+	/* unmarshall the reply once we've received all of it */
+	bp = call->buffer;
+	xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode);
+	xdr_decode_AFSFetchStatus(&bp, &dvnode->status, dvnode);
+	/* xdr_decode_AFSVolSync(&bp, call->replyX); */
+
+	_leave(" = 0 [done]");
+	return 0;
+}
+
+/*
+ * FS.Link operation type
+ */
+static const struct afs_call_type afs_RXFSLink = {
+	.name		= "FS.Link",
+	.deliver	= afs_deliver_fs_link,
+	.abort_to_error	= afs_abort_to_error,
+	.destructor	= afs_flat_call_destructor,
+};
+
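All of the xdr_decode_*() helpers called from these deliver routines share one style: a cursor pointer walks the reply buffer one big-endian word at a time and is handed back to the caller. A standalone sketch of that pointer-walk, decoding the three-word AFSFid layout used throughout this file (invented helper name, local buffer):

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

struct fid { uint32_t vid, vnode, unique; };

/* decode one FID and advance the caller's cursor past it */
static void xdr_decode_fid(const uint32_t **_bp, struct fid *fid)
{
	const uint32_t *bp = *_bp;

	fid->vid = ntohl(*bp++);
	fid->vnode = ntohl(*bp++);
	fid->unique = ntohl(*bp++);
	*_bp = bp;	/* next decoder continues from here */
}

int main(void)
{
	uint32_t reply[3] = { htonl(7), htonl(42), htonl(1) };
	const uint32_t *bp = reply;
	struct fid fid;

	xdr_decode_fid(&bp, &fid);
	printf("fid %u:%u.%u\n", fid.vid, fid.vnode, fid.unique);
	return 0;
}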
- out_unwait:
-	set_current_state(TASK_RUNNING);
-	remove_wait_queue(&call->waitq, &myself);
-	rxrpc_put_call(call);
- out_put_conn:
-	afs_server_release_callslot(server, &callslot);
- out:
-	_leave("");
-	return ret;
-
- abort:
-	set_current_state(TASK_UNINTERRUPTIBLE);
-	rxrpc_call_abort(call, ret);
-	schedule();
-	goto out_unwait;
-} /* end afs_rxfs_give_up_callback() */
-
-/*****************************************************************************/
-/*
- * look a filename up in a directory
- * - this operation doesn't seem to work correctly in OpenAFS server 1.2.2
- */
-#if 0
-int afs_rxfs_lookup(struct afs_server *server,
-		    struct afs_vnode *dir,
-		    const char *filename,
-		    struct afs_vnode *vnode,
-		    struct afs_volsync *volsync)
-{
-	struct rxrpc_connection *conn;
-	struct rxrpc_call *call;
-	struct kvec piov[3];
-	size_t sent;
-	int ret;
-	u32 *bp, zero;
-
-	DECLARE_WAITQUEUE(myself, current);
-
-	kenter("%p,{%u,%u,%u},%s",
-	       server, fid->vid, fid->vnode, fid->unique, filename);
-
-	/* get hold of the fileserver connection */
-	ret = afs_server_get_fsconn(server, &conn);
-	if (ret < 0)
-		goto out;
-
-	/* create a call through that connection */
-	ret = rxrpc_create_call(conn, NULL, NULL, afs_rxfs_aemap, &call);
-	if (ret < 0) {
-		printk("kAFS: Unable to create call: %d\n", ret);
-		goto out_put_conn;
-	}
-	call->app_opcode = FSLOOKUP;
+/*
+ * make a hard link
+ */
+int afs_fs_link(struct afs_server *server,
+		struct key *key,
+		struct afs_vnode *dvnode,
+		struct afs_vnode *vnode,
+		const char *name,
+		const struct afs_wait_mode *wait_mode)
+{
+	struct afs_call *call;
+	size_t namesz, reqsz, padsz;
+	__be32 *bp;
+
+	_enter("");
+
+	namesz = strlen(name);
+	padsz = (4 - (namesz & 3)) & 3;
+	reqsz = (5 * 4) + namesz + padsz + (3 * 4);
+
+	call = afs_alloc_flat_call(&afs_RXFSLink, reqsz, (21 + 21 + 6) * 4);
+	if (!call)
+		return -ENOMEM;
+
+	call->key = key;
+	call->reply = dvnode;
+	call->reply2 = vnode;
+	call->service_id = FS_SERVICE;
+	call->port = htons(AFS_FS_PORT);
+
+	/* marshall the parameters */
+	bp = call->request;
+	*bp++ = htonl(FSLINK);
+	*bp++ = htonl(dvnode->fid.vid);
+	*bp++ = htonl(dvnode->fid.vnode);
+	*bp++ = htonl(dvnode->fid.unique);
+	*bp++ = htonl(namesz);
+	memcpy(bp, name, namesz);
+	bp = (void *) bp + namesz;
+	if (padsz > 0) {
+		memset(bp, 0, padsz);
+		bp = (void *) bp + padsz;
+	}
+	*bp++ = htonl(vnode->fid.vid);
+	*bp++ = htonl(vnode->fid.vnode);
+	*bp++ = htonl(vnode->fid.unique);
+
+	return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
+}
+
-
-	/* we want to get event notifications from the call */
-	add_wait_queue(&call->waitq,&myself);
-
-	/* marshall the parameters */
-	bp = rxrpc_call_alloc_scratch(call, 20);
-
-	zero = 0;
-
-	piov[0].iov_len = 20;
-	piov[0].iov_base = bp;
-	piov[1].iov_len = strlen(filename);
-	piov[1].iov_base = (char *) filename;
-	piov[2].iov_len = (4 - (piov[1].iov_len & 3)) & 3;
-	piov[2].iov_base = &zero;
-
-	*bp++ = htonl(FSLOOKUP);
-	*bp++ = htonl(dirfid->vid);
-	*bp++ = htonl(dirfid->vnode);
-	*bp++ = htonl(dirfid->unique);
-	*bp++ = htonl(piov[1].iov_len);
-
-	/* send the parameters to the server */
-	ret = rxrpc_call_write_data(call, 3, piov, RXRPC_LAST_PACKET, GFP_NOFS,
-				    0, &sent);
-	if (ret < 0)
-		goto abort;
-
-	/* wait for the reply to completely arrive */
-	bp = rxrpc_call_alloc_scratch(call, 220);
-
-	ret = rxrpc_call_read_data(call, bp, 220,
-				   RXRPC_CALL_READ_BLOCK |
-				   RXRPC_CALL_READ_ALL);
-	if (ret < 0) {
-		if (ret == -ECONNABORTED) {
-			ret = call->app_errno;
-			goto out_unwait;
-		}
-		goto abort;
-	}
-
-	/* unmarshall the reply */
-	fid->vid = ntohl(*bp++);
-	fid->vnode = ntohl(*bp++);
-	fid->unique = ntohl(*bp++);
-
+/*
+ * deliver reply data to an FS.Symlink
+ */
+static int afs_deliver_fs_symlink(struct afs_call *call,
+				  struct sk_buff *skb, bool last)
+{
+	struct afs_vnode *vnode = call->reply;
+	const __be32 *bp;
+
+	_enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
+
+	afs_transfer_reply(call, skb);
+	if (!last)
+		return 0;
+
+	if (call->reply_size != call->reply_max)
+		return -EBADMSG;
+
+	/* unmarshall the reply once we've received all of it */
+	bp = call->buffer;
+	xdr_decode_AFSFid(&bp, call->reply2);
+	xdr_decode_AFSFetchStatus(&bp, call->reply3, NULL);
+	xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode);
+	/* xdr_decode_AFSVolSync(&bp, call->replyX); */
+
+	_leave(" = 0 [done]");
+	return 0;
+}
+
+/*
+ * FS.Symlink operation type
+ */
+static const struct afs_call_type afs_RXFSSymlink = {
+	.name		= "FS.Symlink",
+	.deliver	= afs_deliver_fs_symlink,
+	.abort_to_error	= afs_abort_to_error,
+	.destructor	= afs_flat_call_destructor,
+};
+
+/*
+ * create a symbolic link
+ */
+int afs_fs_symlink(struct afs_server *server,
+		   struct key *key,
+		   struct afs_vnode *vnode,
+		   const char *name,
+		   const char *contents,
+		   struct afs_fid *newfid,
+		   struct afs_file_status *newstatus,
+		   const struct afs_wait_mode *wait_mode)
+{
+	struct afs_call *call;
+	size_t namesz, reqsz, padsz, c_namesz, c_padsz;
+	__be32 *bp;
+
+	_enter("");
+
+	namesz = strlen(name);
+	padsz = (4 - (namesz & 3)) & 3;
+
+	c_namesz = strlen(contents);
+	c_padsz = (4 - (c_namesz & 3)) & 3;
+
+	reqsz = (6 * 4) + namesz + padsz + c_namesz + c_padsz + (6 * 4);
+
+	call = afs_alloc_flat_call(&afs_RXFSSymlink, reqsz,
+				   (3 + 21 + 21 + 6) * 4);
+	if (!call)
+		return -ENOMEM;
+
+	call->key = key;
+	call->reply = vnode;
+	call->reply2 = newfid;
+	call->reply3 = newstatus;
+	call->service_id = FS_SERVICE;
+	call->port = htons(AFS_FS_PORT);
+
+	/* marshall the parameters */
+	bp = call->request;
+	*bp++ = htonl(FSSYMLINK);
+	*bp++ = htonl(vnode->fid.vid);
+	*bp++ = htonl(vnode->fid.vnode);
+	*bp++ = htonl(vnode->fid.unique);
+	*bp++ = htonl(namesz);
+	memcpy(bp, name, namesz);
+	bp = (void *) bp + namesz;
+	if (padsz > 0) {
+		memset(bp, 0, padsz);
+		bp = (void *) bp + padsz;
+	}
+	*bp++ = htonl(c_namesz);
+	memcpy(bp, contents, c_namesz);
+	bp = (void *) bp + c_namesz;
+	if (c_padsz > 0) {
+		memset(bp, 0, c_padsz);
+		bp = (void *) bp + c_padsz;
+	}
+	*bp++ = htonl(AFS_SET_MODE);
+	*bp++ = 0; /* mtime */
+	*bp++ = 0; /* owner */
+	*bp++ = 0; /* group */
+	*bp++ = htonl(S_IRWXUGO); /* unix mode */
+	*bp++ = 0; /* segment size */
+
+	return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
+}
+
-	vnode->status.if_version = ntohl(*bp++);
-	vnode->status.type = ntohl(*bp++);
-	vnode->status.nlink = ntohl(*bp++);
-	vnode->status.size = ntohl(*bp++);
-	vnode->status.version = ntohl(*bp++);
-	vnode->status.author = ntohl(*bp++);
-	vnode->status.owner = ntohl(*bp++);
-	vnode->status.caller_access = ntohl(*bp++);
-	vnode->status.anon_access = ntohl(*bp++);
-	vnode->status.mode = ntohl(*bp++);
-	vnode->status.parent.vid = dirfid->vid;
-	vnode->status.parent.vnode = ntohl(*bp++);
-	vnode->status.parent.unique = ntohl(*bp++);
-	bp++; /* seg size */
-	vnode->status.mtime_client = ntohl(*bp++);
-	vnode->status.mtime_server = ntohl(*bp++);
-	bp++; /* group */
-	bp++; /* sync counter */
-	vnode->status.version |= ((unsigned long long) ntohl(*bp++)) << 32;
-	bp++; /* spare2 */
-	bp++; /* spare3 */
-	bp++; /* spare4 */
-
-	dir->status.if_version = ntohl(*bp++);
-	dir->status.type = ntohl(*bp++);
-	dir->status.nlink = ntohl(*bp++);
-	dir->status.size = ntohl(*bp++);
-	dir->status.version = ntohl(*bp++);
-	dir->status.author = ntohl(*bp++);
-	dir->status.owner = ntohl(*bp++);
-	dir->status.caller_access = ntohl(*bp++);
-	dir->status.anon_access = ntohl(*bp++);
-	dir->status.mode = ntohl(*bp++);
-	dir->status.parent.vid = dirfid->vid;
-	dir->status.parent.vnode = ntohl(*bp++);
-	dir->status.parent.unique = ntohl(*bp++);
-	bp++; /* seg size */
-	dir->status.mtime_client = ntohl(*bp++);
-	dir->status.mtime_server = ntohl(*bp++);
-	bp++; /* group */
-	bp++; /* sync counter */
-	dir->status.version |= ((unsigned long long) ntohl(*bp++)) << 32;
-	bp++; /* spare2 */
-	bp++; /* spare3 */
-	bp++; /* spare4 */
-
-	callback->fid = *fid;
-	callback->version = ntohl(*bp++);
-	callback->expiry = ntohl(*bp++);
-	callback->type = ntohl(*bp++);
-
-	if (volsync) {
-		volsync->creation = ntohl(*bp++);
-		bp++; /* spare2 */
-		bp++; /* spare3 */
-		bp++; /* spare4 */
-		bp++; /* spare5 */
-		bp++; /* spare6 */
-	}
-
-	/* success */
-	ret = 0;
-
- out_unwait:
-	set_current_state(TASK_RUNNING);
-	remove_wait_queue(&call->waitq, &myself);
-	rxrpc_put_call(call);
- out_put_conn:
-	afs_server_release_fsconn(server, conn);
- out:
-	kleave("");
-	return ret;
-
- abort:
-	set_current_state(TASK_UNINTERRUPTIBLE);
-	rxrpc_call_abort(call, ret);
-	schedule();
-	goto out_unwait;
-} /* end afs_rxfs_lookup() */
-#endif
+/*
+ * deliver reply data to an FS.Rename
+ */
+static int afs_deliver_fs_rename(struct afs_call *call,
+				 struct sk_buff *skb, bool last)
+{
+	struct afs_vnode *orig_dvnode = call->reply, *new_dvnode = call->reply2;
+	const __be32 *bp;
+
+	_enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
+
+	afs_transfer_reply(call, skb);
+	if (!last)
+		return 0;
+
+	if (call->reply_size != call->reply_max)
+		return -EBADMSG;
+
+	/* unmarshall the reply once we've received all of it */
+	bp = call->buffer;
+	xdr_decode_AFSFetchStatus(&bp, &orig_dvnode->status, orig_dvnode);
+	if (new_dvnode != orig_dvnode)
+		xdr_decode_AFSFetchStatus(&bp, &new_dvnode->status, new_dvnode);
+	/* xdr_decode_AFSVolSync(&bp, call->replyX); */
+
+	_leave(" = 0 [done]");
+	return 0;
+}
+
+/*
+ * FS.Rename operation type
+ */
+static const struct afs_call_type afs_RXFSRename = {
+	.name		= "FS.Rename",
+	.deliver	= afs_deliver_fs_rename,
+	.abort_to_error	= afs_abort_to_error,
+	.destructor	= afs_flat_call_destructor,
+};
+
+/*
+ * rename a file or directory
+ */
+int afs_fs_rename(struct afs_server *server,
+		  struct key *key,
+		  struct afs_vnode *orig_dvnode,
+		  const char *orig_name,
+		  struct afs_vnode *new_dvnode,
+		  const char *new_name,
+		  const struct afs_wait_mode *wait_mode)
+{
+	struct afs_call *call;
+	size_t reqsz, o_namesz, o_padsz, n_namesz, n_padsz;
+	__be32 *bp;
+
+	_enter("");
+
+	o_namesz = strlen(orig_name);
+	o_padsz = (4 - (o_namesz & 3)) & 3;
+
+	n_namesz = strlen(new_name);
+	n_padsz = (4 - (n_namesz & 3)) & 3;
+
+	reqsz = (4 * 4) +
+		4 + o_namesz + o_padsz +
+		(3 * 4) +
+		4 + n_namesz + n_padsz;
+
+	call = afs_alloc_flat_call(&afs_RXFSRename, reqsz, (21 + 21 + 6) * 4);
+	if (!call)
+		return -ENOMEM;
+
+	call->key = key;
+	call->reply = orig_dvnode;
+	call->reply2 = new_dvnode;
+	call->service_id = FS_SERVICE;
+	call->port = htons(AFS_FS_PORT);
+
+	/* marshall the parameters */
+	bp = call->request;
+	*bp++ = htonl(FSRENAME);
+	*bp++ = htonl(orig_dvnode->fid.vid);
+	*bp++ = htonl(orig_dvnode->fid.vnode);
+	*bp++ = htonl(orig_dvnode->fid.unique);
+	*bp++ = htonl(o_namesz);
+	memcpy(bp, orig_name, o_namesz);
+	bp = (void *) bp + o_namesz;
+	if (o_padsz > 0) {
+		memset(bp, 0, o_padsz);
+		bp = (void *) bp + o_padsz;
+	}
+
+	*bp++ = htonl(new_dvnode->fid.vid);
+	*bp++ = htonl(new_dvnode->fid.vnode);
+	*bp++ = htonl(new_dvnode->fid.unique);
+	*bp++ = htonl(n_namesz);
+	memcpy(bp, new_name, n_namesz);
+	bp = (void *) bp + n_namesz;
+	if (n_padsz > 0) {
+		memset(bp, 0, n_padsz);
+		bp = (void *) bp + n_padsz;
+	}
+
+	return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
+}
diff --git a/fs/afs/fsclient.h b/fs/afs/fsclient.h
deleted file mode 100644
index 8ba3e749ee3c..000000000000
--- a/fs/afs/fsclient.h
+++ /dev/null
@@ -1,54 +0,0 @@
-/* fsclient.h: AFS File Server client stub declarations
- *
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#ifndef _LINUX_AFS_FSCLIENT_H
-#define _LINUX_AFS_FSCLIENT_H
-
-#include "server.h"
-
-extern int afs_rxfs_get_volume_info(struct afs_server *server,
-				    const char *name,
-				    struct afs_volume_info *vinfo);
-
-extern int afs_rxfs_fetch_file_status(struct afs_server *server,
-				      struct afs_vnode *vnode,
-				      struct afs_volsync *volsync);
-
-struct afs_rxfs_fetch_descriptor {
-	struct afs_fid fid;	/* file ID to fetch */
-	size_t size;		/* total number of bytes to fetch */
-	off_t offset;		/* offset in file to start from */
-	void *buffer;		/* read buffer */
-	size_t actual;		/* actual size sent back by server */
-};
-
-extern int afs_rxfs_fetch_file_data(struct afs_server *server,
-				    struct afs_vnode *vnode,
-				    struct afs_rxfs_fetch_descriptor *desc,
-				    struct afs_volsync *volsync);
-
-extern int afs_rxfs_give_up_callback(struct afs_server *server,
-				     struct afs_vnode *vnode);
-
-/* this doesn't appear to work in OpenAFS server */
-extern int afs_rxfs_lookup(struct afs_server *server,
-			   struct afs_vnode *dir,
-			   const char *filename,
-			   struct afs_vnode *vnode,
-			   struct afs_volsync *volsync);
-
-/* this is apparently mis-implemented in OpenAFS server */
-extern int afs_rxfs_get_root_volume(struct afs_server *server,
-				    char *buf,
-				    size_t *buflen);
-
-
-#endif /* _LINUX_AFS_FSCLIENT_H */
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 9d9bca6c28b5..c184a4ee5995 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -19,9 +19,6 @@
 #include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
-#include "volume.h"
-#include "vnode.h"
-#include "super.h"
 #include "internal.h"
 
 struct afs_iget_data {
@@ -29,26 +26,25 @@ struct afs_iget_data {
 	struct afs_volume *volume;	/* volume on which resides */
 };
 
-/*****************************************************************************/
 /*
  * map the AFS file status to the inode member variables
  */
-static int afs_inode_map_status(struct afs_vnode *vnode)
+static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key)
 {
 	struct inode *inode = AFS_VNODE_TO_I(vnode);
 
-	_debug("FS: ft=%d lk=%d sz=%Zu ver=%Lu mod=%hu",
+	_debug("FS: ft=%d lk=%d sz=%llu ver=%Lu mod=%hu",
 	       vnode->status.type,
 	       vnode->status.nlink,
-	       vnode->status.size,
-	       vnode->status.version,
+	       (unsigned long long) vnode->status.size,
+	       vnode->status.data_version,
 	       vnode->status.mode);
 
 	switch (vnode->status.type) {
 	case AFS_FTYPE_FILE:
 		inode->i_mode	= S_IFREG | vnode->status.mode;
 		inode->i_op	= &afs_file_inode_operations;
-		inode->i_fop	= &generic_ro_fops;
+		inode->i_fop	= &afs_file_operations;
 		break;
 	case AFS_FTYPE_DIR:
 		inode->i_mode	= S_IFDIR | vnode->status.mode;
@@ -77,9 +73,9 @@ static int afs_inode_map_status(struct afs_vnode *vnode)
 
 	/* check to see whether a symbolic link is really a mountpoint */
 	if (vnode->status.type == AFS_FTYPE_SYMLINK) {
-		afs_mntpt_check_symlink(vnode);
+		afs_mntpt_check_symlink(vnode, key);
 
-		if (vnode->flags & AFS_VNODE_MOUNTPOINT) {
+		if (test_bit(AFS_VNODE_MOUNTPOINT, &vnode->flags)) {
 			inode->i_mode = S_IFDIR | vnode->status.mode;
 			inode->i_op = &afs_mntpt_inode_operations;
 			inode->i_fop = &afs_mntpt_file_operations;
@@ -87,30 +83,8 @@ static int afs_inode_map_status(struct afs_vnode *vnode)
 	}
 
 	return 0;
-} /* end afs_inode_map_status() */
+}
 
-/*****************************************************************************/
-/*
- * attempt to fetch the status of an inode, coelescing multiple simultaneous
- * fetches
- */
-static int afs_inode_fetch_status(struct inode *inode)
-{
-	struct afs_vnode *vnode;
-	int ret;
-
-	vnode = AFS_FS_I(inode);
-
-	ret = afs_vnode_fetch_status(vnode);
-
-	if (ret == 0)
-		ret = afs_inode_map_status(vnode);
-
-	return ret;
-
-} /* end afs_inode_fetch_status() */
-
-/*****************************************************************************/
 /*
  * iget5() comparator
  */
@@ -120,9 +94,8 @@ static int afs_iget5_test(struct inode *inode, void *opaque)
 
 	return inode->i_ino == data->fid.vnode &&
 		inode->i_version == data->fid.unique;
-} /* end afs_iget5_test() */
+}
 
-/*****************************************************************************/
 /*
  * iget5() inode initialiser
  */
@@ -137,14 +110,14 @@ static int afs_iget5_set(struct inode *inode, void *opaque)
 	vnode->volume = data->volume;
 
 	return 0;
-} /* end afs_iget5_set() */
+}
 
-/*****************************************************************************/
 /*
  * inode retrieval
  */
-inline int afs_iget(struct super_block *sb, struct afs_fid *fid,
-		    struct inode **_inode)
+struct inode *afs_iget(struct super_block *sb, struct key *key,
+		       struct afs_fid *fid, struct afs_file_status *status,
+		       struct afs_callback *cb)
 {
 	struct afs_iget_data data = { .fid = *fid };
 	struct afs_super_info *as;
@@ -161,20 +134,18 @@ inline int afs_iget(struct super_block *sb, struct afs_fid *fid,
 			 &data);
 	if (!inode) {
 		_leave(" = -ENOMEM");
-		return -ENOMEM;
+		return ERR_PTR(-ENOMEM);
 	}
 
+	_debug("GOT INODE %p { vl=%x vn=%x, u=%x }",
+	       inode, fid->vid, fid->vnode, fid->unique);
+
 	vnode = AFS_FS_I(inode);
 
 	/* deal with an existing inode */
 	if (!(inode->i_state & I_NEW)) {
-		ret = afs_vnode_fetch_status(vnode);
-		if (ret==0)
-			*_inode = inode;
-		else
-			iput(inode);
-		_leave(" = %d", ret);
-		return ret;
+		_leave(" = %p", inode);
+		return inode;
 	}
 
 #ifdef AFS_CACHING_SUPPORT
@@ -186,100 +157,185 @@ inline int afs_iget(struct super_block *sb, struct afs_fid *fid,
 			  &vnode->cache);
 #endif
 
-	/* okay... it's a new inode */
-	inode->i_flags |= S_NOATIME;
-	vnode->flags |= AFS_VNODE_CHANGED;
-	ret = afs_inode_fetch_status(inode);
-	if (ret<0)
+	if (!status) {
+		/* it's a remotely extant inode */
+		set_bit(AFS_VNODE_CB_BROKEN, &vnode->flags);
+		ret = afs_vnode_fetch_status(vnode, NULL, key);
+		if (ret < 0)
+			goto bad_inode;
+	} else {
+		/* it's an inode we just created */
+		memcpy(&vnode->status, status, sizeof(vnode->status));
+
+		if (!cb) {
+			/* it's a symlink we just created (the fileserver
+			 * didn't give us a callback) */
+			vnode->cb_version = 0;
+			vnode->cb_expiry = 0;
+			vnode->cb_type = 0;
+			vnode->cb_expires = get_seconds();
+		} else {
+			vnode->cb_version = cb->version;
+			vnode->cb_expiry = cb->expiry;
+			vnode->cb_type = cb->type;
+			vnode->cb_expires = vnode->cb_expiry + get_seconds();
+		}
+	}
+
+	ret = afs_inode_map_status(vnode, key);
+	if (ret < 0)
 		goto bad_inode;
 
 	/* success */
+	clear_bit(AFS_VNODE_UNSET, &vnode->flags);
+	inode->i_flags |= S_NOATIME;
 	unlock_new_inode(inode);
-
-	*_inode = inode;
-	_leave(" = 0 [CB { v=%u x=%lu t=%u }]",
-	       vnode->cb_version,
-	       vnode->cb_timeout.timo_jif,
-	       vnode->cb_type);
-	return 0;
+	_leave(" = %p [CB { v=%u t=%u }]", inode, vnode->cb_version, vnode->cb_type);
+	return inode;
 
 	/* failure */
- bad_inode:
+bad_inode:
 	make_bad_inode(inode);
 	unlock_new_inode(inode);
 	iput(inode);
 
 	_leave(" = %d [bad]", ret);
+	return ERR_PTR(ret);
+}
+
+/*
+ * validate a vnode/inode
+ * - there are several things we need to check
+ *   - parent dir data changes (rm, rmdir, rename, mkdir, create, link,
+ *     symlink)
+ *   - parent dir metadata changed (security changes)
+ *   - dentry data changed (write, truncate)
+ *   - dentry metadata changed (security changes)
+ */
+int afs_validate(struct afs_vnode *vnode, struct key *key)
+{
+	int ret;
+
+	_enter("{v={%x:%u} fl=%lx},%x",
+	       vnode->fid.vid, vnode->fid.vnode, vnode->flags,
+	       key_serial(key));
+
+	if (vnode->cb_promised &&
+	    !test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags) &&
+	    !test_bit(AFS_VNODE_MODIFIED, &vnode->flags) &&
+	    !test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags)) {
+		if (vnode->cb_expires < get_seconds() + 10) {
+			_debug("callback expired");
+			set_bit(AFS_VNODE_CB_BROKEN, &vnode->flags);
+		} else {
+			goto valid;
+		}
+	}
+
+	if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
+		goto valid;
+
+	mutex_lock(&vnode->validate_lock);
+
+	/* if the promise has expired, we need to check the server again to get
+	 * a new promise - note that if the (parent) directory's metadata was
+	 * changed then the security may be different and we may no longer have
+	 * access */
+	if (!vnode->cb_promised ||
+	    test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags)) {
+		_debug("not promised");
+		ret = afs_vnode_fetch_status(vnode, NULL, key);
+		if (ret < 0)
+			goto error_unlock;
+		_debug("new promise [fl=%lx]", vnode->flags);
+	}
+
+	if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) {
+		_debug("file already deleted");
+		ret = -ESTALE;
+		goto error_unlock;
+	}
+
+	/* if the vnode's data version number changed then its contents are
+	 * different */
+	if (test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags)) {
+		_debug("zap data {%x:%d}", vnode->fid.vid, vnode->fid.vnode);
+		invalidate_remote_inode(&vnode->vfs_inode);
+	}
+
+	clear_bit(AFS_VNODE_MODIFIED, &vnode->flags);
+	mutex_unlock(&vnode->validate_lock);
+valid:
+	_leave(" = 0");
+	return 0;
+
+error_unlock:
+	mutex_unlock(&vnode->validate_lock);
+	_leave(" = %d", ret);
 	return ret;
-} /* end afs_iget() */
+}
 
-/*****************************************************************************/
 /*
  * read the attributes of an inode
  */
 int afs_inode_getattr(struct vfsmount *mnt, struct dentry *dentry,
 		      struct kstat *stat)
 {
-	struct afs_vnode *vnode;
 	struct inode *inode;
-	int ret;
 
 	inode = dentry->d_inode;
 
 	_enter("{ ino=%lu v=%lu }", inode->i_ino, inode->i_version);
 
-	vnode = AFS_FS_I(inode);
-
-	ret = afs_inode_fetch_status(inode);
-	if (ret == -ENOENT) {
-		_leave(" = %d [%d %p]",
-		       ret, atomic_read(&dentry->d_count), dentry->d_inode);
-		return ret;
-	}
-	else if (ret < 0) {
-		make_bad_inode(inode);
-		_leave(" = %d", ret);
-		return ret;
-	}
-
-	/* transfer attributes from the inode structure to the stat
-	 * structure */
 	generic_fillattr(inode, stat);
-
-	_leave(" = 0 CB { v=%u x=%u t=%u }",
-	       vnode->cb_version,
-	       vnode->cb_expiry,
-	       vnode->cb_type);
-
 	return 0;
-} /* end afs_inode_getattr() */
+}
 
-/*****************************************************************************/
 /*
  * clear an AFS inode
  */
 void afs_clear_inode(struct inode *inode)
 {
+	struct afs_permits *permits;
 	struct afs_vnode *vnode;
 
 	vnode = AFS_FS_I(inode);
 
-	_enter("ino=%lu { vn=%08x v=%u x=%u t=%u }",
-	       inode->i_ino,
+	_enter("{%x:%d.%d} v=%u x=%u t=%u }",
+	       vnode->fid.vid,
 	       vnode->fid.vnode,
+	       vnode->fid.unique,
 	       vnode->cb_version,
 	       vnode->cb_expiry,
-	       vnode->cb_type
-	       );
+	       vnode->cb_type);
 
-	BUG_ON(inode->i_ino != vnode->fid.vnode);
+	_debug("CLEAR INODE %p", inode);
 
-	afs_vnode_give_up_callback(vnode);
+	ASSERTCMP(inode->i_ino, ==, vnode->fid.vnode);
+
+	afs_give_up_callback(vnode);
+
+	if (vnode->server) {
+		spin_lock(&vnode->server->fs_lock);
+		rb_erase(&vnode->server_rb, &vnode->server->fs_vnodes);
+		spin_unlock(&vnode->server->fs_lock);
+		afs_put_server(vnode->server);
+		vnode->server = NULL;
+	}
+
+	ASSERT(!vnode->cb_promised);
 
 #ifdef AFS_CACHING_SUPPORT
 	cachefs_relinquish_cookie(vnode->cache, 0);
 	vnode->cache = NULL;
 #endif
 
+	mutex_lock(&vnode->permits_lock);
+	permits = vnode->permits;
+	rcu_assign_pointer(vnode->permits, NULL);
+	mutex_unlock(&vnode->permits_lock);
+	if (permits)
+		call_rcu(&permits->rcu, afs_zap_permits);
+
 	_leave("");
-} /* end afs_clear_inode() */
+}
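The heart of the new afs_validate() above is a cheap decision: trust the cached status while an unbroken callback promise has not expired (with a 10-second safety margin), otherwise refetch from the server. A standalone userspace model of just that decision, with field names mirroring the vnode flags used in the hunk:

#include <stdbool.h>
#include <stdio.h>
#include <time.h>

struct vnode_model {
	bool cb_promised;	/* server promised to notify us of changes */
	bool cb_broken;		/* a callback break arrived */
	time_t cb_expires;	/* when the promise lapses */
};

/* true if the cached status can no longer be trusted */
static bool needs_fetch(const struct vnode_model *v, time_t now)
{
	if (!v->cb_promised || v->cb_broken)
		return true;
	return v->cb_expires < now + 10;	/* about to expire: refetch */
}

int main(void)
{
	time_t now = time(NULL);
	struct vnode_model v = {
		.cb_promised = true, .cb_broken = false,
		.cb_expires = now + 300,
	};

	printf("fetch? %d\n", needs_fetch(&v, now));	/* 0: still valid */
	v.cb_broken = true;
	printf("fetch? %d\n", needs_fetch(&v, now));	/* 1: promise broken */
	return 0;
}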
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 5151d5da2c2f..d90c158cd934 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -1,6 +1,6 @@
-/* internal.h: internal AFS stuff
+/* internal AFS stuff
  *
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
  *
  * This program is free software; you can redistribute it and/or
@@ -9,48 +9,390 @@
  * 2 of the License, or (at your option) any later version.
  */
 
-#ifndef AFS_INTERNAL_H
-#define AFS_INTERNAL_H
-
 #include <linux/compiler.h>
 #include <linux/kernel.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
+#include <linux/skbuff.h>
+#include <linux/rxrpc.h>
+#include <linux/key.h>
+#include "afs.h"
+#include "afs_vl.h"
+
+#define AFS_CELL_MAX_ADDRS 15
+
+struct afs_call;
+
+typedef enum {
+	AFS_VL_NEW,			/* new, uninitialised record */
+	AFS_VL_CREATING,		/* creating record */
+	AFS_VL_VALID,			/* record is pending */
+	AFS_VL_NO_VOLUME,		/* no such volume available */
+	AFS_VL_UPDATING,		/* update in progress */
+	AFS_VL_VOLUME_DELETED,		/* volume was deleted */
+	AFS_VL_UNCERTAIN,		/* uncertain state (update failed) */
+} __attribute__((packed)) afs_vlocation_state_t;
+
+struct afs_mount_params {
+	bool			rwpath;		/* T if the parent should be considered R/W */
+	bool			force;		/* T to force cell type */
+	afs_voltype_t		type;		/* type of volume requested */
+	int			volnamesz;	/* size of volume name */
+	const char		*volname;	/* name of volume to mount */
+	struct afs_cell		*cell;		/* cell in which to find volume */
+	struct afs_volume	*volume;	/* volume record */
+	struct key		*key;		/* key to use for secure mounting */
+};
 
 /*
- * debug tracing
+ * definition of how to wait for the completion of an operation
  */
-#define kenter(FMT, a...) printk("==> %s("FMT")\n",__FUNCTION__ , ## a)
-#define kleave(FMT, a...) printk("<== %s()"FMT"\n",__FUNCTION__ , ## a)
-#define kdebug(FMT, a...) printk(FMT"\n" , ## a)
-#define kproto(FMT, a...) printk("### "FMT"\n" , ## a)
-#define knet(FMT, a...) printk(FMT"\n" , ## a)
-
-#ifdef __KDEBUG
-#define _enter(FMT, a...) kenter(FMT , ## a)
-#define _leave(FMT, a...) kleave(FMT , ## a)
-#define _debug(FMT, a...) kdebug(FMT , ## a)
-#define _proto(FMT, a...) kproto(FMT , ## a)
-#define _net(FMT, a...) knet(FMT , ## a)
-#else
-#define _enter(FMT, a...) do { } while(0)
-#define _leave(FMT, a...) do { } while(0)
-#define _debug(FMT, a...) do { } while(0)
-#define _proto(FMT, a...) do { } while(0)
-#define _net(FMT, a...) do { } while(0)
-#endif
+struct afs_wait_mode {
+	/* RxRPC received message notification */
+	void (*rx_wakeup)(struct afs_call *call);
 
-static inline void afs_discard_my_signals(void)
-{
-	while (signal_pending(current)) {
-		siginfo_t sinfo;
+	/* synchronous call waiter and call dispatched notification */
+	int (*wait)(struct afs_call *call);
+
+	/* asynchronous call completion */
+	void (*async_complete)(void *reply, int error);
+};
+
+extern const struct afs_wait_mode afs_sync_call;
+extern const struct afs_wait_mode afs_async_call;
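The wait-mode struct above lets one call engine serve both blocking and asynchronous callers by swapping a table of function pointers. A standalone sketch of the idea follows; the names (wait_mode, make_call) and wiring are invented for illustration, not the kernel's.

#include <stdio.h>

struct wait_mode {
	const char *name;
	int (*wait)(int call_id);	/* how to wait for completion */
};

static int wait_sync(int call_id)
{
	printf("call %d: sleep until the reply is delivered\n", call_id);
	return 0;
}

static int wait_async(int call_id)
{
	printf("call %d: return now; completion runs from a work item\n",
	       call_id);
	return 0;
}

static const struct wait_mode sync_call = { "sync", wait_sync };
static const struct wait_mode async_call = { "async", wait_async };

/* one transmit path; the caller picks the completion style */
static int make_call(int call_id, const struct wait_mode *wm)
{
	/* ...marshal and transmit the request here... */
	return wm->wait(call_id);
}

int main(void)
{
	make_call(1, &sync_call);
	make_call(2, &async_call);
	return 0;
}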
 
-		spin_lock_irq(&current->sighand->siglock);
-		dequeue_signal(current,&current->blocked, &sinfo);
-		spin_unlock_irq(&current->sighand->siglock);
-	}
+/*
+ * a record of an in-progress RxRPC call
+ */
+struct afs_call {
+	const struct afs_call_type *type;	/* type of call */
+	const struct afs_wait_mode *wait_mode;	/* completion wait mode */
+	wait_queue_head_t	waitq;		/* processes awaiting completion */
+	struct work_struct	async_work;	/* asynchronous work processor */
+	struct work_struct	work;		/* actual work processor */
+	struct sk_buff_head	rx_queue;	/* received packets */
+	struct rxrpc_call	*rxcall;	/* RxRPC call handle */
+	struct key		*key;		/* security for this call */
+	struct afs_server	*server;	/* server affected by incoming CM call */
+	void			*request;	/* request data (first part) */
+	void			*request2;	/* request data (second part) */
+	void			*buffer;	/* reply receive buffer */
+	void			*reply;		/* reply buffer (first part) */
+	void			*reply2;	/* reply buffer (second part) */
+	void			*reply3;	/* reply buffer (third part) */
+	void			*reply4;	/* reply buffer (fourth part) */
+	enum {					/* call state */
+		AFS_CALL_REQUESTING,	/* request is being sent for outgoing call */
+		AFS_CALL_AWAIT_REPLY,	/* awaiting reply to outgoing call */
+		AFS_CALL_AWAIT_OP_ID,	/* awaiting op ID on incoming call */
+		AFS_CALL_AWAIT_REQUEST,	/* awaiting request data on incoming call */
+		AFS_CALL_REPLYING,	/* replying to incoming call */
+		AFS_CALL_AWAIT_ACK,	/* awaiting final ACK of incoming call */
+		AFS_CALL_COMPLETE,	/* successfully completed */
+		AFS_CALL_BUSY,		/* server was busy */
+		AFS_CALL_ABORTED,	/* call was aborted */
+		AFS_CALL_ERROR,		/* call failed due to error */
+	}			state;
+	int			error;		/* error code */
+	unsigned		request_size;	/* size of request data */
+	unsigned		reply_max;	/* maximum size of reply */
+	unsigned		reply_size;	/* current size of reply */
+	unsigned short		offset;		/* offset into received data store */
+	unsigned char		unmarshall;	/* unmarshalling phase */
+	bool			incoming;	/* T if incoming call */
+	u16			service_id;	/* RxRPC service ID to call */
+	__be16			port;		/* target UDP port */
+	__be32			operation_ID;	/* operation ID for an incoming call */
+	u32			count;		/* count for use in unmarshalling */
+	__be32			tmp;		/* place to extract temporary data */
+};
+
+struct afs_call_type {
+	const char *name;
+
+	/* deliver request or reply data to a call
+	 * - returning an error will cause the call to be aborted
+	 */
+	int (*deliver)(struct afs_call *call, struct sk_buff *skb,
+		       bool last);
+
+	/* map an abort code to an error number */
+	int (*abort_to_error)(u32 abort_code);
+
+	/* clean up a call */
+	void (*destructor)(struct afs_call *call);
+};
+
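This per-operation afs_call_type table is what all the FS.* operation types seen earlier instantiate: each RPC supplies its own deliver, abort-mapping and destructor hooks while the engine stays generic. A standalone sketch of the dispatch pattern, with invented names and a deliberately crude abort mapping:

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

struct call;

struct call_type {
	const char *name;
	int (*deliver)(struct call *call, const void *data, int last);
	int (*abort_to_error)(uint32_t abort_code);
};

struct call {
	const struct call_type *type;	/* selects the operation's hooks */
};

static int demo_deliver(struct call *call, const void *data, int last)
{
	printf("%s: got %s packet\n", call->type->name,
	       last ? "final" : "partial");
	return 0;
}

static int demo_abort_to_error(uint32_t abort_code)
{
	return abort_code ? -EREMOTEIO : 0;	/* crude stand-in mapping */
}

static const struct call_type demo_type = {
	.name		= "FS.Demo",
	.deliver	= demo_deliver,
	.abort_to_error	= demo_abort_to_error,
};

int main(void)
{
	struct call call = { .type = &demo_type };

	call.type->deliver(&call, "x", 1);	/* engine-style dispatch */
	return 0;
}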
126/*
127 * AFS superblock private data
128 * - there's one superblock per volume
129 */
130struct afs_super_info {
131 struct afs_volume *volume; /* volume record */
132 char rwparent; /* T if parent is R/W AFS volume */
133};
134
135static inline struct afs_super_info *AFS_FS_S(struct super_block *sb)
136{
137 return sb->s_fs_info;
52} 138}
53 139
140extern struct file_system_type afs_fs_type;
141
142/*
143 * entry in the cached cell catalogue
144 */
145struct afs_cache_cell {
146 char name[AFS_MAXCELLNAME]; /* cell name (padded with NULs) */
147 struct in_addr vl_servers[15]; /* cached cell VL servers */
148};
149
150/*
151 * AFS cell record
152 */
153struct afs_cell {
154 atomic_t usage;
155 struct list_head link; /* main cell list link */
156 struct key *anonymous_key; /* anonymous user key for this cell */
157 struct list_head proc_link; /* /proc cell list link */
158 struct proc_dir_entry *proc_dir; /* /proc dir for this cell */
159#ifdef AFS_CACHING_SUPPORT
160 struct cachefs_cookie *cache; /* caching cookie */
161#endif
162
163 /* server record management */
164 rwlock_t servers_lock; /* active server list lock */
165 struct list_head servers; /* active server list */
166
167 /* volume location record management */
168 struct rw_semaphore vl_sem; /* volume management serialisation semaphore */
169 struct list_head vl_list; /* cell's active VL record list */
170 spinlock_t vl_lock; /* vl_list lock */
171 unsigned short vl_naddrs; /* number of VL servers in addr list */
172 unsigned short vl_curr_svix; /* current server index */
173 struct in_addr vl_addrs[AFS_CELL_MAX_ADDRS]; /* cell VL server addresses */
174
175 char name[0]; /* cell name - must go last */
176};
177
178/*
179 * entry in the cached volume location catalogue
180 */
181struct afs_cache_vlocation {
182 /* volume name (lowercase, padded with NULs) */
183 uint8_t name[AFS_MAXVOLNAME + 1];
184
185 uint8_t nservers; /* number of entries used in servers[] */
186 uint8_t vidmask; /* voltype mask for vid[] */
187 uint8_t srvtmask[8]; /* voltype masks for servers[] */
188#define AFS_VOL_VTM_RW 0x01 /* R/W version of the volume is available (on this server) */
189#define AFS_VOL_VTM_RO 0x02 /* R/O version of the volume is available (on this server) */
190#define AFS_VOL_VTM_BAK 0x04 /* backup version of the volume is available (on this server) */
191
192 afs_volid_t vid[3]; /* volume IDs for R/W, R/O and Bak volumes */
193 struct in_addr servers[8]; /* fileserver addresses */
194 time_t rtime; /* last retrieval time */
195};
196
197/*
198 * volume -> vnode hash table entry
199 */
200struct afs_cache_vhash {
201 afs_voltype_t vtype; /* which volume variation */
202 uint8_t hash_bucket; /* which hash bucket this represents */
203} __attribute__((packed));
204
205/*
206 * AFS volume location record
207 */
208struct afs_vlocation {
209 atomic_t usage;
210 time_t time_of_death; /* time at which put reduced usage to 0 */
211 struct list_head link; /* link in cell volume location list */
212 struct list_head grave; /* link in master graveyard list */
213 struct list_head update; /* link in master update list */
214 struct afs_cell *cell; /* cell to which volume belongs */
215#ifdef AFS_CACHING_SUPPORT
216 struct cachefs_cookie *cache; /* caching cookie */
217#endif
218 struct afs_cache_vlocation vldb; /* volume information DB record */
219 struct afs_volume *vols[3]; /* volume access record pointer (index by type) */
220 wait_queue_head_t waitq; /* status change waitqueue */
221 time_t update_at; /* time at which record should be updated */
222 spinlock_t lock; /* access lock */
223 afs_vlocation_state_t state; /* volume location state */
224 unsigned short upd_rej_cnt; /* ENOMEDIUM count during update */
225 unsigned short upd_busy_cnt; /* EBUSY count during update */
226 bool valid; /* T if valid */
227};
228
229/*
230 * AFS fileserver record
231 */
232struct afs_server {
233 atomic_t usage;
234 time_t time_of_death; /* time at which put reduced usage to 0 */
235 struct in_addr addr; /* server address */
236 struct afs_cell *cell; /* cell in which server resides */
237 struct list_head link; /* link in cell's server list */
238 struct list_head grave; /* link in master graveyard list */
239 struct rb_node master_rb; /* link in master by-addr tree */
240 struct rw_semaphore sem; /* access lock */
241
242 /* file service access */
243 struct rb_root fs_vnodes; /* vnodes backed by this server (ordered by FID) */
244 unsigned long fs_act_jif; /* time at which last activity occurred */
245 unsigned long fs_dead_jif; /* time at which no longer to be considered dead */
246 spinlock_t fs_lock; /* access lock */
247 int fs_state; /* 0 or reason FS currently marked dead (-errno) */
248
249 /* callback promise management */
250 struct rb_root cb_promises; /* vnode expiration list (ordered earliest first) */
251 struct delayed_work cb_updater; /* callback updater */
252 struct delayed_work cb_break_work; /* collected break dispatcher */
253 wait_queue_head_t cb_break_waitq; /* space available in cb_break waitqueue */
254 spinlock_t cb_lock; /* access lock */
255 struct afs_callback cb_break[64]; /* ring of callbacks awaiting breaking */
256 atomic_t cb_break_n; /* number of pending breaks */
257 u8 cb_break_head; /* head of callback breaking ring */
258 u8 cb_break_tail; /* tail of callback breaking ring */
259};
260
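cb_break[] is a 64-slot ring indexed by free-running head/tail counters, with cb_break_n tracking occupancy; since the ring size is a power of two, the index arithmetic is a simple mask. A sketch of the producer side under cb_lock (the real enqueue lives in callback.c):

	spin_lock(&server->cb_lock);
	if (atomic_read(&server->cb_break_n) < ARRAY_SIZE(server->cb_break)) {
		server->cb_break[server->cb_break_head] = *cb;
		server->cb_break_head = (server->cb_break_head + 1) &
			(ARRAY_SIZE(server->cb_break) - 1);
		atomic_inc(&server->cb_break_n);
	}
	spin_unlock(&server->cb_lock);
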
261/*
262 * AFS volume access record
263 */
264struct afs_volume {
265 atomic_t usage;
266 struct afs_cell *cell; /* cell to which belongs (unrefd ptr) */
267 struct afs_vlocation *vlocation; /* volume location */
268#ifdef AFS_CACHING_SUPPORT
269 struct cachefs_cookie *cache; /* caching cookie */
270#endif
271 afs_volid_t vid; /* volume ID */
272 afs_voltype_t type; /* type of volume */
273 char type_force; /* force volume type (suppress R/O -> R/W) */
274 unsigned short nservers; /* number of server slots filled */
275 unsigned short rjservers; /* number of servers discarded due to -ENOMEDIUM */
276 struct afs_server *servers[8]; /* servers on which volume resides (ordered) */
277 struct rw_semaphore server_sem; /* lock for accessing current server */
278};
279
280/*
281 * vnode catalogue entry
282 */
283struct afs_cache_vnode {
284 afs_vnodeid_t vnode_id; /* vnode ID */
285 unsigned vnode_unique; /* vnode ID uniquifier */
286 afs_dataversion_t data_version; /* data version */
287};
288
289/*
290 * AFS inode private data
291 */
292struct afs_vnode {
293 struct inode vfs_inode; /* the VFS's inode record */
294
295 struct afs_volume *volume; /* volume on which vnode resides */
296 struct afs_server *server; /* server currently supplying this file */
297 struct afs_fid fid; /* the file identifier for this inode */
298 struct afs_file_status status; /* AFS status info for this file */
299#ifdef AFS_CACHING_SUPPORT
300 struct cachefs_cookie *cache; /* caching cookie */
301#endif
302 struct afs_permits *permits; /* cache of permits so far obtained */
303 struct mutex permits_lock; /* lock for altering permits list */
304 struct mutex validate_lock; /* lock for validating this vnode */
305 wait_queue_head_t update_waitq; /* status fetch waitqueue */
306 int update_cnt; /* number of outstanding ops that will update the
307 * status */
308 spinlock_t lock; /* waitqueue/flags lock */
309 unsigned long flags;
310#define AFS_VNODE_CB_BROKEN 0 /* set if vnode's callback was broken */
311#define AFS_VNODE_UNSET 1 /* set if vnode attributes not yet set */
312#define AFS_VNODE_MODIFIED 2 /* set if vnode's data modified */
313#define AFS_VNODE_ZAP_DATA 3 /* set if vnode's data should be invalidated */
314#define AFS_VNODE_DELETED 4 /* set if vnode deleted on server */
315#define AFS_VNODE_MOUNTPOINT 5 /* set if vnode is a mountpoint symlink */
316
317 long acl_order; /* ACL check count (callback break count) */
318
319 /* outstanding callback notification on this file */
320 struct rb_node server_rb; /* link in server->fs_vnodes */
321 struct rb_node cb_promise; /* link in server->cb_promises */
322 struct work_struct cb_broken_work; /* work to be done on callback break */
323 time_t cb_expires; /* time at which callback expires */
324 time_t cb_expires_at; /* time used to order cb_promise */
325 unsigned cb_version; /* callback version */
326 unsigned cb_expiry; /* callback expiry time */
327 afs_callback_type_t cb_type; /* type of callback */
328 bool cb_promised; /* true if promise still holds */
329};
330
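Note that the AFS_VNODE_* values are bit numbers for the atomic bitops, not masks; the mntpt.c hunk below switches to set_bit() for exactly this reason. Typical tests might look like:

	if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
		return -ESTALE;
	if (test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags))
		invalidate_remote_inode(&vnode->vfs_inode);
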
331/*
332 * cached security record for one user's attempt to access a vnode
333 */
334struct afs_permit {
335 struct key *key; /* RxRPC ticket holding a security context */
336 afs_access_t access_mask; /* access mask for this key */
337};
338
339/*
340 * cache of security records from attempts to access a vnode
341 */
342struct afs_permits {
343 struct rcu_head rcu; /* disposal procedure */
344 int count; /* number of records */
345 struct afs_permit permits[0]; /* the permits so far examined */
346};
347
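The permit cache is swapped wholesale and freed through RCU, which is why the struct embeds an rcu_head and afs_zap_permits() (declared below) takes one. A sketch of the replacement step, assuming xpermits is a newly built array:

	struct afs_permits *permits = vnode->permits;

	rcu_assign_pointer(vnode->permits, xpermits);
	if (permits)
		call_rcu(&permits->rcu, afs_zap_permits);
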
348/*
349 * record of one of a system's set of network interfaces
350 */
351struct afs_interface {
352 struct in_addr address; /* IPv4 address bound to interface */
353 struct in_addr netmask; /* netmask applied to address */
354 unsigned mtu; /* MTU of interface */
355};
356
357/*
358 * UUID definition [internet draft]
359 * - the timestamp is a 60-bit value, split 32/16/12, and goes in 100ns
360 * increments since midnight 15th October 1582
361 * - add AFS_UUID_TO_UNIX_TIME to convert unix time in 100ns units to UUID
362 * time
363 * - the clock sequence is a 14-bit counter to avoid duplicate times
364 */
365struct afs_uuid {
366 u32 time_low; /* low part of timestamp */
367 u16 time_mid; /* mid part of timestamp */
368 u16 time_hi_and_version; /* high part of timestamp and version */
369#define AFS_UUID_TO_UNIX_TIME 0x01b21dd213814000ULL
370#define AFS_UUID_TIMEHI_MASK 0x0fff
371#define AFS_UUID_VERSION_TIME 0x1000 /* time-based UUID */
372#define AFS_UUID_VERSION_NAME 0x3000 /* name-based UUID */
373#define AFS_UUID_VERSION_RANDOM 0x4000 /* (pseudo-)random generated UUID */
374 u8 clock_seq_hi_and_reserved; /* clock seq hi and variant */
375#define AFS_UUID_CLOCKHI_MASK 0x3f
376#define AFS_UUID_VARIANT_STD 0x80
377 u8 clock_seq_low; /* clock seq low */
378 u8 node[6]; /* spatially unique node ID (MAC addr) */
379};
380
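AFS_UUID_TO_UNIX_TIME is simply the number of 100 ns intervals between the UUID epoch (1582-10-15) and the Unix epoch: 141427 days * 86400 s * 10^7 = 122192928000000000 = 0x01b21dd213814000. Converting a Unix timestamp is therefore a multiply and an add, as afs_get_client_UUID() in main.c does below:

	u64 uuidtime;

	uuidtime  = (u64) ts.tv_sec * 10000000;	/* seconds -> 100 ns units */
	uuidtime += ts.tv_nsec / 100;
	uuidtime += AFS_UUID_TO_UNIX_TIME;	/* rebase onto 1582-10-15 */
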
381/*****************************************************************************/
382/*
383 * callback.c
384 */
385extern void afs_init_callback_state(struct afs_server *);
386extern void afs_broken_callback_work(struct work_struct *);
387extern void afs_break_callbacks(struct afs_server *, size_t,
388 struct afs_callback[]);
389extern void afs_discard_callback_on_delete(struct afs_vnode *);
390extern void afs_give_up_callback(struct afs_vnode *);
391extern void afs_dispatch_give_up_callbacks(struct work_struct *);
392extern void afs_flush_callback_breaks(struct afs_server *);
393extern int __init afs_callback_update_init(void);
394extern void afs_callback_update_kill(void);
395
54/* 396/*
55 * cell.c 397 * cell.c
56 */ 398 */
@@ -60,57 +402,156 @@ extern struct list_head afs_proc_cells;
60extern struct cachefs_index_def afs_cache_cell_index_def; 402extern struct cachefs_index_def afs_cache_cell_index_def;
61#endif 403#endif
62 404
405#define afs_get_cell(C) do { atomic_inc(&(C)->usage); } while(0)
406extern int afs_cell_init(char *);
407extern struct afs_cell *afs_cell_create(const char *, char *);
408extern struct afs_cell *afs_cell_lookup(const char *, unsigned);
409extern struct afs_cell *afs_grab_cell(struct afs_cell *);
410extern void afs_put_cell(struct afs_cell *);
411extern void afs_cell_purge(void);
412
413/*
414 * cmservice.c
415 */
416extern bool afs_cm_incoming_call(struct afs_call *);
417
63/* 418/*
64 * dir.c 419 * dir.c
65 */ 420 */
66extern const struct inode_operations afs_dir_inode_operations; 421extern const struct inode_operations afs_dir_inode_operations;
67extern const struct file_operations afs_dir_file_operations; 422extern const struct file_operations afs_dir_file_operations;
68 423
424extern int afs_permission(struct inode *, int, struct nameidata *);
425
69/* 426/*
70 * file.c 427 * file.c
71 */ 428 */
72extern const struct address_space_operations afs_fs_aops; 429extern const struct address_space_operations afs_fs_aops;
73extern const struct inode_operations afs_file_inode_operations; 430extern const struct inode_operations afs_file_inode_operations;
431extern const struct file_operations afs_file_operations;
432
433extern int afs_open(struct inode *, struct file *);
434extern int afs_release(struct inode *, struct file *);
74 435
75#ifdef AFS_CACHING_SUPPORT 436#ifdef AFS_CACHING_SUPPORT
76extern int afs_cache_get_page_cookie(struct page *page, 437extern int afs_cache_get_page_cookie(struct page *, struct cachefs_page **);
77 struct cachefs_page **_page_cookie);
78#endif 438#endif
79 439
80/* 440/*
81 * inode.c 441 * fsclient.c
82 */ 442 */
83extern int afs_iget(struct super_block *sb, struct afs_fid *fid, 443extern int afs_fs_fetch_file_status(struct afs_server *, struct key *,
84 struct inode **_inode); 444 struct afs_vnode *, struct afs_volsync *,
85extern int afs_inode_getattr(struct vfsmount *mnt, struct dentry *dentry, 445 const struct afs_wait_mode *);
86 struct kstat *stat); 446extern int afs_fs_give_up_callbacks(struct afs_server *,
87extern void afs_clear_inode(struct inode *inode); 447 const struct afs_wait_mode *);
448extern int afs_fs_fetch_data(struct afs_server *, struct key *,
449 struct afs_vnode *, off_t, size_t, struct page *,
450 const struct afs_wait_mode *);
451extern int afs_fs_create(struct afs_server *, struct key *,
452 struct afs_vnode *, const char *, umode_t,
453 struct afs_fid *, struct afs_file_status *,
454 struct afs_callback *,
455 const struct afs_wait_mode *);
456extern int afs_fs_remove(struct afs_server *, struct key *,
457 struct afs_vnode *, const char *, bool,
458 const struct afs_wait_mode *);
459extern int afs_fs_link(struct afs_server *, struct key *, struct afs_vnode *,
460 struct afs_vnode *, const char *,
461 const struct afs_wait_mode *);
462extern int afs_fs_symlink(struct afs_server *, struct key *,
463 struct afs_vnode *, const char *, const char *,
464 struct afs_fid *, struct afs_file_status *,
465 const struct afs_wait_mode *);
466extern int afs_fs_rename(struct afs_server *, struct key *,
467 struct afs_vnode *, const char *,
468 struct afs_vnode *, const char *,
469 const struct afs_wait_mode *);
88 470
89/* 471/*
90 * key_afs.c 472 * inode.c
91 */ 473 */
92#ifdef CONFIG_KEYS 474extern struct inode *afs_iget(struct super_block *, struct key *,
93extern int afs_key_register(void); 475 struct afs_fid *, struct afs_file_status *,
94extern void afs_key_unregister(void); 476 struct afs_callback *);
95#endif 477extern int afs_validate(struct afs_vnode *, struct key *);
478extern int afs_inode_getattr(struct vfsmount *, struct dentry *,
479 struct kstat *);
480extern void afs_zap_permits(struct rcu_head *);
481extern void afs_clear_inode(struct inode *);
96 482
97/* 483/*
98 * main.c 484 * main.c
99 */ 485 */
486extern struct afs_uuid afs_uuid;
100#ifdef AFS_CACHING_SUPPORT 487#ifdef AFS_CACHING_SUPPORT
101extern struct cachefs_netfs afs_cache_netfs; 488extern struct cachefs_netfs afs_cache_netfs;
102#endif 489#endif
103 490
104/* 491/*
492 * misc.c
493 */
494extern int afs_abort_to_error(u32);
495
496/*
105 * mntpt.c 497 * mntpt.c
106 */ 498 */
107extern const struct inode_operations afs_mntpt_inode_operations; 499extern const struct inode_operations afs_mntpt_inode_operations;
108extern const struct file_operations afs_mntpt_file_operations; 500extern const struct file_operations afs_mntpt_file_operations;
109extern struct afs_timer afs_mntpt_expiry_timer;
110extern struct afs_timer_ops afs_mntpt_expiry_timer_ops;
111extern unsigned long afs_mntpt_expiry_timeout; 501extern unsigned long afs_mntpt_expiry_timeout;
112 502
113extern int afs_mntpt_check_symlink(struct afs_vnode *vnode); 503extern int afs_mntpt_check_symlink(struct afs_vnode *, struct key *);
504extern void afs_mntpt_kill_timer(void);
505extern void afs_umount_begin(struct vfsmount *, int);
506
507/*
508 * proc.c
509 */
510extern int afs_proc_init(void);
511extern void afs_proc_cleanup(void);
512extern int afs_proc_cell_setup(struct afs_cell *);
513extern void afs_proc_cell_remove(struct afs_cell *);
514
515/*
516 * rxrpc.c
517 */
518extern int afs_open_socket(void);
519extern void afs_close_socket(void);
520extern int afs_make_call(struct in_addr *, struct afs_call *, gfp_t,
521 const struct afs_wait_mode *);
522extern struct afs_call *afs_alloc_flat_call(const struct afs_call_type *,
523 size_t, size_t);
524extern void afs_flat_call_destructor(struct afs_call *);
525extern void afs_transfer_reply(struct afs_call *, struct sk_buff *);
526extern void afs_send_empty_reply(struct afs_call *);
527extern void afs_send_simple_reply(struct afs_call *, const void *, size_t);
528extern int afs_extract_data(struct afs_call *, struct sk_buff *, bool, void *,
529 size_t);
530
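afs_alloc_flat_call() pairs a call-type definition with fixed-size request and reply buffers, and afs_make_call() transmits the result. A condensed sketch of how an fsclient.c-style user strings these together; the call-type name, the request-buffer field and the opcode constant are assumptions for illustration:

	struct afs_call *call;
	__be32 *bp;

	call = afs_alloc_flat_call(&afs_RXFSGiveUpCallBacks, 4, 0);
	if (!call)
		return -ENOMEM;

	bp = call->request;			/* assumed flat request buffer */
	*bp++ = htonl(FSGIVEUPCALLBACKS);	/* assumed RPC opcode */

	return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
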
531/*
532 * security.c
533 */
534extern void afs_clear_permits(struct afs_vnode *);
535extern void afs_cache_permit(struct afs_vnode *, struct key *, long);
536extern struct key *afs_request_key(struct afs_cell *);
537extern int afs_permission(struct inode *, int, struct nameidata *);
538
539/*
540 * server.c
541 */
542extern spinlock_t afs_server_peer_lock;
543
544#define afs_get_server(S) \
545do { \
546 _debug("GET SERVER %d", atomic_read(&(S)->usage)); \
547 atomic_inc(&(S)->usage); \
548} while(0)
549
550extern struct afs_server *afs_lookup_server(struct afs_cell *,
551 const struct in_addr *);
552extern struct afs_server *afs_find_server(const struct in_addr *);
553extern void afs_put_server(struct afs_server *);
554extern void __exit afs_purge_servers(void);
114 555
115/* 556/*
116 * super.c 557 * super.c
@@ -118,22 +559,211 @@ extern int afs_mntpt_check_symlink(struct afs_vnode *vnode);
118extern int afs_fs_init(void); 559extern int afs_fs_init(void);
119extern void afs_fs_exit(void); 560extern void afs_fs_exit(void);
120 561
121#define AFS_CB_HASH_COUNT (PAGE_SIZE / sizeof(struct list_head)) 562/*
563 * use-rtnetlink.c
564 */
565extern int afs_get_ipv4_interfaces(struct afs_interface *, size_t, bool);
566extern int afs_get_MAC_address(u8 *, size_t);
122 567
123extern struct list_head afs_cb_hash_tbl[]; 568/*
124extern spinlock_t afs_cb_hash_lock; 569 * vlclient.c
570 */
571#ifdef AFS_CACHING_SUPPORT
572extern struct cachefs_index_def afs_vlocation_cache_index_def;
573#endif
125 574
126#define afs_cb_hash(SRV,FID) \ 575extern int afs_vl_get_entry_by_name(struct in_addr *, struct key *,
127 afs_cb_hash_tbl[((unsigned long)(SRV) + \ 576 const char *, struct afs_cache_vlocation *,
128 (FID)->vid + (FID)->vnode + (FID)->unique) % \ 577 const struct afs_wait_mode *);
129 AFS_CB_HASH_COUNT] 578extern int afs_vl_get_entry_by_id(struct in_addr *, struct key *,
579 afs_volid_t, afs_voltype_t,
580 struct afs_cache_vlocation *,
581 const struct afs_wait_mode *);
130 582
131/* 583/*
132 * proc.c 584 * vlocation.c
133 */ 585 */
134extern int afs_proc_init(void); 586#define afs_get_vlocation(V) do { atomic_inc(&(V)->usage); } while(0)
135extern void afs_proc_cleanup(void); 587
136extern int afs_proc_cell_setup(struct afs_cell *cell); 588extern int __init afs_vlocation_update_init(void);
137extern void afs_proc_cell_remove(struct afs_cell *cell); 589extern struct afs_vlocation *afs_vlocation_lookup(struct afs_cell *,
590 struct key *,
591 const char *, size_t);
592extern void afs_put_vlocation(struct afs_vlocation *);
593extern void afs_vlocation_purge(void);
594
595/*
596 * vnode.c
597 */
598#ifdef AFS_CACHING_SUPPORT
599extern struct cachefs_index_def afs_vnode_cache_index_def;
600#endif
601
602extern struct afs_timer_ops afs_vnode_cb_timed_out_ops;
603
604static inline struct afs_vnode *AFS_FS_I(struct inode *inode)
605{
606 return container_of(inode, struct afs_vnode, vfs_inode);
607}
608
609static inline struct inode *AFS_VNODE_TO_I(struct afs_vnode *vnode)
610{
611 return &vnode->vfs_inode;
612}
613
614extern void afs_vnode_finalise_status_update(struct afs_vnode *,
615 struct afs_server *);
616extern int afs_vnode_fetch_status(struct afs_vnode *, struct afs_vnode *,
617 struct key *);
618extern int afs_vnode_fetch_data(struct afs_vnode *, struct key *,
619 off_t, size_t, struct page *);
620extern int afs_vnode_create(struct afs_vnode *, struct key *, const char *,
621 umode_t, struct afs_fid *, struct afs_file_status *,
622 struct afs_callback *, struct afs_server **);
623extern int afs_vnode_remove(struct afs_vnode *, struct key *, const char *,
624 bool);
625extern int afs_vnode_link(struct afs_vnode *, struct afs_vnode *, struct key *,
626 const char *);
627extern int afs_vnode_symlink(struct afs_vnode *, struct key *, const char *,
628 const char *, struct afs_fid *,
629 struct afs_file_status *, struct afs_server **);
630extern int afs_vnode_rename(struct afs_vnode *, struct afs_vnode *,
631 struct key *, const char *, const char *);
632
633/*
634 * volume.c
635 */
636#ifdef AFS_CACHING_SUPPORT
637extern struct cachefs_index_def afs_volume_cache_index_def;
638#endif
639
640#define afs_get_volume(V) do { atomic_inc(&(V)->usage); } while(0)
641
642extern void afs_put_volume(struct afs_volume *);
643extern struct afs_volume *afs_volume_lookup(struct afs_mount_params *);
644extern struct afs_server *afs_volume_pick_fileserver(struct afs_vnode *);
645extern int afs_volume_release_fileserver(struct afs_vnode *,
646 struct afs_server *, int);
647
648/*****************************************************************************/
649/*
650 * debug tracing
651 */
652extern unsigned afs_debug;
653
654#define dbgprintk(FMT,...) \
655 printk("[%x%-6.6s] "FMT"\n", smp_processor_id(), current->comm ,##__VA_ARGS__)
656
657/* make sure we maintain the format strings, even when debugging is disabled */
658static inline __attribute__((format(printf,1,2)))
659void _dbprintk(const char *fmt, ...)
660{
661}
662
663#define kenter(FMT,...) dbgprintk("==> %s("FMT")",__FUNCTION__ ,##__VA_ARGS__)
664#define kleave(FMT,...) dbgprintk("<== %s()"FMT"",__FUNCTION__ ,##__VA_ARGS__)
665#define kdebug(FMT,...) dbgprintk(" "FMT ,##__VA_ARGS__)
666
667
668#if defined(__KDEBUG)
669#define _enter(FMT,...) kenter(FMT,##__VA_ARGS__)
670#define _leave(FMT,...) kleave(FMT,##__VA_ARGS__)
671#define _debug(FMT,...) kdebug(FMT,##__VA_ARGS__)
672
673#elif defined(CONFIG_AFS_DEBUG)
674#define AFS_DEBUG_KENTER 0x01
675#define AFS_DEBUG_KLEAVE 0x02
676#define AFS_DEBUG_KDEBUG 0x04
677
678#define _enter(FMT,...) \
679do { \
680 if (unlikely(afs_debug & AFS_DEBUG_KENTER)) \
681 kenter(FMT,##__VA_ARGS__); \
682} while (0)
683
684#define _leave(FMT,...) \
685do { \
686 if (unlikely(afs_debug & AFS_DEBUG_KLEAVE)) \
687 kleave(FMT,##__VA_ARGS__); \
688} while (0)
689
690#define _debug(FMT,...) \
691do { \
692 if (unlikely(afs_debug & AFS_DEBUG_KDEBUG)) \
693 kdebug(FMT,##__VA_ARGS__); \
694} while (0)
695
696#else
697#define _enter(FMT,...) _dbprintk("==> %s("FMT")",__FUNCTION__ ,##__VA_ARGS__)
698#define _leave(FMT,...) _dbprintk("<== %s()"FMT"",__FUNCTION__ ,##__VA_ARGS__)
699#define _debug(FMT,...) _dbprintk(" "FMT ,##__VA_ARGS__)
700#endif
701
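With CONFIG_AFS_DEBUG, each class is gated individually by the afs_debug mask (bit 0 = enter, bit 1 = leave, bit 2 = debug), which main.c below exposes as a writable module parameter. Typical instrumentation elsewhere in the code looks like this sketch:

	static int afs_do_thing(struct afs_vnode *vnode)
	{
		_enter("{%u,%u}", vnode->fid.vnode, vnode->fid.unique);
		/* ... */
		_leave(" = 0");
		return 0;
	}

At runtime, assuming the module is built as kafs.ko, writing 7 to /sys/module/kafs/parameters/debug would turn all three classes on.
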
702/*
703 * debug assertion checking
704 */
705#if 1 // defined(__KDEBUGALL)
706
707#define ASSERT(X) \
708do { \
709 if (unlikely(!(X))) { \
710 printk(KERN_ERR "\n"); \
711 printk(KERN_ERR "AFS: Assertion failed\n"); \
712 BUG(); \
713 } \
714} while(0)
715
716#define ASSERTCMP(X, OP, Y) \
717do { \
718 if (unlikely(!((X) OP (Y)))) { \
719 printk(KERN_ERR "\n"); \
720 printk(KERN_ERR "AFS: Assertion failed\n"); \
721 printk(KERN_ERR "%lu " #OP " %lu is false\n", \
722 (unsigned long)(X), (unsigned long)(Y)); \
723 printk(KERN_ERR "0x%lx " #OP " 0x%lx is false\n", \
724 (unsigned long)(X), (unsigned long)(Y)); \
725 BUG(); \
726 } \
727} while(0)
728
729#define ASSERTIF(C, X) \
730do { \
731 if (unlikely((C) && !(X))) { \
732 printk(KERN_ERR "\n"); \
733 printk(KERN_ERR "AFS: Assertion failed\n"); \
734 BUG(); \
735 } \
736} while(0)
737
738#define ASSERTIFCMP(C, X, OP, Y) \
739do { \
740 if (unlikely((C) && !((X) OP (Y)))) { \
741 printk(KERN_ERR "\n"); \
742 printk(KERN_ERR "AFS: Assertion failed\n"); \
743 printk(KERN_ERR "%lu " #OP " %lu is false\n", \
744 (unsigned long)(X), (unsigned long)(Y)); \
745 printk(KERN_ERR "0x%lx " #OP " 0x%lx is false\n", \
746 (unsigned long)(X), (unsigned long)(Y)); \
747 BUG(); \
748 } \
749} while(0)
750
751#else
752
753#define ASSERT(X) \
754do { \
755} while(0)
756
757#define ASSERTCMP(X, OP, Y) \
758do { \
759} while(0)
760
761#define ASSERTIF(C, X) \
762do { \
763} while(0)
764
765#define ASSERTIFCMP(C, X, OP, Y) \
766do { \
767} while(0)
138 768
139#endif /* AFS_INTERNAL_H */ 769#endif /* __KDEBUGALL */
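The comparison variants record the failing expression in both decimal and hex before hitting BUG(); call sites pass the operator as a bare token, for example:

	ASSERT(server != NULL);
	ASSERTCMP(atomic_read(&server->usage), >, 0);
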
diff --git a/fs/afs/kafsasyncd.c b/fs/afs/kafsasyncd.c
deleted file mode 100644
index 615df2407cb2..000000000000
--- a/fs/afs/kafsasyncd.c
+++ /dev/null
@@ -1,255 +0,0 @@
1/* kafsasyncd.c: AFS asynchronous operation daemon
2 *
3 * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 *
11 *
12 * The AFS async daemon is used to do the following:
13 * - probe "dead" servers to see whether they've come back to life yet.
14 * - probe "live" servers that we haven't talked to for a while to see if they are better
15 *   candidates for serving than the one we're currently using
16 * - poll volume location servers to keep volume location lists up to date
17 */
18
19#include <linux/module.h>
20#include <linux/init.h>
21#include <linux/sched.h>
22#include <linux/completion.h>
23#include <linux/freezer.h>
24#include "cell.h"
25#include "server.h"
26#include "volume.h"
27#include "kafsasyncd.h"
28#include "kafstimod.h"
29#include <rxrpc/call.h>
30#include <asm/errno.h>
31#include "internal.h"
32
33static DECLARE_COMPLETION(kafsasyncd_alive);
34static DECLARE_COMPLETION(kafsasyncd_dead);
35static DECLARE_WAIT_QUEUE_HEAD(kafsasyncd_sleepq);
36static struct task_struct *kafsasyncd_task;
37static int kafsasyncd_die;
38
39static int kafsasyncd(void *arg);
40
41static LIST_HEAD(kafsasyncd_async_attnq);
42static LIST_HEAD(kafsasyncd_async_busyq);
43static DEFINE_SPINLOCK(kafsasyncd_async_lock);
44
45static void kafsasyncd_null_call_attn_func(struct rxrpc_call *call)
46{
47}
48
49static void kafsasyncd_null_call_error_func(struct rxrpc_call *call)
50{
51}
52
53/*****************************************************************************/
54/*
55 * start the async daemon
56 */
57int afs_kafsasyncd_start(void)
58{
59 int ret;
60
61 ret = kernel_thread(kafsasyncd, NULL, 0);
62 if (ret < 0)
63 return ret;
64
65 wait_for_completion(&kafsasyncd_alive);
66
67 return ret;
68} /* end afs_kafsasyncd_start() */
69
70/*****************************************************************************/
71/*
72 * stop the async daemon
73 */
74void afs_kafsasyncd_stop(void)
75{
76 /* get rid of my daemon */
77 kafsasyncd_die = 1;
78 wake_up(&kafsasyncd_sleepq);
79 wait_for_completion(&kafsasyncd_dead);
80
81} /* end afs_kafsasyncd_stop() */
82
83/*****************************************************************************/
84/*
85 * probing daemon
86 */
87static int kafsasyncd(void *arg)
88{
89 struct afs_async_op *op;
90 int die;
91
92 DECLARE_WAITQUEUE(myself, current);
93
94 kafsasyncd_task = current;
95
96 printk("kAFS: Started kafsasyncd %d\n", current->pid);
97
98 daemonize("kafsasyncd");
99
100 complete(&kafsasyncd_alive);
101
102 /* loop around looking for things to attend to */
103 do {
104 set_current_state(TASK_INTERRUPTIBLE);
105 add_wait_queue(&kafsasyncd_sleepq, &myself);
106
107 for (;;) {
108 if (!list_empty(&kafsasyncd_async_attnq) ||
109 signal_pending(current) ||
110 kafsasyncd_die)
111 break;
112
113 schedule();
114 set_current_state(TASK_INTERRUPTIBLE);
115 }
116
117 remove_wait_queue(&kafsasyncd_sleepq, &myself);
118 set_current_state(TASK_RUNNING);
119
120 try_to_freeze();
121
122 /* discard pending signals */
123 afs_discard_my_signals();
124
125 die = kafsasyncd_die;
126
127 /* deal with the next asynchronous operation requiring
128 * attention */
129 if (!list_empty(&kafsasyncd_async_attnq)) {
130 struct afs_async_op *op;
131
132 _debug("@@@ Begin Asynchronous Operation");
133
134 op = NULL;
135 spin_lock(&kafsasyncd_async_lock);
136
137 if (!list_empty(&kafsasyncd_async_attnq)) {
138 op = list_entry(kafsasyncd_async_attnq.next,
139 struct afs_async_op, link);
140 list_move_tail(&op->link,
141 &kafsasyncd_async_busyq);
142 }
143
144 spin_unlock(&kafsasyncd_async_lock);
145
146 _debug("@@@ Operation %p {%p}\n",
147 op, op ? op->ops : NULL);
148
149 if (op)
150 op->ops->attend(op);
151
152 _debug("@@@ End Asynchronous Operation");
153 }
154
155 } while(!die);
156
157 /* need to kill all outstanding asynchronous operations before
158 * exiting */
159 kafsasyncd_task = NULL;
160 spin_lock(&kafsasyncd_async_lock);
161
162 /* fold the busy and attention queues together */
163 list_splice_init(&kafsasyncd_async_busyq,
164 &kafsasyncd_async_attnq);
165
166 /* dequeue kafsasyncd from all their wait queues */
167 list_for_each_entry(op, &kafsasyncd_async_attnq, link) {
168 op->call->app_attn_func = kafsasyncd_null_call_attn_func;
169 op->call->app_error_func = kafsasyncd_null_call_error_func;
170 remove_wait_queue(&op->call->waitq, &op->waiter);
171 }
172
173 spin_unlock(&kafsasyncd_async_lock);
174
175 /* abort all the operations */
176 while (!list_empty(&kafsasyncd_async_attnq)) {
177 op = list_entry(kafsasyncd_async_attnq.next, struct afs_async_op, link);
178 list_del_init(&op->link);
179
180 rxrpc_call_abort(op->call, -EIO);
181 rxrpc_put_call(op->call);
182 op->call = NULL;
183
184 op->ops->discard(op);
185 }
186
187 /* and that's all */
188 _leave("");
189 complete_and_exit(&kafsasyncd_dead, 0);
190
191} /* end kafsasyncd() */
192
193/*****************************************************************************/
194/*
195 * begin an operation
196 * - place operation on busy queue
197 */
198void afs_kafsasyncd_begin_op(struct afs_async_op *op)
199{
200 _enter("");
201
202 spin_lock(&kafsasyncd_async_lock);
203
204 init_waitqueue_entry(&op->waiter, kafsasyncd_task);
205 add_wait_queue(&op->call->waitq, &op->waiter);
206
207 list_move_tail(&op->link, &kafsasyncd_async_busyq);
208
209 spin_unlock(&kafsasyncd_async_lock);
210
211 _leave("");
212} /* end afs_kafsasyncd_begin_op() */
213
214/*****************************************************************************/
215/*
216 * request attention for an operation
217 * - move to attention queue
218 */
219void afs_kafsasyncd_attend_op(struct afs_async_op *op)
220{
221 _enter("");
222
223 spin_lock(&kafsasyncd_async_lock);
224
225 list_move_tail(&op->link, &kafsasyncd_async_attnq);
226
227 spin_unlock(&kafsasyncd_async_lock);
228
229 wake_up(&kafsasyncd_sleepq);
230
231 _leave("");
232} /* end afs_kafsasyncd_attend_op() */
233
234/*****************************************************************************/
235/*
236 * terminate an operation
237 * - remove from either queue
238 */
239void afs_kafsasyncd_terminate_op(struct afs_async_op *op)
240{
241 _enter("");
242
243 spin_lock(&kafsasyncd_async_lock);
244
245 if (!list_empty(&op->link)) {
246 list_del_init(&op->link);
247 remove_wait_queue(&op->call->waitq, &op->waiter);
248 }
249
250 spin_unlock(&kafsasyncd_async_lock);
251
252 wake_up(&kafsasyncd_sleepq);
253
254 _leave("");
255} /* end afs_kafsasyncd_terminate_op() */
diff --git a/fs/afs/kafsasyncd.h b/fs/afs/kafsasyncd.h
deleted file mode 100644
index 791803f9a6fb..000000000000
--- a/fs/afs/kafsasyncd.h
+++ /dev/null
@@ -1,52 +0,0 @@
1/* kafsasyncd.h: AFS asynchronous operation daemon
2 *
3 * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#ifndef _LINUX_AFS_KAFSASYNCD_H
13#define _LINUX_AFS_KAFSASYNCD_H
14
15#include "types.h"
16
17struct afs_async_op;
18
19struct afs_async_op_ops {
20 void (*attend)(struct afs_async_op *op);
21 void (*discard)(struct afs_async_op *op);
22};
23
24/*****************************************************************************/
25/*
26 * asynchronous operation record
27 */
28struct afs_async_op
29{
30 struct list_head link;
31 struct afs_server *server; /* server being contacted */
32 struct rxrpc_call *call; /* RxRPC call performing op */
33 wait_queue_t waiter; /* wait queue for kafsasyncd */
34 const struct afs_async_op_ops *ops; /* operations */
35};
36
37static inline void afs_async_op_init(struct afs_async_op *op,
38 const struct afs_async_op_ops *ops)
39{
40 INIT_LIST_HEAD(&op->link);
41 op->call = NULL;
42 op->ops = ops;
43}
44
45extern int afs_kafsasyncd_start(void);
46extern void afs_kafsasyncd_stop(void);
47
48extern void afs_kafsasyncd_begin_op(struct afs_async_op *op);
49extern void afs_kafsasyncd_attend_op(struct afs_async_op *op);
50extern void afs_kafsasyncd_terminate_op(struct afs_async_op *op);
51
52#endif /* _LINUX_AFS_KAFSASYNCD_H */
diff --git a/fs/afs/kafstimod.c b/fs/afs/kafstimod.c
deleted file mode 100644
index 694344e4d3c7..000000000000
--- a/fs/afs/kafstimod.c
+++ /dev/null
@@ -1,205 +0,0 @@
1/* kafstimod.c: AFS timeout daemon
2 *
3 * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#include <linux/module.h>
13#include <linux/init.h>
14#include <linux/sched.h>
15#include <linux/completion.h>
16#include <linux/freezer.h>
17#include "cell.h"
18#include "volume.h"
19#include "kafstimod.h"
20#include <asm/errno.h>
21#include "internal.h"
22
23static DECLARE_COMPLETION(kafstimod_alive);
24static DECLARE_COMPLETION(kafstimod_dead);
25static DECLARE_WAIT_QUEUE_HEAD(kafstimod_sleepq);
26static int kafstimod_die;
27
28static LIST_HEAD(kafstimod_list);
29static DEFINE_SPINLOCK(kafstimod_lock);
30
31static int kafstimod(void *arg);
32
33/*****************************************************************************/
34/*
35 * start the timeout daemon
36 */
37int afs_kafstimod_start(void)
38{
39 int ret;
40
41 ret = kernel_thread(kafstimod, NULL, 0);
42 if (ret < 0)
43 return ret;
44
45 wait_for_completion(&kafstimod_alive);
46
47 return ret;
48} /* end afs_kafstimod_start() */
49
50/*****************************************************************************/
51/*
52 * stop the timeout daemon
53 */
54void afs_kafstimod_stop(void)
55{
56 /* get rid of my daemon */
57 kafstimod_die = 1;
58 wake_up(&kafstimod_sleepq);
59 wait_for_completion(&kafstimod_dead);
60
61} /* end afs_kafstimod_stop() */
62
63/*****************************************************************************/
64/*
65 * timeout processing daemon
66 */
67static int kafstimod(void *arg)
68{
69 struct afs_timer *timer;
70
71 DECLARE_WAITQUEUE(myself, current);
72
73 printk("kAFS: Started kafstimod %d\n", current->pid);
74
75 daemonize("kafstimod");
76
77 complete(&kafstimod_alive);
78
79 /* loop around looking for things to attend to */
80 loop:
81 set_current_state(TASK_INTERRUPTIBLE);
82 add_wait_queue(&kafstimod_sleepq, &myself);
83
84 for (;;) {
85 unsigned long jif;
86 signed long timeout;
87
88 /* deal with the server being asked to die */
89 if (kafstimod_die) {
90 remove_wait_queue(&kafstimod_sleepq, &myself);
91 _leave("");
92 complete_and_exit(&kafstimod_dead, 0);
93 }
94
95 try_to_freeze();
96
97 /* discard pending signals */
98 afs_discard_my_signals();
99
100 /* work out the time to elapse before the next event */
101 spin_lock(&kafstimod_lock);
102 if (list_empty(&kafstimod_list)) {
103 timeout = MAX_SCHEDULE_TIMEOUT;
104 }
105 else {
106 timer = list_entry(kafstimod_list.next,
107 struct afs_timer, link);
108 timeout = timer->timo_jif;
109 jif = jiffies;
110
111 if (time_before_eq((unsigned long) timeout, jif))
112 goto immediate;
113
114 else {
115 timeout = (long) timeout - (long) jiffies;
116 }
117 }
118 spin_unlock(&kafstimod_lock);
119
120 schedule_timeout(timeout);
121
122 set_current_state(TASK_INTERRUPTIBLE);
123 }
124
125 /* the thing on the front of the queue needs processing
126 * - we come here with the lock held and timer pointing to the expired
127 * entry
128 */
129 immediate:
130 remove_wait_queue(&kafstimod_sleepq, &myself);
131 set_current_state(TASK_RUNNING);
132
133 _debug("@@@ Begin Timeout of %p", timer);
134
135 /* dequeue the timer */
136 list_del_init(&timer->link);
137 spin_unlock(&kafstimod_lock);
138
139 /* call the timeout function */
140 timer->ops->timed_out(timer);
141
142 _debug("@@@ End Timeout");
143 goto loop;
144
145} /* end kafstimod() */
146
147/*****************************************************************************/
148/*
149 * (re-)queue a timer
150 */
151void afs_kafstimod_add_timer(struct afs_timer *timer, unsigned long timeout)
152{
153 struct afs_timer *ptimer;
154 struct list_head *_p;
155
156 _enter("%p,%lu", timer, timeout);
157
158 spin_lock(&kafstimod_lock);
159
160 list_del(&timer->link);
161
162 /* the timer was deferred or reset - put it back in the queue at the
163 * right place */
164 timer->timo_jif = jiffies + timeout;
165
166 list_for_each(_p, &kafstimod_list) {
167 ptimer = list_entry(_p, struct afs_timer, link);
168 if (time_before(timer->timo_jif, ptimer->timo_jif))
169 break;
170 }
171
172 list_add_tail(&timer->link, _p); /* insert before stopping point */
173
174 spin_unlock(&kafstimod_lock);
175
176 wake_up(&kafstimod_sleepq);
177
178 _leave("");
179} /* end afs_kafstimod_add_timer() */
180
181/*****************************************************************************/
182/*
183 * dequeue a timer
184 * - returns 0 if the timer was deleted or -ENOENT if it wasn't queued
185 */
186int afs_kafstimod_del_timer(struct afs_timer *timer)
187{
188 int ret = 0;
189
190 _enter("%p", timer);
191
192 spin_lock(&kafstimod_lock);
193
194 if (list_empty(&timer->link))
195 ret = -ENOENT;
196 else
197 list_del_init(&timer->link);
198
199 spin_unlock(&kafstimod_lock);
200
201 wake_up(&kafstimod_sleepq);
202
203 _leave(" = %d", ret);
204 return ret;
205} /* end afs_kafstimod_del_timer() */
diff --git a/fs/afs/kafstimod.h b/fs/afs/kafstimod.h
deleted file mode 100644
index e312f1a61a7f..000000000000
--- a/fs/afs/kafstimod.h
+++ /dev/null
@@ -1,49 +0,0 @@
1/* kafstimod.h: AFS timeout daemon
2 *
3 * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#ifndef _LINUX_AFS_KAFSTIMOD_H
13#define _LINUX_AFS_KAFSTIMOD_H
14
15#include "types.h"
16
17struct afs_timer;
18
19struct afs_timer_ops {
20 /* called when the front of the timer queue has timed out */
21 void (*timed_out)(struct afs_timer *timer);
22};
23
24/*****************************************************************************/
25/*
26 * AFS timer/timeout record
27 */
28struct afs_timer
29{
30 struct list_head link; /* link in timer queue */
31 unsigned long timo_jif; /* timeout time */
32 const struct afs_timer_ops *ops; /* timeout expiry function */
33};
34
35static inline void afs_timer_init(struct afs_timer *timer,
36 const struct afs_timer_ops *ops)
37{
38 INIT_LIST_HEAD(&timer->link);
39 timer->ops = ops;
40}
41
42extern int afs_kafstimod_start(void);
43extern void afs_kafstimod_stop(void);
44
45extern void afs_kafstimod_add_timer(struct afs_timer *timer,
46 unsigned long timeout);
47extern int afs_kafstimod_del_timer(struct afs_timer *timer);
48
49#endif /* _LINUX_AFS_KAFSTIMOD_H */
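Both daemons removed above are superseded by the shared kernel workqueue: the rewritten code hangs struct delayed_work items off its objects (see cb_updater and cb_break_work in struct afs_server, and the mntpt.c hunk below) instead of keeping a hand-sorted timer list. What used to be afs_kafstimod_add_timer()/del_timer() collapses to the standard calls, sketched here with the mountpoint timer:

	static DECLARE_DELAYED_WORK(afs_mntpt_expiry_timer,
				    afs_mntpt_expiry_timed_out);

	/* (re)arm to fire after the expiry timeout */
	schedule_delayed_work(&afs_mntpt_expiry_timer,
			      afs_mntpt_expiry_timeout * HZ);

	/* and on teardown */
	cancel_delayed_work(&afs_mntpt_expiry_timer);
	flush_scheduled_work();
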
diff --git a/fs/afs/main.c b/fs/afs/main.c
index f2704ba53857..80ec6fd19a73 100644
--- a/fs/afs/main.c
+++ b/fs/afs/main.c
@@ -1,4 +1,4 @@
1/* main.c: AFS client file system 1/* AFS client file system
2 * 2 *
3 * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved. 3 * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com) 4 * Written by David Howells (dhowells@redhat.com)
@@ -13,43 +13,21 @@
13#include <linux/moduleparam.h> 13#include <linux/moduleparam.h>
14#include <linux/init.h> 14#include <linux/init.h>
15#include <linux/completion.h> 15#include <linux/completion.h>
16#include <rxrpc/rxrpc.h>
17#include <rxrpc/transport.h>
18#include <rxrpc/call.h>
19#include <rxrpc/peer.h>
20#include "cache.h"
21#include "cell.h"
22#include "server.h"
23#include "fsclient.h"
24#include "cmservice.h"
25#include "kafstimod.h"
26#include "kafsasyncd.h"
27#include "internal.h" 16#include "internal.h"
28 17
29struct rxrpc_transport *afs_transport;
30
31static int afs_adding_peer(struct rxrpc_peer *peer);
32static void afs_discarding_peer(struct rxrpc_peer *peer);
33
34
35MODULE_DESCRIPTION("AFS Client File System"); 18MODULE_DESCRIPTION("AFS Client File System");
36MODULE_AUTHOR("Red Hat, Inc."); 19MODULE_AUTHOR("Red Hat, Inc.");
37MODULE_LICENSE("GPL"); 20MODULE_LICENSE("GPL");
38 21
22unsigned afs_debug;
23module_param_named(debug, afs_debug, uint, S_IWUSR | S_IRUGO);
24MODULE_PARM_DESC(afs_debug, "AFS debugging mask");
25
39static char *rootcell; 26static char *rootcell;
40 27
41module_param(rootcell, charp, 0); 28module_param(rootcell, charp, 0);
42MODULE_PARM_DESC(rootcell, "root AFS cell name and VL server IP addr list"); 29MODULE_PARM_DESC(rootcell, "root AFS cell name and VL server IP addr list");
43 30
44
45static struct rxrpc_peer_ops afs_peer_ops = {
46 .adding = afs_adding_peer,
47 .discarding = afs_discarding_peer,
48};
49
50struct list_head afs_cb_hash_tbl[AFS_CB_HASH_COUNT];
51DEFINE_SPINLOCK(afs_cb_hash_lock);
52
53#ifdef AFS_CACHING_SUPPORT 31#ifdef AFS_CACHING_SUPPORT
54static struct cachefs_netfs_operations afs_cache_ops = { 32static struct cachefs_netfs_operations afs_cache_ops = {
55 .get_page_cookie = afs_cache_get_page_cookie, 33 .get_page_cookie = afs_cache_get_page_cookie,
@@ -62,20 +40,63 @@ struct cachefs_netfs afs_cache_netfs = {
62}; 40};
63#endif 41#endif
64 42
65/*****************************************************************************/ 43struct afs_uuid afs_uuid;
44
45/*
46 * get a client UUID
47 */
48static int __init afs_get_client_UUID(void)
49{
50 struct timespec ts;
51 u64 uuidtime;
52 u16 clockseq;
53 int ret;
54
55 /* read the MAC address of one of the external interfaces and construct
56 * a UUID from it */
57 ret = afs_get_MAC_address(afs_uuid.node, sizeof(afs_uuid.node));
58 if (ret < 0)
59 return ret;
60
61 getnstimeofday(&ts);
62 uuidtime = (u64) ts.tv_sec * 1000 * 1000 * 10;
63 uuidtime += ts.tv_nsec / 100;
64 uuidtime += AFS_UUID_TO_UNIX_TIME;
65 afs_uuid.time_low = uuidtime;
66 afs_uuid.time_mid = uuidtime >> 32;
67 afs_uuid.time_hi_and_version = (uuidtime >> 48) & AFS_UUID_TIMEHI_MASK;
 68 afs_uuid.time_hi_and_version |= AFS_UUID_VERSION_TIME;
69
70 get_random_bytes(&clockseq, 2);
71 afs_uuid.clock_seq_low = clockseq;
72 afs_uuid.clock_seq_hi_and_reserved =
73 (clockseq >> 8) & AFS_UUID_CLOCKHI_MASK;
 74 afs_uuid.clock_seq_hi_and_reserved |= AFS_UUID_VARIANT_STD;
75
76 _debug("AFS UUID: %08x-%04x-%04x-%02x%02x-%02x%02x%02x%02x%02x%02x",
77 afs_uuid.time_low,
78 afs_uuid.time_mid,
79 afs_uuid.time_hi_and_version,
80 afs_uuid.clock_seq_hi_and_reserved,
81 afs_uuid.clock_seq_low,
82 afs_uuid.node[0], afs_uuid.node[1], afs_uuid.node[2],
83 afs_uuid.node[3], afs_uuid.node[4], afs_uuid.node[5]);
84
85 return 0;
86}
87
66/* 88/*
67 * initialise the AFS client FS module 89 * initialise the AFS client FS module
68 */ 90 */
69static int __init afs_init(void) 91static int __init afs_init(void)
70{ 92{
71 int loop, ret; 93 int ret;
72 94
73 printk(KERN_INFO "kAFS: Red Hat AFS client v0.1 registering.\n"); 95 printk(KERN_INFO "kAFS: Red Hat AFS client v0.1 registering.\n");
74 96
75 /* initialise the callback hash table */ 97 ret = afs_get_client_UUID();
76 spin_lock_init(&afs_cb_hash_lock); 98 if (ret < 0)
77 for (loop = AFS_CB_HASH_COUNT - 1; loop >= 0; loop--) 99 return ret;
78 INIT_LIST_HEAD(&afs_cb_hash_tbl[loop]);
79 100
80 /* register the /proc stuff */ 101 /* register the /proc stuff */
81 ret = afs_proc_init(); 102 ret = afs_proc_init();
@@ -87,70 +108,56 @@ static int __init afs_init(void)
87 ret = cachefs_register_netfs(&afs_cache_netfs, 108 ret = cachefs_register_netfs(&afs_cache_netfs,
88 &afs_cache_cell_index_def); 109 &afs_cache_cell_index_def);
89 if (ret < 0) 110 if (ret < 0)
90 goto error;
91#endif
92
93#ifdef CONFIG_KEYS_TURNED_OFF
94 ret = afs_key_register();
95 if (ret < 0)
96 goto error_cache; 111 goto error_cache;
97#endif 112#endif
98 113
99 /* initialise the cell DB */ 114 /* initialise the cell DB */
100 ret = afs_cell_init(rootcell); 115 ret = afs_cell_init(rootcell);
101 if (ret < 0) 116 if (ret < 0)
102 goto error_keys; 117 goto error_cell_init;
103 118
104 /* start the timeout daemon */ 119 /* initialise the VL update process */
105 ret = afs_kafstimod_start(); 120 ret = afs_vlocation_update_init();
106 if (ret < 0) 121 if (ret < 0)
107 goto error_keys; 122 goto error_vl_update_init;
108 123
109 /* start the async operation daemon */ 124 /* initialise the callback update process */
110 ret = afs_kafsasyncd_start(); 125 ret = afs_callback_update_init();
111 if (ret < 0)
112 goto error_kafstimod;
113 126
114 /* create the RxRPC transport */ 127 /* create the RxRPC transport */
115 ret = rxrpc_create_transport(7001, &afs_transport); 128 ret = afs_open_socket();
116 if (ret < 0) 129 if (ret < 0)
117 goto error_kafsasyncd; 130 goto error_open_socket;
118
119 afs_transport->peer_ops = &afs_peer_ops;
120 131
121 /* register the filesystems */ 132 /* register the filesystems */
122 ret = afs_fs_init(); 133 ret = afs_fs_init();
123 if (ret < 0) 134 if (ret < 0)
124 goto error_transport; 135 goto error_fs;
125 136
126 return ret; 137 return ret;
127 138
128 error_transport: 139error_fs:
129 rxrpc_put_transport(afs_transport); 140 afs_close_socket();
130 error_kafsasyncd: 141error_open_socket:
131 afs_kafsasyncd_stop(); 142error_vl_update_init:
132 error_kafstimod: 143error_cell_init:
133 afs_kafstimod_stop();
134 error_keys:
135#ifdef CONFIG_KEYS_TURNED_OFF
136 afs_key_unregister();
137 error_cache:
138#endif
139#ifdef AFS_CACHING_SUPPORT 144#ifdef AFS_CACHING_SUPPORT
140 cachefs_unregister_netfs(&afs_cache_netfs); 145 cachefs_unregister_netfs(&afs_cache_netfs);
141 error: 146error_cache:
142#endif 147#endif
148 afs_callback_update_kill();
149 afs_vlocation_purge();
143 afs_cell_purge(); 150 afs_cell_purge();
144 afs_proc_cleanup(); 151 afs_proc_cleanup();
145 printk(KERN_ERR "kAFS: failed to register: %d\n", ret); 152 printk(KERN_ERR "kAFS: failed to register: %d\n", ret);
146 return ret; 153 return ret;
147} /* end afs_init() */ 154}
148 155
149/* XXX late_initcall is kludgy, but the only alternative seems to create 156/* XXX late_initcall is kludgy, but the only alternative seems to create
150 * a transport upon the first mount, which is worse. Or is it? 157 * a transport upon the first mount, which is worse. Or is it?
151 */ 158 */
152late_initcall(afs_init); /* must be called after net/ to create socket */ 159late_initcall(afs_init); /* must be called after net/ to create socket */
153/*****************************************************************************/ 160
154/* 161/*
155 * clean up on module removal 162 * clean up on module removal
156 */ 163 */
@@ -159,127 +166,16 @@ static void __exit afs_exit(void)
159 printk(KERN_INFO "kAFS: Red Hat AFS client v0.1 unregistering.\n"); 166 printk(KERN_INFO "kAFS: Red Hat AFS client v0.1 unregistering.\n");
160 167
161 afs_fs_exit(); 168 afs_fs_exit();
162 rxrpc_put_transport(afs_transport); 169 afs_close_socket();
163 afs_kafstimod_stop(); 170 afs_purge_servers();
164 afs_kafsasyncd_stop(); 171 afs_callback_update_kill();
172 afs_vlocation_purge();
173 flush_scheduled_work();
165 afs_cell_purge(); 174 afs_cell_purge();
166#ifdef CONFIG_KEYS_TURNED_OFF
167 afs_key_unregister();
168#endif
169#ifdef AFS_CACHING_SUPPORT 175#ifdef AFS_CACHING_SUPPORT
170 cachefs_unregister_netfs(&afs_cache_netfs); 176 cachefs_unregister_netfs(&afs_cache_netfs);
171#endif 177#endif
172 afs_proc_cleanup(); 178 afs_proc_cleanup();
173
174} /* end afs_exit() */
175
176module_exit(afs_exit);
177
178/*****************************************************************************/
179/*
180 * notification that new peer record is being added
181 * - called from krxsecd
182 * - return an error to induce an abort
183 * - mustn't sleep (caller holds an rwlock)
184 */
185static int afs_adding_peer(struct rxrpc_peer *peer)
186{
187 struct afs_server *server;
188 int ret;
189
190 _debug("kAFS: Adding new peer %08x\n", ntohl(peer->addr.s_addr));
191
192 /* determine which server the peer resides in (if any) */
193 ret = afs_server_find_by_peer(peer, &server);
194 if (ret < 0)
195 return ret; /* none that we recognise, so abort */
196
197 _debug("Server %p{u=%d}\n", server, atomic_read(&server->usage));
198
199 _debug("Cell %p{u=%d}\n",
200 server->cell, atomic_read(&server->cell->usage));
201
202 /* cross-point the structs under a global lock */
203 spin_lock(&afs_server_peer_lock);
204 peer->user = server;
205 server->peer = peer;
206 spin_unlock(&afs_server_peer_lock);
207
208 afs_put_server(server);
209
210 return 0;
211} /* end afs_adding_peer() */
212
213/*****************************************************************************/
214/*
215 * notification that a peer record is being discarded
216 * - called from krxiod or krxsecd
217 */
218static void afs_discarding_peer(struct rxrpc_peer *peer)
219{
220 struct afs_server *server;
221
222 _enter("%p",peer);
223
224 _debug("Discarding peer %08x (rtt=%lu.%lumS)\n",
225 ntohl(peer->addr.s_addr),
226 (long) (peer->rtt / 1000),
227 (long) (peer->rtt % 1000));
228
229 /* uncross-point the structs under a global lock */
230 spin_lock(&afs_server_peer_lock);
231 server = peer->user;
232 if (server) {
233 peer->user = NULL;
234 server->peer = NULL;
235 }
236 spin_unlock(&afs_server_peer_lock);
237
238 _leave("");
239
240} /* end afs_discarding_peer() */
241
242/*****************************************************************************/
243/*
244 * clear the dead space between task_struct and kernel stack
245 * - called by supplying -finstrument-functions to gcc
246 */
247#if 0
248void __cyg_profile_func_enter (void *this_fn, void *call_site)
249__attribute__((no_instrument_function));
250
251void __cyg_profile_func_enter (void *this_fn, void *call_site)
252{
253 asm volatile(" movl %%esp,%%edi \n"
254 " andl %0,%%edi \n"
255 " addl %1,%%edi \n"
256 " movl %%esp,%%ecx \n"
257 " subl %%edi,%%ecx \n"
258 " shrl $2,%%ecx \n"
259 " movl $0xedededed,%%eax \n"
260 " rep stosl \n"
261 :
262 : "i"(~(THREAD_SIZE - 1)), "i"(sizeof(struct thread_info))
263 : "eax", "ecx", "edi", "memory", "cc"
264 );
265} 179}
266 180
267void __cyg_profile_func_exit(void *this_fn, void *call_site) 181module_exit(afs_exit);
268__attribute__((no_instrument_function));
269
270void __cyg_profile_func_exit(void *this_fn, void *call_site)
271{
272 asm volatile(" movl %%esp,%%edi \n"
273 " andl %0,%%edi \n"
274 " addl %1,%%edi \n"
275 " movl %%esp,%%ecx \n"
276 " subl %%edi,%%ecx \n"
277 " shrl $2,%%ecx \n"
278 " movl $0xdadadada,%%eax \n"
279 " rep stosl \n"
280 :
281 : "i"(~(THREAD_SIZE - 1)), "i"(sizeof(struct thread_info))
282 : "eax", "ecx", "edi", "memory", "cc"
283 );
284}
285#endif
diff --git a/fs/afs/misc.c b/fs/afs/misc.c
index e4fce66d76e0..cdb9792d8161 100644
--- a/fs/afs/misc.c
+++ b/fs/afs/misc.c
@@ -1,6 +1,6 @@
1/* misc.c: miscellaneous bits 1/* miscellaneous bits
2 * 2 *
3 * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved. 3 * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com) 4 * Written by David Howells (dhowells@redhat.com)
5 * 5 *
6 * This program is free software; you can redistribute it and/or 6 * This program is free software; you can redistribute it and/or
@@ -12,19 +12,20 @@
12#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/errno.h> 14#include <linux/errno.h>
15#include "errors.h"
16#include "internal.h" 15#include "internal.h"
16#include "afs_fs.h"
17 17
18/*****************************************************************************/
19/* 18/*
20 * convert an AFS abort code to a Linux error number 19 * convert an AFS abort code to a Linux error number
21 */ 20 */
22int afs_abort_to_error(int abortcode) 21int afs_abort_to_error(u32 abort_code)
23{ 22{
24 switch (abortcode) { 23 switch (abort_code) {
24 case 13: return -EACCES;
25 case 30: return -EROFS;
25 case VSALVAGE: return -EIO; 26 case VSALVAGE: return -EIO;
26 case VNOVNODE: return -ENOENT; 27 case VNOVNODE: return -ENOENT;
27 case VNOVOL: return -ENXIO; 28 case VNOVOL: return -ENOMEDIUM;
28 case VVOLEXISTS: return -EEXIST; 29 case VVOLEXISTS: return -EEXIST;
29 case VNOSERVICE: return -EIO; 30 case VNOSERVICE: return -EIO;
30 case VOFFLINE: return -ENOENT; 31 case VOFFLINE: return -ENOENT;
@@ -33,7 +34,24 @@ int afs_abort_to_error(int abortcode)
33 case VOVERQUOTA: return -EDQUOT; 34 case VOVERQUOTA: return -EDQUOT;
34 case VBUSY: return -EBUSY; 35 case VBUSY: return -EBUSY;
35 case VMOVED: return -ENXIO; 36 case VMOVED: return -ENXIO;
36 default: return -EIO; 37 case 0x2f6df0c: return -EACCES;
38 case 0x2f6df0f: return -EBUSY;
39 case 0x2f6df10: return -EEXIST;
40 case 0x2f6df11: return -EXDEV;
41 case 0x2f6df13: return -ENOTDIR;
42 case 0x2f6df14: return -EISDIR;
43 case 0x2f6df15: return -EINVAL;
44 case 0x2f6df1a: return -EFBIG;
45 case 0x2f6df1b: return -ENOSPC;
46 case 0x2f6df1d: return -EROFS;
47 case 0x2f6df1e: return -EMLINK;
48 case 0x2f6df20: return -EDOM;
49 case 0x2f6df21: return -ERANGE;
50 case 0x2f6df22: return -EDEADLK;
51 case 0x2f6df23: return -ENAMETOOLONG;
52 case 0x2f6df24: return -ENOLCK;
53 case 0x2f6df26: return -ENOTEMPTY;
54 case 0x2f6df78: return -EDQUOT;
55 default: return -EREMOTEIO;
37 } 56 }
38 57}
39} /* end afs_abort_to_error() */
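afs_abort_to_error() is the single choke point for turning wire abort codes into errnos; the 0x2f6dfxx entries appear to be an extended abort range encoding Unix-style errors. A caller in the RxRPC glue would funnel a received abort straight through it (field names assumed for illustration):

	/* e.g. a VNOVOL abort becomes -ENOMEDIUM, which volume.c
	 * counts in afs_volume.rjservers */
	call->error = afs_abort_to_error(call->abort_code);
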
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index 68495f0de7b3..034fcfd4e330 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -1,4 +1,4 @@
1/* mntpt.c: mountpoint management 1/* mountpoint management
2 * 2 *
3 * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved. 3 * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com) 4 * Written by David Howells (dhowells@redhat.com)
@@ -18,10 +18,6 @@
18#include <linux/mount.h> 18#include <linux/mount.h>
19#include <linux/namei.h> 19#include <linux/namei.h>
20#include <linux/mnt_namespace.h> 20#include <linux/mnt_namespace.h>
21#include "super.h"
22#include "cell.h"
23#include "volume.h"
24#include "vnode.h"
25#include "internal.h" 21#include "internal.h"
26 22
27 23
@@ -30,6 +26,7 @@ static struct dentry *afs_mntpt_lookup(struct inode *dir,
30 struct nameidata *nd); 26 struct nameidata *nd);
31static int afs_mntpt_open(struct inode *inode, struct file *file); 27static int afs_mntpt_open(struct inode *inode, struct file *file);
32static void *afs_mntpt_follow_link(struct dentry *dentry, struct nameidata *nd); 28static void *afs_mntpt_follow_link(struct dentry *dentry, struct nameidata *nd);
29static void afs_mntpt_expiry_timed_out(struct work_struct *work);
33 30
34const struct file_operations afs_mntpt_file_operations = { 31const struct file_operations afs_mntpt_file_operations = {
35 .open = afs_mntpt_open, 32 .open = afs_mntpt_open,
@@ -43,24 +40,19 @@ const struct inode_operations afs_mntpt_inode_operations = {
43}; 40};
44 41
45static LIST_HEAD(afs_vfsmounts); 42static LIST_HEAD(afs_vfsmounts);
43static DECLARE_DELAYED_WORK(afs_mntpt_expiry_timer, afs_mntpt_expiry_timed_out);
46 44
47static void afs_mntpt_expiry_timed_out(struct afs_timer *timer); 45unsigned long afs_mntpt_expiry_timeout = 10 * 60;
48 46
49struct afs_timer_ops afs_mntpt_expiry_timer_ops = {
50 .timed_out = afs_mntpt_expiry_timed_out,
51};
52
53struct afs_timer afs_mntpt_expiry_timer;
54
55unsigned long afs_mntpt_expiry_timeout = 20;
56
57/*****************************************************************************/
58/* 47/*
  * check a symbolic link to see whether it actually encodes a mountpoint
  * - sets the AFS_VNODE_MOUNTPOINT flag on the vnode appropriately
  */
-int afs_mntpt_check_symlink(struct afs_vnode *vnode)
+int afs_mntpt_check_symlink(struct afs_vnode *vnode, struct key *key)
 {
+        struct file file = {
+                .private_data = key,
+        };
         struct page *page;
         size_t size;
         char *buf;
@@ -69,23 +61,21 @@ int afs_mntpt_check_symlink(struct afs_vnode *vnode)
         _enter("{%u,%u}", vnode->fid.vnode, vnode->fid.unique);
 
         /* read the contents of the symlink into the pagecache */
-        page = read_mapping_page(AFS_VNODE_TO_I(vnode)->i_mapping, 0, NULL);
+        page = read_mapping_page(AFS_VNODE_TO_I(vnode)->i_mapping, 0, &file);
         if (IS_ERR(page)) {
                 ret = PTR_ERR(page);
                 goto out;
         }
 
         ret = -EIO;
-        wait_on_page_locked(page);
-        buf = kmap(page);
-        if (!PageUptodate(page))
-                goto out_free;
         if (PageError(page))
                 goto out_free;
 
+        buf = kmap(page);
+
         /* examine the symlink's contents */
         size = vnode->status.size;
-        _debug("symlink to %*.*s", size, (int) size, buf);
+        _debug("symlink to %*.*s", (int) size, (int) size, buf);
 
         if (size > 2 &&
             (buf[0] == '%' || buf[0] == '#') &&
@@ -93,22 +83,20 @@ int afs_mntpt_check_symlink(struct afs_vnode *vnode)
             ) {
                 _debug("symlink is a mountpoint");
                 spin_lock(&vnode->lock);
-                vnode->flags |= AFS_VNODE_MOUNTPOINT;
+                set_bit(AFS_VNODE_MOUNTPOINT, &vnode->flags);
                 spin_unlock(&vnode->lock);
         }
 
         ret = 0;
 
- out_free:
         kunmap(page);
+out_free:
         page_cache_release(page);
- out:
+out:
         _leave(" = %d", ret);
         return ret;
+}
 
-} /* end afs_mntpt_check_symlink() */
-
-/*****************************************************************************/
 /*
  * no valid lookup procedure on this sort of dir
  */
@@ -116,7 +104,7 @@ static struct dentry *afs_mntpt_lookup(struct inode *dir,
                                        struct dentry *dentry,
                                        struct nameidata *nd)
 {
-        kenter("%p,%p{%p{%s},%s}",
+        _enter("%p,%p{%p{%s},%s}",
                dir,
                dentry,
                dentry->d_parent,
@@ -125,15 +113,14 @@ static struct dentry *afs_mntpt_lookup(struct inode *dir,
                dentry->d_name.name);
 
         return ERR_PTR(-EREMOTE);
-} /* end afs_mntpt_lookup() */
+}
 
-/*****************************************************************************/
 /*
  * no valid open procedure on this sort of dir
  */
 static int afs_mntpt_open(struct inode *inode, struct file *file)
 {
-        kenter("%p,%p{%p{%s},%s}",
+        _enter("%p,%p{%p{%s},%s}",
                inode, file,
                file->f_path.dentry->d_parent,
                file->f_path.dentry->d_parent ?
@@ -142,9 +129,8 @@ static int afs_mntpt_open(struct inode *inode, struct file *file)
                file->f_path.dentry->d_name.name);
 
         return -EREMOTE;
-} /* end afs_mntpt_open() */
+}
 
-/*****************************************************************************/
 /*
  * create a vfsmount to be automounted
  */
@@ -157,7 +143,7 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
         char *buf, *devname = NULL, *options = NULL;
         int ret;
 
-        kenter("{%s}", mntpt->d_name.name);
+        _enter("{%s}", mntpt->d_name.name);
 
         BUG_ON(!mntpt->d_inode);
 
@@ -183,8 +169,7 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
         }
 
         ret = -EIO;
-        wait_on_page_locked(page);
-        if (!PageUptodate(page) || PageError(page))
+        if (PageError(page))
                 goto error;
 
         buf = kmap(page);
@@ -201,79 +186,108 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
         strcat(options, ",rwpath");
 
         /* try and do the mount */
-        kdebug("--- attempting mount %s -o %s ---", devname, options);
+        _debug("--- attempting mount %s -o %s ---", devname, options);
         mnt = vfs_kern_mount(&afs_fs_type, 0, devname, options);
-        kdebug("--- mount result %p ---", mnt);
+        _debug("--- mount result %p ---", mnt);
 
         free_page((unsigned long) devname);
         free_page((unsigned long) options);
-        kleave(" = %p", mnt);
+        _leave(" = %p", mnt);
         return mnt;
 
- error:
+error:
         if (page)
                 page_cache_release(page);
         if (devname)
                 free_page((unsigned long) devname);
         if (options)
                 free_page((unsigned long) options);
-        kleave(" = %d", ret);
+        _leave(" = %d", ret);
         return ERR_PTR(ret);
-} /* end afs_mntpt_do_automount() */
+}
 
-/*****************************************************************************/
 /*
  * follow a link from a mountpoint directory, thus causing it to be mounted
  */
 static void *afs_mntpt_follow_link(struct dentry *dentry, struct nameidata *nd)
 {
         struct vfsmount *newmnt;
-        struct dentry *old_dentry;
         int err;
 
-        kenter("%p{%s},{%s:%p{%s}}",
+        _enter("%p{%s},{%s:%p{%s},}",
                dentry,
                dentry->d_name.name,
                nd->mnt->mnt_devname,
                dentry,
                nd->dentry->d_name.name);
 
-        newmnt = afs_mntpt_do_automount(dentry);
+        dput(nd->dentry);
+        nd->dentry = dget(dentry);
+
+        newmnt = afs_mntpt_do_automount(nd->dentry);
         if (IS_ERR(newmnt)) {
                 path_release(nd);
                 return (void *)newmnt;
         }
 
-        old_dentry = nd->dentry;
-        nd->dentry = dentry;
-        err = do_add_mount(newmnt, nd, 0, &afs_vfsmounts);
-        nd->dentry = old_dentry;
-
-        path_release(nd);
-
-        if (!err) {
-                mntget(newmnt);
+        mntget(newmnt);
+        err = do_add_mount(newmnt, nd, MNT_SHRINKABLE, &afs_vfsmounts);
+        switch (err) {
+        case 0:
+                mntput(nd->mnt);
+                dput(nd->dentry);
                 nd->mnt = newmnt;
-                dget(newmnt->mnt_root);
-                nd->dentry = newmnt->mnt_root;
+                nd->dentry = dget(newmnt->mnt_root);
+                schedule_delayed_work(&afs_mntpt_expiry_timer,
+                                      afs_mntpt_expiry_timeout * HZ);
+                break;
+        case -EBUSY:
+                /* someone else made a mount here whilst we were busy */
+                while (d_mountpoint(nd->dentry) &&
+                       follow_down(&nd->mnt, &nd->dentry))
+                        ;
+                err = 0;
+        default:
+                mntput(newmnt);
+                break;
         }
 
-        kleave(" = %d", err);
+        _leave(" = %d", err);
         return ERR_PTR(err);
-} /* end afs_mntpt_follow_link() */
+}
 
-/*****************************************************************************/
 /*
  * handle mountpoint expiry timer going off
  */
-static void afs_mntpt_expiry_timed_out(struct afs_timer *timer)
+static void afs_mntpt_expiry_timed_out(struct work_struct *work)
 {
-        kenter("");
+        _enter("");
 
-        mark_mounts_for_expiry(&afs_vfsmounts);
+        if (!list_empty(&afs_vfsmounts)) {
+                mark_mounts_for_expiry(&afs_vfsmounts);
+                schedule_delayed_work(&afs_mntpt_expiry_timer,
+                                      afs_mntpt_expiry_timeout * HZ);
+        }
 
-        afs_kafstimod_add_timer(&afs_mntpt_expiry_timer,
-                                afs_mntpt_expiry_timeout * HZ);
+        _leave("");
+}
 
-        kleave("");
-} /* end afs_mntpt_expiry_timed_out() */
+/*
+ * kill the AFS mountpoint timer if it's still running
+ */
+void afs_mntpt_kill_timer(void)
+{
+        _enter("");
+
+        ASSERT(list_empty(&afs_vfsmounts));
+        cancel_delayed_work(&afs_mntpt_expiry_timer);
+        flush_scheduled_work();
+}
+
+/*
+ * begin unmount by attempting to remove all automounted mountpoints we added
+ */
+void afs_umount_begin(struct vfsmount *vfsmnt, int flags)
+{
+        shrink_submounts(vfsmnt, &afs_vfsmounts);
+}
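The expiry logic above moves from the old kafstimod timer to the generic delayed-work API: afs_mntpt_expiry_timed_out() re-arms itself only while afs_vfsmounts is non-empty, and afs_mntpt_kill_timer() cancels the pending work and flushes any instance already running. A minimal sketch of that self-rearming pattern, with invented example_* names and a stand-in condition (kernel-style, not tied to any particular tree):

    #include <linux/workqueue.h>

    static int example_have_mounts;         /* stand-in for !list_empty(...) */

    static void example_timed_out(struct work_struct *work);
    static DECLARE_DELAYED_WORK(example_timer, example_timed_out);

    static void example_timed_out(struct work_struct *work)
    {
            /* do the periodic scan, then re-arm only while work remains */
            if (example_have_mounts)
                    schedule_delayed_work(&example_timer, 10 * HZ);
    }

    static void example_shutdown(void)
    {
            /* stop a pending timer and wait out one that already started */
            cancel_delayed_work(&example_timer);
            flush_scheduled_work();
    }

Re-arming from inside the handler, rather than using a free-running periodic timer, guarantees that after cancel+flush no further instance can be queued.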
diff --git a/fs/afs/mount.h b/fs/afs/mount.h
deleted file mode 100644
index 9d2f46ec549f..000000000000
--- a/fs/afs/mount.h
+++ /dev/null
@@ -1,23 +0,0 @@
1/* mount.h: mount parameters
2 *
3 * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#ifndef _LINUX_AFS_MOUNT_H
13#define _LINUX_AFS_MOUNT_H
14
15struct afs_mountdata {
16 const char *volume; /* name of volume */
17 const char *cell; /* name of cell containing volume */
18 const char *cache; /* name of cache block device */
19 size_t nservers; /* number of server addresses listed */
20 uint32_t servers[10]; /* IP addresses of servers in this cell */
21};
22
23#endif /* _LINUX_AFS_MOUNT_H */
diff --git a/fs/afs/netdevices.c b/fs/afs/netdevices.c
new file mode 100644
index 000000000000..fc27d4b52e5f
--- /dev/null
+++ b/fs/afs/netdevices.c
@@ -0,0 +1,68 @@
1/* AFS network device helpers
2 *
3 * Copyright (c) 2007 Patrick McHardy <kaber@trash.net>
4 */
5
6#include <linux/string.h>
7#include <linux/rtnetlink.h>
8#include <linux/inetdevice.h>
9#include <linux/netdevice.h>
10#include <linux/if_arp.h>
11#include "internal.h"
12
13/*
14 * get a MAC address from a random ethernet interface that has a real one
15 * - the buffer will normally be 6 bytes in size
16 */
17int afs_get_MAC_address(u8 *mac, size_t maclen)
18{
19 struct net_device *dev;
20 int ret = -ENODEV;
21
22 if (maclen != ETH_ALEN)
23 BUG();
24
25 rtnl_lock();
26 dev = __dev_getfirstbyhwtype(ARPHRD_ETHER);
27 if (dev) {
28 memcpy(mac, dev->dev_addr, maclen);
29 ret = 0;
30 }
31 rtnl_unlock();
32 return ret;
33}
34
35/*
36 * get a list of this system's interface IPv4 addresses, netmasks and MTUs
37 * - maxbufs must be at least 1
38 * - returns the number of interface records in the buffer
39 */
40int afs_get_ipv4_interfaces(struct afs_interface *bufs, size_t maxbufs,
41 bool wantloopback)
42{
43 struct net_device *dev;
44 struct in_device *idev;
45 int n = 0;
46
47 ASSERT(maxbufs > 0);
48
49 rtnl_lock();
50 for_each_netdev(dev) {
51 if (dev->type == ARPHRD_LOOPBACK && !wantloopback)
52 continue;
53 idev = __in_dev_get_rtnl(dev);
54 if (!idev)
55 continue;
56 for_primary_ifa(idev) {
57 bufs[n].address.s_addr = ifa->ifa_address;
58 bufs[n].netmask.s_addr = ifa->ifa_mask;
59 bufs[n].mtu = dev->mtu;
60 n++;
61 if (n >= maxbufs)
62 goto out;
63 } endfor_ifa(idev);
64 }
65out:
66 rtnl_unlock();
67 return n;
68}
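A caller of afs_get_ipv4_interfaces() supplies the buffer and treats the return value as the record count. A sketch of a plausible consumer (the buffer size and the debug line are illustrative, not taken from the source):

    struct afs_interface bufs[32];
    int loop, n;

    n = afs_get_ipv4_interfaces(bufs, ARRAY_SIZE(bufs), false);
    for (loop = 0; loop < n; loop++)
            _debug("iface %d: addr %08x mask %08x mtu %u",
                   loop,
                   ntohl(bufs[loop].address.s_addr),
                   ntohl(bufs[loop].netmask.s_addr),
                   bufs[loop].mtu);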
diff --git a/fs/afs/proc.c b/fs/afs/proc.c
index ae6b85b1e484..d5601f617cdb 100644
--- a/fs/afs/proc.c
+++ b/fs/afs/proc.c
@@ -1,4 +1,4 @@
-/* proc.c: /proc interface for AFS
+/* /proc interface for AFS
  *
  * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
@@ -13,8 +13,6 @@
 #include <linux/module.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
-#include "cell.h"
-#include "volume.h"
 #include <asm/uaccess.h>
 #include "internal.h"
 
@@ -130,7 +128,6 @@ static const struct file_operations afs_proc_cell_servers_fops = {
         .release = afs_proc_cell_servers_release,
 };
 
-/*****************************************************************************/
 /*
  * initialise the /proc/fs/afs/ directory
  */
@@ -142,47 +139,43 @@ int afs_proc_init(void)
 
         proc_afs = proc_mkdir("fs/afs", NULL);
         if (!proc_afs)
-                goto error;
+                goto error_dir;
         proc_afs->owner = THIS_MODULE;
 
         p = create_proc_entry("cells", 0, proc_afs);
         if (!p)
-                goto error_proc;
+                goto error_cells;
         p->proc_fops = &afs_proc_cells_fops;
         p->owner = THIS_MODULE;
 
         p = create_proc_entry("rootcell", 0, proc_afs);
         if (!p)
-                goto error_cells;
+                goto error_rootcell;
         p->proc_fops = &afs_proc_rootcell_fops;
         p->owner = THIS_MODULE;
 
         _leave(" = 0");
         return 0;
 
- error_cells:
+error_rootcell:
         remove_proc_entry("cells", proc_afs);
- error_proc:
+error_cells:
         remove_proc_entry("fs/afs", NULL);
- error:
+error_dir:
         _leave(" = -ENOMEM");
         return -ENOMEM;
+}
 
-} /* end afs_proc_init() */
-
-/*****************************************************************************/
 /*
  * clean up the /proc/fs/afs/ directory
  */
 void afs_proc_cleanup(void)
 {
+        remove_proc_entry("rootcell", proc_afs);
         remove_proc_entry("cells", proc_afs);
-
         remove_proc_entry("fs/afs", NULL);
+}
 
-} /* end afs_proc_cleanup() */
-
-/*****************************************************************************/
 /*
  * open "/proc/fs/afs/cells" which provides a summary of extant cells
  */
@@ -199,9 +192,8 @@ static int afs_proc_cells_open(struct inode *inode, struct file *file)
         m->private = PDE(inode)->data;
 
         return 0;
-} /* end afs_proc_cells_open() */
+}
 
-/*****************************************************************************/
 /*
  * set up the iterator to start reading from the cells list and return the
  * first item
@@ -225,9 +217,8 @@ static void *afs_proc_cells_start(struct seq_file *m, loff_t *_pos)
                 break;
 
         return _p != &afs_proc_cells ? _p : NULL;
-} /* end afs_proc_cells_start() */
+}
 
-/*****************************************************************************/
 /*
  * move to next cell in cells list
  */
@@ -241,19 +232,16 @@ static void *afs_proc_cells_next(struct seq_file *p, void *v, loff_t *pos)
         _p = v == (void *) 1 ? afs_proc_cells.next : _p->next;
 
         return _p != &afs_proc_cells ? _p : NULL;
-} /* end afs_proc_cells_next() */
+}
 
-/*****************************************************************************/
 /*
  * clean up after reading from the cells list
  */
 static void afs_proc_cells_stop(struct seq_file *p, void *v)
 {
         up_read(&afs_proc_cells_sem);
+}
 
-} /* end afs_proc_cells_stop() */
-
-/*****************************************************************************/
 /*
  * display a header line followed by a load of cell lines
  */
@@ -261,19 +249,18 @@ static int afs_proc_cells_show(struct seq_file *m, void *v)
 {
         struct afs_cell *cell = list_entry(v, struct afs_cell, proc_link);
 
-        /* display header on line 1 */
         if (v == (void *) 1) {
+                /* display header on line 1 */
                 seq_puts(m, "USE NAME\n");
                 return 0;
         }
 
         /* display one cell per line on subsequent lines */
-        seq_printf(m, "%3d %s\n", atomic_read(&cell->usage), cell->name);
-
+        seq_printf(m, "%3d %s\n",
                    atomic_read(&cell->usage), cell->name);
         return 0;
-} /* end afs_proc_cells_show() */
+}
 
-/*****************************************************************************/
 /*
  * handle writes to /proc/fs/afs/cells
  * - to add cells: echo "add <cellname> <IP>[:<IP>][:<IP>]"
@@ -326,30 +313,32 @@ static ssize_t afs_proc_cells_write(struct file *file, const char __user *buf,
 
         if (strcmp(kbuf, "add") == 0) {
                 struct afs_cell *cell;
-                ret = afs_cell_create(name, args, &cell);
-                if (ret < 0)
+
+                cell = afs_cell_create(name, args);
+                if (IS_ERR(cell)) {
+                        ret = PTR_ERR(cell);
                         goto done;
+                }
 
+                afs_put_cell(cell);
                 printk("kAFS: Added new cell '%s'\n", name);
-        }
-        else {
+        } else {
                 goto inval;
         }
 
         ret = size;
 
- done:
+done:
         kfree(kbuf);
         _leave(" = %d", ret);
         return ret;
 
- inval:
+inval:
         ret = -EINVAL;
         printk("kAFS: Invalid Command on /proc/fs/afs/cells file\n");
         goto done;
-} /* end afs_proc_cells_write() */
+}
 
-/*****************************************************************************/
 /*
  * Stubs for /proc/fs/afs/rootcell
  */
@@ -369,7 +358,6 @@ static ssize_t afs_proc_rootcell_read(struct file *file, char __user *buf,
         return 0;
 }
 
-/*****************************************************************************/
 /*
  * handle writes to /proc/fs/afs/rootcell
  * - to initialize rootcell: echo "cell.name:192.168.231.14"
@@ -407,14 +395,13 @@ static ssize_t afs_proc_rootcell_write(struct file *file,
         if (ret >= 0)
                 ret = size;     /* consume everything, always */
 
- infault:
+infault:
         kfree(kbuf);
- nomem:
+nomem:
         _leave(" = %d", ret);
         return ret;
-} /* end afs_proc_rootcell_write() */
+}
 
-/*****************************************************************************/
 /*
  * initialise /proc/fs/afs/<cell>/
  */
@@ -426,25 +413,25 @@ int afs_proc_cell_setup(struct afs_cell *cell)
 
         cell->proc_dir = proc_mkdir(cell->name, proc_afs);
         if (!cell->proc_dir)
-                return -ENOMEM;
+                goto error_dir;
 
         p = create_proc_entry("servers", 0, cell->proc_dir);
         if (!p)
-                goto error_proc;
+                goto error_servers;
         p->proc_fops = &afs_proc_cell_servers_fops;
         p->owner = THIS_MODULE;
         p->data = cell;
 
         p = create_proc_entry("vlservers", 0, cell->proc_dir);
         if (!p)
-                goto error_servers;
+                goto error_vlservers;
         p->proc_fops = &afs_proc_cell_vlservers_fops;
         p->owner = THIS_MODULE;
         p->data = cell;
 
         p = create_proc_entry("volumes", 0, cell->proc_dir);
         if (!p)
-                goto error_vlservers;
+                goto error_volumes;
         p->proc_fops = &afs_proc_cell_volumes_fops;
         p->owner = THIS_MODULE;
         p->data = cell;
@@ -452,17 +439,17 @@ int afs_proc_cell_setup(struct afs_cell *cell)
         _leave(" = 0");
         return 0;
 
- error_vlservers:
+error_volumes:
         remove_proc_entry("vlservers", cell->proc_dir);
- error_servers:
+error_vlservers:
         remove_proc_entry("servers", cell->proc_dir);
- error_proc:
+error_servers:
         remove_proc_entry(cell->name, proc_afs);
+error_dir:
         _leave(" = -ENOMEM");
         return -ENOMEM;
-} /* end afs_proc_cell_setup() */
+}
 
-/*****************************************************************************/
 /*
  * remove /proc/fs/afs/<cell>/
  */
@@ -476,9 +463,8 @@ void afs_proc_cell_remove(struct afs_cell *cell)
         remove_proc_entry(cell->name, proc_afs);
 
         _leave("");
-} /* end afs_proc_cell_remove() */
+}
 
-/*****************************************************************************/
 /*
  * open "/proc/fs/afs/<cell>/volumes" which provides a summary of extant cells
  */
@@ -488,7 +474,7 @@ static int afs_proc_cell_volumes_open(struct inode *inode, struct file *file)
         struct seq_file *m;
         int ret;
 
-        cell = afs_get_cell_maybe((struct afs_cell **) &PDE(inode)->data);
+        cell = PDE(inode)->data;
         if (!cell)
                 return -ENOENT;
 
@@ -500,25 +486,16 @@ static int afs_proc_cell_volumes_open(struct inode *inode, struct file *file)
         m->private = cell;
 
         return 0;
-} /* end afs_proc_cell_volumes_open() */
+}
 
-/*****************************************************************************/
 /*
  * close the file and release the ref to the cell
  */
 static int afs_proc_cell_volumes_release(struct inode *inode, struct file *file)
 {
-        struct afs_cell *cell = PDE(inode)->data;
-        int ret;
-
-        ret = seq_release(inode,file);
-
-        afs_put_cell(cell);
-
-        return ret;
-} /* end afs_proc_cell_volumes_release() */
+        return seq_release(inode, file);
+}
 
-/*****************************************************************************/
 /*
  * set up the iterator to start reading from the cells list and return the
  * first item
@@ -545,9 +522,8 @@ static void *afs_proc_cell_volumes_start(struct seq_file *m, loff_t *_pos)
                 break;
 
         return _p != &cell->vl_list ? _p : NULL;
-} /* end afs_proc_cell_volumes_start() */
+}
 
-/*****************************************************************************/
 /*
  * move to next cell in cells list
  */
@@ -562,12 +538,11 @@ static void *afs_proc_cell_volumes_next(struct seq_file *p, void *v,
         (*_pos)++;
 
         _p = v;
-        _p = v == (void *) 1 ? cell->vl_list.next : _p->next;
+        _p = (v == (void *) 1) ? cell->vl_list.next : _p->next;
 
-        return _p != &cell->vl_list ? _p : NULL;
-} /* end afs_proc_cell_volumes_next() */
+        return (_p != &cell->vl_list) ? _p : NULL;
+}
 
-/*****************************************************************************/
 /*
  * clean up after reading from the cells list
  */
@@ -576,10 +551,18 @@ static void afs_proc_cell_volumes_stop(struct seq_file *p, void *v)
         struct afs_cell *cell = p->private;
 
         up_read(&cell->vl_sem);
+}
 
-} /* end afs_proc_cell_volumes_stop() */
+const char afs_vlocation_states[][4] = {
+        [AFS_VL_NEW]            = "New",
+        [AFS_VL_CREATING]       = "Crt",
+        [AFS_VL_VALID]          = "Val",
+        [AFS_VL_NO_VOLUME]      = "NoV",
+        [AFS_VL_UPDATING]       = "Upd",
+        [AFS_VL_VOLUME_DELETED] = "Del",
+        [AFS_VL_UNCERTAIN]      = "Unc",
+};
 
-/*****************************************************************************/
 /*
  * display a header line followed by a load of volume lines
  */
@@ -590,23 +573,22 @@ static int afs_proc_cell_volumes_show(struct seq_file *m, void *v)
 
         /* display header on line 1 */
         if (v == (void *) 1) {
-                seq_puts(m, "USE VLID[0] VLID[1] VLID[2] NAME\n");
+                seq_puts(m, "USE STT VLID[0] VLID[1] VLID[2] NAME\n");
                 return 0;
         }
 
         /* display one cell per line on subsequent lines */
-        seq_printf(m, "%3d %08x %08x %08x %s\n",
+        seq_printf(m, "%3d %s %08x %08x %08x %s\n",
                    atomic_read(&vlocation->usage),
+                   afs_vlocation_states[vlocation->state],
                    vlocation->vldb.vid[0],
                    vlocation->vldb.vid[1],
                    vlocation->vldb.vid[2],
-                   vlocation->vldb.name
-                   );
+                   vlocation->vldb.name);
 
         return 0;
-} /* end afs_proc_cell_volumes_show() */
+}
 
-/*****************************************************************************/
 /*
  * open "/proc/fs/afs/<cell>/vlservers" which provides a list of volume
  * location server
@@ -617,11 +599,11 @@ static int afs_proc_cell_vlservers_open(struct inode *inode, struct file *file)
         struct seq_file *m;
         int ret;
 
-        cell = afs_get_cell_maybe((struct afs_cell**)&PDE(inode)->data);
+        cell = PDE(inode)->data;
         if (!cell)
                 return -ENOENT;
 
-        ret = seq_open(file,&afs_proc_cell_vlservers_ops);
+        ret = seq_open(file, &afs_proc_cell_vlservers_ops);
         if (ret<0)
                 return ret;
 
@@ -629,26 +611,17 @@ static int afs_proc_cell_vlservers_open(struct inode *inode, struct file *file)
         m->private = cell;
 
         return 0;
-} /* end afs_proc_cell_vlservers_open() */
+}
 
-/*****************************************************************************/
 /*
  * close the file and release the ref to the cell
  */
 static int afs_proc_cell_vlservers_release(struct inode *inode,
                                            struct file *file)
 {
-        struct afs_cell *cell = PDE(inode)->data;
-        int ret;
-
-        ret = seq_release(inode,file);
-
-        afs_put_cell(cell);
-
-        return ret;
-} /* end afs_proc_cell_vlservers_release() */
+        return seq_release(inode, file);
+}
 
-/*****************************************************************************/
 /*
  * set up the iterator to start reading from the cells list and return the
  * first item
@@ -672,9 +645,8 @@ static void *afs_proc_cell_vlservers_start(struct seq_file *m, loff_t *_pos)
                 return NULL;
 
         return &cell->vl_addrs[pos];
-} /* end afs_proc_cell_vlservers_start() */
+}
 
-/*****************************************************************************/
 /*
  * move to next cell in cells list
  */
@@ -692,9 +664,8 @@ static void *afs_proc_cell_vlservers_next(struct seq_file *p, void *v,
                 return NULL;
 
         return &cell->vl_addrs[pos];
-} /* end afs_proc_cell_vlservers_next() */
+}
 
-/*****************************************************************************/
 /*
  * clean up after reading from the cells list
  */
@@ -703,10 +674,8 @@ static void afs_proc_cell_vlservers_stop(struct seq_file *p, void *v)
         struct afs_cell *cell = p->private;
 
         up_read(&cell->vl_sem);
+}
 
-} /* end afs_proc_cell_vlservers_stop() */
-
-/*****************************************************************************/
 /*
  * display a header line followed by a load of volume lines
  */
@@ -722,11 +691,9 @@ static int afs_proc_cell_vlservers_show(struct seq_file *m, void *v)
 
         /* display one cell per line on subsequent lines */
         seq_printf(m, "%u.%u.%u.%u\n", NIPQUAD(addr->s_addr));
-
         return 0;
-} /* end afs_proc_cell_vlservers_show() */
+}
 
-/*****************************************************************************/
 /*
  * open "/proc/fs/afs/<cell>/servers" which provides a summary of active
  * servers
@@ -737,7 +704,7 @@ static int afs_proc_cell_servers_open(struct inode *inode, struct file *file)
         struct seq_file *m;
         int ret;
 
-        cell = afs_get_cell_maybe((struct afs_cell **) &PDE(inode)->data);
+        cell = PDE(inode)->data;
         if (!cell)
                 return -ENOENT;
 
@@ -747,34 +714,24 @@ static int afs_proc_cell_servers_open(struct inode *inode, struct file *file)
 
         m = file->private_data;
         m->private = cell;
-
         return 0;
-} /* end afs_proc_cell_servers_open() */
+}
 
-/*****************************************************************************/
 /*
  * close the file and release the ref to the cell
  */
 static int afs_proc_cell_servers_release(struct inode *inode,
                                          struct file *file)
 {
-        struct afs_cell *cell = PDE(inode)->data;
-        int ret;
-
-        ret = seq_release(inode, file);
-
-        afs_put_cell(cell);
-
-        return ret;
-} /* end afs_proc_cell_servers_release() */
+        return seq_release(inode, file);
+}
 
-/*****************************************************************************/
 /*
  * set up the iterator to start reading from the cells list and return the
  * first item
  */
 static void *afs_proc_cell_servers_start(struct seq_file *m, loff_t *_pos)
-        __acquires(m->private->sv_lock)
+        __acquires(m->private->servers_lock)
 {
         struct list_head *_p;
         struct afs_cell *cell = m->private;
@@ -783,7 +740,7 @@ static void *afs_proc_cell_servers_start(struct seq_file *m, loff_t *_pos)
         _enter("cell=%p pos=%Ld", cell, *_pos);
 
         /* lock the list against modification */
-        read_lock(&cell->sv_lock);
+        read_lock(&cell->servers_lock);
 
         /* allow for the header line */
         if (!pos)
@@ -791,14 +748,13 @@ static void *afs_proc_cell_servers_start(struct seq_file *m, loff_t *_pos)
         pos--;
 
         /* find the n'th element in the list */
-        list_for_each(_p, &cell->sv_list)
+        list_for_each(_p, &cell->servers)
                 if (!pos--)
                         break;
 
-        return _p != &cell->sv_list ? _p : NULL;
-} /* end afs_proc_cell_servers_start() */
+        return _p != &cell->servers ? _p : NULL;
+}
 
-/*****************************************************************************/
 /*
  * move to next cell in cells list
  */
@@ -813,25 +769,22 @@ static void *afs_proc_cell_servers_next(struct seq_file *p, void *v,
         (*_pos)++;
 
         _p = v;
-        _p = v == (void *) 1 ? cell->sv_list.next : _p->next;
+        _p = v == (void *) 1 ? cell->servers.next : _p->next;
 
-        return _p != &cell->sv_list ? _p : NULL;
-} /* end afs_proc_cell_servers_next() */
+        return _p != &cell->servers ? _p : NULL;
+}
 
-/*****************************************************************************/
 /*
  * clean up after reading from the cells list
  */
 static void afs_proc_cell_servers_stop(struct seq_file *p, void *v)
-        __releases(p->private->sv_lock)
+        __releases(p->private->servers_lock)
 {
         struct afs_cell *cell = p->private;
 
-        read_unlock(&cell->sv_lock);
-
-} /* end afs_proc_cell_servers_stop() */
+        read_unlock(&cell->servers_lock);
+}
 
-/*****************************************************************************/
 /*
  * display a header line followed by a load of volume lines
  */
@@ -849,10 +802,7 @@ static int afs_proc_cell_servers_show(struct seq_file *m, void *v)
         /* display one cell per line on subsequent lines */
         sprintf(ipaddr, "%u.%u.%u.%u", NIPQUAD(server->addr));
         seq_printf(m, "%3d %-15.15s %5d\n",
-                   atomic_read(&server->usage),
-                   ipaddr,
-                   server->fs_state
-                   );
+                   atomic_read(&server->usage), ipaddr, server->fs_state);
 
         return 0;
-} /* end afs_proc_cell_servers_show() */
+}
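All of the /proc files above follow the same seq_file recipe: a seq_operations table drives iteration, open() attaches it with seq_open() and stashes the subject in seq_file::private, and the generic seq_* helpers do the rest. Wiring the cells file, for instance, comes down to something like this (the table shapes are reconstructed for illustration from the handlers shown above, not quoted from the patch):

    static const struct seq_operations afs_proc_cells_ops = {
            .start  = afs_proc_cells_start, /* take the lock, find n'th item */
            .next   = afs_proc_cells_next,  /* advance; NULL ends the walk */
            .stop   = afs_proc_cells_stop,  /* drop the lock */
            .show   = afs_proc_cells_show,  /* format one item (or header) */
    };

    static const struct file_operations afs_proc_cells_fops = {
            .open           = afs_proc_cells_open,  /* seq_open() + private */
            .read           = seq_read,
            .llseek         = seq_lseek,
            .release        = seq_release,
            .write          = afs_proc_cells_write,
    };

Because the open routines now take PDE(inode)->data directly instead of afs_get_cell_maybe(), the release routines no longer need to drop a cell reference, which is why they collapse to plain seq_release().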
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
new file mode 100644
index 000000000000..222c1a3abbb8
--- /dev/null
+++ b/fs/afs/rxrpc.c
@@ -0,0 +1,782 @@
1/* Maintain an RxRPC server socket to do AFS communications through
2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#include <net/sock.h>
13#include <net/af_rxrpc.h>
14#include <rxrpc/packet.h>
15#include "internal.h"
16#include "afs_cm.h"
17
18static struct socket *afs_socket; /* my RxRPC socket */
19static struct workqueue_struct *afs_async_calls;
20static atomic_t afs_outstanding_calls;
21static atomic_t afs_outstanding_skbs;
22
23static void afs_wake_up_call_waiter(struct afs_call *);
24static int afs_wait_for_call_to_complete(struct afs_call *);
25static void afs_wake_up_async_call(struct afs_call *);
26static int afs_dont_wait_for_call_to_complete(struct afs_call *);
27static void afs_process_async_call(struct work_struct *);
28static void afs_rx_interceptor(struct sock *, unsigned long, struct sk_buff *);
29static int afs_deliver_cm_op_id(struct afs_call *, struct sk_buff *, bool);
30
31/* synchronous call management */
32const struct afs_wait_mode afs_sync_call = {
33 .rx_wakeup = afs_wake_up_call_waiter,
34 .wait = afs_wait_for_call_to_complete,
35};
36
37/* asynchronous call management */
38const struct afs_wait_mode afs_async_call = {
39 .rx_wakeup = afs_wake_up_async_call,
40 .wait = afs_dont_wait_for_call_to_complete,
41};
42
43/* asynchronous incoming call management */
44static const struct afs_wait_mode afs_async_incoming_call = {
45 .rx_wakeup = afs_wake_up_async_call,
46};
47
48/* asynchronous incoming call initial processing */
49static const struct afs_call_type afs_RXCMxxxx = {
50 .name = "CB.xxxx",
51 .deliver = afs_deliver_cm_op_id,
52 .abort_to_error = afs_abort_to_error,
53};
54
55static void afs_collect_incoming_call(struct work_struct *);
56
57static struct sk_buff_head afs_incoming_calls;
58static DECLARE_WORK(afs_collect_incoming_call_work, afs_collect_incoming_call);
59
60/*
61 * open an RxRPC socket and bind it to be a server for callback notifications
62 * - the socket is left in blocking mode and non-blocking ops use MSG_DONTWAIT
63 */
64int afs_open_socket(void)
65{
66 struct sockaddr_rxrpc srx;
67 struct socket *socket;
68 int ret;
69
70 _enter("");
71
72 skb_queue_head_init(&afs_incoming_calls);
73
74 afs_async_calls = create_singlethread_workqueue("kafsd");
75 if (!afs_async_calls) {
76 _leave(" = -ENOMEM [wq]");
77 return -ENOMEM;
78 }
79
80 ret = sock_create_kern(AF_RXRPC, SOCK_DGRAM, PF_INET, &socket);
81 if (ret < 0) {
82 destroy_workqueue(afs_async_calls);
83 _leave(" = %d [socket]", ret);
84 return ret;
85 }
86
87 socket->sk->sk_allocation = GFP_NOFS;
88
89 /* bind the callback manager's address to make this a server socket */
90 srx.srx_family = AF_RXRPC;
91 srx.srx_service = CM_SERVICE;
92 srx.transport_type = SOCK_DGRAM;
93 srx.transport_len = sizeof(srx.transport.sin);
94 srx.transport.sin.sin_family = AF_INET;
95 srx.transport.sin.sin_port = htons(AFS_CM_PORT);
96 memset(&srx.transport.sin.sin_addr, 0,
97 sizeof(srx.transport.sin.sin_addr));
98
99 ret = kernel_bind(socket, (struct sockaddr *) &srx, sizeof(srx));
100 if (ret < 0) {
101 sock_release(socket);
102 _leave(" = %d [bind]", ret);
103 return ret;
104 }
105
106 rxrpc_kernel_intercept_rx_messages(socket, afs_rx_interceptor);
107
108 afs_socket = socket;
109 _leave(" = 0");
110 return 0;
111}
112
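afs_open_socket() is the standard in-kernel server-socket recipe: sock_create_kern(), fill in a transport address, kernel_bind(), and sock_release() on failure. The same shape for an ordinary UDP socket, reduced to essentials (function name and port are invented; a sketch, not buildable against any particular tree):

    static int example_open_udp_socket(struct socket **_sock)
    {
            struct sockaddr_in sin;
            struct socket *sock;
            int ret;

            ret = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock);
            if (ret < 0)
                    return ret;

            memset(&sin, 0, sizeof(sin));
            sin.sin_family = AF_INET;
            sin.sin_port = htons(7001);     /* illustrative port */

            ret = kernel_bind(sock, (struct sockaddr *) &sin, sizeof(sin));
            if (ret < 0) {
                    sock_release(sock);
                    return ret;
            }

            *_sock = sock;
            return 0;
    }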
113/*
114 * close the RxRPC socket AFS was using
115 */
116void afs_close_socket(void)
117{
118 _enter("");
119
120 sock_release(afs_socket);
121
122 _debug("dework");
123 destroy_workqueue(afs_async_calls);
124
125 ASSERTCMP(atomic_read(&afs_outstanding_skbs), ==, 0);
126 ASSERTCMP(atomic_read(&afs_outstanding_calls), ==, 0);
127 _leave("");
128}
129
130/*
131 * note that the data in a socket buffer is now delivered and that the buffer
132 * should be freed
133 */
134static void afs_data_delivered(struct sk_buff *skb)
135{
136 if (!skb) {
137 _debug("DLVR NULL [%d]", atomic_read(&afs_outstanding_skbs));
138 dump_stack();
139 } else {
140 _debug("DLVR %p{%u} [%d]",
141 skb, skb->mark, atomic_read(&afs_outstanding_skbs));
142 if (atomic_dec_return(&afs_outstanding_skbs) == -1)
143 BUG();
144 rxrpc_kernel_data_delivered(skb);
145 }
146}
147
148/*
149 * free a socket buffer
150 */
151static void afs_free_skb(struct sk_buff *skb)
152{
153 if (!skb) {
154 _debug("FREE NULL [%d]", atomic_read(&afs_outstanding_skbs));
155 dump_stack();
156 } else {
157 _debug("FREE %p{%u} [%d]",
158 skb, skb->mark, atomic_read(&afs_outstanding_skbs));
159 if (atomic_dec_return(&afs_outstanding_skbs) == -1)
160 BUG();
161 rxrpc_kernel_free_skb(skb);
162 }
163}
164
165/*
166 * free a call
167 */
168static void afs_free_call(struct afs_call *call)
169{
170 _debug("DONE %p{%s} [%d]",
171 call, call->type->name, atomic_read(&afs_outstanding_calls));
172 if (atomic_dec_return(&afs_outstanding_calls) == -1)
173 BUG();
174
175 ASSERTCMP(call->rxcall, ==, NULL);
176 ASSERT(!work_pending(&call->async_work));
177 ASSERT(skb_queue_empty(&call->rx_queue));
178 ASSERT(call->type->name != NULL);
179
180 kfree(call->request);
181 kfree(call);
182}
183
184/*
185 * allocate a call with flat request and reply buffers
186 */
187struct afs_call *afs_alloc_flat_call(const struct afs_call_type *type,
188 size_t request_size, size_t reply_size)
189{
190 struct afs_call *call;
191
192 call = kzalloc(sizeof(*call), GFP_NOFS);
193 if (!call)
194 goto nomem_call;
195
196 _debug("CALL %p{%s} [%d]",
197 call, type->name, atomic_read(&afs_outstanding_calls));
198 atomic_inc(&afs_outstanding_calls);
199
200 call->type = type;
201 call->request_size = request_size;
202 call->reply_max = reply_size;
203
204 if (request_size) {
205 call->request = kmalloc(request_size, GFP_NOFS);
206 if (!call->request)
207 goto nomem_free;
208 }
209
210 if (reply_size) {
211 call->buffer = kmalloc(reply_size, GFP_NOFS);
212 if (!call->buffer)
213 goto nomem_free;
214 }
215
216 init_waitqueue_head(&call->waitq);
217 skb_queue_head_init(&call->rx_queue);
218 return call;
219
220nomem_free:
221 afs_free_call(call);
222nomem_call:
223 return NULL;
224}
225
226/*
227 * clean up a call with flat buffer
228 */
229void afs_flat_call_destructor(struct afs_call *call)
230{
231 _enter("");
232
233 kfree(call->request);
234 call->request = NULL;
235 kfree(call->buffer);
236 call->buffer = NULL;
237}
238
239/*
240 * initiate a call
241 */
242int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp,
243 const struct afs_wait_mode *wait_mode)
244{
245 struct sockaddr_rxrpc srx;
246 struct rxrpc_call *rxcall;
247 struct msghdr msg;
248 struct kvec iov[1];
249 int ret;
250
251 _enter("%x,{%d},", addr->s_addr, ntohs(call->port));
252
253 ASSERT(call->type != NULL);
254 ASSERT(call->type->name != NULL);
255
256 _debug("MAKE %p{%s} [%d]",
257 call, call->type->name, atomic_read(&afs_outstanding_calls));
258
259 call->wait_mode = wait_mode;
260 INIT_WORK(&call->async_work, afs_process_async_call);
261
262 memset(&srx, 0, sizeof(srx));
263 srx.srx_family = AF_RXRPC;
264 srx.srx_service = call->service_id;
265 srx.transport_type = SOCK_DGRAM;
266 srx.transport_len = sizeof(srx.transport.sin);
267 srx.transport.sin.sin_family = AF_INET;
268 srx.transport.sin.sin_port = call->port;
269 memcpy(&srx.transport.sin.sin_addr, addr, 4);
270
271 /* create a call */
272 rxcall = rxrpc_kernel_begin_call(afs_socket, &srx, call->key,
273 (unsigned long) call, gfp);
274 call->key = NULL;
275 if (IS_ERR(rxcall)) {
276 ret = PTR_ERR(rxcall);
277 goto error_kill_call;
278 }
279
280 call->rxcall = rxcall;
281
282 /* send the request */
283 iov[0].iov_base = call->request;
284 iov[0].iov_len = call->request_size;
285
286 msg.msg_name = NULL;
287 msg.msg_namelen = 0;
288 msg.msg_iov = (struct iovec *) iov;
289 msg.msg_iovlen = 1;
290 msg.msg_control = NULL;
291 msg.msg_controllen = 0;
292 msg.msg_flags = 0;
293
294 /* have to change the state *before* sending the last packet as RxRPC
295 * might give us the reply before it returns from sending the
296 * request */
297 call->state = AFS_CALL_AWAIT_REPLY;
298 ret = rxrpc_kernel_send_data(rxcall, &msg, call->request_size);
299 if (ret < 0)
300 goto error_do_abort;
301
302 /* at this point, an async call may no longer exist as it may have
303 * already completed */
304 return wait_mode->wait(call);
305
306error_do_abort:
307 rxrpc_kernel_abort_call(rxcall, RX_USER_ABORT);
308 rxrpc_kernel_end_call(rxcall);
309 call->rxcall = NULL;
310error_kill_call:
311 call->type->destructor(call);
312 afs_free_call(call);
313 _leave(" = %d", ret);
314 return ret;
315}
316
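The iovec/msghdr assembly in afs_make_call() is the usual in-kernel sendmsg setup. On an ordinary socket the equivalent is often written with kernel_sendmsg(), which points the msghdr at a kvec array before calling the socket's sendmsg operation; a sketch (the function name is invented):

    static int example_send(struct socket *sock, const void *buf, size_t len)
    {
            struct msghdr msg;
            struct kvec iov;

            memset(&msg, 0, sizeof(msg));
            iov.iov_base = (void *) buf;
            iov.iov_len = len;

            /* attaches the kvec to msg and calls the socket's sendmsg op */
            return kernel_sendmsg(sock, &msg, &iov, 1, len);
    }

Note the ordering comment in afs_make_call() itself: the call state must be advanced to AFS_CALL_AWAIT_REPLY before the final send, because the reply may be delivered before rxrpc_kernel_send_data() returns.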
317/*
318 * handles intercepted messages that were arriving in the socket's Rx queue
319 * - called with the socket receive queue lock held to ensure message ordering
320 * - called with softirqs disabled
321 */
322static void afs_rx_interceptor(struct sock *sk, unsigned long user_call_ID,
323 struct sk_buff *skb)
324{
325 struct afs_call *call = (struct afs_call *) user_call_ID;
326
327 _enter("%p,,%u", call, skb->mark);
328
329 _debug("ICPT %p{%u} [%d]",
330 skb, skb->mark, atomic_read(&afs_outstanding_skbs));
331
332 ASSERTCMP(sk, ==, afs_socket->sk);
333 atomic_inc(&afs_outstanding_skbs);
334
335 if (!call) {
	336		/* it's an incoming call for our callback service */
337 skb_queue_tail(&afs_incoming_calls, skb);
338 schedule_work(&afs_collect_incoming_call_work);
339 } else {
340 /* route the messages directly to the appropriate call */
341 skb_queue_tail(&call->rx_queue, skb);
342 call->wait_mode->rx_wakeup(call);
343 }
344
345 _leave("");
346}
347
348/*
349 * deliver messages to a call
350 */
351static void afs_deliver_to_call(struct afs_call *call)
352{
353 struct sk_buff *skb;
354 bool last;
355 u32 abort_code;
356 int ret;
357
358 _enter("");
359
360 while ((call->state == AFS_CALL_AWAIT_REPLY ||
361 call->state == AFS_CALL_AWAIT_OP_ID ||
362 call->state == AFS_CALL_AWAIT_REQUEST ||
363 call->state == AFS_CALL_AWAIT_ACK) &&
364 (skb = skb_dequeue(&call->rx_queue))) {
365 switch (skb->mark) {
366 case RXRPC_SKB_MARK_DATA:
367 _debug("Rcv DATA");
368 last = rxrpc_kernel_is_data_last(skb);
369 ret = call->type->deliver(call, skb, last);
370 switch (ret) {
371 case 0:
372 if (last &&
373 call->state == AFS_CALL_AWAIT_REPLY)
374 call->state = AFS_CALL_COMPLETE;
375 break;
376 case -ENOTCONN:
377 abort_code = RX_CALL_DEAD;
378 goto do_abort;
379 case -ENOTSUPP:
380 abort_code = RX_INVALID_OPERATION;
381 goto do_abort;
382 default:
383 abort_code = RXGEN_CC_UNMARSHAL;
384 if (call->state != AFS_CALL_AWAIT_REPLY)
385 abort_code = RXGEN_SS_UNMARSHAL;
386 do_abort:
387 rxrpc_kernel_abort_call(call->rxcall,
388 abort_code);
389 call->error = ret;
390 call->state = AFS_CALL_ERROR;
391 break;
392 }
393 afs_data_delivered(skb);
394 skb = NULL;
395 continue;
396 case RXRPC_SKB_MARK_FINAL_ACK:
397 _debug("Rcv ACK");
398 call->state = AFS_CALL_COMPLETE;
399 break;
400 case RXRPC_SKB_MARK_BUSY:
401 _debug("Rcv BUSY");
402 call->error = -EBUSY;
403 call->state = AFS_CALL_BUSY;
404 break;
405 case RXRPC_SKB_MARK_REMOTE_ABORT:
406 abort_code = rxrpc_kernel_get_abort_code(skb);
407 call->error = call->type->abort_to_error(abort_code);
408 call->state = AFS_CALL_ABORTED;
409 _debug("Rcv ABORT %u -> %d", abort_code, call->error);
410 break;
411 case RXRPC_SKB_MARK_NET_ERROR:
412 call->error = -rxrpc_kernel_get_error_number(skb);
413 call->state = AFS_CALL_ERROR;
414 _debug("Rcv NET ERROR %d", call->error);
415 break;
416 case RXRPC_SKB_MARK_LOCAL_ERROR:
417 call->error = -rxrpc_kernel_get_error_number(skb);
418 call->state = AFS_CALL_ERROR;
419 _debug("Rcv LOCAL ERROR %d", call->error);
420 break;
421 default:
422 BUG();
423 break;
424 }
425
426 afs_free_skb(skb);
427 }
428
429 /* make sure the queue is empty if the call is done with (we might have
430 * aborted the call early because of an unmarshalling error) */
431 if (call->state >= AFS_CALL_COMPLETE) {
432 while ((skb = skb_dequeue(&call->rx_queue)))
433 afs_free_skb(skb);
434 if (call->incoming) {
435 rxrpc_kernel_end_call(call->rxcall);
436 call->rxcall = NULL;
437 call->type->destructor(call);
438 afs_free_call(call);
439 }
440 }
441
442 _leave("");
443}
444
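afs_deliver_to_call() is effectively a message pump: dequeue while the call state is non-terminal, switch on the message type, and once a terminal state is reached discard anything still queued. Stripped of the RxRPC details, the control shape looks like this (plain, runnable C with invented types):

    #include <stdio.h>

    enum call_state { CALL_AWAITING, CALL_COMPLETE, CALL_ERROR };

    struct msg { int bad; struct msg *next; };

    static void pump(struct msg **queue, enum call_state *state)
    {
            struct msg *m;

            /* deliver messages only while the call is still live */
            while (*state == CALL_AWAITING && (m = *queue) != NULL) {
                    *queue = m->next;
                    *state = m->bad ? CALL_ERROR : CALL_COMPLETE;
            }

            /* the call reached a terminal state: drop whatever is left */
            while ((m = *queue) != NULL)
                    *queue = m->next;
    }

    int main(void)
    {
            struct msg b = { 0, NULL }, a = { 0, &b };
            struct msg *queue = &a;
            enum call_state state = CALL_AWAITING;

            pump(&queue, &state);
            printf("state=%d drained=%d\n", state, queue == NULL);
            return 0;
    }

The trailing drain matters for the same reason as in the real code: an unmarshalling error can abort the call while data packets are still queued, and those must not be leaked.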
445/*
446 * wait synchronously for a call to complete
447 */
448static int afs_wait_for_call_to_complete(struct afs_call *call)
449{
450 struct sk_buff *skb;
451 int ret;
452
453 DECLARE_WAITQUEUE(myself, current);
454
455 _enter("");
456
457 add_wait_queue(&call->waitq, &myself);
458 for (;;) {
459 set_current_state(TASK_INTERRUPTIBLE);
460
461 /* deliver any messages that are in the queue */
462 if (!skb_queue_empty(&call->rx_queue)) {
463 __set_current_state(TASK_RUNNING);
464 afs_deliver_to_call(call);
465 continue;
466 }
467
468 ret = call->error;
469 if (call->state >= AFS_CALL_COMPLETE)
470 break;
471 ret = -EINTR;
472 if (signal_pending(current))
473 break;
474 schedule();
475 }
476
477 remove_wait_queue(&call->waitq, &myself);
478 __set_current_state(TASK_RUNNING);
479
480 /* kill the call */
481 if (call->state < AFS_CALL_COMPLETE) {
482 _debug("call incomplete");
483 rxrpc_kernel_abort_call(call->rxcall, RX_CALL_DEAD);
484 while ((skb = skb_dequeue(&call->rx_queue)))
485 afs_free_skb(skb);
486 }
487
488 _debug("call complete");
489 rxrpc_kernel_end_call(call->rxcall);
490 call->rxcall = NULL;
491 call->type->destructor(call);
492 afs_free_call(call);
493 _leave(" = %d", ret);
494 return ret;
495}
496
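The hand-rolled wait loop above exists because the waiter must also drain call->rx_queue while it waits; without that requirement it would collapse to the standard helper, roughly as below (note that the helper reports a signal as -ERESTARTSYS rather than the -EINTR used above):

    /* equivalent wait, minus the message-draining step */
    ret = wait_event_interruptible(call->waitq,
                                   call->state >= AFS_CALL_COMPLETE);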
497/*
498 * wake up a waiting call
499 */
500static void afs_wake_up_call_waiter(struct afs_call *call)
501{
502 wake_up(&call->waitq);
503}
504
505/*
506 * wake up an asynchronous call
507 */
508static void afs_wake_up_async_call(struct afs_call *call)
509{
510 _enter("");
511 queue_work(afs_async_calls, &call->async_work);
512}
513
514/*
515 * put a call into asynchronous mode
	516 * - mustn't touch the call descriptor as the call may have completed by the
517 * time we get here
518 */
519static int afs_dont_wait_for_call_to_complete(struct afs_call *call)
520{
521 _enter("");
522 return -EINPROGRESS;
523}
524
525/*
526 * delete an asynchronous call
527 */
528static void afs_delete_async_call(struct work_struct *work)
529{
530 struct afs_call *call =
531 container_of(work, struct afs_call, async_work);
532
533 _enter("");
534
535 afs_free_call(call);
536
537 _leave("");
538}
539
540/*
541 * perform processing on an asynchronous call
542 * - on a multiple-thread workqueue this work item may try to run on several
543 * CPUs at the same time
544 */
545static void afs_process_async_call(struct work_struct *work)
546{
547 struct afs_call *call =
548 container_of(work, struct afs_call, async_work);
549
550 _enter("");
551
552 if (!skb_queue_empty(&call->rx_queue))
553 afs_deliver_to_call(call);
554
555 if (call->state >= AFS_CALL_COMPLETE && call->wait_mode) {
556 if (call->wait_mode->async_complete)
557 call->wait_mode->async_complete(call->reply,
558 call->error);
559 call->reply = NULL;
560
561 /* kill the call */
562 rxrpc_kernel_end_call(call->rxcall);
563 call->rxcall = NULL;
564 if (call->type->destructor)
565 call->type->destructor(call);
566
567 /* we can't just delete the call because the work item may be
568 * queued */
569 PREPARE_WORK(&call->async_work, afs_delete_async_call);
570 queue_work(afs_async_calls, &call->async_work);
571 }
572
573 _leave("");
574}
575
(The PREPARE_WORK() trick above re-points the still-queued work item at afs_delete_async_call() so the final free runs from the workqueue rather than from a context that might still be executing this handler.)
576/*
577 * empty a socket buffer into a flat reply buffer
578 */
579void afs_transfer_reply(struct afs_call *call, struct sk_buff *skb)
580{
581 size_t len = skb->len;
582
583 if (skb_copy_bits(skb, 0, call->buffer + call->reply_size, len) < 0)
584 BUG();
585 call->reply_size += len;
586}
587
588/*
589 * accept the backlog of incoming calls
590 */
591static void afs_collect_incoming_call(struct work_struct *work)
592{
593 struct rxrpc_call *rxcall;
594 struct afs_call *call = NULL;
595 struct sk_buff *skb;
596
597 while ((skb = skb_dequeue(&afs_incoming_calls))) {
598 _debug("new call");
599
600 /* don't need the notification */
601 afs_free_skb(skb);
602
603 if (!call) {
604 call = kzalloc(sizeof(struct afs_call), GFP_KERNEL);
605 if (!call) {
606 rxrpc_kernel_reject_call(afs_socket);
607 return;
608 }
609
610 INIT_WORK(&call->async_work, afs_process_async_call);
611 call->wait_mode = &afs_async_incoming_call;
612 call->type = &afs_RXCMxxxx;
613 init_waitqueue_head(&call->waitq);
614 skb_queue_head_init(&call->rx_queue);
615 call->state = AFS_CALL_AWAIT_OP_ID;
616
617 _debug("CALL %p{%s} [%d]",
618 call, call->type->name,
619 atomic_read(&afs_outstanding_calls));
620 atomic_inc(&afs_outstanding_calls);
621 }
622
623 rxcall = rxrpc_kernel_accept_call(afs_socket,
624 (unsigned long) call);
625 if (!IS_ERR(rxcall)) {
626 call->rxcall = rxcall;
627 call = NULL;
628 }
629 }
630
631 if (call)
632 afs_free_call(call);
633}
634
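afs_collect_incoming_call() allocates one call record ahead of rxrpc_kernel_accept_call() and reuses it if the accept fails, so an allocation failure can only reject a call, never lose track of one. The allocate-once-and-reuse loop, reduced to a runnable userspace sketch (all names invented; accepted records stand in for handed-over ownership and are not freed by the loop):

    #include <stdio.h>
    #include <stdlib.h>

    struct call { int id; };

    /* simulate three notifications, with one accept failing */
    static int notifications = 3;
    static int have_notification(void) { return notifications-- > 0; }
    static int try_accept(struct call *c) { return notifications == 1 ? -1 : 0; }

    int main(void)
    {
            struct call *call = NULL;

            while (have_notification()) {
                    if (!call) {
                            call = calloc(1, sizeof(*call));
                            if (!call)
                                    break;  /* the kernel version rejects here */
                    }
                    if (try_accept(call) == 0)
                            call = NULL;    /* accepted: ownership handed over */
                    /* on failure, keep the record for the next iteration */
            }
            free(call);     /* release the unused spare, if any */
            return 0;
    }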
635/*
636 * grab the operation ID from an incoming cache manager call
637 */
638static int afs_deliver_cm_op_id(struct afs_call *call, struct sk_buff *skb,
639 bool last)
640{
641 size_t len = skb->len;
642 void *oibuf = (void *) &call->operation_ID;
643
644 _enter("{%u},{%zu},%d", call->offset, len, last);
645
646 ASSERTCMP(call->offset, <, 4);
647
648 /* the operation ID forms the first four bytes of the request data */
649 len = min_t(size_t, len, 4 - call->offset);
650 if (skb_copy_bits(skb, 0, oibuf + call->offset, len) < 0)
651 BUG();
652 if (!pskb_pull(skb, len))
653 BUG();
654 call->offset += len;
655
656 if (call->offset < 4) {
657 if (last) {
658 _leave(" = -EBADMSG [op ID short]");
659 return -EBADMSG;
660 }
661 _leave(" = 0 [incomplete]");
662 return 0;
663 }
664
665 call->state = AFS_CALL_AWAIT_REQUEST;
666
667 /* ask the cache manager to route the call (it'll change the call type
668 * if successful) */
669 if (!afs_cm_incoming_call(call))
670 return -ENOTSUPP;
671
	672	/* pass responsibility for the remainder of this message off to the
673 * cache manager op */
674 return call->type->deliver(call, skb, last);
675}
676
677/*
678 * send an empty reply
679 */
680void afs_send_empty_reply(struct afs_call *call)
681{
682 struct msghdr msg;
683 struct iovec iov[1];
684
685 _enter("");
686
687 iov[0].iov_base = NULL;
688 iov[0].iov_len = 0;
689 msg.msg_name = NULL;
690 msg.msg_namelen = 0;
691 msg.msg_iov = iov;
692 msg.msg_iovlen = 0;
693 msg.msg_control = NULL;
694 msg.msg_controllen = 0;
695 msg.msg_flags = 0;
696
697 call->state = AFS_CALL_AWAIT_ACK;
698 switch (rxrpc_kernel_send_data(call->rxcall, &msg, 0)) {
699 case 0:
700 _leave(" [replied]");
701 return;
702
703 case -ENOMEM:
704 _debug("oom");
705 rxrpc_kernel_abort_call(call->rxcall, RX_USER_ABORT);
706 default:
707 rxrpc_kernel_end_call(call->rxcall);
708 call->rxcall = NULL;
709 call->type->destructor(call);
710 afs_free_call(call);
711 _leave(" [error]");
712 return;
713 }
714}
715
716/*
717 * send a simple reply
718 */
719void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len)
720{
721 struct msghdr msg;
722 struct iovec iov[1];
723
724 _enter("");
725
726 iov[0].iov_base = (void *) buf;
727 iov[0].iov_len = len;
728 msg.msg_name = NULL;
729 msg.msg_namelen = 0;
730 msg.msg_iov = iov;
731 msg.msg_iovlen = 1;
732 msg.msg_control = NULL;
733 msg.msg_controllen = 0;
734 msg.msg_flags = 0;
735
736 call->state = AFS_CALL_AWAIT_ACK;
737 switch (rxrpc_kernel_send_data(call->rxcall, &msg, len)) {
738 case 0:
739 _leave(" [replied]");
740 return;
741
742 case -ENOMEM:
743 _debug("oom");
744 rxrpc_kernel_abort_call(call->rxcall, RX_USER_ABORT);
745 default:
746 rxrpc_kernel_end_call(call->rxcall);
747 call->rxcall = NULL;
748 call->type->destructor(call);
749 afs_free_call(call);
750 _leave(" [error]");
751 return;
752 }
753}
754
755/*
756 * extract a piece of data from the received data socket buffers
757 */
758int afs_extract_data(struct afs_call *call, struct sk_buff *skb,
759 bool last, void *buf, size_t count)
760{
761 size_t len = skb->len;
762
763 _enter("{%u},{%zu},%d,,%zu", call->offset, len, last, count);
764
765 ASSERTCMP(call->offset, <, count);
766
767 len = min_t(size_t, len, count - call->offset);
768 if (skb_copy_bits(skb, 0, buf + call->offset, len) < 0 ||
769 !pskb_pull(skb, len))
770 BUG();
771 call->offset += len;
772
773 if (call->offset < count) {
774 if (last) {
775 _leave(" = -EBADMSG [%d < %zu]", call->offset, count);
776 return -EBADMSG;
777 }
778 _leave(" = -EAGAIN");
779 return -EAGAIN;
780 }
781 return 0;
782}
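afs_extract_data() accumulates a fixed-size field across an arbitrary packet split, returning -EAGAIN until the field is complete and -EBADMSG if the stream ends short. The same accumulation logic in plain, runnable C (1 stands in for -EAGAIN, -1 for -EBADMSG):

    #include <stdio.h>
    #include <string.h>

    /* gather 'count' bytes into buf across packets; 0 when complete,
     * 1 if another packet is needed, -1 if the stream ended short */
    static int extract(char *buf, size_t *offset, size_t count,
                       const char *pkt, size_t pktlen, int last)
    {
            size_t len = pktlen < count - *offset ? pktlen : count - *offset;

            memcpy(buf + *offset, pkt, len);
            *offset += len;

            if (*offset < count)
                    return last ? -1 : 1;
            return 0;
    }

    int main(void)
    {
            char buf[8];
            size_t offset = 0;

            printf("%d\n", extract(buf, &offset, sizeof(buf), "abcd", 4, 0));
            printf("%d\n", extract(buf, &offset, sizeof(buf), "efgh", 4, 1));
            return 0;
    }

afs_deliver_cm_op_id() uses exactly this pattern to assemble the four-byte operation ID before routing the rest of the message to the cache manager.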
diff --git a/fs/afs/security.c b/fs/afs/security.c
new file mode 100644
index 000000000000..f9f424d80458
--- /dev/null
+++ b/fs/afs/security.c
@@ -0,0 +1,356 @@
1/* AFS security handling
2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#include <linux/init.h>
13#include <linux/slab.h>
14#include <linux/fs.h>
15#include <linux/ctype.h>
16#include <keys/rxrpc-type.h>
17#include "internal.h"
18
19/*
20 * get a key
21 */
22struct key *afs_request_key(struct afs_cell *cell)
23{
24 struct key *key;
25
26 _enter("{%x}", key_serial(cell->anonymous_key));
27
28 _debug("key %s", cell->anonymous_key->description);
29 key = request_key(&key_type_rxrpc, cell->anonymous_key->description,
30 NULL);
31 if (IS_ERR(key)) {
32 if (PTR_ERR(key) != -ENOKEY) {
33 _leave(" = %ld", PTR_ERR(key));
34 return key;
35 }
36
37 /* act as anonymous user */
38 _leave(" = {%x} [anon]", key_serial(cell->anonymous_key));
39 return key_get(cell->anonymous_key);
40 } else {
41 /* act as authorised user */
42 _leave(" = {%x} [auth]", key_serial(key));
43 return key;
44 }
45}
46
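afs_request_key() always hands back a usable key, falling back to the cell's anonymous key when the caller holds none, so users of it follow a uniform get/use/put pattern; a sketch (afs_do_operation() is a hypothetical stand-in for an RPC):

    struct key *key;
    int ret;

    key = afs_request_key(vnode->volume->cell);
    if (IS_ERR(key))
            return PTR_ERR(key);

    ret = afs_do_operation(vnode, key);     /* hypothetical RPC call */
    key_put(key);
    return ret;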
47/*
48 * dispose of a permits list
49 */
50void afs_zap_permits(struct rcu_head *rcu)
51{
52 struct afs_permits *permits =
53 container_of(rcu, struct afs_permits, rcu);
54 int loop;
55
56 _enter("{%d}", permits->count);
57
58 for (loop = permits->count - 1; loop >= 0; loop--)
59 key_put(permits->permits[loop].key);
60 kfree(permits);
61}
62
63/*
64 * dispose of a permits list in which all the key pointers have been copied
65 */
66static void afs_dispose_of_permits(struct rcu_head *rcu)
67{
68 struct afs_permits *permits =
69 container_of(rcu, struct afs_permits, rcu);
70
71 _enter("{%d}", permits->count);
72
73 kfree(permits);
74}
75
76/*
 77 * get the authorising vnode - this is the specified inode itself if it's a
 78 * directory, or the parent directory if the specified inode is a file or
 79 * symlink
80 * - the caller must release the ref on the inode
81 */
82static struct afs_vnode *afs_get_auth_inode(struct afs_vnode *vnode,
83 struct key *key)
84{
85 struct afs_vnode *auth_vnode;
86 struct inode *auth_inode;
87
88 _enter("");
89
90 if (S_ISDIR(vnode->vfs_inode.i_mode)) {
91 auth_inode = igrab(&vnode->vfs_inode);
92 ASSERT(auth_inode != NULL);
93 } else {
94 auth_inode = afs_iget(vnode->vfs_inode.i_sb, key,
95 &vnode->status.parent, NULL, NULL);
96 if (IS_ERR(auth_inode))
97 return ERR_PTR(PTR_ERR(auth_inode));
98 }
99
100 auth_vnode = AFS_FS_I(auth_inode);
101 _leave(" = {%x}", auth_vnode->fid.vnode);
102 return auth_vnode;
103}
104
105/*
106 * clear the permit cache on a directory vnode
107 */
108void afs_clear_permits(struct afs_vnode *vnode)
109{
110 struct afs_permits *permits;
111
112 _enter("{%x}", vnode->fid.vnode);
113
114 mutex_lock(&vnode->permits_lock);
115 permits = vnode->permits;
116 rcu_assign_pointer(vnode->permits, NULL);
117 mutex_unlock(&vnode->permits_lock);
118
119 if (permits)
120 call_rcu(&permits->rcu, afs_zap_permits);
121 _leave("");
122}
123
124/*
 125 * add the result obtained for a vnode to its own cache or to its parent
 126 * directory's cache for the key used to access it
127 */
128void afs_cache_permit(struct afs_vnode *vnode, struct key *key, long acl_order)
129{
130 struct afs_permits *permits, *xpermits;
131 struct afs_permit *permit;
132 struct afs_vnode *auth_vnode;
133 int count, loop;
134
135 _enter("{%x},%x,%lx", vnode->fid.vnode, key_serial(key), acl_order);
136
137 auth_vnode = afs_get_auth_inode(vnode, key);
138 if (IS_ERR(auth_vnode)) {
139 _leave(" [get error %ld]", PTR_ERR(auth_vnode));
140 return;
141 }
142
143 mutex_lock(&auth_vnode->permits_lock);
144
145 /* guard against a rename being detected whilst we waited for the
146 * lock */
147 if (memcmp(&auth_vnode->fid, &vnode->status.parent,
148 sizeof(struct afs_fid)) != 0) {
149 _debug("renamed");
150 goto out_unlock;
151 }
152
153 /* have to be careful as the directory's callback may be broken between
154 * us receiving the status we're trying to cache and us getting the
155 * lock to update the cache for the status */
156 if (auth_vnode->acl_order - acl_order > 0) {
157 _debug("ACL changed?");
158 goto out_unlock;
159 }
160
161 /* always update the anonymous mask */
162 _debug("anon access %x", vnode->status.anon_access);
163 auth_vnode->status.anon_access = vnode->status.anon_access;
164 if (key == vnode->volume->cell->anonymous_key)
165 goto out_unlock;
166
167 xpermits = auth_vnode->permits;
168 count = 0;
169 if (xpermits) {
170 /* see if the permit is already in the list
171 * - if it is then we just amend the list
172 */
173 count = xpermits->count;
174 permit = xpermits->permits;
175 for (loop = count; loop > 0; loop--) {
176 if (permit->key == key) {
177 permit->access_mask =
178 vnode->status.caller_access;
179 goto out_unlock;
180 }
181 permit++;
182 }
183 }
184
185 permits = kmalloc(sizeof(*permits) + sizeof(*permit) * (count + 1),
186 GFP_NOFS);
187 if (!permits)
188 goto out_unlock;
189
190 memcpy(permits->permits, xpermits->permits,
191 count * sizeof(struct afs_permit));
192
193 _debug("key %x access %x",
194 key_serial(key), vnode->status.caller_access);
195 permits->permits[count].access_mask = vnode->status.caller_access;
196 permits->permits[count].key = key_get(key);
197 permits->count = count + 1;
198
199 rcu_assign_pointer(auth_vnode->permits, permits);
200 if (xpermits)
201 call_rcu(&xpermits->rcu, afs_dispose_of_permits);
202
203out_unlock:
204 mutex_unlock(&auth_vnode->permits_lock);
205 iput(&auth_vnode->vfs_inode);
206 _leave("");
207}
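
Note that afs_cache_permit() never modifies the published permits array in place: a superset copy is built under permits_lock, published with rcu_assign_pointer(), and the displaced array is freed only after an RCU grace period. A runnable approximation of the writer side, with C11 atomics standing in for RCU publication (real reclamation would still need RCU or reference counting; the old array is deliberately leaked here):

#include <stdatomic.h>
#include <stdlib.h>
#include <string.h>

struct permit { int key; unsigned access_mask; };
struct permits {
	size_t count;
	struct permit permits[];        /* flexible array, as in the kernel */
};

static _Atomic(struct permits *) published;

/* writer side only: build a superset copy and swap the pointer; the
 * kernel frees the displaced array via call_rcu(), which this sketch
 * omits */
static int cache_permit(int key, unsigned access_mask)
{
	struct permits *old = atomic_load(&published);
	size_t count = old ? old->count : 0;
	struct permits *new = malloc(sizeof(*new) +
				     sizeof(struct permit) * (count + 1));

	if (!new)
		return -1;
	if (old)                        /* guard the empty initial case */
		memcpy(new->permits, old->permits,
		       count * sizeof(struct permit));
	new->permits[count].key = key;
	new->permits[count].access_mask = access_mask;
	new->count = count + 1;
	atomic_store(&published, new);  /* publication point */
	return 0;
}
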
208
209/*
210 * check with the fileserver to see if the directory or parent directory is
211 * permitted to be accessed with this authorisation, and if so, what access it
212 * is granted
213 */
214static int afs_check_permit(struct afs_vnode *vnode, struct key *key,
215 afs_access_t *_access)
216{
217 struct afs_permits *permits;
218 struct afs_permit *permit;
219 struct afs_vnode *auth_vnode;
220 bool valid;
221 int loop, ret;
222
223 _enter("");
224
225 auth_vnode = afs_get_auth_inode(vnode, key);
226 if (IS_ERR(auth_vnode)) {
227 *_access = 0;
228 _leave(" = %ld", PTR_ERR(auth_vnode));
229 return PTR_ERR(auth_vnode);
230 }
231
232 ASSERT(S_ISDIR(auth_vnode->vfs_inode.i_mode));
233
234 /* check the permits to see if we've got one yet */
235 if (key == auth_vnode->volume->cell->anonymous_key) {
236 _debug("anon");
237 *_access = auth_vnode->status.anon_access;
238 valid = true;
239 } else {
240 valid = false;
241 rcu_read_lock();
242 permits = rcu_dereference(auth_vnode->permits);
243 if (permits) {
244 permit = permits->permits;
245 for (loop = permits->count; loop > 0; loop--) {
246 if (permit->key == key) {
247 _debug("found in cache");
248 *_access = permit->access_mask;
249 valid = true;
250 break;
251 }
252 permit++;
253 }
254 }
255 rcu_read_unlock();
256 }
257
258 if (!valid) {
259 /* check the status on the file we're actually interested in
260 * (the post-processing will cache the result on auth_vnode) */
261 _debug("no valid permit");
262
263 set_bit(AFS_VNODE_CB_BROKEN, &vnode->flags);
264 ret = afs_vnode_fetch_status(vnode, auth_vnode, key);
265 if (ret < 0) {
266 iput(&auth_vnode->vfs_inode);
267 *_access = 0;
268 _leave(" = %d", ret);
269 return ret;
270 }
271 }
272
273 *_access = vnode->status.caller_access;
274 iput(&auth_vnode->vfs_inode);
275 _leave(" = 0 [access %x]", *_access);
276 return 0;
277}
278
279/*
280 * check the permissions on an AFS file
281 * - AFS ACLs are attached to directories only, and a file is controlled by its
282 * parent directory's ACL
283 */
284int afs_permission(struct inode *inode, int mask, struct nameidata *nd)
285{
286 struct afs_vnode *vnode = AFS_FS_I(inode);
287 afs_access_t access;
288 struct key *key;
289 int ret;
290
291 _enter("{{%x:%x},%lx},%x,",
292 vnode->fid.vid, vnode->fid.vnode, vnode->flags, mask);
293
294 key = afs_request_key(vnode->volume->cell);
295 if (IS_ERR(key)) {
296 _leave(" = %ld [key]", PTR_ERR(key));
297 return PTR_ERR(key);
298 }
299
300 /* if the promise has expired, we need to check the server again */
301 if (!vnode->cb_promised) {
302 _debug("not promised");
303 ret = afs_vnode_fetch_status(vnode, NULL, key);
304 if (ret < 0)
305 goto error;
306 _debug("new promise [fl=%lx]", vnode->flags);
307 }
308
309 /* check the permits to see if we've got one yet */
310 ret = afs_check_permit(vnode, key, &access);
311 if (ret < 0)
312 goto error;
313
314 /* interpret the access mask */
315 _debug("REQ %x ACC %x on %s",
316 mask, access, S_ISDIR(inode->i_mode) ? "dir" : "file");
317
318 if (S_ISDIR(inode->i_mode)) {
319 if (mask & MAY_EXEC) {
320 if (!(access & AFS_ACE_LOOKUP))
321 goto permission_denied;
322 } else if (mask & MAY_READ) {
323 if (!(access & AFS_ACE_READ))
324 goto permission_denied;
325 } else if (mask & MAY_WRITE) {
326 if (!(access & (AFS_ACE_DELETE | /* rmdir, unlink, rename from */
327 AFS_ACE_INSERT | /* create, mkdir, symlink, rename to */
328 AFS_ACE_WRITE))) /* chmod */
329 goto permission_denied;
330 } else {
331 BUG();
332 }
333 } else {
334 if (!(access & AFS_ACE_LOOKUP))
335 goto permission_denied;
336 if (mask & (MAY_EXEC | MAY_READ)) {
337 if (!(access & AFS_ACE_READ))
338 goto permission_denied;
339 } else if (mask & MAY_WRITE) {
340 if (!(access & AFS_ACE_WRITE))
341 goto permission_denied;
342 }
343 }
344
345 key_put(key);
346 ret = generic_permission(inode, mask, NULL);
347 _leave(" = %d", ret);
348 return ret;
349
350permission_denied:
351 ret = -EACCES;
352error:
353 key_put(key);
354 _leave(" = %d", ret);
355 return ret;
356}
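
The mapping from VFS MAY_* bits to AFS ACE bits above is the heart of the function and easy to get wrong. A standalone restatement of the same decision tree, with stand-in constants (the real bit values live in the kernel headers, not here):

#include <stdbool.h>

/* stand-in constants; the kernel's real AFS_ACE_* and MAY_* values
 * are defined elsewhere */
#define ACE_READ   0x01
#define ACE_WRITE  0x02
#define ACE_LOOKUP 0x04
#define ACE_DELETE 0x08
#define ACE_INSERT 0x10

#define MAY_EXEC  0x1
#define MAY_WRITE 0x2
#define MAY_READ  0x4

/* same decision tree as afs_permission(): a directory is checked
 * against its own ACL; a file first needs LOOKUP on its parent's ACL
 * and is then checked for READ or WRITE */
static bool ace_allows(bool is_dir, unsigned mask, unsigned access)
{
	if (is_dir) {
		if (mask & MAY_EXEC)
			return access & ACE_LOOKUP;
		if (mask & MAY_READ)
			return access & ACE_READ;
		if (mask & MAY_WRITE)
			return access & (ACE_DELETE | ACE_INSERT | ACE_WRITE);
		return false;           /* the kernel BUG()s here */
	}
	if (!(access & ACE_LOOKUP))
		return false;
	if (mask & (MAY_EXEC | MAY_READ))
		return access & ACE_READ;
	if (mask & MAY_WRITE)
		return access & ACE_WRITE;
	return true;                    /* defer to generic_permission() */
}
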
diff --git a/fs/afs/server.c b/fs/afs/server.c
index 44aff81dc6a7..96bb23b476a2 100644
--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -1,6 +1,6 @@
1/* server.c: AFS server record management 1/* AFS server record management
2 * 2 *
3 * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved. 3 * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com) 4 * Written by David Howells (dhowells@redhat.com)
5 * 5 *
6 * This program is free software; you can redistribute it and/or 6 * This program is free software; you can redistribute it and/or
@@ -11,489 +11,314 @@
11 11
12#include <linux/sched.h> 12#include <linux/sched.h>
13#include <linux/slab.h> 13#include <linux/slab.h>
14#include <rxrpc/peer.h>
15#include <rxrpc/connection.h>
16#include "volume.h"
17#include "cell.h"
18#include "server.h"
19#include "transport.h"
20#include "vlclient.h"
21#include "kafstimod.h"
22#include "internal.h" 14#include "internal.h"
23 15
24DEFINE_SPINLOCK(afs_server_peer_lock); 16unsigned afs_server_timeout = 10; /* server timeout in seconds */
25 17
 26#define FS_SERVICE_ID 1 /* AFS File Service ID */ 18static void afs_reap_server(struct work_struct *);
27#define VL_SERVICE_ID 52 /* AFS Volume Location Service ID */
28 19
29static void __afs_server_timeout(struct afs_timer *timer) 20/* tree of all the servers, indexed by IP address */
21static struct rb_root afs_servers = RB_ROOT;
22static DEFINE_RWLOCK(afs_servers_lock);
23
24/* LRU list of all the servers not currently in use */
25static LIST_HEAD(afs_server_graveyard);
26static DEFINE_SPINLOCK(afs_server_graveyard_lock);
27static DECLARE_DELAYED_WORK(afs_server_reaper, afs_reap_server);
28
29/*
30 * install a server record in the master tree
31 */
32static int afs_install_server(struct afs_server *server)
30{ 33{
31 struct afs_server *server = 34 struct afs_server *xserver;
32 list_entry(timer, struct afs_server, timeout); 35 struct rb_node **pp, *p;
36 int ret;
33 37
34 _debug("SERVER TIMEOUT [%p{u=%d}]", 38 _enter("%p", server);
35 server, atomic_read(&server->usage));
36 39
37 afs_server_do_timeout(server); 40 write_lock(&afs_servers_lock);
38} 41
42 ret = -EEXIST;
43 pp = &afs_servers.rb_node;
44 p = NULL;
45 while (*pp) {
46 p = *pp;
47 _debug("- consider %p", p);
48 xserver = rb_entry(p, struct afs_server, master_rb);
49 if (server->addr.s_addr < xserver->addr.s_addr)
50 pp = &(*pp)->rb_left;
51 else if (server->addr.s_addr > xserver->addr.s_addr)
52 pp = &(*pp)->rb_right;
53 else
54 goto error;
55 }
39 56
40static const struct afs_timer_ops afs_server_timer_ops = { 57 rb_link_node(&server->master_rb, p, pp);
41 .timed_out = __afs_server_timeout, 58 rb_insert_color(&server->master_rb, &afs_servers);
42}; 59 ret = 0;
60
61error:
62 write_unlock(&afs_servers_lock);
63 return ret;
64}
43 65
44/*****************************************************************************/
45/* 66/*
46 * lookup a server record in a cell 67 * allocate a new server record
47 * - TODO: search the cell's server list
48 */ 68 */
49int afs_server_lookup(struct afs_cell *cell, const struct in_addr *addr, 69static struct afs_server *afs_alloc_server(struct afs_cell *cell,
50 struct afs_server **_server) 70 const struct in_addr *addr)
51{ 71{
52 struct afs_server *server, *active, *zombie; 72 struct afs_server *server;
53 int loop;
54 73
55 _enter("%p,%08x,", cell, ntohl(addr->s_addr)); 74 _enter("");
56 75
57 /* allocate and initialise a server record */
58 server = kzalloc(sizeof(struct afs_server), GFP_KERNEL); 76 server = kzalloc(sizeof(struct afs_server), GFP_KERNEL);
59 if (!server) { 77 if (server) {
60 _leave(" = -ENOMEM"); 78 atomic_set(&server->usage, 1);
61 return -ENOMEM; 79 server->cell = cell;
80
81 INIT_LIST_HEAD(&server->link);
82 INIT_LIST_HEAD(&server->grave);
83 init_rwsem(&server->sem);
84 spin_lock_init(&server->fs_lock);
85 server->fs_vnodes = RB_ROOT;
86 server->cb_promises = RB_ROOT;
87 spin_lock_init(&server->cb_lock);
88 init_waitqueue_head(&server->cb_break_waitq);
89 INIT_DELAYED_WORK(&server->cb_break_work,
90 afs_dispatch_give_up_callbacks);
91
92 memcpy(&server->addr, addr, sizeof(struct in_addr));
93 server->addr.s_addr = addr->s_addr;
62 } 94 }
63 95
64 atomic_set(&server->usage, 1); 96 _leave(" = %p{%d}", server, atomic_read(&server->usage));
65 97 return server;
66 INIT_LIST_HEAD(&server->link); 98}
67 init_rwsem(&server->sem);
68 INIT_LIST_HEAD(&server->fs_callq);
69 spin_lock_init(&server->fs_lock);
70 INIT_LIST_HEAD(&server->cb_promises);
71 spin_lock_init(&server->cb_lock);
72
73 for (loop = 0; loop < AFS_SERVER_CONN_LIST_SIZE; loop++)
74 server->fs_conn_cnt[loop] = 4;
75 99
76 memcpy(&server->addr, addr, sizeof(struct in_addr)); 100/*
77 server->addr.s_addr = addr->s_addr; 101 * get an FS-server record for a cell
102 */
103struct afs_server *afs_lookup_server(struct afs_cell *cell,
104 const struct in_addr *addr)
105{
106 struct afs_server *server, *candidate;
78 107
79 afs_timer_init(&server->timeout, &afs_server_timer_ops); 108 _enter("%p,"NIPQUAD_FMT, cell, NIPQUAD(addr->s_addr));
80 109
81 /* add to the cell */ 110 /* quick scan of the list to see if we already have the server */
82 write_lock(&cell->sv_lock); 111 read_lock(&cell->servers_lock);
83 112
84 /* check the active list */ 113 list_for_each_entry(server, &cell->servers, link) {
85 list_for_each_entry(active, &cell->sv_list, link) { 114 if (server->addr.s_addr == addr->s_addr)
86 if (active->addr.s_addr == addr->s_addr) 115 goto found_server_quickly;
87 goto use_active_server;
88 } 116 }
117 read_unlock(&cell->servers_lock);
89 118
90 /* check the inactive list */ 119 candidate = afs_alloc_server(cell, addr);
91 spin_lock(&cell->sv_gylock); 120 if (!candidate) {
92 list_for_each_entry(zombie, &cell->sv_graveyard, link) { 121 _leave(" = -ENOMEM");
93 if (zombie->addr.s_addr == addr->s_addr) 122 return ERR_PTR(-ENOMEM);
94 goto resurrect_server;
95 } 123 }
96 spin_unlock(&cell->sv_gylock);
97 124
98 afs_get_cell(cell); 125 write_lock(&cell->servers_lock);
99 server->cell = cell;
100 list_add_tail(&server->link, &cell->sv_list);
101 126
102 write_unlock(&cell->sv_lock); 127 /* check the cell's server list again */
128 list_for_each_entry(server, &cell->servers, link) {
129 if (server->addr.s_addr == addr->s_addr)
130 goto found_server;
131 }
103 132
104 *_server = server; 133 _debug("new");
105 _leave(" = 0 (%p)", server); 134 server = candidate;
106 return 0; 135 if (afs_install_server(server) < 0)
136 goto server_in_two_cells;
107 137
108 /* found a matching active server */ 138 afs_get_cell(cell);
109 use_active_server: 139 list_add_tail(&server->link, &cell->servers);
110 _debug("active server"); 140
111 afs_get_server(active); 141 write_unlock(&cell->servers_lock);
112 write_unlock(&cell->sv_lock); 142 _leave(" = %p{%d}", server, atomic_read(&server->usage));
143 return server;
144
145 /* found a matching server quickly */
146found_server_quickly:
147 _debug("found quickly");
148 afs_get_server(server);
149 read_unlock(&cell->servers_lock);
150no_longer_unused:
151 if (!list_empty(&server->grave)) {
152 spin_lock(&afs_server_graveyard_lock);
153 list_del_init(&server->grave);
154 spin_unlock(&afs_server_graveyard_lock);
155 }
156 _leave(" = %p{%d}", server, atomic_read(&server->usage));
157 return server;
158
159 /* found a matching server on the second pass */
160found_server:
161 _debug("found");
162 afs_get_server(server);
163 write_unlock(&cell->servers_lock);
164 kfree(candidate);
165 goto no_longer_unused;
166
167 /* found a server that seems to be in two cells */
168server_in_two_cells:
169 write_unlock(&cell->servers_lock);
170 kfree(candidate);
171 printk(KERN_NOTICE "kAFS:"
172 " Server "NIPQUAD_FMT" appears to be in two cells\n",
173 NIPQUAD(*addr));
174 _leave(" = -EEXIST");
175 return ERR_PTR(-EEXIST);
176}
113 177
114 kfree(server); 178/*
179 * look up a server by its IP address
180 */
181struct afs_server *afs_find_server(const struct in_addr *_addr)
182{
183 struct afs_server *server = NULL;
184 struct rb_node *p;
185 struct in_addr addr = *_addr;
115 186
116 *_server = active; 187 _enter(NIPQUAD_FMT, NIPQUAD(addr.s_addr));
117 _leave(" = 0 (%p)", active);
118 return 0;
119 188
120 /* found a matching server in the graveyard, so resurrect it and 189 read_lock(&afs_servers_lock);
121 * dispose of the new record */
122 resurrect_server:
123 _debug("resurrecting server");
124 190
125 list_move_tail(&zombie->link, &cell->sv_list); 191 p = afs_servers.rb_node;
126 afs_get_server(zombie); 192 while (p) {
127 afs_kafstimod_del_timer(&zombie->timeout); 193 server = rb_entry(p, struct afs_server, master_rb);
128 spin_unlock(&cell->sv_gylock);
129 write_unlock(&cell->sv_lock);
130 194
131 kfree(server); 195 _debug("- consider %p", p);
132 196
133 *_server = zombie; 197 if (addr.s_addr < server->addr.s_addr) {
134 _leave(" = 0 (%p)", zombie); 198 p = p->rb_left;
135 return 0; 199 } else if (addr.s_addr > server->addr.s_addr) {
200 p = p->rb_right;
201 } else {
202 afs_get_server(server);
203 goto found;
204 }
205 }
136 206
137} /* end afs_server_lookup() */ 207 server = NULL;
208found:
209 read_unlock(&afs_servers_lock);
210 ASSERTIFCMP(server, server->addr.s_addr, ==, addr.s_addr);
211 _leave(" = %p", server);
212 return server;
213}
138 214
139/*****************************************************************************/
140/* 215/*
141 * destroy a server record 216 * destroy a server record
142 * - removes from the cell list 217 * - removes from the cell list
143 */ 218 */
144void afs_put_server(struct afs_server *server) 219void afs_put_server(struct afs_server *server)
145{ 220{
146 struct afs_cell *cell;
147
148 if (!server) 221 if (!server)
149 return; 222 return;
150 223
151 _enter("%p", server); 224 _enter("%p{%d}", server, atomic_read(&server->usage));
152
153 cell = server->cell;
154 225
155 /* sanity check */ 226 _debug("PUT SERVER %d", atomic_read(&server->usage));
156 BUG_ON(atomic_read(&server->usage) <= 0);
157 227
158 /* to prevent a race, the decrement and the dequeue must be effectively 228 ASSERTCMP(atomic_read(&server->usage), >, 0);
159 * atomic */
160 write_lock(&cell->sv_lock);
161 229
162 if (likely(!atomic_dec_and_test(&server->usage))) { 230 if (likely(!atomic_dec_and_test(&server->usage))) {
163 write_unlock(&cell->sv_lock);
164 _leave(""); 231 _leave("");
165 return; 232 return;
166 } 233 }
167 234
168 spin_lock(&cell->sv_gylock); 235 afs_flush_callback_breaks(server);
169 list_move_tail(&server->link, &cell->sv_graveyard);
170 236
171 /* time out in 10 secs */ 237 spin_lock(&afs_server_graveyard_lock);
172 afs_kafstimod_add_timer(&server->timeout, 10 * HZ); 238 if (atomic_read(&server->usage) == 0) {
173 239 list_move_tail(&server->grave, &afs_server_graveyard);
174 spin_unlock(&cell->sv_gylock); 240 server->time_of_death = get_seconds();
175 write_unlock(&cell->sv_lock); 241 schedule_delayed_work(&afs_server_reaper,
176 242 afs_server_timeout * HZ);
177 _leave(" [killed]"); 243 }
178} /* end afs_put_server() */ 244 spin_unlock(&afs_server_graveyard_lock);
245 _leave(" [dead]");
246}
179 247
180/*****************************************************************************/
181/* 248/*
182 * timeout server record 249 * destroy a dead server
183 * - removes from the cell's graveyard if the usage count is zero
184 */ 250 */
185void afs_server_do_timeout(struct afs_server *server) 251static void afs_destroy_server(struct afs_server *server)
186{ 252{
187 struct rxrpc_peer *peer;
188 struct afs_cell *cell;
189 int loop;
190
191 _enter("%p", server); 253 _enter("%p", server);
192 254
193 cell = server->cell; 255 ASSERTCMP(server->fs_vnodes.rb_node, ==, NULL);
194 256 ASSERTCMP(server->cb_promises.rb_node, ==, NULL);
195 BUG_ON(atomic_read(&server->usage) < 0); 257 ASSERTCMP(server->cb_break_head, ==, server->cb_break_tail);
196 258 ASSERTCMP(atomic_read(&server->cb_break_n), ==, 0);
197 /* remove from graveyard if still dead */
198 spin_lock(&cell->vl_gylock);
199 if (atomic_read(&server->usage) == 0)
200 list_del_init(&server->link);
201 else
202 server = NULL;
203 spin_unlock(&cell->vl_gylock);
204
205 if (!server) {
206 _leave("");
207 return; /* resurrected */
208 }
209
210 /* we can now destroy it properly */
211 afs_put_cell(cell);
212
213 /* uncross-point the structs under a global lock */
214 spin_lock(&afs_server_peer_lock);
215 peer = server->peer;
216 if (peer) {
217 server->peer = NULL;
218 peer->user = NULL;
219 }
220 spin_unlock(&afs_server_peer_lock);
221
222 /* finish cleaning up the server */
223 for (loop = AFS_SERVER_CONN_LIST_SIZE - 1; loop >= 0; loop--)
224 if (server->fs_conn[loop])
225 rxrpc_put_connection(server->fs_conn[loop]);
226
227 if (server->vlserver)
228 rxrpc_put_connection(server->vlserver);
229 259
260 afs_put_cell(server->cell);
230 kfree(server); 261 kfree(server);
262}
231 263
232 _leave(" [destroyed]");
233} /* end afs_server_do_timeout() */
234
235/*****************************************************************************/
236/* 264/*
237 * get a callslot on a connection to the fileserver on the specified server 265 * reap dead server records
238 */ 266 */
239int afs_server_request_callslot(struct afs_server *server, 267static void afs_reap_server(struct work_struct *work)
240 struct afs_server_callslot *callslot)
241{ 268{
242 struct afs_server_callslot *pcallslot; 269 LIST_HEAD(corpses);
243 struct rxrpc_connection *conn; 270 struct afs_server *server;
244 int nconn, ret; 271 unsigned long delay, expiry;
245 272 time_t now;
246 _enter("%p,",server); 273
247 274 now = get_seconds();
248 INIT_LIST_HEAD(&callslot->link); 275 spin_lock(&afs_server_graveyard_lock);
249 callslot->task = current; 276
250 callslot->conn = NULL; 277 while (!list_empty(&afs_server_graveyard)) {
251 callslot->nconn = -1; 278 server = list_entry(afs_server_graveyard.next,
252 callslot->ready = 0; 279 struct afs_server, grave);
253 280
254 ret = 0; 281 /* the queue is ordered most dead first */
255 conn = NULL; 282 expiry = server->time_of_death + afs_server_timeout;
256 283 if (expiry > now) {
257 /* get hold of a callslot first */ 284 delay = (expiry - now) * HZ;
258 spin_lock(&server->fs_lock); 285 if (!schedule_delayed_work(&afs_server_reaper, delay)) {
259 286 cancel_delayed_work(&afs_server_reaper);
 260 /* resurrect the server if its death timeout has expired */ 286 if (!schedule_delayed_work(&afs_server_reaper, delay)) {
261 if (server->fs_state) { 288 delay);
262 if (time_before(jiffies, server->fs_dead_jif)) { 289 }
263 ret = server->fs_state; 290 break;
264 spin_unlock(&server->fs_lock);
265 _leave(" = %d [still dead]", ret);
266 return ret;
267 } 291 }
268 292
269 server->fs_state = 0; 293 write_lock(&server->cell->servers_lock);
270 } 294 write_lock(&afs_servers_lock);
271 295 if (atomic_read(&server->usage) > 0) {
272 /* try and find a connection that has spare callslots */ 296 list_del_init(&server->grave);
273 for (nconn = 0; nconn < AFS_SERVER_CONN_LIST_SIZE; nconn++) { 297 } else {
274 if (server->fs_conn_cnt[nconn] > 0) { 298 list_move_tail(&server->grave, &corpses);
275 server->fs_conn_cnt[nconn]--; 299 list_del_init(&server->link);
276 spin_unlock(&server->fs_lock); 300 rb_erase(&server->master_rb, &afs_servers);
277 callslot->nconn = nconn;
278 goto obtained_slot;
279 } 301 }
302 write_unlock(&afs_servers_lock);
303 write_unlock(&server->cell->servers_lock);
280 } 304 }
281 305
282 /* none were available - wait interruptibly for one to become 306 spin_unlock(&afs_server_graveyard_lock);
283 * available */
284 set_current_state(TASK_INTERRUPTIBLE);
285 list_add_tail(&callslot->link, &server->fs_callq);
286 spin_unlock(&server->fs_lock);
287
288 while (!callslot->ready && !signal_pending(current)) {
289 schedule();
290 set_current_state(TASK_INTERRUPTIBLE);
291 }
292
293 set_current_state(TASK_RUNNING);
294
295 /* even if we were interrupted we may still be queued */
296 if (!callslot->ready) {
297 spin_lock(&server->fs_lock);
298 list_del_init(&callslot->link);
299 spin_unlock(&server->fs_lock);
300 }
301
302 nconn = callslot->nconn;
303 307
304 /* if interrupted, we must release any slot we also got before 308 /* now reap the corpses we've extracted */
305 * returning an error */ 309 while (!list_empty(&corpses)) {
306 if (signal_pending(current)) { 310 server = list_entry(corpses.next, struct afs_server, grave);
307 ret = -EINTR; 311 list_del(&server->grave);
308 goto error_release; 312 afs_destroy_server(server);
309 } 313 }
314}
310 315
311 /* if we were woken up with an error, then pass that error back to the
 312 * caller */
313 if (nconn < 0) {
314 _leave(" = %d", callslot->errno);
315 return callslot->errno;
316 }
317
318 /* were we given a connection directly? */
319 if (callslot->conn) {
320 /* yes - use it */
321 _leave(" = 0 (nc=%d)", nconn);
322 return 0;
323 }
324
325 /* got a callslot, but no connection */
326 obtained_slot:
327
328 /* need to get hold of the RxRPC connection */
329 down_write(&server->sem);
330
331 /* quick check to see if there's an outstanding error */
332 ret = server->fs_state;
333 if (ret)
334 goto error_release_upw;
335
336 if (server->fs_conn[nconn]) {
337 /* reuse an existing connection */
338 rxrpc_get_connection(server->fs_conn[nconn]);
339 callslot->conn = server->fs_conn[nconn];
340 }
341 else {
342 /* create a new connection */
343 ret = rxrpc_create_connection(afs_transport,
344 htons(7000),
345 server->addr.s_addr,
346 FS_SERVICE_ID,
347 NULL,
348 &server->fs_conn[nconn]);
349
350 if (ret < 0)
351 goto error_release_upw;
352
 353 callslot->conn = server->fs_conn[nconn];
354 rxrpc_get_connection(callslot->conn);
355 }
356
357 up_write(&server->sem);
358
359 _leave(" = 0");
360 return 0;
361
362 /* handle an error occurring */
363 error_release_upw:
364 up_write(&server->sem);
365
366 error_release:
367 /* either release the callslot or pass it along to another deserving
368 * task */
369 spin_lock(&server->fs_lock);
370
371 if (nconn < 0) {
372 /* no callslot allocated */
373 }
374 else if (list_empty(&server->fs_callq)) {
375 /* no one waiting */
376 server->fs_conn_cnt[nconn]++;
377 spin_unlock(&server->fs_lock);
378 }
379 else {
380 /* someone's waiting - dequeue them and wake them up */
381 pcallslot = list_entry(server->fs_callq.next,
382 struct afs_server_callslot, link);
383 list_del_init(&pcallslot->link);
384
385 pcallslot->errno = server->fs_state;
386 if (!pcallslot->errno) {
387 /* pass them out callslot details */
388 callslot->conn = xchg(&pcallslot->conn,
389 callslot->conn);
390 pcallslot->nconn = nconn;
391 callslot->nconn = nconn = -1;
392 }
393 pcallslot->ready = 1;
394 wake_up_process(pcallslot->task);
395 spin_unlock(&server->fs_lock);
396 }
397
398 rxrpc_put_connection(callslot->conn);
399 callslot->conn = NULL;
400
401 _leave(" = %d", ret);
402 return ret;
403
404} /* end afs_server_request_callslot() */
405
406/*****************************************************************************/
407/*
408 * release a callslot back to the server
409 * - transfers the RxRPC connection to the next pending callslot if possible
410 */
411void afs_server_release_callslot(struct afs_server *server,
412 struct afs_server_callslot *callslot)
413{
414 struct afs_server_callslot *pcallslot;
415
416 _enter("{ad=%08x,cnt=%u},{%d}",
417 ntohl(server->addr.s_addr),
418 server->fs_conn_cnt[callslot->nconn],
419 callslot->nconn);
420
421 BUG_ON(callslot->nconn < 0);
422
423 spin_lock(&server->fs_lock);
424
425 if (list_empty(&server->fs_callq)) {
426 /* no one waiting */
427 server->fs_conn_cnt[callslot->nconn]++;
428 spin_unlock(&server->fs_lock);
429 }
430 else {
431 /* someone's waiting - dequeue them and wake them up */
432 pcallslot = list_entry(server->fs_callq.next,
433 struct afs_server_callslot, link);
434 list_del_init(&pcallslot->link);
435
436 pcallslot->errno = server->fs_state;
437 if (!pcallslot->errno) {
438 /* pass them out callslot details */
439 callslot->conn = xchg(&pcallslot->conn, callslot->conn);
440 pcallslot->nconn = callslot->nconn;
441 callslot->nconn = -1;
442 }
443
444 pcallslot->ready = 1;
445 wake_up_process(pcallslot->task);
446 spin_unlock(&server->fs_lock);
447 }
448
449 rxrpc_put_connection(callslot->conn);
450
451 _leave("");
452} /* end afs_server_release_callslot() */
453
454/*****************************************************************************/
455/* 316/*
456 * get a handle to a connection to the vlserver (volume location) on the 317 * discard all the server records for rmmod
457 * specified server
458 */ 318 */
459int afs_server_get_vlconn(struct afs_server *server, 319void __exit afs_purge_servers(void)
460 struct rxrpc_connection **_conn)
461{ 320{
462 struct rxrpc_connection *conn; 321 afs_server_timeout = 0;
463 int ret; 322 cancel_delayed_work(&afs_server_reaper);
464 323 schedule_delayed_work(&afs_server_reaper, 0);
465 _enter("%p,", server); 324}
466
467 ret = 0;
468 conn = NULL;
469 down_read(&server->sem);
470
471 if (server->vlserver) {
472 /* reuse an existing connection */
473 rxrpc_get_connection(server->vlserver);
474 conn = server->vlserver;
475 up_read(&server->sem);
476 }
477 else {
478 /* create a new connection */
479 up_read(&server->sem);
480 down_write(&server->sem);
481 if (!server->vlserver) {
482 ret = rxrpc_create_connection(afs_transport,
483 htons(7003),
484 server->addr.s_addr,
485 VL_SERVICE_ID,
486 NULL,
487 &server->vlserver);
488 }
489 if (ret == 0) {
490 rxrpc_get_connection(server->vlserver);
491 conn = server->vlserver;
492 }
493 up_write(&server->sem);
494 }
495
496 *_conn = conn;
497 _leave(" = %d", ret);
498 return ret;
499} /* end afs_server_get_vlconn() */
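
The new graveyard scheme on the right-hand side replaces the per-server kafstimod timers with one delayed work item: afs_put_server() timestamps the corpse, and the reaper walks the oldest-first queue, destroying what has expired and re-arming itself for the rest. The core of that walk, reduced to a userspace sketch (locking and the kernel list API omitted):

#include <time.h>

struct server {
	int usage;                      /* refcount; 0 means reapable */
	time_t time_of_death;           /* set when usage first hits 0 */
	struct server *grave;           /* graveyard queue linkage */
};

#define SERVER_TIMEOUT 10               /* seconds, as afs_server_timeout */

static struct server *graveyard;        /* oldest corpse first */

/* reap expired corpses; returns the number of seconds until the next
 * corpse expires (the delay to re-arm the reaper with), or 0 if the
 * queue has been emptied */
static long reap(time_t now, void (*destroy)(struct server *))
{
	while (graveyard) {
		struct server *s = graveyard;
		time_t expiry = s->time_of_death + SERVER_TIMEOUT;

		if (expiry > now)
			return expiry - now;   /* nothing later is ripe yet */
		graveyard = s->grave;          /* unlink the head */
		if (s->usage == 0)             /* not resurrected meanwhile */
			destroy(s);
	}
	return 0;
}
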
diff --git a/fs/afs/server.h b/fs/afs/server.h
deleted file mode 100644
index c3d24115578f..000000000000
--- a/fs/afs/server.h
+++ /dev/null
@@ -1,102 +0,0 @@
1/* server.h: AFS server record
2 *
3 * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#ifndef _LINUX_AFS_SERVER_H
13#define _LINUX_AFS_SERVER_H
14
15#include "types.h"
16#include "kafstimod.h"
17#include <rxrpc/peer.h>
18#include <linux/rwsem.h>
19
20extern spinlock_t afs_server_peer_lock;
21
22/*****************************************************************************/
23/*
24 * AFS server record
25 */
26struct afs_server
27{
28 atomic_t usage;
29 struct afs_cell *cell; /* cell in which server resides */
30 struct list_head link; /* link in cell's server list */
31 struct rw_semaphore sem; /* access lock */
32 struct afs_timer timeout; /* graveyard timeout */
33 struct in_addr addr; /* server address */
34 struct rxrpc_peer *peer; /* peer record for this server */
35 struct rxrpc_connection *vlserver; /* connection to the volume location service */
36
37 /* file service access */
38#define AFS_SERVER_CONN_LIST_SIZE 2
39 struct rxrpc_connection *fs_conn[AFS_SERVER_CONN_LIST_SIZE]; /* FS connections */
40 unsigned fs_conn_cnt[AFS_SERVER_CONN_LIST_SIZE]; /* per conn call count */
41 struct list_head fs_callq; /* queue of processes waiting to make a call */
42 spinlock_t fs_lock; /* access lock */
43 int fs_state; /* 0 or reason FS currently marked dead (-errno) */
44 unsigned fs_rtt; /* FS round trip time */
45 unsigned long fs_act_jif; /* time at which last activity occurred */
46 unsigned long fs_dead_jif; /* time at which no longer to be considered dead */
47
48 /* callback promise management */
49 struct list_head cb_promises; /* as yet unbroken promises from this server */
50 spinlock_t cb_lock; /* access lock */
51};
52
53extern int afs_server_lookup(struct afs_cell *cell,
54 const struct in_addr *addr,
55 struct afs_server **_server);
56
57#define afs_get_server(S) do { atomic_inc(&(S)->usage); } while(0)
58
59extern void afs_put_server(struct afs_server *server);
60extern void afs_server_do_timeout(struct afs_server *server);
61
62extern int afs_server_find_by_peer(const struct rxrpc_peer *peer,
63 struct afs_server **_server);
64
65extern int afs_server_get_vlconn(struct afs_server *server,
66 struct rxrpc_connection **_conn);
67
68static inline
69struct afs_server *afs_server_get_from_peer(struct rxrpc_peer *peer)
70{
71 struct afs_server *server;
72
73 spin_lock(&afs_server_peer_lock);
74 server = peer->user;
75 if (server)
76 afs_get_server(server);
77 spin_unlock(&afs_server_peer_lock);
78
79 return server;
80}
81
82/*****************************************************************************/
83/*
84 * AFS server callslot grant record
85 */
86struct afs_server_callslot
87{
88 struct list_head link; /* link in server's list */
89 struct task_struct *task; /* process waiting to make call */
90 struct rxrpc_connection *conn; /* connection to use (or NULL on error) */
91 short nconn; /* connection slot number (-1 on error) */
92 char ready; /* T when ready */
93 int errno; /* error number if nconn==-1 */
94};
95
96extern int afs_server_request_callslot(struct afs_server *server,
97 struct afs_server_callslot *callslot);
98
99extern void afs_server_release_callslot(struct afs_server *server,
100 struct afs_server_callslot *callslot);
101
102#endif /* _LINUX_AFS_SERVER_H */
diff --git a/fs/afs/super.c b/fs/afs/super.c
index eb7e32349da3..7030d76155fc 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -1,5 +1,6 @@
1/* 1/* AFS superblock handling
2 * Copyright (c) 2002 Red Hat, Inc. All rights reserved. 2 *
3 * Copyright (c) 2002, 2007 Red Hat, Inc. All rights reserved.
3 * 4 *
4 * This software may be freely redistributed under the terms of the 5 * This software may be freely redistributed under the terms of the
5 * GNU General Public License. 6 * GNU General Public License.
@@ -9,7 +10,7 @@
9 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 10 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
10 * 11 *
11 * Authors: David Howells <dhowells@redhat.com> 12 * Authors: David Howells <dhowells@redhat.com>
12 * David Woodhouse <dwmw2@cambridge.redhat.com> 13 * David Woodhouse <dwmw2@redhat.com>
13 * 14 *
14 */ 15 */
15 16
@@ -19,22 +20,11 @@
19#include <linux/slab.h> 20#include <linux/slab.h>
20#include <linux/fs.h> 21#include <linux/fs.h>
21#include <linux/pagemap.h> 22#include <linux/pagemap.h>
22#include "vnode.h" 23#include <linux/parser.h>
23#include "volume.h"
24#include "cell.h"
25#include "cmservice.h"
26#include "fsclient.h"
27#include "super.h"
28#include "internal.h" 24#include "internal.h"
29 25
30#define AFS_FS_MAGIC 0x6B414653 /* 'kAFS' */ 26#define AFS_FS_MAGIC 0x6B414653 /* 'kAFS' */
31 27
32struct afs_mount_params {
33 int rwpath;
34 struct afs_cell *default_cell;
35 struct afs_volume *volume;
36};
37
38static void afs_i_init_once(void *foo, struct kmem_cache *cachep, 28static void afs_i_init_once(void *foo, struct kmem_cache *cachep,
39 unsigned long flags); 29 unsigned long flags);
40 30
@@ -53,7 +43,7 @@ struct file_system_type afs_fs_type = {
53 .name = "afs", 43 .name = "afs",
54 .get_sb = afs_get_sb, 44 .get_sb = afs_get_sb,
55 .kill_sb = kill_anon_super, 45 .kill_sb = kill_anon_super,
56 .fs_flags = FS_BINARY_MOUNTDATA, 46 .fs_flags = 0,
57}; 47};
58 48
59static const struct super_operations afs_super_ops = { 49static const struct super_operations afs_super_ops = {
@@ -62,13 +52,27 @@ static const struct super_operations afs_super_ops = {
62 .drop_inode = generic_delete_inode, 52 .drop_inode = generic_delete_inode,
63 .destroy_inode = afs_destroy_inode, 53 .destroy_inode = afs_destroy_inode,
64 .clear_inode = afs_clear_inode, 54 .clear_inode = afs_clear_inode,
55 .umount_begin = afs_umount_begin,
65 .put_super = afs_put_super, 56 .put_super = afs_put_super,
66}; 57};
67 58
68static struct kmem_cache *afs_inode_cachep; 59static struct kmem_cache *afs_inode_cachep;
69static atomic_t afs_count_active_inodes; 60static atomic_t afs_count_active_inodes;
70 61
71/*****************************************************************************/ 62enum {
63 afs_no_opt,
64 afs_opt_cell,
65 afs_opt_rwpath,
66 afs_opt_vol,
67};
68
69static const match_table_t afs_options_list = {
70 { afs_opt_cell, "cell=%s" },
71 { afs_opt_rwpath, "rwpath" },
72 { afs_opt_vol, "vol=%s" },
73 { afs_no_opt, NULL },
74};
75
72/* 76/*
73 * initialise the filesystem 77 * initialise the filesystem
74 */ 78 */
@@ -78,8 +82,6 @@ int __init afs_fs_init(void)
78 82
79 _enter(""); 83 _enter("");
80 84
81 afs_timer_init(&afs_mntpt_expiry_timer, &afs_mntpt_expiry_timer_ops);
82
83 /* create ourselves an inode cache */ 85 /* create ourselves an inode cache */
84 atomic_set(&afs_count_active_inodes, 0); 86 atomic_set(&afs_count_active_inodes, 0);
85 87
@@ -99,20 +101,22 @@ int __init afs_fs_init(void)
99 ret = register_filesystem(&afs_fs_type); 101 ret = register_filesystem(&afs_fs_type);
100 if (ret < 0) { 102 if (ret < 0) {
101 kmem_cache_destroy(afs_inode_cachep); 103 kmem_cache_destroy(afs_inode_cachep);
102 kleave(" = %d", ret); 104 _leave(" = %d", ret);
103 return ret; 105 return ret;
104 } 106 }
105 107
106 kleave(" = 0"); 108 _leave(" = 0");
107 return 0; 109 return 0;
108} /* end afs_fs_init() */ 110}
109 111
110/*****************************************************************************/
111/* 112/*
112 * clean up the filesystem 113 * clean up the filesystem
113 */ 114 */
114void __exit afs_fs_exit(void) 115void __exit afs_fs_exit(void)
115{ 116{
117 _enter("");
118
119 afs_mntpt_kill_timer();
116 unregister_filesystem(&afs_fs_type); 120 unregister_filesystem(&afs_fs_type);
117 121
118 if (atomic_read(&afs_count_active_inodes) != 0) { 122 if (atomic_read(&afs_count_active_inodes) != 0) {
@@ -122,99 +126,153 @@ void __exit afs_fs_exit(void)
122 } 126 }
123 127
124 kmem_cache_destroy(afs_inode_cachep); 128 kmem_cache_destroy(afs_inode_cachep);
129 _leave("");
130}
125 131
126} /* end afs_fs_exit() */
127
128/*****************************************************************************/
129/*
130 * check that an argument has a value
131 */
132static int want_arg(char **_value, const char *option)
133{
134 if (!_value || !*_value || !**_value) {
135 printk(KERN_NOTICE "kAFS: %s: argument missing\n", option);
136 return 0;
137 }
138 return 1;
139} /* end want_arg() */
140
141/*****************************************************************************/
142/*
143 * check that there's no subsequent value
144 */
145static int want_no_value(char *const *_value, const char *option)
146{
147 if (*_value && **_value) {
148 printk(KERN_NOTICE "kAFS: %s: Invalid argument: %s\n",
149 option, *_value);
150 return 0;
151 }
152 return 1;
153} /* end want_no_value() */
154
155/*****************************************************************************/
156/* 132/*
157 * parse the mount options 133 * parse the mount options
158 * - this function has been shamelessly adapted from the ext3 fs which 134 * - this function has been shamelessly adapted from the ext3 fs which
159 * shamelessly adapted it from the msdos fs 135 * shamelessly adapted it from the msdos fs
160 */ 136 */
161static int afs_super_parse_options(struct afs_mount_params *params, 137static int afs_parse_options(struct afs_mount_params *params,
162 char *options, 138 char *options, const char **devname)
163 const char **devname)
164{ 139{
165 char *key, *value; 140 struct afs_cell *cell;
166 int ret; 141 substring_t args[MAX_OPT_ARGS];
142 char *p;
143 int token;
167 144
168 _enter("%s", options); 145 _enter("%s", options);
169 146
170 options[PAGE_SIZE - 1] = 0; 147 options[PAGE_SIZE - 1] = 0;
171 148
172 ret = 0; 149 while ((p = strsep(&options, ","))) {
173 while ((key = strsep(&options, ",")) != 0) 150 if (!*p)
174 { 151 continue;
175 value = strchr(key, '=');
176 if (value)
177 *value++ = 0;
178
179 printk("kAFS: KEY: %s, VAL:%s\n", key, value ?: "-");
180 152
181 if (strcmp(key, "rwpath") == 0) { 153 token = match_token(p, afs_options_list, args);
182 if (!want_no_value(&value, "rwpath")) 154 switch (token) {
183 return -EINVAL; 155 case afs_opt_cell:
156 cell = afs_cell_lookup(args[0].from,
157 args[0].to - args[0].from);
158 if (IS_ERR(cell))
159 return PTR_ERR(cell);
160 afs_put_cell(params->cell);
161 params->cell = cell;
162 break;
163
164 case afs_opt_rwpath:
184 params->rwpath = 1; 165 params->rwpath = 1;
185 continue; 166 break;
186 } 167
187 else if (strcmp(key, "vol") == 0) { 168 case afs_opt_vol:
188 if (!want_arg(&value, "vol")) 169 *devname = args[0].from;
189 return -EINVAL; 170 break;
190 *devname = value; 171
191 continue; 172 default:
173 printk(KERN_ERR "kAFS:"
174 " Unknown or invalid mount option: '%s'\n", p);
175 return -EINVAL;
192 } 176 }
193 else if (strcmp(key, "cell") == 0) { 177 }
194 if (!want_arg(&value, "cell")) 178
195 return -EINVAL; 179 _leave(" = 0");
196 afs_put_cell(params->default_cell); 180 return 0;
197 ret = afs_cell_lookup(value, 181}
198 strlen(value), 182
199 &params->default_cell); 183/*
200 if (ret < 0) 184 * parse a device name to get cell name, volume name, volume type and R/W
201 return -EINVAL; 185 * selector
202 continue; 186 * - this can be one of the following:
187 * "%[cell:]volume[.]" R/W volume
188 * "#[cell:]volume[.]" R/O or R/W volume (rwpath=0),
189 * or R/W (rwpath=1) volume
190 * "%[cell:]volume.readonly" R/O volume
191 * "#[cell:]volume.readonly" R/O volume
192 * "%[cell:]volume.backup" Backup volume
193 * "#[cell:]volume.backup" Backup volume
194 */
195static int afs_parse_device_name(struct afs_mount_params *params,
196 const char *name)
197{
198 struct afs_cell *cell;
199 const char *cellname, *suffix;
200 int cellnamesz;
201
202 _enter(",%s", name);
203
204 if (!name) {
205 printk(KERN_ERR "kAFS: no volume name specified\n");
206 return -EINVAL;
207 }
208
209 if ((name[0] != '%' && name[0] != '#') || !name[1]) {
210 printk(KERN_ERR "kAFS: unparsable volume name\n");
211 return -EINVAL;
212 }
213
214 /* determine the type of volume we're looking for */
215 params->type = AFSVL_ROVOL;
216 params->force = false;
217 if (params->rwpath || name[0] == '%') {
218 params->type = AFSVL_RWVOL;
219 params->force = true;
220 }
221 name++;
222
223 /* split the cell name out if there is one */
224 params->volname = strchr(name, ':');
225 if (params->volname) {
226 cellname = name;
227 cellnamesz = params->volname - name;
228 params->volname++;
229 } else {
230 params->volname = name;
231 cellname = NULL;
232 cellnamesz = 0;
233 }
234
235 /* the volume type is further affected by a possible suffix */
236 suffix = strrchr(params->volname, '.');
237 if (suffix) {
238 if (strcmp(suffix, ".readonly") == 0) {
239 params->type = AFSVL_ROVOL;
240 params->force = true;
241 } else if (strcmp(suffix, ".backup") == 0) {
242 params->type = AFSVL_BACKVOL;
243 params->force = true;
244 } else if (suffix[1] == 0) {
245 } else {
246 suffix = NULL;
203 } 247 }
248 }
204 249
205 printk("kAFS: Unknown mount option: '%s'\n", key); 250 params->volnamesz = suffix ?
206 ret = -EINVAL; 251 suffix - params->volname : strlen(params->volname);
207 goto error; 252
253 _debug("cell %*.*s [%p]",
254 cellnamesz, cellnamesz, cellname ?: "", params->cell);
255
256 /* lookup the cell record */
257 if (cellname || !params->cell) {
258 cell = afs_cell_lookup(cellname, cellnamesz);
259 if (IS_ERR(cell)) {
260 printk(KERN_ERR "kAFS: unable to lookup cell '%s'\n",
261 cellname ?: "");
262 return PTR_ERR(cell);
263 }
264 afs_put_cell(params->cell);
265 params->cell = cell;
208 } 266 }
209 267
210 ret = 0; 268 _debug("CELL:%s [%p] VOLUME:%*.*s SUFFIX:%s TYPE:%d%s",
269 params->cell->name, params->cell,
270 params->volnamesz, params->volnamesz, params->volname,
271 suffix ?: "-", params->type, params->force ? " FORCE" : "");
211 272
212 error: 273 return 0;
213 _leave(" = %d", ret); 274}
214 return ret;
215} /* end afs_super_parse_options() */
216 275
217/*****************************************************************************/
218/* 276/*
219 * check a superblock to see if it's the one we're looking for 277 * check a superblock to see if it's the one we're looking for
220 */ 278 */
@@ -224,13 +282,12 @@ static int afs_test_super(struct super_block *sb, void *data)
224 struct afs_super_info *as = sb->s_fs_info; 282 struct afs_super_info *as = sb->s_fs_info;
225 283
226 return as->volume == params->volume; 284 return as->volume == params->volume;
227} /* end afs_test_super() */ 285}
228 286
229/*****************************************************************************/
230/* 287/*
231 * fill in the superblock 288 * fill in the superblock
232 */ 289 */
233static int afs_fill_super(struct super_block *sb, void *data, int silent) 290static int afs_fill_super(struct super_block *sb, void *data)
234{ 291{
235 struct afs_mount_params *params = data; 292 struct afs_mount_params *params = data;
236 struct afs_super_info *as = NULL; 293 struct afs_super_info *as = NULL;
@@ -239,7 +296,7 @@ static int afs_fill_super(struct super_block *sb, void *data, int silent)
239 struct inode *inode = NULL; 296 struct inode *inode = NULL;
240 int ret; 297 int ret;
241 298
242 kenter(""); 299 _enter("");
243 300
244 /* allocate a superblock info record */ 301 /* allocate a superblock info record */
245 as = kzalloc(sizeof(struct afs_super_info), GFP_KERNEL); 302 as = kzalloc(sizeof(struct afs_super_info), GFP_KERNEL);
@@ -262,9 +319,9 @@ static int afs_fill_super(struct super_block *sb, void *data, int silent)
262 fid.vid = as->volume->vid; 319 fid.vid = as->volume->vid;
263 fid.vnode = 1; 320 fid.vnode = 1;
264 fid.unique = 1; 321 fid.unique = 1;
265 ret = afs_iget(sb, &fid, &inode); 322 inode = afs_iget(sb, params->key, &fid, NULL, NULL);
266 if (ret < 0) 323 if (IS_ERR(inode))
267 goto error; 324 goto error_inode;
268 325
269 ret = -ENOMEM; 326 ret = -ENOMEM;
270 root = d_alloc_root(inode); 327 root = d_alloc_root(inode);
@@ -273,24 +330,25 @@ static int afs_fill_super(struct super_block *sb, void *data, int silent)
273 330
274 sb->s_root = root; 331 sb->s_root = root;
275 332
276 kleave(" = 0"); 333 _leave(" = 0");
277 return 0; 334 return 0;
278 335
279 error: 336error_inode:
337 ret = PTR_ERR(inode);
338 inode = NULL;
339error:
280 iput(inode); 340 iput(inode);
281 afs_put_volume(as->volume); 341 afs_put_volume(as->volume);
282 kfree(as); 342 kfree(as);
283 343
284 sb->s_fs_info = NULL; 344 sb->s_fs_info = NULL;
285 345
286 kleave(" = %d", ret); 346 _leave(" = %d", ret);
287 return ret; 347 return ret;
288} /* end afs_fill_super() */ 348}
289 349
290/*****************************************************************************/
291/* 350/*
292 * get an AFS superblock 351 * get an AFS superblock
293 * - TODO: don't use get_sb_nodev(), but rather call sget() directly
294 */ 352 */
295static int afs_get_sb(struct file_system_type *fs_type, 353static int afs_get_sb(struct file_system_type *fs_type,
296 int flags, 354 int flags,
@@ -300,69 +358,79 @@ static int afs_get_sb(struct file_system_type *fs_type,
300{ 358{
301 struct afs_mount_params params; 359 struct afs_mount_params params;
302 struct super_block *sb; 360 struct super_block *sb;
361 struct afs_volume *vol;
362 struct key *key;
303 int ret; 363 int ret;
304 364
305 _enter(",,%s,%p", dev_name, options); 365 _enter(",,%s,%p", dev_name, options);
306 366
307 memset(&params, 0, sizeof(params)); 367 memset(&params, 0, sizeof(params));
308 368
309 /* start the cache manager */ 369 /* parse the options and device name */
310 ret = afscm_start();
311 if (ret < 0) {
312 _leave(" = %d", ret);
313 return ret;
314 }
315
316 /* parse the options */
317 if (options) { 370 if (options) {
318 ret = afs_super_parse_options(&params, options, &dev_name); 371 ret = afs_parse_options(&params, options, &dev_name);
319 if (ret < 0) 372 if (ret < 0)
320 goto error; 373 goto error;
321 if (!dev_name) {
322 printk("kAFS: no volume name specified\n");
323 ret = -EINVAL;
324 goto error;
325 }
326 } 374 }
327 375
328 /* parse the device name */ 376 ret = afs_parse_device_name(&params, dev_name);
329 ret = afs_volume_lookup(dev_name,
330 params.default_cell,
331 params.rwpath,
332 &params.volume);
333 if (ret < 0) 377 if (ret < 0)
334 goto error; 378 goto error;
335 379
336 /* allocate a deviceless superblock */ 380 /* try and do the mount securely */
337 sb = sget(fs_type, afs_test_super, set_anon_super, &params); 381 key = afs_request_key(params.cell);
338 if (IS_ERR(sb)) 382 if (IS_ERR(key)) {
383 _leave(" = %ld [key]", PTR_ERR(key));
384 ret = PTR_ERR(key);
339 goto error; 385 goto error;
386 }
387 params.key = key;
340 388
341 sb->s_flags = flags; 389 /* parse the device name */
390 vol = afs_volume_lookup(&params);
391 if (IS_ERR(vol)) {
392 ret = PTR_ERR(vol);
393 goto error;
394 }
395 params.volume = vol;
342 396
343 ret = afs_fill_super(sb, &params, flags & MS_SILENT ? 1 : 0); 397 /* allocate a deviceless superblock */
344 if (ret < 0) { 398 sb = sget(fs_type, afs_test_super, set_anon_super, &params);
345 up_write(&sb->s_umount); 399 if (IS_ERR(sb)) {
346 deactivate_super(sb); 400 ret = PTR_ERR(sb);
347 goto error; 401 goto error;
348 } 402 }
349 sb->s_flags |= MS_ACTIVE;
350 simple_set_mnt(mnt, sb);
351 403
404 if (!sb->s_root) {
405 /* initial superblock/root creation */
406 _debug("create");
407 sb->s_flags = flags;
408 ret = afs_fill_super(sb, &params);
409 if (ret < 0) {
410 up_write(&sb->s_umount);
411 deactivate_super(sb);
412 goto error;
413 }
414 sb->s_flags |= MS_ACTIVE;
415 } else {
416 _debug("reuse");
417 ASSERTCMP(sb->s_flags, &, MS_ACTIVE);
418 }
419
420 simple_set_mnt(mnt, sb);
352 afs_put_volume(params.volume); 421 afs_put_volume(params.volume);
353 afs_put_cell(params.default_cell); 422 afs_put_cell(params.cell);
354 _leave(" = 0 [%p]", 0, sb); 423 _leave(" = 0 [%p]", sb);
355 return 0; 424 return 0;
356 425
357 error: 426error:
358 afs_put_volume(params.volume); 427 afs_put_volume(params.volume);
359 afs_put_cell(params.default_cell); 428 afs_put_cell(params.cell);
360 afscm_stop(); 429 key_put(params.key);
361 _leave(" = %d", ret); 430 _leave(" = %d", ret);
362 return ret; 431 return ret;
363} /* end afs_get_sb() */ 432}
364 433
365/*****************************************************************************/
366/* 434/*
367 * finish the unmounting process on the superblock 435 * finish the unmounting process on the superblock
368 */ 436 */
@@ -373,35 +441,29 @@ static void afs_put_super(struct super_block *sb)
373 _enter(""); 441 _enter("");
374 442
375 afs_put_volume(as->volume); 443 afs_put_volume(as->volume);
376 afscm_stop();
377 444
378 _leave(""); 445 _leave("");
379} /* end afs_put_super() */ 446}
380 447
381/*****************************************************************************/
382/* 448/*
383 * initialise an inode cache slab element prior to any use 449 * initialise an inode cache slab element prior to any use
384 */ 450 */
385static void afs_i_init_once(void *_vnode, struct kmem_cache *cachep, 451static void afs_i_init_once(void *_vnode, struct kmem_cache *cachep,
386 unsigned long flags) 452 unsigned long flags)
387{ 453{
388 struct afs_vnode *vnode = (struct afs_vnode *) _vnode; 454 struct afs_vnode *vnode = _vnode;
389 455
390 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == 456 if (flags & SLAB_CTOR_CONSTRUCTOR) {
391 SLAB_CTOR_CONSTRUCTOR) {
392 memset(vnode, 0, sizeof(*vnode)); 457 memset(vnode, 0, sizeof(*vnode));
393 inode_init_once(&vnode->vfs_inode); 458 inode_init_once(&vnode->vfs_inode);
394 init_waitqueue_head(&vnode->update_waitq); 459 init_waitqueue_head(&vnode->update_waitq);
460 mutex_init(&vnode->permits_lock);
461 mutex_init(&vnode->validate_lock);
395 spin_lock_init(&vnode->lock); 462 spin_lock_init(&vnode->lock);
396 INIT_LIST_HEAD(&vnode->cb_link); 463 INIT_WORK(&vnode->cb_broken_work, afs_broken_callback_work);
397 INIT_LIST_HEAD(&vnode->cb_hash_link);
398 afs_timer_init(&vnode->cb_timeout,
399 &afs_vnode_cb_timed_out_ops);
400 } 464 }
465}
401 466
402} /* end afs_i_init_once() */
403
404/*****************************************************************************/
405/* 467/*
406 * allocate an AFS inode struct from our slab cache 468 * allocate an AFS inode struct from our slab cache
407 */ 469 */
@@ -409,8 +471,7 @@ static struct inode *afs_alloc_inode(struct super_block *sb)
409{ 471{
410 struct afs_vnode *vnode; 472 struct afs_vnode *vnode;
411 473
412 vnode = (struct afs_vnode *) 474 vnode = kmem_cache_alloc(afs_inode_cachep, GFP_KERNEL);
413 kmem_cache_alloc(afs_inode_cachep, GFP_KERNEL);
414 if (!vnode) 475 if (!vnode)
415 return NULL; 476 return NULL;
416 477
@@ -421,21 +482,25 @@ static struct inode *afs_alloc_inode(struct super_block *sb)
421 482
422 vnode->volume = NULL; 483 vnode->volume = NULL;
423 vnode->update_cnt = 0; 484 vnode->update_cnt = 0;
424 vnode->flags = 0; 485 vnode->flags = 1 << AFS_VNODE_UNSET;
486 vnode->cb_promised = false;
425 487
426 return &vnode->vfs_inode; 488 return &vnode->vfs_inode;
427} /* end afs_alloc_inode() */ 489}
428 490
429/*****************************************************************************/
430/* 491/*
431 * destroy an AFS inode struct 492 * destroy an AFS inode struct
432 */ 493 */
433static void afs_destroy_inode(struct inode *inode) 494static void afs_destroy_inode(struct inode *inode)
434{ 495{
496 struct afs_vnode *vnode = AFS_FS_I(inode);
497
435 _enter("{%lu}", inode->i_ino); 498 _enter("{%lu}", inode->i_ino);
436 499
437 kmem_cache_free(afs_inode_cachep, AFS_FS_I(inode)); 500 _debug("DESTROY INODE %p", inode);
438 501
439 atomic_dec(&afs_count_active_inodes); 502 ASSERTCMP(vnode->server, ==, NULL);
440 503
441} /* end afs_destroy_inode() */ 504 kmem_cache_free(afs_inode_cachep, vnode);
505 atomic_dec(&afs_count_active_inodes);
506}
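
The device-name grammar documented in afs_parse_device_name() above is compact but fiddly. A self-contained parser following the same rules, runnable in userspace, shows how a name decomposes ("example.com" and "root.cell" are illustrative; output buffers are assumed to be 64 bytes):

#include <stdio.h>
#include <string.h>

enum voltype { RWVOL, ROVOL, BACKVOL };

/* standalone restatement of the grammar: '%' forces R/W, '#' prefers
 * R/O unless rwpath is set; optional "cell:" prefix; an optional
 * ".readonly"/".backup" suffix overrides the type, and a bare trailing
 * dot is stripped */
static int parse_device_name(const char *name, int rwpath,
			     char *cell, char *vol, enum voltype *type)
{
	const char *colon, *suffix;
	size_t vlen;

	if ((name[0] != '%' && name[0] != '#') || !name[1])
		return -1;
	*type = (rwpath || name[0] == '%') ? RWVOL : ROVOL;
	name++;

	cell[0] = '\0';
	colon = strchr(name, ':');
	if (colon) {
		snprintf(cell, 64, "%.*s", (int)(colon - name), name);
		name = colon + 1;
	}

	vlen = strlen(name);
	suffix = strrchr(name, '.');
	if (suffix) {
		if (strcmp(suffix, ".readonly") == 0)
			*type = ROVOL;
		else if (strcmp(suffix, ".backup") == 0)
			*type = BACKVOL;
		else if (suffix[1] != '\0')
			suffix = NULL;  /* the dot is part of the name */
		if (suffix)
			vlen = suffix - name;
	}
	snprintf(vol, 64, "%.*s", (int)vlen, name);
	return 0;
}

int main(void)
{
	char cell[64], vol[64];
	enum voltype type;

	parse_device_name("#example.com:root.cell.readonly", 0,
			  cell, vol, &type);
	printf("cell=%s vol=%s type=%d\n", cell, vol, type);
	/* prints: cell=example.com vol=root.cell type=1 (ROVOL) */
	return 0;
}
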
diff --git a/fs/afs/super.h b/fs/afs/super.h
deleted file mode 100644
index 32de8cc6fae8..000000000000
--- a/fs/afs/super.h
+++ /dev/null
@@ -1,45 +0,0 @@
1/* super.h: AFS filesystem internal private data
2 *
3 * Copyright (c) 2002 Red Hat, Inc. All rights reserved.
4 *
5 * This software may be freely redistributed under the terms of the
6 * GNU General Public License.
7 *
8 * You should have received a copy of the GNU General Public License
9 * along with this program; if not, write to the Free Software
10 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
11 *
12 * Authors: David Woodhouse <dwmw2@cambridge.redhat.com>
13 * David Howells <dhowells@redhat.com>
14 *
15 */
16
17#ifndef _LINUX_AFS_SUPER_H
18#define _LINUX_AFS_SUPER_H
19
20#include <linux/fs.h>
21#include "server.h"
22
23#ifdef __KERNEL__
24
25/*****************************************************************************/
26/*
27 * AFS superblock private data
28 * - there's one superblock per volume
29 */
30struct afs_super_info
31{
32 struct afs_volume *volume; /* volume record */
33 char rwparent; /* T if parent is R/W AFS volume */
34};
35
36static inline struct afs_super_info *AFS_FS_S(struct super_block *sb)
37{
38 return sb->s_fs_info;
39}
40
41extern struct file_system_type afs_fs_type;
42
43#endif /* __KERNEL__ */
44
45#endif /* _LINUX_AFS_SUPER_H */
diff --git a/fs/afs/transport.h b/fs/afs/transport.h
deleted file mode 100644
index 7013ae6ccc8c..000000000000
--- a/fs/afs/transport.h
+++ /dev/null
@@ -1,21 +0,0 @@
1/* transport.h: AFS transport management
2 *
3 * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#ifndef _LINUX_AFS_TRANSPORT_H
13#define _LINUX_AFS_TRANSPORT_H
14
15#include "types.h"
16#include <rxrpc/transport.h>
17
18/* the cache manager transport endpoint */
19extern struct rxrpc_transport *afs_transport;
20
21#endif /* _LINUX_AFS_TRANSPORT_H */
diff --git a/fs/afs/types.h b/fs/afs/types.h
deleted file mode 100644
index b1a2367c7587..000000000000
--- a/fs/afs/types.h
+++ /dev/null
@@ -1,125 +0,0 @@
1/* types.h: AFS types
2 *
3 * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#ifndef _LINUX_AFS_TYPES_H
13#define _LINUX_AFS_TYPES_H
14
15#ifdef __KERNEL__
16#include <rxrpc/types.h>
17#endif /* __KERNEL__ */
18
19typedef unsigned afs_volid_t;
20typedef unsigned afs_vnodeid_t;
21typedef unsigned long long afs_dataversion_t;
22
23typedef enum {
24 AFSVL_RWVOL, /* read/write volume */
25 AFSVL_ROVOL, /* read-only volume */
26 AFSVL_BACKVOL, /* backup volume */
27} __attribute__((packed)) afs_voltype_t;
28
29typedef enum {
30 AFS_FTYPE_INVALID = 0,
31 AFS_FTYPE_FILE = 1,
32 AFS_FTYPE_DIR = 2,
33 AFS_FTYPE_SYMLINK = 3,
34} afs_file_type_t;
35
36#ifdef __KERNEL__
37
38struct afs_cell;
39struct afs_vnode;
40
41/*****************************************************************************/
42/*
43 * AFS file identifier
44 */
45struct afs_fid
46{
47 afs_volid_t vid; /* volume ID */
48 afs_vnodeid_t vnode; /* file index within volume */
49 unsigned unique; /* unique ID number (file index version) */
50};
51
52/*****************************************************************************/
53/*
54 * AFS callback notification
55 */
56typedef enum {
57 AFSCM_CB_UNTYPED = 0, /* no type set on CB break */
58 AFSCM_CB_EXCLUSIVE = 1, /* CB exclusive to CM [not implemented] */
59 AFSCM_CB_SHARED = 2, /* CB shared by other CM's */
60 AFSCM_CB_DROPPED = 3, /* CB promise cancelled by file server */
61} afs_callback_type_t;
62
63struct afs_callback
64{
65 struct afs_server *server; /* server that made the promise */
66 struct afs_fid fid; /* file identifier */
67 unsigned version; /* callback version */
68 unsigned expiry; /* time at which expires */
69 afs_callback_type_t type; /* type of callback */
70};
71
72#define AFSCBMAX 50
73
74/*****************************************************************************/
75/*
76 * AFS volume information
77 */
78struct afs_volume_info
79{
80 afs_volid_t vid; /* volume ID */
81 afs_voltype_t type; /* type of this volume */
82 afs_volid_t type_vids[5]; /* volume ID's for possible types for this vol */
83
84 /* list of fileservers serving this volume */
85 size_t nservers; /* number of entries used in servers[] */
86 struct {
87 struct in_addr addr; /* fileserver address */
88 } servers[8];
89};
90
91/*****************************************************************************/
92/*
93 * AFS file status information
94 */
95struct afs_file_status
96{
97 unsigned if_version; /* interface version */
98#define AFS_FSTATUS_VERSION 1
99
100 afs_file_type_t type; /* file type */
101 unsigned nlink; /* link count */
102 size_t size; /* file size */
103 afs_dataversion_t version; /* current data version */
104 unsigned author; /* author ID */
105 unsigned owner; /* owner ID */
106 unsigned caller_access; /* access rights for authenticated caller */
107 unsigned anon_access; /* access rights for unauthenticated caller */
108 umode_t mode; /* UNIX mode */
109 struct afs_fid parent; /* parent file ID */
110 time_t mtime_client; /* last time client changed data */
111 time_t mtime_server; /* last time server changed data */
112};
113
114/*****************************************************************************/
115/*
116 * AFS volume synchronisation information
117 */
118struct afs_volsync
119{
120 time_t creation; /* volume creation time */
121};
122
123#endif /* __KERNEL__ */
124
125#endif /* _LINUX_AFS_TYPES_H */
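
The load-bearing type in this deleted header is struct afs_fid: a volume ID, a vnode index within the volume, and a uniquifier so that a new file reusing a freed vnode slot is distinguishable from the old occupant. A small sketch of the triple used as an ordered lookup key (the comparison order is an assumption for illustration, not AFS-mandated):

#include <stdio.h>

typedef unsigned afs_volid_t;
typedef unsigned afs_vnodeid_t;

struct afs_fid {
        afs_volid_t   vid;              /* volume ID */
        afs_vnodeid_t vnode;            /* file index within volume */
        unsigned      unique;           /* bumped when a vnode slot is reused */
};

static int afs_fid_cmp(const struct afs_fid *a, const struct afs_fid *b)
{
        if (a->vid != b->vid)           return a->vid < b->vid ? -1 : 1;
        if (a->vnode != b->vnode)       return a->vnode < b->vnode ? -1 : 1;
        if (a->unique != b->unique)     return a->unique < b->unique ? -1 : 1;
        return 0;
}

int main(void)
{
        struct afs_fid old_file = { 0x2000a, 42, 1 };
        struct afs_fid new_file = { 0x2000a, 42, 2 };   /* same slot, later file */
        printf("%d\n", afs_fid_cmp(&old_file, &new_file));      /* -1 */
        return 0;
}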
diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c
index 7b0e3192ee39..36c1306e09e0 100644
--- a/fs/afs/vlclient.c
+++ b/fs/afs/vlclient.c
@@ -1,4 +1,4 @@
1/* vlclient.c: AFS Volume Location Service client 1/* AFS Volume Location Service client
2 * 2 *
3 * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved. 3 * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com) 4 * Written by David Howells (dhowells@redhat.com)
@@ -11,247 +11,76 @@
11 11
12#include <linux/init.h> 12#include <linux/init.h>
13#include <linux/sched.h> 13#include <linux/sched.h>
14#include <rxrpc/rxrpc.h>
15#include <rxrpc/transport.h>
16#include <rxrpc/connection.h>
17#include <rxrpc/call.h>
18#include "server.h"
19#include "volume.h"
20#include "vlclient.h"
21#include "kafsasyncd.h"
22#include "kafstimod.h"
23#include "errors.h"
24#include "internal.h" 14#include "internal.h"
25 15
26#define VLGETENTRYBYID 503 /* AFS Get Cache Entry By ID operation ID */
27#define VLGETENTRYBYNAME 504 /* AFS Get Cache Entry By Name operation ID */
28#define VLPROBE 514 /* AFS Probe Volume Location Service operation ID */
29
30static void afs_rxvl_get_entry_by_id_attn(struct rxrpc_call *call);
31static void afs_rxvl_get_entry_by_id_error(struct rxrpc_call *call);
32
33/*****************************************************************************/
34/* 16/*
35 * map afs VL abort codes to/from Linux error codes 17 * map volume locator abort codes to error codes
36 * - called with call->lock held
37 */ 18 */
38static void afs_rxvl_aemap(struct rxrpc_call *call) 19static int afs_vl_abort_to_error(u32 abort_code)
39{ 20{
40 int err; 21 _enter("%u", abort_code);
41 22
42 _enter("{%u,%u,%d}", 23 switch (abort_code) {
43 call->app_err_state, call->app_abort_code, call->app_errno); 24 case AFSVL_IDEXIST: return -EEXIST;
44 25 case AFSVL_IO: return -EREMOTEIO;
45 switch (call->app_err_state) { 26 case AFSVL_NAMEEXIST: return -EEXIST;
46 case RXRPC_ESTATE_LOCAL_ABORT: 27 case AFSVL_CREATEFAIL: return -EREMOTEIO;
47 call->app_abort_code = -call->app_errno; 28 case AFSVL_NOENT: return -ENOMEDIUM;
48 return; 29 case AFSVL_EMPTY: return -ENOMEDIUM;
49 30 case AFSVL_ENTDELETED: return -ENOMEDIUM;
50 case RXRPC_ESTATE_PEER_ABORT: 31 case AFSVL_BADNAME: return -EINVAL;
51 switch (call->app_abort_code) { 32 case AFSVL_BADINDEX: return -EINVAL;
52 case AFSVL_IDEXIST: err = -EEXIST; break; 33 case AFSVL_BADVOLTYPE: return -EINVAL;
53 case AFSVL_IO: err = -EREMOTEIO; break; 34 case AFSVL_BADSERVER: return -EINVAL;
54 case AFSVL_NAMEEXIST: err = -EEXIST; break; 35 case AFSVL_BADPARTITION: return -EINVAL;
55 case AFSVL_CREATEFAIL: err = -EREMOTEIO; break; 36 case AFSVL_REPSFULL: return -EFBIG;
56 case AFSVL_NOENT: err = -ENOMEDIUM; break; 37 case AFSVL_NOREPSERVER: return -ENOENT;
57 case AFSVL_EMPTY: err = -ENOMEDIUM; break; 38 case AFSVL_DUPREPSERVER: return -EEXIST;
58 case AFSVL_ENTDELETED: err = -ENOMEDIUM; break; 39 case AFSVL_RWNOTFOUND: return -ENOENT;
59 case AFSVL_BADNAME: err = -EINVAL; break; 40 case AFSVL_BADREFCOUNT: return -EINVAL;
60 case AFSVL_BADINDEX: err = -EINVAL; break; 41 case AFSVL_SIZEEXCEEDED: return -EINVAL;
61 case AFSVL_BADVOLTYPE: err = -EINVAL; break; 42 case AFSVL_BADENTRY: return -EINVAL;
62 case AFSVL_BADSERVER: err = -EINVAL; break; 43 case AFSVL_BADVOLIDBUMP: return -EINVAL;
63 case AFSVL_BADPARTITION: err = -EINVAL; break; 44 case AFSVL_IDALREADYHASHED: return -EINVAL;
64 case AFSVL_REPSFULL: err = -EFBIG; break; 45 case AFSVL_ENTRYLOCKED: return -EBUSY;
65 case AFSVL_NOREPSERVER: err = -ENOENT; break; 46 case AFSVL_BADVOLOPER: return -EBADRQC;
66 case AFSVL_DUPREPSERVER: err = -EEXIST; break; 47 case AFSVL_BADRELLOCKTYPE: return -EINVAL;
67 case AFSVL_RWNOTFOUND: err = -ENOENT; break; 48 case AFSVL_RERELEASE: return -EREMOTEIO;
68 case AFSVL_BADREFCOUNT: err = -EINVAL; break; 49 case AFSVL_BADSERVERFLAG: return -EINVAL;
69 case AFSVL_SIZEEXCEEDED: err = -EINVAL; break; 50 case AFSVL_PERM: return -EACCES;
70 case AFSVL_BADENTRY: err = -EINVAL; break; 51 case AFSVL_NOMEM: return -EREMOTEIO;
71 case AFSVL_BADVOLIDBUMP: err = -EINVAL; break;
72 case AFSVL_IDALREADYHASHED: err = -EINVAL; break;
73 case AFSVL_ENTRYLOCKED: err = -EBUSY; break;
74 case AFSVL_BADVOLOPER: err = -EBADRQC; break;
75 case AFSVL_BADRELLOCKTYPE: err = -EINVAL; break;
76 case AFSVL_RERELEASE: err = -EREMOTEIO; break;
77 case AFSVL_BADSERVERFLAG: err = -EINVAL; break;
78 case AFSVL_PERM: err = -EACCES; break;
79 case AFSVL_NOMEM: err = -EREMOTEIO; break;
80 default:
81 err = afs_abort_to_error(call->app_abort_code);
82 break;
83 }
84 call->app_errno = err;
85 return;
86
87 default: 52 default:
88 return; 53 return afs_abort_to_error(abort_code);
89 } 54 }
90} /* end afs_rxvl_aemap() */ 55}
91 56
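
The replacement above turns the old per-call error remapper into a pure function of the abort code, which each afs_call_type can then reference directly. The dense switch compiles to good code, but the same mapping can also be expressed as data, which makes the abort-to-errno table easier to audit; a hedged sketch of that alternative using Linux errno values (only a few entries shown, and the fallback is a constant here where the kernel defers to afs_abort_to_error()):

#include <errno.h>
#include <stddef.h>

/* a few VL abort codes; the kernel's AFSVL_* constants start at 363520 */
enum {
        VL_IDEXIST = 363520,
        VL_IO,
        VL_NAMEEXIST,
        VL_CREATEFAIL,
        VL_NOENT,
};

static const struct { unsigned abort_code; int error; } vl_errors[] = {
        { VL_IDEXIST,    -EEXIST },
        { VL_IO,         -EREMOTEIO },
        { VL_NAMEEXIST,  -EEXIST },
        { VL_CREATEFAIL, -EREMOTEIO },
        { VL_NOENT,      -ENOMEDIUM },
};

static int vl_abort_to_error(unsigned abort_code)
{
        for (size_t i = 0; i < sizeof(vl_errors) / sizeof(vl_errors[0]); i++)
                if (vl_errors[i].abort_code == abort_code)
                        return vl_errors[i].error;
        return -EREMOTEIO;              /* simplified fallback */
}

int main(void)
{
        return vl_abort_to_error(VL_NOENT) == -ENOMEDIUM ? 0 : 1;
}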
92#if 0
93/*****************************************************************************/
94/* 57/*
95 * probe a volume location server to see if it is still alive -- unused 58 * deliver reply data to a VL.GetEntryByXXX call
96 */ 59 */
97static int afs_rxvl_probe(struct afs_server *server, int alloc_flags) 60static int afs_deliver_vl_get_entry_by_xxx(struct afs_call *call,
61 struct sk_buff *skb, bool last)
98{ 62{
99 struct rxrpc_connection *conn; 63 struct afs_cache_vlocation *entry;
100 struct rxrpc_call *call; 64 __be32 *bp;
101 struct kvec piov[1]; 65 u32 tmp;
102 size_t sent; 66 int loop;
103 int ret;
104 __be32 param[1];
105
106 DECLARE_WAITQUEUE(myself, current);
107
108 /* get hold of the vlserver connection */
109 ret = afs_server_get_vlconn(server, &conn);
110 if (ret < 0)
111 goto out;
112
113 /* create a call through that connection */
114 ret = rxrpc_create_call(conn, NULL, NULL, afs_rxvl_aemap, &call);
115 if (ret < 0) {
116 printk("kAFS: Unable to create call: %d\n", ret);
117 goto out_put_conn;
118 }
119 call->app_opcode = VLPROBE;
120
121 /* we want to get event notifications from the call */
122 add_wait_queue(&call->waitq, &myself);
123
124 /* marshall the parameters */
125 param[0] = htonl(VLPROBE);
126 piov[0].iov_len = sizeof(param);
127 piov[0].iov_base = param;
128
129 /* send the parameters to the server */
130 ret = rxrpc_call_write_data(call, 1, piov, RXRPC_LAST_PACKET,
131 alloc_flags, 0, &sent);
132 if (ret < 0)
133 goto abort;
134
135 /* wait for the reply to completely arrive */
136 for (;;) {
137 set_current_state(TASK_INTERRUPTIBLE);
138 if (call->app_call_state != RXRPC_CSTATE_CLNT_RCV_REPLY ||
139 signal_pending(current))
140 break;
141 schedule();
142 }
143 set_current_state(TASK_RUNNING);
144
145 ret = -EINTR;
146 if (signal_pending(current))
147 goto abort;
148
149 switch (call->app_call_state) {
150 case RXRPC_CSTATE_ERROR:
151 ret = call->app_errno;
152 goto out_unwait;
153
154 case RXRPC_CSTATE_CLNT_GOT_REPLY:
155 ret = 0;
156 goto out_unwait;
157
158 default:
159 BUG();
160 }
161
162 abort:
163 set_current_state(TASK_UNINTERRUPTIBLE);
164 rxrpc_call_abort(call, ret);
165 schedule();
166
167 out_unwait:
168 set_current_state(TASK_RUNNING);
169 remove_wait_queue(&call->waitq, &myself);
170 rxrpc_put_call(call);
171 out_put_conn:
172 rxrpc_put_connection(conn);
173 out:
174 return ret;
175 67
176} /* end afs_rxvl_probe() */ 68 _enter(",,%u", last);
177#endif
178 69
179/*****************************************************************************/ 70 afs_transfer_reply(call, skb);
180/* 71 if (!last)
181 * look up a volume location database entry by name 72 return 0;
182 */
183int afs_rxvl_get_entry_by_name(struct afs_server *server,
184 const char *volname,
185 unsigned volnamesz,
186 struct afs_cache_vlocation *entry)
187{
188 DECLARE_WAITQUEUE(myself, current);
189
190 struct rxrpc_connection *conn;
191 struct rxrpc_call *call;
192 struct kvec piov[3];
193 unsigned tmp;
194 size_t sent;
195 int ret, loop;
196 __be32 *bp, param[2], zero;
197
198 _enter(",%*.*s,%u,", volnamesz, volnamesz, volname, volnamesz);
199
200 memset(entry, 0, sizeof(*entry));
201
202 /* get hold of the vlserver connection */
203 ret = afs_server_get_vlconn(server, &conn);
204 if (ret < 0)
205 goto out;
206
207 /* create a call through that connection */
208 ret = rxrpc_create_call(conn, NULL, NULL, afs_rxvl_aemap, &call);
209 if (ret < 0) {
210 printk("kAFS: Unable to create call: %d\n", ret);
211 goto out_put_conn;
212 }
213 call->app_opcode = VLGETENTRYBYNAME;
214 73
215 /* we want to get event notifications from the call */ 74 if (call->reply_size != call->reply_max)
216 add_wait_queue(&call->waitq, &myself); 75 return -EBADMSG;
217 76
218 /* marshall the parameters */ 77 /* unmarshall the reply once we've received all of it */
219 piov[1].iov_len = volnamesz; 78 entry = call->reply;
220 piov[1].iov_base = (char *) volname; 79 bp = call->buffer;
221
222 zero = 0;
223 piov[2].iov_len = (4 - (piov[1].iov_len & 3)) & 3;
224 piov[2].iov_base = &zero;
225
226 param[0] = htonl(VLGETENTRYBYNAME);
227 param[1] = htonl(piov[1].iov_len);
228
229 piov[0].iov_len = sizeof(param);
230 piov[0].iov_base = param;
231
232 /* send the parameters to the server */
233 ret = rxrpc_call_write_data(call, 3, piov, RXRPC_LAST_PACKET, GFP_NOFS,
234 0, &sent);
235 if (ret < 0)
236 goto abort;
237
238 /* wait for the reply to completely arrive */
239 bp = rxrpc_call_alloc_scratch(call, 384);
240
241 ret = rxrpc_call_read_data(call, bp, 384,
242 RXRPC_CALL_READ_BLOCK |
243 RXRPC_CALL_READ_ALL);
244 if (ret < 0) {
245 if (ret == -ECONNABORTED) {
246 ret = call->app_errno;
247 goto out_unwait;
248 }
249 goto abort;
250 }
251 80
252 /* unmarshall the reply */
253 for (loop = 0; loop < 64; loop++) 81 for (loop = 0; loop < 64; loop++)
254 entry->name[loop] = ntohl(*bp++); 82 entry->name[loop] = ntohl(*bp++);
83 entry->name[loop] = 0;
255 bp++; /* final NUL */ 84 bp++; /* final NUL */
256 85
257 bp++; /* type */ 86 bp++; /* type */
@@ -264,6 +93,7 @@ int afs_rxvl_get_entry_by_name(struct afs_server *server,
264 93
265 for (loop = 0; loop < 8; loop++) { 94 for (loop = 0; loop < 8; loop++) {
266 tmp = ntohl(*bp++); 95 tmp = ntohl(*bp++);
96 entry->srvtmask[loop] = 0;
267 if (tmp & AFS_VLSF_RWVOL) 97 if (tmp & AFS_VLSF_RWVOL)
268 entry->srvtmask[loop] |= AFS_VOL_VTM_RW; 98 entry->srvtmask[loop] |= AFS_VOL_VTM_RW;
269 if (tmp & AFS_VLSF_ROVOL) 99 if (tmp & AFS_VLSF_ROVOL)
@@ -279,417 +109,110 @@ int afs_rxvl_get_entry_by_name(struct afs_server *server,
279 bp++; /* clone ID */ 109 bp++; /* clone ID */
280 110
281 tmp = ntohl(*bp++); /* flags */ 111 tmp = ntohl(*bp++); /* flags */
112 entry->vidmask = 0;
282 if (tmp & AFS_VLF_RWEXISTS) 113 if (tmp & AFS_VLF_RWEXISTS)
283 entry->vidmask |= AFS_VOL_VTM_RW; 114 entry->vidmask |= AFS_VOL_VTM_RW;
284 if (tmp & AFS_VLF_ROEXISTS) 115 if (tmp & AFS_VLF_ROEXISTS)
285 entry->vidmask |= AFS_VOL_VTM_RO; 116 entry->vidmask |= AFS_VOL_VTM_RO;
286 if (tmp & AFS_VLF_BACKEXISTS) 117 if (tmp & AFS_VLF_BACKEXISTS)
287 entry->vidmask |= AFS_VOL_VTM_BAK; 118 entry->vidmask |= AFS_VOL_VTM_BAK;
288
289 ret = -ENOMEDIUM;
290 if (!entry->vidmask) 119 if (!entry->vidmask)
291 goto abort; 120 return -EBADMSG;
292
293 /* success */
294 entry->rtime = get_seconds();
295 ret = 0;
296
297 out_unwait:
298 set_current_state(TASK_RUNNING);
299 remove_wait_queue(&call->waitq, &myself);
300 rxrpc_put_call(call);
301 out_put_conn:
302 rxrpc_put_connection(conn);
303 out:
304 _leave(" = %d", ret);
305 return ret;
306
307 abort:
308 set_current_state(TASK_UNINTERRUPTIBLE);
309 rxrpc_call_abort(call, ret);
310 schedule();
311 goto out_unwait;
312} /* end afs_rxvl_get_entry_by_name() */
313
314/*****************************************************************************/
315/*
316 * look up a volume location database entry by ID
317 */
318int afs_rxvl_get_entry_by_id(struct afs_server *server,
319 afs_volid_t volid,
320 afs_voltype_t voltype,
321 struct afs_cache_vlocation *entry)
322{
323 DECLARE_WAITQUEUE(myself, current);
324
325 struct rxrpc_connection *conn;
326 struct rxrpc_call *call;
327 struct kvec piov[1];
328 unsigned tmp;
329 size_t sent;
330 int ret, loop;
331 __be32 *bp, param[3];
332
333 _enter(",%x,%d,", volid, voltype);
334
335 memset(entry, 0, sizeof(*entry));
336
337 /* get hold of the vlserver connection */
338 ret = afs_server_get_vlconn(server, &conn);
339 if (ret < 0)
340 goto out;
341
342 /* create a call through that connection */
343 ret = rxrpc_create_call(conn, NULL, NULL, afs_rxvl_aemap, &call);
344 if (ret < 0) {
345 printk("kAFS: Unable to create call: %d\n", ret);
346 goto out_put_conn;
347 }
348 call->app_opcode = VLGETENTRYBYID;
349
350 /* we want to get event notifications from the call */
351 add_wait_queue(&call->waitq, &myself);
352
353 /* marshall the parameters */
354 param[0] = htonl(VLGETENTRYBYID);
355 param[1] = htonl(volid);
356 param[2] = htonl(voltype);
357
358 piov[0].iov_len = sizeof(param);
359 piov[0].iov_base = param;
360
361 /* send the parameters to the server */
362 ret = rxrpc_call_write_data(call, 1, piov, RXRPC_LAST_PACKET, GFP_NOFS,
363 0, &sent);
364 if (ret < 0)
365 goto abort;
366
367 /* wait for the reply to completely arrive */
368 bp = rxrpc_call_alloc_scratch(call, 384);
369
370 ret = rxrpc_call_read_data(call, bp, 384,
371 RXRPC_CALL_READ_BLOCK |
372 RXRPC_CALL_READ_ALL);
373 if (ret < 0) {
374 if (ret == -ECONNABORTED) {
375 ret = call->app_errno;
376 goto out_unwait;
377 }
378 goto abort;
379 }
380
381 /* unmarshall the reply */
382 for (loop = 0; loop < 64; loop++)
383 entry->name[loop] = ntohl(*bp++);
384 bp++; /* final NUL */
385 121
386 bp++; /* type */ 122 _leave(" = 0 [done]");
387 entry->nservers = ntohl(*bp++); 123 return 0;
388 124}
389 for (loop = 0; loop < 8; loop++)
390 entry->servers[loop].s_addr = *bp++;
391
392 bp += 8; /* partition IDs */
393 125
394 for (loop = 0; loop < 8; loop++) {
395 tmp = ntohl(*bp++);
396 if (tmp & AFS_VLSF_RWVOL)
397 entry->srvtmask[loop] |= AFS_VOL_VTM_RW;
398 if (tmp & AFS_VLSF_ROVOL)
399 entry->srvtmask[loop] |= AFS_VOL_VTM_RO;
400 if (tmp & AFS_VLSF_BACKVOL)
401 entry->srvtmask[loop] |= AFS_VOL_VTM_BAK;
402 }
403
404 entry->vid[0] = ntohl(*bp++);
405 entry->vid[1] = ntohl(*bp++);
406 entry->vid[2] = ntohl(*bp++);
407
408 bp++; /* clone ID */
409
410 tmp = ntohl(*bp++); /* flags */
411 if (tmp & AFS_VLF_RWEXISTS)
412 entry->vidmask |= AFS_VOL_VTM_RW;
413 if (tmp & AFS_VLF_ROEXISTS)
414 entry->vidmask |= AFS_VOL_VTM_RO;
415 if (tmp & AFS_VLF_BACKEXISTS)
416 entry->vidmask |= AFS_VOL_VTM_BAK;
417
418 ret = -ENOMEDIUM;
419 if (!entry->vidmask)
420 goto abort;
421
422#if 0 /* TODO: remove */
423 entry->nservers = 3;
424 entry->servers[0].s_addr = htonl(0xac101249);
425 entry->servers[1].s_addr = htonl(0xac101243);
426 entry->servers[2].s_addr = htonl(0xac10125b /*0xac10125b*/);
427
428 entry->srvtmask[0] = AFS_VOL_VTM_RO;
429 entry->srvtmask[1] = AFS_VOL_VTM_RO;
430 entry->srvtmask[2] = AFS_VOL_VTM_RO | AFS_VOL_VTM_RW;
431#endif
432
433 /* success */
434 entry->rtime = get_seconds();
435 ret = 0;
436
437 out_unwait:
438 set_current_state(TASK_RUNNING);
439 remove_wait_queue(&call->waitq, &myself);
440 rxrpc_put_call(call);
441 out_put_conn:
442 rxrpc_put_connection(conn);
443 out:
444 _leave(" = %d", ret);
445 return ret;
446
447 abort:
448 set_current_state(TASK_UNINTERRUPTIBLE);
449 rxrpc_call_abort(call, ret);
450 schedule();
451 goto out_unwait;
452} /* end afs_rxvl_get_entry_by_id() */
453
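
Old and new code unmarshal the GetEntryByXXX reply the same way: the buffer is walked as a cursor of big-endian 32-bit words, each interesting field is pulled off with ntohl(), and uninteresting words are skipped by bumping the pointer. One subtlety visible in both versions: server addresses are copied without byte-swapping, because struct in_addr stays in network order. A userspace sketch of the cursor style (the three-word layout is illustrative, not the real VLDB wire format):

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        /* pretend reply: nservers, one IPv4 address, flags (all big-endian) */
        uint32_t reply[3] = { htonl(1), htonl(0xc0a80001), htonl(0x2) };
        const uint32_t *bp = reply;

        unsigned nservers = ntohl(*bp++);       /* consume a word */
        uint32_t addr = *bp++;                  /* keep network order, like s_addr */
        unsigned flags = ntohl(*bp++);

        printf("nservers=%u addr=%#x flags=%#x\n",
               nservers, (unsigned) ntohl(addr), flags);
        return 0;
}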
454/*****************************************************************************/
455/* 126/*
456 * look up a volume location database entry by ID asynchronously 127 * VL.GetEntryByName operation type
457 */ 128 */
458int afs_rxvl_get_entry_by_id_async(struct afs_async_op *op, 129static const struct afs_call_type afs_RXVLGetEntryByName = {
459 afs_volid_t volid, 130 .name = "VL.GetEntryByName",
460 afs_voltype_t voltype) 131 .deliver = afs_deliver_vl_get_entry_by_xxx,
461{ 132 .abort_to_error = afs_vl_abort_to_error,
462 struct rxrpc_connection *conn; 133 .destructor = afs_flat_call_destructor,
463 struct rxrpc_call *call; 134};
464 struct kvec piov[1];
465 size_t sent;
466 int ret;
467 __be32 param[3];
468
469 _enter(",%x,%d,", volid, voltype);
470
471 /* get hold of the vlserver connection */
472 ret = afs_server_get_vlconn(op->server, &conn);
473 if (ret < 0) {
474 _leave(" = %d", ret);
475 return ret;
476 }
477
478 /* create a call through that connection */
479 ret = rxrpc_create_call(conn,
480 afs_rxvl_get_entry_by_id_attn,
481 afs_rxvl_get_entry_by_id_error,
482 afs_rxvl_aemap,
483 &op->call);
484 rxrpc_put_connection(conn);
485
486 if (ret < 0) {
487 printk("kAFS: Unable to create call: %d\n", ret);
488 _leave(" = %d", ret);
489 return ret;
490 }
491 135
492 op->call->app_opcode = VLGETENTRYBYID; 136/*
493 op->call->app_user = op; 137 * VL.GetEntryById operation type
494 138 */
495 call = op->call; 139static const struct afs_call_type afs_RXVLGetEntryById = {
496 rxrpc_get_call(call); 140 .name = "VL.GetEntryById",
497 141 .deliver = afs_deliver_vl_get_entry_by_xxx,
498 /* send event notifications from the call to kafsasyncd */ 142 .abort_to_error = afs_vl_abort_to_error,
499 afs_kafsasyncd_begin_op(op); 143 .destructor = afs_flat_call_destructor,
500 144};
501 /* marshall the parameters */
502 param[0] = htonl(VLGETENTRYBYID);
503 param[1] = htonl(volid);
504 param[2] = htonl(voltype);
505
506 piov[0].iov_len = sizeof(param);
507 piov[0].iov_base = param;
508
509 /* allocate result read buffer in scratch space */
510 call->app_scr_ptr = rxrpc_call_alloc_scratch(op->call, 384);
511
512 /* send the parameters to the server */
513 ret = rxrpc_call_write_data(call, 1, piov, RXRPC_LAST_PACKET, GFP_NOFS,
514 0, &sent);
515 if (ret < 0) {
516 rxrpc_call_abort(call, ret); /* handle from kafsasyncd */
517 ret = 0;
518 goto out;
519 }
520
521 /* wait for the reply to completely arrive */
522 ret = rxrpc_call_read_data(call, call->app_scr_ptr, 384, 0);
523 switch (ret) {
524 case 0:
525 case -EAGAIN:
526 case -ECONNABORTED:
527 ret = 0;
528 break; /* all handled by kafsasyncd */
529
530 default:
531 rxrpc_call_abort(call, ret); /* make kafsasyncd handle it */
532 ret = 0;
533 break;
534 }
535
536 out:
537 rxrpc_put_call(call);
538 _leave(" = %d", ret);
539 return ret;
540
541} /* end afs_rxvl_get_entry_by_id_async() */
542 145
543/*****************************************************************************/
544/* 146/*
545 * attend to the asynchronous get VLDB entry by ID 147 * dispatch a get volume entry by name operation
546 */ 148 */
547int afs_rxvl_get_entry_by_id_async2(struct afs_async_op *op, 149int afs_vl_get_entry_by_name(struct in_addr *addr,
548 struct afs_cache_vlocation *entry) 150 struct key *key,
151 const char *volname,
152 struct afs_cache_vlocation *entry,
153 const struct afs_wait_mode *wait_mode)
549{ 154{
155 struct afs_call *call;
156 size_t volnamesz, reqsz, padsz;
550 __be32 *bp; 157 __be32 *bp;
551 __u32 tmp;
552 int loop, ret;
553
554 _enter("{op=%p cst=%u}", op, op->call->app_call_state);
555
556 memset(entry, 0, sizeof(*entry));
557
558 if (op->call->app_call_state == RXRPC_CSTATE_COMPLETE) {
559 /* operation finished */
560 afs_kafsasyncd_terminate_op(op);
561
562 bp = op->call->app_scr_ptr;
563
564 /* unmarshall the reply */
565 for (loop = 0; loop < 64; loop++)
566 entry->name[loop] = ntohl(*bp++);
567 bp++; /* final NUL */
568
569 bp++; /* type */
570 entry->nservers = ntohl(*bp++);
571
572 for (loop = 0; loop < 8; loop++)
573 entry->servers[loop].s_addr = *bp++;
574
575 bp += 8; /* partition IDs */
576
577 for (loop = 0; loop < 8; loop++) {
578 tmp = ntohl(*bp++);
579 if (tmp & AFS_VLSF_RWVOL)
580 entry->srvtmask[loop] |= AFS_VOL_VTM_RW;
581 if (tmp & AFS_VLSF_ROVOL)
582 entry->srvtmask[loop] |= AFS_VOL_VTM_RO;
583 if (tmp & AFS_VLSF_BACKVOL)
584 entry->srvtmask[loop] |= AFS_VOL_VTM_BAK;
585 }
586
587 entry->vid[0] = ntohl(*bp++);
588 entry->vid[1] = ntohl(*bp++);
589 entry->vid[2] = ntohl(*bp++);
590
591 bp++; /* clone ID */
592
593 tmp = ntohl(*bp++); /* flags */
594 if (tmp & AFS_VLF_RWEXISTS)
595 entry->vidmask |= AFS_VOL_VTM_RW;
596 if (tmp & AFS_VLF_ROEXISTS)
597 entry->vidmask |= AFS_VOL_VTM_RO;
598 if (tmp & AFS_VLF_BACKEXISTS)
599 entry->vidmask |= AFS_VOL_VTM_BAK;
600
601 ret = -ENOMEDIUM;
602 if (!entry->vidmask) {
603 rxrpc_call_abort(op->call, ret);
604 goto done;
605 }
606
607#if 0 /* TODO: remove */
608 entry->nservers = 3;
609 entry->servers[0].s_addr = htonl(0xac101249);
610 entry->servers[1].s_addr = htonl(0xac101243);
611 entry->servers[2].s_addr = htonl(0xac10125b /*0xac10125b*/);
612
613 entry->srvtmask[0] = AFS_VOL_VTM_RO;
614 entry->srvtmask[1] = AFS_VOL_VTM_RO;
615 entry->srvtmask[2] = AFS_VOL_VTM_RO | AFS_VOL_VTM_RW;
616#endif
617
618 /* success */
619 entry->rtime = get_seconds();
620 ret = 0;
621 goto done;
622 }
623 158
624 if (op->call->app_call_state == RXRPC_CSTATE_ERROR) { 159 _enter("");
625 /* operation error */
626 ret = op->call->app_errno;
627 goto done;
628 }
629 160
630 _leave(" = -EAGAIN"); 161 volnamesz = strlen(volname);
631 return -EAGAIN; 162 padsz = (4 - (volnamesz & 3)) & 3;
163 reqsz = 8 + volnamesz + padsz;
632 164
633 done: 165 call = afs_alloc_flat_call(&afs_RXVLGetEntryByName, reqsz, 384);
634 rxrpc_put_call(op->call); 166 if (!call)
635 op->call = NULL; 167 return -ENOMEM;
636 _leave(" = %d", ret);
637 return ret;
638} /* end afs_rxvl_get_entry_by_id_async2() */
639 168
640/*****************************************************************************/ 169 call->key = key;
641/* 170 call->reply = entry;
642 * handle attention events on an async get-entry-by-ID op 171 call->service_id = VL_SERVICE;
643 * - called from krxiod 172 call->port = htons(AFS_VL_PORT);
644 */
645static void afs_rxvl_get_entry_by_id_attn(struct rxrpc_call *call)
646{
647 struct afs_async_op *op = call->app_user;
648
649 _enter("{op=%p cst=%u}", op, call->app_call_state);
650
651 switch (call->app_call_state) {
652 case RXRPC_CSTATE_COMPLETE:
653 afs_kafsasyncd_attend_op(op);
654 break;
655 case RXRPC_CSTATE_CLNT_RCV_REPLY:
656 if (call->app_async_read)
657 break;
658 case RXRPC_CSTATE_CLNT_GOT_REPLY:
659 if (call->app_read_count == 0)
660 break;
661 printk("kAFS: Reply bigger than expected"
662 " {cst=%u asyn=%d mark=%Zu rdy=%Zu pr=%u%s}",
663 call->app_call_state,
664 call->app_async_read,
665 call->app_mark,
666 call->app_ready_qty,
667 call->pkt_rcv_count,
668 call->app_last_rcv ? " last" : "");
669
670 rxrpc_call_abort(call, -EBADMSG);
671 break;
672 default:
673 BUG();
674 }
675 173
676 _leave(""); 174 /* marshall the parameters */
175 bp = call->request;
176 *bp++ = htonl(VLGETENTRYBYNAME);
177 *bp++ = htonl(volnamesz);
178 memcpy(bp, volname, volnamesz);
179 if (padsz > 0)
180 memset((void *) bp + volnamesz, 0, padsz);
677 181
678} /* end afs_rxvl_get_entry_by_id_attn() */ 182 /* initiate the call */
183 return afs_make_call(addr, call, GFP_KERNEL, wait_mode);
184}
679 185
680/*****************************************************************************/
681/* 186/*
682 * handle error events on an async get-entry-by-ID op 187 * dispatch a get volume entry by ID operation
683 * - called from krxiod
684 */ 188 */
685static void afs_rxvl_get_entry_by_id_error(struct rxrpc_call *call) 189int afs_vl_get_entry_by_id(struct in_addr *addr,
190 struct key *key,
191 afs_volid_t volid,
192 afs_voltype_t voltype,
193 struct afs_cache_vlocation *entry,
194 const struct afs_wait_mode *wait_mode)
686{ 195{
687 struct afs_async_op *op = call->app_user; 196 struct afs_call *call;
197 __be32 *bp;
688 198
689 _enter("{op=%p cst=%u}", op, call->app_call_state); 199 _enter("");
690 200
691 afs_kafsasyncd_attend_op(op); 201 call = afs_alloc_flat_call(&afs_RXVLGetEntryById, 12, 384);
202 if (!call)
203 return -ENOMEM;
692 204
693 _leave(""); 205 call->key = key;
206 call->reply = entry;
207 call->service_id = VL_SERVICE;
208 call->port = htons(AFS_VL_PORT);
694 209
695} /* end afs_rxvl_get_entry_by_id_error() */ 210 /* marshall the parameters */
211 bp = call->request;
212 *bp++ = htonl(VLGETENTRYBYID);
213 *bp++ = htonl(volid);
214 *bp = htonl(voltype);
215
216 /* initiate the call */
217 return afs_make_call(addr, call, GFP_KERNEL, wait_mode);
218}
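
Both dispatchers marshal Rx/XDR-style: 32-bit big-endian words, with strings length-prefixed and zero-padded to a 4-byte boundary, which is where afs_vl_get_entry_by_name() gets reqsz = 8 + volnamesz + padsz and padsz = (4 - (volnamesz & 3)) & 3. A standalone check of that arithmetic:

#include <stdio.h>
#include <string.h>

int main(void)
{
        const char *names[] = { "root.cell", "root.afs", "a", "abcd" };

        for (int i = 0; i < 4; i++) {
                size_t len = strlen(names[i]);
                size_t pad = (4 - (len & 3)) & 3;       /* 0..3 zero bytes */
                printf("%-10s len=%zu pad=%zu reqsz=%zu\n",
                       names[i], len, pad, 8 + len + pad);
        }
        return 0;
}

The trailing & 3 is what keeps an already-aligned name ("root.afs", "abcd") from receiving four bytes of useless padding.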
diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c
index 782ee7c600ca..3370cdb72566 100644
--- a/fs/afs/vlocation.c
+++ b/fs/afs/vlocation.c
@@ -1,6 +1,6 @@
1/* vlocation.c: volume location management 1/* AFS volume location management
2 * 2 *
3 * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved. 3 * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com) 4 * Written by David Howells (dhowells@redhat.com)
5 * 5 *
6 * This program is free software; you can redistribute it and/or 6 * This program is free software; you can redistribute it and/or
@@ -12,131 +12,61 @@
12#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/init.h> 14#include <linux/init.h>
15#include <linux/slab.h>
16#include <linux/fs.h>
17#include <linux/pagemap.h>
18#include "volume.h"
19#include "cell.h"
20#include "cmservice.h"
21#include "fsclient.h"
22#include "vlclient.h"
23#include "kafstimod.h"
24#include <rxrpc/connection.h>
25#include "internal.h" 15#include "internal.h"
26 16
27#define AFS_VLDB_TIMEOUT HZ*1000 17unsigned afs_vlocation_timeout = 10; /* volume location timeout in seconds */
18unsigned afs_vlocation_update_timeout = 10 * 60;
28 19
29static void afs_vlocation_update_timer(struct afs_timer *timer); 20static void afs_vlocation_reaper(struct work_struct *);
30static void afs_vlocation_update_attend(struct afs_async_op *op); 21static void afs_vlocation_updater(struct work_struct *);
31static void afs_vlocation_update_discard(struct afs_async_op *op);
32static void __afs_put_vlocation(struct afs_vlocation *vlocation);
33 22
34static void __afs_vlocation_timeout(struct afs_timer *timer) 23static LIST_HEAD(afs_vlocation_updates);
35{ 24static LIST_HEAD(afs_vlocation_graveyard);
36 struct afs_vlocation *vlocation = 25static DEFINE_SPINLOCK(afs_vlocation_updates_lock);
37 list_entry(timer, struct afs_vlocation, timeout); 26static DEFINE_SPINLOCK(afs_vlocation_graveyard_lock);
38 27static DECLARE_DELAYED_WORK(afs_vlocation_reap, afs_vlocation_reaper);
39 _debug("VL TIMEOUT [%s{u=%d}]", 28static DECLARE_DELAYED_WORK(afs_vlocation_update, afs_vlocation_updater);
40 vlocation->vldb.name, atomic_read(&vlocation->usage)); 29static struct workqueue_struct *afs_vlocation_update_worker;
41
42 afs_vlocation_do_timeout(vlocation);
43}
44
45static const struct afs_timer_ops afs_vlocation_timer_ops = {
46 .timed_out = __afs_vlocation_timeout,
47};
48 30
49static const struct afs_timer_ops afs_vlocation_update_timer_ops = {
50 .timed_out = afs_vlocation_update_timer,
51};
52
53static const struct afs_async_op_ops afs_vlocation_update_op_ops = {
54 .attend = afs_vlocation_update_attend,
55 .discard = afs_vlocation_update_discard,
56};
57
58static LIST_HEAD(afs_vlocation_update_pendq); /* queue of VLs awaiting update */
59static struct afs_vlocation *afs_vlocation_update; /* VL currently being updated */
60static DEFINE_SPINLOCK(afs_vlocation_update_lock); /* lock guarding update queue */
61
62#ifdef AFS_CACHING_SUPPORT
63static cachefs_match_val_t afs_vlocation_cache_match(void *target,
64 const void *entry);
65static void afs_vlocation_cache_update(void *source, void *entry);
66
67struct cachefs_index_def afs_vlocation_cache_index_def = {
68 .name = "vldb",
69 .data_size = sizeof(struct afs_cache_vlocation),
70 .keys[0] = { CACHEFS_INDEX_KEYS_ASCIIZ, 64 },
71 .match = afs_vlocation_cache_match,
72 .update = afs_vlocation_cache_update,
73};
74#endif
75
76/*****************************************************************************/
77/* 31/*
78 * iterate through the VL servers in a cell until one of them admits knowing 32 * iterate through the VL servers in a cell until one of them admits knowing
79 * about the volume in question 33 * about the volume in question
80 * - caller must have cell->vl_sem write-locked
81 */ 34 */
82static int afs_vlocation_access_vl_by_name(struct afs_vlocation *vlocation, 35static int afs_vlocation_access_vl_by_name(struct afs_vlocation *vl,
83 const char *name, 36 struct key *key,
84 unsigned namesz,
85 struct afs_cache_vlocation *vldb) 37 struct afs_cache_vlocation *vldb)
86{ 38{
87 struct afs_server *server = NULL; 39 struct afs_cell *cell = vl->cell;
88 struct afs_cell *cell = vlocation->cell; 40 struct in_addr addr;
89 int count, ret; 41 int count, ret;
90 42
91 _enter("%s,%*.*s,%u", cell->name, namesz, namesz, name, namesz); 43 _enter("%s,%s", cell->name, vl->vldb.name);
92 44
45 down_write(&vl->cell->vl_sem);
93 ret = -ENOMEDIUM; 46 ret = -ENOMEDIUM;
94 for (count = cell->vl_naddrs; count > 0; count--) { 47 for (count = cell->vl_naddrs; count > 0; count--) {
95 _debug("CellServ[%hu]: %08x", 48 addr = cell->vl_addrs[cell->vl_curr_svix];
96 cell->vl_curr_svix, 49
97 cell->vl_addrs[cell->vl_curr_svix].s_addr); 50 _debug("CellServ[%hu]: %08x", cell->vl_curr_svix, addr.s_addr);
98
99 /* try and create a server */
100 ret = afs_server_lookup(cell,
101 &cell->vl_addrs[cell->vl_curr_svix],
102 &server);
103 switch (ret) {
104 case 0:
105 break;
106 case -ENOMEM:
107 case -ENONET:
108 goto out;
109 default:
110 goto rotate;
111 }
112 51
113 /* attempt to access the VL server */ 52 /* attempt to access the VL server */
114 ret = afs_rxvl_get_entry_by_name(server, name, namesz, vldb); 53 ret = afs_vl_get_entry_by_name(&addr, key, vl->vldb.name, vldb,
54 &afs_sync_call);
115 switch (ret) { 55 switch (ret) {
116 case 0: 56 case 0:
117 afs_put_server(server);
118 goto out; 57 goto out;
119 case -ENOMEM: 58 case -ENOMEM:
120 case -ENONET: 59 case -ENONET:
121 case -ENETUNREACH: 60 case -ENETUNREACH:
122 case -EHOSTUNREACH: 61 case -EHOSTUNREACH:
123 case -ECONNREFUSED: 62 case -ECONNREFUSED:
124 down_write(&server->sem);
125 if (server->vlserver) {
126 rxrpc_put_connection(server->vlserver);
127 server->vlserver = NULL;
128 }
129 up_write(&server->sem);
130 afs_put_server(server);
131 if (ret == -ENOMEM || ret == -ENONET) 63 if (ret == -ENOMEM || ret == -ENONET)
132 goto out; 64 goto out;
133 goto rotate; 65 goto rotate;
134 case -ENOMEDIUM: 66 case -ENOMEDIUM:
135 afs_put_server(server);
136 goto out; 67 goto out;
137 default: 68 default:
138 afs_put_server(server); 69 ret = -EIO;
139 ret = -ENOMEDIUM;
140 goto rotate; 70 goto rotate;
141 } 71 }
142 72
@@ -146,76 +76,66 @@ static int afs_vlocation_access_vl_by_name(struct afs_vlocation *vlocation,
146 cell->vl_curr_svix %= cell->vl_naddrs; 76 cell->vl_curr_svix %= cell->vl_naddrs;
147 } 77 }
148 78
149 out: 79out:
80 up_write(&vl->cell->vl_sem);
150 _leave(" = %d", ret); 81 _leave(" = %d", ret);
151 return ret; 82 return ret;
83}
152 84
153} /* end afs_vlocation_access_vl_by_name() */
154
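
With the per-server objects gone, what remains of the by-name loop above is plain round-robin failover: each address in the cell gets one attempt per pass, hard failures rotate vl_curr_svix, and the index is left pointing at whichever server last answered so the next lookup starts there. The rotation in isolation (try_server() is a stand-in for the RPC):

#include <stdbool.h>
#include <stdio.h>

#define NADDRS 3

static unsigned curr_svix;                      /* like cell->vl_curr_svix */

static bool try_server(unsigned ix)
{
        return ix == 2;                         /* pretend only server 2 answers */
}

int main(void)
{
        for (int count = NADDRS; count > 0; count--) {
                if (try_server(curr_svix)) {
                        printf("server %u answered\n", curr_svix);
                        return 0;
                }
                curr_svix = (curr_svix + 1) % NADDRS;   /* rotate */
        }
        return 1;                               /* every server refused: -ENOMEDIUM */
}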
155/*****************************************************************************/
156/* 85/*
157 * iterate through the VL servers in a cell until one of them admits knowing 86 * iterate through the VL servers in a cell until one of them admits knowing
158 * about the volume in question 87 * about the volume in question
159 * - caller must have cell->vl_sem write-locked
160 */ 88 */
161static int afs_vlocation_access_vl_by_id(struct afs_vlocation *vlocation, 89static int afs_vlocation_access_vl_by_id(struct afs_vlocation *vl,
90 struct key *key,
162 afs_volid_t volid, 91 afs_volid_t volid,
163 afs_voltype_t voltype, 92 afs_voltype_t voltype,
164 struct afs_cache_vlocation *vldb) 93 struct afs_cache_vlocation *vldb)
165{ 94{
166 struct afs_server *server = NULL; 95 struct afs_cell *cell = vl->cell;
167 struct afs_cell *cell = vlocation->cell; 96 struct in_addr addr;
168 int count, ret; 97 int count, ret;
169 98
170 _enter("%s,%x,%d,", cell->name, volid, voltype); 99 _enter("%s,%x,%d,", cell->name, volid, voltype);
171 100
101 down_write(&vl->cell->vl_sem);
172 ret = -ENOMEDIUM; 102 ret = -ENOMEDIUM;
173 for (count = cell->vl_naddrs; count > 0; count--) { 103 for (count = cell->vl_naddrs; count > 0; count--) {
174 _debug("CellServ[%hu]: %08x", 104 addr = cell->vl_addrs[cell->vl_curr_svix];
175 cell->vl_curr_svix, 105
176 cell->vl_addrs[cell->vl_curr_svix].s_addr); 106 _debug("CellServ[%hu]: %08x", cell->vl_curr_svix, addr.s_addr);
177
178 /* try and create a server */
179 ret = afs_server_lookup(cell,
180 &cell->vl_addrs[cell->vl_curr_svix],
181 &server);
182 switch (ret) {
183 case 0:
184 break;
185 case -ENOMEM:
186 case -ENONET:
187 goto out;
188 default:
189 goto rotate;
190 }
191 107
192 /* attempt to access the VL server */ 108 /* attempt to access the VL server */
193 ret = afs_rxvl_get_entry_by_id(server, volid, voltype, vldb); 109 ret = afs_vl_get_entry_by_id(&addr, key, volid, voltype, vldb,
110 &afs_sync_call);
194 switch (ret) { 111 switch (ret) {
195 case 0: 112 case 0:
196 afs_put_server(server);
197 goto out; 113 goto out;
198 case -ENOMEM: 114 case -ENOMEM:
199 case -ENONET: 115 case -ENONET:
200 case -ENETUNREACH: 116 case -ENETUNREACH:
201 case -EHOSTUNREACH: 117 case -EHOSTUNREACH:
202 case -ECONNREFUSED: 118 case -ECONNREFUSED:
203 down_write(&server->sem);
204 if (server->vlserver) {
205 rxrpc_put_connection(server->vlserver);
206 server->vlserver = NULL;
207 }
208 up_write(&server->sem);
209 afs_put_server(server);
210 if (ret == -ENOMEM || ret == -ENONET) 119 if (ret == -ENOMEM || ret == -ENONET)
211 goto out; 120 goto out;
212 goto rotate; 121 goto rotate;
122 case -EBUSY:
123 vl->upd_busy_cnt++;
124 if (vl->upd_busy_cnt <= 3) {
125 if (vl->upd_busy_cnt > 1) {
126 /* second+ BUSY - sleep a little bit */
127 set_current_state(TASK_UNINTERRUPTIBLE);
128 schedule_timeout(1);
129 __set_current_state(TASK_RUNNING);
130 }
131 continue;
132 }
133 break;
213 case -ENOMEDIUM: 134 case -ENOMEDIUM:
214 afs_put_server(server); 135 vl->upd_rej_cnt++;
215 goto out; 136 goto rotate;
216 default: 137 default:
217 afs_put_server(server); 138 ret = -EIO;
218 ret = -ENOMEDIUM;
219 goto rotate; 139 goto rotate;
220 } 140 }
221 141
@@ -223,729 +143,579 @@ static int afs_vlocation_access_vl_by_id(struct afs_vlocation *vlocation,
223 rotate: 143 rotate:
224 cell->vl_curr_svix++; 144 cell->vl_curr_svix++;
225 cell->vl_curr_svix %= cell->vl_naddrs; 145 cell->vl_curr_svix %= cell->vl_naddrs;
146 vl->upd_busy_cnt = 0;
226 } 147 }
227 148
228 out: 149out:
150 if (ret < 0 && vl->upd_rej_cnt > 0) {
151 printk(KERN_NOTICE "kAFS:"
152 " Active volume no longer valid '%s'\n",
153 vl->vldb.name);
154 vl->valid = 0;
155 ret = -ENOMEDIUM;
156 }
157
158 up_write(&vl->cell->vl_sem);
229 _leave(" = %d", ret); 159 _leave(" = %d", ret);
230 return ret; 160 return ret;
161}
231 162
232} /* end afs_vlocation_access_vl_by_id() */
233
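
New in the by-ID variant is the -EBUSY escalation: the first busy reply retries the same server at once, the second and third sleep a jiffy before retrying, and past three the loop gives up on that server and rotates (upd_busy_cnt resets on rotation). A userspace sketch of that ladder, with nanosleep() standing in for schedule_timeout(1):

#include <stdio.h>
#include <time.h>

static int upd_busy_cnt;                        /* like vl->upd_busy_cnt */

/* returns 0 to retry the same server, -1 to give up and rotate */
static int handle_busy(void)
{
        upd_busy_cnt++;
        if (upd_busy_cnt > 3)
                return -1;
        if (upd_busy_cnt > 1) {
                /* second+ BUSY: back off briefly first */
                struct timespec ts = { 0, 10 * 1000 * 1000 };   /* ~1 jiffy at HZ=100 */
                nanosleep(&ts, NULL);
        }
        return 0;
}

int main(void)
{
        while (handle_busy() == 0)
                printf("retrying after BUSY #%d\n", upd_busy_cnt);
        return 0;
}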
234/*****************************************************************************/
235/* 163/*
236 * lookup volume location 164 * allocate a volume location record
237 * - caller must have cell->vol_sem write-locked
238 * - iterate through the VL servers in a cell until one of them admits knowing
239 * about the volume in question
240 * - lookup in the local cache if not able to find on the VL server
241 * - insert/update in the local cache if did get a VL response
242 */ 165 */
243int afs_vlocation_lookup(struct afs_cell *cell, 166static struct afs_vlocation *afs_vlocation_alloc(struct afs_cell *cell,
244 const char *name, 167 const char *name,
245 unsigned namesz, 168 size_t namesz)
246 struct afs_vlocation **_vlocation)
247{ 169{
248 struct afs_cache_vlocation vldb; 170 struct afs_vlocation *vl;
249 struct afs_vlocation *vlocation; 171
250 afs_voltype_t voltype; 172 vl = kzalloc(sizeof(struct afs_vlocation), GFP_KERNEL);
251 afs_volid_t vid; 173 if (vl) {
252 int active = 0, ret; 174 vl->cell = cell;
253 175 vl->state = AFS_VL_NEW;
254 _enter("{%s},%*.*s,%u,", cell->name, namesz, namesz, name, namesz); 176 atomic_set(&vl->usage, 1);
255 177 INIT_LIST_HEAD(&vl->link);
256 if (namesz > sizeof(vlocation->vldb.name)) { 178 INIT_LIST_HEAD(&vl->grave);
257 _leave(" = -ENAMETOOLONG"); 179 INIT_LIST_HEAD(&vl->update);
258 return -ENAMETOOLONG; 180 init_waitqueue_head(&vl->waitq);
259 } 181 spin_lock_init(&vl->lock);
260 182 memcpy(vl->vldb.name, name, namesz);
261 /* search the cell's active list first */
262 list_for_each_entry(vlocation, &cell->vl_list, link) {
263 if (namesz < sizeof(vlocation->vldb.name) &&
264 vlocation->vldb.name[namesz] != '\0')
265 continue;
266
267 if (memcmp(vlocation->vldb.name, name, namesz) == 0)
268 goto found_in_memory;
269 }
270
271 /* search the cell's graveyard list second */
272 spin_lock(&cell->vl_gylock);
273 list_for_each_entry(vlocation, &cell->vl_graveyard, link) {
274 if (namesz < sizeof(vlocation->vldb.name) &&
275 vlocation->vldb.name[namesz] != '\0')
276 continue;
277
278 if (memcmp(vlocation->vldb.name, name, namesz) == 0)
279 goto found_in_graveyard;
280 }
281 spin_unlock(&cell->vl_gylock);
282
283 /* not in the cell's in-memory lists - create a new record */
284 vlocation = kzalloc(sizeof(struct afs_vlocation), GFP_KERNEL);
285 if (!vlocation)
286 return -ENOMEM;
287
288 atomic_set(&vlocation->usage, 1);
289 INIT_LIST_HEAD(&vlocation->link);
290 rwlock_init(&vlocation->lock);
291 memcpy(vlocation->vldb.name, name, namesz);
292
293 afs_timer_init(&vlocation->timeout, &afs_vlocation_timer_ops);
294 afs_timer_init(&vlocation->upd_timer, &afs_vlocation_update_timer_ops);
295 afs_async_op_init(&vlocation->upd_op, &afs_vlocation_update_op_ops);
296
297 afs_get_cell(cell);
298 vlocation->cell = cell;
299
300 list_add_tail(&vlocation->link, &cell->vl_list);
301
302#ifdef AFS_CACHING_SUPPORT
303 /* we want to store it in the cache, plus it might already be
304 * encached */
305 cachefs_acquire_cookie(cell->cache,
306 &afs_volume_cache_index_def,
307 vlocation,
308 &vlocation->cache);
309
310 if (vlocation->valid)
311 goto found_in_cache;
312#endif
313
314 /* try to look up an unknown volume in the cell VL databases by name */
315 ret = afs_vlocation_access_vl_by_name(vlocation, name, namesz, &vldb);
316 if (ret < 0) {
317 printk("kAFS: failed to locate '%*.*s' in cell '%s'\n",
318 namesz, namesz, name, cell->name);
319 goto error;
320 } 183 }
321 184
322 goto found_on_vlserver; 185 _leave(" = %p", vl);
323 186 return vl;
324 found_in_graveyard: 187}
325 /* found in the graveyard - resurrect */
326 _debug("found in graveyard");
327 atomic_inc(&vlocation->usage);
328 list_move_tail(&vlocation->link, &cell->vl_list);
329 spin_unlock(&cell->vl_gylock);
330
331 afs_kafstimod_del_timer(&vlocation->timeout);
332 goto active;
333
334 found_in_memory:
335 /* found in memory - check to see if it's active */
336 _debug("found in memory");
337 atomic_inc(&vlocation->usage);
338 188
339 active: 189/*
340 active = 1; 190 * update record if we found it in the cache
191 */
192static int afs_vlocation_update_record(struct afs_vlocation *vl,
193 struct key *key,
194 struct afs_cache_vlocation *vldb)
195{
196 afs_voltype_t voltype;
197 afs_volid_t vid;
198 int ret;
341 199
342#ifdef AFS_CACHING_SUPPORT
343 found_in_cache:
344#endif
345 /* try to look up a cached volume in the cell VL databases by ID */ 200 /* try to look up a cached volume in the cell VL databases by ID */
346 _debug("found in cache");
347
348 _debug("Locally Cached: %s %02x { %08x(%x) %08x(%x) %08x(%x) }", 201 _debug("Locally Cached: %s %02x { %08x(%x) %08x(%x) %08x(%x) }",
349 vlocation->vldb.name, 202 vl->vldb.name,
350 vlocation->vldb.vidmask, 203 vl->vldb.vidmask,
351 ntohl(vlocation->vldb.servers[0].s_addr), 204 ntohl(vl->vldb.servers[0].s_addr),
352 vlocation->vldb.srvtmask[0], 205 vl->vldb.srvtmask[0],
353 ntohl(vlocation->vldb.servers[1].s_addr), 206 ntohl(vl->vldb.servers[1].s_addr),
354 vlocation->vldb.srvtmask[1], 207 vl->vldb.srvtmask[1],
355 ntohl(vlocation->vldb.servers[2].s_addr), 208 ntohl(vl->vldb.servers[2].s_addr),
356 vlocation->vldb.srvtmask[2] 209 vl->vldb.srvtmask[2]);
357 );
358 210
359 _debug("Vids: %08x %08x %08x", 211 _debug("Vids: %08x %08x %08x",
360 vlocation->vldb.vid[0], 212 vl->vldb.vid[0],
361 vlocation->vldb.vid[1], 213 vl->vldb.vid[1],
362 vlocation->vldb.vid[2]); 214 vl->vldb.vid[2]);
363 215
364 if (vlocation->vldb.vidmask & AFS_VOL_VTM_RW) { 216 if (vl->vldb.vidmask & AFS_VOL_VTM_RW) {
365 vid = vlocation->vldb.vid[0]; 217 vid = vl->vldb.vid[0];
366 voltype = AFSVL_RWVOL; 218 voltype = AFSVL_RWVOL;
367 } 219 } else if (vl->vldb.vidmask & AFS_VOL_VTM_RO) {
368 else if (vlocation->vldb.vidmask & AFS_VOL_VTM_RO) { 220 vid = vl->vldb.vid[1];
369 vid = vlocation->vldb.vid[1];
370 voltype = AFSVL_ROVOL; 221 voltype = AFSVL_ROVOL;
371 } 222 } else if (vl->vldb.vidmask & AFS_VOL_VTM_BAK) {
372 else if (vlocation->vldb.vidmask & AFS_VOL_VTM_BAK) { 223 vid = vl->vldb.vid[2];
373 vid = vlocation->vldb.vid[2];
374 voltype = AFSVL_BACKVOL; 224 voltype = AFSVL_BACKVOL;
375 } 225 } else {
376 else {
377 BUG(); 226 BUG();
378 vid = 0; 227 vid = 0;
379 voltype = 0; 228 voltype = 0;
380 } 229 }
381 230
382 ret = afs_vlocation_access_vl_by_id(vlocation, vid, voltype, &vldb); 231 /* contact the server to make sure the volume is still available
232 * - TODO: need to handle disconnected operation here
233 */
234 ret = afs_vlocation_access_vl_by_id(vl, key, vid, voltype, vldb);
383 switch (ret) { 235 switch (ret) {
384 /* net error */ 236 /* net error */
385 default: 237 default:
386 printk("kAFS: failed to volume '%*.*s' (%x) up in '%s': %d\n", 238 printk(KERN_WARNING "kAFS:"
387 namesz, namesz, name, vid, cell->name, ret); 239 " failed to update volume '%s' (%x) up in '%s': %d\n",
388 goto error; 240 vl->vldb.name, vid, vl->cell->name, ret);
241 _leave(" = %d", ret);
242 return ret;
389 243
390 /* pulled from local cache into memory */ 244 /* pulled from local cache into memory */
391 case 0: 245 case 0:
392 goto found_on_vlserver; 246 _leave(" = 0");
247 return 0;
393 248
394 /* uh oh... looks like the volume got deleted */ 249 /* uh oh... looks like the volume got deleted */
395 case -ENOMEDIUM: 250 case -ENOMEDIUM:
396 printk("kAFS: volume '%*.*s' (%x) does not exist '%s'\n", 251 printk(KERN_ERR "kAFS:"
397 namesz, namesz, name, vid, cell->name); 252 " volume '%s' (%x) does not exist '%s'\n",
253 vl->vldb.name, vid, vl->cell->name);
398 254
399 /* TODO: make existing record unavailable */ 255 /* TODO: make existing record unavailable */
400 goto error; 256 _leave(" = %d", ret);
257 return ret;
401 } 258 }
259}
402 260
403 found_on_vlserver: 261/*
404 _debug("Done VL Lookup: %*.*s %02x { %08x(%x) %08x(%x) %08x(%x) }", 262 * apply the update to a VL record
405 namesz, namesz, name, 263 */
406 vldb.vidmask, 264static void afs_vlocation_apply_update(struct afs_vlocation *vl,
407 ntohl(vldb.servers[0].s_addr), vldb.srvtmask[0], 265 struct afs_cache_vlocation *vldb)
408 ntohl(vldb.servers[1].s_addr), vldb.srvtmask[1], 266{
409 ntohl(vldb.servers[2].s_addr), vldb.srvtmask[2] 267 _debug("Done VL Lookup: %s %02x { %08x(%x) %08x(%x) %08x(%x) }",
410 ); 268 vldb->name, vldb->vidmask,
411 269 ntohl(vldb->servers[0].s_addr), vldb->srvtmask[0],
412 _debug("Vids: %08x %08x %08x", vldb.vid[0], vldb.vid[1], vldb.vid[2]); 270 ntohl(vldb->servers[1].s_addr), vldb->srvtmask[1],
271 ntohl(vldb->servers[2].s_addr), vldb->srvtmask[2]);
413 272
414 if ((namesz < sizeof(vlocation->vldb.name) && 273 _debug("Vids: %08x %08x %08x",
415 vlocation->vldb.name[namesz] != '\0') || 274 vldb->vid[0], vldb->vid[1], vldb->vid[2]);
416 memcmp(vldb.name, name, namesz) != 0)
417 printk("kAFS: name of volume '%*.*s' changed to '%s' on server\n",
418 namesz, namesz, name, vldb.name);
419 275
420 memcpy(&vlocation->vldb, &vldb, sizeof(vlocation->vldb)); 276 if (strcmp(vldb->name, vl->vldb.name) != 0)
277 printk(KERN_NOTICE "kAFS:"
278 " name of volume '%s' changed to '%s' on server\n",
279 vl->vldb.name, vldb->name);
421 280
422 afs_kafstimod_add_timer(&vlocation->upd_timer, 10 * HZ); 281 vl->vldb = *vldb;
423 282
424#ifdef AFS_CACHING_SUPPORT 283#ifdef AFS_CACHING_SUPPORT
425 /* update volume entry in local cache */ 284 /* update volume entry in local cache */
426 cachefs_update_cookie(vlocation->cache); 285 cachefs_update_cookie(vl->cache);
427#endif
428
429 *_vlocation = vlocation;
430 _leave(" = 0 (%p)",vlocation);
431 return 0;
432
433 error:
434 if (vlocation) {
435 if (active) {
436 __afs_put_vlocation(vlocation);
437 }
438 else {
439 list_del(&vlocation->link);
440#ifdef AFS_CACHING_SUPPORT
441 cachefs_relinquish_cookie(vlocation->cache, 0);
442#endif 286#endif
443 afs_put_cell(vlocation->cell); 287}
444 kfree(vlocation);
445 }
446 }
447
448 _leave(" = %d", ret);
449 return ret;
450} /* end afs_vlocation_lookup() */
451 288
452/*****************************************************************************/
453/* 289/*
454 * finish using a volume location record 290 * fill in a volume location record, consulting the cache and the VL server
455 * - caller must have cell->vol_sem write-locked 291 * both
456 */ 292 */
457static void __afs_put_vlocation(struct afs_vlocation *vlocation) 293static int afs_vlocation_fill_in_record(struct afs_vlocation *vl,
294 struct key *key)
458{ 295{
459 struct afs_cell *cell; 296 struct afs_cache_vlocation vldb;
297 int ret;
460 298
461 if (!vlocation) 299 _enter("");
462 return;
463 300
464 _enter("%s", vlocation->vldb.name); 301 ASSERTCMP(vl->valid, ==, 0);
465 302
466 cell = vlocation->cell; 303 memset(&vldb, 0, sizeof(vldb));
467 304
468 /* sanity check */ 305 /* see if we have an in-cache copy (will set vl->valid if there is) */
469 BUG_ON(atomic_read(&vlocation->usage) <= 0); 306#ifdef AFS_CACHING_SUPPORT
307 cachefs_acquire_cookie(cell->cache,
308 &afs_volume_cache_index_def,
309 vlocation,
310 &vl->cache);
311#endif
470 312
471 spin_lock(&cell->vl_gylock); 313 if (vl->valid) {
472 if (likely(!atomic_dec_and_test(&vlocation->usage))) { 314 /* try to update a known volume in the cell VL databases by
473 spin_unlock(&cell->vl_gylock); 315 * ID as the name may have changed */
474 _leave(""); 316 _debug("found in cache");
475 return; 317 ret = afs_vlocation_update_record(vl, key, &vldb);
318 } else {
319 /* try to look up an unknown volume in the cell VL databases by
320 * name */
321 ret = afs_vlocation_access_vl_by_name(vl, key, &vldb);
322 if (ret < 0) {
323 printk("kAFS: failed to locate '%s' in cell '%s'\n",
324 vl->vldb.name, vl->cell->name);
325 return ret;
326 }
476 } 327 }
477 328
478 /* move to graveyard queue */ 329 afs_vlocation_apply_update(vl, &vldb);
479 list_move_tail(&vlocation->link,&cell->vl_graveyard); 330 _leave(" = 0");
480 331 return 0;
481 /* remove from pending timeout queue (refcounted if actually being 332}
482 * updated) */
483 list_del_init(&vlocation->upd_op.link);
484
485 /* time out in 10 secs */
486 afs_kafstimod_del_timer(&vlocation->upd_timer);
487 afs_kafstimod_add_timer(&vlocation->timeout, 10 * HZ);
488
489 spin_unlock(&cell->vl_gylock);
490
491 _leave(" [killed]");
492} /* end __afs_put_vlocation() */
493
494/*****************************************************************************/
495/*
496 * finish using a volume location record
497 */
498void afs_put_vlocation(struct afs_vlocation *vlocation)
499{
500 if (vlocation) {
501 struct afs_cell *cell = vlocation->cell;
502
503 down_write(&cell->vl_sem);
504 __afs_put_vlocation(vlocation);
505 up_write(&cell->vl_sem);
506 }
507} /* end afs_put_vlocation() */
508 333
509/*****************************************************************************/
510/* 334/*
511 * timeout vlocation record 335 * queue a vlocation record for updates
512 * - removes from the cell's graveyard if the usage count is zero
513 */ 336 */
514void afs_vlocation_do_timeout(struct afs_vlocation *vlocation) 337void afs_vlocation_queue_for_updates(struct afs_vlocation *vl)
515{ 338{
516 struct afs_cell *cell; 339 struct afs_vlocation *xvl;
517 340
518 _enter("%s", vlocation->vldb.name); 341 /* wait at least 10 minutes before updating... */
342 vl->update_at = get_seconds() + afs_vlocation_update_timeout;
519 343
520 cell = vlocation->cell; 344 spin_lock(&afs_vlocation_updates_lock);
521 345
522 BUG_ON(atomic_read(&vlocation->usage) < 0); 346 if (!list_empty(&afs_vlocation_updates)) {
523 347 /* ... but wait at least 1 second more than the newest record
524 /* remove from graveyard if still dead */ 348 * already queued so that we don't spam the VL server suddenly
525 spin_lock(&cell->vl_gylock); 349 * with lots of requests
526 if (atomic_read(&vlocation->usage) == 0) 350 */
527 list_del_init(&vlocation->link); 351 xvl = list_entry(afs_vlocation_updates.prev,
528 else 352 struct afs_vlocation, update);
529 vlocation = NULL; 353 if (vl->update_at <= xvl->update_at)
530 spin_unlock(&cell->vl_gylock); 354 vl->update_at = xvl->update_at + 1;
531 355 } else {
532 if (!vlocation) { 356 queue_delayed_work(afs_vlocation_update_worker,
533 _leave(""); 357 &afs_vlocation_update,
534 return; /* resurrected */ 358 afs_vlocation_update_timeout * HZ);
535 } 359 }
536 360
537 /* we can now destroy it properly */ 361 list_add_tail(&vl->update, &afs_vlocation_updates);
538#ifdef AFS_CACHING_SUPPORT 362 spin_unlock(&afs_vlocation_updates_lock);
539 cachefs_relinquish_cookie(vlocation->cache, 0); 363}
540#endif
541 afs_put_cell(cell);
542
543 kfree(vlocation);
544
545 _leave(" [destroyed]");
546} /* end afs_vlocation_do_timeout() */
547 364
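
afs_vlocation_queue_for_updates() above encodes two rate limits in one assignment: a record becomes due for update no sooner than afs_vlocation_update_timeout (ten minutes) from now, and no sooner than one second after the newest record already queued, so a burst of fresh lookups cannot become a burst of VL traffic ten minutes later. The arithmetic in isolation (times in seconds):

#include <stdio.h>
#include <time.h>

#define UPDATE_TIMEOUT (10 * 60)        /* like afs_vlocation_update_timeout */

/* when may a newly queued record update, given the queue's newest deadline? */
static time_t stagger(time_t now, const time_t *newest_queued)
{
        time_t at = now + UPDATE_TIMEOUT;
        if (newest_queued && at <= *newest_queued)
                at = *newest_queued + 1;        /* trail the queue tail by 1s */
        return at;
}

int main(void)
{
        time_t now = time(NULL);
        time_t first = stagger(now, NULL);
        time_t second = stagger(now, &first);
        printf("gap = %ld\n", (long) (second - first));         /* 1 */
        return 0;
}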
548/*****************************************************************************/
549/* 365/*
550 * send an update operation to the currently selected server 366 * lookup volume location
367 * - iterate through the VL servers in a cell until one of them admits knowing
368 * about the volume in question
369 * - lookup in the local cache if not able to find on the VL server
370 * - insert/update in the local cache if did get a VL response
551 */ 371 */
552static int afs_vlocation_update_begin(struct afs_vlocation *vlocation) 372struct afs_vlocation *afs_vlocation_lookup(struct afs_cell *cell,
373 struct key *key,
374 const char *name,
375 size_t namesz)
553{ 376{
554 afs_voltype_t voltype; 377 struct afs_vlocation *vl;
555 afs_volid_t vid;
556 int ret; 378 int ret;
557 379
558 _enter("%s{ufs=%u ucs=%u}", 380 _enter("{%s},{%x},%*.*s,%zu",
559 vlocation->vldb.name, 381 cell->name, key_serial(key),
560 vlocation->upd_first_svix, 382 (int) namesz, (int) namesz, name, namesz);
561 vlocation->upd_curr_svix);
562 383
563 /* try to look up a cached volume in the cell VL databases by ID */ 384 if (namesz > sizeof(vl->vldb.name)) {
564 if (vlocation->vldb.vidmask & AFS_VOL_VTM_RW) { 385 _leave(" = -ENAMETOOLONG");
565 vid = vlocation->vldb.vid[0]; 386 return ERR_PTR(-ENAMETOOLONG);
566 voltype = AFSVL_RWVOL;
567 }
568 else if (vlocation->vldb.vidmask & AFS_VOL_VTM_RO) {
569 vid = vlocation->vldb.vid[1];
570 voltype = AFSVL_ROVOL;
571 } 387 }
572 else if (vlocation->vldb.vidmask & AFS_VOL_VTM_BAK) { 388
573 vid = vlocation->vldb.vid[2]; 389 /* see if we have an in-memory copy first */
574 voltype = AFSVL_BACKVOL; 390 down_write(&cell->vl_sem);
391 spin_lock(&cell->vl_lock);
392 list_for_each_entry(vl, &cell->vl_list, link) {
393 if (vl->vldb.name[namesz] != '\0')
394 continue;
395 if (memcmp(vl->vldb.name, name, namesz) == 0)
396 goto found_in_memory;
575 } 397 }
576 else { 398 spin_unlock(&cell->vl_lock);
577 BUG(); 399
578 vid = 0; 400 /* not in the cell's in-memory lists - create a new record */
579 voltype = 0; 401 vl = afs_vlocation_alloc(cell, name, namesz);
402 if (!vl) {
403 up_write(&cell->vl_sem);
404 return ERR_PTR(-ENOMEM);
580 } 405 }
581 406
582 /* contact the chosen server */ 407 afs_get_cell(cell);
583 ret = afs_server_lookup(
584 vlocation->cell,
585 &vlocation->cell->vl_addrs[vlocation->upd_curr_svix],
586 &vlocation->upd_op.server);
587 408
588 switch (ret) { 409 list_add_tail(&vl->link, &cell->vl_list);
589 case 0: 410 vl->state = AFS_VL_CREATING;
590 break; 411 up_write(&cell->vl_sem);
591 case -ENOMEM:
592 case -ENONET:
593 default:
594 _leave(" = %d", ret);
595 return ret;
596 }
597 412
598 /* initiate the update operation */ 413fill_in_record:
599 ret = afs_rxvl_get_entry_by_id_async(&vlocation->upd_op, vid, voltype); 414 ret = afs_vlocation_fill_in_record(vl, key);
600 if (ret < 0) { 415 if (ret < 0)
601 _leave(" = %d", ret); 416 goto error_abandon;
602 return ret; 417 spin_lock(&vl->lock);
418 vl->state = AFS_VL_VALID;
419 spin_unlock(&vl->lock);
420 wake_up(&vl->waitq);
421
422 /* schedule for regular updates */
423 afs_vlocation_queue_for_updates(vl);
424 goto success;
425
426found_in_memory:
427 /* found in memory */
428 _debug("found in memory");
429 atomic_inc(&vl->usage);
430 spin_unlock(&cell->vl_lock);
431 if (!list_empty(&vl->grave)) {
432 spin_lock(&afs_vlocation_graveyard_lock);
433 list_del_init(&vl->grave);
434 spin_unlock(&afs_vlocation_graveyard_lock);
603 } 435 }
436 up_write(&cell->vl_sem);
437
438 /* see if it was an abandoned record that we might try filling in */
439 spin_lock(&vl->lock);
440 while (vl->state != AFS_VL_VALID) {
441 afs_vlocation_state_t state = vl->state;
442
443 _debug("invalid [state %d]", state);
444
445 if (state == AFS_VL_NEW || state == AFS_VL_NO_VOLUME) {
446 vl->state = AFS_VL_CREATING;
447 spin_unlock(&vl->lock);
448 goto fill_in_record;
449 }
450
451 /* must now wait for creation or update by someone else to
452 * complete */
453 _debug("wait");
604 454
455 spin_unlock(&vl->lock);
456 ret = wait_event_interruptible(vl->waitq,
457 vl->state == AFS_VL_NEW ||
458 vl->state == AFS_VL_VALID ||
459 vl->state == AFS_VL_NO_VOLUME);
460 if (ret < 0)
461 goto error;
462 spin_lock(&vl->lock);
463 }
464 spin_unlock(&vl->lock);
465
466success:
467 _leave(" = %p",vl);
468 return vl;
469
470error_abandon:
471 spin_lock(&vl->lock);
472 vl->state = AFS_VL_NEW;
473 spin_unlock(&vl->lock);
474 wake_up(&vl->waitq);
475error:
476 ASSERT(vl != NULL);
477 afs_put_vlocation(vl);
605 _leave(" = %d", ret); 478 _leave(" = %d", ret);
606 return ret; 479 return ERR_PTR(ret);
607} /* end afs_vlocation_update_begin() */ 480}

-/*****************************************************************************/
 /*
- * abandon updating a VL record
- * - does not restart the update timer
+ * finish using a volume location record
  */
-static void afs_vlocation_update_abandon(struct afs_vlocation *vlocation,
-					 afs_vlocation_upd_t state,
-					 int ret)
-{
-	_enter("%s,%u", vlocation->vldb.name, state);
-
-	if (ret < 0)
-		printk("kAFS: Abandoning VL update '%s': %d\n",
-		       vlocation->vldb.name, ret);
-
-	/* discard the server record */
-	afs_put_server(vlocation->upd_op.server);
-	vlocation->upd_op.server = NULL;
-
-	spin_lock(&afs_vlocation_update_lock);
-	afs_vlocation_update = NULL;
-	vlocation->upd_state = state;
-
-	/* TODO: start updating next VL record on pending list */
-
-	spin_unlock(&afs_vlocation_update_lock);
-
-	_leave("");
-} /* end afs_vlocation_update_abandon() */
+void afs_put_vlocation(struct afs_vlocation *vl)
+{
+	if (!vl)
+		return;
+
+	_enter("%s", vl->vldb.name);
+
+	ASSERTCMP(atomic_read(&vl->usage), >, 0);
+
+	if (likely(!atomic_dec_and_test(&vl->usage))) {
+		_leave("");
+		return;
+	}
+
+	spin_lock(&afs_vlocation_graveyard_lock);
+	if (atomic_read(&vl->usage) == 0) {
+		_debug("buried");
+		list_move_tail(&vl->grave, &afs_vlocation_graveyard);
+		vl->time_of_death = get_seconds();
+		schedule_delayed_work(&afs_vlocation_reap,
+				      afs_vlocation_timeout * HZ);
+
+		/* suspend updates on this record */
+		if (!list_empty(&vl->update)) {
+			spin_lock(&afs_vlocation_updates_lock);
+			list_del_init(&vl->update);
+			spin_unlock(&afs_vlocation_updates_lock);
+		}
+	}
+	spin_unlock(&afs_vlocation_graveyard_lock);
+	_leave(" [killed?]");
+}
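The put path added above defers destruction: the final reference moves the record onto a graveyard list rather than freeing it, so a quick re-lookup can dig it back up before the reaper runs. A hedged sketch of that pattern outside the kernel (C11, illustrative names, not the kAFS API):

	#include <stdatomic.h>
	#include <pthread.h>
	#include <time.h>

	struct record {
		atomic_int usage;
		time_t time_of_death;
		struct record *grave_next;	/* graveyard linkage */
	};

	static pthread_mutex_t graveyard_lock = PTHREAD_MUTEX_INITIALIZER;
	static struct record *graveyard;

	void put_record(struct record *rec)
	{
		/* drop a reference; only the last putter buries the record */
		if (atomic_fetch_sub(&rec->usage, 1) != 1)
			return;

		pthread_mutex_lock(&graveyard_lock);
		if (atomic_load(&rec->usage) == 0) {
			/* bury it; a reaper frees it after a grace period,
			 * and a concurrent lookup may still resurrect it */
			rec->time_of_death = time(NULL);
			rec->grave_next = graveyard;
			graveyard = rec;
		}
		pthread_mutex_unlock(&graveyard_lock);
	}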

-/*****************************************************************************/
 /*
- * handle periodic update timeouts and busy retry timeouts
- * - called from kafstimod
+ * destroy a dead volume location record
  */
-static void afs_vlocation_update_timer(struct afs_timer *timer)
-{
-	struct afs_vlocation *vlocation =
-		list_entry(timer, struct afs_vlocation, upd_timer);
-	int ret;
-
-	_enter("%s", vlocation->vldb.name);
-
-	/* only update if not in the graveyard (defend against putting too) */
-	spin_lock(&vlocation->cell->vl_gylock);
-
-	if (!atomic_read(&vlocation->usage))
-		goto out_unlock1;
-
-	spin_lock(&afs_vlocation_update_lock);
-
-	/* if we were woken up due to EBUSY sleep then restart immediately if
-	 * possible or else jump to front of pending queue */
-	if (vlocation->upd_state == AFS_VLUPD_BUSYSLEEP) {
-		if (afs_vlocation_update) {
-			list_add(&vlocation->upd_op.link,
-				 &afs_vlocation_update_pendq);
-		}
-		else {
-			afs_get_vlocation(vlocation);
-			afs_vlocation_update = vlocation;
-			vlocation->upd_state = AFS_VLUPD_INPROGRESS;
-		}
-		goto out_unlock2;
-	}
-
-	/* put on pending queue if there's already another update in progress */
-	if (afs_vlocation_update) {
-		vlocation->upd_state = AFS_VLUPD_PENDING;
-		list_add_tail(&vlocation->upd_op.link,
-			      &afs_vlocation_update_pendq);
-		goto out_unlock2;
-	}
-
-	/* hold a ref on it while actually updating */
-	afs_get_vlocation(vlocation);
-	afs_vlocation_update = vlocation;
-	vlocation->upd_state = AFS_VLUPD_INPROGRESS;
-
-	spin_unlock(&afs_vlocation_update_lock);
-	spin_unlock(&vlocation->cell->vl_gylock);
-
-	/* okay... we can start the update */
-	_debug("BEGIN VL UPDATE [%s]", vlocation->vldb.name);
-	vlocation->upd_first_svix = vlocation->cell->vl_curr_svix;
-	vlocation->upd_curr_svix = vlocation->upd_first_svix;
-	vlocation->upd_rej_cnt = 0;
-	vlocation->upd_busy_cnt = 0;
-
-	ret = afs_vlocation_update_begin(vlocation);
-	if (ret < 0) {
-		afs_vlocation_update_abandon(vlocation, AFS_VLUPD_SLEEP, ret);
-		afs_kafstimod_add_timer(&vlocation->upd_timer,
-					AFS_VLDB_TIMEOUT);
-		afs_put_vlocation(vlocation);
-	}
-
-	_leave("");
-	return;
-
- out_unlock2:
-	spin_unlock(&afs_vlocation_update_lock);
- out_unlock1:
-	spin_unlock(&vlocation->cell->vl_gylock);
-	_leave("");
-	return;
-
-} /* end afs_vlocation_update_timer() */
+static void afs_vlocation_destroy(struct afs_vlocation *vl)
+{
+	_enter("%p", vl);
+
+#ifdef AFS_CACHING_SUPPORT
+	cachefs_relinquish_cookie(vl->cache, 0);
+#endif
+
+	afs_put_cell(vl->cell);
+	kfree(vl);
+}
+
+/*
+ * reap dead volume location records
+ */
+static void afs_vlocation_reaper(struct work_struct *work)
+{
+	LIST_HEAD(corpses);
+	struct afs_vlocation *vl;
+	unsigned long delay, expiry;
+	time_t now;
+
+	_enter("");
+
+	now = get_seconds();
+	spin_lock(&afs_vlocation_graveyard_lock);
+
+	while (!list_empty(&afs_vlocation_graveyard)) {
+		vl = list_entry(afs_vlocation_graveyard.next,
+				struct afs_vlocation, grave);
+
+		_debug("check %p", vl);
+
+		/* the queue is ordered most dead first */
+		expiry = vl->time_of_death + afs_vlocation_timeout;
+		if (expiry > now) {
+			delay = (expiry - now) * HZ;
+			_debug("delay %lu", delay);
+			if (!schedule_delayed_work(&afs_vlocation_reap,
+						   delay)) {
+				cancel_delayed_work(&afs_vlocation_reap);
+				schedule_delayed_work(&afs_vlocation_reap,
+						      delay);
+			}
+			break;
+		}
+
+		spin_lock(&vl->cell->vl_lock);
+		if (atomic_read(&vl->usage) > 0) {
+			_debug("no reap");
+			list_del_init(&vl->grave);
+		} else {
+			_debug("reap");
+			list_move_tail(&vl->grave, &corpses);
+			list_del_init(&vl->link);
+		}
+		spin_unlock(&vl->cell->vl_lock);
+	}
+
+	spin_unlock(&afs_vlocation_graveyard_lock);
+
+	/* now reap the corpses we've extracted */
+	while (!list_empty(&corpses)) {
+		vl = list_entry(corpses.next, struct afs_vlocation, grave);
+		list_del(&vl->grave);
+		afs_vlocation_destroy(vl);
+	}
+
+	_leave("");
+}
+
+/*
+ * initialise the VL update process
+ */
+int __init afs_vlocation_update_init(void)
+{
+	afs_vlocation_update_worker =
+		create_singlethread_workqueue("kafs_vlupdated");
+	return afs_vlocation_update_worker ? 0 : -ENOMEM;
+}
+
+/*
+ * discard all the volume location records for rmmod
+ */
+void afs_vlocation_purge(void)
+{
+	afs_vlocation_timeout = 0;
+
+	spin_lock(&afs_vlocation_updates_lock);
+	list_del_init(&afs_vlocation_updates);
+	spin_unlock(&afs_vlocation_updates_lock);
+	cancel_delayed_work(&afs_vlocation_update);
+	queue_delayed_work(afs_vlocation_update_worker,
+			   &afs_vlocation_update, 0);
+	destroy_workqueue(afs_vlocation_update_worker);
+
+	cancel_delayed_work(&afs_vlocation_reap);
+	schedule_delayed_work(&afs_vlocation_reap, 0);
+}

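One detail worth noting in the reaper above: schedule_delayed_work() fails when the work item is already queued, so the code cancels and requeues to force the shorter delay. The same step isolated as a helper (a pseudo-kernel sketch of the idiom used above, not a function from the patch):

	/* Re-arm a delayed work item even if it is already pending.
	 * schedule_delayed_work() returns false for an already-queued
	 * item, in which case the old (possibly longer) timer must be
	 * cancelled before the new delay can take effect. */
	static void rearm_delayed_work(struct delayed_work *work,
				       unsigned long delay)
	{
		if (!schedule_delayed_work(work, delay)) {
			cancel_delayed_work(work);
			schedule_delayed_work(work, delay);
		}
	}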
-/*****************************************************************************/
 /*
- * attend to an update operation upon which an event happened
- * - called in kafsasyncd context
+ * update a volume location
  */
-static void afs_vlocation_update_attend(struct afs_async_op *op)
-{
-	struct afs_cache_vlocation vldb;
-	struct afs_vlocation *vlocation =
-		list_entry(op, struct afs_vlocation, upd_op);
-	unsigned tmp;
-	int ret;
-
-	_enter("%s", vlocation->vldb.name);
-
-	ret = afs_rxvl_get_entry_by_id_async2(op, &vldb);
-	switch (ret) {
-	case -EAGAIN:
-		_leave(" [unfinished]");
-		return;
-
-	case 0:
-		_debug("END VL UPDATE: %d\n", ret);
-		vlocation->valid = 1;
-
-		_debug("Done VL Lookup: %02x { %08x(%x) %08x(%x) %08x(%x) }",
-		       vldb.vidmask,
-		       ntohl(vldb.servers[0].s_addr), vldb.srvtmask[0],
-		       ntohl(vldb.servers[1].s_addr), vldb.srvtmask[1],
-		       ntohl(vldb.servers[2].s_addr), vldb.srvtmask[2]
-		       );
-
-		_debug("Vids: %08x %08x %08x",
-		       vldb.vid[0], vldb.vid[1], vldb.vid[2]);
-
-		afs_vlocation_update_abandon(vlocation, AFS_VLUPD_SLEEP, 0);
-
-		down_write(&vlocation->cell->vl_sem);
-
-		/* actually update the cache */
-		if (strncmp(vldb.name, vlocation->vldb.name,
-			    sizeof(vlocation->vldb.name)) != 0)
-			printk("kAFS: name of volume '%s'"
-			       " changed to '%s' on server\n",
-			       vlocation->vldb.name, vldb.name);
-
-		memcpy(&vlocation->vldb, &vldb, sizeof(vlocation->vldb));
-
-#if 0
-		/* TODO update volume entry in local cache */
-#endif
-
-		up_write(&vlocation->cell->vl_sem);
-
-		if (ret < 0)
-			printk("kAFS: failed to update local cache: %d\n", ret);
-
-		afs_kafstimod_add_timer(&vlocation->upd_timer,
-					AFS_VLDB_TIMEOUT);
-		afs_put_vlocation(vlocation);
-		_leave(" [found]");
-		return;
-
-	case -ENOMEDIUM:
-		vlocation->upd_rej_cnt++;
-		goto try_next;
-
-		/* the server is locked - retry in a very short while */
-	case -EBUSY:
-		vlocation->upd_busy_cnt++;
-		if (vlocation->upd_busy_cnt > 3)
-			goto try_next; /* too many retries */
-
-		afs_vlocation_update_abandon(vlocation,
-					     AFS_VLUPD_BUSYSLEEP, 0);
-		afs_kafstimod_add_timer(&vlocation->upd_timer, HZ / 2);
-		afs_put_vlocation(vlocation);
-		_leave(" [busy]");
-		return;
-
-	case -ENETUNREACH:
-	case -EHOSTUNREACH:
-	case -ECONNREFUSED:
-	case -EREMOTEIO:
-		/* record bad vlserver info in the cell too
-		 * - TODO: use down_write_trylock() if available
-		 */
-		if (vlocation->upd_curr_svix == vlocation->cell->vl_curr_svix)
-			vlocation->cell->vl_curr_svix =
-				vlocation->cell->vl_curr_svix %
-				vlocation->cell->vl_naddrs;
-
-	case -EBADRQC:
-	case -EINVAL:
-	case -EACCES:
-	case -EBADMSG:
-		goto try_next;
-
-	default:
-		goto abandon;
-	}
-
-	/* try contacting the next server */
- try_next:
-	vlocation->upd_busy_cnt = 0;
-
-	/* discard the server record */
-	afs_put_server(vlocation->upd_op.server);
-	vlocation->upd_op.server = NULL;
-
-	tmp = vlocation->cell->vl_naddrs;
-	if (tmp == 0)
-		goto abandon;
-
-	vlocation->upd_curr_svix++;
-	if (vlocation->upd_curr_svix >= tmp)
-		vlocation->upd_curr_svix = 0;
-	if (vlocation->upd_first_svix >= tmp)
-		vlocation->upd_first_svix = tmp - 1;
-
-	/* move to the next server */
-	if (vlocation->upd_curr_svix != vlocation->upd_first_svix) {
-		afs_vlocation_update_begin(vlocation);
-		_leave(" [next]");
-		return;
-	}
-
-	/* run out of servers to try - was the volume rejected? */
-	if (vlocation->upd_rej_cnt > 0) {
-		printk("kAFS: Active volume no longer valid '%s'\n",
-		       vlocation->vldb.name);
-		vlocation->valid = 0;
-		afs_vlocation_update_abandon(vlocation, AFS_VLUPD_SLEEP, 0);
-		afs_kafstimod_add_timer(&vlocation->upd_timer,
-					AFS_VLDB_TIMEOUT);
-		afs_put_vlocation(vlocation);
-		_leave(" [invalidated]");
-		return;
-	}
-
-	/* abandon the update */
- abandon:
-	afs_vlocation_update_abandon(vlocation, AFS_VLUPD_SLEEP, ret);
-	afs_kafstimod_add_timer(&vlocation->upd_timer, HZ * 10);
-	afs_put_vlocation(vlocation);
-	_leave(" [abandoned]");
-
-} /* end afs_vlocation_update_attend() */
-
-/*****************************************************************************/
-/*
- * deal with an update operation being discarded
- * - called in kafsasyncd context when it's dying due to rmmod
- * - the call has already been aborted and put()'d
- */
-static void afs_vlocation_update_discard(struct afs_async_op *op)
-{
-	struct afs_vlocation *vlocation =
-		list_entry(op, struct afs_vlocation, upd_op);
-
-	_enter("%s", vlocation->vldb.name);
-
-	afs_put_server(op->server);
-	op->server = NULL;
-
-	afs_put_vlocation(vlocation);
-
-	_leave("");
-} /* end afs_vlocation_update_discard() */
-
-/*****************************************************************************/
-/*
- * match a VLDB record stored in the cache
- * - may also load target from entry
- */
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_vlocation_cache_match(void *target,
-						     const void *entry)
-{
-	const struct afs_cache_vlocation *vldb = entry;
-	struct afs_vlocation *vlocation = target;
-
-	_enter("{%s},{%s}", vlocation->vldb.name, vldb->name);
-
-	if (strncmp(vlocation->vldb.name, vldb->name, sizeof(vldb->name)) == 0
-	    ) {
-		if (!vlocation->valid ||
-		    vlocation->vldb.rtime == vldb->rtime
-		    ) {
-			vlocation->vldb = *vldb;
-			vlocation->valid = 1;
-			_leave(" = SUCCESS [c->m]");
-			return CACHEFS_MATCH_SUCCESS;
-		}
-		/* need to update cache if cached info differs */
-		else if (memcmp(&vlocation->vldb, vldb, sizeof(*vldb)) != 0) {
-			/* delete if VIDs for this name differ */
-			if (memcmp(&vlocation->vldb.vid,
-				   &vldb->vid,
-				   sizeof(vldb->vid)) != 0) {
-				_leave(" = DELETE");
-				return CACHEFS_MATCH_SUCCESS_DELETE;
-			}
-
-			_leave(" = UPDATE");
-			return CACHEFS_MATCH_SUCCESS_UPDATE;
-		}
-		else {
-			_leave(" = SUCCESS");
-			return CACHEFS_MATCH_SUCCESS;
-		}
-	}
-
-	_leave(" = FAILED");
-	return CACHEFS_MATCH_FAILED;
-} /* end afs_vlocation_cache_match() */
-#endif
-
-/*****************************************************************************/
-/*
- * update a VLDB record stored in the cache
- */
-#ifdef AFS_CACHING_SUPPORT
-static void afs_vlocation_cache_update(void *source, void *entry)
-{
-	struct afs_cache_vlocation *vldb = entry;
-	struct afs_vlocation *vlocation = source;
-
-	_enter("");
-
-	*vldb = vlocation->vldb;
-
-} /* end afs_vlocation_cache_update() */
-#endif
+static void afs_vlocation_updater(struct work_struct *work)
+{
+	struct afs_cache_vlocation vldb;
+	struct afs_vlocation *vl, *xvl;
+	time_t now;
+	long timeout;
+	int ret;
+
+	_enter("");
+
+	now = get_seconds();
+
+	/* find a record to update */
+	spin_lock(&afs_vlocation_updates_lock);
+	for (;;) {
+		if (list_empty(&afs_vlocation_updates)) {
+			spin_unlock(&afs_vlocation_updates_lock);
+			_leave(" [nothing]");
+			return;
+		}
+
+		vl = list_entry(afs_vlocation_updates.next,
+				struct afs_vlocation, update);
+		if (atomic_read(&vl->usage) > 0)
+			break;
+		list_del_init(&vl->update);
+	}
+
+	timeout = vl->update_at - now;
+	if (timeout > 0) {
+		queue_delayed_work(afs_vlocation_update_worker,
+				   &afs_vlocation_update, timeout * HZ);
+		spin_unlock(&afs_vlocation_updates_lock);
+		_leave(" [nothing]");
+		return;
+	}
+
+	list_del_init(&vl->update);
+	atomic_inc(&vl->usage);
+	spin_unlock(&afs_vlocation_updates_lock);
+
+	/* we can now perform the update */
+	_debug("update %s", vl->vldb.name);
+	vl->state = AFS_VL_UPDATING;
+	vl->upd_rej_cnt = 0;
+	vl->upd_busy_cnt = 0;
+
+	ret = afs_vlocation_update_record(vl, NULL, &vldb);
+	spin_lock(&vl->lock);
+	switch (ret) {
+	case 0:
+		afs_vlocation_apply_update(vl, &vldb);
+		vl->state = AFS_VL_VALID;
+		break;
+	case -ENOMEDIUM:
+		vl->state = AFS_VL_VOLUME_DELETED;
+		break;
+	default:
+		vl->state = AFS_VL_UNCERTAIN;
+		break;
+	}
+	spin_unlock(&vl->lock);
+	wake_up(&vl->waitq);
+
+	/* and then reschedule */
+	_debug("reschedule");
+	vl->update_at = get_seconds() + afs_vlocation_update_timeout;
+
+	spin_lock(&afs_vlocation_updates_lock);
+
+	if (!list_empty(&afs_vlocation_updates)) {
+		/* next update in 10 minutes, but wait at least 1 second more
+		 * than the newest record already queued so that we don't spam
+		 * the VL server suddenly with lots of requests
+		 */
+		xvl = list_entry(afs_vlocation_updates.prev,
+				 struct afs_vlocation, update);
+		if (vl->update_at <= xvl->update_at)
+			vl->update_at = xvl->update_at + 1;
+		xvl = list_entry(afs_vlocation_updates.next,
+				 struct afs_vlocation, update);
+		timeout = xvl->update_at - now;
+		if (timeout < 0)
+			timeout = 0;
+	} else {
+		timeout = afs_vlocation_update_timeout;
+	}
+
+	ASSERT(list_empty(&vl->update));
+
+	list_add_tail(&vl->update, &afs_vlocation_updates);
+
+	_debug("timeout %ld", timeout);
+	queue_delayed_work(afs_vlocation_update_worker,
+			   &afs_vlocation_update, timeout * HZ);
+	spin_unlock(&afs_vlocation_updates_lock);
+	afs_put_vlocation(vl);
+}
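The rescheduling logic above deliberately spaces records at least one second apart, so a burst of volumes coming due at once does not hammer the VL server. The effect in isolation (plain C, illustrative name only):

	#include <time.h>

	/* Queue discipline used by the updater above, in miniature: a new
	 * deadline is pushed back behind the newest one already queued. */
	time_t space_out_update(time_t wanted_at, time_t newest_queued_at)
	{
		if (wanted_at <= newest_queued_at)
			wanted_at = newest_queued_at + 1;	/* >= 1s apart */
		return wanted_at;
	}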
diff --git a/fs/afs/vnode.c b/fs/afs/vnode.c
index cf62da5d7825..a1904ab8426a 100644
--- a/fs/afs/vnode.c
+++ b/fs/afs/vnode.c
@@ -1,6 +1,6 @@
-/* vnode.c: AFS vnode management
+/* AFS vnode management
  *
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
  *
  * This program is free software; you can redistribute it and/or
@@ -14,142 +14,237 @@
 #include <linux/init.h>
 #include <linux/slab.h>
 #include <linux/fs.h>
-#include <linux/pagemap.h>
-#include "volume.h"
-#include "cell.h"
-#include "cmservice.h"
-#include "fsclient.h"
-#include "vlclient.h"
-#include "vnode.h"
 #include "internal.h"
 
-static void afs_vnode_cb_timed_out(struct afs_timer *timer);
-
-struct afs_timer_ops afs_vnode_cb_timed_out_ops = {
-	.timed_out = afs_vnode_cb_timed_out,
-};
-
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_vnode_cache_match(void *target,
-						 const void *entry);
-static void afs_vnode_cache_update(void *source, void *entry);
-
-struct cachefs_index_def afs_vnode_cache_index_def = {
-	.name = "vnode",
-	.data_size = sizeof(struct afs_cache_vnode),
-	.keys[0] = { CACHEFS_INDEX_KEYS_BIN, 4 },
-	.match = afs_vnode_cache_match,
-	.update = afs_vnode_cache_update,
-};
-#endif
+#if 0
+static noinline bool dump_tree_aux(struct rb_node *node, struct rb_node *parent,
+				   int depth, char lr)
+{
+	struct afs_vnode *vnode;
+	bool bad = false;
+
+	if (!node)
+		return false;
+
+	if (node->rb_left)
+		bad = dump_tree_aux(node->rb_left, node, depth + 2, '/');
+
+	vnode = rb_entry(node, struct afs_vnode, cb_promise);
+	_debug("%c %*.*s%c%p {%d}",
+	       rb_is_red(node) ? 'R' : 'B',
+	       depth, depth, "", lr,
+	       vnode, vnode->cb_expires_at);
+	if (rb_parent(node) != parent) {
+		printk("BAD: %p != %p\n", rb_parent(node), parent);
+		bad = true;
+	}
+
+	if (node->rb_right)
+		bad |= dump_tree_aux(node->rb_right, node, depth + 2, '\\');
+
+	return bad;
+}
+
+static noinline void dump_tree(const char *name, struct afs_server *server)
+{
+	_enter("%s", name);
+	if (dump_tree_aux(server->cb_promises.rb_node, NULL, 0, '-'))
+		BUG();
+}
+#endif
 
-/*****************************************************************************/
-/*
- * handle a callback timing out
- * TODO: retain a ref to vnode struct for an outstanding callback timeout
- */
-static void afs_vnode_cb_timed_out(struct afs_timer *timer)
-{
-	struct afs_server *oldserver;
-	struct afs_vnode *vnode;
-
-	vnode = list_entry(timer, struct afs_vnode, cb_timeout);
-
-	_enter("%p", vnode);
-
-	/* set the changed flag in the vnode and release the server */
-	spin_lock(&vnode->lock);
-
-	oldserver = xchg(&vnode->cb_server, NULL);
-	if (oldserver) {
-		vnode->flags |= AFS_VNODE_CHANGED;
-
-		spin_lock(&afs_cb_hash_lock);
-		list_del_init(&vnode->cb_hash_link);
-		spin_unlock(&afs_cb_hash_lock);
-
-		spin_lock(&oldserver->cb_lock);
-		list_del_init(&vnode->cb_link);
-		spin_unlock(&oldserver->cb_lock);
-	}
-
-	spin_unlock(&vnode->lock);
-
-	afs_put_server(oldserver);
-
-	_leave("");
-} /* end afs_vnode_cb_timed_out() */
-
-/*****************************************************************************/
-/*
- * finish off updating the recorded status of a file
- * - starts callback expiry timer
- * - adds to server's callback list
- */
-static void afs_vnode_finalise_status_update(struct afs_vnode *vnode,
-					     struct afs_server *server,
-					     int ret)
-{
-	struct afs_server *oldserver = NULL;
-
-	_enter("%p,%p,%d", vnode, server, ret);
-
-	spin_lock(&vnode->lock);
-
-	vnode->flags &= ~AFS_VNODE_CHANGED;
-
-	if (ret == 0) {
-		/* adjust the callback timeout appropriately */
-		afs_kafstimod_add_timer(&vnode->cb_timeout,
-					vnode->cb_expiry * HZ);
-
-		spin_lock(&afs_cb_hash_lock);
-		list_move_tail(&vnode->cb_hash_link,
-			       &afs_cb_hash(server, &vnode->fid));
-		spin_unlock(&afs_cb_hash_lock);
-
-		/* swap ref to old callback server with that for new callback
-		 * server */
-		oldserver = xchg(&vnode->cb_server, server);
-		if (oldserver != server) {
-			if (oldserver) {
-				spin_lock(&oldserver->cb_lock);
-				list_del_init(&vnode->cb_link);
-				spin_unlock(&oldserver->cb_lock);
-			}
-
-			afs_get_server(server);
-			spin_lock(&server->cb_lock);
-			list_add_tail(&vnode->cb_link, &server->cb_promises);
-			spin_unlock(&server->cb_lock);
-		}
-		else {
-			/* same server */
-			oldserver = NULL;
-		}
-	}
-	else if (ret == -ENOENT) {
-		/* the file was deleted - clear the callback timeout */
-		oldserver = xchg(&vnode->cb_server, NULL);
-		afs_kafstimod_del_timer(&vnode->cb_timeout);
-
-		_debug("got NOENT from server - marking file deleted");
-		vnode->flags |= AFS_VNODE_DELETED;
-	}
-
-	vnode->update_cnt--;
-
-	spin_unlock(&vnode->lock);
-
-	wake_up_all(&vnode->update_waitq);
-
-	afs_put_server(oldserver);
-
-	_leave("");
-
-} /* end afs_vnode_finalise_status_update() */
-
-/*****************************************************************************/
+/*
+ * insert a vnode into the backing server's vnode tree
+ */
+static void afs_install_vnode(struct afs_vnode *vnode,
+			      struct afs_server *server)
+{
+	struct afs_server *old_server = vnode->server;
+	struct afs_vnode *xvnode;
+	struct rb_node *parent, **p;
+
+	_enter("%p,%p", vnode, server);
+
+	if (old_server) {
+		spin_lock(&old_server->fs_lock);
+		rb_erase(&vnode->server_rb, &old_server->fs_vnodes);
+		spin_unlock(&old_server->fs_lock);
+	}
+
+	afs_get_server(server);
+	vnode->server = server;
+	afs_put_server(old_server);
+
+	/* insert into the server's vnode tree in FID order */
+	spin_lock(&server->fs_lock);
+
+	parent = NULL;
+	p = &server->fs_vnodes.rb_node;
+	while (*p) {
+		parent = *p;
+		xvnode = rb_entry(parent, struct afs_vnode, server_rb);
+		if (vnode->fid.vid < xvnode->fid.vid)
+			p = &(*p)->rb_left;
+		else if (vnode->fid.vid > xvnode->fid.vid)
+			p = &(*p)->rb_right;
+		else if (vnode->fid.vnode < xvnode->fid.vnode)
+			p = &(*p)->rb_left;
+		else if (vnode->fid.vnode > xvnode->fid.vnode)
+			p = &(*p)->rb_right;
+		else if (vnode->fid.unique < xvnode->fid.unique)
+			p = &(*p)->rb_left;
+		else if (vnode->fid.unique > xvnode->fid.unique)
+			p = &(*p)->rb_right;
+		else
+			BUG(); /* can't happen unless afs_iget() malfunctions */
+	}
+
+	rb_link_node(&vnode->server_rb, parent, p);
+	rb_insert_color(&vnode->server_rb, &server->fs_vnodes);
+
+	spin_unlock(&server->fs_lock);
+	_leave("");
+}
+
+/*
+ * insert a vnode into the promising server's update/expiration tree
+ * - caller must hold vnode->lock
+ */
+static void afs_vnode_note_promise(struct afs_vnode *vnode,
+				   struct afs_server *server)
+{
+	struct afs_server *old_server;
+	struct afs_vnode *xvnode;
+	struct rb_node *parent, **p;
+
+	_enter("%p,%p", vnode, server);
+
+	ASSERT(server != NULL);
+
+	old_server = vnode->server;
+	if (vnode->cb_promised) {
+		if (server == old_server &&
+		    vnode->cb_expires == vnode->cb_expires_at) {
+			_leave(" [no change]");
+			return;
+		}
+
+		spin_lock(&old_server->cb_lock);
+		if (vnode->cb_promised) {
+			_debug("delete");
+			rb_erase(&vnode->cb_promise, &old_server->cb_promises);
+			vnode->cb_promised = false;
+		}
+		spin_unlock(&old_server->cb_lock);
+	}
+
+	if (vnode->server != server)
+		afs_install_vnode(vnode, server);
+
+	vnode->cb_expires_at = vnode->cb_expires;
+	_debug("PROMISE on %p {%lu}",
+	       vnode, (unsigned long) vnode->cb_expires_at);
+
+	/* abuse an RB-tree to hold the expiration order (we may have multiple
+	 * items with the same expiration time) */
+	spin_lock(&server->cb_lock);
+
+	parent = NULL;
+	p = &server->cb_promises.rb_node;
+	while (*p) {
+		parent = *p;
+		xvnode = rb_entry(parent, struct afs_vnode, cb_promise);
+		if (vnode->cb_expires_at < xvnode->cb_expires_at)
+			p = &(*p)->rb_left;
+		else
+			p = &(*p)->rb_right;
+	}
+
+	rb_link_node(&vnode->cb_promise, parent, p);
+	rb_insert_color(&vnode->cb_promise, &server->cb_promises);
+	vnode->cb_promised = true;
+
+	spin_unlock(&server->cb_lock);
+	_leave("");
+}
+
+/*
+ * handle remote file deletion by discarding the callback promise
+ */
+static void afs_vnode_deleted_remotely(struct afs_vnode *vnode)
+{
+	struct afs_server *server;
+
+	set_bit(AFS_VNODE_DELETED, &vnode->flags);
+
+	server = vnode->server;
+	if (vnode->cb_promised) {
+		spin_lock(&server->cb_lock);
+		if (vnode->cb_promised) {
+			rb_erase(&vnode->cb_promise, &server->cb_promises);
+			vnode->cb_promised = false;
+		}
+		spin_unlock(&server->cb_lock);
+	}
+
+	spin_lock(&vnode->server->fs_lock);
+	rb_erase(&vnode->server_rb, &vnode->server->fs_vnodes);
+	spin_unlock(&vnode->server->fs_lock);
+
+	vnode->server = NULL;
+	afs_put_server(server);
+}
+
+/*
+ * finish off updating the recorded status of a file after a successful
+ * operation completion
+ * - starts callback expiry timer
+ * - adds to server's callback list
+ */
+void afs_vnode_finalise_status_update(struct afs_vnode *vnode,
+				      struct afs_server *server)
+{
+	struct afs_server *oldserver = NULL;
+
+	_enter("%p,%p", vnode, server);
+
+	spin_lock(&vnode->lock);
+	clear_bit(AFS_VNODE_CB_BROKEN, &vnode->flags);
+	afs_vnode_note_promise(vnode, server);
+	vnode->update_cnt--;
+	ASSERTCMP(vnode->update_cnt, >=, 0);
+	spin_unlock(&vnode->lock);
+
+	wake_up_all(&vnode->update_waitq);
+	afs_put_server(oldserver);
+	_leave("");
+}
+
+/*
+ * finish off updating the recorded status of a file after an operation failed
+ */
+static void afs_vnode_status_update_failed(struct afs_vnode *vnode, int ret)
+{
+	_enter("%p,%d", vnode, ret);
+
+	spin_lock(&vnode->lock);
+
+	clear_bit(AFS_VNODE_CB_BROKEN, &vnode->flags);
+
+	if (ret == -ENOENT) {
+		/* the file was deleted on the server */
+		_debug("got NOENT from server - marking file deleted");
+		afs_vnode_deleted_remotely(vnode);
+	}
+
+	vnode->update_cnt--;
+	ASSERTCMP(vnode->update_cnt, >=, 0);
+	spin_unlock(&vnode->lock);
+
+	wake_up_all(&vnode->update_waitq);
+	_leave("");
+}
+
 /*
  * fetch file status from the volume
  * - don't issue a fetch if:
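afs_install_vnode() above keys its rb-tree on the (vid, vnode, unique) triple with a cascade of comparisons. The same ordering is easier to see folded into a single comparator; a sketch in plain C (illustrative struct, not the kernel's):

	struct fid { unsigned vid, vnode, unique; };

	/* Compare two FIDs in the precedence order the insertion loop above
	 * uses: volume ID first, then vnode number, then uniquifier. */
	static int fid_cmp(const struct fid *a, const struct fid *b)
	{
		if (a->vid != b->vid)
			return a->vid < b->vid ? -1 : 1;
		if (a->vnode != b->vnode)
			return a->vnode < b->vnode ? -1 : 1;
		if (a->unique != b->unique)
			return a->unique < b->unique ? -1 : 1;
		return 0;	/* duplicate FID - the kernel code BUG()s here */
	}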
@@ -157,9 +252,11 @@ static void afs_vnode_finalise_status_update(struct afs_vnode *vnode,
  * - there are any outstanding ops that will fetch the status
  * - TODO implement local caching
  */
-int afs_vnode_fetch_status(struct afs_vnode *vnode)
+int afs_vnode_fetch_status(struct afs_vnode *vnode,
+			   struct afs_vnode *auth_vnode, struct key *key)
 {
 	struct afs_server *server;
+	unsigned long acl_order;
 	int ret;
 
 	DECLARE_WAITQUEUE(myself, current);
@@ -168,38 +265,49 @@ int afs_vnode_fetch_status(struct afs_vnode *vnode)
 	       vnode->volume->vlocation->vldb.name,
 	       vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique);
 
-	if (!(vnode->flags & AFS_VNODE_CHANGED) && vnode->cb_server) {
+	if (!test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags) &&
+	    vnode->cb_promised) {
 		_leave(" [unchanged]");
 		return 0;
 	}
 
-	if (vnode->flags & AFS_VNODE_DELETED) {
+	if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) {
 		_leave(" [deleted]");
 		return -ENOENT;
 	}
 
+	acl_order = 0;
+	if (auth_vnode)
+		acl_order = auth_vnode->acl_order;
+
 	spin_lock(&vnode->lock);
 
-	if (!(vnode->flags & AFS_VNODE_CHANGED)) {
+	if (!test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags) &&
+	    vnode->cb_promised) {
 		spin_unlock(&vnode->lock);
 		_leave(" [unchanged]");
 		return 0;
 	}
 
+	ASSERTCMP(vnode->update_cnt, >=, 0);
+
 	if (vnode->update_cnt > 0) {
 		/* someone else started a fetch */
+		_debug("wait on fetch %d", vnode->update_cnt);
+
 		set_current_state(TASK_UNINTERRUPTIBLE);
+		ASSERT(myself.func != NULL);
 		add_wait_queue(&vnode->update_waitq, &myself);
 
 		/* wait for the status to be updated */
 		for (;;) {
-			if (!(vnode->flags & AFS_VNODE_CHANGED))
+			if (!test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags))
 				break;
-			if (vnode->flags & AFS_VNODE_DELETED)
+			if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
 				break;
 
-			/* it got updated and invalidated all before we saw
-			 * it */
+			/* check to see if it got updated and invalidated all
+			 * before we saw it */
 			if (vnode->update_cnt == 0) {
 				remove_wait_queue(&vnode->update_waitq,
 						  &myself);
@@ -219,10 +327,11 @@ int afs_vnode_fetch_status(struct afs_vnode *vnode)
 	spin_unlock(&vnode->lock);
 	set_current_state(TASK_RUNNING);
 
-	return vnode->flags & AFS_VNODE_DELETED ? -ENOENT : 0;
+	return test_bit(AFS_VNODE_DELETED, &vnode->flags) ?
+		-ENOENT : 0;
 	}
 
- get_anyway:
+get_anyway:
 	/* okay... we're going to have to initiate the op */
 	vnode->update_cnt++;
 
@@ -232,39 +341,60 @@ int afs_vnode_fetch_status(struct afs_vnode *vnode)
 	 * vnode */
 	do {
 		/* pick a server to query */
-		ret = afs_volume_pick_fileserver(vnode->volume, &server);
-		if (ret<0)
-			return ret;
+		server = afs_volume_pick_fileserver(vnode);
+		if (IS_ERR(server))
+			goto no_server;
 
-		_debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr));
+		_debug("USING SERVER: %p{%08x}",
+		       server, ntohl(server->addr.s_addr));
 
-		ret = afs_rxfs_fetch_file_status(server, vnode, NULL);
+		ret = afs_fs_fetch_file_status(server, key, vnode, NULL,
+					       &afs_sync_call);
 
-	} while (!afs_volume_release_fileserver(vnode->volume, server, ret));
+	} while (!afs_volume_release_fileserver(vnode, server, ret));
 
 	/* adjust the flags */
-	afs_vnode_finalise_status_update(vnode, server, ret);
+	if (ret == 0) {
+		_debug("adjust");
+		if (auth_vnode)
+			afs_cache_permit(vnode, key, acl_order);
+		afs_vnode_finalise_status_update(vnode, server);
+		afs_put_server(server);
+	} else {
+		_debug("failed [%d]", ret);
+		afs_vnode_status_update_failed(vnode, ret);
+	}
 
-	_leave(" = %d", ret);
-	return ret;
-} /* end afs_vnode_fetch_status() */
+	ASSERTCMP(vnode->update_cnt, >=, 0);
 
-/*****************************************************************************/
+	_leave(" = %d [cnt %d]", ret, vnode->update_cnt);
+	return ret;
+
+no_server:
+	spin_lock(&vnode->lock);
+	vnode->update_cnt--;
+	ASSERTCMP(vnode->update_cnt, >=, 0);
+	spin_unlock(&vnode->lock);
+	_leave(" = %ld [cnt %d]", PTR_ERR(server), vnode->update_cnt);
+	return PTR_ERR(server);
+}
+
 /*
  * fetch file data from the volume
- * - TODO implement caching and server failover
+ * - TODO implement caching
  */
-int afs_vnode_fetch_data(struct afs_vnode *vnode,
-			 struct afs_rxfs_fetch_descriptor *desc)
+int afs_vnode_fetch_data(struct afs_vnode *vnode, struct key *key,
+			 off_t offset, size_t length, struct page *page)
 {
 	struct afs_server *server;
 	int ret;
 
-	_enter("%s,{%u,%u,%u}",
+	_enter("%s{%u,%u,%u},%x,,,",
 	       vnode->volume->vlocation->vldb.name,
 	       vnode->fid.vid,
 	       vnode->fid.vnode,
-	       vnode->fid.unique);
+	       vnode->fid.unique,
+	       key_serial(key));
 
 	/* this op will fetch the status */
 	spin_lock(&vnode->lock);
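The do/while in the hunk above is the standard kAFS server-selection idiom: pick a server (taking a reference), try the RPC, and let afs_volume_release_fileserver() decide whether the error merits rotating to another server. Schematically (a sketch assembled from the calls in this patch, not a function the patch itself adds):

	/* Schematic retry loop: pick() elevates the chosen server's usage
	 * count, and release() returns true once the result is final. */
	int try_on_some_server(struct afs_vnode *vnode, struct key *key)
	{
		struct afs_server *server;
		int ret;

		do {
			server = afs_volume_pick_fileserver(vnode);
			if (IS_ERR(server))
				return PTR_ERR(server);	/* no usable server */

			ret = afs_fs_fetch_file_status(server, key, vnode, NULL,
						       &afs_sync_call);
		} while (!afs_volume_release_fileserver(vnode, server, ret));

		return ret;
	}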
@@ -275,120 +405,351 @@ int afs_vnode_fetch_data(struct afs_vnode *vnode,
 	 * vnode */
 	do {
 		/* pick a server to query */
-		ret = afs_volume_pick_fileserver(vnode->volume, &server);
-		if (ret < 0)
-			return ret;
+		server = afs_volume_pick_fileserver(vnode);
+		if (IS_ERR(server))
+			goto no_server;
 
 		_debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr));
 
-		ret = afs_rxfs_fetch_file_data(server, vnode, desc, NULL);
+		ret = afs_fs_fetch_data(server, key, vnode, offset, length,
+					page, &afs_sync_call);
 
-	} while (!afs_volume_release_fileserver(vnode->volume, server, ret));
+	} while (!afs_volume_release_fileserver(vnode, server, ret));
 
 	/* adjust the flags */
-	afs_vnode_finalise_status_update(vnode, server, ret);
+	if (ret == 0) {
+		afs_vnode_finalise_status_update(vnode, server);
+		afs_put_server(server);
+	} else {
+		afs_vnode_status_update_failed(vnode, ret);
+	}
 
 	_leave(" = %d", ret);
 	return ret;
 
-} /* end afs_vnode_fetch_data() */
+no_server:
+	spin_lock(&vnode->lock);
+	vnode->update_cnt--;
+	ASSERTCMP(vnode->update_cnt, >=, 0);
+	spin_unlock(&vnode->lock);
+	return PTR_ERR(server);
+}
 
-/*****************************************************************************/
 /*
- * break any outstanding callback on a vnode
- * - only relevent to server that issued it
+ * make a file or a directory
  */
-int afs_vnode_give_up_callback(struct afs_vnode *vnode)
-{
-	struct afs_server *server;
-	int ret;
-
-	_enter("%s,{%u,%u,%u}",
-	       vnode->volume->vlocation->vldb.name,
-	       vnode->fid.vid,
-	       vnode->fid.vnode,
-	       vnode->fid.unique);
-
-	spin_lock(&afs_cb_hash_lock);
-	list_del_init(&vnode->cb_hash_link);
-	spin_unlock(&afs_cb_hash_lock);
-
-	/* set the changed flag in the vnode and release the server */
-	spin_lock(&vnode->lock);
-
-	afs_kafstimod_del_timer(&vnode->cb_timeout);
-
-	server = xchg(&vnode->cb_server, NULL);
-	if (server) {
-		vnode->flags |= AFS_VNODE_CHANGED;
-
-		spin_lock(&server->cb_lock);
-		list_del_init(&vnode->cb_link);
-		spin_unlock(&server->cb_lock);
-	}
-
-	spin_unlock(&vnode->lock);
-
-	ret = 0;
-	if (server) {
-		ret = afs_rxfs_give_up_callback(server, vnode);
-		afs_put_server(server);
-	}
-
-	_leave(" = %d", ret);
-	return ret;
-} /* end afs_vnode_give_up_callback() */
-
-/*****************************************************************************/
-/*
- * match a vnode record stored in the cache
- */
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_vnode_cache_match(void *target,
-						 const void *entry)
-{
-	const struct afs_cache_vnode *cvnode = entry;
-	struct afs_vnode *vnode = target;
-
-	_enter("{%x,%x,%Lx},{%x,%x,%Lx}",
-	       vnode->fid.vnode,
-	       vnode->fid.unique,
-	       vnode->status.version,
-	       cvnode->vnode_id,
-	       cvnode->vnode_unique,
-	       cvnode->data_version);
-
-	if (vnode->fid.vnode != cvnode->vnode_id) {
-		_leave(" = FAILED");
-		return CACHEFS_MATCH_FAILED;
-	}
-
-	if (vnode->fid.unique != cvnode->vnode_unique ||
-	    vnode->status.version != cvnode->data_version) {
-		_leave(" = DELETE");
-		return CACHEFS_MATCH_SUCCESS_DELETE;
-	}
-
-	_leave(" = SUCCESS");
-	return CACHEFS_MATCH_SUCCESS;
-} /* end afs_vnode_cache_match() */
-#endif
-
-/*****************************************************************************/
-/*
- * update a vnode record stored in the cache
- */
-#ifdef AFS_CACHING_SUPPORT
-static void afs_vnode_cache_update(void *source, void *entry)
-{
-	struct afs_cache_vnode *cvnode = entry;
-	struct afs_vnode *vnode = source;
-
-	_enter("");
-
-	cvnode->vnode_id = vnode->fid.vnode;
-	cvnode->vnode_unique = vnode->fid.unique;
-	cvnode->data_version = vnode->status.version;
-
-} /* end afs_vnode_cache_update() */
-#endif
+int afs_vnode_create(struct afs_vnode *vnode, struct key *key,
+		     const char *name, umode_t mode, struct afs_fid *newfid,
+		     struct afs_file_status *newstatus,
+		     struct afs_callback *newcb, struct afs_server **_server)
+{
+	struct afs_server *server;
+	int ret;
+
+	_enter("%s{%u,%u,%u},%x,%s,,",
+	       vnode->volume->vlocation->vldb.name,
+	       vnode->fid.vid,
+	       vnode->fid.vnode,
+	       vnode->fid.unique,
+	       key_serial(key),
+	       name);
+
+	/* this op will fetch the status on the directory we're creating in */
+	spin_lock(&vnode->lock);
+	vnode->update_cnt++;
+	spin_unlock(&vnode->lock);
+
+	do {
+		/* pick a server to query */
+		server = afs_volume_pick_fileserver(vnode);
+		if (IS_ERR(server))
+			goto no_server;
+
+		_debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr));
+
+		ret = afs_fs_create(server, key, vnode, name, mode, newfid,
+				    newstatus, newcb, &afs_sync_call);
+
+	} while (!afs_volume_release_fileserver(vnode, server, ret));
+
+	/* adjust the flags */
+	if (ret == 0) {
+		afs_vnode_finalise_status_update(vnode, server);
+		*_server = server;
+	} else {
+		afs_vnode_status_update_failed(vnode, ret);
+		*_server = NULL;
+	}
+
+	_leave(" = %d [cnt %d]", ret, vnode->update_cnt);
+	return ret;
+
+no_server:
+	spin_lock(&vnode->lock);
+	vnode->update_cnt--;
+	ASSERTCMP(vnode->update_cnt, >=, 0);
+	spin_unlock(&vnode->lock);
+	_leave(" = %ld [cnt %d]", PTR_ERR(server), vnode->update_cnt);
+	return PTR_ERR(server);
+}
+
+/*
+ * remove a file or directory
+ */
+int afs_vnode_remove(struct afs_vnode *vnode, struct key *key, const char *name,
+		     bool isdir)
+{
+	struct afs_server *server;
+	int ret;
+
+	_enter("%s{%u,%u,%u},%x,%s",
+	       vnode->volume->vlocation->vldb.name,
+	       vnode->fid.vid,
+	       vnode->fid.vnode,
+	       vnode->fid.unique,
+	       key_serial(key),
+	       name);
+
+	/* this op will fetch the status on the directory we're removing from */
+	spin_lock(&vnode->lock);
+	vnode->update_cnt++;
+	spin_unlock(&vnode->lock);
+
+	do {
+		/* pick a server to query */
+		server = afs_volume_pick_fileserver(vnode);
+		if (IS_ERR(server))
+			goto no_server;
+
+		_debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr));
+
+		ret = afs_fs_remove(server, key, vnode, name, isdir,
+				    &afs_sync_call);
+
+	} while (!afs_volume_release_fileserver(vnode, server, ret));
+
+	/* adjust the flags */
+	if (ret == 0) {
+		afs_vnode_finalise_status_update(vnode, server);
+		afs_put_server(server);
+	} else {
+		afs_vnode_status_update_failed(vnode, ret);
+	}
+
+	_leave(" = %d [cnt %d]", ret, vnode->update_cnt);
+	return ret;
+
+no_server:
+	spin_lock(&vnode->lock);
+	vnode->update_cnt--;
+	ASSERTCMP(vnode->update_cnt, >=, 0);
+	spin_unlock(&vnode->lock);
+	_leave(" = %ld [cnt %d]", PTR_ERR(server), vnode->update_cnt);
+	return PTR_ERR(server);
+}
+
+/*
+ * create a hard link
+ */
+extern int afs_vnode_link(struct afs_vnode *dvnode, struct afs_vnode *vnode,
+			  struct key *key, const char *name)
+{
+	struct afs_server *server;
+	int ret;
+
+	_enter("%s{%u,%u,%u},%s{%u,%u,%u},%x,%s",
+	       dvnode->volume->vlocation->vldb.name,
+	       dvnode->fid.vid,
+	       dvnode->fid.vnode,
+	       dvnode->fid.unique,
+	       vnode->volume->vlocation->vldb.name,
+	       vnode->fid.vid,
+	       vnode->fid.vnode,
+	       vnode->fid.unique,
+	       key_serial(key),
+	       name);
+
+	/* this op will fetch the status on the directory we're removing from */
+	spin_lock(&vnode->lock);
+	vnode->update_cnt++;
+	spin_unlock(&vnode->lock);
+	spin_lock(&dvnode->lock);
+	dvnode->update_cnt++;
+	spin_unlock(&dvnode->lock);
+
+	do {
+		/* pick a server to query */
+		server = afs_volume_pick_fileserver(dvnode);
+		if (IS_ERR(server))
+			goto no_server;
+
+		_debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr));
+
+		ret = afs_fs_link(server, key, dvnode, vnode, name,
+				  &afs_sync_call);
+
+	} while (!afs_volume_release_fileserver(dvnode, server, ret));
+
+	/* adjust the flags */
+	if (ret == 0) {
+		afs_vnode_finalise_status_update(vnode, server);
+		afs_vnode_finalise_status_update(dvnode, server);
+		afs_put_server(server);
+	} else {
+		afs_vnode_status_update_failed(vnode, ret);
+		afs_vnode_status_update_failed(dvnode, ret);
+	}
+
+	_leave(" = %d [cnt %d]", ret, vnode->update_cnt);
+	return ret;
+
+no_server:
+	spin_lock(&vnode->lock);
+	vnode->update_cnt--;
+	ASSERTCMP(vnode->update_cnt, >=, 0);
+	spin_unlock(&vnode->lock);
+	spin_lock(&dvnode->lock);
+	dvnode->update_cnt--;
+	ASSERTCMP(dvnode->update_cnt, >=, 0);
+	spin_unlock(&dvnode->lock);
+	_leave(" = %ld [cnt %d]", PTR_ERR(server), vnode->update_cnt);
+	return PTR_ERR(server);
+}
+
+/*
+ * create a symbolic link
+ */
+int afs_vnode_symlink(struct afs_vnode *vnode, struct key *key,
+		      const char *name, const char *content,
+		      struct afs_fid *newfid,
+		      struct afs_file_status *newstatus,
+		      struct afs_server **_server)
+{
+	struct afs_server *server;
+	int ret;
+
+	_enter("%s{%u,%u,%u},%x,%s,%s,,,",
+	       vnode->volume->vlocation->vldb.name,
+	       vnode->fid.vid,
+	       vnode->fid.vnode,
+	       vnode->fid.unique,
+	       key_serial(key),
+	       name, content);
+
+	/* this op will fetch the status on the directory we're creating in */
+	spin_lock(&vnode->lock);
+	vnode->update_cnt++;
+	spin_unlock(&vnode->lock);
+
+	do {
+		/* pick a server to query */
+		server = afs_volume_pick_fileserver(vnode);
+		if (IS_ERR(server))
+			goto no_server;
+
+		_debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr));
+
+		ret = afs_fs_symlink(server, key, vnode, name, content,
+				     newfid, newstatus, &afs_sync_call);
+
+	} while (!afs_volume_release_fileserver(vnode, server, ret));
+
+	/* adjust the flags */
+	if (ret == 0) {
+		afs_vnode_finalise_status_update(vnode, server);
+		*_server = server;
+	} else {
+		afs_vnode_status_update_failed(vnode, ret);
+		*_server = NULL;
+	}
+
+	_leave(" = %d [cnt %d]", ret, vnode->update_cnt);
+	return ret;
+
+no_server:
+	spin_lock(&vnode->lock);
+	vnode->update_cnt--;
+	ASSERTCMP(vnode->update_cnt, >=, 0);
+	spin_unlock(&vnode->lock);
+	_leave(" = %ld [cnt %d]", PTR_ERR(server), vnode->update_cnt);
+	return PTR_ERR(server);
+}
+
+/*
+ * rename a file
+ */
+int afs_vnode_rename(struct afs_vnode *orig_dvnode,
+		     struct afs_vnode *new_dvnode,
+		     struct key *key,
+		     const char *orig_name,
+		     const char *new_name)
+{
+	struct afs_server *server;
+	int ret;
+
+	_enter("%s{%u,%u,%u},%s{%u,%u,%u},%x,%s,%s",
+	       orig_dvnode->volume->vlocation->vldb.name,
+	       orig_dvnode->fid.vid,
+	       orig_dvnode->fid.vnode,
+	       orig_dvnode->fid.unique,
+	       new_dvnode->volume->vlocation->vldb.name,
+	       new_dvnode->fid.vid,
+	       new_dvnode->fid.vnode,
+	       new_dvnode->fid.unique,
+	       key_serial(key),
+	       orig_name,
+	       new_name);
+
+	/* this op will fetch the status on both the directories we're dealing
+	 * with */
+	spin_lock(&orig_dvnode->lock);
+	orig_dvnode->update_cnt++;
+	spin_unlock(&orig_dvnode->lock);
+	if (new_dvnode != orig_dvnode) {
+		spin_lock(&new_dvnode->lock);
+		new_dvnode->update_cnt++;
+		spin_unlock(&new_dvnode->lock);
+	}
+
+	do {
+		/* pick a server to query */
+		server = afs_volume_pick_fileserver(orig_dvnode);
+		if (IS_ERR(server))
+			goto no_server;
+
+		_debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr));
+
+		ret = afs_fs_rename(server, key, orig_dvnode, orig_name,
+				    new_dvnode, new_name, &afs_sync_call);
+
+	} while (!afs_volume_release_fileserver(orig_dvnode, server, ret));
+
+	/* adjust the flags */
+	if (ret == 0) {
+		afs_vnode_finalise_status_update(orig_dvnode, server);
+		if (new_dvnode != orig_dvnode)
+			afs_vnode_finalise_status_update(new_dvnode, server);
+		afs_put_server(server);
+	} else {
+		afs_vnode_status_update_failed(orig_dvnode, ret);
+		if (new_dvnode != orig_dvnode)
+			afs_vnode_status_update_failed(new_dvnode, ret);
+	}
+
+	_leave(" = %d [cnt %d]", ret, orig_dvnode->update_cnt);
+	return ret;
+
+no_server:
+	spin_lock(&orig_dvnode->lock);
+	orig_dvnode->update_cnt--;
+	ASSERTCMP(orig_dvnode->update_cnt, >=, 0);
+	spin_unlock(&orig_dvnode->lock);
+	if (new_dvnode != orig_dvnode) {
+		spin_lock(&new_dvnode->lock);
+		new_dvnode->update_cnt--;
+		ASSERTCMP(new_dvnode->update_cnt, >=, 0);
+		spin_unlock(&new_dvnode->lock);
+	}
+	_leave(" = %ld [cnt %d]", PTR_ERR(server), orig_dvnode->update_cnt);
+	return PTR_ERR(server);
+}
diff --git a/fs/afs/vnode.h b/fs/afs/vnode.h
deleted file mode 100644
index b86a97102e8b..000000000000
--- a/fs/afs/vnode.h
+++ /dev/null
@@ -1,94 +0,0 @@
-/* vnode.h: AFS vnode record
- *
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#ifndef _LINUX_AFS_VNODE_H
-#define _LINUX_AFS_VNODE_H
-
-#include <linux/fs.h>
-#include "server.h"
-#include "kafstimod.h"
-#include "cache.h"
-
-#ifdef __KERNEL__
-
-struct afs_rxfs_fetch_descriptor;
-
-/*****************************************************************************/
-/*
- * vnode catalogue entry
- */
-struct afs_cache_vnode
-{
-	afs_vnodeid_t vnode_id;		/* vnode ID */
-	unsigned vnode_unique;		/* vnode ID uniquifier */
-	afs_dataversion_t data_version;	/* data version */
-};
-
-#ifdef AFS_CACHING_SUPPORT
-extern struct cachefs_index_def afs_vnode_cache_index_def;
-#endif
-
-/*****************************************************************************/
-/*
- * AFS inode private data
- */
-struct afs_vnode
-{
-	struct inode		vfs_inode;	/* the VFS's inode record */
-
-	struct afs_volume	*volume;	/* volume on which vnode resides */
-	struct afs_fid		fid;		/* the file identifier for this inode */
-	struct afs_file_status	status;		/* AFS status info for this file */
-#ifdef AFS_CACHING_SUPPORT
-	struct cachefs_cookie	*cache;		/* caching cookie */
-#endif
-
-	wait_queue_head_t	update_waitq;	/* status fetch waitqueue */
-	unsigned		update_cnt;	/* number of outstanding ops that will update the
-						 * status */
-	spinlock_t		lock;		/* waitqueue/flags lock */
-	unsigned		flags;
-#define AFS_VNODE_CHANGED	0x00000001	/* set if vnode reported changed by callback */
-#define AFS_VNODE_DELETED	0x00000002	/* set if vnode deleted on server */
-#define AFS_VNODE_MOUNTPOINT	0x00000004	/* set if vnode is a mountpoint symlink */
-
-	/* outstanding callback notification on this file */
-	struct afs_server	*cb_server;	/* server that made the current promise */
-	struct list_head	cb_link;	/* link in server's promises list */
-	struct list_head	cb_hash_link;	/* link in master callback hash */
-	struct afs_timer	cb_timeout;	/* timeout on promise */
-	unsigned		cb_version;	/* callback version */
-	unsigned		cb_expiry;	/* callback expiry time */
-	afs_callback_type_t	cb_type;	/* type of callback */
-};
-
-static inline struct afs_vnode *AFS_FS_I(struct inode *inode)
-{
-	return container_of(inode,struct afs_vnode,vfs_inode);
-}
-
-static inline struct inode *AFS_VNODE_TO_I(struct afs_vnode *vnode)
-{
-	return &vnode->vfs_inode;
-}
-
-extern int afs_vnode_fetch_status(struct afs_vnode *vnode);
-
-extern int afs_vnode_fetch_data(struct afs_vnode *vnode,
-				struct afs_rxfs_fetch_descriptor *desc);
-
-extern int afs_vnode_give_up_callback(struct afs_vnode *vnode);
-
-extern struct afs_timer_ops afs_vnode_cb_timed_out_ops;
-
-#endif /* __KERNEL__ */
-
-#endif /* _LINUX_AFS_VNODE_H */
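The AFS_FS_I() helper in the deleted header is the usual container_of() accessor tying a filesystem-private structure to the VFS inode embedded inside it. The pattern in isolation (self-contained C; the struct names here are illustrative stand-ins):

	#include <stddef.h>

	#define container_of(ptr, type, member) \
		((type *)((char *)(ptr) - offsetof(type, member)))

	struct inode { int dummy; };

	struct private_inode {
		struct inode vfs_inode;		/* embedded VFS inode */
		unsigned long private_state;	/* filesystem-private data */
	};

	/* Map an embedded inode pointer back to its containing structure,
	 * exactly as AFS_FS_I() does for struct afs_vnode. */
	static inline struct private_inode *to_private(struct inode *inode)
	{
		return container_of(inode, struct private_inode, vfs_inode);
	}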
diff --git a/fs/afs/volume.c b/fs/afs/volume.c
index 768c6dbd323a..dd160cada45d 100644
--- a/fs/afs/volume.c
+++ b/fs/afs/volume.c
@@ -1,6 +1,6 @@
-/* volume.c: AFS volume management
+/* AFS volume management
  *
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
  *
  * This program is free software; you can redistribute it and/or
@@ -15,35 +15,10 @@
 #include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
-#include "volume.h"
-#include "vnode.h"
-#include "cell.h"
-#include "cache.h"
-#include "cmservice.h"
-#include "fsclient.h"
-#include "vlclient.h"
 #include "internal.h"
 
-#ifdef __KDEBUG
 static const char *afs_voltypes[] = { "R/W", "R/O", "BAK" };
-#endif
-
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_volume_cache_match(void *target,
-						  const void *entry);
-static void afs_volume_cache_update(void *source, void *entry);
-
-struct cachefs_index_def afs_volume_cache_index_def = {
-	.name = "volume",
-	.data_size = sizeof(struct afs_cache_vhash),
-	.keys[0] = { CACHEFS_INDEX_KEYS_BIN, 1 },
-	.keys[1] = { CACHEFS_INDEX_KEYS_BIN, 1 },
-	.match = afs_volume_cache_match,
-	.update = afs_volume_cache_update,
-};
-#endif
 
-/*****************************************************************************/
 /*
  * lookup a volume by name
  * - this can be one of the following:
@@ -66,118 +41,52 @@ struct cachefs_index_def afs_volume_cache_index_def = {
  * - Rule 3: If parent volume is R/W, then only mount R/W volume unless
  *	    explicitly told otherwise
  */
-int afs_volume_lookup(const char *name, struct afs_cell *cell, int rwpath,
-		      struct afs_volume **_volume)
+struct afs_volume *afs_volume_lookup(struct afs_mount_params *params)
 {
 	struct afs_vlocation *vlocation = NULL;
 	struct afs_volume *volume = NULL;
-	afs_voltype_t type;
-	const char *cellname, *volname, *suffix;
+	struct afs_server *server = NULL;
 	char srvtmask;
-	int force, ret, loop, cellnamesz, volnamesz;
-
-	_enter("%s,,%d,", name, rwpath);
-
-	if (!name || (name[0] != '%' && name[0] != '#') || !name[1]) {
-		printk("kAFS: unparsable volume name\n");
-		return -EINVAL;
-	}
-
-	/* determine the type of volume we're looking for */
-	force = 0;
-	type = AFSVL_ROVOL;
-
-	if (rwpath || name[0] == '%') {
-		type = AFSVL_RWVOL;
-		force = 1;
-	}
-
-	suffix = strrchr(name, '.');
-	if (suffix) {
-		if (strcmp(suffix, ".readonly") == 0) {
-			type = AFSVL_ROVOL;
-			force = 1;
-		}
-		else if (strcmp(suffix, ".backup") == 0) {
-			type = AFSVL_BACKVOL;
-			force = 1;
-		}
-		else if (suffix[1] == 0) {
-		}
-		else {
-			suffix = NULL;
-		}
-	}
-
-	/* split the cell and volume names */
-	name++;
-	volname = strchr(name, ':');
-	if (volname) {
-		cellname = name;
-		cellnamesz = volname - name;
-		volname++;
-	}
-	else {
-		volname = name;
-		cellname = NULL;
-		cellnamesz = 0;
-	}
-
-	volnamesz = suffix ? suffix - volname : strlen(volname);
-
-	_debug("CELL:%*.*s [%p] VOLUME:%*.*s SUFFIX:%s TYPE:%d%s",
-	       cellnamesz, cellnamesz, cellname ?: "", cell,
-	       volnamesz, volnamesz, volname, suffix ?: "-",
-	       type,
-	       force ? " FORCE" : "");
-
-	/* lookup the cell record */
-	if (cellname || !cell) {
-		ret = afs_cell_lookup(cellname, cellnamesz, &cell);
-		if (ret<0) {
-			printk("kAFS: unable to lookup cell '%s'\n",
-			       cellname ?: "");
-			goto error;
-		}
-	}
-	else {
-		afs_get_cell(cell);
-	}
+	int ret, loop;
+
+	_enter("{%*.*s,%d}",
+	       params->volnamesz, params->volnamesz, params->volname, params->rwpath);
 
 	/* lookup the volume location record */
-	ret = afs_vlocation_lookup(cell, volname, volnamesz, &vlocation);
-	if (ret < 0)
+	vlocation = afs_vlocation_lookup(params->cell, params->key,
+					 params->volname, params->volnamesz);
+	if (IS_ERR(vlocation)) {
+		ret = PTR_ERR(vlocation);
+		vlocation = NULL;
 		goto error;
+	}
 
 	/* make the final decision on the type we want */
 	ret = -ENOMEDIUM;
-	if (force && !(vlocation->vldb.vidmask & (1 << type)))
+	if (params->force && !(vlocation->vldb.vidmask & (1 << params->type)))
 		goto error;
 
 	srvtmask = 0;
 	for (loop = 0; loop < vlocation->vldb.nservers; loop++)
 		srvtmask |= vlocation->vldb.srvtmask[loop];
 
-	if (force) {
-		if (!(srvtmask & (1 << type)))
+	if (params->force) {
+		if (!(srvtmask & (1 << params->type)))
 			goto error;
-	}
-	else if (srvtmask & AFS_VOL_VTM_RO) {
-		type = AFSVL_ROVOL;
-	}
-	else if (srvtmask & AFS_VOL_VTM_RW) {
-		type = AFSVL_RWVOL;
-	}
-	else {
+	} else if (srvtmask & AFS_VOL_VTM_RO) {
+		params->type = AFSVL_ROVOL;
+	} else if (srvtmask & AFS_VOL_VTM_RW) {
+		params->type = AFSVL_RWVOL;
+	} else {
 		goto error;
 	}
 
-	down_write(&cell->vl_sem);
+	down_write(&params->cell->vl_sem);
 
 	/* is the volume already active? */
-	if (vlocation->vols[type]) {
+	if (vlocation->vols[params->type]) {
 		/* yes - re-use it */
-		volume = vlocation->vols[type];
+		volume = vlocation->vols[params->type];
 		afs_get_volume(volume);
 		goto success;
 	}
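The type selection above works on a bitmask with one bit per volume flavour (R/W, R/O, backup) OR'd together across all servers holding the volume. The decision in isolation (plain C; constants mirror the AFS_VOL_VTM_* masks but are illustrative here):

	enum voltype { VOL_RW = 0, VOL_RO = 1, VOL_BAK = 2 };

	#define VTM_RW	0x01	/* an R/W instance of the volume exists */
	#define VTM_RO	0x02	/* an R/O instance of the volume exists */

	/* Prefer the read-only flavour when nothing was forced, as the
	 * lookup above does; fall back to R/W; otherwise fail. */
	static int choose_voltype(unsigned char srvtmask, int force, int type)
	{
		if (force)
			return (srvtmask & (1 << type)) ? type : -1;
		if (srvtmask & VTM_RO)
			return VOL_RO;
		if (srvtmask & VTM_RW)
			return VOL_RW;
		return -1;	/* -ENOMEDIUM in the kernel code */
	}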
@@ -191,23 +100,24 @@ int afs_volume_lookup(const char *name, struct afs_cell *cell, int rwpath,
 		goto error_up;
 
 	atomic_set(&volume->usage, 1);
-	volume->type = type;
-	volume->type_force = force;
-	volume->cell = cell;
-	volume->vid = vlocation->vldb.vid[type];
+	volume->type = params->type;
+	volume->type_force = params->force;
+	volume->cell = params->cell;
+	volume->vid = vlocation->vldb.vid[params->type];
 
 	init_rwsem(&volume->server_sem);
 
 	/* look up all the applicable server records */
 	for (loop = 0; loop < 8; loop++) {
 		if (vlocation->vldb.srvtmask[loop] & (1 << volume->type)) {
-			ret = afs_server_lookup(
-				volume->cell,
-				&vlocation->vldb.servers[loop],
-				&volume->servers[volume->nservers]);
-			if (ret < 0)
+			server = afs_lookup_server(
+				volume->cell, &vlocation->vldb.servers[loop]);
+			if (IS_ERR(server)) {
+				ret = PTR_ERR(server);
 				goto error_discard;
+			}
 
+			volume->servers[volume->nservers] = server;
 			volume->nservers++;
 		}
 	}
@@ -223,35 +133,34 @@ int afs_volume_lookup(const char *name, struct afs_cell *cell, int rwpath,
 	afs_get_vlocation(vlocation);
 	volume->vlocation = vlocation;
 
-	vlocation->vols[type] = volume;
+	vlocation->vols[volume->type] = volume;
 
- success:
+success:
 	_debug("kAFS selected %s volume %08x",
 	       afs_voltypes[volume->type], volume->vid);
-	*_volume = volume;
-	ret = 0;
+	up_write(&params->cell->vl_sem);
+	afs_put_vlocation(vlocation);
+	_leave(" = %p", volume);
+	return volume;
 
 	/* clean up */
- error_up:
-	up_write(&cell->vl_sem);
- error:
+error_up:
+	up_write(&params->cell->vl_sem);
+error:
 	afs_put_vlocation(vlocation);
-	afs_put_cell(cell);
-
-	_leave(" = %d (%p)", ret, volume);
-	return ret;
+	_leave(" = %d", ret);
+	return ERR_PTR(ret);
 
- error_discard:
-	up_write(&cell->vl_sem);
+error_discard:
+	up_write(&params->cell->vl_sem);
 
 	for (loop = volume->nservers - 1; loop >= 0; loop--)
 		afs_put_server(volume->servers[loop]);
 
 	kfree(volume);
 	goto error;
-} /* end afs_volume_lookup() */
+}
@@ -265,10 +174,9 @@ void afs_put_volume(struct afs_volume *volume)
265 174
266 _enter("%p", volume); 175 _enter("%p", volume);
267 176
268 vlocation = volume->vlocation; 177 ASSERTCMP(atomic_read(&volume->usage), >, 0);
269 178
270 /* sanity check */ 179 vlocation = volume->vlocation;
271 BUG_ON(atomic_read(&volume->usage) <= 0);
272 180
273 /* to prevent a race, the decrement and the dequeue must be effectively 181 /* to prevent a race, the decrement and the dequeue must be effectively
274 * atomic */ 182 * atomic */
@@ -296,21 +204,27 @@ void afs_put_volume(struct afs_volume *volume)
296 kfree(volume); 204 kfree(volume);
297 205
298 _leave(" [destroyed]"); 206 _leave(" [destroyed]");
299} /* end afs_put_volume() */ 207}
300 208
301/*****************************************************************************/
302/* 209/*
303 * pick a server to use to try accessing this volume 210 * pick a server to use to try accessing this volume
304 * - returns with an elevated usage count on the server chosen 211 * - returns with an elevated usage count on the server chosen
305 */ 212 */
306int afs_volume_pick_fileserver(struct afs_volume *volume, 213struct afs_server *afs_volume_pick_fileserver(struct afs_vnode *vnode)
307 struct afs_server **_server)
308{ 214{
215 struct afs_volume *volume = vnode->volume;
309 struct afs_server *server; 216 struct afs_server *server;
310 int ret, state, loop; 217 int ret, state, loop;
311 218
312 _enter("%s", volume->vlocation->vldb.name); 219 _enter("%s", volume->vlocation->vldb.name);
313 220
221 /* stick with the server we're already using if we can */
222 if (vnode->server && vnode->server->fs_state == 0) {
223 afs_get_server(vnode->server);
224 _leave(" = %p [current]", vnode->server);
225 return vnode->server;
226 }
227
314 down_read(&volume->server_sem); 228 down_read(&volume->server_sem);
315 229
316 /* handle the no-server case */ 230 /* handle the no-server case */
@@ -318,7 +232,7 @@ int afs_volume_pick_fileserver(struct afs_volume *volume,
318 ret = volume->rjservers ? -ENOMEDIUM : -ESTALE; 232 ret = volume->rjservers ? -ENOMEDIUM : -ESTALE;
319 up_read(&volume->server_sem); 233 up_read(&volume->server_sem);
320 _leave(" = %d [no servers]", ret); 234 _leave(" = %d [no servers]", ret);
321 return ret; 235 return ERR_PTR(ret);
322 } 236 }
323 237
324 /* basically, just search the list for the first live server and use 238 /* basically, just search the list for the first live server and use
@@ -328,15 +242,16 @@ int afs_volume_pick_fileserver(struct afs_volume *volume,
328 server = volume->servers[loop]; 242 server = volume->servers[loop];
329 state = server->fs_state; 243 state = server->fs_state;
330 244
245 _debug("consider %d [%d]", loop, state);
246
331 switch (state) { 247 switch (state) {
332 /* found an apparently healthy server */ 248 /* found an apparently healthy server */
333 case 0: 249 case 0:
334 afs_get_server(server); 250 afs_get_server(server);
335 up_read(&volume->server_sem); 251 up_read(&volume->server_sem);
336 *_server = server; 252 _leave(" = %p (picked %08x)",
337 _leave(" = 0 (picked %08x)", 253 server, ntohl(server->addr.s_addr));
338 ntohl(server->addr.s_addr)); 254 return server;
339 return 0;
340 255
341 case -ENETUNREACH: 256 case -ENETUNREACH:
342 if (ret == 0) 257 if (ret == 0)
@@ -372,20 +287,21 @@ int afs_volume_pick_fileserver(struct afs_volume *volume,
372 */ 287 */
373 up_read(&volume->server_sem); 288 up_read(&volume->server_sem);
374 _leave(" = %d", ret); 289 _leave(" = %d", ret);
375 return ret; 290 return ERR_PTR(ret);
376} /* end afs_volume_pick_fileserver() */ 291}
377 292
378/*****************************************************************************/
379/* 293/*
380 * release a server after use 294 * release a server after use
381 * - releases the ref on the server struct that was acquired by picking 295 * - releases the ref on the server struct that was acquired by picking
382 * - records result of using a particular server to access a volume 296 * - records result of using a particular server to access a volume
383 * - return 0 to try again, 1 if okay or to issue error 297 * - return 0 to try again, 1 if okay or to issue error
298 * - the caller must release the server struct if result was 0
384 */ 299 */
385int afs_volume_release_fileserver(struct afs_volume *volume, 300int afs_volume_release_fileserver(struct afs_vnode *vnode,
386 struct afs_server *server, 301 struct afs_server *server,
387 int result) 302 int result)
388{ 303{
304 struct afs_volume *volume = vnode->volume;
389 unsigned loop; 305 unsigned loop;
390 306
391 _enter("%s,%08x,%d", 307 _enter("%s,%08x,%d",
@@ -396,14 +312,16 @@ int afs_volume_release_fileserver(struct afs_volume *volume,
396 /* success */ 312 /* success */
397 case 0: 313 case 0:
398 server->fs_act_jif = jiffies; 314 server->fs_act_jif = jiffies;
399 break; 315 server->fs_state = 0;
316 _leave("");
317 return 1;
400 318
401 /* the fileserver denied all knowledge of the volume */ 319 /* the fileserver denied all knowledge of the volume */
402 case -ENOMEDIUM: 320 case -ENOMEDIUM:
403 server->fs_act_jif = jiffies; 321 server->fs_act_jif = jiffies;
404 down_write(&volume->server_sem); 322 down_write(&volume->server_sem);
405 323
406 /* first, find where the server is in the active list (if it 324 /* firstly, find where the server is in the active list (if it
407 * is) */ 325 * is) */
408 for (loop = 0; loop < volume->nservers; loop++) 326 for (loop = 0; loop < volume->nservers; loop++)
409 if (volume->servers[loop] == server) 327 if (volume->servers[loop] == server)
@@ -441,6 +359,7 @@ int afs_volume_release_fileserver(struct afs_volume *volume,
441 case -ENETUNREACH: 359 case -ENETUNREACH:
442 case -EHOSTUNREACH: 360 case -EHOSTUNREACH:
443 case -ECONNREFUSED: 361 case -ECONNREFUSED:
362 case -ETIME:
444 case -ETIMEDOUT: 363 case -ETIMEDOUT:
445 case -EREMOTEIO: 364 case -EREMOTEIO:
446 /* mark the server as dead 365 /* mark the server as dead
@@ -460,60 +379,17 @@ int afs_volume_release_fileserver(struct afs_volume *volume,
460 server->fs_act_jif = jiffies; 379 server->fs_act_jif = jiffies;
461 case -ENOMEM: 380 case -ENOMEM:
462 case -ENONET: 381 case -ENONET:
463 break; 382 /* tell the caller to accept the result */
383 afs_put_server(server);
384 _leave(" [local failure]");
385 return 1;
464 } 386 }
465 387
466 /* tell the caller to accept the result */
467 afs_put_server(server);
468 _leave("");
469 return 1;
470
471 /* tell the caller to loop around and try the next server */ 388 /* tell the caller to loop around and try the next server */
472 try_next_server_upw: 389try_next_server_upw:
473 up_write(&volume->server_sem); 390 up_write(&volume->server_sem);
474 try_next_server: 391try_next_server:
475 afs_put_server(server); 392 afs_put_server(server);
476 _leave(" [try next server]"); 393 _leave(" [try next server]");
477 return 0; 394 return 0;
478 395}
479} /* end afs_volume_release_fileserver() */
480
481/*****************************************************************************/
482/*
483 * match a volume hash record stored in the cache
484 */
485#ifdef AFS_CACHING_SUPPORT
486static cachefs_match_val_t afs_volume_cache_match(void *target,
487 const void *entry)
488{
489 const struct afs_cache_vhash *vhash = entry;
490 struct afs_volume *volume = target;
491
492 _enter("{%u},{%u}", volume->type, vhash->vtype);
493
494 if (volume->type == vhash->vtype) {
495 _leave(" = SUCCESS");
496 return CACHEFS_MATCH_SUCCESS;
497 }
498
499 _leave(" = FAILED");
500 return CACHEFS_MATCH_FAILED;
501} /* end afs_volume_cache_match() */
502#endif
503
504/*****************************************************************************/
505/*
506 * update a volume hash record stored in the cache
507 */
508#ifdef AFS_CACHING_SUPPORT
509static void afs_volume_cache_update(void *source, void *entry)
510{
511 struct afs_cache_vhash *vhash = entry;
512 struct afs_volume *volume = source;
513
514 _enter("");
515
516 vhash->vtype = volume->type;
517
518} /* end afs_volume_cache_update() */
519#endif
diff --git a/fs/afs/volume.h b/fs/afs/volume.h
deleted file mode 100644
index bfdcf19ba3f3..000000000000
--- a/fs/afs/volume.h
+++ /dev/null
@@ -1,140 +0,0 @@
1/* volume.h: AFS volume management
2 *
3 * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#ifndef _LINUX_AFS_VOLUME_H
13#define _LINUX_AFS_VOLUME_H
14
15#include "types.h"
16#include "fsclient.h"
17#include "kafstimod.h"
18#include "kafsasyncd.h"
19#include "cache.h"
20
21typedef enum {
22 AFS_VLUPD_SLEEP, /* sleeping waiting for update timer to fire */
23 AFS_VLUPD_PENDING, /* on pending queue */
24 AFS_VLUPD_INPROGRESS, /* op in progress */
25 AFS_VLUPD_BUSYSLEEP, /* sleeping because server returned EBUSY */
26
27} __attribute__((packed)) afs_vlocation_upd_t;
28
29/*****************************************************************************/
30/*
31 * entry in the cached volume location catalogue
32 */
33struct afs_cache_vlocation
34{
35 uint8_t name[64]; /* volume name (lowercase, padded with NULs) */
36 uint8_t nservers; /* number of entries used in servers[] */
37 uint8_t vidmask; /* voltype mask for vid[] */
38 uint8_t srvtmask[8]; /* voltype masks for servers[] */
39#define AFS_VOL_VTM_RW 0x01 /* R/W version of the volume is available (on this server) */
40#define AFS_VOL_VTM_RO 0x02 /* R/O version of the volume is available (on this server) */
41#define AFS_VOL_VTM_BAK 0x04 /* backup version of the volume is available (on this server) */
42
43 afs_volid_t vid[3]; /* volume IDs for R/W, R/O and Bak volumes */
44 struct in_addr servers[8]; /* fileserver addresses */
45 time_t rtime; /* last retrieval time */
46};
47
48#ifdef AFS_CACHING_SUPPORT
49extern struct cachefs_index_def afs_vlocation_cache_index_def;
50#endif
51
52/*****************************************************************************/
53/*
54 * volume -> vnode hash table entry
55 */
56struct afs_cache_vhash
57{
58 afs_voltype_t vtype; /* which volume variation */
59 uint8_t hash_bucket; /* which hash bucket this represents */
60} __attribute__((packed));
61
62#ifdef AFS_CACHING_SUPPORT
63extern struct cachefs_index_def afs_volume_cache_index_def;
64#endif
65
66/*****************************************************************************/
67/*
68 * AFS volume location record
69 */
70struct afs_vlocation
71{
72 atomic_t usage;
73 struct list_head link; /* link in cell volume location list */
74 struct afs_timer timeout; /* decaching timer */
75 struct afs_cell *cell; /* cell to which volume belongs */
76#ifdef AFS_CACHING_SUPPORT
77 struct cachefs_cookie *cache; /* caching cookie */
78#endif
79 struct afs_cache_vlocation vldb; /* volume information DB record */
80 struct afs_volume *vols[3]; /* volume access record pointer (index by type) */
81 rwlock_t lock; /* access lock */
82 unsigned long read_jif; /* time at which last read from vlserver */
83 struct afs_timer upd_timer; /* update timer */
84 struct afs_async_op upd_op; /* update operation */
85 afs_vlocation_upd_t upd_state; /* update state */
86 unsigned short upd_first_svix; /* first server index during update */
87 unsigned short upd_curr_svix; /* current server index during update */
88 unsigned short upd_rej_cnt; /* ENOMEDIUM count during update */
89 unsigned short upd_busy_cnt; /* EBUSY count during update */
90 unsigned short valid; /* T if valid */
91};
92
93extern int afs_vlocation_lookup(struct afs_cell *cell,
94 const char *name,
95 unsigned namesz,
96 struct afs_vlocation **_vlocation);
97
98#define afs_get_vlocation(V) do { atomic_inc(&(V)->usage); } while(0)
99
100extern void afs_put_vlocation(struct afs_vlocation *vlocation);
101extern void afs_vlocation_do_timeout(struct afs_vlocation *vlocation);
102
103/*****************************************************************************/
104/*
105 * AFS volume access record
106 */
107struct afs_volume
108{
109 atomic_t usage;
110 struct afs_cell *cell; /* cell to which belongs (unrefd ptr) */
111 struct afs_vlocation *vlocation; /* volume location */
112#ifdef AFS_CACHING_SUPPORT
113 struct cachefs_cookie *cache; /* caching cookie */
114#endif
115 afs_volid_t vid; /* volume ID */
116 afs_voltype_t type; /* type of volume */
117 char type_force; /* force volume type (suppress R/O -> R/W) */
118 unsigned short nservers; /* number of server slots filled */
119 unsigned short rjservers; /* number of servers discarded due to -ENOMEDIUM */
120 struct afs_server *servers[8]; /* servers on which volume resides (ordered) */
121 struct rw_semaphore server_sem; /* lock for accessing current server */
122};
123
124extern int afs_volume_lookup(const char *name,
125 struct afs_cell *cell,
126 int rwpath,
127 struct afs_volume **_volume);
128
129#define afs_get_volume(V) do { atomic_inc(&(V)->usage); } while(0)
130
131extern void afs_put_volume(struct afs_volume *volume);
132
133extern int afs_volume_pick_fileserver(struct afs_volume *volume,
134 struct afs_server **_server);
135
136extern int afs_volume_release_fileserver(struct afs_volume *volume,
137 struct afs_server *server,
138 int result);
139
140#endif /* _LINUX_AFS_VOLUME_H */
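The header deleted above declared the old out-parameter interfaces (afs_volume_lookup() and afs_volume_pick_fileserver() returned an int and filled in a struct pointer); the rewritten volume.c returns the object directly and encodes failures with ERR_PTR(), which is what the new IS_ERR()/PTR_ERR() checks earlier in this diff are testing. A minimal sketch of the idiom, using a hypothetical object type rather than the real afs structures:

#include <linux/err.h>
#include <linux/slab.h>

struct example_obj {
	int key;
};

/* On success return the object; on failure encode -errno in the pointer. */
static struct example_obj *example_lookup(int key)
{
	struct example_obj *obj = kzalloc(sizeof(*obj), GFP_KERNEL);

	if (!obj)
		return ERR_PTR(-ENOMEM);
	obj->key = key;
	return obj;
}

/* Callers test with IS_ERR() and recover the errno with PTR_ERR(),
 * with no separate int return or output parameter to keep in sync. */
static int example_caller(void)
{
	struct example_obj *obj = example_lookup(42);

	if (IS_ERR(obj))
		return PTR_ERR(obj);
	kfree(obj);
	return 0;
}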
diff --git a/fs/aio.c b/fs/aio.c
index e4598d6d49dd..b97ab8028b6d 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -68,10 +68,8 @@ static void aio_queue_work(struct kioctx *);
68 */ 68 */
69static int __init aio_setup(void) 69static int __init aio_setup(void)
70{ 70{
71 kiocb_cachep = kmem_cache_create("kiocb", sizeof(struct kiocb), 71 kiocb_cachep = KMEM_CACHE(kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
72 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); 72 kioctx_cachep = KMEM_CACHE(kioctx, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
73 kioctx_cachep = kmem_cache_create("kioctx", sizeof(struct kioctx),
74 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
75 73
76 aio_wq = create_workqueue("aio"); 74 aio_wq = create_workqueue("aio");
77 75
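The aio.c hunk above folds two open-coded kmem_cache_create() calls into the KMEM_CACHE() helper. A sketch of roughly what the helper expands to in this kernel series (an assumption based on <linux/slab.h> of the era; kmem_cache_create() still took constructor and destructor arguments, hence the two NULLs visible in the removed lines):

/* Derive the cache name, object size and alignment from the type itself,
 * so the arguments cannot drift out of sync with the struct definition. */
#define KMEM_CACHE(__struct, __flags)					\
	kmem_cache_create(#__struct, sizeof(struct __struct),		\
			  __alignof__(struct __struct), (__flags),	\
			  NULL, NULL)

Besides the brevity, passing __alignof__ requests the type's natural alignment, where the open-coded calls passed 0 and took the allocator's default.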
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index cc6cc8ed2e39..fe96108a788d 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -293,8 +293,7 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag
293{ 293{
294 struct befs_inode_info *bi = (struct befs_inode_info *) foo; 294 struct befs_inode_info *bi = (struct befs_inode_info *) foo;
295 295
296 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == 296 if (flags & SLAB_CTOR_CONSTRUCTOR) {
297 SLAB_CTOR_CONSTRUCTOR) {
298 inode_init_once(&bi->vfs_inode); 297 inode_init_once(&bi->vfs_inode);
299 } 298 }
300} 299}
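The same two-line simplification recurs in bfs, block_dev.c, buffer.c and cifs below. The apparent reason (an assumption about this kernel series) is that the slab allocator's verify pass is gone: constructors are no longer re-invoked with SLAB_CTOR_VERIFY set for debugging, so the single-flag test suffices. The generic shape, with a hypothetical cache object:

#include <linux/fs.h>
#include <linux/slab.h>

static void example_init_once(void *foo, struct kmem_cache *cachep,
			      unsigned long flags)
{
	struct inode *inode = foo;

	/* was: if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
	 *		SLAB_CTOR_CONSTRUCTOR) */
	if (flags & SLAB_CTOR_CONSTRUCTOR)
		inode_init_once(inode);
}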
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index 93d6219243ad..edc08d89aabc 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -248,8 +248,7 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag
248{ 248{
249 struct bfs_inode_info *bi = foo; 249 struct bfs_inode_info *bi = foo;
250 250
251 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == 251 if (flags & SLAB_CTOR_CONSTRUCTOR)
252 SLAB_CTOR_CONSTRUCTOR)
253 inode_init_once(&bi->vfs_inode); 252 inode_init_once(&bi->vfs_inode);
254} 253}
255 254
diff --git a/fs/bio.c b/fs/bio.c
index 7618bcb18368..093345f00128 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -28,7 +28,7 @@
28#include <linux/blktrace_api.h> 28#include <linux/blktrace_api.h>
29#include <scsi/sg.h> /* for struct sg_iovec */ 29#include <scsi/sg.h> /* for struct sg_iovec */
30 30
31#define BIO_POOL_SIZE 256 31#define BIO_POOL_SIZE 2
32 32
33static struct kmem_cache *bio_slab __read_mostly; 33static struct kmem_cache *bio_slab __read_mostly;
34 34
@@ -38,7 +38,7 @@ static struct kmem_cache *bio_slab __read_mostly;
38 * a small number of entries is fine, not going to be performance critical. 38 * a small number of entries is fine, not going to be performance critical.
39 * basically we just need to survive 39 * basically we just need to survive
40 */ 40 */
41#define BIO_SPLIT_ENTRIES 8 41#define BIO_SPLIT_ENTRIES 2
42mempool_t *bio_split_pool __read_mostly; 42mempool_t *bio_split_pool __read_mostly;
43 43
44struct biovec_slab { 44struct biovec_slab {
@@ -1120,7 +1120,7 @@ struct bio_pair *bio_split(struct bio *bi, mempool_t *pool, int first_sectors)
1120 * create memory pools for biovec's in a bio_set. 1120 * create memory pools for biovec's in a bio_set.
1121 * use the global biovec slabs created for general use. 1121 * use the global biovec slabs created for general use.
1122 */ 1122 */
1123static int biovec_create_pools(struct bio_set *bs, int pool_entries, int scale) 1123static int biovec_create_pools(struct bio_set *bs, int pool_entries)
1124{ 1124{
1125 int i; 1125 int i;
1126 1126
@@ -1128,9 +1128,6 @@ static int biovec_create_pools(struct bio_set *bs, int pool_entries, int scale)
1128 struct biovec_slab *bp = bvec_slabs + i; 1128 struct biovec_slab *bp = bvec_slabs + i;
1129 mempool_t **bvp = bs->bvec_pools + i; 1129 mempool_t **bvp = bs->bvec_pools + i;
1130 1130
1131 if (pool_entries > 1 && i >= scale)
1132 pool_entries >>= 1;
1133
1134 *bvp = mempool_create_slab_pool(pool_entries, bp->slab); 1131 *bvp = mempool_create_slab_pool(pool_entries, bp->slab);
1135 if (!*bvp) 1132 if (!*bvp)
1136 return -ENOMEM; 1133 return -ENOMEM;
@@ -1161,7 +1158,7 @@ void bioset_free(struct bio_set *bs)
1161 kfree(bs); 1158 kfree(bs);
1162} 1159}
1163 1160
1164struct bio_set *bioset_create(int bio_pool_size, int bvec_pool_size, int scale) 1161struct bio_set *bioset_create(int bio_pool_size, int bvec_pool_size)
1165{ 1162{
1166 struct bio_set *bs = kzalloc(sizeof(*bs), GFP_KERNEL); 1163 struct bio_set *bs = kzalloc(sizeof(*bs), GFP_KERNEL);
1167 1164
@@ -1172,7 +1169,7 @@ struct bio_set *bioset_create(int bio_pool_size, int bvec_pool_size, int scale)
1172 if (!bs->bio_pool) 1169 if (!bs->bio_pool)
1173 goto bad; 1170 goto bad;
1174 1171
1175 if (!biovec_create_pools(bs, bvec_pool_size, scale)) 1172 if (!biovec_create_pools(bs, bvec_pool_size))
1176 return bs; 1173 return bs;
1177 1174
1178bad: 1175bad:
@@ -1196,38 +1193,11 @@ static void __init biovec_init_slabs(void)
1196 1193
1197static int __init init_bio(void) 1194static int __init init_bio(void)
1198{ 1195{
1199 int megabytes, bvec_pool_entries; 1196 bio_slab = KMEM_CACHE(bio, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
1200 int scale = BIOVEC_NR_POOLS;
1201
1202 bio_slab = kmem_cache_create("bio", sizeof(struct bio), 0,
1203 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
1204 1197
1205 biovec_init_slabs(); 1198 biovec_init_slabs();
1206 1199
1207 megabytes = nr_free_pages() >> (20 - PAGE_SHIFT); 1200 fs_bio_set = bioset_create(BIO_POOL_SIZE, 2);
1208
1209 /*
1210 * find out where to start scaling
1211 */
1212 if (megabytes <= 16)
1213 scale = 0;
1214 else if (megabytes <= 32)
1215 scale = 1;
1216 else if (megabytes <= 64)
1217 scale = 2;
1218 else if (megabytes <= 96)
1219 scale = 3;
1220 else if (megabytes <= 128)
1221 scale = 4;
1222
1223 /*
1224 * Limit number of entries reserved -- mempools are only used when
1225 * the system is completely unable to allocate memory, so we only
1226 * need enough to make progress.
1227 */
1228 bvec_pool_entries = 1 + scale;
1229
1230 fs_bio_set = bioset_create(BIO_POOL_SIZE, bvec_pool_entries, scale);
1231 if (!fs_bio_set) 1201 if (!fs_bio_set)
1232 panic("bio: can't allocate bios\n"); 1202 panic("bio: can't allocate bios\n");
1233 1203
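bioset_create() loses its scale argument above because the memory-size-based shrinking of the biovec mempool reserves is removed: as the deleted comment says, mempools are only used when the system cannot allocate at all, so a small constant reserve is enough to guarantee forward progress. A hypothetical out-of-tree caller would adapt like this (pool sizes illustrative):

#include <linux/bio.h>

static struct bio_set *example_bioset(void)
{
	/* was: bioset_create(4, 4, scale) */
	return bioset_create(4 /* bio reserve */, 2 /* biovec reserve */);
}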
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 575076c018f4..f02b7bdd9864 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -55,10 +55,12 @@ static sector_t max_block(struct block_device *bdev)
55 return retval; 55 return retval;
56} 56}
57 57
58/* Kill _all_ buffers, dirty or not.. */ 58/* Kill _all_ buffers and pagecache, dirty or not.. */
59static void kill_bdev(struct block_device *bdev) 59static void kill_bdev(struct block_device *bdev)
60{ 60{
61 invalidate_bdev(bdev, 1); 61 if (bdev->bd_inode->i_mapping->nrpages == 0)
62 return;
63 invalidate_bh_lrus();
62 truncate_inode_pages(bdev->bd_inode->i_mapping, 0); 64 truncate_inode_pages(bdev->bd_inode->i_mapping, 0);
63} 65}
64 66
@@ -455,9 +457,7 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag
455 struct bdev_inode *ei = (struct bdev_inode *) foo; 457 struct bdev_inode *ei = (struct bdev_inode *) foo;
456 struct block_device *bdev = &ei->bdev; 458 struct block_device *bdev = &ei->bdev;
457 459
458 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == 460 if (flags & SLAB_CTOR_CONSTRUCTOR) {
459 SLAB_CTOR_CONSTRUCTOR)
460 {
461 memset(bdev, 0, sizeof(*bdev)); 461 memset(bdev, 0, sizeof(*bdev));
462 mutex_init(&bdev->bd_mutex); 462 mutex_init(&bdev->bd_mutex);
463 sema_init(&bdev->bd_mount_sem, 1); 463 sema_init(&bdev->bd_mount_sem, 1);
@@ -1478,7 +1478,7 @@ int __invalidate_device(struct block_device *bdev)
1478 res = invalidate_inodes(sb); 1478 res = invalidate_inodes(sb);
1479 drop_super(sb); 1479 drop_super(sb);
1480 } 1480 }
1481 invalidate_bdev(bdev, 0); 1481 invalidate_bdev(bdev);
1482 return res; 1482 return res;
1483} 1483}
1484EXPORT_SYMBOL(__invalidate_device); 1484EXPORT_SYMBOL(__invalidate_device);
diff --git a/fs/buffer.c b/fs/buffer.c
index 1d0852fa728b..7db24b9e5449 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -44,7 +44,6 @@
44#include <linux/bit_spinlock.h> 44#include <linux/bit_spinlock.h>
45 45
46static int fsync_buffers_list(spinlock_t *lock, struct list_head *list); 46static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
47static void invalidate_bh_lrus(void);
48 47
49#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers) 48#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
50 49
@@ -333,7 +332,7 @@ out:
333 we think the disk contains more recent information than the buffercache. 332 we think the disk contains more recent information than the buffercache.
334 The update == 1 pass marks the buffers we need to update, the update == 2 333 The update == 1 pass marks the buffers we need to update, the update == 2
335 pass does the actual I/O. */ 334 pass does the actual I/O. */
336void invalidate_bdev(struct block_device *bdev, int destroy_dirty_buffers) 335void invalidate_bdev(struct block_device *bdev)
337{ 336{
338 struct address_space *mapping = bdev->bd_inode->i_mapping; 337 struct address_space *mapping = bdev->bd_inode->i_mapping;
339 338
@@ -341,11 +340,6 @@ void invalidate_bdev(struct block_device *bdev, int destroy_dirty_buffers)
341 return; 340 return;
342 341
343 invalidate_bh_lrus(); 342 invalidate_bh_lrus();
344 /*
345 * FIXME: what about destroy_dirty_buffers?
346 * We really want to use invalidate_inode_pages2() for
347 * that, but not until that's cleaned up.
348 */
349 invalidate_mapping_pages(mapping, 0, -1); 343 invalidate_mapping_pages(mapping, 0, -1);
350} 344}
351 345
@@ -1408,7 +1402,7 @@ static void invalidate_bh_lru(void *arg)
1408 put_cpu_var(bh_lrus); 1402 put_cpu_var(bh_lrus);
1409} 1403}
1410 1404
1411static void invalidate_bh_lrus(void) 1405void invalidate_bh_lrus(void)
1412{ 1406{
1413 on_each_cpu(invalidate_bh_lru, NULL, 1, 1); 1407 on_each_cpu(invalidate_bh_lru, NULL, 1, 1);
1414} 1408}
@@ -1700,17 +1694,8 @@ done:
1700 * clean. Someone wrote them back by hand with 1694 * clean. Someone wrote them back by hand with
1701 * ll_rw_block/submit_bh. A rare case. 1695 * ll_rw_block/submit_bh. A rare case.
1702 */ 1696 */
1703 int uptodate = 1;
1704 do {
1705 if (!buffer_uptodate(bh)) {
1706 uptodate = 0;
1707 break;
1708 }
1709 bh = bh->b_this_page;
1710 } while (bh != head);
1711 if (uptodate)
1712 SetPageUptodate(page);
1713 end_page_writeback(page); 1697 end_page_writeback(page);
1698
1714 /* 1699 /*
1715 * The page and buffer_heads can be released at any time from 1700 * The page and buffer_heads can be released at any time from
1716 * here on. 1701 * here on.
@@ -2968,8 +2953,7 @@ EXPORT_SYMBOL(free_buffer_head);
2968static void 2953static void
2969init_buffer_head(void *data, struct kmem_cache *cachep, unsigned long flags) 2954init_buffer_head(void *data, struct kmem_cache *cachep, unsigned long flags)
2970{ 2955{
2971 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == 2956 if (flags & SLAB_CTOR_CONSTRUCTOR) {
2972 SLAB_CTOR_CONSTRUCTOR) {
2973 struct buffer_head * bh = (struct buffer_head *)data; 2957 struct buffer_head * bh = (struct buffer_head *)data;
2974 2958
2975 memset(bh, 0, sizeof(*bh)); 2959 memset(bh, 0, sizeof(*bh));
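With the FIXME comment resolved, invalidate_bdev() drops the destroy_dirty_buffers flag that no caller meaningfully used, and invalidate_bh_lrus() becomes non-static so that kill_bdev() in block_dev.c (earlier in this diff) can flush the per-CPU buffer-head LRUs itself. A sketch of how a caller adapts, assuming the new declarations land in <linux/buffer_head.h>:

#include <linux/buffer_head.h>
#include <linux/fs.h>

static void example_drop_caches(struct block_device *bdev)
{
	invalidate_bh_lrus();	/* previously static to fs/buffer.c */
	invalidate_bdev(bdev);	/* was: invalidate_bdev(bdev, 1) */
}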
diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index 5d1f4873d701..a9b6bc5157b8 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -1,4 +1,16 @@
1Verison 1.48 1Version 1.49
2------------
3IPv6 support. Enable ipv6 addresses to be passed on mount (put the ipv6
4address after the "ip=" mount option, at least until mount.cifs is fixed to
5handle DNS host to ipv6 name translation). Accept override of uid or gid
6on mount even when Unix Extensions are negotiated (it used to be ignored
7when Unix Extensions were ignored). This allows users to override the
8default uid and gid for files when they are certain that the uids or
9gids on the server do not match those of the client. Make "sec=none"
10mount override username (so that null user connection is attempted)
11to match what documentation said.
12
13Version 1.48
2------------ 14------------
3Fix mtime bouncing around from local idea of last write times to remote time. 15Fix mtime bouncing around from local idea of last write times to remote time.
4Fix hang (in i_size_read) when simultaneous size update of same remote file 16Fix hang (in i_size_read) when simultaneous size update of same remote file
@@ -9,7 +21,13 @@ from read-only back to read-write, reflect this change in default file mode
9(we had been leaving a file's mode read-only until the inode were reloaded). 21(we had been leaving a file's mode read-only until the inode were reloaded).
10Allow setting of attribute back to ATTR_NORMAL (removing readonly dos attribute 22Allow setting of attribute back to ATTR_NORMAL (removing readonly dos attribute
11when archive dos attribute not set and we are changing mode back to writeable 23when archive dos attribute not set and we are changing mode back to writeable
12on server which does not support the Unix Extensions). 24on server which does not support the Unix Extensions). Remove read only dos
25attribute on chmod when adding any write permission (ie on any of
26user/group/other, not all of user/group/other ie 0222) when
27mounted to windows. Add support for POSIX MkDir (slight performance
28enhancement and eliminates the network race between the mkdir and set
29path info of the mode).
30
13 31
14Version 1.47 32Version 1.47
15------------ 33------------
diff --git a/fs/cifs/README b/fs/cifs/README
index 080c5eba112b..4d01697722cc 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -257,13 +257,19 @@ A partial list of the supported mount options follows:
257 mount. 257 mount.
258 domain Set the SMB/CIFS workgroup name prepended to the 258 domain Set the SMB/CIFS workgroup name prepended to the
259 username during CIFS session establishment 259 username during CIFS session establishment
260 uid If CIFS Unix extensions are not supported by the server 260 uid Set the default uid for inodes. For mounts to servers
261 this overrides the default uid for inodes. For mounts to 261 which do support the CIFS Unix extensions, such as a
262 servers which do support the CIFS Unix extensions, such 262 properly configured Samba server, the server provides
263 as a properly configured Samba server, the server provides 263 the uid, gid and mode so this parameter should not be
264 the uid, gid and mode. For servers which do not support 264 specified unless the server and client uid and gid
265 the Unix extensions, the default uid (and gid) returned on 265 numbering differ. If the server and client are in the
266 lookup of existing files is the uid (gid) of the person 266 same domain (e.g. running winbind or nss_ldap) and
267 the server supports the Unix Extensions then the uid
268 and gid can be retrieved from the server (and uid
269 and gid would not have to be specified on the mount).
270 For servers which do not support the CIFS Unix
271 extensions, the default uid (and gid) returned on lookup
272 of existing files will be the uid (gid) of the person
267 who executed the mount (root, except when mount.cifs 273 who executed the mount (root, except when mount.cifs
268 is configured setuid for user mounts) unless the "uid=" 274 is configured setuid for user mounts) unless the "uid="
269 (gid) mount option is specified. For the uid (gid) of newly 275 (gid) mount option is specified. For the uid (gid) of newly
@@ -281,8 +287,7 @@ A partial list of the supported mount options follows:
281 the client. Note that the mount.cifs helper must be 287 the client. Note that the mount.cifs helper must be
282 at version 1.10 or higher to support specifying the uid 288 at version 1.10 or higher to support specifying the uid
283 (or gid) in non-numeric form. 289 (or gid) in non-numeric form.
284 gid If CIFS Unix extensions are not supported by the server 290 gid Set the default gid for inodes (similar to above).
285 this overrides the default gid for inodes.
286 file_mode If CIFS Unix extensions are not supported by the server 291 file_mode If CIFS Unix extensions are not supported by the server
287 this overrides the default mode for file inodes. 292 this overrides the default mode for file inodes.
288 dir_mode If CIFS Unix extensions are not supported by the server 293 dir_mode If CIFS Unix extensions are not supported by the server
@@ -467,7 +472,7 @@ including:
467 -V print mount.cifs version 472 -V print mount.cifs version
468 -? display simple usage information 473 -? display simple usage information
469 474
470With recent 2.6 kernel versions of modutils, the version of the cifs kernel 475With most 2.6 kernel versions of modutils, the version of the cifs kernel
471module can be displayed via modinfo. 476module can be displayed via modinfo.
472 477
473Misc /proc/fs/cifs Flags and Debug Info 478Misc /proc/fs/cifs Flags and Debug Info
@@ -516,8 +521,22 @@ SecurityFlags Flags which control security negotiation and
516 must use plaintext passwords 0x20020 521 must use plaintext passwords 0x20020
517 (reserved for future packet encryption) 0x00040 522 (reserved for future packet encryption) 0x00040
518 523
519cifsFYI If set to one, additional debug information is 524cifsFYI If set to a non-zero value, additional debug information
520 logged to the system error log. (default 0) 525 will be logged to the system error log. This field
526 contains three flags controlling different classes of
527 debugging entries. The maximum value it can be set
528 to is 7 which enables all debugging points (default 0).
529 Some debugging statements are not compiled into the
530 cifs kernel unless CONFIG_CIFS_DEBUG2 is enabled in the
531 kernel configuration. cifsFYI may be set to one or
532 more of the following flags (7 sets them all):
533
534 log cifs informational messages 0x01
535 log return codes from cifs entry points 0x02
536 log slow responses (ie those taking longer than 1 second)
537 CONFIG_CIFS_STATS2 must be enabled in .config 0x04
538
539
521traceSMB If set to one, debug information is logged to the 540traceSMB If set to one, debug information is logged to the
522 system error log with the start of smb requests 541 system error log with the start of smb requests
523 and responses (default 0) 542 and responses (default 0)
diff --git a/fs/cifs/TODO b/fs/cifs/TODO
index d7b9c27c942d..78b620e332bd 100644
--- a/fs/cifs/TODO
+++ b/fs/cifs/TODO
@@ -1,4 +1,4 @@
1Version 1.39 November 30, 2005 1Version 1.49 April 26, 2007
2 2
3A Partial List of Missing Features 3A Partial List of Missing Features
4================================== 4==================================
@@ -18,7 +18,7 @@ better)
18 18
19d) Kerberos/SPNEGO session setup support - (started) 19d) Kerberos/SPNEGO session setup support - (started)
20 20
21e) NTLMv2 authentication (mostly implemented - double check 21e) More testing of NTLMv2 authentication (mostly implemented - double check
22that NTLMv2 signing works, also need to cleanup now unneeded SessSetup code in 22that NTLMv2 signing works, also need to cleanup now unneeded SessSetup code in
23fs/cifs/connect.c) 23fs/cifs/connect.c)
24 24
@@ -27,55 +27,44 @@ used (Kerberos or NTLMSSP). Signing already implemented for NTLM
27and raw NTLMSSP already. This is important when enabling 27and raw NTLMSSP already. This is important when enabling
28extended security and mounting to Windows 2003 Servers 28extended security and mounting to Windows 2003 Servers
29 29
30f) Directory entry caching relies on a 1 second timer, rather than 30g) Directory entry caching relies on a 1 second timer, rather than
31using FindNotify or equivalent. - (started) 31using FindNotify or equivalent. - (started)
32 32
33g) A few byte range testcases fail due to POSIX vs. Windows/CIFS 33h) quota support (needs minor kernel change since quota calls
34style byte range lock differences. Save byte range locks so
35reconnect can replay them.
36
37h) Support unlock all (unlock 0,MAX_OFFSET)
38by unlocking all known byte range locks that we locked on the file.
39
40i) quota support (needs minor kernel change since quota calls
41to make it to network filesystems or deviceless filesystems) 34to make it to network filesystems or deviceless filesystems)
42 35
43j) investigate sync behavior (including syncpage) and check 36i) investigate sync behavior (including syncpage) and check
44for proper behavior of intr/nointr 37for proper behavior of intr/nointr
45 38
46k) hook lower into the sockets api (as NFS/SunRPC does) to avoid the 39j) hook lower into the sockets api (as NFS/SunRPC does) to avoid the
47extra copy in/out of the socket buffers in some cases. 40extra copy in/out of the socket buffers in some cases.
48 41
49l) finish support for IPv6. This is mostly complete but 42k) Better optimize open (and pathbased setfilesize) to reduce the
50needs a simple conversion of ipv6 to sin6_addr from the
51address in string representation.
52
53m) Better optimize open (and pathbased setfilesize) to reduce the
54oplock breaks coming from windows srv. Piggyback identical file 43oplock breaks coming from windows srv. Piggyback identical file
55opens on top of each other by incrementing reference count rather 44opens on top of each other by incrementing reference count rather
56than resending (helps reduce server resource utilization and avoid 45than resending (helps reduce server resource utilization and avoid
57spurious oplock breaks). 46spurious oplock breaks).
58 47
59o) Improve performance of readpages by sending more than one read 48l) Improve performance of readpages by sending more than one read
60at a time when 8 pages or more are requested. In conjunction 49at a time when 8 pages or more are requested. In conjunction
61add support for async_cifs_readpages. 50add support for async_cifs_readpages.
62 51
63p) Add support for storing symlink info to Windows servers 52m) Add support for storing symlink info to Windows servers
64in the Extended Attribute format their SFU clients would recognize. 53in the Extended Attribute format their SFU clients would recognize.
65 54
66q) Finish fcntl D_NOTIFY support so kde and gnome file list windows 55n) Finish fcntl D_NOTIFY support so kde and gnome file list windows
67will autorefresh (partially complete by Asser). Needs minor kernel 56will autorefresh (partially complete by Asser). Needs minor kernel
68vfs change to support removing D_NOTIFY on a file. 57vfs change to support removing D_NOTIFY on a file.
69 58
70r) Add GUI tool to configure /proc/fs/cifs settings and for display of 59o) Add GUI tool to configure /proc/fs/cifs settings and for display of
71the CIFS statistics (started) 60the CIFS statistics (started)
72 61
73s) implement support for security and trusted categories of xattrs 62p) implement support for security and trusted categories of xattrs
74(requires minor protocol extension) to enable better support for SELINUX 63(requires minor protocol extension) to enable better support for SELINUX
75 64
76t) Implement O_DIRECT flag on open (already supported on mount) 65q) Implement O_DIRECT flag on open (already supported on mount)
77 66
78u) Create UID mapping facility so server UIDs can be mapped on a per 67r) Create UID mapping facility so server UIDs can be mapped on a per
79mount or a per server basis to client UIDs or nobody if no mapping 68mount or a per server basis to client UIDs or nobody if no mapping
80exists. This is helpful when Unix extensions are negotiated to 69exists. This is helpful when Unix extensions are negotiated to
81allow better permission checking when UIDs differ on the server 70allow better permission checking when UIDs differ on the server
@@ -83,19 +72,26 @@ and client. Add new protocol request to the CIFS protocol
83standard for asking the server for the corresponding name of a 72standard for asking the server for the corresponding name of a
84particular uid. 73particular uid.
85 74
86v) Add support for CIFS Unix and also the newer POSIX extensions to the 75s) Add support for CIFS Unix and also the newer POSIX extensions to the
87server side for Samba 4. 76server side for Samba 4.
88 77
89w) Finish up the dos time conversion routines needed to return old server 78t) In support for OS/2 (LANMAN 1.2 and LANMAN2.1 based SMB servers)
90time to the client (default time, of now or time 0 is used now for these
91very old servers)
92
93x) In support for OS/2 (LANMAN 1.2 and LANMAN2.1 based SMB servers)
94need to add ability to set time to server (utimes command) 79need to add ability to set time to server (utimes command)
95 80
96y) Finish testing of Windows 9x/Windows ME server support (started). 81u) DOS attrs - returned as pseudo-xattr in Samba format (check VFAT and NTFS for this too)
82
83v) mount check for unmatched uids
84
85w) Add mount option for Linux extension disable per mount, and partial
86disable per mount (uid off, symlink/fifo/mknod on but what about posix acls?)
97 87
98KNOWN BUGS (updated February 26, 2007) 88x) Fix Samba 3 server to handle Linux kernel aio so dbench with lots of
89processes can proceed better in parallel (on the server)
90
91y) Fix Samba 3 to handle reads/writes over 127K (and remove the cifs mount
92restriction of wsize max being 127K)
93
94KNOWN BUGS (updated April 24, 2007)
99==================================== 95====================================
100See http://bugzilla.samba.org - search on product "CifsVFS" for 96See http://bugzilla.samba.org - search on product "CifsVFS" for
101current bug list. 97current bug list.
@@ -127,10 +123,3 @@ negotiated size) and send larger write sizes to modern servers.
1274) More exhaustively test against less common servers. More testing 1234) More exhaustively test against less common servers. More testing
128against Windows 9x, Windows ME servers. 124against Windows 9x, Windows ME servers.
129 125
130DOS attrs - returned as pseudo-xattr in Samba format (check VFAT and NTFS for this too)
131
132mount check for unmatched uids - and uid override
133
134Add mount option for Linux extension disable per mount, and partial disable per mount (uid off, symlink/fifo/mknod on but what about posix acls?)
135
136Free threads at umount --force that are stuck on the sesSem
diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index fd1e52ebcee6..4cc2012e9322 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -22,12 +22,14 @@
22#define CIFS_MOUNT_SET_UID 2 /* set current->euid in create etc. */ 22#define CIFS_MOUNT_SET_UID 2 /* set current->euid in create etc. */
23#define CIFS_MOUNT_SERVER_INUM 4 /* inode numbers from uniqueid from server */ 23#define CIFS_MOUNT_SERVER_INUM 4 /* inode numbers from uniqueid from server */
24#define CIFS_MOUNT_DIRECT_IO 8 /* do not write nor read through page cache */ 24#define CIFS_MOUNT_DIRECT_IO 8 /* do not write nor read through page cache */
25#define CIFS_MOUNT_NO_XATTR 0x10 /* if set - disable xattr support */ 25#define CIFS_MOUNT_NO_XATTR 0x10 /* if set - disable xattr support */
26#define CIFS_MOUNT_MAP_SPECIAL_CHR 0x20 /* remap illegal chars in filenames */ 26#define CIFS_MOUNT_MAP_SPECIAL_CHR 0x20 /* remap illegal chars in filenames */
27#define CIFS_MOUNT_POSIX_PATHS 0x40 /* Negotiate posix pathnames if possible. */ 27#define CIFS_MOUNT_POSIX_PATHS 0x40 /* Negotiate posix pathnames if possible*/
28#define CIFS_MOUNT_UNX_EMUL 0x80 /* Network compat with SFUnix emulation */ 28#define CIFS_MOUNT_UNX_EMUL 0x80 /* Network compat with SFUnix emulation */
29#define CIFS_MOUNT_NO_BRL 0x100 /* No sending byte range locks to srv */ 29#define CIFS_MOUNT_NO_BRL 0x100 /* No sending byte range locks to srv */
30#define CIFS_MOUNT_CIFS_ACL 0x200 /* send ACL requests to non-POSIX srv */ 30#define CIFS_MOUNT_CIFS_ACL 0x200 /* send ACL requests to non-POSIX srv */
31#define CIFS_MOUNT_OVERR_UID 0x400 /* override uid returned from server */
32#define CIFS_MOUNT_OVERR_GID 0x800 /* override gid returned from server */
31 33
32struct cifs_sb_info { 34struct cifs_sb_info {
33 struct cifsTconInfo *tcon; /* primary mount */ 35 struct cifsTconInfo *tcon; /* primary mount */
diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c
index d2a8b2941fc2..793c4b95c164 100644
--- a/fs/cifs/cifs_unicode.c
+++ b/fs/cifs/cifs_unicode.c
@@ -74,8 +74,8 @@ cifs_strtoUCS(__le16 * to, const char *from, int len,
74 charlen = codepage->char2uni(from, len, &wchar_to[i]); 74 charlen = codepage->char2uni(from, len, &wchar_to[i]);
75 if (charlen < 1) { 75 if (charlen < 1) {
76 cERROR(1, 76 cERROR(1,
77 ("cifs_strtoUCS: char2uni returned %d", 77 ("strtoUCS: char2uni of %d returned %d",
78 charlen)); 78 (int)*from, charlen));
79 /* A question mark */ 79 /* A question mark */
80 to[i] = cpu_to_le16(0x003f); 80 to[i] = cpu_to_le16(0x003f);
81 charlen = 1; 81 charlen = 1;
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index faba4d69fe91..8568e100953c 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -100,7 +100,7 @@ cifs_read_super(struct super_block *sb, void *data,
100 sb->s_flags |= MS_NODIRATIME | MS_NOATIME; 100 sb->s_flags |= MS_NODIRATIME | MS_NOATIME;
101 sb->s_fs_info = kzalloc(sizeof(struct cifs_sb_info),GFP_KERNEL); 101 sb->s_fs_info = kzalloc(sizeof(struct cifs_sb_info),GFP_KERNEL);
102 cifs_sb = CIFS_SB(sb); 102 cifs_sb = CIFS_SB(sb);
103 if(cifs_sb == NULL) 103 if (cifs_sb == NULL)
104 return -ENOMEM; 104 return -ENOMEM;
105 105
106 rc = cifs_mount(sb, cifs_sb, data, devname); 106 rc = cifs_mount(sb, cifs_sb, data, devname);
@@ -115,10 +115,10 @@ cifs_read_super(struct super_block *sb, void *data,
115 sb->s_magic = CIFS_MAGIC_NUMBER; 115 sb->s_magic = CIFS_MAGIC_NUMBER;
116 sb->s_op = &cifs_super_ops; 116 sb->s_op = &cifs_super_ops;
117#ifdef CONFIG_CIFS_EXPERIMENTAL 117#ifdef CONFIG_CIFS_EXPERIMENTAL
118 if(experimEnabled != 0) 118 if (experimEnabled != 0)
119 sb->s_export_op = &cifs_export_ops; 119 sb->s_export_op = &cifs_export_ops;
120#endif /* EXPERIMENTAL */ 120#endif /* EXPERIMENTAL */
121/* if(cifs_sb->tcon->ses->server->maxBuf > MAX_CIFS_HDR_SIZE + 512) 121/* if (cifs_sb->tcon->ses->server->maxBuf > MAX_CIFS_HDR_SIZE + 512)
122 sb->s_blocksize = cifs_sb->tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE; */ 122 sb->s_blocksize = cifs_sb->tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE; */
123#ifdef CONFIG_CIFS_QUOTA 123#ifdef CONFIG_CIFS_QUOTA
124 sb->s_qcop = &cifs_quotactl_ops; 124 sb->s_qcop = &cifs_quotactl_ops;
@@ -147,8 +147,8 @@ out_no_root:
147 iput(inode); 147 iput(inode);
148 148
149out_mount_failed: 149out_mount_failed:
150 if(cifs_sb) { 150 if (cifs_sb) {
151 if(cifs_sb->local_nls) 151 if (cifs_sb->local_nls)
152 unload_nls(cifs_sb->local_nls); 152 unload_nls(cifs_sb->local_nls);
153 kfree(cifs_sb); 153 kfree(cifs_sb);
154 } 154 }
@@ -163,7 +163,7 @@ cifs_put_super(struct super_block *sb)
163 163
164 cFYI(1, ("In cifs_put_super")); 164 cFYI(1, ("In cifs_put_super"));
165 cifs_sb = CIFS_SB(sb); 165 cifs_sb = CIFS_SB(sb);
166 if(cifs_sb == NULL) { 166 if (cifs_sb == NULL) {
167 cFYI(1,("Empty cifs superblock info passed to unmount")); 167 cFYI(1,("Empty cifs superblock info passed to unmount"));
168 return; 168 return;
169 } 169 }
@@ -208,14 +208,14 @@ cifs_statfs(struct dentry *dentry, struct kstatfs *buf)
208 208
209 /* Only need to call the old QFSInfo if failed 209 /* Only need to call the old QFSInfo if failed
210 on newer one */ 210 on newer one */
211 if(rc) 211 if (rc)
212 if(pTcon->ses->capabilities & CAP_NT_SMBS) 212 if (pTcon->ses->capabilities & CAP_NT_SMBS)
213 rc = CIFSSMBQFSInfo(xid, pTcon, buf); /* not supported by OS2 */ 213 rc = CIFSSMBQFSInfo(xid, pTcon, buf); /* not supported by OS2 */
214 214
215 /* Some old Windows servers also do not support level 103, retry with 215 /* Some old Windows servers also do not support level 103, retry with
216 older level one if old server failed the previous call or we 216 older level one if old server failed the previous call or we
217 bypassed it because we detected that this was an older LANMAN sess */ 217 bypassed it because we detected that this was an older LANMAN sess */
218 if(rc) 218 if (rc)
219 rc = SMBOldQFSInfo(xid, pTcon, buf); 219 rc = SMBOldQFSInfo(xid, pTcon, buf);
220 /* 220 /*
221 int f_type; 221 int f_type;
@@ -301,11 +301,19 @@ cifs_show_options(struct seq_file *s, struct vfsmount *m)
301 if (cifs_sb->tcon->ses->userName) 301 if (cifs_sb->tcon->ses->userName)
302 seq_printf(s, ",username=%s", 302 seq_printf(s, ",username=%s",
303 cifs_sb->tcon->ses->userName); 303 cifs_sb->tcon->ses->userName);
304 if(cifs_sb->tcon->ses->domainName) 304 if (cifs_sb->tcon->ses->domainName)
305 seq_printf(s, ",domain=%s", 305 seq_printf(s, ",domain=%s",
306 cifs_sb->tcon->ses->domainName); 306 cifs_sb->tcon->ses->domainName);
307 } 307 }
308 } 308 }
309 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS)
310 seq_printf(s, ",posixpaths");
311 if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID) ||
312 !(cifs_sb->tcon->ses->capabilities & CAP_UNIX))
313 seq_printf(s, ",uid=%d", cifs_sb->mnt_uid);
314 if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID) ||
315 !(cifs_sb->tcon->ses->capabilities & CAP_UNIX))
316 seq_printf(s, ",gid=%d", cifs_sb->mnt_gid);
309 seq_printf(s, ",rsize=%d",cifs_sb->rsize); 317 seq_printf(s, ",rsize=%d",cifs_sb->rsize);
310 seq_printf(s, ",wsize=%d",cifs_sb->wsize); 318 seq_printf(s, ",wsize=%d",cifs_sb->wsize);
311 } 319 }
@@ -321,14 +329,14 @@ int cifs_xquota_set(struct super_block * sb, int quota_type, qid_t qid,
321 struct cifs_sb_info *cifs_sb = CIFS_SB(sb); 329 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
322 struct cifsTconInfo *pTcon; 330 struct cifsTconInfo *pTcon;
323 331
324 if(cifs_sb) 332 if (cifs_sb)
325 pTcon = cifs_sb->tcon; 333 pTcon = cifs_sb->tcon;
326 else 334 else
327 return -EIO; 335 return -EIO;
328 336
329 337
330 xid = GetXid(); 338 xid = GetXid();
331 if(pTcon) { 339 if (pTcon) {
332 cFYI(1,("set type: 0x%x id: %d",quota_type,qid)); 340 cFYI(1,("set type: 0x%x id: %d",quota_type,qid));
333 } else { 341 } else {
334 return -EIO; 342 return -EIO;
@@ -346,13 +354,13 @@ int cifs_xquota_get(struct super_block * sb, int quota_type, qid_t qid,
346 struct cifs_sb_info *cifs_sb = CIFS_SB(sb); 354 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
347 struct cifsTconInfo *pTcon; 355 struct cifsTconInfo *pTcon;
348 356
349 if(cifs_sb) 357 if (cifs_sb)
350 pTcon = cifs_sb->tcon; 358 pTcon = cifs_sb->tcon;
351 else 359 else
352 return -EIO; 360 return -EIO;
353 361
354 xid = GetXid(); 362 xid = GetXid();
355 if(pTcon) { 363 if (pTcon) {
356 cFYI(1,("set type: 0x%x id: %d",quota_type,qid)); 364 cFYI(1,("set type: 0x%x id: %d",quota_type,qid));
357 } else { 365 } else {
358 rc = -EIO; 366 rc = -EIO;
@@ -369,13 +377,13 @@ int cifs_xstate_set(struct super_block * sb, unsigned int flags, int operation)
369 struct cifs_sb_info *cifs_sb = CIFS_SB(sb); 377 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
370 struct cifsTconInfo *pTcon; 378 struct cifsTconInfo *pTcon;
371 379
372 if(cifs_sb) 380 if (cifs_sb)
373 pTcon = cifs_sb->tcon; 381 pTcon = cifs_sb->tcon;
374 else 382 else
375 return -EIO; 383 return -EIO;
376 384
377 xid = GetXid(); 385 xid = GetXid();
378 if(pTcon) { 386 if (pTcon) {
379 cFYI(1,("flags: 0x%x operation: 0x%x",flags,operation)); 387 cFYI(1,("flags: 0x%x operation: 0x%x",flags,operation));
380 } else { 388 } else {
381 rc = -EIO; 389 rc = -EIO;
@@ -392,13 +400,13 @@ int cifs_xstate_get(struct super_block * sb, struct fs_quota_stat *qstats)
392 struct cifs_sb_info *cifs_sb = CIFS_SB(sb); 400 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
393 struct cifsTconInfo *pTcon; 401 struct cifsTconInfo *pTcon;
394 402
395 if(cifs_sb) { 403 if (cifs_sb) {
396 pTcon = cifs_sb->tcon; 404 pTcon = cifs_sb->tcon;
397 } else { 405 } else {
398 return -EIO; 406 return -EIO;
399 } 407 }
400 xid = GetXid(); 408 xid = GetXid();
401 if(pTcon) { 409 if (pTcon) {
402 cFYI(1,("pqstats %p",qstats)); 410 cFYI(1,("pqstats %p",qstats));
403 } else { 411 } else {
404 rc = -EIO; 412 rc = -EIO;
@@ -424,11 +432,11 @@ static void cifs_umount_begin(struct vfsmount * vfsmnt, int flags)
424 if (!(flags & MNT_FORCE)) 432 if (!(flags & MNT_FORCE))
425 return; 433 return;
426 cifs_sb = CIFS_SB(vfsmnt->mnt_sb); 434 cifs_sb = CIFS_SB(vfsmnt->mnt_sb);
427 if(cifs_sb == NULL) 435 if (cifs_sb == NULL)
428 return; 436 return;
429 437
430 tcon = cifs_sb->tcon; 438 tcon = cifs_sb->tcon;
431 if(tcon == NULL) 439 if (tcon == NULL)
432 return; 440 return;
433 down(&tcon->tconSem); 441 down(&tcon->tconSem);
434 if (atomic_read(&tcon->useCount) == 1) 442 if (atomic_read(&tcon->useCount) == 1)
@@ -437,7 +445,7 @@ static void cifs_umount_begin(struct vfsmount * vfsmnt, int flags)
437 445
438 /* cancel_brl_requests(tcon); */ /* BB mark all brl mids as exiting */ 446 /* cancel_brl_requests(tcon); */ /* BB mark all brl mids as exiting */
439 /* cancel_notify_requests(tcon); */ 447 /* cancel_notify_requests(tcon); */
440 if(tcon->ses && tcon->ses->server) 448 if (tcon->ses && tcon->ses->server)
441 { 449 {
442 cFYI(1,("wake up tasks now - umount begin not complete")); 450 cFYI(1,("wake up tasks now - umount begin not complete"));
443 wake_up_all(&tcon->ses->server->request_q); 451 wake_up_all(&tcon->ses->server->request_q);
@@ -529,8 +537,7 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int origin)
529 /* some applications poll for the file length in this strange 537 /* some applications poll for the file length in this strange
530 way so we must seek to end on non-oplocked files by 538 way so we must seek to end on non-oplocked files by
531 setting the revalidate time to zero */ 539 setting the revalidate time to zero */
532 if(file->f_path.dentry->d_inode) 540 CIFS_I(file->f_path.dentry->d_inode)->time = 0;
533 CIFS_I(file->f_path.dentry->d_inode)->time = 0;
534 541
535 retval = cifs_revalidate(file->f_path.dentry); 542 retval = cifs_revalidate(file->f_path.dentry);
536 if (retval < 0) 543 if (retval < 0)
@@ -694,8 +701,7 @@ cifs_init_once(void *inode, struct kmem_cache * cachep, unsigned long flags)
694{ 701{
695 struct cifsInodeInfo *cifsi = inode; 702 struct cifsInodeInfo *cifsi = inode;
696 703
697 if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) == 704 if (flags & SLAB_CTOR_CONSTRUCTOR) {
698 SLAB_CTOR_CONSTRUCTOR) {
699 inode_init_once(&cifsi->vfs_inode); 705 inode_init_once(&cifsi->vfs_inode);
700 INIT_LIST_HEAD(&cifsi->lockList); 706 INIT_LIST_HEAD(&cifsi->lockList);
701 } 707 }
@@ -724,7 +730,7 @@ cifs_destroy_inodecache(void)
724static int 730static int
725cifs_init_request_bufs(void) 731cifs_init_request_bufs(void)
726{ 732{
727 if(CIFSMaxBufSize < 8192) { 733 if (CIFSMaxBufSize < 8192) {
728 /* Buffer size can not be smaller than 2 * PATH_MAX since maximum 734 /* Buffer size can not be smaller than 2 * PATH_MAX since maximum
729 Unicode path name has to fit in any SMB/CIFS path based frames */ 735 Unicode path name has to fit in any SMB/CIFS path based frames */
730 CIFSMaxBufSize = 8192; 736 CIFSMaxBufSize = 8192;
@@ -741,7 +747,7 @@ cifs_init_request_bufs(void)
741 if (cifs_req_cachep == NULL) 747 if (cifs_req_cachep == NULL)
742 return -ENOMEM; 748 return -ENOMEM;
743 749
744 if(cifs_min_rcv < 1) 750 if (cifs_min_rcv < 1)
745 cifs_min_rcv = 1; 751 cifs_min_rcv = 1;
746 else if (cifs_min_rcv > 64) { 752 else if (cifs_min_rcv > 64) {
747 cifs_min_rcv = 64; 753 cifs_min_rcv = 64;
@@ -751,7 +757,7 @@ cifs_init_request_bufs(void)
751 cifs_req_poolp = mempool_create_slab_pool(cifs_min_rcv, 757 cifs_req_poolp = mempool_create_slab_pool(cifs_min_rcv,
752 cifs_req_cachep); 758 cifs_req_cachep);
753 759
754 if(cifs_req_poolp == NULL) { 760 if (cifs_req_poolp == NULL) {
755 kmem_cache_destroy(cifs_req_cachep); 761 kmem_cache_destroy(cifs_req_cachep);
756 return -ENOMEM; 762 return -ENOMEM;
757 } 763 }
@@ -772,7 +778,7 @@ cifs_init_request_bufs(void)
772 return -ENOMEM; 778 return -ENOMEM;
773 } 779 }
774 780
775 if(cifs_min_small < 2) 781 if (cifs_min_small < 2)
776 cifs_min_small = 2; 782 cifs_min_small = 2;
777 else if (cifs_min_small > 256) { 783 else if (cifs_min_small > 256) {
778 cifs_min_small = 256; 784 cifs_min_small = 256;
@@ -782,7 +788,7 @@ cifs_init_request_bufs(void)
782 cifs_sm_req_poolp = mempool_create_slab_pool(cifs_min_small, 788 cifs_sm_req_poolp = mempool_create_slab_pool(cifs_min_small,
783 cifs_sm_req_cachep); 789 cifs_sm_req_cachep);
784 790
785 if(cifs_sm_req_poolp == NULL) { 791 if (cifs_sm_req_poolp == NULL) {
786 mempool_destroy(cifs_req_poolp); 792 mempool_destroy(cifs_req_poolp);
787 kmem_cache_destroy(cifs_req_cachep); 793 kmem_cache_destroy(cifs_req_cachep);
788 kmem_cache_destroy(cifs_sm_req_cachep); 794 kmem_cache_destroy(cifs_sm_req_cachep);
@@ -812,7 +818,7 @@ cifs_init_mids(void)
812 818
813 /* 3 is a reasonable minimum number of simultaneous operations */ 819 /* 3 is a reasonable minimum number of simultaneous operations */
814 cifs_mid_poolp = mempool_create_slab_pool(3, cifs_mid_cachep); 820 cifs_mid_poolp = mempool_create_slab_pool(3, cifs_mid_cachep);
815 if(cifs_mid_poolp == NULL) { 821 if (cifs_mid_poolp == NULL) {
816 kmem_cache_destroy(cifs_mid_cachep); 822 kmem_cache_destroy(cifs_mid_cachep);
817 return -ENOMEM; 823 return -ENOMEM;
818 } 824 }
@@ -850,14 +856,14 @@ static int cifs_oplock_thread(void * dummyarg)
850 continue; 856 continue;
851 857
852 spin_lock(&GlobalMid_Lock); 858 spin_lock(&GlobalMid_Lock);
853 if(list_empty(&GlobalOplock_Q)) { 859 if (list_empty(&GlobalOplock_Q)) {
854 spin_unlock(&GlobalMid_Lock); 860 spin_unlock(&GlobalMid_Lock);
855 set_current_state(TASK_INTERRUPTIBLE); 861 set_current_state(TASK_INTERRUPTIBLE);
856 schedule_timeout(39*HZ); 862 schedule_timeout(39*HZ);
857 } else { 863 } else {
858 oplock_item = list_entry(GlobalOplock_Q.next, 864 oplock_item = list_entry(GlobalOplock_Q.next,
859 struct oplock_q_entry, qhead); 865 struct oplock_q_entry, qhead);
860 if(oplock_item) { 866 if (oplock_item) {
861 cFYI(1,("found oplock item to write out")); 867 cFYI(1,("found oplock item to write out"));
862 pTcon = oplock_item->tcon; 868 pTcon = oplock_item->tcon;
863 inode = oplock_item->pinode; 869 inode = oplock_item->pinode;
@@ -871,7 +877,7 @@ static int cifs_oplock_thread(void * dummyarg)
871 /* mutex_lock(&inode->i_mutex);*/ 877 /* mutex_lock(&inode->i_mutex);*/
872 if (S_ISREG(inode->i_mode)) { 878 if (S_ISREG(inode->i_mode)) {
873 rc = filemap_fdatawrite(inode->i_mapping); 879 rc = filemap_fdatawrite(inode->i_mapping);
874 if(CIFS_I(inode)->clientCanCacheRead == 0) { 880 if (CIFS_I(inode)->clientCanCacheRead == 0) {
875 filemap_fdatawait(inode->i_mapping); 881 filemap_fdatawait(inode->i_mapping);
876 invalidate_remote_inode(inode); 882 invalidate_remote_inode(inode);
877 } 883 }
@@ -888,7 +894,7 @@ static int cifs_oplock_thread(void * dummyarg)
888 not bother sending an oplock release if session 894 not bother sending an oplock release if session
889 to server still is disconnected since oplock 895 to server still is disconnected since oplock
890 already released by the server in that case */ 896 already released by the server in that case */
891 if(pTcon->tidStatus != CifsNeedReconnect) { 897 if (pTcon->tidStatus != CifsNeedReconnect) {
892 rc = CIFSSMBLock(0, pTcon, netfid, 898 rc = CIFSSMBLock(0, pTcon, netfid,
893 0 /* len */ , 0 /* offset */, 0, 899 0 /* len */ , 0 /* offset */, 0,
894 0, LOCKING_ANDX_OPLOCK_RELEASE, 900 0, LOCKING_ANDX_OPLOCK_RELEASE,
@@ -922,7 +928,7 @@ static int cifs_dnotify_thread(void * dummyarg)
922 list_for_each(tmp, &GlobalSMBSessionList) { 928 list_for_each(tmp, &GlobalSMBSessionList) {
923 ses = list_entry(tmp, struct cifsSesInfo, 929 ses = list_entry(tmp, struct cifsSesInfo,
924 cifsSessionList); 930 cifsSessionList);
925 if(ses && ses->server && 931 if (ses && ses->server &&
926 atomic_read(&ses->server->inFlight)) 932 atomic_read(&ses->server->inFlight))
927 wake_up_all(&ses->server->response_q); 933 wake_up_all(&ses->server->response_q);
928 } 934 }
@@ -971,10 +977,10 @@ init_cifs(void)
971 rwlock_init(&GlobalSMBSeslock); 977 rwlock_init(&GlobalSMBSeslock);
972 spin_lock_init(&GlobalMid_Lock); 978 spin_lock_init(&GlobalMid_Lock);
973 979
974 if(cifs_max_pending < 2) { 980 if (cifs_max_pending < 2) {
975 cifs_max_pending = 2; 981 cifs_max_pending = 2;
976 cFYI(1,("cifs_max_pending set to min of 2")); 982 cFYI(1,("cifs_max_pending set to min of 2"));
977 } else if(cifs_max_pending > 256) { 983 } else if (cifs_max_pending > 256) {
978 cifs_max_pending = 256; 984 cifs_max_pending = 256;
979 cFYI(1,("cifs_max_pending set to max of 256")); 985 cFYI(1,("cifs_max_pending set to max of 256"));
980 } 986 }
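
The request-buffer setup in cifs_init_request_bufs() above pairs a slab cache with a mempool so a minimum number of receive buffers stays reserved even under memory pressure. A minimal sketch of that pattern, assuming an already-created kmem_cache (names here are illustrative, not the cifs symbols):

    #include <linux/mempool.h>
    #include <linux/slab.h>

    /* hypothetical helper mirroring cifs_init_request_bufs() */
    static mempool_t *make_req_pool(struct kmem_cache *cachep, int min_bufs)
    {
            /* clamp the tunable the same way cifs_min_rcv is clamped */
            if (min_bufs < 1)
                    min_bufs = 1;
            else if (min_bufs > 64)
                    min_bufs = 64;

            /* pre-reserves min_bufs objects from cachep so pool
               allocations can always make forward progress */
            return mempool_create_slab_pool(min_bufs, cachep);
    }

On failure the caller must destroy the backing cache, which is exactly the kmem_cache_destroy() unwind the code above performs.
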
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 2c2c384894d8..c235d32ad4a8 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -100,5 +100,5 @@ extern ssize_t cifs_getxattr(struct dentry *, const char *, void *, size_t);
100extern ssize_t cifs_listxattr(struct dentry *, char *, size_t); 100extern ssize_t cifs_listxattr(struct dentry *, char *, size_t);
101extern int cifs_ioctl (struct inode * inode, struct file * filep, 101extern int cifs_ioctl (struct inode * inode, struct file * filep,
102 unsigned int command, unsigned long arg); 102 unsigned int command, unsigned long arg);
103#define CIFS_VERSION "1.48" 103#define CIFS_VERSION "1.49"
104#endif /* _CIFSFS_H */ 104#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index e4de8eba4780..23655de2f4a4 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -311,7 +311,7 @@ struct cifsFileInfo {
311 /* lock scope id (0 if none) */ 311 /* lock scope id (0 if none) */
312 struct file * pfile; /* needed for writepage */ 312 struct file * pfile; /* needed for writepage */
313 struct inode * pInode; /* needed for oplock break */ 313 struct inode * pInode; /* needed for oplock break */
314 struct semaphore lock_sem; 314 struct mutex lock_mutex;
315 struct list_head llist; /* list of byte range locks we have. */ 315 struct list_head llist; /* list of byte range locks we have. */
316 unsigned closePend:1; /* file is marked to close */ 316 unsigned closePend:1; /* file is marked to close */
317 unsigned invalidHandle:1; /* file closed via session abend */ 317 unsigned invalidHandle:1; /* file closed via session abend */
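
The lock_sem semaphore becomes a struct mutex here; the dir.c and file.c hunks below convert the matching init_MUTEX()/down()/up() calls. A minimal sketch of the conversion pattern, using a hypothetical structure rather than the real cifsFileInfo:

    #include <linux/mutex.h>
    #include <linux/list.h>

    struct example_file {                   /* stand-in for cifsFileInfo */
            struct mutex lock_mutex;        /* was: struct semaphore lock_sem */
            struct list_head llist;         /* byte-range locks it protects */
    };

    static void example_init(struct example_file *f)
    {
            mutex_init(&f->lock_mutex);     /* was: init_MUTEX(&f->lock_sem) */
            INIT_LIST_HEAD(&f->llist);
    }

    static void example_walk_locks(struct example_file *f)
    {
            mutex_lock(&f->lock_mutex);     /* was: down(&f->lock_sem) */
            /* add, walk, or free f->llist entries here */
            mutex_unlock(&f->lock_mutex);   /* was: up(&f->lock_sem) */
    }

A mutex is the natural primitive since the semaphore was only ever used with a count of one, and the switch gains lock debugging under CONFIG_DEBUG_MUTEXES.
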
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index 4d8948e8762c..d619ca7d1416 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -1388,7 +1388,7 @@ struct smb_t2_rsp {
1388#define SMB_SET_POSIX_LOCK 0x208 1388#define SMB_SET_POSIX_LOCK 0x208
1389#define SMB_POSIX_OPEN 0x209 1389#define SMB_POSIX_OPEN 0x209
1390#define SMB_POSIX_UNLINK 0x20a 1390#define SMB_POSIX_UNLINK 0x20a
1391#define SMB_SET_FILE_UNIX_INFO2 1391#define SMB_SET_FILE_UNIX_INFO2 0x20b
1392#define SMB_SET_FILE_BASIC_INFO2 0x3ec 1392#define SMB_SET_FILE_BASIC_INFO2 0x3ec
1393#define SMB_SET_FILE_RENAME_INFORMATION 0x3f2 /* BB check if qpathinfo too */ 1393#define SMB_SET_FILE_RENAME_INFORMATION 0x3f2 /* BB check if qpathinfo too */
1394#define SMB_FILE_ALL_INFO2 0x3fa 1394#define SMB_FILE_ALL_INFO2 0x3fa
@@ -2109,22 +2109,40 @@ struct cifs_posix_acl { /* access conrol list (ACL) */
2109 2109
2110/* end of POSIX ACL definitions */ 2110/* end of POSIX ACL definitions */
2111 2111
2112/* POSIX Open Flags */
2113#define SMB_O_RDONLY 0x1
2114#define SMB_O_WRONLY 0x2
2115#define SMB_O_RDWR 0x4
2116#define SMB_O_CREAT 0x10
2117#define SMB_O_EXCL 0x20
2118#define SMB_O_TRUNC 0x40
2119#define SMB_O_APPEND 0x80
2120#define SMB_O_SYNC 0x100
2121#define SMB_O_DIRECTORY 0x200
2122#define SMB_O_NOFOLLOW 0x400
2123#define SMB_O_DIRECT 0x800
2124
2112typedef struct { 2125typedef struct {
2113 __u32 OpenFlags; /* same as NT CreateX */ 2126 __le32 OpenFlags; /* same as NT CreateX */
2114 __u32 PosixOpenFlags; 2127 __le32 PosixOpenFlags;
2115 __u32 Mode; 2128 __le64 Permissions;
2116 __u16 Level; /* reply level requested (see QPathInfo levels) */ 2129 __le16 Level; /* reply level requested (see QPathInfo levels) */
2117 __u16 Pad; /* reserved - MBZ */
2118} __attribute__((packed)) OPEN_PSX_REQ; /* level 0x209 SetPathInfo data */ 2130} __attribute__((packed)) OPEN_PSX_REQ; /* level 0x209 SetPathInfo data */
2119 2131
2120typedef struct { 2132typedef struct {
2121 /* reply varies based on requested level */ 2133 __le16 OplockFlags;
2134 __u16 Fid;
2135 __le32 CreateAction;
2136 __le16 ReturnedLevel;
2137 __le16 Pad;
2138 /* struct following varies based on requested level */
2122} __attribute__((packed)) OPEN_PSX_RSP; /* level 0x209 SetPathInfo data */ 2139} __attribute__((packed)) OPEN_PSX_RSP; /* level 0x209 SetPathInfo data */
2123 2140
2124 2141
2125struct file_internal_info { 2142struct file_internal_info {
2126 __u64 UniqueId; /* inode number */ 2143 __u64 UniqueId; /* inode number */
2127} __attribute__((packed)); /* level 0x3ee */ 2144} __attribute__((packed)); /* level 0x3ee */
2145
2128struct file_mode_info { 2146struct file_mode_info {
2129 __le32 Mode; 2147 __le32 Mode;
2130} __attribute__((packed)); /* level 0x3f8 */ 2148} __attribute__((packed)); /* level 0x3f8 */
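
Note the change from __u32/__u16 to __le32/__le16 in the wire structures above: the fields travel little-endian, so callers convert with cpu_to_le*()/le*_to_cpu(). A minimal sketch of filling the request, mirroring what CIFSPOSIXCreate() does later in this patch (the helper itself is hypothetical):

    /* assumes the OPEN_PSX_REQ layout and constants from cifspdu.h */
    static void fill_psx_open_req(OPEN_PSX_REQ *pdata, __u32 posix_flags,
                                  __u64 mode, __u32 oplock)
    {
            pdata->OpenFlags = cpu_to_le32(oplock);
            pdata->PosixOpenFlags = cpu_to_le32(posix_flags);
            pdata->Permissions = cpu_to_le64(mode);
            /* requested reply level; cpu_to_le16() is the sparse-clean
               form (the patch below assigns the constant directly) */
            pdata->Level = cpu_to_le16(SMB_QUERY_FILE_UNIX_BASIC);
    }
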
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 32eb1acab630..5d163e2b6143 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -1,7 +1,7 @@
1/* 1/*
2 * fs/cifs/cifsproto.h 2 * fs/cifs/cifsproto.h
3 * 3 *
4 * Copyright (c) International Business Machines Corp., 2002,2006 4 * Copyright (c) International Business Machines Corp., 2002,2007
5 * Author(s): Steve French (sfrench@us.ibm.com) 5 * Author(s): Steve French (sfrench@us.ibm.com)
6 * 6 *
7 * This library is free software; you can redistribute it and/or modify 7 * This library is free software; you can redistribute it and/or modify
@@ -244,6 +244,11 @@ extern int SMBLegacyOpen(const int xid, struct cifsTconInfo *tcon,
244 const int access_flags, const int omode, 244 const int access_flags, const int omode,
245 __u16 * netfid, int *pOplock, FILE_ALL_INFO *, 245 __u16 * netfid, int *pOplock, FILE_ALL_INFO *,
246 const struct nls_table *nls_codepage, int remap); 246 const struct nls_table *nls_codepage, int remap);
247extern int CIFSPOSIXCreate(const int xid, struct cifsTconInfo *tcon,
248 u32 posix_flags, __u64 mode, __u16 * netfid,
249 FILE_UNIX_BASIC_INFO *pRetData,
250 __u32 *pOplock, const char *name,
251 const struct nls_table *nls_codepage, int remap);
247extern int CIFSSMBClose(const int xid, struct cifsTconInfo *tcon, 252extern int CIFSSMBClose(const int xid, struct cifsTconInfo *tcon,
248 const int smb_file_id); 253 const int smb_file_id);
249 254
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 48fc0c2ab0e5..14de58fa1437 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * fs/cifs/cifssmb.c 2 * fs/cifs/cifssmb.c
3 * 3 *
4 * Copyright (C) International Business Machines Corp., 2002,2006 4 * Copyright (C) International Business Machines Corp., 2002,2007
5 * Author(s): Steve French (sfrench@us.ibm.com) 5 * Author(s): Steve French (sfrench@us.ibm.com)
6 * 6 *
7 * Contains the routines for constructing the SMB PDUs themselves 7 * Contains the routines for constructing the SMB PDUs themselves
@@ -24,8 +24,8 @@
24 /* SMB/CIFS PDU handling routines here - except for leftovers in connect.c */ 24 /* SMB/CIFS PDU handling routines here - except for leftovers in connect.c */
25 /* These are mostly routines that operate on a pathname, or on a tree id */ 25 /* These are mostly routines that operate on a pathname, or on a tree id */
26 /* (mounted volume), but there are eight handle based routines which must be */ 26 /* (mounted volume), but there are eight handle based routines which must be */
27 /* treated slightly different for reconnection purposes since we never want */ 27 /* treated slightly differently for reconnection purposes since we never */
28 /* to reuse a stale file handle and the caller knows the file handle */ 28 /* want to reuse a stale file handle and only the caller knows the file info */
29 29
30#include <linux/fs.h> 30#include <linux/fs.h>
31#include <linux/kernel.h> 31#include <linux/kernel.h>
@@ -913,6 +913,130 @@ MkDirRetry:
913 return rc; 913 return rc;
914} 914}
915 915
916int
917CIFSPOSIXCreate(const int xid, struct cifsTconInfo *tcon, __u32 posix_flags,
918 __u64 mode, __u16 * netfid, FILE_UNIX_BASIC_INFO *pRetData,
919 __u32 *pOplock, const char *name,
920 const struct nls_table *nls_codepage, int remap)
921{
922 TRANSACTION2_SPI_REQ *pSMB = NULL;
923 TRANSACTION2_SPI_RSP *pSMBr = NULL;
924 int name_len;
925 int rc = 0;
926 int bytes_returned = 0;
927 char *data_offset;
928 __u16 params, param_offset, offset, byte_count, count;
929 OPEN_PSX_REQ * pdata;
930 OPEN_PSX_RSP * psx_rsp;
931
932 cFYI(1, ("In POSIX Create"));
933PsxCreat:
934 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
935 (void **) &pSMBr);
936 if (rc)
937 return rc;
938
939 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
940 name_len =
941 cifsConvertToUCS((__le16 *) pSMB->FileName, name,
942 PATH_MAX, nls_codepage, remap);
943 name_len++; /* trailing null */
944 name_len *= 2;
945 } else { /* BB improve the check for buffer overruns BB */
946 name_len = strnlen(name, PATH_MAX);
947 name_len++; /* trailing null */
948 strncpy(pSMB->FileName, name, name_len);
949 }
950
951 params = 6 + name_len;
952 count = sizeof(OPEN_PSX_REQ);
953 pSMB->MaxParameterCount = cpu_to_le16(2);
954 pSMB->MaxDataCount = cpu_to_le16(1000); /* large enough */
955 pSMB->MaxSetupCount = 0;
956 pSMB->Reserved = 0;
957 pSMB->Flags = 0;
958 pSMB->Timeout = 0;
959 pSMB->Reserved2 = 0;
960 param_offset = offsetof(struct smb_com_transaction2_spi_req,
961 InformationLevel) - 4;
962 offset = param_offset + params;
963 data_offset = (char *) (&pSMB->hdr.Protocol) + offset;
964 pdata = (OPEN_PSX_REQ *)(((char *)&pSMB->hdr.Protocol) + offset);
965 pdata->Level = SMB_QUERY_FILE_UNIX_BASIC;
966 pdata->Permissions = cpu_to_le64(mode);
967 pdata->PosixOpenFlags = cpu_to_le32(posix_flags);
968 pdata->OpenFlags = cpu_to_le32(*pOplock);
969 pSMB->ParameterOffset = cpu_to_le16(param_offset);
970 pSMB->DataOffset = cpu_to_le16(offset);
971 pSMB->SetupCount = 1;
972 pSMB->Reserved3 = 0;
973 pSMB->SubCommand = cpu_to_le16(TRANS2_SET_PATH_INFORMATION);
974 byte_count = 3 /* pad */ + params + count;
975
976 pSMB->DataCount = cpu_to_le16(count);
977 pSMB->ParameterCount = cpu_to_le16(params);
978 pSMB->TotalDataCount = pSMB->DataCount;
979 pSMB->TotalParameterCount = pSMB->ParameterCount;
980 pSMB->InformationLevel = cpu_to_le16(SMB_POSIX_OPEN);
981 pSMB->Reserved4 = 0;
982 pSMB->hdr.smb_buf_length += byte_count;
983 pSMB->ByteCount = cpu_to_le16(byte_count);
984 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
985 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
986 if (rc) {
987 cFYI(1, ("Posix create returned %d", rc));
988 goto psx_create_err;
989 }
990
991 cFYI(1,("copying inode info"));
992 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
993
994 if (rc || (pSMBr->ByteCount < sizeof(OPEN_PSX_RSP))) {
995 rc = -EIO; /* bad smb */
996 goto psx_create_err;
997 }
998
999 /* copy return information to pRetData */
1000 psx_rsp = (OPEN_PSX_RSP *)((char *) &pSMBr->hdr.Protocol
1001 + le16_to_cpu(pSMBr->t2.DataOffset));
1002
1003 *pOplock = le16_to_cpu(psx_rsp->OplockFlags);
1004 if(netfid)
1005 *netfid = psx_rsp->Fid; /* cifs fid stays in le */
1006 /* Let caller know file was created so we can set the mode. */
1007 /* Do we care about the CreateAction in any other cases? */
1008 if(cpu_to_le32(FILE_CREATE) == psx_rsp->CreateAction)
1009 *pOplock |= CIFS_CREATE_ACTION;
1010 /* check to make sure response data is there */
1011 if(psx_rsp->ReturnedLevel != SMB_QUERY_FILE_UNIX_BASIC) {
1012 pRetData->Type = -1; /* unknown */
1013#ifdef CONFIG_CIFS_DEBUG2
1014 cFYI(1,("unknown type"));
1015#endif
1016 } else {
1017 if(pSMBr->ByteCount < sizeof(OPEN_PSX_RSP)
1018 + sizeof(FILE_UNIX_BASIC_INFO)) {
1019 cERROR(1,("Open response data too small"));
1020 pRetData->Type = -1;
1021 goto psx_create_err;
1022 }
1023 memcpy((char *) pRetData,
1024 (char *)psx_rsp + sizeof(OPEN_PSX_RSP),
1025 sizeof (FILE_UNIX_BASIC_INFO));
1026 }
1027
1028
1029psx_create_err:
1030 cifs_buf_release(pSMB);
1031
1032 cifs_stats_inc(&tcon->num_mkdirs);
1033
1034 if (rc == -EAGAIN)
1035 goto PsxCreat;
1036
1037 return rc;
1038}
1039
916static __u16 convert_disposition(int disposition) 1040static __u16 convert_disposition(int disposition)
917{ 1041{
918 __u16 ofun = 0; 1042 __u16 ofun = 0;
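
CIFSPOSIXCreate() takes the SMB_O_* bits defined in cifspdu.h rather than the Linux O_* values, so a caller has to translate the open flags first. A plausible mapping helper, assuming the flag definitions added above (this exact function is not part of the patch):

    #include <linux/fcntl.h>

    /* hypothetical translation of Linux f_flags to SMB_O_* bits */
    static __u32 map_posix_open_flags(int f_flags)
    {
            __u32 psx = 0;

            if ((f_flags & O_ACCMODE) == O_RDONLY)
                    psx |= SMB_O_RDONLY;
            else if ((f_flags & O_ACCMODE) == O_WRONLY)
                    psx |= SMB_O_WRONLY;
            else
                    psx |= SMB_O_RDWR;

            if (f_flags & O_CREAT)
                    psx |= SMB_O_CREAT;
            if (f_flags & O_EXCL)
                    psx |= SMB_O_EXCL;
            if (f_flags & O_TRUNC)
                    psx |= SMB_O_TRUNC;
            if (f_flags & O_APPEND)
                    psx |= SMB_O_APPEND;
            if (f_flags & O_SYNC)
                    psx |= SMB_O_SYNC;
            if (f_flags & O_DIRECTORY)
                    psx |= SMB_O_DIRECTORY;
            if (f_flags & O_NOFOLLOW)
                    psx |= SMB_O_NOFOLLOW;
            if (f_flags & O_DIRECT)
                    psx |= SMB_O_DIRECT;

            return psx;
    }

The result would then be passed as the posix_flags argument, with *pOplock seeded to the desired oplock request before the call.
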
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 20ba7dcc9959..216fb625843f 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -30,6 +30,7 @@
30#include <linux/mempool.h> 30#include <linux/mempool.h>
31#include <linux/delay.h> 31#include <linux/delay.h>
32#include <linux/completion.h> 32#include <linux/completion.h>
33#include <linux/kthread.h>
33#include <linux/pagevec.h> 34#include <linux/pagevec.h>
34#include <linux/freezer.h> 35#include <linux/freezer.h>
35#include <asm/uaccess.h> 36#include <asm/uaccess.h>
@@ -74,6 +75,8 @@ struct smb_vol {
74 unsigned retry:1; 75 unsigned retry:1;
75 unsigned intr:1; 76 unsigned intr:1;
76 unsigned setuids:1; 77 unsigned setuids:1;
78 unsigned override_uid:1;
79 unsigned override_gid:1;
77 unsigned noperm:1; 80 unsigned noperm:1;
78 unsigned no_psx_acl:1; /* set if posix acl support should be disabled */ 81 unsigned no_psx_acl:1; /* set if posix acl support should be disabled */
79 unsigned cifs_acl:1; 82 unsigned cifs_acl:1;
@@ -120,7 +123,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
120 struct mid_q_entry * mid_entry; 123 struct mid_q_entry * mid_entry;
121 124
122 spin_lock(&GlobalMid_Lock); 125 spin_lock(&GlobalMid_Lock);
123 if(server->tcpStatus == CifsExiting) { 126 if( kthread_should_stop() ) {
124 /* the demux thread will exit normally 127 /* the demux thread will exit normally
125 next time through the loop */ 128 next time through the loop */
126 spin_unlock(&GlobalMid_Lock); 129 spin_unlock(&GlobalMid_Lock);
@@ -182,7 +185,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
182 spin_unlock(&GlobalMid_Lock); 185 spin_unlock(&GlobalMid_Lock);
183 up(&server->tcpSem); 186 up(&server->tcpSem);
184 187
185 while ((server->tcpStatus != CifsExiting) && (server->tcpStatus != CifsGood)) 188 while ( (!kthread_should_stop()) && (server->tcpStatus != CifsGood))
186 { 189 {
187 try_to_freeze(); 190 try_to_freeze();
188 if(server->protocolType == IPV6) { 191 if(server->protocolType == IPV6) {
@@ -199,7 +202,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
199 } else { 202 } else {
200 atomic_inc(&tcpSesReconnectCount); 203 atomic_inc(&tcpSesReconnectCount);
201 spin_lock(&GlobalMid_Lock); 204 spin_lock(&GlobalMid_Lock);
202 if(server->tcpStatus != CifsExiting) 205 if( !kthread_should_stop() )
203 server->tcpStatus = CifsGood; 206 server->tcpStatus = CifsGood;
204 server->sequence_number = 0; 207 server->sequence_number = 0;
205 spin_unlock(&GlobalMid_Lock); 208 spin_unlock(&GlobalMid_Lock);
@@ -345,7 +348,6 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server)
345 int isMultiRsp; 348 int isMultiRsp;
346 int reconnect; 349 int reconnect;
347 350
348 daemonize("cifsd");
349 allow_signal(SIGKILL); 351 allow_signal(SIGKILL);
350 current->flags |= PF_MEMALLOC; 352 current->flags |= PF_MEMALLOC;
351 server->tsk = current; /* save process info to wake at shutdown */ 353 server->tsk = current; /* save process info to wake at shutdown */
@@ -361,7 +363,7 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server)
361 GFP_KERNEL); 363 GFP_KERNEL);
362 } 364 }
363 365
364 while (server->tcpStatus != CifsExiting) { 366 while (!kthread_should_stop()) {
365 if (try_to_freeze()) 367 if (try_to_freeze())
366 continue; 368 continue;
367 if (bigbuf == NULL) { 369 if (bigbuf == NULL) {
@@ -400,7 +402,7 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server)
400 kernel_recvmsg(csocket, &smb_msg, 402 kernel_recvmsg(csocket, &smb_msg,
401 &iov, 1, 4, 0 /* BB see socket.h flags */); 403 &iov, 1, 4, 0 /* BB see socket.h flags */);
402 404
403 if (server->tcpStatus == CifsExiting) { 405 if ( kthread_should_stop() ) {
404 break; 406 break;
405 } else if (server->tcpStatus == CifsNeedReconnect) { 407 } else if (server->tcpStatus == CifsNeedReconnect) {
406 cFYI(1, ("Reconnect after server stopped responding")); 408 cFYI(1, ("Reconnect after server stopped responding"));
@@ -524,7 +526,7 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server)
524 total_read += length) { 526 total_read += length) {
525 length = kernel_recvmsg(csocket, &smb_msg, &iov, 1, 527 length = kernel_recvmsg(csocket, &smb_msg, &iov, 1,
526 pdu_length - total_read, 0); 528 pdu_length - total_read, 0);
527 if((server->tcpStatus == CifsExiting) || 529 if( kthread_should_stop() ||
528 (length == -EINTR)) { 530 (length == -EINTR)) {
529 /* then will exit */ 531 /* then will exit */
530 reconnect = 2; 532 reconnect = 2;
@@ -757,7 +759,6 @@ multi_t2_fnd:
757 GFP_KERNEL); 759 GFP_KERNEL);
758 } 760 }
759 761
760 complete_and_exit(&cifsd_complete, 0);
761 return 0; 762 return 0;
762} 763}
763 764
@@ -973,7 +974,7 @@ cifs_parse_mount_options(char *options, const char *devname,struct smb_vol *vol)
973 } 974 }
974 if ((temp_len = strnlen(value, 300)) < 300) { 975 if ((temp_len = strnlen(value, 300)) < 300) {
975 vol->UNC = kmalloc(temp_len+1,GFP_KERNEL); 976 vol->UNC = kmalloc(temp_len+1,GFP_KERNEL);
976 if(vol->UNC == NULL) 977 if (vol->UNC == NULL)
977 return 1; 978 return 1;
978 strcpy(vol->UNC,value); 979 strcpy(vol->UNC,value);
979 if (strncmp(vol->UNC, "//", 2) == 0) { 980 if (strncmp(vol->UNC, "//", 2) == 0) {
@@ -1010,12 +1011,12 @@ cifs_parse_mount_options(char *options, const char *devname,struct smb_vol *vol)
1010 return 1; /* needs_arg; */ 1011 return 1; /* needs_arg; */
1011 } 1012 }
1012 if ((temp_len = strnlen(value, 1024)) < 1024) { 1013 if ((temp_len = strnlen(value, 1024)) < 1024) {
1013 if(value[0] != '/') 1014 if (value[0] != '/')
1014 temp_len++; /* missing leading slash */ 1015 temp_len++; /* missing leading slash */
1015 vol->prepath = kmalloc(temp_len+1,GFP_KERNEL); 1016 vol->prepath = kmalloc(temp_len+1,GFP_KERNEL);
1016 if(vol->prepath == NULL) 1017 if (vol->prepath == NULL)
1017 return 1; 1018 return 1;
1018 if(value[0] != '/') { 1019 if (value[0] != '/') {
1019 vol->prepath[0] = '/'; 1020 vol->prepath[0] = '/';
1020 strcpy(vol->prepath+1,value); 1021 strcpy(vol->prepath+1,value);
1021 } else 1022 } else
@@ -1031,7 +1032,7 @@ cifs_parse_mount_options(char *options, const char *devname,struct smb_vol *vol)
1031 return 1; /* needs_arg; */ 1032 return 1; /* needs_arg; */
1032 } 1033 }
1033 if (strnlen(value, 65) < 65) { 1034 if (strnlen(value, 65) < 65) {
1034 if(strnicmp(value,"default",7)) 1035 if (strnicmp(value,"default",7))
1035 vol->iocharset = value; 1036 vol->iocharset = value;
1036 /* if iocharset not set load_nls_default used by caller */ 1037 /* if iocharset not set load_nls_default used by caller */
1037 cFYI(1, ("iocharset set to %s",value)); 1038 cFYI(1, ("iocharset set to %s",value));
@@ -1043,11 +1044,13 @@ cifs_parse_mount_options(char *options, const char *devname,struct smb_vol *vol)
1043 if (value && *value) { 1044 if (value && *value) {
1044 vol->linux_uid = 1045 vol->linux_uid =
1045 simple_strtoul(value, &value, 0); 1046 simple_strtoul(value, &value, 0);
1047 vol->override_uid = 1;
1046 } 1048 }
1047 } else if (strnicmp(data, "gid", 3) == 0) { 1049 } else if (strnicmp(data, "gid", 3) == 0) {
1048 if (value && *value) { 1050 if (value && *value) {
1049 vol->linux_gid = 1051 vol->linux_gid =
1050 simple_strtoul(value, &value, 0); 1052 simple_strtoul(value, &value, 0);
1053 vol->override_gid = 1;
1051 } 1054 }
1052 } else if (strnicmp(data, "file_mode", 4) == 0) { 1055 } else if (strnicmp(data, "file_mode", 4) == 0) {
1053 if (value && *value) { 1056 if (value && *value) {
@@ -1102,7 +1105,7 @@ cifs_parse_mount_options(char *options, const char *devname,struct smb_vol *vol)
1102 } 1105 }
1103 /* The string has 16th byte zero still from 1106 /* The string has 16th byte zero still from
1104 set at top of the function */ 1107 set at top of the function */
1105 if((i==15) && (value[i] != 0)) 1108 if ((i==15) && (value[i] != 0))
1106 printk(KERN_WARNING "CIFS: netbiosname longer than 15 truncated.\n"); 1109 printk(KERN_WARNING "CIFS: netbiosname longer than 15 truncated.\n");
1107 } 1110 }
1108 } else if (strnicmp(data, "servern", 7) == 0) { 1111 } else if (strnicmp(data, "servern", 7) == 0) {
@@ -1126,7 +1129,7 @@ cifs_parse_mount_options(char *options, const char *devname,struct smb_vol *vol)
1126 } 1129 }
1127 /* The string has 16th byte zero still from 1130 /* The string has 16th byte zero still from
1128 set at top of the function */ 1131 set at top of the function */
1129 if((i==15) && (value[i] != 0)) 1132 if ((i==15) && (value[i] != 0))
1130 printk(KERN_WARNING "CIFS: server netbiosname longer than 15 truncated.\n"); 1133 printk(KERN_WARNING "CIFS: server netbiosname longer than 15 truncated.\n");
1131 } 1134 }
1132 } else if (strnicmp(data, "credentials", 4) == 0) { 1135 } else if (strnicmp(data, "credentials", 4) == 0) {
@@ -1233,13 +1236,13 @@ cifs_parse_mount_options(char *options, const char *devname,struct smb_vol *vol)
1233 printk(KERN_WARNING "CIFS: Unknown mount option %s\n",data); 1236 printk(KERN_WARNING "CIFS: Unknown mount option %s\n",data);
1234 } 1237 }
1235 if (vol->UNC == NULL) { 1238 if (vol->UNC == NULL) {
1236 if(devname == NULL) { 1239 if (devname == NULL) {
1237 printk(KERN_WARNING "CIFS: Missing UNC name for mount target\n"); 1240 printk(KERN_WARNING "CIFS: Missing UNC name for mount target\n");
1238 return 1; 1241 return 1;
1239 } 1242 }
1240 if ((temp_len = strnlen(devname, 300)) < 300) { 1243 if ((temp_len = strnlen(devname, 300)) < 300) {
1241 vol->UNC = kmalloc(temp_len+1,GFP_KERNEL); 1244 vol->UNC = kmalloc(temp_len+1,GFP_KERNEL);
1242 if(vol->UNC == NULL) 1245 if (vol->UNC == NULL)
1243 return 1; 1246 return 1;
1244 strcpy(vol->UNC,devname); 1247 strcpy(vol->UNC,devname);
1245 if (strncmp(vol->UNC, "//", 2) == 0) { 1248 if (strncmp(vol->UNC, "//", 2) == 0) {
@@ -1663,7 +1666,13 @@ void reset_cifs_unix_caps(int xid, struct cifsTconInfo * tcon,
1663 CIFS_SB(sb)->mnt_cifs_flags |= 1666 CIFS_SB(sb)->mnt_cifs_flags |=
1664 CIFS_MOUNT_POSIX_PATHS; 1667 CIFS_MOUNT_POSIX_PATHS;
1665 } 1668 }
1666 1669
1670 /* We might be setting the path sep back to a different
1671 form if we are reconnecting and the server switched its
1672 posix path capability for this share */
1673 if(sb && (CIFS_SB(sb)->prepathlen > 0))
1674 CIFS_SB(sb)->prepath[0] = CIFS_DIR_SEP(CIFS_SB(sb));
1675
1667 cFYI(1,("Negotiate caps 0x%x",(int)cap)); 1676 cFYI(1,("Negotiate caps 0x%x",(int)cap));
1668#ifdef CONFIG_CIFS_DEBUG2 1677#ifdef CONFIG_CIFS_DEBUG2
1669 if(cap & CIFS_UNIX_FCNTL_CAP) 1678 if(cap & CIFS_UNIX_FCNTL_CAP)
@@ -1712,12 +1721,12 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
1712 return -EINVAL; 1721 return -EINVAL;
1713 } 1722 }
1714 1723
1715 if (volume_info.username) { 1724 if (volume_info.nullauth) {
1725 cFYI(1,("null user"));
1726 volume_info.username = NULL;
1727 } else if (volume_info.username) {
1716 /* BB fixme parse for domain name here */ 1728 /* BB fixme parse for domain name here */
1717 cFYI(1, ("Username: %s ", volume_info.username)); 1729 cFYI(1, ("Username: %s ", volume_info.username));
1718
1719 } else if (volume_info.nullauth) {
1720 cFYI(1,("null user"));
1721 } else { 1730 } else {
1722 cifserror("No username specified"); 1731 cifserror("No username specified");
1723 /* In userspace mount helper we can get user name from alternate 1732 /* In userspace mount helper we can get user name from alternate
@@ -1791,11 +1800,12 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
1791 existingCifsSes = cifs_find_tcp_session(&sin_server.sin_addr, 1800 existingCifsSes = cifs_find_tcp_session(&sin_server.sin_addr,
1792 NULL /* no ipv6 addr */, 1801 NULL /* no ipv6 addr */,
1793 volume_info.username, &srvTcp); 1802 volume_info.username, &srvTcp);
1794 else if(address_type == AF_INET6) 1803 else if(address_type == AF_INET6) {
1804 cFYI(1,("looking for ipv6 address"));
1795 existingCifsSes = cifs_find_tcp_session(NULL /* no ipv4 addr */, 1805 existingCifsSes = cifs_find_tcp_session(NULL /* no ipv4 addr */,
1796 &sin_server6.sin6_addr, 1806 &sin_server6.sin6_addr,
1797 volume_info.username, &srvTcp); 1807 volume_info.username, &srvTcp);
1798 else { 1808 } else {
1799 kfree(volume_info.UNC); 1809 kfree(volume_info.UNC);
1800 kfree(volume_info.password); 1810 kfree(volume_info.password);
1801 kfree(volume_info.prepath); 1811 kfree(volume_info.prepath);
@@ -1807,17 +1817,23 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
1807 if (srvTcp) { 1817 if (srvTcp) {
1808 cFYI(1, ("Existing tcp session with server found")); 1818 cFYI(1, ("Existing tcp session with server found"));
1809 } else { /* create socket */ 1819 } else { /* create socket */
1810 if(volume_info.port) 1820 if (volume_info.port)
1811 sin_server.sin_port = htons(volume_info.port); 1821 sin_server.sin_port = htons(volume_info.port);
1812 else 1822 else
1813 sin_server.sin_port = 0; 1823 sin_server.sin_port = 0;
1814 rc = ipv4_connect(&sin_server,&csocket, 1824 if (address_type == AF_INET6) {
1825 cFYI(1,("attempting ipv6 connect"));
1826 /* BB should we allow ipv6 on port 139? */
1827 /* other OS never observed in Wild doing 139 with v6 */
1828 rc = ipv6_connect(&sin_server6,&csocket);
1829 } else
1830 rc = ipv4_connect(&sin_server,&csocket,
1815 volume_info.source_rfc1001_name, 1831 volume_info.source_rfc1001_name,
1816 volume_info.target_rfc1001_name); 1832 volume_info.target_rfc1001_name);
1817 if (rc < 0) { 1833 if (rc < 0) {
1818 cERROR(1, 1834 cERROR(1,
1819 ("Error connecting to IPv4 socket. Aborting operation")); 1835 ("Error connecting to IPv4 socket. Aborting operation"));
1820 if(csocket != NULL) 1836 if (csocket != NULL)
1821 sock_release(csocket); 1837 sock_release(csocket);
1822 kfree(volume_info.UNC); 1838 kfree(volume_info.UNC);
1823 kfree(volume_info.password); 1839 kfree(volume_info.password);
@@ -1850,10 +1866,11 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
1850 so no need to spinlock this init of tcpStatus */ 1866 so no need to spinlock this init of tcpStatus */
1851 srvTcp->tcpStatus = CifsNew; 1867 srvTcp->tcpStatus = CifsNew;
1852 init_MUTEX(&srvTcp->tcpSem); 1868 init_MUTEX(&srvTcp->tcpSem);
1853 rc = (int)kernel_thread((void *)(void *)cifs_demultiplex_thread, srvTcp, 1869 srvTcp->tsk = kthread_run((void *)(void *)cifs_demultiplex_thread, srvTcp, "cifsd");
1854 CLONE_FS | CLONE_FILES | CLONE_VM); 1870 if ( IS_ERR(srvTcp->tsk) ) {
1855 if(rc < 0) { 1871 rc = PTR_ERR(srvTcp->tsk);
1856 rc = -ENOMEM; 1872 cERROR(1,("error %d create cifsd thread", rc));
1873 srvTcp->tsk = NULL;
1857 sock_release(csocket); 1874 sock_release(csocket);
1858 kfree(volume_info.UNC); 1875 kfree(volume_info.UNC);
1859 kfree(volume_info.password); 1876 kfree(volume_info.password);
@@ -1896,7 +1913,7 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
1896 int len = strlen(volume_info.domainname); 1913 int len = strlen(volume_info.domainname);
1897 pSesInfo->domainName = 1914 pSesInfo->domainName =
1898 kmalloc(len + 1, GFP_KERNEL); 1915 kmalloc(len + 1, GFP_KERNEL);
1899 if(pSesInfo->domainName) 1916 if (pSesInfo->domainName)
1900 strcpy(pSesInfo->domainName, 1917 strcpy(pSesInfo->domainName,
1901 volume_info.domainname); 1918 volume_info.domainname);
1902 } 1919 }
@@ -1906,7 +1923,7 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
1906 /* BB FIXME need to pass vol->secFlgs BB */ 1923 /* BB FIXME need to pass vol->secFlgs BB */
1907 rc = cifs_setup_session(xid,pSesInfo, cifs_sb->local_nls); 1924 rc = cifs_setup_session(xid,pSesInfo, cifs_sb->local_nls);
1908 up(&pSesInfo->sesSem); 1925 up(&pSesInfo->sesSem);
1909 if(!rc) 1926 if (!rc)
1910 atomic_inc(&srvTcp->socketUseCount); 1927 atomic_inc(&srvTcp->socketUseCount);
1911 } else 1928 } else
1912 kfree(volume_info.password); 1929 kfree(volume_info.password);
@@ -1914,7 +1931,7 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
1914 1931
1915 /* search for existing tcon to this server share */ 1932 /* search for existing tcon to this server share */
1916 if (!rc) { 1933 if (!rc) {
1917 if(volume_info.rsize > CIFSMaxBufSize) { 1934 if (volume_info.rsize > CIFSMaxBufSize) {
1918 cERROR(1,("rsize %d too large, using MaxBufSize", 1935 cERROR(1,("rsize %d too large, using MaxBufSize",
1919 volume_info.rsize)); 1936 volume_info.rsize));
1920 cifs_sb->rsize = CIFSMaxBufSize; 1937 cifs_sb->rsize = CIFSMaxBufSize;
@@ -1923,11 +1940,11 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
1923 else /* default */ 1940 else /* default */
1924 cifs_sb->rsize = CIFSMaxBufSize; 1941 cifs_sb->rsize = CIFSMaxBufSize;
1925 1942
1926 if(volume_info.wsize > PAGEVEC_SIZE * PAGE_CACHE_SIZE) { 1943 if (volume_info.wsize > PAGEVEC_SIZE * PAGE_CACHE_SIZE) {
1927 cERROR(1,("wsize %d too large using 4096 instead", 1944 cERROR(1,("wsize %d too large using 4096 instead",
1928 volume_info.wsize)); 1945 volume_info.wsize));
1929 cifs_sb->wsize = 4096; 1946 cifs_sb->wsize = 4096;
1930 } else if(volume_info.wsize) 1947 } else if (volume_info.wsize)
1931 cifs_sb->wsize = volume_info.wsize; 1948 cifs_sb->wsize = volume_info.wsize;
1932 else 1949 else
1933 cifs_sb->wsize = 1950 cifs_sb->wsize =
@@ -1940,14 +1957,14 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
1940 conjunction with 52K kvec constraint on arch with 4K 1957 conjunction with 52K kvec constraint on arch with 4K
1941 page size */ 1958 page size */
1942 1959
1943 if(cifs_sb->rsize < 2048) { 1960 if (cifs_sb->rsize < 2048) {
1944 cifs_sb->rsize = 2048; 1961 cifs_sb->rsize = 2048;
1945 /* Windows ME may prefer this */ 1962 /* Windows ME may prefer this */
1946 cFYI(1,("readsize set to minimum 2048")); 1963 cFYI(1,("readsize set to minimum 2048"));
1947 } 1964 }
1948 /* calculate prepath */ 1965 /* calculate prepath */
1949 cifs_sb->prepath = volume_info.prepath; 1966 cifs_sb->prepath = volume_info.prepath;
1950 if(cifs_sb->prepath) { 1967 if (cifs_sb->prepath) {
1951 cifs_sb->prepathlen = strlen(cifs_sb->prepath); 1968 cifs_sb->prepathlen = strlen(cifs_sb->prepath);
1952 cifs_sb->prepath[0] = CIFS_DIR_SEP(cifs_sb); 1969 cifs_sb->prepath[0] = CIFS_DIR_SEP(cifs_sb);
1953 volume_info.prepath = NULL; 1970 volume_info.prepath = NULL;
@@ -1960,24 +1977,27 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
1960 cFYI(1,("file mode: 0x%x dir mode: 0x%x", 1977 cFYI(1,("file mode: 0x%x dir mode: 0x%x",
1961 cifs_sb->mnt_file_mode,cifs_sb->mnt_dir_mode)); 1978 cifs_sb->mnt_file_mode,cifs_sb->mnt_dir_mode));
1962 1979
1963 if(volume_info.noperm) 1980 if (volume_info.noperm)
1964 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_PERM; 1981 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_PERM;
1965 if(volume_info.setuids) 1982 if (volume_info.setuids)
1966 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_SET_UID; 1983 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_SET_UID;
1967 if(volume_info.server_ino) 1984 if (volume_info.server_ino)
1968 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_SERVER_INUM; 1985 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_SERVER_INUM;
1969 if(volume_info.remap) 1986 if (volume_info.remap)
1970 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_MAP_SPECIAL_CHR; 1987 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_MAP_SPECIAL_CHR;
1971 if(volume_info.no_xattr) 1988 if (volume_info.no_xattr)
1972 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_XATTR; 1989 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_XATTR;
1973 if(volume_info.sfu_emul) 1990 if (volume_info.sfu_emul)
1974 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_UNX_EMUL; 1991 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_UNX_EMUL;
1975 if(volume_info.nobrl) 1992 if (volume_info.nobrl)
1976 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_BRL; 1993 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_BRL;
1977 if(volume_info.cifs_acl) 1994 if (volume_info.cifs_acl)
1978 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_CIFS_ACL; 1995 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_CIFS_ACL;
1979 1996 if (volume_info.override_uid)
1980 if(volume_info.direct_io) { 1997 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_OVERR_UID;
1998 if (volume_info.override_gid)
1999 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_OVERR_GID;
2000 if (volume_info.direct_io) {
1981 cFYI(1,("mounting share using direct i/o")); 2001 cFYI(1,("mounting share using direct i/o"));
1982 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DIRECT_IO; 2002 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DIRECT_IO;
1983 } 2003 }
@@ -2030,7 +2050,7 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
2030 } 2050 }
2031 } 2051 }
2032 } 2052 }
2033 if(pSesInfo) { 2053 if (pSesInfo) {
2034 if (pSesInfo->capabilities & CAP_LARGE_FILES) { 2054 if (pSesInfo->capabilities & CAP_LARGE_FILES) {
2035 sb->s_maxbytes = (u64) 1 << 63; 2055 sb->s_maxbytes = (u64) 1 << 63;
2036 } else 2056 } else
@@ -2044,13 +2064,13 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
2044 if (rc) { 2064 if (rc) {
2045 /* if session setup failed, use count is zero but 2065 /* if session setup failed, use count is zero but
2046 we still need to free cifsd thread */ 2066 we still need to free cifsd thread */
2047 if(atomic_read(&srvTcp->socketUseCount) == 0) { 2067 if (atomic_read(&srvTcp->socketUseCount) == 0) {
2048 spin_lock(&GlobalMid_Lock); 2068 spin_lock(&GlobalMid_Lock);
2049 srvTcp->tcpStatus = CifsExiting; 2069 srvTcp->tcpStatus = CifsExiting;
2050 spin_unlock(&GlobalMid_Lock); 2070 spin_unlock(&GlobalMid_Lock);
2051 if(srvTcp->tsk) { 2071 if (srvTcp->tsk) {
2052 send_sig(SIGKILL,srvTcp->tsk,1); 2072 send_sig(SIGKILL,srvTcp->tsk,1);
2053 wait_for_completion(&cifsd_complete); 2073 kthread_stop(srvTcp->tsk);
2054 } 2074 }
2055 } 2075 }
2056 /* If find_unc succeeded then rc == 0 so we can not end */ 2076 /* If find_unc succeeded then rc == 0 so we can not end */
@@ -2063,10 +2083,10 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
2063 int temp_rc; 2083 int temp_rc;
2064 temp_rc = CIFSSMBLogoff(xid, pSesInfo); 2084 temp_rc = CIFSSMBLogoff(xid, pSesInfo);
2065 /* if the socketUseCount is now zero */ 2085 /* if the socketUseCount is now zero */
2066 if((temp_rc == -ESHUTDOWN) && 2086 if ((temp_rc == -ESHUTDOWN) &&
2067 (pSesInfo->server->tsk)) { 2087 (pSesInfo->server) && (pSesInfo->server->tsk)) {
2068 send_sig(SIGKILL,pSesInfo->server->tsk,1); 2088 send_sig(SIGKILL,pSesInfo->server->tsk,1);
2069 wait_for_completion(&cifsd_complete); 2089 kthread_stop(pSesInfo->server->tsk);
2070 } 2090 }
2071 } else 2091 } else
2072 cFYI(1, ("No session or bad tcon")); 2092 cFYI(1, ("No session or bad tcon"));
@@ -2127,7 +2147,7 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses,
2127 __u16 count; 2147 __u16 count;
2128 2148
2129 cFYI(1, ("In sesssetup")); 2149 cFYI(1, ("In sesssetup"));
2130 if(ses == NULL) 2150 if (ses == NULL)
2131 return -EINVAL; 2151 return -EINVAL;
2132 user = ses->userName; 2152 user = ses->userName;
2133 domain = ses->domainName; 2153 domain = ses->domainName;
@@ -2182,7 +2202,7 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses,
2182 *bcc_ptr = 0; 2202 *bcc_ptr = 0;
2183 bcc_ptr++; 2203 bcc_ptr++;
2184 } 2204 }
2185 if(user == NULL) 2205 if (user == NULL)
2186 bytes_returned = 0; /* skip null user */ 2206 bytes_returned = 0; /* skip null user */
2187 else 2207 else
2188 bytes_returned = 2208 bytes_returned =
@@ -2216,7 +2236,7 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses,
2216 bcc_ptr += 2 * bytes_returned; 2236 bcc_ptr += 2 * bytes_returned;
2217 bcc_ptr += 2; 2237 bcc_ptr += 2;
2218 } else { 2238 } else {
2219 if(user != NULL) { 2239 if (user != NULL) {
2220 strncpy(bcc_ptr, user, 200); 2240 strncpy(bcc_ptr, user, 200);
2221 bcc_ptr += strnlen(user, 200); 2241 bcc_ptr += strnlen(user, 200);
2222 } 2242 }
@@ -3316,7 +3336,7 @@ cifs_umount(struct super_block *sb, struct cifs_sb_info *cifs_sb)
3316 cFYI(1,("Waking up socket by sending it signal")); 3336 cFYI(1,("Waking up socket by sending it signal"));
3317 if(cifsd_task) { 3337 if(cifsd_task) {
3318 send_sig(SIGKILL,cifsd_task,1); 3338 send_sig(SIGKILL,cifsd_task,1);
3319 wait_for_completion(&cifsd_complete); 3339 kthread_stop(cifsd_task);
3320 } 3340 }
3321 rc = 0; 3341 rc = 0;
3322 } /* else - we have an smb session 3342 } /* else - we have an smb session
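
The connect.c changes above swap the hand-rolled kernel_thread()/daemonize()/complete_and_exit() lifecycle for the kthread API: the demultiplex loop polls kthread_should_stop() where it used to test tcpStatus == CifsExiting, and teardown becomes kthread_stop() instead of signalling and waiting on cifsd_complete. A minimal sketch of the pattern, independent of the cifs specifics:

    #include <linux/kthread.h>
    #include <linux/err.h>

    static int demux_thread(void *data)
    {
            /* run until kthread_stop() asks us to exit; the real thread
               blocks in kernel_recvmsg(), so this is not a busy loop */
            while (!kthread_should_stop()) {
                    /* receive and dispatch one SMB response here */
            }
            return 0;       /* value handed back by kthread_stop() */
    }

    static struct task_struct *start_demux(void *server)
    {
            /* kthread_run() creates the thread and wakes it immediately */
            struct task_struct *tsk = kthread_run(demux_thread, server,
                                                  "cifsd");
            if (IS_ERR(tsk))
                    return NULL;    /* PTR_ERR(tsk) holds the errno */
            return tsk;
    }

    static void stop_demux(struct task_struct *tsk)
    {
            /* sets the stop flag, wakes the thread, waits for its exit */
            kthread_stop(tsk);
    }

One constraint the patch has to respect: kthread_stop() expects the thread to keep running until told to stop, which is why the CifsExiting exit conditions inside the loop all become kthread_should_stop() checks.
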
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 3fad638d26d3..e5210519ac4b 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -274,7 +274,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
274 pCifsFile->invalidHandle = FALSE; 274 pCifsFile->invalidHandle = FALSE;
275 pCifsFile->closePend = FALSE; 275 pCifsFile->closePend = FALSE;
276 init_MUTEX(&pCifsFile->fh_sem); 276 init_MUTEX(&pCifsFile->fh_sem);
277 init_MUTEX(&pCifsFile->lock_sem); 277 mutex_init(&pCifsFile->lock_mutex);
278 INIT_LIST_HEAD(&pCifsFile->llist); 278 INIT_LIST_HEAD(&pCifsFile->llist);
279 atomic_set(&pCifsFile->wrtPending,0); 279 atomic_set(&pCifsFile->wrtPending,0);
280 280
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 2d3275bedb55..b570530f97bf 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -48,7 +48,7 @@ static inline struct cifsFileInfo *cifs_init_private(
48 private_data->netfid = netfid; 48 private_data->netfid = netfid;
49 private_data->pid = current->tgid; 49 private_data->pid = current->tgid;
50 init_MUTEX(&private_data->fh_sem); 50 init_MUTEX(&private_data->fh_sem);
51 init_MUTEX(&private_data->lock_sem); 51 mutex_init(&private_data->lock_mutex);
52 INIT_LIST_HEAD(&private_data->llist); 52 INIT_LIST_HEAD(&private_data->llist);
53 private_data->pfile = file; /* needed for writepage */ 53 private_data->pfile = file; /* needed for writepage */
54 private_data->pInode = inode; 54 private_data->pInode = inode;
@@ -338,8 +338,7 @@ static int cifs_relock_file(struct cifsFileInfo *cifsFile)
338 return rc; 338 return rc;
339} 339}
340 340
341static int cifs_reopen_file(struct inode *inode, struct file *file, 341static int cifs_reopen_file(struct file *file, int can_flush)
342 int can_flush)
343{ 342{
344 int rc = -EACCES; 343 int rc = -EACCES;
345 int xid, oplock; 344 int xid, oplock;
@@ -347,13 +346,12 @@ static int cifs_reopen_file(struct inode *inode, struct file *file,
347 struct cifsTconInfo *pTcon; 346 struct cifsTconInfo *pTcon;
348 struct cifsFileInfo *pCifsFile; 347 struct cifsFileInfo *pCifsFile;
349 struct cifsInodeInfo *pCifsInode; 348 struct cifsInodeInfo *pCifsInode;
349 struct inode * inode;
350 char *full_path = NULL; 350 char *full_path = NULL;
351 int desiredAccess; 351 int desiredAccess;
352 int disposition = FILE_OPEN; 352 int disposition = FILE_OPEN;
353 __u16 netfid; 353 __u16 netfid;
354 354
355 if (inode == NULL)
356 return -EBADF;
357 if (file->private_data) { 355 if (file->private_data) {
358 pCifsFile = (struct cifsFileInfo *)file->private_data; 356 pCifsFile = (struct cifsFileInfo *)file->private_data;
359 } else 357 } else
@@ -368,25 +366,37 @@ static int cifs_reopen_file(struct inode *inode, struct file *file,
368 } 366 }
369 367
370 if (file->f_path.dentry == NULL) { 368 if (file->f_path.dentry == NULL) {
371 up(&pCifsFile->fh_sem); 369 cERROR(1, ("no valid name if dentry freed"));
372 cFYI(1, ("failed file reopen, no valid name if dentry freed")); 370 dump_stack();
373 FreeXid(xid); 371 rc = -EBADF;
374 return -EBADF; 372 goto reopen_error_exit;
375 } 373 }
374
375 inode = file->f_path.dentry->d_inode;
376 if(inode == NULL) {
377 cERROR(1, ("inode not valid"));
378 dump_stack();
379 rc = -EBADF;
380 goto reopen_error_exit;
381 }
382
376 cifs_sb = CIFS_SB(inode->i_sb); 383 cifs_sb = CIFS_SB(inode->i_sb);
377 pTcon = cifs_sb->tcon; 384 pTcon = cifs_sb->tcon;
385
378/* can not grab rename sem here because various ops, including 386/* can not grab rename sem here because various ops, including
379 those that already have the rename sem can end up causing writepage 387 those that already have the rename sem can end up causing writepage
380 to get called and if the server was down that means we end up here, 388 to get called and if the server was down that means we end up here,
381 and we can never tell if the caller already has the rename_sem */ 389 and we can never tell if the caller already has the rename_sem */
382 full_path = build_path_from_dentry(file->f_path.dentry); 390 full_path = build_path_from_dentry(file->f_path.dentry);
383 if (full_path == NULL) { 391 if (full_path == NULL) {
392 rc = -ENOMEM;
393reopen_error_exit:
384 up(&pCifsFile->fh_sem); 394 up(&pCifsFile->fh_sem);
385 FreeXid(xid); 395 FreeXid(xid);
386 return -ENOMEM; 396 return rc;
387 } 397 }
388 398
389 cFYI(1, (" inode = 0x%p file flags are 0x%x for %s", 399 cFYI(1, ("inode = 0x%p file flags 0x%x for %s",
390 inode, file->f_flags,full_path)); 400 inode, file->f_flags,full_path));
391 desiredAccess = cifs_convert_flags(file->f_flags); 401 desiredAccess = cifs_convert_flags(file->f_flags);
392 402
@@ -401,13 +411,6 @@ static int cifs_reopen_file(struct inode *inode, struct file *file,
401 and server version of file size can be stale. If we knew for sure 411 and server version of file size can be stale. If we knew for sure
402 that inode was not dirty locally we could do this */ 412 that inode was not dirty locally we could do this */
403 413
404/* buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
405 if (buf == 0) {
406 up(&pCifsFile->fh_sem);
407 kfree(full_path);
408 FreeXid(xid);
409 return -ENOMEM;
410 } */
411 rc = CIFSSMBOpen(xid, pTcon, full_path, disposition, desiredAccess, 414 rc = CIFSSMBOpen(xid, pTcon, full_path, disposition, desiredAccess,
412 CREATE_NOT_DIR, &netfid, &oplock, NULL, 415 CREATE_NOT_DIR, &netfid, &oplock, NULL,
413 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & 416 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
@@ -508,12 +511,12 @@ int cifs_close(struct inode *inode, struct file *file)
508 511
509 /* Delete any outstanding lock records. 512 /* Delete any outstanding lock records.
510 We'll lose them when the file is closed anyway. */ 513 We'll lose them when the file is closed anyway. */
511 down(&pSMBFile->lock_sem); 514 mutex_lock(&pSMBFile->lock_mutex);
512 list_for_each_entry_safe(li, tmp, &pSMBFile->llist, llist) { 515 list_for_each_entry_safe(li, tmp, &pSMBFile->llist, llist) {
513 list_del(&li->llist); 516 list_del(&li->llist);
514 kfree(li); 517 kfree(li);
515 } 518 }
516 up(&pSMBFile->lock_sem); 519 mutex_unlock(&pSMBFile->lock_mutex);
517 520
518 write_lock(&GlobalSMBSeslock); 521 write_lock(&GlobalSMBSeslock);
519 list_del(&pSMBFile->flist); 522 list_del(&pSMBFile->flist);
@@ -598,9 +601,9 @@ static int store_file_lock(struct cifsFileInfo *fid, __u64 len,
598 li->offset = offset; 601 li->offset = offset;
599 li->length = len; 602 li->length = len;
600 li->type = lockType; 603 li->type = lockType;
601 down(&fid->lock_sem); 604 mutex_lock(&fid->lock_mutex);
602 list_add(&li->llist, &fid->llist); 605 list_add(&li->llist, &fid->llist);
603 up(&fid->lock_sem); 606 mutex_unlock(&fid->lock_mutex);
604 return 0; 607 return 0;
605} 608}
606 609
@@ -757,7 +760,7 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
757 struct cifsLockInfo *li, *tmp; 760 struct cifsLockInfo *li, *tmp;
758 761
759 rc = 0; 762 rc = 0;
760 down(&fid->lock_sem); 763 mutex_lock(&fid->lock_mutex);
761 list_for_each_entry_safe(li, tmp, &fid->llist, llist) { 764 list_for_each_entry_safe(li, tmp, &fid->llist, llist) {
762 if (pfLock->fl_start <= li->offset && 765 if (pfLock->fl_start <= li->offset &&
763 length >= li->length) { 766 length >= li->length) {
@@ -771,7 +774,7 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
771 kfree(li); 774 kfree(li);
772 } 775 }
773 } 776 }
774 up(&fid->lock_sem); 777 mutex_unlock(&fid->lock_mutex);
775 } 778 }
776 } 779 }
777 780
@@ -792,12 +795,7 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
792 int xid, long_op; 795 int xid, long_op;
793 struct cifsFileInfo *open_file; 796 struct cifsFileInfo *open_file;
794 797
795 if (file->f_path.dentry == NULL)
796 return -EBADF;
797
798 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); 798 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
799 if (cifs_sb == NULL)
800 return -EBADF;
801 799
802 pTcon = cifs_sb->tcon; 800 pTcon = cifs_sb->tcon;
803 801
@@ -807,14 +805,9 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
807 805
808 if (file->private_data == NULL) 806 if (file->private_data == NULL)
809 return -EBADF; 807 return -EBADF;
810 else 808 open_file = (struct cifsFileInfo *) file->private_data;
811 open_file = (struct cifsFileInfo *) file->private_data;
812 809
813 xid = GetXid(); 810 xid = GetXid();
814 if (file->f_path.dentry->d_inode == NULL) {
815 FreeXid(xid);
816 return -EBADF;
817 }
818 811
819 if (*poffset > file->f_path.dentry->d_inode->i_size) 812 if (*poffset > file->f_path.dentry->d_inode->i_size)
820 long_op = 2; /* writes past end of file can take a long time */ 813 long_op = 2; /* writes past end of file can take a long time */
@@ -841,17 +834,11 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
841 return -EBADF; 834 return -EBADF;
842 } 835 }
843 if (open_file->invalidHandle) { 836 if (open_file->invalidHandle) {
844 if ((file->f_path.dentry == NULL) ||
845 (file->f_path.dentry->d_inode == NULL)) {
846 FreeXid(xid);
847 return total_written;
848 }
849 /* we could deadlock if we called 837 /* we could deadlock if we called
850 filemap_fdatawait from here so tell 838 filemap_fdatawait from here so tell
851 reopen_file not to flush data to server 839 reopen_file not to flush data to server
852 now */ 840 now */
853 rc = cifs_reopen_file(file->f_path.dentry->d_inode, 841 rc = cifs_reopen_file(file, FALSE);
854 file, FALSE);
855 if (rc != 0) 842 if (rc != 0)
856 break; 843 break;
857 } 844 }
@@ -908,12 +895,7 @@ static ssize_t cifs_write(struct file *file, const char *write_data,
908 int xid, long_op; 895 int xid, long_op;
909 struct cifsFileInfo *open_file; 896 struct cifsFileInfo *open_file;
910 897
911 if (file->f_path.dentry == NULL)
912 return -EBADF;
913
914 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); 898 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
915 if (cifs_sb == NULL)
916 return -EBADF;
917 899
918 pTcon = cifs_sb->tcon; 900 pTcon = cifs_sb->tcon;
919 901
@@ -922,14 +904,9 @@ static ssize_t cifs_write(struct file *file, const char *write_data,
922 904
923 if (file->private_data == NULL) 905 if (file->private_data == NULL)
924 return -EBADF; 906 return -EBADF;
925 else 907 open_file = (struct cifsFileInfo *)file->private_data;
926 open_file = (struct cifsFileInfo *)file->private_data;
927 908
928 xid = GetXid(); 909 xid = GetXid();
929 if (file->f_path.dentry->d_inode == NULL) {
930 FreeXid(xid);
931 return -EBADF;
932 }
933 910
934 if (*poffset > file->f_path.dentry->d_inode->i_size) 911 if (*poffset > file->f_path.dentry->d_inode->i_size)
935 long_op = 2; /* writes past end of file can take a long time */ 912 long_op = 2; /* writes past end of file can take a long time */
@@ -957,17 +934,11 @@ static ssize_t cifs_write(struct file *file, const char *write_data,
957 return -EBADF; 934 return -EBADF;
958 } 935 }
959 if (open_file->invalidHandle) { 936 if (open_file->invalidHandle) {
960 if ((file->f_path.dentry == NULL) ||
961 (file->f_path.dentry->d_inode == NULL)) {
962 FreeXid(xid);
963 return total_written;
964 }
965 /* we could deadlock if we called 937 /* we could deadlock if we called
966 filemap_fdatawait from here so tell 938 filemap_fdatawait from here so tell
967 reopen_file not to flush data to 939 reopen_file not to flush data to
968 server now */ 940 server now */
969 rc = cifs_reopen_file(file->f_path.dentry->d_inode, 941 rc = cifs_reopen_file(file, FALSE);
970 file, FALSE);
971 if (rc != 0) 942 if (rc != 0)
972 break; 943 break;
973 } 944 }
@@ -1056,8 +1027,7 @@ struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode)
1056 read_unlock(&GlobalSMBSeslock); 1027 read_unlock(&GlobalSMBSeslock);
1057 if((open_file->invalidHandle) && 1028 if((open_file->invalidHandle) &&
1058 (!open_file->closePend) /* BB fixme -since the second clause can not be true remove it BB */) { 1029 (!open_file->closePend) /* BB fixme -since the second clause can not be true remove it BB */) {
1059 rc = cifs_reopen_file(&cifs_inode->vfs_inode, 1030 rc = cifs_reopen_file(open_file->pfile, FALSE);
1060 open_file->pfile, FALSE);
1061 /* if it fails, try another handle - might be */ 1031 /* if it fails, try another handle - might be */
1062 /* dangerous to hold up writepages with retry */ 1032 /* dangerous to hold up writepages with retry */
1063 if(rc) { 1033 if(rc) {
@@ -1404,32 +1374,6 @@ static int cifs_commit_write(struct file *file, struct page *page,
1404 spin_lock(&inode->i_lock); 1374 spin_lock(&inode->i_lock);
1405 if (position > inode->i_size) { 1375 if (position > inode->i_size) {
1406 i_size_write(inode, position); 1376 i_size_write(inode, position);
1407 /* if (file->private_data == NULL) {
1408 rc = -EBADF;
1409 } else {
1410 open_file = (struct cifsFileInfo *)file->private_data;
1411 cifs_sb = CIFS_SB(inode->i_sb);
1412 rc = -EAGAIN;
1413 while (rc == -EAGAIN) {
1414 if ((open_file->invalidHandle) &&
1415 (!open_file->closePend)) {
1416 rc = cifs_reopen_file(
1417 file->f_path.dentry->d_inode, file);
1418 if (rc != 0)
1419 break;
1420 }
1421 if (!open_file->closePend) {
1422 rc = CIFSSMBSetFileSize(xid,
1423 cifs_sb->tcon, position,
1424 open_file->netfid,
1425 open_file->pid, FALSE);
1426 } else {
1427 rc = -EBADF;
1428 break;
1429 }
1430 }
1431 cFYI(1, (" SetEOF (commit write) rc = %d", rc));
1432 } */
1433 } 1377 }
1434 spin_unlock(&inode->i_lock); 1378 spin_unlock(&inode->i_lock);
1435 if (!PageUptodate(page)) { 1379 if (!PageUptodate(page)) {
@@ -1573,8 +1517,7 @@ ssize_t cifs_user_read(struct file *file, char __user *read_data,
1573 int buf_type = CIFS_NO_BUFFER; 1517 int buf_type = CIFS_NO_BUFFER;
1574 if ((open_file->invalidHandle) && 1518 if ((open_file->invalidHandle) &&
1575 (!open_file->closePend)) { 1519 (!open_file->closePend)) {
1576 rc = cifs_reopen_file(file->f_path.dentry->d_inode, 1520 rc = cifs_reopen_file(file, TRUE);
1577 file, TRUE);
1578 if (rc != 0) 1521 if (rc != 0)
1579 break; 1522 break;
1580 } 1523 }
@@ -1660,8 +1603,7 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
1660 while (rc == -EAGAIN) { 1603 while (rc == -EAGAIN) {
1661 if ((open_file->invalidHandle) && 1604 if ((open_file->invalidHandle) &&
1662 (!open_file->closePend)) { 1605 (!open_file->closePend)) {
1663 rc = cifs_reopen_file(file->f_path.dentry->d_inode, 1606 rc = cifs_reopen_file(file, TRUE);
1664 file, TRUE);
1665 if (rc != 0) 1607 if (rc != 0)
1666 break; 1608 break;
1667 } 1609 }
@@ -1817,8 +1759,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
1817 while (rc == -EAGAIN) { 1759 while (rc == -EAGAIN) {
1818 if ((open_file->invalidHandle) && 1760 if ((open_file->invalidHandle) &&
1819 (!open_file->closePend)) { 1761 (!open_file->closePend)) {
1820 rc = cifs_reopen_file(file->f_path.dentry->d_inode, 1762 rc = cifs_reopen_file(file, TRUE);
1821 file, TRUE);
1822 if (rc != 0) 1763 if (rc != 0)
1823 break; 1764 break;
1824 } 1765 }
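
cifs_reopen_file() above loses its inode parameter, deriving the inode from file->f_path.dentry instead, and the repeated unlock/FreeXid/return sequences collapse into a single reopen_error_exit label. A stripped-down sketch of that single-exit shape (hypothetical function, heavily simplified from the code above):

    /* assumes build_path_from_dentry() from cifsproto.h */
    static int reopen_sketch(struct file *file)
    {
            int rc = 0;
            char *full_path = NULL;
            struct inode *inode;

            if (file->f_path.dentry == NULL) {
                    rc = -EBADF;
                    goto reopen_error_exit;
            }
            inode = file->f_path.dentry->d_inode;   /* was a parameter */
            if (inode == NULL) {
                    rc = -EBADF;
                    goto reopen_error_exit;
            }
            full_path = build_path_from_dentry(file->f_path.dentry);
            if (full_path == NULL) {
                    rc = -ENOMEM;
                    goto reopen_error_exit;
            }
            /* the real function goes on to CIFSSMBOpen() here */

    reopen_error_exit:
            /* in the real function this is where fh_sem is released
               and FreeXid(xid) runs, whatever the failure was */
            kfree(full_path);       /* kfree(NULL) is a no-op */
            return rc;
    }
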
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index f414526e476a..3e87dad3367c 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * fs/cifs/inode.c 2 * fs/cifs/inode.c
3 * 3 *
4 * Copyright (C) International Business Machines Corp., 2002,2005 4 * Copyright (C) International Business Machines Corp., 2002,2007
5 * Author(s): Steve French (sfrench@us.ibm.com) 5 * Author(s): Steve French (sfrench@us.ibm.com)
6 * 6 *
7 * This library is free software; you can redistribute it and/or modify 7 * This library is free software; you can redistribute it and/or modify
@@ -90,7 +90,7 @@ int cifs_get_inode_info_unix(struct inode **pinode,
90 (*pinode)->i_ino = 90 (*pinode)->i_ino =
91 (unsigned long)findData.UniqueId; 91 (unsigned long)findData.UniqueId;
92 } /* note ino incremented to unique num in new_inode */ 92 } /* note ino incremented to unique num in new_inode */
93 if(sb->s_flags & MS_NOATIME) 93 if (sb->s_flags & MS_NOATIME)
94 (*pinode)->i_flags |= S_NOATIME | S_NOCMTIME; 94 (*pinode)->i_flags |= S_NOATIME | S_NOCMTIME;
95 95
96 insert_inode_hash(*pinode); 96 insert_inode_hash(*pinode);
@@ -139,8 +139,17 @@ int cifs_get_inode_info_unix(struct inode **pinode,
139 inode->i_mode |= S_IFREG; 139 inode->i_mode |= S_IFREG;
140 cFYI(1,("unknown type %d",type)); 140 cFYI(1,("unknown type %d",type));
141 } 141 }
142 inode->i_uid = le64_to_cpu(findData.Uid); 142
143 inode->i_gid = le64_to_cpu(findData.Gid); 143 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID)
144 inode->i_uid = cifs_sb->mnt_uid;
145 else
146 inode->i_uid = le64_to_cpu(findData.Uid);
147
148 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID)
149 inode->i_gid = cifs_sb->mnt_gid;
150 else
151 inode->i_gid = le64_to_cpu(findData.Gid);
152
144 inode->i_nlink = le64_to_cpu(findData.Nlinks); 153 inode->i_nlink = le64_to_cpu(findData.Nlinks);
145 154
146 spin_lock(&inode->i_lock); 155 spin_lock(&inode->i_lock);
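
With the new override flags, ownership the server reports is ignored whenever uid= or gid= was given at mount time (the connect.c hunks above set vol->override_uid/override_gid, which cifs_mount() turns into CIFS_MOUNT_OVERR_UID/GID). The hunk condenses to this decision, shown as a hypothetical helper:

    /* sketch only; mirrors the inode.c hunk above */
    static void set_owner(struct inode *inode, struct cifs_sb_info *cifs_sb,
                          __le64 srv_uid, __le64 srv_gid)
    {
            /* uid=/gid= mount options win over server-reported ids */
            inode->i_uid = (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID)
                            ? cifs_sb->mnt_uid : (uid_t)le64_to_cpu(srv_uid);
            inode->i_gid = (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID)
                            ? cifs_sb->mnt_gid : (gid_t)le64_to_cpu(srv_gid);
    }
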
@@ -178,13 +187,13 @@ int cifs_get_inode_info_unix(struct inode **pinode,
 						&cifs_file_direct_nobrl_ops;
 			else
 				inode->i_fop = &cifs_file_direct_ops;
-		} else if(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
+		} else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
 			inode->i_fop = &cifs_file_nobrl_ops;
 		else /* not direct, send byte range locks */
 			inode->i_fop = &cifs_file_ops;
 
 		/* check if server can support readpages */
-		if(pTcon->ses->server->maxBuf <
+		if (pTcon->ses->server->maxBuf <
 		    PAGE_CACHE_SIZE + MAX_CIFS_HDR_SIZE)
 			inode->i_data.a_ops = &cifs_addr_ops_smallbuf;
 		else
@@ -220,7 +229,7 @@ static int decode_sfu_inode(struct inode * inode, __u64 size,
 
 	pbuf = buf;
 
-	if(size == 0) {
+	if (size == 0) {
 		inode->i_mode |= S_IFIFO;
 		return 0;
 	} else if (size < 8) {
@@ -239,11 +248,11 @@ static int decode_sfu_inode(struct inode * inode, __u64 size,
 				 netfid,
 				 24 /* length */, 0 /* offset */,
 				 &bytes_read, &pbuf, &buf_type);
-		if((rc == 0) && (bytes_read >= 8)) {
-			if(memcmp("IntxBLK", pbuf, 8) == 0) {
+		if ((rc == 0) && (bytes_read >= 8)) {
+			if (memcmp("IntxBLK", pbuf, 8) == 0) {
 				cFYI(1,("Block device"));
 				inode->i_mode |= S_IFBLK;
-				if(bytes_read == 24) {
+				if (bytes_read == 24) {
 					/* we have enough to decode dev num */
 					__u64 mjr; /* major */
 					__u64 mnr; /* minor */
@@ -251,10 +260,10 @@ static int decode_sfu_inode(struct inode * inode, __u64 size,
 					mnr = le64_to_cpu(*(__le64 *)(pbuf+16));
 					inode->i_rdev = MKDEV(mjr, mnr);
 				}
-			} else if(memcmp("IntxCHR", pbuf, 8) == 0) {
+			} else if (memcmp("IntxCHR", pbuf, 8) == 0) {
 				cFYI(1,("Char device"));
 				inode->i_mode |= S_IFCHR;
-				if(bytes_read == 24) {
+				if (bytes_read == 24) {
 					/* we have enough to decode dev num */
 					__u64 mjr; /* major */
 					__u64 mnr; /* minor */
@@ -262,7 +271,7 @@ static int decode_sfu_inode(struct inode * inode, __u64 size,
 					mnr = le64_to_cpu(*(__le64 *)(pbuf+16));
 					inode->i_rdev = MKDEV(mjr, mnr);
 				}
-			} else if(memcmp("IntxLNK", pbuf, 7) == 0) {
+			} else if (memcmp("IntxLNK", pbuf, 7) == 0) {
 				cFYI(1,("Symlink"));
 				inode->i_mode |= S_IFLNK;
 			} else {
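
For reference, the Services-for-UNIX blob decoded above is an 8-byte tag ("IntxBLK", "IntxCHR", "IntxLNK") optionally followed by two little-endian 64-bit words holding the device major and minor. A condensed sketch of that layout (decode_sfu_dev is a hypothetical helper, not a kernel function):

/* Hypothetical helper illustrating the 24-byte SFU device layout:
 * bytes 0-7 tag, bytes 8-15 major (__le64), bytes 16-23 minor (__le64). */
static int decode_sfu_dev(const char *pbuf, unsigned int bytes_read, dev_t *dev)
{
	__u64 mjr, mnr;

	if (bytes_read < 24)
		return -EINVAL;	/* tag alone; no device number present */
	mjr = le64_to_cpu(*(__le64 *)(pbuf + 8));
	mnr = le64_to_cpu(*(__le64 *)(pbuf + 16));
	*dev = MKDEV(mjr, mnr);
	return 0;
}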
@@ -293,7 +302,7 @@ static int get_sfu_uid_mode(struct inode * inode,
 	rc = CIFSSMBQueryEA(xid, cifs_sb->tcon, path, "SETFILEBITS",
 			ea_value, 4 /* size of buf */, cifs_sb->local_nls,
 			cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
-	if(rc < 0)
+	if (rc < 0)
 		return (int)rc;
 	else if (rc > 3) {
 		mode = le32_to_cpu(*((__le32 *)ea_value));
@@ -348,7 +357,7 @@ int cifs_get_inode_info(struct inode **pinode,
 		/* BB optimize code so we do not make the above call
 		when server claims no NT SMB support and the above call
 		failed at least once - set flag in tcon or mount */
-		if((rc == -EOPNOTSUPP) || (rc == -EINVAL)) {
+		if ((rc == -EOPNOTSUPP) || (rc == -EINVAL)) {
 			rc = SMBQueryInformation(xid, pTcon, search_path,
 					pfindData, cifs_sb->local_nls,
 					cifs_sb->mnt_cifs_flags &
@@ -425,7 +434,7 @@ int cifs_get_inode_info(struct inode **pinode,
 			} else /* do we need cast or hash to ino? */
 				(*pinode)->i_ino = inode_num;
 		} /* else ino incremented to unique num in new_inode*/
-		if(sb->s_flags & MS_NOATIME)
+		if (sb->s_flags & MS_NOATIME)
 			(*pinode)->i_flags |= S_NOATIME | S_NOCMTIME;
 		insert_inode_hash(*pinode);
 	}
@@ -442,7 +451,7 @@ int cifs_get_inode_info(struct inode **pinode,
 		(pTcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE) & 0xFFFFFE00;*/
 
 		/* Linux can not store file creation time so ignore it */
-		if(pfindData->LastAccessTime)
+		if (pfindData->LastAccessTime)
 			inode->i_atime = cifs_NTtimeToUnix
 				(le64_to_cpu(pfindData->LastAccessTime));
 		else /* do not need to use current_fs_time - time not stored */
@@ -452,7 +461,7 @@ int cifs_get_inode_info(struct inode **pinode,
 		inode->i_ctime =
 			cifs_NTtimeToUnix(le64_to_cpu(pfindData->ChangeTime));
 		cFYI(0, ("Attributes came in as 0x%x", attr));
-		if(adjustTZ && (pTcon->ses) && (pTcon->ses->server)) {
+		if (adjustTZ && (pTcon->ses) && (pTcon->ses->server)) {
 			inode->i_ctime.tv_sec += pTcon->ses->server->timeAdj;
 			inode->i_mtime.tv_sec += pTcon->ses->server->timeAdj;
 		}
@@ -521,8 +530,10 @@ int cifs_get_inode_info(struct inode **pinode,
 
 		/* BB fill in uid and gid here? with help from winbind?
 		   or retrieve from NTFS stream extended attribute */
-		if(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) {
+		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) {
 			/* fill in uid, gid, mode from server ACL */
+			/* BB FIXME this should also take into account the
+			 * default uid specified on mount if present */
 			get_sfu_uid_mode(inode, search_path, cifs_sb, xid);
 		} else if (atomic_read(&cifsInfo->inUse) == 0) {
 			inode->i_uid = cifs_sb->mnt_uid;
@@ -541,12 +552,12 @@ int cifs_get_inode_info(struct inode **pinode,
 						&cifs_file_direct_nobrl_ops;
 			else
 				inode->i_fop = &cifs_file_direct_ops;
-		} else if(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
+		} else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
 			inode->i_fop = &cifs_file_nobrl_ops;
 		else /* not direct, send byte range locks */
 			inode->i_fop = &cifs_file_ops;
 
-		if(pTcon->ses->server->maxBuf <
+		if (pTcon->ses->server->maxBuf <
 		    PAGE_CACHE_SIZE + MAX_CIFS_HDR_SIZE)
 			inode->i_data.a_ops = &cifs_addr_ops_smallbuf;
 		else
@@ -597,7 +608,7 @@ int cifs_unlink(struct inode *inode, struct dentry *direntry)
 
 	xid = GetXid();
 
-	if(inode)
+	if (inode)
 		cifs_sb = CIFS_SB(inode->i_sb);
 	else
 		cifs_sb = CIFS_SB(direntry->d_sb);
@@ -723,7 +734,7 @@ int cifs_unlink(struct inode *inode, struct dentry *direntry)
 		   when needed */
 		direntry->d_inode->i_ctime = current_fs_time(inode->i_sb);
 	}
-	if(inode) {
+	if (inode) {
 		inode->i_ctime = inode->i_mtime = current_fs_time(inode->i_sb);
 		cifsInode = CIFS_I(inode);
 		cifsInode->time = 0; /* force revalidate of dir as well */
@@ -734,6 +745,136 @@ int cifs_unlink(struct inode *inode, struct dentry *direntry)
 	return rc;
 }
 
+static void posix_fill_in_inode(struct inode *tmp_inode,
+	FILE_UNIX_BASIC_INFO *pData, int *pobject_type, int isNewInode)
+{
+	loff_t local_size;
+	struct timespec local_mtime;
+
+	struct cifsInodeInfo *cifsInfo = CIFS_I(tmp_inode);
+	struct cifs_sb_info *cifs_sb = CIFS_SB(tmp_inode->i_sb);
+
+	__u32 type = le32_to_cpu(pData->Type);
+	__u64 num_of_bytes = le64_to_cpu(pData->NumOfBytes);
+	__u64 end_of_file = le64_to_cpu(pData->EndOfFile);
+	cifsInfo->time = jiffies;
+	atomic_inc(&cifsInfo->inUse);
+
+	/* save mtime and size */
+	local_mtime = tmp_inode->i_mtime;
+	local_size = tmp_inode->i_size;
+
+	tmp_inode->i_atime =
+		cifs_NTtimeToUnix(le64_to_cpu(pData->LastAccessTime));
+	tmp_inode->i_mtime =
+		cifs_NTtimeToUnix(le64_to_cpu(pData->LastModificationTime));
+	tmp_inode->i_ctime =
+		cifs_NTtimeToUnix(le64_to_cpu(pData->LastStatusChange));
+
+	tmp_inode->i_mode = le64_to_cpu(pData->Permissions);
+	/* since we set the inode type below we need to mask off type
+	   to avoid strange results if bits above were corrupt */
+	tmp_inode->i_mode &= ~S_IFMT;
+	if (type == UNIX_FILE) {
+		*pobject_type = DT_REG;
+		tmp_inode->i_mode |= S_IFREG;
+	} else if (type == UNIX_SYMLINK) {
+		*pobject_type = DT_LNK;
+		tmp_inode->i_mode |= S_IFLNK;
+	} else if (type == UNIX_DIR) {
+		*pobject_type = DT_DIR;
+		tmp_inode->i_mode |= S_IFDIR;
+	} else if (type == UNIX_CHARDEV) {
+		*pobject_type = DT_CHR;
+		tmp_inode->i_mode |= S_IFCHR;
+		tmp_inode->i_rdev = MKDEV(le64_to_cpu(pData->DevMajor),
+				le64_to_cpu(pData->DevMinor) & MINORMASK);
+	} else if (type == UNIX_BLOCKDEV) {
+		*pobject_type = DT_BLK;
+		tmp_inode->i_mode |= S_IFBLK;
+		tmp_inode->i_rdev = MKDEV(le64_to_cpu(pData->DevMajor),
+				le64_to_cpu(pData->DevMinor) & MINORMASK);
+	} else if (type == UNIX_FIFO) {
+		*pobject_type = DT_FIFO;
+		tmp_inode->i_mode |= S_IFIFO;
+	} else if (type == UNIX_SOCKET) {
+		*pobject_type = DT_SOCK;
+		tmp_inode->i_mode |= S_IFSOCK;
+	} else {
+		/* safest to just call it a file */
+		*pobject_type = DT_REG;
+		tmp_inode->i_mode |= S_IFREG;
+		cFYI(1,("unknown inode type %d",type));
+	}
+
+#ifdef CONFIG_CIFS_DEBUG2
+	cFYI(1,("object type: %d", type));
+#endif
+	tmp_inode->i_uid = le64_to_cpu(pData->Uid);
+	tmp_inode->i_gid = le64_to_cpu(pData->Gid);
+	tmp_inode->i_nlink = le64_to_cpu(pData->Nlinks);
+
+	spin_lock(&tmp_inode->i_lock);
+	if (is_size_safe_to_change(cifsInfo, end_of_file)) {
+		/* can not safely change the file size here if the
+		   client is writing to it due to potential races */
+		i_size_write(tmp_inode, end_of_file);
+
+		/* 512 bytes (2**9) is the fake blocksize that must be used */
+		/* for this calculation, not the real blocksize */
+		tmp_inode->i_blocks = (512 - 1 + num_of_bytes) >> 9;
+	}
+	spin_unlock(&tmp_inode->i_lock);
+
+	if (S_ISREG(tmp_inode->i_mode)) {
+		cFYI(1, ("File inode"));
+		tmp_inode->i_op = &cifs_file_inode_ops;
+
+		if(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO) {
+			if(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
+				tmp_inode->i_fop = &cifs_file_direct_nobrl_ops;
+			else
+				tmp_inode->i_fop = &cifs_file_direct_ops;
+
+		} else if(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
+			tmp_inode->i_fop = &cifs_file_nobrl_ops;
+		else
+			tmp_inode->i_fop = &cifs_file_ops;
+
+		if((cifs_sb->tcon) && (cifs_sb->tcon->ses) &&
+		   (cifs_sb->tcon->ses->server->maxBuf <
+			PAGE_CACHE_SIZE + MAX_CIFS_HDR_SIZE))
+			tmp_inode->i_data.a_ops = &cifs_addr_ops_smallbuf;
+		else
+			tmp_inode->i_data.a_ops = &cifs_addr_ops;
+
+		if(isNewInode)
+			return; /* No sense invalidating pages for new inode since we
+				   have not started caching readahead file data yet */
+
+		if (timespec_equal(&tmp_inode->i_mtime, &local_mtime) &&
+			(local_size == tmp_inode->i_size)) {
+			cFYI(1, ("inode exists but unchanged"));
+		} else {
+			/* file may have changed on server */
+			cFYI(1, ("invalidate inode, readdir detected change"));
+			invalidate_remote_inode(tmp_inode);
+		}
+	} else if (S_ISDIR(tmp_inode->i_mode)) {
+		cFYI(1, ("Directory inode"));
+		tmp_inode->i_op = &cifs_dir_inode_ops;
+		tmp_inode->i_fop = &cifs_dir_ops;
+	} else if (S_ISLNK(tmp_inode->i_mode)) {
+		cFYI(1, ("Symbolic Link inode"));
+		tmp_inode->i_op = &cifs_symlink_inode_ops;
+/*		tmp_inode->i_fop = */ /* do not need to set to anything */
+	} else {
+		cFYI(1, ("Special inode"));
+		init_special_inode(tmp_inode, tmp_inode->i_mode,
+				   tmp_inode->i_rdev);
+	}
+}
+
 int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
 {
 	int rc = 0;
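
The type dispatch in posix_fill_in_inode() above can be summarized as a table; this condensed form is for reference only (the function itself uses the if/else chain and additionally masks the S_IFMT bits out of the server-supplied mode first):

/* Illustrative summary of the UNIX_* -> inode type mapping above. */
static const struct {
	__u32   unix_type;	/* wire value from FILE_UNIX_BASIC_INFO */
	umode_t ifmt;		/* S_IFMT bits set on the inode */
	int     dtype;		/* DT_* value stored in *pobject_type */
} posix_type_map[] = {
	{ UNIX_FILE,     S_IFREG,  DT_REG  },
	{ UNIX_SYMLINK,  S_IFLNK,  DT_LNK  },
	{ UNIX_DIR,      S_IFDIR,  DT_DIR  },
	{ UNIX_CHARDEV,  S_IFCHR,  DT_CHR  },	/* also fills i_rdev */
	{ UNIX_BLOCKDEV, S_IFBLK,  DT_BLK  },	/* also fills i_rdev */
	{ UNIX_FIFO,     S_IFIFO,  DT_FIFO },
	{ UNIX_SOCKET,   S_IFSOCK, DT_SOCK },
	/* anything else is treated as a regular file */
};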
@@ -755,6 +896,71 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
 		FreeXid(xid);
 		return -ENOMEM;
 	}
+
+	if((pTcon->ses->capabilities & CAP_UNIX) &&
+		(CIFS_UNIX_POSIX_PATH_OPS_CAP &
+			le64_to_cpu(pTcon->fsUnixInfo.Capability))) {
+		u32 oplock = 0;
+		FILE_UNIX_BASIC_INFO * pInfo =
+			kzalloc(sizeof(FILE_UNIX_BASIC_INFO), GFP_KERNEL);
+		if(pInfo == NULL) {
+			rc = -ENOMEM;
+			goto mkdir_out;
+		}
+
+		rc = CIFSPOSIXCreate(xid, pTcon, SMB_O_DIRECTORY | SMB_O_CREAT,
+				mode, NULL /* netfid */, pInfo, &oplock,
+				full_path, cifs_sb->local_nls,
+				cifs_sb->mnt_cifs_flags &
+					CIFS_MOUNT_MAP_SPECIAL_CHR);
+		if (rc) {
+			cFYI(1, ("posix mkdir returned 0x%x", rc));
+			d_drop(direntry);
+		} else {
+			int obj_type;
+			if (pInfo->Type == -1) /* no return info - go query */
+				goto mkdir_get_info;
+/*BB check (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID ) to see if need to set uid/gid */
+			inc_nlink(inode);
+			if (pTcon->nocase)
+				direntry->d_op = &cifs_ci_dentry_ops;
+			else
+				direntry->d_op = &cifs_dentry_ops;
+
+			newinode = new_inode(inode->i_sb);
+			if (newinode == NULL)
+				goto mkdir_get_info;
+			/* Is an i_ino of zero legal? */
+			/* Are there sanity checks we can use to ensure that
+			   the server is really filling in that field? */
+			if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) {
+				newinode->i_ino =
+					(unsigned long)pInfo->UniqueId;
+			} /* note ino incremented to unique num in new_inode */
+			if(inode->i_sb->s_flags & MS_NOATIME)
+				newinode->i_flags |= S_NOATIME | S_NOCMTIME;
+			newinode->i_nlink = 2;
+
+			insert_inode_hash(newinode);
+			d_instantiate(direntry, newinode);
+
+			/* we already checked in POSIXCreate whether
+			   frame was long enough */
+			posix_fill_in_inode(direntry->d_inode,
+					pInfo, &obj_type, 1 /* NewInode */);
+#ifdef CONFIG_CIFS_DEBUG2
+			cFYI(1,("instantiated dentry %p %s to inode %p",
+				direntry, direntry->d_name.name, newinode));
+
+			if(newinode->i_nlink != 2)
+				cFYI(1,("unexpected number of links %d",
+					newinode->i_nlink));
+#endif
+		}
+		kfree(pInfo);
+		goto mkdir_out;
+	}
+
 	/* BB add setting the equivalent of mode via CreateX w/ACLs */
 	rc = CIFSSMBMkDir(xid, pTcon, full_path, cifs_sb->local_nls,
 			  cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
@@ -762,6 +968,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
 		cFYI(1, ("cifs_mkdir returned 0x%x", rc));
 		d_drop(direntry);
 	} else {
+mkdir_get_info:
 		inc_nlink(inode);
 		if (pTcon->ses->capabilities & CAP_UNIX)
 			rc = cifs_get_inode_info_unix(&newinode, full_path,
@@ -775,8 +982,10 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
 		else
 			direntry->d_op = &cifs_dentry_ops;
 		d_instantiate(direntry, newinode);
-		if (direntry->d_inode)
-			direntry->d_inode->i_nlink = 2;
+		/* setting nlink not necessary except in cases where we
+		 * failed to get it from the server or was set bogus */
+		if ((direntry->d_inode) && (direntry->d_inode->i_nlink < 2))
+			direntry->d_inode->i_nlink = 2;
 		if (cifs_sb->tcon->ses->capabilities & CAP_UNIX)
 			if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) {
 				CIFSSMBUnixSetPerms(xid, pTcon, full_path,
@@ -812,6 +1021,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
 			}
 		}
 	}
+mkdir_out:
 	kfree(full_path);
 	FreeXid(xid);
 	return rc;
@@ -1339,17 +1549,17 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs)
 					cpu_to_le32(cifsInode->cifsAttrs |
 							ATTR_READONLY);
 			}
-		} else if ((mode & S_IWUGO) == S_IWUGO) {
-			if (cifsInode->cifsAttrs & ATTR_READONLY) {
-				set_dosattr = TRUE;
-				time_buf.Attributes =
-					cpu_to_le32(cifsInode->cifsAttrs &
-						(~ATTR_READONLY));
-				/* Windows ignores set to zero */
-				if(time_buf.Attributes == 0)
-					time_buf.Attributes |=
-						cpu_to_le32(ATTR_NORMAL);
-			}
+		} else if (cifsInode->cifsAttrs & ATTR_READONLY) {
+			/* If file is readonly on server, we would
+			not be able to write to it - so if any write
+			bit is enabled for user or group or other we
+			need to at least try to remove r/o dos attr */
+			set_dosattr = TRUE;
+			time_buf.Attributes = cpu_to_le32(cifsInode->cifsAttrs &
+					    (~ATTR_READONLY));
+			/* Windows ignores set to zero */
+			if(time_buf.Attributes == 0)
+				time_buf.Attributes |= cpu_to_le32(ATTR_NORMAL);
 		}
 		/* BB to be implemented -
 			via Windows security descriptors or streams */
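
The rewritten branch above changes when the DOS read-only attribute is cleared: previously all three write bits (S_IWUGO) had to be set, now any write bit for user, group or other is enough. A hedged condensation of the resulting decision (hypothetical helper; the real code also tracks set_dosattr alongside the attribute value):

/* Hypothetical condensation of the ATTR_READONLY handling above. */
static __le32 choose_dos_attrs(__u32 cifs_attrs, umode_t mode)
{
	if ((mode & S_IWUGO) == 0)	/* no write bits: mark read-only */
		return cpu_to_le32(cifs_attrs | ATTR_READONLY);
	if (cifs_attrs & ATTR_READONLY) {
		/* any write bit set: try to drop the r/o attribute */
		__u32 attrs = cifs_attrs & ~ATTR_READONLY;
		/* Windows ignores an attribute value of zero */
		return cpu_to_le32(attrs ? attrs : ATTR_NORMAL);
	}
	return cpu_to_le32(cifs_attrs);	/* nothing to change */
}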
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index 992e80edc720..53e304d59544 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -30,6 +30,9 @@
 #include <linux/fs.h>
 #include <asm/div64.h>
 #include <asm/byteorder.h>
+#ifdef CONFIG_CIFS_EXPERIMENTAL
+#include <linux/inet.h>
+#endif
 #include "cifsfs.h"
 #include "cifspdu.h"
 #include "cifsglob.h"
@@ -129,11 +132,27 @@ static const struct smb_to_posix_error mapping_table_ERRHRD[] = {
 /* Convert string containing dotted ip address to binary form */
 /* returns 0 if invalid address */
 
-/* BB add address family, change rc to status flag and return union or for ipv6 */
-/* will need parent to call something like inet_pton to convert ipv6 address BB */
 int
 cifs_inet_pton(int address_family, char *cp,void *dst)
 {
+#ifdef CONFIG_CIFS_EXPERIMENTAL
+	int ret = 0;
+
+	/* calculate length by finding first slash or NULL */
+	/* BB Should we convert '/' slash to '\' here since it seems already done
+	   before this */
+	if( address_family == AF_INET ){
+		ret = in4_pton(cp, -1 /* len */, dst , '\\', NULL);
+	} else if( address_family == AF_INET6 ){
+		ret = in6_pton(cp, -1 /* len */, dst , '\\', NULL);
+	}
+#ifdef CONFIG_CIFS_DEBUG2
+	cFYI(1,("address conversion returned %d for %s", ret, cp));
+#endif
+	if (ret > 0)
+		ret = 1;
+	return ret;
+#else
 	int value;
 	int digit;
 	int i;
@@ -192,6 +211,7 @@ cifs_inet_pton(int address_family, char *cp,void *dst)
 
 	*((__be32 *)dst) = *((__be32 *) bytes) | htonl(value);
 	return 1; /* success */
+#endif /* EXPERIMENTAL */
 }
 
 /*****************************************************************************
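
in4_pton() and in6_pton() adopted in the experimental path above are the stock helpers from net/core/utils.c: both take a source string, a length (or -1 to stop at the delimiter/NUL), a destination buffer, a delimiter character and an optional end pointer, returning 1 on success and 0 on failure. A usage sketch; the surrounding sockaddr handling is illustrative, not CIFS code:

#include <linux/inet.h>

/* Illustrative wrapper: try IPv4 first, then IPv6, stopping at '\\'
 * just as the CIFS caller does for UNC-style names. */
static int parse_server_addr(const char *str, struct sockaddr_storage *ss)
{
	struct sockaddr_in  *s4 = (struct sockaddr_in *)ss;
	struct sockaddr_in6 *s6 = (struct sockaddr_in6 *)ss;

	if (in4_pton(str, -1, (u8 *)&s4->sin_addr.s_addr, '\\', NULL)) {
		s4->sin_family = AF_INET;
		return 1;
	}
	if (in6_pton(str, -1, s6->sin6_addr.s6_addr, '\\', NULL)) {
		s6->sin6_family = AF_INET6;
		return 1;
	}
	return 0;	/* not a literal address */
}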
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 2a374d5215ab..b5364f90d551 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -37,19 +37,19 @@ static void dump_cifs_file_struct(struct file *file, char *label)
 {
 	struct cifsFileInfo * cf;
 
-	if(file) {
+	if (file) {
 		cf = file->private_data;
-		if(cf == NULL) {
+		if (cf == NULL) {
 			cFYI(1,("empty cifs private file data"));
 			return;
 		}
-		if(cf->invalidHandle) {
+		if (cf->invalidHandle) {
 			cFYI(1,("invalid handle"));
 		}
-		if(cf->srch_inf.endOfSearch) {
+		if (cf->srch_inf.endOfSearch) {
 			cFYI(1,("end of search"));
 		}
-		if(cf->srch_inf.emptyDir) {
+		if (cf->srch_inf.emptyDir) {
 			cFYI(1,("empty dir"));
 		}
 
@@ -77,17 +77,17 @@ static int construct_dentry(struct qstr *qstring, struct file *file,
 		cFYI(0, ("existing dentry with inode 0x%p", tmp_dentry->d_inode));
 		*ptmp_inode = tmp_dentry->d_inode;
 /* BB overwrite old name? i.e. tmp_dentry->d_name and tmp_dentry->d_name.len??*/
-		if(*ptmp_inode == NULL) {
+		if (*ptmp_inode == NULL) {
 			*ptmp_inode = new_inode(file->f_path.dentry->d_sb);
-			if(*ptmp_inode == NULL)
+			if (*ptmp_inode == NULL)
 				return rc;
 			rc = 1;
 		}
-		if(file->f_path.dentry->d_sb->s_flags & MS_NOATIME)
+		if (file->f_path.dentry->d_sb->s_flags & MS_NOATIME)
 			(*ptmp_inode)->i_flags |= S_NOATIME | S_NOCMTIME;
 	} else {
 		tmp_dentry = d_alloc(file->f_path.dentry, qstring);
-		if(tmp_dentry == NULL) {
+		if (tmp_dentry == NULL) {
 			cERROR(1,("Failed allocating dentry"));
 			*ptmp_inode = NULL;
 			return rc;
@@ -98,9 +98,9 @@ static int construct_dentry(struct qstr *qstring, struct file *file,
 			tmp_dentry->d_op = &cifs_ci_dentry_ops;
 		else
 			tmp_dentry->d_op = &cifs_dentry_ops;
-		if(*ptmp_inode == NULL)
+		if (*ptmp_inode == NULL)
 			return rc;
-		if(file->f_path.dentry->d_sb->s_flags & MS_NOATIME)
+		if (file->f_path.dentry->d_sb->s_flags & MS_NOATIME)
 			(*ptmp_inode)->i_flags |= S_NOATIME | S_NOCMTIME;
 		rc = 2;
 	}
@@ -112,7 +112,7 @@ static int construct_dentry(struct qstr *qstring, struct file *file,
 
 static void AdjustForTZ(struct cifsTconInfo * tcon, struct inode * inode)
 {
-	if((tcon) && (tcon->ses) && (tcon->ses->server)) {
+	if ((tcon) && (tcon->ses) && (tcon->ses->server)) {
 		inode->i_ctime.tv_sec += tcon->ses->server->timeAdj;
 		inode->i_mtime.tv_sec += tcon->ses->server->timeAdj;
 		inode->i_atime.tv_sec += tcon->ses->server->timeAdj;
@@ -137,7 +137,7 @@ static void fill_in_inode(struct inode *tmp_inode, int new_buf_type,
 	local_mtime = tmp_inode->i_mtime;
 	local_size = tmp_inode->i_size;
 
-	if(new_buf_type) {
+	if (new_buf_type) {
 		FILE_DIRECTORY_INFO *pfindData = (FILE_DIRECTORY_INFO *)buf;
 
 		attr = le32_to_cpu(pfindData->ExtFileAttributes);
@@ -193,7 +193,7 @@ static void fill_in_inode(struct inode *tmp_inode, int new_buf_type,
 	if (attr & ATTR_DIRECTORY) {
 		*pobject_type = DT_DIR;
 		/* override default perms since we do not lock dirs */
-		if(atomic_read(&cifsInfo->inUse) == 0) {
+		if (atomic_read(&cifsInfo->inUse) == 0) {
 			tmp_inode->i_mode = cifs_sb->mnt_dir_mode;
 		}
 		tmp_inode->i_mode |= S_IFDIR;
@@ -250,25 +250,25 @@ static void fill_in_inode(struct inode *tmp_inode, int new_buf_type,
 	if (S_ISREG(tmp_inode->i_mode)) {
 		cFYI(1, ("File inode"));
 		tmp_inode->i_op = &cifs_file_inode_ops;
-		if(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO) {
-			if(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
+		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO) {
+			if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
 				tmp_inode->i_fop = &cifs_file_direct_nobrl_ops;
 			else
 				tmp_inode->i_fop = &cifs_file_direct_ops;
 
-		} else if(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
+		} else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
 			tmp_inode->i_fop = &cifs_file_nobrl_ops;
 		else
 			tmp_inode->i_fop = &cifs_file_ops;
 
-		if((cifs_sb->tcon) && (cifs_sb->tcon->ses) &&
+		if ((cifs_sb->tcon) && (cifs_sb->tcon->ses) &&
 		   (cifs_sb->tcon->ses->server->maxBuf <
 			PAGE_CACHE_SIZE + MAX_CIFS_HDR_SIZE))
 			tmp_inode->i_data.a_ops = &cifs_addr_ops_smallbuf;
 		else
 			tmp_inode->i_data.a_ops = &cifs_addr_ops;
 
-		if(isNewInode)
+		if (isNewInode)
 			return; /* No sense invalidating pages for new inode
 				   since have not started caching readahead file
 				   data yet */
@@ -357,8 +357,14 @@ static void unix_fill_in_inode(struct inode *tmp_inode,
 		cFYI(1,("unknown inode type %d",type));
 	}
 
-	tmp_inode->i_uid = le64_to_cpu(pfindData->Uid);
-	tmp_inode->i_gid = le64_to_cpu(pfindData->Gid);
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID)
+		tmp_inode->i_uid = cifs_sb->mnt_uid;
+	else
+		tmp_inode->i_uid = le64_to_cpu(pfindData->Uid);
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID)
+		tmp_inode->i_gid = cifs_sb->mnt_gid;
+	else
+		tmp_inode->i_gid = le64_to_cpu(pfindData->Gid);
 	tmp_inode->i_nlink = le64_to_cpu(pfindData->Nlinks);
 
 	spin_lock(&tmp_inode->i_lock);
@@ -377,25 +383,24 @@ static void unix_fill_in_inode(struct inode *tmp_inode,
 		cFYI(1, ("File inode"));
 		tmp_inode->i_op = &cifs_file_inode_ops;
 
-		if(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO) {
-			if(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
+		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO) {
+			if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
 				tmp_inode->i_fop = &cifs_file_direct_nobrl_ops;
 			else
 				tmp_inode->i_fop = &cifs_file_direct_ops;
-
-		} else if(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
+		} else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
 			tmp_inode->i_fop = &cifs_file_nobrl_ops;
 		else
 			tmp_inode->i_fop = &cifs_file_ops;
 
-		if((cifs_sb->tcon) && (cifs_sb->tcon->ses) &&
+		if ((cifs_sb->tcon) && (cifs_sb->tcon->ses) &&
 		   (cifs_sb->tcon->ses->server->maxBuf <
 			PAGE_CACHE_SIZE + MAX_CIFS_HDR_SIZE))
 			tmp_inode->i_data.a_ops = &cifs_addr_ops_smallbuf;
 		else
 			tmp_inode->i_data.a_ops = &cifs_addr_ops;
 
-		if(isNewInode)
+		if (isNewInode)
 			return; /* No sense invalidating pages for new inode since we
 				   have not started caching readahead file data yet */
 
@@ -430,34 +435,28 @@ static int initiate_cifs_search(const int xid, struct file *file)
 	struct cifs_sb_info *cifs_sb;
 	struct cifsTconInfo *pTcon;
 
-	if(file->private_data == NULL) {
+	if (file->private_data == NULL) {
 		file->private_data =
-			kmalloc(sizeof(struct cifsFileInfo),GFP_KERNEL);
+			kzalloc(sizeof(struct cifsFileInfo),GFP_KERNEL);
 	}
 
-	if(file->private_data == NULL) {
+	if (file->private_data == NULL)
 		return -ENOMEM;
-	} else {
-		memset(file->private_data,0,sizeof(struct cifsFileInfo));
-	}
 	cifsFile = file->private_data;
 	cifsFile->invalidHandle = TRUE;
 	cifsFile->srch_inf.endOfSearch = FALSE;
 
-	if(file->f_path.dentry == NULL)
-		return -ENOENT;
-
 	cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
-	if(cifs_sb == NULL)
+	if (cifs_sb == NULL)
 		return -EINVAL;
 
 	pTcon = cifs_sb->tcon;
-	if(pTcon == NULL)
+	if (pTcon == NULL)
 		return -EINVAL;
 
 	full_path = build_path_from_dentry(file->f_path.dentry);
 
-	if(full_path == NULL) {
+	if (full_path == NULL) {
 		return -ENOMEM;
 	}
 
@@ -480,9 +479,9 @@ ffirst_retry:
 		&cifsFile->netfid, &cifsFile->srch_inf,
 		cifs_sb->mnt_cifs_flags &
 			CIFS_MOUNT_MAP_SPECIAL_CHR, CIFS_DIR_SEP(cifs_sb));
-	if(rc == 0)
+	if (rc == 0)
 		cifsFile->invalidHandle = FALSE;
-	if((rc == -EOPNOTSUPP) &&
+	if ((rc == -EOPNOTSUPP) &&
 		(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)) {
 		cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_SERVER_INUM;
 		goto ffirst_retry;
@@ -498,7 +497,7 @@ static int cifs_unicode_bytelen(char *str)
 	__le16 * ustr = (__le16 *)str;
 
 	for(len=0;len <= PATH_MAX;len++) {
-		if(ustr[len] == 0)
+		if (ustr[len] == 0)
 			return len << 1;
 	}
 	cFYI(1,("Unicode string longer than PATH_MAX found"));
@@ -510,7 +509,7 @@ static char *nxt_dir_entry(char *old_entry, char *end_of_smb, int level)
 	char * new_entry;
 	FILE_DIRECTORY_INFO * pDirInfo = (FILE_DIRECTORY_INFO *)old_entry;
 
-	if(level == SMB_FIND_FILE_INFO_STANDARD) {
+	if (level == SMB_FIND_FILE_INFO_STANDARD) {
 		FIND_FILE_STANDARD_INFO * pfData;
 		pfData = (FIND_FILE_STANDARD_INFO *)pDirInfo;
 
@@ -520,12 +519,12 @@ static char *nxt_dir_entry(char *old_entry, char *end_of_smb, int level)
 	new_entry = old_entry + le32_to_cpu(pDirInfo->NextEntryOffset);
 	cFYI(1,("new entry %p old entry %p",new_entry,old_entry));
 	/* validate that new_entry is not past end of SMB */
-	if(new_entry >= end_of_smb) {
+	if (new_entry >= end_of_smb) {
 		cERROR(1,
 		      ("search entry %p began after end of SMB %p old entry %p",
 			new_entry, end_of_smb, old_entry));
 		return NULL;
-	} else if(((level == SMB_FIND_FILE_INFO_STANDARD) &&
+	} else if (((level == SMB_FIND_FILE_INFO_STANDARD) &&
 		  (new_entry + sizeof(FIND_FILE_STANDARD_INFO) > end_of_smb)) ||
 		  ((level != SMB_FIND_FILE_INFO_STANDARD) &&
 		   (new_entry + sizeof(FILE_DIRECTORY_INFO) > end_of_smb))) {
@@ -546,39 +545,39 @@ static int cifs_entry_is_dot(char *current_entry, struct cifsFileInfo *cfile)
 	char * filename = NULL;
 	int len = 0;
 
-	if(cfile->srch_inf.info_level == SMB_FIND_FILE_UNIX) {
+	if (cfile->srch_inf.info_level == SMB_FIND_FILE_UNIX) {
 		FILE_UNIX_INFO * pFindData = (FILE_UNIX_INFO *)current_entry;
 		filename = &pFindData->FileName[0];
-		if(cfile->srch_inf.unicode) {
+		if (cfile->srch_inf.unicode) {
 			len = cifs_unicode_bytelen(filename);
 		} else {
 			/* BB should we make this strnlen of PATH_MAX? */
 			len = strnlen(filename, 5);
 		}
-	} else if(cfile->srch_inf.info_level == SMB_FIND_FILE_DIRECTORY_INFO) {
+	} else if (cfile->srch_inf.info_level == SMB_FIND_FILE_DIRECTORY_INFO) {
 		FILE_DIRECTORY_INFO * pFindData =
 			(FILE_DIRECTORY_INFO *)current_entry;
 		filename = &pFindData->FileName[0];
 		len = le32_to_cpu(pFindData->FileNameLength);
-	} else if(cfile->srch_inf.info_level ==
+	} else if (cfile->srch_inf.info_level ==
 			SMB_FIND_FILE_FULL_DIRECTORY_INFO) {
 		FILE_FULL_DIRECTORY_INFO * pFindData =
 			(FILE_FULL_DIRECTORY_INFO *)current_entry;
 		filename = &pFindData->FileName[0];
 		len = le32_to_cpu(pFindData->FileNameLength);
-	} else if(cfile->srch_inf.info_level ==
+	} else if (cfile->srch_inf.info_level ==
 			SMB_FIND_FILE_ID_FULL_DIR_INFO) {
 		SEARCH_ID_FULL_DIR_INFO * pFindData =
 			(SEARCH_ID_FULL_DIR_INFO *)current_entry;
 		filename = &pFindData->FileName[0];
 		len = le32_to_cpu(pFindData->FileNameLength);
-	} else if(cfile->srch_inf.info_level ==
+	} else if (cfile->srch_inf.info_level ==
 			SMB_FIND_FILE_BOTH_DIRECTORY_INFO) {
 		FILE_BOTH_DIRECTORY_INFO * pFindData =
 			(FILE_BOTH_DIRECTORY_INFO *)current_entry;
 		filename = &pFindData->FileName[0];
 		len = le32_to_cpu(pFindData->FileNameLength);
-	} else if(cfile->srch_inf.info_level == SMB_FIND_FILE_INFO_STANDARD) {
+	} else if (cfile->srch_inf.info_level == SMB_FIND_FILE_INFO_STANDARD) {
 		FIND_FILE_STANDARD_INFO * pFindData =
 			(FIND_FILE_STANDARD_INFO *)current_entry;
 		filename = &pFindData->FileName[0];
@@ -587,25 +586,25 @@ static int cifs_entry_is_dot(char *current_entry, struct cifsFileInfo *cfile)
 		cFYI(1,("Unknown findfirst level %d",cfile->srch_inf.info_level));
 	}
 
-	if(filename) {
-		if(cfile->srch_inf.unicode) {
+	if (filename) {
+		if (cfile->srch_inf.unicode) {
 			__le16 *ufilename = (__le16 *)filename;
-			if(len == 2) {
+			if (len == 2) {
 				/* check for . */
-				if(ufilename[0] == UNICODE_DOT)
+				if (ufilename[0] == UNICODE_DOT)
 					rc = 1;
-			} else if(len == 4) {
+			} else if (len == 4) {
 				/* check for .. */
-				if((ufilename[0] == UNICODE_DOT)
+				if ((ufilename[0] == UNICODE_DOT)
 				   &&(ufilename[1] == UNICODE_DOT))
 					rc = 2;
 			}
 		} else /* ASCII */ {
-			if(len == 1) {
-				if(filename[0] == '.')
+			if (len == 1) {
+				if (filename[0] == '.')
 					rc = 1;
-			} else if(len == 2) {
+			} else if (len == 2) {
 				if((filename[0] == '.') && (filename[1] == '.'))
 					rc = 2;
 			}
 		}
@@ -618,20 +617,10 @@ static int cifs_entry_is_dot(char *current_entry, struct cifsFileInfo *cfile)
    whether we can use the cached search results from the previous search */
 static int is_dir_changed(struct file * file)
 {
-	struct inode * inode;
-	struct cifsInodeInfo *cifsInfo;
+	struct inode *inode = file->f_path.dentry->d_inode;
+	struct cifsInodeInfo *cifsInfo = CIFS_I(inode);
 
-	if(file->f_path.dentry == NULL)
-		return 0;
-
-	inode = file->f_path.dentry->d_inode;
-
-	if(inode == NULL)
-		return 0;
-
-	cifsInfo = CIFS_I(inode);
-
-	if(cifsInfo->time == 0)
+	if (cifsInfo->time == 0)
 		return 1; /* directory was changed, perhaps due to unlink */
 	else
 		return 0;
@@ -654,7 +643,7 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
 	struct cifsFileInfo * cifsFile = file->private_data;
 	/* check if index in the buffer */
 
-	if((cifsFile == NULL) || (ppCurrentEntry == NULL) ||
+	if ((cifsFile == NULL) || (ppCurrentEntry == NULL) ||
 	   (num_to_ret == NULL))
 		return -ENOENT;
 
@@ -672,7 +661,7 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
 #ifdef CONFIG_CIFS_DEBUG2
 	dump_cifs_file_struct(file, "In fce ");
 #endif
-	if(((index_to_find < cifsFile->srch_inf.index_of_last_entry) &&
+	if (((index_to_find < cifsFile->srch_inf.index_of_last_entry) &&
 	     is_dir_changed(file)) ||
 	   (index_to_find < first_entry_in_buffer)) {
 		/* close and restart search */
@@ -681,9 +670,9 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
 		CIFSFindClose(xid, pTcon, cifsFile->netfid);
 		kfree(cifsFile->search_resume_name);
 		cifsFile->search_resume_name = NULL;
-		if(cifsFile->srch_inf.ntwrk_buf_start) {
+		if (cifsFile->srch_inf.ntwrk_buf_start) {
 			cFYI(1,("freeing SMB ff cache buf on search rewind"));
-			if(cifsFile->srch_inf.smallBuf)
+			if (cifsFile->srch_inf.smallBuf)
 				cifs_small_buf_release(cifsFile->srch_inf.
 						ntwrk_buf_start);
 			else
@@ -691,7 +680,7 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
 						ntwrk_buf_start);
 		}
 		rc = initiate_cifs_search(xid,file);
-		if(rc) {
+		if (rc) {
 			cFYI(1,("error %d reinitiating a search on rewind",rc));
 			return rc;
 		}
@@ -702,10 +691,10 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
 		cFYI(1,("calling findnext2"));
 		rc = CIFSFindNext(xid,pTcon,cifsFile->netfid,
 				  &cifsFile->srch_inf);
-		if(rc)
+		if (rc)
 			return -ENOENT;
 	}
-	if(index_to_find < cifsFile->srch_inf.index_of_last_entry) {
+	if (index_to_find < cifsFile->srch_inf.index_of_last_entry) {
 		/* we found the buffer that contains the entry */
 		/* scan and find it */
 		int i;
@@ -851,9 +840,6 @@ static int cifs_filldir(char *pfindEntry, struct file *file,
 	if((scratch_buf == NULL) || (pfindEntry == NULL) || (pCifsF == NULL))
 		return -ENOENT;
 
-	if(file->f_path.dentry == NULL)
-		return -ENOENT;
-
 	rc = cifs_entry_is_dot(pfindEntry,pCifsF);
 	/* skip . and .. since we added them first */
 	if(rc != 0)
@@ -997,11 +983,6 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
 
 	xid = GetXid();
 
-	if(file->f_path.dentry == NULL) {
-		FreeXid(xid);
-		return -EIO;
-	}
-
 	cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
 	pTcon = cifs_sb->tcon;
 	if(pTcon == NULL)
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index 614175a3b02e..0aaff3651d14 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -62,8 +62,7 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag
 {
 	struct coda_inode_info *ei = (struct coda_inode_info *) foo;
 
-	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
-	    SLAB_CTOR_CONSTRUCTOR)
+	if (flags & SLAB_CTOR_CONSTRUCTOR)
 		inode_init_once(&ei->vfs_inode);
 }
 
diff --git a/fs/compat.c b/fs/compat.c
index 040a8be38a48..72e5e6923828 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -371,13 +371,14 @@ static void compat_ioctl_error(struct file *filp, unsigned int fd,
 		fn = "?";
 	}
 
-	sprintf(buf,"'%c'", (cmd>>24) & 0x3f);
+	sprintf(buf,"'%c'", (cmd>>_IOC_TYPESHIFT) & _IOC_TYPEMASK);
 	if (!isprint(buf[1]))
 		sprintf(buf, "%02x", buf[1]);
 	compat_printk("ioctl32(%s:%d): Unknown cmd fd(%d) "
-			"cmd(%08x){%s} arg(%08x) on %s\n",
+			"cmd(%08x){t:%s;sz:%u} arg(%08x) on %s\n",
 			current->comm, current->pid,
 			(int)fd, (unsigned int)cmd, buf,
+			(cmd >> _IOC_SIZESHIFT) & _IOC_SIZEMASK,
 			(unsigned int)arg, fn);
 
 	if (path)
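
The switch from the hand-rolled "(cmd>>24) & 0x3f" to the _IOC_* macros matters because the hard-coded shift and mask did not match how <asm/ioctl.h> actually packs a command number. A sketch of the standard decomposition using the same macros the message now uses (field widths vary slightly on a few architectures):

/* Illustrative decoder, not kernel code. */
static void decode_ioctl_cmd(unsigned int cmd)
{
	unsigned int dir  = (cmd >> _IOC_DIRSHIFT)  & _IOC_DIRMASK;
	unsigned int size = (cmd >> _IOC_SIZESHIFT) & _IOC_SIZEMASK;
	unsigned int type = (cmd >> _IOC_TYPESHIFT) & _IOC_TYPEMASK;
	unsigned int nr   = (cmd >> _IOC_NRSHIFT)   & _IOC_NRMASK;

	printk(KERN_DEBUG "ioctl dir %u type '%c' nr %u size %u\n",
	       dir, type, nr, size);
}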
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 8b1c5d8bf4ef..464c04a9541d 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -266,6 +266,23 @@ static int do_siocgstamp(unsigned int fd, unsigned int cmd, unsigned long arg)
 	return err;
 }
 
+static int do_siocgstampns(unsigned int fd, unsigned int cmd, unsigned long arg)
+{
+	struct compat_timespec __user *up = compat_ptr(arg);
+	struct timespec kts;
+	mm_segment_t old_fs = get_fs();
+	int err;
+
+	set_fs(KERNEL_DS);
+	err = sys_ioctl(fd, cmd, (unsigned long)&kts);
+	set_fs(old_fs);
+	if (!err) {
+		err = put_user(kts.tv_sec, &up->tv_sec);
+		err |= __put_user(kts.tv_nsec, &up->tv_nsec);
+	}
+	return err;
+}
+
 struct ifmap32 {
 	compat_ulong_t mem_start;
 	compat_ulong_t mem_end;
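
do_siocgstampns() above follows the standard compat pattern: run the native ioctl against a kernel struct timespec under set_fs(KERNEL_DS), then copy the result field by field into the 32-bit compat_timespec layout. From user space the effect is simply that SIOCGSTAMPNS now works for 32-bit programs on 64-bit kernels; a minimal caller (ordinary user-space C, for illustration):

#include <stdio.h>
#include <sys/ioctl.h>
#include <time.h>
#include <linux/sockios.h>

/* Print the receive timestamp of the last packet on a socket with
 * nanosecond resolution (valid after at least one packet arrived). */
static void print_rx_timestamp(int sock)
{
	struct timespec ts;

	if (ioctl(sock, SIOCGSTAMPNS, &ts) == 0)
		printf("last packet: %ld.%09ld\n",
		       (long)ts.tv_sec, ts.tv_nsec);
	else
		perror("SIOCGSTAMPNS");
}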
@@ -2379,6 +2396,14 @@ lp_timeout_trans(unsigned int fd, unsigned int cmd, unsigned long arg)
 #define ULONG_IOCTL(cmd) \
 	{ (cmd), (ioctl_trans_handler_t)sys_ioctl },
 
+/* ioctl should not be warned about even if it's not implemented.
+   Valid reasons to use this:
+   - It is implemented with ->compat_ioctl on some device, but programs
+   call it on others too.
+   - The ioctl is not implemented in the native kernel, but programs
+   call it commonly anyways.
+   Most other reasons are not valid. */
+#define IGNORE_IOCTL(cmd) COMPATIBLE_IOCTL(cmd)
 
 struct ioctl_trans ioctl_start[] = {
 #include <linux/compat_ioctl.h>
@@ -2437,6 +2462,7 @@ HANDLE_IOCTL(SIOCBRDELIF, dev_ifsioc)
 /* Note SIOCRTMSG is no longer, so this is safe and * the user would have seen just an -EINVAL anyways. */
 HANDLE_IOCTL(SIOCRTMSG, ret_einval)
 HANDLE_IOCTL(SIOCGSTAMP, do_siocgstamp)
+HANDLE_IOCTL(SIOCGSTAMPNS, do_siocgstampns)
 #endif
 #ifdef CONFIG_BLOCK
 HANDLE_IOCTL(HDIO_GETGEO, hdio_getgeo)
@@ -2576,6 +2602,8 @@ HANDLE_IOCTL(SIOCGIWENCODEEXT, do_wireless_ioctl)
 HANDLE_IOCTL(SIOCSIWPMKSA, do_wireless_ioctl)
 HANDLE_IOCTL(SIOCSIFBR, old_bridge_ioctl)
 HANDLE_IOCTL(SIOCGIFBR, old_bridge_ioctl)
+/* Not implemented in the native kernel */
+IGNORE_IOCTL(SIOCGIFCOUNT)
 HANDLE_IOCTL(RTC_IRQP_READ32, rtc_ioctl)
 HANDLE_IOCTL(RTC_IRQP_SET32, rtc_ioctl)
 HANDLE_IOCTL(RTC_EPOCH_READ32, rtc_ioctl)
@@ -2599,6 +2627,15 @@ COMPATIBLE_IOCTL(LPRESET)
 /*LPGETSTATS not implemented, but no kernels seem to compile it in anyways*/
 COMPATIBLE_IOCTL(LPGETFLAGS)
 HANDLE_IOCTL(LPSETTIMEOUT, lp_timeout_trans)
+
+/* fat 'r' ioctls. These are handled by fat with ->compat_ioctl,
+   but we don't want warnings on other file systems. So declare
+   them as compatible here. */
+#define VFAT_IOCTL_READDIR_BOTH32 _IOR('r', 1, struct compat_dirent[2])
+#define VFAT_IOCTL_READDIR_SHORT32 _IOR('r', 2, struct compat_dirent[2])
+
+IGNORE_IOCTL(VFAT_IOCTL_READDIR_BOTH32)
+IGNORE_IOCTL(VFAT_IOCTL_READDIR_SHORT32)
 };
 
 int ioctl_table_size = ARRAY_SIZE(ioctl_start);
diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
index 6f573004cd7d..b00d962de833 100644
--- a/fs/configfs/mount.c
+++ b/fs/configfs/mount.c
@@ -140,7 +140,7 @@ static int __init configfs_init(void)
 	if (!configfs_dir_cachep)
 		goto out;
 
-	kset_set_kset_s(&config_subsys, kernel_subsys);
+	kobj_set_kset_s(&config_subsys, kernel_subsys);
 	err = subsystem_register(&config_subsys);
 	if (err) {
 		kmem_cache_destroy(configfs_dir_cachep);
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index facd0c89be8f..3d194a2be3f5 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -180,7 +180,8 @@ static void *cramfs_read(struct super_block *sb, unsigned int offset, unsigned i
 			struct page *page = NULL;
 
 			if (blocknr + i < devsize) {
-				page = read_mapping_page(mapping, blocknr + i, NULL);
+				page = read_mapping_page_async(mapping, blocknr + i,
+					NULL);
 				/* synchronous error? */
 				if (IS_ERR(page))
 					page = NULL;
diff --git a/fs/dcache.c b/fs/dcache.c
index d68631f18df1..d1bf5d8aeb5a 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2052,12 +2052,8 @@ static void __init dcache_init(unsigned long mempages)
 	 * but it is probably not worth it because of the cache nature
 	 * of the dcache.
 	 */
-	dentry_cache = kmem_cache_create("dentry_cache",
-					 sizeof(struct dentry),
-					 0,
-					 (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
-					 SLAB_MEM_SPREAD),
-					 NULL, NULL);
+	dentry_cache = KMEM_CACHE(dentry,
+		SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD);
 
 	set_shrinker(DEFAULT_SEEKS, shrink_dcache_memory);
 
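
KMEM_CACHE() is the then-new shorthand from <linux/slab.h>; it derives the cache name, object size and alignment from the struct type itself, which is why the open-coded arguments could be dropped. Its expansion is approximately the following (the _SKETCH name marks this as an illustration, not the real macro):

/* Approximate expansion for reference; this sketch mirrors the
 * six-argument kmem_cache_create() of this kernel generation. */
#define KMEM_CACHE_SKETCH(__struct, __flags)				\
	kmem_cache_create(#__struct, sizeof(struct __struct),		\
			  __alignof__(struct __struct), (__flags),	\
			  NULL, NULL)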
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 682f928b7f4d..2e124e0075c5 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -179,6 +179,48 @@ struct dentry *debugfs_create_u32(const char *name, mode_t mode,
 }
 EXPORT_SYMBOL_GPL(debugfs_create_u32);
 
+static void debugfs_u64_set(void *data, u64 val)
+{
+	*(u64 *)data = val;
+}
+
+static u64 debugfs_u64_get(void *data)
+{
+	return *(u64 *)data;
+}
+DEFINE_SIMPLE_ATTRIBUTE(fops_u64, debugfs_u64_get, debugfs_u64_set, "%llu\n");
+
+/**
+ * debugfs_create_u64 - create a debugfs file that is used to read and write an unsigned 64-bit value
+ * @name: a pointer to a string containing the name of the file to create.
+ * @mode: the permission that the file should have
+ * @parent: a pointer to the parent dentry for this file.  This should be a
+ *          directory dentry if set.  If this parameter is %NULL, then the
+ *          file will be created in the root of the debugfs filesystem.
+ * @value: a pointer to the variable that the file should read to and write
+ *         from.
+ *
+ * This function creates a file in debugfs with the given name that
+ * contains the value of the variable @value.  If the @mode variable is so
+ * set, it can be read from, and written to.
+ *
+ * This function will return a pointer to a dentry if it succeeds.  This
+ * pointer must be passed to the debugfs_remove() function when the file is
+ * to be removed (no automatic cleanup happens if your module is unloaded,
+ * you are responsible here.)  If an error occurs, %NULL will be returned.
+ *
+ * If debugfs is not enabled in the kernel, the value -%ENODEV will be
+ * returned.  It is not wise to check for this value, but rather, check for
+ * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling
+ * code.
+ */
+struct dentry *debugfs_create_u64(const char *name, mode_t mode,
+				 struct dentry *parent, u64 *value)
+{
+	return debugfs_create_file(name, mode, parent, value, &fops_u64);
+}
+EXPORT_SYMBOL_GPL(debugfs_create_u64);
+
 static ssize_t read_file_bool(struct file *file, char __user *user_buf,
 			      size_t count, loff_t *ppos)
 {
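
Usage sketch for the new debugfs_create_u64() (module, directory and variable names below are made up for illustration); the file reads and writes the u64 directly through the fops_u64 attribute defined above:

static u64 my_counter;
static struct dentry *my_dir, *my_file;

static int __init my_debugfs_init(void)
{
	my_dir = debugfs_create_dir("mydrv", NULL);
	if (!my_dir)
		return -ENODEV;
	/* world-readable, owner-writable 64-bit counter */
	my_file = debugfs_create_u64("counter", 0644, my_dir, &my_counter);
	return my_file ? 0 : -ENODEV;
}

static void __exit my_debugfs_exit(void)
{
	debugfs_remove(my_file);	/* files first, then the directory */
	debugfs_remove(my_dir);
}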
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 7b324cfebcb1..ec8896b264de 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -374,7 +374,7 @@ static int __init debugfs_init(void)
 {
 	int retval;
 
-	kset_set_kset_s(&debug_subsys, kernel_subsys);
+	kobj_set_kset_s(&debug_subsys, kernel_subsys);
 	retval = subsystem_register(&debug_subsys);
 	if (retval)
 		return retval;
diff --git a/fs/dlm/Kconfig b/fs/dlm/Kconfig
index 6fa7b0d5c043..69a94690e493 100644
--- a/fs/dlm/Kconfig
+++ b/fs/dlm/Kconfig
@@ -3,36 +3,19 @@ menu "Distributed Lock Manager"
 
 config DLM
 	tristate "Distributed Lock Manager (DLM)"
-	depends on SYSFS && (IPV6 || IPV6=n)
+	depends on IPV6 || IPV6=n
 	select CONFIGFS_FS
-	select IP_SCTP if DLM_SCTP
+	select IP_SCTP
 	help
 	  A general purpose distributed lock manager for kernel or userspace
 	  applications.
-
-choice
-	prompt "Select DLM communications protocol"
-	depends on DLM
-	default DLM_TCP
-	help
-	  The DLM Can use TCP or SCTP for it's network communications.
-	  SCTP supports multi-homed operations whereas TCP doesn't.
-	  However, SCTP seems to have stability problems at the moment.
-
-config DLM_TCP
-	bool "TCP/IP"
-
-config DLM_SCTP
-	bool "SCTP"
-
-endchoice
 
 config DLM_DEBUG
 	bool "DLM debugging"
 	depends on DLM
 	help
 	  Under the debugfs mount point, the name of each lockspace will
 	  appear as a file in the "dlm" directory.  The output is the
 	  list of resource and locks the local node knows about.
 
 endmenu
diff --git a/fs/dlm/Makefile b/fs/dlm/Makefile
index 65388944eba0..604cf7dc5f39 100644
--- a/fs/dlm/Makefile
+++ b/fs/dlm/Makefile
@@ -8,14 +8,12 @@ dlm-y := ast.o \
 				member.o \
 				memory.o \
 				midcomms.o \
+				lowcomms.o \
 				rcom.o \
 				recover.o \
 				recoverd.o \
 				requestqueue.o \
 				user.o \
 				util.o
 dlm-$(CONFIG_DLM_DEBUG) += debug_fs.o
 
-dlm-$(CONFIG_DLM_TCP) += lowcomms-tcp.o
-
-dlm-$(CONFIG_DLM_SCTP) += lowcomms-sctp.o
\ No newline at end of file
diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c
index f91d39cb1e0b..6308122890ca 100644
--- a/fs/dlm/ast.c
+++ b/fs/dlm/ast.c
@@ -14,6 +14,7 @@
 #include "dlm_internal.h"
 #include "lock.h"
 #include "user.h"
+#include "ast.h"
 
 #define WAKE_ASTS  0
 
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index 8665c88e5af2..822abdcd1434 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -2,7 +2,7 @@
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. 5** Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved.
6** 6**
7** This copyrighted material is made available to anyone wishing to use, 7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions 8** modify, copy, or redistribute it subject to the terms and conditions
@@ -89,6 +89,7 @@ struct cluster {
89 unsigned int cl_toss_secs; 89 unsigned int cl_toss_secs;
90 unsigned int cl_scan_secs; 90 unsigned int cl_scan_secs;
91 unsigned int cl_log_debug; 91 unsigned int cl_log_debug;
92 unsigned int cl_protocol;
92}; 93};
93 94
94enum { 95enum {
@@ -101,6 +102,7 @@ enum {
101 CLUSTER_ATTR_TOSS_SECS, 102 CLUSTER_ATTR_TOSS_SECS,
102 CLUSTER_ATTR_SCAN_SECS, 103 CLUSTER_ATTR_SCAN_SECS,
103 CLUSTER_ATTR_LOG_DEBUG, 104 CLUSTER_ATTR_LOG_DEBUG,
105 CLUSTER_ATTR_PROTOCOL,
104}; 106};
105 107
106struct cluster_attribute { 108struct cluster_attribute {
@@ -159,6 +161,7 @@ CLUSTER_ATTR(recover_timer, 1);
159CLUSTER_ATTR(toss_secs, 1); 161CLUSTER_ATTR(toss_secs, 1);
160CLUSTER_ATTR(scan_secs, 1); 162CLUSTER_ATTR(scan_secs, 1);
161CLUSTER_ATTR(log_debug, 0); 163CLUSTER_ATTR(log_debug, 0);
164CLUSTER_ATTR(protocol, 0);
162 165
163static struct configfs_attribute *cluster_attrs[] = { 166static struct configfs_attribute *cluster_attrs[] = {
164 [CLUSTER_ATTR_TCP_PORT] = &cluster_attr_tcp_port.attr, 167 [CLUSTER_ATTR_TCP_PORT] = &cluster_attr_tcp_port.attr,
@@ -170,6 +173,7 @@ static struct configfs_attribute *cluster_attrs[] = {
170 [CLUSTER_ATTR_TOSS_SECS] = &cluster_attr_toss_secs.attr, 173 [CLUSTER_ATTR_TOSS_SECS] = &cluster_attr_toss_secs.attr,
171 [CLUSTER_ATTR_SCAN_SECS] = &cluster_attr_scan_secs.attr, 174 [CLUSTER_ATTR_SCAN_SECS] = &cluster_attr_scan_secs.attr,
172 [CLUSTER_ATTR_LOG_DEBUG] = &cluster_attr_log_debug.attr, 175 [CLUSTER_ATTR_LOG_DEBUG] = &cluster_attr_log_debug.attr,
176 [CLUSTER_ATTR_PROTOCOL] = &cluster_attr_protocol.attr,
173 NULL, 177 NULL,
174}; 178};
175 179
@@ -904,6 +908,7 @@ int dlm_our_addr(struct sockaddr_storage *addr, int num)
904#define DEFAULT_TOSS_SECS 10 908#define DEFAULT_TOSS_SECS 10
905#define DEFAULT_SCAN_SECS 5 909#define DEFAULT_SCAN_SECS 5
906#define DEFAULT_LOG_DEBUG 0 910#define DEFAULT_LOG_DEBUG 0
911#define DEFAULT_PROTOCOL 0
907 912
908struct dlm_config_info dlm_config = { 913struct dlm_config_info dlm_config = {
909 .ci_tcp_port = DEFAULT_TCP_PORT, 914 .ci_tcp_port = DEFAULT_TCP_PORT,
@@ -914,6 +919,7 @@ struct dlm_config_info dlm_config = {
914 .ci_recover_timer = DEFAULT_RECOVER_TIMER, 919 .ci_recover_timer = DEFAULT_RECOVER_TIMER,
915 .ci_toss_secs = DEFAULT_TOSS_SECS, 920 .ci_toss_secs = DEFAULT_TOSS_SECS,
916 .ci_scan_secs = DEFAULT_SCAN_SECS, 921 .ci_scan_secs = DEFAULT_SCAN_SECS,
917 .ci_log_debug = DEFAULT_LOG_DEBUG 922 .ci_log_debug = DEFAULT_LOG_DEBUG,
923 .ci_protocol = DEFAULT_PROTOCOL
918}; 924};
919 925
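
The new ci_protocol knob is the runtime replacement for the compile-time TCP/SCTP choice removed from Kconfig above: 0 (the default) keeps TCP and nonzero selects SCTP, set from userspace through the cluster configfs attribute before the lockspace starts. A minimal sketch of the consumer in the merged lowcomms.c, reconstructed for illustration rather than quoted from the patch:

/* sketch: dispatch on dlm_config.ci_protocol at listen time;
 * tcp_listen_for_all()/sctp_listen_for_all() are assumed to be the
 * per-transport entry points in the merged lowcomms.c */
static int listen_for_all(void)
{
	if (dlm_config.ci_protocol == 0)
		return tcp_listen_for_all();	/* default: TCP */
	return sctp_listen_for_all();		/* nonzero: SCTP */
}
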
diff --git a/fs/dlm/config.h b/fs/dlm/config.h
index 1e978611a96e..967cc3d72e5e 100644
--- a/fs/dlm/config.h
+++ b/fs/dlm/config.h
@@ -2,7 +2,7 @@
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. 5** Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved.
6** 6**
7** This copyrighted material is made available to anyone wishing to use, 7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions 8** modify, copy, or redistribute it subject to the terms and conditions
@@ -26,6 +26,7 @@ struct dlm_config_info {
26 int ci_toss_secs; 26 int ci_toss_secs;
27 int ci_scan_secs; 27 int ci_scan_secs;
28 int ci_log_debug; 28 int ci_log_debug;
29 int ci_protocol;
29}; 30};
30 31
31extern struct dlm_config_info dlm_config; 32extern struct dlm_config_info dlm_config;
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index 61d93201e1b2..30994d68f6a0 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -2,7 +2,7 @@
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. 5** Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved.
6** 6**
7** This copyrighted material is made available to anyone wishing to use, 7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions 8** modify, copy, or redistribute it subject to the terms and conditions
@@ -210,6 +210,9 @@ struct dlm_args {
210#define DLM_IFL_MSTCPY 0x00010000 210#define DLM_IFL_MSTCPY 0x00010000
211#define DLM_IFL_RESEND 0x00020000 211#define DLM_IFL_RESEND 0x00020000
212#define DLM_IFL_DEAD 0x00040000 212#define DLM_IFL_DEAD 0x00040000
213#define DLM_IFL_OVERLAP_UNLOCK 0x00080000
214#define DLM_IFL_OVERLAP_CANCEL 0x00100000
215#define DLM_IFL_ENDOFLIFE 0x00200000
213#define DLM_IFL_USER 0x00000001 216#define DLM_IFL_USER 0x00000001
214#define DLM_IFL_ORPHAN 0x00000002 217#define DLM_IFL_ORPHAN 0x00000002
215 218
@@ -230,8 +233,8 @@ struct dlm_lkb {
230 int8_t lkb_grmode; /* granted lock mode */ 233 int8_t lkb_grmode; /* granted lock mode */
231 int8_t lkb_bastmode; /* requested mode */ 234 int8_t lkb_bastmode; /* requested mode */
232 int8_t lkb_highbast; /* highest mode bast sent for */ 235 int8_t lkb_highbast; /* highest mode bast sent for */
233
234 int8_t lkb_wait_type; /* type of reply waiting for */ 236 int8_t lkb_wait_type; /* type of reply waiting for */
237 int8_t lkb_wait_count;
235 int8_t lkb_ast_type; /* type of ast queued for */ 238 int8_t lkb_ast_type; /* type of ast queued for */
236 239
237 struct list_head lkb_idtbl_list; /* lockspace lkbtbl */ 240 struct list_head lkb_idtbl_list; /* lockspace lkbtbl */
@@ -339,6 +342,7 @@ struct dlm_header {
339#define DLM_MSG_LOOKUP 11 342#define DLM_MSG_LOOKUP 11
340#define DLM_MSG_REMOVE 12 343#define DLM_MSG_REMOVE 12
341#define DLM_MSG_LOOKUP_REPLY 13 344#define DLM_MSG_LOOKUP_REPLY 13
345#define DLM_MSG_PURGE 14
342 346
343struct dlm_message { 347struct dlm_message {
344 struct dlm_header m_header; 348 struct dlm_header m_header;
@@ -440,6 +444,9 @@ struct dlm_ls {
440 struct mutex ls_waiters_mutex; 444 struct mutex ls_waiters_mutex;
441 struct list_head ls_waiters; /* lkbs needing a reply */ 445 struct list_head ls_waiters; /* lkbs needing a reply */
442 446
447 struct mutex ls_orphans_mutex;
448 struct list_head ls_orphans;
449
443 struct list_head ls_nodes; /* current nodes in ls */ 450 struct list_head ls_nodes; /* current nodes in ls */
444 struct list_head ls_nodes_gone; /* dead node list, recovery */ 451 struct list_head ls_nodes_gone; /* dead node list, recovery */
445 int ls_num_nodes; /* number of nodes in ls */ 452 int ls_num_nodes; /* number of nodes in ls */
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index e725005fafd0..d8d6e729f96b 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -1,7 +1,7 @@
1/****************************************************************************** 1/******************************************************************************
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved. 4** Copyright (C) 2005-2007 Red Hat, Inc. All rights reserved.
5** 5**
6** This copyrighted material is made available to anyone wishing to use, 6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions 7** modify, copy, or redistribute it subject to the terms and conditions
@@ -85,6 +85,7 @@ static int _request_lock(struct dlm_rsb *r, struct dlm_lkb *lkb);
85static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, 85static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
86 struct dlm_message *ms); 86 struct dlm_message *ms);
87static int receive_extralen(struct dlm_message *ms); 87static int receive_extralen(struct dlm_message *ms);
88static void do_purge(struct dlm_ls *ls, int nodeid, int pid);
88 89
89/* 90/*
90 * Lock compatibility matrix - thanks Steve 91
@@ -223,6 +224,16 @@ static inline int is_demoted(struct dlm_lkb *lkb)
223 return (lkb->lkb_sbflags & DLM_SBF_DEMOTED); 224 return (lkb->lkb_sbflags & DLM_SBF_DEMOTED);
224} 225}
225 226
227static inline int is_altmode(struct dlm_lkb *lkb)
228{
229 return (lkb->lkb_sbflags & DLM_SBF_ALTMODE);
230}
231
232static inline int is_granted(struct dlm_lkb *lkb)
233{
234 return (lkb->lkb_status == DLM_LKSTS_GRANTED);
235}
236
226static inline int is_remote(struct dlm_rsb *r) 237static inline int is_remote(struct dlm_rsb *r)
227{ 238{
228 DLM_ASSERT(r->res_nodeid >= 0, dlm_print_rsb(r);); 239 DLM_ASSERT(r->res_nodeid >= 0, dlm_print_rsb(r););
@@ -254,6 +265,22 @@ static inline int down_conversion(struct dlm_lkb *lkb)
254 return (!middle_conversion(lkb) && lkb->lkb_rqmode < lkb->lkb_grmode); 265 return (!middle_conversion(lkb) && lkb->lkb_rqmode < lkb->lkb_grmode);
255} 266}
256 267
268static inline int is_overlap_unlock(struct dlm_lkb *lkb)
269{
270 return lkb->lkb_flags & DLM_IFL_OVERLAP_UNLOCK;
271}
272
273static inline int is_overlap_cancel(struct dlm_lkb *lkb)
274{
275 return lkb->lkb_flags & DLM_IFL_OVERLAP_CANCEL;
276}
277
278static inline int is_overlap(struct dlm_lkb *lkb)
279{
280 return (lkb->lkb_flags & (DLM_IFL_OVERLAP_UNLOCK |
281 DLM_IFL_OVERLAP_CANCEL));
282}
283
257static void queue_cast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv) 284static void queue_cast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
258{ 285{
259 if (is_master_copy(lkb)) 286 if (is_master_copy(lkb))
@@ -267,6 +294,12 @@ static void queue_cast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
267 dlm_add_ast(lkb, AST_COMP); 294 dlm_add_ast(lkb, AST_COMP);
268} 295}
269 296
297static inline void queue_cast_overlap(struct dlm_rsb *r, struct dlm_lkb *lkb)
298{
299 queue_cast(r, lkb,
300 is_overlap_unlock(lkb) ? -DLM_EUNLOCK : -DLM_ECANCEL);
301}
302
270static void queue_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rqmode) 303static void queue_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rqmode)
271{ 304{
272 if (is_master_copy(lkb)) 305 if (is_master_copy(lkb))
@@ -547,6 +580,7 @@ static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret)
547 lkb->lkb_grmode = DLM_LOCK_IV; 580 lkb->lkb_grmode = DLM_LOCK_IV;
548 kref_init(&lkb->lkb_ref); 581 kref_init(&lkb->lkb_ref);
549 INIT_LIST_HEAD(&lkb->lkb_ownqueue); 582 INIT_LIST_HEAD(&lkb->lkb_ownqueue);
583 INIT_LIST_HEAD(&lkb->lkb_rsb_lookup);
550 584
551 get_random_bytes(&bucket, sizeof(bucket)); 585 get_random_bytes(&bucket, sizeof(bucket));
552 bucket &= (ls->ls_lkbtbl_size - 1); 586 bucket &= (ls->ls_lkbtbl_size - 1);
@@ -556,7 +590,7 @@ static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret)
556 /* counter can roll over so we must verify lkid is not in use */ 590 /* counter can roll over so we must verify lkid is not in use */
557 591
558 while (lkid == 0) { 592 while (lkid == 0) {
559 lkid = bucket | (ls->ls_lkbtbl[bucket].counter++ << 16); 593 lkid = (bucket << 16) | ls->ls_lkbtbl[bucket].counter++;
560 594
561 list_for_each_entry(tmp, &ls->ls_lkbtbl[bucket].list, 595 list_for_each_entry(tmp, &ls->ls_lkbtbl[bucket].list,
562 lkb_idtbl_list) { 596 lkb_idtbl_list) {
@@ -577,8 +611,8 @@ static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret)
577 611
578static struct dlm_lkb *__find_lkb(struct dlm_ls *ls, uint32_t lkid) 612static struct dlm_lkb *__find_lkb(struct dlm_ls *ls, uint32_t lkid)
579{ 613{
580 uint16_t bucket = lkid & 0xFFFF;
581 struct dlm_lkb *lkb; 614 struct dlm_lkb *lkb;
615 uint16_t bucket = (lkid >> 16);
582 616
583 list_for_each_entry(lkb, &ls->ls_lkbtbl[bucket].list, lkb_idtbl_list) { 617 list_for_each_entry(lkb, &ls->ls_lkbtbl[bucket].list, lkb_idtbl_list) {
584 if (lkb->lkb_id == lkid) 618 if (lkb->lkb_id == lkid)
@@ -590,7 +624,7 @@ static struct dlm_lkb *__find_lkb(struct dlm_ls *ls, uint32_t lkid)
590static int find_lkb(struct dlm_ls *ls, uint32_t lkid, struct dlm_lkb **lkb_ret) 624static int find_lkb(struct dlm_ls *ls, uint32_t lkid, struct dlm_lkb **lkb_ret)
591{ 625{
592 struct dlm_lkb *lkb; 626 struct dlm_lkb *lkb;
593 uint16_t bucket = lkid & 0xFFFF; 627 uint16_t bucket = (lkid >> 16);
594 628
595 if (bucket >= ls->ls_lkbtbl_size) 629 if (bucket >= ls->ls_lkbtbl_size)
596 return -EBADSLT; 630 return -EBADSLT;
@@ -620,7 +654,7 @@ static void kill_lkb(struct kref *kref)
620 654
621static int __put_lkb(struct dlm_ls *ls, struct dlm_lkb *lkb) 655static int __put_lkb(struct dlm_ls *ls, struct dlm_lkb *lkb)
622{ 656{
623 uint16_t bucket = lkb->lkb_id & 0xFFFF; 657 uint16_t bucket = (lkb->lkb_id >> 16);
624 658
625 write_lock(&ls->ls_lkbtbl[bucket].lock); 659 write_lock(&ls->ls_lkbtbl[bucket].lock);
626 if (kref_put(&lkb->lkb_ref, kill_lkb)) { 660 if (kref_put(&lkb->lkb_ref, kill_lkb)) {
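
The three lkid hunks above flip the lock id layout: the hash bucket moves from the low 16 bits to the high 16 bits, the per-bucket counter takes the low half, and create_lkb()/__find_lkb()/__put_lkb() all agree on the new decoding. Hypothetical helpers restating both layouts (these names are illustrations, not part of the patch):

/* illustrative only: lkid layout before and after this change */
static inline uint32_t make_lkid_old(uint16_t bucket, uint16_t counter)
{
	return bucket | ((uint32_t)counter << 16);  /* bucket in low half */
}

static inline uint32_t make_lkid_new(uint16_t bucket, uint16_t counter)
{
	return ((uint32_t)bucket << 16) | counter;  /* bucket in high half */
}

static inline uint16_t lkid_bucket_new(uint32_t lkid)
{
	return lkid >> 16;                          /* was: lkid & 0xFFFF */
}
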
@@ -735,23 +769,75 @@ static void move_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb, int sts)
735 unhold_lkb(lkb); 769 unhold_lkb(lkb);
736} 770}
737 771
772static int msg_reply_type(int mstype)
773{
774 switch (mstype) {
775 case DLM_MSG_REQUEST:
776 return DLM_MSG_REQUEST_REPLY;
777 case DLM_MSG_CONVERT:
778 return DLM_MSG_CONVERT_REPLY;
779 case DLM_MSG_UNLOCK:
780 return DLM_MSG_UNLOCK_REPLY;
781 case DLM_MSG_CANCEL:
782 return DLM_MSG_CANCEL_REPLY;
783 case DLM_MSG_LOOKUP:
784 return DLM_MSG_LOOKUP_REPLY;
785 }
786 return -1;
787}
788
738/* add/remove lkb from global waiters list of lkb's waiting for 789/* add/remove lkb from global waiters list of lkb's waiting for
739 a reply from a remote node */ 790 a reply from a remote node */
740 791
741static void add_to_waiters(struct dlm_lkb *lkb, int mstype) 792static int add_to_waiters(struct dlm_lkb *lkb, int mstype)
742{ 793{
743 struct dlm_ls *ls = lkb->lkb_resource->res_ls; 794 struct dlm_ls *ls = lkb->lkb_resource->res_ls;
795 int error = 0;
744 796
745 mutex_lock(&ls->ls_waiters_mutex); 797 mutex_lock(&ls->ls_waiters_mutex);
746 if (lkb->lkb_wait_type) { 798
747 log_print("add_to_waiters error %d", lkb->lkb_wait_type); 799 if (is_overlap_unlock(lkb) ||
800 (is_overlap_cancel(lkb) && (mstype == DLM_MSG_CANCEL))) {
801 error = -EINVAL;
802 goto out;
803 }
804
805 if (lkb->lkb_wait_type || is_overlap_cancel(lkb)) {
806 switch (mstype) {
807 case DLM_MSG_UNLOCK:
808 lkb->lkb_flags |= DLM_IFL_OVERLAP_UNLOCK;
809 break;
810 case DLM_MSG_CANCEL:
811 lkb->lkb_flags |= DLM_IFL_OVERLAP_CANCEL;
812 break;
813 default:
814 error = -EBUSY;
815 goto out;
816 }
817 lkb->lkb_wait_count++;
818 hold_lkb(lkb);
819
820 log_debug(ls, "add overlap %x cur %d new %d count %d flags %x",
821 lkb->lkb_id, lkb->lkb_wait_type, mstype,
822 lkb->lkb_wait_count, lkb->lkb_flags);
748 goto out; 823 goto out;
749 } 824 }
825
826 DLM_ASSERT(!lkb->lkb_wait_count,
827 dlm_print_lkb(lkb);
828 printk("wait_count %d\n", lkb->lkb_wait_count););
829
830 lkb->lkb_wait_count++;
750 lkb->lkb_wait_type = mstype; 831 lkb->lkb_wait_type = mstype;
751 kref_get(&lkb->lkb_ref); 832 hold_lkb(lkb);
752 list_add(&lkb->lkb_wait_reply, &ls->ls_waiters); 833 list_add(&lkb->lkb_wait_reply, &ls->ls_waiters);
753 out: 834 out:
835 if (error)
836 log_error(ls, "add_to_waiters %x error %d flags %x %d %d %s",
837 lkb->lkb_id, error, lkb->lkb_flags, mstype,
838 lkb->lkb_wait_type, lkb->lkb_resource->res_name);
754 mutex_unlock(&ls->ls_waiters_mutex); 839 mutex_unlock(&ls->ls_waiters_mutex);
840 return error;
755} 841}
756 842
757/* We clear the RESEND flag because we might be taking an lkb off the waiters 843/* We clear the RESEND flag because we might be taking an lkb off the waiters
@@ -759,34 +845,85 @@ static void add_to_waiters(struct dlm_lkb *lkb, int mstype)
759 request reply on the requestqueue) between dlm_recover_waiters_pre() which 845 request reply on the requestqueue) between dlm_recover_waiters_pre() which
760 set RESEND and dlm_recover_waiters_post() */ 846 set RESEND and dlm_recover_waiters_post() */
761 847
762static int _remove_from_waiters(struct dlm_lkb *lkb) 848static int _remove_from_waiters(struct dlm_lkb *lkb, int mstype)
763{ 849{
764 int error = 0; 850 struct dlm_ls *ls = lkb->lkb_resource->res_ls;
851 int overlap_done = 0;
765 852
766 if (!lkb->lkb_wait_type) { 853 if (is_overlap_unlock(lkb) && (mstype == DLM_MSG_UNLOCK_REPLY)) {
767 log_print("remove_from_waiters error"); 854 lkb->lkb_flags &= ~DLM_IFL_OVERLAP_UNLOCK;
768 error = -EINVAL; 855 overlap_done = 1;
769 goto out; 856 goto out_del;
857 }
858
859 if (is_overlap_cancel(lkb) && (mstype == DLM_MSG_CANCEL_REPLY)) {
860 lkb->lkb_flags &= ~DLM_IFL_OVERLAP_CANCEL;
861 overlap_done = 1;
862 goto out_del;
863 }
864
865 /* N.B. type of reply may not always correspond to type of original
866 msg due to lookup->request optimization, verify others? */
867
868 if (lkb->lkb_wait_type) {
869 lkb->lkb_wait_type = 0;
870 goto out_del;
871 }
872
873 log_error(ls, "remove_from_waiters lkid %x flags %x types %d %d",
874 lkb->lkb_id, lkb->lkb_flags, mstype, lkb->lkb_wait_type);
875 return -1;
876
877 out_del:
878 /* the force-unlock/cancel has completed and we haven't recvd a reply
879 to the op that was in progress prior to the unlock/cancel; we
880 give up on any reply to the earlier op. FIXME: not sure when/how
881 this would happen */
882
883 if (overlap_done && lkb->lkb_wait_type) {
884 log_error(ls, "remove_from_waiters %x reply %d give up on %d",
885 lkb->lkb_id, mstype, lkb->lkb_wait_type);
886 lkb->lkb_wait_count--;
887 lkb->lkb_wait_type = 0;
770 } 888 }
771 lkb->lkb_wait_type = 0; 889
890 DLM_ASSERT(lkb->lkb_wait_count, dlm_print_lkb(lkb););
891
772 lkb->lkb_flags &= ~DLM_IFL_RESEND; 892 lkb->lkb_flags &= ~DLM_IFL_RESEND;
773 list_del(&lkb->lkb_wait_reply); 893 lkb->lkb_wait_count--;
894 if (!lkb->lkb_wait_count)
895 list_del_init(&lkb->lkb_wait_reply);
774 unhold_lkb(lkb); 896 unhold_lkb(lkb);
775 out: 897 return 0;
776 return error;
777} 898}
778 899
779static int remove_from_waiters(struct dlm_lkb *lkb) 900static int remove_from_waiters(struct dlm_lkb *lkb, int mstype)
780{ 901{
781 struct dlm_ls *ls = lkb->lkb_resource->res_ls; 902 struct dlm_ls *ls = lkb->lkb_resource->res_ls;
782 int error; 903 int error;
783 904
784 mutex_lock(&ls->ls_waiters_mutex); 905 mutex_lock(&ls->ls_waiters_mutex);
785 error = _remove_from_waiters(lkb); 906 error = _remove_from_waiters(lkb, mstype);
786 mutex_unlock(&ls->ls_waiters_mutex); 907 mutex_unlock(&ls->ls_waiters_mutex);
787 return error; 908 return error;
788} 909}
789 910
911/* Handles situations where we might be processing a "fake" or "stub" reply in
912 which we can't try to take waiters_mutex again. */
913
914static int remove_from_waiters_ms(struct dlm_lkb *lkb, struct dlm_message *ms)
915{
916 struct dlm_ls *ls = lkb->lkb_resource->res_ls;
917 int error;
918
919 if (ms != &ls->ls_stub_ms)
920 mutex_lock(&ls->ls_waiters_mutex);
921 error = _remove_from_waiters(lkb, ms->m_type);
922 if (ms != &ls->ls_stub_ms)
923 mutex_unlock(&ls->ls_waiters_mutex);
924 return error;
925}
926
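
Taken together, the reworked add_to_waiters()/_remove_from_waiters() let a single lkb wait for the reply to its original op plus one overlapping unlock and/or cancel, tracked by lkb_wait_count and the two OVERLAP flags rather than extra waiters-list entries. A simplified standalone model of that bookkeeping; the flag values mirror dlm_internal.h, everything else is invented for illustration:

#include <stdio.h>

#define DLM_IFL_OVERLAP_UNLOCK 0x00080000
#define DLM_IFL_OVERLAP_CANCEL 0x00100000

enum { MSG_REQUEST = 1, MSG_UNLOCK, MSG_CANCEL,
       MSG_REQUEST_REPLY, MSG_UNLOCK_REPLY, MSG_CANCEL_REPLY };

struct lkb { int wait_type; int wait_count; unsigned int flags; };

/* models add_to_waiters(): an unlock/cancel overlapping a pending op
 * sets a flag and bumps wait_count instead of queueing a second entry */
static int add_waiter(struct lkb *lkb, int mstype)
{
	if (lkb->wait_type) {
		if (mstype == MSG_UNLOCK)
			lkb->flags |= DLM_IFL_OVERLAP_UNLOCK;
		else if (mstype == MSG_CANCEL)
			lkb->flags |= DLM_IFL_OVERLAP_CANCEL;
		else
			return -1;	/* -EBUSY in the kernel */
		lkb->wait_count++;
		return 0;
	}
	lkb->wait_type = mstype;
	lkb->wait_count = 1;
	return 0;
}

/* models _remove_from_waiters(): each reply retires one count; the lkb
 * leaves the waiters list only when the last outstanding op is answered */
static void take_reply(struct lkb *lkb, int mstype)
{
	if (mstype == MSG_UNLOCK_REPLY &&
	    (lkb->flags & DLM_IFL_OVERLAP_UNLOCK))
		lkb->flags &= ~DLM_IFL_OVERLAP_UNLOCK;
	else if (mstype == MSG_CANCEL_REPLY &&
		 (lkb->flags & DLM_IFL_OVERLAP_CANCEL))
		lkb->flags &= ~DLM_IFL_OVERLAP_CANCEL;
	else
		lkb->wait_type = 0;
	lkb->wait_count--;
}

int main(void)
{
	struct lkb lkb = { 0, 0, 0 };

	add_waiter(&lkb, MSG_REQUEST);		/* request in flight */
	add_waiter(&lkb, MSG_CANCEL);		/* overlapping cancel */
	take_reply(&lkb, MSG_REQUEST_REPLY);	/* 2 -> 1, still waiting */
	take_reply(&lkb, MSG_CANCEL_REPLY);	/* 1 -> 0, off the list */
	printf("wait_count %d flags %x\n", lkb.wait_count, lkb.flags);
	return 0;
}
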
790static void dir_remove(struct dlm_rsb *r) 927static void dir_remove(struct dlm_rsb *r)
791{ 928{
792 int to_nodeid; 929 int to_nodeid;
@@ -988,8 +1125,14 @@ static void remove_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb)
988 _remove_lock(r, lkb); 1125 _remove_lock(r, lkb);
989} 1126}
990 1127
991static void revert_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) 1128/* returns: 0 did nothing
1129 1 moved lock to granted
1130 -1 removed lock */
1131
1132static int revert_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
992{ 1133{
1134 int rv = 0;
1135
993 lkb->lkb_rqmode = DLM_LOCK_IV; 1136 lkb->lkb_rqmode = DLM_LOCK_IV;
994 1137
995 switch (lkb->lkb_status) { 1138 switch (lkb->lkb_status) {
@@ -997,6 +1140,7 @@ static void revert_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
997 break; 1140 break;
998 case DLM_LKSTS_CONVERT: 1141 case DLM_LKSTS_CONVERT:
999 move_lkb(r, lkb, DLM_LKSTS_GRANTED); 1142 move_lkb(r, lkb, DLM_LKSTS_GRANTED);
1143 rv = 1;
1000 break; 1144 break;
1001 case DLM_LKSTS_WAITING: 1145 case DLM_LKSTS_WAITING:
1002 del_lkb(r, lkb); 1146 del_lkb(r, lkb);
@@ -1004,15 +1148,17 @@ static void revert_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
1004 /* this unhold undoes the original ref from create_lkb() 1148 /* this unhold undoes the original ref from create_lkb()
1005 so this leads to the lkb being freed */ 1149 so this leads to the lkb being freed */
1006 unhold_lkb(lkb); 1150 unhold_lkb(lkb);
1151 rv = -1;
1007 break; 1152 break;
1008 default: 1153 default:
1009 log_print("invalid status for revert %d", lkb->lkb_status); 1154 log_print("invalid status for revert %d", lkb->lkb_status);
1010 } 1155 }
1156 return rv;
1011} 1157}
1012 1158
1013static void revert_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb) 1159static int revert_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb)
1014{ 1160{
1015 revert_lock(r, lkb); 1161 return revert_lock(r, lkb);
1016} 1162}
1017 1163
1018static void _grant_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) 1164static void _grant_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
@@ -1055,6 +1201,50 @@ static void grant_lock_pending(struct dlm_rsb *r, struct dlm_lkb *lkb)
1055 queue_cast(r, lkb, 0); 1201 queue_cast(r, lkb, 0);
1056} 1202}
1057 1203
1204/* The special CONVDEADLK, ALTPR and ALTCW flags allow the master to
1205 change the granted/requested modes. We're munging things accordingly in
1206 the process copy.
1207 CONVDEADLK: our grmode may have been forced down to NL to resolve a
1208 conversion deadlock
1209 ALTPR/ALTCW: our rqmode may have been changed to PR or CW to become
1210 compatible with other granted locks */
1211
1212static void munge_demoted(struct dlm_lkb *lkb, struct dlm_message *ms)
1213{
1214 if (ms->m_type != DLM_MSG_CONVERT_REPLY) {
1215 log_print("munge_demoted %x invalid reply type %d",
1216 lkb->lkb_id, ms->m_type);
1217 return;
1218 }
1219
1220 if (lkb->lkb_rqmode == DLM_LOCK_IV || lkb->lkb_grmode == DLM_LOCK_IV) {
1221 log_print("munge_demoted %x invalid modes gr %d rq %d",
1222 lkb->lkb_id, lkb->lkb_grmode, lkb->lkb_rqmode);
1223 return;
1224 }
1225
1226 lkb->lkb_grmode = DLM_LOCK_NL;
1227}
1228
1229static void munge_altmode(struct dlm_lkb *lkb, struct dlm_message *ms)
1230{
1231 if (ms->m_type != DLM_MSG_REQUEST_REPLY &&
1232 ms->m_type != DLM_MSG_GRANT) {
1233 log_print("munge_altmode %x invalid reply type %d",
1234 lkb->lkb_id, ms->m_type);
1235 return;
1236 }
1237
1238 if (lkb->lkb_exflags & DLM_LKF_ALTPR)
1239 lkb->lkb_rqmode = DLM_LOCK_PR;
1240 else if (lkb->lkb_exflags & DLM_LKF_ALTCW)
1241 lkb->lkb_rqmode = DLM_LOCK_CW;
1242 else {
1243 log_print("munge_altmode invalid exflags %x", lkb->lkb_exflags);
1244 dlm_print_lkb(lkb);
1245 }
1246}
1247
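
For callers, the ALTMODE munging means a request made with DLM_LKF_ALTPR or DLM_LKF_ALTCW may come back granted in the alternate mode, reported through DLM_SBF_ALTMODE in the lksb. A kernel-API sketch; the lockspace, resource name and ast are assumptions and error handling is trimmed:

#include <linux/dlm.h>

static struct dlm_lksb my_lksb;

static void my_ast(void *astarg)
{
	struct dlm_lksb *lksb = astarg;

	/* granted, but possibly in PR rather than the EX we asked for */
	if (lksb->sb_status == 0 && (lksb->sb_flags & DLM_SBF_ALTMODE))
		printk(KERN_INFO "granted in alternate mode\n");
}

static int try_ex_or_pr(dlm_lockspace_t *ls)
{
	return dlm_lock(ls, DLM_LOCK_EX, &my_lksb, DLM_LKF_ALTPR,
			"myres", 5, 0, my_ast, &my_lksb, NULL);
}
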
1058static inline int first_in_list(struct dlm_lkb *lkb, struct list_head *head) 1248static inline int first_in_list(struct dlm_lkb *lkb, struct list_head *head)
1059{ 1249{
1060 struct dlm_lkb *first = list_entry(head->next, struct dlm_lkb, 1250 struct dlm_lkb *first = list_entry(head->next, struct dlm_lkb,
@@ -1499,7 +1689,7 @@ static void process_lookup_list(struct dlm_rsb *r)
1499 struct dlm_lkb *lkb, *safe; 1689 struct dlm_lkb *lkb, *safe;
1500 1690
1501 list_for_each_entry_safe(lkb, safe, &r->res_lookup, lkb_rsb_lookup) { 1691 list_for_each_entry_safe(lkb, safe, &r->res_lookup, lkb_rsb_lookup) {
1502 list_del(&lkb->lkb_rsb_lookup); 1692 list_del_init(&lkb->lkb_rsb_lookup);
1503 _request_lock(r, lkb); 1693 _request_lock(r, lkb);
1504 schedule(); 1694 schedule();
1505 } 1695 }
@@ -1530,7 +1720,7 @@ static void confirm_master(struct dlm_rsb *r, int error)
1530 if (!list_empty(&r->res_lookup)) { 1720 if (!list_empty(&r->res_lookup)) {
1531 lkb = list_entry(r->res_lookup.next, struct dlm_lkb, 1721 lkb = list_entry(r->res_lookup.next, struct dlm_lkb,
1532 lkb_rsb_lookup); 1722 lkb_rsb_lookup);
1533 list_del(&lkb->lkb_rsb_lookup); 1723 list_del_init(&lkb->lkb_rsb_lookup);
1534 r->res_first_lkid = lkb->lkb_id; 1724 r->res_first_lkid = lkb->lkb_id;
1535 _request_lock(r, lkb); 1725 _request_lock(r, lkb);
1536 } else 1726 } else
@@ -1614,6 +1804,9 @@ static int set_unlock_args(uint32_t flags, void *astarg, struct dlm_args *args)
1614 DLM_LKF_FORCEUNLOCK)) 1804 DLM_LKF_FORCEUNLOCK))
1615 return -EINVAL; 1805 return -EINVAL;
1616 1806
1807 if (flags & DLM_LKF_CANCEL && flags & DLM_LKF_FORCEUNLOCK)
1808 return -EINVAL;
1809
1617 args->flags = flags; 1810 args->flags = flags;
1618 args->astparam = (long) astarg; 1811 args->astparam = (long) astarg;
1619 return 0; 1812 return 0;
@@ -1638,6 +1831,9 @@ static int validate_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
1638 1831
1639 if (lkb->lkb_wait_type) 1832 if (lkb->lkb_wait_type)
1640 goto out; 1833 goto out;
1834
1835 if (is_overlap(lkb))
1836 goto out;
1641 } 1837 }
1642 1838
1643 lkb->lkb_exflags = args->flags; 1839 lkb->lkb_exflags = args->flags;
@@ -1654,35 +1850,126 @@ static int validate_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
1654 return rv; 1850 return rv;
1655} 1851}
1656 1852
1853/* when dlm_unlock() sees -EBUSY with CANCEL/FORCEUNLOCK it returns 0
1854 for success */
1855
1856/* note: it's valid for lkb_nodeid/res_nodeid to be -1 when we get here
1857 because there may be a lookup in progress and it's valid to do
1858 cancel/unlockf on it */
1859
1657static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args) 1860static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args)
1658{ 1861{
1862 struct dlm_ls *ls = lkb->lkb_resource->res_ls;
1659 int rv = -EINVAL; 1863 int rv = -EINVAL;
1660 1864
1661 if (lkb->lkb_flags & DLM_IFL_MSTCPY) 1865 if (lkb->lkb_flags & DLM_IFL_MSTCPY) {
1866 log_error(ls, "unlock on MSTCPY %x", lkb->lkb_id);
1867 dlm_print_lkb(lkb);
1662 goto out; 1868 goto out;
1869 }
1663 1870
1664 if (args->flags & DLM_LKF_FORCEUNLOCK) 1871 /* an lkb may still exist even though the lock is EOL'ed due to a
1665 goto out_ok; 1872 cancel, unlock or failed noqueue request; an app can't use these
1873 locks; return same error as if the lkid had not been found at all */
1666 1874
1667 if (args->flags & DLM_LKF_CANCEL && 1875 if (lkb->lkb_flags & DLM_IFL_ENDOFLIFE) {
1668 lkb->lkb_status == DLM_LKSTS_GRANTED) 1876 log_debug(ls, "unlock on ENDOFLIFE %x", lkb->lkb_id);
1877 rv = -ENOENT;
1669 goto out; 1878 goto out;
1879 }
1670 1880
1671 if (!(args->flags & DLM_LKF_CANCEL) && 1881 /* an lkb may be waiting for an rsb lookup to complete where the
1672 lkb->lkb_status != DLM_LKSTS_GRANTED) 1882 lookup was initiated by another lock */
1673 goto out; 1883
1884 if (args->flags & (DLM_LKF_CANCEL | DLM_LKF_FORCEUNLOCK)) {
1885 if (!list_empty(&lkb->lkb_rsb_lookup)) {
1886 log_debug(ls, "unlock on rsb_lookup %x", lkb->lkb_id);
1887 list_del_init(&lkb->lkb_rsb_lookup);
1888 queue_cast(lkb->lkb_resource, lkb,
1889 args->flags & DLM_LKF_CANCEL ?
1890 -DLM_ECANCEL : -DLM_EUNLOCK);
1891 unhold_lkb(lkb); /* undoes create_lkb() */
1892 rv = -EBUSY;
1893 goto out;
1894 }
1895 }
1896
1897 /* cancel not allowed with another cancel/unlock in progress */
1898
1899 if (args->flags & DLM_LKF_CANCEL) {
1900 if (lkb->lkb_exflags & DLM_LKF_CANCEL)
1901 goto out;
1902
1903 if (is_overlap(lkb))
1904 goto out;
1905
1906 if (lkb->lkb_flags & DLM_IFL_RESEND) {
1907 lkb->lkb_flags |= DLM_IFL_OVERLAP_CANCEL;
1908 rv = -EBUSY;
1909 goto out;
1910 }
1911
1912 switch (lkb->lkb_wait_type) {
1913 case DLM_MSG_LOOKUP:
1914 case DLM_MSG_REQUEST:
1915 lkb->lkb_flags |= DLM_IFL_OVERLAP_CANCEL;
1916 rv = -EBUSY;
1917 goto out;
1918 case DLM_MSG_UNLOCK:
1919 case DLM_MSG_CANCEL:
1920 goto out;
1921 }
1922 /* add_to_waiters() will set OVERLAP_CANCEL */
1923 goto out_ok;
1924 }
1925
1926 /* do we need to allow a force-unlock if there's a normal unlock
1927 already in progress? in what conditions could the normal unlock
1928 fail such that we'd want to send a force-unlock to be sure? */
1929
1930 if (args->flags & DLM_LKF_FORCEUNLOCK) {
1931 if (lkb->lkb_exflags & DLM_LKF_FORCEUNLOCK)
1932 goto out;
1933
1934 if (is_overlap_unlock(lkb))
1935 goto out;
1674 1936
1937 if (lkb->lkb_flags & DLM_IFL_RESEND) {
1938 lkb->lkb_flags |= DLM_IFL_OVERLAP_UNLOCK;
1939 rv = -EBUSY;
1940 goto out;
1941 }
1942
1943 switch (lkb->lkb_wait_type) {
1944 case DLM_MSG_LOOKUP:
1945 case DLM_MSG_REQUEST:
1946 lkb->lkb_flags |= DLM_IFL_OVERLAP_UNLOCK;
1947 rv = -EBUSY;
1948 goto out;
1949 case DLM_MSG_UNLOCK:
1950 goto out;
1951 }
1952 /* add_to_waiters() will set OVERLAP_UNLOCK */
1953 goto out_ok;
1954 }
1955
1956 /* normal unlock not allowed if there's any op in progress */
1675 rv = -EBUSY; 1957 rv = -EBUSY;
1676 if (lkb->lkb_wait_type) 1958 if (lkb->lkb_wait_type || lkb->lkb_wait_count)
1677 goto out; 1959 goto out;
1678 1960
1679 out_ok: 1961 out_ok:
1680 lkb->lkb_exflags = args->flags; 1962 /* an overlapping op shouldn't blow away exflags from other op */
1963 lkb->lkb_exflags |= args->flags;
1681 lkb->lkb_sbflags = 0; 1964 lkb->lkb_sbflags = 0;
1682 lkb->lkb_astparam = args->astparam; 1965 lkb->lkb_astparam = args->astparam;
1683
1684 rv = 0; 1966 rv = 0;
1685 out: 1967 out:
1968 if (rv)
1969 log_debug(ls, "validate_unlock_args %d %x %x %x %x %d %s", rv,
1970 lkb->lkb_id, lkb->lkb_flags, lkb->lkb_exflags,
1971 args->flags, lkb->lkb_wait_type,
1972 lkb->lkb_resource->res_name);
1686 return rv; 1973 return rv;
1687} 1974}
1688 1975
@@ -1732,9 +2019,24 @@ static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
1732 goto out; 2019 goto out;
1733 } 2020 }
1734 2021
1735 if (can_be_queued(lkb)) { 2022 /* is_demoted() means the can_be_granted() above set the grmode
1736 if (is_demoted(lkb)) 2023 to NL, and left us on the granted queue. This auto-demotion
2024 (due to CONVDEADLK) might mean other locks, and/or this lock, are
2025 now grantable. We have to try to grant other converting locks
2026 before we try again to grant this one. */
2027
2028 if (is_demoted(lkb)) {
2029 grant_pending_convert(r, DLM_LOCK_IV);
2030 if (_can_be_granted(r, lkb, 1)) {
2031 grant_lock(r, lkb);
2032 queue_cast(r, lkb, 0);
1737 grant_pending_locks(r); 2033 grant_pending_locks(r);
2034 goto out;
2035 }
2036 /* else fall through and move to convert queue */
2037 }
2038
2039 if (can_be_queued(lkb)) {
1738 error = -EINPROGRESS; 2040 error = -EINPROGRESS;
1739 del_lkb(r, lkb); 2041 del_lkb(r, lkb);
1740 add_lkb(r, lkb, DLM_LKSTS_CONVERT); 2042 add_lkb(r, lkb, DLM_LKSTS_CONVERT);
@@ -1759,17 +2061,19 @@ static int do_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
1759 return -DLM_EUNLOCK; 2061 return -DLM_EUNLOCK;
1760} 2062}
1761 2063
1762/* FIXME: if revert_lock() finds that the lkb is granted, we should 2064/* returns: 0 did nothing, -DLM_ECANCEL canceled lock */
1763 skip the queue_cast(ECANCEL). It indicates that the request/convert
1764 completed (and queued a normal ast) just before the cancel; we don't
1765 want to clobber the sb_result for the normal ast with ECANCEL. */
1766 2065
1767static int do_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb) 2066static int do_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb)
1768{ 2067{
1769 revert_lock(r, lkb); 2068 int error;
1770 queue_cast(r, lkb, -DLM_ECANCEL); 2069
1771 grant_pending_locks(r); 2070 error = revert_lock(r, lkb);
1772 return -DLM_ECANCEL; 2071 if (error) {
2072 queue_cast(r, lkb, -DLM_ECANCEL);
2073 grant_pending_locks(r);
2074 return -DLM_ECANCEL;
2075 }
2076 return 0;
1773} 2077}
1774 2078
1775/* 2079/*
@@ -2035,6 +2339,8 @@ int dlm_unlock(dlm_lockspace_t *lockspace,
2035 2339
2036 if (error == -DLM_EUNLOCK || error == -DLM_ECANCEL) 2340 if (error == -DLM_EUNLOCK || error == -DLM_ECANCEL)
2037 error = 0; 2341 error = 0;
2342 if (error == -EBUSY && (flags & (DLM_LKF_CANCEL | DLM_LKF_FORCEUNLOCK)))
2343 error = 0;
2038 out_put: 2344 out_put:
2039 dlm_put_lkb(lkb); 2345 dlm_put_lkb(lkb);
2040 out: 2346 out:
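
The net effect for API users: with DLM_LKF_CANCEL or DLM_LKF_FORCEUNLOCK, dlm_unlock() now also returns 0 when the op could only be recorded as overlapping (-EBUSY internally), and the caller learns the actual outcome from the completion ast. A hedged sketch; ls, lkid and lksb are assumed to come from an earlier dlm_lock():

#include <linux/dlm.h>

static int cancel_pending(dlm_lockspace_t *ls, uint32_t lkid,
			  struct dlm_lksb *lksb)
{
	int error = dlm_unlock(ls, lkid, DLM_LKF_CANCEL, lksb, lksb);

	/* 0 covers both "cancel sent" and "another op already in
	 * progress"; the real outcome arrives in the completion ast as
	 * sb_status: -DLM_ECANCEL if the cancel won, or the original
	 * request's result if it completed first */
	return error;
}
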
@@ -2065,31 +2371,14 @@ int dlm_unlock(dlm_lockspace_t *lockspace,
2065 * receive_lookup_reply send_lookup_reply 2371 * receive_lookup_reply send_lookup_reply
2066 */ 2372 */
2067 2373
2068static int create_message(struct dlm_rsb *r, struct dlm_lkb *lkb, 2374static int _create_message(struct dlm_ls *ls, int mb_len,
2069 int to_nodeid, int mstype, 2375 int to_nodeid, int mstype,
2070 struct dlm_message **ms_ret, 2376 struct dlm_message **ms_ret,
2071 struct dlm_mhandle **mh_ret) 2377 struct dlm_mhandle **mh_ret)
2072{ 2378{
2073 struct dlm_message *ms; 2379 struct dlm_message *ms;
2074 struct dlm_mhandle *mh; 2380 struct dlm_mhandle *mh;
2075 char *mb; 2381 char *mb;
2076 int mb_len = sizeof(struct dlm_message);
2077
2078 switch (mstype) {
2079 case DLM_MSG_REQUEST:
2080 case DLM_MSG_LOOKUP:
2081 case DLM_MSG_REMOVE:
2082 mb_len += r->res_length;
2083 break;
2084 case DLM_MSG_CONVERT:
2085 case DLM_MSG_UNLOCK:
2086 case DLM_MSG_REQUEST_REPLY:
2087 case DLM_MSG_CONVERT_REPLY:
2088 case DLM_MSG_GRANT:
2089 if (lkb && lkb->lkb_lvbptr)
2090 mb_len += r->res_ls->ls_lvblen;
2091 break;
2092 }
2093 2382
2094 /* get_buffer gives us a message handle (mh) that we need to 2383 /* get_buffer gives us a message handle (mh) that we need to
2095 pass into lowcomms_commit and a message buffer (mb) that we 2384 pass into lowcomms_commit and a message buffer (mb) that we
@@ -2104,7 +2393,7 @@ static int create_message(struct dlm_rsb *r, struct dlm_lkb *lkb,
2104 ms = (struct dlm_message *) mb; 2393 ms = (struct dlm_message *) mb;
2105 2394
2106 ms->m_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR); 2395 ms->m_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
2107 ms->m_header.h_lockspace = r->res_ls->ls_global_id; 2396 ms->m_header.h_lockspace = ls->ls_global_id;
2108 ms->m_header.h_nodeid = dlm_our_nodeid(); 2397 ms->m_header.h_nodeid = dlm_our_nodeid();
2109 ms->m_header.h_length = mb_len; 2398 ms->m_header.h_length = mb_len;
2110 ms->m_header.h_cmd = DLM_MSG; 2399 ms->m_header.h_cmd = DLM_MSG;
@@ -2116,6 +2405,33 @@ static int create_message(struct dlm_rsb *r, struct dlm_lkb *lkb,
2116 return 0; 2405 return 0;
2117} 2406}
2118 2407
2408static int create_message(struct dlm_rsb *r, struct dlm_lkb *lkb,
2409 int to_nodeid, int mstype,
2410 struct dlm_message **ms_ret,
2411 struct dlm_mhandle **mh_ret)
2412{
2413 int mb_len = sizeof(struct dlm_message);
2414
2415 switch (mstype) {
2416 case DLM_MSG_REQUEST:
2417 case DLM_MSG_LOOKUP:
2418 case DLM_MSG_REMOVE:
2419 mb_len += r->res_length;
2420 break;
2421 case DLM_MSG_CONVERT:
2422 case DLM_MSG_UNLOCK:
2423 case DLM_MSG_REQUEST_REPLY:
2424 case DLM_MSG_CONVERT_REPLY:
2425 case DLM_MSG_GRANT:
2426 if (lkb && lkb->lkb_lvbptr)
2427 mb_len += r->res_ls->ls_lvblen;
2428 break;
2429 }
2430
2431 return _create_message(r->res_ls, mb_len, to_nodeid, mstype,
2432 ms_ret, mh_ret);
2433}
2434
2119/* further lowcomms enhancements or alternate implementations may make 2435/* further lowcomms enhancements or alternate implementations may make
2120 the return value from this function useful at some point */ 2436 the return value from this function useful at some point */
2121 2437
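
Splitting the buffer setup out into _create_message() lets callers that have no rsb at hand, such as the purge path, build a message directly. A sketch of how the new DLM_MSG_PURGE sender plausibly uses it; the helper body is reconstructed, not quoted from this hunk:

/* assumed sender for DLM_MSG_PURGE; pairs with receive_purge() below,
 * which reads m_nodeid and m_pid */
static int send_purge(struct dlm_ls *ls, int nodeid, int pid)
{
	struct dlm_message *ms;
	struct dlm_mhandle *mh;
	int error;

	error = _create_message(ls, sizeof(struct dlm_message), nodeid,
				DLM_MSG_PURGE, &ms, &mh);
	if (error)
		return error;
	ms->m_nodeid = nodeid;
	ms->m_pid = pid;

	return send_message(mh, ms);
}
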
@@ -2176,7 +2492,9 @@ static int send_common(struct dlm_rsb *r, struct dlm_lkb *lkb, int mstype)
2176 struct dlm_mhandle *mh; 2492 struct dlm_mhandle *mh;
2177 int to_nodeid, error; 2493 int to_nodeid, error;
2178 2494
2179 add_to_waiters(lkb, mstype); 2495 error = add_to_waiters(lkb, mstype);
2496 if (error)
2497 return error;
2180 2498
2181 to_nodeid = r->res_nodeid; 2499 to_nodeid = r->res_nodeid;
2182 2500
@@ -2192,7 +2510,7 @@ static int send_common(struct dlm_rsb *r, struct dlm_lkb *lkb, int mstype)
2192 return 0; 2510 return 0;
2193 2511
2194 fail: 2512 fail:
2195 remove_from_waiters(lkb); 2513 remove_from_waiters(lkb, msg_reply_type(mstype));
2196 return error; 2514 return error;
2197} 2515}
2198 2516
@@ -2209,7 +2527,8 @@ static int send_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
2209 2527
2210 /* down conversions go without a reply from the master */ 2528 /* down conversions go without a reply from the master */
2211 if (!error && down_conversion(lkb)) { 2529 if (!error && down_conversion(lkb)) {
2212 remove_from_waiters(lkb); 2530 remove_from_waiters(lkb, DLM_MSG_CONVERT_REPLY);
2531 r->res_ls->ls_stub_ms.m_type = DLM_MSG_CONVERT_REPLY;
2213 r->res_ls->ls_stub_ms.m_result = 0; 2532 r->res_ls->ls_stub_ms.m_result = 0;
2214 r->res_ls->ls_stub_ms.m_flags = lkb->lkb_flags; 2533 r->res_ls->ls_stub_ms.m_flags = lkb->lkb_flags;
2215 __receive_convert_reply(r, lkb, &r->res_ls->ls_stub_ms); 2534 __receive_convert_reply(r, lkb, &r->res_ls->ls_stub_ms);
@@ -2280,7 +2599,9 @@ static int send_lookup(struct dlm_rsb *r, struct dlm_lkb *lkb)
2280 struct dlm_mhandle *mh; 2599 struct dlm_mhandle *mh;
2281 int to_nodeid, error; 2600 int to_nodeid, error;
2282 2601
2283 add_to_waiters(lkb, DLM_MSG_LOOKUP); 2602 error = add_to_waiters(lkb, DLM_MSG_LOOKUP);
2603 if (error)
2604 return error;
2284 2605
2285 to_nodeid = dlm_dir_nodeid(r); 2606 to_nodeid = dlm_dir_nodeid(r);
2286 2607
@@ -2296,7 +2617,7 @@ static int send_lookup(struct dlm_rsb *r, struct dlm_lkb *lkb)
2296 return 0; 2617 return 0;
2297 2618
2298 fail: 2619 fail:
2299 remove_from_waiters(lkb); 2620 remove_from_waiters(lkb, DLM_MSG_LOOKUP_REPLY);
2300 return error; 2621 return error;
2301} 2622}
2302 2623
@@ -2656,6 +2977,8 @@ static void receive_grant(struct dlm_ls *ls, struct dlm_message *ms)
2656 lock_rsb(r); 2977 lock_rsb(r);
2657 2978
2658 receive_flags_reply(lkb, ms); 2979 receive_flags_reply(lkb, ms);
2980 if (is_altmode(lkb))
2981 munge_altmode(lkb, ms);
2659 grant_lock_pc(r, lkb, ms); 2982 grant_lock_pc(r, lkb, ms);
2660 queue_cast(r, lkb, 0); 2983 queue_cast(r, lkb, 0);
2661 2984
@@ -2736,11 +3059,16 @@ static void receive_remove(struct dlm_ls *ls, struct dlm_message *ms)
2736 dlm_dir_remove_entry(ls, from_nodeid, ms->m_extra, len); 3059 dlm_dir_remove_entry(ls, from_nodeid, ms->m_extra, len);
2737} 3060}
2738 3061
3062static void receive_purge(struct dlm_ls *ls, struct dlm_message *ms)
3063{
3064 do_purge(ls, ms->m_nodeid, ms->m_pid);
3065}
3066
2739static void receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms) 3067static void receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms)
2740{ 3068{
2741 struct dlm_lkb *lkb; 3069 struct dlm_lkb *lkb;
2742 struct dlm_rsb *r; 3070 struct dlm_rsb *r;
2743 int error, mstype; 3071 int error, mstype, result;
2744 3072
2745 error = find_lkb(ls, ms->m_remid, &lkb); 3073 error = find_lkb(ls, ms->m_remid, &lkb);
2746 if (error) { 3074 if (error) {
@@ -2749,20 +3077,15 @@ static void receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms)
2749 } 3077 }
2750 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb);); 3078 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
2751 3079
2752 mstype = lkb->lkb_wait_type;
2753 error = remove_from_waiters(lkb);
2754 if (error) {
2755 log_error(ls, "receive_request_reply not on waiters");
2756 goto out;
2757 }
2758
2759 /* this is the value returned from do_request() on the master */
2760 error = ms->m_result;
2761
2762 r = lkb->lkb_resource; 3080 r = lkb->lkb_resource;
2763 hold_rsb(r); 3081 hold_rsb(r);
2764 lock_rsb(r); 3082 lock_rsb(r);
2765 3083
3084 mstype = lkb->lkb_wait_type;
3085 error = remove_from_waiters(lkb, DLM_MSG_REQUEST_REPLY);
3086 if (error)
3087 goto out;
3088
2766 /* Optimization: the dir node was also the master, so it took our 3089 /* Optimization: the dir node was also the master, so it took our
2767 lookup as a request and sent request reply instead of lookup reply */ 3090 lookup as a request and sent request reply instead of lookup reply */
2768 if (mstype == DLM_MSG_LOOKUP) { 3091 if (mstype == DLM_MSG_LOOKUP) {
@@ -2770,14 +3093,15 @@ static void receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms)
2770 lkb->lkb_nodeid = r->res_nodeid; 3093 lkb->lkb_nodeid = r->res_nodeid;
2771 } 3094 }
2772 3095
2773 switch (error) { 3096 /* this is the value returned from do_request() on the master */
3097 result = ms->m_result;
3098
3099 switch (result) {
2774 case -EAGAIN: 3100 case -EAGAIN:
2775 /* request would block (be queued) on remote master; 3101 /* request would block (be queued) on remote master */
2776 the unhold undoes the original ref from create_lkb()
2777 so it leads to the lkb being freed */
2778 queue_cast(r, lkb, -EAGAIN); 3102 queue_cast(r, lkb, -EAGAIN);
2779 confirm_master(r, -EAGAIN); 3103 confirm_master(r, -EAGAIN);
2780 unhold_lkb(lkb); 3104 unhold_lkb(lkb); /* undoes create_lkb() */
2781 break; 3105 break;
2782 3106
2783 case -EINPROGRESS: 3107 case -EINPROGRESS:
@@ -2785,41 +3109,64 @@ static void receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms)
2785 /* request was queued or granted on remote master */ 3109 /* request was queued or granted on remote master */
2786 receive_flags_reply(lkb, ms); 3110 receive_flags_reply(lkb, ms);
2787 lkb->lkb_remid = ms->m_lkid; 3111 lkb->lkb_remid = ms->m_lkid;
2788 if (error) 3112 if (is_altmode(lkb))
3113 munge_altmode(lkb, ms);
3114 if (result)
2789 add_lkb(r, lkb, DLM_LKSTS_WAITING); 3115 add_lkb(r, lkb, DLM_LKSTS_WAITING);
2790 else { 3116 else {
2791 grant_lock_pc(r, lkb, ms); 3117 grant_lock_pc(r, lkb, ms);
2792 queue_cast(r, lkb, 0); 3118 queue_cast(r, lkb, 0);
2793 } 3119 }
2794 confirm_master(r, error); 3120 confirm_master(r, result);
2795 break; 3121 break;
2796 3122
2797 case -EBADR: 3123 case -EBADR:
2798 case -ENOTBLK: 3124 case -ENOTBLK:
2799 /* find_rsb failed to find rsb or rsb wasn't master */ 3125 /* find_rsb failed to find rsb or rsb wasn't master */
3126 log_debug(ls, "receive_request_reply %x %x master diff %d %d",
3127 lkb->lkb_id, lkb->lkb_flags, r->res_nodeid, result);
2800 r->res_nodeid = -1; 3128 r->res_nodeid = -1;
2801 lkb->lkb_nodeid = -1; 3129 lkb->lkb_nodeid = -1;
2802 _request_lock(r, lkb); 3130
3131 if (is_overlap(lkb)) {
3132 /* we'll ignore error in cancel/unlock reply */
3133 queue_cast_overlap(r, lkb);
3134 unhold_lkb(lkb); /* undoes create_lkb() */
3135 } else
3136 _request_lock(r, lkb);
2803 break; 3137 break;
2804 3138
2805 default: 3139 default:
2806 log_error(ls, "receive_request_reply error %d", error); 3140 log_error(ls, "receive_request_reply %x error %d",
3141 lkb->lkb_id, result);
2807 } 3142 }
2808 3143
3144 if (is_overlap_unlock(lkb) && (result == 0 || result == -EINPROGRESS)) {
3145 log_debug(ls, "receive_request_reply %x result %d unlock",
3146 lkb->lkb_id, result);
3147 lkb->lkb_flags &= ~DLM_IFL_OVERLAP_UNLOCK;
3148 lkb->lkb_flags &= ~DLM_IFL_OVERLAP_CANCEL;
3149 send_unlock(r, lkb);
3150 } else if (is_overlap_cancel(lkb) && (result == -EINPROGRESS)) {
3151 log_debug(ls, "receive_request_reply %x cancel", lkb->lkb_id);
3152 lkb->lkb_flags &= ~DLM_IFL_OVERLAP_UNLOCK;
3153 lkb->lkb_flags &= ~DLM_IFL_OVERLAP_CANCEL;
3154 send_cancel(r, lkb);
3155 } else {
3156 lkb->lkb_flags &= ~DLM_IFL_OVERLAP_CANCEL;
3157 lkb->lkb_flags &= ~DLM_IFL_OVERLAP_UNLOCK;
3158 }
3159 out:
2809 unlock_rsb(r); 3160 unlock_rsb(r);
2810 put_rsb(r); 3161 put_rsb(r);
2811 out:
2812 dlm_put_lkb(lkb); 3162 dlm_put_lkb(lkb);
2813} 3163}
2814 3164
2815static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, 3165static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
2816 struct dlm_message *ms) 3166 struct dlm_message *ms)
2817{ 3167{
2818 int error = ms->m_result;
2819
2820 /* this is the value returned from do_convert() on the master */ 3168 /* this is the value returned from do_convert() on the master */
2821 3169 switch (ms->m_result) {
2822 switch (error) {
2823 case -EAGAIN: 3170 case -EAGAIN:
2824 /* convert would block (be queued) on remote master */ 3171 /* convert would block (be queued) on remote master */
2825 queue_cast(r, lkb, -EAGAIN); 3172 queue_cast(r, lkb, -EAGAIN);
@@ -2827,6 +3174,9 @@ static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
2827 3174
2828 case -EINPROGRESS: 3175 case -EINPROGRESS:
2829 /* convert was queued on remote master */ 3176 /* convert was queued on remote master */
3177 receive_flags_reply(lkb, ms);
3178 if (is_demoted(lkb))
3179 munge_demoted(lkb, ms);
2830 del_lkb(r, lkb); 3180 del_lkb(r, lkb);
2831 add_lkb(r, lkb, DLM_LKSTS_CONVERT); 3181 add_lkb(r, lkb, DLM_LKSTS_CONVERT);
2832 break; 3182 break;
@@ -2834,24 +3184,33 @@ static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
2834 case 0: 3184 case 0:
2835 /* convert was granted on remote master */ 3185 /* convert was granted on remote master */
2836 receive_flags_reply(lkb, ms); 3186 receive_flags_reply(lkb, ms);
3187 if (is_demoted(lkb))
3188 munge_demoted(lkb, ms);
2837 grant_lock_pc(r, lkb, ms); 3189 grant_lock_pc(r, lkb, ms);
2838 queue_cast(r, lkb, 0); 3190 queue_cast(r, lkb, 0);
2839 break; 3191 break;
2840 3192
2841 default: 3193 default:
2842 log_error(r->res_ls, "receive_convert_reply error %d", error); 3194 log_error(r->res_ls, "receive_convert_reply %x error %d",
3195 lkb->lkb_id, ms->m_result);
2843 } 3196 }
2844} 3197}
2845 3198
2846static void _receive_convert_reply(struct dlm_lkb *lkb, struct dlm_message *ms) 3199static void _receive_convert_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
2847{ 3200{
2848 struct dlm_rsb *r = lkb->lkb_resource; 3201 struct dlm_rsb *r = lkb->lkb_resource;
3202 int error;
2849 3203
2850 hold_rsb(r); 3204 hold_rsb(r);
2851 lock_rsb(r); 3205 lock_rsb(r);
2852 3206
2853 __receive_convert_reply(r, lkb, ms); 3207 /* stub reply can happen with waiters_mutex held */
3208 error = remove_from_waiters_ms(lkb, ms);
3209 if (error)
3210 goto out;
2854 3211
3212 __receive_convert_reply(r, lkb, ms);
3213 out:
2855 unlock_rsb(r); 3214 unlock_rsb(r);
2856 put_rsb(r); 3215 put_rsb(r);
2857} 3216}
@@ -2868,37 +3227,38 @@ static void receive_convert_reply(struct dlm_ls *ls, struct dlm_message *ms)
2868 } 3227 }
2869 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb);); 3228 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
2870 3229
2871 error = remove_from_waiters(lkb);
2872 if (error) {
2873 log_error(ls, "receive_convert_reply not on waiters");
2874 goto out;
2875 }
2876
2877 _receive_convert_reply(lkb, ms); 3230 _receive_convert_reply(lkb, ms);
2878 out:
2879 dlm_put_lkb(lkb); 3231 dlm_put_lkb(lkb);
2880} 3232}
2881 3233
2882static void _receive_unlock_reply(struct dlm_lkb *lkb, struct dlm_message *ms) 3234static void _receive_unlock_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
2883{ 3235{
2884 struct dlm_rsb *r = lkb->lkb_resource; 3236 struct dlm_rsb *r = lkb->lkb_resource;
2885 int error = ms->m_result; 3237 int error;
2886 3238
2887 hold_rsb(r); 3239 hold_rsb(r);
2888 lock_rsb(r); 3240 lock_rsb(r);
2889 3241
3242 /* stub reply can happen with waiters_mutex held */
3243 error = remove_from_waiters_ms(lkb, ms);
3244 if (error)
3245 goto out;
3246
2890 /* this is the value returned from do_unlock() on the master */ 3247 /* this is the value returned from do_unlock() on the master */
2891 3248
2892 switch (error) { 3249 switch (ms->m_result) {
2893 case -DLM_EUNLOCK: 3250 case -DLM_EUNLOCK:
2894 receive_flags_reply(lkb, ms); 3251 receive_flags_reply(lkb, ms);
2895 remove_lock_pc(r, lkb); 3252 remove_lock_pc(r, lkb);
2896 queue_cast(r, lkb, -DLM_EUNLOCK); 3253 queue_cast(r, lkb, -DLM_EUNLOCK);
2897 break; 3254 break;
3255 case -ENOENT:
3256 break;
2898 default: 3257 default:
2899 log_error(r->res_ls, "receive_unlock_reply error %d", error); 3258 log_error(r->res_ls, "receive_unlock_reply %x error %d",
3259 lkb->lkb_id, ms->m_result);
2900 } 3260 }
2901 3261 out:
2902 unlock_rsb(r); 3262 unlock_rsb(r);
2903 put_rsb(r); 3263 put_rsb(r);
2904} 3264}
@@ -2915,37 +3275,39 @@ static void receive_unlock_reply(struct dlm_ls *ls, struct dlm_message *ms)
2915 } 3275 }
2916 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb);); 3276 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
2917 3277
2918 error = remove_from_waiters(lkb);
2919 if (error) {
2920 log_error(ls, "receive_unlock_reply not on waiters");
2921 goto out;
2922 }
2923
2924 _receive_unlock_reply(lkb, ms); 3278 _receive_unlock_reply(lkb, ms);
2925 out:
2926 dlm_put_lkb(lkb); 3279 dlm_put_lkb(lkb);
2927} 3280}
2928 3281
2929static void _receive_cancel_reply(struct dlm_lkb *lkb, struct dlm_message *ms) 3282static void _receive_cancel_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
2930{ 3283{
2931 struct dlm_rsb *r = lkb->lkb_resource; 3284 struct dlm_rsb *r = lkb->lkb_resource;
2932 int error = ms->m_result; 3285 int error;
2933 3286
2934 hold_rsb(r); 3287 hold_rsb(r);
2935 lock_rsb(r); 3288 lock_rsb(r);
2936 3289
3290 /* stub reply can happen with waiters_mutex held */
3291 error = remove_from_waiters_ms(lkb, ms);
3292 if (error)
3293 goto out;
3294
2937 /* this is the value returned from do_cancel() on the master */ 3295 /* this is the value returned from do_cancel() on the master */
2938 3296
2939 switch (error) { 3297 switch (ms->m_result) {
2940 case -DLM_ECANCEL: 3298 case -DLM_ECANCEL:
2941 receive_flags_reply(lkb, ms); 3299 receive_flags_reply(lkb, ms);
2942 revert_lock_pc(r, lkb); 3300 revert_lock_pc(r, lkb);
2943 queue_cast(r, lkb, -DLM_ECANCEL); 3301 if (ms->m_result)
3302 queue_cast(r, lkb, -DLM_ECANCEL);
3303 break;
3304 case 0:
2944 break; 3305 break;
2945 default: 3306 default:
2946 log_error(r->res_ls, "receive_cancel_reply error %d", error); 3307 log_error(r->res_ls, "receive_cancel_reply %x error %d",
3308 lkb->lkb_id, ms->m_result);
2947 } 3309 }
2948 3310 out:
2949 unlock_rsb(r); 3311 unlock_rsb(r);
2950 put_rsb(r); 3312 put_rsb(r);
2951} 3313}
@@ -2962,14 +3324,7 @@ static void receive_cancel_reply(struct dlm_ls *ls, struct dlm_message *ms)
2962 } 3324 }
2963 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb);); 3325 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
2964 3326
2965 error = remove_from_waiters(lkb);
2966 if (error) {
2967 log_error(ls, "receive_cancel_reply not on waiters");
2968 goto out;
2969 }
2970
2971 _receive_cancel_reply(lkb, ms); 3327 _receive_cancel_reply(lkb, ms);
2972 out:
2973 dlm_put_lkb(lkb); 3328 dlm_put_lkb(lkb);
2974} 3329}
2975 3330
@@ -2985,20 +3340,17 @@ static void receive_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms)
2985 return; 3340 return;
2986 } 3341 }
2987 3342
2988 error = remove_from_waiters(lkb); 3343 /* ms->m_result is the value returned by dlm_dir_lookup on dir node
2989 if (error) {
2990 log_error(ls, "receive_lookup_reply not on waiters");
2991 goto out;
2992 }
2993
2994 /* this is the value returned by dlm_dir_lookup on dir node
2995 FIXME: will a non-zero error ever be returned? */ 3344 FIXME: will a non-zero error ever be returned? */
2996 error = ms->m_result;
2997 3345
2998 r = lkb->lkb_resource; 3346 r = lkb->lkb_resource;
2999 hold_rsb(r); 3347 hold_rsb(r);
3000 lock_rsb(r); 3348 lock_rsb(r);
3001 3349
3350 error = remove_from_waiters(lkb, DLM_MSG_LOOKUP_REPLY);
3351 if (error)
3352 goto out;
3353
3002 ret_nodeid = ms->m_nodeid; 3354 ret_nodeid = ms->m_nodeid;
3003 if (ret_nodeid == dlm_our_nodeid()) { 3355 if (ret_nodeid == dlm_our_nodeid()) {
3004 r->res_nodeid = 0; 3356 r->res_nodeid = 0;
@@ -3009,14 +3361,22 @@ static void receive_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms)
3009 r->res_nodeid = ret_nodeid; 3361 r->res_nodeid = ret_nodeid;
3010 } 3362 }
3011 3363
3364 if (is_overlap(lkb)) {
3365 log_debug(ls, "receive_lookup_reply %x unlock %x",
3366 lkb->lkb_id, lkb->lkb_flags);
3367 queue_cast_overlap(r, lkb);
3368 unhold_lkb(lkb); /* undoes create_lkb() */
3369 goto out_list;
3370 }
3371
3012 _request_lock(r, lkb); 3372 _request_lock(r, lkb);
3013 3373
3374 out_list:
3014 if (!ret_nodeid) 3375 if (!ret_nodeid)
3015 process_lookup_list(r); 3376 process_lookup_list(r);
3016 3377 out:
3017 unlock_rsb(r); 3378 unlock_rsb(r);
3018 put_rsb(r); 3379 put_rsb(r);
3019 out:
3020 dlm_put_lkb(lkb); 3380 dlm_put_lkb(lkb);
3021} 3381}
3022 3382
@@ -3133,6 +3493,12 @@ int dlm_receive_message(struct dlm_header *hd, int nodeid, int recovery)
3133 receive_lookup_reply(ls, ms); 3493 receive_lookup_reply(ls, ms);
3134 break; 3494 break;
3135 3495
3496 /* other messages */
3497
3498 case DLM_MSG_PURGE:
3499 receive_purge(ls, ms);
3500 break;
3501
3136 default: 3502 default:
3137 log_error(ls, "unknown message type %d", ms->m_type); 3503 log_error(ls, "unknown message type %d", ms->m_type);
3138 } 3504 }
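
receive_purge() hands off to do_purge(), which this file only forward-declares near the top of the diff. A reconstruction of what it plausibly does, built from the new ls_orphans list and mutex added in dlm_internal.h; treat the helper names and details as assumptions, not quotation:

static void do_purge(struct dlm_ls *ls, int nodeid, int pid)
{
	struct dlm_lkb *lkb, *safe;

	mutex_lock(&ls->ls_orphans_mutex);
	list_for_each_entry_safe(lkb, safe, &ls->ls_orphans, lkb_ownqueue) {
		if (pid && lkb->lkb_ownpid != pid)
			continue;
		/* unlock_proc_lock() assumed: releases an orphaned lock
		 * left behind by an exited process */
		unlock_proc_lock(ls, lkb);
		list_del_init(&lkb->lkb_ownqueue);
		dlm_put_lkb(lkb);
	}
	mutex_unlock(&ls->ls_orphans_mutex);
}
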
@@ -3153,9 +3519,9 @@ static void recover_convert_waiter(struct dlm_ls *ls, struct dlm_lkb *lkb)
3153{ 3519{
3154 if (middle_conversion(lkb)) { 3520 if (middle_conversion(lkb)) {
3155 hold_lkb(lkb); 3521 hold_lkb(lkb);
3522 ls->ls_stub_ms.m_type = DLM_MSG_CONVERT_REPLY;
3156 ls->ls_stub_ms.m_result = -EINPROGRESS; 3523 ls->ls_stub_ms.m_result = -EINPROGRESS;
3157 ls->ls_stub_ms.m_flags = lkb->lkb_flags; 3524 ls->ls_stub_ms.m_flags = lkb->lkb_flags;
3158 _remove_from_waiters(lkb);
3159 _receive_convert_reply(lkb, &ls->ls_stub_ms); 3525 _receive_convert_reply(lkb, &ls->ls_stub_ms);
3160 3526
3161 /* Same special case as in receive_rcom_lock_args() */ 3527 /* Same special case as in receive_rcom_lock_args() */
@@ -3227,18 +3593,18 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls)
3227 3593
3228 case DLM_MSG_UNLOCK: 3594 case DLM_MSG_UNLOCK:
3229 hold_lkb(lkb); 3595 hold_lkb(lkb);
3596 ls->ls_stub_ms.m_type = DLM_MSG_UNLOCK_REPLY;
3230 ls->ls_stub_ms.m_result = -DLM_EUNLOCK; 3597 ls->ls_stub_ms.m_result = -DLM_EUNLOCK;
3231 ls->ls_stub_ms.m_flags = lkb->lkb_flags; 3598 ls->ls_stub_ms.m_flags = lkb->lkb_flags;
3232 _remove_from_waiters(lkb);
3233 _receive_unlock_reply(lkb, &ls->ls_stub_ms); 3599 _receive_unlock_reply(lkb, &ls->ls_stub_ms);
3234 dlm_put_lkb(lkb); 3600 dlm_put_lkb(lkb);
3235 break; 3601 break;
3236 3602
3237 case DLM_MSG_CANCEL: 3603 case DLM_MSG_CANCEL:
3238 hold_lkb(lkb); 3604 hold_lkb(lkb);
3605 ls->ls_stub_ms.m_type = DLM_MSG_CANCEL_REPLY;
3239 ls->ls_stub_ms.m_result = -DLM_ECANCEL; 3606 ls->ls_stub_ms.m_result = -DLM_ECANCEL;
3240 ls->ls_stub_ms.m_flags = lkb->lkb_flags; 3607 ls->ls_stub_ms.m_flags = lkb->lkb_flags;
3241 _remove_from_waiters(lkb);
3242 _receive_cancel_reply(lkb, &ls->ls_stub_ms); 3608 _receive_cancel_reply(lkb, &ls->ls_stub_ms);
3243 dlm_put_lkb(lkb); 3609 dlm_put_lkb(lkb);
3244 break; 3610 break;
@@ -3252,37 +3618,47 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls)
3252 mutex_unlock(&ls->ls_waiters_mutex); 3618 mutex_unlock(&ls->ls_waiters_mutex);
3253} 3619}
3254 3620
3255static int remove_resend_waiter(struct dlm_ls *ls, struct dlm_lkb **lkb_ret) 3621static struct dlm_lkb *find_resend_waiter(struct dlm_ls *ls)
3256{ 3622{
3257 struct dlm_lkb *lkb; 3623 struct dlm_lkb *lkb;
3258 int rv = 0; 3624 int found = 0;
3259 3625
3260 mutex_lock(&ls->ls_waiters_mutex); 3626 mutex_lock(&ls->ls_waiters_mutex);
3261 list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) { 3627 list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) {
3262 if (lkb->lkb_flags & DLM_IFL_RESEND) { 3628 if (lkb->lkb_flags & DLM_IFL_RESEND) {
3263 rv = lkb->lkb_wait_type; 3629 hold_lkb(lkb);
3264 _remove_from_waiters(lkb); 3630 found = 1;
3265 lkb->lkb_flags &= ~DLM_IFL_RESEND;
3266 break; 3631 break;
3267 } 3632 }
3268 } 3633 }
3269 mutex_unlock(&ls->ls_waiters_mutex); 3634 mutex_unlock(&ls->ls_waiters_mutex);
3270 3635
3271 if (!rv) 3636 if (!found)
3272 lkb = NULL; 3637 lkb = NULL;
3273 *lkb_ret = lkb; 3638 return lkb;
3274 return rv;
3275} 3639}
3276 3640
3277/* Deal with lookups and lkb's marked RESEND from _pre. We may now be the 3641/* Deal with lookups and lkb's marked RESEND from _pre. We may now be the
3278 master or dir-node for r. Processing the lkb may result in it being placed 3642 master or dir-node for r. Processing the lkb may result in it being placed
3279 back on waiters. */ 3643 back on waiters. */
3280 3644
3645/* We do this after normal locking has been enabled and any saved messages
3646 (in requestqueue) have been processed. We should be confident that at
3647 this point we won't get or process a reply to any of these waiting
3648 operations. But, new ops may be coming in on the rsbs/locks here from
3649 userspace or remotely. */
3650
3651/* there may have been an overlap unlock/cancel prior to recovery or after
3652 recovery. if before, the lkb may still have a positive wait_count; if after, the
3653 overlap flag would just have been set and nothing new sent. we can be
3654 confident here that any replies to either the initial op or overlap ops
3655 prior to recovery have been received. */
3656
3281int dlm_recover_waiters_post(struct dlm_ls *ls) 3657int dlm_recover_waiters_post(struct dlm_ls *ls)
3282{ 3658{
3283 struct dlm_lkb *lkb; 3659 struct dlm_lkb *lkb;
3284 struct dlm_rsb *r; 3660 struct dlm_rsb *r;
3285 int error = 0, mstype; 3661 int error = 0, mstype, err, oc, ou;
3286 3662
3287 while (1) { 3663 while (1) {
3288 if (dlm_locking_stopped(ls)) { 3664 if (dlm_locking_stopped(ls)) {
@@ -3291,48 +3667,78 @@ int dlm_recover_waiters_post(struct dlm_ls *ls)
3291 break; 3667 break;
3292 } 3668 }
3293 3669
3294 mstype = remove_resend_waiter(ls, &lkb); 3670 lkb = find_resend_waiter(ls);
3295 if (!mstype) 3671 if (!lkb)
3296 break; 3672 break;
3297 3673
3298 r = lkb->lkb_resource; 3674 r = lkb->lkb_resource;
3675 hold_rsb(r);
3676 lock_rsb(r);
3677
3678 mstype = lkb->lkb_wait_type;
3679 oc = is_overlap_cancel(lkb);
3680 ou = is_overlap_unlock(lkb);
3681 err = 0;
3299 3682
3300 log_debug(ls, "recover_waiters_post %x type %d flags %x %s", 3683 log_debug(ls, "recover_waiters_post %x type %d flags %x %s",
3301 lkb->lkb_id, mstype, lkb->lkb_flags, r->res_name); 3684 lkb->lkb_id, mstype, lkb->lkb_flags, r->res_name);
3302 3685
3303 switch (mstype) { 3686 /* At this point we assume that we won't get a reply to any
3304 3687 previous op or overlap op on this lock. First, do a big
3305 case DLM_MSG_LOOKUP: 3688 remove_from_waiters() for all previous ops. */
3306 hold_rsb(r); 3689
3307 lock_rsb(r); 3690 lkb->lkb_flags &= ~DLM_IFL_RESEND;
3308 _request_lock(r, lkb); 3691 lkb->lkb_flags &= ~DLM_IFL_OVERLAP_UNLOCK;
3309 if (is_master(r)) 3692 lkb->lkb_flags &= ~DLM_IFL_OVERLAP_CANCEL;
3310 confirm_master(r, 0); 3693 lkb->lkb_wait_type = 0;
3311 unlock_rsb(r); 3694 lkb->lkb_wait_count = 0;
3312 put_rsb(r); 3695 mutex_lock(&ls->ls_waiters_mutex);
3313 break; 3696 list_del_init(&lkb->lkb_wait_reply);
3314 3697 mutex_unlock(&ls->ls_waiters_mutex);
3315 case DLM_MSG_REQUEST: 3698 unhold_lkb(lkb); /* for waiters list */
3316 hold_rsb(r); 3699
3317 lock_rsb(r); 3700 if (oc || ou) {
3318 _request_lock(r, lkb); 3701 /* do an unlock or cancel instead of resending */
3319 if (is_master(r)) 3702 switch (mstype) {
3320 confirm_master(r, 0); 3703 case DLM_MSG_LOOKUP:
3321 unlock_rsb(r); 3704 case DLM_MSG_REQUEST:
3322 put_rsb(r); 3705 queue_cast(r, lkb, ou ? -DLM_EUNLOCK :
3323 break; 3706 -DLM_ECANCEL);
3324 3707 unhold_lkb(lkb); /* undoes create_lkb() */
3325 case DLM_MSG_CONVERT: 3708 break;
3326 hold_rsb(r); 3709 case DLM_MSG_CONVERT:
3327 lock_rsb(r); 3710 if (oc) {
3328 _convert_lock(r, lkb); 3711 queue_cast(r, lkb, -DLM_ECANCEL);
3329 unlock_rsb(r); 3712 } else {
3330 put_rsb(r); 3713 lkb->lkb_exflags |= DLM_LKF_FORCEUNLOCK;
3331 break; 3714 _unlock_lock(r, lkb);
3332 3715 }
3333 default: 3716 break;
3334 log_error(ls, "recover_waiters_post type %d", mstype); 3717 default:
3718 err = 1;
3719 }
3720 } else {
3721 switch (mstype) {
3722 case DLM_MSG_LOOKUP:
3723 case DLM_MSG_REQUEST:
3724 _request_lock(r, lkb);
3725 if (is_master(r))
3726 confirm_master(r, 0);
3727 break;
3728 case DLM_MSG_CONVERT:
3729 _convert_lock(r, lkb);
3730 break;
3731 default:
3732 err = 1;
3733 }
3335 } 3734 }
3735
3736 if (err)
3737 log_error(ls, "recover_waiters_post %x %d %x %d %d",
3738 lkb->lkb_id, mstype, lkb->lkb_flags, oc, ou);
3739 unlock_rsb(r);
3740 put_rsb(r);
3741 dlm_put_lkb(lkb);
3336 } 3742 }
3337 3743
3338 return error; 3744 return error;
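After clearing the waiter state, the rewritten loop branches on just three inputs: the saved wait type and the two overlap flags. If an overlap unlock/cancel was queued, the operation is completed locally with a cast instead of being resent; only a convert with an overlapping unlock turns into a forced unlock. A compact model of that decision (an assumed simplification; the real code also handles a lookup/request becoming master via confirm_master()):

    #include <stdio.h>

    enum op  { OP_LOOKUP, OP_REQUEST, OP_CONVERT };
    enum act { RESEND, CAST_EUNLOCK, CAST_ECANCEL, FORCE_UNLOCK, BAD };

    static enum act decide(enum op mstype, int overlap_cancel, int overlap_unlock)
    {
            if (overlap_cancel || overlap_unlock) {
                    switch (mstype) {
                    case OP_LOOKUP:
                    case OP_REQUEST:
                            return overlap_unlock ? CAST_EUNLOCK : CAST_ECANCEL;
                    case OP_CONVERT:
                            /* cancel wins; unlock falls back to forced unlock */
                            return overlap_cancel ? CAST_ECANCEL : FORCE_UNLOCK;
                    }
                    return BAD;
            }
            return RESEND;   /* no overlap: replay the original op */
    }

    int main(void)
    {
            printf("%d %d\n", decide(OP_REQUEST, 0, 1), decide(OP_CONVERT, 1, 0));
            return 0;
    }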
@@ -3684,7 +4090,7 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua,
3684 4090
3685 /* add this new lkb to the per-process list of locks */ 4091 /* add this new lkb to the per-process list of locks */
3686 spin_lock(&ua->proc->locks_spin); 4092 spin_lock(&ua->proc->locks_spin);
3687 kref_get(&lkb->lkb_ref); 4093 hold_lkb(lkb);
3688 list_add_tail(&lkb->lkb_ownqueue, &ua->proc->locks); 4094 list_add_tail(&lkb->lkb_ownqueue, &ua->proc->locks);
3689 spin_unlock(&ua->proc->locks_spin); 4095 spin_unlock(&ua->proc->locks_spin);
3690 out: 4096 out:
@@ -3774,6 +4180,9 @@ int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
3774 4180
3775 if (error == -DLM_EUNLOCK) 4181 if (error == -DLM_EUNLOCK)
3776 error = 0; 4182 error = 0;
4183 /* from validate_unlock_args() */
4184 if (error == -EBUSY && (flags & DLM_LKF_FORCEUNLOCK))
4185 error = 0;
3777 if (error) 4186 if (error)
3778 goto out_put; 4187 goto out_put;
3779 4188
@@ -3786,6 +4195,7 @@ int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
3786 dlm_put_lkb(lkb); 4195 dlm_put_lkb(lkb);
3787 out: 4196 out:
3788 unlock_recovery(ls); 4197 unlock_recovery(ls);
4198 kfree(ua_tmp);
3789 return error; 4199 return error;
3790} 4200}
3791 4201
@@ -3815,33 +4225,37 @@ int dlm_user_cancel(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
3815 4225
3816 if (error == -DLM_ECANCEL) 4226 if (error == -DLM_ECANCEL)
3817 error = 0; 4227 error = 0;
3818 if (error) 4228 /* from validate_unlock_args() */
3819 goto out_put; 4229 if (error == -EBUSY)
3820 4230 error = 0;
3821 /* this lkb was removed from the WAITING queue */
3822 if (lkb->lkb_grmode == DLM_LOCK_IV) {
3823 spin_lock(&ua->proc->locks_spin);
3824 list_move(&lkb->lkb_ownqueue, &ua->proc->unlocking);
3825 spin_unlock(&ua->proc->locks_spin);
3826 }
3827 out_put: 4231 out_put:
3828 dlm_put_lkb(lkb); 4232 dlm_put_lkb(lkb);
3829 out: 4233 out:
3830 unlock_recovery(ls); 4234 unlock_recovery(ls);
4235 kfree(ua_tmp);
3831 return error; 4236 return error;
3832} 4237}
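Both user-facing paths above now follow the same two conventions: the internal completion codes (-DLM_EUNLOCK, -DLM_ECANCEL) and the -EBUSY that validate_unlock_args() returns for an already-overlapped op are folded into 0 before returning to userspace, and ua_tmp is freed here on every exit path, so ownership of the scratch argument block passes to the callee. A toy sketch of the convention (hypothetical names, placeholder code values):

    #include <stdlib.h>
    #include <errno.h>

    #define DLM_ECANCEL 1   /* placeholder for the value in linux/dlm.h */

    /* hypothetical sketch: callee owns and frees its scratch argument */
    static int user_cancel(void *ua_tmp)
    {
            int error = -DLM_ECANCEL;        /* pretend the op completed */

            if (error == -DLM_ECANCEL)
                    error = 0;               /* internal code, not an error */
            if (error == -EBUSY)
                    error = 0;               /* overlap already queued: fine */

            free(ua_tmp);                    /* freed on every path */
            return error;
    }

    int main(void)
    {
            return user_cancel(malloc(16));
    }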
3833 4238
4239/* lkb's that are removed from the waiters list by revert are just left on the
4240 orphans list with the granted orphan locks, to be freed by purge */
4241
3834static int orphan_proc_lock(struct dlm_ls *ls, struct dlm_lkb *lkb) 4242static int orphan_proc_lock(struct dlm_ls *ls, struct dlm_lkb *lkb)
3835{ 4243{
3836 struct dlm_user_args *ua = (struct dlm_user_args *)lkb->lkb_astparam; 4244 struct dlm_user_args *ua = (struct dlm_user_args *)lkb->lkb_astparam;
4245 struct dlm_args args;
4246 int error;
3837 4247
3838 if (ua->lksb.sb_lvbptr) 4248 hold_lkb(lkb);
3839 kfree(ua->lksb.sb_lvbptr); 4249 mutex_lock(&ls->ls_orphans_mutex);
3840 kfree(ua); 4250 list_add_tail(&lkb->lkb_ownqueue, &ls->ls_orphans);
3841 lkb->lkb_astparam = (long)NULL; 4251 mutex_unlock(&ls->ls_orphans_mutex);
3842 4252
3843 /* TODO: propagate to master if needed */ 4253 set_unlock_args(0, ua, &args);
3844 return 0; 4254
4255 error = cancel_lock(ls, lkb, &args);
4256 if (error == -DLM_ECANCEL)
4257 error = 0;
4258 return error;
3845} 4259}
3846 4260
3847/* The force flag allows the unlock to go ahead even if the lkb isn't granted. 4261/* The force flag allows the unlock to go ahead even if the lkb isn't granted.
@@ -3853,10 +4267,6 @@ static int unlock_proc_lock(struct dlm_ls *ls, struct dlm_lkb *lkb)
3853 struct dlm_args args; 4267 struct dlm_args args;
3854 int error; 4268 int error;
3855 4269
3856 /* FIXME: we need to handle the case where the lkb is in limbo
3857 while the rsb is being looked up, currently we assert in
3858 _unlock_lock/is_remote because rsb nodeid is -1. */
3859
3860 set_unlock_args(DLM_LKF_FORCEUNLOCK, ua, &args); 4270 set_unlock_args(DLM_LKF_FORCEUNLOCK, ua, &args);
3861 4271
3862 error = unlock_lock(ls, lkb, &args); 4272 error = unlock_lock(ls, lkb, &args);
@@ -3865,6 +4275,31 @@ static int unlock_proc_lock(struct dlm_ls *ls, struct dlm_lkb *lkb)
3865 return error; 4275 return error;
3866} 4276}
3867 4277
4278/* We have to release clear_proc_locks mutex before calling unlock_proc_lock()
4279 (which does lock_rsb) due to deadlock with receiving a message that does
4280 lock_rsb followed by dlm_user_add_ast() */
4281
4282static struct dlm_lkb *del_proc_lock(struct dlm_ls *ls,
4283 struct dlm_user_proc *proc)
4284{
4285 struct dlm_lkb *lkb = NULL;
4286
4287 mutex_lock(&ls->ls_clear_proc_locks);
4288 if (list_empty(&proc->locks))
4289 goto out;
4290
4291 lkb = list_entry(proc->locks.next, struct dlm_lkb, lkb_ownqueue);
4292 list_del_init(&lkb->lkb_ownqueue);
4293
4294 if (lkb->lkb_exflags & DLM_LKF_PERSISTENT)
4295 lkb->lkb_flags |= DLM_IFL_ORPHAN;
4296 else
4297 lkb->lkb_flags |= DLM_IFL_DEAD;
4298 out:
4299 mutex_unlock(&ls->ls_clear_proc_locks);
4300 return lkb;
4301}
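del_proc_lock() exists purely for lock ordering, per the comment above: pop one entry while ls_clear_proc_locks is held, then process it with the mutex dropped, so unlock_proc_lock() can take lock_rsb() without deadlocking against the message-receive path. A userspace sketch of the pop-one-under-the-lock loop:

    #include <pthread.h>
    #include <stddef.h>

    struct lock_ent { struct lock_ent *next; };

    static pthread_mutex_t clear_lock = PTHREAD_MUTEX_INITIALIZER;
    static struct lock_ent *proc_locks;

    static struct lock_ent *del_one(void)
    {
            struct lock_ent *e = NULL;

            pthread_mutex_lock(&clear_lock);
            if (proc_locks) {
                    e = proc_locks;
                    proc_locks = e->next;   /* list_del_init() in the real code */
            }
            pthread_mutex_unlock(&clear_lock);
            return e;
    }

    static void clear_all(void)
    {
            struct lock_ent *e;

            while ((e = del_one()) != NULL)
                    ; /* unlock/orphan e here, with clear_lock dropped */
    }

    int main(void)
    {
            struct lock_ent a = { NULL }, b = { &a };

            proc_locks = &b;
            clear_all();
            return proc_locks != NULL;
    }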
4302
3868/* The ls_clear_proc_locks mutex protects against dlm_user_add_asts() which 4303/* The ls_clear_proc_locks mutex protects against dlm_user_add_asts() which
3869 1) references lkb->ua which we free here and 2) adds lkbs to proc->asts, 4304 1) references lkb->ua which we free here and 2) adds lkbs to proc->asts,
3870 which we clear here. */ 4305 which we clear here. */
@@ -3880,18 +4315,15 @@ void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc)
3880 struct dlm_lkb *lkb, *safe; 4315 struct dlm_lkb *lkb, *safe;
3881 4316
3882 lock_recovery(ls); 4317 lock_recovery(ls);
3883 mutex_lock(&ls->ls_clear_proc_locks);
3884 4318
3885 list_for_each_entry_safe(lkb, safe, &proc->locks, lkb_ownqueue) { 4319 while (1) {
3886 list_del_init(&lkb->lkb_ownqueue); 4320 lkb = del_proc_lock(ls, proc);
3887 4321 if (!lkb)
3888 if (lkb->lkb_exflags & DLM_LKF_PERSISTENT) { 4322 break;
3889 lkb->lkb_flags |= DLM_IFL_ORPHAN; 4323 if (lkb->lkb_exflags & DLM_LKF_PERSISTENT)
3890 orphan_proc_lock(ls, lkb); 4324 orphan_proc_lock(ls, lkb);
3891 } else { 4325 else
3892 lkb->lkb_flags |= DLM_IFL_DEAD;
3893 unlock_proc_lock(ls, lkb); 4326 unlock_proc_lock(ls, lkb);
3894 }
3895 4327
3896 /* this removes the reference for the proc->locks list 4328 /* this removes the reference for the proc->locks list
3897 added by dlm_user_request, it may result in the lkb 4329 added by dlm_user_request, it may result in the lkb
@@ -3900,6 +4332,8 @@ void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc)
3900 dlm_put_lkb(lkb); 4332 dlm_put_lkb(lkb);
3901 } 4333 }
3902 4334
4335 mutex_lock(&ls->ls_clear_proc_locks);
4336
3903 /* in-progress unlocks */ 4337 /* in-progress unlocks */
3904 list_for_each_entry_safe(lkb, safe, &proc->unlocking, lkb_ownqueue) { 4338 list_for_each_entry_safe(lkb, safe, &proc->unlocking, lkb_ownqueue) {
3905 list_del_init(&lkb->lkb_ownqueue); 4339 list_del_init(&lkb->lkb_ownqueue);
@@ -3916,3 +4350,92 @@ void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc)
3916 unlock_recovery(ls); 4350 unlock_recovery(ls);
3917} 4351}
3918 4352
4353static void purge_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc)
4354{
4355 struct dlm_lkb *lkb, *safe;
4356
4357 while (1) {
4358 lkb = NULL;
4359 spin_lock(&proc->locks_spin);
4360 if (!list_empty(&proc->locks)) {
4361 lkb = list_entry(proc->locks.next, struct dlm_lkb,
4362 lkb_ownqueue);
4363 list_del_init(&lkb->lkb_ownqueue);
4364 }
4365 spin_unlock(&proc->locks_spin);
4366
4367 if (!lkb)
4368 break;
4369
4370 lkb->lkb_flags |= DLM_IFL_DEAD;
4371 unlock_proc_lock(ls, lkb);
4372 dlm_put_lkb(lkb); /* ref from proc->locks list */
4373 }
4374
4375 spin_lock(&proc->locks_spin);
4376 list_for_each_entry_safe(lkb, safe, &proc->unlocking, lkb_ownqueue) {
4377 list_del_init(&lkb->lkb_ownqueue);
4378 lkb->lkb_flags |= DLM_IFL_DEAD;
4379 dlm_put_lkb(lkb);
4380 }
4381 spin_unlock(&proc->locks_spin);
4382
4383 spin_lock(&proc->asts_spin);
4384 list_for_each_entry_safe(lkb, safe, &proc->asts, lkb_astqueue) {
4385 list_del(&lkb->lkb_astqueue);
4386 dlm_put_lkb(lkb);
4387 }
4388 spin_unlock(&proc->asts_spin);
4389}
4390
4391/* pid of 0 means purge all orphans */
4392
4393static void do_purge(struct dlm_ls *ls, int nodeid, int pid)
4394{
4395 struct dlm_lkb *lkb, *safe;
4396
4397 mutex_lock(&ls->ls_orphans_mutex);
4398 list_for_each_entry_safe(lkb, safe, &ls->ls_orphans, lkb_ownqueue) {
4399 if (pid && lkb->lkb_ownpid != pid)
4400 continue;
4401 unlock_proc_lock(ls, lkb);
4402 list_del_init(&lkb->lkb_ownqueue);
4403 dlm_put_lkb(lkb);
4404 }
4405 mutex_unlock(&ls->ls_orphans_mutex);
4406}
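do_purge() applies a simple filter over ls_orphans: a nonzero pid purges only that owner's orphans, and pid 0 (per the comment above) purges everything. A standalone sketch of the filter:

    #include <stdio.h>

    struct orphan { int ownpid; int dead; };

    /* pid == 0 means "purge all" */
    static void purge(struct orphan *o, int n, int pid)
    {
            int i;

            for (i = 0; i < n; i++) {
                    if (pid && o[i].ownpid != pid)
                            continue;       /* someone else's orphan: keep it */
                    o[i].dead = 1;          /* unlock + drop ref in the real code */
            }
    }

    int main(void)
    {
            struct orphan o[2] = { { 100, 0 }, { 200, 0 } };

            purge(o, 2, 100);
            printf("%d %d\n", o[0].dead, o[1].dead);   /* prints: 1 0 */
            return 0;
    }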
4407
4408static int send_purge(struct dlm_ls *ls, int nodeid, int pid)
4409{
4410 struct dlm_message *ms;
4411 struct dlm_mhandle *mh;
4412 int error;
4413
4414 error = _create_message(ls, sizeof(struct dlm_message), nodeid,
4415 DLM_MSG_PURGE, &ms, &mh);
4416 if (error)
4417 return error;
4418 ms->m_nodeid = nodeid;
4419 ms->m_pid = pid;
4420
4421 return send_message(mh, ms);
4422}
4423
4424int dlm_user_purge(struct dlm_ls *ls, struct dlm_user_proc *proc,
4425 int nodeid, int pid)
4426{
4427 int error = 0;
4428
4429 if (nodeid != dlm_our_nodeid()) {
4430 error = send_purge(ls, nodeid, pid);
4431 } else {
4432 lock_recovery(ls);
4433 if (pid == current->pid)
4434 purge_proc_locks(ls, proc);
4435 else
4436 do_purge(ls, nodeid, pid);
4437 unlock_recovery(ls);
4438 }
4439 return error;
4440}
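dlm_user_purge() is just routing: a purge aimed at another node becomes a DLM_MSG_PURGE, a purge of the caller's own pid clears the live proc lists, and anything else falls through to the orphan sweep. A hedged sketch with stub helpers standing in for send_purge()/purge_proc_locks()/do_purge():

    /* hypothetical stand-ins for the three real helpers */
    static int send_purge_msg(int nodeid, int pid) { (void)nodeid; (void)pid; return 0; }
    static void purge_own_locks(void) { }
    static void purge_orphans(int pid) { (void)pid; }

    static int user_purge(int our_nodeid, int our_pid, int nodeid, int pid)
    {
            if (nodeid != our_nodeid)
                    return send_purge_msg(nodeid, pid);   /* remote: message */
            if (pid == our_pid)
                    purge_own_locks();     /* live locks of the calling proc */
            else
                    purge_orphans(pid);    /* orphans left by a dead proc */
            return 0;
    }

    int main(void)
    {
            return user_purge(1, 100, 1, 100);
    }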
4441
diff --git a/fs/dlm/lock.h b/fs/dlm/lock.h
index 0843a3073ec3..64fc4ec40668 100644
--- a/fs/dlm/lock.h
+++ b/fs/dlm/lock.h
@@ -41,6 +41,8 @@ int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
41 uint32_t flags, uint32_t lkid, char *lvb_in); 41 uint32_t flags, uint32_t lkid, char *lvb_in);
42int dlm_user_cancel(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, 42int dlm_user_cancel(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
43 uint32_t flags, uint32_t lkid); 43 uint32_t flags, uint32_t lkid);
44int dlm_user_purge(struct dlm_ls *ls, struct dlm_user_proc *proc,
45 int nodeid, int pid);
44void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc); 46void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc);
45 47
46static inline int is_master(struct dlm_rsb *r) 48static inline int is_master(struct dlm_rsb *r)
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index f40817b53c6f..a677b2a5eed4 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -2,7 +2,7 @@
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. 5** Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved.
6** 6**
7** This copyrighted material is made available to anyone wishing to use, 7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions 8** modify, copy, or redistribute it subject to the terms and conditions
@@ -167,7 +167,6 @@ static struct kobj_type dlm_ktype = {
167}; 167};
168 168
169static struct kset dlm_kset = { 169static struct kset dlm_kset = {
170 .subsys = &kernel_subsys,
171 .kobj = {.name = "dlm",}, 170 .kobj = {.name = "dlm",},
172 .ktype = &dlm_ktype, 171 .ktype = &dlm_ktype,
173}; 172};
@@ -218,6 +217,7 @@ int dlm_lockspace_init(void)
218 INIT_LIST_HEAD(&lslist); 217 INIT_LIST_HEAD(&lslist);
219 spin_lock_init(&lslist_lock); 218 spin_lock_init(&lslist_lock);
220 219
220 kobj_set_kset_s(&dlm_kset, kernel_subsys);
221 error = kset_register(&dlm_kset); 221 error = kset_register(&dlm_kset);
222 if (error) 222 if (error)
223 printk("dlm_lockspace_init: cannot register kset %d\n", error); 223 printk("dlm_lockspace_init: cannot register kset %d\n", error);
@@ -459,6 +459,8 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
459 459
460 INIT_LIST_HEAD(&ls->ls_waiters); 460 INIT_LIST_HEAD(&ls->ls_waiters);
461 mutex_init(&ls->ls_waiters_mutex); 461 mutex_init(&ls->ls_waiters_mutex);
462 INIT_LIST_HEAD(&ls->ls_orphans);
463 mutex_init(&ls->ls_orphans_mutex);
462 464
463 INIT_LIST_HEAD(&ls->ls_nodes); 465 INIT_LIST_HEAD(&ls->ls_nodes);
464 INIT_LIST_HEAD(&ls->ls_nodes_gone); 466 INIT_LIST_HEAD(&ls->ls_nodes_gone);
diff --git a/fs/dlm/lowcomms-sctp.c b/fs/dlm/lowcomms-sctp.c
deleted file mode 100644
index dc83a9d979b5..000000000000
--- a/fs/dlm/lowcomms-sctp.c
+++ /dev/null
@@ -1,1210 +0,0 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14/*
15 * lowcomms.c
16 *
17 * This is the "low-level" comms layer.
18 *
19 * It is responsible for sending/receiving messages
20 * from other nodes in the cluster.
21 *
22 * Cluster nodes are referred to by their nodeids. nodeids are
23 * simply 32 bit numbers to the locking module - if they need to
24 * be expanded for the cluster infrastructure then that is its
25 * responsibility. It is this layer's
26 * responsibility to resolve these into IP addresses or
27 * whatever it needs for inter-node communication.
28 *
29 * The comms level is two kernel threads that deal mainly with
30 * the receiving of messages from other nodes and passing them
31 * up to the mid-level comms layer (which understands the
32 * message format) for execution by the locking core, and
33 * a send thread which does all the setting up of connections
34 * to remote nodes and the sending of data. Threads are not allowed
35 * to send their own data because it may cause them to wait in times
36 * of high load. Also, this way, the sending thread can collect together
37 * messages bound for one node and send them in one block.
38 *
39 * I don't see any problem with the recv thread executing the locking
40 * code on behalf of remote processes as the locking code is
41 * short, efficient and never (well, hardly ever) waits.
42 *
43 */
44
45#include <asm/ioctls.h>
46#include <net/sock.h>
47#include <net/tcp.h>
48#include <net/sctp/user.h>
49#include <linux/pagemap.h>
50#include <linux/socket.h>
51#include <linux/idr.h>
52
53#include "dlm_internal.h"
54#include "lowcomms.h"
55#include "config.h"
56#include "midcomms.h"
57
58static struct sockaddr_storage *dlm_local_addr[DLM_MAX_ADDR_COUNT];
59static int dlm_local_count;
60static int dlm_local_nodeid;
61
62/* One of these per connected node */
63
64#define NI_INIT_PENDING 1
65#define NI_WRITE_PENDING 2
66
67struct nodeinfo {
68 spinlock_t lock;
69 sctp_assoc_t assoc_id;
70 unsigned long flags;
71 struct list_head write_list; /* nodes with pending writes */
72 struct list_head writequeue; /* outgoing writequeue_entries */
73 spinlock_t writequeue_lock;
74 int nodeid;
75 struct work_struct swork; /* Send workqueue */
76 struct work_struct lwork; /* Locking workqueue */
77};
78
79static DEFINE_IDR(nodeinfo_idr);
80static DECLARE_RWSEM(nodeinfo_lock);
81static int max_nodeid;
82
83struct cbuf {
84 unsigned int base;
85 unsigned int len;
86 unsigned int mask;
87};
88
89/* Just the one of these, now. But this struct keeps
90 the connection-specific variables together */
91
92#define CF_READ_PENDING 1
93
94struct connection {
95 struct socket *sock;
96 unsigned long flags;
97 struct page *rx_page;
98 atomic_t waiting_requests;
99 struct cbuf cb;
100 int eagain_flag;
101 struct work_struct work; /* Send workqueue */
102};
103
104/* An entry waiting to be sent */
105
106struct writequeue_entry {
107 struct list_head list;
108 struct page *page;
109 int offset;
110 int len;
111 int end;
112 int users;
113 struct nodeinfo *ni;
114};
115
116static void cbuf_add(struct cbuf *cb, int n)
117{
118 cb->len += n;
119}
120
121static int cbuf_data(struct cbuf *cb)
122{
123 return ((cb->base + cb->len) & cb->mask);
124}
125
126static void cbuf_init(struct cbuf *cb, int size)
127{
128 cb->base = cb->len = 0;
129 cb->mask = size-1;
130}
131
132static void cbuf_eat(struct cbuf *cb, int n)
133{
134 cb->len -= n;
135 cb->base += n;
136 cb->base &= cb->mask;
137}
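The four helpers above implement a power-of-two ring over the receive page: base is the first unconsumed byte, len the amount buffered, and mask wraps the index. A standalone check of the arithmetic (mirrors the helpers; size must be a power of two):

    #include <assert.h>

    struct cbuf { unsigned int base, len, mask; };

    static void cbuf_init(struct cbuf *cb, unsigned int size) /* size = 2^k */
    {
            cb->base = cb->len = 0;
            cb->mask = size - 1;
    }

    static unsigned int cbuf_data(struct cbuf *cb)  /* index of next free byte */
    {
            return (cb->base + cb->len) & cb->mask;
    }

    static void cbuf_add(struct cbuf *cb, unsigned int n) { cb->len += n; }

    static void cbuf_eat(struct cbuf *cb, unsigned int n)
    {
            cb->len -= n;
            cb->base = (cb->base + n) & cb->mask;
    }

    int main(void)
    {
            struct cbuf cb;

            cbuf_init(&cb, 16);
            cbuf_add(&cb, 14);     /* producer filled bytes 0..13 */
            cbuf_eat(&cb, 10);     /* consumer handled 10: base=10, len=4 */
            cbuf_add(&cb, 8);      /* len=12; free index wraps past the end */
            assert(cbuf_data(&cb) == ((10 + 12) & 15));   /* 22 & 15 == 6 */
            return 0;
    }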
138
139/* List of nodes which have writes pending */
140static LIST_HEAD(write_nodes);
141static DEFINE_SPINLOCK(write_nodes_lock);
142
143
144/* Maximum number of incoming messages to process before
145 * doing a schedule()
146 */
147#define MAX_RX_MSG_COUNT 25
148
149/* Work queues */
150static struct workqueue_struct *recv_workqueue;
151static struct workqueue_struct *send_workqueue;
152static struct workqueue_struct *lock_workqueue;
153
154/* The SCTP connection */
155static struct connection sctp_con;
156
157static void process_send_sockets(struct work_struct *work);
158static void process_recv_sockets(struct work_struct *work);
159static void process_lock_request(struct work_struct *work);
160
161static int nodeid_to_addr(int nodeid, struct sockaddr *retaddr)
162{
163 struct sockaddr_storage addr;
164 int error;
165
166 if (!dlm_local_count)
167 return -1;
168
169 error = dlm_nodeid_to_addr(nodeid, &addr);
170 if (error)
171 return error;
172
173 if (dlm_local_addr[0]->ss_family == AF_INET) {
174 struct sockaddr_in *in4 = (struct sockaddr_in *) &addr;
175 struct sockaddr_in *ret4 = (struct sockaddr_in *) retaddr;
176 ret4->sin_addr.s_addr = in4->sin_addr.s_addr;
177 } else {
178 struct sockaddr_in6 *in6 = (struct sockaddr_in6 *) &addr;
179 struct sockaddr_in6 *ret6 = (struct sockaddr_in6 *) retaddr;
180 memcpy(&ret6->sin6_addr, &in6->sin6_addr,
181 sizeof(in6->sin6_addr));
182 }
183
184 return 0;
185}
186
187/* If alloc is 0 here we will not attempt to allocate a new
188 nodeinfo struct */
189static struct nodeinfo *nodeid2nodeinfo(int nodeid, gfp_t alloc)
190{
191 struct nodeinfo *ni;
192 int r;
193 int n;
194
195 down_read(&nodeinfo_lock);
196 ni = idr_find(&nodeinfo_idr, nodeid);
197 up_read(&nodeinfo_lock);
198
199 if (ni || !alloc)
200 return ni;
201
202 down_write(&nodeinfo_lock);
203
204 ni = idr_find(&nodeinfo_idr, nodeid);
205 if (ni)
206 goto out_up;
207
208 r = idr_pre_get(&nodeinfo_idr, alloc);
209 if (!r)
210 goto out_up;
211
212 ni = kmalloc(sizeof(struct nodeinfo), alloc);
213 if (!ni)
214 goto out_up;
215
216 r = idr_get_new_above(&nodeinfo_idr, ni, nodeid, &n);
217 if (r) {
218 kfree(ni);
219 ni = NULL;
220 goto out_up;
221 }
222 if (n != nodeid) {
223 idr_remove(&nodeinfo_idr, n);
224 kfree(ni);
225 ni = NULL;
226 goto out_up;
227 }
228 memset(ni, 0, sizeof(struct nodeinfo));
229 spin_lock_init(&ni->lock);
230 INIT_LIST_HEAD(&ni->writequeue);
231 spin_lock_init(&ni->writequeue_lock);
232 INIT_WORK(&ni->lwork, process_lock_request);
233 INIT_WORK(&ni->swork, process_send_sockets);
234 ni->nodeid = nodeid;
235
236 if (nodeid > max_nodeid)
237 max_nodeid = nodeid;
238out_up:
239 up_write(&nodeinfo_lock);
240
241 return ni;
242}
243
244/* Don't call this too often... */
245static struct nodeinfo *assoc2nodeinfo(sctp_assoc_t assoc)
246{
247 int i;
248 struct nodeinfo *ni;
249
250 for (i=1; i<=max_nodeid; i++) {
251 ni = nodeid2nodeinfo(i, 0);
252 if (ni && ni->assoc_id == assoc)
253 return ni;
254 }
255 return NULL;
256}
257
258/* Data or notification available on socket */
259static void lowcomms_data_ready(struct sock *sk, int count_unused)
260{
261 if (test_and_set_bit(CF_READ_PENDING, &sctp_con.flags))
262 queue_work(recv_workqueue, &sctp_con.work);
263}
264
265
266/* Add the port number to an IP6 or 4 sockaddr and return the address length.
267 Also pad out the struct with zeros to make comparisons meaningful */
268
269static void make_sockaddr(struct sockaddr_storage *saddr, uint16_t port,
270 int *addr_len)
271{
272 struct sockaddr_in *local4_addr;
273 struct sockaddr_in6 *local6_addr;
274
275 if (!dlm_local_count)
276 return;
277
278 if (!port) {
279 if (dlm_local_addr[0]->ss_family == AF_INET) {
280 local4_addr = (struct sockaddr_in *)dlm_local_addr[0];
281 port = be16_to_cpu(local4_addr->sin_port);
282 } else {
283 local6_addr = (struct sockaddr_in6 *)dlm_local_addr[0];
284 port = be16_to_cpu(local6_addr->sin6_port);
285 }
286 }
287
288 saddr->ss_family = dlm_local_addr[0]->ss_family;
289 if (dlm_local_addr[0]->ss_family == AF_INET) {
290 struct sockaddr_in *in4_addr = (struct sockaddr_in *)saddr;
291 in4_addr->sin_port = cpu_to_be16(port);
292 memset(&in4_addr->sin_zero, 0, sizeof(in4_addr->sin_zero));
293 memset(in4_addr+1, 0, sizeof(struct sockaddr_storage) -
294 sizeof(struct sockaddr_in));
295 *addr_len = sizeof(struct sockaddr_in);
296 } else {
297 struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)saddr;
298 in6_addr->sin6_port = cpu_to_be16(port);
299 memset(in6_addr+1, 0, sizeof(struct sockaddr_storage) -
300 sizeof(struct sockaddr_in6));
301 *addr_len = sizeof(struct sockaddr_in6);
302 }
303}
304
305/* Close the connection and tidy up */
306static void close_connection(void)
307{
308 if (sctp_con.sock) {
309 sock_release(sctp_con.sock);
310 sctp_con.sock = NULL;
311 }
312
313 if (sctp_con.rx_page) {
314 __free_page(sctp_con.rx_page);
315 sctp_con.rx_page = NULL;
316 }
317}
318
319/* We only send shutdown messages to nodes that are not part of the cluster */
320static void send_shutdown(sctp_assoc_t associd)
321{
322 static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
323 struct msghdr outmessage;
324 struct cmsghdr *cmsg;
325 struct sctp_sndrcvinfo *sinfo;
326 int ret;
327
328 outmessage.msg_name = NULL;
329 outmessage.msg_namelen = 0;
330 outmessage.msg_control = outcmsg;
331 outmessage.msg_controllen = sizeof(outcmsg);
332 outmessage.msg_flags = MSG_EOR;
333
334 cmsg = CMSG_FIRSTHDR(&outmessage);
335 cmsg->cmsg_level = IPPROTO_SCTP;
336 cmsg->cmsg_type = SCTP_SNDRCV;
337 cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
338 outmessage.msg_controllen = cmsg->cmsg_len;
339 sinfo = CMSG_DATA(cmsg);
340 memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
341
342 sinfo->sinfo_flags |= MSG_EOF;
343 sinfo->sinfo_assoc_id = associd;
344
345 ret = kernel_sendmsg(sctp_con.sock, &outmessage, NULL, 0, 0);
346
347 if (ret != 0)
348 log_print("send EOF to node failed: %d", ret);
349}
350
351
352/* INIT failed but we don't know which node...
353 restart INIT on all pending nodes */
354static void init_failed(void)
355{
356 int i;
357 struct nodeinfo *ni;
358
359 for (i=1; i<=max_nodeid; i++) {
360 ni = nodeid2nodeinfo(i, 0);
361 if (!ni)
362 continue;
363
364 if (test_and_clear_bit(NI_INIT_PENDING, &ni->flags)) {
365 ni->assoc_id = 0;
366 if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
367 spin_lock_bh(&write_nodes_lock);
368 list_add_tail(&ni->write_list, &write_nodes);
369 spin_unlock_bh(&write_nodes_lock);
370 queue_work(send_workqueue, &ni->swork);
371 }
372 }
373 }
374}
375
376/* Something happened to an association */
377static void process_sctp_notification(struct msghdr *msg, char *buf)
378{
379 union sctp_notification *sn = (union sctp_notification *)buf;
380
381 if (sn->sn_header.sn_type == SCTP_ASSOC_CHANGE) {
382 switch (sn->sn_assoc_change.sac_state) {
383
384 case SCTP_COMM_UP:
385 case SCTP_RESTART:
386 {
387 /* Check that the new node is in the lockspace */
388 struct sctp_prim prim;
389 mm_segment_t fs;
390 int nodeid;
391 int prim_len, ret;
392 int addr_len;
393 struct nodeinfo *ni;
394
395 /* This seems to happen when we received a connection
396 * too early... or something... anyway, it happens but
397 * we always seem to get a real message too, see
398 * receive_from_sock */
399
400 if ((int)sn->sn_assoc_change.sac_assoc_id <= 0) {
401 log_print("COMM_UP for invalid assoc ID %d",
402 (int)sn->sn_assoc_change.sac_assoc_id);
403 init_failed();
404 return;
405 }
406 memset(&prim, 0, sizeof(struct sctp_prim));
407 prim_len = sizeof(struct sctp_prim);
408 prim.ssp_assoc_id = sn->sn_assoc_change.sac_assoc_id;
409
410 fs = get_fs();
411 set_fs(get_ds());
412 ret = sctp_con.sock->ops->getsockopt(sctp_con.sock,
413 IPPROTO_SCTP,
414 SCTP_PRIMARY_ADDR,
415 (char*)&prim,
416 &prim_len);
417 set_fs(fs);
418 if (ret < 0) {
419 struct nodeinfo *ni;
420
421 log_print("getsockopt/sctp_primary_addr on "
422 "new assoc %d failed : %d",
423 (int)sn->sn_assoc_change.sac_assoc_id,
424 ret);
425
426 /* Retry INIT later */
427 ni = assoc2nodeinfo(sn->sn_assoc_change.sac_assoc_id);
428 if (ni)
429 clear_bit(NI_INIT_PENDING, &ni->flags);
430 return;
431 }
432 make_sockaddr(&prim.ssp_addr, 0, &addr_len);
433 if (dlm_addr_to_nodeid(&prim.ssp_addr, &nodeid)) {
434 log_print("reject connect from unknown addr");
435 send_shutdown(prim.ssp_assoc_id);
436 return;
437 }
438
439 ni = nodeid2nodeinfo(nodeid, GFP_KERNEL);
440 if (!ni)
441 return;
442
443 /* Save the assoc ID */
444 ni->assoc_id = sn->sn_assoc_change.sac_assoc_id;
445
446 log_print("got new/restarted association %d nodeid %d",
447 (int)sn->sn_assoc_change.sac_assoc_id, nodeid);
448
449 /* Send any pending writes */
450 clear_bit(NI_INIT_PENDING, &ni->flags);
451 if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
452 spin_lock_bh(&write_nodes_lock);
453 list_add_tail(&ni->write_list, &write_nodes);
454 spin_unlock_bh(&write_nodes_lock);
455 queue_work(send_workqueue, &ni->swork);
456 }
457 }
458 break;
459
460 case SCTP_COMM_LOST:
461 case SCTP_SHUTDOWN_COMP:
462 {
463 struct nodeinfo *ni;
464
465 ni = assoc2nodeinfo(sn->sn_assoc_change.sac_assoc_id);
466 if (ni) {
467 spin_lock(&ni->lock);
468 ni->assoc_id = 0;
469 spin_unlock(&ni->lock);
470 }
471 }
472 break;
473
474 /* We don't know which INIT failed, so clear the PENDING flags
475 * on them all. if assoc_id is zero then it will then try
476 * again */
477
478 case SCTP_CANT_STR_ASSOC:
479 {
480 log_print("Can't start SCTP association - retrying");
481 init_failed();
482 }
483 break;
484
485 default:
486 log_print("unexpected SCTP assoc change id=%d state=%d",
487 (int)sn->sn_assoc_change.sac_assoc_id,
488 sn->sn_assoc_change.sac_state);
489 }
490 }
491}
492
493/* Data received from remote end */
494static int receive_from_sock(void)
495{
496 int ret = 0;
497 struct msghdr msg;
498 struct kvec iov[2];
499 unsigned len;
500 int r;
501 struct sctp_sndrcvinfo *sinfo;
502 struct cmsghdr *cmsg;
503 struct nodeinfo *ni;
504
505 /* These two are marginally too big for stack allocation, but this
506 * function is (currently) only called by dlm_recvd so static should be
507 * OK.
508 */
509 static struct sockaddr_storage msgname;
510 static char incmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
511
512 if (sctp_con.sock == NULL)
513 goto out;
514
515 if (sctp_con.rx_page == NULL) {
516 /*
517 * This doesn't need to be atomic, but I think it should
518 * improve performance if it is.
519 */
520 sctp_con.rx_page = alloc_page(GFP_ATOMIC);
521 if (sctp_con.rx_page == NULL)
522 goto out_resched;
523 cbuf_init(&sctp_con.cb, PAGE_CACHE_SIZE);
524 }
525
526 memset(&incmsg, 0, sizeof(incmsg));
527 memset(&msgname, 0, sizeof(msgname));
528
529 msg.msg_name = &msgname;
530 msg.msg_namelen = sizeof(msgname);
531 msg.msg_flags = 0;
532 msg.msg_control = incmsg;
533 msg.msg_controllen = sizeof(incmsg);
534 msg.msg_iovlen = 1;
535
536 /* I don't see why this circular buffer stuff is necessary for SCTP
537 * which is a packet-based protocol, but the whole thing breaks under
538 * load without it! The overhead is minimal (and is in the TCP lowcomms
539 * anyway, of course) so I'll leave it in until I can figure out what's
540 * really happening.
541 */
542
543 /*
544 * iov[0] is the bit of the circular buffer between the current end
545 * point (cb.base + cb.len) and the end of the buffer.
546 */
547 iov[0].iov_len = sctp_con.cb.base - cbuf_data(&sctp_con.cb);
548 iov[0].iov_base = page_address(sctp_con.rx_page) +
549 cbuf_data(&sctp_con.cb);
550 iov[1].iov_len = 0;
551
552 /*
553 * iov[1] is the bit of the circular buffer between the start of the
554 * buffer and the start of the currently used section (cb.base)
555 */
556 if (cbuf_data(&sctp_con.cb) >= sctp_con.cb.base) {
557 iov[0].iov_len = PAGE_CACHE_SIZE - cbuf_data(&sctp_con.cb);
558 iov[1].iov_len = sctp_con.cb.base;
559 iov[1].iov_base = page_address(sctp_con.rx_page);
560 msg.msg_iovlen = 2;
561 }
562 len = iov[0].iov_len + iov[1].iov_len;
563
564 r = ret = kernel_recvmsg(sctp_con.sock, &msg, iov, msg.msg_iovlen, len,
565 MSG_NOSIGNAL | MSG_DONTWAIT);
566 if (ret <= 0)
567 goto out_close;
568
569 msg.msg_control = incmsg;
570 msg.msg_controllen = sizeof(incmsg);
571 cmsg = CMSG_FIRSTHDR(&msg);
572 sinfo = CMSG_DATA(cmsg);
573
574 if (msg.msg_flags & MSG_NOTIFICATION) {
575 process_sctp_notification(&msg, page_address(sctp_con.rx_page));
576 return 0;
577 }
578
579 /* Is this a new association ? */
580 ni = nodeid2nodeinfo(le32_to_cpu(sinfo->sinfo_ppid), GFP_KERNEL);
581 if (ni) {
582 ni->assoc_id = sinfo->sinfo_assoc_id;
583 if (test_and_clear_bit(NI_INIT_PENDING, &ni->flags)) {
584
585 if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
586 spin_lock_bh(&write_nodes_lock);
587 list_add_tail(&ni->write_list, &write_nodes);
588 spin_unlock_bh(&write_nodes_lock);
589 queue_work(send_workqueue, &ni->swork);
590 }
591 }
592 }
593
594 /* INIT sends a message with length of 1 - ignore it */
595 if (r == 1)
596 return 0;
597
598 cbuf_add(&sctp_con.cb, ret);
599 // PJC: TODO: Add to node's workqueue....can we ??
600 ret = dlm_process_incoming_buffer(cpu_to_le32(sinfo->sinfo_ppid),
601 page_address(sctp_con.rx_page),
602 sctp_con.cb.base, sctp_con.cb.len,
603 PAGE_CACHE_SIZE);
604 if (ret < 0)
605 goto out_close;
606 cbuf_eat(&sctp_con.cb, ret);
607
608out:
609 ret = 0;
610 goto out_ret;
611
612out_resched:
613 lowcomms_data_ready(sctp_con.sock->sk, 0);
614 ret = 0;
615 cond_resched();
616 goto out_ret;
617
618out_close:
619 if (ret != -EAGAIN)
620 log_print("error reading from sctp socket: %d", ret);
621out_ret:
622 return ret;
623}
624
625/* Bind to an IP address. SCTP allows multiple addresses so it can do multi-homing */
626static int add_bind_addr(struct sockaddr_storage *addr, int addr_len, int num)
627{
628 mm_segment_t fs;
629 int result = 0;
630
631 fs = get_fs();
632 set_fs(get_ds());
633 if (num == 1)
634 result = sctp_con.sock->ops->bind(sctp_con.sock,
635 (struct sockaddr *) addr,
636 addr_len);
637 else
638 result = sctp_con.sock->ops->setsockopt(sctp_con.sock, SOL_SCTP,
639 SCTP_SOCKOPT_BINDX_ADD,
640 (char *)addr, addr_len);
641 set_fs(fs);
642
643 if (result < 0)
644 log_print("Can't bind to port %d addr number %d",
645 dlm_config.ci_tcp_port, num);
646
647 return result;
648}
649
650static void init_local(void)
651{
652 struct sockaddr_storage sas, *addr;
653 int i;
654
655 dlm_local_nodeid = dlm_our_nodeid();
656
657 for (i = 0; i < DLM_MAX_ADDR_COUNT - 1; i++) {
658 if (dlm_our_addr(&sas, i))
659 break;
660
661 addr = kmalloc(sizeof(*addr), GFP_KERNEL);
662 if (!addr)
663 break;
664 memcpy(addr, &sas, sizeof(*addr));
665 dlm_local_addr[dlm_local_count++] = addr;
666 }
667}
668
669/* Initialise SCTP socket and bind to all interfaces */
670static int init_sock(void)
671{
672 mm_segment_t fs;
673 struct socket *sock = NULL;
674 struct sockaddr_storage localaddr;
675 struct sctp_event_subscribe subscribe;
676 int result = -EINVAL, num = 1, i, addr_len;
677
678 if (!dlm_local_count) {
679 init_local();
680 if (!dlm_local_count) {
681 log_print("no local IP address has been set");
682 goto out;
683 }
684 }
685
686 result = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_SEQPACKET,
687 IPPROTO_SCTP, &sock);
688 if (result < 0) {
689 log_print("Can't create comms socket, check SCTP is loaded");
690 goto out;
691 }
692
693 /* Listen for events */
694 memset(&subscribe, 0, sizeof(subscribe));
695 subscribe.sctp_data_io_event = 1;
696 subscribe.sctp_association_event = 1;
697 subscribe.sctp_send_failure_event = 1;
698 subscribe.sctp_shutdown_event = 1;
699 subscribe.sctp_partial_delivery_event = 1;
700
701 fs = get_fs();
702 set_fs(get_ds());
703 result = sock->ops->setsockopt(sock, SOL_SCTP, SCTP_EVENTS,
704 (char *)&subscribe, sizeof(subscribe));
705 set_fs(fs);
706
707 if (result < 0) {
708 log_print("Failed to set SCTP_EVENTS on socket: result=%d",
709 result);
710 goto create_delsock;
711 }
712
713 /* Init con struct */
714 sock->sk->sk_user_data = &sctp_con;
715 sctp_con.sock = sock;
716 sctp_con.sock->sk->sk_data_ready = lowcomms_data_ready;
717
718 /* Bind to all interfaces. */
719 for (i = 0; i < dlm_local_count; i++) {
720 memcpy(&localaddr, dlm_local_addr[i], sizeof(localaddr));
721 make_sockaddr(&localaddr, dlm_config.ci_tcp_port, &addr_len);
722
723 result = add_bind_addr(&localaddr, addr_len, num);
724 if (result)
725 goto create_delsock;
726 ++num;
727 }
728
729 result = sock->ops->listen(sock, 5);
730 if (result < 0) {
731 log_print("Can't set socket listening");
732 goto create_delsock;
733 }
734
735 return 0;
736
737create_delsock:
738 sock_release(sock);
739 sctp_con.sock = NULL;
740out:
741 return result;
742}
743
744
745static struct writequeue_entry *new_writequeue_entry(gfp_t allocation)
746{
747 struct writequeue_entry *entry;
748
749 entry = kmalloc(sizeof(struct writequeue_entry), allocation);
750 if (!entry)
751 return NULL;
752
753 entry->page = alloc_page(allocation);
754 if (!entry->page) {
755 kfree(entry);
756 return NULL;
757 }
758
759 entry->offset = 0;
760 entry->len = 0;
761 entry->end = 0;
762 entry->users = 0;
763
764 return entry;
765}
766
767void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc)
768{
769 struct writequeue_entry *e;
770 int offset = 0;
771 int users = 0;
772 struct nodeinfo *ni;
773
774 ni = nodeid2nodeinfo(nodeid, allocation);
775 if (!ni)
776 return NULL;
777
778 spin_lock(&ni->writequeue_lock);
779 e = list_entry(ni->writequeue.prev, struct writequeue_entry, list);
780 if ((&e->list == &ni->writequeue) ||
781 (PAGE_CACHE_SIZE - e->end < len)) {
782 e = NULL;
783 } else {
784 offset = e->end;
785 e->end += len;
786 users = e->users++;
787 }
788 spin_unlock(&ni->writequeue_lock);
789
790 if (e) {
791 got_one:
792 if (users == 0)
793 kmap(e->page);
794 *ppc = page_address(e->page) + offset;
795 return e;
796 }
797
798 e = new_writequeue_entry(allocation);
799 if (e) {
800 spin_lock(&ni->writequeue_lock);
801 offset = e->end;
802 e->end += len;
803 e->ni = ni;
804 users = e->users++;
805 list_add_tail(&e->list, &ni->writequeue);
806 spin_unlock(&ni->writequeue_lock);
807 goto got_one;
808 }
809 return NULL;
810}
811
812void dlm_lowcomms_commit_buffer(void *arg)
813{
814 struct writequeue_entry *e = (struct writequeue_entry *) arg;
815 int users;
816 struct nodeinfo *ni = e->ni;
817
818 spin_lock(&ni->writequeue_lock);
819 users = --e->users;
820 if (users)
821 goto out;
822 e->len = e->end - e->offset;
823 kunmap(e->page);
824 spin_unlock(&ni->writequeue_lock);
825
826 if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
827 spin_lock_bh(&write_nodes_lock);
828 list_add_tail(&ni->write_list, &write_nodes);
829 spin_unlock_bh(&write_nodes_lock);
830
831 queue_work(send_workqueue, &ni->swork);
832 }
833 return;
834
835out:
836 spin_unlock(&ni->writequeue_lock);
837 return;
838}
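dlm_lowcomms_get_buffer()/dlm_lowcomms_commit_buffer() above form a two-phase reservation: get claims [end, end+len) in the tail page and bumps users; commit drops users, and only the last committer publishes the accumulated length and schedules the send, so concurrent writers can fill one page without blocking each other. A userspace sketch of the protocol (single entry, hypothetical names):

    #include <pthread.h>

    struct wq_entry {
            int end;        /* bytes reserved so far */
            int len;        /* bytes committed and sendable */
            int users;      /* writers between get and commit */
            pthread_mutex_t lock;
    };

    static int get_buffer(struct wq_entry *e, int len)
    {
            int offset;

            pthread_mutex_lock(&e->lock);
            offset = e->end;       /* claim [offset, offset + len) */
            e->end += len;
            e->users++;
            pthread_mutex_unlock(&e->lock);
            return offset;         /* caller fills its span unlocked */
    }

    static int commit_buffer(struct wq_entry *e)
    {
            int sendable = 0;

            pthread_mutex_lock(&e->lock);
            if (--e->users == 0) {
                    e->len = e->end;   /* everything reserved is now valid */
                    sendable = 1;      /* real code queues the send work here */
            }
            pthread_mutex_unlock(&e->lock);
            return sendable;
    }

    int main(void)
    {
            struct wq_entry e = { 0, 0, 0, PTHREAD_MUTEX_INITIALIZER };

            get_buffer(&e, 64);
            return commit_buffer(&e) ? 0 : 1;
    }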
839
840static void free_entry(struct writequeue_entry *e)
841{
842 __free_page(e->page);
843 kfree(e);
844}
845
846/* Initiate an SCTP association. In theory we could just use sendmsg() on
847 the first IP address and it should work, but this allows us to set up the
848 association before sending any valuable data that we can't afford to lose.
849 It also keeps the send path clean as it can now always use the association ID */
850static void initiate_association(int nodeid)
851{
852 struct sockaddr_storage rem_addr;
853 static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
854 struct msghdr outmessage;
855 struct cmsghdr *cmsg;
856 struct sctp_sndrcvinfo *sinfo;
857 int ret;
858 int addrlen;
859 char buf[1];
860 struct kvec iov[1];
861 struct nodeinfo *ni;
862
863 log_print("Initiating association with node %d", nodeid);
864
865 ni = nodeid2nodeinfo(nodeid, GFP_KERNEL);
866 if (!ni)
867 return;
868
869 if (nodeid_to_addr(nodeid, (struct sockaddr *)&rem_addr)) {
870 log_print("no address for nodeid %d", nodeid);
871 return;
872 }
873
874 make_sockaddr(&rem_addr, dlm_config.ci_tcp_port, &addrlen);
875
876 outmessage.msg_name = &rem_addr;
877 outmessage.msg_namelen = addrlen;
878 outmessage.msg_control = outcmsg;
879 outmessage.msg_controllen = sizeof(outcmsg);
880 outmessage.msg_flags = MSG_EOR;
881
882 iov[0].iov_base = buf;
883 iov[0].iov_len = 1;
884
885 /* Real INIT messages seem to cause trouble. Just send a 1 byte message
886 we can afford to lose */
887 cmsg = CMSG_FIRSTHDR(&outmessage);
888 cmsg->cmsg_level = IPPROTO_SCTP;
889 cmsg->cmsg_type = SCTP_SNDRCV;
890 cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
891 sinfo = CMSG_DATA(cmsg);
892 memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
893 sinfo->sinfo_ppid = cpu_to_le32(dlm_local_nodeid);
894
895 outmessage.msg_controllen = cmsg->cmsg_len;
896 ret = kernel_sendmsg(sctp_con.sock, &outmessage, iov, 1, 1);
897 if (ret < 0) {
898 log_print("send INIT to node failed: %d", ret);
899 /* Try again later */
900 clear_bit(NI_INIT_PENDING, &ni->flags);
901 }
902}
903
904/* Send a message */
905static void send_to_sock(struct nodeinfo *ni)
906{
907 int ret = 0;
908 struct writequeue_entry *e;
909 int len, offset;
910 struct msghdr outmsg;
911 static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
912 struct cmsghdr *cmsg;
913 struct sctp_sndrcvinfo *sinfo;
914 struct kvec iov;
915
916 /* See if we need to init an association before we start
917 sending precious messages */
918 spin_lock(&ni->lock);
919 if (!ni->assoc_id && !test_and_set_bit(NI_INIT_PENDING, &ni->flags)) {
920 spin_unlock(&ni->lock);
921 initiate_association(ni->nodeid);
922 return;
923 }
924 spin_unlock(&ni->lock);
925
926 outmsg.msg_name = NULL; /* We use assoc_id */
927 outmsg.msg_namelen = 0;
928 outmsg.msg_control = outcmsg;
929 outmsg.msg_controllen = sizeof(outcmsg);
930 outmsg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL | MSG_EOR;
931
932 cmsg = CMSG_FIRSTHDR(&outmsg);
933 cmsg->cmsg_level = IPPROTO_SCTP;
934 cmsg->cmsg_type = SCTP_SNDRCV;
935 cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
936 sinfo = CMSG_DATA(cmsg);
937 memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
938 sinfo->sinfo_ppid = cpu_to_le32(dlm_local_nodeid);
939 sinfo->sinfo_assoc_id = ni->assoc_id;
940 outmsg.msg_controllen = cmsg->cmsg_len;
941
942 spin_lock(&ni->writequeue_lock);
943 for (;;) {
944 if (list_empty(&ni->writequeue))
945 break;
946 e = list_entry(ni->writequeue.next, struct writequeue_entry,
947 list);
948 len = e->len;
949 offset = e->offset;
950 BUG_ON(len == 0 && e->users == 0);
951 spin_unlock(&ni->writequeue_lock);
952 kmap(e->page);
953
954 ret = 0;
955 if (len) {
956 iov.iov_base = page_address(e->page)+offset;
957 iov.iov_len = len;
958
959 ret = kernel_sendmsg(sctp_con.sock, &outmsg, &iov, 1,
960 len);
961 if (ret == -EAGAIN) {
962 sctp_con.eagain_flag = 1;
963 goto out;
964 } else if (ret < 0)
965 goto send_error;
966 } else {
967 /* Don't starve people filling buffers */
968 cond_resched();
969 }
970
971 spin_lock(&ni->writequeue_lock);
972 e->offset += ret;
973 e->len -= ret;
974
975 if (e->len == 0 && e->users == 0) {
976 list_del(&e->list);
977 kunmap(e->page);
978 free_entry(e);
979 continue;
980 }
981 }
982 spin_unlock(&ni->writequeue_lock);
983out:
984 return;
985
986send_error:
987 log_print("Error sending to node %d %d", ni->nodeid, ret);
988 spin_lock(&ni->lock);
989 if (!test_and_set_bit(NI_INIT_PENDING, &ni->flags)) {
990 ni->assoc_id = 0;
991 spin_unlock(&ni->lock);
992 initiate_association(ni->nodeid);
993 } else
994 spin_unlock(&ni->lock);
995
996 return;
997}
998
999/* Try to send any messages that are pending */
1000static void process_output_queue(void)
1001{
1002 struct list_head *list;
1003 struct list_head *temp;
1004
1005 spin_lock_bh(&write_nodes_lock);
1006 list_for_each_safe(list, temp, &write_nodes) {
1007 struct nodeinfo *ni =
1008 list_entry(list, struct nodeinfo, write_list);
1009 clear_bit(NI_WRITE_PENDING, &ni->flags);
1010 list_del(&ni->write_list);
1011
1012 spin_unlock_bh(&write_nodes_lock);
1013
1014 send_to_sock(ni);
1015 spin_lock_bh(&write_nodes_lock);
1016 }
1017 spin_unlock_bh(&write_nodes_lock);
1018}
1019
1020/* Called after we've had -EAGAIN and been woken up */
1021static void refill_write_queue(void)
1022{
1023 int i;
1024
1025 for (i=1; i<=max_nodeid; i++) {
1026 struct nodeinfo *ni = nodeid2nodeinfo(i, 0);
1027
1028 if (ni) {
1029 if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
1030 spin_lock_bh(&write_nodes_lock);
1031 list_add_tail(&ni->write_list, &write_nodes);
1032 spin_unlock_bh(&write_nodes_lock);
1033 }
1034 }
1035 }
1036}
1037
1038static void clean_one_writequeue(struct nodeinfo *ni)
1039{
1040 struct list_head *list;
1041 struct list_head *temp;
1042
1043 spin_lock(&ni->writequeue_lock);
1044 list_for_each_safe(list, temp, &ni->writequeue) {
1045 struct writequeue_entry *e =
1046 list_entry(list, struct writequeue_entry, list);
1047 list_del(&e->list);
1048 free_entry(e);
1049 }
1050 spin_unlock(&ni->writequeue_lock);
1051}
1052
1053static void clean_writequeues(void)
1054{
1055 int i;
1056
1057 for (i=1; i<=max_nodeid; i++) {
1058 struct nodeinfo *ni = nodeid2nodeinfo(i, 0);
1059 if (ni)
1060 clean_one_writequeue(ni);
1061 }
1062}
1063
1064
1065static void dealloc_nodeinfo(void)
1066{
1067 int i;
1068
1069 for (i=1; i<=max_nodeid; i++) {
1070 struct nodeinfo *ni = nodeid2nodeinfo(i, 0);
1071 if (ni) {
1072 idr_remove(&nodeinfo_idr, i);
1073 kfree(ni);
1074 }
1075 }
1076}
1077
1078int dlm_lowcomms_close(int nodeid)
1079{
1080 struct nodeinfo *ni;
1081
1082 ni = nodeid2nodeinfo(nodeid, 0);
1083 if (!ni)
1084 return -1;
1085
1086 spin_lock(&ni->lock);
1087 if (ni->assoc_id) {
1088 ni->assoc_id = 0;
1089 /* Don't send shutdown here, sctp will just queue it
1090 till the node comes back up! */
1091 }
1092 spin_unlock(&ni->lock);
1093
1094 clean_one_writequeue(ni);
1095 clear_bit(NI_INIT_PENDING, &ni->flags);
1096 return 0;
1097}
1098
1099// PJC: The work queue function for receiving.
1100static void process_recv_sockets(struct work_struct *work)
1101{
1102 if (test_and_clear_bit(CF_READ_PENDING, &sctp_con.flags)) {
1103 int ret;
1104 int count = 0;
1105
1106 do {
1107 ret = receive_from_sock();
1108
1109 /* Don't starve out everyone else */
1110 if (++count >= MAX_RX_MSG_COUNT) {
1111 cond_resched();
1112 count = 0;
1113 }
1114 } while (!kthread_should_stop() && ret >=0);
1115 }
1116 cond_resched();
1117}
1118
1119// PJC: the work queue function for sending
1120static void process_send_sockets(struct work_struct *work)
1121{
1122 if (sctp_con.eagain_flag) {
1123 sctp_con.eagain_flag = 0;
1124 refill_write_queue();
1125 }
1126 process_output_queue();
1127}
1128
1129// PJC: Process lock requests from a particular node.
1130// TODO: can we optimise this out on UP ??
1131static void process_lock_request(struct work_struct *work)
1132{
1133}
1134
1135static void daemons_stop(void)
1136{
1137 destroy_workqueue(recv_workqueue);
1138 destroy_workqueue(send_workqueue);
1139 destroy_workqueue(lock_workqueue);
1140}
1141
1142static int daemons_start(void)
1143{
1144 int error;
1145 recv_workqueue = create_workqueue("dlm_recv");
1146 error = IS_ERR(recv_workqueue);
1147 if (error) {
1148 log_print("can't start dlm_recv %d", error);
1149 return error;
1150 }
1151
1152 send_workqueue = create_singlethread_workqueue("dlm_send");
1153 error = IS_ERR(send_workqueue);
1154 if (error) {
1155 log_print("can't start dlm_send %d", error);
1156 destroy_workqueue(recv_workqueue);
1157 return error;
1158 }
1159
1160 lock_workqueue = create_workqueue("dlm_rlock");
1161 error = IS_ERR(lock_workqueue);
1162 if (error) {
1163 log_print("can't start dlm_rlock %d", error);
1164 destroy_workqueue(send_workqueue);
1165 destroy_workqueue(recv_workqueue);
1166 return error;
1167 }
1168
1169 return 0;
1170}
1171
1172/*
1173 * This is quite likely to sleep...
1174 */
1175int dlm_lowcomms_start(void)
1176{
1177 int error;
1178
1179 INIT_WORK(&sctp_con.work, process_recv_sockets);
1180
1181 error = init_sock();
1182 if (error)
1183 goto fail_sock;
1184 error = daemons_start();
1185 if (error)
1186 goto fail_sock;
1187 return 0;
1188
1189fail_sock:
1190 close_connection();
1191 return error;
1192}
1193
1194void dlm_lowcomms_stop(void)
1195{
1196 int i;
1197
1198 sctp_con.flags = 0x7;
1199 daemons_stop();
1200 clean_writequeues();
1201 close_connection();
1202 dealloc_nodeinfo();
1203 max_nodeid = 0;
1204
1205 dlm_local_count = 0;
1206 dlm_local_nodeid = 0;
1207
1208 for (i = 0; i < dlm_local_count; i++)
1209 kfree(dlm_local_addr[i]);
1210}
diff --git a/fs/dlm/lowcomms-tcp.c b/fs/dlm/lowcomms.c
index 07e0a122c32f..27970a58d29b 100644
--- a/fs/dlm/lowcomms-tcp.c
+++ b/fs/dlm/lowcomms.c
@@ -36,30 +36,36 @@
36 * of high load. Also, this way, the sending thread can collect together 36 * of high load. Also, this way, the sending thread can collect together
37 * messages bound for one node and send them in one block. 37 * messages bound for one node and send them in one block.
38 * 38 *
39 * I don't see any problem with the recv thread executing the locking 39 * lowcomms will choose to use either TCP or SCTP as its transport layer
40 * code on behalf of remote processes as the locking code is 40 * depending on the configuration variable 'protocol'. This should be set
41 * short, efficient and never waits. 41 * to 0 (default) for TCP or 1 for SCTP. It should be configured using a
42 * cluster-wide mechanism as it must be the same on all nodes of the cluster
43 * for the DLM to function.
42 * 44 *
43 */ 45 */
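Since both transports now share this file, the choice is made once from the cluster-wide 'protocol' value and then dispatched through per-connection function pointers (the connect_action/rx_action fields added below). A minimal sketch of that dispatch, with hypothetical names; the real code stores the pointer in con->connect_action:

    enum { DLM_PROTO_TCP = 0, DLM_PROTO_SCTP = 1 };

    struct connection;     /* opaque here; see the struct below */

    static void tcp_connect(struct connection *con)  { (void)con; /* ... */ }
    static void sctp_connect(struct connection *con) { (void)con; /* ... */ }

    /* pick the connect action once, from the cluster-wide knob */
    static void (*pick_connect(int protocol))(struct connection *)
    {
            return protocol == DLM_PROTO_SCTP ? sctp_connect : tcp_connect;
    }

    int main(void)
    {
            return pick_connect(DLM_PROTO_TCP) == tcp_connect ? 0 : 1;
    }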
44 46
45
46#include <asm/ioctls.h> 47#include <asm/ioctls.h>
47#include <net/sock.h> 48#include <net/sock.h>
48#include <net/tcp.h> 49#include <net/tcp.h>
49#include <linux/pagemap.h> 50#include <linux/pagemap.h>
51#include <linux/idr.h>
52#include <linux/file.h>
53#include <linux/sctp.h>
54#include <net/sctp/user.h>
50 55
51#include "dlm_internal.h" 56#include "dlm_internal.h"
52#include "lowcomms.h" 57#include "lowcomms.h"
53#include "midcomms.h" 58#include "midcomms.h"
54#include "config.h" 59#include "config.h"
55 60
61#define NEEDED_RMEM (4*1024*1024)
62
56struct cbuf { 63struct cbuf {
57 unsigned int base; 64 unsigned int base;
58 unsigned int len; 65 unsigned int len;
59 unsigned int mask; 66 unsigned int mask;
60}; 67};
61 68
62#define NODE_INCREMENT 32
63static void cbuf_add(struct cbuf *cb, int n) 69static void cbuf_add(struct cbuf *cb, int n)
64{ 70{
65 cb->len += n; 71 cb->len += n;
@@ -88,28 +94,25 @@ static bool cbuf_empty(struct cbuf *cb)
88 return cb->len == 0; 94 return cb->len == 0;
89} 95}
90 96
91/* Maximum number of incoming messages to process before
92 doing a cond_resched()
93*/
94#define MAX_RX_MSG_COUNT 25
95
96struct connection { 97struct connection {
97 struct socket *sock; /* NULL if not connected */ 98 struct socket *sock; /* NULL if not connected */
98 uint32_t nodeid; /* So we know who we are in the list */ 99 uint32_t nodeid; /* So we know who we are in the list */
99 struct mutex sock_mutex; 100 struct mutex sock_mutex;
100 unsigned long flags; /* bit 1,2 = We are on the read/write lists */ 101 unsigned long flags;
101#define CF_READ_PENDING 1 102#define CF_READ_PENDING 1
102#define CF_WRITE_PENDING 2 103#define CF_WRITE_PENDING 2
103#define CF_CONNECT_PENDING 3 104#define CF_CONNECT_PENDING 3
104#define CF_IS_OTHERCON 4 105#define CF_INIT_PENDING 4
106#define CF_IS_OTHERCON 5
105 struct list_head writequeue; /* List of outgoing writequeue_entries */ 107 struct list_head writequeue; /* List of outgoing writequeue_entries */
106 struct list_head listenlist; /* List of allocated listening sockets */
107 spinlock_t writequeue_lock; 108 spinlock_t writequeue_lock;
108 int (*rx_action) (struct connection *); /* What to do when active */ 109 int (*rx_action) (struct connection *); /* What to do when active */
110 void (*connect_action) (struct connection *); /* What to do to connect */
109 struct page *rx_page; 111 struct page *rx_page;
110 struct cbuf cb; 112 struct cbuf cb;
111 int retries; 113 int retries;
112#define MAX_CONNECT_RETRIES 3 114#define MAX_CONNECT_RETRIES 3
115 int sctp_assoc;
113 struct connection *othercon; 116 struct connection *othercon;
114 struct work_struct rwork; /* Receive workqueue */ 117 struct work_struct rwork; /* Receive workqueue */
115 struct work_struct swork; /* Send workqueue */ 118 struct work_struct swork; /* Send workqueue */
@@ -127,68 +130,136 @@ struct writequeue_entry {
127 struct connection *con; 130 struct connection *con;
128}; 131};
129 132
130static struct sockaddr_storage dlm_local_addr; 133static struct sockaddr_storage *dlm_local_addr[DLM_MAX_ADDR_COUNT];
134static int dlm_local_count;
131 135
132/* Work queues */ 136/* Work queues */
133static struct workqueue_struct *recv_workqueue; 137static struct workqueue_struct *recv_workqueue;
134static struct workqueue_struct *send_workqueue; 138static struct workqueue_struct *send_workqueue;
135 139
136/* An array of pointers to connections, indexed by NODEID */ 140static DEFINE_IDR(connections_idr);
137static struct connection **connections;
138static DECLARE_MUTEX(connections_lock); 141static DECLARE_MUTEX(connections_lock);
142static int max_nodeid;
139static struct kmem_cache *con_cache; 143static struct kmem_cache *con_cache;
140static int conn_array_size;
141 144
142static void process_recv_sockets(struct work_struct *work); 145static void process_recv_sockets(struct work_struct *work);
143static void process_send_sockets(struct work_struct *work); 146static void process_send_sockets(struct work_struct *work);
144 147
145static struct connection *nodeid2con(int nodeid, gfp_t allocation) 148/*
149 * If 'allocation' is zero then we don't attempt to create a new
150 * connection structure for this node.
151 */
152static struct connection *__nodeid2con(int nodeid, gfp_t alloc)
146{ 153{
147 struct connection *con = NULL; 154 struct connection *con = NULL;
155 int r;
156 int n;
148 157
149 down(&connections_lock); 158 con = idr_find(&connections_idr, nodeid);
150 if (nodeid >= conn_array_size) { 159 if (con || !alloc)
151 int new_size = nodeid + NODE_INCREMENT; 160 return con;
152 struct connection **new_conns;
153 161
154 new_conns = kzalloc(sizeof(struct connection *) * 162 r = idr_pre_get(&connections_idr, alloc);
155 new_size, allocation); 163 if (!r)
156 if (!new_conns) 164 return NULL;
157 goto finish; 165
166 con = kmem_cache_zalloc(con_cache, alloc);
167 if (!con)
168 return NULL;
158 169
159 memcpy(new_conns, connections, sizeof(struct connection *) * conn_array_size); 170 r = idr_get_new_above(&connections_idr, con, nodeid, &n);
160 conn_array_size = new_size; 171 if (r) {
161 kfree(connections); 172 kmem_cache_free(con_cache, con);
162 connections = new_conns; 173 return NULL;
174 }
163 175
176 if (n != nodeid) {
177 idr_remove(&connections_idr, n);
178 kmem_cache_free(con_cache, con);
179 return NULL;
164 } 180 }
165 181
166 con = connections[nodeid]; 182 con->nodeid = nodeid;
167 if (con == NULL && allocation) { 183 mutex_init(&con->sock_mutex);
168 con = kmem_cache_zalloc(con_cache, allocation); 184 INIT_LIST_HEAD(&con->writequeue);
169 if (!con) 185 spin_lock_init(&con->writequeue_lock);
170 goto finish; 186 INIT_WORK(&con->swork, process_send_sockets);
187 INIT_WORK(&con->rwork, process_recv_sockets);
171 188
172 con->nodeid = nodeid; 189 /* Setup action pointers for child sockets */
173 mutex_init(&con->sock_mutex); 190 if (con->nodeid) {
174 INIT_LIST_HEAD(&con->writequeue); 191 struct connection *zerocon = idr_find(&connections_idr, 0);
175 spin_lock_init(&con->writequeue_lock);
176 INIT_WORK(&con->swork, process_send_sockets);
177 INIT_WORK(&con->rwork, process_recv_sockets);
178 192
179 connections[nodeid] = con; 193 con->connect_action = zerocon->connect_action;
194 if (!con->rx_action)
195 con->rx_action = zerocon->rx_action;
180 } 196 }
181 197
182finish: 198 if (nodeid > max_nodeid)
199 max_nodeid = nodeid;
200
201 return con;
202}
203
204static struct connection *nodeid2con(int nodeid, gfp_t allocation)
205{
206 struct connection *con;
207
208 down(&connections_lock);
209 con = __nodeid2con(nodeid, allocation);
183 up(&connections_lock); 210 up(&connections_lock);
211
184 return con; 212 return con;
185} 213}
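__nodeid2con() replaces the grow-on-demand array with an idr: look up first, and only when the caller passed an allocation mask create, insert via idr_pre_get()/idr_get_new_above(), and back out if the idr hands back a different id. A userspace stand-in using a flat array as the map (an assumed simplification of the idr calls):

    #include <stdlib.h>
    #include <pthread.h>

    #define MAX_NODES 128

    struct connection { int nodeid; };

    static struct connection *map[MAX_NODES];
    static pthread_mutex_t map_lock = PTHREAD_MUTEX_INITIALIZER;

    static struct connection *nodeid2con(int nodeid, int alloc)
    {
            struct connection *con;

            pthread_mutex_lock(&map_lock);
            con = map[nodeid];
            if (con || !alloc)
                    goto out;                 /* found, or lookup-only call */

            con = calloc(1, sizeof(*con));    /* kmem_cache_zalloc() */
            if (con) {
                    con->nodeid = nodeid;
                    map[nodeid] = con;        /* idr_get_new_above() */
            }
    out:
            pthread_mutex_unlock(&map_lock);
            return con;
    }

    int main(void)
    {
            return nodeid2con(3, 1) != nodeid2con(3, 0);
    }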
186 214
215/* This is a bit drastic, but only called when things go wrong */
216static struct connection *assoc2con(int assoc_id)
217{
218 int i;
219 struct connection *con;
220
221 down(&connections_lock);
222 for (i=0; i<=max_nodeid; i++) {
223 con = __nodeid2con(i, 0);
224 if (con && con->sctp_assoc == assoc_id) {
225 up(&connections_lock);
226 return con;
227 }
228 }
229 up(&connections_lock);
230 return NULL;
231}
232
233static int nodeid_to_addr(int nodeid, struct sockaddr *retaddr)
234{
235 struct sockaddr_storage addr;
236 int error;
237
238 if (!dlm_local_count)
239 return -1;
240
241 error = dlm_nodeid_to_addr(nodeid, &addr);
242 if (error)
243 return error;
244
245 if (dlm_local_addr[0]->ss_family == AF_INET) {
246 struct sockaddr_in *in4 = (struct sockaddr_in *) &addr;
247 struct sockaddr_in *ret4 = (struct sockaddr_in *) retaddr;
248 ret4->sin_addr.s_addr = in4->sin_addr.s_addr;
249 } else {
250 struct sockaddr_in6 *in6 = (struct sockaddr_in6 *) &addr;
251 struct sockaddr_in6 *ret6 = (struct sockaddr_in6 *) retaddr;
252 memcpy(&ret6->sin6_addr, &in6->sin6_addr,
253 sizeof(in6->sin6_addr));
254 }
255
256 return 0;
257}
258
187/* Data available on socket or listen socket received a connect */ 259/* Data available on socket or listen socket received a connect */
188static void lowcomms_data_ready(struct sock *sk, int count_unused) 260static void lowcomms_data_ready(struct sock *sk, int count_unused)
189{ 261{
190 struct connection *con = sock2con(sk); 262 struct connection *con = sock2con(sk);
191
192 if (!test_and_set_bit(CF_READ_PENDING, &con->flags)) 263 if (!test_and_set_bit(CF_READ_PENDING, &con->flags))
193 queue_work(recv_workqueue, &con->rwork); 264 queue_work(recv_workqueue, &con->rwork);
194} 265}
@@ -222,20 +293,21 @@ static int add_sock(struct socket *sock, struct connection *con)
222 con->sock->sk->sk_data_ready = lowcomms_data_ready; 293 con->sock->sk->sk_data_ready = lowcomms_data_ready;
223 con->sock->sk->sk_write_space = lowcomms_write_space; 294 con->sock->sk->sk_write_space = lowcomms_write_space;
224 con->sock->sk->sk_state_change = lowcomms_state_change; 295 con->sock->sk->sk_state_change = lowcomms_state_change;
225 296 con->sock->sk->sk_user_data = con;
226 return 0; 297 return 0;
227} 298}
228 299
 229/* Add the port number to an IP6 or 4 sockaddr and return the address 300/* Add the port number to an IPv4 or IPv6 sockaddr and return the address
230 length */ 301 length */
231static void make_sockaddr(struct sockaddr_storage *saddr, uint16_t port, 302static void make_sockaddr(struct sockaddr_storage *saddr, uint16_t port,
232 int *addr_len) 303 int *addr_len)
233{ 304{
234 saddr->ss_family = dlm_local_addr.ss_family; 305 saddr->ss_family = dlm_local_addr[0]->ss_family;
235 if (saddr->ss_family == AF_INET) { 306 if (saddr->ss_family == AF_INET) {
236 struct sockaddr_in *in4_addr = (struct sockaddr_in *)saddr; 307 struct sockaddr_in *in4_addr = (struct sockaddr_in *)saddr;
237 in4_addr->sin_port = cpu_to_be16(port); 308 in4_addr->sin_port = cpu_to_be16(port);
238 *addr_len = sizeof(struct sockaddr_in); 309 *addr_len = sizeof(struct sockaddr_in);
310 memset(&in4_addr->sin_zero, 0, sizeof(in4_addr->sin_zero));
239 } else { 311 } else {
240 struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)saddr; 312 struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)saddr;
241 in6_addr->sin6_port = cpu_to_be16(port); 313 in6_addr->sin6_port = cpu_to_be16(port);
@@ -264,6 +336,193 @@ static void close_connection(struct connection *con, bool and_other)
264 mutex_unlock(&con->sock_mutex); 336 mutex_unlock(&con->sock_mutex);
265} 337}
266 338
339/* We only send shutdown messages to nodes that are not part of the cluster */
340static void sctp_send_shutdown(sctp_assoc_t associd)
341{
342 static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
343 struct msghdr outmessage;
344 struct cmsghdr *cmsg;
345 struct sctp_sndrcvinfo *sinfo;
346 int ret;
347 struct connection *con;
348
 349 con = nodeid2con(0, 0);
350 BUG_ON(con == NULL);
351
352 outmessage.msg_name = NULL;
353 outmessage.msg_namelen = 0;
354 outmessage.msg_control = outcmsg;
355 outmessage.msg_controllen = sizeof(outcmsg);
356 outmessage.msg_flags = MSG_EOR;
357
358 cmsg = CMSG_FIRSTHDR(&outmessage);
359 cmsg->cmsg_level = IPPROTO_SCTP;
360 cmsg->cmsg_type = SCTP_SNDRCV;
361 cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
362 outmessage.msg_controllen = cmsg->cmsg_len;
363 sinfo = CMSG_DATA(cmsg);
364 memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
365
366 sinfo->sinfo_flags |= MSG_EOF;
367 sinfo->sinfo_assoc_id = associd;
368
369 ret = kernel_sendmsg(con->sock, &outmessage, NULL, 0, 0);
370
371 if (ret != 0)
372 log_print("send EOF to node failed: %d", ret);
373}
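
For comparison, and as an assumption about the userspace analogue rather
than part of the patch: the same zero-byte shutdown message can be built
against a one-to-many SCTP socket with the lksctp headers.

#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/sctp.h>

/* Ask the stack to SHUTDOWN (not ABORT) one association by sending a
 * zero-byte message whose sinfo_flags carry SCTP_EOF; the kernel-side
 * header used by the patch spells the same flag MSG_EOF. */
static int assoc_shutdown(int sd, sctp_assoc_t assoc_id)
{
	char cbuf[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
	struct msghdr msg;
	struct cmsghdr *cmsg;
	struct sctp_sndrcvinfo *sinfo;

	memset(&msg, 0, sizeof(msg));
	memset(cbuf, 0, sizeof(cbuf));
	msg.msg_control = cbuf;
	msg.msg_controllen = sizeof(cbuf);

	cmsg = CMSG_FIRSTHDR(&msg);
	cmsg->cmsg_level = IPPROTO_SCTP;
	cmsg->cmsg_type = SCTP_SNDRCV;
	cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
	msg.msg_controllen = cmsg->cmsg_len;

	sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
	sinfo->sinfo_flags = SCTP_EOF;
	sinfo->sinfo_assoc_id = assoc_id;

	return (int)sendmsg(sd, &msg, 0);
}
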
374
375/* INIT failed but we don't know which node...
376 restart INIT on all pending nodes */
377static void sctp_init_failed(void)
378{
379 int i;
380 struct connection *con;
381
382 down(&connections_lock);
 383 for (i = 1; i <= max_nodeid; i++) {
384 con = __nodeid2con(i, 0);
385 if (!con)
386 continue;
387 con->sctp_assoc = 0;
388 if (test_and_clear_bit(CF_CONNECT_PENDING, &con->flags)) {
389 if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags)) {
390 queue_work(send_workqueue, &con->swork);
391 }
392 }
393 }
394 up(&connections_lock);
395}
396
397/* Something happened to an association */
398static void process_sctp_notification(struct connection *con,
399 struct msghdr *msg, char *buf)
400{
401 union sctp_notification *sn = (union sctp_notification *)buf;
402
403 if (sn->sn_header.sn_type == SCTP_ASSOC_CHANGE) {
404 switch (sn->sn_assoc_change.sac_state) {
405
406 case SCTP_COMM_UP:
407 case SCTP_RESTART:
408 {
409 /* Check that the new node is in the lockspace */
410 struct sctp_prim prim;
411 int nodeid;
412 int prim_len, ret;
413 int addr_len;
414 struct connection *new_con;
415 struct file *file;
416 sctp_peeloff_arg_t parg;
417 int parglen = sizeof(parg);
418
419 /*
420 * We get this before any data for an association.
421 * We verify that the node is in the cluster and
422 * then peel off a socket for it.
423 */
424 if ((int)sn->sn_assoc_change.sac_assoc_id <= 0) {
425 log_print("COMM_UP for invalid assoc ID %d",
426 (int)sn->sn_assoc_change.sac_assoc_id);
427 sctp_init_failed();
428 return;
429 }
430 memset(&prim, 0, sizeof(struct sctp_prim));
431 prim_len = sizeof(struct sctp_prim);
432 prim.ssp_assoc_id = sn->sn_assoc_change.sac_assoc_id;
433
434 ret = kernel_getsockopt(con->sock,
435 IPPROTO_SCTP,
436 SCTP_PRIMARY_ADDR,
437 (char*)&prim,
438 &prim_len);
439 if (ret < 0) {
440 log_print("getsockopt/sctp_primary_addr on "
441 "new assoc %d failed : %d",
442 (int)sn->sn_assoc_change.sac_assoc_id,
443 ret);
444
445 /* Retry INIT later */
446 new_con = assoc2con(sn->sn_assoc_change.sac_assoc_id);
447 if (new_con)
448 clear_bit(CF_CONNECT_PENDING, &con->flags);
449 return;
450 }
451 make_sockaddr(&prim.ssp_addr, 0, &addr_len);
452 if (dlm_addr_to_nodeid(&prim.ssp_addr, &nodeid)) {
453 int i;
 454 unsigned char *b = (unsigned char *)&prim.ssp_addr;
 455 log_print("reject connect from unknown addr");
 456 for (i = 0; i < sizeof(struct sockaddr_storage); i++)
457 printk("%02x ", b[i]);
458 printk("\n");
459 sctp_send_shutdown(prim.ssp_assoc_id);
460 return;
461 }
462
463 new_con = nodeid2con(nodeid, GFP_KERNEL);
464 if (!new_con)
465 return;
466
467 /* Peel off a new sock */
468 parg.associd = sn->sn_assoc_change.sac_assoc_id;
469 ret = kernel_getsockopt(con->sock, IPPROTO_SCTP,
470 SCTP_SOCKOPT_PEELOFF,
471 (void *)&parg, &parglen);
472 if (ret) {
473 log_print("Can't peel off a socket for "
474 "connection %d to node %d: err=%d\n",
475 parg.associd, nodeid, ret);
476 }
477 file = fget(parg.sd);
478 new_con->sock = SOCKET_I(file->f_dentry->d_inode);
479 add_sock(new_con->sock, new_con);
480 fput(file);
481 put_unused_fd(parg.sd);
482
483 log_print("got new/restarted association %d nodeid %d",
484 (int)sn->sn_assoc_change.sac_assoc_id, nodeid);
485
486 /* Send any pending writes */
487 clear_bit(CF_CONNECT_PENDING, &new_con->flags);
488 clear_bit(CF_INIT_PENDING, &con->flags);
489 if (!test_and_set_bit(CF_WRITE_PENDING, &new_con->flags)) {
490 queue_work(send_workqueue, &new_con->swork);
491 }
492 if (!test_and_set_bit(CF_READ_PENDING, &new_con->flags))
493 queue_work(recv_workqueue, &new_con->rwork);
494 }
495 break;
496
497 case SCTP_COMM_LOST:
498 case SCTP_SHUTDOWN_COMP:
499 {
500 con = assoc2con(sn->sn_assoc_change.sac_assoc_id);
501 if (con) {
502 con->sctp_assoc = 0;
503 }
504 }
505 break;
506
 507 /* We don't know which INIT failed, so clear the PENDING flags
 508 * on them all. If the assoc_id is zero it will then try
 509 * again */
510
511 case SCTP_CANT_STR_ASSOC:
512 {
513 log_print("Can't start SCTP association - retrying");
514 sctp_init_failed();
515 }
516 break;
517
518 default:
519 log_print("unexpected SCTP assoc change id=%d state=%d",
520 (int)sn->sn_assoc_change.sac_assoc_id,
521 sn->sn_assoc_change.sac_state);
522 }
523 }
524}
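
The SCTP_SOCKOPT_PEELOFF getsockopt used above is what lksctp-tools wraps
as sctp_peeloff(); a one-call userspace sketch (illustrative, link with
-lsctp):

#include <netinet/sctp.h>

/* Branch one established association off a one-to-many socket into
 * its own fd, which then behaves like a connected TCP-style socket. */
static int peel_one(int one_to_many_sd, sctp_assoc_t assoc_id)
{
	return sctp_peeloff(one_to_many_sd, assoc_id);
}
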
525
267/* Data received from remote end */ 526/* Data received from remote end */
268static int receive_from_sock(struct connection *con) 527static int receive_from_sock(struct connection *con)
269{ 528{
@@ -274,6 +533,7 @@ static int receive_from_sock(struct connection *con)
274 int r; 533 int r;
275 int call_again_soon = 0; 534 int call_again_soon = 0;
276 int nvec; 535 int nvec;
536 char incmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
277 537
278 mutex_lock(&con->sock_mutex); 538 mutex_lock(&con->sock_mutex);
279 539
@@ -293,12 +553,18 @@ static int receive_from_sock(struct connection *con)
293 cbuf_init(&con->cb, PAGE_CACHE_SIZE); 553 cbuf_init(&con->cb, PAGE_CACHE_SIZE);
294 } 554 }
295 555
556 /* Only SCTP needs these really */
557 memset(&incmsg, 0, sizeof(incmsg));
558 msg.msg_control = incmsg;
559 msg.msg_controllen = sizeof(incmsg);
560
296 /* 561 /*
297 * iov[0] is the bit of the circular buffer between the current end 562 * iov[0] is the bit of the circular buffer between the current end
298 * point (cb.base + cb.len) and the end of the buffer. 563 * point (cb.base + cb.len) and the end of the buffer.
299 */ 564 */
300 iov[0].iov_len = con->cb.base - cbuf_data(&con->cb); 565 iov[0].iov_len = con->cb.base - cbuf_data(&con->cb);
301 iov[0].iov_base = page_address(con->rx_page) + cbuf_data(&con->cb); 566 iov[0].iov_base = page_address(con->rx_page) + cbuf_data(&con->cb);
567 iov[1].iov_len = 0;
302 nvec = 1; 568 nvec = 1;
303 569
304 /* 570 /*
@@ -315,11 +581,20 @@ static int receive_from_sock(struct connection *con)
315 581
316 r = ret = kernel_recvmsg(con->sock, &msg, iov, nvec, len, 582 r = ret = kernel_recvmsg(con->sock, &msg, iov, nvec, len,
317 MSG_DONTWAIT | MSG_NOSIGNAL); 583 MSG_DONTWAIT | MSG_NOSIGNAL);
318
319 if (ret <= 0) 584 if (ret <= 0)
320 goto out_close; 585 goto out_close;
321 if (ret == -EAGAIN) 586
322 goto out_resched; 587 /* Process SCTP notifications */
588 if (msg.msg_flags & MSG_NOTIFICATION) {
589 msg.msg_control = incmsg;
590 msg.msg_controllen = sizeof(incmsg);
591
592 process_sctp_notification(con, &msg,
593 page_address(con->rx_page) + con->cb.base);
594 mutex_unlock(&con->sock_mutex);
595 return 0;
596 }
597 BUG_ON(con->nodeid == 0);
323 598
324 if (ret == len) 599 if (ret == len)
325 call_again_soon = 1; 600 call_again_soon = 1;
@@ -329,10 +604,10 @@ static int receive_from_sock(struct connection *con)
329 con->cb.base, con->cb.len, 604 con->cb.base, con->cb.len,
330 PAGE_CACHE_SIZE); 605 PAGE_CACHE_SIZE);
331 if (ret == -EBADMSG) { 606 if (ret == -EBADMSG) {
332 printk(KERN_INFO "dlm: lowcomms: addr=%p, base=%u, len=%u, " 607 log_print("lowcomms: addr=%p, base=%u, len=%u, "
333 "iov_len=%u, iov_base[0]=%p, read=%d\n", 608 "iov_len=%u, iov_base[0]=%p, read=%d",
334 page_address(con->rx_page), con->cb.base, con->cb.len, 609 page_address(con->rx_page), con->cb.base, con->cb.len,
335 len, iov[0].iov_base, r); 610 len, iov[0].iov_base, r);
336 } 611 }
337 if (ret < 0) 612 if (ret < 0)
338 goto out_close; 613 goto out_close;
@@ -368,7 +643,7 @@ out_close:
368} 643}
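
A userspace-flavoured sketch of the receive discipline above (illustrative,
not from the patch): SCTP delivers association events in-band, so every
read must test MSG_NOTIFICATION before treating the bytes as payload.

#include <sys/socket.h>
#include <sys/uio.h>
#include <netinet/sctp.h>

static void drain(int sd)
{
	char buf[8192];
	struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
	struct msghdr msg = { .msg_iov = &iov, .msg_iovlen = 1 };
	ssize_t n;

	while ((n = recvmsg(sd, &msg, MSG_DONTWAIT)) > 0) {
		if (msg.msg_flags & MSG_NOTIFICATION) {
			const union sctp_notification *sn =
				(const union sctp_notification *)buf;
			/* sn->sn_header.sn_type identifies the event,
			 * e.g. SCTP_ASSOC_CHANGE as in the switch above */
			(void)sn;
			continue;
		}
		/* otherwise buf holds n bytes of ordinary payload */
	}
}
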
369 644
370/* Listening socket is busy, accept a connection */ 645/* Listening socket is busy, accept a connection */
371static int accept_from_sock(struct connection *con) 646static int tcp_accept_from_sock(struct connection *con)
372{ 647{
373 int result; 648 int result;
374 struct sockaddr_storage peeraddr; 649 struct sockaddr_storage peeraddr;
@@ -379,7 +654,7 @@ static int accept_from_sock(struct connection *con)
379 struct connection *addcon; 654 struct connection *addcon;
380 655
381 memset(&peeraddr, 0, sizeof(peeraddr)); 656 memset(&peeraddr, 0, sizeof(peeraddr));
382 result = sock_create_kern(dlm_local_addr.ss_family, SOCK_STREAM, 657 result = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_STREAM,
383 IPPROTO_TCP, &newsock); 658 IPPROTO_TCP, &newsock);
384 if (result < 0) 659 if (result < 0)
385 return -ENOMEM; 660 return -ENOMEM;
@@ -408,7 +683,7 @@ static int accept_from_sock(struct connection *con)
408 /* Get the new node's NODEID */ 683 /* Get the new node's NODEID */
409 make_sockaddr(&peeraddr, 0, &len); 684 make_sockaddr(&peeraddr, 0, &len);
410 if (dlm_addr_to_nodeid(&peeraddr, &nodeid)) { 685 if (dlm_addr_to_nodeid(&peeraddr, &nodeid)) {
411 printk("dlm: connect from non cluster node\n"); 686 log_print("connect from non cluster node");
412 sock_release(newsock); 687 sock_release(newsock);
413 mutex_unlock(&con->sock_mutex); 688 mutex_unlock(&con->sock_mutex);
414 return -1; 689 return -1;
@@ -419,7 +694,6 @@ static int accept_from_sock(struct connection *con)
419 /* Check to see if we already have a connection to this node. This 694 /* Check to see if we already have a connection to this node. This
420 * could happen if the two nodes initiate a connection at roughly 695 * could happen if the two nodes initiate a connection at roughly
421 * the same time and the connections cross on the wire. 696 * the same time and the connections cross on the wire.
422 * TEMPORARY FIX:
423 * In this case we store the incoming one in "othercon" 697 * In this case we store the incoming one in "othercon"
424 */ 698 */
425 newcon = nodeid2con(nodeid, GFP_KERNEL); 699 newcon = nodeid2con(nodeid, GFP_KERNEL);
@@ -434,7 +708,7 @@ static int accept_from_sock(struct connection *con)
434 if (!othercon) { 708 if (!othercon) {
435 othercon = kmem_cache_zalloc(con_cache, GFP_KERNEL); 709 othercon = kmem_cache_zalloc(con_cache, GFP_KERNEL);
436 if (!othercon) { 710 if (!othercon) {
437 printk("dlm: failed to allocate incoming socket\n"); 711 log_print("failed to allocate incoming socket");
438 mutex_unlock(&newcon->sock_mutex); 712 mutex_unlock(&newcon->sock_mutex);
439 result = -ENOMEM; 713 result = -ENOMEM;
440 goto accept_err; 714 goto accept_err;
@@ -477,12 +751,107 @@ accept_err:
477 sock_release(newsock); 751 sock_release(newsock);
478 752
479 if (result != -EAGAIN) 753 if (result != -EAGAIN)
480 printk("dlm: error accepting connection from node: %d\n", result); 754 log_print("error accepting connection from node: %d", result);
481 return result; 755 return result;
482} 756}
483 757
758static void free_entry(struct writequeue_entry *e)
759{
760 __free_page(e->page);
761 kfree(e);
762}
763
764/* Initiate an SCTP association.
765 This is a special case of send_to_sock() in that we don't yet have a
766 peeled-off socket for this association, so we use the listening socket
767 and add the primary IP address of the remote node.
768 */
769static void sctp_init_assoc(struct connection *con)
770{
771 struct sockaddr_storage rem_addr;
772 char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
773 struct msghdr outmessage;
774 struct cmsghdr *cmsg;
775 struct sctp_sndrcvinfo *sinfo;
776 struct connection *base_con;
777 struct writequeue_entry *e;
778 int len, offset;
779 int ret;
780 int addrlen;
781 struct kvec iov[1];
782
783 if (test_and_set_bit(CF_INIT_PENDING, &con->flags))
784 return;
785
786 if (con->retries++ > MAX_CONNECT_RETRIES)
787 return;
788
789 log_print("Initiating association with node %d", con->nodeid);
790
791 if (nodeid_to_addr(con->nodeid, (struct sockaddr *)&rem_addr)) {
792 log_print("no address for nodeid %d", con->nodeid);
793 return;
794 }
795 base_con = nodeid2con(0, 0);
796 BUG_ON(base_con == NULL);
797
798 make_sockaddr(&rem_addr, dlm_config.ci_tcp_port, &addrlen);
799
800 outmessage.msg_name = &rem_addr;
801 outmessage.msg_namelen = addrlen;
802 outmessage.msg_control = outcmsg;
803 outmessage.msg_controllen = sizeof(outcmsg);
804 outmessage.msg_flags = MSG_EOR;
805
806 spin_lock(&con->writequeue_lock);
807 e = list_entry(con->writequeue.next, struct writequeue_entry,
808 list);
809
810 BUG_ON((struct list_head *) e == &con->writequeue);
811
812 len = e->len;
813 offset = e->offset;
814 spin_unlock(&con->writequeue_lock);
815 kmap(e->page);
816
817 /* Send the first block off the write queue */
818 iov[0].iov_base = page_address(e->page)+offset;
819 iov[0].iov_len = len;
820
821 cmsg = CMSG_FIRSTHDR(&outmessage);
822 cmsg->cmsg_level = IPPROTO_SCTP;
823 cmsg->cmsg_type = SCTP_SNDRCV;
824 cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
825 sinfo = CMSG_DATA(cmsg);
826 memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
827 sinfo->sinfo_ppid = cpu_to_le32(dlm_our_nodeid());
828 outmessage.msg_controllen = cmsg->cmsg_len;
829
830 ret = kernel_sendmsg(base_con->sock, &outmessage, iov, 1, len);
831 if (ret < 0) {
832 log_print("Send first packet to node %d failed: %d",
833 con->nodeid, ret);
834
835 /* Try again later */
836 clear_bit(CF_CONNECT_PENDING, &con->flags);
837 clear_bit(CF_INIT_PENDING, &con->flags);
838 }
839 else {
840 spin_lock(&con->writequeue_lock);
841 e->offset += ret;
842 e->len -= ret;
843
844 if (e->len == 0 && e->users == 0) {
845 list_del(&e->list);
846 kunmap(e->page);
847 free_entry(e);
848 }
849 spin_unlock(&con->writequeue_lock);
850 }
851}
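
The point of sctp_init_assoc() is that on a one-to-many SCTP socket,
sending to a new peer address implicitly performs the INIT handshake; no
connect() is needed. A hedged userspace sketch of the same move through
lksctp's sctp_sendmsg() (names illustrative, link with -lsctp):

#include <stdint.h>
#include <sys/socket.h>
#include <netinet/sctp.h>

static int first_contact(int sd, struct sockaddr *peer, socklen_t peerlen,
			 const void *data, size_t len, uint32_t ppid)
{
	/* The first send both INITs the association and queues data;
	 * the patch similarly stamps ppid with the local nodeid. */
	return sctp_sendmsg(sd, data, len, peer, peerlen, ppid,
			    0 /* flags */, 0 /* stream */,
			    0 /* ttl */, 0 /* context */);
}
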
852
484/* Connect a new socket to its peer */ 853/* Connect a new socket to its peer */
485static void connect_to_sock(struct connection *con) 854static void tcp_connect_to_sock(struct connection *con)
486{ 855{
487 int result = -EHOSTUNREACH; 856 int result = -EHOSTUNREACH;
488 struct sockaddr_storage saddr; 857 struct sockaddr_storage saddr;
@@ -505,7 +874,7 @@ static void connect_to_sock(struct connection *con)
505 } 874 }
506 875
507 /* Create a socket to communicate with */ 876 /* Create a socket to communicate with */
508 result = sock_create_kern(dlm_local_addr.ss_family, SOCK_STREAM, 877 result = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_STREAM,
509 IPPROTO_TCP, &sock); 878 IPPROTO_TCP, &sock);
510 if (result < 0) 879 if (result < 0)
511 goto out_err; 880 goto out_err;
@@ -516,11 +885,11 @@ static void connect_to_sock(struct connection *con)
516 885
517 sock->sk->sk_user_data = con; 886 sock->sk->sk_user_data = con;
518 con->rx_action = receive_from_sock; 887 con->rx_action = receive_from_sock;
888 con->connect_action = tcp_connect_to_sock;
889 add_sock(sock, con);
519 890
520 make_sockaddr(&saddr, dlm_config.ci_tcp_port, &addr_len); 891 make_sockaddr(&saddr, dlm_config.ci_tcp_port, &addr_len);
521 892
522 add_sock(sock, con);
523
524 log_print("connecting to %d", con->nodeid); 893 log_print("connecting to %d", con->nodeid);
525 result = 894 result =
526 sock->ops->connect(sock, (struct sockaddr *)&saddr, addr_len, 895 sock->ops->connect(sock, (struct sockaddr *)&saddr, addr_len,
@@ -550,64 +919,57 @@ out:
550 return; 919 return;
551} 920}
552 921
553static struct socket *create_listen_sock(struct connection *con, 922static struct socket *tcp_create_listen_sock(struct connection *con,
554 struct sockaddr_storage *saddr) 923 struct sockaddr_storage *saddr)
555{ 924{
556 struct socket *sock = NULL; 925 struct socket *sock = NULL;
557 mm_segment_t fs;
558 int result = 0; 926 int result = 0;
559 int one = 1; 927 int one = 1;
560 int addr_len; 928 int addr_len;
561 929
562 if (dlm_local_addr.ss_family == AF_INET) 930 if (dlm_local_addr[0]->ss_family == AF_INET)
563 addr_len = sizeof(struct sockaddr_in); 931 addr_len = sizeof(struct sockaddr_in);
564 else 932 else
565 addr_len = sizeof(struct sockaddr_in6); 933 addr_len = sizeof(struct sockaddr_in6);
566 934
567 /* Create a socket to communicate with */ 935 /* Create a socket to communicate with */
568 result = sock_create_kern(dlm_local_addr.ss_family, SOCK_STREAM, IPPROTO_TCP, &sock); 936 result = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_STREAM,
937 IPPROTO_TCP, &sock);
569 if (result < 0) { 938 if (result < 0) {
570 printk("dlm: Can't create listening comms socket\n"); 939 log_print("Can't create listening comms socket");
571 goto create_out; 940 goto create_out;
572 } 941 }
573 942
574 fs = get_fs(); 943 result = kernel_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR,
575 set_fs(get_ds()); 944 (char *)&one, sizeof(one));
576 result = sock_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, 945
577 (char *)&one, sizeof(one));
578 set_fs(fs);
579 if (result < 0) { 946 if (result < 0) {
580 printk("dlm: Failed to set SO_REUSEADDR on socket: result=%d\n", 947 log_print("Failed to set SO_REUSEADDR on socket: %d", result);
581 result);
582 } 948 }
583 sock->sk->sk_user_data = con; 949 sock->sk->sk_user_data = con;
584 con->rx_action = accept_from_sock; 950 con->rx_action = tcp_accept_from_sock;
951 con->connect_action = tcp_connect_to_sock;
585 con->sock = sock; 952 con->sock = sock;
586 953
587 /* Bind to our port */ 954 /* Bind to our port */
588 make_sockaddr(saddr, dlm_config.ci_tcp_port, &addr_len); 955 make_sockaddr(saddr, dlm_config.ci_tcp_port, &addr_len);
589 result = sock->ops->bind(sock, (struct sockaddr *) saddr, addr_len); 956 result = sock->ops->bind(sock, (struct sockaddr *) saddr, addr_len);
590 if (result < 0) { 957 if (result < 0) {
591 printk("dlm: Can't bind to port %d\n", dlm_config.ci_tcp_port); 958 log_print("Can't bind to port %d", dlm_config.ci_tcp_port);
592 sock_release(sock); 959 sock_release(sock);
593 sock = NULL; 960 sock = NULL;
594 con->sock = NULL; 961 con->sock = NULL;
595 goto create_out; 962 goto create_out;
596 } 963 }
597 964 result = kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE,
598 fs = get_fs();
599 set_fs(get_ds());
600
601 result = sock_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE,
602 (char *)&one, sizeof(one)); 965 (char *)&one, sizeof(one));
603 set_fs(fs);
604 if (result < 0) { 966 if (result < 0) {
605 printk("dlm: Set keepalive failed: %d\n", result); 967 log_print("Set keepalive failed: %d", result);
606 } 968 }
607 969
608 result = sock->ops->listen(sock, 5); 970 result = sock->ops->listen(sock, 5);
609 if (result < 0) { 971 if (result < 0) {
610 printk("dlm: Can't listen on port %d\n", dlm_config.ci_tcp_port); 972 log_print("Can't listen on port %d", dlm_config.ci_tcp_port);
611 sock_release(sock); 973 sock_release(sock);
612 sock = NULL; 974 sock = NULL;
613 goto create_out; 975 goto create_out;
@@ -617,18 +979,146 @@ create_out:
617 return sock; 979 return sock;
618} 980}
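
The deleted set_fs()/sock_setsockopt() pairs and their kernel_setsockopt()
replacement, reduced to a side-by-side sketch (illustrative, using the
era's in-kernel APIs):

#include <linux/net.h>
#include <net/sock.h>
#include <asm/uaccess.h>

/* Old way: widen the address-limit check so a kernel pointer survives
 * the user-copy inside sock_setsockopt(). */
static int keepalive_old(struct socket *sock)
{
	int one = 1;
	int ret;
	mm_segment_t fs = get_fs();

	set_fs(get_ds());
	ret = sock_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE,
			      (char *)&one, sizeof(one));
	set_fs(fs);
	return ret;
}

/* New way: kernel_setsockopt() hides the set_fs() dance. */
static int keepalive_new(struct socket *sock)
{
	int one = 1;

	return kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE,
				 (char *)&one, sizeof(one));
}
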
619 981
982/* Get local addresses */
983static void init_local(void)
984{
985 struct sockaddr_storage sas, *addr;
986 int i;
987
988 dlm_local_count = 0;
989 for (i = 0; i < DLM_MAX_ADDR_COUNT - 1; i++) {
990 if (dlm_our_addr(&sas, i))
991 break;
992
993 addr = kmalloc(sizeof(*addr), GFP_KERNEL);
994 if (!addr)
995 break;
996 memcpy(addr, &sas, sizeof(*addr));
997 dlm_local_addr[dlm_local_count++] = addr;
998 }
999}
1000
 1001/* Bind to an IP address. SCTP allows multiple addresses so it can do
1002 multi-homing */
1003static int add_sctp_bind_addr(struct connection *sctp_con,
1004 struct sockaddr_storage *addr,
1005 int addr_len, int num)
1006{
1007 int result = 0;
1008
1009 if (num == 1)
1010 result = kernel_bind(sctp_con->sock,
1011 (struct sockaddr *) addr,
1012 addr_len);
1013 else
1014 result = kernel_setsockopt(sctp_con->sock, SOL_SCTP,
1015 SCTP_SOCKOPT_BINDX_ADD,
1016 (char *)addr, addr_len);
1017
1018 if (result < 0)
1019 log_print("Can't bind to port %d addr number %d",
1020 dlm_config.ci_tcp_port, num);
1021
1022 return result;
1023}
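
Userspace sketch of the same bind-then-extend pattern via lksctp's
sctp_bindx() (illustrative, link with -lsctp):

#include <sys/socket.h>
#include <netinet/sctp.h>

/* Bind the first local address normally, then attach the rest so the
 * association can fail over between interfaces. */
static int bind_all(int sd, struct sockaddr *addrs[], socklen_t lens[],
		    int count)
{
	int i;

	if (bind(sd, addrs[0], lens[0]) < 0)
		return -1;
	for (i = 1; i < count; i++)
		if (sctp_bindx(sd, addrs[i], 1, SCTP_BINDX_ADD_ADDR) < 0)
			return -1;
	return 0;
}
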
620 1024
621/* Listen on all interfaces */ 1025/* Initialise SCTP socket and bind to all interfaces */
622static int listen_for_all(void) 1026static int sctp_listen_for_all(void)
1027{
1028 struct socket *sock = NULL;
1029 struct sockaddr_storage localaddr;
1030 struct sctp_event_subscribe subscribe;
1031 int result = -EINVAL, num = 1, i, addr_len;
1032 struct connection *con = nodeid2con(0, GFP_KERNEL);
1033 int bufsize = NEEDED_RMEM;
1034
1035 if (!con)
1036 return -ENOMEM;
1037
1038 log_print("Using SCTP for communications");
1039
1040 result = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_SEQPACKET,
1041 IPPROTO_SCTP, &sock);
1042 if (result < 0) {
1043 log_print("Can't create comms socket, check SCTP is loaded");
1044 goto out;
1045 }
1046
1047 /* Listen for events */
1048 memset(&subscribe, 0, sizeof(subscribe));
1049 subscribe.sctp_data_io_event = 1;
1050 subscribe.sctp_association_event = 1;
1051 subscribe.sctp_send_failure_event = 1;
1052 subscribe.sctp_shutdown_event = 1;
1053 subscribe.sctp_partial_delivery_event = 1;
1054
1055 result = kernel_setsockopt(sock, SOL_SOCKET, SO_RCVBUF,
1056 (char *)&bufsize, sizeof(bufsize));
1057 if (result)
1058 log_print("Error increasing buffer space on socket %d", result);
1059
1060 result = kernel_setsockopt(sock, SOL_SCTP, SCTP_EVENTS,
1061 (char *)&subscribe, sizeof(subscribe));
1062 if (result < 0) {
1063 log_print("Failed to set SCTP_EVENTS on socket: result=%d",
1064 result);
1065 goto create_delsock;
1066 }
1067
1068 /* Init con struct */
1069 sock->sk->sk_user_data = con;
1070 con->sock = sock;
1071 con->sock->sk->sk_data_ready = lowcomms_data_ready;
1072 con->rx_action = receive_from_sock;
1073 con->connect_action = sctp_init_assoc;
1074
1075 /* Bind to all interfaces. */
1076 for (i = 0; i < dlm_local_count; i++) {
1077 memcpy(&localaddr, dlm_local_addr[i], sizeof(localaddr));
1078 make_sockaddr(&localaddr, dlm_config.ci_tcp_port, &addr_len);
1079
1080 result = add_sctp_bind_addr(con, &localaddr, addr_len, num);
1081 if (result)
1082 goto create_delsock;
1083 ++num;
1084 }
1085
1086 result = sock->ops->listen(sock, 5);
1087 if (result < 0) {
1088 log_print("Can't set socket listening");
1089 goto create_delsock;
1090 }
1091
1092 return 0;
1093
1094create_delsock:
1095 sock_release(sock);
1096 con->sock = NULL;
1097out:
1098 return result;
1099}
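
The subscription block above uses the same SCTP_EVENTS socket option that
userspace does; a minimal sketch (illustrative):

#include <string.h>
#include <sys/socket.h>
#include <netinet/sctp.h>

static int subscribe_events(int sd)
{
	struct sctp_event_subscribe ev;

	memset(&ev, 0, sizeof(ev));
	ev.sctp_data_io_event = 1;	/* sndrcvinfo cmsg with each read */
	ev.sctp_association_event = 1;	/* COMM_UP / COMM_LOST / RESTART */
	ev.sctp_send_failure_event = 1;
	ev.sctp_shutdown_event = 1;
	ev.sctp_partial_delivery_event = 1;

	return setsockopt(sd, IPPROTO_SCTP, SCTP_EVENTS, &ev, sizeof(ev));
}
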
1100
1101static int tcp_listen_for_all(void)
623{ 1102{
624 struct socket *sock = NULL; 1103 struct socket *sock = NULL;
625 struct connection *con = nodeid2con(0, GFP_KERNEL); 1104 struct connection *con = nodeid2con(0, GFP_KERNEL);
626 int result = -EINVAL; 1105 int result = -EINVAL;
627 1106
1107 if (!con)
1108 return -ENOMEM;
1109
628 /* We don't support multi-homed hosts */ 1110 /* We don't support multi-homed hosts */
1111 if (dlm_local_addr[1] != NULL) {
1112 log_print("TCP protocol can't handle multi-homed hosts, "
1113 "try SCTP");
1114 return -EINVAL;
1115 }
1116
1117 log_print("Using TCP for communications");
1118
629 set_bit(CF_IS_OTHERCON, &con->flags); 1119 set_bit(CF_IS_OTHERCON, &con->flags);
630 1120
631 sock = create_listen_sock(con, &dlm_local_addr); 1121 sock = tcp_create_listen_sock(con, dlm_local_addr[0]);
632 if (sock) { 1122 if (sock) {
633 add_sock(sock, con); 1123 add_sock(sock, con);
634 result = 0; 1124 result = 0;
@@ -666,8 +1156,7 @@ static struct writequeue_entry *new_writequeue_entry(struct connection *con,
666 return entry; 1156 return entry;
667} 1157}
668 1158
669void *dlm_lowcomms_get_buffer(int nodeid, int len, 1159void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc)
670 gfp_t allocation, char **ppc)
671{ 1160{
672 struct connection *con; 1161 struct connection *con;
673 struct writequeue_entry *e; 1162 struct writequeue_entry *e;
@@ -735,12 +1224,6 @@ out:
735 return; 1224 return;
736} 1225}
737 1226
738static void free_entry(struct writequeue_entry *e)
739{
740 __free_page(e->page);
741 kfree(e);
742}
743
744/* Send a message */ 1227/* Send a message */
745static void send_to_sock(struct connection *con) 1228static void send_to_sock(struct connection *con)
746{ 1229{
@@ -777,8 +1260,7 @@ static void send_to_sock(struct connection *con)
777 goto out; 1260 goto out;
778 if (ret <= 0) 1261 if (ret <= 0)
779 goto send_error; 1262 goto send_error;
780 } 1263 } else {
781 else {
782 /* Don't starve people filling buffers */ 1264 /* Don't starve people filling buffers */
783 cond_resched(); 1265 cond_resched();
784 } 1266 }
@@ -807,7 +1289,8 @@ send_error:
807 1289
808out_connect: 1290out_connect:
809 mutex_unlock(&con->sock_mutex); 1291 mutex_unlock(&con->sock_mutex);
810 connect_to_sock(con); 1292 if (!test_bit(CF_INIT_PENDING, &con->flags))
1293 lowcomms_connect_sock(con);
811 return; 1294 return;
812} 1295}
813 1296
@@ -832,9 +1315,6 @@ int dlm_lowcomms_close(int nodeid)
832{ 1315{
833 struct connection *con; 1316 struct connection *con;
834 1317
835 if (!connections)
836 goto out;
837
838 log_print("closing connection to node %d", nodeid); 1318 log_print("closing connection to node %d", nodeid);
839 con = nodeid2con(nodeid, 0); 1319 con = nodeid2con(nodeid, 0);
840 if (con) { 1320 if (con) {
@@ -842,12 +1322,9 @@ int dlm_lowcomms_close(int nodeid)
842 close_connection(con, true); 1322 close_connection(con, true);
843 } 1323 }
844 return 0; 1324 return 0;
845
846out:
847 return -1;
848} 1325}
849 1326
850/* Look for activity on active sockets */ 1327/* Receive workqueue function */
851static void process_recv_sockets(struct work_struct *work) 1328static void process_recv_sockets(struct work_struct *work)
852{ 1329{
853 struct connection *con = container_of(work, struct connection, rwork); 1330 struct connection *con = container_of(work, struct connection, rwork);
@@ -859,15 +1336,14 @@ static void process_recv_sockets(struct work_struct *work)
859 } while (!err); 1336 } while (!err);
860} 1337}
861 1338
862 1339/* Send workqueue function */
863static void process_send_sockets(struct work_struct *work) 1340static void process_send_sockets(struct work_struct *work)
864{ 1341{
865 struct connection *con = container_of(work, struct connection, swork); 1342 struct connection *con = container_of(work, struct connection, swork);
866 1343
867 if (test_and_clear_bit(CF_CONNECT_PENDING, &con->flags)) { 1344 if (test_and_clear_bit(CF_CONNECT_PENDING, &con->flags)) {
868 connect_to_sock(con); 1345 con->connect_action(con);
869 } 1346 }
870
871 clear_bit(CF_WRITE_PENDING, &con->flags); 1347 clear_bit(CF_WRITE_PENDING, &con->flags);
872 send_to_sock(con); 1348 send_to_sock(con);
873} 1349}
@@ -878,8 +1354,8 @@ static void clean_writequeues(void)
878{ 1354{
879 int nodeid; 1355 int nodeid;
880 1356
881 for (nodeid = 1; nodeid < conn_array_size; nodeid++) { 1357 for (nodeid = 1; nodeid <= max_nodeid; nodeid++) {
882 struct connection *con = nodeid2con(nodeid, 0); 1358 struct connection *con = __nodeid2con(nodeid, 0);
883 1359
884 if (con) 1360 if (con)
885 clean_one_writequeue(con); 1361 clean_one_writequeue(con);
@@ -916,64 +1392,67 @@ static int work_start(void)
916void dlm_lowcomms_stop(void) 1392void dlm_lowcomms_stop(void)
917{ 1393{
918 int i; 1394 int i;
1395 struct connection *con;
919 1396
920 /* Set all the flags to prevent any 1397 /* Set all the flags to prevent any
921 socket activity. 1398 socket activity.
922 */ 1399 */
923 for (i = 0; i < conn_array_size; i++) { 1400 down(&connections_lock);
924 if (connections[i]) 1401 for (i = 0; i <= max_nodeid; i++) {
925 connections[i]->flags |= 0xFF; 1402 con = __nodeid2con(i, 0);
1403 if (con)
1404 con->flags |= 0xFF;
926 } 1405 }
1406 up(&connections_lock);
927 1407
928 work_stop(); 1408 work_stop();
1409
1410 down(&connections_lock);
929 clean_writequeues(); 1411 clean_writequeues();
930 1412
931 for (i = 0; i < conn_array_size; i++) { 1413 for (i = 0; i <= max_nodeid; i++) {
932 if (connections[i]) { 1414 con = __nodeid2con(i, 0);
933 close_connection(connections[i], true); 1415 if (con) {
934 if (connections[i]->othercon) 1416 close_connection(con, true);
935 kmem_cache_free(con_cache, connections[i]->othercon); 1417 if (con->othercon)
936 kmem_cache_free(con_cache, connections[i]); 1418 kmem_cache_free(con_cache, con->othercon);
1419 kmem_cache_free(con_cache, con);
937 } 1420 }
938 } 1421 }
939 1422 max_nodeid = 0;
940 kfree(connections); 1423 up(&connections_lock);
941 connections = NULL;
942
943 kmem_cache_destroy(con_cache); 1424 kmem_cache_destroy(con_cache);
1425 idr_init(&connections_idr);
944} 1426}
945 1427
946/* This is quite likely to sleep... */
947int dlm_lowcomms_start(void) 1428int dlm_lowcomms_start(void)
948{ 1429{
949 int error = 0; 1430 int error = -EINVAL;
950 1431 struct connection *con;
951 error = -ENOMEM;
952 connections = kzalloc(sizeof(struct connection *) *
953 NODE_INCREMENT, GFP_KERNEL);
954 if (!connections)
955 goto out;
956
957 conn_array_size = NODE_INCREMENT;
958 1432
959 if (dlm_our_addr(&dlm_local_addr, 0)) { 1433 init_local();
1434 if (!dlm_local_count) {
1435 error = -ENOTCONN;
960 log_print("no local IP address has been set"); 1436 log_print("no local IP address has been set");
961 goto fail_free_conn; 1437 goto out;
962 }
963 if (!dlm_our_addr(&dlm_local_addr, 1)) {
964 log_print("This dlm comms module does not support multi-homed clustering");
965 goto fail_free_conn;
966 } 1438 }
967 1439
1440 error = -ENOMEM;
968 con_cache = kmem_cache_create("dlm_conn", sizeof(struct connection), 1441 con_cache = kmem_cache_create("dlm_conn", sizeof(struct connection),
969 __alignof__(struct connection), 0, 1442 __alignof__(struct connection), 0,
970 NULL, NULL); 1443 NULL, NULL);
971 if (!con_cache) 1444 if (!con_cache)
972 goto fail_free_conn; 1445 goto out;
973 1446
1447 /* Set some sysctl minima */
1448 if (sysctl_rmem_max < NEEDED_RMEM)
1449 sysctl_rmem_max = NEEDED_RMEM;
974 1450
975 /* Start listening */ 1451 /* Start listening */
976 error = listen_for_all(); 1452 if (dlm_config.ci_protocol == 0)
1453 error = tcp_listen_for_all();
1454 else
1455 error = sctp_listen_for_all();
977 if (error) 1456 if (error)
978 goto fail_unlisten; 1457 goto fail_unlisten;
979 1458
@@ -984,24 +1463,13 @@ int dlm_lowcomms_start(void)
984 return 0; 1463 return 0;
985 1464
986fail_unlisten: 1465fail_unlisten:
 987 close_connection(connections[0], false); 1466 con = nodeid2con(0, 0);
988 kmem_cache_free(con_cache, connections[0]); 1467 if (con) {
1468 close_connection(con, false);
1469 kmem_cache_free(con_cache, con);
1470 }
989 kmem_cache_destroy(con_cache); 1471 kmem_cache_destroy(con_cache);
990 1472
991fail_free_conn:
992 kfree(connections);
993
994out: 1473out:
995 return error; 1474 return error;
996} 1475}
997
998/*
999 * Overrides for Emacs so that we follow Linus's tabbing style.
1000 * Emacs will notice this stuff at the end of the file and automatically
1001 * adjust the settings for this buffer only. This must remain at the end
1002 * of the file.
1003 * ---------------------------------------------------------------------------
1004 * Local variables:
1005 * c-file-style: "linux"
1006 * End:
1007 */
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index 3870150b83a4..b0201ec325a7 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (C) 2006 Red Hat, Inc. All rights reserved. 2 * Copyright (C) 2006-2007 Red Hat, Inc. All rights reserved.
3 * 3 *
4 * This copyrighted material is made available to anyone wishing to use, 4 * This copyrighted material is made available to anyone wishing to use,
5 * modify, copy, or redistribute it subject to the terms and conditions 5 * modify, copy, or redistribute it subject to the terms and conditions
@@ -56,6 +56,7 @@ struct dlm_write_request32 {
56 union { 56 union {
57 struct dlm_lock_params32 lock; 57 struct dlm_lock_params32 lock;
58 struct dlm_lspace_params lspace; 58 struct dlm_lspace_params lspace;
59 struct dlm_purge_params purge;
59 } i; 60 } i;
60}; 61};
61 62
@@ -92,6 +93,9 @@ static void compat_input(struct dlm_write_request *kb,
92 kb->i.lspace.flags = kb32->i.lspace.flags; 93 kb->i.lspace.flags = kb32->i.lspace.flags;
93 kb->i.lspace.minor = kb32->i.lspace.minor; 94 kb->i.lspace.minor = kb32->i.lspace.minor;
94 strcpy(kb->i.lspace.name, kb32->i.lspace.name); 95 strcpy(kb->i.lspace.name, kb32->i.lspace.name);
96 } else if (kb->cmd == DLM_USER_PURGE) {
97 kb->i.purge.nodeid = kb32->i.purge.nodeid;
98 kb->i.purge.pid = kb32->i.purge.pid;
95 } else { 99 } else {
96 kb->i.lock.mode = kb32->i.lock.mode; 100 kb->i.lock.mode = kb32->i.lock.mode;
97 kb->i.lock.namelen = kb32->i.lock.namelen; 101 kb->i.lock.namelen = kb32->i.lock.namelen;
@@ -111,8 +115,6 @@ static void compat_input(struct dlm_write_request *kb,
111static void compat_output(struct dlm_lock_result *res, 115static void compat_output(struct dlm_lock_result *res,
112 struct dlm_lock_result32 *res32) 116 struct dlm_lock_result32 *res32)
113{ 117{
114 res32->length = res->length - (sizeof(struct dlm_lock_result) -
115 sizeof(struct dlm_lock_result32));
116 res32->user_astaddr = (__u32)(long)res->user_astaddr; 118 res32->user_astaddr = (__u32)(long)res->user_astaddr;
117 res32->user_astparam = (__u32)(long)res->user_astparam; 119 res32->user_astparam = (__u32)(long)res->user_astparam;
118 res32->user_lksb = (__u32)(long)res->user_lksb; 120 res32->user_lksb = (__u32)(long)res->user_lksb;
@@ -128,35 +130,30 @@ static void compat_output(struct dlm_lock_result *res,
128} 130}
129#endif 131#endif
130 132
 133/* We could possibly check if the cancel of an orphan has resulted in the lkb
134 being removed and then remove that lkb from the orphans list and free it */
131 135
132void dlm_user_add_ast(struct dlm_lkb *lkb, int type) 136void dlm_user_add_ast(struct dlm_lkb *lkb, int type)
133{ 137{
134 struct dlm_ls *ls; 138 struct dlm_ls *ls;
135 struct dlm_user_args *ua; 139 struct dlm_user_args *ua;
136 struct dlm_user_proc *proc; 140 struct dlm_user_proc *proc;
137 int remove_ownqueue = 0; 141 int eol = 0, ast_type;
138 142
139 /* dlm_clear_proc_locks() sets ORPHAN/DEAD flag on each 143 if (lkb->lkb_flags & (DLM_IFL_ORPHAN | DLM_IFL_DEAD))
140 lkb before dealing with it. We need to check this
141 flag before taking ls_clear_proc_locks mutex because if
142 it's set, dlm_clear_proc_locks() holds the mutex. */
143
144 if (lkb->lkb_flags & (DLM_IFL_ORPHAN | DLM_IFL_DEAD)) {
145 /* log_print("user_add_ast skip1 %x", lkb->lkb_flags); */
146 return; 144 return;
147 }
148 145
149 ls = lkb->lkb_resource->res_ls; 146 ls = lkb->lkb_resource->res_ls;
150 mutex_lock(&ls->ls_clear_proc_locks); 147 mutex_lock(&ls->ls_clear_proc_locks);
151 148
152 /* If ORPHAN/DEAD flag is set, it means the process is dead so an ast 149 /* If ORPHAN/DEAD flag is set, it means the process is dead so an ast
153 can't be delivered. For ORPHAN's, dlm_clear_proc_locks() freed 150 can't be delivered. For ORPHAN's, dlm_clear_proc_locks() freed
154 lkb->ua so we can't try to use it. */ 151 lkb->ua so we can't try to use it. This second check is necessary
152 for cases where a completion ast is received for an operation that
153 began before clear_proc_locks did its cancel/unlock. */
155 154
156 if (lkb->lkb_flags & (DLM_IFL_ORPHAN | DLM_IFL_DEAD)) { 155 if (lkb->lkb_flags & (DLM_IFL_ORPHAN | DLM_IFL_DEAD))
157 /* log_print("user_add_ast skip2 %x", lkb->lkb_flags); */
158 goto out; 156 goto out;
159 }
160 157
161 DLM_ASSERT(lkb->lkb_astparam, dlm_print_lkb(lkb);); 158 DLM_ASSERT(lkb->lkb_astparam, dlm_print_lkb(lkb););
162 ua = (struct dlm_user_args *)lkb->lkb_astparam; 159 ua = (struct dlm_user_args *)lkb->lkb_astparam;
@@ -166,28 +163,42 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, int type)
166 goto out; 163 goto out;
167 164
168 spin_lock(&proc->asts_spin); 165 spin_lock(&proc->asts_spin);
169 if (!(lkb->lkb_ast_type & (AST_COMP | AST_BAST))) { 166
167 ast_type = lkb->lkb_ast_type;
168 lkb->lkb_ast_type |= type;
169
170 if (!ast_type) {
170 kref_get(&lkb->lkb_ref); 171 kref_get(&lkb->lkb_ref);
171 list_add_tail(&lkb->lkb_astqueue, &proc->asts); 172 list_add_tail(&lkb->lkb_astqueue, &proc->asts);
172 lkb->lkb_ast_type |= type;
173 wake_up_interruptible(&proc->wait); 173 wake_up_interruptible(&proc->wait);
174 } 174 }
175 175 if (type == AST_COMP && (ast_type & AST_COMP))
176 /* noqueue requests that fail may need to be removed from the 176 log_debug(ls, "ast overlap %x status %x %x",
177 proc's locks list, there should be a better way of detecting 177 lkb->lkb_id, ua->lksb.sb_status, lkb->lkb_flags);
178 this situation than checking all these things... */ 178
179 179 /* Figure out if this lock is at the end of its life and no longer
180 if (type == AST_COMP && lkb->lkb_grmode == DLM_LOCK_IV && 180 available for the application to use. The lkb still exists until
181 ua->lksb.sb_status == -EAGAIN && !list_empty(&lkb->lkb_ownqueue)) 181 the final ast is read. A lock becomes EOL in three situations:
182 remove_ownqueue = 1; 182 1. a noqueue request fails with EAGAIN
183 183 2. an unlock completes with EUNLOCK
184 /* unlocks or cancels of waiting requests need to be removed from the 184 3. a cancel of a waiting request completes with ECANCEL
185 proc's unlocking list, again there must be a better way... */ 185 An EOL lock needs to be removed from the process's list of locks.
186 186 And we can't allow any new operation on an EOL lock. This is
187 if (ua->lksb.sb_status == -DLM_EUNLOCK || 187 not related to the lifetime of the lkb struct which is managed
188 entirely by refcount. */
189
190 if (type == AST_COMP &&
191 lkb->lkb_grmode == DLM_LOCK_IV &&
192 ua->lksb.sb_status == -EAGAIN)
193 eol = 1;
194 else if (ua->lksb.sb_status == -DLM_EUNLOCK ||
188 (ua->lksb.sb_status == -DLM_ECANCEL && 195 (ua->lksb.sb_status == -DLM_ECANCEL &&
189 lkb->lkb_grmode == DLM_LOCK_IV)) 196 lkb->lkb_grmode == DLM_LOCK_IV))
190 remove_ownqueue = 1; 197 eol = 1;
198 if (eol) {
199 lkb->lkb_ast_type &= ~AST_BAST;
200 lkb->lkb_flags |= DLM_IFL_ENDOFLIFE;
201 }
191 202
192 /* We want to copy the lvb to userspace when the completion 203 /* We want to copy the lvb to userspace when the completion
193 ast is read if the status is 0, the lock has an lvb and 204 ast is read if the status is 0, the lock has an lvb and
@@ -204,11 +215,13 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, int type)
204 215
205 spin_unlock(&proc->asts_spin); 216 spin_unlock(&proc->asts_spin);
206 217
207 if (remove_ownqueue) { 218 if (eol) {
208 spin_lock(&ua->proc->locks_spin); 219 spin_lock(&ua->proc->locks_spin);
209 list_del_init(&lkb->lkb_ownqueue); 220 if (!list_empty(&lkb->lkb_ownqueue)) {
221 list_del_init(&lkb->lkb_ownqueue);
222 dlm_put_lkb(lkb);
223 }
210 spin_unlock(&ua->proc->locks_spin); 224 spin_unlock(&ua->proc->locks_spin);
211 dlm_put_lkb(lkb);
212 } 225 }
213 out: 226 out:
214 mutex_unlock(&ls->ls_clear_proc_locks); 227 mutex_unlock(&ls->ls_clear_proc_locks);
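
A condensed restatement (illustrative helper, not a function the patch
adds) of the three end-of-life cases the new comment enumerates; AST_COMP
is private to fs/dlm, so this only compiles there:

#include <linux/dlm.h>	/* DLM_LOCK_IV, DLM_EUNLOCK, DLM_ECANCEL */

static int lkb_is_eol(int ast_type, int grmode, int sb_status)
{
	/* 1. a noqueue request failed with EAGAIN */
	if (ast_type == AST_COMP && grmode == DLM_LOCK_IV &&
	    sb_status == -EAGAIN)
		return 1;
	/* 2. an unlock completed */
	if (sb_status == -DLM_EUNLOCK)
		return 1;
	/* 3. a cancel of a waiting request completed */
	if (sb_status == -DLM_ECANCEL && grmode == DLM_LOCK_IV)
		return 1;
	return 0;
}
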
@@ -286,47 +299,71 @@ static int device_user_unlock(struct dlm_user_proc *proc,
286 return error; 299 return error;
287} 300}
288 301
289static int device_create_lockspace(struct dlm_lspace_params *params) 302static int create_misc_device(struct dlm_ls *ls, char *name)
290{ 303{
291 dlm_lockspace_t *lockspace;
292 struct dlm_ls *ls;
293 int error, len; 304 int error, len;
294 305
295 if (!capable(CAP_SYS_ADMIN))
296 return -EPERM;
297
298 error = dlm_new_lockspace(params->name, strlen(params->name),
299 &lockspace, 0, DLM_USER_LVB_LEN);
300 if (error)
301 return error;
302
303 ls = dlm_find_lockspace_local(lockspace);
304 if (!ls)
305 return -ENOENT;
306
307 error = -ENOMEM; 306 error = -ENOMEM;
308 len = strlen(params->name) + strlen(name_prefix) + 2; 307 len = strlen(name) + strlen(name_prefix) + 2;
309 ls->ls_device.name = kzalloc(len, GFP_KERNEL); 308 ls->ls_device.name = kzalloc(len, GFP_KERNEL);
310 if (!ls->ls_device.name) 309 if (!ls->ls_device.name)
311 goto fail; 310 goto fail;
311
312 snprintf((char *)ls->ls_device.name, len, "%s_%s", name_prefix, 312 snprintf((char *)ls->ls_device.name, len, "%s_%s", name_prefix,
313 params->name); 313 name);
314 ls->ls_device.fops = &device_fops; 314 ls->ls_device.fops = &device_fops;
315 ls->ls_device.minor = MISC_DYNAMIC_MINOR; 315 ls->ls_device.minor = MISC_DYNAMIC_MINOR;
316 316
317 error = misc_register(&ls->ls_device); 317 error = misc_register(&ls->ls_device);
318 if (error) { 318 if (error) {
319 kfree(ls->ls_device.name); 319 kfree(ls->ls_device.name);
320 goto fail;
321 } 320 }
321fail:
322 return error;
323}
324
325static int device_user_purge(struct dlm_user_proc *proc,
326 struct dlm_purge_params *params)
327{
328 struct dlm_ls *ls;
329 int error;
330
331 ls = dlm_find_lockspace_local(proc->lockspace);
332 if (!ls)
333 return -ENOENT;
334
335 error = dlm_user_purge(ls, proc, params->nodeid, params->pid);
322 336
323 error = ls->ls_device.minor;
324 dlm_put_lockspace(ls); 337 dlm_put_lockspace(ls);
325 return error; 338 return error;
339}
340
341static int device_create_lockspace(struct dlm_lspace_params *params)
342{
343 dlm_lockspace_t *lockspace;
344 struct dlm_ls *ls;
345 int error;
326 346
327 fail: 347 if (!capable(CAP_SYS_ADMIN))
348 return -EPERM;
349
350 error = dlm_new_lockspace(params->name, strlen(params->name),
351 &lockspace, 0, DLM_USER_LVB_LEN);
352 if (error)
353 return error;
354
355 ls = dlm_find_lockspace_local(lockspace);
356 if (!ls)
357 return -ENOENT;
358
359 error = create_misc_device(ls, params->name);
328 dlm_put_lockspace(ls); 360 dlm_put_lockspace(ls);
329 dlm_release_lockspace(lockspace, 0); 361
362 if (error)
363 dlm_release_lockspace(lockspace, 0);
364 else
365 error = ls->ls_device.minor;
366
330 return error; 367 return error;
331} 368}
332 369
@@ -343,6 +380,10 @@ static int device_remove_lockspace(struct dlm_lspace_params *params)
343 if (!ls) 380 if (!ls)
344 return -ENOENT; 381 return -ENOENT;
345 382
383 /* Deregister the misc device first, so we don't have
384 * a device that's not attached to a lockspace. If
385 * dlm_release_lockspace fails then we can recreate it
386 */
346 error = misc_deregister(&ls->ls_device); 387 error = misc_deregister(&ls->ls_device);
347 if (error) { 388 if (error) {
348 dlm_put_lockspace(ls); 389 dlm_put_lockspace(ls);
@@ -361,6 +402,8 @@ static int device_remove_lockspace(struct dlm_lspace_params *params)
361 402
362 dlm_put_lockspace(ls); 403 dlm_put_lockspace(ls);
363 error = dlm_release_lockspace(lockspace, force); 404 error = dlm_release_lockspace(lockspace, force);
405 if (error)
406 create_misc_device(ls, ls->ls_name);
364 out: 407 out:
365 return error; 408 return error;
366} 409}
@@ -497,6 +540,14 @@ static ssize_t device_write(struct file *file, const char __user *buf,
497 error = device_remove_lockspace(&kbuf->i.lspace); 540 error = device_remove_lockspace(&kbuf->i.lspace);
498 break; 541 break;
499 542
543 case DLM_USER_PURGE:
544 if (!proc) {
545 log_print("no locking on control device");
546 goto out_sig;
547 }
548 error = device_user_purge(proc, &kbuf->i.purge);
549 break;
550
500 default: 551 default:
501 log_print("Unknown command passed to DLM device : %d\n", 552 log_print("Unknown command passed to DLM device : %d\n",
502 kbuf->cmd); 553 kbuf->cmd);
diff --git a/fs/dquot.c b/fs/dquot.c
index b16f991662c1..0a5febc159f2 100644
--- a/fs/dquot.c
+++ b/fs/dquot.c
@@ -1432,7 +1432,7 @@ int vfs_quota_off(struct super_block *sb, int type)
1432 mutex_unlock(&dqopt->dqonoff_mutex); 1432 mutex_unlock(&dqopt->dqonoff_mutex);
1433 } 1433 }
1434 if (sb->s_bdev) 1434 if (sb->s_bdev)
1435 invalidate_bdev(sb->s_bdev, 0); 1435 invalidate_bdev(sb->s_bdev);
1436 return 0; 1436 return 0;
1437} 1437}
1438 1438
@@ -1468,7 +1468,7 @@ static int vfs_quota_on_inode(struct inode *inode, int type, int format_id)
1468 * we see all the changes from userspace... */ 1468 * we see all the changes from userspace... */
1469 write_inode_now(inode, 1); 1469 write_inode_now(inode, 1);
1470 /* And now flush the block cache so that kernel sees the changes */ 1470 /* And now flush the block cache so that kernel sees the changes */
1471 invalidate_bdev(sb->s_bdev, 0); 1471 invalidate_bdev(sb->s_bdev);
1472 mutex_lock(&inode->i_mutex); 1472 mutex_lock(&inode->i_mutex);
1473 mutex_lock(&dqopt->dqonoff_mutex); 1473 mutex_lock(&dqopt->dqonoff_mutex);
1474 if (sb_has_quota_enabled(sb, type)) { 1474 if (sb_has_quota_enabled(sb, type)) {
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index fc4a3a224641..8cbf3f69ebe5 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -583,8 +583,7 @@ inode_info_init_once(void *vptr, struct kmem_cache *cachep, unsigned long flags)
583{ 583{
584 struct ecryptfs_inode_info *ei = (struct ecryptfs_inode_info *)vptr; 584 struct ecryptfs_inode_info *ei = (struct ecryptfs_inode_info *)vptr;
585 585
586 if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) == 586 if (flags & SLAB_CTOR_CONSTRUCTOR)
587 SLAB_CTOR_CONSTRUCTOR)
588 inode_init_once(&ei->vfs_inode); 587 inode_init_once(&ei->vfs_inode);
589} 588}
590 589
@@ -793,7 +792,7 @@ static int do_sysfs_registration(void)
793 "Unable to register ecryptfs sysfs subsystem\n"); 792 "Unable to register ecryptfs sysfs subsystem\n");
794 goto out; 793 goto out;
795 } 794 }
796 rc = sysfs_create_file(&ecryptfs_subsys.kset.kobj, 795 rc = sysfs_create_file(&ecryptfs_subsys.kobj,
797 &sysfs_attr_version.attr); 796 &sysfs_attr_version.attr);
798 if (rc) { 797 if (rc) {
799 printk(KERN_ERR 798 printk(KERN_ERR
@@ -801,12 +800,12 @@ static int do_sysfs_registration(void)
801 subsystem_unregister(&ecryptfs_subsys); 800 subsystem_unregister(&ecryptfs_subsys);
802 goto out; 801 goto out;
803 } 802 }
804 rc = sysfs_create_file(&ecryptfs_subsys.kset.kobj, 803 rc = sysfs_create_file(&ecryptfs_subsys.kobj,
805 &sysfs_attr_version_str.attr); 804 &sysfs_attr_version_str.attr);
806 if (rc) { 805 if (rc) {
807 printk(KERN_ERR 806 printk(KERN_ERR
808 "Unable to create ecryptfs version_str attribute\n"); 807 "Unable to create ecryptfs version_str attribute\n");
809 sysfs_remove_file(&ecryptfs_subsys.kset.kobj, 808 sysfs_remove_file(&ecryptfs_subsys.kobj,
810 &sysfs_attr_version.attr); 809 &sysfs_attr_version.attr);
811 subsystem_unregister(&ecryptfs_subsys); 810 subsystem_unregister(&ecryptfs_subsys);
812 goto out; 811 goto out;
@@ -841,7 +840,7 @@ static int __init ecryptfs_init(void)
841 ecryptfs_free_kmem_caches(); 840 ecryptfs_free_kmem_caches();
842 goto out; 841 goto out;
843 } 842 }
844 kset_set_kset_s(&ecryptfs_subsys, fs_subsys); 843 kobj_set_kset_s(&ecryptfs_subsys, fs_subsys);
845 sysfs_attr_version.attr.owner = THIS_MODULE; 844 sysfs_attr_version.attr.owner = THIS_MODULE;
846 sysfs_attr_version_str.attr.owner = THIS_MODULE; 845 sysfs_attr_version_str.attr.owner = THIS_MODULE;
847 rc = do_sysfs_registration(); 846 rc = do_sysfs_registration();
@@ -862,9 +861,9 @@ out:
862 861
863static void __exit ecryptfs_exit(void) 862static void __exit ecryptfs_exit(void)
864{ 863{
865 sysfs_remove_file(&ecryptfs_subsys.kset.kobj, 864 sysfs_remove_file(&ecryptfs_subsys.kobj,
866 &sysfs_attr_version.attr); 865 &sysfs_attr_version.attr);
867 sysfs_remove_file(&ecryptfs_subsys.kset.kobj, 866 sysfs_remove_file(&ecryptfs_subsys.kobj,
868 &sysfs_attr_version_str.attr); 867 &sysfs_attr_version_str.attr);
869 subsystem_unregister(&ecryptfs_subsys); 868 subsystem_unregister(&ecryptfs_subsys);
870 ecryptfs_release_messaging(ecryptfs_transport); 869 ecryptfs_release_messaging(ecryptfs_transport);
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index b731b09499cb..0770c4b66f53 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -46,7 +46,6 @@ struct kmem_cache *ecryptfs_lower_page_cache;
46 */ 46 */
47static struct page *ecryptfs_get1page(struct file *file, int index) 47static struct page *ecryptfs_get1page(struct file *file, int index)
48{ 48{
49 struct page *page;
50 struct dentry *dentry; 49 struct dentry *dentry;
51 struct inode *inode; 50 struct inode *inode;
52 struct address_space *mapping; 51 struct address_space *mapping;
@@ -54,14 +53,7 @@ static struct page *ecryptfs_get1page(struct file *file, int index)
54 dentry = file->f_path.dentry; 53 dentry = file->f_path.dentry;
55 inode = dentry->d_inode; 54 inode = dentry->d_inode;
56 mapping = inode->i_mapping; 55 mapping = inode->i_mapping;
57 page = read_cache_page(mapping, index, 56 return read_mapping_page(mapping, index, (void *)file);
58 (filler_t *)mapping->a_ops->readpage,
59 (void *)file);
60 if (IS_ERR(page))
61 goto out;
62 wait_on_page_locked(page);
63out:
64 return page;
65} 57}
66 58
67static 59static
@@ -233,7 +225,6 @@ int ecryptfs_do_readpage(struct file *file, struct page *page,
233 ecryptfs_printk(KERN_ERR, "Error reading from page cache\n"); 225 ecryptfs_printk(KERN_ERR, "Error reading from page cache\n");
234 goto out; 226 goto out;
235 } 227 }
236 wait_on_page_locked(lower_page);
237 page_data = kmap_atomic(page, KM_USER0); 228 page_data = kmap_atomic(page, KM_USER0);
238 lower_page_data = kmap_atomic(lower_page, KM_USER1); 229 lower_page_data = kmap_atomic(lower_page, KM_USER1);
239 memcpy(page_data, lower_page_data, PAGE_CACHE_SIZE); 230 memcpy(page_data, lower_page_data, PAGE_CACHE_SIZE);
diff --git a/fs/ecryptfs/netlink.c b/fs/ecryptfs/netlink.c
index e3aa2253c850..fe9186312d7c 100644
--- a/fs/ecryptfs/netlink.c
+++ b/fs/ecryptfs/netlink.c
@@ -97,7 +97,7 @@ out:
97 */ 97 */
98static int ecryptfs_process_nl_response(struct sk_buff *skb) 98static int ecryptfs_process_nl_response(struct sk_buff *skb)
99{ 99{
100 struct nlmsghdr *nlh = (struct nlmsghdr*)skb->data; 100 struct nlmsghdr *nlh = nlmsg_hdr(skb);
101 struct ecryptfs_message *msg = NLMSG_DATA(nlh); 101 struct ecryptfs_message *msg = NLMSG_DATA(nlh);
102 int rc; 102 int rc;
103 103
@@ -181,7 +181,7 @@ receive:
181 "rc = [%d]\n", rc); 181 "rc = [%d]\n", rc);
182 return; 182 return;
183 } 183 }
184 nlh = (struct nlmsghdr *)skb->data; 184 nlh = nlmsg_hdr(skb);
185 if (!NLMSG_OK(nlh, skb->len)) { 185 if (!NLMSG_OK(nlh, skb->len)) {
186 ecryptfs_printk(KERN_ERR, "Received corrupt netlink " 186 ecryptfs_printk(KERN_ERR, "Received corrupt netlink "
187 "message\n"); 187 "message\n");
@@ -229,7 +229,7 @@ int ecryptfs_init_netlink(void)
229 229
230 ecryptfs_nl_sock = netlink_kernel_create(NETLINK_ECRYPTFS, 0, 230 ecryptfs_nl_sock = netlink_kernel_create(NETLINK_ECRYPTFS, 0,
231 ecryptfs_receive_nl_message, 231 ecryptfs_receive_nl_message,
232 THIS_MODULE); 232 NULL, THIS_MODULE);
233 if (!ecryptfs_nl_sock) { 233 if (!ecryptfs_nl_sock) {
234 rc = -EIO; 234 rc = -EIO;
235 ecryptfs_printk(KERN_ERR, "Failed to create netlink socket\n"); 235 ecryptfs_printk(KERN_ERR, "Failed to create netlink socket\n");
diff --git a/fs/efs/super.c b/fs/efs/super.c
index c2235e46edcd..ba7a8b9da0c1 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -72,8 +72,7 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag
72{ 72{
73 struct efs_inode_info *ei = (struct efs_inode_info *) foo; 73 struct efs_inode_info *ei = (struct efs_inode_info *) foo;
74 74
75 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == 75 if (flags & SLAB_CTOR_CONSTRUCTOR)
76 SLAB_CTOR_CONSTRUCTOR)
77 inode_init_once(&ei->vfs_inode); 76 inode_init_once(&ei->vfs_inode);
78} 77}
79 78
diff --git a/fs/exec.c b/fs/exec.c
index 7e36c6f6f538..3155e915307a 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1244,13 +1244,17 @@ EXPORT_SYMBOL(set_binfmt);
1244 * name into corename, which must have space for at least 1244 * name into corename, which must have space for at least
1245 * CORENAME_MAX_SIZE bytes plus one byte for the zero terminator. 1245 * CORENAME_MAX_SIZE bytes plus one byte for the zero terminator.
1246 */ 1246 */
1247static void format_corename(char *corename, const char *pattern, long signr) 1247static int format_corename(char *corename, const char *pattern, long signr)
1248{ 1248{
1249 const char *pat_ptr = pattern; 1249 const char *pat_ptr = pattern;
1250 char *out_ptr = corename; 1250 char *out_ptr = corename;
1251 char *const out_end = corename + CORENAME_MAX_SIZE; 1251 char *const out_end = corename + CORENAME_MAX_SIZE;
1252 int rc; 1252 int rc;
1253 int pid_in_pattern = 0; 1253 int pid_in_pattern = 0;
1254 int ispipe = 0;
1255
1256 if (*pattern == '|')
1257 ispipe = 1;
1254 1258
1255 /* Repeat as long as we have more pattern to process and more output 1259 /* Repeat as long as we have more pattern to process and more output
1256 space */ 1260 space */
@@ -1341,8 +1345,8 @@ static void format_corename(char *corename, const char *pattern, long signr)
1341 * 1345 *
1342 * If core_pattern does not include a %p (as is the default) 1346 * If core_pattern does not include a %p (as is the default)
1343 * and core_uses_pid is set, then .%pid will be appended to 1347 * and core_uses_pid is set, then .%pid will be appended to
1344 * the filename */ 1348 * the filename. Do not do this for piped commands. */
1345 if (!pid_in_pattern 1349 if (!ispipe && !pid_in_pattern
1346 && (core_uses_pid || atomic_read(&current->mm->mm_users) != 1)) { 1350 && (core_uses_pid || atomic_read(&current->mm->mm_users) != 1)) {
1347 rc = snprintf(out_ptr, out_end - out_ptr, 1351 rc = snprintf(out_ptr, out_end - out_ptr,
1348 ".%d", current->tgid); 1352 ".%d", current->tgid);
@@ -1350,8 +1354,9 @@ static void format_corename(char *corename, const char *pattern, long signr)
1350 goto out; 1354 goto out;
1351 out_ptr += rc; 1355 out_ptr += rc;
1352 } 1356 }
1353 out: 1357out:
1354 *out_ptr = 0; 1358 *out_ptr = 0;
1359 return ispipe;
1355} 1360}
1356 1361
1357static void zap_process(struct task_struct *start) 1362static void zap_process(struct task_struct *start)
@@ -1502,16 +1507,15 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs)
1502 * uses lock_kernel() 1507 * uses lock_kernel()
1503 */ 1508 */
1504 lock_kernel(); 1509 lock_kernel();
1505 format_corename(corename, core_pattern, signr); 1510 ispipe = format_corename(corename, core_pattern, signr);
1506 unlock_kernel(); 1511 unlock_kernel();
1507 if (corename[0] == '|') { 1512 if (ispipe) {
1508 /* SIGPIPE can happen, but it's just never processed */ 1513 /* SIGPIPE can happen, but it's just never processed */
1509 if(call_usermodehelper_pipe(corename+1, NULL, NULL, &file)) { 1514 if(call_usermodehelper_pipe(corename+1, NULL, NULL, &file)) {
1510 printk(KERN_INFO "Core dump to %s pipe failed\n", 1515 printk(KERN_INFO "Core dump to %s pipe failed\n",
1511 corename); 1516 corename);
1512 goto fail_unlock; 1517 goto fail_unlock;
1513 } 1518 }
1514 ispipe = 1;
1515 } else 1519 } else
1516 file = filp_open(corename, 1520 file = filp_open(corename,
1517 O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag, 1521 O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag,
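[Editor's note] format_corename() now reports directly whether core_pattern names a pipe instead of do_coredump() re-inspecting corename[0], and the ".<pid>" suffix is suppressed for piped commands. A userspace sketch of the two pattern styles being distinguished; the paths and helper name are assumptions, and writing core_pattern requires root.

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/kernel/core_pattern", "w");
	if (!f) {
		perror("core_pattern");
		return 1;
	}
	/* Plain file pattern: %p expands to the PID; when no %p appears
	 * and core_uses_pid is set, ".<pid>" may still be appended. */
	fprintf(f, "/tmp/core.%%e.%%p\n");

	/* Piped pattern: a leading '|' makes format_corename() return
	 * ispipe != 0, the rest is run as a usermode helper, and no
	 * ".<pid>" suffix is ever appended. (Hypothetical helper path.) */
	/* fprintf(f, "|/usr/local/bin/core-helper %%p\n"); */
	fclose(f);
	return 0;
}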
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index e89bfc8cf957..1d1e7e30d70e 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -161,10 +161,7 @@ static struct page * ext2_get_page(struct inode *dir, unsigned long n)
161 struct address_space *mapping = dir->i_mapping; 161 struct address_space *mapping = dir->i_mapping;
162 struct page *page = read_mapping_page(mapping, n, NULL); 162 struct page *page = read_mapping_page(mapping, n, NULL);
163 if (!IS_ERR(page)) { 163 if (!IS_ERR(page)) {
164 wait_on_page_locked(page);
165 kmap(page); 164 kmap(page);
166 if (!PageUptodate(page))
167 goto fail;
168 if (!PageChecked(page)) 165 if (!PageChecked(page))
169 ext2_check_page(page); 166 ext2_check_page(page);
170 if (PageError(page)) 167 if (PageError(page))
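[Editor's note] The deleted wait_on_page_locked()/PageUptodate() pair (here, and in the identical vxfs hunk below) follows from read_mapping_page() now returning either an up-to-date page or an ERR_PTR, so callers no longer lock-wait and re-check by hand. A sketch of the resulting caller shape; my_get_page() is a hypothetical stand-in for ext2_get_page().

#include <linux/pagemap.h>

static struct page *my_get_page(struct inode *dir, unsigned long n)
{
	struct page *page = read_mapping_page(dir->i_mapping, n, NULL);

	if (!IS_ERR(page)) {
		kmap(page);		/* already uptodate on return */
		if (PageError(page)) {	/* an earlier I/O error may still
					 * be flagged on the page */
			kunmap(page);
			page_cache_release(page);
			return ERR_PTR(-EIO);
		}
	}
	return page;
}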
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index a046a419d8af..685a1c287177 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -160,8 +160,7 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag
160{ 160{
161 struct ext2_inode_info *ei = (struct ext2_inode_info *) foo; 161 struct ext2_inode_info *ei = (struct ext2_inode_info *) foo;
162 162
163 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == 163 if (flags & SLAB_CTOR_CONSTRUCTOR) {
164 SLAB_CTOR_CONSTRUCTOR) {
165 rwlock_init(&ei->i_meta_lock); 164 rwlock_init(&ei->i_meta_lock);
166#ifdef CONFIG_EXT2_FS_XATTR 165#ifdef CONFIG_EXT2_FS_XATTR
167 init_rwsem(&ei->xattr_sem); 166 init_rwsem(&ei->xattr_sem);
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 4a4fcd6868c7..54d3c9041259 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -420,7 +420,7 @@ static void ext3_put_super (struct super_block * sb)
420 dump_orphan_list(sb, sbi); 420 dump_orphan_list(sb, sbi);
421 J_ASSERT(list_empty(&sbi->s_orphan)); 421 J_ASSERT(list_empty(&sbi->s_orphan));
422 422
423 invalidate_bdev(sb->s_bdev, 0); 423 invalidate_bdev(sb->s_bdev);
424 if (sbi->journal_bdev && sbi->journal_bdev != sb->s_bdev) { 424 if (sbi->journal_bdev && sbi->journal_bdev != sb->s_bdev) {
425 /* 425 /*
426 * Invalidate the journal device's buffers. We don't want them 426 * Invalidate the journal device's buffers. We don't want them
@@ -428,7 +428,7 @@ static void ext3_put_super (struct super_block * sb)
428 * hotswapped, and it breaks the `ro-after' testing code. 428 * hotswapped, and it breaks the `ro-after' testing code.
429 */ 429 */
430 sync_blockdev(sbi->journal_bdev); 430 sync_blockdev(sbi->journal_bdev);
431 invalidate_bdev(sbi->journal_bdev, 0); 431 invalidate_bdev(sbi->journal_bdev);
432 ext3_blkdev_remove(sbi); 432 ext3_blkdev_remove(sbi);
433 } 433 }
434 sb->s_fs_info = NULL; 434 sb->s_fs_info = NULL;
@@ -466,8 +466,7 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag
466{ 466{
467 struct ext3_inode_info *ei = (struct ext3_inode_info *) foo; 467 struct ext3_inode_info *ei = (struct ext3_inode_info *) foo;
468 468
469 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == 469 if (flags & SLAB_CTOR_CONSTRUCTOR) {
470 SLAB_CTOR_CONSTRUCTOR) {
471 INIT_LIST_HEAD(&ei->i_orphan); 470 INIT_LIST_HEAD(&ei->i_orphan);
472#ifdef CONFIG_EXT3_FS_XATTR 471#ifdef CONFIG_EXT3_FS_XATTR
473 init_rwsem(&ei->xattr_sem); 472 init_rwsem(&ei->xattr_sem);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 61c4718e4a53..719126932354 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -470,7 +470,7 @@ static void ext4_put_super (struct super_block * sb)
470 dump_orphan_list(sb, sbi); 470 dump_orphan_list(sb, sbi);
471 J_ASSERT(list_empty(&sbi->s_orphan)); 471 J_ASSERT(list_empty(&sbi->s_orphan));
472 472
473 invalidate_bdev(sb->s_bdev, 0); 473 invalidate_bdev(sb->s_bdev);
474 if (sbi->journal_bdev && sbi->journal_bdev != sb->s_bdev) { 474 if (sbi->journal_bdev && sbi->journal_bdev != sb->s_bdev) {
475 /* 475 /*
476 * Invalidate the journal device's buffers. We don't want them 476 * Invalidate the journal device's buffers. We don't want them
@@ -478,7 +478,7 @@ static void ext4_put_super (struct super_block * sb)
478 * hotswapped, and it breaks the `ro-after' testing code. 478 * hotswapped, and it breaks the `ro-after' testing code.
479 */ 479 */
480 sync_blockdev(sbi->journal_bdev); 480 sync_blockdev(sbi->journal_bdev);
481 invalidate_bdev(sbi->journal_bdev, 0); 481 invalidate_bdev(sbi->journal_bdev);
482 ext4_blkdev_remove(sbi); 482 ext4_blkdev_remove(sbi);
483 } 483 }
484 sb->s_fs_info = NULL; 484 sb->s_fs_info = NULL;
@@ -517,8 +517,7 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag
517{ 517{
518 struct ext4_inode_info *ei = (struct ext4_inode_info *) foo; 518 struct ext4_inode_info *ei = (struct ext4_inode_info *) foo;
519 519
520 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == 520 if (flags & SLAB_CTOR_CONSTRUCTOR) {
521 SLAB_CTOR_CONSTRUCTOR) {
522 INIT_LIST_HEAD(&ei->i_orphan); 521 INIT_LIST_HEAD(&ei->i_orphan);
523#ifdef CONFIG_EXT4DEV_FS_XATTR 522#ifdef CONFIG_EXT4DEV_FS_XATTR
524 init_rwsem(&ei->xattr_sem); 523 init_rwsem(&ei->xattr_sem);
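[Editor's note] invalidate_bdev() loses its destroy_dirty_buffers argument in this series (callers here always passed 0), so both journal-teardown paths simply drop the trailing flag. Sketched against the updated prototype; my_drop_cached_blocks() is a hypothetical caller.

#include <linux/buffer_head.h>

/* Before: invalidate_bdev(bdev, 0);  the flag is gone from the API now. */
static void my_drop_cached_blocks(struct block_device *bdev)
{
	sync_blockdev(bdev);	/* flush dirty buffers first */
	invalidate_bdev(bdev);	/* then toss the clean cached ones */
}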
diff --git a/fs/fat/cache.c b/fs/fat/cache.c
index 05c2941c74f2..1959143c1d27 100644
--- a/fs/fat/cache.c
+++ b/fs/fat/cache.c
@@ -40,8 +40,7 @@ static void init_once(void *foo, struct kmem_cache *cachep, unsigned long flags)
40{ 40{
41 struct fat_cache *cache = (struct fat_cache *)foo; 41 struct fat_cache *cache = (struct fat_cache *)foo;
42 42
43 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == 43 if (flags & SLAB_CTOR_CONSTRUCTOR)
44 SLAB_CTOR_CONSTRUCTOR)
45 INIT_LIST_HEAD(&cache->cache_list); 44 INIT_LIST_HEAD(&cache->cache_list);
46} 45}
47 46
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 9bfe607c892e..65cb54bde481 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -499,8 +499,7 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag
499{ 499{
500 struct msdos_inode_info *ei = (struct msdos_inode_info *)foo; 500 struct msdos_inode_info *ei = (struct msdos_inode_info *)foo;
501 501
502 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == 502 if (flags & SLAB_CTOR_CONSTRUCTOR) {
503 SLAB_CTOR_CONSTRUCTOR) {
504 spin_lock_init(&ei->cache_lru_lock); 503 spin_lock_init(&ei->cache_lru_lock);
505 ei->nr_caches = 0; 504 ei->nr_caches = 0;
506 ei->cache_valid_id = FAT_CACHE_VALID + 1; 505 ei->cache_valid_id = FAT_CACHE_VALID + 1;
diff --git a/fs/freevxfs/vxfs_subr.c b/fs/freevxfs/vxfs_subr.c
index decac62efe57..ed8f0b0dd880 100644
--- a/fs/freevxfs/vxfs_subr.c
+++ b/fs/freevxfs/vxfs_subr.c
@@ -74,10 +74,7 @@ vxfs_get_page(struct address_space *mapping, u_long n)
74 pp = read_mapping_page(mapping, n, NULL); 74 pp = read_mapping_page(mapping, n, NULL);
75 75
76 if (!IS_ERR(pp)) { 76 if (!IS_ERR(pp)) {
77 wait_on_page_locked(pp);
78 kmap(pp); 77 kmap(pp);
79 if (!PageUptodate(pp))
80 goto fail;
81 /** if (!PageChecked(pp)) **/ 78 /** if (!PageChecked(pp)) **/
82 /** vxfs_check_page(pp); **/ 79 /** vxfs_check_page(pp); **/
83 if (PageError(pp)) 80 if (PageError(pp))
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 608db81219a0..d8003be56e05 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -685,8 +685,7 @@ static void fuse_inode_init_once(void *foo, struct kmem_cache *cachep,
685{ 685{
686 struct inode * inode = foo; 686 struct inode * inode = foo;
687 687
688 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == 688 if (flags & SLAB_CTOR_CONSTRUCTOR)
689 SLAB_CTOR_CONSTRUCTOR)
690 inode_init_once(inode); 689 inode_init_once(inode);
691} 690}
692 691
@@ -731,12 +730,12 @@ static int fuse_sysfs_init(void)
731{ 730{
732 int err; 731 int err;
733 732
734 kset_set_kset_s(&fuse_subsys, fs_subsys); 733 kobj_set_kset_s(&fuse_subsys, fs_subsys);
735 err = subsystem_register(&fuse_subsys); 734 err = subsystem_register(&fuse_subsys);
736 if (err) 735 if (err)
737 goto out_err; 736 goto out_err;
738 737
739 kset_set_kset_s(&connections_subsys, fuse_subsys); 738 kobj_set_kset_s(&connections_subsys, fuse_subsys);
740 err = subsystem_register(&connections_subsys); 739 err = subsystem_register(&connections_subsys);
741 if (err) 740 if (err)
742 goto out_fuse_unregister; 741 goto out_fuse_unregister;
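[Editor's note] kset_set_kset_s() gives way to kobj_set_kset_s(), which parents an object by pointing its embedded kobject's kset at the named subsystem before registration (the lock_dlm sysfs hunk further down makes the same change). A hedged sketch, assuming fs_subsys and the 2.6.21-era kobject macros are in scope; the "demo" subsystem is made up.

#include <linux/kobject.h>

static decl_subsys(demo, NULL, NULL);	/* declares struct kset demo_subsys */

static int __init demo_sysfs_init(void)
{
	/* Parent demo_subsys under fs_subsys (i.e. /sys/fs/demo): the
	 * macro sets demo_subsys.kobj.kset = &fs_subsys. */
	kobj_set_kset_s(&demo_subsys, fs_subsys);
	return subsystem_register(&demo_subsys);
}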
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 82a1ac7895a2..a96fa07b3f3b 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -1262,9 +1262,10 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque,
1262 u64 leaf_no) 1262 u64 leaf_no)
1263{ 1263{
1264 struct gfs2_inode *ip = GFS2_I(inode); 1264 struct gfs2_inode *ip = GFS2_I(inode);
1265 struct gfs2_sbd *sdp = GFS2_SB(inode);
1265 struct buffer_head *bh; 1266 struct buffer_head *bh;
1266 struct gfs2_leaf *lf; 1267 struct gfs2_leaf *lf;
1267 unsigned entries = 0; 1268 unsigned entries = 0, entries2 = 0;
1268 unsigned leaves = 0; 1269 unsigned leaves = 0;
1269 const struct gfs2_dirent **darr, *dent; 1270 const struct gfs2_dirent **darr, *dent;
1270 struct dirent_gather g; 1271 struct dirent_gather g;
@@ -1290,7 +1291,13 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque,
1290 return 0; 1291 return 0;
1291 1292
1292 error = -ENOMEM; 1293 error = -ENOMEM;
1293 larr = vmalloc((leaves + entries) * sizeof(void *)); 1294 /*
1295 * The extra 99 entries are not normally used, but are a buffer
1296 * zone in case the number of entries in the leaf is corrupt.
1297 * 99 is the maximum number of entries that can fit in a single
1298 * leaf block.
1299 */
1300 larr = vmalloc((leaves + entries + 99) * sizeof(void *));
1294 if (!larr) 1301 if (!larr)
1295 goto out; 1302 goto out;
1296 darr = (const struct gfs2_dirent **)(larr + leaves); 1303 darr = (const struct gfs2_dirent **)(larr + leaves);
@@ -1305,10 +1312,20 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque,
1305 lf = (struct gfs2_leaf *)bh->b_data; 1312 lf = (struct gfs2_leaf *)bh->b_data;
1306 lfn = be64_to_cpu(lf->lf_next); 1313 lfn = be64_to_cpu(lf->lf_next);
1307 if (lf->lf_entries) { 1314 if (lf->lf_entries) {
1315 entries2 += be16_to_cpu(lf->lf_entries);
1308 dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size, 1316 dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size,
1309 gfs2_dirent_gather, NULL, &g); 1317 gfs2_dirent_gather, NULL, &g);
1310 error = PTR_ERR(dent); 1318 error = PTR_ERR(dent);
1311 if (IS_ERR(dent)) { 1319 if (IS_ERR(dent))
1320 goto out_kfree;
1321 if (entries2 != g.offset) {
1322 fs_warn(sdp, "Number of entries corrupt in dir "
1323 "leaf %llu, entries2 (%u) != "
1324 "g.offset (%u)\n",
1325 (unsigned long long)bh->b_blocknr,
1326 entries2, g.offset);
1327
1328 error = -EIO;
1312 goto out_kfree; 1329 goto out_kfree;
1313 } 1330 }
1314 error = 0; 1331 error = 0;
@@ -1318,6 +1335,7 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque,
1318 } 1335 }
1319 } while(lfn); 1336 } while(lfn);
1320 1337
1338 BUG_ON(entries2 != entries);
1321 error = do_filldir_main(ip, offset, opaque, filldir, darr, 1339 error = do_filldir_main(ip, offset, opaque, filldir, darr,
1322 entries, copied); 1340 entries, copied);
1323out_kfree: 1341out_kfree:
@@ -1401,6 +1419,7 @@ int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
1401 filldir_t filldir) 1419 filldir_t filldir)
1402{ 1420{
1403 struct gfs2_inode *dip = GFS2_I(inode); 1421 struct gfs2_inode *dip = GFS2_I(inode);
1422 struct gfs2_sbd *sdp = GFS2_SB(inode);
1404 struct dirent_gather g; 1423 struct dirent_gather g;
1405 const struct gfs2_dirent **darr, *dent; 1424 const struct gfs2_dirent **darr, *dent;
1406 struct buffer_head *dibh; 1425 struct buffer_head *dibh;
@@ -1423,8 +1442,8 @@ int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
1423 return error; 1442 return error;
1424 1443
1425 error = -ENOMEM; 1444 error = -ENOMEM;
1426 darr = kmalloc(dip->i_di.di_entries * sizeof(struct gfs2_dirent *), 1445 /* 96 is max number of dirents which can be stuffed into an inode */
1427 GFP_KERNEL); 1446 darr = kmalloc(96 * sizeof(struct gfs2_dirent *), GFP_KERNEL);
1428 if (darr) { 1447 if (darr) {
1429 g.pdent = darr; 1448 g.pdent = darr;
1430 g.offset = 0; 1449 g.offset = 0;
@@ -1434,6 +1453,15 @@ int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
1434 error = PTR_ERR(dent); 1453 error = PTR_ERR(dent);
1435 goto out; 1454 goto out;
1436 } 1455 }
1456 if (dip->i_di.di_entries != g.offset) {
1457 fs_warn(sdp, "Number of entries corrupt in dir %llu, "
1458 "ip->i_di.di_entries (%u) != g.offset (%u)\n",
1459 (unsigned long long)dip->i_num.no_addr,
1460 dip->i_di.di_entries,
1461 g.offset);
1462 error = -EIO;
1463 goto out;
1464 }
1437 error = do_filldir_main(dip, offset, opaque, filldir, darr, 1465 error = do_filldir_main(dip, offset, opaque, filldir, darr,
1438 dip->i_di.di_entries, &copied); 1466 dip->i_di.di_entries, &copied);
1439out: 1467out:
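[Editor's note] Both new checks in this file share one defensive shape: count what the on-disk metadata claims, count what gfs2_dirent_scan() actually gathered, and return -EIO with a warning rather than trust a corrupt directory (the added BUG_ON() then asserts the two leaf-walk counters agree). A condensed sketch of that shape; check_entry_count() is a hypothetical helper, not a function in this patch.

/* "expected" comes from on-disk metadata, "gathered" from walking the
 * dirents ourselves; any mismatch means the directory is corrupt. */
static int check_entry_count(struct gfs2_sbd *sdp, u64 blkno,
			     unsigned int expected, unsigned int gathered)
{
	if (expected == gathered)
		return 0;
	fs_warn(sdp, "Number of entries corrupt in dir %llu, "
		"expected (%u) != gathered (%u)\n",
		(unsigned long long)blkno, expected, gathered);
	return -EIO;	/* refuse to use the corrupt directory */
}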
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 12accb08fe02..1815429a2978 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -23,6 +23,10 @@
23#include <linux/module.h> 23#include <linux/module.h>
24#include <linux/rwsem.h> 24#include <linux/rwsem.h>
25#include <asm/uaccess.h> 25#include <asm/uaccess.h>
26#include <linux/seq_file.h>
27#include <linux/debugfs.h>
28#include <linux/module.h>
29#include <linux/kallsyms.h>
26 30
27#include "gfs2.h" 31#include "gfs2.h"
28#include "incore.h" 32#include "incore.h"
@@ -40,20 +44,30 @@ struct gfs2_gl_hash_bucket {
40 struct hlist_head hb_list; 44 struct hlist_head hb_list;
41}; 45};
42 46
47struct glock_iter {
48 int hash; /* hash bucket index */
49 struct gfs2_sbd *sdp; /* incore superblock */
50 struct gfs2_glock *gl; /* current glock struct */
51 struct hlist_head *hb_list; /* current hash bucket ptr */
52 struct seq_file *seq; /* sequence file for debugfs */
53 char string[512]; /* scratch space */
54};
55
43typedef void (*glock_examiner) (struct gfs2_glock * gl); 56typedef void (*glock_examiner) (struct gfs2_glock * gl);
44 57
45static int gfs2_dump_lockstate(struct gfs2_sbd *sdp); 58static int gfs2_dump_lockstate(struct gfs2_sbd *sdp);
46static int dump_glock(struct gfs2_glock *gl); 59static int dump_glock(struct glock_iter *gi, struct gfs2_glock *gl);
47static int dump_inode(struct gfs2_inode *ip); 60static void gfs2_glock_xmote_th(struct gfs2_glock *gl, struct gfs2_holder *gh);
48static void gfs2_glock_xmote_th(struct gfs2_holder *gh);
49static void gfs2_glock_drop_th(struct gfs2_glock *gl); 61static void gfs2_glock_drop_th(struct gfs2_glock *gl);
50static DECLARE_RWSEM(gfs2_umount_flush_sem); 62static DECLARE_RWSEM(gfs2_umount_flush_sem);
63static struct dentry *gfs2_root;
51 64
52#define GFS2_GL_HASH_SHIFT 15 65#define GFS2_GL_HASH_SHIFT 15
53#define GFS2_GL_HASH_SIZE (1 << GFS2_GL_HASH_SHIFT) 66#define GFS2_GL_HASH_SIZE (1 << GFS2_GL_HASH_SHIFT)
54#define GFS2_GL_HASH_MASK (GFS2_GL_HASH_SIZE - 1) 67#define GFS2_GL_HASH_MASK (GFS2_GL_HASH_SIZE - 1)
55 68
56static struct gfs2_gl_hash_bucket gl_hash_table[GFS2_GL_HASH_SIZE]; 69static struct gfs2_gl_hash_bucket gl_hash_table[GFS2_GL_HASH_SIZE];
70static struct dentry *gfs2_root;
57 71
58/* 72/*
59 * Despite what you might think, the numbers below are not arbitrary :-) 73 * Despite what you might think, the numbers below are not arbitrary :-)
@@ -202,7 +216,6 @@ int gfs2_glock_put(struct gfs2_glock *gl)
202 gfs2_assert(sdp, list_empty(&gl->gl_reclaim)); 216 gfs2_assert(sdp, list_empty(&gl->gl_reclaim));
203 gfs2_assert(sdp, list_empty(&gl->gl_holders)); 217 gfs2_assert(sdp, list_empty(&gl->gl_holders));
204 gfs2_assert(sdp, list_empty(&gl->gl_waiters1)); 218 gfs2_assert(sdp, list_empty(&gl->gl_waiters1));
205 gfs2_assert(sdp, list_empty(&gl->gl_waiters2));
206 gfs2_assert(sdp, list_empty(&gl->gl_waiters3)); 219 gfs2_assert(sdp, list_empty(&gl->gl_waiters3));
207 glock_free(gl); 220 glock_free(gl);
208 rv = 1; 221 rv = 1;
@@ -303,7 +316,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
303 atomic_set(&gl->gl_ref, 1); 316 atomic_set(&gl->gl_ref, 1);
304 gl->gl_state = LM_ST_UNLOCKED; 317 gl->gl_state = LM_ST_UNLOCKED;
305 gl->gl_hash = hash; 318 gl->gl_hash = hash;
306 gl->gl_owner = NULL; 319 gl->gl_owner_pid = 0;
307 gl->gl_ip = 0; 320 gl->gl_ip = 0;
308 gl->gl_ops = glops; 321 gl->gl_ops = glops;
309 gl->gl_req_gh = NULL; 322 gl->gl_req_gh = NULL;
@@ -367,7 +380,7 @@ void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags,
367 INIT_LIST_HEAD(&gh->gh_list); 380 INIT_LIST_HEAD(&gh->gh_list);
368 gh->gh_gl = gl; 381 gh->gh_gl = gl;
369 gh->gh_ip = (unsigned long)__builtin_return_address(0); 382 gh->gh_ip = (unsigned long)__builtin_return_address(0);
370 gh->gh_owner = current; 383 gh->gh_owner_pid = current->pid;
371 gh->gh_state = state; 384 gh->gh_state = state;
372 gh->gh_flags = flags; 385 gh->gh_flags = flags;
373 gh->gh_error = 0; 386 gh->gh_error = 0;
@@ -389,7 +402,7 @@ void gfs2_holder_reinit(unsigned int state, unsigned flags, struct gfs2_holder *
389{ 402{
390 gh->gh_state = state; 403 gh->gh_state = state;
391 gh->gh_flags = flags; 404 gh->gh_flags = flags;
392 gh->gh_iflags &= 1 << HIF_ALLOCED; 405 gh->gh_iflags = 0;
393 gh->gh_ip = (unsigned long)__builtin_return_address(0); 406 gh->gh_ip = (unsigned long)__builtin_return_address(0);
394} 407}
395 408
@@ -406,54 +419,8 @@ void gfs2_holder_uninit(struct gfs2_holder *gh)
406 gh->gh_ip = 0; 419 gh->gh_ip = 0;
407} 420}
408 421
409/** 422static void gfs2_holder_wake(struct gfs2_holder *gh)
410 * gfs2_holder_get - get a struct gfs2_holder structure
411 * @gl: the glock
412 * @state: the state we're requesting
413 * @flags: the modifier flags
414 * @gfp_flags:
415 *
416 * Figure out how big an impact this function has. Either:
417 * 1) Replace it with a cache of structures hanging off the struct gfs2_sbd
418 * 2) Leave it like it is
419 *
420 * Returns: the holder structure, NULL on ENOMEM
421 */
422
423static struct gfs2_holder *gfs2_holder_get(struct gfs2_glock *gl,
424 unsigned int state,
425 int flags, gfp_t gfp_flags)
426{
427 struct gfs2_holder *gh;
428
429 gh = kmalloc(sizeof(struct gfs2_holder), gfp_flags);
430 if (!gh)
431 return NULL;
432
433 gfs2_holder_init(gl, state, flags, gh);
434 set_bit(HIF_ALLOCED, &gh->gh_iflags);
435 gh->gh_ip = (unsigned long)__builtin_return_address(0);
436 return gh;
437}
438
439/**
440 * gfs2_holder_put - get rid of a struct gfs2_holder structure
441 * @gh: the holder structure
442 *
443 */
444
445static void gfs2_holder_put(struct gfs2_holder *gh)
446{ 423{
447 gfs2_holder_uninit(gh);
448 kfree(gh);
449}
450
451static void gfs2_holder_dispose_or_wake(struct gfs2_holder *gh)
452{
453 if (test_bit(HIF_DEALLOC, &gh->gh_iflags)) {
454 gfs2_holder_put(gh);
455 return;
456 }
457 clear_bit(HIF_WAIT, &gh->gh_iflags); 424 clear_bit(HIF_WAIT, &gh->gh_iflags);
458 smp_mb(); 425 smp_mb();
459 wake_up_bit(&gh->gh_iflags, HIF_WAIT); 426 wake_up_bit(&gh->gh_iflags, HIF_WAIT);
@@ -519,7 +486,7 @@ static int rq_promote(struct gfs2_holder *gh)
519 gfs2_reclaim_glock(sdp); 486 gfs2_reclaim_glock(sdp);
520 } 487 }
521 488
522 gfs2_glock_xmote_th(gh); 489 gfs2_glock_xmote_th(gh->gh_gl, gh);
523 spin_lock(&gl->gl_spin); 490 spin_lock(&gl->gl_spin);
524 } 491 }
525 return 1; 492 return 1;
@@ -542,7 +509,7 @@ static int rq_promote(struct gfs2_holder *gh)
542 gh->gh_error = 0; 509 gh->gh_error = 0;
543 set_bit(HIF_HOLDER, &gh->gh_iflags); 510 set_bit(HIF_HOLDER, &gh->gh_iflags);
544 511
545 gfs2_holder_dispose_or_wake(gh); 512 gfs2_holder_wake(gh);
546 513
547 return 0; 514 return 0;
548} 515}
@@ -554,32 +521,24 @@ static int rq_promote(struct gfs2_holder *gh)
554 * Returns: 1 if the queue is blocked 521 * Returns: 1 if the queue is blocked
555 */ 522 */
556 523
557static int rq_demote(struct gfs2_holder *gh) 524static int rq_demote(struct gfs2_glock *gl)
558{ 525{
559 struct gfs2_glock *gl = gh->gh_gl;
560
561 if (!list_empty(&gl->gl_holders)) 526 if (!list_empty(&gl->gl_holders))
562 return 1; 527 return 1;
563 528
564 if (gl->gl_state == gh->gh_state || gl->gl_state == LM_ST_UNLOCKED) { 529 if (gl->gl_state == gl->gl_demote_state ||
565 list_del_init(&gh->gh_list); 530 gl->gl_state == LM_ST_UNLOCKED) {
566 gh->gh_error = 0; 531 clear_bit(GLF_DEMOTE, &gl->gl_flags);
567 spin_unlock(&gl->gl_spin); 532 return 0;
568 gfs2_holder_dispose_or_wake(gh);
569 spin_lock(&gl->gl_spin);
570 } else {
571 gl->gl_req_gh = gh;
572 set_bit(GLF_LOCK, &gl->gl_flags);
573 spin_unlock(&gl->gl_spin);
574
575 if (gh->gh_state == LM_ST_UNLOCKED ||
576 gl->gl_state != LM_ST_EXCLUSIVE)
577 gfs2_glock_drop_th(gl);
578 else
579 gfs2_glock_xmote_th(gh);
580
581 spin_lock(&gl->gl_spin);
582 } 533 }
534 set_bit(GLF_LOCK, &gl->gl_flags);
535 spin_unlock(&gl->gl_spin);
536 if (gl->gl_demote_state == LM_ST_UNLOCKED ||
537 gl->gl_state != LM_ST_EXCLUSIVE)
538 gfs2_glock_drop_th(gl);
539 else
540 gfs2_glock_xmote_th(gl, NULL);
541 spin_lock(&gl->gl_spin);
583 542
584 return 0; 543 return 0;
585} 544}
@@ -607,16 +566,8 @@ static void run_queue(struct gfs2_glock *gl)
607 else 566 else
608 gfs2_assert_warn(gl->gl_sbd, 0); 567 gfs2_assert_warn(gl->gl_sbd, 0);
609 568
610 } else if (!list_empty(&gl->gl_waiters2) && 569 } else if (test_bit(GLF_DEMOTE, &gl->gl_flags)) {
611 !test_bit(GLF_SKIP_WAITERS2, &gl->gl_flags)) { 570 blocked = rq_demote(gl);
612 gh = list_entry(gl->gl_waiters2.next,
613 struct gfs2_holder, gh_list);
614
615 if (test_bit(HIF_DEMOTE, &gh->gh_iflags))
616 blocked = rq_demote(gh);
617 else
618 gfs2_assert_warn(gl->gl_sbd, 0);
619
620 } else if (!list_empty(&gl->gl_waiters3)) { 571 } else if (!list_empty(&gl->gl_waiters3)) {
621 gh = list_entry(gl->gl_waiters3.next, 572 gh = list_entry(gl->gl_waiters3.next,
622 struct gfs2_holder, gh_list); 573 struct gfs2_holder, gh_list);
@@ -654,7 +605,7 @@ static void gfs2_glmutex_lock(struct gfs2_glock *gl)
654 if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) { 605 if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) {
655 list_add_tail(&gh.gh_list, &gl->gl_waiters1); 606 list_add_tail(&gh.gh_list, &gl->gl_waiters1);
656 } else { 607 } else {
657 gl->gl_owner = current; 608 gl->gl_owner_pid = current->pid;
658 gl->gl_ip = (unsigned long)__builtin_return_address(0); 609 gl->gl_ip = (unsigned long)__builtin_return_address(0);
659 clear_bit(HIF_WAIT, &gh.gh_iflags); 610 clear_bit(HIF_WAIT, &gh.gh_iflags);
660 smp_mb(); 611 smp_mb();
@@ -681,7 +632,7 @@ static int gfs2_glmutex_trylock(struct gfs2_glock *gl)
681 if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) { 632 if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) {
682 acquired = 0; 633 acquired = 0;
683 } else { 634 } else {
684 gl->gl_owner = current; 635 gl->gl_owner_pid = current->pid;
685 gl->gl_ip = (unsigned long)__builtin_return_address(0); 636 gl->gl_ip = (unsigned long)__builtin_return_address(0);
686 } 637 }
687 spin_unlock(&gl->gl_spin); 638 spin_unlock(&gl->gl_spin);
@@ -699,7 +650,7 @@ static void gfs2_glmutex_unlock(struct gfs2_glock *gl)
699{ 650{
700 spin_lock(&gl->gl_spin); 651 spin_lock(&gl->gl_spin);
701 clear_bit(GLF_LOCK, &gl->gl_flags); 652 clear_bit(GLF_LOCK, &gl->gl_flags);
702 gl->gl_owner = NULL; 653 gl->gl_owner_pid = 0;
703 gl->gl_ip = 0; 654 gl->gl_ip = 0;
704 run_queue(gl); 655 run_queue(gl);
705 BUG_ON(!spin_is_locked(&gl->gl_spin)); 656 BUG_ON(!spin_is_locked(&gl->gl_spin));
@@ -707,50 +658,24 @@ static void gfs2_glmutex_unlock(struct gfs2_glock *gl)
707} 658}
708 659
709/** 660/**
710 * handle_callback - add a demote request to a lock's queue 661 * handle_callback - process a demote request
711 * @gl: the glock 662 * @gl: the glock
712 * @state: the state the caller wants us to change to 663 * @state: the state the caller wants us to change to
713 * 664 *
714 * Note: This may fail silently if we are out of memory. 665 * There are only two requests that we are going to see in actual
 666 * practice: LM_ST_SHARED and LM_ST_UNLOCKED
715 */ 667 */
716 668
717static void handle_callback(struct gfs2_glock *gl, unsigned int state) 669static void handle_callback(struct gfs2_glock *gl, unsigned int state)
718{ 670{
719 struct gfs2_holder *gh, *new_gh = NULL;
720
721restart:
722 spin_lock(&gl->gl_spin); 671 spin_lock(&gl->gl_spin);
723 672 if (test_and_set_bit(GLF_DEMOTE, &gl->gl_flags) == 0) {
724 list_for_each_entry(gh, &gl->gl_waiters2, gh_list) { 673 gl->gl_demote_state = state;
725 if (test_bit(HIF_DEMOTE, &gh->gh_iflags) && 674 gl->gl_demote_time = jiffies;
726 gl->gl_req_gh != gh) { 675 } else if (gl->gl_demote_state != LM_ST_UNLOCKED) {
727 if (gh->gh_state != state) 676 gl->gl_demote_state = state;
728 gh->gh_state = LM_ST_UNLOCKED;
729 goto out;
730 }
731 }
732
733 if (new_gh) {
734 list_add_tail(&new_gh->gh_list, &gl->gl_waiters2);
735 new_gh = NULL;
736 } else {
737 spin_unlock(&gl->gl_spin);
738
739 new_gh = gfs2_holder_get(gl, state, LM_FLAG_TRY, GFP_NOFS);
740 if (!new_gh)
741 return;
742 set_bit(HIF_DEMOTE, &new_gh->gh_iflags);
743 set_bit(HIF_DEALLOC, &new_gh->gh_iflags);
744 set_bit(HIF_WAIT, &new_gh->gh_iflags);
745
746 goto restart;
747 } 677 }
748
749out:
750 spin_unlock(&gl->gl_spin); 678 spin_unlock(&gl->gl_spin);
751
752 if (new_gh)
753 gfs2_holder_put(new_gh);
754} 679}
755 680
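[Editor's note] Demote requests are no longer queued as allocated holders on gl_waiters2; they collapse into a single GLF_DEMOTE bit plus a (state, time) pair, which also removes the silent ENOMEM failure mode the old comment warned about. A stand-alone sketch of just the merging rule, with a trimmed hypothetical glock so it reads in isolation; GLF_DEMOTE and LM_ST_UNLOCKED keep their meanings from the patch.

#include <linux/bitops.h>
#include <linux/jiffies.h>
#include <linux/spinlock.h>

struct demo_glock {			/* trimmed stand-in for gfs2_glock */
	spinlock_t gl_spin;
	unsigned long gl_flags;
	unsigned int gl_demote_state;
	unsigned long gl_demote_time;
};

static void record_demote(struct demo_glock *gl, unsigned int state)
{
	spin_lock(&gl->gl_spin);
	if (!test_and_set_bit(GLF_DEMOTE, &gl->gl_flags)) {
		/* First request: remember what was asked for, and when. */
		gl->gl_demote_state = state;
		gl->gl_demote_time = jiffies;
	} else if (gl->gl_demote_state != LM_ST_UNLOCKED) {
		/* A request is already pending: the newer state replaces
		 * it unless the pending demand is already LM_ST_UNLOCKED,
		 * the strongest possible demotion. */
		gl->gl_demote_state = state;
	}
	spin_unlock(&gl->gl_spin);
}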
756/** 681/**
@@ -810,56 +735,37 @@ static void xmote_bh(struct gfs2_glock *gl, unsigned int ret)
810 735
811 /* Deal with each possible exit condition */ 736 /* Deal with each possible exit condition */
812 737
813 if (!gh) 738 if (!gh) {
814 gl->gl_stamp = jiffies; 739 gl->gl_stamp = jiffies;
815 else if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) { 740 if (ret & LM_OUT_CANCELED)
741 op_done = 0;
742 else
743 clear_bit(GLF_DEMOTE, &gl->gl_flags);
744 } else {
816 spin_lock(&gl->gl_spin); 745 spin_lock(&gl->gl_spin);
817 list_del_init(&gh->gh_list); 746 list_del_init(&gh->gh_list);
818 gh->gh_error = -EIO; 747 gh->gh_error = -EIO;
819 spin_unlock(&gl->gl_spin); 748 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
820 } else if (test_bit(HIF_DEMOTE, &gh->gh_iflags)) { 749 goto out;
821 spin_lock(&gl->gl_spin); 750 gh->gh_error = GLR_CANCELED;
822 list_del_init(&gh->gh_list); 751 if (ret & LM_OUT_CANCELED)
823 if (gl->gl_state == gh->gh_state || 752 goto out;
824 gl->gl_state == LM_ST_UNLOCKED) { 753 if (relaxed_state_ok(gl->gl_state, gh->gh_state, gh->gh_flags)) {
754 list_add_tail(&gh->gh_list, &gl->gl_holders);
825 gh->gh_error = 0; 755 gh->gh_error = 0;
826 } else { 756 set_bit(HIF_HOLDER, &gh->gh_iflags);
827 if (gfs2_assert_warn(sdp, gh->gh_flags & 757 set_bit(HIF_FIRST, &gh->gh_iflags);
828 (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) == -1) 758 op_done = 0;
829 fs_warn(sdp, "ret = 0x%.8X\n", ret); 759 goto out;
830 gh->gh_error = GLR_TRYFAILED;
831 } 760 }
832 spin_unlock(&gl->gl_spin);
833
834 if (ret & LM_OUT_CANCELED)
835 handle_callback(gl, LM_ST_UNLOCKED);
836
837 } else if (ret & LM_OUT_CANCELED) {
838 spin_lock(&gl->gl_spin);
839 list_del_init(&gh->gh_list);
840 gh->gh_error = GLR_CANCELED;
841 spin_unlock(&gl->gl_spin);
842
843 } else if (relaxed_state_ok(gl->gl_state, gh->gh_state, gh->gh_flags)) {
844 spin_lock(&gl->gl_spin);
845 list_move_tail(&gh->gh_list, &gl->gl_holders);
846 gh->gh_error = 0;
847 set_bit(HIF_HOLDER, &gh->gh_iflags);
848 spin_unlock(&gl->gl_spin);
849
850 set_bit(HIF_FIRST, &gh->gh_iflags);
851
852 op_done = 0;
853
854 } else if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) {
855 spin_lock(&gl->gl_spin);
856 list_del_init(&gh->gh_list);
857 gh->gh_error = GLR_TRYFAILED; 761 gh->gh_error = GLR_TRYFAILED;
858 spin_unlock(&gl->gl_spin); 762 if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))
859 763 goto out;
860 } else { 764 gh->gh_error = -EINVAL;
861 if (gfs2_assert_withdraw(sdp, 0) == -1) 765 if (gfs2_assert_withdraw(sdp, 0) == -1)
862 fs_err(sdp, "ret = 0x%.8X\n", ret); 766 fs_err(sdp, "ret = 0x%.8X\n", ret);
767out:
768 spin_unlock(&gl->gl_spin);
863 } 769 }
864 770
865 if (glops->go_xmote_bh) 771 if (glops->go_xmote_bh)
@@ -877,7 +783,7 @@ static void xmote_bh(struct gfs2_glock *gl, unsigned int ret)
877 gfs2_glock_put(gl); 783 gfs2_glock_put(gl);
878 784
879 if (gh) 785 if (gh)
880 gfs2_holder_dispose_or_wake(gh); 786 gfs2_holder_wake(gh);
881} 787}
882 788
883/** 789/**
@@ -888,12 +794,11 @@ static void xmote_bh(struct gfs2_glock *gl, unsigned int ret)
888 * 794 *
889 */ 795 */
890 796
891void gfs2_glock_xmote_th(struct gfs2_holder *gh) 797void gfs2_glock_xmote_th(struct gfs2_glock *gl, struct gfs2_holder *gh)
892{ 798{
893 struct gfs2_glock *gl = gh->gh_gl;
894 struct gfs2_sbd *sdp = gl->gl_sbd; 799 struct gfs2_sbd *sdp = gl->gl_sbd;
895 int flags = gh->gh_flags; 800 int flags = gh ? gh->gh_flags : 0;
896 unsigned state = gh->gh_state; 801 unsigned state = gh ? gh->gh_state : gl->gl_demote_state;
897 const struct gfs2_glock_operations *glops = gl->gl_ops; 802 const struct gfs2_glock_operations *glops = gl->gl_ops;
898 int lck_flags = flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB | 803 int lck_flags = flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB |
899 LM_FLAG_NOEXP | LM_FLAG_ANY | 804 LM_FLAG_NOEXP | LM_FLAG_ANY |
@@ -943,6 +848,7 @@ static void drop_bh(struct gfs2_glock *gl, unsigned int ret)
943 gfs2_assert_warn(sdp, !ret); 848 gfs2_assert_warn(sdp, !ret);
944 849
945 state_change(gl, LM_ST_UNLOCKED); 850 state_change(gl, LM_ST_UNLOCKED);
851 clear_bit(GLF_DEMOTE, &gl->gl_flags);
946 852
947 if (glops->go_inval) 853 if (glops->go_inval)
948 glops->go_inval(gl, DIO_METADATA); 854 glops->go_inval(gl, DIO_METADATA);
@@ -964,7 +870,7 @@ static void drop_bh(struct gfs2_glock *gl, unsigned int ret)
964 gfs2_glock_put(gl); 870 gfs2_glock_put(gl);
965 871
966 if (gh) 872 if (gh)
967 gfs2_holder_dispose_or_wake(gh); 873 gfs2_holder_wake(gh);
968} 874}
969 875
970/** 876/**
@@ -1097,18 +1003,32 @@ static int glock_wait_internal(struct gfs2_holder *gh)
1097} 1003}
1098 1004
1099static inline struct gfs2_holder * 1005static inline struct gfs2_holder *
1100find_holder_by_owner(struct list_head *head, struct task_struct *owner) 1006find_holder_by_owner(struct list_head *head, pid_t pid)
1101{ 1007{
1102 struct gfs2_holder *gh; 1008 struct gfs2_holder *gh;
1103 1009
1104 list_for_each_entry(gh, head, gh_list) { 1010 list_for_each_entry(gh, head, gh_list) {
1105 if (gh->gh_owner == owner) 1011 if (gh->gh_owner_pid == pid)
1106 return gh; 1012 return gh;
1107 } 1013 }
1108 1014
1109 return NULL; 1015 return NULL;
1110} 1016}
1111 1017
1018static void print_dbg(struct glock_iter *gi, const char *fmt, ...)
1019{
1020 va_list args;
1021
1022 va_start(args, fmt);
1023 if (gi) {
1024 vsprintf(gi->string, fmt, args);
1025 seq_printf(gi->seq, gi->string);
1026 }
1027 else
1028 vprintk(fmt, args);
1029 va_end(args);
1030}
1031
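[Editor's note] print_dbg() fans one message out to either a seq_file (on the debugfs path, via gi->seq) or the console (when gfs2_dump_lockstate() passes a NULL iterator). A cautious variant of the same idea is sketched below; unlike the code above it bounds the scratch buffer with vsnprintf() and emits the result as data via "%s". That hardening is editorial, not part of this patch.

#include <linux/kernel.h>
#include <linux/seq_file.h>

static void demo_print_dbg(struct seq_file *seq, char *buf, size_t len,
			   const char *fmt, ...)
{
	va_list args;

	va_start(args, fmt);
	if (seq) {
		vsnprintf(buf, len, fmt, args);	/* never overruns buf */
		seq_printf(seq, "%s", buf);	/* buf is data, not a format */
	} else {
		vprintk(fmt, args);
	}
	va_end(args);
}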
1112/** 1032/**
1113 * add_to_queue - Add a holder to the wait queue (but look for recursion) 1033 * add_to_queue - Add a holder to the wait queue (but look for recursion)
1114 * @gh: the holder structure to add 1034 * @gh: the holder structure to add
@@ -1120,24 +1040,24 @@ static void add_to_queue(struct gfs2_holder *gh)
1120 struct gfs2_glock *gl = gh->gh_gl; 1040 struct gfs2_glock *gl = gh->gh_gl;
1121 struct gfs2_holder *existing; 1041 struct gfs2_holder *existing;
1122 1042
1123 BUG_ON(!gh->gh_owner); 1043 BUG_ON(!gh->gh_owner_pid);
1124 if (test_and_set_bit(HIF_WAIT, &gh->gh_iflags)) 1044 if (test_and_set_bit(HIF_WAIT, &gh->gh_iflags))
1125 BUG(); 1045 BUG();
1126 1046
1127 existing = find_holder_by_owner(&gl->gl_holders, gh->gh_owner); 1047 existing = find_holder_by_owner(&gl->gl_holders, gh->gh_owner_pid);
1128 if (existing) { 1048 if (existing) {
1129 print_symbol(KERN_WARNING "original: %s\n", existing->gh_ip); 1049 print_symbol(KERN_WARNING "original: %s\n", existing->gh_ip);
1130 printk(KERN_INFO "pid : %d\n", existing->gh_owner->pid); 1050 printk(KERN_INFO "pid : %d\n", existing->gh_owner_pid);
1131 printk(KERN_INFO "lock type : %d lock state : %d\n", 1051 printk(KERN_INFO "lock type : %d lock state : %d\n",
1132 existing->gh_gl->gl_name.ln_type, existing->gh_gl->gl_state); 1052 existing->gh_gl->gl_name.ln_type, existing->gh_gl->gl_state);
1133 print_symbol(KERN_WARNING "new: %s\n", gh->gh_ip); 1053 print_symbol(KERN_WARNING "new: %s\n", gh->gh_ip);
1134 printk(KERN_INFO "pid : %d\n", gh->gh_owner->pid); 1054 printk(KERN_INFO "pid : %d\n", gh->gh_owner_pid);
1135 printk(KERN_INFO "lock type : %d lock state : %d\n", 1055 printk(KERN_INFO "lock type : %d lock state : %d\n",
1136 gl->gl_name.ln_type, gl->gl_state); 1056 gl->gl_name.ln_type, gl->gl_state);
1137 BUG(); 1057 BUG();
1138 } 1058 }
1139 1059
1140 existing = find_holder_by_owner(&gl->gl_waiters3, gh->gh_owner); 1060 existing = find_holder_by_owner(&gl->gl_waiters3, gh->gh_owner_pid);
1141 if (existing) { 1061 if (existing) {
1142 print_symbol(KERN_WARNING "original: %s\n", existing->gh_ip); 1062 print_symbol(KERN_WARNING "original: %s\n", existing->gh_ip);
1143 print_symbol(KERN_WARNING "new: %s\n", gh->gh_ip); 1063 print_symbol(KERN_WARNING "new: %s\n", gh->gh_ip);
@@ -1267,9 +1187,8 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
1267 if (glops->go_unlock) 1187 if (glops->go_unlock)
1268 glops->go_unlock(gh); 1188 glops->go_unlock(gh);
1269 1189
1270 gl->gl_stamp = jiffies;
1271
1272 spin_lock(&gl->gl_spin); 1190 spin_lock(&gl->gl_spin);
1191 gl->gl_stamp = jiffies;
1273 } 1192 }
1274 1193
1275 clear_bit(GLF_LOCK, &gl->gl_flags); 1194 clear_bit(GLF_LOCK, &gl->gl_flags);
@@ -1841,6 +1760,15 @@ void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait)
1841 * Diagnostic routines to help debug distributed deadlock 1760 * Diagnostic routines to help debug distributed deadlock
1842 */ 1761 */
1843 1762
1763static void gfs2_print_symbol(struct glock_iter *gi, const char *fmt,
1764 unsigned long address)
1765{
1766 char buffer[KSYM_SYMBOL_LEN];
1767
1768 sprint_symbol(buffer, address);
1769 print_dbg(gi, fmt, buffer);
1770}
1771
1844/** 1772/**
1845 * dump_holder - print information about a glock holder 1773 * dump_holder - print information about a glock holder
1846 * @str: a string naming the type of holder 1774 * @str: a string naming the type of holder
@@ -1849,31 +1777,37 @@ void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait)
1849 * Returns: 0 on success, -ENOBUFS when we run out of space 1777 * Returns: 0 on success, -ENOBUFS when we run out of space
1850 */ 1778 */
1851 1779
1852static int dump_holder(char *str, struct gfs2_holder *gh) 1780static int dump_holder(struct glock_iter *gi, char *str,
1781 struct gfs2_holder *gh)
1853{ 1782{
1854 unsigned int x; 1783 unsigned int x;
1855 int error = -ENOBUFS; 1784 struct task_struct *gh_owner;
1856 1785
1857 printk(KERN_INFO " %s\n", str); 1786 print_dbg(gi, " %s\n", str);
1858 printk(KERN_INFO " owner = %ld\n", 1787 if (gh->gh_owner_pid) {
1859 (gh->gh_owner) ? (long)gh->gh_owner->pid : -1); 1788 print_dbg(gi, " owner = %ld ", (long)gh->gh_owner_pid);
1860 printk(KERN_INFO " gh_state = %u\n", gh->gh_state); 1789 gh_owner = find_task_by_pid(gh->gh_owner_pid);
1861 printk(KERN_INFO " gh_flags ="); 1790 if (gh_owner)
1791 print_dbg(gi, "(%s)\n", gh_owner->comm);
1792 else
1793 print_dbg(gi, "(ended)\n");
1794 } else
1795 print_dbg(gi, " owner = -1\n");
1796 print_dbg(gi, " gh_state = %u\n", gh->gh_state);
1797 print_dbg(gi, " gh_flags =");
1862 for (x = 0; x < 32; x++) 1798 for (x = 0; x < 32; x++)
1863 if (gh->gh_flags & (1 << x)) 1799 if (gh->gh_flags & (1 << x))
1864 printk(" %u", x); 1800 print_dbg(gi, " %u", x);
1865 printk(" \n"); 1801 print_dbg(gi, " \n");
1866 printk(KERN_INFO " error = %d\n", gh->gh_error); 1802 print_dbg(gi, " error = %d\n", gh->gh_error);
1867 printk(KERN_INFO " gh_iflags ="); 1803 print_dbg(gi, " gh_iflags =");
1868 for (x = 0; x < 32; x++) 1804 for (x = 0; x < 32; x++)
1869 if (test_bit(x, &gh->gh_iflags)) 1805 if (test_bit(x, &gh->gh_iflags))
1870 printk(" %u", x); 1806 print_dbg(gi, " %u", x);
1871 printk(" \n"); 1807 print_dbg(gi, " \n");
1872 print_symbol(KERN_INFO " initialized at: %s\n", gh->gh_ip); 1808 gfs2_print_symbol(gi, " initialized at: %s\n", gh->gh_ip);
1873
1874 error = 0;
1875 1809
1876 return error; 1810 return 0;
1877} 1811}
1878 1812
1879/** 1813/**
@@ -1883,25 +1817,20 @@ static int dump_holder(char *str, struct gfs2_holder *gh)
1883 * Returns: 0 on success, -ENOBUFS when we run out of space 1817 * Returns: 0 on success, -ENOBUFS when we run out of space
1884 */ 1818 */
1885 1819
1886static int dump_inode(struct gfs2_inode *ip) 1820static int dump_inode(struct glock_iter *gi, struct gfs2_inode *ip)
1887{ 1821{
1888 unsigned int x; 1822 unsigned int x;
1889 int error = -ENOBUFS;
1890 1823
1891 printk(KERN_INFO " Inode:\n"); 1824 print_dbg(gi, " Inode:\n");
1892 printk(KERN_INFO " num = %llu %llu\n", 1825 print_dbg(gi, " num = %llu/%llu\n",
1893 (unsigned long long)ip->i_num.no_formal_ino, 1826 ip->i_num.no_formal_ino, ip->i_num.no_addr);
1894 (unsigned long long)ip->i_num.no_addr); 1827 print_dbg(gi, " type = %u\n", IF2DT(ip->i_inode.i_mode));
1895 printk(KERN_INFO " type = %u\n", IF2DT(ip->i_inode.i_mode)); 1828 print_dbg(gi, " i_flags =");
1896 printk(KERN_INFO " i_flags =");
1897 for (x = 0; x < 32; x++) 1829 for (x = 0; x < 32; x++)
1898 if (test_bit(x, &ip->i_flags)) 1830 if (test_bit(x, &ip->i_flags))
1899 printk(" %u", x); 1831 print_dbg(gi, " %u", x);
1900 printk(" \n"); 1832 print_dbg(gi, " \n");
1901 1833 return 0;
1902 error = 0;
1903
1904 return error;
1905} 1834}
1906 1835
1907/** 1836/**
@@ -1912,74 +1841,86 @@ static int dump_inode(struct gfs2_inode *ip)
1912 * Returns: 0 on success, -ENOBUFS when we run out of space 1841 * Returns: 0 on success, -ENOBUFS when we run out of space
1913 */ 1842 */
1914 1843
1915static int dump_glock(struct gfs2_glock *gl) 1844static int dump_glock(struct glock_iter *gi, struct gfs2_glock *gl)
1916{ 1845{
1917 struct gfs2_holder *gh; 1846 struct gfs2_holder *gh;
1918 unsigned int x; 1847 unsigned int x;
1919 int error = -ENOBUFS; 1848 int error = -ENOBUFS;
1849 struct task_struct *gl_owner;
1920 1850
1921 spin_lock(&gl->gl_spin); 1851 spin_lock(&gl->gl_spin);
1922 1852
1923 printk(KERN_INFO "Glock 0x%p (%u, %llu)\n", gl, gl->gl_name.ln_type, 1853 print_dbg(gi, "Glock 0x%p (%u, %llu)\n", gl, gl->gl_name.ln_type,
1924 (unsigned long long)gl->gl_name.ln_number); 1854 (unsigned long long)gl->gl_name.ln_number);
1925 printk(KERN_INFO " gl_flags ="); 1855 print_dbg(gi, " gl_flags =");
1926 for (x = 0; x < 32; x++) { 1856 for (x = 0; x < 32; x++) {
1927 if (test_bit(x, &gl->gl_flags)) 1857 if (test_bit(x, &gl->gl_flags))
1928 printk(" %u", x); 1858 print_dbg(gi, " %u", x);
1929 } 1859 }
1930 printk(" \n"); 1860 if (!test_bit(GLF_LOCK, &gl->gl_flags))
1931 printk(KERN_INFO " gl_ref = %d\n", atomic_read(&gl->gl_ref)); 1861 print_dbg(gi, " (unlocked)");
1932 printk(KERN_INFO " gl_state = %u\n", gl->gl_state); 1862 print_dbg(gi, " \n");
1933 printk(KERN_INFO " gl_owner = %s\n", gl->gl_owner->comm); 1863 print_dbg(gi, " gl_ref = %d\n", atomic_read(&gl->gl_ref));
1934 print_symbol(KERN_INFO " gl_ip = %s\n", gl->gl_ip); 1864 print_dbg(gi, " gl_state = %u\n", gl->gl_state);
1935 printk(KERN_INFO " req_gh = %s\n", (gl->gl_req_gh) ? "yes" : "no"); 1865 if (gl->gl_owner_pid) {
1936 printk(KERN_INFO " req_bh = %s\n", (gl->gl_req_bh) ? "yes" : "no"); 1866 gl_owner = find_task_by_pid(gl->gl_owner_pid);
1937 printk(KERN_INFO " lvb_count = %d\n", atomic_read(&gl->gl_lvb_count)); 1867 if (gl_owner)
1938 printk(KERN_INFO " object = %s\n", (gl->gl_object) ? "yes" : "no"); 1868 print_dbg(gi, " gl_owner = pid %d (%s)\n",
1939 printk(KERN_INFO " le = %s\n", 1869 gl->gl_owner_pid, gl_owner->comm);
1870 else
1871 print_dbg(gi, " gl_owner = %d (ended)\n",
1872 gl->gl_owner_pid);
1873 } else
1874 print_dbg(gi, " gl_owner = -1\n");
1875 print_dbg(gi, " gl_ip = %lu\n", gl->gl_ip);
1876 print_dbg(gi, " req_gh = %s\n", (gl->gl_req_gh) ? "yes" : "no");
1877 print_dbg(gi, " req_bh = %s\n", (gl->gl_req_bh) ? "yes" : "no");
1878 print_dbg(gi, " lvb_count = %d\n", atomic_read(&gl->gl_lvb_count));
1879 print_dbg(gi, " object = %s\n", (gl->gl_object) ? "yes" : "no");
1880 print_dbg(gi, " le = %s\n",
1940 (list_empty(&gl->gl_le.le_list)) ? "no" : "yes"); 1881 (list_empty(&gl->gl_le.le_list)) ? "no" : "yes");
1941 printk(KERN_INFO " reclaim = %s\n", 1882 print_dbg(gi, " reclaim = %s\n",
1942 (list_empty(&gl->gl_reclaim)) ? "no" : "yes"); 1883 (list_empty(&gl->gl_reclaim)) ? "no" : "yes");
1943 if (gl->gl_aspace) 1884 if (gl->gl_aspace)
1944 printk(KERN_INFO " aspace = 0x%p nrpages = %lu\n", gl->gl_aspace, 1885 print_dbg(gi, " aspace = 0x%p nrpages = %lu\n", gl->gl_aspace,
1945 gl->gl_aspace->i_mapping->nrpages); 1886 gl->gl_aspace->i_mapping->nrpages);
1946 else 1887 else
1947 printk(KERN_INFO " aspace = no\n"); 1888 print_dbg(gi, " aspace = no\n");
1948 printk(KERN_INFO " ail = %d\n", atomic_read(&gl->gl_ail_count)); 1889 print_dbg(gi, " ail = %d\n", atomic_read(&gl->gl_ail_count));
1949 if (gl->gl_req_gh) { 1890 if (gl->gl_req_gh) {
1950 error = dump_holder("Request", gl->gl_req_gh); 1891 error = dump_holder(gi, "Request", gl->gl_req_gh);
1951 if (error) 1892 if (error)
1952 goto out; 1893 goto out;
1953 } 1894 }
1954 list_for_each_entry(gh, &gl->gl_holders, gh_list) { 1895 list_for_each_entry(gh, &gl->gl_holders, gh_list) {
1955 error = dump_holder("Holder", gh); 1896 error = dump_holder(gi, "Holder", gh);
1956 if (error) 1897 if (error)
1957 goto out; 1898 goto out;
1958 } 1899 }
1959 list_for_each_entry(gh, &gl->gl_waiters1, gh_list) { 1900 list_for_each_entry(gh, &gl->gl_waiters1, gh_list) {
1960 error = dump_holder("Waiter1", gh); 1901 error = dump_holder(gi, "Waiter1", gh);
1961 if (error)
1962 goto out;
1963 }
1964 list_for_each_entry(gh, &gl->gl_waiters2, gh_list) {
1965 error = dump_holder("Waiter2", gh);
1966 if (error) 1902 if (error)
1967 goto out; 1903 goto out;
1968 } 1904 }
1969 list_for_each_entry(gh, &gl->gl_waiters3, gh_list) { 1905 list_for_each_entry(gh, &gl->gl_waiters3, gh_list) {
1970 error = dump_holder("Waiter3", gh); 1906 error = dump_holder(gi, "Waiter3", gh);
1971 if (error) 1907 if (error)
1972 goto out; 1908 goto out;
1973 } 1909 }
1910 if (test_bit(GLF_DEMOTE, &gl->gl_flags)) {
1911 print_dbg(gi, " Demotion req to state %u (%llu uS ago)\n",
1912 gl->gl_demote_state,
1913 (u64)(jiffies - gl->gl_demote_time)*(1000000/HZ));
1914 }
1974 if (gl->gl_ops == &gfs2_inode_glops && gl->gl_object) { 1915 if (gl->gl_ops == &gfs2_inode_glops && gl->gl_object) {
1975 if (!test_bit(GLF_LOCK, &gl->gl_flags) && 1916 if (!test_bit(GLF_LOCK, &gl->gl_flags) &&
1976 list_empty(&gl->gl_holders)) { 1917 list_empty(&gl->gl_holders)) {
1977 error = dump_inode(gl->gl_object); 1918 error = dump_inode(gi, gl->gl_object);
1978 if (error) 1919 if (error)
1979 goto out; 1920 goto out;
1980 } else { 1921 } else {
1981 error = -ENOBUFS; 1922 error = -ENOBUFS;
1982 printk(KERN_INFO " Inode: busy\n"); 1923 print_dbg(gi, " Inode: busy\n");
1983 } 1924 }
1984 } 1925 }
1985 1926
@@ -2014,7 +1955,7 @@ static int gfs2_dump_lockstate(struct gfs2_sbd *sdp)
2014 if (gl->gl_sbd != sdp) 1955 if (gl->gl_sbd != sdp)
2015 continue; 1956 continue;
2016 1957
2017 error = dump_glock(gl); 1958 error = dump_glock(NULL, gl);
2018 if (error) 1959 if (error)
2019 break; 1960 break;
2020 } 1961 }
@@ -2043,3 +1984,189 @@ int __init gfs2_glock_init(void)
2043 return 0; 1984 return 0;
2044} 1985}
2045 1986
1987static int gfs2_glock_iter_next(struct glock_iter *gi)
1988{
1989 read_lock(gl_lock_addr(gi->hash));
1990 while (1) {
1991 if (!gi->hb_list) { /* If we don't have a hash bucket yet */
1992 gi->hb_list = &gl_hash_table[gi->hash].hb_list;
1993 if (hlist_empty(gi->hb_list)) {
1994 read_unlock(gl_lock_addr(gi->hash));
1995 gi->hash++;
1996 read_lock(gl_lock_addr(gi->hash));
1997 gi->hb_list = NULL;
1998 if (gi->hash >= GFS2_GL_HASH_SIZE) {
1999 read_unlock(gl_lock_addr(gi->hash));
2000 return 1;
2001 }
2002 else
2003 continue;
2004 }
2005 if (!hlist_empty(gi->hb_list)) {
2006 gi->gl = list_entry(gi->hb_list->first,
2007 struct gfs2_glock,
2008 gl_list);
2009 }
2010 } else {
2011 if (gi->gl->gl_list.next == NULL) {
2012 read_unlock(gl_lock_addr(gi->hash));
2013 gi->hash++;
2014 read_lock(gl_lock_addr(gi->hash));
2015 gi->hb_list = NULL;
2016 continue;
2017 }
2018 gi->gl = list_entry(gi->gl->gl_list.next,
2019 struct gfs2_glock, gl_list);
2020 }
2021 if (gi->gl)
2022 break;
2023 }
2024 read_unlock(gl_lock_addr(gi->hash));
2025 return 0;
2026}
2027
2028static void gfs2_glock_iter_free(struct glock_iter *gi)
2029{
2030 kfree(gi);
2031}
2032
2033static struct glock_iter *gfs2_glock_iter_init(struct gfs2_sbd *sdp)
2034{
2035 struct glock_iter *gi;
2036
2037 gi = kmalloc(sizeof (*gi), GFP_KERNEL);
2038 if (!gi)
2039 return NULL;
2040
2041 gi->sdp = sdp;
2042 gi->hash = 0;
2043 gi->gl = NULL;
2044 gi->hb_list = NULL;
2045 gi->seq = NULL;
2046 memset(gi->string, 0, sizeof(gi->string));
2047
2048 if (gfs2_glock_iter_next(gi)) {
2049 gfs2_glock_iter_free(gi);
2050 return NULL;
2051 }
2052
2053 return gi;
2054}
2055
2056static void *gfs2_glock_seq_start(struct seq_file *file, loff_t *pos)
2057{
2058 struct glock_iter *gi;
2059 loff_t n = *pos;
2060
2061 gi = gfs2_glock_iter_init(file->private);
2062 if (!gi)
2063 return NULL;
2064
2065 while (n--) {
2066 if (gfs2_glock_iter_next(gi)) {
2067 gfs2_glock_iter_free(gi);
2068 return NULL;
2069 }
2070 }
2071
2072 return gi;
2073}
2074
2075static void *gfs2_glock_seq_next(struct seq_file *file, void *iter_ptr,
2076 loff_t *pos)
2077{
2078 struct glock_iter *gi = iter_ptr;
2079
2080 (*pos)++;
2081
2082 if (gfs2_glock_iter_next(gi)) {
2083 gfs2_glock_iter_free(gi);
2084 return NULL;
2085 }
2086
2087 return gi;
2088}
2089
2090static void gfs2_glock_seq_stop(struct seq_file *file, void *iter_ptr)
2091{
2092 /* nothing for now */
2093}
2094
2095static int gfs2_glock_seq_show(struct seq_file *file, void *iter_ptr)
2096{
2097 struct glock_iter *gi = iter_ptr;
2098
2099 gi->seq = file;
2100 dump_glock(gi, gi->gl);
2101
2102 return 0;
2103}
2104
2105static struct seq_operations gfs2_glock_seq_ops = {
2106 .start = gfs2_glock_seq_start,
2107 .next = gfs2_glock_seq_next,
2108 .stop = gfs2_glock_seq_stop,
2109 .show = gfs2_glock_seq_show,
2110};
2111
2112static int gfs2_debugfs_open(struct inode *inode, struct file *file)
2113{
2114 struct seq_file *seq;
2115 int ret;
2116
2117 ret = seq_open(file, &gfs2_glock_seq_ops);
2118 if (ret)
2119 return ret;
2120
2121 seq = file->private_data;
2122 seq->private = inode->i_private;
2123
2124 return 0;
2125}
2126
2127static const struct file_operations gfs2_debug_fops = {
2128 .owner = THIS_MODULE,
2129 .open = gfs2_debugfs_open,
2130 .read = seq_read,
2131 .llseek = seq_lseek,
2132 .release = seq_release
2133};
2134
2135int gfs2_create_debugfs_file(struct gfs2_sbd *sdp)
2136{
2137 sdp->debugfs_dir = debugfs_create_dir(sdp->sd_table_name, gfs2_root);
2138 if (!sdp->debugfs_dir)
2139 return -ENOMEM;
2140 sdp->debugfs_dentry_glocks = debugfs_create_file("glocks",
2141 S_IFREG | S_IRUGO,
2142 sdp->debugfs_dir, sdp,
2143 &gfs2_debug_fops);
2144 if (!sdp->debugfs_dentry_glocks)
2145 return -ENOMEM;
2146
2147 return 0;
2148}
2149
2150void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp)
2151{
2152 if (sdp && sdp->debugfs_dir) {
2153 if (sdp->debugfs_dentry_glocks) {
2154 debugfs_remove(sdp->debugfs_dentry_glocks);
2155 sdp->debugfs_dentry_glocks = NULL;
2156 }
2157 debugfs_remove(sdp->debugfs_dir);
2158 sdp->debugfs_dir = NULL;
2159 }
2160}
2161
2162int gfs2_register_debugfs(void)
2163{
2164 gfs2_root = debugfs_create_dir("gfs2", NULL);
2165 return gfs2_root ? 0 : -ENOMEM;
2166}
2167
2168void gfs2_unregister_debugfs(void)
2169{
2170 debugfs_remove(gfs2_root);
2171 gfs2_root = NULL;
2172}
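[Editor's note] All of the iterator and seq_file plumbing above surfaces as a single read-only debugfs file per mount, created by gfs2_create_debugfs_file(). A userspace sketch of consuming it; the debugfs mount point and the "mycluster:myfs" table name are assumptions.

#include <stdio.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/sys/kernel/debug/gfs2/mycluster:myfs/glocks", "r");
	if (!f) {
		perror("glocks");
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);	/* one dump_glock() record per glock */
	fclose(f);
	return 0;
}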
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index f50e40ceca43..11477ca3a3c0 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -38,7 +38,7 @@ static inline int gfs2_glock_is_locked_by_me(struct gfs2_glock *gl)
38 /* Look in glock's list of holders for one with current task as owner */ 38 /* Look in glock's list of holders for one with current task as owner */
39 spin_lock(&gl->gl_spin); 39 spin_lock(&gl->gl_spin);
40 list_for_each_entry(gh, &gl->gl_holders, gh_list) { 40 list_for_each_entry(gh, &gl->gl_holders, gh_list) {
41 if (gh->gh_owner == current) { 41 if (gh->gh_owner_pid == current->pid) {
42 locked = 1; 42 locked = 1;
43 break; 43 break;
44 } 44 }
@@ -67,7 +67,7 @@ static inline int gfs2_glock_is_blocking(struct gfs2_glock *gl)
67{ 67{
68 int ret; 68 int ret;
69 spin_lock(&gl->gl_spin); 69 spin_lock(&gl->gl_spin);
70 ret = !list_empty(&gl->gl_waiters2) || !list_empty(&gl->gl_waiters3); 70 ret = test_bit(GLF_DEMOTE, &gl->gl_flags) || !list_empty(&gl->gl_waiters3);
71 spin_unlock(&gl->gl_spin); 71 spin_unlock(&gl->gl_spin);
72 return ret; 72 return ret;
73} 73}
@@ -135,5 +135,9 @@ void gfs2_scand_internal(struct gfs2_sbd *sdp);
135void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait); 135void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait);
136 136
137int __init gfs2_glock_init(void); 137int __init gfs2_glock_init(void);
138int gfs2_create_debugfs_file(struct gfs2_sbd *sdp);
139void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp);
140int gfs2_register_debugfs(void);
141void gfs2_unregister_debugfs(void);
138 142
139#endif /* __GLOCK_DOT_H__ */ 143#endif /* __GLOCK_DOT_H__ */
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 49f0dbf40d86..d995441373ab 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -115,11 +115,8 @@ enum {
115 /* Actions */ 115 /* Actions */
116 HIF_MUTEX = 0, 116 HIF_MUTEX = 0,
117 HIF_PROMOTE = 1, 117 HIF_PROMOTE = 1,
118 HIF_DEMOTE = 2,
119 118
120 /* States */ 119 /* States */
121 HIF_ALLOCED = 4,
122 HIF_DEALLOC = 5,
123 HIF_HOLDER = 6, 120 HIF_HOLDER = 6,
124 HIF_FIRST = 7, 121 HIF_FIRST = 7,
125 HIF_ABORTED = 9, 122 HIF_ABORTED = 9,
@@ -130,7 +127,7 @@ struct gfs2_holder {
130 struct list_head gh_list; 127 struct list_head gh_list;
131 128
132 struct gfs2_glock *gh_gl; 129 struct gfs2_glock *gh_gl;
133 struct task_struct *gh_owner; 130 pid_t gh_owner_pid;
134 unsigned int gh_state; 131 unsigned int gh_state;
135 unsigned gh_flags; 132 unsigned gh_flags;
136 133
@@ -142,8 +139,8 @@ struct gfs2_holder {
142enum { 139enum {
143 GLF_LOCK = 1, 140 GLF_LOCK = 1,
144 GLF_STICKY = 2, 141 GLF_STICKY = 2,
142 GLF_DEMOTE = 3,
145 GLF_DIRTY = 5, 143 GLF_DIRTY = 5,
146 GLF_SKIP_WAITERS2 = 6,
147}; 144};
148 145
149struct gfs2_glock { 146struct gfs2_glock {
@@ -156,11 +153,12 @@ struct gfs2_glock {
156 153
157 unsigned int gl_state; 154 unsigned int gl_state;
158 unsigned int gl_hash; 155 unsigned int gl_hash;
159 struct task_struct *gl_owner; 156 unsigned int gl_demote_state; /* state requested by remote node */
157 unsigned long gl_demote_time; /* time of first demote request */
158 pid_t gl_owner_pid;
160 unsigned long gl_ip; 159 unsigned long gl_ip;
161 struct list_head gl_holders; 160 struct list_head gl_holders;
162 struct list_head gl_waiters1; /* HIF_MUTEX */ 161 struct list_head gl_waiters1; /* HIF_MUTEX */
163 struct list_head gl_waiters2; /* HIF_DEMOTE */
164 struct list_head gl_waiters3; /* HIF_PROMOTE */ 162 struct list_head gl_waiters3; /* HIF_PROMOTE */
165 163
166 const struct gfs2_glock_operations *gl_ops; 164 const struct gfs2_glock_operations *gl_ops;
@@ -611,6 +609,8 @@ struct gfs2_sbd {
611 609
612 unsigned long sd_last_warning; 610 unsigned long sd_last_warning;
613 struct vfsmount *sd_gfs2mnt; 611 struct vfsmount *sd_gfs2mnt;
612 struct dentry *debugfs_dir; /* debugfs directory */
613 struct dentry *debugfs_dentry_glocks; /* for debugfs */
614}; 614};
615 615
616#endif /* __INCORE_DOT_H__ */ 616#endif /* __INCORE_DOT_H__ */
diff --git a/fs/gfs2/locking/dlm/lock.c b/fs/gfs2/locking/dlm/lock.c
index b167addf9fd1..c305255bfe8a 100644
--- a/fs/gfs2/locking/dlm/lock.c
+++ b/fs/gfs2/locking/dlm/lock.c
@@ -151,7 +151,7 @@ static inline unsigned int make_flags(struct gdlm_lock *lp,
151 151
152/* make_strname - convert GFS lock numbers to a string */ 152/* make_strname - convert GFS lock numbers to a string */
153 153
154static inline void make_strname(struct lm_lockname *lockname, 154static inline void make_strname(const struct lm_lockname *lockname,
155 struct gdlm_strname *str) 155 struct gdlm_strname *str)
156{ 156{
157 sprintf(str->name, "%8x%16llx", lockname->ln_type, 157 sprintf(str->name, "%8x%16llx", lockname->ln_type,
@@ -169,6 +169,7 @@ static int gdlm_create_lp(struct gdlm_ls *ls, struct lm_lockname *name,
169 return -ENOMEM; 169 return -ENOMEM;
170 170
171 lp->lockname = *name; 171 lp->lockname = *name;
172 make_strname(name, &lp->strname);
172 lp->ls = ls; 173 lp->ls = ls;
173 lp->cur = DLM_LOCK_IV; 174 lp->cur = DLM_LOCK_IV;
174 lp->lvb = NULL; 175 lp->lvb = NULL;
@@ -227,7 +228,6 @@ void gdlm_put_lock(void *lock)
227unsigned int gdlm_do_lock(struct gdlm_lock *lp) 228unsigned int gdlm_do_lock(struct gdlm_lock *lp)
228{ 229{
229 struct gdlm_ls *ls = lp->ls; 230 struct gdlm_ls *ls = lp->ls;
230 struct gdlm_strname str;
231 int error, bast = 1; 231 int error, bast = 1;
232 232
233 /* 233 /*
@@ -249,8 +249,6 @@ unsigned int gdlm_do_lock(struct gdlm_lock *lp)
249 if (test_bit(LFL_NOBAST, &lp->flags)) 249 if (test_bit(LFL_NOBAST, &lp->flags))
250 bast = 0; 250 bast = 0;
251 251
252 make_strname(&lp->lockname, &str);
253
254 set_bit(LFL_ACTIVE, &lp->flags); 252 set_bit(LFL_ACTIVE, &lp->flags);
255 253
256 log_debug("lk %x,%llx id %x %d,%d %x", lp->lockname.ln_type, 254 log_debug("lk %x,%llx id %x %d,%d %x", lp->lockname.ln_type,
@@ -258,8 +256,8 @@ unsigned int gdlm_do_lock(struct gdlm_lock *lp)
258 lp->cur, lp->req, lp->lkf); 256 lp->cur, lp->req, lp->lkf);
259 257
260 error = dlm_lock(ls->dlm_lockspace, lp->req, &lp->lksb, lp->lkf, 258 error = dlm_lock(ls->dlm_lockspace, lp->req, &lp->lksb, lp->lkf,
261 str.name, str.namelen, 0, gdlm_ast, lp, 259 lp->strname.name, lp->strname.namelen, 0, gdlm_ast,
262 bast ? gdlm_bast : NULL); 260 lp, bast ? gdlm_bast : NULL);
263 261
264 if ((error == -EAGAIN) && (lp->lkf & DLM_LKF_NOQUEUE)) { 262 if ((error == -EAGAIN) && (lp->lkf & DLM_LKF_NOQUEUE)) {
265 lp->lksb.sb_status = -EAGAIN; 263 lp->lksb.sb_status = -EAGAIN;
@@ -268,7 +266,7 @@ unsigned int gdlm_do_lock(struct gdlm_lock *lp)
268 } 266 }
269 267
270 if (error) { 268 if (error) {
271 log_debug("%s: gdlm_lock %x,%llx err=%d cur=%d req=%d lkf=%x " 269 log_error("%s: gdlm_lock %x,%llx err=%d cur=%d req=%d lkf=%x "
272 "flags=%lx", ls->fsname, lp->lockname.ln_type, 270 "flags=%lx", ls->fsname, lp->lockname.ln_type,
273 (unsigned long long)lp->lockname.ln_number, error, 271 (unsigned long long)lp->lockname.ln_number, error,
274 lp->cur, lp->req, lp->lkf, lp->flags); 272 lp->cur, lp->req, lp->lkf, lp->flags);
@@ -296,7 +294,7 @@ static unsigned int gdlm_do_unlock(struct gdlm_lock *lp)
296 error = dlm_unlock(ls->dlm_lockspace, lp->lksb.sb_lkid, lkf, NULL, lp); 294 error = dlm_unlock(ls->dlm_lockspace, lp->lksb.sb_lkid, lkf, NULL, lp);
297 295
298 if (error) { 296 if (error) {
299 log_debug("%s: gdlm_unlock %x,%llx err=%d cur=%d req=%d lkf=%x " 297 log_error("%s: gdlm_unlock %x,%llx err=%d cur=%d req=%d lkf=%x "
300 "flags=%lx", ls->fsname, lp->lockname.ln_type, 298 "flags=%lx", ls->fsname, lp->lockname.ln_type,
301 (unsigned long long)lp->lockname.ln_number, error, 299 (unsigned long long)lp->lockname.ln_number, error,
302 lp->cur, lp->req, lp->lkf, lp->flags); 300 lp->cur, lp->req, lp->lkf, lp->flags);
diff --git a/fs/gfs2/locking/dlm/lock_dlm.h b/fs/gfs2/locking/dlm/lock_dlm.h
index a87c7bf3c568..d074c6e6f9bf 100644
--- a/fs/gfs2/locking/dlm/lock_dlm.h
+++ b/fs/gfs2/locking/dlm/lock_dlm.h
@@ -36,7 +36,7 @@
36 36
37#define GDLM_STRNAME_BYTES 24 37#define GDLM_STRNAME_BYTES 24
38#define GDLM_LVB_SIZE 32 38#define GDLM_LVB_SIZE 32
39#define GDLM_DROP_COUNT 200000 39#define GDLM_DROP_COUNT 0
40#define GDLM_DROP_PERIOD 60 40#define GDLM_DROP_PERIOD 60
41#define GDLM_NAME_LEN 128 41#define GDLM_NAME_LEN 128
42 42
@@ -106,6 +106,7 @@ enum {
106struct gdlm_lock { 106struct gdlm_lock {
107 struct gdlm_ls *ls; 107 struct gdlm_ls *ls;
108 struct lm_lockname lockname; 108 struct lm_lockname lockname;
109 struct gdlm_strname strname;
109 char *lvb; 110 char *lvb;
110 struct dlm_lksb lksb; 111 struct dlm_lksb lksb;
111 112
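[Editor's note] The new strname field caches the DLM string name once, at gdlm_create_lp() time, so gdlm_do_lock() stops re-encoding the same fixed-width 24 bytes on every request. A stand-alone sketch of that encoding; the demo types are simplified, and the sketch's buffer gains one byte for sprintf's terminating NUL.

#include <stdio.h>

struct demo_strname {
	char name[24 + 1];	/* GDLM_STRNAME_BYTES plus NUL */
	unsigned short namelen;
};

/* Encode (type, number) once and cache it, as gdlm_create_lp() now does. */
static void demo_make_strname(unsigned int ln_type,
			      unsigned long long ln_number,
			      struct demo_strname *str)
{
	sprintf(str->name, "%8x%16llx", ln_type, ln_number);
	str->namelen = 24;	/* fixed width: 8 + 16 characters */
}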
diff --git a/fs/gfs2/locking/dlm/sysfs.c b/fs/gfs2/locking/dlm/sysfs.c
index 4746b884662d..d9fe3ca40e18 100644
--- a/fs/gfs2/locking/dlm/sysfs.c
+++ b/fs/gfs2/locking/dlm/sysfs.c
@@ -190,7 +190,6 @@ static struct kobj_type gdlm_ktype = {
190}; 190};
191 191
192static struct kset gdlm_kset = { 192static struct kset gdlm_kset = {
193 .subsys = &kernel_subsys,
194 .kobj = {.name = "lock_dlm",}, 193 .kobj = {.name = "lock_dlm",},
195 .ktype = &gdlm_ktype, 194 .ktype = &gdlm_ktype,
196}; 195};
@@ -225,6 +224,7 @@ int gdlm_sysfs_init(void)
225{ 224{
226 int error; 225 int error;
227 226
227 kobj_set_kset_s(&gdlm_kset, kernel_subsys);
228 error = kset_register(&gdlm_kset); 228 error = kset_register(&gdlm_kset);
229 if (error) 229 if (error)
230 printk("lock_dlm: cannot register kset %d\n", error); 230 printk("lock_dlm: cannot register kset %d\n", error);
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 16bb4b4561ae..f82d84d05d23 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -33,16 +33,17 @@ static void glock_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
33 33
34 tr->tr_touched = 1; 34 tr->tr_touched = 1;
35 35
36 if (!list_empty(&le->le_list))
37 return;
38
39 gl = container_of(le, struct gfs2_glock, gl_le); 36 gl = container_of(le, struct gfs2_glock, gl_le);
40 if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(gl))) 37 if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(gl)))
41 return; 38 return;
42 gfs2_glock_hold(gl);
43 set_bit(GLF_DIRTY, &gl->gl_flags);
44 39
45 gfs2_log_lock(sdp); 40 gfs2_log_lock(sdp);
 41 if (!list_empty(&le->le_list)) {
42 gfs2_log_unlock(sdp);
43 return;
44 }
45 gfs2_glock_hold(gl);
46 set_bit(GLF_DIRTY, &gl->gl_flags);
46 sdp->sd_log_num_gl++; 47 sdp->sd_log_num_gl++;
47 list_add(&le->le_list, &sdp->sd_log_le_gl); 48 list_add(&le->le_list, &sdp->sd_log_le_gl);
48 gfs2_log_unlock(sdp); 49 gfs2_log_unlock(sdp);
@@ -415,13 +416,14 @@ static void rg_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
415 416
416 tr->tr_touched = 1; 417 tr->tr_touched = 1;
417 418
418 if (!list_empty(&le->le_list))
419 return;
420
421 rgd = container_of(le, struct gfs2_rgrpd, rd_le); 419 rgd = container_of(le, struct gfs2_rgrpd, rd_le);
422 gfs2_rgrp_bh_hold(rgd);
423 420
424 gfs2_log_lock(sdp); 421 gfs2_log_lock(sdp);
 422 if (!list_empty(&le->le_list)) {
423 gfs2_log_unlock(sdp);
424 return;
425 }
426 gfs2_rgrp_bh_hold(rgd);
425 sdp->sd_log_num_rg++; 427 sdp->sd_log_num_rg++;
426 list_add(&le->le_list, &sdp->sd_log_le_rg); 428 list_add(&le->le_list, &sdp->sd_log_le_rg);
427 gfs2_log_unlock(sdp); 429 gfs2_log_unlock(sdp);
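Both lops.c hunks move the list_empty() test (and the reference taking that follows it) under gfs2_log_lock(), closing a window in which two concurrent callers could each see the element unqueued and add it twice. A minimal sketch of the check-then-insert-under-lock pattern they adopt; log_element, pending and log_lock are illustrative names, not from the patch:

	#include <linux/list.h>
	#include <linux/spinlock.h>

	struct log_element {
		struct list_head le_list;
	};

	static LIST_HEAD(pending);
	static DEFINE_SPINLOCK(log_lock);

	/* le_list must be set up with INIT_LIST_HEAD() and unlinked with
	 * list_del_init() so list_empty() is a reliable "not queued" test. */
	static void log_element_add(struct log_element *le)
	{
		spin_lock(&log_lock);
		if (!list_empty(&le->le_list)) {
			/* Already queued by a concurrent caller. */
			spin_unlock(&log_lock);
			return;
		}
		list_add(&le->le_list, &pending);
		spin_unlock(&log_lock);
	}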
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 6e8a59809abf..e460487c0557 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -27,8 +27,7 @@
27static void gfs2_init_inode_once(void *foo, struct kmem_cache *cachep, unsigned long flags) 27static void gfs2_init_inode_once(void *foo, struct kmem_cache *cachep, unsigned long flags)
28{ 28{
29 struct gfs2_inode *ip = foo; 29 struct gfs2_inode *ip = foo;
30 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == 30 if (flags & SLAB_CTOR_CONSTRUCTOR) {
31 SLAB_CTOR_CONSTRUCTOR) {
32 inode_init_once(&ip->i_inode); 31 inode_init_once(&ip->i_inode);
33 spin_lock_init(&ip->i_spin); 32 spin_lock_init(&ip->i_spin);
34 init_rwsem(&ip->i_rw_mutex); 33 init_rwsem(&ip->i_rw_mutex);
@@ -39,13 +38,11 @@ static void gfs2_init_inode_once(void *foo, struct kmem_cache *cachep, unsigned
39static void gfs2_init_glock_once(void *foo, struct kmem_cache *cachep, unsigned long flags) 38static void gfs2_init_glock_once(void *foo, struct kmem_cache *cachep, unsigned long flags)
40{ 39{
41 struct gfs2_glock *gl = foo; 40 struct gfs2_glock *gl = foo;
42 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == 41 if (flags & SLAB_CTOR_CONSTRUCTOR) {
43 SLAB_CTOR_CONSTRUCTOR) {
44 INIT_HLIST_NODE(&gl->gl_list); 42 INIT_HLIST_NODE(&gl->gl_list);
45 spin_lock_init(&gl->gl_spin); 43 spin_lock_init(&gl->gl_spin);
46 INIT_LIST_HEAD(&gl->gl_holders); 44 INIT_LIST_HEAD(&gl->gl_holders);
47 INIT_LIST_HEAD(&gl->gl_waiters1); 45 INIT_LIST_HEAD(&gl->gl_waiters1);
48 INIT_LIST_HEAD(&gl->gl_waiters2);
49 INIT_LIST_HEAD(&gl->gl_waiters3); 46 INIT_LIST_HEAD(&gl->gl_waiters3);
50 gl->gl_lvb = NULL; 47 gl->gl_lvb = NULL;
51 atomic_set(&gl->gl_lvb_count, 0); 48 atomic_set(&gl->gl_lvb_count, 0);
@@ -103,6 +100,8 @@ static int __init init_gfs2_fs(void)
103 if (error) 100 if (error)
104 goto fail_unregister; 101 goto fail_unregister;
105 102
103 gfs2_register_debugfs();
104
106 printk("GFS2 (built %s %s) installed\n", __DATE__, __TIME__); 105 printk("GFS2 (built %s %s) installed\n", __DATE__, __TIME__);
107 106
108 return 0; 107 return 0;
@@ -130,6 +129,7 @@ fail:
130 129
131static void __exit exit_gfs2_fs(void) 130static void __exit exit_gfs2_fs(void)
132{ 131{
132 gfs2_unregister_debugfs();
133 unregister_filesystem(&gfs2_fs_type); 133 unregister_filesystem(&gfs2_fs_type);
134 unregister_filesystem(&gfs2meta_fs_type); 134 unregister_filesystem(&gfs2meta_fs_type);
135 135
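The constructor hunks in main.c, and the matching ones in hfs, hfsplus, hpfs, hugetlbfs, isofs and fs/inode.c below, all follow from the removal of SLAB_CTOR_VERIFY from the slab API: with only one constructor flag left, a plain bit test suffices. A sketch of the before/after, assuming the 2.6.21-era three-argument ctor signature that the diff itself shows:

	#include <linux/fs.h>
	#include <linux/slab.h>

	static void example_init_once(void *foo, struct kmem_cache *cachep,
				      unsigned long flags)
	{
		struct inode *inode = foo;

		/* Old form:
		 *   (flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR))
		 *       == SLAB_CTOR_CONSTRUCTOR
		 * With SLAB_CTOR_VERIFY gone, this is all that remains: */
		if (flags & SLAB_CTOR_CONSTRUCTOR)
			inode_init_once(inode);
	}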
diff --git a/fs/gfs2/mount.c b/fs/gfs2/mount.c
index 32caecd20300..4864659555d4 100644
--- a/fs/gfs2/mount.c
+++ b/fs/gfs2/mount.c
@@ -13,6 +13,7 @@
13#include <linux/buffer_head.h> 13#include <linux/buffer_head.h>
14#include <linux/gfs2_ondisk.h> 14#include <linux/gfs2_ondisk.h>
15#include <linux/lm_interface.h> 15#include <linux/lm_interface.h>
16#include <linux/parser.h>
16 17
17#include "gfs2.h" 18#include "gfs2.h"
18#include "incore.h" 19#include "incore.h"
@@ -20,6 +21,52 @@
20#include "sys.h" 21#include "sys.h"
21#include "util.h" 22#include "util.h"
22 23
24enum {
25 Opt_lockproto,
26 Opt_locktable,
27 Opt_hostdata,
28 Opt_spectator,
29 Opt_ignore_local_fs,
30 Opt_localflocks,
31 Opt_localcaching,
32 Opt_debug,
33 Opt_nodebug,
34 Opt_upgrade,
35 Opt_num_glockd,
36 Opt_acl,
37 Opt_noacl,
38 Opt_quota_off,
39 Opt_quota_account,
40 Opt_quota_on,
41 Opt_suiddir,
42 Opt_nosuiddir,
43 Opt_data_writeback,
44 Opt_data_ordered,
45};
46
47static match_table_t tokens = {
48 {Opt_lockproto, "lockproto=%s"},
49 {Opt_locktable, "locktable=%s"},
50 {Opt_hostdata, "hostdata=%s"},
51 {Opt_spectator, "spectator"},
52 {Opt_ignore_local_fs, "ignore_local_fs"},
53 {Opt_localflocks, "localflocks"},
54 {Opt_localcaching, "localcaching"},
55 {Opt_debug, "debug"},
56 {Opt_nodebug, "nodebug"},
57 {Opt_upgrade, "upgrade"},
58 {Opt_num_glockd, "num_glockd=%d"},
59 {Opt_acl, "acl"},
60 {Opt_noacl, "noacl"},
61 {Opt_quota_off, "quota=off"},
62 {Opt_quota_account, "quota=account"},
63 {Opt_quota_on, "quota=on"},
64 {Opt_suiddir, "suiddir"},
65 {Opt_nosuiddir, "nosuiddir"},
66 {Opt_data_writeback, "data=writeback"},
67 {Opt_data_ordered, "data=ordered"}
68};
69
23/** 70/**
24 * gfs2_mount_args - Parse mount options 71 * gfs2_mount_args - Parse mount options
25 * @sdp: 72 * @sdp:
@@ -54,146 +101,150 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount)
54 process them */ 101 process them */
55 102
56 for (options = data; (o = strsep(&options, ",")); ) { 103 for (options = data; (o = strsep(&options, ",")); ) {
104 int token, option;
105 substring_t tmp[MAX_OPT_ARGS];
106
57 if (!*o) 107 if (!*o)
58 continue; 108 continue;
59 109
60 v = strchr(o, '='); 110 token = match_token(o, tokens, tmp);
61 if (v) 111 switch (token) {
62 *v++ = 0; 112 case Opt_lockproto:
113 v = match_strdup(&tmp[0]);
114 if (!v) {
115 fs_info(sdp, "no memory for lockproto\n");
116 error = -ENOMEM;
117 goto out_error;
118 }
63 119
64 if (!strcmp(o, "lockproto")) { 120 if (remount && strcmp(v, args->ar_lockproto)) {
65 if (!v) 121 kfree(v);
66 goto need_value;
67 if (remount && strcmp(v, args->ar_lockproto))
68 goto cant_remount; 122 goto cant_remount;
123 }
124
69 strncpy(args->ar_lockproto, v, GFS2_LOCKNAME_LEN); 125 strncpy(args->ar_lockproto, v, GFS2_LOCKNAME_LEN);
70 args->ar_lockproto[GFS2_LOCKNAME_LEN - 1] = 0; 126 args->ar_lockproto[GFS2_LOCKNAME_LEN - 1] = 0;
71 } 127 kfree(v);
128 break;
129 case Opt_locktable:
130 v = match_strdup(&tmp[0]);
131 if (!v) {
132 fs_info(sdp, "no memory for locktable\n");
133 error = -ENOMEM;
134 goto out_error;
135 }
72 136
73 else if (!strcmp(o, "locktable")) { 137 if (remount && strcmp(v, args->ar_locktable)) {
74 if (!v) 138 kfree(v);
75 goto need_value;
76 if (remount && strcmp(v, args->ar_locktable))
77 goto cant_remount; 139 goto cant_remount;
140 }
141
78 strncpy(args->ar_locktable, v, GFS2_LOCKNAME_LEN); 142 strncpy(args->ar_locktable, v, GFS2_LOCKNAME_LEN);
79 args->ar_locktable[GFS2_LOCKNAME_LEN - 1] = 0; 143 args->ar_locktable[GFS2_LOCKNAME_LEN - 1] = 0;
80 } 144 kfree(v);
145 break;
146 case Opt_hostdata:
147 v = match_strdup(&tmp[0]);
148 if (!v) {
149 fs_info(sdp, "no memory for hostdata\n");
150 error = -ENOMEM;
151 goto out_error;
152 }
81 153
82 else if (!strcmp(o, "hostdata")) { 154 if (remount && strcmp(v, args->ar_hostdata)) {
83 if (!v) 155 kfree(v);
84 goto need_value;
85 if (remount && strcmp(v, args->ar_hostdata))
86 goto cant_remount; 156 goto cant_remount;
157 }
158
87 strncpy(args->ar_hostdata, v, GFS2_LOCKNAME_LEN); 159 strncpy(args->ar_hostdata, v, GFS2_LOCKNAME_LEN);
88 args->ar_hostdata[GFS2_LOCKNAME_LEN - 1] = 0; 160 args->ar_hostdata[GFS2_LOCKNAME_LEN - 1] = 0;
89 } 161 kfree(v);
90 162 break;
91 else if (!strcmp(o, "spectator")) { 163 case Opt_spectator:
92 if (remount && !args->ar_spectator) 164 if (remount && !args->ar_spectator)
93 goto cant_remount; 165 goto cant_remount;
94 args->ar_spectator = 1; 166 args->ar_spectator = 1;
95 sdp->sd_vfs->s_flags |= MS_RDONLY; 167 sdp->sd_vfs->s_flags |= MS_RDONLY;
96 } 168 break;
97 169 case Opt_ignore_local_fs:
98 else if (!strcmp(o, "ignore_local_fs")) {
99 if (remount && !args->ar_ignore_local_fs) 170 if (remount && !args->ar_ignore_local_fs)
100 goto cant_remount; 171 goto cant_remount;
101 args->ar_ignore_local_fs = 1; 172 args->ar_ignore_local_fs = 1;
102 } 173 break;
103 174 case Opt_localflocks:
104 else if (!strcmp(o, "localflocks")) {
105 if (remount && !args->ar_localflocks) 175 if (remount && !args->ar_localflocks)
106 goto cant_remount; 176 goto cant_remount;
107 args->ar_localflocks = 1; 177 args->ar_localflocks = 1;
108 } 178 break;
109 179 case Opt_localcaching:
110 else if (!strcmp(o, "localcaching")) {
111 if (remount && !args->ar_localcaching) 180 if (remount && !args->ar_localcaching)
112 goto cant_remount; 181 goto cant_remount;
113 args->ar_localcaching = 1; 182 args->ar_localcaching = 1;
114 } 183 break;
115 184 case Opt_debug:
116 else if (!strcmp(o, "debug"))
117 args->ar_debug = 1; 185 args->ar_debug = 1;
118 186 break;
119 else if (!strcmp(o, "nodebug")) 187 case Opt_nodebug:
120 args->ar_debug = 0; 188 args->ar_debug = 0;
121 189 break;
122 else if (!strcmp(o, "upgrade")) { 190 case Opt_upgrade:
123 if (remount && !args->ar_upgrade) 191 if (remount && !args->ar_upgrade)
124 goto cant_remount; 192 goto cant_remount;
125 args->ar_upgrade = 1; 193 args->ar_upgrade = 1;
126 } 194 break;
195 case Opt_num_glockd:
196 if ((error = match_int(&tmp[0], &option))) {
197 fs_info(sdp, "problem getting num_glockd\n");
198 goto out_error;
199 }
127 200
128 else if (!strcmp(o, "num_glockd")) { 201 if (remount && option != args->ar_num_glockd)
129 unsigned int x;
130 if (!v)
131 goto need_value;
132 sscanf(v, "%u", &x);
133 if (remount && x != args->ar_num_glockd)
134 goto cant_remount; 202 goto cant_remount;
135 if (!x || x > GFS2_GLOCKD_MAX) { 203 if (!option || option > GFS2_GLOCKD_MAX) {
136 fs_info(sdp, "0 < num_glockd <= %u (not %u)\n", 204 fs_info(sdp, "0 < num_glockd <= %u (not %u)\n",
137 GFS2_GLOCKD_MAX, x); 205 GFS2_GLOCKD_MAX, option);
138 error = -EINVAL; 206 error = -EINVAL;
139 break; 207 goto out_error;
140 } 208 }
141 args->ar_num_glockd = x; 209 args->ar_num_glockd = option;
142 } 210 break;
143 211 case Opt_acl:
144 else if (!strcmp(o, "acl")) {
145 args->ar_posix_acl = 1; 212 args->ar_posix_acl = 1;
146 sdp->sd_vfs->s_flags |= MS_POSIXACL; 213 sdp->sd_vfs->s_flags |= MS_POSIXACL;
147 } 214 break;
148 215 case Opt_noacl:
149 else if (!strcmp(o, "noacl")) {
150 args->ar_posix_acl = 0; 216 args->ar_posix_acl = 0;
151 sdp->sd_vfs->s_flags &= ~MS_POSIXACL; 217 sdp->sd_vfs->s_flags &= ~MS_POSIXACL;
152 } 218 break;
153 219 case Opt_quota_off:
154 else if (!strcmp(o, "quota")) { 220 args->ar_quota = GFS2_QUOTA_OFF;
155 if (!v) 221 break;
156 goto need_value; 222 case Opt_quota_account:
157 if (!strcmp(v, "off")) 223 args->ar_quota = GFS2_QUOTA_ACCOUNT;
158 args->ar_quota = GFS2_QUOTA_OFF; 224 break;
159 else if (!strcmp(v, "account")) 225 case Opt_quota_on:
160 args->ar_quota = GFS2_QUOTA_ACCOUNT; 226 args->ar_quota = GFS2_QUOTA_ON;
161 else if (!strcmp(v, "on")) 227 break;
162 args->ar_quota = GFS2_QUOTA_ON; 228 case Opt_suiddir:
163 else {
164 fs_info(sdp, "invalid value for quota\n");
165 error = -EINVAL;
166 break;
167 }
168 }
169
170 else if (!strcmp(o, "suiddir"))
171 args->ar_suiddir = 1; 229 args->ar_suiddir = 1;
172 230 break;
173 else if (!strcmp(o, "nosuiddir")) 231 case Opt_nosuiddir:
174 args->ar_suiddir = 0; 232 args->ar_suiddir = 0;
175 233 break;
176 else if (!strcmp(o, "data")) { 234 case Opt_data_writeback:
177 if (!v) 235 args->ar_data = GFS2_DATA_WRITEBACK;
178 goto need_value; 236 break;
179 if (!strcmp(v, "writeback")) 237 case Opt_data_ordered:
180 args->ar_data = GFS2_DATA_WRITEBACK; 238 args->ar_data = GFS2_DATA_ORDERED;
181 else if (!strcmp(v, "ordered")) 239 break;
182 args->ar_data = GFS2_DATA_ORDERED; 240 default:
183 else {
184 fs_info(sdp, "invalid value for data\n");
185 error = -EINVAL;
186 break;
187 }
188 }
189
190 else {
191 fs_info(sdp, "unknown option: %s\n", o); 241 fs_info(sdp, "unknown option: %s\n", o);
192 error = -EINVAL; 242 error = -EINVAL;
193 break; 243 goto out_error;
194 } 244 }
195 } 245 }
196 246
247out_error:
197 if (error) 248 if (error)
198 fs_info(sdp, "invalid mount option(s)\n"); 249 fs_info(sdp, "invalid mount option(s)\n");
199 250
@@ -202,10 +253,6 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount)
202 253
203 return error; 254 return error;
204 255
205need_value:
206 fs_info(sdp, "need value for option %s\n", o);
207 return -EINVAL;
208
209cant_remount: 256cant_remount:
210 fs_info(sdp, "can't remount with option %s\n", o); 257 fs_info(sdp, "can't remount with option %s\n", o);
211 return -EINVAL; 258 return -EINVAL;
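The mount.c rewrite above replaces open-coded strchr()/strcmp() option parsing with the token tables from <linux/parser.h>. A minimal sketch of that idiom with a hypothetical two-option table; note the {Opt_err, NULL} terminator, which match_token() relies on to fall through to the default case on unknown strings (the table added by the patch ends at "data=ordered" without such a sentinel):

	#include <linux/errno.h>
	#include <linux/parser.h>
	#include <linux/slab.h>
	#include <linux/string.h>

	enum { Opt_mode, Opt_size, Opt_err };

	static match_table_t example_tokens = {
		{Opt_mode, "mode=%s"},
		{Opt_size, "size=%d"},
		{Opt_err, NULL}
	};

	static int example_parse(char *options)
	{
		substring_t args[MAX_OPT_ARGS];
		char *p;

		while ((p = strsep(&options, ",")) != NULL) {
			int token, size;
			char *mode;

			if (!*p)
				continue;

			token = match_token(p, example_tokens, args);
			switch (token) {
			case Opt_mode:
				/* match_strdup() kmalloc()s a copy of the
				 * matched %s argument; callers must kfree(). */
				mode = match_strdup(&args[0]);
				if (!mode)
					return -ENOMEM;
				kfree(mode);
				break;
			case Opt_size:
				/* match_int() converts the %d argument. */
				if (match_int(&args[0], &size))
					return -EINVAL;
				break;
			default:
				return -EINVAL;
			}
		}
		return 0;
	}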
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
index b3b7e8475359..30c15622174f 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/ops_address.c
@@ -197,7 +197,19 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
197 void *kaddr; 197 void *kaddr;
198 int error; 198 int error;
199 199
200 BUG_ON(page->index); 200 /*
201 * Due to the order of unstuffing files and ->nopage(), we can be
202 * asked for a zero page in the case of a stuffed file being extended,
203 * so we need to supply one here. It doesn't happen often.
204 */
205 if (unlikely(page->index)) {
206 kaddr = kmap_atomic(page, KM_USER0);
207 memset(kaddr, 0, PAGE_CACHE_SIZE);
208 kunmap_atomic(kaddr, KM_USER0);
209 flush_dcache_page(page);
210 SetPageUptodate(page);
211 return 0;
212 }
201 213
202 error = gfs2_meta_inode_buffer(ip, &dibh); 214 error = gfs2_meta_inode_buffer(ip, &dibh);
203 if (error) 215 if (error)
@@ -208,9 +220,8 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
208 ip->i_di.di_size); 220 ip->i_di.di_size);
209 memset(kaddr + ip->i_di.di_size, 0, PAGE_CACHE_SIZE - ip->i_di.di_size); 221 memset(kaddr + ip->i_di.di_size, 0, PAGE_CACHE_SIZE - ip->i_di.di_size);
210 kunmap_atomic(kaddr, KM_USER0); 222 kunmap_atomic(kaddr, KM_USER0);
211 223 flush_dcache_page(page);
212 brelse(dibh); 224 brelse(dibh);
213
214 SetPageUptodate(page); 225 SetPageUptodate(page);
215 226
216 return 0; 227 return 0;
@@ -507,7 +518,9 @@ static int gfs2_commit_write(struct file *file, struct page *page,
507 gfs2_quota_unlock(ip); 518 gfs2_quota_unlock(ip);
508 gfs2_alloc_put(ip); 519 gfs2_alloc_put(ip);
509 } 520 }
521 unlock_page(page);
510 gfs2_glock_dq_m(1, &ip->i_gh); 522 gfs2_glock_dq_m(1, &ip->i_gh);
523 lock_page(page);
511 gfs2_holder_uninit(&ip->i_gh); 524 gfs2_holder_uninit(&ip->i_gh);
512 return 0; 525 return 0;
513 526
@@ -520,7 +533,9 @@ fail_endtrans:
520 gfs2_quota_unlock(ip); 533 gfs2_quota_unlock(ip);
521 gfs2_alloc_put(ip); 534 gfs2_alloc_put(ip);
522 } 535 }
536 unlock_page(page);
523 gfs2_glock_dq_m(1, &ip->i_gh); 537 gfs2_glock_dq_m(1, &ip->i_gh);
538 lock_page(page);
524 gfs2_holder_uninit(&ip->i_gh); 539 gfs2_holder_uninit(&ip->i_gh);
525fail_nounlock: 540fail_nounlock:
526 ClearPageUptodate(page); 541 ClearPageUptodate(page);
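The two gfs2_commit_write() hunks bracket gfs2_glock_dq_m() with unlock_page()/lock_page(): dropping the glock can block on work (such as flushing the inode's pages on behalf of another node) that itself needs the page lock, so the page lock must not be held across the dequeue. A generic sketch of that release-around pattern; the two mutexes are illustrative stand-ins for the page lock and the glock:

	#include <linux/mutex.h>

	static DEFINE_MUTEX(page_like_lock);	/* stands in for the page lock */
	static DEFINE_MUTEX(glock_like_lock);	/* stands in for the glock */

	/* Called with page_like_lock held: drop it around the release of
	 * the outer lock, then retake it for the caller. */
	static void release_outer_lock(void)
	{
		mutex_unlock(&page_like_lock);
		mutex_unlock(&glock_like_lock);
		mutex_lock(&page_like_lock);
	}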
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index ee54cb667083..2c5f8e7def0d 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -690,6 +690,8 @@ static int fill_super(struct super_block *sb, void *data, int silent)
690 if (error) 690 if (error)
691 goto fail; 691 goto fail;
692 692
693 gfs2_create_debugfs_file(sdp);
694
693 error = gfs2_sys_fs_add(sdp); 695 error = gfs2_sys_fs_add(sdp);
694 if (error) 696 if (error)
695 goto fail; 697 goto fail;
@@ -754,6 +756,7 @@ fail_lm:
754fail_sys: 756fail_sys:
755 gfs2_sys_fs_del(sdp); 757 gfs2_sys_fs_del(sdp);
756fail: 758fail:
759 gfs2_delete_debugfs_file(sdp);
757 kfree(sdp); 760 kfree(sdp);
758 sb->s_fs_info = NULL; 761 sb->s_fs_info = NULL;
759 return error; 762 return error;
@@ -896,6 +899,7 @@ error:
896 899
897static void gfs2_kill_sb(struct super_block *sb) 900static void gfs2_kill_sb(struct super_block *sb)
898{ 901{
902 gfs2_delete_debugfs_file(sb->s_fs_info);
899 kill_block_super(sb); 903 kill_block_super(sb);
900} 904}
901 905
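The debugfs hooks added in main.c (module init/exit) and ops_fstype.c (mount/unmount) pair each creation with a deletion, including on the error path of fill_super(). A minimal sketch of such a pairing with the stock debugfs API; the directory name and function names are illustrative, and gfs2_register_debugfs() itself is not shown in these hunks:

	#include <linux/debugfs.h>
	#include <linux/errno.h>

	static struct dentry *example_debugfs_root;

	int example_register_debugfs(void)		/* module init */
	{
		example_debugfs_root = debugfs_create_dir("gfs2", NULL);
		return example_debugfs_root ? 0 : -ENOMEM;
	}

	void example_unregister_debugfs(void)		/* module exit */
	{
		debugfs_remove(example_debugfs_root);
		example_debugfs_root = NULL;
	}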
diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c
index b89999d3a767..485ce3d49923 100644
--- a/fs/gfs2/ops_super.c
+++ b/fs/gfs2/ops_super.c
@@ -284,6 +284,31 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
284} 284}
285 285
286/** 286/**
287 * gfs2_drop_inode - Drop an inode (test for remote unlink)
288 * @inode: The inode to drop
289 *
 290 * If we've received a callback on an iopen lock then it's because a
291 * remote node tried to deallocate the inode but failed due to this node
292 * still having the inode open. Here we mark the link count zero
293 * since we know that it must have reached zero if the GLF_DEMOTE flag
294 * is set on the iopen glock. If we didn't do a disk read since the
295 * remote node removed the final link then we might otherwise miss
296 * this event. This check ensures that this node will deallocate the
297 * inode's blocks, or alternatively pass the baton on to another
298 * node for later deallocation.
299 */
300static void gfs2_drop_inode(struct inode *inode)
301{
302 if (inode->i_private && inode->i_nlink) {
303 struct gfs2_inode *ip = GFS2_I(inode);
304 struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl;
305 if (gl && test_bit(GLF_DEMOTE, &gl->gl_flags))
306 clear_nlink(inode);
307 }
308 generic_drop_inode(inode);
309}
310
311/**
287 * gfs2_clear_inode - Deallocate an inode when VFS is done with it 312 * gfs2_clear_inode - Deallocate an inode when VFS is done with it
288 * @inode: The VFS inode 313 * @inode: The VFS inode
289 * 314 *
@@ -441,7 +466,7 @@ out_unlock:
441out_uninit: 466out_uninit:
442 gfs2_holder_uninit(&ip->i_iopen_gh); 467 gfs2_holder_uninit(&ip->i_iopen_gh);
443 gfs2_glock_dq_uninit(&gh); 468 gfs2_glock_dq_uninit(&gh);
444 if (error) 469 if (error && error != GLR_TRYFAILED)
445 fs_warn(sdp, "gfs2_delete_inode: %d\n", error); 470 fs_warn(sdp, "gfs2_delete_inode: %d\n", error);
446out: 471out:
447 truncate_inode_pages(&inode->i_data, 0); 472 truncate_inode_pages(&inode->i_data, 0);
@@ -481,6 +506,7 @@ const struct super_operations gfs2_super_ops = {
481 .statfs = gfs2_statfs, 506 .statfs = gfs2_statfs,
482 .remount_fs = gfs2_remount_fs, 507 .remount_fs = gfs2_remount_fs,
483 .clear_inode = gfs2_clear_inode, 508 .clear_inode = gfs2_clear_inode,
509 .drop_inode = gfs2_drop_inode,
484 .show_options = gfs2_show_options, 510 .show_options = gfs2_show_options,
485}; 511};
486 512
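For context on the new ->drop_inode hook above: the VFS invokes it from iput() when the last reference to an inode goes away, and generic_drop_inode() then decides between caching and deleting based on i_nlink, which is why forcing the link count to zero steers the inode into the deallocation path. A minimal sketch of a hook of this shape; remote_unlink_pending() is a hypothetical stand-in for the GLF_DEMOTE test in the patch:

	#include <linux/fs.h>

	static int remote_unlink_pending(struct inode *inode);	/* hypothetical */

	static void example_drop_inode(struct inode *inode)
	{
		/* Another node may have removed the final link while our
		 * cached i_nlink is stale; force the deletion path. */
		if (inode->i_nlink && remote_unlink_pending(inode))
			clear_nlink(inode);
		generic_drop_inode(inode);
	}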
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 8d9c08b5c4b6..1727f5012efe 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -27,6 +27,7 @@
27#include "trans.h" 27#include "trans.h"
28#include "ops_file.h" 28#include "ops_file.h"
29#include "util.h" 29#include "util.h"
30#include "log.h"
30 31
31#define BFITNOENT ((u32)~0) 32#define BFITNOENT ((u32)~0)
32 33
@@ -697,8 +698,6 @@ struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip)
697 * @al: the struct gfs2_alloc structure describing the reservation 698 * @al: the struct gfs2_alloc structure describing the reservation
698 * 699 *
699 * If there's room for the requested blocks to be allocated from the RG: 700 * If there's room for the requested blocks to be allocated from the RG:
700 * Sets the $al_reserved_data field in @al.
701 * Sets the $al_reserved_meta field in @al.
702 * Sets the $al_rgd field in @al. 701 * Sets the $al_rgd field in @al.
703 * 702 *
704 * Returns: 1 on success (it fits), 0 on failure (it doesn't fit) 703 * Returns: 1 on success (it fits), 0 on failure (it doesn't fit)
@@ -709,6 +708,9 @@ static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al)
709 struct gfs2_sbd *sdp = rgd->rd_sbd; 708 struct gfs2_sbd *sdp = rgd->rd_sbd;
710 int ret = 0; 709 int ret = 0;
711 710
711 if (rgd->rd_rg.rg_flags & GFS2_RGF_NOALLOC)
712 return 0;
713
712 spin_lock(&sdp->sd_rindex_spin); 714 spin_lock(&sdp->sd_rindex_spin);
713 if (rgd->rd_free_clone >= al->al_requested) { 715 if (rgd->rd_free_clone >= al->al_requested) {
714 al->al_rgd = rgd; 716 al->al_rgd = rgd;
@@ -941,9 +943,13 @@ static int get_local_rgrp(struct gfs2_inode *ip)
941 rgd = gfs2_rgrpd_get_first(sdp); 943 rgd = gfs2_rgrpd_get_first(sdp);
942 944
943 if (rgd == begin) { 945 if (rgd == begin) {
944 if (++loops >= 2 || !skipped) 946 if (++loops >= 3)
945 return -ENOSPC; 947 return -ENOSPC;
948 if (!skipped)
949 loops++;
946 flags = 0; 950 flags = 0;
951 if (loops == 2)
952 gfs2_log_flush(sdp, NULL);
947 } 953 }
948 } 954 }
949 955
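The get_local_rgrp() hunk allows a third sweep of the resource groups and flushes the log before that final pass, so blocks freed earlier but not yet released by the journal become allocatable again. A simplified sketch of the retry shape (the skipped/loops bookkeeping is elided); try_alloc_pass() and flush_log() are hypothetical stand-ins:

	#include <linux/errno.h>

	static int try_alloc_pass(void);	/* hypothetical: one rgrp sweep */
	static void flush_log(void);		/* hypothetical: gfs2_log_flush() */

	static int example_get_space(void)
	{
		int loops;

		for (loops = 0; loops < 3; loops++) {
			if (try_alloc_pass())
				return 0;
			/* Flush before the last pass so freed-but-pinned
			 * blocks can be reused. */
			if (loops == 1)
				flush_log();
		}
		return -ENOSPC;
	}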
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index d01f9f0fda26..c26c21b53c19 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -222,7 +222,6 @@ static struct kobj_type gfs2_ktype = {
222}; 222};
223 223
224static struct kset gfs2_kset = { 224static struct kset gfs2_kset = {
225 .subsys = &fs_subsys,
226 .kobj = {.name = "gfs2"}, 225 .kobj = {.name = "gfs2"},
227 .ktype = &gfs2_ktype, 226 .ktype = &gfs2_ktype,
228}; 227};
@@ -554,6 +553,7 @@ int gfs2_sys_init(void)
554{ 553{
555 gfs2_sys_margs = NULL; 554 gfs2_sys_margs = NULL;
556 spin_lock_init(&gfs2_sys_margs_lock); 555 spin_lock_init(&gfs2_sys_margs_lock);
556 kobj_set_kset_s(&gfs2_kset, fs_subsys);
557 return kset_register(&gfs2_kset); 557 return kset_register(&gfs2_kset);
558} 558}
559 559
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 623f509f1d47..4f1888f16cf0 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -434,7 +434,7 @@ static void hfs_init_once(void *p, struct kmem_cache *cachep, unsigned long flag
434{ 434{
435 struct hfs_inode_info *i = p; 435 struct hfs_inode_info *i = p;
436 436
437 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == SLAB_CTOR_CONSTRUCTOR) 437 if (flags & SLAB_CTOR_CONSTRUCTOR)
438 inode_init_once(&i->vfs_inode); 438 inode_init_once(&i->vfs_inode);
439} 439}
440 440
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 1a97f9293447..37afbec8a761 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -470,7 +470,7 @@ static void hfsplus_init_once(void *p, struct kmem_cache *cachep, unsigned long
470{ 470{
471 struct hfsplus_inode_info *i = p; 471 struct hfsplus_inode_info *i = p;
472 472
473 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == SLAB_CTOR_CONSTRUCTOR) 473 if (flags & SLAB_CTOR_CONSTRUCTOR)
474 inode_init_once(&i->vfs_inode); 474 inode_init_once(&i->vfs_inode);
475} 475}
476 476
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index e0174e338526..1b95f39fbc37 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -176,8 +176,7 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag
176{ 176{
177 struct hpfs_inode_info *ei = (struct hpfs_inode_info *) foo; 177 struct hpfs_inode_info *ei = (struct hpfs_inode_info *) foo;
178 178
179 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == 179 if (flags & SLAB_CTOR_CONSTRUCTOR) {
180 SLAB_CTOR_CONSTRUCTOR) {
181 mutex_init(&ei->i_mutex); 180 mutex_init(&ei->i_mutex);
182 mutex_init(&ei->i_parent_mutex); 181 mutex_init(&ei->i_parent_mutex);
183 inode_init_once(&ei->vfs_inode); 182 inode_init_once(&ei->vfs_inode);
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 8c718a3d413f..98959b87cdf8 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -22,6 +22,7 @@
22#include <linux/backing-dev.h> 22#include <linux/backing-dev.h>
23#include <linux/hugetlb.h> 23#include <linux/hugetlb.h>
24#include <linux/pagevec.h> 24#include <linux/pagevec.h>
25#include <linux/mman.h>
25#include <linux/quotaops.h> 26#include <linux/quotaops.h>
26#include <linux/slab.h> 27#include <linux/slab.h>
27#include <linux/dnotify.h> 28#include <linux/dnotify.h>
@@ -98,10 +99,7 @@ out:
98 * Called under down_write(mmap_sem). 99 * Called under down_write(mmap_sem).
99 */ 100 */
100 101
101#ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA 102#ifndef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
102unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
103 unsigned long len, unsigned long pgoff, unsigned long flags);
104#else
105static unsigned long 103static unsigned long
106hugetlb_get_unmapped_area(struct file *file, unsigned long addr, 104hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
107 unsigned long len, unsigned long pgoff, unsigned long flags) 105 unsigned long len, unsigned long pgoff, unsigned long flags)
@@ -115,6 +113,12 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
115 if (len > TASK_SIZE) 113 if (len > TASK_SIZE)
116 return -ENOMEM; 114 return -ENOMEM;
117 115
116 if (flags & MAP_FIXED) {
117 if (prepare_hugepage_range(addr, len, pgoff))
118 return -EINVAL;
119 return addr;
120 }
121
118 if (addr) { 122 if (addr) {
119 addr = ALIGN(addr, HPAGE_SIZE); 123 addr = ALIGN(addr, HPAGE_SIZE);
120 vma = find_vma(mm, addr); 124 vma = find_vma(mm, addr);
@@ -453,7 +457,7 @@ static int hugetlbfs_symlink(struct inode *dir,
453 */ 457 */
454static int hugetlbfs_set_page_dirty(struct page *page) 458static int hugetlbfs_set_page_dirty(struct page *page)
455{ 459{
456 struct page *head = (struct page *)page_private(page); 460 struct page *head = compound_head(page);
457 461
458 SetPageDirty(head); 462 SetPageDirty(head);
459 return 0; 463 return 0;
@@ -552,8 +556,7 @@ static void init_once(void *foo, struct kmem_cache *cachep, unsigned long flags)
552{ 556{
553 struct hugetlbfs_inode_info *ei = (struct hugetlbfs_inode_info *)foo; 557 struct hugetlbfs_inode_info *ei = (struct hugetlbfs_inode_info *)foo;
554 558
555 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == 559 if (flags & SLAB_CTOR_CONSTRUCTOR)
556 SLAB_CTOR_CONSTRUCTOR)
557 inode_init_once(&ei->vfs_inode); 560 inode_init_once(&ei->vfs_inode);
558} 561}
559 562
@@ -744,6 +747,9 @@ struct file *hugetlb_zero_setup(size_t size)
744 char buf[16]; 747 char buf[16];
745 static atomic_t counter; 748 static atomic_t counter;
746 749
750 if (!hugetlbfs_vfsmount)
751 return ERR_PTR(-ENOENT);
752
747 if (!can_do_hugetlb_shm()) 753 if (!can_do_hugetlb_shm())
748 return ERR_PTR(-EPERM); 754 return ERR_PTR(-EPERM);
749 755
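The hugetlb_get_unmapped_area() hunk adds the MAP_FIXED convention: with that flag set, a get_unmapped_area implementation must not search for a free range, only validate the caller's address and return it. A minimal sketch of that shape; range_ok() is a hypothetical stand-in for prepare_hugepage_range():

	#include <linux/errno.h>
	#include <linux/mman.h>

	static int range_ok(unsigned long addr, unsigned long len);	/* hypothetical */

	static unsigned long example_get_unmapped_area(unsigned long addr,
						       unsigned long len,
						       unsigned long flags)
	{
		if (flags & MAP_FIXED)
			return range_ok(addr, len) ? addr : -EINVAL;

		/* ... otherwise fall through to the usual search ... */
		return -ENOMEM;
	}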
diff --git a/fs/inode.c b/fs/inode.c
index 5abb097ab1b0..b4296bf62739 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -213,8 +213,7 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag
213{ 213{
214 struct inode * inode = (struct inode *) foo; 214 struct inode * inode = (struct inode *) foo;
215 215
216 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == 216 if (flags & SLAB_CTOR_CONSTRUCTOR)
217 SLAB_CTOR_CONSTRUCTOR)
218 inode_init_once(inode); 217 inode_init_once(inode);
219} 218}
220 219
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 64a96cdfe3a4..e99f7ff4ecb4 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -77,8 +77,7 @@ static void init_once(void *foo, struct kmem_cache * cachep, unsigned long flags
77{ 77{
78 struct iso_inode_info *ei = foo; 78 struct iso_inode_info *ei = foo;
79 79
80 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == 80 if (flags & SLAB_CTOR_CONSTRUCTOR)
81 SLAB_CTOR_CONSTRUCTOR)
82 inode_init_once(&ei->vfs_inode); 81 inode_init_once(&ei->vfs_inode);
83} 82}
84 83
diff --git a/fs/jffs2/LICENCE b/fs/jffs2/LICENCE
index cd81d83e4ad2..562885908135 100644
--- a/fs/jffs2/LICENCE
+++ b/fs/jffs2/LICENCE
@@ -1,7 +1,7 @@
1The files in this directory and elsewhere which refer to this LICENCE 1The files in this directory and elsewhere which refer to this LICENCE
2file are part of JFFS2, the Journalling Flash File System v2. 2file are part of JFFS2, the Journalling Flash File System v2.
3 3
4 Copyright (C) 2001, 2002 Red Hat, Inc. 4 Copyright © 2001-2007 Red Hat, Inc. and others
5 5
6JFFS2 is free software; you can redistribute it and/or modify it under 6JFFS2 is free software; you can redistribute it and/or modify it under
7the terms of the GNU General Public License as published by the Free 7the terms of the GNU General Public License as published by the Free
@@ -28,8 +28,3 @@ of the GNU General Public License.
28This exception does not invalidate any other reasons why a work based on 28This exception does not invalidate any other reasons why a work based on
29this file might be covered by the GNU General Public License. 29this file might be covered by the GNU General Public License.
30 30
31For information on obtaining alternative licences for JFFS2, see
32http://sources.redhat.com/jffs2/jffs2-licence.html
33
34
35 $Id: LICENCE,v 1.1 2002/05/20 14:56:37 dwmw2 Exp $
diff --git a/fs/jffs2/Makefile b/fs/jffs2/Makefile
index 7f28ee0bd132..c32b241e3d91 100644
--- a/fs/jffs2/Makefile
+++ b/fs/jffs2/Makefile
@@ -1,7 +1,6 @@
1# 1#
2# Makefile for the Linux Journalling Flash File System v2 (JFFS2) 2# Makefile for the Linux Journalling Flash File System v2 (JFFS2)
3# 3#
4# $Id: Makefile.common,v 1.11 2005/09/07 08:34:53 havasi Exp $
5# 4#
6 5
7obj-$(CONFIG_JFFS2_FS) += jffs2.o 6obj-$(CONFIG_JFFS2_FS) += jffs2.o
diff --git a/fs/jffs2/README.Locking b/fs/jffs2/README.Locking
index c8f0bd64e53e..d14d5a4dc5ac 100644
--- a/fs/jffs2/README.Locking
+++ b/fs/jffs2/README.Locking
@@ -1,4 +1,3 @@
1 $Id: README.Locking,v 1.12 2005/04/13 13:22:35 dwmw2 Exp $
2 1
3 JFFS2 LOCKING DOCUMENTATION 2 JFFS2 LOCKING DOCUMENTATION
4 --------------------------- 3 ---------------------------
diff --git a/fs/jffs2/TODO b/fs/jffs2/TODO
index d0e23b26fa50..5d3ea4070f01 100644
--- a/fs/jffs2/TODO
+++ b/fs/jffs2/TODO
@@ -1,4 +1,3 @@
1$Id: TODO,v 1.18 2005/09/22 11:24:56 dedekind Exp $
2 1
3 - support asynchronous operation -- add a per-fs 'reserved_space' count, 2 - support asynchronous operation -- add a per-fs 'reserved_space' count,
4 let each outstanding write reserve the _maximum_ amount of physical 3 let each outstanding write reserve the _maximum_ amount of physical
@@ -30,8 +29,6 @@ $Id: TODO,v 1.18 2005/09/22 11:24:56 dedekind Exp $
30 the full dirent, we only need to go to the flash in lookup() when we think we've 29 the full dirent, we only need to go to the flash in lookup() when we think we've
31 got a match, and in readdir(). 30 got a match, and in readdir().
32 - Doubly-linked next_in_ino list to allow us to free obsoleted raw_node_refs immediately? 31 - Doubly-linked next_in_ino list to allow us to free obsoleted raw_node_refs immediately?
33 - Remove totlen from jffs2_raw_node_ref? Need to have totlen passed into
34 jffs2_mark_node_obsolete(). Can all callers work it out?
35 - Remove size from jffs2_raw_node_frag. 32 - Remove size from jffs2_raw_node_frag.
36 33
37dedekind: 34dedekind:
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index 73f0d60f73a5..a46101ee867a 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -1,13 +1,14 @@
1/* 1/*
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright (C) 2006 NEC Corporation 4 * Copyright © 2006 NEC Corporation
5 * 5 *
6 * Created by KaiGai Kohei <kaigai@ak.jp.nec.com> 6 * Created by KaiGai Kohei <kaigai@ak.jp.nec.com>
7 * 7 *
8 * For licensing information, see the file 'LICENCE' in this directory. 8 * For licensing information, see the file 'LICENCE' in this directory.
9 * 9 *
10 */ 10 */
11
11#include <linux/kernel.h> 12#include <linux/kernel.h>
12#include <linux/slab.h> 13#include <linux/slab.h>
13#include <linux/fs.h> 14#include <linux/fs.h>
diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h
index fa327dbd3171..c84378cee82a 100644
--- a/fs/jffs2/acl.h
+++ b/fs/jffs2/acl.h
@@ -1,13 +1,14 @@
1/* 1/*
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright (C) 2006 NEC Corporation 4 * Copyright © 2006 NEC Corporation
5 * 5 *
6 * Created by KaiGai Kohei <kaigai@ak.jp.nec.com> 6 * Created by KaiGai Kohei <kaigai@ak.jp.nec.com>
7 * 7 *
8 * For licensing information, see the file 'LICENCE' in this directory. 8 * For licensing information, see the file 'LICENCE' in this directory.
9 * 9 *
10 */ 10 */
11
11struct jffs2_acl_entry { 12struct jffs2_acl_entry {
12 jint16_t e_tag; 13 jint16_t e_tag;
13 jint16_t e_perm; 14 jint16_t e_perm;
diff --git a/fs/jffs2/background.c b/fs/jffs2/background.c
index 888f236e5494..0c82dfcfd246 100644
--- a/fs/jffs2/background.c
+++ b/fs/jffs2/background.c
@@ -1,14 +1,12 @@
1/* 1/*
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright (C) 2001-2003 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * 5 *
6 * Created by David Woodhouse <dwmw2@infradead.org> 6 * Created by David Woodhouse <dwmw2@infradead.org>
7 * 7 *
8 * For licensing information, see the file 'LICENCE' in this directory. 8 * For licensing information, see the file 'LICENCE' in this directory.
9 * 9 *
10 * $Id: background.c,v 1.54 2005/05/20 21:37:12 gleixner Exp $
11 *
12 */ 10 */
13 11
14#include <linux/kernel.h> 12#include <linux/kernel.h>
diff --git a/fs/jffs2/build.c b/fs/jffs2/build.c
index 07119c42a861..0ca2fff2617f 100644
--- a/fs/jffs2/build.c
+++ b/fs/jffs2/build.c
@@ -1,14 +1,12 @@
1/* 1/*
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright (C) 2001-2003 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * 5 *
6 * Created by David Woodhouse <dwmw2@infradead.org> 6 * Created by David Woodhouse <dwmw2@infradead.org>
7 * 7 *
8 * For licensing information, see the file 'LICENCE' in this directory. 8 * For licensing information, see the file 'LICENCE' in this directory.
9 * 9 *
10 * $Id: build.c,v 1.85 2005/11/07 11:14:38 gleixner Exp $
11 *
12 */ 10 */
13 11
14#include <linux/kernel.h> 12#include <linux/kernel.h>
diff --git a/fs/jffs2/compr.c b/fs/jffs2/compr.c
index 7001ba26c067..485d065de41f 100644
--- a/fs/jffs2/compr.c
+++ b/fs/jffs2/compr.c
@@ -1,16 +1,14 @@
1/* 1/*
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright (C) 2001-2003 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * Created by Arjan van de Ven <arjanv@redhat.com> 5 * Created by Arjan van de Ven <arjanv@redhat.com>
6 * 6 *
7 * Copyright (C) 2004 Ferenc Havasi <havasi@inf.u-szeged.hu>, 7 * Copyright © 2004 Ferenc Havasi <havasi@inf.u-szeged.hu>,
8 * University of Szeged, Hungary 8 * University of Szeged, Hungary
9 * 9 *
10 * For licensing information, see the file 'LICENCE' in this directory. 10 * For licensing information, see the file 'LICENCE' in this directory.
11 * 11 *
12 * $Id: compr.c,v 1.46 2005/11/07 11:14:38 gleixner Exp $
13 *
14 */ 12 */
15 13
16#include "compr.h" 14#include "compr.h"
@@ -268,144 +266,6 @@ int jffs2_unregister_compressor(struct jffs2_compressor *comp)
268 return 0; 266 return 0;
269} 267}
270 268
271#ifdef CONFIG_JFFS2_PROC
272
273#define JFFS2_STAT_BUF_SIZE 16000
274
275char *jffs2_list_compressors(void)
276{
277 struct jffs2_compressor *this;
278 char *buf, *act_buf;
279
280 act_buf = buf = kmalloc(JFFS2_STAT_BUF_SIZE,GFP_KERNEL);
281 list_for_each_entry(this, &jffs2_compressor_list, list) {
282 act_buf += sprintf(act_buf, "%10s priority:%d ", this->name, this->priority);
283 if ((this->disabled)||(!this->compress))
284 act_buf += sprintf(act_buf,"disabled");
285 else
286 act_buf += sprintf(act_buf,"enabled");
287 act_buf += sprintf(act_buf,"\n");
288 }
289 return buf;
290}
291
292char *jffs2_stats(void)
293{
294 struct jffs2_compressor *this;
295 char *buf, *act_buf;
296
297 act_buf = buf = kmalloc(JFFS2_STAT_BUF_SIZE,GFP_KERNEL);
298
299 act_buf += sprintf(act_buf,"JFFS2 compressor statistics:\n");
300 act_buf += sprintf(act_buf,"%10s ","none");
301 act_buf += sprintf(act_buf,"compr: %d blocks (%d) decompr: %d blocks\n", none_stat_compr_blocks,
302 none_stat_compr_size, none_stat_decompr_blocks);
303 spin_lock(&jffs2_compressor_list_lock);
304 list_for_each_entry(this, &jffs2_compressor_list, list) {
305 act_buf += sprintf(act_buf,"%10s ",this->name);
306 if ((this->disabled)||(!this->compress))
307 act_buf += sprintf(act_buf,"- ");
308 else
309 act_buf += sprintf(act_buf,"+ ");
310 act_buf += sprintf(act_buf,"compr: %d blocks (%d/%d) decompr: %d blocks ", this->stat_compr_blocks,
311 this->stat_compr_new_size, this->stat_compr_orig_size,
312 this->stat_decompr_blocks);
313 act_buf += sprintf(act_buf,"\n");
314 }
315 spin_unlock(&jffs2_compressor_list_lock);
316
317 return buf;
318}
319
320char *jffs2_get_compression_mode_name(void)
321{
322 switch (jffs2_compression_mode) {
323 case JFFS2_COMPR_MODE_NONE:
324 return "none";
325 case JFFS2_COMPR_MODE_PRIORITY:
326 return "priority";
327 case JFFS2_COMPR_MODE_SIZE:
328 return "size";
329 }
330 return "unkown";
331}
332
333int jffs2_set_compression_mode_name(const char *name)
334{
335 if (!strcmp("none",name)) {
336 jffs2_compression_mode = JFFS2_COMPR_MODE_NONE;
337 return 0;
338 }
339 if (!strcmp("priority",name)) {
340 jffs2_compression_mode = JFFS2_COMPR_MODE_PRIORITY;
341 return 0;
342 }
343 if (!strcmp("size",name)) {
344 jffs2_compression_mode = JFFS2_COMPR_MODE_SIZE;
345 return 0;
346 }
347 return 1;
348}
349
350static int jffs2_compressor_Xable(const char *name, int disabled)
351{
352 struct jffs2_compressor *this;
353 spin_lock(&jffs2_compressor_list_lock);
354 list_for_each_entry(this, &jffs2_compressor_list, list) {
355 if (!strcmp(this->name, name)) {
356 this->disabled = disabled;
357 spin_unlock(&jffs2_compressor_list_lock);
358 return 0;
359 }
360 }
361 spin_unlock(&jffs2_compressor_list_lock);
362 printk(KERN_WARNING "JFFS2: compressor %s not found.\n",name);
363 return 1;
364}
365
366int jffs2_enable_compressor_name(const char *name)
367{
368 return jffs2_compressor_Xable(name, 0);
369}
370
371int jffs2_disable_compressor_name(const char *name)
372{
373 return jffs2_compressor_Xable(name, 1);
374}
375
376int jffs2_set_compressor_priority(const char *name, int priority)
377{
378 struct jffs2_compressor *this,*comp;
379 spin_lock(&jffs2_compressor_list_lock);
380 list_for_each_entry(this, &jffs2_compressor_list, list) {
381 if (!strcmp(this->name, name)) {
382 this->priority = priority;
383 comp = this;
384 goto reinsert;
385 }
386 }
387 spin_unlock(&jffs2_compressor_list_lock);
388 printk(KERN_WARNING "JFFS2: compressor %s not found.\n",name);
389 return 1;
390reinsert:
391 /* list is sorted in the order of priority, so if
392 we change it we have to reinsert it into the
393 good place */
394 list_del(&comp->list);
395 list_for_each_entry(this, &jffs2_compressor_list, list) {
396 if (this->priority < comp->priority) {
397 list_add(&comp->list, this->list.prev);
398 spin_unlock(&jffs2_compressor_list_lock);
399 return 0;
400 }
401 }
402 list_add_tail(&comp->list, &jffs2_compressor_list);
403 spin_unlock(&jffs2_compressor_list_lock);
404 return 0;
405}
406
407#endif
408
409void jffs2_free_comprbuf(unsigned char *comprbuf, unsigned char *orig) 269void jffs2_free_comprbuf(unsigned char *comprbuf, unsigned char *orig)
410{ 270{
411 if (orig != comprbuf) 271 if (orig != comprbuf)
diff --git a/fs/jffs2/compr.h b/fs/jffs2/compr.h
index 509b8b1c0811..68cc7010dbdf 100644
--- a/fs/jffs2/compr.h
+++ b/fs/jffs2/compr.h
@@ -1,13 +1,10 @@
1/* 1/*
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright (C) 2004 Ferenc Havasi <havasi@inf.u-szeged.hu>, 4 * Copyright © 2004 Ferenc Havasi <havasi@inf.u-szeged.hu>,
5 * University of Szeged, Hungary 5 * University of Szeged, Hungary
6 * 6 *
7 * For licensing information, see the file 'LICENCE' in the 7 * For licensing information, see the file 'LICENCE' in this directory.
8 * jffs2 directory.
9 *
10 * $Id: compr.h,v 1.9 2005/11/07 11:14:38 gleixner Exp $
11 * 8 *
12 */ 9 */
13 10
@@ -76,16 +73,6 @@ int jffs2_decompress(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
76 73
77void jffs2_free_comprbuf(unsigned char *comprbuf, unsigned char *orig); 74void jffs2_free_comprbuf(unsigned char *comprbuf, unsigned char *orig);
78 75
79#ifdef CONFIG_JFFS2_PROC
80int jffs2_enable_compressor_name(const char *name);
81int jffs2_disable_compressor_name(const char *name);
82int jffs2_set_compression_mode_name(const char *mode_name);
83char *jffs2_get_compression_mode_name(void);
84int jffs2_set_compressor_priority(const char *mode_name, int priority);
85char *jffs2_list_compressors(void);
86char *jffs2_stats(void);
87#endif
88
89/* Compressor modules */ 76/* Compressor modules */
90/* These functions will be called by jffs2_compressors_init/exit */ 77/* These functions will be called by jffs2_compressors_init/exit */
91 78
diff --git a/fs/jffs2/compr_rtime.c b/fs/jffs2/compr_rtime.c
index 2eb1b7428d16..0d0bfd2e4e0d 100644
--- a/fs/jffs2/compr_rtime.c
+++ b/fs/jffs2/compr_rtime.c
@@ -1,13 +1,12 @@
1/* 1/*
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright (C) 2001-2003 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * 5 *
6 * Created by Arjan van de Ven <arjanv@redhat.com> 6 * Created by Arjan van de Ven <arjanv@redhat.com>
7 * 7 *
8 * For licensing information, see the file 'LICENCE' in this directory. 8 * For licensing information, see the file 'LICENCE' in this directory.
9 * 9 *
10 * $Id: compr_rtime.c,v 1.14 2004/06/23 16:34:40 havasi Exp $
11 * 10 *
12 * 11 *
13 * Very simple lz77-ish encoder. 12 * Very simple lz77-ish encoder.
diff --git a/fs/jffs2/compr_rubin.c b/fs/jffs2/compr_rubin.c
index e792e675d624..ea0431e047d5 100644
--- a/fs/jffs2/compr_rubin.c
+++ b/fs/jffs2/compr_rubin.c
@@ -1,23 +1,94 @@
1/* 1/*
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright (C) 2001, 2002 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * 5 *
6 * Created by Arjan van de Ven <arjanv@redhat.com> 6 * Created by Arjan van de Ven <arjanv@redhat.com>
7 * 7 *
8 * For licensing information, see the file 'LICENCE' in this directory. 8 * For licensing information, see the file 'LICENCE' in this directory.
9 * 9 *
10 * $Id: compr_rubin.c,v 1.20 2004/06/23 16:34:40 havasi Exp $
11 *
12 */ 10 */
13 11
14#include <linux/string.h> 12#include <linux/string.h>
15#include <linux/types.h> 13#include <linux/types.h>
16#include <linux/jffs2.h> 14#include <linux/jffs2.h>
17#include "compr_rubin.h" 15#include <linux/errno.h>
18#include "histo_mips.h"
19#include "compr.h" 16#include "compr.h"
20 17
18
19#define RUBIN_REG_SIZE 16
20#define UPPER_BIT_RUBIN (((long) 1)<<(RUBIN_REG_SIZE-1))
21#define LOWER_BITS_RUBIN ((((long) 1)<<(RUBIN_REG_SIZE-1))-1)
22
23
24#define BIT_DIVIDER_MIPS 1043
25static int bits_mips[8] = { 277,249,290,267,229,341,212,241}; /* mips32 */
26
27#include <linux/errno.h>
28
29struct pushpull {
30 unsigned char *buf;
31 unsigned int buflen;
32 unsigned int ofs;
33 unsigned int reserve;
34};
35
36struct rubin_state {
37 unsigned long p;
38 unsigned long q;
39 unsigned long rec_q;
40 long bit_number;
41 struct pushpull pp;
42 int bit_divider;
43 int bits[8];
44};
45
46static inline void init_pushpull(struct pushpull *pp, char *buf, unsigned buflen, unsigned ofs, unsigned reserve)
47{
48 pp->buf = buf;
49 pp->buflen = buflen;
50 pp->ofs = ofs;
51 pp->reserve = reserve;
52}
53
54static inline int pushbit(struct pushpull *pp, int bit, int use_reserved)
55{
56 if (pp->ofs >= pp->buflen - (use_reserved?0:pp->reserve)) {
57 return -ENOSPC;
58 }
59
60 if (bit) {
61 pp->buf[pp->ofs >> 3] |= (1<<(7-(pp->ofs &7)));
62 }
63 else {
64 pp->buf[pp->ofs >> 3] &= ~(1<<(7-(pp->ofs &7)));
65 }
66 pp->ofs++;
67
68 return 0;
69}
70
71static inline int pushedbits(struct pushpull *pp)
72{
73 return pp->ofs;
74}
75
76static inline int pullbit(struct pushpull *pp)
77{
78 int bit;
79
80 bit = (pp->buf[pp->ofs >> 3] >> (7-(pp->ofs & 7))) & 1;
81
82 pp->ofs++;
83 return bit;
84}
85
86static inline int pulledbits(struct pushpull *pp)
87{
88 return pp->ofs;
89}
90
91
21static void init_rubin(struct rubin_state *rs, int div, int *bits) 92static void init_rubin(struct rubin_state *rs, int div, int *bits)
22{ 93{
23 int c; 94 int c;
diff --git a/fs/jffs2/compr_rubin.h b/fs/jffs2/compr_rubin.h
deleted file mode 100644
index bf1a93451621..000000000000
--- a/fs/jffs2/compr_rubin.h
+++ /dev/null
@@ -1,21 +0,0 @@
1/* Rubin encoder/decoder header */
2/* work started at : aug 3, 1994 */
3/* last modification : aug 15, 1994 */
4/* $Id: compr_rubin.h,v 1.7 2005/11/07 11:14:38 gleixner Exp $ */
5
6#include "pushpull.h"
7
8#define RUBIN_REG_SIZE 16
9#define UPPER_BIT_RUBIN (((long) 1)<<(RUBIN_REG_SIZE-1))
10#define LOWER_BITS_RUBIN ((((long) 1)<<(RUBIN_REG_SIZE-1))-1)
11
12
13struct rubin_state {
14 unsigned long p;
15 unsigned long q;
16 unsigned long rec_q;
17 long bit_number;
18 struct pushpull pp;
19 int bit_divider;
20 int bits[8];
21};
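With the pushpull bit-stream helpers now inlined into compr_rubin.c (see the hunk above), an illustrative round trip through them; buffer size and bit pattern are arbitrary, and lengths are in bits, as pushbit()'s bounds check implies:

	#include <linux/errno.h>

	static int pushpull_demo(void)
	{
		unsigned char buf[4] = {0};
		struct pushpull pp;
		int i;

		/* Write eight alternating bits... */
		init_pushpull(&pp, (char *)buf, sizeof(buf) * 8, 0, 0);
		for (i = 0; i < 8; i++)
			if (pushbit(&pp, i & 1, 0))
				return -ENOSPC;	/* ran past buflen */

		/* ...then rewind and read them back. */
		init_pushpull(&pp, (char *)buf, sizeof(buf) * 8, 0, 0);
		for (i = 0; i < 8; i++)
			if (pullbit(&pp) != (i & 1))
				return -EINVAL;	/* mismatch */

		return 0;
	}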
diff --git a/fs/jffs2/compr_zlib.c b/fs/jffs2/compr_zlib.c
index 0c1fc6e20b43..2b87fccc1557 100644
--- a/fs/jffs2/compr_zlib.c
+++ b/fs/jffs2/compr_zlib.c
@@ -1,14 +1,12 @@
1/* 1/*
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright (C) 2001-2003 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * 5 *
6 * Created by David Woodhouse <dwmw2@infradead.org> 6 * Created by David Woodhouse <dwmw2@infradead.org>
7 * 7 *
8 * For licensing information, see the file 'LICENCE' in this directory. 8 * For licensing information, see the file 'LICENCE' in this directory.
9 * 9 *
10 * $Id: compr_zlib.c,v 1.32 2005/11/07 11:14:38 gleixner Exp $
11 *
12 */ 10 */
13 11
14#if !defined(__KERNEL__) && !defined(__ECOS) 12#if !defined(__KERNEL__) && !defined(__ECOS)
diff --git a/fs/jffs2/comprtest.c b/fs/jffs2/comprtest.c
deleted file mode 100644
index f0fb8be7740c..000000000000
--- a/fs/jffs2/comprtest.c
+++ /dev/null
@@ -1,307 +0,0 @@
1/* $Id: comprtest.c,v 1.6 2005/11/07 11:14:38 gleixner Exp $ */
2
3#include <linux/kernel.h>
4#include <linux/string.h>
5#include <linux/module.h>
6#include <asm/types.h>
7#if 0
8#define TESTDATA_LEN 512
9static unsigned char testdata[TESTDATA_LEN] = {
10 0x7f, 0x45, 0x4c, 0x46, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
11 0x02, 0x00, 0x03, 0x00, 0x01, 0x00, 0x00, 0x00, 0x60, 0x83, 0x04, 0x08, 0x34, 0x00, 0x00, 0x00,
12 0xb0, 0x29, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x34, 0x00, 0x20, 0x00, 0x06, 0x00, 0x28, 0x00,
13 0x1e, 0x00, 0x1b, 0x00, 0x06, 0x00, 0x00, 0x00, 0x34, 0x00, 0x00, 0x00, 0x34, 0x80, 0x04, 0x08,
14 0x34, 0x80, 0x04, 0x08, 0xc0, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00,
15 0x04, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0xf4, 0x00, 0x00, 0x00, 0xf4, 0x80, 0x04, 0x08,
16 0xf4, 0x80, 0x04, 0x08, 0x13, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
17 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x04, 0x08,
18 0x00, 0x80, 0x04, 0x08, 0x0d, 0x05, 0x00, 0x00, 0x0d, 0x05, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00,
19 0x00, 0x10, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x10, 0x05, 0x00, 0x00, 0x10, 0x95, 0x04, 0x08,
20 0x10, 0x95, 0x04, 0x08, 0xe8, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00,
21 0x00, 0x10, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x58, 0x05, 0x00, 0x00, 0x58, 0x95, 0x04, 0x08,
22 0x58, 0x95, 0x04, 0x08, 0xa0, 0x00, 0x00, 0x00, 0xa0, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00,
23 0x04, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x08, 0x01, 0x00, 0x00, 0x08, 0x81, 0x04, 0x08,
24 0x08, 0x81, 0x04, 0x08, 0x20, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
25 0x04, 0x00, 0x00, 0x00, 0x2f, 0x6c, 0x69, 0x62, 0x2f, 0x6c, 0x64, 0x2d, 0x6c, 0x69, 0x6e, 0x75,
26 0x78, 0x2e, 0x73, 0x6f, 0x2e, 0x32, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00,
27 0x01, 0x00, 0x00, 0x00, 0x47, 0x4e, 0x55, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
28 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00,
29 0x07, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
30 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
31 0x04, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
32 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x69, 0x00, 0x00, 0x00,
33 0x0c, 0x83, 0x04, 0x08, 0x81, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x30, 0x00, 0x00, 0x00,
34 0x1c, 0x83, 0x04, 0x08, 0xac, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x57, 0x00, 0x00, 0x00,
35 0x2c, 0x83, 0x04, 0x08, 0xdd, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x1a, 0x00, 0x00, 0x00,
36 0x3c, 0x83, 0x04, 0x08, 0x2e, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00,
37 0x4c, 0x83, 0x04, 0x08, 0x7d, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00,
38 0x00, 0x85, 0x04, 0x08, 0x04, 0x00, 0x00, 0x00, 0x11, 0x00, 0x0e, 0x00, 0x01, 0x00, 0x00, 0x00,
39 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x00, 0x5f, 0x5f, 0x67,
40 0x6d, 0x6f, 0x6e, 0x5f, 0x73, 0x74, 0x61, 0x72, 0x74, 0x5f, 0x5f, 0x00, 0x6c, 0x69, 0x62, 0x63,
41 0x2e, 0x73, 0x6f, 0x2e, 0x36, 0x00, 0x70, 0x72, 0x69, 0x6e, 0x74, 0x66, 0x00, 0x5f, 0x5f, 0x63};
42#else
43#define TESTDATA_LEN 3481
44static unsigned char testdata[TESTDATA_LEN] = {
45 0x23, 0x69, 0x6e, 0x63, 0x6c, 0x75, 0x64, 0x65, 0x20, 0x22, 0x64, 0x62, 0x65, 0x6e, 0x63, 0x68,
46 0x2e, 0x68, 0x22, 0x0a, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4d, 0x41, 0x58,
47 0x5f, 0x46, 0x49, 0x4c, 0x45, 0x53, 0x20, 0x31, 0x30, 0x30, 0x30, 0x0a, 0x0a, 0x73, 0x74, 0x61,
48 0x74, 0x69, 0x63, 0x20, 0x63, 0x68, 0x61, 0x72, 0x20, 0x62, 0x75, 0x66, 0x5b, 0x37, 0x30, 0x30,
49 0x30, 0x30, 0x5d, 0x3b, 0x0a, 0x65, 0x78, 0x74, 0x65, 0x72, 0x6e, 0x20, 0x69, 0x6e, 0x74, 0x20,
50 0x6c, 0x69, 0x6e, 0x65, 0x5f, 0x63, 0x6f, 0x75, 0x6e, 0x74, 0x3b, 0x0a, 0x0a, 0x73, 0x74, 0x61,
51 0x74, 0x69, 0x63, 0x20, 0x73, 0x74, 0x72, 0x75, 0x63, 0x74, 0x20, 0x7b, 0x0a, 0x09, 0x69, 0x6e,
52 0x74, 0x20, 0x66, 0x64, 0x3b, 0x0a, 0x09, 0x69, 0x6e, 0x74, 0x20, 0x68, 0x61, 0x6e, 0x64, 0x6c,
53 0x65, 0x3b, 0x0a, 0x7d, 0x20, 0x66, 0x74, 0x61, 0x62, 0x6c, 0x65, 0x5b, 0x4d, 0x41, 0x58, 0x5f,
54 0x46, 0x49, 0x4c, 0x45, 0x53, 0x5d, 0x3b, 0x0a, 0x0a, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x64, 0x6f,
55 0x5f, 0x75, 0x6e, 0x6c, 0x69, 0x6e, 0x6b, 0x28, 0x63, 0x68, 0x61, 0x72, 0x20, 0x2a, 0x66, 0x6e,
56 0x61, 0x6d, 0x65, 0x29, 0x0a, 0x7b, 0x0a, 0x09, 0x73, 0x74, 0x72, 0x75, 0x70, 0x70, 0x65, 0x72,
57 0x28, 0x66, 0x6e, 0x61, 0x6d, 0x65, 0x29, 0x3b, 0x0a, 0x0a, 0x09, 0x69, 0x66, 0x20, 0x28, 0x75,
58 0x6e, 0x6c, 0x69, 0x6e, 0x6b, 0x28, 0x66, 0x6e, 0x61, 0x6d, 0x65, 0x29, 0x20, 0x21, 0x3d, 0x20,
59 0x30, 0x29, 0x20, 0x7b, 0x0a, 0x09, 0x09, 0x70, 0x72, 0x69, 0x6e, 0x74, 0x66, 0x28, 0x22, 0x28,
60 0x25, 0x64, 0x29, 0x20, 0x75, 0x6e, 0x6c, 0x69, 0x6e, 0x6b, 0x20, 0x25, 0x73, 0x20, 0x66, 0x61,
61 0x69, 0x6c, 0x65, 0x64, 0x20, 0x28, 0x25, 0x73, 0x29, 0x5c, 0x6e, 0x22, 0x2c, 0x20, 0x0a, 0x09,
62 0x09, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x69, 0x6e, 0x65, 0x5f, 0x63, 0x6f, 0x75,
63 0x6e, 0x74, 0x2c, 0x20, 0x66, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x73, 0x74, 0x72, 0x65, 0x72,
64 0x72, 0x6f, 0x72, 0x28, 0x65, 0x72, 0x72, 0x6e, 0x6f, 0x29, 0x29, 0x3b, 0x0a, 0x09, 0x7d, 0x0a,
65 0x7d, 0x0a, 0x0a, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x65, 0x78, 0x70, 0x61, 0x6e, 0x64, 0x5f, 0x66,
66 0x69, 0x6c, 0x65, 0x28, 0x69, 0x6e, 0x74, 0x20, 0x66, 0x64, 0x2c, 0x20, 0x69, 0x6e, 0x74, 0x20,
67 0x73, 0x69, 0x7a, 0x65, 0x29, 0x0a, 0x7b, 0x0a, 0x09, 0x69, 0x6e, 0x74, 0x20, 0x73, 0x3b, 0x0a,
68 0x09, 0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x20, 0x7b, 0x0a,
69 0x09, 0x09, 0x73, 0x20, 0x3d, 0x20, 0x4d, 0x49, 0x4e, 0x28, 0x73, 0x69, 0x7a, 0x65, 0x6f, 0x66,
70 0x28, 0x62, 0x75, 0x66, 0x29, 0x2c, 0x20, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x3b, 0x0a, 0x09, 0x09,
71 0x77, 0x72, 0x69, 0x74, 0x65, 0x28, 0x66, 0x64, 0x2c, 0x20, 0x62, 0x75, 0x66, 0x2c, 0x20, 0x73,
72 0x29, 0x3b, 0x0a, 0x09, 0x09, 0x73, 0x69, 0x7a, 0x65, 0x20, 0x2d, 0x3d, 0x20, 0x73, 0x3b, 0x0a,
73 0x09, 0x7d, 0x0a, 0x7d, 0x0a, 0x0a, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x64, 0x6f, 0x5f, 0x6f, 0x70,
74 0x65, 0x6e, 0x28, 0x63, 0x68, 0x61, 0x72, 0x20, 0x2a, 0x66, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20,
75 0x69, 0x6e, 0x74, 0x20, 0x68, 0x61, 0x6e, 0x64, 0x6c, 0x65, 0x2c, 0x20, 0x69, 0x6e, 0x74, 0x20,
76 0x73, 0x69, 0x7a, 0x65, 0x29, 0x0a, 0x7b, 0x0a, 0x09, 0x69, 0x6e, 0x74, 0x20, 0x66, 0x64, 0x2c,
77 0x20, 0x69, 0x3b, 0x0a, 0x09, 0x69, 0x6e, 0x74, 0x20, 0x66, 0x6c, 0x61, 0x67, 0x73, 0x20, 0x3d,
78 0x20, 0x4f, 0x5f, 0x52, 0x44, 0x57, 0x52, 0x7c, 0x4f, 0x5f, 0x43, 0x52, 0x45, 0x41, 0x54, 0x3b,
79 0x0a, 0x09, 0x73, 0x74, 0x72, 0x75, 0x63, 0x74, 0x20, 0x73, 0x74, 0x61, 0x74, 0x20, 0x73, 0x74,
80 0x3b, 0x0a, 0x09, 0x73, 0x74, 0x61, 0x74, 0x69, 0x63, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x63, 0x6f,
81 0x75, 0x6e, 0x74, 0x3b, 0x0a, 0x0a, 0x09, 0x73, 0x74, 0x72, 0x75, 0x70, 0x70, 0x65, 0x72, 0x28,
82 0x66, 0x6e, 0x61, 0x6d, 0x65, 0x29, 0x3b, 0x0a, 0x0a, 0x09, 0x69, 0x66, 0x20, 0x28, 0x73, 0x69,
83 0x7a, 0x65, 0x20, 0x3d, 0x3d, 0x20, 0x30, 0x29, 0x20, 0x66, 0x6c, 0x61, 0x67, 0x73, 0x20, 0x7c,
84 0x3d, 0x20, 0x4f, 0x5f, 0x54, 0x52, 0x55, 0x4e, 0x43, 0x3b, 0x0a, 0x0a, 0x09, 0x66, 0x64, 0x20,
85 0x3d, 0x20, 0x6f, 0x70, 0x65, 0x6e, 0x28, 0x66, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x66, 0x6c,
86 0x61, 0x67, 0x73, 0x2c, 0x20, 0x30, 0x36, 0x30, 0x30, 0x29, 0x3b, 0x0a, 0x09, 0x69, 0x66, 0x20,
87 0x28, 0x66, 0x64, 0x20, 0x3d, 0x3d, 0x20, 0x2d, 0x31, 0x29, 0x20, 0x7b, 0x0a, 0x09, 0x09, 0x70,
88 0x72, 0x69, 0x6e, 0x74, 0x66, 0x28, 0x22, 0x28, 0x25, 0x64, 0x29, 0x20, 0x6f, 0x70, 0x65, 0x6e,
89 0x20, 0x25, 0x73, 0x20, 0x66, 0x61, 0x69, 0x6c, 0x65, 0x64, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x68,
90 0x61, 0x6e, 0x64, 0x6c, 0x65, 0x20, 0x25, 0x64, 0x20, 0x28, 0x25, 0x73, 0x29, 0x5c, 0x6e, 0x22,
91 0x2c, 0x20, 0x0a, 0x09, 0x09, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x69, 0x6e, 0x65,
92 0x5f, 0x63, 0x6f, 0x75, 0x6e, 0x74, 0x2c, 0x20, 0x66, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x68,
93 0x61, 0x6e, 0x64, 0x6c, 0x65, 0x2c, 0x20, 0x73, 0x74, 0x72, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x28,
94 0x65, 0x72, 0x72, 0x6e, 0x6f, 0x29, 0x29, 0x3b, 0x0a, 0x09, 0x09, 0x72, 0x65, 0x74, 0x75, 0x72,
95 0x6e, 0x3b, 0x0a, 0x09, 0x7d, 0x0a, 0x09, 0x66, 0x73, 0x74, 0x61, 0x74, 0x28, 0x66, 0x64, 0x2c,
96 0x20, 0x26, 0x73, 0x74, 0x29, 0x3b, 0x0a, 0x09, 0x69, 0x66, 0x20, 0x28, 0x73, 0x69, 0x7a, 0x65,
97 0x20, 0x3e, 0x20, 0x73, 0x74, 0x2e, 0x73, 0x74, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x20, 0x7b,
98 0x0a, 0x23, 0x69, 0x66, 0x20, 0x44, 0x45, 0x42, 0x55, 0x47, 0x0a, 0x09, 0x09, 0x70, 0x72, 0x69,
99 0x6e, 0x74, 0x66, 0x28, 0x22, 0x28, 0x25, 0x64, 0x29, 0x20, 0x65, 0x78, 0x70, 0x61, 0x6e, 0x64,
100 0x69, 0x6e, 0x67, 0x20, 0x25, 0x73, 0x20, 0x74, 0x6f, 0x20, 0x25, 0x64, 0x20, 0x66, 0x72, 0x6f,
101 0x6d, 0x20, 0x25, 0x64, 0x5c, 0x6e, 0x22, 0x2c, 0x20, 0x0a, 0x09, 0x09, 0x20, 0x20, 0x20, 0x20,
102 0x20, 0x20, 0x20, 0x6c, 0x69, 0x6e, 0x65, 0x5f, 0x63, 0x6f, 0x75, 0x6e, 0x74, 0x2c, 0x20, 0x66,
103 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x73, 0x69, 0x7a, 0x65, 0x2c, 0x20, 0x28, 0x69, 0x6e, 0x74,
104 0x29, 0x73, 0x74, 0x2e, 0x73, 0x74, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x3b, 0x0a, 0x23, 0x65,
105 0x6e, 0x64, 0x69, 0x66, 0x0a, 0x09, 0x09, 0x65, 0x78, 0x70, 0x61, 0x6e, 0x64, 0x5f, 0x66, 0x69,
106 0x6c, 0x65, 0x28, 0x66, 0x64, 0x2c, 0x20, 0x73, 0x69, 0x7a, 0x65, 0x20, 0x2d, 0x20, 0x73, 0x74,
107 0x2e, 0x73, 0x74, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x3b, 0x0a, 0x09, 0x7d, 0x20, 0x65, 0x6c,
108 0x73, 0x65, 0x20, 0x69, 0x66, 0x20, 0x28, 0x73, 0x69, 0x7a, 0x65, 0x20, 0x3c, 0x20, 0x73, 0x74,
109 0x2e, 0x73, 0x74, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x20, 0x7b, 0x0a, 0x09, 0x09, 0x70, 0x72,
110 0x69, 0x6e, 0x74, 0x66, 0x28, 0x22, 0x74, 0x72, 0x75, 0x6e, 0x63, 0x61, 0x74, 0x69, 0x6e, 0x67,
111 0x20, 0x25, 0x73, 0x20, 0x74, 0x6f, 0x20, 0x25, 0x64, 0x20, 0x66, 0x72, 0x6f, 0x6d, 0x20, 0x25,
112 0x64, 0x5c, 0x6e, 0x22, 0x2c, 0x20, 0x0a, 0x09, 0x09, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
113 0x66, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x73, 0x69, 0x7a, 0x65, 0x2c, 0x20, 0x28, 0x69, 0x6e,
114 0x74, 0x29, 0x73, 0x74, 0x2e, 0x73, 0x74, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x3b, 0x0a, 0x09,
115 0x09, 0x66, 0x74, 0x72, 0x75, 0x6e, 0x63, 0x61, 0x74, 0x65, 0x28, 0x66, 0x64, 0x2c, 0x20, 0x73,
116 0x69, 0x7a, 0x65, 0x29, 0x3b, 0x0a, 0x09, 0x7d, 0x0a, 0x09, 0x66, 0x6f, 0x72, 0x20, 0x28, 0x69,
117 0x3d, 0x30, 0x3b, 0x69, 0x3c, 0x4d, 0x41, 0x58, 0x5f, 0x46, 0x49, 0x4c, 0x45, 0x53, 0x3b, 0x69,
118 0x2b, 0x2b, 0x29, 0x20, 0x7b, 0x0a, 0x09, 0x09, 0x69, 0x66, 0x20, 0x28, 0x66, 0x74, 0x61, 0x62,
119 0x6c, 0x65, 0x5b, 0x69, 0x5d, 0x2e, 0x68, 0x61, 0x6e, 0x64, 0x6c, 0x65, 0x20, 0x3d, 0x3d, 0x20,
120 0x30, 0x29, 0x20, 0x62, 0x72, 0x65, 0x61, 0x6b, 0x3b, 0x0a, 0x09, 0x7d, 0x0a, 0x09, 0x69, 0x66,
121 0x20, 0x28, 0x69, 0x20, 0x3d, 0x3d, 0x20, 0x4d, 0x41, 0x58, 0x5f, 0x46, 0x49, 0x4c, 0x45, 0x53,
122 0x29, 0x20, 0x7b, 0x0a, 0x09, 0x09, 0x70, 0x72, 0x69, 0x6e, 0x74, 0x66, 0x28, 0x22, 0x66, 0x69,
123 0x6c, 0x65, 0x20, 0x74, 0x61, 0x62, 0x6c, 0x65, 0x20, 0x66, 0x75, 0x6c, 0x6c, 0x20, 0x66, 0x6f,
124 0x72, 0x20, 0x25, 0x73, 0x5c, 0x6e, 0x22, 0x2c, 0x20, 0x66, 0x6e, 0x61, 0x6d, 0x65, 0x29, 0x3b,
125 0x0a, 0x09, 0x09, 0x65, 0x78, 0x69, 0x74, 0x28, 0x31, 0x29, 0x3b, 0x0a, 0x09, 0x7d, 0x0a, 0x09,
126 0x66, 0x74, 0x61, 0x62, 0x6c, 0x65, 0x5b, 0x69, 0x5d, 0x2e, 0x68, 0x61, 0x6e, 0x64, 0x6c, 0x65,
127 0x20, 0x3d, 0x20, 0x68, 0x61, 0x6e, 0x64, 0x6c, 0x65, 0x3b, 0x0a, 0x09, 0x66, 0x74, 0x61, 0x62,
128 0x6c, 0x65, 0x5b, 0x69, 0x5d, 0x2e, 0x66, 0x64, 0x20, 0x3d, 0x20, 0x66, 0x64, 0x3b, 0x0a, 0x09,
129 0x69, 0x66, 0x20, 0x28, 0x63, 0x6f, 0x75, 0x6e, 0x74, 0x2b, 0x2b, 0x20, 0x25, 0x20, 0x31, 0x30,
130 0x30, 0x20, 0x3d, 0x3d, 0x20, 0x30, 0x29, 0x20, 0x7b, 0x0a, 0x09, 0x09, 0x70, 0x72, 0x69, 0x6e,
131 0x74, 0x66, 0x28, 0x22, 0x2e, 0x22, 0x29, 0x3b, 0x0a, 0x09, 0x7d, 0x0a, 0x7d, 0x0a, 0x0a, 0x76,
132 0x6f, 0x69, 0x64, 0x20, 0x64, 0x6f, 0x5f, 0x77, 0x72, 0x69, 0x74, 0x65, 0x28, 0x69, 0x6e, 0x74,
133 0x20, 0x68, 0x61, 0x6e, 0x64, 0x6c, 0x65, 0x2c, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x73, 0x69, 0x7a,
134 0x65, 0x2c, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x6f, 0x66, 0x66, 0x73, 0x65, 0x74, 0x29, 0x0a, 0x7b,
135 0x0a, 0x09, 0x69, 0x6e, 0x74, 0x20, 0x69, 0x3b, 0x0a, 0x0a, 0x09, 0x69, 0x66, 0x20, 0x28, 0x62,
136 0x75, 0x66, 0x5b, 0x30, 0x5d, 0x20, 0x3d, 0x3d, 0x20, 0x30, 0x29, 0x20, 0x6d, 0x65, 0x6d, 0x73,
137 0x65, 0x74, 0x28, 0x62, 0x75, 0x66, 0x2c, 0x20, 0x31, 0x2c, 0x20, 0x73, 0x69, 0x7a, 0x65, 0x6f,
138 0x66, 0x28, 0x62, 0x75, 0x66, 0x29, 0x29, 0x3b, 0x0a, 0x0a, 0x09, 0x66, 0x6f, 0x72, 0x20, 0x28,
139 0x69, 0x3d, 0x30, 0x3b, 0x69, 0x3c, 0x4d, 0x41, 0x58, 0x5f, 0x46, 0x49, 0x4c, 0x45, 0x53, 0x3b,
140 0x69, 0x2b, 0x2b, 0x29, 0x20, 0x7b, 0x0a, 0x09, 0x09, 0x69, 0x66, 0x20, 0x28, 0x66, 0x74, 0x61,
141 0x62, 0x6c, 0x65, 0x5b, 0x69, 0x5d, 0x2e, 0x68, 0x61, 0x6e, 0x64, 0x6c, 0x65, 0x20, 0x3d, 0x3d,
142 0x20, 0x68, 0x61, 0x6e, 0x64, 0x6c, 0x65, 0x29, 0x20, 0x62, 0x72, 0x65, 0x61, 0x6b, 0x3b, 0x0a,
143 0x09, 0x7d, 0x0a, 0x09, 0x69, 0x66, 0x20, 0x28, 0x69, 0x20, 0x3d, 0x3d, 0x20, 0x4d, 0x41, 0x58,
144 0x5f, 0x46, 0x49, 0x4c, 0x45, 0x53, 0x29, 0x20, 0x7b, 0x0a, 0x23, 0x69, 0x66, 0x20, 0x31, 0x0a,
145 0x09, 0x09, 0x70, 0x72, 0x69, 0x6e, 0x74, 0x66, 0x28, 0x22, 0x28, 0x25, 0x64, 0x29, 0x20, 0x64,
146 0x6f, 0x5f, 0x77, 0x72, 0x69, 0x74, 0x65, 0x3a, 0x20, 0x68, 0x61, 0x6e, 0x64, 0x6c, 0x65, 0x20,
147 0x25, 0x64, 0x20, 0x77, 0x61, 0x73, 0x20, 0x6e, 0x6f, 0x74, 0x20, 0x6f, 0x70, 0x65, 0x6e, 0x20,
148 0x73, 0x69, 0x7a, 0x65, 0x3d, 0x25, 0x64, 0x20, 0x6f, 0x66, 0x73, 0x3d, 0x25, 0x64, 0x5c, 0x6e,
149 0x22, 0x2c, 0x20, 0x0a, 0x09, 0x09, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x69, 0x6e,
150 0x65, 0x5f, 0x63, 0x6f, 0x75, 0x6e, 0x74, 0x2c, 0x20, 0x68, 0x61, 0x6e, 0x64, 0x6c, 0x65, 0x2c,
151 0x20, 0x73, 0x69, 0x7a, 0x65, 0x2c, 0x20, 0x6f, 0x66, 0x66, 0x73, 0x65, 0x74, 0x29, 0x3b, 0x0a,
152 0x23, 0x65, 0x6e, 0x64, 0x69, 0x66, 0x0a, 0x09, 0x09, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x3b,
153 0x0a, 0x09, 0x7d, 0x0a, 0x09, 0x6c, 0x73, 0x65, 0x65, 0x6b, 0x28, 0x66, 0x74, 0x61, 0x62, 0x6c,
154 0x65, 0x5b, 0x69, 0x5d, 0x2e, 0x66, 0x64, 0x2c, 0x20, 0x6f, 0x66, 0x66, 0x73, 0x65, 0x74, 0x2c,
155 0x20, 0x53, 0x45, 0x45, 0x4b, 0x5f, 0x53, 0x45, 0x54, 0x29, 0x3b, 0x0a, 0x09, 0x69, 0x66, 0x20,
156 0x28, 0x77, 0x72, 0x69, 0x74, 0x65, 0x28, 0x66, 0x74, 0x61, 0x62, 0x6c, 0x65, 0x5b, 0x69, 0x5d,
157 0x2e, 0x66, 0x64, 0x2c, 0x20, 0x62, 0x75, 0x66, 0x2c, 0x20, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x20,
158 0x21, 0x3d, 0x20, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x20, 0x7b, 0x0a, 0x09, 0x09, 0x70, 0x72, 0x69,
159 0x6e, 0x74, 0x66, 0x28, 0x22, 0x77, 0x72, 0x69, 0x74, 0x65, 0x20, 0x66, 0x61, 0x69, 0x6c, 0x65,
160 0x64, 0x20, 0x6f, 0x6e, 0x20, 0x68, 0x61, 0x6e, 0x64, 0x6c, 0x65, 0x20, 0x25, 0x64, 0x5c, 0x6e,
161 0x22, 0x2c, 0x20, 0x68, 0x61, 0x6e, 0x64, 0x6c, 0x65, 0x29, 0x3b, 0x0a, 0x09, 0x7d, 0x0a, 0x7d,
162 0x0a, 0x0a, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x64, 0x6f, 0x5f, 0x72, 0x65, 0x61, 0x64, 0x28, 0x69,
163 0x6e, 0x74, 0x20, 0x68, 0x61, 0x6e, 0x64, 0x6c, 0x65, 0x2c, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x73,
164 0x69, 0x7a, 0x65, 0x2c, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x6f, 0x66, 0x66, 0x73, 0x65, 0x74, 0x29,
165 0x0a, 0x7b, 0x0a, 0x09, 0x69, 0x6e, 0x74, 0x20, 0x69, 0x3b, 0x0a, 0x09, 0x66, 0x6f, 0x72, 0x20,
166 0x28, 0x69, 0x3d, 0x30, 0x3b, 0x69, 0x3c, 0x4d, 0x41, 0x58, 0x5f, 0x46, 0x49, 0x4c, 0x45, 0x53,
167 0x3b, 0x69, 0x2b, 0x2b, 0x29, 0x20, 0x7b, 0x0a, 0x09, 0x09, 0x69, 0x66, 0x20, 0x28, 0x66, 0x74,
168 0x61, 0x62, 0x6c, 0x65, 0x5b, 0x69, 0x5d, 0x2e, 0x68, 0x61, 0x6e, 0x64, 0x6c, 0x65, 0x20, 0x3d,
169 0x3d, 0x20, 0x68, 0x61, 0x6e, 0x64, 0x6c, 0x65, 0x29, 0x20, 0x62, 0x72, 0x65, 0x61, 0x6b, 0x3b,
170 0x0a, 0x09, 0x7d, 0x0a, 0x09, 0x69, 0x66, 0x20, 0x28, 0x69, 0x20, 0x3d, 0x3d, 0x20, 0x4d, 0x41,
171 0x58, 0x5f, 0x46, 0x49, 0x4c, 0x45, 0x53, 0x29, 0x20, 0x7b, 0x0a, 0x09, 0x09, 0x70, 0x72, 0x69,
172 0x6e, 0x74, 0x66, 0x28, 0x22, 0x28, 0x25, 0x64, 0x29, 0x20, 0x64, 0x6f, 0x5f, 0x72, 0x65, 0x61,
173 0x64, 0x3a, 0x20, 0x68, 0x61, 0x6e, 0x64, 0x6c, 0x65, 0x20, 0x25, 0x64, 0x20, 0x77, 0x61, 0x73,
174 0x20, 0x6e, 0x6f, 0x74, 0x20, 0x6f, 0x70, 0x65, 0x6e, 0x20, 0x73, 0x69, 0x7a, 0x65, 0x3d, 0x25,
175 0x64, 0x20, 0x6f, 0x66, 0x73, 0x3d, 0x25, 0x64, 0x5c, 0x6e, 0x22, 0x2c, 0x20, 0x0a, 0x09, 0x09,
176 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x69, 0x6e, 0x65, 0x5f, 0x63, 0x6f, 0x75, 0x6e,
177 0x74, 0x2c, 0x20, 0x68, 0x61, 0x6e, 0x64, 0x6c, 0x65, 0x2c, 0x20, 0x73, 0x69, 0x7a, 0x65, 0x2c,
178 0x20, 0x6f, 0x66, 0x66, 0x73, 0x65, 0x74, 0x29, 0x3b, 0x0a, 0x09, 0x09, 0x72, 0x65, 0x74, 0x75,
179 0x72, 0x6e, 0x3b, 0x0a, 0x09, 0x7d, 0x0a, 0x09, 0x6c, 0x73, 0x65, 0x65, 0x6b, 0x28, 0x66, 0x74,
180 0x61, 0x62, 0x6c, 0x65, 0x5b, 0x69, 0x5d, 0x2e, 0x66, 0x64, 0x2c, 0x20, 0x6f, 0x66, 0x66, 0x73,
181 0x65, 0x74, 0x2c, 0x20, 0x53, 0x45, 0x45, 0x4b, 0x5f, 0x53, 0x45, 0x54, 0x29, 0x3b, 0x0a, 0x09,
182 0x72, 0x65, 0x61, 0x64, 0x28, 0x66, 0x74, 0x61, 0x62, 0x6c, 0x65, 0x5b, 0x69, 0x5d, 0x2e, 0x66,
183 0x64, 0x2c, 0x20, 0x62, 0x75, 0x66, 0x2c, 0x20, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x3b, 0x0a, 0x7d,
184 0x0a, 0x0a, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x64, 0x6f, 0x5f, 0x63, 0x6c, 0x6f, 0x73, 0x65, 0x28,
185 0x69, 0x6e, 0x74, 0x20, 0x68, 0x61, 0x6e, 0x64, 0x6c, 0x65, 0x29, 0x0a, 0x7b, 0x0a, 0x09, 0x69,
186 0x6e, 0x74, 0x20, 0x69, 0x3b, 0x0a, 0x09, 0x66, 0x6f, 0x72, 0x20, 0x28, 0x69, 0x3d, 0x30, 0x3b,
187 0x69, 0x3c, 0x4d, 0x41, 0x58, 0x5f, 0x46, 0x49, 0x4c, 0x45, 0x53, 0x3b, 0x69, 0x2b, 0x2b, 0x29,
188 0x20, 0x7b, 0x0a, 0x09, 0x09, 0x69, 0x66, 0x20, 0x28, 0x66, 0x74, 0x61, 0x62, 0x6c, 0x65, 0x5b,
189 0x69, 0x5d, 0x2e, 0x68, 0x61, 0x6e, 0x64, 0x6c, 0x65, 0x20, 0x3d, 0x3d, 0x20, 0x68, 0x61, 0x6e,
190 0x64, 0x6c, 0x65, 0x29, 0x20, 0x62, 0x72, 0x65, 0x61, 0x6b, 0x3b, 0x0a, 0x09, 0x7d, 0x0a, 0x09,
191 0x69, 0x66, 0x20, 0x28, 0x69, 0x20, 0x3d, 0x3d, 0x20, 0x4d, 0x41, 0x58, 0x5f, 0x46, 0x49, 0x4c,
192 0x45, 0x53, 0x29, 0x20, 0x7b, 0x0a, 0x09, 0x09, 0x70, 0x72, 0x69, 0x6e, 0x74, 0x66, 0x28, 0x22,
193 0x28, 0x25, 0x64, 0x29, 0x20, 0x64, 0x6f, 0x5f, 0x63, 0x6c, 0x6f, 0x73, 0x65, 0x3a, 0x20, 0x68,
194 0x61, 0x6e, 0x64, 0x6c, 0x65, 0x20, 0x25, 0x64, 0x20, 0x77, 0x61, 0x73, 0x20, 0x6e, 0x6f, 0x74,
195 0x20, 0x6f, 0x70, 0x65, 0x6e, 0x5c, 0x6e, 0x22, 0x2c, 0x20, 0x0a, 0x09, 0x09, 0x20, 0x20, 0x20,
196 0x20, 0x20, 0x20, 0x20, 0x6c, 0x69, 0x6e, 0x65, 0x5f, 0x63, 0x6f, 0x75, 0x6e, 0x74, 0x2c, 0x20,
197 0x68, 0x61, 0x6e, 0x64, 0x6c, 0x65, 0x29, 0x3b, 0x0a, 0x09, 0x09, 0x72, 0x65, 0x74, 0x75, 0x72,
198 0x6e, 0x3b, 0x0a, 0x09, 0x7d, 0x0a, 0x09, 0x63, 0x6c, 0x6f, 0x73, 0x65, 0x28, 0x66, 0x74, 0x61,
199 0x62, 0x6c, 0x65, 0x5b, 0x69, 0x5d, 0x2e, 0x66, 0x64, 0x29, 0x3b, 0x0a, 0x09, 0x66, 0x74, 0x61,
200 0x62, 0x6c, 0x65, 0x5b, 0x69, 0x5d, 0x2e, 0x68, 0x61, 0x6e, 0x64, 0x6c, 0x65, 0x20, 0x3d, 0x20,
201 0x30, 0x3b, 0x0a, 0x7d, 0x0a, 0x0a, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x64, 0x6f, 0x5f, 0x6d, 0x6b,
202 0x64, 0x69, 0x72, 0x28, 0x63, 0x68, 0x61, 0x72, 0x20, 0x2a, 0x66, 0x6e, 0x61, 0x6d, 0x65, 0x29,
203 0x0a, 0x7b, 0x0a, 0x09, 0x73, 0x74, 0x72, 0x75, 0x70, 0x70, 0x65, 0x72, 0x28, 0x66, 0x6e, 0x61,
204 0x6d, 0x65, 0x29, 0x3b, 0x0a, 0x0a, 0x09, 0x69, 0x66, 0x20, 0x28, 0x6d, 0x6b, 0x64, 0x69, 0x72,
205 0x28, 0x66, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x30, 0x37, 0x30, 0x30, 0x29, 0x20, 0x21, 0x3d,
206 0x20, 0x30, 0x29, 0x20, 0x7b, 0x0a, 0x23, 0x69, 0x66, 0x20, 0x44, 0x45, 0x42, 0x55, 0x47, 0x0a,
207 0x09, 0x09, 0x70, 0x72, 0x69, 0x6e, 0x74, 0x66, 0x28, 0x22, 0x6d, 0x6b, 0x64, 0x69, 0x72, 0x20,
208 0x25, 0x73, 0x20, 0x66, 0x61, 0x69, 0x6c, 0x65, 0x64, 0x20, 0x28, 0x25, 0x73, 0x29, 0x5c, 0x6e,
209 0x22, 0x2c, 0x20, 0x0a, 0x09, 0x09, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6e, 0x61,
210 0x6d, 0x65, 0x2c, 0x20, 0x73, 0x74, 0x72, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x28, 0x65, 0x72, 0x72,
211 0x6e, 0x6f, 0x29, 0x29, 0x3b, 0x0a, 0x23, 0x65, 0x6e, 0x64, 0x69, 0x66, 0x0a, 0x09, 0x7d, 0x0a,
212 0x7d, 0x0a, 0x0a, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x64, 0x6f, 0x5f, 0x72, 0x6d, 0x64, 0x69, 0x72,
213 0x28, 0x63, 0x68, 0x61, 0x72, 0x20, 0x2a, 0x66, 0x6e, 0x61, 0x6d, 0x65, 0x29, 0x0a, 0x7b, 0x0a,
214 0x09, 0x73, 0x74, 0x72, 0x75, 0x70, 0x70, 0x65, 0x72, 0x28, 0x66, 0x6e, 0x61, 0x6d, 0x65, 0x29,
215 0x3b, 0x0a, 0x0a, 0x09, 0x69, 0x66, 0x20, 0x28, 0x72, 0x6d, 0x64, 0x69, 0x72, 0x28, 0x66, 0x6e,
216 0x61, 0x6d, 0x65, 0x29, 0x20, 0x21, 0x3d, 0x20, 0x30, 0x29, 0x20, 0x7b, 0x0a, 0x09, 0x09, 0x70,
217 0x72, 0x69, 0x6e, 0x74, 0x66, 0x28, 0x22, 0x72, 0x6d, 0x64, 0x69, 0x72, 0x20, 0x25, 0x73, 0x20,
218 0x66, 0x61, 0x69, 0x6c, 0x65, 0x64, 0x20, 0x28, 0x25, 0x73, 0x29, 0x5c, 0x6e, 0x22, 0x2c, 0x20,
219 0x0a, 0x09, 0x09, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6e, 0x61, 0x6d, 0x65, 0x2c,
220 0x20, 0x73, 0x74, 0x72, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x28, 0x65, 0x72, 0x72, 0x6e, 0x6f, 0x29,
221 0x29, 0x3b, 0x0a, 0x09, 0x7d, 0x0a, 0x7d, 0x0a, 0x0a, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x64, 0x6f,
222 0x5f, 0x72, 0x65, 0x6e, 0x61, 0x6d, 0x65, 0x28, 0x63, 0x68, 0x61, 0x72, 0x20, 0x2a, 0x6f, 0x6c,
223 0x64, 0x2c, 0x20, 0x63, 0x68, 0x61, 0x72, 0x20, 0x2a, 0x6e, 0x65, 0x77, 0x29, 0x0a, 0x7b, 0x0a,
224 0x09, 0x73, 0x74, 0x72, 0x75, 0x70, 0x70, 0x65, 0x72, 0x28, 0x6f, 0x6c, 0x64, 0x29, 0x3b, 0x0a,
225 0x09, 0x73, 0x74, 0x72, 0x75, 0x70, 0x70, 0x65, 0x72, 0x28, 0x6e, 0x65, 0x77, 0x29, 0x3b, 0x0a,
226 0x0a, 0x09, 0x69, 0x66, 0x20, 0x28, 0x72, 0x65, 0x6e, 0x61, 0x6d, 0x65, 0x28, 0x6f, 0x6c, 0x64,
227 0x2c, 0x20, 0x6e, 0x65, 0x77, 0x29, 0x20, 0x21, 0x3d, 0x20, 0x30, 0x29, 0x20, 0x7b, 0x0a, 0x09,
228 0x09, 0x70, 0x72, 0x69, 0x6e, 0x74, 0x66, 0x28, 0x22, 0x72, 0x65, 0x6e, 0x61, 0x6d, 0x65, 0x20,
229 0x25, 0x73, 0x20, 0x25, 0x73, 0x20, 0x66, 0x61, 0x69, 0x6c, 0x65, 0x64, 0x20, 0x28, 0x25, 0x73,
230 0x29, 0x5c, 0x6e, 0x22, 0x2c, 0x20, 0x0a, 0x09, 0x09, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
231 0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x6e, 0x65, 0x77, 0x2c, 0x20, 0x73, 0x74, 0x72, 0x65, 0x72, 0x72,
232 0x6f, 0x72, 0x28, 0x65, 0x72, 0x72, 0x6e, 0x6f, 0x29, 0x29, 0x3b, 0x0a, 0x09, 0x7d, 0x0a, 0x7d,
233 0x0a, 0x0a, 0x0a, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x64, 0x6f, 0x5f, 0x73, 0x74, 0x61, 0x74, 0x28,
234 0x63, 0x68, 0x61, 0x72, 0x20, 0x2a, 0x66, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x69, 0x6e, 0x74,
235 0x20, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x0a, 0x7b, 0x0a, 0x09, 0x73, 0x74, 0x72, 0x75, 0x63, 0x74,
236 0x20, 0x73, 0x74, 0x61, 0x74, 0x20, 0x73, 0x74, 0x3b, 0x0a, 0x0a, 0x09, 0x73, 0x74, 0x72, 0x75,
237 0x70, 0x70, 0x65, 0x72, 0x28, 0x66, 0x6e, 0x61, 0x6d, 0x65, 0x29, 0x3b, 0x0a, 0x0a, 0x09, 0x69,
238 0x66, 0x20, 0x28, 0x73, 0x74, 0x61, 0x74, 0x28, 0x66, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x26,
239 0x73, 0x74, 0x29, 0x20, 0x21, 0x3d, 0x20, 0x30, 0x29, 0x20, 0x7b, 0x0a, 0x09, 0x09, 0x70, 0x72,
240 0x69, 0x6e, 0x74, 0x66, 0x28, 0x22, 0x28, 0x25, 0x64, 0x29, 0x20, 0x64, 0x6f, 0x5f, 0x73, 0x74,
241 0x61, 0x74, 0x3a, 0x20, 0x25, 0x73, 0x20, 0x73, 0x69, 0x7a, 0x65, 0x3d, 0x25, 0x64, 0x20, 0x25,
242 0x73, 0x5c, 0x6e, 0x22, 0x2c, 0x20, 0x0a, 0x09, 0x09, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
243 0x6c, 0x69, 0x6e, 0x65, 0x5f, 0x63, 0x6f, 0x75, 0x6e, 0x74, 0x2c, 0x20, 0x66, 0x6e, 0x61, 0x6d,
244 0x65, 0x2c, 0x20, 0x73, 0x69, 0x7a, 0x65, 0x2c, 0x20, 0x73, 0x74, 0x72, 0x65, 0x72, 0x72, 0x6f,
245 0x72, 0x28, 0x65, 0x72, 0x72, 0x6e, 0x6f, 0x29, 0x29, 0x3b, 0x0a, 0x09, 0x09, 0x72, 0x65, 0x74,
246 0x75, 0x72, 0x6e, 0x3b, 0x0a, 0x09, 0x7d, 0x0a, 0x09, 0x69, 0x66, 0x20, 0x28, 0x53, 0x5f, 0x49,
247 0x53, 0x44, 0x49, 0x52, 0x28, 0x73, 0x74, 0x2e, 0x73, 0x74, 0x5f, 0x6d, 0x6f, 0x64, 0x65, 0x29,
248 0x29, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x3b, 0x0a, 0x0a, 0x09, 0x69, 0x66, 0x20, 0x28,
249 0x73, 0x74, 0x2e, 0x73, 0x74, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x20, 0x21, 0x3d, 0x20, 0x73, 0x69,
250 0x7a, 0x65, 0x29, 0x20, 0x7b, 0x0a, 0x09, 0x09, 0x70, 0x72, 0x69, 0x6e, 0x74, 0x66, 0x28, 0x22,
251 0x28, 0x25, 0x64, 0x29, 0x20, 0x64, 0x6f, 0x5f, 0x73, 0x74, 0x61, 0x74, 0x3a, 0x20, 0x25, 0x73,
252 0x20, 0x77, 0x72, 0x6f, 0x6e, 0x67, 0x20, 0x73, 0x69, 0x7a, 0x65, 0x20, 0x25, 0x64, 0x20, 0x25,
253 0x64, 0x5c, 0x6e, 0x22, 0x2c, 0x20, 0x0a, 0x09, 0x09, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
254 0x6c, 0x69, 0x6e, 0x65, 0x5f, 0x63, 0x6f, 0x75, 0x6e, 0x74, 0x2c, 0x20, 0x66, 0x6e, 0x61, 0x6d,
255 0x65, 0x2c, 0x20, 0x28, 0x69, 0x6e, 0x74, 0x29, 0x73, 0x74, 0x2e, 0x73, 0x74, 0x5f, 0x73, 0x69,
256 0x7a, 0x65, 0x2c, 0x20, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x3b, 0x0a, 0x09, 0x7d, 0x0a, 0x7d, 0x0a,
257 0x0a, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x64, 0x6f, 0x5f, 0x63, 0x72, 0x65, 0x61, 0x74, 0x65, 0x28,
258 0x63, 0x68, 0x61, 0x72, 0x20, 0x2a, 0x66, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x69, 0x6e, 0x74,
259 0x20, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x0a, 0x7b, 0x0a, 0x09, 0x64, 0x6f, 0x5f, 0x6f, 0x70, 0x65,
260 0x6e, 0x28, 0x66, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x35, 0x30, 0x30, 0x30, 0x2c, 0x20, 0x73,
261 0x69, 0x7a, 0x65, 0x29, 0x3b, 0x0a, 0x09, 0x64, 0x6f, 0x5f, 0x63, 0x6c, 0x6f, 0x73, 0x65, 0x28,
262 0x35, 0x30, 0x30, 0x30, 0x29, 0x3b, 0x0a, 0x7d, 0x0a
263};
264#endif
265static unsigned char comprbuf[TESTDATA_LEN];
266static unsigned char decomprbuf[TESTDATA_LEN];
267
268int jffs2_decompress(unsigned char comprtype, unsigned char *cdata_in,
269 unsigned char *data_out, uint32_t cdatalen, uint32_t datalen);
270unsigned char jffs2_compress(unsigned char *data_in, unsigned char *cpage_out,
271 uint32_t *datalen, uint32_t *cdatalen);
272
273int init_module(void) {
274 unsigned char comprtype;
275 uint32_t c, d;
276 int ret;
277
278 printk("Original data: %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x\n",
279 testdata[0],testdata[1],testdata[2],testdata[3],
280 testdata[4],testdata[5],testdata[6],testdata[7],
281 testdata[8],testdata[9],testdata[10],testdata[11],
282 testdata[12],testdata[13],testdata[14],testdata[15]);
283 d = TESTDATA_LEN;
284 c = TESTDATA_LEN;
285 comprtype = jffs2_compress(testdata, comprbuf, &d, &c);
286
287 printk("jffs2_compress used compression type %d. Compressed size %d, uncompressed size %d\n",
288 comprtype, c, d);
289 printk("Compressed data: %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x\n",
290 comprbuf[0],comprbuf[1],comprbuf[2],comprbuf[3],
291 comprbuf[4],comprbuf[5],comprbuf[6],comprbuf[7],
292 comprbuf[8],comprbuf[9],comprbuf[10],comprbuf[11],
293 comprbuf[12],comprbuf[13],comprbuf[14],comprbuf[15]);
294
295 ret = jffs2_decompress(comprtype, comprbuf, decomprbuf, c, d);
296 printk("jffs2_decompress returned %d\n", ret);
297 printk("Decompressed data: %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x\n",
298 decomprbuf[0],decomprbuf[1],decomprbuf[2],decomprbuf[3],
299 decomprbuf[4],decomprbuf[5],decomprbuf[6],decomprbuf[7],
300 decomprbuf[8],decomprbuf[9],decomprbuf[10],decomprbuf[11],
301 decomprbuf[12],decomprbuf[13],decomprbuf[14],decomprbuf[15]);
302 if (memcmp(decomprbuf, testdata, d))
303 printk("Compression and decompression corrupted data\n");
304 else
305 printk("Compression good for %d bytes\n", d);
306 return 1;
307}
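
For reference, the testdata[] array above is plain ASCII: it encodes the C source of a small dbench-style client (the do_open/do_write/do_read/do_close/do_mkdir/do_rmdir/do_rename/do_stat/do_create routines visible in the byte values), so the compression test runs against realistic, compressible input. A minimal userspace sketch to dump it back out as readable source; the helper itself is hypothetical and not part of the tree:

/* decode_testdata.c (hypothetical helper): print the embedded ASCII
 * bytes of testdata[] so the client source can be read directly.
 * Assumes it is compiled together with the array definition. */
#include <stdio.h>

extern const unsigned char testdata[];
extern const unsigned int testdata_len;	/* stand-in for TESTDATA_LEN */

int main(void)
{
	fwrite(testdata, 1, testdata_len, stdout);
	return 0;
}
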
diff --git a/fs/jffs2/debug.c b/fs/jffs2/debug.c
index 4189e4a36050..3a32c64ed497 100644
--- a/fs/jffs2/debug.c
+++ b/fs/jffs2/debug.c
@@ -1,15 +1,14 @@
1/* 1/*
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright (C) 2001-2003 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * 5 *
6 * Created by David Woodhouse <dwmw2@infradead.org> 6 * Created by David Woodhouse <dwmw2@infradead.org>
7 * 7 *
8 * For licensing information, see the file 'LICENCE' in this directory. 8 * For licensing information, see the file 'LICENCE' in this directory.
9 * 9 *
10 * $Id: debug.c,v 1.12 2005/11/07 11:14:39 gleixner Exp $
11 *
12 */ 10 */
11
13#include <linux/kernel.h> 12#include <linux/kernel.h>
14#include <linux/types.h> 13#include <linux/types.h>
15#include <linux/pagemap.h> 14#include <linux/pagemap.h>
diff --git a/fs/jffs2/debug.h b/fs/jffs2/debug.h
index f89c85d5a3f8..2a49f2c51a9f 100644
--- a/fs/jffs2/debug.h
+++ b/fs/jffs2/debug.h
@@ -1,15 +1,14 @@
1/* 1/*
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright (C) 2001-2003 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * 5 *
6 * Created by David Woodhouse <dwmw2@infradead.org> 6 * Created by David Woodhouse <dwmw2@infradead.org>
7 * 7 *
8 * For licensing information, see the file 'LICENCE' in this directory. 8 * For licensing information, see the file 'LICENCE' in this directory.
9 * 9 *
10 * $Id: debug.h,v 1.21 2005/11/07 11:14:39 gleixner Exp $
11 *
12 */ 10 */
11
13#ifndef _JFFS2_DEBUG_H_ 12#ifndef _JFFS2_DEBUG_H_
14#define _JFFS2_DEBUG_H_ 13#define _JFFS2_DEBUG_H_
15 14
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index 9fa2e27f0641..c1dfca310dd6 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -1,14 +1,12 @@
1/* 1/*
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright (C) 2001-2003 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * 5 *
6 * Created by David Woodhouse <dwmw2@infradead.org> 6 * Created by David Woodhouse <dwmw2@infradead.org>
7 * 7 *
8 * For licensing information, see the file 'LICENCE' in this directory. 8 * For licensing information, see the file 'LICENCE' in this directory.
9 * 9 *
10 * $Id: dir.c,v 1.90 2005/11/07 11:14:39 gleixner Exp $
11 *
12 */ 10 */
13 11
14#include <linux/kernel.h> 12#include <linux/kernel.h>
diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c
index ad0121088dde..66e7c2f1e644 100644
--- a/fs/jffs2/erase.c
+++ b/fs/jffs2/erase.c
@@ -1,14 +1,12 @@
1/* 1/*
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright (C) 2001-2003 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * 5 *
6 * Created by David Woodhouse <dwmw2@infradead.org> 6 * Created by David Woodhouse <dwmw2@infradead.org>
7 * 7 *
8 * For licensing information, see the file 'LICENCE' in this directory. 8 * For licensing information, see the file 'LICENCE' in this directory.
9 * 9 *
10 * $Id: erase.c,v 1.85 2005/09/20 14:53:15 dedekind Exp $
11 *
12 */ 10 */
13 11
14#include <linux/kernel.h> 12#include <linux/kernel.h>
@@ -333,7 +331,7 @@ static int jffs2_block_check_erase(struct jffs2_sb_info *c, struct jffs2_erasebl
333 331
334 *bad_offset = ofs; 332 *bad_offset = ofs;
335 333
336 ret = jffs2_flash_read(c, ofs, readlen, &retlen, ebuf); 334 ret = c->mtd->read(c->mtd, ofs, readlen, &retlen, ebuf);
337 if (ret) { 335 if (ret) {
338 printk(KERN_WARNING "Read of newly-erased block at 0x%08x failed: %d. Putting on bad_list\n", ofs, ret); 336 printk(KERN_WARNING "Read of newly-erased block at 0x%08x failed: %d. Putting on bad_list\n", ofs, ret);
339 goto fail; 337 goto fail;
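
Switching from jffs2_flash_read() to the raw c->mtd->read() here presumably ensures the post-erase check reads the device itself rather than data satisfied from the JFFS2 write-buffer layer. As a sketch of how that raw MTD hook of this era is driven (the wrapper function is illustrative, not from the patch):

/* Sketch: raw MTD read, bypassing the JFFS2 wbuf layer. The hook
 * returns 0 or -errno and reports the bytes read via *retlen. */
static int raw_flash_read(struct jffs2_sb_info *c, loff_t ofs,
			  size_t len, unsigned char *buf)
{
	size_t retlen;
	int err = c->mtd->read(c->mtd, ofs, len, &retlen, buf);

	if (!err && retlen != len)
		err = -EIO;	/* treat a short read as failure */
	return err;
}
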
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index e82eeaf7590d..99871279a1ed 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -1,14 +1,12 @@
1/* 1/*
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright (C) 2001-2003 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * 5 *
6 * Created by David Woodhouse <dwmw2@infradead.org> 6 * Created by David Woodhouse <dwmw2@infradead.org>
7 * 7 *
8 * For licensing information, see the file 'LICENCE' in this directory. 8 * For licensing information, see the file 'LICENCE' in this directory.
9 * 9 *
10 * $Id: file.c,v 1.104 2005/10/18 23:29:35 tpoynor Exp $
11 *
12 */ 10 */
13 11
14#include <linux/kernel.h> 12#include <linux/kernel.h>
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index abb90c0c09cc..1d3b7a9fc828 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -1,14 +1,12 @@
1/* 1/*
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright (C) 2001-2003 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * 5 *
6 * Created by David Woodhouse <dwmw2@infradead.org> 6 * Created by David Woodhouse <dwmw2@infradead.org>
7 * 7 *
8 * For licensing information, see the file 'LICENCE' in this directory. 8 * For licensing information, see the file 'LICENCE' in this directory.
9 * 9 *
10 * $Id: fs.c,v 1.66 2005/09/27 13:17:29 dedekind Exp $
11 *
12 */ 10 */
13 11
14#include <linux/capability.h> 12#include <linux/capability.h>
@@ -672,6 +670,13 @@ static int jffs2_flash_setup(struct jffs2_sb_info *c) {
672 return ret; 670 return ret;
673 } 671 }
674 672
673 /* and an UBI volume */
674 if (jffs2_ubivol(c)) {
675 ret = jffs2_ubivol_setup(c);
676 if (ret)
677 return ret;
678 }
679
675 return ret; 680 return ret;
676} 681}
677 682
@@ -690,4 +695,9 @@ void jffs2_flash_cleanup(struct jffs2_sb_info *c) {
690 if (jffs2_nor_wbuf_flash(c)) { 695 if (jffs2_nor_wbuf_flash(c)) {
691 jffs2_nor_wbuf_flash_cleanup(c); 696 jffs2_nor_wbuf_flash_cleanup(c);
692 } 697 }
698
699 /* and an UBI volume */
700 if (jffs2_ubivol(c)) {
701 jffs2_ubivol_cleanup(c);
702 }
693} 703}
diff --git a/fs/jffs2/gc.c b/fs/jffs2/gc.c
index 3a3cf225981f..2d99e06ab223 100644
--- a/fs/jffs2/gc.c
+++ b/fs/jffs2/gc.c
@@ -1,14 +1,12 @@
1/* 1/*
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright (C) 2001-2003 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * 5 *
6 * Created by David Woodhouse <dwmw2@infradead.org> 6 * Created by David Woodhouse <dwmw2@infradead.org>
7 * 7 *
8 * For licensing information, see the file 'LICENCE' in this directory. 8 * For licensing information, see the file 'LICENCE' in this directory.
9 * 9 *
10 * $Id: gc.c,v 1.155 2005/11/07 11:14:39 gleixner Exp $
11 *
12 */ 10 */
13 11
14#include <linux/kernel.h> 12#include <linux/kernel.h>
@@ -144,7 +142,8 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
144 c->unchecked_size); 142 c->unchecked_size);
145 jffs2_dbg_dump_block_lists_nolock(c); 143 jffs2_dbg_dump_block_lists_nolock(c);
146 spin_unlock(&c->erase_completion_lock); 144 spin_unlock(&c->erase_completion_lock);
147 BUG(); 145 up(&c->alloc_sem);
146 return -ENOSPC;
148 } 147 }
149 148
150 spin_unlock(&c->erase_completion_lock); 149 spin_unlock(&c->erase_completion_lock);
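
The BUG() here used to take the whole kernel down when the garbage collector found nothing collectable; returning -ENOSPC instead lets the caller fail the allocation gracefully. Note the added up(&c->alloc_sem): jffs2_garbage_collect_pass() takes that semaphore near its start, so every new early return must release it. A generic sketch of the shape of such an error path (the ctx struct and predicate are invented for illustration):

/* Sketch: release locks in reverse order of acquisition on the new
 * error path instead of calling BUG(). 'ctx' and the predicate are
 * illustrative stand-ins, not the actual gc.c code. */
struct ctx {
	struct semaphore alloc_sem;
	spinlock_t erase_completion_lock;
};

static int nothing_collectable(struct ctx *c);	/* hypothetical */

static int collect_pass(struct ctx *c)
{
	down(&c->alloc_sem);
	spin_lock(&c->erase_completion_lock);

	if (nothing_collectable(c)) {
		spin_unlock(&c->erase_completion_lock);
		up(&c->alloc_sem);	/* previously: BUG() */
		return -ENOSPC;
	}
	spin_unlock(&c->erase_completion_lock);

	/* ... do the actual garbage collection ... */
	up(&c->alloc_sem);
	return 0;
}
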
diff --git a/fs/jffs2/ioctl.c b/fs/jffs2/ioctl.c
index 69099835de1c..f4d525b0ea53 100644
--- a/fs/jffs2/ioctl.c
+++ b/fs/jffs2/ioctl.c
@@ -1,14 +1,12 @@
1/* 1/*
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright (C) 2001-2003 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * 5 *
6 * Created by David Woodhouse <dwmw2@infradead.org> 6 * Created by David Woodhouse <dwmw2@infradead.org>
7 * 7 *
8 * For licensing information, see the file 'LICENCE' in this directory. 8 * For licensing information, see the file 'LICENCE' in this directory.
9 * 9 *
10 * $Id: ioctl.c,v 1.10 2005/11/07 11:14:40 gleixner Exp $
11 *
12 */ 10 */
13 11
14#include <linux/fs.h> 12#include <linux/fs.h>
diff --git a/fs/jffs2/jffs2_fs_i.h b/fs/jffs2/jffs2_fs_i.h
index 3a566077ac95..0b78fdc9773b 100644
--- a/fs/jffs2/jffs2_fs_i.h
+++ b/fs/jffs2/jffs2_fs_i.h
@@ -1,4 +1,13 @@
1/* $Id: jffs2_fs_i.h,v 1.19 2005/11/07 11:14:52 gleixner Exp $ */ 1/*
2 * JFFS2 -- Journalling Flash File System, Version 2.
3 *
4 * Copyright © 2001-2007 Red Hat, Inc.
5 *
6 * Created by David Woodhouse <dwmw2@infradead.org>
7 *
8 * For licensing information, see the file 'LICENCE' in this directory.
9 *
10 */
2 11
3#ifndef _JFFS2_FS_I 12#ifndef _JFFS2_FS_I
4#define _JFFS2_FS_I 13#define _JFFS2_FS_I
diff --git a/fs/jffs2/jffs2_fs_sb.h b/fs/jffs2/jffs2_fs_sb.h
index ea88f69af130..b13298a824ed 100644
--- a/fs/jffs2/jffs2_fs_sb.h
+++ b/fs/jffs2/jffs2_fs_sb.h
@@ -1,4 +1,13 @@
1/* $Id: jffs2_fs_sb.h,v 1.54 2005/09/21 13:37:34 dedekind Exp $ */ 1/*
2 * JFFS2 -- Journalling Flash File System, Version 2.
3 *
4 * Copyright © 2001-2007 Red Hat, Inc.
5 *
6 * Created by David Woodhouse <dwmw2@infradead.org>
7 *
8 * For licensing information, see the file 'LICENCE' in this directory.
9 *
10 */
2 11
3#ifndef _JFFS2_FS_SB 12#ifndef _JFFS2_FS_SB
4#define _JFFS2_FS_SB 13#define _JFFS2_FS_SB
diff --git a/fs/jffs2/malloc.c b/fs/jffs2/malloc.c
index 83f9881ec4cc..35c1a5e30ba1 100644
--- a/fs/jffs2/malloc.c
+++ b/fs/jffs2/malloc.c
@@ -1,14 +1,12 @@
1/* 1/*
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright (C) 2001-2003 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * 5 *
6 * Created by David Woodhouse <dwmw2@infradead.org> 6 * Created by David Woodhouse <dwmw2@infradead.org>
7 * 7 *
8 * For licensing information, see the file 'LICENCE' in this directory. 8 * For licensing information, see the file 'LICENCE' in this directory.
9 * 9 *
10 * $Id: malloc.c,v 1.31 2005/11/07 11:14:40 gleixner Exp $
11 *
12 */ 10 */
13 11
14#include <linux/kernel.h> 12#include <linux/kernel.h>
diff --git a/fs/jffs2/nodelist.c b/fs/jffs2/nodelist.c
index 5a6b4d64206c..4bf86088b3ae 100644
--- a/fs/jffs2/nodelist.c
+++ b/fs/jffs2/nodelist.c
@@ -1,14 +1,12 @@
1/* 1/*
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright (C) 2001-2003 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * 5 *
6 * Created by David Woodhouse <dwmw2@infradead.org> 6 * Created by David Woodhouse <dwmw2@infradead.org>
7 * 7 *
8 * For licensing information, see the file 'LICENCE' in this directory. 8 * For licensing information, see the file 'LICENCE' in this directory.
9 * 9 *
10 * $Id: nodelist.c,v 1.115 2005/11/07 11:14:40 gleixner Exp $
11 *
12 */ 10 */
13 11
14#include <linux/kernel.h> 12#include <linux/kernel.h>
@@ -54,7 +52,7 @@ void jffs2_add_fd_to_list(struct jffs2_sb_info *c, struct jffs2_full_dirent *new
54 *prev = new; 52 *prev = new;
55} 53}
56 54
57void jffs2_truncate_fragtree(struct jffs2_sb_info *c, struct rb_root *list, uint32_t size) 55uint32_t jffs2_truncate_fragtree(struct jffs2_sb_info *c, struct rb_root *list, uint32_t size)
58{ 56{
59 struct jffs2_node_frag *frag = jffs2_lookup_node_frag(list, size); 57 struct jffs2_node_frag *frag = jffs2_lookup_node_frag(list, size);
60 58
@@ -76,18 +74,24 @@ void jffs2_truncate_fragtree(struct jffs2_sb_info *c, struct rb_root *list, uint
76 } 74 }
77 75
78 if (size == 0) 76 if (size == 0)
79 return; 77 return 0;
80 78
81 /*
82 * If the last fragment starts at the RAM page boundary, it is
83 * REF_PRISTINE irrespective of its size.
84 */
85 frag = frag_last(list); 79 frag = frag_last(list);
80
81 /* Sanity check for truncation to longer than we started with... */
82 if (!frag)
83 return 0;
84 if (frag->ofs + frag->size < size)
85 return frag->ofs + frag->size;
86
87 /* If the last fragment starts at the RAM page boundary, it is
88 * REF_PRISTINE irrespective of its size. */
86 if (frag->node && (frag->ofs & (PAGE_CACHE_SIZE - 1)) == 0) { 89 if (frag->node && (frag->ofs & (PAGE_CACHE_SIZE - 1)) == 0) {
87 dbg_fragtree2("marking the last fragment 0x%08x-0x%08x REF_PRISTINE.\n", 90 dbg_fragtree2("marking the last fragment 0x%08x-0x%08x REF_PRISTINE.\n",
88 frag->ofs, frag->ofs + frag->size); 91 frag->ofs, frag->ofs + frag->size);
89 frag->node->raw->flash_offset = ref_offset(frag->node->raw) | REF_PRISTINE; 92 frag->node->raw->flash_offset = ref_offset(frag->node->raw) | REF_PRISTINE;
90 } 93 }
94 return size;
91} 95}
92 96
93static void jffs2_obsolete_node_frag(struct jffs2_sb_info *c, 97static void jffs2_obsolete_node_frag(struct jffs2_sb_info *c,
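
With this change jffs2_truncate_fragtree() reports the size the tree actually ends at: 0 for a full truncation, frag->ofs + frag->size if the caller asked for more data than exists, otherwise the requested size. A hypothetical caller-side sketch (the function and its use are invented for illustration):

/* Sketch: let the fragtree, not the requested size, decide the new
 * authoritative data length after truncation. */
static uint32_t truncate_inode_data(struct jffs2_sb_info *c,
				    struct jffs2_inode_info *f,
				    uint32_t requested)
{
	uint32_t got = jffs2_truncate_fragtree(c, &f->fragtree, requested);

	if (got != requested)
		dbg_fragtree("fragtree ends at %#x, not %#x\n",
			     got, requested);
	return got;
}
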
@@ -397,466 +401,6 @@ int jffs2_add_full_dnode_to_inode(struct jffs2_sb_info *c, struct jffs2_inode_in
397 return 0; 401 return 0;
398} 402}
399 403
400/*
401 * Check the data CRC of the node.
402 *
403 * Returns: 0 if the data CRC is correct;
404 * 1 - if incorrect;
405 * error code if an error occurred.
406 */
407static int check_node_data(struct jffs2_sb_info *c, struct jffs2_tmp_dnode_info *tn)
408{
409 struct jffs2_raw_node_ref *ref = tn->fn->raw;
410 int err = 0, pointed = 0;
411 struct jffs2_eraseblock *jeb;
412 unsigned char *buffer;
413 uint32_t crc, ofs, len;
414 size_t retlen;
415
416 BUG_ON(tn->csize == 0);
417
418 if (!jffs2_is_writebuffered(c))
419 goto adj_acc;
420
421 /* Calculate how many bytes were already checked */
422 ofs = ref_offset(ref) + sizeof(struct jffs2_raw_inode);
423 len = ofs % c->wbuf_pagesize;
424 if (likely(len))
425 len = c->wbuf_pagesize - len;
426
427 if (len >= tn->csize) {
428 dbg_readinode("no need to check node at %#08x, data length %u, data starts at %#08x - it has already been checked.\n",
429 ref_offset(ref), tn->csize, ofs);
430 goto adj_acc;
431 }
432
433 ofs += len;
434 len = tn->csize - len;
435
436 dbg_readinode("check node at %#08x, data length %u, partial CRC %#08x, correct CRC %#08x, data starts at %#08x, start checking from %#08x - %u bytes.\n",
437 ref_offset(ref), tn->csize, tn->partial_crc, tn->data_crc, ofs - len, ofs, len);
438
439#ifndef __ECOS
440 /* TODO: instead, encapsulate the point() stuff in jffs2_flash_read(),
441 * adding a jffs2_flash_read_end() interface. */
442 if (c->mtd->point) {
443 err = c->mtd->point(c->mtd, ofs, len, &retlen, &buffer);
444 if (!err && retlen < tn->csize) {
445 JFFS2_WARNING("MTD point returned len too short: %zu instead of %u.\n", retlen, tn->csize);
446 c->mtd->unpoint(c->mtd, buffer, ofs, len);
447 } else if (err)
448 JFFS2_WARNING("MTD point failed: error code %d.\n", err);
449 else
450 pointed = 1; /* successfully pointed to device */
451 }
452#endif
453
454 if (!pointed) {
455 buffer = kmalloc(len, GFP_KERNEL);
456 if (unlikely(!buffer))
457 return -ENOMEM;
458
459 /* TODO: this is a very frequent pattern, make it a separate
460 * routine */
461 err = jffs2_flash_read(c, ofs, len, &retlen, buffer);
462 if (err) {
463 JFFS2_ERROR("can not read %d bytes from 0x%08x, error code: %d.\n", len, ofs, err);
464 goto free_out;
465 }
466
467 if (retlen != len) {
468 JFFS2_ERROR("short read at %#08x: %zd instead of %d.\n", ofs, retlen, len);
469 err = -EIO;
470 goto free_out;
471 }
472 }
473
474 /* Continue calculating CRC */
475 crc = crc32(tn->partial_crc, buffer, len);
476 if(!pointed)
477 kfree(buffer);
478#ifndef __ECOS
479 else
480 c->mtd->unpoint(c->mtd, buffer, ofs, len);
481#endif
482
483 if (crc != tn->data_crc) {
484 JFFS2_NOTICE("wrong data CRC in data node at 0x%08x: read %#08x, calculated %#08x.\n",
485 ofs, tn->data_crc, crc);
486 return 1;
487 }
488
489adj_acc:
490 jeb = &c->blocks[ref->flash_offset / c->sector_size];
491 len = ref_totlen(c, jeb, ref);
492
493 /*
494 * Mark the node as having been checked and fix the
495 * accounting accordingly.
496 */
497 spin_lock(&c->erase_completion_lock);
498 jeb->used_size += len;
499 jeb->unchecked_size -= len;
500 c->used_size += len;
501 c->unchecked_size -= len;
502 spin_unlock(&c->erase_completion_lock);
503
504 return 0;
505
506free_out:
507 if(!pointed)
508 kfree(buffer);
509#ifndef __ECOS
510 else
511 c->mtd->unpoint(c->mtd, buffer, ofs, len);
512#endif
513 return err;
514}
515
516/*
517 * Helper function for jffs2_add_older_frag_to_fragtree().
518 *
519 * Checks the node if we are in the checking stage.
520 */
521static int check_node(struct jffs2_sb_info *c, struct jffs2_inode_info *f, struct jffs2_tmp_dnode_info *tn)
522{
523 int ret;
524
525 BUG_ON(ref_obsolete(tn->fn->raw));
526
527 /* We only check the data CRC of unchecked nodes */
528 if (ref_flags(tn->fn->raw) != REF_UNCHECKED)
529 return 0;
530
531 dbg_fragtree2("check node %#04x-%#04x, phys offs %#08x.\n",
532 tn->fn->ofs, tn->fn->ofs + tn->fn->size, ref_offset(tn->fn->raw));
533
534 ret = check_node_data(c, tn);
535 if (unlikely(ret < 0)) {
536 JFFS2_ERROR("check_node_data() returned error: %d.\n",
537 ret);
538 } else if (unlikely(ret > 0)) {
539 dbg_fragtree2("CRC error, mark it obsolete.\n");
540 jffs2_mark_node_obsolete(c, tn->fn->raw);
541 }
542
543 return ret;
544}
545
546/*
547 * Helper function for jffs2_add_older_frag_to_fragtree().
548 *
549 * Called when the new fragment that is being inserted
550 * splits a hole fragment.
551 */
552static int split_hole(struct jffs2_sb_info *c, struct rb_root *root,
553 struct jffs2_node_frag *newfrag, struct jffs2_node_frag *hole)
554{
555 dbg_fragtree2("fragment %#04x-%#04x splits the hole %#04x-%#04x\n",
556 newfrag->ofs, newfrag->ofs + newfrag->size, hole->ofs, hole->ofs + hole->size);
557
558 if (hole->ofs == newfrag->ofs) {
559 /*
560 * Well, the new fragment actually starts at the same offset as
561 * the hole.
562 */
563 if (hole->ofs + hole->size > newfrag->ofs + newfrag->size) {
564 /*
565 * We replace the overlapped left part of the hole by
566 * the new node.
567 */
568
569 dbg_fragtree2("insert fragment %#04x-%#04x and cut the left part of the hole\n",
570 newfrag->ofs, newfrag->ofs + newfrag->size);
571 rb_replace_node(&hole->rb, &newfrag->rb, root);
572
573 hole->ofs += newfrag->size;
574 hole->size -= newfrag->size;
575
576 /*
577 * We know that 'hole' should be the right hand
578 * fragment.
579 */
580 jffs2_fragtree_insert(hole, newfrag);
581 rb_insert_color(&hole->rb, root);
582 } else {
583 /*
584 * Ah, the new fragment is of the same size as the hole.
585 * Replace the hole with it.
586 */
587 dbg_fragtree2("insert fragment %#04x-%#04x and overwrite hole\n",
588 newfrag->ofs, newfrag->ofs + newfrag->size);
589 rb_replace_node(&hole->rb, &newfrag->rb, root);
590 jffs2_free_node_frag(hole);
591 }
592 } else {
593 /* The new fragment leaves some hole space at the left */
594
595 struct jffs2_node_frag * newfrag2 = NULL;
596
597 if (hole->ofs + hole->size > newfrag->ofs + newfrag->size) {
598 /* The new frag also leaves some space at the right */
599 newfrag2 = new_fragment(NULL, newfrag->ofs +
600 newfrag->size, hole->ofs + hole->size
601 - newfrag->ofs - newfrag->size);
602 if (unlikely(!newfrag2)) {
603 jffs2_free_node_frag(newfrag);
604 return -ENOMEM;
605 }
606 }
607
608 hole->size = newfrag->ofs - hole->ofs;
609 dbg_fragtree2("left the hole %#04x-%#04x at the left and inserd fragment %#04x-%#04x\n",
610 hole->ofs, hole->ofs + hole->size, newfrag->ofs, newfrag->ofs + newfrag->size);
611
612 jffs2_fragtree_insert(newfrag, hole);
613 rb_insert_color(&newfrag->rb, root);
614
615 if (newfrag2) {
616 dbg_fragtree2("left the hole %#04x-%#04x at the right\n",
617 newfrag2->ofs, newfrag2->ofs + newfrag2->size);
618 jffs2_fragtree_insert(newfrag2, newfrag);
619 rb_insert_color(&newfrag2->rb, root);
620 }
621 }
622
623 return 0;
624}
625
626/*
627 * This function is used when we build an inode. It expects the nodes to be passed
628 * in decreasing version order. The whole point of this is to improve the
629 * inode checking on NAND: we check the nodes' data CRC only when they are not
630 * obsoleted. Previously, the add_frag_to_fragtree() function was used and
631 * nodes were passed to it in increasing version order, and the CRCs of all
632 * nodes were checked.
633 *
634 * Note: tn->fn->size shouldn't be zero.
635 *
636 * Returns 0 if the node was inserted
637 * 1 if it wasn't inserted (since it is obsolete)
638 * < 0 an if error occured
639 */
640int jffs2_add_older_frag_to_fragtree(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
641 struct jffs2_tmp_dnode_info *tn)
642{
643 struct jffs2_node_frag *this, *newfrag;
644 uint32_t lastend;
645 struct jffs2_full_dnode *fn = tn->fn;
646 struct rb_root *root = &f->fragtree;
647 uint32_t fn_size = fn->size, fn_ofs = fn->ofs;
648 int err, checked = 0;
649 int ref_flag;
650
651 dbg_fragtree("insert fragment %#04x-%#04x, ver %u\n", fn_ofs, fn_ofs + fn_size, tn->version);
652
653 /* Skip all the nodes which are completed before this one starts */
654 this = jffs2_lookup_node_frag(root, fn_ofs);
655 if (this)
656 dbg_fragtree2("'this' found %#04x-%#04x (%s)\n", this->ofs, this->ofs + this->size, this->node ? "data" : "hole");
657
658 if (this)
659 lastend = this->ofs + this->size;
660 else
661 lastend = 0;
662
663 /* Detect the preliminary type of node */
664 if (fn->size >= PAGE_CACHE_SIZE)
665 ref_flag = REF_PRISTINE;
666 else
667 ref_flag = REF_NORMAL;
668
669 /* See if we ran off the end of the root */
670 if (lastend <= fn_ofs) {
671 /* We did */
672
673 /*
674 * We are going to insert the new node into the
675 * fragment tree, so check it.
676 */
677 err = check_node(c, f, tn);
678 if (err != 0)
679 return err;
680
681 fn->frags = 1;
682
683 newfrag = new_fragment(fn, fn_ofs, fn_size);
684 if (unlikely(!newfrag))
685 return -ENOMEM;
686
687 err = no_overlapping_node(c, root, newfrag, this, lastend);
688 if (unlikely(err != 0)) {
689 jffs2_free_node_frag(newfrag);
690 return err;
691 }
692
693 goto out_ok;
694 }
695
696 fn->frags = 0;
697
698 while (1) {
699 /*
700 * Here we have:
701 * fn_ofs < this->ofs + this->size && fn_ofs >= this->ofs.
702 *
703 * Remember, 'this' has higher version, any non-hole node
704 * which is already in the fragtree is newer than the newly
705 * inserted.
706 */
707 if (!this->node) {
708 /*
709 * 'this' is the hole fragment, so at least the
710 * beginning of the new fragment is valid.
711 */
712
713 /*
714 * We are going to insert the new node into the
715 * fragment tree, so check it.
716 */
717 if (!checked) {
718 err = check_node(c, f, tn);
719 if (unlikely(err != 0))
720 return err;
721 checked = 1;
722 }
723
724 if (this->ofs + this->size >= fn_ofs + fn_size) {
725 /* We split the hole on two parts */
726
727 fn->frags += 1;
728 newfrag = new_fragment(fn, fn_ofs, fn_size);
729 if (unlikely(!newfrag))
730 return -ENOMEM;
731
732 err = split_hole(c, root, newfrag, this);
733 if (unlikely(err))
734 return err;
735 goto out_ok;
736 }
737
738 /*
739 * The beginning of the new fragment is valid since it
740 * overlaps the hole node.
741 */
742
743 ref_flag = REF_NORMAL;
744
745 fn->frags += 1;
746 newfrag = new_fragment(fn, fn_ofs,
747 this->ofs + this->size - fn_ofs);
748 if (unlikely(!newfrag))
749 return -ENOMEM;
750
751 if (fn_ofs == this->ofs) {
752 /*
753 * The new node starts at the same offset as
754 * the hole and supersedes it.
755 */
756 dbg_fragtree2("add the new fragment instead of hole %#04x-%#04x, refcnt %d\n",
757 fn_ofs, fn_ofs + this->ofs + this->size - fn_ofs, fn->frags);
758
759 rb_replace_node(&this->rb, &newfrag->rb, root);
760 jffs2_free_node_frag(this);
761 } else {
762 /*
763 * The hole becomes shorter as its right part
764 * is superseded by the new fragment.
765 */
766 dbg_fragtree2("reduce size of hole %#04x-%#04x to %#04x-%#04x\n",
767 this->ofs, this->ofs + this->size, this->ofs, this->ofs + this->size - newfrag->size);
768
769 dbg_fragtree2("add new fragment %#04x-%#04x, refcnt %d\n", fn_ofs,
770 fn_ofs + this->ofs + this->size - fn_ofs, fn->frags);
771
772 this->size -= newfrag->size;
773 jffs2_fragtree_insert(newfrag, this);
774 rb_insert_color(&newfrag->rb, root);
775 }
776
777 fn_ofs += newfrag->size;
778 fn_size -= newfrag->size;
779 this = rb_entry(rb_next(&newfrag->rb),
780 struct jffs2_node_frag, rb);
781
782 dbg_fragtree2("switch to the next 'this' fragment: %#04x-%#04x %s\n",
783 this->ofs, this->ofs + this->size, this->node ? "(data)" : "(hole)");
784 }
785
786 /*
787 * 'This' node is not the hole so it obsoletes the new fragment
788 * either fully or partially.
789 */
790 if (this->ofs + this->size >= fn_ofs + fn_size) {
791 /* The new node is obsolete, drop it */
792 if (fn->frags == 0) {
793 dbg_fragtree2("%#04x-%#04x is obsolete, mark it obsolete\n", fn_ofs, fn_ofs + fn_size);
794 ref_flag = REF_OBSOLETE;
795 }
796 goto out_ok;
797 } else {
798 struct jffs2_node_frag *new_this;
799
800 /* 'This' node obsoletes the beginning of the new node */
801 dbg_fragtree2("the beginning %#04x-%#04x is obsolete\n", fn_ofs, this->ofs + this->size);
802
803 ref_flag = REF_NORMAL;
804
805 fn_size -= this->ofs + this->size - fn_ofs;
806 fn_ofs = this->ofs + this->size;
807 dbg_fragtree2("now considering %#04x-%#04x\n", fn_ofs, fn_ofs + fn_size);
808
809 new_this = rb_entry(rb_next(&this->rb), struct jffs2_node_frag, rb);
810 if (!new_this) {
811 /*
812 * There is no next fragment. Add the rest of
813 * the new node as the right-hand child.
814 */
815 if (!checked) {
816 err = check_node(c, f, tn);
817 if (unlikely(err != 0))
818 return err;
819 checked = 1;
820 }
821
822 fn->frags += 1;
823 newfrag = new_fragment(fn, fn_ofs, fn_size);
824 if (unlikely(!newfrag))
825 return -ENOMEM;
826
827 dbg_fragtree2("there are no more fragments, insert %#04x-%#04x\n",
828 newfrag->ofs, newfrag->ofs + newfrag->size);
829 rb_link_node(&newfrag->rb, &this->rb, &this->rb.rb_right);
830 rb_insert_color(&newfrag->rb, root);
831 goto out_ok;
832 } else {
833 this = new_this;
834 dbg_fragtree2("switch to the next 'this' fragment: %#04x-%#04x %s\n",
835 this->ofs, this->ofs + this->size, this->node ? "(data)" : "(hole)");
836 }
837 }
838 }
839
840out_ok:
841 BUG_ON(fn->size < PAGE_CACHE_SIZE && ref_flag == REF_PRISTINE);
842
843 if (ref_flag == REF_OBSOLETE) {
844 dbg_fragtree2("the node is obsolete now\n");
845 /* jffs2_mark_node_obsolete() will adjust space accounting */
846 jffs2_mark_node_obsolete(c, fn->raw);
847 return 1;
848 }
849
850 dbg_fragtree2("the node is \"%s\" now\n", ref_flag == REF_NORMAL ? "REF_NORMAL" : "REF_PRISTINE");
851
852 /* Space accounting was adjusted by check_node_data() */
853 spin_lock(&c->erase_completion_lock);
854 fn->raw->flash_offset = ref_offset(fn->raw) | ref_flag;
855 spin_unlock(&c->erase_completion_lock);
856
857 return 0;
858}
859
860void jffs2_set_inocache_state(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic, int state) 404void jffs2_set_inocache_state(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic, int state)
861{ 405{
862 spin_lock(&c->inocache_lock); 406 spin_lock(&c->inocache_lock);
diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h
index 4178b4b55948..25126a062cae 100644
--- a/fs/jffs2/nodelist.h
+++ b/fs/jffs2/nodelist.h
@@ -1,14 +1,12 @@
1/* 1/*
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright (C) 2001-2003 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * 5 *
6 * Created by David Woodhouse <dwmw2@infradead.org> 6 * Created by David Woodhouse <dwmw2@infradead.org>
7 * 7 *
8 * For licensing information, see the file 'LICENCE' in this directory. 8 * For licensing information, see the file 'LICENCE' in this directory.
9 * 9 *
10 * $Id: nodelist.h,v 1.140 2005/09/07 08:34:54 havasi Exp $
11 *
12 */ 10 */
13 11
14#ifndef __JFFS2_NODELIST_H__ 12#ifndef __JFFS2_NODELIST_H__
@@ -40,6 +38,9 @@
40#define cpu_to_je32(x) ((jint32_t){x}) 38#define cpu_to_je32(x) ((jint32_t){x})
41#define cpu_to_jemode(x) ((jmode_t){os_to_jffs2_mode(x)}) 39#define cpu_to_jemode(x) ((jmode_t){os_to_jffs2_mode(x)})
42 40
41#define constant_cpu_to_je16(x) ((jint16_t){x})
42#define constant_cpu_to_je32(x) ((jint32_t){x})
43
43#define je16_to_cpu(x) ((x).v16) 44#define je16_to_cpu(x) ((x).v16)
44#define je32_to_cpu(x) ((x).v32) 45#define je32_to_cpu(x) ((x).v32)
45#define jemode_to_cpu(x) (jffs2_to_os_mode((x).m)) 46#define jemode_to_cpu(x) (jffs2_to_os_mode((x).m))
@@ -48,6 +49,9 @@
48#define cpu_to_je32(x) ((jint32_t){cpu_to_be32(x)}) 49#define cpu_to_je32(x) ((jint32_t){cpu_to_be32(x)})
49#define cpu_to_jemode(x) ((jmode_t){cpu_to_be32(os_to_jffs2_mode(x))}) 50#define cpu_to_jemode(x) ((jmode_t){cpu_to_be32(os_to_jffs2_mode(x))})
50 51
52#define constant_cpu_to_je16(x) ((jint16_t){__constant_cpu_to_be16(x)})
53#define constant_cpu_to_je32(x) ((jint32_t){__constant_cpu_to_be32(x)})
54
51#define je16_to_cpu(x) (be16_to_cpu(x.v16)) 55#define je16_to_cpu(x) (be16_to_cpu(x.v16))
52#define je32_to_cpu(x) (be32_to_cpu(x.v32)) 56#define je32_to_cpu(x) (be32_to_cpu(x.v32))
53#define jemode_to_cpu(x) (be32_to_cpu(jffs2_to_os_mode((x).m))) 57#define jemode_to_cpu(x) (be32_to_cpu(jffs2_to_os_mode((x).m)))
@@ -56,6 +60,9 @@
56#define cpu_to_je32(x) ((jint32_t){cpu_to_le32(x)}) 60#define cpu_to_je32(x) ((jint32_t){cpu_to_le32(x)})
57#define cpu_to_jemode(x) ((jmode_t){cpu_to_le32(os_to_jffs2_mode(x))}) 61#define cpu_to_jemode(x) ((jmode_t){cpu_to_le32(os_to_jffs2_mode(x))})
58 62
63#define constant_cpu_to_je16(x) ((jint16_t){__constant_cpu_to_le16(x)})
64#define constant_cpu_to_je32(x) ((jint32_t){__constant_cpu_to_le32(x)})
65
59#define je16_to_cpu(x) (le16_to_cpu(x.v16)) 66#define je16_to_cpu(x) (le16_to_cpu(x.v16))
60#define je32_to_cpu(x) (le32_to_cpu(x.v32)) 67#define je32_to_cpu(x) (le32_to_cpu(x.v32))
61#define jemode_to_cpu(x) (le32_to_cpu(jffs2_to_os_mode((x).m))) 68#define jemode_to_cpu(x) (le32_to_cpu(jffs2_to_os_mode((x).m)))
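
The new constant_cpu_to_je16/32 macros wrap the __constant_* byte-swap forms, so endian-converted values can appear in static initializers where the runtime helpers cannot. A hedged usage sketch (the initializer is illustrative, not part of this hunk):

/* Sketch: a compile-time endian-converted static node header, which
 * is exactly what the constant_cpu_to_je* variants make possible. */
static const struct jffs2_unknown_node example_marker = {
	.magic    = constant_cpu_to_je16(JFFS2_MAGIC_BITMASK),
	.nodetype = constant_cpu_to_je16(JFFS2_NODETYPE_CLEANMARKER),
	.totlen   = constant_cpu_to_je32(8),
};
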
@@ -216,7 +223,20 @@ struct jffs2_tmp_dnode_info
216 uint32_t version; 223 uint32_t version;
217 uint32_t data_crc; 224 uint32_t data_crc;
218 uint32_t partial_crc; 225 uint32_t partial_crc;
219 uint32_t csize; 226 uint16_t csize;
227 uint16_t overlapped;
228};
229
230/* Temporary data structure used during readinode. */
231struct jffs2_readinode_info
232{
233 struct rb_root tn_root;
234 struct jffs2_tmp_dnode_info *mdata_tn;
235 uint32_t highest_version;
236 uint32_t latest_mctime;
237 uint32_t mctime_ver;
238 struct jffs2_full_dirent *fds;
239 struct jffs2_raw_node_ref *latest_ref;
220}; 240};
221 241
222struct jffs2_full_dirent 242struct jffs2_full_dirent
@@ -319,6 +339,15 @@ static inline struct jffs2_node_frag *frag_last(struct rb_root *root)
319#define frag_right(frag) rb_entry((frag)->rb.rb_right, struct jffs2_node_frag, rb) 339#define frag_right(frag) rb_entry((frag)->rb.rb_right, struct jffs2_node_frag, rb)
320#define frag_erase(frag, list) rb_erase(&frag->rb, list); 340#define frag_erase(frag, list) rb_erase(&frag->rb, list);
321 341
342#define tn_next(tn) rb_entry(rb_next(&(tn)->rb), struct jffs2_tmp_dnode_info, rb)
343#define tn_prev(tn) rb_entry(rb_prev(&(tn)->rb), struct jffs2_tmp_dnode_info, rb)
344#define tn_parent(tn) rb_entry(rb_parent(&(tn)->rb), struct jffs2_tmp_dnode_info, rb)
345#define tn_left(tn) rb_entry((tn)->rb.rb_left, struct jffs2_tmp_dnode_info, rb)
346#define tn_right(tn) rb_entry((tn)->rb.rb_right, struct jffs2_tmp_dnode_info, rb)
347#define tn_erase(tn, list) rb_erase(&tn->rb, list);
348#define tn_last(list) rb_entry(rb_last(list), struct jffs2_tmp_dnode_info, rb)
349#define tn_first(list) rb_entry(rb_first(list), struct jffs2_tmp_dnode_info, rb)
350
322/* nodelist.c */ 351/* nodelist.c */
323void jffs2_add_fd_to_list(struct jffs2_sb_info *c, struct jffs2_full_dirent *new, struct jffs2_full_dirent **list); 352void jffs2_add_fd_to_list(struct jffs2_sb_info *c, struct jffs2_full_dirent *new, struct jffs2_full_dirent **list);
324void jffs2_set_inocache_state(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic, int state); 353void jffs2_set_inocache_state(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic, int state);
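
The tn_* wrappers mirror the frag_* accessors just above them, giving typed rb-tree navigation over the temporary dnode info used during readinode. A sketch of an in-order walk with them (the walker is illustrative; it also assumes, as elsewhere in JFFS2, that the rb node is the struct's first member so rb_entry() of NULL stays NULL):

/* Sketch: ascending walk of a tmp_dnode_info tree using the
 * tn_first()/tn_next() wrappers added above. */
static void walk_tn_tree(struct rb_root *root)
{
	struct jffs2_tmp_dnode_info *tn;

	for (tn = tn_first(root); tn; tn = tn_next(tn))
		dbg_readinode("tn version %u\n", tn->version);
}
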
@@ -333,8 +362,7 @@ struct rb_node *rb_next(struct rb_node *);
333struct rb_node *rb_prev(struct rb_node *); 362struct rb_node *rb_prev(struct rb_node *);
334void rb_replace_node(struct rb_node *victim, struct rb_node *new, struct rb_root *root); 363void rb_replace_node(struct rb_node *victim, struct rb_node *new, struct rb_root *root);
335int jffs2_add_full_dnode_to_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f, struct jffs2_full_dnode *fn); 364int jffs2_add_full_dnode_to_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f, struct jffs2_full_dnode *fn);
336void jffs2_truncate_fragtree (struct jffs2_sb_info *c, struct rb_root *list, uint32_t size); 365uint32_t jffs2_truncate_fragtree (struct jffs2_sb_info *c, struct rb_root *list, uint32_t size);
337int jffs2_add_older_frag_to_fragtree(struct jffs2_sb_info *c, struct jffs2_inode_info *f, struct jffs2_tmp_dnode_info *tn);
338struct jffs2_raw_node_ref *jffs2_link_node_ref(struct jffs2_sb_info *c, 366struct jffs2_raw_node_ref *jffs2_link_node_ref(struct jffs2_sb_info *c,
339 struct jffs2_eraseblock *jeb, 367 struct jffs2_eraseblock *jeb,
340 uint32_t ofs, uint32_t len, 368 uint32_t ofs, uint32_t len,
diff --git a/fs/jffs2/nodemgmt.c b/fs/jffs2/nodemgmt.c
index d88376992ed9..dbc908ad622b 100644
--- a/fs/jffs2/nodemgmt.c
+++ b/fs/jffs2/nodemgmt.c
@@ -1,14 +1,12 @@
1/* 1/*
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright (C) 2001-2003 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * 5 *
6 * Created by David Woodhouse <dwmw2@infradead.org> 6 * Created by David Woodhouse <dwmw2@infradead.org>
7 * 7 *
8 * For licensing information, see the file 'LICENCE' in this directory. 8 * For licensing information, see the file 'LICENCE' in this directory.
9 * 9 *
10 * $Id: nodemgmt.c,v 1.127 2005/09/20 15:49:12 dedekind Exp $
11 *
12 */ 10 */
13 11
14#include <linux/kernel.h> 12#include <linux/kernel.h>
@@ -172,6 +170,11 @@ int jffs2_reserve_space_gc(struct jffs2_sb_info *c, uint32_t minsize,
172static void jffs2_close_nextblock(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb) 170static void jffs2_close_nextblock(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb)
173{ 171{
174 172
173 if (c->nextblock == NULL) {
174 D1(printk(KERN_DEBUG "jffs2_close_nextblock: Erase block at 0x%08x has already been placed in a list\n",
175 jeb->offset));
176 return;
177 }
175 /* Check, if we have a dirty block now, or if it was dirty already */ 178 /* Check, if we have a dirty block now, or if it was dirty already */
176 if (ISDIRTY (jeb->wasted_size + jeb->dirty_size)) { 179 if (ISDIRTY (jeb->wasted_size + jeb->dirty_size)) {
177 c->dirty_size += jeb->wasted_size; 180 c->dirty_size += jeb->wasted_size;
diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h
index e07a0edcdb4f..80daea96bbc2 100644
--- a/fs/jffs2/os-linux.h
+++ b/fs/jffs2/os-linux.h
@@ -1,14 +1,12 @@
1/* 1/*
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright (C) 2002-2003 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * 5 *
6 * Created by David Woodhouse <dwmw2@infradead.org> 6 * Created by David Woodhouse <dwmw2@infradead.org>
7 * 7 *
8 * For licensing information, see the file 'LICENCE' in this directory. 8 * For licensing information, see the file 'LICENCE' in this directory.
9 * 9 *
10 * $Id: os-linux.h,v 1.64 2005/09/30 13:59:13 dedekind Exp $
11 *
12 */ 10 */
13 11
14#ifndef __JFFS2_OS_LINUX_H__ 12#ifndef __JFFS2_OS_LINUX_H__
@@ -98,6 +96,9 @@ static inline void jffs2_init_inode_info(struct jffs2_inode_info *f)
98#define jffs2_nor_wbuf_flash(c) (0) 96#define jffs2_nor_wbuf_flash(c) (0)
99#define jffs2_nor_wbuf_flash_setup(c) (0) 97#define jffs2_nor_wbuf_flash_setup(c) (0)
100#define jffs2_nor_wbuf_flash_cleanup(c) do {} while (0) 98#define jffs2_nor_wbuf_flash_cleanup(c) do {} while (0)
99#define jffs2_ubivol(c) (0)
100#define jffs2_ubivol_setup(c) (0)
101#define jffs2_ubivol_cleanup(c) do {} while (0)
101 102
102#else /* NAND and/or ECC'd NOR support present */ 103#else /* NAND and/or ECC'd NOR support present */
103 104
@@ -133,6 +134,9 @@ void jffs2_nand_flash_cleanup(struct jffs2_sb_info *c);
133#define jffs2_dataflash(c) (c->mtd->type == MTD_DATAFLASH) 134#define jffs2_dataflash(c) (c->mtd->type == MTD_DATAFLASH)
134int jffs2_dataflash_setup(struct jffs2_sb_info *c); 135int jffs2_dataflash_setup(struct jffs2_sb_info *c);
135void jffs2_dataflash_cleanup(struct jffs2_sb_info *c); 136void jffs2_dataflash_cleanup(struct jffs2_sb_info *c);
137#define jffs2_ubivol(c) (c->mtd->type == MTD_UBIVOLUME)
138int jffs2_ubivol_setup(struct jffs2_sb_info *c);
139void jffs2_ubivol_cleanup(struct jffs2_sb_info *c);
136 140
137#define jffs2_nor_wbuf_flash(c) (c->mtd->type == MTD_NORFLASH && ! (c->mtd->flags & MTD_BIT_WRITEABLE)) 141#define jffs2_nor_wbuf_flash(c) (c->mtd->type == MTD_NORFLASH && ! (c->mtd->flags & MTD_BIT_WRITEABLE))
138int jffs2_nor_wbuf_flash_setup(struct jffs2_sb_info *c); 142int jffs2_nor_wbuf_flash_setup(struct jffs2_sb_info *c);
diff --git a/fs/jffs2/pushpull.h b/fs/jffs2/pushpull.h
deleted file mode 100644
index c0c2a9158dff..000000000000
--- a/fs/jffs2/pushpull.h
+++ /dev/null
@@ -1,72 +0,0 @@
1/*
2 * JFFS2 -- Journalling Flash File System, Version 2.
3 *
4 * Copyright (C) 2001, 2002 Red Hat, Inc.
5 *
6 * Created by David Woodhouse <dwmw2@infradead.org>
7 *
8 * For licensing information, see the file 'LICENCE' in this directory.
9 *
10 * $Id: pushpull.h,v 1.10 2004/11/16 20:36:11 dwmw2 Exp $
11 *
12 */
13
14#ifndef __PUSHPULL_H__
15#define __PUSHPULL_H__
16
17#include <linux/errno.h>
18
19struct pushpull {
20 unsigned char *buf;
21 unsigned int buflen;
22 unsigned int ofs;
23 unsigned int reserve;
24};
25
26
27static inline void init_pushpull(struct pushpull *pp, char *buf, unsigned buflen, unsigned ofs, unsigned reserve)
28{
29 pp->buf = buf;
30 pp->buflen = buflen;
31 pp->ofs = ofs;
32 pp->reserve = reserve;
33}
34
35static inline int pushbit(struct pushpull *pp, int bit, int use_reserved)
36{
37 if (pp->ofs >= pp->buflen - (use_reserved?0:pp->reserve)) {
38 return -ENOSPC;
39 }
40
41 if (bit) {
42 pp->buf[pp->ofs >> 3] |= (1<<(7-(pp->ofs &7)));
43 }
44 else {
45 pp->buf[pp->ofs >> 3] &= ~(1<<(7-(pp->ofs &7)));
46 }
47 pp->ofs++;
48
49 return 0;
50}
51
52static inline int pushedbits(struct pushpull *pp)
53{
54 return pp->ofs;
55}
56
57static inline int pullbit(struct pushpull *pp)
58{
59 int bit;
60
61 bit = (pp->buf[pp->ofs >> 3] >> (7-(pp->ofs & 7))) & 1;
62
63 pp->ofs++;
64 return bit;
65}
66
67static inline int pulledbits(struct pushpull *pp)
68{
69 return pp->ofs;
70}
71
72#endif /* __PUSHPULL_H__ */
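
The removed pushpull helpers implemented an MSB-first bit stream over a byte buffer (note that buflen is counted in bits), with a reserved tail that pushbit() refuses to enter unless use_reserved is set; they were used by the rubin compressor. A hypothetical round-trip sketch:

/* Hypothetical pushpull round-trip: write the bits 1,0,1 MSB-first,
 * then read them back. buflen is given in bits. */
static void pushpull_demo(void)
{
	unsigned char buf[4] = { 0 };
	struct pushpull pp;

	init_pushpull(&pp, (char *)buf, sizeof(buf) * 8, 0, 0);
	pushbit(&pp, 1, 0);
	pushbit(&pp, 0, 0);
	pushbit(&pp, 1, 0);	/* buf[0] is now 0xa0 */

	init_pushpull(&pp, (char *)buf, sizeof(buf) * 8, 0, 0);
	/* pullbit() yields 1, 0, 1; pulledbits(&pp) then reports 3. */
	pullbit(&pp);
	pullbit(&pp);
	pullbit(&pp);
}
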
diff --git a/fs/jffs2/read.c b/fs/jffs2/read.c
index f3b86da833ba..cfe05c1966a5 100644
--- a/fs/jffs2/read.c
+++ b/fs/jffs2/read.c
@@ -1,14 +1,12 @@
1/* 1/*
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright (C) 2001-2003 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * 5 *
6 * Created by David Woodhouse <dwmw2@infradead.org> 6 * Created by David Woodhouse <dwmw2@infradead.org>
7 * 7 *
8 * For licensing information, see the file 'LICENCE' in this directory. 8 * For licensing information, see the file 'LICENCE' in this directory.
9 * 9 *
10 * $Id: read.c,v 1.42 2005/11/07 11:14:41 gleixner Exp $
11 *
12 */ 10 */
13 11
14#include <linux/kernel.h> 12#include <linux/kernel.h>
diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c
index 717a48cf7df2..6aff38930b50 100644
--- a/fs/jffs2/readinode.c
+++ b/fs/jffs2/readinode.c
@@ -1,14 +1,12 @@
1/* 1/*
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright (C) 2001-2003 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * 5 *
6 * Created by David Woodhouse <dwmw2@infradead.org> 6 * Created by David Woodhouse <dwmw2@infradead.org>
7 * 7 *
8 * For licensing information, see the file 'LICENCE' in this directory. 8 * For licensing information, see the file 'LICENCE' in this directory.
9 * 9 *
10 * $Id: readinode.c,v 1.143 2005/11/07 11:14:41 gleixner Exp $
11 *
12 */ 10 */
13 11
14#include <linux/kernel.h> 12#include <linux/kernel.h>
@@ -22,30 +20,510 @@
22#include "nodelist.h" 20#include "nodelist.h"
23 21
24/* 22/*
25 * Put a new tmp_dnode_info into the temporary RB-tree, keeping the list in 23 * Check the data CRC of the node.
26 * order of increasing version. 24 *
25 * Returns: 0 if the data CRC is correct;
26 * 1 - if incorrect;
 27 * error code if an error occurred.
27 */ 28 */
28static void jffs2_add_tn_to_tree(struct jffs2_tmp_dnode_info *tn, struct rb_root *list) 29static int check_node_data(struct jffs2_sb_info *c, struct jffs2_tmp_dnode_info *tn)
29{ 30{
30 struct rb_node **p = &list->rb_node; 31 struct jffs2_raw_node_ref *ref = tn->fn->raw;
31 struct rb_node * parent = NULL; 32 int err = 0, pointed = 0;
32 struct jffs2_tmp_dnode_info *this; 33 struct jffs2_eraseblock *jeb;
33 34 unsigned char *buffer;
34 while (*p) { 35 uint32_t crc, ofs, len;
35 parent = *p; 36 size_t retlen;
36 this = rb_entry(parent, struct jffs2_tmp_dnode_info, rb); 37
37 38 BUG_ON(tn->csize == 0);
38 /* There may actually be a collision here, but it doesn't 39
39 actually matter. As long as the two nodes with the same 40 if (!jffs2_is_writebuffered(c))
40 version are together, it's all fine. */ 41 goto adj_acc;
41 if (tn->version > this->version) 42
42 p = &(*p)->rb_left; 43 /* Calculate how many bytes were already checked */
44 ofs = ref_offset(ref) + sizeof(struct jffs2_raw_inode);
45 len = ofs % c->wbuf_pagesize;
46 if (likely(len))
47 len = c->wbuf_pagesize - len;
48
49 if (len >= tn->csize) {
50 dbg_readinode("no need to check node at %#08x, data length %u, data starts at %#08x - it has already been checked.\n",
51 ref_offset(ref), tn->csize, ofs);
52 goto adj_acc;
53 }
54
55 ofs += len;
56 len = tn->csize - len;
57
58 dbg_readinode("check node at %#08x, data length %u, partial CRC %#08x, correct CRC %#08x, data starts at %#08x, start checking from %#08x - %u bytes.\n",
59 ref_offset(ref), tn->csize, tn->partial_crc, tn->data_crc, ofs - len, ofs, len);
60
61#ifndef __ECOS
 62 /* TODO: instead, encapsulate the point() handling in jffs2_flash_read(),
 63 * adding a jffs2_flash_read_end() interface. */
64 if (c->mtd->point) {
65 err = c->mtd->point(c->mtd, ofs, len, &retlen, &buffer);
66 if (!err && retlen < tn->csize) {
67 JFFS2_WARNING("MTD point returned len too short: %zu instead of %u.\n", retlen, tn->csize);
68 c->mtd->unpoint(c->mtd, buffer, ofs, len);
69 } else if (err)
70 JFFS2_WARNING("MTD point failed: error code %d.\n", err);
43 else 71 else
 44 p = &(*p)->rb_right; 72 pointed = 1; /* successfully pointed to device */
73 }
74#endif
75
76 if (!pointed) {
77 buffer = kmalloc(len, GFP_KERNEL);
78 if (unlikely(!buffer))
79 return -ENOMEM;
80
 81 /* TODO: this is a very frequent pattern; make it a separate
82 * routine */
83 err = jffs2_flash_read(c, ofs, len, &retlen, buffer);
84 if (err) {
85 JFFS2_ERROR("can not read %d bytes from 0x%08x, error code: %d.\n", len, ofs, err);
86 goto free_out;
87 }
88
89 if (retlen != len) {
90 JFFS2_ERROR("short read at %#08x: %zd instead of %d.\n", ofs, retlen, len);
91 err = -EIO;
92 goto free_out;
93 }
94 }
95
96 /* Continue calculating CRC */
97 crc = crc32(tn->partial_crc, buffer, len);
98 if(!pointed)
99 kfree(buffer);
100#ifndef __ECOS
101 else
102 c->mtd->unpoint(c->mtd, buffer, ofs, len);
103#endif
104
105 if (crc != tn->data_crc) {
106 JFFS2_NOTICE("wrong data CRC in data node at 0x%08x: read %#08x, calculated %#08x.\n",
107 ofs, tn->data_crc, crc);
108 return 1;
45 } 109 }
46 110
47 rb_link_node(&tn->rb, parent, p); 111adj_acc:
48 rb_insert_color(&tn->rb, list); 112 jeb = &c->blocks[ref->flash_offset / c->sector_size];
113 len = ref_totlen(c, jeb, ref);
114 /* If it should be REF_NORMAL, it'll get marked as such when
115 we build the fragtree, shortly. No need to worry about GC
116 moving it while it's marked REF_PRISTINE -- GC won't happen
117 till we've finished checking every inode anyway. */
118 ref->flash_offset |= REF_PRISTINE;
119 /*
120 * Mark the node as having been checked and fix the
121 * accounting accordingly.
122 */
123 spin_lock(&c->erase_completion_lock);
124 jeb->used_size += len;
125 jeb->unchecked_size -= len;
126 c->used_size += len;
127 c->unchecked_size -= len;
128 jffs2_dbg_acct_paranoia_check_nolock(c, jeb);
129 spin_unlock(&c->erase_completion_lock);
130
131 return 0;
132
133free_out:
134 if(!pointed)
135 kfree(buffer);
136#ifndef __ECOS
137 else
138 c->mtd->unpoint(c->mtd, buffer, ofs, len);
139#endif
140 return err;
141}
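The partial-CRC logic above depends on crc32() being resumable: continuing over the tail of a buffer with the CRC of the head as the seed gives the same value as one pass over the whole buffer. A standalone sketch using zlib's crc32() (illustrative only -- zlib's bit conventions differ from the kernel crc32() JFFS2 uses, but the resumability property is the same):

#include <assert.h>
#include <string.h>
#include <zlib.h>

int main(void)
{
        const unsigned char data[] = "jffs2 node payload";
        size_t len = sizeof(data) - 1, split = 5;

        uLong whole = crc32(0L, data, len);
        uLong part  = crc32(0L, data, split);           /* like tn->partial_crc */
        part = crc32(part, data + split, len - split);  /* resume past the split */
        assert(part == whole);                          /* build with -lz */
        return 0;
}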
142
143/*
144 * Helper function for jffs2_add_older_frag_to_fragtree().
145 *
146 * Checks the node if we are in the checking stage.
147 */
148static int check_tn_node(struct jffs2_sb_info *c, struct jffs2_tmp_dnode_info *tn)
149{
150 int ret;
151
152 BUG_ON(ref_obsolete(tn->fn->raw));
153
154 /* We only check the data CRC of unchecked nodes */
155 if (ref_flags(tn->fn->raw) != REF_UNCHECKED)
156 return 0;
157
158 dbg_readinode("check node %#04x-%#04x, phys offs %#08x\n",
159 tn->fn->ofs, tn->fn->ofs + tn->fn->size, ref_offset(tn->fn->raw));
160
161 ret = check_node_data(c, tn);
162 if (unlikely(ret < 0)) {
163 JFFS2_ERROR("check_node_data() returned error: %d.\n",
164 ret);
165 } else if (unlikely(ret > 0)) {
166 dbg_readinode("CRC error, mark it obsolete.\n");
167 jffs2_mark_node_obsolete(c, tn->fn->raw);
168 }
169
170 return ret;
171}
172
173static struct jffs2_tmp_dnode_info *jffs2_lookup_tn(struct rb_root *tn_root, uint32_t offset)
174{
175 struct rb_node *next;
176 struct jffs2_tmp_dnode_info *tn = NULL;
177
178 dbg_readinode("root %p, offset %d\n", tn_root, offset);
179
180 next = tn_root->rb_node;
181
182 while (next) {
183 tn = rb_entry(next, struct jffs2_tmp_dnode_info, rb);
184
185 if (tn->fn->ofs < offset)
186 next = tn->rb.rb_right;
187 else if (tn->fn->ofs >= offset)
188 next = tn->rb.rb_left;
189 else
190 break;
191 }
192
193 return tn;
194}
195
196
197static void jffs2_kill_tn(struct jffs2_sb_info *c, struct jffs2_tmp_dnode_info *tn)
198{
199 jffs2_mark_node_obsolete(c, tn->fn->raw);
200 jffs2_free_full_dnode(tn->fn);
201 jffs2_free_tmp_dnode_info(tn);
202}
203/*
204 * This function is used when we read an inode. Data nodes arrive in
205 * arbitrary order -- they may be older or newer than the nodes which
206 * are already in the tree. Where overlaps occur, the older node can
207 * be discarded as long as the newer passes the CRC check. We don't
208 * bother to keep track of holes in this rbtree, and neither do we deal
209 * with frags -- we can have multiple entries starting at the same
210 * offset, and the one with the smallest length will come first in the
211 * ordering.
212 *
213 * Returns 0 if the node was inserted
214 * 1 if the node is obsolete (because we can't mark it so yet)
215 * < 0 an if error occurred
216 */
217static int jffs2_add_tn_to_tree(struct jffs2_sb_info *c,
218 struct jffs2_readinode_info *rii,
219 struct jffs2_tmp_dnode_info *tn)
220{
221 uint32_t fn_end = tn->fn->ofs + tn->fn->size;
222 struct jffs2_tmp_dnode_info *insert_point = NULL, *this;
223
224 dbg_readinode("insert fragment %#04x-%#04x, ver %u\n", tn->fn->ofs, fn_end, tn->version);
225
 226 /* If a node has zero dsize, we only have to keep it if it might be the
227 node with highest version -- i.e. the one which will end up as f->metadata.
228 Note that such nodes won't be REF_UNCHECKED since there are no data to
229 check anyway. */
230 if (!tn->fn->size) {
231 if (rii->mdata_tn) {
232 /* We had a candidate mdata node already */
233 dbg_readinode("kill old mdata with ver %d\n", rii->mdata_tn->version);
234 jffs2_kill_tn(c, rii->mdata_tn);
235 }
236 rii->mdata_tn = tn;
237 dbg_readinode("keep new mdata with ver %d\n", tn->version);
238 return 0;
239 }
240
241 /* Find the earliest node which _may_ be relevant to this one */
242 this = jffs2_lookup_tn(&rii->tn_root, tn->fn->ofs);
243 if (!this) {
244 /* First addition to empty tree. $DEITY how I love the easy cases */
245 rb_link_node(&tn->rb, NULL, &rii->tn_root.rb_node);
246 rb_insert_color(&tn->rb, &rii->tn_root);
247 dbg_readinode("keep new frag\n");
248 return 0;
249 }
250
251 /* If we add a new node it'll be somewhere under here. */
252 insert_point = this;
253
254 /* If the node is coincident with another at a lower address,
255 back up until the other node is found. It may be relevant */
256 while (tn->overlapped)
257 tn = tn_prev(tn);
258
259 dbg_readinode("'this' found %#04x-%#04x (%s)\n", this->fn->ofs, this->fn->ofs + this->fn->size, this->fn ? "data" : "hole");
260
261 while (this) {
262 if (this->fn->ofs > fn_end)
263 break;
264 dbg_readinode("Ponder this ver %d, 0x%x-0x%x\n",
265 this->version, this->fn->ofs, this->fn->size);
266
267 if (this->version == tn->version) {
268 /* Version number collision means REF_PRISTINE GC. Accept either of them
269 as long as the CRC is correct. Check the one we have already... */
270 if (!check_tn_node(c, this)) {
271 /* The one we already had was OK. Keep it and throw away the new one */
272 dbg_readinode("Like old node. Throw away new\n");
273 jffs2_kill_tn(c, tn);
274 return 0;
275 } else {
276 /* Who cares if the new one is good; keep it for now anyway. */
277 rb_replace_node(&this->rb, &tn->rb, &rii->tn_root);
 278 /* Same overlap state carries over, from in front and behind */
279 tn->overlapped = this->overlapped;
280 jffs2_kill_tn(c, this);
281 dbg_readinode("Like new node. Throw away old\n");
282 return 0;
283 }
284 }
285 if (this->version < tn->version &&
286 this->fn->ofs >= tn->fn->ofs &&
287 this->fn->ofs + this->fn->size <= fn_end) {
288 /* New node entirely overlaps 'this' */
289 if (check_tn_node(c, tn)) {
290 dbg_readinode("new node bad CRC\n");
291 jffs2_kill_tn(c, tn);
292 return 0;
293 }
294 /* ... and is good. Kill 'this'... */
295 rb_replace_node(&this->rb, &tn->rb, &rii->tn_root);
296 tn->overlapped = this->overlapped;
297 jffs2_kill_tn(c, this);
298 /* ... and any subsequent nodes which are also overlapped */
299 this = tn_next(tn);
300 while (this && this->fn->ofs + this->fn->size < fn_end) {
301 struct jffs2_tmp_dnode_info *next = tn_next(this);
302 if (this->version < tn->version) {
303 tn_erase(this, &rii->tn_root);
304 dbg_readinode("Kill overlapped ver %d, 0x%x-0x%x\n",
305 this->version, this->fn->ofs,
306 this->fn->ofs+this->fn->size);
307 jffs2_kill_tn(c, this);
308 }
309 this = next;
310 }
311 dbg_readinode("Done inserting new\n");
312 return 0;
313 }
314 if (this->version > tn->version &&
315 this->fn->ofs <= tn->fn->ofs &&
316 this->fn->ofs+this->fn->size >= fn_end) {
317 /* New node entirely overlapped by 'this' */
318 if (!check_tn_node(c, this)) {
319 dbg_readinode("Good CRC on old node. Kill new\n");
320 jffs2_kill_tn(c, tn);
321 return 0;
322 }
323 /* ... but 'this' was bad. Replace it... */
324 rb_replace_node(&this->rb, &tn->rb, &rii->tn_root);
325 dbg_readinode("Bad CRC on old overlapping node. Kill it\n");
326 jffs2_kill_tn(c, this);
327 return 0;
328 }
329 /* We want to be inserted under the last node which is
330 either at a lower offset _or_ has a smaller range */
331 if (this->fn->ofs < tn->fn->ofs ||
332 (this->fn->ofs == tn->fn->ofs &&
333 this->fn->size <= tn->fn->size))
334 insert_point = this;
335
336 this = tn_next(this);
337 }
338 dbg_readinode("insert_point %p, ver %d, 0x%x-0x%x, ov %d\n",
339 insert_point, insert_point->version, insert_point->fn->ofs,
340 insert_point->fn->ofs+insert_point->fn->size,
341 insert_point->overlapped);
342 /* We neither completely obsoleted nor were completely
343 obsoleted by an earlier node. Insert under insert_point */
344 {
345 struct rb_node *parent = &insert_point->rb;
346 struct rb_node **link = &parent;
347
348 while (*link) {
349 parent = *link;
350 insert_point = rb_entry(parent, struct jffs2_tmp_dnode_info, rb);
351 if (tn->fn->ofs > insert_point->fn->ofs)
352 link = &insert_point->rb.rb_right;
353 else if (tn->fn->ofs < insert_point->fn->ofs ||
354 tn->fn->size < insert_point->fn->size)
355 link = &insert_point->rb.rb_left;
356 else
357 link = &insert_point->rb.rb_right;
358 }
359 rb_link_node(&tn->rb, &insert_point->rb, link);
360 rb_insert_color(&tn->rb, &rii->tn_root);
361 }
362 /* If there's anything behind that overlaps us, note it */
363 this = tn_prev(tn);
364 if (this) {
365 while (1) {
366 if (this->fn->ofs + this->fn->size > tn->fn->ofs) {
367 dbg_readinode("Node is overlapped by %p (v %d, 0x%x-0x%x)\n",
368 this, this->version, this->fn->ofs,
369 this->fn->ofs+this->fn->size);
370 tn->overlapped = 1;
371 break;
372 }
373 if (!this->overlapped)
374 break;
375 this = tn_prev(this);
376 }
377 }
378
379 /* If the new node overlaps anything ahead, note it */
380 this = tn_next(tn);
381 while (this && this->fn->ofs < fn_end) {
382 this->overlapped = 1;
383 dbg_readinode("Node ver %d, 0x%x-0x%x is overlapped\n",
384 this->version, this->fn->ofs,
385 this->fn->ofs+this->fn->size);
386 this = tn_next(this);
387 }
388 return 0;
389}
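The loop above classifies each existing node against the incoming range [tn->fn->ofs, fn_end): an old node fully covered by a good new one dies, a good old node fully covering the new one kills the new one, and partial overlaps keep both with the overlapped flag set. A self-contained sketch of just that interval classification (names are illustrative, not JFFS2 API):

#include <stdio.h>
#include <stdint.h>

enum overlap { OV_NONE, OV_NEW_COVERS_OLD, OV_OLD_COVERS_NEW, OV_PARTIAL };

/* classify how the new range [nofs, nend) relates to an old [oofs, oend) */
static enum overlap classify(uint32_t nofs, uint32_t nend,
                             uint32_t oofs, uint32_t oend)
{
        if (oend <= nofs || nend <= oofs)
                return OV_NONE;
        if (oofs >= nofs && oend <= nend)
                return OV_NEW_COVERS_OLD;       /* old node entirely inside new */
        if (oofs <= nofs && oend >= nend)
                return OV_OLD_COVERS_NEW;       /* new node entirely inside old */
        return OV_PARTIAL;                      /* keep both, mark overlapped */
}

int main(void)
{
        printf("%d\n", classify(0x100, 0x200, 0x120, 0x180));   /* 1 */
        printf("%d\n", classify(0x140, 0x160, 0x120, 0x180));   /* 2 */
        printf("%d\n", classify(0x100, 0x150, 0x120, 0x180));   /* 3 */
        return 0;
}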
390
391/* Trivial function to remove the last node in the tree. Which by definition
 392 has no right-hand child -- so it can be removed just by making its only child (if
393 any) take its place under its parent. */
394static void eat_last(struct rb_root *root, struct rb_node *node)
395{
396 struct rb_node *parent = rb_parent(node);
397 struct rb_node **link;
398
399 /* LAST! */
400 BUG_ON(node->rb_right);
401
402 if (!parent)
403 link = &root->rb_node;
404 else if (node == parent->rb_left)
405 link = &parent->rb_left;
406 else
407 link = &parent->rb_right;
408
409 *link = node->rb_left;
410 /* Colour doesn't matter now. Only the parent pointer. */
411 if (node->rb_left)
412 node->rb_left->rb_parent_color = node->rb_parent_color;
413}
414
415/* We put this in reverse order, so we can just use eat_last */
416static void ver_insert(struct rb_root *ver_root, struct jffs2_tmp_dnode_info *tn)
417{
418 struct rb_node **link = &ver_root->rb_node;
419 struct rb_node *parent = NULL;
420 struct jffs2_tmp_dnode_info *this_tn;
421
422 while (*link) {
423 parent = *link;
424 this_tn = rb_entry(parent, struct jffs2_tmp_dnode_info, rb);
425
426 if (tn->version > this_tn->version)
427 link = &parent->rb_left;
428 else
429 link = &parent->rb_right;
430 }
431 dbg_readinode("Link new node at %p (root is %p)\n", link, ver_root);
432 rb_link_node(&tn->rb, parent, link);
433 rb_insert_color(&tn->rb, ver_root);
434}
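ver_insert() deliberately sorts with higher versions to the left, so the rightmost node -- the one eat_last() removes -- is always the lowest remaining version; popping repeatedly therefore replays a batch of overlapping nodes in increasing version order. A standalone sketch of the same trick, with a descending array standing in for the RB-tree:

#include <stdio.h>

int main(void)
{
        unsigned int tree[8], n = 0;
        unsigned int input[] = { 7, 3, 9, 5 }, i, j, v;

        for (i = 0; i < 4; i++) {       /* ver_insert(): keep descending order */
                v = input[i];
                for (j = n; j > 0 && tree[j - 1] < v; j--)
                        tree[j] = tree[j - 1];
                tree[j] = v;
                n++;
        }
        while (n)                       /* eat_last(): pop lowest version first */
                printf("%u ", tree[--n]);
        printf("\n");                   /* prints: 3 5 7 9 */
        return 0;
}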
435
436/* Build final, normal fragtree from tn tree. It doesn't matter which order
437 we add nodes to the real fragtree, as long as they don't overlap. And
438 having thrown away the majority of overlapped nodes as we went, there
439 really shouldn't be many sets of nodes which do overlap. If we start at
440 the end, we can use the overlap markers -- we can just eat nodes which
441 aren't overlapped, and when we encounter nodes which _do_ overlap we
442 sort them all into a temporary tree in version order before replaying them. */
443static int jffs2_build_inode_fragtree(struct jffs2_sb_info *c,
444 struct jffs2_inode_info *f,
445 struct jffs2_readinode_info *rii)
446{
447 struct jffs2_tmp_dnode_info *pen, *last, *this;
448 struct rb_root ver_root = RB_ROOT;
449 uint32_t high_ver = 0;
450
451 if (rii->mdata_tn) {
452 dbg_readinode("potential mdata is ver %d at %p\n", rii->mdata_tn->version, rii->mdata_tn);
453 high_ver = rii->mdata_tn->version;
454 rii->latest_ref = rii->mdata_tn->fn->raw;
455 }
456#ifdef JFFS2_DBG_READINODE_MESSAGES
457 this = tn_last(&rii->tn_root);
458 while (this) {
459 dbg_readinode("tn %p ver %d range 0x%x-0x%x ov %d\n", this, this->version, this->fn->ofs,
460 this->fn->ofs+this->fn->size, this->overlapped);
461 this = tn_prev(this);
462 }
463#endif
464 pen = tn_last(&rii->tn_root);
465 while ((last = pen)) {
466 pen = tn_prev(last);
467
468 eat_last(&rii->tn_root, &last->rb);
469 ver_insert(&ver_root, last);
470
471 if (unlikely(last->overlapped))
472 continue;
473
474 /* Now we have a bunch of nodes in reverse version
475 order, in the tree at ver_root. Most of the time,
476 there'll actually be only one node in the 'tree',
477 in fact. */
478 this = tn_last(&ver_root);
479
480 while (this) {
481 struct jffs2_tmp_dnode_info *vers_next;
482 int ret;
483 vers_next = tn_prev(this);
484 eat_last(&ver_root, &this->rb);
485 if (check_tn_node(c, this)) {
486 dbg_readinode("node ver %x, 0x%x-0x%x failed CRC\n",
487 this->version, this->fn->ofs,
488 this->fn->ofs+this->fn->size);
489 jffs2_kill_tn(c, this);
490 } else {
491 if (this->version > high_ver) {
492 /* Note that this is different from the other
493 highest_version, because this one is only
494 counting _valid_ nodes which could give the
495 latest inode metadata */
496 high_ver = this->version;
497 rii->latest_ref = this->fn->raw;
498 }
499 dbg_readinode("Add %p (v %x, 0x%x-0x%x, ov %d) to fragtree\n",
500 this, this->version, this->fn->ofs,
501 this->fn->ofs+this->fn->size, this->overlapped);
502
503 ret = jffs2_add_full_dnode_to_inode(c, f, this->fn);
504 if (ret) {
505 /* Free the nodes in vers_root; let the caller
506 deal with the rest */
507 JFFS2_ERROR("Add node to tree failed %d\n", ret);
508 while (1) {
509 vers_next = tn_prev(this);
510 if (check_tn_node(c, this))
511 jffs2_mark_node_obsolete(c, this->fn->raw);
512 jffs2_free_full_dnode(this->fn);
513 jffs2_free_tmp_dnode_info(this);
514 this = vers_next;
515 if (!this)
516 break;
517 eat_last(&ver_root, &vers_next->rb);
518 }
519 return ret;
520 }
521 jffs2_free_tmp_dnode_info(this);
522 }
523 this = vers_next;
524 }
525 }
526 return 0;
49} 527}
50 528
51static void jffs2_free_tmp_dnode_info_list(struct rb_root *list) 529static void jffs2_free_tmp_dnode_info_list(struct rb_root *list)
@@ -112,8 +590,8 @@ static struct jffs2_raw_node_ref *jffs2_first_valid_node(struct jffs2_raw_node_r
112 * negative error code on failure. 590 * negative error code on failure.
113 */ 591 */
114static inline int read_direntry(struct jffs2_sb_info *c, struct jffs2_raw_node_ref *ref, 592static inline int read_direntry(struct jffs2_sb_info *c, struct jffs2_raw_node_ref *ref,
115 struct jffs2_raw_dirent *rd, size_t read, struct jffs2_full_dirent **fdp, 593 struct jffs2_raw_dirent *rd, size_t read,
116 uint32_t *latest_mctime, uint32_t *mctime_ver) 594 struct jffs2_readinode_info *rii)
117{ 595{
118 struct jffs2_full_dirent *fd; 596 struct jffs2_full_dirent *fd;
119 uint32_t crc; 597 uint32_t crc;
@@ -125,7 +603,8 @@ static inline int read_direntry(struct jffs2_sb_info *c, struct jffs2_raw_node_r
125 if (unlikely(crc != je32_to_cpu(rd->node_crc))) { 603 if (unlikely(crc != je32_to_cpu(rd->node_crc))) {
126 JFFS2_NOTICE("header CRC failed on dirent node at %#08x: read %#08x, calculated %#08x\n", 604 JFFS2_NOTICE("header CRC failed on dirent node at %#08x: read %#08x, calculated %#08x\n",
127 ref_offset(ref), je32_to_cpu(rd->node_crc), crc); 605 ref_offset(ref), je32_to_cpu(rd->node_crc), crc);
128 return 1; 606 jffs2_mark_node_obsolete(c, ref);
607 return 0;
129 } 608 }
130 609
131 /* If we've never checked the CRCs on this node, check them now */ 610 /* If we've never checked the CRCs on this node, check them now */
@@ -137,7 +616,8 @@ static inline int read_direntry(struct jffs2_sb_info *c, struct jffs2_raw_node_r
137 if (unlikely(PAD((rd->nsize + sizeof(*rd))) != PAD(je32_to_cpu(rd->totlen)))) { 616 if (unlikely(PAD((rd->nsize + sizeof(*rd))) != PAD(je32_to_cpu(rd->totlen)))) {
138 JFFS2_ERROR("illegal nsize in node at %#08x: nsize %#02x, totlen %#04x\n", 617 JFFS2_ERROR("illegal nsize in node at %#08x: nsize %#02x, totlen %#04x\n",
139 ref_offset(ref), rd->nsize, je32_to_cpu(rd->totlen)); 618 ref_offset(ref), rd->nsize, je32_to_cpu(rd->totlen));
140 return 1; 619 jffs2_mark_node_obsolete(c, ref);
620 return 0;
141 } 621 }
142 622
143 jeb = &c->blocks[ref->flash_offset / c->sector_size]; 623 jeb = &c->blocks[ref->flash_offset / c->sector_size];
@@ -161,10 +641,13 @@ static inline int read_direntry(struct jffs2_sb_info *c, struct jffs2_raw_node_r
161 fd->ino = je32_to_cpu(rd->ino); 641 fd->ino = je32_to_cpu(rd->ino);
162 fd->type = rd->type; 642 fd->type = rd->type;
163 643
644 if (fd->version > rii->highest_version)
645 rii->highest_version = fd->version;
646
164 /* Pick out the mctime of the latest dirent */ 647 /* Pick out the mctime of the latest dirent */
165 if(fd->version > *mctime_ver && je32_to_cpu(rd->mctime)) { 648 if(fd->version > rii->mctime_ver && je32_to_cpu(rd->mctime)) {
166 *mctime_ver = fd->version; 649 rii->mctime_ver = fd->version;
167 *latest_mctime = je32_to_cpu(rd->mctime); 650 rii->latest_mctime = je32_to_cpu(rd->mctime);
168 } 651 }
169 652
170 /* 653 /*
@@ -201,7 +684,7 @@ static inline int read_direntry(struct jffs2_sb_info *c, struct jffs2_raw_node_r
201 * Wheee. We now have a complete jffs2_full_dirent structure, with 684 * Wheee. We now have a complete jffs2_full_dirent structure, with
202 * the name in it and everything. Link it into the list 685 * the name in it and everything. Link it into the list
203 */ 686 */
204 jffs2_add_fd_to_list(c, fd, fdp); 687 jffs2_add_fd_to_list(c, fd, &rii->fds);
205 688
206 return 0; 689 return 0;
207} 690}
@@ -210,13 +693,13 @@ static inline int read_direntry(struct jffs2_sb_info *c, struct jffs2_raw_node_r
210 * Helper function for jffs2_get_inode_nodes(). 693 * Helper function for jffs2_get_inode_nodes().
211 * It is called every time an inode node is found. 694 * It is called every time an inode node is found.
212 * 695 *
213 * Returns: 0 on succes; 696 * Returns: 0 on success;
214 * 1 if the node should be marked obsolete; 697 * 1 if the node should be marked obsolete;
215 * negative error code on failure. 698 * negative error code on failure.
216 */ 699 */
217static inline int read_dnode(struct jffs2_sb_info *c, struct jffs2_raw_node_ref *ref, 700static inline int read_dnode(struct jffs2_sb_info *c, struct jffs2_raw_node_ref *ref,
218 struct jffs2_raw_inode *rd, struct rb_root *tnp, int rdlen, 701 struct jffs2_raw_inode *rd, int rdlen,
219 uint32_t *latest_mctime, uint32_t *mctime_ver) 702 struct jffs2_readinode_info *rii)
220{ 703{
221 struct jffs2_tmp_dnode_info *tn; 704 struct jffs2_tmp_dnode_info *tn;
222 uint32_t len, csize; 705 uint32_t len, csize;
@@ -230,7 +713,8 @@ static inline int read_dnode(struct jffs2_sb_info *c, struct jffs2_raw_node_ref
230 if (unlikely(crc != je32_to_cpu(rd->node_crc))) { 713 if (unlikely(crc != je32_to_cpu(rd->node_crc))) {
231 JFFS2_NOTICE("node CRC failed on dnode at %#08x: read %#08x, calculated %#08x\n", 714 JFFS2_NOTICE("node CRC failed on dnode at %#08x: read %#08x, calculated %#08x\n",
232 ref_offset(ref), je32_to_cpu(rd->node_crc), crc); 715 ref_offset(ref), je32_to_cpu(rd->node_crc), crc);
233 return 1; 716 jffs2_mark_node_obsolete(c, ref);
717 return 0;
234 } 718 }
235 719
236 tn = jffs2_alloc_tmp_dnode_info(); 720 tn = jffs2_alloc_tmp_dnode_info();
@@ -342,6 +826,10 @@ static inline int read_dnode(struct jffs2_sb_info *c, struct jffs2_raw_node_ref
342 tn->data_crc = je32_to_cpu(rd->data_crc); 826 tn->data_crc = je32_to_cpu(rd->data_crc);
343 tn->csize = csize; 827 tn->csize = csize;
344 tn->fn->raw = ref; 828 tn->fn->raw = ref;
829 tn->overlapped = 0;
830
831 if (tn->version > rii->highest_version)
832 rii->highest_version = tn->version;
345 833
346 /* There was a bug where we wrote hole nodes out with 834 /* There was a bug where we wrote hole nodes out with
347 csize/dsize swapped. Deal with it */ 835 csize/dsize swapped. Deal with it */
@@ -353,13 +841,25 @@ static inline int read_dnode(struct jffs2_sb_info *c, struct jffs2_raw_node_ref
353 dbg_readinode("dnode @%08x: ver %u, offset %#04x, dsize %#04x, csize %#04x\n", 841 dbg_readinode("dnode @%08x: ver %u, offset %#04x, dsize %#04x, csize %#04x\n",
354 ref_offset(ref), je32_to_cpu(rd->version), je32_to_cpu(rd->offset), je32_to_cpu(rd->dsize), csize); 842 ref_offset(ref), je32_to_cpu(rd->version), je32_to_cpu(rd->offset), je32_to_cpu(rd->dsize), csize);
355 843
356 jffs2_add_tn_to_tree(tn, tnp); 844 ret = jffs2_add_tn_to_tree(c, rii, tn);
357 845
846 if (ret) {
847 jffs2_free_full_dnode(tn->fn);
848 free_out:
849 jffs2_free_tmp_dnode_info(tn);
850 return ret;
851 }
852#ifdef JFFS2_DBG_READINODE_MESSAGES
853 dbg_readinode("After adding ver %d:\n", tn->version);
854 tn = tn_first(&rii->tn_root);
855 while (tn) {
856 dbg_readinode("%p: v %d r 0x%x-0x%x ov %d\n",
857 tn, tn->version, tn->fn->ofs,
858 tn->fn->ofs+tn->fn->size, tn->overlapped);
859 tn = tn_next(tn);
860 }
861#endif
358 return 0; 862 return 0;
359
360free_out:
361 jffs2_free_tmp_dnode_info(tn);
362 return ret;
363} 863}
364 864
365/* 865/*
@@ -379,7 +879,8 @@ static inline int read_unknown(struct jffs2_sb_info *c, struct jffs2_raw_node_re
379 JFFS2_ERROR("Node is {%04x,%04x,%08x,%08x}. Please report this error.\n", 879 JFFS2_ERROR("Node is {%04x,%04x,%08x,%08x}. Please report this error.\n",
380 je16_to_cpu(un->magic), je16_to_cpu(un->nodetype), 880 je16_to_cpu(un->magic), je16_to_cpu(un->nodetype),
381 je32_to_cpu(un->totlen), je32_to_cpu(un->hdr_crc)); 881 je32_to_cpu(un->totlen), je32_to_cpu(un->hdr_crc));
382 return 1; 882 jffs2_mark_node_obsolete(c, ref);
883 return 0;
383 } 884 }
384 885
385 un->nodetype = cpu_to_je16(JFFS2_NODE_ACCURATE | je16_to_cpu(un->nodetype)); 886 un->nodetype = cpu_to_je16(JFFS2_NODE_ACCURATE | je16_to_cpu(un->nodetype));
@@ -407,7 +908,8 @@ static inline int read_unknown(struct jffs2_sb_info *c, struct jffs2_raw_node_re
407 case JFFS2_FEATURE_RWCOMPAT_DELETE: 908 case JFFS2_FEATURE_RWCOMPAT_DELETE:
408 JFFS2_NOTICE("unknown RWCOMPAT_DELETE nodetype %#04X at %#08x\n", 909 JFFS2_NOTICE("unknown RWCOMPAT_DELETE nodetype %#04X at %#08x\n",
409 je16_to_cpu(un->nodetype), ref_offset(ref)); 910 je16_to_cpu(un->nodetype), ref_offset(ref));
410 return 1; 911 jffs2_mark_node_obsolete(c, ref);
912 return 0;
411 } 913 }
412 914
413 return 0; 915 return 0;
@@ -421,92 +923,62 @@ static inline int read_unknown(struct jffs2_sb_info *c, struct jffs2_raw_node_re
421 * negative error code on failure. 923 * negative error code on failure.
422 */ 924 */
423static int read_more(struct jffs2_sb_info *c, struct jffs2_raw_node_ref *ref, 925static int read_more(struct jffs2_sb_info *c, struct jffs2_raw_node_ref *ref,
424 int right_size, int *rdlen, unsigned char *buf, unsigned char *bufstart) 926 int needed_len, int *rdlen, unsigned char *buf)
425{ 927{
426 int right_len, err, len; 928 int err, to_read = needed_len - *rdlen;
427 size_t retlen; 929 size_t retlen;
428 uint32_t offs; 930 uint32_t offs;
429 931
430 if (jffs2_is_writebuffered(c)) { 932 if (jffs2_is_writebuffered(c)) {
431 right_len = c->wbuf_pagesize - (bufstart - buf); 933 int rem = to_read % c->wbuf_pagesize;
432 if (right_size + (int)(bufstart - buf) > c->wbuf_pagesize)
433 right_len += c->wbuf_pagesize;
434 } else
435 right_len = right_size;
436 934
437 if (*rdlen == right_len) 935 if (rem)
438 return 0; 936 to_read += c->wbuf_pagesize - rem;
937 }
439 938
440 /* We need to read more data */ 939 /* We need to read more data */
441 offs = ref_offset(ref) + *rdlen; 940 offs = ref_offset(ref) + *rdlen;
442 if (jffs2_is_writebuffered(c)) {
443 bufstart = buf + c->wbuf_pagesize;
444 len = c->wbuf_pagesize;
445 } else {
446 bufstart = buf + *rdlen;
447 len = right_size - *rdlen;
448 }
449 941
450 dbg_readinode("read more %d bytes\n", len); 942 dbg_readinode("read more %d bytes\n", to_read);
451 943
452 err = jffs2_flash_read(c, offs, len, &retlen, bufstart); 944 err = jffs2_flash_read(c, offs, to_read, &retlen, buf + *rdlen);
453 if (err) { 945 if (err) {
454 JFFS2_ERROR("can not read %d bytes from 0x%08x, " 946 JFFS2_ERROR("can not read %d bytes from 0x%08x, "
455 "error code: %d.\n", len, offs, err); 947 "error code: %d.\n", to_read, offs, err);
456 return err; 948 return err;
457 } 949 }
458 950
459 if (retlen < len) { 951 if (retlen < to_read) {
460 JFFS2_ERROR("short read at %#08x: %zu instead of %d.\n", 952 JFFS2_ERROR("short read at %#08x: %zu instead of %d.\n",
461 offs, retlen, len); 953 offs, retlen, to_read);
462 return -EIO; 954 return -EIO;
463 } 955 }
464 956
465 *rdlen = right_len; 957 *rdlen += to_read;
466
467 return 0; 958 return 0;
468} 959}
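On write-buffered flash, the reworked read_more() above rounds the extra read up so it always ends on a wbuf page boundary, avoiding a later re-read of the same minimal I/O unit. The arithmetic in isolation (helper name hypothetical):

#include <assert.h>
#include <stdint.h>

/* round len up to the next multiple of pagesize, as read_more() does
 * with to_read and c->wbuf_pagesize */
static uint32_t round_up_len(uint32_t len, uint32_t pagesize)
{
        uint32_t rem = len % pagesize;
        return rem ? len + (pagesize - rem) : len;
}

int main(void)
{
        assert(round_up_len(100, 512) == 512);
        assert(round_up_len(512, 512) == 512);
        assert(round_up_len(513, 512) == 1024);
        return 0;
}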
469 960
470/* Get tmp_dnode_info and full_dirent for all non-obsolete nodes associated 961/* Get tmp_dnode_info and full_dirent for all non-obsolete nodes associated
471 with this ino, returning the former in order of version */ 962 with this ino. Perform a preliminary ordering on data nodes, throwing away
 963 those which are completely obsoleted by newer ones. The naïve approach
 964 of just returning them _all_ in version order would cause us to
965 run out of memory in certain degenerate cases. */
472static int jffs2_get_inode_nodes(struct jffs2_sb_info *c, struct jffs2_inode_info *f, 966static int jffs2_get_inode_nodes(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
473 struct rb_root *tnp, struct jffs2_full_dirent **fdp, 967 struct jffs2_readinode_info *rii)
474 uint32_t *highest_version, uint32_t *latest_mctime,
475 uint32_t *mctime_ver)
476{ 968{
477 struct jffs2_raw_node_ref *ref, *valid_ref; 969 struct jffs2_raw_node_ref *ref, *valid_ref;
478 struct rb_root ret_tn = RB_ROOT;
479 struct jffs2_full_dirent *ret_fd = NULL;
480 unsigned char *buf = NULL; 970 unsigned char *buf = NULL;
481 union jffs2_node_union *node; 971 union jffs2_node_union *node;
482 size_t retlen; 972 size_t retlen;
483 int len, err; 973 int len, err;
484 974
485 *mctime_ver = 0; 975 rii->mctime_ver = 0;
486 976
487 dbg_readinode("ino #%u\n", f->inocache->ino); 977 dbg_readinode("ino #%u\n", f->inocache->ino);
488 978
489 if (jffs2_is_writebuffered(c)) {
490 /*
491 * If we have the write buffer, we assume the minimal I/O unit
492 * is c->wbuf_pagesize. We implement some optimizations which in
493 * this case and we need a temporary buffer of size =
494 * 2*c->wbuf_pagesize bytes (see comments in read_dnode()).
495 * Basically, we want to read not only the node header, but the
496 * whole wbuf (NAND page in case of NAND) or 2, if the node
497 * header overlaps the border between the 2 wbufs.
498 */
499 len = 2*c->wbuf_pagesize;
500 } else {
501 /*
502 * When there is no write buffer, the size of the temporary
503 * buffer is the size of the larges node header.
504 */
505 len = sizeof(union jffs2_node_union);
506 }
507
508 /* FIXME: in case of NOR and available ->point() this 979 /* FIXME: in case of NOR and available ->point() this
509 * needs to be fixed. */ 980 * needs to be fixed. */
981 len = sizeof(union jffs2_node_union) + c->wbuf_pagesize;
510 buf = kmalloc(len, GFP_KERNEL); 982 buf = kmalloc(len, GFP_KERNEL);
511 if (!buf) 983 if (!buf)
512 return -ENOMEM; 984 return -ENOMEM;
@@ -516,8 +988,6 @@ static int jffs2_get_inode_nodes(struct jffs2_sb_info *c, struct jffs2_inode_inf
516 if (!valid_ref && f->inocache->ino != 1) 988 if (!valid_ref && f->inocache->ino != 1)
517 JFFS2_WARNING("Eep. No valid nodes for ino #%u.\n", f->inocache->ino); 989 JFFS2_WARNING("Eep. No valid nodes for ino #%u.\n", f->inocache->ino);
518 while (valid_ref) { 990 while (valid_ref) {
519 unsigned char *bufstart;
520
521 /* We can hold a pointer to a non-obsolete node without the spinlock, 991 /* We can hold a pointer to a non-obsolete node without the spinlock,
522 but _obsolete_ nodes may disappear at any time, if the block 992 but _obsolete_ nodes may disappear at any time, if the block
523 they're in gets erased. So if we mark 'ref' obsolete while we're 993 they're in gets erased. So if we mark 'ref' obsolete while we're
@@ -533,32 +1003,31 @@ static int jffs2_get_inode_nodes(struct jffs2_sb_info *c, struct jffs2_inode_inf
533 /* 1003 /*
534 * At this point we don't know the type of the node we're going 1004 * At this point we don't know the type of the node we're going
535 * to read, so we do not know the size of its header. In order 1005 * to read, so we do not know the size of its header. In order
536 * to minimize the amount of flash IO we assume the node has 1006 * to minimize the amount of flash IO we assume the header is
537 * size = JFFS2_MIN_NODE_HEADER. 1007 * of size = JFFS2_MIN_NODE_HEADER.
538 */ 1008 */
1009 len = JFFS2_MIN_NODE_HEADER;
539 if (jffs2_is_writebuffered(c)) { 1010 if (jffs2_is_writebuffered(c)) {
1011 int end, rem;
1012
540 /* 1013 /*
541 * We treat 'buf' as 2 adjacent wbufs. We want to 1014 * We are about to read JFFS2_MIN_NODE_HEADER bytes,
542 * adjust bufstart such as it points to the 1015 * but this flash has some minimal I/O unit. It is
543 * beginning of the node within this wbuf. 1016 * possible that we'll need to read more soon, so read
1017 * up to the next min. I/O unit, in order not to
1018 * re-read the same min. I/O unit twice.
544 */ 1019 */
545 bufstart = buf + (ref_offset(ref) % c->wbuf_pagesize); 1020 end = ref_offset(ref) + len;
546 /* We will read either one wbuf or 2 wbufs. */ 1021 rem = end % c->wbuf_pagesize;
547 len = c->wbuf_pagesize - (bufstart - buf); 1022 if (rem)
548 if (JFFS2_MIN_NODE_HEADER + (int)(bufstart - buf) > c->wbuf_pagesize) { 1023 end += c->wbuf_pagesize - rem;
549 /* The header spans the border of the first wbuf */ 1024 len = end - ref_offset(ref);
550 len += c->wbuf_pagesize;
551 }
552 } else {
553 bufstart = buf;
554 len = JFFS2_MIN_NODE_HEADER;
555 } 1025 }
556 1026
557 dbg_readinode("read %d bytes at %#08x(%d).\n", len, ref_offset(ref), ref_flags(ref)); 1027 dbg_readinode("read %d bytes at %#08x(%d).\n", len, ref_offset(ref), ref_flags(ref));
558 1028
559 /* FIXME: point() */ 1029 /* FIXME: point() */
560 err = jffs2_flash_read(c, ref_offset(ref), len, 1030 err = jffs2_flash_read(c, ref_offset(ref), len, &retlen, buf);
561 &retlen, bufstart);
562 if (err) { 1031 if (err) {
563 JFFS2_ERROR("can not read %d bytes from 0x%08x, " "error code: %d.\n", len, ref_offset(ref), err); 1032 JFFS2_ERROR("can not read %d bytes from 0x%08x, " "error code: %d.\n", len, ref_offset(ref), err);
564 goto free_out; 1033 goto free_out;
@@ -570,7 +1039,7 @@ static int jffs2_get_inode_nodes(struct jffs2_sb_info *c, struct jffs2_inode_inf
570 goto free_out; 1039 goto free_out;
571 } 1040 }
572 1041
573 node = (union jffs2_node_union *)bufstart; 1042 node = (union jffs2_node_union *)buf;
574 1043
575 /* No need to mask in the valid bit; it shouldn't be invalid */ 1044 /* No need to mask in the valid bit; it shouldn't be invalid */
576 if (je32_to_cpu(node->u.hdr_crc) != crc32(0, node, sizeof(node->u)-4)) { 1045 if (je32_to_cpu(node->u.hdr_crc) != crc32(0, node, sizeof(node->u)-4)) {
@@ -583,10 +1052,10 @@ static int jffs2_get_inode_nodes(struct jffs2_sb_info *c, struct jffs2_inode_inf
583 jffs2_mark_node_obsolete(c, ref); 1052 jffs2_mark_node_obsolete(c, ref);
584 goto cont; 1053 goto cont;
585 } 1054 }
586 /* Due to poor choice of crc32 seed, an all-zero node will have a correct CRC */ 1055 if (je16_to_cpu(node->u.magic) != JFFS2_MAGIC_BITMASK) {
587 if (!je32_to_cpu(node->u.hdr_crc) && !je16_to_cpu(node->u.nodetype) && 1056 /* Not a JFFS2 node, whinge and move on */
588 !je16_to_cpu(node->u.magic) && !je32_to_cpu(node->u.totlen)) { 1057 JFFS2_NOTICE("Wrong magic bitmask 0x%04x in node header at %#08x.\n",
589 JFFS2_NOTICE("All zero node header at %#08x.\n", ref_offset(ref)); 1058 je16_to_cpu(node->u.magic), ref_offset(ref));
590 jffs2_mark_node_obsolete(c, ref); 1059 jffs2_mark_node_obsolete(c, ref);
591 goto cont; 1060 goto cont;
592 } 1061 }
@@ -596,46 +1065,34 @@ static int jffs2_get_inode_nodes(struct jffs2_sb_info *c, struct jffs2_inode_inf
596 case JFFS2_NODETYPE_DIRENT: 1065 case JFFS2_NODETYPE_DIRENT:
597 1066
598 if (JFFS2_MIN_NODE_HEADER < sizeof(struct jffs2_raw_dirent)) { 1067 if (JFFS2_MIN_NODE_HEADER < sizeof(struct jffs2_raw_dirent)) {
599 err = read_more(c, ref, sizeof(struct jffs2_raw_dirent), &len, buf, bufstart); 1068 err = read_more(c, ref, sizeof(struct jffs2_raw_dirent), &len, buf);
600 if (unlikely(err)) 1069 if (unlikely(err))
601 goto free_out; 1070 goto free_out;
602 } 1071 }
603 1072
604 err = read_direntry(c, ref, &node->d, retlen, &ret_fd, latest_mctime, mctime_ver); 1073 err = read_direntry(c, ref, &node->d, retlen, rii);
605 if (err == 1) { 1074 if (unlikely(err))
606 jffs2_mark_node_obsolete(c, ref);
607 break;
608 } else if (unlikely(err))
609 goto free_out; 1075 goto free_out;
610 1076
611 if (je32_to_cpu(node->d.version) > *highest_version)
612 *highest_version = je32_to_cpu(node->d.version);
613
614 break; 1077 break;
615 1078
616 case JFFS2_NODETYPE_INODE: 1079 case JFFS2_NODETYPE_INODE:
617 1080
618 if (JFFS2_MIN_NODE_HEADER < sizeof(struct jffs2_raw_inode)) { 1081 if (JFFS2_MIN_NODE_HEADER < sizeof(struct jffs2_raw_inode)) {
619 err = read_more(c, ref, sizeof(struct jffs2_raw_inode), &len, buf, bufstart); 1082 err = read_more(c, ref, sizeof(struct jffs2_raw_inode), &len, buf);
620 if (unlikely(err)) 1083 if (unlikely(err))
621 goto free_out; 1084 goto free_out;
622 } 1085 }
623 1086
624 err = read_dnode(c, ref, &node->i, &ret_tn, len, latest_mctime, mctime_ver); 1087 err = read_dnode(c, ref, &node->i, len, rii);
625 if (err == 1) { 1088 if (unlikely(err))
626 jffs2_mark_node_obsolete(c, ref);
627 break;
628 } else if (unlikely(err))
629 goto free_out; 1089 goto free_out;
630 1090
631 if (je32_to_cpu(node->i.version) > *highest_version)
632 *highest_version = je32_to_cpu(node->i.version);
633
634 break; 1091 break;
635 1092
636 default: 1093 default:
637 if (JFFS2_MIN_NODE_HEADER < sizeof(struct jffs2_unknown_node)) { 1094 if (JFFS2_MIN_NODE_HEADER < sizeof(struct jffs2_unknown_node)) {
638 err = read_more(c, ref, sizeof(struct jffs2_unknown_node), &len, buf, bufstart); 1095 err = read_more(c, ref, sizeof(struct jffs2_unknown_node), &len, buf);
639 if (unlikely(err)) 1096 if (unlikely(err))
640 goto free_out; 1097 goto free_out;
641 } 1098 }
@@ -653,17 +1110,19 @@ static int jffs2_get_inode_nodes(struct jffs2_sb_info *c, struct jffs2_inode_inf
653 } 1110 }
654 1111
655 spin_unlock(&c->erase_completion_lock); 1112 spin_unlock(&c->erase_completion_lock);
656 *tnp = ret_tn;
657 *fdp = ret_fd;
658 kfree(buf); 1113 kfree(buf);
659 1114
1115 f->highest_version = rii->highest_version;
1116
660 dbg_readinode("nodes of inode #%u were read, the highest version is %u, latest_mctime %u, mctime_ver %u.\n", 1117 dbg_readinode("nodes of inode #%u were read, the highest version is %u, latest_mctime %u, mctime_ver %u.\n",
661 f->inocache->ino, *highest_version, *latest_mctime, *mctime_ver); 1118 f->inocache->ino, rii->highest_version, rii->latest_mctime,
1119 rii->mctime_ver);
662 return 0; 1120 return 0;
663 1121
664 free_out: 1122 free_out:
665 jffs2_free_tmp_dnode_info_list(&ret_tn); 1123 jffs2_free_tmp_dnode_info_list(&rii->tn_root);
666 jffs2_free_full_dirent_list(ret_fd); 1124 jffs2_free_full_dirent_list(rii->fds);
1125 rii->fds = NULL;
667 kfree(buf); 1126 kfree(buf);
668 return err; 1127 return err;
669} 1128}
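All of the per-read state that used to travel as five separate out-parameters now lives in the new struct jffs2_readinode_info, whose definition is not part of this hunk. Reconstructed purely from the fields this diff touches, it plausibly looks like the sketch below (stub forward declarations stand in for the kernel types; the authoritative definition lives in the JFFS2 headers and may differ in ordering or extra fields):

#include <stdint.h>

struct rb_node;
struct rb_root { struct rb_node *rb_node; };    /* stand-in for linux/rbtree.h */
struct jffs2_tmp_dnode_info;
struct jffs2_full_dirent;
struct jffs2_raw_node_ref;

struct jffs2_readinode_info {
        struct rb_root tn_root;                 /* tmp dnodes, offset-ordered */
        struct jffs2_tmp_dnode_info *mdata_tn;  /* candidate metadata node */
        uint32_t highest_version;
        uint32_t latest_mctime;
        uint32_t mctime_ver;
        struct jffs2_full_dirent *fds;          /* collected dirents */
        struct jffs2_raw_node_ref *latest_ref;  /* ref of highest valid version */
};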
@@ -672,20 +1131,17 @@ static int jffs2_do_read_inode_internal(struct jffs2_sb_info *c,
672 struct jffs2_inode_info *f, 1131 struct jffs2_inode_info *f,
673 struct jffs2_raw_inode *latest_node) 1132 struct jffs2_raw_inode *latest_node)
674{ 1133{
675 struct jffs2_tmp_dnode_info *tn; 1134 struct jffs2_readinode_info rii;
676 struct rb_root tn_list; 1135 uint32_t crc, new_size;
677 struct rb_node *rb, *repl_rb;
678 struct jffs2_full_dirent *fd_list;
679 struct jffs2_full_dnode *fn, *first_fn = NULL;
680 uint32_t crc;
681 uint32_t latest_mctime, mctime_ver;
682 size_t retlen; 1136 size_t retlen;
683 int ret; 1137 int ret;
684 1138
685 dbg_readinode("ino #%u nlink is %d\n", f->inocache->ino, f->inocache->nlink); 1139 dbg_readinode("ino #%u nlink is %d\n", f->inocache->ino, f->inocache->nlink);
686 1140
1141 memset(&rii, 0, sizeof(rii));
1142
687 /* Grab all nodes relevant to this ino */ 1143 /* Grab all nodes relevant to this ino */
688 ret = jffs2_get_inode_nodes(c, f, &tn_list, &fd_list, &f->highest_version, &latest_mctime, &mctime_ver); 1144 ret = jffs2_get_inode_nodes(c, f, &rii);
689 1145
690 if (ret) { 1146 if (ret) {
691 JFFS2_ERROR("cannot read nodes for ino %u, returned error is %d\n", f->inocache->ino, ret); 1147 JFFS2_ERROR("cannot read nodes for ino %u, returned error is %d\n", f->inocache->ino, ret);
@@ -693,74 +1149,42 @@ static int jffs2_do_read_inode_internal(struct jffs2_sb_info *c,
693 jffs2_set_inocache_state(c, f->inocache, INO_STATE_CHECKEDABSENT); 1149 jffs2_set_inocache_state(c, f->inocache, INO_STATE_CHECKEDABSENT);
694 return ret; 1150 return ret;
695 } 1151 }
696 f->dents = fd_list;
697
698 rb = rb_first(&tn_list);
699 1152
700 while (rb) { 1153 ret = jffs2_build_inode_fragtree(c, f, &rii);
701 cond_resched(); 1154 if (ret) {
702 tn = rb_entry(rb, struct jffs2_tmp_dnode_info, rb); 1155 JFFS2_ERROR("Failed to build final fragtree for inode #%u: error %d\n",
703 fn = tn->fn; 1156 f->inocache->ino, ret);
704 ret = 1; 1157 if (f->inocache->state == INO_STATE_READING)
705 dbg_readinode("consider node ver %u, phys offset " 1158 jffs2_set_inocache_state(c, f->inocache, INO_STATE_CHECKEDABSENT);
706 "%#08x(%d), range %u-%u.\n", tn->version, 1159 jffs2_free_tmp_dnode_info_list(&rii.tn_root);
707 ref_offset(fn->raw), ref_flags(fn->raw), 1160 /* FIXME: We could at least crc-check them all */
708 fn->ofs, fn->ofs + fn->size); 1161 if (rii.mdata_tn) {
709 1162 jffs2_free_full_dnode(rii.mdata_tn->fn);
710 if (fn->size) { 1163 jffs2_free_tmp_dnode_info(rii.mdata_tn);
711 ret = jffs2_add_older_frag_to_fragtree(c, f, tn); 1164 rii.mdata_tn = NULL;
712 /* TODO: the error code isn't checked, check it */ 1165 }
713 jffs2_dbg_fragtree_paranoia_check_nolock(f); 1166 return ret;
714 BUG_ON(ret < 0); 1167 }
715 if (!first_fn && ret == 0)
716 first_fn = fn;
717 } else if (!first_fn) {
718 first_fn = fn;
719 f->metadata = fn;
720 ret = 0; /* Prevent freeing the metadata update node */
721 } else
722 jffs2_mark_node_obsolete(c, fn->raw);
723
724 BUG_ON(rb->rb_left);
725 if (rb_parent(rb) && rb_parent(rb)->rb_left == rb) {
726 /* We were then left-hand child of our parent. We need
727 * to move our own right-hand child into our place. */
728 repl_rb = rb->rb_right;
729 if (repl_rb)
730 rb_set_parent(repl_rb, rb_parent(rb));
731 } else
732 repl_rb = NULL;
733
734 rb = rb_next(rb);
735
736 /* Remove the spent tn from the tree; don't bother rebalancing
737 * but put our right-hand child in our own place. */
738 if (rb_parent(&tn->rb)) {
739 if (rb_parent(&tn->rb)->rb_left == &tn->rb)
740 rb_parent(&tn->rb)->rb_left = repl_rb;
741 else if (rb_parent(&tn->rb)->rb_right == &tn->rb)
742 rb_parent(&tn->rb)->rb_right = repl_rb;
743 else BUG();
744 } else if (tn->rb.rb_right)
745 rb_set_parent(tn->rb.rb_right, NULL);
746 1168
747 jffs2_free_tmp_dnode_info(tn); 1169 if (rii.mdata_tn) {
748 if (ret) { 1170 if (rii.mdata_tn->fn->raw == rii.latest_ref) {
749 dbg_readinode("delete dnode %u-%u.\n", 1171 f->metadata = rii.mdata_tn->fn;
750 fn->ofs, fn->ofs + fn->size); 1172 jffs2_free_tmp_dnode_info(rii.mdata_tn);
751 jffs2_free_full_dnode(fn); 1173 } else {
1174 jffs2_kill_tn(c, rii.mdata_tn);
752 } 1175 }
1176 rii.mdata_tn = NULL;
753 } 1177 }
754 jffs2_dbg_fragtree_paranoia_check_nolock(f);
755 1178
756 BUG_ON(first_fn && ref_obsolete(first_fn->raw)); 1179 f->dents = rii.fds;
757 1180
758 fn = first_fn; 1181 jffs2_dbg_fragtree_paranoia_check_nolock(f);
759 if (unlikely(!first_fn)) { 1182
1183 if (unlikely(!rii.latest_ref)) {
760 /* No data nodes for this inode. */ 1184 /* No data nodes for this inode. */
761 if (f->inocache->ino != 1) { 1185 if (f->inocache->ino != 1) {
762 JFFS2_WARNING("no data nodes found for ino #%u\n", f->inocache->ino); 1186 JFFS2_WARNING("no data nodes found for ino #%u\n", f->inocache->ino);
763 if (!fd_list) { 1187 if (!rii.fds) {
764 if (f->inocache->state == INO_STATE_READING) 1188 if (f->inocache->state == INO_STATE_READING)
765 jffs2_set_inocache_state(c, f->inocache, INO_STATE_CHECKEDABSENT); 1189 jffs2_set_inocache_state(c, f->inocache, INO_STATE_CHECKEDABSENT);
766 return -EIO; 1190 return -EIO;
@@ -778,7 +1202,7 @@ static int jffs2_do_read_inode_internal(struct jffs2_sb_info *c,
778 return 0; 1202 return 0;
779 } 1203 }
780 1204
781 ret = jffs2_flash_read(c, ref_offset(fn->raw), sizeof(*latest_node), &retlen, (void *)latest_node); 1205 ret = jffs2_flash_read(c, ref_offset(rii.latest_ref), sizeof(*latest_node), &retlen, (void *)latest_node);
782 if (ret || retlen != sizeof(*latest_node)) { 1206 if (ret || retlen != sizeof(*latest_node)) {
783 JFFS2_ERROR("failed to read from flash: error %d, %zd of %zd bytes read\n", 1207 JFFS2_ERROR("failed to read from flash: error %d, %zd of %zd bytes read\n",
784 ret, retlen, sizeof(*latest_node)); 1208 ret, retlen, sizeof(*latest_node));
@@ -791,7 +1215,7 @@ static int jffs2_do_read_inode_internal(struct jffs2_sb_info *c,
791 crc = crc32(0, latest_node, sizeof(*latest_node)-8); 1215 crc = crc32(0, latest_node, sizeof(*latest_node)-8);
792 if (crc != je32_to_cpu(latest_node->node_crc)) { 1216 if (crc != je32_to_cpu(latest_node->node_crc)) {
793 JFFS2_ERROR("CRC failed for read_inode of inode %u at physical location 0x%x\n", 1217 JFFS2_ERROR("CRC failed for read_inode of inode %u at physical location 0x%x\n",
794 f->inocache->ino, ref_offset(fn->raw)); 1218 f->inocache->ino, ref_offset(rii.latest_ref));
795 up(&f->sem); 1219 up(&f->sem);
796 jffs2_do_clear_inode(c, f); 1220 jffs2_do_clear_inode(c, f);
797 return -EIO; 1221 return -EIO;
@@ -799,17 +1223,22 @@ static int jffs2_do_read_inode_internal(struct jffs2_sb_info *c,
799 1223
800 switch(jemode_to_cpu(latest_node->mode) & S_IFMT) { 1224 switch(jemode_to_cpu(latest_node->mode) & S_IFMT) {
801 case S_IFDIR: 1225 case S_IFDIR:
802 if (mctime_ver > je32_to_cpu(latest_node->version)) { 1226 if (rii.mctime_ver > je32_to_cpu(latest_node->version)) {
803 /* The times in the latest_node are actually older than 1227 /* The times in the latest_node are actually older than
804 mctime in the latest dirent. Cheat. */ 1228 mctime in the latest dirent. Cheat. */
805 latest_node->ctime = latest_node->mtime = cpu_to_je32(latest_mctime); 1229 latest_node->ctime = latest_node->mtime = cpu_to_je32(rii.latest_mctime);
806 } 1230 }
807 break; 1231 break;
808 1232
809 1233
810 case S_IFREG: 1234 case S_IFREG:
811 /* If it was a regular file, truncate it to the latest node's isize */ 1235 /* If it was a regular file, truncate it to the latest node's isize */
812 jffs2_truncate_fragtree(c, &f->fragtree, je32_to_cpu(latest_node->isize)); 1236 new_size = jffs2_truncate_fragtree(c, &f->fragtree, je32_to_cpu(latest_node->isize));
1237 if (new_size != je32_to_cpu(latest_node->isize)) {
1238 JFFS2_WARNING("Truncating ino #%u to %d bytes failed because it only had %d bytes to start with!\n",
1239 f->inocache->ino, je32_to_cpu(latest_node->isize), new_size);
1240 latest_node->isize = cpu_to_je32(new_size);
1241 }
813 break; 1242 break;
814 1243
815 case S_IFLNK: 1244 case S_IFLNK:
@@ -832,7 +1261,7 @@ static int jffs2_do_read_inode_internal(struct jffs2_sb_info *c,
832 return -ENOMEM; 1261 return -ENOMEM;
833 } 1262 }
834 1263
835 ret = jffs2_flash_read(c, ref_offset(fn->raw) + sizeof(*latest_node), 1264 ret = jffs2_flash_read(c, ref_offset(rii.latest_ref) + sizeof(*latest_node),
836 je32_to_cpu(latest_node->csize), &retlen, (char *)f->target); 1265 je32_to_cpu(latest_node->csize), &retlen, (char *)f->target);
837 1266
838 if (ret || retlen != je32_to_cpu(latest_node->csize)) { 1267 if (ret || retlen != je32_to_cpu(latest_node->csize)) {
diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c
index 7fb45bd4915c..2a1c976c7924 100644
--- a/fs/jffs2/scan.c
+++ b/fs/jffs2/scan.c
@@ -1,15 +1,14 @@
1/* 1/*
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright (C) 2001-2003 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * 5 *
6 * Created by David Woodhouse <dwmw2@infradead.org> 6 * Created by David Woodhouse <dwmw2@infradead.org>
7 * 7 *
8 * For licensing information, see the file 'LICENCE' in this directory. 8 * For licensing information, see the file 'LICENCE' in this directory.
9 * 9 *
10 * $Id: scan.c,v 1.125 2005/09/30 13:59:13 dedekind Exp $
11 *
12 */ 10 */
11
13#include <linux/kernel.h> 12#include <linux/kernel.h>
14#include <linux/sched.h> 13#include <linux/sched.h>
15#include <linux/slab.h> 14#include <linux/slab.h>
@@ -636,16 +635,17 @@ scan_more:
636 635
637 if (*(uint32_t *)(&buf[ofs-buf_ofs]) == 0xffffffff) { 636 if (*(uint32_t *)(&buf[ofs-buf_ofs]) == 0xffffffff) {
638 uint32_t inbuf_ofs; 637 uint32_t inbuf_ofs;
639 uint32_t empty_start; 638 uint32_t empty_start, scan_end;
640 639
641 empty_start = ofs; 640 empty_start = ofs;
642 ofs += 4; 641 ofs += 4;
642 scan_end = min_t(uint32_t, EMPTY_SCAN_SIZE(c->sector_size)/8, buf_len);
643 643
644 D1(printk(KERN_DEBUG "Found empty flash at 0x%08x\n", ofs)); 644 D1(printk(KERN_DEBUG "Found empty flash at 0x%08x\n", ofs));
645 more_empty: 645 more_empty:
646 inbuf_ofs = ofs - buf_ofs; 646 inbuf_ofs = ofs - buf_ofs;
647 while (inbuf_ofs < buf_len) { 647 while (inbuf_ofs < scan_end) {
648 if (*(uint32_t *)(&buf[inbuf_ofs]) != 0xffffffff) { 648 if (unlikely(*(uint32_t *)(&buf[inbuf_ofs]) != 0xffffffff)) {
649 printk(KERN_WARNING "Empty flash at 0x%08x ends at 0x%08x\n", 649 printk(KERN_WARNING "Empty flash at 0x%08x ends at 0x%08x\n",
650 empty_start, ofs); 650 empty_start, ofs);
651 if ((err = jffs2_scan_dirty_space(c, jeb, ofs-empty_start))) 651 if ((err = jffs2_scan_dirty_space(c, jeb, ofs-empty_start)))
@@ -666,7 +666,11 @@ scan_more:
666 D1(printk(KERN_DEBUG "%d bytes at start of block seems clean... assuming all clean\n", EMPTY_SCAN_SIZE(c->sector_size))); 666 D1(printk(KERN_DEBUG "%d bytes at start of block seems clean... assuming all clean\n", EMPTY_SCAN_SIZE(c->sector_size)));
667 return BLK_STATE_CLEANMARKER; 667 return BLK_STATE_CLEANMARKER;
668 } 668 }
669 669 if (!buf_size && (scan_end != buf_len)) {/* XIP/point case */
670 scan_end = buf_len;
671 goto more_empty;
672 }
673
670 /* See how much more there is to read in this eraseblock... */ 674 /* See how much more there is to read in this eraseblock... */
671 buf_len = min_t(uint32_t, buf_size, jeb->offset + c->sector_size - ofs); 675 buf_len = min_t(uint32_t, buf_size, jeb->offset + c->sector_size - ofs);
672 if (!buf_len) { 676 if (!buf_len) {
@@ -676,6 +680,8 @@ scan_more:
676 empty_start)); 680 empty_start));
677 break; 681 break;
678 } 682 }
683 /* point never reaches here */
684 scan_end = buf_len;
679 D1(printk(KERN_DEBUG "Reading another 0x%x at 0x%08x\n", buf_len, ofs)); 685 D1(printk(KERN_DEBUG "Reading another 0x%x at 0x%08x\n", buf_len, ofs));
680 err = jffs2_fill_scan_buf(c, buf, ofs, buf_len); 686 err = jffs2_fill_scan_buf(c, buf, ofs, buf_len);
681 if (err) 687 if (err)
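The scan above walks candidate empty space a 32-bit word at a time, bailing as soon as a word is not 0xffffffff. A hedged standalone sketch of that check (helper name hypothetical; memcpy sidesteps the unaligned access the scan code avoids by construction):

#include <stdio.h>
#include <stdint.h>
#include <string.h>

/* return 1 if the first len bytes read as erased flash (all 0xff) */
static int region_is_erased(const unsigned char *buf, uint32_t len)
{
        uint32_t i, word;

        for (i = 0; i + 4 <= len; i += 4) {
                memcpy(&word, buf + i, 4);
                if (word != 0xffffffffu)
                        return 0;
        }
        return 1;
}

int main(void)
{
        unsigned char buf[16];

        memset(buf, 0xff, sizeof(buf));
        printf("%d\n", region_is_erased(buf, sizeof(buf)));     /* 1 */
        buf[9] = 0x42;
        printf("%d\n", region_is_erased(buf, sizeof(buf)));     /* 0 */
        return 0;
}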
@@ -734,18 +740,8 @@ scan_more:
734 ofs += 4; 740 ofs += 4;
735 continue; 741 continue;
736 } 742 }
737 /* Due to poor choice of crc32 seed, an all-zero node will have a correct CRC */
738 if (!je32_to_cpu(node->hdr_crc) && !je16_to_cpu(node->nodetype) &&
739 !je16_to_cpu(node->magic) && !je32_to_cpu(node->totlen)) {
740 noisy_printk(&noise, "jffs2_scan_eraseblock(): All zero node header at 0x%08x.\n", ofs);
741 if ((err = jffs2_scan_dirty_space(c, jeb, 4)))
742 return err;
743 ofs += 4;
744 continue;
745 }
746 743
747 if (ofs + je32_to_cpu(node->totlen) > 744 if (ofs + je32_to_cpu(node->totlen) > jeb->offset + c->sector_size) {
748 jeb->offset + c->sector_size) {
749 /* Eep. Node goes over the end of the erase block. */ 745 /* Eep. Node goes over the end of the erase block. */
750 printk(KERN_WARNING "Node at 0x%08x with length 0x%08x would run over the end of the erase block\n", 746 printk(KERN_WARNING "Node at 0x%08x with length 0x%08x would run over the end of the erase block\n",
751 ofs, je32_to_cpu(node->totlen)); 747 ofs, je32_to_cpu(node->totlen));
@@ -952,8 +948,7 @@ static int jffs2_scan_inode_node(struct jffs2_sb_info *c, struct jffs2_erasebloc
952 struct jffs2_raw_inode *ri, uint32_t ofs, struct jffs2_summary *s) 948 struct jffs2_raw_inode *ri, uint32_t ofs, struct jffs2_summary *s)
953{ 949{
954 struct jffs2_inode_cache *ic; 950 struct jffs2_inode_cache *ic;
955 uint32_t ino = je32_to_cpu(ri->ino); 951 uint32_t crc, ino = je32_to_cpu(ri->ino);
956 int err;
957 952
958 D1(printk(KERN_DEBUG "jffs2_scan_inode_node(): Node at 0x%08x\n", ofs)); 953 D1(printk(KERN_DEBUG "jffs2_scan_inode_node(): Node at 0x%08x\n", ofs));
959 954
@@ -966,21 +961,22 @@ static int jffs2_scan_inode_node(struct jffs2_sb_info *c, struct jffs2_erasebloc
966 Which means that the _full_ amount of time to get to proper write mode with GC 961 Which means that the _full_ amount of time to get to proper write mode with GC
967 operational may actually be _longer_ than before. Sucks to be me. */ 962 operational may actually be _longer_ than before. Sucks to be me. */
968 963
964 /* Check the node CRC in any case. */
965 crc = crc32(0, ri, sizeof(*ri)-8);
966 if (crc != je32_to_cpu(ri->node_crc)) {
967 printk(KERN_NOTICE "jffs2_scan_inode_node(): CRC failed on "
968 "node at 0x%08x: Read 0x%08x, calculated 0x%08x\n",
969 ofs, je32_to_cpu(ri->node_crc), crc);
970 /*
971 * We believe totlen because the CRC on the node
972 * _header_ was OK, just the node itself failed.
973 */
974 return jffs2_scan_dirty_space(c, jeb,
975 PAD(je32_to_cpu(ri->totlen)));
976 }
977
969 ic = jffs2_get_ino_cache(c, ino); 978 ic = jffs2_get_ino_cache(c, ino);
970 if (!ic) { 979 if (!ic) {
971 /* Inocache get failed. Either we read a bogus ino# or it's just genuinely the
972 first node we found for this inode. Do a CRC check to protect against the former
973 case */
974 uint32_t crc = crc32(0, ri, sizeof(*ri)-8);
975
976 if (crc != je32_to_cpu(ri->node_crc)) {
977 printk(KERN_NOTICE "jffs2_scan_inode_node(): CRC failed on node at 0x%08x: Read 0x%08x, calculated 0x%08x\n",
978 ofs, je32_to_cpu(ri->node_crc), crc);
979 /* We believe totlen because the CRC on the node _header_ was OK, just the node itself failed. */
980 if ((err = jffs2_scan_dirty_space(c, jeb, PAD(je32_to_cpu(ri->totlen)))))
981 return err;
982 return 0;
983 }
984 ic = jffs2_scan_make_ino_cache(c, ino); 980 ic = jffs2_scan_make_ino_cache(c, ino);
985 if (!ic) 981 if (!ic)
986 return -ENOMEM; 982 return -ENOMEM;
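The sizeof(*ri)-8 in the unconditional CRC check above excludes the two trailing 32-bit CRC fields of the raw inode, so the node CRC covers everything before them. A standalone sketch of that layout convention with a toy node and zlib's crc32() (the toy struct and values are illustrative, and zlib's crc32 conventions differ from the kernel's -- the coverage is the point):

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <zlib.h>

struct toy_node {               /* illustrative, not the real jffs2_raw_inode */
        uint32_t magic;
        uint32_t version;
        uint32_t isize;
        uint32_t data_crc;      /* trailing CRC field */
        uint32_t node_crc;      /* trailing CRC field */
};

int main(void)
{
        struct toy_node n;

        memset(&n, 0, sizeof(n));
        n.version = 42;
        n.isize = 4096;
        /* cover everything up to, but not including, the two CRCs */
        n.node_crc = (uint32_t)crc32(0L, (const Bytef *)&n, sizeof(n) - 8);
        printf("node_crc = 0x%08x\n", (unsigned)n.node_crc);
        return 0;
}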
diff --git a/fs/jffs2/security.c b/fs/jffs2/security.c
index 52a9894a6364..bc9f6ba10823 100644
--- a/fs/jffs2/security.c
+++ b/fs/jffs2/security.c
@@ -1,13 +1,14 @@
1/* 1/*
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright (C) 2006 NEC Corporation 4 * Copyright © 2006 NEC Corporation
5 * 5 *
6 * Created by KaiGai Kohei <kaigai@ak.jp.nec.com> 6 * Created by KaiGai Kohei <kaigai@ak.jp.nec.com>
7 * 7 *
8 * For licensing information, see the file 'LICENCE' in this directory. 8 * For licensing information, see the file 'LICENCE' in this directory.
9 * 9 *
10 */ 10 */
11
11#include <linux/kernel.h> 12#include <linux/kernel.h>
12#include <linux/slab.h> 13#include <linux/slab.h>
13#include <linux/fs.h> 14#include <linux/fs.h>
diff --git a/fs/jffs2/summary.c b/fs/jffs2/summary.c
index 30f888414ce7..d828b296392a 100644
--- a/fs/jffs2/summary.c
+++ b/fs/jffs2/summary.c
@@ -1,16 +1,14 @@
1/* 1/*
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright (C) 2004 Ferenc Havasi <havasi@inf.u-szeged.hu>, 4 * Copyright © 2004 Ferenc Havasi <havasi@inf.u-szeged.hu>,
5 * Zoltan Sogor <weth@inf.u-szeged.hu>, 5 * Zoltan Sogor <weth@inf.u-szeged.hu>,
6 * Patrik Kluba <pajko@halom.u-szeged.hu>, 6 * Patrik Kluba <pajko@halom.u-szeged.hu>,
7 * University of Szeged, Hungary 7 * University of Szeged, Hungary
8 * 2006 KaiGai Kohei <kaigai@ak.jp.nec.com> 8 * 2006 KaiGai Kohei <kaigai@ak.jp.nec.com>
9 * 9 *
10 * For licensing information, see the file 'LICENCE' in this directory. 10 * For licensing information, see the file 'LICENCE' in this directory.
11 * 11 *
12 * $Id: summary.c,v 1.4 2005/09/26 11:37:21 havasi Exp $
13 *
14 */ 12 */
15 13
16#include <linux/kernel.h> 14#include <linux/kernel.h>
diff --git a/fs/jffs2/summary.h b/fs/jffs2/summary.h
index 6bf1f6aa4552..0c6669e21390 100644
--- a/fs/jffs2/summary.h
+++ b/fs/jffs2/summary.h
@@ -1,15 +1,13 @@
1/* 1/*
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright (C) 2004 Ferenc Havasi <havasi@inf.u-szeged.hu>, 4 * Copyright © 2004 Ferenc Havasi <havasi@inf.u-szeged.hu>,
5 * Zoltan Sogor <weth@inf.u-szeged.hu>, 5 * Zoltan Sogor <weth@inf.u-szeged.hu>,
6 * Patrik Kluba <pajko@halom.u-szeged.hu>, 6 * Patrik Kluba <pajko@halom.u-szeged.hu>,
7 * University of Szeged, Hungary 7 * University of Szeged, Hungary
8 * 8 *
9 * For licensing information, see the file 'LICENCE' in this directory. 9 * For licensing information, see the file 'LICENCE' in this directory.
10 * 10 *
11 * $Id: summary.h,v 1.2 2005/09/26 11:37:21 havasi Exp $
12 *
13 */ 11 */
14 12
15#ifndef JFFS2_SUMMARY_H 13#ifndef JFFS2_SUMMARY_H
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index cc7e8e71ad46..45368f8bbe72 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -1,14 +1,12 @@
1/* 1/*
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright (C) 2001-2003 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * 5 *
6 * Created by David Woodhouse <dwmw2@infradead.org> 6 * Created by David Woodhouse <dwmw2@infradead.org>
7 * 7 *
8 * For licensing information, see the file 'LICENCE' in this directory. 8 * For licensing information, see the file 'LICENCE' in this directory.
9 * 9 *
10 * $Id: super.c,v 1.110 2005/11/07 11:14:42 gleixner Exp $
11 *
12 */ 10 */
13 11
14#include <linux/kernel.h> 12#include <linux/kernel.h>
@@ -49,8 +47,7 @@ static void jffs2_i_init_once(void * foo, struct kmem_cache * cachep, unsigned l
49{ 47{
50 struct jffs2_inode_info *ei = (struct jffs2_inode_info *) foo; 48 struct jffs2_inode_info *ei = (struct jffs2_inode_info *) foo;
51 49
52 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == 50 if (flags & SLAB_CTOR_CONSTRUCTOR) {
53 SLAB_CTOR_CONSTRUCTOR) {
54 init_MUTEX(&ei->sem); 51 init_MUTEX(&ei->sem);
55 inode_init_once(&ei->vfs_inode); 52 inode_init_once(&ei->vfs_inode);
56 } 53 }
@@ -347,7 +344,7 @@ static int __init init_jffs2_fs(void)
347#ifdef CONFIG_JFFS2_SUMMARY 344#ifdef CONFIG_JFFS2_SUMMARY
348 " (SUMMARY) " 345 " (SUMMARY) "
349#endif 346#endif
350 " (C) 2001-2006 Red Hat, Inc.\n"); 347 " © 2001-2006 Red Hat, Inc.\n");
351 348
352 jffs2_inode_cachep = kmem_cache_create("jffs2_i", 349 jffs2_inode_cachep = kmem_cache_create("jffs2_i",
353 sizeof(struct jffs2_inode_info), 350 sizeof(struct jffs2_inode_info),
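
With SLAB_CTOR_VERIFY gone from the slab constructor flags, the two-flag test in init_once() collapses to a plain bit test, a change repeated for jfs, locks.c, minix, ncpfs and nfs below. A minimal userspace sketch of why the two predicates agree once VERIFY can never be passed (flag values are illustrative, not the kernel's):

#include <assert.h>
#include <stdio.h>

/* Illustrative flag values; the real SLAB_CTOR_* constants live in
 * the kernel's slab headers. */
#define SLAB_CTOR_CONSTRUCTOR 0x1UL
#define SLAB_CTOR_VERIFY      0x2UL

/* Old predicate: constructor requested and not a verify pass. */
static int old_check(unsigned long flags)
{
    return (flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) ==
           SLAB_CTOR_CONSTRUCTOR;
}

/* New predicate: once callers can no longer pass SLAB_CTOR_VERIFY,
 * testing the constructor bit alone is equivalent. */
static int new_check(unsigned long flags)
{
    return !!(flags & SLAB_CTOR_CONSTRUCTOR);
}

int main(void)
{
    unsigned long flags;

    /* Only inputs without the retired VERIFY bit can occur now. */
    for (flags = 0; flags < 2; flags++)
        assert(old_check(flags) == new_check(flags));
    printf("predicates agree once SLAB_CTOR_VERIFY is never set\n");
    return 0;
}
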
diff --git a/fs/jffs2/symlink.c b/fs/jffs2/symlink.c
index 7e4882c8a7ed..b7339c3b6ad9 100644
--- a/fs/jffs2/symlink.c
+++ b/fs/jffs2/symlink.c
@@ -1,17 +1,14 @@
1/* 1/*
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright (C) 2001, 2002 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * 5 *
6 * Created by David Woodhouse <dwmw2@infradead.org> 6 * Created by David Woodhouse <dwmw2@infradead.org>
7 * 7 *
8 * For licensing information, see the file 'LICENCE' in this directory. 8 * For licensing information, see the file 'LICENCE' in this directory.
9 * 9 *
10 * $Id: symlink.c,v 1.19 2005/11/07 11:14:42 gleixner Exp $
11 *
12 */ 10 */
13 11
14
15#include <linux/kernel.h> 12#include <linux/kernel.h>
16#include <linux/slab.h> 13#include <linux/slab.h>
17#include <linux/fs.h> 14#include <linux/fs.h>
diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index 4fac6dd53954..c556e85a565c 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -1,16 +1,14 @@
1/* 1/*
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright (C) 2001-2003 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * Copyright (C) 2004 Thomas Gleixner <tglx@linutronix.de> 5 * Copyright © 2004 Thomas Gleixner <tglx@linutronix.de>
6 * 6 *
7 * Created by David Woodhouse <dwmw2@infradead.org> 7 * Created by David Woodhouse <dwmw2@infradead.org>
8 * Modified, debugged and enhanced by Thomas Gleixner <tglx@linutronix.de> 8 * Modified, debugged and enhanced by Thomas Gleixner <tglx@linutronix.de>
9 * 9 *
10 * For licensing information, see the file 'LICENCE' in this directory. 10 * For licensing information, see the file 'LICENCE' in this directory.
11 * 11 *
12 * $Id: wbuf.c,v 1.100 2005/09/30 13:59:13 dedekind Exp $
13 *
14 */ 12 */
15 13
16#include <linux/kernel.h> 14#include <linux/kernel.h>
@@ -345,6 +343,9 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c)
345 return; 343 return;
346 } 344 }
347 345
346 /* The summary is not recovered, so it must be disabled for this erase block */
347 jffs2_sum_disable_collecting(c->summary);
348
348 ret = jffs2_prealloc_raw_node_refs(c, c->nextblock, nr_refile); 349 ret = jffs2_prealloc_raw_node_refs(c, c->nextblock, nr_refile);
349 if (ret) { 350 if (ret) {
350 printk(KERN_WARNING "Failed to allocate node refs for wbuf recovery. Data loss ensues.\n"); 351 printk(KERN_WARNING "Failed to allocate node refs for wbuf recovery. Data loss ensues.\n");
@@ -967,9 +968,9 @@ exit:
967 968
968static const struct jffs2_unknown_node oob_cleanmarker = 969static const struct jffs2_unknown_node oob_cleanmarker =
969{ 970{
970 .magic = cpu_to_je16(JFFS2_MAGIC_BITMASK), 971 .magic = constant_cpu_to_je16(JFFS2_MAGIC_BITMASK),
971 .nodetype = cpu_to_je16(JFFS2_NODETYPE_CLEANMARKER), 972 .nodetype = constant_cpu_to_je16(JFFS2_NODETYPE_CLEANMARKER),
972 .totlen = cpu_to_je32(8) 973 .totlen = constant_cpu_to_je32(8)
973}; 974};
974 975
975/* 976/*
@@ -1208,3 +1209,27 @@ int jffs2_nor_wbuf_flash_setup(struct jffs2_sb_info *c) {
1208void jffs2_nor_wbuf_flash_cleanup(struct jffs2_sb_info *c) { 1209void jffs2_nor_wbuf_flash_cleanup(struct jffs2_sb_info *c) {
1209 kfree(c->wbuf); 1210 kfree(c->wbuf);
1210} 1211}
1212
1213int jffs2_ubivol_setup(struct jffs2_sb_info *c) {
1214 c->cleanmarker_size = 0;
1215
1216 if (c->mtd->writesize == 1)
1217 /* We do not need write-buffer */
1218 return 0;
1219
1220 init_rwsem(&c->wbuf_sem);
1221
1222 c->wbuf_pagesize = c->mtd->writesize;
1223 c->wbuf_ofs = 0xFFFFFFFF;
1224 c->wbuf = kmalloc(c->wbuf_pagesize, GFP_KERNEL);
1225 if (!c->wbuf)
1226 return -ENOMEM;
1227
1228 printk(KERN_INFO "JFFS2 write-buffering enabled buffer (%d) erasesize (%d)\n", c->wbuf_pagesize, c->sector_size);
1229
1230 return 0;
1231}
1232
1233void jffs2_ubivol_cleanup(struct jffs2_sb_info *c) {
1234 kfree(c->wbuf);
1235}
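
Two things happen in the wbuf.c hunks above: wbuf recovery now disables summary collection for the erase block it abandons, and the static oob_cleanmarker initializer switches to constant_cpu_to_je16()/constant_cpu_to_je32(), which fold the byte-order conversion at compile time so the expression remains a valid static initializer. A rough userspace model of the constant-swap idea (the real helpers also honour the configured JFFS2 byte order and wrap the value in jint16_t):

#include <stdint.h>
#include <stdio.h>

/* Compile-time 16-bit swap; a constant expression, unlike a call to
 * an inline conversion function. */
#define CONST_SWAB16(x) ((uint16_t)((((x) & 0x00ffU) << 8) | \
                                    (((x) & 0xff00U) >> 8)))

/* 0x1985 is JFFS2_MAGIC_BITMASK; usable in a static initializer
 * because the macro folds during compilation. */
static const uint16_t cleanmarker_magic = CONST_SWAB16(0x1985);

int main(void)
{
    printf("swapped magic: 0x%04x\n", cleanmarker_magic);
    return 0;
}
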
diff --git a/fs/jffs2/write.c b/fs/jffs2/write.c
index 67176792e138..c9fe0ab3a329 100644
--- a/fs/jffs2/write.c
+++ b/fs/jffs2/write.c
@@ -1,14 +1,12 @@
1/* 1/*
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright (C) 2001-2003 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * 5 *
6 * Created by David Woodhouse <dwmw2@infradead.org> 6 * Created by David Woodhouse <dwmw2@infradead.org>
7 * 7 *
8 * For licensing information, see the file 'LICENCE' in this directory. 8 * For licensing information, see the file 'LICENCE' in this directory.
9 * 9 *
10 * $Id: write.c,v 1.97 2005/11/07 11:14:42 gleixner Exp $
11 *
12 */ 10 */
13 11
14#include <linux/kernel.h> 12#include <linux/kernel.h>
@@ -507,8 +505,7 @@ int jffs2_do_unlink(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f,
507 uint32_t alloclen; 505 uint32_t alloclen;
508 int ret; 506 int ret;
509 507
510 if (1 /* alternative branch needs testing */ || 508 if (!jffs2_can_mark_obsolete(c)) {
511 !jffs2_can_mark_obsolete(c)) {
512 /* We can't mark stuff obsolete on the medium. We need to write a deletion dirent */ 509 /* We can't mark stuff obsolete on the medium. We need to write a deletion dirent */
513 510
514 rd = jffs2_alloc_raw_dirent(); 511 rd = jffs2_alloc_raw_dirent();
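
The jffs2_do_unlink() hunk removes a "1 || ..." guard that had pinned execution to the write-a-deletion-dirent fallback while the alternative was untested; with it gone, media that support marking nodes obsolete in place take that cheaper branch again. The forced-branch idiom in miniature (jffs2_can_mark_obsolete() reduced to a stub):

#include <stdio.h>

static int can_mark_obsolete = 1;   /* stub for jffs2_can_mark_obsolete(c) */

int main(void)
{
    /* Before: the constant 1 short-circuits the real condition. */
    if (1 /* alternative branch needs testing */ || !can_mark_obsolete)
        printf("before: always writes a deletion dirent\n");

    /* After: the condition decides again. */
    if (!can_mark_obsolete)
        printf("after: deletion dirent path\n");
    else
        printf("after: marks the old node obsolete in place\n");
    return 0;
}
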
diff --git a/fs/jffs2/writev.c b/fs/jffs2/writev.c
index c638ae1008de..b9276b11bac6 100644
--- a/fs/jffs2/writev.c
+++ b/fs/jffs2/writev.c
@@ -1,14 +1,12 @@
1/* 1/*
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright (C) 2001, 2002 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * 5 *
6 * Created by David Woodhouse <dwmw2@infradead.org> 6 * Created by David Woodhouse <dwmw2@infradead.org>
7 * 7 *
8 * For licensing information, see the file 'LICENCE' in this directory. 8 * For licensing information, see the file 'LICENCE' in this directory.
9 * 9 *
10 * $Id: writev.c,v 1.8 2005/09/09 15:11:58 havasi Exp $
11 *
12 */ 10 */
13 11
14#include <linux/kernel.h> 12#include <linux/kernel.h>
diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c
index 4bb3f1897330..78fc08893a6c 100644
--- a/fs/jffs2/xattr.c
+++ b/fs/jffs2/xattr.c
@@ -1,13 +1,14 @@
1/* 1/*
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright (C) 2006 NEC Corporation 4 * Copyright © 2006 NEC Corporation
5 * 5 *
6 * Created by KaiGai Kohei <kaigai@ak.jp.nec.com> 6 * Created by KaiGai Kohei <kaigai@ak.jp.nec.com>
7 * 7 *
8 * For licensing information, see the file 'LICENCE' in this directory. 8 * For licensing information, see the file 'LICENCE' in this directory.
9 * 9 *
10 */ 10 */
11
11#include <linux/kernel.h> 12#include <linux/kernel.h>
12#include <linux/slab.h> 13#include <linux/slab.h>
13#include <linux/fs.h> 14#include <linux/fs.h>
diff --git a/fs/jffs2/xattr.h b/fs/jffs2/xattr.h
index 06a5c69dcf8b..3b0ff2925937 100644
--- a/fs/jffs2/xattr.h
+++ b/fs/jffs2/xattr.h
@@ -1,13 +1,14 @@
1/* 1/*
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright (C) 2006 NEC Corporation 4 * Copyright © 2006 NEC Corporation
5 * 5 *
6 * Created by KaiGai Kohei <kaigai@ak.jp.nec.com> 6 * Created by KaiGai Kohei <kaigai@ak.jp.nec.com>
7 * 7 *
8 * For licensing information, see the file 'LICENCE' in this directory. 8 * For licensing information, see the file 'LICENCE' in this directory.
9 * 9 *
10 */ 10 */
11
11#ifndef _JFFS2_FS_XATTR_H_ 12#ifndef _JFFS2_FS_XATTR_H_
12#define _JFFS2_FS_XATTR_H_ 13#define _JFFS2_FS_XATTR_H_
13 14
diff --git a/fs/jffs2/xattr_trusted.c b/fs/jffs2/xattr_trusted.c
index ed046e19dbfa..8ec5765ef348 100644
--- a/fs/jffs2/xattr_trusted.c
+++ b/fs/jffs2/xattr_trusted.c
@@ -1,13 +1,14 @@
1/* 1/*
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright (C) 2006 NEC Corporation 4 * Copyright © 2006 NEC Corporation
5 * 5 *
6 * Created by KaiGai Kohei <kaigai@ak.jp.nec.com> 6 * Created by KaiGai Kohei <kaigai@ak.jp.nec.com>
7 * 7 *
8 * For licensing information, see the file 'LICENCE' in this directory. 8 * For licensing information, see the file 'LICENCE' in this directory.
9 * 9 *
10 */ 10 */
11
11#include <linux/kernel.h> 12#include <linux/kernel.h>
12#include <linux/fs.h> 13#include <linux/fs.h>
13#include <linux/jffs2.h> 14#include <linux/jffs2.h>
diff --git a/fs/jffs2/xattr_user.c b/fs/jffs2/xattr_user.c
index 2f8e9aa01ea0..40942bc516bb 100644
--- a/fs/jffs2/xattr_user.c
+++ b/fs/jffs2/xattr_user.c
@@ -1,13 +1,14 @@
1/* 1/*
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright (C) 2006 NEC Corporation 4 * Copyright © 2006 NEC Corporation
5 * 5 *
6 * Created by KaiGai Kohei <kaigai@ak.jp.nec.com> 6 * Created by KaiGai Kohei <kaigai@ak.jp.nec.com>
7 * 7 *
8 * For licensing information, see the file 'LICENCE' in this directory. 8 * For licensing information, see the file 'LICENCE' in this directory.
9 * 9 *
10 */ 10 */
11
11#include <linux/kernel.h> 12#include <linux/kernel.h>
12#include <linux/fs.h> 13#include <linux/fs.h>
13#include <linux/jffs2.h> 14#include <linux/jffs2.h>
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index 58deae007507..6b3acb0b5781 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -184,8 +184,7 @@ static void init_once(void *foo, struct kmem_cache *cachep, unsigned long flags)
184{ 184{
185 struct metapage *mp = (struct metapage *)foo; 185 struct metapage *mp = (struct metapage *)foo;
186 186
187 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == 187 if (flags & SLAB_CTOR_CONSTRUCTOR) {
188 SLAB_CTOR_CONSTRUCTOR) {
189 mp->lid = 0; 188 mp->lid = 0;
190 mp->lsn = 0; 189 mp->lsn = 0;
191 mp->flag = 0; 190 mp->flag = 0;
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 52d73d54a931..ea9dc3e65dcf 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -752,8 +752,7 @@ static void init_once(void *foo, struct kmem_cache * cachep, unsigned long flags
752{ 752{
753 struct jfs_inode_info *jfs_ip = (struct jfs_inode_info *) foo; 753 struct jfs_inode_info *jfs_ip = (struct jfs_inode_info *) foo;
754 754
755 if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) == 755 if (flags & SLAB_CTOR_CONSTRUCTOR) {
756 SLAB_CTOR_CONSTRUCTOR) {
757 memset(jfs_ip, 0, sizeof(struct jfs_inode_info)); 756 memset(jfs_ip, 0, sizeof(struct jfs_inode_info));
758 INIT_LIST_HEAD(&jfs_ip->anon_inode_list); 757 INIT_LIST_HEAD(&jfs_ip->anon_inode_list);
759 init_rwsem(&jfs_ip->rdwrlock); 758 init_rwsem(&jfs_ip->rdwrlock);
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index eb243edf8932..2102e2d0134d 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -225,16 +225,13 @@ xdr_decode_stat(struct rpc_rqst *rqstp, __be32 *p, struct nsm_res *resp)
225#define SM_monres_sz 2 225#define SM_monres_sz 2
226#define SM_unmonres_sz 1 226#define SM_unmonres_sz 1
227 227
228#ifndef MAX
229# define MAX(a, b) (((a) > (b))? (a) : (b))
230#endif
231
232static struct rpc_procinfo nsm_procedures[] = { 228static struct rpc_procinfo nsm_procedures[] = {
233[SM_MON] = { 229[SM_MON] = {
234 .p_proc = SM_MON, 230 .p_proc = SM_MON,
235 .p_encode = (kxdrproc_t) xdr_encode_mon, 231 .p_encode = (kxdrproc_t) xdr_encode_mon,
236 .p_decode = (kxdrproc_t) xdr_decode_stat_res, 232 .p_decode = (kxdrproc_t) xdr_decode_stat_res,
237 .p_bufsiz = MAX(SM_mon_sz, SM_monres_sz) << 2, 233 .p_arglen = SM_mon_sz,
234 .p_replen = SM_monres_sz,
238 .p_statidx = SM_MON, 235 .p_statidx = SM_MON,
239 .p_name = "MONITOR", 236 .p_name = "MONITOR",
240 }, 237 },
@@ -242,7 +239,8 @@ static struct rpc_procinfo nsm_procedures[] = {
242 .p_proc = SM_UNMON, 239 .p_proc = SM_UNMON,
243 .p_encode = (kxdrproc_t) xdr_encode_unmon, 240 .p_encode = (kxdrproc_t) xdr_encode_unmon,
244 .p_decode = (kxdrproc_t) xdr_decode_stat, 241 .p_decode = (kxdrproc_t) xdr_decode_stat,
245 .p_bufsiz = MAX(SM_mon_id_sz, SM_unmonres_sz) << 2, 242 .p_arglen = SM_mon_id_sz,
243 .p_replen = SM_unmonres_sz,
246 .p_statidx = SM_UNMON, 244 .p_statidx = SM_UNMON,
247 .p_name = "UNMONITOR", 245 .p_name = "UNMONITOR",
248 }, 246 },
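
The nsm_procedures[] change is one instance of a tree-wide switch, continued in the lockd and nfs XDR files below, from a single p_bufsiz (bytes, sized to the larger of call and reply) to separate p_arglen and p_replen fields counted in 32-bit XDR words, which lets the RPC layer size send and receive buffers independently and retires the local MAX() macros. A sketch of the sizing change (struct modelled loosely on sunrpc's rpc_procinfo; SM_mon_sz is illustrative, SM_monres_sz matches the define visible above):

#include <stdio.h>

struct procinfo_sketch {
    unsigned int p_arglen;      /* call size, 32-bit XDR words */
    unsigned int p_replen;      /* reply size, 32-bit XDR words */
};

#define SM_mon_sz    11         /* illustrative, not the kernel value */
#define SM_monres_sz 2

int main(void)
{
    struct procinfo_sketch mon = {
        .p_arglen = SM_mon_sz,
        .p_replen = SM_monres_sz,
    };

    /* Old scheme: one buffer sized for the larger direction. */
    unsigned int old_bufsiz =
        (mon.p_arglen > mon.p_replen ? mon.p_arglen : mon.p_replen) << 2;

    printf("old p_bufsiz = %u bytes\n", old_bufsiz);
    printf("new: %u-word call, %u-word reply\n", mon.p_arglen, mon.p_replen);
    return 0;
}
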
diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c
index 34dae5d70738..9702956d206c 100644
--- a/fs/lockd/xdr.c
+++ b/fs/lockd/xdr.c
@@ -510,17 +510,20 @@ nlmclt_decode_res(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
510 return 0; 510 return 0;
511} 511}
512 512
513#if (NLMCLNT_OHSIZE > XDR_MAX_NETOBJ)
514# error "NLM host name cannot be larger than XDR_MAX_NETOBJ!"
515#endif
516
513/* 517/*
514 * Buffer requirements for NLM 518 * Buffer requirements for NLM
515 */ 519 */
516#define NLM_void_sz 0 520#define NLM_void_sz 0
517#define NLM_cookie_sz 1+XDR_QUADLEN(NLM_MAXCOOKIELEN) 521#define NLM_cookie_sz 1+XDR_QUADLEN(NLM_MAXCOOKIELEN)
518#define NLM_caller_sz 1+XDR_QUADLEN(sizeof(utsname()->nodename)) 522#define NLM_caller_sz 1+XDR_QUADLEN(NLMCLNT_OHSIZE)
519#define NLM_netobj_sz 1+XDR_QUADLEN(XDR_MAX_NETOBJ) 523#define NLM_owner_sz 1+XDR_QUADLEN(NLMCLNT_OHSIZE)
520/* #define NLM_owner_sz 1+XDR_QUADLEN(NLM_MAXOWNER) */
521#define NLM_fhandle_sz 1+XDR_QUADLEN(NFS2_FHSIZE) 524#define NLM_fhandle_sz 1+XDR_QUADLEN(NFS2_FHSIZE)
522#define NLM_lock_sz 3+NLM_caller_sz+NLM_netobj_sz+NLM_fhandle_sz 525#define NLM_lock_sz 3+NLM_caller_sz+NLM_owner_sz+NLM_fhandle_sz
523#define NLM_holder_sz 4+NLM_netobj_sz 526#define NLM_holder_sz 4+NLM_owner_sz
524 527
525#define NLM_testargs_sz NLM_cookie_sz+1+NLM_lock_sz 528#define NLM_testargs_sz NLM_cookie_sz+1+NLM_lock_sz
526#define NLM_lockargs_sz NLM_cookie_sz+4+NLM_lock_sz 529#define NLM_lockargs_sz NLM_cookie_sz+4+NLM_lock_sz
@@ -531,10 +534,6 @@ nlmclt_decode_res(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
531#define NLM_res_sz NLM_cookie_sz+1 534#define NLM_res_sz NLM_cookie_sz+1
532#define NLM_norep_sz 0 535#define NLM_norep_sz 0
533 536
534#ifndef MAX
535# define MAX(a, b) (((a) > (b))? (a) : (b))
536#endif
537
538/* 537/*
539 * For NLM, a void procedure really returns nothing 538 * For NLM, a void procedure really returns nothing
540 */ 539 */
@@ -545,7 +544,8 @@ nlmclt_decode_res(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
545 .p_proc = NLMPROC_##proc, \ 544 .p_proc = NLMPROC_##proc, \
546 .p_encode = (kxdrproc_t) nlmclt_encode_##argtype, \ 545 .p_encode = (kxdrproc_t) nlmclt_encode_##argtype, \
547 .p_decode = (kxdrproc_t) nlmclt_decode_##restype, \ 546 .p_decode = (kxdrproc_t) nlmclt_decode_##restype, \
548 .p_bufsiz = MAX(NLM_##argtype##_sz, NLM_##restype##_sz) << 2, \ 547 .p_arglen = NLM_##argtype##_sz, \
548 .p_replen = NLM_##restype##_sz, \
549 .p_statidx = NLMPROC_##proc, \ 549 .p_statidx = NLMPROC_##proc, \
550 .p_name = #proc, \ 550 .p_name = #proc, \
551 } 551 }
diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c
index a78240551219..ce1efdbe1b3a 100644
--- a/fs/lockd/xdr4.c
+++ b/fs/lockd/xdr4.c
@@ -516,17 +516,24 @@ nlm4clt_decode_res(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
516 return 0; 516 return 0;
517} 517}
518 518
519#if (NLMCLNT_OHSIZE > XDR_MAX_NETOBJ)
520# error "NLM host name cannot be larger than XDR_MAX_NETOBJ!"
521#endif
522
523#if (NLMCLNT_OHSIZE > NLM_MAXSTRLEN)
524# error "NLM host name cannot be larger than NLM's maximum string length!"
525#endif
526
519/* 527/*
520 * Buffer requirements for NLM 528 * Buffer requirements for NLM
521 */ 529 */
522#define NLM4_void_sz 0 530#define NLM4_void_sz 0
523#define NLM4_cookie_sz 1+XDR_QUADLEN(NLM_MAXCOOKIELEN) 531#define NLM4_cookie_sz 1+XDR_QUADLEN(NLM_MAXCOOKIELEN)
524#define NLM4_caller_sz 1+XDR_QUADLEN(NLM_MAXSTRLEN) 532#define NLM4_caller_sz 1+XDR_QUADLEN(NLMCLNT_OHSIZE)
525#define NLM4_netobj_sz 1+XDR_QUADLEN(XDR_MAX_NETOBJ) 533#define NLM4_owner_sz 1+XDR_QUADLEN(NLMCLNT_OHSIZE)
526/* #define NLM4_owner_sz 1+XDR_QUADLEN(NLM4_MAXOWNER) */
527#define NLM4_fhandle_sz 1+XDR_QUADLEN(NFS3_FHSIZE) 534#define NLM4_fhandle_sz 1+XDR_QUADLEN(NFS3_FHSIZE)
528#define NLM4_lock_sz 5+NLM4_caller_sz+NLM4_netobj_sz+NLM4_fhandle_sz 535#define NLM4_lock_sz 5+NLM4_caller_sz+NLM4_owner_sz+NLM4_fhandle_sz
529#define NLM4_holder_sz 6+NLM4_netobj_sz 536#define NLM4_holder_sz 6+NLM4_owner_sz
530 537
531#define NLM4_testargs_sz NLM4_cookie_sz+1+NLM4_lock_sz 538#define NLM4_testargs_sz NLM4_cookie_sz+1+NLM4_lock_sz
532#define NLM4_lockargs_sz NLM4_cookie_sz+4+NLM4_lock_sz 539#define NLM4_lockargs_sz NLM4_cookie_sz+4+NLM4_lock_sz
@@ -537,10 +544,6 @@ nlm4clt_decode_res(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
537#define NLM4_res_sz NLM4_cookie_sz+1 544#define NLM4_res_sz NLM4_cookie_sz+1
538#define NLM4_norep_sz 0 545#define NLM4_norep_sz 0
539 546
540#ifndef MAX
541# define MAX(a,b) (((a) > (b))? (a) : (b))
542#endif
543
544/* 547/*
545 * For NLM, a void procedure really returns nothing 548 * For NLM, a void procedure really returns nothing
546 */ 549 */
@@ -551,7 +554,8 @@ nlm4clt_decode_res(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
551 .p_proc = NLMPROC_##proc, \ 554 .p_proc = NLMPROC_##proc, \
552 .p_encode = (kxdrproc_t) nlm4clt_encode_##argtype, \ 555 .p_encode = (kxdrproc_t) nlm4clt_encode_##argtype, \
553 .p_decode = (kxdrproc_t) nlm4clt_decode_##restype, \ 556 .p_decode = (kxdrproc_t) nlm4clt_decode_##restype, \
554 .p_bufsiz = MAX(NLM4_##argtype##_sz, NLM4_##restype##_sz) << 2, \ 557 .p_arglen = NLM4_##argtype##_sz, \
558 .p_replen = NLM4_##restype##_sz, \
555 .p_statidx = NLMPROC_##proc, \ 559 .p_statidx = NLMPROC_##proc, \
556 .p_name = #proc, \ 560 .p_name = #proc, \
557 } 561 }
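
The #if/#error pairs added to xdr.c and xdr4.c are preprocessor-time assertions: should NLMCLNT_OHSIZE ever exceed the XDR netobj limit (or, for NLMv4, NLM's maximum string length), the build fails immediately rather than truncating owner strings at run time. The same pattern with stand-in sizes:

#include <stdio.h>

#define XDR_MAX_NETOBJ 1024     /* illustrative, not the kernel value */
#define NLMCLNT_OHSIZE 256      /* illustrative */

#if (NLMCLNT_OHSIZE > XDR_MAX_NETOBJ)
# error "NLM host name cannot be larger than XDR_MAX_NETOBJ!"
#endif

int main(void)
{
    printf("owner size checked at compile time\n");
    return 0;
}
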
diff --git a/fs/locks.c b/fs/locks.c
index 53b0cd153202..671a034dc999 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -203,8 +203,7 @@ static void init_once(void *foo, struct kmem_cache *cache, unsigned long flags)
203{ 203{
204 struct file_lock *lock = (struct file_lock *) foo; 204 struct file_lock *lock = (struct file_lock *) foo;
205 205
206 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) != 206 if (!(flags & SLAB_CTOR_CONSTRUCTOR))
207 SLAB_CTOR_CONSTRUCTOR)
208 return; 207 return;
209 208
210 locks_init_lock(lock); 209 locks_init_lock(lock);
diff --git a/fs/minix/dir.c b/fs/minix/dir.c
index cb4cb571fddf..e207cbe70951 100644
--- a/fs/minix/dir.c
+++ b/fs/minix/dir.c
@@ -65,7 +65,6 @@ static struct page * dir_get_page(struct inode *dir, unsigned long n)
65 struct address_space *mapping = dir->i_mapping; 65 struct address_space *mapping = dir->i_mapping;
66 struct page *page = read_mapping_page(mapping, n, NULL); 66 struct page *page = read_mapping_page(mapping, n, NULL);
67 if (!IS_ERR(page)) { 67 if (!IS_ERR(page)) {
68 wait_on_page_locked(page);
69 kmap(page); 68 kmap(page);
70 if (!PageUptodate(page)) 69 if (!PageUptodate(page))
71 goto fail; 70 goto fail;
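
dir_get_page() can drop wait_on_page_locked() because read_mapping_page() in this series waits for the read itself and reports failure through an error pointer, which is also why page_getlink() in the namei.c hunk below simply returns the failed page pointer. A userspace model of the ERR_PTR encoding those callers rely on (constants illustrative):

#include <stdio.h>

#define MAX_ERRNO 4095
#define EIO 5

static inline void *ERR_PTR(long error) { return (void *)error; }
static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
static inline int IS_ERR(const void *ptr)
{
    /* Errnos occupy the top MAX_ERRNO addresses of the pointer space. */
    return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

static void *read_page_stub(int fail)
{
    static char page[4096];
    return fail ? ERR_PTR(-EIO) : (void *)page;
}

int main(void)
{
    void *page = read_page_stub(1);

    if (IS_ERR(page))
        printf("read failed: errno %ld\n", -PTR_ERR(page));
    return 0;
}
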
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 92e383af3709..2f4d43a2a310 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -73,8 +73,7 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag
73{ 73{
74 struct minix_inode_info *ei = (struct minix_inode_info *) foo; 74 struct minix_inode_info *ei = (struct minix_inode_info *) foo;
75 75
76 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == 76 if (flags & SLAB_CTOR_CONSTRUCTOR)
77 SLAB_CTOR_CONSTRUCTOR)
78 inode_init_once(&ei->vfs_inode); 77 inode_init_once(&ei->vfs_inode);
79} 78}
80 79
diff --git a/fs/namei.c b/fs/namei.c
index ee60cc4d3453..94b2f60aec22 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1243,22 +1243,13 @@ int __user_path_lookup_open(const char __user *name, unsigned int lookup_flags,
1243 return err; 1243 return err;
1244} 1244}
1245 1245
1246/* 1246static inline struct dentry *__lookup_hash_kern(struct qstr *name, struct dentry *base, struct nameidata *nd)
1247 * Restricted form of lookup. Doesn't follow links, single-component only,
1248 * needs parent already locked. Doesn't follow mounts.
1249 * SMP-safe.
1250 */
1251static struct dentry * __lookup_hash(struct qstr *name, struct dentry * base, struct nameidata *nd)
1252{ 1247{
1253 struct dentry * dentry; 1248 struct dentry *dentry;
1254 struct inode *inode; 1249 struct inode *inode;
1255 int err; 1250 int err;
1256 1251
1257 inode = base->d_inode; 1252 inode = base->d_inode;
1258 err = permission(inode, MAY_EXEC, nd);
1259 dentry = ERR_PTR(err);
1260 if (err)
1261 goto out;
1262 1253
1263 /* 1254 /*
1264 * See if the low-level filesystem might want 1255 * See if the low-level filesystem might want
@@ -1287,35 +1278,76 @@ out:
1287 return dentry; 1278 return dentry;
1288} 1279}
1289 1280
1281/*
1282 * Restricted form of lookup. Doesn't follow links, single-component only,
1283 * needs parent already locked. Doesn't follow mounts.
1284 * SMP-safe.
1285 */
1286static inline struct dentry * __lookup_hash(struct qstr *name, struct dentry *base, struct nameidata *nd)
1287{
1288 struct dentry *dentry;
1289 struct inode *inode;
1290 int err;
1291
1292 inode = base->d_inode;
1293
1294 err = permission(inode, MAY_EXEC, nd);
1295 dentry = ERR_PTR(err);
1296 if (err)
1297 goto out;
1298
1299 dentry = __lookup_hash_kern(name, base, nd);
1300out:
1301 return dentry;
1302}
1303
1290static struct dentry *lookup_hash(struct nameidata *nd) 1304static struct dentry *lookup_hash(struct nameidata *nd)
1291{ 1305{
1292 return __lookup_hash(&nd->last, nd->dentry, nd); 1306 return __lookup_hash(&nd->last, nd->dentry, nd);
1293} 1307}
1294 1308
1295/* SMP-safe */ 1309/* SMP-safe */
1296struct dentry * lookup_one_len(const char * name, struct dentry * base, int len) 1310static inline int __lookup_one_len(const char *name, struct qstr *this, struct dentry *base, int len)
1297{ 1311{
1298 unsigned long hash; 1312 unsigned long hash;
1299 struct qstr this;
1300 unsigned int c; 1313 unsigned int c;
1301 1314
1302 this.name = name; 1315 this->name = name;
1303 this.len = len; 1316 this->len = len;
1304 if (!len) 1317 if (!len)
1305 goto access; 1318 return -EACCES;
1306 1319
1307 hash = init_name_hash(); 1320 hash = init_name_hash();
1308 while (len--) { 1321 while (len--) {
1309 c = *(const unsigned char *)name++; 1322 c = *(const unsigned char *)name++;
1310 if (c == '/' || c == '\0') 1323 if (c == '/' || c == '\0')
1311 goto access; 1324 return -EACCES;
1312 hash = partial_name_hash(c, hash); 1325 hash = partial_name_hash(c, hash);
1313 } 1326 }
1314 this.hash = end_name_hash(hash); 1327 this->hash = end_name_hash(hash);
1328 return 0;
1329}
1330
1331struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
1332{
1333 int err;
1334 struct qstr this;
1315 1335
1336 err = __lookup_one_len(name, &this, base, len);
1337 if (err)
1338 return ERR_PTR(err);
1316 return __lookup_hash(&this, base, NULL); 1339 return __lookup_hash(&this, base, NULL);
1317access: 1340}
1318 return ERR_PTR(-EACCES); 1341
1342struct dentry *lookup_one_len_kern(const char *name, struct dentry *base, int len)
1343{
1344 int err;
1345 struct qstr this;
1346
1347 err = __lookup_one_len(name, &this, base, len);
1348 if (err)
1349 return ERR_PTR(err);
1350 return __lookup_hash_kern(&this, base, NULL);
1319} 1351}
1320 1352
1321/* 1353/*
@@ -2639,19 +2671,9 @@ static char *page_getlink(struct dentry * dentry, struct page **ppage)
2639 struct address_space *mapping = dentry->d_inode->i_mapping; 2671 struct address_space *mapping = dentry->d_inode->i_mapping;
2640 page = read_mapping_page(mapping, 0, NULL); 2672 page = read_mapping_page(mapping, 0, NULL);
2641 if (IS_ERR(page)) 2673 if (IS_ERR(page))
2642 goto sync_fail; 2674 return (char*)page;
2643 wait_on_page_locked(page);
2644 if (!PageUptodate(page))
2645 goto async_fail;
2646 *ppage = page; 2675 *ppage = page;
2647 return kmap(page); 2676 return kmap(page);
2648
2649async_fail:
2650 page_cache_release(page);
2651 return ERR_PTR(-EIO);
2652
2653sync_fail:
2654 return (char*)page;
2655} 2677}
2656 2678
2657int page_readlink(struct dentry *dentry, char __user *buffer, int buflen) 2679int page_readlink(struct dentry *dentry, char __user *buffer, int buflen)
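
The namei.c refactor splits lookup_one_len() in two: __lookup_one_len() validates the component and computes its hash, then __lookup_hash() keeps the permission(MAY_EXEC) check for user-driven lookups while the new __lookup_hash_kern() and lookup_one_len_kern() skip it for in-kernel callers. A userspace model of the validate-and-hash step (the hash mixing is a stand-in for init_name_hash() and partial_name_hash()):

#include <stdio.h>

#define EACCES 13

static int lookup_one_len_model(const char *name, size_t len,
                                unsigned long *hash_out)
{
    unsigned long hash = 0;

    if (!len)
        return -EACCES;
    while (len--) {
        unsigned char c = (unsigned char)*name++;

        if (c == '/' || c == '\0')      /* no separators, no embedded NUL */
            return -EACCES;
        hash = hash * 31 + c;           /* stand-in for partial_name_hash */
    }
    *hash_out = hash;
    return 0;
}

int main(void)
{
    unsigned long h;

    printf("\"file\" -> %d\n", lookup_one_len_model("file", 4, &h));
    printf("\"a/b\"  -> %d\n", lookup_one_len_model("a/b", 3, &h));
    return 0;
}
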
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index 7285c94956c4..c29f00ad495d 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -60,8 +60,7 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag
60{ 60{
61 struct ncp_inode_info *ei = (struct ncp_inode_info *) foo; 61 struct ncp_inode_info *ei = (struct ncp_inode_info *) foo;
62 62
63 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == 63 if (flags & SLAB_CTOR_CONSTRUCTOR) {
64 SLAB_CTOR_CONSTRUCTOR) {
65 mutex_init(&ei->open_mutex); 64 mutex_init(&ei->open_mutex);
66 inode_init_once(&ei->vfs_inode); 65 inode_init_once(&ei->vfs_inode);
67 } 66 }
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 2190e6c2792e..5bd03b97002e 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -618,7 +618,8 @@ static int nfs_init_server(struct nfs_server *server, const struct nfs_mount_dat
618 if (clp->cl_nfsversion == 3) { 618 if (clp->cl_nfsversion == 3) {
619 if (server->namelen == 0 || server->namelen > NFS3_MAXNAMLEN) 619 if (server->namelen == 0 || server->namelen > NFS3_MAXNAMLEN)
620 server->namelen = NFS3_MAXNAMLEN; 620 server->namelen = NFS3_MAXNAMLEN;
621 server->caps |= NFS_CAP_READDIRPLUS; 621 if (!(data->flags & NFS_MOUNT_NORDIRPLUS))
622 server->caps |= NFS_CAP_READDIRPLUS;
622 } else { 623 } else {
623 if (server->namelen == 0 || server->namelen > NFS2_MAXNAMLEN) 624 if (server->namelen == 0 || server->namelen > NFS2_MAXNAMLEN)
624 server->namelen = NFS2_MAXNAMLEN; 625 server->namelen = NFS2_MAXNAMLEN;
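
The client.c hunk makes READDIRPLUS opt-out on NFSv3 mounts: the capability is set only when the new NFS_MOUNT_NORDIRPLUS flag is absent. The gating in miniature (flag values illustrative, not the kernel's constants):

#include <stdio.h>

#define MOUNT_NORDIRPLUS 0x1u
#define CAP_READDIRPLUS  0x1u

int main(void)
{
    unsigned int mount_flags = MOUNT_NORDIRPLUS;    /* "nordirplus" given */
    unsigned int caps = 0;

    if (!(mount_flags & MOUNT_NORDIRPLUS))
        caps |= CAP_READDIRPLUS;
    printf("readdirplus %s\n",
           caps & CAP_READDIRPLUS ? "enabled" : "disabled");
    return 0;
}
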
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index cd3469720cbf..625d8e5fb39d 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -154,6 +154,8 @@ typedef struct {
154 decode_dirent_t decode; 154 decode_dirent_t decode;
155 int plus; 155 int plus;
156 int error; 156 int error;
157 unsigned long timestamp;
158 int timestamp_valid;
157} nfs_readdir_descriptor_t; 159} nfs_readdir_descriptor_t;
158 160
159/* Now we cache directories properly, by stuffing the dirent 161/* Now we cache directories properly, by stuffing the dirent
@@ -195,6 +197,8 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page)
195 } 197 }
196 goto error; 198 goto error;
197 } 199 }
200 desc->timestamp = timestamp;
201 desc->timestamp_valid = 1;
198 SetPageUptodate(page); 202 SetPageUptodate(page);
199 spin_lock(&inode->i_lock); 203 spin_lock(&inode->i_lock);
200 NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATIME; 204 NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATIME;
@@ -225,6 +229,10 @@ int dir_decode(nfs_readdir_descriptor_t *desc)
225 if (IS_ERR(p)) 229 if (IS_ERR(p))
226 return PTR_ERR(p); 230 return PTR_ERR(p);
227 desc->ptr = p; 231 desc->ptr = p;
232 if (desc->timestamp_valid)
233 desc->entry->fattr->time_start = desc->timestamp;
234 else
235 desc->entry->fattr->valid &= ~NFS_ATTR_FATTR;
228 return 0; 236 return 0;
229} 237}
230 238
@@ -316,14 +324,16 @@ int find_dirent_page(nfs_readdir_descriptor_t *desc)
316 __FUNCTION__, desc->page_index, 324 __FUNCTION__, desc->page_index,
317 (long long) *desc->dir_cookie); 325 (long long) *desc->dir_cookie);
318 326
327 /* If we find the page in the page_cache, we cannot be sure
328 * how fresh the data is, so we will ignore readdir_plus attributes.
329 */
330 desc->timestamp_valid = 0;
319 page = read_cache_page(inode->i_mapping, desc->page_index, 331 page = read_cache_page(inode->i_mapping, desc->page_index,
320 (filler_t *)nfs_readdir_filler, desc); 332 (filler_t *)nfs_readdir_filler, desc);
321 if (IS_ERR(page)) { 333 if (IS_ERR(page)) {
322 status = PTR_ERR(page); 334 status = PTR_ERR(page);
323 goto out; 335 goto out;
324 } 336 }
325 if (!PageUptodate(page))
326 goto read_error;
327 337
328 /* NOTE: Someone else may have changed the READDIRPLUS flag */ 338 /* NOTE: Someone else may have changed the READDIRPLUS flag */
329 desc->page = page; 339 desc->page = page;
@@ -337,9 +347,6 @@ int find_dirent_page(nfs_readdir_descriptor_t *desc)
337 out: 347 out:
338 dfprintk(DIRCACHE, "NFS: %s: returns %d\n", __FUNCTION__, status); 348 dfprintk(DIRCACHE, "NFS: %s: returns %d\n", __FUNCTION__, status);
339 return status; 349 return status;
340 read_error:
341 page_cache_release(page);
342 return -EIO;
343} 350}
344 351
345/* 352/*
@@ -468,6 +475,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
468 struct rpc_cred *cred = nfs_file_cred(file); 475 struct rpc_cred *cred = nfs_file_cred(file);
469 struct page *page = NULL; 476 struct page *page = NULL;
470 int status; 477 int status;
478 unsigned long timestamp;
471 479
472 dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %Lu\n", 480 dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %Lu\n",
473 (unsigned long long)*desc->dir_cookie); 481 (unsigned long long)*desc->dir_cookie);
@@ -477,6 +485,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
477 status = -ENOMEM; 485 status = -ENOMEM;
478 goto out; 486 goto out;
479 } 487 }
488 timestamp = jiffies;
480 desc->error = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred, *desc->dir_cookie, 489 desc->error = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred, *desc->dir_cookie,
481 page, 490 page,
482 NFS_SERVER(inode)->dtsize, 491 NFS_SERVER(inode)->dtsize,
@@ -487,6 +496,8 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
487 desc->page = page; 496 desc->page = page;
488 desc->ptr = kmap(page); /* matching kunmap in nfs_do_filldir */ 497 desc->ptr = kmap(page); /* matching kunmap in nfs_do_filldir */
489 if (desc->error >= 0) { 498 if (desc->error >= 0) {
499 desc->timestamp = timestamp;
500 desc->timestamp_valid = 1;
490 if ((status = dir_decode(desc)) == 0) 501 if ((status = dir_decode(desc)) == 0)
491 desc->entry->prev_cookie = *desc->dir_cookie; 502 desc->entry->prev_cookie = *desc->dir_cookie;
492 } else 503 } else
@@ -849,6 +860,10 @@ static int nfs_dentry_delete(struct dentry *dentry)
849static void nfs_dentry_iput(struct dentry *dentry, struct inode *inode) 860static void nfs_dentry_iput(struct dentry *dentry, struct inode *inode)
850{ 861{
851 nfs_inode_return_delegation(inode); 862 nfs_inode_return_delegation(inode);
863 if (S_ISDIR(inode->i_mode))
864 /* drop any readdir cache as it could easily be old */
865 NFS_I(inode)->cache_validity |= NFS_INO_INVALID_DATA;
866
852 if (dentry->d_flags & DCACHE_NFSFS_RENAMED) { 867 if (dentry->d_flags & DCACHE_NFSFS_RENAMED) {
853 lock_kernel(); 868 lock_kernel();
854 drop_nlink(inode); 869 drop_nlink(inode);
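
The dir.c changes stamp each readdir page with the time the READDIR call was issued, so attributes decoded from readdirplus entries are trusted only when that timestamp is known; a page found already sitting in the page cache gets timestamp_valid = 0 and its fattr is invalidated instead. A small model of the gate dir_decode() now applies (flag value illustrative):

#include <stdio.h>

#define NFS_ATTR_FATTR 0x1u     /* illustrative */

struct fattr_model {
    unsigned int valid;
    unsigned long time_start;
};

static void decode_gate(struct fattr_model *fattr,
                        int timestamp_valid, unsigned long timestamp)
{
    if (timestamp_valid)
        fattr->time_start = timestamp;      /* attrs can be aged from here */
    else
        fattr->valid &= ~NFS_ATTR_FATTR;    /* freshness unknown: drop them */
}

int main(void)
{
    struct fattr_model f = { .valid = NFS_ATTR_FATTR };

    decode_gate(&f, 0, 0);                  /* page was found in the cache */
    printf("attributes %s\n",
           f.valid & NFS_ATTR_FATTR ? "kept" : "dropped");
    return 0;
}
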
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 2877744cb606..889de60f8a84 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -54,6 +54,7 @@
54#include <asm/uaccess.h> 54#include <asm/uaccess.h>
55#include <asm/atomic.h> 55#include <asm/atomic.h>
56 56
57#include "internal.h"
57#include "iostat.h" 58#include "iostat.h"
58 59
59#define NFSDBG_FACILITY NFSDBG_VFS 60#define NFSDBG_FACILITY NFSDBG_VFS
@@ -271,7 +272,7 @@ static ssize_t nfs_direct_read_schedule(struct nfs_direct_req *dreq, unsigned lo
271 bytes = min(rsize,count); 272 bytes = min(rsize,count);
272 273
273 result = -ENOMEM; 274 result = -ENOMEM;
274 data = nfs_readdata_alloc(pgbase + bytes); 275 data = nfs_readdata_alloc(nfs_page_array_len(pgbase, bytes));
275 if (unlikely(!data)) 276 if (unlikely(!data))
276 break; 277 break;
277 278
@@ -602,7 +603,7 @@ static ssize_t nfs_direct_write_schedule(struct nfs_direct_req *dreq, unsigned l
602 bytes = min(wsize,count); 603 bytes = min(wsize,count);
603 604
604 result = -ENOMEM; 605 result = -ENOMEM;
605 data = nfs_writedata_alloc(pgbase + bytes); 606 data = nfs_writedata_alloc(nfs_page_array_len(pgbase, bytes));
606 if (unlikely(!data)) 607 if (unlikely(!data))
607 break; 608 break;
608 609
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 44aa9b726573..1e9a915d1fea 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1167,8 +1167,7 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag
1167{ 1167{
1168 struct nfs_inode *nfsi = (struct nfs_inode *) foo; 1168 struct nfs_inode *nfsi = (struct nfs_inode *) foo;
1169 1169
1170 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == 1170 if (flags & SLAB_CTOR_CONSTRUCTOR) {
1171 SLAB_CTOR_CONSTRUCTOR) {
1172 inode_init_once(&nfsi->vfs_inode); 1171 inode_init_once(&nfsi->vfs_inode);
1173 spin_lock_init(&nfsi->req_lock); 1172 spin_lock_init(&nfsi->req_lock);
1174 INIT_LIST_HEAD(&nfsi->dirty); 1173 INIT_LIST_HEAD(&nfsi->dirty);
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 6610f2b02077..ad2b40db1e65 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -231,3 +231,15 @@ unsigned int nfs_page_length(struct page *page)
231 } 231 }
232 return 0; 232 return 0;
233} 233}
234
235/*
236 * Determine the number of pages in an array of length 'len' and
237 * with a base offset of 'base'
238 */
239static inline
240unsigned int nfs_page_array_len(unsigned int base, size_t len)
241{
242 return ((unsigned long)len + (unsigned long)base +
243 PAGE_SIZE - 1) >> PAGE_SHIFT;
244}
245
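
nfs_page_array_len() rounds base + len up to whole pages, which is why the direct-I/O callers above now pass nfs_page_array_len(pgbase, bytes) rather than a raw byte count: a transfer that starts mid-page can straddle one page more than len alone suggests. The arithmetic, standalone:

#include <stdio.h>

#define PAGE_SIZE  4096UL
#define PAGE_SHIFT 12

static unsigned int page_array_len(unsigned int base, size_t len)
{
    /* Same expression as the helper above. */
    return ((unsigned long)len + (unsigned long)base + PAGE_SIZE - 1)
            >> PAGE_SHIFT;
}

int main(void)
{
    printf("base=0,   len=4096 -> %u page(s)\n", page_array_len(0, 4096));
    printf("base=100, len=4096 -> %u page(s)\n", page_array_len(100, 4096));
    return 0;
}
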
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index f75fe72b4160..ca5a266a3140 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -133,13 +133,15 @@ xdr_decode_fhstatus3(struct rpc_rqst *req, __be32 *p, struct mnt_fhstatus *res)
133 133
134#define MNT_dirpath_sz (1 + 256) 134#define MNT_dirpath_sz (1 + 256)
135#define MNT_fhstatus_sz (1 + 8) 135#define MNT_fhstatus_sz (1 + 8)
136#define MNT_fhstatus3_sz (1 + 16)
136 137
137static struct rpc_procinfo mnt_procedures[] = { 138static struct rpc_procinfo mnt_procedures[] = {
138[MNTPROC_MNT] = { 139[MNTPROC_MNT] = {
139 .p_proc = MNTPROC_MNT, 140 .p_proc = MNTPROC_MNT,
140 .p_encode = (kxdrproc_t) xdr_encode_dirpath, 141 .p_encode = (kxdrproc_t) xdr_encode_dirpath,
141 .p_decode = (kxdrproc_t) xdr_decode_fhstatus, 142 .p_decode = (kxdrproc_t) xdr_decode_fhstatus,
142 .p_bufsiz = MNT_dirpath_sz << 2, 143 .p_arglen = MNT_dirpath_sz,
144 .p_replen = MNT_fhstatus_sz,
143 .p_statidx = MNTPROC_MNT, 145 .p_statidx = MNTPROC_MNT,
144 .p_name = "MOUNT", 146 .p_name = "MOUNT",
145 }, 147 },
@@ -150,7 +152,8 @@ static struct rpc_procinfo mnt3_procedures[] = {
150 .p_proc = MOUNTPROC3_MNT, 152 .p_proc = MOUNTPROC3_MNT,
151 .p_encode = (kxdrproc_t) xdr_encode_dirpath, 153 .p_encode = (kxdrproc_t) xdr_encode_dirpath,
152 .p_decode = (kxdrproc_t) xdr_decode_fhstatus3, 154 .p_decode = (kxdrproc_t) xdr_decode_fhstatus3,
153 .p_bufsiz = MNT_dirpath_sz << 2, 155 .p_arglen = MNT_dirpath_sz,
156 .p_replen = MNT_fhstatus3_sz,
154 .p_statidx = MOUNTPROC3_MNT, 157 .p_statidx = MOUNTPROC3_MNT,
155 .p_name = "MOUNT", 158 .p_name = "MOUNT",
156 }, 159 },
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index 3be4e72a0227..abd9f8b48943 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -687,16 +687,13 @@ nfs_stat_to_errno(int stat)
687 return nfs_errtbl[i].errno; 687 return nfs_errtbl[i].errno;
688} 688}
689 689
690#ifndef MAX
691# define MAX(a, b) (((a) > (b))? (a) : (b))
692#endif
693
694#define PROC(proc, argtype, restype, timer) \ 690#define PROC(proc, argtype, restype, timer) \
695[NFSPROC_##proc] = { \ 691[NFSPROC_##proc] = { \
696 .p_proc = NFSPROC_##proc, \ 692 .p_proc = NFSPROC_##proc, \
697 .p_encode = (kxdrproc_t) nfs_xdr_##argtype, \ 693 .p_encode = (kxdrproc_t) nfs_xdr_##argtype, \
698 .p_decode = (kxdrproc_t) nfs_xdr_##restype, \ 694 .p_decode = (kxdrproc_t) nfs_xdr_##restype, \
699 .p_bufsiz = MAX(NFS_##argtype##_sz,NFS_##restype##_sz) << 2, \ 695 .p_arglen = NFS_##argtype##_sz, \
696 .p_replen = NFS_##restype##_sz, \
700 .p_timer = timer, \ 697 .p_timer = timer, \
701 .p_statidx = NFSPROC_##proc, \ 698 .p_statidx = NFSPROC_##proc, \
702 .p_name = #proc, \ 699 .p_name = #proc, \
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 0ace092d126f..b51df8eb9f01 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -1102,16 +1102,13 @@ nfs3_xdr_setaclres(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr)
1102} 1102}
1103#endif /* CONFIG_NFS_V3_ACL */ 1103#endif /* CONFIG_NFS_V3_ACL */
1104 1104
1105#ifndef MAX
1106# define MAX(a, b) (((a) > (b))? (a) : (b))
1107#endif
1108
1109#define PROC(proc, argtype, restype, timer) \ 1105#define PROC(proc, argtype, restype, timer) \
1110[NFS3PROC_##proc] = { \ 1106[NFS3PROC_##proc] = { \
1111 .p_proc = NFS3PROC_##proc, \ 1107 .p_proc = NFS3PROC_##proc, \
1112 .p_encode = (kxdrproc_t) nfs3_xdr_##argtype, \ 1108 .p_encode = (kxdrproc_t) nfs3_xdr_##argtype, \
1113 .p_decode = (kxdrproc_t) nfs3_xdr_##restype, \ 1109 .p_decode = (kxdrproc_t) nfs3_xdr_##restype, \
1114 .p_bufsiz = MAX(NFS3_##argtype##_sz,NFS3_##restype##_sz) << 2, \ 1110 .p_arglen = NFS3_##argtype##_sz, \
1111 .p_replen = NFS3_##restype##_sz, \
1115 .p_timer = timer, \ 1112 .p_timer = timer, \
1116 .p_statidx = NFS3PROC_##proc, \ 1113 .p_statidx = NFS3PROC_##proc, \
1117 .p_name = #proc, \ 1114 .p_name = #proc, \
@@ -1153,7 +1150,8 @@ static struct rpc_procinfo nfs3_acl_procedures[] = {
1153 .p_proc = ACLPROC3_GETACL, 1150 .p_proc = ACLPROC3_GETACL,
1154 .p_encode = (kxdrproc_t) nfs3_xdr_getaclargs, 1151 .p_encode = (kxdrproc_t) nfs3_xdr_getaclargs,
1155 .p_decode = (kxdrproc_t) nfs3_xdr_getaclres, 1152 .p_decode = (kxdrproc_t) nfs3_xdr_getaclres,
1156 .p_bufsiz = MAX(ACL3_getaclargs_sz, ACL3_getaclres_sz) << 2, 1153 .p_arglen = ACL3_getaclargs_sz,
1154 .p_replen = ACL3_getaclres_sz,
1157 .p_timer = 1, 1155 .p_timer = 1,
1158 .p_name = "GETACL", 1156 .p_name = "GETACL",
1159 }, 1157 },
@@ -1161,7 +1159,8 @@ static struct rpc_procinfo nfs3_acl_procedures[] = {
1161 .p_proc = ACLPROC3_SETACL, 1159 .p_proc = ACLPROC3_SETACL,
1162 .p_encode = (kxdrproc_t) nfs3_xdr_setaclargs, 1160 .p_encode = (kxdrproc_t) nfs3_xdr_setaclargs,
1163 .p_decode = (kxdrproc_t) nfs3_xdr_setaclres, 1161 .p_decode = (kxdrproc_t) nfs3_xdr_setaclres,
1164 .p_bufsiz = MAX(ACL3_setaclargs_sz, ACL3_setaclres_sz) << 2, 1162 .p_arglen = ACL3_setaclargs_sz,
1163 .p_replen = ACL3_setaclres_sz,
1165 .p_timer = 0, 1164 .p_timer = 0,
1166 .p_name = "SETACL", 1165 .p_name = "SETACL",
1167 }, 1166 },
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index d9000ec52f72..d6a30e965787 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2647,8 +2647,7 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl
2647 nfs_inode_return_delegation(inode); 2647 nfs_inode_return_delegation(inode);
2648 buf_to_pages(buf, buflen, arg.acl_pages, &arg.acl_pgbase); 2648 buf_to_pages(buf, buflen, arg.acl_pages, &arg.acl_pgbase);
2649 ret = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); 2649 ret = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
2650 if (ret == 0) 2650 nfs_zap_caches(inode);
2651 nfs4_write_cached_acl(inode, buf, buflen);
2652 return ret; 2651 return ret;
2653} 2652}
2654 2653
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index f02d522fd788..b8c28f2380a5 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -4546,16 +4546,13 @@ nfs4_stat_to_errno(int stat)
4546 return stat; 4546 return stat;
4547} 4547}
4548 4548
4549#ifndef MAX
4550# define MAX(a, b) (((a) > (b))? (a) : (b))
4551#endif
4552
4553#define PROC(proc, argtype, restype) \ 4549#define PROC(proc, argtype, restype) \
4554[NFSPROC4_CLNT_##proc] = { \ 4550[NFSPROC4_CLNT_##proc] = { \
4555 .p_proc = NFSPROC4_COMPOUND, \ 4551 .p_proc = NFSPROC4_COMPOUND, \
4556 .p_encode = (kxdrproc_t) nfs4_xdr_##argtype, \ 4552 .p_encode = (kxdrproc_t) nfs4_xdr_##argtype, \
4557 .p_decode = (kxdrproc_t) nfs4_xdr_##restype, \ 4553 .p_decode = (kxdrproc_t) nfs4_xdr_##restype, \
4558 .p_bufsiz = MAX(NFS4_##argtype##_sz,NFS4_##restype##_sz) << 2, \ 4554 .p_arglen = NFS4_##argtype##_sz, \
4555 .p_replen = NFS4_##restype##_sz, \
4559 .p_statidx = NFSPROC4_CLNT_##proc, \ 4556 .p_statidx = NFSPROC4_CLNT_##proc, \
4560 .p_name = #proc, \ 4557 .p_name = #proc, \
4561 } 4558 }
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index 75f819dc0255..49d1008ce1d7 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -428,7 +428,7 @@ static int __init root_nfs_getport(int program, int version, int proto)
428 printk(KERN_NOTICE "Looking up port of RPC %d/%d on %u.%u.%u.%u\n", 428 printk(KERN_NOTICE "Looking up port of RPC %d/%d on %u.%u.%u.%u\n",
429 program, version, NIPQUAD(servaddr)); 429 program, version, NIPQUAD(servaddr));
430 set_sockaddr(&sin, servaddr, 0); 430 set_sockaddr(&sin, servaddr, 0);
431 return rpc_getport_external(&sin, program, version, proto); 431 return rpcb_getport_external(&sin, program, version, proto);
432} 432}
433 433
434 434
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index ca4b1d4ff42b..388950118f59 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -17,7 +17,8 @@
17#include <linux/nfs_page.h> 17#include <linux/nfs_page.h>
18#include <linux/nfs_fs.h> 18#include <linux/nfs_fs.h>
19#include <linux/nfs_mount.h> 19#include <linux/nfs_mount.h>
20#include <linux/writeback.h> 20
21#include "internal.h"
21 22
22#define NFS_PARANOIA 1 23#define NFS_PARANOIA 1
23 24
@@ -50,9 +51,7 @@ nfs_page_free(struct nfs_page *p)
50 * @count: number of bytes to read/write 51 * @count: number of bytes to read/write
51 * 52 *
52 * The page must be locked by the caller. This makes sure we never 53 * The page must be locked by the caller. This makes sure we never
53 * create two different requests for the same page, and avoids 54 * create two different requests for the same page.
54 * a possible deadlock when we reach the hard limit on the number
55 * of dirty pages.
56 * User should ensure it is safe to sleep in this function. 55 * User should ensure it is safe to sleep in this function.
57 */ 56 */
58struct nfs_page * 57struct nfs_page *
@@ -63,16 +62,12 @@ nfs_create_request(struct nfs_open_context *ctx, struct inode *inode,
63 struct nfs_server *server = NFS_SERVER(inode); 62 struct nfs_server *server = NFS_SERVER(inode);
64 struct nfs_page *req; 63 struct nfs_page *req;
65 64
66 /* Deal with hard limits. */
67 for (;;) { 65 for (;;) {
68 /* try to allocate the request struct */ 66 /* try to allocate the request struct */
69 req = nfs_page_alloc(); 67 req = nfs_page_alloc();
70 if (req != NULL) 68 if (req != NULL)
71 break; 69 break;
72 70
73 /* Try to free up at least one request in order to stay
74 * below the hard limit
75 */
76 if (signalled() && (server->flags & NFS_MOUNT_INTR)) 71 if (signalled() && (server->flags & NFS_MOUNT_INTR))
77 return ERR_PTR(-ERESTARTSYS); 72 return ERR_PTR(-ERESTARTSYS);
78 yield(); 73 yield();
@@ -223,124 +218,151 @@ out:
223} 218}
224 219
225/** 220/**
226 * nfs_coalesce_requests - Split coalesced requests out from a list. 221 * nfs_pageio_init - initialise a page io descriptor
227 * @head: source list 222 * @desc: pointer to descriptor
228 * @dst: destination list 223 * @inode: pointer to inode
229 * @nmax: maximum number of requests to coalesce 224 * @doio: pointer to io function
230 * 225 * @bsize: io block size
231 * Moves a maximum of 'nmax' elements from one list to another. 226 * @io_flags: extra parameters for the io function
232 * The elements are checked to ensure that they form a contiguous set
233 * of pages, and that the RPC credentials are the same.
234 */ 227 */
235int 228void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
236nfs_coalesce_requests(struct list_head *head, struct list_head *dst, 229 struct inode *inode,
237 unsigned int nmax) 230 int (*doio)(struct inode *, struct list_head *, unsigned int, size_t, int),
231 size_t bsize,
232 int io_flags)
238{ 233{
239 struct nfs_page *req = NULL; 234 INIT_LIST_HEAD(&desc->pg_list);
240 unsigned int npages = 0; 235 desc->pg_bytes_written = 0;
241 236 desc->pg_count = 0;
242 while (!list_empty(head)) { 237 desc->pg_bsize = bsize;
243 struct nfs_page *prev = req; 238 desc->pg_base = 0;
244 239 desc->pg_inode = inode;
245 req = nfs_list_entry(head->next); 240 desc->pg_doio = doio;
246 if (prev) { 241 desc->pg_ioflags = io_flags;
247 if (req->wb_context->cred != prev->wb_context->cred) 242 desc->pg_error = 0;
248 break;
249 if (req->wb_context->lockowner != prev->wb_context->lockowner)
250 break;
251 if (req->wb_context->state != prev->wb_context->state)
252 break;
253 if (req->wb_index != (prev->wb_index + 1))
254 break;
255
256 if (req->wb_pgbase != 0)
257 break;
258 }
259 nfs_list_remove_request(req);
260 nfs_list_add_request(req, dst);
261 npages++;
262 if (req->wb_pgbase + req->wb_bytes != PAGE_CACHE_SIZE)
263 break;
264 if (npages >= nmax)
265 break;
266 }
267 return npages;
268} 243}
269 244
270#define NFS_SCAN_MAXENTRIES 16
271/** 245/**
272 * nfs_scan_dirty - Scan the radix tree for dirty requests 246 * nfs_can_coalesce_requests - test two requests for compatibility
273 * @mapping: pointer to address space 247 * @prev: pointer to nfs_page
274 * @wbc: writeback_control structure 248 * @req: pointer to nfs_page
275 * @dst: Destination list
276 * 249 *
277 * Moves elements from one of the inode request lists. 250 * The nfs_page structures 'prev' and 'req' are compared to ensure that the
278 * If the number of requests is set to 0, the entire address_space 251 * page data area they describe is contiguous, and that their RPC
279 * starting at index idx_start, is scanned. 252 * credentials, NFSv4 open state, and lockowners are the same.
280 * The requests are *not* checked to ensure that they form a contiguous set. 253 *
281 * You must be holding the inode's req_lock when calling this function 254 * Return 'true' if this is the case, else return 'false'.
282 */ 255 */
283long nfs_scan_dirty(struct address_space *mapping, 256static int nfs_can_coalesce_requests(struct nfs_page *prev,
284 struct writeback_control *wbc, 257 struct nfs_page *req)
285 struct list_head *dst)
286{ 258{
287 struct nfs_inode *nfsi = NFS_I(mapping->host); 259 if (req->wb_context->cred != prev->wb_context->cred)
288 struct nfs_page *pgvec[NFS_SCAN_MAXENTRIES];
289 struct nfs_page *req;
290 pgoff_t idx_start, idx_end;
291 long res = 0;
292 int found, i;
293
294 if (nfsi->ndirty == 0)
295 return 0; 260 return 0;
296 if (wbc->range_cyclic) { 261 if (req->wb_context->lockowner != prev->wb_context->lockowner)
297 idx_start = 0; 262 return 0;
298 idx_end = ULONG_MAX; 263 if (req->wb_context->state != prev->wb_context->state)
299 } else if (wbc->range_end == 0) { 264 return 0;
300 idx_start = wbc->range_start >> PAGE_CACHE_SHIFT; 265 if (req->wb_index != (prev->wb_index + 1))
301 idx_end = ULONG_MAX; 266 return 0;
302 } else { 267 if (req->wb_pgbase != 0)
303 idx_start = wbc->range_start >> PAGE_CACHE_SHIFT; 268 return 0;
304 idx_end = wbc->range_end >> PAGE_CACHE_SHIFT; 269 if (prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE)
305 } 270 return 0;
271 return 1;
272}
306 273
307 for (;;) { 274/**
308 unsigned int toscan = NFS_SCAN_MAXENTRIES; 275 * nfs_pageio_do_add_request - Attempt to coalesce a request into a page list.
276 * @desc: destination io descriptor
277 * @req: request
278 *
279 * Returns true if the request 'req' was successfully coalesced into the
280 * existing list of pages 'desc'.
281 */
282static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc,
283 struct nfs_page *req)
284{
285 size_t newlen = req->wb_bytes;
309 286
310 found = radix_tree_gang_lookup_tag(&nfsi->nfs_page_tree, 287 if (desc->pg_count != 0) {
311 (void **)&pgvec[0], idx_start, toscan, 288 struct nfs_page *prev;
312 NFS_PAGE_TAG_DIRTY);
313 289
314 /* Did we make progress? */ 290 /*
315 if (found <= 0) 291 * FIXME: ideally we should be able to coalesce all requests
316 break; 292 * that are not block boundary aligned, but currently this
293 * is problematic for the case of bsize < PAGE_CACHE_SIZE,
294 * since nfs_flush_multi and nfs_pagein_multi assume you
295 * can have only one struct nfs_page.
296 */
297 if (desc->pg_bsize < PAGE_SIZE)
298 return 0;
299 newlen += desc->pg_count;
300 if (newlen > desc->pg_bsize)
301 return 0;
302 prev = nfs_list_entry(desc->pg_list.prev);
303 if (!nfs_can_coalesce_requests(prev, req))
304 return 0;
305 } else
306 desc->pg_base = req->wb_pgbase;
307 nfs_list_remove_request(req);
308 nfs_list_add_request(req, &desc->pg_list);
309 desc->pg_count = newlen;
310 return 1;
311}
317 312
318 for (i = 0; i < found; i++) { 313/*
319 req = pgvec[i]; 314 * Helper for nfs_pageio_add_request and nfs_pageio_complete
320 if (!wbc->range_cyclic && req->wb_index > idx_end) 315 */
321 goto out; 316static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc)
317{
318 if (!list_empty(&desc->pg_list)) {
319 int error = desc->pg_doio(desc->pg_inode,
320 &desc->pg_list,
321 nfs_page_array_len(desc->pg_base,
322 desc->pg_count),
323 desc->pg_count,
324 desc->pg_ioflags);
325 if (error < 0)
326 desc->pg_error = error;
327 else
328 desc->pg_bytes_written += desc->pg_count;
329 }
330 if (list_empty(&desc->pg_list)) {
331 desc->pg_count = 0;
332 desc->pg_base = 0;
333 }
334}
322 335
323 /* Try to lock request and mark it for writeback */ 336/**
324 if (!nfs_set_page_writeback_locked(req)) 337 * nfs_pageio_add_request - Attempt to coalesce a request into a page list.
325 goto next; 338 * @desc: destination io descriptor
326 radix_tree_tag_clear(&nfsi->nfs_page_tree, 339 * @req: request
327 req->wb_index, NFS_PAGE_TAG_DIRTY); 340 *
328 nfsi->ndirty--; 341 * Returns true if the request 'req' was successfully coalesced into the
329 nfs_list_remove_request(req); 342 * existing list of pages 'desc'.
330 nfs_list_add_request(req, dst); 343 */
331 res++; 344int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
332 if (res == LONG_MAX) 345 struct nfs_page *req)
333 goto out; 346{
334next: 347 while (!nfs_pageio_do_add_request(desc, req)) {
335 idx_start = req->wb_index + 1; 348 nfs_pageio_doio(desc);
336 } 349 if (desc->pg_error < 0)
350 return 0;
337 } 351 }
338out: 352 return 1;
339 WARN_ON ((nfsi->ndirty == 0) != list_empty(&nfsi->dirty));
340 return res;
341} 353}
342 354
343/** 355/**
356 * nfs_pageio_complete - Complete I/O on an nfs_pageio_descriptor
357 * @desc: pointer to io descriptor
358 */
359void nfs_pageio_complete(struct nfs_pageio_descriptor *desc)
360{
361 nfs_pageio_doio(desc);
362}
363
364#define NFS_SCAN_MAXENTRIES 16
365/**
344 * nfs_scan_list - Scan a list for matching requests 366 * nfs_scan_list - Scan a list for matching requests
345 * @nfsi: NFS inode 367 * @nfsi: NFS inode
346 * @head: One of the NFS inode request lists 368 * @head: One of the NFS inode request lists
@@ -355,12 +377,12 @@ out:
355 * You must be holding the inode's req_lock when calling this function 377 * You must be holding the inode's req_lock when calling this function
356 */ 378 */
357int nfs_scan_list(struct nfs_inode *nfsi, struct list_head *head, 379int nfs_scan_list(struct nfs_inode *nfsi, struct list_head *head,
358 struct list_head *dst, unsigned long idx_start, 380 struct list_head *dst, pgoff_t idx_start,
359 unsigned int npages) 381 unsigned int npages)
360{ 382{
361 struct nfs_page *pgvec[NFS_SCAN_MAXENTRIES]; 383 struct nfs_page *pgvec[NFS_SCAN_MAXENTRIES];
362 struct nfs_page *req; 384 struct nfs_page *req;
363 unsigned long idx_end; 385 pgoff_t idx_end;
364 int found, i; 386 int found, i;
365 int res; 387 int res;
366 388
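
The pagelist.c rewrite retires list-at-a-time coalescing (nfs_coalesce_requests) in favour of an nfs_pageio_descriptor: nfs_pageio_add_request() merges requests into the current batch until one no longer fits, flushes the batch through pg_doio, and retries; nfs_pageio_complete() flushes the tail. A userspace model of that accumulate-and-flush flow (sizes illustrative; the compatibility checks are reduced to a byte limit):

#include <stdio.h>
#include <stddef.h>

struct pgio_model {
    size_t count;           /* bytes batched so far */
    size_t bsize;           /* rsize/wsize limit */
    unsigned int flushes;
};

static void pgio_doio(struct pgio_model *d)
{
    if (d->count) {
        printf("flush %zu bytes\n", d->count);
        d->flushes++;
        d->count = 0;
    }
}

static void pgio_add(struct pgio_model *d, size_t bytes)
{
    if (d->count + bytes > d->bsize)    /* cannot coalesce: flush first */
        pgio_doio(d);
    d->count += bytes;
}

int main(void)
{
    struct pgio_model d = { .bsize = 8192 };
    size_t reqs[3] = { 4096, 4096, 4096 };  /* three page-sized requests */
    int i;

    for (i = 0; i < 3; i++)
        pgio_add(&d, reqs[i]);
    pgio_doio(&d);                          /* nfs_pageio_complete() */
    printf("%u I/O call(s)\n", d.flushes);
    return 0;
}
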
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 6ab4d5a9edf2..9a55807b2a70 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -27,7 +27,8 @@
27 27
28#define NFSDBG_FACILITY NFSDBG_PAGECACHE 28#define NFSDBG_FACILITY NFSDBG_PAGECACHE
29 29
30static int nfs_pagein_one(struct list_head *, struct inode *); 30static int nfs_pagein_multi(struct inode *, struct list_head *, unsigned int, size_t, int);
31static int nfs_pagein_one(struct inode *, struct list_head *, unsigned int, size_t, int);
31static const struct rpc_call_ops nfs_read_partial_ops; 32static const struct rpc_call_ops nfs_read_partial_ops;
32static const struct rpc_call_ops nfs_read_full_ops; 33static const struct rpc_call_ops nfs_read_full_ops;
33 34
@@ -36,9 +37,8 @@ static mempool_t *nfs_rdata_mempool;
36 37
37#define MIN_POOL_READ (32) 38#define MIN_POOL_READ (32)
38 39
39struct nfs_read_data *nfs_readdata_alloc(size_t len) 40struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount)
40{ 41{
41 unsigned int pagecount = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
42 struct nfs_read_data *p = mempool_alloc(nfs_rdata_mempool, GFP_NOFS); 42 struct nfs_read_data *p = mempool_alloc(nfs_rdata_mempool, GFP_NOFS);
43 43
44 if (p) { 44 if (p) {
@@ -133,7 +133,10 @@ static int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
133 memclear_highpage_flush(page, len, PAGE_CACHE_SIZE - len); 133 memclear_highpage_flush(page, len, PAGE_CACHE_SIZE - len);
134 134
135 nfs_list_add_request(new, &one_request); 135 nfs_list_add_request(new, &one_request);
136 nfs_pagein_one(&one_request, inode); 136 if (NFS_SERVER(inode)->rsize < PAGE_CACHE_SIZE)
137 nfs_pagein_multi(inode, &one_request, 1, len, 0);
138 else
139 nfs_pagein_one(inode, &one_request, 1, len, 0);
137 return 0; 140 return 0;
138} 141}
139 142
@@ -230,7 +233,7 @@ static void nfs_execute_read(struct nfs_read_data *data)
230 * won't see the new data until our attribute cache is updated. This is more 233 * won't see the new data until our attribute cache is updated. This is more
231 * or less conventional NFS client behavior. 234 * or less conventional NFS client behavior.
232 */ 235 */
233static int nfs_pagein_multi(struct list_head *head, struct inode *inode) 236static int nfs_pagein_multi(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int flags)
234{ 237{
235 struct nfs_page *req = nfs_list_entry(head->next); 238 struct nfs_page *req = nfs_list_entry(head->next);
236 struct page *page = req->wb_page; 239 struct page *page = req->wb_page;
@@ -242,11 +245,11 @@ static int nfs_pagein_multi(struct list_head *head, struct inode *inode)
242 245
243 nfs_list_remove_request(req); 246 nfs_list_remove_request(req);
244 247
245 nbytes = req->wb_bytes; 248 nbytes = count;
246 do { 249 do {
247 size_t len = min(nbytes,rsize); 250 size_t len = min(nbytes,rsize);
248 251
249 data = nfs_readdata_alloc(len); 252 data = nfs_readdata_alloc(1);
250 if (!data) 253 if (!data)
251 goto out_bad; 254 goto out_bad;
252 INIT_LIST_HEAD(&data->pages); 255 INIT_LIST_HEAD(&data->pages);
@@ -258,23 +261,19 @@ static int nfs_pagein_multi(struct list_head *head, struct inode *inode)
258 261
259 ClearPageError(page); 262 ClearPageError(page);
260 offset = 0; 263 offset = 0;
261 nbytes = req->wb_bytes; 264 nbytes = count;
262 do { 265 do {
263 data = list_entry(list.next, struct nfs_read_data, pages); 266 data = list_entry(list.next, struct nfs_read_data, pages);
264 list_del_init(&data->pages); 267 list_del_init(&data->pages);
265 268
266 data->pagevec[0] = page; 269 data->pagevec[0] = page;
267 270
268 if (nbytes > rsize) { 271 if (nbytes < rsize)
269 nfs_read_rpcsetup(req, data, &nfs_read_partial_ops, 272 rsize = nbytes;
270 rsize, offset); 273 nfs_read_rpcsetup(req, data, &nfs_read_partial_ops,
271 offset += rsize; 274 rsize, offset);
272 nbytes -= rsize; 275 offset += rsize;
273 } else { 276 nbytes -= rsize;
274 nfs_read_rpcsetup(req, data, &nfs_read_partial_ops,
275 nbytes, offset);
276 nbytes = 0;
277 }
278 nfs_execute_read(data); 277 nfs_execute_read(data);
279 } while (nbytes != 0); 278 } while (nbytes != 0);
280 279
@@ -291,30 +290,24 @@ out_bad:
291 return -ENOMEM; 290 return -ENOMEM;
292} 291}
293 292
294static int nfs_pagein_one(struct list_head *head, struct inode *inode) 293static int nfs_pagein_one(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int flags)
295{ 294{
296 struct nfs_page *req; 295 struct nfs_page *req;
297 struct page **pages; 296 struct page **pages;
298 struct nfs_read_data *data; 297 struct nfs_read_data *data;
299 unsigned int count;
300 298
301 if (NFS_SERVER(inode)->rsize < PAGE_CACHE_SIZE) 299 data = nfs_readdata_alloc(npages);
302 return nfs_pagein_multi(head, inode);
303
304 data = nfs_readdata_alloc(NFS_SERVER(inode)->rsize);
305 if (!data) 300 if (!data)
306 goto out_bad; 301 goto out_bad;
307 302
308 INIT_LIST_HEAD(&data->pages); 303 INIT_LIST_HEAD(&data->pages);
309 pages = data->pagevec; 304 pages = data->pagevec;
310 count = 0;
311 while (!list_empty(head)) { 305 while (!list_empty(head)) {
312 req = nfs_list_entry(head->next); 306 req = nfs_list_entry(head->next);
313 nfs_list_remove_request(req); 307 nfs_list_remove_request(req);
314 nfs_list_add_request(req, &data->pages); 308 nfs_list_add_request(req, &data->pages);
315 ClearPageError(req->wb_page); 309 ClearPageError(req->wb_page);
316 *pages++ = req->wb_page; 310 *pages++ = req->wb_page;
317 count += req->wb_bytes;
318 } 311 }
319 req = nfs_list_entry(data->pages.next); 312 req = nfs_list_entry(data->pages.next);
320 313
@@ -327,28 +320,6 @@ out_bad:
327 return -ENOMEM; 320 return -ENOMEM;
328} 321}
329 322
330static int
331nfs_pagein_list(struct list_head *head, int rpages)
332{
333 LIST_HEAD(one_request);
334 struct nfs_page *req;
335 int error = 0;
336 unsigned int pages = 0;
337
338 while (!list_empty(head)) {
339 pages += nfs_coalesce_requests(head, &one_request, rpages);
340 req = nfs_list_entry(one_request.next);
341 error = nfs_pagein_one(&one_request, req->wb_context->dentry->d_inode);
342 if (error < 0)
343 break;
344 }
345 if (error >= 0)
346 return pages;
347
348 nfs_async_read_error(head);
349 return error;
350}
351
352/* 323/*
353 * This is the callback from RPC telling us whether a reply was 324 * This is the callback from RPC telling us whether a reply was
354 * received or some error occurred (timeout or socket shutdown). 325 * received or some error occurred (timeout or socket shutdown).
@@ -538,7 +509,7 @@ out_error:
538} 509}
539 510
540struct nfs_readdesc { 511struct nfs_readdesc {
541 struct list_head *head; 512 struct nfs_pageio_descriptor *pgio;
542 struct nfs_open_context *ctx; 513 struct nfs_open_context *ctx;
543}; 514};
544 515
@@ -562,19 +533,21 @@ readpage_async_filler(void *data, struct page *page)
562 } 533 }
563 if (len < PAGE_CACHE_SIZE) 534 if (len < PAGE_CACHE_SIZE)
564 memclear_highpage_flush(page, len, PAGE_CACHE_SIZE - len); 535 memclear_highpage_flush(page, len, PAGE_CACHE_SIZE - len);
565 nfs_list_add_request(new, desc->head); 536 nfs_pageio_add_request(desc->pgio, new);
566 return 0; 537 return 0;
567} 538}
568 539
569int nfs_readpages(struct file *filp, struct address_space *mapping, 540int nfs_readpages(struct file *filp, struct address_space *mapping,
570 struct list_head *pages, unsigned nr_pages) 541 struct list_head *pages, unsigned nr_pages)
571{ 542{
572 LIST_HEAD(head); 543 struct nfs_pageio_descriptor pgio;
573 struct nfs_readdesc desc = { 544 struct nfs_readdesc desc = {
574 .head = &head, 545 .pgio = &pgio,
575 }; 546 };
576 struct inode *inode = mapping->host; 547 struct inode *inode = mapping->host;
577 struct nfs_server *server = NFS_SERVER(inode); 548 struct nfs_server *server = NFS_SERVER(inode);
549 size_t rsize = server->rsize;
550 unsigned long npages;
578 int ret = -ESTALE; 551 int ret = -ESTALE;
579 552
580 dprintk("NFS: nfs_readpages (%s/%Ld %d)\n", 553 dprintk("NFS: nfs_readpages (%s/%Ld %d)\n",
@@ -593,13 +566,16 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
593 } else 566 } else
594 desc.ctx = get_nfs_open_context((struct nfs_open_context *) 567 desc.ctx = get_nfs_open_context((struct nfs_open_context *)
595 filp->private_data); 568 filp->private_data);
569 if (rsize < PAGE_CACHE_SIZE)
570 nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0);
571 else
572 nfs_pageio_init(&pgio, inode, nfs_pagein_one, rsize, 0);
573
596 ret = read_cache_pages(mapping, pages, readpage_async_filler, &desc); 574 ret = read_cache_pages(mapping, pages, readpage_async_filler, &desc);
597 if (!list_empty(&head)) { 575
598 int err = nfs_pagein_list(&head, server->rpages); 576 nfs_pageio_complete(&pgio);
599 if (!ret) 577 npages = (pgio.pg_bytes_written + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
600 nfs_add_stats(inode, NFSIOS_READPAGES, err); 578 nfs_add_stats(inode, NFSIOS_READPAGES, npages);
601 ret = err;
602 }
603 put_nfs_open_context(desc.ctx); 579 put_nfs_open_context(desc.ctx);
604out: 580out:
605 return ret; 581 return ret;
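
Taken together, the read-side hunks move page coalescing out of nfs_pagein_list() and into the generic nfs_pageio_descriptor. A hedged sketch of the resulting flow, built only from calls visible in this patch (descriptor internals are assumed, not shown):

/* Sketch, not the literal function: nfs_pageio_init() picks the
 * engine -- one RPC per rsize-sized slice of a page, or one RPC per
 * run of whole pages. */
struct nfs_pageio_descriptor pgio;
size_t rsize = NFS_SERVER(inode)->rsize;

if (rsize < PAGE_CACHE_SIZE)
	nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0);
else
	nfs_pageio_init(&pgio, inode, nfs_pagein_one, rsize, 0);

/* For each page, the filler queues a request; the descriptor invokes
 * the chosen nfs_pagein_* callback whenever a coalesced run fills. */
nfs_pageio_add_request(&pgio, req);

/* Flush the final, partially filled run and account the pages read. */
nfs_pageio_complete(&pgio);
npages = (pgio.pg_bytes_written + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
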
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index f1eae44b9a1a..ca20d3cc2609 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -204,9 +204,9 @@ static int nfs_statfs(struct dentry *dentry, struct kstatfs *buf)
204 lock_kernel(); 204 lock_kernel();
205 205
206 error = server->nfs_client->rpc_ops->statfs(server, fh, &res); 206 error = server->nfs_client->rpc_ops->statfs(server, fh, &res);
207 buf->f_type = NFS_SUPER_MAGIC;
208 if (error < 0) 207 if (error < 0)
209 goto out_err; 208 goto out_err;
209 buf->f_type = NFS_SUPER_MAGIC;
210 210
211 /* 211 /*
212 * Current versions of glibc do not correctly handle the 212 * Current versions of glibc do not correctly handle the
@@ -233,15 +233,14 @@ static int nfs_statfs(struct dentry *dentry, struct kstatfs *buf)
233 buf->f_ffree = res.afiles; 233 buf->f_ffree = res.afiles;
234 234
235 buf->f_namelen = server->namelen; 235 buf->f_namelen = server->namelen;
236 out: 236
237 unlock_kernel(); 237 unlock_kernel();
238 return 0; 238 return 0;
239 239
240 out_err: 240 out_err:
241 dprintk("%s: statfs error = %d\n", __FUNCTION__, -error); 241 dprintk("%s: statfs error = %d\n", __FUNCTION__, -error);
242 buf->f_bsize = buf->f_blocks = buf->f_bfree = buf->f_bavail = -1; 242 unlock_kernel();
243 goto out; 243 return error;
244
245} 244}
246 245
247/* 246/*
@@ -291,6 +290,7 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
291 { NFS_MOUNT_NOAC, ",noac", "" }, 290 { NFS_MOUNT_NOAC, ",noac", "" },
292 { NFS_MOUNT_NONLM, ",nolock", "" }, 291 { NFS_MOUNT_NONLM, ",nolock", "" },
293 { NFS_MOUNT_NOACL, ",noacl", "" }, 292 { NFS_MOUNT_NOACL, ",noacl", "" },
293 { NFS_MOUNT_NORDIRPLUS, ",nordirplus", "" },
294 { 0, NULL, NULL } 294 { 0, NULL, NULL }
295 }; 295 };
296 const struct proc_nfs_info *nfs_infop; 296 const struct proc_nfs_info *nfs_infop;
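
The statfs hunks fix two problems at once: buf->f_type was populated before the RPC outcome was known, and the failure path returned 0 after stuffing -1 into the counters, hiding the error from callers. A minimal sketch of the corrected shape; nfs_statfs_shape() and query_server() are hypothetical stand-ins for the function and its ->statfs RPC:

static int nfs_statfs_shape(struct kstatfs *buf)
{
	int error;

	lock_kernel();
	error = query_server(buf);	/* hypothetical: the ->statfs RPC */
	if (error < 0)
		goto out_err;
	buf->f_type = NFS_SUPER_MAGIC;	/* only set on success now */
	/* ... f_bsize, f_blocks, f_bfree, f_namelen ... */
	unlock_kernel();
	return 0;

out_err:
	unlock_kernel();
	return error;	/* previously: returned 0 with -1 placeholders */
}
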
diff --git a/fs/nfs/symlink.c b/fs/nfs/symlink.c
index f4a0548b9ce8..bc2821331c29 100644
--- a/fs/nfs/symlink.c
+++ b/fs/nfs/symlink.c
@@ -61,15 +61,9 @@ static void *nfs_follow_link(struct dentry *dentry, struct nameidata *nd)
61 err = page; 61 err = page;
62 goto read_failed; 62 goto read_failed;
63 } 63 }
64 if (!PageUptodate(page)) {
65 err = ERR_PTR(-EIO);
66 goto getlink_read_error;
67 }
68 nd_set_link(nd, kmap(page)); 64 nd_set_link(nd, kmap(page));
69 return page; 65 return page;
70 66
71getlink_read_error:
72 page_cache_release(page);
73read_failed: 67read_failed:
74 nd_set_link(nd, err); 68 nd_set_link(nd, err);
75 return NULL; 69 return NULL;
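
The deleted PageUptodate check here (and the matching removals in the ntfs hunks below) relies on a VFS-level change: read_mapping_page() now waits for the read to finish and returns either an uptodate page or an ERR_PTR(), so the wait-and-recheck dance is dead code. A sketch of the caller pattern before and after, assuming that guarantee:

/* Before: the read might still be in flight when the helper returned,
 * so callers had to wait and then re-check the outcome. */
page = read_mapping_page(mapping, index, NULL);
if (IS_ERR(page))
	return PTR_ERR(page);
wait_on_page_locked(page);
if (!PageUptodate(page)) {	/* I/O completed with an error */
	page_cache_release(page);
	return -EIO;
}

/* After: the helper waits itself and never returns a page that is
 * not uptodate, so the two checks collapse into one. */
page = read_mapping_page(mapping, index, NULL);
if (IS_ERR(page))
	return PTR_ERR(page);
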
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index ad2e91b4904f..5d44b8bd1070 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -38,8 +38,8 @@
38static struct nfs_page * nfs_update_request(struct nfs_open_context*, 38static struct nfs_page * nfs_update_request(struct nfs_open_context*,
39 struct page *, 39 struct page *,
40 unsigned int, unsigned int); 40 unsigned int, unsigned int);
41static void nfs_mark_request_dirty(struct nfs_page *req); 41static void nfs_pageio_init_write(struct nfs_pageio_descriptor *desc,
42static long nfs_flush_mapping(struct address_space *mapping, struct writeback_control *wbc, int how); 42 struct inode *inode, int ioflags);
43static const struct rpc_call_ops nfs_write_partial_ops; 43static const struct rpc_call_ops nfs_write_partial_ops;
44static const struct rpc_call_ops nfs_write_full_ops; 44static const struct rpc_call_ops nfs_write_full_ops;
45static const struct rpc_call_ops nfs_commit_ops; 45static const struct rpc_call_ops nfs_commit_ops;
@@ -72,9 +72,8 @@ void nfs_commit_free(struct nfs_write_data *wdata)
72 call_rcu_bh(&wdata->task.u.tk_rcu, nfs_commit_rcu_free); 72 call_rcu_bh(&wdata->task.u.tk_rcu, nfs_commit_rcu_free);
73} 73}
74 74
75struct nfs_write_data *nfs_writedata_alloc(size_t len) 75struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount)
76{ 76{
77 unsigned int pagecount = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
78 struct nfs_write_data *p = mempool_alloc(nfs_wdata_mempool, GFP_NOFS); 77 struct nfs_write_data *p = mempool_alloc(nfs_wdata_mempool, GFP_NOFS);
79 78
80 if (p) { 79 if (p) {
@@ -140,7 +139,7 @@ static void nfs_grow_file(struct page *page, unsigned int offset, unsigned int c
140{ 139{
141 struct inode *inode = page->mapping->host; 140 struct inode *inode = page->mapping->host;
142 loff_t end, i_size = i_size_read(inode); 141 loff_t end, i_size = i_size_read(inode);
143 unsigned long end_index = (i_size - 1) >> PAGE_CACHE_SHIFT; 142 pgoff_t end_index = (i_size - 1) >> PAGE_CACHE_SHIFT;
144 143
145 if (i_size > 0 && page->index < end_index) 144 if (i_size > 0 && page->index < end_index)
146 return; 145 return;
@@ -202,7 +201,7 @@ static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page,
202static int wb_priority(struct writeback_control *wbc) 201static int wb_priority(struct writeback_control *wbc)
203{ 202{
204 if (wbc->for_reclaim) 203 if (wbc->for_reclaim)
205 return FLUSH_HIGHPRI; 204 return FLUSH_HIGHPRI | FLUSH_STABLE;
206 if (wbc->for_kupdate) 205 if (wbc->for_kupdate)
207 return FLUSH_LOWPRI; 206 return FLUSH_LOWPRI;
208 return 0; 207 return 0;
@@ -252,10 +251,12 @@ static void nfs_end_page_writeback(struct page *page)
252 * was not tagged. 251 * was not tagged.
253 * May also return an error if the user signalled nfs_wait_on_request(). 252 * May also return an error if the user signalled nfs_wait_on_request().
254 */ 253 */
255static int nfs_page_mark_flush(struct page *page) 254static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
255 struct page *page)
256{ 256{
257 struct nfs_page *req; 257 struct nfs_page *req;
258 spinlock_t *req_lock = &NFS_I(page->mapping->host)->req_lock; 258 struct nfs_inode *nfsi = NFS_I(page->mapping->host);
259 spinlock_t *req_lock = &nfsi->req_lock;
259 int ret; 260 int ret;
260 261
261 spin_lock(req_lock); 262 spin_lock(req_lock);
@@ -273,19 +274,30 @@ static int nfs_page_mark_flush(struct page *page)
273 * request as dirty (in which case we don't care). 274 * request as dirty (in which case we don't care).
274 */ 275 */
275 spin_unlock(req_lock); 276 spin_unlock(req_lock);
277 /* Prevent deadlock! */
278 nfs_pageio_complete(pgio);
276 ret = nfs_wait_on_request(req); 279 ret = nfs_wait_on_request(req);
277 nfs_release_request(req); 280 nfs_release_request(req);
278 if (ret != 0) 281 if (ret != 0)
279 return ret; 282 return ret;
280 spin_lock(req_lock); 283 spin_lock(req_lock);
281 } 284 }
282 spin_unlock(req_lock); 285 if (test_bit(PG_NEED_COMMIT, &req->wb_flags)) {
283 if (nfs_set_page_writeback(page) == 0) { 286 /* This request is marked for commit */
284 nfs_list_remove_request(req); 287 spin_unlock(req_lock);
285 nfs_mark_request_dirty(req); 288 nfs_unlock_request(req);
289 nfs_pageio_complete(pgio);
290 return 1;
291 }
292 if (nfs_set_page_writeback(page) != 0) {
293 spin_unlock(req_lock);
294 BUG();
286 } 295 }
296 radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index,
297 NFS_PAGE_TAG_WRITEBACK);
287 ret = test_bit(PG_NEED_FLUSH, &req->wb_flags); 298 ret = test_bit(PG_NEED_FLUSH, &req->wb_flags);
288 nfs_unlock_request(req); 299 spin_unlock(req_lock);
300 nfs_pageio_add_request(pgio, req);
289 return ret; 301 return ret;
290} 302}
291 303
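
The terse "Prevent deadlock!" comment deserves a gloss: the pageio descriptor may still hold a locked, queued-but-unsent request for the very page this code is about to sleep on, so the descriptor must be flushed before waiting. The ordering rule, sketched out of context:

/* pgio may hold queued-but-unsent requests, one of which can be the
 * locked request this thread is about to sleep on. Flushing first
 * guarantees that request's I/O is submitted and can complete. */
spin_unlock(req_lock);
nfs_pageio_complete(pgio);	/* submit everything queued so far */
ret = nfs_wait_on_request(req);	/* only now is sleeping safe */
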
@@ -294,6 +306,7 @@ static int nfs_page_mark_flush(struct page *page)
294 */ 306 */
295static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc) 307static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc)
296{ 308{
309 struct nfs_pageio_descriptor mypgio, *pgio;
297 struct nfs_open_context *ctx; 310 struct nfs_open_context *ctx;
298 struct inode *inode = page->mapping->host; 311 struct inode *inode = page->mapping->host;
299 unsigned offset; 312 unsigned offset;
@@ -302,7 +315,14 @@ static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc
302 nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE); 315 nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE);
303 nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1); 316 nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1);
304 317
305 err = nfs_page_mark_flush(page); 318 if (wbc->for_writepages)
319 pgio = wbc->fs_private;
320 else {
321 nfs_pageio_init_write(&mypgio, inode, wb_priority(wbc));
322 pgio = &mypgio;
323 }
324
325 err = nfs_page_async_flush(pgio, page);
306 if (err <= 0) 326 if (err <= 0)
307 goto out; 327 goto out;
308 err = 0; 328 err = 0;
@@ -319,12 +339,12 @@ static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc
319 put_nfs_open_context(ctx); 339 put_nfs_open_context(ctx);
320 if (err != 0) 340 if (err != 0)
321 goto out; 341 goto out;
322 err = nfs_page_mark_flush(page); 342 err = nfs_page_async_flush(pgio, page);
323 if (err > 0) 343 if (err > 0)
324 err = 0; 344 err = 0;
325out: 345out:
326 if (!wbc->for_writepages) 346 if (!wbc->for_writepages)
327 nfs_flush_mapping(page->mapping, wbc, FLUSH_STABLE|wb_priority(wbc)); 347 nfs_pageio_complete(pgio);
328 return err; 348 return err;
329} 349}
330 350
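
nfs_writepage_locked() is now reachable two ways, and the descriptor's lifetime differs in each. A hedged sketch of the ownership rule implied by the hunk above and the nfs_writepages() hunk that follows:

/* Batched path: nfs_writepages() owns the descriptor and completes
 * it once, after generic_writepages() has called ->writepage for
 * every dirty page, handing it down via wbc->fs_private. */
nfs_pageio_init_write(&pgio, inode, wb_priority(wbc));
wbc->fs_private = &pgio;
err = generic_writepages(mapping, wbc);
nfs_pageio_complete(&pgio);

/* Single-page path: nfs_writepage_locked() sees for_writepages == 0,
 * builds a descriptor on its own stack, and must complete it before
 * returning -- no request may outlive the frame the descriptor
 * lives on. */
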
@@ -340,20 +360,20 @@ int nfs_writepage(struct page *page, struct writeback_control *wbc)
340int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) 360int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
341{ 361{
342 struct inode *inode = mapping->host; 362 struct inode *inode = mapping->host;
363 struct nfs_pageio_descriptor pgio;
343 int err; 364 int err;
344 365
345 nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES); 366 nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES);
346 367
368 nfs_pageio_init_write(&pgio, inode, wb_priority(wbc));
369 wbc->fs_private = &pgio;
347 err = generic_writepages(mapping, wbc); 370 err = generic_writepages(mapping, wbc);
371 nfs_pageio_complete(&pgio);
348 if (err) 372 if (err)
349 return err; 373 return err;
350 err = nfs_flush_mapping(mapping, wbc, wb_priority(wbc)); 374 if (pgio.pg_error)
351 if (err < 0) 375 return pgio.pg_error;
352 goto out; 376 return 0;
353 nfs_add_stats(inode, NFSIOS_WRITEPAGES, err);
354 err = 0;
355out:
356 return err;
357} 377}
358 378
359/* 379/*
@@ -376,6 +396,8 @@ static int nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
376 } 396 }
377 SetPagePrivate(req->wb_page); 397 SetPagePrivate(req->wb_page);
378 set_page_private(req->wb_page, (unsigned long)req); 398 set_page_private(req->wb_page, (unsigned long)req);
399 if (PageDirty(req->wb_page))
400 set_bit(PG_NEED_FLUSH, &req->wb_flags);
379 nfsi->npages++; 401 nfsi->npages++;
380 atomic_inc(&req->wb_count); 402 atomic_inc(&req->wb_count);
381 return 0; 403 return 0;
@@ -395,6 +417,8 @@ static void nfs_inode_remove_request(struct nfs_page *req)
395 set_page_private(req->wb_page, 0); 417 set_page_private(req->wb_page, 0);
396 ClearPagePrivate(req->wb_page); 418 ClearPagePrivate(req->wb_page);
397 radix_tree_delete(&nfsi->nfs_page_tree, req->wb_index); 419 radix_tree_delete(&nfsi->nfs_page_tree, req->wb_index);
420 if (test_and_clear_bit(PG_NEED_FLUSH, &req->wb_flags))
421 __set_page_dirty_nobuffers(req->wb_page);
398 nfsi->npages--; 422 nfsi->npages--;
399 if (!nfsi->npages) { 423 if (!nfsi->npages) {
400 spin_unlock(&nfsi->req_lock); 424 spin_unlock(&nfsi->req_lock);
@@ -406,24 +430,6 @@ static void nfs_inode_remove_request(struct nfs_page *req)
406 nfs_release_request(req); 430 nfs_release_request(req);
407} 431}
408 432
409/*
410 * Add a request to the inode's dirty list.
411 */
412static void
413nfs_mark_request_dirty(struct nfs_page *req)
414{
415 struct inode *inode = req->wb_context->dentry->d_inode;
416 struct nfs_inode *nfsi = NFS_I(inode);
417
418 spin_lock(&nfsi->req_lock);
419 radix_tree_tag_set(&nfsi->nfs_page_tree,
420 req->wb_index, NFS_PAGE_TAG_DIRTY);
421 nfs_list_add_request(req, &nfsi->dirty);
422 nfsi->ndirty++;
423 spin_unlock(&nfsi->req_lock);
424 __mark_inode_dirty(inode, I_DIRTY_PAGES);
425}
426
427static void 433static void
428nfs_redirty_request(struct nfs_page *req) 434nfs_redirty_request(struct nfs_page *req)
429{ 435{
@@ -438,7 +444,7 @@ nfs_dirty_request(struct nfs_page *req)
438{ 444{
439 struct page *page = req->wb_page; 445 struct page *page = req->wb_page;
440 446
441 if (page == NULL) 447 if (page == NULL || test_bit(PG_NEED_COMMIT, &req->wb_flags))
442 return 0; 448 return 0;
443 return !PageWriteback(req->wb_page); 449 return !PageWriteback(req->wb_page);
444} 450}
@@ -456,10 +462,48 @@ nfs_mark_request_commit(struct nfs_page *req)
456 spin_lock(&nfsi->req_lock); 462 spin_lock(&nfsi->req_lock);
457 nfs_list_add_request(req, &nfsi->commit); 463 nfs_list_add_request(req, &nfsi->commit);
458 nfsi->ncommit++; 464 nfsi->ncommit++;
465 set_bit(PG_NEED_COMMIT, &(req)->wb_flags);
459 spin_unlock(&nfsi->req_lock); 466 spin_unlock(&nfsi->req_lock);
460 inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); 467 inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
461 __mark_inode_dirty(inode, I_DIRTY_DATASYNC); 468 __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
462} 469}
470
471static inline
472int nfs_write_need_commit(struct nfs_write_data *data)
473{
474 return data->verf.committed != NFS_FILE_SYNC;
475}
476
477static inline
478int nfs_reschedule_unstable_write(struct nfs_page *req)
479{
480 if (test_bit(PG_NEED_COMMIT, &req->wb_flags)) {
481 nfs_mark_request_commit(req);
482 return 1;
483 }
484 if (test_and_clear_bit(PG_NEED_RESCHED, &req->wb_flags)) {
485 nfs_redirty_request(req);
486 return 1;
487 }
488 return 0;
489}
490#else
491static inline void
492nfs_mark_request_commit(struct nfs_page *req)
493{
494}
495
496static inline
497int nfs_write_need_commit(struct nfs_write_data *data)
498{
499 return 0;
500}
501
502static inline
503int nfs_reschedule_unstable_write(struct nfs_page *req)
504{
505 return 0;
506}
463#endif 507#endif
464 508
465/* 509/*
@@ -467,11 +511,11 @@ nfs_mark_request_commit(struct nfs_page *req)
467 * 511 *
468 * Interruptible by signals only if mounted with intr flag. 512 * Interruptible by signals only if mounted with intr flag.
469 */ 513 */
470static int nfs_wait_on_requests_locked(struct inode *inode, unsigned long idx_start, unsigned int npages) 514static int nfs_wait_on_requests_locked(struct inode *inode, pgoff_t idx_start, unsigned int npages)
471{ 515{
472 struct nfs_inode *nfsi = NFS_I(inode); 516 struct nfs_inode *nfsi = NFS_I(inode);
473 struct nfs_page *req; 517 struct nfs_page *req;
474 unsigned long idx_end, next; 518 pgoff_t idx_end, next;
475 unsigned int res = 0; 519 unsigned int res = 0;
476 int error; 520 int error;
477 521
@@ -500,18 +544,6 @@ static int nfs_wait_on_requests_locked(struct inode *inode, unsigned long idx_st
500 return res; 544 return res;
501} 545}
502 546
503static void nfs_cancel_dirty_list(struct list_head *head)
504{
505 struct nfs_page *req;
506 while(!list_empty(head)) {
507 req = nfs_list_entry(head->next);
508 nfs_list_remove_request(req);
509 nfs_end_page_writeback(req->wb_page);
510 nfs_inode_remove_request(req);
511 nfs_clear_page_writeback(req);
512 }
513}
514
515static void nfs_cancel_commit_list(struct list_head *head) 547static void nfs_cancel_commit_list(struct list_head *head)
516{ 548{
517 struct nfs_page *req; 549 struct nfs_page *req;
@@ -520,6 +552,7 @@ static void nfs_cancel_commit_list(struct list_head *head)
520 req = nfs_list_entry(head->next); 552 req = nfs_list_entry(head->next);
521 dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); 553 dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
522 nfs_list_remove_request(req); 554 nfs_list_remove_request(req);
555 clear_bit(PG_NEED_COMMIT, &(req)->wb_flags);
523 nfs_inode_remove_request(req); 556 nfs_inode_remove_request(req);
524 nfs_unlock_request(req); 557 nfs_unlock_request(req);
525 } 558 }
@@ -537,7 +570,7 @@ static void nfs_cancel_commit_list(struct list_head *head)
537 * The requests are *not* checked to ensure that they form a contiguous set. 570 * The requests are *not* checked to ensure that they form a contiguous set.
538 */ 571 */
539static int 572static int
540nfs_scan_commit(struct inode *inode, struct list_head *dst, unsigned long idx_start, unsigned int npages) 573nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages)
541{ 574{
542 struct nfs_inode *nfsi = NFS_I(inode); 575 struct nfs_inode *nfsi = NFS_I(inode);
543 int res = 0; 576 int res = 0;
@@ -551,40 +584,12 @@ nfs_scan_commit(struct inode *inode, struct list_head *dst, unsigned long idx_st
551 return res; 584 return res;
552} 585}
553#else 586#else
554static inline int nfs_scan_commit(struct inode *inode, struct list_head *dst, unsigned long idx_start, unsigned int npages) 587static inline int nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages)
555{ 588{
556 return 0; 589 return 0;
557} 590}
558#endif 591#endif
559 592
560static int nfs_wait_on_write_congestion(struct address_space *mapping)
561{
562 struct inode *inode = mapping->host;
563 struct backing_dev_info *bdi = mapping->backing_dev_info;
564 int ret = 0;
565
566 might_sleep();
567
568 if (!bdi_write_congested(bdi))
569 return 0;
570
571 nfs_inc_stats(inode, NFSIOS_CONGESTIONWAIT);
572
573 do {
574 struct rpc_clnt *clnt = NFS_CLIENT(inode);
575 sigset_t oldset;
576
577 rpc_clnt_sigmask(clnt, &oldset);
578 ret = congestion_wait_interruptible(WRITE, HZ/10);
579 rpc_clnt_sigunmask(clnt, &oldset);
580 if (ret == -ERESTARTSYS)
581 break;
582 ret = 0;
583 } while (bdi_write_congested(bdi));
584
585 return ret;
586}
587
588/* 593/*
589 * Try to update any existing write request, or create one if there is none. 594 * Try to update any existing write request, or create one if there is none.
590 * In order to match, the request's credentials must match those of 595 * In order to match, the request's credentials must match those of
@@ -599,12 +604,10 @@ static struct nfs_page * nfs_update_request(struct nfs_open_context* ctx,
599 struct inode *inode = mapping->host; 604 struct inode *inode = mapping->host;
600 struct nfs_inode *nfsi = NFS_I(inode); 605 struct nfs_inode *nfsi = NFS_I(inode);
601 struct nfs_page *req, *new = NULL; 606 struct nfs_page *req, *new = NULL;
602 unsigned long rqend, end; 607 pgoff_t rqend, end;
603 608
604 end = offset + bytes; 609 end = offset + bytes;
605 610
606 if (nfs_wait_on_write_congestion(mapping))
607 return ERR_PTR(-ERESTARTSYS);
608 for (;;) { 611 for (;;) {
609 /* Loop over all inode entries and see if we find 612 /* Loop over all inode entries and see if we find
610 * A request for the page we wish to update 613 * A request for the page we wish to update
@@ -746,26 +749,12 @@ int nfs_updatepage(struct file *file, struct page *page,
746 749
747static void nfs_writepage_release(struct nfs_page *req) 750static void nfs_writepage_release(struct nfs_page *req)
748{ 751{
749 nfs_end_page_writeback(req->wb_page);
750
751#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
752 if (!PageError(req->wb_page)) {
753 if (NFS_NEED_RESCHED(req)) {
754 nfs_redirty_request(req);
755 goto out;
756 } else if (NFS_NEED_COMMIT(req)) {
757 nfs_mark_request_commit(req);
758 goto out;
759 }
760 }
761 nfs_inode_remove_request(req);
762 752
763out: 753 if (PageError(req->wb_page) || !nfs_reschedule_unstable_write(req)) {
764 nfs_clear_commit(req); 754 nfs_end_page_writeback(req->wb_page);
765 nfs_clear_reschedule(req); 755 nfs_inode_remove_request(req);
766#else 756 } else
767 nfs_inode_remove_request(req); 757 nfs_end_page_writeback(req->wb_page);
768#endif
769 nfs_clear_page_writeback(req); 758 nfs_clear_page_writeback(req);
770} 759}
771 760
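
nfs_writepage_release() now delegates the commit-vs-resend decision to nfs_reschedule_unstable_write(), added earlier in this patch. A compilable miniature of that decision, with plain flags standing in for the kernel's atomic bitops:

#include <stdio.h>

enum { NEED_COMMIT = 1, NEED_RESCHED = 2 };

/* Returns 1 if the request needs another pass (commit or rewrite),
 * 0 if it can be removed -- mirrors nfs_reschedule_unstable_write(). */
static int reschedule_unstable_write(unsigned *flags)
{
	if (*flags & NEED_COMMIT) {
		printf("queue for COMMIT\n");
		return 1;
	}
	if (*flags & NEED_RESCHED) {
		*flags &= ~NEED_RESCHED;	/* test_and_clear */
		printf("redirty and resend\n");
		return 1;
	}
	return 0;
}

int main(void)
{
	unsigned f = NEED_RESCHED;
	printf("done=%d\n", !reschedule_unstable_write(&f));
	return 0;
}
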
@@ -842,7 +831,7 @@ static void nfs_execute_write(struct nfs_write_data *data)
842 * Generate multiple small requests to write out a single 831 * Generate multiple small requests to write out a single
843 * contiguous dirty area on one page. 832 * contiguous dirty area on one page.
844 */ 833 */
845static int nfs_flush_multi(struct inode *inode, struct list_head *head, int how) 834static int nfs_flush_multi(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int how)
846{ 835{
847 struct nfs_page *req = nfs_list_entry(head->next); 836 struct nfs_page *req = nfs_list_entry(head->next);
848 struct page *page = req->wb_page; 837 struct page *page = req->wb_page;
@@ -854,11 +843,11 @@ static int nfs_flush_multi(struct inode *inode, struct list_head *head, int how)
854 843
855 nfs_list_remove_request(req); 844 nfs_list_remove_request(req);
856 845
857 nbytes = req->wb_bytes; 846 nbytes = count;
858 do { 847 do {
859 size_t len = min(nbytes, wsize); 848 size_t len = min(nbytes, wsize);
860 849
861 data = nfs_writedata_alloc(len); 850 data = nfs_writedata_alloc(1);
862 if (!data) 851 if (!data)
863 goto out_bad; 852 goto out_bad;
864 list_add(&data->pages, &list); 853 list_add(&data->pages, &list);
@@ -869,23 +858,19 @@ static int nfs_flush_multi(struct inode *inode, struct list_head *head, int how)
869 858
870 ClearPageError(page); 859 ClearPageError(page);
871 offset = 0; 860 offset = 0;
872 nbytes = req->wb_bytes; 861 nbytes = count;
873 do { 862 do {
874 data = list_entry(list.next, struct nfs_write_data, pages); 863 data = list_entry(list.next, struct nfs_write_data, pages);
875 list_del_init(&data->pages); 864 list_del_init(&data->pages);
876 865
877 data->pagevec[0] = page; 866 data->pagevec[0] = page;
878 867
879 if (nbytes > wsize) { 868 if (nbytes < wsize)
880 nfs_write_rpcsetup(req, data, &nfs_write_partial_ops, 869 wsize = nbytes;
881 wsize, offset, how); 870 nfs_write_rpcsetup(req, data, &nfs_write_partial_ops,
882 offset += wsize; 871 wsize, offset, how);
883 nbytes -= wsize; 872 offset += wsize;
884 } else { 873 nbytes -= wsize;
885 nfs_write_rpcsetup(req, data, &nfs_write_partial_ops,
886 nbytes, offset, how);
887 nbytes = 0;
888 }
889 nfs_execute_write(data); 874 nfs_execute_write(data);
890 } while (nbytes != 0); 875 } while (nbytes != 0);
891 876
@@ -897,8 +882,8 @@ out_bad:
897 list_del(&data->pages); 882 list_del(&data->pages);
898 nfs_writedata_release(data); 883 nfs_writedata_release(data);
899 } 884 }
900 nfs_end_page_writeback(req->wb_page);
901 nfs_redirty_request(req); 885 nfs_redirty_request(req);
886 nfs_end_page_writeback(req->wb_page);
902 nfs_clear_page_writeback(req); 887 nfs_clear_page_writeback(req);
903 return -ENOMEM; 888 return -ENOMEM;
904} 889}
@@ -911,26 +896,23 @@ out_bad:
911 * This is the case if nfs_updatepage detects a conflicting request 896 * This is the case if nfs_updatepage detects a conflicting request
912 * that has been written but not committed. 897 * that has been written but not committed.
913 */ 898 */
914static int nfs_flush_one(struct inode *inode, struct list_head *head, int how) 899static int nfs_flush_one(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int how)
915{ 900{
916 struct nfs_page *req; 901 struct nfs_page *req;
917 struct page **pages; 902 struct page **pages;
918 struct nfs_write_data *data; 903 struct nfs_write_data *data;
919 unsigned int count;
920 904
921 data = nfs_writedata_alloc(NFS_SERVER(inode)->wsize); 905 data = nfs_writedata_alloc(npages);
922 if (!data) 906 if (!data)
923 goto out_bad; 907 goto out_bad;
924 908
925 pages = data->pagevec; 909 pages = data->pagevec;
926 count = 0;
927 while (!list_empty(head)) { 910 while (!list_empty(head)) {
928 req = nfs_list_entry(head->next); 911 req = nfs_list_entry(head->next);
929 nfs_list_remove_request(req); 912 nfs_list_remove_request(req);
930 nfs_list_add_request(req, &data->pages); 913 nfs_list_add_request(req, &data->pages);
931 ClearPageError(req->wb_page); 914 ClearPageError(req->wb_page);
932 *pages++ = req->wb_page; 915 *pages++ = req->wb_page;
933 count += req->wb_bytes;
934 } 916 }
935 req = nfs_list_entry(data->pages.next); 917 req = nfs_list_entry(data->pages.next);
936 918
@@ -943,47 +925,22 @@ static int nfs_flush_one(struct inode *inode, struct list_head *head, int how)
943 while (!list_empty(head)) { 925 while (!list_empty(head)) {
944 struct nfs_page *req = nfs_list_entry(head->next); 926 struct nfs_page *req = nfs_list_entry(head->next);
945 nfs_list_remove_request(req); 927 nfs_list_remove_request(req);
946 nfs_end_page_writeback(req->wb_page);
947 nfs_redirty_request(req); 928 nfs_redirty_request(req);
929 nfs_end_page_writeback(req->wb_page);
948 nfs_clear_page_writeback(req); 930 nfs_clear_page_writeback(req);
949 } 931 }
950 return -ENOMEM; 932 return -ENOMEM;
951} 933}
952 934
953static int nfs_flush_list(struct inode *inode, struct list_head *head, int npages, int how) 935static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
936 struct inode *inode, int ioflags)
954{ 937{
955 LIST_HEAD(one_request);
956 int (*flush_one)(struct inode *, struct list_head *, int);
957 struct nfs_page *req;
958 int wpages = NFS_SERVER(inode)->wpages;
959 int wsize = NFS_SERVER(inode)->wsize; 938 int wsize = NFS_SERVER(inode)->wsize;
960 int error;
961 939
962 flush_one = nfs_flush_one;
963 if (wsize < PAGE_CACHE_SIZE) 940 if (wsize < PAGE_CACHE_SIZE)
964 flush_one = nfs_flush_multi; 941 nfs_pageio_init(pgio, inode, nfs_flush_multi, wsize, ioflags);
965 /* For single writes, FLUSH_STABLE is more efficient */ 942 else
966 if (npages <= wpages && npages == NFS_I(inode)->npages 943 nfs_pageio_init(pgio, inode, nfs_flush_one, wsize, ioflags);
967 && nfs_list_entry(head->next)->wb_bytes <= wsize)
968 how |= FLUSH_STABLE;
969
970 do {
971 nfs_coalesce_requests(head, &one_request, wpages);
972 req = nfs_list_entry(one_request.next);
973 error = flush_one(inode, &one_request, how);
974 if (error < 0)
975 goto out_err;
976 } while (!list_empty(head));
977 return 0;
978out_err:
979 while (!list_empty(head)) {
980 req = nfs_list_entry(head->next);
981 nfs_list_remove_request(req);
982 nfs_end_page_writeback(req->wb_page);
983 nfs_redirty_request(req);
984 nfs_clear_page_writeback(req);
985 }
986 return error;
987} 944}
988 945
989/* 946/*
@@ -1008,22 +965,28 @@ static void nfs_writeback_done_partial(struct rpc_task *task, void *calldata)
1008 nfs_set_pageerror(page); 965 nfs_set_pageerror(page);
1009 req->wb_context->error = task->tk_status; 966 req->wb_context->error = task->tk_status;
1010 dprintk(", error = %d\n", task->tk_status); 967 dprintk(", error = %d\n", task->tk_status);
1011 } else { 968 goto out;
1012#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
1013 if (data->verf.committed < NFS_FILE_SYNC) {
1014 if (!NFS_NEED_COMMIT(req)) {
1015 nfs_defer_commit(req);
1016 memcpy(&req->wb_verf, &data->verf, sizeof(req->wb_verf));
1017 dprintk(" defer commit\n");
1018 } else if (memcmp(&req->wb_verf, &data->verf, sizeof(req->wb_verf))) {
1019 nfs_defer_reschedule(req);
1020 dprintk(" server reboot detected\n");
1021 }
1022 } else
1023#endif
1024 dprintk(" OK\n");
1025 } 969 }
1026 970
971 if (nfs_write_need_commit(data)) {
972 spinlock_t *req_lock = &NFS_I(page->mapping->host)->req_lock;
973
974 spin_lock(req_lock);
975 if (test_bit(PG_NEED_RESCHED, &req->wb_flags)) {
 976 /* Do nothing; we need to resend the writes */

977 } else if (!test_and_set_bit(PG_NEED_COMMIT, &req->wb_flags)) {
978 memcpy(&req->wb_verf, &data->verf, sizeof(req->wb_verf));
979 dprintk(" defer commit\n");
980 } else if (memcmp(&req->wb_verf, &data->verf, sizeof(req->wb_verf))) {
981 set_bit(PG_NEED_RESCHED, &req->wb_flags);
982 clear_bit(PG_NEED_COMMIT, &req->wb_flags);
983 dprintk(" server reboot detected\n");
984 }
985 spin_unlock(req_lock);
986 } else
987 dprintk(" OK\n");
988
989out:
1027 if (atomic_dec_and_test(&req->wb_complete)) 990 if (atomic_dec_and_test(&req->wb_complete))
1028 nfs_writepage_release(req); 991 nfs_writepage_release(req);
1029} 992}
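
The rewritten completion handler distinguishes the unstable-write cases by comparing write verifiers: the first completion records the verifier and defers a commit, while a later completion carrying a different verifier means the server rebooted and the data must be resent. A compilable miniature of that logic:

#include <stdio.h>
#include <string.h>

/* The verifier is an opaque cookie that changes when the server
 * reboots and loses its unstable (uncommitted) data. */
struct req { unsigned char verf[8]; int need_commit; int need_resched; };

static void on_unstable_write_done(struct req *r, const unsigned char *verf)
{
	if (r->need_resched) {
		/* already marked for resend; nothing to do */
	} else if (!r->need_commit) {
		memcpy(r->verf, verf, sizeof(r->verf));	/* defer commit */
		r->need_commit = 1;
	} else if (memcmp(r->verf, verf, sizeof(r->verf))) {
		r->need_resched = 1;	/* server reboot detected */
		r->need_commit = 0;
	}
}

int main(void)
{
	struct req r = {0};
	on_unstable_write_done(&r, (const unsigned char *)"bootgen1");
	on_unstable_write_done(&r, (const unsigned char *)"bootgen2");
	printf("resched=%d commit=%d\n", r.need_resched, r.need_commit);
	return 0;
}
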
@@ -1064,25 +1027,21 @@ static void nfs_writeback_done_full(struct rpc_task *task, void *calldata)
1064 if (task->tk_status < 0) { 1027 if (task->tk_status < 0) {
1065 nfs_set_pageerror(page); 1028 nfs_set_pageerror(page);
1066 req->wb_context->error = task->tk_status; 1029 req->wb_context->error = task->tk_status;
1067 nfs_end_page_writeback(page);
1068 nfs_inode_remove_request(req);
1069 dprintk(", error = %d\n", task->tk_status); 1030 dprintk(", error = %d\n", task->tk_status);
1070 goto next; 1031 goto remove_request;
1071 } 1032 }
1072 nfs_end_page_writeback(page);
1073 1033
1074#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) 1034 if (nfs_write_need_commit(data)) {
1075 if (data->args.stable != NFS_UNSTABLE || data->verf.committed == NFS_FILE_SYNC) { 1035 memcpy(&req->wb_verf, &data->verf, sizeof(req->wb_verf));
1076 nfs_inode_remove_request(req); 1036 nfs_mark_request_commit(req);
1077 dprintk(" OK\n"); 1037 nfs_end_page_writeback(page);
1038 dprintk(" marked for commit\n");
1078 goto next; 1039 goto next;
1079 } 1040 }
1080 memcpy(&req->wb_verf, &data->verf, sizeof(req->wb_verf)); 1041 dprintk(" OK\n");
1081 nfs_mark_request_commit(req); 1042remove_request:
1082 dprintk(" marked for commit\n"); 1043 nfs_end_page_writeback(page);
1083#else
1084 nfs_inode_remove_request(req); 1044 nfs_inode_remove_request(req);
1085#endif
1086 next: 1045 next:
1087 nfs_clear_page_writeback(req); 1046 nfs_clear_page_writeback(req);
1088 } 1047 }
@@ -1270,6 +1229,7 @@ static void nfs_commit_done(struct rpc_task *task, void *calldata)
1270 while (!list_empty(&data->pages)) { 1229 while (!list_empty(&data->pages)) {
1271 req = nfs_list_entry(data->pages.next); 1230 req = nfs_list_entry(data->pages.next);
1272 nfs_list_remove_request(req); 1231 nfs_list_remove_request(req);
1232 clear_bit(PG_NEED_COMMIT, &(req)->wb_flags);
1273 dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); 1233 dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
1274 1234
1275 dprintk("NFS: commit (%s/%Ld %d@%Ld)", 1235 dprintk("NFS: commit (%s/%Ld %d@%Ld)",
@@ -1304,31 +1264,7 @@ static const struct rpc_call_ops nfs_commit_ops = {
1304 .rpc_call_done = nfs_commit_done, 1264 .rpc_call_done = nfs_commit_done,
1305 .rpc_release = nfs_commit_release, 1265 .rpc_release = nfs_commit_release,
1306}; 1266};
1307#else
1308static inline int nfs_commit_list(struct inode *inode, struct list_head *head, int how)
1309{
1310 return 0;
1311}
1312#endif
1313
1314static long nfs_flush_mapping(struct address_space *mapping, struct writeback_control *wbc, int how)
1315{
1316 struct nfs_inode *nfsi = NFS_I(mapping->host);
1317 LIST_HEAD(head);
1318 long res;
1319 1267
1320 spin_lock(&nfsi->req_lock);
1321 res = nfs_scan_dirty(mapping, wbc, &head);
1322 spin_unlock(&nfsi->req_lock);
1323 if (res) {
1324 int error = nfs_flush_list(mapping->host, &head, res, how);
1325 if (error < 0)
1326 return error;
1327 }
1328 return res;
1329}
1330
1331#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
1332int nfs_commit_inode(struct inode *inode, int how) 1268int nfs_commit_inode(struct inode *inode, int how)
1333{ 1269{
1334 struct nfs_inode *nfsi = NFS_I(inode); 1270 struct nfs_inode *nfsi = NFS_I(inode);
@@ -1345,13 +1281,18 @@ int nfs_commit_inode(struct inode *inode, int how)
1345 } 1281 }
1346 return res; 1282 return res;
1347} 1283}
1284#else
1285static inline int nfs_commit_list(struct inode *inode, struct list_head *head, int how)
1286{
1287 return 0;
1288}
1348#endif 1289#endif
1349 1290
1350long nfs_sync_mapping_wait(struct address_space *mapping, struct writeback_control *wbc, int how) 1291long nfs_sync_mapping_wait(struct address_space *mapping, struct writeback_control *wbc, int how)
1351{ 1292{
1352 struct inode *inode = mapping->host; 1293 struct inode *inode = mapping->host;
1353 struct nfs_inode *nfsi = NFS_I(inode); 1294 struct nfs_inode *nfsi = NFS_I(inode);
1354 unsigned long idx_start, idx_end; 1295 pgoff_t idx_start, idx_end;
1355 unsigned int npages = 0; 1296 unsigned int npages = 0;
1356 LIST_HEAD(head); 1297 LIST_HEAD(head);
1357 int nocommit = how & FLUSH_NOCOMMIT; 1298 int nocommit = how & FLUSH_NOCOMMIT;
@@ -1364,41 +1305,24 @@ long nfs_sync_mapping_wait(struct address_space *mapping, struct writeback_contr
1364 idx_start = wbc->range_start >> PAGE_CACHE_SHIFT; 1305 idx_start = wbc->range_start >> PAGE_CACHE_SHIFT;
1365 idx_end = wbc->range_end >> PAGE_CACHE_SHIFT; 1306 idx_end = wbc->range_end >> PAGE_CACHE_SHIFT;
1366 if (idx_end > idx_start) { 1307 if (idx_end > idx_start) {
1367 unsigned long l_npages = 1 + idx_end - idx_start; 1308 pgoff_t l_npages = 1 + idx_end - idx_start;
1368 npages = l_npages; 1309 npages = l_npages;
1369 if (sizeof(npages) != sizeof(l_npages) && 1310 if (sizeof(npages) != sizeof(l_npages) &&
1370 (unsigned long)npages != l_npages) 1311 (pgoff_t)npages != l_npages)
1371 npages = 0; 1312 npages = 0;
1372 } 1313 }
1373 } 1314 }
1374 how &= ~FLUSH_NOCOMMIT; 1315 how &= ~FLUSH_NOCOMMIT;
1375 spin_lock(&nfsi->req_lock); 1316 spin_lock(&nfsi->req_lock);
1376 do { 1317 do {
1377 wbc->pages_skipped = 0;
1378 ret = nfs_wait_on_requests_locked(inode, idx_start, npages); 1318 ret = nfs_wait_on_requests_locked(inode, idx_start, npages);
1379 if (ret != 0) 1319 if (ret != 0)
1380 continue; 1320 continue;
1381 pages = nfs_scan_dirty(mapping, wbc, &head);
1382 if (pages != 0) {
1383 spin_unlock(&nfsi->req_lock);
1384 if (how & FLUSH_INVALIDATE) {
1385 nfs_cancel_dirty_list(&head);
1386 ret = pages;
1387 } else
1388 ret = nfs_flush_list(inode, &head, pages, how);
1389 spin_lock(&nfsi->req_lock);
1390 continue;
1391 }
1392 if (wbc->pages_skipped != 0)
1393 continue;
1394 if (nocommit) 1321 if (nocommit)
1395 break; 1322 break;
1396 pages = nfs_scan_commit(inode, &head, idx_start, npages); 1323 pages = nfs_scan_commit(inode, &head, idx_start, npages);
1397 if (pages == 0) { 1324 if (pages == 0)
1398 if (wbc->pages_skipped != 0)
1399 continue;
1400 break; 1325 break;
1401 }
1402 if (how & FLUSH_INVALIDATE) { 1326 if (how & FLUSH_INVALIDATE) {
1403 spin_unlock(&nfsi->req_lock); 1327 spin_unlock(&nfsi->req_lock);
1404 nfs_cancel_commit_list(&head); 1328 nfs_cancel_commit_list(&head);
@@ -1430,7 +1354,7 @@ int nfs_wb_all(struct inode *inode)
1430 }; 1354 };
1431 int ret; 1355 int ret;
1432 1356
1433 ret = generic_writepages(mapping, &wbc); 1357 ret = nfs_writepages(mapping, &wbc);
1434 if (ret < 0) 1358 if (ret < 0)
1435 goto out; 1359 goto out;
1436 ret = nfs_sync_mapping_wait(mapping, &wbc, 0); 1360 ret = nfs_sync_mapping_wait(mapping, &wbc, 0);
@@ -1453,11 +1377,9 @@ int nfs_sync_mapping_range(struct address_space *mapping, loff_t range_start, lo
1453 }; 1377 };
1454 int ret; 1378 int ret;
1455 1379
1456 if (!(how & FLUSH_NOWRITEPAGE)) { 1380 ret = nfs_writepages(mapping, &wbc);
1457 ret = generic_writepages(mapping, &wbc); 1381 if (ret < 0)
1458 if (ret < 0) 1382 goto out;
1459 goto out;
1460 }
1461 ret = nfs_sync_mapping_wait(mapping, &wbc, how); 1383 ret = nfs_sync_mapping_wait(mapping, &wbc, how);
1462 if (ret >= 0) 1384 if (ret >= 0)
1463 return 0; 1385 return 0;
@@ -1480,7 +1402,7 @@ int nfs_wb_page_priority(struct inode *inode, struct page *page, int how)
1480 int ret; 1402 int ret;
1481 1403
1482 BUG_ON(!PageLocked(page)); 1404 BUG_ON(!PageLocked(page));
1483 if (!(how & FLUSH_NOWRITEPAGE) && clear_page_dirty_for_io(page)) { 1405 if (clear_page_dirty_for_io(page)) {
1484 ret = nfs_writepage_locked(page, &wbc); 1406 ret = nfs_writepage_locked(page, &wbc);
1485 if (ret < 0) 1407 if (ret < 0)
1486 goto out; 1408 goto out;
@@ -1505,15 +1427,32 @@ int nfs_wb_page(struct inode *inode, struct page* page)
1505 1427
1506int nfs_set_page_dirty(struct page *page) 1428int nfs_set_page_dirty(struct page *page)
1507{ 1429{
1430 struct address_space *mapping = page->mapping;
1431 struct inode *inode;
1432 spinlock_t *req_lock;
1508 struct nfs_page *req; 1433 struct nfs_page *req;
1434 int ret;
1509 1435
1510 req = nfs_page_find_request(page); 1436 if (!mapping)
1437 goto out_raced;
1438 inode = mapping->host;
1439 if (!inode)
1440 goto out_raced;
1441 req_lock = &NFS_I(inode)->req_lock;
1442 spin_lock(req_lock);
1443 req = nfs_page_find_request_locked(page);
1511 if (req != NULL) { 1444 if (req != NULL) {
1512 /* Mark any existing write requests for flushing */ 1445 /* Mark any existing write requests for flushing */
1513 set_bit(PG_NEED_FLUSH, &req->wb_flags); 1446 ret = !test_and_set_bit(PG_NEED_FLUSH, &req->wb_flags);
1447 spin_unlock(req_lock);
1514 nfs_release_request(req); 1448 nfs_release_request(req);
1449 return ret;
1515 } 1450 }
1516 return __set_page_dirty_nobuffers(page); 1451 ret = __set_page_dirty_nobuffers(page);
1452 spin_unlock(req_lock);
1453 return ret;
1454out_raced:
1455 return !TestSetPageDirty(page);
1517} 1456}
1518 1457
1519 1458
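
Besides closing the lookup race, the nfs_set_page_dirty() rewrite makes the return value meaningful: set_page_dirty() callers expect non-zero only when the page was newly dirtied, which is why the flag update became ret = !test_and_set_bit(...). A compilable miniature of that idiom:

#include <stdio.h>

/* test-and-set returns the *old* state, so negating it yields
 * "was this newly set?" -- exactly what the caller wants. */
static int test_and_set(unsigned *word, unsigned bit)
{
	unsigned old = *word & (1u << bit);
	*word |= 1u << bit;
	return old != 0;
}

int main(void)
{
	unsigned flags = 0;
	printf("newly dirtied: %d\n", !test_and_set(&flags, 0));	/* 1 */
	printf("newly dirtied: %d\n", !test_and_set(&flags, 0));	/* 0 */
	return 0;
}
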
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index fb14d68eacab..32ffea033c7a 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -315,16 +315,13 @@ out:
315/* 315/*
316 * RPC procedure tables 316 * RPC procedure tables
317 */ 317 */
318#ifndef MAX
319# define MAX(a, b) (((a) > (b))? (a) : (b))
320#endif
321
322#define PROC(proc, call, argtype, restype) \ 318#define PROC(proc, call, argtype, restype) \
323[NFSPROC4_CLNT_##proc] = { \ 319[NFSPROC4_CLNT_##proc] = { \
324 .p_proc = NFSPROC4_CB_##call, \ 320 .p_proc = NFSPROC4_CB_##call, \
325 .p_encode = (kxdrproc_t) nfs4_xdr_##argtype, \ 321 .p_encode = (kxdrproc_t) nfs4_xdr_##argtype, \
326 .p_decode = (kxdrproc_t) nfs4_xdr_##restype, \ 322 .p_decode = (kxdrproc_t) nfs4_xdr_##restype, \
327 .p_bufsiz = MAX(NFS4_##argtype##_sz,NFS4_##restype##_sz) << 2, \ 323 .p_arglen = NFS4_##argtype##_sz, \
324 .p_replen = NFS4_##restype##_sz, \
328 .p_statidx = NFSPROC4_CB_##call, \ 325 .p_statidx = NFSPROC4_CB_##call, \
329 .p_name = #proc, \ 326 .p_name = #proc, \
330} 327}
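
Splitting p_bufsiz into p_arglen and p_replen lets the RPC layer size the send and receive buffers independently instead of allocating MAX() of the two for both directions, which is also why the local MAX macro could be deleted. A hand-expanded sketch of one resulting table entry; the CB_RECALL/COMPOUND identifiers follow the PROC() macro's naming pattern but are assumptions, not copied from the file:

[NFSPROC4_CLNT_CB_RECALL] = {
	.p_proc    = NFSPROC4_CB_COMPOUND,
	.p_encode  = (kxdrproc_t) nfs4_xdr_enc_cb_recall,
	.p_decode  = (kxdrproc_t) nfs4_xdr_dec_cb_recall,
	.p_arglen  = NFS4_enc_cb_recall_sz,	/* send size, 32-bit words */
	.p_replen  = NFS4_dec_cb_recall_sz,	/* receive size, 32-bit words */
	.p_statidx = NFSPROC4_CB_COMPOUND,
	.p_name    = "CB_RECALL",
},
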
diff --git a/fs/ntfs/aops.h b/fs/ntfs/aops.h
index 9393f4b1e298..caecc58f529c 100644
--- a/fs/ntfs/aops.h
+++ b/fs/ntfs/aops.h
@@ -89,9 +89,8 @@ static inline struct page *ntfs_map_page(struct address_space *mapping,
89 struct page *page = read_mapping_page(mapping, index, NULL); 89 struct page *page = read_mapping_page(mapping, index, NULL);
90 90
91 if (!IS_ERR(page)) { 91 if (!IS_ERR(page)) {
92 wait_on_page_locked(page);
93 kmap(page); 92 kmap(page);
94 if (PageUptodate(page) && !PageError(page)) 93 if (!PageError(page))
95 return page; 94 return page;
96 ntfs_unmap_page(page); 95 ntfs_unmap_page(page);
97 return ERR_PTR(-EIO); 96 return ERR_PTR(-EIO);
diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c
index 7659cc192995..1c08fefe487a 100644
--- a/fs/ntfs/attrib.c
+++ b/fs/ntfs/attrib.c
@@ -2532,14 +2532,7 @@ int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, const u8 val)
2532 page = read_mapping_page(mapping, idx, NULL); 2532 page = read_mapping_page(mapping, idx, NULL);
2533 if (IS_ERR(page)) { 2533 if (IS_ERR(page)) {
2534 ntfs_error(vol->sb, "Failed to read first partial " 2534 ntfs_error(vol->sb, "Failed to read first partial "
2535 "page (sync error, index 0x%lx).", idx); 2535 "page (error, index 0x%lx).", idx);
2536 return PTR_ERR(page);
2537 }
2538 wait_on_page_locked(page);
2539 if (unlikely(!PageUptodate(page))) {
2540 ntfs_error(vol->sb, "Failed to read first partial page "
2541 "(async error, index 0x%lx).", idx);
2542 page_cache_release(page);
2543 return PTR_ERR(page); 2536 return PTR_ERR(page);
2544 } 2537 }
2545 /* 2538 /*
@@ -2602,14 +2595,7 @@ int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, const u8 val)
2602 page = read_mapping_page(mapping, idx, NULL); 2595 page = read_mapping_page(mapping, idx, NULL);
2603 if (IS_ERR(page)) { 2596 if (IS_ERR(page)) {
2604 ntfs_error(vol->sb, "Failed to read last partial page " 2597 ntfs_error(vol->sb, "Failed to read last partial page "
2605 "(sync error, index 0x%lx).", idx); 2598 "(error, index 0x%lx).", idx);
2606 return PTR_ERR(page);
2607 }
2608 wait_on_page_locked(page);
2609 if (unlikely(!PageUptodate(page))) {
2610 ntfs_error(vol->sb, "Failed to read last partial page "
2611 "(async error, index 0x%lx).", idx);
2612 page_cache_release(page);
2613 return PTR_ERR(page); 2599 return PTR_ERR(page);
2614 } 2600 }
2615 kaddr = kmap_atomic(page, KM_USER0); 2601 kaddr = kmap_atomic(page, KM_USER0);
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index d69c4595ccd0..dbbac5593106 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -236,8 +236,7 @@ do_non_resident_extend:
236 err = PTR_ERR(page); 236 err = PTR_ERR(page);
237 goto init_err_out; 237 goto init_err_out;
238 } 238 }
239 wait_on_page_locked(page); 239 if (unlikely(PageError(page))) {
240 if (unlikely(!PageUptodate(page) || PageError(page))) {
241 page_cache_release(page); 240 page_cache_release(page);
242 err = -EIO; 241 err = -EIO;
243 goto init_err_out; 242 goto init_err_out;
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 1594c90b7164..21d834e5ed73 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -2471,7 +2471,6 @@ static s64 get_nr_free_clusters(ntfs_volume *vol)
2471 s64 nr_free = vol->nr_clusters; 2471 s64 nr_free = vol->nr_clusters;
2472 u32 *kaddr; 2472 u32 *kaddr;
2473 struct address_space *mapping = vol->lcnbmp_ino->i_mapping; 2473 struct address_space *mapping = vol->lcnbmp_ino->i_mapping;
2474 filler_t *readpage = (filler_t*)mapping->a_ops->readpage;
2475 struct page *page; 2474 struct page *page;
2476 pgoff_t index, max_index; 2475 pgoff_t index, max_index;
2477 2476
@@ -2494,24 +2493,14 @@ static s64 get_nr_free_clusters(ntfs_volume *vol)
2494 * Read the page from page cache, getting it from backing store 2493 * Read the page from page cache, getting it from backing store
2495 * if necessary, and increment the use count. 2494 * if necessary, and increment the use count.
2496 */ 2495 */
2497 page = read_cache_page(mapping, index, (filler_t*)readpage, 2496 page = read_mapping_page(mapping, index, NULL);
2498 NULL);
2499 /* Ignore pages which errored synchronously. */ 2497 /* Ignore pages which errored synchronously. */
2500 if (IS_ERR(page)) { 2498 if (IS_ERR(page)) {
2501 ntfs_debug("Sync read_cache_page() error. Skipping " 2499 ntfs_debug("read_mapping_page() error. Skipping "
2502 "page (index 0x%lx).", index); 2500 "page (index 0x%lx).", index);
2503 nr_free -= PAGE_CACHE_SIZE * 8; 2501 nr_free -= PAGE_CACHE_SIZE * 8;
2504 continue; 2502 continue;
2505 } 2503 }
2506 wait_on_page_locked(page);
2507 /* Ignore pages which errored asynchronously. */
2508 if (!PageUptodate(page)) {
2509 ntfs_debug("Async read_cache_page() error. Skipping "
2510 "page (index 0x%lx).", index);
2511 page_cache_release(page);
2512 nr_free -= PAGE_CACHE_SIZE * 8;
2513 continue;
2514 }
2515 kaddr = (u32*)kmap_atomic(page, KM_USER0); 2504 kaddr = (u32*)kmap_atomic(page, KM_USER0);
2516 /* 2505 /*
2517 * For each 4 bytes, subtract the number of set bits. If this 2506 * For each 4 bytes, subtract the number of set bits. If this
@@ -2562,7 +2551,6 @@ static unsigned long __get_nr_free_mft_records(ntfs_volume *vol,
2562{ 2551{
2563 u32 *kaddr; 2552 u32 *kaddr;
2564 struct address_space *mapping = vol->mftbmp_ino->i_mapping; 2553 struct address_space *mapping = vol->mftbmp_ino->i_mapping;
2565 filler_t *readpage = (filler_t*)mapping->a_ops->readpage;
2566 struct page *page; 2554 struct page *page;
2567 pgoff_t index; 2555 pgoff_t index;
2568 2556
@@ -2576,21 +2564,11 @@ static unsigned long __get_nr_free_mft_records(ntfs_volume *vol,
2576 * Read the page from page cache, getting it from backing store 2564 * Read the page from page cache, getting it from backing store
2577 * if necessary, and increment the use count. 2565 * if necessary, and increment the use count.
2578 */ 2566 */
2579 page = read_cache_page(mapping, index, (filler_t*)readpage, 2567 page = read_mapping_page(mapping, index, NULL);
2580 NULL);
2581 /* Ignore pages which errored synchronously. */ 2568 /* Ignore pages which errored synchronously. */
2582 if (IS_ERR(page)) { 2569 if (IS_ERR(page)) {
2583 ntfs_debug("Sync read_cache_page() error. Skipping " 2570 ntfs_debug("read_mapping_page() error. Skipping "
2584 "page (index 0x%lx).", index);
2585 nr_free -= PAGE_CACHE_SIZE * 8;
2586 continue;
2587 }
2588 wait_on_page_locked(page);
2589 /* Ignore pages which errored asynchronously. */
2590 if (!PageUptodate(page)) {
2591 ntfs_debug("Async read_cache_page() error. Skipping "
2592 "page (index 0x%lx).", index); 2571 "page (index 0x%lx).", index);
2593 page_cache_release(page);
2594 nr_free -= PAGE_CACHE_SIZE * 8; 2572 nr_free -= PAGE_CACHE_SIZE * 8;
2595 continue; 2573 continue;
2596 } 2574 }
@@ -3107,8 +3085,7 @@ static void ntfs_big_inode_init_once(void *foo, struct kmem_cache *cachep,
3107{ 3085{
3108 ntfs_inode *ni = (ntfs_inode *)foo; 3086 ntfs_inode *ni = (ntfs_inode *)foo;
3109 3087
3110 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == 3088 if (flags & SLAB_CTOR_CONSTRUCTOR)
3111 SLAB_CTOR_CONSTRUCTOR)
3112 inode_init_once(VFS_I(ni)); 3089 inode_init_once(VFS_I(ni));
3113} 3090}
3114 3091
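
For context on the ntfs/super.c hunks: get_nr_free_clusters() walks the cluster bitmap and subtracts the set bits (allocated clusters) from the total, treating unreadable pages pessimistically as fully allocated (nr_free -= PAGE_CACHE_SIZE * 8). A compilable miniature of the counting step, using the GCC/Clang builtin where the kernel uses its hamming-weight helpers:

#include <stdio.h>
#include <stdint.h>

/* Every set bit is an allocated cluster, so
 * free = total - popcount(bitmap), 32 bits at a time. */
static int64_t count_free(const uint32_t *bitmap, size_t words,
			  int64_t total_clusters)
{
	int64_t nr_free = total_clusters;
	for (size_t i = 0; i < words; i++)
		nr_free -= __builtin_popcount(bitmap[i]);
	return nr_free;
}

int main(void)
{
	uint32_t bmp[2] = { 0xFFu, 0x1u };	/* 9 clusters in use */
	printf("free = %lld\n", (long long)count_free(bmp, 2, 64));	/* 55 */
	return 0;
}
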
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index f27e5378caf2..19712a7d145f 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -27,6 +27,7 @@
27#include <linux/types.h> 27#include <linux/types.h>
28#include <linux/slab.h> 28#include <linux/slab.h>
29#include <linux/highmem.h> 29#include <linux/highmem.h>
30#include <linux/swap.h>
30 31
31#define MLOG_MASK_PREFIX ML_DISK_ALLOC 32#define MLOG_MASK_PREFIX ML_DISK_ALLOC
32#include <cluster/masklog.h> 33#include <cluster/masklog.h>
@@ -34,6 +35,7 @@
34#include "ocfs2.h" 35#include "ocfs2.h"
35 36
36#include "alloc.h" 37#include "alloc.h"
38#include "aops.h"
37#include "dlmglue.h" 39#include "dlmglue.h"
38#include "extent_map.h" 40#include "extent_map.h"
39#include "inode.h" 41#include "inode.h"
@@ -47,63 +49,243 @@
47 49
48#include "buffer_head_io.h" 50#include "buffer_head_io.h"
49 51
50static int ocfs2_extent_contig(struct inode *inode, 52static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
51 struct ocfs2_extent_rec *ext,
52 u64 blkno);
53 53
54static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb, 54/*
55 handle_t *handle, 55 * Structures which describe a path through a btree, and functions to
56 struct inode *inode, 56 * manipulate them.
57 int wanted, 57 *
58 struct ocfs2_alloc_context *meta_ac, 58 * The idea here is to be as generic as possible with the tree
59 struct buffer_head *bhs[]); 59 * manipulation code.
60 */
61struct ocfs2_path_item {
62 struct buffer_head *bh;
63 struct ocfs2_extent_list *el;
64};
60 65
61static int ocfs2_add_branch(struct ocfs2_super *osb, 66#define OCFS2_MAX_PATH_DEPTH 5
62 handle_t *handle,
63 struct inode *inode,
64 struct buffer_head *fe_bh,
65 struct buffer_head *eb_bh,
66 struct buffer_head *last_eb_bh,
67 struct ocfs2_alloc_context *meta_ac);
68 67
69static int ocfs2_shift_tree_depth(struct ocfs2_super *osb, 68struct ocfs2_path {
70 handle_t *handle, 69 int p_tree_depth;
71 struct inode *inode, 70 struct ocfs2_path_item p_node[OCFS2_MAX_PATH_DEPTH];
72 struct buffer_head *fe_bh, 71};
73 struct ocfs2_alloc_context *meta_ac,
74 struct buffer_head **ret_new_eb_bh);
75 72
76static int ocfs2_do_insert_extent(struct ocfs2_super *osb, 73#define path_root_bh(_path) ((_path)->p_node[0].bh)
77 handle_t *handle, 74#define path_root_el(_path) ((_path)->p_node[0].el)
78 struct inode *inode, 75#define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh)
79 struct buffer_head *fe_bh, 76#define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el)
80 u64 blkno, 77#define path_num_items(_path) ((_path)->p_tree_depth + 1)
81 u32 new_clusters);
82 78
83static int ocfs2_find_branch_target(struct ocfs2_super *osb, 79/*
84 struct inode *inode, 80 * Reset the actual path elements so that we can re-use the structure
85 struct buffer_head *fe_bh, 81 * to build another path. Generally, this involves freeing the buffer
86 struct buffer_head **target_bh); 82 * heads.
83 */
84static void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root)
85{
86 int i, start = 0, depth = 0;
87 struct ocfs2_path_item *node;
87 88
88static int ocfs2_find_new_last_ext_blk(struct ocfs2_super *osb, 89 if (keep_root)
89 struct inode *inode, 90 start = 1;
90 struct ocfs2_dinode *fe, 91
91 unsigned int new_i_clusters, 92 for(i = start; i < path_num_items(path); i++) {
92 struct buffer_head *old_last_eb, 93 node = &path->p_node[i];
93 struct buffer_head **new_last_eb); 94
95 brelse(node->bh);
96 node->bh = NULL;
97 node->el = NULL;
98 }
99
100 /*
101 * Tree depth may change during truncate, or insert. If we're
102 * keeping the root extent list, then make sure that our path
103 * structure reflects the proper depth.
104 */
105 if (keep_root)
106 depth = le16_to_cpu(path_root_el(path)->l_tree_depth);
107
108 path->p_tree_depth = depth;
109}
110
111static void ocfs2_free_path(struct ocfs2_path *path)
112{
113 if (path) {
114 ocfs2_reinit_path(path, 0);
115 kfree(path);
116 }
117}
118
119/*
120 * Make the *dest path the same as src and re-initialize src path to
121 * have a root only.
122 */
123static void ocfs2_mv_path(struct ocfs2_path *dest, struct ocfs2_path *src)
124{
125 int i;
126
127 BUG_ON(path_root_bh(dest) != path_root_bh(src));
128
129 for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
130 brelse(dest->p_node[i].bh);
131
132 dest->p_node[i].bh = src->p_node[i].bh;
133 dest->p_node[i].el = src->p_node[i].el;
134
135 src->p_node[i].bh = NULL;
136 src->p_node[i].el = NULL;
137 }
138}
139
140/*
141 * Insert an extent block at given index.
142 *
143 * This will not take an additional reference on eb_bh.
144 */
145static inline void ocfs2_path_insert_eb(struct ocfs2_path *path, int index,
146 struct buffer_head *eb_bh)
147{
148 struct ocfs2_extent_block *eb = (struct ocfs2_extent_block *)eb_bh->b_data;
149
150 /*
151 * Right now, no root bh is an extent block, so this helps
152 * catch code errors with dinode trees. The assertion can be
153 * safely removed if we ever need to insert extent block
154 * structures at the root.
155 */
156 BUG_ON(index == 0);
157
158 path->p_node[index].bh = eb_bh;
159 path->p_node[index].el = &eb->h_list;
160}
161
162static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh,
163 struct ocfs2_extent_list *root_el)
164{
165 struct ocfs2_path *path;
166
167 BUG_ON(le16_to_cpu(root_el->l_tree_depth) >= OCFS2_MAX_PATH_DEPTH);
168
169 path = kzalloc(sizeof(*path), GFP_NOFS);
170 if (path) {
171 path->p_tree_depth = le16_to_cpu(root_el->l_tree_depth);
172 get_bh(root_bh);
173 path_root_bh(path) = root_bh;
174 path_root_el(path) = root_el;
175 }
176
177 return path;
178}
179
180/*
181 * Allocate and initialize a new path based on a disk inode tree.
182 */
183static struct ocfs2_path *ocfs2_new_inode_path(struct buffer_head *di_bh)
184{
185 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
186 struct ocfs2_extent_list *el = &di->id2.i_list;
187
188 return ocfs2_new_path(di_bh, el);
189}
190
191/*
192 * Convenience function to journal all components in a path.
193 */
194static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle,
195 struct ocfs2_path *path)
196{
197 int i, ret = 0;
198
199 if (!path)
200 goto out;
201
202 for(i = 0; i < path_num_items(path); i++) {
203 ret = ocfs2_journal_access(handle, inode, path->p_node[i].bh,
204 OCFS2_JOURNAL_ACCESS_WRITE);
205 if (ret < 0) {
206 mlog_errno(ret);
207 goto out;
208 }
209 }
210
211out:
212 return ret;
213}
214
215enum ocfs2_contig_type {
216 CONTIG_NONE = 0,
217 CONTIG_LEFT,
218 CONTIG_RIGHT
219};
94 220
95static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
96 221
97static int ocfs2_extent_contig(struct inode *inode, 222/*
98 struct ocfs2_extent_rec *ext, 223 * NOTE: ocfs2_block_extent_contig(), ocfs2_extents_adjacent() and
 99 u64 blkno) 224 * ocfs2_extent_contig() only work properly against leaf nodes!
225 */
226static int ocfs2_block_extent_contig(struct super_block *sb,
227 struct ocfs2_extent_rec *ext,
228 u64 blkno)
229{
230 u64 blk_end = le64_to_cpu(ext->e_blkno);
231
232 blk_end += ocfs2_clusters_to_blocks(sb,
233 le16_to_cpu(ext->e_leaf_clusters));
234
235 return blkno == blk_end;
236}
237
238static int ocfs2_extents_adjacent(struct ocfs2_extent_rec *left,
239 struct ocfs2_extent_rec *right)
240{
241 u32 left_range;
242
243 left_range = le32_to_cpu(left->e_cpos) +
244 le16_to_cpu(left->e_leaf_clusters);
245
246 return (left_range == le32_to_cpu(right->e_cpos));
247}
248
249static enum ocfs2_contig_type
250 ocfs2_extent_contig(struct inode *inode,
251 struct ocfs2_extent_rec *ext,
252 struct ocfs2_extent_rec *insert_rec)
100{ 253{
101 return blkno == (le64_to_cpu(ext->e_blkno) + 254 u64 blkno = le64_to_cpu(insert_rec->e_blkno);
102 ocfs2_clusters_to_blocks(inode->i_sb, 255
103 le32_to_cpu(ext->e_clusters))); 256 if (ocfs2_extents_adjacent(ext, insert_rec) &&
257 ocfs2_block_extent_contig(inode->i_sb, ext, blkno))
258 return CONTIG_RIGHT;
259
260 blkno = le64_to_cpu(ext->e_blkno);
261 if (ocfs2_extents_adjacent(insert_rec, ext) &&
262 ocfs2_block_extent_contig(inode->i_sb, insert_rec, blkno))
263 return CONTIG_LEFT;
264
265 return CONTIG_NONE;
104} 266}
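
A worked example of the classification above (editor's illustration; the numbers assume 8 blocks per cluster): with ext = { e_cpos 10, e_leaf_clusters 2, e_blkno 800 } and insert_rec = { e_cpos 12, e_leaf_clusters 1, e_blkno 816 }, the insert begins at logical cluster 10 + 2 and at block 800 + 2 * 8, so ocfs2_extents_adjacent() and ocfs2_block_extent_contig() both succeed and ocfs2_extent_contig() returns CONTIG_RIGHT. Swapping the roles of the two records yields CONTIG_LEFT. Logical adjacency alone is not enough; the physical block ranges must also touch, which is why both tests are required.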
105 267
106/* 268/*
269 * NOTE: We can have pretty much any combination of contiguousness and
270 * appending.
271 *
272 * The usefulness of APPEND_TAIL is more in that it lets us know that
273 * we'll have to update the path to that leaf.
274 */
275enum ocfs2_append_type {
276 APPEND_NONE = 0,
277 APPEND_TAIL,
278};
279
280struct ocfs2_insert_type {
281 enum ocfs2_append_type ins_appending;
282 enum ocfs2_contig_type ins_contig;
283 int ins_contig_index;
284 int ins_free_records;
285 int ins_tree_depth;
286};
287
288/*
107 * How many free extents have we got before we need more meta data? 289 * How many free extents have we got before we need more meta data?
108 */ 290 */
109int ocfs2_num_free_extents(struct ocfs2_super *osb, 291int ocfs2_num_free_extents(struct ocfs2_super *osb,
@@ -242,6 +424,28 @@ bail:
242} 424}
243 425
244/* 426/*
427 * Helper function for ocfs2_add_branch() and ocfs2_shift_tree_depth().
428 *
429 * Returns the sum of the rightmost extent rec logical offset and
430 * cluster count.
431 *
432 * ocfs2_add_branch() uses this to determine what logical cluster
433 * value should be populated into the leftmost new branch records.
434 *
435 * ocfs2_shift_tree_depth() uses this to determine the # clusters
436 * value for the new topmost tree record.
437 */
438static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list *el)
439{
440 int i;
441
442 i = le16_to_cpu(el->l_next_free_rec) - 1;
443
444 return le32_to_cpu(el->l_recs[i].e_cpos) +
445 ocfs2_rec_clusters(el, &el->l_recs[i]);
446}
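
Worked example (editor's illustration): if the rightmost used record is { e_cpos = 100, 25 clusters }, ocfs2_sum_rightmost_rec() returns 125, the first logical cluster the list does not yet cover. That is exactly the e_cpos stamped into the empty records of a new branch, and, when computed over the copied root list in ocfs2_shift_tree_depth(), the e_int_clusters value for the new topmost record.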
447
448/*
245 * Add an entire tree branch to our inode. eb_bh is the extent block 449 * Add an entire tree branch to our inode. eb_bh is the extent block
246 * to start at, if we don't want to start the branch at the dinode 450 * to start at, if we don't want to start the branch at the dinode
247 * structure. 451 * structure.
@@ -250,7 +454,7 @@ bail:
250 * for the new last extent block. 454 * for the new last extent block.
251 * 455 *
252 * the new branch will be 'empty' in the sense that every block will 456 * the new branch will be 'empty' in the sense that every block will
253 * contain a single record with e_clusters == 0.
457 * contain a single record with cluster count == 0.
254 */ 458 */
255static int ocfs2_add_branch(struct ocfs2_super *osb, 459static int ocfs2_add_branch(struct ocfs2_super *osb,
256 handle_t *handle, 460 handle_t *handle,
@@ -268,6 +472,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
268 struct ocfs2_extent_block *eb; 472 struct ocfs2_extent_block *eb;
269 struct ocfs2_extent_list *eb_el; 473 struct ocfs2_extent_list *eb_el;
270 struct ocfs2_extent_list *el; 474 struct ocfs2_extent_list *el;
475 u32 new_cpos;
271 476
272 mlog_entry_void(); 477 mlog_entry_void();
273 478
@@ -302,6 +507,9 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
302 goto bail; 507 goto bail;
303 } 508 }
304 509
510 eb = (struct ocfs2_extent_block *)last_eb_bh->b_data;
511 new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list);
512
305 /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be 513 /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
306 * linked with the rest of the tree. 514 * linked with the rest of the tree.
307 * conversely, new_eb_bhs[0] is the new bottommost leaf. 515 * conversely, new_eb_bhs[0] is the new bottommost leaf.
@@ -330,9 +538,18 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
330 eb->h_next_leaf_blk = 0; 538 eb->h_next_leaf_blk = 0;
331 eb_el->l_tree_depth = cpu_to_le16(i); 539 eb_el->l_tree_depth = cpu_to_le16(i);
332 eb_el->l_next_free_rec = cpu_to_le16(1); 540 eb_el->l_next_free_rec = cpu_to_le16(1);
333 eb_el->l_recs[0].e_cpos = fe->i_clusters;
541 /*
542 * This actually counts as an empty extent as
543 * the cluster count == 0
544 */
545 eb_el->l_recs[0].e_cpos = cpu_to_le32(new_cpos);
334 eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno);
546 eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno);
335 eb_el->l_recs[0].e_clusters = cpu_to_le32(0);
547 /*
548 * eb_el isn't always an interior node, but even leaf
549 * nodes want a zeroed flags and reserved field so
550 * this gets the whole 32 bits regardless of use.
551 */
552 eb_el->l_recs[0].e_int_clusters = cpu_to_le32(0);
336 if (!eb_el->l_tree_depth) 553 if (!eb_el->l_tree_depth)
337 new_last_eb_blk = le64_to_cpu(eb->h_blkno); 554 new_last_eb_blk = le64_to_cpu(eb->h_blkno);
338 555
@@ -376,8 +593,8 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
376 * either be on the fe, or the extent block passed in. */ 593 * either be on the fe, or the extent block passed in. */
377 i = le16_to_cpu(el->l_next_free_rec); 594 i = le16_to_cpu(el->l_next_free_rec);
378 el->l_recs[i].e_blkno = cpu_to_le64(next_blkno); 595 el->l_recs[i].e_blkno = cpu_to_le64(next_blkno);
379 el->l_recs[i].e_cpos = fe->i_clusters;
380 el->l_recs[i].e_clusters = 0;
596 el->l_recs[i].e_cpos = cpu_to_le32(new_cpos);
597 el->l_recs[i].e_int_clusters = 0;
381 le16_add_cpu(&el->l_next_free_rec, 1); 598 le16_add_cpu(&el->l_next_free_rec, 1);
382 599
383 /* fe needs a new last extent block pointer, as does the 600 /* fe needs a new last extent block pointer, as does the
@@ -425,6 +642,7 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
425 struct buffer_head **ret_new_eb_bh) 642 struct buffer_head **ret_new_eb_bh)
426{ 643{
427 int status, i; 644 int status, i;
645 u32 new_clusters;
428 struct buffer_head *new_eb_bh = NULL; 646 struct buffer_head *new_eb_bh = NULL;
429 struct ocfs2_dinode *fe; 647 struct ocfs2_dinode *fe;
430 struct ocfs2_extent_block *eb; 648 struct ocfs2_extent_block *eb;
@@ -461,11 +679,8 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
461 /* copy the fe data into the new extent block */ 679 /* copy the fe data into the new extent block */
462 eb_el->l_tree_depth = fe_el->l_tree_depth; 680 eb_el->l_tree_depth = fe_el->l_tree_depth;
463 eb_el->l_next_free_rec = fe_el->l_next_free_rec; 681 eb_el->l_next_free_rec = fe_el->l_next_free_rec;
464 for(i = 0; i < le16_to_cpu(fe_el->l_next_free_rec); i++) {
465 eb_el->l_recs[i].e_cpos = fe_el->l_recs[i].e_cpos;
466 eb_el->l_recs[i].e_clusters = fe_el->l_recs[i].e_clusters;
467 eb_el->l_recs[i].e_blkno = fe_el->l_recs[i].e_blkno;
468 }
682 for(i = 0; i < le16_to_cpu(fe_el->l_next_free_rec); i++)
683 eb_el->l_recs[i] = fe_el->l_recs[i];
469 684
470 status = ocfs2_journal_dirty(handle, new_eb_bh); 685 status = ocfs2_journal_dirty(handle, new_eb_bh);
471 if (status < 0) { 686 if (status < 0) {
@@ -480,16 +695,15 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
480 goto bail; 695 goto bail;
481 } 696 }
482 697
698 new_clusters = ocfs2_sum_rightmost_rec(eb_el);
699
483 /* update fe now */ 700 /* update fe now */
484 le16_add_cpu(&fe_el->l_tree_depth, 1); 701 le16_add_cpu(&fe_el->l_tree_depth, 1);
485 fe_el->l_recs[0].e_cpos = 0; 702 fe_el->l_recs[0].e_cpos = 0;
486 fe_el->l_recs[0].e_blkno = eb->h_blkno; 703 fe_el->l_recs[0].e_blkno = eb->h_blkno;
487 fe_el->l_recs[0].e_clusters = fe->i_clusters;
488 for(i = 1; i < le16_to_cpu(fe_el->l_next_free_rec); i++) {
489 fe_el->l_recs[i].e_cpos = 0;
490 fe_el->l_recs[i].e_clusters = 0;
491 fe_el->l_recs[i].e_blkno = 0;
492 }
704 fe_el->l_recs[0].e_int_clusters = cpu_to_le32(new_clusters);
705 for(i = 1; i < le16_to_cpu(fe_el->l_next_free_rec); i++)
706 memset(&fe_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
493 fe_el->l_next_free_rec = cpu_to_le16(1); 707 fe_el->l_next_free_rec = cpu_to_le16(1);
494 708
495 /* If this is our 1st tree depth shift, then last_eb_blk 709 /* If this is our 1st tree depth shift, then last_eb_blk
@@ -515,199 +729,6 @@ bail:
515} 729}
516 730
517/* 731/*
518 * Expects the tree to already have room in the rightmost leaf for the
519 * extent. Updates all the extent blocks (and the dinode) on the way
520 * down.
521 */
522static int ocfs2_do_insert_extent(struct ocfs2_super *osb,
523 handle_t *handle,
524 struct inode *inode,
525 struct buffer_head *fe_bh,
526 u64 start_blk,
527 u32 new_clusters)
528{
529 int status, i, num_bhs = 0;
530 u64 next_blkno;
531 u16 next_free;
532 struct buffer_head **eb_bhs = NULL;
533 struct ocfs2_dinode *fe;
534 struct ocfs2_extent_block *eb;
535 struct ocfs2_extent_list *el;
536
537 mlog_entry_void();
538
539 status = ocfs2_journal_access(handle, inode, fe_bh,
540 OCFS2_JOURNAL_ACCESS_WRITE);
541 if (status < 0) {
542 mlog_errno(status);
543 goto bail;
544 }
545
546 fe = (struct ocfs2_dinode *) fe_bh->b_data;
547 el = &fe->id2.i_list;
548 if (el->l_tree_depth) {
549 /* This is another operation where we want to be
550 * careful about our tree updates. An error here means
551 * none of the previous changes we made should roll
552 * forward. As a result, we have to record the buffers
553 * for this part of the tree in an array and reserve a
554 * journal write to them before making any changes. */
555 num_bhs = le16_to_cpu(fe->id2.i_list.l_tree_depth);
556 eb_bhs = kcalloc(num_bhs, sizeof(struct buffer_head *),
557 GFP_KERNEL);
558 if (!eb_bhs) {
559 status = -ENOMEM;
560 mlog_errno(status);
561 goto bail;
562 }
563
564 i = 0;
565 while(el->l_tree_depth) {
566 next_free = le16_to_cpu(el->l_next_free_rec);
567 if (next_free == 0) {
568 ocfs2_error(inode->i_sb,
569 "Dinode %llu has a bad extent list",
570 (unsigned long long)OCFS2_I(inode)->ip_blkno);
571 status = -EIO;
572 goto bail;
573 }
574 next_blkno = le64_to_cpu(el->l_recs[next_free - 1].e_blkno);
575
576 BUG_ON(i >= num_bhs);
577 status = ocfs2_read_block(osb, next_blkno, &eb_bhs[i],
578 OCFS2_BH_CACHED, inode);
579 if (status < 0) {
580 mlog_errno(status);
581 goto bail;
582 }
583 eb = (struct ocfs2_extent_block *) eb_bhs[i]->b_data;
584 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
585 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb,
586 eb);
587 status = -EIO;
588 goto bail;
589 }
590
591 status = ocfs2_journal_access(handle, inode, eb_bhs[i],
592 OCFS2_JOURNAL_ACCESS_WRITE);
593 if (status < 0) {
594 mlog_errno(status);
595 goto bail;
596 }
597
598 el = &eb->h_list;
599 i++;
600 /* When we leave this loop, eb_bhs[num_bhs - 1] will
601 * hold the bottom-most leaf extent block. */
602 }
603 BUG_ON(el->l_tree_depth);
604
605 el = &fe->id2.i_list;
606 /* If we have tree depth, then the fe update is
607 * trivial, and we want to switch el out for the
608 * bottom-most leaf in order to update it with the
609 * actual extent data below. */
610 next_free = le16_to_cpu(el->l_next_free_rec);
611 if (next_free == 0) {
612 ocfs2_error(inode->i_sb,
613 "Dinode %llu has a bad extent list",
614 (unsigned long long)OCFS2_I(inode)->ip_blkno);
615 status = -EIO;
616 goto bail;
617 }
618 le32_add_cpu(&el->l_recs[next_free - 1].e_clusters,
619 new_clusters);
620 /* (num_bhs - 1) to avoid the leaf */
621 for(i = 0; i < (num_bhs - 1); i++) {
622 eb = (struct ocfs2_extent_block *) eb_bhs[i]->b_data;
623 el = &eb->h_list;
624
625 /* finally, make our actual change to the
626 * intermediate extent blocks. */
627 next_free = le16_to_cpu(el->l_next_free_rec);
628 le32_add_cpu(&el->l_recs[next_free - 1].e_clusters,
629 new_clusters);
630
631 status = ocfs2_journal_dirty(handle, eb_bhs[i]);
632 if (status < 0)
633 mlog_errno(status);
634 }
635 BUG_ON(i != (num_bhs - 1));
636 /* note that the leaf block wasn't touched in
637 * the loop above */
638 eb = (struct ocfs2_extent_block *) eb_bhs[num_bhs - 1]->b_data;
639 el = &eb->h_list;
640 BUG_ON(el->l_tree_depth);
641 }
642
643 /* yay, we can finally add the actual extent now! */
644 i = le16_to_cpu(el->l_next_free_rec) - 1;
645 if (le16_to_cpu(el->l_next_free_rec) &&
646 ocfs2_extent_contig(inode, &el->l_recs[i], start_blk)) {
647 le32_add_cpu(&el->l_recs[i].e_clusters, new_clusters);
648 } else if (le16_to_cpu(el->l_next_free_rec) &&
649 (le32_to_cpu(el->l_recs[i].e_clusters) == 0)) {
650 /* having an empty extent at eof is legal. */
651 if (el->l_recs[i].e_cpos != fe->i_clusters) {
652 ocfs2_error(inode->i_sb,
653 "Dinode %llu trailing extent is bad: "
654 "cpos (%u) != number of clusters (%u)",
655 (unsigned long long)OCFS2_I(inode)->ip_blkno,
656 le32_to_cpu(el->l_recs[i].e_cpos),
657 le32_to_cpu(fe->i_clusters));
658 status = -EIO;
659 goto bail;
660 }
661 el->l_recs[i].e_blkno = cpu_to_le64(start_blk);
662 el->l_recs[i].e_clusters = cpu_to_le32(new_clusters);
663 } else {
664 /* No contiguous record, or no empty record at eof, so
665 * we add a new one. */
666
667 BUG_ON(le16_to_cpu(el->l_next_free_rec) >=
668 le16_to_cpu(el->l_count));
669 i = le16_to_cpu(el->l_next_free_rec);
670
671 el->l_recs[i].e_blkno = cpu_to_le64(start_blk);
672 el->l_recs[i].e_clusters = cpu_to_le32(new_clusters);
673 el->l_recs[i].e_cpos = fe->i_clusters;
674 le16_add_cpu(&el->l_next_free_rec, 1);
675 }
676
677 /*
678 * extent_map errors are not fatal, so they are ignored outside
679 * of flushing the thing.
680 */
681 status = ocfs2_extent_map_append(inode, &el->l_recs[i],
682 new_clusters);
683 if (status) {
684 mlog_errno(status);
685 ocfs2_extent_map_drop(inode, le32_to_cpu(fe->i_clusters));
686 }
687
688 status = ocfs2_journal_dirty(handle, fe_bh);
689 if (status < 0)
690 mlog_errno(status);
691 if (fe->id2.i_list.l_tree_depth) {
692 status = ocfs2_journal_dirty(handle, eb_bhs[num_bhs - 1]);
693 if (status < 0)
694 mlog_errno(status);
695 }
696
697 status = 0;
698bail:
699 if (eb_bhs) {
700 for (i = 0; i < num_bhs; i++)
701 if (eb_bhs[i])
702 brelse(eb_bhs[i]);
703 kfree(eb_bhs);
704 }
705
706 mlog_exit(status);
707 return status;
708}
709
710/*
711 * Should only be called when there is no space left in any of the 732 * Should only be called when there is no space left in any of the
712 * leaf nodes. What we want to do is find the lowest tree depth 733 * leaf nodes. What we want to do is find the lowest tree depth
713 * non-leaf extent block with room for new records. There are three 734 * non-leaf extent block with room for new records. There are three
@@ -807,53 +828,1548 @@ bail:
807 return status; 828 return status;
808} 829}
809 830
810/* the caller needs to update fe->i_clusters */
811int ocfs2_insert_extent(struct ocfs2_super *osb,
812 			handle_t *handle,
813 			struct inode *inode,
814 			struct buffer_head *fe_bh,
815 			u64 start_blk,
816 			u32 new_clusters,
817 			struct ocfs2_alloc_context *meta_ac)
818{
819 	int status, i, shift;
820 	struct buffer_head *last_eb_bh = NULL;
831/*
832 * This is only valid for leaf nodes, which are the only ones that can
833 * have empty extents anyway.
834 */
835static inline int ocfs2_is_empty_extent(struct ocfs2_extent_rec *rec)
836{
837 	return !rec->e_leaf_clusters;
838}
839
840/*
841 * This function will discard the rightmost extent record.
842 */
843static void ocfs2_shift_records_right(struct ocfs2_extent_list *el)
844{
845 int next_free = le16_to_cpu(el->l_next_free_rec);
846 int count = le16_to_cpu(el->l_count);
847 unsigned int num_bytes;
848
849 BUG_ON(!next_free);
850 /* This will cause us to go off the end of our extent list. */
851 BUG_ON(next_free >= count);
852
853 num_bytes = sizeof(struct ocfs2_extent_rec) * next_free;
854
855 memmove(&el->l_recs[1], &el->l_recs[0], num_bytes);
856}
857
858static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el,
859 struct ocfs2_extent_rec *insert_rec)
860{
861 int i, insert_index, next_free, has_empty, num_bytes;
862 u32 insert_cpos = le32_to_cpu(insert_rec->e_cpos);
863 struct ocfs2_extent_rec *rec;
864
865 next_free = le16_to_cpu(el->l_next_free_rec);
866 has_empty = ocfs2_is_empty_extent(&el->l_recs[0]);
867
868 BUG_ON(!next_free);
869
870 /* The tree code before us didn't allow enough room in the leaf. */
871 if (el->l_next_free_rec == el->l_count && !has_empty)
872 BUG();
873
874 /*
875 * The easiest way to approach this is to just remove the
876 * empty extent and temporarily decrement next_free.
877 */
878 if (has_empty) {
879 /*
880 * If next_free was 1 (only an empty extent), this
881 * loop won't execute, which is fine. We still want
882 * the decrement above to happen.
883 */
884 for(i = 0; i < (next_free - 1); i++)
885 el->l_recs[i] = el->l_recs[i+1];
886
887 next_free--;
888 }
889
890 /*
891 * Figure out what the new record index should be.
892 */
893 for(i = 0; i < next_free; i++) {
894 rec = &el->l_recs[i];
895
896 if (insert_cpos < le32_to_cpu(rec->e_cpos))
897 break;
898 }
899 insert_index = i;
900
901 mlog(0, "ins %u: index %d, has_empty %d, next_free %d, count %d\n",
902 insert_cpos, insert_index, has_empty, next_free, le16_to_cpu(el->l_count));
903
904 BUG_ON(insert_index < 0);
905 BUG_ON(insert_index >= le16_to_cpu(el->l_count));
906 BUG_ON(insert_index > next_free);
907
908 /*
909 * No need to memmove if we're just adding to the tail.
910 */
911 if (insert_index != next_free) {
912 BUG_ON(next_free >= le16_to_cpu(el->l_count));
913
914 num_bytes = next_free - insert_index;
915 num_bytes *= sizeof(struct ocfs2_extent_rec);
916 memmove(&el->l_recs[insert_index + 1],
917 &el->l_recs[insert_index],
918 num_bytes);
919 }
920
921 /*
922 * Either we had an empty extent, and need to re-increment or
923 * there was no empty extent on a non-full rightmost leaf node,
924 * in which case we still need to increment.
925 */
926 next_free++;
927 el->l_next_free_rec = cpu_to_le16(next_free);
928 /*
929 * Make sure none of the math above just messed up our tree.
930 */
931 BUG_ON(le16_to_cpu(el->l_next_free_rec) > le16_to_cpu(el->l_count));
932
933 el->l_recs[insert_index] = *insert_rec;
934
935}
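
To make the rotation concrete (editor's illustration): a leaf holding [ empty, (cpos 4, 4 clusters), (cpos 12, 4 clusters) ] that receives an insert at cpos 8 first drops the empty slot to get [ (4,4), (12,4) ], computes insert_index = 1, memmoves (12,4) one slot right, and ends up as [ (4,4), (8,...), (12,4) ] with l_next_free_rec back at 3. The BUG_ON()s above guarantee the memmove can never run past l_count.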
936
937/*
938 * Create an empty extent record.
939 *
940 * l_next_free_rec may be updated.
941 *
942 * If an empty extent already exists do nothing.
943 */
944static void ocfs2_create_empty_extent(struct ocfs2_extent_list *el)
945{
946 int next_free = le16_to_cpu(el->l_next_free_rec);
947
948 BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
949
950 if (next_free == 0)
951 goto set_and_inc;
952
953 if (ocfs2_is_empty_extent(&el->l_recs[0]))
954 return;
955
956 mlog_bug_on_msg(el->l_count == el->l_next_free_rec,
957 "Asked to create an empty extent in a full list:\n"
958 "count = %u, tree depth = %u",
959 le16_to_cpu(el->l_count),
960 le16_to_cpu(el->l_tree_depth));
961
962 ocfs2_shift_records_right(el);
963
964set_and_inc:
965 le16_add_cpu(&el->l_next_free_rec, 1);
966 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
967}
968
969/*
970 * For a rotation which involves two leaf nodes, the "root node" is
971 * the lowest level tree node which contains a path to both leaves. This
972 * resulting set of information can be used to form a complete "subtree"
973 *
974 * This function is passed two full paths from the dinode down to a
975 * pair of adjacent leaves. Its task is to figure out which path
976 * index contains the subtree root - this can be the root index itself
977 * in a worst-case rotation.
978 *
979 * The array index of the subtree root is passed back.
980 */
981static int ocfs2_find_subtree_root(struct inode *inode,
982 struct ocfs2_path *left,
983 struct ocfs2_path *right)
984{
985 int i = 0;
986
987 /*
988 * Check that the caller passed in two paths from the same tree.
989 */
990 BUG_ON(path_root_bh(left) != path_root_bh(right));
991
992 do {
993 i++;
994
995 /*
996 * The caller didn't pass two adjacent paths.
997 */
998 mlog_bug_on_msg(i > left->p_tree_depth,
999 "Inode %lu, left depth %u, right depth %u\n"
1000 "left leaf blk %llu, right leaf blk %llu\n",
1001 inode->i_ino, left->p_tree_depth,
1002 right->p_tree_depth,
1003 (unsigned long long)path_leaf_bh(left)->b_blocknr,
1004 (unsigned long long)path_leaf_bh(right)->b_blocknr);
1005 } while (left->p_node[i].bh->b_blocknr ==
1006 right->p_node[i].bh->b_blocknr);
1007
1008 return i - 1;
1009}
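
Example (editor's illustration): in a depth-3 tree where the two paths share the root at index 0 and the same extent block at index 1 but diverge at index 2, the loop breaks with i == 2 and the function reports index 1, the lowest node through which both leaves are reachable.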
1010
1011typedef void (path_insert_t)(void *, struct buffer_head *);
1012
1013/*
1014 * Traverse a btree path in search of cpos, starting at root_el.
1015 *
1016 * This code can be called with a cpos larger than the tree, in which
1017 * case it will return the rightmost path.
1018 */
1019static int __ocfs2_find_path(struct inode *inode,
1020 struct ocfs2_extent_list *root_el, u32 cpos,
1021 path_insert_t *func, void *data)
1022{
1023 int i, ret = 0;
1024 u32 range;
1025 u64 blkno;
821 struct buffer_head *bh = NULL;
822 struct ocfs2_dinode *fe;
823 struct ocfs2_extent_block *eb;
824 struct ocfs2_extent_list *el;
825 
826 mlog_entry_void();
827 
828 mlog(0, "add %u clusters starting at block %llu to inode %llu\n",
829      new_clusters, (unsigned long long)start_blk,
830      (unsigned long long)OCFS2_I(inode)->ip_blkno);
831 
832 fe = (struct ocfs2_dinode *) fe_bh->b_data;
833 el = &fe->id2.i_list;
1026 struct buffer_head *bh = NULL;
1027 struct ocfs2_extent_block *eb;
1028 struct ocfs2_extent_list *el;
1029 struct ocfs2_extent_rec *rec;
1030 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1031 
1032 el = root_el;
1033 while (el->l_tree_depth) {
1034 if (le16_to_cpu(el->l_next_free_rec) == 0) {
1035 ocfs2_error(inode->i_sb,
1036 "Inode %llu has empty extent list at "
1037 "depth %u\n",
1038 (unsigned long long)oi->ip_blkno,
1039 le16_to_cpu(el->l_tree_depth));
1040 ret = -EROFS;
1041 goto out;
1042 
1043 }
1044 
1045 for(i = 0; i < le16_to_cpu(el->l_next_free_rec) - 1; i++) {
1046 rec = &el->l_recs[i];
1047
1048 /*
1049 * In the case that cpos is off the allocation
1050 * tree, this should just wind up returning the
1051 * rightmost record.
1052 */
1053 range = le32_to_cpu(rec->e_cpos) +
1054 ocfs2_rec_clusters(el, rec);
1055 if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
1056 break;
1057 }
834 1058
835 if (el->l_tree_depth) {
836 /* jump to end of tree */
837 status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
838 &last_eb_bh, OCFS2_BH_CACHED, inode);
839 if (status < 0) {
840 mlog_exit(status);
841 goto bail;
842 }
843 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
844 el = &eb->h_list;
1059 blkno = le64_to_cpu(el->l_recs[i].e_blkno);
1060 if (blkno == 0) {
1061 ocfs2_error(inode->i_sb,
1062 "Inode %llu has bad blkno in extent list "
1063 "at depth %u (index %d)\n",
1064 (unsigned long long)oi->ip_blkno,
1065 le16_to_cpu(el->l_tree_depth), i);
1066 ret = -EROFS;
1067 goto out;
1068 }
1069 
1070 brelse(bh);
1071 bh = NULL;
1072 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno,
1073 &bh, OCFS2_BH_CACHED, inode);
1074 if (ret) {
1075 mlog_errno(ret);
1076 goto out;
1077 }
1078 
1079 eb = (struct ocfs2_extent_block *) bh->b_data;
1080 el = &eb->h_list;
1081 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
1082 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
1083 ret = -EIO;
1084 goto out;
1085 }
1086
1087 if (le16_to_cpu(el->l_next_free_rec) >
1088 le16_to_cpu(el->l_count)) {
1089 ocfs2_error(inode->i_sb,
1090 "Inode %llu has bad count in extent list "
1091 "at block %llu (next free=%u, count=%u)\n",
1092 (unsigned long long)oi->ip_blkno,
1093 (unsigned long long)bh->b_blocknr,
1094 le16_to_cpu(el->l_next_free_rec),
1095 le16_to_cpu(el->l_count));
1096 ret = -EROFS;
1097 goto out;
1098 }
1099
1100 if (func)
1101 func(data, bh);
1102 }
1103
1104out:
1105 /*
1106 * Catch any trailing bh that the loop didn't handle.
1107 */
1108 brelse(bh);
1109
1110 return ret;
1111}
1112
1113/*
1114 * Given an initialized path (that is, it has a valid root extent
1115 * list), this function will traverse the btree in search of the path
1116 * which would contain cpos.
1117 *
1118 * The path traveled is recorded in the path structure.
1119 *
1120 * Note that this will not do any comparisons on leaf node extent
1121 * records, so it will work fine in the case that we just added a tree
1122 * branch.
1123 */
1124struct find_path_data {
1125 int index;
1126 struct ocfs2_path *path;
1127};
1128static void find_path_ins(void *data, struct buffer_head *bh)
1129{
1130 struct find_path_data *fp = data;
1131
1132 get_bh(bh);
1133 ocfs2_path_insert_eb(fp->path, fp->index, bh);
1134 fp->index++;
1135}
1136static int ocfs2_find_path(struct inode *inode, struct ocfs2_path *path,
1137 u32 cpos)
1138{
1139 struct find_path_data data;
1140
1141 data.index = 1;
1142 data.path = path;
1143 return __ocfs2_find_path(inode, path_root_el(path), cpos,
1144 find_path_ins, &data);
1145}
1146
1147static void find_leaf_ins(void *data, struct buffer_head *bh)
1148{
1149 struct ocfs2_extent_block *eb = (struct ocfs2_extent_block *)bh->b_data;
1150 struct ocfs2_extent_list *el = &eb->h_list;
1151 struct buffer_head **ret = data;
1152
1153 /* We want to retain only the leaf block. */
1154 if (le16_to_cpu(el->l_tree_depth) == 0) {
1155 get_bh(bh);
1156 *ret = bh;
1157 }
1158}
1159/*
1160 * Find the leaf block in the tree which would contain cpos. No
1161 * checking of the actual leaf is done.
1162 *
1163 * Some paths want to call this instead of allocating a path structure
1164 * and calling ocfs2_find_path().
1165 *
1166 * This function doesn't handle non-btree extent lists.
1167 */
1168int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el,
1169 u32 cpos, struct buffer_head **leaf_bh)
1170{
1171 int ret;
1172 struct buffer_head *bh = NULL;
1173
1174 ret = __ocfs2_find_path(inode, root_el, cpos, find_leaf_ins, &bh);
1175 if (ret) {
1176 mlog_errno(ret);
1177 goto out;
1178 }
1179
1180 *leaf_bh = bh;
1181out:
1182 return ret;
1183}
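
A minimal caller sketch (editor's illustration, not part of this patch; example_count_leaf_recs is a hypothetical name) showing the intended use: grab just the leaf covering cpos, read it, and drop the reference ocfs2_find_leaf() took for us:

static int example_count_leaf_recs(struct inode *inode,
				   struct ocfs2_extent_list *root_el,
				   u32 cpos)
{
	int ret;
	struct buffer_head *leaf_bh = NULL;
	struct ocfs2_extent_block *eb;

	ret = ocfs2_find_leaf(inode, root_el, cpos, &leaf_bh);
	if (ret)
		return ret;

	eb = (struct ocfs2_extent_block *) leaf_bh->b_data;
	ret = le16_to_cpu(eb->h_list.l_next_free_rec);

	brelse(leaf_bh);	/* release the reference taken by find_leaf_ins() */
	return ret;
}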
1184
1185/*
1186 * Adjust the adjacent records (left_rec, right_rec) involved in a rotation.
1187 *
1188 * Basically, we've moved stuff around at the bottom of the tree and
1189 * we need to fix up the extent records above the changes to reflect
1190 * the new changes.
1191 *
1192 * left_rec: the record on the left.
1193 * left_child_el: the child list pointed to by left_rec
1194 * right_rec: the record to the right of left_rec
1195 * right_child_el: the child list pointed to by right_rec
1196 *
1197 * By definition, this only works on interior nodes.
1198 */
1199static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec,
1200 struct ocfs2_extent_list *left_child_el,
1201 struct ocfs2_extent_rec *right_rec,
1202 struct ocfs2_extent_list *right_child_el)
1203{
1204 u32 left_clusters, right_end;
1205
1206 /*
1207 * Interior nodes never have holes. Their cpos is the cpos of
1208 * the leftmost record in their child list. Their cluster
1209 * count covers the full theoretical range of their child list
1210 * - the range between their cpos and the cpos of the record
1211 * immediately to their right.
1212 */
1213 left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos);
1214 left_clusters -= le32_to_cpu(left_rec->e_cpos);
1215 left_rec->e_int_clusters = cpu_to_le32(left_clusters);
1216
1217 /*
1218 * Calculate the rightmost cluster count boundary before
1219 * moving cpos - we will need to adjust clusters after
1220 * updating e_cpos to keep the same highest cluster count.
1221 */
1222 right_end = le32_to_cpu(right_rec->e_cpos);
1223 right_end += le32_to_cpu(right_rec->e_int_clusters);
1224
1225 right_rec->e_cpos = left_rec->e_cpos;
1226 le32_add_cpu(&right_rec->e_cpos, left_clusters);
1227
1228 right_end -= le32_to_cpu(right_rec->e_cpos);
1229 right_rec->e_int_clusters = cpu_to_le32(right_end);
1230}
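
Worked example (editor's illustration): take left_rec = { e_cpos 0, e_int_clusters 100 } and right_rec = { e_cpos 100, e_int_clusters 50 }, and suppose the leaf rotation left right_child_el starting at cpos 80. Then left_clusters = 80 - 0, so left_rec shrinks to 80 clusters; right_rec->e_cpos moves back to 80, and since its old right edge was 100 + 50 = 150, right_rec->e_int_clusters grows to 150 - 80 = 70. The combined 0..150 range stays covered with no hole between the two records.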
1231
1232/*
1233 * Adjust the adjacent root node records involved in a
1234 * rotation. left_el_blkno is passed in as a key so that we can easily
1235 * find its index in the root list.
1236 */
1237static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el,
1238 struct ocfs2_extent_list *left_el,
1239 struct ocfs2_extent_list *right_el,
1240 u64 left_el_blkno)
1241{
1242 int i;
1243
1244 BUG_ON(le16_to_cpu(root_el->l_tree_depth) <=
1245 le16_to_cpu(left_el->l_tree_depth));
1246
1247 for(i = 0; i < le16_to_cpu(root_el->l_next_free_rec) - 1; i++) {
1248 if (le64_to_cpu(root_el->l_recs[i].e_blkno) == left_el_blkno)
1249 break;
1250 }
1251
1252 /*
1253 * The path walking code should have never returned a root and
1254 * two paths which are not adjacent.
1255 */
1256 BUG_ON(i >= (le16_to_cpu(root_el->l_next_free_rec) - 1));
1257
1258 ocfs2_adjust_adjacent_records(&root_el->l_recs[i], left_el,
1259 &root_el->l_recs[i + 1], right_el);
1260}
1261
1262/*
1263 * We've changed a leaf block (in right_path) and need to reflect that
1264 * change back up the subtree.
1265 *
1266 * This happens in multiple places:
1267 * - When we've moved an extent record from the left path leaf to the right
1268 * path leaf to make room for an empty extent in the left path leaf.
1269 * - When our insert into the right path leaf is at the leftmost edge
1270 * and requires an update of the path immediately to its left. This
1271 * can occur at the end of some types of rotation and appending inserts.
1272 */
1273static void ocfs2_complete_edge_insert(struct inode *inode, handle_t *handle,
1274 struct ocfs2_path *left_path,
1275 struct ocfs2_path *right_path,
1276 int subtree_index)
1277{
1278 int ret, i, idx;
1279 struct ocfs2_extent_list *el, *left_el, *right_el;
1280 struct ocfs2_extent_rec *left_rec, *right_rec;
1281 struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
1282
1283 /*
1284 * Update the counts and position values within all the
1285 * interior nodes to reflect the leaf rotation we just did.
1286 *
1287 * The root node is handled below the loop.
1288 *
1289 * We begin the loop with right_el and left_el pointing to the
1290 * leaf lists and work our way up.
1291 *
1292 * NOTE: within this loop, left_el and right_el always refer
1293 * to the *child* lists.
1294 */
1295 left_el = path_leaf_el(left_path);
1296 right_el = path_leaf_el(right_path);
1297 for(i = left_path->p_tree_depth - 1; i > subtree_index; i--) {
1298 mlog(0, "Adjust records at index %u\n", i);
1299
1300 /*
1301 * One nice property of knowing that all of these
1302 * nodes are below the root is that we only deal with
1303 * the leftmost right node record and the rightmost
1304 * left node record.
1305 */
1306 el = left_path->p_node[i].el;
1307 idx = le16_to_cpu(left_el->l_next_free_rec) - 1;
1308 left_rec = &el->l_recs[idx];
1309
1310 el = right_path->p_node[i].el;
1311 right_rec = &el->l_recs[0];
1312
1313 ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec,
1314 right_el);
1315
1316 ret = ocfs2_journal_dirty(handle, left_path->p_node[i].bh);
1317 if (ret)
1318 mlog_errno(ret);
1319
1320 ret = ocfs2_journal_dirty(handle, right_path->p_node[i].bh);
1321 if (ret)
1322 mlog_errno(ret);
1323
1324 /*
1325 * Setup our list pointers now so that the current
1326 * parents become children in the next iteration.
1327 */
1328 left_el = left_path->p_node[i].el;
1329 right_el = right_path->p_node[i].el;
1330 }
1331
1332 /*
1333 * At the root node, adjust the two adjacent records which
1334 * begin our path to the leaves.
1335 */
1336
1337 el = left_path->p_node[subtree_index].el;
1338 left_el = left_path->p_node[subtree_index + 1].el;
1339 right_el = right_path->p_node[subtree_index + 1].el;
1340
1341 ocfs2_adjust_root_records(el, left_el, right_el,
1342 left_path->p_node[subtree_index + 1].bh->b_blocknr);
1343
1344 root_bh = left_path->p_node[subtree_index].bh;
1345
1346 ret = ocfs2_journal_dirty(handle, root_bh);
1347 if (ret)
1348 mlog_errno(ret);
1349}
1350
1351static int ocfs2_rotate_subtree_right(struct inode *inode,
1352 handle_t *handle,
1353 struct ocfs2_path *left_path,
1354 struct ocfs2_path *right_path,
1355 int subtree_index)
1356{
1357 int ret, i;
1358 struct buffer_head *right_leaf_bh;
1359 struct buffer_head *left_leaf_bh = NULL;
1360 struct buffer_head *root_bh;
1361 struct ocfs2_extent_list *right_el, *left_el;
1362 struct ocfs2_extent_rec move_rec;
1363
1364 left_leaf_bh = path_leaf_bh(left_path);
1365 left_el = path_leaf_el(left_path);
1366
1367 if (left_el->l_next_free_rec != left_el->l_count) {
1368 ocfs2_error(inode->i_sb,
1369 "Inode %llu has non-full interior leaf node %llu"
1370 "(next free = %u)",
1371 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1372 (unsigned long long)left_leaf_bh->b_blocknr,
1373 le16_to_cpu(left_el->l_next_free_rec));
1374 return -EROFS;
1375 }
1376
1377 /*
1378 * This extent block may already have an empty record, so we
1379 * return early if so.
1380 */
1381 if (ocfs2_is_empty_extent(&left_el->l_recs[0]))
1382 return 0;
1383
1384 root_bh = left_path->p_node[subtree_index].bh;
1385 BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
1386
1387 ret = ocfs2_journal_access(handle, inode, root_bh,
1388 OCFS2_JOURNAL_ACCESS_WRITE);
1389 if (ret) {
1390 mlog_errno(ret);
1391 goto out;
1392 }
1393
1394 for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
1395 ret = ocfs2_journal_access(handle, inode,
1396 right_path->p_node[i].bh,
1397 OCFS2_JOURNAL_ACCESS_WRITE);
1398 if (ret) {
1399 mlog_errno(ret);
1400 goto out;
1401 }
1402
1403 ret = ocfs2_journal_access(handle, inode,
1404 left_path->p_node[i].bh,
1405 OCFS2_JOURNAL_ACCESS_WRITE);
1406 if (ret) {
1407 mlog_errno(ret);
1408 goto out;
1409 }
1410 }
1411
1412 right_leaf_bh = path_leaf_bh(right_path);
1413 right_el = path_leaf_el(right_path);
1414
1415 /* This is a code error, not a disk corruption. */
1416 mlog_bug_on_msg(!right_el->l_next_free_rec, "Inode %llu: Rotate fails "
1417 "because rightmost leaf block %llu is empty\n",
1418 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1419 (unsigned long long)right_leaf_bh->b_blocknr);
1420
1421 ocfs2_create_empty_extent(right_el);
1422
1423 ret = ocfs2_journal_dirty(handle, right_leaf_bh);
1424 if (ret) {
1425 mlog_errno(ret);
1426 goto out;
1427 }
1428
1429 /* Do the copy now. */
1430 i = le16_to_cpu(left_el->l_next_free_rec) - 1;
1431 move_rec = left_el->l_recs[i];
1432 right_el->l_recs[0] = move_rec;
1433
1434 /*
1435 * Clear out the record we just copied and shift everything
1436 * over, leaving an empty extent in the left leaf.
1437 *
1438 * We temporarily subtract from next_free_rec so that the
1439 * shift will lose the tail record (which is now defunct).
1440 */
1441 le16_add_cpu(&left_el->l_next_free_rec, -1);
1442 ocfs2_shift_records_right(left_el);
1443 memset(&left_el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
1444 le16_add_cpu(&left_el->l_next_free_rec, 1);
1445
1446 ret = ocfs2_journal_dirty(handle, left_leaf_bh);
1447 if (ret) {
1448 mlog_errno(ret);
1449 goto out;
1450 }
1451
1452 ocfs2_complete_edge_insert(inode, handle, left_path, right_path,
1453 subtree_index);
1454
1455out:
1456 return ret;
1457}
1458
1459/*
1460 * Given a full path, determine what cpos value would return us a path
1461 * containing the leaf immediately to the left of the current one.
1462 *
1463 * Will return zero if the path passed in is already the leftmost path.
1464 */
1465static int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
1466 struct ocfs2_path *path, u32 *cpos)
1467{
1468 int i, j, ret = 0;
1469 u64 blkno;
1470 struct ocfs2_extent_list *el;
1471
1472 BUG_ON(path->p_tree_depth == 0);
1473
1474 *cpos = 0;
1475
1476 blkno = path_leaf_bh(path)->b_blocknr;
1477
1478 /* Start at the tree node just above the leaf and work our way up. */
1479 i = path->p_tree_depth - 1;
1480 while (i >= 0) {
1481 el = path->p_node[i].el;
1482
1483 /*
1484 * Find the extent record just before the one in our
1485 * path.
1486 */
1487 for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) {
1488 if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) {
1489 if (j == 0) {
1490 if (i == 0) {
1491 /*
1492 * We've determined that the
1493 * path specified is already
1494 * the leftmost one - return a
1495 * cpos of zero.
1496 */
1497 goto out;
1498 }
1499 /*
1500 * The leftmost record points to our
1501 * leaf - we need to travel up the
1502 * tree one level.
1503 */
1504 goto next_node;
1505 }
1506
1507 *cpos = le32_to_cpu(el->l_recs[j - 1].e_cpos);
1508 *cpos = *cpos + ocfs2_rec_clusters(el,
1509 &el->l_recs[j - 1]);
1510 *cpos = *cpos - 1;
1511 goto out;
1512 }
1513 }
1514
1515 /*
1516 * If we got here, we never found a valid node where
1517 * the tree indicated one should be.
1518 */
1519 ocfs2_error(sb,
1520 "Invalid extent tree at extent block %llu\n",
1521 (unsigned long long)blkno);
1522 ret = -EROFS;
1523 goto out;
1524
1525next_node:
1526 blkno = path->p_node[i].bh->b_blocknr;
1527 i--;
1528 }
1529
1530out:
1531 return ret;
1532}
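
Example (editor's illustration): if the parent record to the left of our leaf covers cpos 64 for 32 clusters, the function returns 64 + 32 - 1 = 95, the last cluster in the neighboring leaf's theoretical range, so a subsequent path lookup with that cpos lands in the leaf immediately to the left. A returned *cpos of 0 means the path passed in was already leftmost.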
1533
1534static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth,
1535 struct ocfs2_path *path)
1536{
1537 int credits = (path->p_tree_depth - subtree_depth) * 2 + 1;
1538
1539 if (handle->h_buffer_credits < credits)
1540 return ocfs2_extend_trans(handle, credits);
1541
1542 return 0;
1543}
1544
1545/*
1546 * Trap the case where we're inserting into the theoretical range past
1547 * the _actual_ left leaf range. Otherwise, we'll rotate a record
1548 * whose cpos is less than ours into the right leaf.
1549 *
1550 * It's only necessary to look at the rightmost record of the left
1551 * leaf because the logic that calls us should ensure that the
1552 * theoretical ranges in the path components above the leaves are
1553 * correct.
1554 */
1555static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path,
1556 u32 insert_cpos)
1557{
1558 struct ocfs2_extent_list *left_el;
1559 struct ocfs2_extent_rec *rec;
1560 int next_free;
1561
1562 left_el = path_leaf_el(left_path);
1563 next_free = le16_to_cpu(left_el->l_next_free_rec);
1564 rec = &left_el->l_recs[next_free - 1];
1565
1566 if (insert_cpos > le32_to_cpu(rec->e_cpos))
1567 return 1;
1568 return 0;
1569}
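
Example (editor's illustration): suppose the left leaf's last record starts at cpos 40 with 10 clusters, so its named range ends at 50 while the parent's theoretical range for the leaf may extend further right. An insert at cpos 55 gives 55 > 40, so rotation stops and ocfs2_insert_path() is left to fix up the edge after the insert; an insert at cpos 40 or below returns 0 and the rotation proceeds.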
1570
1571/*
1572 * Rotate all the records in a btree right one record, starting at insert_cpos.
1573 *
1574 * The path to the rightmost leaf should be passed in.
1575 *
1576 * The array is assumed to be large enough to hold an entire path (tree depth).
1577 *
1578 * Upon successful return from this function:
1579 *
1580 * - The 'right_path' array will contain a path to the leaf block
1581 * whose range contains e_cpos.
1582 * - That leaf block will have a single empty extent in list index 0.
1583 * - In the case that the rotation requires a post-insert update,
1584 * *ret_left_path will contain a valid path which can be passed to
1585 * ocfs2_insert_path().
1586 */
1587static int ocfs2_rotate_tree_right(struct inode *inode,
1588 handle_t *handle,
1589 u32 insert_cpos,
1590 struct ocfs2_path *right_path,
1591 struct ocfs2_path **ret_left_path)
1592{
1593 int ret, start;
1594 u32 cpos;
1595 struct ocfs2_path *left_path = NULL;
1596
1597 *ret_left_path = NULL;
1598
1599 left_path = ocfs2_new_path(path_root_bh(right_path),
1600 path_root_el(right_path));
1601 if (!left_path) {
1602 ret = -ENOMEM;
1603 mlog_errno(ret);
1604 goto out;
1605 }
1606
1607 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path, &cpos);
1608 if (ret) {
1609 mlog_errno(ret);
1610 goto out;
1611 }
1612
1613 mlog(0, "Insert: %u, first left path cpos: %u\n", insert_cpos, cpos);
1614
1615 /*
1616 * What we want to do here is:
1617 *
1618 * 1) Start with the rightmost path.
1619 *
1620 * 2) Determine a path to the leaf block directly to the left
1621 * of that leaf.
1622 *
1623 * 3) Determine the 'subtree root' - the lowest level tree node
1624 * which contains a path to both leaves.
1625 *
1626 * 4) Rotate the subtree.
1627 *
1628 * 5) Find the next subtree by considering the left path to be
1629 * the new right path.
1630 *
1631 * The check at the top of this while loop also accepts
1632 * insert_cpos == cpos because cpos is only a _theoretical_
1633 * value to get us the left path - insert_cpos might very well
1634 * be filling that hole.
1635 *
1636 * Stop at a cpos of '0' because we either started at the
1637 * leftmost branch (i.e., a tree with one branch and a
1638 * rotation inside of it), or we've gone as far as we can in
1639 * rotating subtrees.
1640 */
1641 while (cpos && insert_cpos <= cpos) {
1642 mlog(0, "Rotating a tree: ins. cpos: %u, left path cpos: %u\n",
1643 insert_cpos, cpos);
1644
1645 ret = ocfs2_find_path(inode, left_path, cpos);
1646 if (ret) {
1647 mlog_errno(ret);
1648 goto out;
1649 }
1650
1651 mlog_bug_on_msg(path_leaf_bh(left_path) ==
1652 path_leaf_bh(right_path),
1653 "Inode %lu: error during insert of %u "
1654 "(left path cpos %u) results in two identical "
1655 "paths ending at %llu\n",
1656 inode->i_ino, insert_cpos, cpos,
1657 (unsigned long long)
1658 path_leaf_bh(left_path)->b_blocknr);
1659
1660 if (ocfs2_rotate_requires_path_adjustment(left_path,
1661 insert_cpos)) {
1662 mlog(0, "Path adjustment required\n");
1663
1664 /*
1665 * We've rotated the tree as much as we
1666 * should. The rest is up to
1667 * ocfs2_insert_path() to complete, after the
1668 * record insertion. We indicate this
1669 * situation by returning the left path.
1670 *
1671 * The reason we don't adjust the records here
1672 * before the record insert is that an error
1673 * later might break the rule where a parent
1674 * record e_cpos will reflect the actual
1675 * e_cpos of the 1st nonempty record of the
1676 * child list.
1677 */
1678 *ret_left_path = left_path;
1679 goto out_ret_path;
1680 }
1681
1682 start = ocfs2_find_subtree_root(inode, left_path, right_path);
1683
1684 mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n",
1685 start,
1686 (unsigned long long) right_path->p_node[start].bh->b_blocknr,
1687 right_path->p_tree_depth);
1688
1689 ret = ocfs2_extend_rotate_transaction(handle, start,
1690 right_path);
1691 if (ret) {
1692 mlog_errno(ret);
1693 goto out;
1694 }
1695
1696 ret = ocfs2_rotate_subtree_right(inode, handle, left_path,
1697 right_path, start);
1698 if (ret) {
1699 mlog_errno(ret);
1700 goto out;
1701 }
1702
1703 /*
1704 * There is no need to re-read the next right path
1705 * as we know that it'll be our current left
1706 * path. Optimize by copying values instead.
1707 */
1708 ocfs2_mv_path(right_path, left_path);
1709
1710 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path,
1711 &cpos);
1712 if (ret) {
1713 mlog_errno(ret);
1714 goto out;
1715 }
1716 }
1717
1718out:
1719 ocfs2_free_path(left_path);
1720
1721out_ret_path:
1722 return ret;
1723}
1724
1725/*
1726 * Do the final bits of extent record insertion at the target leaf
1727 * list. If this leaf is part of an allocation tree, it is assumed
1728 * that the tree above has been prepared.
1729 */
1730static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec,
1731 struct ocfs2_extent_list *el,
1732 struct ocfs2_insert_type *insert,
1733 struct inode *inode)
1734{
1735 int i = insert->ins_contig_index;
1736 unsigned int range;
1737 struct ocfs2_extent_rec *rec;
1738
1739 BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
1740
1741 /*
1742 * Contiguous insert - either left or right.
1743 */
1744 if (insert->ins_contig != CONTIG_NONE) {
1745 rec = &el->l_recs[i];
1746 if (insert->ins_contig == CONTIG_LEFT) {
1747 rec->e_blkno = insert_rec->e_blkno;
1748 rec->e_cpos = insert_rec->e_cpos;
1749 }
1750 le16_add_cpu(&rec->e_leaf_clusters,
1751 le16_to_cpu(insert_rec->e_leaf_clusters));
1752 return;
1753 }
1754
1755 /*
1756 * Handle insert into an empty leaf.
1757 */
1758 if (le16_to_cpu(el->l_next_free_rec) == 0 ||
1759 ((le16_to_cpu(el->l_next_free_rec) == 1) &&
1760 ocfs2_is_empty_extent(&el->l_recs[0]))) {
1761 el->l_recs[0] = *insert_rec;
1762 el->l_next_free_rec = cpu_to_le16(1);
1763 return;
1764 }
1765
1766 /*
1767 * Appending insert.
1768 */
1769 if (insert->ins_appending == APPEND_TAIL) {
1770 i = le16_to_cpu(el->l_next_free_rec) - 1;
1771 rec = &el->l_recs[i];
1772 range = le32_to_cpu(rec->e_cpos)
1773 + le16_to_cpu(rec->e_leaf_clusters);
1774 BUG_ON(le32_to_cpu(insert_rec->e_cpos) < range);
1775
1776 mlog_bug_on_msg(le16_to_cpu(el->l_next_free_rec) >=
1777 le16_to_cpu(el->l_count),
1778 "inode %lu, depth %u, count %u, next free %u, "
1779 "rec.cpos %u, rec.clusters %u, "
1780 "insert.cpos %u, insert.clusters %u\n",
1781 inode->i_ino,
1782 le16_to_cpu(el->l_tree_depth),
1783 le16_to_cpu(el->l_count),
1784 le16_to_cpu(el->l_next_free_rec),
1785 le32_to_cpu(el->l_recs[i].e_cpos),
1786 le16_to_cpu(el->l_recs[i].e_leaf_clusters),
1787 le32_to_cpu(insert_rec->e_cpos),
1788 le16_to_cpu(insert_rec->e_leaf_clusters));
1789 i++;
1790 el->l_recs[i] = *insert_rec;
1791 le16_add_cpu(&el->l_next_free_rec, 1);
1792 return;
1793 }
1794
1795 /*
1796 * Ok, we have to rotate.
1797 *
1798 * At this point, it is safe to assume that inserting into an
1799 * empty leaf and appending to a leaf have both been handled
1800 * above.
1801 *
1802 * This leaf needs to have space, either by the empty 1st
1803 * extent record, or by virtue of an l_next_free_rec < l_count.
1804 */
1805 ocfs2_rotate_leaf(el, insert_rec);
1806}
1807
1808static inline void ocfs2_update_dinode_clusters(struct inode *inode,
1809 struct ocfs2_dinode *di,
1810 u32 clusters)
1811{
1812 le32_add_cpu(&di->i_clusters, clusters);
1813 spin_lock(&OCFS2_I(inode)->ip_lock);
1814 OCFS2_I(inode)->ip_clusters = le32_to_cpu(di->i_clusters);
1815 spin_unlock(&OCFS2_I(inode)->ip_lock);
1816}
1817
1818static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
1819 struct ocfs2_extent_rec *insert_rec,
1820 struct ocfs2_path *right_path,
1821 struct ocfs2_path **ret_left_path)
1822{
1823 int ret, i, next_free;
1824 struct buffer_head *bh;
1825 struct ocfs2_extent_list *el;
1826 struct ocfs2_path *left_path = NULL;
1827
1828 *ret_left_path = NULL;
1829
1830 /*
1831 * This shouldn't happen for non-trees. The extent rec cluster
1832 * count manipulation below only works for interior nodes.
1833 */
1834 BUG_ON(right_path->p_tree_depth == 0);
1835
1836 /*
1837 * If our appending insert is at the leftmost edge of a leaf,
1838 * then we might need to update the rightmost records of the
1839 * neighboring path.
1840 */
1841 el = path_leaf_el(right_path);
1842 next_free = le16_to_cpu(el->l_next_free_rec);
1843 if (next_free == 0 ||
1844 (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0]))) {
1845 u32 left_cpos;
1846
1847 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path,
1848 &left_cpos);
1849 if (ret) {
1850 mlog_errno(ret);
1851 goto out;
1852 }
1853
1854 mlog(0, "Append may need a left path update. cpos: %u, "
1855 "left_cpos: %u\n", le32_to_cpu(insert_rec->e_cpos),
1856 left_cpos);
1857
1858 /*
1859 * No need to worry if the append is already in the
1860 * leftmost leaf.
1861 */
1862 if (left_cpos) {
1863 left_path = ocfs2_new_path(path_root_bh(right_path),
1864 path_root_el(right_path));
1865 if (!left_path) {
1866 ret = -ENOMEM;
1867 mlog_errno(ret);
1868 goto out;
1869 }
1870
1871 ret = ocfs2_find_path(inode, left_path, left_cpos);
1872 if (ret) {
1873 mlog_errno(ret);
1874 goto out;
1875 }
1876
1877 /*
1878 * ocfs2_insert_path() will pass the left_path to the
1879 * journal for us.
1880 */
1881 }
1882 }
1883
1884 ret = ocfs2_journal_access_path(inode, handle, right_path);
1885 if (ret) {
1886 mlog_errno(ret);
1887 goto out;
1888 }
1889
1890 el = path_root_el(right_path);
1891 bh = path_root_bh(right_path);
1892 i = 0;
1893 while (1) {
1894 struct ocfs2_extent_rec *rec;
1895
1896 next_free = le16_to_cpu(el->l_next_free_rec);
1897 if (next_free == 0) {
1898 ocfs2_error(inode->i_sb,
1899 "Dinode %llu has a bad extent list",
1900 (unsigned long long)OCFS2_I(inode)->ip_blkno);
1901 ret = -EIO;
1902 goto out;
1903 }
1904
1905 rec = &el->l_recs[next_free - 1];
1906
1907 rec->e_int_clusters = insert_rec->e_cpos;
1908 le32_add_cpu(&rec->e_int_clusters,
1909 le16_to_cpu(insert_rec->e_leaf_clusters));
1910 le32_add_cpu(&rec->e_int_clusters,
1911 -le32_to_cpu(rec->e_cpos));
1912
1913 ret = ocfs2_journal_dirty(handle, bh);
1914 if (ret)
1915 mlog_errno(ret);
1916
1917 /* Don't touch the leaf node */
1918 if (++i >= right_path->p_tree_depth)
1919 break;
1920
1921 bh = right_path->p_node[i].bh;
1922 el = right_path->p_node[i].el;
1923 }
1924
1925 *ret_left_path = left_path;
1926 ret = 0;
1927out:
1928 if (ret != 0)
1929 ocfs2_free_path(left_path);
1930
1931 return ret;
1932}
1933
1934/*
1935 * This function only does inserts on an allocation b-tree. For dinode
1936 * lists, ocfs2_insert_at_leaf() is called directly.
1937 *
1938 * right_path is the path we want to do the actual insert
1939 * in. left_path should only be passed in if we need to update that
1940 * portion of the tree after an edge insert.
1941 */
1942static int ocfs2_insert_path(struct inode *inode,
1943 handle_t *handle,
1944 struct ocfs2_path *left_path,
1945 struct ocfs2_path *right_path,
1946 struct ocfs2_extent_rec *insert_rec,
1947 struct ocfs2_insert_type *insert)
1948{
1949 int ret, subtree_index;
1950 struct buffer_head *leaf_bh = path_leaf_bh(right_path);
1951 struct ocfs2_extent_list *el;
1952
1953 /*
1954 * Pass both paths to the journal. The majority of inserts
1955 * will be touching all components anyway.
1956 */
1957 ret = ocfs2_journal_access_path(inode, handle, right_path);
1958 if (ret < 0) {
1959 mlog_errno(ret);
1960 goto out;
1961 }
1962
1963 if (left_path) {
1964 int credits = handle->h_buffer_credits;
1965
1966 /*
1967 * There's a chance that left_path got passed back to
1968 * us without being accounted for in the
1969 * journal. Extend our transaction here to be sure we
1970 * can change those blocks.
1971 */
1972 credits += left_path->p_tree_depth;
1973
1974 ret = ocfs2_extend_trans(handle, credits);
1975 if (ret < 0) {
1976 mlog_errno(ret);
1977 goto out;
1978 }
1979
1980 ret = ocfs2_journal_access_path(inode, handle, left_path);
1981 if (ret < 0) {
1982 mlog_errno(ret);
1983 goto out;
1984 }
1985 }
1986
1987 el = path_leaf_el(right_path);
1988
1989 ocfs2_insert_at_leaf(insert_rec, el, insert, inode);
1990 ret = ocfs2_journal_dirty(handle, leaf_bh);
1991 if (ret)
1992 mlog_errno(ret);
1993
1994 if (left_path) {
1995 /*
1996 * The rotate code has indicated that we need to fix
1997 * up portions of the tree after the insert.
1998 *
1999 * XXX: Should we extend the transaction here?
2000 */
2001 subtree_index = ocfs2_find_subtree_root(inode, left_path,
2002 right_path);
2003 ocfs2_complete_edge_insert(inode, handle, left_path,
2004 right_path, subtree_index);
2005 }
2006
2007 ret = 0;
2008out:
2009 return ret;
2010}
2011
2012static int ocfs2_do_insert_extent(struct inode *inode,
2013 handle_t *handle,
2014 struct buffer_head *di_bh,
2015 struct ocfs2_extent_rec *insert_rec,
2016 struct ocfs2_insert_type *type)
2017{
2018 int ret, rotate = 0;
2019 u32 cpos;
2020 struct ocfs2_path *right_path = NULL;
2021 struct ocfs2_path *left_path = NULL;
2022 struct ocfs2_dinode *di;
2023 struct ocfs2_extent_list *el;
2024
2025 di = (struct ocfs2_dinode *) di_bh->b_data;
2026 el = &di->id2.i_list;
2027
2028 ret = ocfs2_journal_access(handle, inode, di_bh,
2029 OCFS2_JOURNAL_ACCESS_WRITE);
2030 if (ret) {
2031 mlog_errno(ret);
2032 goto out;
2033 }
2034
2035 if (le16_to_cpu(el->l_tree_depth) == 0) {
2036 ocfs2_insert_at_leaf(insert_rec, el, type, inode);
2037 goto out_update_clusters;
2038 }
2039
2040 right_path = ocfs2_new_inode_path(di_bh);
2041 if (!right_path) {
2042 ret = -ENOMEM;
2043 mlog_errno(ret);
2044 goto out;
2045 }
2046
2047 /*
2048 * Determine the path to start with. Rotations need the
2049 * rightmost path, everything else can go directly to the
2050 * target leaf.
2051 */
2052 cpos = le32_to_cpu(insert_rec->e_cpos);
2053 if (type->ins_appending == APPEND_NONE &&
2054 type->ins_contig == CONTIG_NONE) {
2055 rotate = 1;
2056 cpos = UINT_MAX;
2057 }
2058
2059 ret = ocfs2_find_path(inode, right_path, cpos);
2060 if (ret) {
2061 mlog_errno(ret);
2062 goto out;
2063 }
2064
2065 /*
2066 * Rotations and appends need special treatment - they modify
2067 * parts of the tree above them.
2068 *
2069 * Both might pass back a path immediately to the left of the
2070 * one being inserted to. This will cause
2071 * ocfs2_insert_path() to modify the rightmost records of
2072 * left_path to account for an edge insert.
2073 *
2074 * XXX: When modifying this code, keep in mind that an insert
2075 * can wind up skipping both of these two special cases...
2076 */
2077 if (rotate) {
2078 ret = ocfs2_rotate_tree_right(inode, handle,
2079 le32_to_cpu(insert_rec->e_cpos),
2080 right_path, &left_path);
2081 if (ret) {
2082 mlog_errno(ret);
2083 goto out;
2084 }
2085 } else if (type->ins_appending == APPEND_TAIL
2086 && type->ins_contig != CONTIG_LEFT) {
2087 ret = ocfs2_append_rec_to_path(inode, handle, insert_rec,
2088 right_path, &left_path);
2089 if (ret) {
2090 mlog_errno(ret);
2091 goto out;
2092 }
2093 }
2094
2095 ret = ocfs2_insert_path(inode, handle, left_path, right_path,
2096 insert_rec, type);
2097 if (ret) {
2098 mlog_errno(ret);
2099 goto out;
2100 }
2101
2102out_update_clusters:
2103 ocfs2_update_dinode_clusters(inode, di,
2104 le16_to_cpu(insert_rec->e_leaf_clusters));
2105
2106 ret = ocfs2_journal_dirty(handle, di_bh);
2107 if (ret)
2108 mlog_errno(ret);
2109
2110out:
2111 ocfs2_free_path(left_path);
2112 ocfs2_free_path(right_path);
2113
2114 return ret;
2115}
2116
2117static void ocfs2_figure_contig_type(struct inode *inode,
2118 struct ocfs2_insert_type *insert,
2119 struct ocfs2_extent_list *el,
2120 struct ocfs2_extent_rec *insert_rec)
2121{
2122 int i;
2123 enum ocfs2_contig_type contig_type = CONTIG_NONE;
2124
2125 BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
2126
2127 for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
2128 contig_type = ocfs2_extent_contig(inode, &el->l_recs[i],
2129 insert_rec);
2130 if (contig_type != CONTIG_NONE) {
2131 insert->ins_contig_index = i;
2132 break;
2133 }
2134 }
2135 insert->ins_contig = contig_type;
2136}
2137
2138/*
2139 * This should only be called against the rightmost leaf extent list.
2140 *
2141 * ocfs2_figure_appending_type() will figure out whether we'll have to
2142 * insert at the tail of the rightmost leaf.
2143 *
2144 * This should also work against the dinode list for trees with 0
2145 * depth. If we consider the dinode list to be the rightmost leaf node
2146 * then the logic here makes sense.
2147 */
2148static void ocfs2_figure_appending_type(struct ocfs2_insert_type *insert,
2149 struct ocfs2_extent_list *el,
2150 struct ocfs2_extent_rec *insert_rec)
2151{
2152 int i;
2153 u32 cpos = le32_to_cpu(insert_rec->e_cpos);
2154 struct ocfs2_extent_rec *rec;
2155
2156 insert->ins_appending = APPEND_NONE;
2157
2158 BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
2159
2160 if (!el->l_next_free_rec)
2161 goto set_tail_append;
2162
2163 if (ocfs2_is_empty_extent(&el->l_recs[0])) {
2164 /* Were all records empty? */
2165 if (le16_to_cpu(el->l_next_free_rec) == 1)
2166 goto set_tail_append;
845 }
846 
847 /* Can we allocate without adding/shifting tree bits? */
848 i = le16_to_cpu(el->l_next_free_rec) - 1;
849 if (le16_to_cpu(el->l_next_free_rec) == 0
850     || (le16_to_cpu(el->l_next_free_rec) < le16_to_cpu(el->l_count))
851     || le32_to_cpu(el->l_recs[i].e_clusters) == 0
852     || ocfs2_extent_contig(inode, &el->l_recs[i], start_blk))
853 goto out_add;
2167 }
2168 
2169 i = le16_to_cpu(el->l_next_free_rec) - 1;
2170 rec = &el->l_recs[i];
2171 
2172 if (cpos >=
2173     (le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)))
2174 goto set_tail_append;
2175
2176 return;
2177
2178set_tail_append:
2179 insert->ins_appending = APPEND_TAIL;
2180}
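
Example (editor's illustration): if the rightmost leaf ends with a record at cpos 200 covering 16 clusters, its range ends at 216. An insert at cpos 216, or anywhere beyond it (sparse files can leave a gap first), sets APPEND_TAIL; an insert at cpos 100 must be filling a hole inside the tree's existing range, so ins_appending stays APPEND_NONE.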
2181
2182/*
2183 * Helper function called at the beginning of an insert.
2184 *
2185 * This computes a few things that are commonly used in the process of
2186 * inserting into the btree:
2187 * - Whether the new extent is contiguous with an existing one.
2188 * - The current tree depth.
2189 * - Whether the insert is an appending one.
2190 * - The total # of free records in the tree.
2191 *
2192 * All of the information is stored on the ocfs2_insert_type
2193 * structure.
2194 */
2195static int ocfs2_figure_insert_type(struct inode *inode,
2196 struct buffer_head *di_bh,
2197 struct buffer_head **last_eb_bh,
2198 struct ocfs2_extent_rec *insert_rec,
2199 struct ocfs2_insert_type *insert)
2200{
2201 int ret;
2202 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
2203 struct ocfs2_extent_block *eb;
2204 struct ocfs2_extent_list *el;
2205 struct ocfs2_path *path = NULL;
2206 struct buffer_head *bh = NULL;
2207
2208 el = &di->id2.i_list;
2209 insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth);
2210
2211 if (el->l_tree_depth) {
2212 /*
2213 * If we have tree depth, we read in the
2214 * rightmost extent block ahead of time as
2215 * ocfs2_figure_insert_type() and ocfs2_add_branch()
2216 * may want it later.
2217 */
2218 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
2219 le64_to_cpu(di->i_last_eb_blk), &bh,
2220 OCFS2_BH_CACHED, inode);
2221 if (ret) {
2222 mlog_exit(ret);
2223 goto out;
2224 }
2225 eb = (struct ocfs2_extent_block *) bh->b_data;
2226 el = &eb->h_list;
2227 }
2228
2229 /*
2230 * Unless we have a contiguous insert, we'll need to know if
2231 * there is room left in our allocation tree for another
2232 * extent record.
2233 *
2234 * XXX: This test is simplistic, we can search for empty
2235 * extent records too.
2236 */
2237 insert->ins_free_records = le16_to_cpu(el->l_count) -
2238 le16_to_cpu(el->l_next_free_rec);
2239
2240 if (!insert->ins_tree_depth) {
2241 ocfs2_figure_contig_type(inode, insert, el, insert_rec);
2242 ocfs2_figure_appending_type(insert, el, insert_rec);
2243 return 0;
2244 }
2245
2246 path = ocfs2_new_inode_path(di_bh);
2247 if (!path) {
2248 ret = -ENOMEM;
2249 mlog_errno(ret);
2250 goto out;
2251 }
2252
2253 /*
2254 * In the case that we're inserting past what the tree
2255 * currently accounts for, ocfs2_find_path() will return for
2256 * us the rightmost tree path. This is accounted for below in
2257 * the appending code.
2258 */
2259 ret = ocfs2_find_path(inode, path, le32_to_cpu(insert_rec->e_cpos));
2260 if (ret) {
2261 mlog_errno(ret);
2262 goto out;
2263 }
2264
2265 el = path_leaf_el(path);
2266
2267 /*
2268 * Now that we have the path, there are two things we want to determine:
2269 * 1) Contiguousness (also set contig_index if this is so)
2270 *
2271 * 2) Are we doing an append? We can trivially break this up
2272 * into two types of appends: simple record append, or a
2273 * rotate inside the tail leaf.
2274 */
2275 ocfs2_figure_contig_type(inode, insert, el, insert_rec);
2276
2277 /*
2278 * The insert code isn't quite ready to deal with all cases of
2279 * left contiguousness. Specifically, if it's an insert into
2280 * the 1st record in a leaf, it will require the adjustment of
2281 * cluster count on the last record of the path directly to its
2282 * left. For now, just catch that case and fool the layers
2283 * above us. This works just fine for tree_depth == 0, which
2284 * is why we allow that above.
2285 */
2286 if (insert->ins_contig == CONTIG_LEFT &&
2287 insert->ins_contig_index == 0)
2288 insert->ins_contig = CONTIG_NONE;
2289
2290 /*
2291 * Ok, so we can simply compare against last_eb to figure out
2292 * whether the path doesn't exist. This will only happen in
2293 * the case that we're doing a tail append, so maybe we can
2294 * take advantage of that information somehow.
2295 */
2296 if (le64_to_cpu(di->i_last_eb_blk) == path_leaf_bh(path)->b_blocknr) {
2297 /*
2298 * Ok, ocfs2_find_path() returned us the rightmost
2299 * tree path. This might be an appending insert. There are
2300 * two cases:
2301 * 1) We're doing a true append at the tail:
2302 * -This might even be off the end of the leaf
2303 * 2) We're "appending" by rotating in the tail
2304 */
2305 ocfs2_figure_appending_type(insert, el, insert_rec);
2306 }
2307
2308out:
2309 ocfs2_free_path(path);
2310
2311 if (ret == 0)
2312 *last_eb_bh = bh;
2313 else
2314 brelse(bh);
2315 return ret;
2316}
2317
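/*
 * A user-space sketch of the free-record computation above (field
 * widths assumed from the le16 accessors; names are illustrative):
 */
#include <stdint.h>

struct list_hdr { uint16_t l_count; uint16_t l_next_free_rec; };

static uint16_t free_records(const struct list_hdr *el)
{
	/* l_count is the slot capacity; l_next_free_rec the next unused */
	return el->l_count - el->l_next_free_rec;
}
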
2318/*
2319 * Insert an extent into an inode btree.
2320 *
2321 * The caller needs to update fe->i_clusters
2322 */
2323int ocfs2_insert_extent(struct ocfs2_super *osb,
2324 handle_t *handle,
2325 struct inode *inode,
2326 struct buffer_head *fe_bh,
2327 u32 cpos,
2328 u64 start_blk,
2329 u32 new_clusters,
2330 struct ocfs2_alloc_context *meta_ac)
2331{
2332 int status, shift;
2333 struct buffer_head *last_eb_bh = NULL;
2334 struct buffer_head *bh = NULL;
2335 struct ocfs2_insert_type insert = {0, };
2336 struct ocfs2_extent_rec rec;
2337
2338 mlog(0, "add %u clusters at position %u to inode %llu\n",
2339 new_clusters, cpos, (unsigned long long)OCFS2_I(inode)->ip_blkno);
2340
2341 mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) &&
2342 (OCFS2_I(inode)->ip_clusters != cpos),
2343 "Device %s, asking for sparse allocation: inode %llu, "
2344 "cpos %u, clusters %u\n",
2345 osb->dev_str,
2346 (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos,
2347 OCFS2_I(inode)->ip_clusters);
2348
2349 memset(&rec, 0, sizeof(rec));
2350 rec.e_cpos = cpu_to_le32(cpos);
2351 rec.e_blkno = cpu_to_le64(start_blk);
2352 rec.e_leaf_clusters = cpu_to_le16(new_clusters);
2353
2354 status = ocfs2_figure_insert_type(inode, fe_bh, &last_eb_bh, &rec,
2355 &insert);
2356 if (status < 0) {
2357 mlog_errno(status);
2358 goto bail;
2359 }
854 2360
855 mlog(0, "ocfs2_allocate_extent: couldn't do a simple add, traversing "
856 "tree now.\n");
2361 mlog(0, "Insert.appending: %u, Insert.Contig: %u, "
2362 "Insert.contig_index: %d, Insert.free_records: %d, "
2363 "Insert.tree_depth: %d\n",
2364 insert.ins_appending, insert.ins_contig, insert.ins_contig_index,
2365 insert.ins_free_records, insert.ins_tree_depth);
2366
2367 /*
2368 * Avoid growing the tree unless we're out of records and the
2369 * insert type requires one.
2370 */
2371 if (insert.ins_contig != CONTIG_NONE || insert.ins_free_records)
2372 goto out_add;
857 2373
858 shift = ocfs2_find_branch_target(osb, inode, fe_bh, &bh); 2374 shift = ocfs2_find_branch_target(osb, inode, fe_bh, &bh);
859 if (shift < 0) { 2375 if (shift < 0) {
@@ -866,13 +2382,9 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
866 * and didn't find room for any more extents - we need to add 2382 * and didn't find room for any more extents - we need to add
867 * another tree level */ 2383 * another tree level */
868 if (shift) { 2384 if (shift) {
869 /* if we hit a leaf, we'd better be empty :) */
870 BUG_ON(le16_to_cpu(el->l_next_free_rec) !=
871 le16_to_cpu(el->l_count));
872 BUG_ON(bh); 2385 BUG_ON(bh);
873 mlog(0, "ocfs2_allocate_extent: need to shift tree depth "
874 "(current = %u)\n",
875 le16_to_cpu(fe->id2.i_list.l_tree_depth));
2386 mlog(0, "need to shift tree depth "
2387 "(current = %d)\n", insert.ins_tree_depth);
876 2388
877 /* ocfs2_shift_tree_depth will return us a buffer with 2389 /* ocfs2_shift_tree_depth will return us a buffer with
878 * the new extent block (so we can pass that to 2390 * the new extent block (so we can pass that to
@@ -883,15 +2395,16 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
883 mlog_errno(status); 2395 mlog_errno(status);
884 goto bail; 2396 goto bail;
885 } 2397 }
2398 insert.ins_tree_depth++;
886 /* Special case: we have room now if we shifted from 2399 /* Special case: we have room now if we shifted from
887 * tree_depth 0 */ 2400 * tree_depth 0 */
888 if (fe->id2.i_list.l_tree_depth == cpu_to_le16(1)) 2401 if (insert.ins_tree_depth == 1)
889 goto out_add; 2402 goto out_add;
890 } 2403 }
891 2404
892 /* call ocfs2_add_branch to add the final part of the tree with 2405 /* call ocfs2_add_branch to add the final part of the tree with
893 * the new data. */ 2406 * the new data. */
894 mlog(0, "ocfs2_allocate_extent: add branch. bh = %p\n", bh); 2407 mlog(0, "add branch. bh = %p\n", bh);
895 status = ocfs2_add_branch(osb, handle, inode, fe_bh, bh, last_eb_bh, 2408 status = ocfs2_add_branch(osb, handle, inode, fe_bh, bh, last_eb_bh,
896 meta_ac); 2409 meta_ac);
897 if (status < 0) { 2410 if (status < 0) {
@@ -900,11 +2413,12 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
900 } 2413 }
901 2414
902out_add: 2415out_add:
903 /* Finally, we can add clusters. */ 2416 /* Finally, we can add clusters. This might rotate the tree for us. */
904 status = ocfs2_do_insert_extent(osb, handle, inode, fe_bh, 2417 status = ocfs2_do_insert_extent(inode, handle, fe_bh, &rec, &insert);
905 start_blk, new_clusters);
906 if (status < 0) 2418 if (status < 0)
907 mlog_errno(status); 2419 mlog_errno(status);
2420 else
2421 ocfs2_extent_map_insert_rec(inode, &rec);
908 2422
909bail: 2423bail:
910 if (bh) 2424 if (bh)
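
/*
 * A sketch of the grow-tree gate above (CONTIG_NONE modeled as 0): the
 * tree is only grown when the new record can't be merged into an
 * existing extent and no free record slots remain.
 */
static int must_grow_tree(int ins_contig, int ins_free_records)
{
	return ins_contig == 0 && ins_free_records == 0;
}
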
@@ -1355,7 +2869,7 @@ int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
1355 tl = &tl_copy->id2.i_dealloc; 2869 tl = &tl_copy->id2.i_dealloc;
1356 num_recs = le16_to_cpu(tl->tl_used); 2870 num_recs = le16_to_cpu(tl->tl_used);
1357 mlog(0, "cleanup %u records from %llu\n", num_recs, 2871 mlog(0, "cleanup %u records from %llu\n", num_recs,
1358 (unsigned long long)tl_copy->i_blkno); 2872 (unsigned long long)le64_to_cpu(tl_copy->i_blkno));
1359 2873
1360 mutex_lock(&tl_inode->i_mutex); 2874 mutex_lock(&tl_inode->i_mutex);
1361 for(i = 0; i < num_recs; i++) { 2875 for(i = 0; i < num_recs; i++) {
@@ -1447,168 +2961,389 @@ int ocfs2_truncate_log_init(struct ocfs2_super *osb)
1447 * block will be deleted, and if it will, what the new last extent 2961 * block will be deleted, and if it will, what the new last extent
1448 * block will be so we can update his h_next_leaf_blk field, as well 2962 * block will be so we can update his h_next_leaf_blk field, as well
1449 * as the dinode's i_last_eb_blk */ 2963 * as the dinode's i_last_eb_blk */
1450static int ocfs2_find_new_last_ext_blk(struct ocfs2_super *osb, 2964static int ocfs2_find_new_last_ext_blk(struct inode *inode,
1451 struct inode *inode, 2965 unsigned int clusters_to_del,
1452 struct ocfs2_dinode *fe, 2966 struct ocfs2_path *path,
1453 u32 new_i_clusters,
1454 struct buffer_head *old_last_eb,
1455 struct buffer_head **new_last_eb) 2967 struct buffer_head **new_last_eb)
1456{ 2968{
1457 int i, status = 0; 2969 int next_free, ret = 0;
1458 u64 block = 0; 2970 u32 cpos;
2971 struct ocfs2_extent_rec *rec;
1459 struct ocfs2_extent_block *eb; 2972 struct ocfs2_extent_block *eb;
1460 struct ocfs2_extent_list *el; 2973 struct ocfs2_extent_list *el;
1461 struct buffer_head *bh = NULL; 2974 struct buffer_head *bh = NULL;
1462 2975
1463 *new_last_eb = NULL; 2976 *new_last_eb = NULL;
1464 2977
1465 if (!OCFS2_IS_VALID_DINODE(fe)) {
1466 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
1467 status = -EIO;
1468 goto bail;
1469 }
1470
1471 /* we have no tree, so of course, no last_eb. */ 2978 /* we have no tree, so of course, no last_eb. */
1472 if (!fe->id2.i_list.l_tree_depth) 2979 if (!path->p_tree_depth)
1473 goto bail; 2980 goto out;
1474 2981
1475 /* trunc to zero special case - this makes tree_depth = 0 2982 /* trunc to zero special case - this makes tree_depth = 0
1476 * regardless of what it is. */ 2983 * regardless of what it is. */
1477 if (!new_i_clusters) 2984 if (OCFS2_I(inode)->ip_clusters == clusters_to_del)
1478 goto bail; 2985 goto out;
1479 2986
1480 eb = (struct ocfs2_extent_block *) old_last_eb->b_data; 2987 el = path_leaf_el(path);
1481 el = &(eb->h_list);
1482 BUG_ON(!el->l_next_free_rec); 2988 BUG_ON(!el->l_next_free_rec);
1483 2989
1484 /* Make sure that this guy will actually be empty after we
1485 * clear away the data. */
1486 if (le32_to_cpu(el->l_recs[0].e_cpos) < new_i_clusters)
1487 goto bail;
2990 /*
2991 * Make sure that this extent list will actually be empty
2992 * after we clear away the data. We can shortcut out if
2993 * there's more than one non-empty extent in the
2994 * list. Otherwise, a check of the remaining extent is
2995 * necessary.
2996 */
2997 next_free = le16_to_cpu(el->l_next_free_rec);
2998 rec = NULL;
2999 if (ocfs2_is_empty_extent(&el->l_recs[0])) {
3000 if (next_free > 2)
3001 goto out;
1488 3002
1489 /* Ok, at this point, we know that last_eb will definitely 3003 /* We may have a valid extent in index 1, check it. */
1490 * change, so lets traverse the tree and find the second to 3004 if (next_free == 2)
1491 * last extent block. */ 3005 rec = &el->l_recs[1];
1492 el = &(fe->id2.i_list); 3006
1493 /* go down the tree, */ 3007 /*
1494 do { 3008 * Fall through - no more nonempty extents, so we want
1495 for(i = (le16_to_cpu(el->l_next_free_rec) - 1); i >= 0; i--) { 3009 * to delete this leaf.
1496 if (le32_to_cpu(el->l_recs[i].e_cpos) < 3010 */
1497 new_i_clusters) { 3011 } else {
1498 block = le64_to_cpu(el->l_recs[i].e_blkno); 3012 if (next_free > 1)
1499 break; 3013 goto out;
1500 } 3014
3015 rec = &el->l_recs[0];
3016 }
3017
3018 if (rec) {
3019 /*
3020 * Check that we'll only be trimming off the end of this
3021 * cluster.
3022 */
3023 if (le16_to_cpu(rec->e_leaf_clusters) > clusters_to_del)
3024 goto out;
3025 }
3026
3027 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos);
3028 if (ret) {
3029 mlog_errno(ret);
3030 goto out;
3031 }
3032
3033 ret = ocfs2_find_leaf(inode, path_root_el(path), cpos, &bh);
3034 if (ret) {
3035 mlog_errno(ret);
3036 goto out;
3037 }
3038
3039 eb = (struct ocfs2_extent_block *) bh->b_data;
3040 el = &eb->h_list;
3041 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
3042 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
3043 ret = -EROFS;
3044 goto out;
3045 }
3046
3047 *new_last_eb = bh;
3048 get_bh(*new_last_eb);
3049 mlog(0, "returning block %llu, (cpos: %u)\n",
3050 (unsigned long long)le64_to_cpu(eb->h_blkno), cpos);
3051out:
3052 brelse(bh);
3053
3054 return ret;
3055}
3056
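/*
 * A user-space model of the emptiness test above (illustrative types;
 * slot 0 may hold the reserved empty extent, clusters == 0):
 */
#include <stdint.h>

struct leaf_rec { uint32_t cpos; uint16_t clusters; };

static int leaf_becomes_empty(const struct leaf_rec *recs, int next_free,
			      uint32_t clusters_to_del)
{
	const struct leaf_rec *rec = NULL;

	if (recs[0].clusters == 0) {
		if (next_free > 2)
			return 0;	/* other live extents remain */
		if (next_free == 2)
			rec = &recs[1];
	} else {
		if (next_free > 1)
			return 0;
		rec = &recs[0];
	}
	/* trimming only part of the survivor keeps the leaf alive */
	if (rec && rec->clusters > clusters_to_del)
		return 0;
	return 1;
}
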
3057/*
3058 * Trim some clusters off the rightmost edge of a tree. Only called
3059 * during truncate.
3060 *
3061 * The caller needs to:
3062 * - start journaling of each path component.
3063 * - compute and fully set up any new last ext block
3064 */
3065static int ocfs2_trim_tree(struct inode *inode, struct ocfs2_path *path,
3066 handle_t *handle, struct ocfs2_truncate_context *tc,
3067 u32 clusters_to_del, u64 *delete_start)
3068{
3069 int ret, i, index = path->p_tree_depth;
3070 u32 new_edge = 0;
3071 u64 deleted_eb = 0;
3072 struct buffer_head *bh;
3073 struct ocfs2_extent_list *el;
3074 struct ocfs2_extent_rec *rec;
3075
3076 *delete_start = 0;
3077
3078 while (index >= 0) {
3079 bh = path->p_node[index].bh;
3080 el = path->p_node[index].el;
3081
3082 mlog(0, "traveling tree (index = %d, block = %llu)\n",
3083 index, (unsigned long long)bh->b_blocknr);
3084
3085 BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
3086
3087 if (index !=
3088 (path->p_tree_depth - le16_to_cpu(el->l_tree_depth))) {
3089 ocfs2_error(inode->i_sb,
3090 "Inode %lu has invalid ext. block %llu",
3091 inode->i_ino,
3092 (unsigned long long)bh->b_blocknr);
3093 ret = -EROFS;
3094 goto out;
1501 } 3095 }
1502 BUG_ON(i < 0);
1503 3096
1504 if (bh) { 3097find_tail_record:
1505 brelse(bh); 3098 i = le16_to_cpu(el->l_next_free_rec) - 1;
1506 bh = NULL; 3099 rec = &el->l_recs[i];
3100
3101 mlog(0, "Extent list before: record %d: (%u, %u, %llu), "
3102 "next = %u\n", i, le32_to_cpu(rec->e_cpos),
3103 ocfs2_rec_clusters(el, rec),
3104 (unsigned long long)le64_to_cpu(rec->e_blkno),
3105 le16_to_cpu(el->l_next_free_rec));
3106
3107 BUG_ON(ocfs2_rec_clusters(el, rec) < clusters_to_del);
3108
3109 if (le16_to_cpu(el->l_tree_depth) == 0) {
3110 /*
3111 * If the leaf block contains a single empty
3112 * extent and no records, we can just remove
3113 * the block.
3114 */
3115 if (i == 0 && ocfs2_is_empty_extent(rec)) {
3116 memset(rec, 0,
3117 sizeof(struct ocfs2_extent_rec));
3118 el->l_next_free_rec = cpu_to_le16(0);
3119
3120 goto delete;
3121 }
3122
3123 /*
3124 * Remove any empty extents by shifting things
3125 * left. That should make life much easier on
3126 * the code below. This condition is rare
3127 * enough that we shouldn't see a performance
3128 * hit.
3129 */
3130 if (ocfs2_is_empty_extent(&el->l_recs[0])) {
3131 le16_add_cpu(&el->l_next_free_rec, -1);
3132
3133 for(i = 0;
3134 i < le16_to_cpu(el->l_next_free_rec); i++)
3135 el->l_recs[i] = el->l_recs[i + 1];
3136
3137 memset(&el->l_recs[i], 0,
3138 sizeof(struct ocfs2_extent_rec));
3139
3140 /*
3141 * We've modified our extent list. The
3142 * simplest way to handle this change
3143 * is to begin the search from the
3144 * start again.
3145 */
3146 goto find_tail_record;
3147 }
3148
3149 le16_add_cpu(&rec->e_leaf_clusters, -clusters_to_del);
3150
3151 /*
3152 * We'll use "new_edge" on our way back up the
3153 * tree to know what our rightmost cpos is.
3154 */
3155 new_edge = le16_to_cpu(rec->e_leaf_clusters);
3156 new_edge += le32_to_cpu(rec->e_cpos);
3157
3158 /*
3159 * The caller will use this to delete data blocks.
3160 */
3161 *delete_start = le64_to_cpu(rec->e_blkno)
3162 + ocfs2_clusters_to_blocks(inode->i_sb,
3163 le16_to_cpu(rec->e_leaf_clusters));
3164
3165 /*
3166 * If it's now empty, remove this record.
3167 */
3168 if (le16_to_cpu(rec->e_leaf_clusters) == 0) {
3169 memset(rec, 0,
3170 sizeof(struct ocfs2_extent_rec));
3171 le16_add_cpu(&el->l_next_free_rec, -1);
3172 }
3173 } else {
3174 if (le64_to_cpu(rec->e_blkno) == deleted_eb) {
3175 memset(rec, 0,
3176 sizeof(struct ocfs2_extent_rec));
3177 le16_add_cpu(&el->l_next_free_rec, -1);
3178
3179 goto delete;
3180 }
3181
3182 /* Can this actually happen? */
3183 if (le16_to_cpu(el->l_next_free_rec) == 0)
3184 goto delete;
3185
3186 /*
3187 * We never actually deleted any clusters
3188 * because our leaf was empty. There's no
3189 * reason to adjust the rightmost edge then.
3190 */
3191 if (new_edge == 0)
3192 goto delete;
3193
3194 rec->e_int_clusters = cpu_to_le32(new_edge);
3195 le32_add_cpu(&rec->e_int_clusters,
3196 -le32_to_cpu(rec->e_cpos));
3197
3198 /*
3199 * A deleted child record should have been
3200 * caught above.
3201 */
3202 BUG_ON(le32_to_cpu(rec->e_int_clusters) == 0);
1507 } 3203 }
1508 3204
1509 status = ocfs2_read_block(osb, block, &bh, OCFS2_BH_CACHED, 3205delete:
1510 inode); 3206 ret = ocfs2_journal_dirty(handle, bh);
1511 if (status < 0) { 3207 if (ret) {
1512 mlog_errno(status); 3208 mlog_errno(ret);
1513 goto bail; 3209 goto out;
1514 } 3210 }
1515 eb = (struct ocfs2_extent_block *) bh->b_data; 3211
1516 el = &eb->h_list; 3212 mlog(0, "extent list container %llu, after: record %d: "
1517 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { 3213 "(%u, %u, %llu), next = %u.\n",
1518 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); 3214 (unsigned long long)bh->b_blocknr, i,
1519 status = -EIO; 3215 le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec),
1520 goto bail; 3216 (unsigned long long)le64_to_cpu(rec->e_blkno),
3217 le16_to_cpu(el->l_next_free_rec));
3218
3219 /*
3220 * We must be careful to only attempt delete of an
3221 * extent block (and not the root inode block).
3222 */
3223 if (index > 0 && le16_to_cpu(el->l_next_free_rec) == 0) {
3224 struct ocfs2_extent_block *eb =
3225 (struct ocfs2_extent_block *)bh->b_data;
3226
3227 /*
3228 * Save this for use when processing the
3229 * parent block.
3230 */
3231 deleted_eb = le64_to_cpu(eb->h_blkno);
3232
3233 mlog(0, "deleting this extent block.\n");
3234
3235 ocfs2_remove_from_cache(inode, bh);
3236
3237 BUG_ON(ocfs2_rec_clusters(el, &el->l_recs[0]));
3238 BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos));
3239 BUG_ON(le64_to_cpu(el->l_recs[0].e_blkno));
3240
3241 if (le16_to_cpu(eb->h_suballoc_slot) == 0) {
3242 /*
3243 * This code only understands how to
3244 * lock the suballocator in slot 0,
3245 * which is fine because allocation is
3246 * only ever done out of that
3247 * suballocator too. A future version
3248 * might change that however, so avoid
3249 * a free if we don't know how to
3250 * handle it. This way an fs incompat
3251 * bit will not be necessary.
3252 */
3253 ret = ocfs2_free_extent_block(handle,
3254 tc->tc_ext_alloc_inode,
3255 tc->tc_ext_alloc_bh,
3256 eb);
3257
3258 /* An error here is not fatal. */
3259 if (ret < 0)
3260 mlog_errno(ret);
3261 }
3262 } else {
3263 deleted_eb = 0;
1521 } 3264 }
1522 } while (el->l_tree_depth);
1523 3265
1524 *new_last_eb = bh; 3266 index--;
1525 get_bh(*new_last_eb); 3267 }
1526 mlog(0, "returning block %llu\n",
1527 (unsigned long long)le64_to_cpu(eb->h_blkno));
1528bail:
1529 if (bh)
1530 brelse(bh);
1531 3268
1532 return status; 3269 ret = 0;
3270out:
3271 return ret;
1533} 3272}
1534 3273
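/*
 * Arithmetic sketch for the leaf case above; clusters_to_blocks() is a
 * stand-in (8 blocks per cluster assumed purely for illustration):
 */
#include <stdint.h>

static uint64_t clusters_to_blocks(uint32_t clusters)
{
	return (uint64_t)clusters * 8;	/* assumed ratio */
}

static void trim_leaf_rec(uint32_t cpos, uint16_t *clusters, uint64_t blkno,
			  uint32_t clusters_to_del,
			  uint32_t *new_edge, uint64_t *delete_start)
{
	*clusters -= clusters_to_del;	/* shorten the tail record */
	*new_edge = cpos + *clusters;	/* new rightmost cpos */
	/* first block past the shortened record; data from here is freed */
	*delete_start = blkno + clusters_to_blocks(*clusters);
}
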
1535static int ocfs2_do_truncate(struct ocfs2_super *osb, 3274static int ocfs2_do_truncate(struct ocfs2_super *osb,
1536 unsigned int clusters_to_del, 3275 unsigned int clusters_to_del,
1537 struct inode *inode, 3276 struct inode *inode,
1538 struct buffer_head *fe_bh, 3277 struct buffer_head *fe_bh,
1539 struct buffer_head *old_last_eb_bh,
1540 handle_t *handle, 3278 handle_t *handle,
1541 struct ocfs2_truncate_context *tc) 3279 struct ocfs2_truncate_context *tc,
3280 struct ocfs2_path *path)
1542{ 3281{
1543 int status, i, depth; 3282 int status;
1544 struct ocfs2_dinode *fe; 3283 struct ocfs2_dinode *fe;
1545 struct ocfs2_extent_block *eb;
1546 struct ocfs2_extent_block *last_eb = NULL; 3284 struct ocfs2_extent_block *last_eb = NULL;
1547 struct ocfs2_extent_list *el; 3285 struct ocfs2_extent_list *el;
1548 struct buffer_head *eb_bh = NULL;
1549 struct buffer_head *last_eb_bh = NULL; 3286 struct buffer_head *last_eb_bh = NULL;
1550 u64 next_eb = 0;
1551 u64 delete_blk = 0; 3287 u64 delete_blk = 0;
1552 3288
1553 fe = (struct ocfs2_dinode *) fe_bh->b_data; 3289 fe = (struct ocfs2_dinode *) fe_bh->b_data;
1554 3290
1555 status = ocfs2_find_new_last_ext_blk(osb, 3291 status = ocfs2_find_new_last_ext_blk(inode, clusters_to_del,
1556 inode, 3292 path, &last_eb_bh);
1557 fe,
1558 le32_to_cpu(fe->i_clusters) -
1559 clusters_to_del,
1560 old_last_eb_bh,
1561 &last_eb_bh);
1562 if (status < 0) { 3293 if (status < 0) {
1563 mlog_errno(status); 3294 mlog_errno(status);
1564 goto bail; 3295 goto bail;
1565 } 3296 }
1566 if (last_eb_bh)
1567 last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
1568 3297
1569 status = ocfs2_journal_access(handle, inode, fe_bh, 3298 /*
1570 OCFS2_JOURNAL_ACCESS_WRITE); 3299 * Each component will be touched, so we might as well journal
3300 * here to avoid having to handle errors later.
3301 */
3302 status = ocfs2_journal_access_path(inode, handle, path);
1571 if (status < 0) { 3303 if (status < 0) {
1572 mlog_errno(status); 3304 mlog_errno(status);
1573 goto bail; 3305 goto bail;
1574 } 3306 }
3307
3308 if (last_eb_bh) {
3309 status = ocfs2_journal_access(handle, inode, last_eb_bh,
3310 OCFS2_JOURNAL_ACCESS_WRITE);
3311 if (status < 0) {
3312 mlog_errno(status);
3313 goto bail;
3314 }
3315
3316 last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
3317 }
3318
1575 el = &(fe->id2.i_list); 3319 el = &(fe->id2.i_list);
1576 3320
3321 /*
3322 * Lower levels depend on this never happening, but it's best
3323 * to check it up here before changing the tree.
3324 */
3325 if (el->l_tree_depth && el->l_recs[0].e_int_clusters == 0) {
3326 ocfs2_error(inode->i_sb,
3327 "Inode %lu has an empty extent record, depth %u\n",
3328 inode->i_ino, le16_to_cpu(el->l_tree_depth));
3329 status = -EROFS;
3330 goto bail;
3331 }
3332
1577 spin_lock(&OCFS2_I(inode)->ip_lock); 3333 spin_lock(&OCFS2_I(inode)->ip_lock);
1578 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) - 3334 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) -
1579 clusters_to_del; 3335 clusters_to_del;
1580 spin_unlock(&OCFS2_I(inode)->ip_lock); 3336 spin_unlock(&OCFS2_I(inode)->ip_lock);
1581 le32_add_cpu(&fe->i_clusters, -clusters_to_del); 3337 le32_add_cpu(&fe->i_clusters, -clusters_to_del);
1582 fe->i_mtime = cpu_to_le64(CURRENT_TIME.tv_sec);
1583 fe->i_mtime_nsec = cpu_to_le32(CURRENT_TIME.tv_nsec);
1584 3338
1585 i = le16_to_cpu(el->l_next_free_rec) - 1; 3339 status = ocfs2_trim_tree(inode, path, handle, tc,
1586 3340 clusters_to_del, &delete_blk);
1587 BUG_ON(le32_to_cpu(el->l_recs[i].e_clusters) < clusters_to_del); 3341 if (status) {
1588 le32_add_cpu(&el->l_recs[i].e_clusters, -clusters_to_del); 3342 mlog_errno(status);
1589 /* tree depth zero, we can just delete the clusters, otherwise 3343 goto bail;
1590 * we need to record the offset of the next level extent block
1591 * as we may overwrite it. */
1592 if (!el->l_tree_depth)
1593 delete_blk = le64_to_cpu(el->l_recs[i].e_blkno)
1594 + ocfs2_clusters_to_blocks(osb->sb,
1595 le32_to_cpu(el->l_recs[i].e_clusters));
1596 else
1597 next_eb = le64_to_cpu(el->l_recs[i].e_blkno);
1598
1599 if (!el->l_recs[i].e_clusters) {
1600 /* if we deleted the whole extent record, then clear
1601 * out the other fields and update the extent
1602 * list. For depth > 0 trees, we've already recorded
1603 * the extent block in 'next_eb' */
1604 el->l_recs[i].e_cpos = 0;
1605 el->l_recs[i].e_blkno = 0;
1606 BUG_ON(!el->l_next_free_rec);
1607 le16_add_cpu(&el->l_next_free_rec, -1);
1608 } 3344 }
1609 3345
1610 depth = le16_to_cpu(el->l_tree_depth); 3346 if (le32_to_cpu(fe->i_clusters) == 0) {
1611 if (!fe->i_clusters) {
1612 /* trunc to zero is a special case. */ 3347 /* trunc to zero is a special case. */
1613 el->l_tree_depth = 0; 3348 el->l_tree_depth = 0;
1614 fe->i_last_eb_blk = 0; 3349 fe->i_last_eb_blk = 0;
@@ -1625,12 +3360,6 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
1625 /* If there will be a new last extent block, then by 3360 /* If there will be a new last extent block, then by
1626 * definition, there cannot be any leaves to the right of 3361 * definition, there cannot be any leaves to the right of
1627 * him. */ 3362 * him. */
1628 status = ocfs2_journal_access(handle, inode, last_eb_bh,
1629 OCFS2_JOURNAL_ACCESS_WRITE);
1630 if (status < 0) {
1631 mlog_errno(status);
1632 goto bail;
1633 }
1634 last_eb->h_next_leaf_blk = 0; 3363 last_eb->h_next_leaf_blk = 0;
1635 status = ocfs2_journal_dirty(handle, last_eb_bh); 3364 status = ocfs2_journal_dirty(handle, last_eb_bh);
1636 if (status < 0) { 3365 if (status < 0) {
@@ -1639,123 +3368,247 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
1639 } 3368 }
1640 } 3369 }
1641 3370
1642 /* if our tree depth > 0, update all the tree blocks below us. */ 3371 if (delete_blk) {
1643 while (depth) { 3372 status = ocfs2_truncate_log_append(osb, handle, delete_blk,
1644 mlog(0, "traveling tree (depth = %d, next_eb = %llu)\n", 3373 clusters_to_del);
1645 depth, (unsigned long long)next_eb);
1646 status = ocfs2_read_block(osb, next_eb, &eb_bh,
1647 OCFS2_BH_CACHED, inode);
1648 if (status < 0) { 3374 if (status < 0) {
1649 mlog_errno(status); 3375 mlog_errno(status);
1650 goto bail; 3376 goto bail;
1651 } 3377 }
1652 eb = (struct ocfs2_extent_block *)eb_bh->b_data; 3378 }
1653 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { 3379 status = 0;
1654 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); 3380bail:
1655 status = -EIO; 3381
1656 goto bail; 3382 mlog_exit(status);
3383 return status;
3384}
3385
3386static int ocfs2_writeback_zero_func(handle_t *handle, struct buffer_head *bh)
3387{
3388 set_buffer_uptodate(bh);
3389 mark_buffer_dirty(bh);
3390 return 0;
3391}
3392
3393static int ocfs2_ordered_zero_func(handle_t *handle, struct buffer_head *bh)
3394{
3395 set_buffer_uptodate(bh);
3396 mark_buffer_dirty(bh);
3397 return ocfs2_journal_dirty_data(handle, bh);
3398}
3399
3400static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t isize,
3401 struct page **pages, int numpages,
3402 u64 phys, handle_t *handle)
3403{
3404 int i, ret, partial = 0;
3405 void *kaddr;
3406 struct page *page;
3407 unsigned int from, to = PAGE_CACHE_SIZE;
3408 struct super_block *sb = inode->i_sb;
3409
3410 BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb)));
3411
3412 if (numpages == 0)
3413 goto out;
3414
3415 from = isize & (PAGE_CACHE_SIZE - 1); /* 1st page offset */
3416 if (PAGE_CACHE_SHIFT > OCFS2_SB(sb)->s_clustersize_bits) {
3417 /*
3418 * Since 'from' has been capped to a value below page
3419 * size, this calculation won't be able to overflow
3420 * 'to'
3421 */
3422 to = ocfs2_align_bytes_to_clusters(sb, from);
3423
3424 /*
3425 * The truncated tail in this case should never span
3426 * more than one page. The loop below also
3427 * assumes this.
3428 */
3429 BUG_ON(numpages != 1);
3430 }
3431
3432 for(i = 0; i < numpages; i++) {
3433 page = pages[i];
3434
3435 BUG_ON(from > PAGE_CACHE_SIZE);
3436 BUG_ON(to > PAGE_CACHE_SIZE);
3437
3438 ret = ocfs2_map_page_blocks(page, &phys, inode, from, to, 0);
3439 if (ret)
3440 mlog_errno(ret);
3441
3442 kaddr = kmap_atomic(page, KM_USER0);
3443 memset(kaddr + from, 0, to - from);
3444 kunmap_atomic(kaddr, KM_USER0);
3445
3446 /*
3447 * Need to set the buffers we zero'd into uptodate
3448 * here if they aren't - ocfs2_map_page_blocks()
3449 * might've skipped some
3450 */
3451 if (ocfs2_should_order_data(inode)) {
3452 ret = walk_page_buffers(handle,
3453 page_buffers(page),
3454 from, to, &partial,
3455 ocfs2_ordered_zero_func);
3456 if (ret < 0)
3457 mlog_errno(ret);
3458 } else {
3459 ret = walk_page_buffers(handle, page_buffers(page),
3460 from, to, &partial,
3461 ocfs2_writeback_zero_func);
3462 if (ret < 0)
3463 mlog_errno(ret);
1657 } 3464 }
1658 el = &(eb->h_list);
1659 3465
1660 status = ocfs2_journal_access(handle, inode, eb_bh, 3466 if (!partial)
1661 OCFS2_JOURNAL_ACCESS_WRITE); 3467 SetPageUptodate(page);
1662 if (status < 0) { 3468
1663 mlog_errno(status); 3469 flush_dcache_page(page);
1664 goto bail; 3470
3471 /*
3472 * Every page after the 1st one should be completely zero'd.
3473 */
3474 from = 0;
3475 }
3476out:
3477 if (pages) {
3478 for (i = 0; i < numpages; i++) {
3479 page = pages[i];
3480 unlock_page(page);
3481 mark_page_accessed(page);
3482 page_cache_release(page);
1665 } 3483 }
3484 }
3485}
1666 3486
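/*
 * A sketch of the zero-range math above (power-of-two sizes assumed):
 * 'from' is i_size's offset within its page; 'to' stops at the cluster
 * end when clusters are smaller than a page, else at the page end.
 */
#include <stdint.h>

static uint32_t zero_from(uint64_t isize, uint32_t page_size)
{
	return (uint32_t)(isize & (page_size - 1));
}

static uint32_t zero_to(uint32_t from, uint32_t page_size, uint32_t csize)
{
	if (csize < page_size)
		return (from + csize - 1) & ~(csize - 1);
	return page_size;
}
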
1667 BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0); 3487static int ocfs2_grab_eof_pages(struct inode *inode, loff_t isize, struct page **pages,
1668 BUG_ON(depth != (le16_to_cpu(el->l_tree_depth) + 1)); 3488 int *num, u64 *phys)
3489{
3490 int i, numpages = 0, ret = 0;
3491 unsigned int csize = OCFS2_SB(inode->i_sb)->s_clustersize;
3492 unsigned int ext_flags;
3493 struct super_block *sb = inode->i_sb;
3494 struct address_space *mapping = inode->i_mapping;
3495 unsigned long index;
3496 u64 next_cluster_bytes;
3497
3498 BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb)));
3499
3500 /* Cluster boundary, so we don't need to grab any pages. */
3501 if ((isize & (csize - 1)) == 0)
3502 goto out;
1669 3503
1670 i = le16_to_cpu(el->l_next_free_rec) - 1; 3504 ret = ocfs2_extent_map_get_blocks(inode, isize >> sb->s_blocksize_bits,
3505 phys, NULL, &ext_flags);
3506 if (ret) {
3507 mlog_errno(ret);
3508 goto out;
3509 }
1671 3510
1672 mlog(0, "extent block %llu, before: record %d: " 3511 /* Tail is a hole. */
1673 "(%u, %u, %llu), next = %u\n", 3512 if (*phys == 0)
1674 (unsigned long long)le64_to_cpu(eb->h_blkno), i, 3513 goto out;
1675 le32_to_cpu(el->l_recs[i].e_cpos),
1676 le32_to_cpu(el->l_recs[i].e_clusters),
1677 (unsigned long long)le64_to_cpu(el->l_recs[i].e_blkno),
1678 le16_to_cpu(el->l_next_free_rec));
1679 3514
1680 BUG_ON(le32_to_cpu(el->l_recs[i].e_clusters) < clusters_to_del); 3515 /* Tail is marked as unwritten, we can count on write to zero
1681 le32_add_cpu(&el->l_recs[i].e_clusters, -clusters_to_del); 3516 * in that case. */
1682 3517 if (ext_flags & OCFS2_EXT_UNWRITTEN)
1683 next_eb = le64_to_cpu(el->l_recs[i].e_blkno); 3518 goto out;
1684 /* bottom-most block requires us to delete data.*/
1685 if (!el->l_tree_depth)
1686 delete_blk = le64_to_cpu(el->l_recs[i].e_blkno)
1687 + ocfs2_clusters_to_blocks(osb->sb,
1688 le32_to_cpu(el->l_recs[i].e_clusters));
1689 if (!el->l_recs[i].e_clusters) {
1690 el->l_recs[i].e_cpos = 0;
1691 el->l_recs[i].e_blkno = 0;
1692 BUG_ON(!el->l_next_free_rec);
1693 le16_add_cpu(&el->l_next_free_rec, -1);
1694 }
1695 mlog(0, "extent block %llu, after: record %d: "
1696 "(%u, %u, %llu), next = %u\n",
1697 (unsigned long long)le64_to_cpu(eb->h_blkno), i,
1698 le32_to_cpu(el->l_recs[i].e_cpos),
1699 le32_to_cpu(el->l_recs[i].e_clusters),
1700 (unsigned long long)le64_to_cpu(el->l_recs[i].e_blkno),
1701 le16_to_cpu(el->l_next_free_rec));
1702 3519
1703 status = ocfs2_journal_dirty(handle, eb_bh); 3520 next_cluster_bytes = ocfs2_align_bytes_to_clusters(inode->i_sb, isize);
1704 if (status < 0) { 3521 index = isize >> PAGE_CACHE_SHIFT;
1705 mlog_errno(status); 3522 do {
1706 goto bail; 3523 pages[numpages] = grab_cache_page(mapping, index);
3524 if (!pages[numpages]) {
3525 ret = -ENOMEM;
3526 mlog_errno(ret);
3527 goto out;
1707 } 3528 }
1708 3529
1709 if (!el->l_next_free_rec) { 3530 numpages++;
1710 mlog(0, "deleting this extent block.\n"); 3531 index++;
1711 3532 } while (index < (next_cluster_bytes >> PAGE_CACHE_SHIFT));
1712 ocfs2_remove_from_cache(inode, eb_bh);
1713 3533
1714 BUG_ON(el->l_recs[0].e_clusters); 3534out:
1715 BUG_ON(el->l_recs[0].e_cpos); 3535 if (ret != 0) {
1716 BUG_ON(el->l_recs[0].e_blkno); 3536 if (pages) {
1717 if (eb->h_suballoc_slot == 0) { 3537 for (i = 0; i < numpages; i++) {
1718 /* 3538 if (pages[i]) {
1719 * This code only understands how to 3539 unlock_page(pages[i]);
1720 * lock the suballocator in slot 0, 3540 page_cache_release(pages[i]);
1721 * which is fine because allocation is
1722 * only ever done out of that
1723 * suballocator too. A future version
1724 * might change that however, so avoid
1725 * a free if we don't know how to
1726 * handle it. This way an fs incompat
1727 * bit will not be necessary.
1728 */
1729 status = ocfs2_free_extent_block(handle,
1730 tc->tc_ext_alloc_inode,
1731 tc->tc_ext_alloc_bh,
1732 eb);
1733 if (status < 0) {
1734 mlog_errno(status);
1735 goto bail;
1736 } 3541 }
1737 } 3542 }
1738 } 3543 }
1739 brelse(eb_bh); 3544 numpages = 0;
1740 eb_bh = NULL;
1741 depth--;
1742 } 3545 }
1743 3546
1744 BUG_ON(!delete_blk); 3547 *num = numpages;
1745 status = ocfs2_truncate_log_append(osb, handle, delete_blk, 3548
1746 clusters_to_del); 3549 return ret;
1747 if (status < 0) { 3550}
1748 mlog_errno(status); 3551
1749 goto bail; 3552/*
3553 * Zero the area past i_size but still within an allocated
3554 * cluster. This avoids exposing nonzero data on subsequent file
3555 * extends.
3556 *
3557 * We need to call this before i_size is updated on the inode because
3558 * otherwise block_write_full_page() will skip writeout of pages past
3559 * i_size. The new_i_size parameter is passed for this reason.
3560 */
3561int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle,
3562 u64 new_i_size)
3563{
3564 int ret, numpages;
3565 loff_t endbyte;
3566 struct page **pages = NULL;
3567 u64 phys;
3568
3569 /*
3570 * File systems which don't support sparse files zero on every
3571 * extend.
3572 */
3573 if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
3574 return 0;
3575
3576 pages = kcalloc(ocfs2_pages_per_cluster(inode->i_sb),
3577 sizeof(struct page *), GFP_NOFS);
3578 if (pages == NULL) {
3579 ret = -ENOMEM;
3580 mlog_errno(ret);
3581 goto out;
1750 } 3582 }
1751 status = 0; 3583
1752bail: 3584 ret = ocfs2_grab_eof_pages(inode, new_i_size, pages, &numpages, &phys);
1753 if (!status) 3585 if (ret) {
1754 ocfs2_extent_map_trunc(inode, le32_to_cpu(fe->i_clusters)); 3586 mlog_errno(ret);
1755 else 3587 goto out;
1756 ocfs2_extent_map_drop(inode, 0); 3588 }
1757 mlog_exit(status); 3589
1758 return status; 3590 if (numpages == 0)
3591 goto out;
3592
3593 ocfs2_zero_cluster_pages(inode, new_i_size, pages, numpages, phys,
3594 handle);
3595
3596 /*
3597 * Initiate writeout of the pages we zero'd here. We don't
3598 * wait on them - the truncate_inode_pages() call later will
3599 * do that for us.
3600 */
3601 endbyte = ocfs2_align_bytes_to_clusters(inode->i_sb, new_i_size);
3602 ret = do_sync_mapping_range(inode->i_mapping, new_i_size,
3603 endbyte - 1, SYNC_FILE_RANGE_WRITE);
3604 if (ret)
3605 mlog_errno(ret);
3606
3607out:
3608 if (pages)
3609 kfree(pages);
3610
3611 return ret;
1759} 3612}
1760 3613
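/*
 * Worked example of the range zeroed above, assuming a 64K cluster:
 * truncating to new_i_size = 70000 zeroes bytes [70000, 131072), i.e.
 * to the end of the cluster that now holds i_size.
 */
#include <stdint.h>

static void tail_zero_range(uint64_t new_size, uint32_t csize,
			    uint64_t *start, uint64_t *end)
{
	*start = new_size;
	*end = (new_size + csize - 1) & ~(uint64_t)(csize - 1);
	/* on a cluster boundary *start == *end and nothing is zeroed */
}
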
1761/* 3614/*
@@ -1770,82 +3623,90 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
1770 struct ocfs2_truncate_context *tc) 3623 struct ocfs2_truncate_context *tc)
1771{ 3624{
1772 int status, i, credits, tl_sem = 0; 3625 int status, i, credits, tl_sem = 0;
1773 u32 clusters_to_del, target_i_clusters; 3626 u32 clusters_to_del, new_highest_cpos, range;
1774 u64 last_eb = 0;
1775 struct ocfs2_dinode *fe;
1776 struct ocfs2_extent_block *eb;
1777 struct ocfs2_extent_list *el; 3627 struct ocfs2_extent_list *el;
1778 struct buffer_head *last_eb_bh;
1779 handle_t *handle = NULL; 3628 handle_t *handle = NULL;
1780 struct inode *tl_inode = osb->osb_tl_inode; 3629 struct inode *tl_inode = osb->osb_tl_inode;
3630 struct ocfs2_path *path = NULL;
1781 3631
1782 mlog_entry_void(); 3632 mlog_entry_void();
1783 3633
1784 down_write(&OCFS2_I(inode)->ip_alloc_sem); 3634 down_write(&OCFS2_I(inode)->ip_alloc_sem);
1785 3635
1786 target_i_clusters = ocfs2_clusters_for_bytes(osb->sb, 3636 new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb,
1787 i_size_read(inode)); 3637 i_size_read(inode));
1788 3638
1789 last_eb_bh = tc->tc_last_eb_bh; 3639 path = ocfs2_new_inode_path(fe_bh);
1790 tc->tc_last_eb_bh = NULL; 3640 if (!path) {
3641 status = -ENOMEM;
3642 mlog_errno(status);
3643 goto bail;
3644 }
1791 3645
1792 fe = (struct ocfs2_dinode *) fe_bh->b_data; 3646 ocfs2_extent_map_trunc(inode, new_highest_cpos);
1793 3647
1794 if (fe->id2.i_list.l_tree_depth) {
1795 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
1796 el = &eb->h_list;
1797 } else
1798 el = &fe->id2.i_list;
1799 last_eb = le64_to_cpu(fe->i_last_eb_blk);
1800start: 3648start:
1801 mlog(0, "ocfs2_commit_truncate: fe->i_clusters = %u, " 3649 /*
1802 "last_eb = %llu, fe->i_last_eb_blk = %llu, " 3650 * Check that we still have allocation to delete.
1803 "fe->id2.i_list.l_tree_depth = %u last_eb_bh = %p\n", 3651 */
1804 le32_to_cpu(fe->i_clusters), (unsigned long long)last_eb, 3652 if (OCFS2_I(inode)->ip_clusters == 0) {
1805 (unsigned long long)le64_to_cpu(fe->i_last_eb_blk), 3653 status = 0;
1806 le16_to_cpu(fe->id2.i_list.l_tree_depth), last_eb_bh); 3654 goto bail;
1807 3655 }
1808 if (last_eb != le64_to_cpu(fe->i_last_eb_blk)) {
1809 mlog(0, "last_eb changed!\n");
1810 BUG_ON(!fe->id2.i_list.l_tree_depth);
1811 last_eb = le64_to_cpu(fe->i_last_eb_blk);
1812 /* i_last_eb_blk may have changed, read it if
1813 * necessary. We don't have to worry about the
1814 * truncate to zero case here (where there becomes no
1815 * last_eb) because we never loop back after our work
1816 * is done. */
1817 if (last_eb_bh) {
1818 brelse(last_eb_bh);
1819 last_eb_bh = NULL;
1820 }
1821 3656
1822 status = ocfs2_read_block(osb, last_eb, 3657 /*
1823 &last_eb_bh, OCFS2_BH_CACHED, 3658 * Truncate always works against the rightmost tree branch.
1824 inode); 3659 */
1825 if (status < 0) { 3660 status = ocfs2_find_path(inode, path, UINT_MAX);
1826 mlog_errno(status); 3661 if (status) {
1827 goto bail; 3662 mlog_errno(status);
1828 } 3663 goto bail;
1829 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; 3664 }
1830 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { 3665
1831 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); 3666 mlog(0, "inode->ip_clusters = %u, tree_depth = %u\n",
1832 status = -EIO; 3667 OCFS2_I(inode)->ip_clusters, path->p_tree_depth);
1833 goto bail; 3668
1834 } 3669 /*
1835 el = &(eb->h_list); 3670 * By now, el will point to the extent list on the bottom most
3671 * portion of this tree. Only the tail record is considered in
3672 * each pass.
3673 *
3674 * We handle the following cases, in order:
3675 * - empty extent: delete the remaining branch
3676 * - remove the entire record
3677 * - remove a partial record
3678 * - no record needs to be removed (truncate has completed)
3679 */
3680 el = path_leaf_el(path);
3681 if (le16_to_cpu(el->l_next_free_rec) == 0) {
3682 ocfs2_error(inode->i_sb,
3683 "Inode %llu has empty extent block at %llu\n",
3684 (unsigned long long)OCFS2_I(inode)->ip_blkno,
3685 (unsigned long long)path_leaf_bh(path)->b_blocknr);
3686 status = -EROFS;
3687 goto bail;
1836 } 3688 }
1837 3689
1838 /* by now, el will point to the extent list on the bottom most
1839 * portion of this tree. */
1840 i = le16_to_cpu(el->l_next_free_rec) - 1; 3690 i = le16_to_cpu(el->l_next_free_rec) - 1;
1841 if (le32_to_cpu(el->l_recs[i].e_cpos) >= target_i_clusters) 3691 range = le32_to_cpu(el->l_recs[i].e_cpos) +
1842 clusters_to_del = le32_to_cpu(el->l_recs[i].e_clusters); 3692 ocfs2_rec_clusters(el, &el->l_recs[i]);
1843 else 3693 if (i == 0 && ocfs2_is_empty_extent(&el->l_recs[i])) {
1844 clusters_to_del = (le32_to_cpu(el->l_recs[i].e_clusters) + 3694 clusters_to_del = 0;
3695 } else if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_highest_cpos) {
3696 clusters_to_del = ocfs2_rec_clusters(el, &el->l_recs[i]);
3697 } else if (range > new_highest_cpos) {
3698 clusters_to_del = (ocfs2_rec_clusters(el, &el->l_recs[i]) +
1845 le32_to_cpu(el->l_recs[i].e_cpos)) - 3699 le32_to_cpu(el->l_recs[i].e_cpos)) -
1846 target_i_clusters; 3700 new_highest_cpos;
3701 } else {
3702 status = 0;
3703 goto bail;
3704 }
3705
3706 mlog(0, "clusters_to_del = %u in this pass, tail blk=%llu\n",
3707 clusters_to_del, (unsigned long long)path_leaf_bh(path)->b_blocknr);
1847 3708
1848 mlog(0, "clusters_to_del = %u in this pass\n", clusters_to_del); 3709 BUG_ON(clusters_to_del == 0);
1849 3710
1850 mutex_lock(&tl_inode->i_mutex); 3711 mutex_lock(&tl_inode->i_mutex);
1851 tl_sem = 1; 3712 tl_sem = 1;
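
/*
 * A model of the per-pass clusters_to_del choice above (the empty
 * extent is approximated by clusters == 0; names are illustrative):
 */
#include <stdint.h>

static uint32_t pick_clusters_to_del(uint32_t cpos, uint32_t clusters,
				     uint32_t new_highest_cpos, int *done)
{
	uint32_t range = cpos + clusters;

	*done = 0;
	if (clusters == 0)
		return 0;			/* empty extent: drop branch */
	if (cpos >= new_highest_cpos)
		return clusters;		/* whole record goes */
	if (range > new_highest_cpos)
		return range - new_highest_cpos; /* partial record */
	*done = 1;				/* truncate has completed */
	return 0;
}
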
@@ -1861,7 +3722,8 @@ start:
1861 } 3722 }
1862 3723
1863 credits = ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del, 3724 credits = ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del,
1864 fe, el); 3725 (struct ocfs2_dinode *)fe_bh->b_data,
3726 el);
1865 handle = ocfs2_start_trans(osb, credits); 3727 handle = ocfs2_start_trans(osb, credits);
1866 if (IS_ERR(handle)) { 3728 if (IS_ERR(handle)) {
1867 status = PTR_ERR(handle); 3729 status = PTR_ERR(handle);
@@ -1870,13 +3732,8 @@ start:
1870 goto bail; 3732 goto bail;
1871 } 3733 }
1872 3734
1873 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 3735 status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh, handle,
1874 status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); 3736 tc, path);
1875 if (status < 0)
1876 mlog_errno(status);
1877
1878 status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh,
1879 last_eb_bh, handle, tc);
1880 if (status < 0) { 3737 if (status < 0) {
1881 mlog_errno(status); 3738 mlog_errno(status);
1882 goto bail; 3739 goto bail;
@@ -1888,9 +3745,14 @@ start:
1888 ocfs2_commit_trans(osb, handle); 3745 ocfs2_commit_trans(osb, handle);
1889 handle = NULL; 3746 handle = NULL;
1890 3747
1891 BUG_ON(le32_to_cpu(fe->i_clusters) < target_i_clusters); 3748 ocfs2_reinit_path(path, 1);
1892 if (le32_to_cpu(fe->i_clusters) > target_i_clusters) 3749
1893 goto start; 3750 /*
3751 * The check above will catch the case where we've truncated
3752 * away all allocation.
3753 */
3754 goto start;
3755
1894bail: 3756bail:
1895 up_write(&OCFS2_I(inode)->ip_alloc_sem); 3757 up_write(&OCFS2_I(inode)->ip_alloc_sem);
1896 3758
@@ -1902,8 +3764,7 @@ bail:
1902 if (handle) 3764 if (handle)
1903 ocfs2_commit_trans(osb, handle); 3765 ocfs2_commit_trans(osb, handle);
1904 3766
1905 if (last_eb_bh) 3767 ocfs2_free_path(path);
1906 brelse(last_eb_bh);
1907 3768
1908 /* This will drop the ext_alloc cluster lock for us */ 3769 /* This will drop the ext_alloc cluster lock for us */
1909 ocfs2_free_truncate_context(tc); 3770 ocfs2_free_truncate_context(tc);
@@ -1912,7 +3773,6 @@ bail:
1912 return status; 3773 return status;
1913} 3774}
1914 3775
1915
1916/* 3776/*
1917 * Expects the inode to already be locked. This will figure out which 3777 * Expects the inode to already be locked. This will figure out which
1918 * inodes need to be locked and will put them on the returned truncate 3778 * inodes need to be locked and will put them on the returned truncate
@@ -1923,7 +3783,7 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb,
1923 struct buffer_head *fe_bh, 3783 struct buffer_head *fe_bh,
1924 struct ocfs2_truncate_context **tc) 3784 struct ocfs2_truncate_context **tc)
1925{ 3785{
1926 int status, metadata_delete; 3786 int status, metadata_delete, i;
1927 unsigned int new_i_clusters; 3787 unsigned int new_i_clusters;
1928 struct ocfs2_dinode *fe; 3788 struct ocfs2_dinode *fe;
1929 struct ocfs2_extent_block *eb; 3789 struct ocfs2_extent_block *eb;
@@ -1941,23 +3801,8 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb,
1941 fe = (struct ocfs2_dinode *) fe_bh->b_data; 3801 fe = (struct ocfs2_dinode *) fe_bh->b_data;
1942 3802
1943 mlog(0, "fe->i_clusters = %u, new_i_clusters = %u, fe->i_size =" 3803 mlog(0, "fe->i_clusters = %u, new_i_clusters = %u, fe->i_size ="
1944 "%llu\n", fe->i_clusters, new_i_clusters, 3804 "%llu\n", le32_to_cpu(fe->i_clusters), new_i_clusters,
1945 (unsigned long long)fe->i_size); 3805 (unsigned long long)le64_to_cpu(fe->i_size));
1946
1947 if (le32_to_cpu(fe->i_clusters) <= new_i_clusters) {
1948 ocfs2_error(inode->i_sb, "Dinode %llu has cluster count "
1949 "%u and size %llu whereas struct inode has "
1950 "cluster count %u and size %llu which caused an "
1951 "invalid truncate to %u clusters.",
1952 (unsigned long long)le64_to_cpu(fe->i_blkno),
1953 le32_to_cpu(fe->i_clusters),
1954 (unsigned long long)le64_to_cpu(fe->i_size),
1955 OCFS2_I(inode)->ip_clusters, i_size_read(inode),
1956 new_i_clusters);
1957 mlog_meta_lvb(ML_ERROR, &OCFS2_I(inode)->ip_meta_lockres);
1958 status = -EIO;
1959 goto bail;
1960 }
1961 3806
1962 *tc = kzalloc(sizeof(struct ocfs2_truncate_context), GFP_KERNEL); 3807 *tc = kzalloc(sizeof(struct ocfs2_truncate_context), GFP_KERNEL);
1963 if (!(*tc)) { 3808 if (!(*tc)) {
@@ -1986,7 +3831,15 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb,
1986 goto bail; 3831 goto bail;
1987 } 3832 }
1988 el = &(eb->h_list); 3833 el = &(eb->h_list);
1989 if (le32_to_cpu(el->l_recs[0].e_cpos) >= new_i_clusters) 3834
3835 i = 0;
3836 if (ocfs2_is_empty_extent(&el->l_recs[0]))
3837 i = 1;
3838 /*
3839 * XXX: Should we check that next_free_rec contains
3840 * the extent?
3841 */
3842 if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_i_clusters)
1990 metadata_delete = 1; 3843 metadata_delete = 1;
1991 } 3844 }
1992 3845
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 0b82e8044325..fbcb5934a081 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -31,7 +31,8 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
31 handle_t *handle, 31 handle_t *handle,
32 struct inode *inode, 32 struct inode *inode,
33 struct buffer_head *fe_bh, 33 struct buffer_head *fe_bh,
34 u64 blkno, 34 u32 cpos,
35 u64 start_blk,
35 u32 new_clusters, 36 u32 new_clusters,
36 struct ocfs2_alloc_context *meta_ac); 37 struct ocfs2_alloc_context *meta_ac);
37int ocfs2_num_free_extents(struct ocfs2_super *osb, 38int ocfs2_num_free_extents(struct ocfs2_super *osb,
@@ -70,6 +71,8 @@ struct ocfs2_truncate_context {
70 struct buffer_head *tc_last_eb_bh; 71 struct buffer_head *tc_last_eb_bh;
71}; 72};
72 73
74int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle,
75 u64 new_i_size);
73int ocfs2_prepare_truncate(struct ocfs2_super *osb, 76int ocfs2_prepare_truncate(struct ocfs2_super *osb,
74 struct inode *inode, 77 struct inode *inode,
75 struct buffer_head *fe_bh, 78 struct buffer_head *fe_bh,
@@ -79,4 +82,26 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
79 struct buffer_head *fe_bh, 82 struct buffer_head *fe_bh,
80 struct ocfs2_truncate_context *tc); 83 struct ocfs2_truncate_context *tc);
81 84
85int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el,
86 u32 cpos, struct buffer_head **leaf_bh);
87
88/*
89 * Helper function to look at the # of clusters in an extent record.
90 */
91static inline unsigned int ocfs2_rec_clusters(struct ocfs2_extent_list *el,
92 struct ocfs2_extent_rec *rec)
93{
94 /*
95 * Cluster count in extent records is slightly different
96 * between interior nodes and leaf nodes. This is to support
97 * unwritten extents which need a flags field in leaf node
98 * records, thus shrinking the available space for a clusters
99 * field.
100 */
101 if (el->l_tree_depth)
102 return le32_to_cpu(rec->e_int_clusters);
103 else
104 return le16_to_cpu(rec->e_leaf_clusters);
105}
106
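/*
 * A user-space mirror of the helper above, field widths taken from the
 * comment (32-bit interior count; 16-bit leaf count to leave room for
 * flags):
 */
#include <stdint.h>

struct xrec { uint32_t int_clusters; uint16_t leaf_clusters; };

static unsigned int rec_clusters(uint16_t tree_depth, const struct xrec *rec)
{
	return tree_depth ? rec->int_clusters : rec->leaf_clusters;
}
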
82#endif /* OCFS2_ALLOC_H */ 107#endif /* OCFS2_ALLOC_H */
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 875c11443817..8e7cafb5fc6c 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -24,6 +24,8 @@
24#include <linux/highmem.h> 24#include <linux/highmem.h>
25#include <linux/pagemap.h> 25#include <linux/pagemap.h>
26#include <asm/byteorder.h> 26#include <asm/byteorder.h>
27#include <linux/swap.h>
28#include <linux/pipe_fs_i.h>
27 29
28#define MLOG_MASK_PREFIX ML_FILE_IO 30#define MLOG_MASK_PREFIX ML_FILE_IO
29#include <cluster/masklog.h> 31#include <cluster/masklog.h>
@@ -37,6 +39,7 @@
37#include "file.h" 39#include "file.h"
38#include "inode.h" 40#include "inode.h"
39#include "journal.h" 41#include "journal.h"
42#include "suballoc.h"
40#include "super.h" 43#include "super.h"
41#include "symlink.h" 44#include "symlink.h"
42 45
@@ -75,7 +78,8 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
75 78
76 if (!OCFS2_IS_VALID_DINODE(fe)) { 79 if (!OCFS2_IS_VALID_DINODE(fe)) {
77 mlog(ML_ERROR, "Invalid dinode #%llu: signature = %.*s\n", 80 mlog(ML_ERROR, "Invalid dinode #%llu: signature = %.*s\n",
78 (unsigned long long)fe->i_blkno, 7, fe->i_signature); 81 (unsigned long long)le64_to_cpu(fe->i_blkno), 7,
82 fe->i_signature);
79 goto bail; 83 goto bail;
80 } 84 }
81 85
@@ -134,7 +138,9 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock,
134 struct buffer_head *bh_result, int create) 138 struct buffer_head *bh_result, int create)
135{ 139{
136 int err = 0; 140 int err = 0;
141 unsigned int ext_flags;
137 u64 p_blkno, past_eof; 142 u64 p_blkno, past_eof;
143 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
138 144
139 mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode, 145 mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode,
140 (unsigned long long)iblock, bh_result, create); 146 (unsigned long long)iblock, bh_result, create);
@@ -149,17 +155,8 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock,
149 goto bail; 155 goto bail;
150 } 156 }
151 157
152 /* this can happen if another node truncs after our extend! */ 158 err = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, NULL,
153 spin_lock(&OCFS2_I(inode)->ip_lock); 159 &ext_flags);
154 if (iblock >= ocfs2_clusters_to_blocks(inode->i_sb,
155 OCFS2_I(inode)->ip_clusters))
156 err = -EIO;
157 spin_unlock(&OCFS2_I(inode)->ip_lock);
158 if (err)
159 goto bail;
160
161 err = ocfs2_extent_map_get_blocks(inode, iblock, 1, &p_blkno,
162 NULL);
163 if (err) { 160 if (err) {
164 mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, " 161 mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, "
165 "%llu, NULL)\n", err, inode, (unsigned long long)iblock, 162 "%llu, NULL)\n", err, inode, (unsigned long long)iblock,
@@ -167,22 +164,39 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock,
167 goto bail; 164 goto bail;
168 } 165 }
169 166
170 map_bh(bh_result, inode->i_sb, p_blkno);
171
172 if (bh_result->b_blocknr == 0) {
173 err = -EIO;
174 mlog(ML_ERROR, "iblock = %llu p_blkno = %llu blkno=(%llu)\n",
175 (unsigned long long)iblock,
176 (unsigned long long)p_blkno,
177 (unsigned long long)OCFS2_I(inode)->ip_blkno);
178 }
167 /*
168 * ocfs2 never allocates in this function - the only time we
169 * need to use BH_New is when we're extending i_size on a file
170 * system which doesn't support holes, in which case BH_New
171 * allows block_prepare_write() to zero.
172 */
173 mlog_bug_on_msg(create && p_blkno == 0 && ocfs2_sparse_alloc(osb),
174 "ino %lu, iblock %llu\n", inode->i_ino,
175 (unsigned long long)iblock);
176
177 /* Treat the unwritten extent as a hole for zeroing purposes. */
178 if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))
179 map_bh(bh_result, inode->i_sb, p_blkno);
180
181 if (!ocfs2_sparse_alloc(osb)) {
182 if (p_blkno == 0) {
183 err = -EIO;
184 mlog(ML_ERROR,
185 "iblock = %llu p_blkno = %llu blkno=(%llu)\n",
186 (unsigned long long)iblock,
187 (unsigned long long)p_blkno,
188 (unsigned long long)OCFS2_I(inode)->ip_blkno);
189 mlog(ML_ERROR, "Size %llu, clusters %u\n", (unsigned long long)i_size_read(inode), OCFS2_I(inode)->ip_clusters);
190 dump_stack();
191 }
179 192
180 past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); 193 past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
181 mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino, 194 mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino,
182 (unsigned long long)past_eof); 195 (unsigned long long)past_eof);
183 196
184 if (create && (iblock >= past_eof)) 197 if (create && (iblock >= past_eof))
185 set_buffer_new(bh_result); 198 set_buffer_new(bh_result);
199 }
186 200
187bail: 201bail:
188 if (err < 0) 202 if (err < 0)
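
/*
 * A sketch of the mapping rule above: a zero physical block or an
 * unwritten extent is reported as a hole so the generic code zeroes
 * it; EXT_UNWRITTEN stands in for the real OCFS2_EXT_UNWRITTEN bit.
 */
#include <stdint.h>

#define EXT_UNWRITTEN 0x01

static int should_map_buffer(uint64_t p_blkno, unsigned int ext_flags)
{
	return p_blkno != 0 && !(ext_flags & EXT_UNWRITTEN);
}
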
@@ -276,8 +290,11 @@ static int ocfs2_writepage(struct page *page, struct writeback_control *wbc)
276 return ret; 290 return ret;
277} 291}
278 292
279/* This can also be called from ocfs2_write_zero_page() which has done 293/*
280 * it's own cluster locking. */ 294 * This is called from ocfs2_write_zero_page() which has handled its
295 * own cluster locking and has ensured allocation exists for those
296 * blocks to be written.
297 */
281int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page, 298int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page,
282 unsigned from, unsigned to) 299 unsigned from, unsigned to)
283{ 300{
@@ -292,44 +309,17 @@ int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page,
292 return ret; 309 return ret;
293} 310}
294 311
295/*
296 * ocfs2_prepare_write() can be an outer-most ocfs2 call when it is called
297 * from loopback. It must be able to perform its own locking around
298 * ocfs2_get_block().
299 */
300static int ocfs2_prepare_write(struct file *file, struct page *page,
301 unsigned from, unsigned to)
302{
303 struct inode *inode = page->mapping->host;
304 int ret;
305
306 mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to);
307
308 ret = ocfs2_meta_lock_with_page(inode, NULL, 0, page);
309 if (ret != 0) {
310 mlog_errno(ret);
311 goto out;
312 }
313
314 ret = ocfs2_prepare_write_nolock(inode, page, from, to);
315
316 ocfs2_meta_unlock(inode, 0);
317out:
318 mlog_exit(ret);
319 return ret;
320}
321
322/* Taken from ext3. We don't necessarily need the full blown 312/* Taken from ext3. We don't necessarily need the full blown
323 * functionality yet, but IMHO it's better to cut and paste the whole 313 * functionality yet, but IMHO it's better to cut and paste the whole
324 * thing so we can avoid introducing our own bugs (and easily pick up 314 * thing so we can avoid introducing our own bugs (and easily pick up
325 * their fixes when they happen) --Mark */ 315 * their fixes when they happen) --Mark */
326static int walk_page_buffers( handle_t *handle, 316int walk_page_buffers( handle_t *handle,
327 struct buffer_head *head, 317 struct buffer_head *head,
328 unsigned from, 318 unsigned from,
329 unsigned to, 319 unsigned to,
330 int *partial, 320 int *partial,
331 int (*fn)( handle_t *handle, 321 int (*fn)( handle_t *handle,
332 struct buffer_head *bh)) 322 struct buffer_head *bh))
333{ 323{
334 struct buffer_head *bh; 324 struct buffer_head *bh;
335 unsigned block_start, block_end; 325 unsigned block_start, block_end;
@@ -388,95 +378,6 @@ out:
388 return handle; 378 return handle;
389} 379}
390 380
391static int ocfs2_commit_write(struct file *file, struct page *page,
392 unsigned from, unsigned to)
393{
394 int ret;
395 struct buffer_head *di_bh = NULL;
396 struct inode *inode = page->mapping->host;
397 handle_t *handle = NULL;
398 struct ocfs2_dinode *di;
399
400 mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to);
401
402 /* NOTE: ocfs2_file_aio_write has ensured that it's safe for
403 * us to continue here without rechecking the I/O against
404 * changed inode values.
405 *
406 * 1) We're currently holding the inode alloc lock, so no
407 * nodes can change it underneath us.
408 *
409 * 2) We've had to take the metadata lock at least once
410 * already to check for extending writes, suid removal, etc.
411 * The meta data update code then ensures that we don't get a
412 * stale inode allocation image (i_size, i_clusters, etc).
413 */
414
415 ret = ocfs2_meta_lock_with_page(inode, &di_bh, 1, page);
416 if (ret != 0) {
417 mlog_errno(ret);
418 goto out;
419 }
420
421 ret = ocfs2_data_lock_with_page(inode, 1, page);
422 if (ret != 0) {
423 mlog_errno(ret);
424 goto out_unlock_meta;
425 }
426
427 handle = ocfs2_start_walk_page_trans(inode, page, from, to);
428 if (IS_ERR(handle)) {
429 ret = PTR_ERR(handle);
430 goto out_unlock_data;
431 }
432
433 /* Mark our buffer early. We'd rather catch this error up here
434 * as opposed to after a successful commit_write which would
435 * require us to set back inode->i_size. */
436 ret = ocfs2_journal_access(handle, inode, di_bh,
437 OCFS2_JOURNAL_ACCESS_WRITE);
438 if (ret < 0) {
439 mlog_errno(ret);
440 goto out_commit;
441 }
442
443 /* might update i_size */
444 ret = generic_commit_write(file, page, from, to);
445 if (ret < 0) {
446 mlog_errno(ret);
447 goto out_commit;
448 }
449
450 di = (struct ocfs2_dinode *)di_bh->b_data;
451
452 /* ocfs2_mark_inode_dirty() is too heavy to use here. */
453 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
454 di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
455 di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
456
457 inode->i_blocks = ocfs2_align_bytes_to_sectors((u64)(i_size_read(inode)));
458 di->i_size = cpu_to_le64((u64)i_size_read(inode));
459
460 ret = ocfs2_journal_dirty(handle, di_bh);
461 if (ret < 0) {
462 mlog_errno(ret);
463 goto out_commit;
464 }
465
466out_commit:
467 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
468out_unlock_data:
469 ocfs2_data_unlock(inode, 1);
470out_unlock_meta:
471 ocfs2_meta_unlock(inode, 1);
472out:
473 if (di_bh)
474 brelse(di_bh);
475
476 mlog_exit(ret);
477 return ret;
478}
479
480static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block) 381static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
481{ 382{
482 sector_t status; 383 sector_t status;
@@ -499,8 +400,7 @@ static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
499 down_read(&OCFS2_I(inode)->ip_alloc_sem); 400 down_read(&OCFS2_I(inode)->ip_alloc_sem);
500 } 401 }
501 402
502 err = ocfs2_extent_map_get_blocks(inode, block, 1, &p_blkno, 403 err = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL, NULL);
503 NULL);
504 404
505 if (!INODE_JOURNAL(inode)) { 405 if (!INODE_JOURNAL(inode)) {
506 up_read(&OCFS2_I(inode)->ip_alloc_sem); 406 up_read(&OCFS2_I(inode)->ip_alloc_sem);
@@ -540,8 +440,8 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
540 struct buffer_head *bh_result, int create) 440 struct buffer_head *bh_result, int create)
541{ 441{
542 int ret; 442 int ret;
543 u64 p_blkno, inode_blocks; 443 u64 p_blkno, inode_blocks, contig_blocks;
544 int contig_blocks; 444 unsigned int ext_flags;
545 unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits; 445 unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
546 unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits; 446 unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits;
547 447
@@ -549,33 +449,20 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
549 * nicely aligned and of the right size, so there's no need 449 * nicely aligned and of the right size, so there's no need
550 * for us to check any of that. */ 450 * for us to check any of that. */
551 451
552 spin_lock(&OCFS2_I(inode)->ip_lock); 452 inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
553 inode_blocks = ocfs2_clusters_to_blocks(inode->i_sb,
554 OCFS2_I(inode)->ip_clusters);
555
556 /*
557 * For a read which begins past the end of file, we return a hole.
558 */
559 if (!create && (iblock >= inode_blocks)) {
560 spin_unlock(&OCFS2_I(inode)->ip_lock);
561 ret = 0;
562 goto bail;
563 }
564 453
565 /* 454 /*
566 * Any write past EOF is not allowed because we'd be extending. 455 * Any write past EOF is not allowed because we'd be extending.
567 */ 456 */
568 if (create && (iblock + max_blocks) > inode_blocks) { 457 if (create && (iblock + max_blocks) > inode_blocks) {
569 spin_unlock(&OCFS2_I(inode)->ip_lock);
570 ret = -EIO; 458 ret = -EIO;
571 goto bail; 459 goto bail;
572 } 460 }
573 spin_unlock(&OCFS2_I(inode)->ip_lock);
574 461
575 /* This figures out the size of the next contiguous block, and 462 /* This figures out the size of the next contiguous block, and
576 * our logical offset */ 463 * our logical offset */
577 ret = ocfs2_extent_map_get_blocks(inode, iblock, 1, &p_blkno, 464 ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
578 &contig_blocks); 465 &contig_blocks, &ext_flags);
579 if (ret) { 466 if (ret) {
580 mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n", 467 mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
581 (unsigned long long)iblock); 468 (unsigned long long)iblock);
@@ -583,7 +470,37 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
583 goto bail; 470 goto bail;
584 } 471 }
585 472
586 map_bh(bh_result, inode->i_sb, p_blkno); 473 if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)) && !p_blkno) {
474 ocfs2_error(inode->i_sb,
475 "Inode %llu has a hole at block %llu\n",
476 (unsigned long long)OCFS2_I(inode)->ip_blkno,
477 (unsigned long long)iblock);
478 ret = -EROFS;
479 goto bail;
480 }
481
482 /*
483 * get_more_blocks() expects us to describe a hole by clearing
484 * the mapped bit on bh_result().
485 *
486 * Consider an unwritten extent as a hole.
487 */
488 if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))
489 map_bh(bh_result, inode->i_sb, p_blkno);
490 else {
491 /*
492 * ocfs2_prepare_inode_for_write() should have caught
493 * the case where we'd be filling a hole and triggered
494 * a buffered write instead.
495 */
496 if (create) {
497 ret = -EIO;
498 mlog_errno(ret);
499 goto bail;
500 }
501
502 clear_buffer_mapped(bh_result);
503 }
587 504
588 /* make sure we don't map more than max_blocks blocks here as 505 /* make sure we don't map more than max_blocks blocks here as
589 that's all the kernel will handle at this point. */ 506 that's all the kernel will handle at this point. */
@@ -606,12 +523,17 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
606 void *private) 523 void *private)
607{ 524{
608 struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; 525 struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
526 int level;
609 527
610 /* this io's submitter should not have unlocked this before we could */ 528 /* this io's submitter should not have unlocked this before we could */
611 BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); 529 BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
530
612 ocfs2_iocb_clear_rw_locked(iocb); 531 ocfs2_iocb_clear_rw_locked(iocb);
613 up_read(&inode->i_alloc_sem); 532
614 ocfs2_rw_unlock(inode, 0); 533 level = ocfs2_iocb_rw_locked_level(iocb);
534 if (!level)
535 up_read(&inode->i_alloc_sem);
536 ocfs2_rw_unlock(inode, level);
615} 537}
616 538
617/* 539/*
@@ -647,23 +569,27 @@ static ssize_t ocfs2_direct_IO(int rw,
647 569
648 mlog_entry_void(); 570 mlog_entry_void();
649 571
650 /* 572 if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) {
651 * We get PR data locks even for O_DIRECT. This allows 573 /*
652 * concurrent O_DIRECT I/O but doesn't let O_DIRECT with 574 * We get PR data locks even for O_DIRECT. This
653 * extending and buffered zeroing writes race. If they did 575 * allows concurrent O_DIRECT I/O but doesn't let
654 * race then the buffered zeroing could be written back after 576 * O_DIRECT with extending and buffered zeroing writes
655 * the O_DIRECT I/O. It's one thing to tell people not to mix 577 * race. If they did race then the buffered zeroing
656 * buffered and O_DIRECT writes, but expecting them to 578 * could be written back after the O_DIRECT I/O. It's
657 * understand that file extension is also an implicit buffered 579 * one thing to tell people not to mix buffered and
658 * write is too much. By getting the PR we force writeback of 580 * O_DIRECT writes, but expecting them to understand
659 * the buffered zeroing before proceeding. 581 * that file extension is also an implicit buffered
660 */ 582 * write is too much. By getting the PR we force
661 ret = ocfs2_data_lock(inode, 0); 583 * writeback of the buffered zeroing before
662 if (ret < 0) { 584 * proceeding.
663 mlog_errno(ret); 585 */
664 goto out; 586 ret = ocfs2_data_lock(inode, 0);
587 if (ret < 0) {
588 mlog_errno(ret);
589 goto out;
590 }
591 ocfs2_data_unlock(inode, 0);
665 } 592 }
666 ocfs2_data_unlock(inode, 0);
667 593
668 ret = blockdev_direct_IO_no_locking(rw, iocb, inode, 594 ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
669 inode->i_sb->s_bdev, iov, offset, 595 inode->i_sb->s_bdev, iov, offset,
@@ -675,11 +601,715 @@ out:
675 return ret; 601 return ret;
676} 602}
677 603
604static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb,
605 u32 cpos,
606 unsigned int *start,
607 unsigned int *end)
608{
609 unsigned int cluster_start = 0, cluster_end = PAGE_CACHE_SIZE;
610
611 if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) {
612 unsigned int cpp;
613
614 cpp = 1 << (PAGE_CACHE_SHIFT - osb->s_clustersize_bits);
615
616 cluster_start = cpos % cpp;
617 cluster_start = cluster_start << osb->s_clustersize_bits;
618
619 cluster_end = cluster_start + osb->s_clustersize;
620 }
621
622 BUG_ON(cluster_start > PAGE_SIZE);
623 BUG_ON(cluster_end > PAGE_SIZE);
624
625 if (start)
626 *start = cluster_start;
627 if (end)
628 *end = cluster_end;
629}
630
631/*
632 * 'from' and 'to' are the region in the page to avoid zeroing.
633 *
634 * If pagesize > clustersize, this function will avoid zeroing outside
635 * of the cluster boundary.
636 *
637 * from == to == 0 is code for "zero the entire cluster region"
638 */
639static void ocfs2_clear_page_regions(struct page *page,
640 struct ocfs2_super *osb, u32 cpos,
641 unsigned from, unsigned to)
642{
643 void *kaddr;
644 unsigned int cluster_start, cluster_end;
645
646 ocfs2_figure_cluster_boundaries(osb, cpos, &cluster_start, &cluster_end);
647
648 kaddr = kmap_atomic(page, KM_USER0);
649
650 if (from || to) {
651 if (from > cluster_start)
652 memset(kaddr + cluster_start, 0, from - cluster_start);
653 if (to < cluster_end)
654 memset(kaddr + to, 0, cluster_end - to);
655 } else {
656 memset(kaddr + cluster_start, 0, cluster_end - cluster_start);
657 }
658
659 kunmap_atomic(kaddr, KM_USER0);
660}
661
662/*
663 * Some of this taken from block_prepare_write(). We already have our
664 * mapping by now though, and the entire write will be allocating or
665 * it won't, so not much need to use BH_New.
666 *
667 * This will also skip zeroing, which is handled externally.
668 */
669int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
670 struct inode *inode, unsigned int from,
671 unsigned int to, int new)
672{
673 int ret = 0;
674 struct buffer_head *head, *bh, *wait[2], **wait_bh = wait;
675 unsigned int block_end, block_start;
676 unsigned int bsize = 1 << inode->i_blkbits;
677
678 if (!page_has_buffers(page))
679 create_empty_buffers(page, bsize, 0);
680
681 head = page_buffers(page);
682 for (bh = head, block_start = 0; bh != head || !block_start;
683 bh = bh->b_this_page, block_start += bsize) {
684 block_end = block_start + bsize;
685
686 /*
687 * Ignore blocks outside of our i/o range -
688 * they may belong to unallocated clusters.
689 */
690 if (block_start >= to || block_end <= from) {
691 if (PageUptodate(page))
692 set_buffer_uptodate(bh);
693 continue;
694 }
695
696 /*
697 * For an allocating write with cluster size >= page
698 * size, we always write the entire page.
699 */
700
701 if (buffer_new(bh))
702 clear_buffer_new(bh);
703
704 if (!buffer_mapped(bh)) {
705 map_bh(bh, inode->i_sb, *p_blkno);
706 unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
707 }
708
709 if (PageUptodate(page)) {
710 if (!buffer_uptodate(bh))
711 set_buffer_uptodate(bh);
712 } else if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
713 (block_start < from || block_end > to)) {
714 ll_rw_block(READ, 1, &bh);
715 *wait_bh++=bh;
716 }
717
718 *p_blkno = *p_blkno + 1;
719 }
720
721 /*
722 * If we issued read requests - let them complete.
723 */
724 while(wait_bh > wait) {
725 wait_on_buffer(*--wait_bh);
726 if (!buffer_uptodate(*wait_bh))
727 ret = -EIO;
728 }
729
730 if (ret == 0 || !new)
731 return ret;
732
733 /*
734 * If we get -EIO above, zero out any newly allocated blocks
735 * to avoid exposing stale data.
736 */
737 bh = head;
738 block_start = 0;
739 do {
740 void *kaddr;
741
742 block_end = block_start + bsize;
743 if (block_end <= from)
744 goto next_bh;
745 if (block_start >= to)
746 break;
747
748 kaddr = kmap_atomic(page, KM_USER0);
749 memset(kaddr+block_start, 0, bh->b_size);
750 flush_dcache_page(page);
751 kunmap_atomic(kaddr, KM_USER0);
752 set_buffer_uptodate(bh);
753 mark_buffer_dirty(bh);
754
755next_bh:
756 block_start = block_end;
757 bh = bh->b_this_page;
758 } while (bh != head);
759
760 return ret;
761}
762
763/*
764 * This will copy user data from the buffer page in the splice
765 * context.
766 *
767 * For now, we ignore SPLICE_F_MOVE as that would require some extra
768 * communication out all the way to ocfs2_write().
769 */
770int ocfs2_map_and_write_splice_data(struct inode *inode,
771 struct ocfs2_write_ctxt *wc, u64 *p_blkno,
772 unsigned int *ret_from, unsigned int *ret_to)
773{
774 int ret;
775 unsigned int to, from, cluster_start, cluster_end;
776 char *src, *dst;
777 struct ocfs2_splice_write_priv *sp = wc->w_private;
778 struct pipe_buffer *buf = sp->s_buf;
779 unsigned long bytes, src_from;
780 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
781
782 ocfs2_figure_cluster_boundaries(osb, wc->w_cpos, &cluster_start,
783 &cluster_end);
784
785 from = sp->s_offset;
786 src_from = sp->s_buf_offset;
787 bytes = wc->w_count;
788
789 if (wc->w_large_pages) {
790 /*
791 * For cluster size < page size, we have to
792 * calculate pos within the cluster and obey
793 * the rightmost boundary.
794 */
795 bytes = min(bytes, (unsigned long)(osb->s_clustersize
796 - (wc->w_pos & (osb->s_clustersize - 1))));
797 }
798 to = from + bytes;
799
800 if (wc->w_this_page_new)
801 ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode,
802 cluster_start, cluster_end, 1);
803 else
804 ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode,
805 from, to, 0);
806 if (ret) {
807 mlog_errno(ret);
808 goto out;
809 }
810
811 BUG_ON(from > PAGE_CACHE_SIZE);
812 BUG_ON(to > PAGE_CACHE_SIZE);
813 BUG_ON(from > osb->s_clustersize);
814 BUG_ON(to > osb->s_clustersize);
815
816 src = buf->ops->map(sp->s_pipe, buf, 1);
817 dst = kmap_atomic(wc->w_this_page, KM_USER1);
818 memcpy(dst + from, src + src_from, bytes);
819 kunmap_atomic(wc->w_this_page, KM_USER1);
820 buf->ops->unmap(sp->s_pipe, buf, src);
821
822 wc->w_finished_copy = 1;
823
824 *ret_from = from;
825 *ret_to = to;
826out:
827
828 return bytes ? (unsigned int)bytes : ret;
829}
830
831/*
832 * This will copy user data from the iovec in the buffered write
833 * context.
834 */
835int ocfs2_map_and_write_user_data(struct inode *inode,
836 struct ocfs2_write_ctxt *wc, u64 *p_blkno,
837 unsigned int *ret_from, unsigned int *ret_to)
838{
839 int ret;
840 unsigned int to, from, cluster_start, cluster_end;
841 unsigned long bytes, src_from;
842 char *dst;
843 struct ocfs2_buffered_write_priv *bp = wc->w_private;
844 const struct iovec *cur_iov = bp->b_cur_iov;
845 char __user *buf;
846 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
847
848 ocfs2_figure_cluster_boundaries(osb, wc->w_cpos, &cluster_start,
849 &cluster_end);
850
851 buf = cur_iov->iov_base + bp->b_cur_off;
852 src_from = (unsigned long)buf & ~PAGE_CACHE_MASK;
853
854 from = wc->w_pos & (PAGE_CACHE_SIZE - 1);
855
856 /*
857 * This is a lot of comparisons, but it reads quite
858 * easily, which is important here.
859 */
860 /* Stay within the src page */
861 bytes = PAGE_SIZE - src_from;
862 /* Stay within the vector */
863 bytes = min(bytes,
864 (unsigned long)(cur_iov->iov_len - bp->b_cur_off));
865 /* Stay within count */
866 bytes = min(bytes, (unsigned long)wc->w_count);
867 /*
868 * For clustersize > page size, just stay within
869 * target page, otherwise we have to calculate pos
870 * within the cluster and obey the rightmost
871 * boundary.
872 */
873 if (wc->w_large_pages) {
874 /*
875 * For cluster size < page size, we have to
876 * calculate pos within the cluster and obey
877 * the rightmost boundary.
878 */
879 bytes = min(bytes, (unsigned long)(osb->s_clustersize
880 - (wc->w_pos & (osb->s_clustersize - 1))));
881 } else {
882 /*
883 * cluster size > page size is the most common
884 * case - we just stay within the target page
885 * boundary.
886 */
887 bytes = min(bytes, PAGE_CACHE_SIZE - from);
888 }
889
890 to = from + bytes;
891
892 if (wc->w_this_page_new)
893 ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode,
894 cluster_start, cluster_end, 1);
895 else
896 ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode,
897 from, to, 0);
898 if (ret) {
899 mlog_errno(ret);
900 goto out;
901 }
902
903 BUG_ON(from > PAGE_CACHE_SIZE);
904 BUG_ON(to > PAGE_CACHE_SIZE);
905 BUG_ON(from > osb->s_clustersize);
906 BUG_ON(to > osb->s_clustersize);
907
908 dst = kmap(wc->w_this_page);
909 memcpy(dst + from, bp->b_src_buf + src_from, bytes);
910 kunmap(wc->w_this_page);
911
912 /*
913 * XXX: This is slow, but simple. The caller of
914 * ocfs2_buffered_write_cluster() is responsible for
915 * passing through the iovecs, so it's difficult to
916 * predict what our next step is in here after our
917 * initial write. A future version should be pushing
918 * that iovec manipulation further down.
919 *
920 * By setting this, we indicate that a copy from user
921 * data was done, and subsequent calls for this
922 * cluster will skip copying more data.
923 */
924 wc->w_finished_copy = 1;
925
926 *ret_from = from;
927 *ret_to = to;
928out:
929
930 return bytes ? (unsigned int)bytes : ret;
931}
932
933/*
934 * Map, fill and write a page to disk.
935 *
936 * The work of copying data is done via callback. Newly allocated
937 * pages which don't take user data will be zero'd (set 'new' to
938 * indicate an allocating write)
939 *
940 * Returns a negative error code or the number of bytes copied into
941 * the page.
942 */
943static int ocfs2_write_data_page(struct inode *inode, handle_t *handle,
944 u64 *p_blkno, struct page *page,
945 struct ocfs2_write_ctxt *wc, int new)
946{
947 int ret, copied = 0;
948 unsigned int from = 0, to = 0;
949 unsigned int cluster_start, cluster_end;
950 unsigned int zero_from = 0, zero_to = 0;
951
952 ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), wc->w_cpos,
953 &cluster_start, &cluster_end);
954
955 if ((wc->w_pos >> PAGE_CACHE_SHIFT) == page->index
956 && !wc->w_finished_copy) {
957
958 wc->w_this_page = page;
959 wc->w_this_page_new = new;
960 ret = wc->w_write_data_page(inode, wc, p_blkno, &from, &to);
961 if (ret < 0) {
962 mlog_errno(ret);
963 goto out;
964 }
965
966 copied = ret;
967
968 zero_from = from;
969 zero_to = to;
970 if (new) {
971 from = cluster_start;
972 to = cluster_end;
973 }
974 } else {
975 /*
976 * If we haven't allocated the new page yet, we
977 * shouldn't be writing it out without copying user
978 * data. This is likely a math error from the caller.
979 */
980 BUG_ON(!new);
981
982 from = cluster_start;
983 to = cluster_end;
984
985 ret = ocfs2_map_page_blocks(page, p_blkno, inode,
986 cluster_start, cluster_end, 1);
987 if (ret) {
988 mlog_errno(ret);
989 goto out;
990 }
991 }
992
993 /*
994 * Parts of newly allocated pages need to be zero'd.
995 *
996 * Above, we have also rewritten 'to' and 'from' - as far as
997 * the rest of the function is concerned, the entire cluster
998 * range inside of a page needs to be written.
999 *
1000 * We can skip this if the page is up to date - it's already
1001 * been zero'd from being read in as a hole.
1002 */
1003 if (new && !PageUptodate(page))
1004 ocfs2_clear_page_regions(page, OCFS2_SB(inode->i_sb),
1005 wc->w_cpos, zero_from, zero_to);
1006
1007 flush_dcache_page(page);
1008
1009 if (ocfs2_should_order_data(inode)) {
1010 ret = walk_page_buffers(handle,
1011 page_buffers(page),
1012 from, to, NULL,
1013 ocfs2_journal_dirty_data);
1014 if (ret < 0)
1015 mlog_errno(ret);
1016 }
1017
1018 /*
1019 * We don't use generic_commit_write() because we need to
1020 * handle our own i_size update.
1021 */
1022 ret = block_commit_write(page, from, to);
1023 if (ret)
1024 mlog_errno(ret);
1025out:
1026
1027 return copied ? copied : ret;
1028}
1029
1030/*
1031 * Do the actual write of some data into an inode. Optionally allocate
1032 * in order to fulfill the write.
1033 *
1034 * cpos is the logical cluster offset within the file to write at
1035 *
1036 * 'phys' is the physical mapping of that offset. a 'phys' value of
1037 * zero indicates that allocation is required. In this case, data_ac
1038 * and meta_ac should be valid (meta_ac can be null if metadata
1039 * allocation isn't required).
1040 */
1041static ssize_t ocfs2_write(struct file *file, u32 phys, handle_t *handle,
1042 struct buffer_head *di_bh,
1043 struct ocfs2_alloc_context *data_ac,
1044 struct ocfs2_alloc_context *meta_ac,
1045 struct ocfs2_write_ctxt *wc)
1046{
1047 int ret, i, numpages = 1, new;
1048 unsigned int copied = 0;
1049 u32 tmp_pos;
1050 u64 v_blkno, p_blkno;
1051 struct address_space *mapping = file->f_mapping;
1052 struct inode *inode = mapping->host;
1053 unsigned long index, start;
1054 struct page **cpages;
1055
1056 new = phys == 0 ? 1 : 0;
1057
1058 /*
1059 * Figure out how many pages we'll be manipulating here. For
1060 * non allocating write, we just change the one
1061 * page. Otherwise, we'll need a whole clusters worth.
1062 */
1063 if (new)
1064 numpages = ocfs2_pages_per_cluster(inode->i_sb);
1065
1066 cpages = kzalloc(sizeof(*cpages) * numpages, GFP_NOFS);
1067 if (!cpages) {
1068 ret = -ENOMEM;
1069 mlog_errno(ret);
1070 return ret;
1071 }
1072
1073 /*
1074 * Fill our page array first. That way we've grabbed enough so
1075 * that we can zero and flush if we error after adding the
1076 * extent.
1077 */
1078 if (new) {
1079 start = ocfs2_align_clusters_to_page_index(inode->i_sb,
1080 wc->w_cpos);
1081 v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, wc->w_cpos);
1082 } else {
1083 start = wc->w_pos >> PAGE_CACHE_SHIFT;
1084 v_blkno = wc->w_pos >> inode->i_sb->s_blocksize_bits;
1085 }
1086
1087 for(i = 0; i < numpages; i++) {
1088 index = start + i;
1089
1090 cpages[i] = find_or_create_page(mapping, index, GFP_NOFS);
1091 if (!cpages[i]) {
1092 ret = -ENOMEM;
1093 mlog_errno(ret);
1094 goto out;
1095 }
1096 }
1097
1098 if (new) {
1099 /*
1100 * This is safe to call with the page locks - it won't take
1101 * any additional semaphores or cluster locks.
1102 */
1103 tmp_pos = wc->w_cpos;
1104 ret = ocfs2_do_extend_allocation(OCFS2_SB(inode->i_sb), inode,
1105 &tmp_pos, 1, di_bh, handle,
1106 data_ac, meta_ac, NULL);
1107 /*
1108 * This shouldn't happen because we must have already
1109 * calculated the correct meta data allocation required. The
1110 * internal tree allocation code should know how to increase
1111 * transaction credits itself.
1112 *
1113 * If need be, we could handle -EAGAIN for a
1114 * RESTART_TRANS here.
1115 */
1116 mlog_bug_on_msg(ret == -EAGAIN,
1117 "Inode %llu: EAGAIN return during allocation.\n",
1118 (unsigned long long)OCFS2_I(inode)->ip_blkno);
1119 if (ret < 0) {
1120 mlog_errno(ret);
1121 goto out;
1122 }
1123 }
1124
1125 ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL,
1126 NULL);
1127 if (ret < 0) {
1128
1129 /*
1130 * XXX: Should we go readonly here?
1131 */
1132
1133 mlog_errno(ret);
1134 goto out;
1135 }
1136
1137 BUG_ON(p_blkno == 0);
1138
1139 for(i = 0; i < numpages; i++) {
1140 ret = ocfs2_write_data_page(inode, handle, &p_blkno, cpages[i],
1141 wc, new);
1142 if (ret < 0) {
1143 mlog_errno(ret);
1144 goto out;
1145 }
1146
1147 copied += ret;
1148 }
1149
1150out:
1151 for(i = 0; i < numpages; i++) {
1152 unlock_page(cpages[i]);
1153 mark_page_accessed(cpages[i]);
1154 page_cache_release(cpages[i]);
1155 }
1156 kfree(cpages);
1157
1158 return copied ? copied : ret;
1159}
1160
1161static void ocfs2_write_ctxt_init(struct ocfs2_write_ctxt *wc,
1162 struct ocfs2_super *osb, loff_t pos,
1163 size_t count, ocfs2_page_writer *cb,
1164 void *cb_priv)
1165{
1166 wc->w_count = count;
1167 wc->w_pos = pos;
1168 wc->w_cpos = wc->w_pos >> osb->s_clustersize_bits;
1169 wc->w_finished_copy = 0;
1170
1171 if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits))
1172 wc->w_large_pages = 1;
1173 else
1174 wc->w_large_pages = 0;
1175
1176 wc->w_write_data_page = cb;
1177 wc->w_private = cb_priv;
1178}
1179
1180/*
1181 * Write a cluster to an inode. The cluster may not be allocated yet,
1182 * in which case it will be. This only exists for buffered writes -
1183 * O_DIRECT takes a more "traditional" path through the kernel.
1184 *
1185 * The caller is responsible for incrementing pos, written counts, etc
1186 *
1187 * For file systems that don't support sparse files, pre-allocation
1188 * and page zeroing up until cpos should be done prior to this
1189 * function call.
1190 *
1191 * Callers should be holding i_sem, and the rw cluster lock.
1192 *
1193 * Returns the number of user bytes written, or less than zero for
1194 * error.
1195 */
1196ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos,
1197 size_t count, ocfs2_page_writer *actor,
1198 void *priv)
1199{
1200 int ret, credits = OCFS2_INODE_UPDATE_CREDITS;
1201 ssize_t written = 0;
1202 u32 phys;
1203 struct inode *inode = file->f_mapping->host;
1204 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1205 struct buffer_head *di_bh = NULL;
1206 struct ocfs2_dinode *di;
1207 struct ocfs2_alloc_context *data_ac = NULL;
1208 struct ocfs2_alloc_context *meta_ac = NULL;
1209 handle_t *handle;
1210 struct ocfs2_write_ctxt wc;
1211
1212 ocfs2_write_ctxt_init(&wc, osb, pos, count, actor, priv);
1213
1214 ret = ocfs2_meta_lock(inode, &di_bh, 1);
1215 if (ret) {
1216 mlog_errno(ret);
1217 goto out;
1218 }
1219 di = (struct ocfs2_dinode *)di_bh->b_data;
1220
1221 /*
1222 * Take alloc sem here to prevent concurrent lookups. That way
1223 * the mapping, zeroing and tree manipulation within
1224 * ocfs2_write() will be safe against ->readpage(). This
1225 * should also serve to lock out allocation from a shared
1226 * writeable region.
1227 */
1228 down_write(&OCFS2_I(inode)->ip_alloc_sem);
1229
1230 ret = ocfs2_get_clusters(inode, wc.w_cpos, &phys, NULL, NULL);
1231 if (ret) {
1232 mlog_errno(ret);
1233 goto out_meta;
1234 }
1235
1236 /* phys == 0 means that allocation is required. */
1237 if (phys == 0) {
1238 ret = ocfs2_lock_allocators(inode, di, 1, &data_ac, &meta_ac);
1239 if (ret) {
1240 mlog_errno(ret);
1241 goto out_meta;
1242 }
1243
1244 credits = ocfs2_calc_extend_credits(inode->i_sb, di, 1);
1245 }
1246
1247 ret = ocfs2_data_lock(inode, 1);
1248 if (ret) {
1249 mlog_errno(ret);
1250 goto out_meta;
1251 }
1252
1253 handle = ocfs2_start_trans(osb, credits);
1254 if (IS_ERR(handle)) {
1255 ret = PTR_ERR(handle);
1256 mlog_errno(ret);
1257 goto out_data;
1258 }
1259
1260 written = ocfs2_write(file, phys, handle, di_bh, data_ac,
1261 meta_ac, &wc);
1262 if (written < 0) {
1263 ret = written;
1264 mlog_errno(ret);
1265 goto out_commit;
1266 }
1267
1268 ret = ocfs2_journal_access(handle, inode, di_bh,
1269 OCFS2_JOURNAL_ACCESS_WRITE);
1270 if (ret) {
1271 mlog_errno(ret);
1272 goto out_commit;
1273 }
1274
1275 pos += written;
1276 if (pos > inode->i_size) {
1277 i_size_write(inode, pos);
1278 mark_inode_dirty(inode);
1279 }
1280 inode->i_blocks = ocfs2_inode_sector_count(inode);
1281 di->i_size = cpu_to_le64((u64)i_size_read(inode));
1282 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
1283 di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
1284 di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
1285
1286 ret = ocfs2_journal_dirty(handle, di_bh);
1287 if (ret)
1288 mlog_errno(ret);
1289
1290out_commit:
1291 ocfs2_commit_trans(osb, handle);
1292
1293out_data:
1294 ocfs2_data_unlock(inode, 1);
1295
1296out_meta:
1297 up_write(&OCFS2_I(inode)->ip_alloc_sem);
1298 ocfs2_meta_unlock(inode, 1);
1299
1300out:
1301 brelse(di_bh);
1302 if (data_ac)
1303 ocfs2_free_alloc_context(data_ac);
1304 if (meta_ac)
1305 ocfs2_free_alloc_context(meta_ac);
1306
1307 return written ? written : ret;
1308}
1309
678const struct address_space_operations ocfs2_aops = { 1310const struct address_space_operations ocfs2_aops = {
679 .readpage = ocfs2_readpage, 1311 .readpage = ocfs2_readpage,
680 .writepage = ocfs2_writepage, 1312 .writepage = ocfs2_writepage,
681 .prepare_write = ocfs2_prepare_write,
682 .commit_write = ocfs2_commit_write,
683 .bmap = ocfs2_bmap, 1313 .bmap = ocfs2_bmap,
684 .sync_page = block_sync_page, 1314 .sync_page = block_sync_page,
685 .direct_IO = ocfs2_direct_IO, 1315 .direct_IO = ocfs2_direct_IO,
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index f446a15eab88..45821d479b5a 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -30,12 +30,83 @@ handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
30 unsigned from, 30 unsigned from,
31 unsigned to); 31 unsigned to);
32 32
33int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
34 struct inode *inode, unsigned int from,
35 unsigned int to, int new);
36
37int walk_page_buffers( handle_t *handle,
38 struct buffer_head *head,
39 unsigned from,
40 unsigned to,
41 int *partial,
42 int (*fn)( handle_t *handle,
43 struct buffer_head *bh));
44
45struct ocfs2_write_ctxt;
46typedef int (ocfs2_page_writer)(struct inode *, struct ocfs2_write_ctxt *,
47 u64 *, unsigned int *, unsigned int *);
48
49ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos,
50 size_t count, ocfs2_page_writer *actor,
51 void *priv);
52
53struct ocfs2_write_ctxt {
54 size_t w_count;
55 loff_t w_pos;
56 u32 w_cpos;
57 unsigned int w_finished_copy;
58
59 /* This is true if page_size > cluster_size */
60 unsigned int w_large_pages;
61
62 /* Filler callback and private data */
63 ocfs2_page_writer *w_write_data_page;
64 void *w_private;
65
66 /* Only valid for the filler callback */
67 struct page *w_this_page;
68 unsigned int w_this_page_new;
69};
70
71struct ocfs2_buffered_write_priv {
72 char *b_src_buf;
73 const struct iovec *b_cur_iov; /* Current iovec */
74 size_t b_cur_off; /* Offset in the
75 * current iovec */
76};
77int ocfs2_map_and_write_user_data(struct inode *inode,
78 struct ocfs2_write_ctxt *wc,
79 u64 *p_blkno,
80 unsigned int *ret_from,
81 unsigned int *ret_to);
82
83struct ocfs2_splice_write_priv {
84 struct splice_desc *s_sd;
85 struct pipe_buffer *s_buf;
86 struct pipe_inode_info *s_pipe;
87 /* Neither offset value is ever larger than one page */
88 unsigned int s_offset;
89 unsigned int s_buf_offset;
90};
91int ocfs2_map_and_write_splice_data(struct inode *inode,
92 struct ocfs2_write_ctxt *wc,
93 u64 *p_blkno,
94 unsigned int *ret_from,
95 unsigned int *ret_to);
96
33/* all ocfs2_dio_end_io()'s fault */ 97/* all ocfs2_dio_end_io()'s fault */
34#define ocfs2_iocb_is_rw_locked(iocb) \ 98#define ocfs2_iocb_is_rw_locked(iocb) \
35 test_bit(0, (unsigned long *)&iocb->private) 99 test_bit(0, (unsigned long *)&iocb->private)
36#define ocfs2_iocb_set_rw_locked(iocb) \ 100static inline void ocfs2_iocb_set_rw_locked(struct kiocb *iocb, int level)
37 set_bit(0, (unsigned long *)&iocb->private) 101{
102 set_bit(0, (unsigned long *)&iocb->private);
103 if (level)
104 set_bit(1, (unsigned long *)&iocb->private);
105 else
106 clear_bit(1, (unsigned long *)&iocb->private);
107}
38#define ocfs2_iocb_clear_rw_locked(iocb) \ 108#define ocfs2_iocb_clear_rw_locked(iocb) \
39 clear_bit(0, (unsigned long *)&iocb->private) 109 clear_bit(0, (unsigned long *)&iocb->private)
40 110#define ocfs2_iocb_rw_locked_level(iocb) \
111 test_bit(1, (unsigned long *)&iocb->private)
41#endif /* OCFS2_FILE_H */ 112#endif /* OCFS2_FILE_H */
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index eba282da500e..979113479c66 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -438,7 +438,7 @@ static inline void o2hb_prepare_block(struct o2hb_region *reg,
438 hb_block)); 438 hb_block));
439 439
440 mlog(ML_HB_BIO, "our node generation = 0x%llx, cksum = 0x%x\n", 440 mlog(ML_HB_BIO, "our node generation = 0x%llx, cksum = 0x%x\n",
441 (long long)cpu_to_le64(generation), 441 (long long)generation,
442 le32_to_cpu(hb_block->hb_cksum)); 442 le32_to_cpu(hb_block->hb_cksum));
443} 443}
444 444
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index 636593bf4d17..2e975c0a35e1 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -147,7 +147,7 @@ static struct kset mlog_kset = {
147 .kobj = {.name = "logmask", .ktype = &mlog_ktype}, 147 .kobj = {.name = "logmask", .ktype = &mlog_ktype},
148}; 148};
149 149
150int mlog_sys_init(struct subsystem *o2cb_subsys) 150int mlog_sys_init(struct kset *o2cb_subsys)
151{ 151{
152 int i = 0; 152 int i = 0;
153 153
@@ -157,7 +157,7 @@ int mlog_sys_init(struct subsystem *o2cb_subsys)
157 } 157 }
158 mlog_attr_ptrs[i] = NULL; 158 mlog_attr_ptrs[i] = NULL;
159 159
160 mlog_kset.subsys = o2cb_subsys; 160 kobj_set_kset_s(&mlog_kset, o2cb_subsys);
161 return kset_register(&mlog_kset); 161 return kset_register(&mlog_kset);
162} 162}
163 163
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index a42628ba9ddf..75cd877f6d42 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -278,7 +278,7 @@ extern struct mlog_bits mlog_and_bits, mlog_not_bits;
278 278
279#include <linux/kobject.h> 279#include <linux/kobject.h>
280#include <linux/sysfs.h> 280#include <linux/sysfs.h>
281int mlog_sys_init(struct subsystem *o2cb_subsys); 281int mlog_sys_init(struct kset *o2cb_subsys);
282void mlog_sys_shutdown(void); 282void mlog_sys_shutdown(void);
283 283
284#endif /* O2CLUSTER_MASKLOG_H */ 284#endif /* O2CLUSTER_MASKLOG_H */
diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c
index 4705d659fe57..bbacf7da48a4 100644
--- a/fs/ocfs2/cluster/quorum.c
+++ b/fs/ocfs2/cluster/quorum.c
@@ -46,6 +46,7 @@
46#include <linux/kernel.h> 46#include <linux/kernel.h>
47#include <linux/slab.h> 47#include <linux/slab.h>
48#include <linux/workqueue.h> 48#include <linux/workqueue.h>
49#include <linux/reboot.h>
49 50
50#include "heartbeat.h" 51#include "heartbeat.h"
51#include "nodemanager.h" 52#include "nodemanager.h"
@@ -72,7 +73,9 @@ static void o2quo_fence_self(void)
72 /* panic spins with interrupts enabled. with preempt 73 /* panic spins with interrupts enabled. with preempt
73 * threads can still schedule, etc, etc */ 74 * threads can still schedule, etc, etc */
74 o2hb_stop_all_regions(); 75 o2hb_stop_all_regions();
75 panic("ocfs2 is very sorry to be fencing this system by panicing\n"); 76
77 printk("ocfs2 is very sorry to be fencing this system by restarting\n");
78 emergency_restart();
76} 79}
77 80
78/* Indicate that a timeout occured on a hearbeat region write. The 81/* Indicate that a timeout occured on a hearbeat region write. The
diff --git a/fs/ocfs2/cluster/sys.c b/fs/ocfs2/cluster/sys.c
index 1d9f6acafa2e..64f6f378fd09 100644
--- a/fs/ocfs2/cluster/sys.c
+++ b/fs/ocfs2/cluster/sys.c
@@ -42,7 +42,6 @@ struct o2cb_attribute {
42#define O2CB_ATTR(_name, _mode, _show, _store) \ 42#define O2CB_ATTR(_name, _mode, _show, _store) \
43struct o2cb_attribute o2cb_attr_##_name = __ATTR(_name, _mode, _show, _store) 43struct o2cb_attribute o2cb_attr_##_name = __ATTR(_name, _mode, _show, _store)
44 44
45#define to_o2cb_subsys(k) container_of(to_kset(k), struct subsystem, kset)
46#define to_o2cb_attr(_attr) container_of(_attr, struct o2cb_attribute, attr) 45#define to_o2cb_attr(_attr) container_of(_attr, struct o2cb_attribute, attr)
47 46
48static ssize_t o2cb_interface_revision_show(char *buf) 47static ssize_t o2cb_interface_revision_show(char *buf)
@@ -79,7 +78,7 @@ static ssize_t
79o2cb_show(struct kobject * kobj, struct attribute * attr, char * buffer) 78o2cb_show(struct kobject * kobj, struct attribute * attr, char * buffer)
80{ 79{
81 struct o2cb_attribute *o2cb_attr = to_o2cb_attr(attr); 80 struct o2cb_attribute *o2cb_attr = to_o2cb_attr(attr);
82 struct subsystem *sbs = to_o2cb_subsys(kobj); 81 struct kset *sbs = to_kset(kobj);
83 82
84 BUG_ON(sbs != &o2cb_subsys); 83 BUG_ON(sbs != &o2cb_subsys);
85 84
@@ -93,7 +92,7 @@ o2cb_store(struct kobject * kobj, struct attribute * attr,
93 const char * buffer, size_t count) 92 const char * buffer, size_t count)
94{ 93{
95 struct o2cb_attribute *o2cb_attr = to_o2cb_attr(attr); 94 struct o2cb_attribute *o2cb_attr = to_o2cb_attr(attr);
96 struct subsystem *sbs = to_o2cb_subsys(kobj); 95 struct kset *sbs = to_kset(kobj);
97 96
98 BUG_ON(sbs != &o2cb_subsys); 97 BUG_ON(sbs != &o2cb_subsys);
99 98
@@ -112,7 +111,7 @@ int o2cb_sys_init(void)
112{ 111{
113 int ret; 112 int ret;
114 113
115 o2cb_subsys.kset.kobj.ktype = &o2cb_subsys_type; 114 o2cb_subsys.kobj.ktype = &o2cb_subsys_type;
116 ret = subsystem_register(&o2cb_subsys); 115 ret = subsystem_register(&o2cb_subsys);
117 if (ret) 116 if (ret)
118 return ret; 117 return ret;
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 69caf3e12fea..0b229a9c7952 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -1496,7 +1496,7 @@ static void o2net_start_connect(struct work_struct *work)
1496 sock->sk->sk_allocation = GFP_ATOMIC; 1496 sock->sk->sk_allocation = GFP_ATOMIC;
1497 1497
1498 myaddr.sin_family = AF_INET; 1498 myaddr.sin_family = AF_INET;
1499 myaddr.sin_addr.s_addr = (__force u32)mynode->nd_ipv4_address; 1499 myaddr.sin_addr.s_addr = mynode->nd_ipv4_address;
1500 myaddr.sin_port = (__force u16)htons(0); /* any port */ 1500 myaddr.sin_port = (__force u16)htons(0); /* any port */
1501 1501
1502 ret = sock->ops->bind(sock, (struct sockaddr *)&myaddr, 1502 ret = sock->ops->bind(sock, (struct sockaddr *)&myaddr,
@@ -1521,8 +1521,8 @@ static void o2net_start_connect(struct work_struct *work)
1521 spin_unlock(&nn->nn_lock); 1521 spin_unlock(&nn->nn_lock);
1522 1522
1523 remoteaddr.sin_family = AF_INET; 1523 remoteaddr.sin_family = AF_INET;
1524 remoteaddr.sin_addr.s_addr = (__force u32)node->nd_ipv4_address; 1524 remoteaddr.sin_addr.s_addr = node->nd_ipv4_address;
1525 remoteaddr.sin_port = (__force u16)node->nd_ipv4_port; 1525 remoteaddr.sin_port = node->nd_ipv4_port;
1526 1526
1527 ret = sc->sc_sock->ops->connect(sc->sc_sock, 1527 ret = sc->sc_sock->ops->connect(sc->sc_sock,
1528 (struct sockaddr *)&remoteaddr, 1528 (struct sockaddr *)&remoteaddr,
@@ -1810,8 +1810,8 @@ static int o2net_open_listening_sock(__be32 addr, __be16 port)
1810 int ret; 1810 int ret;
1811 struct sockaddr_in sin = { 1811 struct sockaddr_in sin = {
1812 .sin_family = PF_INET, 1812 .sin_family = PF_INET,
1813 .sin_addr = { .s_addr = (__force u32)addr }, 1813 .sin_addr = { .s_addr = addr },
1814 .sin_port = (__force u16)port, 1814 .sin_port = port,
1815 }; 1815 };
1816 1816
1817 ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); 1817 ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
index 4dae5df5e467..9606111fe89d 100644
--- a/fs/ocfs2/cluster/tcp_internal.h
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -38,6 +38,9 @@
38 * locking semantics of the file system using the protocol. It should 38 * locking semantics of the file system using the protocol. It should
39 * be somewhere else, I'm sure, but right now it isn't. 39 * be somewhere else, I'm sure, but right now it isn't.
40 * 40 *
41 * New in version 8:
42 * - Replace delete inode votes with a cluster lock
43 *
41 * New in version 7: 44 * New in version 7:
42 * - DLM join domain includes the live nodemap 45 * - DLM join domain includes the live nodemap
43 * 46 *
@@ -57,7 +60,7 @@
57 * - full 64 bit i_size in the metadata lock lvbs 60 * - full 64 bit i_size in the metadata lock lvbs
58 * - introduction of "rw" lock and pushing meta/data locking down 61 * - introduction of "rw" lock and pushing meta/data locking down
59 */ 62 */
60#define O2NET_PROTOCOL_VERSION 7ULL 63#define O2NET_PROTOCOL_VERSION 8ULL
61struct o2net_handshake { 64struct o2net_handshake {
62 __be64 protocol_version; 65 __be64 protocol_version;
63 __be64 connector_id; 66 __be64 connector_id;
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 66821e178167..c441ef1f2bad 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -358,15 +358,17 @@ int ocfs2_do_extend_dir(struct super_block *sb,
358{ 358{
359 int status; 359 int status;
360 int extend; 360 int extend;
361 u64 p_blkno; 361 u64 p_blkno, v_blkno;
362 362
363 spin_lock(&OCFS2_I(dir)->ip_lock); 363 spin_lock(&OCFS2_I(dir)->ip_lock);
364 extend = (i_size_read(dir) == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters)); 364 extend = (i_size_read(dir) == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters));
365 spin_unlock(&OCFS2_I(dir)->ip_lock); 365 spin_unlock(&OCFS2_I(dir)->ip_lock);
366 366
367 if (extend) { 367 if (extend) {
368 status = ocfs2_do_extend_allocation(OCFS2_SB(sb), dir, 1, 368 u32 offset = OCFS2_I(dir)->ip_clusters;
369 parent_fe_bh, handle, 369
370 status = ocfs2_do_extend_allocation(OCFS2_SB(sb), dir, &offset,
371 1, parent_fe_bh, handle,
370 data_ac, meta_ac, NULL); 372 data_ac, meta_ac, NULL);
371 BUG_ON(status == -EAGAIN); 373 BUG_ON(status == -EAGAIN);
372 if (status < 0) { 374 if (status < 0) {
@@ -375,9 +377,8 @@ int ocfs2_do_extend_dir(struct super_block *sb,
375 } 377 }
376 } 378 }
377 379
378 status = ocfs2_extent_map_get_blocks(dir, (dir->i_blocks >> 380 v_blkno = ocfs2_blocks_for_bytes(sb, i_size_read(dir));
379 (sb->s_blocksize_bits - 9)), 381 status = ocfs2_extent_map_get_blocks(dir, v_blkno, &p_blkno, NULL, NULL);
380 1, &p_blkno, NULL);
381 if (status < 0) { 382 if (status < 0) {
382 mlog_errno(status); 383 mlog_errno(status);
383 goto bail; 384 goto bail;
@@ -402,7 +403,7 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
402 struct buffer_head **new_de_bh) 403 struct buffer_head **new_de_bh)
403{ 404{
404 int status = 0; 405 int status = 0;
405 int credits, num_free_extents; 406 int credits, num_free_extents, drop_alloc_sem = 0;
406 loff_t dir_i_size; 407 loff_t dir_i_size;
407 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) parent_fe_bh->b_data; 408 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
408 struct ocfs2_alloc_context *data_ac = NULL; 409 struct ocfs2_alloc_context *data_ac = NULL;
@@ -451,6 +452,9 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
451 credits = OCFS2_SIMPLE_DIR_EXTEND_CREDITS; 452 credits = OCFS2_SIMPLE_DIR_EXTEND_CREDITS;
452 } 453 }
453 454
455 down_write(&OCFS2_I(dir)->ip_alloc_sem);
456 drop_alloc_sem = 1;
457
454 handle = ocfs2_start_trans(osb, credits); 458 handle = ocfs2_start_trans(osb, credits);
455 if (IS_ERR(handle)) { 459 if (IS_ERR(handle)) {
456 status = PTR_ERR(handle); 460 status = PTR_ERR(handle);
@@ -486,7 +490,7 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
486 490
487 dir_i_size += dir->i_sb->s_blocksize; 491 dir_i_size += dir->i_sb->s_blocksize;
488 i_size_write(dir, dir_i_size); 492 i_size_write(dir, dir_i_size);
489 dir->i_blocks = ocfs2_align_bytes_to_sectors(dir_i_size); 493 dir->i_blocks = ocfs2_inode_sector_count(dir);
490 status = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh); 494 status = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh);
491 if (status < 0) { 495 if (status < 0) {
492 mlog_errno(status); 496 mlog_errno(status);
@@ -496,6 +500,8 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
496 *new_de_bh = new_bh; 500 *new_de_bh = new_bh;
497 get_bh(*new_de_bh); 501 get_bh(*new_de_bh);
498bail: 502bail:
503 if (drop_alloc_sem)
504 up_write(&OCFS2_I(dir)->ip_alloc_sem);
499 if (handle) 505 if (handle)
500 ocfs2_commit_trans(osb, handle); 506 ocfs2_commit_trans(osb, handle);
501 507
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
index 241cad342a48..2fd8bded38f3 100644
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -312,8 +312,8 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
312 past->type != DLM_BAST) { 312 past->type != DLM_BAST) {
313 mlog(ML_ERROR, "Unknown ast type! %d, cookie=%u:%llu" 313 mlog(ML_ERROR, "Unknown ast type! %d, cookie=%u:%llu"
314 "name=%.*s\n", past->type, 314 "name=%.*s\n", past->type,
315 dlm_get_lock_cookie_node(be64_to_cpu(cookie)), 315 dlm_get_lock_cookie_node(cookie),
316 dlm_get_lock_cookie_seq(be64_to_cpu(cookie)), 316 dlm_get_lock_cookie_seq(cookie),
317 locklen, name); 317 locklen, name);
318 ret = DLM_IVLOCKID; 318 ret = DLM_IVLOCKID;
319 goto leave; 319 goto leave;
@@ -324,8 +324,8 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
324 mlog(0, "got %sast for unknown lockres! " 324 mlog(0, "got %sast for unknown lockres! "
325 "cookie=%u:%llu, name=%.*s, namelen=%u\n", 325 "cookie=%u:%llu, name=%.*s, namelen=%u\n",
326 past->type == DLM_AST ? "" : "b", 326 past->type == DLM_AST ? "" : "b",
327 dlm_get_lock_cookie_node(be64_to_cpu(cookie)), 327 dlm_get_lock_cookie_node(cookie),
328 dlm_get_lock_cookie_seq(be64_to_cpu(cookie)), 328 dlm_get_lock_cookie_seq(cookie),
329 locklen, name, locklen); 329 locklen, name, locklen);
330 ret = DLM_IVLOCKID; 330 ret = DLM_IVLOCKID;
331 goto leave; 331 goto leave;
@@ -370,8 +370,8 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
370 370
371 mlog(0, "got %sast for unknown lock! cookie=%u:%llu, " 371 mlog(0, "got %sast for unknown lock! cookie=%u:%llu, "
372 "name=%.*s, namelen=%u\n", past->type == DLM_AST ? "" : "b", 372 "name=%.*s, namelen=%u\n", past->type == DLM_AST ? "" : "b",
373 dlm_get_lock_cookie_node(be64_to_cpu(cookie)), 373 dlm_get_lock_cookie_node(cookie),
374 dlm_get_lock_cookie_seq(be64_to_cpu(cookie)), 374 dlm_get_lock_cookie_seq(cookie),
375 locklen, name, locklen); 375 locklen, name, locklen);
376 376
377 ret = DLM_NORMAL; 377 ret = DLM_NORMAL;
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index c558442a0b44..d836b98dd99a 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -430,11 +430,10 @@ redo_bucket:
430 430
431 dlm_lockres_put(res); 431 dlm_lockres_put(res);
432 432
433 cond_resched_lock(&dlm->spinlock);
434
435 if (dropped) 433 if (dropped)
436 goto redo_bucket; 434 goto redo_bucket;
437 } 435 }
436 cond_resched_lock(&dlm->spinlock);
438 num += n; 437 num += n;
439 mlog(0, "%s: touched %d lockreses in bucket %d " 438 mlog(0, "%s: touched %d lockreses in bucket %d "
440 "(tot=%d)\n", dlm->name, n, i, num); 439 "(tot=%d)\n", dlm->name, n, i, num);
@@ -1035,7 +1034,7 @@ static int dlm_try_to_join_domain(struct dlm_ctxt *dlm)
1035{ 1034{
1036 int status = 0, tmpstat, node; 1035 int status = 0, tmpstat, node;
1037 struct domain_join_ctxt *ctxt; 1036 struct domain_join_ctxt *ctxt;
1038 enum dlm_query_join_response response; 1037 enum dlm_query_join_response response = JOIN_DISALLOW;
1039 1038
1040 mlog_entry("%p", dlm); 1039 mlog_entry("%p", dlm);
1041 1040
diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c
index de952eba29a9..d4e46d067edd 100644
--- a/fs/ocfs2/dlm/dlmfs.c
+++ b/fs/ocfs2/dlm/dlmfs.c
@@ -263,8 +263,7 @@ static void dlmfs_init_once(void *foo,
263 struct dlmfs_inode_private *ip = 263 struct dlmfs_inode_private *ip =
264 (struct dlmfs_inode_private *) foo; 264 (struct dlmfs_inode_private *) foo;
265 265
266 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == 266 if (flags & SLAB_CTOR_CONSTRUCTOR) {
267 SLAB_CTOR_CONSTRUCTOR) {
268 ip->ip_dlm = NULL; 267 ip->ip_dlm = NULL;
269 ip->ip_parent = NULL; 268 ip->ip_parent = NULL;
270 269
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 6d4a83d50152..671c4ed58ee2 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -611,6 +611,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
611 } 611 }
612 } while (status != 0); 612 } while (status != 0);
613 613
614 spin_lock(&dlm_reco_state_lock);
614 switch (ndata->state) { 615 switch (ndata->state) {
615 case DLM_RECO_NODE_DATA_INIT: 616 case DLM_RECO_NODE_DATA_INIT:
616 case DLM_RECO_NODE_DATA_FINALIZE_SENT: 617 case DLM_RECO_NODE_DATA_FINALIZE_SENT:
@@ -641,6 +642,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
641 ndata->node_num, dead_node); 642 ndata->node_num, dead_node);
642 break; 643 break;
643 } 644 }
645 spin_unlock(&dlm_reco_state_lock);
644 } 646 }
645 647
646 mlog(0, "done requesting all lock info\n"); 648 mlog(0, "done requesting all lock info\n");
@@ -1767,7 +1769,7 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
1767 /* lock is always created locally first, and 1769 /* lock is always created locally first, and
1768 * destroyed locally last. it must be on the list */ 1770 * destroyed locally last. it must be on the list */
1769 if (!lock) { 1771 if (!lock) {
1770 u64 c = ml->cookie; 1772 __be64 c = ml->cookie;
1771 mlog(ML_ERROR, "could not find local lock " 1773 mlog(ML_ERROR, "could not find local lock "
1772 "with cookie %u:%llu!\n", 1774 "with cookie %u:%llu!\n",
1773 dlm_get_lock_cookie_node(be64_to_cpu(c)), 1775 dlm_get_lock_cookie_node(be64_to_cpu(c)),
@@ -1876,7 +1878,7 @@ skip_lvb:
1876 spin_lock(&res->spinlock); 1878 spin_lock(&res->spinlock);
1877 list_for_each_entry(lock, queue, list) { 1879 list_for_each_entry(lock, queue, list) {
1878 if (lock->ml.cookie == ml->cookie) { 1880 if (lock->ml.cookie == ml->cookie) {
1879 u64 c = lock->ml.cookie; 1881 __be64 c = lock->ml.cookie;
1880 mlog(ML_ERROR, "%s:%.*s: %u:%llu: lock already " 1882 mlog(ML_ERROR, "%s:%.*s: %u:%llu: lock already "
1881 "exists on this lockres!\n", dlm->name, 1883 "exists on this lockres!\n", dlm->name,
1882 res->lockname.len, res->lockname.name, 1884 res->lockname.len, res->lockname.name,
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 2b264c6ba039..cebd089f8955 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -76,7 +76,7 @@ repeat:
76 goto repeat; 76 goto repeat;
77 } 77 }
78 remove_wait_queue(&res->wq, &wait); 78 remove_wait_queue(&res->wq, &wait);
79 current->state = TASK_RUNNING; 79 __set_current_state(TASK_RUNNING);
80} 80}
81 81
82int __dlm_lockres_has_locks(struct dlm_lock_resource *res) 82int __dlm_lockres_has_locks(struct dlm_lock_resource *res)
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index e335541727f9..024777abc8e3 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -104,6 +104,35 @@ static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres,
104static void ocfs2_dentry_post_unlock(struct ocfs2_super *osb, 104static void ocfs2_dentry_post_unlock(struct ocfs2_super *osb,
105 struct ocfs2_lock_res *lockres); 105 struct ocfs2_lock_res *lockres);
106 106
107
108#define mlog_meta_lvb(__level, __lockres) ocfs2_dump_meta_lvb_info(__level, __PRETTY_FUNCTION__, __LINE__, __lockres)
109
110/* This aids in debugging situations where a bad LVB might be involved. */
111static void ocfs2_dump_meta_lvb_info(u64 level,
112 const char *function,
113 unsigned int line,
114 struct ocfs2_lock_res *lockres)
115{
116 struct ocfs2_meta_lvb *lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;
117
118 mlog(level, "LVB information for %s (called from %s:%u):\n",
119 lockres->l_name, function, line);
120 mlog(level, "version: %u, clusters: %u, generation: 0x%x\n",
121 lvb->lvb_version, be32_to_cpu(lvb->lvb_iclusters),
122 be32_to_cpu(lvb->lvb_igeneration));
123 mlog(level, "size: %llu, uid %u, gid %u, mode 0x%x\n",
124 (unsigned long long)be64_to_cpu(lvb->lvb_isize),
125 be32_to_cpu(lvb->lvb_iuid), be32_to_cpu(lvb->lvb_igid),
126 be16_to_cpu(lvb->lvb_imode));
127 mlog(level, "nlink %u, atime_packed 0x%llx, ctime_packed 0x%llx, "
128 "mtime_packed 0x%llx iattr 0x%x\n", be16_to_cpu(lvb->lvb_inlink),
129 (long long)be64_to_cpu(lvb->lvb_iatime_packed),
130 (long long)be64_to_cpu(lvb->lvb_ictime_packed),
131 (long long)be64_to_cpu(lvb->lvb_imtime_packed),
132 be32_to_cpu(lvb->lvb_iattr));
133}
134
135
107/* 136/*
108 * OCFS2 Lock Resource Operations 137 * OCFS2 Lock Resource Operations
109 * 138 *
@@ -225,11 +254,17 @@ static struct ocfs2_lock_res_ops ocfs2_dentry_lops = {
225 .flags = 0, 254 .flags = 0,
226}; 255};
227 256
257static struct ocfs2_lock_res_ops ocfs2_inode_open_lops = {
258 .get_osb = ocfs2_get_inode_osb,
259 .flags = 0,
260};
261
228static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres) 262static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
229{ 263{
230 return lockres->l_type == OCFS2_LOCK_TYPE_META || 264 return lockres->l_type == OCFS2_LOCK_TYPE_META ||
231 lockres->l_type == OCFS2_LOCK_TYPE_DATA || 265 lockres->l_type == OCFS2_LOCK_TYPE_DATA ||
232 lockres->l_type == OCFS2_LOCK_TYPE_RW; 266 lockres->l_type == OCFS2_LOCK_TYPE_RW ||
267 lockres->l_type == OCFS2_LOCK_TYPE_OPEN;
233} 268}
234 269
235static inline struct inode *ocfs2_lock_res_inode(struct ocfs2_lock_res *lockres) 270static inline struct inode *ocfs2_lock_res_inode(struct ocfs2_lock_res *lockres)
@@ -373,6 +408,9 @@ void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
373 case OCFS2_LOCK_TYPE_DATA: 408 case OCFS2_LOCK_TYPE_DATA:
374 ops = &ocfs2_inode_data_lops; 409 ops = &ocfs2_inode_data_lops;
375 break; 410 break;
411 case OCFS2_LOCK_TYPE_OPEN:
412 ops = &ocfs2_inode_open_lops;
413 break;
376 default: 414 default:
377 mlog_bug_on_msg(1, "type: %d\n", type); 415 mlog_bug_on_msg(1, "type: %d\n", type);
378 ops = NULL; /* thanks, gcc */ 416 ops = NULL; /* thanks, gcc */
@@ -1129,6 +1167,12 @@ int ocfs2_create_new_inode_locks(struct inode *inode)
1129 goto bail; 1167 goto bail;
1130 } 1168 }
1131 1169
1170 ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_open_lockres, 0, 0);
1171 if (ret) {
1172 mlog_errno(ret);
1173 goto bail;
1174 }
1175
1132bail: 1176bail:
1133 mlog_exit(ret); 1177 mlog_exit(ret);
1134 return ret; 1178 return ret;
@@ -1182,6 +1226,99 @@ void ocfs2_rw_unlock(struct inode *inode, int write)
1182 mlog_exit_void(); 1226 mlog_exit_void();
1183} 1227}
1184 1228
1229/*
1230 * ocfs2_open_lock always get PR mode lock.
1231 */
1232int ocfs2_open_lock(struct inode *inode)
1233{
1234 int status = 0;
1235 struct ocfs2_lock_res *lockres;
1236 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1237
1238 BUG_ON(!inode);
1239
1240 mlog_entry_void();
1241
1242 mlog(0, "inode %llu take PRMODE open lock\n",
1243 (unsigned long long)OCFS2_I(inode)->ip_blkno);
1244
1245 if (ocfs2_mount_local(osb))
1246 goto out;
1247
1248 lockres = &OCFS2_I(inode)->ip_open_lockres;
1249
1250 status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres,
1251 LKM_PRMODE, 0, 0);
1252 if (status < 0)
1253 mlog_errno(status);
1254
1255out:
1256 mlog_exit(status);
1257 return status;
1258}
1259
1260int ocfs2_try_open_lock(struct inode *inode, int write)
1261{
1262 int status = 0, level;
1263 struct ocfs2_lock_res *lockres;
1264 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1265
1266 BUG_ON(!inode);
1267
1268 mlog_entry_void();
1269
1270 mlog(0, "inode %llu try to take %s open lock\n",
1271 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1272 write ? "EXMODE" : "PRMODE");
1273
1274 if (ocfs2_mount_local(osb))
1275 goto out;
1276
1277 lockres = &OCFS2_I(inode)->ip_open_lockres;
1278
1279 level = write ? LKM_EXMODE : LKM_PRMODE;
1280
1281 /*
1282 * The file system may already holding a PRMODE/EXMODE open lock.
1283 * Since we pass LKM_NOQUEUE, the request won't block waiting on
1284 * other nodes and the -EAGAIN will indicate to the caller that
1285 * this inode is still in use.
1286 */
1287 status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres,
1288 level, LKM_NOQUEUE, 0);
1289
1290out:
1291 mlog_exit(status);
1292 return status;
1293}
1294
1295/*
1296 * ocfs2_open_unlock unlock PR and EX mode open locks.
1297 */
1298void ocfs2_open_unlock(struct inode *inode)
1299{
1300 struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_open_lockres;
1301 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1302
1303 mlog_entry_void();
1304
1305 mlog(0, "inode %llu drop open lock\n",
1306 (unsigned long long)OCFS2_I(inode)->ip_blkno);
1307
1308 if (ocfs2_mount_local(osb))
1309 goto out;
1310
1311 if(lockres->l_ro_holders)
1312 ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres,
1313 LKM_PRMODE);
1314 if(lockres->l_ex_holders)
1315 ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres,
1316 LKM_EXMODE);
1317
1318out:
1319 mlog_exit_void();
1320}
1321
1185int ocfs2_data_lock_full(struct inode *inode, 1322int ocfs2_data_lock_full(struct inode *inode,
1186 int write, 1323 int write,
1187 int arg_flags) 1324 int arg_flags)
@@ -1387,8 +1524,7 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
1387 if (S_ISLNK(inode->i_mode) && !oi->ip_clusters) 1524 if (S_ISLNK(inode->i_mode) && !oi->ip_clusters)
1388 inode->i_blocks = 0; 1525 inode->i_blocks = 0;
1389 else 1526 else
1390 inode->i_blocks = 1527 inode->i_blocks = ocfs2_inode_sector_count(inode);
1391 ocfs2_align_bytes_to_sectors(i_size_read(inode));
1392 1528
1393 inode->i_uid = be32_to_cpu(lvb->lvb_iuid); 1529 inode->i_uid = be32_to_cpu(lvb->lvb_iuid);
1394 inode->i_gid = be32_to_cpu(lvb->lvb_igid); 1530 inode->i_gid = be32_to_cpu(lvb->lvb_igid);
@@ -1479,12 +1615,15 @@ static int ocfs2_meta_lock_update(struct inode *inode,
1479{ 1615{
1480 int status = 0; 1616 int status = 0;
1481 struct ocfs2_inode_info *oi = OCFS2_I(inode); 1617 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1482 struct ocfs2_lock_res *lockres = NULL; 1618 struct ocfs2_lock_res *lockres = &oi->ip_meta_lockres;
1483 struct ocfs2_dinode *fe; 1619 struct ocfs2_dinode *fe;
1484 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1620 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1485 1621
1486 mlog_entry_void(); 1622 mlog_entry_void();
1487 1623
1624 if (ocfs2_mount_local(osb))
1625 goto bail;
1626
1488 spin_lock(&oi->ip_lock); 1627 spin_lock(&oi->ip_lock);
1489 if (oi->ip_flags & OCFS2_INODE_DELETED) { 1628 if (oi->ip_flags & OCFS2_INODE_DELETED) {
1490 mlog(0, "Orphaned inode %llu was deleted while we " 1629 mlog(0, "Orphaned inode %llu was deleted while we "
@@ -1496,22 +1635,16 @@ static int ocfs2_meta_lock_update(struct inode *inode,
1496 } 1635 }
1497 spin_unlock(&oi->ip_lock); 1636 spin_unlock(&oi->ip_lock);
1498 1637
1499 if (!ocfs2_mount_local(osb)) { 1638 if (!ocfs2_should_refresh_lock_res(lockres))
1500 lockres = &oi->ip_meta_lockres; 1639 goto bail;
1501
1502 if (!ocfs2_should_refresh_lock_res(lockres))
1503 goto bail;
1504 }
1505 1640
1506 /* This will discard any caching information we might have had 1641 /* This will discard any caching information we might have had
1507 * for the inode metadata. */ 1642 * for the inode metadata. */
1508 ocfs2_metadata_cache_purge(inode); 1643 ocfs2_metadata_cache_purge(inode);
1509 1644
1510 /* will do nothing for inode types that don't use the extent
1511 * map (directories, bitmap files, etc) */
1512 ocfs2_extent_map_trunc(inode, 0); 1645 ocfs2_extent_map_trunc(inode, 0);
1513 1646
1514 if (lockres && ocfs2_meta_lvb_is_trustable(inode, lockres)) { 1647 if (ocfs2_meta_lvb_is_trustable(inode, lockres)) {
1515 mlog(0, "Trusting LVB on inode %llu\n", 1648 mlog(0, "Trusting LVB on inode %llu\n",
1516 (unsigned long long)oi->ip_blkno); 1649 (unsigned long long)oi->ip_blkno);
1517 ocfs2_refresh_inode_from_lvb(inode); 1650 ocfs2_refresh_inode_from_lvb(inode);
@@ -1558,8 +1691,7 @@ static int ocfs2_meta_lock_update(struct inode *inode,
1558 1691
1559 status = 0; 1692 status = 0;
1560bail_refresh: 1693bail_refresh:
1561 if (lockres) 1694 ocfs2_complete_lock_res_refresh(lockres, status);
1562 ocfs2_complete_lock_res_refresh(lockres, status);
1563bail: 1695bail:
1564 mlog_exit(status); 1696 mlog_exit(status);
1565 return status; 1697 return status;
@@ -1630,7 +1762,6 @@ int ocfs2_meta_lock_full(struct inode *inode,
1630 wait_event(osb->recovery_event, 1762 wait_event(osb->recovery_event,
1631 ocfs2_node_map_is_empty(osb, &osb->recovery_map)); 1763 ocfs2_node_map_is_empty(osb, &osb->recovery_map));
1632 1764
1633 acquired = 0;
1634 lockres = &OCFS2_I(inode)->ip_meta_lockres; 1765 lockres = &OCFS2_I(inode)->ip_meta_lockres;
1635 level = ex ? LKM_EXMODE : LKM_PRMODE; 1766 level = ex ? LKM_EXMODE : LKM_PRMODE;
1636 dlm_flags = 0; 1767 dlm_flags = 0;
@@ -2458,13 +2589,20 @@ int ocfs2_drop_inode_locks(struct inode *inode)
2458 * ocfs2_clear_inode has done it for us. */ 2589 * ocfs2_clear_inode has done it for us. */
2459 2590
2460 err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb), 2591 err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
2461 &OCFS2_I(inode)->ip_data_lockres); 2592 &OCFS2_I(inode)->ip_open_lockres);
2462 if (err < 0) 2593 if (err < 0)
2463 mlog_errno(err); 2594 mlog_errno(err);
2464 2595
2465 status = err; 2596 status = err;
2466 2597
2467 err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb), 2598 err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
2599 &OCFS2_I(inode)->ip_data_lockres);
2600 if (err < 0)
2601 mlog_errno(err);
2602 if (err < 0 && !status)
2603 status = err;
2604
2605 err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
2468 &OCFS2_I(inode)->ip_meta_lockres); 2606 &OCFS2_I(inode)->ip_meta_lockres);
2469 if (err < 0) 2607 if (err < 0)
2470 mlog_errno(err); 2608 mlog_errno(err);
@@ -2969,28 +3107,3 @@ static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb,
2969 3107
2970 mlog_exit_void(); 3108 mlog_exit_void();
2971} 3109}
2972
2973/* This aids in debugging situations where a bad LVB might be involved. */
2974void ocfs2_dump_meta_lvb_info(u64 level,
2975 const char *function,
2976 unsigned int line,
2977 struct ocfs2_lock_res *lockres)
2978{
2979 struct ocfs2_meta_lvb *lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;
2980
2981 mlog(level, "LVB information for %s (called from %s:%u):\n",
2982 lockres->l_name, function, line);
2983 mlog(level, "version: %u, clusters: %u, generation: 0x%x\n",
2984 lvb->lvb_version, be32_to_cpu(lvb->lvb_iclusters),
2985 be32_to_cpu(lvb->lvb_igeneration));
2986 mlog(level, "size: %llu, uid %u, gid %u, mode 0x%x\n",
2987 (unsigned long long)be64_to_cpu(lvb->lvb_isize),
2988 be32_to_cpu(lvb->lvb_iuid), be32_to_cpu(lvb->lvb_igid),
2989 be16_to_cpu(lvb->lvb_imode));
2990 mlog(level, "nlink %u, atime_packed 0x%llx, ctime_packed 0x%llx, "
2991 "mtime_packed 0x%llx iattr 0x%x\n", be16_to_cpu(lvb->lvb_inlink),
2992 (long long)be64_to_cpu(lvb->lvb_iatime_packed),
2993 (long long)be64_to_cpu(lvb->lvb_ictime_packed),
2994 (long long)be64_to_cpu(lvb->lvb_imtime_packed),
2995 be32_to_cpu(lvb->lvb_iattr));
2996}
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index c343fca68cf1..492bad32a8c0 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -80,6 +80,9 @@ void ocfs2_data_unlock(struct inode *inode,
80 int write); 80 int write);
81int ocfs2_rw_lock(struct inode *inode, int write); 81int ocfs2_rw_lock(struct inode *inode, int write);
82void ocfs2_rw_unlock(struct inode *inode, int write); 82void ocfs2_rw_unlock(struct inode *inode, int write);
83int ocfs2_open_lock(struct inode *inode);
84int ocfs2_try_open_lock(struct inode *inode, int write);
85void ocfs2_open_unlock(struct inode *inode);
83int ocfs2_meta_lock_atime(struct inode *inode, 86int ocfs2_meta_lock_atime(struct inode *inode,
84 struct vfsmount *vfsmnt, 87 struct vfsmount *vfsmnt,
85 int *level); 88 int *level);
@@ -116,11 +119,4 @@ void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
116struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void); 119struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void);
117void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug); 120void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug);
118 121
119/* aids in debugging and tracking lvbs */
120void ocfs2_dump_meta_lvb_info(u64 level,
121 const char *function,
122 unsigned int line,
123 struct ocfs2_lock_res *lockres);
124#define mlog_meta_lvb(__level, __lockres) ocfs2_dump_meta_lvb_info(__level, __PRETTY_FUNCTION__, __LINE__, __lockres)
125
126#endif /* DLMGLUE_H */ 122#endif /* DLMGLUE_H */
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
index 56e1fefc1205..bc48177bd183 100644
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
@@ -140,7 +140,7 @@ bail:
140 return parent; 140 return parent;
141} 141}
142 142
143static int ocfs2_encode_fh(struct dentry *dentry, __be32 *fh, int *max_len, 143static int ocfs2_encode_fh(struct dentry *dentry, u32 *fh_in, int *max_len,
144 int connectable) 144 int connectable)
145{ 145{
146 struct inode *inode = dentry->d_inode; 146 struct inode *inode = dentry->d_inode;
@@ -148,6 +148,7 @@ static int ocfs2_encode_fh(struct dentry *dentry, __be32 *fh, int *max_len,
148 int type = 1; 148 int type = 1;
149 u64 blkno; 149 u64 blkno;
150 u32 generation; 150 u32 generation;
151 __le32 *fh = (__force __le32 *) fh_in;
151 152
152 mlog_entry("(0x%p, '%.*s', 0x%p, %d, %d)\n", dentry, 153 mlog_entry("(0x%p, '%.*s', 0x%p, %d, %d)\n", dentry,
153 dentry->d_name.len, dentry->d_name.name, 154 dentry->d_name.len, dentry->d_name.name,
@@ -199,7 +200,7 @@ bail:
199 return type; 200 return type;
200} 201}
201 202
202static struct dentry *ocfs2_decode_fh(struct super_block *sb, __be32 *fh, 203static struct dentry *ocfs2_decode_fh(struct super_block *sb, u32 *fh_in,
203 int fh_len, int fileid_type, 204 int fh_len, int fileid_type,
204 int (*acceptable)(void *context, 205 int (*acceptable)(void *context,
205 struct dentry *de), 206 struct dentry *de),
@@ -207,6 +208,7 @@ static struct dentry *ocfs2_decode_fh(struct super_block *sb, __be32 *fh,
207{ 208{
208 struct ocfs2_inode_handle handle, parent; 209 struct ocfs2_inode_handle handle, parent;
209 struct dentry *ret = NULL; 210 struct dentry *ret = NULL;
211 __le32 *fh = (__force __le32 *) fh_in;
210 212
211 mlog_entry("(0x%p, 0x%p, %d, %d, 0x%p, 0x%p)\n", 213 mlog_entry("(0x%p, 0x%p, %d, %d, 0x%p, 0x%p)\n",
212 sb, fh, fh_len, fileid_type, acceptable, context); 214 sb, fh, fh_len, fileid_type, acceptable, context);
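The __force casts above exist because the handle words are little-endian on disk and on the wire, while the VFS hands the filesystem plain u32 pointers. A self-contained illustration of the byte-order discipline; this models cpu_to_le32() in userspace and assumes the encode path packs a 64-bit block number into two LE words (a sketch, not the kernel code):

#include <stdint.h>
#include <stdio.h>

/* Userspace stand-in for cpu_to_le32(): identity on little-endian
 * hosts, byte swap (GCC/Clang builtin) on big-endian hosts. */
static uint32_t to_le32(uint32_t v)
{
	const union { uint16_t u16; uint8_t u8; } probe = { .u16 = 1 };

	return probe.u8 ? v : __builtin_bswap32(v);
}

int main(void)
{
	uint32_t fh[2];
	uint64_t blkno = 0x1122334455667788ULL;

	fh[0] = to_le32((uint32_t)(blkno >> 32));
	fh[1] = to_le32((uint32_t)(blkno & 0xffffffff));
	printf("%08x %08x\n", fh[0], fh[1]);
	return 0;
}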
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index 80ac69f11d9f..ba2b2ab1c6e4 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -3,8 +3,7 @@
3 * 3 *
4 * extent_map.c 4 * extent_map.c
5 * 5 *
6 * In-memory extent map for OCFS2. Man, this code was prettier in 6 * Block/Cluster mapping functions
7 * the library.
8 * 7 *
9 * Copyright (C) 2004 Oracle. All rights reserved. 8 * Copyright (C) 2004 Oracle. All rights reserved.
10 * 9 *
@@ -26,1016 +25,528 @@
26#include <linux/fs.h> 25#include <linux/fs.h>
27#include <linux/init.h> 26#include <linux/init.h>
28#include <linux/types.h> 27#include <linux/types.h>
29#include <linux/slab.h>
30#include <linux/rbtree.h>
31 28
32#define MLOG_MASK_PREFIX ML_EXTENT_MAP 29#define MLOG_MASK_PREFIX ML_EXTENT_MAP
33#include <cluster/masklog.h> 30#include <cluster/masklog.h>
34 31
35#include "ocfs2.h" 32#include "ocfs2.h"
36 33
34#include "alloc.h"
37#include "extent_map.h" 35#include "extent_map.h"
38#include "inode.h" 36#include "inode.h"
39#include "super.h" 37#include "super.h"
40 38
41#include "buffer_head_io.h" 39#include "buffer_head_io.h"
42 40
43
44/* 41/*
45 * SUCK SUCK SUCK 42 * The extent caching implementation is intentionally trivial.
46 * Our headers are so bad that struct ocfs2_extent_map is in ocfs.h
47 */
48
49struct ocfs2_extent_map_entry {
50 struct rb_node e_node;
51 int e_tree_depth;
52 struct ocfs2_extent_rec e_rec;
53};
54
55struct ocfs2_em_insert_context {
56 int need_left;
57 int need_right;
58 struct ocfs2_extent_map_entry *new_ent;
59 struct ocfs2_extent_map_entry *old_ent;
60 struct ocfs2_extent_map_entry *left_ent;
61 struct ocfs2_extent_map_entry *right_ent;
62};
63
64static struct kmem_cache *ocfs2_em_ent_cachep = NULL;
65
66
67static struct ocfs2_extent_map_entry *
68ocfs2_extent_map_lookup(struct ocfs2_extent_map *em,
69 u32 cpos, u32 clusters,
70 struct rb_node ***ret_p,
71 struct rb_node **ret_parent);
72static int ocfs2_extent_map_insert(struct inode *inode,
73 struct ocfs2_extent_rec *rec,
74 int tree_depth);
75static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em,
76 struct ocfs2_extent_map_entry *ent);
77static int ocfs2_extent_map_find_leaf(struct inode *inode,
78 u32 cpos, u32 clusters,
79 struct ocfs2_extent_list *el);
80static int ocfs2_extent_map_lookup_read(struct inode *inode,
81 u32 cpos, u32 clusters,
82 struct ocfs2_extent_map_entry **ret_ent);
83static int ocfs2_extent_map_try_insert(struct inode *inode,
84 struct ocfs2_extent_rec *rec,
85 int tree_depth,
86 struct ocfs2_em_insert_context *ctxt);
87
88/* returns 1 only if the rec contains all the given clusters -- that is that
89 * rec's cpos is <= the cluster cpos and that the rec endpoint (cpos +
90 * clusters) is >= the argument's endpoint */
91static int ocfs2_extent_rec_contains_clusters(struct ocfs2_extent_rec *rec,
92 u32 cpos, u32 clusters)
93{
94 if (le32_to_cpu(rec->e_cpos) > cpos)
95 return 0;
96 if (cpos + clusters > le32_to_cpu(rec->e_cpos) +
97 le32_to_cpu(rec->e_clusters))
98 return 0;
99 return 1;
100}
101
102
103/*
104 * Find an entry in the tree that intersects the region passed in.
105 * Note that this will find straddled intervals, it is up to the
106 * callers to enforce any boundary conditions.
107 *
108 * Callers must hold ip_lock. This lookup is not guaranteed to return
109 * a tree_depth 0 match, and as such can race inserts if the lock
110 * were not held.
111 * 43 *
112 * The rb_node garbage lets insertion share the search. Trivial 44 * We only cache a small number of extents stored directly on the
113 * callers pass NULL. 45 * inode, so linear order operations are acceptable. If we ever want
46 * to increase the size of the extent map, then these algorithms must
47 * get smarter.
114 */ 48 */
115static struct ocfs2_extent_map_entry * 49
116ocfs2_extent_map_lookup(struct ocfs2_extent_map *em, 50void ocfs2_extent_map_init(struct inode *inode)
117 u32 cpos, u32 clusters,
118 struct rb_node ***ret_p,
119 struct rb_node **ret_parent)
120{ 51{
121 struct rb_node **p = &em->em_extents.rb_node; 52 struct ocfs2_inode_info *oi = OCFS2_I(inode);
122 struct rb_node *parent = NULL;
123 struct ocfs2_extent_map_entry *ent = NULL;
124
125 while (*p)
126 {
127 parent = *p;
128 ent = rb_entry(parent, struct ocfs2_extent_map_entry,
129 e_node);
130 if ((cpos + clusters) <= le32_to_cpu(ent->e_rec.e_cpos)) {
131 p = &(*p)->rb_left;
132 ent = NULL;
133 } else if (cpos >= (le32_to_cpu(ent->e_rec.e_cpos) +
134 le32_to_cpu(ent->e_rec.e_clusters))) {
135 p = &(*p)->rb_right;
136 ent = NULL;
137 } else
138 break;
139 }
140 53
141 if (ret_p != NULL) 54 oi->ip_extent_map.em_num_items = 0;
142 *ret_p = p; 55 INIT_LIST_HEAD(&oi->ip_extent_map.em_list);
143 if (ret_parent != NULL)
144 *ret_parent = parent;
145 return ent;
146} 56}
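The em_list consumed by the lookup below is maintained in most-recently-used order: every hit is list_move()d back to the head, so the tail is always the coldest entry. A self-contained sketch of that discipline over a plain array, standing in for the kernel's list_head machinery:

#include <string.h>

#define ITEMS 3	/* mirrors OCFS2_MAX_EXTENT_MAP_ITEMS below */

struct item { unsigned int cpos, phys, clusters; };

/* On a hit, rotate the entry to slot 0 -- the same effect the real
 * code gets from list_move() to the list head. */
static struct item *mru_lookup(struct item *mru, int used, unsigned int cpos)
{
	for (int i = 0; i < used; i++) {
		if (cpos >= mru[i].cpos &&
		    cpos < mru[i].cpos + mru[i].clusters) {
			struct item hit = mru[i];

			memmove(&mru[1], &mru[0], i * sizeof(mru[0]));
			mru[0] = hit;
			return &mru[0];
		}
	}
	return NULL;
}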
147 57
148/* 58static void __ocfs2_extent_map_lookup(struct ocfs2_extent_map *em,
149 * Find the leaf containing the interval we want. While we're on our 59 unsigned int cpos,
150 * way down the tree, fill in every record we see at any depth, because 60 struct ocfs2_extent_map_item **ret_emi)
151 * we might want it later.
152 *
153 * Note that this code is run without ip_lock. That's because it
154 * sleeps while reading. If someone is also filling the extent list at
155 * the same time we are, we might have to restart.
156 */
157static int ocfs2_extent_map_find_leaf(struct inode *inode,
158 u32 cpos, u32 clusters,
159 struct ocfs2_extent_list *el)
160{ 61{
161 int i, ret; 62 unsigned int range;
162 struct buffer_head *eb_bh = NULL; 63 struct ocfs2_extent_map_item *emi;
163 u64 blkno;
164 u32 rec_end;
165 struct ocfs2_extent_block *eb;
166 struct ocfs2_extent_rec *rec;
167
168 /*
169 * The bh data containing the el cannot change here, because
170 * we hold alloc_sem. So we can do this without other
171 * locks.
172 */
173 while (el->l_tree_depth)
174 {
175 blkno = 0;
176 for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
177 rec = &el->l_recs[i];
178 rec_end = (le32_to_cpu(rec->e_cpos) +
179 le32_to_cpu(rec->e_clusters));
180
181 ret = -EBADR;
182 if (rec_end > OCFS2_I(inode)->ip_clusters) {
183 mlog_errno(ret);
184 ocfs2_error(inode->i_sb,
185 "Extent %d at e_blkno %llu of inode %llu goes past ip_clusters of %u\n",
186 i,
187 (unsigned long long)le64_to_cpu(rec->e_blkno),
188 (unsigned long long)OCFS2_I(inode)->ip_blkno,
189 OCFS2_I(inode)->ip_clusters);
190 goto out_free;
191 }
192
193 if (rec_end <= cpos) {
194 ret = ocfs2_extent_map_insert(inode, rec,
195 le16_to_cpu(el->l_tree_depth));
196 if (ret && (ret != -EEXIST)) {
197 mlog_errno(ret);
198 goto out_free;
199 }
200 continue;
201 }
202 if ((cpos + clusters) <= le32_to_cpu(rec->e_cpos)) {
203 ret = ocfs2_extent_map_insert(inode, rec,
204 le16_to_cpu(el->l_tree_depth));
205 if (ret && (ret != -EEXIST)) {
206 mlog_errno(ret);
207 goto out_free;
208 }
209 continue;
210 }
211 64
212 /* 65 *ret_emi = NULL;
213 * We've found a record that matches our
214 * interval. We don't insert it because we're
215 * about to traverse it.
216 */
217
 218 /* Check to see if we're straddling */
219 ret = -ESRCH;
220 if (!ocfs2_extent_rec_contains_clusters(rec,
221 cpos,
222 clusters)) {
223 mlog_errno(ret);
224 goto out_free;
225 }
226 66
227 /* 67 list_for_each_entry(emi, &em->em_list, ei_list) {
228 * If we've already found a record, the el has 68 range = emi->ei_cpos + emi->ei_clusters;
229 * two records covering the same interval.
230 * EEEK!
231 */
232 ret = -EBADR;
233 if (blkno) {
234 mlog_errno(ret);
235 ocfs2_error(inode->i_sb,
236 "Multiple extents for (cpos = %u, clusters = %u) on inode %llu; e_blkno %llu and rec %d at e_blkno %llu\n",
237 cpos, clusters,
238 (unsigned long long)OCFS2_I(inode)->ip_blkno,
239 (unsigned long long)blkno, i,
240 (unsigned long long)le64_to_cpu(rec->e_blkno));
241 goto out_free;
242 }
243 69
244 blkno = le64_to_cpu(rec->e_blkno); 70 if (cpos >= emi->ei_cpos && cpos < range) {
245 } 71 list_move(&emi->ei_list, &em->em_list);
246 72
247 /* 73 *ret_emi = emi;
248 * We don't support holes, and we're still up 74 break;
249 * in the branches, so we'd better have found someone
250 */
251 ret = -EBADR;
252 if (!blkno) {
253 ocfs2_error(inode->i_sb,
254 "No record found for (cpos = %u, clusters = %u) on inode %llu\n",
255 cpos, clusters,
256 (unsigned long long)OCFS2_I(inode)->ip_blkno);
257 mlog_errno(ret);
258 goto out_free;
259 }
260
261 if (eb_bh) {
262 brelse(eb_bh);
263 eb_bh = NULL;
264 }
265 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
266 blkno, &eb_bh, OCFS2_BH_CACHED,
267 inode);
268 if (ret) {
269 mlog_errno(ret);
270 goto out_free;
271 }
272 eb = (struct ocfs2_extent_block *)eb_bh->b_data;
273 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
274 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
275 ret = -EIO;
276 goto out_free;
277 } 75 }
278 el = &eb->h_list;
279 } 76 }
77}
280 78
281 BUG_ON(el->l_tree_depth); 79static int ocfs2_extent_map_lookup(struct inode *inode, unsigned int cpos,
282 80 unsigned int *phys, unsigned int *len,
283 for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { 81 unsigned int *flags)
284 rec = &el->l_recs[i]; 82{
285 83 unsigned int coff;
286 if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) > 84 struct ocfs2_inode_info *oi = OCFS2_I(inode);
287 OCFS2_I(inode)->ip_clusters) { 85 struct ocfs2_extent_map_item *emi;
288 ret = -EBADR; 86
289 mlog_errno(ret); 87 spin_lock(&oi->ip_lock);
290 ocfs2_error(inode->i_sb, 88
291 "Extent %d at e_blkno %llu of inode %llu goes past ip_clusters of %u\n", 89 __ocfs2_extent_map_lookup(&oi->ip_extent_map, cpos, &emi);
292 i, 90 if (emi) {
293 (unsigned long long)le64_to_cpu(rec->e_blkno), 91 coff = cpos - emi->ei_cpos;
294 (unsigned long long)OCFS2_I(inode)->ip_blkno, 92 *phys = emi->ei_phys + coff;
295 OCFS2_I(inode)->ip_clusters); 93 if (len)
296 return ret; 94 *len = emi->ei_clusters - coff;
297 } 95 if (flags)
298 96 *flags = emi->ei_flags;
299 ret = ocfs2_extent_map_insert(inode, rec,
300 le16_to_cpu(el->l_tree_depth));
301 if (ret && (ret != -EEXIST)) {
302 mlog_errno(ret);
303 goto out_free;
304 }
305 } 97 }
306 98
307 ret = 0; 99 spin_unlock(&oi->ip_lock);
308 100
309out_free: 101 if (emi == NULL)
310 if (eb_bh) 102 return -ENOENT;
311 brelse(eb_bh);
312 103
313 return ret; 104 return 0;
314} 105}
315 106
316/* 107/*
317 * This lookup actually will read from disk. It has one invariant: 108 * Forget about all clusters equal to or greater than cpos.
318 * It will never re-traverse blocks. This means that all inserts should
319 * be new regions or more granular regions (both allowed by insert).
320 */ 109 */
321static int ocfs2_extent_map_lookup_read(struct inode *inode, 110void ocfs2_extent_map_trunc(struct inode *inode, unsigned int cpos)
322 u32 cpos,
323 u32 clusters,
324 struct ocfs2_extent_map_entry **ret_ent)
325{ 111{
326 int ret; 112 struct list_head *p, *n;
327 u64 blkno; 113 struct ocfs2_extent_map_item *emi;
328 struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; 114 struct ocfs2_inode_info *oi = OCFS2_I(inode);
329 struct ocfs2_extent_map_entry *ent; 115 struct ocfs2_extent_map *em = &oi->ip_extent_map;
330 struct buffer_head *bh = NULL; 116 LIST_HEAD(tmp_list);
331 struct ocfs2_extent_block *eb; 117 unsigned int range;
332 struct ocfs2_dinode *di; 118
333 struct ocfs2_extent_list *el; 119 spin_lock(&oi->ip_lock);
334 120 list_for_each_safe(p, n, &em->em_list) {
335 spin_lock(&OCFS2_I(inode)->ip_lock); 121 emi = list_entry(p, struct ocfs2_extent_map_item, ei_list);
336 ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL); 122
337 if (ent) { 123 if (emi->ei_cpos >= cpos) {
338 if (!ent->e_tree_depth) { 124 /* Full truncate of this record. */
339 spin_unlock(&OCFS2_I(inode)->ip_lock); 125 list_move(&emi->ei_list, &tmp_list);
340 *ret_ent = ent; 126 BUG_ON(em->em_num_items == 0);
341 return 0; 127 em->em_num_items--;
342 } 128 continue;
343 blkno = le64_to_cpu(ent->e_rec.e_blkno);
344 spin_unlock(&OCFS2_I(inode)->ip_lock);
345
346 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno, &bh,
347 OCFS2_BH_CACHED, inode);
348 if (ret) {
349 mlog_errno(ret);
350 if (bh)
351 brelse(bh);
352 return ret;
353 } 129 }
354 eb = (struct ocfs2_extent_block *)bh->b_data;
355 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
356 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
357 brelse(bh);
358 return -EIO;
359 }
360 el = &eb->h_list;
361 } else {
362 spin_unlock(&OCFS2_I(inode)->ip_lock);
363 130
364 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), 131 range = emi->ei_cpos + emi->ei_clusters;
365 OCFS2_I(inode)->ip_blkno, &bh, 132 if (range > cpos) {
366 OCFS2_BH_CACHED, inode); 133 /* Partial truncate */
367 if (ret) { 134 emi->ei_clusters = cpos - emi->ei_cpos;
368 mlog_errno(ret);
369 if (bh)
370 brelse(bh);
371 return ret;
372 } 135 }
373 di = (struct ocfs2_dinode *)bh->b_data;
374 if (!OCFS2_IS_VALID_DINODE(di)) {
375 brelse(bh);
376 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, di);
377 return -EIO;
378 }
379 el = &di->id2.i_list;
380 }
381
382 ret = ocfs2_extent_map_find_leaf(inode, cpos, clusters, el);
383 brelse(bh);
384 if (ret) {
385 mlog_errno(ret);
386 return ret;
387 } 136 }
137 spin_unlock(&oi->ip_lock);
388 138
389 ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL); 139 list_for_each_safe(p, n, &tmp_list) {
390 if (!ent) { 140 emi = list_entry(p, struct ocfs2_extent_map_item, ei_list);
391 ret = -ESRCH; 141 list_del(&emi->ei_list);
392 mlog_errno(ret); 142 kfree(emi);
393 return ret;
394 } 143 }
395
396 /* FIXME: Make sure this isn't a corruption */
397 BUG_ON(ent->e_tree_depth);
398
399 *ret_ent = ent;
400
401 return 0;
402} 144}
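The truncate above has two cases worth spelling out: a record starting at or past cpos is dropped entirely, while a record straddling cpos is clipped so it ends exactly at cpos. A tiny self-contained model of the per-record decision:

/* Returns the record's new length in clusters; 0 means it would be
 * dropped from the cache. Mirrors the full/partial cases above. */
static unsigned int trunc_record(unsigned int ei_cpos,
				 unsigned int ei_clusters,
				 unsigned int cpos)
{
	if (ei_cpos >= cpos)
		return 0;			/* full truncate */
	if (ei_cpos + ei_clusters > cpos)
		return cpos - ei_cpos;		/* partial: clip the tail */
	return ei_clusters;			/* untouched */
}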
403 145
404/* 146/*
405 * Callers must hold ip_lock. This can insert pieces of the tree, 147 * Is any part of emi2 contained within emi1
406 * thus racing lookup if the lock weren't held.
407 */ 148 */
408static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em, 149static int ocfs2_ei_is_contained(struct ocfs2_extent_map_item *emi1,
409 struct ocfs2_extent_map_entry *ent) 150 struct ocfs2_extent_map_item *emi2)
410{ 151{
411 struct rb_node **p, *parent; 152 unsigned int range1, range2;
412 struct ocfs2_extent_map_entry *old_ent;
413 153
414 old_ent = ocfs2_extent_map_lookup(em, le32_to_cpu(ent->e_rec.e_cpos), 154 /*
415 le32_to_cpu(ent->e_rec.e_clusters), 155 * Check if logical start of emi2 is inside emi1
416 &p, &parent); 156 */
417 if (old_ent) 157 range1 = emi1->ei_cpos + emi1->ei_clusters;
418 return -EEXIST; 158 if (emi2->ei_cpos >= emi1->ei_cpos && emi2->ei_cpos < range1)
159 return 1;
419 160
420 rb_link_node(&ent->e_node, parent, p); 161 /*
421 rb_insert_color(&ent->e_node, &em->em_extents); 162 * Check if logical end of emi2 is inside emi1
163 */
164 range2 = emi2->ei_cpos + emi2->ei_clusters;
165 if (range2 > emi1->ei_cpos && range2 <= range1)
166 return 1;
422 167
423 return 0; 168 return 0;
424} 169}
425 170
171static void ocfs2_copy_emi_fields(struct ocfs2_extent_map_item *dest,
172 struct ocfs2_extent_map_item *src)
173{
174 dest->ei_cpos = src->ei_cpos;
175 dest->ei_phys = src->ei_phys;
176 dest->ei_clusters = src->ei_clusters;
177 dest->ei_flags = src->ei_flags;
178}
426 179
427/* 180/*
428 * Simple rule: on any return code other than -EAGAIN, anything left 181 * Try to merge emi with ins. Returns 1 if merge succeeds, zero
429 * in the insert_context will be freed. 182 * otherwise.
430 *
431 * Simple rule #2: A return code of -EEXIST from this function or
432 * its calls to ocfs2_extent_map_insert_entry() signifies that another
433 * thread beat us to the insert. It is not an actual error, but it
434 * tells the caller we have no more work to do.
435 */ 183 */
436static int ocfs2_extent_map_try_insert(struct inode *inode, 184static int ocfs2_try_to_merge_extent_map(struct ocfs2_extent_map_item *emi,
437 struct ocfs2_extent_rec *rec, 185 struct ocfs2_extent_map_item *ins)
438 int tree_depth,
439 struct ocfs2_em_insert_context *ctxt)
440{ 186{
441 int ret;
442 struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
443 struct ocfs2_extent_map_entry *old_ent;
444
445 ctxt->need_left = 0;
446 ctxt->need_right = 0;
447 ctxt->old_ent = NULL;
448
449 spin_lock(&OCFS2_I(inode)->ip_lock);
450 ret = ocfs2_extent_map_insert_entry(em, ctxt->new_ent);
451 if (!ret) {
452 ctxt->new_ent = NULL;
453 goto out_unlock;
454 }
455
456 /* Since insert_entry failed, the map MUST have old_ent */
457 old_ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos),
458 le32_to_cpu(rec->e_clusters),
459 NULL, NULL);
460
461 BUG_ON(!old_ent);
462
463 if (old_ent->e_tree_depth < tree_depth) {
464 /* Another thread beat us to the lower tree_depth */
465 ret = -EEXIST;
466 goto out_unlock;
467 }
468
469 if (old_ent->e_tree_depth == tree_depth) {
470 /*
471 * Another thread beat us to this tree_depth.
472 * Let's make sure we agree with that thread (the
473 * extent_rec should be identical).
474 */
475 if (!memcmp(rec, &old_ent->e_rec,
476 sizeof(struct ocfs2_extent_rec)))
477 ret = 0;
478 else
479 /* FIXME: Should this be ESRCH/EBADR??? */
480 ret = -EEXIST;
481
482 goto out_unlock;
483 }
484
485 /* 187 /*
486 * We do it in this order specifically so that no actual tree 188 * Handle contiguousness
487 * changes occur until we have all the pieces we need. We
488 * don't want malloc failures to leave an inconsistent tree.
489 * Whenever we drop the lock, another process could be
490 * inserting. Also note that, if another process just beat us
491 * to an insert, we might not need the same pieces we needed
492 * the first go round. In the end, the pieces we need will
493 * be used, and the pieces we don't will be freed.
494 */ 189 */
495 ctxt->need_left = !!(le32_to_cpu(rec->e_cpos) > 190 if (ins->ei_phys == (emi->ei_phys + emi->ei_clusters) &&
496 le32_to_cpu(old_ent->e_rec.e_cpos)); 191 ins->ei_cpos == (emi->ei_cpos + emi->ei_clusters) &&
497 ctxt->need_right = !!((le32_to_cpu(old_ent->e_rec.e_cpos) + 192 ins->ei_flags == emi->ei_flags) {
498 le32_to_cpu(old_ent->e_rec.e_clusters)) > 193 emi->ei_clusters += ins->ei_clusters;
499 (le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters))); 194 return 1;
500 ret = -EAGAIN; 195 } else if ((ins->ei_phys + ins->ei_clusters) == emi->ei_phys &&
 501 if (ctxt->need_left) { 196 (ins->ei_cpos + ins->ei_clusters) == emi->ei_cpos &&
502 if (!ctxt->left_ent) 197 ins->ei_flags == emi->ei_flags) {
503 goto out_unlock; 198 emi->ei_phys = ins->ei_phys;
504 *(ctxt->left_ent) = *old_ent; 199 emi->ei_cpos = ins->ei_cpos;
505 ctxt->left_ent->e_rec.e_clusters = 200 emi->ei_clusters += ins->ei_clusters;
506 cpu_to_le32(le32_to_cpu(rec->e_cpos) - 201 return 1;
507 le32_to_cpu(ctxt->left_ent->e_rec.e_cpos));
508 }
509 if (ctxt->need_right) {
510 if (!ctxt->right_ent)
511 goto out_unlock;
512 *(ctxt->right_ent) = *old_ent;
513 ctxt->right_ent->e_rec.e_cpos =
514 cpu_to_le32(le32_to_cpu(rec->e_cpos) +
515 le32_to_cpu(rec->e_clusters));
516 ctxt->right_ent->e_rec.e_clusters =
517 cpu_to_le32((le32_to_cpu(old_ent->e_rec.e_cpos) +
518 le32_to_cpu(old_ent->e_rec.e_clusters)) -
519 le32_to_cpu(ctxt->right_ent->e_rec.e_cpos));
520 }
521
522 rb_erase(&old_ent->e_node, &em->em_extents);
523 /* Now that he's erased, set him up for deletion */
524 ctxt->old_ent = old_ent;
525
526 if (ctxt->need_left) {
527 ret = ocfs2_extent_map_insert_entry(em,
528 ctxt->left_ent);
529 if (ret)
530 goto out_unlock;
531 ctxt->left_ent = NULL;
532 } 202 }
533 203
534 if (ctxt->need_right) { 204 /*
535 ret = ocfs2_extent_map_insert_entry(em, 205 * Overlapping extents - this shouldn't happen unless we've
536 ctxt->right_ent); 206 * split an extent to change it's flags. That is exceedingly
537 if (ret) 207 * rare, so there's no sense in trying to optimize it yet.
538 goto out_unlock; 208 */
539 ctxt->right_ent = NULL; 209 if (ocfs2_ei_is_contained(emi, ins) ||
210 ocfs2_ei_is_contained(ins, emi)) {
211 ocfs2_copy_emi_fields(emi, ins);
212 return 1;
540 } 213 }
541 214
542 ret = ocfs2_extent_map_insert_entry(em, ctxt->new_ent); 215 /* No merge was possible. */
543 216 return 0;
544 if (!ret)
545 ctxt->new_ent = NULL;
546
547out_unlock:
548 spin_unlock(&OCFS2_I(inode)->ip_lock);
549
550 return ret;
551} 217}
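A self-contained check of the forward-contiguity branch handled first above: a new extent beginning exactly where a cached one ends, both logically and physically, simply lengthens the cached record:

#include <assert.h>

struct ext { unsigned int cpos, phys, clusters, flags; };

/* Forward-contiguous merge only; mirrors the first branch above. */
static int merge_forward(struct ext *emi, const struct ext *ins)
{
	if (ins->phys == emi->phys + emi->clusters &&
	    ins->cpos == emi->cpos + emi->clusters &&
	    ins->flags == emi->flags) {
		emi->clusters += ins->clusters;
		return 1;
	}
	return 0;
}

int main(void)
{
	struct ext cached = { .cpos = 0, .phys = 100, .clusters = 8 };
	struct ext incoming = { .cpos = 8, .phys = 108, .clusters = 4 };

	assert(merge_forward(&cached, &incoming));
	assert(cached.clusters == 12);
	return 0;
}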
552 218
553 219/*
554static int ocfs2_extent_map_insert(struct inode *inode, 220 * In order to reduce complexity on the caller, this insert function
555 struct ocfs2_extent_rec *rec, 221 * is intentionally liberal in what it will accept.
556 int tree_depth) 222 *
223 * The only rule is that the truncate call *must* be used whenever
224 * records have been deleted. This avoids inserting overlapping
225 * records with different physical mappings.
226 */
227void ocfs2_extent_map_insert_rec(struct inode *inode,
228 struct ocfs2_extent_rec *rec)
557{ 229{
558 int ret; 230 struct ocfs2_inode_info *oi = OCFS2_I(inode);
559 struct ocfs2_em_insert_context ctxt = {0, }; 231 struct ocfs2_extent_map *em = &oi->ip_extent_map;
560 232 struct ocfs2_extent_map_item *emi, *new_emi = NULL;
561 if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) > 233 struct ocfs2_extent_map_item ins;
562 OCFS2_I(inode)->ip_map.em_clusters) { 234
563 ret = -EBADR; 235 ins.ei_cpos = le32_to_cpu(rec->e_cpos);
564 mlog_errno(ret); 236 ins.ei_phys = ocfs2_blocks_to_clusters(inode->i_sb,
565 return ret; 237 le64_to_cpu(rec->e_blkno));
238 ins.ei_clusters = le16_to_cpu(rec->e_leaf_clusters);
239 ins.ei_flags = rec->e_flags;
240
241search:
242 spin_lock(&oi->ip_lock);
243
244 list_for_each_entry(emi, &em->em_list, ei_list) {
245 if (ocfs2_try_to_merge_extent_map(emi, &ins)) {
246 list_move(&emi->ei_list, &em->em_list);
247 spin_unlock(&oi->ip_lock);
248 goto out;
249 }
566 } 250 }
567 251
568 /* Zero e_clusters means a truncated tail record. It better be EOF */ 252 /*
569 if (!rec->e_clusters) { 253 * No item could be merged.
570 if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) != 254 *
571 OCFS2_I(inode)->ip_map.em_clusters) { 255 * Either allocate and add a new item, or overwrite the last recently
572 ret = -EBADR; 256 * inserted.
573 mlog_errno(ret); 257 */
574 ocfs2_error(inode->i_sb,
575 "Zero e_clusters on non-tail extent record at e_blkno %llu on inode %llu\n",
576 (unsigned long long)le64_to_cpu(rec->e_blkno),
577 (unsigned long long)OCFS2_I(inode)->ip_blkno);
578 return ret;
579 }
580 258
581 /* Ignore the truncated tail */ 259 if (em->em_num_items < OCFS2_MAX_EXTENT_MAP_ITEMS) {
582 return 0; 260 if (new_emi == NULL) {
583 } 261 spin_unlock(&oi->ip_lock);
584 262
585 ret = -ENOMEM; 263 new_emi = kmalloc(sizeof(*new_emi), GFP_NOFS);
586 ctxt.new_ent = kmem_cache_alloc(ocfs2_em_ent_cachep, 264 if (new_emi == NULL)
587 GFP_NOFS); 265 goto out;
588 if (!ctxt.new_ent) {
589 mlog_errno(ret);
590 return ret;
591 }
592 266
593 ctxt.new_ent->e_rec = *rec; 267 goto search;
594 ctxt.new_ent->e_tree_depth = tree_depth;
595
596 do {
597 ret = -ENOMEM;
598 if (ctxt.need_left && !ctxt.left_ent) {
599 ctxt.left_ent =
600 kmem_cache_alloc(ocfs2_em_ent_cachep,
601 GFP_NOFS);
602 if (!ctxt.left_ent)
603 break;
604 }
605 if (ctxt.need_right && !ctxt.right_ent) {
606 ctxt.right_ent =
607 kmem_cache_alloc(ocfs2_em_ent_cachep,
608 GFP_NOFS);
609 if (!ctxt.right_ent)
610 break;
611 } 268 }
612 269
613 ret = ocfs2_extent_map_try_insert(inode, rec, 270 ocfs2_copy_emi_fields(new_emi, &ins);
614 tree_depth, &ctxt); 271 list_add(&new_emi->ei_list, &em->em_list);
615 } while (ret == -EAGAIN); 272 em->em_num_items++;
616 273 new_emi = NULL;
617 if ((ret < 0) && (ret != -EEXIST)) 274 } else {
618 mlog_errno(ret); 275 BUG_ON(list_empty(&em->em_list) || em->em_num_items == 0);
276 emi = list_entry(em->em_list.prev,
277 struct ocfs2_extent_map_item, ei_list);
278 list_move(&emi->ei_list, &em->em_list);
279 ocfs2_copy_emi_fields(emi, &ins);
280 }
619 281
620 if (ctxt.left_ent) 282 spin_unlock(&oi->ip_lock);
621 kmem_cache_free(ocfs2_em_ent_cachep, ctxt.left_ent);
622 if (ctxt.right_ent)
623 kmem_cache_free(ocfs2_em_ent_cachep, ctxt.right_ent);
624 if (ctxt.old_ent)
625 kmem_cache_free(ocfs2_em_ent_cachep, ctxt.old_ent);
626 if (ctxt.new_ent)
627 kmem_cache_free(ocfs2_em_ent_cachep, ctxt.new_ent);
628 283
629 return ret; 284out:
285 if (new_emi)
286 kfree(new_emi);
630} 287}
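Once em_num_items reaches OCFS2_MAX_EXTENT_MAP_ITEMS, the function above overwrites the list tail (the least recently used entry) instead of allocating. A sketch of that eviction over the array model used earlier:

/* Eviction sketch, reusing struct item from the lookup sketch above:
 * with the cache full, reuse the coldest slot, then rotate it to the
 * front as list_move() does. */
static void insert_full(struct item *mru, int items, struct item ins)
{
	mru[items - 1] = ins;
	for (int i = items - 1; i > 0; i--) {
		struct item tmp = mru[i];

		mru[i] = mru[i - 1];
		mru[i - 1] = tmp;
	}
}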
631 288
632/* 289/*
633 * Append this record to the tail of the extent map. It must be 290 * Return the 1st index within el which contains an extent start
634 * tree_depth 0. The record might be an extension of an existing 291 * larger than v_cluster.
635 * record, and as such that needs to be handled. eg:
636 *
637 * Existing record in the extent map:
638 *
639 * cpos = 10, len = 10
640 * |---------|
641 *
642 * New Record:
643 *
644 * cpos = 10, len = 20
645 * |------------------|
646 *
647 * The passed record is the new on-disk record. The new_clusters value
648 * is how many clusters were added to the file. If the append is a
649 * contiguous append, the new_clusters has been added to
650 * rec->e_clusters. If the append is an entirely new extent, then
651 * rec->e_clusters is == new_clusters.
652 */ 292 */
653int ocfs2_extent_map_append(struct inode *inode, 293static int ocfs2_search_for_hole_index(struct ocfs2_extent_list *el,
654 struct ocfs2_extent_rec *rec, 294 u32 v_cluster)
655 u32 new_clusters)
656{ 295{
657 int ret; 296 int i;
658 struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; 297 struct ocfs2_extent_rec *rec;
659 struct ocfs2_extent_map_entry *ent;
660 struct ocfs2_extent_rec *old;
661
662 BUG_ON(!new_clusters);
663 BUG_ON(le32_to_cpu(rec->e_clusters) < new_clusters);
664 298
 665 if (em->em_clusters < OCFS2_I(inode)->ip_clusters) { 299 for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
666 /* 300 rec = &el->l_recs[i];
667 * Size changed underneath us on disk. Drop any
668 * straddling records and update our idea of
669 * i_clusters
670 */
671 ocfs2_extent_map_drop(inode, em->em_clusters - 1);
672 em->em_clusters = OCFS2_I(inode)->ip_clusters;
673 }
674 301
675 mlog_bug_on_msg((le32_to_cpu(rec->e_cpos) + 302 if (v_cluster < le32_to_cpu(rec->e_cpos))
676 le32_to_cpu(rec->e_clusters)) != 303 break;
677 (em->em_clusters + new_clusters),
678 "Inode %llu:\n"
679 "rec->e_cpos = %u + rec->e_clusters = %u = %u\n"
680 "em->em_clusters = %u + new_clusters = %u = %u\n",
681 (unsigned long long)OCFS2_I(inode)->ip_blkno,
682 le32_to_cpu(rec->e_cpos), le32_to_cpu(rec->e_clusters),
683 le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters),
684 em->em_clusters, new_clusters,
685 em->em_clusters + new_clusters);
686
687 em->em_clusters += new_clusters;
688
689 ret = -ENOENT;
690 if (le32_to_cpu(rec->e_clusters) > new_clusters) {
691 /* This is a contiguous append */
692 ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos), 1,
693 NULL, NULL);
694 if (ent) {
695 old = &ent->e_rec;
696 BUG_ON((le32_to_cpu(rec->e_cpos) +
697 le32_to_cpu(rec->e_clusters)) !=
698 (le32_to_cpu(old->e_cpos) +
699 le32_to_cpu(old->e_clusters) +
700 new_clusters));
701 if (ent->e_tree_depth == 0) {
702 BUG_ON(le32_to_cpu(old->e_cpos) !=
703 le32_to_cpu(rec->e_cpos));
704 BUG_ON(le64_to_cpu(old->e_blkno) !=
705 le64_to_cpu(rec->e_blkno));
706 ret = 0;
707 }
708 /*
709 * Let non-leafs fall through as -ENOENT to
710 * force insertion of the new leaf.
711 */
712 le32_add_cpu(&old->e_clusters, new_clusters);
713 }
714 } 304 }
715 305
716 if (ret == -ENOENT) 306 return i;
717 ret = ocfs2_extent_map_insert(inode, rec, 0);
718 if (ret < 0)
719 mlog_errno(ret);
720 return ret;
721} 307}
722 308
723#if 0
724/* Code here is included but defined out as it completes the extent
725 * map api and may be used in the future. */
726
727/* 309/*
728 * Look up the record containing this cluster offset. This record is 310 * Figure out the size of a hole which starts at v_cluster within the given
729 * part of the extent map. Do not free it. Any changes you make to 311 * extent list.
730 * it will reflect in the extent map. So, if your last extent
731 * is (cpos = 10, clusters = 10) and you truncate the file by 5
732 * clusters, you can do:
733 * 312 *
734 * ret = ocfs2_extent_map_get_rec(em, orig_size - 5, &rec); 313 * If there is no more allocation past v_cluster, we return the maximum
735 * rec->e_clusters -= 5; 314 * cluster size minus v_cluster.
736 * 315 *
737 * The lookup does not read from disk. If the map isn't filled in for 316 * If we have in-inode extents, then el points to the dinode list and
738 * an entry, you won't find it. 317 * eb_bh is NULL. Otherwise, eb_bh should point to the extent block
739 * 318 * containing el.
740 * Also note that the returned record is valid until alloc_sem is
741 * dropped. After that, truncate and extend can happen. Caveat Emptor.
742 */ 319 */
743int ocfs2_extent_map_get_rec(struct inode *inode, u32 cpos, 320static int ocfs2_figure_hole_clusters(struct inode *inode,
744 struct ocfs2_extent_rec **rec, 321 struct ocfs2_extent_list *el,
745 int *tree_depth) 322 struct buffer_head *eb_bh,
323 u32 v_cluster,
324 u32 *num_clusters)
746{ 325{
747 int ret = -ENOENT; 326 int ret, i;
748 struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; 327 struct buffer_head *next_eb_bh = NULL;
749 struct ocfs2_extent_map_entry *ent; 328 struct ocfs2_extent_block *eb, *next_eb;
750 329
751 *rec = NULL; 330 i = ocfs2_search_for_hole_index(el, v_cluster);
752 331
753 if (cpos >= OCFS2_I(inode)->ip_clusters) 332 if (i == le16_to_cpu(el->l_next_free_rec) && eb_bh) {
754 return -EINVAL; 333 eb = (struct ocfs2_extent_block *)eb_bh->b_data;
755 334
756 if (cpos >= em->em_clusters) {
757 /* 335 /*
758 * Size changed underneath us on disk. Drop any 336 * Check the next leaf for any extents.
759 * straddling records and update our idea of
760 * i_clusters
761 */ 337 */
762 ocfs2_extent_map_drop(inode, em->em_clusters - 1);
 763 em->em_clusters = OCFS2_I(inode)->ip_clusters;
764 }
765
766 ent = ocfs2_extent_map_lookup(&OCFS2_I(inode)->ip_map, cpos, 1,
767 NULL, NULL);
768 338
769 if (ent) { 339 if (le64_to_cpu(eb->h_next_leaf_blk) == 0ULL)
770 *rec = &ent->e_rec; 340 goto no_more_extents;
771 if (tree_depth)
772 *tree_depth = ent->e_tree_depth;
773 ret = 0;
774 }
775 341
776 return ret; 342 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
777} 343 le64_to_cpu(eb->h_next_leaf_blk),
344 &next_eb_bh, OCFS2_BH_CACHED, inode);
345 if (ret) {
346 mlog_errno(ret);
347 goto out;
348 }
349 next_eb = (struct ocfs2_extent_block *)next_eb_bh->b_data;
778 350
779int ocfs2_extent_map_get_clusters(struct inode *inode, 351 if (!OCFS2_IS_VALID_EXTENT_BLOCK(next_eb)) {
780 u32 v_cpos, int count, 352 ret = -EROFS;
781 u32 *p_cpos, int *ret_count) 353 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, next_eb);
782{ 354 goto out;
783 int ret; 355 }
784 u32 coff, ccount;
785 struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
786 struct ocfs2_extent_map_entry *ent = NULL;
787 356
788 *p_cpos = ccount = 0; 357 el = &next_eb->h_list;
789 358
790 if ((v_cpos + count) > OCFS2_I(inode)->ip_clusters) 359 i = ocfs2_search_for_hole_index(el, v_cluster);
791 return -EINVAL; 360 }
792 361
793 if ((v_cpos + count) > em->em_clusters) { 362no_more_extents:
363 if (i == le16_to_cpu(el->l_next_free_rec)) {
794 /* 364 /*
795 * Size changed underneath us on disk. Drop any 365 * We're at the end of our existing allocation. Just
796 * straddling records and update our idea of 366 * return the maximum number of clusters we could
797 * i_clusters 367 * possibly allocate.
798 */ 368 */
799 ocfs2_extent_map_drop(inode, em->em_clusters - 1); 369 *num_clusters = UINT_MAX - v_cluster;
800 em->em_clusters = OCFS2_I(inode)->ip_clusters; 370 } else {
371 *num_clusters = le32_to_cpu(el->l_recs[i].e_cpos) - v_cluster;
801 } 372 }
802 373
374 ret = 0;
375out:
376 brelse(next_eb_bh);
377 return ret;
378}
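Worked example of the hole sizing above: with leaf records covering [0,8) and [16,24) and v_cluster = 10, the search index lands on the second record, so the hole is 16 - 10 = 6 clusters; past the last record it is UINT_MAX - v_cluster. A self-contained model combining the index search with the two end cases:

#include <limits.h>

struct rec { unsigned int cpos, clusters; };

/* Userspace model of ocfs2_search_for_hole_index() plus the hole
 * sizing above (ignoring the walk to the next leaf block). */
static unsigned int hole_size(const struct rec *recs, int n, unsigned int v)
{
	int i;

	for (i = 0; i < n; i++)
		if (v < recs[i].cpos)
			break;

	return (i == n) ? UINT_MAX - v : recs[i].cpos - v;
}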
803 379
804 ret = ocfs2_extent_map_lookup_read(inode, v_cpos, count, &ent); 380/*
805 if (ret) 381 * Return the index of the extent record which contains cluster #v_cluster.
806 return ret; 382 * -1 is returned if it was not found.
383 *
384 * Should work fine on interior and exterior nodes.
385 */
386static int ocfs2_search_extent_list(struct ocfs2_extent_list *el,
387 u32 v_cluster)
388{
389 int ret = -1;
390 int i;
391 struct ocfs2_extent_rec *rec;
392 u32 rec_end, rec_start, clusters;
807 393
 808 if (ent) { 394 for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
809 /* We should never find ourselves straddling an interval */ 395 rec = &el->l_recs[i];
810 if (!ocfs2_extent_rec_contains_clusters(&ent->e_rec,
811 v_cpos,
812 count))
813 return -ESRCH;
814 396
815 coff = v_cpos - le32_to_cpu(ent->e_rec.e_cpos); 397 rec_start = le32_to_cpu(rec->e_cpos);
816 *p_cpos = ocfs2_blocks_to_clusters(inode->i_sb, 398 clusters = ocfs2_rec_clusters(el, rec);
817 le64_to_cpu(ent->e_rec.e_blkno)) +
818 coff;
819 399
820 if (ret_count) 400 rec_end = rec_start + clusters;
821 *ret_count = le32_to_cpu(ent->e_rec.e_clusters) - coff;
822 401
823 return 0; 402 if (v_cluster >= rec_start && v_cluster < rec_end) {
403 ret = i;
404 break;
405 }
824 } 406 }
825 407
826 408 return ret;
827 return -ENOENT;
828} 409}
829 410
830#endif /* 0 */ 411int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
831 412 u32 *p_cluster, u32 *num_clusters,
832int ocfs2_extent_map_get_blocks(struct inode *inode, 413 unsigned int *extent_flags)
833 u64 v_blkno, int count,
834 u64 *p_blkno, int *ret_count)
835{ 414{
836 int ret; 415 int ret, i;
837 u64 boff; 416 unsigned int flags = 0;
838 u32 cpos, clusters; 417 struct buffer_head *di_bh = NULL;
839 int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); 418 struct buffer_head *eb_bh = NULL;
840 struct ocfs2_extent_map_entry *ent = NULL; 419 struct ocfs2_dinode *di;
841 struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; 420 struct ocfs2_extent_block *eb;
421 struct ocfs2_extent_list *el;
842 struct ocfs2_extent_rec *rec; 422 struct ocfs2_extent_rec *rec;
423 u32 coff;
843 424
844 *p_blkno = 0; 425 ret = ocfs2_extent_map_lookup(inode, v_cluster, p_cluster,
845 426 num_clusters, extent_flags);
846 cpos = ocfs2_blocks_to_clusters(inode->i_sb, v_blkno); 427 if (ret == 0)
847 clusters = ocfs2_blocks_to_clusters(inode->i_sb, 428 goto out;
848 (u64)count + bpc - 1);
849 if ((cpos + clusters) > OCFS2_I(inode)->ip_clusters) {
850 ret = -EINVAL;
851 mlog_errno(ret);
852 return ret;
853 }
854
855 if ((cpos + clusters) > em->em_clusters) {
856 /*
857 * Size changed underneath us on disk. Drop any
858 * straddling records and update our idea of
859 * i_clusters
860 */
861 ocfs2_extent_map_drop(inode, em->em_clusters - 1);
862 em->em_clusters = OCFS2_I(inode)->ip_clusters;
863 }
864 429
865 ret = ocfs2_extent_map_lookup_read(inode, cpos, clusters, &ent); 430 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), OCFS2_I(inode)->ip_blkno,
431 &di_bh, OCFS2_BH_CACHED, inode);
866 if (ret) { 432 if (ret) {
867 mlog_errno(ret); 433 mlog_errno(ret);
868 return ret; 434 goto out;
869 } 435 }
870 436
871 if (ent) 437 di = (struct ocfs2_dinode *) di_bh->b_data;
872 { 438 el = &di->id2.i_list;
873 rec = &ent->e_rec;
874 439
875 /* We should never find ourselves straddling an interval */ 440 if (el->l_tree_depth) {
876 if (!ocfs2_extent_rec_contains_clusters(rec, cpos, clusters)) { 441 ret = ocfs2_find_leaf(inode, el, v_cluster, &eb_bh);
877 ret = -ESRCH; 442 if (ret) {
878 mlog_errno(ret); 443 mlog_errno(ret);
879 return ret; 444 goto out;
880 } 445 }
881 446
882 boff = ocfs2_clusters_to_blocks(inode->i_sb, cpos - 447 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
883 le32_to_cpu(rec->e_cpos)); 448 el = &eb->h_list;
884 boff += (v_blkno & (u64)(bpc - 1));
885 *p_blkno = le64_to_cpu(rec->e_blkno) + boff;
886 449
887 if (ret_count) { 450 if (el->l_tree_depth) {
888 *ret_count = ocfs2_clusters_to_blocks(inode->i_sb, 451 ocfs2_error(inode->i_sb,
889 le32_to_cpu(rec->e_clusters)) - boff; 452 "Inode %lu has non zero tree depth in "
453 "leaf block %llu\n", inode->i_ino,
454 (unsigned long long)eb_bh->b_blocknr);
455 ret = -EROFS;
456 goto out;
890 } 457 }
891
892 return 0;
893 } 458 }
894 459
895 return -ENOENT; 460 i = ocfs2_search_extent_list(el, v_cluster);
896} 461 if (i == -1) {
897 462 /*
898int ocfs2_extent_map_init(struct inode *inode) 463 * A hole was found. Return some canned values that
899{ 464 * callers can key on. If asked for, num_clusters will
900 struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; 465 * be populated with the size of the hole.
901 466 */
902 em->em_extents = RB_ROOT; 467 *p_cluster = 0;
903 em->em_clusters = 0; 468 if (num_clusters) {
904 469 ret = ocfs2_figure_hole_clusters(inode, el, eb_bh,
905 return 0; 470 v_cluster,
906} 471 num_clusters);
907 472 if (ret) {
908/* Needs the lock */ 473 mlog_errno(ret);
909static void __ocfs2_extent_map_drop(struct inode *inode, 474 goto out;
910 u32 new_clusters, 475 }
911 struct rb_node **free_head, 476 }
912 struct ocfs2_extent_map_entry **tail_ent) 477 } else {
913{ 478 rec = &el->l_recs[i];
914 struct rb_node *node, *next;
915 struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
916 struct ocfs2_extent_map_entry *ent;
917 479
918 *free_head = NULL; 480 BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos));
919 481
920 ent = NULL; 482 if (!rec->e_blkno) {
921 node = rb_last(&em->em_extents); 483 ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
922 while (node) 484 "record (%u, %u, 0)", inode->i_ino,
923 { 485 le32_to_cpu(rec->e_cpos),
924 next = rb_prev(node); 486 ocfs2_rec_clusters(el, rec));
487 ret = -EROFS;
488 goto out;
489 }
925 490
926 ent = rb_entry(node, struct ocfs2_extent_map_entry, 491 coff = v_cluster - le32_to_cpu(rec->e_cpos);
927 e_node);
928 if (le32_to_cpu(ent->e_rec.e_cpos) < new_clusters)
929 break;
930 492
931 rb_erase(&ent->e_node, &em->em_extents); 493 *p_cluster = ocfs2_blocks_to_clusters(inode->i_sb,
494 le64_to_cpu(rec->e_blkno));
495 *p_cluster = *p_cluster + coff;
932 496
933 node->rb_right = *free_head; 497 if (num_clusters)
934 *free_head = node; 498 *num_clusters = ocfs2_rec_clusters(el, rec) - coff;
935 499
936 ent = NULL; 500 flags = rec->e_flags;
937 node = next;
938 }
939 501
940 /* Do we have an entry straddling new_clusters? */ 502 ocfs2_extent_map_insert_rec(inode, rec);
941 if (tail_ent) {
942 if (ent &&
943 ((le32_to_cpu(ent->e_rec.e_cpos) +
944 le32_to_cpu(ent->e_rec.e_clusters)) > new_clusters))
945 *tail_ent = ent;
946 else
947 *tail_ent = NULL;
948 } 503 }
949}
950
951static void __ocfs2_extent_map_drop_cleanup(struct rb_node *free_head)
952{
953 struct rb_node *node;
954 struct ocfs2_extent_map_entry *ent;
955 504
956 while (free_head) { 505 if (extent_flags)
957 node = free_head; 506 *extent_flags = flags;
958 free_head = node->rb_right;
959 507
960 ent = rb_entry(node, struct ocfs2_extent_map_entry, 508out:
961 e_node); 509 brelse(di_bh);
962 kmem_cache_free(ocfs2_em_ent_cachep, ent); 510 brelse(eb_bh);
963 } 511 return ret;
964} 512}
965 513
966/* 514/*
967 * Remove all entries past new_clusters, inclusive of an entry that 515 * This expects alloc_sem to be held. The allocation cannot change at
968 * contains new_clusters. This is effectively a cache forget. 516 * all while the map is in the process of being updated.
969 *
970 * If you want to also clip the last extent by some number of clusters,
971 * you need to call ocfs2_extent_map_trunc().
972 * This code does not check or modify ip_clusters.
973 */ 517 */
974int ocfs2_extent_map_drop(struct inode *inode, u32 new_clusters) 518int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno,
519 u64 *ret_count, unsigned int *extent_flags)
975{ 520{
976 struct rb_node *free_head = NULL; 521 int ret;
977 struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; 522 int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
978 struct ocfs2_extent_map_entry *ent; 523 u32 cpos, num_clusters, p_cluster;
979 524 u64 boff = 0;
980 spin_lock(&OCFS2_I(inode)->ip_lock);
981 525
982 __ocfs2_extent_map_drop(inode, new_clusters, &free_head, &ent); 526 cpos = ocfs2_blocks_to_clusters(inode->i_sb, v_blkno);
983 527
984 if (ent) { 528 ret = ocfs2_get_clusters(inode, cpos, &p_cluster, &num_clusters,
985 rb_erase(&ent->e_node, &em->em_extents); 529 extent_flags);
986 ent->e_node.rb_right = free_head; 530 if (ret) {
987 free_head = &ent->e_node; 531 mlog_errno(ret);
532 goto out;
988 } 533 }
989 534
990 spin_unlock(&OCFS2_I(inode)->ip_lock); 535 /*
991 536 * p_cluster == 0 indicates a hole.
992 if (free_head) 537 */
993 __ocfs2_extent_map_drop_cleanup(free_head); 538 if (p_cluster) {
994 539 boff = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
995 return 0; 540 boff += (v_blkno & (u64)(bpc - 1));
996} 541 }
997
998/*
999 * Remove all entries past new_clusters and also clip any extent
1000 * straddling new_clusters, if there is one. This does not check
1001 * or modify ip_clusters
1002 */
1003int ocfs2_extent_map_trunc(struct inode *inode, u32 new_clusters)
1004{
1005 struct rb_node *free_head = NULL;
1006 struct ocfs2_extent_map_entry *ent = NULL;
1007
1008 spin_lock(&OCFS2_I(inode)->ip_lock);
1009
1010 __ocfs2_extent_map_drop(inode, new_clusters, &free_head, &ent);
1011
1012 if (ent)
1013 ent->e_rec.e_clusters = cpu_to_le32(new_clusters -
1014 le32_to_cpu(ent->e_rec.e_cpos));
1015
1016 OCFS2_I(inode)->ip_map.em_clusters = new_clusters;
1017
1018 spin_unlock(&OCFS2_I(inode)->ip_lock);
1019
1020 if (free_head)
1021 __ocfs2_extent_map_drop_cleanup(free_head);
1022
1023 return 0;
1024}
1025 542
1026int __init init_ocfs2_extent_maps(void) 543 *p_blkno = boff;
1027{
1028 ocfs2_em_ent_cachep =
1029 kmem_cache_create("ocfs2_em_ent",
1030 sizeof(struct ocfs2_extent_map_entry),
1031 0, SLAB_HWCACHE_ALIGN, NULL, NULL);
1032 if (!ocfs2_em_ent_cachep)
1033 return -ENOMEM;
1034 544
1035 return 0; 545 if (ret_count) {
1036} 546 *ret_count = ocfs2_clusters_to_blocks(inode->i_sb, num_clusters);
547 *ret_count -= v_blkno & (u64)(bpc - 1);
548 }
1037 549
1038void exit_ocfs2_extent_maps(void) 550out:
1039{ 551 return ret;
1040 kmem_cache_destroy(ocfs2_em_ent_cachep);
1041} 552}
diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h
index fa3745efa886..de91e3e41a22 100644
--- a/fs/ocfs2/extent_map.h
+++ b/fs/ocfs2/extent_map.h
@@ -25,22 +25,29 @@
25#ifndef _EXTENT_MAP_H 25#ifndef _EXTENT_MAP_H
26#define _EXTENT_MAP_H 26#define _EXTENT_MAP_H
27 27
28int init_ocfs2_extent_maps(void); 28struct ocfs2_extent_map_item {
29void exit_ocfs2_extent_maps(void); 29 unsigned int ei_cpos;
30 unsigned int ei_phys;
31 unsigned int ei_clusters;
32 unsigned int ei_flags;
30 33
31/* 34 struct list_head ei_list;
32 * EVERY CALL here except _init, _trunc, and _drop expects alloc_sem 35};
33 * to be held. The allocation cannot change at all while the map is 36
34 * in the process of being updated. 37#define OCFS2_MAX_EXTENT_MAP_ITEMS 3
35 */ 38struct ocfs2_extent_map {
36int ocfs2_extent_map_init(struct inode *inode); 39 unsigned int em_num_items;
37int ocfs2_extent_map_append(struct inode *inode, 40 struct list_head em_list;
38 struct ocfs2_extent_rec *rec, 41};
39 u32 new_clusters); 42
40int ocfs2_extent_map_get_blocks(struct inode *inode, 43void ocfs2_extent_map_init(struct inode *inode);
41 u64 v_blkno, int count, 44void ocfs2_extent_map_trunc(struct inode *inode, unsigned int cluster);
42 u64 *p_blkno, int *ret_count); 45void ocfs2_extent_map_insert_rec(struct inode *inode,
43int ocfs2_extent_map_drop(struct inode *inode, u32 new_clusters); 46 struct ocfs2_extent_rec *rec);
44int ocfs2_extent_map_trunc(struct inode *inode, u32 new_clusters); 47
48int ocfs2_get_clusters(struct inode *inode, u32 v_cluster, u32 *p_cluster,
49 u32 *num_clusters, unsigned int *extent_flags);
50int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno,
51 u64 *ret_count, unsigned int *extent_flags);
45 52
46#endif /* _EXTENT_MAP_H */ 53#endif /* _EXTENT_MAP_H */
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index f2cd3bf9efb2..9395b4fa547d 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -33,6 +33,7 @@
33#include <linux/sched.h> 33#include <linux/sched.h>
34#include <linux/pipe_fs_i.h> 34#include <linux/pipe_fs_i.h>
35#include <linux/mount.h> 35#include <linux/mount.h>
36#include <linux/writeback.h>
36 37
37#define MLOG_MASK_PREFIX ML_INODE 38#define MLOG_MASK_PREFIX ML_INODE
38#include <cluster/masklog.h> 39#include <cluster/masklog.h>
@@ -206,16 +207,16 @@ out:
206 return ret; 207 return ret;
207} 208}
208 209
209int ocfs2_set_inode_size(handle_t *handle, 210static int ocfs2_set_inode_size(handle_t *handle,
210 struct inode *inode, 211 struct inode *inode,
211 struct buffer_head *fe_bh, 212 struct buffer_head *fe_bh,
212 u64 new_i_size) 213 u64 new_i_size)
213{ 214{
214 int status; 215 int status;
215 216
216 mlog_entry_void(); 217 mlog_entry_void();
217 i_size_write(inode, new_i_size); 218 i_size_write(inode, new_i_size);
218 inode->i_blocks = ocfs2_align_bytes_to_sectors(new_i_size); 219 inode->i_blocks = ocfs2_inode_sector_count(inode);
219 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 220 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
220 221
221 status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); 222 status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
@@ -261,6 +262,7 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
261{ 262{
262 int status; 263 int status;
263 handle_t *handle; 264 handle_t *handle;
265 struct ocfs2_dinode *di;
264 266
265 mlog_entry_void(); 267 mlog_entry_void();
266 268
@@ -274,12 +276,39 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
274 goto out; 276 goto out;
275 } 277 }
276 278
277 status = ocfs2_set_inode_size(handle, inode, fe_bh, new_i_size); 279 status = ocfs2_journal_access(handle, inode, fe_bh,
280 OCFS2_JOURNAL_ACCESS_WRITE);
281 if (status < 0) {
282 mlog_errno(status);
283 goto out_commit;
284 }
285
286 /*
287 * Do this before setting i_size.
288 */
289 status = ocfs2_zero_tail_for_truncate(inode, handle, new_i_size);
290 if (status) {
291 mlog_errno(status);
292 goto out_commit;
293 }
294
295 i_size_write(inode, new_i_size);
296 inode->i_blocks = ocfs2_align_bytes_to_sectors(new_i_size);
297 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
298
299 di = (struct ocfs2_dinode *) fe_bh->b_data;
300 di->i_size = cpu_to_le64(new_i_size);
301 di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
302 di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
303
304 status = ocfs2_journal_dirty(handle, fe_bh);
278 if (status < 0) 305 if (status < 0)
279 mlog_errno(status); 306 mlog_errno(status);
280 307
308out_commit:
281 ocfs2_commit_trans(osb, handle); 309 ocfs2_commit_trans(osb, handle);
282out: 310out:
311
283 mlog_exit(status); 312 mlog_exit(status);
284 return status; 313 return status;
285} 314}
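The restructuring above follows the usual ocfs2 journaling discipline: declare write access to the buffer, zero the tail before i_size moves so racing readers never see stale bytes past the new EOF, update both the in-memory inode and the on-disk dinode, then mark the buffer dirty in the transaction. A stripped-down sketch of that pattern (error paths elided, field update illustrative):

/* Sketch of the access/modify/dirty pattern used above. */
static int journaled_size_update(handle_t *handle, struct inode *inode,
				 struct buffer_head *bh, u64 new_size)
{
	int status;

	status = ocfs2_journal_access(handle, inode, bh,
				      OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0)
		return status;

	/* Only after access is declared may the buffer be modified. */
	((struct ocfs2_dinode *)bh->b_data)->i_size = cpu_to_le64(new_size);

	return ocfs2_journal_dirty(handle, bh);
}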
@@ -342,19 +371,6 @@ static int ocfs2_truncate_file(struct inode *inode,
342 mlog_errno(status); 371 mlog_errno(status);
343 goto bail; 372 goto bail;
344 } 373 }
345 ocfs2_data_unlock(inode, 1);
346
347 if (le32_to_cpu(fe->i_clusters) ==
348 ocfs2_clusters_for_bytes(osb->sb, new_i_size)) {
349 mlog(0, "fe->i_clusters = %u, so we do a simple truncate\n",
350 fe->i_clusters);
351 /* No allocation change is required, so lets fast path
352 * this truncate. */
353 status = ocfs2_simple_size_update(inode, di_bh, new_i_size);
354 if (status < 0)
355 mlog_errno(status);
356 goto bail;
357 }
358 374
359 /* alright, we're going to need to do a full blown alloc size 375 /* alright, we're going to need to do a full blown alloc size
360 * change. Orphan the inode so that recovery can complete the 376 * change. Orphan the inode so that recovery can complete the
@@ -363,22 +379,25 @@ static int ocfs2_truncate_file(struct inode *inode,
363 status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size); 379 status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size);
364 if (status < 0) { 380 if (status < 0) {
365 mlog_errno(status); 381 mlog_errno(status);
366 goto bail; 382 goto bail_unlock_data;
367 } 383 }
368 384
369 status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc); 385 status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc);
370 if (status < 0) { 386 if (status < 0) {
371 mlog_errno(status); 387 mlog_errno(status);
372 goto bail; 388 goto bail_unlock_data;
373 } 389 }
374 390
375 status = ocfs2_commit_truncate(osb, inode, di_bh, tc); 391 status = ocfs2_commit_truncate(osb, inode, di_bh, tc);
376 if (status < 0) { 392 if (status < 0) {
377 mlog_errno(status); 393 mlog_errno(status);
378 goto bail; 394 goto bail_unlock_data;
379 } 395 }
380 396
381 /* TODO: orphan dir cleanup here. */ 397 /* TODO: orphan dir cleanup here. */
398bail_unlock_data:
399 ocfs2_data_unlock(inode, 1);
400
382bail: 401bail:
383 402
384 mlog_exit(status); 403 mlog_exit(status);
@@ -397,6 +416,7 @@ bail:
397 */ 416 */
398int ocfs2_do_extend_allocation(struct ocfs2_super *osb, 417int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
399 struct inode *inode, 418 struct inode *inode,
419 u32 *logical_offset,
400 u32 clusters_to_add, 420 u32 clusters_to_add,
401 struct buffer_head *fe_bh, 421 struct buffer_head *fe_bh,
402 handle_t *handle, 422 handle_t *handle,
@@ -460,18 +480,14 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
460 block = ocfs2_clusters_to_blocks(osb->sb, bit_off); 480 block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
461 mlog(0, "Allocating %u clusters at block %u for inode %llu\n", 481 mlog(0, "Allocating %u clusters at block %u for inode %llu\n",
462 num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno); 482 num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
463 status = ocfs2_insert_extent(osb, handle, inode, fe_bh, block, 483 status = ocfs2_insert_extent(osb, handle, inode, fe_bh,
464 num_bits, meta_ac); 484 *logical_offset, block, num_bits,
485 meta_ac);
465 if (status < 0) { 486 if (status < 0) {
466 mlog_errno(status); 487 mlog_errno(status);
467 goto leave; 488 goto leave;
468 } 489 }
469 490
470 le32_add_cpu(&fe->i_clusters, num_bits);
471 spin_lock(&OCFS2_I(inode)->ip_lock);
472 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
473 spin_unlock(&OCFS2_I(inode)->ip_lock);
474
475 status = ocfs2_journal_dirty(handle, fe_bh); 491 status = ocfs2_journal_dirty(handle, fe_bh);
476 if (status < 0) { 492 if (status < 0) {
477 mlog_errno(status); 493 mlog_errno(status);
@@ -479,6 +495,7 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
479 } 495 }
480 496
481 clusters_to_add -= num_bits; 497 clusters_to_add -= num_bits;
498 *logical_offset += num_bits;
482 499
483 if (clusters_to_add) { 500 if (clusters_to_add) {
484 mlog(0, "need to alloc once more, clusters = %u, wanted = " 501 mlog(0, "need to alloc once more, clusters = %u, wanted = "
@@ -494,14 +511,87 @@ leave:
494 return status; 511 return status;
495} 512}
496 513
514/*
515 * For a given allocation, determine which allocators will need to be
516 * accessed, and lock them, reserving the appropriate number of bits.
517 *
518 * Called from ocfs2_extend_allocation() for file systems which don't
519 * support holes, and from ocfs2_write() for file systems which
520 * understand sparse inodes.
521 */
522int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
523 u32 clusters_to_add,
524 struct ocfs2_alloc_context **data_ac,
525 struct ocfs2_alloc_context **meta_ac)
526{
527 int ret, num_free_extents;
528 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
529
530 *meta_ac = NULL;
531 *data_ac = NULL;
532
533 mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, "
534 "clusters_to_add = %u\n",
535 (unsigned long long)OCFS2_I(inode)->ip_blkno, i_size_read(inode),
536 le32_to_cpu(di->i_clusters), clusters_to_add);
537
538 num_free_extents = ocfs2_num_free_extents(osb, inode, di);
539 if (num_free_extents < 0) {
540 ret = num_free_extents;
541 mlog_errno(ret);
542 goto out;
543 }
544
545 /*
546 * Sparse allocation file systems need to be more conservative
547 * with reserving room for expansion - the actual allocation
548 * happens while we've got a journal handle open so re-taking
549 * a cluster lock (because we ran out of room for another
550 * extent) will violate ordering rules.
551 *
552 * Most of the time we'll only be seeing this 1 cluster at a time
553 * anyway.
554 */
555 if (!num_free_extents ||
556 (ocfs2_sparse_alloc(osb) && num_free_extents < clusters_to_add)) {
557 ret = ocfs2_reserve_new_metadata(osb, di, meta_ac);
558 if (ret < 0) {
559 if (ret != -ENOSPC)
560 mlog_errno(ret);
561 goto out;
562 }
563 }
564
565 ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac);
566 if (ret < 0) {
567 if (ret != -ENOSPC)
568 mlog_errno(ret);
569 goto out;
570 }
571
572out:
573 if (ret) {
574 if (*meta_ac) {
575 ocfs2_free_alloc_context(*meta_ac);
576 *meta_ac = NULL;
577 }
578
579 /*
580 * We cannot have an error and a non null *data_ac.
581 */
582 }
583
584 return ret;
585}
586
497static int ocfs2_extend_allocation(struct inode *inode, 587static int ocfs2_extend_allocation(struct inode *inode,
498 u32 clusters_to_add) 588 u32 clusters_to_add)
499{ 589{
500 int status = 0; 590 int status = 0;
501 int restart_func = 0; 591 int restart_func = 0;
502 int drop_alloc_sem = 0; 592 int drop_alloc_sem = 0;
503 int credits, num_free_extents; 593 int credits;
504 u32 prev_clusters; 594 u32 prev_clusters, logical_start;
505 struct buffer_head *bh = NULL; 595 struct buffer_head *bh = NULL;
506 struct ocfs2_dinode *fe = NULL; 596 struct ocfs2_dinode *fe = NULL;
507 handle_t *handle = NULL; 597 handle_t *handle = NULL;
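The new ocfs2_lock_allocators() helper above factors allocator reservation out of the extend path so the sparse-file write code can share it. A minimal sketch of the calling pattern it implies — an inference from this hunk, not a verbatim caller; error handling trimmed:

	struct ocfs2_alloc_context *data_ac = NULL, *meta_ac = NULL;
	int ret;

	/* Reserve metadata and data bits before opening a journal handle. */
	ret = ocfs2_lock_allocators(inode, di, clusters_to_add,
				    &data_ac, &meta_ac);
	if (ret) {
		/* On failure the helper has already freed *meta_ac, and it
		 * never returns an error with a non-NULL *data_ac, so there
		 * is nothing to undo here. */
		return ret;
	}

	/* ... start a transaction, call ocfs2_do_extend_allocation() ... */

	if (data_ac)
		ocfs2_free_alloc_context(data_ac);
	if (meta_ac)
		ocfs2_free_alloc_context(meta_ac);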
@@ -512,6 +602,12 @@ static int ocfs2_extend_allocation(struct inode *inode,
512 602
513 mlog_entry("(clusters_to_add = %u)\n", clusters_to_add); 603 mlog_entry("(clusters_to_add = %u)\n", clusters_to_add);
514 604
605 /*
606 * This function only exists for file systems which don't
607 * support holes.
608 */
609 BUG_ON(ocfs2_sparse_alloc(osb));
610
515 status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh, 611 status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh,
516 OCFS2_BH_CACHED, inode); 612 OCFS2_BH_CACHED, inode);
517 if (status < 0) { 613 if (status < 0) {
@@ -526,39 +622,11 @@ static int ocfs2_extend_allocation(struct inode *inode,
526 goto leave; 622 goto leave;
527 } 623 }
528 624
625 logical_start = OCFS2_I(inode)->ip_clusters;
626
529restart_all: 627restart_all:
530 BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters); 628 BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);
531 629
532 mlog(0, "extend inode %llu, i_size = %lld, fe->i_clusters = %u, "
533 "clusters_to_add = %u\n",
534 (unsigned long long)OCFS2_I(inode)->ip_blkno, i_size_read(inode),
535 fe->i_clusters, clusters_to_add);
536
537 num_free_extents = ocfs2_num_free_extents(osb,
538 inode,
539 fe);
540 if (num_free_extents < 0) {
541 status = num_free_extents;
542 mlog_errno(status);
543 goto leave;
544 }
545
546 if (!num_free_extents) {
547 status = ocfs2_reserve_new_metadata(osb, fe, &meta_ac);
548 if (status < 0) {
549 if (status != -ENOSPC)
550 mlog_errno(status);
551 goto leave;
552 }
553 }
554
555 status = ocfs2_reserve_clusters(osb, clusters_to_add, &data_ac);
556 if (status < 0) {
557 if (status != -ENOSPC)
558 mlog_errno(status);
559 goto leave;
560 }
561
562 /* blocks people in read/write from reading our allocation 630
563 * until we're done changing it. We depend on i_mutex to block 631 * until we're done changing it. We depend on i_mutex to block
564 * other extend/truncate calls while we're here. Ordering wrt 632 * other extend/truncate calls while we're here. Ordering wrt
@@ -566,6 +634,13 @@ restart_all:
566 down_write(&OCFS2_I(inode)->ip_alloc_sem); 634 down_write(&OCFS2_I(inode)->ip_alloc_sem);
567 drop_alloc_sem = 1; 635 drop_alloc_sem = 1;
568 636
637 status = ocfs2_lock_allocators(inode, fe, clusters_to_add, &data_ac,
638 &meta_ac);
639 if (status) {
640 mlog_errno(status);
641 goto leave;
642 }
643
569 credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add); 644 credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add);
570 handle = ocfs2_start_trans(osb, credits); 645 handle = ocfs2_start_trans(osb, credits);
571 if (IS_ERR(handle)) { 646 if (IS_ERR(handle)) {
@@ -590,6 +665,7 @@ restarted_transaction:
590 665
591 status = ocfs2_do_extend_allocation(osb, 666 status = ocfs2_do_extend_allocation(osb,
592 inode, 667 inode,
668 &logical_start,
593 clusters_to_add, 669 clusters_to_add,
594 bh, 670 bh,
595 handle, 671 handle,
@@ -637,7 +713,8 @@ restarted_transaction:
637 } 713 }
638 714
639 mlog(0, "fe: i_clusters = %u, i_size=%llu\n", 715 mlog(0, "fe: i_clusters = %u, i_size=%llu\n",
640 fe->i_clusters, (unsigned long long)fe->i_size); 716 le32_to_cpu(fe->i_clusters),
717 (unsigned long long)le64_to_cpu(fe->i_size));
641 mlog(0, "inode: ip_clusters=%u, i_size=%lld\n", 718 mlog(0, "inode: ip_clusters=%u, i_size=%lld\n",
642 OCFS2_I(inode)->ip_clusters, i_size_read(inode)); 719 OCFS2_I(inode)->ip_clusters, i_size_read(inode));
643 720
@@ -778,7 +855,7 @@ static int ocfs2_extend_file(struct inode *inode,
778 size_t tail_to_skip) 855 size_t tail_to_skip)
779{ 856{
780 int ret = 0; 857 int ret = 0;
781 u32 clusters_to_add; 858 u32 clusters_to_add = 0;
782 859
783 BUG_ON(!tail_to_skip && !di_bh); 860 BUG_ON(!tail_to_skip && !di_bh);
784 861
@@ -790,6 +867,11 @@ static int ocfs2_extend_file(struct inode *inode,
790 goto out; 867 goto out;
791 BUG_ON(new_i_size < i_size_read(inode)); 868 BUG_ON(new_i_size < i_size_read(inode));
792 869
870 if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) {
871 BUG_ON(tail_to_skip != 0);
872 goto out_update_size;
873 }
874
793 clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size) - 875 clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size) -
794 OCFS2_I(inode)->ip_clusters; 876 OCFS2_I(inode)->ip_clusters;
795 877
@@ -825,6 +907,7 @@ static int ocfs2_extend_file(struct inode *inode,
825 goto out_unlock; 907 goto out_unlock;
826 } 908 }
827 909
910out_update_size:
828 if (!tail_to_skip) { 911 if (!tail_to_skip) {
829 /* We're being called from ocfs2_setattr() which wants 912 /* We're being called from ocfs2_setattr() which wants
830 * us to update i_size */ 913 * us to update i_size */
@@ -834,7 +917,8 @@ static int ocfs2_extend_file(struct inode *inode,
834 } 917 }
835 918
836out_unlock: 919out_unlock:
837 ocfs2_data_unlock(inode, 1); 920 if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
921 ocfs2_data_unlock(inode, 1);
838 922
839out: 923out:
840 return ret; 924 return ret;
@@ -972,7 +1056,8 @@ int ocfs2_permission(struct inode *inode, int mask, struct nameidata *nd)
972 1056
973 ret = ocfs2_meta_lock(inode, NULL, 0); 1057 ret = ocfs2_meta_lock(inode, NULL, 0);
974 if (ret) { 1058 if (ret) {
975 mlog_errno(ret); 1059 if (ret != -ENOENT)
1060 mlog_errno(ret);
976 goto out; 1061 goto out;
977 } 1062 }
978 1063
@@ -1035,10 +1120,49 @@ out:
1035 return ret; 1120 return ret;
1036} 1121}
1037 1122
1123/*
1124 * Will look for holes and unwritten extents in the range starting at
1125 * pos for count bytes (inclusive).
1126 */
1127static int ocfs2_check_range_for_holes(struct inode *inode, loff_t pos,
1128 size_t count)
1129{
1130 int ret = 0;
1131 unsigned int extent_flags;
1132 u32 cpos, clusters, extent_len, phys_cpos;
1133 struct super_block *sb = inode->i_sb;
1134
1135 cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits;
1136 clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos;
1137
1138 while (clusters) {
1139 ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len,
1140 &extent_flags);
1141 if (ret < 0) {
1142 mlog_errno(ret);
1143 goto out;
1144 }
1145
1146 if (phys_cpos == 0 || (extent_flags & OCFS2_EXT_UNWRITTEN)) {
1147 ret = 1;
1148 break;
1149 }
1150
1151 if (extent_len > clusters)
1152 extent_len = clusters;
1153
1154 clusters -= extent_len;
1155 cpos += extent_len;
1156 }
1157out:
1158 return ret;
1159}
1160
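The return convention above is tri-state: 1 when a hole or unwritten extent lies in the range, 0 when the range is fully allocated and written, and a negative errno when the extent lookup fails. A sketch of how a caller separates the cases (ocfs2_prepare_inode_for_write() below does exactly this; direct_io_ok is an illustrative flag, not a name from the patch):

	ret = ocfs2_check_range_for_holes(inode, saved_pos, count);
	if (ret < 0)
		mlog_errno(ret);	/* extent map lookup failed */
	else if (ret == 1)
		direct_io_ok = 0;	/* hole/unwritten found: go buffered */
	else
		direct_io_ok = 1;	/* fully allocated: direct I/O is safe */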
1038static int ocfs2_prepare_inode_for_write(struct dentry *dentry, 1161static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
1039 loff_t *ppos, 1162 loff_t *ppos,
1040 size_t count, 1163 size_t count,
1041 int appending) 1164 int appending,
1165 int *direct_io)
1042{ 1166{
1043 int ret = 0, meta_level = appending; 1167 int ret = 0, meta_level = appending;
1044 struct inode *inode = dentry->d_inode; 1168 struct inode *inode = dentry->d_inode;
@@ -1089,6 +1213,49 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
1089 } else { 1213 } else {
1090 saved_pos = *ppos; 1214 saved_pos = *ppos;
1091 } 1215 }
1216
1217 if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) {
1218 loff_t end = saved_pos + count;
1219
1220 /*
1221 * Skip the O_DIRECT checks if we don't need
1222 * them.
1223 */
1224 if (!direct_io || !(*direct_io))
1225 break;
1226
1227 /*
1228 * Allowing concurrent direct writes means
1229 * i_size changes wouldn't be synchronized, so
1230 * one node could wind up truncating another
1231 * node's writes.
1232 */
1233 if (end > i_size_read(inode)) {
1234 *direct_io = 0;
1235 break;
1236 }
1237
1238 /*
1239 * We don't fill holes during direct io, so
1240 * check for them here. If any are found, the
1241 * caller will have to retake some cluster
1242 * locks and initiate the io as buffered.
1243 */
1244 ret = ocfs2_check_range_for_holes(inode, saved_pos,
1245 count);
1246 if (ret == 1) {
1247 *direct_io = 0;
1248 ret = 0;
1249 } else if (ret < 0)
1250 mlog_errno(ret);
1251 break;
1252 }
1253
1254 /*
1255 * The rest of this loop is concerned with legacy file
1256 * systems which don't support sparse files.
1257 */
1258
1092 newsize = count + saved_pos; 1259 newsize = count + saved_pos;
1093 1260
1094 mlog(0, "pos=%lld newsize=%lld cursize=%lld\n", 1261 mlog(0, "pos=%lld newsize=%lld cursize=%lld\n",
@@ -1141,55 +1308,264 @@ out:
1141 return ret; 1308 return ret;
1142} 1309}
1143 1310
1311static inline void
1312ocfs2_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes)
1313{
1314 const struct iovec *iov = *iovp;
1315 size_t base = *basep;
1316
1317 do {
1318 int copy = min(bytes, iov->iov_len - base);
1319
1320 bytes -= copy;
1321 base += copy;
1322 if (iov->iov_len == base) {
1323 iov++;
1324 base = 0;
1325 }
1326 } while (bytes);
1327 *iovp = iov;
1328 *basep = base;
1329}
1330
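A worked example of the iovec cursor above, with assumed segment sizes: given iov[0].iov_len = 8 and iov[1].iov_len = 16, advancing 12 bytes from (&iov[0], base 0) consumes the remaining 8 bytes of segment 0, steps to segment 1, and leaves the base at 4:

	/* assumed: iov[0].iov_len == 8, iov[1].iov_len == 16 */
	const struct iovec *cur = &iov[0];
	size_t base = 0;

	ocfs2_set_next_iovec(&cur, &base, 12);
	/* pass 1: copy = min(12, 8 - 0) = 8; segment 0 exhausted, cur++ */
	/* pass 2: copy = min(4, 16 - 0) = 4; base left at 4             */
	/* result: cur == &iov[1], base == 4                             */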
1331static struct page * ocfs2_get_write_source(struct ocfs2_buffered_write_priv *bp,
1332 const struct iovec *cur_iov,
1333 size_t iov_offset)
1334{
1335 int ret;
1336 char *buf;
1337 struct page *src_page = NULL;
1338
1339 buf = cur_iov->iov_base + iov_offset;
1340
1341 if (!segment_eq(get_fs(), KERNEL_DS)) {
1342 /*
1343 * Pull in the user page. We want to do this outside
1344 * of the meta data locks in order to preserve locking
1345 * order in case of page fault.
1346 */
1347 ret = get_user_pages(current, current->mm,
1348 (unsigned long)buf & PAGE_CACHE_MASK, 1,
1349 0, 0, &src_page, NULL);
1350 if (ret == 1)
1351 bp->b_src_buf = kmap(src_page);
1352 else
1353 src_page = ERR_PTR(-EFAULT);
1354 } else {
1355 bp->b_src_buf = buf;
1356 }
1357
1358 return src_page;
1359}
1360
1361static void ocfs2_put_write_source(struct ocfs2_buffered_write_priv *bp,
1362 struct page *page)
1363{
1364 if (page) {
1365 kunmap(page);
1366 page_cache_release(page);
1367 }
1368}
1369
1370static ssize_t ocfs2_file_buffered_write(struct file *file, loff_t *ppos,
1371 const struct iovec *iov,
1372 unsigned long nr_segs,
1373 size_t count,
1374 ssize_t o_direct_written)
1375{
1376 int ret = 0;
1377 ssize_t copied, total = 0;
1378 size_t iov_offset = 0;
1379 const struct iovec *cur_iov = iov;
1380 struct ocfs2_buffered_write_priv bp;
1381 struct page *page;
1382
1383 /*
1384 * handle partial DIO write. Adjust cur_iov if needed.
1385 */
1386 ocfs2_set_next_iovec(&cur_iov, &iov_offset, o_direct_written);
1387
1388 do {
1389 bp.b_cur_off = iov_offset;
1390 bp.b_cur_iov = cur_iov;
1391
1392 page = ocfs2_get_write_source(&bp, cur_iov, iov_offset);
1393 if (IS_ERR(page)) {
1394 ret = PTR_ERR(page);
1395 goto out;
1396 }
1397
1398 copied = ocfs2_buffered_write_cluster(file, *ppos, count,
1399 ocfs2_map_and_write_user_data,
1400 &bp);
1401
1402 ocfs2_put_write_source(&bp, page);
1403
1404 if (copied < 0) {
1405 mlog_errno(copied);
1406 ret = copied;
1407 goto out;
1408 }
1409
1410 total += copied;
1411 *ppos = *ppos + copied;
1412 count -= copied;
1413
1414 ocfs2_set_next_iovec(&cur_iov, &iov_offset, copied);
1415 } while(count);
1416
1417out:
1418 return total ? total : ret;
1419}
1420
1421static int ocfs2_check_iovec(const struct iovec *iov, size_t *counted,
1422 unsigned long *nr_segs)
1423{
1424 size_t ocount; /* original count */
1425 unsigned long seg;
1426
1427 ocount = 0;
1428 for (seg = 0; seg < *nr_segs; seg++) {
1429 const struct iovec *iv = &iov[seg];
1430
1431 /*
1432 * If any segment has a negative length, or the cumulative
1433 * length ever wraps negative then return -EINVAL.
1434 */
1435 ocount += iv->iov_len;
1436 if (unlikely((ssize_t)(ocount|iv->iov_len) < 0))
1437 return -EINVAL;
1438 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1439 continue;
1440 if (seg == 0)
1441 return -EFAULT;
1442 *nr_segs = seg;
1443 ocount -= iv->iov_len; /* This segment is no good */
1444 break;
1445 }
1446
1447 *counted = ocount;
1448 return 0;
1449}
1450
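The (ssize_t)(ocount | iv->iov_len) < 0 test above is the usual kernel overflow trick: OR-ing the running total with the current segment length sets the sign bit if either value alone exceeds SSIZE_MAX, so one comparison catches both an oversized single segment and a cumulative sum that wrapped negative. Because the sum grows by at most one checked segment per pass, it cannot skip over the negative range undetected. Illustrative values (assuming 64-bit size_t):

	/* iv->iov_len == 0x8000000000000000  ->  OR is negative  ->  -EINVAL */
	/* ocount climbs past SSIZE_MAX       ->  same sign bit   ->  -EINVAL */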
1144static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, 1451static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
1145 const struct iovec *iov, 1452 const struct iovec *iov,
1146 unsigned long nr_segs, 1453 unsigned long nr_segs,
1147 loff_t pos) 1454 loff_t pos)
1148{ 1455{
1149 int ret, rw_level, have_alloc_sem = 0; 1456 int ret, direct_io, appending, rw_level, have_alloc_sem = 0;
1150 struct file *filp = iocb->ki_filp; 1457 int can_do_direct, sync = 0;
1151 struct inode *inode = filp->f_path.dentry->d_inode; 1458 ssize_t written = 0;
1152 int appending = filp->f_flags & O_APPEND ? 1 : 0; 1459 size_t ocount; /* original count */
1153 1460 size_t count; /* after file limit checks */
1154 mlog_entry("(0x%p, %u, '%.*s')\n", filp, 1461 loff_t *ppos = &iocb->ki_pos;
1462 struct file *file = iocb->ki_filp;
1463 struct inode *inode = file->f_path.dentry->d_inode;
1464
1465 mlog_entry("(0x%p, %u, '%.*s')\n", file,
1155 (unsigned int)nr_segs, 1466 (unsigned int)nr_segs,
1156 filp->f_path.dentry->d_name.len, 1467 file->f_path.dentry->d_name.len,
1157 filp->f_path.dentry->d_name.name); 1468 file->f_path.dentry->d_name.name);
1158 1469
1159 /* happy write of zero bytes */
1160 if (iocb->ki_left == 0) 1470 if (iocb->ki_left == 0)
1161 return 0; 1471 return 0;
1162 1472
1473 ret = ocfs2_check_iovec(iov, &ocount, &nr_segs);
1474 if (ret)
1475 return ret;
1476
1477 count = ocount;
1478
1479 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
1480
1481 appending = file->f_flags & O_APPEND ? 1 : 0;
1482 direct_io = file->f_flags & O_DIRECT ? 1 : 0;
1483
1163 mutex_lock(&inode->i_mutex); 1484 mutex_lock(&inode->i_mutex);
1485
1486relock:
1164 /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */ 1487 /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */
1165 if (filp->f_flags & O_DIRECT) { 1488 if (direct_io) {
1166 have_alloc_sem = 1;
1167 down_read(&inode->i_alloc_sem); 1489 down_read(&inode->i_alloc_sem);
1490 have_alloc_sem = 1;
1168 } 1491 }
1169 1492
1170 /* concurrent O_DIRECT writes are allowed */ 1493 /* concurrent O_DIRECT writes are allowed */
1171 rw_level = (filp->f_flags & O_DIRECT) ? 0 : 1; 1494 rw_level = !direct_io;
1172 ret = ocfs2_rw_lock(inode, rw_level); 1495 ret = ocfs2_rw_lock(inode, rw_level);
1173 if (ret < 0) { 1496 if (ret < 0) {
1174 rw_level = -1;
1175 mlog_errno(ret); 1497 mlog_errno(ret);
1176 goto out; 1498 goto out_sems;
1177 } 1499 }
1178 1500
1179 ret = ocfs2_prepare_inode_for_write(filp->f_path.dentry, &iocb->ki_pos, 1501 can_do_direct = direct_io;
1180 iocb->ki_left, appending); 1502 ret = ocfs2_prepare_inode_for_write(file->f_path.dentry, ppos,
1503 iocb->ki_left, appending,
1504 &can_do_direct);
1181 if (ret < 0) { 1505 if (ret < 0) {
1182 mlog_errno(ret); 1506 mlog_errno(ret);
1183 goto out; 1507 goto out;
1184 } 1508 }
1185 1509
1186 /* communicate with ocfs2_dio_end_io */ 1510 /*
1187 ocfs2_iocb_set_rw_locked(iocb); 1511 * We can't complete the direct I/O as requested, so fall back to
1512 * buffered I/O.
1513 */
1514 if (direct_io && !can_do_direct) {
1515 ocfs2_rw_unlock(inode, rw_level);
1516 up_read(&inode->i_alloc_sem);
1517
1518 have_alloc_sem = 0;
1519 rw_level = -1;
1188 1520
1189 ret = generic_file_aio_write_nolock(iocb, iov, nr_segs, iocb->ki_pos); 1521 direct_io = 0;
1522 sync = 1;
1523 goto relock;
1524 }
1525
1526 if (!sync && ((file->f_flags & O_SYNC) || IS_SYNC(inode)))
1527 sync = 1;
1528
1529 /*
1530 * XXX: Is it ok to execute these checks a second time?
1531 */
1532 ret = generic_write_checks(file, ppos, &count, S_ISBLK(inode->i_mode));
1533 if (ret)
1534 goto out;
1535
1536 /*
1537 * Set pos so that sync_page_range_nolock() below understands
1538 * where to start from. We might've moved it around via the
1539 * calls above. The range we want to actually sync starts from
1540 * *ppos here.
1541 *
1542 */
1543 pos = *ppos;
1544
1545 /* communicate with ocfs2_dio_end_io */
1546 ocfs2_iocb_set_rw_locked(iocb, rw_level);
1547
1548 if (direct_io) {
1549 written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos,
1550 ppos, count, ocount);
1551 if (written < 0) {
1552 ret = written;
1553 goto out_dio;
1554 }
1555 } else {
1556 written = ocfs2_file_buffered_write(file, ppos, iov, nr_segs,
1557 count, written);
1558 if (written < 0) {
1559 ret = written;
1560 if (ret != -EFAULT && ret != -ENOSPC)
1561 mlog_errno(ret);
1562 goto out;
1563 }
1564 }
1190 1565
1566out_dio:
1191 /* buffered aio wouldn't have proper lock coverage today */ 1567 /* buffered aio wouldn't have proper lock coverage today */
1192 BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT)); 1568 BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT));
1193 1569
1194 /* 1570 /*
1195 * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io 1571 * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io
@@ -1207,13 +1583,102 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
1207 } 1583 }
1208 1584
1209out: 1585out:
1586 if (rw_level != -1)
1587 ocfs2_rw_unlock(inode, rw_level);
1588
1589out_sems:
1210 if (have_alloc_sem) 1590 if (have_alloc_sem)
1211 up_read(&inode->i_alloc_sem); 1591 up_read(&inode->i_alloc_sem);
1212 if (rw_level != -1) 1592
1213 ocfs2_rw_unlock(inode, rw_level); 1593 if (written > 0 && sync) {
1594 ssize_t err;
1595
1596 err = sync_page_range_nolock(inode, file->f_mapping, pos, count);
1597 if (err < 0)
1598 written = err;
1599 }
1600
1214 mutex_unlock(&inode->i_mutex); 1601 mutex_unlock(&inode->i_mutex);
1215 1602
1216 mlog_exit(ret); 1603 mlog_exit(ret);
1604 return written ? written : ret;
1605}
1606
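Condensing the rewritten write path above into pseudocode — a summary sketch, not the function itself:

	/*
	 * relock:
	 *   if (direct_io) down_read(&inode->i_alloc_sem);
	 *   ocfs2_rw_lock(inode, !direct_io);
	 *   ocfs2_prepare_inode_for_write(..., &can_do_direct);
	 *   if (direct_io && !can_do_direct) {
	 *           drop both locks; direct_io = 0; sync = 1;
	 *           goto relock;     // retry as a buffered write
	 *   }
	 *   written = direct_io ? generic_file_direct_write(...)
	 *                       : ocfs2_file_buffered_write(...);
	 *   unlock; if (written > 0 && sync)
	 *           sync_page_range_nolock(inode, file->f_mapping, pos, count);
	 */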
1607static int ocfs2_splice_write_actor(struct pipe_inode_info *pipe,
1608 struct pipe_buffer *buf,
1609 struct splice_desc *sd)
1610{
1611 int ret, count, total = 0;
1612 ssize_t copied = 0;
1613 struct ocfs2_splice_write_priv sp;
1614
1615 ret = buf->ops->pin(pipe, buf);
1616 if (ret)
1617 goto out;
1618
1619 sp.s_sd = sd;
1620 sp.s_buf = buf;
1621 sp.s_pipe = pipe;
1622 sp.s_offset = sd->pos & ~PAGE_CACHE_MASK;
1623 sp.s_buf_offset = buf->offset;
1624
1625 count = sd->len;
1626 if (count + sp.s_offset > PAGE_CACHE_SIZE)
1627 count = PAGE_CACHE_SIZE - sp.s_offset;
1628
1629 do {
1630 /*
1631 * splice wants us to copy up to one page at a
1632 * time. For pagesize > cluster size, this means we
1633 * might enter ocfs2_buffered_write_cluster() more
1634 * than once, so keep track of our progress here.
1635 */
1636 copied = ocfs2_buffered_write_cluster(sd->file,
1637 (loff_t)sd->pos + total,
1638 count,
1639 ocfs2_map_and_write_splice_data,
1640 &sp);
1641 if (copied < 0) {
1642 mlog_errno(copied);
1643 ret = copied;
1644 goto out;
1645 }
1646
1647 count -= copied;
1648 sp.s_offset += copied;
1649 sp.s_buf_offset += copied;
1650 total += copied;
1651 } while (count);
1652
1653 ret = 0;
1654out:
1655
1656 return total ? total : ret;
1657}
1658
1659static ssize_t __ocfs2_file_splice_write(struct pipe_inode_info *pipe,
1660 struct file *out,
1661 loff_t *ppos,
1662 size_t len,
1663 unsigned int flags)
1664{
1665 int ret, err;
1666 struct address_space *mapping = out->f_mapping;
1667 struct inode *inode = mapping->host;
1668
1669 ret = __splice_from_pipe(pipe, out, ppos, len, flags,
1670 ocfs2_splice_write_actor);
1671 if (ret > 0) {
1672 *ppos += ret;
1673
1674 if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) {
1675 err = generic_osync_inode(inode, mapping,
1676 OSYNC_METADATA|OSYNC_DATA);
1677 if (err)
1678 ret = err;
1679 }
1680 }
1681
1217 return ret; 1682 return ret;
1218} 1683}
1219 1684
@@ -1239,14 +1704,15 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
1239 goto out; 1704 goto out;
1240 } 1705 }
1241 1706
1242 ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, ppos, len, 0); 1707 ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, ppos, len, 0,
1708 NULL);
1243 if (ret < 0) { 1709 if (ret < 0) {
1244 mlog_errno(ret); 1710 mlog_errno(ret);
1245 goto out_unlock; 1711 goto out_unlock;
1246 } 1712 }
1247 1713
1248 /* ok, we're done with i_size and alloc work */ 1714 /* ok, we're done with i_size and alloc work */
1249 ret = generic_file_splice_write_nolock(pipe, out, ppos, len, flags); 1715 ret = __ocfs2_file_splice_write(pipe, out, ppos, len, flags);
1250 1716
1251out_unlock: 1717out_unlock:
1252 ocfs2_rw_unlock(inode, 1); 1718 ocfs2_rw_unlock(inode, 1);
@@ -1323,7 +1789,7 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
1323 } 1789 }
1324 rw_level = 0; 1790 rw_level = 0;
1325 /* communicate with ocfs2_dio_end_io */ 1791 /* communicate with ocfs2_dio_end_io */
1326 ocfs2_iocb_set_rw_locked(iocb); 1792 ocfs2_iocb_set_rw_locked(iocb, rw_level);
1327 } 1793 }
1328 1794
1329 /* 1795 /*
@@ -1388,6 +1854,9 @@ const struct file_operations ocfs2_fops = {
1388 .aio_read = ocfs2_file_aio_read, 1854 .aio_read = ocfs2_file_aio_read,
1389 .aio_write = ocfs2_file_aio_write, 1855 .aio_write = ocfs2_file_aio_write,
1390 .ioctl = ocfs2_ioctl, 1856 .ioctl = ocfs2_ioctl,
1857#ifdef CONFIG_COMPAT
1858 .compat_ioctl = ocfs2_compat_ioctl,
1859#endif
1391 .splice_read = ocfs2_file_splice_read, 1860 .splice_read = ocfs2_file_splice_read,
1392 .splice_write = ocfs2_file_splice_write, 1861 .splice_write = ocfs2_file_splice_write,
1393}; 1862};
@@ -1397,4 +1866,7 @@ const struct file_operations ocfs2_dops = {
1397 .readdir = ocfs2_readdir, 1866 .readdir = ocfs2_readdir,
1398 .fsync = ocfs2_sync_file, 1867 .fsync = ocfs2_sync_file,
1399 .ioctl = ocfs2_ioctl, 1868 .ioctl = ocfs2_ioctl,
1869#ifdef CONFIG_COMPAT
1870 .compat_ioctl = ocfs2_compat_ioctl,
1871#endif
1400}; 1872};
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index cc973f01f6ce..a4dd1fa1822b 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -39,23 +39,23 @@ enum ocfs2_alloc_restarted {
39}; 39};
40int ocfs2_do_extend_allocation(struct ocfs2_super *osb, 40int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
41 struct inode *inode, 41 struct inode *inode,
42 u32 *cluster_start,
42 u32 clusters_to_add, 43 u32 clusters_to_add,
43 struct buffer_head *fe_bh, 44 struct buffer_head *fe_bh,
44 handle_t *handle, 45 handle_t *handle,
45 struct ocfs2_alloc_context *data_ac, 46 struct ocfs2_alloc_context *data_ac,
46 struct ocfs2_alloc_context *meta_ac, 47 struct ocfs2_alloc_context *meta_ac,
47 enum ocfs2_alloc_restarted *reason); 48 enum ocfs2_alloc_restarted *reason);
49int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
50 u32 clusters_to_add,
51 struct ocfs2_alloc_context **data_ac,
52 struct ocfs2_alloc_context **meta_ac);
48int ocfs2_setattr(struct dentry *dentry, struct iattr *attr); 53int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
49int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry, 54int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
50 struct kstat *stat); 55 struct kstat *stat);
51int ocfs2_permission(struct inode *inode, int mask, 56int ocfs2_permission(struct inode *inode, int mask,
52 struct nameidata *nd); 57 struct nameidata *nd);
53 58
54int ocfs2_set_inode_size(handle_t *handle,
55 struct inode *inode,
56 struct buffer_head *fe_bh,
57 u64 new_i_size);
58
59int ocfs2_should_update_atime(struct inode *inode, 59int ocfs2_should_update_atime(struct inode *inode,
60 struct vfsmount *vfsmnt); 60 struct vfsmount *vfsmnt);
61int ocfs2_update_inode_atime(struct inode *inode, 61int ocfs2_update_inode_atime(struct inode *inode,
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 28ab56f2b98c..bc844bfe607c 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -89,22 +89,23 @@ void ocfs2_set_inode_flags(struct inode *inode)
89 inode->i_flags |= S_DIRSYNC; 89 inode->i_flags |= S_DIRSYNC;
90} 90}
91 91
92struct inode *ocfs2_ilookup_for_vote(struct ocfs2_super *osb, 92/* Propagate flags from i_flags to OCFS2_I(inode)->ip_attr */
93 u64 blkno, 93void ocfs2_get_inode_flags(struct ocfs2_inode_info *oi)
94 int delete_vote)
95{ 94{
96 struct ocfs2_find_inode_args args; 95 unsigned int flags = oi->vfs_inode.i_flags;
97 96
98 /* ocfs2_ilookup_for_vote should *only* be called from the 97 oi->ip_attr &= ~(OCFS2_SYNC_FL|OCFS2_APPEND_FL|
99 * vote thread */ 98 OCFS2_IMMUTABLE_FL|OCFS2_NOATIME_FL|OCFS2_DIRSYNC_FL);
100 BUG_ON(current != osb->vote_task); 99 if (flags & S_SYNC)
101 100 oi->ip_attr |= OCFS2_SYNC_FL;
102 args.fi_blkno = blkno; 101 if (flags & S_APPEND)
103 args.fi_flags = OCFS2_FI_FLAG_NOWAIT; 102 oi->ip_attr |= OCFS2_APPEND_FL;
104 if (delete_vote) 103 if (flags & S_IMMUTABLE)
105 args.fi_flags |= OCFS2_FI_FLAG_DELETE; 104 oi->ip_attr |= OCFS2_IMMUTABLE_FL;
106 args.fi_ino = ino_from_blkno(osb->sb, blkno); 105 if (flags & S_NOATIME)
107 return ilookup5(osb->sb, args.fi_ino, ocfs2_find_actor, &args); 106 oi->ip_attr |= OCFS2_NOATIME_FL;
107 if (flags & S_DIRSYNC)
108 oi->ip_attr |= OCFS2_DIRSYNC_FL;
108} 109}
109 110
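ocfs2_get_inode_flags() is the inverse of ocfs2_set_inode_flags(): it folds the VFS i_flags back into the on-disk attribute word. Callers in this patch invoke it immediately before ip_attr is read or serialized, as in the ioctl.c hunk further down:

	ocfs2_get_inode_flags(OCFS2_I(inode));
	*flags = OCFS2_I(inode)->ip_attr;	/* now reflects i_flags */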
110struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, int flags) 111struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, int flags)
@@ -182,28 +183,6 @@ static int ocfs2_find_actor(struct inode *inode, void *opaque)
182 if (oi->ip_blkno != args->fi_blkno) 183 if (oi->ip_blkno != args->fi_blkno)
183 goto bail; 184 goto bail;
184 185
185 /* OCFS2_FI_FLAG_NOWAIT is *only* set from
186 * ocfs2_ilookup_for_vote which won't create an inode for one
187 * that isn't found. The vote thread which doesn't want to get
188 * an inode which is in the process of going away - otherwise
189 * the call to __wait_on_freeing_inode in find_inode_fast will
190 * cause it to deadlock on an inode which may be waiting on a
191 * vote (or lock release) in delete_inode */
192 if ((args->fi_flags & OCFS2_FI_FLAG_NOWAIT) &&
193 (inode->i_state & (I_FREEING|I_CLEAR))) {
194 /* As stated above, we're not going to return an
195 * inode. In the case of a delete vote, the voting
196 * code is going to signal the other node to go
197 * ahead. Mark that state here, so this freeing inode
198 * has the state when it gets to delete_inode. */
199 if (args->fi_flags & OCFS2_FI_FLAG_DELETE) {
200 spin_lock(&oi->ip_lock);
201 ocfs2_mark_inode_remotely_deleted(inode);
202 spin_unlock(&oi->ip_lock);
203 }
204 goto bail;
205 }
206
207 ret = 1; 186 ret = 1;
208bail: 187bail:
209 mlog_exit(ret); 188 mlog_exit(ret);
@@ -236,7 +215,7 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
236 int status = -EINVAL; 215 int status = -EINVAL;
237 216
238 mlog_entry("(0x%p, size:%llu)\n", inode, 217 mlog_entry("(0x%p, size:%llu)\n", inode,
239 (unsigned long long)fe->i_size); 218 (unsigned long long)le64_to_cpu(fe->i_size));
240 219
241 sb = inode->i_sb; 220 sb = inode->i_sb;
242 osb = OCFS2_SB(sb); 221 osb = OCFS2_SB(sb);
@@ -261,6 +240,9 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
261 goto bail; 240 goto bail;
262 } 241 }
263 242
243 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
244 OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr);
245
264 inode->i_version = 1; 246 inode->i_version = 1;
265 inode->i_generation = le32_to_cpu(fe->i_generation); 247 inode->i_generation = le32_to_cpu(fe->i_generation);
266 inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev)); 248 inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev));
@@ -272,8 +254,7 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
272 if (S_ISLNK(inode->i_mode) && !fe->i_clusters) 254 if (S_ISLNK(inode->i_mode) && !fe->i_clusters)
273 inode->i_blocks = 0; 255 inode->i_blocks = 0;
274 else 256 else
275 inode->i_blocks = 257 inode->i_blocks = ocfs2_inode_sector_count(inode);
276 ocfs2_align_bytes_to_sectors(le64_to_cpu(fe->i_size));
277 inode->i_mapping->a_ops = &ocfs2_aops; 258 inode->i_mapping->a_ops = &ocfs2_aops;
278 inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime); 259 inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime);
279 inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec); 260 inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec);
@@ -286,11 +267,7 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
286 mlog(ML_ERROR, 267 mlog(ML_ERROR,
287 "ip_blkno %llu != i_blkno %llu!\n", 268 "ip_blkno %llu != i_blkno %llu!\n",
288 (unsigned long long)OCFS2_I(inode)->ip_blkno, 269 (unsigned long long)OCFS2_I(inode)->ip_blkno,
289 (unsigned long long)fe->i_blkno); 270 (unsigned long long)le64_to_cpu(fe->i_blkno));
290
291 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
292 OCFS2_I(inode)->ip_orphaned_slot = OCFS2_INVALID_SLOT;
293 OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr);
294 271
295 inode->i_nlink = le16_to_cpu(fe->i_links_count); 272 inode->i_nlink = le16_to_cpu(fe->i_links_count);
296 273
@@ -343,10 +320,13 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
343 * the generation argument to 320 * the generation argument to
344 * ocfs2_inode_lock_res_init() will have to change. 321 * ocfs2_inode_lock_res_init() will have to change.
345 */ 322 */
346 BUG_ON(fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)); 323 BUG_ON(le32_to_cpu(fe->i_flags) & OCFS2_SYSTEM_FL);
347 324
348 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_meta_lockres, 325 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_meta_lockres,
349 OCFS2_LOCK_TYPE_META, 0, inode); 326 OCFS2_LOCK_TYPE_META, 0, inode);
327
328 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_open_lockres,
329 OCFS2_LOCK_TYPE_OPEN, 0, inode);
350 } 330 }
351 331
352 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_rw_lockres, 332 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_rw_lockres,
@@ -421,7 +401,7 @@ static int ocfs2_read_locked_inode(struct inode *inode,
421 * cluster lock before trusting anything anyway. 401 * cluster lock before trusting anything anyway.
422 */ 402 */
423 can_lock = !(args->fi_flags & OCFS2_FI_FLAG_SYSFILE) 403 can_lock = !(args->fi_flags & OCFS2_FI_FLAG_SYSFILE)
424 && !(args->fi_flags & OCFS2_FI_FLAG_NOLOCK) 404 && !(args->fi_flags & OCFS2_FI_FLAG_ORPHAN_RECOVERY)
425 && !ocfs2_mount_local(osb); 405 && !ocfs2_mount_local(osb);
426 406
427 /* 407 /*
@@ -438,7 +418,17 @@ static int ocfs2_read_locked_inode(struct inode *inode,
438 OCFS2_LOCK_TYPE_META, 418 OCFS2_LOCK_TYPE_META,
439 generation, inode); 419 generation, inode);
440 420
421 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_open_lockres,
422 OCFS2_LOCK_TYPE_OPEN,
423 0, inode);
424
441 if (can_lock) { 425 if (can_lock) {
426 status = ocfs2_open_lock(inode);
427 if (status) {
428 make_bad_inode(inode);
429 mlog_errno(status);
430 return status;
431 }
442 status = ocfs2_meta_lock(inode, NULL, 0); 432 status = ocfs2_meta_lock(inode, NULL, 0);
443 if (status) { 433 if (status) {
444 make_bad_inode(inode); 434 make_bad_inode(inode);
@@ -447,6 +437,14 @@ static int ocfs2_read_locked_inode(struct inode *inode,
447 } 437 }
448 } 438 }
449 439
440 if (args->fi_flags & OCFS2_FI_FLAG_ORPHAN_RECOVERY) {
441 status = ocfs2_try_open_lock(inode, 0);
442 if (status) {
443 make_bad_inode(inode);
444 return status;
445 }
446 }
447
450 status = ocfs2_read_block(osb, args->fi_blkno, &bh, 0, 448 status = ocfs2_read_block(osb, args->fi_blkno, &bh, 0,
451 can_lock ? inode : NULL); 449 can_lock ? inode : NULL);
452 if (status < 0) { 450 if (status < 0) {
@@ -458,7 +456,8 @@ static int ocfs2_read_locked_inode(struct inode *inode,
458 fe = (struct ocfs2_dinode *) bh->b_data; 456 fe = (struct ocfs2_dinode *) bh->b_data;
459 if (!OCFS2_IS_VALID_DINODE(fe)) { 457 if (!OCFS2_IS_VALID_DINODE(fe)) {
460 mlog(ML_ERROR, "Invalid dinode #%llu: signature = %.*s\n", 458 mlog(ML_ERROR, "Invalid dinode #%llu: signature = %.*s\n",
461 (unsigned long long)fe->i_blkno, 7, fe->i_signature); 459 (unsigned long long)le64_to_cpu(fe->i_blkno), 7,
460 fe->i_signature);
462 goto bail; 461 goto bail;
463 } 462 }
464 463
@@ -507,50 +506,56 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
507 struct buffer_head *fe_bh) 506 struct buffer_head *fe_bh)
508{ 507{
509 int status = 0; 508 int status = 0;
510 handle_t *handle = NULL;
511 struct ocfs2_truncate_context *tc = NULL; 509 struct ocfs2_truncate_context *tc = NULL;
512 struct ocfs2_dinode *fe; 510 struct ocfs2_dinode *fe;
511 handle_t *handle = NULL;
513 512
514 mlog_entry_void(); 513 mlog_entry_void();
515 514
516 fe = (struct ocfs2_dinode *) fe_bh->b_data; 515 fe = (struct ocfs2_dinode *) fe_bh->b_data;
517 516
518 /* zero allocation, zero truncate :) */ 517 if (fe->i_clusters) {
519 if (!fe->i_clusters) 518 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
520 goto bail; 519 if (IS_ERR(handle)) {
520 status = PTR_ERR(handle);
521 mlog_errno(status);
522 goto out;
523 }
521 524
522 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 525 status = ocfs2_journal_access(handle, inode, fe_bh,
523 if (IS_ERR(handle)) { 526 OCFS2_JOURNAL_ACCESS_WRITE);
524 status = PTR_ERR(handle); 527 if (status < 0) {
525 handle = NULL; 528 mlog_errno(status);
526 mlog_errno(status); 529 goto out;
527 goto bail; 530 }
528 }
529 531
530 status = ocfs2_set_inode_size(handle, inode, fe_bh, 0ULL); 532 i_size_write(inode, 0);
531 if (status < 0) {
532 mlog_errno(status);
533 goto bail;
534 }
535 533
536 ocfs2_commit_trans(osb, handle); 534 status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
537 handle = NULL; 535 if (status < 0) {
536 mlog_errno(status);
537 goto out;
538 }
538 539
539 status = ocfs2_prepare_truncate(osb, inode, fe_bh, &tc); 540 ocfs2_commit_trans(osb, handle);
540 if (status < 0) { 541 handle = NULL;
541 mlog_errno(status);
542 goto bail;
543 }
544 542
545 status = ocfs2_commit_truncate(osb, inode, fe_bh, tc); 543 status = ocfs2_prepare_truncate(osb, inode, fe_bh, &tc);
546 if (status < 0) { 544 if (status < 0) {
547 mlog_errno(status); 545 mlog_errno(status);
548 goto bail; 546 goto out;
547 }
548
549 status = ocfs2_commit_truncate(osb, inode, fe_bh, tc);
550 if (status < 0) {
551 mlog_errno(status);
552 goto out;
553 }
549 } 554 }
550bail: 555
556out:
551 if (handle) 557 if (handle)
552 ocfs2_commit_trans(osb, handle); 558 ocfs2_commit_trans(osb, handle);
553
554 mlog_exit(status); 559 mlog_exit(status);
555 return status; 560 return status;
556} 561}
@@ -678,10 +683,10 @@ static int ocfs2_wipe_inode(struct inode *inode,
678 struct inode *orphan_dir_inode = NULL; 683 struct inode *orphan_dir_inode = NULL;
679 struct buffer_head *orphan_dir_bh = NULL; 684 struct buffer_head *orphan_dir_bh = NULL;
680 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 685 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
686 struct ocfs2_dinode *di;
681 687
682 /* We've already voted on this so it should be readonly - no 688 di = (struct ocfs2_dinode *) di_bh->b_data;
683 * spinlock needed. */ 689 orphaned_slot = le16_to_cpu(di->i_orphaned_slot);
684 orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot;
685 690
686 status = ocfs2_check_orphan_recovery_state(osb, orphaned_slot); 691 status = ocfs2_check_orphan_recovery_state(osb, orphaned_slot);
687 if (status) 692 if (status)
@@ -827,8 +832,8 @@ static int ocfs2_query_inode_wipe(struct inode *inode,
827 "Inode %llu (on-disk %llu) not orphaned! " 832 "Inode %llu (on-disk %llu) not orphaned! "
828 "Disk flags 0x%x, inode flags 0x%x\n", 833 "Disk flags 0x%x, inode flags 0x%x\n",
829 (unsigned long long)oi->ip_blkno, 834 (unsigned long long)oi->ip_blkno,
830 (unsigned long long)di->i_blkno, di->i_flags, 835 (unsigned long long)le64_to_cpu(di->i_blkno),
831 oi->ip_flags); 836 le32_to_cpu(di->i_flags), oi->ip_flags);
832 goto bail; 837 goto bail;
833 } 838 }
834 839
@@ -839,11 +844,20 @@ static int ocfs2_query_inode_wipe(struct inode *inode,
839 goto bail; 844 goto bail;
840 } 845 }
841 846
842 status = ocfs2_request_delete_vote(inode); 847 /*
843 /* -EBUSY means that other nodes are still using the 848 * This is how ocfs2 determines whether an inode is still live
844 * inode. We're done here though, so avoid doing anything on 849 * within the cluster. Every node takes a shared read lock on
845 * disk and let them worry about deleting it. */ 850 * the inode open lock in ocfs2_read_locked_inode(). When we
846 if (status == -EBUSY) { 851 * get to ->delete_inode(), each node tries to convert its
852 * lock to an exclusive. Trylocks are serialized by the inode
853 * meta data lock. If the upconvert succeeds, we know the inode
854 * is no longer live and can be deleted.
855 *
856 * Though we call this with the meta data lock held, the
857 * trylock keeps us from ABBA deadlock.
858 */
859 status = ocfs2_try_open_lock(inode, 1);
860 if (status == -EAGAIN) {
847 status = 0; 861 status = 0;
848 mlog(0, "Skipping delete of %llu because it is in use on" 862 mlog(0, "Skipping delete of %llu because it is in use on"
849 "other nodes\n", (unsigned long long)oi->ip_blkno); 863 "other nodes\n", (unsigned long long)oi->ip_blkno);
@@ -854,21 +868,10 @@ static int ocfs2_query_inode_wipe(struct inode *inode,
854 goto bail; 868 goto bail;
855 } 869 }
856 870
857 spin_lock(&oi->ip_lock); 871 *wipe = 1;
858 if (oi->ip_orphaned_slot == OCFS2_INVALID_SLOT) { 872 mlog(0, "Inode %llu is ok to wipe from orphan dir %u\n",
859 /* Nobody knew which slot this inode was orphaned 873 (unsigned long long)oi->ip_blkno,
860 * into. This may happen during node death and 874 le16_to_cpu(di->i_orphaned_slot));
861 * recovery knows how to clean it up so we can safely
862 * ignore this inode for now on. */
863 mlog(0, "Nobody knew where inode %llu was orphaned!\n",
864 (unsigned long long)oi->ip_blkno);
865 } else {
866 *wipe = 1;
867
868 mlog(0, "Inode %llu is ok to wipe from orphan dir %d\n",
869 (unsigned long long)oi->ip_blkno, oi->ip_orphaned_slot);
870 }
871 spin_unlock(&oi->ip_lock);
872 875
873bail: 876bail:
874 return status; 877 return status;
@@ -1001,11 +1004,16 @@ void ocfs2_clear_inode(struct inode *inode)
1001 mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL, 1004 mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL,
1002 "Inode=%lu\n", inode->i_ino); 1005 "Inode=%lu\n", inode->i_ino);
1003 1006
1007 /* With the delete_inode vote removed, we hold the open lock from
1008 * iget; now it is time to drop the PR and EX open locks. */
1009 ocfs2_open_unlock(inode);
1010
1004 /* Do these before all the other work so that we don't bounce 1011 /* Do these before all the other work so that we don't bounce
1005 * the vote thread while waiting to destroy the locks. */ 1012 * the vote thread while waiting to destroy the locks. */
1006 ocfs2_mark_lockres_freeing(&oi->ip_rw_lockres); 1013 ocfs2_mark_lockres_freeing(&oi->ip_rw_lockres);
1007 ocfs2_mark_lockres_freeing(&oi->ip_meta_lockres); 1014 ocfs2_mark_lockres_freeing(&oi->ip_meta_lockres);
1008 ocfs2_mark_lockres_freeing(&oi->ip_data_lockres); 1015 ocfs2_mark_lockres_freeing(&oi->ip_data_lockres);
1016 ocfs2_mark_lockres_freeing(&oi->ip_open_lockres);
1009 1017
1010 /* We very well may get a clear_inode before all of an inode's 1018
1011 * metadata has hit disk. Of course, we can't drop any cluster 1019 * metadata has hit disk. Of course, we can't drop any cluster
@@ -1020,8 +1028,7 @@ void ocfs2_clear_inode(struct inode *inode)
1020 "Clear inode of %llu, inode has io markers\n", 1028 "Clear inode of %llu, inode has io markers\n",
1021 (unsigned long long)oi->ip_blkno); 1029 (unsigned long long)oi->ip_blkno);
1022 1030
1023 ocfs2_extent_map_drop(inode, 0); 1031 ocfs2_extent_map_trunc(inode, 0);
1024 ocfs2_extent_map_init(inode);
1025 1032
1026 status = ocfs2_drop_inode_locks(inode); 1033 status = ocfs2_drop_inode_locks(inode);
1027 if (status < 0) 1034 if (status < 0)
@@ -1030,6 +1037,7 @@ void ocfs2_clear_inode(struct inode *inode)
1030 ocfs2_lock_res_free(&oi->ip_rw_lockres); 1037 ocfs2_lock_res_free(&oi->ip_rw_lockres);
1031 ocfs2_lock_res_free(&oi->ip_meta_lockres); 1038 ocfs2_lock_res_free(&oi->ip_meta_lockres);
1032 ocfs2_lock_res_free(&oi->ip_data_lockres); 1039 ocfs2_lock_res_free(&oi->ip_data_lockres);
1040 ocfs2_lock_res_free(&oi->ip_open_lockres);
1033 1041
1034 ocfs2_metadata_cache_purge(inode); 1042 ocfs2_metadata_cache_purge(inode);
1035 1043
@@ -1086,9 +1094,6 @@ void ocfs2_drop_inode(struct inode *inode)
1086 mlog(0, "Drop inode %llu, nlink = %u, ip_flags = 0x%x\n", 1094 mlog(0, "Drop inode %llu, nlink = %u, ip_flags = 0x%x\n",
1087 (unsigned long long)oi->ip_blkno, inode->i_nlink, oi->ip_flags); 1095 (unsigned long long)oi->ip_blkno, inode->i_nlink, oi->ip_flags);
1088 1096
1089 /* Testing ip_orphaned_slot here wouldn't work because we may
1090 * not have gotten a delete_inode vote from any other nodes
1091 * yet. */
1092 if (oi->ip_flags & OCFS2_INODE_MAYBE_ORPHANED) 1097 if (oi->ip_flags & OCFS2_INODE_MAYBE_ORPHANED)
1093 generic_delete_inode(inode); 1098 generic_delete_inode(inode);
1094 else 1099 else
@@ -1121,8 +1126,10 @@ struct buffer_head *ocfs2_bread(struct inode *inode,
1121 return NULL; 1126 return NULL;
1122 } 1127 }
1123 1128
1124 tmperr = ocfs2_extent_map_get_blocks(inode, block, 1, 1129 down_read(&OCFS2_I(inode)->ip_alloc_sem);
1125 &p_blkno, NULL); 1130 tmperr = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL,
1131 NULL);
1132 up_read(&OCFS2_I(inode)->ip_alloc_sem);
1126 if (tmperr < 0) { 1133 if (tmperr < 0) {
1127 mlog_errno(tmperr); 1134 mlog_errno(tmperr);
1128 goto fail; 1135 goto fail;
@@ -1212,6 +1219,7 @@ int ocfs2_mark_inode_dirty(handle_t *handle,
1212 1219
1213 spin_lock(&OCFS2_I(inode)->ip_lock); 1220 spin_lock(&OCFS2_I(inode)->ip_lock);
1214 fe->i_clusters = cpu_to_le32(OCFS2_I(inode)->ip_clusters); 1221 fe->i_clusters = cpu_to_le32(OCFS2_I(inode)->ip_clusters);
1222 ocfs2_get_inode_flags(OCFS2_I(inode));
1215 fe->i_attr = cpu_to_le32(OCFS2_I(inode)->ip_attr); 1223 fe->i_attr = cpu_to_le32(OCFS2_I(inode)->ip_attr);
1216 spin_unlock(&OCFS2_I(inode)->ip_lock); 1224 spin_unlock(&OCFS2_I(inode)->ip_lock);
1217 1225
@@ -1259,7 +1267,7 @@ void ocfs2_refresh_inode(struct inode *inode,
1259 if (S_ISLNK(inode->i_mode) && le32_to_cpu(fe->i_clusters) == 0) 1267 if (S_ISLNK(inode->i_mode) && le32_to_cpu(fe->i_clusters) == 0)
1260 inode->i_blocks = 0; 1268 inode->i_blocks = 0;
1261 else 1269 else
1262 inode->i_blocks = ocfs2_align_bytes_to_sectors(i_size_read(inode)); 1270 inode->i_blocks = ocfs2_inode_sector_count(inode);
1263 inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime); 1271 inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime);
1264 inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec); 1272 inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec);
1265 inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime); 1273 inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime);
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 1a7dd2945b34..a41d0817121b 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -26,6 +26,8 @@
26#ifndef OCFS2_INODE_H 26#ifndef OCFS2_INODE_H
27#define OCFS2_INODE_H 27#define OCFS2_INODE_H
28 28
29#include "extent_map.h"
30
29/* OCFS2 Inode Private Data */ 31/* OCFS2 Inode Private Data */
30struct ocfs2_inode_info 32struct ocfs2_inode_info
31{ 33{
@@ -34,6 +36,7 @@ struct ocfs2_inode_info
34 struct ocfs2_lock_res ip_rw_lockres; 36 struct ocfs2_lock_res ip_rw_lockres;
35 struct ocfs2_lock_res ip_meta_lockres; 37 struct ocfs2_lock_res ip_meta_lockres;
36 struct ocfs2_lock_res ip_data_lockres; 38 struct ocfs2_lock_res ip_data_lockres;
39 struct ocfs2_lock_res ip_open_lockres;
37 40
38 /* protects allocation changes on this inode. */ 41 /* protects allocation changes on this inode. */
39 struct rw_semaphore ip_alloc_sem; 42 struct rw_semaphore ip_alloc_sem;
@@ -42,9 +45,7 @@ struct ocfs2_inode_info
42 spinlock_t ip_lock; 45 spinlock_t ip_lock;
43 u32 ip_open_count; 46 u32 ip_open_count;
44 u32 ip_clusters; 47 u32 ip_clusters;
45 struct ocfs2_extent_map ip_map;
46 struct list_head ip_io_markers; 48 struct list_head ip_io_markers;
47 int ip_orphaned_slot;
48 49
49 struct mutex ip_io_mutex; 50 struct mutex ip_io_mutex;
50 51
@@ -64,6 +65,8 @@ struct ocfs2_inode_info
64 65
65 struct ocfs2_caching_info ip_metadata_cache; 66 struct ocfs2_caching_info ip_metadata_cache;
66 67
68 struct ocfs2_extent_map ip_extent_map;
69
67 struct inode vfs_inode; 70 struct inode vfs_inode;
68}; 71};
69 72
@@ -117,14 +120,9 @@ void ocfs2_delete_inode(struct inode *inode);
117void ocfs2_drop_inode(struct inode *inode); 120void ocfs2_drop_inode(struct inode *inode);
118 121
119/* Flags for ocfs2_iget() */ 122/* Flags for ocfs2_iget() */
120#define OCFS2_FI_FLAG_NOWAIT 0x1 123#define OCFS2_FI_FLAG_SYSFILE 0x4
121#define OCFS2_FI_FLAG_DELETE 0x2 124#define OCFS2_FI_FLAG_ORPHAN_RECOVERY 0x8
122#define OCFS2_FI_FLAG_SYSFILE 0x4
123#define OCFS2_FI_FLAG_NOLOCK 0x8
124struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, int flags); 125struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, int flags);
125struct inode *ocfs2_ilookup_for_vote(struct ocfs2_super *osb,
126 u64 blkno,
127 int delete_vote);
128int ocfs2_inode_init_private(struct inode *inode); 126int ocfs2_inode_init_private(struct inode *inode);
129int ocfs2_inode_revalidate(struct dentry *dentry); 127int ocfs2_inode_revalidate(struct dentry *dentry);
130int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, 128int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
@@ -143,5 +141,13 @@ int ocfs2_aio_read(struct file *file, struct kiocb *req, struct iocb *iocb);
143int ocfs2_aio_write(struct file *file, struct kiocb *req, struct iocb *iocb); 141int ocfs2_aio_write(struct file *file, struct kiocb *req, struct iocb *iocb);
144 142
145void ocfs2_set_inode_flags(struct inode *inode); 143void ocfs2_set_inode_flags(struct inode *inode);
144void ocfs2_get_inode_flags(struct ocfs2_inode_info *oi);
145
146static inline blkcnt_t ocfs2_inode_sector_count(struct inode *inode)
147{
148 int c_to_s_bits = OCFS2_SB(inode->i_sb)->s_clustersize_bits - 9;
149
150 return (blkcnt_t)(OCFS2_I(inode)->ip_clusters << c_to_s_bits);
151}
146 152
147#endif /* OCFS2_INODE_H */ 153#endif /* OCFS2_INODE_H */
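For concreteness, a worked instance of the new sector-count helper under an assumed 4 KB cluster size: s_clustersize_bits = 12, so c_to_s_bits = 12 - 9 = 3, and an inode with ip_clusters = 10 reports 10 << 3 = 80 512-byte sectors (40 KB):

	/* assumed: 4 KB clusters  =>  s_clustersize_bits == 12      */
	/* c_to_s_bits == 12 - 9 == 3                                */
	/* ip_clusters == 10  =>  i_blocks == 10 << 3 == 80 sectors  */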
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 4768be5f3086..f3ad21ad9aed 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -31,6 +31,7 @@ static int ocfs2_get_inode_attr(struct inode *inode, unsigned *flags)
31 mlog_errno(status); 31 mlog_errno(status);
32 return status; 32 return status;
33 } 33 }
34 ocfs2_get_inode_flags(OCFS2_I(inode));
34 *flags = OCFS2_I(inode)->ip_attr; 35 *flags = OCFS2_I(inode)->ip_attr;
35 ocfs2_meta_unlock(inode, 0); 36 ocfs2_meta_unlock(inode, 0);
36 37
@@ -134,3 +135,26 @@ int ocfs2_ioctl(struct inode * inode, struct file * filp,
134 } 135 }
135} 136}
136 137
138#ifdef CONFIG_COMPAT
139long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
140{
141 struct inode *inode = file->f_path.dentry->d_inode;
142 int ret;
143
144 switch (cmd) {
145 case OCFS2_IOC32_GETFLAGS:
146 cmd = OCFS2_IOC_GETFLAGS;
147 break;
148 case OCFS2_IOC32_SETFLAGS:
149 cmd = OCFS2_IOC_SETFLAGS;
150 break;
151 default:
152 return -ENOIOCTLCMD;
153 }
154
155 lock_kernel();
156 ret = ocfs2_ioctl(inode, file, cmd, arg);
157 unlock_kernel();
158 return ret;
159}
160#endif
diff --git a/fs/ocfs2/ioctl.h b/fs/ocfs2/ioctl.h
index 4a7c82931dba..4d6c4f430d0d 100644
--- a/fs/ocfs2/ioctl.h
+++ b/fs/ocfs2/ioctl.h
@@ -12,5 +12,6 @@
12 12
13int ocfs2_ioctl(struct inode * inode, struct file * filp, 13int ocfs2_ioctl(struct inode * inode, struct file * filp,
14 unsigned int cmd, unsigned long arg); 14 unsigned int cmd, unsigned long arg);
15long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg);
15 16
16#endif /* OCFS2_IOCTL_H */ 17#endif /* OCFS2_IOCTL_H */
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 825cb0ae1b4c..dc1188081720 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -435,7 +435,8 @@ static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
435 * handle the errors in a specific manner, so no need 435 * handle the errors in a specific manner, so no need
436 * to call ocfs2_error() here. */ 436 * to call ocfs2_error() here. */
437 mlog(ML_ERROR, "Journal dinode %llu has invalid " 437 mlog(ML_ERROR, "Journal dinode %llu has invalid "
438 "signature: %.*s", (unsigned long long)fe->i_blkno, 7, 438 "signature: %.*s",
439 (unsigned long long)le64_to_cpu(fe->i_blkno), 7,
439 fe->i_signature); 440 fe->i_signature);
440 status = -EIO; 441 status = -EIO;
441 goto out; 442 goto out;
@@ -649,29 +650,20 @@ bail:
649static int ocfs2_force_read_journal(struct inode *inode) 650static int ocfs2_force_read_journal(struct inode *inode)
650{ 651{
651 int status = 0; 652 int status = 0;
652 int i, p_blocks; 653 int i;
653 u64 v_blkno, p_blkno; 654 u64 v_blkno, p_blkno, p_blocks, num_blocks;
654#define CONCURRENT_JOURNAL_FILL 32 655#define CONCURRENT_JOURNAL_FILL 32ULL
655 struct buffer_head *bhs[CONCURRENT_JOURNAL_FILL]; 656 struct buffer_head *bhs[CONCURRENT_JOURNAL_FILL];
656 657
657 mlog_entry_void(); 658 mlog_entry_void();
658 659
659 BUG_ON(inode->i_blocks !=
660 ocfs2_align_bytes_to_sectors(i_size_read(inode)));
661
662 memset(bhs, 0, sizeof(struct buffer_head *) * CONCURRENT_JOURNAL_FILL); 660 memset(bhs, 0, sizeof(struct buffer_head *) * CONCURRENT_JOURNAL_FILL);
663 661
664 mlog(0, "Force reading %llu blocks\n", 662 num_blocks = ocfs2_blocks_for_bytes(inode->i_sb, inode->i_size);
665 (unsigned long long)(inode->i_blocks >>
666 (inode->i_sb->s_blocksize_bits - 9)));
667
668 v_blkno = 0; 663 v_blkno = 0;
669 while (v_blkno < 664 while (v_blkno < num_blocks) {
670 (inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9))) {
671
672 status = ocfs2_extent_map_get_blocks(inode, v_blkno, 665 status = ocfs2_extent_map_get_blocks(inode, v_blkno,
673 1, &p_blkno, 666 &p_blkno, &p_blocks, NULL);
674 &p_blocks);
675 if (status < 0) { 667 if (status < 0) {
676 mlog_errno(status); 668 mlog_errno(status);
677 goto bail; 669 goto bail;
@@ -751,7 +743,7 @@ void ocfs2_complete_recovery(struct work_struct *work)
751 la_dinode = item->lri_la_dinode; 743 la_dinode = item->lri_la_dinode;
752 if (la_dinode) { 744 if (la_dinode) {
753 mlog(0, "Clean up local alloc %llu\n", 745 mlog(0, "Clean up local alloc %llu\n",
754 (unsigned long long)la_dinode->i_blkno); 746 (unsigned long long)le64_to_cpu(la_dinode->i_blkno));
755 747
756 ret = ocfs2_complete_local_alloc_recovery(osb, 748 ret = ocfs2_complete_local_alloc_recovery(osb,
757 la_dinode); 749 la_dinode);
@@ -764,7 +756,7 @@ void ocfs2_complete_recovery(struct work_struct *work)
764 tl_dinode = item->lri_tl_dinode; 756 tl_dinode = item->lri_tl_dinode;
765 if (tl_dinode) { 757 if (tl_dinode) {
766 mlog(0, "Clean up truncate log %llu\n", 758 mlog(0, "Clean up truncate log %llu\n",
767 (unsigned long long)tl_dinode->i_blkno); 759 (unsigned long long)le64_to_cpu(tl_dinode->i_blkno));
768 760
769 ret = ocfs2_complete_truncate_log_recovery(osb, 761 ret = ocfs2_complete_truncate_log_recovery(osb,
770 tl_dinode); 762 tl_dinode);
@@ -1306,7 +1298,7 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb,
1306 continue; 1298 continue;
1307 1299
1308 iter = ocfs2_iget(osb, le64_to_cpu(de->inode), 1300 iter = ocfs2_iget(osb, le64_to_cpu(de->inode),
1309 OCFS2_FI_FLAG_NOLOCK); 1301 OCFS2_FI_FLAG_ORPHAN_RECOVERY);
1310 if (IS_ERR(iter)) 1302 if (IS_ERR(iter))
1311 continue; 1303 continue;
1312 1304
@@ -1418,7 +1410,6 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
1418 /* Set the proper information to get us going into 1410 /* Set the proper information to get us going into
1419 * ocfs2_delete_inode. */ 1411 * ocfs2_delete_inode. */
1420 oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED; 1412 oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED;
1421 oi->ip_orphaned_slot = slot;
1422 spin_unlock(&oi->ip_lock); 1413 spin_unlock(&oi->ip_lock);
1423 1414
1424 iput(inode); 1415 iput(inode);
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index d026b4f27757..3db5de4506da 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -390,7 +390,7 @@ static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb,
390 /* We may be deleting metadata blocks, so metadata alloc dinode + 390 /* We may be deleting metadata blocks, so metadata alloc dinode +
391 one desc. block for each possible delete. */ 391 one desc. block for each possible delete. */
392 if (tree_depth && next_free == 1 && 392 if (tree_depth && next_free == 1 &&
393 le32_to_cpu(last_el->l_recs[i].e_clusters) == clusters_to_del) 393 ocfs2_rec_clusters(last_el, &last_el->l_recs[i]) == clusters_to_del)
394 credits += 1 + tree_depth; 394 credits += 1 + tree_depth;
395 395
396 /* update to the truncate log. */ 396 /* update to the truncate log. */
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 51b020447683..af01158b39f5 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -85,8 +85,11 @@ int ocfs2_mmap(struct file *file, struct vm_area_struct *vma)
85 int ret = 0, lock_level = 0; 85 int ret = 0, lock_level = 0;
86 struct ocfs2_super *osb = OCFS2_SB(file->f_dentry->d_inode->i_sb); 86 struct ocfs2_super *osb = OCFS2_SB(file->f_dentry->d_inode->i_sb);
87 87
88 /* We don't want to support shared writable mappings yet. */ 88 /*
89 if (!ocfs2_mount_local(osb) && 89 * Only support shared writeable mmap for local mounts which
90 * don't know about holes.
91 */
92 if ((!ocfs2_mount_local(osb) || ocfs2_sparse_alloc(osb)) &&
90 ((vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_MAYSHARE)) && 93 ((vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_MAYSHARE)) &&
91 ((vma->vm_flags & VM_WRITE) || (vma->vm_flags & VM_MAYWRITE))) { 94 ((vma->vm_flags & VM_WRITE) || (vma->vm_flags & VM_MAYWRITE))) {
92 mlog(0, "disallow shared writable mmaps %lx\n", vma->vm_flags); 95 mlog(0, "disallow shared writable mmaps %lx\n", vma->vm_flags);
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 28dd757ff67d..36289e6295ce 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -175,8 +175,6 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
175 175
176 inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0); 176 inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0);
177 if (IS_ERR(inode)) { 177 if (IS_ERR(inode)) {
178 mlog(ML_ERROR, "Unable to create inode %llu\n",
179 (unsigned long long)blkno);
180 ret = ERR_PTR(-EACCES); 178 ret = ERR_PTR(-EACCES);
181 goto bail_unlock; 179 goto bail_unlock;
182 } 180 }
@@ -189,7 +187,6 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
189 * unlink. */ 187 * unlink. */
190 spin_lock(&oi->ip_lock); 188 spin_lock(&oi->ip_lock);
191 oi->ip_flags &= ~OCFS2_INODE_MAYBE_ORPHANED; 189 oi->ip_flags &= ~OCFS2_INODE_MAYBE_ORPHANED;
192 oi->ip_orphaned_slot = OCFS2_INVALID_SLOT;
193 spin_unlock(&oi->ip_lock); 190 spin_unlock(&oi->ip_lock);
194 191
195bail_add: 192bail_add:
@@ -288,7 +285,7 @@ static int ocfs2_fill_new_dir(struct ocfs2_super *osb,
288 285
289 i_size_write(inode, inode->i_sb->s_blocksize); 286 i_size_write(inode, inode->i_sb->s_blocksize);
290 inode->i_nlink = 2; 287 inode->i_nlink = 2;
291 inode->i_blocks = ocfs2_align_bytes_to_sectors(inode->i_sb->s_blocksize); 288 inode->i_blocks = ocfs2_inode_sector_count(inode);
292 status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); 289 status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
293 if (status < 0) { 290 if (status < 0) {
294 mlog_errno(status); 291 mlog_errno(status);
@@ -581,8 +578,9 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
581 if (ocfs2_populate_inode(inode, fe, 1) < 0) { 578 if (ocfs2_populate_inode(inode, fe, 1) < 0) {
582 mlog(ML_ERROR, "populate inode failed! bh->b_blocknr=%llu, " 579 mlog(ML_ERROR, "populate inode failed! bh->b_blocknr=%llu, "
583 "i_blkno=%llu, i_ino=%lu\n", 580 "i_blkno=%llu, i_ino=%lu\n",
584 (unsigned long long) (*new_fe_bh)->b_blocknr, 581 (unsigned long long)(*new_fe_bh)->b_blocknr,
585 (unsigned long long)fe->i_blkno, inode->i_ino); 582 (unsigned long long)le64_to_cpu(fe->i_blkno),
583 inode->i_ino);
586 BUG(); 584 BUG();
587 } 585 }
588 586
@@ -1486,8 +1484,7 @@ static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
1486 struct buffer_head **bhs = NULL; 1484 struct buffer_head **bhs = NULL;
1487 const char *c; 1485 const char *c;
1488 struct super_block *sb = osb->sb; 1486 struct super_block *sb = osb->sb;
1489 u64 p_blkno; 1487 u64 p_blkno, p_blocks;
1490 int p_blocks;
1491 int virtual, blocks, status, i, bytes_left; 1488 int virtual, blocks, status, i, bytes_left;
1492 1489
1493 bytes_left = i_size_read(inode) + 1; 1490 bytes_left = i_size_read(inode) + 1;
@@ -1514,8 +1511,8 @@ static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
1514 goto bail; 1511 goto bail;
1515 } 1512 }
1516 1513
1517 status = ocfs2_extent_map_get_blocks(inode, 0, 1, &p_blkno, 1514 status = ocfs2_extent_map_get_blocks(inode, 0, &p_blkno, &p_blocks,
1518 &p_blocks); 1515 NULL);
1519 if (status < 0) { 1516 if (status < 0) {
1520 mlog_errno(status); 1517 mlog_errno(status);
1521 goto bail; 1518 goto bail;
@@ -1674,8 +1671,11 @@ static int ocfs2_symlink(struct inode *dir,
1674 inode->i_rdev = 0; 1671 inode->i_rdev = 0;
1675 newsize = l - 1; 1672 newsize = l - 1;
1676 if (l > ocfs2_fast_symlink_chars(sb)) { 1673 if (l > ocfs2_fast_symlink_chars(sb)) {
1674 u32 offset = 0;
1675
1677 inode->i_op = &ocfs2_symlink_inode_operations; 1676 inode->i_op = &ocfs2_symlink_inode_operations;
1678 status = ocfs2_do_extend_allocation(osb, inode, 1, new_fe_bh, 1677 status = ocfs2_do_extend_allocation(osb, inode, &offset, 1,
1678 new_fe_bh,
1679 handle, data_ac, NULL, 1679 handle, data_ac, NULL,
1680 NULL); 1680 NULL);
1681 if (status < 0) { 1681 if (status < 0) {
@@ -1689,7 +1689,7 @@ static int ocfs2_symlink(struct inode *dir,
1689 goto bail; 1689 goto bail;
1690 } 1690 }
1691 i_size_write(inode, newsize); 1691 i_size_write(inode, newsize);
1692 inode->i_blocks = ocfs2_align_bytes_to_sectors(newsize); 1692 inode->i_blocks = ocfs2_inode_sector_count(inode);
1693 } else { 1693 } else {
1694 inode->i_op = &ocfs2_fast_symlink_inode_operations; 1694 inode->i_op = &ocfs2_fast_symlink_inode_operations;
1695 memcpy((char *) fe->id2.i_symlink, symname, l); 1695 memcpy((char *) fe->id2.i_symlink, symname, l);
@@ -2222,9 +2222,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
2222 /* Record which orphan dir our inode now resides 2222 /* Record which orphan dir our inode now resides
2223 * in. delete_inode will use this to determine which orphan 2223 * in. delete_inode will use this to determine which orphan
2224 * dir to lock. */ 2224 * dir to lock. */
2225 spin_lock(&OCFS2_I(inode)->ip_lock); 2225 fe->i_orphaned_slot = cpu_to_le16(osb->slot_num);
2226 OCFS2_I(inode)->ip_orphaned_slot = osb->slot_num;
2227 spin_unlock(&OCFS2_I(inode)->ip_lock);
2228 2226
2229 mlog(0, "Inode %llu orphaned in slot %d\n", 2227 mlog(0, "Inode %llu orphaned in slot %d\n",
2230 (unsigned long long)OCFS2_I(inode)->ip_blkno, osb->slot_num); 2228 (unsigned long long)OCFS2_I(inode)->ip_blkno, osb->slot_num);
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index db8e77cd35d3..a860633e833f 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -46,11 +46,6 @@
46#include "endian.h" 46#include "endian.h"
47#include "ocfs2_lockid.h" 47#include "ocfs2_lockid.h"
48 48
49struct ocfs2_extent_map {
50 u32 em_clusters;
51 struct rb_root em_extents;
52};
53
54/* Most user visible OCFS2 inodes will have very few pieces of 49/* Most user visible OCFS2 inodes will have very few pieces of
55 * metadata, but larger files (including bitmaps, etc) must be taken 50 * metadata, but larger files (including bitmaps, etc) must be taken
56 * into account when designing an access scheme. We allow a small 51 * into account when designing an access scheme. We allow a small
@@ -303,6 +298,13 @@ static inline int ocfs2_should_order_data(struct inode *inode)
303 return 1; 298 return 1;
304} 299}
305 300
301static inline int ocfs2_sparse_alloc(struct ocfs2_super *osb)
302{
303 if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC)
304 return 1;
305 return 0;
306}
307
306/* set / clear functions because cluster events can make these happen 308/* set / clear functions because cluster events can make these happen
307 * in parallel so we want the transitions to be atomic. this also 309 * in parallel so we want the transitions to be atomic. this also
308 * means that any future flags osb_flags must be protected by spinlock 310 * means that any future flags osb_flags must be protected by spinlock
@@ -361,9 +363,9 @@ static inline int ocfs2_mount_local(struct ocfs2_super *osb)
361 typeof(__di) ____di = (__di); \ 363 typeof(__di) ____di = (__di); \
362 ocfs2_error((__sb), \ 364 ocfs2_error((__sb), \
363 "Dinode # %llu has bad signature %.*s", \ 365 "Dinode # %llu has bad signature %.*s", \
364 (unsigned long long)(____di)->i_blkno, 7, \ 366 (unsigned long long)le64_to_cpu((____di)->i_blkno), 7, \
365 (____di)->i_signature); \ 367 (____di)->i_signature); \
366} while (0); 368} while (0)
367 369
368#define OCFS2_IS_VALID_EXTENT_BLOCK(ptr) \ 370#define OCFS2_IS_VALID_EXTENT_BLOCK(ptr) \
369 (!strcmp((ptr)->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE)) 371 (!strcmp((ptr)->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE))
@@ -372,9 +374,9 @@ static inline int ocfs2_mount_local(struct ocfs2_super *osb)
372 typeof(__eb) ____eb = (__eb); \ 374 typeof(__eb) ____eb = (__eb); \
373 ocfs2_error((__sb), \ 375 ocfs2_error((__sb), \
374 "Extent Block # %llu has bad signature %.*s", \ 376 "Extent Block # %llu has bad signature %.*s", \
375 (unsigned long long)(____eb)->h_blkno, 7, \ 377 (unsigned long long)le64_to_cpu((____eb)->h_blkno), 7, \
376 (____eb)->h_signature); \ 378 (____eb)->h_signature); \
377} while (0); 379} while (0)
378 380
379#define OCFS2_IS_VALID_GROUP_DESC(ptr) \ 381#define OCFS2_IS_VALID_GROUP_DESC(ptr) \
380 (!strcmp((ptr)->bg_signature, OCFS2_GROUP_DESC_SIGNATURE)) 382 (!strcmp((ptr)->bg_signature, OCFS2_GROUP_DESC_SIGNATURE))
@@ -383,9 +385,9 @@ static inline int ocfs2_mount_local(struct ocfs2_super *osb)
383 typeof(__gd) ____gd = (__gd); \ 385 typeof(__gd) ____gd = (__gd); \
384 ocfs2_error((__sb), \ 386 ocfs2_error((__sb), \
385 "Group Descriptor # %llu has bad signature %.*s", \ 387 "Group Descriptor # %llu has bad signature %.*s", \
386 (unsigned long long)(____gd)->bg_blkno, 7, \ 388 (unsigned long long)le64_to_cpu((____gd)->bg_blkno), 7, \
387 (____gd)->bg_signature); \ 389 (____gd)->bg_signature); \
388} while (0); 390} while (0)
389 391
390static inline unsigned long ino_from_blkno(struct super_block *sb, 392static inline unsigned long ino_from_blkno(struct super_block *sb,
391 u64 blkno) 393 u64 blkno)
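
The three signature-validation macros above drop the stray semicolon that used to follow } while (0). With it, each macro expanded to two statements, so using the macro as the body of an if followed by an else would not compile. A small demonstration of the idiom:

#include <stdio.h>

/* Broken form: the trailing semicolon makes the expansion two
 * statements, so "if (x) BAD_LOG(...); else ..." fails to parse. */
#define BAD_LOG(msg)  do { puts(msg); } while (0);

/* Fixed form, matching the ocfs2.h change: the caller supplies the
 * semicolon and the macro behaves as a single statement. */
#define GOOD_LOG(msg) do { puts(msg); } while (0)

int main(void)
{
        int bad_sig = 1;

        if (bad_sig)
                GOOD_LOG("bad signature");      /* works with or without else */
        else
                GOOD_LOG("signature ok");

        BAD_LOG("standalone use still works")   /* note: no semicolon here */
        return 0;
}
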
@@ -461,6 +463,49 @@ static inline unsigned long ocfs2_align_bytes_to_sectors(u64 bytes)
461 return (unsigned long)((bytes + 511) >> 9); 463 return (unsigned long)((bytes + 511) >> 9);
462} 464}
463 465
466static inline unsigned int ocfs2_page_index_to_clusters(struct super_block *sb,
467 unsigned long pg_index)
468{
469 u32 clusters = pg_index;
470 unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits;
471
472 if (unlikely(PAGE_CACHE_SHIFT > cbits))
473 clusters = pg_index << (PAGE_CACHE_SHIFT - cbits);
474 else if (PAGE_CACHE_SHIFT < cbits)
475 clusters = pg_index >> (cbits - PAGE_CACHE_SHIFT);
476
477 return clusters;
478}
479
480/*
481 * Find the 1st page index which covers the given clusters.
482 */
483static inline unsigned long ocfs2_align_clusters_to_page_index(struct super_block *sb,
484 u32 clusters)
485{
486 unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits;
487 unsigned long index = clusters;
488
489 if (PAGE_CACHE_SHIFT > cbits) {
490 index = clusters >> (PAGE_CACHE_SHIFT - cbits);
491 } else if (PAGE_CACHE_SHIFT < cbits) {
492 index = clusters << (cbits - PAGE_CACHE_SHIFT);
493 }
494
495 return index;
496}
497
498static inline unsigned int ocfs2_pages_per_cluster(struct super_block *sb)
499{
500 unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits;
501 unsigned int pages_per_cluster = 1;
502
503 if (PAGE_CACHE_SHIFT < cbits)
504 pages_per_cluster = 1 << (cbits - PAGE_CACHE_SHIFT);
505
506 return pages_per_cluster;
507}
508
464#define ocfs2_set_bit ext2_set_bit 509#define ocfs2_set_bit ext2_set_bit
465#define ocfs2_clear_bit ext2_clear_bit 510#define ocfs2_clear_bit ext2_clear_bit
466#define ocfs2_test_bit ext2_test_bit 511#define ocfs2_test_bit ext2_test_bit
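
The three new inline helpers convert between page indexes and cluster numbers purely by comparing PAGE_CACHE_SHIFT with the per-volume cluster shift, which the superblock check later in this patch constrains to 12..20. The arithmetic checks out in a userspace sketch; the shift values here are illustrative:

#include <stdio.h>

#define PAGE_SHIFT 12u  /* 4K pages, typical for x86 */

/* The cluster containing the page at pg_index, mirroring the math in
 * ocfs2_page_index_to_clusters with sample shift values. */
static unsigned int page_index_to_clusters(unsigned int cbits,
                                           unsigned long pg_index)
{
        if (PAGE_SHIFT > cbits)
                return pg_index << (PAGE_SHIFT - cbits);
        if (PAGE_SHIFT < cbits)
                return pg_index >> (cbits - PAGE_SHIFT);
        return pg_index;
}

/* First page index covering a given cluster, mirroring
 * ocfs2_align_clusters_to_page_index. */
static unsigned long clusters_to_page_index(unsigned int cbits,
                                            unsigned int clusters)
{
        if (PAGE_SHIFT > cbits)
                return (unsigned long)clusters >> (PAGE_SHIFT - cbits);
        if (PAGE_SHIFT < cbits)
                return (unsigned long)clusters << (cbits - PAGE_SHIFT);
        return clusters;
}

int main(void)
{
        /* 4K pages with 64K clusters (cbits = 16): 16 pages per cluster,
         * so cluster 3 starts at page index 48 and page 48 maps back. */
        printf("page 48 -> cluster %u\n", page_index_to_clusters(16, 48));
        printf("cluster 3 -> page %lu\n", clusters_to_page_index(16, 3));
        return 0;
}
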
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index e61e218f5e0b..f0d9eb08547a 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -86,7 +86,8 @@
86 OCFS2_SB(sb)->s_feature_incompat &= ~(mask) 86 OCFS2_SB(sb)->s_feature_incompat &= ~(mask)
87 87
88#define OCFS2_FEATURE_COMPAT_SUPP OCFS2_FEATURE_COMPAT_BACKUP_SB 88#define OCFS2_FEATURE_COMPAT_SUPP OCFS2_FEATURE_COMPAT_BACKUP_SB
89#define OCFS2_FEATURE_INCOMPAT_SUPP OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT 89#define OCFS2_FEATURE_INCOMPAT_SUPP (OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT \
90 | OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC)
90#define OCFS2_FEATURE_RO_COMPAT_SUPP 0 91#define OCFS2_FEATURE_RO_COMPAT_SUPP 0
91 92
92/* 93/*
@@ -155,10 +156,18 @@
155#define OCFS2_FL_MODIFIABLE (0x000100FF) /* User modifiable flags */ 156#define OCFS2_FL_MODIFIABLE (0x000100FF) /* User modifiable flags */
156 157
157/* 158/*
159 * Extent record flags (e_node.leaf.flags)
160 */
161#define OCFS2_EXT_UNWRITTEN (0x01) /* Extent is allocated but
162 * unwritten */
163
164/*
158 * ioctl commands 165 * ioctl commands
159 */ 166 */
160#define OCFS2_IOC_GETFLAGS _IOR('f', 1, long) 167#define OCFS2_IOC_GETFLAGS _IOR('f', 1, long)
161#define OCFS2_IOC_SETFLAGS _IOW('f', 2, long) 168#define OCFS2_IOC_SETFLAGS _IOW('f', 2, long)
169#define OCFS2_IOC32_GETFLAGS _IOR('f', 1, int)
170#define OCFS2_IOC32_SETFLAGS _IOW('f', 2, int)
162 171
163/* 172/*
164 * Journal Flags (ocfs2_dinode.id1.journal1.i_flags) 173 * Journal Flags (ocfs2_dinode.id1.journal1.i_flags)
@@ -282,10 +291,21 @@ static unsigned char ocfs2_type_by_mode[S_IFMT >> S_SHIFT] = {
282/* 291/*
283 * On disk extent record for OCFS2 292 * On disk extent record for OCFS2
284 * It describes a range of clusters on disk. 293 * It describes a range of clusters on disk.
294 *
295 * Length fields are divided into interior and leaf node versions.
296 * This leaves room for a flags field (OCFS2_EXT_*) in the leaf nodes.
285 */ 297 */
286struct ocfs2_extent_rec { 298struct ocfs2_extent_rec {
287/*00*/ __le32 e_cpos; /* Offset into the file, in clusters */ 299/*00*/ __le32 e_cpos; /* Offset into the file, in clusters */
288 __le32 e_clusters; /* Clusters covered by this extent */ 300 union {
301 __le32 e_int_clusters; /* Clusters covered by all children */
302 struct {
303 __le16 e_leaf_clusters; /* Clusters covered by this
304 extent */
305 __u8 e_reserved1;
306 __u8 e_flags; /* Extent flags */
307 };
308 };
289 __le64 e_blkno; /* Physical disk offset, in blocks */ 309 __le64 e_blkno; /* Physical disk offset, in blocks */
290/*10*/ 310/*10*/
291}; 311};
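
Interior nodes keep a 32-bit cluster count while leaf records now split that word into a 16-bit count, a reserved byte, and the new flags byte. This is why the journal.h hunk earlier reads ocfs2_rec_clusters(last_el, ...) instead of le32_to_cpu(e_clusters): the right field depends on the tree depth. The helper's body is not part of this diff, so the sketch below is a guess at its shape, in host-endian userspace form with C11 anonymous structs standing in for the kernel's:

#include <stdint.h>
#include <stdio.h>

#define OCFS2_EXT_UNWRITTEN 0x01

/* Simplified host-endian copy of the new on-disk layout. */
struct extent_rec {
        uint32_t e_cpos;
        union {
                uint32_t e_int_clusters;        /* interior nodes */
                struct {
                        uint16_t e_leaf_clusters;       /* leaves */
                        uint8_t  e_reserved1;
                        uint8_t  e_flags;
                };
        };
        uint64_t e_blkno;
};

/* Pick the cluster count appropriate to the node's depth. */
static uint32_t rec_clusters(uint16_t tree_depth, const struct extent_rec *rec)
{
        return tree_depth ? rec->e_int_clusters : rec->e_leaf_clusters;
}

int main(void)
{
        struct extent_rec leaf = { .e_cpos = 0, .e_blkno = 100 };

        leaf.e_leaf_clusters = 8;
        leaf.e_flags = OCFS2_EXT_UNWRITTEN;

        printf("leaf clusters: %u, unwritten: %d\n",
               rec_clusters(0, &leaf),
               !!(leaf.e_flags & OCFS2_EXT_UNWRITTEN));
        return 0;
}
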
@@ -311,7 +331,10 @@ struct ocfs2_extent_list {
311/*00*/ __le16 l_tree_depth; /* Extent tree depth from this 331/*00*/ __le16 l_tree_depth; /* Extent tree depth from this
312 point. 0 means data extents 332 point. 0 means data extents
313 hang directly off this 333 hang directly off this
314 header (a leaf) */ 334 header (a leaf)
335 NOTE: The high 8 bits cannot be
336 used - tree_depth is never that big.
337 */
315 __le16 l_count; /* Number of extent records */ 338 __le16 l_count; /* Number of extent records */
316 __le16 l_next_free_rec; /* Next unused extent slot */ 339 __le16 l_next_free_rec; /* Next unused extent slot */
317 __le16 l_reserved1; 340 __le16 l_reserved1;
@@ -446,7 +469,9 @@ struct ocfs2_dinode {
446 __le32 i_ctime_nsec; 469 __le32 i_ctime_nsec;
447 __le32 i_mtime_nsec; 470 __le32 i_mtime_nsec;
448 __le32 i_attr; 471 __le32 i_attr;
449 __le32 i_reserved1; 472 __le16 i_orphaned_slot; /* Only valid when OCFS2_ORPHANED_FL
473 was set in i_flags */
474 __le16 i_reserved1;
450/*70*/ __le64 i_reserved2[8]; 475/*70*/ __le64 i_reserved2[8];
451/*B8*/ union { 476/*B8*/ union {
452 __le64 i_pad1; /* Generic way to refer to this 477 __le64 i_pad1; /* Generic way to refer to this
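
The OCFS2_IOC32_* pair added above exists because _IOR()/_IOW() encode sizeof(type) into the command number: a 32-bit process passes long as 4 bytes where a 64-bit kernel expects 8, so the compat ioctl layer needs its own constants. A quick userspace demonstration (Linux; the printed values depend on the ABI):

#include <stdio.h>
#include <sys/ioctl.h>

/* Same ('f', 1) command pair, different argument types: the encoded
 * size field differs, so a 32-bit caller and a 64-bit kernel disagree
 * on the number unless a compat entry like OCFS2_IOC32_GETFLAGS exists. */
#define GETFLAGS_LONG _IOR('f', 1, long)
#define GETFLAGS_INT  _IOR('f', 1, int)

int main(void)
{
        printf("_IOR('f', 1, long) = %#lx (sizeof long = %zu)\n",
               (unsigned long)GETFLAGS_LONG, sizeof(long));
        printf("_IOR('f', 1, int)  = %#lx (sizeof int  = %zu)\n",
               (unsigned long)GETFLAGS_INT, sizeof(int));
        return 0;
}

On a 64-bit build the two numbers differ, which is exactly the mismatch the new IOC32 definitions paper over.
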
diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h
index 4d5d5655c185..4ca02b1c38ac 100644
--- a/fs/ocfs2/ocfs2_lockid.h
+++ b/fs/ocfs2/ocfs2_lockid.h
@@ -44,6 +44,7 @@ enum ocfs2_lock_type {
44 OCFS2_LOCK_TYPE_RENAME, 44 OCFS2_LOCK_TYPE_RENAME,
45 OCFS2_LOCK_TYPE_RW, 45 OCFS2_LOCK_TYPE_RW,
46 OCFS2_LOCK_TYPE_DENTRY, 46 OCFS2_LOCK_TYPE_DENTRY,
47 OCFS2_LOCK_TYPE_OPEN,
47 OCFS2_NUM_LOCK_TYPES 48 OCFS2_NUM_LOCK_TYPES
48}; 49};
49 50
@@ -69,6 +70,9 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type)
69 case OCFS2_LOCK_TYPE_DENTRY: 70 case OCFS2_LOCK_TYPE_DENTRY:
70 c = 'N'; 71 c = 'N';
71 break; 72 break;
73 case OCFS2_LOCK_TYPE_OPEN:
74 c = 'O';
75 break;
72 default: 76 default:
73 c = '\0'; 77 c = '\0';
74 } 78 }
@@ -85,6 +89,7 @@ static char *ocfs2_lock_type_strings[] = {
85 * important job it does, anyway. */ 89 * important job it does, anyway. */
86 [OCFS2_LOCK_TYPE_RW] = "Write/Read", 90 [OCFS2_LOCK_TYPE_RW] = "Write/Read",
87 [OCFS2_LOCK_TYPE_DENTRY] = "Dentry", 91 [OCFS2_LOCK_TYPE_DENTRY] = "Dentry",
92 [OCFS2_LOCK_TYPE_OPEN] = "Open",
88}; 93};
89 94
90static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type) 95static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type)
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index 2d3ac32cb74e..d921a28329dc 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -197,7 +197,7 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb)
197 goto bail; 197 goto bail;
198 } 198 }
199 199
200 status = ocfs2_extent_map_get_blocks(inode, 0ULL, 1, &blkno, NULL); 200 status = ocfs2_extent_map_get_blocks(inode, 0ULL, &blkno, NULL, NULL);
201 if (status < 0) { 201 if (status < 0) {
202 mlog_errno(status); 202 mlog_errno(status);
203 goto bail; 203 goto bail;
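
Both updated callers (here and in ocfs2_create_symlink_data earlier) now pass &p_blkno, &p_blocks, NULL rather than a requested block count, suggesting the function reports how many contiguous blocks follow the mapping and takes an optional flags out-parameter. The real prototype is not shown in this patch, so the mock below only illustrates the out-parameter convention; the types and the third parameter's meaning are assumptions:

#include <stdint.h>
#include <stdio.h>

/* Mock lookup with the post-patch calling convention: map a virtual
 * block, return the physical block and, optionally, the number of
 * contiguous blocks from that point.  Out-params may be NULL. */
static int get_blocks(uint64_t v_blkno, uint64_t *p_blkno,
                      uint64_t *p_blocks, unsigned int *ext_flags)
{
        /* Pretend virtual blocks 0..15 map to physical 1000..1015. */
        if (v_blkno > 15)
                return -1;
        if (p_blkno)
                *p_blkno = 1000 + v_blkno;
        if (p_blocks)
                *p_blocks = 16 - v_blkno;       /* run length left in extent */
        if (ext_flags)
                *ext_flags = 0;
        return 0;
}

int main(void)
{
        uint64_t blkno, blocks;

        /* Caller that needs the run length, as in the symlink code. */
        if (!get_blocks(0, &blkno, &blocks, NULL))
                printf("phys %llu, %llu contiguous\n",
                       (unsigned long long)blkno, (unsigned long long)blocks);

        /* Caller that only wants the first block, as in slot_map.c. */
        if (!get_blocks(0, &blkno, NULL, NULL))
                printf("phys %llu\n", (unsigned long long)blkno);
        return 0;
}
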
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 6dbb11762759..e3437626d183 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -381,8 +381,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
381 le32_to_cpu(fe->i_clusters))); 381 le32_to_cpu(fe->i_clusters)));
382 spin_unlock(&OCFS2_I(alloc_inode)->ip_lock); 382 spin_unlock(&OCFS2_I(alloc_inode)->ip_lock);
383 i_size_write(alloc_inode, le64_to_cpu(fe->i_size)); 383 i_size_write(alloc_inode, le64_to_cpu(fe->i_size));
384 alloc_inode->i_blocks = 384 alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode);
385 ocfs2_align_bytes_to_sectors(i_size_read(alloc_inode));
386 385
387 status = 0; 386 status = 0;
388bail: 387bail:
@@ -850,9 +849,9 @@ static int ocfs2_relink_block_group(handle_t *handle,
850 } 849 }
851 850
852 mlog(0, "Suballoc %llu, chain %u, move group %llu to top, prev = %llu\n", 851 mlog(0, "Suballoc %llu, chain %u, move group %llu to top, prev = %llu\n",
853 (unsigned long long)fe->i_blkno, chain, 852 (unsigned long long)le64_to_cpu(fe->i_blkno), chain,
854 (unsigned long long)bg->bg_blkno, 853 (unsigned long long)le64_to_cpu(bg->bg_blkno),
855 (unsigned long long)prev_bg->bg_blkno); 854 (unsigned long long)le64_to_cpu(prev_bg->bg_blkno));
856 855
857 fe_ptr = le64_to_cpu(fe->id2.i_chain.cl_recs[chain].c_blkno); 856 fe_ptr = le64_to_cpu(fe->id2.i_chain.cl_recs[chain].c_blkno);
858 bg_ptr = le64_to_cpu(bg->bg_next_group); 857 bg_ptr = le64_to_cpu(bg->bg_next_group);
@@ -1163,7 +1162,7 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1163 } 1162 }
1164 1163
1165 mlog(0, "alloc succeeds: we give %u bits from block group %llu\n", 1164 mlog(0, "alloc succeeds: we give %u bits from block group %llu\n",
1166 tmp_bits, (unsigned long long)bg->bg_blkno); 1165 tmp_bits, (unsigned long long)le64_to_cpu(bg->bg_blkno));
1167 1166
1168 *num_bits = tmp_bits; 1167 *num_bits = tmp_bits;
1169 1168
@@ -1228,7 +1227,7 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1228 } 1227 }
1229 1228
1230 mlog(0, "Allocated %u bits from suballocator %llu\n", *num_bits, 1229 mlog(0, "Allocated %u bits from suballocator %llu\n", *num_bits,
1231 (unsigned long long)fe->i_blkno); 1230 (unsigned long long)le64_to_cpu(fe->i_blkno));
1232 1231
1233 *bg_blkno = le64_to_cpu(bg->bg_blkno); 1232 *bg_blkno = le64_to_cpu(bg->bg_blkno);
1234 *bits_left = le16_to_cpu(bg->bg_free_bits_count); 1233 *bits_left = le16_to_cpu(bg->bg_free_bits_count);
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 6534f92424dd..7c5e3f5d6634 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -806,9 +806,6 @@ static int __init ocfs2_init(void)
806 806
807 ocfs2_print_version(); 807 ocfs2_print_version();
808 808
809 if (init_ocfs2_extent_maps())
810 return -ENOMEM;
811
812 status = init_ocfs2_uptodate_cache(); 809 status = init_ocfs2_uptodate_cache();
813 if (status < 0) { 810 if (status < 0) {
814 mlog_errno(status); 811 mlog_errno(status);
@@ -837,7 +834,6 @@ leave:
837 if (status < 0) { 834 if (status < 0) {
838 ocfs2_free_mem_caches(); 835 ocfs2_free_mem_caches();
839 exit_ocfs2_uptodate_cache(); 836 exit_ocfs2_uptodate_cache();
840 exit_ocfs2_extent_maps();
841 } 837 }
842 838
843 mlog_exit(status); 839 mlog_exit(status);
@@ -863,8 +859,6 @@ static void __exit ocfs2_exit(void)
863 859
864 unregister_filesystem(&ocfs2_fs_type); 860 unregister_filesystem(&ocfs2_fs_type);
865 861
866 exit_ocfs2_extent_maps();
867
868 exit_ocfs2_uptodate_cache(); 862 exit_ocfs2_uptodate_cache();
869 863
870 mlog_exit_void(); 864 mlog_exit_void();
@@ -943,8 +937,7 @@ static void ocfs2_inode_init_once(void *data,
943{ 937{
944 struct ocfs2_inode_info *oi = data; 938 struct ocfs2_inode_info *oi = data;
945 939
946 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == 940 if (flags & SLAB_CTOR_CONSTRUCTOR) {
947 SLAB_CTOR_CONSTRUCTOR) {
948 oi->ip_flags = 0; 941 oi->ip_flags = 0;
949 oi->ip_open_count = 0; 942 oi->ip_open_count = 0;
950 spin_lock_init(&oi->ip_lock); 943 spin_lock_init(&oi->ip_lock);
@@ -963,6 +956,7 @@ static void ocfs2_inode_init_once(void *data,
963 ocfs2_lock_res_init_once(&oi->ip_rw_lockres); 956 ocfs2_lock_res_init_once(&oi->ip_rw_lockres);
964 ocfs2_lock_res_init_once(&oi->ip_meta_lockres); 957 ocfs2_lock_res_init_once(&oi->ip_meta_lockres);
965 ocfs2_lock_res_init_once(&oi->ip_data_lockres); 958 ocfs2_lock_res_init_once(&oi->ip_data_lockres);
959 ocfs2_lock_res_init_once(&oi->ip_open_lockres);
966 960
967 ocfs2_metadata_cache_init(&oi->vfs_inode); 961 ocfs2_metadata_cache_init(&oi->vfs_inode);
968 962
@@ -1543,7 +1537,7 @@ static int ocfs2_verify_volume(struct ocfs2_dinode *di,
1543 } else if (bh->b_blocknr != le64_to_cpu(di->i_blkno)) { 1537 } else if (bh->b_blocknr != le64_to_cpu(di->i_blkno)) {
1544 mlog(ML_ERROR, "bad block number on superblock: " 1538 mlog(ML_ERROR, "bad block number on superblock: "
1545 "found %llu, should be %llu\n", 1539 "found %llu, should be %llu\n",
1546 (unsigned long long)di->i_blkno, 1540 (unsigned long long)le64_to_cpu(di->i_blkno),
1547 (unsigned long long)bh->b_blocknr); 1541 (unsigned long long)bh->b_blocknr);
1548 } else if (le32_to_cpu(di->id2.i_super.s_clustersize_bits) < 12 || 1542 } else if (le32_to_cpu(di->id2.i_super.s_clustersize_bits) < 12 ||
1549 le32_to_cpu(di->id2.i_super.s_clustersize_bits) > 20) { 1543 le32_to_cpu(di->id2.i_super.s_clustersize_bits) > 20) {
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c
index 40dc1a51f4a9..7134007ba22f 100644
--- a/fs/ocfs2/symlink.c
+++ b/fs/ocfs2/symlink.c
@@ -67,16 +67,9 @@ static char *ocfs2_page_getlink(struct dentry * dentry,
67 page = read_mapping_page(mapping, 0, NULL); 67 page = read_mapping_page(mapping, 0, NULL);
68 if (IS_ERR(page)) 68 if (IS_ERR(page))
69 goto sync_fail; 69 goto sync_fail;
70 wait_on_page_locked(page);
71 if (!PageUptodate(page))
72 goto async_fail;
73 *ppage = page; 70 *ppage = page;
74 return kmap(page); 71 return kmap(page);
75 72
76async_fail:
77 page_cache_release(page);
78 return ERR_PTR(-EIO);
79
80sync_fail: 73sync_fail:
81 return (char*)page; 74 return (char*)page;
82} 75}
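
The wait_on_page_locked()/PageUptodate() dance can go because read_mapping_page() in this kernel returns either an uptodate page or a pointer-encoded error, so IS_ERR() is the only check the caller needs; the same simplification appears in fs/partitions/check.c below. A userspace rendition of the ERR_PTR/IS_ERR idiom used throughout these files — the helpers are reimplemented here for illustration:

#include <errno.h>
#include <stdio.h>

/* Userspace stand-ins for the kernel's pointer-encoded errors: small
 * negative values cast to pointers can never be valid addresses. */
#define MAX_ERRNO 4095
static void *err_ptr(long err) { return (void *)err; }
static int is_err(const void *p)
{
        return (unsigned long)p >= (unsigned long)-MAX_ERRNO;
}
static long ptr_err(const void *p) { return (long)p; }

static char page_data[] = "target-of-symlink";

/* Mimics the post-patch contract: success returns a usable buffer,
 * failure returns an encoded error -- never a half-read page. */
static char *read_page(int fail)
{
        if (fail)
                return err_ptr(-EIO);
        return page_data;
}

int main(void)
{
        char *p = read_page(0);

        if (is_err(p))
                printf("error %ld\n", ptr_err(p));
        else
                printf("link: %s\n", p);

        p = read_page(1);
        if (is_err(p))
                printf("error %ld\n", ptr_err(p));      /* prints -5 */
        return 0;
}
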
diff --git a/fs/ocfs2/vote.c b/fs/ocfs2/vote.c
index f30e63b9910c..4f82a2f0efef 100644
--- a/fs/ocfs2/vote.c
+++ b/fs/ocfs2/vote.c
@@ -63,17 +63,10 @@ struct ocfs2_msg_hdr
63 __be32 h_node_num; /* node sending this particular message. */ 63 __be32 h_node_num; /* node sending this particular message. */
64}; 64};
65 65
66/* OCFS2_MAX_FILENAME_LEN is 255 characters, but we want to align this
67 * for the network. */
68#define OCFS2_VOTE_FILENAME_LEN 256
69struct ocfs2_vote_msg 66struct ocfs2_vote_msg
70{ 67{
71 struct ocfs2_msg_hdr v_hdr; 68 struct ocfs2_msg_hdr v_hdr;
72 union { 69 __be32 v_reserved1;
73 __be32 v_generic1;
74 __be32 v_orphaned_slot; /* Used during delete votes */
75 __be32 v_nlink; /* Used during unlink votes */
76 } md1; /* Message type dependant 1 */
77}; 70};
78 71
79/* Responses are given these values to maintain backwards 72/* Responses are given these values to maintain backwards
@@ -86,7 +79,6 @@ struct ocfs2_response_msg
86{ 79{
87 struct ocfs2_msg_hdr r_hdr; 80 struct ocfs2_msg_hdr r_hdr;
88 __be32 r_response; 81 __be32 r_response;
89 __be32 r_orphaned_slot;
90}; 82};
91 83
92struct ocfs2_vote_work { 84struct ocfs2_vote_work {
@@ -96,7 +88,6 @@ struct ocfs2_vote_work {
96 88
97enum ocfs2_vote_request { 89enum ocfs2_vote_request {
98 OCFS2_VOTE_REQ_INVALID = 0, 90 OCFS2_VOTE_REQ_INVALID = 0,
99 OCFS2_VOTE_REQ_DELETE,
100 OCFS2_VOTE_REQ_MOUNT, 91 OCFS2_VOTE_REQ_MOUNT,
101 OCFS2_VOTE_REQ_UMOUNT, 92 OCFS2_VOTE_REQ_UMOUNT,
102 OCFS2_VOTE_REQ_LAST 93 OCFS2_VOTE_REQ_LAST
@@ -151,135 +142,23 @@ static void ocfs2_process_umount_request(struct ocfs2_super *osb,
151 ocfs2_node_map_set_bit(osb, &osb->umount_map, node_num); 142 ocfs2_node_map_set_bit(osb, &osb->umount_map, node_num);
152} 143}
153 144
154void ocfs2_mark_inode_remotely_deleted(struct inode *inode)
155{
156 struct ocfs2_inode_info *oi = OCFS2_I(inode);
157
158 assert_spin_locked(&oi->ip_lock);
159 /* We set the SKIP_DELETE flag on the inode so we don't try to
160 * delete it in delete_inode ourselves, thus avoiding
161 * unecessary lock pinging. If the other node failed to wipe
162 * the inode as a result of a crash, then recovery will pick
163 * up the slack. */
164 oi->ip_flags |= OCFS2_INODE_DELETED|OCFS2_INODE_SKIP_DELETE;
165}
166
167static int ocfs2_process_delete_request(struct inode *inode,
168 int *orphaned_slot)
169{
170 int response = OCFS2_RESPONSE_BUSY;
171
172 mlog(0, "DELETE vote on inode %lu, read lnk_cnt = %u, slot = %d\n",
173 inode->i_ino, inode->i_nlink, *orphaned_slot);
174
175 spin_lock(&OCFS2_I(inode)->ip_lock);
176
177 /* Whatever our vote response is, we want to make sure that
178 * the orphaned slot is recorded properly on this node *and*
179 * on the requesting node. Technically, if the requesting node
180 * did not know which slot the inode is orphaned in but we
181 * respond with BUSY he doesn't actually need the orphaned
182 * slot, but it doesn't hurt to do it here anyway. */
183 if ((*orphaned_slot) != OCFS2_INVALID_SLOT) {
184 mlog_bug_on_msg(OCFS2_I(inode)->ip_orphaned_slot !=
185 OCFS2_INVALID_SLOT &&
186 OCFS2_I(inode)->ip_orphaned_slot !=
187 (*orphaned_slot),
188 "Inode %llu: This node thinks it's "
189 "orphaned in slot %d, messaged it's in %d\n",
190 (unsigned long long)OCFS2_I(inode)->ip_blkno,
191 OCFS2_I(inode)->ip_orphaned_slot,
192 *orphaned_slot);
193
194 mlog(0, "Setting orphaned slot for inode %llu to %d\n",
195 (unsigned long long)OCFS2_I(inode)->ip_blkno,
196 *orphaned_slot);
197
198 OCFS2_I(inode)->ip_orphaned_slot = *orphaned_slot;
199 } else {
200 mlog(0, "Sending back orphaned slot %d for inode %llu\n",
201 OCFS2_I(inode)->ip_orphaned_slot,
202 (unsigned long long)OCFS2_I(inode)->ip_blkno);
203
204 *orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot;
205 }
206
207 /* vote no if the file is still open. */
208 if (OCFS2_I(inode)->ip_open_count) {
209 mlog(0, "open count = %u\n",
210 OCFS2_I(inode)->ip_open_count);
211 spin_unlock(&OCFS2_I(inode)->ip_lock);
212 goto done;
213 }
214 spin_unlock(&OCFS2_I(inode)->ip_lock);
215
216 /* directories are a bit ugly... What if someone is sitting in
217 * it? We want to make sure the inode is removed completely as
218 * a result of the iput in process_vote. */
219 if (S_ISDIR(inode->i_mode) && (atomic_read(&inode->i_count) != 1)) {
220 mlog(0, "i_count = %u\n", atomic_read(&inode->i_count));
221 goto done;
222 }
223
224 if (filemap_fdatawrite(inode->i_mapping)) {
225 mlog(ML_ERROR, "Could not sync inode %llu for delete!\n",
226 (unsigned long long)OCFS2_I(inode)->ip_blkno);
227 goto done;
228 }
229 sync_mapping_buffers(inode->i_mapping);
230 truncate_inode_pages(inode->i_mapping, 0);
231 ocfs2_extent_map_trunc(inode, 0);
232
233 spin_lock(&OCFS2_I(inode)->ip_lock);
234 /* double check open count - someone might have raced this
235 * thread into ocfs2_file_open while we were writing out
236 * data. If we're to allow a wipe of this inode now, we *must*
237 * hold the spinlock until we've marked it. */
238 if (OCFS2_I(inode)->ip_open_count) {
239 mlog(0, "Raced to wipe! open count = %u\n",
240 OCFS2_I(inode)->ip_open_count);
241 spin_unlock(&OCFS2_I(inode)->ip_lock);
242 goto done;
243 }
244
245 /* Mark the inode as being wiped from disk. */
246 ocfs2_mark_inode_remotely_deleted(inode);
247 spin_unlock(&OCFS2_I(inode)->ip_lock);
248
249 /* Not sure this is necessary anymore. */
250 d_prune_aliases(inode);
251
252 /* If we get here, then we're voting 'yes', so commit the
253 * delete on our side. */
254 response = OCFS2_RESPONSE_OK;
255done:
256 return response;
257}
258
259static void ocfs2_process_vote(struct ocfs2_super *osb, 145static void ocfs2_process_vote(struct ocfs2_super *osb,
260 struct ocfs2_vote_msg *msg) 146 struct ocfs2_vote_msg *msg)
261{ 147{
262 int net_status, vote_response; 148 int net_status, vote_response;
263 int orphaned_slot = 0; 149 unsigned int node_num;
264 unsigned int node_num, generation;
265 u64 blkno; 150 u64 blkno;
266 enum ocfs2_vote_request request; 151 enum ocfs2_vote_request request;
267 struct inode *inode = NULL;
268 struct ocfs2_msg_hdr *hdr = &msg->v_hdr; 152 struct ocfs2_msg_hdr *hdr = &msg->v_hdr;
269 struct ocfs2_response_msg response; 153 struct ocfs2_response_msg response;
270 154
271 /* decode the network mumbo jumbo into local variables. */ 155 /* decode the network mumbo jumbo into local variables. */
272 request = be32_to_cpu(hdr->h_request); 156 request = be32_to_cpu(hdr->h_request);
273 blkno = be64_to_cpu(hdr->h_blkno); 157 blkno = be64_to_cpu(hdr->h_blkno);
274 generation = be32_to_cpu(hdr->h_generation);
275 node_num = be32_to_cpu(hdr->h_node_num); 158 node_num = be32_to_cpu(hdr->h_node_num);
276 if (request == OCFS2_VOTE_REQ_DELETE)
277 orphaned_slot = be32_to_cpu(msg->md1.v_orphaned_slot);
278 159
279 mlog(0, "processing vote: request = %u, blkno = %llu, " 160 mlog(0, "processing vote: request = %u, blkno = %llu, node_num = %u\n",
280 "generation = %u, node_num = %u, priv1 = %u\n", request, 161 request, (unsigned long long)blkno, node_num);
281 (unsigned long long)blkno, generation, node_num,
282 be32_to_cpu(msg->md1.v_generic1));
283 162
284 if (!ocfs2_is_valid_vote_request(request)) { 163 if (!ocfs2_is_valid_vote_request(request)) {
285 mlog(ML_ERROR, "Invalid vote request %d from node %u\n", 164 mlog(ML_ERROR, "Invalid vote request %d from node %u\n",
@@ -302,52 +181,6 @@ static void ocfs2_process_vote(struct ocfs2_super *osb,
302 break; 181 break;
303 } 182 }
304 183
305 /* We cannot process the remaining message types before we're
306 * fully mounted. It's perfectly safe however to send a 'yes'
307 * response as we can't possibly have any of the state they're
308 * asking us to modify yet. */
309 if (atomic_read(&osb->vol_state) == VOLUME_INIT)
310 goto respond;
311
312 /* If we get here, then the request is against an inode. */
313 inode = ocfs2_ilookup_for_vote(osb, blkno,
314 request == OCFS2_VOTE_REQ_DELETE);
315
316 /* Not finding the inode is perfectly valid - it means we're
317 * not interested in what the other node is about to do to it
318 * so in those cases we automatically respond with an
319 * affirmative. Cluster locking ensures that we won't race
320 * interest in the inode with this vote request. */
321 if (!inode)
322 goto respond;
323
324 /* Check generation values. It's possible for us to get a
325 * request against a stale inode. If so then we proceed as if
326 * we had not found an inode in the first place. */
327 if (inode->i_generation != generation) {
328 mlog(0, "generation passed %u != inode generation = %u, "
329 "ip_flags = %x, ip_blkno = %llu, msg %llu, i_count = %u, "
330 "message type = %u\n", generation, inode->i_generation,
331 OCFS2_I(inode)->ip_flags,
332 (unsigned long long)OCFS2_I(inode)->ip_blkno,
333 (unsigned long long)blkno, atomic_read(&inode->i_count),
334 request);
335 iput(inode);
336 inode = NULL;
337 goto respond;
338 }
339
340 switch (request) {
341 case OCFS2_VOTE_REQ_DELETE:
342 vote_response = ocfs2_process_delete_request(inode,
343 &orphaned_slot);
344 break;
345 default:
346 mlog(ML_ERROR, "node %u, invalid request: %u\n",
347 node_num, request);
348 vote_response = OCFS2_RESPONSE_BAD_MSG;
349 }
350
351respond: 184respond:
352 /* Response struture is small so we just put it on the stack 185 /* Response struture is small so we just put it on the stack
353 * and stuff it inline. */ 186 * and stuff it inline. */
@@ -357,7 +190,6 @@ respond:
357 response.r_hdr.h_generation = hdr->h_generation; 190 response.r_hdr.h_generation = hdr->h_generation;
358 response.r_hdr.h_node_num = cpu_to_be32(osb->node_num); 191 response.r_hdr.h_node_num = cpu_to_be32(osb->node_num);
359 response.r_response = cpu_to_be32(vote_response); 192 response.r_response = cpu_to_be32(vote_response);
360 response.r_orphaned_slot = cpu_to_be32(orphaned_slot);
361 193
362 net_status = o2net_send_message(OCFS2_MESSAGE_TYPE_RESPONSE, 194 net_status = o2net_send_message(OCFS2_MESSAGE_TYPE_RESPONSE,
363 osb->net_key, 195 osb->net_key,
@@ -373,9 +205,6 @@ respond:
373 && net_status != -ENOTCONN) 205 && net_status != -ENOTCONN)
374 mlog(ML_ERROR, "message to node %u fails with error %d!\n", 206 mlog(ML_ERROR, "message to node %u fails with error %d!\n",
375 node_num, net_status); 207 node_num, net_status);
376
377 if (inode)
378 iput(inode);
379} 208}
380 209
381static void ocfs2_vote_thread_do_work(struct ocfs2_super *osb) 210static void ocfs2_vote_thread_do_work(struct ocfs2_super *osb)
@@ -634,8 +463,7 @@ bail:
634static struct ocfs2_vote_msg * ocfs2_new_vote_request(struct ocfs2_super *osb, 463static struct ocfs2_vote_msg * ocfs2_new_vote_request(struct ocfs2_super *osb,
635 u64 blkno, 464 u64 blkno,
636 unsigned int generation, 465 unsigned int generation,
637 enum ocfs2_vote_request type, 466 enum ocfs2_vote_request type)
638 u32 priv)
639{ 467{
640 struct ocfs2_vote_msg *request; 468 struct ocfs2_vote_msg *request;
641 struct ocfs2_msg_hdr *hdr; 469 struct ocfs2_msg_hdr *hdr;
@@ -651,8 +479,6 @@ static struct ocfs2_vote_msg * ocfs2_new_vote_request(struct ocfs2_super *osb,
651 hdr->h_request = cpu_to_be32(type); 479 hdr->h_request = cpu_to_be32(type);
652 hdr->h_blkno = cpu_to_be64(blkno); 480 hdr->h_blkno = cpu_to_be64(blkno);
653 hdr->h_generation = cpu_to_be32(generation); 481 hdr->h_generation = cpu_to_be32(generation);
654
655 request->md1.v_generic1 = cpu_to_be32(priv);
656 } 482 }
657 483
658 return request; 484 return request;
@@ -664,7 +490,7 @@ static int ocfs2_do_request_vote(struct ocfs2_super *osb,
664 struct ocfs2_vote_msg *request, 490 struct ocfs2_vote_msg *request,
665 struct ocfs2_net_response_cb *callback) 491 struct ocfs2_net_response_cb *callback)
666{ 492{
667 int status, response; 493 int status, response = -EBUSY;
668 unsigned int response_id; 494 unsigned int response_id;
669 struct ocfs2_msg_hdr *hdr; 495 struct ocfs2_msg_hdr *hdr;
670 496
@@ -686,109 +512,12 @@ bail:
686 return status; 512 return status;
687} 513}
688 514
689static int ocfs2_request_vote(struct inode *inode,
690 struct ocfs2_vote_msg *request,
691 struct ocfs2_net_response_cb *callback)
692{
693 int status;
694 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
695
696 if (ocfs2_inode_is_new(inode))
697 return 0;
698
699 status = -EAGAIN;
700 while (status == -EAGAIN) {
701 if (!(osb->s_mount_opt & OCFS2_MOUNT_NOINTR) &&
702 signal_pending(current))
703 return -ERESTARTSYS;
704
705 status = ocfs2_super_lock(osb, 0);
706 if (status < 0) {
707 mlog_errno(status);
708 break;
709 }
710
711 status = 0;
712 if (!ocfs2_node_map_is_only(osb, &osb->mounted_map,
713 osb->node_num))
714 status = ocfs2_do_request_vote(osb, request, callback);
715
716 ocfs2_super_unlock(osb, 0);
717 }
718 return status;
719}
720
721static void ocfs2_delete_response_cb(void *priv,
722 struct ocfs2_response_msg *resp)
723{
724 int orphaned_slot, node;
725 struct inode *inode = priv;
726
727 orphaned_slot = be32_to_cpu(resp->r_orphaned_slot);
728 node = be32_to_cpu(resp->r_hdr.h_node_num);
729 mlog(0, "node %d tells us that inode %llu is orphaned in slot %d\n",
730 node, (unsigned long long)OCFS2_I(inode)->ip_blkno,
731 orphaned_slot);
732
733 /* The other node may not actually know which slot the inode
734 * is orphaned in. */
735 if (orphaned_slot == OCFS2_INVALID_SLOT)
736 return;
737
738 /* Ok, the responding node knows which slot this inode is
739 * orphaned in. We verify that the information is correct and
740 * then record this in the inode. ocfs2_delete_inode will use
741 * this information to determine which lock to take. */
742 spin_lock(&OCFS2_I(inode)->ip_lock);
743 mlog_bug_on_msg(OCFS2_I(inode)->ip_orphaned_slot != orphaned_slot &&
744 OCFS2_I(inode)->ip_orphaned_slot
745 != OCFS2_INVALID_SLOT, "Inode %llu: Node %d says it's "
746 "orphaned in slot %d, we think it's in %d\n",
747 (unsigned long long)OCFS2_I(inode)->ip_blkno,
748 be32_to_cpu(resp->r_hdr.h_node_num),
749 orphaned_slot, OCFS2_I(inode)->ip_orphaned_slot);
750
751 OCFS2_I(inode)->ip_orphaned_slot = orphaned_slot;
752 spin_unlock(&OCFS2_I(inode)->ip_lock);
753}
754
755int ocfs2_request_delete_vote(struct inode *inode)
756{
757 int orphaned_slot, status;
758 struct ocfs2_net_response_cb delete_cb;
759 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
760 struct ocfs2_vote_msg *request;
761
762 spin_lock(&OCFS2_I(inode)->ip_lock);
763 orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot;
764 spin_unlock(&OCFS2_I(inode)->ip_lock);
765
766 delete_cb.rc_cb = ocfs2_delete_response_cb;
767 delete_cb.rc_priv = inode;
768
769 mlog(0, "Inode %llu, we start thinking orphaned slot is %d\n",
770 (unsigned long long)OCFS2_I(inode)->ip_blkno, orphaned_slot);
771
772 status = -ENOMEM;
773 request = ocfs2_new_vote_request(osb, OCFS2_I(inode)->ip_blkno,
774 inode->i_generation,
775 OCFS2_VOTE_REQ_DELETE, orphaned_slot);
776 if (request) {
777 status = ocfs2_request_vote(inode, request, &delete_cb);
778
779 kfree(request);
780 }
781
782 return status;
783}
784
785int ocfs2_request_mount_vote(struct ocfs2_super *osb) 515int ocfs2_request_mount_vote(struct ocfs2_super *osb)
786{ 516{
787 int status; 517 int status;
788 struct ocfs2_vote_msg *request = NULL; 518 struct ocfs2_vote_msg *request = NULL;
789 519
790 request = ocfs2_new_vote_request(osb, 0ULL, 0, 520 request = ocfs2_new_vote_request(osb, 0ULL, 0, OCFS2_VOTE_REQ_MOUNT);
791 OCFS2_VOTE_REQ_MOUNT, 0);
792 if (!request) { 521 if (!request) {
793 status = -ENOMEM; 522 status = -ENOMEM;
794 goto bail; 523 goto bail;
@@ -821,8 +550,7 @@ int ocfs2_request_umount_vote(struct ocfs2_super *osb)
821 int status; 550 int status;
822 struct ocfs2_vote_msg *request = NULL; 551 struct ocfs2_vote_msg *request = NULL;
823 552
824 request = ocfs2_new_vote_request(osb, 0ULL, 0, 553 request = ocfs2_new_vote_request(osb, 0ULL, 0, OCFS2_VOTE_REQ_UMOUNT);
825 OCFS2_VOTE_REQ_UMOUNT, 0);
826 if (!request) { 554 if (!request) {
827 status = -ENOMEM; 555 status = -ENOMEM;
828 goto bail; 556 goto bail;
@@ -969,7 +697,6 @@ static int ocfs2_handle_vote_message(struct o2net_msg *msg,
969 be32_to_cpu(work->w_msg.v_hdr.h_generation)); 697 be32_to_cpu(work->w_msg.v_hdr.h_generation));
970 mlog(0, "h_node_num = %u\n", 698 mlog(0, "h_node_num = %u\n",
971 be32_to_cpu(work->w_msg.v_hdr.h_node_num)); 699 be32_to_cpu(work->w_msg.v_hdr.h_node_num));
972 mlog(0, "v_generic1 = %u\n", be32_to_cpu(work->w_msg.md1.v_generic1));
973 700
974 spin_lock(&osb->vote_task_lock); 701 spin_lock(&osb->vote_task_lock);
975 list_add_tail(&work->w_list, &osb->vote_list); 702 list_add_tail(&work->w_list, &osb->vote_list);
diff --git a/fs/ocfs2/vote.h b/fs/ocfs2/vote.h
index 53ebc1c69e56..9ea46f62de31 100644
--- a/fs/ocfs2/vote.h
+++ b/fs/ocfs2/vote.h
@@ -38,14 +38,11 @@ static inline void ocfs2_kick_vote_thread(struct ocfs2_super *osb)
38 wake_up(&osb->vote_event); 38 wake_up(&osb->vote_event);
39} 39}
40 40
41int ocfs2_request_delete_vote(struct inode *inode);
42int ocfs2_request_mount_vote(struct ocfs2_super *osb); 41int ocfs2_request_mount_vote(struct ocfs2_super *osb);
43int ocfs2_request_umount_vote(struct ocfs2_super *osb); 42int ocfs2_request_umount_vote(struct ocfs2_super *osb);
44int ocfs2_register_net_handlers(struct ocfs2_super *osb); 43int ocfs2_register_net_handlers(struct ocfs2_super *osb);
45void ocfs2_unregister_net_handlers(struct ocfs2_super *osb); 44void ocfs2_unregister_net_handlers(struct ocfs2_super *osb);
46 45
47void ocfs2_mark_inode_remotely_deleted(struct inode *inode);
48
49void ocfs2_remove_node_from_vote_queues(struct ocfs2_super *osb, 46void ocfs2_remove_node_from_vote_queues(struct ocfs2_super *osb,
50 int node_num); 47 int node_num);
51#endif 48#endif
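
What survives of the vote protocol still keeps every header field in big-endian (__be32/__be64) form, encoded with cpu_to_be*() in ocfs2_new_vote_request() and decoded with be*_to_cpu() in ocfs2_process_vote(). A userspace equivalent using glibc's <endian.h> — htobe32()/be32toh() and friends are an assumption about the build environment:

#include <endian.h>
#include <stdint.h>
#include <stdio.h>

/* Host-side mirror of the trimmed-down header: all fields travel
 * big-endian, like the __be32/__be64 members of ocfs2_msg_hdr. */
struct vote_hdr {
        uint32_t h_response_id;
        uint32_t h_request;
        uint64_t h_blkno;
        uint32_t h_generation;
        uint32_t h_node_num;
};

int main(void)
{
        struct vote_hdr wire, decoded;

        /* Encode, as ocfs2_new_vote_request does with cpu_to_be32/64. */
        wire.h_response_id = htobe32(7);
        wire.h_request = htobe32(1);    /* mount vote in the new enum */
        wire.h_blkno = htobe64(0);
        wire.h_generation = htobe32(0);
        wire.h_node_num = htobe32(3);

        /* ...the struct would cross the wire via o2net here... */

        /* Decode, as ocfs2_process_vote does with be32_to_cpu. */
        decoded.h_request = be32toh(wire.h_request);
        decoded.h_blkno = be64toh(wire.h_blkno);
        decoded.h_node_num = be32toh(wire.h_node_num);

        printf("request %u, blkno %llu, from node %u\n",
               decoded.h_request,
               (unsigned long long)decoded.h_blkno, decoded.h_node_num);
        return 0;
}
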
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index bde1c164417d..731a90e9f0cd 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -419,8 +419,7 @@ static void op_inode_init_once(void *data, struct kmem_cache * cachep, unsigned
419{ 419{
420 struct op_inode_info *oi = (struct op_inode_info *) data; 420 struct op_inode_info *oi = (struct op_inode_info *) data;
421 421
422 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == 422 if (flags & SLAB_CTOR_CONSTRUCTOR)
423 SLAB_CTOR_CONSTRUCTOR)
424 inode_init_once(&oi->vfs_inode); 423 inode_init_once(&oi->vfs_inode);
425} 424}
426 425
diff --git a/fs/partitions/acorn.c b/fs/partitions/acorn.c
index 1bc9f372c7d4..e3491328596b 100644
--- a/fs/partitions/acorn.c
+++ b/fs/partitions/acorn.c
@@ -271,7 +271,7 @@ adfspart_check_ADFS(struct parsed_partitions *state, struct block_device *bdev)
271 extern void xd_set_geometry(struct block_device *, 271 extern void xd_set_geometry(struct block_device *,
272 unsigned char, unsigned char, unsigned int); 272 unsigned char, unsigned char, unsigned int);
273 xd_set_geometry(bdev, dr->secspertrack, heads, 1); 273 xd_set_geometry(bdev, dr->secspertrack, heads, 1);
274 invalidate_bdev(bdev, 1); 274 invalidate_bh_lrus();
275 truncate_inode_pages(bdev->bd_inode->i_mapping, 0); 275 truncate_inode_pages(bdev->bd_inode->i_mapping, 0);
276 } 276 }
277#endif 277#endif
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 8a7d0035ad7a..6b9dae3f0e6c 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -312,7 +312,7 @@ static struct attribute * default_attrs[] = {
312 NULL, 312 NULL,
313}; 313};
314 314
315extern struct subsystem block_subsys; 315extern struct kset block_subsys;
316 316
317static void part_release(struct kobject *kobj) 317static void part_release(struct kobject *kobj)
318{ 318{
@@ -388,7 +388,7 @@ void add_partition(struct gendisk *disk, int part, sector_t start, sector_t len,
388 kobject_add(&p->kobj); 388 kobject_add(&p->kobj);
389 if (!disk->part_uevent_suppress) 389 if (!disk->part_uevent_suppress)
390 kobject_uevent(&p->kobj, KOBJ_ADD); 390 kobject_uevent(&p->kobj, KOBJ_ADD);
391 sysfs_create_link(&p->kobj, &block_subsys.kset.kobj, "subsystem"); 391 sysfs_create_link(&p->kobj, &block_subsys.kobj, "subsystem");
392 if (flags & ADDPART_FLAG_WHOLEDISK) { 392 if (flags & ADDPART_FLAG_WHOLEDISK) {
393 static struct attribute addpartattr = { 393 static struct attribute addpartattr = {
394 .name = "whole_disk", 394 .name = "whole_disk",
@@ -444,7 +444,7 @@ static int disk_sysfs_symlinks(struct gendisk *disk)
444 goto err_out_dev_link; 444 goto err_out_dev_link;
445 } 445 }
446 446
447 err = sysfs_create_link(&disk->kobj, &block_subsys.kset.kobj, 447 err = sysfs_create_link(&disk->kobj, &block_subsys.kobj,
448 "subsystem"); 448 "subsystem");
449 if (err) 449 if (err)
450 goto err_out_disk_name_lnk; 450 goto err_out_disk_name_lnk;
@@ -569,9 +569,6 @@ unsigned char *read_dev_sector(struct block_device *bdev, sector_t n, Sector *p)
569 page = read_mapping_page(mapping, (pgoff_t)(n >> (PAGE_CACHE_SHIFT-9)), 569 page = read_mapping_page(mapping, (pgoff_t)(n >> (PAGE_CACHE_SHIFT-9)),
570 NULL); 570 NULL);
571 if (!IS_ERR(page)) { 571 if (!IS_ERR(page)) {
572 wait_on_page_locked(page);
573 if (!PageUptodate(page))
574 goto fail;
575 if (PageError(page)) 572 if (PageError(page))
576 goto fail; 573 goto fail;
577 p->v = page; 574 p->v = page;
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 989af5e55d1b..ec158dd02b3a 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -715,6 +715,40 @@ static const struct file_operations proc_oom_adjust_operations = {
715 .write = oom_adjust_write, 715 .write = oom_adjust_write,
716}; 716};
717 717
718static ssize_t clear_refs_write(struct file *file, const char __user *buf,
719 size_t count, loff_t *ppos)
720{
721 struct task_struct *task;
722 char buffer[PROC_NUMBUF], *end;
723 struct mm_struct *mm;
724
725 memset(buffer, 0, sizeof(buffer));
726 if (count > sizeof(buffer) - 1)
727 count = sizeof(buffer) - 1;
728 if (copy_from_user(buffer, buf, count))
729 return -EFAULT;
730 if (!simple_strtol(buffer, &end, 0))
731 return -EINVAL;
732 if (*end == '\n')
733 end++;
734 task = get_proc_task(file->f_path.dentry->d_inode);
735 if (!task)
736 return -ESRCH;
737 mm = get_task_mm(task);
738 if (mm) {
739 clear_refs_smap(mm);
740 mmput(mm);
741 }
742 put_task_struct(task);
743 if (end - buffer == 0)
744 return -EIO;
745 return end - buffer;
746}
747
748static struct file_operations proc_clear_refs_operations = {
749 .write = clear_refs_write,
750};
751
718#ifdef CONFIG_AUDITSYSCALL 752#ifdef CONFIG_AUDITSYSCALL
719#define TMPBUFLEN 21 753#define TMPBUFLEN 21
720static ssize_t proc_loginuid_read(struct file * file, char __user * buf, 754static ssize_t proc_loginuid_read(struct file * file, char __user * buf,
@@ -1851,6 +1885,7 @@ static struct pid_entry tgid_base_stuff[] = {
1851 REG("mounts", S_IRUGO, mounts), 1885 REG("mounts", S_IRUGO, mounts),
1852 REG("mountstats", S_IRUSR, mountstats), 1886 REG("mountstats", S_IRUSR, mountstats),
1853#ifdef CONFIG_MMU 1887#ifdef CONFIG_MMU
1888 REG("clear_refs", S_IWUSR, clear_refs),
1854 REG("smaps", S_IRUGO, smaps), 1889 REG("smaps", S_IRUGO, smaps),
1855#endif 1890#endif
1856#ifdef CONFIG_SECURITY 1891#ifdef CONFIG_SECURITY
@@ -2132,6 +2167,7 @@ static struct pid_entry tid_base_stuff[] = {
2132 LNK("exe", exe), 2167 LNK("exe", exe),
2133 REG("mounts", S_IRUGO, mounts), 2168 REG("mounts", S_IRUGO, mounts),
2134#ifdef CONFIG_MMU 2169#ifdef CONFIG_MMU
2170 REG("clear_refs", S_IWUSR, clear_refs),
2135 REG("smaps", S_IRUGO, smaps), 2171 REG("smaps", S_IRUGO, smaps),
2136#endif 2172#endif
2137#ifdef CONFIG_SECURITY 2173#ifdef CONFIG_SECURITY
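
The new write-only /proc/<pid>/clear_refs file accepts a non-zero number and clears the per-page accessed bits, so a monitor can zero the Referenced counters, let the workload run, and re-read smaps to estimate a working set. A minimal user of the interface — this assumes a kernel carrying this patch:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        /* Reset the referenced bits for our own address space. */
        int fd = open("/proc/self/clear_refs", O_WRONLY);

        if (fd < 0) {
                perror("open"); /* kernel without clear_refs support */
                return 1;
        }
        if (write(fd, "1", 1) != 1)
                perror("write");
        close(fd);

        /* After the workload has run for an interval, the Referenced:
         * lines in /proc/self/smaps approximate the pages touched
         * since the reset. */
        puts("cleared; re-read /proc/self/smaps after the interval");
        return 0;
}
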
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index c372eb151a3a..22b1158389ae 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -109,8 +109,7 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag
109{ 109{
110 struct proc_inode *ei = (struct proc_inode *) foo; 110 struct proc_inode *ei = (struct proc_inode *) foo;
111 111
112 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == 112 if (flags & SLAB_CTOR_CONSTRUCTOR)
113 SLAB_CTOR_CONSTRUCTOR)
114 inode_init_once(&ei->vfs_inode); 113 inode_init_once(&ei->vfs_inode);
115} 114}
116 115
diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c
index abdf068bc27f..eca471bc8512 100644
--- a/fs/proc/proc_devtree.c
+++ b/fs/proc/proc_devtree.c
@@ -38,7 +38,7 @@ static int property_read_proc(char *page, char **start, off_t off,
38 n = count; 38 n = count;
39 else 39 else
40 *eof = 1; 40 *eof = 1;
41 memcpy(page, pp->value + off, n); 41 memcpy(page, (char *)pp->value + off, n);
42 *start = page; 42 *start = page;
43 return n; 43 return n;
44} 44}
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index e2c4c0a5c90d..75ec6523d29a 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -398,8 +398,6 @@ static const struct file_operations proc_modules_operations = {
398#endif 398#endif
399 399
400#ifdef CONFIG_SLAB 400#ifdef CONFIG_SLAB
401extern struct seq_operations slabinfo_op;
402extern ssize_t slabinfo_write(struct file *, const char __user *, size_t, loff_t *);
403static int slabinfo_open(struct inode *inode, struct file *file) 401static int slabinfo_open(struct inode *inode, struct file *file)
404{ 402{
405 return seq_open(file, &slabinfo_op); 403 return seq_open(file, &slabinfo_op);
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 7445980c8022..4008c060f7ef 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -120,6 +120,14 @@ struct mem_size_stats
120 unsigned long shared_dirty; 120 unsigned long shared_dirty;
121 unsigned long private_clean; 121 unsigned long private_clean;
122 unsigned long private_dirty; 122 unsigned long private_dirty;
123 unsigned long referenced;
124};
125
126struct pmd_walker {
127 struct vm_area_struct *vma;
128 void *private;
129 void (*action)(struct vm_area_struct *, pmd_t *, unsigned long,
130 unsigned long, void *);
123}; 131};
124 132
125static int show_map_internal(struct seq_file *m, void *v, struct mem_size_stats *mss) 133static int show_map_internal(struct seq_file *m, void *v, struct mem_size_stats *mss)
@@ -181,18 +189,20 @@ static int show_map_internal(struct seq_file *m, void *v, struct mem_size_stats
181 189
182 if (mss) 190 if (mss)
183 seq_printf(m, 191 seq_printf(m,
184 "Size: %8lu kB\n" 192 "Size: %8lu kB\n"
185 "Rss: %8lu kB\n" 193 "Rss: %8lu kB\n"
186 "Shared_Clean: %8lu kB\n" 194 "Shared_Clean: %8lu kB\n"
187 "Shared_Dirty: %8lu kB\n" 195 "Shared_Dirty: %8lu kB\n"
188 "Private_Clean: %8lu kB\n" 196 "Private_Clean: %8lu kB\n"
189 "Private_Dirty: %8lu kB\n", 197 "Private_Dirty: %8lu kB\n"
198 "Referenced: %8lu kB\n",
190 (vma->vm_end - vma->vm_start) >> 10, 199 (vma->vm_end - vma->vm_start) >> 10,
191 mss->resident >> 10, 200 mss->resident >> 10,
192 mss->shared_clean >> 10, 201 mss->shared_clean >> 10,
193 mss->shared_dirty >> 10, 202 mss->shared_dirty >> 10,
194 mss->private_clean >> 10, 203 mss->private_clean >> 10,
195 mss->private_dirty >> 10); 204 mss->private_dirty >> 10,
205 mss->referenced >> 10);
196 206
197 if (m->count < m->size) /* vma is copied successfully */ 207 if (m->count < m->size) /* vma is copied successfully */
198 m->version = (vma != get_gate_vma(task))? vma->vm_start: 0; 208 m->version = (vma != get_gate_vma(task))? vma->vm_start: 0;
@@ -205,15 +215,16 @@ static int show_map(struct seq_file *m, void *v)
205} 215}
206 216
207static void smaps_pte_range(struct vm_area_struct *vma, pmd_t *pmd, 217static void smaps_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
208 unsigned long addr, unsigned long end, 218 unsigned long addr, unsigned long end,
209 struct mem_size_stats *mss) 219 void *private)
210{ 220{
221 struct mem_size_stats *mss = private;
211 pte_t *pte, ptent; 222 pte_t *pte, ptent;
212 spinlock_t *ptl; 223 spinlock_t *ptl;
213 struct page *page; 224 struct page *page;
214 225
215 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 226 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
216 do { 227 for (; addr != end; pte++, addr += PAGE_SIZE) {
217 ptent = *pte; 228 ptent = *pte;
218 if (!pte_present(ptent)) 229 if (!pte_present(ptent))
219 continue; 230 continue;
@@ -224,6 +235,9 @@ static void smaps_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
224 if (!page) 235 if (!page)
225 continue; 236 continue;
226 237
238 /* Accumulate the size in pages that have been accessed. */
239 if (pte_young(ptent) || PageReferenced(page))
240 mss->referenced += PAGE_SIZE;
227 if (page_mapcount(page) >= 2) { 241 if (page_mapcount(page) >= 2) {
228 if (pte_dirty(ptent)) 242 if (pte_dirty(ptent))
229 mss->shared_dirty += PAGE_SIZE; 243 mss->shared_dirty += PAGE_SIZE;
@@ -235,57 +249,99 @@ static void smaps_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
235 else 249 else
236 mss->private_clean += PAGE_SIZE; 250 mss->private_clean += PAGE_SIZE;
237 } 251 }
238 } while (pte++, addr += PAGE_SIZE, addr != end); 252 }
239 pte_unmap_unlock(pte - 1, ptl); 253 pte_unmap_unlock(pte - 1, ptl);
240 cond_resched(); 254 cond_resched();
241} 255}
242 256
243static inline void smaps_pmd_range(struct vm_area_struct *vma, pud_t *pud, 257static void clear_refs_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
244 unsigned long addr, unsigned long end, 258 unsigned long addr, unsigned long end,
245 struct mem_size_stats *mss) 259 void *private)
260{
261 pte_t *pte, ptent;
262 spinlock_t *ptl;
263 struct page *page;
264
265 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
266 for (; addr != end; pte++, addr += PAGE_SIZE) {
267 ptent = *pte;
268 if (!pte_present(ptent))
269 continue;
270
271 page = vm_normal_page(vma, addr, ptent);
272 if (!page)
273 continue;
274
275 /* Clear accessed and referenced bits. */
276 ptep_test_and_clear_young(vma, addr, pte);
277 ClearPageReferenced(page);
278 }
279 pte_unmap_unlock(pte - 1, ptl);
280 cond_resched();
281}
282
283static inline void walk_pmd_range(struct pmd_walker *walker, pud_t *pud,
284 unsigned long addr, unsigned long end)
246{ 285{
247 pmd_t *pmd; 286 pmd_t *pmd;
248 unsigned long next; 287 unsigned long next;
249 288
250 pmd = pmd_offset(pud, addr); 289 for (pmd = pmd_offset(pud, addr); addr != end;
251 do { 290 pmd++, addr = next) {
252 next = pmd_addr_end(addr, end); 291 next = pmd_addr_end(addr, end);
253 if (pmd_none_or_clear_bad(pmd)) 292 if (pmd_none_or_clear_bad(pmd))
254 continue; 293 continue;
255 smaps_pte_range(vma, pmd, addr, next, mss); 294 walker->action(walker->vma, pmd, addr, next, walker->private);
256 } while (pmd++, addr = next, addr != end); 295 }
257} 296}
258 297
259static inline void smaps_pud_range(struct vm_area_struct *vma, pgd_t *pgd, 298static inline void walk_pud_range(struct pmd_walker *walker, pgd_t *pgd,
260 unsigned long addr, unsigned long end, 299 unsigned long addr, unsigned long end)
261 struct mem_size_stats *mss)
262{ 300{
263 pud_t *pud; 301 pud_t *pud;
264 unsigned long next; 302 unsigned long next;
265 303
266 pud = pud_offset(pgd, addr); 304 for (pud = pud_offset(pgd, addr); addr != end;
267 do { 305 pud++, addr = next) {
268 next = pud_addr_end(addr, end); 306 next = pud_addr_end(addr, end);
269 if (pud_none_or_clear_bad(pud)) 307 if (pud_none_or_clear_bad(pud))
270 continue; 308 continue;
271 smaps_pmd_range(vma, pud, addr, next, mss); 309 walk_pmd_range(walker, pud, addr, next);
272 } while (pud++, addr = next, addr != end); 310 }
273} 311}
274 312
275static inline void smaps_pgd_range(struct vm_area_struct *vma, 313/*
276 unsigned long addr, unsigned long end, 314 * walk_page_range - walk the page tables of a VMA with a callback
277 struct mem_size_stats *mss) 315 * @vma - VMA to walk
316 * @action - callback invoked for every bottom-level (PTE) page table
317 * @private - private data passed to the callback function
318 *
319 * Recursively walk the page table for the memory area in a VMA, calling
320 * a callback for every bottom-level (PTE) page table.
321 */
322static inline void walk_page_range(struct vm_area_struct *vma,
323 void (*action)(struct vm_area_struct *,
324 pmd_t *, unsigned long,
325 unsigned long, void *),
326 void *private)
278{ 327{
328 unsigned long addr = vma->vm_start;
329 unsigned long end = vma->vm_end;
330 struct pmd_walker walker = {
331 .vma = vma,
332 .private = private,
333 .action = action,
334 };
279 pgd_t *pgd; 335 pgd_t *pgd;
280 unsigned long next; 336 unsigned long next;
281 337
282 pgd = pgd_offset(vma->vm_mm, addr); 338 for (pgd = pgd_offset(vma->vm_mm, addr); addr != end;
283 do { 339 pgd++, addr = next) {
284 next = pgd_addr_end(addr, end); 340 next = pgd_addr_end(addr, end);
285 if (pgd_none_or_clear_bad(pgd)) 341 if (pgd_none_or_clear_bad(pgd))
286 continue; 342 continue;
287 smaps_pud_range(vma, pgd, addr, next, mss); 343 walk_pud_range(&walker, pgd, addr, next);
288 } while (pgd++, addr = next, addr != end); 344 }
289} 345}
290 346
291static int show_smap(struct seq_file *m, void *v) 347static int show_smap(struct seq_file *m, void *v)
@@ -295,10 +351,22 @@ static int show_smap(struct seq_file *m, void *v)
295 351
296 memset(&mss, 0, sizeof mss); 352 memset(&mss, 0, sizeof mss);
297 if (vma->vm_mm && !is_vm_hugetlb_page(vma)) 353 if (vma->vm_mm && !is_vm_hugetlb_page(vma))
298 smaps_pgd_range(vma, vma->vm_start, vma->vm_end, &mss); 354 walk_page_range(vma, smaps_pte_range, &mss);
299 return show_map_internal(m, v, &mss); 355 return show_map_internal(m, v, &mss);
300} 356}
301 357
358void clear_refs_smap(struct mm_struct *mm)
359{
360 struct vm_area_struct *vma;
361
362 down_read(&mm->mmap_sem);
363 for (vma = mm->mmap; vma; vma = vma->vm_next)
364 if (vma->vm_mm && !is_vm_hugetlb_page(vma))
365 walk_page_range(vma, clear_refs_pte_range, NULL);
366 flush_tlb_mm(mm);
367 up_read(&mm->mmap_sem);
368}
369
302static void *m_start(struct seq_file *m, loff_t *pos) 370static void *m_start(struct seq_file *m, loff_t *pos)
303{ 371{
304 struct proc_maps_private *priv = m->private; 372 struct proc_maps_private *priv = m->private;
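
The refactor collapses the smaps-specific pgd/pud/pmd walkers into a single walk_page_range() that drives a per-PTE-table callback through struct pmd_walker, which is what lets clear_refs_pte_range() reuse the same traversal. The shape of that callback pattern, reduced to a runnable userspace sketch where a flat range stands in for the page-table levels:

#include <stdio.h>

/* Parallel to struct pmd_walker: the traversal is generic, the
 * per-leaf action and its private data are supplied by the caller. */
struct walker {
        void (*action)(int leaf, void *private);
        void *private;
};

/* Stand-in for the pgd/pud/pmd descent: visit every leaf once. */
static void walk_range(int start, int end, struct walker *w)
{
        int i;

        for (i = start; i != end; i++)
                w->action(i, w->private);
}

/* One callback accumulates, like smaps_pte_range filling mem_size_stats... */
static void count_action(int leaf, void *private)
{
        *(int *)private += leaf;
}

/* ...another mutates state, like clear_refs_pte_range. */
static void clear_action(int leaf, void *private)
{
        (void)private;
        printf("clearing %d\n", leaf);
}

int main(void)
{
        int total = 0;
        struct walker w = { .action = count_action, .private = &total };

        walk_range(0, 4, &w);
        printf("total %d\n", total);

        w.action = clear_action;
        w.private = NULL;
        walk_range(0, 2, &w);
        return 0;
}
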
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index d96050728c43..523e1098ae88 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -514,7 +514,7 @@ static int __init parse_crash_elf64_headers(void)
514 /* Do some basic Verification. */ 514 /* Do some basic Verification. */
515 if (memcmp(ehdr.e_ident, ELFMAG, SELFMAG) != 0 || 515 if (memcmp(ehdr.e_ident, ELFMAG, SELFMAG) != 0 ||
516 (ehdr.e_type != ET_CORE) || 516 (ehdr.e_type != ET_CORE) ||
517 !elf_check_arch(&ehdr) || 517 !vmcore_elf_check_arch(&ehdr) ||
518 ehdr.e_ident[EI_CLASS] != ELFCLASS64 || 518 ehdr.e_ident[EI_CLASS] != ELFCLASS64 ||
519 ehdr.e_ident[EI_VERSION] != EV_CURRENT || 519 ehdr.e_ident[EI_VERSION] != EV_CURRENT ||
520 ehdr.e_version != EV_CURRENT || 520 ehdr.e_version != EV_CURRENT ||
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index 83bc8e7824cd..75fc8498f2e2 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -536,8 +536,7 @@ static void init_once(void *foo, struct kmem_cache * cachep,
536{ 536{
537 struct qnx4_inode_info *ei = (struct qnx4_inode_info *) foo; 537 struct qnx4_inode_info *ei = (struct qnx4_inode_info *) foo;
538 538
539 if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) == 539 if (flags & SLAB_CTOR_CONSTRUCTOR)
540 SLAB_CTOR_CONSTRUCTOR)
541 inode_init_once(&ei->vfs_inode); 540 inode_init_once(&ei->vfs_inode);
542} 541}
543 542
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index f13a7f164dc6..7054aaef0493 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -511,8 +511,7 @@ static void init_once(void *foo, struct kmem_cache * cachep, unsigned long flags
511{ 511{
512 struct reiserfs_inode_info *ei = (struct reiserfs_inode_info *)foo; 512 struct reiserfs_inode_info *ei = (struct reiserfs_inode_info *)foo;
513 513
514 if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) == 514 if (flags & SLAB_CTOR_CONSTRUCTOR) {
515 SLAB_CTOR_CONSTRUCTOR) {
516 INIT_LIST_HEAD(&ei->i_prealloc_list); 515 INIT_LIST_HEAD(&ei->i_prealloc_list);
517 inode_init_once(&ei->vfs_inode); 516 inode_init_once(&ei->vfs_inode);
518#ifdef CONFIG_REISERFS_FS_POSIX_ACL 517#ifdef CONFIG_REISERFS_FS_POSIX_ACL
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index f01389fd162e..bf6e58214538 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -54,82 +54,48 @@
54static struct reiserfs_xattr_handler *find_xattr_handler_prefix(const char 54static struct reiserfs_xattr_handler *find_xattr_handler_prefix(const char
55 *prefix); 55 *prefix);
56 56
57static struct dentry *create_xa_root(struct super_block *sb) 57/* Returns the dentry referring to the root of the extended attribute
58 * directory tree. If it has already been retrieved, it is used. If it
59 * hasn't been created and the flags indicate creation is allowed, we
60 * attempt to create it. On error, we return a pointer-encoded error.
61 */
62static struct dentry *get_xa_root(struct super_block *sb, int flags)
58{ 63{
59 struct dentry *privroot = dget(REISERFS_SB(sb)->priv_root); 64 struct dentry *privroot = dget(REISERFS_SB(sb)->priv_root);
60 struct dentry *xaroot; 65 struct dentry *xaroot;
61 66
62 /* This needs to be created at mount-time */ 67 /* This needs to be created at mount-time */
63 if (!privroot) 68 if (!privroot)
64 return ERR_PTR(-EOPNOTSUPP); 69 return ERR_PTR(-ENODATA);
65 70
66 xaroot = lookup_one_len(XAROOT_NAME, privroot, strlen(XAROOT_NAME)); 71 mutex_lock_nested(&privroot->d_inode->i_mutex, I_MUTEX_XATTR);
67 if (IS_ERR(xaroot)) { 72 if (REISERFS_SB(sb)->xattr_root) {
73 xaroot = dget(REISERFS_SB(sb)->xattr_root);
68 goto out; 74 goto out;
69 } else if (!xaroot->d_inode) {
70 int err;
71 mutex_lock(&privroot->d_inode->i_mutex);
72 err =
73 privroot->d_inode->i_op->mkdir(privroot->d_inode, xaroot,
74 0700);
75 mutex_unlock(&privroot->d_inode->i_mutex);
76
77 if (err) {
78 dput(xaroot);
79 dput(privroot);
80 return ERR_PTR(err);
81 }
82 REISERFS_SB(sb)->xattr_root = dget(xaroot);
83 } 75 }
84 76
85 out:
86 dput(privroot);
87 return xaroot;
88}
89
 90/* This will return a dentry, or error, referring to the xa root directory.
91 * If the xa root doesn't exist yet, the dentry will be returned without
92 * an associated inode. This dentry can be used with ->mkdir to create
93 * the xa directory. */
94static struct dentry *__get_xa_root(struct super_block *s)
95{
96 struct dentry *privroot = dget(REISERFS_SB(s)->priv_root);
97 struct dentry *xaroot = NULL;
98
99 if (IS_ERR(privroot) || !privroot)
100 return privroot;
101
102 xaroot = lookup_one_len(XAROOT_NAME, privroot, strlen(XAROOT_NAME)); 77 xaroot = lookup_one_len(XAROOT_NAME, privroot, strlen(XAROOT_NAME));
103 if (IS_ERR(xaroot)) { 78 if (IS_ERR(xaroot)) {
104 goto out; 79 goto out;
105 } else if (!xaroot->d_inode) { 80 } else if (!xaroot->d_inode) {
106 dput(xaroot); 81 int err = -ENODATA;
107 xaroot = NULL; 82 if (flags == 0 || flags & XATTR_CREATE)
108 goto out; 83 err = privroot->d_inode->i_op->mkdir(privroot->d_inode,
84 xaroot, 0700);
85 if (err) {
86 dput(xaroot);
87 xaroot = ERR_PTR(err);
88 goto out;
89 }
109 } 90 }
110 91 REISERFS_SB(sb)->xattr_root = dget(xaroot);
111 REISERFS_SB(s)->xattr_root = dget(xaroot);
112 92
113 out: 93 out:
94 mutex_unlock(&privroot->d_inode->i_mutex);
114 dput(privroot); 95 dput(privroot);
115 return xaroot; 96 return xaroot;
116} 97}
117 98
118/* Returns the dentry (or NULL) referring to the root of the extended
119 * attribute directory tree. If it has already been retrieved, it is used.
120 * Otherwise, we attempt to retrieve it from disk. It may also return
121 * a pointer-encoded error.
122 */
123static inline struct dentry *get_xa_root(struct super_block *s)
124{
125 struct dentry *dentry = dget(REISERFS_SB(s)->xattr_root);
126
127 if (!dentry)
128 dentry = __get_xa_root(s);
129
130 return dentry;
131}
132
133/* Opens the directory corresponding to the inode's extended attribute store. 99/* Opens the directory corresponding to the inode's extended attribute store.
134 * If flags allow, the tree to the directory may be created. If creation is 100 * If flags allow, the tree to the directory may be created. If creation is
135 * prohibited, -ENODATA is returned. */ 101 * prohibited, -ENODATA is returned. */
@@ -138,21 +104,11 @@ static struct dentry *open_xa_dir(const struct inode *inode, int flags)
138 struct dentry *xaroot, *xadir; 104 struct dentry *xaroot, *xadir;
139 char namebuf[17]; 105 char namebuf[17];
140 106
141 xaroot = get_xa_root(inode->i_sb); 107 xaroot = get_xa_root(inode->i_sb, flags);
142 if (IS_ERR(xaroot)) { 108 if (IS_ERR(xaroot))
143 return xaroot; 109 return xaroot;
144 } else if (!xaroot) {
145 if (flags == 0 || flags & XATTR_CREATE) {
146 xaroot = create_xa_root(inode->i_sb);
147 if (IS_ERR(xaroot))
148 return xaroot;
149 }
150 if (!xaroot)
151 return ERR_PTR(-ENODATA);
152 }
153 110
154 /* ok, we have xaroot open */ 111 /* ok, we have xaroot open */
155
156 snprintf(namebuf, sizeof(namebuf), "%X.%X", 112 snprintf(namebuf, sizeof(namebuf), "%X.%X",
157 le32_to_cpu(INODE_PKEY(inode)->k_objectid), 113 le32_to_cpu(INODE_PKEY(inode)->k_objectid),
158 inode->i_generation); 114 inode->i_generation);
@@ -454,11 +410,7 @@ static struct page *reiserfs_get_page(struct inode *dir, unsigned long n)
454 mapping_set_gfp_mask(mapping, GFP_NOFS); 410 mapping_set_gfp_mask(mapping, GFP_NOFS);
455 page = read_mapping_page(mapping, n, NULL); 411 page = read_mapping_page(mapping, n, NULL);
456 if (!IS_ERR(page)) { 412 if (!IS_ERR(page)) {
457 wait_on_page_locked(page);
458 kmap(page); 413 kmap(page);
459 if (!PageUptodate(page))
460 goto fail;
461
462 if (PageError(page)) 414 if (PageError(page))
463 goto fail; 415 goto fail;
464 } 416 }
@@ -821,7 +773,7 @@ int reiserfs_delete_xattrs(struct inode *inode)
821 773
822 /* Leftovers besides . and .. -- that's not good. */ 774 /* Leftovers besides . and .. -- that's not good. */
823 if (dir->d_inode->i_nlink <= 2) { 775 if (dir->d_inode->i_nlink <= 2) {
824 root = get_xa_root(inode->i_sb); 776 root = get_xa_root(inode->i_sb, XATTR_REPLACE);
825 reiserfs_write_lock_xattrs(inode->i_sb); 777 reiserfs_write_lock_xattrs(inode->i_sb);
826 err = vfs_rmdir(root->d_inode, dir); 778 err = vfs_rmdir(root->d_inode, dir);
827 reiserfs_write_unlock_xattrs(inode->i_sb); 779 reiserfs_write_unlock_xattrs(inode->i_sb);
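get_xa_root() now folds lookup, optional creation, caching and locking into one function keyed on the xattr flags: 0 or XATTR_CREATE permit creating the root directory, XATTR_REPLACE is lookup-only, and the I_MUTEX_XATTR nesting annotation keeps lockdep quiet when the private root's i_mutex is taken alongside a regular inode's. A hedged sketch of the caller pattern (helper name hypothetical), mirroring the delete path above:

        /* sketch: look up the xattr root without creating it */
        static int probe_xa_root(struct super_block *sb)
        {
                struct dentry *root = get_xa_root(sb, XATTR_REPLACE);

                if (IS_ERR(root))
                        return PTR_ERR(root);   /* -ENODATA when absent */
                dput(root);
                return 0;
        }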
diff --git a/fs/romfs/inode.c b/fs/romfs/inode.c
index fd601014813e..804285190271 100644
--- a/fs/romfs/inode.c
+++ b/fs/romfs/inode.c
@@ -570,8 +570,7 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag
570{ 570{
571 struct romfs_inode_info *ei = (struct romfs_inode_info *) foo; 571 struct romfs_inode_info *ei = (struct romfs_inode_info *) foo;
572 572
573 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == 573 if (flags & SLAB_CTOR_CONSTRUCTOR)
574 SLAB_CTOR_CONSTRUCTOR)
575 inode_init_once(&ei->vfs_inode); 574 inode_init_once(&ei->vfs_inode);
576} 575}
577 576
diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c
index 5faba4f1c9ab..424a3ddf86dd 100644
--- a/fs/smbfs/inode.c
+++ b/fs/smbfs/inode.c
@@ -69,9 +69,8 @@ static void smb_destroy_inode(struct inode *inode)
69static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flags) 69static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flags)
70{ 70{
71 struct smb_inode_info *ei = (struct smb_inode_info *) foo; 71 struct smb_inode_info *ei = (struct smb_inode_info *) foo;
72 unsigned long flagmask = SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR;
73 72
74 if ((flags & flagmask) == SLAB_CTOR_CONSTRUCTOR) 73 if (flags & SLAB_CTOR_CONSTRUCTOR)
75 inode_init_once(&ei->vfs_inode); 74 inode_init_once(&ei->vfs_inode);
76} 75}
77 76
diff --git a/fs/super.c b/fs/super.c
index 60b1e50cbf53..8341e4e1d738 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -725,16 +725,6 @@ static int test_bdev_super(struct super_block *s, void *data)
725 return (void *)s->s_bdev == data; 725 return (void *)s->s_bdev == data;
726} 726}
727 727
728static void bdev_uevent(struct block_device *bdev, enum kobject_action action)
729{
730 if (bdev->bd_disk) {
731 if (bdev->bd_part)
732 kobject_uevent(&bdev->bd_part->kobj, action);
733 else
734 kobject_uevent(&bdev->bd_disk->kobj, action);
735 }
736}
737
738int get_sb_bdev(struct file_system_type *fs_type, 728int get_sb_bdev(struct file_system_type *fs_type,
739 int flags, const char *dev_name, void *data, 729 int flags, const char *dev_name, void *data,
740 int (*fill_super)(struct super_block *, void *, int), 730 int (*fill_super)(struct super_block *, void *, int),
@@ -782,7 +772,6 @@ int get_sb_bdev(struct file_system_type *fs_type,
782 } 772 }
783 773
784 s->s_flags |= MS_ACTIVE; 774 s->s_flags |= MS_ACTIVE;
785 bdev_uevent(bdev, KOBJ_MOUNT);
786 } 775 }
787 776
788 return simple_set_mnt(mnt, s); 777 return simple_set_mnt(mnt, s);
@@ -801,7 +790,6 @@ void kill_block_super(struct super_block *sb)
801{ 790{
802 struct block_device *bdev = sb->s_bdev; 791 struct block_device *bdev = sb->s_bdev;
803 792
804 bdev_uevent(bdev, KOBJ_UMOUNT);
805 generic_shutdown_super(sb); 793 generic_shutdown_super(sb);
806 sync_blockdev(bdev); 794 sync_blockdev(bdev);
807 close_bdev_excl(bdev); 795 close_bdev_excl(bdev);
diff --git a/fs/sync.c b/fs/sync.c
index d0feff61e6aa..5cb9e7e43383 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -239,13 +239,11 @@ out:
239/* 239/*
240 * `endbyte' is inclusive 240 * `endbyte' is inclusive
241 */ 241 */
242int do_sync_file_range(struct file *file, loff_t offset, loff_t endbyte, 242int do_sync_mapping_range(struct address_space *mapping, loff_t offset,
243 unsigned int flags) 243 loff_t endbyte, unsigned int flags)
244{ 244{
245 int ret; 245 int ret;
246 struct address_space *mapping;
247 246
248 mapping = file->f_mapping;
249 if (!mapping) { 247 if (!mapping) {
250 ret = -EINVAL; 248 ret = -EINVAL;
251 goto out; 249 goto out;
@@ -275,4 +273,4 @@ int do_sync_file_range(struct file *file, loff_t offset, loff_t endbyte,
275out: 273out:
276 return ret; 274 return ret;
277} 275}
278EXPORT_SYMBOL_GPL(do_sync_file_range); 276EXPORT_SYMBOL_GPL(do_sync_mapping_range);
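Retargeting the helper at the address_space means a ranged writeback no longer requires a struct file, only a mapping. The old file-based entry point presumably reduces to a thin wrapper along these lines (a sketch, not the committed code):

        /* sketch: the file-based variant expressed via the new helper */
        static inline int do_sync_file_range(struct file *file, loff_t offset,
                                             loff_t endbyte, unsigned int flags)
        {
                return do_sync_mapping_range(file->f_mapping, offset,
                                             endbyte, flags);
        }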
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index fc4633378dc0..0e637adc2b87 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -13,8 +13,7 @@
13 13
14#include "sysfs.h" 14#include "sysfs.h"
15 15
16#define to_subsys(k) container_of(k,struct subsystem,kset.kobj) 16#define to_sattr(a) container_of(a,struct subsys_attribute, attr)
17#define to_sattr(a) container_of(a,struct subsys_attribute,attr)
18 17
19/* 18/*
20 * Subsystem file operations. 19 * Subsystem file operations.
@@ -24,12 +23,12 @@
24static ssize_t 23static ssize_t
25subsys_attr_show(struct kobject * kobj, struct attribute * attr, char * page) 24subsys_attr_show(struct kobject * kobj, struct attribute * attr, char * page)
26{ 25{
27 struct subsystem * s = to_subsys(kobj); 26 struct kset *kset = to_kset(kobj);
28 struct subsys_attribute * sattr = to_sattr(attr); 27 struct subsys_attribute * sattr = to_sattr(attr);
29 ssize_t ret = -EIO; 28 ssize_t ret = -EIO;
30 29
31 if (sattr->show) 30 if (sattr->show)
32 ret = sattr->show(s,page); 31 ret = sattr->show(kset, page);
33 return ret; 32 return ret;
34} 33}
35 34
@@ -37,12 +36,12 @@ static ssize_t
37subsys_attr_store(struct kobject * kobj, struct attribute * attr, 36subsys_attr_store(struct kobject * kobj, struct attribute * attr,
38 const char * page, size_t count) 37 const char * page, size_t count)
39{ 38{
40 struct subsystem * s = to_subsys(kobj); 39 struct kset *kset = to_kset(kobj);
41 struct subsys_attribute * sattr = to_sattr(attr); 40 struct subsys_attribute * sattr = to_sattr(attr);
42 ssize_t ret = -EIO; 41 ssize_t ret = -EIO;
43 42
44 if (sattr->store) 43 if (sattr->store)
45 ret = sattr->store(s,page,count); 44 ret = sattr->store(kset, page, count);
46 return ret; 45 return ret;
47} 46}
48 47
@@ -633,6 +632,7 @@ struct sysfs_schedule_callback_struct {
633 struct kobject *kobj; 632 struct kobject *kobj;
634 void (*func)(void *); 633 void (*func)(void *);
635 void *data; 634 void *data;
635 struct module *owner;
636 struct work_struct work; 636 struct work_struct work;
637}; 637};
638 638
@@ -643,6 +643,7 @@ static void sysfs_schedule_callback_work(struct work_struct *work)
643 643
644 (ss->func)(ss->data); 644 (ss->func)(ss->data);
645 kobject_put(ss->kobj); 645 kobject_put(ss->kobj);
646 module_put(ss->owner);
646 kfree(ss); 647 kfree(ss);
647} 648}
648 649
@@ -651,6 +652,7 @@ static void sysfs_schedule_callback_work(struct work_struct *work)
651 * @kobj: object we're acting for. 652 * @kobj: object we're acting for.
652 * @func: callback function to invoke later. 653 * @func: callback function to invoke later.
653 * @data: argument to pass to @func. 654 * @data: argument to pass to @func.
655 * @owner: module owning the callback code
654 * 656 *
655 * sysfs attribute methods must not unregister themselves or their parent 657 * sysfs attribute methods must not unregister themselves or their parent
656 * kobject (which would amount to the same thing). Attempts to do so will 658 * kobject (which would amount to the same thing). Attempts to do so will
@@ -663,20 +665,25 @@ static void sysfs_schedule_callback_work(struct work_struct *work)
663 * until @func returns. 665 * until @func returns.
664 * 666 *
665 * Returns 0 if the request was submitted, -ENOMEM if storage could not 667 * Returns 0 if the request was submitted, -ENOMEM if storage could not
666 * be allocated. 668 * be allocated, -ENODEV if a reference to @owner isn't available.
667 */ 669 */
668int sysfs_schedule_callback(struct kobject *kobj, void (*func)(void *), 670int sysfs_schedule_callback(struct kobject *kobj, void (*func)(void *),
669 void *data) 671 void *data, struct module *owner)
670{ 672{
671 struct sysfs_schedule_callback_struct *ss; 673 struct sysfs_schedule_callback_struct *ss;
672 674
675 if (!try_module_get(owner))
676 return -ENODEV;
673 ss = kmalloc(sizeof(*ss), GFP_KERNEL); 677 ss = kmalloc(sizeof(*ss), GFP_KERNEL);
674 if (!ss) 678 if (!ss) {
679 module_put(owner);
675 return -ENOMEM; 680 return -ENOMEM;
681 }
676 kobject_get(kobj); 682 kobject_get(kobj);
677 ss->kobj = kobj; 683 ss->kobj = kobj;
678 ss->func = func; 684 ss->func = func;
679 ss->data = data; 685 ss->data = data;
686 ss->owner = owner;
680 INIT_WORK(&ss->work, sysfs_schedule_callback_work); 687 INIT_WORK(&ss->work, sysfs_schedule_callback_work);
681 schedule_work(&ss->work); 688 schedule_work(&ss->work);
682 return 0; 689 return 0;
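Two things happen in this sysfs hunk: subsystem attributes now operate on the embedded kset directly, and deferred callbacks pin the module that owns the callback code so it cannot be unloaded while the work is queued. A hedged usage sketch (function names hypothetical):

        static void deferred_unregister(void *data)
        {
                struct kobject *kobj = data;

                kobject_unregister(kobj);       /* legal here: workqueue context */
        }

        /* called from an attribute method, which must not unregister itself */
        static int schedule_self_removal(struct kobject *kobj)
        {
                return sysfs_schedule_callback(kobj, deferred_unregister, kobj,
                                               THIS_MODULE);
        }

try_module_get() fails once the owning module is going away, so the new -ENODEV return closes the unload race rather than papering over it.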
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index b20951c93761..52eed2a7a5ef 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -70,9 +70,11 @@ void sysfs_remove_group(struct kobject * kobj,
70{ 70{
71 struct dentry * dir; 71 struct dentry * dir;
72 72
73 if (grp->name) 73 if (grp->name) {
74 dir = lookup_one_len(grp->name, kobj->dentry, 74 dir = lookup_one_len_kern(grp->name, kobj->dentry,
75 strlen(grp->name)); 75 strlen(grp->name));
76 BUG_ON(IS_ERR(dir));
77 }
76 else 78 else
77 dir = dget(kobj->dentry); 79 dir = dget(kobj->dentry);
78 80
diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c
index ebf7007fa161..e566b387fcf9 100644
--- a/fs/sysv/dir.c
+++ b/fs/sysv/dir.c
@@ -54,17 +54,9 @@ static struct page * dir_get_page(struct inode *dir, unsigned long n)
54{ 54{
55 struct address_space *mapping = dir->i_mapping; 55 struct address_space *mapping = dir->i_mapping;
56 struct page *page = read_mapping_page(mapping, n, NULL); 56 struct page *page = read_mapping_page(mapping, n, NULL);
57 if (!IS_ERR(page)) { 57 if (!IS_ERR(page))
58 wait_on_page_locked(page);
59 kmap(page); 58 kmap(page);
60 if (!PageUptodate(page))
61 goto fail;
62 }
63 return page; 59 return page;
64
65fail:
66 dir_put_page(page);
67 return ERR_PTR(-EIO);
68} 60}
69 61
70static int sysv_readdir(struct file * filp, void * dirent, filldir_t filldir) 62static int sysv_readdir(struct file * filp, void * dirent, filldir_t filldir)
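This is the same caller-side simplification seen in reiserfs above and repeated in ufs below: read_mapping_page() now returns either an ERR_PTR or a page that is already up to date, so the wait_on_page_locked()/PageUptodate() dance is dead weight. The resulting invariant, sketched:

        page = read_mapping_page(mapping, n, NULL);
        if (IS_ERR(page))
                return page;    /* read failure already folded into ERR_PTR */
        kmap(page);
        /* the page is up to date from here on */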
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index 9311cac186fe..3152d7415606 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -322,8 +322,7 @@ static void init_once(void *p, struct kmem_cache *cachep, unsigned long flags)
322{ 322{
323 struct sysv_inode_info *si = (struct sysv_inode_info *)p; 323 struct sysv_inode_info *si = (struct sysv_inode_info *)p;
324 324
325 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == 325 if (flags & SLAB_CTOR_CONSTRUCTOR)
326 SLAB_CTOR_CONSTRUCTOR)
327 inode_init_once(&si->vfs_inode); 326 inode_init_once(&si->vfs_inode);
328} 327}
329 328
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 8672b88f7ff2..023b304fdd99 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -134,9 +134,7 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag
134{ 134{
135 struct udf_inode_info *ei = (struct udf_inode_info *) foo; 135 struct udf_inode_info *ei = (struct udf_inode_info *) foo;
136 136
137 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == 137 if (flags & SLAB_CTOR_CONSTRUCTOR) {
138 SLAB_CTOR_CONSTRUCTOR)
139 {
140 ei->i_ext.i_data = NULL; 138 ei->i_ext.i_data = NULL;
141 inode_init_once(&ei->vfs_inode); 139 inode_init_once(&ei->vfs_inode);
142 } 140 }
diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c
index 4890ddf1518e..4fb8b2e077ee 100644
--- a/fs/ufs/dir.c
+++ b/fs/ufs/dir.c
@@ -180,13 +180,9 @@ fail:
180static struct page *ufs_get_page(struct inode *dir, unsigned long n) 180static struct page *ufs_get_page(struct inode *dir, unsigned long n)
181{ 181{
182 struct address_space *mapping = dir->i_mapping; 182 struct address_space *mapping = dir->i_mapping;
183 struct page *page = read_cache_page(mapping, n, 183 struct page *page = read_mapping_page(mapping, n, NULL);
184 (filler_t*)mapping->a_ops->readpage, NULL);
185 if (!IS_ERR(page)) { 184 if (!IS_ERR(page)) {
186 wait_on_page_locked(page);
187 kmap(page); 185 kmap(page);
188 if (!PageUptodate(page))
189 goto fail;
190 if (!PageChecked(page)) 186 if (!PageChecked(page))
191 ufs_check_page(page); 187 ufs_check_page(page);
192 if (PageError(page)) 188 if (PageError(page))
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 013d7afe7cde..f18b79122fa3 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -601,7 +601,7 @@ static void ufs_set_inode_ops(struct inode *inode)
601 ufs_get_inode_dev(inode->i_sb, UFS_I(inode))); 601 ufs_get_inode_dev(inode->i_sb, UFS_I(inode)));
602} 602}
603 603
604static void ufs1_read_inode(struct inode *inode, struct ufs_inode *ufs_inode) 604static int ufs1_read_inode(struct inode *inode, struct ufs_inode *ufs_inode)
605{ 605{
606 struct ufs_inode_info *ufsi = UFS_I(inode); 606 struct ufs_inode_info *ufsi = UFS_I(inode);
607 struct super_block *sb = inode->i_sb; 607 struct super_block *sb = inode->i_sb;
@@ -613,8 +613,10 @@ static void ufs1_read_inode(struct inode *inode, struct ufs_inode *ufs_inode)
613 */ 613 */
614 inode->i_mode = mode = fs16_to_cpu(sb, ufs_inode->ui_mode); 614 inode->i_mode = mode = fs16_to_cpu(sb, ufs_inode->ui_mode);
615 inode->i_nlink = fs16_to_cpu(sb, ufs_inode->ui_nlink); 615 inode->i_nlink = fs16_to_cpu(sb, ufs_inode->ui_nlink);
616 if (inode->i_nlink == 0) 616 if (inode->i_nlink == 0) {
617 ufs_error (sb, "ufs_read_inode", "inode %lu has zero nlink\n", inode->i_ino); 617 ufs_error (sb, "ufs_read_inode", "inode %lu has zero nlink\n", inode->i_ino);
618 return -1;
619 }
618 620
619 /* 621 /*
620 * Linux now has 32-bit uid and gid, so we can support EFT. 622 * Linux now has 32-bit uid and gid, so we can support EFT.
@@ -643,9 +645,10 @@ static void ufs1_read_inode(struct inode *inode, struct ufs_inode *ufs_inode)
643 for (i = 0; i < (UFS_NDADDR + UFS_NINDIR) * 4; i++) 645 for (i = 0; i < (UFS_NDADDR + UFS_NINDIR) * 4; i++)
644 ufsi->i_u1.i_symlink[i] = ufs_inode->ui_u2.ui_symlink[i]; 646 ufsi->i_u1.i_symlink[i] = ufs_inode->ui_u2.ui_symlink[i];
645 } 647 }
648 return 0;
646} 649}
647 650
648static void ufs2_read_inode(struct inode *inode, struct ufs2_inode *ufs2_inode) 651static int ufs2_read_inode(struct inode *inode, struct ufs2_inode *ufs2_inode)
649{ 652{
650 struct ufs_inode_info *ufsi = UFS_I(inode); 653 struct ufs_inode_info *ufsi = UFS_I(inode);
651 struct super_block *sb = inode->i_sb; 654 struct super_block *sb = inode->i_sb;
@@ -658,8 +661,10 @@ static void ufs2_read_inode(struct inode *inode, struct ufs2_inode *ufs2_inode)
658 */ 661 */
659 inode->i_mode = mode = fs16_to_cpu(sb, ufs2_inode->ui_mode); 662 inode->i_mode = mode = fs16_to_cpu(sb, ufs2_inode->ui_mode);
660 inode->i_nlink = fs16_to_cpu(sb, ufs2_inode->ui_nlink); 663 inode->i_nlink = fs16_to_cpu(sb, ufs2_inode->ui_nlink);
661 if (inode->i_nlink == 0) 664 if (inode->i_nlink == 0) {
662 ufs_error (sb, "ufs_read_inode", "inode %lu has zero nlink\n", inode->i_ino); 665 ufs_error (sb, "ufs_read_inode", "inode %lu has zero nlink\n", inode->i_ino);
666 return -1;
667 }
663 668
664 /* 669 /*
665 * Linux now has 32-bit uid and gid, so we can support EFT. 670 * Linux now has 32-bit uid and gid, so we can support EFT.
@@ -690,6 +695,7 @@ static void ufs2_read_inode(struct inode *inode, struct ufs2_inode *ufs2_inode)
690 for (i = 0; i < (UFS_NDADDR + UFS_NINDIR) * 4; i++) 695 for (i = 0; i < (UFS_NDADDR + UFS_NINDIR) * 4; i++)
691 ufsi->i_u1.i_symlink[i] = ufs2_inode->ui_u2.ui_symlink[i]; 696 ufsi->i_u1.i_symlink[i] = ufs2_inode->ui_u2.ui_symlink[i];
692 } 697 }
698 return 0;
693} 699}
694 700
695void ufs_read_inode(struct inode * inode) 701void ufs_read_inode(struct inode * inode)
@@ -698,6 +704,7 @@ void ufs_read_inode(struct inode * inode)
698 struct super_block * sb; 704 struct super_block * sb;
699 struct ufs_sb_private_info * uspi; 705 struct ufs_sb_private_info * uspi;
700 struct buffer_head * bh; 706 struct buffer_head * bh;
707 int err;
701 708
702 UFSD("ENTER, ino %lu\n", inode->i_ino); 709 UFSD("ENTER, ino %lu\n", inode->i_ino);
703 710
@@ -720,14 +727,17 @@ void ufs_read_inode(struct inode * inode)
720 if ((UFS_SB(sb)->s_flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) { 727 if ((UFS_SB(sb)->s_flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) {
721 struct ufs2_inode *ufs2_inode = (struct ufs2_inode *)bh->b_data; 728 struct ufs2_inode *ufs2_inode = (struct ufs2_inode *)bh->b_data;
722 729
723 ufs2_read_inode(inode, 730 err = ufs2_read_inode(inode,
724 ufs2_inode + ufs_inotofsbo(inode->i_ino)); 731 ufs2_inode + ufs_inotofsbo(inode->i_ino));
725 } else { 732 } else {
726 struct ufs_inode *ufs_inode = (struct ufs_inode *)bh->b_data; 733 struct ufs_inode *ufs_inode = (struct ufs_inode *)bh->b_data;
727 734
728 ufs1_read_inode(inode, ufs_inode + ufs_inotofsbo(inode->i_ino)); 735 err = ufs1_read_inode(inode,
736 ufs_inode + ufs_inotofsbo(inode->i_ino));
729 } 737 }
730 738
739 if (err)
740 goto bad_inode;
731 inode->i_version++; 741 inode->i_version++;
732 ufsi->i_lastfrag = 742 ufsi->i_lastfrag =
733 (inode->i_size + uspi->s_fsize - 1) >> uspi->s_fshift; 743 (inode->i_size + uspi->s_fsize - 1) >> uspi->s_fshift;
@@ -888,6 +898,8 @@ void ufs_delete_inode (struct inode * inode)
888 loff_t old_i_size; 898 loff_t old_i_size;
889 899
890 truncate_inode_pages(&inode->i_data, 0); 900 truncate_inode_pages(&inode->i_data, 0);
901 if (is_bad_inode(inode))
902 goto no_delete;
891 /*UFS_I(inode)->i_dtime = CURRENT_TIME;*/ 903 /*UFS_I(inode)->i_dtime = CURRENT_TIME;*/
892 lock_kernel(); 904 lock_kernel();
893 mark_inode_dirty(inode); 905 mark_inode_dirty(inode);
@@ -898,4 +910,7 @@ void ufs_delete_inode (struct inode * inode)
898 ufs_warning(inode->i_sb, __FUNCTION__, "ufs_truncate failed\n"); 910 ufs_warning(inode->i_sb, __FUNCTION__, "ufs_truncate failed\n");
899 ufs_free_inode (inode); 911 ufs_free_inode (inode);
900 unlock_kernel(); 912 unlock_kernel();
913 return;
914no_delete:
915 clear_inode(inode); /* We must guarantee clearing of inode... */
901} 916}
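ufs1_read_inode() and ufs2_read_inode() now report the zero-nlink corruption instead of pressing on, and ufs_delete_inode() refuses to touch an inode that was marked bad. The bad_inode label sits outside this hunk; the conventional body it jumps to is roughly (a sketch):

        bad_inode:
                make_bad_inode(inode);  /* i_op becomes the EIO stub set */
                return;

is_bad_inode() in the delete path then skips the truncate and free work and only clears the in-core inode, which is exactly the no_delete branch added above.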
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index b5a6461ec66b..be7c48c5f203 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -1237,8 +1237,7 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag
1237{ 1237{
1238 struct ufs_inode_info *ei = (struct ufs_inode_info *) foo; 1238 struct ufs_inode_info *ei = (struct ufs_inode_info *) foo;
1239 1239
1240 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == 1240 if (flags & SLAB_CTOR_CONSTRUCTOR)
1241 SLAB_CTOR_CONSTRUCTOR)
1242 inode_init_once(&ei->vfs_inode); 1241 inode_init_once(&ei->vfs_inode);
1243} 1242}
1244 1243
diff --git a/fs/ufs/util.c b/fs/ufs/util.c
index 17437574f79c..84357f1ff0ec 100644
--- a/fs/ufs/util.c
+++ b/fs/ufs/util.c
@@ -251,13 +251,11 @@ struct page *ufs_get_locked_page(struct address_space *mapping,
251 251
252 page = find_lock_page(mapping, index); 252 page = find_lock_page(mapping, index);
253 if (!page) { 253 if (!page) {
254 page = read_cache_page(mapping, index, 254 page = read_mapping_page(mapping, index, NULL);
255 (filler_t*)mapping->a_ops->readpage,
256 NULL);
257 255
258 if (IS_ERR(page)) { 256 if (IS_ERR(page)) {
259 printk(KERN_ERR "ufs_change_blocknr: " 257 printk(KERN_ERR "ufs_change_blocknr: "
260 "read_cache_page error: ino %lu, index: %lu\n", 258 "read_mapping_page error: ino %lu, index: %lu\n",
261 mapping->host->i_ino, index); 259 mapping->host->i_ino, index);
262 goto out; 260 goto out;
263 } 261 }
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 2f2c40db562e..14e2cbe5a8d5 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -360,8 +360,7 @@ xfs_fs_inode_init_once(
360 kmem_zone_t *zonep, 360 kmem_zone_t *zonep,
361 unsigned long flags) 361 unsigned long flags)
362{ 362{
363 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == 363 if (flags & SLAB_CTOR_CONSTRUCTOR)
364 SLAB_CTOR_CONSTRUCTOR)
365 inode_init_once(vn_to_inode((bhv_vnode_t *)vnode)); 364 inode_init_once(vn_to_inode((bhv_vnode_t *)vnode));
366} 365}
367 366