Merge master.kernel.org:/pub/scm/linux/kernel/git/torvalds/linux-2.6

Conflicts: drivers/usb/input/Makefile drivers/usb/input/gtco.c
author: Dmitry Torokhov <dtor@insightbb.com> 2007-05-01 00:24:54 -0400
committer: Dmitry Torokhov <dtor@insightbb.com> 2007-05-01 00:24:54 -0400
commit: bc95f3669f5e6f63cf0b84fe4922c3c6dd4aa775 (patch)
tree: 427fcf2a7287c16d4b5aa6cbf494d59579a6a8b1 /fs
parent: 3d29cdff999c37b3876082278a8134a0642a02cd (diff)
parent: dc87c3985e9b442c60994308a96f887579addc39 (diff)
253 files changed, 15126 insertions, 17563 deletions
diff --git a/fs/9p/fid.c b/fs/9p/fid.c
index a9b6301a04fc..90419715c7e9 100644
--- a/fs/9p/fid.c
+++ b/fs/9p/fid.c
@@ -136,7 +136,8 @@ struct v9fs_fid *v9fs_fid_lookup(struct dentry *dentry)
 }
 /**
- * v9fs_fid_clone - lookup the fid for a dentry, clone a private copy and release it
+ * v9fs_fid_clone - lookup the fid for a dentry, clone a private copy and
+ *                      release it
 * @dentry: dentry to look for fid in
 *
 * find a fid in the dentry and then clone to a new private fid
diff --git a/fs/9p/mux.c b/fs/9p/mux.c
index 147ceef8e537..c783874a9caf 100644
--- a/fs/9p/mux.c
+++ b/fs/9p/mux.c
@@ -256,7 +256,7 @@ static void v9fs_mux_poll_stop(struct v9fs_mux_data *m)
        vpt->muxnum--;
        if (!vpt->muxnum) {
                dprintk(DEBUG_MUX, "destroy proc %p\n", vpt);
-                send_sig(SIGKILL, vpt->task, 1);
+                kthread_stop(vpt->task);
                vpt->task = NULL;
                v9fs_mux_poll_task_num--;
        }
@@ -438,11 +438,8 @@ static int v9fs_poll_proc(void *a)
        vpt = a;
        dprintk(DEBUG_MUX, "start %p %p\n", current, vpt);
-        allow_signal(SIGKILL);
        while (!kthread_should_stop()) {
                set_current_state(TASK_INTERRUPTIBLE);
-                if (signal_pending(current))
-                        break;
                list_for_each_entry_safe(m, mtmp, &vpt->mux_list, mux_list) {
                        v9fs_poll_mux(m);
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index d9b561ba5e58..6ad6f192b6e4 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -53,6 +53,8 @@ enum {
        Opt_uname, Opt_remotename,
        /* Options that take no arguments */
        Opt_legacy, Opt_nodevmap, Opt_unix, Opt_tcp, Opt_fd,
+        /* Cache options */
+        Opt_cache_loose,
        /* Error token */
        Opt_err
 };
@@ -76,6 +78,8 @@ static match_table_t tokens = {
        {Opt_fd, "fd"},
        {Opt_legacy, "noextend"},
        {Opt_nodevmap, "nodevmap"},
+        {Opt_cache_loose, "cache=loose"},
+        {Opt_cache_loose, "loose"},
        {Opt_err, NULL}
 };
@@ -106,6 +110,7 @@ static void v9fs_parse_options(char *options, struct v9fs_session_info *v9ses)
        v9ses->debug = 0;
        v9ses->rfdno = ~0;
        v9ses->wfdno = ~0;
+        v9ses->cache = 0;
        if (!options)
                return;
@@ -121,7 +126,6 @@ static void v9fs_parse_options(char *options, struct v9fs_session_info *v9ses)
                                        "integer field, but no integer?\n");
                                continue;
                        }
                }
                switch (token) {
                case Opt_port:
@@ -169,6 +173,9 @@ static void v9fs_parse_options(char *options, struct v9fs_session_info *v9ses)
                case Opt_nodevmap:
                        v9ses->nodev = 1;
                        break;
+                case Opt_cache_loose:
+                        v9ses->cache = CACHE_LOOSE;
+                        break;
                default:
                        continue;
                }
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index c134d104cb28..820bf5ca35d8 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -47,7 +47,7 @@ struct v9fs_session_info {
        unsigned int afid;      /* authentication fid */
        unsigned int rfdno;     /* read file descriptor number */
        unsigned int wfdno;     /* write file descriptor number */
+        unsigned int cache;     /* cache mode */
        char *name;             /* user name to mount as */
        char *remotename;       /* name of remote hierarchy being mounted */
@@ -73,6 +73,13 @@ enum {
        PROTO_FD,
 };
+/* possible values of ->cache */
+/* eventually support loose, tight, time and session modes; the default is always none */
+enum {
+        CACHE_NONE,             /* default */
+        CACHE_LOOSE,            /* no consistency */
+};
 extern struct dentry *v9fs_debugfs_root;
 int v9fs_session_init(struct v9fs_session_info *, const char *, char *);
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index 450b0c1b385e..6a82d39dc498 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -42,6 +42,7 @@ extern const struct address_space_operations v9fs_addr_operations;
 extern const struct file_operations v9fs_file_operations;
 extern const struct file_operations v9fs_dir_operations;
 extern struct dentry_operations v9fs_dentry_operations;
+extern struct dentry_operations v9fs_cached_dentry_operations;
 struct inode *v9fs_get_inode(struct super_block *sb, int mode);
 ino_t v9fs_qid2ino(struct v9fs_qid *qid);
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index cc24abf232d5..bed48fa96521 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -63,6 +63,8 @@ static int v9fs_vfs_readpage(struct file *filp, struct page *page)
        int total = 0;
        int result = 0;
+        dprintk(DEBUG_VFS, "\n");
        buffer = kmap(page);
        do {
                if (count < rsize)
diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c
index 062daa6000ab..ddffd8aa902d 100644
--- a/fs/9p/vfs_dentry.c
+++ b/fs/9p/vfs_dentry.c
@@ -53,10 +53,31 @@
 static int v9fs_dentry_delete(struct dentry *dentry)
 {
        dprintk(DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_iname, dentry);
        return 1;
 }
 /**
+ * v9fs_cached_dentry_delete - called when dentry refcount equals 0
+ * @dentry:  dentry in question
+ *
+ * Return 1 only if the dentry has no inode.  Only non-synthetic files
+ * (ones whose mtime is non-zero) should be calling this function.
+ *
+ */
+static int v9fs_cached_dentry_delete(struct dentry *dentry)
+{
+        struct inode *inode = dentry->d_inode;
+        dprintk(DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_iname, dentry);
+        if(!inode)
+                return 1;
+        return 0;
+}
+/**
 * v9fs_dentry_release - called when dentry is going to be freed
 * @dentry:  dentry that is being released
 *
@@ -87,6 +108,11 @@ void v9fs_dentry_release(struct dentry *dentry)
        }
 }
+struct dentry_operations v9fs_cached_dentry_operations = {
+        .d_delete = v9fs_cached_dentry_delete,
+        .d_release = v9fs_dentry_release,
+};
 struct dentry_operations v9fs_dentry_operations = {
        .d_delete = v9fs_dentry_delete,
        .d_release = v9fs_dentry_release,
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 6c78343cf690..c7b677253843 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -42,6 +42,8 @@
 #include "v9fs_vfs.h"
 #include "fid.h"
+static const struct file_operations v9fs_cached_file_operations;
 /**
 * v9fs_file_open - open a file (or directory)
 * @inode: inode to be opened
@@ -79,6 +81,13 @@ int v9fs_file_open(struct inode *inode, struct file *file)
        vfid->filp = file;
        kfree(fcall);
+        if((vfid->qid.version) && (v9ses->cache)) {
+                dprintk(DEBUG_VFS, "cached");
+                /* enable cached file options */
+                if(file->f_op == &v9fs_file_operations)
+                        file->f_op = &v9fs_cached_file_operations;
+        }
        return 0;
 Clunk_Fid:
@@ -238,6 +247,17 @@ v9fs_file_write(struct file *filp, const char __user * data,
        return total;
 }
+static const struct file_operations v9fs_cached_file_operations = {
+        .llseek = generic_file_llseek,
+        .read = do_sync_read,
+        .aio_read = generic_file_aio_read,
+        .write = v9fs_file_write,
+        .open = v9fs_file_open,
+        .release = v9fs_dir_release,
+        .lock = v9fs_file_lock,
+        .mmap = generic_file_mmap,
+};
 const struct file_operations v9fs_file_operations = {
        .llseek = generic_file_llseek,
        .read = v9fs_file_read,
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 5cf22134826b..b01b0a457932 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -415,7 +415,7 @@ static int v9fs_remove(struct inode *dir, struct dentry *file, int rmdir)
        file_inode = file->d_inode;
        sb = file_inode->i_sb;
        v9ses = v9fs_inode2v9ses(file_inode);
-        v9fid = v9fs_fid_lookup(file);
+        v9fid = v9fs_fid_clone(file);
        if(IS_ERR(v9fid))
                return PTR_ERR(v9fid);
@@ -504,7 +504,10 @@ v9fs_vfs_create(struct inode *dir, struct dentry *dentry, int mode,
                goto error;
        }
-        dentry->d_op = &v9fs_dentry_operations;
+        if(v9ses->cache)
+                dentry->d_op = &v9fs_cached_dentry_operations;
+        else
+                dentry->d_op = &v9fs_dentry_operations;
        d_instantiate(dentry, inode);
        if (nd && nd->flags & LOOKUP_OPEN) {
@@ -589,7 +592,10 @@ static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
                goto error;
        }
-        dentry->d_op = &v9fs_dentry_operations;
+        if(v9ses->cache)
+                dentry->d_op = &v9fs_cached_dentry_operations;
+        else
+                dentry->d_op = &v9fs_dentry_operations;
        d_instantiate(dentry, inode);
        return 0;
@@ -626,7 +632,6 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
        sb = dir->i_sb;
        v9ses = v9fs_inode2v9ses(dir);
-        dentry->d_op = &v9fs_dentry_operations;
        dirfid = v9fs_fid_lookup(dentry->d_parent);
        if(IS_ERR(dirfid))
@@ -697,6 +702,10 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
        fid->qid = fcall->params.rstat.stat.qid;
        v9fs_stat2inode(&fcall->params.rstat.stat, inode, inode->i_sb);
+        if((fid->qid.version)&&(v9ses->cache))
+                dentry->d_op = &v9fs_cached_dentry_operations;
+        else
+                dentry->d_op = &v9fs_dentry_operations;
        d_add(dentry, inode);
        kfree(fcall);
@@ -1184,7 +1193,10 @@ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry,
                goto free_vfid;
        }
-        dentry->d_op = &v9fs_dentry_operations;
+        if(v9ses->cache)
+                dentry->d_op = &v9fs_cached_dentry_operations;
+        else
+                dentry->d_op = &v9fs_dentry_operations;
        d_instantiate(dentry, inode);
        return 0;
diff --git a/fs/Kconfig b/fs/Kconfig
index a722b5a3f752..a42f767dcdd5 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -1189,32 +1189,6 @@ config EFS_FS
          To compile the EFS file system support as a module, choose M here: the
          module will be called efs.
-config JFFS_FS
-        tristate "Journalling Flash File System (JFFS) support"
-        depends on MTD && BLOCK && BROKEN
-        help
-          JFFS is the Journalling Flash File System developed by Axis
-          Communications in Sweden, aimed at providing a crash/powerdown-safe
-          file system for disk-less embedded devices. Further information is
-          available at (<http://developer.axis.com/software/jffs/>).
-          NOTE: This filesystem is deprecated and is scheduled for removal in
-          2.6.21.  See Documentation/feature-removal-schedule.txt
-config JFFS_FS_VERBOSE
-        int "JFFS debugging verbosity (0 = quiet, 3 = noisy)"
-        depends on JFFS_FS
-        default "0"
-        help
-          Determines the verbosity level of the JFFS debugging messages.
-config JFFS_PROC_FS
-        bool "JFFS stats available in /proc filesystem"
-        depends on JFFS_FS && PROC_FS
-        help
-          Enabling this option will cause statistics from mounted JFFS file systems
-          to be made available to the user in the /proc/fs/jffs/ directory.
 config JFFS2_FS
        tristate "Journalling Flash File System v2 (JFFS2) support"
        select CRC32
@@ -2045,7 +2019,8 @@ config CODA_FS_OLD_API
 config AFS_FS
        tristate "Andrew File System support (AFS) (EXPERIMENTAL)"
        depends on INET && EXPERIMENTAL
-        select RXRPC
+        select AF_RXRPC
+        select KEYS
        help
          If you say Y here, you will get an experimental Andrew File System
          driver. It currently only supports unsecured read-only AFS access.
@@ -2054,8 +2029,15 @@ config AFS_FS
          If unsure, say N.
-config RXRPC
+config AFS_DEBUG
-        tristate
+        bool "AFS dynamic debugging"
+        depends on AFS_FS
+        help
+          Say Y here to make runtime controllable debugging messages appear.
+          See <file:Documentation/filesystems/afs.txt> for more information.
+          If unsure, say N.
 config 9P_FS
        tristate "Plan 9 Resource Sharing Support (9P2000) (Experimental)"
diff --git a/fs/Makefile b/fs/Makefile
index b9ffa63f77fc..9edf4112bee0 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -94,7 +94,6 @@ obj-$(CONFIG_HPFS_FS)		+= hpfs/
 obj-$(CONFIG_NTFS_FS)           += ntfs/
 obj-$(CONFIG_UFS_FS)            += ufs/
 obj-$(CONFIG_EFS_FS)            += efs/
-obj-$(CONFIG_JFFS_FS)           += jffs/
 obj-$(CONFIG_JFFS2_FS)          += jffs2/
 obj-$(CONFIG_AFFS_FS)           += affs/
 obj-$(CONFIG_ROMFS_FS)          += romfs/
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index 7db2d287e9f3..232c69493683 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -171,6 +171,7 @@ extern unsigned long		 affs_parent_ino(struct inode *dir);
 extern struct inode             *affs_new_inode(struct inode *dir);
 extern int                       affs_notify_change(struct dentry *dentry, struct iattr *attr);
 extern void                      affs_put_inode(struct inode *inode);
+extern void                      affs_drop_inode(struct inode *inode);
 extern void                      affs_delete_inode(struct inode *inode);
 extern void                      affs_clear_inode(struct inode *inode);
 extern void                      affs_read_inode(struct inode *inode);
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index fce6848a4641..c5b9d73c084a 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -243,12 +243,17 @@ affs_put_inode(struct inode *inode)
 {
        pr_debug("AFFS: put_inode(ino=%lu, nlink=%u)\n", inode->i_ino, inode->i_nlink);
        affs_free_prealloc(inode);
-        if (atomic_read(&inode->i_count) == 1) {
+}
-                mutex_lock(&inode->i_mutex);
-                if (inode->i_size != AFFS_I(inode)->mmu_private)
+void
-                        affs_truncate(inode);
+affs_drop_inode(struct inode *inode)
-                mutex_unlock(&inode->i_mutex);
+{
-        }
+        mutex_lock(&inode->i_mutex);
+        if (inode->i_size != AFFS_I(inode)->mmu_private)
+                affs_truncate(inode);
+        mutex_unlock(&inode->i_mutex);
+        generic_drop_inode(inode);
 }
 void
diff --git a/fs/affs/super.c b/fs/affs/super.c
index a324045d8554..c3986a1911b0 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -118,6 +118,7 @@ static const struct super_operations affs_sops = {
        .read_inode     = affs_read_inode,
        .write_inode    = affs_write_inode,
        .put_inode      = affs_put_inode,
+        .drop_inode     = affs_drop_inode,
        .delete_inode   = affs_delete_inode,
        .clear_inode    = affs_clear_inode,
        .put_super      = affs_put_super,
diff --git a/fs/afs/Makefile b/fs/afs/Makefile
index 4029c9da4b86..01545eb1d872 100644
--- a/fs/afs/Makefile
+++ b/fs/afs/Makefile
@@ -2,8 +2,6 @@
 # Makefile for Red Hat Linux AFS client.
 #
-#CFLAGS += -finstrument-functions
 kafs-objs := \
        callback.o \
        cell.o \
@@ -12,14 +10,15 @@ kafs-objs := \
        file.o \
        fsclient.o \
        inode.o \
-        kafsasyncd.o \
-        kafstimod.o \
        main.o \
        misc.o \
        mntpt.o \
        proc.o \
+        rxrpc.o \
+        security.o \
        server.o \
        super.o \
+        use-rtnetlink.o \
        vlclient.o \
        vlocation.o \
        vnode.o \
diff --git a/fs/afs/afs.h b/fs/afs/afs.h
new file mode 100644
index 000000000000..52d0752265b8
--- /dev/null
+++ b/fs/afs/afs.h
@@ -0,0 +1,146 @@
+/* AFS common types
+ *
+ * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#ifndef AFS_H
+#define AFS_H
+#include <linux/in.h>
+#define AFS_MAXCELLNAME 64              /* maximum length of a cell name */
+#define AFS_MAXVOLNAME  64              /* maximum length of a volume name */
+typedef unsigned                        afs_volid_t;
+typedef unsigned                        afs_vnodeid_t;
+typedef unsigned long long              afs_dataversion_t;
+typedef enum {
+        AFSVL_RWVOL,                    /* read/write volume */
+        AFSVL_ROVOL,                    /* read-only volume */
+        AFSVL_BACKVOL,                  /* backup volume */
+} __attribute__((packed)) afs_voltype_t;
+typedef enum {
+        AFS_FTYPE_INVALID       = 0,
+        AFS_FTYPE_FILE          = 1,
+        AFS_FTYPE_DIR           = 2,
+        AFS_FTYPE_SYMLINK       = 3,
+} afs_file_type_t;
+/*
+ * AFS file identifier
+ */
+struct afs_fid {
+        afs_volid_t     vid;            /* volume ID */
+        afs_vnodeid_t   vnode;          /* file index within volume */
+        unsigned        unique;         /* unique ID number (file index version) */
+};
+/*
+ * AFS callback notification
+ */
+typedef enum {
+        AFSCM_CB_UNTYPED        = 0,    /* no type set on CB break */
+        AFSCM_CB_EXCLUSIVE      = 1,    /* CB exclusive to CM [not implemented] */
+        AFSCM_CB_SHARED         = 2,    /* CB shared by other CM's */
+        AFSCM_CB_DROPPED        = 3,    /* CB promise cancelled by file server */
+} afs_callback_type_t;
+struct afs_callback {
+        struct afs_fid          fid;            /* file identifier */
+        unsigned                version;        /* callback version */
+        unsigned                expiry;         /* time at which expires */
+        afs_callback_type_t     type;           /* type of callback */
+};
+#define AFSCBMAX 50     /* maximum callbacks transferred per bulk op */
+/*
+ * AFS volume information
+ */
+struct afs_volume_info {
+        afs_volid_t             vid;            /* volume ID */
+        afs_voltype_t           type;           /* type of this volume */
+        afs_volid_t             type_vids[5];   /* volume ID's for possible types for this vol */
+        /* list of fileservers serving this volume */
+        size_t                  nservers;       /* number of entries used in servers[] */
+        struct {
+                struct in_addr  addr;           /* fileserver address */
+        } servers[8];
+};
+/*
+ * AFS security ACE access mask
+ */
+typedef u32 afs_access_t;
+#define AFS_ACE_READ            0x00000001U     /* - permission to read a file/dir */
+#define AFS_ACE_WRITE           0x00000002U     /* - permission to write/chmod a file */
+#define AFS_ACE_INSERT          0x00000004U     /* - permission to create dirent in a dir */
+#define AFS_ACE_LOOKUP          0x00000008U     /* - permission to lookup a file/dir in a dir */
+#define AFS_ACE_DELETE          0x00000010U     /* - permission to delete a dirent from a dir */
+#define AFS_ACE_LOCK            0x00000020U     /* - permission to lock a file */
+#define AFS_ACE_ADMINISTER      0x00000040U     /* - permission to change ACL */
+#define AFS_ACE_USER_A          0x01000000U     /* - 'A' user-defined permission */
+#define AFS_ACE_USER_B          0x02000000U     /* - 'B' user-defined permission */
+#define AFS_ACE_USER_C          0x04000000U     /* - 'C' user-defined permission */
+#define AFS_ACE_USER_D          0x08000000U     /* - 'D' user-defined permission */
+#define AFS_ACE_USER_E          0x10000000U     /* - 'E' user-defined permission */
+#define AFS_ACE_USER_F          0x20000000U     /* - 'F' user-defined permission */
+#define AFS_ACE_USER_G          0x40000000U     /* - 'G' user-defined permission */
+#define AFS_ACE_USER_H          0x80000000U     /* - 'H' user-defined permission */
+/*
+ * AFS file status information
+ */
+struct afs_file_status {
+        unsigned                if_version;     /* interface version */
+#define AFS_FSTATUS_VERSION     1
+        afs_file_type_t         type;           /* file type */
+        unsigned                nlink;          /* link count */
+        u64                     size;           /* file size */
+        afs_dataversion_t       data_version;   /* current data version */
+        u32                     author;         /* author ID */
+        u32                     owner;          /* owner ID */
+        u32                     group;          /* group ID */
+        afs_access_t            caller_access;  /* access rights for authenticated caller */
+        afs_access_t            anon_access;    /* access rights for unauthenticated caller */
+        umode_t                 mode;           /* UNIX mode */
+        struct afs_fid          parent;         /* parent dir ID for non-dirs only */
+        time_t                  mtime_client;   /* last time client changed data */
+        time_t                  mtime_server;   /* last time server changed data */
+};
+/*
+ * AFS file status change request
+ */
+struct afs_store_status {
+        u32                     mask;           /* which bits of the struct are set */
+        u32                     mtime_client;   /* last time client changed data */
+        u32                     owner;          /* owner ID */
+        u32                     group;          /* group ID */
+        umode_t                 mode;           /* UNIX mode */
+};
+#define AFS_SET_MTIME           0x01            /* set the mtime */
+#define AFS_SET_OWNER           0x02            /* set the owner ID */
+#define AFS_SET_GROUP           0x04            /* set the group ID (unsupported?) */
+#define AFS_SET_MODE            0x08            /* set the UNIX mode */
+#define AFS_SET_SEG_SIZE        0x10            /* set the segment size (unsupported) */
+/*
+ * AFS volume synchronisation information
+ */
+struct afs_volsync {
+        time_t                  creation;       /* volume creation time */
+};
+#endif /* AFS_H */
diff --git a/fs/afs/afs_cm.h b/fs/afs/afs_cm.h
new file mode 100644
index 000000000000..7b4d4fab4c80
--- /dev/null
+++ b/fs/afs/afs_cm.h
@@ -0,0 +1,32 @@
+/* AFS Cache Manager definitions
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#ifndef AFS_CM_H
+#define AFS_CM_H
+#define AFS_CM_PORT             7001    /* AFS cache manager (callback) port */
+#define CM_SERVICE              1       /* AFS Cache Manager service ID */
+enum AFS_CM_Operations {
+        CBCallBack              = 204,  /* break callback promises */
+        CBInitCallBackState     = 205,  /* initialise callback state */
+        CBProbe                 = 206,  /* probe client */
+        CBGetLock               = 207,  /* get contents of CM lock table */
+        CBGetCE                 = 208,  /* get cache file description */
+        CBGetXStatsVersion      = 209,  /* get version of extended statistics */
+        CBGetXStats             = 210,  /* get contents of extended statistics data */
+        CBInitCallBackState3    = 213,  /* initialise callback state, version 3 */
+        CBGetCapabilities       = 65538, /* get client capabilities */
+};
+#define AFS_CAP_ERROR_TRANSLATION       0x1
+#endif /* AFS_CM_H */
diff --git a/fs/afs/afs_fs.h b/fs/afs/afs_fs.h
new file mode 100644
index 000000000000..89e0d1650a72
--- /dev/null
+++ b/fs/afs/afs_fs.h
@@ -0,0 +1,48 @@
+/* AFS File Service definitions
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#ifndef AFS_FS_H
+#define AFS_FS_H
+#define AFS_FS_PORT             7000    /* AFS file server port */
+#define FS_SERVICE              1       /* AFS File Service ID */
+enum AFS_FS_Operations {
+        FSFETCHDATA             = 130,  /* AFS Fetch file data */
+        FSFETCHSTATUS           = 132,  /* AFS Fetch file status */
+        FSREMOVEFILE            = 136,  /* AFS Remove a file */
+        FSCREATEFILE            = 137,  /* AFS Create a file */
+        FSRENAME                = 138,  /* AFS Rename or move a file or directory */
+        FSSYMLINK               = 139,  /* AFS Create a symbolic link */
+        FSLINK                  = 140,  /* AFS Create a hard link */
+        FSMAKEDIR               = 141,  /* AFS Create a directory */
+        FSREMOVEDIR             = 142,  /* AFS Remove a directory */
+        FSGIVEUPCALLBACKS       = 147,  /* AFS Discard callback promises */
+        FSGETVOLUMEINFO         = 148,  /* AFS Get root volume information */
+        FSGETROOTVOLUME         = 151,  /* AFS Get root volume name */
+        FSLOOKUP                = 161,  /* AFS lookup file in directory */
+};
+enum AFS_FS_Errors {
+        VSALVAGE        = 101,  /* volume needs salvaging */
+        VNOVNODE        = 102,  /* no such file/dir (vnode) */
+        VNOVOL          = 103,  /* no such volume or volume unavailable */
+        VVOLEXISTS      = 104,  /* volume name already exists */
+        VNOSERVICE      = 105,  /* volume not currently in service */
+        VOFFLINE        = 106,  /* volume is currently offline (more info available [VVL-spec]) */
+        VONLINE         = 107,  /* volume is already online */
+        VDISKFULL       = 108,  /* disk partition is full */
+        VOVERQUOTA      = 109,  /* volume's maximum quota exceeded */
+        VBUSY           = 110,  /* volume is temporarily unavailable */
+        VMOVED          = 111,  /* volume moved to new server - ask this FS where */
+};
+#endif /* AFS_FS_H */
diff --git a/fs/afs/vlclient.h b/fs/afs/afs_vl.h
index e3d601179c46..8bbefe009ed4 100644
--- a/fs/afs/vlclient.h
+++ b/fs/afs/afs_vl.h
@@ -1,6 +1,6 @@
-/* vlclient.h: Volume Location Service client interface
+/* AFS Volume Location Service client interface
 *
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 *
 * This program is free software; you can redistribute it and/or
@@ -9,10 +9,19 @@
 * 2 of the License, or (at your option) any later version.
 */
-#ifndef _LINUX_AFS_VLCLIENT_H
+#ifndef AFS_VL_H
-#define _LINUX_AFS_VLCLIENT_H
+#define AFS_VL_H
-#include "types.h"
+#include "afs.h"
+#define AFS_VL_PORT             7003    /* volume location service port */
+#define VL_SERVICE              52      /* RxRPC service ID for the Volume Location service */
+enum AFSVL_Operations {
+        VLGETENTRYBYID          = 503,  /* AFS Get Cache Entry By ID operation ID */
+        VLGETENTRYBYNAME        = 504,  /* AFS Get Cache Entry By Name operation ID */
+        VLPROBE                 = 514,  /* AFS Probe Volume Location Service operation ID */
+};
 enum AFSVL_Errors {
        AFSVL_IDEXIST           = 363520,       /* Volume Id entry exists in vl database */
@@ -40,14 +49,16 @@ enum AFSVL_Errors {
        AFSVL_BADVOLOPER        = 363542,       /* Bad volume operation code */
        AFSVL_BADRELLOCKTYPE    = 363543,       /* Bad release lock type */
        AFSVL_RERELEASE         = 363544,       /* Status report: last release was aborted */
-        AFSVL_BADSERVERFLAG     = 363545,       /* Invalid replication site server �ag */
+        AFSVL_BADSERVERFLAG     = 363545,       /* Invalid replication site server flag */
        AFSVL_PERM              = 363546,       /* No permission access */
        AFSVL_NOMEM             = 363547,       /* malloc/realloc failed to alloc enough memory */
 };
-/* maps to "struct vldbentry" in vvl-spec.pdf */
+/*
+ * maps to "struct vldbentry" in vvl-spec.pdf
+ */
 struct afs_vldbentry {
-        char            name[65];               /* name of volume (including NUL char) */
+        char            name[65];               /* name of volume (with NUL char) */
        afs_voltype_t   type;                   /* volume type */
        unsigned        num_servers;            /* num servers that hold instances of this vol */
        unsigned        clone_id;               /* cloning ID */
@@ -68,26 +79,6 @@ struct afs_vldbentry {
 #define AFS_VLSF_RWVOL          0x0004  /* this server holds a R/W instance of the volume */
 #define AFS_VLSF_BACKVOL        0x0008  /* this server holds a backup instance of the volume */
        } servers[8];
 };
-/* look up a volume location database entry by name */
+#endif /* AFS_VL_H */
-extern int afs_rxvl_get_entry_by_name(struct afs_server *server,
-                                      const char *volname,
-                                      unsigned volnamesz,
-                                      struct afs_cache_vlocation *entry);
-/* look up a volume location database entry by ID */
-extern int afs_rxvl_get_entry_by_id(struct afs_server *server,
-                                    afs_volid_t volid,
-                                    afs_voltype_t voltype,
-                                    struct afs_cache_vlocation *entry);
-extern int afs_rxvl_get_entry_by_id_async(struct afs_async_op *op,
-                                          afs_volid_t volid,
-                                          afs_voltype_t voltype);
-extern int afs_rxvl_get_entry_by_id_async2(struct afs_async_op *op,
-                                           struct afs_cache_vlocation *entry);
-#endif /* _LINUX_AFS_VLCLIENT_H */
diff --git a/fs/afs/cache.c b/fs/afs/cache.c
new file mode 100644
index 000000000000..de0d7de69edc
--- /dev/null
+++ b/fs/afs/cache.c
@@ -0,0 +1,256 @@
+/* AFS caching stuff
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#ifdef AFS_CACHING_SUPPORT
+/* forward declarations so the index definition can name its callbacks */
+static cachefs_match_val_t afs_cell_cache_match(void *target,
+                                                const void *entry);
+static void afs_cell_cache_update(void *source, void *entry);
+/*
+ * cache index definition for cell records: each entry is sized as a
+ * struct afs_cache_cell and is compared/refreshed by the two callbacks
+ * declared above
+ */
+struct cachefs_index_def afs_cache_cell_index_def = {
+        .name                   = "cell_ix",
+        .data_size              = sizeof(struct afs_cache_cell),
+        /* NOTE(review): presumably a 64-byte NUL-terminated name key - confirm */
+        .keys[0]                = { CACHEFS_INDEX_KEYS_ASCIIZ, 64 },
+        .match                  = afs_cell_cache_match,
+        .update                 = afs_cell_cache_update,
+};
+#endif
+/*
+ * match a cell record obtained from the cache
+ */
+#ifdef AFS_CACHING_SUPPORT
+static cachefs_match_val_t afs_cell_cache_match(void *target,
+                                                const void *entry)
+{
+        struct afs_cell *wanted = target;
+        const struct afs_cache_cell *cached = entry;
+        _enter("{%s},{%s}", cached->name, wanted->name);
+        /* names are compared over the full width of the cached name field */
+        if (strncmp(cached->name, wanted->name, sizeof(cached->name)) != 0) {
+                _leave(" = FAILED");
+                return CACHEFS_MATCH_FAILED;
+        }
+        _leave(" = SUCCESS");
+        return CACHEFS_MATCH_SUCCESS;
+}
+#endif
+/*
+ * update a cell record in the cache
+ */
+#ifdef AFS_CACHING_SUPPORT
+static void afs_cell_cache_update(void *source, void *entry)
+{
+        struct afs_cell *live = source;
+        struct afs_cache_cell *rec = entry;
+        /* never copy more than the smaller of the two address arrays */
+        size_t addr_bytes = min(sizeof(rec->vl_servers),
+                                sizeof(live->vl_addrs));
+        _enter("%p,%p", source, entry);
+        /* fixed-width name field; afs_cell_cache_match() compares it with
+         * strncmp() over the same width */
+        strncpy(rec->name, live->name, sizeof(rec->name));
+        memcpy(rec->vl_servers, live->vl_addrs, addr_bytes);
+}
+#endif
+#ifdef AFS_CACHING_SUPPORT
+/* forward declarations so the index definition can name its callbacks */
+static cachefs_match_val_t afs_vlocation_cache_match(void *target,
+                                                     const void *entry);
+static void afs_vlocation_cache_update(void *source, void *entry);
+/*
+ * cache index definition for VLDB records: each entry is sized as a
+ * struct afs_cache_vlocation
+ */
+struct cachefs_index_def afs_vlocation_cache_index_def = {
+        .name           = "vldb",
+        .data_size      = sizeof(struct afs_cache_vlocation),
+        /* NOTE(review): presumably a 64-byte NUL-terminated name key - confirm */
+        .keys[0]        = { CACHEFS_INDEX_KEYS_ASCIIZ, 64 },
+        .match          = afs_vlocation_cache_match,
+        .update         = afs_vlocation_cache_update,
+};
+#endif
+/*
+ * match a VLDB record stored in the cache
+ * - may also load target from entry
+ */
+#ifdef AFS_CACHING_SUPPORT
+static cachefs_match_val_t afs_vlocation_cache_match(void *target,
+                                                     const void *entry)
+{
+        struct afs_vlocation *vloc = target;
+        const struct afs_cache_vlocation *rec = entry;
+        _enter("{%s},{%s}", vloc->vldb.name, rec->name);
+        /* a record for a different volume name is not ours */
+        if (strncmp(vloc->vldb.name, rec->name, sizeof(rec->name)) != 0) {
+                _leave(" = FAILED");
+                return CACHEFS_MATCH_FAILED;
+        }
+        /* nothing valid in memory yet, or the rtimes agree: load the target
+         * from the cached record */
+        if (!vloc->valid || vloc->vldb.rtime == rec->rtime) {
+                vloc->vldb = *rec;
+                vloc->valid = 1;
+                _leave(" = SUCCESS [c->m]");
+                return CACHEFS_MATCH_SUCCESS;
+        }
+        /* in-memory and cached copies are byte-for-byte identical */
+        if (memcmp(&vloc->vldb, rec, sizeof(*rec)) == 0) {
+                _leave(" = SUCCESS");
+                return CACHEFS_MATCH_SUCCESS;
+        }
+        /* delete if VIDs for this name differ */
+        if (memcmp(&vloc->vldb.vid, &rec->vid, sizeof(rec->vid)) != 0) {
+                _leave(" = DELETE");
+                return CACHEFS_MATCH_SUCCESS_DELETE;
+        }
+        /* same volume IDs but other fields changed: refresh the record */
+        _leave(" = UPDATE");
+        return CACHEFS_MATCH_SUCCESS_UPDATE;
+}
+#endif
+/*
+ * update a VLDB record stored in the cache
+ */
+#ifdef AFS_CACHING_SUPPORT
+static void afs_vlocation_cache_update(void *source, void *entry)
+{
+        struct afs_vlocation *vloc = source;
+        struct afs_cache_vlocation *rec = entry;
+        _enter("");
+        /* structure copy of the in-memory VLDB record into the cache slot */
+        *rec = vloc->vldb;
+}
+#endif
+#ifdef AFS_CACHING_SUPPORT
+/* forward declarations so the index definition can name its callbacks */
+static cachefs_match_val_t afs_volume_cache_match(void *target,
+                                                  const void *entry);
+static void afs_volume_cache_update(void *source, void *entry);
+/*
+ * cache index definition for volume hash records: each entry is sized as a
+ * struct afs_cache_vhash
+ */
+struct cachefs_index_def afs_volume_cache_index_def = {
+        .name           = "volume",
+        .data_size      = sizeof(struct afs_cache_vhash),
+        /* NOTE(review): presumably two 1-byte binary keys - confirm */
+        .keys[0]        = { CACHEFS_INDEX_KEYS_BIN, 1 },
+        .keys[1]        = { CACHEFS_INDEX_KEYS_BIN, 1 },
+        .match          = afs_volume_cache_match,
+        .update         = afs_volume_cache_update,
+};
+#endif
+/*
+ * match a volume hash record stored in the cache
+ */
+#ifdef AFS_CACHING_SUPPORT
+static cachefs_match_val_t afs_volume_cache_match(void *target,
+                                                  const void *entry)
+{
+        struct afs_volume *vol = target;
+        const struct afs_cache_vhash *rec = entry;
+        _enter("{%u},{%u}", vol->type, rec->vtype);
+        /* the only thing compared is the volume type */
+        if (vol->type != rec->vtype) {
+                _leave(" = FAILED");
+                return CACHEFS_MATCH_FAILED;
+        }
+        _leave(" = SUCCESS");
+        return CACHEFS_MATCH_SUCCESS;
+}
+#endif
+/*
+ * update a volume hash record stored in the cache
+ */
+#ifdef AFS_CACHING_SUPPORT
+static void afs_volume_cache_update(void *source, void *entry)
+{
+        struct afs_volume *vol = source;
+        struct afs_cache_vhash *rec = entry;
+        _enter("");
+        /* the cached record carries just the volume type */
+        rec->vtype = vol->type;
+}
+#endif
+#ifdef AFS_CACHING_SUPPORT
+/* forward declarations so the index definition can name its callbacks */
+static cachefs_match_val_t afs_vnode_cache_match(void *target,
+                                                 const void *entry);
+static void afs_vnode_cache_update(void *source, void *entry);
+/*
+ * cache index definition for vnode records: each entry is sized as a
+ * struct afs_cache_vnode
+ */
+struct cachefs_index_def afs_vnode_cache_index_def = {
+        .name           = "vnode",
+        .data_size      = sizeof(struct afs_cache_vnode),
+        /* NOTE(review): presumably a 4-byte binary key - confirm */
+        .keys[0]        = { CACHEFS_INDEX_KEYS_BIN, 4 },
+        .match          = afs_vnode_cache_match,
+        .update         = afs_vnode_cache_update,
+};
+#endif
+/*
+ * match a vnode record stored in the cache
+ */
+#ifdef AFS_CACHING_SUPPORT
+static cachefs_match_val_t afs_vnode_cache_match(void *target,
+                                                 const void *entry)
+{
+        struct afs_vnode *vn = target;
+        const struct afs_cache_vnode *rec = entry;
+        _enter("{%x,%x,%Lx},{%x,%x,%Lx}",
+               vn->fid.vnode, vn->fid.unique, vn->status.version,
+               rec->vnode_id, rec->vnode_unique, rec->data_version);
+        /* a different vnode number means the record is for something else */
+        if (vn->fid.vnode != rec->vnode_id) {
+                _leave(" = FAILED");
+                return CACHEFS_MATCH_FAILED;
+        }
+        /* same vnode number: the record is only usable when both the unique
+         * value and the data version agree, otherwise it gets deleted */
+        if (vn->fid.unique == rec->vnode_unique &&
+            vn->status.version == rec->data_version) {
+                _leave(" = SUCCESS");
+                return CACHEFS_MATCH_SUCCESS;
+        }
+        _leave(" = DELETE");
+        return CACHEFS_MATCH_SUCCESS_DELETE;
+}
+#endif
+/*
+ * update a vnode record stored in the cache
+ */
+#ifdef AFS_CACHING_SUPPORT
+static void afs_vnode_cache_update(void *source, void *entry)
+{
+        struct afs_vnode *vn = source;
+        struct afs_cache_vnode *rec = entry;
+        _enter("");
+        /* record the identity (vnode number + unique) and the data version */
+        rec->vnode_id = vn->fid.vnode;
+        rec->vnode_unique = vn->fid.unique;
+        rec->data_version = vn->status.version;
+}
+#endif
diff --git a/fs/afs/cache.h b/fs/afs/cache.h
index 9eb7722b34d5..36a3642cf90e 100644
--- a/fs/afs/cache.h
+++ b/fs/afs/cache.h
@@ -1,4 +1,4 @@
-/* cache.h: AFS local cache management interface
+/* AFS local cache management interface
 *
 * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
@@ -9,8 +9,8 @@
 * 2 of the License, or (at your option) any later version.
 */
-#ifndef _LINUX_AFS_CACHE_H
+#ifndef AFS_CACHE_H
-#define _LINUX_AFS_CACHE_H
+#define AFS_CACHE_H
 #undef AFS_CACHING_SUPPORT
@@ -20,8 +20,4 @@
 #endif
 #include "types.h"
-#ifdef __KERNEL__
+#endif /* AFS_CACHE_H */
-#endif /* __KERNEL__ */
-#endif /* _LINUX_AFS_CACHE_H */
diff --git a/fs/afs/callback.c b/fs/afs/callback.c
index 9cb206e9d4be..639399f0ab6f 100644
--- a/fs/afs/callback.c
+++ b/fs/afs/callback.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2002 Red Hat, Inc. All rights reserved.
+ * Copyright (c) 2002, 2007 Red Hat, Inc. All rights reserved.
 *
 * This software may be freely redistributed under the terms of the
 * GNU General Public License.
@@ -16,85 +16,187 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/init.h>
-#include "server.h"
+#include <linux/circ_buf.h>
-#include "vnode.h"
 #include "internal.h"
-#include "cmservice.h"
-/*****************************************************************************/
+unsigned afs_vnode_update_timeout = 10;
+/*
+ * CIRC_SPACE() of a server's callback-break ring: computed from the
+ * cb_break_head/cb_break_tail indices over the cb_break[] array
+ */
+#define afs_breakring_space(server) \
+        CIRC_SPACE((server)->cb_break_head, (server)->cb_break_tail,    \
+                   ARRAY_SIZE((server)->cb_break))
+//static void afs_callback_updater(struct work_struct *);
+static struct workqueue_struct *afs_callback_update_worker;
 /*
 * allow the fileserver to request callback state (re-)initialisation
 */
-int SRXAFSCM_InitCallBackState(struct afs_server *server)
+void afs_init_callback_state(struct afs_server *server)
 {
-        struct list_head callbacks;
+        struct afs_vnode *vnode;
-        _enter("%p", server);
+        _enter("{%p}", server);
-        INIT_LIST_HEAD(&callbacks);
-        /* transfer the callback list from the server to a temp holding area */
        spin_lock(&server->cb_lock);
-        list_add(&callbacks, &server->cb_promises);
+        /* kill all the promises on record from this server */
-        list_del_init(&server->cb_promises);
+        while (!RB_EMPTY_ROOT(&server->cb_promises)) {
+                vnode = rb_entry(server->cb_promises.rb_node,
+                                 struct afs_vnode, cb_promise);
+                _debug("UNPROMISE { vid=%x vn=%u uq=%u}",
+                       vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique);
+                rb_erase(&vnode->cb_promise, &server->cb_promises);
+                vnode->cb_promised = false;
+        }
-        /* munch our way through the list, grabbing the inode, dropping all the
+        spin_unlock(&server->cb_lock);
-         * locks and regetting them in the right order
+        _leave("");
-         */
+}
-        while (!list_empty(&callbacks)) {
-                struct afs_vnode *vnode;
-                struct inode *inode;
-                vnode = list_entry(callbacks.next, struct afs_vnode, cb_link);
+/*
-                list_del_init(&vnode->cb_link);
+ * handle the data invalidation side of a callback being broken
+ */
+void afs_broken_callback_work(struct work_struct *work)
+{
+        struct afs_vnode *vnode =
+                container_of(work, struct afs_vnode, cb_broken_work);
-                /* try and grab the inode - may fail */
+        _enter("");
-                inode = igrab(AFS_VNODE_TO_I(vnode));
-                if (inode) {
-                        int release = 0;
-                        spin_unlock(&server->cb_lock);
+        if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
-                        spin_lock(&vnode->lock);
+                return;
-                        if (vnode->cb_server == server) {
+        /* we're only interested in dealing with a broken callback on *this*
-                                vnode->cb_server = NULL;
+         * vnode and only if no-one else has dealt with it yet */
-                                afs_kafstimod_del_timer(&vnode->cb_timeout);
+        if (!mutex_trylock(&vnode->validate_lock))
-                                spin_lock(&afs_cb_hash_lock);
+                return; /* someone else is dealing with it */
-                                list_del_init(&vnode->cb_hash_link);
-                                spin_unlock(&afs_cb_hash_lock);
-                                release = 1;
-                        }
-                        spin_unlock(&vnode->lock);
+        if (test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags)) {
+                if (S_ISDIR(vnode->vfs_inode.i_mode))
+                        afs_clear_permits(vnode);
-                        iput(inode);
+                if (afs_vnode_fetch_status(vnode, NULL, NULL) < 0)
-                        afs_put_server(server);
+                        goto out;
-                        spin_lock(&server->cb_lock);
+                if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
+                        goto out;
+                /* if the vnode's data version number changed then its contents
+                 * are different */
+                if (test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags)) {
+                        _debug("zap data {%x:%u}",
+                               vnode->fid.vid, vnode->fid.vnode);
+                        invalidate_remote_inode(&vnode->vfs_inode);
                }
        }
-        spin_unlock(&server->cb_lock);
+out:
+        mutex_unlock(&vnode->validate_lock);
-        _leave(" = 0");
+        /* avoid the potential race whereby the mutex_trylock() in this
-        return 0;
+         * function happens again between the clear_bit() and the
-} /* end SRXAFSCM_InitCallBackState() */
+         * mutex_unlock() */
+        if (test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags)) {
+                _debug("requeue");
+                queue_work(afs_callback_update_worker, &vnode->cb_broken_work);
+        }
+        _leave("");
+}
+/*
+ * actually break a callback
+ */
+static void afs_break_callback(struct afs_server *server,
+                               struct afs_vnode *vnode)
+{
+        _enter("");
+        /* flag the vnode as broken whether or not a promise is on record */
+        set_bit(AFS_VNODE_CB_BROKEN, &vnode->flags);
+        /* unlocked peek; rechecked below under server->cb_lock */
+        if (vnode->cb_promised) {
+                spin_lock(&vnode->lock);
+                _debug("break callback");
+                /* lock order here: vnode->lock, then server->cb_lock */
+                spin_lock(&server->cb_lock);
+                /* recheck now that the promise tree can't change under us */
+                if (vnode->cb_promised) {
+                        rb_erase(&vnode->cb_promise, &server->cb_promises);
+                        vnode->cb_promised = false;
+                }
+                spin_unlock(&server->cb_lock);
+                /* queue the vnode's cb_broken_work on the callback workqueue */
+                queue_work(afs_callback_update_worker, &vnode->cb_broken_work);
+                spin_unlock(&vnode->lock);
+        }
+}
+/*
+ * allow the fileserver to explicitly break one callback
+ * - happens when
+ *   - the backing file is changed
+ *   - a lock is released
+ */
+static void afs_break_one_callback(struct afs_server *server,
+                                   struct afs_fid *fid)
+{
+        struct afs_vnode *vnode;
+        struct rb_node *p;
+        _debug("find");
+        /* the lookup runs entirely under server->fs_lock */
+        spin_lock(&server->fs_lock);
+        /* descend the server's vnode tree, comparing vid first, then vnode
+         * number, then unique */
+        p = server->fs_vnodes.rb_node;
+        while (p) {
+                vnode = rb_entry(p, struct afs_vnode, server_rb);
+                if (fid->vid < vnode->fid.vid)
+                        p = p->rb_left;
+                else if (fid->vid > vnode->fid.vid)
+                        p = p->rb_right;
+                else if (fid->vnode < vnode->fid.vnode)
+                        p = p->rb_left;
+                else if (fid->vnode > vnode->fid.vnode)
+                        p = p->rb_right;
+                else if (fid->unique < vnode->fid.unique)
+                        p = p->rb_left;
+                else if (fid->unique > vnode->fid.unique)
+                        p = p->rb_right;
+                else
+                        goto found;
+        }
+        /* not found so we just ignore it (it may have moved to another
+         * server) */
+not_available:
+        /* also reached from "found" when igrab() fails; fs_lock is still
+         * held on both routes and is dropped here */
+        _debug("not avail");
+        spin_unlock(&server->fs_lock);
+        _leave("");
+        return;
+found:
+        _debug("found");
+        ASSERTCMP(server, ==, vnode->server);
+        /* take an inode reference before dropping fs_lock; a NULL return
+         * from igrab() is treated the same as "not found" */
+        if (!igrab(AFS_VNODE_TO_I(vnode)))
+                goto not_available;
+        spin_unlock(&server->fs_lock);
+        afs_break_callback(server, vnode);
+        /* drop the reference taken by igrab() above */
+        iput(&vnode->vfs_inode);
+        _leave("");
+}
-/*****************************************************************************/
 /*
 * allow the fileserver to break callback promises
 */
-int SRXAFSCM_CallBack(struct afs_server *server, size_t count,
+void afs_break_callbacks(struct afs_server *server, size_t count,
-                      struct afs_callback callbacks[])
+                         struct afs_callback callbacks[])
 {
-        _enter("%p,%u,", server, count);
+        _enter("%p,%zu,", server, count);
-        for (; count > 0; callbacks++, count--) {
+        ASSERT(server != NULL);
-                struct afs_vnode *vnode = NULL;
+        ASSERTCMP(count, <=, AFSCBMAX);
-                struct inode *inode = NULL;
-                int valid = 0;
+        for (; count > 0; callbacks++, count--) {
                _debug("- Fid { vl=%08x n=%u u=%u }  CB { v=%u x=%u t=%u }",
                       callbacks->fid.vid,
                       callbacks->fid.vnode,
@@ -103,67 +205,270 @@ int SRXAFSCM_CallBack(struct afs_server *server, size_t count,
                       callbacks->expiry,
                       callbacks->type
                       );
+                afs_break_one_callback(server, &callbacks->fid);
+        }
-                /* find the inode for this fid */
+        _leave("");
-                spin_lock(&afs_cb_hash_lock);
+        return;
+}
-                list_for_each_entry(vnode,
+/*
-                                    &afs_cb_hash(server, &callbacks->fid),
+ * record the callback for breaking
-                                    cb_hash_link) {
+ * - the caller must hold server->cb_lock
-                        if (memcmp(&vnode->fid, &callbacks->fid,
+ */
-                                   sizeof(struct afs_fid)) != 0)
+static void afs_do_give_up_callback(struct afs_server *server,
-                                continue;
+                                    struct afs_vnode *vnode)
+{
+        struct afs_callback *cb;
-                        /* right vnode, but is it same server? */
+        _enter("%p,%p", server, vnode);
-                        if (vnode->cb_server != server)
-                                break; /* no */
-                        /* try and nail the inode down */
+        cb = &server->cb_break[server->cb_break_head];
-                        inode = igrab(AFS_VNODE_TO_I(vnode));
+        cb->fid         = vnode->fid;
-                        break;
+        cb->version     = vnode->cb_version;
+        cb->expiry      = vnode->cb_expiry;
+        cb->type        = vnode->cb_type;
+        smp_wmb();
+        server->cb_break_head =
+                (server->cb_break_head + 1) &
+                (ARRAY_SIZE(server->cb_break) - 1);
+        /* defer the breaking of callbacks to try and collect as many as
+         * possible to ship in one operation */
+        switch (atomic_inc_return(&server->cb_break_n)) {
+        case 1 ... AFSCBMAX - 1:
+                queue_delayed_work(afs_callback_update_worker,
+                                   &server->cb_break_work, HZ * 2);
+                break;
+        case AFSCBMAX:
+                afs_flush_callback_breaks(server);
+                break;
+        default:
+                break;
+        }
+        ASSERT(server->cb_promises.rb_node != NULL);
+        rb_erase(&vnode->cb_promise, &server->cb_promises);
+        vnode->cb_promised = false;
+        _leave("");
+}
+/*
+ * discard the callback on a deleted item
+ */
+void afs_discard_callback_on_delete(struct afs_vnode *vnode)
+{
+        struct afs_server *server = vnode->server;
+        _enter("%d", vnode->cb_promised);
+        /* unlocked fast path: no promise on record, nothing to discard */
+        if (!vnode->cb_promised) {
+                _leave(" [not promised]");
+                return;
+        }
+        ASSERT(server != NULL);
+        spin_lock(&server->cb_lock);
+        /* recheck under cb_lock before unlinking from the promise tree; no
+         * give-up record is placed in the break ring on this path */
+        if (vnode->cb_promised) {
+                ASSERT(server->cb_promises.rb_node != NULL);
+                rb_erase(&vnode->cb_promise, &server->cb_promises);
+                vnode->cb_promised = false;
+        }
+        spin_unlock(&server->cb_lock);
+        _leave("");
+}
+/*
+ * give up the callback registered for a vnode on the file server when the
+ * inode is being cleared
+ */
+void afs_give_up_callback(struct afs_vnode *vnode)
+{
+        struct afs_server *server = vnode->server;
+        DECLARE_WAITQUEUE(myself, current);
+        _enter("%d", vnode->cb_promised);
+        _debug("GIVE UP INODE %p", &vnode->vfs_inode);
+        /* unlocked fast path: no promise on record, nothing to give up */
+        if (!vnode->cb_promised) {
+                _leave(" [not promised]");
+                return;
+        }
+        ASSERT(server != NULL);
+        spin_lock(&server->cb_lock);
+        /* the break ring is full: sleep on cb_break_waitq until there is
+         * space or the promise has gone away, dropping cb_lock while
+         * scheduled out */
+        if (vnode->cb_promised && afs_breakring_space(server) == 0) {
+                add_wait_queue(&server->cb_break_waitq, &myself);
+                for (;;) {
+                        /* task state is set before the condition is tested
+                         * (the usual wait-loop ordering) */
+                        set_current_state(TASK_UNINTERRUPTIBLE);
+                        if (!vnode->cb_promised ||
+                            afs_breakring_space(server) != 0)
+                                break;
+                        spin_unlock(&server->cb_lock);
+                        schedule();
+                        spin_lock(&server->cb_lock);
                }
+                remove_wait_queue(&server->cb_break_waitq, &myself);
+                __set_current_state(TASK_RUNNING);
+        }
+        /* of course, it's always possible for the server to break this vnode's
+         * callback first... */
+        if (vnode->cb_promised)
+                afs_do_give_up_callback(server, vnode);
+        spin_unlock(&server->cb_lock);
+        _leave("");
+}
+/*
+ * dispatch a deferred give up callbacks operation
+ */
+void afs_dispatch_give_up_callbacks(struct work_struct *work)
+{
+        /* work is the work_struct embedded in the server's cb_break_work */
+        struct afs_server *server =
+                container_of(work, struct afs_server, cb_break_work.work);
+        _enter("");
+        /* tell the fileserver to discard the callback promises it has
+         * - in the event of ENOMEM or some other error, we just forget that we
+         *   had callbacks entirely, and the server will call us later to break
+         *   them
+         */
+        /* the return value is deliberately not checked, per the note above */
+        afs_fs_give_up_callbacks(server, &afs_async_call);
+}
+/*
+ * flush the outstanding callback breaks on a server
+ */
+void afs_flush_callback_breaks(struct afs_server *server)
+{
+        /* drop any pending delayed run, then requeue with a zero delay */
+        cancel_delayed_work(&server->cb_break_work);
+        queue_delayed_work(afs_callback_update_worker,
+                           &server->cb_break_work, 0);
+}
-                spin_unlock(&afs_cb_hash_lock);
+#if 0
+/*
-                if (inode) {
+ * update a bunch of callbacks
-                        /* we've found the record for this vnode */
+ */
-                        spin_lock(&vnode->lock);
+static void afs_callback_updater(struct work_struct *work)
-                        if (vnode->cb_server == server) {
+{
-                                /* the callback _is_ on the calling server */
+        struct afs_server *server;
-                                vnode->cb_server = NULL;
+        struct afs_vnode *vnode, *xvnode;
-                                valid = 1;
+        time_t now;
+        long timeout;
-                                afs_kafstimod_del_timer(&vnode->cb_timeout);
+        int ret;
-                                vnode->flags |= AFS_VNODE_CHANGED;
+        server = container_of(work, struct afs_server, updater);
-                                spin_lock(&server->cb_lock);
-                                list_del_init(&vnode->cb_link);
+        _enter("");
-                                spin_unlock(&server->cb_lock);
+        now = get_seconds();
-                                spin_lock(&afs_cb_hash_lock);
-                                list_del_init(&vnode->cb_hash_link);
+        /* find the first vnode to update */
-                                spin_unlock(&afs_cb_hash_lock);
+        spin_lock(&server->cb_lock);
-                        }
+        for (;;) {
-                        spin_unlock(&vnode->lock);
+                if (RB_EMPTY_ROOT(&server->cb_promises)) {
+                        spin_unlock(&server->cb_lock);
-                        if (valid) {
+                        _leave(" [nothing]");
-                                invalidate_remote_inode(inode);
+                        return;
-                                afs_put_server(server);
-                        }
-                        iput(inode);
                }
+                vnode = rb_entry(rb_first(&server->cb_promises),
+                                 struct afs_vnode, cb_promise);
+                if (atomic_read(&vnode->usage) > 0)
+                        break;
+                rb_erase(&vnode->cb_promise, &server->cb_promises);
+                vnode->cb_promised = false;
        }
-        _leave(" = 0");
+        timeout = vnode->update_at - now;
-        return 0;
+        if (timeout > 0) {
-} /* end SRXAFSCM_CallBack() */
+                queue_delayed_work(afs_vnode_update_worker,
+                                   &afs_vnode_update, timeout * HZ);
+                spin_unlock(&server->cb_lock);
+                _leave(" [nothing]");
+                return;
+        }
+        list_del_init(&vnode->update);
+        atomic_inc(&vnode->usage);
+        spin_unlock(&server->cb_lock);
+        /* we can now perform the update */
+        _debug("update %s", vnode->vldb.name);
+        vnode->state = AFS_VL_UPDATING;
+        vnode->upd_rej_cnt = 0;
+        vnode->upd_busy_cnt = 0;
+        ret = afs_vnode_update_record(vl, &vldb);
+        switch (ret) {
+        case 0:
+                afs_vnode_apply_update(vl, &vldb);
+                vnode->state = AFS_VL_UPDATING;
+                break;
+        case -ENOMEDIUM:
+                vnode->state = AFS_VL_VOLUME_DELETED;
+                break;
+        default:
+                vnode->state = AFS_VL_UNCERTAIN;
+                break;
+        }
+        /* and then reschedule */
+        _debug("reschedule");
+        vnode->update_at = get_seconds() + afs_vnode_update_timeout;
+        spin_lock(&server->cb_lock);
+        if (!list_empty(&server->cb_promises)) {
+                /* next update in 10 minutes, but wait at least 1 second more
+                 * than the newest record already queued so that we don't spam
+                 * the VL server suddenly with lots of requests
+                 */
+                xvnode = list_entry(server->cb_promises.prev,
+                                    struct afs_vnode, update);
+                if (vnode->update_at <= xvnode->update_at)
+                        vnode->update_at = xvnode->update_at + 1;
+                xvnode = list_entry(server->cb_promises.next,
+                                    struct afs_vnode, update);
+                timeout = xvnode->update_at - now;
+                if (timeout < 0)
+                        timeout = 0;
+        } else {
+                timeout = afs_vnode_update_timeout;
+        }
+        list_add_tail(&vnode->update, &server->cb_promises);
+        _debug("timeout %ld", timeout);
+        queue_delayed_work(afs_vnode_update_worker,
+                           &afs_vnode_update, timeout * HZ);
+        spin_unlock(&server->cb_lock);
+        afs_put_vnode(vl);
+}
+#endif
+/*
+ * initialise the callback update process
+ */
+int __init afs_callback_update_init(void)
+{
+        struct workqueue_struct *wq;
+        /* single-threaded workqueue that services callback work items */
+        wq = create_singlethread_workqueue("kafs_callbackd");
+        afs_callback_update_worker = wq;
+        if (!wq)
+                return -ENOMEM;
+        return 0;
+}
-/*****************************************************************************/
 /*
- * allow the fileserver to see if the cache manager is still alive
+ * shut down the callback update process
 */
-int SRXAFSCM_Probe(struct afs_server *server)
+void __exit afs_callback_update_kill(void)
 {
-        _debug("SRXAFSCM_Probe(%p)\n", server);
+        destroy_workqueue(afs_callback_update_worker);
-        return 0;
+}
-} /* end SRXAFSCM_Probe() */
diff --git a/fs/afs/cell.c b/fs/afs/cell.c
index 1fc578372759..9b1311a1df51 100644
--- a/fs/afs/cell.c
+++ b/fs/afs/cell.c
@@ -1,4 +1,4 @@
-/* cell.c: AFS cell and server record management
+/* AFS cell and server record management
 *
 * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
@@ -11,15 +11,9 @@
 #include <linux/module.h>
 #include <linux/slab.h>
-#include <rxrpc/peer.h>
+#include <linux/key.h>
-#include <rxrpc/connection.h>
+#include <linux/ctype.h>
-#include "volume.h"
+#include <keys/rxrpc-type.h>
-#include "cell.h"
-#include "server.h"
-#include "transport.h"
-#include "vlclient.h"
-#include "kafstimod.h"
-#include "super.h"
 #include "internal.h"
 DECLARE_RWSEM(afs_proc_cells_sem);
@@ -28,66 +22,47 @@ LIST_HEAD(afs_proc_cells);
 static struct list_head afs_cells = LIST_HEAD_INIT(afs_cells);
 static DEFINE_RWLOCK(afs_cells_lock);
 static DECLARE_RWSEM(afs_cells_sem); /* add/remove serialisation */
+static DECLARE_WAIT_QUEUE_HEAD(afs_cells_freeable_wq);
 static struct afs_cell *afs_cell_root;
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_cell_cache_match(void *target,
-                                                const void *entry);
-static void afs_cell_cache_update(void *source, void *entry);
-struct cachefs_index_def afs_cache_cell_index_def = {
-        .name                   = "cell_ix",
-        .data_size              = sizeof(struct afs_cache_cell),
-        .keys[0]                = { CACHEFS_INDEX_KEYS_ASCIIZ, 64 },
-        .match                  = afs_cell_cache_match,
-        .update                 = afs_cell_cache_update,
-};
-#endif
-/*****************************************************************************/
 /*
- * create a cell record
+ * allocate a cell record and fill in its name, VL server address list and
- * - "name" is the name of the cell
+ * allocate an anonymous key
- * - "vllist" is a colon separated list of IP addresses in "a.b.c.d" format
 */
-int afs_cell_create(const char *name, char *vllist, struct afs_cell **_cell)
+static struct afs_cell *afs_cell_alloc(const char *name, char *vllist)
 {
        struct afs_cell *cell;
-        char *next;
+        size_t namelen;
+        char keyname[4 + AFS_MAXCELLNAME + 1], *cp, *dp, *next;
        int ret;
-        _enter("%s", name);
+        _enter("%s,%s", name, vllist);
        BUG_ON(!name); /* TODO: want to look up "this cell" in the cache */
+        namelen = strlen(name);
+        if (namelen > AFS_MAXCELLNAME)
+                return ERR_PTR(-ENAMETOOLONG);
        /* allocate and initialise a cell record */
-        cell = kmalloc(sizeof(struct afs_cell) + strlen(name) + 1, GFP_KERNEL);
+        cell = kzalloc(sizeof(struct afs_cell) + namelen + 1, GFP_KERNEL);
        if (!cell) {
                _leave(" = -ENOMEM");
-                return -ENOMEM;
+                return ERR_PTR(-ENOMEM);
        }
-        down_write(&afs_cells_sem);
+        memcpy(cell->name, name, namelen);
+        cell->name[namelen] = 0;
-        memset(cell, 0, sizeof(struct afs_cell));
-        atomic_set(&cell->usage, 0);
+        atomic_set(&cell->usage, 1);
        INIT_LIST_HEAD(&cell->link);
+        rwlock_init(&cell->servers_lock);
-        rwlock_init(&cell->sv_lock);
+        INIT_LIST_HEAD(&cell->servers);
-        INIT_LIST_HEAD(&cell->sv_list);
-        INIT_LIST_HEAD(&cell->sv_graveyard);
-        spin_lock_init(&cell->sv_gylock);
        init_rwsem(&cell->vl_sem);
        INIT_LIST_HEAD(&cell->vl_list);
-        INIT_LIST_HEAD(&cell->vl_graveyard);
+        spin_lock_init(&cell->vl_lock);
-        spin_lock_init(&cell->vl_gylock);
-        strcpy(cell->name,name);
        /* fill in the VL server list from the rest of the string */
-        ret = -EINVAL;
        do {
                unsigned a, b, c, d;
@@ -96,20 +71,75 @@ int afs_cell_create(const char *name, char *vllist, struct afs_cell **_cell)
                        *next++ = 0;
                if (sscanf(vllist, "%u.%u.%u.%u", &a, &b, &c, &d) != 4)
-                        goto badaddr;
+                        goto bad_address;
                if (a > 255 || b > 255 || c > 255 || d > 255)
-                        goto badaddr;
+                        goto bad_address;
                cell->vl_addrs[cell->vl_naddrs++].s_addr =
                        htonl((a << 24) | (b << 16) | (c << 8) | d);
-                if (cell->vl_naddrs >= AFS_CELL_MAX_ADDRS)
+        } while (cell->vl_naddrs < AFS_CELL_MAX_ADDRS && (vllist = next));
-                        break;
+        /* create a key to represent an anonymous user; the key description is
+         * "afs@" followed by the upper-cased cell name (the loop below also
+         * copies the terminating NUL) */
+        memcpy(keyname, "afs@", 4);
+        dp = keyname + 4;
+        cp = cell->name;
+        do {
+                *dp++ = toupper(*cp);
+        } while (*cp++);
+        cell->anonymous_key = key_alloc(&key_type_rxrpc, keyname, 0, 0, current,
+                                        KEY_POS_SEARCH, KEY_ALLOC_NOT_IN_QUOTA);
+        if (IS_ERR(cell->anonymous_key)) {
+                _debug("no key");
+                ret = PTR_ERR(cell->anonymous_key);
+                /* the field holds an ERR_PTR rather than a key: clear it so
+                 * that the key_put() on the error path does not dereference
+                 * an error pointer */
+                cell->anonymous_key = NULL;
+                goto error;
+        }
+        ret = key_instantiate_and_link(cell->anonymous_key, NULL, 0,
+                                       NULL, NULL);
+        if (ret < 0) {
+                _debug("instantiate failed");
+                goto error;
+        }
+        _debug("anon key %p{%x}",
+               cell->anonymous_key, key_serial(cell->anonymous_key));
+        _leave(" = %p", cell);
+        return cell;
+bad_address:
+        printk(KERN_ERR "kAFS: bad VL server IP address\n");
+        ret = -EINVAL;
+error:
+        /* anonymous_key is either a real key or NULL at this point (the
+         * record was zero-allocated); key_put() accepts NULL */
+        key_put(cell->anonymous_key);
+        kfree(cell);
+        _leave(" = %d", ret);
+        return ERR_PTR(ret);
+}
+/*
+ * create a cell record
+ * - "name" is the name of the cell
+ * - "vllist" is a colon separated list of IP addresses in "a.b.c.d" format
+ */
+struct afs_cell *afs_cell_create(const char *name, char *vllist)
+{
+        struct afs_cell *cell;
+        int ret;
+        _enter("%s,%s", name, vllist);
-        } while(vllist = next, vllist);
+        cell = afs_cell_alloc(name, vllist);
+        if (IS_ERR(cell)) {
+                _leave(" = %ld", PTR_ERR(cell));
+                return cell;
+        }
+        down_write(&afs_cells_sem);
-        /* add a proc dir for this cell */
+        /* add a proc directory for this cell */
        ret = afs_proc_cell_setup(cell);
        if (ret < 0)
                goto error;
@@ -130,31 +160,28 @@ int afs_cell_create(const char *name, char *vllist, struct afs_cell **_cell)
        down_write(&afs_proc_cells_sem);
        list_add_tail(&cell->proc_link, &afs_proc_cells);
        up_write(&afs_proc_cells_sem);
-        *_cell = cell;
        up_write(&afs_cells_sem);
-        _leave(" = 0 (%p)", cell);
+        _leave(" = %p", cell);
-        return 0;
+        return cell;
- badaddr:
+error:
-        printk(KERN_ERR "kAFS: bad VL server IP address: '%s'\n", vllist);
- error:
        up_write(&afs_cells_sem);
+        key_put(cell->anonymous_key);
        kfree(cell);
        _leave(" = %d", ret);
-        return ret;
+        return ERR_PTR(ret);
-} /* end afs_cell_create() */
+}
-/*****************************************************************************/
 /*
- * initialise the cell database from module parameters
+ * set the root cell information
+ * - can be called with a module parameter string
+ * - can be called from a write to /proc/fs/afs/rootcell
 */
 int afs_cell_init(char *rootcell)
 {
        struct afs_cell *old_root, *new_root;
        char *cp;
-        int ret;
        _enter("");
@@ -162,82 +189,60 @@ int afs_cell_init(char *rootcell)
                /* module is loaded with no parameters, or built statically.
                 * - in the future we might initialize cell DB here.
                 */
-                _leave(" = 0 (but no root)");
+                _leave(" = 0 [no root]");
                return 0;
        }
        cp = strchr(rootcell, ':');
        if (!cp) {
                printk(KERN_ERR "kAFS: no VL server IP addresses specified\n");
-                _leave(" = %d (no colon)", -EINVAL);
+                _leave(" = -EINVAL");
                return -EINVAL;
        }
        /* allocate a cell record for the root cell */
        *cp++ = 0;
-        ret = afs_cell_create(rootcell, cp, &new_root);
+        new_root = afs_cell_create(rootcell, cp);
-        if (ret < 0) {
+        if (IS_ERR(new_root)) {
-                _leave(" = %d", ret);
+                _leave(" = %ld", PTR_ERR(new_root));
-                return ret;
+                return PTR_ERR(new_root);
        }
-        /* as afs_put_cell() takes locks by itself, we have to do
+        /* install the new cell */
-         * a little gymnastics to be race-free.
-         */
-        afs_get_cell(new_root);
        write_lock(&afs_cells_lock);
-        while (afs_cell_root) {
+        old_root = afs_cell_root;
-                old_root = afs_cell_root;
-                afs_cell_root = NULL;
-                write_unlock(&afs_cells_lock);
-                afs_put_cell(old_root);
-                write_lock(&afs_cells_lock);
-        }
        afs_cell_root = new_root;
        write_unlock(&afs_cells_lock);
+        afs_put_cell(old_root);
-        _leave(" = %d", ret);
+        _leave(" = 0");
-        return ret;
+        return 0;
+}
-} /* end afs_cell_init() */
-/*****************************************************************************/
 /*
 * lookup a cell record
 */
-int afs_cell_lookup(const char *name, unsigned namesz, struct afs_cell **_cell)
+struct afs_cell *afs_cell_lookup(const char *name, unsigned namesz)
 {
        struct afs_cell *cell;
-        int ret;
        _enter("\"%*.*s\",", namesz, namesz, name ? name : "");
-        *_cell = NULL;
+        down_read(&afs_cells_sem);
+        read_lock(&afs_cells_lock);
        if (name) {
                /* if the cell was named, look for it in the cell record list */
-                ret = -ENOENT;
-                cell = NULL;
-                read_lock(&afs_cells_lock);
                list_for_each_entry(cell, &afs_cells, link) {
                        if (strncmp(cell->name, name, namesz) == 0) {
                                afs_get_cell(cell);
                                goto found;
                        }
                }
-                cell = NULL;
+                cell = ERR_PTR(-ENOENT);
        found:
+                ;
-                read_unlock(&afs_cells_lock);
+        } else {
-                if (cell)
-                        ret = 0;
-        }
-        else {
-                read_lock(&afs_cells_lock);
                cell = afs_cell_root;
                if (!cell) {
                        /* this should not happen unless user tries to mount
@@ -246,44 +251,35 @@ int afs_cell_lookup(const char *name, unsigned namesz, struct afs_cell **_cell)
                         * ENOENT might be "more appropriate" but they happen
                         * for other reasons.
                         */
-                        ret = -EDESTADDRREQ;
+                        cell = ERR_PTR(-EDESTADDRREQ);
-                }
+                } else {
-                else {
                        afs_get_cell(cell);
-                        ret = 0;
                }
-                read_unlock(&afs_cells_lock);
        }
-        *_cell = cell;
+        read_unlock(&afs_cells_lock);
-        _leave(" = %d (%p)", ret, cell);
+        up_read(&afs_cells_sem);
-        return ret;
+        _leave(" = %p", cell);
+        return cell;
-} /* end afs_cell_lookup() */
+}
-/*****************************************************************************/
 /*
 * try and get a cell record
 */
-struct afs_cell *afs_get_cell_maybe(struct afs_cell **_cell)
+struct afs_cell *afs_get_cell_maybe(struct afs_cell *cell)
 {
-        struct afs_cell *cell;
        write_lock(&afs_cells_lock);
-        cell = *_cell;
        if (cell && !list_empty(&cell->link))
                afs_get_cell(cell);
        else
                cell = NULL;
        write_unlock(&afs_cells_lock);
        return cell;
-} /* end afs_get_cell_maybe() */
+}
-/*****************************************************************************/
 /*
 * destroy a cell record
 */
@@ -294,8 +290,7 @@ void afs_put_cell(struct afs_cell *cell)
        _enter("%p{%d,%s}", cell, atomic_read(&cell->usage), cell->name);
-        /* sanity check */
+        ASSERTCMP(atomic_read(&cell->usage), >, 0);
-        BUG_ON(atomic_read(&cell->usage) <= 0);
        /* to prevent a race, the decrement and the dequeue must be effectively
         * atomic */
@@ -307,36 +302,49 @@ void afs_put_cell(struct afs_cell *cell)
                return;
        }
+        ASSERT(list_empty(&cell->servers));
+        ASSERT(list_empty(&cell->vl_list));
        write_unlock(&afs_cells_lock);
-        BUG_ON(!list_empty(&cell->sv_list));
+        wake_up(&afs_cells_freeable_wq);
-        BUG_ON(!list_empty(&cell->sv_graveyard));
-        BUG_ON(!list_empty(&cell->vl_list));
-        BUG_ON(!list_empty(&cell->vl_graveyard));
        _leave(" [unused]");
-} /* end afs_put_cell() */
+}
-/*****************************************************************************/
 /*
 * destroy a cell record
+ * - must be called with the afs_cells_sem write-locked
+ * - cell->link should have been broken by the caller
 */
 static void afs_cell_destroy(struct afs_cell *cell)
 {
        _enter("%p{%d,%s}", cell, atomic_read(&cell->usage), cell->name);
-        /* to prevent a race, the decrement and the dequeue must be effectively
+        ASSERTCMP(atomic_read(&cell->usage), >=, 0);
-         * atomic */
+        ASSERT(list_empty(&cell->link));
-        write_lock(&afs_cells_lock);
-        /* sanity check */
+        /* wait for everyone to stop using the cell */
-        BUG_ON(atomic_read(&cell->usage) != 0);
+        if (atomic_read(&cell->usage) > 0) {
+                DECLARE_WAITQUEUE(myself, current);
-        list_del_init(&cell->link);
+                _debug("wait for cell %s", cell->name);
+                set_current_state(TASK_UNINTERRUPTIBLE);
+                add_wait_queue(&afs_cells_freeable_wq, &myself);
-        write_unlock(&afs_cells_lock);
+                while (atomic_read(&cell->usage) > 0) {
+                        schedule();
+                        set_current_state(TASK_UNINTERRUPTIBLE);
+                }
-        down_write(&afs_cells_sem);
+                remove_wait_queue(&afs_cells_freeable_wq, &myself);
+                set_current_state(TASK_RUNNING);
+        }
+        _debug("cell dead");
+        ASSERTCMP(atomic_read(&cell->usage), ==, 0);
+        ASSERT(list_empty(&cell->servers));
+        ASSERT(list_empty(&cell->vl_list));
        afs_proc_cell_remove(cell);
@@ -348,104 +356,26 @@ static void afs_cell_destroy(struct afs_cell *cell)
        cachefs_relinquish_cookie(cell->cache, 0);
 #endif
-        up_write(&afs_cells_sem);
+        key_put(cell->anonymous_key);
-        BUG_ON(!list_empty(&cell->sv_list));
-        BUG_ON(!list_empty(&cell->sv_graveyard));
-        BUG_ON(!list_empty(&cell->vl_list));
-        BUG_ON(!list_empty(&cell->vl_graveyard));
-        /* finish cleaning up the cell */
        kfree(cell);
        _leave(" [destroyed]");
-} /* end afs_cell_destroy() */
+}
-/*****************************************************************************/
-/*
- * lookup the server record corresponding to an Rx RPC peer
- */
-int afs_server_find_by_peer(const struct rxrpc_peer *peer,
-                            struct afs_server **_server)
-{
-        struct afs_server *server;
-        struct afs_cell *cell;
-        _enter("%p{a=%08x},", peer, ntohl(peer->addr.s_addr));
-        /* search the cell list */
-        read_lock(&afs_cells_lock);
-        list_for_each_entry(cell, &afs_cells, link) {
-                _debug("? cell %s",cell->name);
-                write_lock(&cell->sv_lock);
-                /* check the active list */
-                list_for_each_entry(server, &cell->sv_list, link) {
-                        _debug("?? server %08x", ntohl(server->addr.s_addr));
-                        if (memcmp(&server->addr, &peer->addr,
-                                   sizeof(struct in_addr)) == 0)
-                                goto found_server;
-                }
-                /* check the inactive list */
-                spin_lock(&cell->sv_gylock);
-                list_for_each_entry(server, &cell->sv_graveyard, link) {
-                        _debug("?? dead server %08x",
-                               ntohl(server->addr.s_addr));
-                        if (memcmp(&server->addr, &peer->addr,
-                                   sizeof(struct in_addr)) == 0)
-                                goto found_dead_server;
-                }
-                spin_unlock(&cell->sv_gylock);
-                write_unlock(&cell->sv_lock);
-        }
-        read_unlock(&afs_cells_lock);
-        _leave(" = -ENOENT");
-        return -ENOENT;
-        /* we found it in the graveyard - resurrect it */
- found_dead_server:
-        list_move_tail(&server->link, &cell->sv_list);
-        afs_get_server(server);
-        afs_kafstimod_del_timer(&server->timeout);
-        spin_unlock(&cell->sv_gylock);
-        goto success;
-        /* we found it - increment its ref count and return it */
- found_server:
-        afs_get_server(server);
- success:
-        write_unlock(&cell->sv_lock);
-        read_unlock(&afs_cells_lock);
-        *_server = server;
-        _leave(" = 0 (s=%p c=%p)", server, cell);
-        return 0;
-} /* end afs_server_find_by_peer() */
-/*****************************************************************************/
 /*
 * purge in-memory cell database on module unload or afs_init() failure
 * - the timeout daemon is stopped before calling this
 */
 void afs_cell_purge(void)
 {
-        struct afs_vlocation *vlocation;
        struct afs_cell *cell;
        _enter("");
        afs_put_cell(afs_cell_root);
+        down_write(&afs_cells_sem);
        while (!list_empty(&afs_cells)) {
                cell = NULL;
@@ -464,104 +394,11 @@ void afs_cell_purge(void)
                        _debug("PURGING CELL %s (%d)",
                               cell->name, atomic_read(&cell->usage));
-                        BUG_ON(!list_empty(&cell->sv_list));
-                        BUG_ON(!list_empty(&cell->vl_list));
-                        /* purge the cell's VL graveyard list */
-                        _debug(" - clearing VL graveyard");
-                        spin_lock(&cell->vl_gylock);
-                        while (!list_empty(&cell->vl_graveyard)) {
-                                vlocation = list_entry(cell->vl_graveyard.next,
-                                                       struct afs_vlocation,
-                                                       link);
-                                list_del_init(&vlocation->link);
-                                afs_kafstimod_del_timer(&vlocation->timeout);
-                                spin_unlock(&cell->vl_gylock);
-                                afs_vlocation_do_timeout(vlocation);
-                                /* TODO: race if move to use krxtimod instead
-                                 * of kafstimod */
-                                spin_lock(&cell->vl_gylock);
-                        }
-                        spin_unlock(&cell->vl_gylock);
-                        /* purge the cell's server graveyard list */
-                        _debug(" - clearing server graveyard");
-                        spin_lock(&cell->sv_gylock);
-                        while (!list_empty(&cell->sv_graveyard)) {
-                                struct afs_server *server;
-                                server = list_entry(cell->sv_graveyard.next,
-                                                    struct afs_server, link);
-                                list_del_init(&server->link);
-                                afs_kafstimod_del_timer(&server->timeout);
-                                spin_unlock(&cell->sv_gylock);
-                                afs_server_do_timeout(server);
-                                spin_lock(&cell->sv_gylock);
-                        }
-                        spin_unlock(&cell->sv_gylock);
                        /* now the cell should be left with no references */
                        afs_cell_destroy(cell);
                }
        }
+        up_write(&afs_cells_sem);
        _leave("");
-} /* end afs_cell_purge() */
+}
-/*****************************************************************************/
-/*
- * match a cell record obtained from the cache
- */
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_cell_cache_match(void *target,
-                                                const void *entry)
-{
-        const struct afs_cache_cell *ccell = entry;
-        struct afs_cell *cell = target;
-        _enter("{%s},{%s}", ccell->name, cell->name);
-        if (strncmp(ccell->name, cell->name, sizeof(ccell->name)) == 0) {
-                _leave(" = SUCCESS");
-                return CACHEFS_MATCH_SUCCESS;
-        }
-        _leave(" = FAILED");
-        return CACHEFS_MATCH_FAILED;
-} /* end afs_cell_cache_match() */
-#endif
-/*****************************************************************************/
-/*
- * update a cell record in the cache
- */
-#ifdef AFS_CACHING_SUPPORT
-static void afs_cell_cache_update(void *source, void *entry)
-{
-        struct afs_cache_cell *ccell = entry;
-        struct afs_cell *cell = source;
-        _enter("%p,%p", source, entry);
-        strncpy(ccell->name, cell->name, sizeof(ccell->name));
-        memcpy(ccell->vl_servers,
-               cell->vl_addrs,
-               min(sizeof(ccell->vl_servers), sizeof(cell->vl_addrs)));
-} /* end afs_cell_cache_update() */
-#endif
diff --git a/fs/afs/cell.h b/fs/afs/cell.h
deleted file mode 100644
index 48349108fb00..000000000000
--- a/fs/afs/cell.h
+++ /dev/null
@@ -1,78 +0,0 @@
-/* cell.h: AFS cell record
- *
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-#ifndef _LINUX_AFS_CELL_H
-#define _LINUX_AFS_CELL_H
-#include "types.h"
-#include "cache.h"
-#define AFS_CELL_MAX_ADDRS 15
-extern volatile int afs_cells_being_purged; /* T when cells are being purged by rmmod */
-/*****************************************************************************/
-/*
- * entry in the cached cell catalogue
- */
-struct afs_cache_cell
-{
-        char                    name[64];       /* cell name (padded with NULs) */
-        struct in_addr          vl_servers[15]; /* cached cell VL servers */
-};
-/*****************************************************************************/
-/*
- * AFS cell record
- */
-struct afs_cell
-{
-        atomic_t                usage;
-        struct list_head        link;           /* main cell list link */
-        struct list_head        proc_link;      /* /proc cell list link */
-        struct proc_dir_entry   *proc_dir;      /* /proc dir for this cell */
-#ifdef AFS_CACHING_SUPPORT
-        struct cachefs_cookie   *cache;         /* caching cookie */
-#endif
-        /* server record management */
-        rwlock_t                sv_lock;        /* active server list lock */
-        struct list_head        sv_list;        /* active server list */
-        struct list_head        sv_graveyard;   /* inactive server list */
-        spinlock_t              sv_gylock;      /* inactive server list lock */
-        /* volume location record management */
-        struct rw_semaphore     vl_sem;         /* volume management serialisation semaphore */
-        struct list_head        vl_list;        /* cell's active VL record list */
-        struct list_head        vl_graveyard;   /* cell's inactive VL record list */
-        spinlock_t              vl_gylock;      /* graveyard lock */
-        unsigned short          vl_naddrs;      /* number of VL servers in addr list */
-        unsigned short          vl_curr_svix;   /* current server index */
-        struct in_addr          vl_addrs[AFS_CELL_MAX_ADDRS];   /* cell VL server addresses */
-        char                    name[0];        /* cell name - must go last */
-};
-extern int afs_cell_init(char *rootcell);
-extern int afs_cell_create(const char *name, char *vllist, struct afs_cell **_cell);
-extern int afs_cell_lookup(const char *name, unsigned nmsize, struct afs_cell **_cell);
-#define afs_get_cell(C) do { atomic_inc(&(C)->usage); } while(0)
-extern struct afs_cell *afs_get_cell_maybe(struct afs_cell **_cell);
-extern void afs_put_cell(struct afs_cell *cell);
-extern void afs_cell_purge(void);
-#endif /* _LINUX_AFS_CELL_H */
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index 3d097fddcb7a..6685f4cbccb3 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -1,4 +1,4 @@
-/* cmservice.c: AFS Cache Manager Service
+/* AFS Cache Manager Service
 *
 * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
@@ -12,641 +12,463 @@
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/sched.h>
-#include <linux/completion.h>
+#include <linux/ip.h>
-#include "server.h"
-#include "cell.h"
-#include "transport.h"
-#include <rxrpc/rxrpc.h>
-#include <rxrpc/transport.h>
-#include <rxrpc/connection.h>
-#include <rxrpc/call.h>
-#include "cmservice.h"
 #include "internal.h"
+#include "afs_cm.h"
-static unsigned afscm_usage;            /* AFS cache manager usage count */
+struct workqueue_struct *afs_cm_workqueue;
-static struct rw_semaphore afscm_sem;   /* AFS cache manager start/stop semaphore */
-static int afscm_new_call(struct rxrpc_call *call);
-static void afscm_attention(struct rxrpc_call *call);
-static void afscm_error(struct rxrpc_call *call);
-static void afscm_aemap(struct rxrpc_call *call);
-static void _SRXAFSCM_CallBack(struct rxrpc_call *call);
-static void _SRXAFSCM_InitCallBackState(struct rxrpc_call *call);
-static void _SRXAFSCM_Probe(struct rxrpc_call *call);
-typedef void (*_SRXAFSCM_xxxx_t)(struct rxrpc_call *call);
-static const struct rxrpc_operation AFSCM_ops[] = {
-        {
-                .id     = 204,
-                .asize  = RXRPC_APP_MARK_EOF,
-                .name   = "CallBack",
-                .user   = _SRXAFSCM_CallBack,
-        },
-        {
-                .id     = 205,
-                .asize  = RXRPC_APP_MARK_EOF,
-                .name   = "InitCallBackState",
-                .user   = _SRXAFSCM_InitCallBackState,
-        },
-        {
-                .id     = 206,
-                .asize  = RXRPC_APP_MARK_EOF,
-                .name   = "Probe",
-                .user   = _SRXAFSCM_Probe,
-        },
-#if 0
-        {
-                .id     = 207,
-                .asize  = RXRPC_APP_MARK_EOF,
-                .name   = "GetLock",
-                .user   = _SRXAFSCM_GetLock,
-        },
-        {
-                .id     = 208,
-                .asize  = RXRPC_APP_MARK_EOF,
-                .name   = "GetCE",
-                .user   = _SRXAFSCM_GetCE,
-        },
-        {
-                .id     = 209,
-                .asize  = RXRPC_APP_MARK_EOF,
-                .name   = "GetXStatsVersion",
-                .user   = _SRXAFSCM_GetXStatsVersion,
-        },
-        {
-                .id     = 210,
-                .asize  = RXRPC_APP_MARK_EOF,
-                .name   = "GetXStats",
-                .user   = _SRXAFSCM_GetXStats,
-        }
-#endif
-};
-static struct rxrpc_service AFSCM_service = {
+static int afs_deliver_cb_init_call_back_state(struct afs_call *,
-        .name           = "AFS/CM",
+                                               struct sk_buff *, bool);
-        .owner          = THIS_MODULE,
+static int afs_deliver_cb_init_call_back_state3(struct afs_call *,
-        .link           = LIST_HEAD_INIT(AFSCM_service.link),
+                                                struct sk_buff *, bool);
-        .new_call       = afscm_new_call,
+static int afs_deliver_cb_probe(struct afs_call *, struct sk_buff *, bool);
-        .service_id     = 1,
+static int afs_deliver_cb_callback(struct afs_call *, struct sk_buff *, bool);
-        .attn_func      = afscm_attention,
+static int afs_deliver_cb_get_capabilities(struct afs_call *, struct sk_buff *,
-        .error_func     = afscm_error,
+                                           bool);
-        .aemap_func     = afscm_aemap,
+static void afs_cm_destructor(struct afs_call *);
-        .ops_begin      = &AFSCM_ops[0],
-        .ops_end        = &AFSCM_ops[ARRAY_SIZE(AFSCM_ops)],
-};
-static DECLARE_COMPLETION(kafscmd_alive);
-static DECLARE_COMPLETION(kafscmd_dead);
-static DECLARE_WAIT_QUEUE_HEAD(kafscmd_sleepq);
-static LIST_HEAD(kafscmd_attention_list);
-static LIST_HEAD(afscm_calls);
-static DEFINE_SPINLOCK(afscm_calls_lock);
-static DEFINE_SPINLOCK(kafscmd_attention_lock);
-static int kafscmd_die;
-/*****************************************************************************/
 /*
- * AFS Cache Manager kernel thread
+ * CB.CallBack operation type
 */
-static int kafscmd(void *arg)
+static const struct afs_call_type afs_SRXCBCallBack = {
-{
+        .name           = "CB.CallBack",
-        DECLARE_WAITQUEUE(myself, current);
+        .deliver        = afs_deliver_cb_callback,
+        .abort_to_error = afs_abort_to_error,
-        struct rxrpc_call *call;
+        .destructor     = afs_cm_destructor,
-        _SRXAFSCM_xxxx_t func;
+};
-        int die;
-        printk(KERN_INFO "kAFS: Started kafscmd %d\n", current->pid);
-        daemonize("kafscmd");
-        complete(&kafscmd_alive);
-        /* loop around looking for things to attend to */
-        do {
-                if (list_empty(&kafscmd_attention_list)) {
-                        set_current_state(TASK_INTERRUPTIBLE);
-                        add_wait_queue(&kafscmd_sleepq, &myself);
-                        for (;;) {
-                                set_current_state(TASK_INTERRUPTIBLE);
-                                if (!list_empty(&kafscmd_attention_list) ||
-                                    signal_pending(current) ||
-                                    kafscmd_die)
-                                        break;
-                                schedule();
-                        }
-                        remove_wait_queue(&kafscmd_sleepq, &myself);
-                        set_current_state(TASK_RUNNING);
-                }
-                die = kafscmd_die;
-                /* dequeue the next call requiring attention */
-                call = NULL;
-                spin_lock(&kafscmd_attention_lock);
-                if (!list_empty(&kafscmd_attention_list)) {
-                        call = list_entry(kafscmd_attention_list.next,
-                                          struct rxrpc_call,
-                                          app_attn_link);
-                        list_del_init(&call->app_attn_link);
-                        die = 0;
-                }
-                spin_unlock(&kafscmd_attention_lock);
-                if (call) {
-                        /* act upon it */
-                        _debug("@@@ Begin Attend Call %p", call);
-                        func = call->app_user;
-                        if (func)
-                                func(call);
-                        rxrpc_put_call(call);
-                        _debug("@@@ End Attend Call %p", call);
-                }
-        } while(!die);
-        /* and that's all */
-        complete_and_exit(&kafscmd_dead, 0);
-} /* end kafscmd() */
-/*****************************************************************************/
 /*
- * handle a call coming in to the cache manager
+ * CB.InitCallBackState operation type
- * - if I want to keep the call, I must increment its usage count
- * - the return value will be negated and passed back in an abort packet if
- *   non-zero
- * - serialised by virtue of there only being one krxiod
 */
-static int afscm_new_call(struct rxrpc_call *call)
+static const struct afs_call_type afs_SRXCBInitCallBackState = {
-{
+        .name           = "CB.InitCallBackState",
-        _enter("%p{cid=%u u=%d}",
+        .deliver        = afs_deliver_cb_init_call_back_state,
-               call, ntohl(call->call_id), atomic_read(&call->usage));
+        .abort_to_error = afs_abort_to_error,
+        .destructor     = afs_cm_destructor,
-        rxrpc_get_call(call);
+};
-        /* add to my current call list */
-        spin_lock(&afscm_calls_lock);
-        list_add(&call->app_link,&afscm_calls);
-        spin_unlock(&afscm_calls_lock);
-        _leave(" = 0");
-        return 0;
-} /* end afscm_new_call() */
-/*****************************************************************************/
 /*
- * queue on the kafscmd queue for attention
+ * CB.InitCallBackState3 operation type
 */
-static void afscm_attention(struct rxrpc_call *call)
+static const struct afs_call_type afs_SRXCBInitCallBackState3 = {
-{
+        .name           = "CB.InitCallBackState3",
-        _enter("%p{cid=%u u=%d}",
+        .deliver        = afs_deliver_cb_init_call_back_state3,
-               call, ntohl(call->call_id), atomic_read(&call->usage));
+        .abort_to_error = afs_abort_to_error,
+        .destructor     = afs_cm_destructor,
-        spin_lock(&kafscmd_attention_lock);
+};
-        if (list_empty(&call->app_attn_link)) {
-                list_add_tail(&call->app_attn_link, &kafscmd_attention_list);
-                rxrpc_get_call(call);
-        }
-        spin_unlock(&kafscmd_attention_lock);
-        wake_up(&kafscmd_sleepq);
-        _leave(" {u=%d}", atomic_read(&call->usage));
-} /* end afscm_attention() */
-/*****************************************************************************/
 /*
- * handle my call being aborted
+ * CB.Probe operation type
- * - clean up, dequeue and put my ref to the call
 */
-static void afscm_error(struct rxrpc_call *call)
+static const struct afs_call_type afs_SRXCBProbe = {
-{
+        .name           = "CB.Probe",
-        int removed;
+        .deliver        = afs_deliver_cb_probe,
+        .abort_to_error = afs_abort_to_error,
-        _enter("%p{est=%s ac=%u er=%d}",
+        .destructor     = afs_cm_destructor,
-               call,
+};
-               rxrpc_call_error_states[call->app_err_state],
-               call->app_abort_code,
-               call->app_errno);
-        spin_lock(&kafscmd_attention_lock);
-        if (list_empty(&call->app_attn_link)) {
-                list_add_tail(&call->app_attn_link, &kafscmd_attention_list);
-                rxrpc_get_call(call);
-        }
-        spin_unlock(&kafscmd_attention_lock);
-        removed = 0;
-        spin_lock(&afscm_calls_lock);
-        if (!list_empty(&call->app_link)) {
-                list_del_init(&call->app_link);
-                removed = 1;
-        }
-        spin_unlock(&afscm_calls_lock);
-        if (removed)
-                rxrpc_put_call(call);
-        wake_up(&kafscmd_sleepq);
-        _leave("");
+/*
-} /* end afscm_error() */
+ * CB.GetCapabilities operation type
+ */
+static const struct afs_call_type afs_SRXCBGetCapabilites = {
+        .name           = "CB.GetCapabilities",
+        .deliver        = afs_deliver_cb_get_capabilities,
+        .abort_to_error = afs_abort_to_error,
+        .destructor     = afs_cm_destructor,
+};
-/*****************************************************************************/
 /*
- * map afs abort codes to/from Linux error codes
+ * route an incoming cache manager call
- * - called with call->lock held
+ * - return true if the operation is supported, false if not
 */
-static void afscm_aemap(struct rxrpc_call *call)
+bool afs_cm_incoming_call(struct afs_call *call)
 {
-        switch (call->app_err_state) {
+        u32 operation_id = ntohl(call->operation_ID);
-        case RXRPC_ESTATE_LOCAL_ABORT:
-                call->app_abort_code = -call->app_errno;
+        _enter("{CB.OP %u}", operation_id);
-                break;
-        case RXRPC_ESTATE_PEER_ABORT:
+        switch (operation_id) {
-                call->app_errno = -ECONNABORTED;
+        case CBCallBack:
-                break;
+                call->type = &afs_SRXCBCallBack;
+                return true;
+        case CBInitCallBackState:
+                call->type = &afs_SRXCBInitCallBackState;
+                return true;
+        case CBInitCallBackState3:
+                call->type = &afs_SRXCBInitCallBackState3;
+                return true;
+        case CBProbe:
+                call->type = &afs_SRXCBProbe;
+                return true;
+        case CBGetCapabilities:
+                call->type = &afs_SRXCBGetCapabilites;
+                return true;
        default:
-                break;
+                return false;
        }
-} /* end afscm_aemap() */
+}
-/*****************************************************************************/
 /*
- * start the cache manager service if not already started
+ * clean up a cache manager call
 */
-int afscm_start(void)
+static void afs_cm_destructor(struct afs_call *call)
 {
-        int ret;
+        _enter("");
-        down_write(&afscm_sem);
-        if (!afscm_usage) {
-                ret = kernel_thread(kafscmd, NULL, 0);
-                if (ret < 0)
-                        goto out;
-                wait_for_completion(&kafscmd_alive);
-                ret = rxrpc_add_service(afs_transport, &AFSCM_service);
-                if (ret < 0)
-                        goto kill;
-                afs_kafstimod_add_timer(&afs_mntpt_expiry_timer,
-                                        afs_mntpt_expiry_timeout * HZ);
-        }
-        afscm_usage++;
-        up_write(&afscm_sem);
-        return 0;
- kill:
-        kafscmd_die = 1;
-        wake_up(&kafscmd_sleepq);
-        wait_for_completion(&kafscmd_dead);
- out:
-        up_write(&afscm_sem);
-        return ret;
-} /* end afscm_start() */
+        afs_put_server(call->server);
+        call->server = NULL;
+        kfree(call->buffer);
+        call->buffer = NULL;
+}
-/*****************************************************************************/
 /*
- * stop the cache manager service
+ * handle the fileserver breaking a set of callbacks
 */
-void afscm_stop(void)
+static void SRXAFSCB_CallBack(struct work_struct *work)
 {
-        struct rxrpc_call *call;
+        struct afs_call *call = container_of(work, struct afs_call, work);
-        down_write(&afscm_sem);
+        _enter("");
-        BUG_ON(afscm_usage == 0);
+        /* be sure to send the reply *before* attempting to spam the AFS server
-        afscm_usage--;
+         * with FSFetchStatus requests on the vnodes with broken callbacks lest
+         * the AFS server get into a vicious cycle of trying to break further
+         * callbacks because it hadn't received completion of the CBCallBack op
+         * yet */
+        afs_send_empty_reply(call);
-        if (afscm_usage == 0) {
+        afs_break_callbacks(call->server, call->count, call->request);
-                /* don't want more incoming calls */
+        _leave("");
-                rxrpc_del_service(afs_transport, &AFSCM_service);
+}
-                /* abort any calls I've still got open (the afscm_error() will
-                 * dequeue them) */
-                spin_lock(&afscm_calls_lock);
-                while (!list_empty(&afscm_calls)) {
-                        call = list_entry(afscm_calls.next,
-                                          struct rxrpc_call,
-                                          app_link);
-                        list_del_init(&call->app_link);
+/*
-                        rxrpc_get_call(call);
+ * deliver request data to a CB.CallBack call
+ * - call->unmarshall records which unmarshalling step has been reached; a
+ *   step that gets -EAGAIN from afs_extract_data() returns 0 and is
+ *   re-entered at the same case on the next invocation
+ * - the request is a counted array of FIDs followed by a counted array of
+ *   callbacks, the latter of which may be empty
+ * - returns 0 on success or if more data is wanted, or a negative error code
-                        spin_unlock(&afscm_calls_lock);
+ */
+static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb,
+                                   bool last)
+{
+        struct afs_callback *cb;
+        struct afs_server *server;
+        struct in_addr addr;
+        __be32 *bp;
+        u32 tmp;
+        int ret, loop;
+        _enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
+        switch (call->unmarshall) {
+        case 0:
+                call->offset = 0;
+                call->unmarshall++;
+                /* extract the FID array and its count in two steps */
+        case 1:
+                _debug("extract FID count");
+                ret = afs_extract_data(call, skb, last, &call->tmp, 4);
+                switch (ret) {
+                case 0:         break;
+                case -EAGAIN:   return 0;
+                default:        return ret;
+                }
-                        rxrpc_call_abort(call, -ESRCH); /* abort, dequeue and
+                call->count = ntohl(call->tmp);
-                                                         * put */
+                _debug("FID count: %u", call->count);
+                if (call->count > AFSCBMAX)
+                        return -EBADMSG;
+                call->buffer = kmalloc(call->count * 3 * 4, GFP_KERNEL);
+                if (!call->buffer)
+                        return -ENOMEM;
+                call->offset = 0;
+                call->unmarshall++;
+        case 2:
+                _debug("extract FID array");
+                ret = afs_extract_data(call, skb, last, call->buffer,
+                                       call->count * 3 * 4);
+                switch (ret) {
+                case 0:         break;
+                case -EAGAIN:   return 0;
+                default:        return ret;
+                }
-                        _debug("nuking active call %08x.%d",
+                _debug("unmarshall FID array");
-                               ntohl(call->conn->conn_id),
+                call->request = kcalloc(call->count,
-                               ntohl(call->call_id));
+                                        sizeof(struct afs_callback),
-                        rxrpc_put_call(call);
+                                        GFP_KERNEL);
-                        rxrpc_put_call(call);
+                if (!call->request)
+                        return -ENOMEM;
+                cb = call->request;
+                bp = call->buffer;
+                for (loop = call->count; loop > 0; loop--, cb++) {
+                        cb->fid.vid     = ntohl(*bp++);
+                        cb->fid.vnode   = ntohl(*bp++);
+                        cb->fid.unique  = ntohl(*bp++);
+                        cb->type        = AFSCM_CB_UNTYPED;
+                }
-                        spin_lock(&afscm_calls_lock);
+                call->offset = 0;
+                call->unmarshall++;
+                /* extract the callback array and its count in two steps */
+        case 3:
+                _debug("extract CB count");
+                ret = afs_extract_data(call, skb, last, &call->tmp, 4);
+                switch (ret) {
+                case 0:         break;
+                case -EAGAIN:   return 0;
+                default:        return ret;
                 }
-                spin_unlock(&afscm_calls_lock);
-                /* get rid of my daemon */
+                tmp = ntohl(call->tmp);
-                kafscmd_die = 1;
+                _debug("CB count: %u", tmp);
-                wake_up(&kafscmd_sleepq);
+                if (tmp != call->count && tmp != 0)
-                wait_for_completion(&kafscmd_dead);
+                        return -EBADMSG;
+                call->offset = 0;
+                call->unmarshall++;
+                if (tmp == 0)
+                        goto empty_cb_array;
+        case 4:
+                _debug("extract CB array");
+                /* read the raw CB array into call->buffer, which is what the
+                 * loop below decodes; call->request already holds the
+                 * unmarshalled FIDs and must not be overwritten */
+                ret = afs_extract_data(call, skb, last, call->buffer,
+                                       call->count * 3 * 4);
+                switch (ret) {
+                case 0:         break;
+                case -EAGAIN:   return 0;
+                default:        return ret;
+                }
-                /* dispose of any calls waiting for attention */
+                _debug("unmarshall CB array");
-                spin_lock(&kafscmd_attention_lock);
+                cb = call->request;
-                while (!list_empty(&kafscmd_attention_list)) {
+                bp = call->buffer;
-                        call = list_entry(kafscmd_attention_list.next,
+                for (loop = call->count; loop > 0; loop--, cb++) {
-                                          struct rxrpc_call,
+                        cb->version     = ntohl(*bp++);
-                                          app_attn_link);
+                        cb->expiry      = ntohl(*bp++);
+                        cb->type        = ntohl(*bp++);
+                }
-                        list_del_init(&call->app_attn_link);
+        empty_cb_array:
-                        spin_unlock(&kafscmd_attention_lock);
+                call->offset = 0;
+                call->unmarshall++;
-                        rxrpc_put_call(call);
+        case 5:
+                _debug("trailer");
+                if (skb->len != 0)
+                        return -EBADMSG;
+                break;
+        }
-                        spin_lock(&kafscmd_attention_lock);
+        if (!last)
-                }
+                return 0;
-                spin_unlock(&kafscmd_attention_lock);
-                afs_kafstimod_del_timer(&afs_mntpt_expiry_timer);
+        call->state = AFS_CALL_REPLYING;
-        }
-        up_write(&afscm_sem);
+        /* we'll need the file server record as that tells us which set of
+         * vnodes to operate upon */
+        memcpy(&addr, &ip_hdr(skb)->saddr, 4);
+        server = afs_find_server(&addr);
+        if (!server)
+                return -ENOTCONN;
+        call->server = server;
-} /* end afscm_stop() */
+        INIT_WORK(&call->work, SRXAFSCB_CallBack);
+        schedule_work(&call->work);
+        return 0;
+}
-/*****************************************************************************/
 /*
- * handle the fileserver breaking a set of callbacks
+ * allow the fileserver to request callback state (re-)initialisation
 */
-static void _SRXAFSCM_CallBack(struct rxrpc_call *call)
+static void SRXAFSCB_InitCallBackState(struct work_struct *work)
 {
-        struct afs_server *server;
+        struct afs_call *call = container_of(work, struct afs_call, work);
-        size_t count, qty, tmp;
-        int ret = 0, removed;
-        _enter("%p{acs=%s}", call, rxrpc_call_states[call->app_call_state]);
-        server = afs_server_get_from_peer(call->conn->peer);
-        switch (call->app_call_state) {
-                /* we've received the last packet
-                 * - drain all the data from the call and send the reply
-                 */
-        case RXRPC_CSTATE_SRVR_GOT_ARGS:
-                ret = -EBADMSG;
-                qty = call->app_ready_qty;
-                if (qty < 8 || qty > 50 * (6 * 4) + 8)
-                        break;
-                {
-                        struct afs_callback *cb, *pcb;
-                        int loop;
-                        __be32 *fp, *bp;
-                        fp = rxrpc_call_alloc_scratch(call, qty);
-                        /* drag the entire argument block out to the scratch
-                         * space */
-                        ret = rxrpc_call_read_data(call, fp, qty, 0);
-                        if (ret < 0)
-                                break;
-                        /* and unmarshall the parameter block */
-                        ret = -EBADMSG;
-                        count = ntohl(*fp++);
-                        if (count>AFSCBMAX ||
-                            (count * (3 * 4) + 8 != qty &&
-                             count * (6 * 4) + 8 != qty))
-                                break;
-                        bp = fp + count*3;
-                        tmp = ntohl(*bp++);
-                        if (tmp > 0 && tmp != count)
-                                break;
-                        if (tmp == 0)
-                                bp = NULL;
-                        pcb = cb = rxrpc_call_alloc_scratch_s(
-                                call, struct afs_callback);
-                        for (loop = count - 1; loop >= 0; loop--) {
-                                pcb->fid.vid    = ntohl(*fp++);
-                                pcb->fid.vnode  = ntohl(*fp++);
-                                pcb->fid.unique = ntohl(*fp++);
-                                if (bp) {
-                                        pcb->version    = ntohl(*bp++);
-                                        pcb->expiry     = ntohl(*bp++);
-                                        pcb->type       = ntohl(*bp++);
-                                }
-                                else {
-                                        pcb->version    = 0;
-                                        pcb->expiry     = 0;
-                                        pcb->type       = AFSCM_CB_UNTYPED;
-                                }
-                                pcb++;
-                        }
-                        /* invoke the actual service routine */
-                        ret = SRXAFSCM_CallBack(server, count, cb);
-                        if (ret < 0)
-                                break;
-                }
-                /* send the reply */
+        _enter("{%p}", call->server);
-                ret = rxrpc_call_write_data(call, 0, NULL, RXRPC_LAST_PACKET,
-                                            GFP_KERNEL, 0, &count);
-                if (ret < 0)
-                        break;
-                break;
-                /* operation complete */
-        case RXRPC_CSTATE_COMPLETE:
-                call->app_user = NULL;
-                removed = 0;
-                spin_lock(&afscm_calls_lock);
-                if (!list_empty(&call->app_link)) {
-                        list_del_init(&call->app_link);
-                        removed = 1;
-                }
-                spin_unlock(&afscm_calls_lock);
-                if (removed)
+        afs_init_callback_state(call->server);
-                        rxrpc_put_call(call);
+        afs_send_empty_reply(call);
-                break;
+        _leave("");
+}
-                /* operation terminated on error */
+/*
-        case RXRPC_CSTATE_ERROR:
+ * deliver request data to a CB.InitCallBackState call
-                call->app_user = NULL;
+ */
-                break;
+static int afs_deliver_cb_init_call_back_state(struct afs_call *call,
+                                               struct sk_buff *skb,
+                                               bool last)
+{
+        struct afs_server *server;
+        struct in_addr addr;
-        default:
+        _enter(",{%u},%d", skb->len, last);
-                break;
-        }
-        if (ret < 0)
+        if (skb->len > 0)
-                rxrpc_call_abort(call, ret);
+                return -EBADMSG;
+        if (!last)
+                return 0;
-        afs_put_server(server);
+        /* no unmarshalling required */
+        call->state = AFS_CALL_REPLYING;
-        _leave(" = %d", ret);
+        /* we'll need the file server record as that tells us which set of
+         * vnodes to operate upon */
+        memcpy(&addr, &ip_hdr(skb)->saddr, 4);
+        server = afs_find_server(&addr);
+        if (!server)
+                return -ENOTCONN;
+        call->server = server;
-} /* end _SRXAFSCM_CallBack() */
+        INIT_WORK(&call->work, SRXAFSCB_InitCallBackState);
+        schedule_work(&call->work);
+        return 0;
+}
-/*****************************************************************************/
 /*
- * handle the fileserver asking us to initialise our callback state
+ * deliver request data to a CB.InitCallBackState3 call
 */
-static void _SRXAFSCM_InitCallBackState(struct rxrpc_call *call)
+static int afs_deliver_cb_init_call_back_state3(struct afs_call *call,
+                                                struct sk_buff *skb,
+                                                bool last)
 {
        struct afs_server *server;
-        size_t count;
+        struct in_addr addr;
-        int ret = 0, removed;
-        _enter("%p{acs=%s}", call, rxrpc_call_states[call->app_call_state]);
+        _enter(",{%u},%d", skb->len, last);
-        server = afs_server_get_from_peer(call->conn->peer);
+        if (!last)
+                return 0;
-        switch (call->app_call_state) {
+        /* no unmarshalling required */
-                /* we've received the last packet - drain all the data from the
+        call->state = AFS_CALL_REPLYING;
-                 * call */
-        case RXRPC_CSTATE_SRVR_GOT_ARGS:
-                /* shouldn't be any args */
-                ret = -EBADMSG;
-                break;
-                /* send the reply when asked for it */
-        case RXRPC_CSTATE_SRVR_SND_REPLY:
-                /* invoke the actual service routine */
-                ret = SRXAFSCM_InitCallBackState(server);
-                if (ret < 0)
-                        break;
-                ret = rxrpc_call_write_data(call, 0, NULL, RXRPC_LAST_PACKET,
-                                            GFP_KERNEL, 0, &count);
-                if (ret < 0)
-                        break;
-                break;
-                /* operation complete */
+        /* we'll need the file server record as that tells us which set of
-        case RXRPC_CSTATE_COMPLETE:
+         * vnodes to operate upon */
-                call->app_user = NULL;
+        memcpy(&addr, &ip_hdr(skb)->saddr, 4);
-                removed = 0;
+        server = afs_find_server(&addr);
-                spin_lock(&afscm_calls_lock);
+        if (!server)
-                if (!list_empty(&call->app_link)) {
+                return -ENOTCONN;
-                        list_del_init(&call->app_link);
+        call->server = server;
-                        removed = 1;
-                }
-                spin_unlock(&afscm_calls_lock);
-                if (removed)
+        INIT_WORK(&call->work, SRXAFSCB_InitCallBackState);
-                        rxrpc_put_call(call);
+        schedule_work(&call->work);
-                break;
+        return 0;
+}
-                /* operation terminated on error */
-        case RXRPC_CSTATE_ERROR:
-                call->app_user = NULL;
-                break;
-        default:
-                break;
-        }
-        if (ret < 0)
-                rxrpc_call_abort(call, ret);
-        afs_put_server(server);
-        _leave(" = %d", ret);
+/*
+ * allow the fileserver to see if the cache manager is still alive
+ */
+static void SRXAFSCB_Probe(struct work_struct *work)
+{
+        struct afs_call *call = container_of(work, struct afs_call, work);
-} /* end _SRXAFSCM_InitCallBackState() */
+        _enter("");
+        afs_send_empty_reply(call);
+        _leave("");
+}
-/*****************************************************************************/
 /*
- * handle a probe from a fileserver
+ * deliver request data to a CB.Probe call
 */
-static void _SRXAFSCM_Probe(struct rxrpc_call *call)
+static int afs_deliver_cb_probe(struct afs_call *call, struct sk_buff *skb,
+                                bool last)
 {
-        struct afs_server *server;
+        _enter(",{%u},%d", skb->len, last);
-        size_t count;
-        int ret = 0, removed;
-        _enter("%p{acs=%s}", call, rxrpc_call_states[call->app_call_state]);
-        server = afs_server_get_from_peer(call->conn->peer);
+        if (skb->len > 0)
+                return -EBADMSG;
+        if (!last)
+                return 0;
-        switch (call->app_call_state) {
+        /* no unmarshalling required */
-                /* we've received the last packet - drain all the data from the
+        call->state = AFS_CALL_REPLYING;
-                 * call */
-        case RXRPC_CSTATE_SRVR_GOT_ARGS:
-                /* shouldn't be any args */
-                ret = -EBADMSG;
-                break;
-                /* send the reply when asked for it */
+        INIT_WORK(&call->work, SRXAFSCB_Probe);
-        case RXRPC_CSTATE_SRVR_SND_REPLY:
+        schedule_work(&call->work);
-                /* invoke the actual service routine */
+        return 0;
-                ret = SRXAFSCM_Probe(server);
+}
-                if (ret < 0)
-                        break;
-                ret = rxrpc_call_write_data(call, 0, NULL, RXRPC_LAST_PACKET,
-                                            GFP_KERNEL, 0, &count);
-                if (ret < 0)
-                        break;
-                break;
-                /* operation complete */
+/*
-        case RXRPC_CSTATE_COMPLETE:
+ * allow the fileserver to ask about the cache manager's capabilities
+ * - replies with up to 32 IPv4 interface records (address, netmask, MTU),
+ *   the afs_uuid fields and a single capability word
+ * - failure to obtain the interface list is not fatal: the reply then
+ *   reports zero interfaces
-                call->app_user = NULL;
+ */
-                removed = 0;
+static void SRXAFSCB_GetCapabilities(struct work_struct *work)
-                spin_lock(&afscm_calls_lock);
+{
-                if (!list_empty(&call->app_link)) {
+        struct afs_interface *ifs;
-                        list_del_init(&call->app_link);
+        struct afs_call *call = container_of(work, struct afs_call, work);
-                        removed = 1;
+        int loop, nifs;
+        struct {
+                struct /* InterfaceAddr */ {
+                        __be32 nifs;
+                        __be32 uuid[11];
+                        __be32 ifaddr[32];
+                        __be32 netmask[32];
+                        __be32 mtu[32];
+                } ia;
+                struct /* Capabilities */ {
+                        __be32 capcount;
+                        __be32 caps[1];
+                } cap;
+        } reply;
+        _enter("");
+        nifs = 0;
+        ifs = kcalloc(32, sizeof(*ifs), GFP_KERNEL);
+        if (ifs) {
+                nifs = afs_get_ipv4_interfaces(ifs, 32, false);
+                if (nifs < 0) {
+                        kfree(ifs);
+                        ifs = NULL;
+                        nifs = 0;
                 }
-                spin_unlock(&afscm_calls_lock);
+        }
-                if (removed)
+        memset(&reply, 0, sizeof(reply));
-                        rxrpc_put_call(call);
+        reply.ia.nifs = htonl(nifs);
-                break;
+        reply.ia.uuid[0] = htonl(afs_uuid.time_low);
+        reply.ia.uuid[1] = htonl(afs_uuid.time_mid);
+        reply.ia.uuid[2] = htonl(afs_uuid.time_hi_and_version);
+        reply.ia.uuid[3] = htonl((s8) afs_uuid.clock_seq_hi_and_reserved);
+        reply.ia.uuid[4] = htonl((s8) afs_uuid.clock_seq_low);
+        for (loop = 0; loop < 6; loop++)
+                reply.ia.uuid[loop + 5] = htonl((s8) afs_uuid.node[loop]);
+        if (ifs) {
+                for (loop = 0; loop < nifs; loop++) {
+                        reply.ia.ifaddr[loop] = ifs[loop].address.s_addr;
+                        reply.ia.netmask[loop] = ifs[loop].netmask.s_addr;
+                        reply.ia.mtu[loop] = htonl(ifs[loop].mtu);
+                }
+                /* the entries have been copied into the reply and the list
+                 * is not referenced again, so release it here */
+                kfree(ifs);
+        }
-                /* operation terminated on error */
+        reply.cap.capcount = htonl(1);
-        case RXRPC_CSTATE_ERROR:
+        reply.cap.caps[0] = htonl(AFS_CAP_ERROR_TRANSLATION);
-                call->app_user = NULL;
+        afs_send_simple_reply(call, &reply, sizeof(reply));
-                break;
-        default:
+        _leave("");
-                break;
+}
-        }
-        if (ret < 0)
+/*
-                rxrpc_call_abort(call, ret);
+ * deliver request data to a CB.GetCapabilities call
+ */
+static int afs_deliver_cb_get_capabilities(struct afs_call *call,
+                                           struct sk_buff *skb, bool last)
+{
+        _enter(",{%u},%d", skb->len, last);
-        afs_put_server(server);
+        if (skb->len > 0)
+                return -EBADMSG;
+        if (!last)
+                return 0;
-        _leave(" = %d", ret);
+        /* no unmarshalling required */
+        call->state = AFS_CALL_REPLYING;
-} /* end _SRXAFSCM_Probe() */
+        INIT_WORK(&call->work, SRXAFSCB_GetCapabilities);
+        schedule_work(&call->work);
+        return 0;
+}
diff --git a/fs/afs/cmservice.h b/fs/afs/cmservice.h
deleted file mode 100644
index af8d4d689cb2..000000000000
--- a/fs/afs/cmservice.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/* cmservice.h: AFS Cache Manager Service declarations
- *
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-#ifndef _LINUX_AFS_CMSERVICE_H
-#define _LINUX_AFS_CMSERVICE_H
-#include <rxrpc/transport.h>
-#include "types.h"
-/* cache manager start/stop */
-extern int afscm_start(void);
-extern void afscm_stop(void);
-/* cache manager server functions */
-extern int SRXAFSCM_InitCallBackState(struct afs_server *server);
-extern int SRXAFSCM_CallBack(struct afs_server *server,
-                             size_t count,
-                             struct afs_callback callbacks[]);
-extern int SRXAFSCM_Probe(struct afs_server *server);
-#endif /* _LINUX_AFS_CMSERVICE_H */
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index b6dc2ebe47a8..dac5b990c0cd 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -15,45 +15,53 @@
 #include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
-#include <linux/smp_lock.h>
+#include <linux/ctype.h>
-#include "vnode.h"
-#include "volume.h"
-#include <rxrpc/call.h>
-#include "super.h"
 #include "internal.h"
-static struct dentry *afs_dir_lookup(struct inode *dir, struct dentry *dentry,
+static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
-                                     struct nameidata *nd);
+                                 struct nameidata *nd);
 static int afs_dir_open(struct inode *inode, struct file *file);
-static int afs_dir_readdir(struct file *file, void *dirent, filldir_t filldir);
+static int afs_readdir(struct file *file, void *dirent, filldir_t filldir);
 static int afs_d_revalidate(struct dentry *dentry, struct nameidata *nd);
 static int afs_d_delete(struct dentry *dentry);
-static int afs_dir_lookup_filldir(void *_cookie, const char *name, int nlen,
+static void afs_d_release(struct dentry *dentry);
+static int afs_lookup_filldir(void *_cookie, const char *name, int nlen,
                                  loff_t fpos, u64 ino, unsigned dtype);
+static int afs_create(struct inode *dir, struct dentry *dentry, int mode,
+                      struct nameidata *nd);
+static int afs_mkdir(struct inode *dir, struct dentry *dentry, int mode);
+static int afs_rmdir(struct inode *dir, struct dentry *dentry);
+static int afs_unlink(struct inode *dir, struct dentry *dentry);
+static int afs_link(struct dentry *from, struct inode *dir,
+                    struct dentry *dentry);
+static int afs_symlink(struct inode *dir, struct dentry *dentry,
+                       const char *content);
+static int afs_rename(struct inode *old_dir, struct dentry *old_dentry,
+                      struct inode *new_dir, struct dentry *new_dentry);
 const struct file_operations afs_dir_file_operations = {
        .open           = afs_dir_open,
-        .readdir        = afs_dir_readdir,
+        .release        = afs_release,
+        .readdir        = afs_readdir,
 };
 const struct inode_operations afs_dir_inode_operations = {
-        .lookup         = afs_dir_lookup,
+        .create         = afs_create,
+        .lookup         = afs_lookup,
+        .link           = afs_link,
+        .unlink         = afs_unlink,
+        .symlink        = afs_symlink,
+        .mkdir          = afs_mkdir,
+        .rmdir          = afs_rmdir,
+        .rename         = afs_rename,
+        .permission     = afs_permission,
        .getattr        = afs_inode_getattr,
-#if 0 /* TODO */
-        .create         = afs_dir_create,
-        .link           = afs_dir_link,
-        .unlink         = afs_dir_unlink,
-        .symlink        = afs_dir_symlink,
-        .mkdir          = afs_dir_mkdir,
-        .rmdir          = afs_dir_rmdir,
-        .mknod          = afs_dir_mknod,
-        .rename         = afs_dir_rename,
-#endif
 };
 static struct dentry_operations afs_fs_dentry_operations = {
        .d_revalidate   = afs_d_revalidate,
        .d_delete       = afs_d_delete,
+        .d_release      = afs_d_release,
 };
 #define AFS_DIR_HASHTBL_SIZE    128
@@ -105,14 +113,13 @@ struct afs_dir_page {
        union afs_dir_block blocks[PAGE_SIZE / sizeof(union afs_dir_block)];
 };
-struct afs_dir_lookup_cookie {
+struct afs_lookup_cookie {
        struct afs_fid  fid;
        const char      *name;
        size_t          nlen;
        int             found;
 };
-/*****************************************************************************/
 /*
 * check that a directory page is valid
 */
@@ -128,9 +135,10 @@ static inline void afs_dir_check_page(struct inode *dir, struct page *page)
        if (qty == 0)
                goto error;
-        if (page->index==0 && qty!=ntohs(dbuf->blocks[0].pagehdr.npages)) {
+        if (page->index == 0 && qty != ntohs(dbuf->blocks[0].pagehdr.npages)) {
                printk("kAFS: %s(%lu): wrong number of dir blocks %d!=%hu\n",
-                       __FUNCTION__,dir->i_ino,qty,ntohs(dbuf->blocks[0].pagehdr.npages));
+                       __FUNCTION__, dir->i_ino, qty,
+                       ntohs(dbuf->blocks[0].pagehdr.npages));
                goto error;
        }
 #endif
@@ -157,13 +165,11 @@ static inline void afs_dir_check_page(struct inode *dir, struct page *page)
        SetPageChecked(page);
        return;
- error:
+error:
        SetPageChecked(page);
        SetPageError(page);
+}
-} /* end afs_dir_check_page() */
-/*****************************************************************************/
 /*
 * discard a page cached in the pagecache
 */
@@ -171,20 +177,22 @@ static inline void afs_dir_put_page(struct page *page)
 {
        kunmap(page);
        page_cache_release(page);
+}
-} /* end afs_dir_put_page() */
-/*****************************************************************************/
 /*
 * get a page into the pagecache
 */
-static struct page *afs_dir_get_page(struct inode *dir, unsigned long index)
+static struct page *afs_dir_get_page(struct inode *dir, unsigned long index,
+                                     struct key *key)
 {
        struct page *page;
+        struct file file = {
+                .private_data = key,
+        };
        _enter("{%lu},%lu", dir->i_ino, index);
-        page = read_mapping_page(dir->i_mapping, index, NULL);
+        page = read_mapping_page(dir->i_mapping, index, &file);
        if (!IS_ERR(page)) {
                wait_on_page_locked(page);
                kmap(page);
@@ -197,12 +205,12 @@ static struct page *afs_dir_get_page(struct inode *dir, unsigned long index)
        }
        return page;
- fail:
+fail:
        afs_dir_put_page(page);
+        _leave(" = -EIO");
        return ERR_PTR(-EIO);
-} /* end afs_dir_get_page() */
+}
-/*****************************************************************************/
 /*
 * open an AFS directory file
 */
@@ -213,15 +221,12 @@ static int afs_dir_open(struct inode *inode, struct file *file)
        BUILD_BUG_ON(sizeof(union afs_dir_block) != 2048);
        BUILD_BUG_ON(sizeof(union afs_dirent) != 32);
-        if (AFS_FS_I(inode)->flags & AFS_VNODE_DELETED)
+        if (test_bit(AFS_VNODE_DELETED, &AFS_FS_I(inode)->flags))
                return -ENOENT;
-        _leave(" = 0");
+        return afs_open(inode, file);
-        return 0;
+}
-} /* end afs_dir_open() */
-/*****************************************************************************/
 /*
 * deal with one block in an AFS directory
 */
@@ -250,7 +255,7 @@ static int afs_dir_iterate_block(unsigned *fpos,
                /* skip entries marked unused in the bitmap */
                if (!(block->pagehdr.bitmap[offset / 8] &
                      (1 << (offset % 8)))) {
-                        _debug("ENT[%Zu.%u]: unused\n",
+                        _debug("ENT[%Zu.%u]: unused",
                               blkoff / sizeof(union afs_dir_block), offset);
                        if (offset >= curr)
                                *fpos = blkoff +
@@ -264,7 +269,7 @@ static int afs_dir_iterate_block(unsigned *fpos,
                               sizeof(*block) -
                               offset * sizeof(union afs_dirent));
-                _debug("ENT[%Zu.%u]: %s %Zu \"%s\"\n",
+                _debug("ENT[%Zu.%u]: %s %Zu \"%s\"",
                       blkoff / sizeof(union afs_dir_block), offset,
                       (offset < curr ? "skip" : "fill"),
                       nlen, dire->u.name);
@@ -274,7 +279,7 @@ static int afs_dir_iterate_block(unsigned *fpos,
                        if (next >= AFS_DIRENT_PER_BLOCK) {
                                _debug("ENT[%Zu.%u]:"
                                       " %u travelled beyond end dir block"
-                                       " (len %u/%Zu)\n",
+                                       " (len %u/%Zu)",
                                       blkoff / sizeof(union afs_dir_block),
                                       offset, next, tmp, nlen);
                                return -EIO;
@@ -282,13 +287,13 @@ static int afs_dir_iterate_block(unsigned *fpos,
                        if (!(block->pagehdr.bitmap[next / 8] &
                              (1 << (next % 8)))) {
                                _debug("ENT[%Zu.%u]:"
-                                       " %u unmarked extension (len %u/%Zu)\n",
+                                       " %u unmarked extension (len %u/%Zu)",
                                       blkoff / sizeof(union afs_dir_block),
                                       offset, next, tmp, nlen);
                                return -EIO;
                        }
-                        _debug("ENT[%Zu.%u]: ext %u/%Zu\n",
+                        _debug("ENT[%Zu.%u]: ext %u/%Zu",
                               blkoff / sizeof(union afs_dir_block),
                               next, tmp, nlen);
                        next++;
@@ -304,7 +309,7 @@ static int afs_dir_iterate_block(unsigned *fpos,
                              nlen,
                              blkoff + offset * sizeof(union afs_dirent),
                              ntohl(dire->u.vnode),
-                              filldir == afs_dir_lookup_filldir ?
+                              filldir == afs_lookup_filldir ?
                              ntohl(dire->u.unique) : DT_UNKNOWN);
                if (ret < 0) {
                        _leave(" = 0 [full]");
@@ -316,16 +321,15 @@ static int afs_dir_iterate_block(unsigned *fpos,
        _leave(" = 1 [more]");
        return 1;
-} /* end afs_dir_iterate_block() */
+}
-/*****************************************************************************/
 /*
- * read an AFS directory
+ * iterate through the data blob that lists the contents of an AFS directory
 */
 static int afs_dir_iterate(struct inode *dir, unsigned *fpos, void *cookie,
-                           filldir_t filldir)
+                           filldir_t filldir, struct key *key)
 {
-        union afs_dir_block     *dblock;
+        union afs_dir_block *dblock;
        struct afs_dir_page *dbuf;
        struct page *page;
        unsigned blkoff, limit;
@@ -333,7 +337,7 @@ static int afs_dir_iterate(struct inode *dir, unsigned *fpos, void *cookie,
        _enter("{%lu},%u,,", dir->i_ino, *fpos);
-        if (AFS_FS_I(dir)->flags & AFS_VNODE_DELETED) {
+        if (test_bit(AFS_VNODE_DELETED, &AFS_FS_I(dir)->flags)) {
                _leave(" = -ESTALE");
                return -ESTALE;
        }
@@ -348,7 +352,7 @@ static int afs_dir_iterate(struct inode *dir, unsigned *fpos, void *cookie,
                blkoff = *fpos & ~(sizeof(union afs_dir_block) - 1);
                /* fetch the appropriate page from the directory */
-                page = afs_dir_get_page(dir, blkoff / PAGE_SIZE);
+                page = afs_dir_get_page(dir, blkoff / PAGE_SIZE, key);
                if (IS_ERR(page)) {
                        ret = PTR_ERR(page);
                        break;
@@ -377,43 +381,50 @@ static int afs_dir_iterate(struct inode *dir, unsigned *fpos, void *cookie,
                ret = 0;
        }
- out:
+out:
        _leave(" = %d", ret);
        return ret;
-} /* end afs_dir_iterate() */
+}
-/*****************************************************************************/
 /*
 * read an AFS directory
 */
-static int afs_dir_readdir(struct file *file, void *cookie, filldir_t filldir)
+static int afs_readdir(struct file *file, void *cookie, filldir_t filldir)
 {
        unsigned fpos;
        int ret;
-        _enter("{%Ld,{%lu}}", file->f_pos, file->f_path.dentry->d_inode->i_ino);
+        _enter("{%Ld,{%lu}}",
+               file->f_pos, file->f_path.dentry->d_inode->i_ino);
+        ASSERT(file->private_data != NULL);
        fpos = file->f_pos;
-        ret = afs_dir_iterate(file->f_path.dentry->d_inode, &fpos, cookie, filldir);
+        ret = afs_dir_iterate(file->f_path.dentry->d_inode, &fpos,
+                              cookie, filldir, file->private_data);
        file->f_pos = fpos;
        _leave(" = %d", ret);
        return ret;
-} /* end afs_dir_readdir() */
+}
-/*****************************************************************************/
 /*
 * search the directory for a name
 * - if afs_dir_iterate_block() spots this function, it'll pass the FID
 *   uniquifier through dtype
 */
-static int afs_dir_lookup_filldir(void *_cookie, const char *name, int nlen,
+static int afs_lookup_filldir(void *_cookie, const char *name, int nlen,
-                                  loff_t fpos, u64 ino, unsigned dtype)
+                              loff_t fpos, u64 ino, unsigned dtype)
 {
-        struct afs_dir_lookup_cookie *cookie = _cookie;
+        struct afs_lookup_cookie *cookie = _cookie;
-        _enter("{%s,%Zu},%s,%u,,%lu,%u",
+        _enter("{%s,%Zu},%s,%u,,%llu,%u",
-               cookie->name, cookie->nlen, name, nlen, ino, dtype);
+               cookie->name, cookie->nlen, name, nlen,
+               (unsigned long long) ino, dtype);
+        /* insanity checks first */
+        BUILD_BUG_ON(sizeof(union afs_dir_block) != 2048);
+        BUILD_BUG_ON(sizeof(union afs_dirent) != 32);
        if (cookie->nlen != nlen || memcmp(cookie->name, name, nlen) != 0) {
                _leave(" = 0 [no]");
@@ -426,216 +437,254 @@ static int afs_dir_lookup_filldir(void *_cookie, const char *name, int nlen,
        _leave(" = -1 [found]");
        return -1;
-} /* end afs_dir_lookup_filldir() */
+}
-/*****************************************************************************/
 /*
- * look up an entry in a directory
+ * do a lookup in a directory
+ * - just returns the FID the dentry name maps to if found
 */
-static struct dentry *afs_dir_lookup(struct inode *dir, struct dentry *dentry,
+static int afs_do_lookup(struct inode *dir, struct dentry *dentry,
-                                     struct nameidata *nd)
+                         struct afs_fid *fid, struct key *key)
 {
-        struct afs_dir_lookup_cookie cookie;
+        struct afs_lookup_cookie cookie;
        struct afs_super_info *as;
+        unsigned fpos;
+        int ret;
+        _enter("{%lu},%p{%s},", dir->i_ino, dentry, dentry->d_name.name);
+        as = dir->i_sb->s_fs_info;
+        /* search the directory */
+        cookie.name     = dentry->d_name.name;
+        cookie.nlen     = dentry->d_name.len;
+        cookie.fid.vid  = as->volume->vid;
+        cookie.found    = 0;
+        fpos = 0;
+        ret = afs_dir_iterate(dir, &fpos, &cookie, afs_lookup_filldir,
+                              key);
+        if (ret < 0) {
+                _leave(" = %d [iter]", ret);
+                return ret;
+        }
+        ret = -ENOENT;
+        if (!cookie.found) {
+                _leave(" = -ENOENT [not found]");
+                return -ENOENT;
+        }
+        *fid = cookie.fid;
+        _leave(" = 0 { vn=%u u=%u }", fid->vnode, fid->unique);
+        return 0;
+}
+/*
+ * look up an entry in a directory
+ */
+static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
+                                 struct nameidata *nd)
+{
        struct afs_vnode *vnode;
+        struct afs_fid fid;
        struct inode *inode;
-        unsigned fpos;
+        struct key *key;
        int ret;
-        _enter("{%lu},%p{%s}", dir->i_ino, dentry, dentry->d_name.name);
+        vnode = AFS_FS_I(dir);
-        /* insanity checks first */
+        _enter("{%x:%d},%p{%s},",
-        BUILD_BUG_ON(sizeof(union afs_dir_block) != 2048);
+               vnode->fid.vid, vnode->fid.vnode, dentry, dentry->d_name.name);
-        BUILD_BUG_ON(sizeof(union afs_dirent) != 32);
+        ASSERTCMP(dentry->d_inode, ==, NULL);
        if (dentry->d_name.len > 255) {
                _leave(" = -ENAMETOOLONG");
                return ERR_PTR(-ENAMETOOLONG);
        }
-        vnode = AFS_FS_I(dir);
+        if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) {
-        if (vnode->flags & AFS_VNODE_DELETED) {
                _leave(" = -ESTALE");
                return ERR_PTR(-ESTALE);
        }
-        as = dir->i_sb->s_fs_info;
+        key = afs_request_key(vnode->volume->cell);
+        if (IS_ERR(key)) {
-        /* search the directory */
+                _leave(" = %ld [key]", PTR_ERR(key));
-        cookie.name     = dentry->d_name.name;
+                return ERR_PTR(PTR_ERR(key));
-        cookie.nlen     = dentry->d_name.len;
+        }
-        cookie.fid.vid  = as->volume->vid;
-        cookie.found    = 0;
-        fpos = 0;
+        ret = afs_validate(vnode, key);
-        ret = afs_dir_iterate(dir, &fpos, &cookie, afs_dir_lookup_filldir);
        if (ret < 0) {
-                _leave(" = %d", ret);
+                key_put(key);
+                _leave(" = %d [val]", ret);
                return ERR_PTR(ret);
        }
-        ret = -ENOENT;
+        ret = afs_do_lookup(dir, dentry, &fid, key);
-        if (!cookie.found) {
+        if (ret < 0) {
-                _leave(" = %d", ret);
+                key_put(key);
+                if (ret == -ENOENT) {
+                        d_add(dentry, NULL);
+                        _leave(" = NULL [negative]");
+                        return NULL;
+                }
+                _leave(" = %d [do]", ret);
                return ERR_PTR(ret);
        }
+        dentry->d_fsdata = (void *)(unsigned long) vnode->status.data_version;
        /* instantiate the dentry */
-        ret = afs_iget(dir->i_sb, &cookie.fid, &inode);
+        inode = afs_iget(dir->i_sb, key, &fid, NULL, NULL);
-        if (ret < 0) {
+        key_put(key);
-                _leave(" = %d", ret);
+        if (IS_ERR(inode)) {
-                return ERR_PTR(ret);
+                _leave(" = %ld", PTR_ERR(inode));
+                return ERR_PTR(PTR_ERR(inode));
        }
        dentry->d_op = &afs_fs_dentry_operations;
-        dentry->d_fsdata = (void *) (unsigned long) vnode->status.version;
        d_add(dentry, inode);
        _leave(" = 0 { vn=%u u=%u } -> { ino=%lu v=%lu }",
-               cookie.fid.vnode,
+               fid.vnode,
-               cookie.fid.unique,
+               fid.unique,
               dentry->d_inode->i_ino,
               dentry->d_inode->i_version);
        return NULL;
-} /* end afs_dir_lookup() */
+}
-/*****************************************************************************/
 /*
 * check that a dentry lookup hit has found a valid entry
 * - NOTE! the hit can be a negative hit too, so we can't assume we have an
 *   inode
- * (derived from nfs_lookup_revalidate)
 */
 static int afs_d_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
-        struct afs_dir_lookup_cookie cookie;
+        struct afs_vnode *vnode, *dir;
+        struct afs_fid fid;
        struct dentry *parent;
-        struct inode *inode, *dir;
+        struct key *key;
-        unsigned fpos;
+        void *dir_version;
        int ret;
-        _enter("{sb=%p n=%s},", dentry->d_sb, dentry->d_name.name);
+        vnode = AFS_FS_I(dentry->d_inode);
-        /* lock down the parent dentry so we can peer at it */
+        if (dentry->d_inode)
-        parent = dget_parent(dentry->d_parent);
+                _enter("{v={%x:%u} n=%s fl=%lx},",
+                       vnode->fid.vid, vnode->fid.vnode, dentry->d_name.name,
+                       vnode->flags);
+        else
+                _enter("{neg n=%s}", dentry->d_name.name);
-        dir = parent->d_inode;
+        key = afs_request_key(AFS_FS_S(dentry->d_sb)->volume->cell);
-        inode = dentry->d_inode;
+        if (IS_ERR(key))
+                key = NULL;
-        /* handle a negative dentry */
+        /* lock down the parent dentry so we can peer at it */
-        if (!inode)
+        parent = dget_parent(dentry);
+        if (!parent->d_inode)
                goto out_bad;
-        /* handle a bad inode */
+        dir = AFS_FS_I(parent->d_inode);
-        if (is_bad_inode(inode)) {
-                printk("kAFS: afs_d_revalidate: %s/%s has bad inode\n",
-                       dentry->d_parent->d_name.name, dentry->d_name.name);
-                goto out_bad;
-        }
-        /* force a full look up if the parent directory changed since last the
+        /* validate the parent directory */
-         * server was consulted
+        if (test_bit(AFS_VNODE_MODIFIED, &dir->flags))
-         * - otherwise this inode must still exist, even if the inode details
+                afs_validate(dir, key);
-         *   themselves have changed
-         */
-        if (AFS_FS_I(dir)->flags & AFS_VNODE_CHANGED)
-                afs_vnode_fetch_status(AFS_FS_I(dir));
-        if (AFS_FS_I(dir)->flags & AFS_VNODE_DELETED) {
+        if (test_bit(AFS_VNODE_DELETED, &dir->flags)) {
                _debug("%s: parent dir deleted", dentry->d_name.name);
                goto out_bad;
        }
-        if (AFS_FS_I(inode)->flags & AFS_VNODE_DELETED) {
+        dir_version = (void *) (unsigned long) dir->status.data_version;
-                _debug("%s: file already deleted", dentry->d_name.name);
+        if (dentry->d_fsdata == dir_version)
-                goto out_bad;
+                goto out_valid; /* the dir contents are unchanged */
-        }
-        if ((unsigned long) dentry->d_fsdata !=
-            (unsigned long) AFS_FS_I(dir)->status.version) {
-                _debug("%s: parent changed %lu -> %u",
-                       dentry->d_name.name,
-                       (unsigned long) dentry->d_fsdata,
-                       (unsigned) AFS_FS_I(dir)->status.version);
-                /* search the directory for this vnode */
+        _debug("dir modified");
-                cookie.name     = dentry->d_name.name;
-                cookie.nlen     = dentry->d_name.len;
-                cookie.fid.vid  = AFS_FS_I(inode)->volume->vid;
-                cookie.found    = 0;
-                fpos = 0;
+        /* search the directory for this vnode */
-                ret = afs_dir_iterate(dir, &fpos, &cookie,
+        ret = afs_do_lookup(&dir->vfs_inode, dentry, &fid, key);
-                                      afs_dir_lookup_filldir);
+        switch (ret) {
-                if (ret < 0) {
+        case 0:
-                        _debug("failed to iterate dir %s: %d",
+                /* the filename maps to something */
-                               parent->d_name.name, ret);
+                if (!dentry->d_inode)
+                        goto out_bad;
+                if (is_bad_inode(dentry->d_inode)) {
+                        printk("kAFS: afs_d_revalidate: %s/%s has bad inode\n",
+                               parent->d_name.name, dentry->d_name.name);
                        goto out_bad;
-                }
-                if (!cookie.found) {
-                        _debug("%s: dirent not found", dentry->d_name.name);
-                        goto not_found;
                }
                /* if the vnode ID has changed, then the dirent points to a
                 * different file */
-                if (cookie.fid.vnode != AFS_FS_I(inode)->fid.vnode) {
+                if (fid.vnode != vnode->fid.vnode) {
-                        _debug("%s: dirent changed", dentry->d_name.name);
+                        _debug("%s: dirent changed [%u != %u]",
+                               dentry->d_name.name, fid.vnode,
+                               vnode->fid.vnode);
                        goto not_found;
                }
                /* if the vnode ID uniqifier has changed, then the file has
-                 * been deleted */
+                 * been deleted and replaced, and the original vnode ID has
-                if (cookie.fid.unique != AFS_FS_I(inode)->fid.unique) {
+                 * been reused */
+                if (fid.unique != vnode->fid.unique) {
                        _debug("%s: file deleted (uq %u -> %u I:%lu)",
-                               dentry->d_name.name,
+                               dentry->d_name.name, fid.unique,
-                               cookie.fid.unique,
+                               vnode->fid.unique, dentry->d_inode->i_version);
-                               AFS_FS_I(inode)->fid.unique,
+                        spin_lock(&vnode->lock);
-                               inode->i_version);
+                        set_bit(AFS_VNODE_DELETED, &vnode->flags);
-                        spin_lock(&AFS_FS_I(inode)->lock);
+                        spin_unlock(&vnode->lock);
-                        AFS_FS_I(inode)->flags |= AFS_VNODE_DELETED;
+                        goto not_found;
-                        spin_unlock(&AFS_FS_I(inode)->lock);
-                        invalidate_remote_inode(inode);
-                        goto out_bad;
                }
+                goto out_valid;
+        case -ENOENT:
+                /* the filename is unknown */
+                _debug("%s: dirent not found", dentry->d_name.name);
+                if (dentry->d_inode)
+                        goto not_found;
+                goto out_valid;
-                dentry->d_fsdata =
+        default:
-                        (void *) (unsigned long) AFS_FS_I(dir)->status.version;
+                _debug("failed to iterate dir %s: %d",
+                       parent->d_name.name, ret);
+                goto out_bad;
        }
- out_valid:
+out_valid:
+        dentry->d_fsdata = dir_version;
+out_skip:
        dput(parent);
+        key_put(key);
        _leave(" = 1 [valid]");
        return 1;
        /* the dirent, if it exists, now points to a different vnode */
- not_found:
+not_found:
        spin_lock(&dentry->d_lock);
        dentry->d_flags |= DCACHE_NFSFS_RENAMED;
        spin_unlock(&dentry->d_lock);
- out_bad:
+out_bad:
-        if (inode) {
+        if (dentry->d_inode) {
                /* don't unhash if we have submounts */
                if (have_submounts(dentry))
-                        goto out_valid;
+                        goto out_skip;
        }
-        shrink_dcache_parent(dentry);
        _debug("dropping dentry %s/%s",
-               dentry->d_parent->d_name.name, dentry->d_name.name);
+               parent->d_name.name, dentry->d_name.name);
+        shrink_dcache_parent(dentry);
        d_drop(dentry);
        dput(parent);
+        key_put(key);
        _leave(" = 0 [bad]");
        return 0;
-} /* end afs_d_revalidate() */
+}
-/*****************************************************************************/
 /*
 * allow the VFS to enquire as to whether a dentry should be unhashed (mustn't
 * sleep)
@@ -649,15 +698,444 @@ static int afs_d_delete(struct dentry *dentry)
        if (dentry->d_flags & DCACHE_NFSFS_RENAMED)
                goto zap;
-        if (dentry->d_inode) {
+        if (dentry->d_inode &&
-                if (AFS_FS_I(dentry->d_inode)->flags & AFS_VNODE_DELETED)
+            test_bit(AFS_VNODE_DELETED, &AFS_FS_I(dentry->d_inode)->flags))
                        goto zap;
-        }
        _leave(" = 0 [keep]");
        return 0;
- zap:
+zap:
        _leave(" = 1 [zap]");
        return 1;
-} /* end afs_d_delete() */
+}
+/*
+ * handle dentry release
+ * - the only action taken here is the _enter() trace of the dentry's name;
+ *   nothing attached to the dentry is freed by this function
+ */
+static void afs_d_release(struct dentry *dentry)
+{
+        _enter("%s", dentry->d_name.name);
+}
+/*
+ * create a directory on an AFS filesystem
+ * - the vnode is created through the parent directory's vnode, then an inode
+ *   is obtained for the returned FID and attached to the dentry
+ * - returns 0 on success; on failure the dentry is dropped and a negative
+ *   error code is returned
+ */
+static int afs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+{
+        struct afs_vnode *dvnode = AFS_FS_I(dir), *vnode;
+        struct afs_file_status status;
+        struct afs_callback cb;
+        struct afs_server *server;
+        struct afs_fid fid;
+        struct inode *inode;
+        struct key *key;
+        int ret;
+        _enter("{%x:%d},{%s},%o",
+               dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name, mode);
+        if (dentry->d_name.len > 255) {
+                ret = -ENAMETOOLONG;
+                goto drop_dentry;
+        }
+        key = afs_request_key(dvnode->volume->cell);
+        if (IS_ERR(key)) {
+                ret = PTR_ERR(key);
+                goto drop_dentry;
+        }
+        /* the directory type bit is added to the mode handed to the create
+         * call */
+        ret = afs_vnode_create(dvnode, key, dentry->d_name.name,
+                               mode | S_IFDIR, &fid, &status, &cb, &server);
+        if (ret < 0)
+                goto put_key;
+        inode = afs_iget(dir->i_sb, key, &fid, &status, &cb);
+        if (IS_ERR(inode)) {
+                /* no inode could be obtained - the directory that has just
+                 * been made on the server is simply abandoned there */
+                ret = PTR_ERR(inode);
+                goto put_server;
+        }
+        /* apply the status report we've got for the new vnode: the update
+         * count is bumped under the vnode lock before finalising */
+        vnode = AFS_FS_I(inode);
+        spin_lock(&vnode->lock);
+        vnode->update_cnt++;
+        spin_unlock(&vnode->lock);
+        afs_vnode_finalise_status_update(vnode, server);
+        afs_put_server(server);
+        /* attach the inode, rehashing the dentry if it isn't hashed */
+        d_instantiate(dentry, inode);
+        if (d_unhashed(dentry)) {
+                _debug("not hashed");
+                d_rehash(dentry);
+        }
+        key_put(key);
+        _leave(" = 0");
+        return 0;
+put_server:
+        afs_put_server(server);
+put_key:
+        key_put(key);
+drop_dentry:
+        d_drop(dentry);
+        _leave(" = %d", ret);
+        return ret;
+}
+/*
+ * remove a directory from an AFS filesystem
+ * - the removal is requested through the parent directory's vnode
+ * - if the dentry has an inode, its vnode is marked as deleted afterwards
+ * - returns 0 on success or a negative error code
+ */
+static int afs_rmdir(struct inode *dir, struct dentry *dentry)
+{
+        struct afs_vnode *dvnode = AFS_FS_I(dir), *vnode;
+        struct key *key;
+        int ret;
+        _enter("{%x:%d},{%s}",
+               dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name);
+        if (dentry->d_name.len > 255) {
+                ret = -ENAMETOOLONG;
+                goto failed;
+        }
+        key = afs_request_key(dvnode->volume->cell);
+        if (IS_ERR(key)) {
+                ret = PTR_ERR(key);
+                goto failed;
+        }
+        ret = afs_vnode_remove(dvnode, key, dentry->d_name.name, true);
+        if (ret < 0)
+                goto failed_put_key;
+        if (dentry->d_inode) {
+                /* note locally that the victim is gone: clear its link
+                 * count, set the deleted flag and call
+                 * afs_discard_callback_on_delete() on it */
+                vnode = AFS_FS_I(dentry->d_inode);
+                clear_nlink(&vnode->vfs_inode);
+                set_bit(AFS_VNODE_DELETED, &vnode->flags);
+                afs_discard_callback_on_delete(vnode);
+        }
+        key_put(key);
+        _leave(" = 0");
+        return 0;
+failed_put_key:
+        key_put(key);
+failed:
+        _leave(" = %d", ret);
+        return ret;
+}
+/*
+ * remove a file from an AFS filesystem
+ * - if the dentry has an inode, its vnode is validated before the removal is
+ *   requested and revalidated afterwards
+ * - returns 0 once the removal has succeeded, or a negative error code
+ * - the key obtained here is released on every path that follows its
+ *   acquisition
+ */
+static int afs_unlink(struct inode *dir, struct dentry *dentry)
+{
+        struct afs_vnode *dvnode, *vnode;
+        struct key *key;
+        int ret;
+        dvnode = AFS_FS_I(dir);
+        _enter("{%x:%d},{%s}",
+               dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name);
+        ret = -ENAMETOOLONG;
+        if (dentry->d_name.len > 255)
+                goto error;
+        key = afs_request_key(dvnode->volume->cell);
+        if (IS_ERR(key)) {
+                ret = PTR_ERR(key);
+                goto error;
+        }
+        if (dentry->d_inode) {
+                vnode = AFS_FS_I(dentry->d_inode);
+                /* make sure we have a callback promise on the victim */
+                ret = afs_validate(vnode, key);
+                if (ret < 0) {
+                        /* the key is already held at this point, so leave
+                         * through the label that puts it rather than
+                         * leaking the reference */
+                        goto remove_error;
+                }
+        }
+        ret = afs_vnode_remove(dvnode, key, dentry->d_name.name, false);
+        if (ret < 0)
+                goto remove_error;
+        if (dentry->d_inode) {
+                /* if the file wasn't deleted due to excess hard links, the
+                 * fileserver will break the callback promise on the file - if
+                 * it had one - before it returns to us, and if it was deleted,
+                 * it won't
+                 *
+                 * however, if we didn't have a callback promise outstanding,
+                 * or it was outstanding on a different server, then it won't
+                 * break it either...
+                 */
+                vnode = AFS_FS_I(dentry->d_inode);
+                if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
+                        _debug("AFS_VNODE_DELETED");
+                if (test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags))
+                        _debug("AFS_VNODE_CB_BROKEN");
+                /* mark the callback as broken ourselves and revalidate; the
+                 * result is only traced, as the removal itself has already
+                 * succeeded */
+                set_bit(AFS_VNODE_CB_BROKEN, &vnode->flags);
+                ret = afs_validate(vnode, key);
+                _debug("nlink %d [val %d]", vnode->vfs_inode.i_nlink, ret);
+        }
+        key_put(key);
+        _leave(" = 0");
+        return 0;
+remove_error:
+        key_put(key);
+error:
+        _leave(" = %d", ret);
+        return ret;
+}
+/*
+ * create a regular file on an AFS filesystem
+ * - the vnode is created through the parent directory's vnode, then an inode
+ *   is obtained for the returned FID and attached to the dentry
+ * - the nameidata argument is not used here
+ * - returns 0 on success; on failure the dentry is dropped and a negative
+ *   error code is returned
+ */
+static int afs_create(struct inode *dir, struct dentry *dentry, int mode,
+                      struct nameidata *nd)
+{
+        struct afs_vnode *dvnode = AFS_FS_I(dir), *vnode;
+        struct afs_file_status status;
+        struct afs_callback cb;
+        struct afs_server *server;
+        struct afs_fid fid;
+        struct inode *inode;
+        struct key *key;
+        int ret;
+        _enter("{%x:%d},{%s},%o,",
+               dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name, mode);
+        if (dentry->d_name.len > 255) {
+                ret = -ENAMETOOLONG;
+                goto drop_dentry;
+        }
+        key = afs_request_key(dvnode->volume->cell);
+        if (IS_ERR(key)) {
+                ret = PTR_ERR(key);
+                goto drop_dentry;
+        }
+        /* the regular-file type bit is added to the mode handed to the
+         * create call */
+        ret = afs_vnode_create(dvnode, key, dentry->d_name.name,
+                               mode | S_IFREG, &fid, &status, &cb, &server);
+        if (ret < 0)
+                goto put_key;
+        inode = afs_iget(dir->i_sb, key, &fid, &status, &cb);
+        if (IS_ERR(inode)) {
+                /* no inode could be obtained - the file that has just been
+                 * made on the server is simply abandoned there */
+                ret = PTR_ERR(inode);
+                goto put_server;
+        }
+        /* apply the status report we've got for the new vnode: the update
+         * count is bumped under the vnode lock before finalising */
+        vnode = AFS_FS_I(inode);
+        spin_lock(&vnode->lock);
+        vnode->update_cnt++;
+        spin_unlock(&vnode->lock);
+        afs_vnode_finalise_status_update(vnode, server);
+        afs_put_server(server);
+        /* attach the inode, rehashing the dentry if it isn't hashed */
+        d_instantiate(dentry, inode);
+        if (d_unhashed(dentry)) {
+                _debug("not hashed");
+                d_rehash(dentry);
+        }
+        key_put(key);
+        _leave(" = 0");
+        return 0;
+put_server:
+        afs_put_server(server);
+put_key:
+        key_put(key);
+drop_dentry:
+        d_drop(dentry);
+        _leave(" = %d", ret);
+        return ret;
+}
+/*
+ * create a hard link between files in an AFS filesystem
+ * - "from" is the existing dentry whose inode is being linked; "dentry" is
+ *   the new name to be made in "dir"
+ * - returns 0 on success; on failure the new dentry is dropped and a
+ *   negative error code is returned
+ */
+static int afs_link(struct dentry *from, struct inode *dir,
+                    struct dentry *dentry)
+{
+        struct afs_vnode *vnode = AFS_FS_I(from->d_inode);
+        struct afs_vnode *dvnode = AFS_FS_I(dir);
+        struct key *key;
+        int ret;
+        _enter("{%x:%d},{%x:%d},{%s}",
+               vnode->fid.vid, vnode->fid.vnode,
+               dvnode->fid.vid, dvnode->fid.vnode,
+               dentry->d_name.name);
+        if (dentry->d_name.len > 255) {
+                ret = -ENAMETOOLONG;
+                goto drop_dentry;
+        }
+        key = afs_request_key(dvnode->volume->cell);
+        if (IS_ERR(key)) {
+                ret = PTR_ERR(key);
+                goto drop_dentry;
+        }
+        ret = afs_vnode_link(dvnode, vnode, key, dentry->d_name.name);
+        if (ret < 0)
+                goto put_key;
+        /* the inode's usage count is raised before the inode is attached to
+         * the new dentry */
+        atomic_inc(&vnode->vfs_inode.i_count);
+        d_instantiate(dentry, &vnode->vfs_inode);
+        key_put(key);
+        _leave(" = 0");
+        return 0;
+put_key:
+        key_put(key);
+drop_dentry:
+        d_drop(dentry);
+        _leave(" = %d", ret);
+        return ret;
+}
+/*
+ * create a symlink in an AFS filesystem
+ * - names longer than 255 characters are refused with -ENAMETOOLONG and
+ *   link contents longer than 1023 characters with -EINVAL
+ * - no callback record is passed to afs_iget() for the new vnode
+ * - returns 0 on success; on failure the dentry is dropped and a negative
+ *   error code is returned
+ */
+static int afs_symlink(struct inode *dir, struct dentry *dentry,
+                       const char *content)
+{
+        struct afs_vnode *dvnode = AFS_FS_I(dir), *vnode;
+        struct afs_file_status status;
+        struct afs_server *server;
+        struct afs_fid fid;
+        struct inode *inode;
+        struct key *key;
+        int ret;
+        _enter("{%x:%d},{%s},%s",
+               dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name,
+               content);
+        if (dentry->d_name.len > 255) {
+                ret = -ENAMETOOLONG;
+                goto drop_dentry;
+        }
+        if (strlen(content) > 1023) {
+                ret = -EINVAL;
+                goto drop_dentry;
+        }
+        key = afs_request_key(dvnode->volume->cell);
+        if (IS_ERR(key)) {
+                ret = PTR_ERR(key);
+                goto drop_dentry;
+        }
+        ret = afs_vnode_symlink(dvnode, key, dentry->d_name.name, content,
+                                &fid, &status, &server);
+        if (ret < 0)
+                goto put_key;
+        inode = afs_iget(dir->i_sb, key, &fid, &status, NULL);
+        if (IS_ERR(inode)) {
+                /* no inode could be obtained - the symlink that has just
+                 * been made on the server is simply abandoned there */
+                ret = PTR_ERR(inode);
+                goto put_server;
+        }
+        /* apply the status report we've got for the new vnode: the update
+         * count is bumped under the vnode lock before finalising */
+        vnode = AFS_FS_I(inode);
+        spin_lock(&vnode->lock);
+        vnode->update_cnt++;
+        spin_unlock(&vnode->lock);
+        afs_vnode_finalise_status_update(vnode, server);
+        afs_put_server(server);
+        /* attach the inode, rehashing the dentry if it isn't hashed */
+        d_instantiate(dentry, inode);
+        if (d_unhashed(dentry)) {
+                _debug("not hashed");
+                d_rehash(dentry);
+        }
+        key_put(key);
+        _leave(" = 0");
+        return 0;
+put_server:
+        afs_put_server(server);
+put_key:
+        key_put(key);
+drop_dentry:
+        d_drop(dentry);
+        _leave(" = %d", ret);
+        return ret;
+}
+/*
+ * rename a file in an AFS filesystem and/or move it between directories
+ * - the key used for the operation is requested for the cell of the
+ *   original directory's volume
+ * - returns 0 on success; on failure the new dentry is dropped and a
+ *   negative error code is returned
+ */
+static int afs_rename(struct inode *old_dir, struct dentry *old_dentry,
+                      struct inode *new_dir, struct dentry *new_dentry)
+{
+        struct afs_vnode *vnode = AFS_FS_I(old_dentry->d_inode);
+        struct afs_vnode *orig_dvnode = AFS_FS_I(old_dir);
+        struct afs_vnode *new_dvnode = AFS_FS_I(new_dir);
+        struct key *key;
+        int ret;
+        _enter("{%x:%d},{%x:%d},{%x:%d},{%s}",
+               orig_dvnode->fid.vid, orig_dvnode->fid.vnode,
+               vnode->fid.vid, vnode->fid.vnode,
+               new_dvnode->fid.vid, new_dvnode->fid.vnode,
+               new_dentry->d_name.name);
+        /* only the length of the new name is checked here */
+        if (new_dentry->d_name.len > 255) {
+                ret = -ENAMETOOLONG;
+                goto drop_new_dentry;
+        }
+        key = afs_request_key(orig_dvnode->volume->cell);
+        if (IS_ERR(key)) {
+                ret = PTR_ERR(key);
+                goto drop_new_dentry;
+        }
+        ret = afs_vnode_rename(orig_dvnode, new_dvnode, key,
+                               old_dentry->d_name.name,
+                               new_dentry->d_name.name);
+        if (ret < 0)
+                goto put_key;
+        key_put(key);
+        _leave(" = 0");
+        return 0;
+put_key:
+        key_put(key);
+drop_new_dentry:
+        d_drop(new_dentry);
+        _leave(" = %d", ret);
+        return ret;
+}
diff --git a/fs/afs/errors.h b/fs/afs/errors.h
deleted file mode 100644
index 574d94ac8d05..000000000000
--- a/fs/afs/errors.h
+++ /dev/null
@@ -1,34 +0,0 @@
-/* errors.h: AFS abort/error codes
- *
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-#ifndef _LINUX_AFS_ERRORS_H
-#define _LINUX_AFS_ERRORS_H
-#include "types.h"
-/* file server abort codes */
-typedef enum {
-        VSALVAGE        = 101,  /* volume needs salvaging */
-        VNOVNODE        = 102,  /* no such file/dir (vnode) */
-        VNOVOL          = 103,  /* no such volume or volume unavailable */
-        VVOLEXISTS      = 104,  /* volume name already exists */
-        VNOSERVICE      = 105,  /* volume not currently in service */
-        VOFFLINE        = 106,  /* volume is currently offline (more info available [VVL-spec]) */
-        VONLINE         = 107,  /* volume is already online */
-        VDISKFULL       = 108,  /* disk partition is full */
-        VOVERQUOTA      = 109,  /* volume's maximum quota exceeded */
-        VBUSY           = 110,  /* volume is temporarily unavailable */
-        VMOVED          = 111,  /* volume moved to new server - ask this FS where */
-} afs_rxfs_abort_t;
-extern int afs_abort_to_error(int abortcode);
-#endif /* _LINUX_AFS_ERRORS_H */
diff --git a/fs/afs/file.c b/fs/afs/file.c
index b17634541f67..ae256498f4f7 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -1,6 +1,6 @@
-/* file.c: AFS filesystem file handling
+/* AFS filesystem file handling
 *
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 *
 * This program is free software; you can redistribute it and/or
@@ -15,22 +15,25 @@
 #include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
-#include "volume.h"
-#include "vnode.h"
-#include <rxrpc/call.h>
 #include "internal.h"
-#if 0
-static int afs_file_open(struct inode *inode, struct file *file);
-static int afs_file_release(struct inode *inode, struct file *file);
-#endif
 static int afs_file_readpage(struct file *file, struct page *page);
 static void afs_file_invalidatepage(struct page *page, unsigned long offset);
 static int afs_file_releasepage(struct page *page, gfp_t gfp_flags);
+/*
+ * file operations table for AFS files
+ * - open and release go through afs_open() and afs_release()
+ * - seeking, reading, mmap and sendfile use the generic VFS helpers
+ * - no write-side operation is set here and mmap is the read-only variant
+ */
+const struct file_operations afs_file_operations = {
+        .open           = afs_open,
+        .release        = afs_release,
+        .llseek         = generic_file_llseek,
+        .read           = do_sync_read,
+        .aio_read       = generic_file_aio_read,
+        .mmap           = generic_file_readonly_mmap,
+        .sendfile       = generic_file_sendfile,
+};
 const struct inode_operations afs_file_inode_operations = {
        .getattr        = afs_inode_getattr,
+        .permission     = afs_permission,
 };
 const struct address_space_operations afs_fs_aops = {
@@ -40,7 +43,48 @@ const struct address_space_operations afs_fs_aops = {
        .invalidatepage = afs_file_invalidatepage,
 };
-/*****************************************************************************/
+/*
+ * open an AFS file or directory and attach a key to it
+ * - a key is requested for the cell of the vnode's volume and the vnode is
+ *   validated with it
+ * - on success the key reference is stored in file->private_data, from where
+ *   afs_release() drops it
+ * - on failure a negative error code is returned and no key reference is kept
+ */
+int afs_open(struct inode *inode, struct file *file)
+{
+        struct afs_vnode *vnode = AFS_FS_I(inode);
+        struct key *key;
+        int ret;
+        _enter("{%x:%x},", vnode->fid.vid, vnode->fid.vnode);
+        key = afs_request_key(vnode->volume->cell);
+        if (IS_ERR(key)) {
+                _leave(" = %ld [key]", PTR_ERR(key));
+                return PTR_ERR(key);
+        }
+        ret = afs_validate(vnode, key);
+        if (ret < 0) {
+                /* the key was never handed over to the file, so the
+                 * reference must be dropped here or it is leaked */
+                key_put(key);
+                _leave(" = %d [val]", ret);
+                return ret;
+        }
+        /* ownership of the key reference passes to the file */
+        file->private_data = key;
+        _leave(" = 0");
+        return 0;
+}
+/*
+ * release an AFS file or directory, dropping the key that was stashed in the
+ * file's private data
+ */
+int afs_release(struct inode *inode, struct file *file)
+{
+        struct afs_vnode *vnode = AFS_FS_I(inode);
+        struct key *key = file->private_data;
+        _enter("{%x:%x},", vnode->fid.vid, vnode->fid.vnode);
+        key_put(key);
+        _leave(" = 0");
+        return 0;
+}
 /*
 * deal with notification that a page was read from the cache
 */
@@ -58,10 +102,9 @@ static void afs_file_readpage_read_complete(void *cookie_data,
                SetPageUptodate(page);
        unlock_page(page);
-} /* end afs_file_readpage_read_complete() */
+}
 #endif
-/*****************************************************************************/
 /*
 * deal with notification that a page was written to the cache
 */
@@ -74,41 +117,38 @@ static void afs_file_readpage_write_complete(void *cookie_data,
        _enter("%p,%p,%p,%d", cookie_data, page, data, error);
        unlock_page(page);
+}
-} /* end afs_file_readpage_write_complete() */
 #endif
-/*****************************************************************************/
 /*
 * AFS read page from file (or symlink)
 */
 static int afs_file_readpage(struct file *file, struct page *page)
 {
-        struct afs_rxfs_fetch_descriptor desc;
-#ifdef AFS_CACHING_SUPPORT
-        struct cachefs_page *pageio;
-#endif
        struct afs_vnode *vnode;
        struct inode *inode;
+        struct key *key;
+        size_t len;
+        off_t offset;
        int ret;
        inode = page->mapping->host;
-        _enter("{%lu},{%lu}", inode->i_ino, page->index);
+        ASSERT(file != NULL);
+        key = file->private_data;
+        ASSERT(key != NULL);
+        _enter("{%x},{%lu},{%lu}", key_serial(key), inode->i_ino, page->index);
        vnode = AFS_FS_I(inode);
        BUG_ON(!PageLocked(page));
        ret = -ESTALE;
-        if (vnode->flags & AFS_VNODE_DELETED)
+        if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
                goto error;
 #ifdef AFS_CACHING_SUPPORT
-        ret = cachefs_page_get_private(page, &pageio, GFP_NOIO);
-        if (ret < 0)
-                goto error;
        /* is it cached? */
        ret = cachefs_read_or_alloc_page(vnode->cache,
                                         page,
@@ -132,26 +172,19 @@ static int afs_file_readpage(struct file *file, struct page *page)
        case -ENOBUFS:
        case -ENODATA:
        default:
-                desc.fid        = vnode->fid;
+                offset = page->index << PAGE_CACHE_SHIFT;
-                desc.offset     = page->index << PAGE_CACHE_SHIFT;
+                len = min_t(size_t, i_size_read(inode) - offset, PAGE_SIZE);
-                desc.size       = min((size_t) (inode->i_size - desc.offset),
-                                      (size_t) PAGE_SIZE);
-                desc.buffer     = kmap(page);
-                clear_page(desc.buffer);
                /* read the contents of the file from the server into the
                 * page */
-                ret = afs_vnode_fetch_data(vnode, &desc);
+                ret = afs_vnode_fetch_data(vnode, key, offset, len, page);
-                kunmap(page);
                if (ret < 0) {
-                        if (ret==-ENOENT) {
+                        if (ret == -ENOENT) {
                                _debug("got NOENT from server"
                                       " - marking file deleted and stale");
-                                vnode->flags |= AFS_VNODE_DELETED;
+                                set_bit(AFS_VNODE_DELETED, &vnode->flags);
                                ret = -ESTALE;
                        }
 #ifdef AFS_CACHING_SUPPORT
                        cachefs_uncache_page(vnode->cache, page);
 #endif
@@ -178,16 +211,13 @@ static int afs_file_readpage(struct file *file, struct page *page)
        _leave(" = 0");
        return 0;
- error:
+error:
        SetPageError(page);
        unlock_page(page);
        _leave(" = %d", ret);
        return ret;
+}
-} /* end afs_file_readpage() */
-/*****************************************************************************/
 /*
 * get a page cookie for the specified page
 */
@@ -202,10 +232,9 @@ int afs_cache_get_page_cookie(struct page *page,
        _leave(" = %d", ret);
        return ret;
-} /* end afs_cache_get_page_cookie() */
+}
 #endif
-/*****************************************************************************/
 /*
 * invalidate part or all of a page
 */
@@ -240,9 +269,8 @@ static void afs_file_invalidatepage(struct page *page, unsigned long offset)
        }
        _leave(" = %d", ret);
-} /* end afs_file_invalidatepage() */
+}
-/*****************************************************************************/
 /*
 * release a page and cleanup its private data
 */
@@ -267,4 +295,4 @@ static int afs_file_releasepage(struct page *page, gfp_t gfp_flags)
        _leave(" = 0");
        return 0;
-} /* end afs_file_releasepage() */
+}
diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index 61bc371532ab..2393d2a08d79 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -1,6 +1,6 @@
-/* fsclient.c: AFS File Server client stubs
+/* AFS File Server client stubs
 *
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 *
 * This program is free software; you can redistribute it and/or
@@ -11,827 +11,927 @@
 #include <linux/init.h>
 #include <linux/sched.h>
-#include <rxrpc/rxrpc.h>
+#include <linux/circ_buf.h>
-#include <rxrpc/transport.h>
-#include <rxrpc/connection.h>
-#include <rxrpc/call.h>
-#include "fsclient.h"
-#include "cmservice.h"
-#include "vnode.h"
-#include "server.h"
-#include "errors.h"
 #include "internal.h"
+#include "afs_fs.h"
-#define FSFETCHSTATUS           132     /* AFS Fetch file status */
-#define FSFETCHDATA             130     /* AFS Fetch file data */
-#define FSGIVEUPCALLBACKS       147     /* AFS Discard callback promises */
-#define FSGETVOLUMEINFO         148     /* AFS Get root volume information */
-#define FSGETROOTVOLUME         151     /* AFS Get root volume name */
-#define FSLOOKUP                161     /* AFS lookup file in directory */
-/*****************************************************************************/
 /*
- * map afs abort codes to/from Linux error codes
+ * decode an AFSFid block
- * - called with call->lock held
 */
-static void afs_rxfs_aemap(struct rxrpc_call *call)
+static void xdr_decode_AFSFid(const __be32 **_bp, struct afs_fid *fid)
 {
-        switch (call->app_err_state) {
+        const __be32 *bp = *_bp;
-        case RXRPC_ESTATE_LOCAL_ABORT:
-                call->app_abort_code = -call->app_errno;
+        fid->vid                = ntohl(*bp++);
-                break;
+        fid->vnode              = ntohl(*bp++);
-        case RXRPC_ESTATE_PEER_ABORT:
+        fid->unique             = ntohl(*bp++);
-                call->app_errno = afs_abort_to_error(call->app_abort_code);
+        *_bp = bp;
-                break;
+}
-        default:
-                break;
-        }
-} /* end afs_rxfs_aemap() */
-/*****************************************************************************/
 /*
- * get the root volume name from a fileserver
+ * decode an AFSFetchStatus block
- * - this operation doesn't seem to work correctly in OpenAFS server 1.2.2
 */
-#if 0
+static void xdr_decode_AFSFetchStatus(const __be32 **_bp,
-int afs_rxfs_get_root_volume(struct afs_server *server,
+                                      struct afs_file_status *status,
-                             char *buf, size_t *buflen)
+                                      struct afs_vnode *vnode)
 {
-        struct rxrpc_connection *conn;
+        const __be32 *bp = *_bp;
-        struct rxrpc_call *call;
+        umode_t mode;
-        struct kvec piov[2];
+        u64 data_version, size;
-        size_t sent;
+        u32 changed = 0; /* becomes non-zero if ctime-type changes seen */
-        int ret;
-        u32 param[1];
+#define EXTRACT(DST)                            \
+        do {                                    \
+                u32 x = ntohl(*bp++);           \
+                changed |= DST - x;             \
+                DST = x;                        \
+        } while (0)
+        status->if_version = ntohl(*bp++);
+        EXTRACT(status->type);
+        EXTRACT(status->nlink);
+        size = ntohl(*bp++);
+        data_version = ntohl(*bp++);
+        EXTRACT(status->author);
+        EXTRACT(status->owner);
+        EXTRACT(status->caller_access); /* call ticket dependent */
+        EXTRACT(status->anon_access);
+        EXTRACT(status->mode);
+        EXTRACT(status->parent.vnode);
+        EXTRACT(status->parent.unique);
+        bp++; /* seg size */
+        status->mtime_client = ntohl(*bp++);
+        status->mtime_server = ntohl(*bp++);
+        EXTRACT(status->group);
+        bp++; /* sync counter */
+        data_version |= (u64) ntohl(*bp++) << 32;
+        bp++; /* lock count */
+        size |= (u64) ntohl(*bp++) << 32;
+        bp++; /* spare 4 */
+        *_bp = bp;
+        if (size != status->size) {
+                status->size = size;
+                changed |= true;
+        }
+        status->mode &= S_IALLUGO;
+        _debug("vnode time %lx, %lx",
+               status->mtime_client, status->mtime_server);
+        if (vnode) {
+                status->parent.vid = vnode->fid.vid;
+                if (changed && !test_bit(AFS_VNODE_UNSET, &vnode->flags)) {
+                        _debug("vnode changed");
+                        i_size_write(&vnode->vfs_inode, size);
+                        vnode->vfs_inode.i_uid = status->owner;
+                        vnode->vfs_inode.i_gid = status->group;
+                        vnode->vfs_inode.i_version = vnode->fid.unique;
+                        vnode->vfs_inode.i_nlink = status->nlink;
+                        mode = vnode->vfs_inode.i_mode;
+                        mode &= ~S_IALLUGO;
+                        mode |= status->mode;
+                        barrier();
+                        vnode->vfs_inode.i_mode = mode;
+                }
-        DECLARE_WAITQUEUE(myself, current);
+                vnode->vfs_inode.i_ctime.tv_sec = status->mtime_server;
+                vnode->vfs_inode.i_mtime        = vnode->vfs_inode.i_ctime;
+                vnode->vfs_inode.i_atime        = vnode->vfs_inode.i_ctime;
+        }
-        kenter("%p,%p,%u",server, buf, *buflen);
+        if (status->data_version != data_version) {
+                status->data_version = data_version;
+                if (vnode && !test_bit(AFS_VNODE_UNSET, &vnode->flags)) {
+                        _debug("vnode modified %llx on {%x:%u}",
+                               (unsigned long long) data_version,
+                               vnode->fid.vid, vnode->fid.vnode);
+                        set_bit(AFS_VNODE_MODIFIED, &vnode->flags);
+                        set_bit(AFS_VNODE_ZAP_DATA, &vnode->flags);
+                }
+        }
+}
-        /* get hold of the fileserver connection */
+/*
-        ret = afs_server_get_fsconn(server, &conn);
+ * decode an AFSCallBack block
-        if (ret < 0)
+ */
-                goto out;
+static void xdr_decode_AFSCallBack(const __be32 **_bp, struct afs_vnode *vnode)
+{
+        const __be32 *bp = *_bp;
-        /* create a call through that connection */
+        vnode->cb_version       = ntohl(*bp++);
-        ret = rxrpc_create_call(conn, NULL, NULL, afs_rxfs_aemap, &call);
+        vnode->cb_expiry        = ntohl(*bp++);
-        if (ret < 0) {
+        vnode->cb_type          = ntohl(*bp++);
-                printk("kAFS: Unable to create call: %d\n", ret);
+        vnode->cb_expires       = vnode->cb_expiry + get_seconds();
-                goto out_put_conn;
+        *_bp = bp;
-        }
+}
-        call->app_opcode = FSGETROOTVOLUME;
-        /* we want to get event notifications from the call */
+static void xdr_decode_AFSCallBack_raw(const __be32 **_bp,
-        add_wait_queue(&call->waitq, &myself);
+                                       struct afs_callback *cb)
+{
+        const __be32 *bp = *_bp;
-        /* marshall the parameters */
+        cb->version     = ntohl(*bp++);
-        param[0] = htonl(FSGETROOTVOLUME);
+        cb->expiry      = ntohl(*bp++);
+        cb->type        = ntohl(*bp++);
-        piov[0].iov_len = sizeof(param);
+        *_bp = bp;
-        piov[0].iov_base = param;
+}
-        /* send the parameters to the server */
-        ret = rxrpc_call_write_data(call, 1, piov, RXRPC_LAST_PACKET, GFP_NOFS,
-                                    0, &sent);
-        if (ret < 0)
-                goto abort;
-        /* wait for the reply to completely arrive */
-        for (;;) {
-                set_current_state(TASK_INTERRUPTIBLE);
-                if (call->app_call_state != RXRPC_CSTATE_CLNT_RCV_REPLY ||
-                    signal_pending(current))
-                        break;
-                schedule();
-        }
-        set_current_state(TASK_RUNNING);
-        ret = -EINTR;
+/*
-        if (signal_pending(current))
+ * decode an AFSVolSync block
-                goto abort;
+ */
+static void xdr_decode_AFSVolSync(const __be32 **_bp,
+                                  struct afs_volsync *volsync)
+{
+        const __be32 *bp = *_bp;
-        switch (call->app_call_state) {
+        volsync->creation = ntohl(*bp++);
-        case RXRPC_CSTATE_ERROR:
+        bp++; /* spare2 */
-                ret = call->app_errno;
+        bp++; /* spare3 */
-                kdebug("Got Error: %d", ret);
+        bp++; /* spare4 */
-                goto out_unwait;
+        bp++; /* spare5 */
+        bp++; /* spare6 */
+        *_bp = bp;
+}
-        case RXRPC_CSTATE_CLNT_GOT_REPLY:
+/*
-                /* read the reply */
+ * deliver reply data to an FS.FetchStatus
-                kdebug("Got Reply: qty=%d", call->app_ready_qty);
+ */
+static int afs_deliver_fs_fetch_status(struct afs_call *call,
+                                       struct sk_buff *skb, bool last)
+{
+        struct afs_vnode *vnode = call->reply;
+        const __be32 *bp;
-                ret = -EBADMSG;
+        _enter(",,%u", last);
-                if (call->app_ready_qty <= 4)
-                        goto abort;
-                ret = rxrpc_call_read_data(call, NULL, call->app_ready_qty, 0);
+        afs_transfer_reply(call, skb);
-                if (ret < 0)
+        if (!last)
-                        goto abort;
+                return 0;
-#if 0
+        if (call->reply_size != call->reply_max)
-                /* unmarshall the reply */
+                return -EBADMSG;
-                bp = buffer;
-                for (loop = 0; loop < 65; loop++)
-                        entry->name[loop] = ntohl(*bp++);
-                entry->name[64] = 0;
-                entry->type = ntohl(*bp++);
+        /* unmarshall the reply once we've received all of it */
-                entry->num_servers = ntohl(*bp++);
+        bp = call->buffer;
+        xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode);
+        xdr_decode_AFSCallBack(&bp, vnode);
+        if (call->reply2)
+                xdr_decode_AFSVolSync(&bp, call->reply2);
-                for (loop = 0; loop < 8; loop++)
+        _leave(" = 0 [done]");
-                        entry->servers[loop].addr.s_addr = *bp++;
+        return 0;
+}
-                for (loop = 0; loop < 8; loop++)
+/*
-                        entry->servers[loop].partition = ntohl(*bp++);
+ * FS.FetchStatus operation type
+ */
+static const struct afs_call_type afs_RXFSFetchStatus = {
+        .name           = "FS.FetchStatus",
+        .deliver        = afs_deliver_fs_fetch_status,
+        .abort_to_error = afs_abort_to_error,
+        .destructor     = afs_flat_call_destructor,
+};
-                for (loop = 0; loop < 8; loop++)
+/*
-                        entry->servers[loop].flags = ntohl(*bp++);
+ * fetch the status information for a file
+ */
+int afs_fs_fetch_file_status(struct afs_server *server,
+                             struct key *key,
+                             struct afs_vnode *vnode,
+                             struct afs_volsync *volsync,
+                             const struct afs_wait_mode *wait_mode)
+{
+        struct afs_call *call;
+        __be32 *bp;
-                for (loop = 0; loop < 3; loop++)
+        _enter(",%x,{%x:%d},,",
-                        entry->volume_ids[loop] = ntohl(*bp++);
+               key_serial(key), vnode->fid.vid, vnode->fid.vnode);
-                entry->clone_id = ntohl(*bp++);
+        call = afs_alloc_flat_call(&afs_RXFSFetchStatus, 16, (21 + 3 + 6) * 4);
-                entry->flags = ntohl(*bp);
+        if (!call)
-#endif
+                return -ENOMEM;
-                /* success */
+        call->key = key;
-                ret = 0;
+        call->reply = vnode;
-                goto out_unwait;
+        call->reply2 = volsync;
+        call->service_id = FS_SERVICE;
+        call->port = htons(AFS_FS_PORT);
-        default:
+        /* marshall the parameters */
-                BUG();
+        bp = call->request;
-        }
+        bp[0] = htonl(FSFETCHSTATUS);
+        bp[1] = htonl(vnode->fid.vid);
+        bp[2] = htonl(vnode->fid.vnode);
+        bp[3] = htonl(vnode->fid.unique);
+        return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
+}
- abort:
-        set_current_state(TASK_UNINTERRUPTIBLE);
-        rxrpc_call_abort(call, ret);
-        schedule();
- out_unwait:
-        set_current_state(TASK_RUNNING);
-        remove_wait_queue(&call->waitq, &myself);
-        rxrpc_put_call(call);
- out_put_conn:
-        afs_server_release_fsconn(server, conn);
- out:
-        kleave("");
-        return ret;
-} /* end afs_rxfs_get_root_volume() */
-#endif
-/*****************************************************************************/
 /*
- * get information about a volume
+ * deliver reply data to an FS.FetchData
+ * - call->unmarshall tracks the decoding stage across invocations:
+ *   0 = start, 1 = data length word, 2 = file data copied into the page
+ *   in call->reply3, 3 = status + callback + volsync block, 4 = trailer
+ * - each case deliberately falls through into the next stage
+ * - -EAGAIN from afs_extract_data() is turned into a 0 return; presumably
+ *   the function is called again once more data has arrived
+ * - a data length above PAGE_SIZE or trailing data yields -EBADMSG
  */
-#if 0
+static int afs_deliver_fs_fetch_data(struct afs_call *call,
-int afs_rxfs_get_volume_info(struct afs_server *server,
+                                     struct sk_buff *skb, bool last)
-                             const char *name,
-                             struct afs_volume_info *vinfo)
 {
-        struct rxrpc_connection *conn;
+        struct afs_vnode *vnode = call->reply;
-        struct rxrpc_call *call;
+        const __be32 *bp;
-        struct kvec piov[3];
+        struct page *page;
-        size_t sent;
+        void *buffer;
         int ret;
-        u32 param[2], *bp, zero;
-        DECLARE_WAITQUEUE(myself, current);
+        _enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
+        switch (call->unmarshall) {
+        case 0:
+                call->offset = 0;
+                call->unmarshall++;
+                /* extract the returned data length */
+        case 1:
+                _debug("extract data length");
+                ret = afs_extract_data(call, skb, last, &call->tmp, 4);
+                switch (ret) {
+                case 0:         break;
+                case -EAGAIN:   return 0;
+                default:        return ret;
+                }
-        _enter("%p,%s,%p", server, name, vinfo);
+                call->count = ntohl(call->tmp);
+                _debug("DATA length: %u", call->count);
+                if (call->count > PAGE_SIZE)
+                        return -EBADMSG;
+                call->offset = 0;
+                call->unmarshall++;
+                if (call->count < PAGE_SIZE) {
+                        /* short read: the data will fill bytes
+                         * [0, count) of the page, so clear the tail
+                         * [count, PAGE_SIZE) that it will not overwrite */
+                        buffer = kmap_atomic(call->reply3, KM_USER0);
+                        memset(buffer + call->count, 0,
+                               PAGE_SIZE - call->count);
+                        kunmap_atomic(buffer, KM_USER0);
+                }
-        /* get hold of the fileserver connection */
+                /* extract the returned data */
-        ret = afs_server_get_fsconn(server, &conn);
+        case 2:
-        if (ret < 0)
+                _debug("extract data");
-                goto out;
+                page = call->reply3;
+                buffer = kmap_atomic(page, KM_USER0);
+                ret = afs_extract_data(call, skb, last, buffer, call->count);
+                kunmap_atomic(buffer, KM_USER0);
+                switch (ret) {
+                case 0:         break;
+                case -EAGAIN:   return 0;
+                default:        return ret;
+                }
-        /* create a call through that connection */
+                call->offset = 0;
-        ret = rxrpc_create_call(conn, NULL, NULL, afs_rxfs_aemap, &call);
+                call->unmarshall++;
-        if (ret < 0) {
-                printk("kAFS: Unable to create call: %d\n", ret);
+                /* extract the metadata */
-                goto out_put_conn;
+        case 3:
-        }
+                /* 21 words of AFSFetchStatus, 3 of AFSCallBack and 6 of
+                 * AFSVolSync, matching what the decoders below consume */
+                ret = afs_extract_data(call, skb, last, call->buffer,
-        call->app_opcode = FSGETVOLUMEINFO;
+                                       (21 + 3 + 6) * 4);
+                switch (ret) {
+                case 0:         break;
+                case -EAGAIN:   return 0;
+                default:        return ret;
+                }
-        /* we want to get event notifications from the call */
+                bp = call->buffer;
-        add_wait_queue(&call->waitq, &myself);
+                xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode);
+                xdr_decode_AFSCallBack(&bp, vnode);
+                if (call->reply2)
+                        xdr_decode_AFSVolSync(&bp, call->reply2);
-        /* marshall the parameters */
+                call->offset = 0;
-        piov[1].iov_len = strlen(name);
+                call->unmarshall++;
-        piov[1].iov_base = (char *) name;
+        case 4:
-        zero = 0;
+                _debug("trailer");
-        piov[2].iov_len = (4 - (piov[1].iov_len & 3)) & 3;
+                if (skb->len != 0)
-        piov[2].iov_base = &zero;
+                        return -EBADMSG;
+                break;
-        param[0] = htonl(FSGETVOLUMEINFO);
-        param[1] = htonl(piov[1].iov_len);
-        piov[0].iov_len = sizeof(param);
-        piov[0].iov_base = param;
-        /* send the parameters to the server */
-        ret = rxrpc_call_write_data(call, 3, piov, RXRPC_LAST_PACKET, GFP_NOFS,
-                                    0, &sent);
-        if (ret < 0)
-                goto abort;
-        /* wait for the reply to completely arrive */
-        bp = rxrpc_call_alloc_scratch(call, 64);
-        ret = rxrpc_call_read_data(call, bp, 64,
-                                   RXRPC_CALL_READ_BLOCK |
-                                   RXRPC_CALL_READ_ALL);
-        if (ret < 0) {
-                if (ret == -ECONNABORTED) {
-                        ret = call->app_errno;
-                        goto out_unwait;
-                }
-                goto abort;
         }
-        /* unmarshall the reply */
+        if (!last)
-        vinfo->vid = ntohl(*bp++);
+                return 0;
-        vinfo->type = ntohl(*bp++);
+        _leave(" = 0 [done]");
-        vinfo->type_vids[0] = ntohl(*bp++);
+        return 0;
-        vinfo->type_vids[1] = ntohl(*bp++);
+}
-        vinfo->type_vids[2] = ntohl(*bp++);
-        vinfo->type_vids[3] = ntohl(*bp++);
-        vinfo->type_vids[4] = ntohl(*bp++);
-        vinfo->nservers = ntohl(*bp++);
-        vinfo->servers[0].addr.s_addr = *bp++;
-        vinfo->servers[1].addr.s_addr = *bp++;
-        vinfo->servers[2].addr.s_addr = *bp++;
-        vinfo->servers[3].addr.s_addr = *bp++;
-        vinfo->servers[4].addr.s_addr = *bp++;
-        vinfo->servers[5].addr.s_addr = *bp++;
-        vinfo->servers[6].addr.s_addr = *bp++;
-        vinfo->servers[7].addr.s_addr = *bp++;
-        ret = -EBADMSG;
-        if (vinfo->nservers > 8)
-                goto abort;
-        /* success */
-        ret = 0;
- out_unwait:
-        set_current_state(TASK_RUNNING);
-        remove_wait_queue(&call->waitq, &myself);
-        rxrpc_put_call(call);
- out_put_conn:
-        afs_server_release_fsconn(server, conn);
- out:
-        _leave("");
-        return ret;
- abort:
-        set_current_state(TASK_UNINTERRUPTIBLE);
-        rxrpc_call_abort(call, ret);
-        schedule();
-        goto out_unwait;
-} /* end afs_rxfs_get_volume_info() */
-#endif
-/*****************************************************************************/
 /*
- * fetch the status information for a file
+ * FS.FetchData operation type
+ */
+static const struct afs_call_type afs_RXFSFetchData = {
+        .name           = "FS.FetchData",
+        .deliver        = afs_deliver_fs_fetch_data,
+        .abort_to_error = afs_abort_to_error,
+        .destructor     = afs_flat_call_destructor,
+};
+/*
+ * fetch data from a file
 */
-int afs_rxfs_fetch_file_status(struct afs_server *server,
+int afs_fs_fetch_data(struct afs_server *server,
-                               struct afs_vnode *vnode,
+                      struct key *key,
-                               struct afs_volsync *volsync)
+                      struct afs_vnode *vnode,
+                      off_t offset, size_t length,
+                      struct page *buffer,
+                      const struct afs_wait_mode *wait_mode)
 {
-        struct afs_server_callslot callslot;
+        struct afs_call *call;
-        struct rxrpc_call *call;
-        struct kvec piov[1];
-        size_t sent;
-        int ret;
        __be32 *bp;
-        DECLARE_WAITQUEUE(myself, current);
+        _enter("");
-        _enter("%p,{%u,%u,%u}",
+        call = afs_alloc_flat_call(&afs_RXFSFetchData, 24, (21 + 3 + 6) * 4);
-               server, vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique);
+        if (!call)
+                return -ENOMEM;
-        /* get hold of the fileserver connection */
+        call->key = key;
-        ret = afs_server_request_callslot(server, &callslot);
+        call->reply = vnode;
-        if (ret < 0)
+        call->reply2 = NULL; /* volsync */
-                goto out;
+        call->reply3 = buffer;
+        call->service_id = FS_SERVICE;
-        /* create a call through that connection */
+        call->port = htons(AFS_FS_PORT);
-        ret = rxrpc_create_call(callslot.conn, NULL, NULL, afs_rxfs_aemap,
-                                &call);
-        if (ret < 0) {
-                printk("kAFS: Unable to create call: %d\n", ret);
-                goto out_put_conn;
-        }
-        call->app_opcode = FSFETCHSTATUS;
-        /* we want to get event notifications from the call */
-        add_wait_queue(&call->waitq, &myself);
        /* marshall the parameters */
-        bp = rxrpc_call_alloc_scratch(call, 16);
+        bp = call->request;
-        bp[0] = htonl(FSFETCHSTATUS);
+        bp[0] = htonl(FSFETCHDATA);
        bp[1] = htonl(vnode->fid.vid);
        bp[2] = htonl(vnode->fid.vnode);
        bp[3] = htonl(vnode->fid.unique);
+        bp[4] = htonl(offset);
+        bp[5] = htonl(length);
-        piov[0].iov_len = 16;
+        return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
-        piov[0].iov_base = bp;
+}
-        /* send the parameters to the server */
-        ret = rxrpc_call_write_data(call, 1, piov, RXRPC_LAST_PACKET, GFP_NOFS,
-                                    0, &sent);
-        if (ret < 0)
-                goto abort;
-        /* wait for the reply to completely arrive */
-        bp = rxrpc_call_alloc_scratch(call, 120);
-        ret = rxrpc_call_read_data(call, bp, 120,
-                                   RXRPC_CALL_READ_BLOCK |
-                                   RXRPC_CALL_READ_ALL);
-        if (ret < 0) {
-                if (ret == -ECONNABORTED) {
-                        ret = call->app_errno;
-                        goto out_unwait;
-                }
-                goto abort;
-        }
-        /* unmarshall the reply */
+/*
-        vnode->status.if_version        = ntohl(*bp++);
+ * deliver reply data to an FS.GiveUpCallBacks
-        vnode->status.type              = ntohl(*bp++);
+ */
-        vnode->status.nlink             = ntohl(*bp++);
+static int afs_deliver_fs_give_up_callbacks(struct afs_call *call,
-        vnode->status.size              = ntohl(*bp++);
+                                            struct sk_buff *skb, bool last)
-        vnode->status.version           = ntohl(*bp++);
+{
-        vnode->status.author            = ntohl(*bp++);
+        _enter(",{%u},%d", skb->len, last);
-        vnode->status.owner             = ntohl(*bp++);
-        vnode->status.caller_access     = ntohl(*bp++);
-        vnode->status.anon_access       = ntohl(*bp++);
-        vnode->status.mode              = ntohl(*bp++);
-        vnode->status.parent.vid        = vnode->fid.vid;
-        vnode->status.parent.vnode      = ntohl(*bp++);
-        vnode->status.parent.unique     = ntohl(*bp++);
-        bp++; /* seg size */
-        vnode->status.mtime_client      = ntohl(*bp++);
-        vnode->status.mtime_server      = ntohl(*bp++);
-        bp++; /* group */
-        bp++; /* sync counter */
-        vnode->status.version |= ((unsigned long long) ntohl(*bp++)) << 32;
-        bp++; /* spare2 */
-        bp++; /* spare3 */
-        bp++; /* spare4 */
-        vnode->cb_version               = ntohl(*bp++);
+        if (skb->len > 0)
-        vnode->cb_expiry                = ntohl(*bp++);
+                return -EBADMSG; /* shouldn't be any reply data */
-        vnode->cb_type                  = ntohl(*bp++);
+        return 0;
+}
-        if (volsync) {
-                volsync->creation       = ntohl(*bp++);
-                bp++; /* spare2 */
-                bp++; /* spare3 */
-                bp++; /* spare4 */
-                bp++; /* spare5 */
-                bp++; /* spare6 */
-        }
-        /* success */
-        ret = 0;
- out_unwait:
-        set_current_state(TASK_RUNNING);
-        remove_wait_queue(&call->waitq, &myself);
-        rxrpc_put_call(call);
- out_put_conn:
-        afs_server_release_callslot(server, &callslot);
- out:
-        _leave("");
-        return ret;
- abort:
-        set_current_state(TASK_UNINTERRUPTIBLE);
-        rxrpc_call_abort(call, ret);
-        schedule();
-        goto out_unwait;
-} /* end afs_rxfs_fetch_file_status() */
-/*****************************************************************************/
 /*
- * fetch the contents of a file or directory
+ * FS.GiveUpCallBacks operation type
 */
-int afs_rxfs_fetch_file_data(struct afs_server *server,
+static const struct afs_call_type afs_RXFSGiveUpCallBacks = {
-                             struct afs_vnode *vnode,
+        .name           = "FS.GiveUpCallBacks",
-                             struct afs_rxfs_fetch_descriptor *desc,
+        .deliver        = afs_deliver_fs_give_up_callbacks,
-                             struct afs_volsync *volsync)
+        .abort_to_error = afs_abort_to_error,
+        .destructor     = afs_flat_call_destructor,
+};
+/*
+ * give up a set of callbacks
+ * - the callbacks are held in the server->cb_break ring
+ */
+int afs_fs_give_up_callbacks(struct afs_server *server,
+                             const struct afs_wait_mode *wait_mode)
 {
-        struct afs_server_callslot callslot;
+        struct afs_call *call;
-        struct rxrpc_call *call;
+        size_t ncallbacks;
-        struct kvec piov[1];
+        __be32 *bp, *tp;
-        size_t sent;
+        int loop;
-        int ret;
-        __be32 *bp;
-        DECLARE_WAITQUEUE(myself, current);
+        ncallbacks = CIRC_CNT(server->cb_break_head, server->cb_break_tail,
+                              ARRAY_SIZE(server->cb_break));
-        _enter("%p,{fid={%u,%u,%u},sz=%Zu,of=%lu}",
-               server,
+        _enter("{%zu},", ncallbacks);
-               desc->fid.vid,
-               desc->fid.vnode,
+        if (ncallbacks == 0)
-               desc->fid.unique,
+                return 0;
-               desc->size,
+        if (ncallbacks > AFSCBMAX)
-               desc->offset);
+                ncallbacks = AFSCBMAX;
-        /* get hold of the fileserver connection */
+        _debug("break %zu callbacks", ncallbacks);
-        ret = afs_server_request_callslot(server, &callslot);
-        if (ret < 0)
-                goto out;
-        /* create a call through that connection */
-        ret = rxrpc_create_call(callslot.conn, NULL, NULL, afs_rxfs_aemap, &call);
-        if (ret < 0) {
-                printk("kAFS: Unable to create call: %d\n", ret);
-                goto out_put_conn;
-        }
-        call->app_opcode = FSFETCHDATA;
-        /* we want to get event notifications from the call */
+        call = afs_alloc_flat_call(&afs_RXFSGiveUpCallBacks,
-        add_wait_queue(&call->waitq, &myself);
+                                   12 + ncallbacks * 6 * 4, 0);
+        if (!call)
+                return -ENOMEM;
+        call->service_id = FS_SERVICE;
+        call->port = htons(AFS_FS_PORT);
        /* marshall the parameters */
-        bp = rxrpc_call_alloc_scratch(call, 24);
+        bp = call->request;
-        bp[0] = htonl(FSFETCHDATA);
+        tp = bp + 2 + ncallbacks * 3;
-        bp[1] = htonl(desc->fid.vid);
+        *bp++ = htonl(FSGIVEUPCALLBACKS);
-        bp[2] = htonl(desc->fid.vnode);
+        *bp++ = htonl(ncallbacks);
-        bp[3] = htonl(desc->fid.unique);
+        *tp++ = htonl(ncallbacks);
-        bp[4] = htonl(desc->offset);
-        bp[5] = htonl(desc->size);
+        atomic_sub(ncallbacks, &server->cb_break_n);
+        for (loop = ncallbacks; loop > 0; loop--) {
-        piov[0].iov_len = 24;
+                struct afs_callback *cb =
-        piov[0].iov_base = bp;
+                        &server->cb_break[server->cb_break_tail];
-        /* send the parameters to the server */
+                *bp++ = htonl(cb->fid.vid);
-        ret = rxrpc_call_write_data(call, 1, piov, RXRPC_LAST_PACKET, GFP_NOFS,
+                *bp++ = htonl(cb->fid.vnode);
-                                    0, &sent);
+                *bp++ = htonl(cb->fid.unique);
-        if (ret < 0)
+                *tp++ = htonl(cb->version);
-                goto abort;
+                *tp++ = htonl(cb->expiry);
+                *tp++ = htonl(cb->type);
-        /* wait for the data count to arrive */
+                smp_mb();
-        ret = rxrpc_call_read_data(call, bp, 4, RXRPC_CALL_READ_BLOCK);
+                server->cb_break_tail =
-        if (ret < 0)
+                        (server->cb_break_tail + 1) &
-                goto read_failed;
+                        (ARRAY_SIZE(server->cb_break) - 1);
-        desc->actual = ntohl(bp[0]);
-        if (desc->actual != desc->size) {
-                ret = -EBADMSG;
-                goto abort;
        }
-        /* call the app to read the actual data */
+        ASSERT(ncallbacks > 0);
-        rxrpc_call_reset_scratch(call);
+        wake_up_nr(&server->cb_break_waitq, ncallbacks);
-        ret = rxrpc_call_read_data(call, desc->buffer, desc->actual,
-                                   RXRPC_CALL_READ_BLOCK);
-        if (ret < 0)
-                goto read_failed;
-        /* wait for the rest of the reply to completely arrive */
-        rxrpc_call_reset_scratch(call);
-        bp = rxrpc_call_alloc_scratch(call, 120);
-        ret = rxrpc_call_read_data(call, bp, 120,
-                                   RXRPC_CALL_READ_BLOCK |
-                                   RXRPC_CALL_READ_ALL);
-        if (ret < 0)
-                goto read_failed;
-        /* unmarshall the reply */
-        vnode->status.if_version        = ntohl(*bp++);
-        vnode->status.type              = ntohl(*bp++);
-        vnode->status.nlink             = ntohl(*bp++);
-        vnode->status.size              = ntohl(*bp++);
-        vnode->status.version           = ntohl(*bp++);
-        vnode->status.author            = ntohl(*bp++);
-        vnode->status.owner             = ntohl(*bp++);
-        vnode->status.caller_access     = ntohl(*bp++);
-        vnode->status.anon_access       = ntohl(*bp++);
-        vnode->status.mode              = ntohl(*bp++);
-        vnode->status.parent.vid        = desc->fid.vid;
-        vnode->status.parent.vnode      = ntohl(*bp++);
-        vnode->status.parent.unique     = ntohl(*bp++);
-        bp++; /* seg size */
-        vnode->status.mtime_client      = ntohl(*bp++);
-        vnode->status.mtime_server      = ntohl(*bp++);
-        bp++; /* group */
-        bp++; /* sync counter */
-        vnode->status.version |= ((unsigned long long) ntohl(*bp++)) << 32;
-        bp++; /* spare2 */
-        bp++; /* spare3 */
-        bp++; /* spare4 */
-        vnode->cb_version               = ntohl(*bp++);
+        return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
-        vnode->cb_expiry                = ntohl(*bp++);
+}
-        vnode->cb_type                  = ntohl(*bp++);
-        if (volsync) {
-                volsync->creation       = ntohl(*bp++);
-                bp++; /* spare2 */
-                bp++; /* spare3 */
-                bp++; /* spare4 */
-                bp++; /* spare5 */
-                bp++; /* spare6 */
-        }
-        /* success */
+/*
-        ret = 0;
+ * deliver reply data to an FS.CreateFile or an FS.MakeDir
+ */
- out_unwait:
+static int afs_deliver_fs_create_vnode(struct afs_call *call,
-        set_current_state(TASK_RUNNING);
+                                       struct sk_buff *skb, bool last)
-        remove_wait_queue(&call->waitq,&myself);
+{
-        rxrpc_put_call(call);
+        struct afs_vnode *vnode = call->reply;
- out_put_conn:
+        const __be32 *bp;
-        afs_server_release_callslot(server, &callslot);
- out:
+        _enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
-        _leave(" = %d", ret);
-        return ret;
- read_failed:
-        if (ret == -ECONNABORTED) {
-                ret = call->app_errno;
-                goto out_unwait;
-        }
- abort:
+        afs_transfer_reply(call, skb);
-        set_current_state(TASK_UNINTERRUPTIBLE);
+        if (!last)
-        rxrpc_call_abort(call, ret);
+                return 0;
-        schedule();
-        goto out_unwait;
-} /* end afs_rxfs_fetch_file_data() */
+        if (call->reply_size != call->reply_max)
+                return -EBADMSG;
+        /* unmarshall the reply once we've received all of it */
+        bp = call->buffer;
+        xdr_decode_AFSFid(&bp, call->reply2);
+        xdr_decode_AFSFetchStatus(&bp, call->reply3, NULL);
+        xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode);
+        xdr_decode_AFSCallBack_raw(&bp, call->reply4);
+        /* xdr_decode_AFSVolSync(&bp, call->replyX); */
+        _leave(" = 0 [done]");
+        return 0;
+}
+/*
+ * FS.CreateFile and FS.MakeDir operation type
+ */
+static const struct afs_call_type afs_RXFSCreateXXXX = {
+        .name           = "FS.CreateXXXX",
+        .deliver        = afs_deliver_fs_create_vnode,
+        .abort_to_error = afs_abort_to_error,
+        .destructor     = afs_flat_call_destructor,
+};
-/*****************************************************************************/
 /*
- * ask the AFS fileserver to discard a callback request on a file
+ * create a file or make a directory
 */
-int afs_rxfs_give_up_callback(struct afs_server *server,
+int afs_fs_create(struct afs_server *server,
-                              struct afs_vnode *vnode)
+                  struct key *key,
+                  struct afs_vnode *vnode,
+                  const char *name,
+                  umode_t mode,
+                  struct afs_fid *newfid,
+                  struct afs_file_status *newstatus,
+                  struct afs_callback *newcb,
+                  const struct afs_wait_mode *wait_mode)
 {
-        struct afs_server_callslot callslot;
+        struct afs_call *call;
-        struct rxrpc_call *call;
+        size_t namesz, reqsz, padsz;
-        struct kvec piov[1];
-        size_t sent;
-        int ret;
        __be32 *bp;
-        DECLARE_WAITQUEUE(myself, current);
+        _enter("");
-        _enter("%p,{%u,%u,%u}",
+        namesz = strlen(name);
-               server, vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique);
+        padsz = (4 - (namesz & 3)) & 3;
+        reqsz = (5 * 4) + namesz + padsz + (6 * 4);
-        /* get hold of the fileserver connection */
+        call = afs_alloc_flat_call(&afs_RXFSCreateXXXX, reqsz,
-        ret = afs_server_request_callslot(server, &callslot);
+                                   (3 + 21 + 21 + 3 + 6) * 4);
-        if (ret < 0)
+        if (!call)
-                goto out;
+                return -ENOMEM;
-        /* create a call through that connection */
+        call->key = key;
-        ret = rxrpc_create_call(callslot.conn, NULL, NULL, afs_rxfs_aemap, &call);
+        call->reply = vnode;
-        if (ret < 0) {
+        call->reply2 = newfid;
-                printk("kAFS: Unable to create call: %d\n", ret);
+        call->reply3 = newstatus;
-                goto out_put_conn;
+        call->reply4 = newcb;
+        call->service_id = FS_SERVICE;
+        call->port = htons(AFS_FS_PORT);
+        /* marshall the parameters */
+        bp = call->request;
+        *bp++ = htonl(S_ISDIR(mode) ? FSMAKEDIR : FSCREATEFILE);
+        *bp++ = htonl(vnode->fid.vid);
+        *bp++ = htonl(vnode->fid.vnode);
+        *bp++ = htonl(vnode->fid.unique);
+        *bp++ = htonl(namesz);
+        memcpy(bp, name, namesz);
+        bp = (void *) bp + namesz;
+        if (padsz > 0) {
+                memset(bp, 0, padsz);
+                bp = (void *) bp + padsz;
        }
-        call->app_opcode = FSGIVEUPCALLBACKS;
+        *bp++ = htonl(AFS_SET_MODE);
+        *bp++ = 0; /* mtime */
+        *bp++ = 0; /* owner */
+        *bp++ = 0; /* group */
+        *bp++ = htonl(mode & S_IALLUGO); /* unix mode */
+        *bp++ = 0; /* segment size */
-        /* we want to get event notifications from the call */
+        return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
-        add_wait_queue(&call->waitq, &myself);
+}
-        /* marshall the parameters */
+/*
-        bp = rxrpc_call_alloc_scratch(call, (1 + 4 + 4) * 4);
+ * deliver reply data to an FS.RemoveFile or FS.RemoveDir
+ */
+static int afs_deliver_fs_remove(struct afs_call *call,
+                                 struct sk_buff *skb, bool last)
+{
+        struct afs_vnode *vnode = call->reply;
+        const __be32 *bp;
-        piov[0].iov_len = (1 + 4 + 4) * 4;
+        _enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
-        piov[0].iov_base = bp;
-        *bp++ = htonl(FSGIVEUPCALLBACKS);
+        afs_transfer_reply(call, skb);
-        *bp++ = htonl(1);
+        if (!last)
+                return 0;
+        if (call->reply_size != call->reply_max)
+                return -EBADMSG;
+        /* unmarshall the reply once we've received all of it */
+        bp = call->buffer;
+        xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode);
+        /* xdr_decode_AFSVolSync(&bp, call->replyX); */
+        _leave(" = 0 [done]");
+        return 0;
+}
+/*
+ * FS.RemoveDir/FS.RemoveFile operation type
+ */
+static const struct afs_call_type afs_RXFSRemoveXXXX = {
+        .name           = "FS.RemoveXXXX",
+        .deliver        = afs_deliver_fs_remove,
+        .abort_to_error = afs_abort_to_error,
+        .destructor     = afs_flat_call_destructor,
+};
+/*
+ * remove a file or directory
+ */
+int afs_fs_remove(struct afs_server *server,
+                  struct key *key,
+                  struct afs_vnode *vnode,
+                  const char *name,
+                  bool isdir,
+                  const struct afs_wait_mode *wait_mode)
+{
+        struct afs_call *call;
+        size_t namesz, reqsz, padsz;
+        __be32 *bp;
+        _enter("");
+        namesz = strlen(name);
+        padsz = (4 - (namesz & 3)) & 3;
+        reqsz = (5 * 4) + namesz + padsz;
+        call = afs_alloc_flat_call(&afs_RXFSRemoveXXXX, reqsz, (21 + 6) * 4);
+        if (!call)
+                return -ENOMEM;
+        call->key = key;
+        call->reply = vnode;
+        call->service_id = FS_SERVICE;
+        call->port = htons(AFS_FS_PORT);
+        /* marshall the parameters */
+        bp = call->request;
+        *bp++ = htonl(isdir ? FSREMOVEDIR : FSREMOVEFILE);
        *bp++ = htonl(vnode->fid.vid);
        *bp++ = htonl(vnode->fid.vnode);
        *bp++ = htonl(vnode->fid.unique);
-        *bp++ = htonl(1);
+        *bp++ = htonl(namesz);
-        *bp++ = htonl(vnode->cb_version);
+        memcpy(bp, name, namesz);
-        *bp++ = htonl(vnode->cb_expiry);
+        bp = (void *) bp + namesz;
-        *bp++ = htonl(vnode->cb_type);
+        if (padsz > 0) {
+                memset(bp, 0, padsz);
-        /* send the parameters to the server */
+                bp = (void *) bp + padsz;
-        ret = rxrpc_call_write_data(call, 1, piov, RXRPC_LAST_PACKET, GFP_NOFS,
-                                    0, &sent);
-        if (ret < 0)
-                goto abort;
-        /* wait for the reply to completely arrive */
-        for (;;) {
-                set_current_state(TASK_INTERRUPTIBLE);
-                if (call->app_call_state != RXRPC_CSTATE_CLNT_RCV_REPLY ||
-                    signal_pending(current))
-                        break;
-                schedule();
        }
-        set_current_state(TASK_RUNNING);
-        ret = -EINTR;
+        return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
-        if (signal_pending(current))
+}
-                goto abort;
-        switch (call->app_call_state) {
+/*
-        case RXRPC_CSTATE_ERROR:
+ * deliver reply data to an FS.Link
-                ret = call->app_errno;
+ */
-                goto out_unwait;
+static int afs_deliver_fs_link(struct afs_call *call,
+                               struct sk_buff *skb, bool last)
+{
+        struct afs_vnode *dvnode = call->reply, *vnode = call->reply2;
+        const __be32 *bp;
-        case RXRPC_CSTATE_CLNT_GOT_REPLY:
+        _enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
-                ret = 0;
-                goto out_unwait;
-        default:
+        afs_transfer_reply(call, skb);
-                BUG();
+        if (!last)
-        }
+                return 0;
+        if (call->reply_size != call->reply_max)
+                return -EBADMSG;
+        /* unmarshall the reply once we've received all of it */
+        bp = call->buffer;
+        xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode);
+        xdr_decode_AFSFetchStatus(&bp, &dvnode->status, dvnode);
+        /* xdr_decode_AFSVolSync(&bp, call->replyX); */
+        _leave(" = 0 [done]");
+        return 0;
+}
+/*
+ * FS.Link operation type
+ */
+static const struct afs_call_type afs_RXFSLink = {
+        .name           = "FS.Link",
+        .deliver        = afs_deliver_fs_link,
+        .abort_to_error = afs_abort_to_error,
+        .destructor     = afs_flat_call_destructor,
+};
- out_unwait:
-        set_current_state(TASK_RUNNING);
-        remove_wait_queue(&call->waitq, &myself);
-        rxrpc_put_call(call);
- out_put_conn:
-        afs_server_release_callslot(server, &callslot);
- out:
-        _leave("");
-        return ret;
- abort:
-        set_current_state(TASK_UNINTERRUPTIBLE);
-        rxrpc_call_abort(call, ret);
-        schedule();
-        goto out_unwait;
-} /* end afs_rxfs_give_up_callback() */
-/*****************************************************************************/
 /*
- * look a filename up in a directory
+ * make a hard link
- * - this operation doesn't seem to work correctly in OpenAFS server 1.2.2
 */
-#if 0
+int afs_fs_link(struct afs_server *server,
-int afs_rxfs_lookup(struct afs_server *server,
+                struct key *key,
-                    struct afs_vnode *dir,
+                struct afs_vnode *dvnode,
-                    const char *filename,
+                struct afs_vnode *vnode,
-                    struct afs_vnode *vnode,
+                const char *name,
-                    struct afs_volsync *volsync)
+                const struct afs_wait_mode *wait_mode)
 {
-        struct rxrpc_connection *conn;
+        struct afs_call *call;
-        struct rxrpc_call *call;
+        size_t namesz, reqsz, padsz;
-        struct kvec piov[3];
+        __be32 *bp;
-        size_t sent;
-        int ret;
-        u32 *bp, zero;
-        DECLARE_WAITQUEUE(myself, current);
+        _enter("");
-        kenter("%p,{%u,%u,%u},%s",
+        namesz = strlen(name);
-               server, fid->vid, fid->vnode, fid->unique, filename);
+        padsz = (4 - (namesz & 3)) & 3;
+        reqsz = (5 * 4) + namesz + padsz + (3 * 4);
-        /* get hold of the fileserver connection */
+        call = afs_alloc_flat_call(&afs_RXFSLink, reqsz, (21 + 21 + 6) * 4);
-        ret = afs_server_get_fsconn(server, &conn);
+        if (!call)
-        if (ret < 0)
+                return -ENOMEM;
-                goto out;
-        /* create a call through that connection */
+        call->key = key;
-        ret = rxrpc_create_call(conn, NULL, NULL, afs_rxfs_aemap, &call);
+        call->reply = dvnode;
-        if (ret < 0) {
+        call->reply2 = vnode;
-                printk("kAFS: Unable to create call: %d\n", ret);
+        call->service_id = FS_SERVICE;
-                goto out_put_conn;
+        call->port = htons(AFS_FS_PORT);
+        /* marshall the parameters */
+        bp = call->request;
+        *bp++ = htonl(FSLINK);
+        *bp++ = htonl(dvnode->fid.vid);
+        *bp++ = htonl(dvnode->fid.vnode);
+        *bp++ = htonl(dvnode->fid.unique);
+        *bp++ = htonl(namesz);
+        memcpy(bp, name, namesz);
+        bp = (void *) bp + namesz;
+        if (padsz > 0) {
+                memset(bp, 0, padsz);
+                bp = (void *) bp + padsz;
        }
-        call->app_opcode = FSLOOKUP;
+        *bp++ = htonl(vnode->fid.vid);
+        *bp++ = htonl(vnode->fid.vnode);
+        *bp++ = htonl(vnode->fid.unique);
+        return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
+}
+/*
+ * deliver reply data to an FS.Symlink
+ */
+static int afs_deliver_fs_symlink(struct afs_call *call,
+                                  struct sk_buff *skb, bool last)
+{
+        struct afs_vnode *vnode = call->reply;
+        const __be32 *bp;
-        /* we want to get event notifications from the call */
+        _enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
-        add_wait_queue(&call->waitq,&myself);
+        afs_transfer_reply(call, skb);
+        if (!last)
+                return 0;
+        if (call->reply_size != call->reply_max)
+                return -EBADMSG;
+        /* unmarshall the reply once we've received all of it */
+        bp = call->buffer;
+        xdr_decode_AFSFid(&bp, call->reply2);
+        xdr_decode_AFSFetchStatus(&bp, call->reply3, NULL);
+        xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode);
+        /* xdr_decode_AFSVolSync(&bp, call->replyX); */
+        _leave(" = 0 [done]");
+        return 0;
+}
+/*
+ * FS.Symlink operation type
+ */
+static const struct afs_call_type afs_RXFSSymlink = {
+        .name           = "FS.Symlink",
+        .deliver        = afs_deliver_fs_symlink,
+        .abort_to_error = afs_abort_to_error,
+        .destructor     = afs_flat_call_destructor,
+};
+/*
+ * create a symbolic link
+ */
+int afs_fs_symlink(struct afs_server *server,
+                   struct key *key,
+                   struct afs_vnode *vnode,
+                   const char *name,
+                   const char *contents,
+                   struct afs_fid *newfid,
+                   struct afs_file_status *newstatus,
+                   const struct afs_wait_mode *wait_mode)
+{
+        struct afs_call *call;
+        size_t namesz, reqsz, padsz, c_namesz, c_padsz;
+        __be32 *bp;
+        _enter("");
+        namesz = strlen(name);
+        padsz = (4 - (namesz & 3)) & 3;
+        c_namesz = strlen(contents);
+        c_padsz = (4 - (c_namesz & 3)) & 3;
+        reqsz = (6 * 4) + namesz + padsz + c_namesz + c_padsz + (6 * 4);
+        call = afs_alloc_flat_call(&afs_RXFSSymlink, reqsz,
+                                   (3 + 21 + 21 + 6) * 4);
+        if (!call)
+                return -ENOMEM;
+        call->key = key;
+        call->reply = vnode;
+        call->reply2 = newfid;
+        call->reply3 = newstatus;
+        call->service_id = FS_SERVICE;
+        call->port = htons(AFS_FS_PORT);
        /* marshall the parameters */
-        bp = rxrpc_call_alloc_scratch(call, 20);
+        bp = call->request;
+        *bp++ = htonl(FSSYMLINK);
-        zero = 0;
+        *bp++ = htonl(vnode->fid.vid);
+        *bp++ = htonl(vnode->fid.vnode);
-        piov[0].iov_len = 20;
+        *bp++ = htonl(vnode->fid.unique);
-        piov[0].iov_base = bp;
+        *bp++ = htonl(namesz);
-        piov[1].iov_len = strlen(filename);
+        memcpy(bp, name, namesz);
-        piov[1].iov_base = (char *) filename;
+        bp = (void *) bp + namesz;
-        piov[2].iov_len = (4 - (piov[1].iov_len & 3)) & 3;
+        if (padsz > 0) {
-        piov[2].iov_base = &zero;
+                memset(bp, 0, padsz);
+                bp = (void *) bp + padsz;
-        *bp++ = htonl(FSLOOKUP);
-        *bp++ = htonl(dirfid->vid);
-        *bp++ = htonl(dirfid->vnode);
-        *bp++ = htonl(dirfid->unique);
-        *bp++ = htonl(piov[1].iov_len);
-        /* send the parameters to the server */
-        ret = rxrpc_call_write_data(call, 3, piov, RXRPC_LAST_PACKET, GFP_NOFS,
-                                    0, &sent);
-        if (ret < 0)
-                goto abort;
-        /* wait for the reply to completely arrive */
-        bp = rxrpc_call_alloc_scratch(call, 220);
-        ret = rxrpc_call_read_data(call, bp, 220,
-                                   RXRPC_CALL_READ_BLOCK |
-                                   RXRPC_CALL_READ_ALL);
-        if (ret < 0) {
-                if (ret == -ECONNABORTED) {
-                        ret = call->app_errno;
-                        goto out_unwait;
-                }
-                goto abort;
        }
+        *bp++ = htonl(c_namesz);
+        memcpy(bp, contents, c_namesz);
+        bp = (void *) bp + c_namesz;
+        if (c_padsz > 0) {
+                memset(bp, 0, c_padsz);
+                bp = (void *) bp + c_padsz;
+        }
+        *bp++ = htonl(AFS_SET_MODE);
+        *bp++ = 0; /* mtime */
+        *bp++ = 0; /* owner */
+        *bp++ = 0; /* group */
+        *bp++ = htonl(S_IRWXUGO); /* unix mode */
+        *bp++ = 0; /* segment size */
-        /* unmarshall the reply */
+        return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
-        fid->vid                = ntohl(*bp++);
+}
-        fid->vnode              = ntohl(*bp++);
-        fid->unique             = ntohl(*bp++);
-        vnode->status.if_version        = ntohl(*bp++);
+/*
-        vnode->status.type              = ntohl(*bp++);
+ * deliver reply data to an FS.Rename
-        vnode->status.nlink             = ntohl(*bp++);
+ */
-        vnode->status.size              = ntohl(*bp++);
+static int afs_deliver_fs_rename(struct afs_call *call,
-        vnode->status.version           = ntohl(*bp++);
+                                  struct sk_buff *skb, bool last)
-        vnode->status.author            = ntohl(*bp++);
+{
-        vnode->status.owner             = ntohl(*bp++);
+        struct afs_vnode *orig_dvnode = call->reply, *new_dvnode = call->reply2;
-        vnode->status.caller_access     = ntohl(*bp++);
+        const __be32 *bp;
-        vnode->status.anon_access       = ntohl(*bp++);
-        vnode->status.mode              = ntohl(*bp++);
-        vnode->status.parent.vid        = dirfid->vid;
-        vnode->status.parent.vnode      = ntohl(*bp++);
-        vnode->status.parent.unique     = ntohl(*bp++);
-        bp++; /* seg size */
-        vnode->status.mtime_client      = ntohl(*bp++);
-        vnode->status.mtime_server      = ntohl(*bp++);
-        bp++; /* group */
-        bp++; /* sync counter */
-        vnode->status.version |= ((unsigned long long) ntohl(*bp++)) << 32;
-        bp++; /* spare2 */
-        bp++; /* spare3 */
-        bp++; /* spare4 */
-        dir->status.if_version          = ntohl(*bp++);
+        _enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
-        dir->status.type                = ntohl(*bp++);
-        dir->status.nlink               = ntohl(*bp++);
+        afs_transfer_reply(call, skb);
-        dir->status.size                = ntohl(*bp++);
+        if (!last)
-        dir->status.version             = ntohl(*bp++);
+                return 0;
-        dir->status.author              = ntohl(*bp++);
-        dir->status.owner               = ntohl(*bp++);
+        if (call->reply_size != call->reply_max)
-        dir->status.caller_access       = ntohl(*bp++);
+                return -EBADMSG;
-        dir->status.anon_access         = ntohl(*bp++);
-        dir->status.mode                = ntohl(*bp++);
+        /* unmarshall the reply once we've received all of it */
-        dir->status.parent.vid          = dirfid->vid;
+        bp = call->buffer;
-        dir->status.parent.vnode        = ntohl(*bp++);
+        xdr_decode_AFSFetchStatus(&bp, &orig_dvnode->status, orig_dvnode);
-        dir->status.parent.unique       = ntohl(*bp++);
+        if (new_dvnode != orig_dvnode)
-        bp++; /* seg size */
+                xdr_decode_AFSFetchStatus(&bp, &new_dvnode->status, new_dvnode);
-        dir->status.mtime_client        = ntohl(*bp++);
+        /* xdr_decode_AFSVolSync(&bp, call->replyX); */
-        dir->status.mtime_server        = ntohl(*bp++);
-        bp++; /* group */
+        _leave(" = 0 [done]");
-        bp++; /* sync counter */
+        return 0;
-        dir->status.version |= ((unsigned long long) ntohl(*bp++)) << 32;
+}
-        bp++; /* spare2 */
-        bp++; /* spare3 */
+/*
-        bp++; /* spare4 */
+ * FS.Rename operation type
+ */
+static const struct afs_call_type afs_RXFSRename = {
+        .name           = "FS.Rename",
+        .deliver        = afs_deliver_fs_rename,
+        .abort_to_error = afs_abort_to_error,
+        .destructor     = afs_flat_call_destructor,
+};
+/*
+ * rename a file or directory
+ */
+int afs_fs_rename(struct afs_server *server,
+                  struct key *key,
+                  struct afs_vnode *orig_dvnode,
+                  const char *orig_name,
+                  struct afs_vnode *new_dvnode,
+                  const char *new_name,
+                  const struct afs_wait_mode *wait_mode)
+{
+        struct afs_call *call;
+        size_t reqsz, o_namesz, o_padsz, n_namesz, n_padsz;
+        __be32 *bp;
+        _enter("");
+        o_namesz = strlen(orig_name);
+        o_padsz = (4 - (o_namesz & 3)) & 3;
+        n_namesz = strlen(new_name);
+        n_padsz = (4 - (n_namesz & 3)) & 3;
+        reqsz = (4 * 4) +
+                4 + o_namesz + o_padsz +
+                (3 * 4) +
+                4 + n_namesz + n_padsz;
+        call = afs_alloc_flat_call(&afs_RXFSRename, reqsz, (21 + 21 + 6) * 4);
+        if (!call)
+                return -ENOMEM;
+        call->key = key;
+        call->reply = orig_dvnode;
+        call->reply2 = new_dvnode;
+        call->service_id = FS_SERVICE;
+        call->port = htons(AFS_FS_PORT);
+        /* marshall the parameters */
+        bp = call->request;
+        *bp++ = htonl(FSRENAME);
+        *bp++ = htonl(orig_dvnode->fid.vid);
+        *bp++ = htonl(orig_dvnode->fid.vnode);
+        *bp++ = htonl(orig_dvnode->fid.unique);
+        *bp++ = htonl(o_namesz);
+        memcpy(bp, orig_name, o_namesz);
+        bp = (void *) bp + o_namesz;
+        if (o_padsz > 0) {
+                memset(bp, 0, o_padsz);
+                bp = (void *) bp + o_padsz;
+        }
-        callback->fid           = *fid;
+        *bp++ = htonl(new_dvnode->fid.vid);
-        callback->version       = ntohl(*bp++);
+        *bp++ = htonl(new_dvnode->fid.vnode);
-        callback->expiry        = ntohl(*bp++);
+        *bp++ = htonl(new_dvnode->fid.unique);
-        callback->type          = ntohl(*bp++);
+        *bp++ = htonl(n_namesz);
+        memcpy(bp, new_name, n_namesz);
-        if (volsync) {
+        bp = (void *) bp + n_namesz;
-                volsync->creation       = ntohl(*bp++);
+        if (n_padsz > 0) {
-                bp++; /* spare2 */
+                memset(bp, 0, n_padsz);
-                bp++; /* spare3 */
+                bp = (void *) bp + n_padsz;
-                bp++; /* spare4 */
-                bp++; /* spare5 */
-                bp++; /* spare6 */
        }
-        /* success */
+        return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
-        ret = 0;
+}
- out_unwait:
-        set_current_state(TASK_RUNNING);
-        remove_wait_queue(&call->waitq, &myself);
-        rxrpc_put_call(call);
- out_put_conn:
-        afs_server_release_fsconn(server, conn);
- out:
-        kleave("");
-        return ret;
- abort:
-        set_current_state(TASK_UNINTERRUPTIBLE);
-        rxrpc_call_abort(call, ret);
-        schedule();
-        goto out_unwait;
-} /* end afs_rxfs_lookup() */
-#endif
diff --git a/fs/afs/fsclient.h b/fs/afs/fsclient.h
deleted file mode 100644
index 8ba3e749ee3c..000000000000
--- a/fs/afs/fsclient.h
+++ /dev/null
@@ -1,54 +0,0 @@
-/* fsclient.h: AFS File Server client stub declarations
- *
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-#ifndef _LINUX_AFS_FSCLIENT_H
-#define _LINUX_AFS_FSCLIENT_H
-#include "server.h"
-extern int afs_rxfs_get_volume_info(struct afs_server *server,
-                                    const char *name,
-                                    struct afs_volume_info *vinfo);
-extern int afs_rxfs_fetch_file_status(struct afs_server *server,
-                                      struct afs_vnode *vnode,
-                                      struct afs_volsync *volsync);
-struct afs_rxfs_fetch_descriptor {
-        struct afs_fid  fid;            /* file ID to fetch */
-        size_t          size;           /* total number of bytes to fetch */
-        off_t           offset;         /* offset in file to start from */
-        void            *buffer;        /* read buffer */
-        size_t          actual;         /* actual size sent back by server */
-};
-extern int afs_rxfs_fetch_file_data(struct afs_server *server,
-                                    struct afs_vnode *vnode,
-                                    struct afs_rxfs_fetch_descriptor *desc,
-                                    struct afs_volsync *volsync);
-extern int afs_rxfs_give_up_callback(struct afs_server *server,
-                                     struct afs_vnode *vnode);
-/* this doesn't appear to work in OpenAFS server */
-extern int afs_rxfs_lookup(struct afs_server *server,
-                           struct afs_vnode *dir,
-                           const char *filename,
-                           struct afs_vnode *vnode,
-                           struct afs_volsync *volsync);
-/* this is apparently mis-implemented in OpenAFS server */
-extern int afs_rxfs_get_root_volume(struct afs_server *server,
-                                    char *buf,
-                                    size_t *buflen);
-#endif /* _LINUX_AFS_FSCLIENT_H */
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 9d9bca6c28b5..c184a4ee5995 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -19,9 +19,6 @@
 #include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
-#include "volume.h"
-#include "vnode.h"
-#include "super.h"
 #include "internal.h"
 struct afs_iget_data {
@@ -29,26 +26,25 @@ struct afs_iget_data {
        struct afs_volume       *volume;        /* volume on which resides */
 };
-/*****************************************************************************/
 /*
 * map the AFS file status to the inode member variables
 */
-static int afs_inode_map_status(struct afs_vnode *vnode)
+static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key)
 {
        struct inode *inode = AFS_VNODE_TO_I(vnode);
-        _debug("FS: ft=%d lk=%d sz=%Zu ver=%Lu mod=%hu",
+        _debug("FS: ft=%d lk=%d sz=%llu ver=%Lu mod=%hu",
               vnode->status.type,
               vnode->status.nlink,
-               vnode->status.size,
+               (unsigned long long) vnode->status.size,
-               vnode->status.version,
+               vnode->status.data_version,
               vnode->status.mode);
        switch (vnode->status.type) {
        case AFS_FTYPE_FILE:
                inode->i_mode   = S_IFREG | vnode->status.mode;
                inode->i_op     = &afs_file_inode_operations;
-                inode->i_fop    = &generic_ro_fops;
+                inode->i_fop    = &afs_file_operations;
                break;
        case AFS_FTYPE_DIR:
                inode->i_mode   = S_IFDIR | vnode->status.mode;
@@ -77,9 +73,9 @@ static int afs_inode_map_status(struct afs_vnode *vnode)
        /* check to see whether a symbolic link is really a mountpoint */
        if (vnode->status.type == AFS_FTYPE_SYMLINK) {
-                afs_mntpt_check_symlink(vnode);
+                afs_mntpt_check_symlink(vnode, key);
-                if (vnode->flags & AFS_VNODE_MOUNTPOINT) {
+                if (test_bit(AFS_VNODE_MOUNTPOINT, &vnode->flags)) {
                        inode->i_mode   = S_IFDIR | vnode->status.mode;
                        inode->i_op     = &afs_mntpt_inode_operations;
                        inode->i_fop    = &afs_mntpt_file_operations;
@@ -87,30 +83,8 @@ static int afs_inode_map_status(struct afs_vnode *vnode)
        }
        return 0;
-} /* end afs_inode_map_status() */
+}
-/*****************************************************************************/
-/*
- * attempt to fetch the status of an inode, coelescing multiple simultaneous
- * fetches
- */
-static int afs_inode_fetch_status(struct inode *inode)
-{
-        struct afs_vnode *vnode;
-        int ret;
-        vnode = AFS_FS_I(inode);
-        ret = afs_vnode_fetch_status(vnode);
-        if (ret == 0)
-                ret = afs_inode_map_status(vnode);
-        return ret;
-} /* end afs_inode_fetch_status() */
-/*****************************************************************************/
 /*
 * iget5() comparator
 */
@@ -120,9 +94,8 @@ static int afs_iget5_test(struct inode *inode, void *opaque)
        return inode->i_ino == data->fid.vnode &&
                inode->i_version == data->fid.unique;
-} /* end afs_iget5_test() */
+}
-/*****************************************************************************/
 /*
 * iget5() inode initialiser
 */
@@ -137,14 +110,14 @@ static int afs_iget5_set(struct inode *inode, void *opaque)
        vnode->volume = data->volume;
        return 0;
-} /* end afs_iget5_set() */
+}
-/*****************************************************************************/
 /*
 * inode retrieval
 */
-inline int afs_iget(struct super_block *sb, struct afs_fid *fid,
+struct inode *afs_iget(struct super_block *sb, struct key *key,
-                    struct inode **_inode)
+                       struct afs_fid *fid, struct afs_file_status *status,
+                       struct afs_callback *cb)
 {
        struct afs_iget_data data = { .fid = *fid };
        struct afs_super_info *as;
@@ -161,20 +134,18 @@ inline int afs_iget(struct super_block *sb, struct afs_fid *fid,
                             &data);
        if (!inode) {
                _leave(" = -ENOMEM");
-                return -ENOMEM;
+                return ERR_PTR(-ENOMEM);
        }
+        _debug("GOT INODE %p { vl=%x vn=%x, u=%x }",
+               inode, fid->vid, fid->vnode, fid->unique);
        vnode = AFS_FS_I(inode);
        /* deal with an existing inode */
        if (!(inode->i_state & I_NEW)) {
-                ret = afs_vnode_fetch_status(vnode);
+                _leave(" = %p", inode);
-                if (ret==0)
+                return inode;
-                        *_inode = inode;
-                else
-                        iput(inode);
-                _leave(" = %d", ret);
-                return ret;
        }
 #ifdef AFS_CACHING_SUPPORT
@@ -186,100 +157,185 @@ inline int afs_iget(struct super_block *sb, struct afs_fid *fid,
                               &vnode->cache);
 #endif
-        /* okay... it's a new inode */
+        if (!status) {
-        inode->i_flags |= S_NOATIME;
+                /* it's a remotely extant inode */
-        vnode->flags |= AFS_VNODE_CHANGED;
+                set_bit(AFS_VNODE_CB_BROKEN, &vnode->flags);
-        ret = afs_inode_fetch_status(inode);
+                ret = afs_vnode_fetch_status(vnode, NULL, key);
-        if (ret<0)
+                if (ret < 0)
+                        goto bad_inode;
+        } else {
+                /* it's an inode we just created */
+                memcpy(&vnode->status, status, sizeof(vnode->status));
+                if (!cb) {
+                        /* it's a symlink we just created (the fileserver
+                         * didn't give us a callback) */
+                        vnode->cb_version = 0;
+                        vnode->cb_expiry = 0;
+                        vnode->cb_type = 0;
+                        vnode->cb_expires = get_seconds();
+                } else {
+                        vnode->cb_version = cb->version;
+                        vnode->cb_expiry = cb->expiry;
+                        vnode->cb_type = cb->type;
+                        vnode->cb_expires = vnode->cb_expiry + get_seconds();
+                }
+        }
+        ret = afs_inode_map_status(vnode, key);
+        if (ret < 0)
                goto bad_inode;
        /* success */
+        clear_bit(AFS_VNODE_UNSET, &vnode->flags);
+        inode->i_flags |= S_NOATIME;
        unlock_new_inode(inode);
+        _leave(" = %p [CB { v=%u t=%u }]", inode, vnode->cb_version, vnode->cb_type);
-        *_inode = inode;
+        return inode;
-        _leave(" = 0 [CB { v=%u x=%lu t=%u }]",
-               vnode->cb_version,
-               vnode->cb_timeout.timo_jif,
-               vnode->cb_type);
-        return 0;
        /* failure */
- bad_inode:
+bad_inode:
        make_bad_inode(inode);
        unlock_new_inode(inode);
        iput(inode);
        _leave(" = %d [bad]", ret);
+        return ERR_PTR(ret);
+}
+/*
+ * validate a vnode/inode
+ * - there are several things we need to check
+ *   - parent dir data changes (rm, rmdir, rename, mkdir, create, link,
+ *     symlink)
+ *   - parent dir metadata changed (security changes)
+ *   - dentry data changed (write, truncate)
+ *   - dentry metadata changed (security changes)
+ */
+int afs_validate(struct afs_vnode *vnode, struct key *key)
+{
+        int ret;
+        _enter("{v={%x:%u} fl=%lx},%x",
+               vnode->fid.vid, vnode->fid.vnode, vnode->flags,
+               key_serial(key));
+        if (vnode->cb_promised &&
+            !test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags) &&
+            !test_bit(AFS_VNODE_MODIFIED, &vnode->flags) &&
+            !test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags)) {
+                if (vnode->cb_expires < get_seconds() + 10) {
+                        _debug("callback expired");
+                        set_bit(AFS_VNODE_CB_BROKEN, &vnode->flags);
+                } else {
+                        goto valid;
+                }
+        }
+        if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
+                goto valid;
+        mutex_lock(&vnode->validate_lock);
+        /* if the promise has expired, we need to check the server again to get
+         * a new promise - note that if the (parent) directory's metadata was
+         * changed then the security may be different and we may no longer have
+         * access */
+        if (!vnode->cb_promised ||
+            test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags)) {
+                _debug("not promised");
+                ret = afs_vnode_fetch_status(vnode, NULL, key);
+                if (ret < 0)
+                        goto error_unlock;
+                _debug("new promise [fl=%lx]", vnode->flags);
+        }
+        if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) {
+                _debug("file already deleted");
+                ret = -ESTALE;
+                goto error_unlock;
+        }
+        /* if the vnode's data version number changed then its contents are
+         * different */
+        if (test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags)) {
+                _debug("zap data {%x:%d}", vnode->fid.vid, vnode->fid.vnode);
+                invalidate_remote_inode(&vnode->vfs_inode);
+        }
+        clear_bit(AFS_VNODE_MODIFIED, &vnode->flags);
+        mutex_unlock(&vnode->validate_lock);
+valid:
+        _leave(" = 0");
+        return 0;
+error_unlock:
+        mutex_unlock(&vnode->validate_lock);
+        _leave(" = %d", ret);
        return ret;
-} /* end afs_iget() */
+}
-/*****************************************************************************/
 /*
 * read the attributes of an inode
 */
 int afs_inode_getattr(struct vfsmount *mnt, struct dentry *dentry,
                      struct kstat *stat)
 {
-        struct afs_vnode *vnode;
        struct inode *inode;
-        int ret;
        inode = dentry->d_inode;
        _enter("{ ino=%lu v=%lu }", inode->i_ino, inode->i_version);
-        vnode = AFS_FS_I(inode);
-        ret = afs_inode_fetch_status(inode);
-        if (ret == -ENOENT) {
-                _leave(" = %d [%d %p]",
-                       ret, atomic_read(&dentry->d_count), dentry->d_inode);
-                return ret;
-        }
-        else if (ret < 0) {
-                make_bad_inode(inode);
-                _leave(" = %d", ret);
-                return ret;
-        }
-        /* transfer attributes from the inode structure to the stat
-         * structure */
        generic_fillattr(inode, stat);
-        _leave(" = 0 CB { v=%u x=%u t=%u }",
-               vnode->cb_version,
-               vnode->cb_expiry,
-               vnode->cb_type);
        return 0;
-} /* end afs_inode_getattr() */
+}
-/*****************************************************************************/
 /*
 * clear an AFS inode
 */
 void afs_clear_inode(struct inode *inode)
 {
+        struct afs_permits *permits;
        struct afs_vnode *vnode;
        vnode = AFS_FS_I(inode);
-        _enter("ino=%lu { vn=%08x v=%u x=%u t=%u }",
+        _enter("{%x:%d.%d} v=%u x=%u t=%u }",
-               inode->i_ino,
+               vnode->fid.vid,
               vnode->fid.vnode,
+               vnode->fid.unique,
               vnode->cb_version,
               vnode->cb_expiry,
-               vnode->cb_type
+               vnode->cb_type);
-               );
-        BUG_ON(inode->i_ino != vnode->fid.vnode);
+        _debug("CLEAR INODE %p", inode);
-        afs_vnode_give_up_callback(vnode);
+        ASSERTCMP(inode->i_ino, ==, vnode->fid.vnode);
+        afs_give_up_callback(vnode);
+        if (vnode->server) {
+                spin_lock(&vnode->server->fs_lock);
+                rb_erase(&vnode->server_rb, &vnode->server->fs_vnodes);
+                spin_unlock(&vnode->server->fs_lock);
+                afs_put_server(vnode->server);
+                vnode->server = NULL;
+        }
+        ASSERT(!vnode->cb_promised);
 #ifdef AFS_CACHING_SUPPORT
        cachefs_relinquish_cookie(vnode->cache, 0);
        vnode->cache = NULL;
 #endif
+        mutex_lock(&vnode->permits_lock);
+        permits = vnode->permits;
+        rcu_assign_pointer(vnode->permits, NULL);
+        mutex_unlock(&vnode->permits_lock);
+        if (permits)
+                call_rcu(&permits->rcu, afs_zap_permits);
        _leave("");
-} /* end afs_clear_inode() */
+}
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 5151d5da2c2f..34665f7d7a19 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -1,6 +1,6 @@
-/* internal.h: internal AFS stuff
+/* internal AFS stuff
 *
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 *
 * This program is free software; you can redistribute it and/or
@@ -9,48 +9,391 @@
 * 2 of the License, or (at your option) any later version.
 */
-#ifndef AFS_INTERNAL_H
-#define AFS_INTERNAL_H
 #include <linux/compiler.h>
 #include <linux/kernel.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
+#include <linux/skbuff.h>
+#include <linux/rxrpc.h>
+#include <linux/key.h>
+#include "afs.h"
+#include "afs_vl.h"
+#define AFS_CELL_MAX_ADDRS 15
+struct afs_call;
+typedef enum {
+        AFS_VL_NEW,                     /* new, uninitialised record */
+        AFS_VL_CREATING,                /* creating record */
+        AFS_VL_VALID,                   /* record is pending */
+        AFS_VL_NO_VOLUME,               /* no such volume available */
+        AFS_VL_UPDATING,                /* update in progress */
+        AFS_VL_VOLUME_DELETED,          /* volume was deleted */
+        AFS_VL_UNCERTAIN,               /* uncertain state (update failed) */
+} __attribute__((packed)) afs_vlocation_state_t;
+struct afs_mount_params {
+        bool                    rwpath;         /* T if the parent should be considered R/W */
+        bool                    force;          /* T to force cell type */
+        afs_voltype_t           type;           /* type of volume requested */
+        int                     volnamesz;      /* size of volume name */
+        const char              *volname;       /* name of volume to mount */
+        struct afs_cell         *cell;          /* cell in which to find volume */
+        struct afs_volume       *volume;        /* volume record */
+        struct key              *key;           /* key to use for secure mounting */
+};
 /*
- * debug tracing
+ * definition of how to wait for the completion of an operation
 */
-#define kenter(FMT, a...)       printk("==> %s("FMT")\n",__FUNCTION__ , ## a)
+struct afs_wait_mode {
-#define kleave(FMT, a...)       printk("<== %s()"FMT"\n",__FUNCTION__ , ## a)
+        /* RxRPC received message notification */
-#define kdebug(FMT, a...)       printk(FMT"\n" , ## a)
+        void (*rx_wakeup)(struct afs_call *call);
-#define kproto(FMT, a...)       printk("### "FMT"\n" , ## a)
-#define knet(FMT, a...)         printk(FMT"\n" , ## a)
-#ifdef __KDEBUG
-#define _enter(FMT, a...)       kenter(FMT , ## a)
-#define _leave(FMT, a...)       kleave(FMT , ## a)
-#define _debug(FMT, a...)       kdebug(FMT , ## a)
-#define _proto(FMT, a...)       kproto(FMT , ## a)
-#define _net(FMT, a...)         knet(FMT , ## a)
-#else
-#define _enter(FMT, a...)       do { } while(0)
-#define _leave(FMT, a...)       do { } while(0)
-#define _debug(FMT, a...)       do { } while(0)
-#define _proto(FMT, a...)       do { } while(0)
-#define _net(FMT, a...)         do { } while(0)
-#endif
-static inline void afs_discard_my_signals(void)
+        /* synchronous call waiter and call dispatched notification */
-{
+        int (*wait)(struct afs_call *call);
-        while (signal_pending(current)) {
-                siginfo_t sinfo;
+        /* asynchronous call completion */
+        void (*async_complete)(void *reply, int error);
+};
+extern const struct afs_wait_mode afs_sync_call;
+extern const struct afs_wait_mode afs_async_call;
-                spin_lock_irq(&current->sighand->siglock);
+/*
-                dequeue_signal(current,&current->blocked, &sinfo);
+ * a record of an in-progress RxRPC call
-                spin_unlock_irq(&current->sighand->siglock);
+ */
-        }
+struct afs_call {
+        const struct afs_call_type *type;       /* type of call */
+        const struct afs_wait_mode *wait_mode;  /* completion wait mode */
+        wait_queue_head_t       waitq;          /* processes awaiting completion */
+        struct work_struct      async_work;     /* asynchronous work processor */
+        struct work_struct      work;           /* actual work processor */
+        struct sk_buff_head     rx_queue;       /* received packets */
+        struct rxrpc_call       *rxcall;        /* RxRPC call handle */
+        struct key              *key;           /* security for this call */
+        struct afs_server       *server;        /* server affected by incoming CM call */
+        void                    *request;       /* request data (first part) */
+        void                    *request2;      /* request data (second part) */
+        void                    *buffer;        /* reply receive buffer */
+        void                    *reply;         /* reply buffer (first part) */
+        void                    *reply2;        /* reply buffer (second part) */
+        void                    *reply3;        /* reply buffer (third part) */
+        void                    *reply4;        /* reply buffer (fourth part) */
+        enum {                                  /* call state */
+                AFS_CALL_REQUESTING,    /* request is being sent for outgoing call */
+                AFS_CALL_AWAIT_REPLY,   /* awaiting reply to outgoing call */
+                AFS_CALL_AWAIT_OP_ID,   /* awaiting op ID on incoming call */
+                AFS_CALL_AWAIT_REQUEST, /* awaiting request data on incoming call */
+                AFS_CALL_REPLYING,      /* replying to incoming call */
+                AFS_CALL_AWAIT_ACK,     /* awaiting final ACK of incoming call */
+                AFS_CALL_COMPLETE,      /* successfully completed */
+                AFS_CALL_BUSY,          /* server was busy */
+                AFS_CALL_ABORTED,       /* call was aborted */
+                AFS_CALL_ERROR,         /* call failed due to error */
+        }                       state;
+        int                     error;          /* error code */
+        unsigned                request_size;   /* size of request data */
+        unsigned                reply_max;      /* maximum size of reply */
+        unsigned                reply_size;     /* current size of reply */
+        unsigned short          offset;         /* offset into received data store */
+        unsigned char           unmarshall;     /* unmarshalling phase */
+        bool                    incoming;       /* T if incoming call */
+        u16                     service_id;     /* RxRPC service ID to call */
+        __be16                  port;           /* target UDP port */
+        __be32                  operation_ID;   /* operation ID for an incoming call */
+        u32                     count;          /* count for use in unmarshalling */
+        __be32                  tmp;            /* place to extract temporary data */
+};
+struct afs_call_type {
+        const char *name;
+        /* deliver request or reply data to a call
+         * - returning an error will cause the call to be aborted
+         */
+        int (*deliver)(struct afs_call *call, struct sk_buff *skb,
+                       bool last);
+        /* map an abort code to an error number */
+        int (*abort_to_error)(u32 abort_code);
+        /* clean up a call */
+        void (*destructor)(struct afs_call *call);
+};
+/*
+ * AFS superblock private data
+ * - there's one superblock per volume
+ */
+struct afs_super_info {
+        struct afs_volume       *volume;        /* volume record */
+        char                    rwparent;       /* T if parent is R/W AFS volume */
+};
+static inline struct afs_super_info *AFS_FS_S(struct super_block *sb)
+{
+        return sb->s_fs_info;
 }
+extern struct file_system_type afs_fs_type;
+/*
+ * entry in the cached cell catalogue
+ */
+struct afs_cache_cell {
+        char            name[AFS_MAXCELLNAME];  /* cell name (padded with NULs) */
+        struct in_addr  vl_servers[15];         /* cached cell VL servers */
+};
+/*
+ * AFS cell record
+ */
+struct afs_cell {
+        atomic_t                usage;
+        struct list_head        link;           /* main cell list link */
+        struct key              *anonymous_key; /* anonymous user key for this cell */
+        struct list_head        proc_link;      /* /proc cell list link */
+        struct proc_dir_entry   *proc_dir;      /* /proc dir for this cell */
+#ifdef AFS_CACHING_SUPPORT
+        struct cachefs_cookie   *cache;         /* caching cookie */
+#endif
+        /* server record management */
+        rwlock_t                servers_lock;   /* active server list lock */
+        struct list_head        servers;        /* active server list */
+        /* volume location record management */
+        struct rw_semaphore     vl_sem;         /* volume management serialisation semaphore */
+        struct list_head        vl_list;        /* cell's active VL record list */
+        spinlock_t              vl_lock;        /* vl_list lock */
+        unsigned short          vl_naddrs;      /* number of VL servers in addr list */
+        unsigned short          vl_curr_svix;   /* current server index */
+        struct in_addr          vl_addrs[AFS_CELL_MAX_ADDRS];   /* cell VL server addresses */
+        char                    name[0];        /* cell name - must go last */
+};
+/*
+ * entry in the cached volume location catalogue
+ */
+struct afs_cache_vlocation {
+        /* volume name (lowercase, padded with NULs) */
+        uint8_t                 name[AFS_MAXVOLNAME + 1];
+        uint8_t                 nservers;       /* number of entries used in servers[] */
+        uint8_t                 vidmask;        /* voltype mask for vid[] */
+        uint8_t                 srvtmask[8];    /* voltype masks for servers[] */
+#define AFS_VOL_VTM_RW  0x01 /* R/W version of the volume is available (on this server) */
+#define AFS_VOL_VTM_RO  0x02 /* R/O version of the volume is available (on this server) */
+#define AFS_VOL_VTM_BAK 0x04 /* backup version of the volume is available (on this server) */
+        afs_volid_t             vid[3];         /* volume IDs for R/W, R/O and Bak volumes */
+        struct in_addr          servers[8];     /* fileserver addresses */
+        time_t                  rtime;          /* last retrieval time */
+};
+/*
+ * volume -> vnode hash table entry
+ */
+struct afs_cache_vhash {
+        afs_voltype_t           vtype;          /* which volume variation */
+        uint8_t                 hash_bucket;    /* which hash bucket this represents */
+} __attribute__((packed));
+/*
+ * AFS volume location record
+ */
+struct afs_vlocation {
+        atomic_t                usage;
+        time_t                  time_of_death;  /* time at which put reduced usage to 0 */
+        struct list_head        link;           /* link in cell volume location list */
+        struct list_head        grave;          /* link in master graveyard list */
+        struct list_head        update;         /* link in master update list */
+        struct afs_cell         *cell;          /* cell to which volume belongs */
+#ifdef AFS_CACHING_SUPPORT
+        struct cachefs_cookie   *cache;         /* caching cookie */
+#endif
+        struct afs_cache_vlocation vldb;        /* volume information DB record */
+        struct afs_volume       *vols[3];       /* volume access record pointer (index by type) */
+        wait_queue_head_t       waitq;          /* status change waitqueue */
+        time_t                  update_at;      /* time at which record should be updated */
+        spinlock_t              lock;           /* access lock */
+        afs_vlocation_state_t   state;          /* volume location state */
+        unsigned short          upd_rej_cnt;    /* ENOMEDIUM count during update */
+        unsigned short          upd_busy_cnt;   /* EBUSY count during update */
+        bool                    valid;          /* T if valid */
+};
+/*
+ * AFS fileserver record
+ */
+struct afs_server {
+        atomic_t                usage;
+        time_t                  time_of_death;  /* time at which put reduced usage to 0 */
+        struct in_addr          addr;           /* server address */
+        struct afs_cell         *cell;          /* cell in which server resides */
+        struct list_head        link;           /* link in cell's server list */
+        struct list_head        grave;          /* link in master graveyard list */
+        struct rb_node          master_rb;      /* link in master by-addr tree */
+        struct rw_semaphore     sem;            /* access lock */
+        /* file service access */
+        struct rb_root          fs_vnodes;      /* vnodes backed by this server (ordered by FID) */
+        unsigned long           fs_act_jif;     /* time at which last activity occurred */
+        unsigned long           fs_dead_jif;    /* time at which no longer to be considered dead */
+        spinlock_t              fs_lock;        /* access lock */
+        int                     fs_state;       /* 0 or reason FS currently marked dead (-errno) */
+        /* callback promise management */
+        struct rb_root          cb_promises;    /* vnode expiration list (ordered earliest first) */
+        struct delayed_work     cb_updater;     /* callback updater */
+        struct delayed_work     cb_break_work;  /* collected break dispatcher */
+        wait_queue_head_t       cb_break_waitq; /* space available in cb_break waitqueue */
+        spinlock_t              cb_lock;        /* access lock */
+        struct afs_callback     cb_break[64];   /* ring of callbacks awaiting breaking */
+        atomic_t                cb_break_n;     /* number of pending breaks */
+        u8                      cb_break_head;  /* head of callback breaking ring */
+        u8                      cb_break_tail;  /* tail of callback breaking ring */
+};
+/*
+ * AFS volume access record
+ */
+struct afs_volume {
+        atomic_t                usage;
+        struct afs_cell         *cell;          /* cell to which belongs (unrefd ptr) */
+        struct afs_vlocation    *vlocation;     /* volume location */
+#ifdef AFS_CACHING_SUPPORT
+        struct cachefs_cookie   *cache;         /* caching cookie */
+#endif
+        afs_volid_t             vid;            /* volume ID */
+        afs_voltype_t           type;           /* type of volume */
+        char                    type_force;     /* force volume type (suppress R/O -> R/W) */
+        unsigned short          nservers;       /* number of server slots filled */
+        unsigned short          rjservers;      /* number of servers discarded due to -ENOMEDIUM */
+        struct afs_server       *servers[8];    /* servers on which volume resides (ordered) */
+        struct rw_semaphore     server_sem;     /* lock for accessing current server */
+};
+/*
+ * vnode catalogue entry
+ */
+struct afs_cache_vnode {
+        afs_vnodeid_t           vnode_id;       /* vnode ID */
+        unsigned                vnode_unique;   /* vnode ID uniquifier */
+        afs_dataversion_t       data_version;   /* data version */
+};
+/*
+ * AFS inode private data
+ */
+struct afs_vnode {
+        struct inode            vfs_inode;      /* the VFS's inode record */
+        struct afs_volume       *volume;        /* volume on which vnode resides */
+        struct afs_server       *server;        /* server currently supplying this file */
+        struct afs_fid          fid;            /* the file identifier for this inode */
+        struct afs_file_status  status;         /* AFS status info for this file */
+#ifdef AFS_CACHING_SUPPORT
+        struct cachefs_cookie   *cache;         /* caching cookie */
+#endif
+        struct afs_permits      *permits;       /* cache of permits so far obtained */
+        struct mutex            permits_lock;   /* lock for altering permits list */
+        struct mutex            validate_lock;  /* lock for validating this vnode */
+        wait_queue_head_t       update_waitq;   /* status fetch waitqueue */
+        int                     update_cnt;     /* number of outstanding ops that will update the
+                                                 * status */
+        spinlock_t              lock;           /* waitqueue/flags lock */
+        unsigned long           flags;
+#define AFS_VNODE_CB_BROKEN     0               /* set if vnode's callback was broken */
+#define AFS_VNODE_UNSET         1               /* set if vnode attributes not yet set */
+#define AFS_VNODE_MODIFIED      2               /* set if vnode's data modified */
+#define AFS_VNODE_ZAP_DATA      3               /* set if vnode's data should be invalidated */
+#define AFS_VNODE_DELETED       4               /* set if vnode deleted on server */
+#define AFS_VNODE_MOUNTPOINT    5               /* set if vnode is a mountpoint symlink */
+        long                    acl_order;      /* ACL check count (callback break count) */
+        /* outstanding callback notification on this file */
+        struct rb_node          server_rb;      /* link in server->fs_vnodes */
+        struct rb_node          cb_promise;     /* link in server->cb_promises */
+        struct work_struct      cb_broken_work; /* work to be done on callback break */
+        time_t                  cb_expires;     /* time at which callback expires */
+        time_t                  cb_expires_at;  /* time used to order cb_promise */
+        unsigned                cb_version;     /* callback version */
+        unsigned                cb_expiry;      /* callback expiry time */
+        afs_callback_type_t     cb_type;        /* type of callback */
+        bool                    cb_promised;    /* true if promise still holds */
+};
+/*
+ * cached security record for one user's attempt to access a vnode
+ */
+struct afs_permit {
+        struct key              *key;           /* RxRPC ticket holding a security context */
+        afs_access_t            access_mask;    /* access mask for this key */
+};
+/*
+ * cache of security records from attempts to access a vnode
+ */
+struct afs_permits {
+        struct rcu_head         rcu;            /* disposal procedure */
+        int                     count;          /* number of records */
+        struct afs_permit       permits[0];     /* the permits so far examined */
+};
+/*
+ * record of one of a system's set of network interfaces
+ */
+struct afs_interface {
+        unsigned        index;          /* interface index */
+        struct in_addr  address;        /* IPv4 address bound to interface */
+        struct in_addr  netmask;        /* netmask applied to address */
+        unsigned        mtu;            /* MTU of interface */
+};
+/*
+ * UUID definition [internet draft]
+ * - the timestamp is a 60-bit value, split 32/16/12, and goes in 100ns
+ *   increments since midnight 15th October 1582
+ *   - add AFS_UUID_TO_UNIX_TIME to convert unix time in 100ns units to UUID
+ *     time
+ * - the clock sequence is a 14-bit counter to avoid duplicate times
+ */
+struct afs_uuid {
+        u32             time_low;                       /* low part of timestamp */
+        u16             time_mid;                       /* mid part of timestamp */
+        u16             time_hi_and_version;            /* high part of timestamp and version  */
+#define AFS_UUID_TO_UNIX_TIME   0x01b21dd213814000ULL
+#define AFS_UUID_TIMEHI_MASK    0x0fff
+#define AFS_UUID_VERSION_TIME   0x1000  /* time-based UUID */
+#define AFS_UUID_VERSION_NAME   0x3000  /* name-based UUID */
+#define AFS_UUID_VERSION_RANDOM 0x4000  /* (pseudo-)random generated UUID */
+        u8              clock_seq_hi_and_reserved;      /* clock seq hi and variant */
+#define AFS_UUID_CLOCKHI_MASK   0x3f
+#define AFS_UUID_VARIANT_STD    0x80
+        u8              clock_seq_low;                  /* clock seq low */
+        u8              node[6];                        /* spatially unique node ID (MAC addr) */
+};
+/*****************************************************************************/
+/*
+ * callback.c
+ */
+extern void afs_init_callback_state(struct afs_server *);
+extern void afs_broken_callback_work(struct work_struct *);
+extern void afs_break_callbacks(struct afs_server *, size_t,
+                                struct afs_callback[]);
+extern void afs_discard_callback_on_delete(struct afs_vnode *);
+extern void afs_give_up_callback(struct afs_vnode *);
+extern void afs_dispatch_give_up_callbacks(struct work_struct *);
+extern void afs_flush_callback_breaks(struct afs_server *);
+extern int __init afs_callback_update_init(void);
+extern void __exit afs_callback_update_kill(void);
 /*
 * cell.c
 */
@@ -60,57 +403,156 @@ extern struct list_head afs_proc_cells;
 extern struct cachefs_index_def afs_cache_cell_index_def;
 #endif
+#define afs_get_cell(C) do { atomic_inc(&(C)->usage); } while(0)
+extern int afs_cell_init(char *);
+extern struct afs_cell *afs_cell_create(const char *, char *);
+extern struct afs_cell *afs_cell_lookup(const char *, unsigned);
+extern struct afs_cell *afs_grab_cell(struct afs_cell *);
+extern void afs_put_cell(struct afs_cell *);
+extern void afs_cell_purge(void);
+/*
+ * cmservice.c
+ */
+extern bool afs_cm_incoming_call(struct afs_call *);
 /*
 * dir.c
 */
 extern const struct inode_operations afs_dir_inode_operations;
 extern const struct file_operations afs_dir_file_operations;
+extern int afs_permission(struct inode *, int, struct nameidata *); /*
+ * NOTE(review): afs_permission() is declared a second time in the security.c
+ * section of this header; one of the two declarations is redundant */
 /*
 * file.c
 */
 extern const struct address_space_operations afs_fs_aops;
 extern const struct inode_operations afs_file_inode_operations;
+extern const struct file_operations afs_file_operations;
+extern int afs_open(struct inode *, struct file *);
+extern int afs_release(struct inode *, struct file *);
 #ifdef AFS_CACHING_SUPPORT
-extern int afs_cache_get_page_cookie(struct page *page,
+extern int afs_cache_get_page_cookie(struct page *, struct cachefs_page **);
-                                     struct cachefs_page **_page_cookie);
 #endif
 /*
- * inode.c
+ * fsclient.c
 */
-extern int afs_iget(struct super_block *sb, struct afs_fid *fid,
+extern int afs_fs_fetch_file_status(struct afs_server *, struct key *,
-                    struct inode **_inode);
+                                    struct afs_vnode *, struct afs_volsync *,
-extern int afs_inode_getattr(struct vfsmount *mnt, struct dentry *dentry,
+                                    const struct afs_wait_mode *);
-                             struct kstat *stat);
+extern int afs_fs_give_up_callbacks(struct afs_server *,
-extern void afs_clear_inode(struct inode *inode);
+                                    const struct afs_wait_mode *);
+extern int afs_fs_fetch_data(struct afs_server *, struct key *,
+                             struct afs_vnode *, off_t, size_t, struct page *,
+                             const struct afs_wait_mode *);
+extern int afs_fs_create(struct afs_server *, struct key *,
+                         struct afs_vnode *, const char *, umode_t,
+                         struct afs_fid *, struct afs_file_status *,
+                         struct afs_callback *,
+                         const struct afs_wait_mode *);
+extern int afs_fs_remove(struct afs_server *, struct key *,
+                         struct afs_vnode *, const char *, bool,
+                         const struct afs_wait_mode *);
+extern int afs_fs_link(struct afs_server *, struct key *, struct afs_vnode *,
+                       struct afs_vnode *, const char *,
+                       const struct afs_wait_mode *);
+extern int afs_fs_symlink(struct afs_server *, struct key *,
+                          struct afs_vnode *, const char *, const char *,
+                          struct afs_fid *, struct afs_file_status *,
+                          const struct afs_wait_mode *);
+extern int afs_fs_rename(struct afs_server *, struct key *,
+                         struct afs_vnode *, const char *,
+                         struct afs_vnode *, const char *,
+                         const struct afs_wait_mode *);
 /*
- * key_afs.c
+ * inode.c
 */
-#ifdef CONFIG_KEYS
+extern struct inode *afs_iget(struct super_block *, struct key *,
-extern int afs_key_register(void);
+                              struct afs_fid *, struct afs_file_status *,
-extern void afs_key_unregister(void);
+                              struct afs_callback *);
-#endif
+extern int afs_validate(struct afs_vnode *, struct key *);
+extern int afs_inode_getattr(struct vfsmount *, struct dentry *,
+                             struct kstat *);
+extern void afs_zap_permits(struct rcu_head *);
+extern void afs_clear_inode(struct inode *);
 /*
 * main.c
 */
+extern struct afs_uuid afs_uuid;
 #ifdef AFS_CACHING_SUPPORT
 extern struct cachefs_netfs afs_cache_netfs;
 #endif
 /*
+ * misc.c
+ */
+extern int afs_abort_to_error(u32);
+/*
 * mntpt.c
 */
 extern const struct inode_operations afs_mntpt_inode_operations;
 extern const struct file_operations afs_mntpt_file_operations;
-extern struct afs_timer afs_mntpt_expiry_timer;
-extern struct afs_timer_ops afs_mntpt_expiry_timer_ops;
 extern unsigned long afs_mntpt_expiry_timeout;
-extern int afs_mntpt_check_symlink(struct afs_vnode *vnode);
+extern int afs_mntpt_check_symlink(struct afs_vnode *, struct key *);
+extern void afs_mntpt_kill_timer(void);
+extern void afs_umount_begin(struct vfsmount *, int);
+/*
+ * proc.c
+ */
+extern int afs_proc_init(void);
+extern void afs_proc_cleanup(void);
+extern int afs_proc_cell_setup(struct afs_cell *);
+extern void afs_proc_cell_remove(struct afs_cell *);
+/*
+ * rxrpc.c
+ */
+extern int afs_open_socket(void);
+extern void afs_close_socket(void);
+extern int afs_make_call(struct in_addr *, struct afs_call *, gfp_t,
+                         const struct afs_wait_mode *);
+extern struct afs_call *afs_alloc_flat_call(const struct afs_call_type *,
+                                            size_t, size_t);
+extern void afs_flat_call_destructor(struct afs_call *);
+extern void afs_transfer_reply(struct afs_call *, struct sk_buff *);
+extern void afs_send_empty_reply(struct afs_call *);
+extern void afs_send_simple_reply(struct afs_call *, const void *, size_t);
+extern int afs_extract_data(struct afs_call *, struct sk_buff *, bool, void *,
+                            size_t);
+/*
+ * security.c
+ */
+extern void afs_clear_permits(struct afs_vnode *);
+extern void afs_cache_permit(struct afs_vnode *, struct key *, long);
+extern struct key *afs_request_key(struct afs_cell *);
+extern int afs_permission(struct inode *, int, struct nameidata *);
+/*
+ * server.c
+ */
+extern spinlock_t afs_server_peer_lock;
+#define afs_get_server(S)                                       \
+do {                                                            \
+        _debug("GET SERVER %d", atomic_read(&(S)->usage));      \
+        atomic_inc(&(S)->usage);                                \
+} while(0)
+extern struct afs_server *afs_lookup_server(struct afs_cell *,
+                                            const struct in_addr *);
+extern struct afs_server *afs_find_server(const struct in_addr *);
+extern void afs_put_server(struct afs_server *);
+extern void __exit afs_purge_servers(void);
 /*
 * super.c
@@ -118,22 +560,211 @@ extern int afs_mntpt_check_symlink(struct afs_vnode *vnode);
 extern int afs_fs_init(void);
 extern void afs_fs_exit(void);
-#define AFS_CB_HASH_COUNT (PAGE_SIZE / sizeof(struct list_head))
+/*
+ * use-rtnetlink.c
+ */
+extern int afs_get_ipv4_interfaces(struct afs_interface *, size_t, bool);
+extern int afs_get_MAC_address(u8 [6]);
-extern struct list_head afs_cb_hash_tbl[];
+/*
-extern spinlock_t afs_cb_hash_lock;
+ * vlclient.c
+ */
+#ifdef AFS_CACHING_SUPPORT
+extern struct cachefs_index_def afs_vlocation_cache_index_def;
+#endif
-#define afs_cb_hash(SRV,FID) \
+extern int afs_vl_get_entry_by_name(struct in_addr *, struct key *,
-        afs_cb_hash_tbl[((unsigned long)(SRV) + \
+                                    const char *, struct afs_cache_vlocation *,
-                        (FID)->vid + (FID)->vnode + (FID)->unique) % \
+                                    const struct afs_wait_mode *);
-                        AFS_CB_HASH_COUNT]
+extern int afs_vl_get_entry_by_id(struct in_addr *, struct key *,
+                                  afs_volid_t, afs_voltype_t,
+                                  struct afs_cache_vlocation *,
+                                  const struct afs_wait_mode *);
 /*
- * proc.c
+ * vlocation.c
 */
-extern int afs_proc_init(void);
+#define afs_get_vlocation(V) do { atomic_inc(&(V)->usage); } while(0)
-extern void afs_proc_cleanup(void);
-extern int afs_proc_cell_setup(struct afs_cell *cell);
+extern int __init afs_vlocation_update_init(void);
-extern void afs_proc_cell_remove(struct afs_cell *cell);
+extern struct afs_vlocation *afs_vlocation_lookup(struct afs_cell *,
+                                                  struct key *,
+                                                  const char *, size_t);
+extern void afs_put_vlocation(struct afs_vlocation *);
+extern void __exit afs_vlocation_purge(void);
+/*
+ * vnode.c
+ */
+#ifdef AFS_CACHING_SUPPORT
+extern struct cachefs_index_def afs_vnode_cache_index_def;
+#endif
+extern struct afs_timer_ops afs_vnode_cb_timed_out_ops; /* NOTE(review): struct
+ * afs_timer_ops is declared in kafstimod.h, which this patch deletes, so this
+ * declaration looks stale - confirm nothing defines it and drop it */
+static inline struct afs_vnode *AFS_FS_I(struct inode *inode)
+{       /* step back from the VFS inode to the afs_vnode that embeds it */
+        return container_of(inode, struct afs_vnode, vfs_inode);
+}
+static inline struct inode *AFS_VNODE_TO_I(struct afs_vnode *vnode)
+{       /* the VFS inode is the vfs_inode member embedded in the afs_vnode */
+        return &vnode->vfs_inode;
+}
+extern void afs_vnode_finalise_status_update(struct afs_vnode *,
+                                             struct afs_server *);
+extern int afs_vnode_fetch_status(struct afs_vnode *, struct afs_vnode *,
+                                  struct key *);
+extern int afs_vnode_fetch_data(struct afs_vnode *, struct key *,
+                                off_t, size_t, struct page *);
+extern int afs_vnode_create(struct afs_vnode *, struct key *, const char *,
+                            umode_t, struct afs_fid *, struct afs_file_status *,
+                            struct afs_callback *, struct afs_server **);
+extern int afs_vnode_remove(struct afs_vnode *, struct key *, const char *,
+                            bool);
+extern int afs_vnode_link(struct afs_vnode *, struct afs_vnode *, struct key *,
+                          const char *);
+extern int afs_vnode_symlink(struct afs_vnode *, struct key *, const char *,
+                             const char *, struct afs_fid *,
+                             struct afs_file_status *, struct afs_server **);
+extern int afs_vnode_rename(struct afs_vnode *, struct afs_vnode *,
+                            struct key *, const char *, const char *);
+/*
+ * volume.c
+ */
+#ifdef AFS_CACHING_SUPPORT
+extern struct cachefs_index_def afs_volume_cache_index_def;
+#endif
+#define afs_get_volume(V) do { atomic_inc(&(V)->usage); } while(0)
+extern void afs_put_volume(struct afs_volume *);
+extern struct afs_volume *afs_volume_lookup(struct afs_mount_params *);
+extern struct afs_server *afs_volume_pick_fileserver(struct afs_vnode *);
+extern int afs_volume_release_fileserver(struct afs_vnode *,
+                                         struct afs_server *, int);
+/*****************************************************************************/
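+/*
+ * debug tracing
+ * - kenter(), kleave() and kdebug() always print, prefixing each line with
+ *   the CPU number and the current task's name
+ * - _enter(), _leave() and _debug() map straight onto those if __KDEBUG is
+ *   defined, are gated at runtime by the AFS_DEBUG_* bits in afs_debug if
+ *   CONFIG_AFS_DEBUG is set, and otherwise expand to an empty function that
+ *   only exists so the compiler still checks the format strings
+ */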
+extern unsigned afs_debug;
+#define dbgprintk(FMT,...) \
+        printk("[%x%-6.6s] "FMT"\n", smp_processor_id(), current->comm ,##__VA_ARGS__)
+/* make sure we maintain the format strings, even when debugging is disabled */
+static inline __attribute__((format(printf,1,2)))
+void _dbprintk(const char *fmt, ...)
+{
+}
+#define kenter(FMT,...) dbgprintk("==> %s("FMT")",__FUNCTION__ ,##__VA_ARGS__)
+#define kleave(FMT,...) dbgprintk("<== %s()"FMT"",__FUNCTION__ ,##__VA_ARGS__)
+#define kdebug(FMT,...) dbgprintk("    "FMT ,##__VA_ARGS__)
+#if defined(__KDEBUG)
+#define _enter(FMT,...) kenter(FMT,##__VA_ARGS__)
+#define _leave(FMT,...) kleave(FMT,##__VA_ARGS__)
+#define _debug(FMT,...) kdebug(FMT,##__VA_ARGS__)
+#elif defined(CONFIG_AFS_DEBUG)
+#define AFS_DEBUG_KENTER        0x01
+#define AFS_DEBUG_KLEAVE        0x02
+#define AFS_DEBUG_KDEBUG        0x04
+#define _enter(FMT,...)                                 \
+do {                                                    \
+        if (unlikely(afs_debug & AFS_DEBUG_KENTER))     \
+                kenter(FMT,##__VA_ARGS__);              \
+} while (0)
+#define _leave(FMT,...)                                 \
+do {                                                    \
+        if (unlikely(afs_debug & AFS_DEBUG_KLEAVE))     \
+                kleave(FMT,##__VA_ARGS__);              \
+} while (0)
+#define _debug(FMT,...)                                 \
+do {                                                    \
+        if (unlikely(afs_debug & AFS_DEBUG_KDEBUG))     \
+                kdebug(FMT,##__VA_ARGS__);              \
+} while (0)
+#else
+#define _enter(FMT,...) _dbprintk("==> %s("FMT")",__FUNCTION__ ,##__VA_ARGS__)
+#define _leave(FMT,...) _dbprintk("<== %s()"FMT"",__FUNCTION__ ,##__VA_ARGS__)
+#define _debug(FMT,...) _dbprintk("    "FMT ,##__VA_ARGS__)
+#endif
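+/*
+ * debug assertion checking
+ * - a failed assertion logs "AFS: Assertion failed" and then calls BUG()
+ * - the CMP variants also log both operands, cast to unsigned long, in decimal
+ *   and in hex
+ * - X and Y are expanded again when the failure is reported, so they should
+ *   not have side effects
+ * - the checks are currently always compiled in ("#if 1"); the empty
+ *   definitions in the #else branch are unused until that is changed
+ */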
+#if 1 // defined(__KDEBUGALL)
+#define ASSERT(X)                                               \
+do {                                                            \
+        if (unlikely(!(X))) {                                   \
+                printk(KERN_ERR "\n");                          \
+                printk(KERN_ERR "AFS: Assertion failed\n");     \
+                BUG();                                          \
+        }                                                       \
+} while(0)
+#define ASSERTCMP(X, OP, Y)                                             \
+do {                                                                    \
+        if (unlikely(!((X) OP (Y)))) {                                  \
+                printk(KERN_ERR "\n");                                  \
+                printk(KERN_ERR "AFS: Assertion failed\n");             \
+                printk(KERN_ERR "%lu " #OP " %lu is false\n",           \
+                       (unsigned long)(X), (unsigned long)(Y));         \
+                printk(KERN_ERR "0x%lx " #OP " 0x%lx is false\n",       \
+                       (unsigned long)(X), (unsigned long)(Y));         \
+                BUG();                                                  \
+        }                                                               \
+} while(0)
+#define ASSERTIF(C, X)                                          \
+do {                                                            \
+        if (unlikely((C) && !(X))) {                            \
+                printk(KERN_ERR "\n");                          \
+                printk(KERN_ERR "AFS: Assertion failed\n");     \
+                BUG();                                          \
+        }                                                       \
+} while(0)
+#define ASSERTIFCMP(C, X, OP, Y)                                        \
+do {                                                                    \
+        if (unlikely((C) && !((X) OP (Y)))) {                           \
+                printk(KERN_ERR "\n");                                  \
+                printk(KERN_ERR "AFS: Assertion failed\n");             \
+                printk(KERN_ERR "%lu " #OP " %lu is false\n",           \
+                       (unsigned long)(X), (unsigned long)(Y));         \
+                printk(KERN_ERR "0x%lx " #OP " 0x%lx is false\n",       \
+                       (unsigned long)(X), (unsigned long)(Y));         \
+                BUG();                                                  \
+        }                                                               \
+} while(0)
+#else
+#define ASSERT(X)                               \
+do {                                            \
+} while(0)
+#define ASSERTCMP(X, OP, Y)                     \
+do {                                            \
+} while(0)
+#define ASSERTIF(C, X)                          \
+do {                                            \
+} while(0)
+#define ASSERTIFCMP(C, X, OP, Y)                \
+do {                                            \
+} while(0)
-#endif /* AFS_INTERNAL_H */
+#endif /* __KDEBUGALL */
diff --git a/fs/afs/kafsasyncd.c b/fs/afs/kafsasyncd.c
deleted file mode 100644
index 615df2407cb2..000000000000
--- a/fs/afs/kafsasyncd.c
+++ /dev/null
@@ -1,255 +0,0 @@
-/* kafsasyncd.c: AFS asynchronous operation daemon
- *
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- *
- * The AFS async daemon is used to the following:
- * - probe "dead" servers to see whether they've come back to life yet.
- * - probe "live" servers that we haven't talked to for a while to see if they are better
- *   candidates for serving than what we're currently using
- * - poll volume location servers to keep up to date volume location lists
- */
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/sched.h>
-#include <linux/completion.h>
-#include <linux/freezer.h>
-#include "cell.h"
-#include "server.h"
-#include "volume.h"
-#include "kafsasyncd.h"
-#include "kafstimod.h"
-#include <rxrpc/call.h>
-#include <asm/errno.h>
-#include "internal.h"
-static DECLARE_COMPLETION(kafsasyncd_alive);
-static DECLARE_COMPLETION(kafsasyncd_dead);
-static DECLARE_WAIT_QUEUE_HEAD(kafsasyncd_sleepq);
-static struct task_struct *kafsasyncd_task;
-static int kafsasyncd_die;
-static int kafsasyncd(void *arg);
-static LIST_HEAD(kafsasyncd_async_attnq);
-static LIST_HEAD(kafsasyncd_async_busyq);
-static DEFINE_SPINLOCK(kafsasyncd_async_lock);
-static void kafsasyncd_null_call_attn_func(struct rxrpc_call *call)
-{
-}
-static void kafsasyncd_null_call_error_func(struct rxrpc_call *call)
-{
-}
-/*****************************************************************************/
-/*
- * start the async daemon
- */
-int afs_kafsasyncd_start(void)
-{
-        int ret;
-        ret = kernel_thread(kafsasyncd, NULL, 0);
-        if (ret < 0)
-                return ret;
-        wait_for_completion(&kafsasyncd_alive);
-        return ret;
-} /* end afs_kafsasyncd_start() */
-/*****************************************************************************/
-/*
- * stop the async daemon
- */
-void afs_kafsasyncd_stop(void)
-{
-        /* get rid of my daemon */
-        kafsasyncd_die = 1;
-        wake_up(&kafsasyncd_sleepq);
-        wait_for_completion(&kafsasyncd_dead);
-} /* end afs_kafsasyncd_stop() */
-/*****************************************************************************/
-/*
- * probing daemon
- */
-static int kafsasyncd(void *arg)
-{
-        struct afs_async_op *op;
-        int die;
-        DECLARE_WAITQUEUE(myself, current);
-        kafsasyncd_task = current;
-        printk("kAFS: Started kafsasyncd %d\n", current->pid);
-        daemonize("kafsasyncd");
-        complete(&kafsasyncd_alive);
-        /* loop around looking for things to attend to */
-        do {
-                set_current_state(TASK_INTERRUPTIBLE);
-                add_wait_queue(&kafsasyncd_sleepq, &myself);
-                for (;;) {
-                        if (!list_empty(&kafsasyncd_async_attnq) ||
-                            signal_pending(current) ||
-                            kafsasyncd_die)
-                                break;
-                        schedule();
-                        set_current_state(TASK_INTERRUPTIBLE);
-                }
-                remove_wait_queue(&kafsasyncd_sleepq, &myself);
-                set_current_state(TASK_RUNNING);
-                try_to_freeze();
-                /* discard pending signals */
-                afs_discard_my_signals();
-                die = kafsasyncd_die;
-                /* deal with the next asynchronous operation requiring
-                 * attention */
-                if (!list_empty(&kafsasyncd_async_attnq)) {
-                        struct afs_async_op *op;
-                        _debug("@@@ Begin Asynchronous Operation");
-                        op = NULL;
-                        spin_lock(&kafsasyncd_async_lock);
-                        if (!list_empty(&kafsasyncd_async_attnq)) {
-                                op = list_entry(kafsasyncd_async_attnq.next,
-                                                struct afs_async_op, link);
-                                list_move_tail(&op->link,
-                                              &kafsasyncd_async_busyq);
-                        }
-                        spin_unlock(&kafsasyncd_async_lock);
-                        _debug("@@@ Operation %p {%p}\n",
-                               op, op ? op->ops : NULL);
-                        if (op)
-                                op->ops->attend(op);
-                        _debug("@@@ End Asynchronous Operation");
-                }
-        } while(!die);
-        /* need to kill all outstanding asynchronous operations before
-         * exiting */
-        kafsasyncd_task = NULL;
-        spin_lock(&kafsasyncd_async_lock);
-        /* fold the busy and attention queues together */
-        list_splice_init(&kafsasyncd_async_busyq,
-                         &kafsasyncd_async_attnq);
-        /* dequeue kafsasyncd from all their wait queues */
-        list_for_each_entry(op, &kafsasyncd_async_attnq, link) {
-                op->call->app_attn_func = kafsasyncd_null_call_attn_func;
-                op->call->app_error_func = kafsasyncd_null_call_error_func;
-                remove_wait_queue(&op->call->waitq, &op->waiter);
-        }
-        spin_unlock(&kafsasyncd_async_lock);
-        /* abort all the operations */
-        while (!list_empty(&kafsasyncd_async_attnq)) {
-                op = list_entry(kafsasyncd_async_attnq.next, struct afs_async_op, link);
-                list_del_init(&op->link);
-                rxrpc_call_abort(op->call, -EIO);
-                rxrpc_put_call(op->call);
-                op->call = NULL;
-                op->ops->discard(op);
-        }
-        /* and that's all */
-        _leave("");
-        complete_and_exit(&kafsasyncd_dead, 0);
-} /* end kafsasyncd() */
-/*****************************************************************************/
-/*
- * begin an operation
- * - place operation on busy queue
- */
-void afs_kafsasyncd_begin_op(struct afs_async_op *op)
-{
-        _enter("");
-        spin_lock(&kafsasyncd_async_lock);
-        init_waitqueue_entry(&op->waiter, kafsasyncd_task);
-        add_wait_queue(&op->call->waitq, &op->waiter);
-        list_move_tail(&op->link, &kafsasyncd_async_busyq);
-        spin_unlock(&kafsasyncd_async_lock);
-        _leave("");
-} /* end afs_kafsasyncd_begin_op() */
-/*****************************************************************************/
-/*
- * request attention for an operation
- * - move to attention queue
- */
-void afs_kafsasyncd_attend_op(struct afs_async_op *op)
-{
-        _enter("");
-        spin_lock(&kafsasyncd_async_lock);
-        list_move_tail(&op->link, &kafsasyncd_async_attnq);
-        spin_unlock(&kafsasyncd_async_lock);
-        wake_up(&kafsasyncd_sleepq);
-        _leave("");
-} /* end afs_kafsasyncd_attend_op() */
-/*****************************************************************************/
-/*
- * terminate an operation
- * - remove from either queue
- */
-void afs_kafsasyncd_terminate_op(struct afs_async_op *op)
-{
-        _enter("");
-        spin_lock(&kafsasyncd_async_lock);
-        if (!list_empty(&op->link)) {
-                list_del_init(&op->link);
-                remove_wait_queue(&op->call->waitq, &op->waiter);
-        }
-        spin_unlock(&kafsasyncd_async_lock);
-        wake_up(&kafsasyncd_sleepq);
-        _leave("");
-} /* end afs_kafsasyncd_terminate_op() */
diff --git a/fs/afs/kafsasyncd.h b/fs/afs/kafsasyncd.h
deleted file mode 100644
index 791803f9a6fb..000000000000
--- a/fs/afs/kafsasyncd.h
+++ /dev/null
@@ -1,52 +0,0 @@
-/* kafsasyncd.h: AFS asynchronous operation daemon
- *
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-#ifndef _LINUX_AFS_KAFSASYNCD_H
-#define _LINUX_AFS_KAFSASYNCD_H
-#include "types.h"
-struct afs_async_op;
-struct afs_async_op_ops {
-        void (*attend)(struct afs_async_op *op);
-        void (*discard)(struct afs_async_op *op);
-};
-/*****************************************************************************/
-/*
- * asynchronous operation record
- */
-struct afs_async_op
-{
-        struct list_head                link;
-        struct afs_server               *server;        /* server being contacted */
-        struct rxrpc_call               *call;          /* RxRPC call performing op */
-        wait_queue_t                    waiter;         /* wait queue for kafsasyncd */
-        const struct afs_async_op_ops   *ops;           /* operations */
-};
-static inline void afs_async_op_init(struct afs_async_op *op,
-                                     const struct afs_async_op_ops *ops)
-{
-        INIT_LIST_HEAD(&op->link);
-        op->call = NULL;
-        op->ops = ops;
-}
-extern int afs_kafsasyncd_start(void);
-extern void afs_kafsasyncd_stop(void);
-extern void afs_kafsasyncd_begin_op(struct afs_async_op *op);
-extern void afs_kafsasyncd_attend_op(struct afs_async_op *op);
-extern void afs_kafsasyncd_terminate_op(struct afs_async_op *op);
-#endif /* _LINUX_AFS_KAFSASYNCD_H */
diff --git a/fs/afs/kafstimod.c b/fs/afs/kafstimod.c
deleted file mode 100644
index 694344e4d3c7..000000000000
--- a/fs/afs/kafstimod.c
+++ /dev/null
@@ -1,205 +0,0 @@
-/* kafstimod.c: AFS timeout daemon
- *
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/sched.h>
-#include <linux/completion.h>
-#include <linux/freezer.h>
-#include "cell.h"
-#include "volume.h"
-#include "kafstimod.h"
-#include <asm/errno.h>
-#include "internal.h"
-static DECLARE_COMPLETION(kafstimod_alive);
-static DECLARE_COMPLETION(kafstimod_dead);
-static DECLARE_WAIT_QUEUE_HEAD(kafstimod_sleepq);
-static int kafstimod_die;
-static LIST_HEAD(kafstimod_list);
-static DEFINE_SPINLOCK(kafstimod_lock);
-static int kafstimod(void *arg);
-/*****************************************************************************/
-/*
- * start the timeout daemon
- */
-int afs_kafstimod_start(void)
-{
-        int ret;
-        ret = kernel_thread(kafstimod, NULL, 0);
-        if (ret < 0)
-                return ret;
-        wait_for_completion(&kafstimod_alive);
-        return ret;
-} /* end afs_kafstimod_start() */
-/*****************************************************************************/
-/*
- * stop the timeout daemon
- */
-void afs_kafstimod_stop(void)
-{
-        /* get rid of my daemon */
-        kafstimod_die = 1;
-        wake_up(&kafstimod_sleepq);
-        wait_for_completion(&kafstimod_dead);
-} /* end afs_kafstimod_stop() */
-/*****************************************************************************/
-/*
- * timeout processing daemon
- */
-static int kafstimod(void *arg)
-{
-        struct afs_timer *timer;
-        DECLARE_WAITQUEUE(myself, current);
-        printk("kAFS: Started kafstimod %d\n", current->pid);
-        daemonize("kafstimod");
-        complete(&kafstimod_alive);
-        /* loop around looking for things to attend to */
- loop:
-        set_current_state(TASK_INTERRUPTIBLE);
-        add_wait_queue(&kafstimod_sleepq, &myself);
-        for (;;) {
-                unsigned long jif;
-                signed long timeout;
-                /* deal with the server being asked to die */
-                if (kafstimod_die) {
-                        remove_wait_queue(&kafstimod_sleepq, &myself);
-                        _leave("");
-                        complete_and_exit(&kafstimod_dead, 0);
-                }
-                try_to_freeze();
-                /* discard pending signals */
-                afs_discard_my_signals();
-                /* work out the time to elapse before the next event */
-                spin_lock(&kafstimod_lock);
-                if (list_empty(&kafstimod_list)) {
-                        timeout = MAX_SCHEDULE_TIMEOUT;
-                }
-                else {
-                        timer = list_entry(kafstimod_list.next,
-                                           struct afs_timer, link);
-                        timeout = timer->timo_jif;
-                        jif = jiffies;
-                        if (time_before_eq((unsigned long) timeout, jif))
-                                goto immediate;
-                        else {
-                                timeout = (long) timeout - (long) jiffies;
-                        }
-                }
-                spin_unlock(&kafstimod_lock);
-                schedule_timeout(timeout);
-                set_current_state(TASK_INTERRUPTIBLE);
-        }
-        /* the thing on the front of the queue needs processing
-         * - we come here with the lock held and timer pointing to the expired
-         *   entry
-         */
- immediate:
-        remove_wait_queue(&kafstimod_sleepq, &myself);
-        set_current_state(TASK_RUNNING);
-        _debug("@@@ Begin Timeout of %p", timer);
-        /* dequeue the timer */
-        list_del_init(&timer->link);
-        spin_unlock(&kafstimod_lock);
-        /* call the timeout function */
-        timer->ops->timed_out(timer);
-        _debug("@@@ End Timeout");
-        goto loop;
-} /* end kafstimod() */
-/*****************************************************************************/
-/*
- * (re-)queue a timer
- */
-void afs_kafstimod_add_timer(struct afs_timer *timer, unsigned long timeout)
-{
-        struct afs_timer *ptimer;
-        struct list_head *_p;
-        _enter("%p,%lu", timer, timeout);
-        spin_lock(&kafstimod_lock);
-        list_del(&timer->link);
-        /* the timer was deferred or reset - put it back in the queue at the
-         * right place */
-        timer->timo_jif = jiffies + timeout;
-        list_for_each(_p, &kafstimod_list) {
-                ptimer = list_entry(_p, struct afs_timer, link);
-                if (time_before(timer->timo_jif, ptimer->timo_jif))
-                        break;
-        }
-        list_add_tail(&timer->link, _p); /* insert before stopping point */
-        spin_unlock(&kafstimod_lock);
-        wake_up(&kafstimod_sleepq);
-        _leave("");
-} /* end afs_kafstimod_add_timer() */
-/*****************************************************************************/
-/*
- * dequeue a timer
- * - returns 0 if the timer was deleted or -ENOENT if it wasn't queued
- */
-int afs_kafstimod_del_timer(struct afs_timer *timer)
-{
-        int ret = 0;
-        _enter("%p", timer);
-        spin_lock(&kafstimod_lock);
-        if (list_empty(&timer->link))
-                ret = -ENOENT;
-        else
-                list_del_init(&timer->link);
-        spin_unlock(&kafstimod_lock);
-        wake_up(&kafstimod_sleepq);
-        _leave(" = %d", ret);
-        return ret;
-} /* end afs_kafstimod_del_timer() */
diff --git a/fs/afs/kafstimod.h b/fs/afs/kafstimod.h
deleted file mode 100644
index e312f1a61a7f..000000000000
--- a/fs/afs/kafstimod.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/* kafstimod.h: AFS timeout daemon
- *
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-#ifndef _LINUX_AFS_KAFSTIMOD_H
-#define _LINUX_AFS_KAFSTIMOD_H
-#include "types.h"
-struct afs_timer;
-struct afs_timer_ops {
-        /* called when the front of the timer queue has timed out */
-        void (*timed_out)(struct afs_timer *timer);
-};
-/*****************************************************************************/
-/*
- * AFS timer/timeout record
- */
-struct afs_timer
-{
-        struct list_head                link;           /* link in timer queue */
-        unsigned long                   timo_jif;       /* timeout time */
-        const struct afs_timer_ops      *ops;           /* timeout expiry function */
-};
-static inline void afs_timer_init(struct afs_timer *timer,
-                                  const struct afs_timer_ops *ops)
-{
-        INIT_LIST_HEAD(&timer->link);
-        timer->ops = ops;
-}
-extern int afs_kafstimod_start(void);
-extern void afs_kafstimod_stop(void);
-extern void afs_kafstimod_add_timer(struct afs_timer *timer,
-                                    unsigned long timeout);
-extern int afs_kafstimod_del_timer(struct afs_timer *timer);
-#endif /* _LINUX_AFS_KAFSTIMOD_H */
diff --git a/fs/afs/main.c b/fs/afs/main.c
index f2704ba53857..40c2704e7557 100644
--- a/fs/afs/main.c
+++ b/fs/afs/main.c
@@ -1,4 +1,4 @@
-/* main.c: AFS client file system
+/* AFS client file system
 *
 * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
@@ -13,43 +13,21 @@
 #include <linux/moduleparam.h>
 #include <linux/init.h>
 #include <linux/completion.h>
-#include <rxrpc/rxrpc.h>
-#include <rxrpc/transport.h>
-#include <rxrpc/call.h>
-#include <rxrpc/peer.h>
-#include "cache.h"
-#include "cell.h"
-#include "server.h"
-#include "fsclient.h"
-#include "cmservice.h"
-#include "kafstimod.h"
-#include "kafsasyncd.h"
 #include "internal.h"
-struct rxrpc_transport *afs_transport;
-static int afs_adding_peer(struct rxrpc_peer *peer);
-static void afs_discarding_peer(struct rxrpc_peer *peer);
 MODULE_DESCRIPTION("AFS Client File System");
 MODULE_AUTHOR("Red Hat, Inc.");
 MODULE_LICENSE("GPL");
+/* debugging mask; exposed as the "debug" module parameter, so the description
+ * must be keyed on "debug" (the parameter name), not on the variable name */
+unsigned afs_debug;
+module_param_named(debug, afs_debug, uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(debug, "AFS debugging mask");
 static char *rootcell;
 module_param(rootcell, charp, 0);
 MODULE_PARM_DESC(rootcell, "root AFS cell name and VL server IP addr list");
-static struct rxrpc_peer_ops afs_peer_ops = {
-        .adding         = afs_adding_peer,
-        .discarding     = afs_discarding_peer,
-};
-struct list_head afs_cb_hash_tbl[AFS_CB_HASH_COUNT];
-DEFINE_SPINLOCK(afs_cb_hash_lock);
 #ifdef AFS_CACHING_SUPPORT
 static struct cachefs_netfs_operations afs_cache_ops = {
        .get_page_cookie        = afs_cache_get_page_cookie,
@@ -62,20 +40,63 @@ struct cachefs_netfs afs_cache_netfs = {
 };
 #endif
-/*****************************************************************************/
+struct afs_uuid afs_uuid;
+/*
+ * get a client UUID
+ * - fills in the global afs_uuid from a MAC address (node), the current time
+ *   converted to 100ns units plus AFS_UUID_TO_UNIX_TIME (timestamp fields)
+ *   and two random bytes (clock sequence)
+ * - returns 0 on success or the negative error from afs_get_MAC_address()
+ */
+static int __init afs_get_client_UUID(void)
+{
+        struct timespec ts;
+        u64 uuidtime;
+        u16 clockseq;
+        int ret;
+        /* read the MAC address of one of the external interfaces and construct
+         * a UUID from it */
+        ret = afs_get_MAC_address(afs_uuid.node);
+        if (ret < 0)
+                return ret;
+        getnstimeofday(&ts);
+        uuidtime = (u64) ts.tv_sec * 1000 * 1000 * 10;
+        uuidtime += ts.tv_nsec / 100;
+        uuidtime += AFS_UUID_TO_UNIX_TIME;
+        afs_uuid.time_low = uuidtime;
+        afs_uuid.time_mid = uuidtime >> 32;
+        afs_uuid.time_hi_and_version = (uuidtime >> 48) & AFS_UUID_TIMEHI_MASK;
+        /* merge the version in; a plain assignment would discard the masked
+         * timestamp bits stored on the previous line */
+        afs_uuid.time_hi_and_version |= AFS_UUID_VERSION_TIME;
+        get_random_bytes(&clockseq, 2);
+        afs_uuid.clock_seq_low = clockseq;
+        afs_uuid.clock_seq_hi_and_reserved =
+                (clockseq >> 8) & AFS_UUID_CLOCKHI_MASK;
+        /* likewise merge the variant in without losing the clock sequence */
+        afs_uuid.clock_seq_hi_and_reserved |= AFS_UUID_VARIANT_STD;
+        _debug("AFS UUID: %08x-%04x-%04x-%02x%02x-%02x%02x%02x%02x%02x%02x",
+               afs_uuid.time_low,
+               afs_uuid.time_mid,
+               afs_uuid.time_hi_and_version,
+               afs_uuid.clock_seq_hi_and_reserved,
+               afs_uuid.clock_seq_low,
+               afs_uuid.node[0], afs_uuid.node[1], afs_uuid.node[2],
+               afs_uuid.node[3], afs_uuid.node[4], afs_uuid.node[5]);
+        return 0;
+}
 /*
 * initialise the AFS client FS module
 */
 static int __init afs_init(void)
 {
-        int loop, ret;
+        int ret;
        printk(KERN_INFO "kAFS: Red Hat AFS client v0.1 registering.\n");
-        /* initialise the callback hash table */
+        ret = afs_get_client_UUID();
-        spin_lock_init(&afs_cb_hash_lock);
+        if (ret < 0)
-        for (loop = AFS_CB_HASH_COUNT - 1; loop >= 0; loop--)
+                return ret;
-                INIT_LIST_HEAD(&afs_cb_hash_tbl[loop]);
        /* register the /proc stuff */
        ret = afs_proc_init();
@@ -87,70 +108,56 @@ static int __init afs_init(void)
        ret = cachefs_register_netfs(&afs_cache_netfs,
                                     &afs_cache_cell_index_def);
        if (ret < 0)
-                goto error;
-#endif
-#ifdef CONFIG_KEYS_TURNED_OFF
-        ret = afs_key_register();
-        if (ret < 0)
                goto error_cache;
 #endif
        /* initialise the cell DB */
        ret = afs_cell_init(rootcell);
        if (ret < 0)
-                goto error_keys;
+                goto error_cell_init;
-        /* start the timeout daemon */
+        /* initialise the VL update process */
-        ret = afs_kafstimod_start();
+        ret = afs_vlocation_update_init();
        if (ret < 0)
-                goto error_keys;
+                goto error_vl_update_init;
-        /* start the async operation daemon */
+        /* initialise the callback update process */
-        ret = afs_kafsasyncd_start();
+        ret = afs_callback_update_init();
-        if (ret < 0)
-                goto error_kafstimod;
        /* create the RxRPC transport */
-        ret = rxrpc_create_transport(7001, &afs_transport);
+        ret = afs_open_socket();
        if (ret < 0)
-                goto error_kafsasyncd;
+                goto error_open_socket;
-        afs_transport->peer_ops = &afs_peer_ops;
        /* register the filesystems */
        ret = afs_fs_init();
        if (ret < 0)
-                goto error_transport;
+                goto error_fs;
        return ret;
- error_transport:
+error_fs:
-        rxrpc_put_transport(afs_transport);
+        afs_close_socket();
- error_kafsasyncd:
+error_open_socket:
-        afs_kafsasyncd_stop();
+error_vl_update_init:
- error_kafstimod:
+error_cell_init:
-        afs_kafstimod_stop();
- error_keys:
-#ifdef CONFIG_KEYS_TURNED_OFF
-        afs_key_unregister();
- error_cache:
-#endif
 #ifdef AFS_CACHING_SUPPORT
        cachefs_unregister_netfs(&afs_cache_netfs);
- error:
+error_cache:
 #endif
+        afs_callback_update_kill();
+        afs_vlocation_purge();
        afs_cell_purge();
        afs_proc_cleanup();
        printk(KERN_ERR "kAFS: failed to register: %d\n", ret);
        return ret;
-} /* end afs_init() */
+}
 /* XXX late_initcall is kludgy, but the only alternative seems to create
 * a transport upon the first mount, which is worse. Or is it?
 */
 late_initcall(afs_init);        /* must be called after net/ to create socket */
-/*****************************************************************************/
 /*
 * clean up on module removal
 */
@@ -159,127 +166,16 @@ static void __exit afs_exit(void)
        printk(KERN_INFO "kAFS: Red Hat AFS client v0.1 unregistering.\n");
        afs_fs_exit();
-        rxrpc_put_transport(afs_transport);
+        afs_close_socket();
-        afs_kafstimod_stop();
+        afs_purge_servers();
-        afs_kafsasyncd_stop();
+        afs_callback_update_kill();
+        afs_vlocation_purge();
+        flush_scheduled_work();
        afs_cell_purge();
-#ifdef CONFIG_KEYS_TURNED_OFF
-        afs_key_unregister();
-#endif
 #ifdef AFS_CACHING_SUPPORT
        cachefs_unregister_netfs(&afs_cache_netfs);
 #endif
        afs_proc_cleanup();
-} /* end afs_exit() */
-module_exit(afs_exit);
-/*****************************************************************************/
-/*
- * notification that new peer record is being added
- * - called from krxsecd
- * - return an error to induce an abort
- * - mustn't sleep (caller holds an rwlock)
- */
-static int afs_adding_peer(struct rxrpc_peer *peer)
-{
-        struct afs_server *server;
-        int ret;
-        _debug("kAFS: Adding new peer %08x\n", ntohl(peer->addr.s_addr));
-        /* determine which server the peer resides in (if any) */
-        ret = afs_server_find_by_peer(peer, &server);
-        if (ret < 0)
-                return ret; /* none that we recognise, so abort */
-        _debug("Server %p{u=%d}\n", server, atomic_read(&server->usage));
-        _debug("Cell %p{u=%d}\n",
-               server->cell, atomic_read(&server->cell->usage));
-        /* cross-point the structs under a global lock */
-        spin_lock(&afs_server_peer_lock);
-        peer->user = server;
-        server->peer = peer;
-        spin_unlock(&afs_server_peer_lock);
-        afs_put_server(server);
-        return 0;
-} /* end afs_adding_peer() */
-/*****************************************************************************/
-/*
- * notification that a peer record is being discarded
- * - called from krxiod or krxsecd
- */
-static void afs_discarding_peer(struct rxrpc_peer *peer)
-{
-        struct afs_server *server;
-        _enter("%p",peer);
-        _debug("Discarding peer %08x (rtt=%lu.%lumS)\n",
-               ntohl(peer->addr.s_addr),
-               (long) (peer->rtt / 1000),
-               (long) (peer->rtt % 1000));
-        /* uncross-point the structs under a global lock */
-        spin_lock(&afs_server_peer_lock);
-        server = peer->user;
-        if (server) {
-                peer->user = NULL;
-                server->peer = NULL;
-        }
-        spin_unlock(&afs_server_peer_lock);
-        _leave("");
-} /* end afs_discarding_peer() */
-/*****************************************************************************/
-/*
- * clear the dead space between task_struct and kernel stack
- * - called by supplying -finstrument-functions to gcc
- */
-#if 0
-void __cyg_profile_func_enter (void *this_fn, void *call_site)
-__attribute__((no_instrument_function));
-void __cyg_profile_func_enter (void *this_fn, void *call_site)
-{
-       asm volatile("  movl    %%esp,%%edi     \n"
-                    "  andl    %0,%%edi        \n"
-                    "  addl    %1,%%edi        \n"
-                    "  movl    %%esp,%%ecx     \n"
-                    "  subl    %%edi,%%ecx     \n"
-                    "  shrl    $2,%%ecx        \n"
-                    "  movl    $0xedededed,%%eax     \n"
-                    "  rep stosl               \n"
-                    :
-                    : "i"(~(THREAD_SIZE - 1)), "i"(sizeof(struct thread_info))
-                    : "eax", "ecx", "edi", "memory", "cc"
-                    );
 }
-void __cyg_profile_func_exit(void *this_fn, void *call_site)
+module_exit(afs_exit);
-__attribute__((no_instrument_function));
-void __cyg_profile_func_exit(void *this_fn, void *call_site)
-{
-       asm volatile("  movl    %%esp,%%edi     \n"
-                    "  andl    %0,%%edi        \n"
-                    "  addl    %1,%%edi        \n"
-                    "  movl    %%esp,%%ecx     \n"
-                    "  subl    %%edi,%%ecx     \n"
-                    "  shrl    $2,%%ecx        \n"
-                    "  movl    $0xdadadada,%%eax     \n"
-                    "  rep stosl               \n"
-                    :
-                    : "i"(~(THREAD_SIZE - 1)), "i"(sizeof(struct thread_info))
-                    : "eax", "ecx", "edi", "memory", "cc"
-                    );
-}
-#endif
diff --git a/fs/afs/misc.c b/fs/afs/misc.c
index e4fce66d76e0..cdb9792d8161 100644
--- a/fs/afs/misc.c
+++ b/fs/afs/misc.c
@@ -1,6 +1,6 @@
-/* misc.c: miscellaneous bits
+/* miscellaneous bits
 *
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 *
 * This program is free software; you can redistribute it and/or
@@ -12,19 +12,20 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/errno.h>
-#include "errors.h"
 #include "internal.h"
+#include "afs_fs.h"
-/*****************************************************************************/
 /*
 * convert an AFS abort code to a Linux error number
 */
-int afs_abort_to_error(int abortcode)
+int afs_abort_to_error(u32 abort_code)
 {
-        switch (abortcode) {
+        switch (abort_code) {
+        case 13:                return -EACCES;
+        case 30:                return -EROFS;
        case VSALVAGE:          return -EIO;
        case VNOVNODE:          return -ENOENT;
-        case VNOVOL:            return -ENXIO;
+        case VNOVOL:            return -ENOMEDIUM;
        case VVOLEXISTS:        return -EEXIST;
        case VNOSERVICE:        return -EIO;
        case VOFFLINE:          return -ENOENT;
@@ -33,7 +34,24 @@ int afs_abort_to_error(int abortcode)
        case VOVERQUOTA:        return -EDQUOT;
        case VBUSY:             return -EBUSY;
        case VMOVED:            return -ENXIO;
-        default:                return -EIO;
+        case 0x2f6df0c:         return -EACCES;
+        case 0x2f6df0f:         return -EBUSY;
+        case 0x2f6df10:         return -EEXIST;
+        case 0x2f6df11:         return -EXDEV;
+        case 0x2f6df13:         return -ENOTDIR;
+        case 0x2f6df14:         return -EISDIR;
+        case 0x2f6df15:         return -EINVAL;
+        case 0x2f6df1a:         return -EFBIG;
+        case 0x2f6df1b:         return -ENOSPC;
+        case 0x2f6df1d:         return -EROFS;
+        case 0x2f6df1e:         return -EMLINK;
+        case 0x2f6df20:         return -EDOM;
+        case 0x2f6df21:         return -ERANGE;
+        case 0x2f6df22:         return -EDEADLK;
+        case 0x2f6df23:         return -ENAMETOOLONG;
+        case 0x2f6df24:         return -ENOLCK;
+        case 0x2f6df26:         return -ENOTEMPTY;
+        case 0x2f6df78:         return -EDQUOT;
+        default:                return -EREMOTEIO;
        }
+}
-} /* end afs_abort_to_error() */
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index 68495f0de7b3..b905ae37f912 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -1,4 +1,4 @@
-/* mntpt.c: mountpoint management
+/* mountpoint management
 *
 * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
@@ -18,10 +18,6 @@
 #include <linux/mount.h>
 #include <linux/namei.h>
 #include <linux/mnt_namespace.h>
-#include "super.h"
-#include "cell.h"
-#include "volume.h"
-#include "vnode.h"
 #include "internal.h"
@@ -30,6 +26,7 @@ static struct dentry *afs_mntpt_lookup(struct inode *dir,
                                       struct nameidata *nd);
 static int afs_mntpt_open(struct inode *inode, struct file *file);
 static void *afs_mntpt_follow_link(struct dentry *dentry, struct nameidata *nd);
+static void afs_mntpt_expiry_timed_out(struct work_struct *work);
 const struct file_operations afs_mntpt_file_operations = {
        .open           = afs_mntpt_open,
@@ -43,24 +40,19 @@ const struct inode_operations afs_mntpt_inode_operations = {
 };
 static LIST_HEAD(afs_vfsmounts);
+static DECLARE_DELAYED_WORK(afs_mntpt_expiry_timer, afs_mntpt_expiry_timed_out);
-static void afs_mntpt_expiry_timed_out(struct afs_timer *timer);
+unsigned long afs_mntpt_expiry_timeout = 10 * 60;
-struct afs_timer_ops afs_mntpt_expiry_timer_ops = {
-        .timed_out      = afs_mntpt_expiry_timed_out,
-};
-struct afs_timer afs_mntpt_expiry_timer;
-unsigned long afs_mntpt_expiry_timeout = 20;
-/*****************************************************************************/
 /*
 * check a symbolic link to see whether it actually encodes a mountpoint
 * - sets the AFS_VNODE_MOUNTPOINT flag on the vnode appropriately
 */
-int afs_mntpt_check_symlink(struct afs_vnode *vnode)
+int afs_mntpt_check_symlink(struct afs_vnode *vnode, struct key *key)
 {
+        struct file file = {
+                .private_data = key,
+        };
        struct page *page;
        size_t size;
        char *buf;
@@ -69,7 +61,7 @@ int afs_mntpt_check_symlink(struct afs_vnode *vnode)
        _enter("{%u,%u}", vnode->fid.vnode, vnode->fid.unique);
        /* read the contents of the symlink into the pagecache */
-        page = read_mapping_page(AFS_VNODE_TO_I(vnode)->i_mapping, 0, NULL);
+        page = read_mapping_page(AFS_VNODE_TO_I(vnode)->i_mapping, 0, &file);
        if (IS_ERR(page)) {
                ret = PTR_ERR(page);
                goto out;
@@ -85,7 +77,7 @@ int afs_mntpt_check_symlink(struct afs_vnode *vnode)
        /* examine the symlink's contents */
        size = vnode->status.size;
-        _debug("symlink to %*.*s", size, (int) size, buf);
+        _debug("symlink to %*.*s", (int) size, (int) size, buf);
        if (size > 2 &&
            (buf[0] == '%' || buf[0] == '#') &&
@@ -93,22 +85,20 @@ int afs_mntpt_check_symlink(struct afs_vnode *vnode)
            ) {
                _debug("symlink is a mountpoint");
                spin_lock(&vnode->lock);
-                vnode->flags |= AFS_VNODE_MOUNTPOINT;
+                set_bit(AFS_VNODE_MOUNTPOINT, &vnode->flags);
                spin_unlock(&vnode->lock);
        }
        ret = 0;
- out_free:
+out_free:
        kunmap(page);
        page_cache_release(page);
- out:
+out:
        _leave(" = %d", ret);
        return ret;
+}
-} /* end afs_mntpt_check_symlink() */
-/*****************************************************************************/
 /*
 * no valid lookup procedure on this sort of dir
 */
@@ -116,7 +106,7 @@ static struct dentry *afs_mntpt_lookup(struct inode *dir,
                                       struct dentry *dentry,
                                       struct nameidata *nd)
 {
-        kenter("%p,%p{%p{%s},%s}",
+        _enter("%p,%p{%p{%s},%s}",
               dir,
               dentry,
               dentry->d_parent,
@@ -125,15 +115,14 @@ static struct dentry *afs_mntpt_lookup(struct inode *dir,
               dentry->d_name.name);
        return ERR_PTR(-EREMOTE);
-} /* end afs_mntpt_lookup() */
+}
-/*****************************************************************************/
 /*
 * no valid open procedure on this sort of dir
 */
 static int afs_mntpt_open(struct inode *inode, struct file *file)
 {
-        kenter("%p,%p{%p{%s},%s}",
+        _enter("%p,%p{%p{%s},%s}",
               inode, file,
               file->f_path.dentry->d_parent,
               file->f_path.dentry->d_parent ?
@@ -142,9 +131,8 @@ static int afs_mntpt_open(struct inode *inode, struct file *file)
               file->f_path.dentry->d_name.name);
        return -EREMOTE;
-} /* end afs_mntpt_open() */
+}
-/*****************************************************************************/
 /*
 * create a vfsmount to be automounted
 */
@@ -157,7 +145,7 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
        char *buf, *devname = NULL, *options = NULL;
        int ret;
-        kenter("{%s}", mntpt->d_name.name);
+        _enter("{%s}", mntpt->d_name.name);
        BUG_ON(!mntpt->d_inode);
@@ -201,79 +189,108 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
                strcat(options, ",rwpath");
        /* try and do the mount */
-        kdebug("--- attempting mount %s -o %s ---", devname, options);
+        _debug("--- attempting mount %s -o %s ---", devname, options);
        mnt = vfs_kern_mount(&afs_fs_type, 0, devname, options);
-        kdebug("--- mount result %p ---", mnt);
+        _debug("--- mount result %p ---", mnt);
        free_page((unsigned long) devname);
        free_page((unsigned long) options);
-        kleave(" = %p", mnt);
+        _leave(" = %p", mnt);
        return mnt;
- error:
+error:
        if (page)
                page_cache_release(page);
        if (devname)
                free_page((unsigned long) devname);
        if (options)
                free_page((unsigned long) options);
-        kleave(" = %d", ret);
+        _leave(" = %d", ret);
        return ERR_PTR(ret);
-} /* end afs_mntpt_do_automount() */
+}
-/*****************************************************************************/
 /*
 * follow a link from a mountpoint directory, thus causing it to be mounted
 */
 static void *afs_mntpt_follow_link(struct dentry *dentry, struct nameidata *nd)
 {
        struct vfsmount *newmnt;
-        struct dentry *old_dentry;
        int err;
-        kenter("%p{%s},{%s:%p{%s}}",
+        _enter("%p{%s},{%s:%p{%s},}",
               dentry,
               dentry->d_name.name,
               nd->mnt->mnt_devname,
               dentry,
               nd->dentry->d_name.name);
-        newmnt = afs_mntpt_do_automount(dentry);
+        dput(nd->dentry);
+        nd->dentry = dget(dentry);
+        newmnt = afs_mntpt_do_automount(nd->dentry);
        if (IS_ERR(newmnt)) {
                path_release(nd);
                return (void *)newmnt;
        }
-        old_dentry = nd->dentry;
+        mntget(newmnt);
-        nd->dentry = dentry;
+        err = do_add_mount(newmnt, nd, MNT_SHRINKABLE, &afs_vfsmounts);
-        err = do_add_mount(newmnt, nd, 0, &afs_vfsmounts);
+        switch (err) {
-        nd->dentry = old_dentry;
+        case 0:
+                mntput(nd->mnt);
-        path_release(nd);
+                dput(nd->dentry);
-        if (!err) {
-                mntget(newmnt);
                nd->mnt = newmnt;
-                dget(newmnt->mnt_root);
+                nd->dentry = dget(newmnt->mnt_root);
-                nd->dentry = newmnt->mnt_root;
+                schedule_delayed_work(&afs_mntpt_expiry_timer,
+                                      afs_mntpt_expiry_timeout * HZ);
+                break;
+        case -EBUSY:
+                /* someone else made a mount here whilst we were busy */
+                while (d_mountpoint(nd->dentry) &&
+                       follow_down(&nd->mnt, &nd->dentry))
+                        ;
+                err = 0;
+        default:
+                mntput(newmnt);
+                break;
        }
-        kleave(" = %d", err);
+        _leave(" = %d", err);
        return ERR_PTR(err);
-} /* end afs_mntpt_follow_link() */
+}
-/*****************************************************************************/
 /*
 * handle mountpoint expiry timer going off
 */
-static void afs_mntpt_expiry_timed_out(struct afs_timer *timer)
+static void afs_mntpt_expiry_timed_out(struct work_struct *work)
 {
-        kenter("");
+        _enter("");
-        mark_mounts_for_expiry(&afs_vfsmounts);
+        if (!list_empty(&afs_vfsmounts)) {
+                mark_mounts_for_expiry(&afs_vfsmounts);
+                schedule_delayed_work(&afs_mntpt_expiry_timer,
+                                      afs_mntpt_expiry_timeout * HZ);
+        }
+        _leave("");
+}
-        afs_kafstimod_add_timer(&afs_mntpt_expiry_timer,
+/*
-                                afs_mntpt_expiry_timeout * HZ);
+ * kill the AFS mountpoint timer if it's still running
+ */
+void afs_mntpt_kill_timer(void)
+{
+        _enter("");
-        kleave("");
+        ASSERT(list_empty(&afs_vfsmounts));
-} /* end afs_mntpt_expiry_timed_out() */
+        cancel_delayed_work(&afs_mntpt_expiry_timer);
+        flush_scheduled_work();
+}
+/*
+ * begin unmount by attempting to remove all automounted mountpoints we added
+ * - hands the mount being unmounted and the afs_vfsmounts list (the list
+ *   that afs_mntpt_follow_link() passes to do_add_mount()) to
+ *   shrink_submounts()
+ * - the flags argument is not used here
+ */
+void afs_umount_begin(struct vfsmount *vfsmnt, int flags)
+{
+        shrink_submounts(vfsmnt, &afs_vfsmounts);
+}
diff --git a/fs/afs/mount.h b/fs/afs/mount.h
deleted file mode 100644
index 9d2f46ec549f..000000000000
--- a/fs/afs/mount.h
+++ /dev/null
@@ -1,23 +0,0 @@
-/* mount.h: mount parameters
- *
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-#ifndef _LINUX_AFS_MOUNT_H
-#define _LINUX_AFS_MOUNT_H
-struct afs_mountdata {
-        const char              *volume;        /* name of volume */
-        const char              *cell;          /* name of cell containing volume */
-        const char              *cache;         /* name of cache block device */
-        size_t                  nservers;       /* number of server addresses listed */
-        uint32_t                servers[10];    /* IP addresses of servers in this cell */
-};
-#endif /* _LINUX_AFS_MOUNT_H */
diff --git a/fs/afs/proc.c b/fs/afs/proc.c
index ae6b85b1e484..d5601f617cdb 100644
--- a/fs/afs/proc.c
+++ b/fs/afs/proc.c
@@ -1,4 +1,4 @@
-/* proc.c: /proc interface for AFS
+/* /proc interface for AFS
 *
 * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
@@ -13,8 +13,6 @@
 #include <linux/module.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
-#include "cell.h"
-#include "volume.h"
 #include <asm/uaccess.h>
 #include "internal.h"
@@ -130,7 +128,6 @@ static const struct file_operations afs_proc_cell_servers_fops = {
        .release        = afs_proc_cell_servers_release,
 };
-/*****************************************************************************/
 /*
 * initialise the /proc/fs/afs/ directory
 */
@@ -142,47 +139,43 @@ int afs_proc_init(void)
        proc_afs = proc_mkdir("fs/afs", NULL);
        if (!proc_afs)
-                goto error;
+                goto error_dir;
        proc_afs->owner = THIS_MODULE;
        p = create_proc_entry("cells", 0, proc_afs);
        if (!p)
-                goto error_proc;
+                goto error_cells;
        p->proc_fops = &afs_proc_cells_fops;
        p->owner = THIS_MODULE;
        p = create_proc_entry("rootcell", 0, proc_afs);
        if (!p)
-                goto error_cells;
+                goto error_rootcell;
        p->proc_fops = &afs_proc_rootcell_fops;
        p->owner = THIS_MODULE;
        _leave(" = 0");
        return 0;
- error_cells:
+error_rootcell:
        remove_proc_entry("cells", proc_afs);
- error_proc:
+error_cells:
        remove_proc_entry("fs/afs", NULL);
- error:
+error_dir:
        _leave(" = -ENOMEM");
        return -ENOMEM;
+}
-} /* end afs_proc_init() */
-/*****************************************************************************/
 /*
 * clean up the /proc/fs/afs/ directory
 */
 void afs_proc_cleanup(void)
 {
+        remove_proc_entry("rootcell", proc_afs);
        remove_proc_entry("cells", proc_afs);
        remove_proc_entry("fs/afs", NULL);
+}
-} /* end afs_proc_cleanup() */
-/*****************************************************************************/
 /*
 * open "/proc/fs/afs/cells" which provides a summary of extant cells
 */
@@ -199,9 +192,8 @@ static int afs_proc_cells_open(struct inode *inode, struct file *file)
        m->private = PDE(inode)->data;
        return 0;
-} /* end afs_proc_cells_open() */
+}
-/*****************************************************************************/
 /*
 * set up the iterator to start reading from the cells list and return the
 * first item
@@ -225,9 +217,8 @@ static void *afs_proc_cells_start(struct seq_file *m, loff_t *_pos)
                        break;
        return _p != &afs_proc_cells ? _p : NULL;
-} /* end afs_proc_cells_start() */
+}
-/*****************************************************************************/
 /*
 * move to next cell in cells list
 */
@@ -241,19 +232,16 @@ static void *afs_proc_cells_next(struct seq_file *p, void *v, loff_t *pos)
        _p = v == (void *) 1 ? afs_proc_cells.next : _p->next;
        return _p != &afs_proc_cells ? _p : NULL;
-} /* end afs_proc_cells_next() */
+}
-/*****************************************************************************/
 /*
 * clean up after reading from the cells list
 */
 static void afs_proc_cells_stop(struct seq_file *p, void *v)
 {
        up_read(&afs_proc_cells_sem);
+}
-} /* end afs_proc_cells_stop() */
-/*****************************************************************************/
 /*
 * display a header line followed by a load of cell lines
 */
@@ -261,19 +249,18 @@ static int afs_proc_cells_show(struct seq_file *m, void *v)
 {
        struct afs_cell *cell = list_entry(v, struct afs_cell, proc_link);
-        /* display header on line 1 */
        if (v == (void *) 1) {
+                /* display header on line 1 */
                seq_puts(m, "USE NAME\n");
                return 0;
        }
        /* display one cell per line on subsequent lines */
-        seq_printf(m, "%3d %s\n", atomic_read(&cell->usage), cell->name);
+        seq_printf(m, "%3d %s\n",
+                   atomic_read(&cell->usage), cell->name);
        return 0;
-} /* end afs_proc_cells_show() */
+}
-/*****************************************************************************/
 /*
 * handle writes to /proc/fs/afs/cells
 * - to add cells: echo "add <cellname> <IP>[:<IP>][:<IP>]"
@@ -326,30 +313,32 @@ static ssize_t afs_proc_cells_write(struct file *file, const char __user *buf,
        if (strcmp(kbuf, "add") == 0) {
                struct afs_cell *cell;
-                ret = afs_cell_create(name, args, &cell);
-                if (ret < 0)
+                cell = afs_cell_create(name, args);
+                if (IS_ERR(cell)) {
+                        ret = PTR_ERR(cell);
                        goto done;
+                }
+                afs_put_cell(cell);
                printk("kAFS: Added new cell '%s'\n", name);
-        }
+        } else {
-        else {
                goto inval;
        }
        ret = size;
- done:
+done:
        kfree(kbuf);
        _leave(" = %d", ret);
        return ret;
- inval:
+inval:
        ret = -EINVAL;
        printk("kAFS: Invalid Command on /proc/fs/afs/cells file\n");
        goto done;
-} /* end afs_proc_cells_write() */
+}
-/*****************************************************************************/
 /*
 * Stubs for /proc/fs/afs/rootcell
 */
@@ -369,7 +358,6 @@ static ssize_t afs_proc_rootcell_read(struct file *file, char __user *buf,
        return 0;
 }
-/*****************************************************************************/
 /*
 * handle writes to /proc/fs/afs/rootcell
 * - to initialize rootcell: echo "cell.name:192.168.231.14"
@@ -407,14 +395,13 @@ static ssize_t afs_proc_rootcell_write(struct file *file,
        if (ret >= 0)
                ret = size;     /* consume everything, always */
- infault:
+infault:
        kfree(kbuf);
- nomem:
+nomem:
        _leave(" = %d", ret);
        return ret;
-} /* end afs_proc_rootcell_write() */
+}
-/*****************************************************************************/
 /*
 * initialise /proc/fs/afs/<cell>/
 */
@@ -426,25 +413,25 @@ int afs_proc_cell_setup(struct afs_cell *cell)
        cell->proc_dir = proc_mkdir(cell->name, proc_afs);
        if (!cell->proc_dir)
-                return -ENOMEM;
+                goto error_dir;
        p = create_proc_entry("servers", 0, cell->proc_dir);
        if (!p)
-                goto error_proc;
+                goto error_servers;
        p->proc_fops = &afs_proc_cell_servers_fops;
        p->owner = THIS_MODULE;
        p->data = cell;
        p = create_proc_entry("vlservers", 0, cell->proc_dir);
        if (!p)
-                goto error_servers;
+                goto error_vlservers;
        p->proc_fops = &afs_proc_cell_vlservers_fops;
        p->owner = THIS_MODULE;
        p->data = cell;
        p = create_proc_entry("volumes", 0, cell->proc_dir);
        if (!p)
-                goto error_vlservers;
+                goto error_volumes;
        p->proc_fops = &afs_proc_cell_volumes_fops;
        p->owner = THIS_MODULE;
        p->data = cell;
@@ -452,17 +439,17 @@ int afs_proc_cell_setup(struct afs_cell *cell)
        _leave(" = 0");
        return 0;
- error_vlservers:
+error_volumes:
        remove_proc_entry("vlservers", cell->proc_dir);
- error_servers:
+error_vlservers:
        remove_proc_entry("servers", cell->proc_dir);
- error_proc:
+error_servers:
        remove_proc_entry(cell->name, proc_afs);
+error_dir:
        _leave(" = -ENOMEM");
        return -ENOMEM;
-} /* end afs_proc_cell_setup() */
+}
-/*****************************************************************************/
 /*
 * remove /proc/fs/afs/<cell>/
 */
@@ -476,9 +463,8 @@ void afs_proc_cell_remove(struct afs_cell *cell)
        remove_proc_entry(cell->name, proc_afs);
        _leave("");
-} /* end afs_proc_cell_remove() */
+}
-/*****************************************************************************/
 /*
 * open "/proc/fs/afs/<cell>/volumes" which provides a summary of extant cells
 */
@@ -488,7 +474,7 @@ static int afs_proc_cell_volumes_open(struct inode *inode, struct file *file)
        struct seq_file *m;
        int ret;
-        cell = afs_get_cell_maybe((struct afs_cell **) &PDE(inode)->data);
+        cell = PDE(inode)->data;
        if (!cell)
                return -ENOENT;
@@ -500,25 +486,16 @@ static int afs_proc_cell_volumes_open(struct inode *inode, struct file *file)
        m->private = cell;
        return 0;
-} /* end afs_proc_cell_volumes_open() */
+}
-/*****************************************************************************/
 /*
 * close the file and release the ref to the cell
 */
 static int afs_proc_cell_volumes_release(struct inode *inode, struct file *file)
 {
-        struct afs_cell *cell = PDE(inode)->data;
+        return seq_release(inode, file);
-        int ret;
+}
-        ret = seq_release(inode,file);
-        afs_put_cell(cell);
-        return ret;
-} /* end afs_proc_cell_volumes_release() */
-/*****************************************************************************/
 /*
 * set up the iterator to start reading from the cells list and return the
 * first item
@@ -545,9 +522,8 @@ static void *afs_proc_cell_volumes_start(struct seq_file *m, loff_t *_pos)
                        break;
        return _p != &cell->vl_list ? _p : NULL;
-} /* end afs_proc_cell_volumes_start() */
+}
-/*****************************************************************************/
 /*
 * move to next cell in cells list
 */
@@ -562,12 +538,11 @@ static void *afs_proc_cell_volumes_next(struct seq_file *p, void *v,
        (*_pos)++;
        _p = v;
-        _p = v == (void *) 1 ? cell->vl_list.next : _p->next;
+        _p = (v == (void *) 1) ? cell->vl_list.next : _p->next;
-        return _p != &cell->vl_list ? _p : NULL;
+        return (_p != &cell->vl_list) ? _p : NULL;
-} /* end afs_proc_cell_volumes_next() */
+}
-/*****************************************************************************/
 /*
 * clean up after reading from the cells list
 */
@@ -576,10 +551,18 @@ static void afs_proc_cell_volumes_stop(struct seq_file *p, void *v)
        struct afs_cell *cell = p->private;
        up_read(&cell->vl_sem);
+}
-} /* end afs_proc_cell_volumes_stop() */
+/*
+ * short labels for the volume location record states, indexed by the
+ * AFS_VL_* state value
+ * - each row is three characters plus the terminating NUL, which exactly
+ *   fills the [4] row size, so every row can be printed with %s
+ * - used for the STT column of /proc/fs/afs/<cell>/volumes
+ */
+const char afs_vlocation_states[][4] = {
+        [AFS_VL_NEW]                    = "New",
+        [AFS_VL_CREATING]               = "Crt",
+        [AFS_VL_VALID]                  = "Val",
+        [AFS_VL_NO_VOLUME]              = "NoV",
+        [AFS_VL_UPDATING]               = "Upd",
+        [AFS_VL_VOLUME_DELETED]         = "Del",
+        [AFS_VL_UNCERTAIN]              = "Unc",
+};
-/*****************************************************************************/
 /*
 * display a header line followed by a load of volume lines
 */
@@ -590,23 +573,22 @@ static int afs_proc_cell_volumes_show(struct seq_file *m, void *v)
        /* display header on line 1 */
        if (v == (void *) 1) {
-                seq_puts(m, "USE VLID[0]  VLID[1]  VLID[2]  NAME\n");
+                seq_puts(m, "USE STT VLID[0]  VLID[1]  VLID[2]  NAME\n");
                return 0;
        }
        /* display one cell per line on subsequent lines */
-        seq_printf(m, "%3d %08x %08x %08x %s\n",
+        seq_printf(m, "%3d %s %08x %08x %08x %s\n",
                   atomic_read(&vlocation->usage),
+                   afs_vlocation_states[vlocation->state],
                   vlocation->vldb.vid[0],
                   vlocation->vldb.vid[1],
                   vlocation->vldb.vid[2],
-                   vlocation->vldb.name
+                   vlocation->vldb.name);
-                   );
        return 0;
-} /* end afs_proc_cell_volumes_show() */
+}
-/*****************************************************************************/
 /*
 * open "/proc/fs/afs/<cell>/vlservers" which provides a list of volume
 * location server
@@ -617,11 +599,11 @@ static int afs_proc_cell_vlservers_open(struct inode *inode, struct file *file)
        struct seq_file *m;
        int ret;
-        cell = afs_get_cell_maybe((struct afs_cell**)&PDE(inode)->data);
+        cell = PDE(inode)->data;
        if (!cell)
                return -ENOENT;
-        ret = seq_open(file,&afs_proc_cell_vlservers_ops);
+        ret = seq_open(file, &afs_proc_cell_vlservers_ops);
        if (ret<0)
                return ret;
@@ -629,26 +611,17 @@ static int afs_proc_cell_vlservers_open(struct inode *inode, struct file *file)
        m->private = cell;
        return 0;
-} /* end afs_proc_cell_vlservers_open() */
+}
-/*****************************************************************************/
 /*
 * close the file and release the ref to the cell
 */
 static int afs_proc_cell_vlservers_release(struct inode *inode,
                                           struct file *file)
 {
-        struct afs_cell *cell = PDE(inode)->data;
+        return seq_release(inode, file);
-        int ret;
+}
-        ret = seq_release(inode,file);
-        afs_put_cell(cell);
-        return ret;
-} /* end afs_proc_cell_vlservers_release() */
-/*****************************************************************************/
 /*
 * set up the iterator to start reading from the cells list and return the
 * first item
@@ -672,9 +645,8 @@ static void *afs_proc_cell_vlservers_start(struct seq_file *m, loff_t *_pos)
                return NULL;
        return &cell->vl_addrs[pos];
-} /* end afs_proc_cell_vlservers_start() */
+}
-/*****************************************************************************/
 /*
 * move to next cell in cells list
 */
@@ -692,9 +664,8 @@ static void *afs_proc_cell_vlservers_next(struct seq_file *p, void *v,
                return NULL;
        return &cell->vl_addrs[pos];
-} /* end afs_proc_cell_vlservers_next() */
+}
-/*****************************************************************************/
 /*
 * clean up after reading from the cells list
 */
@@ -703,10 +674,8 @@ static void afs_proc_cell_vlservers_stop(struct seq_file *p, void *v)
        struct afs_cell *cell = p->private;
        up_read(&cell->vl_sem);
+}
-} /* end afs_proc_cell_vlservers_stop() */
-/*****************************************************************************/
 /*
 * display a header line followed by a load of volume lines
 */
@@ -722,11 +691,9 @@ static int afs_proc_cell_vlservers_show(struct seq_file *m, void *v)
        /* display one cell per line on subsequent lines */
        seq_printf(m, "%u.%u.%u.%u\n", NIPQUAD(addr->s_addr));
        return 0;
-} /* end afs_proc_cell_vlservers_show() */
+}
-/*****************************************************************************/
 /*
 * open "/proc/fs/afs/<cell>/servers" which provides a summary of active
 * servers
@@ -737,7 +704,7 @@ static int afs_proc_cell_servers_open(struct inode *inode, struct file *file)
        struct seq_file *m;
        int ret;
-        cell = afs_get_cell_maybe((struct afs_cell **) &PDE(inode)->data);
+        cell = PDE(inode)->data;
        if (!cell)
                return -ENOENT;
@@ -747,34 +714,24 @@ static int afs_proc_cell_servers_open(struct inode *inode, struct file *file)
        m = file->private_data;
        m->private = cell;
        return 0;
-} /* end afs_proc_cell_servers_open() */
+}
-/*****************************************************************************/
 /*
 * close the file and release the ref to the cell
 */
 static int afs_proc_cell_servers_release(struct inode *inode,
                                         struct file *file)
 {
-        struct afs_cell *cell = PDE(inode)->data;
+        return seq_release(inode, file);
-        int ret;
+}
-        ret = seq_release(inode, file);
-        afs_put_cell(cell);
-        return ret;
-} /* end afs_proc_cell_servers_release() */
-/*****************************************************************************/
 /*
 * set up the iterator to start reading from the cells list and return the
 * first item
 */
 static void *afs_proc_cell_servers_start(struct seq_file *m, loff_t *_pos)
-        __acquires(m->private->sv_lock)
+        __acquires(m->private->servers_lock)
 {
        struct list_head *_p;
        struct afs_cell *cell = m->private;
@@ -783,7 +740,7 @@ static void *afs_proc_cell_servers_start(struct seq_file *m, loff_t *_pos)
        _enter("cell=%p pos=%Ld", cell, *_pos);
        /* lock the list against modification */
-        read_lock(&cell->sv_lock);
+        read_lock(&cell->servers_lock);
        /* allow for the header line */
        if (!pos)
@@ -791,14 +748,13 @@ static void *afs_proc_cell_servers_start(struct seq_file *m, loff_t *_pos)
        pos--;
        /* find the n'th element in the list */
-        list_for_each(_p, &cell->sv_list)
+        list_for_each(_p, &cell->servers)
                if (!pos--)
                        break;
-        return _p != &cell->sv_list ? _p : NULL;
+        return _p != &cell->servers ? _p : NULL;
-} /* end afs_proc_cell_servers_start() */
+}
-/*****************************************************************************/
 /*
 * move to next cell in cells list
 */
@@ -813,25 +769,22 @@ static void *afs_proc_cell_servers_next(struct seq_file *p, void *v,
        (*_pos)++;
        _p = v;
-        _p = v == (void *) 1 ? cell->sv_list.next : _p->next;
+        _p = v == (void *) 1 ? cell->servers.next : _p->next;
-        return _p != &cell->sv_list ? _p : NULL;
+        return _p != &cell->servers ? _p : NULL;
-} /* end afs_proc_cell_servers_next() */
+}
-/*****************************************************************************/
 /*
 * clean up after reading from the cells list
 */
 static void afs_proc_cell_servers_stop(struct seq_file *p, void *v)
-        __releases(p->private->sv_lock)
+        __releases(p->private->servers_lock)
 {
        struct afs_cell *cell = p->private;
-        read_unlock(&cell->sv_lock);
+        read_unlock(&cell->servers_lock);
+}
-} /* end afs_proc_cell_servers_stop() */
-/*****************************************************************************/
 /*
 * display a header line followed by a load of volume lines
 */
@@ -849,10 +802,7 @@ static int afs_proc_cell_servers_show(struct seq_file *m, void *v)
        /* display one cell per line on subsequent lines */
        sprintf(ipaddr, "%u.%u.%u.%u", NIPQUAD(server->addr));
        seq_printf(m, "%3d %-15.15s %5d\n",
-                   atomic_read(&server->usage),
+                   atomic_read(&server->usage), ipaddr, server->fs_state);
-                   ipaddr,
-                   server->fs_state
-                   );
        return 0;
-} /* end afs_proc_cell_servers_show() */
+}
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
new file mode 100644
index 000000000000..222c1a3abbb8
--- /dev/null
+++ b/fs/afs/rxrpc.c
@@ -0,0 +1,782 @@
+/* Maintain an RxRPC server socket to do AFS communications through
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <net/sock.h>
+#include <net/af_rxrpc.h>
+#include <rxrpc/packet.h>
+#include "internal.h"
+#include "afs_cm.h"
+/* the RxRPC socket shared by all calls; set by afs_open_socket() */
+static struct socket *afs_socket; /* my RxRPC socket */
+/* the "kafsd" workqueue on which each call's async_work item is run */
+static struct workqueue_struct *afs_async_calls;
+/* count of call records allocated but not yet passed to afs_free_call();
+ * afs_close_socket() asserts that it has dropped back to zero */
+static atomic_t afs_outstanding_calls;
+/* count of intercepted socket buffers not yet freed or marked delivered;
+ * afs_close_socket() asserts that it has dropped back to zero */
+static atomic_t afs_outstanding_skbs;
+/* forward declarations of the handlers referenced by the tables below */
+static void afs_wake_up_call_waiter(struct afs_call *);
+static int afs_wait_for_call_to_complete(struct afs_call *);
+static void afs_wake_up_async_call(struct afs_call *);
+static int afs_dont_wait_for_call_to_complete(struct afs_call *);
+static void afs_process_async_call(struct work_struct *);
+static void afs_rx_interceptor(struct sock *, unsigned long, struct sk_buff *);
+static int afs_deliver_cm_op_id(struct afs_call *, struct sk_buff *, bool);
+/*
+ * synchronous call management
+ * - .wait is invoked by afs_make_call() once the request has been sent and
+ *   sleeps until the call finishes or a signal is pending
+ * - .rx_wakeup is invoked by the Rx interceptor after queueing a message on
+ *   the call and wakes that sleeper
+ */
+const struct afs_wait_mode afs_sync_call = {
+        .rx_wakeup      = afs_wake_up_call_waiter,
+        .wait           = afs_wait_for_call_to_complete,
+};
+/*
+ * asynchronous call management
+ * - .wait returns -EINPROGRESS straight away instead of sleeping
+ * - .rx_wakeup queues the call's work item on the afs_async_calls workqueue
+ *   so that received messages are processed there
+ */
+const struct afs_wait_mode afs_async_call = {
+        .rx_wakeup      = afs_wake_up_async_call,
+        .wait           = afs_dont_wait_for_call_to_complete,
+};
+/*
+ * asynchronous incoming call management
+ * - assigned to calls accepted by afs_collect_incoming_call()
+ * - only .rx_wakeup is provided; no .wait handler is set for this mode
+ */
+static const struct afs_wait_mode afs_async_incoming_call = {
+        .rx_wakeup      = afs_wake_up_async_call,
+};
+/*
+ * asynchronous incoming call initial processing
+ * - the call type given to a freshly accepted incoming call, whose
+ *   operation ID has not yet been read (state AFS_CALL_AWAIT_OP_ID)
+ * - note that no .destructor is provided by this type
+ */
+static const struct afs_call_type afs_RXCMxxxx = {
+        .name           = "CB.xxxx",
+        .deliver        = afs_deliver_cm_op_id,
+        .abort_to_error = afs_abort_to_error,
+};
+/*
+ * backlog of new-call notifications queued by the Rx interceptor, and the
+ * work item that accepts them (queued with schedule_work(), not on the
+ * afs_async_calls workqueue)
+ */
+static void afs_collect_incoming_call(struct work_struct *);
+static struct sk_buff_head afs_incoming_calls;
+static DECLARE_WORK(afs_collect_incoming_call_work, afs_collect_incoming_call);
+/*
+ * open an RxRPC socket and bind it to be a server for callback notifications
+ * - the socket is left in blocking mode and non-blocking ops use MSG_DONTWAIT
+ * - also creates the "kafsd" workqueue used to process asynchronous calls
+ * - returns 0 on success, with afs_socket set, or a negative error code, in
+ *   which case both the workqueue and the socket have been disposed of
+ */
+int afs_open_socket(void)
+{
+        struct sockaddr_rxrpc srx;
+        struct socket *socket;
+        int ret;
+        _enter("");
+        skb_queue_head_init(&afs_incoming_calls);
+        afs_async_calls = create_singlethread_workqueue("kafsd");
+        if (!afs_async_calls) {
+                _leave(" = -ENOMEM [wq]");
+                return -ENOMEM;
+        }
+        ret = sock_create_kern(AF_RXRPC, SOCK_DGRAM, PF_INET, &socket);
+        if (ret < 0) {
+                destroy_workqueue(afs_async_calls);
+                _leave(" = %d [socket]", ret);
+                return ret;
+        }
+        socket->sk->sk_allocation = GFP_NOFS;
+        /* bind the callback manager's address to make this a server socket */
+        srx.srx_family                  = AF_RXRPC;
+        srx.srx_service                 = CM_SERVICE;
+        srx.transport_type              = SOCK_DGRAM;
+        srx.transport_len               = sizeof(srx.transport.sin);
+        srx.transport.sin.sin_family    = AF_INET;
+        srx.transport.sin.sin_port      = htons(AFS_CM_PORT);
+        memset(&srx.transport.sin.sin_addr, 0,
+               sizeof(srx.transport.sin.sin_addr));
+        ret = kernel_bind(socket, (struct sockaddr *) &srx, sizeof(srx));
+        if (ret < 0) {
+                /* undo everything set up so far: the workqueue must go as
+                 * well as the socket or it is leaked on this path */
+                sock_release(socket);
+                destroy_workqueue(afs_async_calls);
+                _leave(" = %d [bind]", ret);
+                return ret;
+        }
+        /* have received messages handed to us rather than left queued on
+         * the socket */
+        rxrpc_kernel_intercept_rx_messages(socket, afs_rx_interceptor);
+        afs_socket = socket;
+        _leave(" = 0");
+        return 0;
+}
+/*
+ * close the RxRPC socket AFS was using
+ * - releases the socket first, then destroys the async call workqueue
+ * - asserts that no socket buffers or call records are still outstanding
+ *   once both have gone
+ * - NOTE(review): afs_collect_incoming_call_work is queued with
+ *   schedule_work() rather than on afs_async_calls, and nothing here waits
+ *   for it - confirm it cannot still be pending at this point
+ */
+void afs_close_socket(void)
+{
+        _enter("");
+        sock_release(afs_socket);
+        _debug("dework");
+        destroy_workqueue(afs_async_calls);
+        ASSERTCMP(atomic_read(&afs_outstanding_skbs), ==, 0);
+        ASSERTCMP(atomic_read(&afs_outstanding_calls), ==, 0);
+        _leave("");
+}
+/*
+ * tell RxRPC that the data in a socket buffer has been consumed, dropping
+ * the buffer from the outstanding-skb count
+ * - a NULL buffer is only reported (debug message plus stack dump)
+ */
+static void afs_data_delivered(struct sk_buff *skb)
+{
+        if (skb) {
+                _debug("DLVR %p{%u} [%d]",
+                       skb, skb->mark, atomic_read(&afs_outstanding_skbs));
+                /* the count must never go negative */
+                if (atomic_dec_return(&afs_outstanding_skbs) == -1)
+                        BUG();
+                rxrpc_kernel_data_delivered(skb);
+                return;
+        }
+        _debug("DLVR NULL [%d]", atomic_read(&afs_outstanding_skbs));
+        dump_stack();
+}
+/*
+ * discard a socket buffer, dropping it from the outstanding-skb count
+ * - a NULL buffer is only reported (debug message plus stack dump)
+ */
+static void afs_free_skb(struct sk_buff *skb)
+{
+        if (!skb) {
+                _debug("FREE NULL [%d]", atomic_read(&afs_outstanding_skbs));
+                dump_stack();
+                return;
+        }
+        _debug("FREE %p{%u} [%d]",
+               skb, skb->mark, atomic_read(&afs_outstanding_skbs));
+        /* the count must never go negative */
+        if (atomic_dec_return(&afs_outstanding_skbs) == -1)
+                BUG();
+        rxrpc_kernel_free_skb(skb);
+}
+/*
+ * free a call
+ * - drops the call from the outstanding-call count (which must not go
+ *   negative)
+ * - the call must already have been detached from RxRPC (rxcall cleared),
+ *   have no work item pending and have an empty receive queue; note that
+ *   the queue check requires rx_queue to have been initialised
+ * - frees the request buffer and the call record itself; call->buffer is
+ *   not freed here
+ */
+static void afs_free_call(struct afs_call *call)
+{
+        _debug("DONE %p{%s} [%d]",
+               call, call->type->name, atomic_read(&afs_outstanding_calls));
+        if (atomic_dec_return(&afs_outstanding_calls) == -1)
+                BUG();
+        ASSERTCMP(call->rxcall, ==, NULL);
+        ASSERT(!work_pending(&call->async_work));
+        ASSERT(skb_queue_empty(&call->rx_queue));
+        ASSERT(call->type->name != NULL);
+        kfree(call->request);
+        kfree(call);
+}
+/*
+ * allocate a call with flat request and reply buffers
+ * - type: the call type to attach to the new call
+ * - request_size: size of the request buffer to allocate (0 for none)
+ * - reply_size: size of the reply buffer to allocate (0 for none); this is
+ *   also recorded as call->reply_max
+ * - returns the new call, or NULL if any allocation failed
+ */
+struct afs_call *afs_alloc_flat_call(const struct afs_call_type *type,
+                                     size_t request_size, size_t reply_size)
+{
+        struct afs_call *call;
+        call = kzalloc(sizeof(*call), GFP_NOFS);
+        if (!call)
+                goto nomem_call;
+        _debug("CALL %p{%s} [%d]",
+               call, type->name, atomic_read(&afs_outstanding_calls));
+        atomic_inc(&afs_outstanding_calls);
+        call->type = type;
+        call->request_size = request_size;
+        call->reply_max = reply_size;
+        /* set up the wait queue and receive queue before anything can fail:
+         * the error path goes through afs_free_call(), which asserts that
+         * rx_queue is empty, and a zero-filled list head does not pass that
+         * check */
+        init_waitqueue_head(&call->waitq);
+        skb_queue_head_init(&call->rx_queue);
+        if (request_size) {
+                call->request = kmalloc(request_size, GFP_NOFS);
+                if (!call->request)
+                        goto nomem_free;
+        }
+        if (reply_size) {
+                call->buffer = kmalloc(reply_size, GFP_NOFS);
+                if (!call->buffer)
+                        goto nomem_free;
+        }
+        return call;
+nomem_free:
+        /* the reply buffer is the last thing allocated, so only the request
+         * buffer can need freeing here and afs_free_call() does that */
+        afs_free_call(call);
+nomem_call:
+        return NULL;
+}
+/*
+ * clean up a call with flat buffers
+ * - releases the request and reply buffers and clears both pointers so
+ *   that they cannot be freed a second time
+ */
+void afs_flat_call_destructor(struct afs_call *call)
+{
+        _enter("");
+        kfree(call->request);
+        kfree(call->buffer);
+        call->request = NULL;
+        call->buffer = NULL;
+}
+/*
+ * initiate a call
+ * - addr: IPv4 address of the server; the port and service ID are taken
+ *   from the call record
+ * - call: the prepared call; call->key is handed to RxRPC and then cleared
+ * - gfp: allocation flags passed on to rxrpc_kernel_begin_call()
+ * - wait_mode: how completion is awaited (e.g. afs_sync_call or
+ *   afs_async_call)
+ * - returns whatever wait_mode->wait() returns once the request has been
+ *   sent, or a negative error code if the call could not be started or the
+ *   request could not be sent
+ * - on such an error the call has been destroyed and freed here and must
+ *   not be touched again by the caller
+ */
+int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp,
+                  const struct afs_wait_mode *wait_mode)
+{
+        struct sockaddr_rxrpc srx;
+        struct rxrpc_call *rxcall;
+        struct msghdr msg;
+        struct kvec iov[1];
+        int ret;
+        _enter("%x,{%d},", addr->s_addr, ntohs(call->port));
+        ASSERT(call->type != NULL);
+        ASSERT(call->type->name != NULL);
+        _debug("MAKE %p{%s} [%d]",
+               call, call->type->name, atomic_read(&afs_outstanding_calls));
+        call->wait_mode = wait_mode;
+        INIT_WORK(&call->async_work, afs_process_async_call);
+        memset(&srx, 0, sizeof(srx));
+        srx.srx_family = AF_RXRPC;
+        srx.srx_service = call->service_id;
+        srx.transport_type = SOCK_DGRAM;
+        srx.transport_len = sizeof(srx.transport.sin);
+        srx.transport.sin.sin_family = AF_INET;
+        srx.transport.sin.sin_port = call->port;
+        memcpy(&srx.transport.sin.sin_addr, addr,
+               sizeof(srx.transport.sin.sin_addr));
+        /* create a call */
+        rxcall = rxrpc_kernel_begin_call(afs_socket, &srx, call->key,
+                                         (unsigned long) call, gfp);
+        call->key = NULL;
+        if (IS_ERR(rxcall)) {
+                ret = PTR_ERR(rxcall);
+                goto error_kill_call;
+        }
+        call->rxcall = rxcall;
+        /* send the request */
+        iov[0].iov_base = call->request;
+        iov[0].iov_len  = call->request_size;
+        msg.msg_name            = NULL;
+        msg.msg_namelen         = 0;
+        msg.msg_iov             = (struct iovec *) iov;
+        msg.msg_iovlen          = 1;
+        msg.msg_control         = NULL;
+        msg.msg_controllen      = 0;
+        msg.msg_flags           = 0;
+        /* have to change the state *before* sending the last packet as RxRPC
+         * might give us the reply before it returns from sending the
+         * request */
+        call->state = AFS_CALL_AWAIT_REPLY;
+        ret = rxrpc_kernel_send_data(rxcall, &msg, call->request_size);
+        if (ret < 0)
+                goto error_do_abort;
+        /* at this point, an async call may no longer exist as it may have
+         * already completed - so use the wait_mode argument rather than
+         * reading it back out of the call */
+        return wait_mode->wait(call);
+error_do_abort:
+        rxrpc_kernel_abort_call(rxcall, RX_USER_ABORT);
+        rxrpc_kernel_end_call(rxcall);
+        call->rxcall = NULL;
+error_kill_call:
+        /* the destructor is optional, as in afs_process_async_call() */
+        if (call->type->destructor)
+                call->type->destructor(call);
+        afs_free_call(call);
+        _leave(" = %d", ret);
+        return ret;
+}
+/*
+ * handle a message intercepted on its way into the socket's Rx queue
+ * - called with the socket receive queue lock held to ensure message ordering
+ * - called with softirqs disabled
+ * - user_call_ID is the afs_call pointer that was given to RxRPC for the
+ *   call, or 0 if the message announces a new incoming call
+ */
+static void afs_rx_interceptor(struct sock *sk, unsigned long user_call_ID,
+                               struct sk_buff *skb)
+{
+        struct afs_call *call = (struct afs_call *) user_call_ID;
+        _enter("%p,,%u", call, skb->mark);
+        _debug("ICPT %p{%u} [%d]",
+               skb, skb->mark, atomic_read(&afs_outstanding_skbs));
+        ASSERTCMP(sk, ==, afs_socket->sk);
+        atomic_inc(&afs_outstanding_skbs);
+        if (call) {
+                /* the message belongs to a known call: queue it there and
+                 * poke whoever is handling that call */
+                skb_queue_tail(&call->rx_queue, skb);
+                call->wait_mode->rx_wakeup(call);
+        } else {
+                /* it's an incoming call for our callback service: queue it
+                 * for the collector work item */
+                skb_queue_tail(&afs_incoming_calls, skb);
+                schedule_work(&afs_collect_incoming_call_work);
+        }
+        _leave("");
+}
+/*
+ * deliver messages to a call
+ * - messages are taken off the call's receive queue for as long as the call
+ *   is in one of the AWAIT states
+ * - data messages are passed to the call type's deliver routine and then
+ *   marked as delivered; any other message updates the call's state and
+ *   error and is freed
+ * - once the call has reached AFS_CALL_COMPLETE or beyond, anything left in
+ *   the queue is discarded and, if call->incoming is set, the call is ended
+ *   and freed here
+ * - NOTE(review): after this frees an incoming call, the caller must not
+ *   look at the call again; afs_process_async_call() reads call->state
+ *   after calling this - confirm that is safe
+ */
+static void afs_deliver_to_call(struct afs_call *call)
+{
+        struct sk_buff *skb;
+        bool last;
+        u32 abort_code;
+        int ret;
+        _enter("");
+        while ((call->state == AFS_CALL_AWAIT_REPLY ||
+                call->state == AFS_CALL_AWAIT_OP_ID ||
+                call->state == AFS_CALL_AWAIT_REQUEST ||
+                call->state == AFS_CALL_AWAIT_ACK) &&
+               (skb = skb_dequeue(&call->rx_queue))) {
+                switch (skb->mark) {
+                case RXRPC_SKB_MARK_DATA:
+                        _debug("Rcv DATA");
+                        last = rxrpc_kernel_is_data_last(skb);
+                        ret = call->type->deliver(call, skb, last);
+                        switch (ret) {
+                        case 0:
+                                /* the last packet of a reply completes an
+                                 * outgoing call */
+                                if (last &&
+                                    call->state == AFS_CALL_AWAIT_REPLY)
+                                        call->state = AFS_CALL_COMPLETE;
+                                break;
+                        case -ENOTCONN:
+                                abort_code = RX_CALL_DEAD;
+                                goto do_abort;
+                        case -ENOTSUPP:
+                                abort_code = RX_INVALID_OPERATION;
+                                goto do_abort;
+                        default:
+                                /* any other error is reported as an
+                                 * unmarshalling failure; the SS code is used
+                                 * when we are not awaiting a reply */
+                                abort_code = RXGEN_CC_UNMARSHAL;
+                                if (call->state != AFS_CALL_AWAIT_REPLY)
+                                        abort_code = RXGEN_SS_UNMARSHAL;
+                        do_abort:
+                                rxrpc_kernel_abort_call(call->rxcall,
+                                                        abort_code);
+                                call->error = ret;
+                                call->state = AFS_CALL_ERROR;
+                                break;
+                        }
+                        /* data packets are handed back as delivered rather
+                         * than freed, so skip the afs_free_skb() below */
+                        afs_data_delivered(skb);
+                        skb = NULL;
+                        continue;
+                case RXRPC_SKB_MARK_FINAL_ACK:
+                        _debug("Rcv ACK");
+                        call->state = AFS_CALL_COMPLETE;
+                        break;
+                case RXRPC_SKB_MARK_BUSY:
+                        _debug("Rcv BUSY");
+                        call->error = -EBUSY;
+                        call->state = AFS_CALL_BUSY;
+                        break;
+                case RXRPC_SKB_MARK_REMOTE_ABORT:
+                        abort_code = rxrpc_kernel_get_abort_code(skb);
+                        call->error = call->type->abort_to_error(abort_code);
+                        call->state = AFS_CALL_ABORTED;
+                        _debug("Rcv ABORT %u -> %d", abort_code, call->error);
+                        break;
+                case RXRPC_SKB_MARK_NET_ERROR:
+                        call->error = -rxrpc_kernel_get_error_number(skb);
+                        call->state = AFS_CALL_ERROR;
+                        _debug("Rcv NET ERROR %d", call->error);
+                        break;
+                case RXRPC_SKB_MARK_LOCAL_ERROR:
+                        call->error = -rxrpc_kernel_get_error_number(skb);
+                        call->state = AFS_CALL_ERROR;
+                        _debug("Rcv LOCAL ERROR %d", call->error);
+                        break;
+                default:
+                        BUG();
+                        break;
+                }
+                afs_free_skb(skb);
+        }
+        /* make sure the queue is empty if the call is done with (we might have
+         * aborted the call early because of an unmarshalling error) */
+        if (call->state >= AFS_CALL_COMPLETE) {
+                while ((skb = skb_dequeue(&call->rx_queue)))
+                        afs_free_skb(skb);
+                if (call->incoming) {
+                        rxrpc_kernel_end_call(call->rxcall);
+                        call->rxcall = NULL;
+                        /* the destructor is optional (the initial incoming
+                         * call type does not provide one) */
+                        if (call->type->destructor)
+                                call->type->destructor(call);
+                        afs_free_call(call);
+                }
+        }
+        _leave("");
+}
+/*
+ * wait synchronously for a call to complete
+ * - sleeps interruptibly on the call's wait queue, delivering any messages
+ *   that turn up on the receive queue, until the call reaches
+ *   AFS_CALL_COMPLETE or beyond or a signal becomes pending
+ * - returns call->error if the call finished, or -EINTR if interrupted, in
+ *   which case the call is aborted and its queued messages are discarded
+ * - the call is always ended, destroyed and freed before returning
+ */
+static int afs_wait_for_call_to_complete(struct afs_call *call)
+{
+        struct sk_buff *skb;
+        int ret;
+        DECLARE_WAITQUEUE(myself, current);
+        _enter("");
+        add_wait_queue(&call->waitq, &myself);
+        for (;;) {
+                /* set the task state before checking the queue so that a
+                 * wakeup arriving between the check and schedule() is not
+                 * lost */
+                set_current_state(TASK_INTERRUPTIBLE);
+                /* deliver any messages that are in the queue */
+                if (!skb_queue_empty(&call->rx_queue)) {
+                        __set_current_state(TASK_RUNNING);
+                        afs_deliver_to_call(call);
+                        continue;
+                }
+                ret = call->error;
+                if (call->state >= AFS_CALL_COMPLETE)
+                        break;
+                ret = -EINTR;
+                if (signal_pending(current))
+                        break;
+                schedule();
+        }
+        remove_wait_queue(&call->waitq, &myself);
+        __set_current_state(TASK_RUNNING);
+        /* kill the call */
+        if (call->state < AFS_CALL_COMPLETE) {
+                _debug("call incomplete");
+                rxrpc_kernel_abort_call(call->rxcall, RX_CALL_DEAD);
+                while ((skb = skb_dequeue(&call->rx_queue)))
+                        afs_free_skb(skb);
+        }
+        _debug("call complete");
+        rxrpc_kernel_end_call(call->rxcall);
+        call->rxcall = NULL;
+        /* the destructor is optional, as in afs_process_async_call() */
+        if (call->type->destructor)
+                call->type->destructor(call);
+        afs_free_call(call);
+        _leave(" = %d", ret);
+        return ret;
+}
+/*
+ * wake up a waiting call
+ * - the rx_wakeup handler for synchronous calls: wakes whoever is sleeping
+ *   on the call's wait queue
+ */
+static void afs_wake_up_call_waiter(struct afs_call *call)
+{
+        wake_up(&call->waitq);
+}
+/*
+ * wake up an asynchronous call
+ * - the rx_wakeup handler for asynchronous calls: queues the call's work
+ *   item on the afs_async_calls workqueue
+ */
+static void afs_wake_up_async_call(struct afs_call *call)
+{
+        _enter("");
+        queue_work(afs_async_calls, &call->async_work);
+}
+/*
+ * put a call into asynchronous mode
+ * - mustn't touch the call descriptor as the call may have completed by the
+ *   time we get here
+ * - always returns -EINPROGRESS
+ */
+static int afs_dont_wait_for_call_to_complete(struct afs_call *call)
+{
+        _enter("");
+        return -EINPROGRESS;
+}
+/*
+ * delete an asynchronous call
+ * - work function installed by afs_process_async_call() once a call has
+ *   been finished with; frees the call that contains the work item
+ */
+static void afs_delete_async_call(struct work_struct *work)
+{
+        _enter("");
+        afs_free_call(container_of(work, struct afs_call, async_work));
+        _leave("");
+}
+/*
+ * perform processing on an asynchronous call
+ * - on a multiple-thread workqueue this work item may try to run on several
+ *   CPUs at the same time
+ * - delivers any queued messages, then, if the call has reached
+ *   AFS_CALL_COMPLETE or beyond and has a wait mode, reports completion,
+ *   ends the RxRPC call, runs the optional destructor and requeues the work
+ *   item to free the call
+ * - NOTE(review): afs_deliver_to_call() ends and frees the call itself when
+ *   call->incoming is set, yet call->state is read after it returns -
+ *   confirm that the call cannot have been freed by then
+ */
+static void afs_process_async_call(struct work_struct *work)
+{
+        struct afs_call *call =
+                container_of(work, struct afs_call, async_work);
+        _enter("");
+        if (!skb_queue_empty(&call->rx_queue))
+                afs_deliver_to_call(call);
+        if (call->state >= AFS_CALL_COMPLETE && call->wait_mode) {
+                /* hand the reply and error to the completion handler, if the
+                 * wait mode provides one */
+                if (call->wait_mode->async_complete)
+                        call->wait_mode->async_complete(call->reply,
+                                                        call->error);
+                call->reply = NULL;
+                /* kill the call */
+                rxrpc_kernel_end_call(call->rxcall);
+                call->rxcall = NULL;
+                if (call->type->destructor)
+                        call->type->destructor(call);
+                /* we can't just delete the call because the work item may be
+                 * queued */
+                /* so switch the work function to the deleter and queue the
+                 * item again to do the freeing */
+                PREPARE_WORK(&call->async_work, afs_delete_async_call);
+                queue_work(afs_async_calls, &call->async_work);
+        }
+        _leave("");
+}
+/*
+ * empty a socket buffer into a flat reply buffer
+ * - appends the buffer's data at offset call->reply_size in call->buffer
+ * - never writes beyond call->reply_max bytes: data that would not fit is
+ *   dropped rather than copied (assumes call->buffer holds reply_max bytes,
+ *   as arranged by afs_alloc_flat_call())
+ * - call->reply_size is still advanced by the full length received so that
+ *   a caller comparing it against reply_max can see that the reply was
+ *   oversized
+ */
+void afs_transfer_reply(struct afs_call *call, struct sk_buff *skb)
+{
+        size_t len = skb->len;
+        size_t space = 0;
+        size_t copy;
+        /* work out how much room is left in the reply buffer */
+        if (call->reply_size < call->reply_max)
+                space = call->reply_max - call->reply_size;
+        copy = len < space ? len : space;
+        if (copy &&
+            skb_copy_bits(skb, 0, call->buffer + call->reply_size, copy) < 0)
+                BUG();
+        call->reply_size += len;
+}
+/*
+ * accept the backlog of incoming calls
+ * - work function run for each batch of new-call notifications queued by
+ *   the Rx interceptor
+ * - for each notification a call record is prepared (type afs_RXCMxxxx,
+ *   state AFS_CALL_AWAIT_OP_ID) and offered to rxrpc_kernel_accept_call()
+ * - a record that was not consumed because the accept failed is kept for
+ *   the next notification and freed at the end if still unused
+ * - if a record cannot be allocated, one call is rejected and processing
+ *   stops, leaving any remaining notifications on the queue
+ */
+static void afs_collect_incoming_call(struct work_struct *work)
+{
+        struct rxrpc_call *rxcall;
+        struct afs_call *call = NULL;
+        struct sk_buff *skb;
+        while ((skb = skb_dequeue(&afs_incoming_calls))) {
+                _debug("new call");
+                /* don't need the notification */
+                afs_free_skb(skb);
+                if (!call) {
+                        call = kzalloc(sizeof(struct afs_call), GFP_KERNEL);
+                        if (!call) {
+                                rxrpc_kernel_reject_call(afs_socket);
+                                return;
+                        }
+                        INIT_WORK(&call->async_work, afs_process_async_call);
+                        call->wait_mode = &afs_async_incoming_call;
+                        call->type = &afs_RXCMxxxx;
+                        init_waitqueue_head(&call->waitq);
+                        skb_queue_head_init(&call->rx_queue);
+                        call->state = AFS_CALL_AWAIT_OP_ID;
+                        _debug("CALL %p{%s} [%d]",
+                               call, call->type->name,
+                               atomic_read(&afs_outstanding_calls));
+                        atomic_inc(&afs_outstanding_calls);
+                }
+                /* the call record doubles as the user call ID that the
+                 * interceptor gets back with each message */
+                rxcall = rxrpc_kernel_accept_call(afs_socket,
+                                                  (unsigned long) call);
+                if (!IS_ERR(rxcall)) {
+                        /* accepted: the record now belongs to the call, so
+                         * a fresh one is needed next time round */
+                        call->rxcall = rxcall;
+                        call = NULL;
+                }
+        }
+        if (call)
+                afs_free_call(call);
+}
+/*
+ * grab the operation ID from an incoming cache manager call
+ * - accumulates up to four bytes into call->operation_ID, possibly across
+ *   several skbs, using call->offset to track progress
+ * - returns 0 if more data is needed, -EBADMSG if the final skb left the ID
+ *   short, -ENOTSUPP if the cache manager won't route the call, otherwise
+ *   whatever the selected call type's deliver op returns
+ */
+static int afs_deliver_cm_op_id(struct afs_call *call, struct sk_buff *skb,
+                                bool last)
+{
+        void *oibuf = (void *) &call->operation_ID;
+        size_t nbytes = skb->len;
+        _enter("{%u},{%zu},%d", call->offset, nbytes, last);
+        ASSERTCMP(call->offset, <, 4);
+        /* the operation ID forms the first four bytes of the request data */
+        nbytes = min_t(size_t, nbytes, 4 - call->offset);
+        if (skb_copy_bits(skb, 0, oibuf + call->offset, nbytes) < 0 ||
+            !pskb_pull(skb, nbytes))
+                BUG();
+        call->offset += nbytes;
+        if (call->offset < 4) {
+                if (!last) {
+                        _leave(" = 0 [incomplete]");
+                        return 0;
+                }
+                _leave(" = -EBADMSG [op ID short]");
+                return -EBADMSG;
+        }
+        call->state = AFS_CALL_AWAIT_REQUEST;
+        /* ask the cache manager to route the call (it'll change the call type
+         * if successful) */
+        if (!afs_cm_incoming_call(call))
+                return -ENOTSUPP;
+        /* pass responsibility for the remainder of this message off to the
+         * cache manager op */
+        return call->type->deliver(call, skb, last);
+}
+/*
+ * send an empty reply
+ * - moves the call to AFS_CALL_AWAIT_ACK and sends a zero-length message on
+ *   the call's rxrpc call
+ * - if the send fails, the rxrpc call is ended, the call type's destructor
+ *   (if it has one) is run and the call is freed here; on -ENOMEM the rxrpc
+ *   call is aborted with RX_USER_ABORT first
+ */
+void afs_send_empty_reply(struct afs_call *call)
+{
+        struct msghdr msg;
+        struct iovec iov[1];
+        _enter("");
+        iov[0].iov_base         = NULL;
+        iov[0].iov_len          = 0;
+        msg.msg_name            = NULL;
+        msg.msg_namelen         = 0;
+        msg.msg_iov             = iov;
+        msg.msg_iovlen          = 0;
+        msg.msg_control         = NULL;
+        msg.msg_controllen      = 0;
+        msg.msg_flags           = 0;
+        call->state = AFS_CALL_AWAIT_ACK;
+        switch (rxrpc_kernel_send_data(call->rxcall, &msg, 0)) {
+        case 0:
+                _leave(" [replied]");
+                return;
+        case -ENOMEM:
+                _debug("oom");
+                rxrpc_kernel_abort_call(call->rxcall, RX_USER_ABORT);
+                /* fall through - the call still has to be torn down */
+        default:
+                rxrpc_kernel_end_call(call->rxcall);
+                call->rxcall = NULL;
+                /* the destructor is optional, as in afs_process_async_call() */
+                if (call->type->destructor)
+                        call->type->destructor(call);
+                afs_free_call(call);
+                _leave(" [error]");
+                return;
+        }
+}
+/*
+ * send a simple reply
+ * - moves the call to AFS_CALL_AWAIT_ACK and sends the len bytes at buf as a
+ *   single-element message on the call's rxrpc call
+ * - if the send fails, the rxrpc call is ended, the call type's destructor
+ *   (if it has one) is run and the call is freed here; on -ENOMEM the rxrpc
+ *   call is aborted with RX_USER_ABORT first
+ */
+void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len)
+{
+        struct msghdr msg;
+        struct iovec iov[1];
+        _enter("");
+        iov[0].iov_base         = (void *) buf;
+        iov[0].iov_len          = len;
+        msg.msg_name            = NULL;
+        msg.msg_namelen         = 0;
+        msg.msg_iov             = iov;
+        msg.msg_iovlen          = 1;
+        msg.msg_control         = NULL;
+        msg.msg_controllen      = 0;
+        msg.msg_flags           = 0;
+        call->state = AFS_CALL_AWAIT_ACK;
+        switch (rxrpc_kernel_send_data(call->rxcall, &msg, len)) {
+        case 0:
+                _leave(" [replied]");
+                return;
+        case -ENOMEM:
+                _debug("oom");
+                rxrpc_kernel_abort_call(call->rxcall, RX_USER_ABORT);
+                /* fall through - the call still has to be torn down */
+        default:
+                rxrpc_kernel_end_call(call->rxcall);
+                call->rxcall = NULL;
+                /* the destructor is optional, as in afs_process_async_call() */
+                if (call->type->destructor)
+                        call->type->destructor(call);
+                afs_free_call(call);
+                _leave(" [error]");
+                return;
+        }
+}
+/*
+ * extract a piece of data from the received data socket buffers
+ * - copies up to (count - call->offset) bytes from the front of the skb into
+ *   buf at call->offset, pulls them off the skb and advances call->offset
+ * - returns 0 once count bytes have been gathered, -EAGAIN if more data is
+ *   needed and -EBADMSG if this was the last skb and the data is short
+ */
+int afs_extract_data(struct afs_call *call, struct sk_buff *skb,
+                     bool last, void *buf, size_t count)
+{
+        size_t chunk = skb->len;
+        _enter("{%u},{%zu},%d,,%zu", call->offset, chunk, last, count);
+        ASSERTCMP(call->offset, <, count);
+        chunk = min_t(size_t, chunk, count - call->offset);
+        if (skb_copy_bits(skb, 0, buf + call->offset, chunk) < 0)
+                BUG();
+        if (!pskb_pull(skb, chunk))
+                BUG();
+        call->offset += chunk;
+        if (call->offset >= count)
+                return 0;
+        if (!last) {
+                _leave(" = -EAGAIN");
+                return -EAGAIN;
+        }
+        _leave(" = -EBADMSG [%d < %zu]", call->offset, count);
+        return -EBADMSG;
+}
diff --git a/fs/afs/security.c b/fs/afs/security.c
new file mode 100644
index 000000000000..f9f424d80458
--- /dev/null
+++ b/fs/afs/security.c
@@ -0,0 +1,356 @@
+/* AFS security handling
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/ctype.h>
+#include <keys/rxrpc-type.h>
+#include "internal.h"
+/*
+ * get a key
+ * - requests an rxrpc-type key matching the description of the cell's
+ *   anonymous key
+ * - if there is no such key (-ENOKEY), a reference on the cell's anonymous
+ *   key is returned instead; any other error is passed back as an error
+ *   pointer
+ */
+struct key *afs_request_key(struct afs_cell *cell)
+{
+        struct key *key;
+        _enter("{%x}", key_serial(cell->anonymous_key));
+        _debug("key %s", cell->anonymous_key->description);
+        key = request_key(&key_type_rxrpc, cell->anonymous_key->description,
+                          NULL);
+        if (!IS_ERR(key)) {
+                /* act as authorised user */
+                _leave(" = {%x} [auth]", key_serial(key));
+                return key;
+        }
+        if (PTR_ERR(key) != -ENOKEY) {
+                _leave(" = %ld", PTR_ERR(key));
+                return key;
+        }
+        /* act as anonymous user */
+        _leave(" = {%x} [anon]", key_serial(cell->anonymous_key));
+        return key_get(cell->anonymous_key);
+}
+/*
+ * dispose of a permits list
+ * - RCU callback: drops the key reference held by each permit, last entry
+ *   first, then frees the list
+ */
+void afs_zap_permits(struct rcu_head *rcu)
+{
+        struct afs_permits *permits =
+                container_of(rcu, struct afs_permits, rcu);
+        int loop = permits->count;
+        _enter("{%d}", permits->count);
+        while (--loop >= 0)
+                key_put(permits->permits[loop].key);
+        kfree(permits);
+}
+/*
+ * dispose of a permits list in which all the key pointers have been copied
+ * - RCU callback: only the list itself is freed; the key references are not
+ *   dropped here
+ */
+static void afs_dispose_of_permits(struct rcu_head *rcu)
+{
+        struct afs_permits *permits;
+        permits = container_of(rcu, struct afs_permits, rcu);
+        _enter("{%d}", permits->count);
+        kfree(permits);
+}
+/*
+ * get the authorising vnode
+ * - for a directory this is the specified inode itself; for anything else it
+ *   is the inode looked up from the parent fid recorded in the vnode's status
+ * - returns an error pointer if the parent inode can't be obtained
+ * - the caller must release the ref on the inode
+ */
+static struct afs_vnode *afs_get_auth_inode(struct afs_vnode *vnode,
+                                            struct key *key)
+{
+        struct afs_vnode *auth_vnode;
+        struct inode *auth_inode;
+        _enter("");
+        if (!S_ISDIR(vnode->vfs_inode.i_mode)) {
+                /* not a directory: the parent directory holds the ACL */
+                auth_inode = afs_iget(vnode->vfs_inode.i_sb, key,
+                                      &vnode->status.parent, NULL, NULL);
+                if (IS_ERR(auth_inode))
+                        return ERR_PTR(PTR_ERR(auth_inode));
+        } else {
+                /* a directory authorises itself; just pin it */
+                auth_inode = igrab(&vnode->vfs_inode);
+                ASSERT(auth_inode != NULL);
+        }
+        auth_vnode = AFS_FS_I(auth_inode);
+        _leave(" = {%x}", auth_vnode->fid.vnode);
+        return auth_vnode;
+}
+/*
+ * clear the permit cache on a directory vnode
+ * - the vnode's permits pointer is detached and replaced with NULL under
+ *   permits_lock
+ * - the old list, if there was one, is handed to call_rcu() so that
+ *   afs_zap_permits() drops its key references and frees it later rather
+ *   than it being freed immediately here
+ */
+void afs_clear_permits(struct afs_vnode *vnode)
+{
+        struct afs_permits *permits;
+        _enter("{%x}", vnode->fid.vnode);
+        mutex_lock(&vnode->permits_lock);
+        permits = vnode->permits;
+        rcu_assign_pointer(vnode->permits, NULL);
+        mutex_unlock(&vnode->permits_lock);
+        if (permits)
+                call_rcu(&permits->rcu, afs_zap_permits);
+        _leave("");
+}
+/*
+ * add the result obtained for a vnode to its or its parent directory's cache
+ * for the key used to access it
+ * - vnode: the vnode whose status (anon_access, caller_access) was fetched
+ * - key: the key the status was fetched with
+ * - acl_order: value compared against the authorising vnode's acl_order; the
+ *   result is not cached if the authorising vnode's counter is ahead of it
+ * - the anonymous access mask on the authorising vnode is always refreshed;
+ *   a per-key permit is only recorded for keys other than the cell's
+ *   anonymous key
+ * - an existing permit for the key is amended in place; otherwise a new list
+ *   one entry longer is built and published with rcu_assign_pointer() and
+ *   the old list is freed via RCU without dropping its keys, since the key
+ *   pointers were copied into the new list
+ * - failures (no authorising inode, allocation failure) are silent: the
+ *   result is simply not cached
+ */
+void afs_cache_permit(struct afs_vnode *vnode, struct key *key, long acl_order)
+{
+        struct afs_permits *permits, *xpermits;
+        struct afs_permit *permit;
+        struct afs_vnode *auth_vnode;
+        int count, loop;
+        _enter("{%x},%x,%lx", vnode->fid.vnode, key_serial(key), acl_order);
+        auth_vnode = afs_get_auth_inode(vnode, key);
+        if (IS_ERR(auth_vnode)) {
+                _leave(" [get error %ld]", PTR_ERR(auth_vnode));
+                return;
+        }
+        mutex_lock(&auth_vnode->permits_lock);
+        /* guard against a rename being detected whilst we waited for the
+         * lock
+         * NOTE(review): for a directory, auth_vnode is vnode itself, so this
+         * compares the directory's own fid with its parent's fid - confirm
+         * that is the intended behaviour */
+        if (memcmp(&auth_vnode->fid, &vnode->status.parent,
+                   sizeof(struct afs_fid)) != 0) {
+                _debug("renamed");
+                goto out_unlock;
+        }
+        /* have to be careful as the directory's callback may be broken between
+         * us receiving the status we're trying to cache and us getting the
+         * lock to update the cache for the status */
+        if (auth_vnode->acl_order - acl_order > 0) {
+                _debug("ACL changed?");
+                goto out_unlock;
+        }
+        /* always update the anonymous mask */
+        _debug("anon access %x", vnode->status.anon_access);
+        auth_vnode->status.anon_access = vnode->status.anon_access;
+        if (key == vnode->volume->cell->anonymous_key)
+                goto out_unlock;
+        xpermits = auth_vnode->permits;
+        count = 0;
+        if (xpermits) {
+                /* see if the permit is already in the list
+                 * - if it is then we just amend the list
+                 */
+                count = xpermits->count;
+                permit = xpermits->permits;
+                for (loop = count; loop > 0; loop--) {
+                        if (permit->key == key) {
+                                permit->access_mask =
+                                        vnode->status.caller_access;
+                                goto out_unlock;
+                        }
+                        permit++;
+                }
+        }
+        permits = kmalloc(sizeof(*permits) + sizeof(*permit) * (count + 1),
+                          GFP_NOFS);
+        if (!permits)
+                goto out_unlock;
+        /* there may be no previous list to copy from, in which case xpermits
+         * is NULL and must not be used to form the source address */
+        if (xpermits)
+                memcpy(permits->permits, xpermits->permits,
+                       count * sizeof(struct afs_permit));
+        _debug("key %x access %x",
+               key_serial(key), vnode->status.caller_access);
+        permits->permits[count].access_mask = vnode->status.caller_access;
+        permits->permits[count].key = key_get(key);
+        permits->count = count + 1;
+        rcu_assign_pointer(auth_vnode->permits, permits);
+        if (xpermits)
+                call_rcu(&xpermits->rcu, afs_dispose_of_permits);
+out_unlock:
+        mutex_unlock(&auth_vnode->permits_lock);
+        iput(&auth_vnode->vfs_inode);
+        _leave("");
+}
+/*
+ * check with the fileserver to see if the directory or parent directory is
+ * permitted to be accessed with this authorisation, and if so, what access it
+ * is granted
+ * - vnode: the vnode being accessed
+ * - key: the key the access is being made with
+ * - _access: where to store the access mask; set to 0 on error
+ * - for the cell's anonymous key the authorising vnode's anon_access mask is
+ *   used; for any other key the authorising vnode's permits list is searched
+ *   under the RCU read lock
+ * - only if neither yields an answer is the vnode's status refetched, in
+ *   which case the freshly obtained caller_access is returned
+ * - returns 0 on success or a negative error code
+ */
+static int afs_check_permit(struct afs_vnode *vnode, struct key *key,
+                            afs_access_t *_access)
+{
+        struct afs_permits *permits;
+        struct afs_permit *permit;
+        struct afs_vnode *auth_vnode;
+        bool valid;
+        int loop, ret;
+        _enter("");
+        auth_vnode = afs_get_auth_inode(vnode, key);
+        if (IS_ERR(auth_vnode)) {
+                *_access = 0;
+                _leave(" = %ld", PTR_ERR(auth_vnode));
+                return PTR_ERR(auth_vnode);
+        }
+        ASSERT(S_ISDIR(auth_vnode->vfs_inode.i_mode));
+        /* check the permits to see if we've got one yet */
+        if (key == auth_vnode->volume->cell->anonymous_key) {
+                _debug("anon");
+                *_access = auth_vnode->status.anon_access;
+                valid = true;
+        } else {
+                valid = false;
+                rcu_read_lock();
+                permits = rcu_dereference(auth_vnode->permits);
+                if (permits) {
+                        permit = permits->permits;
+                        for (loop = permits->count; loop > 0; loop--) {
+                                if (permit->key == key) {
+                                        _debug("found in cache");
+                                        *_access = permit->access_mask;
+                                        valid = true;
+                                        break;
+                                }
+                                permit++;
+                        }
+                }
+                rcu_read_unlock();
+        }
+        if (!valid) {
+                /* check the status on the file we're actually interested in
+                 * (the post-processing will cache the result on auth_vnode) */
+                _debug("no valid permit");
+                set_bit(AFS_VNODE_CB_BROKEN, &vnode->flags);
+                ret = afs_vnode_fetch_status(vnode, auth_vnode, key);
+                if (ret < 0) {
+                        iput(&auth_vnode->vfs_inode);
+                        *_access = 0;
+                        _leave(" = %d", ret);
+                        return ret;
+                }
+                /* only a fresh fetch supplies the answer from the vnode's
+                 * status; a cached or anonymous answer found above must not
+                 * be overwritten */
+                *_access = vnode->status.caller_access;
+        }
+        iput(&auth_vnode->vfs_inode);
+        _leave(" = 0 [access %x]", *_access);
+        return 0;
+}
+/*
+ * check the permissions on an AFS file
+ * - AFS ACLs are attached to directories only, and a file is controlled by its
+ *   parent directory's ACL
+ * - a key is obtained with afs_request_key() and released again on every
+ *   path out of the function
+ * - if the vnode has no callback promise, its status is refetched first
+ * - the access mask comes from afs_check_permit()
+ * - for a directory exactly one of MAY_EXEC, MAY_READ and MAY_WRITE is
+ *   tested, in that order of precedence (an else-if chain), and BUG() is hit
+ *   if none of them is set in the mask
+ * - for anything else AFS_ACE_LOOKUP is always required; then AFS_ACE_READ is
+ *   required if MAY_EXEC or MAY_READ is set, otherwise AFS_ACE_WRITE is
+ *   required if MAY_WRITE is set
+ *   NOTE(review): a mask with both MAY_READ and MAY_WRITE set only has the
+ *   read right checked here - confirm that this is intended
+ * - if the ACL check passes, the result of generic_permission() is returned;
+ *   otherwise -EACCES or the error from the failing step
+ */
+int afs_permission(struct inode *inode, int mask, struct nameidata *nd)
+{
+        struct afs_vnode *vnode = AFS_FS_I(inode);
+        afs_access_t access;
+        struct key *key;
+        int ret;
+        _enter("{{%x:%x},%lx},%x,",
+               vnode->fid.vid, vnode->fid.vnode, vnode->flags, mask);
+        key = afs_request_key(vnode->volume->cell);
+        if (IS_ERR(key)) {
+                _leave(" = %ld [key]", PTR_ERR(key));
+                return PTR_ERR(key);
+        }
+        /* if the promise has expired, we need to check the server again */
+        if (!vnode->cb_promised) {
+                _debug("not promised");
+                ret = afs_vnode_fetch_status(vnode, NULL, key);
+                if (ret < 0)
+                        goto error;
+                _debug("new promise [fl=%lx]", vnode->flags);
+        }
+        /* check the permits to see if we've got one yet */
+        ret = afs_check_permit(vnode, key, &access);
+        if (ret < 0)
+                goto error;
+        /* interpret the access mask */
+        _debug("REQ %x ACC %x on %s",
+               mask, access, S_ISDIR(inode->i_mode) ? "dir" : "file");
+        if (S_ISDIR(inode->i_mode)) {
+                if (mask & MAY_EXEC) {
+                        if (!(access & AFS_ACE_LOOKUP))
+                                goto permission_denied;
+                } else if (mask & MAY_READ) {
+                        if (!(access & AFS_ACE_READ))
+                                goto permission_denied;
+                } else if (mask & MAY_WRITE) {
+                        if (!(access & (AFS_ACE_DELETE | /* rmdir, unlink, rename from */
+                                        AFS_ACE_INSERT | /* create, mkdir, symlink, rename to */
+                                        AFS_ACE_WRITE))) /* chmod */
+                                goto permission_denied;
+                } else {
+                        BUG();
+                }
+        } else {
+                if (!(access & AFS_ACE_LOOKUP))
+                        goto permission_denied;
+                if (mask & (MAY_EXEC | MAY_READ)) {
+                        if (!(access & AFS_ACE_READ))
+                                goto permission_denied;
+                } else if (mask & MAY_WRITE) {
+                        if (!(access & AFS_ACE_WRITE))
+                                goto permission_denied;
+                }
+        }
+        key_put(key);
+        ret = generic_permission(inode, mask, NULL);
+        _leave(" = %d", ret);
+        return ret;
+permission_denied:
+        ret = -EACCES;
+error:
+        key_put(key);
+        _leave(" = %d", ret);
+        return ret;
+}
diff --git a/fs/afs/server.c b/fs/afs/server.c
index 44aff81dc6a7..96bb23b476a2 100644
--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -1,6 +1,6 @@
-/* server.c: AFS server record management
+/* AFS server record management
 *
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 *
 * This program is free software; you can redistribute it and/or
@@ -11,489 +11,314 @@
 #include <linux/sched.h>
 #include <linux/slab.h>
-#include <rxrpc/peer.h>
-#include <rxrpc/connection.h>
-#include "volume.h"
-#include "cell.h"
-#include "server.h"
-#include "transport.h"
-#include "vlclient.h"
-#include "kafstimod.h"
 #include "internal.h"
-DEFINE_SPINLOCK(afs_server_peer_lock);
+unsigned afs_server_timeout = 10;       /* server timeout in seconds */
-#define FS_SERVICE_ID           1       /* AFS Volume Location Service ID */
+static void afs_reap_server(struct work_struct *);
-#define VL_SERVICE_ID           52      /* AFS Volume Location Service ID */
-static void __afs_server_timeout(struct afs_timer *timer)
+/* tree of all the servers, indexed by IP address */
+static struct rb_root afs_servers = RB_ROOT;
+static DEFINE_RWLOCK(afs_servers_lock);
+/* LRU list of all the servers not currently in use */
+static LIST_HEAD(afs_server_graveyard);
+static DEFINE_SPINLOCK(afs_server_graveyard_lock);
+static DECLARE_DELAYED_WORK(afs_server_reaper, afs_reap_server);
+/*
+ * install a server record in the master tree
+ */
+static int afs_install_server(struct afs_server *server)
 {
-        struct afs_server *server =
+        struct afs_server *xserver;
-                list_entry(timer, struct afs_server, timeout);
+        struct rb_node **pp, *p;
+        int ret;
-        _debug("SERVER TIMEOUT [%p{u=%d}]",
+        _enter("%p", server);
-               server, atomic_read(&server->usage));
-        afs_server_do_timeout(server);
+        write_lock(&afs_servers_lock);
-}
+        ret = -EEXIST;
+        pp = &afs_servers.rb_node;
+        p = NULL;
+        while (*pp) {
+                p = *pp;
+                _debug("- consider %p", p);
+                xserver = rb_entry(p, struct afs_server, master_rb);
+                if (server->addr.s_addr < xserver->addr.s_addr)
+                        pp = &(*pp)->rb_left;
+                else if (server->addr.s_addr > xserver->addr.s_addr)
+                        pp = &(*pp)->rb_right;
+                else
+                        goto error;
+        }
-static const struct afs_timer_ops afs_server_timer_ops = {
+        rb_link_node(&server->master_rb, p, pp);
-        .timed_out      = __afs_server_timeout,
+        rb_insert_color(&server->master_rb, &afs_servers);
-};
+        ret = 0;
+error:
+        write_unlock(&afs_servers_lock);
+        return ret;
+}
-/*****************************************************************************/
 /*
- * lookup a server record in a cell
+ * allocate a new server record
- * - TODO: search the cell's server list
 */
-int afs_server_lookup(struct afs_cell *cell, const struct in_addr *addr,
+static struct afs_server *afs_alloc_server(struct afs_cell *cell,
-                      struct afs_server **_server)
+                                           const struct in_addr *addr)
 {
-        struct afs_server *server, *active, *zombie;
+        struct afs_server *server;
-        int loop;
-        _enter("%p,%08x,", cell, ntohl(addr->s_addr));
+        _enter("");
-        /* allocate and initialise a server record */
        server = kzalloc(sizeof(struct afs_server), GFP_KERNEL);
-        if (!server) {
+        if (server) {
-                _leave(" = -ENOMEM");
+                atomic_set(&server->usage, 1);
-                return -ENOMEM;
+                server->cell = cell;
+                INIT_LIST_HEAD(&server->link);
+                INIT_LIST_HEAD(&server->grave);
+                init_rwsem(&server->sem);
+                spin_lock_init(&server->fs_lock);
+                server->fs_vnodes = RB_ROOT;
+                server->cb_promises = RB_ROOT;
+                spin_lock_init(&server->cb_lock);
+                init_waitqueue_head(&server->cb_break_waitq);
+                INIT_DELAYED_WORK(&server->cb_break_work,
+                                  afs_dispatch_give_up_callbacks);
+                memcpy(&server->addr, addr, sizeof(struct in_addr));
+                server->addr.s_addr = addr->s_addr;
        }
-        atomic_set(&server->usage, 1);
+        _leave(" = %p{%d}", server, atomic_read(&server->usage));
+        return server;
-        INIT_LIST_HEAD(&server->link);
+}
-        init_rwsem(&server->sem);
-        INIT_LIST_HEAD(&server->fs_callq);
-        spin_lock_init(&server->fs_lock);
-        INIT_LIST_HEAD(&server->cb_promises);
-        spin_lock_init(&server->cb_lock);
-        for (loop = 0; loop < AFS_SERVER_CONN_LIST_SIZE; loop++)
-                server->fs_conn_cnt[loop] = 4;
-        memcpy(&server->addr, addr, sizeof(struct in_addr));
+/*
-        server->addr.s_addr = addr->s_addr;
+ * get an FS-server record for a cell
+ */
+struct afs_server *afs_lookup_server(struct afs_cell *cell,
+                                     const struct in_addr *addr)
+{
+        struct afs_server *server, *candidate;
-        afs_timer_init(&server->timeout, &afs_server_timer_ops);
+        _enter("%p,"NIPQUAD_FMT, cell, NIPQUAD(addr->s_addr));
-        /* add to the cell */
+        /* quick scan of the list to see if we already have the server */
-        write_lock(&cell->sv_lock);
+        read_lock(&cell->servers_lock);
-        /* check the active list */
+        list_for_each_entry(server, &cell->servers, link) {
-        list_for_each_entry(active, &cell->sv_list, link) {
+                if (server->addr.s_addr == addr->s_addr)
-                if (active->addr.s_addr == addr->s_addr)
+                        goto found_server_quickly;
-                        goto use_active_server;
        }
+        read_unlock(&cell->servers_lock);
-        /* check the inactive list */
+        candidate = afs_alloc_server(cell, addr);
-        spin_lock(&cell->sv_gylock);
+        if (!candidate) {
-        list_for_each_entry(zombie, &cell->sv_graveyard, link) {
+                _leave(" = -ENOMEM");
-                if (zombie->addr.s_addr == addr->s_addr)
+                return ERR_PTR(-ENOMEM);
-                        goto resurrect_server;
        }
-        spin_unlock(&cell->sv_gylock);
-        afs_get_cell(cell);
+        write_lock(&cell->servers_lock);
-        server->cell = cell;
-        list_add_tail(&server->link, &cell->sv_list);
-        write_unlock(&cell->sv_lock);
+        /* check the cell's server list again */
+        list_for_each_entry(server, &cell->servers, link) {
+                if (server->addr.s_addr == addr->s_addr)
+                        goto found_server;
+        }
-        *_server = server;
+        _debug("new");
-        _leave(" = 0 (%p)", server);
+        server = candidate;
-        return 0;
+        if (afs_install_server(server) < 0)
+                goto server_in_two_cells;
-        /* found a matching active server */
+        afs_get_cell(cell);
- use_active_server:
+        list_add_tail(&server->link, &cell->servers);
-        _debug("active server");
-        afs_get_server(active);
+        write_unlock(&cell->servers_lock);
-        write_unlock(&cell->sv_lock);
+        _leave(" = %p{%d}", server, atomic_read(&server->usage));
+        return server;
+        /* found a matching server quickly */
+found_server_quickly:
+        _debug("found quickly");
+        afs_get_server(server);
+        read_unlock(&cell->servers_lock);
+no_longer_unused:
+        if (!list_empty(&server->grave)) {
+                spin_lock(&afs_server_graveyard_lock);
+                list_del_init(&server->grave);
+                spin_unlock(&afs_server_graveyard_lock);
+        }
+        _leave(" = %p{%d}", server, atomic_read(&server->usage));
+        return server;
+        /* found a matching server on the second pass */
+found_server:
+        _debug("found");
+        afs_get_server(server);
+        write_unlock(&cell->servers_lock);
+        kfree(candidate);
+        goto no_longer_unused;
+        /* found a server that seems to be in two cells */
+server_in_two_cells:
+        write_unlock(&cell->servers_lock);
+        kfree(candidate);
+        printk(KERN_NOTICE "kAFS:"
+               " Server "NIPQUAD_FMT" appears to be in two cells\n",
+               NIPQUAD(*addr));
+        _leave(" = -EEXIST");
+        return ERR_PTR(-EEXIST);
+}
-        kfree(server);
+/*
+ * look up a server by its IP address
+ */
+struct afs_server *afs_find_server(const struct in_addr *_addr)
+{
+        struct afs_server *server = NULL;
+        struct rb_node *p;
+        struct in_addr addr = *_addr;
-        *_server = active;
+        _enter(NIPQUAD_FMT, NIPQUAD(addr.s_addr));
-        _leave(" = 0 (%p)", active);
-        return 0;
-        /* found a matching server in the graveyard, so resurrect it and
+        read_lock(&afs_servers_lock);
-         * dispose of the new record */
- resurrect_server:
-        _debug("resurrecting server");
-        list_move_tail(&zombie->link, &cell->sv_list);
+        p = afs_servers.rb_node;
-        afs_get_server(zombie);
+        while (p) {
-        afs_kafstimod_del_timer(&zombie->timeout);
+                server = rb_entry(p, struct afs_server, master_rb);
-        spin_unlock(&cell->sv_gylock);
-        write_unlock(&cell->sv_lock);
-        kfree(server);
+                _debug("- consider %p", p);
-        *_server = zombie;
+                if (addr.s_addr < server->addr.s_addr) {
-        _leave(" = 0 (%p)", zombie);
+                        p = p->rb_left;
-        return 0;
+                } else if (addr.s_addr > server->addr.s_addr) {
+                        p = p->rb_right;
+                } else {
+                        afs_get_server(server);
+                        goto found;
+                }
+        }
-} /* end afs_server_lookup() */
+        server = NULL;
+found:
+        read_unlock(&afs_servers_lock);
+        ASSERTIFCMP(server, server->addr.s_addr, ==, addr.s_addr);
+        _leave(" = %p", server);
+        return server;
+}
-/*****************************************************************************/
 /*
 * destroy a server record
 * - removes from the cell list
 */
 void afs_put_server(struct afs_server *server)
 {
-        struct afs_cell *cell;
        if (!server)
                return;
-        _enter("%p", server);
+        _enter("%p{%d}", server, atomic_read(&server->usage));
-        cell = server->cell;
-        /* sanity check */
+        _debug("PUT SERVER %d", atomic_read(&server->usage));
-        BUG_ON(atomic_read(&server->usage) <= 0);
-        /* to prevent a race, the decrement and the dequeue must be effectively
+        ASSERTCMP(atomic_read(&server->usage), >, 0);
-         * atomic */
-        write_lock(&cell->sv_lock);
        if (likely(!atomic_dec_and_test(&server->usage))) {
-                write_unlock(&cell->sv_lock);
                _leave("");
                return;
        }
-        spin_lock(&cell->sv_gylock);
+        afs_flush_callback_breaks(server);
-        list_move_tail(&server->link, &cell->sv_graveyard);
-        /* time out in 10 secs */
+        spin_lock(&afs_server_graveyard_lock);
-        afs_kafstimod_add_timer(&server->timeout, 10 * HZ);
+        if (atomic_read(&server->usage) == 0) {
+                list_move_tail(&server->grave, &afs_server_graveyard);
-        spin_unlock(&cell->sv_gylock);
+                server->time_of_death = get_seconds();
-        write_unlock(&cell->sv_lock);
+                schedule_delayed_work(&afs_server_reaper,
+                                      afs_server_timeout * HZ);
-        _leave(" [killed]");
+        }
-} /* end afs_put_server() */
+        spin_unlock(&afs_server_graveyard_lock);
+        _leave(" [dead]");
+}
-/*****************************************************************************/
 /*
- * timeout server record
+ * destroy a dead server
- * - removes from the cell's graveyard if the usage count is zero
 */
-void afs_server_do_timeout(struct afs_server *server)
+static void afs_destroy_server(struct afs_server *server)
 {
-        struct rxrpc_peer *peer;
-        struct afs_cell *cell;
-        int loop;
        _enter("%p", server);
-        cell = server->cell;
+        ASSERTCMP(server->fs_vnodes.rb_node, ==, NULL);
+        ASSERTCMP(server->cb_promises.rb_node, ==, NULL);
-        BUG_ON(atomic_read(&server->usage) < 0);
+        ASSERTCMP(server->cb_break_head, ==, server->cb_break_tail);
+        ASSERTCMP(atomic_read(&server->cb_break_n), ==, 0);
-        /* remove from graveyard if still dead */
-        spin_lock(&cell->vl_gylock);
-        if (atomic_read(&server->usage) == 0)
-                list_del_init(&server->link);
-        else
-                server = NULL;
-        spin_unlock(&cell->vl_gylock);
-        if (!server) {
-                _leave("");
-                return; /* resurrected */
-        }
-        /* we can now destroy it properly */
-        afs_put_cell(cell);
-        /* uncross-point the structs under a global lock */
-        spin_lock(&afs_server_peer_lock);
-        peer = server->peer;
-        if (peer) {
-                server->peer = NULL;
-                peer->user = NULL;
-        }
-        spin_unlock(&afs_server_peer_lock);
-        /* finish cleaning up the server */
-        for (loop = AFS_SERVER_CONN_LIST_SIZE - 1; loop >= 0; loop--)
-                if (server->fs_conn[loop])
-                        rxrpc_put_connection(server->fs_conn[loop]);
-        if (server->vlserver)
-                rxrpc_put_connection(server->vlserver);
+        afs_put_cell(server->cell);
        kfree(server);
+}
-        _leave(" [destroyed]");
-} /* end afs_server_do_timeout() */
-/*****************************************************************************/
 /*
- * get a callslot on a connection to the fileserver on the specified server
+ * reap dead server records
 */
-int afs_server_request_callslot(struct afs_server *server,
+static void afs_reap_server(struct work_struct *work)
-                                struct afs_server_callslot *callslot)
 {
-        struct afs_server_callslot *pcallslot;
+        LIST_HEAD(corpses);
-        struct rxrpc_connection *conn;
+        struct afs_server *server;
-        int nconn, ret;
+        unsigned long delay, expiry;
+        time_t now;
-        _enter("%p,",server);
+        now = get_seconds();
-        INIT_LIST_HEAD(&callslot->link);
+        spin_lock(&afs_server_graveyard_lock);
-        callslot->task = current;
-        callslot->conn = NULL;
+        while (!list_empty(&afs_server_graveyard)) {
-        callslot->nconn = -1;
+                server = list_entry(afs_server_graveyard.next,
-        callslot->ready = 0;
+                                    struct afs_server, grave);
-        ret = 0;
+                /* the queue is ordered most dead first */
-        conn = NULL;
+                expiry = server->time_of_death + afs_server_timeout;
+                if (expiry > now) {
-        /* get hold of a callslot first */
+                        delay = (expiry - now) * HZ;
-        spin_lock(&server->fs_lock);
+                        if (!schedule_delayed_work(&afs_server_reaper, delay)) {
+                                cancel_delayed_work(&afs_server_reaper);
-        /* resurrect the server if it's death timeout has expired */
+                                schedule_delayed_work(&afs_server_reaper,
-        if (server->fs_state) {
+                                                      delay);
-                if (time_before(jiffies, server->fs_dead_jif)) {
+                        }
-                        ret = server->fs_state;
+                        break;
-                        spin_unlock(&server->fs_lock);
-                        _leave(" = %d [still dead]", ret);
-                        return ret;
                }
-                server->fs_state = 0;
+                write_lock(&server->cell->servers_lock);
-        }
+                write_lock(&afs_servers_lock);
+                if (atomic_read(&server->usage) > 0) {
-        /* try and find a connection that has spare callslots */
+                        list_del_init(&server->grave);
-        for (nconn = 0; nconn < AFS_SERVER_CONN_LIST_SIZE; nconn++) {
+                } else {
-                if (server->fs_conn_cnt[nconn] > 0) {
+                        list_move_tail(&server->grave, &corpses);
-                        server->fs_conn_cnt[nconn]--;
+                        list_del_init(&server->link);
-                        spin_unlock(&server->fs_lock);
+                        rb_erase(&server->master_rb, &afs_servers);
-                        callslot->nconn = nconn;
-                        goto obtained_slot;
                }
+                write_unlock(&afs_servers_lock);
+                write_unlock(&server->cell->servers_lock);
        }
-        /* none were available - wait interruptibly for one to become
+        spin_unlock(&afs_server_graveyard_lock);
-         * available */
-        set_current_state(TASK_INTERRUPTIBLE);
-        list_add_tail(&callslot->link, &server->fs_callq);
-        spin_unlock(&server->fs_lock);
-        while (!callslot->ready && !signal_pending(current)) {
-                schedule();
-                set_current_state(TASK_INTERRUPTIBLE);
-        }
-        set_current_state(TASK_RUNNING);
-        /* even if we were interrupted we may still be queued */
-        if (!callslot->ready) {
-                spin_lock(&server->fs_lock);
-                list_del_init(&callslot->link);
-                spin_unlock(&server->fs_lock);
-        }
-        nconn = callslot->nconn;
-        /* if interrupted, we must release any slot we also got before
+        /* now reap the corpses we've extracted */
-         * returning an error */
+        while (!list_empty(&corpses)) {
-        if (signal_pending(current)) {
+                server = list_entry(corpses.next, struct afs_server, grave);
-                ret = -EINTR;
+                list_del(&server->grave);
-                goto error_release;
+                afs_destroy_server(server);
        }
+}
-        /* if we were woken up with an error, then pass that error back to the
-         * called */
-        if (nconn < 0) {
-                _leave(" = %d", callslot->errno);
-                return callslot->errno;
-        }
-        /* were we given a connection directly? */
-        if (callslot->conn) {
-                /* yes - use it */
-                _leave(" = 0 (nc=%d)", nconn);
-                return 0;
-        }
-        /* got a callslot, but no connection */
- obtained_slot:
-        /* need to get hold of the RxRPC connection */
-        down_write(&server->sem);
-        /* quick check to see if there's an outstanding error */
-        ret = server->fs_state;
-        if (ret)
-                goto error_release_upw;
-        if (server->fs_conn[nconn]) {
-                /* reuse an existing connection */
-                rxrpc_get_connection(server->fs_conn[nconn]);
-                callslot->conn = server->fs_conn[nconn];
-        }
-        else {
-                /* create a new connection */
-                ret = rxrpc_create_connection(afs_transport,
-                                              htons(7000),
-                                              server->addr.s_addr,
-                                              FS_SERVICE_ID,
-                                              NULL,
-                                              &server->fs_conn[nconn]);
-                if (ret < 0)
-                        goto error_release_upw;
-                callslot->conn = server->fs_conn[0];
-                rxrpc_get_connection(callslot->conn);
-        }
-        up_write(&server->sem);
-        _leave(" = 0");
-        return 0;
-        /* handle an error occurring */
- error_release_upw:
-        up_write(&server->sem);
- error_release:
-        /* either release the callslot or pass it along to another deserving
-         * task */
-        spin_lock(&server->fs_lock);
-        if (nconn < 0) {
-                /* no callslot allocated */
-        }
-        else if (list_empty(&server->fs_callq)) {
-                /* no one waiting */
-                server->fs_conn_cnt[nconn]++;
-                spin_unlock(&server->fs_lock);
-        }
-        else {
-                /* someone's waiting - dequeue them and wake them up */
-                pcallslot = list_entry(server->fs_callq.next,
-                                       struct afs_server_callslot, link);
-                list_del_init(&pcallslot->link);
-                pcallslot->errno = server->fs_state;
-                if (!pcallslot->errno) {
-                        /* pass them out callslot details */
-                        callslot->conn = xchg(&pcallslot->conn,
-                                              callslot->conn);
-                        pcallslot->nconn = nconn;
-                        callslot->nconn = nconn = -1;
-                }
-                pcallslot->ready = 1;
-                wake_up_process(pcallslot->task);
-                spin_unlock(&server->fs_lock);
-        }
-        rxrpc_put_connection(callslot->conn);
-        callslot->conn = NULL;
-        _leave(" = %d", ret);
-        return ret;
-} /* end afs_server_request_callslot() */
-/*****************************************************************************/
-/*
- * release a callslot back to the server
- * - transfers the RxRPC connection to the next pending callslot if possible
- */
-void afs_server_release_callslot(struct afs_server *server,
-                                 struct afs_server_callslot *callslot)
-{
-        struct afs_server_callslot *pcallslot;
-        _enter("{ad=%08x,cnt=%u},{%d}",
-               ntohl(server->addr.s_addr),
-               server->fs_conn_cnt[callslot->nconn],
-               callslot->nconn);
-        BUG_ON(callslot->nconn < 0);
-        spin_lock(&server->fs_lock);
-        if (list_empty(&server->fs_callq)) {
-                /* no one waiting */
-                server->fs_conn_cnt[callslot->nconn]++;
-                spin_unlock(&server->fs_lock);
-        }
-        else {
-                /* someone's waiting - dequeue them and wake them up */
-                pcallslot = list_entry(server->fs_callq.next,
-                                       struct afs_server_callslot, link);
-                list_del_init(&pcallslot->link);
-                pcallslot->errno = server->fs_state;
-                if (!pcallslot->errno) {
-                        /* pass them out callslot details */
-                        callslot->conn = xchg(&pcallslot->conn, callslot->conn);
-                        pcallslot->nconn = callslot->nconn;
-                        callslot->nconn = -1;
-                }
-                pcallslot->ready = 1;
-                wake_up_process(pcallslot->task);
-                spin_unlock(&server->fs_lock);
-        }
-        rxrpc_put_connection(callslot->conn);
-        _leave("");
-} /* end afs_server_release_callslot() */
-/*****************************************************************************/
 /*
- * get a handle to a connection to the vlserver (volume location) on the
+ * discard all the server records for rmmod
- * specified server
 */
-int afs_server_get_vlconn(struct afs_server *server,
+void __exit afs_purge_servers(void)
-                          struct rxrpc_connection **_conn)
 {
-        struct rxrpc_connection *conn;
+        afs_server_timeout = 0;
-        int ret;
+        cancel_delayed_work(&afs_server_reaper);
+        schedule_delayed_work(&afs_server_reaper, 0);
-        _enter("%p,", server);
+}
-        ret = 0;
-        conn = NULL;
-        down_read(&server->sem);
-        if (server->vlserver) {
-                /* reuse an existing connection */
-                rxrpc_get_connection(server->vlserver);
-                conn = server->vlserver;
-                up_read(&server->sem);
-        }
-        else {
-                /* create a new connection */
-                up_read(&server->sem);
-                down_write(&server->sem);
-                if (!server->vlserver) {
-                        ret = rxrpc_create_connection(afs_transport,
-                                                      htons(7003),
-                                                      server->addr.s_addr,
-                                                      VL_SERVICE_ID,
-                                                      NULL,
-                                                      &server->vlserver);
-                }
-                if (ret == 0) {
-                        rxrpc_get_connection(server->vlserver);
-                        conn = server->vlserver;
-                }
-                up_write(&server->sem);
-        }
-        *_conn = conn;
-        _leave(" = %d", ret);
-        return ret;
-} /* end afs_server_get_vlconn() */
diff --git a/fs/afs/server.h b/fs/afs/server.h
deleted file mode 100644
index c3d24115578f..000000000000
--- a/fs/afs/server.h
+++ /dev/null
@@ -1,102 +0,0 @@
-/* server.h: AFS server record
- *
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-#ifndef _LINUX_AFS_SERVER_H
-#define _LINUX_AFS_SERVER_H
-#include "types.h"
-#include "kafstimod.h"
-#include <rxrpc/peer.h>
-#include <linux/rwsem.h>
-extern spinlock_t afs_server_peer_lock;
-/*****************************************************************************/
-/*
- * AFS server record
- */
-struct afs_server
-{
-        atomic_t                usage;
-        struct afs_cell         *cell;          /* cell in which server resides */
-        struct list_head        link;           /* link in cell's server list */
-        struct rw_semaphore     sem;            /* access lock */
-        struct afs_timer        timeout;        /* graveyard timeout */
-        struct in_addr          addr;           /* server address */
-        struct rxrpc_peer       *peer;          /* peer record for this server */
-        struct rxrpc_connection *vlserver;      /* connection to the volume location service */
-        /* file service access */
-#define AFS_SERVER_CONN_LIST_SIZE 2
-        struct rxrpc_connection *fs_conn[AFS_SERVER_CONN_LIST_SIZE]; /* FS connections */
-        unsigned                fs_conn_cnt[AFS_SERVER_CONN_LIST_SIZE]; /* per conn call count */
-        struct list_head        fs_callq;       /* queue of processes waiting to make a call */
-        spinlock_t              fs_lock;        /* access lock */
-        int                     fs_state;       /* 0 or reason FS currently marked dead (-errno) */
-        unsigned                fs_rtt;         /* FS round trip time */
-        unsigned long           fs_act_jif;     /* time at which last activity occurred */
-        unsigned long           fs_dead_jif;    /* time at which no longer to be considered dead */
-        /* callback promise management */
-        struct list_head        cb_promises;    /* as yet unbroken promises from this server */
-        spinlock_t              cb_lock;        /* access lock */
-};
-extern int afs_server_lookup(struct afs_cell *cell,
-                             const struct in_addr *addr,
-                             struct afs_server **_server);
-#define afs_get_server(S) do { atomic_inc(&(S)->usage); } while(0)
-extern void afs_put_server(struct afs_server *server);
-extern void afs_server_do_timeout(struct afs_server *server);
-extern int afs_server_find_by_peer(const struct rxrpc_peer *peer,
-                                   struct afs_server **_server);
-extern int afs_server_get_vlconn(struct afs_server *server,
-                                 struct rxrpc_connection **_conn);
-static inline
-struct afs_server *afs_server_get_from_peer(struct rxrpc_peer *peer)
-{
-        struct afs_server *server;
-        spin_lock(&afs_server_peer_lock);
-        server = peer->user;
-        if (server)
-                afs_get_server(server);
-        spin_unlock(&afs_server_peer_lock);
-        return server;
-}
-/*****************************************************************************/
-/*
- * AFS server callslot grant record
- */
-struct afs_server_callslot
-{
-        struct list_head        link;           /* link in server's list */
-        struct task_struct      *task;          /* process waiting to make call */
-        struct rxrpc_connection *conn;          /* connection to use (or NULL on error) */
-        short                   nconn;          /* connection slot number (-1 on error) */
-        char                    ready;          /* T when ready */
-        int                     errno;          /* error number if nconn==-1 */
-};
-extern int afs_server_request_callslot(struct afs_server *server,
-                                       struct afs_server_callslot *callslot);
-extern void afs_server_release_callslot(struct afs_server *server,
-                                        struct afs_server_callslot *callslot);
-#endif /* _LINUX_AFS_SERVER_H */
diff --git a/fs/afs/super.c b/fs/afs/super.c
index eb7e32349da3..cebd03c91f57 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -1,5 +1,6 @@
-/*
+/* AFS superblock handling
- * Copyright (c) 2002 Red Hat, Inc. All rights reserved.
+ *
+ * Copyright (c) 2002, 2007 Red Hat, Inc. All rights reserved.
 *
 * This software may be freely redistributed under the terms of the
 * GNU General Public License.
@@ -9,7 +10,7 @@
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 * Authors: David Howells <dhowells@redhat.com>
- *          David Woodhouse <dwmw2@cambridge.redhat.com>
+ *          David Woodhouse <dwmw2@redhat.com>
 *
 */
@@ -19,22 +20,10 @@
 #include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
-#include "vnode.h"
-#include "volume.h"
-#include "cell.h"
-#include "cmservice.h"
-#include "fsclient.h"
-#include "super.h"
 #include "internal.h"
 #define AFS_FS_MAGIC 0x6B414653 /* 'kAFS' */
-struct afs_mount_params {
-        int                     rwpath;
-        struct afs_cell         *default_cell;
-        struct afs_volume       *volume;
-};
 static void afs_i_init_once(void *foo, struct kmem_cache *cachep,
                            unsigned long flags);
@@ -62,13 +51,13 @@ static const struct super_operations afs_super_ops = {
        .drop_inode     = generic_delete_inode,
        .destroy_inode  = afs_destroy_inode,
        .clear_inode    = afs_clear_inode,
+        .umount_begin   = afs_umount_begin,
        .put_super      = afs_put_super,
 };
 static struct kmem_cache *afs_inode_cachep;
 static atomic_t afs_count_active_inodes;
-/*****************************************************************************/
 /*
 * initialise the filesystem
 */
@@ -78,8 +67,6 @@ int __init afs_fs_init(void)
        _enter("");
-        afs_timer_init(&afs_mntpt_expiry_timer, &afs_mntpt_expiry_timer_ops);
        /* create ourselves an inode cache */
        atomic_set(&afs_count_active_inodes, 0);
@@ -99,20 +86,22 @@ int __init afs_fs_init(void)
        ret = register_filesystem(&afs_fs_type);
        if (ret < 0) {
                kmem_cache_destroy(afs_inode_cachep);
-                kleave(" = %d", ret);
+                _leave(" = %d", ret);
                return ret;
        }
-        kleave(" = 0");
+        _leave(" = 0");
        return 0;
-} /* end afs_fs_init() */
+}
-/*****************************************************************************/
 /*
 * clean up the filesystem
 */
 void __exit afs_fs_exit(void)
 {
+        _enter("");
+        afs_mntpt_kill_timer();
        unregister_filesystem(&afs_fs_type);
        if (atomic_read(&afs_count_active_inodes) != 0) {
@@ -122,10 +111,9 @@ void __exit afs_fs_exit(void)
        }
        kmem_cache_destroy(afs_inode_cachep);
+        _leave("");
+}
-} /* end afs_fs_exit() */
-/*****************************************************************************/
 /*
 * check that an argument has a value
 */
@@ -136,9 +124,8 @@ static int want_arg(char **_value, const char *option)
                return 0;
        }
        return 1;
-} /* end want_arg() */
+}
-/*****************************************************************************/
 /*
 * check that there's no subsequent value
 */
@@ -150,18 +137,17 @@ static int want_no_value(char *const *_value, const char *option)
                return 0;
        }
        return 1;
-} /* end want_no_value() */
+}
-/*****************************************************************************/
 /*
 * parse the mount options
 * - this function has been shamelessly adapted from the ext3 fs which
 *   shamelessly adapted it from the msdos fs
 */
-static int afs_super_parse_options(struct afs_mount_params *params,
+static int afs_parse_options(struct afs_mount_params *params,
-                                   char *options,
+                             char *options, const char **devname)
-                                   const char **devname)
 {
+        struct afs_cell *cell;
        char *key, *value;
        int ret;
@@ -170,51 +156,135 @@ static int afs_super_parse_options(struct afs_mount_params *params,
        options[PAGE_SIZE - 1] = 0;
        ret = 0;
-        while ((key = strsep(&options, ",")) != 0)
+        while ((key = strsep(&options, ","))) {
-        {
                value = strchr(key, '=');
                if (value)
                        *value++ = 0;
-                printk("kAFS: KEY: %s, VAL:%s\n", key, value ?: "-");
+                _debug("kAFS: KEY: %s, VAL:%s", key, value ?: "-");
                if (strcmp(key, "rwpath") == 0) {
                        if (!want_no_value(&value, "rwpath"))
                                return -EINVAL;
                        params->rwpath = 1;
-                        continue;
+                } else if (strcmp(key, "vol") == 0) {
-                }
-                else if (strcmp(key, "vol") == 0) {
                        if (!want_arg(&value, "vol"))
                                return -EINVAL;
                        *devname = value;
-                        continue;
+                } else if (strcmp(key, "cell") == 0) {
-                }
-                else if (strcmp(key, "cell") == 0) {
                        if (!want_arg(&value, "cell"))
                                return -EINVAL;
-                        afs_put_cell(params->default_cell);
+                        cell = afs_cell_lookup(value, strlen(value));
-                        ret = afs_cell_lookup(value,
+                        if (IS_ERR(cell))
-                                              strlen(value),
+                                return PTR_ERR(cell);
-                                              &params->default_cell);
+                        afs_put_cell(params->cell);
-                        if (ret < 0)
+                        params->cell = cell;
-                                return -EINVAL;
+                } else {
-                        continue;
+                        printk("kAFS: Unknown mount option: '%s'\n",  key);
+                        ret = -EINVAL;
+                        goto error;
                }
-                printk("kAFS: Unknown mount option: '%s'\n",  key);
-                ret = -EINVAL;
-                goto error;
        }
        ret = 0;
+error:
- error:
        _leave(" = %d", ret);
        return ret;
-} /* end afs_super_parse_options() */
+}
+/*
+ * parse a device name to get cell name, volume name, volume type and R/W
+ * selector
+ * - this can be one of the following:
+ *      "%[cell:]volume[.]"             R/W volume
+ *      "#[cell:]volume[.]"             R/O or R/W volume (rwpath=0),
+ *                                       or R/W (rwpath=1) volume
+ *      "%[cell:]volume.readonly"       R/O volume
+ *      "#[cell:]volume.readonly"       R/O volume
+ *      "%[cell:]volume.backup"         Backup volume
+ *      "#[cell:]volume.backup"         Backup volume
+ */
+static int afs_parse_device_name(struct afs_mount_params *params,
+                                 const char *name)
+{
+        struct afs_cell *cell;
+        const char *cellname, *suffix;
+        int cellnamesz;
+        _enter(",%s", name);
+        if (!name) {
+                printk(KERN_ERR "kAFS: no volume name specified\n");
+                return -EINVAL;
+        }
+        if ((name[0] != '%' && name[0] != '#') || !name[1]) {
+                printk(KERN_ERR "kAFS: unparsable volume name\n");
+                return -EINVAL;
+        }
+        /* determine the type of volume we're looking for */
+        params->type = AFSVL_ROVOL;
+        params->force = false;
+        if (params->rwpath || name[0] == '%') {
+                params->type = AFSVL_RWVOL;
+                params->force = true;
+        }
+        name++;
+        /* split the cell name out if there is one */
+        params->volname = strchr(name, ':');
+        if (params->volname) {
+                cellname = name;
+                cellnamesz = params->volname - name;
+                params->volname++;
+        } else {
+                params->volname = name;
+                cellname = NULL;
+                cellnamesz = 0;
+        }
+        /* the volume type is further affected by a possible suffix */
+        suffix = strrchr(params->volname, '.');
+        if (suffix) {
+                if (strcmp(suffix, ".readonly") == 0) {
+                        params->type = AFSVL_ROVOL;
+                        params->force = true;
+                } else if (strcmp(suffix, ".backup") == 0) {
+                        params->type = AFSVL_BACKVOL;
+                        params->force = true;
+                } else if (suffix[1] == 0) {
+                } else {
+                        suffix = NULL;
+                }
+        }
+        params->volnamesz = suffix ?
+                suffix - params->volname : strlen(params->volname);
+        _debug("cell %*.*s [%p]",
+               cellnamesz, cellnamesz, cellname ?: "", params->cell);
+        /* lookup the cell record */
+        if (cellname || !params->cell) {
+                cell = afs_cell_lookup(cellname, cellnamesz);
+                if (IS_ERR(cell)) {
+                        printk(KERN_ERR "kAFS: unable to lookup cell '%s'\n",
+                               cellname ?: "");
+                        return PTR_ERR(cell);
+                }
+                afs_put_cell(params->cell);
+                params->cell = cell;
+        }
+        _debug("CELL:%s [%p] VOLUME:%*.*s SUFFIX:%s TYPE:%d%s",
+               params->cell->name, params->cell,
+               params->volnamesz, params->volnamesz, params->volname,
+               suffix ?: "-", params->type, params->force ? " FORCE" : "");
+        return 0;
+}
-/*****************************************************************************/
 /*
 * check a superblock to see if it's the one we're looking for
 */
@@ -224,13 +294,12 @@ static int afs_test_super(struct super_block *sb, void *data)
        struct afs_super_info *as = sb->s_fs_info;
        return as->volume == params->volume;
-} /* end afs_test_super() */
+}
-/*****************************************************************************/
 /*
 * fill in the superblock
 */
-static int afs_fill_super(struct super_block *sb, void *data, int silent)
+static int afs_fill_super(struct super_block *sb, void *data)
 {
        struct afs_mount_params *params = data;
        struct afs_super_info *as = NULL;
@@ -239,7 +308,7 @@ static int afs_fill_super(struct super_block *sb, void *data, int silent)
        struct inode *inode = NULL;
        int ret;
-        kenter("");
+        _enter("");
        /* allocate a superblock info record */
        as = kzalloc(sizeof(struct afs_super_info), GFP_KERNEL);
@@ -262,9 +331,9 @@ static int afs_fill_super(struct super_block *sb, void *data, int silent)
        fid.vid         = as->volume->vid;
        fid.vnode       = 1;
        fid.unique      = 1;
-        ret = afs_iget(sb, &fid, &inode);
+        inode = afs_iget(sb, params->key, &fid, NULL, NULL);
-        if (ret < 0)
+        if (IS_ERR(inode))
-                goto error;
+                goto error_inode;
        ret = -ENOMEM;
        root = d_alloc_root(inode);
@@ -273,21 +342,23 @@ static int afs_fill_super(struct super_block *sb, void *data, int silent)
        sb->s_root = root;
-        kleave(" = 0");
+        _leave(" = 0");
        return 0;
- error:
+error_inode:
+        ret = PTR_ERR(inode);
+        inode = NULL;
+error:
        iput(inode);
        afs_put_volume(as->volume);
        kfree(as);
        sb->s_fs_info = NULL;
-        kleave(" = %d", ret);
+        _leave(" = %d", ret);
        return ret;
-} /* end afs_fill_super() */
+}
-/*****************************************************************************/
 /*
 * get an AFS superblock
 * - TODO: don't use get_sb_nodev(), but rather call sget() directly
@@ -300,69 +371,80 @@ static int afs_get_sb(struct file_system_type *fs_type,
 {
        struct afs_mount_params params;
        struct super_block *sb;
+        struct afs_volume *vol;
+        struct key *key;
        int ret;
        _enter(",,%s,%p", dev_name, options);
        memset(&params, 0, sizeof(params));
-        /* start the cache manager */
+        /* parse the options and device name */
-        ret = afscm_start();
-        if (ret < 0) {
-                _leave(" = %d", ret);
-                return ret;
-        }
-        /* parse the options */
        if (options) {
-                ret = afs_super_parse_options(&params, options, &dev_name);
+                ret = afs_parse_options(&params, options, &dev_name);
                if (ret < 0)
                        goto error;
-                if (!dev_name) {
-                        printk("kAFS: no volume name specified\n");
-                        ret = -EINVAL;
-                        goto error;
-                }
        }
-        /* parse the device name */
-        ret = afs_volume_lookup(dev_name,
+        ret = afs_parse_device_name(&params, dev_name);
-                                params.default_cell,
-                                params.rwpath,
-                                &params.volume);
        if (ret < 0)
                goto error;
-        /* allocate a deviceless superblock */
+        /* try and do the mount securely */
-        sb = sget(fs_type, afs_test_super, set_anon_super, &params);
+        key = afs_request_key(params.cell);
-        if (IS_ERR(sb))
+        if (IS_ERR(key)) {
+                _leave(" = %ld [key]", PTR_ERR(key));
+                ret = PTR_ERR(key);
                goto error;
+        }
+        params.key = key;
-        sb->s_flags = flags;
+        /* parse the device name */
+        vol = afs_volume_lookup(&params);
+        if (IS_ERR(vol)) {
+                ret = PTR_ERR(vol);
+                goto error;
+        }
+        params.volume = vol;
-        ret = afs_fill_super(sb, &params, flags & MS_SILENT ? 1 : 0);
+        /* allocate a deviceless superblock */
-        if (ret < 0) {
+        sb = sget(fs_type, afs_test_super, set_anon_super, &params);
-                up_write(&sb->s_umount);
+        if (IS_ERR(sb)) {
-                deactivate_super(sb);
+                ret = PTR_ERR(sb);
                goto error;
        }
-        sb->s_flags |= MS_ACTIVE;
-        simple_set_mnt(mnt, sb);
+        if (!sb->s_root) {
+                /* initial superblock/root creation */
+                _debug("create");
+                sb->s_flags = flags;
+                ret = afs_fill_super(sb, &params);
+                if (ret < 0) {
+                        up_write(&sb->s_umount);
+                        deactivate_super(sb);
+                        goto error;
+                }
+                sb->s_flags |= MS_ACTIVE;
+        } else {
+                _debug("reuse");
+                ASSERTCMP(sb->s_flags, &, MS_ACTIVE);
+        }
+        simple_set_mnt(mnt, sb);
        afs_put_volume(params.volume);
-        afs_put_cell(params.default_cell);
+        afs_put_cell(params.cell);
-        _leave(" = 0 [%p]", 0, sb);
+        _leave(" = 0 [%p]", sb);
        return 0;
- error:
+error:
        afs_put_volume(params.volume);
-        afs_put_cell(params.default_cell);
+        afs_put_cell(params.cell);
-        afscm_stop();
+        key_put(params.key);
        _leave(" = %d", ret);
        return ret;
-} /* end afs_get_sb() */
+}
-/*****************************************************************************/
 /*
 * finish the unmounting process on the superblock
 */
@@ -373,35 +455,30 @@ static void afs_put_super(struct super_block *sb)
        _enter("");
        afs_put_volume(as->volume);
-        afscm_stop();
        _leave("");
-} /* end afs_put_super() */
+}
-/*****************************************************************************/
 /*
 * initialise an inode cache slab element prior to any use
 */
 static void afs_i_init_once(void *_vnode, struct kmem_cache *cachep,
                            unsigned long flags)
 {
-        struct afs_vnode *vnode = (struct afs_vnode *) _vnode;
+        struct afs_vnode *vnode = _vnode;
        if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
            SLAB_CTOR_CONSTRUCTOR) {
                memset(vnode, 0, sizeof(*vnode));
                inode_init_once(&vnode->vfs_inode);
                init_waitqueue_head(&vnode->update_waitq);
+                mutex_init(&vnode->permits_lock);
+                mutex_init(&vnode->validate_lock);
                spin_lock_init(&vnode->lock);
-                INIT_LIST_HEAD(&vnode->cb_link);
+                INIT_WORK(&vnode->cb_broken_work, afs_broken_callback_work);
-                INIT_LIST_HEAD(&vnode->cb_hash_link);
-                afs_timer_init(&vnode->cb_timeout,
-                               &afs_vnode_cb_timed_out_ops);
        }
+}
-} /* end afs_i_init_once() */
-/*****************************************************************************/
 /*
 * allocate an AFS inode struct from our slab cache
 */
@@ -409,8 +486,7 @@ static struct inode *afs_alloc_inode(struct super_block *sb)
 {
        struct afs_vnode *vnode;
-        vnode = (struct afs_vnode *)
+        vnode = kmem_cache_alloc(afs_inode_cachep, GFP_KERNEL);
-                kmem_cache_alloc(afs_inode_cachep, GFP_KERNEL);
        if (!vnode)
                return NULL;
@@ -421,21 +497,25 @@ static struct inode *afs_alloc_inode(struct super_block *sb)
        vnode->volume           = NULL;
        vnode->update_cnt       = 0;
-        vnode->flags            = 0;
+        vnode->flags            = 1 << AFS_VNODE_UNSET;
+        vnode->cb_promised      = false;
        return &vnode->vfs_inode;
-} /* end afs_alloc_inode() */
+}
-/*****************************************************************************/
 /*
 * destroy an AFS inode struct
 */
 static void afs_destroy_inode(struct inode *inode)
 {
+        struct afs_vnode *vnode = AFS_FS_I(inode);
        _enter("{%lu}", inode->i_ino);
-        kmem_cache_free(afs_inode_cachep, AFS_FS_I(inode));
+        _debug("DESTROY INODE %p", inode);
-        atomic_dec(&afs_count_active_inodes);
+        ASSERTCMP(vnode->server, ==, NULL);
-} /* end afs_destroy_inode() */
+        kmem_cache_free(afs_inode_cachep, vnode);
+        atomic_dec(&afs_count_active_inodes);
+}
diff --git a/fs/afs/super.h b/fs/afs/super.h
deleted file mode 100644
index 32de8cc6fae8..000000000000
--- a/fs/afs/super.h
+++ /dev/null
@@ -1,45 +0,0 @@
-/* super.h: AFS filesystem internal private data
- *
- * Copyright (c) 2002 Red Hat, Inc. All rights reserved.
- *
- * This software may be freely redistributed under the terms of the
- * GNU General Public License.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- * Authors: David Woodhouse <dwmw2@cambridge.redhat.com>
- *          David Howells <dhowells@redhat.com>
- *
- */
-#ifndef _LINUX_AFS_SUPER_H
-#define _LINUX_AFS_SUPER_H
-#include <linux/fs.h>
-#include "server.h"
-#ifdef __KERNEL__
-/*****************************************************************************/
-/*
- * AFS superblock private data
- * - there's one superblock per volume
- */
-struct afs_super_info
-{
-        struct afs_volume       *volume;        /* volume record */
-        char                    rwparent;       /* T if parent is R/W AFS volume */
-};
-static inline struct afs_super_info *AFS_FS_S(struct super_block *sb)
-{
-        return sb->s_fs_info;
-}
-extern struct file_system_type afs_fs_type;
-#endif /* __KERNEL__ */
-#endif /* _LINUX_AFS_SUPER_H */
diff --git a/fs/afs/transport.h b/fs/afs/transport.h
deleted file mode 100644
index 7013ae6ccc8c..000000000000
--- a/fs/afs/transport.h
+++ /dev/null
@@ -1,21 +0,0 @@
-/* transport.h: AFS transport management
- *
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-#ifndef _LINUX_AFS_TRANSPORT_H
-#define _LINUX_AFS_TRANSPORT_H
-#include "types.h"
-#include <rxrpc/transport.h>
-/* the cache manager transport endpoint */
-extern struct rxrpc_transport *afs_transport;
-#endif /* _LINUX_AFS_TRANSPORT_H */
diff --git a/fs/afs/types.h b/fs/afs/types.h
deleted file mode 100644
index b1a2367c7587..000000000000
--- a/fs/afs/types.h
+++ /dev/null
@@ -1,125 +0,0 @@
-/* types.h: AFS types
- *
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-#ifndef _LINUX_AFS_TYPES_H
-#define _LINUX_AFS_TYPES_H
-#ifdef __KERNEL__
-#include <rxrpc/types.h>
-#endif /* __KERNEL__ */
-typedef unsigned                        afs_volid_t;
-typedef unsigned                        afs_vnodeid_t;
-typedef unsigned long long              afs_dataversion_t;
-typedef enum {
-        AFSVL_RWVOL,                    /* read/write volume */
-        AFSVL_ROVOL,                    /* read-only volume */
-        AFSVL_BACKVOL,                  /* backup volume */
-} __attribute__((packed)) afs_voltype_t;
-typedef enum {
-        AFS_FTYPE_INVALID       = 0,
-        AFS_FTYPE_FILE          = 1,
-        AFS_FTYPE_DIR           = 2,
-        AFS_FTYPE_SYMLINK       = 3,
-} afs_file_type_t;
-#ifdef __KERNEL__
-struct afs_cell;
-struct afs_vnode;
-/*****************************************************************************/
-/*
- * AFS file identifier
- */
-struct afs_fid
-{
-        afs_volid_t     vid;            /* volume ID */
-        afs_vnodeid_t   vnode;          /* file index within volume */
-        unsigned        unique;         /* unique ID number (file index version) */
-};
-/*****************************************************************************/
-/*
- * AFS callback notification
- */
-typedef enum {
-        AFSCM_CB_UNTYPED        = 0,    /* no type set on CB break */
-        AFSCM_CB_EXCLUSIVE      = 1,    /* CB exclusive to CM [not implemented] */
-        AFSCM_CB_SHARED         = 2,    /* CB shared by other CM's */
-        AFSCM_CB_DROPPED        = 3,    /* CB promise cancelled by file server */
-} afs_callback_type_t;
-struct afs_callback
-{
-        struct afs_server       *server;        /* server that made the promise */
-        struct afs_fid          fid;            /* file identifier */
-        unsigned                version;        /* callback version */
-        unsigned                expiry;         /* time at which expires */
-        afs_callback_type_t     type;           /* type of callback */
-};
-#define AFSCBMAX 50
-/*****************************************************************************/
-/*
- * AFS volume information
- */
-struct afs_volume_info
-{
-        afs_volid_t             vid;            /* volume ID */
-        afs_voltype_t           type;           /* type of this volume */
-        afs_volid_t             type_vids[5];   /* volume ID's for possible types for this vol */
-        
-        /* list of fileservers serving this volume */
-        size_t                  nservers;       /* number of entries used in servers[] */
-        struct {
-                struct in_addr  addr;           /* fileserver address */
-        } servers[8];
-};
-/*****************************************************************************/
-/*
- * AFS file status information
- */
-struct afs_file_status
-{
-        unsigned                if_version;     /* interface version */
-#define AFS_FSTATUS_VERSION     1
-        afs_file_type_t         type;           /* file type */
-        unsigned                nlink;          /* link count */
-        size_t                  size;           /* file size */
-        afs_dataversion_t       version;        /* current data version */
-        unsigned                author;         /* author ID */
-        unsigned                owner;          /* owner ID */
-        unsigned                caller_access;  /* access rights for authenticated caller */
-        unsigned                anon_access;    /* access rights for unauthenticated caller */
-        umode_t                 mode;           /* UNIX mode */
-        struct afs_fid          parent;         /* parent file ID */
-        time_t                  mtime_client;   /* last time client changed data */
-        time_t                  mtime_server;   /* last time server changed data */
-};
-/*****************************************************************************/
-/*
- * AFS volume synchronisation information
- */
-struct afs_volsync
-{
-        time_t                  creation;       /* volume creation time */
-};
-#endif /* __KERNEL__ */
-#endif /* _LINUX_AFS_TYPES_H */
diff --git a/fs/afs/use-rtnetlink.c b/fs/afs/use-rtnetlink.c
new file mode 100644
index 000000000000..f8991c700e02
--- /dev/null
+++ b/fs/afs/use-rtnetlink.c
@@ -0,0 +1,473 @@
+/* RTNETLINK client
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/netlink.h>
+#include <linux/rtnetlink.h>
+#include <linux/if_addr.h>
+#include <linux/if_arp.h>
+#include <linux/inetdevice.h>
+#include <net/netlink.h>
+#include "internal.h"
+struct afs_rtm_desc {
+        struct socket           *nlsock;        /* NETLINK_ROUTE socket the requests/replies go over */
+        struct afs_interface    *bufs;          /* caller's array of interface records to fill in */
+        u8                      *mac;           /* caller's buffer to receive a MAC address */
+        size_t                  nbufs;          /* number of records in bufs[] filled in so far */
+        size_t                  maxbufs;        /* capacity of bufs[] */
+        void                    *data;          /* reply buffer allocated by afs_read_rtm() */
+        ssize_t                 datalen;        /* result of the last full read (length or -ve error) */
+        size_t                  datamax;        /* size of the most recently peeked packet */
+        int                     msg_seq;        /* sequence number to use for the next request */
+        unsigned                mac_index;      /* index of the interface mac[] was taken from */
+        bool                    wantloopback;   /* T if loopback interfaces should be kept */
+        int (*parse)(struct afs_rtm_desc *, struct nlmsghdr *); /* called for each reply message */
+};
+/*
+ * Parse one RTM_GETADDR reply message.
+ * - IPv4 addresses are appended to desc->bufs[] until it is full; messages
+ *   for other address families are ignored
+ * - the MTU of a new record is set to 0 here
+ * - always returns 0
+ */
+static int afs_rtm_getaddr_parse(struct afs_rtm_desc *desc,
+                                 struct nlmsghdr *nlhdr)
+{
+        struct ifaddrmsg *ifa = NLMSG_DATA(nlhdr);
+        struct afs_interface *iface;
+        struct rtattr *attr;
+        const char *label = "unknown";
+        size_t remain;
+        _enter("{ix=%d,af=%d}", ifa->ifa_index, ifa->ifa_family);
+        if (ifa->ifa_family != AF_INET) {
+                _leave(" = 0 [family %d]", ifa->ifa_family);
+                return 0;
+        }
+        if (desc->nbufs >= desc->maxbufs) {
+                _leave(" = 0 [max %zu/%zu]", desc->nbufs, desc->maxbufs);
+                return 0;
+        }
+        /* claim the next free record; it is only counted once filled in */
+        iface = &desc->bufs[desc->nbufs];
+        iface->index = ifa->ifa_index;
+        iface->netmask.s_addr = inet_make_mask(ifa->ifa_prefixlen);
+        iface->mtu = 0;
+        /* the attributes follow the aligned ifaddrmsg header */
+        attr = NLMSG_DATA(nlhdr) + NLMSG_ALIGN(sizeof(struct ifaddrmsg));
+        remain = NLMSG_PAYLOAD(nlhdr, sizeof(struct ifaddrmsg));
+        while (RTA_OK(attr, remain)) {
+                if (attr->rta_type == IFA_ADDRESS)
+                        memcpy(&iface->address, RTA_DATA(attr), 4);
+                else if (attr->rta_type == IFA_LABEL)
+                        label = RTA_DATA(attr);
+                attr = RTA_NEXT(attr, remain);
+        }
+        _debug("%s: "NIPQUAD_FMT"/"NIPQUAD_FMT,
+               label, NIPQUAD(iface->address), NIPQUAD(iface->netmask));
+        desc->nbufs++;
+        _leave(" = 0");
+        return 0;
+}
+/*
+ * Parse one RTM_GETLINK reply message, looking for MTUs.
+ * - the MTU is stored in the first record in desc->bufs[] whose interface
+ *   index matches the link; links with no such record are ignored
+ * - loopback links are skipped unless desc->wantloopback is set
+ * - always returns 0
+ */
+static int afs_rtm_getlink_if_parse(struct afs_rtm_desc *desc,
+                                    struct nlmsghdr *nlhdr)
+{
+        struct ifinfomsg *ifi = NLMSG_DATA(nlhdr);
+        struct afs_interface *iface = NULL;
+        struct rtattr *attr;
+        const char *ifname = "unknown";
+        size_t remain, ix;
+        _enter("{ix=%d}", ifi->ifi_index);
+        /* find the address record that belongs to this link */
+        for (ix = 0; ix < desc->nbufs; ix++) {
+                if (desc->bufs[ix].index == ifi->ifi_index) {
+                        iface = &desc->bufs[ix];
+                        break;
+                }
+        }
+        if (!iface) {
+                _leave(" = 0 [no match]");
+                return 0;
+        }
+        if (ifi->ifi_type == ARPHRD_LOOPBACK && !desc->wantloopback) {
+                _leave(" = 0 [loopback]");
+                return 0;
+        }
+        /* the attributes follow the aligned ifinfomsg header */
+        attr = NLMSG_DATA(nlhdr) + NLMSG_ALIGN(sizeof(struct ifinfomsg));
+        remain = NLMSG_PAYLOAD(nlhdr, sizeof(struct ifinfomsg));
+        while (RTA_OK(attr, remain)) {
+                if (attr->rta_type == IFLA_MTU)
+                        memcpy(&iface->mtu, RTA_DATA(attr), 4);
+                else if (attr->rta_type == IFLA_IFNAME)
+                        ifname = RTA_DATA(attr);
+                attr = RTA_NEXT(attr, remain);
+        }
+        _debug("%s: "NIPQUAD_FMT"/"NIPQUAD_FMT" mtu %u",
+               ifname, NIPQUAD(iface->address), NIPQUAD(iface->netmask),
+               iface->mtu);
+        _leave(" = 0");
+        return 0;
+}
+/*
+ * Parse one RTM_GETLINK reply message, looking for a MAC address.
+ * - the link's hardware address is copied to desc->mac if the link is not a
+ *   loopback device and its index is lower than desc->mac_index, which is
+ *   then updated to the link's index
+ * - at most 6 bytes of the address are copied
+ * - always returns 0
+ */
+static int afs_rtm_getlink_mac_parse(struct afs_rtm_desc *desc,
+                                     struct nlmsghdr *nlhdr)
+{
+        struct ifinfomsg *ifi = NLMSG_DATA(nlhdr);
+        struct rtattr *attr;
+        const char *ifname = "unknown";
+        size_t remain, addrlen;
+        bool got_mac = false;
+        _enter("{ix=%d}", ifi->ifi_index);
+        if (ifi->ifi_index >= desc->mac_index) {
+                _leave(" = 0 [high]");
+                return 0;
+        }
+        if (ifi->ifi_type == ARPHRD_LOOPBACK) {
+                _leave(" = 0 [loopback]");
+                return 0;
+        }
+        /* the attributes follow the aligned ifinfomsg header */
+        attr = NLMSG_DATA(nlhdr) + NLMSG_ALIGN(sizeof(struct ifinfomsg));
+        remain = NLMSG_PAYLOAD(nlhdr, sizeof(struct ifinfomsg));
+        while (RTA_OK(attr, remain)) {
+                if (attr->rta_type == IFLA_ADDRESS) {
+                        addrlen = RTA_PAYLOAD(attr);
+                        memcpy(desc->mac, RTA_DATA(attr),
+                               min_t(size_t, addrlen, 6));
+                        desc->mac_index = ifi->ifi_index;
+                        got_mac = true;
+                } else if (attr->rta_type == IFLA_IFNAME) {
+                        ifname = RTA_DATA(attr);
+                }
+                attr = RTA_NEXT(attr, remain);
+        }
+        if (got_mac)
+                _debug("%s: %02x:%02x:%02x:%02x:%02x:%02x",
+                       ifname,
+                       desc->mac[0], desc->mac[1], desc->mac[2],
+                       desc->mac[3], desc->mac[4], desc->mac[5]);
+        _leave(" = 0");
+        return 0;
+}
+/*
+ * read the rtnetlink response and pass to parsing routine
+ * - each packet is first peeked at to find out how big it is so that the
+ *   reply buffer (desc->data) can be made large enough; it is then read in
+ *   full and every message in it is handed to desc->parse()
+ * - reading stops after a packet whose first message is NLMSG_DONE or does
+ *   not have NLM_F_MULTI set
+ * - returns 0 on success or a negative error code; desc->data is left for the
+ *   caller to free in either case
+ */
+static int afs_read_rtm(struct afs_rtm_desc *desc)
+{
+        struct nlmsghdr *nlhdr, tmphdr;
+        struct msghdr msg;
+        struct kvec iov[1];
+        void *data;
+        bool last = false;
+        int len, ret, remain;
+        _enter("");
+        do {
+                /* first of all peek to see how big the packet is */
+                memset(&msg, 0, sizeof(msg));
+                iov[0].iov_base = &tmphdr;
+                iov[0].iov_len = sizeof(tmphdr);
+                len = kernel_recvmsg(desc->nlsock, &msg, iov, 1,
+                                     sizeof(tmphdr), MSG_PEEK | MSG_TRUNC);
+                if (len < 0) {
+                        _leave(" = %d [peek]", len);
+                        return len;
+                }
+                if (len == 0)
+                        continue;
+                if (len < sizeof(tmphdr) || len < NLMSG_PAYLOAD(&tmphdr, 0)) {
+                        _leave(" = -EMSGSIZE");
+                        return -EMSGSIZE;
+                }
+                if (desc->datamax < len) {
+                        /* the old buffer is too small; drop it and keep the
+                         * descriptor consistent in case allocation fails */
+                        kfree(desc->data);
+                        desc->data = NULL;
+                        desc->datamax = 0;
+                        data = kmalloc(len, GFP_KERNEL);
+                        if (!data) {
+                                _leave(" = -ENOMEM");
+                                return -ENOMEM;
+                        }
+                        desc->data = data;
+                }
+                desc->datamax = len;
+                /* read all the data from this packet */
+                iov[0].iov_base = desc->data;
+                iov[0].iov_len = desc->datamax;
+                desc->datalen = kernel_recvmsg(desc->nlsock, &msg, iov, 1,
+                                               desc->datamax, 0);
+                if (desc->datalen < 0) {
+                        _leave(" = %zd [recv]", desc->datalen);
+                        return desc->datalen;
+                }
+                nlhdr = desc->data;
+                /* check if the header is valid */
+                if (!NLMSG_OK(nlhdr, desc->datalen) ||
+                    nlhdr->nlmsg_type == NLMSG_ERROR) {
+                        _leave(" = -EIO");
+                        return -EIO;
+                }
+                /* see if this is the last message */
+                if (nlhdr->nlmsg_type == NLMSG_DONE ||
+                    !(nlhdr->nlmsg_flags & NLM_F_MULTI))
+                        last = true;
+                /* parse the bits we got this time */
+                nlmsg_for_each_msg(nlhdr, desc->data, desc->datalen, remain) {
+                        ret = desc->parse(desc, nlhdr);
+                        if (ret < 0) {
+                                _leave(" = %d [parse]", ret);
+                                return ret;
+                        }
+                }
+        } while (!last);
+        _leave(" = 0");
+        return 0;
+}
+/*
+ * Send an RTM_GETADDR dump request to list the addresses bound to the
+ * interfaces, from which the address and netmask are obtained.
+ * - returns the result of kernel_sendmsg()
+ */
+static int afs_rtm_getaddr(struct afs_rtm_desc *desc)
+{
+        struct {
+                struct nlmsghdr nl_msg __attribute__((aligned(NLMSG_ALIGNTO)));
+                struct ifaddrmsg addr_msg __attribute__((aligned(NLMSG_ALIGNTO)));
+        } req;
+        struct msghdr msg;
+        struct kvec iov;
+        int ret;
+        _enter("");
+        /* build the request; everything not set explicitly must be zero */
+        memset(&req, 0, sizeof(req));
+        req.nl_msg.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifaddrmsg));
+        req.nl_msg.nlmsg_type = RTM_GETADDR;
+        req.nl_msg.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
+        req.nl_msg.nlmsg_seq = desc->msg_seq++;
+        req.nl_msg.nlmsg_pid = 0;
+        /* and send it as a single segment */
+        memset(&msg, 0, sizeof(msg));
+        iov.iov_base = &req;
+        iov.iov_len = sizeof(req);
+        ret = kernel_sendmsg(desc->nlsock, &msg, &iov, 1, sizeof(req));
+        _leave(" = %d", ret);
+        return ret;
+}
+/*
+ * Send an RTM_GETLINK request to list the interface link statuses, from
+ * which the MTUs (or MAC addresses) are obtained.
+ * - returns the result of kernel_sendmsg()
+ */
+static int afs_rtm_getlink(struct afs_rtm_desc *desc)
+{
+        struct {
+                struct nlmsghdr nl_msg __attribute__((aligned(NLMSG_ALIGNTO)));
+                struct ifinfomsg link_msg __attribute__((aligned(NLMSG_ALIGNTO)));
+        } req;
+        struct msghdr msg;
+        struct kvec iov;
+        int ret;
+        _enter("");
+        /* build the request; everything not set explicitly must be zero */
+        memset(&req, 0, sizeof(req));
+        req.nl_msg.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg));
+        req.nl_msg.nlmsg_type = RTM_GETLINK;
+        req.nl_msg.nlmsg_flags = NLM_F_REQUEST | NLM_F_ROOT;
+        req.nl_msg.nlmsg_seq = desc->msg_seq++;
+        req.nl_msg.nlmsg_pid = 0;
+        /* and send it as a single segment */
+        memset(&msg, 0, sizeof(msg));
+        iov.iov_base = &req;
+        iov.iov_len = sizeof(req);
+        ret = kernel_sendmsg(desc->nlsock, &msg, &iov, 1, sizeof(req));
+        _leave(" = %d", ret);
+        return ret;
+}
+/*
+ * Discard any interface records that have no MTU value (mtu == 0), packing
+ * the remaining records down to the front of the buffer in their original
+ * order and updating desc->nbufs to match.
+ */
+static void afs_cull_interfaces(struct afs_rtm_desc *desc)
+{
+        struct afs_interface *bufs = desc->bufs;
+        size_t total = desc->nbufs;
+        int from, to = 0;
+        _enter("{%zu}", total);
+        for (from = 0; from < total; from++) {
+                if (bufs[from].mtu == 0)
+                        continue;
+                /* keep this one, moving it down if a gap has opened up */
+                if (from != to) {
+                        ASSERTCMP(from, >, to);
+                        bufs[to] = bufs[from];
+                }
+                to++;
+        }
+        desc->nbufs = to;
+        _leave(" [%zu/%zu]", desc->nbufs, total);
+}
+/*
+ * get a list of this system's interface IPv4 addresses, netmasks and MTUs
+ * - addresses and netmasks are gathered with RTM_GETADDR and MTUs with
+ *   RTM_GETLINK; any record left without an MTU is then discarded
+ * - returns the number of interface records in the buffer or a negative
+ *   error code
+ */
+int afs_get_ipv4_interfaces(struct afs_interface *bufs, size_t maxbufs,
+                            bool wantloopback)
+{
+        struct afs_rtm_desc desc;
+        int ret, loop;
+        _enter("");
+        memset(&desc, 0, sizeof(desc));
+        desc.bufs = bufs;
+        desc.maxbufs = maxbufs;
+        desc.wantloopback = wantloopback;
+        ret = sock_create_kern(AF_NETLINK, SOCK_DGRAM, NETLINK_ROUTE,
+                               &desc.nlsock);
+        if (ret < 0) {
+                _leave(" = %d [sock]", ret);
+                return ret;
+        }
+        /* issue RTM_GETADDR and collect the addresses */
+        desc.parse = afs_rtm_getaddr_parse;
+        ret = afs_rtm_getaddr(&desc);
+        if (ret >= 0)
+                ret = afs_read_rtm(&desc);
+        if (ret < 0)
+                goto out;
+        /* issue RTM_GETLINK and collect the MTUs */
+        desc.parse = afs_rtm_getlink_if_parse;
+        ret = afs_rtm_getlink(&desc);
+        if (ret >= 0)
+                ret = afs_read_rtm(&desc);
+        if (ret < 0)
+                goto out;
+        afs_cull_interfaces(&desc);
+        ret = desc.nbufs;
+        for (loop = 0; loop < ret; loop++)
+                _debug("[%d] "NIPQUAD_FMT"/"NIPQUAD_FMT" mtu %u",
+                       bufs[loop].index,
+                       NIPQUAD(bufs[loop].address),
+                       NIPQUAD(bufs[loop].netmask),
+                       bufs[loop].mtu);
+out:
+        /* common exit: release the reply buffer and the socket */
+        kfree(desc.data);
+        sock_release(desc.nlsock);
+        _leave(" = %d", ret);
+        return ret;
+}
+/*
+ * get a MAC address from a random ethernet interface that has a real one
+ * - the buffer should be 6 bytes in size
+ * - the address kept is that of the lowest-indexed non-loopback link that
+ *   reports a hardware address
+ * - returns the result of the last rtnetlink operation (non-negative) on
+ *   success, -ENONET if no suitable interface was found or another negative
+ *   error code on failure
+ */
+int afs_get_MAC_address(u8 mac[6])
+{
+        struct afs_rtm_desc desc;
+        int ret;
+        _enter("");
+        memset(&desc, 0, sizeof(desc));
+        desc.mac = mac;
+        /* any real interface index will be lower than this */
+        desc.mac_index = UINT_MAX;
+        ret = sock_create_kern(AF_NETLINK, SOCK_DGRAM, NETLINK_ROUTE,
+                               &desc.nlsock);
+        if (ret < 0) {
+                _leave(" = %d [sock]", ret);
+                return ret;
+        }
+        /* issue RTM_GETLINK */
+        desc.parse = afs_rtm_getlink_mac_parse;
+        ret = afs_rtm_getlink(&desc);
+        if (ret < 0)
+                goto error;
+        ret = afs_read_rtm(&desc);
+        if (ret < 0)
+                goto error;
+        if (desc.mac_index < UINT_MAX) {
+                /* got a MAC address */
+                _debug("[%d] %02x:%02x:%02x:%02x:%02x:%02x",
+                       desc.mac_index,
+                       mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]);
+        } else {
+                ret = -ENONET;
+        }
+error:
+        /* afs_read_rtm() allocates the reply buffer and leaves it to us to
+         * free, on success and on failure alike */
+        kfree(desc.data);
+        sock_release(desc.nlsock);
+        _leave(" = %d", ret);
+        return ret;
+}
diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c
index 7b0e3192ee39..36c1306e09e0 100644
--- a/fs/afs/vlclient.c
+++ b/fs/afs/vlclient.c
@@ -1,4 +1,4 @@
-/* vlclient.c: AFS Volume Location Service client
+/* AFS Volume Location Service client
 *
 * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
@@ -11,247 +11,76 @@
 #include <linux/init.h>
 #include <linux/sched.h>
-#include <rxrpc/rxrpc.h>
-#include <rxrpc/transport.h>
-#include <rxrpc/connection.h>
-#include <rxrpc/call.h>
-#include "server.h"
-#include "volume.h"
-#include "vlclient.h"
-#include "kafsasyncd.h"
-#include "kafstimod.h"
-#include "errors.h"
 #include "internal.h"
-#define VLGETENTRYBYID          503     /* AFS Get Cache Entry By ID operation ID */
-#define VLGETENTRYBYNAME        504     /* AFS Get Cache Entry By Name operation ID */
-#define VLPROBE                 514     /* AFS Probe Volume Location Service operation ID */
-static void afs_rxvl_get_entry_by_id_attn(struct rxrpc_call *call);
-static void afs_rxvl_get_entry_by_id_error(struct rxrpc_call *call);
-/*****************************************************************************/
 /*
- * map afs VL abort codes to/from Linux error codes
+ * map volume locator abort codes to error codes
- * - called with call->lock held
 */
-static void afs_rxvl_aemap(struct rxrpc_call *call)
+static int afs_vl_abort_to_error(u32 abort_code)
 {
-        int err;
+        _enter("%u", abort_code);
-        _enter("{%u,%u,%d}",
+        switch (abort_code) {
-               call->app_err_state, call->app_abort_code, call->app_errno);
+        case AFSVL_IDEXIST:             return -EEXIST;
+        case AFSVL_IO:                  return -EREMOTEIO;
-        switch (call->app_err_state) {
+        case AFSVL_NAMEEXIST:           return -EEXIST;
-        case RXRPC_ESTATE_LOCAL_ABORT:
+        case AFSVL_CREATEFAIL:          return -EREMOTEIO;
-                call->app_abort_code = -call->app_errno;
+        case AFSVL_NOENT:               return -ENOMEDIUM;
-                return;
+        case AFSVL_EMPTY:               return -ENOMEDIUM;
+        case AFSVL_ENTDELETED:          return -ENOMEDIUM;
-        case RXRPC_ESTATE_PEER_ABORT:
+        case AFSVL_BADNAME:             return -EINVAL;
-                switch (call->app_abort_code) {
+        case AFSVL_BADINDEX:            return -EINVAL;
-                case AFSVL_IDEXIST:             err = -EEXIST;          break;
+        case AFSVL_BADVOLTYPE:          return -EINVAL;
-                case AFSVL_IO:                  err = -EREMOTEIO;       break;
+        case AFSVL_BADSERVER:           return -EINVAL;
-                case AFSVL_NAMEEXIST:           err = -EEXIST;          break;
+        case AFSVL_BADPARTITION:        return -EINVAL;
-                case AFSVL_CREATEFAIL:          err = -EREMOTEIO;       break;
+        case AFSVL_REPSFULL:            return -EFBIG;
-                case AFSVL_NOENT:               err = -ENOMEDIUM;       break;
+        case AFSVL_NOREPSERVER:         return -ENOENT;
-                case AFSVL_EMPTY:               err = -ENOMEDIUM;       break;
+        case AFSVL_DUPREPSERVER:        return -EEXIST;
-                case AFSVL_ENTDELETED:          err = -ENOMEDIUM;       break;
+        case AFSVL_RWNOTFOUND:          return -ENOENT;
-                case AFSVL_BADNAME:             err = -EINVAL;          break;
+        case AFSVL_BADREFCOUNT:         return -EINVAL;
-                case AFSVL_BADINDEX:            err = -EINVAL;          break;
+        case AFSVL_SIZEEXCEEDED:        return -EINVAL;
-                case AFSVL_BADVOLTYPE:          err = -EINVAL;          break;
+        case AFSVL_BADENTRY:            return -EINVAL;
-                case AFSVL_BADSERVER:           err = -EINVAL;          break;
+        case AFSVL_BADVOLIDBUMP:        return -EINVAL;
-                case AFSVL_BADPARTITION:        err = -EINVAL;          break;
+        case AFSVL_IDALREADYHASHED:     return -EINVAL;
-                case AFSVL_REPSFULL:            err = -EFBIG;           break;
+        case AFSVL_ENTRYLOCKED:         return -EBUSY;
-                case AFSVL_NOREPSERVER:         err = -ENOENT;          break;
+        case AFSVL_BADVOLOPER:          return -EBADRQC;
-                case AFSVL_DUPREPSERVER:        err = -EEXIST;          break;
+        case AFSVL_BADRELLOCKTYPE:      return -EINVAL;
-                case AFSVL_RWNOTFOUND:          err = -ENOENT;          break;
+        case AFSVL_RERELEASE:           return -EREMOTEIO;
-                case AFSVL_BADREFCOUNT:         err = -EINVAL;          break;
+        case AFSVL_BADSERVERFLAG:       return -EINVAL;
-                case AFSVL_SIZEEXCEEDED:        err = -EINVAL;          break;
+        case AFSVL_PERM:                return -EACCES;
-                case AFSVL_BADENTRY:            err = -EINVAL;          break;
+        case AFSVL_NOMEM:               return -EREMOTEIO;
-                case AFSVL_BADVOLIDBUMP:        err = -EINVAL;          break;
-                case AFSVL_IDALREADYHASHED:     err = -EINVAL;          break;
-                case AFSVL_ENTRYLOCKED:         err = -EBUSY;           break;
-                case AFSVL_BADVOLOPER:          err = -EBADRQC;         break;
-                case AFSVL_BADRELLOCKTYPE:      err = -EINVAL;          break;
-                case AFSVL_RERELEASE:           err = -EREMOTEIO;       break;
-                case AFSVL_BADSERVERFLAG:       err = -EINVAL;          break;
-                case AFSVL_PERM:                err = -EACCES;          break;
-                case AFSVL_NOMEM:               err = -EREMOTEIO;       break;
-                default:
-                        err = afs_abort_to_error(call->app_abort_code);
-                        break;
-                }
-                call->app_errno = err;
-                return;
        default:
-                return;
+                return afs_abort_to_error(abort_code);
        }
-} /* end afs_rxvl_aemap() */
+}
-#if 0
-/*****************************************************************************/
 /*
- * probe a volume location server to see if it is still alive -- unused
+ * deliver reply data to a VL.GetEntryByXXX call
 */
-static int afs_rxvl_probe(struct afs_server *server, int alloc_flags)
+static int afs_deliver_vl_get_entry_by_xxx(struct afs_call *call,
+                                           struct sk_buff *skb, bool last)
 {
-        struct rxrpc_connection *conn;
+        struct afs_cache_vlocation *entry;
-        struct rxrpc_call *call;
+        __be32 *bp;
-        struct kvec piov[1];
+        u32 tmp;
-        size_t sent;
+        int loop;
-        int ret;
-        __be32 param[1];
-        DECLARE_WAITQUEUE(myself, current);
-        /* get hold of the vlserver connection */
-        ret = afs_server_get_vlconn(server, &conn);
-        if (ret < 0)
-                goto out;
-        /* create a call through that connection */
-        ret = rxrpc_create_call(conn, NULL, NULL, afs_rxvl_aemap, &call);
-        if (ret < 0) {
-                printk("kAFS: Unable to create call: %d\n", ret);
-                goto out_put_conn;
-        }
-        call->app_opcode = VLPROBE;
-        /* we want to get event notifications from the call */
-        add_wait_queue(&call->waitq, &myself);
-        /* marshall the parameters */
-        param[0] = htonl(VLPROBE);
-        piov[0].iov_len = sizeof(param);
-        piov[0].iov_base = param;
-        /* send the parameters to the server */
-        ret = rxrpc_call_write_data(call, 1, piov, RXRPC_LAST_PACKET,
-                                    alloc_flags, 0, &sent);
-        if (ret < 0)
-                goto abort;
-        /* wait for the reply to completely arrive */
-        for (;;) {
-                set_current_state(TASK_INTERRUPTIBLE);
-                if (call->app_call_state != RXRPC_CSTATE_CLNT_RCV_REPLY ||
-                    signal_pending(current))
-                        break;
-                schedule();
-        }
-        set_current_state(TASK_RUNNING);
-        ret = -EINTR;
-        if (signal_pending(current))
-                goto abort;
-        switch (call->app_call_state) {
-        case RXRPC_CSTATE_ERROR:
-                ret = call->app_errno;
-                goto out_unwait;
-        case RXRPC_CSTATE_CLNT_GOT_REPLY:
-                ret = 0;
-                goto out_unwait;
-        default:
-                BUG();
-        }
- abort:
-        set_current_state(TASK_UNINTERRUPTIBLE);
-        rxrpc_call_abort(call, ret);
-        schedule();
- out_unwait:
-        set_current_state(TASK_RUNNING);
-        remove_wait_queue(&call->waitq, &myself);
-        rxrpc_put_call(call);
- out_put_conn:
-        rxrpc_put_connection(conn);
- out:
-        return ret;
-} /* end afs_rxvl_probe() */
+        _enter(",,%u", last);
-#endif
-/*****************************************************************************/
+        afs_transfer_reply(call, skb);
-/*
+        if (!last)
- * look up a volume location database entry by name
+                return 0;
- */
-int afs_rxvl_get_entry_by_name(struct afs_server *server,
-                               const char *volname,
-                               unsigned volnamesz,
-                               struct afs_cache_vlocation *entry)
-{
-        DECLARE_WAITQUEUE(myself, current);
-        struct rxrpc_connection *conn;
-        struct rxrpc_call *call;
-        struct kvec piov[3];
-        unsigned tmp;
-        size_t sent;
-        int ret, loop;
-        __be32 *bp, param[2], zero;
-        _enter(",%*.*s,%u,", volnamesz, volnamesz, volname, volnamesz);
-        memset(entry, 0, sizeof(*entry));
-        /* get hold of the vlserver connection */
-        ret = afs_server_get_vlconn(server, &conn);
-        if (ret < 0)
-                goto out;
-        /* create a call through that connection */
-        ret = rxrpc_create_call(conn, NULL, NULL, afs_rxvl_aemap, &call);
-        if (ret < 0) {
-                printk("kAFS: Unable to create call: %d\n", ret);
-                goto out_put_conn;
-        }
-        call->app_opcode = VLGETENTRYBYNAME;
-        /* we want to get event notifications from the call */
+        if (call->reply_size != call->reply_max)
-        add_wait_queue(&call->waitq, &myself);
+                return -EBADMSG;
-        /* marshall the parameters */
+        /* unmarshall the reply once we've received all of it */
-        piov[1].iov_len = volnamesz;
+        entry = call->reply;
-        piov[1].iov_base = (char *) volname;
+        bp = call->buffer;
-        zero = 0;
-        piov[2].iov_len = (4 - (piov[1].iov_len & 3)) & 3;
-        piov[2].iov_base = &zero;
-        param[0] = htonl(VLGETENTRYBYNAME);
-        param[1] = htonl(piov[1].iov_len);
-        piov[0].iov_len = sizeof(param);
-        piov[0].iov_base = param;
-        /* send the parameters to the server */
-        ret = rxrpc_call_write_data(call, 3, piov, RXRPC_LAST_PACKET, GFP_NOFS,
-                                    0, &sent);
-        if (ret < 0)
-                goto abort;
-        /* wait for the reply to completely arrive */
-        bp = rxrpc_call_alloc_scratch(call, 384);
-        ret = rxrpc_call_read_data(call, bp, 384,
-                                   RXRPC_CALL_READ_BLOCK |
-                                   RXRPC_CALL_READ_ALL);
-        if (ret < 0) {
-                if (ret == -ECONNABORTED) {
-                        ret = call->app_errno;
-                        goto out_unwait;
-                }
-                goto abort;
-        }
-        /* unmarshall the reply */
        for (loop = 0; loop < 64; loop++)
                entry->name[loop] = ntohl(*bp++);
+        entry->name[loop] = 0;
        bp++; /* final NUL */
        bp++; /* type */
@@ -264,6 +93,7 @@ int afs_rxvl_get_entry_by_name(struct afs_server *server,
        for (loop = 0; loop < 8; loop++) {
                tmp = ntohl(*bp++);
+                entry->srvtmask[loop] = 0;
                if (tmp & AFS_VLSF_RWVOL)
                        entry->srvtmask[loop] |= AFS_VOL_VTM_RW;
                if (tmp & AFS_VLSF_ROVOL)
@@ -279,417 +109,110 @@ int afs_rxvl_get_entry_by_name(struct afs_server *server,
        bp++; /* clone ID */
        tmp = ntohl(*bp++); /* flags */
+        entry->vidmask = 0;
        if (tmp & AFS_VLF_RWEXISTS)
                entry->vidmask |= AFS_VOL_VTM_RW;
        if (tmp & AFS_VLF_ROEXISTS)
                entry->vidmask |= AFS_VOL_VTM_RO;
        if (tmp & AFS_VLF_BACKEXISTS)
                entry->vidmask |= AFS_VOL_VTM_BAK;
-        ret = -ENOMEDIUM;
        if (!entry->vidmask)
-                goto abort;
+                return -EBADMSG;
-        /* success */
-        entry->rtime = get_seconds();
-        ret = 0;
- out_unwait:
-        set_current_state(TASK_RUNNING);
-        remove_wait_queue(&call->waitq, &myself);
-        rxrpc_put_call(call);
- out_put_conn:
-        rxrpc_put_connection(conn);
- out:
-        _leave(" = %d", ret);
-        return ret;
- abort:
-        set_current_state(TASK_UNINTERRUPTIBLE);
-        rxrpc_call_abort(call, ret);
-        schedule();
-        goto out_unwait;
-} /* end afs_rxvl_get_entry_by_name() */
-/*****************************************************************************/
-/*
- * look up a volume location database entry by ID
- */
-int afs_rxvl_get_entry_by_id(struct afs_server *server,
-                             afs_volid_t volid,
-                             afs_voltype_t voltype,
-                             struct afs_cache_vlocation *entry)
-{
-        DECLARE_WAITQUEUE(myself, current);
-        struct rxrpc_connection *conn;
-        struct rxrpc_call *call;
-        struct kvec piov[1];
-        unsigned tmp;
-        size_t sent;
-        int ret, loop;
-        __be32 *bp, param[3];
-        _enter(",%x,%d,", volid, voltype);
-        memset(entry, 0, sizeof(*entry));
-        /* get hold of the vlserver connection */
-        ret = afs_server_get_vlconn(server, &conn);
-        if (ret < 0)
-                goto out;
-        /* create a call through that connection */
-        ret = rxrpc_create_call(conn, NULL, NULL, afs_rxvl_aemap, &call);
-        if (ret < 0) {
-                printk("kAFS: Unable to create call: %d\n", ret);
-                goto out_put_conn;
-        }
-        call->app_opcode = VLGETENTRYBYID;
-        /* we want to get event notifications from the call */
-        add_wait_queue(&call->waitq, &myself);
-        /* marshall the parameters */
-        param[0] = htonl(VLGETENTRYBYID);
-        param[1] = htonl(volid);
-        param[2] = htonl(voltype);
-        piov[0].iov_len = sizeof(param);
-        piov[0].iov_base = param;
-        /* send the parameters to the server */
-        ret = rxrpc_call_write_data(call, 1, piov, RXRPC_LAST_PACKET, GFP_NOFS,
-                                    0, &sent);
-        if (ret < 0)
-                goto abort;
-        /* wait for the reply to completely arrive */
-        bp = rxrpc_call_alloc_scratch(call, 384);
-        ret = rxrpc_call_read_data(call, bp, 384,
-                                   RXRPC_CALL_READ_BLOCK |
-                                   RXRPC_CALL_READ_ALL);
-        if (ret < 0) {
-                if (ret == -ECONNABORTED) {
-                        ret = call->app_errno;
-                        goto out_unwait;
-                }
-                goto abort;
-        }
-        /* unmarshall the reply */
-        for (loop = 0; loop < 64; loop++)
-                entry->name[loop] = ntohl(*bp++);
-        bp++; /* final NUL */
-        bp++; /* type */
+        _leave(" = 0 [done]");
-        entry->nservers = ntohl(*bp++);
+        return 0;
+}
-        for (loop = 0; loop < 8; loop++)
-                entry->servers[loop].s_addr = *bp++;
-        bp += 8; /* partition IDs */
-        for (loop = 0; loop < 8; loop++) {
-                tmp = ntohl(*bp++);
-                if (tmp & AFS_VLSF_RWVOL)
-                        entry->srvtmask[loop] |= AFS_VOL_VTM_RW;
-                if (tmp & AFS_VLSF_ROVOL)
-                        entry->srvtmask[loop] |= AFS_VOL_VTM_RO;
-                if (tmp & AFS_VLSF_BACKVOL)
-                        entry->srvtmask[loop] |= AFS_VOL_VTM_BAK;
-        }
-        entry->vid[0] = ntohl(*bp++);
-        entry->vid[1] = ntohl(*bp++);
-        entry->vid[2] = ntohl(*bp++);
-        bp++; /* clone ID */
-        tmp = ntohl(*bp++); /* flags */
-        if (tmp & AFS_VLF_RWEXISTS)
-                entry->vidmask |= AFS_VOL_VTM_RW;
-        if (tmp & AFS_VLF_ROEXISTS)
-                entry->vidmask |= AFS_VOL_VTM_RO;
-        if (tmp & AFS_VLF_BACKEXISTS)
-                entry->vidmask |= AFS_VOL_VTM_BAK;
-        ret = -ENOMEDIUM;
-        if (!entry->vidmask)
-                goto abort;
-#if 0 /* TODO: remove */
-        entry->nservers = 3;
-        entry->servers[0].s_addr = htonl(0xac101249);
-        entry->servers[1].s_addr = htonl(0xac101243);
-        entry->servers[2].s_addr = htonl(0xac10125b /*0xac10125b*/);
-        entry->srvtmask[0] = AFS_VOL_VTM_RO;
-        entry->srvtmask[1] = AFS_VOL_VTM_RO;
-        entry->srvtmask[2] = AFS_VOL_VTM_RO | AFS_VOL_VTM_RW;
-#endif
-        /* success */
-        entry->rtime = get_seconds();
-        ret = 0;
- out_unwait:
-        set_current_state(TASK_RUNNING);
-        remove_wait_queue(&call->waitq, &myself);
-        rxrpc_put_call(call);
- out_put_conn:
-        rxrpc_put_connection(conn);
- out:
-        _leave(" = %d", ret);
-        return ret;
- abort:
-        set_current_state(TASK_UNINTERRUPTIBLE);
-        rxrpc_call_abort(call, ret);
-        schedule();
-        goto out_unwait;
-} /* end afs_rxvl_get_entry_by_id() */
-/*****************************************************************************/
 /*
- * look up a volume location database entry by ID asynchronously
+ * VL.GetEntryByName operation type
 */
-int afs_rxvl_get_entry_by_id_async(struct afs_async_op *op,
+static const struct afs_call_type afs_RXVLGetEntryByName = {
-                                   afs_volid_t volid,
+        .name           = "VL.GetEntryByName",
-                                   afs_voltype_t voltype)
+        .deliver        = afs_deliver_vl_get_entry_by_xxx,
-{
+        .abort_to_error = afs_vl_abort_to_error,
-        struct rxrpc_connection *conn;
+        .destructor     = afs_flat_call_destructor,
-        struct rxrpc_call *call;
+};
-        struct kvec piov[1];
-        size_t sent;
-        int ret;
-        __be32 param[3];
-        _enter(",%x,%d,", volid, voltype);
-        /* get hold of the vlserver connection */
-        ret = afs_server_get_vlconn(op->server, &conn);
-        if (ret < 0) {
-                _leave(" = %d", ret);
-                return ret;
-        }
-        /* create a call through that connection */
-        ret = rxrpc_create_call(conn,
-                                afs_rxvl_get_entry_by_id_attn,
-                                afs_rxvl_get_entry_by_id_error,
-                                afs_rxvl_aemap,
-                                &op->call);
-        rxrpc_put_connection(conn);
-        if (ret < 0) {
-                printk("kAFS: Unable to create call: %d\n", ret);
-                _leave(" = %d", ret);
-                return ret;
-        }
-        op->call->app_opcode = VLGETENTRYBYID;
+/*
-        op->call->app_user = op;
+ * VL.GetEntryById operation type
+ */
-        call = op->call;
+static const struct afs_call_type afs_RXVLGetEntryById = {
-        rxrpc_get_call(call);
+        .name           = "VL.GetEntryById",
+        .deliver        = afs_deliver_vl_get_entry_by_xxx,
-        /* send event notifications from the call to kafsasyncd */
+        .abort_to_error = afs_vl_abort_to_error,
-        afs_kafsasyncd_begin_op(op);
+        .destructor     = afs_flat_call_destructor,
+};
-        /* marshall the parameters */
-        param[0] = htonl(VLGETENTRYBYID);
-        param[1] = htonl(volid);
-        param[2] = htonl(voltype);
-        piov[0].iov_len = sizeof(param);
-        piov[0].iov_base = param;
-        /* allocate result read buffer in scratch space */
-        call->app_scr_ptr = rxrpc_call_alloc_scratch(op->call, 384);
-        /* send the parameters to the server */
-        ret = rxrpc_call_write_data(call, 1, piov, RXRPC_LAST_PACKET, GFP_NOFS,
-                                    0, &sent);
-        if (ret < 0) {
-                rxrpc_call_abort(call, ret); /* handle from kafsasyncd */
-                ret = 0;
-                goto out;
-        }
-        /* wait for the reply to completely arrive */
-        ret = rxrpc_call_read_data(call, call->app_scr_ptr, 384, 0);
-        switch (ret) {
-        case 0:
-        case -EAGAIN:
-        case -ECONNABORTED:
-                ret = 0;
-                break;  /* all handled by kafsasyncd */
-        default:
-                rxrpc_call_abort(call, ret); /* make kafsasyncd handle it */
-                ret = 0;
-                break;
-        }
- out:
-        rxrpc_put_call(call);
-        _leave(" = %d", ret);
-        return ret;
-} /* end afs_rxvl_get_entry_by_id_async() */
-/*****************************************************************************/
 /*
- * attend to the asynchronous get VLDB entry by ID
+ * dispatch a get volume entry by name operation
 */
-int afs_rxvl_get_entry_by_id_async2(struct afs_async_op *op,
+int afs_vl_get_entry_by_name(struct in_addr *addr,
-                                    struct afs_cache_vlocation *entry)
+                             struct key *key,
+                             const char *volname,
+                             struct afs_cache_vlocation *entry,
+                             const struct afs_wait_mode *wait_mode)
 {
+        struct afs_call *call;
+        size_t volnamesz, reqsz, padsz;
        __be32 *bp;
-        __u32 tmp;
-        int loop, ret;
-        _enter("{op=%p cst=%u}", op, op->call->app_call_state);
-        memset(entry, 0, sizeof(*entry));
-        if (op->call->app_call_state == RXRPC_CSTATE_COMPLETE) {
-                /* operation finished */
-                afs_kafsasyncd_terminate_op(op);
-                bp = op->call->app_scr_ptr;
-                /* unmarshall the reply */
-                for (loop = 0; loop < 64; loop++)
-                        entry->name[loop] = ntohl(*bp++);
-                bp++; /* final NUL */
-                bp++; /* type */
-                entry->nservers = ntohl(*bp++);
-                for (loop = 0; loop < 8; loop++)
-                        entry->servers[loop].s_addr = *bp++;
-                bp += 8; /* partition IDs */
-                for (loop = 0; loop < 8; loop++) {
-                        tmp = ntohl(*bp++);
-                        if (tmp & AFS_VLSF_RWVOL)
-                                entry->srvtmask[loop] |= AFS_VOL_VTM_RW;
-                        if (tmp & AFS_VLSF_ROVOL)
-                                entry->srvtmask[loop] |= AFS_VOL_VTM_RO;
-                        if (tmp & AFS_VLSF_BACKVOL)
-                                entry->srvtmask[loop] |= AFS_VOL_VTM_BAK;
-                }
-                entry->vid[0] = ntohl(*bp++);
-                entry->vid[1] = ntohl(*bp++);
-                entry->vid[2] = ntohl(*bp++);
-                bp++; /* clone ID */
-                tmp = ntohl(*bp++); /* flags */
-                if (tmp & AFS_VLF_RWEXISTS)
-                        entry->vidmask |= AFS_VOL_VTM_RW;
-                if (tmp & AFS_VLF_ROEXISTS)
-                        entry->vidmask |= AFS_VOL_VTM_RO;
-                if (tmp & AFS_VLF_BACKEXISTS)
-                        entry->vidmask |= AFS_VOL_VTM_BAK;
-                ret = -ENOMEDIUM;
-                if (!entry->vidmask) {
-                        rxrpc_call_abort(op->call, ret);
-                        goto done;
-                }
-#if 0 /* TODO: remove */
-                entry->nservers = 3;
-                entry->servers[0].s_addr = htonl(0xac101249);
-                entry->servers[1].s_addr = htonl(0xac101243);
-                entry->servers[2].s_addr = htonl(0xac10125b /*0xac10125b*/);
-                entry->srvtmask[0] = AFS_VOL_VTM_RO;
-                entry->srvtmask[1] = AFS_VOL_VTM_RO;
-                entry->srvtmask[2] = AFS_VOL_VTM_RO | AFS_VOL_VTM_RW;
-#endif
-                /* success */
-                entry->rtime = get_seconds();
-                ret = 0;
-                goto done;
-        }
-        if (op->call->app_call_state == RXRPC_CSTATE_ERROR) {
+        _enter("");
-                /* operation error */
-                ret = op->call->app_errno;
-                goto done;
-        }
-        _leave(" = -EAGAIN");
+        volnamesz = strlen(volname);
-        return -EAGAIN;
+        padsz = (4 - (volnamesz & 3)) & 3;
+        reqsz = 8 + volnamesz + padsz;
- done:
+        call = afs_alloc_flat_call(&afs_RXVLGetEntryByName, reqsz, 384);
-        rxrpc_put_call(op->call);
+        if (!call)
-        op->call = NULL;
+                return -ENOMEM;
-        _leave(" = %d", ret);
-        return ret;
-} /* end afs_rxvl_get_entry_by_id_async2() */
-/*****************************************************************************/
+        call->key = key;
-/*
+        call->reply = entry;
- * handle attention events on an async get-entry-by-ID op
+        call->service_id = VL_SERVICE;
- * - called from krxiod
+        call->port = htons(AFS_VL_PORT);
- */
-static void afs_rxvl_get_entry_by_id_attn(struct rxrpc_call *call)
-{
-        struct afs_async_op *op = call->app_user;
-        _enter("{op=%p cst=%u}", op, call->app_call_state);
-        switch (call->app_call_state) {
-        case RXRPC_CSTATE_COMPLETE:
-                afs_kafsasyncd_attend_op(op);
-                break;
-        case RXRPC_CSTATE_CLNT_RCV_REPLY:
-                if (call->app_async_read)
-                        break;
-        case RXRPC_CSTATE_CLNT_GOT_REPLY:
-                if (call->app_read_count == 0)
-                        break;
-                printk("kAFS: Reply bigger than expected"
-                       " {cst=%u asyn=%d mark=%Zu rdy=%Zu pr=%u%s}",
-                       call->app_call_state,
-                       call->app_async_read,
-                       call->app_mark,
-                       call->app_ready_qty,
-                       call->pkt_rcv_count,
-                       call->app_last_rcv ? " last" : "");
-                rxrpc_call_abort(call, -EBADMSG);
-                break;
-        default:
-                BUG();
-        }
-        _leave("");
+        /* marshall the parameters */
+        bp = call->request;
+        *bp++ = htonl(VLGETENTRYBYNAME);
+        *bp++ = htonl(volnamesz);
+        memcpy(bp, volname, volnamesz);
+        if (padsz > 0)
+                memset((void *) bp + volnamesz, 0, padsz);
-} /* end afs_rxvl_get_entry_by_id_attn() */
+        /* initiate the call */
+        return afs_make_call(addr, call, GFP_KERNEL, wait_mode);
+}
-/*****************************************************************************/
 /*
- * handle error events on an async get-entry-by-ID op
+ * dispatch a get volume entry by ID operation
- * - called from krxiod
 */
-static void afs_rxvl_get_entry_by_id_error(struct rxrpc_call *call)
+int afs_vl_get_entry_by_id(struct in_addr *addr,
+                           struct key *key,
+                           afs_volid_t volid,
+                           afs_voltype_t voltype,
+                           struct afs_cache_vlocation *entry,
+                           const struct afs_wait_mode *wait_mode)
 {
-        struct afs_async_op *op = call->app_user;
+        struct afs_call *call;
+        __be32 *bp;
-        _enter("{op=%p cst=%u}", op, call->app_call_state);
+        _enter("");
-        afs_kafsasyncd_attend_op(op);
+        call = afs_alloc_flat_call(&afs_RXVLGetEntryById, 12, 384);
+        if (!call)
+                return -ENOMEM;
-        _leave("");
+        call->key = key;
+        call->reply = entry;
+        call->service_id = VL_SERVICE;
+        call->port = htons(AFS_VL_PORT);
-} /* end afs_rxvl_get_entry_by_id_error() */
+        /* marshall the parameters */
+        bp = call->request;
+        *bp++ = htonl(VLGETENTRYBYID);
+        *bp++ = htonl(volid);
+        *bp   = htonl(voltype);
+        /* initiate the call */
+        return afs_make_call(addr, call, GFP_KERNEL, wait_mode);
+}
diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c
index 782ee7c600ca..6c8e95a7c2c9 100644
--- a/fs/afs/vlocation.c
+++ b/fs/afs/vlocation.c
@@ -1,6 +1,6 @@
-/* vlocation.c: volume location management
+/* AFS volume location management
 *
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 *
 * This program is free software; you can redistribute it and/or
@@ -12,131 +12,61 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/init.h>
-#include <linux/slab.h>
-#include <linux/fs.h>
-#include <linux/pagemap.h>
-#include "volume.h"
-#include "cell.h"
-#include "cmservice.h"
-#include "fsclient.h"
-#include "vlclient.h"
-#include "kafstimod.h"
-#include <rxrpc/connection.h>
 #include "internal.h"
-#define AFS_VLDB_TIMEOUT HZ*1000
+unsigned afs_vlocation_timeout = 10;    /* volume location timeout in seconds */
+unsigned afs_vlocation_update_timeout = 10 * 60;
-static void afs_vlocation_update_timer(struct afs_timer *timer);
+static void afs_vlocation_reaper(struct work_struct *);
-static void afs_vlocation_update_attend(struct afs_async_op *op);
+static void afs_vlocation_updater(struct work_struct *);
-static void afs_vlocation_update_discard(struct afs_async_op *op);
-static void __afs_put_vlocation(struct afs_vlocation *vlocation);
-static void __afs_vlocation_timeout(struct afs_timer *timer)
+static LIST_HEAD(afs_vlocation_updates);
-{
+static LIST_HEAD(afs_vlocation_graveyard);
-        struct afs_vlocation *vlocation =
+static DEFINE_SPINLOCK(afs_vlocation_updates_lock);
-                list_entry(timer, struct afs_vlocation, timeout);
+static DEFINE_SPINLOCK(afs_vlocation_graveyard_lock);
+static DECLARE_DELAYED_WORK(afs_vlocation_reap, afs_vlocation_reaper);
-        _debug("VL TIMEOUT [%s{u=%d}]",
+static DECLARE_DELAYED_WORK(afs_vlocation_update, afs_vlocation_updater);
-               vlocation->vldb.name, atomic_read(&vlocation->usage));
+static struct workqueue_struct *afs_vlocation_update_worker;
-        afs_vlocation_do_timeout(vlocation);
-}
-static const struct afs_timer_ops afs_vlocation_timer_ops = {
-        .timed_out      = __afs_vlocation_timeout,
-};
-static const struct afs_timer_ops afs_vlocation_update_timer_ops = {
-        .timed_out      = afs_vlocation_update_timer,
-};
-static const struct afs_async_op_ops afs_vlocation_update_op_ops = {
-        .attend         = afs_vlocation_update_attend,
-        .discard        = afs_vlocation_update_discard,
-};
-static LIST_HEAD(afs_vlocation_update_pendq);   /* queue of VLs awaiting update */
-static struct afs_vlocation *afs_vlocation_update;      /* VL currently being updated */
-static DEFINE_SPINLOCK(afs_vlocation_update_lock); /* lock guarding update queue */
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_vlocation_cache_match(void *target,
-                                                     const void *entry);
-static void afs_vlocation_cache_update(void *source, void *entry);
-struct cachefs_index_def afs_vlocation_cache_index_def = {
-        .name           = "vldb",
-        .data_size      = sizeof(struct afs_cache_vlocation),
-        .keys[0]        = { CACHEFS_INDEX_KEYS_ASCIIZ, 64 },
-        .match          = afs_vlocation_cache_match,
-        .update         = afs_vlocation_cache_update,
-};
-#endif
-/*****************************************************************************/
 /*
 * iterate through the VL servers in a cell until one of them admits knowing
 * about the volume in question
- * - caller must have cell->vl_sem write-locked
 */
-static int afs_vlocation_access_vl_by_name(struct afs_vlocation *vlocation,
+static int afs_vlocation_access_vl_by_name(struct afs_vlocation *vl,
-                                           const char *name,
+                                           struct key *key,
-                                           unsigned namesz,
                                           struct afs_cache_vlocation *vldb)
 {
-        struct afs_server *server = NULL;
+        struct afs_cell *cell = vl->cell;
-        struct afs_cell *cell = vlocation->cell;
+        struct in_addr addr;
        int count, ret;
-        _enter("%s,%*.*s,%u", cell->name, namesz, namesz, name, namesz);
+        _enter("%s,%s", cell->name, vl->vldb.name);
+        down_write(&vl->cell->vl_sem);
        ret = -ENOMEDIUM;
        for (count = cell->vl_naddrs; count > 0; count--) {
-                _debug("CellServ[%hu]: %08x",
+                addr = cell->vl_addrs[cell->vl_curr_svix];
-                       cell->vl_curr_svix,
-                       cell->vl_addrs[cell->vl_curr_svix].s_addr);
+                _debug("CellServ[%hu]: %08x", cell->vl_curr_svix, addr.s_addr);
-                /* try and create a server */
-                ret = afs_server_lookup(cell,
-                                        &cell->vl_addrs[cell->vl_curr_svix],
-                                        &server);
-                switch (ret) {
-                case 0:
-                        break;
-                case -ENOMEM:
-                case -ENONET:
-                        goto out;
-                default:
-                        goto rotate;
-                }
                /* attempt to access the VL server */
-                ret = afs_rxvl_get_entry_by_name(server, name, namesz, vldb);
+                ret = afs_vl_get_entry_by_name(&addr, key, vl->vldb.name, vldb,
+                                               &afs_sync_call);
                switch (ret) {
                case 0:
-                        afs_put_server(server);
                        goto out;
                case -ENOMEM:
                case -ENONET:
                case -ENETUNREACH:
                case -EHOSTUNREACH:
                case -ECONNREFUSED:
-                        down_write(&server->sem);
-                        if (server->vlserver) {
-                                rxrpc_put_connection(server->vlserver);
-                                server->vlserver = NULL;
-                        }
-                        up_write(&server->sem);
-                        afs_put_server(server);
                        if (ret == -ENOMEM || ret == -ENONET)
                                goto out;
                        goto rotate;
                case -ENOMEDIUM:
-                        afs_put_server(server);
                        goto out;
                default:
-                        afs_put_server(server);
+                        ret = -EIO;
-                        ret = -ENOMEDIUM;
                        goto rotate;
                }
@@ -146,76 +76,66 @@ static int afs_vlocation_access_vl_by_name(struct afs_vlocation *vlocation,
                cell->vl_curr_svix %= cell->vl_naddrs;
        }
- out:
+out:
+        up_write(&vl->cell->vl_sem);
        _leave(" = %d", ret);
        return ret;
+}
-} /* end afs_vlocation_access_vl_by_name() */
-/*****************************************************************************/
 /*
 * iterate through the VL servers in a cell until one of them admits knowing
 * about the volume in question
- * - caller must have cell->vl_sem write-locked
 */
-static int afs_vlocation_access_vl_by_id(struct afs_vlocation *vlocation,
+static int afs_vlocation_access_vl_by_id(struct afs_vlocation *vl,
+                                         struct key *key,
                                         afs_volid_t volid,
                                         afs_voltype_t voltype,
                                         struct afs_cache_vlocation *vldb)
 {
-        struct afs_server *server = NULL;
+        struct afs_cell *cell = vl->cell;
-        struct afs_cell *cell = vlocation->cell;
+        struct in_addr addr;
        int count, ret;
        _enter("%s,%x,%d,", cell->name, volid, voltype);
+        down_write(&vl->cell->vl_sem);
        ret = -ENOMEDIUM;
        for (count = cell->vl_naddrs; count > 0; count--) {
-                _debug("CellServ[%hu]: %08x",
+                addr = cell->vl_addrs[cell->vl_curr_svix];
-                       cell->vl_curr_svix,
-                       cell->vl_addrs[cell->vl_curr_svix].s_addr);
+                _debug("CellServ[%hu]: %08x", cell->vl_curr_svix, addr.s_addr);
-                /* try and create a server */
-                ret = afs_server_lookup(cell,
-                                        &cell->vl_addrs[cell->vl_curr_svix],
-                                        &server);
-                switch (ret) {
-                case 0:
-                        break;
-                case -ENOMEM:
-                case -ENONET:
-                        goto out;
-                default:
-                        goto rotate;
-                }
                /* attempt to access the VL server */
-                ret = afs_rxvl_get_entry_by_id(server, volid, voltype, vldb);
+                ret = afs_vl_get_entry_by_id(&addr, key, volid, voltype, vldb,
+                                             &afs_sync_call);
                switch (ret) {
                case 0:
-                        afs_put_server(server);
                        goto out;
                case -ENOMEM:
                case -ENONET:
                case -ENETUNREACH:
                case -EHOSTUNREACH:
                case -ECONNREFUSED:
-                        down_write(&server->sem);
-                        if (server->vlserver) {
-                                rxrpc_put_connection(server->vlserver);
-                                server->vlserver = NULL;
-                        }
-                        up_write(&server->sem);
-                        afs_put_server(server);
                        if (ret == -ENOMEM || ret == -ENONET)
                                goto out;
                        goto rotate;
+                case -EBUSY:
+                        vl->upd_busy_cnt++;
+                        if (vl->upd_busy_cnt <= 3) {
+                                if (vl->upd_busy_cnt > 1) {
+                                        /* second+ BUSY - sleep a little bit */
+                                        set_current_state(TASK_UNINTERRUPTIBLE);
+                                        schedule_timeout(1);
+                                        __set_current_state(TASK_RUNNING);
+                                }
+                                continue;
+                        }
+                        break;
                case -ENOMEDIUM:
-                        afs_put_server(server);
+                        vl->upd_rej_cnt++;
-                        goto out;
+                        goto rotate;
                default:
-                        afs_put_server(server);
+                        ret = -EIO;
-                        ret = -ENOMEDIUM;
                        goto rotate;
                }
@@ -223,729 +143,579 @@ static int afs_vlocation_access_vl_by_id(struct afs_vlocation *vlocation,
        rotate:
                cell->vl_curr_svix++;
                cell->vl_curr_svix %= cell->vl_naddrs;
+                vl->upd_busy_cnt = 0;
        }
- out:
+out:
+        if (ret < 0 && vl->upd_rej_cnt > 0) {
+                printk(KERN_NOTICE "kAFS:"
+                       " Active volume no longer valid '%s'\n",
+                       vl->vldb.name);
+                vl->valid = 0;
+                ret = -ENOMEDIUM;
+        }
+        up_write(&vl->cell->vl_sem);
        _leave(" = %d", ret);
        return ret;
+}
-} /* end afs_vlocation_access_vl_by_id() */
-/*****************************************************************************/
 /*
- * lookup volume location
+ * allocate a volume location record
- * - caller must have cell->vol_sem write-locked
- * - iterate through the VL servers in a cell until one of them admits knowing
- *   about the volume in question
- * - lookup in the local cache if not able to find on the VL server
- * - insert/update in the local cache if did get a VL response
 */
-int afs_vlocation_lookup(struct afs_cell *cell,
+static struct afs_vlocation *afs_vlocation_alloc(struct afs_cell *cell,
-                         const char *name,
+                                                 const char *name,
-                         unsigned namesz,
+                                                 size_t namesz)
-                         struct afs_vlocation **_vlocation)
 {
-        struct afs_cache_vlocation vldb;
+        struct afs_vlocation *vl;
-        struct afs_vlocation *vlocation;
-        afs_voltype_t voltype;
+        vl = kzalloc(sizeof(struct afs_vlocation), GFP_KERNEL);
-        afs_volid_t vid;
+        if (vl) {
-        int active = 0, ret;
+                vl->cell = cell;
+                vl->state = AFS_VL_NEW;
-        _enter("{%s},%*.*s,%u,", cell->name, namesz, namesz, name, namesz);
+                atomic_set(&vl->usage, 1);
+                INIT_LIST_HEAD(&vl->link);
-        if (namesz > sizeof(vlocation->vldb.name)) {
+                INIT_LIST_HEAD(&vl->grave);
-                _leave(" = -ENAMETOOLONG");
+                INIT_LIST_HEAD(&vl->update);
-                return -ENAMETOOLONG;
+                init_waitqueue_head(&vl->waitq);
-        }
+                spin_lock_init(&vl->lock);
+                memcpy(vl->vldb.name, name, namesz);
-        /* search the cell's active list first */
-        list_for_each_entry(vlocation, &cell->vl_list, link) {
-                if (namesz < sizeof(vlocation->vldb.name) &&
-                    vlocation->vldb.name[namesz] != '\0')
-                        continue;
-                if (memcmp(vlocation->vldb.name, name, namesz) == 0)
-                        goto found_in_memory;
-        }
-        /* search the cell's graveyard list second */
-        spin_lock(&cell->vl_gylock);
-        list_for_each_entry(vlocation, &cell->vl_graveyard, link) {
-                if (namesz < sizeof(vlocation->vldb.name) &&
-                    vlocation->vldb.name[namesz] != '\0')
-                        continue;
-                if (memcmp(vlocation->vldb.name, name, namesz) == 0)
-                        goto found_in_graveyard;
-        }
-        spin_unlock(&cell->vl_gylock);
-        /* not in the cell's in-memory lists - create a new record */
-        vlocation = kzalloc(sizeof(struct afs_vlocation), GFP_KERNEL);
-        if (!vlocation)
-                return -ENOMEM;
-        atomic_set(&vlocation->usage, 1);
-        INIT_LIST_HEAD(&vlocation->link);
-        rwlock_init(&vlocation->lock);
-        memcpy(vlocation->vldb.name, name, namesz);
-        afs_timer_init(&vlocation->timeout, &afs_vlocation_timer_ops);
-        afs_timer_init(&vlocation->upd_timer, &afs_vlocation_update_timer_ops);
-        afs_async_op_init(&vlocation->upd_op, &afs_vlocation_update_op_ops);
-        afs_get_cell(cell);
-        vlocation->cell = cell;
-        list_add_tail(&vlocation->link, &cell->vl_list);
-#ifdef AFS_CACHING_SUPPORT
-        /* we want to store it in the cache, plus it might already be
-         * encached */
-        cachefs_acquire_cookie(cell->cache,
-                               &afs_volume_cache_index_def,
-                               vlocation,
-                               &vlocation->cache);
-        if (vlocation->valid)
-                goto found_in_cache;
-#endif
-        /* try to look up an unknown volume in the cell VL databases by name */
-        ret = afs_vlocation_access_vl_by_name(vlocation, name, namesz, &vldb);
-        if (ret < 0) {
-                printk("kAFS: failed to locate '%*.*s' in cell '%s'\n",
-                       namesz, namesz, name, cell->name);
-                goto error;
        }
-        goto found_on_vlserver;
+        _leave(" = %p", vl);
+        return vl;
- found_in_graveyard:
+}
-        /* found in the graveyard - resurrect */
-        _debug("found in graveyard");
-        atomic_inc(&vlocation->usage);
-        list_move_tail(&vlocation->link, &cell->vl_list);
-        spin_unlock(&cell->vl_gylock);
-        afs_kafstimod_del_timer(&vlocation->timeout);
-        goto active;
- found_in_memory:
-        /* found in memory - check to see if it's active */
-        _debug("found in memory");
-        atomic_inc(&vlocation->usage);
- active:
+/*
-        active = 1;
+ * update record if we found it in the cache
+ */
+static int afs_vlocation_update_record(struct afs_vlocation *vl,
+                                       struct key *key,
+                                       struct afs_cache_vlocation *vldb)
+{
+        afs_voltype_t voltype;
+        afs_volid_t vid;
+        int ret;
-#ifdef AFS_CACHING_SUPPORT
- found_in_cache:
-#endif
        /* try to look up a cached volume in the cell VL databases by ID */
-        _debug("found in cache");
        _debug("Locally Cached: %s %02x { %08x(%x) %08x(%x) %08x(%x) }",
-               vlocation->vldb.name,
+               vl->vldb.name,
-               vlocation->vldb.vidmask,
+               vl->vldb.vidmask,
-               ntohl(vlocation->vldb.servers[0].s_addr),
+               ntohl(vl->vldb.servers[0].s_addr),
-               vlocation->vldb.srvtmask[0],
+               vl->vldb.srvtmask[0],
-               ntohl(vlocation->vldb.servers[1].s_addr),
+               ntohl(vl->vldb.servers[1].s_addr),
-               vlocation->vldb.srvtmask[1],
+               vl->vldb.srvtmask[1],
-               ntohl(vlocation->vldb.servers[2].s_addr),
+               ntohl(vl->vldb.servers[2].s_addr),
-               vlocation->vldb.srvtmask[2]
+               vl->vldb.srvtmask[2]);
-               );
        _debug("Vids: %08x %08x %08x",
-               vlocation->vldb.vid[0],
+               vl->vldb.vid[0],
-               vlocation->vldb.vid[1],
+               vl->vldb.vid[1],
-               vlocation->vldb.vid[2]);
+               vl->vldb.vid[2]);
-        if (vlocation->vldb.vidmask & AFS_VOL_VTM_RW) {
+        if (vl->vldb.vidmask & AFS_VOL_VTM_RW) {
-                vid = vlocation->vldb.vid[0];
+                vid = vl->vldb.vid[0];
                voltype = AFSVL_RWVOL;
-        }
+        } else if (vl->vldb.vidmask & AFS_VOL_VTM_RO) {
-        else if (vlocation->vldb.vidmask & AFS_VOL_VTM_RO) {
+                vid = vl->vldb.vid[1];
-                vid = vlocation->vldb.vid[1];
                voltype = AFSVL_ROVOL;
-        }
+        } else if (vl->vldb.vidmask & AFS_VOL_VTM_BAK) {
-        else if (vlocation->vldb.vidmask & AFS_VOL_VTM_BAK) {
+                vid = vl->vldb.vid[2];
-                vid = vlocation->vldb.vid[2];
                voltype = AFSVL_BACKVOL;
-        }
+        } else {
-        else {
                BUG();
                vid = 0;
                voltype = 0;
        }
-        ret = afs_vlocation_access_vl_by_id(vlocation, vid, voltype, &vldb);
+        /* contact the server to make sure the volume is still available
+         * - TODO: need to handle disconnected operation here
+         */
+        ret = afs_vlocation_access_vl_by_id(vl, key, vid, voltype, vldb);
        switch (ret) {
                /* net error */
        default:
-                printk("kAFS: failed to volume '%*.*s' (%x) up in '%s': %d\n",
+                printk(KERN_WARNING "kAFS:"
-                       namesz, namesz, name, vid, cell->name, ret);
+                       " failed to update volume '%s' (%x) up in '%s': %d\n",
-                goto error;
+                       vl->vldb.name, vid, vl->cell->name, ret);
+                _leave(" = %d", ret);
+                return ret;
                /* pulled from local cache into memory */
        case 0:
-                goto found_on_vlserver;
+                _leave(" = 0");
+                return 0;
                /* uh oh... looks like the volume got deleted */
        case -ENOMEDIUM:
-                printk("kAFS: volume '%*.*s' (%x) does not exist '%s'\n",
+                printk(KERN_ERR "kAFS:"
-                       namesz, namesz, name, vid, cell->name);
+                       " volume '%s' (%x) does not exist '%s'\n",
+                       vl->vldb.name, vid, vl->cell->name);
                /* TODO: make existing record unavailable */
-                goto error;
+                _leave(" = %d", ret);
+                return ret;
        }
+}
- found_on_vlserver:
+/*
-        _debug("Done VL Lookup: %*.*s %02x { %08x(%x) %08x(%x) %08x(%x) }",
+ * apply the update to a VL record
-               namesz, namesz, name,
+ */
-               vldb.vidmask,
+static void afs_vlocation_apply_update(struct afs_vlocation *vl,
-               ntohl(vldb.servers[0].s_addr), vldb.srvtmask[0],
+                                       struct afs_cache_vlocation *vldb)
-               ntohl(vldb.servers[1].s_addr), vldb.srvtmask[1],
+{
-               ntohl(vldb.servers[2].s_addr), vldb.srvtmask[2]
+        _debug("Done VL Lookup: %s %02x { %08x(%x) %08x(%x) %08x(%x) }",
-               );
+               vldb->name, vldb->vidmask,
+               ntohl(vldb->servers[0].s_addr), vldb->srvtmask[0],
-        _debug("Vids: %08x %08x %08x", vldb.vid[0], vldb.vid[1], vldb.vid[2]);
+               ntohl(vldb->servers[1].s_addr), vldb->srvtmask[1],
+               ntohl(vldb->servers[2].s_addr), vldb->srvtmask[2]);
-        if ((namesz < sizeof(vlocation->vldb.name) &&
+        _debug("Vids: %08x %08x %08x",
-             vlocation->vldb.name[namesz] != '\0') ||
+               vldb->vid[0], vldb->vid[1], vldb->vid[2]);
-            memcmp(vldb.name, name, namesz) != 0)
-                printk("kAFS: name of volume '%*.*s' changed to '%s' on server\n",
-                       namesz, namesz, name, vldb.name);
-        memcpy(&vlocation->vldb, &vldb, sizeof(vlocation->vldb));
+        if (strcmp(vldb->name, vl->vldb.name) != 0)
+                printk(KERN_NOTICE "kAFS:"
+                       " name of volume '%s' changed to '%s' on server\n",
+                       vl->vldb.name, vldb->name);
-        afs_kafstimod_add_timer(&vlocation->upd_timer, 10 * HZ);
+        vl->vldb = *vldb;
 #ifdef AFS_CACHING_SUPPORT
        /* update volume entry in local cache */
-        cachefs_update_cookie(vlocation->cache);
+        cachefs_update_cookie(vl->cache);
-#endif
-        *_vlocation = vlocation;
-        _leave(" = 0 (%p)",vlocation);
-        return 0;
- error:
-        if (vlocation) {
-                if (active) {
-                        __afs_put_vlocation(vlocation);
-                }
-                else {
-                        list_del(&vlocation->link);
-#ifdef AFS_CACHING_SUPPORT
-                        cachefs_relinquish_cookie(vlocation->cache, 0);
 #endif
-                        afs_put_cell(vlocation->cell);
+}
-                        kfree(vlocation);
-                }
-        }
-        _leave(" = %d", ret);
-        return ret;
-} /* end afs_vlocation_lookup() */
-/*****************************************************************************/
 /*
- * finish using a volume location record
+ * fill in a volume location record, consulting the cache and the VL server
- * - caller must have cell->vol_sem write-locked
+ * both
 */
-static void __afs_put_vlocation(struct afs_vlocation *vlocation)
+static int afs_vlocation_fill_in_record(struct afs_vlocation *vl,
+                                        struct key *key)
 {
-        struct afs_cell *cell;
+        struct afs_cache_vlocation vldb;
+        int ret;
-        if (!vlocation)
+        _enter("");
-                return;
-        _enter("%s", vlocation->vldb.name);
+        ASSERTCMP(vl->valid, ==, 0);
-        cell = vlocation->cell;
+        memset(&vldb, 0, sizeof(vldb));
-        /* sanity check */
+        /* see if we have an in-cache copy (will set vl->valid if there is) */
-        BUG_ON(atomic_read(&vlocation->usage) <= 0);
+#ifdef AFS_CACHING_SUPPORT
+        cachefs_acquire_cookie(cell->cache,
+                               &afs_volume_cache_index_def,
+                               vlocation,
+                               &vl->cache);
+#endif
-        spin_lock(&cell->vl_gylock);
+        if (vl->valid) {
-        if (likely(!atomic_dec_and_test(&vlocation->usage))) {
+                /* try to update a known volume in the cell VL databases by
-                spin_unlock(&cell->vl_gylock);
+                 * ID as the name may have changed */
-                _leave("");
+                _debug("found in cache");
-                return;
+                ret = afs_vlocation_update_record(vl, key, &vldb);
+        } else {
+                /* try to look up an unknown volume in the cell VL databases by
+                 * name */
+                ret = afs_vlocation_access_vl_by_name(vl, key, &vldb);
+                if (ret < 0) {
+                        printk("kAFS: failed to locate '%s' in cell '%s'\n",
+                               vl->vldb.name, vl->cell->name);
+                        return ret;
+                }
        }
-        /* move to graveyard queue */
+        afs_vlocation_apply_update(vl, &vldb);
-        list_move_tail(&vlocation->link,&cell->vl_graveyard);
+        _leave(" = 0");
+        return 0;
-        /* remove from pending timeout queue (refcounted if actually being
+}
-         * updated) */
-        list_del_init(&vlocation->upd_op.link);
-        /* time out in 10 secs */
-        afs_kafstimod_del_timer(&vlocation->upd_timer);
-        afs_kafstimod_add_timer(&vlocation->timeout, 10 * HZ);
-        spin_unlock(&cell->vl_gylock);
-        _leave(" [killed]");
-} /* end __afs_put_vlocation() */
-/*****************************************************************************/
-/*
- * finish using a volume location record
- */
-void afs_put_vlocation(struct afs_vlocation *vlocation)
-{
-        if (vlocation) {
-                struct afs_cell *cell = vlocation->cell;
-                down_write(&cell->vl_sem);
-                __afs_put_vlocation(vlocation);
-                up_write(&cell->vl_sem);
-        }
-} /* end afs_put_vlocation() */
-/*****************************************************************************/
 /*
- * timeout vlocation record
+ * queue a vlocation record for updates
- * - removes from the cell's graveyard if the usage count is zero
 */
-void afs_vlocation_do_timeout(struct afs_vlocation *vlocation)
+void afs_vlocation_queue_for_updates(struct afs_vlocation *vl)
 {
-        struct afs_cell *cell;
+        struct afs_vlocation *xvl;
-        _enter("%s", vlocation->vldb.name);
+        /* wait at least 10 minutes before updating... */
+        vl->update_at = get_seconds() + afs_vlocation_update_timeout;
-        cell = vlocation->cell;
+        spin_lock(&afs_vlocation_updates_lock);
-        BUG_ON(atomic_read(&vlocation->usage) < 0);
+        if (!list_empty(&afs_vlocation_updates)) {
+                /* ... but wait at least 1 second more than the newest record
-        /* remove from graveyard if still dead */
+                 * already queued so that we don't spam the VL server suddenly
-        spin_lock(&cell->vl_gylock);
+                 * with lots of requests
-        if (atomic_read(&vlocation->usage) == 0)
+                 */
-                list_del_init(&vlocation->link);
+                xvl = list_entry(afs_vlocation_updates.prev,
-        else
+                                 struct afs_vlocation, update);
-                vlocation = NULL;
+                if (vl->update_at <= xvl->update_at)
-        spin_unlock(&cell->vl_gylock);
+                        vl->update_at = xvl->update_at + 1;
+        } else {
-        if (!vlocation) {
+                queue_delayed_work(afs_vlocation_update_worker,
-                _leave("");
+                                   &afs_vlocation_update,
-                return; /* resurrected */
+                                   afs_vlocation_update_timeout * HZ);
        }
-        /* we can now destroy it properly */
+        list_add_tail(&vl->update, &afs_vlocation_updates);
-#ifdef AFS_CACHING_SUPPORT
+        spin_unlock(&afs_vlocation_updates_lock);
-        cachefs_relinquish_cookie(vlocation->cache, 0);
+}
-#endif
-        afs_put_cell(cell);
-        kfree(vlocation);
-        _leave(" [destroyed]");
-} /* end afs_vlocation_do_timeout() */
-/*****************************************************************************/
 /*
- * send an update operation to the currently selected server
+ * lookup volume location
+ * - iterate through the VL servers in a cell until one of them admits knowing
+ *   about the volume in question
+ * - lookup in the local cache if not able to find on the VL server
+ * - insert/update in the local cache if did get a VL response
 */
-static int afs_vlocation_update_begin(struct afs_vlocation *vlocation)
+struct afs_vlocation *afs_vlocation_lookup(struct afs_cell *cell,
+                                           struct key *key,
+                                           const char *name,
+                                           size_t namesz)
 {
-        afs_voltype_t voltype;
+        struct afs_vlocation *vl;
-        afs_volid_t vid;
        int ret;
-        _enter("%s{ufs=%u ucs=%u}",
+        _enter("{%s},{%x},%*.*s,%zu",
-               vlocation->vldb.name,
+               cell->name, key_serial(key),
-               vlocation->upd_first_svix,
+               (int) namesz, (int) namesz, name, namesz);
-               vlocation->upd_curr_svix);
-        /* try to look up a cached volume in the cell VL databases by ID */
+        if (namesz > sizeof(vl->vldb.name)) {
-        if (vlocation->vldb.vidmask & AFS_VOL_VTM_RW) {
+                _leave(" = -ENAMETOOLONG");
-                vid = vlocation->vldb.vid[0];
+                return ERR_PTR(-ENAMETOOLONG);
-                voltype = AFSVL_RWVOL;
-        }
-        else if (vlocation->vldb.vidmask & AFS_VOL_VTM_RO) {
-                vid = vlocation->vldb.vid[1];
-                voltype = AFSVL_ROVOL;
        }
-        else if (vlocation->vldb.vidmask & AFS_VOL_VTM_BAK) {
-                vid = vlocation->vldb.vid[2];
+        /* see if we have an in-memory copy first */
-                voltype = AFSVL_BACKVOL;
+        down_write(&cell->vl_sem);
+        spin_lock(&cell->vl_lock);
+        list_for_each_entry(vl, &cell->vl_list, link) {
+                if (vl->vldb.name[namesz] != '\0')
+                        continue;
+                if (memcmp(vl->vldb.name, name, namesz) == 0)
+                        goto found_in_memory;
        }
-        else {
+        spin_unlock(&cell->vl_lock);
-                BUG();
-                vid = 0;
+        /* not in the cell's in-memory lists - create a new record */
-                voltype = 0;
+        vl = afs_vlocation_alloc(cell, name, namesz);
+        if (!vl) {
+                up_write(&cell->vl_sem);
+                return ERR_PTR(-ENOMEM);
        }
-        /* contact the chosen server */
+        afs_get_cell(cell);
-        ret = afs_server_lookup(
-                vlocation->cell,
-                &vlocation->cell->vl_addrs[vlocation->upd_curr_svix],
-                &vlocation->upd_op.server);
-        switch (ret) {
+        list_add_tail(&vl->link, &cell->vl_list);
-        case 0:
+        vl->state = AFS_VL_CREATING;
-                break;
+        up_write(&cell->vl_sem);
-        case -ENOMEM:
-        case -ENONET:
-        default:
-                _leave(" = %d", ret);
-                return ret;
-        }
-        /* initiate the update operation */
+fill_in_record:
-        ret = afs_rxvl_get_entry_by_id_async(&vlocation->upd_op, vid, voltype);
+        ret = afs_vlocation_fill_in_record(vl, key);
-        if (ret < 0) {
+        if (ret < 0)
-                _leave(" = %d", ret);
+                goto error_abandon;
-                return ret;
+        spin_lock(&vl->lock);
+        vl->state = AFS_VL_VALID;
+        spin_unlock(&vl->lock);
+        wake_up(&vl->waitq);
+        /* schedule for regular updates */
+        afs_vlocation_queue_for_updates(vl);
+        goto success;
+found_in_memory:
+        /* found in memory */
+        _debug("found in memory");
+        atomic_inc(&vl->usage);
+        spin_unlock(&cell->vl_lock);
+        if (!list_empty(&vl->grave)) {
+                spin_lock(&afs_vlocation_graveyard_lock);
+                list_del_init(&vl->grave);
+                spin_unlock(&afs_vlocation_graveyard_lock);
        }
+        up_write(&cell->vl_sem);
+        /* see if it was an abandoned record that we might try filling in */
+        spin_lock(&vl->lock);
+        while (vl->state != AFS_VL_VALID) {
+                afs_vlocation_state_t state = vl->state;
+                _debug("invalid [state %d]", state);
+                if (state == AFS_VL_NEW || state == AFS_VL_NO_VOLUME) {
+                        vl->state = AFS_VL_CREATING;
+                        spin_unlock(&vl->lock);
+                        goto fill_in_record;
+                }
+                /* must now wait for creation or update by someone else to
+                 * complete */
+                _debug("wait");
+                spin_unlock(&vl->lock);
+                ret = wait_event_interruptible(vl->waitq,
+                                               vl->state == AFS_VL_NEW ||
+                                               vl->state == AFS_VL_VALID ||
+                                               vl->state == AFS_VL_NO_VOLUME);
+                if (ret < 0)
+                        goto error;
+                spin_lock(&vl->lock);
+        }
+        spin_unlock(&vl->lock);
+success:
+        _leave(" = %p",vl);
+        return vl;
+error_abandon:
+        spin_lock(&vl->lock);
+        vl->state = AFS_VL_NEW;
+        spin_unlock(&vl->lock);
+        wake_up(&vl->waitq);
+error:
+        ASSERT(vl != NULL);
+        afs_put_vlocation(vl);
        _leave(" = %d", ret);
-        return ret;
+        return ERR_PTR(ret);
-} /* end afs_vlocation_update_begin() */
+}
-/*****************************************************************************/
 /*
- * abandon updating a VL record
+ * finish using a volume location record
- * - does not restart the update timer
 */
-static void afs_vlocation_update_abandon(struct afs_vlocation *vlocation,
+void afs_put_vlocation(struct afs_vlocation *vl)
-                                         afs_vlocation_upd_t state,
-                                         int ret)
 {
-        _enter("%s,%u", vlocation->vldb.name, state);
+        if (!vl)
+                return;
-        if (ret < 0)
-                printk("kAFS: Abandoning VL update '%s': %d\n",
-                       vlocation->vldb.name, ret);
-        /* discard the server record */
-        afs_put_server(vlocation->upd_op.server);
-        vlocation->upd_op.server = NULL;
-        spin_lock(&afs_vlocation_update_lock);
+        _enter("%s", vl->vldb.name);
-        afs_vlocation_update = NULL;
-        vlocation->upd_state = state;
-        /* TODO: start updating next VL record on pending list */
+        ASSERTCMP(atomic_read(&vl->usage), >, 0);
-        spin_unlock(&afs_vlocation_update_lock);
+        if (likely(!atomic_dec_and_test(&vl->usage))) {
+                _leave("");
+                return;
+        }
-        _leave("");
+        spin_lock(&afs_vlocation_graveyard_lock);
-} /* end afs_vlocation_update_abandon() */
+        if (atomic_read(&vl->usage) == 0) {
+                _debug("buried");
+                list_move_tail(&vl->grave, &afs_vlocation_graveyard);
+                vl->time_of_death = get_seconds();
+                schedule_delayed_work(&afs_vlocation_reap,
+                                      afs_vlocation_timeout * HZ);
+                /* suspend updates on this record */
+                if (!list_empty(&vl->update)) {
+                        spin_lock(&afs_vlocation_updates_lock);
+                        list_del_init(&vl->update);
+                        spin_unlock(&afs_vlocation_updates_lock);
+                }
+        }
+        spin_unlock(&afs_vlocation_graveyard_lock);
+        _leave(" [killed?]");
+}
-/*****************************************************************************/
 /*
- * handle periodic update timeouts and busy retry timeouts
+ * destroy a dead volume location record
- * - called from kafstimod
 */
-static void afs_vlocation_update_timer(struct afs_timer *timer)
+static void afs_vlocation_destroy(struct afs_vlocation *vl)
 {
-        struct afs_vlocation *vlocation =
+        _enter("%p", vl);
-                list_entry(timer, struct afs_vlocation, upd_timer);
-        int ret;
-        _enter("%s", vlocation->vldb.name);
+#ifdef AFS_CACHING_SUPPORT
+        cachefs_relinquish_cookie(vl->cache, 0);
+#endif
-        /* only update if not in the graveyard (defend against putting too) */
+        afs_put_cell(vl->cell);
-        spin_lock(&vlocation->cell->vl_gylock);
+        kfree(vl);
+}
-        if (!atomic_read(&vlocation->usage))
+/*
-                goto out_unlock1;
+ * reap dead volume location records
+ */
+static void afs_vlocation_reaper(struct work_struct *work)
+{
+        LIST_HEAD(corpses);
+        struct afs_vlocation *vl;
+        unsigned long delay, expiry;
+        time_t now;
-        spin_lock(&afs_vlocation_update_lock);
+        _enter("");
-        /* if we were woken up due to EBUSY sleep then restart immediately if
+        now = get_seconds();
-         * possible or else jump to front of pending queue */
+        spin_lock(&afs_vlocation_graveyard_lock);
-        if (vlocation->upd_state == AFS_VLUPD_BUSYSLEEP) {
-                if (afs_vlocation_update) {
+        while (!list_empty(&afs_vlocation_graveyard)) {
-                        list_add(&vlocation->upd_op.link,
+                vl = list_entry(afs_vlocation_graveyard.next,
-                                 &afs_vlocation_update_pendq);
+                                struct afs_vlocation, grave);
+                _debug("check %p", vl);
+                /* the queue is ordered most dead first */
+                expiry = vl->time_of_death + afs_vlocation_timeout;
+                if (expiry > now) {
+                        delay = (expiry - now) * HZ;
+                        _debug("delay %lu", delay);
+                        if (!schedule_delayed_work(&afs_vlocation_reap,
+                                                   delay)) {
+                                cancel_delayed_work(&afs_vlocation_reap);
+                                schedule_delayed_work(&afs_vlocation_reap,
+                                                      delay);
+                        }
+                        break;
                }
-                else {
-                        afs_get_vlocation(vlocation);
+                spin_lock(&vl->cell->vl_lock);
-                        afs_vlocation_update = vlocation;
+                if (atomic_read(&vl->usage) > 0) {
-                        vlocation->upd_state = AFS_VLUPD_INPROGRESS;
+                        _debug("no reap");
+                        list_del_init(&vl->grave);
+                } else {
+                        _debug("reap");
+                        list_move_tail(&vl->grave, &corpses);
+                        list_del_init(&vl->link);
                }
-                goto out_unlock2;
+                spin_unlock(&vl->cell->vl_lock);
        }
-        /* put on pending queue if there's already another update in progress */
+        spin_unlock(&afs_vlocation_graveyard_lock);
-        if (afs_vlocation_update) {
-                vlocation->upd_state = AFS_VLUPD_PENDING;
-                list_add_tail(&vlocation->upd_op.link,
-                              &afs_vlocation_update_pendq);
-                goto out_unlock2;
-        }
-        /* hold a ref on it while actually updating */
+        /* now reap the corpses we've extracted */
-        afs_get_vlocation(vlocation);
+        while (!list_empty(&corpses)) {
-        afs_vlocation_update = vlocation;
+                vl = list_entry(corpses.next, struct afs_vlocation, grave);
-        vlocation->upd_state = AFS_VLUPD_INPROGRESS;
+                list_del(&vl->grave);
+                afs_vlocation_destroy(vl);
-        spin_unlock(&afs_vlocation_update_lock);
-        spin_unlock(&vlocation->cell->vl_gylock);
-        /* okay... we can start the update */
-        _debug("BEGIN VL UPDATE [%s]", vlocation->vldb.name);
-        vlocation->upd_first_svix = vlocation->cell->vl_curr_svix;
-        vlocation->upd_curr_svix = vlocation->upd_first_svix;
-        vlocation->upd_rej_cnt = 0;
-        vlocation->upd_busy_cnt = 0;
-        ret = afs_vlocation_update_begin(vlocation);
-        if (ret < 0) {
-                afs_vlocation_update_abandon(vlocation, AFS_VLUPD_SLEEP, ret);
-                afs_kafstimod_add_timer(&vlocation->upd_timer,
-                                        AFS_VLDB_TIMEOUT);
-                afs_put_vlocation(vlocation);
        }
        _leave("");
-        return;
+}
- out_unlock2:
+/*
-        spin_unlock(&afs_vlocation_update_lock);
+ * initialise the VL update process
- out_unlock1:
+ */
-        spin_unlock(&vlocation->cell->vl_gylock);
+int __init afs_vlocation_update_init(void)
-        _leave("");
+{
-        return;
+        afs_vlocation_update_worker =
+                create_singlethread_workqueue("kafs_vlupdated");
+        return afs_vlocation_update_worker ? 0 : -ENOMEM;
+}
-} /* end afs_vlocation_update_timer() */
+/*
+ * discard all the volume location records for rmmod
+ */
+void __exit afs_vlocation_purge(void)
+{
+        afs_vlocation_timeout = 0;
+        spin_lock(&afs_vlocation_updates_lock);
+        list_del_init(&afs_vlocation_updates);
+        spin_unlock(&afs_vlocation_updates_lock);
+        cancel_delayed_work(&afs_vlocation_update);
+        queue_delayed_work(afs_vlocation_update_worker,
+                           &afs_vlocation_update, 0);
+        destroy_workqueue(afs_vlocation_update_worker);
+        cancel_delayed_work(&afs_vlocation_reap);
+        schedule_delayed_work(&afs_vlocation_reap, 0);
+}
-/*****************************************************************************/
 /*
- * attend to an update operation upon which an event happened
+ * update a volume location
- * - called in kafsasyncd context
 */
-static void afs_vlocation_update_attend(struct afs_async_op *op)
+static void afs_vlocation_updater(struct work_struct *work)
 {
        struct afs_cache_vlocation vldb;
-        struct afs_vlocation *vlocation =
+        struct afs_vlocation *vl, *xvl;
-                list_entry(op, struct afs_vlocation, upd_op);
+        time_t now;
-        unsigned tmp;
+        long timeout;
        int ret;
-        _enter("%s", vlocation->vldb.name);
+        _enter("");
-        ret = afs_rxvl_get_entry_by_id_async2(op, &vldb);
-        switch (ret) {
-        case -EAGAIN:
-                _leave(" [unfinished]");
-                return;
-        case 0:
-                _debug("END VL UPDATE: %d\n", ret);
-                vlocation->valid = 1;
-                _debug("Done VL Lookup: %02x { %08x(%x) %08x(%x) %08x(%x) }",
-                       vldb.vidmask,
-                       ntohl(vldb.servers[0].s_addr), vldb.srvtmask[0],
-                       ntohl(vldb.servers[1].s_addr), vldb.srvtmask[1],
-                       ntohl(vldb.servers[2].s_addr), vldb.srvtmask[2]
-                       );
-                _debug("Vids: %08x %08x %08x",
-                       vldb.vid[0], vldb.vid[1], vldb.vid[2]);
-                afs_vlocation_update_abandon(vlocation, AFS_VLUPD_SLEEP, 0);
-                down_write(&vlocation->cell->vl_sem);
-                /* actually update the cache */
-                if (strncmp(vldb.name, vlocation->vldb.name,
-                            sizeof(vlocation->vldb.name)) != 0)
-                        printk("kAFS: name of volume '%s'"
-                               " changed to '%s' on server\n",
-                               vlocation->vldb.name, vldb.name);
-                memcpy(&vlocation->vldb, &vldb, sizeof(vlocation->vldb));
-#if 0
-                /* TODO update volume entry in local cache */
-#endif
-                up_write(&vlocation->cell->vl_sem);
-                if (ret < 0)
-                        printk("kAFS: failed to update local cache: %d\n", ret);
-                afs_kafstimod_add_timer(&vlocation->upd_timer,
-                                        AFS_VLDB_TIMEOUT);
-                afs_put_vlocation(vlocation);
-                _leave(" [found]");
-                return;
-        case -ENOMEDIUM:
-                vlocation->upd_rej_cnt++;
-                goto try_next;
-                /* the server is locked - retry in a very short while */
-        case -EBUSY:
-                vlocation->upd_busy_cnt++;
-                if (vlocation->upd_busy_cnt > 3)
-                        goto try_next; /* too many retries */
-                afs_vlocation_update_abandon(vlocation,
-                                             AFS_VLUPD_BUSYSLEEP, 0);
-                afs_kafstimod_add_timer(&vlocation->upd_timer, HZ / 2);
-                afs_put_vlocation(vlocation);
-                _leave(" [busy]");
-                return;
-        case -ENETUNREACH:
-        case -EHOSTUNREACH:
-        case -ECONNREFUSED:
-        case -EREMOTEIO:
-                /* record bad vlserver info in the cell too
-                 * - TODO: use down_write_trylock() if available
-                 */
-                if (vlocation->upd_curr_svix == vlocation->cell->vl_curr_svix)
-                        vlocation->cell->vl_curr_svix =
-                                vlocation->cell->vl_curr_svix %
-                                vlocation->cell->vl_naddrs;
-        case -EBADRQC:
-        case -EINVAL:
-        case -EACCES:
-        case -EBADMSG:
-                goto try_next;
-        default:
-                goto abandon;
-        }
-        /* try contacting the next server */
- try_next:
-        vlocation->upd_busy_cnt = 0;
-        /* discard the server record */
-        afs_put_server(vlocation->upd_op.server);
-        vlocation->upd_op.server = NULL;
-        tmp = vlocation->cell->vl_naddrs;
+        now = get_seconds();
-        if (tmp == 0)
-                goto abandon;
-        vlocation->upd_curr_svix++;
+        /* find a record to update */
-        if (vlocation->upd_curr_svix >= tmp)
+        spin_lock(&afs_vlocation_updates_lock);
-                vlocation->upd_curr_svix = 0;
+        for (;;) {
-        if (vlocation->upd_first_svix >= tmp)
+                if (list_empty(&afs_vlocation_updates)) {
-                vlocation->upd_first_svix = tmp - 1;
+                        spin_unlock(&afs_vlocation_updates_lock);
+                        _leave(" [nothing]");
+                        return;
+                }
-        /* move to the next server */
+                vl = list_entry(afs_vlocation_updates.next,
-        if (vlocation->upd_curr_svix != vlocation->upd_first_svix) {
+                                struct afs_vlocation, update);
-                afs_vlocation_update_begin(vlocation);
+                if (atomic_read(&vl->usage) > 0)
-                _leave(" [next]");
+                        break;
-                return;
+                list_del_init(&vl->update);
        }
-        /* run out of servers to try - was the volume rejected? */
+        timeout = vl->update_at - now;
-        if (vlocation->upd_rej_cnt > 0) {
+        if (timeout > 0) {
-                printk("kAFS: Active volume no longer valid '%s'\n",
+                queue_delayed_work(afs_vlocation_update_worker,
-                       vlocation->vldb.name);
+                                   &afs_vlocation_update, timeout * HZ);
-                vlocation->valid = 0;
+                spin_unlock(&afs_vlocation_updates_lock);
-                afs_vlocation_update_abandon(vlocation, AFS_VLUPD_SLEEP, 0);
+                _leave(" [nothing]");
-                afs_kafstimod_add_timer(&vlocation->upd_timer,
-                                        AFS_VLDB_TIMEOUT);
-                afs_put_vlocation(vlocation);
-                _leave(" [invalidated]");
                return;
        }
-        /* abandon the update */
+        list_del_init(&vl->update);
- abandon:
+        atomic_inc(&vl->usage);
-        afs_vlocation_update_abandon(vlocation, AFS_VLUPD_SLEEP, ret);
+        spin_unlock(&afs_vlocation_updates_lock);
-        afs_kafstimod_add_timer(&vlocation->upd_timer, HZ * 10);
-        afs_put_vlocation(vlocation);
-        _leave(" [abandoned]");
-} /* end afs_vlocation_update_attend() */
-/*****************************************************************************/
-/*
- * deal with an update operation being discarded
- * - called in kafsasyncd context when it's dying due to rmmod
- * - the call has already been aborted and put()'d
- */
-static void afs_vlocation_update_discard(struct afs_async_op *op)
-{
-        struct afs_vlocation *vlocation =
-                list_entry(op, struct afs_vlocation, upd_op);
-        _enter("%s", vlocation->vldb.name);
+        /* we can now perform the update */
+        _debug("update %s", vl->vldb.name);
+        vl->state = AFS_VL_UPDATING;
+        vl->upd_rej_cnt = 0;
+        vl->upd_busy_cnt = 0;
-        afs_put_server(op->server);
+        ret = afs_vlocation_update_record(vl, NULL, &vldb);
-        op->server = NULL;
+        spin_lock(&vl->lock);
+        switch (ret) {
+        case 0:
+                afs_vlocation_apply_update(vl, &vldb);
+                vl->state = AFS_VL_VALID;
+                break;
+        case -ENOMEDIUM:
+                vl->state = AFS_VL_VOLUME_DELETED;
+                break;
+        default:
+                vl->state = AFS_VL_UNCERTAIN;
+                break;
+        }
+        spin_unlock(&vl->lock);
+        wake_up(&vl->waitq);
-        afs_put_vlocation(vlocation);
+        /* and then reschedule */
+        _debug("reschedule");
+        vl->update_at = get_seconds() + afs_vlocation_update_timeout;
-        _leave("");
+        spin_lock(&afs_vlocation_updates_lock);
-} /* end afs_vlocation_update_discard() */
-/*****************************************************************************/
+        if (!list_empty(&afs_vlocation_updates)) {
-/*
+                /* next update in 10 minutes, but wait at least 1 second more
- * match a VLDB record stored in the cache
+                 * than the newest record already queued so that we don't spam
- * - may also load target from entry
+                 * the VL server suddenly with lots of requests
- */
+                 */
-#ifdef AFS_CACHING_SUPPORT
+                xvl = list_entry(afs_vlocation_updates.prev,
-static cachefs_match_val_t afs_vlocation_cache_match(void *target,
+                                 struct afs_vlocation, update);
-                                                     const void *entry)
+                if (vl->update_at <= xvl->update_at)
-{
+                        vl->update_at = xvl->update_at + 1;
-        const struct afs_cache_vlocation *vldb = entry;
+                xvl = list_entry(afs_vlocation_updates.next,
-        struct afs_vlocation *vlocation = target;
+                                 struct afs_vlocation, update);
+                timeout = xvl->update_at - now;
-        _enter("{%s},{%s}", vlocation->vldb.name, vldb->name);
+                if (timeout < 0)
+                        timeout = 0;
-        if (strncmp(vlocation->vldb.name, vldb->name, sizeof(vldb->name)) == 0
+        } else {
-            ) {
+                timeout = afs_vlocation_update_timeout;
-                if (!vlocation->valid ||
-                    vlocation->vldb.rtime == vldb->rtime
-                    ) {
-                        vlocation->vldb = *vldb;
-                        vlocation->valid = 1;
-                        _leave(" = SUCCESS [c->m]");
-                        return CACHEFS_MATCH_SUCCESS;
-                }
-                /* need to update cache if cached info differs */
-                else if (memcmp(&vlocation->vldb, vldb, sizeof(*vldb)) != 0) {
-                        /* delete if VIDs for this name differ */
-                        if (memcmp(&vlocation->vldb.vid,
-                                   &vldb->vid,
-                                   sizeof(vldb->vid)) != 0) {
-                                _leave(" = DELETE");
-                                return CACHEFS_MATCH_SUCCESS_DELETE;
-                        }
-                        _leave(" = UPDATE");
-                        return CACHEFS_MATCH_SUCCESS_UPDATE;
-                }
-                else {
-                        _leave(" = SUCCESS");
-                        return CACHEFS_MATCH_SUCCESS;
-                }
        }
-        _leave(" = FAILED");
+        ASSERT(list_empty(&vl->update));
-        return CACHEFS_MATCH_FAILED;
-} /* end afs_vlocation_cache_match() */
-#endif
-/*****************************************************************************/
-/*
- * update a VLDB record stored in the cache
- */
-#ifdef AFS_CACHING_SUPPORT
-static void afs_vlocation_cache_update(void *source, void *entry)
-{
-        struct afs_cache_vlocation *vldb = entry;
-        struct afs_vlocation *vlocation = source;
-        _enter("");
+        list_add_tail(&vl->update, &afs_vlocation_updates);
-        *vldb = vlocation->vldb;
-} /* end afs_vlocation_cache_update() */
+        _debug("timeout %ld", timeout);
-#endif
+        queue_delayed_work(afs_vlocation_update_worker,
+                           &afs_vlocation_update, timeout * HZ);
+        spin_unlock(&afs_vlocation_updates_lock);
+        afs_put_vlocation(vl);
+}
diff --git a/fs/afs/vnode.c b/fs/afs/vnode.c
index cf62da5d7825..a1904ab8426a 100644
--- a/fs/afs/vnode.c
+++ b/fs/afs/vnode.c
@@ -1,6 +1,6 @@
-/* vnode.c: AFS vnode management
+/* AFS vnode management
 *
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 *
 * This program is free software; you can redistribute it and/or
@@ -14,142 +14,237 @@
 #include <linux/init.h>
 #include <linux/slab.h>
 #include <linux/fs.h>
-#include <linux/pagemap.h>
-#include "volume.h"
-#include "cell.h"
-#include "cmservice.h"
-#include "fsclient.h"
-#include "vlclient.h"
-#include "vnode.h"
 #include "internal.h"
-static void afs_vnode_cb_timed_out(struct afs_timer *timer);
+#if 0
+static noinline bool dump_tree_aux(struct rb_node *node, struct rb_node *parent,
+                                   int depth, char lr)
+{
+        struct afs_vnode *vnode;
+        bool bad = false;
+        if (!node)
+                return false;
+        if (node->rb_left)
+                bad = dump_tree_aux(node->rb_left, node, depth + 2, '/');
+        vnode = rb_entry(node, struct afs_vnode, cb_promise);
+        _debug("%c %*.*s%c%p {%d}",
+               rb_is_red(node) ? 'R' : 'B',
+               depth, depth, "", lr,
+               vnode, vnode->cb_expires_at);
+        if (rb_parent(node) != parent) {
+                printk("BAD: %p != %p\n", rb_parent(node), parent);
+                bad = true;
+        }
-struct afs_timer_ops afs_vnode_cb_timed_out_ops = {
+        if (node->rb_right)
-        .timed_out      = afs_vnode_cb_timed_out,
+                bad |= dump_tree_aux(node->rb_right, node, depth + 2, '\\');
-};
-#ifdef AFS_CACHING_SUPPORT
+        return bad;
-static cachefs_match_val_t afs_vnode_cache_match(void *target,
+}
-                                                 const void *entry);
-static void afs_vnode_cache_update(void *source, void *entry);
-struct cachefs_index_def afs_vnode_cache_index_def = {
+static noinline void dump_tree(const char *name, struct afs_server *server)
-        .name           = "vnode",
+{
-        .data_size      = sizeof(struct afs_cache_vnode),
+        _enter("%s", name);
-        .keys[0]        = { CACHEFS_INDEX_KEYS_BIN, 4 },
+        if (dump_tree_aux(server->cb_promises.rb_node, NULL, 0, '-'))
-        .match          = afs_vnode_cache_match,
+                BUG();
-        .update         = afs_vnode_cache_update,
+}
-};
 #endif
-/*****************************************************************************/
 /*
- * handle a callback timing out
+ * insert a vnode into the backing server's vnode tree
- * TODO: retain a ref to vnode struct for an outstanding callback timeout
 */
-static void afs_vnode_cb_timed_out(struct afs_timer *timer)
+static void afs_install_vnode(struct afs_vnode *vnode,
+                              struct afs_server *server)
 {
-        struct afs_server *oldserver;
+        struct afs_server *old_server = vnode->server;
-        struct afs_vnode *vnode;
+        struct afs_vnode *xvnode;
+        struct rb_node *parent, **p;
-        vnode = list_entry(timer, struct afs_vnode, cb_timeout);
+        _enter("%p,%p", vnode, server);
-        _enter("%p", vnode);
+        if (old_server) {
+                spin_lock(&old_server->fs_lock);
+                rb_erase(&vnode->server_rb, &old_server->fs_vnodes);
+                spin_unlock(&old_server->fs_lock);
+        }
-        /* set the changed flag in the vnode and release the server */
+        afs_get_server(server);
-        spin_lock(&vnode->lock);
+        vnode->server = server;
+        afs_put_server(old_server);
+        /* insert into the server's vnode tree in FID order */
+        spin_lock(&server->fs_lock);
+        parent = NULL;
+        p = &server->fs_vnodes.rb_node;
+        while (*p) {
+                parent = *p;
+                xvnode = rb_entry(parent, struct afs_vnode, server_rb);
+                if (vnode->fid.vid < xvnode->fid.vid)
+                        p = &(*p)->rb_left;
+                else if (vnode->fid.vid > xvnode->fid.vid)
+                        p = &(*p)->rb_right;
+                else if (vnode->fid.vnode < xvnode->fid.vnode)
+                        p = &(*p)->rb_left;
+                else if (vnode->fid.vnode > xvnode->fid.vnode)
+                        p = &(*p)->rb_right;
+                else if (vnode->fid.unique < xvnode->fid.unique)
+                        p = &(*p)->rb_left;
+                else if (vnode->fid.unique > xvnode->fid.unique)
+                        p = &(*p)->rb_right;
+                else
+                        BUG(); /* can't happen unless afs_iget() malfunctions */
+        }
+        rb_link_node(&vnode->server_rb, parent, p);
+        rb_insert_color(&vnode->server_rb, &server->fs_vnodes);
-        oldserver = xchg(&vnode->cb_server, NULL);
+        spin_unlock(&server->fs_lock);
-        if (oldserver) {
+        _leave("");
-                vnode->flags |= AFS_VNODE_CHANGED;
+}
-                spin_lock(&afs_cb_hash_lock);
+/*
-                list_del_init(&vnode->cb_hash_link);
+ * insert a vnode into the promising server's update/expiration tree
-                spin_unlock(&afs_cb_hash_lock);
+ * - caller must hold vnode->lock
+ */
+static void afs_vnode_note_promise(struct afs_vnode *vnode,
+                                   struct afs_server *server)
+{
+        struct afs_server *old_server;
+        struct afs_vnode *xvnode;
+        struct rb_node *parent, **p;
-                spin_lock(&oldserver->cb_lock);
+        _enter("%p,%p", vnode, server);
-                list_del_init(&vnode->cb_link);
-                spin_unlock(&oldserver->cb_lock);
+        ASSERT(server != NULL);
+        old_server = vnode->server;
+        if (vnode->cb_promised) {
+                if (server == old_server &&
+                    vnode->cb_expires == vnode->cb_expires_at) {
+                        _leave(" [no change]");
+                        return;
+                }
+                spin_lock(&old_server->cb_lock);
+                if (vnode->cb_promised) {
+                        _debug("delete");
+                        rb_erase(&vnode->cb_promise, &old_server->cb_promises);
+                        vnode->cb_promised = false;
+                }
+                spin_unlock(&old_server->cb_lock);
        }
-        spin_unlock(&vnode->lock);
+        if (vnode->server != server)
+                afs_install_vnode(vnode, server);
+        vnode->cb_expires_at = vnode->cb_expires;
+        _debug("PROMISE on %p {%lu}",
+               vnode, (unsigned long) vnode->cb_expires_at);
+        /* abuse an RB-tree to hold the expiration order (we may have multiple
+         * items with the same expiration time) */
+        spin_lock(&server->cb_lock);
+        parent = NULL;
+        p = &server->cb_promises.rb_node;
+        while (*p) {
+                parent = *p;
+                xvnode = rb_entry(parent, struct afs_vnode, cb_promise);
+                if (vnode->cb_expires_at < xvnode->cb_expires_at)
+                        p = &(*p)->rb_left;
+                else
+                        p = &(*p)->rb_right;
+        }
-        afs_put_server(oldserver);
+        rb_link_node(&vnode->cb_promise, parent, p);
+        rb_insert_color(&vnode->cb_promise, &server->cb_promises);
+        vnode->cb_promised = true;
+        spin_unlock(&server->cb_lock);
        _leave("");
-} /* end afs_vnode_cb_timed_out() */
+}
-/*****************************************************************************/
 /*
- * finish off updating the recorded status of a file
+ * handle remote file deletion by discarding the callback promise
+ */
+static void afs_vnode_deleted_remotely(struct afs_vnode *vnode)
+{
+        struct afs_server *server;
+        set_bit(AFS_VNODE_DELETED, &vnode->flags);
+        server = vnode->server;
+        if (vnode->cb_promised) {
+                spin_lock(&server->cb_lock);
+                if (vnode->cb_promised) {
+                        rb_erase(&vnode->cb_promise, &server->cb_promises);
+                        vnode->cb_promised = false;
+                }
+                spin_unlock(&server->cb_lock);
+        }
+        spin_lock(&vnode->server->fs_lock);
+        rb_erase(&vnode->server_rb, &vnode->server->fs_vnodes);
+        spin_unlock(&vnode->server->fs_lock);
+        vnode->server = NULL;
+        afs_put_server(server);
+}
+/*
+ * finish off updating the recorded status of a file after a successful
+ * operation completion
 * - starts callback expiry timer
 * - adds to server's callback list
 */
-static void afs_vnode_finalise_status_update(struct afs_vnode *vnode,
+void afs_vnode_finalise_status_update(struct afs_vnode *vnode,
-                                             struct afs_server *server,
+                                      struct afs_server *server)
-                                             int ret)
 {
        struct afs_server *oldserver = NULL;
-        _enter("%p,%p,%d", vnode, server, ret);
+        _enter("%p,%p", vnode, server);
        spin_lock(&vnode->lock);
+        clear_bit(AFS_VNODE_CB_BROKEN, &vnode->flags);
+        afs_vnode_note_promise(vnode, server);
+        vnode->update_cnt--;
+        ASSERTCMP(vnode->update_cnt, >=, 0);
+        spin_unlock(&vnode->lock);
+        wake_up_all(&vnode->update_waitq);
+        afs_put_server(oldserver);
+        _leave("");
+}
-        vnode->flags &= ~AFS_VNODE_CHANGED;
+/*
+ * finish off updating the recorded status of a file after an operation failed
+ */
+static void afs_vnode_status_update_failed(struct afs_vnode *vnode, int ret)
+{
+        _enter("%p,%d", vnode, ret);
-        if (ret == 0) {
+        spin_lock(&vnode->lock);
-                /* adjust the callback timeout appropriately */
-                afs_kafstimod_add_timer(&vnode->cb_timeout,
-                                        vnode->cb_expiry * HZ);
-                spin_lock(&afs_cb_hash_lock);
-                list_move_tail(&vnode->cb_hash_link,
-                              &afs_cb_hash(server, &vnode->fid));
-                spin_unlock(&afs_cb_hash_lock);
-                /* swap ref to old callback server with that for new callback
-                 * server */
-                oldserver = xchg(&vnode->cb_server, server);
-                if (oldserver != server) {
-                        if (oldserver) {
-                                spin_lock(&oldserver->cb_lock);
-                                list_del_init(&vnode->cb_link);
-                                spin_unlock(&oldserver->cb_lock);
-                        }
-                        afs_get_server(server);
+        clear_bit(AFS_VNODE_CB_BROKEN, &vnode->flags);
-                        spin_lock(&server->cb_lock);
-                        list_add_tail(&vnode->cb_link, &server->cb_promises);
-                        spin_unlock(&server->cb_lock);
-                }
-                else {
-                        /* same server */
-                        oldserver = NULL;
-                }
-        }
-        else if (ret == -ENOENT) {
-                /* the file was deleted - clear the callback timeout */
-                oldserver = xchg(&vnode->cb_server, NULL);
-                afs_kafstimod_del_timer(&vnode->cb_timeout);
+        if (ret == -ENOENT) {
+                /* the file was deleted on the server */
                _debug("got NOENT from server - marking file deleted");
-                vnode->flags |= AFS_VNODE_DELETED;
+                afs_vnode_deleted_remotely(vnode);
        }
        vnode->update_cnt--;
+        ASSERTCMP(vnode->update_cnt, >=, 0);
        spin_unlock(&vnode->lock);
        wake_up_all(&vnode->update_waitq);
-        afs_put_server(oldserver);
        _leave("");
+}
-} /* end afs_vnode_finalise_status_update() */
-/*****************************************************************************/
 /*
 * fetch file status from the volume
 * - don't issue a fetch if:
@@ -157,9 +252,11 @@ static void afs_vnode_finalise_status_update(struct afs_vnode *vnode,
 *   - there are any outstanding ops that will fetch the status
 * - TODO implement local caching
 */
-int afs_vnode_fetch_status(struct afs_vnode *vnode)
+int afs_vnode_fetch_status(struct afs_vnode *vnode,
+                           struct afs_vnode *auth_vnode, struct key *key)
 {
        struct afs_server *server;
+        unsigned long acl_order;
        int ret;
        DECLARE_WAITQUEUE(myself, current);
@@ -168,38 +265,49 @@ int afs_vnode_fetch_status(struct afs_vnode *vnode)
               vnode->volume->vlocation->vldb.name,
               vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique);
-        if (!(vnode->flags & AFS_VNODE_CHANGED) && vnode->cb_server) {
+        if (!test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags) &&
+            vnode->cb_promised) {
                _leave(" [unchanged]");
                return 0;
        }
-        if (vnode->flags & AFS_VNODE_DELETED) {
+        if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) {
                _leave(" [deleted]");
                return -ENOENT;
        }
+        acl_order = 0;
+        if (auth_vnode)
+                acl_order = auth_vnode->acl_order;
        spin_lock(&vnode->lock);
-        if (!(vnode->flags & AFS_VNODE_CHANGED)) {
+        if (!test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags) &&
+            vnode->cb_promised) {
                spin_unlock(&vnode->lock);
                _leave(" [unchanged]");
                return 0;
        }
+        ASSERTCMP(vnode->update_cnt, >=, 0);
        if (vnode->update_cnt > 0) {
                /* someone else started a fetch */
+                _debug("wait on fetch %d", vnode->update_cnt);
                set_current_state(TASK_UNINTERRUPTIBLE);
+                ASSERT(myself.func != NULL);
                add_wait_queue(&vnode->update_waitq, &myself);
                /* wait for the status to be updated */
                for (;;) {
-                        if (!(vnode->flags & AFS_VNODE_CHANGED))
+                        if (!test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags))
                                break;
-                        if (vnode->flags & AFS_VNODE_DELETED)
+                        if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
                                break;
-                        /* it got updated and invalidated all before we saw
+                        /* check to see if it got updated and invalidated all
-                         * it */
+                         * before we saw it */
                        if (vnode->update_cnt == 0) {
                                remove_wait_queue(&vnode->update_waitq,
                                                  &myself);
@@ -219,10 +327,11 @@ int afs_vnode_fetch_status(struct afs_vnode *vnode)
                spin_unlock(&vnode->lock);
                set_current_state(TASK_RUNNING);
-                return vnode->flags & AFS_VNODE_DELETED ? -ENOENT : 0;
+                return test_bit(AFS_VNODE_DELETED, &vnode->flags) ?
+                        -ENOENT : 0;
        }
- get_anyway:
+get_anyway:
        /* okay... we're going to have to initiate the op */
        vnode->update_cnt++;
@@ -232,39 +341,60 @@ int afs_vnode_fetch_status(struct afs_vnode *vnode)
         * vnode */
        do {
                /* pick a server to query */
-                ret = afs_volume_pick_fileserver(vnode->volume, &server);
+                server = afs_volume_pick_fileserver(vnode);
-                if (ret<0)
+                if (IS_ERR(server))
-                        return ret;
+                        goto no_server;
-                _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr));
+                _debug("USING SERVER: %p{%08x}",
+                       server, ntohl(server->addr.s_addr));
-                ret = afs_rxfs_fetch_file_status(server, vnode, NULL);
+                ret = afs_fs_fetch_file_status(server, key, vnode, NULL,
+                                               &afs_sync_call);
-        } while (!afs_volume_release_fileserver(vnode->volume, server, ret));
+        } while (!afs_volume_release_fileserver(vnode, server, ret));
        /* adjust the flags */
-        afs_vnode_finalise_status_update(vnode, server, ret);
+        if (ret == 0) {
+                _debug("adjust");
+                if (auth_vnode)
+                        afs_cache_permit(vnode, key, acl_order);
+                afs_vnode_finalise_status_update(vnode, server);
+                afs_put_server(server);
+        } else {
+                _debug("failed [%d]", ret);
+                afs_vnode_status_update_failed(vnode, ret);
+        }
-        _leave(" = %d", ret);
+        ASSERTCMP(vnode->update_cnt, >=, 0);
+        _leave(" = %d [cnt %d]", ret, vnode->update_cnt);
        return ret;
-} /* end afs_vnode_fetch_status() */
-/*****************************************************************************/
+no_server:
+        spin_lock(&vnode->lock);
+        vnode->update_cnt--;
+        ASSERTCMP(vnode->update_cnt, >=, 0);
+        spin_unlock(&vnode->lock);
+        _leave(" = %ld [cnt %d]", PTR_ERR(server), vnode->update_cnt);
+        return PTR_ERR(server);
+}
 /*
 * fetch file data from the volume
- * - TODO implement caching and server failover
+ * - TODO implement caching
 */
-int afs_vnode_fetch_data(struct afs_vnode *vnode,
+int afs_vnode_fetch_data(struct afs_vnode *vnode, struct key *key,
-                         struct afs_rxfs_fetch_descriptor *desc)
+                         off_t offset, size_t length, struct page *page)
 {
        struct afs_server *server;
        int ret;
-        _enter("%s,{%u,%u,%u}",
+        _enter("%s{%u,%u,%u},%x,,,",
               vnode->volume->vlocation->vldb.name,
               vnode->fid.vid,
               vnode->fid.vnode,
-               vnode->fid.unique);
+               vnode->fid.unique,
+               key_serial(key));
        /* this op will fetch the status */
        spin_lock(&vnode->lock);
@@ -275,120 +405,351 @@ int afs_vnode_fetch_data(struct afs_vnode *vnode,
         * vnode */
        do {
                /* pick a server to query */
-                ret = afs_volume_pick_fileserver(vnode->volume, &server);
+                server = afs_volume_pick_fileserver(vnode);
-                if (ret < 0)
+                if (IS_ERR(server))
-                        return ret;
+                        goto no_server;
                _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr));
-                ret = afs_rxfs_fetch_file_data(server, vnode, desc, NULL);
+                ret = afs_fs_fetch_data(server, key, vnode, offset, length,
+                                        page, &afs_sync_call);
-        } while (!afs_volume_release_fileserver(vnode->volume, server, ret));
+        } while (!afs_volume_release_fileserver(vnode, server, ret));
        /* adjust the flags */
-        afs_vnode_finalise_status_update(vnode, server, ret);
+        if (ret == 0) {
+                afs_vnode_finalise_status_update(vnode, server);
+                afs_put_server(server);
+        } else {
+                afs_vnode_status_update_failed(vnode, ret);
+        }
        _leave(" = %d", ret);
        return ret;
-} /* end afs_vnode_fetch_data() */
+no_server:
+        spin_lock(&vnode->lock);
+        vnode->update_cnt--;
+        ASSERTCMP(vnode->update_cnt, >=, 0);
+        spin_unlock(&vnode->lock);
+        return PTR_ERR(server);
+}
-/*****************************************************************************/
 /*
- * break any outstanding callback on a vnode
+ * make a file or a directory
- * - only relevent to server that issued it
 */
-int afs_vnode_give_up_callback(struct afs_vnode *vnode)
+int afs_vnode_create(struct afs_vnode *vnode, struct key *key,
+                     const char *name, umode_t mode, struct afs_fid *newfid,
+                     struct afs_file_status *newstatus,
+                     struct afs_callback *newcb, struct afs_server **_server)
 {
        struct afs_server *server;
        int ret;
-        _enter("%s,{%u,%u,%u}",
+        _enter("%s{%u,%u,%u},%x,%s,,",
               vnode->volume->vlocation->vldb.name,
               vnode->fid.vid,
               vnode->fid.vnode,
-               vnode->fid.unique);
+               vnode->fid.unique,
+               key_serial(key),
-        spin_lock(&afs_cb_hash_lock);
+               name);
-        list_del_init(&vnode->cb_hash_link);
-        spin_unlock(&afs_cb_hash_lock);
-        /* set the changed flag in the vnode and release the server */
+        /* this op will fetch the status on the directory we're creating in */
        spin_lock(&vnode->lock);
+        vnode->update_cnt++;
+        spin_unlock(&vnode->lock);
-        afs_kafstimod_del_timer(&vnode->cb_timeout);
+        do {
+                /* pick a server to query */
+                server = afs_volume_pick_fileserver(vnode);
+                if (IS_ERR(server))
+                        goto no_server;
+                _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr));
-        server = xchg(&vnode->cb_server, NULL);
+                ret = afs_fs_create(server, key, vnode, name, mode, newfid,
-        if (server) {
+                                    newstatus, newcb, &afs_sync_call);
-                vnode->flags |= AFS_VNODE_CHANGED;
-                spin_lock(&server->cb_lock);
+        } while (!afs_volume_release_fileserver(vnode, server, ret));
-                list_del_init(&vnode->cb_link);
-                spin_unlock(&server->cb_lock);
+        /* adjust the flags */
+        if (ret == 0) {
+                afs_vnode_finalise_status_update(vnode, server);
+                *_server = server;
+        } else {
+                afs_vnode_status_update_failed(vnode, ret);
+                *_server = NULL;
        }
+        _leave(" = %d [cnt %d]", ret, vnode->update_cnt);
+        return ret;
+no_server:
+        spin_lock(&vnode->lock);
+        vnode->update_cnt--;
+        ASSERTCMP(vnode->update_cnt, >=, 0);
        spin_unlock(&vnode->lock);
+        _leave(" = %ld [cnt %d]", PTR_ERR(server), vnode->update_cnt);
+        return PTR_ERR(server);
+}
-        ret = 0;
+/*
-        if (server) {
+ * remove a file or directory
-                ret = afs_rxfs_give_up_callback(server, vnode);
+ */
+int afs_vnode_remove(struct afs_vnode *vnode, struct key *key, const char *name,
+                     bool isdir)
+{
+        struct afs_server *server;
+        int ret;
+        _enter("%s{%u,%u,%u},%x,%s",
+               vnode->volume->vlocation->vldb.name,
+               vnode->fid.vid,
+               vnode->fid.vnode,
+               vnode->fid.unique,
+               key_serial(key),
+               name);
+        /* this op will fetch the status on the directory we're removing from */
+        spin_lock(&vnode->lock);
+        vnode->update_cnt++;
+        spin_unlock(&vnode->lock);
+        do {
+                /* pick a server to query */
+                server = afs_volume_pick_fileserver(vnode);
+                if (IS_ERR(server))
+                        goto no_server;
+                _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr));
+                ret = afs_fs_remove(server, key, vnode, name, isdir,
+                                    &afs_sync_call);
+        } while (!afs_volume_release_fileserver(vnode, server, ret));
+        /* adjust the flags */
+        if (ret == 0) {
+                afs_vnode_finalise_status_update(vnode, server);
                afs_put_server(server);
+        } else {
+                afs_vnode_status_update_failed(vnode, ret);
        }
-        _leave(" = %d", ret);
+        _leave(" = %d [cnt %d]", ret, vnode->update_cnt);
        return ret;
-} /* end afs_vnode_give_up_callback() */
-/*****************************************************************************/
+no_server:
+        spin_lock(&vnode->lock);
+        vnode->update_cnt--;
+        ASSERTCMP(vnode->update_cnt, >=, 0);
+        spin_unlock(&vnode->lock);
+        _leave(" = %ld [cnt %d]", PTR_ERR(server), vnode->update_cnt);
+        return PTR_ERR(server);
+}
 /*
- * match a vnode record stored in the cache
+ * create a hard link
 */
-#ifdef AFS_CACHING_SUPPORT
+extern int afs_vnode_link(struct afs_vnode *dvnode, struct afs_vnode *vnode,
-static cachefs_match_val_t afs_vnode_cache_match(void *target,
+                          struct key *key, const char *name)
-                                                 const void *entry)
 {
-        const struct afs_cache_vnode *cvnode = entry;
+        struct afs_server *server;
-        struct afs_vnode *vnode = target;
+        int ret;
-        _enter("{%x,%x,%Lx},{%x,%x,%Lx}",
+        _enter("%s{%u,%u,%u},%s{%u,%u,%u},%x,%s",
+               dvnode->volume->vlocation->vldb.name,
+               dvnode->fid.vid,
+               dvnode->fid.vnode,
+               dvnode->fid.unique,
+               vnode->volume->vlocation->vldb.name,
+               vnode->fid.vid,
               vnode->fid.vnode,
               vnode->fid.unique,
-               vnode->status.version,
+               key_serial(key),
-               cvnode->vnode_id,
+               name);
-               cvnode->vnode_unique,
-               cvnode->data_version);
+        /* this op will fetch the status on both the file and the directory */
+        spin_lock(&vnode->lock);
-        if (vnode->fid.vnode != cvnode->vnode_id) {
+        vnode->update_cnt++;
-                _leave(" = FAILED");
+        spin_unlock(&vnode->lock);
-                return CACHEFS_MATCH_FAILED;
+        spin_lock(&dvnode->lock);
+        dvnode->update_cnt++;
+        spin_unlock(&dvnode->lock);
+        do {
+                /* pick a server to query */
+                server = afs_volume_pick_fileserver(dvnode);
+                if (IS_ERR(server))
+                        goto no_server;
+                _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr));
+                ret = afs_fs_link(server, key, dvnode, vnode, name,
+                                  &afs_sync_call);
+        } while (!afs_volume_release_fileserver(dvnode, server, ret));
+        /* adjust the flags */
+        if (ret == 0) {
+                afs_vnode_finalise_status_update(vnode, server);
+                afs_vnode_finalise_status_update(dvnode, server);
+                afs_put_server(server);
+        } else {
+                afs_vnode_status_update_failed(vnode, ret);
+                afs_vnode_status_update_failed(dvnode, ret);
        }
-        if (vnode->fid.unique != cvnode->vnode_unique ||
+        _leave(" = %d [cnt %d]", ret, vnode->update_cnt);
-            vnode->status.version != cvnode->data_version) {
+        return ret;
-                _leave(" = DELETE");
-                return CACHEFS_MATCH_SUCCESS_DELETE;
+no_server:
+        spin_lock(&vnode->lock);
+        vnode->update_cnt--;
+        ASSERTCMP(vnode->update_cnt, >=, 0);
+        spin_unlock(&vnode->lock);
+        spin_lock(&dvnode->lock);
+        dvnode->update_cnt--;
+        ASSERTCMP(dvnode->update_cnt, >=, 0);
+        spin_unlock(&dvnode->lock);
+        _leave(" = %ld [cnt %d]", PTR_ERR(server), vnode->update_cnt);
+        return PTR_ERR(server);
+}
+/*
+ * create a symbolic link
+ */
+int afs_vnode_symlink(struct afs_vnode *vnode, struct key *key,
+                      const char *name, const char *content,
+                      struct afs_fid *newfid,
+                      struct afs_file_status *newstatus,
+                      struct afs_server **_server)
+{
+        struct afs_server *server;
+        int ret;
+        _enter("%s{%u,%u,%u},%x,%s,%s,,,",
+               vnode->volume->vlocation->vldb.name,
+               vnode->fid.vid,
+               vnode->fid.vnode,
+               vnode->fid.unique,
+               key_serial(key),
+               name, content);
+        /* this op will fetch the status on the directory we're creating in */
+        spin_lock(&vnode->lock);
+        vnode->update_cnt++;
+        spin_unlock(&vnode->lock);
+        do {
+                /* pick a server to query */
+                server = afs_volume_pick_fileserver(vnode);
+                if (IS_ERR(server))
+                        goto no_server;
+                _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr));
+                ret = afs_fs_symlink(server, key, vnode, name, content,
+                                     newfid, newstatus, &afs_sync_call);
+        } while (!afs_volume_release_fileserver(vnode, server, ret));
+        /* adjust the flags */
+        if (ret == 0) {
+                afs_vnode_finalise_status_update(vnode, server);
+                *_server = server;
+        } else {
+                afs_vnode_status_update_failed(vnode, ret);
+                *_server = NULL;
        }
-        _leave(" = SUCCESS");
+        _leave(" = %d [cnt %d]", ret, vnode->update_cnt);
-        return CACHEFS_MATCH_SUCCESS;
+        return ret;
-} /* end afs_vnode_cache_match() */
-#endif
+no_server:
+        spin_lock(&vnode->lock);
+        vnode->update_cnt--;
+        ASSERTCMP(vnode->update_cnt, >=, 0);
+        spin_unlock(&vnode->lock);
+        _leave(" = %ld [cnt %d]", PTR_ERR(server), vnode->update_cnt);
+        return PTR_ERR(server);
+}
-/*****************************************************************************/
 /*
- * update a vnode record stored in the cache
+ * rename a file
 */
-#ifdef AFS_CACHING_SUPPORT
+int afs_vnode_rename(struct afs_vnode *orig_dvnode,
-static void afs_vnode_cache_update(void *source, void *entry)
+                     struct afs_vnode *new_dvnode,
+                     struct key *key,
+                     const char *orig_name,
+                     const char *new_name)
 {
-        struct afs_cache_vnode *cvnode = entry;
+        struct afs_server *server;
-        struct afs_vnode *vnode = source;
+        int ret;
-        _enter("");
+        _enter("%s{%u,%u,%u},%s{%u,%u,%u},%x,%s,%s",
+               orig_dvnode->volume->vlocation->vldb.name,
+               orig_dvnode->fid.vid,
+               orig_dvnode->fid.vnode,
+               orig_dvnode->fid.unique,
+               new_dvnode->volume->vlocation->vldb.name,
+               new_dvnode->fid.vid,
+               new_dvnode->fid.vnode,
+               new_dvnode->fid.unique,
+               key_serial(key),
+               orig_name,
+               new_name);
+        /* this op will fetch the status on both the directories we're dealing
+         * with */
+        spin_lock(&orig_dvnode->lock);
+        orig_dvnode->update_cnt++;
+        spin_unlock(&orig_dvnode->lock);
+        if (new_dvnode != orig_dvnode) {
+                spin_lock(&new_dvnode->lock);
+                new_dvnode->update_cnt++;
+                spin_unlock(&new_dvnode->lock);
+        }
-        cvnode->vnode_id        = vnode->fid.vnode;
+        do {
-        cvnode->vnode_unique    = vnode->fid.unique;
+                /* pick a server to query */
-        cvnode->data_version    = vnode->status.version;
+                server = afs_volume_pick_fileserver(orig_dvnode);
+                if (IS_ERR(server))
+                        goto no_server;
-} /* end afs_vnode_cache_update() */
+                _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr));
-#endif
+                ret = afs_fs_rename(server, key, orig_dvnode, orig_name,
+                                    new_dvnode, new_name, &afs_sync_call);
+        } while (!afs_volume_release_fileserver(orig_dvnode, server, ret));
+        /* adjust the flags */
+        if (ret == 0) {
+                afs_vnode_finalise_status_update(orig_dvnode, server);
+                if (new_dvnode != orig_dvnode)
+                        afs_vnode_finalise_status_update(new_dvnode, server);
+                afs_put_server(server);
+        } else {
+                afs_vnode_status_update_failed(orig_dvnode, ret);
+                if (new_dvnode != orig_dvnode)
+                        afs_vnode_status_update_failed(new_dvnode, ret);
+        }
+        _leave(" = %d [cnt %d]", ret, orig_dvnode->update_cnt);
+        return ret;
+no_server:
+        spin_lock(&orig_dvnode->lock);
+        orig_dvnode->update_cnt--;
+        ASSERTCMP(orig_dvnode->update_cnt, >=, 0);
+        spin_unlock(&orig_dvnode->lock);
+        if (new_dvnode != orig_dvnode) {
+                spin_lock(&new_dvnode->lock);
+                new_dvnode->update_cnt--;
+                ASSERTCMP(new_dvnode->update_cnt, >=, 0);
+                spin_unlock(&new_dvnode->lock);
+        }
+        _leave(" = %ld [cnt %d]", PTR_ERR(server), orig_dvnode->update_cnt);
+        return PTR_ERR(server);
+}
diff --git a/fs/afs/vnode.h b/fs/afs/vnode.h
deleted file mode 100644
index b86a97102e8b..000000000000
--- a/fs/afs/vnode.h
+++ /dev/null
@@ -1,94 +0,0 @@
-/* vnode.h: AFS vnode record
- *
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-#ifndef _LINUX_AFS_VNODE_H
-#define _LINUX_AFS_VNODE_H
-#include <linux/fs.h>
-#include "server.h"
-#include "kafstimod.h"
-#include "cache.h"
-#ifdef __KERNEL__
-struct afs_rxfs_fetch_descriptor;
-/*****************************************************************************/
-/*
- * vnode catalogue entry
- */
-struct afs_cache_vnode
-{
-        afs_vnodeid_t           vnode_id;       /* vnode ID */
-        unsigned                vnode_unique;   /* vnode ID uniquifier */
-        afs_dataversion_t       data_version;   /* data version */
-};
-#ifdef AFS_CACHING_SUPPORT
-extern struct cachefs_index_def afs_vnode_cache_index_def;
-#endif
-/*****************************************************************************/
-/*
- * AFS inode private data
- */
-struct afs_vnode
-{
-        struct inode            vfs_inode;      /* the VFS's inode record */
-        struct afs_volume       *volume;        /* volume on which vnode resides */
-        struct afs_fid          fid;            /* the file identifier for this inode */
-        struct afs_file_status  status;         /* AFS status info for this file */
-#ifdef AFS_CACHING_SUPPORT
-        struct cachefs_cookie   *cache;         /* caching cookie */
-#endif
-        wait_queue_head_t       update_waitq;   /* status fetch waitqueue */
-        unsigned                update_cnt;     /* number of outstanding ops that will update the
-                                                 * status */
-        spinlock_t              lock;           /* waitqueue/flags lock */
-        unsigned                flags;
-#define AFS_VNODE_CHANGED       0x00000001      /* set if vnode reported changed by callback */
-#define AFS_VNODE_DELETED       0x00000002      /* set if vnode deleted on server */
-#define AFS_VNODE_MOUNTPOINT    0x00000004      /* set if vnode is a mountpoint symlink */
-        /* outstanding callback notification on this file */
-        struct afs_server       *cb_server;     /* server that made the current promise */
-        struct list_head        cb_link;        /* link in server's promises list */
-        struct list_head        cb_hash_link;   /* link in master callback hash */
-        struct afs_timer        cb_timeout;     /* timeout on promise */
-        unsigned                cb_version;     /* callback version */
-        unsigned                cb_expiry;      /* callback expiry time */
-        afs_callback_type_t     cb_type;        /* type of callback */
-};
-static inline struct afs_vnode *AFS_FS_I(struct inode *inode)
-{
-        return container_of(inode,struct afs_vnode,vfs_inode);
-}
-static inline struct inode *AFS_VNODE_TO_I(struct afs_vnode *vnode)
-{
-        return &vnode->vfs_inode;
-}
-extern int afs_vnode_fetch_status(struct afs_vnode *vnode);
-extern int afs_vnode_fetch_data(struct afs_vnode *vnode,
-                                struct afs_rxfs_fetch_descriptor *desc);
-extern int afs_vnode_give_up_callback(struct afs_vnode *vnode);
-extern struct afs_timer_ops afs_vnode_cb_timed_out_ops;
-#endif /* __KERNEL__ */
-#endif /* _LINUX_AFS_VNODE_H */
diff --git a/fs/afs/volume.c b/fs/afs/volume.c
index 768c6dbd323a..dd160cada45d 100644
--- a/fs/afs/volume.c
+++ b/fs/afs/volume.c
@@ -1,6 +1,6 @@
-/* volume.c: AFS volume management
+/* AFS volume management
 *
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 *
 * This program is free software; you can redistribute it and/or
@@ -15,35 +15,10 @@
 #include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
-#include "volume.h"
-#include "vnode.h"
-#include "cell.h"
-#include "cache.h"
-#include "cmservice.h"
-#include "fsclient.h"
-#include "vlclient.h"
 #include "internal.h"
-#ifdef __KDEBUG
 static const char *afs_voltypes[] = { "R/W", "R/O", "BAK" };
-#endif
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_volume_cache_match(void *target,
-                                                  const void *entry);
-static void afs_volume_cache_update(void *source, void *entry);
-struct cachefs_index_def afs_volume_cache_index_def = {
-        .name           = "volume",
-        .data_size      = sizeof(struct afs_cache_vhash),
-        .keys[0]        = { CACHEFS_INDEX_KEYS_BIN, 1 },
-        .keys[1]        = { CACHEFS_INDEX_KEYS_BIN, 1 },
-        .match          = afs_volume_cache_match,
-        .update         = afs_volume_cache_update,
-};
-#endif
-/*****************************************************************************/
 /*
 * lookup a volume by name
 * - this can be one of the following:
@@ -66,118 +41,52 @@ struct cachefs_index_def afs_volume_cache_index_def = {
 * - Rule 3: If parent volume is R/W, then only mount R/W volume unless
 *           explicitly told otherwise
 */
-int afs_volume_lookup(const char *name, struct afs_cell *cell, int rwpath,
+struct afs_volume *afs_volume_lookup(struct afs_mount_params *params)
-                      struct afs_volume **_volume)
 {
        struct afs_vlocation *vlocation = NULL;
        struct afs_volume *volume = NULL;
-        afs_voltype_t type;
+        struct afs_server *server = NULL;
-        const char *cellname, *volname, *suffix;
        char srvtmask;
-        int force, ret, loop, cellnamesz, volnamesz;
+        int ret, loop;
-        _enter("%s,,%d,", name, rwpath);
-        if (!name || (name[0] != '%' && name[0] != '#') || !name[1]) {
-                printk("kAFS: unparsable volume name\n");
-                return -EINVAL;
-        }
-        /* determine the type of volume we're looking for */
-        force = 0;
-        type = AFSVL_ROVOL;
-        if (rwpath || name[0] == '%') {
-                type = AFSVL_RWVOL;
-                force = 1;
-        }
-        suffix = strrchr(name, '.');
-        if (suffix) {
-                if (strcmp(suffix, ".readonly") == 0) {
-                        type = AFSVL_ROVOL;
-                        force = 1;
-                }
-                else if (strcmp(suffix, ".backup") == 0) {
-                        type = AFSVL_BACKVOL;
-                        force = 1;
-                }
-                else if (suffix[1] == 0) {
-                }
-                else {
-                        suffix = NULL;
-                }
-        }
-        /* split the cell and volume names */
+        _enter("{%*.*s,%d}",
-        name++;
+               params->volnamesz, params->volnamesz, params->volname, params->rwpath);
-        volname = strchr(name, ':');
-        if (volname) {
-                cellname = name;
-                cellnamesz = volname - name;
-                volname++;
-        }
-        else {
-                volname = name;
-                cellname = NULL;
-                cellnamesz = 0;
-        }
-        volnamesz = suffix ? suffix - volname : strlen(volname);
-        _debug("CELL:%*.*s [%p] VOLUME:%*.*s SUFFIX:%s TYPE:%d%s",
-               cellnamesz, cellnamesz, cellname ?: "", cell,
-               volnamesz, volnamesz, volname, suffix ?: "-",
-               type,
-               force ? " FORCE" : "");
-        /* lookup the cell record */
-        if (cellname || !cell) {
-                ret = afs_cell_lookup(cellname, cellnamesz, &cell);
-                if (ret<0) {
-                        printk("kAFS: unable to lookup cell '%s'\n",
-                               cellname ?: "");
-                        goto error;
-                }
-        }
-        else {
-                afs_get_cell(cell);
-        }
        /* lookup the volume location record */
-        ret = afs_vlocation_lookup(cell, volname, volnamesz, &vlocation);
+        vlocation = afs_vlocation_lookup(params->cell, params->key,
-        if (ret < 0)
+                                         params->volname, params->volnamesz);
+        if (IS_ERR(vlocation)) {
+                ret = PTR_ERR(vlocation);
+                vlocation = NULL;
                goto error;
+        }
        /* make the final decision on the type we want */
        ret = -ENOMEDIUM;
-        if (force && !(vlocation->vldb.vidmask & (1 << type)))
+        if (params->force && !(vlocation->vldb.vidmask & (1 << params->type)))
                goto error;
        srvtmask = 0;
        for (loop = 0; loop < vlocation->vldb.nservers; loop++)
                srvtmask |= vlocation->vldb.srvtmask[loop];
-        if (force) {
+        if (params->force) {
-                if (!(srvtmask & (1 << type)))
+                if (!(srvtmask & (1 << params->type)))
                        goto error;
-        }
+        } else if (srvtmask & AFS_VOL_VTM_RO) {
-        else if (srvtmask & AFS_VOL_VTM_RO) {
+                params->type = AFSVL_ROVOL;
-                type = AFSVL_ROVOL;
+        } else if (srvtmask & AFS_VOL_VTM_RW) {
-        }
+                params->type = AFSVL_RWVOL;
-        else if (srvtmask & AFS_VOL_VTM_RW) {
+        } else {
-                type = AFSVL_RWVOL;
-        }
-        else {
                goto error;
        }
-        down_write(&cell->vl_sem);
+        down_write(&params->cell->vl_sem);
        /* is the volume already active? */
-        if (vlocation->vols[type]) {
+        if (vlocation->vols[params->type]) {
                /* yes - re-use it */
-                volume = vlocation->vols[type];
+                volume = vlocation->vols[params->type];
                afs_get_volume(volume);
                goto success;
        }
@@ -191,23 +100,24 @@ int afs_volume_lookup(const char *name, struct afs_cell *cell, int rwpath,
                goto error_up;
        atomic_set(&volume->usage, 1);
-        volume->type            = type;
+        volume->type            = params->type;
-        volume->type_force      = force;
+        volume->type_force      = params->force;
-        volume->cell            = cell;
+        volume->cell            = params->cell;
-        volume->vid             = vlocation->vldb.vid[type];
+        volume->vid             = vlocation->vldb.vid[params->type];
        init_rwsem(&volume->server_sem);
        /* look up all the applicable server records */
        for (loop = 0; loop < 8; loop++) {
                if (vlocation->vldb.srvtmask[loop] & (1 << volume->type)) {
-                        ret = afs_server_lookup(
+                        server = afs_lookup_server(
-                                volume->cell,
+                               volume->cell, &vlocation->vldb.servers[loop]);
-                                &vlocation->vldb.servers[loop],
+                        if (IS_ERR(server)) {
-                                &volume->servers[volume->nservers]);
+                                ret = PTR_ERR(server);
-                        if (ret < 0)
                                goto error_discard;
+                        }
+                        volume->servers[volume->nservers] = server;
                        volume->nservers++;
                }
        }
@@ -223,35 +133,34 @@ int afs_volume_lookup(const char *name, struct afs_cell *cell, int rwpath,
        afs_get_vlocation(vlocation);
        volume->vlocation = vlocation;
-        vlocation->vols[type] = volume;
+        vlocation->vols[volume->type] = volume;
- success:
+success:
        _debug("kAFS selected %s volume %08x",
               afs_voltypes[volume->type], volume->vid);
-        *_volume = volume;
+        up_write(&params->cell->vl_sem);
-        ret = 0;
+        afs_put_vlocation(vlocation);
+        _leave(" = %p", volume);
+        return volume;
        /* clean up */
- error_up:
+error_up:
-        up_write(&cell->vl_sem);
+        up_write(&params->cell->vl_sem);
- error:
+error:
        afs_put_vlocation(vlocation);
-        afs_put_cell(cell);
+        _leave(" = %d", ret);
+        return ERR_PTR(ret);
-        _leave(" = %d (%p)", ret, volume);
-        return ret;
- error_discard:
+error_discard:
-        up_write(&cell->vl_sem);
+        up_write(&params->cell->vl_sem);
        for (loop = volume->nservers - 1; loop >= 0; loop--)
                afs_put_server(volume->servers[loop]);
        kfree(volume);
        goto error;
-} /* end afs_volume_lookup() */
+}
-/*****************************************************************************/
 /*
 * destroy a volume record
 */
@@ -265,10 +174,9 @@ void afs_put_volume(struct afs_volume *volume)
        _enter("%p", volume);
-        vlocation = volume->vlocation;
+        ASSERTCMP(atomic_read(&volume->usage), >, 0);
-        /* sanity check */
+        vlocation = volume->vlocation;
-        BUG_ON(atomic_read(&volume->usage) <= 0);
        /* to prevent a race, the decrement and the dequeue must be effectively
         * atomic */
@@ -296,21 +204,27 @@ void afs_put_volume(struct afs_volume *volume)
        kfree(volume);
        _leave(" [destroyed]");
-} /* end afs_put_volume() */
+}
-/*****************************************************************************/
 /*
 * pick a server to use to try accessing this volume
 * - returns with an elevated usage count on the server chosen
 */
-int afs_volume_pick_fileserver(struct afs_volume *volume,
+struct afs_server *afs_volume_pick_fileserver(struct afs_vnode *vnode)
-                               struct afs_server **_server)
 {
+        struct afs_volume *volume = vnode->volume;
        struct afs_server *server;
        int ret, state, loop;
        _enter("%s", volume->vlocation->vldb.name);
+        /* stick with the server we're already using if we can */
+        if (vnode->server && vnode->server->fs_state == 0) {
+                afs_get_server(vnode->server);
+                _leave(" = %p [current]", vnode->server);
+                return vnode->server;
+        }
        down_read(&volume->server_sem);
        /* handle the no-server case */
@@ -318,7 +232,7 @@ int afs_volume_pick_fileserver(struct afs_volume *volume,
                ret = volume->rjservers ? -ENOMEDIUM : -ESTALE;
                up_read(&volume->server_sem);
                _leave(" = %d [no servers]", ret);
-                return ret;
+                return ERR_PTR(ret);
        }
        /* basically, just search the list for the first live server and use
@@ -328,15 +242,16 @@ int afs_volume_pick_fileserver(struct afs_volume *volume,
                server = volume->servers[loop];
                state = server->fs_state;
+                _debug("consider %d [%d]", loop, state);
                switch (state) {
                        /* found an apparently healthy server */
                case 0:
                        afs_get_server(server);
                        up_read(&volume->server_sem);
-                        *_server = server;
+                        _leave(" = %p (picked %08x)",
-                        _leave(" = 0 (picked %08x)",
+                               server, ntohl(server->addr.s_addr));
-                               ntohl(server->addr.s_addr));
+                        return server;
-                        return 0;
                case -ENETUNREACH:
                        if (ret == 0)
@@ -372,20 +287,21 @@ int afs_volume_pick_fileserver(struct afs_volume *volume,
         */
        up_read(&volume->server_sem);
        _leave(" = %d", ret);
-        return ret;
+        return ERR_PTR(ret);
-} /* end afs_volume_pick_fileserver() */
+}
-/*****************************************************************************/
 /*
 * release a server after use
 * - releases the ref on the server struct that was acquired by picking
 * - records result of using a particular server to access a volume
 * - return 0 to try again, 1 if okay or to issue error
+ * - the caller must release the server struct if result was 0 (success)
 */
-int afs_volume_release_fileserver(struct afs_volume *volume,
+int afs_volume_release_fileserver(struct afs_vnode *vnode,
                                  struct afs_server *server,
                                  int result)
 {
+        struct afs_volume *volume = vnode->volume;
        unsigned loop;
        _enter("%s,%08x,%d",
@@ -396,14 +312,16 @@ int afs_volume_release_fileserver(struct afs_volume *volume,
                /* success */
        case 0:
                server->fs_act_jif = jiffies;
-                break;
+                server->fs_state = 0;
+                _leave("");
+                return 1;
                /* the fileserver denied all knowledge of the volume */
        case -ENOMEDIUM:
                server->fs_act_jif = jiffies;
                down_write(&volume->server_sem);
-                /* first, find where the server is in the active list (if it
+                /* firstly, find where the server is in the active list (if it
                 * is) */
                for (loop = 0; loop < volume->nservers; loop++)
                        if (volume->servers[loop] == server)
@@ -441,6 +359,7 @@ int afs_volume_release_fileserver(struct afs_volume *volume,
        case -ENETUNREACH:
        case -EHOSTUNREACH:
        case -ECONNREFUSED:
+        case -ETIME:
        case -ETIMEDOUT:
        case -EREMOTEIO:
                /* mark the server as dead
@@ -460,60 +379,17 @@ int afs_volume_release_fileserver(struct afs_volume *volume,
                server->fs_act_jif = jiffies;
        case -ENOMEM:
        case -ENONET:
-                break;
+                /* tell the caller to accept the result */
+                afs_put_server(server);
+                _leave(" [local failure]");
+                return 1;
        }
-        /* tell the caller to accept the result */
-        afs_put_server(server);
-        _leave("");
-        return 1;
        /* tell the caller to loop around and try the next server */
- try_next_server_upw:
+try_next_server_upw:
        up_write(&volume->server_sem);
- try_next_server:
+try_next_server:
        afs_put_server(server);
        _leave(" [try next server]");
        return 0;
+}
-} /* end afs_volume_release_fileserver() */
-/*****************************************************************************/
-/*
- * match a volume hash record stored in the cache
- */
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_volume_cache_match(void *target,
-                                                  const void *entry)
-{
-        const struct afs_cache_vhash *vhash = entry;
-        struct afs_volume *volume = target;
-        _enter("{%u},{%u}", volume->type, vhash->vtype);
-        if (volume->type == vhash->vtype) {
-                _leave(" = SUCCESS");
-                return CACHEFS_MATCH_SUCCESS;
-        }
-        _leave(" = FAILED");
-        return CACHEFS_MATCH_FAILED;
-} /* end afs_volume_cache_match() */
-#endif
-/*****************************************************************************/
-/*
- * update a volume hash record stored in the cache
- */
-#ifdef AFS_CACHING_SUPPORT
-static void afs_volume_cache_update(void *source, void *entry)
-{
-        struct afs_cache_vhash *vhash = entry;
-        struct afs_volume *volume = source;
-        _enter("");
-        vhash->vtype = volume->type;
-} /* end afs_volume_cache_update() */
-#endif
diff --git a/fs/afs/volume.h b/fs/afs/volume.h
deleted file mode 100644
index bfdcf19ba3f3..000000000000
--- a/fs/afs/volume.h
+++ /dev/null
@@ -1,140 +0,0 @@
-/* volume.h: AFS volume management
- *
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-#ifndef _LINUX_AFS_VOLUME_H
-#define _LINUX_AFS_VOLUME_H
-#include "types.h"
-#include "fsclient.h"
-#include "kafstimod.h"
-#include "kafsasyncd.h"
-#include "cache.h"
-typedef enum {
-        AFS_VLUPD_SLEEP,                /* sleeping waiting for update timer to fire */
-        AFS_VLUPD_PENDING,              /* on pending queue */
-        AFS_VLUPD_INPROGRESS,           /* op in progress */
-        AFS_VLUPD_BUSYSLEEP,            /* sleeping because server returned EBUSY */
-        
-} __attribute__((packed)) afs_vlocation_upd_t;
-/*****************************************************************************/
-/*
- * entry in the cached volume location catalogue
- */
-struct afs_cache_vlocation
-{
-        uint8_t                 name[64];       /* volume name (lowercase, padded with NULs) */
-        uint8_t                 nservers;       /* number of entries used in servers[] */
-        uint8_t                 vidmask;        /* voltype mask for vid[] */
-        uint8_t                 srvtmask[8];    /* voltype masks for servers[] */
-#define AFS_VOL_VTM_RW  0x01 /* R/W version of the volume is available (on this server) */
-#define AFS_VOL_VTM_RO  0x02 /* R/O version of the volume is available (on this server) */
-#define AFS_VOL_VTM_BAK 0x04 /* backup version of the volume is available (on this server) */
-        afs_volid_t             vid[3];         /* volume IDs for R/W, R/O and Bak volumes */
-        struct in_addr          servers[8];     /* fileserver addresses */
-        time_t                  rtime;          /* last retrieval time */
-};
-#ifdef AFS_CACHING_SUPPORT
-extern struct cachefs_index_def afs_vlocation_cache_index_def;
-#endif
-/*****************************************************************************/
-/*
- * volume -> vnode hash table entry
- */
-struct afs_cache_vhash
-{
-        afs_voltype_t           vtype;          /* which volume variation */
-        uint8_t                 hash_bucket;    /* which hash bucket this represents */
-} __attribute__((packed));
-#ifdef AFS_CACHING_SUPPORT
-extern struct cachefs_index_def afs_volume_cache_index_def;
-#endif
-/*****************************************************************************/
-/*
- * AFS volume location record
- */
-struct afs_vlocation
-{
-        atomic_t                usage;
-        struct list_head        link;           /* link in cell volume location list */
-        struct afs_timer        timeout;        /* decaching timer */
-        struct afs_cell         *cell;          /* cell to which volume belongs */
-#ifdef AFS_CACHING_SUPPORT
-        struct cachefs_cookie   *cache;         /* caching cookie */
-#endif
-        struct afs_cache_vlocation vldb;        /* volume information DB record */
-        struct afs_volume       *vols[3];       /* volume access record pointer (index by type) */
-        rwlock_t                lock;           /* access lock */
-        unsigned long           read_jif;       /* time at which last read from vlserver */
-        struct afs_timer        upd_timer;      /* update timer */
-        struct afs_async_op     upd_op;         /* update operation */
-        afs_vlocation_upd_t     upd_state;      /* update state */
-        unsigned short          upd_first_svix; /* first server index during update */
-        unsigned short          upd_curr_svix;  /* current server index during update */
-        unsigned short          upd_rej_cnt;    /* ENOMEDIUM count during update */
-        unsigned short          upd_busy_cnt;   /* EBUSY count during update */
-        unsigned short          valid;          /* T if valid */
-};
-extern int afs_vlocation_lookup(struct afs_cell *cell,
-                                const char *name,
-                                unsigned namesz,
-                                struct afs_vlocation **_vlocation);
-#define afs_get_vlocation(V) do { atomic_inc(&(V)->usage); } while(0)
-extern void afs_put_vlocation(struct afs_vlocation *vlocation);
-extern void afs_vlocation_do_timeout(struct afs_vlocation *vlocation);
-/*****************************************************************************/
-/*
- * AFS volume access record
- */
-struct afs_volume
-{
-        atomic_t                usage;
-        struct afs_cell         *cell;          /* cell to which belongs (unrefd ptr) */
-        struct afs_vlocation    *vlocation;     /* volume location */
-#ifdef AFS_CACHING_SUPPORT
-        struct cachefs_cookie   *cache;         /* caching cookie */
-#endif
-        afs_volid_t             vid;            /* volume ID */
-        afs_voltype_t           type;           /* type of volume */
-        char                    type_force;     /* force volume type (suppress R/O -> R/W) */
-        unsigned short          nservers;       /* number of server slots filled */
-        unsigned short          rjservers;      /* number of servers discarded due to -ENOMEDIUM */
-        struct afs_server       *servers[8];    /* servers on which volume resides (ordered) */
-        struct rw_semaphore     server_sem;     /* lock for accessing current server */
-};
-extern int afs_volume_lookup(const char *name,
-                             struct afs_cell *cell,
-                             int rwpath,
-                             struct afs_volume **_volume);
-#define afs_get_volume(V) do { atomic_inc(&(V)->usage); } while(0)
-extern void afs_put_volume(struct afs_volume *volume);
-extern int afs_volume_pick_fileserver(struct afs_volume *volume,
-                                      struct afs_server **_server);
-extern int afs_volume_release_fileserver(struct afs_volume *volume,
-                                         struct afs_server *server,
-                                         int result);
-#endif /* _LINUX_AFS_VOLUME_H */
diff --git a/fs/aio.c b/fs/aio.c
index 0b4ee0a5c83e..e4598d6d49dd 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -136,7 +136,6 @@ static int aio_setup_ring(struct kioctx *ctx)
                                  0);
        if (IS_ERR((void *)info->mmap_base)) {
                up_write(&ctx->mm->mmap_sem);
-                printk("mmap err: %ld\n", -info->mmap_base);
                info->mmap_size = 0;
                aio_free_ring(ctx);
                return -EAGAIN;
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index 6b4cec3f272f..d85f42fa9206 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -52,6 +52,8 @@ struct autofs_info {
        int             flags;
+        struct list_head rehash;
        struct autofs_sb_info *sbi;
        unsigned long last_used;
        atomic_t count;
@@ -110,6 +112,8 @@ struct autofs_sb_info {
        struct mutex wq_mutex;
        spinlock_t fs_lock;
        struct autofs_wait_queue *queues; /* Wait queue pointer */
+        spinlock_t rehash_lock;
+        struct list_head rehash_list;
 };
 static inline struct autofs_sb_info *autofs4_sbi(struct super_block *sb)
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index 5e458e096ef6..26063dc84a2a 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -48,6 +48,8 @@ struct autofs_info *autofs4_init_ino(struct autofs_info *ino,
        ino->dentry = NULL;
        ino->size = 0;
+        INIT_LIST_HEAD(&ino->rehash);
        ino->last_used = jiffies;
        atomic_set(&ino->count, 0);
@@ -158,14 +160,13 @@ void autofs4_kill_sb(struct super_block *sb)
        if (!sbi)
                goto out_kill_sb;
-        sb->s_fs_info = NULL;
+        if (!sbi->catatonic)
-        if ( !sbi->catatonic )
                autofs4_catatonic_mode(sbi); /* Free wait queues, close pipe */
        /* Clean up and release dangling references */
        autofs4_force_release(sbi);
+        sb->s_fs_info = NULL;
        kfree(sbi);
 out_kill_sb:
@@ -336,6 +337,8 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
        mutex_init(&sbi->wq_mutex);
        spin_lock_init(&sbi->fs_lock);
        sbi->queues = NULL;
+        spin_lock_init(&sbi->rehash_lock);
+        INIT_LIST_HEAD(&sbi->rehash_list);
        s->s_blocksize = 1024;
        s->s_blocksize_bits = 10;
        s->s_magic = AUTOFS_SUPER_MAGIC;
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index 47fee96c2182..d0e9b3a3905d 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -263,7 +263,7 @@ static int try_to_fill_dentry(struct dentry *dentry, int flags)
                 */
                status = d_invalidate(dentry);
                if (status != -EBUSY)
-                        return -ENOENT;
+                        return -EAGAIN;
        }
        DPRINTK("dentry=%p %.*s ino=%p",
@@ -413,7 +413,16 @@ static int autofs4_revalidate(struct dentry *dentry, struct nameidata *nd)
                 */
                status = try_to_fill_dentry(dentry, flags);
                if (status == 0)
-                                return 1;
+                        return 1;
+                /*
+                 * A status of EAGAIN here means that the dentry has gone
+                 * away while waiting for an expire to complete. If we are
+                 * racing with expire, lookup will wait for it, so this must
+                 * be a revalidate and we need to send it to lookup.
+                 */
+                if (status == -EAGAIN)
+                        return 0;
                return status;
        }
@@ -459,6 +468,15 @@ void autofs4_dentry_release(struct dentry *de)
        de->d_fsdata = NULL;
        if (inf) {
+                struct autofs_sb_info *sbi = autofs4_sbi(de->d_sb);
+                if (sbi) {
+                        spin_lock(&sbi->rehash_lock);
+                        if (!list_empty(&inf->rehash))
+                                list_del(&inf->rehash);
+                        spin_unlock(&sbi->rehash_lock);
+                }
                inf->dentry = NULL;
                inf->inode = NULL;
@@ -478,10 +496,80 @@ static struct dentry_operations autofs4_dentry_operations = {
        .d_release      = autofs4_dentry_release,
 };
+static struct dentry *autofs4_lookup_unhashed(struct autofs_sb_info *sbi, struct dentry *parent, struct qstr *name)
+{       /*
+         * Scan sbi->rehash_list for an unhashed dentry that has the given
+         * @parent and the same name (hash, length and bytes) as @name.
+         *
+         * On a match the entry is removed from the rehash list, a
+         * reference is taken with dget(), the dentry is made negative
+         * (d_inode cleared, inode dropped with iput()) and the dentry is
+         * returned.  Returns NULL when no entry on the list matches.
+         *
+         * Locks are taken in the order dcache_lock, sbi->rehash_lock,
+         * dentry->d_lock, and all of them are released before returning.
+         */
+        unsigned int len = name->len;
+        unsigned int hash = name->hash;
+        const unsigned char *str = name->name;
+        struct list_head *p, *head;
+        spin_lock(&dcache_lock);
+        spin_lock(&sbi->rehash_lock);
+        head = &sbi->rehash_list;
+        list_for_each(p, head) {
+                struct autofs_info *ino;
+                struct dentry *dentry;
+                struct qstr *qstr;
+                ino = list_entry(p, struct autofs_info, rehash); /* list links autofs_info via ->rehash */
+                dentry = ino->dentry;
+                spin_lock(&dentry->d_lock); /* held for all checks below; dropped at next: or before return */
+                /* Bad luck, we've already been dentry_iput */
+                if (!dentry->d_inode)
+                        goto next;
+                qstr = &dentry->d_name;
+                if (dentry->d_name.hash != hash) /* cheapest test first: name hash */
+                        goto next;
+                if (dentry->d_parent != parent)
+                        goto next;
+                if (qstr->len != len)
+                        goto next;
+                if (memcmp(qstr->name, str, len)) /* finally compare the name bytes */
+                        goto next;
+                if (d_unhashed(dentry)) { /* a name match that is still hashed is skipped */
+                        struct autofs_info *ino = autofs4_dentry_ino(dentry); /* NOTE(review): shadows the loop-scope 'ino' */
+                        struct inode *inode = dentry->d_inode;
+                        list_del_init(&ino->rehash); /* no longer a rehash candidate */
+                        dget(dentry); /* reference handed to the caller */
+                        /*
+                         * Make the rehashed dentry negative so the VFS
+                         * behaves as it should.
+                         */
+                        if (inode) { /* NOTE(review): d_inode was tested non-NULL above
+                                      * with d_lock still held, so this looks always
+                                      * true and the fall-through return below looks
+                                      * unreachable - confirm. */
+                                dentry->d_inode = NULL;
+                                list_del_init(&dentry->d_alias);
+                                spin_unlock(&dentry->d_lock);
+                                spin_unlock(&sbi->rehash_lock);
+                                spin_unlock(&dcache_lock);
+                                iput(inode); /* called only after all three locks are dropped */
+                                return dentry;
+                        }
+                        spin_unlock(&dentry->d_lock);
+                        spin_unlock(&sbi->rehash_lock);
+                        spin_unlock(&dcache_lock);
+                        return dentry;
+                }
+next:
+                spin_unlock(&dentry->d_lock); /* not a match: release this dentry and keep scanning */
+        }
+        spin_unlock(&sbi->rehash_lock);
+        spin_unlock(&dcache_lock);
+        return NULL;
+}
 /* Lookups in the root directory */
 static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
 {
        struct autofs_sb_info *sbi;
+        struct dentry *unhashed;
        int oz_mode;
        DPRINTK("name = %.*s",
@@ -497,25 +585,46 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s
        DPRINTK("pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d",
                 current->pid, process_group(current), sbi->catatonic, oz_mode);
-        /*
+        unhashed = autofs4_lookup_unhashed(sbi, dentry->d_parent, &dentry->d_name);
-         * Mark the dentry incomplete, but add it. This is needed so
+        if (!unhashed) {
-         * that the VFS layer knows about the dentry, and we can count
+                /*
-         * on catching any lookups through the revalidate.
+                 * Mark the dentry incomplete, but add it. This is needed so
-         *
+                 * that the VFS layer knows about the dentry, and we can count
-         * Let all the hard work be done by the revalidate function that
+                 * on catching any lookups through the revalidate.
-         * needs to be able to do this anyway..
+                 *
-         *
+                 * Let all the hard work be done by the revalidate function that
-         * We need to do this before we release the directory semaphore.
+                 * needs to be able to do this anyway..
-         */
+                 *
-        dentry->d_op = &autofs4_root_dentry_operations;
+                 * We need to do this before we release the directory semaphore.
+                 */
+                dentry->d_op = &autofs4_root_dentry_operations;
+                dentry->d_fsdata = NULL;
+                d_add(dentry, NULL);
+        } else {
+                struct autofs_info *ino = autofs4_dentry_ino(unhashed);
+                DPRINTK("rehash %p with %p", dentry, unhashed);
+                /*
+                 * If we are racing with expire the request might not
+                 * be quite complete but the directory has been removed
+                 * so it must have been successful, so just wait for it.
+                 */
+                if (ino && (ino->flags & AUTOFS_INF_EXPIRING)) {
+                        DPRINTK("wait for incomplete expire %p name=%.*s",
+                                unhashed, unhashed->d_name.len,
+                                unhashed->d_name.name);
+                        autofs4_wait(sbi, unhashed, NFY_NONE);
+                        DPRINTK("request completed");
+                }
+                d_rehash(unhashed);
+                dentry = unhashed;
+        }
        if (!oz_mode) {
                spin_lock(&dentry->d_lock);
                dentry->d_flags |= DCACHE_AUTOFS_PENDING;
                spin_unlock(&dentry->d_lock);
        }
-        dentry->d_fsdata = NULL;
-        d_add(dentry, NULL);
        if (dentry->d_op && dentry->d_op->d_revalidate) {
                mutex_unlock(&dir->i_mutex);
@@ -534,6 +643,8 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s
                        if (sigismember (sigset, SIGKILL) ||
                            sigismember (sigset, SIGQUIT) ||
                            sigismember (sigset, SIGINT)) {
+                            if (unhashed)
+                                dput(unhashed);
                            return ERR_PTR(-ERESTARTNOINTR);
                        }
                }
@@ -544,12 +655,33 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s
        /*
         * If this dentry is unhashed, then we shouldn't honour this
-         * lookup even if the dentry is positive.  Returning ENOENT here
+         * lookup.  Returning ENOENT here doesn't do the right thing
-         * doesn't do the right thing for all system calls, but it should
+         * for all system calls, but it should be OK for the operations
-         * be OK for the operations we permit from an autofs.
+         * we permit from an autofs.
         */
-        if (dentry->d_inode && d_unhashed(dentry))
+        if (dentry->d_inode && d_unhashed(dentry)) {
-                return ERR_PTR(-ENOENT);
+                /*
+                 * A user space application can (and has done in the past)
+                 * remove and re-create this directory during the callback.
+                 * This can leave us with an unhashed dentry, but a
+                 * successful mount!  So we need to perform another
+                 * cached lookup in case the dentry now exists.
+                 */
+                struct dentry *parent = dentry->d_parent;
+                struct dentry *new = d_lookup(parent, &dentry->d_name);
+                if (new != NULL)
+                        dentry = new;
+                else
+                        dentry = ERR_PTR(-ENOENT);
+                if (unhashed)
+                        dput(unhashed);
+                return dentry;
+        }
+        if (unhashed)
+                return dentry;
        return NULL;
 }
@@ -611,9 +743,10 @@ static int autofs4_dir_symlink(struct inode *dir,
 * Normal filesystems would do a "d_delete()" to tell the VFS dcache
 * that the file no longer exists. However, doing that means that the
 * VFS layer can turn the dentry into a negative dentry.  We don't want
- * this, because since the unlink is probably the result of an expire.
+ * this, because the unlink is probably the result of an expire.
- * We simply d_drop it, which allows the dentry lookup to remount it
+ * We simply d_drop it and add it to a rehash candidates list in the
- * if necessary.
+ * super block, which allows the dentry lookup to reuse it retaining
+ * the flags, such as expire in progress, in case we're racing with expire.
 *
 * If a process is blocked on the dentry waiting for the expire to finish,
 * it will invalidate the dentry and try to mount with a new one.
@@ -642,7 +775,14 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry)
        dir->i_mtime = CURRENT_TIME;
-        d_drop(dentry);
+        spin_lock(&dcache_lock);
+        spin_lock(&sbi->rehash_lock);
+        list_add(&ino->rehash, &sbi->rehash_list);
+        spin_unlock(&sbi->rehash_lock);
+        spin_lock(&dentry->d_lock);
+        __d_drop(dentry);
+        spin_unlock(&dentry->d_lock);
+        spin_unlock(&dcache_lock);
        return 0;
 }
@@ -653,6 +793,9 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry)
        struct autofs_info *ino = autofs4_dentry_ino(dentry);
        struct autofs_info *p_ino;
        
+        DPRINTK("dentry %p, removing %.*s",
+                dentry, dentry->d_name.len, dentry->d_name.name);
        if (!autofs4_oz_mode(sbi))
                return -EACCES;
@@ -661,6 +804,9 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry)
                spin_unlock(&dcache_lock);
                return -ENOTEMPTY;
        }
+        spin_lock(&sbi->rehash_lock);
+        list_add(&ino->rehash, &sbi->rehash_list);
+        spin_unlock(&sbi->rehash_lock);
        spin_lock(&dentry->d_lock);
        __d_drop(dentry);
        spin_unlock(&dentry->d_lock);
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index 1e4a539f4417..0d041a9cb348 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -84,7 +84,11 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
                                 struct autofs_wait_queue *wq,
                                 int type)
 {
-        union autofs_packet_union pkt;
+        union {
+                struct autofs_packet_hdr hdr;
+                union autofs_packet_union v4_pkt;
+                union autofs_v5_packet_union v5_pkt;
+        } pkt;
        size_t pktsz;
        DPRINTK("wait id = 0x%08lx, name = %.*s, type=%d",
@@ -98,7 +102,7 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
        /* Kernel protocol v4 missing and expire packets */
        case autofs_ptype_missing:
        {
-                struct autofs_packet_missing *mp = &pkt.missing;
+                struct autofs_packet_missing *mp = &pkt.v4_pkt.missing;
                pktsz = sizeof(*mp);
@@ -110,7 +114,7 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
        }
        case autofs_ptype_expire_multi:
        {
-                struct autofs_packet_expire_multi *ep = &pkt.expire_multi;
+                struct autofs_packet_expire_multi *ep = &pkt.v4_pkt.expire_multi;
                pktsz = sizeof(*ep);
@@ -129,7 +133,7 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
        case autofs_ptype_missing_direct:
        case autofs_ptype_expire_direct:
        {
-                struct autofs_v5_packet *packet = &pkt.v5_packet;
+                struct autofs_v5_packet *packet = &pkt.v5_pkt.v5_packet;
                pktsz = sizeof(*packet);
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 51db1182b27e..9cc4f0a8aaae 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -507,7 +507,7 @@ out:
 #define INTERPRETER_ELF 2
 #ifndef STACK_RND_MASK
-#define STACK_RND_MASK 0x7ff            /* with 4K pages 8MB of VA */
+#define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12))     /* 8MB of VA */
 #endif
 static unsigned long randomize_stack_top(unsigned long stack_top)
@@ -1704,7 +1704,10 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file)
                                DUMP_SEEK(PAGE_SIZE);
                        } else {
                                if (page == ZERO_PAGE(addr)) {
-                                        DUMP_SEEK(PAGE_SIZE);
+                                        if (!dump_seek(file, PAGE_SIZE)) {
+                                                page_cache_release(page);
+                                                goto end_coredump;
+                                        }
                                } else {
                                        void *kaddr;
                                        flush_cache_page(vma, addr,
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 5810aa1339fd..f3ddca4a387b 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -179,6 +179,8 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm,
        int executable_stack;
        int retval, i;
+        kdebug("____ LOAD %d ____", current->pid);
        memset(&exec_params, 0, sizeof(exec_params));
        memset(&interp_params, 0, sizeof(interp_params));
@@ -941,8 +943,11 @@ static int elf_fdpic_map_file_constdisp_on_uclinux(
                if (mm) {
                        if (phdr->p_flags & PF_X) {
-                                mm->start_code = seg->addr;
+                                if (!mm->start_code) {
-                                mm->end_code = seg->addr + phdr->p_memsz;
+                                        mm->start_code = seg->addr;
+                                        mm->end_code = seg->addr +
+                                                phdr->p_memsz;
+                                }
                        } else if (!mm->start_data) {
                                mm->start_data = seg->addr;
 #ifndef CONFIG_MMU
@@ -1123,8 +1128,10 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
                if (mm) {
                        if (phdr->p_flags & PF_X) {
-                                mm->start_code = maddr;
+                                if (!mm->start_code) {
-                                mm->end_code = maddr + phdr->p_memsz;
+                                        mm->start_code = maddr;
+                                        mm->end_code = maddr + phdr->p_memsz;
+                                }
                        } else if (!mm->start_data) {
                                mm->start_data = maddr;
                                mm->end_data = maddr + phdr->p_memsz;
@@ -1473,8 +1480,8 @@ static int elf_fdpic_dump_segments(struct file *file, struct mm_struct *mm,
                                DUMP_SEEK(file->f_pos + PAGE_SIZE);
                        }
                        else if (page == ZERO_PAGE(addr)) {
-                                DUMP_SEEK(file->f_pos + PAGE_SIZE);
                                page_cache_release(page);
+                                DUMP_SEEK(file->f_pos + PAGE_SIZE);
                        }
                        else {
                                void *kaddr;
diff --git a/fs/bio.c b/fs/bio.c
index 7618bcb18368..693940da4090 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -28,7 +28,7 @@
 #include <linux/blktrace_api.h>
 #include <scsi/sg.h>            /* for struct sg_iovec */
-#define BIO_POOL_SIZE 256
+#define BIO_POOL_SIZE 2
 static struct kmem_cache *bio_slab __read_mostly;
@@ -38,7 +38,7 @@ static struct kmem_cache *bio_slab __read_mostly;
 * a small number of entries is fine, not going to be performance critical.
 * basically we just need to survive
 */
-#define BIO_SPLIT_ENTRIES 8     
+#define BIO_SPLIT_ENTRIES 2
 mempool_t *bio_split_pool __read_mostly;
 struct biovec_slab {
@@ -1120,7 +1120,7 @@ struct bio_pair *bio_split(struct bio *bi, mempool_t *pool, int first_sectors)
 * create memory pools for biovec's in a bio_set.
 * use the global biovec slabs created for general use.
 */
-static int biovec_create_pools(struct bio_set *bs, int pool_entries, int scale)
+static int biovec_create_pools(struct bio_set *bs, int pool_entries)
 {
        int i;
@@ -1128,9 +1128,6 @@ static int biovec_create_pools(struct bio_set *bs, int pool_entries, int scale)
                struct biovec_slab *bp = bvec_slabs + i;
                mempool_t **bvp = bs->bvec_pools + i;
-                if (pool_entries > 1 && i >= scale)
-                        pool_entries >>= 1;
                *bvp = mempool_create_slab_pool(pool_entries, bp->slab);
                if (!*bvp)
                        return -ENOMEM;
@@ -1161,7 +1158,7 @@ void bioset_free(struct bio_set *bs)
        kfree(bs);
 }
-struct bio_set *bioset_create(int bio_pool_size, int bvec_pool_size, int scale)
+struct bio_set *bioset_create(int bio_pool_size, int bvec_pool_size)
 {
        struct bio_set *bs = kzalloc(sizeof(*bs), GFP_KERNEL);
@@ -1172,7 +1169,7 @@ struct bio_set *bioset_create(int bio_pool_size, int bvec_pool_size, int scale)
        if (!bs->bio_pool)
                goto bad;
-        if (!biovec_create_pools(bs, bvec_pool_size, scale))
+        if (!biovec_create_pools(bs, bvec_pool_size))
                return bs;
 bad:
@@ -1196,38 +1193,12 @@ static void __init biovec_init_slabs(void)
 static int __init init_bio(void)
 {
-        int megabytes, bvec_pool_entries;
-        int scale = BIOVEC_NR_POOLS;
        bio_slab = kmem_cache_create("bio", sizeof(struct bio), 0,
                                SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
        biovec_init_slabs();
-        megabytes = nr_free_pages() >> (20 - PAGE_SHIFT);
+        fs_bio_set = bioset_create(BIO_POOL_SIZE, 2);
-        /*
-         * find out where to start scaling
-         */
-        if (megabytes <= 16)
-                scale = 0;
-        else if (megabytes <= 32)
-                scale = 1;
-        else if (megabytes <= 64)
-                scale = 2;
-        else if (megabytes <= 96)
-                scale = 3;
-        else if (megabytes <= 128)
-                scale = 4;
-        /*
-         * Limit number of entries reserved -- mempools are only used when
-         * the system is completely unable to allocate memory, so we only
-         * need enough to make progress.
-         */
-        bvec_pool_entries = 1 + scale;
-        fs_bio_set = bioset_create(BIO_POOL_SIZE, bvec_pool_entries, scale);
        if (!fs_bio_set)
                panic("bio: can't allocate bios\n");
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 0c59b703e9d5..575076c018f4 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1101,6 +1101,13 @@ static int __blkdev_get(struct block_device *bdev, mode_t mode, unsigned flags,
                        int for_part);
 static int __blkdev_put(struct block_device *bdev, int for_part);
+/*
+ * bd_mutex locking:
+ *
+ *  mutex_lock(part->bd_mutex)
+ *    mutex_lock_nested(whole->bd_mutex, 1)
+ */
 static int do_open(struct block_device *bdev, struct file *file, int for_part)
 {
        struct module *owner = NULL;
diff --git a/fs/buffer.c b/fs/buffer.c
index f99c509697cd..1d0852fa728b 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1743,7 +1743,6 @@ recover:
        SetPageError(page);
        BUG_ON(PageWriteback(page));
        set_page_writeback(page);
-        unlock_page(page);
        do {
                struct buffer_head *next = bh->b_this_page;
                if (buffer_async_write(bh)) {
@@ -1753,6 +1752,7 @@ recover:
                }
                bh = next;
        } while (bh != head);
+        unlock_page(page);
        goto done;
 }
@@ -2248,7 +2248,6 @@ int nobh_prepare_write(struct page *page, unsigned from, unsigned to,
        int i;
        int ret = 0;
        int is_mapped_to_disk = 1;
-        int dirtied_it = 0;
        if (PageMappedToDisk(page))
                return 0;
@@ -2285,14 +2284,10 @@ int nobh_prepare_write(struct page *page, unsigned from, unsigned to,
                        continue;
                if (buffer_new(&map_bh) || !buffer_mapped(&map_bh)) {
                        kaddr = kmap_atomic(page, KM_USER0);
-                        if (block_start < from) {
+                        if (block_start < from)
                                memset(kaddr+block_start, 0, from-block_start);
-                                dirtied_it = 1;
+                        if (block_end > to)
-                        }
-                        if (block_end > to) {
                                memset(kaddr + to, 0, block_end - to);
-                                dirtied_it = 1;
-                        }
                        flush_dcache_page(page);
                        kunmap_atomic(kaddr, KM_USER0);
                        continue;
@@ -2347,17 +2342,6 @@ int nobh_prepare_write(struct page *page, unsigned from, unsigned to,
        if (is_mapped_to_disk)
                SetPageMappedToDisk(page);
-        SetPageUptodate(page);
-        /*
-         * Setting the page dirty here isn't necessary for the prepare_write
-         * function - commit_write will do that.  But if/when this function is
-         * used within the pagefault handler to ensure that all mmapped pages
-         * have backing space in the filesystem, we will need to dirty the page
-         * if its contents were altered.
-         */
-        if (dirtied_it)
-                set_page_dirty(page);
        return 0;
@@ -2381,12 +2365,17 @@ failed:
 }
 EXPORT_SYMBOL(nobh_prepare_write);
+/*
+ * Make sure any changes to nobh_commit_write() are reflected in
+ * nobh_truncate_page(), since it doesn't call commit_write().
+ */
 int nobh_commit_write(struct file *file, struct page *page,
                unsigned from, unsigned to)
 {
        struct inode *inode = page->mapping->host;
        loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
+        SetPageUptodate(page);
        set_page_dirty(page);
        if (pos > inode->i_size) {
                i_size_write(inode, pos);
@@ -2481,6 +2470,11 @@ int nobh_truncate_page(struct address_space *mapping, loff_t from)
                memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
                flush_dcache_page(page);
                kunmap_atomic(kaddr, KM_USER0);
+                /*
+                 * It would be more correct to call aops->commit_write()
+                 * here, but this is more efficient.
+                 */
+                SetPageUptodate(page);
                set_page_dirty(page);
        }
        unlock_page(page);
diff --git a/fs/char_dev.c b/fs/char_dev.c
index e6194e2b9bb9..164a45cdaf5f 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -6,6 +6,7 @@
 #include <linux/init.h>
 #include <linux/fs.h>
+#include <linux/kdev_t.h>
 #include <linux/slab.h>
 #include <linux/string.h>
@@ -108,13 +109,6 @@ __register_chrdev_region(unsigned int major, unsigned int baseminor,
        /* temporary */
        if (major == 0) {
                for (i = ARRAY_SIZE(chrdevs)-1; i > 0; i--) {
-                        /*
-                         * Disallow the LANANA-assigned LOCAL/EXPERIMENTAL
-                         * majors
-                         */
-                        if ((60 <= i && i <= 63) || (120 <= i && i <= 127) ||
-                                        (240 <= i && i <= 254))
-                                continue;
                        if (chrdevs[i] == NULL)
                                break;
                }
diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index 5fe13593b57f..5d1f4873d701 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -1,3 +1,16 @@
+Version 1.48
+------------
+Fix mtime bouncing around from local idea of last write times to remote time.
+Fix hang (in i_size_read) when simultaneous size update of same remote file
+on smp system corrupts sequence number. Do not reread unnecessarily partial page
+(which we are about to overwrite anyway) when writing out file opened rw.
+When DOS attribute of file on non-Unix server's file changes on the server side
+from read-only back to read-write, reflect this change in default file mode
+(we had been leaving a file's mode read-only until the inode were reloaded).
+Allow setting of attribute back to ATTR_NORMAL (removing readonly dos attribute
+when archive dos attribute not set and we are changing mode back to writeable
+on server which does not support the Unix Extensions).
 Version 1.47
 ------------
 Fix oops in list_del during mount caused by unaligned string.
diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile
index a26f26ed5a17..6ecd9d6ba3f3 100644
--- a/fs/cifs/Makefile
+++ b/fs/cifs/Makefile
@@ -3,4 +3,4 @@
 #
 obj-$(CONFIG_CIFS) += cifs.o
-cifs-objs := cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o inode.o link.o misc.o netmisc.o smbdes.o smbencrypt.o transport.o asn1.o md4.o md5.o cifs_unicode.o nterr.o xattr.o cifsencrypt.o fcntl.o readdir.o ioctl.o sess.o
+cifs-objs := cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o inode.o link.o misc.o netmisc.o smbdes.o smbencrypt.o transport.o asn1.o md4.o md5.o cifs_unicode.o nterr.o xattr.o cifsencrypt.o fcntl.o readdir.o ioctl.o sess.o export.o
diff --git a/fs/cifs/README b/fs/cifs/README
index 432e515431c4..080c5eba112b 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -1,5 +1,5 @@
 The CIFS VFS support for Linux supports many advanced network filesystem 
-features such as heirarchical dfs like namespace, hardlinks, locking and more.  
+features such as hierarchical dfs like namespace, hardlinks, locking and more.  
 It was designed to comply with the SNIA CIFS Technical Reference (which 
 supersedes the 1992 X/Open SMB Standard) as well as to perform best practice 
 practical interoperability with Windows 2000, Windows XP, Samba and equivalent 
diff --git a/fs/cifs/TODO b/fs/cifs/TODO
index 68372946dc92..d7b9c27c942d 100644
--- a/fs/cifs/TODO
+++ b/fs/cifs/TODO
@@ -18,7 +18,9 @@ better)
 d) Kerberos/SPNEGO session setup support - (started)
-e) NTLMv2 authentication (mostly implemented)
+e) NTLMv2 authentication (mostly implemented - double check
+that NTLMv2 signing works, also need to cleanup now unneeded SessSetup code in
+fs/cifs/connect.c)
 f) MD5-HMAC signing SMB PDUs when SPNEGO style SessionSetup 
 used (Kerberos or NTLMSSP). Signing alreadyimplemented for NTLM
@@ -88,11 +90,12 @@ w) Finish up the dos time conversion routines needed to return old server
 time to the client (default time, of now or time 0 is used now for these 
 very old servers)
-x) Add support for OS/2 (LANMAN 1.2 and LANMAN2.1 based SMB servers)
+x) In support for OS/2 (LANMAN 1.2 and LANMAN2.1 based SMB servers) 
+need to add ability to set time to server (utimes command)
 y) Finish testing of Windows 9x/Windows ME server support (started).
-KNOWN BUGS (updated April 29, 2005)
+KNOWN BUGS (updated February 26, 2007)
 ====================================
 See http://bugzilla.samba.org - search on product "CifsVFS" for
 current bug list.
@@ -107,11 +110,6 @@ but recognizes them
 succeed but still return access denied (appears to be Windows 
 server not cifs client problem) and has not been reproduced recently.
 NTFS partitions do not have this problem.
-4) debug connectathon lock test case 10 which fails against
-Samba (may be unmappable due to POSIX to Windows lock model
-differences but worth investigating).  Also debug Samba to 
-see why lock test case 7 takes longer to complete to Samba
-than to Windows.
 Misc testing to do
 ==================
@@ -119,7 +117,7 @@ Misc testing to do
 types. Try nested symlinks (8 deep). Return max path name in stat -f information
 2) Modify file portion of ltp so it can run against a mounted network
-share and run it against cifs vfs.
+share and run it against cifs vfs in automated fashion.
 3) Additional performance testing and optimization using iozone and similar - 
 there are some easy changes that can be done to parallelize sequential writes,
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index e8287c4c6eb3..faba4d69fe91 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -1,7 +1,7 @@
 /*
 *   fs/cifs/cifsfs.c
 *
- *   Copyright (C) International Business Machines  Corp., 2002,2004
+ *   Copyright (C) International Business Machines  Corp., 2002,2007
 *   Author(s): Steve French (sfrench@us.ibm.com)
 *
 *   Common Internet FileSystem (CIFS) client
@@ -47,7 +47,11 @@
 #ifdef CONFIG_CIFS_QUOTA
 static struct quotactl_ops cifs_quotactl_ops;
-#endif
+#endif /* QUOTA */
+#ifdef CONFIG_CIFS_EXPERIMENTAL
+extern struct export_operations cifs_export_ops;
+#endif /* EXPERIMENTAL */
 int cifsFYI = 0;
 int cifsERROR = 1;
@@ -62,8 +66,8 @@ unsigned int extended_security = CIFSSEC_DEF;
 unsigned int sign_CIFS_PDUs = 1;
 extern struct task_struct * oplockThread; /* remove sparse warning */
 struct task_struct * oplockThread = NULL;
-extern struct task_struct * dnotifyThread; /* remove sparse warning */
+/* extern struct task_struct * dnotifyThread; remove sparse warning */
-struct task_struct * dnotifyThread = NULL;
+static struct task_struct * dnotifyThread = NULL;
 static const struct super_operations cifs_super_ops;
 unsigned int CIFSMaxBufSize = CIFS_MAX_MSGSIZE;
 module_param(CIFSMaxBufSize, int, 0);
@@ -91,8 +95,9 @@ cifs_read_super(struct super_block *sb, void *data,
        struct inode *inode;
        struct cifs_sb_info *cifs_sb;
        int rc = 0;
+        
-        sb->s_flags |= MS_NODIRATIME; /* and probably even noatime */
+        /* BB should we make this contingent on mount parm? */
+        sb->s_flags |= MS_NODIRATIME | MS_NOATIME;
        sb->s_fs_info = kzalloc(sizeof(struct cifs_sb_info),GFP_KERNEL);
        cifs_sb = CIFS_SB(sb);
        if(cifs_sb == NULL)
@@ -109,6 +114,10 @@ cifs_read_super(struct super_block *sb, void *data,
        sb->s_magic = CIFS_MAGIC_NUMBER;
        sb->s_op = &cifs_super_ops;
+#ifdef CONFIG_CIFS_EXPERIMENTAL
+        if(experimEnabled != 0)
+                sb->s_export_op = &cifs_export_ops;
+#endif /* EXPERIMENTAL */       
 /*      if(cifs_sb->tcon->ses->server->maxBuf > MAX_CIFS_HDR_SIZE + 512)
            sb->s_blocksize = cifs_sb->tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE; */
 #ifdef CONFIG_CIFS_QUOTA
@@ -258,7 +267,10 @@ cifs_alloc_inode(struct super_block *sb)
        cifs_inode->clientCanCacheRead = FALSE;
        cifs_inode->clientCanCacheAll = FALSE;
        cifs_inode->vfs_inode.i_blkbits = 14;  /* 2**14 = CIFS_MAX_MSGSIZE */
-        cifs_inode->vfs_inode.i_flags = S_NOATIME | S_NOCMTIME;
+        
+        /* Can not set i_flags here - they get immediately overwritten
+           to zero by the VFS */
+/*      cifs_inode->vfs_inode.i_flags = S_NOATIME | S_NOCMTIME;*/
        INIT_LIST_HEAD(&cifs_inode->openFileList);
        return &cifs_inode->vfs_inode;
 }
@@ -283,6 +295,7 @@ cifs_show_options(struct seq_file *s, struct vfsmount *m)
        if (cifs_sb) {
                if (cifs_sb->tcon) {
+/* BB add prepath to mount options displayed */
                        seq_printf(s, ",unc=%s", cifs_sb->tcon->treeName);
                        if (cifs_sb->tcon->ses) {
                                if (cifs_sb->tcon->ses->userName)
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index c97c08eb481a..2c2c384894d8 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -38,8 +38,8 @@ extern const struct address_space_operations cifs_addr_ops_smallbuf;
 /* Functions related to super block operations */
 /* extern const struct super_operations cifs_super_ops;*/
 extern void cifs_read_inode(struct inode *);
-extern void cifs_delete_inode(struct inode *);
+/*extern void cifs_delete_inode(struct inode *);*/  /* BB not needed yet */
-/* extern void cifs_write_inode(struct inode *); *//* BB not needed yet */
+/* extern void cifs_write_inode(struct inode *); */ /* BB not needed yet */
 /* Functions related to inodes */
 extern const struct inode_operations cifs_dir_inode_ops;
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 74d3ccbb103b..e4de8eba4780 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -525,15 +525,17 @@ require use of the stronger protocol */
 */
 GLOBAL_EXTERN struct smbUidInfo *GlobalUidList[UID_HASH];
-GLOBAL_EXTERN struct list_head GlobalServerList; /* BB not implemented yet */
+/* GLOBAL_EXTERN struct list_head GlobalServerList; BB not implemented yet */
 GLOBAL_EXTERN struct list_head GlobalSMBSessionList;
 GLOBAL_EXTERN struct list_head GlobalTreeConnectionList;
 GLOBAL_EXTERN rwlock_t GlobalSMBSeslock;  /* protects list inserts on 3 above */
 GLOBAL_EXTERN struct list_head GlobalOplock_Q;
-GLOBAL_EXTERN struct list_head GlobalDnotifyReqList; /* Outstanding dir notify requests */
+/* Outstanding dir notify requests */
-GLOBAL_EXTERN struct list_head GlobalDnotifyRsp_Q;/* DirNotify response queue */
+GLOBAL_EXTERN struct list_head GlobalDnotifyReqList;
+/* DirNotify response queue */
+GLOBAL_EXTERN struct list_head GlobalDnotifyRsp_Q;
 /*
 * Global transaction id (XID) information
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index 7d9505491b16..4d8948e8762c 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -35,9 +35,11 @@
 #define BAD_PROT 0xFFFF
 /* SMB command codes */
-/* Some commands have minimal (wct=0,bcc=0), or uninteresting, responses
+/*
- (ie which include no useful data other than the SMB error code itself).
+ * Some commands have minimal (wct=0,bcc=0), or uninteresting, responses
- Knowing this helps avoid response buffer allocations and copy in some cases */
+ * (ie which include no useful data other than the SMB error code itself).
+ * Knowing this helps avoid response buffer allocations and copy in some cases
+ */
 #define SMB_COM_CREATE_DIRECTORY      0x00 /* trivial response */
 #define SMB_COM_DELETE_DIRECTORY      0x01 /* trivial response */
 #define SMB_COM_CLOSE                 0x04 /* triv req/rsp, timestamp ignored */
@@ -218,6 +220,9 @@
 */
 #define CIFS_NO_HANDLE        0xFFFF
+#define NO_CHANGE_64          cpu_to_le64(0xFFFFFFFFFFFFFFFFULL)
+#define NO_CHANGE_32          0xFFFFFFFFUL
 /* IPC$ in ASCII */
 #define CIFS_IPC_RESOURCE "\x49\x50\x43\x24"
@@ -1882,7 +1887,13 @@ typedef struct {
                                                      calls including posix open
                                                      and posix unlink */ 
 #ifdef CONFIG_CIFS_POSIX
-#define CIFS_UNIX_CAP_MASK              0x0000003b
+/* Can not set pathnames cap yet until we send new posix create SMB since
+   otherwise server can treat such handles opened with older ntcreatex
+   (by a new client which knows how to send posix path ops)
+   as non-posix handles (can affect write behavior with byte range locks.
+   We can add back in POSIX_PATH_OPS cap when Posix Create/Mkdir finished */
+/* #define CIFS_UNIX_CAP_MASK              0x0000003b */
+#define CIFS_UNIX_CAP_MASK              0x0000001b 
 #else 
 #define CIFS_UNIX_CAP_MASK              0x00000013
 #endif /* CONFIG_CIFS_POSIX */
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 6148b82170c4..32eb1acab630 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -43,7 +43,7 @@ extern void _FreeXid(unsigned int);
 #define FreeXid(curr_xid) {_FreeXid(curr_xid); cFYI(1,("CIFS VFS: leaving %s (xid = %d) rc = %d",__FUNCTION__,curr_xid,(int)rc));}
 extern char *build_path_from_dentry(struct dentry *);
 extern char *build_wildcard_path_from_dentry(struct dentry *direntry);
-extern void renew_parental_timestamps(struct dentry *direntry);
+/* extern void renew_parental_timestamps(struct dentry *direntry);*/
 extern int SendReceive(const unsigned int /* xid */ , struct cifsSesInfo *,
                        struct smb_hdr * /* input */ ,
                        struct smb_hdr * /* out */ ,
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index b8e91470c27f..48fc0c2ab0e5 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -2824,10 +2824,10 @@ GetExtAttrOut:
 /* security id for everyone */
-const static struct cifs_sid sid_everyone = 
+static const struct cifs_sid sid_everyone =
                {1, 1, {0, 0, 0, 0, 0, 0}, {0, 0, 0, 0}};
 /* group users */
-const static struct cifs_sid sid_user = 
+static const struct cifs_sid sid_user =
                {1, 2 , {0, 0, 0, 0, 0, 5}, {32, 545, 0, 0}};
 /* Convert CIFS ACL to POSIX form */
@@ -4803,6 +4803,16 @@ setPermsRetry:
        pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_UNIX_BASIC);
        pSMB->Reserved4 = 0;
        pSMB->hdr.smb_buf_length += byte_count;
+        /* Samba server ignores set of file size to zero due to bugs in some
+        older clients, but we should be precise - we use SetFileSize to
+        set file size and do not want to truncate file size to zero
+        accidentally as happened on one Samba server beta by putting
+        zero instead of -1 here */ 
+        data_offset->EndOfFile = NO_CHANGE_64;
+        data_offset->NumOfBytes = NO_CHANGE_64;
+        data_offset->LastStatusChange = NO_CHANGE_64;
+        data_offset->LastAccessTime = NO_CHANGE_64;
+        data_offset->LastModificationTime = NO_CHANGE_64;
        data_offset->Uid = cpu_to_le64(uid);
        data_offset->Gid = cpu_to_le64(gid);
        /* better to leave device as zero when it is  */
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 66b825ade3e1..3fad638d26d3 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -31,7 +31,7 @@
 #include "cifs_debug.h"
 #include "cifs_fs_sb.h"
-void
+static void
 renew_parental_timestamps(struct dentry *direntry)
 {
        /* BB check if there is a way to get the kernel to do this or if we really need this */
diff --git a/fs/cifs/export.c b/fs/cifs/export.c
new file mode 100644
index 000000000000..1d716392c3aa
--- /dev/null
+++ b/fs/cifs/export.c
@@ -0,0 +1,52 @@
+/*
+ *   fs/cifs/export.c
+ *
+ *   Copyright (C) International Business Machines  Corp., 2007
+ *   Author(s): Steve French (sfrench@us.ibm.com)
+ *
+ *   Common Internet FileSystem (CIFS) client
+ * 
+ *   Operations related to support for exporting files via NFSD
+ *
+ *   This library is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation; either version 2.1 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This library is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License
+ *   along with this library; if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+ 
+ /* 
+  * See Documentation/filesystems/Exporting
+  * and examples in fs/exportfs
+  */
+#include <linux/fs.h>
+ 
+#ifdef CONFIG_CIFS_EXPERIMENTAL
+ 
+static struct dentry *cifs_get_parent(struct dentry *dentry)
+{
+        /* BB stub: always fails with -EACCES until export via NFSD is coded */
+        return ERR_PTR(-EACCES);
+}
+ 
+struct export_operations cifs_export_ops = {
+        .get_parent = cifs_get_parent,
+/*      Following five export operations are unneeded so far and can default */         
+/*      .get_dentry =
+        .get_name =
+        .find_exported_dentry =
+        .decode_fh = 
+        .encode_fh =  */
+ };
+ 
+#endif /* EXPERIMENTAL */
+ 
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 07ff9351e9ee..2d3275bedb55 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -879,18 +879,19 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
        cifs_stats_bytes_written(pTcon, total_written);
        /* since the write may have blocked check these pointers again */
-        if (file->f_path.dentry) {
+        if ((file->f_path.dentry) && (file->f_path.dentry->d_inode)) {
-                if (file->f_path.dentry->d_inode) {
+                struct inode *inode = file->f_path.dentry->d_inode;
-                        struct inode *inode = file->f_path.dentry->d_inode;
+/* Do not update local mtime - server will set its actual value on write                
-                        inode->i_ctime = inode->i_mtime =
+ *              inode->i_ctime = inode->i_mtime = 
-                                current_fs_time(inode->i_sb);
+ *                      current_fs_time(inode->i_sb);*/
-                        if (total_written > 0) {
+                if (total_written > 0) {
-                                if (*poffset > file->f_path.dentry->d_inode->i_size)
+                        spin_lock(&inode->i_lock);
-                                        i_size_write(file->f_path.dentry->d_inode,
+                        if (*poffset > file->f_path.dentry->d_inode->i_size)
+                                i_size_write(file->f_path.dentry->d_inode,
                                        *poffset);
-                        }
+                        spin_unlock(&inode->i_lock);
-                        mark_inode_dirty_sync(file->f_path.dentry->d_inode);
                }
+                mark_inode_dirty_sync(file->f_path.dentry->d_inode);    
        }
        FreeXid(xid);
        return total_written;
@@ -1012,17 +1013,18 @@ static ssize_t cifs_write(struct file *file, const char *write_data,
        cifs_stats_bytes_written(pTcon, total_written);
        /* since the write may have blocked check these pointers again */
-        if (file->f_path.dentry) {
+        if ((file->f_path.dentry) && (file->f_path.dentry->d_inode)) {
-                if (file->f_path.dentry->d_inode) {
+/*BB We could make this contingent on superblock ATIME flag too */
-                        file->f_path.dentry->d_inode->i_ctime =
+/*              file->f_path.dentry->d_inode->i_ctime =
-                        file->f_path.dentry->d_inode->i_mtime = CURRENT_TIME;
+                file->f_path.dentry->d_inode->i_mtime = CURRENT_TIME;*/
-                        if (total_written > 0) {
+                if (total_written > 0) {
-                                if (*poffset > file->f_path.dentry->d_inode->i_size)
+                        spin_lock(&file->f_path.dentry->d_inode->i_lock);
-                                        i_size_write(file->f_path.dentry->d_inode,
+                        if (*poffset > file->f_path.dentry->d_inode->i_size)
-                                                     *poffset);
+                                i_size_write(file->f_path.dentry->d_inode,
-                        }
+                                             *poffset);
-                        mark_inode_dirty_sync(file->f_path.dentry->d_inode);
+                        spin_unlock(&file->f_path.dentry->d_inode->i_lock);
                }
+                mark_inode_dirty_sync(file->f_path.dentry->d_inode);
        }
        FreeXid(xid);
        return total_written;
@@ -1399,6 +1401,7 @@ static int cifs_commit_write(struct file *file, struct page *page,
        xid = GetXid();
        cFYI(1, ("commit write for page %p up to position %lld for %d", 
                 page, position, to));
+        spin_lock(&inode->i_lock);
        if (position > inode->i_size) {
                i_size_write(inode, position);
                /* if (file->private_data == NULL) {
@@ -1428,6 +1431,7 @@ static int cifs_commit_write(struct file *file, struct page *page,
                        cFYI(1, (" SetEOF (commit write) rc = %d", rc));
                } */
        }
+        spin_unlock(&inode->i_lock);
        if (!PageUptodate(page)) {
                position =  ((loff_t)page->index << PAGE_CACHE_SHIFT) + offset;
                /* can not rely on (or let) writepage write this data */
@@ -1988,34 +1992,52 @@ static int cifs_prepare_write(struct file *file, struct page *page,
        unsigned from, unsigned to)
 {
        int rc = 0;
-        loff_t offset = (loff_t)page->index << PAGE_CACHE_SHIFT;
+        loff_t i_size;
+        loff_t offset;
        cFYI(1, ("prepare write for page %p from %d to %d",page,from,to));
-        if (!PageUptodate(page)) {
+        if (PageUptodate(page))
-        /*      if (to - from != PAGE_CACHE_SIZE) {
+                return 0;
-                        void *kaddr = kmap_atomic(page, KM_USER0);
+        /* If we are writing a full page it will be up to date,
+           no need to read from the server */
+        if ((to == PAGE_CACHE_SIZE) && (from == 0)) {
+                SetPageUptodate(page);
+                return 0;
+        }
+        offset = (loff_t)page->index << PAGE_CACHE_SHIFT;
+        i_size = i_size_read(page->mapping->host);
+        if ((offset >= i_size) ||
+            ((from == 0) && (offset + to) >= i_size)) {
+                /*
+                 * We don't need to read data beyond the end of the file.
+                 * zero it, and set the page uptodate
+                 */
+                void *kaddr = kmap_atomic(page, KM_USER0);
+                if (from)
                        memset(kaddr, 0, from);
+                if (to < PAGE_CACHE_SIZE)
                        memset(kaddr + to, 0, PAGE_CACHE_SIZE - to);
-                        flush_dcache_page(page);
+                flush_dcache_page(page);
-                        kunmap_atomic(kaddr, KM_USER0);
+                kunmap_atomic(kaddr, KM_USER0);
-                } */
+                SetPageUptodate(page);
-                /* If we are writing a full page it will be up to date,
+        } else if ((file->f_flags & O_ACCMODE) != O_WRONLY) {
-                   no need to read from the server */
-                if ((to == PAGE_CACHE_SIZE) && (from == 0))
-                        SetPageUptodate(page);
                /* might as well read a page, it is fast enough */
-                if ((file->f_flags & O_ACCMODE) != O_WRONLY) {
+                rc = cifs_readpage_worker(file, page, &offset);
-                        rc = cifs_readpage_worker(file, page, &offset);
+        } else {
-                } else {
+                /* we could try using another file handle if there is one -
-                /* should we try using another file handle if there is one -
+                   but how would we lock it to prevent close of that handle
-                   how would we lock it to prevent close of that handle
+                   racing with this read? In any case
-                   racing with this read?
+                   this will be written out by commit_write so is fine */
-                   In any case this will be written out by commit_write */
-                }
        }
-        /* BB should we pass any errors back? 
+        /* we do not need to pass errors back 
-           e.g. if we do not have read access to the file */
+           e.g. if we do not have read access to the file 
+           because cifs_commit_write will do the right thing.  -- shaggy */
        return 0;
 }
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 3f5bc83dc3d1..f414526e476a 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -90,6 +90,9 @@ int cifs_get_inode_info_unix(struct inode **pinode,
                                (*pinode)->i_ino =
                                        (unsigned long)findData.UniqueId;
                        } /* note ino incremented to unique num in new_inode */
+                        if(sb->s_flags & MS_NOATIME)
+                                (*pinode)->i_flags |= S_NOATIME | S_NOCMTIME;
+                                
                        insert_inode_hash(*pinode);
                }
@@ -140,10 +143,10 @@ int cifs_get_inode_info_unix(struct inode **pinode,
                inode->i_gid = le64_to_cpu(findData.Gid);
                inode->i_nlink = le64_to_cpu(findData.Nlinks);
+                spin_lock(&inode->i_lock);
                if (is_size_safe_to_change(cifsInfo, end_of_file)) {
                /* can not safely change the file size here if the
                   client is writing to it due to potential races */
                        i_size_write(inode, end_of_file);
                /* blksize needs to be multiple of two. So safer to default to
@@ -159,6 +162,7 @@ int cifs_get_inode_info_unix(struct inode **pinode,
                /* for this calculation */
                        inode->i_blocks = (512 - 1 + num_of_bytes) >> 9;
                }
+                spin_unlock(&inode->i_lock);
                if (num_of_bytes < end_of_file)
                        cFYI(1, ("allocation size less than end of file"));
@@ -421,6 +425,8 @@ int cifs_get_inode_info(struct inode **pinode,
                                } else /* do we need cast or hash to ino? */
                                        (*pinode)->i_ino = inode_num;
                        } /* else ino incremented to unique num in new_inode*/
+                        if(sb->s_flags & MS_NOATIME)
+                                (*pinode)->i_flags |= S_NOATIME | S_NOCMTIME;
                        insert_inode_hash(*pinode);
                }
                inode = *pinode;
@@ -488,9 +494,17 @@ int cifs_get_inode_info(struct inode **pinode,
                           mode e.g. 555 */
                        if (cifsInfo->cifsAttrs & ATTR_READONLY)
                                inode->i_mode &= ~(S_IWUGO);
+                        else if ((inode->i_mode & S_IWUGO) == 0)
+                                /* the ATTR_READONLY flag may have been */
+                                /* changed on server -- set any w bits  */
+                                /* allowed by mnt_file_mode             */
+                                inode->i_mode |= (S_IWUGO &
+                                                  cifs_sb->mnt_file_mode);
                /* BB add code here -
                   validate if device or weird share or device type? */
                }
+                
+                spin_lock(&inode->i_lock);
                if (is_size_safe_to_change(cifsInfo, le64_to_cpu(pfindData->EndOfFile))) {
                        /* can not safely shrink the file size here if the
                           client is writing to it due to potential races */
@@ -501,6 +515,7 @@ int cifs_get_inode_info(struct inode **pinode,
                        inode->i_blocks = (512 - 1 + le64_to_cpu(
                                           pfindData->AllocationSize)) >> 9;
                }
+                spin_unlock(&inode->i_lock);
                inode->i_nlink = le32_to_cpu(pfindData->NumberOfLinks);
@@ -829,8 +844,10 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)
        if (!rc) {
                drop_nlink(inode);
+                spin_lock(&direntry->d_inode->i_lock);
                i_size_write(direntry->d_inode,0);
                clear_nlink(direntry->d_inode);
+                spin_unlock(&direntry->d_inode->i_lock);
        }
        cifsInode = CIFS_I(direntry->d_inode);
@@ -1123,6 +1140,52 @@ static int cifs_truncate_page(struct address_space *mapping, loff_t from)
        return rc;
 }
+/*
+ * cifs_vmtruncate - change the size of @inode to @offset
+ * @inode: inode whose i_size is updated
+ * @offset: new file size in bytes
+ *
+ * Both i_size_write() calls below are made with inode->i_lock held; the
+ * lock is released on every path before the function returns and before
+ * the page-cache calls are made.
+ *
+ * When the current i_size is not smaller than @offset the shrink path is
+ * taken; otherwise the expand path (do_expand) is taken.
+ *
+ * Returns 0 on success, -ETXTBSY when shrinking an inode for which
+ * IS_SWAPFILE() is true, and -EFBIG when expanding past RLIMIT_FSIZE
+ * (SIGXFSZ is sent to current first) or past the superblock's s_maxbytes.
+ */
+static int cifs_vmtruncate(struct inode * inode, loff_t offset)
+{
+        struct address_space *mapping = inode->i_mapping;
+        unsigned long limit;
+        spin_lock(&inode->i_lock);
+        if (inode->i_size < offset)
+                goto do_expand;
+        /*
+         * truncation of in-use swapfiles is disallowed - it would cause
+         * subsequent swapout to scribble on the now-freed blocks.
+         */
+        if (IS_SWAPFILE(inode)) {
+                spin_unlock(&inode->i_lock);
+                goto out_busy;
+        }
+        i_size_write(inode, offset);
+        spin_unlock(&inode->i_lock);
+        /* page-cache work is done only after i_lock has been dropped */
+        unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
+        truncate_inode_pages(mapping, offset);
+        goto out_truncate;
+do_expand:
+        /* still holding i_lock here; each failure branch unlocks itself */
+        limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
+        if (limit != RLIM_INFINITY && offset > limit) {
+                spin_unlock(&inode->i_lock);
+                goto out_sig;
+        }
+        if (offset > inode->i_sb->s_maxbytes) {
+                spin_unlock(&inode->i_lock);
+                goto out_big;
+        }
+        i_size_write(inode, offset);
+        spin_unlock(&inode->i_lock);
+out_truncate:
+        /* give the inode's own ->truncate method, if any, a chance to run */
+        if (inode->i_op && inode->i_op->truncate)
+                inode->i_op->truncate(inode);
+        return 0;
+out_sig:
+        send_sig(SIGXFSZ, current, 0);
+        /* deliberate fall through: the rlimit case also returns -EFBIG */
+out_big:
+        return -EFBIG;
+out_busy:
+        return -ETXTBSY;
+}
 int cifs_setattr(struct dentry *direntry, struct iattr *attrs)
 {
        int xid;
@@ -1133,6 +1196,7 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs)
        struct cifsFileInfo *open_file = NULL;
        FILE_BASIC_INFO time_buf;
        int set_time = FALSE;
+        int set_dosattr = FALSE;
        __u64 mode = 0xFFFFFFFFFFFFFFFFULL;
        __u64 uid = 0xFFFFFFFFFFFFFFFFULL;
        __u64 gid = 0xFFFFFFFFFFFFFFFFULL;
@@ -1239,7 +1303,7 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs)
                   */
                if (rc == 0) {
-                        rc = vmtruncate(direntry->d_inode, attrs->ia_size);
+                        rc = cifs_vmtruncate(direntry->d_inode, attrs->ia_size);
                        cifs_truncate_page(direntry->d_inode->i_mapping,
                                           direntry->d_inode->i_size);
                } else 
@@ -1269,15 +1333,23 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs)
        else if (attrs->ia_valid & ATTR_MODE) {
                rc = 0;
                if ((mode & S_IWUGO) == 0) /* not writeable */ {
-                        if ((cifsInode->cifsAttrs & ATTR_READONLY) == 0)
+                        if ((cifsInode->cifsAttrs & ATTR_READONLY) == 0) {
+                                set_dosattr = TRUE;
                                time_buf.Attributes =
                                        cpu_to_le32(cifsInode->cifsAttrs |
                                                    ATTR_READONLY);
+                        }
                } else if ((mode & S_IWUGO) == S_IWUGO) {
-                        if (cifsInode->cifsAttrs & ATTR_READONLY)
+                        if (cifsInode->cifsAttrs & ATTR_READONLY) {
+                                set_dosattr = TRUE;
                                time_buf.Attributes =
                                        cpu_to_le32(cifsInode->cifsAttrs &
                                                    (~ATTR_READONLY));
+                                /* Windows ignores set to zero */
+                                if(time_buf.Attributes == 0)
+                                        time_buf.Attributes |= 
+                                                cpu_to_le32(ATTR_NORMAL);
+                        }
                }
                /* BB to be implemented -
                   via Windows security descriptors or streams */
@@ -1315,7 +1387,7 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs)
        } else
                time_buf.ChangeTime = 0;
-        if (set_time || time_buf.Attributes) {
+        if (set_time || set_dosattr) {
                time_buf.CreationTime = 0;      /* do not change */
                /* In the future we should experiment - try setting timestamps
                   via Handle (SetFileInfo) instead of by path */
@@ -1359,7 +1431,7 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs)
                and this check ensures that we are not being called from
                sys_utimes in which case we ought to fail the call back to
                the user when the server rejects the call */
-                if((rc) && (attrs->ia_valid &&
+                if((rc) && (attrs->ia_valid &
                         (ATTR_MODE | ATTR_GID | ATTR_UID | ATTR_SIZE)))
                        rc = 0;
        }
@@ -1374,9 +1446,11 @@ cifs_setattr_exit:
        return rc;
 }
+#if 0
 void cifs_delete_inode(struct inode *inode)
 {
        cFYI(1, ("In cifs_delete_inode, inode = 0x%p", inode));
        /* may have to add back in if and when safe distributed caching of
           directories added e.g. via FindNotify */
 }
+#endif
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index 8e259969354b..6baea85d726e 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -77,7 +77,8 @@ cifs_hardlink(struct dentry *old_file, struct inode *inode,
                cifsInode = CIFS_I(old_file->d_inode);
                if(rc == 0) {
                        old_file->d_inode->i_nlink++;
-                        old_file->d_inode->i_ctime = CURRENT_TIME;
+/* BB should we make this contingent on superblock flag NOATIME? */
+/*                      old_file->d_inode->i_ctime = CURRENT_TIME;*/
                        /* parent dir timestamps will update from srv
                        within a second, would it really be worth it
                        to set the parent dir cifs inode time to zero
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index c6220bd27165..2a374d5215ab 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -3,7 +3,7 @@
 *
 *   Directory search handling
 * 
- *   Copyright (C) International Business Machines  Corp., 2004, 2005
+ *   Copyright (C) International Business Machines  Corp., 2004, 2007
 *   Author(s): Steve French (sfrench@us.ibm.com)
 *
 *   This library is free software; you can redistribute it and/or modify
@@ -83,6 +83,8 @@ static int construct_dentry(struct qstr *qstring, struct file *file,
                                return rc;
                        rc = 1;
                }
+                if(file->f_path.dentry->d_sb->s_flags & MS_NOATIME)
+                        (*ptmp_inode)->i_flags |= S_NOATIME | S_NOCMTIME;
        } else {
                tmp_dentry = d_alloc(file->f_path.dentry, qstring);
                if(tmp_dentry == NULL) {
@@ -98,6 +100,8 @@ static int construct_dentry(struct qstr *qstring, struct file *file,
                        tmp_dentry->d_op = &cifs_dentry_ops;
                if(*ptmp_inode == NULL)
                        return rc;
+                if(file->f_path.dentry->d_sb->s_flags & MS_NOATIME)
+                        (*ptmp_inode)->i_flags |= S_NOATIME | S_NOCMTIME;                       
                rc = 2;
        }
@@ -215,6 +219,10 @@ static void fill_in_inode(struct inode *tmp_inode, int new_buf_type,
                tmp_inode->i_mode |= S_IFREG;
                if (attr & ATTR_READONLY)
                        tmp_inode->i_mode &= ~(S_IWUGO);
+                else if ((tmp_inode->i_mode & S_IWUGO) == 0)
+                        /* the ATTR_READONLY flag may have been changed on   */
+                        /* server -- set any w bits allowed by mnt_file_mode */
+                        tmp_inode->i_mode |= (S_IWUGO & cifs_sb->mnt_file_mode);
        } /* could add code here - to validate if device or weird share type? */
        /* can not fill in nlink here as in qpathinfo version and Unx search */
@@ -222,6 +230,7 @@ static void fill_in_inode(struct inode *tmp_inode, int new_buf_type,
                atomic_set(&cifsInfo->inUse, 1);
        }
+        spin_lock(&tmp_inode->i_lock);
        if (is_size_safe_to_change(cifsInfo, end_of_file)) {
                /* can not safely change the file size here if the 
                client is writing to it due to potential races */
@@ -231,6 +240,7 @@ static void fill_in_inode(struct inode *tmp_inode, int new_buf_type,
        /* for this calculation, even though the reported blocksize is larger */
                tmp_inode->i_blocks = (512 - 1 + allocation_size) >> 9;
        }
+        spin_unlock(&tmp_inode->i_lock);
        if (allocation_size < end_of_file)
                cFYI(1, ("May be sparse file, allocation less than file size"));
@@ -351,6 +361,7 @@ static void unix_fill_in_inode(struct inode *tmp_inode,
        tmp_inode->i_gid = le64_to_cpu(pfindData->Gid);
        tmp_inode->i_nlink = le64_to_cpu(pfindData->Nlinks);
+        spin_lock(&tmp_inode->i_lock);
        if (is_size_safe_to_change(cifsInfo, end_of_file)) {
                /* can not safely change the file size here if the 
                client is writing to it due to potential races */
@@ -360,6 +371,7 @@ static void unix_fill_in_inode(struct inode *tmp_inode,
        /* for this calculation, not the real blocksize */
                tmp_inode->i_blocks = (512 - 1 + num_of_bytes) >> 9;
        }
+        spin_unlock(&tmp_inode->i_lock);
        if (S_ISREG(tmp_inode->i_mode)) {
                cFYI(1, ("File inode"));
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index f80007eaebf4..5f468459a1e2 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -499,7 +499,7 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
           due to last connection to this server being unmounted */
        if (signal_pending(current)) {
                /* if signal pending do not hold up user for full smb timeout
-                but we still give response a change to complete */
+                but we still give response a chance to complete */
                timeout = 2 * HZ;
        }   
@@ -587,7 +587,6 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
        }
 out:
        DeleteMidQEntry(midQ);
        atomic_dec(&ses->server->inFlight); 
        wake_up(&ses->server->request_q);
@@ -681,7 +680,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
           due to last connection to this server being unmounted */
        if (signal_pending(current)) {
                /* if signal pending do not hold up user for full smb timeout
-                but we still give response a change to complete */
+                but we still give response a chance to complete */
                timeout = 2 * HZ;
        }   
@@ -765,7 +764,6 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
        }
 out:
        DeleteMidQEntry(midQ);
        atomic_dec(&ses->server->inFlight); 
        wake_up(&ses->server->request_q);
diff --git a/fs/compat.c b/fs/compat.c
index 0ec70e3cee0a..040a8be38a48 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -48,6 +48,7 @@
 #include <linux/highmem.h>
 #include <linux/poll.h>
 #include <linux/mm.h>
+#include <linux/eventpoll.h>
 #include <net/sock.h>           /* siocdevprivate_ioctl */
@@ -2235,3 +2236,102 @@ long asmlinkage compat_sys_nfsservctl(int cmd, void *notused, void *notused2)
        return sys_ni_syscall();
 }
 #endif
+#ifdef CONFIG_EPOLL
+#ifdef CONFIG_HAS_COMPAT_EPOLL_EVENT
+/*
+ * compat_sys_epoll_ctl - epoll_ctl entry point taking a compat_epoll_event
+ *
+ * When @event is non-NULL it is copied in, and its events and data fields
+ * are written into a struct epoll_event obtained from
+ * compat_alloc_user_space(); that repacked event is what sys_epoll_ctl()
+ * receives.  When @event is NULL, NULL is passed through unchanged.
+ *
+ * Returns -EFAULT if the compat event cannot be copied in, the accumulated
+ * __put_user() result if repacking fails, and otherwise whatever
+ * sys_epoll_ctl() returns.
+ */
+asmlinkage long compat_sys_epoll_ctl(int epfd, int op, int fd,
+                        struct compat_epoll_event __user *event)
+{
+        long err = 0;
+        struct compat_epoll_event user;
+        struct epoll_event __user *kernel = NULL;
+        if (event) {
+                if (copy_from_user(&user, event, sizeof(user)))
+                        return -EFAULT;
+                kernel = compat_alloc_user_space(sizeof(struct epoll_event));
+                /* repack field by field into the native layout */
+                err |= __put_user(user.events, &kernel->events);
+                err |= __put_user(user.data, &kernel->data);
+        }
+        return err ? err : sys_epoll_ctl(epfd, op, fd, kernel);
+}
+/*
+ * compat_sys_epoll_wait - epoll_wait entry point filling compat_epoll_event
+ *
+ * @maxevents must be positive and small enough that
+ * maxevents * sizeof(struct epoll_event) stays within INT_MAX; otherwise
+ * -EINVAL is returned.  sys_epoll_wait() is run against a buffer of native
+ * events obtained from compat_alloc_user_space(), and each returned event
+ * is then copied field by field into the caller's @events array.
+ *
+ * Returns -EFAULT if any of the per-field copies failed, otherwise the
+ * value returned by sys_epoll_wait() (the copy loop does not run when that
+ * value is zero or negative).
+ */
+asmlinkage long compat_sys_epoll_wait(int epfd,
+                        struct compat_epoll_event __user *events,
+                        int maxevents, int timeout)
+{
+        long i, ret, err = 0;
+        struct epoll_event __user *kbuf;
+        struct epoll_event ev;
+        if ((maxevents <= 0) ||
+                        (maxevents > (INT_MAX / sizeof(struct epoll_event))))
+                return -EINVAL;
+        kbuf = compat_alloc_user_space(sizeof(struct epoll_event) * maxevents);
+        ret = sys_epoll_wait(epfd, kbuf, maxevents, timeout);
+        for (i = 0; i < ret; i++) {
+                err |= __get_user(ev.events, &kbuf[i].events);
+                err |= __get_user(ev.data, &kbuf[i].data);
+                err |= __put_user(ev.events, &events->events);
+                /* the data member is stored with the unaligned variant */
+                err |= __put_user_unaligned(ev.data, &events->data);
+                events++;
+        }
+        return err ? -EFAULT: ret;
+}
+#endif  /* CONFIG_HAS_COMPAT_EPOLL_EVENT */
+#ifdef TIF_RESTORE_SIGMASK
+/*
+ * compat_sys_epoll_pwait - epoll_pwait entry point taking compat arguments
+ *
+ * When @sigmask is non-NULL, @sigsetsize must equal
+ * sizeof(compat_sigset_t) (else -EINVAL) and the mask must be readable
+ * (else -EFAULT).  The mask is converted with sigset_from_compat(),
+ * SIGKILL and SIGSTOP are removed from it, and it is installed for the
+ * duration of the wait.  The wait itself is compat_sys_epoll_wait() when
+ * CONFIG_HAS_COMPAT_EPOLL_EVENT is defined and sys_epoll_wait() otherwise.
+ *
+ * Returns the result of the wait call.
+ */
+asmlinkage long compat_sys_epoll_pwait(int epfd,
+                        struct compat_epoll_event __user *events,
+                        int maxevents, int timeout,
+                        const compat_sigset_t __user *sigmask,
+                        compat_size_t sigsetsize)
+{
+        long err;
+        compat_sigset_t csigmask;
+        sigset_t ksigmask, sigsaved;
+        /*
+         * If the caller wants a certain signal mask to be set during the wait,
+         * we apply it here.
+         */
+        if (sigmask) {
+                if (sigsetsize != sizeof(compat_sigset_t))
+                        return -EINVAL;
+                if (copy_from_user(&csigmask, sigmask, sizeof(csigmask)))
+                        return -EFAULT;
+                sigset_from_compat(&ksigmask, &csigmask);
+                sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP));
+                sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
+        }
+#ifdef CONFIG_HAS_COMPAT_EPOLL_EVENT
+        err = compat_sys_epoll_wait(epfd, events, maxevents, timeout);
+#else
+        err = sys_epoll_wait(epfd, events, maxevents, timeout);
+#endif
+        /*
+         * If we changed the signal mask, we need to restore the original one.
+         * In case we've got a signal while waiting, we do not restore the
+         * signal mask yet, and we allow do_signal() to deliver the signal on
+         * the way back to userspace, before the signal mask is restored.
+         */
+        if (sigmask) {
+                if (err == -EINTR) {
+                        memcpy(&current->saved_sigmask, &sigsaved,
+                               sizeof(sigsaved));
+                        set_thread_flag(TIF_RESTORE_SIGMASK);
+                } else
+                        sigprocmask(SIG_SETMASK, &sigsaved, NULL);
+        }
+        return err;
+}
+#endif /* TIF_RESTORE_SIGMASK */
+#endif /* CONFIG_EPOLL */
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index c81c958b3e1d..c68b055fa26e 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -266,6 +266,23 @@ static int do_siocgstamp(unsigned int fd, unsigned int cmd, unsigned long arg)
        return err;
 }
+/*
+ * do_siocgstampns - compat handler registered for SIOCGSTAMPNS
+ *
+ * Issues the ioctl through sys_ioctl() with the address of an on-stack
+ * struct timespec as its argument, with KERNEL_DS in effect around the
+ * call, and on success copies tv_sec and tv_nsec into the
+ * struct compat_timespec that @arg points to.
+ *
+ * Returns the sys_ioctl() error if it is non-zero, otherwise the combined
+ * result of the two user-space stores.
+ */
+static int do_siocgstampns(unsigned int fd, unsigned int cmd, unsigned long arg)
+{
+        struct compat_timespec __user *up = compat_ptr(arg);
+        struct timespec kts;
+        mm_segment_t old_fs = get_fs();
+        int err;
+        set_fs(KERNEL_DS);
+        err = sys_ioctl(fd, cmd, (unsigned long)&kts);
+        /* put the previous address limit back before touching user memory */
+        set_fs(old_fs);
+        if (!err) {
+                err = put_user(kts.tv_sec, &up->tv_sec);
+                err |= __put_user(kts.tv_nsec, &up->tv_nsec);
+        }
+        return err;
+}
 struct ifmap32 {
        compat_ulong_t mem_start;
        compat_ulong_t mem_end;
@@ -2437,6 +2454,7 @@ HANDLE_IOCTL(SIOCBRDELIF, dev_ifsioc)
 /* Note SIOCRTMSG is no longer, so this is safe and * the user would have seen just an -EINVAL anyways. */
 HANDLE_IOCTL(SIOCRTMSG, ret_einval)
 HANDLE_IOCTL(SIOCGSTAMP, do_siocgstamp)
+HANDLE_IOCTL(SIOCGSTAMPNS, do_siocgstampns)
 #endif
 #ifdef CONFIG_BLOCK
 HANDLE_IOCTL(HDIO_GETGEO, hdio_getgeo)
@@ -2553,11 +2571,15 @@ HANDLE_IOCTL(I2C_RDWR, do_i2c_rdwr_ioctl)
 HANDLE_IOCTL(I2C_SMBUS, do_i2c_smbus_ioctl)
 /* wireless */
 HANDLE_IOCTL(SIOCGIWRANGE, do_wireless_ioctl)
+HANDLE_IOCTL(SIOCGIWPRIV, do_wireless_ioctl)
+HANDLE_IOCTL(SIOCGIWSTATS, do_wireless_ioctl)
 HANDLE_IOCTL(SIOCSIWSPY, do_wireless_ioctl)
 HANDLE_IOCTL(SIOCGIWSPY, do_wireless_ioctl)
 HANDLE_IOCTL(SIOCSIWTHRSPY, do_wireless_ioctl)
 HANDLE_IOCTL(SIOCGIWTHRSPY, do_wireless_ioctl)
+HANDLE_IOCTL(SIOCSIWMLME, do_wireless_ioctl)
 HANDLE_IOCTL(SIOCGIWAPLIST, do_wireless_ioctl)
+HANDLE_IOCTL(SIOCSIWSCAN, do_wireless_ioctl)
 HANDLE_IOCTL(SIOCGIWSCAN, do_wireless_ioctl)
 HANDLE_IOCTL(SIOCSIWESSID, do_wireless_ioctl)
 HANDLE_IOCTL(SIOCGIWESSID, do_wireless_ioctl)
@@ -2565,6 +2587,11 @@ HANDLE_IOCTL(SIOCSIWNICKN, do_wireless_ioctl)
 HANDLE_IOCTL(SIOCGIWNICKN, do_wireless_ioctl)
 HANDLE_IOCTL(SIOCSIWENCODE, do_wireless_ioctl)
 HANDLE_IOCTL(SIOCGIWENCODE, do_wireless_ioctl)
+HANDLE_IOCTL(SIOCSIWGENIE, do_wireless_ioctl)
+HANDLE_IOCTL(SIOCGIWGENIE, do_wireless_ioctl)
+HANDLE_IOCTL(SIOCSIWENCODEEXT, do_wireless_ioctl)
+HANDLE_IOCTL(SIOCGIWENCODEEXT, do_wireless_ioctl)
+HANDLE_IOCTL(SIOCSIWPMKSA, do_wireless_ioctl)
 HANDLE_IOCTL(SIOCSIFBR, old_bridge_ioctl)
 HANDLE_IOCTL(SIOCGIFBR, old_bridge_ioctl)
 HANDLE_IOCTL(RTC_IRQP_READ32, rtc_ioctl)
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 34750d5e4ff2..5e6e37e58f36 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -1141,25 +1141,22 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys)
        err = -ENOMEM;
        dentry = d_alloc(configfs_sb->s_root, &name);
-        if (!dentry)
+        if (dentry) {
-                goto out_release;
+                d_add(dentry, NULL);
-        d_add(dentry, NULL);
-        err = configfs_attach_group(sd->s_element, &group->cg_item,
+                err = configfs_attach_group(sd->s_element, &group->cg_item,
-                                    dentry);
+                                            dentry);
-        if (!err)
+                if (err) {
-                dentry = NULL;
+                        d_delete(dentry);
-        else
+                        dput(dentry);
-                d_delete(dentry);
+                }
+        }
        mutex_unlock(&configfs_sb->s_root->d_inode->i_mutex);
-        if (dentry) {
+        if (err) {
-            dput(dentry);
+                unlink_group(group);
-out_release:
+                configfs_release_fs();
-            unlink_group(group);
-            configfs_release_fs();
        }
        return err;
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 8d130cc85322..2e124e0075c5 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -16,6 +16,7 @@
 #include <linux/module.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
+#include <linux/namei.h>
 #include <linux/debugfs.h>
 static ssize_t default_read_file(struct file *file, char __user *buf,
@@ -44,6 +45,17 @@ const struct file_operations debugfs_file_operations = {
        .open =         default_open,
 };
+/*
+ * follow_link method for debugfs symlinks: hands the pointer kept in the
+ * inode's i_private to nd_set_link() as the link body and returns NULL.
+ */
+static void *debugfs_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+        nd_set_link(nd, dentry->d_inode->i_private);
+        return NULL;
+}
+/*
+ * Inode operations for debugfs symlinks: readlink goes through
+ * generic_readlink, follow_link through debugfs_follow_link above.
+ */
+const struct inode_operations debugfs_link_operations = {
+        .readlink       = generic_readlink,
+        .follow_link    = debugfs_follow_link,
+};
 static void debugfs_u8_set(void *data, u64 val)
 {
        *(u8 *)data = val;
@@ -167,6 +179,48 @@ struct dentry *debugfs_create_u32(const char *name, mode_t mode,
 }
 EXPORT_SYMBOL_GPL(debugfs_create_u32);
+/* store @val into the u64 that @data points to */
+static void debugfs_u64_set(void *data, u64 val)
+{
+        *(u64 *)data = val;
+}
+/* return the current value of the u64 that @data points to */
+static u64 debugfs_u64_get(void *data)
+{
+        return *(u64 *)data;
+}
+/* fops_u64 pairs the getter and setter above, using the "%llu\n" format */
+DEFINE_SIMPLE_ATTRIBUTE(fops_u64, debugfs_u64_get, debugfs_u64_set, "%llu\n");
+/**
+ * debugfs_create_u64 - create a debugfs file that is used to read and write an unsigned 64-bit value
+ * @name: a pointer to a string containing the name of the file to create.
+ * @mode: the permission that the file should have
+ * @parent: a pointer to the parent dentry for this file.  This should be a
+ *          directory dentry if set.  If this parameter is %NULL, then the
+ *          file will be created in the root of the debugfs filesystem.
+ * @value: a pointer to the variable that the file should read from and
+ *         write to.
+ *
+ * This function creates a file in debugfs with the given name that
+ * contains the value of the variable @value.  If the @mode variable is so
+ * set, it can be read from, and written to.
+ *
+ * This function will return a pointer to a dentry if it succeeds.  This
+ * pointer must be passed to the debugfs_remove() function when the file is
+ * to be removed (no automatic cleanup happens if your module is unloaded,
+ * you are responsible here.)  If an error occurs, %NULL will be returned.
+ *
+ * If debugfs is not enabled in the kernel, the value -%ENODEV will be
+ * returned.  It is not wise to check for this value, but rather, check for
+ * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling
+ * code.
+ */
+struct dentry *debugfs_create_u64(const char *name, mode_t mode,
+                                 struct dentry *parent, u64 *value)
+{
+        return debugfs_create_file(name, mode, parent, value, &fops_u64);
+}
+EXPORT_SYMBOL_GPL(debugfs_create_u64);
 static ssize_t read_file_bool(struct file *file, char __user *user_buf,
                              size_t count, loff_t *ppos)
 {
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index c692487346ea..7b324cfebcb1 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -25,11 +25,13 @@
 #include <linux/namei.h>
 #include <linux/debugfs.h>
 #include <linux/fsnotify.h>
+#include <linux/string.h>
 #define DEBUGFS_MAGIC   0x64626720
 /* declared over in file.c */
 extern struct file_operations debugfs_file_operations;
+/* const-qualified so the declaration agrees with the definition in file.c */
+extern const struct inode_operations debugfs_link_operations;
 static struct vfsmount *debugfs_mount;
 static int debugfs_mount_count;
@@ -51,6 +53,9 @@ static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t d
                case S_IFREG:
                        inode->i_fop = &debugfs_file_operations;
                        break;
+                case S_IFLNK:
+                        inode->i_op = &debugfs_link_operations;
+                        break;
                case S_IFDIR:
                        inode->i_op = &simple_dir_inode_operations;
                        inode->i_fop = &simple_dir_operations;
@@ -96,6 +101,12 @@ static int debugfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
        return res;
 }
+/*
+ * Create a symlink inode for @dentry in @dir: only the S_IALLUGO bits of
+ * @mode are kept, the file type is forced to S_IFLNK, and the node is made
+ * through debugfs_mknod() with a device number of 0.  Returns the
+ * debugfs_mknod() result.
+ */
+static int debugfs_link(struct inode *dir, struct dentry *dentry, int mode)
+{
+        mode = (mode & S_IALLUGO) | S_IFLNK;
+        return debugfs_mknod(dir, dentry, mode, 0);
+}
 static int debugfs_create(struct inode *dir, struct dentry *dentry, int mode)
 {
        int res;
@@ -158,10 +169,17 @@ static int debugfs_create_by_name(const char *name, mode_t mode,
        mutex_lock(&parent->d_inode->i_mutex);
        *dentry = lookup_one_len(name, parent, strlen(name));
        if (!IS_ERR(*dentry)) {
-                if ((mode & S_IFMT) == S_IFDIR)
+                switch (mode & S_IFMT) {
+                case S_IFDIR:
                        error = debugfs_mkdir(parent->d_inode, *dentry, mode);
-                else 
+                        break;
+                case S_IFLNK:
+                        error = debugfs_link(parent->d_inode, *dentry, mode);
+                        break;
+                default:
                        error = debugfs_create(parent->d_inode, *dentry, mode);
+                        break;
+                }
                dput(*dentry);
        } else
                error = PTR_ERR(*dentry);
@@ -194,9 +212,7 @@ static int debugfs_create_by_name(const char *name, mode_t mode,
 * you are responsible here.)  If an error occurs, %NULL will be returned.
 *
 * If debugfs is not enabled in the kernel, the value -%ENODEV will be
- * returned.  It is not wise to check for this value, but rather, check for
+ * returned.
- * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling
- * code.
 */
 struct dentry *debugfs_create_file(const char *name, mode_t mode,
                                   struct dentry *parent, void *data,
@@ -246,9 +262,7 @@ EXPORT_SYMBOL_GPL(debugfs_create_file);
 * you are responsible here.)  If an error occurs, %NULL will be returned.
 *
 * If debugfs is not enabled in the kernel, the value -%ENODEV will be
- * returned.  It is not wise to check for this value, but rather, check for
+ * returned.
- * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling
- * code.
 */
 struct dentry *debugfs_create_dir(const char *name, struct dentry *parent)
 {
@@ -259,6 +273,47 @@ struct dentry *debugfs_create_dir(const char *name, struct dentry *parent)
 EXPORT_SYMBOL_GPL(debugfs_create_dir);
 /**
+ * debugfs_create_symlink - create a symbolic link in the debugfs filesystem
+ * @name: a pointer to a string containing the name of the symbolic link to
+ *        create.
+ * @parent: a pointer to the parent dentry for this symbolic link.  This
+ *          should be a directory dentry if set.  If this parameter is NULL,
+ *          then the symbolic link will be created in the root of the debugfs
+ *          filesystem.
+ * @target: a pointer to a string containing the path to the target of the
+ *          symbolic link.
+ *
+ * This function creates a symbolic link with the given name in debugfs that
+ * links to the given target path.
+ *
+ * @target is duplicated with kstrdup(), so the caller's string does not
+ * need to stay valid after this call.  The copy is freed here if creating
+ * the file fails; otherwise it is handed to debugfs_create_file() as the
+ * file's data pointer.
+ *
+ * This function will return a pointer to a dentry if it succeeds.  This
+ * pointer must be passed to the debugfs_remove() function when the symbolic
+ * link is to be removed (no automatic cleanup happens if your module is
+ * unloaded, you are responsible here.)  If an error occurs, %NULL will be
+ * returned.
+ *
+ * If debugfs is not enabled in the kernel, the value -%ENODEV will be
+ * returned.
+ */
+struct dentry *debugfs_create_symlink(const char *name, struct dentry *parent,
+                                      const char *target)
+{
+        struct dentry *result;
+        char *link;
+        link = kstrdup(target, GFP_KERNEL);
+        if (!link)
+                return NULL;
+        /* no file_operations are supplied for a symlink, hence the NULL */
+        result = debugfs_create_file(name, S_IFLNK | S_IRWXUGO, parent, link,
+                                     NULL);
+        if (!result)
+                kfree(link);
+        return result;
+}
+EXPORT_SYMBOL_GPL(debugfs_create_symlink);
+/**
 * debugfs_remove - removes a file or directory from the debugfs filesystem
 * @dentry: a pointer to a the dentry of the file or directory to be
 *          removed.
@@ -287,15 +342,22 @@ void debugfs_remove(struct dentry *dentry)
        if (debugfs_positive(dentry)) {
                if (dentry->d_inode) {
                        dget(dentry);
-                        if (S_ISDIR(dentry->d_inode->i_mode)) {
+                        switch (dentry->d_inode->i_mode & S_IFMT) {
+                        case S_IFDIR:
                                ret = simple_rmdir(parent->d_inode, dentry);
                                if (ret)
                                        printk(KERN_ERR
                                                "DebugFS rmdir on %s failed : "
                                                "directory not empty.\n",
                                                dentry->d_name.name);
-                        } else
+                                break;
+                        case S_IFLNK:
+                                kfree(dentry->d_inode->i_private);
+                                /* fall through */
+                        default:
                                simple_unlink(parent->d_inode, dentry);
+                                break;
+                        }
                        if (!ret)
                                d_delete(dentry);
                        dput(dentry);
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index 40db61dc95f2..3870150b83a4 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -22,6 +22,7 @@
 #include "lockspace.h"
 #include "lock.h"
 #include "lvb_table.h"
+#include "user.h"
 static const char *name_prefix="dlm";
 static struct miscdevice ctl_device;
diff --git a/fs/ecryptfs/dentry.c b/fs/ecryptfs/dentry.c
index 329efcd3d8c9..cb20b964419f 100644
--- a/fs/ecryptfs/dentry.c
+++ b/fs/ecryptfs/dentry.c
@@ -78,18 +78,13 @@ struct kmem_cache *ecryptfs_dentry_info_cache;
 */
 static void ecryptfs_d_release(struct dentry *dentry)
 {
-        struct dentry *lower_dentry;
+        if (ecryptfs_dentry_to_private(dentry)) {
+                if (ecryptfs_dentry_to_lower(dentry)) {
-        lower_dentry = ecryptfs_dentry_to_lower(dentry);
+                        mntput(ecryptfs_dentry_to_lower_mnt(dentry));
-        if (ecryptfs_dentry_to_private(dentry))
+                        dput(ecryptfs_dentry_to_lower(dentry));
+                }
                kmem_cache_free(ecryptfs_dentry_info_cache,
                                ecryptfs_dentry_to_private(dentry));
-        if (lower_dentry) {
-                struct vfsmount *lower_mnt =
-                        ecryptfs_dentry_to_lower_mnt(dentry);
-                mntput(lower_mnt);
-                dput(lower_dentry);
        }
        return;
 }
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index bd969adf70d7..7a7d25d541e7 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -205,6 +205,7 @@ int ecryptfs_open_lower_file(struct file **lower_file,
 {
        int rc = 0;
+        flags |= O_LARGEFILE;
        dget(lower_dentry);
        mntget(lower_mnt);
        *lower_file = dentry_open(lower_dentry, lower_mnt, flags);
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 9fa7e0b27a96..1548be26b5e6 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -38,7 +38,7 @@ static struct dentry *lock_parent(struct dentry *dentry)
        struct dentry *dir;
        dir = dget(dentry->d_parent);
-        mutex_lock(&(dir->d_inode->i_mutex));
+        mutex_lock_nested(&(dir->d_inode->i_mutex), I_MUTEX_PARENT);
        return dir;
 }
@@ -168,9 +168,9 @@ static int grow_file(struct dentry *ecryptfs_dentry, struct file *lower_file,
                goto out;
        }
        i_size_write(inode, 0);
-        ecryptfs_write_inode_size_to_metadata(lower_file, lower_inode, inode,
+        rc = ecryptfs_write_inode_size_to_metadata(lower_file, lower_inode,
-                                              ecryptfs_dentry,
+                        inode, ecryptfs_dentry,
-                                              ECRYPTFS_LOWER_I_MUTEX_NOT_HELD);
+                        ECRYPTFS_LOWER_I_MUTEX_NOT_HELD);
        ecryptfs_inode_to_private(inode)->crypt_stat.flags |= ECRYPTFS_NEW_FILE;
 out:
        return rc;
@@ -200,9 +200,6 @@ static int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry)
        inode = ecryptfs_dentry->d_inode;
        crypt_stat = &ecryptfs_inode_to_private(inode)->crypt_stat;
        lower_flags = ((O_CREAT | O_TRUNC) & O_ACCMODE) | O_RDWR;
-#if BITS_PER_LONG != 32
-        lower_flags |= O_LARGEFILE;
-#endif
        lower_mnt = ecryptfs_dentry_to_lower_mnt(ecryptfs_dentry);
        /* Corresponding fput() at end of this function */
        if ((rc = ecryptfs_open_lower_file(&lower_file, lower_dentry, lower_mnt,
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 80044d196fe0..fc4a3a224641 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -484,18 +484,12 @@ static int ecryptfs_read_super(struct super_block *sb, const char *dev_name)
        struct vfsmount *lower_mnt;
        memset(&nd, 0, sizeof(struct nameidata));
-        rc = path_lookup(dev_name, LOOKUP_FOLLOW, &nd);
+        rc = path_lookup(dev_name, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &nd);
        if (rc) {
                ecryptfs_printk(KERN_WARNING, "path_lookup() failed\n");
-                goto out_free;
+                goto out;
        }
        lower_root = nd.dentry;
-        if (!lower_root->d_inode) {
-                ecryptfs_printk(KERN_WARNING,
-                                "No directory to interpose on\n");
-                rc = -ENOENT;
-                goto out_free;
-        }
        lower_mnt = nd.mnt;
        ecryptfs_set_superblock_lower(sb, lower_root->d_sb);
        sb->s_maxbytes = lower_root->d_sb->s_maxbytes;
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index 3a6f65c3f14f..b731b09499cb 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -238,7 +238,6 @@ int ecryptfs_do_readpage(struct file *file, struct page *page,
        lower_page_data = kmap_atomic(lower_page, KM_USER1);
        memcpy(page_data, lower_page_data, PAGE_CACHE_SIZE);
        kunmap_atomic(lower_page_data, KM_USER1);
-        flush_dcache_page(lower_page);
        kunmap_atomic(page_data, KM_USER0);
        flush_dcache_page(page);
        rc = 0;
@@ -422,9 +421,11 @@ out:
        return rc;
 }
-static void ecryptfs_release_lower_page(struct page *lower_page)
+static
+void ecryptfs_release_lower_page(struct page *lower_page, int page_locked)
 {
-        unlock_page(lower_page);
+        if (page_locked)
+                unlock_page(lower_page);
        page_cache_release(lower_page);
 }
@@ -445,6 +446,7 @@ static int ecryptfs_write_inode_size_to_header(struct file *lower_file,
        const struct address_space_operations *lower_a_ops;
        u64 file_size;
+retry:
        header_page = grab_cache_page(lower_inode->i_mapping, 0);
        if (!header_page) {
                ecryptfs_printk(KERN_ERR, "grab_cache_page for "
@@ -454,6 +456,14 @@ static int ecryptfs_write_inode_size_to_header(struct file *lower_file,
        }
        lower_a_ops = lower_inode->i_mapping->a_ops;
        rc = lower_a_ops->prepare_write(lower_file, header_page, 0, 8);
+        if (rc) {
+                if (rc == AOP_TRUNCATED_PAGE) {
+                        ecryptfs_release_lower_page(header_page, 0);
+                        goto retry;
+                } else
+                        ecryptfs_release_lower_page(header_page, 1);
+                goto out;
+        }
        file_size = (u64)i_size_read(inode);
        ecryptfs_printk(KERN_DEBUG, "Writing size: [0x%.16x]\n", file_size);
        file_size = cpu_to_be64(file_size);
@@ -465,7 +475,11 @@ static int ecryptfs_write_inode_size_to_header(struct file *lower_file,
        if (rc < 0)
                ecryptfs_printk(KERN_ERR, "Error commiting header page "
                                "write\n");
-        ecryptfs_release_lower_page(header_page);
+        if (rc == AOP_TRUNCATED_PAGE) {
+                ecryptfs_release_lower_page(header_page, 0);
+                goto retry;
+        } else
+                ecryptfs_release_lower_page(header_page, 1);
        lower_inode->i_mtime = lower_inode->i_ctime = CURRENT_TIME;
        mark_inode_dirty_sync(inode);
 out:
@@ -491,7 +505,8 @@ static int ecryptfs_write_inode_size_to_xattr(struct inode *lower_inode,
                goto out;
        }
        lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry);
-        if (!lower_dentry->d_inode->i_op->getxattr) {
+        if (!lower_dentry->d_inode->i_op->getxattr ||
+                        !lower_dentry->d_inode->i_op->setxattr) {
                printk(KERN_WARNING
                       "No support for setting xattr in lower filesystem\n");
                rc = -ENOSYS;
@@ -553,6 +568,7 @@ int ecryptfs_get_lower_page(struct page **lower_page, struct inode *lower_inode,
 {
        int rc = 0;
+retry:
        *lower_page = grab_cache_page(lower_inode->i_mapping, lower_page_index);
        if (!(*lower_page)) {
                rc = -EINVAL;
@@ -566,15 +582,18 @@ int ecryptfs_get_lower_page(struct page **lower_page, struct inode *lower_inode,
                                                          byte_offset,
                                                          region_bytes);
        if (rc) {
-                ecryptfs_printk(KERN_ERR, "prepare_write for "
+                if (rc == AOP_TRUNCATED_PAGE) {
+                        ecryptfs_release_lower_page(*lower_page, 0);
+                        goto retry;
+                } else {
+                        ecryptfs_printk(KERN_ERR, "prepare_write for "
                                "lower_page_index = [0x%.16x] failed; rc = "
                                "[%d]\n", lower_page_index, rc);
+                        ecryptfs_release_lower_page(*lower_page, 1);
+                        (*lower_page) = NULL;
+                }
        }
 out:
-        if (rc && (*lower_page)) {
-                ecryptfs_release_lower_page(*lower_page);
-                (*lower_page) = NULL;
-        }
        return rc;
 }
@@ -588,16 +607,19 @@ ecryptfs_commit_lower_page(struct page *lower_page, struct inode *lower_inode,
                           struct file *lower_file, int byte_offset,
                           int region_size)
 {
+        int page_locked = 1;
        int rc = 0;
        rc = lower_inode->i_mapping->a_ops->commit_write(
                lower_file, lower_page, byte_offset, region_size);
+        if (rc == AOP_TRUNCATED_PAGE)
+                page_locked = 0;
        if (rc < 0) {
                ecryptfs_printk(KERN_ERR,
                                "Error committing write; rc = [%d]\n", rc);
        } else
                rc = 0;
-        ecryptfs_release_lower_page(lower_page);
+        ecryptfs_release_lower_page(lower_page, page_locked);
        return rc;
 }
diff --git a/fs/ecryptfs/netlink.c b/fs/ecryptfs/netlink.c
index e3aa2253c850..fe9186312d7c 100644
--- a/fs/ecryptfs/netlink.c
+++ b/fs/ecryptfs/netlink.c
@@ -97,7 +97,7 @@ out:
 */
 static int ecryptfs_process_nl_response(struct sk_buff *skb)
 {
-        struct nlmsghdr *nlh = (struct nlmsghdr*)skb->data;
+        struct nlmsghdr *nlh = nlmsg_hdr(skb);
        struct ecryptfs_message *msg = NLMSG_DATA(nlh);
        int rc;
@@ -181,7 +181,7 @@ receive:
                                "rc = [%d]\n", rc);
                return;
        }
-        nlh = (struct nlmsghdr *)skb->data;
+        nlh = nlmsg_hdr(skb);
        if (!NLMSG_OK(nlh, skb->len)) {
                ecryptfs_printk(KERN_ERR, "Received corrupt netlink "
                                "message\n");
@@ -229,7 +229,7 @@ int ecryptfs_init_netlink(void)
        ecryptfs_nl_sock = netlink_kernel_create(NETLINK_ECRYPTFS, 0,
                                                 ecryptfs_receive_nl_message,
-                                                 THIS_MODULE);
+                                                 NULL, THIS_MODULE);
        if (!ecryptfs_nl_sock) {
                rc = -EIO;
                ecryptfs_printk(KERN_ERR, "Failed to create netlink socket\n");
diff --git a/fs/exec.c b/fs/exec.c
index 7e36c6f6f538..3155e915307a 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1244,13 +1244,17 @@ EXPORT_SYMBOL(set_binfmt);
 * name into corename, which must have space for at least
 * CORENAME_MAX_SIZE bytes plus one byte for the zero terminator.
 */
-static void format_corename(char *corename, const char *pattern, long signr)
+static int format_corename(char *corename, const char *pattern, long signr)
 {
        const char *pat_ptr = pattern;
        char *out_ptr = corename;
        char *const out_end = corename + CORENAME_MAX_SIZE;
        int rc;
        int pid_in_pattern = 0;
+        int ispipe = 0;
+        if (*pattern == '|')
+                ispipe = 1;
        /* Repeat as long as we have more pattern to process and more output
           space */
@@ -1341,8 +1345,8 @@ static void format_corename(char *corename, const char *pattern, long signr)
         *
         * If core_pattern does not include a %p (as is the default)
         * and core_uses_pid is set, then .%pid will be appended to
-         * the filename */
+         * the filename. Do not do this for piped commands. */
-        if (!pid_in_pattern
+        if (!ispipe && !pid_in_pattern
            && (core_uses_pid || atomic_read(&current->mm->mm_users) != 1)) {
                rc = snprintf(out_ptr, out_end - out_ptr,
                              ".%d", current->tgid);
@@ -1350,8 +1354,9 @@ static void format_corename(char *corename, const char *pattern, long signr)
                        goto out;
                out_ptr += rc;
        }
-      out:
+out:
        *out_ptr = 0;
+        return ispipe;
 }
 static void zap_process(struct task_struct *start)
@@ -1502,16 +1507,15 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs)
         * uses lock_kernel()
         */
        lock_kernel();
-        format_corename(corename, core_pattern, signr);
+        ispipe = format_corename(corename, core_pattern, signr);
        unlock_kernel();
-        if (corename[0] == '|') {
+        if (ispipe) {
                /* SIGPIPE can happen, but it's just never processed */
                if(call_usermodehelper_pipe(corename+1, NULL, NULL, &file)) {
                        printk(KERN_INFO "Core dump to %s pipe failed\n",
                               corename);
                        goto fail_unlock;
                }
-                ispipe = 1;
        } else
                file = filp_open(corename,
                                 O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag,
diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c
index b1981d0e95ad..baf71dd721fa 100644
--- a/fs/ext2/balloc.c
+++ b/fs/ext2/balloc.c
@@ -29,7 +29,7 @@
 * The file system contains group descriptors which are located after the
 * super block.  Each descriptor contains the number of the bitmap block and
 * the free blocks count in the block.  The descriptors are loaded in memory
- * when a file system is mounted (see ext2_read_super).
+ * when a file system is mounted (see ext2_fill_super).
 */
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index 22161740ba29..ca8aee6efe37 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -32,7 +32,7 @@
 * The file system contains group descriptors which are located after the
 * super block.  Each descriptor contains the number of the bitmap block and
 * the free blocks count in the block.  The descriptors are loaded in memory
- * when a file system is mounted (see ext3_read_super).
+ * when a file system is mounted (see ext3_fill_super).
 */
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 8a824f4ce5c6..a5b150f7e8a2 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1148,102 +1148,37 @@ static int do_journal_get_write_access(handle_t *handle,
        return ext3_journal_get_write_access(handle, bh);
 }
-/*
- * The idea of this helper function is following:
- * if prepare_write has allocated some blocks, but not all of them, the
- * transaction must include the content of the newly allocated blocks.
- * This content is expected to be set to zeroes by block_prepare_write().
- * 2006/10/14  SAW
- */
-static int ext3_prepare_failure(struct file *file, struct page *page,
-                                unsigned from, unsigned to)
-{
-        struct address_space *mapping;
-        struct buffer_head *bh, *head, *next;
-        unsigned block_start, block_end;
-        unsigned blocksize;
-        int ret;
-        handle_t *handle = ext3_journal_current_handle();
-        mapping = page->mapping;
-        if (ext3_should_writeback_data(mapping->host)) {
-                /* optimization: no constraints about data */
-skip:
-                return ext3_journal_stop(handle);
-        }
-        head = page_buffers(page);
-        blocksize = head->b_size;
-        for (   bh = head, block_start = 0;
-                bh != head || !block_start;
-                block_start = block_end, bh = next)
-        {
-                next = bh->b_this_page;
-                block_end = block_start + blocksize;
-                if (block_end <= from)
-                        continue;
-                if (block_start >= to) {
-                        block_start = to;
-                        break;
-                }
-                if (!buffer_mapped(bh))
-                /* prepare_write failed on this bh */
-                        break;
-                if (ext3_should_journal_data(mapping->host)) {
-                        ret = do_journal_get_write_access(handle, bh);
-                        if (ret) {
-                                ext3_journal_stop(handle);
-                                return ret;
-                        }
-                }
-        /*
-         * block_start here becomes the first block where the current iteration
-         * of prepare_write failed.
-         */
-        }
-        if (block_start <= from)
-                goto skip;
-        /* commit allocated and zeroed buffers */
-        return mapping->a_ops->commit_write(file, page, from, block_start);
-}
 static int ext3_prepare_write(struct file *file, struct page *page,
                              unsigned from, unsigned to)
 {
        struct inode *inode = page->mapping->host;
-        int ret, ret2;
+        int ret, needed_blocks = ext3_writepage_trans_blocks(inode);
-        int needed_blocks = ext3_writepage_trans_blocks(inode);
        handle_t *handle;
        int retries = 0;
 retry:
        handle = ext3_journal_start(inode, needed_blocks);
-        if (IS_ERR(handle))
+        if (IS_ERR(handle)) {
-                return PTR_ERR(handle);
+                ret = PTR_ERR(handle);
+                goto out;
+        }
        if (test_opt(inode->i_sb, NOBH) && ext3_should_writeback_data(inode))
                ret = nobh_prepare_write(page, from, to, ext3_get_block);
        else
                ret = block_prepare_write(page, from, to, ext3_get_block);
        if (ret)
-                goto failure;
+                goto prepare_write_failed;
        if (ext3_should_journal_data(inode)) {
                ret = walk_page_buffers(handle, page_buffers(page),
                                from, to, NULL, do_journal_get_write_access);
-                if (ret)
-                        /* fatal error, just put the handle and return */
-                        journal_stop(handle);
        }
-        return ret;
+prepare_write_failed:
+        if (ret)
-failure:
+                ext3_journal_stop(handle);
-        ret2 = ext3_prepare_failure(file, page, from, to);
-        if (ret2 < 0)
-                return ret2;
        if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
                goto retry;
-        /* retry number exceeded, or other error like -EDQUOT */
+out:
        return ret;
 }
diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c
index 99857a400f4b..f58cbb26323e 100644
--- a/fs/ext3/xattr.c
+++ b/fs/ext3/xattr.c
@@ -475,8 +475,15 @@ ext3_xattr_release_block(handle_t *handle, struct inode *inode,
                         struct buffer_head *bh)
 {
        struct mb_cache_entry *ce = NULL;
+        int error = 0;
        ce = mb_cache_entry_get(ext3_xattr_cache, bh->b_bdev, bh->b_blocknr);
+        error = ext3_journal_get_write_access(handle, bh);
+        if (error)
+                 goto out;
+        lock_buffer(bh);
        if (BHDR(bh)->h_refcount == cpu_to_le32(1)) {
                ea_bdebug(bh, "refcount now=0; freeing");
                if (ce)
@@ -485,21 +492,21 @@ ext3_xattr_release_block(handle_t *handle, struct inode *inode,
                get_bh(bh);
                ext3_forget(handle, 1, inode, bh, bh->b_blocknr);
        } else {
-                if (ext3_journal_get_write_access(handle, bh) == 0) {
+                BHDR(bh)->h_refcount = cpu_to_le32(
-                        lock_buffer(bh);
-                        BHDR(bh)->h_refcount = cpu_to_le32(
                                le32_to_cpu(BHDR(bh)->h_refcount) - 1);
-                        ext3_journal_dirty_metadata(handle, bh);
+                error = ext3_journal_dirty_metadata(handle, bh);
-                        if (IS_SYNC(inode))
+                if (IS_SYNC(inode))
-                                handle->h_sync = 1;
+                        handle->h_sync = 1;
-                        DQUOT_FREE_BLOCK(inode, 1);
+                DQUOT_FREE_BLOCK(inode, 1);
-                        unlock_buffer(bh);
+                ea_bdebug(bh, "refcount now=%d; releasing",
-                        ea_bdebug(bh, "refcount now=%d; releasing",
+                          le32_to_cpu(BHDR(bh)->h_refcount));
-                                  le32_to_cpu(BHDR(bh)->h_refcount));
-                }
                if (ce)
                        mb_cache_entry_release(ce);
        }
+        unlock_buffer(bh);
+out:
+        ext3_std_error(inode->i_sb, error);
+        return;
 }
 struct ext3_xattr_info {
@@ -675,7 +682,7 @@ ext3_xattr_block_set(handle_t *handle, struct inode *inode,
        struct buffer_head *new_bh = NULL;
        struct ext3_xattr_search *s = &bs->s;
        struct mb_cache_entry *ce = NULL;
-        int error;
+        int error = 0;
 #define header(x) ((struct ext3_xattr_header *)(x))
@@ -684,16 +691,17 @@ ext3_xattr_block_set(handle_t *handle, struct inode *inode,
        if (s->base) {
                ce = mb_cache_entry_get(ext3_xattr_cache, bs->bh->b_bdev,
                                        bs->bh->b_blocknr);
+                error = ext3_journal_get_write_access(handle, bs->bh);
+                if (error)
+                        goto cleanup;
+                lock_buffer(bs->bh);
                if (header(s->base)->h_refcount == cpu_to_le32(1)) {
                        if (ce) {
                                mb_cache_entry_free(ce);
                                ce = NULL;
                        }
                        ea_bdebug(bs->bh, "modifying in-place");
-                        error = ext3_journal_get_write_access(handle, bs->bh);
-                        if (error)
-                                goto cleanup;
-                        lock_buffer(bs->bh);
                        error = ext3_xattr_set_entry(i, s);
                        if (!error) {
                                if (!IS_LAST_ENTRY(s->first))
@@ -713,6 +721,9 @@ ext3_xattr_block_set(handle_t *handle, struct inode *inode,
                } else {
                        int offset = (char *)s->here - bs->bh->b_data;
+                        unlock_buffer(bs->bh);
+                        journal_release_buffer(handle, bs->bh);
                        if (ce) {
                                mb_cache_entry_release(ce);
                                ce = NULL;
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index c4dd1103ccf1..8a23483ca8d0 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -50,7 +50,7 @@ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
 * The file system contains group descriptors which are located after the
 * super block.  Each descriptor contains the number of the bitmap block and
 * the free blocks count in the block.  The descriptors are loaded in memory
- * when a file system is mounted (see ext4_read_super).
+ * when a file system is mounted (see ext4_fill_super).
 */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index dc2724fa7622..7916b50f9a13 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -222,7 +222,7 @@ static int ext4_ext_space_block(struct inode *inode)
        size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
                        / sizeof(struct ext4_extent);
-#ifdef AGRESSIVE_TEST
+#ifdef AGGRESSIVE_TEST
        if (size > 6)
                size = 6;
 #endif
@@ -235,7 +235,7 @@ static int ext4_ext_space_block_idx(struct inode *inode)
        size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
                        / sizeof(struct ext4_extent_idx);
-#ifdef AGRESSIVE_TEST
+#ifdef AGGRESSIVE_TEST
        if (size > 5)
                size = 5;
 #endif
@@ -249,7 +249,7 @@ static int ext4_ext_space_root(struct inode *inode)
        size = sizeof(EXT4_I(inode)->i_data);
        size -= sizeof(struct ext4_extent_header);
        size /= sizeof(struct ext4_extent);
-#ifdef AGRESSIVE_TEST
+#ifdef AGGRESSIVE_TEST
        if (size > 3)
                size = 3;
 #endif
@@ -263,7 +263,7 @@ static int ext4_ext_space_root_idx(struct inode *inode)
        size = sizeof(EXT4_I(inode)->i_data);
        size -= sizeof(struct ext4_extent_header);
        size /= sizeof(struct ext4_extent_idx);
-#ifdef AGRESSIVE_TEST
+#ifdef AGGRESSIVE_TEST
        if (size > 4)
                size = 4;
 #endif
@@ -1118,7 +1118,7 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
         */
        if (le16_to_cpu(ex1->ee_len) + le16_to_cpu(ex2->ee_len) > EXT_MAX_LEN)
                return 0;
-#ifdef AGRESSIVE_TEST
+#ifdef AGGRESSIVE_TEST
        if (le16_to_cpu(ex1->ee_len) >= 4)
                return 0;
 #endif
@@ -1891,8 +1891,8 @@ void ext4_ext_init(struct super_block *sb)
        if (test_opt(sb, EXTENTS)) {
                printk("EXT4-fs: file extents enabled");
-#ifdef AGRESSIVE_TEST
+#ifdef AGGRESSIVE_TEST
-                printk(", agressive tests");
+                printk(", aggressive tests");
 #endif
 #ifdef CHECK_BINSEARCH
                printk(", check binsearch");
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index fbff4b9e122a..810b6d6474bf 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1147,102 +1147,37 @@ static int do_journal_get_write_access(handle_t *handle,
        return ext4_journal_get_write_access(handle, bh);
 }
-/*
- * The idea of this helper function is following:
- * if prepare_write has allocated some blocks, but not all of them, the
- * transaction must include the content of the newly allocated blocks.
- * This content is expected to be set to zeroes by block_prepare_write().
- * 2006/10/14  SAW
- */
-static int ext4_prepare_failure(struct file *file, struct page *page,
-                                unsigned from, unsigned to)
-{
-        struct address_space *mapping;
-        struct buffer_head *bh, *head, *next;
-        unsigned block_start, block_end;
-        unsigned blocksize;
-        int ret;
-        handle_t *handle = ext4_journal_current_handle();
-        mapping = page->mapping;
-        if (ext4_should_writeback_data(mapping->host)) {
-                /* optimization: no constraints about data */
-skip:
-                return ext4_journal_stop(handle);
-        }
-        head = page_buffers(page);
-        blocksize = head->b_size;
-        for (   bh = head, block_start = 0;
-                bh != head || !block_start;
-                block_start = block_end, bh = next)
-        {
-                next = bh->b_this_page;
-                block_end = block_start + blocksize;
-                if (block_end <= from)
-                        continue;
-                if (block_start >= to) {
-                        block_start = to;
-                        break;
-                }
-                if (!buffer_mapped(bh))
-                /* prepare_write failed on this bh */
-                        break;
-                if (ext4_should_journal_data(mapping->host)) {
-                        ret = do_journal_get_write_access(handle, bh);
-                        if (ret) {
-                                ext4_journal_stop(handle);
-                                return ret;
-                        }
-                }
-        /*
-         * block_start here becomes the first block where the current iteration
-         * of prepare_write failed.
-         */
-        }
-        if (block_start <= from)
-                goto skip;
-        /* commit allocated and zeroed buffers */
-        return mapping->a_ops->commit_write(file, page, from, block_start);
-}
 static int ext4_prepare_write(struct file *file, struct page *page,
                              unsigned from, unsigned to)
 {
        struct inode *inode = page->mapping->host;
-        int ret, ret2;
+        int ret, needed_blocks = ext4_writepage_trans_blocks(inode);
-        int needed_blocks = ext4_writepage_trans_blocks(inode);
        handle_t *handle;
        int retries = 0;
 retry:
        handle = ext4_journal_start(inode, needed_blocks);
-        if (IS_ERR(handle))
+        if (IS_ERR(handle)) {
-                return PTR_ERR(handle);
+                ret = PTR_ERR(handle);
+                goto out;
+        }
        if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
                ret = nobh_prepare_write(page, from, to, ext4_get_block);
        else
                ret = block_prepare_write(page, from, to, ext4_get_block);
        if (ret)
-                goto failure;
+                goto prepare_write_failed;
        if (ext4_should_journal_data(inode)) {
                ret = walk_page_buffers(handle, page_buffers(page),
                                from, to, NULL, do_journal_get_write_access);
-                if (ret)
-                        /* fatal error, just put the handle and return */
-                        ext4_journal_stop(handle);
        }
-        return ret;
+prepare_write_failed:
+        if (ret)
-failure:
+                ext4_journal_stop(handle);
-        ret2 = ext4_prepare_failure(file, page, from, to);
-        if (ret2 < 0)
-                return ret2;
        if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
                goto retry;
-        /* retry number exceeded, or other error like -EDQUOT */
+out:
        return ret;
 }
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index dc969c357aa1..e832e96095b3 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -475,8 +475,14 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
                         struct buffer_head *bh)
 {
        struct mb_cache_entry *ce = NULL;
+        int error = 0;
        ce = mb_cache_entry_get(ext4_xattr_cache, bh->b_bdev, bh->b_blocknr);
+        error = ext4_journal_get_write_access(handle, bh);
+        if (error)
+                goto out;
+        lock_buffer(bh);
        if (BHDR(bh)->h_refcount == cpu_to_le32(1)) {
                ea_bdebug(bh, "refcount now=0; freeing");
                if (ce)
@@ -485,21 +491,21 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
                get_bh(bh);
                ext4_forget(handle, 1, inode, bh, bh->b_blocknr);
        } else {
-                if (ext4_journal_get_write_access(handle, bh) == 0) {
+                BHDR(bh)->h_refcount = cpu_to_le32(
-                        lock_buffer(bh);
-                        BHDR(bh)->h_refcount = cpu_to_le32(
                                le32_to_cpu(BHDR(bh)->h_refcount) - 1);
-                        ext4_journal_dirty_metadata(handle, bh);
+                error = ext4_journal_dirty_metadata(handle, bh);
-                        if (IS_SYNC(inode))
+                if (IS_SYNC(inode))
-                                handle->h_sync = 1;
+                        handle->h_sync = 1;
-                        DQUOT_FREE_BLOCK(inode, 1);
+                DQUOT_FREE_BLOCK(inode, 1);
-                        unlock_buffer(bh);
+                ea_bdebug(bh, "refcount now=%d; releasing",
-                        ea_bdebug(bh, "refcount now=%d; releasing",
+                          le32_to_cpu(BHDR(bh)->h_refcount));
-                                  le32_to_cpu(BHDR(bh)->h_refcount));
-                }
                if (ce)
                        mb_cache_entry_release(ce);
        }
+        unlock_buffer(bh);
+out:
+        ext4_std_error(inode->i_sb, error);
+        return;
 }
 struct ext4_xattr_info {
@@ -675,7 +681,7 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
        struct buffer_head *new_bh = NULL;
        struct ext4_xattr_search *s = &bs->s;
        struct mb_cache_entry *ce = NULL;
-        int error;
+        int error = 0;
 #define header(x) ((struct ext4_xattr_header *)(x))
@@ -684,16 +690,17 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
        if (s->base) {
                ce = mb_cache_entry_get(ext4_xattr_cache, bs->bh->b_bdev,
                                        bs->bh->b_blocknr);
+                error = ext4_journal_get_write_access(handle, bs->bh);
+                if (error)
+                        goto cleanup;
+                lock_buffer(bs->bh);
                if (header(s->base)->h_refcount == cpu_to_le32(1)) {
                        if (ce) {
                                mb_cache_entry_free(ce);
                                ce = NULL;
                        }
                        ea_bdebug(bs->bh, "modifying in-place");
-                        error = ext4_journal_get_write_access(handle, bs->bh);
-                        if (error)
-                                goto cleanup;
-                        lock_buffer(bs->bh);
                        error = ext4_xattr_set_entry(i, s);
                        if (!error) {
                                if (!IS_LAST_ENTRY(s->first))
@@ -713,6 +720,8 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
                } else {
                        int offset = (char *)s->here - bs->bh->b_data;
+                        unlock_buffer(bs->bh);
+                        jbd2_journal_release_buffer(handle, bs->bh);
                        if (ce) {
                                mb_cache_entry_release(ce);
                                ce = NULL;
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 761073544217..9bfe607c892e 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -173,10 +173,12 @@ static ssize_t fat_direct_IO(int rw, struct kiocb *iocb,
                 *
                 * But we must fill the remaining area or hole by nul for
                 * updating ->mmu_private.
+                 *
+                 * Return 0, and fallback to normal buffered write.
                 */
                loff_t size = offset + iov_length(iov, nr_segs);
                if (MSDOS_I(inode)->mmu_private < size)
-                        return -EINVAL;
+                        return 0;
        }
        /*
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 406bf61ed510..8890eba1db52 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -195,7 +195,7 @@ static struct dentry_operations fuse_dentry_operations = {
        .d_revalidate   = fuse_dentry_revalidate,
 };
-static int valid_mode(int m)
+int fuse_valid_type(int m)
 {
        return S_ISREG(m) || S_ISDIR(m) || S_ISLNK(m) || S_ISCHR(m) ||
                S_ISBLK(m) || S_ISFIFO(m) || S_ISSOCK(m);
@@ -248,7 +248,8 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
        fuse_put_request(fc, req);
        /* Zero nodeid is same as -ENOENT, but with valid timeout */
        if (!err && outarg.nodeid &&
-            (invalid_nodeid(outarg.nodeid) || !valid_mode(outarg.attr.mode)))
+            (invalid_nodeid(outarg.nodeid) ||
+             !fuse_valid_type(outarg.attr.mode)))
                err = -EIO;
        if (!err && outarg.nodeid) {
                inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation,
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index b98b20de7405..68ae87cbafab 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -552,3 +552,8 @@ int fuse_ctl_add_conn(struct fuse_conn *fc);
 * Remove connection from control filesystem
 */
 void fuse_ctl_remove_conn(struct fuse_conn *fc);
+/**
+ * Is file type valid?
+ */
+int fuse_valid_type(int m);
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 5ab8e50e7808..608db81219a0 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -330,6 +330,8 @@ static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev)
                case OPT_ROOTMODE:
                        if (match_octal(&args[0], &value))
                                return 0;
+                        if (!fuse_valid_type(value))
+                                return 0;
                        d->rootmode = value;
                        d->rootmode_present = 1;
                        break;
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 6618c1190252..12accb08fe02 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -20,6 +20,7 @@
 #include <linux/list.h>
 #include <linux/lm_interface.h>
 #include <linux/wait.h>
+#include <linux/module.h>
 #include <linux/rwsem.h>
 #include <asm/uaccess.h>
@@ -953,9 +954,6 @@ static void drop_bh(struct gfs2_glock *gl, unsigned int ret)
                spin_unlock(&gl->gl_spin);
        }
-        if (glops->go_drop_bh)
-                glops->go_drop_bh(gl);
        spin_lock(&gl->gl_spin);
        gl->gl_req_gh = NULL;
        gl->gl_req_bh = NULL;
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 46af55355513..39c8ae23bd9c 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -245,7 +245,6 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags)
        if (ip && S_ISREG(ip->i_inode.i_mode)) {
                truncate_inode_pages(ip->i_inode.i_mapping, 0);
-                gfs2_assert_withdraw(GFS2_SB(&ip->i_inode), !ip->i_inode.i_mapping->nrpages);
                clear_bit(GIF_PAGED, &ip->i_flags);
        }
 }
@@ -459,6 +458,8 @@ const struct gfs2_glock_operations gfs2_inode_glops = {
 };
 const struct gfs2_glock_operations gfs2_rgrp_glops = {
+        .go_xmote_th = meta_go_sync,
+        .go_drop_th = meta_go_sync,
        .go_inval = meta_go_inval,
        .go_demote_ok = rgrp_go_demote_ok,
        .go_lock = rgrp_go_lock,
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 12c80fd28db5..49f0dbf40d86 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -104,7 +104,6 @@ struct gfs2_glock_operations {
        void (*go_xmote_th) (struct gfs2_glock *gl);
        void (*go_xmote_bh) (struct gfs2_glock *gl);
        void (*go_drop_th) (struct gfs2_glock *gl);
-        void (*go_drop_bh) (struct gfs2_glock *gl);
        void (*go_inval) (struct gfs2_glock *gl, int flags);
        int (*go_demote_ok) (struct gfs2_glock *gl);
        int (*go_lock) (struct gfs2_holder *gh);
@@ -416,7 +415,6 @@ struct gfs2_tune {
        unsigned int gt_stall_secs; /* Detects trouble! */
        unsigned int gt_complain_secs;
        unsigned int gt_reclaim_limit; /* Max num of glocks in reclaim list */
-        unsigned int gt_entries_per_readdir;
        unsigned int gt_statfs_quantum;
        unsigned int gt_statfs_slow;
 };
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 0d6831a40565..df0b8b3018b9 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -43,7 +43,8 @@ static int iget_test(struct inode *inode, void *opaque)
        struct gfs2_inode *ip = GFS2_I(inode);
        struct gfs2_inum_host *inum = opaque;
-        if (ip->i_num.no_addr == inum->no_addr)
+        if (ip->i_num.no_addr == inum->no_addr &&
+            inode->i_private != NULL)
                return 1;
        return 0;
@@ -61,13 +62,13 @@ static int iget_set(struct inode *inode, void *opaque)
 struct inode *gfs2_ilookup(struct super_block *sb, struct gfs2_inum_host *inum)
 {
-        return ilookup5(sb, (unsigned long)inum->no_formal_ino,
+        return ilookup5(sb, (unsigned long)inum->no_addr,
                        iget_test, inum);
 }
 static struct inode *gfs2_iget(struct super_block *sb, struct gfs2_inum_host *inum)
 {
-        return iget5_locked(sb, (unsigned long)inum->no_formal_ino,
+        return iget5_locked(sb, (unsigned long)inum->no_addr,
                     iget_test, iget_set, inum);
 }
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
index 56e33590b656..b3b7e8475359 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/ops_address.c
@@ -266,9 +266,11 @@ skip_lock:
 out:
        return error;
 out_unlock:
-        if (error == GLR_TRYFAILED)
-                error = AOP_TRUNCATED_PAGE;
        unlock_page(page);
+        if (error == GLR_TRYFAILED) {
+                error = AOP_TRUNCATED_PAGE;
+                yield();
+        }
        if (do_unlock)
                gfs2_holder_uninit(&gh);
        goto out;
@@ -364,6 +366,7 @@ static int gfs2_prepare_write(struct file *file, struct page *page,
                if (error == GLR_TRYFAILED) {
                        unlock_page(page);
                        error = AOP_TRUNCATED_PAGE;
+                        yield();
                }
                goto out_uninit;
        }
diff --git a/fs/gfs2/ops_export.c b/fs/gfs2/ops_export.c
index 1de05b63d43a..aad918337a46 100644
--- a/fs/gfs2/ops_export.c
+++ b/fs/gfs2/ops_export.c
@@ -38,14 +38,11 @@ static struct dentry *gfs2_decode_fh(struct super_block *sb,
        struct gfs2_fh_obj fh_obj;
        struct gfs2_inum_host *this, parent;
-        if (fh_type != fh_len)
-                return NULL;
        this            = &fh_obj.this;
        fh_obj.imode    = DT_UNKNOWN;
        memset(&parent, 0, sizeof(struct gfs2_inum));
-        switch (fh_type) {
+        switch (fh_len) {
        case GFS2_LARGE_FH_SIZE:
                parent.no_formal_ino = ((u64)be32_to_cpu(fh[4])) << 32;
                parent.no_formal_ino |= be32_to_cpu(fh[5]);
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index ee80b8a5e7bc..ee54cb667083 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -840,7 +840,7 @@ static struct super_block* get_gfs2_sb(const char *dev_name)
        }
        printk(KERN_WARNING "GFS2: Unrecognized block device or "
-               "mount point %s", dev_name);
+               "mount point %s\n", dev_name);
 free_nd:
        path_release(&nd);
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index d0db881b55d2..c186857e48a8 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -279,7 +279,7 @@ static int bh_get(struct gfs2_quota_data *qd)
                (bh->b_data + sizeof(struct gfs2_meta_header) +
                 offset * sizeof(struct gfs2_quota_change));
-        mutex_lock(&sdp->sd_quota_mutex);
+        mutex_unlock(&sdp->sd_quota_mutex);
        return 0;
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 70f424fcf1cd..4fdda974dc83 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -76,7 +76,6 @@ void gfs2_tune_init(struct gfs2_tune *gt)
        gt->gt_stall_secs = 600;
        gt->gt_complain_secs = 10;
        gt->gt_reclaim_limit = 5000;
-        gt->gt_entries_per_readdir = 32;
        gt->gt_statfs_quantum = 30;
        gt->gt_statfs_slow = 0;
 }
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index e965eb11d76f..fd301a910122 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -20,7 +20,6 @@
 #include "hostfs.h"
 #include "kern_util.h"
 #include "kern.h"
-#include "user_util.h"
 #include "init.h"
 struct hostfs_inode_info {
@@ -47,7 +46,7 @@ struct dentry_operations hostfs_dentry_ops = {
 };
 /* Changed in hostfs_args before the kernel starts running */
-static char *root_ino = "/";
+static char *root_ino = "";
 static int append = 0;
 #define HOSTFS_SUPER_MAGIC 0x00c0ffee
@@ -939,7 +938,7 @@ static const struct address_space_operations hostfs_link_aops = {
 static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent)
 {
        struct inode *root_inode;
-        char *name, *data = d;
+        char *host_root_path, *req_root = d;
        int err;
        sb->s_blocksize = 1024;
@@ -947,15 +946,17 @@ static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent)
        sb->s_magic = HOSTFS_SUPER_MAGIC;
        sb->s_op = &hostfs_sbops;
-        if((data == NULL) || (*data == '\0'))
+        /* NULL is printed as <NULL> by sprintf: avoid that. */
-                data = root_ino;
+        if (req_root == NULL)
+                req_root = "";
        err = -ENOMEM;
-        name = kmalloc(strlen(data) + 1, GFP_KERNEL);
+        host_root_path = kmalloc(strlen(root_ino) + 1
-        if(name == NULL)
+                                 + strlen(req_root) + 1, GFP_KERNEL);
+        if(host_root_path == NULL)
                goto out;
-        strcpy(name, data);
+        sprintf(host_root_path, "%s/%s", root_ino, req_root);
        root_inode = iget(sb, 0);
        if(root_inode == NULL)
@@ -965,7 +966,10 @@ static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent)
        if(err)
                goto out_put;
-        HOSTFS_I(root_inode)->host_filename = name;
+        HOSTFS_I(root_inode)->host_filename = host_root_path;
+        /* Avoid that in the error path, iput(root_inode) frees again
+         * host_root_path through hostfs_destroy_inode! */
+        host_root_path = NULL;
        err = -ENOMEM;
        sb->s_root = d_alloc_root(root_inode);
@@ -977,7 +981,7 @@ static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent)
                /* No iput in this case because the dput does that for us */
                dput(sb->s_root);
                sb->s_root = NULL;
-                goto out_free;
+                goto out;
        }
        return(0);
@@ -985,7 +989,7 @@ static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent)
 out_put:
        iput(root_inode);
 out_free:
-        kfree(name);
+        kfree(host_root_path);
 out:
        return(err);
 }
diff --git a/fs/jffs/Makefile b/fs/jffs/Makefile
deleted file mode 100644
index 9c1c0bb59696..000000000000
--- a/fs/jffs/Makefile
+++ /dev/null
@@ -1,11 +0,0 @@
-#
-# Makefile for the linux Journalling Flash FileSystem (JFFS) routines.
-#
-# $Id: Makefile,v 1.11 2001/09/25 20:59:41 dwmw2 Exp $
-#
-obj-$(CONFIG_JFFS_FS) += jffs.o
-jffs-y                          := jffs_fm.o intrep.o inode-v23.o
-jffs-$(CONFIG_JFFS_PROC_FS)     += jffs_proc.o
-jffs-objs                       := $(jffs-y)
diff --git a/fs/jffs/inode-v23.c b/fs/jffs/inode-v23.c
deleted file mode 100644
index 9602b925da08..000000000000
--- a/fs/jffs/inode-v23.c
+++ /dev/null
@@ -1,1847 +0,0 @@
-/*
- * JFFS -- Journalling Flash File System, Linux implementation.
- *
- * Copyright (C) 1999, 2000  Axis Communications AB.
- *
- * Created by Finn Hakansson <finn@axis.com>.
- *
- * This is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * $Id: inode-v23.c,v 1.70 2001/10/02 09:16:02 dwmw2 Exp $
- *
- * Ported to Linux 2.3.x and MTD:
- * Copyright (C) 2000  Alexander Larsson (alex@cendio.se), Cendio Systems AB
- *
- * Copyright 2000, 2001  Red Hat, Inc.
- */
-/* inode.c -- Contains the code that is called from the VFS.  */
-/* TODO-ALEX:
- * uid and gid are just 16 bit.
- * jffs_file_write reads from user-space pointers without xx_from_user
- * maybe other stuff do to.
- */
-#include <linux/time.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/types.h>
-#include <linux/errno.h>
-#include <linux/slab.h>
-#include <linux/jffs.h>
-#include <linux/fs.h>
-#include <linux/smp_lock.h>
-#include <linux/ioctl.h>
-#include <linux/stat.h>
-#include <linux/blkdev.h>
-#include <linux/quotaops.h>
-#include <linux/highmem.h>
-#include <linux/vfs.h>
-#include <linux/mutex.h>
-#include <asm/byteorder.h>
-#include <asm/uaccess.h>
-#include "jffs_fm.h"
-#include "intrep.h"
-#ifdef CONFIG_JFFS_PROC_FS
-#include "jffs_proc.h"
-#endif
-static int jffs_remove(struct inode *dir, struct dentry *dentry, int type);
-static const struct super_operations jffs_ops;
-static const struct file_operations jffs_file_operations;
-static const struct inode_operations jffs_file_inode_operations;
-static const struct file_operations jffs_dir_operations;
-static const struct inode_operations jffs_dir_inode_operations;
-static const struct address_space_operations jffs_address_operations;
-struct kmem_cache     *node_cache = NULL;
-struct kmem_cache     *fm_cache = NULL;
-/* Called by the VFS at mount time to initialize the whole file system.  */
-static int jffs_fill_super(struct super_block *sb, void *data, int silent)
-{
-        struct inode *root_inode;
-        struct jffs_control *c;
-        sb->s_flags |= MS_NODIRATIME;
-        D1(printk(KERN_NOTICE "JFFS: Trying to mount device %s.\n",
-                  sb->s_id));
-        if (MAJOR(sb->s_dev) != MTD_BLOCK_MAJOR) {
-                printk(KERN_WARNING "JFFS: Trying to mount a "
-                       "non-mtd device.\n");
-                return -EINVAL;
-        }
-        sb->s_blocksize = PAGE_CACHE_SIZE;
-        sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
-        sb->s_fs_info = (void *) 0;
-        sb->s_maxbytes = 0xFFFFFFFF;
-        /* Build the file system.  */
-        if (jffs_build_fs(sb) < 0) {
-                goto jffs_sb_err1;
-        }
-        /*
-         * set up enough so that we can read an inode
-         */
-        sb->s_magic = JFFS_MAGIC_SB_BITMASK;
-        sb->s_op = &jffs_ops;
-        root_inode = iget(sb, JFFS_MIN_INO);
-        if (!root_inode)
-                goto jffs_sb_err2;
-        /* Get the root directory of this file system.  */
-        if (!(sb->s_root = d_alloc_root(root_inode))) {
-                goto jffs_sb_err3;
-        }
-        c = (struct jffs_control *) sb->s_fs_info;
-#ifdef CONFIG_JFFS_PROC_FS
-        /* Set up the jffs proc file system.  */
-        if (jffs_register_jffs_proc_dir(MINOR(sb->s_dev), c) < 0) {
-                printk(KERN_WARNING "JFFS: Failed to initialize the JFFS "
-                        "proc file system for device %s.\n",
-                        sb->s_id);
-        }
-#endif
-        /* Set the Garbage Collection thresholds */
-        /* GC if free space goes below 5% of the total size */
-        c->gc_minfree_threshold = c->fmc->flash_size / 20;
-        if (c->gc_minfree_threshold < c->fmc->sector_size)
-                c->gc_minfree_threshold = c->fmc->sector_size;
-        /* GC if dirty space exceeds 33% of the total size. */
-        c->gc_maxdirty_threshold = c->fmc->flash_size / 3;
-        if (c->gc_maxdirty_threshold < c->fmc->sector_size)
-                c->gc_maxdirty_threshold = c->fmc->sector_size;
-        c->thread_pid = kernel_thread (jffs_garbage_collect_thread, 
-                                        (void *) c, 
-                                        CLONE_KERNEL);
-        D1(printk(KERN_NOTICE "JFFS: GC thread pid=%d.\n", (int) c->thread_pid));
-        D1(printk(KERN_NOTICE "JFFS: Successfully mounted device %s.\n",
-               sb->s_id));
-        return 0;
-jffs_sb_err3:
-        iput(root_inode);
-jffs_sb_err2:
-        jffs_cleanup_control((struct jffs_control *)sb->s_fs_info);
-jffs_sb_err1:
-        printk(KERN_WARNING "JFFS: Failed to mount device %s.\n",
-               sb->s_id);
-        return -EINVAL;
-}
-/* This function is called when the file system is umounted.  */
-static void
-jffs_put_super(struct super_block *sb)
-{
-        struct jffs_control *c = (struct jffs_control *) sb->s_fs_info;
-        D2(printk("jffs_put_super()\n"));
-#ifdef CONFIG_JFFS_PROC_FS
-        jffs_unregister_jffs_proc_dir(c);
-#endif
-        if (c->gc_task) {
-                D1(printk (KERN_NOTICE "jffs_put_super(): Telling gc thread to die.\n"));
-                send_sig(SIGKILL, c->gc_task, 1);
-        }
-        wait_for_completion(&c->gc_thread_comp);
-        D1(printk (KERN_NOTICE "jffs_put_super(): Successfully waited on thread.\n"));
-        jffs_cleanup_control((struct jffs_control *)sb->s_fs_info);
-        D1(printk(KERN_NOTICE "JFFS: Successfully unmounted device %s.\n",
-               sb->s_id));
-}
-/* This function is called when user commands like chmod, chgrp and
-   chown are executed. System calls like trunc() results in a call
-   to this function.  */
-static int
-jffs_setattr(struct dentry *dentry, struct iattr *iattr)
-{
-        struct inode *inode = dentry->d_inode;
-        struct jffs_raw_inode raw_inode;
-        struct jffs_control *c;
-        struct jffs_fmcontrol *fmc;
-        struct jffs_file *f;
-        struct jffs_node *new_node;
-        int update_all;
-        int res = 0;
-        int recoverable = 0;
-        lock_kernel();
-        if ((res = inode_change_ok(inode, iattr))) 
-                goto out;
-        c = (struct jffs_control *)inode->i_sb->s_fs_info;
-        fmc = c->fmc;
-        D3(printk (KERN_NOTICE "notify_change(): down biglock\n"));
-        mutex_lock(&fmc->biglock);
-        f = jffs_find_file(c, inode->i_ino);
-        ASSERT(if (!f) {
-                printk("jffs_setattr(): Invalid inode number: %lu\n",
-                       inode->i_ino);
-                D3(printk (KERN_NOTICE "notify_change(): up biglock\n"));
-                mutex_unlock(&fmc->biglock);
-                res = -EINVAL;
-                goto out;
-        });
-        D1(printk("***jffs_setattr(): file: \"%s\", ino: %u\n",
-                  f->name, f->ino));
-        update_all = iattr->ia_valid & ATTR_FORCE;
-        if ( (update_all || iattr->ia_valid & ATTR_SIZE)
-             && (iattr->ia_size + 128 < f->size) ) {
-                /* We're shrinking the file by more than 128 bytes.
-                   We'll be able to GC and recover this space, so
-                   allow it to go into the reserved space. */
-                recoverable = 1;
-        }
-        if (!(new_node = jffs_alloc_node())) {
-                D(printk("jffs_setattr(): Allocation failed!\n"));
-                D3(printk (KERN_NOTICE "notify_change(): up biglock\n"));
-                mutex_unlock(&fmc->biglock);
-                res = -ENOMEM;
-                goto out;
-        }
-        new_node->data_offset = 0;
-        new_node->removed_size = 0;
-        raw_inode.magic = JFFS_MAGIC_BITMASK;
-        raw_inode.ino = f->ino;
-        raw_inode.pino = f->pino;
-        raw_inode.mode = f->mode;
-        raw_inode.uid = f->uid;
-        raw_inode.gid = f->gid;
-        raw_inode.atime = f->atime;
-        raw_inode.mtime = f->mtime;
-        raw_inode.ctime = f->ctime;
-        raw_inode.dsize = 0;
-        raw_inode.offset = 0;
-        raw_inode.rsize = 0;
-        raw_inode.dsize = 0;
-        raw_inode.nsize = f->nsize;
-        raw_inode.nlink = f->nlink;
-        raw_inode.spare = 0;
-        raw_inode.rename = 0;
-        raw_inode.deleted = 0;
-        if (update_all || iattr->ia_valid & ATTR_MODE) {
-                raw_inode.mode = iattr->ia_mode;
-                inode->i_mode = iattr->ia_mode;
-        }
-        if (update_all || iattr->ia_valid & ATTR_UID) {
-                raw_inode.uid = iattr->ia_uid;
-                inode->i_uid = iattr->ia_uid;
-        }
-        if (update_all || iattr->ia_valid & ATTR_GID) {
-                raw_inode.gid = iattr->ia_gid;
-                inode->i_gid = iattr->ia_gid;
-        }
-        if (update_all || iattr->ia_valid & ATTR_SIZE) {
-                int len;
-                D1(printk("jffs_notify_change(): Changing size "
-                          "to %lu bytes!\n", (long)iattr->ia_size));
-                raw_inode.offset = iattr->ia_size;
-                /* Calculate how many bytes need to be removed from
-                   the end.  */
-                if (f->size < iattr->ia_size) {
-                        len = 0;
-                }
-                else {
-                        len = f->size - iattr->ia_size;
-                }
-                raw_inode.rsize = len;
-                /* The updated node will be a removal node, with
-                   base at the new size and size of the nbr of bytes
-                   to be removed.  */
-                new_node->data_offset = iattr->ia_size;
-                new_node->removed_size = len;
-                inode->i_size = iattr->ia_size;
-                inode->i_blocks = (inode->i_size + 511) >> 9;
-                if (len) {
-                        invalidate_mapping_pages(inode->i_mapping, 0, -1);
-                }
-                inode->i_ctime = CURRENT_TIME_SEC;
-                inode->i_mtime = inode->i_ctime;
-        }
-        if (update_all || iattr->ia_valid & ATTR_ATIME) {
-                raw_inode.atime = iattr->ia_atime.tv_sec;
-                inode->i_atime = iattr->ia_atime;
-        }
-        if (update_all || iattr->ia_valid & ATTR_MTIME) {
-                raw_inode.mtime = iattr->ia_mtime.tv_sec;
-                inode->i_mtime = iattr->ia_mtime;
-        }
-        if (update_all || iattr->ia_valid & ATTR_CTIME) {
-                raw_inode.ctime = iattr->ia_ctime.tv_sec;
-                inode->i_ctime = iattr->ia_ctime;
-        }
-        /* Write this node to the flash.  */
-        if ((res = jffs_write_node(c, new_node, &raw_inode, f->name, NULL, recoverable, f)) < 0) {
-                D(printk("jffs_notify_change(): The write failed!\n"));
-                jffs_free_node(new_node);
-                D3(printk (KERN_NOTICE "n_c(): up biglock\n"));
-                mutex_unlock(&c->fmc->biglock);
-                goto out;
-        }
-        jffs_insert_node(c, f, &raw_inode, NULL, new_node);
-        mark_inode_dirty(inode);
-        D3(printk (KERN_NOTICE "n_c(): up biglock\n"));
-        mutex_unlock(&c->fmc->biglock);
-out:
-        unlock_kernel();
-        return res;
-} /* jffs_notify_change()  */
-static struct inode *
-jffs_new_inode(const struct inode * dir, struct jffs_raw_inode *raw_inode,
-               int * err)
-{
-        struct super_block * sb;
-        struct inode * inode;
-        struct jffs_control *c;
-        struct jffs_file *f;
-        sb = dir->i_sb;
-        inode = new_inode(sb);
-        if (!inode) {
-                *err = -ENOMEM;
-                return NULL;
-        }
-        c = (struct jffs_control *)sb->s_fs_info;
-        inode->i_ino = raw_inode->ino;
-        inode->i_mode = raw_inode->mode;
-        inode->i_nlink = raw_inode->nlink;
-        inode->i_uid = raw_inode->uid;
-        inode->i_gid = raw_inode->gid;
-        inode->i_size = raw_inode->dsize;
-        inode->i_atime.tv_sec = raw_inode->atime;
-        inode->i_mtime.tv_sec = raw_inode->mtime;
-        inode->i_ctime.tv_sec = raw_inode->ctime;
-        inode->i_ctime.tv_nsec = 0;
-        inode->i_mtime.tv_nsec = 0;
-        inode->i_atime.tv_nsec = 0;
-        inode->i_blocks = (inode->i_size + 511) >> 9;
-        f = jffs_find_file(c, raw_inode->ino);
-        inode->i_private = (void *)f;
-        insert_inode_hash(inode);
-        return inode;
-}
-/* Get statistics of the file system.  */
-static int
-jffs_statfs(struct dentry *dentry, struct kstatfs *buf)
-{
-        struct jffs_control *c = (struct jffs_control *) dentry->d_sb->s_fs_info;
-        struct jffs_fmcontrol *fmc;
-        lock_kernel();
-        fmc = c->fmc;
-        D2(printk("jffs_statfs()\n"));
-        buf->f_type = JFFS_MAGIC_SB_BITMASK;
-        buf->f_bsize = PAGE_CACHE_SIZE;
-        buf->f_blocks = (fmc->flash_size / PAGE_CACHE_SIZE)
-                       - (fmc->min_free_size / PAGE_CACHE_SIZE);
-        buf->f_bfree = (jffs_free_size1(fmc) + jffs_free_size2(fmc) +
-                       fmc->dirty_size - fmc->min_free_size)
-                               >> PAGE_CACHE_SHIFT;
-        buf->f_bavail = buf->f_bfree;
-        /* Find out how many files there are in the filesystem.  */
-        buf->f_files = jffs_foreach_file(c, jffs_file_count);
-        buf->f_ffree = buf->f_bfree;
-        /* buf->f_fsid = 0; */
-        buf->f_namelen = JFFS_MAX_NAME_LEN;
-        unlock_kernel();
-        return 0;
-}
-/* Rename a file.  */
-static int
-jffs_rename(struct inode *old_dir, struct dentry *old_dentry,
-            struct inode *new_dir, struct dentry *new_dentry)
-{
-        struct jffs_raw_inode raw_inode;
-        struct jffs_control *c;
-        struct jffs_file *old_dir_f;
-        struct jffs_file *new_dir_f;
-        struct jffs_file *del_f;
-        struct jffs_file *f;
-        struct jffs_node *node;
-        struct inode *inode;
-        int result = 0;
-        __u32 rename_data = 0;
-        D2(printk("***jffs_rename()\n"));
-        D(printk("jffs_rename(): old_dir: 0x%p, old name: 0x%p, "
-                 "new_dir: 0x%p, new name: 0x%p\n",
-                 old_dir, old_dentry->d_name.name,
-                 new_dir, new_dentry->d_name.name));
-        lock_kernel();
-        c = (struct jffs_control *)old_dir->i_sb->s_fs_info;
-        ASSERT(if (!c) {
-                printk(KERN_ERR "jffs_rename(): The old_dir inode "
-                       "didn't have a reference to a jffs_file struct\n");
-                unlock_kernel();
-                return -EIO;
-        });
-        result = -ENOTDIR;
-        if (!(old_dir_f = old_dir->i_private)) {
-                D(printk("jffs_rename(): Old dir invalid.\n"));
-                goto jffs_rename_end;
-        }
-        /* Try to find the file to move.  */
-        result = -ENOENT;
-        if (!(f = jffs_find_child(old_dir_f, old_dentry->d_name.name,
-                                  old_dentry->d_name.len))) {
-                goto jffs_rename_end;
-        }
-        /* Find the new directory.  */
-        result = -ENOTDIR;
-        if (!(new_dir_f = new_dir->i_private)) {
-                D(printk("jffs_rename(): New dir invalid.\n"));
-                goto jffs_rename_end;
-        }
-        D3(printk (KERN_NOTICE "rename(): down biglock\n"));
-        mutex_lock(&c->fmc->biglock);
-        /* Create a node and initialize as much as needed.  */
-        result = -ENOMEM;
-        if (!(node = jffs_alloc_node())) {
-                D(printk("jffs_rename(): Allocation failed: node == 0\n"));
-                goto jffs_rename_end;
-        }
-        node->data_offset = 0;
-        node->removed_size = 0;
-        /* Initialize the raw inode.  */
-        raw_inode.magic = JFFS_MAGIC_BITMASK;
-        raw_inode.ino = f->ino;
-        raw_inode.pino = new_dir_f->ino;
-/*      raw_inode.version = f->highest_version + 1; */
-        raw_inode.mode = f->mode;
-        raw_inode.uid = current->fsuid;
-        raw_inode.gid = current->fsgid;
-#if 0
-        raw_inode.uid = f->uid;
-        raw_inode.gid = f->gid;
-#endif
-        raw_inode.atime = get_seconds();
-        raw_inode.mtime = raw_inode.atime;
-        raw_inode.ctime = f->ctime;
-        raw_inode.offset = 0;
-        raw_inode.dsize = 0;
-        raw_inode.rsize = 0;
-        raw_inode.nsize = new_dentry->d_name.len;
-        raw_inode.nlink = f->nlink;
-        raw_inode.spare = 0;
-        raw_inode.rename = 0;
-        raw_inode.deleted = 0;
-        /* See if there already exists a file with the same name as
-           new_name.  */
-        if ((del_f = jffs_find_child(new_dir_f, new_dentry->d_name.name,
-                                     new_dentry->d_name.len))) {
-                raw_inode.rename = 1;
-                raw_inode.dsize = sizeof(__u32);
-                rename_data = del_f->ino;
-        }
-        /* Write the new node to the flash memory.  */
-        if ((result = jffs_write_node(c, node, &raw_inode,
-                                      new_dentry->d_name.name,
-                                      (unsigned char*)&rename_data, 0, f)) < 0) {
-                D(printk("jffs_rename(): Failed to write node to flash.\n"));
-                jffs_free_node(node);
-                goto jffs_rename_end;
-        }
-        raw_inode.dsize = 0;
-        if (raw_inode.rename) {
-                /* The file with the same name must be deleted.  */
-                //FIXME deadlock                down(&c->fmc->gclock);
-                if ((result = jffs_remove(new_dir, new_dentry,
-                                          del_f->mode)) < 0) {
-                        /* This is really bad.  */
-                        printk(KERN_ERR "JFFS: An error occurred in "
-                               "rename().\n");
-                }
-                //              up(&c->fmc->gclock);
-        }
-        if (old_dir_f != new_dir_f) {
-                /* Remove the file from its old position in the
-                   filesystem tree.  */
-                jffs_unlink_file_from_tree(f);
-        }
-        /* Insert the new node into the file system.  */
-        if ((result = jffs_insert_node(c, f, &raw_inode,
-                                       new_dentry->d_name.name, node)) < 0) {
-                D(printk(KERN_ERR "jffs_rename(): jffs_insert_node() "
-                         "failed!\n"));
-        }
-        if (old_dir_f != new_dir_f) {
-                /* Insert the file to its new position in the
-                   file system.  */
-                jffs_insert_file_into_tree(f);
-        }
-        /* This is a kind of update of the inode we're about to make
-           here.  This is what they do in ext2fs.  Kind of.  */
-        if ((inode = iget(new_dir->i_sb, f->ino))) {
-                inode->i_ctime = CURRENT_TIME_SEC;
-                mark_inode_dirty(inode);
-                iput(inode);
-        }
-jffs_rename_end:
-        D3(printk (KERN_NOTICE "rename(): up biglock\n"));
-        mutex_unlock(&c->fmc->biglock);
-        unlock_kernel();
-        return result;
-} /* jffs_rename()  */
-/* Read the contents of a directory.  Used by programs like `ls'
-   for instance.  */
-static int
-jffs_readdir(struct file *filp, void *dirent, filldir_t filldir)
-{
-        struct jffs_file *f;
-        struct dentry *dentry = filp->f_path.dentry;
-        struct inode *inode = dentry->d_inode;
-        struct jffs_control *c = (struct jffs_control *)inode->i_sb->s_fs_info;
-        int j;
-        int ddino;
-        lock_kernel();
-        D3(printk (KERN_NOTICE "readdir(): down biglock\n"));
-        mutex_lock(&c->fmc->biglock);
-        D2(printk("jffs_readdir(): inode: 0x%p, filp: 0x%p\n", inode, filp));
-        if (filp->f_pos == 0) {
-                D3(printk("jffs_readdir(): \".\" %lu\n", inode->i_ino));
-                if (filldir(dirent, ".", 1, filp->f_pos, inode->i_ino, DT_DIR) < 0) {
-                        D3(printk (KERN_NOTICE "readdir(): up biglock\n"));
-                        mutex_unlock(&c->fmc->biglock);
-                        unlock_kernel();
-                        return 0;
-                }
-                filp->f_pos = 1;
-        }
-        if (filp->f_pos == 1) {
-                if (inode->i_ino == JFFS_MIN_INO) {
-                        ddino = JFFS_MIN_INO;
-                }
-                else {
-                        ddino = ((struct jffs_file *)
-                                 inode->i_private)->pino;
-                }
-                D3(printk("jffs_readdir(): \"..\" %u\n", ddino));
-                if (filldir(dirent, "..", 2, filp->f_pos, ddino, DT_DIR) < 0) {
-                        D3(printk (KERN_NOTICE "readdir(): up biglock\n"));
-                        mutex_unlock(&c->fmc->biglock);
-                        unlock_kernel();
-                        return 0;
-                }
-                filp->f_pos++;
-        }
-        f = ((struct jffs_file *)inode->i_private)->children;
-        j = 2;
-        while(f && (f->deleted || j++ < filp->f_pos )) {
-                f = f->sibling_next;
-        }
-        while (f) {
-                D3(printk("jffs_readdir(): \"%s\" ino: %u\n",
-                          (f->name ? f->name : ""), f->ino));
-                if (filldir(dirent, f->name, f->nsize,
-                            filp->f_pos , f->ino, DT_UNKNOWN) < 0) {
-                        D3(printk (KERN_NOTICE "readdir(): up biglock\n"));
-                        mutex_unlock(&c->fmc->biglock);
-                        unlock_kernel();
-                        return 0;
-                }
-                filp->f_pos++;
-                do {
-                        f = f->sibling_next;
-                } while(f && f->deleted);
-        }
-        D3(printk (KERN_NOTICE "readdir(): up biglock\n"));
-        mutex_unlock(&c->fmc->biglock);
-        unlock_kernel();
-        return filp->f_pos;
-} /* jffs_readdir()  */
-/* Find a file in a directory. If the file exists, return its
-   corresponding dentry.  */
-static struct dentry *
-jffs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
-{
-        struct jffs_file *d;
-        struct jffs_file *f;
-        struct jffs_control *c = (struct jffs_control *)dir->i_sb->s_fs_info;
-        int len;
-        int r = 0;
-        const char *name;
-        struct inode *inode = NULL;
-        len = dentry->d_name.len;
-        name = dentry->d_name.name;
-        lock_kernel();
-        D3({
-                char *s = kmalloc(len + 1, GFP_KERNEL);
-                memcpy(s, name, len);
-                s[len] = '\0';
-                printk("jffs_lookup(): dir: 0x%p, name: \"%s\"\n", dir, s);
-                kfree(s);
-        });
-        D3(printk (KERN_NOTICE "lookup(): down biglock\n"));
-        mutex_lock(&c->fmc->biglock);
-        r = -ENAMETOOLONG;
-        if (len > JFFS_MAX_NAME_LEN) {
-                goto jffs_lookup_end;
-        }
-        r = -EACCES;
-        if (!(d = (struct jffs_file *)dir->i_private)) {
-                D(printk("jffs_lookup(): No such inode! (%lu)\n",
-                         dir->i_ino));
-                goto jffs_lookup_end;
-        }
-        /* Get the corresponding inode to the file.  */
-        /* iget calls jffs_read_inode, so we need to drop the biglock
-           before calling iget.  Unfortunately, the GC has a tendency
-           to sneak in here, because iget sometimes calls schedule ().
-        */
-        if ((len == 1) && (name[0] == '.')) {
-                D3(printk (KERN_NOTICE "lookup(): up biglock\n"));
-                mutex_unlock(&c->fmc->biglock);
-                if (!(inode = iget(dir->i_sb, d->ino))) {
-                        D(printk("jffs_lookup(): . iget() ==> NULL\n"));
-                        goto jffs_lookup_end_no_biglock;
-                }
-                D3(printk (KERN_NOTICE "lookup(): down biglock\n"));
-                mutex_lock(&c->fmc->biglock);
-        } else if ((len == 2) && (name[0] == '.') && (name[1] == '.')) {
-                D3(printk (KERN_NOTICE "lookup(): up biglock\n"));
-                mutex_unlock(&c->fmc->biglock);
-                if (!(inode = iget(dir->i_sb, d->pino))) {
-                        D(printk("jffs_lookup(): .. iget() ==> NULL\n"));
-                        goto jffs_lookup_end_no_biglock;
-                }
-                D3(printk (KERN_NOTICE "lookup(): down biglock\n"));
-                mutex_lock(&c->fmc->biglock);
-        } else if ((f = jffs_find_child(d, name, len))) {
-                D3(printk (KERN_NOTICE "lookup(): up biglock\n"));
-                mutex_unlock(&c->fmc->biglock);
-                if (!(inode = iget(dir->i_sb, f->ino))) {
-                        D(printk("jffs_lookup(): iget() ==> NULL\n"));
-                        goto jffs_lookup_end_no_biglock;
-                }
-                D3(printk (KERN_NOTICE "lookup(): down biglock\n"));
-                mutex_lock(&c->fmc->biglock);
-        } else {
-                D3(printk("jffs_lookup(): Couldn't find the file. "
-                          "f = 0x%p, name = \"%s\", d = 0x%p, d->ino = %u\n",
-                          f, name, d, d->ino));
-                inode = NULL;
-        }
-        d_add(dentry, inode);
-        D3(printk (KERN_NOTICE "lookup(): up biglock\n"));
-        mutex_unlock(&c->fmc->biglock);
-        unlock_kernel();
-        return NULL;
-jffs_lookup_end:
-        D3(printk (KERN_NOTICE "lookup(): up biglock\n"));
-        mutex_unlock(&c->fmc->biglock);
-jffs_lookup_end_no_biglock:
-        unlock_kernel();
-        return ERR_PTR(r);
-} /* jffs_lookup()  */
-/* Try to read a page of data from a file.  */
-static int
-jffs_do_readpage_nolock(struct file *file, struct page *page)
-{
-        void *buf;
-        unsigned long read_len;
-        int result;
-        struct inode *inode = (struct inode*)page->mapping->host;
-        struct jffs_file *f = (struct jffs_file *)inode->i_private;
-        struct jffs_control *c = (struct jffs_control *)inode->i_sb->s_fs_info;
-        int r;
-        loff_t offset;
-        D2(printk("***jffs_readpage(): file = \"%s\", page->index = %lu\n",
-                  (f->name ? f->name : ""), (long)page->index));
-        get_page(page);
-        /* Don't SetPageLocked(page), should be locked already */
-        ClearPageUptodate(page);
-        ClearPageError(page);
-        D3(printk (KERN_NOTICE "readpage(): down biglock\n"));
-        mutex_lock(&c->fmc->biglock);
-        read_len = 0;
-        result = 0;
-        offset = page_offset(page);
-        kmap(page);
-        buf = page_address(page);
-        if (offset < inode->i_size) {
-                read_len = min_t(long, inode->i_size - offset, PAGE_SIZE);
-                r = jffs_read_data(f, buf, offset, read_len);
-                if (r != read_len) {
-                        result = -EIO;
-                        D(
-                                printk("***jffs_readpage(): Read error! "
-                                       "Wanted to read %lu bytes but only "
-                                       "read %d bytes.\n", read_len, r);
-                          );
-                }
-        }
-        /* This handles the case of partial or no read in above */
-        if(read_len < PAGE_SIZE)
-                memset(buf + read_len, 0, PAGE_SIZE - read_len);
-        flush_dcache_page(page);
-        kunmap(page);
-        D3(printk (KERN_NOTICE "readpage(): up biglock\n"));
-        mutex_unlock(&c->fmc->biglock);
-        if (result) {
-                SetPageError(page);
-        }else {
-                SetPageUptodate(page);          
-        }
-        page_cache_release(page);
-        D3(printk("jffs_readpage(): Leaving...\n"));
-        return result;
-} /* jffs_do_readpage_nolock()  */
-static int jffs_readpage(struct file *file, struct page *page)
-{
-        int ret = jffs_do_readpage_nolock(file, page);
-        unlock_page(page);
-        return ret;
-}
-/* Create a new directory.  */
-static int
-jffs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
-{
-        struct jffs_raw_inode raw_inode;
-        struct jffs_control *c;
-        struct jffs_node *node;
-        struct jffs_file *dir_f;
-        struct inode *inode;
-        int dir_mode;
-        int result = 0;
-        int err;
-        D1({
-                int len = dentry->d_name.len;
-                char *_name = kmalloc(len + 1, GFP_KERNEL);
-                memcpy(_name, dentry->d_name.name, len);
-                _name[len] = '\0';
-                printk("***jffs_mkdir(): dir = 0x%p, name = \"%s\", "
-                       "len = %d, mode = 0x%08x\n", dir, _name, len, mode);
-                kfree(_name);
-        });
-        lock_kernel();
-        dir_f = dir->i_private;
-        ASSERT(if (!dir_f) {
-                printk(KERN_ERR "jffs_mkdir(): No reference to a "
-                       "jffs_file struct in inode.\n");
-                unlock_kernel();
-                return -EIO;
-        });
-        c = dir_f->c;
-        D3(printk (KERN_NOTICE "mkdir(): down biglock\n"));
-        mutex_lock(&c->fmc->biglock);
-        dir_mode = S_IFDIR | (mode & (S_IRWXUGO|S_ISVTX)
-                              & ~current->fs->umask);
-        if (dir->i_mode & S_ISGID) {
-                dir_mode |= S_ISGID;
-        }
-        /* Create a node and initialize it as much as needed.  */
-        if (!(node = jffs_alloc_node())) {
-                D(printk("jffs_mkdir(): Allocation failed: node == 0\n"));
-                result = -ENOMEM;
-                goto jffs_mkdir_end;
-        }
-        node->data_offset = 0;
-        node->removed_size = 0;
-        /* Initialize the raw inode.  */
-        raw_inode.magic = JFFS_MAGIC_BITMASK;
-        raw_inode.ino = c->next_ino++;
-        raw_inode.pino = dir_f->ino;
-        raw_inode.version = 1;
-        raw_inode.mode = dir_mode;
-        raw_inode.uid = current->fsuid;
-        raw_inode.gid = (dir->i_mode & S_ISGID) ? dir->i_gid : current->fsgid;
-        /*      raw_inode.gid = current->fsgid; */
-        raw_inode.atime = get_seconds();
-        raw_inode.mtime = raw_inode.atime;
-        raw_inode.ctime = raw_inode.atime;
-        raw_inode.offset = 0;
-        raw_inode.dsize = 0;
-        raw_inode.rsize = 0;
-        raw_inode.nsize = dentry->d_name.len;
-        raw_inode.nlink = 1;
-        raw_inode.spare = 0;
-        raw_inode.rename = 0;
-        raw_inode.deleted = 0;
-        /* Write the new node to the flash.  */
-        if ((result = jffs_write_node(c, node, &raw_inode,
-                                     dentry->d_name.name, NULL, 0, NULL)) < 0) {
-                D(printk("jffs_mkdir(): jffs_write_node() failed.\n"));
-                jffs_free_node(node);
-                goto jffs_mkdir_end;
-        }
-        /* Insert the new node into the file system.  */
-        if ((result = jffs_insert_node(c, NULL, &raw_inode, dentry->d_name.name,
-                                       node)) < 0) {
-                goto jffs_mkdir_end;
-        }
-        inode = jffs_new_inode(dir, &raw_inode, &err);
-        if (inode == NULL) {
-                result = err;
-                goto jffs_mkdir_end;
-        }
-        inode->i_op = &jffs_dir_inode_operations;
-        inode->i_fop = &jffs_dir_operations;
-        mark_inode_dirty(dir);
-        d_instantiate(dentry, inode);
-        result = 0;
-jffs_mkdir_end:
-        D3(printk (KERN_NOTICE "mkdir(): up biglock\n"));
-        mutex_unlock(&c->fmc->biglock);
-        unlock_kernel();
-        return result;
-} /* jffs_mkdir()  */
-/* Remove a directory.  */
-static int
-jffs_rmdir(struct inode *dir, struct dentry *dentry)
-{
-        struct jffs_control *c = (struct jffs_control *)dir->i_sb->s_fs_info;
-        int ret;
-        D3(printk("***jffs_rmdir()\n"));
-        D3(printk (KERN_NOTICE "rmdir(): down biglock\n"));
-        lock_kernel();
-        mutex_lock(&c->fmc->biglock);
-        ret = jffs_remove(dir, dentry, S_IFDIR);
-        D3(printk (KERN_NOTICE "rmdir(): up biglock\n"));
-        mutex_unlock(&c->fmc->biglock);
-        unlock_kernel();
-        return ret;
-}
-/* Remove any kind of file except for directories.  */
-static int
-jffs_unlink(struct inode *dir, struct dentry *dentry)
-{
-        struct jffs_control *c = (struct jffs_control *)dir->i_sb->s_fs_info;
-        int ret; 
-        lock_kernel();
-        D3(printk("***jffs_unlink()\n"));
-        D3(printk (KERN_NOTICE "unlink(): down biglock\n"));
-        mutex_lock(&c->fmc->biglock);
-        ret = jffs_remove(dir, dentry, 0);
-        D3(printk (KERN_NOTICE "unlink(): up biglock\n"));
-        mutex_unlock(&c->fmc->biglock);
-        unlock_kernel();
-        return ret;
-}
-/* Remove a JFFS entry, i.e. plain files, directories, etc.  Here we
-   shouldn't test for free space on the device.  */
-static int
-jffs_remove(struct inode *dir, struct dentry *dentry, int type)
-{
-        struct jffs_raw_inode raw_inode;
-        struct jffs_control *c;
-        struct jffs_file *dir_f; /* The file-to-remove's parent.  */
-        struct jffs_file *del_f; /* The file to remove.  */
-        struct jffs_node *del_node;
-        struct inode *inode = NULL;
-        int result = 0;
-        D1({
-                int len = dentry->d_name.len;
-                const char *name = dentry->d_name.name;
-                char *_name = kmalloc(len + 1, GFP_KERNEL);
-                memcpy(_name, name, len);
-                _name[len] = '\0';
-                printk("***jffs_remove(): file = \"%s\", ino = %ld\n", _name, dentry->d_inode->i_ino);
-                kfree(_name);
-        });
-        dir_f = dir->i_private;
-        c = dir_f->c;
-        result = -ENOENT;
-        if (!(del_f = jffs_find_child(dir_f, dentry->d_name.name,
-                                      dentry->d_name.len))) {
-                D(printk("jffs_remove(): jffs_find_child() failed.\n"));
-                goto jffs_remove_end;
-        }
-        if (S_ISDIR(type)) {
-                struct jffs_file *child = del_f->children;
-                while(child) {
-                        if( !child->deleted ) {
-                                result = -ENOTEMPTY;
-                                goto jffs_remove_end;
-                        }
-                        child = child->sibling_next;
-                }
-        }            
-        else if (S_ISDIR(del_f->mode)) {
-                D(printk("jffs_remove(): node is a directory "
-                         "but it shouldn't be.\n"));
-                result = -EPERM;
-                goto jffs_remove_end;
-        }
-        inode = dentry->d_inode;
-        result = -EIO;
-        if (del_f->ino != inode->i_ino)
-                goto jffs_remove_end;
-        if (!inode->i_nlink) {
-                printk("Deleting nonexistent file inode: %lu, nlink: %d\n",
-                       inode->i_ino, inode->i_nlink);
-                inode->i_nlink=1;
-        }
-        /* Create a node for the deletion.  */
-        result = -ENOMEM;
-        if (!(del_node = jffs_alloc_node())) {
-                D(printk("jffs_remove(): Allocation failed!\n"));
-                goto jffs_remove_end;
-        }
-        del_node->data_offset = 0;
-        del_node->removed_size = 0;
-        /* Initialize the raw inode.  */
-        raw_inode.magic = JFFS_MAGIC_BITMASK;
-        raw_inode.ino = del_f->ino;
-        raw_inode.pino = del_f->pino;
-/*      raw_inode.version = del_f->highest_version + 1; */
-        raw_inode.mode = del_f->mode;
-        raw_inode.uid = current->fsuid;
-        raw_inode.gid = current->fsgid;
-        raw_inode.atime = get_seconds();
-        raw_inode.mtime = del_f->mtime;
-        raw_inode.ctime = raw_inode.atime;
-        raw_inode.offset = 0;
-        raw_inode.dsize = 0;
-        raw_inode.rsize = 0;
-        raw_inode.nsize = 0;
-        raw_inode.nlink = del_f->nlink;
-        raw_inode.spare = 0;
-        raw_inode.rename = 0;
-        raw_inode.deleted = 1;
-        /* Write the new node to the flash memory.  */
-        if (jffs_write_node(c, del_node, &raw_inode, NULL, NULL, 1, del_f) < 0) {
-                jffs_free_node(del_node);
-                result = -EIO;
-                goto jffs_remove_end;
-        }
-        /* Update the file.  This operation will make the file disappear
-           from the in-memory file system structures.  */
-        jffs_insert_node(c, del_f, &raw_inode, NULL, del_node);
-        dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
-        mark_inode_dirty(dir);
-        inode->i_ctime = dir->i_ctime;
-        inode_dec_link_count(inode);
-        d_delete(dentry);       /* This also frees the inode */
-        result = 0;
-jffs_remove_end:
-        return result;
-} /* jffs_remove()  */
-static int
-jffs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
-{
-        struct jffs_raw_inode raw_inode;
-        struct jffs_file *dir_f;
-        struct jffs_node *node = NULL;
-        struct jffs_control *c;
-        struct inode *inode;
-        int result = 0;
-        u16 data = old_encode_dev(rdev);
-        int err;
-        D1(printk("***jffs_mknod()\n"));
-        if (!old_valid_dev(rdev))
-                return -EINVAL;
-        lock_kernel();
-        dir_f = dir->i_private;
-        c = dir_f->c;
-        D3(printk (KERN_NOTICE "mknod(): down biglock\n"));
-        mutex_lock(&c->fmc->biglock);
-        /* Create and initialize a new node.  */
-        if (!(node = jffs_alloc_node())) {
-                D(printk("jffs_mknod(): Allocation failed!\n"));
-                result = -ENOMEM;
-                goto jffs_mknod_err;
-        }
-        node->data_offset = 0;
-        node->removed_size = 0;
-        /* Initialize the raw inode.  */
-        raw_inode.magic = JFFS_MAGIC_BITMASK;
-        raw_inode.ino = c->next_ino++;
-        raw_inode.pino = dir_f->ino;
-        raw_inode.version = 1;
-        raw_inode.mode = mode;
-        raw_inode.uid = current->fsuid;
-        raw_inode.gid = (dir->i_mode & S_ISGID) ? dir->i_gid : current->fsgid;
-        /*      raw_inode.gid = current->fsgid; */
-        raw_inode.atime = get_seconds();
-        raw_inode.mtime = raw_inode.atime;
-        raw_inode.ctime = raw_inode.atime;
-        raw_inode.offset = 0;
-        raw_inode.dsize = 2;
-        raw_inode.rsize = 0;
-        raw_inode.nsize = dentry->d_name.len;
-        raw_inode.nlink = 1;
-        raw_inode.spare = 0;
-        raw_inode.rename = 0;
-        raw_inode.deleted = 0;
-        /* Write the new node to the flash.  */
-        if ((err = jffs_write_node(c, node, &raw_inode, dentry->d_name.name,
-                                   (unsigned char *)&data, 0, NULL)) < 0) {
-                D(printk("jffs_mknod(): jffs_write_node() failed.\n"));
-                result = err;
-                goto jffs_mknod_err;
-        }
-        /* Insert the new node into the file system.  */
-        if ((err = jffs_insert_node(c, NULL, &raw_inode, dentry->d_name.name,
-                                    node)) < 0) {
-                result = err;
-                goto jffs_mknod_end;
-        }
-        inode = jffs_new_inode(dir, &raw_inode, &err);
-        if (inode == NULL) {
-                result = err;
-                goto jffs_mknod_end;
-        }
-        init_special_inode(inode, mode, rdev);
-        d_instantiate(dentry, inode);
-        goto jffs_mknod_end;
-jffs_mknod_err:
-        if (node) {
-                jffs_free_node(node);
-        }
-jffs_mknod_end:
-        D3(printk (KERN_NOTICE "mknod(): up biglock\n"));
-        mutex_unlock(&c->fmc->biglock);
-        unlock_kernel();
-        return result;
-} /* jffs_mknod()  */
-static int
-jffs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
-{
-        struct jffs_raw_inode raw_inode;
-        struct jffs_control *c;
-        struct jffs_file *dir_f;
-        struct jffs_node *node;
-        struct inode *inode;
-        int symname_len = strlen(symname);
-        int err;
-        lock_kernel();
-        D1({
-                int len = dentry->d_name.len; 
-                char *_name = kmalloc(len + 1, GFP_KERNEL);
-                char *_symname = kmalloc(symname_len + 1, GFP_KERNEL);
-                memcpy(_name, dentry->d_name.name, len);
-                _name[len] = '\0';
-                memcpy(_symname, symname, symname_len);
-                _symname[symname_len] = '\0';
-                printk("***jffs_symlink(): dir = 0x%p, "
-                       "dentry->dname.name = \"%s\", "
-                       "symname = \"%s\"\n", dir, _name, _symname);
-                kfree(_name);
-                kfree(_symname);
-        });
-        dir_f = dir->i_private;
-        ASSERT(if (!dir_f) {
-                printk(KERN_ERR "jffs_symlink(): No reference to a "
-                       "jffs_file struct in inode.\n");
-                unlock_kernel();
-                return -EIO;
-        });
-        c = dir_f->c;
-        /* Create a node and initialize it as much as needed.  */
-        if (!(node = jffs_alloc_node())) {
-                D(printk("jffs_symlink(): Allocation failed: node = NULL\n"));
-                unlock_kernel();
-                return -ENOMEM;
-        }
-        D3(printk (KERN_NOTICE "symlink(): down biglock\n"));
-        mutex_lock(&c->fmc->biglock);
-        node->data_offset = 0;
-        node->removed_size = 0;
-        /* Initialize the raw inode.  */
-        raw_inode.magic = JFFS_MAGIC_BITMASK;
-        raw_inode.ino = c->next_ino++;
-        raw_inode.pino = dir_f->ino;
-        raw_inode.version = 1;
-        raw_inode.mode = S_IFLNK | S_IRWXUGO;
-        raw_inode.uid = current->fsuid;
-        raw_inode.gid = (dir->i_mode & S_ISGID) ? dir->i_gid : current->fsgid;
-        raw_inode.atime = get_seconds();
-        raw_inode.mtime = raw_inode.atime;
-        raw_inode.ctime = raw_inode.atime;
-        raw_inode.offset = 0;
-        raw_inode.dsize = symname_len;
-        raw_inode.rsize = 0;
-        raw_inode.nsize = dentry->d_name.len;
-        raw_inode.nlink = 1;
-        raw_inode.spare = 0;
-        raw_inode.rename = 0;
-        raw_inode.deleted = 0;
-        /* Write the new node to the flash.  */
-        if ((err = jffs_write_node(c, node, &raw_inode, dentry->d_name.name,
-                                   (const unsigned char *)symname, 0, NULL)) < 0) {
-                D(printk("jffs_symlink(): jffs_write_node() failed.\n"));
-                jffs_free_node(node);
-                goto jffs_symlink_end;
-        }
-        /* Insert the new node into the file system.  */
-        if ((err = jffs_insert_node(c, NULL, &raw_inode, dentry->d_name.name,
-                                    node)) < 0) {
-                goto jffs_symlink_end;
-        }
-        inode = jffs_new_inode(dir, &raw_inode, &err);
-        if (inode == NULL) {
-                goto jffs_symlink_end;
-        }
-        err = 0;
-        inode->i_op = &page_symlink_inode_operations;
-        inode->i_mapping->a_ops = &jffs_address_operations;
-        d_instantiate(dentry, inode);
- jffs_symlink_end:
-        D3(printk (KERN_NOTICE "symlink(): up biglock\n"));
-        mutex_unlock(&c->fmc->biglock);
-        unlock_kernel();
-        return err;
-} /* jffs_symlink()  */
-/* Create an inode inside a JFFS directory (dir) and return it.
- *
- * By the time this is called, we already have created
- * the directory cache entry for the new file, but it
- * is so far negative - it has no inode.
- *
- * If the create succeeds, we fill in the inode information
- * with d_instantiate().
- */
-static int
-jffs_create(struct inode *dir, struct dentry *dentry, int mode,
-                struct nameidata *nd)
-{
-        struct jffs_raw_inode raw_inode;
-        struct jffs_control *c;
-        struct jffs_node *node;
-        struct jffs_file *dir_f; /* JFFS representation of the directory.  */
-        struct inode *inode;
-        int err;
-        lock_kernel();
-        D1({
-                int len = dentry->d_name.len;
-                char *s = kmalloc(len + 1, GFP_KERNEL);
-                memcpy(s, dentry->d_name.name, len);
-                s[len] = '\0';
-                printk("jffs_create(): dir: 0x%p, name: \"%s\"\n", dir, s);
-                kfree(s);
-        });
-        dir_f = dir->i_private;
-        ASSERT(if (!dir_f) {
-                printk(KERN_ERR "jffs_create(): No reference to a "
-                       "jffs_file struct in inode.\n");
-                unlock_kernel();
-                return -EIO;
-        });
-        c = dir_f->c;
-        /* Create a node and initialize as much as needed.  */
-        if (!(node = jffs_alloc_node())) {
-                D(printk("jffs_create(): Allocation failed: node == 0\n"));
-                unlock_kernel();
-                return -ENOMEM;
-        }
-        D3(printk (KERN_NOTICE "create(): down biglock\n"));
-        mutex_lock(&c->fmc->biglock);
-        node->data_offset = 0;
-        node->removed_size = 0;
-        /* Initialize the raw inode.  */
-        raw_inode.magic = JFFS_MAGIC_BITMASK;
-        raw_inode.ino = c->next_ino++;
-        raw_inode.pino = dir_f->ino;
-        raw_inode.version = 1;
-        raw_inode.mode = mode;
-        raw_inode.uid = current->fsuid;
-        raw_inode.gid = (dir->i_mode & S_ISGID) ? dir->i_gid : current->fsgid;
-        raw_inode.atime = get_seconds();
-        raw_inode.mtime = raw_inode.atime;
-        raw_inode.ctime = raw_inode.atime;
-        raw_inode.offset = 0;
-        raw_inode.dsize = 0;
-        raw_inode.rsize = 0;
-        raw_inode.nsize = dentry->d_name.len;
-        raw_inode.nlink = 1;
-        raw_inode.spare = 0;
-        raw_inode.rename = 0;
-        raw_inode.deleted = 0;
-        /* Write the new node to the flash.  */
-        if ((err = jffs_write_node(c, node, &raw_inode,
-                                   dentry->d_name.name, NULL, 0, NULL)) < 0) {
-                D(printk("jffs_create(): jffs_write_node() failed.\n"));
-                jffs_free_node(node);
-                goto jffs_create_end;
-        }
-        /* Insert the new node into the file system.  */
-        if ((err = jffs_insert_node(c, NULL, &raw_inode, dentry->d_name.name,
-                                    node)) < 0) {
-                goto jffs_create_end;
-        }
-        /* Initialize an inode.  */
-        inode = jffs_new_inode(dir, &raw_inode, &err);
-        if (inode == NULL) {
-                goto jffs_create_end;
-        }
-        err = 0;
-        inode->i_op = &jffs_file_inode_operations;
-        inode->i_fop = &jffs_file_operations;
-        inode->i_mapping->a_ops = &jffs_address_operations;
-        inode->i_mapping->nrpages = 0;
-        d_instantiate(dentry, inode);
- jffs_create_end:
-        D3(printk (KERN_NOTICE "create(): up biglock\n"));
-        mutex_unlock(&c->fmc->biglock);
-        unlock_kernel();
-        return err;
-} /* jffs_create()  */
-/* Write, append or rewrite data to an existing file.  */
-static ssize_t
-jffs_file_write(struct file *filp, const char *buf, size_t count,
-                loff_t *ppos)
-{
-        struct jffs_raw_inode raw_inode;
-        struct jffs_control *c;
-        struct jffs_file *f;
-        struct jffs_node *node;
-        struct dentry *dentry = filp->f_path.dentry;
-        struct inode *inode = dentry->d_inode;
-        int recoverable = 0;
-        size_t written = 0;
-        __u32 thiscount = count;
-        loff_t pos = *ppos;
-        int err;
-        inode = filp->f_path.dentry->d_inode;
-        D2(printk("***jffs_file_write(): inode: 0x%p (ino: %lu), "
-                  "filp: 0x%p, buf: 0x%p, count: %d\n",
-                  inode, inode->i_ino, filp, buf, count));
-#if 0
-        if (inode->i_sb->s_flags & MS_RDONLY) {
-                D(printk("jffs_file_write(): MS_RDONLY\n"));
-                err = -EROFS;
-                goto out_isem;
-        }
-#endif  
-        err = -EINVAL;
-        if (!S_ISREG(inode->i_mode)) {
-                D(printk("jffs_file_write(): inode->i_mode == 0x%08x\n",
-                                inode->i_mode));
-                goto out_isem;
-        }
-        if (!(f = inode->i_private)) {
-                D(printk("jffs_file_write(): inode->i_private = 0x%p\n",
-                                inode->i_private));
-                goto out_isem;
-        }
-        c = f->c;
-        /*
-         * This will never trigger with sane page sizes.  leave it in
-         * anyway, since I'm thinking about how to merge larger writes
-         * (the current idea is to poke a thread that does the actual
-         * I/O and starts by doing a mutex_lock(&inode->i_mutex).  then we
-         * would need to get the page cache pages and have a list of
-         * I/O requests and do write-merging here.
-         * -- prumpf
-         */
-        thiscount = min(c->fmc->max_chunk_size - sizeof(struct jffs_raw_inode), count);
-        D3(printk (KERN_NOTICE "file_write(): down biglock\n"));
-        mutex_lock(&c->fmc->biglock);
-        /* Urgh. POSIX says we can do short writes if we feel like it. 
-         * In practice, we can't. Nothing will cope. So we loop until
-         * we're done.
-         *
-         * <_Anarchy_> posix and reality are not interconnected on this issue
-         */
-        while (count) {
-                /* Things are going to be written so we could allocate and
-                   initialize the necessary data structures now.  */
-                if (!(node = jffs_alloc_node())) {
-                        D(printk("jffs_file_write(): node == 0\n"));
-                        err = -ENOMEM;
-                        goto out;
-                }
-                node->data_offset = pos;
-                node->removed_size = 0;
-                /* Initialize the raw inode.  */
-                raw_inode.magic = JFFS_MAGIC_BITMASK;
-                raw_inode.ino = f->ino;
-                raw_inode.pino = f->pino;
-                raw_inode.mode = f->mode;
-                raw_inode.uid = f->uid;
-                raw_inode.gid = f->gid;
-                raw_inode.atime = get_seconds();
-                raw_inode.mtime = raw_inode.atime;
-                raw_inode.ctime = f->ctime;
-                raw_inode.offset = pos;
-                raw_inode.dsize = thiscount;
-                raw_inode.rsize = 0;
-                raw_inode.nsize = f->nsize;
-                raw_inode.nlink = f->nlink;
-                raw_inode.spare = 0;
-                raw_inode.rename = 0;
-                raw_inode.deleted = 0;
-                if (pos < f->size) {
-                        node->removed_size = raw_inode.rsize = min(thiscount, (__u32)(f->size - pos));
-                        /* If this node is going entirely over the top of old data,
-                           we can allow it to go into the reserved space, because
-                           we know that GC can reclaim the space later.
-                        */
-                        if (pos + thiscount < f->size) {
-                                /* If all the data we're overwriting are _real_,
-                                   not just holes, then:
-                                   recoverable = 1;
-                                */
-                        }
-                }
-                /* Write the new node to the flash.  */
-                /* NOTE: We would be quite happy if jffs_write_node() wrote a
-                   smaller node than we were expecting. There's no need for it
-                   to waste the space at the end of the flash just because it's
-                   a little smaller than what we asked for. But that's a whole
-                   new can of worms which I'm not going to open this week. 
-                   -- dwmw2.
-                */
-                if ((err = jffs_write_node(c, node, &raw_inode, f->name,
-                                           (const unsigned char *)buf,
-                                           recoverable, f)) < 0) {
-                        D(printk("jffs_file_write(): jffs_write_node() failed.\n"));
-                        jffs_free_node(node);
-                        goto out;
-                }
-                written += err;
-                buf += err;
-                count -= err;
-                pos += err;
-                /* Insert the new node into the file system.  */
-                if ((err = jffs_insert_node(c, f, &raw_inode, NULL, node)) < 0) {
-                        goto out;
-                }
-                D3(printk("jffs_file_write(): new f_pos %ld.\n", (long)pos));
-                thiscount = min(c->fmc->max_chunk_size - sizeof(struct jffs_raw_inode), count);
-        }
- out:
-        D3(printk (KERN_NOTICE "file_write(): up biglock\n"));
-        mutex_unlock(&c->fmc->biglock);
-        /* Fix things in the real inode.  */
-        if (pos > inode->i_size) {
-                inode->i_size = pos;
-                inode->i_blocks = (inode->i_size + 511) >> 9;
-        }
-        inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC;
-        mark_inode_dirty(inode);
-        invalidate_mapping_pages(inode->i_mapping, 0, -1);
- out_isem:
-        return err;
-} /* jffs_file_write()  */
-static int
-jffs_prepare_write(struct file *filp, struct page *page,
-                  unsigned from, unsigned to)
-{
-        /* FIXME: we should detect some error conditions here */
-        /* Bugger that. We should make sure the page is uptodate */
-        if (!PageUptodate(page) && (from || to < PAGE_CACHE_SIZE))
-                return jffs_do_readpage_nolock(filp, page);
-        return 0;
-} /* jffs_prepare_write() */
-static int
-jffs_commit_write(struct file *filp, struct page *page,
-                 unsigned from, unsigned to)
-{
-       void *addr = page_address(page) + from;
-       /* XXX: PAGE_CACHE_SHIFT or PAGE_SHIFT */
-       loff_t pos = page_offset(page) + from;
-       return jffs_file_write(filp, addr, to-from, &pos);
-} /* jffs_commit_write() */
-/* This is our ioctl() routine.  */
-static int
-jffs_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
-           unsigned long arg)
-{
-        struct jffs_control *c;
-        int ret = 0;
-        D2(printk("***jffs_ioctl(): cmd = 0x%08x, arg = 0x%08lx\n",
-                  cmd, arg));
-        if (!(c = (struct jffs_control *)inode->i_sb->s_fs_info)) {
-                printk(KERN_ERR "JFFS: Bad inode in ioctl() call. "
-                       "(cmd = 0x%08x)\n", cmd);
-                return -EIO;
-        }
-        D3(printk (KERN_NOTICE "ioctl(): down biglock\n"));
-        mutex_lock(&c->fmc->biglock);
-        switch (cmd) {
-        case JFFS_PRINT_HASH:
-                jffs_print_hash_table(c);
-                break;
-        case JFFS_PRINT_TREE:
-                jffs_print_tree(c->root, 0);
-                break;
-        case JFFS_GET_STATUS:
-                {
-                        struct jffs_flash_status fst;
-                        struct jffs_fmcontrol *fmc = c->fmc;
-                        printk("Flash status -- ");
-                        if (!access_ok(VERIFY_WRITE,
-                                       (struct jffs_flash_status __user *)arg,
-                                       sizeof(struct jffs_flash_status))) {
-                                D(printk("jffs_ioctl(): Bad arg in "
-                                         "JFFS_GET_STATUS ioctl!\n"));
-                                ret = -EFAULT;
-                                break;
-                        }
-                        fst.size = fmc->flash_size;
-                        fst.used = fmc->used_size;
-                        fst.dirty = fmc->dirty_size;
-                        fst.begin = fmc->head->offset;
-                        fst.end = fmc->tail->offset + fmc->tail->size;
-                        printk("size: %d, used: %d, dirty: %d, "
-                               "begin: %d, end: %d\n",
-                               fst.size, fst.used, fst.dirty,
-                               fst.begin, fst.end);
-                        if (copy_to_user((struct jffs_flash_status __user *)arg,
-                                         &fst,
-                                         sizeof(struct jffs_flash_status))) {
-                                ret = -EFAULT;
-                        }
-                }
-                break;
-        default:
-                ret = -ENOTTY;
-        }
-        D3(printk (KERN_NOTICE "ioctl(): up biglock\n"));
-        mutex_unlock(&c->fmc->biglock);
-        return ret;
-} /* jffs_ioctl()  */
-static const struct address_space_operations jffs_address_operations = {
-        .readpage       = jffs_readpage,
-        .prepare_write  = jffs_prepare_write,
-        .commit_write   = jffs_commit_write,
-};
-static int jffs_fsync(struct file *f, struct dentry *d, int datasync)
-{
-        /* We currently have O_SYNC operations at all times.
-           Do nothing.
-        */
-        return 0;
-}
-static const struct file_operations jffs_file_operations =
-{
-        .open           = generic_file_open,
-        .llseek         = generic_file_llseek,
-        .read           = do_sync_read,
-        .aio_read       = generic_file_aio_read,
-        .write          = do_sync_write,
-        .aio_write      = generic_file_aio_write,
-        .ioctl          = jffs_ioctl,
-        .mmap           = generic_file_readonly_mmap,
-        .fsync          = jffs_fsync,
-        .sendfile       = generic_file_sendfile,
-};
-static const struct inode_operations jffs_file_inode_operations =
-{
-        .lookup         = jffs_lookup,          /* lookup */
-        .setattr        = jffs_setattr,
-};
-static const struct file_operations jffs_dir_operations =
-{
-        .readdir        = jffs_readdir,
-};
-static const struct inode_operations jffs_dir_inode_operations =
-{
-        .create         = jffs_create,
-        .lookup         = jffs_lookup,
-        .unlink         = jffs_unlink,
-        .symlink        = jffs_symlink,
-        .mkdir          = jffs_mkdir,
-        .rmdir          = jffs_rmdir,
-        .mknod          = jffs_mknod,
-        .rename         = jffs_rename,
-        .setattr        = jffs_setattr,
-};
-/* Initialize an inode for the VFS.  */
-static void
-jffs_read_inode(struct inode *inode)
-{
-        struct jffs_file *f;
-        struct jffs_control *c;
-        D3(printk("jffs_read_inode(): inode->i_ino == %lu\n", inode->i_ino));
-        if (!inode->i_sb) {
-                D(printk("jffs_read_inode(): !inode->i_sb ==> "
-                         "No super block!\n"));
-                return;
-        }
-        c = (struct jffs_control *)inode->i_sb->s_fs_info;
-        D3(printk (KERN_NOTICE "read_inode(): down biglock\n"));
-        mutex_lock(&c->fmc->biglock);
-        if (!(f = jffs_find_file(c, inode->i_ino))) {
-                D(printk("jffs_read_inode(): No such inode (%lu).\n",
-                         inode->i_ino));
-                D3(printk (KERN_NOTICE "read_inode(): up biglock\n"));
-                mutex_unlock(&c->fmc->biglock);
-                return;
-        }
-        inode->i_private = f;
-        inode->i_mode = f->mode;
-        inode->i_nlink = f->nlink;
-        inode->i_uid = f->uid;
-        inode->i_gid = f->gid;
-        inode->i_size = f->size;
-        inode->i_atime.tv_sec = f->atime;
-        inode->i_mtime.tv_sec = f->mtime;
-        inode->i_ctime.tv_sec = f->ctime;
-        inode->i_atime.tv_nsec = 
-        inode->i_mtime.tv_nsec = 
-        inode->i_ctime.tv_nsec = 0;
-        inode->i_blocks = (inode->i_size + 511) >> 9;
-        if (S_ISREG(inode->i_mode)) {
-                inode->i_op = &jffs_file_inode_operations;
-                inode->i_fop = &jffs_file_operations;
-                inode->i_mapping->a_ops = &jffs_address_operations;
-        }
-        else if (S_ISDIR(inode->i_mode)) {
-                inode->i_op = &jffs_dir_inode_operations;
-                inode->i_fop = &jffs_dir_operations;
-        }
-        else if (S_ISLNK(inode->i_mode)) {
-                inode->i_op = &page_symlink_inode_operations;
-                inode->i_mapping->a_ops = &jffs_address_operations;
-        }
-        else {
-                /* If the node is a device of some sort, then the number of
-                   the device should be read from the flash memory and then
-                   added to the inode's i_rdev member.  */
-                u16 val;
-                jffs_read_data(f, (char *)&val, 0, 2);
-                init_special_inode(inode, inode->i_mode,
-                        old_decode_dev(val));
-        }
-        D3(printk (KERN_NOTICE "read_inode(): up biglock\n"));
-        mutex_unlock(&c->fmc->biglock);
-}
-static void
-jffs_delete_inode(struct inode *inode)
-{
-        struct jffs_file *f;
-        struct jffs_control *c;
-        D3(printk("jffs_delete_inode(): inode->i_ino == %lu\n",
-                  inode->i_ino));
-        truncate_inode_pages(&inode->i_data, 0);
-        lock_kernel();
-        inode->i_size = 0;
-        inode->i_blocks = 0;
-        inode->i_private = NULL;
-        clear_inode(inode);
-        if (inode->i_nlink == 0) {
-                c = (struct jffs_control *) inode->i_sb->s_fs_info;
-                f = (struct jffs_file *) jffs_find_file (c, inode->i_ino);
-                jffs_possibly_delete_file(f);
-        }
-        unlock_kernel();
-}
-static void
-jffs_write_super(struct super_block *sb)
-{
-        struct jffs_control *c = (struct jffs_control *)sb->s_fs_info;
-        lock_kernel();
-        jffs_garbage_collect_trigger(c);
-        unlock_kernel();
-}
-static int jffs_remount(struct super_block *sb, int *flags, char *data)
-{
-        *flags |= MS_NODIRATIME;
-        return 0;
-}
-static const struct super_operations jffs_ops =
-{
-        .read_inode     = jffs_read_inode,
-        .delete_inode   = jffs_delete_inode,
-        .put_super      = jffs_put_super,
-        .write_super    = jffs_write_super,
-        .statfs         = jffs_statfs,
-        .remount_fs     = jffs_remount,
-};
-static int jffs_get_sb(struct file_system_type *fs_type,
-        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
-{
-        return get_sb_bdev(fs_type, flags, dev_name, data, jffs_fill_super,
-                           mnt);
-}
-static struct file_system_type jffs_fs_type = {
-        .owner          = THIS_MODULE,
-        .name           = "jffs",
-        .get_sb         = jffs_get_sb,
-        .kill_sb        = kill_block_super,
-        .fs_flags       = FS_REQUIRES_DEV,
-};
-static int __init
-init_jffs_fs(void)
-{
-        printk(KERN_INFO "JFFS version " JFFS_VERSION_STRING
-                ", (C) 1999, 2000  Axis Communications AB\n");
-        
-#ifdef CONFIG_JFFS_PROC_FS
-        jffs_proc_root = proc_mkdir("jffs", proc_root_fs);
-        if (!jffs_proc_root) {
-                printk(KERN_WARNING "cannot create /proc/jffs entry\n");
-        }
-#endif
-        fm_cache = kmem_cache_create("jffs_fm", sizeof(struct jffs_fm),
-                       0,
-                       SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD,
-                       NULL, NULL);
-        if (!fm_cache) {
-                return -ENOMEM;
-        }
-        node_cache = kmem_cache_create("jffs_node",sizeof(struct jffs_node),
-                       0,
-                       SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD,
-                       NULL, NULL);
-        if (!node_cache) {
-                kmem_cache_destroy(fm_cache);
-                return -ENOMEM;
-        }
-        return register_filesystem(&jffs_fs_type);
-}
-static void __exit
-exit_jffs_fs(void)
-{
-        unregister_filesystem(&jffs_fs_type);
-        kmem_cache_destroy(fm_cache);
-        kmem_cache_destroy(node_cache);
-}
-module_init(init_jffs_fs)
-module_exit(exit_jffs_fs)
-MODULE_DESCRIPTION("The Journalling Flash File System");
-MODULE_AUTHOR("Axis Communications AB.");
-MODULE_LICENSE("GPL");
diff --git a/fs/jffs/intrep.c b/fs/jffs/intrep.c
deleted file mode 100644
index 6dd18911b44c..000000000000
--- a/fs/jffs/intrep.c
+++ /dev/null
@@ -1,3449 +0,0 @@
-/*
- * JFFS -- Journaling Flash File System, Linux implementation.
- *
- * Copyright (C) 1999, 2000  Axis Communications, Inc.
- *
- * Created by Finn Hakansson <finn@axis.com>.
- *
- * This is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * $Id: intrep.c,v 1.102 2001/09/23 23:28:36 dwmw2 Exp $
- *
- * Ported to Linux 2.3.x and MTD:
- * Copyright (C) 2000  Alexander Larsson (alex@cendio.se), Cendio Systems AB
- *
- */
-/* This file contains the code for the internal structure of the
-   Journaling Flash File System, JFFS.  */
-/*
- * Todo list:
- *
- * memcpy_to_flash() and memcpy_from_flash() functions.
- *
- * Implementation of hard links.
- *
- * Organize the source code in a better way. Against the VFS we could
- * have jffs_ext.c, and against the block device jffs_int.c.
- * A better file-internal organization too.
- *
- * A better checksum algorithm.
- *
- * Consider endianness stuff. ntohl() etc.
- *
- * Are we handling the atime, mtime, ctime members of the inode right?
- *
- * Remove some duplicated code. Take a look at jffs_write_node() and
- * jffs_rewrite_data() for instance.
- *
- * Implement more meaning of the nlink member in various data structures.
- * nlink could be used in conjunction with hard links for instance.
- *
- * Better memory management. Allocate data structures in larger chunks
- * if possible.
- *
- * If too much meta data is stored, a garbage collect should be issued.
- * We have experienced problems with too much meta data with for instance
- * log files.
- *
- * Improve the calls to jffs_ioctl(). We would like to retrieve more
- * information to be able to debug (or to supervise) JFFS during run-time.
- *
- */
-#include <linux/types.h>
-#include <linux/slab.h>
-#include <linux/jffs.h>
-#include <linux/fs.h>
-#include <linux/stat.h>
-#include <linux/pagemap.h>
-#include <linux/mutex.h>
-#include <asm/byteorder.h>
-#include <linux/smp_lock.h>
-#include <linux/time.h>
-#include <linux/ctype.h>
-#include <linux/freezer.h>
-#include "intrep.h"
-#include "jffs_fm.h"
-long no_jffs_node = 0;
-static long no_jffs_file = 0;
-#if defined(JFFS_MEMORY_DEBUG) && JFFS_MEMORY_DEBUG
-long no_jffs_control = 0;
-long no_jffs_raw_inode = 0;
-long no_jffs_node_ref = 0;
-long no_jffs_fm = 0;
-long no_jffs_fmcontrol = 0;
-long no_hash = 0;
-long no_name = 0;
-#endif
-static int jffs_scan_flash(struct jffs_control *c);
-static int jffs_update_file(struct jffs_file *f, struct jffs_node *node);
-static int jffs_build_file(struct jffs_file *f);
-static int jffs_free_file(struct jffs_file *f);
-static int jffs_free_node_list(struct jffs_file *f);
-static int jffs_garbage_collect_now(struct jffs_control *c);
-static int jffs_insert_file_into_hash(struct jffs_file *f);
-static int jffs_remove_redundant_nodes(struct jffs_file *f);
-/* Is there enough space on the flash?  */
-static inline int JFFS_ENOUGH_SPACE(struct jffs_control *c, __u32 space)
-{
-        struct jffs_fmcontrol *fmc = c->fmc;
-        while (1) {
-                if ((fmc->flash_size - (fmc->used_size + fmc->dirty_size))
-                        >= fmc->min_free_size + space) {
-                        return 1;
-                }
-                if (fmc->dirty_size < fmc->sector_size)
-                        return 0;
-                if (jffs_garbage_collect_now(c)) {
-                  D1(printk("JFFS_ENOUGH_SPACE: jffs_garbage_collect_now() failed.\n"));
-                  return 0;
-                }
-        }
-}
-#if CONFIG_JFFS_FS_VERBOSE > 0
-static __u8
-flash_read_u8(struct mtd_info *mtd, loff_t from)
-{
-        size_t retlen;
-        __u8 ret;
-        int res;
-        res = MTD_READ(mtd, from, 1, &retlen, &ret);
-        if (retlen != 1) {
-                printk("Didn't read a byte in flash_read_u8(). Returned %d\n", res);
-                return 0;
-        }
-        return ret;
-}
-static void
-jffs_hexdump(struct mtd_info *mtd, loff_t pos, int size)
-{
-        char line[16];
-        int j = 0;
-        while (size > 0) {
-                int i;
-                printk("%ld:", (long) pos);
-                for (j = 0; j < 16; j++) {
-                        line[j] = flash_read_u8(mtd, pos++);
-                }
-                for (i = 0; i < j; i++) {
-                        if (!(i & 1)) {
-                                printk(" %.2x", line[i] & 0xff);
-                        }
-                        else {
-                                printk("%.2x", line[i] & 0xff);
-                        }
-                }
-                /* Print empty space */
-                for (; i < 16; i++) {
-                        if (!(i & 1)) {
-                                printk("   ");
-                        }
-                        else {
-                                printk("  ");
-                        }
-                }
-                printk("  ");
-                for (i = 0; i < j; i++) {
-                        if (isgraph(line[i])) {
-                                printk("%c", line[i]);
-                        }
-                        else {
-                                printk(".");
-                        }
-                }
-                printk("\n");
-                size -= 16;
-        }
-}
-/* Print the contents of a node.  */
-static void
-jffs_print_node(struct jffs_node *n)
-{
-        D(printk("jffs_node: 0x%p\n", n));
-        D(printk("{\n"));
-        D(printk("        0x%08x, /* version  */\n", n->version));
-        D(printk("        0x%08x, /* data_offset  */\n", n->data_offset));
-        D(printk("        0x%08x, /* data_size  */\n", n->data_size));
-        D(printk("        0x%08x, /* removed_size  */\n", n->removed_size));
-        D(printk("        0x%08x, /* fm_offset  */\n", n->fm_offset));
-        D(printk("        0x%02x,       /* name_size  */\n", n->name_size));
-        D(printk("        0x%p, /* fm,  fm->offset: %u  */\n",
-                 n->fm, (n->fm ? n->fm->offset : 0)));
-        D(printk("        0x%p, /* version_prev  */\n", n->version_prev));
-        D(printk("        0x%p, /* version_next  */\n", n->version_next));
-        D(printk("        0x%p, /* range_prev  */\n", n->range_prev));
-        D(printk("        0x%p, /* range_next  */\n", n->range_next));
-        D(printk("}\n"));
-}
-#endif
-/* Print the contents of a raw inode.  */
-static void
-jffs_print_raw_inode(struct jffs_raw_inode *raw_inode)
-{
-        D(printk("jffs_raw_inode: inode number: %u\n", raw_inode->ino));
-        D(printk("{\n"));
-        D(printk("        0x%08x, /* magic  */\n", raw_inode->magic));
-        D(printk("        0x%08x, /* ino  */\n", raw_inode->ino));
-        D(printk("        0x%08x, /* pino  */\n", raw_inode->pino));
-        D(printk("        0x%08x, /* version  */\n", raw_inode->version));
-        D(printk("        0x%08x, /* mode  */\n", raw_inode->mode));
-        D(printk("        0x%04x,     /* uid  */\n", raw_inode->uid));
-        D(printk("        0x%04x,     /* gid  */\n", raw_inode->gid));
-        D(printk("        0x%08x, /* atime  */\n", raw_inode->atime));
-        D(printk("        0x%08x, /* mtime  */\n", raw_inode->mtime));
-        D(printk("        0x%08x, /* ctime  */\n", raw_inode->ctime));
-        D(printk("        0x%08x, /* offset  */\n", raw_inode->offset));
-        D(printk("        0x%08x, /* dsize  */\n", raw_inode->dsize));
-        D(printk("        0x%08x, /* rsize  */\n", raw_inode->rsize));
-        D(printk("        0x%02x,       /* nsize  */\n", raw_inode->nsize));
-        D(printk("        0x%02x,       /* nlink  */\n", raw_inode->nlink));
-        D(printk("        0x%02x,       /* spare  */\n",
-                 raw_inode->spare));
-        D(printk("        %u,          /* rename  */\n",
-                 raw_inode->rename));
-        D(printk("        %u,          /* deleted  */\n",
-                 raw_inode->deleted));
-        D(printk("        0x%02x,       /* accurate  */\n",
-                 raw_inode->accurate));
-        D(printk("        0x%08x, /* dchksum  */\n", raw_inode->dchksum));
-        D(printk("        0x%04x,     /* nchksum  */\n", raw_inode->nchksum));
-        D(printk("        0x%04x,     /* chksum  */\n", raw_inode->chksum));
-        D(printk("}\n"));
-}
-#define flash_safe_acquire(arg)
-#define flash_safe_release(arg)
-static int
-flash_safe_read(struct mtd_info *mtd, loff_t from,
-                u_char *buf, size_t count)
-{
-        size_t retlen;
-        int res;
-        D3(printk(KERN_NOTICE "flash_safe_read(%p, %08x, %p, %08x)\n",
-                  mtd, (unsigned int) from, buf, count));
-        res = mtd->read(mtd, from, count, &retlen, buf);
-        if (retlen != count) {
-                panic("Didn't read all bytes in flash_safe_read(). Returned %d\n", res);
-        }
-        return res?res:retlen;
-}
-static __u32
-flash_read_u32(struct mtd_info *mtd, loff_t from)
-{
-        size_t retlen;
-        __u32 ret;
-        int res;
-        res = mtd->read(mtd, from, 4, &retlen, (unsigned char *)&ret);
-        if (retlen != 4) {
-                printk("Didn't read all bytes in flash_read_u32(). Returned %d\n", res);
-                return 0;
-        }
-        return ret;
-}
-static int
-flash_safe_write(struct mtd_info *mtd, loff_t to,
-                 const u_char *buf, size_t count)
-{
-        size_t retlen;
-        int res;
-        D3(printk(KERN_NOTICE "flash_safe_write(%p, %08x, %p, %08x)\n",
-                  mtd, (unsigned int) to, buf, count));
-        res = mtd->write(mtd, to, count, &retlen, buf);
-        if (retlen != count) {
-                printk("Didn't write all bytes in flash_safe_write(). Returned %d\n", res);
-        }
-        return res?res:retlen;
-}
-static int
-flash_safe_writev(struct mtd_info *mtd, const struct kvec *vecs,
-                        unsigned long iovec_cnt, loff_t to)
-{
-        size_t retlen, retlen_a;
-        int i;
-        int res;
-        D3(printk(KERN_NOTICE "flash_safe_writev(%p, %08x, %p)\n",
-                  mtd, (unsigned int) to, vecs));
-        if (mtd->writev) {
-                res = mtd->writev(mtd, vecs, iovec_cnt, to, &retlen);
-                return res ? res : retlen;
-        }
-        /* Not implemented writev. Repeatedly use write - on the not so
-           unreasonable assumption that the mtd driver doesn't care how
-           many write cycles we use. */
-        res=0;
-        retlen=0;
-        for (i=0; !res && i<iovec_cnt; i++) {
-                res = mtd->write(mtd, to, vecs[i].iov_len, &retlen_a,
-                                 vecs[i].iov_base);
-                if (retlen_a != vecs[i].iov_len) {
-                        printk("Didn't write all bytes in flash_safe_writev(). Returned %d\n", res);
-                        if (i != iovec_cnt-1)
-                                return -EIO;
-                }
-                /* If res is non-zero, retlen_a is undefined, but we don't
-                   care because in that case it's not going to be 
-                   returned anyway.
-                */
-                to += retlen_a;
-                retlen += retlen_a;
-        }
-        return res?res:retlen;
-}
-static int
-flash_memset(struct mtd_info *mtd, loff_t to,
-             const u_char c, size_t size)
-{
-        static unsigned char pattern[64];
-        int i;
-        /* fill up pattern */
-        for(i = 0; i < 64; i++)
-                pattern[i] = c;
-        /* write as many 64-byte chunks as we can */
-        while (size >= 64) {
-                flash_safe_write(mtd, to, pattern, 64);
-                size -= 64;
-                to += 64;
-        }
-        /* and the rest */
-        if(size)
-                flash_safe_write(mtd, to, pattern, size);
-        return size;
-}
-static void
-intrep_erase_callback(struct erase_info *done)
-{
-        wait_queue_head_t *wait_q;
-        wait_q = (wait_queue_head_t *)done->priv;
-        wake_up(wait_q);
-}
-static int
-flash_erase_region(struct mtd_info *mtd, loff_t start,
-                   size_t size)
-{
-        struct erase_info *erase;
-        DECLARE_WAITQUEUE(wait, current);
-        wait_queue_head_t wait_q;
-        erase = kmalloc(sizeof(struct erase_info), GFP_KERNEL);
-        if (!erase)
-                return -ENOMEM;
-        init_waitqueue_head(&wait_q);
-        erase->mtd = mtd;
-        erase->callback = intrep_erase_callback;
-        erase->addr = start;
-        erase->len = size;
-        erase->priv = (u_long)&wait_q;
-        /* FIXME: Use TASK_INTERRUPTIBLE and deal with being interrupted */
-        set_current_state(TASK_UNINTERRUPTIBLE);
-        add_wait_queue(&wait_q, &wait);
-        if (mtd->erase(mtd, erase) < 0) {
-                set_current_state(TASK_RUNNING);
-                remove_wait_queue(&wait_q, &wait);
-                kfree(erase);
-                printk(KERN_WARNING "flash: erase of region [0x%lx, 0x%lx] "
-                       "totally failed\n", (long)start, (long)start + size);
-                return -1;
-        }
-        schedule(); /* Wait for flash to finish. */
-        remove_wait_queue(&wait_q, &wait);
-        kfree(erase);
-        return 0;
-}
-/* This routine calculates checksums in JFFS.  */
-static __u32
-jffs_checksum(const void *data, int size)
-{
-        __u32 sum = 0;
-        __u8 *ptr = (__u8 *)data;
-        while (size-- > 0) {
-                sum += *ptr++;
-        }
-        D3(printk(", result: 0x%08x\n", sum));
-        return sum;
-}
-static int
-jffs_checksum_flash(struct mtd_info *mtd, loff_t start, int size, __u32 *result)
-{
-        __u32 sum = 0;
-        loff_t ptr = start;
-        __u8 *read_buf;
-        int i, length;
-        /* Allocate read buffer */
-        read_buf = kmalloc(sizeof(__u8) * 4096, GFP_KERNEL);
-        if (!read_buf) {
-                printk(KERN_NOTICE "kmalloc failed in jffs_checksum_flash()\n");
-                return -ENOMEM;
-        }
-        /* Loop until checksum done */
-        while (size) {
-                /* Get amount of data to read */
-                if (size < 4096)
-                        length = size;
-                else
-                        length = 4096;
-                /* Perform flash read */
-                D3(printk(KERN_NOTICE "jffs_checksum_flash\n"));
-                flash_safe_read(mtd, ptr, &read_buf[0], length);
-                /* Compute checksum */
-                for (i=0; i < length ; i++)
-                        sum += read_buf[i];
-                /* Update pointer and size */
-                size -= length;
-                ptr += length;
-        }
-        /* Free read buffer */
-        kfree(read_buf);
-        /* Return result */
-        D3(printk("checksum result: 0x%08x\n", sum));
-        *result = sum;
-        return 0;
-}
-static __inline__ void jffs_fm_write_lock(struct jffs_fmcontrol *fmc)
-{
-  //    down(&fmc->wlock);
-}
-static __inline__ void jffs_fm_write_unlock(struct jffs_fmcontrol *fmc)
-{
-  //    up(&fmc->wlock);
-}
-/* Create and initialize a new struct jffs_file.  */
-static struct jffs_file *
-jffs_create_file(struct jffs_control *c,
-                 const struct jffs_raw_inode *raw_inode)
-{
-        struct jffs_file *f;
-        if (!(f = kzalloc(sizeof(*f), GFP_KERNEL))) {
-                D(printk("jffs_create_file(): Failed!\n"));
-                return NULL;
-        }
-        no_jffs_file++;
-        f->ino = raw_inode->ino;
-        f->pino = raw_inode->pino;
-        f->nlink = raw_inode->nlink;
-        f->deleted = raw_inode->deleted;
-        f->c = c;
-        return f;
-}
-/* Build a control block for the file system.  */
-static struct jffs_control *
-jffs_create_control(struct super_block *sb)
-{
-        struct jffs_control *c;
-        register int s = sizeof(struct jffs_control);
-        int i;
-        D(char *t = 0);
-        D2(printk("jffs_create_control()\n"));
-        if (!(c = kmalloc(s, GFP_KERNEL))) {
-                goto fail_control;
-        }
-        DJM(no_jffs_control++);
-        c->root = NULL;
-        c->gc_task = NULL;
-        c->hash_len = JFFS_HASH_SIZE;
-        s = sizeof(struct list_head) * c->hash_len;
-        if (!(c->hash = kmalloc(s, GFP_KERNEL))) {
-                goto fail_hash;
-        }
-        DJM(no_hash++);
-        for (i = 0; i < c->hash_len; i++)
-                INIT_LIST_HEAD(&c->hash[i]);
-        if (!(c->fmc = jffs_build_begin(c, MINOR(sb->s_dev)))) {
-                goto fail_fminit;
-        }
-        c->next_ino = JFFS_MIN_INO + 1;
-        c->delete_list = (struct jffs_delete_list *) 0;
-        return c;
-fail_fminit:
-        D(t = "c->fmc");
-fail_hash:
-        kfree(c);
-        DJM(no_jffs_control--);
-        D(t = t ? t : "c->hash");
-fail_control:
-        D(t = t ? t : "control");
-        D(printk("jffs_create_control(): Allocation failed: (%s)\n", t));
-        return (struct jffs_control *)0;
-}
-/* Clean up all data structures associated with the file system.  */
-void
-jffs_cleanup_control(struct jffs_control *c)
-{
-        D2(printk("jffs_cleanup_control()\n"));
-        if (!c) {
-                D(printk("jffs_cleanup_control(): c == NULL !!!\n"));
-                return;
-        }
-        while (c->delete_list) {
-                struct jffs_delete_list *delete_list_element;
-                delete_list_element = c->delete_list;
-                c->delete_list = c->delete_list->next;
-                kfree(delete_list_element);
-        }
-        /* Free all files and nodes.  */
-        if (c->hash) {
-                jffs_foreach_file(c, jffs_free_node_list);
-                jffs_foreach_file(c, jffs_free_file);
-                kfree(c->hash);
-                DJM(no_hash--);
-        }
-        jffs_cleanup_fmcontrol(c->fmc);
-        kfree(c);
-        DJM(no_jffs_control--);
-        D3(printk("jffs_cleanup_control(): Leaving...\n"));
-}
-/* This function adds a virtual root node to the in-RAM representation.
-   Called by jffs_build_fs().  */
-static int
-jffs_add_virtual_root(struct jffs_control *c)
-{
-        struct jffs_file *root;
-        struct jffs_node *node;
-        D2(printk("jffs_add_virtual_root(): "
-                  "Creating a virtual root directory.\n"));
-        if (!(root = kzalloc(sizeof(struct jffs_file), GFP_KERNEL))) {
-                return -ENOMEM;
-        }
-        no_jffs_file++;
-        if (!(node = jffs_alloc_node())) {
-                kfree(root);
-                no_jffs_file--;
-                return -ENOMEM;
-        }
-        DJM(no_jffs_node++);
-        memset(node, 0, sizeof(struct jffs_node));
-        node->ino = JFFS_MIN_INO;
-        root->ino = JFFS_MIN_INO;
-        root->mode = S_IFDIR | S_IRWXU | S_IRGRP
-                     | S_IXGRP | S_IROTH | S_IXOTH;
-        root->atime = root->mtime = root->ctime = get_seconds();
-        root->nlink = 1;
-        root->c = c;
-        root->version_head = root->version_tail = node;
-        jffs_insert_file_into_hash(root);
-        return 0;
-}
-/* This is where the file system is built and initialized.  */
-int
-jffs_build_fs(struct super_block *sb)
-{
-        struct jffs_control *c;
-        int err = 0;
-        D2(printk("jffs_build_fs()\n"));
-        if (!(c = jffs_create_control(sb))) {
-                return -ENOMEM;
-        }
-        c->building_fs = 1;
-        c->sb = sb;
-        if ((err = jffs_scan_flash(c)) < 0) {
-                if(err == -EAGAIN){
-                        /* scan_flash() wants us to try once more. A flipping 
-                           bits sector was detect in the middle of the scan flash.
-                           Clean up old allocated memory before going in.
-                        */
-                        D1(printk("jffs_build_fs: Cleaning up all control structures,"
-                                  " reallocating them and trying mount again.\n"));
-                        jffs_cleanup_control(c);
-                        if (!(c = jffs_create_control(sb))) {
-                                return -ENOMEM;
-                        }
-                        c->building_fs = 1;
-                        c->sb = sb;
-                        if ((err = jffs_scan_flash(c)) < 0) {
-                                goto jffs_build_fs_fail;
-                        }                       
-                }else{
-                        goto jffs_build_fs_fail;
-                }
-        }
-        /* Add a virtual root node if no one exists.  */
-        if (!jffs_find_file(c, JFFS_MIN_INO)) {
-                if ((err = jffs_add_virtual_root(c)) < 0) {
-                        goto jffs_build_fs_fail;
-                }
-        }
-        while (c->delete_list) {
-                struct jffs_file *f;
-                struct jffs_delete_list *delete_list_element;
-                if ((f = jffs_find_file(c, c->delete_list->ino))) {
-                        f->deleted = 1;
-                }
-                delete_list_element = c->delete_list;
-                c->delete_list = c->delete_list->next;
-                kfree(delete_list_element);
-        }
-        /* Remove deleted nodes.  */
-        if ((err = jffs_foreach_file(c, jffs_possibly_delete_file)) < 0) {
-                printk(KERN_ERR "JFFS: Failed to remove deleted nodes.\n");
-                goto jffs_build_fs_fail;
-        }
-        /* Remove redundant nodes.  (We are not interested in the
-           return value in this case.)  */
-        jffs_foreach_file(c, jffs_remove_redundant_nodes);
-        /* Try to build a tree from all the nodes.  */
-        if ((err = jffs_foreach_file(c, jffs_insert_file_into_tree)) < 0) {
-                printk("JFFS: Failed to build tree.\n");
-                goto jffs_build_fs_fail;
-        }
-        /* Compute the sizes of all files in the filesystem.  Adjust if
-           necessary.  */
-        if ((err = jffs_foreach_file(c, jffs_build_file)) < 0) {
-                printk("JFFS: Failed to build file system.\n");
-                goto jffs_build_fs_fail;
-        }
-        sb->s_fs_info = (void *)c;
-        c->building_fs = 0;
-        D1(jffs_print_hash_table(c));
-        D1(jffs_print_tree(c->root, 0));
-        return 0;
-jffs_build_fs_fail:
-        jffs_cleanup_control(c);
-        return err;
-} /* jffs_build_fs()  */
-/*
-  check_partly_erased_sectors() - scan the flash for sectors whose erase
-  cycle never completed and erase them again.
-
-  This checks for sectors that were being erased in their previous 
-  lifetimes and for some reason or the other (power fail etc.), 
-  the erase cycles never completed.
-  As the flash array would have reverted back to read status, 
-  these sectors are detected by the symptom of the "flipping bits",
-  i.e. bits being read back differently from the same location in
-  flash if read multiple times.
-  The only solution to this is to re-erase the entire
-  sector.
-  Unfortunately detecting "flipping bits" is not a simple exercise
-  as a bit may be read back at 1 or 0 depending on the alignment 
-  of the stars in the universe.
-  The level of confidence is in direct proportion to the number of 
-  scans done. By power fail testing I (Vipin) have been able to 
-  prove that reading twice is not enough.
-  Maybe 4 times? Change NUM_REREADS to a higher number if you want
-  a (even) higher degree of confidence in your mount process. 
-  A higher number would of course slow down your mount.
-
-  How the scan works: the range 0 .. fmc->flash_size of fmc->mtd is
-  walked in windows of READ_AHEAD_BYTES.  Each window is read once into
-  a reference buffer and then re-read NUM_REREADS times; every re-read
-  is compared with the reference one 32-bit word at a time.  On the
-  first mismatch the sector containing that word is erased and the scan
-  resumes at the start of the following sector.
-
-  fmc: flash memory control describing the device (mtd), its size
-       (flash_size) and its erase sector size (sector_size).
-
-  Returns 0 when the whole range has been scanned, -ENOMEM if either
-  read buffer cannot be allocated, or -EIO if erasing a sector fails.
-*/
-static int check_partly_erased_sectors(struct jffs_fmcontrol *fmc){
-#define NUM_REREADS             4 /* see note above */
-#define READ_AHEAD_BYTES        4096 /* must be a multiple of 4, 
-                                        usually set to kernel page size */
-        __u8 *read_buf1; /* reference read of the current window */
-        __u8 *read_buf2; /* re-read of the same window, compared with read_buf1 */
-        int err = 0;
-        int retlen;
-        int i;
-        int cnt;
-        __u32 offset;
-        loff_t pos = 0;
-        loff_t end = fmc->flash_size;
-        /* Allocate read buffers */
-        read_buf1 = kmalloc(sizeof(__u8) * READ_AHEAD_BYTES, GFP_KERNEL);
-        if (!read_buf1)
-                return -ENOMEM;
-        read_buf2 = kmalloc(sizeof(__u8) * READ_AHEAD_BYTES, GFP_KERNEL);
-        if (!read_buf2) {
-                kfree(read_buf1);
-                return -ENOMEM;
-        }
- CHECK_NEXT: /* re-entered after a sector was erased and pos moved past it */
-        while(pos < end){
-                
-                D1(printk("check_partly_erased_sector():checking sector which contains"
-                          " offset 0x%x for flipping bits..\n", (__u32)pos));
-                
-                retlen = flash_safe_read(fmc->mtd, pos,
-                                         &read_buf1[0], READ_AHEAD_BYTES);
-                retlen &= ~3; /* round down: only whole 32-bit words are compared */
-                
-                for(cnt = 0; cnt < NUM_REREADS; cnt++){
-                        (void)flash_safe_read(fmc->mtd, pos,
-                                              &read_buf2[0], READ_AHEAD_BYTES); /* result discarded; the compare below uses retlen from the first read */
-                        
-                        for (i=0 ; i < retlen ; i+=4) {
-                                /* buffers MUST match, double word for word! */
-                                if(*((__u32 *) &read_buf1[i]) !=
-                                   *((__u32 *) &read_buf2[i])
-                                   ){
-                                        /* flipping bits detected, time to erase sector */
-                                        /* This will help us log some statistics etc. */
-                                        D1(printk("Flipping bits detected in re-read round:%i of %i\n",
-                                               cnt, NUM_REREADS));
-                                        D1(printk("check_partly_erased_sectors:flipping bits detected"
-                                                  " @offset:0x%x(0x%x!=0x%x)\n",
-                                                  (__u32)pos+i, *((__u32 *) &read_buf1[i]), 
-                                                  *((__u32 *) &read_buf2[i])));
-                                        
-                                        /* calculate start of present sector */
-                                        offset = (((__u32)pos+i)/(__u32)fmc->sector_size) * (__u32)fmc->sector_size;
-                                        
-                                        D1(printk("check_partly_erased_sector():erasing sector starting 0x%x.\n",
-                                                  offset));
-                                        
-                                        if (flash_erase_region(fmc->mtd,
-                                                               offset, fmc->sector_size) < 0) {
-                                                printk(KERN_ERR "JFFS: Erase of flash failed. "
-                                                       "offset = %u, erase_size = %d\n",
-                                                       offset , fmc->sector_size);
-                                                
-                                                err = -EIO;
-                                                goto returnBack;
-                                        }else{
-                                                D1(printk("JFFS: Erase of flash sector @0x%x successful.\n",
-                                                       offset));
-                                                /* skip ahead to the next sector */
-                                                pos = (((__u32)pos+i)/(__u32)fmc->sector_size) * (__u32)fmc->sector_size;
-                                                pos += fmc->sector_size;
-                                                goto CHECK_NEXT;
-                                        }
-                                }
-                        }
-                }
-                pos += READ_AHEAD_BYTES; /* no mismatch in any re-read; advance a full window regardless of retlen */
-        }
- returnBack: /* common exit: both buffers are freed on success and on erase failure */
-        kfree(read_buf1);
-        kfree(read_buf2);
-        D2(printk("check_partly_erased_sector():Done checking all sectors till offset 0x%x for flipping bits.\n",
-                  (__u32)pos));
-        return err;
-}/* end check_partly_erased_sectors() */
-/* Scan the whole flash memory in order to find all nodes in the
-   file systems.  */
-static int
-jffs_scan_flash(struct jffs_control *c)
-{
-        char name[JFFS_MAX_NAME_LEN + 2];
-        struct jffs_raw_inode raw_inode;
-        struct jffs_node *node = NULL;
-        struct jffs_fmcontrol *fmc = c->fmc;
-        __u32 checksum;
-        __u8 tmp_accurate;
-        __u16 tmp_chksum;
-        __u32 deleted_file;
-        loff_t pos = 0;
-        loff_t start;
-        loff_t test_start;
-        loff_t end = fmc->flash_size;
-        __u8 *read_buf;
-        int i, len, retlen;
-        __u32 offset;
-        __u32 free_chunk_size1;
-        __u32 free_chunk_size2;
-        
-#define NUMFREEALLOWED     2        /* 2 chunks of at least erase size space allowed */
-        int num_free_space = 0;       /* Flag err if more than TWO
-                                       free blocks found. This is NOT allowed
-                                       by the current jffs design.
-                                    */
-        int num_free_spc_not_accp = 0; /* For debugging purposed keep count 
-                                        of how much free space was rejected and
-                                        marked dirty
-                                     */
-        D1(printk("jffs_scan_flash(): start pos = 0x%lx, end = 0x%lx\n",
-                  (long)pos, (long)end));
-        flash_safe_acquire(fmc->mtd);
-        /*
-          check and make sure that any sector does not suffer
-          from the "partly erased, bit flipping syndrome" (TM Vipin :)
-          If so, offending sectors will be erased.
-        */
-        if(check_partly_erased_sectors(fmc) < 0){
-                flash_safe_release(fmc->mtd);
-                return -EIO; /* bad, bad, bad error. Cannot continue.*/
-        }
-        /* Allocate read buffer */
-        read_buf = kmalloc(sizeof(__u8) * 4096, GFP_KERNEL);
-        if (!read_buf) {
-                flash_safe_release(fmc->mtd);
-                return -ENOMEM;
-        }
-                              
-        /* Start the scan.  */
-        while (pos < end) {
-                deleted_file = 0;
-                /* Remember the position from where we started this scan.  */
-                start = pos;
-                switch (flash_read_u32(fmc->mtd, pos)) {
-                case JFFS_EMPTY_BITMASK:
-                        /* We have found 0xffffffff at this position.  We have to
-                           scan the rest of the flash till the end or till
-                           something else than 0xffffffff is found.
-                           Keep going till we do not find JFFS_EMPTY_BITMASK 
-                           anymore */
-                        D1(printk("jffs_scan_flash(): 0xffffffff at pos 0x%lx.\n",
-                                  (long)pos));
-                        while(pos < end){
-                              len = end - pos < 4096 ? end - pos : 4096;
-                              
-                              retlen = flash_safe_read(fmc->mtd, pos,
-                                                 &read_buf[0], len);
-                              retlen &= ~3;
-                              
-                              for (i=0 ; i < retlen ; i+=4, pos += 4) {
-                                      if(*((__u32 *) &read_buf[i]) !=
-                                         JFFS_EMPTY_BITMASK)
-                                        break;
-                              }
-                              if (i == retlen)
-                                    continue;
-                              else
-                                    break;
-                        }
-                        D1(printk("jffs_scan_flash():0xffffffff ended at pos 0x%lx.\n",
-                                  (long)pos));
-                        
-                        /* If some free space ends in the middle of a sector,
-                           treat it as dirty rather than clean.
-                           This is to handle the case where one thread 
-                           allocated space for a node, but didn't get to
-                           actually _write_ it before power was lost, leaving
-                           a gap in the log. Shifting all node writes into
-                           a single kernel thread will fix the original problem.
-                        */
-                        if ((__u32) pos % fmc->sector_size) {
-                                /* If there was free space in previous 
-                                   sectors, don't mark that dirty too - 
-                                   only from the beginning of this sector
-                                   (or from start) 
-                                */
-                                test_start = pos & ~(fmc->sector_size-1); /* end of last sector */
-                                if (start < test_start) {
-                                        /* free space started in the previous sector! */
-                                        if((num_free_space < NUMFREEALLOWED) && 
-                                           ((unsigned int)(test_start - start) >= fmc->sector_size)){
-                                                /*
-                                                  Count it in if we are still under NUMFREEALLOWED *and* it is 
-                                                  at least 1 erase sector in length. This will keep us from 
-                                                  picking any little ole' space as "free".
-                                                */
-                                          
-                                                D1(printk("Reducing end of free space to 0x%x from 0x%x\n",
-                                                          (unsigned int)test_start, (unsigned int)pos));
-                                                D1(printk("Free space accepted: Starting 0x%x for 0x%x bytes\n",
-                                                          (unsigned int) start,
-                                                          (unsigned int)(test_start - start)));
-                                                /* below, space from "start" to "pos" will be marked dirty. */
-                                                start = test_start; 
-                                                
-                                                /* Being in here means that we have found at least an entire 
-                                                   erase sector size of free space ending on a sector boundary.
-                                                   Keep track of free spaces accepted.
-                                                */
-                                                num_free_space++;
-                                        }else{
-                                                num_free_spc_not_accp++;
-                                                D1(printk("Free space (#%i) found but *Not* accepted: Starting"
-                                                          " 0x%x for 0x%x bytes\n",
-                                                          num_free_spc_not_accp, (unsigned int)start, 
-                                                          (unsigned int)((unsigned int)(pos & ~(fmc->sector_size-1)) - (unsigned int)start)));
-                                                
-                                        }
-                                        
-                                }
-                                if((((__u32)(pos - start)) != 0)){
-                                        D1(printk("Dirty space: Starting 0x%x for 0x%x bytes\n",
-                                                  (unsigned int) start, (unsigned int) (pos - start)));
-                                        jffs_fmalloced(fmc, (__u32) start,
-                                                       (__u32) (pos - start), NULL);
-                                }else{
-                                        /* "Flipping bits" detected. This means that our scan for them
-                                           did not catch this offset. See check_partly_erased_sectors() for
-                                           more info.
-                                        */
-                                        
-                                        D1(printk("jffs_scan_flash():wants to allocate dirty flash "
-                                                  "space for 0 bytes.\n"));
-                                        D1(printk("jffs_scan_flash(): Flipping bits! We will free "
-                                                  "all allocated memory, erase this sector and remount\n"));
-                                        /* calculate start of present sector */
-                                        offset = (((__u32)pos)/(__u32)fmc->sector_size) * (__u32)fmc->sector_size;
-                                        
-                                        D1(printk("jffs_scan_flash():erasing sector starting 0x%x.\n",
-                                                  offset));
-                                        
-                                        if (flash_erase_region(fmc->mtd,
-                                                               offset, fmc->sector_size) < 0) {
-                                                printk(KERN_ERR "JFFS: Erase of flash failed. "
-                                                       "offset = %u, erase_size = %d\n",
-                                                       offset , fmc->sector_size);
-                                                flash_safe_release(fmc->mtd);
-                                                kfree(read_buf);
-                                                return -1; /* bad, bad, bad! */
-                                        }
-                                        flash_safe_release(fmc->mtd);
-                                        kfree(read_buf);
-                                        return -EAGAIN; /* erased offending sector. Try mount one more time please. */
-                                }
-                        }else{
-                                /* Being in here means that we have found free space that ends on an erase sector
-                                   boundary.
-                                   Count it in if we are still under NUMFREEALLOWED *and* it is at least 1 erase 
-                                   sector in length. This will keep us from picking any little ole' space as "free".
-                                 */
-                                 if((num_free_space < NUMFREEALLOWED) && 
-                                    ((unsigned int)(pos - start) >= fmc->sector_size)){
-                                           /* We really don't do anything to mark space as free, except *not* 
-                                              mark it dirty and just advance the "pos" location pointer. 
-                                              It will automatically be picked up as free space.
-                                            */ 
-                                           num_free_space++;
-                                           D1(printk("Free space accepted: Starting 0x%x for 0x%x bytes\n",
-                                                     (unsigned int) start, (unsigned int) (pos - start)));
-                                 }else{
-                                         num_free_spc_not_accp++;
-                                         D1(printk("Free space (#%i) found but *Not* accepted: Starting "
-                                                   "0x%x for 0x%x bytes\n", num_free_spc_not_accp, 
-                                                   (unsigned int) start, 
-                                                   (unsigned int) (pos - start)));
-                                         
-                                         /* Mark this space as dirty. We already have our free space. */
-                                         D1(printk("Dirty space: Starting 0x%x for 0x%x bytes\n",
-                                                   (unsigned int) start, (unsigned int) (pos - start)));
-                                         jffs_fmalloced(fmc, (__u32) start,
-                                                        (__u32) (pos - start), NULL);                                      
-                                 }
-                                 
-                        }
-                        if(num_free_space > NUMFREEALLOWED){
-                                 printk(KERN_WARNING "jffs_scan_flash(): Found free space "
-                                        "number %i. Only %i free space is allowed.\n",
-                                        num_free_space, NUMFREEALLOWED);                              
-                        }
-                        continue;
-                case JFFS_DIRTY_BITMASK:
-                        /* We have found 0x00000000 at this position.  Scan as far
-                           as possible to find out how much is dirty.  */
-                        D1(printk("jffs_scan_flash(): 0x00000000 at pos 0x%lx.\n",
-                                  (long)pos));
-                        for (; pos < end
-                               && JFFS_DIRTY_BITMASK == flash_read_u32(fmc->mtd, pos);
-                             pos += 4);
-                        D1(printk("jffs_scan_flash(): 0x00 ended at "
-                                  "pos 0x%lx.\n", (long)pos));
-                        jffs_fmalloced(fmc, (__u32) start,
-                                       (__u32) (pos - start), NULL);
-                        continue;
-                case JFFS_MAGIC_BITMASK:
-                        /* We have probably found a new raw inode.  */
-                        break;
-                default:
-                bad_inode:
-                        /* We're f*cked.  This is not solved yet.  We have
-                           to scan for the magic pattern.  */
-                        D1(printk("*************** Dirty flash memory or "
-                                  "bad inode: "
-                                  "hexdump(pos = 0x%lx, len = 128):\n",
-                                  (long)pos));
-                        D1(jffs_hexdump(fmc->mtd, pos, 128));
-                        for (pos += 4; pos < end; pos += 4) {
-                                switch (flash_read_u32(fmc->mtd, pos)) {
-                                case JFFS_MAGIC_BITMASK:
-                                case JFFS_EMPTY_BITMASK:
-                                        /* handle these in the main switch() loop */
-                                        goto cont_scan;
-                                default:
-                                        break;
-                                }
-                        }
-                        cont_scan:
-                        /* First, mark as dirty the region
-                           which really does contain crap. */
-                        jffs_fmalloced(fmc, (__u32) start,
-                                       (__u32) (pos - start),
-                                       NULL);
-                        
-                        continue;
-                }/* switch */
-                /* We have found the beginning of an inode.  Create a
-                   node for it unless there already is one available.  */
-                if (!node) {
-                        if (!(node = jffs_alloc_node())) {
-                                /* Free read buffer */
-                                kfree(read_buf);
-                                /* Release the flash device */
-                                flash_safe_release(fmc->mtd);
-        
-                                return -ENOMEM;
-                        }
-                        DJM(no_jffs_node++);
-                }
-                /* Read the next raw inode.  */
-                flash_safe_read(fmc->mtd, pos, (u_char *) &raw_inode,
-                                sizeof(struct jffs_raw_inode));
-                /* When we compute the checksum for the inode, we never
-                   count the 'accurate' or the 'checksum' fields.  */
-                tmp_accurate = raw_inode.accurate;
-                tmp_chksum = raw_inode.chksum;
-                raw_inode.accurate = 0;
-                raw_inode.chksum = 0;
-                checksum = jffs_checksum(&raw_inode,
-                                         sizeof(struct jffs_raw_inode));
-                raw_inode.accurate = tmp_accurate;
-                raw_inode.chksum = tmp_chksum;
-                D3(printk("*** We have found this raw inode at pos 0x%lx "
-                          "on the flash:\n", (long)pos));
-                D3(jffs_print_raw_inode(&raw_inode));
-                if (checksum != raw_inode.chksum) {
-                        D1(printk("jffs_scan_flash(): Bad checksum: "
-                                  "checksum = %u, "
-                                  "raw_inode.chksum = %u\n",
-                                  checksum, raw_inode.chksum));
-                        pos += sizeof(struct jffs_raw_inode);
-                        jffs_fmalloced(fmc, (__u32) start,
-                                       (__u32) (pos - start), NULL);
-                        /* Reuse this unused struct jffs_node.  */
-                        continue;
-                }
-                /* Check the raw inode read so far.  Start with the
-                   maximum length of the filename.  */
-                if (raw_inode.nsize > JFFS_MAX_NAME_LEN) {
-                        printk(KERN_WARNING "jffs_scan_flash: Found a "
-                               "JFFS node with name too large\n");
-                        goto bad_inode;
-                }
-                if (raw_inode.rename && raw_inode.dsize != sizeof(__u32)) {
-                        printk(KERN_WARNING "jffs_scan_flash: Found a "
-                               "rename node with dsize %u.\n",
-                               raw_inode.dsize);
-                        jffs_print_raw_inode(&raw_inode);
-                        goto bad_inode;
-                }
-                /* The node's data segment should not exceed a
-                   certain length.  */
-                if (raw_inode.dsize > fmc->max_chunk_size) {
-                        printk(KERN_WARNING "jffs_scan_flash: Found a "
-                               "JFFS node with dsize (0x%x) > max_chunk_size (0x%x)\n",
-                               raw_inode.dsize, fmc->max_chunk_size);
-                        goto bad_inode;
-                }
-                pos += sizeof(struct jffs_raw_inode);
-                /* This shouldn't be necessary because a node that
-                   violates the flash boundaries shouldn't be written
-                   in the first place. */
-                if (pos >= end) {
-                        goto check_node;
-                }
-                /* Read the name.  */
-                *name = 0;
-                if (raw_inode.nsize) {
-                        flash_safe_read(fmc->mtd, pos, name, raw_inode.nsize);
-                        name[raw_inode.nsize] = '\0';
-                        pos += raw_inode.nsize
-                               + JFFS_GET_PAD_BYTES(raw_inode.nsize);
-                        D3(printk("name == \"%s\"\n", name));
-                        checksum = jffs_checksum(name, raw_inode.nsize);
-                        if (checksum != raw_inode.nchksum) {
-                                D1(printk("jffs_scan_flash(): Bad checksum: "
-                                          "checksum = %u, "
-                                          "raw_inode.nchksum = %u\n",
-                                          checksum, raw_inode.nchksum));
-                                jffs_fmalloced(fmc, (__u32) start,
-                                               (__u32) (pos - start), NULL);
-                                /* Reuse this unused struct jffs_node.  */
-                                continue;
-                        }
-                        if (pos >= end) {
-                                goto check_node;
-                        }
-                }
-                /* Read the data, if it exists, in order to be sure it
-                   matches the checksum.  */
-                if (raw_inode.dsize) {
-                        if (raw_inode.rename) {
-                                deleted_file = flash_read_u32(fmc->mtd, pos);
-                        }
-                        if (jffs_checksum_flash(fmc->mtd, pos, raw_inode.dsize, &checksum)) {
-                                printk("jffs_checksum_flash() failed to calculate a checksum\n");
-                                jffs_fmalloced(fmc, (__u32) start,
-                                               (__u32) (pos - start), NULL);
-                                /* Reuse this unused struct jffs_node.  */
-                                continue;
-                        }                               
-                        pos += raw_inode.dsize
-                               + JFFS_GET_PAD_BYTES(raw_inode.dsize);
-                        if (checksum != raw_inode.dchksum) {
-                                D1(printk("jffs_scan_flash(): Bad checksum: "
-                                          "checksum = %u, "
-                                          "raw_inode.dchksum = %u\n",
-                                          checksum, raw_inode.dchksum));
-                                jffs_fmalloced(fmc, (__u32) start,
-                                               (__u32) (pos - start), NULL);
-                                /* Reuse this unused struct jffs_node.  */
-                                continue;
-                        }
-                }
-                check_node:
-                /* Remember the highest inode number in the whole file
-                   system.  This information will be used when assigning
-                   new files new inode numbers.  */
-                if (c->next_ino <= raw_inode.ino) {
-                        c->next_ino = raw_inode.ino + 1;
-                }
-                if (raw_inode.accurate) {
-                        int err;
-                        node->data_offset = raw_inode.offset;
-                        node->data_size = raw_inode.dsize;
-                        node->removed_size = raw_inode.rsize;
-                        /* Compute the offset to the actual data in the
-                           on-flash node.  */
-                        node->fm_offset
-                        = sizeof(struct jffs_raw_inode)
-                          + raw_inode.nsize
-                          + JFFS_GET_PAD_BYTES(raw_inode.nsize);
-                        node->fm = jffs_fmalloced(fmc, (__u32) start,
-                                                  (__u32) (pos - start),
-                                                  node);
-                        if (!node->fm) {
-                                D(printk("jffs_scan_flash(): !node->fm\n"));
-                                jffs_free_node(node);
-                                DJM(no_jffs_node--);
-                                /* Free read buffer */
-                                kfree(read_buf);
-                                /* Release the flash device */
-                                flash_safe_release(fmc->mtd);
-                                return -ENOMEM;
-                        }
-                        if ((err = jffs_insert_node(c, NULL, &raw_inode,
-                                                    name, node)) < 0) {
-                                printk("JFFS: Failed to handle raw inode. "
-                                       "(err = %d)\n", err);
-                                break;
-                        }
-                        if (raw_inode.rename) {
-                                struct jffs_delete_list *dl
-                                = (struct jffs_delete_list *)
-                                  kmalloc(sizeof(struct jffs_delete_list),
-                                          GFP_KERNEL);
-                                if (!dl) {
-                                        D(printk("jffs_scan_flash: !dl\n"));
-                                        jffs_free_node(node);
-                                        DJM(no_jffs_node--);
-                                        /* Release the flash device */
-                                        flash_safe_release(fmc->flash_part);
-                                        /* Free read buffer */
-                                        kfree(read_buf);
-                                        return -ENOMEM;
-                                }
-                                dl->ino = deleted_file;
-                                dl->next = c->delete_list;
-                                c->delete_list = dl;
-                                node->data_size = 0;
-                        }
-                        D3(jffs_print_node(node));
-                        node = NULL; /* Don't free the node!  */
-                }
-                else {
-                        jffs_fmalloced(fmc, (__u32) start,
-                                       (__u32) (pos - start), NULL);
-                        D3(printk("jffs_scan_flash(): Just found an obsolete "
-                                  "raw_inode. Continuing the scan...\n"));
-                        /* Reuse this unused struct jffs_node.  */
-                }
-        }
-        if (node) {
-                jffs_free_node(node);
-                DJM(no_jffs_node--);
-        }
-        jffs_build_end(fmc);
-        /* Free read buffer */
-        kfree(read_buf);
-        if(!num_free_space){
-                printk(KERN_WARNING "jffs_scan_flash(): Did not find even a single "
-                       "chunk of free space. This is BAD!\n");
-        }
-        /* Return happy */
-        D3(printk("jffs_scan_flash(): Leaving...\n"));
-        flash_safe_release(fmc->mtd);
-        /* This is to trap the "free size accounting screwed error. */
-        free_chunk_size1 = jffs_free_size1(fmc);
-        free_chunk_size2 = jffs_free_size2(fmc);
-        if (free_chunk_size1 + free_chunk_size2 != fmc->free_size) {
-                printk(KERN_WARNING "jffs_scan_falsh():Free size accounting screwed\n");
-                printk(KERN_WARNING "jfffs_scan_flash():free_chunk_size1 == 0x%x, "
-                       "free_chunk_size2 == 0x%x, fmc->free_size == 0x%x\n", 
-                       free_chunk_size1, free_chunk_size2, fmc->free_size);
-                return -1; /* Do NOT mount f/s so that we can inspect what happened.
-                              Mounting this  screwed up f/s will screw us up anyway.
-                            */
-        }       
-        return 0; /* as far as we are concerned, we are happy! */
-} /* jffs_scan_flash()  */
-/* Insert any kind of node into the file system.  Take care of data
-   insertions and deletions.  Also remove redundant information. The
-   memory allocated for the `name' is regarded as "given away" in the
-   caller's perspective.  */
-int
-jffs_insert_node(struct jffs_control *c, struct jffs_file *f,
-                 const struct jffs_raw_inode *raw_inode,
-                 const char *name, struct jffs_node *node)
-{
-        int update_name = 0;
-        int insert_into_tree = 0;
-        D2(printk("jffs_insert_node(): ino = %u, version = %u, "
-                  "name = \"%s\", deleted = %d\n",
-                  raw_inode->ino, raw_inode->version,
-                  ((name && *name) ? name : ""), raw_inode->deleted));
-        /* If there doesn't exist an associated jffs_file, then
-           create, initialize and insert one into the file system.  */
-        if (!f && !(f = jffs_find_file(c, raw_inode->ino))) {
-                if (!(f = jffs_create_file(c, raw_inode))) {
-                        return -ENOMEM;
-                }
-                jffs_insert_file_into_hash(f);
-                insert_into_tree = 1;
-        }
-        node->ino = raw_inode->ino;
-        node->version = raw_inode->version;
-        node->data_size = raw_inode->dsize;
-        node->fm_offset = sizeof(struct jffs_raw_inode) + raw_inode->nsize
-                          + JFFS_GET_PAD_BYTES(raw_inode->nsize);
-        node->name_size = raw_inode->nsize;
-        /* Now insert the node at the correct position into the file's
-           version list.  */
-        if (!f->version_head) {
-                /* This is the first node.  */
-                f->version_head = node;
-                f->version_tail = node;
-                node->version_prev = NULL;
-                node->version_next = NULL;
-                f->highest_version = node->version;
-                update_name = 1;
-                f->mode = raw_inode->mode;
-                f->uid = raw_inode->uid;
-                f->gid = raw_inode->gid;
-                f->atime = raw_inode->atime;
-                f->mtime = raw_inode->mtime;
-                f->ctime = raw_inode->ctime;
-        }
-        else if ((f->highest_version < node->version)
-                 || (node->version == 0)) {
-                /* Insert at the end of the list.  I.e. this node is the
-                   newest one so far.  */
-                node->version_prev = f->version_tail;
-                node->version_next = NULL;
-                f->version_tail->version_next = node;
-                f->version_tail = node;
-                f->highest_version = node->version;
-                update_name = 1;
-                f->pino = raw_inode->pino;
-                f->mode = raw_inode->mode;
-                f->uid = raw_inode->uid;
-                f->gid = raw_inode->gid;
-                f->atime = raw_inode->atime;
-                f->mtime = raw_inode->mtime;
-                f->ctime = raw_inode->ctime;
-        }
-        else if (f->version_head->version > node->version) {
-                /* Insert at the bottom of the list.  */
-                node->version_prev = NULL;
-                node->version_next = f->version_head;
-                f->version_head->version_prev = node;
-                f->version_head = node;
-                if (!f->name) {
-                        update_name = 1;
-                }
-        }
-        else {
-                struct jffs_node *n;
-                int newer_name = 0;
-                /* Search for the insertion position starting from
-                   the tail (newest node).  */
-                for (n = f->version_tail; n; n = n->version_prev) {
-                        if (n->version < node->version) {
-                                node->version_prev = n;
-                                node->version_next = n->version_next;
-                                node->version_next->version_prev = node;
-                                n->version_next = node;
-                                if (!newer_name) {
-                                        update_name = 1;
-                                }
-                                break;
-                        }
-                        if (n->name_size) {
-                                newer_name = 1;
-                        }
-                }
-        }
-        /* Deletion is irreversible. If any 'deleted' node is ever
-           written, the file is deleted */
-        if (raw_inode->deleted)
-                f->deleted = raw_inode->deleted;
-        /* Perhaps update the name.  */
-        if (raw_inode->nsize && update_name && name && *name && (name != f->name)) {
-                if (f->name) {
-                        kfree(f->name);
-                        DJM(no_name--);
-                }
-                if (!(f->name = kmalloc(raw_inode->nsize + 1,
-                                                 GFP_KERNEL))) {
-                        return -ENOMEM;
-                }
-                DJM(no_name++);
-                memcpy(f->name, name, raw_inode->nsize);
-                f->name[raw_inode->nsize] = '\0';
-                f->nsize = raw_inode->nsize;
-                D3(printk("jffs_insert_node(): Updated the name of "
-                          "the file to \"%s\".\n", name));
-        }
-        if (!c->building_fs) {
-                D3(printk("jffs_insert_node(): ---------------------------"
-                          "------------------------------------------- 1\n"));
-                if (insert_into_tree) {
-                        jffs_insert_file_into_tree(f);
-                }
-                /* Once upon a time, we would call jffs_possibly_delete_file()
-                   here. That causes an oops if someone's still got the file
-                   open, so now we only do it in jffs_delete_inode()
-                   -- dwmw2
-                */
-                if (node->data_size || node->removed_size) {
-                        jffs_update_file(f, node);
-                }
-                jffs_remove_redundant_nodes(f);
-                jffs_garbage_collect_trigger(c);
-                D3(printk("jffs_insert_node(): ---------------------------"
-                          "------------------------------------------- 2\n"));
-        }
-        return 0;
-} /* jffs_insert_node()  */
-/* Unlink a jffs_node from the version list it is in.  */
-static inline void
-jffs_unlink_node_from_version_list(struct jffs_file *f,
-                                   struct jffs_node *node)
-{
-        if (node->version_prev) {
-                node->version_prev->version_next = node->version_next;
-        } else {
-                f->version_head = node->version_next;
-        }
-        if (node->version_next) {
-                node->version_next->version_prev = node->version_prev;
-        } else {
-                f->version_tail = node->version_prev;
-        }
-}
-/* Unlink a jffs_node from the range list it is in.  */
-static inline void
-jffs_unlink_node_from_range_list(struct jffs_file *f, struct jffs_node *node)
-{
-        if (node->range_prev) {
-                node->range_prev->range_next = node->range_next;
-        }
-        else {
-                f->range_head = node->range_next;
-        }
-        if (node->range_next) {
-                node->range_next->range_prev = node->range_prev;
-        }
-        else {
-                f->range_tail = node->range_prev;
-        }
-}
-/* Function used by jffs_remove_redundant_nodes() below.  This function
-   classifies what kind of information a node adds to a file.  */
-static inline __u8
-jffs_classify_node(struct jffs_node *node)
-{
-        __u8 mod_type = JFFS_MODIFY_INODE;
-        if (node->name_size) {
-                mod_type |= JFFS_MODIFY_NAME;
-        }
-        if (node->data_size || node->removed_size) {
-                mod_type |= JFFS_MODIFY_DATA;
-        }
-        return mod_type;
-}
-/* Remove redundant nodes from a file.  Mark the on-flash memory
-   as dirty.  */
-static int
-jffs_remove_redundant_nodes(struct jffs_file *f)
-{
-        struct jffs_node *newest_node;
-        struct jffs_node *cur;
-        struct jffs_node *prev;
-        __u8 newest_type;
-        __u8 mod_type;
-        __u8 node_with_name_later = 0;
-        if (!(newest_node = f->version_tail)) {
-                return 0;
-        }
-        /* What does the `newest_node' modify?  */
-        newest_type = jffs_classify_node(newest_node);
-        node_with_name_later = newest_type & JFFS_MODIFY_NAME;
-        D3(printk("jffs_remove_redundant_nodes(): ino: %u, name: \"%s\", "
-                  "newest_type: %u\n", f->ino, (f->name ? f->name : ""),
-                  newest_type));
-        /* Traverse the file's nodes and determine which of them that are
-           superfluous.  Yeah, this might look very complex at first
-           glance but it is actually very simple.  */
-        for (cur = newest_node->version_prev; cur; cur = prev) {
-                prev = cur->version_prev;
-                mod_type = jffs_classify_node(cur);
-                if ((mod_type <= JFFS_MODIFY_INODE)
-                    || ((newest_type & JFFS_MODIFY_NAME)
-                        && (mod_type
-                            <= (JFFS_MODIFY_INODE + JFFS_MODIFY_NAME)))
-                    || (cur->data_size == 0 && cur->removed_size
-                        && !cur->version_prev && node_with_name_later)) {
-                        /* Yes, this node is redundant. Remove it.  */
-                        D2(printk("jffs_remove_redundant_nodes(): "
-                                  "Removing node: ino: %u, version: %u, "
-                                  "mod_type: %u\n", cur->ino, cur->version,
-                                  mod_type));
-                        jffs_unlink_node_from_version_list(f, cur);
-                        jffs_fmfree(f->c->fmc, cur->fm, cur);
-                        jffs_free_node(cur);
-                        DJM(no_jffs_node--);
-                }
-                else {
-                        node_with_name_later |= (mod_type & JFFS_MODIFY_NAME);
-                }
-        }
-        return 0;
-}
-/* Insert a file into the hash table.  */
-static int
-jffs_insert_file_into_hash(struct jffs_file *f)
-{
-        int i = f->ino % f->c->hash_len;
-        D3(printk("jffs_insert_file_into_hash(): f->ino: %u\n", f->ino));
-        list_add(&f->hash, &f->c->hash[i]);
-        return 0;
-}
-/* Insert a file into the file system tree.  */
-int
-jffs_insert_file_into_tree(struct jffs_file *f)
-{
-        struct jffs_file *parent;
-        D3(printk("jffs_insert_file_into_tree(): name: \"%s\"\n",
-                  (f->name ? f->name : "")));
-        if (!(parent = jffs_find_file(f->c, f->pino))) {
-                if (f->pino == 0) {
-                        f->c->root = f;
-                        f->parent = NULL;
-                        f->sibling_prev = NULL;
-                        f->sibling_next = NULL;
-                        return 0;
-                }
-                else {
-                        D1(printk("jffs_insert_file_into_tree(): Found "
-                                  "inode with no parent and pino == %u\n",
-                                  f->pino));
-                        return -1;
-                }
-        }
-        f->parent = parent;
-        f->sibling_next = parent->children;
-        if (f->sibling_next) {
-                f->sibling_next->sibling_prev = f;
-        }
-        f->sibling_prev = NULL;
-        parent->children = f;
-        return 0;
-}
-/* Remove a file from the hash table.  */
-static int
-jffs_unlink_file_from_hash(struct jffs_file *f)
-{
-        D3(printk("jffs_unlink_file_from_hash(): f: 0x%p, "
-                  "ino %u\n", f, f->ino));
-        list_del(&f->hash);
-        return 0;
-}
-/* Just remove the file from the parent's children.  Don't free
-   any memory.  */
-int
-jffs_unlink_file_from_tree(struct jffs_file *f)
-{
-        D3(printk("jffs_unlink_file_from_tree(): ino: %d, pino: %d, name: "
-                  "\"%s\"\n", f->ino, f->pino, (f->name ? f->name : "")));
-        if (f->sibling_prev) {
-                f->sibling_prev->sibling_next = f->sibling_next;
-        }
-        else if (f->parent) {
-                D3(printk("f->parent=%p\n", f->parent));
-                f->parent->children = f->sibling_next;
-        }
-        if (f->sibling_next) {
-                f->sibling_next->sibling_prev = f->sibling_prev;
-        }
-        return 0;
-}
-/* Find a file with its inode number.  */
-struct jffs_file *
-jffs_find_file(struct jffs_control *c, __u32 ino)
-{
-        struct jffs_file *f;
-        int i = ino % c->hash_len;
-        D3(printk("jffs_find_file(): ino: %u\n", ino));
-        list_for_each_entry(f, &c->hash[i], hash) {
-                if (ino != f->ino)
-                        continue;
-                D3(printk("jffs_find_file(): Found file with ino "
-                               "%u. (name: \"%s\")\n",
-                               ino, (f->name ? f->name : ""));
-                );
-                return f;
-        }
-        D3(printk("jffs_find_file(): Didn't find file "
-                         "with ino %u.\n", ino);
-        );
-        return NULL;
-}
-/* Find a file in a directory.  We are comparing the names.  */
-struct jffs_file *
-jffs_find_child(struct jffs_file *dir, const char *name, int len)
-{
-        struct jffs_file *f;
-        D3(printk("jffs_find_child()\n"));
-        for (f = dir->children; f; f = f->sibling_next) {
-                if (!f->deleted && f->name
-                    && !strncmp(f->name, name, len)
-                    && f->name[len] == '\0') {
-                        break;
-                }
-        }
-        D3(if (f) {
-                printk("jffs_find_child(): Found \"%s\".\n", f->name);
-        }
-        else {
-                char *copy = kmalloc(len + 1, GFP_KERNEL);
-                if (copy) {
-                        memcpy(copy, name, len);
-                        copy[len] = '\0';
-                }
-                printk("jffs_find_child(): Didn't find the file \"%s\".\n",
-                       (copy ? copy : ""));
-                kfree(copy);
-        });
-        return f;
-}
-/* Write a raw inode that takes up a certain amount of space in the flash
-   memory.  At the end of the flash device, there is often space that is
-   impossible to use.  At these times we want to mark this space as not
-   used.  In the cases when the amount of space is greater or equal than
-   a struct jffs_raw_inode, we write a "dummy node" that takes up this
-   space.  The space after the raw inode, if it exists, is left as it is.
-   Since this space after the raw inode contains JFFS_EMPTY_BITMASK bytes,
-   we can compute the checksum of it; we don't have to manipulate it any
-   further.
-   If the space left on the device is less than the size of a struct
-   jffs_raw_inode, this space is filled with JFFS_DIRTY_BITMASK bytes.
-   No raw inode is written this time.  */
-static int
-jffs_write_dummy_node(struct jffs_control *c, struct jffs_fm *dirty_fm)
-{
-        struct jffs_fmcontrol *fmc = c->fmc;
-        int err;
-        D1(printk("jffs_write_dummy_node(): dirty_fm->offset = 0x%08x, "
-                  "dirty_fm->size = %u\n",
-                  dirty_fm->offset, dirty_fm->size));
-        if (dirty_fm->size >= sizeof(struct jffs_raw_inode)) {
-                struct jffs_raw_inode raw_inode;
-                memset(&raw_inode, 0, sizeof(struct jffs_raw_inode));
-                raw_inode.magic = JFFS_MAGIC_BITMASK;
-                raw_inode.dsize = dirty_fm->size
-                                  - sizeof(struct jffs_raw_inode);
-                raw_inode.dchksum = raw_inode.dsize * 0xff;
-                raw_inode.chksum
-                = jffs_checksum(&raw_inode, sizeof(struct jffs_raw_inode));
-                if ((err = flash_safe_write(fmc->mtd,
-                                            dirty_fm->offset,
-                                            (u_char *)&raw_inode,
-                                            sizeof(struct jffs_raw_inode)))
-                    < 0) {
-                        printk(KERN_ERR "JFFS: jffs_write_dummy_node: "
-                               "flash_safe_write failed!\n");
-                        return err;
-                }
-        }
-        else {
-                flash_safe_acquire(fmc->mtd);
-                flash_memset(fmc->mtd, dirty_fm->offset, 0, dirty_fm->size);
-                flash_safe_release(fmc->mtd);
-        }
-        D3(printk("jffs_write_dummy_node(): Leaving...\n"));
-        return 0;
-}
-/* Write a raw inode, possibly its name and possibly some data.  */
-int
-jffs_write_node(struct jffs_control *c, struct jffs_node *node,
-                struct jffs_raw_inode *raw_inode,
-                const char *name, const unsigned char *data,
-                int recoverable,
-                struct jffs_file *f)
-{
-        struct jffs_fmcontrol *fmc = c->fmc;
-        struct jffs_fm *fm;
-        struct kvec node_iovec[4];
-        unsigned long iovec_cnt;
-        __u32 pos;
-        int err;
-        __u32 slack = 0;
-        __u32 total_name_size = raw_inode->nsize
-                                + JFFS_GET_PAD_BYTES(raw_inode->nsize);
-        __u32 total_data_size = raw_inode->dsize
-                                + JFFS_GET_PAD_BYTES(raw_inode->dsize);
-        __u32 total_size = sizeof(struct jffs_raw_inode)
-                           + total_name_size + total_data_size;
-        
-        /* If this node isn't something that will eventually let
-           GC free even more space, then don't allow it unless
-           there's at least max_chunk_size space still available
-        */
-        if (!recoverable)
-                slack = fmc->max_chunk_size;
-                
-        /* Fire the retrorockets and shoot the fruiton torpedoes, sir!  */
-        ASSERT(if (!node) {
-                printk("jffs_write_node(): node == NULL\n");
-                return -EINVAL;
-        });
-        ASSERT(if (raw_inode && raw_inode->nsize && !name) {
-                printk("*** jffs_write_node(): nsize = %u but name == NULL\n",
-                       raw_inode->nsize);
-                return -EINVAL;
-        });
-        D1(printk("jffs_write_node(): filename = \"%s\", ino = %u, "
-                  "total_size = %u\n",
-                  (name ? name : ""), raw_inode->ino,
-                  total_size));
-        jffs_fm_write_lock(fmc);
-retry:
-        fm = NULL;
-        err = 0;
-        while (!fm) {
-                /* Deadlocks suck. */
-                while(fmc->free_size < fmc->min_free_size + total_size + slack) {
-                        jffs_fm_write_unlock(fmc);
-                        if (!JFFS_ENOUGH_SPACE(c, total_size + slack))
-                                return -ENOSPC;
-                        jffs_fm_write_lock(fmc);
-                }
-                /* First try to allocate some flash memory.  */
-                err = jffs_fmalloc(fmc, total_size, node, &fm);
-                
-                if (err == -ENOSPC) {
-                        /* Just out of space. GC and try again */
-                        if (fmc->dirty_size < fmc->sector_size) {
-                                D(printk("jffs_write_node(): jffs_fmalloc(0x%p, %u) "
-                                         "failed, no dirty space to GC\n", fmc,
-                                         total_size));
-                                return err;
-                        }
-                        
-                        D1(printk(KERN_INFO "jffs_write_node(): Calling jffs_garbage_collect_now()\n"));
-                        jffs_fm_write_unlock(fmc);
-                        if ((err = jffs_garbage_collect_now(c))) {
-                                D(printk("jffs_write_node(): jffs_garbage_collect_now() failed\n"));
-                                return err;
-                        }
-                        jffs_fm_write_lock(fmc);
-                        continue;
-                } 
-                if (err < 0) {
-                        jffs_fm_write_unlock(fmc);
-                        D(printk("jffs_write_node(): jffs_fmalloc(0x%p, %u) "
-                                 "failed!\n", fmc, total_size));
-                        return err;
-                }
-                if (!fm->nodes) {
-                        /* The jffs_fm struct that we got is not good enough.
-                           Make that space dirty and try again  */
-                        if ((err = jffs_write_dummy_node(c, fm)) < 0) {
-                                kfree(fm);
-                                DJM(no_jffs_fm--);
-                                jffs_fm_write_unlock(fmc);
-                                D(printk("jffs_write_node(): "
-                                         "jffs_write_dummy_node(): Failed!\n"));
-                                return err;
-                        }
-                        fm = NULL;
-                }
-        } /* while(!fm) */
-        node->fm = fm;
-        ASSERT(if (fm->nodes == 0) {
-                printk(KERN_ERR "jffs_write_node(): fm->nodes == 0\n");
-        });
-        pos = node->fm->offset;
-        /* Increment the version number here. We can't let the caller
-           set it beforehand, because we might have had to do GC on a node
-           of this file - and we'd end up reusing version numbers.
-        */
-        if (f) {
-                raw_inode->version = f->highest_version + 1;
-                D1(printk (KERN_NOTICE "jffs_write_node(): setting version of %s to %d\n", f->name, raw_inode->version));
-                /* if the file was deleted, set the deleted bit in the raw inode */
-                if (f->deleted)
-                        raw_inode->deleted = 1;
-        }
-        /* Compute the checksum for the data and name chunks.  */
-        raw_inode->dchksum = jffs_checksum(data, raw_inode->dsize);
-        raw_inode->nchksum = jffs_checksum(name, raw_inode->nsize);
-        /* The checksum is calculated without the chksum and accurate
-           fields so set them to zero first.  */
-        raw_inode->accurate = 0;
-        raw_inode->chksum = 0;
-        raw_inode->chksum = jffs_checksum(raw_inode,
-                                          sizeof(struct jffs_raw_inode));
-        raw_inode->accurate = 0xff;
-        D3(printk("jffs_write_node(): About to write this raw inode to the "
-                  "flash at pos 0x%lx:\n", (long)pos));
-        D3(jffs_print_raw_inode(raw_inode));
-        /* The actual raw JFFS node */
-        node_iovec[0].iov_base = (void *) raw_inode;
-        node_iovec[0].iov_len = (size_t) sizeof(struct jffs_raw_inode);
-        iovec_cnt = 1;
-        /* Get name and size if there is one */
-        if (raw_inode->nsize) {
-                node_iovec[iovec_cnt].iov_base = (void *) name;
-                node_iovec[iovec_cnt].iov_len = (size_t) raw_inode->nsize;
-                iovec_cnt++;
-                if (JFFS_GET_PAD_BYTES(raw_inode->nsize)) {
-                        static unsigned char allff[3]={255,255,255};
-                        /* Add some extra padding if necessary */
-                        node_iovec[iovec_cnt].iov_base = allff;
-                        node_iovec[iovec_cnt].iov_len =
-                                JFFS_GET_PAD_BYTES(raw_inode->nsize);
-                        iovec_cnt++;
-                }
-        }
-        /* Get data and size if there is any */
-        if (raw_inode->dsize) {
-                node_iovec[iovec_cnt].iov_base = (void *) data;
-                node_iovec[iovec_cnt].iov_len = (size_t) raw_inode->dsize;
-                iovec_cnt++;
-                /* No need to pad this because we're not actually putting
-                   anything after it.
-                */
-        }
-        if ((err = flash_safe_writev(fmc->mtd, node_iovec, iovec_cnt,
-                                    pos)) < 0) {
-                jffs_fmfree_partly(fmc, fm, 0);
-                jffs_fm_write_unlock(fmc);
-                printk(KERN_ERR "JFFS: jffs_write_node: Failed to write, "
-                       "requested %i, wrote %i\n", total_size, err);
-                goto retry;
-        }
-        if (raw_inode->deleted)
-                f->deleted = 1;
-        jffs_fm_write_unlock(fmc);
-        D3(printk("jffs_write_node(): Leaving...\n"));
-        return raw_inode->dsize;
-} /* jffs_write_node()  */
-/* Read data from the node and write it to the buffer.  'node_offset'
-   is how much we have read from this particular node before and which
-   shouldn't be read again.  'max_size' is how much space there is in
-   the buffer.  */
-static int
-jffs_get_node_data(struct jffs_file *f, struct jffs_node *node, 
-                   unsigned char *buf,__u32 node_offset, __u32 max_size)
-{
-        struct jffs_fmcontrol *fmc = f->c->fmc;
-        __u32 pos = node->fm->offset + node->fm_offset + node_offset;
-        __u32 avail = node->data_size - node_offset;
-        __u32 r;
-        D2(printk("  jffs_get_node_data(): file: \"%s\", ino: %u, "
-                  "version: %u, node_offset: %u\n",
-                  f->name, node->ino, node->version, node_offset));
-        r = min(avail, max_size);
-        D3(printk(KERN_NOTICE "jffs_get_node_data\n"));
-        flash_safe_read(fmc->mtd, pos, buf, r);
-        D3(printk("  jffs_get_node_data(): Read %u byte%s.\n",
-                  r, (r == 1 ? "" : "s")));
-        return r;
-}
-/* Read data from the file's nodes.  Write the data to the buffer
-   'buf'.  'read_offset' tells how much data we should skip.  */
-int
-jffs_read_data(struct jffs_file *f, unsigned char *buf, __u32 read_offset,
-               __u32 size)
-{
-        struct jffs_node *node;
-        __u32 read_data = 0; /* Total amount of read data.  */
-        __u32 node_offset = 0;
-        __u32 pos = 0; /* Number of bytes traversed.  */
-        D2(printk("jffs_read_data(): file = \"%s\", read_offset = %d, "
-                  "size = %u\n",
-                  (f->name ? f->name : ""), read_offset, size));
-        if (read_offset >= f->size) {
-                D(printk("  f->size: %d\n", f->size));
-                return 0;
-        }
-        /* First find the node to read data from.  */
-        node = f->range_head;
-        while (pos <= read_offset) {
-                node_offset = read_offset - pos;
-                if (node_offset >= node->data_size) {
-                        pos += node->data_size;
-                        node = node->range_next;
-                }
-                else {
-                        break;
-                }
-        }
-        /* "Cats are living proof that not everything in nature
-           has to be useful."
-           - Garrison Keilor ('97)  */
-        /* Fill the buffer.  */
-        while (node && (read_data < size)) {
-                int r;
-                if (!node->fm) {
-                        /* This node does not refer to real data.  */
-                        r = min(size - read_data,
-                                     node->data_size - node_offset);
-                        memset(&buf[read_data], 0, r);
-                }
-                else if ((r = jffs_get_node_data(f, node, &buf[read_data],
-                                                 node_offset,
-                                                 size - read_data)) < 0) {
-                        return r;
-                }
-                read_data += r;
-                node_offset = 0;
-                node = node->range_next;
-        }
-        D3(printk("  jffs_read_data(): Read %u bytes.\n", read_data));
-        return read_data;
-}
-/* Used for traversing all nodes in the hash table.  */
-int
-jffs_foreach_file(struct jffs_control *c, int (*func)(struct jffs_file *))
-{
-        int pos;
-        int r;
-        int result = 0;
-        for (pos = 0; pos < c->hash_len; pos++) {
-                struct jffs_file *f, *next;
-                /* We must do _safe, because 'func' might remove the
-                   current file 'f' from the list.  */
-                list_for_each_entry_safe(f, next, &c->hash[pos], hash) {
-                        r = func(f);
-                        if (r < 0)
-                                return r;
-                        result += r;
-                }
-        }
-        return result;
-}
-/* Free all nodes associated with a file.  */
-static int
-jffs_free_node_list(struct jffs_file *f)
-{
-        struct jffs_node *node;
-        struct jffs_node *p;
-        D3(printk("jffs_free_node_list(): f #%u, \"%s\"\n",
-                  f->ino, (f->name ? f->name : "")));
-        node = f->version_head;
-        while (node) {
-                p = node;
-                node = node->version_next;
-                jffs_free_node(p);
-                DJM(no_jffs_node--);
-        }
-        return 0;
-}
-/* Free a file and its name.  */
-static int
-jffs_free_file(struct jffs_file *f)
-{
-        D3(printk("jffs_free_file: f #%u, \"%s\"\n",
-                  f->ino, (f->name ? f->name : "")));
-        if (f->name) {
-                kfree(f->name);
-                DJM(no_name--);
-        }
-        kfree(f);
-        no_jffs_file--;
-        return 0;
-}
-static long
-jffs_get_file_count(void)
-{
-        return no_jffs_file;
-}
-/* See if a file is deleted. If so, mark that file's nodes as obsolete.  */
-int
-jffs_possibly_delete_file(struct jffs_file *f)
-{
-        struct jffs_node *n;
-        D3(printk("jffs_possibly_delete_file(): ino: %u\n",
-                  f->ino));
-        ASSERT(if (!f) {
-                printk(KERN_ERR "jffs_possibly_delete_file(): f == NULL\n");
-                return -1;
-        });
-        if (f->deleted) {
-                /* First try to remove all older versions.  Commence with
-                   the oldest node.  */
-                for (n = f->version_head; n; n = n->version_next) {
-                        if (!n->fm) {
-                                continue;
-                        }
-                        if (jffs_fmfree(f->c->fmc, n->fm, n) < 0) {
-                                break;
-                        }
-                }
-                /* Unlink the file from the filesystem.  */
-                if (!f->c->building_fs) {
-                        jffs_unlink_file_from_tree(f);
-                }
-                jffs_unlink_file_from_hash(f);
-                jffs_free_node_list(f);
-                jffs_free_file(f);
-        }
-        return 0;
-}
-/* Used in conjunction with jffs_foreach_file() to count the number
-   of files in the file system.  */
-int
-jffs_file_count(struct jffs_file *f)
-{
-        return 1;
-}
-/* Build up a file's range list from scratch by going through the
-   version list.  */
-static int
-jffs_build_file(struct jffs_file *f)
-{
-        struct jffs_node *n;
-        D3(printk("jffs_build_file(): ino: %u, name: \"%s\"\n",
-                  f->ino, (f->name ? f->name : "")));
-        for (n = f->version_head; n; n = n->version_next) {
-                jffs_update_file(f, n);
-        }
-        return 0;
-}
-/* Remove an amount of data from a file. If this amount of data is
-   zero, that could mean that a node should be split in two parts.
-   We remove or change the appropriate nodes in the lists.
-   Starting offset of area to be removed is node->data_offset,
-   and the length of the area is in node->removed_size.   */
-static int
-jffs_delete_data(struct jffs_file *f, struct jffs_node *node)
-{
-        struct jffs_node *n;
-        __u32 offset = node->data_offset;
-        __u32 remove_size = node->removed_size;
-        D3(printk("jffs_delete_data(): offset = %u, remove_size = %u\n",
-                  offset, remove_size));
-        if (remove_size == 0
-            && f->range_tail
-            && f->range_tail->data_offset + f->range_tail->data_size
-               == offset) {
-                /* A simple append; nothing to remove or no node to split.  */
-                return 0;
-        }
-        /* Find the node where we should begin the removal.  */
-        for (n = f->range_head; n; n = n->range_next) {
-                if (n->data_offset + n->data_size > offset) {
-                        break;
-                }
-        }
-        if (!n) {
-                /* If there's no data in the file there's no data to
-                   remove either.  */
-                return 0;
-        }
-        if (n->data_offset > offset) {
-                /* XXX: Not implemented yet.  */
-                printk(KERN_WARNING "JFFS: An unexpected situation "
-                       "occurred in jffs_delete_data.\n");
-        }
-        else if (n->data_offset < offset) {
-                /* See if the node has to be split into two parts.  */
-                if (n->data_offset + n->data_size > offset + remove_size) {
-                        /* Do the split.  */
-                        struct jffs_node *new_node;
-                        D3(printk("jffs_delete_data(): Split node with "
-                                  "version number %u.\n", n->version));
-                        if (!(new_node = jffs_alloc_node())) {
-                                D(printk("jffs_delete_data(): -ENOMEM\n"));
-                                return -ENOMEM;
-                        }
-                        DJM(no_jffs_node++);
-                        new_node->ino = n->ino;
-                        new_node->version = n->version;
-                        new_node->data_offset = offset;
-                        new_node->data_size = n->data_size - (remove_size + (offset - n->data_offset));
-                        new_node->fm_offset = n->fm_offset + (remove_size + (offset - n->data_offset));
-                        new_node->name_size = n->name_size;
-                        new_node->fm = n->fm;
-                        new_node->version_prev = n;
-                        new_node->version_next = n->version_next;
-                        if (new_node->version_next) {
-                                new_node->version_next->version_prev
-                                = new_node;
-                        }
-                        else {
-                                f->version_tail = new_node;
-                        }
-                        n->version_next = new_node;
-                        new_node->range_prev = n;
-                        new_node->range_next = n->range_next;
-                        if (new_node->range_next) {
-                                new_node->range_next->range_prev = new_node;
-                        }
-                        else {
-                                f->range_tail = new_node;
-                        }
-                        /* A very interesting can of worms.  */
-                        n->range_next = new_node;
-                        n->data_size = offset - n->data_offset;
-                        if (new_node->fm)
-                                jffs_add_node(new_node);
-                        else {
-                                D1(printk(KERN_WARNING "jffs_delete_data(): Splitting an empty node (file hold).\n!"));
-                                D1(printk(KERN_WARNING "FIXME: Did dwmw2 do the right thing here?\n"));
-                        }
-                        n = new_node->range_next;
-                        remove_size = 0;
-                }
-                else {
-                        /* No.  No need to split the node.  Just remove
-                           the end of the node.  */
-                        int r = min(n->data_offset + n->data_size
-                                         - offset, remove_size);
-                        n->data_size -= r;
-                        remove_size -= r;
-                        n = n->range_next;
-                }
-        }
-        /* Remove as many nodes as necessary.  */
-        while (n && remove_size) {
-                if (n->data_size <= remove_size) {
-                        struct jffs_node *p = n;
-                        remove_size -= n->data_size;
-                        n = n->range_next;
-                        D3(printk("jffs_delete_data(): Removing node: "
-                                  "ino: %u, version: %u%s\n",
-                                  p->ino, p->version,
-                                  (p->fm ? "" : " (virtual)")));
-                        if (p->fm) {
-                                jffs_fmfree(f->c->fmc, p->fm, p);
-                        }
-                        jffs_unlink_node_from_range_list(f, p);
-                        jffs_unlink_node_from_version_list(f, p);
-                        jffs_free_node(p);
-                        DJM(no_jffs_node--);
-                }
-                else {
-                        n->data_size -= remove_size;
-                        n->fm_offset += remove_size;
-                        n->data_offset -= (node->removed_size - remove_size);
-                        n = n->range_next;
-                        break;
-                }
-        }
-        /* Adjust the following nodes' information about offsets etc.  */
-        while (n && node->removed_size) {
-                n->data_offset -= node->removed_size;
-                n = n->range_next;
-        }
-        if (node->removed_size > (f->size - node->data_offset)) {
-                /* It's possible that the removed_size is in fact
-                 * greater than the amount of data we actually thought
-                 * were present in the first place - some of the nodes 
-                 * which this node originally obsoleted may already have
-                 * been deleted from the flash by subsequent garbage 
-                 * collection.
-                 *
-                 * If this is the case, don't let f->size go negative.
-                 * Bad things would happen :)
-                 */
-                f->size = node->data_offset;
-        } else {
-                f->size -= node->removed_size;
-        }
-        D3(printk("jffs_delete_data(): f->size = %d\n", f->size));
-        return 0;
-} /* jffs_delete_data()  */
-/* Insert some data into a file.  Prior to the call to this function,
-   jffs_delete_data should be called.  */
-static int
-jffs_insert_data(struct jffs_file *f, struct jffs_node *node)
-{
-        D3(printk("jffs_insert_data(): node->data_offset = %u, "
-                  "node->data_size = %u, f->size = %u\n",
-                  node->data_offset, node->data_size, f->size));
-        /* Find the position where we should insert data.  */
-        retry:
-        if (node->data_offset == f->size) {
-                /* A simple append.  This is the most common operation.  */
-                node->range_next = NULL;
-                node->range_prev = f->range_tail;
-                if (node->range_prev) {
-                        node->range_prev->range_next = node;
-                }
-                f->range_tail = node;
-                f->size += node->data_size;
-                if (!f->range_head) {
-                        f->range_head = node;
-                }
-        }
-        else if (node->data_offset < f->size) {
-                /* Trying to insert data into the middle of the file.  This
-                   means no problem because jffs_delete_data() has already
-                   prepared the range list for us.  */
-                struct jffs_node *n;
-                /* Find the correct place for the insertion and then insert
-                   the node.  */
-                for (n = f->range_head; n; n = n->range_next) {
-                        D2(printk("Cool stuff's happening!\n"));
-                        if (n->data_offset == node->data_offset) {
-                                node->range_prev = n->range_prev;
-                                if (node->range_prev) {
-                                        node->range_prev->range_next = node;
-                                }
-                                else {
-                                        f->range_head = node;
-                                }
-                                node->range_next = n;
-                                n->range_prev = node;
-                                break;
-                        }
-                        ASSERT(else if (n->data_offset + n->data_size >
-                                        node->data_offset) {
-                                printk(KERN_ERR "jffs_insert_data(): "
-                                       "Couldn't find a place to insert "
-                                       "the data!\n");
-                                return -1;
-                        });
-                }
-                /* Adjust later nodes' offsets etc.  */
-                n = node->range_next;
-                while (n) {
-                        n->data_offset += node->data_size;
-                        n = n->range_next;
-                }
-                f->size += node->data_size;
-        }
-        else if (node->data_offset > f->size) {
-                /* Okay.  This is tricky.  This means that we want to insert
-                   data at a place that is beyond the limits of the file as
-                   it is constructed right now.  This is actually a common
-                   event that for instance could occur during the mounting
-                   of the file system if a large file have been truncated,
-                   rewritten and then only partially garbage collected.  */
-                struct jffs_node *n;
-                /* We need a place holder for the data that is missing in
-                   front of this insertion.  This "virtual node" will not
-                   be associated with any space on the flash device.  */
-                struct jffs_node *virtual_node;
-                if (!(virtual_node = jffs_alloc_node())) {
-                        return -ENOMEM;
-                }
-                D(printk("jffs_insert_data: Inserting a virtual node.\n"));
-                D(printk("  node->data_offset = %u\n", node->data_offset));
-                D(printk("  f->size = %u\n", f->size));
-                virtual_node->ino = node->ino;
-                virtual_node->version = node->version;
-                virtual_node->removed_size = 0;
-                virtual_node->fm_offset = 0;
-                virtual_node->name_size = 0;
-                virtual_node->fm = NULL; /* This is a virtual data holder.  */
-                virtual_node->version_prev = NULL;
-                virtual_node->version_next = NULL;
-                virtual_node->range_next = NULL;
-                /* Are there any data at all in the file yet?  */
-                if (f->range_head) {
-                        virtual_node->data_offset
-                        = f->range_tail->data_offset
-                          + f->range_tail->data_size;
-                        virtual_node->data_size
-                        = node->data_offset - virtual_node->data_offset;
-                        virtual_node->range_prev = f->range_tail;
-                        f->range_tail->range_next = virtual_node;
-                }
-                else {
-                        virtual_node->data_offset = 0;
-                        virtual_node->data_size = node->data_offset;
-                        virtual_node->range_prev = NULL;
-                        f->range_head = virtual_node;
-                }
-                f->range_tail = virtual_node;
-                f->size += virtual_node->data_size;
-                /* Insert this virtual node in the version list as well.  */
-                for (n = f->version_head; n ; n = n->version_next) {
-                        if (n->version == virtual_node->version) {
-                                virtual_node->version_prev = n->version_prev;
-                                n->version_prev = virtual_node;
-                                if (virtual_node->version_prev) {
-                                        virtual_node->version_prev
-                                        ->version_next = virtual_node;
-                                }
-                                else {
-                                        f->version_head = virtual_node;
-                                }
-                                virtual_node->version_next = n;
-                                break;
-                        }
-                }
-                D(jffs_print_node(virtual_node));
-                /* Make a new try to insert the node.  */
-                goto retry;
-        }
-        D3(printk("jffs_insert_data(): f->size = %d\n", f->size));
-        return 0;
-}
-/* A new node (with data) has been added to the file and now the range
-   list has to be modified.  */
-static int
-jffs_update_file(struct jffs_file *f, struct jffs_node *node)
-{
-        int err;
-        D3(printk("jffs_update_file(): ino: %u, version: %u\n",
-                  f->ino, node->version));
-        if (node->data_size == 0) {
-                if (node->removed_size == 0) {
-                        /* data_offset == X  */
-                        /* data_size == 0  */
-                        /* remove_size == 0  */
-                }
-                else {
-                        /* data_offset == X  */
-                        /* data_size == 0  */
-                        /* remove_size != 0  */
-                        if ((err = jffs_delete_data(f, node)) < 0) {
-                                return err;
-                        }
-                }
-        }
-        else {
-                /* data_offset == X  */
-                /* data_size != 0  */
-                /* remove_size == Y  */
-                if ((err = jffs_delete_data(f, node)) < 0) {
-                        return err;
-                }
-                if ((err = jffs_insert_data(f, node)) < 0) {
-                        return err;
-                }
-        }
-        return 0;
-}
-/* Print the contents of a file.  */
-#if 0
-int
-jffs_print_file(struct jffs_file *f)
-{
-        D(int i);
-        D(printk("jffs_file: 0x%p\n", f));
-        D(printk("{\n"));
-        D(printk("        0x%08x, /* ino  */\n", f->ino));
-        D(printk("        0x%08x, /* pino  */\n", f->pino));
-        D(printk("        0x%08x, /* mode  */\n", f->mode));
-        D(printk("        0x%04x,     /* uid  */\n", f->uid));
-        D(printk("        0x%04x,     /* gid  */\n", f->gid));
-        D(printk("        0x%08x, /* atime  */\n", f->atime));
-        D(printk("        0x%08x, /* mtime  */\n", f->mtime));
-        D(printk("        0x%08x, /* ctime  */\n", f->ctime));
-        D(printk("        0x%02x,       /* nsize  */\n", f->nsize));
-        D(printk("        0x%02x,       /* nlink  */\n", f->nlink));
-        D(printk("        0x%02x,       /* deleted  */\n", f->deleted));
-        D(printk("        \"%s\", ", (f->name ? f->name : "")));
-        D(for (i = strlen(f->name ? f->name : ""); i < 8; ++i) {
-                printk(" ");
-        });
-        D(printk("/* name  */\n"));
-        D(printk("        0x%08x, /* size  */\n", f->size));
-        D(printk("        0x%08x, /* highest_version  */\n",
-                 f->highest_version));
-        D(printk("        0x%p, /* c  */\n", f->c));
-        D(printk("        0x%p, /* parent  */\n", f->parent));
-        D(printk("        0x%p, /* children  */\n", f->children));
-        D(printk("        0x%p, /* sibling_prev  */\n", f->sibling_prev));
-        D(printk("        0x%p, /* sibling_next  */\n", f->sibling_next));
-        D(printk("        0x%p, /* hash_prev  */\n", f->hash.prev));
-        D(printk("        0x%p, /* hash_next  */\n", f->hash.next));
-        D(printk("        0x%p, /* range_head  */\n", f->range_head));
-        D(printk("        0x%p, /* range_tail  */\n", f->range_tail));
-        D(printk("        0x%p, /* version_head  */\n", f->version_head));
-        D(printk("        0x%p, /* version_tail  */\n", f->version_tail));
-        D(printk("}\n"));
-        return 0;
-}
-#endif  /*  0  */
-void
-jffs_print_hash_table(struct jffs_control *c)
-{
-        int i;
-        printk("JFFS: Dumping the file system's hash table...\n");
-        for (i = 0; i < c->hash_len; i++) {
-                struct jffs_file *f;
-                list_for_each_entry(f, &c->hash[i], hash) {
-                        printk("*** c->hash[%u]: \"%s\" "
-                               "(ino: %u, pino: %u)\n",
-                               i, (f->name ? f->name : ""),
-                               f->ino, f->pino);
-                }
-        }
-}
-void
-jffs_print_tree(struct jffs_file *first_file, int indent)
-{
-        struct jffs_file *f;
-        char *space;
-        int dir;
-        if (!first_file) {
-                return;
-        }
-        if (!(space = kmalloc(indent + 1, GFP_KERNEL))) {
-                printk("jffs_print_tree(): Out of memory!\n");
-                return;
-        }
-        memset(space, ' ', indent);
-        space[indent] = '\0';
-        for (f = first_file; f; f = f->sibling_next) {
-                dir = S_ISDIR(f->mode);
-                printk("%s%s%s (ino: %u, highest_version: %u, size: %u)\n",
-                       space, (f->name ? f->name : ""), (dir ? "/" : ""),
-                       f->ino, f->highest_version, f->size);
-                if (dir) {
-                        jffs_print_tree(f->children, indent + 2);
-                }
-        }
-        kfree(space);
-}
-#if defined(JFFS_MEMORY_DEBUG) && JFFS_MEMORY_DEBUG
-void
-jffs_print_memory_allocation_statistics(void)
-{
-        static long printout;
-        printk("________ Memory printout #%ld ________\n", ++printout);
-        printk("no_jffs_file = %ld\n", no_jffs_file);
-        printk("no_jffs_node = %ld\n", no_jffs_node);
-        printk("no_jffs_control = %ld\n", no_jffs_control);
-        printk("no_jffs_raw_inode = %ld\n", no_jffs_raw_inode);
-        printk("no_jffs_node_ref = %ld\n", no_jffs_node_ref);
-        printk("no_jffs_fm = %ld\n", no_jffs_fm);
-        printk("no_jffs_fmcontrol = %ld\n", no_jffs_fmcontrol);
-        printk("no_hash = %ld\n", no_hash);
-        printk("no_name = %ld\n", no_name);
-        printk("\n");
-}
-#endif
-/* Rewrite `size' bytes, and begin at `node'.  */
-static int
-jffs_rewrite_data(struct jffs_file *f, struct jffs_node *node, __u32 size)
-{
-        struct jffs_control *c = f->c;
-        struct jffs_fmcontrol *fmc = c->fmc;
-        struct jffs_raw_inode raw_inode;
-        struct jffs_node *new_node;
-        struct jffs_fm *fm;
-        __u32 pos;
-        __u32 pos_dchksum;
-        __u32 total_name_size;
-        __u32 total_data_size;
-        __u32 total_size;
-        int err;
-        D1(printk("***jffs_rewrite_data(): node: %u, name: \"%s\", size: %u\n",
-                  f->ino, (f->name ? f->name : "(null)"), size));
-        /* Create and initialize the new node.  */
-        if (!(new_node = jffs_alloc_node())) {
-                D(printk("jffs_rewrite_data(): "
-                         "Failed to allocate node.\n"));
-                return -ENOMEM;
-        }
-        DJM(no_jffs_node++);
-        new_node->data_offset = node->data_offset;
-        new_node->removed_size = size;
-        total_name_size = JFFS_PAD(f->nsize);
-        total_data_size = JFFS_PAD(size);
-        total_size = sizeof(struct jffs_raw_inode)
-                     + total_name_size + total_data_size;
-        new_node->fm_offset = sizeof(struct jffs_raw_inode)
-                              + total_name_size;
-retry:
-        jffs_fm_write_lock(fmc);
-        err = 0;
-        if ((err = jffs_fmalloc(fmc, total_size, new_node, &fm)) < 0) {
-                DJM(no_jffs_node--);
-                jffs_fm_write_unlock(fmc);
-                D(printk("jffs_rewrite_data(): Failed to allocate fm.\n"));
-                jffs_free_node(new_node);
-                return err;
-        }
-        else if (!fm->nodes) {
-                /* The jffs_fm struct that we got is not big enough.  */
-                /* This should never happen, because we deal with this case
-                   in jffs_garbage_collect_next().*/
-                printk(KERN_WARNING "jffs_rewrite_data(): Allocated node is too small (%d bytes of %d)\n", fm->size, total_size);
-                if ((err = jffs_write_dummy_node(c, fm)) < 0) {
-                        D(printk("jffs_rewrite_data(): "
-                                 "jffs_write_dummy_node() Failed!\n"));
-                } else {
-                        err = -ENOSPC;
-                }
-                DJM(no_jffs_fm--);
-                jffs_fm_write_unlock(fmc);
-                kfree(fm);
-                
-                return err;
-        }
-        new_node->fm = fm;
-        /* Initialize the raw inode.  */
-        raw_inode.magic = JFFS_MAGIC_BITMASK;
-        raw_inode.ino = f->ino;
-        raw_inode.pino = f->pino;
-        raw_inode.version = f->highest_version + 1;
-        raw_inode.mode = f->mode;
-        raw_inode.uid = f->uid;
-        raw_inode.gid = f->gid;
-        raw_inode.atime = f->atime;
-        raw_inode.mtime = f->mtime;
-        raw_inode.ctime = f->ctime;
-        raw_inode.offset = node->data_offset;
-        raw_inode.dsize = size;
-        raw_inode.rsize = size;
-        raw_inode.nsize = f->nsize;
-        raw_inode.nlink = f->nlink;
-        raw_inode.spare = 0;
-        raw_inode.rename = 0;
-        raw_inode.deleted = f->deleted;
-        raw_inode.accurate = 0xff;
-        raw_inode.dchksum = 0;
-        raw_inode.nchksum = 0;
-        pos = new_node->fm->offset;
-        pos_dchksum = pos +JFFS_RAW_INODE_DCHKSUM_OFFSET;
-        D3(printk("jffs_rewrite_data(): Writing this raw inode "
-                  "to pos 0x%ul.\n", pos));
-        D3(jffs_print_raw_inode(&raw_inode));
-        if ((err = flash_safe_write(fmc->mtd, pos,
-                                    (u_char *) &raw_inode,
-                                    sizeof(struct jffs_raw_inode)
-                                    - sizeof(__u32)
-                                    - sizeof(__u16) - sizeof(__u16))) < 0) {
-                jffs_fmfree_partly(fmc, fm,
-                                   total_name_size + total_data_size);
-                jffs_fm_write_unlock(fmc);
-                printk(KERN_ERR "JFFS: jffs_rewrite_data: Write error during "
-                        "rewrite. (raw inode)\n");
-                printk(KERN_ERR "JFFS: jffs_rewrite_data: Now retrying "
-                        "rewrite. (raw inode)\n");
-                goto retry;
-        }
-        pos += sizeof(struct jffs_raw_inode);
-        /* Write the name to the flash memory.  */
-        if (f->nsize) {
-                D3(printk("jffs_rewrite_data(): Writing name \"%s\" to "
-                          "pos 0x%ul.\n", f->name, (unsigned int) pos));
-                if ((err = flash_safe_write(fmc->mtd, pos,
-                                            (u_char *)f->name,
-                                            f->nsize)) < 0) {
-                        jffs_fmfree_partly(fmc, fm, total_data_size);
-                        jffs_fm_write_unlock(fmc);
-                        printk(KERN_ERR "JFFS: jffs_rewrite_data: Write "
-                                "error during rewrite. (name)\n");
-                        printk(KERN_ERR "JFFS: jffs_rewrite_data: Now retrying "
-                                "rewrite. (name)\n");
-                        goto retry;
-                }
-                pos += total_name_size;
-                raw_inode.nchksum = jffs_checksum(f->name, f->nsize);
-        }
-        /* Write the data.  */
-        if (size) {
-                int r;
-                unsigned char *page;
-                __u32 offset = node->data_offset;
-                if (!(page = (unsigned char *)__get_free_page(GFP_KERNEL))) {
-                        jffs_fmfree_partly(fmc, fm, 0);
-                        return -1;
-                }
-                while (size) {
-                        __u32 s = min(size, (__u32)PAGE_SIZE);
-                        if ((r = jffs_read_data(f, (char *)page,
-                                                offset, s)) < s) {
-                                free_page((unsigned long)page);
-                                jffs_fmfree_partly(fmc, fm, 0);
-                                jffs_fm_write_unlock(fmc);
-                                printk(KERN_ERR "JFFS: jffs_rewrite_data: "
-                                         "jffs_read_data() "
-                                         "failed! (r = %d)\n", r);
-                                return -1;
-                        }
-                        if ((err = flash_safe_write(fmc->mtd,
-                                                    pos, page, r)) < 0) {
-                                free_page((unsigned long)page);
-                                jffs_fmfree_partly(fmc, fm, 0);
-                                jffs_fm_write_unlock(fmc);
-                                printk(KERN_ERR "JFFS: jffs_rewrite_data: "
-                                       "Write error during rewrite. "
-                                       "(data)\n");
-                                goto retry;
-                        }
-                        pos += r;
-                        size -= r;
-                        offset += r;
-                        raw_inode.dchksum += jffs_checksum(page, r);
-                }
-                free_page((unsigned long)page);
-        }
-        raw_inode.accurate = 0;
-        raw_inode.chksum = jffs_checksum(&raw_inode,
-                                         sizeof(struct jffs_raw_inode)
-                                         - sizeof(__u16));
-        /* Add the checksum.  */
-        if ((err
-             = flash_safe_write(fmc->mtd, pos_dchksum,
-                                &((u_char *)
-                                &raw_inode)[JFFS_RAW_INODE_DCHKSUM_OFFSET],
-                                sizeof(__u32) + sizeof(__u16)
-                                + sizeof(__u16))) < 0) {
-                jffs_fmfree_partly(fmc, fm, 0);
-                jffs_fm_write_unlock(fmc);
-                printk(KERN_ERR "JFFS: jffs_rewrite_data: Write error during "
-                       "rewrite. (checksum)\n");
-                goto retry;
-        }
-        /* Now make the file system aware of the newly written node.  */
-        jffs_insert_node(c, f, &raw_inode, f->name, new_node);
-        jffs_fm_write_unlock(fmc);
-        D3(printk("jffs_rewrite_data(): Leaving...\n"));
-        return 0;
-} /* jffs_rewrite_data()  */
-/* jffs_garbage_collect_next implements one step in the garbage collect
-   process and is often called multiple times at each occasion of a
-   garbage collect.  */
-static int
-jffs_garbage_collect_next(struct jffs_control *c)
-{
-        struct jffs_fmcontrol *fmc = c->fmc;
-        struct jffs_node *node;
-        struct jffs_file *f;
-        int err = 0;
-        __u32 size;
-        __u32 data_size;
-        __u32 total_name_size;
-        __u32 extra_available;
-        __u32 space_needed;
-        __u32 free_chunk_size1 = jffs_free_size1(fmc);
-        D2(__u32 free_chunk_size2 = jffs_free_size2(fmc));
-        /* Get the oldest node in the flash.  */
-        node = jffs_get_oldest_node(fmc);
-        ASSERT(if (!node) {
-                printk(KERN_ERR "JFFS: jffs_garbage_collect_next: "
-                       "No oldest node found!\n");
-                err = -1;
-                goto jffs_garbage_collect_next_end;
-                
-        });
-        /* Find its corresponding file too.  */
-        f = jffs_find_file(c, node->ino);
-        if (!f) {
-          printk (KERN_ERR "JFFS: jffs_garbage_collect_next: "
-                  "No file to garbage collect! "
-                  "(ino = 0x%08x)\n", node->ino);
-          /* FIXME: Free the offending node and recover. */
-          err = -1;
-          goto jffs_garbage_collect_next_end;
-        }
-        /* We always write out the name. Theoretically, we don't need
-           to, but for now it's easier - because otherwise we'd have
-           to keep track of how many times the current name exists on
-           the flash and make sure it never reaches zero.
-           The current approach means that would be possible to cause
-           the GC to end up eating its tail by writing lots of nodes
-           with no name for it to garbage-collect. Hence the change in
-           inode.c to write names with _every_ node.
-           It sucks, but it _should_ work.
-        */
-        total_name_size = JFFS_PAD(f->nsize);
-        D1(printk("jffs_garbage_collect_next(): \"%s\", "
-                  "ino: %u, version: %u, location 0x%x, dsize %u\n",
-                  (f->name ? f->name : ""), node->ino, node->version, 
-                  node->fm->offset, node->data_size));
-        /* Compute how many data it's possible to rewrite at the moment.  */
-        data_size = f->size - node->data_offset;
-        /* And from that, the total size of the chunk we want to write */
-        size = sizeof(struct jffs_raw_inode) + total_name_size
-               + data_size + JFFS_GET_PAD_BYTES(data_size);
-        /* If that's more than max_chunk_size, reduce it accordingly */
-        if (size > fmc->max_chunk_size) {
-                size = fmc->max_chunk_size;
-                data_size = size - sizeof(struct jffs_raw_inode)
-                            - total_name_size;
-        }
-        /* If we're asking to take up more space than free_chunk_size1
-           but we _could_ fit in it, shrink accordingly.
-        */
-        if (size > free_chunk_size1) {
-                if (free_chunk_size1 <
-                    (sizeof(struct jffs_raw_inode) + total_name_size + BLOCK_SIZE)){
-                        /* The space left is too small to be of any
-                           use really.  */
-                        struct jffs_fm *dirty_fm
-                        = jffs_fmalloced(fmc,
-                                         fmc->tail->offset + fmc->tail->size,
-                                         free_chunk_size1, NULL);
-                        if (!dirty_fm) {
-                                printk(KERN_ERR "JFFS: "
-                                       "jffs_garbage_collect_next: "
-                                       "Failed to allocate `dirty' "
-                                       "flash memory!\n");
-                                err = -1;
-                                goto jffs_garbage_collect_next_end;
-                        }
-                        D1(printk("Dirtying end of flash - too small\n"));
-                        jffs_write_dummy_node(c, dirty_fm);
-                        err = 0;
-                        goto jffs_garbage_collect_next_end;
-                }
-                D1(printk("Reducing size of new node from %d to %d to avoid "
-                          " exceeding free_chunk_size1\n",
-                          size, free_chunk_size1));
-                size = free_chunk_size1;
-                data_size = size - sizeof(struct jffs_raw_inode)
-                            - total_name_size;
-        }
-        /* Calculate the amount of space needed to hold the nodes
-           which are remaining in the tail */
-        space_needed = fmc->min_free_size - (node->fm->offset % fmc->sector_size);
-        /* From that, calculate how much 'extra' space we can use to
-           increase the size of the node we're writing from the size
-           of the node we're obsoleting
-        */
-        if (space_needed > fmc->free_size) {
-                /* If we've gone below min_free_size for some reason,
-                   don't fuck up. This is why we have 
-                   min_free_size > sector_size. Whinge about it though,
-                   just so I can convince myself my maths is right.
-                */
-                D1(printk(KERN_WARNING "jffs_garbage_collect_next(): "
-                          "space_needed %d exceeded free_size %d\n",
-                          space_needed, fmc->free_size));
-                extra_available = 0;
-        } else {
-                extra_available = fmc->free_size - space_needed;
-        }
-        /* Check that we don't use up any more 'extra' space than
-           what's available */
-        if (size > JFFS_PAD(node->data_size) + total_name_size + 
-            sizeof(struct jffs_raw_inode) + extra_available) {
-                D1(printk("Reducing size of new node from %d to %ld to avoid "
-                       "catching our tail\n", size, 
-                          (long) (JFFS_PAD(node->data_size) + JFFS_PAD(node->name_size) + 
-                          sizeof(struct jffs_raw_inode) + extra_available)));
-                D1(printk("space_needed = %d, extra_available = %d\n", 
-                          space_needed, extra_available));
-                size = JFFS_PAD(node->data_size) + total_name_size + 
-                  sizeof(struct jffs_raw_inode) + extra_available;
-                data_size = size - sizeof(struct jffs_raw_inode)
-                        - total_name_size;
-        };
-        D2(printk("  total_name_size: %u\n", total_name_size));
-        D2(printk("  data_size: %u\n", data_size));
-        D2(printk("  size: %u\n", size));
-        D2(printk("  f->nsize: %u\n", f->nsize));
-        D2(printk("  f->size: %u\n", f->size));
-        D2(printk("  node->data_offset: %u\n", node->data_offset));
-        D2(printk("  free_chunk_size1: %u\n", free_chunk_size1));
-        D2(printk("  free_chunk_size2: %u\n", free_chunk_size2));
-        D2(printk("  node->fm->offset: 0x%08x\n", node->fm->offset));
-        if ((err = jffs_rewrite_data(f, node, data_size))) {
-                printk(KERN_WARNING "jffs_rewrite_data() failed: %d\n", err);
-                return err;
-        }
-          
-jffs_garbage_collect_next_end:
-        D3(printk("jffs_garbage_collect_next: Leaving...\n"));
-        return err;
-} /* jffs_garbage_collect_next */
-/* If an obsolete node is partly going to be erased due to garbage
-   collection, the part that isn't going to be erased must be filled
-   with zeroes so that the scan of the flash will work smoothly next
-   time.  (The data in the file could for instance be a JFFS image
-   which could cause enormous confusion during a scan of the flash
-   device if we didn't do this.)
-     There are two phases in this procedure: First, the clearing of
-   the name and data parts of the node. Second, possibly also clearing
-   a part of the raw inode as well.  If the box is power cycled during
-   the first phase, only the checksum of this node-to-be-cleared-at-
-   the-end will be wrong.  If the box is power cycled during, or after,
-   the clearing of the raw inode, the information like the length of
-   the name and data parts are zeroed.  The next time the box is
-   powered up, the scanning algorithm manages this faulty data too
-   because:
-   - The checksum is invalid and thus the raw inode must be discarded
-     in any case.
-   - If the lengths of the data part or the name part are zeroed, the
-     scanning just continues after the raw inode.  But after the inode
-     the scanning procedure just finds zeroes which is the same as
-     dirt.
-   So, in the end, this could never fail. :-)  Even if it does fail,
-   the scanning algorithm should manage that too.  */
-static int
-jffs_clear_end_of_node(struct jffs_control *c, __u32 erase_size)
-{
-        struct jffs_fm *fm;
-        struct jffs_fmcontrol *fmc = c->fmc;
-        __u32 zero_offset;
-        __u32 zero_size;
-        __u32 zero_offset_data;
-        __u32 zero_size_data;
-        __u32 cutting_raw_inode = 0;
-        if (!(fm = jffs_cut_node(fmc, erase_size))) {
-                D3(printk("jffs_clear_end_of_node(): fm == NULL\n"));
-                return 0;
-        }
-        /* Where and how much shall we clear?  */
-        zero_offset = fmc->head->offset + erase_size;
-        zero_size = fm->offset + fm->size - zero_offset;
-        /* Do we have to clear the raw_inode explicitly?  */
-        if (fm->size - zero_size < sizeof(struct jffs_raw_inode)) {
-                cutting_raw_inode = sizeof(struct jffs_raw_inode)
-                                    - (fm->size - zero_size);
-        }
-        /* First, clear the name and data fields.  */
-        zero_offset_data = zero_offset + cutting_raw_inode;
-        zero_size_data = zero_size - cutting_raw_inode;
-        flash_safe_acquire(fmc->mtd);
-        flash_memset(fmc->mtd, zero_offset_data, 0, zero_size_data);
-        flash_safe_release(fmc->mtd);
-        /* Should we clear a part of the raw inode?  */
-        if (cutting_raw_inode) {
-                /* I guess it is ok to clear the raw inode in this order.  */
-                flash_safe_acquire(fmc->mtd);
-                flash_memset(fmc->mtd, zero_offset, 0,
-                             cutting_raw_inode);
-                flash_safe_release(fmc->mtd);
-        }
-        return 0;
-} /* jffs_clear_end_of_node()  */
-/* Try to erase as much as possible of the dirt in the flash memory.  */
-static long
-jffs_try_to_erase(struct jffs_control *c)
-{
-        struct jffs_fmcontrol *fmc = c->fmc;
-        long erase_size;
-        int err;
-        __u32 offset;
-        D3(printk("jffs_try_to_erase()\n"));
-        erase_size = jffs_erasable_size(fmc);
-        D2(printk("jffs_try_to_erase(): erase_size = %ld\n", erase_size));
-        if (erase_size == 0) {
-                return 0;
-        }
-        else if (erase_size < 0) {
-                printk(KERN_ERR "JFFS: jffs_try_to_erase: "
-                       "jffs_erasable_size returned %ld.\n", erase_size);
-                return erase_size;
-        }
-        if ((err = jffs_clear_end_of_node(c, erase_size)) < 0) {
-                printk(KERN_ERR "JFFS: jffs_try_to_erase: "
-                       "Clearing of node failed.\n");
-                return err;
-        }
-        offset = fmc->head->offset;
-        /* Now, let's try to do the erase.  */
-        if ((err = flash_erase_region(fmc->mtd,
-                                      offset, erase_size)) < 0) {
-                printk(KERN_ERR "JFFS: Erase of flash failed. "
-                       "offset = %u, erase_size = %ld\n",
-                       offset, erase_size);
-                /* XXX: Here we should allocate this area as dirty
-                   with jffs_fmalloced or something similar.  Now
-                   we just report the error.  */
-                return err;
-        }
-#if 0
-        /* Check if the erased sectors really got erased.  */
-        {
-                __u32 pos;
-                __u32 end;
-                pos = (__u32)flash_get_direct_pointer(to_kdev_t(c->sb->s_dev), offset);
-                end = pos + erase_size;
-                D2(printk("JFFS: Checking erased sector(s)...\n"));
-                flash_safe_acquire(fmc->mtd);
-                for (; pos < end; pos += 4) {
-                        if (*(__u32 *)pos != JFFS_EMPTY_BITMASK) {
-                                printk("JFFS: Erase failed! pos = 0x%lx\n",
-                                       (long)pos);
-                                jffs_hexdump(fmc->mtd, pos,
-                                             jffs_min(256, end - pos));
-                                err = -1;
-                                break;
-                        }
-                }
-                flash_safe_release(fmc->mtd);
-                if (!err) {
-                        D2(printk("JFFS: Erase succeeded.\n"));
-                }
-                else {
-                        /* XXX: Here we should allocate the memory
-                           with jffs_fmalloced() in order to prevent
-                           JFFS from using this area accidentally.  */
-                        return err;
-                }
-        }
-#endif
-        /* Update the flash memory data structures.  */
-        jffs_sync_erase(fmc, erase_size);
-        return erase_size;
-}
-/* There are different criteria that should trigger a garbage collect:
-   1. There is too much dirt in the memory.
-   2. The free space is becoming small.
-   3. There are many versions of a node.
-   The garbage collect should always be done in a manner that guarantees
-   that future garbage collects cannot be locked.  E.g. Rewritten chunks
-   should not be too large (span more than one sector in the flash memory
-   for exemple).  Of course there is a limit on how intelligent this garbage
-   collection can be.  */
-static int
-jffs_garbage_collect_now(struct jffs_control *c)
-{
-        struct jffs_fmcontrol *fmc = c->fmc;
-        long erased = 0;
-        int result = 0;
-        D1(int i = 1);
-        D2(printk("***jffs_garbage_collect_now(): fmc->dirty_size = %u, fmc->free_size = 0x%x\n, fcs1=0x%x, fcs2=0x%x",
-                  fmc->dirty_size, fmc->free_size, jffs_free_size1(fmc), jffs_free_size2(fmc)));
-        D2(jffs_print_fmcontrol(fmc));
-        //      down(&fmc->gclock);
-        /* If it is possible to garbage collect, do so.  */
-        
-        while (erased == 0) {
-                D1(printk("***jffs_garbage_collect_now(): round #%u, "
-                          "fmc->dirty_size = %u\n", i++, fmc->dirty_size));
-                D2(jffs_print_fmcontrol(fmc));
-                if ((erased = jffs_try_to_erase(c)) < 0) {
-                        printk(KERN_WARNING "JFFS: Error in "
-                               "garbage collector.\n");
-                        result = erased;
-                        goto gc_end;
-                }
-                if (erased)
-                        break;
-                
-                if (fmc->free_size == 0) {
-                        /* Argh */
-                        printk(KERN_ERR "jffs_garbage_collect_now(): free_size == 0. This is BAD.\n");
-                        result = -ENOSPC;
-                        break;
-                }
-                if (fmc->dirty_size < fmc->sector_size) {
-                        /* Actually, we _may_ have been able to free some, 
-                         * if there are many overlapping nodes which aren't
-                         * actually marked dirty because they still have
-                         * some valid data in each.
-                         */
-                        result = -ENOSPC;
-                        break;
-                }
-                /* Let's dare to make a garbage collect.  */
-                if ((result = jffs_garbage_collect_next(c)) < 0) {
-                        printk(KERN_ERR "JFFS: Something "
-                               "has gone seriously wrong "
-                               "with a garbage collect.\n");
-                        goto gc_end;
-                }
-                D1(printk("   jffs_garbage_collect_now(): erased: %ld\n", erased));
-                DJM(jffs_print_memory_allocation_statistics());
-        }
-        
-gc_end:
-        //      up(&fmc->gclock);
-        D3(printk("   jffs_garbage_collect_now(): Leaving...\n"));
-        D1(if (erased) {
-                printk("jffs_g_c_now(): erased = %ld\n", erased);
-                jffs_print_fmcontrol(fmc);
-        });
-        if (!erased && !result)
-                return -ENOSPC;
-        return result;
-} /* jffs_garbage_collect_now() */
-/* Determine if it is reasonable to start garbage collection.
-   We start a gc pass if either:
-   - The number of free bytes < MIN_FREE_BYTES && at least one
-     block is dirty, OR
-   - The number of dirty bytes > MAX_DIRTY_BYTES
-*/
-static inline int thread_should_wake (struct jffs_control *c)
-{
-        D1(printk (KERN_NOTICE "thread_should_wake(): free=%d, dirty=%d, blocksize=%d.\n",
-                   c->fmc->free_size, c->fmc->dirty_size, c->fmc->sector_size));
-        /* If there's not enough dirty space to free a block, there's no point. */
-        if (c->fmc->dirty_size < c->fmc->sector_size) {
-                D2(printk(KERN_NOTICE "thread_should_wake(): Not waking. Insufficient dirty space\n"));
-                return 0;
-        }
-#if 1
-        /* If there is too much RAM used by the various structures, GC */
-        if (jffs_get_node_inuse() > (c->fmc->used_size/c->fmc->max_chunk_size * 5 + jffs_get_file_count() * 2 + 50)) {
-                /* FIXME: Provide proof that this test can be satisfied. We
-                   don't want a filesystem doing endless GC just because this
-                   condition cannot ever be false.
-                */
-                D2(printk(KERN_NOTICE "thread_should_wake(): Waking due to number of nodes\n"));
-                return 1;
-        }
-#endif
-        /* If there are fewer free bytes than the threshold, GC */
-        if (c->fmc->free_size < c->gc_minfree_threshold) {
-                D2(printk(KERN_NOTICE "thread_should_wake(): Waking due to insufficent free space\n"));
-                return 1;
-        }
-        /* If there are more dirty bytes than the threshold, GC */
-        if (c->fmc->dirty_size > c->gc_maxdirty_threshold) {
-                D2(printk(KERN_NOTICE "thread_should_wake(): Waking due to excessive dirty space\n"));
-                return 1;
-        }       
-        /* FIXME: What about the "There are many versions of a node" condition? */
-        return 0;
-}
-void jffs_garbage_collect_trigger(struct jffs_control *c)
-{
-        /* NOTE: We rely on the fact that we have the BKL here.
-         * Otherwise, the gc_task could go away between the check
-         * and the wake_up_process()
-         */
-        if (c->gc_task && thread_should_wake(c))
-                send_sig(SIGHUP, c->gc_task, 1);
-}
-  
-/* Kernel threads  take (void *) as arguments.   Thus we pass
-   the jffs_control data as a (void *) and then cast it. */
-int
-jffs_garbage_collect_thread(void *ptr)
-{
-        struct jffs_control *c = (struct jffs_control *) ptr;
-        struct jffs_fmcontrol *fmc = c->fmc;
-        long erased;
-        int result = 0;
-        D1(int i = 1);
-        daemonize("jffs_gcd");
-        c->gc_task = current;
-        lock_kernel();
-        init_completion(&c->gc_thread_comp); /* barrier */ 
-        spin_lock_irq(&current->sighand->siglock);
-        siginitsetinv (&current->blocked, sigmask(SIGHUP) | sigmask(SIGKILL) | sigmask(SIGSTOP) | sigmask(SIGCONT));
-        recalc_sigpending();
-        spin_unlock_irq(&current->sighand->siglock);
-        D1(printk (KERN_NOTICE "jffs_garbage_collect_thread(): Starting infinite loop.\n"));
-        for (;;) {
-                /* See if we need to start gc.  If we don't, go to sleep.
-                   
-                   Current implementation is a BAD THING(tm).  If we try 
-                   to unmount the FS, the unmount operation will sleep waiting
-                   for this thread to exit.  We need to arrange to send it a
-                   sig before the umount process sleeps.
-                */
-                if (!thread_should_wake(c))
-                        set_current_state (TASK_INTERRUPTIBLE);
-                
-                schedule(); /* Yes, we do this even if we want to go
-                                       on immediately - we're a low priority 
-                                       background task. */
-                /* Put_super will send a SIGKILL and then wait on the sem. 
-                 */
-                while (signal_pending(current)) {
-                        siginfo_t info;
-                        unsigned long signr = 0;
-                        if (try_to_freeze())
-                                continue;
-                        spin_lock_irq(&current->sighand->siglock);
-                        signr = dequeue_signal(current, &current->blocked, &info);
-                        spin_unlock_irq(&current->sighand->siglock);
-                        switch(signr) {
-                        case SIGSTOP:
-                                D1(printk("jffs_garbage_collect_thread(): SIGSTOP received.\n"));
-                                set_current_state(TASK_STOPPED);
-                                schedule();
-                                break;
-                        case SIGKILL:
-                                D1(printk("jffs_garbage_collect_thread(): SIGKILL received.\n"));
-                                c->gc_task = NULL;
-                                complete_and_exit(&c->gc_thread_comp, 0);
-                        }
-                }
-                D1(printk (KERN_NOTICE "jffs_garbage_collect_thread(): collecting.\n"));
-                D3(printk (KERN_NOTICE "g_c_thread(): down biglock\n"));
-                mutex_lock(&fmc->biglock);
-                
-                D1(printk("***jffs_garbage_collect_thread(): round #%u, "
-                          "fmc->dirty_size = %u\n", i++, fmc->dirty_size));
-                D2(jffs_print_fmcontrol(fmc));
-                if ((erased = jffs_try_to_erase(c)) < 0) {
-                        printk(KERN_WARNING "JFFS: Error in "
-                               "garbage collector: %ld.\n", erased);
-                }
-                if (erased)
-                        goto gc_end;
-                if (fmc->free_size == 0) {
-                        /* Argh. Might as well commit suicide. */
-                        printk(KERN_ERR "jffs_garbage_collect_thread(): free_size == 0. This is BAD.\n");
-                        send_sig(SIGQUIT, c->gc_task, 1);
-                        // panic()
-                        goto gc_end;
-                }
-                
-                /* Let's dare to make a garbage collect.  */
-                if ((result = jffs_garbage_collect_next(c)) < 0) {
-                        printk(KERN_ERR "JFFS: Something "
-                               "has gone seriously wrong "
-                               "with a garbage collect: %d\n", result);
-                }
-                
-        gc_end:
-                D3(printk (KERN_NOTICE "g_c_thread(): up biglock\n"));
-                mutex_unlock(&fmc->biglock);
-        } /* for (;;) */
-} /* jffs_garbage_collect_thread() */
diff --git a/fs/jffs/intrep.h b/fs/jffs/intrep.h
deleted file mode 100644
index 5c7abe0e2695..000000000000
--- a/fs/jffs/intrep.h
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- * JFFS -- Journaling Flash File System, Linux implementation.
- *
- * Copyright (C) 1999, 2000  Axis Communications AB.
- *
- * Created by Finn Hakansson <finn@axis.com>.
- *
- * This is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * $Id: intrep.h,v 1.14 2001/09/23 23:28:37 dwmw2 Exp $
- *
- */
-#ifndef __LINUX_JFFS_INTREP_H__
-#define __LINUX_JFFS_INTREP_H__
-#include "jffs_fm.h"
-struct jffs_node *jffs_alloc_node(void);
-void jffs_free_node(struct jffs_node *n);
-int jffs_get_node_inuse(void);
-void jffs_cleanup_control(struct jffs_control *c);
-int jffs_build_fs(struct super_block *sb);
-int jffs_insert_node(struct jffs_control *c, struct jffs_file *f,
-                     const struct jffs_raw_inode *raw_inode,
-                     const char *name, struct jffs_node *node);
-struct jffs_file *jffs_find_file(struct jffs_control *c, __u32 ino);
-struct jffs_file *jffs_find_child(struct jffs_file *dir, const char *name, int len);
-void jffs_free_node(struct jffs_node *node);
-int jffs_foreach_file(struct jffs_control *c, int (*func)(struct jffs_file *));
-int jffs_possibly_delete_file(struct jffs_file *f);
-int jffs_insert_file_into_tree(struct jffs_file *f);
-int jffs_unlink_file_from_tree(struct jffs_file *f);
-int jffs_file_count(struct jffs_file *f);
-int jffs_write_node(struct jffs_control *c, struct jffs_node *node,
-                    struct jffs_raw_inode *raw_inode,
-                    const char *name, const unsigned char *buf,
-                    int recoverable, struct jffs_file *f);
-int jffs_read_data(struct jffs_file *f, unsigned char *buf, __u32 read_offset, __u32 size);
-/* Garbage collection stuff.  */
-int jffs_garbage_collect_thread(void *c);
-void jffs_garbage_collect_trigger(struct jffs_control *c);
-/* For debugging purposes.  */
-#if 0
-int jffs_print_file(struct jffs_file *f);
-#endif  /*  0  */
-void jffs_print_hash_table(struct jffs_control *c);
-void jffs_print_tree(struct jffs_file *first_file, int indent);
-#endif /* __LINUX_JFFS_INTREP_H__  */
diff --git a/fs/jffs/jffs_fm.c b/fs/jffs/jffs_fm.c
deleted file mode 100644
index 5a95fbdd6fdb..000000000000
--- a/fs/jffs/jffs_fm.c
+++ /dev/null
@@ -1,798 +0,0 @@
-/*
- * JFFS -- Journaling Flash File System, Linux implementation.
- *
- * Copyright (C) 1999, 2000  Axis Communications AB.
- *
- * Created by Finn Hakansson <finn@axis.com>.
- *
- * This is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * $Id: jffs_fm.c,v 1.27 2001/09/20 12:29:47 dwmw2 Exp $
- *
- * Ported to Linux 2.3.x and MTD:
- * Copyright (C) 2000  Alexander Larsson (alex@cendio.se), Cendio Systems AB
- *
- */
-#include <linux/slab.h>
-#include <linux/err.h>
-#include <linux/blkdev.h>
-#include <linux/jffs.h>
-#include "jffs_fm.h"
-#include "intrep.h"
-#if defined(JFFS_MARK_OBSOLETE) && JFFS_MARK_OBSOLETE
-static int jffs_mark_obsolete(struct jffs_fmcontrol *fmc, __u32 fm_offset);
-#endif
-static struct jffs_fm *jffs_alloc_fm(void);
-static void jffs_free_fm(struct jffs_fm *n);
-extern struct kmem_cache     *fm_cache;
-extern struct kmem_cache     *node_cache;
-#if CONFIG_JFFS_FS_VERBOSE > 0
-void
-jffs_print_fmcontrol(struct jffs_fmcontrol *fmc)
-{
-        D(printk("struct jffs_fmcontrol: 0x%p\n", fmc));
-        D(printk("{\n"));
-        D(printk("        %u, /* flash_size  */\n", fmc->flash_size));
-        D(printk("        %u, /* used_size  */\n", fmc->used_size));
-        D(printk("        %u, /* dirty_size  */\n", fmc->dirty_size));
-        D(printk("        %u, /* free_size  */\n", fmc->free_size));
-        D(printk("        %u, /* sector_size  */\n", fmc->sector_size));
-        D(printk("        %u, /* min_free_size  */\n", fmc->min_free_size));
-        D(printk("        %u, /* max_chunk_size  */\n", fmc->max_chunk_size));
-        D(printk("        0x%p, /* mtd  */\n", fmc->mtd));
-        D(printk("        0x%p, /* head  */    "
-                 "(head->offset = 0x%08x)\n",
-                 fmc->head, (fmc->head ? fmc->head->offset : 0)));
-        D(printk("        0x%p, /* tail  */    "
-                 "(tail->offset + tail->size = 0x%08x)\n",
-                 fmc->tail,
-                 (fmc->tail ? fmc->tail->offset + fmc->tail->size : 0)));
-        D(printk("        0x%p, /* head_extra  */\n", fmc->head_extra));
-        D(printk("        0x%p, /* tail_extra  */\n", fmc->tail_extra));
-        D(printk("}\n"));
-}
-#endif  /*  CONFIG_JFFS_FS_VERBOSE > 0  */
-#if CONFIG_JFFS_FS_VERBOSE > 2
-static void
-jffs_print_fm(struct jffs_fm *fm)
-{
-        D(printk("struct jffs_fm: 0x%p\n", fm));
-        D(printk("{\n"));
-        D(printk("       0x%08x, /* offset  */\n", fm->offset));
-        D(printk("       %u, /* size  */\n", fm->size));
-        D(printk("       0x%p, /* prev  */\n", fm->prev));
-        D(printk("       0x%p, /* next  */\n", fm->next));
-        D(printk("       0x%p, /* nodes  */\n", fm->nodes));
-        D(printk("}\n"));
-}
-#endif  /*  CONFIG_JFFS_FS_VERBOSE > 2  */
-#if 0
-void
-jffs_print_node_ref(struct jffs_node_ref *ref)
-{
-        D(printk("struct jffs_node_ref: 0x%p\n", ref));
-        D(printk("{\n"));
-        D(printk("       0x%p, /* node  */\n", ref->node));
-        D(printk("       0x%p, /* next  */\n", ref->next));
-        D(printk("}\n"));
-}
-#endif  /*  0  */
-/* This function creates a new shiny flash memory control structure.  */
-struct jffs_fmcontrol *
-jffs_build_begin(struct jffs_control *c, int unit)
-{
-        struct jffs_fmcontrol *fmc;
-        struct mtd_info *mtd;
-        
-        D3(printk("jffs_build_begin()\n"));
-        fmc = kmalloc(sizeof(*fmc), GFP_KERNEL);
-        if (!fmc) {
-                D(printk("jffs_build_begin(): Allocation of "
-                         "struct jffs_fmcontrol failed!\n"));
-                return (struct jffs_fmcontrol *)0;
-        }
-        DJM(no_jffs_fmcontrol++);
-        mtd = get_mtd_device(NULL, unit);
-        if (IS_ERR(mtd)) {
-                kfree(fmc);
-                DJM(no_jffs_fmcontrol--);
-                return NULL;
-        }
-        
-        /* Retrieve the size of the flash memory.  */
-        fmc->flash_size = mtd->size;
-        D3(printk("  fmc->flash_size = %d bytes\n", fmc->flash_size));
-        fmc->used_size = 0;
-        fmc->dirty_size = 0;
-        fmc->free_size = mtd->size;
-        fmc->sector_size = mtd->erasesize;
-        fmc->max_chunk_size = fmc->sector_size >> 1;
-        /* min_free_size:
-           1 sector, obviously.
-           + 1 x max_chunk_size, for when a nodes overlaps the end of a sector
-           + 1 x max_chunk_size again, which ought to be enough to handle 
-                   the case where a rename causes a name to grow, and GC has
-                   to write out larger nodes than the ones it's obsoleting.
-                   We should fix it so it doesn't have to write the name
-                   _every_ time. Later.
-           + another 2 sectors because people keep getting GC stuck and
-                   we don't know why. This scares me - I want formal proof
-                   of correctness of whatever number we put here. dwmw2.
-        */
-        fmc->min_free_size = fmc->sector_size << 2;
-        fmc->mtd = mtd;
-        fmc->c = c;
-        fmc->head = NULL;
-        fmc->tail = NULL;
-        fmc->head_extra = NULL;
-        fmc->tail_extra = NULL;
-        mutex_init(&fmc->biglock);
-        return fmc;
-}
-/* When the flash memory scan has completed, this function should be called
-   before use of the control structure.  */
-void
-jffs_build_end(struct jffs_fmcontrol *fmc)
-{
-        D3(printk("jffs_build_end()\n"));
-        if (!fmc->head) {
-                fmc->head = fmc->head_extra;
-                fmc->tail = fmc->tail_extra;
-        }
-        else if (fmc->head_extra) {
-                fmc->tail_extra->next = fmc->head;
-                fmc->head->prev = fmc->tail_extra;
-                fmc->head = fmc->head_extra;
-        }
-        fmc->head_extra = NULL; /* These two instructions should be omitted.  */
-        fmc->tail_extra = NULL;
-        D3(jffs_print_fmcontrol(fmc));
-}
-/* Call this function when the file system is unmounted.  This function
-   frees all memory used by this module.  */
-void
-jffs_cleanup_fmcontrol(struct jffs_fmcontrol *fmc)
-{
-        if (fmc) {
-                struct jffs_fm *next = fmc->head;
-                while (next) {
-                        struct jffs_fm *cur = next;
-                        next = next->next;
-                        jffs_free_fm(cur);
-                }
-                put_mtd_device(fmc->mtd);
-                kfree(fmc);
-                DJM(no_jffs_fmcontrol--);
-        }
-}
-/* This function returns the size of the first chunk of free space on the
-   flash memory.  This function will return something nonzero if the flash
-   memory contains any free space.  */
-__u32
-jffs_free_size1(struct jffs_fmcontrol *fmc)
-{
-        __u32 head;
-        __u32 tail;
-        __u32 end = fmc->flash_size;
-        if (!fmc->head) {
-                /* There is nothing on the flash.  */
-                return fmc->flash_size;
-        }
-        /* Compute the beginning and ending of the contents of the flash.  */
-        head = fmc->head->offset;
-        tail = fmc->tail->offset + fmc->tail->size;
-        if (tail == end) {
-                tail = 0;
-        }
-        ASSERT(else if (tail > end) {
-                printk(KERN_WARNING "jffs_free_size1(): tail > end\n");
-                tail = 0;
-        });
-        if (head <= tail) {
-                return end - tail;
-        }
-        else {
-                return head - tail;
-        }
-}
-/* This function will return something nonzero in case there are two free
-   areas on the flash.  Like this:
-     +----------------+------------------+----------------+
-     |     FREE 1     |   USED / DIRTY   |     FREE 2     |
-     +----------------+------------------+----------------+
-       fmc->head -----^
-       fmc->tail ------------------------^
-   The value returned, will be the size of the first empty area on the
-   flash, in this case marked "FREE 1".  */
-__u32
-jffs_free_size2(struct jffs_fmcontrol *fmc)
-{
-        if (fmc->head) {
-                __u32 head = fmc->head->offset;
-                __u32 tail = fmc->tail->offset + fmc->tail->size;
-                if (tail == fmc->flash_size) {
-                        tail = 0;
-                }
-                if (tail >= head) {
-                        return head;
-                }
-        }
-        return 0;
-}
-/* Allocate a chunk of flash memory.  If there is enough space on the
-   device, a reference to the associated node is stored in the jffs_fm
-   struct.  */
-int
-jffs_fmalloc(struct jffs_fmcontrol *fmc, __u32 size, struct jffs_node *node,
-             struct jffs_fm **result)
-{
-        struct jffs_fm *fm;
-        __u32 free_chunk_size1;
-        __u32 free_chunk_size2;
-        D2(printk("jffs_fmalloc(): fmc = 0x%p, size = %d, "
-                  "node = 0x%p\n", fmc, size, node));
-        *result = NULL;
-        if (!(fm = jffs_alloc_fm())) {
-                D(printk("jffs_fmalloc(): kmalloc() failed! (fm)\n"));
-                return -ENOMEM;
-        }
-        free_chunk_size1 = jffs_free_size1(fmc);
-        free_chunk_size2 = jffs_free_size2(fmc);
-        if (free_chunk_size1 + free_chunk_size2 != fmc->free_size) {
-                printk(KERN_WARNING "Free size accounting screwed\n");
-                printk(KERN_WARNING "free_chunk_size1 == 0x%x, free_chunk_size2 == 0x%x, fmc->free_size == 0x%x\n", free_chunk_size1, free_chunk_size2, fmc->free_size);
-        }
-        D3(printk("jffs_fmalloc(): free_chunk_size1 = %u, "
-                  "free_chunk_size2 = %u\n",
-                  free_chunk_size1, free_chunk_size2));
-        if (size <= free_chunk_size1) {
-                if (!(fm->nodes = (struct jffs_node_ref *)
-                                  kmalloc(sizeof(struct jffs_node_ref),
-                                          GFP_KERNEL))) {
-                        D(printk("jffs_fmalloc(): kmalloc() failed! "
-                                 "(node_ref)\n"));
-                        jffs_free_fm(fm);
-                        return -ENOMEM;
-                }
-                DJM(no_jffs_node_ref++);
-                fm->nodes->node = node;
-                fm->nodes->next = NULL;
-                if (fmc->tail) {
-                        fm->offset = fmc->tail->offset + fmc->tail->size;
-                        if (fm->offset == fmc->flash_size) {
-                                fm->offset = 0;
-                        }
-                        ASSERT(else if (fm->offset > fmc->flash_size) {
-                                printk(KERN_WARNING "jffs_fmalloc(): "
-                                       "offset > flash_end\n");
-                                fm->offset = 0;
-                        });
-                }
-                else {
-                        /* There don't have to be files in the file
-                           system yet.  */
-                        fm->offset = 0;
-                }
-                fm->size = size;
-                fmc->free_size -= size;
-                fmc->used_size += size;
-        }
-        else if (size > free_chunk_size2) {
-                printk(KERN_WARNING "JFFS: Tried to allocate a too "
-                       "large flash memory chunk. (size = %u)\n", size);
-                jffs_free_fm(fm);
-                return -ENOSPC;
-        }
-        else {
-                fm->offset = fmc->tail->offset + fmc->tail->size;
-                fm->size = free_chunk_size1;
-                fm->nodes = NULL;
-                fmc->free_size -= fm->size;
-                fmc->dirty_size += fm->size; /* Changed by simonk. This seemingly fixes a 
-                                                bug that caused infinite garbage collection.
-                                                It previously set fmc->dirty_size to size (which is the
-                                                size of the requested chunk).
-                                             */
-        }
-        fm->next = NULL;
-        if (!fmc->head) {
-                fm->prev = NULL;
-                fmc->head = fm;
-                fmc->tail = fm;
-        }
-        else {
-                fm->prev = fmc->tail;
-                fmc->tail->next = fm;
-                fmc->tail = fm;
-        }
-        D3(jffs_print_fmcontrol(fmc));
-        D3(jffs_print_fm(fm));
-        *result = fm;
-        return 0;
-}
-/* The on-flash space is not needed anymore by the passed node.  Remove
-   the reference to the node from the node list.  If the data chunk in
-   the flash memory isn't used by any more nodes anymore (fm->nodes == 0),
-   then mark that chunk as dirty.  */
-int
-jffs_fmfree(struct jffs_fmcontrol *fmc, struct jffs_fm *fm, struct jffs_node *node)
-{
-        struct jffs_node_ref *ref;
-        struct jffs_node_ref *prev;
-        ASSERT(int del = 0);
-        D2(printk("jffs_fmfree(): node->ino = %u, node->version = %u\n",
-                 node->ino, node->version));
-        ASSERT(if (!fmc || !fm || !fm->nodes) {
-                printk(KERN_ERR "jffs_fmfree(): fmc: 0x%p, fm: 0x%p, "
-                       "fm->nodes: 0x%p\n",
-                       fmc, fm, (fm ? fm->nodes : NULL));
-                return -1;
-        });
-        /* Find the reference to the node that is going to be removed
-           and remove it.  */
-        for (ref = fm->nodes, prev = NULL; ref; ref = ref->next) {
-                if (ref->node == node) {
-                        if (prev) {
-                                prev->next = ref->next;
-                        }
-                        else {
-                                fm->nodes = ref->next;
-                        }
-                        kfree(ref);
-                        DJM(no_jffs_node_ref--);
-                        ASSERT(del = 1);
-                        break;
-                }
-                prev = ref;
-        }
-        /* If the data chunk in the flash memory isn't used anymore
-           just mark it as obsolete.  */
-        if (!fm->nodes) {
-                /* No node uses this chunk so let's remove it.  */
-                fmc->used_size -= fm->size;
-                fmc->dirty_size += fm->size;
-#if defined(JFFS_MARK_OBSOLETE) && JFFS_MARK_OBSOLETE
-                if (jffs_mark_obsolete(fmc, fm->offset) < 0) {
-                        D1(printk("jffs_fmfree(): Failed to mark an on-flash "
-                                  "node obsolete!\n"));
-                        return -1;
-                }
-#endif
-        }
-        ASSERT(if (!del) {
-                printk(KERN_WARNING "***jffs_fmfree(): "
-                       "Didn't delete any node reference!\n");
-        });
-        return 0;
-}
-/* This allocation function is used during the initialization of
-   the file system.  */
-struct jffs_fm *
-jffs_fmalloced(struct jffs_fmcontrol *fmc, __u32 offset, __u32 size,
-               struct jffs_node *node)
-{
-        struct jffs_fm *fm;
-        D3(printk("jffs_fmalloced()\n"));
-        if (!(fm = jffs_alloc_fm())) {
-                D(printk("jffs_fmalloced(0x%p, %u, %u, 0x%p): failed!\n",
-                         fmc, offset, size, node));
-                return NULL;
-        }
-        fm->offset = offset;
-        fm->size = size;
-        fm->prev = NULL;
-        fm->next = NULL;
-        fm->nodes = NULL;
-        if (node) {
-                /* `node' exists and it should be associated with the
-                    jffs_fm structure `fm'.  */
-                if (!(fm->nodes = (struct jffs_node_ref *)
-                                  kmalloc(sizeof(struct jffs_node_ref),
-                                          GFP_KERNEL))) {
-                        D(printk("jffs_fmalloced(): !fm->nodes\n"));
-                        jffs_free_fm(fm);
-                        return NULL;
-                }
-                DJM(no_jffs_node_ref++);
-                fm->nodes->node = node;
-                fm->nodes->next = NULL;
-                fmc->used_size += size;
-                fmc->free_size -= size;
-        }
-        else {
-                /* If there is no node, then this is just a chunk of dirt.  */
-                fmc->dirty_size += size;
-                fmc->free_size -= size;
-        }
-        if (fmc->head_extra) {
-                fm->prev = fmc->tail_extra;
-                fmc->tail_extra->next = fm;
-                fmc->tail_extra = fm;
-        }
-        else if (!fmc->head) {
-                fmc->head = fm;
-                fmc->tail = fm;
-        }
-        else if (fmc->tail->offset + fmc->tail->size < offset) {
-                fmc->head_extra = fm;
-                fmc->tail_extra = fm;
-        }
-        else {
-                fm->prev = fmc->tail;
-                fmc->tail->next = fm;
-                fmc->tail = fm;
-        }
-        D3(jffs_print_fmcontrol(fmc));
-        D3(jffs_print_fm(fm));
-        return fm;
-}
-/* Add a new node to an already existing jffs_fm struct.  */
-int
-jffs_add_node(struct jffs_node *node)
-{
-        struct jffs_node_ref *ref;
-        D3(printk("jffs_add_node(): ino = %u\n", node->ino));
-        ref = kmalloc(sizeof(*ref), GFP_KERNEL);
-        if (!ref)
-                return -ENOMEM;
-        DJM(no_jffs_node_ref++);
-        ref->node = node;
-        ref->next = node->fm->nodes;
-        node->fm->nodes = ref;
-        return 0;
-}
-/* Free a part of some allocated space.  */
-void
-jffs_fmfree_partly(struct jffs_fmcontrol *fmc, struct jffs_fm *fm, __u32 size)
-{
-        D1(printk("***jffs_fmfree_partly(): fm = 0x%p, fm->nodes = 0x%p, "
-                  "fm->nodes->node->ino = %u, size = %u\n",
-                  fm, (fm ? fm->nodes : 0),
-                  (!fm ? 0 : (!fm->nodes ? 0 : fm->nodes->node->ino)), size));
-        if (fm->nodes) {
-                kfree(fm->nodes);
-                DJM(no_jffs_node_ref--);
-                fm->nodes = NULL;
-        }
-        fmc->used_size -= fm->size;
-        if (fm == fmc->tail) {
-                fm->size -= size;
-                fmc->free_size += size;
-        }
-        fmc->dirty_size += fm->size;
-}
-/* Find the jffs_fm struct that contains the end of the data chunk that
-   begins at the logical beginning of the flash memory and spans `size'
-   bytes.  If we want to erase a sector of the flash memory, we use this
-   function to find where the sector limit cuts a chunk of data.  */
-struct jffs_fm *
-jffs_cut_node(struct jffs_fmcontrol *fmc, __u32 size)
-{
-        struct jffs_fm *fm;
-        __u32 pos = 0;
-        if (size == 0) {
-                return NULL;
-        }
-        ASSERT(if (!fmc) {
-                printk(KERN_ERR "jffs_cut_node(): fmc == NULL\n");
-                return NULL;
-        });
-        fm = fmc->head;
-        while (fm) {
-                pos += fm->size;
-                if (pos < size) {
-                        fm = fm->next;
-                }
-                else if (pos > size) {
-                        break;
-                }
-                else {
-                        fm = NULL;
-                        break;
-                }
-        }
-        return fm;
-}
-/* Move the head of the fmc structures and delete the obsolete parts.  */
-void
-jffs_sync_erase(struct jffs_fmcontrol *fmc, int erased_size)
-{
-        struct jffs_fm *fm;
-        struct jffs_fm *del;
-        ASSERT(if (!fmc) {
-                printk(KERN_ERR "jffs_sync_erase(): fmc == NULL\n");
-                return;
-        });
-        fmc->dirty_size -= erased_size;
-        fmc->free_size += erased_size;
-        for (fm = fmc->head; fm && (erased_size > 0);) {
-                if (erased_size >= fm->size) {
-                        erased_size -= fm->size;
-                        del = fm;
-                        fm = fm->next;
-                        fm->prev = NULL;
-                        fmc->head = fm;
-                        jffs_free_fm(del);
-                }
-                else {
-                        fm->size -= erased_size;
-                        fm->offset += erased_size;
-                        break;
-                }
-        }
-}
-/* Return the oldest used node in the flash memory.  */
-struct jffs_node *
-jffs_get_oldest_node(struct jffs_fmcontrol *fmc)
-{
-        struct jffs_fm *fm;
-        struct jffs_node_ref *nref;
-        struct jffs_node *node = NULL;
-        ASSERT(if (!fmc) {
-                printk(KERN_ERR "jffs_get_oldest_node(): fmc == NULL\n");
-                return NULL;
-        });
-        for (fm = fmc->head; fm && !fm->nodes; fm = fm->next);
-        if (!fm) {
-                return NULL;
-        }
-        /* The oldest node is the last one in the reference list.  This list
-           shouldn't be too long; just one or perhaps two elements.  */
-        for (nref = fm->nodes; nref; nref = nref->next) {
-                node = nref->node;
-        }
-        D2(printk("jffs_get_oldest_node(): ino = %u, version = %u\n",
-                  (node ? node->ino : 0), (node ? node->version : 0)));
-        return node;
-}
-#if defined(JFFS_MARK_OBSOLETE) && JFFS_MARK_OBSOLETE
-/* Mark an on-flash node as obsolete.
-   Note that this is just an optimization that isn't necessary for the
-   filesystem to work.  */
-static int
-jffs_mark_obsolete(struct jffs_fmcontrol *fmc, __u32 fm_offset)
-{
-        /* The `accurate_pos' holds the position of the accurate byte
-           in the jffs_raw_inode structure that we are going to mark
-           as obsolete.  */
-        __u32 accurate_pos = fm_offset + JFFS_RAW_INODE_ACCURATE_OFFSET;
-        unsigned char zero = 0x00;
-        size_t len;
-        D3(printk("jffs_mark_obsolete(): accurate_pos = %u\n", accurate_pos));
-        ASSERT(if (!fmc) {
-                printk(KERN_ERR "jffs_mark_obsolete(): fmc == NULL\n");
-                return -1;
-        });
-        /* Write 0x00 to the raw inode's accurate member.  Don't care
-           about the return value.  */
-        MTD_WRITE(fmc->mtd, accurate_pos, 1, &len, &zero);
-        return 0;
-}
-#endif /* JFFS_MARK_OBSOLETE  */
-/* check if it's possible to erase the wanted range, and if not, return
- * the range that IS erasable, or a negative error code.
- */
-static long
-jffs_flash_erasable_size(struct mtd_info *mtd, __u32 offset, __u32 size)
-{
-         u_long ssize;
-        /* assume that sector size for a partition is constant even
-         * if it spans more than one chip (you usually put the same
-         * type of chips in a system)
-         */
-        ssize = mtd->erasesize;
-        if (offset % ssize) {
-                printk(KERN_WARNING "jffs_flash_erasable_size() given non-aligned offset %x (erasesize %lx)\n", offset, ssize);
-                /* The offset is not sector size aligned.  */
-                return -1;
-        }
-        else if (offset > mtd->size) {
-                printk(KERN_WARNING "jffs_flash_erasable_size given offset off the end of device (%x > %x)\n", offset, mtd->size);
-                return -2;
-        }
-        else if (offset + size > mtd->size) {
-                printk(KERN_WARNING "jffs_flash_erasable_size() given length which runs off the end of device (ofs %x + len %x = %x, > %x)\n", offset,size, offset+size, mtd->size);
-                return -3;
-        }
-        return (size / ssize) * ssize;
-}
-/* How much dirty flash memory is possible to erase at the moment?  */
-long
-jffs_erasable_size(struct jffs_fmcontrol *fmc)
-{
-        struct jffs_fm *fm;
-        __u32 size = 0;
-        long ret;
-        ASSERT(if (!fmc) {
-                printk(KERN_ERR "jffs_erasable_size(): fmc = NULL\n");
-                return -1;
-        });
-        if (!fmc->head) {
-                /* The flash memory is totally empty. No nodes. No dirt.
-                   Just return.  */
-                return 0;
-        }
-        /* Calculate how much space that is dirty.  */
-        for (fm = fmc->head; fm && !fm->nodes; fm = fm->next) {
-                if (size && fm->offset == 0) {
-                        /* We have reached the beginning of the flash.  */
-                        break;
-                }
-                size += fm->size;
-        }
-        /* Someone's signature contained this:
-           There's a fine line between fishing and just standing on
-           the shore like an idiot...  */
-        ret = jffs_flash_erasable_size(fmc->mtd, fmc->head->offset, size);
-        ASSERT(if (ret < 0) {
-                printk("jffs_erasable_size: flash_erasable_size() "
-                       "returned something less than zero (%ld).\n", ret);
-                printk("jffs_erasable_size: offset = 0x%08x\n",
-                       fmc->head->offset);
-        });
-        /* If there is dirt on the flash (which is the reason to why
-           this function was called in the first place) but no space is
-           possible to erase right now, the initial part of the list of
-           jffs_fm structs, that hold place for dirty space, could perhaps
-           be shortened.  The list's initial "dirty" elements are merged
-           into just one large dirty jffs_fm struct.  This operation must
-           only be performed if nothing is possible to erase.  Otherwise,
-           jffs_clear_end_of_node() won't work as expected.  */
-        if (ret == 0) {
-                struct jffs_fm *head = fmc->head;
-                struct jffs_fm *del;
-                /* While there are two dirty nodes beside each other.*/
-                while (head->nodes == 0
-                       && head->next
-                       && head->next->nodes == 0) {
-                        del = head->next;
-                        head->size += del->size;
-                        head->next = del->next;
-                        if (del->next) {
-                                del->next->prev = head;
-                        }
-                        jffs_free_fm(del);
-                }
-        }
-        return (ret >= 0 ? ret : 0);
-}
-static struct jffs_fm *jffs_alloc_fm(void)
-{
-        struct jffs_fm *fm;
-        fm = kmem_cache_alloc(fm_cache,GFP_KERNEL);
-        DJM(if (fm) no_jffs_fm++;);
-        
-        return fm;
-}
-static void jffs_free_fm(struct jffs_fm *n)
-{
-        kmem_cache_free(fm_cache,n);
-        DJM(no_jffs_fm--);
-}
-struct jffs_node *jffs_alloc_node(void)
-{
-        struct jffs_node *n;
-        n = (struct jffs_node *)kmem_cache_alloc(node_cache,GFP_KERNEL);
-        if(n != NULL)
-                no_jffs_node++;
-        return n;
-}
-void jffs_free_node(struct jffs_node *n)
-{
-        kmem_cache_free(node_cache,n);
-        no_jffs_node--;
-}
-int jffs_get_node_inuse(void)
-{
-        return no_jffs_node;
-}
diff --git a/fs/jffs/jffs_fm.h b/fs/jffs/jffs_fm.h
deleted file mode 100644
index 9ee6ad29eff5..000000000000
--- a/fs/jffs/jffs_fm.h
+++ /dev/null
@@ -1,149 +0,0 @@
-/*
- * JFFS -- Journaling Flash File System, Linux implementation.
- *
- * Copyright (C) 1999, 2000  Axis Communications AB.
- *
- * Created by Finn Hakansson <finn@axis.com>.
- *
- * This is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * $Id: jffs_fm.h,v 1.13 2001/01/11 12:03:25 dwmw2 Exp $
- *
- * Ported to Linux 2.3.x and MTD:
- * Copyright (C) 2000  Alexander Larsson (alex@cendio.se), Cendio Systems AB
- *
- */
-#ifndef __LINUX_JFFS_FM_H__
-#define __LINUX_JFFS_FM_H__
-#include <linux/types.h>
-#include <linux/jffs.h>
-#include <linux/mtd/mtd.h>
-#include <linux/mutex.h>
-/* The alignment between two nodes in the flash memory.  */
-#define JFFS_ALIGN_SIZE 4
-/* Mark the on-flash space as obsolete when appropriate.  */
-#define JFFS_MARK_OBSOLETE 0
-#ifndef CONFIG_JFFS_FS_VERBOSE
-#define CONFIG_JFFS_FS_VERBOSE 1
-#endif
-#if CONFIG_JFFS_FS_VERBOSE > 0
-#define D(x) x
-#define D1(x) D(x)
-#else
-#define D(x)
-#define D1(x)
-#endif
-#if CONFIG_JFFS_FS_VERBOSE > 1
-#define D2(x) D(x)
-#else
-#define D2(x)
-#endif
-#if CONFIG_JFFS_FS_VERBOSE > 2
-#define D3(x) D(x)
-#else
-#define D3(x)
-#endif
-#define ASSERT(x) x
-/* How many padding bytes should be inserted between two chunks of data
-   on the flash?  */
-#define JFFS_GET_PAD_BYTES(size) ( (JFFS_ALIGN_SIZE-1) & -(__u32)(size) )
-#define JFFS_PAD(size) ( (size + (JFFS_ALIGN_SIZE-1)) & ~(JFFS_ALIGN_SIZE-1) )
-struct jffs_node_ref
-{
-        struct jffs_node *node;
-        struct jffs_node_ref *next;
-};
-/* The struct jffs_fm represents a chunk of data in the flash memory.  */
-struct jffs_fm
-{
-        __u32 offset;
-        __u32 size;
-        struct jffs_fm *prev;
-        struct jffs_fm *next;
-        struct jffs_node_ref *nodes; /* USED if != 0.  */
-};
-struct jffs_fmcontrol
-{
-        __u32 flash_size;
-        __u32 used_size;
-        __u32 dirty_size;
-        __u32 free_size;
-        __u32 sector_size;
-        __u32 min_free_size;  /* The minimum free space needed to be able
-                                 to perform garbage collections.  */
-        __u32 max_chunk_size; /* The maximum size of a chunk of data.  */
-        struct mtd_info *mtd;
-        struct jffs_control *c;
-        struct jffs_fm *head;
-        struct jffs_fm *tail;
-        struct jffs_fm *head_extra;
-        struct jffs_fm *tail_extra;
-        struct mutex biglock;
-};
-/* Notice the two members head_extra and tail_extra in the jffs_control
-   structure above. Those are only used during the scanning of the flash
-   memory; while the file system is being built. If the data in the flash
-   memory is organized like
-      +----------------+------------------+----------------+
-      |  USED / DIRTY  |       FREE       |  USED / DIRTY  |
-      +----------------+------------------+----------------+
-   then the scan is split in two parts. The first scanned part of the
-   flash memory is organized through the members head and tail. The
-   second scanned part is organized with head_extra and tail_extra. When
-   the scan is completed, the two lists are merged together. The jffs_fm
-   struct that head_extra references is the logical beginning of the
-   flash memory so it will be referenced by the head member.  */
-struct jffs_fmcontrol *jffs_build_begin(struct jffs_control *c, int unit);
-void jffs_build_end(struct jffs_fmcontrol *fmc);
-void jffs_cleanup_fmcontrol(struct jffs_fmcontrol *fmc);
-int jffs_fmalloc(struct jffs_fmcontrol *fmc, __u32 size,
-                 struct jffs_node *node, struct jffs_fm **result);
-int jffs_fmfree(struct jffs_fmcontrol *fmc, struct jffs_fm *fm,
-                struct jffs_node *node);
-__u32 jffs_free_size1(struct jffs_fmcontrol *fmc);
-__u32 jffs_free_size2(struct jffs_fmcontrol *fmc);
-void jffs_sync_erase(struct jffs_fmcontrol *fmc, int erased_size);
-struct jffs_fm *jffs_cut_node(struct jffs_fmcontrol *fmc, __u32 size);
-struct jffs_node *jffs_get_oldest_node(struct jffs_fmcontrol *fmc);
-long jffs_erasable_size(struct jffs_fmcontrol *fmc);
-struct jffs_fm *jffs_fmalloced(struct jffs_fmcontrol *fmc, __u32 offset,
-                               __u32 size, struct jffs_node *node);
-int jffs_add_node(struct jffs_node *node);
-void jffs_fmfree_partly(struct jffs_fmcontrol *fmc, struct jffs_fm *fm,
-                        __u32 size);
-#if CONFIG_JFFS_FS_VERBOSE > 0
-void jffs_print_fmcontrol(struct jffs_fmcontrol *fmc);
-#endif
-#if 0
-void jffs_print_node_ref(struct jffs_node_ref *ref);
-#endif  /*  0  */
-#endif /* __LINUX_JFFS_FM_H__  */
diff --git a/fs/jffs/jffs_proc.c b/fs/jffs/jffs_proc.c
deleted file mode 100644
index 9bdd99a557c2..000000000000
--- a/fs/jffs/jffs_proc.c
+++ /dev/null
@@ -1,261 +0,0 @@
-/*
- * JFFS -- Journaling Flash File System, Linux implementation.
- *
- * Copyright (C) 2000  Axis Communications AB.
- *
- * Created by Simon Kagstrom <simonk@axis.com>.
- *
- * $Id: jffs_proc.c,v 1.5 2001/06/02 14:34:55 dwmw2 Exp $
- *
- * This is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- *  Overview:
- *   This file defines JFFS partition entries in the proc file system.
- *
- *  TODO:
- *   Create some more proc files for different kinds of info, i.e. statistics
- *   about written and read bytes, number of calls to different routines,
- *   reports about failures.
- */
-#include <linux/errno.h>
-#include <linux/fs.h>
-#include <linux/jffs.h>
-#include <linux/slab.h>
-#include <linux/proc_fs.h>
-#include <linux/time.h>
-#include <linux/types.h>
-#include "jffs_fm.h"
-#include "jffs_proc.h"
-/*
- * Structure for a JFFS partition in the system
- */
-struct jffs_partition_dir {
-        struct jffs_control *c;
-        struct proc_dir_entry *part_root;
-        struct proc_dir_entry *part_info;
-        struct proc_dir_entry *part_layout;
-        struct jffs_partition_dir *next;
-};
-/*
- * Structure for top-level entry in '/proc/fs' directory
- */
-struct proc_dir_entry *jffs_proc_root;
-/*
- * Linked list of 'jffs_partition_dirs' to help us track
- * the mounted JFFS partitions in the system
- */
-static struct jffs_partition_dir *jffs_part_dirs;
-/*
- * Read functions for entries
- */
-static int jffs_proc_info_read(char *page, char **start, off_t off,
-                int count, int *eof, void *data);
-static int jffs_proc_layout_read (char *page, char **start, off_t off,
-                int count, int *eof, void *data);
-/*
- * Register a JFFS partition directory (called upon mount)
- */
-int jffs_register_jffs_proc_dir(int mtd, struct jffs_control *c)
-{
-        struct jffs_partition_dir *part_dir;
-        struct proc_dir_entry *part_info = NULL;
-        struct proc_dir_entry *part_layout = NULL;
-        struct proc_dir_entry *part_root = NULL;
-        char name[10];
-        sprintf(name, "%d", mtd);
-        /* Allocate structure for local JFFS partition table */
-        part_dir = (struct jffs_partition_dir *)
-                kmalloc(sizeof (struct jffs_partition_dir), GFP_KERNEL);
-        if (!part_dir)
-                goto out;
-        /* Create entry for this partition */
-        part_root = proc_mkdir(name, jffs_proc_root);
-        if (!part_root)
-                goto out1;
-        /* Create entry for 'info' file */
-        part_info = create_proc_entry ("info", 0, part_root);
-        if (!part_info)
-                goto out2;
-        part_info->read_proc = jffs_proc_info_read;
-        part_info->data = (void *) c;
-        /* Create entry for 'layout' file */
-        part_layout = create_proc_entry ("layout", 0, part_root);
-        if (!part_layout)
-                goto out3;
-        part_layout->read_proc = jffs_proc_layout_read;
-        part_layout->data = (void *) c;
-        /* Fill in structure for table and insert in the list */
-        part_dir->c = c;
-        part_dir->part_root = part_root;
-        part_dir->part_info = part_info;
-        part_dir->part_layout = part_layout;
-        part_dir->next = jffs_part_dirs;
-        jffs_part_dirs = part_dir;
-        /* Return happy */
-        return 0;
-out3:
-        remove_proc_entry("info", part_root);
-out2:
-        remove_proc_entry(name, jffs_proc_root);
-out1:
-        kfree(part_dir);
-out:
-        return -ENOMEM;
-}
-/*
- * Unregister a JFFS partition directory (called at umount)
- */
-int jffs_unregister_jffs_proc_dir(struct jffs_control *c)
-{
-        struct jffs_partition_dir *part_dir = jffs_part_dirs;
-        struct jffs_partition_dir *prev_part_dir = NULL;
-        while (part_dir) {
-                if (part_dir->c == c) {
-                        /* Remove entries for partition */
-                        remove_proc_entry (part_dir->part_info->name,
-                                part_dir->part_root);
-                        remove_proc_entry (part_dir->part_layout->name,
-                                part_dir->part_root);
-                        remove_proc_entry (part_dir->part_root->name,
-                                jffs_proc_root);
-                        /* Remove entry from list */
-                        if (prev_part_dir)
-                                prev_part_dir->next = part_dir->next;
-                        else
-                                jffs_part_dirs = part_dir->next;
-                        /*
-                         * Check to see if this is the last one
-                         * and remove the entry from '/proc/fs'
-                         * if it is.
-                         */
-                        if (jffs_part_dirs == part_dir->next)
-                                remove_proc_entry ("jffs", proc_root_fs);
-                        /* Free memory for entry */
-                        kfree(part_dir);
-                        /* Return happy */
-                        return 0;
-                }
-                /* Move to next entry */
-                prev_part_dir = part_dir;
-                part_dir = part_dir->next;
-        }
-        /* Return unhappy */
-        return -1;
-}
-/*
- * Read a JFFS partition's `info' file
- */
-static int jffs_proc_info_read (char *page, char **start, off_t off,
-                int count, int *eof, void *data)
-{
-        struct jffs_control *c = (struct jffs_control *) data;
-        int len = 0;
-        /* Get information on the parition */
-        len += sprintf (page,
-                "partition size:     %08lX (%u)\n"
-                "sector size:        %08lX (%u)\n"
-                "used size:          %08lX (%u)\n"
-                "dirty size:         %08lX (%u)\n"
-                "free size:          %08lX (%u)\n\n",
-                (unsigned long) c->fmc->flash_size, c->fmc->flash_size,
-                (unsigned long) c->fmc->sector_size, c->fmc->sector_size,
-                (unsigned long) c->fmc->used_size, c->fmc->used_size,
-                (unsigned long) c->fmc->dirty_size, c->fmc->dirty_size,
-                (unsigned long) (c->fmc->flash_size -
-                        (c->fmc->used_size + c->fmc->dirty_size)),
-                c->fmc->flash_size - (c->fmc->used_size + c->fmc->dirty_size));
-        /* We're done */
-        *eof = 1;
-        /* Return length */
-        return len;
-}
-/*
- * Read a JFFS partition's `layout' file
- */
-static int jffs_proc_layout_read (char *page, char **start, off_t off,
-                int count, int *eof, void *data)
-{
-        struct jffs_control *c = (struct jffs_control *) data;
-        struct jffs_fm *fm = NULL;
-        struct jffs_fm *last_fm = NULL;
-        int len = 0;
-        /* Get the first item in the list */
-        fm = c->fmc->head;
-        /* Print free space */
-        if (fm && fm->offset) {
-                len += sprintf (page, "00000000 %08lX free\n",
-                        (unsigned long) fm->offset);
-        }
-        /* Loop through all of the flash control structures */
-        while (fm && (len < (off + count))) {
-                if (fm->nodes) {
-                        len += sprintf (page + len,
-                                "%08lX %08lX ino=%08lX, ver=%08lX\n",
-                                (unsigned long) fm->offset,
-                                (unsigned long) fm->size,
-                                (unsigned long) fm->nodes->node->ino,
-                                (unsigned long) fm->nodes->node->version);
-                }
-                else {
-                        len += sprintf (page + len,
-                                "%08lX %08lX dirty\n",
-                                (unsigned long) fm->offset,
-                                (unsigned long) fm->size);
-                }
-                last_fm = fm;
-                fm = fm->next;
-        }
-        /* Print free space */
-        if ((len < (off + count)) && last_fm
-            && (last_fm->offset < c->fmc->flash_size)) {
-                len += sprintf (page + len,
-                               "%08lX %08lX free\n",
-                               (unsigned long) last_fm->offset + 
-                                last_fm->size,
-                               (unsigned long) (c->fmc->flash_size -
-                                                    (last_fm->offset + last_fm->size)));
-        }
-        /* We're done */
-        *eof = 1;
-        /* Return length */
-        return len;
-}
diff --git a/fs/jffs/jffs_proc.h b/fs/jffs/jffs_proc.h
deleted file mode 100644
index 39a1c5d162b0..000000000000
--- a/fs/jffs/jffs_proc.h
+++ /dev/null
@@ -1,28 +0,0 @@
-/*
- * JFFS -- Journaling Flash File System, Linux implementation.
- *
- * Copyright (C) 2000  Axis Communications AB.
- *
- * Created by Simon Kagstrom <simonk@axis.com>.
- *
- * This is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * $Id: jffs_proc.h,v 1.2 2000/11/15 22:04:12 sjhill Exp $
- */
-/* jffs_proc.h defines a structure for inclusion in the proc-file system.  */
-#ifndef __LINUX_JFFS_PROC_H__
-#define __LINUX_JFFS_PROC_H__
-#include <linux/proc_fs.h>
-/* The proc_dir_entry for jffs (defined in jffs_proc.c).  */
-extern struct proc_dir_entry *jffs_proc_root;
-int jffs_register_jffs_proc_dir(int mtd, struct jffs_control *c);
-int jffs_unregister_jffs_proc_dir(struct jffs_control *c);
-#endif /* __LINUX_JFFS_PROC_H__ */
diff --git a/fs/jffs2/LICENCE b/fs/jffs2/LICENCE
index cd81d83e4ad2..562885908135 100644
--- a/fs/jffs2/LICENCE
+++ b/fs/jffs2/LICENCE
@@ -1,7 +1,7 @@
 The files in this directory and elsewhere which refer to this LICENCE
 file are part of JFFS2, the Journalling Flash File System v2.
-        Copyright (C) 2001, 2002 Red Hat, Inc.
+        Copyright © 2001-2007 Red Hat, Inc. and others
 JFFS2 is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free
@@ -28,8 +28,3 @@ of the GNU General Public License.
 This exception does not invalidate any other reasons why a work based on
 this file might be covered by the GNU General Public License.
-For information on obtaining alternative licences for JFFS2, see 
-http://sources.redhat.com/jffs2/jffs2-licence.html
-        $Id: LICENCE,v 1.1 2002/05/20 14:56:37 dwmw2 Exp $
diff --git a/fs/jffs2/Makefile b/fs/jffs2/Makefile
index 7f28ee0bd132..c32b241e3d91 100644
--- a/fs/jffs2/Makefile
+++ b/fs/jffs2/Makefile
@@ -1,7 +1,6 @@
 #
 # Makefile for the Linux Journalling Flash File System v2 (JFFS2)
 #
-# $Id: Makefile.common,v 1.11 2005/09/07 08:34:53 havasi Exp $
 #
 obj-$(CONFIG_JFFS2_FS) += jffs2.o
diff --git a/fs/jffs2/README.Locking b/fs/jffs2/README.Locking
index c8f0bd64e53e..d14d5a4dc5ac 100644
--- a/fs/jffs2/README.Locking
+++ b/fs/jffs2/README.Locking
@@ -1,4 +1,3 @@
-        $Id: README.Locking,v 1.12 2005/04/13 13:22:35 dwmw2 Exp $
        JFFS2 LOCKING DOCUMENTATION
        ---------------------------
diff --git a/fs/jffs2/TODO b/fs/jffs2/TODO
index d0e23b26fa50..5d3ea4070f01 100644
--- a/fs/jffs2/TODO
+++ b/fs/jffs2/TODO
@@ -1,4 +1,3 @@
-$Id: TODO,v 1.18 2005/09/22 11:24:56 dedekind Exp $
 - support asynchronous operation -- add a per-fs 'reserved_space' count,
   let each outstanding write reserve the _maximum_ amount of physical
@@ -30,8 +29,6 @@ $Id: TODO,v 1.18 2005/09/22 11:24:56 dedekind Exp $
     the full dirent, we only need to go to the flash in lookup() when we think we've
     got a match, and in readdir(). 
   - Doubly-linked next_in_ino list to allow us to free obsoleted raw_node_refs immediately?
-   - Remove totlen from jffs2_raw_node_ref? Need to have totlen passed into
-        jffs2_mark_node_obsolete(). Can all callers work it out?
   - Remove size from jffs2_raw_node_frag. 
 dedekind:
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index 73f0d60f73a5..a46101ee867a 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -1,13 +1,14 @@
 /*
 * JFFS2 -- Journalling Flash File System, Version 2.
 *
- * Copyright (C) 2006  NEC Corporation
+ * Copyright © 2006  NEC Corporation
 *
 * Created by KaiGai Kohei <kaigai@ak.jp.nec.com>
 *
 * For licensing information, see the file 'LICENCE' in this directory.
 *
 */
 #include <linux/kernel.h>
 #include <linux/slab.h>
 #include <linux/fs.h>
diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h
index fa327dbd3171..c84378cee82a 100644
--- a/fs/jffs2/acl.h
+++ b/fs/jffs2/acl.h
@@ -1,13 +1,14 @@
 /*
 * JFFS2 -- Journalling Flash File System, Version 2.
 *
- * Copyright (C) 2006  NEC Corporation
+ * Copyright © 2006  NEC Corporation
 *
 * Created by KaiGai Kohei <kaigai@ak.jp.nec.com>
 *
 * For licensing information, see the file 'LICENCE' in this directory.
 *
 */
 struct jffs2_acl_entry {
        jint16_t        e_tag;
        jint16_t        e_perm;
diff --git a/fs/jffs2/background.c b/fs/jffs2/background.c
index 6eb3daebd563..0c82dfcfd246 100644
--- a/fs/jffs2/background.c
+++ b/fs/jffs2/background.c
@@ -1,14 +1,12 @@
 /*
 * JFFS2 -- Journalling Flash File System, Version 2.
 *
- * Copyright (C) 2001-2003 Red Hat, Inc.
+ * Copyright © 2001-2007 Red Hat, Inc.
 *
 * Created by David Woodhouse <dwmw2@infradead.org>
 *
 * For licensing information, see the file 'LICENCE' in this directory.
 *
- * $Id: background.c,v 1.54 2005/05/20 21:37:12 gleixner Exp $
- *
 */
 #include <linux/kernel.h>
@@ -99,7 +97,13 @@ static int jffs2_garbage_collect_thread(void *_c)
                if (try_to_freeze())
                        continue;
-                cond_resched();
+                /* This thread is purely an optimisation. But if it runs when
+                   other things could be running, it actually makes things a
+                   lot worse. Use yield() and put it at the back of the runqueue
+                   every time. Especially during boot, pulling an inode in
+                   with read_inode() is much preferable to having the GC thread
+                   get there first. */
+                yield();
                /* Put_super will send a SIGKILL and then wait on the sem.
                 */
diff --git a/fs/jffs2/build.c b/fs/jffs2/build.c
index 02826967ab58..0ca2fff2617f 100644
--- a/fs/jffs2/build.c
+++ b/fs/jffs2/build.c
@@ -1,14 +1,12 @@
 /*
 * JFFS2 -- Journalling Flash File System, Version 2.
 *
- * Copyright (C) 2001-2003 Red Hat, Inc.
+ * Copyright © 2001-2007 Red Hat, Inc.
 *
 * Created by David Woodhouse <dwmw2@infradead.org>
 *
 * For licensing information, see the file 'LICENCE' in this directory.
 *
- * $Id: build.c,v 1.85 2005/11/07 11:14:38 gleixner Exp $
- *
 */
 #include <linux/kernel.h>
@@ -348,23 +346,27 @@ int jffs2_do_mount_fs(struct jffs2_sb_info *c)
        ret = jffs2_sum_init(c);
        if (ret)
-                return ret;
+                goto out_free;
        if (jffs2_build_filesystem(c)) {
                dbg_fsbuild("build_fs failed\n");
                jffs2_free_ino_caches(c);
                jffs2_free_raw_node_refs(c);
-#ifndef __ECOS
+                ret = -EIO;
-                if (jffs2_blocks_use_vmalloc(c))
+                goto out_free;
-                        vfree(c->blocks);
-                else
-#endif
-                        kfree(c->blocks);
-                return -EIO;
        }
        jffs2_calc_trigger_levels(c);
        return 0;
+ out_free:
+#ifndef __ECOS
+        if (jffs2_blocks_use_vmalloc(c))
+                vfree(c->blocks);
+        else
+#endif
+                kfree(c->blocks);
+        return ret;
 }
diff --git a/fs/jffs2/compr.c b/fs/jffs2/compr.c
index 7001ba26c067..485d065de41f 100644
--- a/fs/jffs2/compr.c
+++ b/fs/jffs2/compr.c
@@ -1,16 +1,14 @@
 /*
 * JFFS2 -- Journalling Flash File System, Version 2.
 *
- * Copyright (C) 2001-2003 Red Hat, Inc.
+ * Copyright © 2001-2007 Red Hat, Inc.
 * Created by Arjan van de Ven <arjanv@redhat.com>
 *
- * Copyright (C) 2004 Ferenc Havasi <havasi@inf.u-szeged.hu>,
+ * Copyright © 2004 Ferenc Havasi <havasi@inf.u-szeged.hu>,
 *                    University of Szeged, Hungary
 *
 * For licensing information, see the file 'LICENCE' in this directory.
 *
- * $Id: compr.c,v 1.46 2005/11/07 11:14:38 gleixner Exp $
- *
 */
 #include "compr.h"
@@ -268,144 +266,6 @@ int jffs2_unregister_compressor(struct jffs2_compressor *comp)
        return 0;
 }
-#ifdef CONFIG_JFFS2_PROC
-#define JFFS2_STAT_BUF_SIZE 16000
-char *jffs2_list_compressors(void)
-{
-        struct jffs2_compressor *this;
-        char *buf, *act_buf;
-        act_buf = buf = kmalloc(JFFS2_STAT_BUF_SIZE,GFP_KERNEL);
-        list_for_each_entry(this, &jffs2_compressor_list, list) {
-                act_buf += sprintf(act_buf, "%10s priority:%d ", this->name, this->priority);
-                if ((this->disabled)||(!this->compress))
-                        act_buf += sprintf(act_buf,"disabled");
-                else
-                        act_buf += sprintf(act_buf,"enabled");
-                act_buf += sprintf(act_buf,"\n");
-        }
-        return buf;
-}
-char *jffs2_stats(void)
-{
-        struct jffs2_compressor *this;
-        char *buf, *act_buf;
-        act_buf = buf = kmalloc(JFFS2_STAT_BUF_SIZE,GFP_KERNEL);
-        act_buf += sprintf(act_buf,"JFFS2 compressor statistics:\n");
-        act_buf += sprintf(act_buf,"%10s   ","none");
-        act_buf += sprintf(act_buf,"compr: %d blocks (%d)  decompr: %d blocks\n", none_stat_compr_blocks,
-                           none_stat_compr_size, none_stat_decompr_blocks);
-        spin_lock(&jffs2_compressor_list_lock);
-        list_for_each_entry(this, &jffs2_compressor_list, list) {
-                act_buf += sprintf(act_buf,"%10s ",this->name);
-                if ((this->disabled)||(!this->compress))
-                        act_buf += sprintf(act_buf,"- ");
-                else
-                        act_buf += sprintf(act_buf,"+ ");
-                act_buf += sprintf(act_buf,"compr: %d blocks (%d/%d)  decompr: %d blocks ", this->stat_compr_blocks,
-                                   this->stat_compr_new_size, this->stat_compr_orig_size,
-                                   this->stat_decompr_blocks);
-                act_buf += sprintf(act_buf,"\n");
-        }
-        spin_unlock(&jffs2_compressor_list_lock);
-        return buf;
-}
-char *jffs2_get_compression_mode_name(void)
-{
-        switch (jffs2_compression_mode) {
-        case JFFS2_COMPR_MODE_NONE:
-                return "none";
-        case JFFS2_COMPR_MODE_PRIORITY:
-                return "priority";
-        case JFFS2_COMPR_MODE_SIZE:
-                return "size";
-        }
-        return "unkown";
-}
-int jffs2_set_compression_mode_name(const char *name)
-{
-        if (!strcmp("none",name)) {
-                jffs2_compression_mode = JFFS2_COMPR_MODE_NONE;
-                return 0;
-        }
-        if (!strcmp("priority",name)) {
-                jffs2_compression_mode = JFFS2_COMPR_MODE_PRIORITY;
-                return 0;
-        }
-        if (!strcmp("size",name)) {
-                jffs2_compression_mode = JFFS2_COMPR_MODE_SIZE;
-                return 0;
-        }
-        return 1;
-}
-static int jffs2_compressor_Xable(const char *name, int disabled)
-{
-        struct jffs2_compressor *this;
-        spin_lock(&jffs2_compressor_list_lock);
-        list_for_each_entry(this, &jffs2_compressor_list, list) {
-                if (!strcmp(this->name, name)) {
-                        this->disabled = disabled;
-                        spin_unlock(&jffs2_compressor_list_lock);
-                        return 0;
-                }
-        }
-        spin_unlock(&jffs2_compressor_list_lock);
-        printk(KERN_WARNING "JFFS2: compressor %s not found.\n",name);
-        return 1;
-}
-int jffs2_enable_compressor_name(const char *name)
-{
-        return jffs2_compressor_Xable(name, 0);
-}
-int jffs2_disable_compressor_name(const char *name)
-{
-        return jffs2_compressor_Xable(name, 1);
-}
-int jffs2_set_compressor_priority(const char *name, int priority)
-{
-        struct jffs2_compressor *this,*comp;
-        spin_lock(&jffs2_compressor_list_lock);
-        list_for_each_entry(this, &jffs2_compressor_list, list) {
-                if (!strcmp(this->name, name)) {
-                        this->priority = priority;
-                        comp = this;
-                        goto reinsert;
-                }
-        }
-        spin_unlock(&jffs2_compressor_list_lock);
-        printk(KERN_WARNING "JFFS2: compressor %s not found.\n",name);
-        return 1;
-reinsert:
-        /* list is sorted in the order of priority, so if
-           we change it we have to reinsert it into the
-           good place */
-        list_del(&comp->list);
-        list_for_each_entry(this, &jffs2_compressor_list, list) {
-                if (this->priority < comp->priority) {
-                        list_add(&comp->list, this->list.prev);
-                        spin_unlock(&jffs2_compressor_list_lock);
-                        return 0;
-                }
-        }
-        list_add_tail(&comp->list, &jffs2_compressor_list);
-        spin_unlock(&jffs2_compressor_list_lock);
-        return 0;
-}
-#endif
 void jffs2_free_comprbuf(unsigned char *comprbuf, unsigned char *orig)
 {
        if (orig != comprbuf)
diff --git a/fs/jffs2/compr.h b/fs/jffs2/compr.h
index 509b8b1c0811..68cc7010dbdf 100644
--- a/fs/jffs2/compr.h
+++ b/fs/jffs2/compr.h
@@ -1,13 +1,10 @@
 /*
 * JFFS2 -- Journalling Flash File System, Version 2.
 *
- * Copyright (C) 2004 Ferenc Havasi <havasi@inf.u-szeged.hu>,
+ * Copyright © 2004   Ferenc Havasi <havasi@inf.u-szeged.hu>,
 *                    University of Szeged, Hungary
 *
- * For licensing information, see the file 'LICENCE' in the
+ * For licensing information, see the file 'LICENCE' in this directory.
- * jffs2 directory.
- *
- * $Id: compr.h,v 1.9 2005/11/07 11:14:38 gleixner Exp $
 *
 */
@@ -76,16 +73,6 @@ int jffs2_decompress(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
 void jffs2_free_comprbuf(unsigned char *comprbuf, unsigned char *orig);
-#ifdef CONFIG_JFFS2_PROC
-int jffs2_enable_compressor_name(const char *name);
-int jffs2_disable_compressor_name(const char *name);
-int jffs2_set_compression_mode_name(const char *mode_name);
-char *jffs2_get_compression_mode_name(void);
-int jffs2_set_compressor_priority(const char *mode_name, int priority);
-char *jffs2_list_compressors(void);
-char *jffs2_stats(void);
-#endif
 /* Compressor modules */
 /* These functions will be called by jffs2_compressors_init/exit */
diff --git a/fs/jffs2/compr_rtime.c b/fs/jffs2/compr_rtime.c
index 2eb1b7428d16..0d0bfd2e4e0d 100644
--- a/fs/jffs2/compr_rtime.c
+++ b/fs/jffs2/compr_rtime.c
@@ -1,13 +1,12 @@
 /*
 * JFFS2 -- Journalling Flash File System, Version 2.
 *
- * Copyright (C) 2001-2003 Red Hat, Inc.
+ * Copyright © 2001-2007 Red Hat, Inc.
 *
 * Created by Arjan van de Ven <arjanv@redhat.com>
 *
 * For licensing information, see the file 'LICENCE' in this directory.
 *
- * $Id: compr_rtime.c,v 1.14 2004/06/23 16:34:40 havasi Exp $
 *
 *
 * Very simple lz77-ish encoder.
diff --git a/fs/jffs2/compr_rubin.c b/fs/jffs2/compr_rubin.c
index e792e675d624..ea0431e047d5 100644
--- a/fs/jffs2/compr_rubin.c
+++ b/fs/jffs2/compr_rubin.c
@@ -1,23 +1,94 @@
 /*
 * JFFS2 -- Journalling Flash File System, Version 2.
 *
- * Copyright (C) 2001, 2002 Red Hat, Inc.
+ * Copyright © 2001-2007 Red Hat, Inc.
 *
 * Created by Arjan van de Ven <arjanv@redhat.com>
 *
 * For licensing information, see the file 'LICENCE' in this directory.
 *
- * $Id: compr_rubin.c,v 1.20 2004/06/23 16:34:40 havasi Exp $
- *
 */
 #include <linux/string.h>
 #include <linux/types.h>
 #include <linux/jffs2.h>
-#include "compr_rubin.h"
+#include <linux/errno.h>
-#include "histo_mips.h"
 #include "compr.h"
+#define RUBIN_REG_SIZE   16
+#define UPPER_BIT_RUBIN    (((long) 1)<<(RUBIN_REG_SIZE-1))
+#define LOWER_BITS_RUBIN   ((((long) 1)<<(RUBIN_REG_SIZE-1))-1)
+#define BIT_DIVIDER_MIPS 1043
+static int bits_mips[8] = { 277,249,290,267,229,341,212,241}; /* mips32 */
+#include <linux/errno.h>
+struct pushpull {
+        unsigned char *buf;
+        unsigned int buflen;
+        unsigned int ofs;
+        unsigned int reserve;
+};
+struct rubin_state {
+        unsigned long p;
+        unsigned long q;
+        unsigned long rec_q;
+        long bit_number;
+        struct pushpull pp;
+        int bit_divider;
+        int bits[8];
+};
+static inline void init_pushpull(struct pushpull *pp, char *buf, unsigned buflen, unsigned ofs, unsigned reserve)
+{
+        pp->buf = buf;
+        pp->buflen = buflen;
+        pp->ofs = ofs;
+        pp->reserve = reserve;
+}
+static inline int pushbit(struct pushpull *pp, int bit, int use_reserved)
+{
+        if (pp->ofs >= pp->buflen - (use_reserved?0:pp->reserve)) {
+                return -ENOSPC;
+        }
+        if (bit) {
+                pp->buf[pp->ofs >> 3] |= (1<<(7-(pp->ofs &7)));
+        }
+        else {
+                pp->buf[pp->ofs >> 3] &= ~(1<<(7-(pp->ofs &7)));
+        }
+        pp->ofs++;
+        return 0;
+}
+static inline int pushedbits(struct pushpull *pp)
+{
+        return pp->ofs;
+}
+static inline int pullbit(struct pushpull *pp)
+{
+        int bit;
+        bit = (pp->buf[pp->ofs >> 3] >> (7-(pp->ofs & 7))) & 1;
+        pp->ofs++;
+        return bit;
+}
+static inline int pulledbits(struct pushpull *pp)
+{
+        return pp->ofs;
+}
 static void init_rubin(struct rubin_state *rs, int div, int *bits)
 {
        int c;
diff --git a/fs/jffs2/compr_rubin.h b/fs/jffs2/compr_rubin.h
deleted file mode 100644
index bf1a93451621..000000000000
--- a/fs/jffs2/compr_rubin.h
+++ /dev/null
@@ -1,21 +0,0 @@
-/* Rubin encoder/decoder header       */
-/* work started at   : aug   3, 1994  */
-/* last modification : aug  15, 1994  */
-/* $Id: compr_rubin.h,v 1.7 2005/11/07 11:14:38 gleixner Exp $ */
-#include "pushpull.h"
-#define RUBIN_REG_SIZE   16
-#define UPPER_BIT_RUBIN    (((long) 1)<<(RUBIN_REG_SIZE-1))
-#define LOWER_BITS_RUBIN   ((((long) 1)<<(RUBIN_REG_SIZE-1))-1)
-struct rubin_state {
-        unsigned long p;
-        unsigned long q;
-        unsigned long rec_q;
-        long bit_number;
-        struct pushpull pp;
-        int bit_divider;
-        int bits[8];
-};
diff --git a/fs/jffs2/compr_zlib.c b/fs/jffs2/compr_zlib.c
index 0c1fc6e20b43..2b87fccc1557 100644
--- a/fs/jffs2/compr_zlib.c
+++ b/fs/jffs2/compr_zlib.c
@@ -1,14 +1,12 @@
 /*
 * JFFS2 -- Journalling Flash File System, Version 2.
 *
- * Copyright (C) 2001-2003 Red Hat, Inc.
+ * Copyright © 2001-2007 Red Hat, Inc.
 *
 * Created by David Woodhouse <dwmw2@infradead.org>
 *
 * For licensing information, see the file 'LICENCE' in this directory.
 *
- * $Id: compr_zlib.c,v 1.32 2005/11/07 11:14:38 gleixner Exp $
- *
 */
 #if !defined(__KERNEL__) && !defined(__ECOS)
diff --git a/fs/jffs2/comprtest.c b/fs/jffs2/comprtest.c
deleted file mode 100644
index f0fb8be7740c..000000000000
--- a/fs/jffs2/comprtest.c
+++ /dev/null
@@ -1,307 +0,0 @@
-/* $Id: comprtest.c,v 1.6 2005/11/07 11:14:38 gleixner Exp $ */
-#include <linux/kernel.h>
-#include <linux/string.h>
-#include <linux/module.h>
-#include <asm/types.h>
-#if 0
-#define TESTDATA_LEN 512
-static unsigned char testdata[TESTDATA_LEN] = {
- 0x7f, 0x45, 0x4c, 0x46, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x02, 0x00, 0x03, 0x00, 0x01, 0x00, 0x00, 0x00, 0x60, 0x83, 0x04, 0x08, 0x34, 0x00, 0x00, 0x00,
- 0xb0, 0x29, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x34, 0x00, 0x20, 0x00, 0x06, 0x00, 0x28, 0x00,
- 0x1e, 0x00, 0x1b, 0x00, 0x06, 0x00, 0x00, 0x00, 0x34, 0x00, 0x00, 0x00, 0x34, 0x80, 0x04, 0x08,
- 0x34, 0x80, 0x04, 0x08, 0xc0, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00,
- 0x04, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0xf4, 0x00, 0x00, 0x00, 0xf4, 0x80, 0x04, 0x08,
- 0xf4, 0x80, 0x04, 0x08, 0x13, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
- 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x04, 0x08,
- 0x00, 0x80, 0x04, 0x08, 0x0d, 0x05, 0x00, 0x00, 0x0d, 0x05, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00,
- 0x00, 0x10, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x10, 0x05, 0x00, 0x00, 0x10, 0x95, 0x04, 0x08,
- 0x10, 0x95, 0x04, 0x08, 0xe8, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00,
- 0x00, 0x10, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x58, 0x05, 0x00, 0x00, 0x58, 0x95, 0x04, 0x08,
- 0x58, 0x95, 0x04, 0x08, 0xa0, 0x00, 0x00, 0x00, 0xa0, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00,
- 0x04, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x08, 0x01, 0x00, 0x00, 0x08, 0x81, 0x04, 0x08,
- 0x08, 0x81, 0x04, 0x08, 0x20, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
- 0x04, 0x00, 0x00, 0x00, 0x2f, 0x6c, 0x69, 0x62, 0x2f, 0x6c, 0x64, 0x2d, 0x6c, 0x69, 0x6e, 0x75,
- 0x78, 0x2e, 0x73, 0x6f, 0x2e, 0x32, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00,
- 0x01, 0x00, 0x00, 0x00, 0x47, 0x4e, 0x55, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00,
- 0x07, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x04, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x69, 0x00, 0x00, 0x00,
- 0x0c, 0x83, 0x04, 0x08, 0x81, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x30, 0x00, 0x00, 0x00,
- 0x1c, 0x83, 0x04, 0x08, 0xac, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x57, 0x00, 0x00, 0x00,
- 0x2c, 0x83, 0x04, 0x08, 0xdd, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x1a, 0x00, 0x00, 0x00,
- 0x3c, 0x83, 0x04, 0x08, 0x2e, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00,
- 0x4c, 0x83, 0x04, 0x08, 0x7d, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00,
- 0x00, 0x85, 0x04, 0x08, 0x04, 0x00, 0x00, 0x00, 0x11, 0x00, 0x0e, 0x00, 0x01, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x00, 0x5f, 0x5f, 0x67,
- 0x6d, 0x6f, 0x6e, 0x5f, 0x73, 0x74, 0x61, 0x72, 0x74, 0x5f, 0x5f, 0x00, 0x6c, 0x69, 0x62, 0x63,
- 0x2e, 0x73, 0x6f, 0x2e, 0x36, 0x00, 0x70, 0x72, 0x69, 0x6e, 0x74, 0x66, 0x00, 0x5f, 0x5f, 0x63};
-#else
-#define TESTDATA_LEN 3481
-static unsigned char testdata[TESTDATA_LEN] = {
- 0x23, 0x69, 0x6e, 0x63, 0x6c, 0x75, 0x64, 0x65, 0x20, 0x22, 0x64, 0x62, 0x65, 0x6e, 0x63, 0x68,
- 0x2e, 0x68, 0x22, 0x0a, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4d, 0x41, 0x58,
- 0x5f, 0x46, 0x49, 0x4c, 0x45, 0x53, 0x20, 0x31, 0x30, 0x30, 0x30, 0x0a, 0x0a, 0x73, 0x74, 0x61,
- 0x74, 0x69, 0x63, 0x20, 0x63, 0x68, 0x61, 0x72, 0x20, 0x62, 0x75, 0x66, 0x5b, 0x37, 0x30, 0x30,
- 0x30, 0x30, 0x5d, 0x3b, 0x0a, 0x65, 0x78, 0x74, 0x65, 0x72, 0x6e, 0x20, 0x69, 0x6e, 0x74, 0x20,
- 0x6c, 0x69, 0x6e, 0x65, 0x5f, 0x63, 0x6f, 0x75, 0x6e, 0x74, 0x3b, 0x0a, 0x0a, 0x73, 0x74, 0x61,
- 0x74, 0x69, 0x63, 0x20, 0x73, 0x74, 0x72, 0x75, 0x63, 0x74, 0x20, 0x7b, 0x0a, 0x09, 0x69, 0x6e,
- 0x74, 0x20, 0x66, 0x64, 0x3b, 0x0a, 0x09, 0x69, 0x6e, 0x74, 0x20, 0x68, 0x61, 0x6e, 0x64, 0x6c,
- 0x65, 0x3b, 0x0a, 0x7d, 0x20, 0x66, 0x74, 0x61, 0x62, 0x6c, 0x65, 0x5b, 0x4d, 0x41, 0x58, 0x5f,
- 0x46, 0x49, 0x4c, 0x45, 0x53, 0x5d, 0x3b, 0x0a, 0x0a, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x64, 0x6f,
- 0x5f, 0x75, 0x6e, 0x6c, 0x69, 0x6e, 0x6b, 0x28, 0x63, 0x68, 0x61, 0x72, 0x20, 0x2a, 0x66, 0x6e,
- 0x61, 0x6d, 0x65, 0x29, 0x0a, 0x7b, 0x0a, 0x09, 0x73, 0x74, 0x72, 0x75, 0x70, 0x70, 0x65, 0x72,
- 0x28, 0x66, 0x6e, 0x61, 0x6d, 0x65, 0x29, 0x3b, 0x0a, 0x0a, 0x09, 0x69, 0x66, 0x20, 0x28, 0x75,
- 0x6e, 0x6c, 0x69, 0x6e, 0x6b, 0x28, 0x66, 0x6e, 0x61, 0x6d, 0x65, 0x29, 0x20, 0x21, 0x3d, 0x20,
- 0x30, 0x29, 0x20, 0x7b, 0x0a, 0x09, 0x09, 0x70, 0x72, 0x69, 0x6e, 0x74, 0x66, 0x28, 0x22, 0x28,
- 0x25, 0x64, 0x29, 0x20, 0x75, 0x6e, 0x6c, 0x69, 0x6e, 0x6b, 0x20, 0x25, 0x73, 0x20, 0x66, 0x61,
- 0x69, 0x6c, 0x65, 0x64, 0x20, 0x28, 0x25, 0x73, 0x29, 0x5c, 0x6e, 0x22, 0x2c, 0x20, 0x0a, 0x09,
- 0x09, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x69, 0x6e, 0x65, 0x5f, 0x63, 0x6f, 0x75,
- 0x6e, 0x74, 0x2c, 0x20, 0x66, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x73, 0x74, 0x72, 0x65, 0x72,
- 0x72, 0x6f, 0x72, 0x28, 0x65, 0x72, 0x72, 0x6e, 0x6f, 0x29, 0x29, 0x3b, 0x0a, 0x09, 0x7d, 0x0a,
- 0x7d, 0x0a, 0x0a, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x65, 0x78, 0x70, 0x61, 0x6e, 0x64, 0x5f, 0x66,
- 0x69, 0x6c, 0x65, 0x28, 0x69, 0x6e, 0x74, 0x20, 0x66, 0x64, 0x2c, 0x20, 0x69, 0x6e, 0x74, 0x20,
- 0x73, 0x69, 0x7a, 0x65, 0x29, 0x0a, 0x7b, 0x0a, 0x09, 0x69, 0x6e, 0x74, 0x20, 0x73, 0x3b, 0x0a,
- 0x09, 0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x20, 0x7b, 0x0a,
- 0x09, 0x09, 0x73, 0x20, 0x3d, 0x20, 0x4d, 0x49, 0x4e, 0x28, 0x73, 0x69, 0x7a, 0x65, 0x6f, 0x66,
- 0x28, 0x62, 0x75, 0x66, 0x29, 0x2c, 0x20, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x3b, 0x0a, 0x09, 0x09,
- 0x77, 0x72, 0x69, 0x74, 0x65, 0x28, 0x66, 0x64, 0x2c, 0x20, 0x62, 0x75, 0x66, 0x2c, 0x20, 0x73,
- 0x29, 0x3b, 0x0a, 0x09, 0x09, 0x73, 0x69, 0x7a, 0x65, 0x20, 0x2d, 0x3d, 0x20, 0x73, 0x3b, 0x0a,
- 0x09, 0x7d, 0x0a, 0x7d, 0x0a, 0x0a, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x64, 0x6f, 0x5f, 0x6f, 0x70,
- 0x65, 0x6e, 0x28, 0x63, 0x68, 0x61, 0x72, 0x20, 0x2a, 0x66, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20,
- 0x69, 0x6e, 0x74, 0x20, 0x68, 0x61, 0x6e, 0x64, 0x6c, 0x65, 0x2c, 0x20, 0x69, 0x6e, 0x74, 0x20,
- 0x73, 0x69, 0x7a, 0x65, 0x29, 0x0a, 0x7b, 0x0a, 0x09, 0x69, 0x6e, 0x74, 0x20, 0x66, 0x64, 0x2c,
- 0x20, 0x69, 0x3b, 0x0a, 0x09, 0x69, 0x6e, 0x74, 0x20, 0x66, 0x6c, 0x61, 0x67, 0x73, 0x20, 0x3d,
- 0x20, 0x4f, 0x5f, 0x52, 0x44, 0x57, 0x52, 0x7c, 0x4f, 0x5f, 0x43, 0x52, 0x45, 0x41, 0x54, 0x3b,
- 0x0a, 0x09, 0x73, 0x74, 0x72, 0x75, 0x63, 0x74, 0x20, 0x73, 0x74, 0x61, 0x74, 0x20, 0x73, 0x74,
- 0x3b, 0x0a, 0x09, 0x73, 0x74, 0x61, 0x74, 0x69, 0x63, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x63, 0x6f,
- 0x75, 0x6e, 0x74, 0x3b, 0x0a, 0x0a, 0x09, 0x73, 0x74, 0x72, 0x75, 0x70, 0x70, 0x65, 0x72, 0x28,
- 0x66, 0x6e, 0x61, 0x6d, 0x65, 0x29, 0x3b, 0x0a, 0x0a, 0x09, 0x69, 0x66, 0x20, 0x28, 0x73, 0x69,
- 0x7a, 0x65, 0x20, 0x3d, 0x3d, 0x20, 0x30, 0x29, 0x20, 0x66, 0x6c, 0x61, 0x67, 0x73, 0x20, 0x7c,
- 0x3d, 0x20, 0x4f, 0x5f, 0x54, 0x52, 0x55, 0x4e, 0x43, 0x3b, 0x0a, 0x0a, 0x09, 0x66, 0x64, 0x20,
- 0x3d, 0x20, 0x6f, 0x70, 0x65, 0x6e, 0x28, 0x66, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x66, 0x6c,
- 0x61, 0x67, 0x73, 0x2c, 0x20, 0x30, 0x36, 0x30, 0x30, 0x29, 0x3b, 0x0a, 0x09, 0x69, 0x66, 0x20,
- 0x28, 0x66, 0x64, 0x20, 0x3d, 0x3d, 0x20, 0x2d, 0x31, 0x29, 0x20, 0x7b, 0x0a, 0x09, 0x09, 0x70,
- 0x72, 0x69, 0x6e, 0x74, 0x66, 0x28, 0x22, 0x28, 0x25, 0x64, 0x29, 0x20, 0x6f, 0x70, 0x65, 0x6e,
- 0x20, 0x25, 0x73, 0x20, 0x66, 0x61, 0x69, 0x6c, 0x65, 0x64, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x68,
- 0x61, 0x6e, 0x64, 0x6c, 0x65, 0x20, 0x25, 0x64, 0x20, 0x28, 0x25, 0x73, 0x29, 0x5c, 0x6e, 0x22,
- 0x2c, 0x20, 0x0a, 0x09, 0x09, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x69, 0x6e, 0x65,
- 0x5f, 0x63, 0x6f, 0x75, 0x6e, 0x74, 0x2c, 0x20, 0x66, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x68,
- 0x61, 0x6e, 0x64, 0x6c, 0x65, 0x2c, 0x20, 0x73, 0x74, 0x72, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x28,
- 0x65, 0x72, 0x72, 0x6e, 0x6f, 0x29, 0x29, 0x3b, 0x0a, 0x09, 0x09, 0x72, 0x65, 0x74, 0x75, 0x72,
- 0x6e, 0x3b, 0x0a, 0x09, 0x7d, 0x0a, 0x09, 0x66, 0x73, 0x74, 0x61, 0x74, 0x28, 0x66, 0x64, 0x2c,
- 0x20, 0x26, 0x73, 0x74, 0x29, 0x3b, 0x0a, 0x09, 0x69, 0x66, 0x20, 0x28, 0x73, 0x69, 0x7a, 0x65,
- 0x20, 0x3e, 0x20, 0x73, 0x74, 0x2e, 0x73, 0x74, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x20, 0x7b,
- 0x0a, 0x23, 0x69, 0x66, 0x20, 0x44, 0x45, 0x42, 0x55, 0x47, 0x0a, 0x09, 0x09, 0x70, 0x72, 0x69,
- 0x6e, 0x74, 0x66, 0x28, 0x22, 0x28, 0x25, 0x64, 0x29, 0x20, 0x65, 0x78, 0x70, 0x61, 0x6e, 0x64,
- 0x69, 0x6e, 0x67, 0x20, 0x25, 0x73, 0x20, 0x74, 0x6f, 0x20, 0x25, 0x64, 0x20, 0x66, 0x72, 0x6f,
- 0x6d, 0x20, 0x25, 0x64, 0x5c, 0x6e, 0x22, 0x2c, 0x20, 0x0a, 0x09, 0x09, 0x20, 0x20, 0x20, 0x20,
- 0x20, 0x20, 0x20, 0x6c, 0x69, 0x6e, 0x65, 0x5f, 0x63, 0x6f, 0x75, 0x6e, 0x74, 0x2c, 0x20, 0x66,
- 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x73, 0x69, 0x7a, 0x65, 0x2c, 0x20, 0x28, 0x69, 0x6e, 0x74,
- 0x29, 0x73, 0x74, 0x2e, 0x73, 0x74, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x3b, 0x0a, 0x23, 0x65,
- 0x6e, 0x64, 0x69, 0x66, 0x0a, 0x09, 0x09, 0x65, 0x78, 0x70, 0x61, 0x6e, 0x64, 0x5f, 0x66, 0x69,
- 0x6c, 0x65, 0x28, 0x66, 0x64, 0x2c, 0x20, 0x73, 0x69, 0x7a, 0x65, 0x20, 0x2d, 0x20, 0x73, 0x74,
- 0x2e, 0x73, 0x74, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x3b, 0x0a, 0x09, 0x7d, 0x20, 0x65, 0x6c,
- 0x73, 0x65, 0x20, 0x69, 0x66, 0x20, 0x28, 0x73, 0x69, 0x7a, 0x65, 0x20, 0x3c, 0x20, 0x73, 0x74,
- 0x2e, 0x73, 0x74, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x20, 0x7b, 0x0a, 0x09, 0x09, 0x70, 0x72,
- 0x69, 0x6e, 0x74, 0x66, 0x28, 0x22, 0x74, 0x72, 0x75, 0x6e, 0x63, 0x61, 0x74, 0x69, 0x6e, 0x67,
- 0x20, 0x25, 0x73, 0x20, 0x74, 0x6f, 0x20, 0x25, 0x64, 0x20, 0x66, 0x72, 0x6f, 0x6d, 0x20, 0x25,
- 0x64, 0x5c, 0x6e, 0x22, 0x2c, 0x20, 0x0a, 0x09, 0x09, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
- 0x66, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x73, 0x69, 0x7a, 0x65, 0x2c, 0x20, 0x28, 0x69, 0x6e,
- 0x74, 0x29, 0x73, 0x74, 0x2e, 0x73, 0x74, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x3b, 0x0a, 0x09,
- 0x09, 0x66, 0x74, 0x72, 0x75, 0x6e, 0x63, 0x61, 0x74, 0x65, 0x28, 0x66, 0x64, 0x2c, 0x20, 0x73,
- 0x69, 0x7a, 0x65, 0x29, 0x3b, 0x0a, 0x09, 0x7d, 0x0a, 0x09, 0x66, 0x6f, 0x72, 0x20, 0x28, 0x69,
- 0x3d, 0x30, 0x3b, 0x69, 0x3c, 0x4d, 0x41, 0x58, 0x5f, 0x46, 0x49, 0x4c, 0x45, 0x53, 0x3b, 0x69,
- 0x2b, 0x2b, 0x29, 0x20, 0x7b, 0x0a, 0x09, 0x09, 0x69, 0x66, 0x20, 0x28, 0x66, 0x74, 0x61, 0x62,
- 0x6c, 0x65, 0x5b, 0x69, 0x5d, 0x2e, 0x68, 0x61, 0x6e, 0x64, 0x6c, 0x65, 0x20, 0x3d, 0x3d, 0x20,
- 0x30, 0x29, 0x20, 0x62, 0x72, 0x65, 0x61, 0x6b, 0x3b, 0x0a, 0x09, 0x7d, 0x0a, 0x09, 0x69, 0x66,
- 0x20, 0x28, 0x69, 0x20, 0x3d, 0x3d, 0x20, 0x4d, 0x41, 0x58, 0x5f, 0x46, 0x49, 0x4c, 0x45, 0x53,
- 0x29, 0x20, 0x7b, 0x0a, 0x09, 0x09, 0x70, 0x72, 0x69, 0x6e, 0x74, 0x66, 0x28, 0x22, 0x66, 0x69,
- 0x6c, 0x65, 0x20, 0x74, 0x61, 0x62, 0x6c, 0x65, 0x20, 0x66, 0x75, 0x6c, 0x6c, 0x20, 0x66, 0x6f,
- 0x72, 0x20, 0x25, 0x73, 0x5c, 0x6e, 0x22, 0x2c, 0x20, 0x66, 0x6e, 0x61, 0x6d, 0x65, 0x29, 0x3b,
- 0x0a, 0x09, 0x09, 0x65, 0x78, 0x69, 0x74, 0x28, 0x31, 0x29, 0x3b, 0x0a, 0x09, 0x7d, 0x0a, 0x09,
- 0x66, 0x74, 0x61, 0x62, 0x6c, 0x65, 0x5b, 0x69, 0x5d, 0x2e, 0x68, 0x61, 0x6e, 0x64, 0x6c, 0x65,
- 0x20, 0x3d, 0x20, 0x68, 0x61, 0x6e, 0x64, 0x6c, 0x65, 0x3b, 0x0a, 0x09, 0x66, 0x74, 0x61, 0x62,
- 0x6c, 0x65, 0x5b, 0x69, 0x5d, 0x2e, 0x66, 0x64, 0x20, 0x3d, 0x20, 0x66, 0x64, 0x3b, 0x0a, 0x09,
- 0x69, 0x66, 0x20, 0x28, 0x63, 0x6f, 0x75, 0x6e, 0x74, 0x2b, 0x2b, 0x20, 0x25, 0x20, 0x31, 0x30,
- 0x30, 0x20, 0x3d, 0x3d, 0x20, 0x30, 0x29, 0x20, 0x7b, 0x0a, 0x09, 0x09, 0x70, 0x72, 0x69, 0x6e,
- 0x74, 0x66, 0x28, 0x22, 0x2e, 0x22, 0x29, 0x3b, 0x0a, 0x09, 0x7d, 0x0a, 0x7d, 0x0a, 0x0a, 0x76,
- 0x6f, 0x69, 0x64, 0x20, 0x64, 0x6f, 0x5f, 0x77, 0x72, 0x69, 0x74, 0x65, 0x28, 0x69, 0x6e, 0x74,
- 0x20, 0x68, 0x61, 0x6e, 0x64, 0x6c, 0x65, 0x2c, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x73, 0x69, 0x7a,
- 0x65, 0x2c, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x6f, 0x66, 0x66, 0x73, 0x65, 0x74, 0x29, 0x0a, 0x7b,
- 0x0a, 0x09, 0x69, 0x6e, 0x74, 0x20, 0x69, 0x3b, 0x0a, 0x0a, 0x09, 0x69, 0x66, 0x20, 0x28, 0x62,
- 0x75, 0x66, 0x5b, 0x30, 0x5d, 0x20, 0x3d, 0x3d, 0x20, 0x30, 0x29, 0x20, 0x6d, 0x65, 0x6d, 0x73,
- 0x65, 0x74, 0x28, 0x62, 0x75, 0x66, 0x2c, 0x20, 0x31, 0x2c, 0x20, 0x73, 0x69, 0x7a, 0x65, 0x6f,
- 0x66, 0x28, 0x62, 0x75, 0x66, 0x29, 0x29, 0x3b, 0x0a, 0x0a, 0x09, 0x66, 0x6f, 0x72, 0x20, 0x28,
- 0x69, 0x3d, 0x30, 0x3b, 0x69, 0x3c, 0x4d, 0x41, 0x58, 0x5f, 0x46, 0x49, 0x4c, 0x45, 0x53, 0x3b,
- 0x69, 0x2b, 0x2b, 0x29, 0x20, 0x7b, 0x0a, 0x09, 0x09, 0x69, 0x66, 0x20, 0x28, 0x66, 0x74, 0x61,
- 0x62, 0x6c, 0x65, 0x5b, 0x69, 0x5d, 0x2e, 0x68, 0x61, 0x6e, 0x64, 0x6c, 0x65, 0x20, 0x3d, 0x3d,
- 0x20, 0x68, 0x61, 0x6e, 0x64, 0x6c, 0x65, 0x29, 0x20, 0x62, 0x72, 0x65, 0x61, 0x6b, 0x3b, 0x0a,
- 0x09, 0x7d, 0x0a, 0x09, 0x69, 0x66, 0x20, 0x28, 0x69, 0x20, 0x3d, 0x3d, 0x20, 0x4d, 0x41, 0x58,
- 0x5f, 0x46, 0x49, 0x4c, 0x45, 0x53, 0x29, 0x20, 0x7b, 0x0a, 0x23, 0x69, 0x66, 0x20, 0x31, 0x0a,
- 0x09, 0x09, 0x70, 0x72, 0x69, 0x6e, 0x74, 0x66, 0x28, 0x22, 0x28, 0x25, 0x64, 0x29, 0x20, 0x64,
- 0x6f, 0x5f, 0x77, 0x72, 0x69, 0x74, 0x65, 0x3a, 0x20, 0x68, 0x61, 0x6e, 0x64, 0x6c, 0x65, 0x20,
- 0x25, 0x64, 0x20, 0x77, 0x61, 0x73, 0x20, 0x6e, 0x6f, 0x74, 0x20, 0x6f, 0x70, 0x65, 0x6e, 0x20,
- 0x73, 0x69, 0x7a, 0x65, 0x3d, 0x25, 0x64, 0x20, 0x6f, 0x66, 0x73, 0x3d, 0x25, 0x64, 0x5c, 0x6e,
- 0x22, 0x2c, 0x20, 0x0a, 0x09, 0x09, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x69, 0x6e,
- 0x65, 0x5f, 0x63, 0x6f, 0x75, 0x6e, 0x74, 0x2c, 0x20, 0x68, 0x61, 0x6e, 0x64, 0x6c, 0x65, 0x2c,
- 0x20, 0x73, 0x69, 0x7a, 0x65, 0x2c, 0x20, 0x6f, 0x66, 0x66, 0x73, 0x65, 0x74, 0x29, 0x3b, 0x0a,
- 0x23, 0x65, 0x6e, 0x64, 0x69, 0x66, 0x0a, 0x09, 0x09, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x3b,
- 0x0a, 0x09, 0x7d, 0x0a, 0x09, 0x6c, 0x73, 0x65, 0x65, 0x6b, 0x28, 0x66, 0x74, 0x61, 0x62, 0x6c,
- 0x65, 0x5b, 0x69, 0x5d, 0x2e, 0x66, 0x64, 0x2c, 0x20, 0x6f, 0x66, 0x66, 0x73, 0x65, 0x74, 0x2c,
- 0x20, 0x53, 0x45, 0x45, 0x4b, 0x5f, 0x53, 0x45, 0x54, 0x29, 0x3b, 0x0a, 0x09, 0x69, 0x66, 0x20,
- 0x28, 0x77, 0x72, 0x69, 0x74, 0x65, 0x28, 0x66, 0x74, 0x61, 0x62, 0x6c, 0x65, 0x5b, 0x69, 0x5d,
- 0x2e, 0x66, 0x64, 0x2c, 0x20, 0x62, 0x75, 0x66, 0x2c, 0x20, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x20,
- 0x21, 0x3d, 0x20, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x20, 0x7b, 0x0a, 0x09, 0x09, 0x70, 0x72, 0x69,
- 0x6e, 0x74, 0x66, 0x28, 0x22, 0x77, 0x72, 0x69, 0x74, 0x65, 0x20, 0x66, 0x61, 0x69, 0x6c, 0x65,
- 0x64, 0x20, 0x6f, 0x6e, 0x20, 0x68, 0x61, 0x6e, 0x64, 0x6c, 0x65, 0x20, 0x25, 0x64, 0x5c, 0x6e,
- 0x22, 0x2c, 0x20, 0x68, 0x61, 0x6e, 0x64, 0x6c, 0x65, 0x29, 0x3b, 0x0a, 0x09, 0x7d, 0x0a, 0x7d,
- 0x0a, 0x0a, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x64, 0x6f, 0x5f, 0x72, 0x65, 0x61, 0x64, 0x28, 0x69,
- 0x6e, 0x74, 0x20, 0x68, 0x61, 0x6e, 0x64, 0x6c, 0x65, 0x2c, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x73,
- 0x69, 0x7a, 0x65, 0x2c, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x6f, 0x66, 0x66, 0x73, 0x65, 0x74, 0x29,
- 0x0a, 0x7b, 0x0a, 0x09, 0x69, 0x6e, 0x74, 0x20, 0x69, 0x3b, 0x0a, 0x09, 0x66, 0x6f, 0x72, 0x20,
- 0x28, 0x69, 0x3d, 0x30, 0x3b, 0x69, 0x3c, 0x4d, 0x41, 0x58, 0x5f, 0x46, 0x49, 0x4c, 0x45, 0x53,
- 0x3b, 0x69, 0x2b, 0x2b, 0x29, 0x20, 0x7b, 0x0a, 0x09, 0x09, 0x69, 0x66, 0x20, 0x28, 0x66, 0x74,
- 0x61, 0x62, 0x6c, 0x65, 0x5b, 0x69, 0x5d, 0x2e, 0x68, 0x61, 0x6e, 0x64, 0x6c, 0x65, 0x20, 0x3d,
- 0x3d, 0x20, 0x68, 0x61, 0x6e, 0x64, 0x6c, 0x65, 0x29, 0x20, 0x62, 0x72, 0x65, 0x61, 0x6b, 0x3b,
- 0x0a, 0x09, 0x7d, 0x0a, 0x09, 0x69, 0x66, 0x20, 0x28, 0x69, 0x20, 0x3d, 0x3d, 0x20, 0x4d, 0x41,
- 0x58, 0x5f, 0x46, 0x49, 0x4c, 0x45, 0x53, 0x29, 0x20, 0x7b, 0x0a, 0x09, 0x09, 0x70, 0x72, 0x69,
- 0x6e, 0x74, 0x66, 0x28, 0x22, 0x28, 0x25, 0x64, 0x29, 0x20, 0x64, 0x6f, 0x5f, 0x72, 0x65, 0x61,
- 0x64, 0x3a, 0x20, 0x68, 0x61, 0x6e, 0x64, 0x6c, 0x65, 0x20, 0x25, 0x64, 0x20, 0x77, 0x61, 0x73,
- 0x20, 0x6e, 0x6f, 0x74, 0x20, 0x6f, 0x70, 0x65, 0x6e, 0x20, 0x73, 0x69, 0x7a, 0x65, 0x3d, 0x25,
- 0x64, 0x20, 0x6f, 0x66, 0x73, 0x3d, 0x25, 0x64, 0x5c, 0x6e, 0x22, 0x2c, 0x20, 0x0a, 0x09, 0x09,
- 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x69, 0x6e, 0x65, 0x5f, 0x63, 0x6f, 0x75, 0x6e,
- 0x74, 0x2c, 0x20, 0x68, 0x61, 0x6e, 0x64, 0x6c, 0x65, 0x2c, 0x20, 0x73, 0x69, 0x7a, 0x65, 0x2c,
- 0x20, 0x6f, 0x66, 0x66, 0x73, 0x65, 0x74, 0x29, 0x3b, 0x0a, 0x09, 0x09, 0x72, 0x65, 0x74, 0x75,
- 0x72, 0x6e, 0x3b, 0x0a, 0x09, 0x7d, 0x0a, 0x09, 0x6c, 0x73, 0x65, 0x65, 0x6b, 0x28, 0x66, 0x74,
- 0x61, 0x62, 0x6c, 0x65, 0x5b, 0x69, 0x5d, 0x2e, 0x66, 0x64, 0x2c, 0x20, 0x6f, 0x66, 0x66, 0x73,
- 0x65, 0x74, 0x2c, 0x20, 0x53, 0x45, 0x45, 0x4b, 0x5f, 0x53, 0x45, 0x54, 0x29, 0x3b, 0x0a, 0x09,
- 0x72, 0x65, 0x61, 0x64, 0x28, 0x66, 0x74, 0x61, 0x62, 0x6c, 0x65, 0x5b, 0x69, 0x5d, 0x2e, 0x66,
- 0x64, 0x2c, 0x20, 0x62, 0x75, 0x66, 0x2c, 0x20, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x3b, 0x0a, 0x7d,
- 0x0a, 0x0a, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x64, 0x6f, 0x5f, 0x63, 0x6c, 0x6f, 0x73, 0x65, 0x28,
- 0x69, 0x6e, 0x74, 0x20, 0x68, 0x61, 0x6e, 0x64, 0x6c, 0x65, 0x29, 0x0a, 0x7b, 0x0a, 0x09, 0x69,
- 0x6e, 0x74, 0x20, 0x69, 0x3b, 0x0a, 0x09, 0x66, 0x6f, 0x72, 0x20, 0x28, 0x69, 0x3d, 0x30, 0x3b,
- 0x69, 0x3c, 0x4d, 0x41, 0x58, 0x5f, 0x46, 0x49, 0x4c, 0x45, 0x53, 0x3b, 0x69, 0x2b, 0x2b, 0x29,
- 0x20, 0x7b, 0x0a, 0x09, 0x09, 0x69, 0x66, 0x20, 0x28, 0x66, 0x74, 0x61, 0x62, 0x6c, 0x65, 0x5b,
- 0x69, 0x5d, 0x2e, 0x68, 0x61, 0x6e, 0x64, 0x6c, 0x65, 0x20, 0x3d, 0x3d, 0x20, 0x68, 0x61, 0x6e,
- 0x64, 0x6c, 0x65, 0x29, 0x20, 0x62, 0x72, 0x65, 0x61, 0x6b, 0x3b, 0x0a, 0x09, 0x7d, 0x0a, 0x09,
- 0x69, 0x66, 0x20, 0x28, 0x69, 0x20, 0x3d, 0x3d, 0x20, 0x4d, 0x41, 0x58, 0x5f, 0x46, 0x49, 0x4c,
- 0x45, 0x53, 0x29, 0x20, 0x7b, 0x0a, 0x09, 0x09, 0x70, 0x72, 0x69, 0x6e, 0x74, 0x66, 0x28, 0x22,
- 0x28, 0x25, 0x64, 0x29, 0x20, 0x64, 0x6f, 0x5f, 0x63, 0x6c, 0x6f, 0x73, 0x65, 0x3a, 0x20, 0x68,
- 0x61, 0x6e, 0x64, 0x6c, 0x65, 0x20, 0x25, 0x64, 0x20, 0x77, 0x61, 0x73, 0x20, 0x6e, 0x6f, 0x74,
- 0x20, 0x6f, 0x70, 0x65, 0x6e, 0x5c, 0x6e, 0x22, 0x2c, 0x20, 0x0a, 0x09, 0x09, 0x20, 0x20, 0x20,
- 0x20, 0x20, 0x20, 0x20, 0x6c, 0x69, 0x6e, 0x65, 0x5f, 0x63, 0x6f, 0x75, 0x6e, 0x74, 0x2c, 0x20,
- 0x68, 0x61, 0x6e, 0x64, 0x6c, 0x65, 0x29, 0x3b, 0x0a, 0x09, 0x09, 0x72, 0x65, 0x74, 0x75, 0x72,
- 0x6e, 0x3b, 0x0a, 0x09, 0x7d, 0x0a, 0x09, 0x63, 0x6c, 0x6f, 0x73, 0x65, 0x28, 0x66, 0x74, 0x61,
- 0x62, 0x6c, 0x65, 0x5b, 0x69, 0x5d, 0x2e, 0x66, 0x64, 0x29, 0x3b, 0x0a, 0x09, 0x66, 0x74, 0x61,
- 0x62, 0x6c, 0x65, 0x5b, 0x69, 0x5d, 0x2e, 0x68, 0x61, 0x6e, 0x64, 0x6c, 0x65, 0x20, 0x3d, 0x20,
- 0x30, 0x3b, 0x0a, 0x7d, 0x0a, 0x0a, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x64, 0x6f, 0x5f, 0x6d, 0x6b,
- 0x64, 0x69, 0x72, 0x28, 0x63, 0x68, 0x61, 0x72, 0x20, 0x2a, 0x66, 0x6e, 0x61, 0x6d, 0x65, 0x29,
- 0x0a, 0x7b, 0x0a, 0x09, 0x73, 0x74, 0x72, 0x75, 0x70, 0x70, 0x65, 0x72, 0x28, 0x66, 0x6e, 0x61,
- 0x6d, 0x65, 0x29, 0x3b, 0x0a, 0x0a, 0x09, 0x69, 0x66, 0x20, 0x28, 0x6d, 0x6b, 0x64, 0x69, 0x72,
- 0x28, 0x66, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x30, 0x37, 0x30, 0x30, 0x29, 0x20, 0x21, 0x3d,
- 0x20, 0x30, 0x29, 0x20, 0x7b, 0x0a, 0x23, 0x69, 0x66, 0x20, 0x44, 0x45, 0x42, 0x55, 0x47, 0x0a,
- 0x09, 0x09, 0x70, 0x72, 0x69, 0x6e, 0x74, 0x66, 0x28, 0x22, 0x6d, 0x6b, 0x64, 0x69, 0x72, 0x20,
- 0x25, 0x73, 0x20, 0x66, 0x61, 0x69, 0x6c, 0x65, 0x64, 0x20, 0x28, 0x25, 0x73, 0x29, 0x5c, 0x6e,
- 0x22, 0x2c, 0x20, 0x0a, 0x09, 0x09, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6e, 0x61,
- 0x6d, 0x65, 0x2c, 0x20, 0x73, 0x74, 0x72, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x28, 0x65, 0x72, 0x72,
- 0x6e, 0x6f, 0x29, 0x29, 0x3b, 0x0a, 0x23, 0x65, 0x6e, 0x64, 0x69, 0x66, 0x0a, 0x09, 0x7d, 0x0a,
- 0x7d, 0x0a, 0x0a, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x64, 0x6f, 0x5f, 0x72, 0x6d, 0x64, 0x69, 0x72,
- 0x28, 0x63, 0x68, 0x61, 0x72, 0x20, 0x2a, 0x66, 0x6e, 0x61, 0x6d, 0x65, 0x29, 0x0a, 0x7b, 0x0a,
- 0x09, 0x73, 0x74, 0x72, 0x75, 0x70, 0x70, 0x65, 0x72, 0x28, 0x66, 0x6e, 0x61, 0x6d, 0x65, 0x29,
- 0x3b, 0x0a, 0x0a, 0x09, 0x69, 0x66, 0x20, 0x28, 0x72, 0x6d, 0x64, 0x69, 0x72, 0x28, 0x66, 0x6e,
- 0x61, 0x6d, 0x65, 0x29, 0x20, 0x21, 0x3d, 0x20, 0x30, 0x29, 0x20, 0x7b, 0x0a, 0x09, 0x09, 0x70,
- 0x72, 0x69, 0x6e, 0x74, 0x66, 0x28, 0x22, 0x72, 0x6d, 0x64, 0x69, 0x72, 0x20, 0x25, 0x73, 0x20,
- 0x66, 0x61, 0x69, 0x6c, 0x65, 0x64, 0x20, 0x28, 0x25, 0x73, 0x29, 0x5c, 0x6e, 0x22, 0x2c, 0x20,
- 0x0a, 0x09, 0x09, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6e, 0x61, 0x6d, 0x65, 0x2c,
- 0x20, 0x73, 0x74, 0x72, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x28, 0x65, 0x72, 0x72, 0x6e, 0x6f, 0x29,
- 0x29, 0x3b, 0x0a, 0x09, 0x7d, 0x0a, 0x7d, 0x0a, 0x0a, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x64, 0x6f,
- 0x5f, 0x72, 0x65, 0x6e, 0x61, 0x6d, 0x65, 0x28, 0x63, 0x68, 0x61, 0x72, 0x20, 0x2a, 0x6f, 0x6c,
- 0x64, 0x2c, 0x20, 0x63, 0x68, 0x61, 0x72, 0x20, 0x2a, 0x6e, 0x65, 0x77, 0x29, 0x0a, 0x7b, 0x0a,
- 0x09, 0x73, 0x74, 0x72, 0x75, 0x70, 0x70, 0x65, 0x72, 0x28, 0x6f, 0x6c, 0x64, 0x29, 0x3b, 0x0a,
- 0x09, 0x73, 0x74, 0x72, 0x75, 0x70, 0x70, 0x65, 0x72, 0x28, 0x6e, 0x65, 0x77, 0x29, 0x3b, 0x0a,
- 0x0a, 0x09, 0x69, 0x66, 0x20, 0x28, 0x72, 0x65, 0x6e, 0x61, 0x6d, 0x65, 0x28, 0x6f, 0x6c, 0x64,
- 0x2c, 0x20, 0x6e, 0x65, 0x77, 0x29, 0x20, 0x21, 0x3d, 0x20, 0x30, 0x29, 0x20, 0x7b, 0x0a, 0x09,
- 0x09, 0x70, 0x72, 0x69, 0x6e, 0x74, 0x66, 0x28, 0x22, 0x72, 0x65, 0x6e, 0x61, 0x6d, 0x65, 0x20,
- 0x25, 0x73, 0x20, 0x25, 0x73, 0x20, 0x66, 0x61, 0x69, 0x6c, 0x65, 0x64, 0x20, 0x28, 0x25, 0x73,
- 0x29, 0x5c, 0x6e, 0x22, 0x2c, 0x20, 0x0a, 0x09, 0x09, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
- 0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x6e, 0x65, 0x77, 0x2c, 0x20, 0x73, 0x74, 0x72, 0x65, 0x72, 0x72,
- 0x6f, 0x72, 0x28, 0x65, 0x72, 0x72, 0x6e, 0x6f, 0x29, 0x29, 0x3b, 0x0a, 0x09, 0x7d, 0x0a, 0x7d,
- 0x0a, 0x0a, 0x0a, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x64, 0x6f, 0x5f, 0x73, 0x74, 0x61, 0x74, 0x28,
- 0x63, 0x68, 0x61, 0x72, 0x20, 0x2a, 0x66, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x69, 0x6e, 0x74,
- 0x20, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x0a, 0x7b, 0x0a, 0x09, 0x73, 0x74, 0x72, 0x75, 0x63, 0x74,
- 0x20, 0x73, 0x74, 0x61, 0x74, 0x20, 0x73, 0x74, 0x3b, 0x0a, 0x0a, 0x09, 0x73, 0x74, 0x72, 0x75,
- 0x70, 0x70, 0x65, 0x72, 0x28, 0x66, 0x6e, 0x61, 0x6d, 0x65, 0x29, 0x3b, 0x0a, 0x0a, 0x09, 0x69,
- 0x66, 0x20, 0x28, 0x73, 0x74, 0x61, 0x74, 0x28, 0x66, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x26,
- 0x73, 0x74, 0x29, 0x20, 0x21, 0x3d, 0x20, 0x30, 0x29, 0x20, 0x7b, 0x0a, 0x09, 0x09, 0x70, 0x72,
- 0x69, 0x6e, 0x74, 0x66, 0x28, 0x22, 0x28, 0x25, 0x64, 0x29, 0x20, 0x64, 0x6f, 0x5f, 0x73, 0x74,
- 0x61, 0x74, 0x3a, 0x20, 0x25, 0x73, 0x20, 0x73, 0x69, 0x7a, 0x65, 0x3d, 0x25, 0x64, 0x20, 0x25,
- 0x73, 0x5c, 0x6e, 0x22, 0x2c, 0x20, 0x0a, 0x09, 0x09, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
- 0x6c, 0x69, 0x6e, 0x65, 0x5f, 0x63, 0x6f, 0x75, 0x6e, 0x74, 0x2c, 0x20, 0x66, 0x6e, 0x61, 0x6d,
- 0x65, 0x2c, 0x20, 0x73, 0x69, 0x7a, 0x65, 0x2c, 0x20, 0x73, 0x74, 0x72, 0x65, 0x72, 0x72, 0x6f,
- 0x72, 0x28, 0x65, 0x72, 0x72, 0x6e, 0x6f, 0x29, 0x29, 0x3b, 0x0a, 0x09, 0x09, 0x72, 0x65, 0x74,
- 0x75, 0x72, 0x6e, 0x3b, 0x0a, 0x09, 0x7d, 0x0a, 0x09, 0x69, 0x66, 0x20, 0x28, 0x53, 0x5f, 0x49,
- 0x53, 0x44, 0x49, 0x52, 0x28, 0x73, 0x74, 0x2e, 0x73, 0x74, 0x5f, 0x6d, 0x6f, 0x64, 0x65, 0x29,
- 0x29, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x3b, 0x0a, 0x0a, 0x09, 0x69, 0x66, 0x20, 0x28,
- 0x73, 0x74, 0x2e, 0x73, 0x74, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x20, 0x21, 0x3d, 0x20, 0x73, 0x69,
- 0x7a, 0x65, 0x29, 0x20, 0x7b, 0x0a, 0x09, 0x09, 0x70, 0x72, 0x69, 0x6e, 0x74, 0x66, 0x28, 0x22,
- 0x28, 0x25, 0x64, 0x29, 0x20, 0x64, 0x6f, 0x5f, 0x73, 0x74, 0x61, 0x74, 0x3a, 0x20, 0x25, 0x73,
- 0x20, 0x77, 0x72, 0x6f, 0x6e, 0x67, 0x20, 0x73, 0x69, 0x7a, 0x65, 0x20, 0x25, 0x64, 0x20, 0x25,
- 0x64, 0x5c, 0x6e, 0x22, 0x2c, 0x20, 0x0a, 0x09, 0x09, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
- 0x6c, 0x69, 0x6e, 0x65, 0x5f, 0x63, 0x6f, 0x75, 0x6e, 0x74, 0x2c, 0x20, 0x66, 0x6e, 0x61, 0x6d,
- 0x65, 0x2c, 0x20, 0x28, 0x69, 0x6e, 0x74, 0x29, 0x73, 0x74, 0x2e, 0x73, 0x74, 0x5f, 0x73, 0x69,
- 0x7a, 0x65, 0x2c, 0x20, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x3b, 0x0a, 0x09, 0x7d, 0x0a, 0x7d, 0x0a,
- 0x0a, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x64, 0x6f, 0x5f, 0x63, 0x72, 0x65, 0x61, 0x74, 0x65, 0x28,
- 0x63, 0x68, 0x61, 0x72, 0x20, 0x2a, 0x66, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x69, 0x6e, 0x74,
- 0x20, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x0a, 0x7b, 0x0a, 0x09, 0x64, 0x6f, 0x5f, 0x6f, 0x70, 0x65,
- 0x6e, 0x28, 0x66, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x35, 0x30, 0x30, 0x30, 0x2c, 0x20, 0x73,
- 0x69, 0x7a, 0x65, 0x29, 0x3b, 0x0a, 0x09, 0x64, 0x6f, 0x5f, 0x63, 0x6c, 0x6f, 0x73, 0x65, 0x28,
- 0x35, 0x30, 0x30, 0x30, 0x29, 0x3b, 0x0a, 0x7d, 0x0a
-};
-#endif
-static unsigned char comprbuf[TESTDATA_LEN];
-static unsigned char decomprbuf[TESTDATA_LEN];
-int jffs2_decompress(unsigned char comprtype, unsigned char *cdata_in,
-                     unsigned char *data_out, uint32_t cdatalen, uint32_t datalen);
-unsigned char jffs2_compress(unsigned char *data_in, unsigned char *cpage_out,
-                             uint32_t *datalen, uint32_t *cdatalen);
-int init_module(void ) {
-        unsigned char comprtype;
-        uint32_t c, d;
-        int ret;
-        printk("Original data: %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x\n",
-               testdata[0],testdata[1],testdata[2],testdata[3],
-               testdata[4],testdata[5],testdata[6],testdata[7],
-               testdata[8],testdata[9],testdata[10],testdata[11],
-               testdata[12],testdata[13],testdata[14],testdata[15]);
-        d = TESTDATA_LEN;
-        c = TESTDATA_LEN;
-        comprtype = jffs2_compress(testdata, comprbuf, &d, &c);
-        printk("jffs2_compress used compression type %d. Compressed size %d, uncompressed size %d\n",
-               comprtype, c, d);
-        printk("Compressed data: %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x\n",
-               comprbuf[0],comprbuf[1],comprbuf[2],comprbuf[3],
-               comprbuf[4],comprbuf[5],comprbuf[6],comprbuf[7],
-               comprbuf[8],comprbuf[9],comprbuf[10],comprbuf[11],
-               comprbuf[12],comprbuf[13],comprbuf[14],comprbuf[15]);
-        ret = jffs2_decompress(comprtype, comprbuf, decomprbuf, c, d);
-        printk("jffs2_decompress returned %d\n", ret);
-        printk("Decompressed data:  %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x\n",
-               decomprbuf[0],decomprbuf[1],decomprbuf[2],decomprbuf[3],
-               decomprbuf[4],decomprbuf[5],decomprbuf[6],decomprbuf[7],
-               decomprbuf[8],decomprbuf[9],decomprbuf[10],decomprbuf[11],
-               decomprbuf[12],decomprbuf[13],decomprbuf[14],decomprbuf[15]);
-        if (memcmp(decomprbuf, testdata, d))
-                printk("Compression and decompression corrupted data\n");
-        else
-                printk("Compression good for %d bytes\n", d);
-        return 1;
-}
diff --git a/fs/jffs2/debug.c b/fs/jffs2/debug.c
index 4189e4a36050..3a32c64ed497 100644
--- a/fs/jffs2/debug.c
+++ b/fs/jffs2/debug.c
@@ -1,15 +1,14 @@
 /*
 * JFFS2 -- Journalling Flash File System, Version 2.
 *
- * Copyright (C) 2001-2003 Red Hat, Inc.
+ * Copyright © 2001-2007 Red Hat, Inc.
 *
 * Created by David Woodhouse <dwmw2@infradead.org>
 *
 * For licensing information, see the file 'LICENCE' in this directory.
 *
- * $Id: debug.c,v 1.12 2005/11/07 11:14:39 gleixner Exp $
- *
 */
 #include <linux/kernel.h>
 #include <linux/types.h>
 #include <linux/pagemap.h>
diff --git a/fs/jffs2/debug.h b/fs/jffs2/debug.h
index f89c85d5a3f8..2a49f2c51a9f 100644
--- a/fs/jffs2/debug.h
+++ b/fs/jffs2/debug.h
@@ -1,15 +1,14 @@
 /*
 * JFFS2 -- Journalling Flash File System, Version 2.
 *
- * Copyright (C) 2001-2003 Red Hat, Inc.
+ * Copyright © 2001-2007 Red Hat, Inc.
 *
 * Created by David Woodhouse <dwmw2@infradead.org>
 *
 * For licensing information, see the file 'LICENCE' in this directory.
 *
- * $Id: debug.h,v 1.21 2005/11/07 11:14:39 gleixner Exp $
- *
 */
 #ifndef _JFFS2_DEBUG_H_
 #define _JFFS2_DEBUG_H_
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index 9fa2e27f0641..c1dfca310dd6 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -1,14 +1,12 @@
 /*
 * JFFS2 -- Journalling Flash File System, Version 2.
 *
- * Copyright (C) 2001-2003 Red Hat, Inc.
+ * Copyright © 2001-2007 Red Hat, Inc.
 *
 * Created by David Woodhouse <dwmw2@infradead.org>
 *
 * For licensing information, see the file 'LICENCE' in this directory.
 *
- * $Id: dir.c,v 1.90 2005/11/07 11:14:39 gleixner Exp $
- *
 */
 #include <linux/kernel.h>
diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c
index ad0121088dde..66e7c2f1e644 100644
--- a/fs/jffs2/erase.c
+++ b/fs/jffs2/erase.c
@@ -1,14 +1,12 @@
 /*
 * JFFS2 -- Journalling Flash File System, Version 2.
 *
- * Copyright (C) 2001-2003 Red Hat, Inc.
+ * Copyright © 2001-2007 Red Hat, Inc.
 *
 * Created by David Woodhouse <dwmw2@infradead.org>
 *
 * For licensing information, see the file 'LICENCE' in this directory.
 *
- * $Id: erase.c,v 1.85 2005/09/20 14:53:15 dedekind Exp $
- *
 */
 #include <linux/kernel.h>
@@ -333,7 +331,7 @@ static int jffs2_block_check_erase(struct jffs2_sb_info *c, struct jffs2_erasebl
                *bad_offset = ofs;
-                ret = jffs2_flash_read(c, ofs, readlen, &retlen, ebuf);
+                ret = c->mtd->read(c->mtd, ofs, readlen, &retlen, ebuf);
                if (ret) {
                        printk(KERN_WARNING "Read of newly-erased block at 0x%08x failed: %d. Putting on bad_list\n", ofs, ret);
                        goto fail;
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index e82eeaf7590d..99871279a1ed 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -1,14 +1,12 @@
 /*
 * JFFS2 -- Journalling Flash File System, Version 2.
 *
- * Copyright (C) 2001-2003 Red Hat, Inc.
+ * Copyright © 2001-2007 Red Hat, Inc.
 *
 * Created by David Woodhouse <dwmw2@infradead.org>
 *
 * For licensing information, see the file 'LICENCE' in this directory.
 *
- * $Id: file.c,v 1.104 2005/10/18 23:29:35 tpoynor Exp $
- *
 */
 #include <linux/kernel.h>
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index abb90c0c09cc..1d3b7a9fc828 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -1,14 +1,12 @@
 /*
 * JFFS2 -- Journalling Flash File System, Version 2.
 *
- * Copyright (C) 2001-2003 Red Hat, Inc.
+ * Copyright © 2001-2007 Red Hat, Inc.
 *
 * Created by David Woodhouse <dwmw2@infradead.org>
 *
 * For licensing information, see the file 'LICENCE' in this directory.
 *
- * $Id: fs.c,v 1.66 2005/09/27 13:17:29 dedekind Exp $
- *
 */
 #include <linux/capability.h>
@@ -672,6 +670,13 @@ static int jffs2_flash_setup(struct jffs2_sb_info *c) {
                        return ret;
        }
+        /* and an UBI volume */
+        if (jffs2_ubivol(c)) {
+                ret = jffs2_ubivol_setup(c);
+                if (ret)
+                        return ret;
+        }
        return ret;
 }
@@ -690,4 +695,9 @@ void jffs2_flash_cleanup(struct jffs2_sb_info *c) {
        if (jffs2_nor_wbuf_flash(c)) {
                jffs2_nor_wbuf_flash_cleanup(c);
        }
+        /* and an UBI volume */
+        if (jffs2_ubivol(c)) {
+                jffs2_ubivol_cleanup(c);
+        }
 }
diff --git a/fs/jffs2/gc.c b/fs/jffs2/gc.c
index 3a3cf225981f..2d99e06ab223 100644
--- a/fs/jffs2/gc.c
+++ b/fs/jffs2/gc.c
@@ -1,14 +1,12 @@
 /*
 * JFFS2 -- Journalling Flash File System, Version 2.
 *
- * Copyright (C) 2001-2003 Red Hat, Inc.
+ * Copyright © 2001-2007 Red Hat, Inc.
 *
 * Created by David Woodhouse <dwmw2@infradead.org>
 *
 * For licensing information, see the file 'LICENCE' in this directory.
 *
- * $Id: gc.c,v 1.155 2005/11/07 11:14:39 gleixner Exp $
- *
 */
 #include <linux/kernel.h>
@@ -144,7 +142,8 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
                               c->unchecked_size);
                        jffs2_dbg_dump_block_lists_nolock(c);
                        spin_unlock(&c->erase_completion_lock);
-                        BUG();
+                        up(&c->alloc_sem);
+                        return -ENOSPC;
                }
                spin_unlock(&c->erase_completion_lock);
diff --git a/fs/jffs2/ioctl.c b/fs/jffs2/ioctl.c
index 69099835de1c..f4d525b0ea53 100644
--- a/fs/jffs2/ioctl.c
+++ b/fs/jffs2/ioctl.c
@@ -1,14 +1,12 @@
 /*
 * JFFS2 -- Journalling Flash File System, Version 2.
 *
- * Copyright (C) 2001-2003 Red Hat, Inc.
+ * Copyright © 2001-2007 Red Hat, Inc.
 *
 * Created by David Woodhouse <dwmw2@infradead.org>
 *
 * For licensing information, see the file 'LICENCE' in this directory.
 *
- * $Id: ioctl.c,v 1.10 2005/11/07 11:14:40 gleixner Exp $
- *
 */
 #include <linux/fs.h>
diff --git a/fs/jffs2/jffs2_fs_i.h b/fs/jffs2/jffs2_fs_i.h
index 3a566077ac95..0b78fdc9773b 100644
--- a/fs/jffs2/jffs2_fs_i.h
+++ b/fs/jffs2/jffs2_fs_i.h
@@ -1,4 +1,13 @@
-/* $Id: jffs2_fs_i.h,v 1.19 2005/11/07 11:14:52 gleixner Exp $ */
+/*
+ * JFFS2 -- Journalling Flash File System, Version 2.
+ *
+ * Copyright © 2001-2007 Red Hat, Inc.
+ *
+ * Created by David Woodhouse <dwmw2@infradead.org>
+ *
+ * For licensing information, see the file 'LICENCE' in this directory.
+ *
+ */
 #ifndef _JFFS2_FS_I
 #define _JFFS2_FS_I
diff --git a/fs/jffs2/jffs2_fs_sb.h b/fs/jffs2/jffs2_fs_sb.h
index b98594992eed..b13298a824ed 100644
--- a/fs/jffs2/jffs2_fs_sb.h
+++ b/fs/jffs2/jffs2_fs_sb.h
@@ -1,4 +1,13 @@
-/* $Id: jffs2_fs_sb.h,v 1.54 2005/09/21 13:37:34 dedekind Exp $ */
+/*
+ * JFFS2 -- Journalling Flash File System, Version 2.
+ *
+ * Copyright © 2001-2007 Red Hat, Inc.
+ *
+ * Created by David Woodhouse <dwmw2@infradead.org>
+ *
+ * For licensing information, see the file 'LICENCE' in this directory.
+ *
+ */
 #ifndef _JFFS2_FS_SB
 #define _JFFS2_FS_SB
@@ -98,20 +107,14 @@ struct jffs2_sb_info {
        uint32_t wbuf_pagesize; /* 0 for NOR and other flashes with no wbuf */
 #ifdef CONFIG_JFFS2_FS_WRITEBUFFER
-        /* Write-behind buffer for NAND flash */
+        unsigned char *wbuf; /* Write-behind buffer for NAND flash */
-        unsigned char *wbuf;
-        unsigned char *oobbuf;
        uint32_t wbuf_ofs;
        uint32_t wbuf_len;
        struct jffs2_inodirty *wbuf_inodes;
        struct rw_semaphore wbuf_sem;   /* Protects the write buffer */
-        /* Information about out-of-band area usage... */
+        unsigned char *oobbuf;
-        struct nand_ecclayout *ecclayout;
+        int oobavail; /* How many bytes are available for JFFS2 in OOB */
-        uint32_t badblock_pos;
-        uint32_t fsdata_pos;
-        uint32_t fsdata_len;
 #endif
        struct jffs2_summary *summary;          /* Summary information */
diff --git a/fs/jffs2/malloc.c b/fs/jffs2/malloc.c
index 83f9881ec4cc..35c1a5e30ba1 100644
--- a/fs/jffs2/malloc.c
+++ b/fs/jffs2/malloc.c
@@ -1,14 +1,12 @@
 /*
 * JFFS2 -- Journalling Flash File System, Version 2.
 *
- * Copyright (C) 2001-2003 Red Hat, Inc.
+ * Copyright © 2001-2007 Red Hat, Inc.
 *
 * Created by David Woodhouse <dwmw2@infradead.org>
 *
 * For licensing information, see the file 'LICENCE' in this directory.
 *
- * $Id: malloc.c,v 1.31 2005/11/07 11:14:40 gleixner Exp $
- *
 */
 #include <linux/kernel.h>
diff --git a/fs/jffs2/nodelist.c b/fs/jffs2/nodelist.c
index 5a6b4d64206c..4bf86088b3ae 100644
--- a/fs/jffs2/nodelist.c
+++ b/fs/jffs2/nodelist.c
@@ -1,14 +1,12 @@
 /*
 * JFFS2 -- Journalling Flash File System, Version 2.
 *
- * Copyright (C) 2001-2003 Red Hat, Inc.
+ * Copyright © 2001-2007 Red Hat, Inc.
 *
 * Created by David Woodhouse <dwmw2@infradead.org>
 *
 * For licensing information, see the file 'LICENCE' in this directory.
 *
- * $Id: nodelist.c,v 1.115 2005/11/07 11:14:40 gleixner Exp $
- *
 */
 #include <linux/kernel.h>
@@ -54,7 +52,7 @@ void jffs2_add_fd_to_list(struct jffs2_sb_info *c, struct jffs2_full_dirent *new
        *prev = new;
 }
-void jffs2_truncate_fragtree(struct jffs2_sb_info *c, struct rb_root *list, uint32_t size)
+uint32_t jffs2_truncate_fragtree(struct jffs2_sb_info *c, struct rb_root *list, uint32_t size)
 {
        struct jffs2_node_frag *frag = jffs2_lookup_node_frag(list, size);
@@ -76,18 +74,24 @@ void jffs2_truncate_fragtree(struct jffs2_sb_info *c, struct rb_root *list, uint
        }
        if (size == 0)
-                return;
+                return 0;
-        /*
-         * If the last fragment starts at the RAM page boundary, it is
-         * REF_PRISTINE irrespective of its size.
-         */
        frag = frag_last(list);
+        /* Sanity check for truncation to longer than we started with... */
+        if (!frag)
+                return 0;
+        if (frag->ofs + frag->size < size)
+                return frag->ofs + frag->size;
+        /* If the last fragment starts at the RAM page boundary, it is
+         * REF_PRISTINE irrespective of its size. */
        if (frag->node && (frag->ofs & (PAGE_CACHE_SIZE - 1)) == 0) {
                dbg_fragtree2("marking the last fragment 0x%08x-0x%08x REF_PRISTINE.\n",
                        frag->ofs, frag->ofs + frag->size);
                frag->node->raw->flash_offset = ref_offset(frag->node->raw) | REF_PRISTINE;
        }
+        return size;
 }
 static void jffs2_obsolete_node_frag(struct jffs2_sb_info *c,
@@ -397,466 +401,6 @@ int jffs2_add_full_dnode_to_inode(struct jffs2_sb_info *c, struct jffs2_inode_in
        return 0;
 }
-/*
- * Check the data CRC of the node.
- *
- * Returns: 0 if the data CRC is correct;
- *          1 - if incorrect;
- *          error code if an error occured.
- */
-static int check_node_data(struct jffs2_sb_info *c, struct jffs2_tmp_dnode_info *tn)
-{
-        struct jffs2_raw_node_ref *ref = tn->fn->raw;
-        int err = 0, pointed = 0;
-        struct jffs2_eraseblock *jeb;
-        unsigned char *buffer;
-        uint32_t crc, ofs, len;
-        size_t retlen;
-        BUG_ON(tn->csize == 0);
-        if (!jffs2_is_writebuffered(c))
-                goto adj_acc;
-        /* Calculate how many bytes were already checked */
-        ofs = ref_offset(ref) + sizeof(struct jffs2_raw_inode);
-        len = ofs % c->wbuf_pagesize;
-        if (likely(len))
-                len = c->wbuf_pagesize - len;
-        if (len >= tn->csize) {
-                dbg_readinode("no need to check node at %#08x, data length %u, data starts at %#08x - it has already been checked.\n",
-                        ref_offset(ref), tn->csize, ofs);
-                goto adj_acc;
-        }
-        ofs += len;
-        len = tn->csize - len;
-        dbg_readinode("check node at %#08x, data length %u, partial CRC %#08x, correct CRC %#08x, data starts at %#08x, start checking from %#08x - %u bytes.\n",
-                ref_offset(ref), tn->csize, tn->partial_crc, tn->data_crc, ofs - len, ofs, len);
-#ifndef __ECOS
-        /* TODO: instead, incapsulate point() stuff to jffs2_flash_read(),
-         * adding and jffs2_flash_read_end() interface. */
-        if (c->mtd->point) {
-                err = c->mtd->point(c->mtd, ofs, len, &retlen, &buffer);
-                if (!err && retlen < tn->csize) {
-                        JFFS2_WARNING("MTD point returned len too short: %zu instead of %u.\n", retlen, tn->csize);
-                        c->mtd->unpoint(c->mtd, buffer, ofs, len);
-                } else if (err)
-                        JFFS2_WARNING("MTD point failed: error code %d.\n", err);
-                else
-                        pointed = 1; /* succefully pointed to device */
-        }
-#endif
-        if (!pointed) {
-                buffer = kmalloc(len, GFP_KERNEL);
-                if (unlikely(!buffer))
-                        return -ENOMEM;
-                /* TODO: this is very frequent pattern, make it a separate
-                 * routine */
-                err = jffs2_flash_read(c, ofs, len, &retlen, buffer);
-                if (err) {
-                        JFFS2_ERROR("can not read %d bytes from 0x%08x, error code: %d.\n", len, ofs, err);
-                        goto free_out;
-                }
-                if (retlen != len) {
-                        JFFS2_ERROR("short read at %#08x: %zd instead of %d.\n", ofs, retlen, len);
-                        err = -EIO;
-                        goto free_out;
-                }
-        }
-        /* Continue calculating CRC */
-        crc = crc32(tn->partial_crc, buffer, len);
-        if(!pointed)
-                kfree(buffer);
-#ifndef __ECOS
-        else
-                c->mtd->unpoint(c->mtd, buffer, ofs, len);
-#endif
-        if (crc != tn->data_crc) {
-                JFFS2_NOTICE("wrong data CRC in data node at 0x%08x: read %#08x, calculated %#08x.\n",
-                        ofs, tn->data_crc, crc);
-                return 1;
-        }
-adj_acc:
-        jeb = &c->blocks[ref->flash_offset / c->sector_size];
-        len = ref_totlen(c, jeb, ref);
-        /*
-         * Mark the node as having been checked and fix the
-         * accounting accordingly.
-         */
-        spin_lock(&c->erase_completion_lock);
-        jeb->used_size += len;
-        jeb->unchecked_size -= len;
-        c->used_size += len;
-        c->unchecked_size -= len;
-        spin_unlock(&c->erase_completion_lock);
-        return 0;
-free_out:
-        if(!pointed)
-                kfree(buffer);
-#ifndef __ECOS
-        else
-                c->mtd->unpoint(c->mtd, buffer, ofs, len);
-#endif
-        return err;
-}
-/*
- * Helper function for jffs2_add_older_frag_to_fragtree().
- *
- * Checks the node if we are in the checking stage.
- */
-static int check_node(struct jffs2_sb_info *c, struct jffs2_inode_info *f, struct jffs2_tmp_dnode_info *tn)
-{
-        int ret;
-        BUG_ON(ref_obsolete(tn->fn->raw));
-        /* We only check the data CRC of unchecked nodes */
-        if (ref_flags(tn->fn->raw) != REF_UNCHECKED)
-                return 0;
-        dbg_fragtree2("check node %#04x-%#04x, phys offs %#08x.\n",
-                tn->fn->ofs, tn->fn->ofs + tn->fn->size, ref_offset(tn->fn->raw));
-        ret = check_node_data(c, tn);
-        if (unlikely(ret < 0)) {
-                JFFS2_ERROR("check_node_data() returned error: %d.\n",
-                        ret);
-        } else if (unlikely(ret > 0)) {
-                dbg_fragtree2("CRC error, mark it obsolete.\n");
-                jffs2_mark_node_obsolete(c, tn->fn->raw);
-        }
-        return ret;
-}
-/*
- * Helper function for jffs2_add_older_frag_to_fragtree().
- *
- * Called when the new fragment that is being inserted
- * splits a hole fragment.
- */
-static int split_hole(struct jffs2_sb_info *c, struct rb_root *root,
-                      struct jffs2_node_frag *newfrag, struct jffs2_node_frag *hole)
-{
-        dbg_fragtree2("fragment %#04x-%#04x splits the hole %#04x-%#04x\n",
-                newfrag->ofs, newfrag->ofs + newfrag->size, hole->ofs, hole->ofs + hole->size);
-        if (hole->ofs == newfrag->ofs) {
-                /*
-                 * Well, the new fragment actually starts at the same offset as
-                 * the hole.
-                 */
-                if (hole->ofs + hole->size > newfrag->ofs + newfrag->size) {
-                        /*
-                         * We replace the overlapped left part of the hole by
-                         * the new node.
-                         */
-                        dbg_fragtree2("insert fragment %#04x-%#04x and cut the left part of the hole\n",
-                                newfrag->ofs, newfrag->ofs + newfrag->size);
-                        rb_replace_node(&hole->rb, &newfrag->rb, root);
-                        hole->ofs += newfrag->size;
-                        hole->size -= newfrag->size;
-                        /*
-                         * We know that 'hole' should be the right hand
-                         * fragment.
-                         */
-                        jffs2_fragtree_insert(hole, newfrag);
-                        rb_insert_color(&hole->rb, root);
-                } else {
-                        /*
-                         * Ah, the new fragment is of the same size as the hole.
-                         * Relace the hole by it.
-                         */
-                        dbg_fragtree2("insert fragment %#04x-%#04x and overwrite hole\n",
-                                newfrag->ofs, newfrag->ofs + newfrag->size);
-                        rb_replace_node(&hole->rb, &newfrag->rb, root);
-                        jffs2_free_node_frag(hole);
-                }
-        } else {
-                /* The new fragment lefts some hole space at the left */
-                struct jffs2_node_frag * newfrag2 = NULL;
-                if (hole->ofs + hole->size > newfrag->ofs + newfrag->size) {
-                        /* The new frag also lefts some space at the right */
-                        newfrag2 = new_fragment(NULL, newfrag->ofs +
-                                newfrag->size, hole->ofs + hole->size
-                                - newfrag->ofs - newfrag->size);
-                        if (unlikely(!newfrag2)) {
-                                jffs2_free_node_frag(newfrag);
-                                return -ENOMEM;
-                        }
-                }
-                hole->size = newfrag->ofs - hole->ofs;
-                dbg_fragtree2("left the hole %#04x-%#04x at the left and inserd fragment %#04x-%#04x\n",
-                        hole->ofs, hole->ofs + hole->size, newfrag->ofs, newfrag->ofs + newfrag->size);
-                jffs2_fragtree_insert(newfrag, hole);
-                rb_insert_color(&newfrag->rb, root);
-                if (newfrag2) {
-                        dbg_fragtree2("left the hole %#04x-%#04x at the right\n",
-                                newfrag2->ofs, newfrag2->ofs + newfrag2->size);
-                        jffs2_fragtree_insert(newfrag2, newfrag);
-                        rb_insert_color(&newfrag2->rb, root);
-                }
-        }
-        return 0;
-}
-/*
- * This function is used when we build inode. It expects the nodes are passed
- * in the decreasing version order. The whole point of this is to improve the
- * inodes checking on NAND: we check the nodes' data CRC only when they are not
- * obsoleted. Previously, add_frag_to_fragtree() function was used and
- * nodes were passed to it in the increasing version ordes and CRCs of all
- * nodes were checked.
- *
- * Note: tn->fn->size shouldn't be zero.
- *
- * Returns 0 if the node was inserted
- *         1 if it wasn't inserted (since it is obsolete)
- *         < 0 an if error occured
- */
-int jffs2_add_older_frag_to_fragtree(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
-                                     struct jffs2_tmp_dnode_info *tn)
-{
-        struct jffs2_node_frag *this, *newfrag;
-        uint32_t lastend;
-        struct jffs2_full_dnode *fn = tn->fn;
-        struct rb_root *root = &f->fragtree;
-        uint32_t fn_size = fn->size, fn_ofs = fn->ofs;
-        int err, checked = 0;
-        int ref_flag;
-        dbg_fragtree("insert fragment %#04x-%#04x, ver %u\n", fn_ofs, fn_ofs + fn_size, tn->version);
-        /* Skip all the nodes which are completed before this one starts */
-        this = jffs2_lookup_node_frag(root, fn_ofs);
-        if (this)
-                dbg_fragtree2("'this' found %#04x-%#04x (%s)\n", this->ofs, this->ofs + this->size, this->node ? "data" : "hole");
-        if (this)
-                lastend = this->ofs + this->size;
-        else
-                lastend = 0;
-        /* Detect the preliminary type of node */
-        if (fn->size >= PAGE_CACHE_SIZE)
-                ref_flag = REF_PRISTINE;
-        else
-                ref_flag = REF_NORMAL;
-        /* See if we ran off the end of the root */
-        if (lastend <= fn_ofs) {
-                /* We did */
-                /*
-                 * We are going to insert the new node into the
-                 * fragment tree, so check it.
-                 */
-                err = check_node(c, f, tn);
-                if (err != 0)
-                        return err;
-                fn->frags = 1;
-                newfrag = new_fragment(fn, fn_ofs, fn_size);
-                if (unlikely(!newfrag))
-                        return -ENOMEM;
-                err = no_overlapping_node(c, root, newfrag, this, lastend);
-                if (unlikely(err != 0)) {
-                        jffs2_free_node_frag(newfrag);
-                        return err;
-                }
-                goto out_ok;
-        }
-        fn->frags = 0;
-        while (1) {
-                /*
-                 * Here we have:
-                 * fn_ofs < this->ofs + this->size && fn_ofs >= this->ofs.
-                 *
-                 * Remember, 'this' has higher version, any non-hole node
-                 * which is already in the fragtree is newer then the newly
-                 * inserted.
-                 */
-                if (!this->node) {
-                        /*
-                         * 'this' is the hole fragment, so at least the
-                         * beginning of the new fragment is valid.
-                         */
-                        /*
-                         * We are going to insert the new node into the
-                         * fragment tree, so check it.
-                         */
-                        if (!checked) {
-                                err = check_node(c, f, tn);
-                                if (unlikely(err != 0))
-                                        return err;
-                                checked = 1;
-                        }
-                        if (this->ofs + this->size >= fn_ofs + fn_size) {
-                                /* We split the hole on two parts */
-                                fn->frags += 1;
-                                newfrag = new_fragment(fn, fn_ofs, fn_size);
-                                if (unlikely(!newfrag))
-                                        return -ENOMEM;
-                                err = split_hole(c, root, newfrag, this);
-                                if (unlikely(err))
-                                        return err;
-                                goto out_ok;
-                        }
-                        /*
-                         * The beginning of the new fragment is valid since it
-                         * overlaps the hole node.
-                         */
-                        ref_flag = REF_NORMAL;
-                        fn->frags += 1;
-                        newfrag = new_fragment(fn, fn_ofs,
-                                        this->ofs + this->size - fn_ofs);
-                        if (unlikely(!newfrag))
-                                return -ENOMEM;
-                        if (fn_ofs == this->ofs) {
-                                /*
-                                 * The new node starts at the same offset as
-                                 * the hole and supersieds the hole.
-                                 */
-                                dbg_fragtree2("add the new fragment instead of hole %#04x-%#04x, refcnt %d\n",
-                                        fn_ofs, fn_ofs + this->ofs + this->size - fn_ofs, fn->frags);
-                                rb_replace_node(&this->rb, &newfrag->rb, root);
-                                jffs2_free_node_frag(this);
-                        } else {
-                                /*
-                                 * The hole becomes shorter as its right part
-                                 * is supersieded by the new fragment.
-                                 */
-                                dbg_fragtree2("reduce size of hole %#04x-%#04x to %#04x-%#04x\n",
-                                        this->ofs, this->ofs + this->size, this->ofs, this->ofs + this->size - newfrag->size);
-                                dbg_fragtree2("add new fragment %#04x-%#04x, refcnt %d\n", fn_ofs,
-                                        fn_ofs + this->ofs + this->size - fn_ofs, fn->frags);
-                                this->size -= newfrag->size;
-                                jffs2_fragtree_insert(newfrag, this);
-                                rb_insert_color(&newfrag->rb, root);
-                        }
-                        fn_ofs += newfrag->size;
-                        fn_size -= newfrag->size;
-                        this = rb_entry(rb_next(&newfrag->rb),
-                                        struct jffs2_node_frag, rb);
-                        dbg_fragtree2("switch to the next 'this' fragment: %#04x-%#04x %s\n",
-                                this->ofs, this->ofs + this->size, this->node ? "(data)" : "(hole)");
-                }
-                /*
-                 * 'This' node is not the hole so it obsoletes the new fragment
-                 * either fully or partially.
-                 */
-                if (this->ofs + this->size >= fn_ofs + fn_size) {
-                        /* The new node is obsolete, drop it */
-                        if (fn->frags == 0) {
-                                dbg_fragtree2("%#04x-%#04x is obsolete, mark it obsolete\n", fn_ofs, fn_ofs + fn_size);
-                                ref_flag = REF_OBSOLETE;
-                        }
-                        goto out_ok;
-                } else {
-                        struct jffs2_node_frag *new_this;
-                        /* 'This' node obsoletes the beginning of the new node */
-                        dbg_fragtree2("the beginning %#04x-%#04x is obsolete\n", fn_ofs, this->ofs + this->size);
-                        ref_flag = REF_NORMAL;
-                        fn_size -= this->ofs + this->size - fn_ofs;
-                        fn_ofs = this->ofs + this->size;
-                        dbg_fragtree2("now considering %#04x-%#04x\n", fn_ofs, fn_ofs + fn_size);
-                        new_this = rb_entry(rb_next(&this->rb), struct jffs2_node_frag, rb);
-                        if (!new_this) {
-                                /*
-                                 * There is no next fragment. Add the rest of
-                                 * the new node as the right-hand child.
-                                 */
-                                if (!checked) {
-                                        err = check_node(c, f, tn);
-                                        if (unlikely(err != 0))
-                                                return err;
-                                        checked = 1;
-                                }
-                                fn->frags += 1;
-                                newfrag = new_fragment(fn, fn_ofs, fn_size);
-                                if (unlikely(!newfrag))
-                                        return -ENOMEM;
-                                dbg_fragtree2("there are no more fragments, insert %#04x-%#04x\n",
-                                        newfrag->ofs, newfrag->ofs + newfrag->size);
-                                rb_link_node(&newfrag->rb, &this->rb, &this->rb.rb_right);
-                                rb_insert_color(&newfrag->rb, root);
-                                goto out_ok;
-                        } else {
-                                this = new_this;
-                                dbg_fragtree2("switch to the next 'this' fragment: %#04x-%#04x %s\n",
-                                        this->ofs, this->ofs + this->size, this->node ? "(data)" : "(hole)");
-                        }
-                }
-        }
-out_ok:
-        BUG_ON(fn->size < PAGE_CACHE_SIZE && ref_flag == REF_PRISTINE);
-        if (ref_flag == REF_OBSOLETE) {
-                dbg_fragtree2("the node is obsolete now\n");
-                /* jffs2_mark_node_obsolete() will adjust space accounting */
-                jffs2_mark_node_obsolete(c, fn->raw);
-                return 1;
-        }
-        dbg_fragtree2("the node is \"%s\" now\n", ref_flag == REF_NORMAL ? "REF_NORMAL" : "REF_PRISTINE");
-        /* Space accounting was adjusted at check_node_data() */
-        spin_lock(&c->erase_completion_lock);
-        fn->raw->flash_offset = ref_offset(fn->raw) | ref_flag;
-        spin_unlock(&c->erase_completion_lock);
-        return 0;
-}
 void jffs2_set_inocache_state(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic, int state)
 {
        spin_lock(&c->inocache_lock);
diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h
index 4178b4b55948..25126a062cae 100644
--- a/fs/jffs2/nodelist.h
+++ b/fs/jffs2/nodelist.h
@@ -1,14 +1,12 @@
 /*
 * JFFS2 -- Journalling Flash File System, Version 2.
 *
- * Copyright (C) 2001-2003 Red Hat, Inc.
+ * Copyright © 2001-2007 Red Hat, Inc.
 *
 * Created by David Woodhouse <dwmw2@infradead.org>
 *
 * For licensing information, see the file 'LICENCE' in this directory.
 *
- * $Id: nodelist.h,v 1.140 2005/09/07 08:34:54 havasi Exp $
- *
 */
 #ifndef __JFFS2_NODELIST_H__
@@ -40,6 +38,9 @@
 #define cpu_to_je32(x) ((jint32_t){x})
 #define cpu_to_jemode(x) ((jmode_t){os_to_jffs2_mode(x)})
+#define constant_cpu_to_je16(x) ((jint16_t){x})
+#define constant_cpu_to_je32(x) ((jint32_t){x})
 #define je16_to_cpu(x) ((x).v16)
 #define je32_to_cpu(x) ((x).v32)
 #define jemode_to_cpu(x) (jffs2_to_os_mode((x).m))
@@ -48,6 +49,9 @@
 #define cpu_to_je32(x) ((jint32_t){cpu_to_be32(x)})
 #define cpu_to_jemode(x) ((jmode_t){cpu_to_be32(os_to_jffs2_mode(x))})
+#define constant_cpu_to_je16(x) ((jint16_t){__constant_cpu_to_be16(x)})
+#define constant_cpu_to_je32(x) ((jint32_t){__constant_cpu_to_be32(x)})
 #define je16_to_cpu(x) (be16_to_cpu(x.v16))
 #define je32_to_cpu(x) (be32_to_cpu(x.v32))
 #define jemode_to_cpu(x) (be32_to_cpu(jffs2_to_os_mode((x).m)))
@@ -56,6 +60,9 @@
 #define cpu_to_je32(x) ((jint32_t){cpu_to_le32(x)})
 #define cpu_to_jemode(x) ((jmode_t){cpu_to_le32(os_to_jffs2_mode(x))})
+#define constant_cpu_to_je16(x) ((jint16_t){__constant_cpu_to_le16(x)})
+#define constant_cpu_to_je32(x) ((jint32_t){__constant_cpu_to_le32(x)})
 #define je16_to_cpu(x) (le16_to_cpu(x.v16))
 #define je32_to_cpu(x) (le32_to_cpu(x.v32))
 #define jemode_to_cpu(x) (le32_to_cpu(jffs2_to_os_mode((x).m)))
@@ -216,7 +223,20 @@ struct jffs2_tmp_dnode_info
        uint32_t version;
        uint32_t data_crc;
        uint32_t partial_crc;
-        uint32_t csize;
+        uint16_t csize;
+        uint16_t overlapped;
+};
+/* Temporary data structure used during readinode. */
+struct jffs2_readinode_info
+{
+        struct rb_root tn_root;
+        struct jffs2_tmp_dnode_info *mdata_tn;
+        uint32_t highest_version;
+        uint32_t latest_mctime;
+        uint32_t mctime_ver;
+        struct jffs2_full_dirent *fds;
+        struct jffs2_raw_node_ref *latest_ref;
 };
 struct jffs2_full_dirent
@@ -319,6 +339,15 @@ static inline struct jffs2_node_frag *frag_last(struct rb_root *root)
 #define frag_right(frag) rb_entry((frag)->rb.rb_right, struct jffs2_node_frag, rb)
 #define frag_erase(frag, list) rb_erase(&frag->rb, list);
+#define tn_next(tn) rb_entry(rb_next(&(tn)->rb), struct jffs2_tmp_dnode_info, rb)
+#define tn_prev(tn) rb_entry(rb_prev(&(tn)->rb), struct jffs2_tmp_dnode_info, rb)
+#define tn_parent(tn) rb_entry(rb_parent(&(tn)->rb), struct jffs2_tmp_dnode_info, rb)
+#define tn_left(tn) rb_entry((tn)->rb.rb_left, struct jffs2_tmp_dnode_info, rb)
+#define tn_right(tn) rb_entry((tn)->rb.rb_right, struct jffs2_tmp_dnode_info, rb)
+#define tn_erase(tn, list) rb_erase(&tn->rb, list);
+#define tn_last(list) rb_entry(rb_last(list), struct jffs2_tmp_dnode_info, rb)
+#define tn_first(list) rb_entry(rb_first(list), struct jffs2_tmp_dnode_info, rb)
 /* nodelist.c */
 void jffs2_add_fd_to_list(struct jffs2_sb_info *c, struct jffs2_full_dirent *new, struct jffs2_full_dirent **list);
 void jffs2_set_inocache_state(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic, int state);
@@ -333,8 +362,7 @@ struct rb_node *rb_next(struct rb_node *);
 struct rb_node *rb_prev(struct rb_node *);
 void rb_replace_node(struct rb_node *victim, struct rb_node *new, struct rb_root *root);
 int jffs2_add_full_dnode_to_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f, struct jffs2_full_dnode *fn);
-void jffs2_truncate_fragtree (struct jffs2_sb_info *c, struct rb_root *list, uint32_t size);
+uint32_t jffs2_truncate_fragtree (struct jffs2_sb_info *c, struct rb_root *list, uint32_t size);
-int jffs2_add_older_frag_to_fragtree(struct jffs2_sb_info *c, struct jffs2_inode_info *f, struct jffs2_tmp_dnode_info *tn);
 struct jffs2_raw_node_ref *jffs2_link_node_ref(struct jffs2_sb_info *c,
                                               struct jffs2_eraseblock *jeb,
                                               uint32_t ofs, uint32_t len,
diff --git a/fs/jffs2/nodemgmt.c b/fs/jffs2/nodemgmt.c
index d88376992ed9..dbc908ad622b 100644
--- a/fs/jffs2/nodemgmt.c
+++ b/fs/jffs2/nodemgmt.c
@@ -1,14 +1,12 @@
 /*
 * JFFS2 -- Journalling Flash File System, Version 2.
 *
- * Copyright (C) 2001-2003 Red Hat, Inc.
+ * Copyright © 2001-2007 Red Hat, Inc.
 *
 * Created by David Woodhouse <dwmw2@infradead.org>
 *
 * For licensing information, see the file 'LICENCE' in this directory.
 *
- * $Id: nodemgmt.c,v 1.127 2005/09/20 15:49:12 dedekind Exp $
- *
 */
 #include <linux/kernel.h>
@@ -172,6 +170,11 @@ int jffs2_reserve_space_gc(struct jffs2_sb_info *c, uint32_t minsize,
 static void jffs2_close_nextblock(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb)
 {
+        if (c->nextblock == NULL) {
+                D1(printk(KERN_DEBUG "jffs2_close_nextblock: Erase block at 0x%08x has already been placed in a list\n",
+                  jeb->offset));
+                return;
+        }
        /* Check, if we have a dirty block now, or if it was dirty already */
        if (ISDIRTY (jeb->wasted_size + jeb->dirty_size)) {
                c->dirty_size += jeb->wasted_size;
diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h
index e07a0edcdb4f..80daea96bbc2 100644
--- a/fs/jffs2/os-linux.h
+++ b/fs/jffs2/os-linux.h
@@ -1,14 +1,12 @@
 /*
 * JFFS2 -- Journalling Flash File System, Version 2.
 *
- * Copyright (C) 2002-2003 Red Hat, Inc.
+ * Copyright © 2001-2007 Red Hat, Inc.
 *
 * Created by David Woodhouse <dwmw2@infradead.org>
 *
 * For licensing information, see the file 'LICENCE' in this directory.
 *
- * $Id: os-linux.h,v 1.64 2005/09/30 13:59:13 dedekind Exp $
- *
 */
 #ifndef __JFFS2_OS_LINUX_H__
@@ -98,6 +96,9 @@ static inline void jffs2_init_inode_info(struct jffs2_inode_info *f)
 #define jffs2_nor_wbuf_flash(c) (0)
 #define jffs2_nor_wbuf_flash_setup(c) (0)
 #define jffs2_nor_wbuf_flash_cleanup(c) do {} while (0)
+#define jffs2_ubivol(c) (0)
+#define jffs2_ubivol_setup(c) (0)
+#define jffs2_ubivol_cleanup(c) do {} while (0)
 #else /* NAND and/or ECC'd NOR support present */
@@ -133,6 +134,9 @@ void jffs2_nand_flash_cleanup(struct jffs2_sb_info *c);
 #define jffs2_dataflash(c) (c->mtd->type == MTD_DATAFLASH)
 int jffs2_dataflash_setup(struct jffs2_sb_info *c);
 void jffs2_dataflash_cleanup(struct jffs2_sb_info *c);
+#define jffs2_ubivol(c) (c->mtd->type == MTD_UBIVOLUME)
+int jffs2_ubivol_setup(struct jffs2_sb_info *c);
+void jffs2_ubivol_cleanup(struct jffs2_sb_info *c);
 #define jffs2_nor_wbuf_flash(c) (c->mtd->type == MTD_NORFLASH && ! (c->mtd->flags & MTD_BIT_WRITEABLE))
 int jffs2_nor_wbuf_flash_setup(struct jffs2_sb_info *c);
diff --git a/fs/jffs2/pushpull.h b/fs/jffs2/pushpull.h
deleted file mode 100644
index c0c2a9158dff..000000000000
--- a/fs/jffs2/pushpull.h
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * JFFS2 -- Journalling Flash File System, Version 2.
- *
- * Copyright (C) 2001, 2002 Red Hat, Inc.
- *
- * Created by David Woodhouse <dwmw2@infradead.org>
- *
- * For licensing information, see the file 'LICENCE' in this directory.
- *
- * $Id: pushpull.h,v 1.10 2004/11/16 20:36:11 dwmw2 Exp $
- *
- */
-#ifndef __PUSHPULL_H__
-#define __PUSHPULL_H__
-#include <linux/errno.h>
-struct pushpull {
-        unsigned char *buf;
-        unsigned int buflen;
-        unsigned int ofs;
-        unsigned int reserve;
-};
-static inline void init_pushpull(struct pushpull *pp, char *buf, unsigned buflen, unsigned ofs, unsigned reserve)
-{
-        pp->buf = buf;
-        pp->buflen = buflen;
-        pp->ofs = ofs;
-        pp->reserve = reserve;
-}
-static inline int pushbit(struct pushpull *pp, int bit, int use_reserved)
-{
-        if (pp->ofs >= pp->buflen - (use_reserved?0:pp->reserve)) {
-                return -ENOSPC;
-        }
-        if (bit) {
-                pp->buf[pp->ofs >> 3] |= (1<<(7-(pp->ofs &7)));
-        }
-        else {
-                pp->buf[pp->ofs >> 3] &= ~(1<<(7-(pp->ofs &7)));
-        }
-        pp->ofs++;
-        return 0;
-}
-static inline int pushedbits(struct pushpull *pp)
-{
-        return pp->ofs;
-}
-static inline int pullbit(struct pushpull *pp)
-{
-        int bit;
-        bit = (pp->buf[pp->ofs >> 3] >> (7-(pp->ofs & 7))) & 1;
-        pp->ofs++;
-        return bit;
-}
-static inline int pulledbits(struct pushpull *pp)
-{
-        return pp->ofs;
-}
-#endif /* __PUSHPULL_H__ */
diff --git a/fs/jffs2/read.c b/fs/jffs2/read.c
index f3b86da833ba..cfe05c1966a5 100644
--- a/fs/jffs2/read.c
+++ b/fs/jffs2/read.c
@@ -1,14 +1,12 @@
 /*
 * JFFS2 -- Journalling Flash File System, Version 2.
 *
- * Copyright (C) 2001-2003 Red Hat, Inc.
+ * Copyright © 2001-2007 Red Hat, Inc.
 *
 * Created by David Woodhouse <dwmw2@infradead.org>
 *
 * For licensing information, see the file 'LICENCE' in this directory.
 *
- * $Id: read.c,v 1.42 2005/11/07 11:14:41 gleixner Exp $
- *
 */
 #include <linux/kernel.h>
diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c
index 58a0b912e9d0..6aff38930b50 100644
--- a/fs/jffs2/readinode.c
+++ b/fs/jffs2/readinode.c
@@ -1,14 +1,12 @@
 /*
 * JFFS2 -- Journalling Flash File System, Version 2.
 *
- * Copyright (C) 2001-2003 Red Hat, Inc.
+ * Copyright © 2001-2007 Red Hat, Inc.
 *
 * Created by David Woodhouse <dwmw2@infradead.org>
 *
 * For licensing information, see the file 'LICENCE' in this directory.
 *
- * $Id: readinode.c,v 1.143 2005/11/07 11:14:41 gleixner Exp $
- *
 */
 #include <linux/kernel.h>
@@ -22,30 +20,510 @@
 #include "nodelist.h"
 /*
- * Put a new tmp_dnode_info into the temporaty RB-tree, keeping the list in
+ * Check the data CRC of the node.
- * order of increasing version.
+ *
+ * Returns: 0 if the data CRC is correct;
+ *          1 - if incorrect;
+ *          error code if an error occurred.
 */
-static void jffs2_add_tn_to_tree(struct jffs2_tmp_dnode_info *tn, struct rb_root *list)
+static int check_node_data(struct jffs2_sb_info *c, struct jffs2_tmp_dnode_info *tn)
 {
-        struct rb_node **p = &list->rb_node;
+        struct jffs2_raw_node_ref *ref = tn->fn->raw;
-        struct rb_node * parent = NULL;
+        int err = 0, pointed = 0;
-        struct jffs2_tmp_dnode_info *this;
+        struct jffs2_eraseblock *jeb;
+        unsigned char *buffer;
-        while (*p) {
+        uint32_t crc, ofs, len;
-                parent = *p;
+        size_t retlen;
-                this = rb_entry(parent, struct jffs2_tmp_dnode_info, rb);
+        BUG_ON(tn->csize == 0);
-                /* There may actually be a collision here, but it doesn't
-                   actually matter. As long as the two nodes with the same
+        if (!jffs2_is_writebuffered(c))
-                   version are together, it's all fine. */
+                goto adj_acc;
-                if (tn->version > this->version)
-                        p = &(*p)->rb_left;
+        /* Calculate how many bytes were already checked */
+        ofs = ref_offset(ref) + sizeof(struct jffs2_raw_inode);
+        len = ofs % c->wbuf_pagesize;
+        if (likely(len))
+                len = c->wbuf_pagesize - len;
+        if (len >= tn->csize) {
+                dbg_readinode("no need to check node at %#08x, data length %u, data starts at %#08x - it has already been checked.\n",
+                        ref_offset(ref), tn->csize, ofs);
+                goto adj_acc;
+        }
+        ofs += len;
+        len = tn->csize - len;
+        dbg_readinode("check node at %#08x, data length %u, partial CRC %#08x, correct CRC %#08x, data starts at %#08x, start checking from %#08x - %u bytes.\n",
+                ref_offset(ref), tn->csize, tn->partial_crc, tn->data_crc, ofs - len, ofs, len);
+#ifndef __ECOS
+        /* TODO: instead, encapsulate point() stuff in jffs2_flash_read(),
+         * adding a jffs2_flash_read_end() interface. */
+        if (c->mtd->point) {
+                err = c->mtd->point(c->mtd, ofs, len, &retlen, &buffer);
+                if (!err && retlen < len) { /* only 'len' bytes were requested */
+                        JFFS2_WARNING("MTD point returned len too short: %zu instead of %u.\n", retlen, len);
+                        c->mtd->unpoint(c->mtd, buffer, ofs, len);
+                } else if (err)
+                        JFFS2_WARNING("MTD point failed: error code %d.\n", err);
                else
-                        p = &(*p)->rb_right;
+                        pointed = 1; /* successfully pointed to device */
+        }
+#endif
+        if (!pointed) {
+                buffer = kmalloc(len, GFP_KERNEL);
+                if (unlikely(!buffer))
+                        return -ENOMEM;
+                /* TODO: this is very frequent pattern, make it a separate
+                 * routine */
+                err = jffs2_flash_read(c, ofs, len, &retlen, buffer);
+                if (err) {
+                        JFFS2_ERROR("can not read %d bytes from 0x%08x, error code: %d.\n", len, ofs, err);
+                        goto free_out;
+                }
+                if (retlen != len) {
+                        JFFS2_ERROR("short read at %#08x: %zd instead of %d.\n", ofs, retlen, len);
+                        err = -EIO;
+                        goto free_out;
+                }
+        }
+        /* Continue calculating CRC */
+        crc = crc32(tn->partial_crc, buffer, len);
+        if(!pointed)
+                kfree(buffer);
+#ifndef __ECOS
+        else
+                c->mtd->unpoint(c->mtd, buffer, ofs, len);
+#endif
+        if (crc != tn->data_crc) {
+                JFFS2_NOTICE("wrong data CRC in data node at 0x%08x: read %#08x, calculated %#08x.\n",
+                        ofs, tn->data_crc, crc);
+                return 1;
+        }
+adj_acc:
+        jeb = &c->blocks[ref->flash_offset / c->sector_size];
+        len = ref_totlen(c, jeb, ref);
+        /* If it should be REF_NORMAL, it'll get marked as such when
+           we build the fragtree, shortly. No need to worry about GC
+           moving it while it's marked REF_PRISTINE -- GC won't happen
+           till we've finished checking every inode anyway. */
+        ref->flash_offset |= REF_PRISTINE;
+        /*
+         * Mark the node as having been checked and fix the
+         * accounting accordingly.
+         */
+        spin_lock(&c->erase_completion_lock);
+        jeb->used_size += len;
+        jeb->unchecked_size -= len;
+        c->used_size += len;
+        c->unchecked_size -= len;
+        jffs2_dbg_acct_paranoia_check_nolock(c, jeb);
+        spin_unlock(&c->erase_completion_lock);
+        return 0;
+free_out:
+        if(!pointed)
+                kfree(buffer);
+#ifndef __ECOS
+        else
+                c->mtd->unpoint(c->mtd, buffer, ofs, len);
+#endif
+        return err;
+}
+/*
+ * Helper function for jffs2_add_tn_to_tree() and jffs2_build_inode_fragtree().
+ *
+ * Checks the node if we are in the checking stage.
+ */
+static int check_tn_node(struct jffs2_sb_info *c, struct jffs2_tmp_dnode_info *tn)
+{
+        int ret;
+        BUG_ON(ref_obsolete(tn->fn->raw));
+        /* We only check the data CRC of unchecked nodes */
+        if (ref_flags(tn->fn->raw) != REF_UNCHECKED)
+                return 0;
+        dbg_readinode("check node %#04x-%#04x, phys offs %#08x\n",
+                      tn->fn->ofs, tn->fn->ofs + tn->fn->size, ref_offset(tn->fn->raw));
+        ret = check_node_data(c, tn);
+        if (unlikely(ret < 0)) {
+                JFFS2_ERROR("check_node_data() returned error: %d.\n",
+                        ret);
+        } else if (unlikely(ret > 0)) {
+                dbg_readinode("CRC error, mark it obsolete.\n");
+                jffs2_mark_node_obsolete(c, tn->fn->raw);
        }
-        rb_link_node(&tn->rb, parent, p);
+        return ret;
-        rb_insert_color(&tn->rb, list);
+}
+static struct jffs2_tmp_dnode_info *jffs2_lookup_tn(struct rb_root *tn_root, uint32_t offset)
+{
+        struct rb_node *next;
+        struct jffs2_tmp_dnode_info *tn = NULL;
+        dbg_readinode("root %p, offset %d\n", tn_root, offset);
+        next = tn_root->rb_node;
+        while (next) {
+                tn = rb_entry(next, struct jffs2_tmp_dnode_info, rb);
+                if (tn->fn->ofs < offset)
+                        next = tn->rb.rb_right;
+                else if (tn->fn->ofs >= offset)
+                        next = tn->rb.rb_left;
+                else
+                        break;
+        }
+        return tn;
+}
+static void jffs2_kill_tn(struct jffs2_sb_info *c, struct jffs2_tmp_dnode_info *tn)
+{
+        jffs2_mark_node_obsolete(c, tn->fn->raw);
+        jffs2_free_full_dnode(tn->fn);
+        jffs2_free_tmp_dnode_info(tn);
+}
+/*
+ * This function is used when we read an inode. Data nodes arrive in
+ * arbitrary order -- they may be older or newer than the nodes which
+ * are already in the tree. Where overlaps occur, the older node can
+ * be discarded as long as the newer passes the CRC check. We don't
+ * bother to keep track of holes in this rbtree, and neither do we deal
+ * with frags -- we can have multiple entries starting at the same
+ * offset, and the one with the smallest length will come first in the
+ * ordering.
+ *
+ * Returns 0 if the node was inserted
+ *         1 if the node is obsolete (because we can't mark it so yet)
+ *         < 0 if an error occurred
+ */
+static int jffs2_add_tn_to_tree(struct jffs2_sb_info *c,
+                                struct jffs2_readinode_info *rii,
+                                struct jffs2_tmp_dnode_info *tn)
+{
+        uint32_t fn_end = tn->fn->ofs + tn->fn->size;
+        struct jffs2_tmp_dnode_info *insert_point = NULL, *this;
+        dbg_readinode("insert fragment %#04x-%#04x, ver %u\n", tn->fn->ofs, fn_end, tn->version);
+        /* If a node has zero dsize, we only have to keep it if it might be the
+           node with highest version -- i.e. the one which will end up as f->metadata.
+           Note that such nodes won't be REF_UNCHECKED since there are no data to
+           check anyway. */
+        if (!tn->fn->size) {
+                if (rii->mdata_tn) {
+                        /* We had a candidate mdata node already */
+                        dbg_readinode("kill old mdata with ver %d\n", rii->mdata_tn->version);
+                        jffs2_kill_tn(c, rii->mdata_tn);
+                }
+                rii->mdata_tn = tn;
+                dbg_readinode("keep new mdata with ver %d\n", tn->version);
+                return 0;
+        }
+        /* Find the earliest node which _may_ be relevant to this one */
+        this = jffs2_lookup_tn(&rii->tn_root, tn->fn->ofs);
+        if (!this) {
+                /* First addition to empty tree. $DEITY how I love the easy cases */
+                rb_link_node(&tn->rb, NULL, &rii->tn_root.rb_node);
+                rb_insert_color(&tn->rb, &rii->tn_root);
+                dbg_readinode("keep new frag\n");
+                return 0;
+        }
+        /* If we add a new node it'll be somewhere under here. */
+        insert_point = this;
+        /* If the node is coincident with another at a lower address,
+           back up until the other node is found. It may be relevant */
+        while (tn->overlapped)
+                tn = tn_prev(tn);
+        dbg_readinode("'this' found %#04x-%#04x (%s)\n", this->fn->ofs, this->fn->ofs + this->fn->size, this->fn ? "data" : "hole");
+        while (this) {
+                if (this->fn->ofs > fn_end)
+                        break;
+                dbg_readinode("Ponder this ver %d, 0x%x-0x%x\n",
+                              this->version, this->fn->ofs, this->fn->size);
+                if (this->version == tn->version) {
+                        /* Version number collision means REF_PRISTINE GC. Accept either of them
+                           as long as the CRC is correct. Check the one we have already...  */
+                        if (!check_tn_node(c, this)) {
+                                /* The one we already had was OK. Keep it and throw away the new one */
+                                dbg_readinode("Like old node. Throw away new\n");
+                                jffs2_kill_tn(c, tn);
+                                return 0;
+                        } else {
+                                /* Who cares if the new one is good; keep it for now anyway. */
+                                rb_replace_node(&this->rb, &tn->rb, &rii->tn_root);
+                                /* Same overlapping from in front and behind */
+                                tn->overlapped = this->overlapped;
+                                jffs2_kill_tn(c, this);
+                                dbg_readinode("Like new node. Throw away old\n");
+                                return 0;
+                        }
+                }
+                if (this->version < tn->version &&
+                    this->fn->ofs >= tn->fn->ofs &&
+                    this->fn->ofs + this->fn->size <= fn_end) {
+                        /* New node entirely overlaps 'this' */
+                        if (check_tn_node(c, tn)) {
+                                dbg_readinode("new node bad CRC\n");
+                                jffs2_kill_tn(c, tn);
+                                return 0;
+                        }
+                        /* ... and is good. Kill 'this'... */
+                        rb_replace_node(&this->rb, &tn->rb, &rii->tn_root);
+                        tn->overlapped = this->overlapped;
+                        jffs2_kill_tn(c, this);
+                        /* ... and any subsequent nodes which are also overlapped */
+                        this = tn_next(tn);
+                        while (this && this->fn->ofs + this->fn->size < fn_end) {
+                                struct jffs2_tmp_dnode_info *next = tn_next(this);
+                                if (this->version < tn->version) {
+                                        tn_erase(this, &rii->tn_root);
+                                        dbg_readinode("Kill overlapped ver %d, 0x%x-0x%x\n",
+                                                      this->version, this->fn->ofs,
+                                                      this->fn->ofs+this->fn->size);
+                                        jffs2_kill_tn(c, this);
+                                }
+                                this = next;
+                        }
+                        dbg_readinode("Done inserting new\n");
+                        return 0;
+                }
+                if (this->version > tn->version &&
+                    this->fn->ofs <= tn->fn->ofs &&
+                    this->fn->ofs+this->fn->size >= fn_end) {
+                        /* New node entirely overlapped by 'this' */
+                        if (!check_tn_node(c, this)) {
+                                dbg_readinode("Good CRC on old node. Kill new\n");
+                                jffs2_kill_tn(c, tn);
+                                return 0;
+                        }
+                        /* ... but 'this' was bad. Replace it... */
+                        rb_replace_node(&this->rb, &tn->rb, &rii->tn_root);
+                        dbg_readinode("Bad CRC on old overlapping node. Kill it\n");
+                        jffs2_kill_tn(c, this);
+                        return 0;
+                }
+                /* We want to be inserted under the last node which is
+                   either at a lower offset _or_ has a smaller range */
+                if (this->fn->ofs < tn->fn->ofs ||
+                    (this->fn->ofs == tn->fn->ofs &&
+                     this->fn->size <= tn->fn->size))
+                        insert_point = this;
+                this = tn_next(this);
+        }
+        dbg_readinode("insert_point %p, ver %d, 0x%x-0x%x, ov %d\n",
+                      insert_point, insert_point->version, insert_point->fn->ofs,
+                      insert_point->fn->ofs+insert_point->fn->size,
+                      insert_point->overlapped);
+        /* We neither completely obsoleted nor were completely
+           obsoleted by an earlier node. Insert under insert_point */
+        {
+                struct rb_node *parent = &insert_point->rb;
+                struct rb_node **link = &parent;
+                while (*link) {
+                        parent = *link;
+                        insert_point = rb_entry(parent, struct jffs2_tmp_dnode_info, rb);
+                        if (tn->fn->ofs > insert_point->fn->ofs)
+                                link = &insert_point->rb.rb_right;
+                        else if (tn->fn->ofs < insert_point->fn->ofs ||
+                                 tn->fn->size < insert_point->fn->size)
+                                link = &insert_point->rb.rb_left;
+                        else
+                                link = &insert_point->rb.rb_right;
+                }
+                rb_link_node(&tn->rb, &insert_point->rb, link);
+                rb_insert_color(&tn->rb, &rii->tn_root);
+        }
+        /* If there's anything behind that overlaps us, note it */
+        this = tn_prev(tn);
+        if (this) {
+                while (1) {
+                        if (this->fn->ofs + this->fn->size > tn->fn->ofs) {
+                                dbg_readinode("Node is overlapped by %p (v %d, 0x%x-0x%x)\n",
+                                              this, this->version, this->fn->ofs,
+                                              this->fn->ofs+this->fn->size);
+                                tn->overlapped = 1;
+                                break;
+                        }
+                        if (!this->overlapped)
+                                break;
+                        this = tn_prev(this);
+                }
+        }
+        /* If the new node overlaps anything ahead, note it */
+        this = tn_next(tn);
+        while (this && this->fn->ofs < fn_end) {
+                this->overlapped = 1;
+                dbg_readinode("Node ver %d, 0x%x-0x%x is overlapped\n",
+                              this->version, this->fn->ofs,
+                              this->fn->ofs+this->fn->size);
+                this = tn_next(this);
+        }
+        return 0;
+}
+/* Trivial function to remove the last node in the tree. Which by definition
+   has no right-hand child -- so can be removed just by making its only child (if
+   any) take its place under its parent. */
+static void eat_last(struct rb_root *root, struct rb_node *node)
+{
+        struct rb_node *parent = rb_parent(node);
+        struct rb_node **link;
+        /* LAST! */
+        BUG_ON(node->rb_right);
+        if (!parent)
+                link = &root->rb_node;
+        else if (node == parent->rb_left)
+                link = &parent->rb_left;
+        else
+                link = &parent->rb_right;
+        *link = node->rb_left;
+        /* Colour doesn't matter now. Only the parent pointer. */
+        if (node->rb_left)
+                node->rb_left->rb_parent_color = node->rb_parent_color;
+}
+/* We put this in reverse order, so we can just use eat_last */
+static void ver_insert(struct rb_root *ver_root, struct jffs2_tmp_dnode_info *tn)
+{
+        struct rb_node **link = &ver_root->rb_node;
+        struct rb_node *parent = NULL;
+        struct jffs2_tmp_dnode_info *this_tn;
+        while (*link) {
+                parent = *link;
+                this_tn = rb_entry(parent, struct jffs2_tmp_dnode_info, rb);
+                if (tn->version > this_tn->version)
+                        link = &parent->rb_left;
+                else
+                        link = &parent->rb_right;
+        }
+        dbg_readinode("Link new node at %p (root is %p)\n", link, ver_root);
+        rb_link_node(&tn->rb, parent, link);
+        rb_insert_color(&tn->rb, ver_root);
+}
+/* Build final, normal fragtree from tn tree. It doesn't matter which order
+   we add nodes to the real fragtree, as long as they don't overlap. And
+   having thrown away the majority of overlapped nodes as we went, there
+   really shouldn't be many sets of nodes which do overlap. If we start at
+   the end, we can use the overlap markers -- we can just eat nodes which
+   aren't overlapped, and when we encounter nodes which _do_ overlap we
+   sort them all into a temporary tree in version order before replaying them. */
+static int jffs2_build_inode_fragtree(struct jffs2_sb_info *c,
+                                      struct jffs2_inode_info *f,
+                                      struct jffs2_readinode_info *rii)
+{
+        struct jffs2_tmp_dnode_info *pen, *last, *this;
+        struct rb_root ver_root = RB_ROOT;
+        uint32_t high_ver = 0;
+        if (rii->mdata_tn) {
+                dbg_readinode("potential mdata is ver %d at %p\n", rii->mdata_tn->version, rii->mdata_tn);
+                high_ver = rii->mdata_tn->version;
+                rii->latest_ref = rii->mdata_tn->fn->raw;
+        }
+#ifdef JFFS2_DBG_READINODE_MESSAGES
+        this = tn_last(&rii->tn_root);
+        while (this) {
+                dbg_readinode("tn %p ver %d range 0x%x-0x%x ov %d\n", this, this->version, this->fn->ofs,
+                             this->fn->ofs+this->fn->size, this->overlapped);
+                this = tn_prev(this);
+        }
+#endif
+        pen = tn_last(&rii->tn_root);
+        while ((last = pen)) {
+                pen = tn_prev(last);
+                eat_last(&rii->tn_root, &last->rb);
+                ver_insert(&ver_root, last);
+                if (unlikely(last->overlapped))
+                        continue;
+                /* Now we have a bunch of nodes in reverse version
+                   order, in the tree at ver_root. Most of the time,
+                   there'll actually be only one node in the 'tree',
+                   in fact. */
+                this = tn_last(&ver_root);
+                while (this) {
+                        struct jffs2_tmp_dnode_info *vers_next;
+                        int ret;
+                        vers_next = tn_prev(this);
+                        eat_last(&ver_root, &this->rb);
+                        if (check_tn_node(c, this)) {
+                                dbg_readinode("node ver %x, 0x%x-0x%x failed CRC\n",
+                                             this->version, this->fn->ofs,
+                                             this->fn->ofs+this->fn->size);
+                                jffs2_kill_tn(c, this);
+                        } else {
+                                if (this->version > high_ver) {
+                                        /* Note that this is different from the other
+                                           highest_version, because this one is only
+                                           counting _valid_ nodes which could give the
+                                           latest inode metadata */
+                                        high_ver = this->version;
+                                        rii->latest_ref = this->fn->raw;
+                                }
+                                dbg_readinode("Add %p (v %x, 0x%x-0x%x, ov %d) to fragtree\n",
+                                             this, this->version, this->fn->ofs,
+                                             this->fn->ofs+this->fn->size, this->overlapped);
+                                ret = jffs2_add_full_dnode_to_inode(c, f, this->fn);
+                                if (ret) {
+                                        /* Free the nodes in ver_root; let the caller
+                                           deal with the rest */
+                                        JFFS2_ERROR("Add node to tree failed %d\n", ret);
+                                        while (1) {
+                                                vers_next = tn_prev(this);
+                                                if (check_tn_node(c, this))
+                                                        jffs2_mark_node_obsolete(c, this->fn->raw);
+                                                jffs2_free_full_dnode(this->fn);
+                                                jffs2_free_tmp_dnode_info(this);
+                                                this = vers_next;
+                                                if (!this)
+                                                        break;
+                                                eat_last(&ver_root, &vers_next->rb);
+                                        }
+                                        return ret;
+                                }
+                                jffs2_free_tmp_dnode_info(this);
+                        }
+                        this = vers_next;
+                }
+        }
+        return 0;
 }
 static void jffs2_free_tmp_dnode_info_list(struct rb_root *list)
@@ -112,8 +590,8 @@ static struct jffs2_raw_node_ref *jffs2_first_valid_node(struct jffs2_raw_node_r
 *          negative error code on failure.
 */
 static inline int read_direntry(struct jffs2_sb_info *c, struct jffs2_raw_node_ref *ref,
-                                struct jffs2_raw_dirent *rd, size_t read, struct jffs2_full_dirent **fdp,
+                                struct jffs2_raw_dirent *rd, size_t read,
-                                uint32_t *latest_mctime, uint32_t *mctime_ver)
+                                struct jffs2_readinode_info *rii)
 {
        struct jffs2_full_dirent *fd;
        uint32_t crc;
@@ -125,7 +603,8 @@ static inline int read_direntry(struct jffs2_sb_info *c, struct jffs2_raw_node_r
        if (unlikely(crc != je32_to_cpu(rd->node_crc))) {
                JFFS2_NOTICE("header CRC failed on dirent node at %#08x: read %#08x, calculated %#08x\n",
                             ref_offset(ref), je32_to_cpu(rd->node_crc), crc);
-                return 1;
+                jffs2_mark_node_obsolete(c, ref);
+                return 0;
        }
        /* If we've never checked the CRCs on this node, check them now */
@@ -137,7 +616,8 @@ static inline int read_direntry(struct jffs2_sb_info *c, struct jffs2_raw_node_r
                if (unlikely(PAD((rd->nsize + sizeof(*rd))) != PAD(je32_to_cpu(rd->totlen)))) {
                        JFFS2_ERROR("illegal nsize in node at %#08x: nsize %#02x, totlen %#04x\n",
                                    ref_offset(ref), rd->nsize, je32_to_cpu(rd->totlen));
-                        return 1;
+                        jffs2_mark_node_obsolete(c, ref);
+                        return 0;
                }
                jeb = &c->blocks[ref->flash_offset / c->sector_size];
@@ -161,10 +641,13 @@ static inline int read_direntry(struct jffs2_sb_info *c, struct jffs2_raw_node_r
        fd->ino = je32_to_cpu(rd->ino);
        fd->type = rd->type;
+        if (fd->version > rii->highest_version)
+                rii->highest_version = fd->version;
        /* Pick out the mctime of the latest dirent */
-        if(fd->version > *mctime_ver && je32_to_cpu(rd->mctime)) {
+        if(fd->version > rii->mctime_ver && je32_to_cpu(rd->mctime)) {
-                *mctime_ver = fd->version;
+                rii->mctime_ver = fd->version;
-                *latest_mctime = je32_to_cpu(rd->mctime);
+                rii->latest_mctime = je32_to_cpu(rd->mctime);
        }
        /*
@@ -201,7 +684,7 @@ static inline int read_direntry(struct jffs2_sb_info *c, struct jffs2_raw_node_r
         * Wheee. We now have a complete jffs2_full_dirent structure, with
         * the name in it and everything. Link it into the list
         */
-        jffs2_add_fd_to_list(c, fd, fdp);
+        jffs2_add_fd_to_list(c, fd, &rii->fds);
        return 0;
 }
@@ -210,13 +693,13 @@ static inline int read_direntry(struct jffs2_sb_info *c, struct jffs2_raw_node_r
 * Helper function for jffs2_get_inode_nodes().
 * It is called every time an inode node is found.
 *
- * Returns: 0 on succes;
+ * Returns: 0 on success;
 *          1 if the node should be marked obsolete;
 *          negative error code on failure.
 */
 static inline int read_dnode(struct jffs2_sb_info *c, struct jffs2_raw_node_ref *ref,
-                             struct jffs2_raw_inode *rd, struct rb_root *tnp, int rdlen,
+                             struct jffs2_raw_inode *rd, int rdlen,
-                             uint32_t *latest_mctime, uint32_t *mctime_ver)
+                             struct jffs2_readinode_info *rii)
 {
        struct jffs2_tmp_dnode_info *tn;
        uint32_t len, csize;
@@ -230,7 +713,8 @@ static inline int read_dnode(struct jffs2_sb_info *c, struct jffs2_raw_node_ref
        if (unlikely(crc != je32_to_cpu(rd->node_crc))) {
                JFFS2_NOTICE("node CRC failed on dnode at %#08x: read %#08x, calculated %#08x\n",
                             ref_offset(ref), je32_to_cpu(rd->node_crc), crc);
-                return 1;
+                jffs2_mark_node_obsolete(c, ref);
+                return 0;
        }
        tn = jffs2_alloc_tmp_dnode_info();
@@ -342,6 +826,10 @@ static inline int read_dnode(struct jffs2_sb_info *c, struct jffs2_raw_node_ref
        tn->data_crc = je32_to_cpu(rd->data_crc);
        tn->csize = csize;
        tn->fn->raw = ref;
+        tn->overlapped = 0;
+        if (tn->version > rii->highest_version)
+                rii->highest_version = tn->version;
        /* There was a bug where we wrote hole nodes out with
           csize/dsize swapped. Deal with it */
@@ -353,13 +841,25 @@ static inline int read_dnode(struct jffs2_sb_info *c, struct jffs2_raw_node_ref
        dbg_readinode("dnode @%08x: ver %u, offset %#04x, dsize %#04x, csize %#04x\n",
                  ref_offset(ref), je32_to_cpu(rd->version), je32_to_cpu(rd->offset), je32_to_cpu(rd->dsize), csize);
-        jffs2_add_tn_to_tree(tn, tnp);
+        ret = jffs2_add_tn_to_tree(c, rii, tn);
+        if (ret) {
+                jffs2_free_full_dnode(tn->fn);
+        free_out:
+                jffs2_free_tmp_dnode_info(tn);
+                return ret;
+        }
+#ifdef JFFS2_DBG_READINODE_MESSAGES
+        dbg_readinode("After adding ver %d:\n", tn->version);
+        tn = tn_first(&rii->tn_root);
+        while (tn) {
+                dbg_readinode("%p: v %d r 0x%x-0x%x ov %d\n",
+                             tn, tn->version, tn->fn->ofs,
+                             tn->fn->ofs+tn->fn->size, tn->overlapped);
+                tn = tn_next(tn);
+        }
+#endif
        return 0;
-free_out:
-        jffs2_free_tmp_dnode_info(tn);
-        return ret;
 }
 /*
@@ -373,7 +873,15 @@ free_out:
 static inline int read_unknown(struct jffs2_sb_info *c, struct jffs2_raw_node_ref *ref, struct jffs2_unknown_node *un)
 {
        /* We don't mark unknown nodes as REF_UNCHECKED */
-        BUG_ON(ref_flags(ref) == REF_UNCHECKED);
+        if (ref_flags(ref) == REF_UNCHECKED) {
+                JFFS2_ERROR("REF_UNCHECKED but unknown node at %#08x\n",
+                            ref_offset(ref));
+                JFFS2_ERROR("Node is {%04x,%04x,%08x,%08x}. Please report this error.\n",
+                            je16_to_cpu(un->magic), je16_to_cpu(un->nodetype),
+                            je32_to_cpu(un->totlen), je32_to_cpu(un->hdr_crc));
+                jffs2_mark_node_obsolete(c, ref);
+                return 0;
+        }
        un->nodetype = cpu_to_je16(JFFS2_NODE_ACCURATE | je16_to_cpu(un->nodetype));
@@ -400,7 +908,8 @@ static inline int read_unknown(struct jffs2_sb_info *c, struct jffs2_raw_node_re
        case JFFS2_FEATURE_RWCOMPAT_DELETE:
                JFFS2_NOTICE("unknown RWCOMPAT_DELETE nodetype %#04X at %#08x\n",
                             je16_to_cpu(un->nodetype), ref_offset(ref));
-                return 1;
+                jffs2_mark_node_obsolete(c, ref);
+                return 0;
        }
        return 0;
@@ -414,92 +923,62 @@ static inline int read_unknown(struct jffs2_sb_info *c, struct jffs2_raw_node_re
 *          negative error code on failure.
 */
 static int read_more(struct jffs2_sb_info *c, struct jffs2_raw_node_ref *ref,
-                     int right_size, int *rdlen, unsigned char *buf, unsigned char *bufstart)
+                     int needed_len, int *rdlen, unsigned char *buf)
 {
-        int right_len, err, len;
+        int err, to_read = needed_len - *rdlen;
        size_t retlen;
        uint32_t offs;
        if (jffs2_is_writebuffered(c)) {
-                right_len = c->wbuf_pagesize - (bufstart - buf);
+                int rem = to_read % c->wbuf_pagesize;
-                if (right_size + (int)(bufstart - buf) > c->wbuf_pagesize)
-                        right_len += c->wbuf_pagesize;
-        } else
-                right_len = right_size;
-        if (*rdlen == right_len)
+                if (rem)
-                return 0;
+                        to_read += c->wbuf_pagesize - rem;
+        }
        /* We need to read more data */
        offs = ref_offset(ref) + *rdlen;
-        if (jffs2_is_writebuffered(c)) {
-                bufstart = buf + c->wbuf_pagesize;
-                len = c->wbuf_pagesize;
-        } else {
-                bufstart = buf + *rdlen;
-                len = right_size - *rdlen;
-        }
-        dbg_readinode("read more %d bytes\n", len);
+        dbg_readinode("read more %d bytes\n", to_read);
-        err = jffs2_flash_read(c, offs, len, &retlen, bufstart);
+        err = jffs2_flash_read(c, offs, to_read, &retlen, buf + *rdlen);
        if (err) {
                JFFS2_ERROR("can not read %d bytes from 0x%08x, "
-                        "error code: %d.\n", len, offs, err);
+                        "error code: %d.\n", to_read, offs, err);
                return err;
        }
-        if (retlen < len) {
+        if (retlen < to_read) {
                JFFS2_ERROR("short read at %#08x: %zu instead of %d.\n",
-                                offs, retlen, len);
+                                offs, retlen, to_read);
                return -EIO;
        }
-        *rdlen = right_len;
+        *rdlen += to_read;
        return 0;
 }
 /* Get tmp_dnode_info and full_dirent for all non-obsolete nodes associated
-   with this ino, returning the former in order of version */
+   with this ino. Perform a preliminary ordering on data nodes, throwing away
+   those which are completely obsoleted by newer ones. The naïve approach we
+   used to take, of just returning them _all_ in version order, would cause us
+   to run out of memory in certain degenerate cases. */
 static int jffs2_get_inode_nodes(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
-                                 struct rb_root *tnp, struct jffs2_full_dirent **fdp,
+                                 struct jffs2_readinode_info *rii)
-                                 uint32_t *highest_version, uint32_t *latest_mctime,
-                                 uint32_t *mctime_ver)
 {
        struct jffs2_raw_node_ref *ref, *valid_ref;
-        struct rb_root ret_tn = RB_ROOT;
-        struct jffs2_full_dirent *ret_fd = NULL;
        unsigned char *buf = NULL;
        union jffs2_node_union *node;
        size_t retlen;
        int len, err;
-        *mctime_ver = 0;
+        rii->mctime_ver = 0;
        dbg_readinode("ino #%u\n", f->inocache->ino);
-        if (jffs2_is_writebuffered(c)) {
-                /*
-                 * If we have the write buffer, we assume the minimal I/O unit
-                 * is c->wbuf_pagesize. We implement some optimizations which in
-                 * this case and we need a temporary buffer of size =
-                 * 2*c->wbuf_pagesize bytes (see comments in read_dnode()).
-                 * Basically, we want to read not only the node header, but the
-                 * whole wbuf (NAND page in case of NAND) or 2, if the node
-                 * header overlaps the border between the 2 wbufs.
-                 */
-                len = 2*c->wbuf_pagesize;
-        } else {
-                /*
-                 * When there is no write buffer, the size of the temporary
-                 * buffer is the size of the larges node header.
-                 */
-                len = sizeof(union jffs2_node_union);
-        }
        /* FIXME: in case of NOR and available ->point() this
         * needs to be fixed. */
+        len = sizeof(union jffs2_node_union) + c->wbuf_pagesize;
        buf = kmalloc(len, GFP_KERNEL);
        if (!buf)
                return -ENOMEM;
@@ -509,8 +988,6 @@ static int jffs2_get_inode_nodes(struct jffs2_sb_info *c, struct jffs2_inode_inf
        if (!valid_ref && f->inocache->ino != 1)
                JFFS2_WARNING("Eep. No valid nodes for ino #%u.\n", f->inocache->ino);
        while (valid_ref) {
-                unsigned char *bufstart;
                /* We can hold a pointer to a non-obsolete node without the spinlock,
                   but _obsolete_ nodes may disappear at any time, if the block
                   they're in gets erased. So if we mark 'ref' obsolete while we're
@@ -526,32 +1003,31 @@ static int jffs2_get_inode_nodes(struct jffs2_sb_info *c, struct jffs2_inode_inf
                /*
                 * At this point we don't know the type of the node we're going
                 * to read, so we do not know the size of its header. In order
-                 * to minimize the amount of flash IO we assume the node has
+                 * to minimize the amount of flash IO we assume the header is
-                 * size = JFFS2_MIN_NODE_HEADER.
+                 * of size = JFFS2_MIN_NODE_HEADER.
                 */
+                len = JFFS2_MIN_NODE_HEADER;
                if (jffs2_is_writebuffered(c)) {
+                        int end, rem;
                        /*
-                         * We treat 'buf' as 2 adjacent wbufs. We want to
+                         * We are about to read JFFS2_MIN_NODE_HEADER bytes,
-                         * adjust bufstart such as it points to the
+                         * but this flash has some minimal I/O unit. It is
-                         * beginning of the node within this wbuf.
+                         * possible that we'll need to read more soon, so read
+                         * up to the next min. I/O unit, in order not to
+                         * re-read the same min. I/O unit twice.
                         */
-                        bufstart = buf + (ref_offset(ref) % c->wbuf_pagesize);
+                        end = ref_offset(ref) + len;
-                        /* We will read either one wbuf or 2 wbufs. */
+                        rem = end % c->wbuf_pagesize;
-                        len = c->wbuf_pagesize - (bufstart - buf);
+                        if (rem)
-                        if (JFFS2_MIN_NODE_HEADER + (int)(bufstart - buf) > c->wbuf_pagesize) {
+                                end += c->wbuf_pagesize - rem;
-                                /* The header spans the border of the first wbuf */
+                        len = end - ref_offset(ref);
-                                len += c->wbuf_pagesize;
-                        }
-                } else {
-                        bufstart = buf;
-                        len = JFFS2_MIN_NODE_HEADER;
                }
                dbg_readinode("read %d bytes at %#08x(%d).\n", len, ref_offset(ref), ref_flags(ref));
                /* FIXME: point() */
-                err = jffs2_flash_read(c, ref_offset(ref), len,
+                err = jffs2_flash_read(c, ref_offset(ref), len, &retlen, buf);
-                                       &retlen, bufstart);
                if (err) {
                        JFFS2_ERROR("can not read %d bytes from 0x%08x, " "error code: %d.\n", len, ref_offset(ref), err);
                        goto free_out;
@@ -563,7 +1039,7 @@ static int jffs2_get_inode_nodes(struct jffs2_sb_info *c, struct jffs2_inode_inf
                        goto free_out;
                }
-                node = (union jffs2_node_union *)bufstart;
+                node = (union jffs2_node_union *)buf;
                /* No need to mask in the valid bit; it shouldn't be invalid */
                if (je32_to_cpu(node->u.hdr_crc) != crc32(0, node, sizeof(node->u)-4)) {
@@ -576,52 +1052,47 @@ static int jffs2_get_inode_nodes(struct jffs2_sb_info *c, struct jffs2_inode_inf
                        jffs2_mark_node_obsolete(c, ref);
                        goto cont;
                }
+                if (je16_to_cpu(node->u.magic) != JFFS2_MAGIC_BITMASK) {
+                        /* Not a JFFS2 node, whinge and move on */
+                        JFFS2_NOTICE("Wrong magic bitmask 0x%04x in node header at %#08x.\n",
+                                     je16_to_cpu(node->u.magic), ref_offset(ref));
+                        jffs2_mark_node_obsolete(c, ref);
+                        goto cont;
+                }
                switch (je16_to_cpu(node->u.nodetype)) {
                case JFFS2_NODETYPE_DIRENT:
                        if (JFFS2_MIN_NODE_HEADER < sizeof(struct jffs2_raw_dirent)) {
-                                err = read_more(c, ref, sizeof(struct jffs2_raw_dirent), &len, buf, bufstart);
+                                err = read_more(c, ref, sizeof(struct jffs2_raw_dirent), &len, buf);
                                if (unlikely(err))
                                        goto free_out;
                        }
-                        err = read_direntry(c, ref, &node->d, retlen, &ret_fd, latest_mctime, mctime_ver);
+                        err = read_direntry(c, ref, &node->d, retlen, rii);
-                        if (err == 1) {
+                        if (unlikely(err))
-                                jffs2_mark_node_obsolete(c, ref);
-                                break;
-                        } else if (unlikely(err))
                                goto free_out;
-                        if (je32_to_cpu(node->d.version) > *highest_version)
-                                *highest_version = je32_to_cpu(node->d.version);
                        break;
                case JFFS2_NODETYPE_INODE:
                        if (JFFS2_MIN_NODE_HEADER < sizeof(struct jffs2_raw_inode)) {
-                                err = read_more(c, ref, sizeof(struct jffs2_raw_inode), &len, buf, bufstart);
+                                err = read_more(c, ref, sizeof(struct jffs2_raw_inode), &len, buf);
                                if (unlikely(err))
                                        goto free_out;
                        }
-                        err = read_dnode(c, ref, &node->i, &ret_tn, len, latest_mctime, mctime_ver);
+                        err = read_dnode(c, ref, &node->i, len, rii);
-                        if (err == 1) {
+                        if (unlikely(err))
-                                jffs2_mark_node_obsolete(c, ref);
-                                break;
-                        } else if (unlikely(err))
                                goto free_out;
-                        if (je32_to_cpu(node->i.version) > *highest_version)
-                                *highest_version = je32_to_cpu(node->i.version);
                        break;
                default:
                        if (JFFS2_MIN_NODE_HEADER < sizeof(struct jffs2_unknown_node)) {
-                                err = read_more(c, ref, sizeof(struct jffs2_unknown_node), &len, buf, bufstart);
+                                err = read_more(c, ref, sizeof(struct jffs2_unknown_node), &len, buf);
                                if (unlikely(err))
                                        goto free_out;
                        }
@@ -639,17 +1110,19 @@ static int jffs2_get_inode_nodes(struct jffs2_sb_info *c, struct jffs2_inode_inf
        }
        spin_unlock(&c->erase_completion_lock);
-        *tnp = ret_tn;
-        *fdp = ret_fd;
        kfree(buf);
+        f->highest_version = rii->highest_version;
        dbg_readinode("nodes of inode #%u were read, the highest version is %u, latest_mctime %u, mctime_ver %u.\n",
-                        f->inocache->ino, *highest_version, *latest_mctime, *mctime_ver);
+                      f->inocache->ino, rii->highest_version, rii->latest_mctime,
+                      rii->mctime_ver);
        return 0;
 free_out:
-        jffs2_free_tmp_dnode_info_list(&ret_tn);
+        jffs2_free_tmp_dnode_info_list(&rii->tn_root);
-        jffs2_free_full_dirent_list(ret_fd);
+        jffs2_free_full_dirent_list(rii->fds);
+        rii->fds = NULL;
        kfree(buf);
        return err;
 }
@@ -658,20 +1131,17 @@ static int jffs2_do_read_inode_internal(struct jffs2_sb_info *c,
                                        struct jffs2_inode_info *f,
                                        struct jffs2_raw_inode *latest_node)
 {
-        struct jffs2_tmp_dnode_info *tn;
+        struct jffs2_readinode_info rii;
-        struct rb_root tn_list;
+        uint32_t crc, new_size;
-        struct rb_node *rb, *repl_rb;
-        struct jffs2_full_dirent *fd_list;
-        struct jffs2_full_dnode *fn, *first_fn = NULL;
-        uint32_t crc;
-        uint32_t latest_mctime, mctime_ver;
        size_t retlen;
        int ret;
        dbg_readinode("ino #%u nlink is %d\n", f->inocache->ino, f->inocache->nlink);
+        memset(&rii, 0, sizeof(rii));
        /* Grab all nodes relevant to this ino */
-        ret = jffs2_get_inode_nodes(c, f, &tn_list, &fd_list, &f->highest_version, &latest_mctime, &mctime_ver);
+        ret = jffs2_get_inode_nodes(c, f, &rii);
        if (ret) {
                JFFS2_ERROR("cannot read nodes for ino %u, returned error is %d\n", f->inocache->ino, ret);
@@ -679,74 +1149,42 @@ static int jffs2_do_read_inode_internal(struct jffs2_sb_info *c,
                        jffs2_set_inocache_state(c, f->inocache, INO_STATE_CHECKEDABSENT);
                return ret;
        }
-        f->dents = fd_list;
-        rb = rb_first(&tn_list);
-        while (rb) {
+        ret = jffs2_build_inode_fragtree(c, f, &rii);
-                cond_resched();
+        if (ret) {
-                tn = rb_entry(rb, struct jffs2_tmp_dnode_info, rb);
+                JFFS2_ERROR("Failed to build final fragtree for inode #%u: error %d\n",
-                fn = tn->fn;
+                            f->inocache->ino, ret);
-                ret = 1;
+                if (f->inocache->state == INO_STATE_READING)
-                dbg_readinode("consider node ver %u, phys offset "
+                        jffs2_set_inocache_state(c, f->inocache, INO_STATE_CHECKEDABSENT);
-                        "%#08x(%d), range %u-%u.\n", tn->version,
+                jffs2_free_tmp_dnode_info_list(&rii.tn_root);
-                        ref_offset(fn->raw), ref_flags(fn->raw),
+                /* FIXME: We could at least crc-check them all */
-                        fn->ofs, fn->ofs + fn->size);
+                if (rii.mdata_tn) {
+                        jffs2_free_full_dnode(rii.mdata_tn->fn);
-                if (fn->size) {
+                        jffs2_free_tmp_dnode_info(rii.mdata_tn);
-                        ret = jffs2_add_older_frag_to_fragtree(c, f, tn);
+                        rii.mdata_tn = NULL;
-                        /* TODO: the error code isn't checked, check it */
+                }
-                        jffs2_dbg_fragtree_paranoia_check_nolock(f);
+                return ret;
-                        BUG_ON(ret < 0);
+        }
-                        if (!first_fn && ret == 0)
-                                first_fn = fn;
-                } else if (!first_fn) {
-                        first_fn = fn;
-                        f->metadata = fn;
-                        ret = 0; /* Prevent freeing the metadata update node */
-                } else
-                        jffs2_mark_node_obsolete(c, fn->raw);
-                BUG_ON(rb->rb_left);
-                if (rb_parent(rb) && rb_parent(rb)->rb_left == rb) {
-                        /* We were then left-hand child of our parent. We need
-                         * to move our own right-hand child into our place. */
-                        repl_rb = rb->rb_right;
-                        if (repl_rb)
-                                rb_set_parent(repl_rb, rb_parent(rb));
-                } else
-                        repl_rb = NULL;
-                rb = rb_next(rb);
-                /* Remove the spent tn from the tree; don't bother rebalancing
-                 * but put our right-hand child in our own place. */
-                if (rb_parent(&tn->rb)) {
-                        if (rb_parent(&tn->rb)->rb_left == &tn->rb)
-                                rb_parent(&tn->rb)->rb_left = repl_rb;
-                        else if (rb_parent(&tn->rb)->rb_right == &tn->rb)
-                                rb_parent(&tn->rb)->rb_right = repl_rb;
-                        else BUG();
-                } else if (tn->rb.rb_right)
-                        rb_set_parent(tn->rb.rb_right, NULL);
-                jffs2_free_tmp_dnode_info(tn);
+        if (rii.mdata_tn) {
-                if (ret) {
+                if (rii.mdata_tn->fn->raw == rii.latest_ref) {
-                        dbg_readinode("delete dnode %u-%u.\n",
+                        f->metadata = rii.mdata_tn->fn;
-                                fn->ofs, fn->ofs + fn->size);
+                        jffs2_free_tmp_dnode_info(rii.mdata_tn);
-                        jffs2_free_full_dnode(fn);
+                } else {
+                        jffs2_kill_tn(c, rii.mdata_tn);
                }
+                rii.mdata_tn = NULL;
        }
-        jffs2_dbg_fragtree_paranoia_check_nolock(f);
-        BUG_ON(first_fn && ref_obsolete(first_fn->raw));
+        f->dents = rii.fds;
-        fn = first_fn;
+        jffs2_dbg_fragtree_paranoia_check_nolock(f);
-        if (unlikely(!first_fn)) {
+        if (unlikely(!rii.latest_ref)) {
                /* No data nodes for this inode. */
                if (f->inocache->ino != 1) {
                        JFFS2_WARNING("no data nodes found for ino #%u\n", f->inocache->ino);
-                        if (!fd_list) {
+                        if (!rii.fds) {
                                if (f->inocache->state == INO_STATE_READING)
                                        jffs2_set_inocache_state(c, f->inocache, INO_STATE_CHECKEDABSENT);
                                return -EIO;
@@ -764,7 +1202,7 @@ static int jffs2_do_read_inode_internal(struct jffs2_sb_info *c,
                return 0;
        }
-        ret = jffs2_flash_read(c, ref_offset(fn->raw), sizeof(*latest_node), &retlen, (void *)latest_node);
+        ret = jffs2_flash_read(c, ref_offset(rii.latest_ref), sizeof(*latest_node), &retlen, (void *)latest_node);
        if (ret || retlen != sizeof(*latest_node)) {
                JFFS2_ERROR("failed to read from flash: error %d, %zd of %zd bytes read\n",
                        ret, retlen, sizeof(*latest_node));
@@ -777,7 +1215,7 @@ static int jffs2_do_read_inode_internal(struct jffs2_sb_info *c,
        crc = crc32(0, latest_node, sizeof(*latest_node)-8);
        if (crc != je32_to_cpu(latest_node->node_crc)) {
                JFFS2_ERROR("CRC failed for read_inode of inode %u at physical location 0x%x\n",
-                        f->inocache->ino, ref_offset(fn->raw));
+                        f->inocache->ino, ref_offset(rii.latest_ref));
                up(&f->sem);
                jffs2_do_clear_inode(c, f);
                return -EIO;
@@ -785,17 +1223,22 @@ static int jffs2_do_read_inode_internal(struct jffs2_sb_info *c,
        switch(jemode_to_cpu(latest_node->mode) & S_IFMT) {
        case S_IFDIR:
-                if (mctime_ver > je32_to_cpu(latest_node->version)) {
+                if (rii.mctime_ver > je32_to_cpu(latest_node->version)) {
                        /* The times in the latest_node are actually older than
                           mctime in the latest dirent. Cheat. */
-                        latest_node->ctime = latest_node->mtime = cpu_to_je32(latest_mctime);
+                        latest_node->ctime = latest_node->mtime = cpu_to_je32(rii.latest_mctime);
                }
                break;
        case S_IFREG:
                /* If it was a regular file, truncate it to the latest node's isize */
-                jffs2_truncate_fragtree(c, &f->fragtree, je32_to_cpu(latest_node->isize));
+                new_size = jffs2_truncate_fragtree(c, &f->fragtree, je32_to_cpu(latest_node->isize));
+                if (new_size != je32_to_cpu(latest_node->isize)) {
+                        JFFS2_WARNING("Truncating ino #%u to %d bytes failed because it only had %d bytes to start with!\n",
+                                      f->inocache->ino, je32_to_cpu(latest_node->isize), new_size);
+                        latest_node->isize = cpu_to_je32(new_size);
+                }
                break;
        case S_IFLNK:
@@ -818,7 +1261,7 @@ static int jffs2_do_read_inode_internal(struct jffs2_sb_info *c,
                                return -ENOMEM;
                        }
-                        ret = jffs2_flash_read(c, ref_offset(fn->raw) + sizeof(*latest_node),
+                        ret = jffs2_flash_read(c, ref_offset(rii.latest_ref) + sizeof(*latest_node),
                                                je32_to_cpu(latest_node->csize), &retlen, (char *)f->target);
                        if (ret  || retlen != je32_to_cpu(latest_node->csize)) {
diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c
index 3af746eaff0e..2a1c976c7924 100644
--- a/fs/jffs2/scan.c
+++ b/fs/jffs2/scan.c
@@ -1,15 +1,14 @@
 /*
 * JFFS2 -- Journalling Flash File System, Version 2.
 *
- * Copyright (C) 2001-2003 Red Hat, Inc.
+ * Copyright © 2001-2007 Red Hat, Inc.
 *
 * Created by David Woodhouse <dwmw2@infradead.org>
 *
 * For licensing information, see the file 'LICENCE' in this directory.
 *
- * $Id: scan.c,v 1.125 2005/09/30 13:59:13 dedekind Exp $
- *
 */
 #include <linux/kernel.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
@@ -450,16 +449,20 @@ static int jffs2_scan_eraseblock (struct jffs2_sb_info *c, struct jffs2_eraseblo
 #ifdef CONFIG_JFFS2_FS_WRITEBUFFER
        if (jffs2_cleanmarker_oob(c)) {
-                int ret = jffs2_check_nand_cleanmarker(c, jeb);
+                int ret;
+                if (c->mtd->block_isbad(c->mtd, jeb->offset))
+                        return BLK_STATE_BADBLOCK;
+                ret = jffs2_check_nand_cleanmarker(c, jeb);
                D2(printk(KERN_NOTICE "jffs_check_nand_cleanmarker returned %d\n",ret));
                /* Even if it's not found, we still scan to see
                   if the block is empty. We use this information
                   to decide whether to erase it or not. */
                switch (ret) {
                case 0:         cleanmarkerfound = 1; break;
                case 1:         break;
-                case 2:         return BLK_STATE_BADBLOCK;
-                case 3:         return BLK_STATE_ALLDIRTY; /* Block has failed to erase min. once */
                default:        return ret;
                }
        }
@@ -632,16 +635,17 @@ scan_more:
                if (*(uint32_t *)(&buf[ofs-buf_ofs]) == 0xffffffff) {
                        uint32_t inbuf_ofs;
-                        uint32_t empty_start;
+                        uint32_t empty_start, scan_end;
                        empty_start = ofs;
                        ofs += 4;
+                        scan_end = min_t(uint32_t, EMPTY_SCAN_SIZE(c->sector_size)/8, buf_len);
                        D1(printk(KERN_DEBUG "Found empty flash at 0x%08x\n", ofs));
                more_empty:
                        inbuf_ofs = ofs - buf_ofs;
-                        while (inbuf_ofs < buf_len) {
+                        while (inbuf_ofs < scan_end) {
-                                if (*(uint32_t *)(&buf[inbuf_ofs]) != 0xffffffff) {
+                                if (unlikely(*(uint32_t *)(&buf[inbuf_ofs]) != 0xffffffff)) {
                                        printk(KERN_WARNING "Empty flash at 0x%08x ends at 0x%08x\n",
                                               empty_start, ofs);
                                        if ((err = jffs2_scan_dirty_space(c, jeb, ofs-empty_start)))
@@ -662,7 +666,11 @@ scan_more:
                                D1(printk(KERN_DEBUG "%d bytes at start of block seems clean... assuming all clean\n", EMPTY_SCAN_SIZE(c->sector_size)));
                                return BLK_STATE_CLEANMARKER;
                        }
+                        if (!buf_size && (scan_end != buf_len)) {/* XIP/point case */
+                                scan_end = buf_len;
+                                goto more_empty;
+                        }
+                        
                        /* See how much more there is to read in this eraseblock... */
                        buf_len = min_t(uint32_t, buf_size, jeb->offset + c->sector_size - ofs);
                        if (!buf_len) {
@@ -672,6 +680,8 @@ scan_more:
                                          empty_start));
                                break;
                        }
+                        /* point never reaches here */
+                        scan_end = buf_len;
                        D1(printk(KERN_DEBUG "Reading another 0x%x at 0x%08x\n", buf_len, ofs));
                        err = jffs2_fill_scan_buf(c, buf, ofs, buf_len);
                        if (err)
@@ -731,8 +741,7 @@ scan_more:
                        continue;
                }
-                if (ofs + je32_to_cpu(node->totlen) >
+                if (ofs + je32_to_cpu(node->totlen) > jeb->offset + c->sector_size) {
-                    jeb->offset + c->sector_size) {
                        /* Eep. Node goes over the end of the erase block. */
                        printk(KERN_WARNING "Node at 0x%08x with length 0x%08x would run over the end of the erase block\n",
                               ofs, je32_to_cpu(node->totlen));
@@ -939,8 +948,7 @@ static int jffs2_scan_inode_node(struct jffs2_sb_info *c, struct jffs2_erasebloc
                                 struct jffs2_raw_inode *ri, uint32_t ofs, struct jffs2_summary *s)
 {
        struct jffs2_inode_cache *ic;
-        uint32_t ino = je32_to_cpu(ri->ino);
+        uint32_t crc, ino = je32_to_cpu(ri->ino);
-        int err;
        D1(printk(KERN_DEBUG "jffs2_scan_inode_node(): Node at 0x%08x\n", ofs));
@@ -953,21 +961,22 @@ static int jffs2_scan_inode_node(struct jffs2_sb_info *c, struct jffs2_erasebloc
           Which means that the _full_ amount of time to get to proper write mode with GC
           operational may actually be _longer_ than before. Sucks to be me. */
+        /* Check the node CRC in any case. */
+        crc = crc32(0, ri, sizeof(*ri)-8);
+        if (crc != je32_to_cpu(ri->node_crc)) {
+                printk(KERN_NOTICE "jffs2_scan_inode_node(): CRC failed on "
+                       "node at 0x%08x: Read 0x%08x, calculated 0x%08x\n",
+                       ofs, je32_to_cpu(ri->node_crc), crc);
+                /*
+                 * We believe totlen because the CRC on the node
+                 * _header_ was OK, just the node itself failed.
+                 */
+                return jffs2_scan_dirty_space(c, jeb,
+                                              PAD(je32_to_cpu(ri->totlen)));
+        }
        ic = jffs2_get_ino_cache(c, ino);
        if (!ic) {
-                /* Inocache get failed. Either we read a bogus ino# or it's just genuinely the
-                   first node we found for this inode. Do a CRC check to protect against the former
-                   case */
-                uint32_t crc = crc32(0, ri, sizeof(*ri)-8);
-                if (crc != je32_to_cpu(ri->node_crc)) {
-                        printk(KERN_NOTICE "jffs2_scan_inode_node(): CRC failed on node at 0x%08x: Read 0x%08x, calculated 0x%08x\n",
-                               ofs, je32_to_cpu(ri->node_crc), crc);
-                        /* We believe totlen because the CRC on the node _header_ was OK, just the node itself failed. */
-                        if ((err = jffs2_scan_dirty_space(c, jeb, PAD(je32_to_cpu(ri->totlen)))))
-                                return err;
-                        return 0;
-                }
                ic = jffs2_scan_make_ino_cache(c, ino);
                if (!ic)
                        return -ENOMEM;
diff --git a/fs/jffs2/security.c b/fs/jffs2/security.c
index 52a9894a6364..bc9f6ba10823 100644
--- a/fs/jffs2/security.c
+++ b/fs/jffs2/security.c
@@ -1,13 +1,14 @@
 /*
 * JFFS2 -- Journalling Flash File System, Version 2.
 *
- * Copyright (C) 2006  NEC Corporation
+ * Copyright © 2006  NEC Corporation
 *
 * Created by KaiGai Kohei <kaigai@ak.jp.nec.com>
 *
 * For licensing information, see the file 'LICENCE' in this directory.
 *
 */
 #include <linux/kernel.h>
 #include <linux/slab.h>
 #include <linux/fs.h>
diff --git a/fs/jffs2/summary.c b/fs/jffs2/summary.c
index 30f888414ce7..d828b296392a 100644
--- a/fs/jffs2/summary.c
+++ b/fs/jffs2/summary.c
@@ -1,16 +1,14 @@
 /*
 * JFFS2 -- Journalling Flash File System, Version 2.
 *
- * Copyright (C) 2004  Ferenc Havasi <havasi@inf.u-szeged.hu>,
+ * Copyright © 2004  Ferenc Havasi <havasi@inf.u-szeged.hu>,
- *                     Zoltan Sogor <weth@inf.u-szeged.hu>,
+ *                   Zoltan Sogor <weth@inf.u-szeged.hu>,
- *                     Patrik Kluba <pajko@halom.u-szeged.hu>,
+ *                   Patrik Kluba <pajko@halom.u-szeged.hu>,
- *                     University of Szeged, Hungary
+ *                   University of Szeged, Hungary
- *               2006  KaiGai Kohei <kaigai@ak.jp.nec.com>
+ *             2006  KaiGai Kohei <kaigai@ak.jp.nec.com>
 *
 * For licensing information, see the file 'LICENCE' in this directory.
 *
- * $Id: summary.c,v 1.4 2005/09/26 11:37:21 havasi Exp $
- *
 */
 #include <linux/kernel.h>
diff --git a/fs/jffs2/summary.h b/fs/jffs2/summary.h
index 6bf1f6aa4552..0c6669e21390 100644
--- a/fs/jffs2/summary.h
+++ b/fs/jffs2/summary.h
@@ -1,15 +1,13 @@
 /*
 * JFFS2 -- Journalling Flash File System, Version 2.
 *
- * Copyright (C) 2004  Ferenc Havasi <havasi@inf.u-szeged.hu>,
+ * Copyright © 2004  Ferenc Havasi <havasi@inf.u-szeged.hu>,
- *                     Zoltan Sogor <weth@inf.u-szeged.hu>,
+ *                   Zoltan Sogor <weth@inf.u-szeged.hu>,
- *                     Patrik Kluba <pajko@halom.u-szeged.hu>,
+ *                   Patrik Kluba <pajko@halom.u-szeged.hu>,
- *                     University of Szeged, Hungary
+ *                   University of Szeged, Hungary
 *
 * For licensing information, see the file 'LICENCE' in this directory.
 *
- * $Id: summary.h,v 1.2 2005/09/26 11:37:21 havasi Exp $
- *
 */
 #ifndef JFFS2_SUMMARY_H
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index cc7e8e71ad46..e51164a8a8d4 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -1,14 +1,12 @@
 /*
 * JFFS2 -- Journalling Flash File System, Version 2.
 *
- * Copyright (C) 2001-2003 Red Hat, Inc.
+ * Copyright © 2001-2007 Red Hat, Inc.
 *
 * Created by David Woodhouse <dwmw2@infradead.org>
 *
 * For licensing information, see the file 'LICENCE' in this directory.
 *
- * $Id: super.c,v 1.110 2005/11/07 11:14:42 gleixner Exp $
- *
 */
 #include <linux/kernel.h>
@@ -347,7 +345,7 @@ static int __init init_jffs2_fs(void)
 #ifdef CONFIG_JFFS2_SUMMARY
               " (SUMMARY) "
 #endif
-               " (C) 2001-2006 Red Hat, Inc.\n");
+               " © 2001-2006 Red Hat, Inc.\n");
        jffs2_inode_cachep = kmem_cache_create("jffs2_i",
                                             sizeof(struct jffs2_inode_info),
diff --git a/fs/jffs2/symlink.c b/fs/jffs2/symlink.c
index 7e4882c8a7ed..b7339c3b6ad9 100644
--- a/fs/jffs2/symlink.c
+++ b/fs/jffs2/symlink.c
@@ -1,17 +1,14 @@
 /*
 * JFFS2 -- Journalling Flash File System, Version 2.
 *
- * Copyright (C) 2001, 2002 Red Hat, Inc.
+ * Copyright © 2001-2007 Red Hat, Inc.
 *
 * Created by David Woodhouse <dwmw2@infradead.org>
 *
 * For licensing information, see the file 'LICENCE' in this directory.
 *
- * $Id: symlink.c,v 1.19 2005/11/07 11:14:42 gleixner Exp $
- *
 */
 #include <linux/kernel.h>
 #include <linux/slab.h>
 #include <linux/fs.h>
diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index 9c99859f5edd..c556e85a565c 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -1,16 +1,14 @@
 /*
 * JFFS2 -- Journalling Flash File System, Version 2.
 *
- * Copyright (C) 2001-2003 Red Hat, Inc.
+ * Copyright © 2001-2007 Red Hat, Inc.
- * Copyright (C) 2004 Thomas Gleixner <tglx@linutronix.de>
+ * Copyright © 2004 Thomas Gleixner <tglx@linutronix.de>
 *
 * Created by David Woodhouse <dwmw2@infradead.org>
 * Modified debugged and enhanced by Thomas Gleixner <tglx@linutronix.de>
 *
 * For licensing information, see the file 'LICENCE' in this directory.
 *
- * $Id: wbuf.c,v 1.100 2005/09/30 13:59:13 dedekind Exp $
- *
 */
 #include <linux/kernel.h>
@@ -238,7 +236,10 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c)
        jeb = &c->blocks[c->wbuf_ofs / c->sector_size];
        spin_lock(&c->erase_completion_lock);
-        jffs2_block_refile(c, jeb, REFILE_NOTEMPTY);
+        if (c->wbuf_ofs % c->mtd->erasesize)
+                jffs2_block_refile(c, jeb, REFILE_NOTEMPTY);
+        else
+                jffs2_block_refile(c, jeb, REFILE_ANYWAY);
        spin_unlock(&c->erase_completion_lock);
        BUG_ON(!ref_obsolete(jeb->last_node));
@@ -342,6 +343,9 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c)
                return;
        }
+        /* The summary is not recovered, so it must be disabled for this erase block */
+        jffs2_sum_disable_collecting(c->summary);
        ret = jffs2_prealloc_raw_node_refs(c, c->nextblock, nr_refile);
        if (ret) {
                printk(KERN_WARNING "Failed to allocate node refs for wbuf recovery. Data loss ensues.\n");
@@ -957,43 +961,48 @@ exit:
        return ret;
 }
-#define NR_OOB_SCAN_PAGES       4
+#define NR_OOB_SCAN_PAGES 4
+/* For historical reasons we use only 12 bytes for OOB clean marker */
+#define OOB_CM_SIZE 12
+static const struct jffs2_unknown_node oob_cleanmarker =
+{
+        .magic = constant_cpu_to_je16(JFFS2_MAGIC_BITMASK),
+        .nodetype = constant_cpu_to_je16(JFFS2_NODETYPE_CLEANMARKER),
+        .totlen = constant_cpu_to_je32(8)
+};
 /*
- * Check, if the out of band area is empty
+ * Check, if the out of band area is empty. This function knows about the clean
+ * marker and if it is present in OOB, treats the OOB as empty anyway.
 */
 int jffs2_check_oob_empty(struct jffs2_sb_info *c,
                          struct jffs2_eraseblock *jeb, int mode)
 {
-        int i, page, ret;
+        int i, ret;
-        int oobsize = c->mtd->oobsize;
+        int cmlen = min_t(int, c->oobavail, OOB_CM_SIZE);
        struct mtd_oob_ops ops;
-        ops.ooblen = NR_OOB_SCAN_PAGES * oobsize;
+        ops.mode = MTD_OOB_AUTO;
+        ops.ooblen = NR_OOB_SCAN_PAGES * c->oobavail;
        ops.oobbuf = c->oobbuf;
-        ops.ooboffs = 0;
+        ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0;
        ops.datbuf = NULL;
-        ops.mode = MTD_OOB_PLACE;
        ret = c->mtd->read_oob(c->mtd, jeb->offset, &ops);
-        if (ret) {
+        if (ret || ops.oobretlen != ops.ooblen) {
-                D1(printk(KERN_WARNING "jffs2_check_oob_empty(): Read OOB "
+                printk(KERN_ERR "cannot read OOB for EB at %08x, requested %zd"
-                          "failed %d for block at %08x\n", ret, jeb->offset));
+                                " bytes, read %zd bytes, error %d\n",
+                                jeb->offset, ops.ooblen, ops.oobretlen, ret);
+                if (!ret)
+                        ret = -EIO;
                return ret;
        }
-        if (ops.oobretlen < ops.ooblen) {
+        for(i = 0; i < ops.ooblen; i++) {
-                D1(printk(KERN_WARNING "jffs2_check_oob_empty(): Read OOB "
+                if (mode && i < cmlen)
-                          "returned short read (%zd bytes not %d) for block "
+                        /* Yeah, we know about the cleanmarker */
-                          "at %08x\n", ops.oobretlen, ops.ooblen, jeb->offset));
-                return -EIO;
-        }
-        /* Special check for first page */
-        for(i = 0; i < oobsize ; i++) {
-                /* Yeah, we know about the cleanmarker. */
-                if (mode && i >= c->fsdata_pos &&
-                    i < c->fsdata_pos + c->fsdata_len)
                        continue;
                if (ops.oobbuf[i] != 0xFF) {
@@ -1003,111 +1012,63 @@ int jffs2_check_oob_empty(struct jffs2_sb_info *c,
                }
        }
-        /* we know, we are aligned :) */
-        for (page = oobsize; page < ops.ooblen; page += sizeof(long)) {
-                long dat = *(long *)(&ops.oobbuf[page]);
-                if(dat != -1)
-                        return 1;
-        }
        return 0;
 }
 /*
- * Scan for a valid cleanmarker and for bad blocks
+ * Check for a valid cleanmarker.
+ * Returns: 0 if a valid cleanmarker was found
+ *          1 if no cleanmarker was found
+ *          negative error code if an error occurred
 */
-int jffs2_check_nand_cleanmarker (struct jffs2_sb_info *c,
+int jffs2_check_nand_cleanmarker(struct jffs2_sb_info *c,
-                                  struct jffs2_eraseblock *jeb)
+                                 struct jffs2_eraseblock *jeb)
 {
-        struct jffs2_unknown_node n;
        struct mtd_oob_ops ops;
-        int oobsize = c->mtd->oobsize;
+        int ret, cmlen = min_t(int, c->oobavail, OOB_CM_SIZE);
-        unsigned char *p,*b;
-        int i, ret;
-        size_t offset = jeb->offset;
-        /* Check first if the block is bad. */
-        if (c->mtd->block_isbad(c->mtd, offset)) {
-                D1 (printk(KERN_WARNING "jffs2_check_nand_cleanmarker()"
-                           ": Bad block at %08x\n", jeb->offset));
-                return 2;
-        }
-        ops.ooblen = oobsize;
+        ops.mode = MTD_OOB_AUTO;
+        ops.ooblen = cmlen;
        ops.oobbuf = c->oobbuf;
-        ops.ooboffs = 0;
+        ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0;
        ops.datbuf = NULL;
-        ops.mode = MTD_OOB_PLACE;
-        ret = c->mtd->read_oob(c->mtd, offset, &ops);
+        ret = c->mtd->read_oob(c->mtd, jeb->offset, &ops);
-        if (ret) {
+        if (ret || ops.oobretlen != ops.ooblen) {
-                D1 (printk(KERN_WARNING "jffs2_check_nand_cleanmarker(): "
+                printk(KERN_ERR "cannot read OOB for EB at %08x, requested %zd"
-                           "Read OOB failed %d for block at %08x\n",
+                                " bytes, read %zd bytes, error %d\n",
-                           ret, jeb->offset));
+                                jeb->offset, ops.ooblen, ops.oobretlen, ret);
+                if (!ret)
+                        ret = -EIO;
                return ret;
        }
-        if (ops.oobretlen < ops.ooblen) {
+        return !!memcmp(&oob_cleanmarker, c->oobbuf, cmlen);
-                D1 (printk (KERN_WARNING "jffs2_check_nand_cleanmarker(): "
-                            "Read OOB return short read (%zd bytes not %d) "
-                            "for block at %08x\n", ops.oobretlen, ops.ooblen,
-                            jeb->offset));
-                return -EIO;
-        }
-        n.magic = cpu_to_je16 (JFFS2_MAGIC_BITMASK);
-        n.nodetype = cpu_to_je16 (JFFS2_NODETYPE_CLEANMARKER);
-        n.totlen = cpu_to_je32 (8);
-        p = (unsigned char *) &n;
-        b = c->oobbuf + c->fsdata_pos;
-        for (i = c->fsdata_len; i; i--) {
-                if (*b++ != *p++)
-                        ret = 1;
-        }
-        D1(if (ret == 1) {
-                printk(KERN_WARNING "jffs2_check_nand_cleanmarker(): "
-                       "Cleanmarker node not detected in block at %08x\n",
-                       offset);
-                printk(KERN_WARNING "OOB at %08zx was ", offset);
-                for (i=0; i < oobsize; i++)
-                        printk("%02x ", c->oobbuf[i]);
-                printk("\n");
-        });
-        return ret;
 }
 int jffs2_write_nand_cleanmarker(struct jffs2_sb_info *c,
                                 struct jffs2_eraseblock *jeb)
 {
-        struct jffs2_unknown_node n;
+        int ret;
-        int     ret;
        struct mtd_oob_ops ops;
+        int cmlen = min_t(int, c->oobavail, OOB_CM_SIZE);
-        n.magic = cpu_to_je16(JFFS2_MAGIC_BITMASK);
+        ops.mode = MTD_OOB_AUTO;
-        n.nodetype = cpu_to_je16(JFFS2_NODETYPE_CLEANMARKER);
+        ops.ooblen = cmlen;
-        n.totlen = cpu_to_je32(8);
+        ops.oobbuf = (uint8_t *)&oob_cleanmarker;
+        ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0;
-        ops.ooblen = c->fsdata_len;
-        ops.oobbuf = (uint8_t *)&n;
-        ops.ooboffs = c->fsdata_pos;
        ops.datbuf = NULL;
-        ops.mode = MTD_OOB_PLACE;
        ret = c->mtd->write_oob(c->mtd, jeb->offset, &ops);
+        if (ret || ops.oobretlen != ops.ooblen) {
-        if (ret) {
+                printk(KERN_ERR "cannot write OOB for EB at %08x, requested %zd"
-                D1(printk(KERN_WARNING "jffs2_write_nand_cleanmarker(): "
+                                " bytes, read %zd bytes, error %d\n",
-                          "Write failed for block at %08x: error %d\n",
+                                jeb->offset, ops.ooblen, ops.oobretlen, ret);
-                          jeb->offset, ret));
+                if (!ret)
+                        ret = -EIO;
                return ret;
        }
-        if (ops.oobretlen != ops.ooblen) {
-                D1(printk(KERN_WARNING "jffs2_write_nand_cleanmarker(): "
-                          "Short write for block at %08x: %zd not %d\n",
-                          jeb->offset, ops.oobretlen, ops.ooblen));
-                return -EIO;
-        }
        return 0;
 }
@@ -1130,7 +1091,7 @@ int jffs2_write_nand_badblock(struct jffs2_sb_info *c, struct jffs2_eraseblock *
        if (!c->mtd->block_markbad)
                return 1; // What else can we do?
-        D1(printk(KERN_WARNING "jffs2_write_nand_badblock(): Marking bad block at %08x\n", bad_offset));
+        printk(KERN_WARNING "JFFS2: marking eraseblock at %08x\n as bad", bad_offset);
        ret = c->mtd->block_markbad(c->mtd, bad_offset);
        if (ret) {
@@ -1140,41 +1101,24 @@ int jffs2_write_nand_badblock(struct jffs2_sb_info *c, struct jffs2_eraseblock *
        return 1;
 }
-static int jffs2_nand_set_oobinfo(struct jffs2_sb_info *c)
+int jffs2_nand_flash_setup(struct jffs2_sb_info *c)
 {
        struct nand_ecclayout *oinfo = c->mtd->ecclayout;
-        /* Do this only, if we have an oob buffer */
        if (!c->mtd->oobsize)
                return 0;
        /* Cleanmarker is out-of-band, so inline size zero */
        c->cleanmarker_size = 0;
-        /* Should we use autoplacement ? */
+        if (!oinfo || oinfo->oobavail == 0) {
-        if (!oinfo) {
+                printk(KERN_ERR "inconsistent device description\n");
-                D1(printk(KERN_DEBUG "JFFS2 on NAND. No autoplacment info found\n"));
                return -EINVAL;
        }
-        D1(printk(KERN_DEBUG "JFFS2 using autoplace on NAND\n"));
+        D1(printk(KERN_DEBUG "JFFS2 using OOB on NAND\n"));
-        /* Get the position of the free bytes */
-        if (!oinfo->oobfree[0].length) {
-                printk (KERN_WARNING "jffs2_nand_set_oobinfo(): Eeep."
-                        " Autoplacement selected and no empty space in oob\n");
-                return -ENOSPC;
-        }
-        c->fsdata_pos = oinfo->oobfree[0].offset;
-        c->fsdata_len = oinfo->oobfree[0].length;
-        if (c->fsdata_len > 8)
-                c->fsdata_len = 8;
-        return 0;
+        c->oobavail = oinfo->oobavail;
-}
-int jffs2_nand_flash_setup(struct jffs2_sb_info *c)
-{
-        int res;
        /* Initialise write buffer */
        init_rwsem(&c->wbuf_sem);
@@ -1185,22 +1129,13 @@ int jffs2_nand_flash_setup(struct jffs2_sb_info *c)
        if (!c->wbuf)
                return -ENOMEM;
-        c->oobbuf = kmalloc(NR_OOB_SCAN_PAGES * c->mtd->oobsize, GFP_KERNEL);
+        c->oobbuf = kmalloc(NR_OOB_SCAN_PAGES * c->oobavail, GFP_KERNEL);
-        if (!c->oobbuf)
+        if (!c->oobbuf) {
-                return -ENOMEM;
-        res = jffs2_nand_set_oobinfo(c);
-#ifdef BREAKME
-        if (!brokenbuf)
-                brokenbuf = kmalloc(c->wbuf_pagesize, GFP_KERNEL);
-        if (!brokenbuf) {
                kfree(c->wbuf);
                return -ENOMEM;
        }
-        memset(brokenbuf, 0xdb, c->wbuf_pagesize);
-#endif
+        return 0;
-        return res;
 }
 void jffs2_nand_flash_cleanup(struct jffs2_sb_info *c)
@@ -1274,3 +1209,27 @@ int jffs2_nor_wbuf_flash_setup(struct jffs2_sb_info *c) {
 void jffs2_nor_wbuf_flash_cleanup(struct jffs2_sb_info *c) {
        kfree(c->wbuf);
 }
+int jffs2_ubivol_setup(struct jffs2_sb_info *c) {
+        c->cleanmarker_size = 0;
+        if (c->mtd->writesize == 1)
+                /* We do not need write-buffer */
+                return 0;
+        init_rwsem(&c->wbuf_sem);
+        c->wbuf_pagesize =  c->mtd->writesize;
+        c->wbuf_ofs = 0xFFFFFFFF;
+        c->wbuf = kmalloc(c->wbuf_pagesize, GFP_KERNEL);
+        if (!c->wbuf)
+                return -ENOMEM;
+        printk(KERN_INFO "JFFS2 write-buffering enabled buffer (%d) erasesize (%d)\n", c->wbuf_pagesize, c->sector_size);
+        return 0;
+}
+void jffs2_ubivol_cleanup(struct jffs2_sb_info *c) {
+        kfree(c->wbuf);
+}
diff --git a/fs/jffs2/write.c b/fs/jffs2/write.c
index 67176792e138..c9fe0ab3a329 100644
--- a/fs/jffs2/write.c
+++ b/fs/jffs2/write.c
@@ -1,14 +1,12 @@
 /*
 * JFFS2 -- Journalling Flash File System, Version 2.
 *
- * Copyright (C) 2001-2003 Red Hat, Inc.
+ * Copyright © 2001-2007 Red Hat, Inc.
 *
 * Created by David Woodhouse <dwmw2@infradead.org>
 *
 * For licensing information, see the file 'LICENCE' in this directory.
 *
- * $Id: write.c,v 1.97 2005/11/07 11:14:42 gleixner Exp $
- *
 */
 #include <linux/kernel.h>
@@ -507,8 +505,7 @@ int jffs2_do_unlink(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f,
        uint32_t alloclen;
        int ret;
-        if (1 /* alternative branch needs testing */ ||
+        if (!jffs2_can_mark_obsolete(c)) {
-            !jffs2_can_mark_obsolete(c)) {
                /* We can't mark stuff obsolete on the medium. We need to write a deletion dirent */
                rd = jffs2_alloc_raw_dirent();
diff --git a/fs/jffs2/writev.c b/fs/jffs2/writev.c
index c638ae1008de..b9276b11bac6 100644
--- a/fs/jffs2/writev.c
+++ b/fs/jffs2/writev.c
@@ -1,14 +1,12 @@
 /*
 * JFFS2 -- Journalling Flash File System, Version 2.
 *
- * Copyright (C) 2001, 2002 Red Hat, Inc.
+ * Copyright © 2001-2007 Red Hat, Inc.
 *
 * Created by David Woodhouse <dwmw2@infradead.org>
 *
 * For licensing information, see the file 'LICENCE' in this directory.
 *
- * $Id: writev.c,v 1.8 2005/09/09 15:11:58 havasi Exp $
- *
 */
 #include <linux/kernel.h>
diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c
index 4bb3f1897330..78fc08893a6c 100644
--- a/fs/jffs2/xattr.c
+++ b/fs/jffs2/xattr.c
@@ -1,13 +1,14 @@
 /*
 * JFFS2 -- Journalling Flash File System, Version 2.
 *
- * Copyright (C) 2006  NEC Corporation
+ * Copyright © 2006  NEC Corporation
 *
 * Created by KaiGai Kohei <kaigai@ak.jp.nec.com>
 *
 * For licensing information, see the file 'LICENCE' in this directory.
 *
 */
 #include <linux/kernel.h>
 #include <linux/slab.h>
 #include <linux/fs.h>
diff --git a/fs/jffs2/xattr.h b/fs/jffs2/xattr.h
index 06a5c69dcf8b..3b0ff2925937 100644
--- a/fs/jffs2/xattr.h
+++ b/fs/jffs2/xattr.h
@@ -1,13 +1,14 @@
 /*
 * JFFS2 -- Journalling Flash File System, Version 2.
 *
- * Copyright (C) 2006  NEC Corporation
+ * Copyright © 2006  NEC Corporation
 *
 * Created by KaiGai Kohei <kaigai@ak.jp.nec.com>
 *
 * For licensing information, see the file 'LICENCE' in this directory.
 *
 */
 #ifndef _JFFS2_FS_XATTR_H_
 #define _JFFS2_FS_XATTR_H_
diff --git a/fs/jffs2/xattr_trusted.c b/fs/jffs2/xattr_trusted.c
index ed046e19dbfa..8ec5765ef348 100644
--- a/fs/jffs2/xattr_trusted.c
+++ b/fs/jffs2/xattr_trusted.c
@@ -1,13 +1,14 @@
 /*
 * JFFS2 -- Journalling Flash File System, Version 2.
 *
- * Copyright (C) 2006  NEC Corporation
+ * Copyright © 2006  NEC Corporation
 *
 * Created by KaiGai Kohei <kaigai@ak.jp.nec.com>
 *
 * For licensing information, see the file 'LICENCE' in this directory.
 *
 */
 #include <linux/kernel.h>
 #include <linux/fs.h>
 #include <linux/jffs2.h>
diff --git a/fs/jffs2/xattr_user.c b/fs/jffs2/xattr_user.c
index 2f8e9aa01ea0..40942bc516bb 100644
--- a/fs/jffs2/xattr_user.c
+++ b/fs/jffs2/xattr_user.c
@@ -1,13 +1,14 @@
 /*
 * JFFS2 -- Journalling Flash File System, Version 2.
 *
- * Copyright (C) 2006  NEC Corporation
+ * Copyright © 2006  NEC Corporation
 *
 * Created by KaiGai Kohei <kaigai@ak.jp.nec.com>
 *
 * For licensing information, see the file 'LICENCE' in this directory.
 *
 */
 #include <linux/kernel.h>
 #include <linux/fs.h>
 #include <linux/jffs2.h>
diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c
index 6988a1082f58..03893acbfda4 100644
--- a/fs/jfs/jfs_txnmgr.c
+++ b/fs/jfs/jfs_txnmgr.c
@@ -1919,7 +1919,8 @@ static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
         * header ?
         */
        if (tlck->type & tlckTRUNCATE) {
-                pxd_t pxd;      /* truncated extent of xad */
+                /* This odd declaration suppresses a bogus gcc warning */
+                pxd_t pxd = pxd;        /* truncated extent of xad */
                int twm;
                /*
diff --git a/fs/libfs.c b/fs/libfs.c
index 7d487047dbb8..d93842d3c0a0 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -190,6 +190,10 @@ const struct inode_operations simple_dir_inode_operations = {
        .lookup         = simple_lookup,
 };
+static const struct super_operations simple_super_operations = {
+        .statfs         = simple_statfs,
+};
 /*
 * Common helper for pseudo-filesystems (sockfs, pipefs, bdev - stuff that
 * will never be mountable)
@@ -199,7 +203,6 @@ int get_sb_pseudo(struct file_system_type *fs_type, char *name,
        struct vfsmount *mnt)
 {
        struct super_block *s = sget(fs_type, NULL, set_anon_super, NULL);
-        static const struct super_operations default_ops = {.statfs = simple_statfs};
        struct dentry *dentry;
        struct inode *root;
        struct qstr d_name = {.name = name, .len = strlen(name)};
@@ -212,7 +215,7 @@ int get_sb_pseudo(struct file_system_type *fs_type, char *name,
        s->s_blocksize = 1024;
        s->s_blocksize_bits = 10;
        s->s_magic = magic;
-        s->s_op = ops ? ops : &default_ops;
+        s->s_op = ops ? ops : &simple_super_operations;
        s->s_time_gran = 1;
        root = new_inode(s);
        if (!root)
@@ -335,17 +338,18 @@ int simple_prepare_write(struct file *file, struct page *page,
                        flush_dcache_page(page);
                        kunmap_atomic(kaddr, KM_USER0);
                }
-                SetPageUptodate(page);
        }
        return 0;
 }
 int simple_commit_write(struct file *file, struct page *page,
-                        unsigned offset, unsigned to)
+                        unsigned from, unsigned to)
 {
        struct inode *inode = page->mapping->host;
        loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
+        if (!PageUptodate(page))
+                SetPageUptodate(page);
        /*
         * No need to use i_size_read() here, the i_size
         * cannot change under us because we hold the i_mutex.
@@ -358,7 +362,6 @@ int simple_commit_write(struct file *file, struct page *page,
 int simple_fill_super(struct super_block *s, int magic, struct tree_descr *files)
 {
-        static struct super_operations s_ops = {.statfs = simple_statfs};
        struct inode *inode;
        struct dentry *root;
        struct dentry *dentry;
@@ -367,7 +370,7 @@ int simple_fill_super(struct super_block *s, int magic, struct tree_descr *files
        s->s_blocksize = PAGE_CACHE_SIZE;
        s->s_blocksize_bits = PAGE_CACHE_SHIFT;
        s->s_magic = magic;
-        s->s_op = &s_ops;
+        s->s_op = &simple_super_operations;
        s->s_time_gran = 1;
        inode = new_inode(s);
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 50cb8daba4e5..126b1bf02c0e 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -440,7 +440,7 @@ static ctl_table nlm_sysctl_root[] = {
 };
 /*
- * Module (and driverfs) parameters.
+ * Module (and sysfs) parameters.
 */
 #define param_set_min_max(name, type, which_strtol, min, max)           \
diff --git a/fs/namei.c b/fs/namei.c
index ee60cc4d3453..880052cadbcd 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1243,22 +1243,13 @@ int __user_path_lookup_open(const char __user *name, unsigned int lookup_flags,
        return err;
 }
-/*
+static inline struct dentry *__lookup_hash_kern(struct qstr *name, struct dentry *base, struct nameidata *nd)
- * Restricted form of lookup. Doesn't follow links, single-component only,
- * needs parent already locked. Doesn't follow mounts.
- * SMP-safe.
- */
-static struct dentry * __lookup_hash(struct qstr *name, struct dentry * base, struct nameidata *nd)
 {
-        struct dentry * dentry;
+        struct dentry *dentry;
        struct inode *inode;
        int err;
        inode = base->d_inode;
-        err = permission(inode, MAY_EXEC, nd);
-        dentry = ERR_PTR(err);
-        if (err)
-                goto out;
        /*
         * See if the low-level filesystem might want
@@ -1287,35 +1278,76 @@ out:
        return dentry;
 }
+/*
+ * Restricted form of lookup. Doesn't follow links, single-component only,
+ * needs parent already locked. Doesn't follow mounts.
+ * SMP-safe.
+ */
+static inline struct dentry * __lookup_hash(struct qstr *name, struct dentry *base, struct nameidata *nd)
+{
+        struct dentry *dentry;
+        struct inode *inode;
+        int err;
+        inode = base->d_inode;
+        err = permission(inode, MAY_EXEC, nd);
+        dentry = ERR_PTR(err);
+        if (err)
+                goto out;
+        dentry = __lookup_hash_kern(name, base, nd);
+out:
+        return dentry;
+}
 static struct dentry *lookup_hash(struct nameidata *nd)
 {
        return __lookup_hash(&nd->last, nd->dentry, nd);
 }
 /* SMP-safe */
-struct dentry * lookup_one_len(const char * name, struct dentry * base, int len)
+static inline int __lookup_one_len(const char *name, struct qstr *this, struct dentry *base, int len)
 {
        unsigned long hash;
-        struct qstr this;
        unsigned int c;
-        this.name = name;
+        this->name = name;
-        this.len = len;
+        this->len = len;
        if (!len)
-                goto access;
+                return -EACCES;
        hash = init_name_hash();
        while (len--) {
                c = *(const unsigned char *)name++;
                if (c == '/' || c == '\0')
-                        goto access;
+                        return -EACCES;
                hash = partial_name_hash(c, hash);
        }
-        this.hash = end_name_hash(hash);
+        this->hash = end_name_hash(hash);
+        return 0;
+}
+struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
+{
+        int err;
+        struct qstr this;
+        err = __lookup_one_len(name, &this, base, len);
+        if (err)
+                return ERR_PTR(err);
        return __lookup_hash(&this, base, NULL);
-access:
+}
-        return ERR_PTR(-EACCES);
+struct dentry *lookup_one_len_kern(const char *name, struct dentry *base, int len)
+{
+        int err;
+        struct qstr this;
+        err = __lookup_one_len(name, &this, base, len);
+        if (err)
+                return ERR_PTR(err);
+        return __lookup_hash_kern(&this, base, NULL);
 }
 /*
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index 14939ddf74f1..7285c94956c4 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -576,6 +576,12 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
        server->packet = vmalloc(NCP_PACKET_SIZE);
        if (server->packet == NULL)
                goto out_nls;
+        server->txbuf = vmalloc(NCP_PACKET_SIZE);
+        if (server->txbuf == NULL)
+                goto out_packet;
+        server->rxbuf = vmalloc(NCP_PACKET_SIZE);
+        if (server->rxbuf == NULL)
+                goto out_txbuf;
        sock->sk->sk_data_ready   = ncp_tcp_data_ready;
        sock->sk->sk_error_report = ncp_tcp_error_report;
@@ -597,7 +603,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
        error = ncp_connect(server);
        ncp_unlock_server(server);
        if (error < 0)
-                goto out_packet;
+                goto out_rxbuf;
        DPRINTK("ncp_fill_super: NCP_SBP(sb) = %x\n", (int) NCP_SBP(sb));
        error = -EMSGSIZE;      /* -EREMOTESIDEINCOMPATIBLE */
@@ -666,8 +672,12 @@ out_disconnect:
        ncp_lock_server(server);
        ncp_disconnect(server);
        ncp_unlock_server(server);
-out_packet:
+out_rxbuf:
        ncp_stop_tasks(server);
+        vfree(server->rxbuf);
+out_txbuf:
+        vfree(server->txbuf);
+out_packet:
        vfree(server->packet);
 out_nls:
 #ifdef CONFIG_NCPFS_NLS
@@ -723,6 +733,8 @@ static void ncp_put_super(struct super_block *sb)
        kfree(server->priv.data);
        kfree(server->auth.object_name);
+        vfree(server->rxbuf);
+        vfree(server->txbuf);
        vfree(server->packet);
        sb->s_fs_info = NULL;
        kfree(server);
diff --git a/fs/ncpfs/sock.c b/fs/ncpfs/sock.c
index e496d8b65e92..e37df8d5fe70 100644
--- a/fs/ncpfs/sock.c
+++ b/fs/ncpfs/sock.c
@@ -14,6 +14,7 @@
 #include <linux/socket.h>
 #include <linux/fcntl.h>
 #include <linux/stat.h>
+#include <linux/string.h>
 #include <asm/uaccess.h>
 #include <linux/in.h>
 #include <linux/net.h>
@@ -55,10 +56,11 @@ static int _send(struct socket *sock, const void *buff, int len)
 struct ncp_request_reply {
        struct list_head req;
        wait_queue_head_t wq;
-        struct ncp_reply_header* reply_buf;
+        atomic_t refs;
+        unsigned char* reply_buf;
        size_t datalen;
        int result;
-        enum { RQ_DONE, RQ_INPROGRESS, RQ_QUEUED, RQ_IDLE } status;
+        enum { RQ_DONE, RQ_INPROGRESS, RQ_QUEUED, RQ_IDLE, RQ_ABANDONED } status;
        struct kvec* tx_ciov;
        size_t tx_totallen;
        size_t tx_iovlen;
@@ -67,6 +69,32 @@ struct ncp_request_reply {
        u_int32_t sign[6];
 };
+static inline struct ncp_request_reply* ncp_alloc_req(void)
+{
+        struct ncp_request_reply *req;
+        req = kmalloc(sizeof(struct ncp_request_reply), GFP_KERNEL);
+        if (!req)
+                return NULL;
+        init_waitqueue_head(&req->wq);
+        atomic_set(&req->refs, (1));
+        req->status = RQ_IDLE;
+        return req;
+}
+static void ncp_req_get(struct ncp_request_reply *req)
+{
+        atomic_inc(&req->refs);
+}
+static void ncp_req_put(struct ncp_request_reply *req)
+{
+        if (atomic_dec_and_test(&req->refs))
+                kfree(req);
+}
 void ncp_tcp_data_ready(struct sock *sk, int len)
 {
        struct ncp_server *server = sk->sk_user_data;
@@ -101,14 +129,17 @@ void ncpdgram_timeout_call(unsigned long v)
        schedule_work(&server->timeout_tq);
 }
-static inline void ncp_finish_request(struct ncp_request_reply *req, int result)
+static inline void ncp_finish_request(struct ncp_server *server, struct ncp_request_reply *req, int result)
 {
        req->result = result;
+        if (req->status != RQ_ABANDONED)
+                memcpy(req->reply_buf, server->rxbuf, req->datalen);
        req->status = RQ_DONE;
        wake_up_all(&req->wq);
+        ncp_req_put(req);
 }
-static void __abort_ncp_connection(struct ncp_server *server, struct ncp_request_reply *aborted, int err)
+static void __abort_ncp_connection(struct ncp_server *server)
 {
        struct ncp_request_reply *req;
@@ -118,31 +149,19 @@ static void __abort_ncp_connection(struct ncp_server *server, struct ncp_request
                req = list_entry(server->tx.requests.next, struct ncp_request_reply, req);
                
                list_del_init(&req->req);
-                if (req == aborted) {
+                ncp_finish_request(server, req, -EIO);
-                        ncp_finish_request(req, err);
-                } else {
-                        ncp_finish_request(req, -EIO);
-                }
        }
        req = server->rcv.creq;
        if (req) {
                server->rcv.creq = NULL;
-                if (req == aborted) {
+                ncp_finish_request(server, req, -EIO);
-                        ncp_finish_request(req, err);
-                } else {
-                        ncp_finish_request(req, -EIO);
-                }
                server->rcv.ptr = NULL;
                server->rcv.state = 0;
        }
        req = server->tx.creq;
        if (req) {
                server->tx.creq = NULL;
-                if (req == aborted) {
+                ncp_finish_request(server, req, -EIO);
-                        ncp_finish_request(req, err);
-                } else {
-                        ncp_finish_request(req, -EIO);
-                }
        }
 }
@@ -160,10 +179,12 @@ static inline void __ncp_abort_request(struct ncp_server *server, struct ncp_req
                        break;
                case RQ_QUEUED:
                        list_del_init(&req->req);
-                        ncp_finish_request(req, err);
+                        ncp_finish_request(server, req, err);
                        break;
                case RQ_INPROGRESS:
-                        __abort_ncp_connection(server, req, err);
+                        req->status = RQ_ABANDONED;
+                        break;
+                case RQ_ABANDONED:
                        break;
        }
 }
@@ -177,7 +198,7 @@ static inline void ncp_abort_request(struct ncp_server *server, struct ncp_reque
 static inline void __ncptcp_abort(struct ncp_server *server)
 {
-        __abort_ncp_connection(server, NULL, 0);
+        __abort_ncp_connection(server);
 }
 static int ncpdgram_send(struct socket *sock, struct ncp_request_reply *req)
@@ -294,6 +315,11 @@ static void ncptcp_start_request(struct ncp_server *server, struct ncp_request_r
 static inline void __ncp_start_request(struct ncp_server *server, struct ncp_request_reply *req)
 {
+        /* we copy the data so that we do not depend on the caller
+           staying alive */
+        memcpy(server->txbuf, req->tx_iov[1].iov_base, req->tx_iov[1].iov_len);
+        req->tx_iov[1].iov_base = server->txbuf;
        if (server->ncp_sock->type == SOCK_STREAM)
                ncptcp_start_request(server, req);
        else
@@ -308,6 +334,7 @@ static int ncp_add_request(struct ncp_server *server, struct ncp_request_reply *
                printk(KERN_ERR "ncpfs: tcp: Server died\n");
                return -EIO;
        }
+        ncp_req_get(req);
        if (server->tx.creq || server->rcv.creq) {
                req->status = RQ_QUEUED;
                list_add_tail(&req->req, &server->tx.requests);
@@ -409,7 +436,7 @@ void ncpdgram_rcv_proc(struct work_struct *work)
                                        server->timeout_last = NCP_MAX_RPC_TIMEOUT;
                                        mod_timer(&server->timeout_tm, jiffies + NCP_MAX_RPC_TIMEOUT);
                                } else if (reply.type == NCP_REPLY) {
-                                        result = _recv(sock, (void*)req->reply_buf, req->datalen, MSG_DONTWAIT);
+                                        result = _recv(sock, server->rxbuf, req->datalen, MSG_DONTWAIT);
 #ifdef CONFIG_NCPFS_PACKET_SIGNING
                                        if (result >= 0 && server->sign_active && req->tx_type != NCP_DEALLOC_SLOT_REQUEST) {
                                                if (result < 8 + 8) {
@@ -419,7 +446,7 @@ void ncpdgram_rcv_proc(struct work_struct *work)
                                                        
                                                        result -= 8;
                                                        hdrl = sock->sk->sk_family == AF_INET ? 8 : 6;
-                                                        if (sign_verify_reply(server, ((char*)req->reply_buf) + hdrl, result - hdrl, cpu_to_le32(result), ((char*)req->reply_buf) + result)) {
+                                                        if (sign_verify_reply(server, server->rxbuf + hdrl, result - hdrl, cpu_to_le32(result), server->rxbuf + result)) {
                                                                printk(KERN_INFO "ncpfs: Signature violation\n");
                                                                result = -EIO;
                                                        }
@@ -428,7 +455,7 @@ void ncpdgram_rcv_proc(struct work_struct *work)
 #endif
                                        del_timer(&server->timeout_tm);
                                        server->rcv.creq = NULL;
-                                        ncp_finish_request(req, result);
+                                        ncp_finish_request(server, req, result);
                                        __ncp_next_request(server);
                                        mutex_unlock(&server->rcv.creq_mutex);
                                        continue;
@@ -478,12 +505,6 @@ void ncpdgram_timeout_proc(struct work_struct *work)
        mutex_unlock(&server->rcv.creq_mutex);
 }
-static inline void ncp_init_req(struct ncp_request_reply* req)
-{
-        init_waitqueue_head(&req->wq);
-        req->status = RQ_IDLE;
-}
 static int do_tcp_rcv(struct ncp_server *server, void *buffer, size_t len)
 {
        int result;
@@ -601,8 +622,8 @@ skipdata:;
                                        goto skipdata;
                                }
                                req->datalen = datalen - 8;
-                                req->reply_buf->type = NCP_REPLY;
+                                ((struct ncp_reply_header*)server->rxbuf)->type = NCP_REPLY;
-                                server->rcv.ptr = (unsigned char*)(req->reply_buf) + 2;
+                                server->rcv.ptr = server->rxbuf + 2;
                                server->rcv.len = datalen - 10;
                                server->rcv.state = 1;
                                break;
@@ -615,12 +636,12 @@ skipdata:;
                        case 1:
                                req = server->rcv.creq;
                                if (req->tx_type != NCP_ALLOC_SLOT_REQUEST) {
-                                        if (req->reply_buf->sequence != server->sequence) {
+                                        if (((struct ncp_reply_header*)server->rxbuf)->sequence != server->sequence) {
                                                printk(KERN_ERR "ncpfs: tcp: Bad sequence number\n");
                                                __ncp_abort_request(server, req, -EIO);
                                                return -EIO;
                                        }
-                                        if ((req->reply_buf->conn_low | (req->reply_buf->conn_high << 8)) != server->connection) {
+                                        if ((((struct ncp_reply_header*)server->rxbuf)->conn_low | (((struct ncp_reply_header*)server->rxbuf)->conn_high << 8)) != server->connection) {
                                                printk(KERN_ERR "ncpfs: tcp: Connection number mismatch\n");
                                                __ncp_abort_request(server, req, -EIO);
                                                return -EIO;
@@ -628,14 +649,14 @@ skipdata:;
                                }
 #ifdef CONFIG_NCPFS_PACKET_SIGNING                              
                                if (server->sign_active && req->tx_type != NCP_DEALLOC_SLOT_REQUEST) {
-                                        if (sign_verify_reply(server, (unsigned char*)(req->reply_buf) + 6, req->datalen - 6, cpu_to_be32(req->datalen + 16), &server->rcv.buf.type)) {
+                                        if (sign_verify_reply(server, server->rxbuf + 6, req->datalen - 6, cpu_to_be32(req->datalen + 16), &server->rcv.buf.type)) {
                                                printk(KERN_ERR "ncpfs: tcp: Signature violation\n");
                                                __ncp_abort_request(server, req, -EIO);
                                                return -EIO;
                                        }
                                }
 #endif                          
-                                ncp_finish_request(req, req->datalen);
+                                ncp_finish_request(server, req, req->datalen);
                        nextreq:;
                                __ncp_next_request(server);
                        case 2:
@@ -645,7 +666,7 @@ skipdata:;
                                server->rcv.state = 0;
                                break;
                        case 3:
-                                ncp_finish_request(server->rcv.creq, -EIO);
+                                ncp_finish_request(server, server->rcv.creq, -EIO);
                                goto nextreq;
                        case 5:
                                info_server(server, 0, server->unexpected_packet.data, server->unexpected_packet.len);
@@ -675,28 +696,39 @@ void ncp_tcp_tx_proc(struct work_struct *work)
 }
 static int do_ncp_rpc_call(struct ncp_server *server, int size,
-                struct ncp_reply_header* reply_buf, int max_reply_size)
+                unsigned char* reply_buf, int max_reply_size)
 {
        int result;
-        struct ncp_request_reply req;
+        struct ncp_request_reply *req;
-        ncp_init_req(&req);
+        req = ncp_alloc_req();
-        req.reply_buf = reply_buf;
+        if (!req)
-        req.datalen = max_reply_size;
+                return -ENOMEM;
-        req.tx_iov[1].iov_base = server->packet;
-        req.tx_iov[1].iov_len = size;
+        req->reply_buf = reply_buf;
-        req.tx_iovlen = 1;
+        req->datalen = max_reply_size;
-        req.tx_totallen = size;
+        req->tx_iov[1].iov_base = server->packet;
-        req.tx_type = *(u_int16_t*)server->packet;
+        req->tx_iov[1].iov_len = size;
+        req->tx_iovlen = 1;
-        result = ncp_add_request(server, &req);
+        req->tx_totallen = size;
-        if (result < 0) {
+        req->tx_type = *(u_int16_t*)server->packet;
-                return result;
-        }
+        result = ncp_add_request(server, req);
-        if (wait_event_interruptible(req.wq, req.status == RQ_DONE)) {
+        if (result < 0)
-                ncp_abort_request(server, &req, -EIO);
+                goto out;
+        if (wait_event_interruptible(req->wq, req->status == RQ_DONE)) {
+                ncp_abort_request(server, req, -EINTR);
+                result = -EINTR;
+                goto out;
        }
-        return req.result;
+        result = req->result;
+out:
+        ncp_req_put(req);
+        return result;
 }
 /*
@@ -751,11 +783,6 @@ static int ncp_do_request(struct ncp_server *server, int size,
        DDPRINTK("do_ncp_rpc_call returned %d\n", result);
-        if (result < 0) {
-                /* There was a problem with I/O, so the connections is
-                 * no longer usable. */
-                ncp_invalidate_conn(server);
-        }
        return result;
 }
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 92d8ec859e22..cd3469720cbf 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1684,7 +1684,8 @@ go_ahead:
         * ... prune child dentries and writebacks if needed.
         */
        if (atomic_read(&old_dentry->d_count) > 1) {
-                nfs_wb_all(old_inode);
+                if (S_ISREG(old_inode->i_mode))
+                        nfs_wb_all(old_inode);
                shrink_dcache_parent(old_dentry);
        }
        nfs_inode_return_delegation(old_inode);
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index b1c98ea39b72..2877744cb606 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -432,10 +432,10 @@ static void nfs_direct_commit_result(struct rpc_task *task, void *calldata)
        if (NFS_PROTO(data->inode)->commit_done(task, data) != 0)
                return;
        if (unlikely(task->tk_status < 0)) {
-                dreq->error = task->tk_status;
+                dprintk("NFS: %5u commit failed with error %d.\n",
+                                task->tk_pid, task->tk_status);
                dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
-        }
+        } else if (memcmp(&dreq->verf, &data->verf, sizeof(data->verf))) {
-        if (memcmp(&dreq->verf, &data->verf, sizeof(data->verf))) {
                dprintk("NFS: %5u commit verify failed\n", task->tk_pid);
                dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
        }
@@ -531,9 +531,12 @@ static void nfs_direct_write_result(struct rpc_task *task, void *calldata)
        spin_lock(&dreq->lock);
+        if (unlikely(dreq->error != 0))
+                goto out_unlock;
        if (unlikely(status < 0)) {
+                /* An error has occurred, so we should not commit */
+                dreq->flags = 0;
                dreq->error = status;
-                goto out_unlock;
        }
        dreq->count += data->res.count;
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index af53c02f473b..44aa9b726573 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -341,8 +341,10 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
        lock_kernel();
        nfs_begin_data_update(inode);
        /* Write all dirty data */
-        filemap_write_and_wait(inode->i_mapping);
+        if (S_ISREG(inode->i_mode)) {
-        nfs_wb_all(inode);
+                filemap_write_and_wait(inode->i_mapping);
+                nfs_wb_all(inode);
+        }
        /*
         * Return any delegations if we're going to change ACLs
         */
@@ -429,7 +431,8 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
        int err;
        /* Flush out writes to the server in order to update c/mtime */
-        nfs_sync_mapping_range(inode->i_mapping, 0, 0, FLUSH_NOCOMMIT);
+        if (S_ISREG(inode->i_mode))
+                nfs_sync_mapping_range(inode->i_mapping, 0, 0, FLUSH_NOCOMMIT);
        /*
         * We may force a getattr if the user cares about atime.
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index bb516a2cfbaf..f1eae44b9a1a 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -151,10 +151,10 @@ int __init register_nfs_fs(void)
        if (ret < 0)
                goto error_0;
-#ifdef CONFIG_NFS_V4
        ret = nfs_register_sysctl();
        if (ret < 0)
                goto error_1;
+#ifdef CONFIG_NFS_V4
        ret = register_filesystem(&nfs4_fs_type);
        if (ret < 0)
                goto error_2;
@@ -165,9 +165,9 @@ int __init register_nfs_fs(void)
 #ifdef CONFIG_NFS_V4
 error_2:
        nfs_unregister_sysctl();
+#endif
 error_1:
        unregister_filesystem(&nfs_fs_type);
-#endif
 error_0:
        return ret;
 }
diff --git a/fs/nfs/sysctl.c b/fs/nfs/sysctl.c
index fcdcafbb3293..b62481dabae9 100644
--- a/fs/nfs/sysctl.c
+++ b/fs/nfs/sysctl.c
@@ -50,6 +50,14 @@ static ctl_table nfs_cb_sysctls[] = {
                .proc_handler   = &proc_dointvec_jiffies,
                .strategy       = &sysctl_jiffies,
        },
+        {
+                .ctl_name       = CTL_UNNUMBERED,
+                .procname       = "nfs_congestion_kb",
+                .data           = &nfs_congestion_kb,
+                .maxlen         = sizeof(nfs_congestion_kb),
+                .mode           = 0644,
+                .proc_handler   = &proc_dointvec,
+        },
        { .ctl_name = 0 }
 };
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index febdade91670..797558941745 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -12,6 +12,7 @@
 #include <linux/pagemap.h>
 #include <linux/file.h>
 #include <linux/writeback.h>
+#include <linux/swap.h>
 #include <linux/sunrpc/clnt.h>
 #include <linux/nfs_fs.h>
@@ -37,8 +38,6 @@
 static struct nfs_page * nfs_update_request(struct nfs_open_context*,
                                            struct page *,
                                            unsigned int, unsigned int);
-static void nfs_mark_request_dirty(struct nfs_page *req);
-static int nfs_wait_on_write_congestion(struct address_space *, int);
 static long nfs_flush_mapping(struct address_space *mapping, struct writeback_control *wbc, int how);
 static const struct rpc_call_ops nfs_write_partial_ops;
 static const struct rpc_call_ops nfs_write_full_ops;
@@ -48,8 +47,6 @@ static struct kmem_cache *nfs_wdata_cachep;
 static mempool_t *nfs_wdata_mempool;
 static mempool_t *nfs_commit_mempool;
-static DECLARE_WAIT_QUEUE_HEAD(nfs_write_congestion);
 struct nfs_write_data *nfs_commit_alloc(void)
 {
        struct nfs_write_data *p = mempool_alloc(nfs_commit_mempool, GFP_NOFS);
@@ -211,6 +208,43 @@ static int wb_priority(struct writeback_control *wbc)
 }
 /*
+ * NFS congestion control
+ */
+int nfs_congestion_kb;
+#define NFS_CONGESTION_ON_THRESH        (nfs_congestion_kb >> (PAGE_SHIFT-10))
+#define NFS_CONGESTION_OFF_THRESH       \
+        (NFS_CONGESTION_ON_THRESH - (NFS_CONGESTION_ON_THRESH >> 2))
+static int nfs_set_page_writeback(struct page *page)
+{
+        int ret = test_set_page_writeback(page);
+        if (!ret) {
+                struct inode *inode = page->mapping->host;
+                struct nfs_server *nfss = NFS_SERVER(inode);
+                if (atomic_inc_return(&nfss->writeback) >
+                                NFS_CONGESTION_ON_THRESH)
+                        set_bdi_congested(&nfss->backing_dev_info, WRITE);
+        }
+        return ret;
+}
+static void nfs_end_page_writeback(struct page *page)
+{
+        struct inode *inode = page->mapping->host;
+        struct nfs_server *nfss = NFS_SERVER(inode);
+        end_page_writeback(page);
+        if (atomic_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH) {
+                clear_bdi_congested(&nfss->backing_dev_info, WRITE);
+                congestion_end(WRITE);
+        }
+}
+/*
 * Find an associated nfs write request, and prepare to flush it out
 * Returns 1 if there was no write request, or if the request was
 * already tagged by nfs_set_page_dirty.Returns 0 if the request
@@ -220,7 +254,8 @@ static int wb_priority(struct writeback_control *wbc)
 static int nfs_page_mark_flush(struct page *page)
 {
        struct nfs_page *req;
-        spinlock_t *req_lock = &NFS_I(page->mapping->host)->req_lock;
+        struct nfs_inode *nfsi = NFS_I(page->mapping->host);
+        spinlock_t *req_lock = &nfsi->req_lock;
        int ret;
        spin_lock(req_lock);
@@ -244,11 +279,23 @@ static int nfs_page_mark_flush(struct page *page)
                        return ret;
                spin_lock(req_lock);
        }
-        spin_unlock(req_lock);
+        if (test_bit(PG_NEED_COMMIT, &req->wb_flags)) {
-        if (test_and_set_bit(PG_FLUSHING, &req->wb_flags) == 0) {
+                /* This request is marked for commit */
-                nfs_mark_request_dirty(req);
+                spin_unlock(req_lock);
-                set_page_writeback(page);
+                nfs_unlock_request(req);
+                return 1;
        }
+        if (nfs_set_page_writeback(page) == 0) {
+                nfs_list_remove_request(req);
+                /* add the request to the inode's dirty list. */
+                radix_tree_tag_set(&nfsi->nfs_page_tree,
+                                req->wb_index, NFS_PAGE_TAG_DIRTY);
+                nfs_list_add_request(req, &nfsi->dirty);
+                nfsi->ndirty++;
+                spin_unlock(req_lock);
+                __mark_inode_dirty(page->mapping->host, I_DIRTY_PAGES);
+        } else
+                spin_unlock(req_lock);
        ret = test_bit(PG_NEED_FLUSH, &req->wb_flags);
        nfs_unlock_request(req);
        return ret;
@@ -302,13 +349,8 @@ int nfs_writepage(struct page *page, struct writeback_control *wbc)
        return err; 
 }
-/*
- * Note: causes nfs_update_request() to block on the assumption
- *       that the writeback is generated due to memory pressure.
- */
 int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
 {
-        struct backing_dev_info *bdi = mapping->backing_dev_info;
        struct inode *inode = mapping->host;
        int err;
@@ -317,20 +359,12 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
        err = generic_writepages(mapping, wbc);
        if (err)
                return err;
-        while (test_and_set_bit(BDI_write_congested, &bdi->state) != 0) {
-                if (wbc->nonblocking)
-                        return 0;
-                nfs_wait_on_write_congestion(mapping, 0);
-        }
        err = nfs_flush_mapping(mapping, wbc, wb_priority(wbc));
        if (err < 0)
                goto out;
        nfs_add_stats(inode, NFSIOS_WRITEPAGES, err);
        err = 0;
 out:
-        clear_bit(BDI_write_congested, &bdi->state);
-        wake_up_all(&nfs_write_congestion);
-        congestion_end(WRITE);
        return err;
 }
@@ -354,13 +388,15 @@ static int nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
        }
        SetPagePrivate(req->wb_page);
        set_page_private(req->wb_page, (unsigned long)req);
+        if (PageDirty(req->wb_page))
+                set_bit(PG_NEED_FLUSH, &req->wb_flags);
        nfsi->npages++;
        atomic_inc(&req->wb_count);
        return 0;
 }
 /*
- * Insert a write request into an inode
+ * Remove a write request from an inode
 */
 static void nfs_inode_remove_request(struct nfs_page *req)
 {
@@ -373,6 +409,8 @@ static void nfs_inode_remove_request(struct nfs_page *req)
        set_page_private(req->wb_page, 0);
        ClearPagePrivate(req->wb_page);
        radix_tree_delete(&nfsi->nfs_page_tree, req->wb_index);
+        if (test_and_clear_bit(PG_NEED_FLUSH, &req->wb_flags))
+                __set_page_dirty_nobuffers(req->wb_page);
        nfsi->npages--;
        if (!nfsi->npages) {
                spin_unlock(&nfsi->req_lock);
@@ -384,28 +422,9 @@ static void nfs_inode_remove_request(struct nfs_page *req)
        nfs_release_request(req);
 }
-/*
- * Add a request to the inode's dirty list.
- */
-static void
-nfs_mark_request_dirty(struct nfs_page *req)
-{
-        struct inode *inode = req->wb_context->dentry->d_inode;
-        struct nfs_inode *nfsi = NFS_I(inode);
-        spin_lock(&nfsi->req_lock);
-        radix_tree_tag_set(&nfsi->nfs_page_tree,
-                        req->wb_index, NFS_PAGE_TAG_DIRTY);
-        nfs_list_add_request(req, &nfsi->dirty);
-        nfsi->ndirty++;
-        spin_unlock(&nfsi->req_lock);
-        __mark_inode_dirty(inode, I_DIRTY_PAGES);
-}
 static void
 nfs_redirty_request(struct nfs_page *req)
 {
-        clear_bit(PG_FLUSHING, &req->wb_flags);
        __set_page_dirty_nobuffers(req->wb_page);
 }
@@ -415,7 +434,11 @@ nfs_redirty_request(struct nfs_page *req)
 static inline int
 nfs_dirty_request(struct nfs_page *req)
 {
-        return test_bit(PG_FLUSHING, &req->wb_flags) == 0;
+        struct page *page = req->wb_page;
+        if (page == NULL || test_bit(PG_NEED_COMMIT, &req->wb_flags))
+                return 0;
+        return !PageWriteback(req->wb_page);
 }
 #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
@@ -431,10 +454,48 @@ nfs_mark_request_commit(struct nfs_page *req)
        spin_lock(&nfsi->req_lock);
        nfs_list_add_request(req, &nfsi->commit);
        nfsi->ncommit++;
+        set_bit(PG_NEED_COMMIT, &(req)->wb_flags);
        spin_unlock(&nfsi->req_lock);
        inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
        __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
 }
+static inline
+int nfs_write_need_commit(struct nfs_write_data *data)
+{
+        return data->verf.committed != NFS_FILE_SYNC;
+}
+static inline
+int nfs_reschedule_unstable_write(struct nfs_page *req)
+{
+        if (test_bit(PG_NEED_COMMIT, &req->wb_flags)) {
+                nfs_mark_request_commit(req);
+                return 1;
+        }
+        if (test_and_clear_bit(PG_NEED_RESCHED, &req->wb_flags)) {
+                nfs_redirty_request(req);
+                return 1;
+        }
+        return 0;
+}
+#else
+static inline void
+nfs_mark_request_commit(struct nfs_page *req)
+{
+}
+static inline
+int nfs_write_need_commit(struct nfs_write_data *data)
+{
+        return 0;
+}
+static inline
+int nfs_reschedule_unstable_write(struct nfs_page *req)
+{
+        return 0;
+}
 #endif
 /*
@@ -481,6 +542,7 @@ static void nfs_cancel_dirty_list(struct list_head *head)
        while(!list_empty(head)) {
                req = nfs_list_entry(head->next);
                nfs_list_remove_request(req);
+                nfs_end_page_writeback(req->wb_page);
                nfs_inode_remove_request(req);
                nfs_clear_page_writeback(req);
        }
@@ -494,6 +556,7 @@ static void nfs_cancel_commit_list(struct list_head *head)
                req = nfs_list_entry(head->next);
                dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
                nfs_list_remove_request(req);
+                clear_bit(PG_NEED_COMMIT, &(req)->wb_flags);
                nfs_inode_remove_request(req);
                nfs_unlock_request(req);
        }
@@ -531,10 +594,10 @@ static inline int nfs_scan_commit(struct inode *inode, struct list_head *dst, un
 }
 #endif
-static int nfs_wait_on_write_congestion(struct address_space *mapping, int intr)
+static int nfs_wait_on_write_congestion(struct address_space *mapping)
 {
+        struct inode *inode = mapping->host;
        struct backing_dev_info *bdi = mapping->backing_dev_info;
-        DEFINE_WAIT(wait);
        int ret = 0;
        might_sleep();
@@ -542,31 +605,23 @@ static int nfs_wait_on_write_congestion(struct address_space *mapping, int intr)
        if (!bdi_write_congested(bdi))
                return 0;
-        nfs_inc_stats(mapping->host, NFSIOS_CONGESTIONWAIT);
+        nfs_inc_stats(inode, NFSIOS_CONGESTIONWAIT);
-        if (intr) {
+        do {
-                struct rpc_clnt *clnt = NFS_CLIENT(mapping->host);
+                struct rpc_clnt *clnt = NFS_CLIENT(inode);
                sigset_t oldset;
                rpc_clnt_sigmask(clnt, &oldset);
-                prepare_to_wait(&nfs_write_congestion, &wait, TASK_INTERRUPTIBLE);
+                ret = congestion_wait_interruptible(WRITE, HZ/10);
-                if (bdi_write_congested(bdi)) {
-                        if (signalled())
-                                ret = -ERESTARTSYS;
-                        else
-                                schedule();
-                }
                rpc_clnt_sigunmask(clnt, &oldset);
-        } else {
+                if (ret == -ERESTARTSYS)
-                prepare_to_wait(&nfs_write_congestion, &wait, TASK_UNINTERRUPTIBLE);
+                        break;
-                if (bdi_write_congested(bdi))
+                ret = 0;
-                        schedule();
+        } while (bdi_write_congested(bdi));
-        }
-        finish_wait(&nfs_write_congestion, &wait);
        return ret;
 }
 /*
 * Try to update any existing write request, or create one if there is none.
 * In order to match, the request's credentials must match those of
@@ -577,14 +632,15 @@ static int nfs_wait_on_write_congestion(struct address_space *mapping, int intr)
 static struct nfs_page * nfs_update_request(struct nfs_open_context* ctx,
                struct page *page, unsigned int offset, unsigned int bytes)
 {
-        struct inode *inode = page->mapping->host;
+        struct address_space *mapping = page->mapping;
+        struct inode *inode = mapping->host;
        struct nfs_inode *nfsi = NFS_I(inode);
        struct nfs_page         *req, *new = NULL;
        unsigned long           rqend, end;
        end = offset + bytes;
-        if (nfs_wait_on_write_congestion(page->mapping, NFS_SERVER(inode)->flags & NFS_MOUNT_INTR))
+        if (nfs_wait_on_write_congestion(mapping))
                return ERR_PTR(-ERESTARTSYS);
        for (;;) {
                /* Loop over all inode entries and see if we find
@@ -727,26 +783,12 @@ int nfs_updatepage(struct file *file, struct page *page,
 static void nfs_writepage_release(struct nfs_page *req)
 {
-        end_page_writeback(req->wb_page);
-#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
+        if (PageError(req->wb_page) || !nfs_reschedule_unstable_write(req)) {
-        if (!PageError(req->wb_page)) {
+                nfs_end_page_writeback(req->wb_page);
-                if (NFS_NEED_RESCHED(req)) {
+                nfs_inode_remove_request(req);
-                        nfs_redirty_request(req);
+        } else
-                        goto out;
+                nfs_end_page_writeback(req->wb_page);
-                } else if (NFS_NEED_COMMIT(req)) {
-                        nfs_mark_request_commit(req);
-                        goto out;
-                }
-        }
-        nfs_inode_remove_request(req);
-out:
-        nfs_clear_commit(req);
-        nfs_clear_reschedule(req);
-#else
-        nfs_inode_remove_request(req);
-#endif
        nfs_clear_page_writeback(req);
 }
@@ -879,6 +921,7 @@ out_bad:
                nfs_writedata_release(data);
        }
        nfs_redirty_request(req);
+        nfs_end_page_writeback(req->wb_page);
        nfs_clear_page_writeback(req);
        return -ENOMEM;
 }
@@ -924,6 +967,7 @@ static int nfs_flush_one(struct inode *inode, struct list_head *head, int how)
                struct nfs_page *req = nfs_list_entry(head->next);
                nfs_list_remove_request(req);
                nfs_redirty_request(req);
+                nfs_end_page_writeback(req->wb_page);
                nfs_clear_page_writeback(req);
        }
        return -ENOMEM;
@@ -959,6 +1003,7 @@ out_err:
                req = nfs_list_entry(head->next);
                nfs_list_remove_request(req);
                nfs_redirty_request(req);
+                nfs_end_page_writeback(req->wb_page);
                nfs_clear_page_writeback(req);
        }
        return error;
@@ -986,22 +1031,28 @@ static void nfs_writeback_done_partial(struct rpc_task *task, void *calldata)
                nfs_set_pageerror(page);
                req->wb_context->error = task->tk_status;
                dprintk(", error = %d\n", task->tk_status);
-        } else {
+                goto out;
-#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
-                if (data->verf.committed < NFS_FILE_SYNC) {
-                        if (!NFS_NEED_COMMIT(req)) {
-                                nfs_defer_commit(req);
-                                memcpy(&req->wb_verf, &data->verf, sizeof(req->wb_verf));
-                                dprintk(" defer commit\n");
-                        } else if (memcmp(&req->wb_verf, &data->verf, sizeof(req->wb_verf))) {
-                                nfs_defer_reschedule(req);
-                                dprintk(" server reboot detected\n");
-                        }
-                } else
-#endif
-                        dprintk(" OK\n");
        }
+        if (nfs_write_need_commit(data)) {
+                spinlock_t *req_lock = &NFS_I(page->mapping->host)->req_lock;
+                spin_lock(req_lock);
+                if (test_bit(PG_NEED_RESCHED, &req->wb_flags)) {
+                        /* Do nothing: we need to resend the writes */
+                } else if (!test_and_set_bit(PG_NEED_COMMIT, &req->wb_flags)) {
+                        memcpy(&req->wb_verf, &data->verf, sizeof(req->wb_verf));
+                        dprintk(" defer commit\n");
+                } else if (memcmp(&req->wb_verf, &data->verf, sizeof(req->wb_verf))) {
+                        set_bit(PG_NEED_RESCHED, &req->wb_flags);
+                        clear_bit(PG_NEED_COMMIT, &req->wb_flags);
+                        dprintk(" server reboot detected\n");
+                }
+                spin_unlock(req_lock);
+        } else
+                dprintk(" OK\n");
+out:
        if (atomic_dec_and_test(&req->wb_complete))
                nfs_writepage_release(req);
 }
@@ -1042,25 +1093,21 @@ static void nfs_writeback_done_full(struct rpc_task *task, void *calldata)
                if (task->tk_status < 0) {
                        nfs_set_pageerror(page);
                        req->wb_context->error = task->tk_status;
-                        end_page_writeback(page);
-                        nfs_inode_remove_request(req);
                        dprintk(", error = %d\n", task->tk_status);
-                        goto next;
+                        goto remove_request;
                }
-                end_page_writeback(page);
-#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
+                if (nfs_write_need_commit(data)) {
-                if (data->args.stable != NFS_UNSTABLE || data->verf.committed == NFS_FILE_SYNC) {
+                        memcpy(&req->wb_verf, &data->verf, sizeof(req->wb_verf));
-                        nfs_inode_remove_request(req);
+                        nfs_mark_request_commit(req);
-                        dprintk(" OK\n");
+                        nfs_end_page_writeback(page);
+                        dprintk(" marked for commit\n");
                        goto next;
                }
-                memcpy(&req->wb_verf, &data->verf, sizeof(req->wb_verf));
+                dprintk(" OK\n");
-                nfs_mark_request_commit(req);
+remove_request:
-                dprintk(" marked for commit\n");
+                nfs_end_page_writeback(page);
-#else
                nfs_inode_remove_request(req);
-#endif
        next:
                nfs_clear_page_writeback(req);
        }
@@ -1248,6 +1295,7 @@ static void nfs_commit_done(struct rpc_task *task, void *calldata)
        while (!list_empty(&data->pages)) {
                req = nfs_list_entry(data->pages.next);
                nfs_list_remove_request(req);
+                clear_bit(PG_NEED_COMMIT, &(req)->wb_flags);
                dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
                dprintk("NFS: commit (%s/%Ld %d@%Ld)",
@@ -1483,15 +1531,22 @@ int nfs_wb_page(struct inode *inode, struct page* page)
 int nfs_set_page_dirty(struct page *page)
 {
+        spinlock_t *req_lock = &NFS_I(page->mapping->host)->req_lock;
        struct nfs_page *req;
+        int ret;
-        req = nfs_page_find_request(page);
+        spin_lock(req_lock);
+        req = nfs_page_find_request_locked(page);
        if (req != NULL) {
                /* Mark any existing write requests for flushing */
-                set_bit(PG_NEED_FLUSH, &req->wb_flags);
+                ret = !test_and_set_bit(PG_NEED_FLUSH, &req->wb_flags);
+                spin_unlock(req_lock);
                nfs_release_request(req);
+                return ret;
        }
-        return __set_page_dirty_nobuffers(page);
+        ret = __set_page_dirty_nobuffers(page);
+        spin_unlock(req_lock);
+        return ret;
 }
@@ -1514,6 +1569,26 @@ int __init nfs_init_writepagecache(void)
        if (nfs_commit_mempool == NULL)
                return -ENOMEM;
+        /*
+         * NFS congestion size, scale with available memory.
+         *
+         *  64MB:    8192k
+         * 128MB:   11585k
+         * 256MB:   16384k
+         * 512MB:   23170k
+         *   1GB:   32768k
+         *   2GB:   46340k
+         *   4GB:   65536k
+         *   8GB:   92681k
+         *  16GB:  131072k
+         *
+         * This allows larger machines to have larger/more transfers.
+         * Limit the default to 256M
+         */
+        nfs_congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
+        if (nfs_congestion_kb > 256*1024)
+                nfs_congestion_kb = 256*1024;
        return 0;
 }
diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c
index edde5dc5f796..b61742885011 100644
--- a/fs/nfsd/nfs2acl.c
+++ b/fs/nfsd/nfs2acl.c
@@ -287,13 +287,20 @@ static int nfsaclsvc_release_getacl(struct svc_rqst *rqstp, __be32 *p,
        return 1;
 }
-static int nfsaclsvc_release_fhandle(struct svc_rqst *rqstp, __be32 *p,
+static int nfsaclsvc_release_attrstat(struct svc_rqst *rqstp, __be32 *p,
-                struct nfsd_fhandle *resp)
+                struct nfsd_attrstat *resp)
 {
        fh_put(&resp->fh);
        return 1;
 }
+static int nfsaclsvc_release_access(struct svc_rqst *rqstp, __be32 *p,
+               struct nfsd3_accessres *resp)
+{
+       fh_put(&resp->fh);
+       return 1;
+}
 #define nfsaclsvc_decode_voidargs       NULL
 #define nfsaclsvc_encode_voidres        NULL
 #define nfsaclsvc_release_void          NULL
@@ -322,9 +329,9 @@ struct nfsd3_voidargs { int dummy; };
 static struct svc_procedure             nfsd_acl_procedures2[] = {
  PROC(null,    void,           void,           void,     RC_NOCACHE, ST),
  PROC(getacl,  getacl,         getacl,         getacl,   RC_NOCACHE, ST+1+2*(1+ACL)),
-  PROC(setacl,  setacl,         attrstat,       fhandle,  RC_NOCACHE, ST+AT),
+  PROC(setacl,  setacl,         attrstat,       attrstat, RC_NOCACHE, ST+AT),
-  PROC(getattr, fhandle,        attrstat,       fhandle,  RC_NOCACHE, ST+AT),
+  PROC(getattr, fhandle,        attrstat,       attrstat, RC_NOCACHE, ST+AT),
-  PROC(access,  access,         access,         fhandle,  RC_NOCACHE, ST+AT+1),
+  PROC(access,  access,         access,         access,   RC_NOCACHE, ST+AT+1),
 };
 struct svc_version      nfsd_acl_version2 = {
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 6f677988c71d..7e4bb0af24d7 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -859,8 +859,8 @@ compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp,
 #define NFS3_ENTRY_BAGGAGE      (2 + 1 + 2 + 1)
 #define NFS3_ENTRYPLUS_BAGGAGE  (1 + 21 + 1 + (NFS3_FHSIZE >> 2))
 static int
-encode_entry(struct readdir_cd *ccd, const char *name,
+encode_entry(struct readdir_cd *ccd, const char *name, int namlen,
-             int namlen, off_t offset, ino_t ino, unsigned int d_type, int plus)
+             loff_t offset, ino_t ino, unsigned int d_type, int plus)
 {
        struct nfsd3_readdirres *cd = container_of(ccd, struct nfsd3_readdirres,
                                                        common);
@@ -880,7 +880,7 @@ encode_entry(struct readdir_cd *ccd, const char *name,
                        *cd->offset1 = htonl(offset64 & 0xffffffff);
                        cd->offset1 = NULL;
                } else {
-                        xdr_encode_hyper(cd->offset, (u64) offset);
+                        xdr_encode_hyper(cd->offset, offset64);
                }
        }
diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c
index 832673b14587..673a53c014a3 100644
--- a/fs/nfsd/nfs4acl.c
+++ b/fs/nfsd/nfs4acl.c
@@ -228,7 +228,7 @@ _posix_to_nfsv4_one(struct posix_acl *pacl, struct nfs4_acl *acl,
        struct posix_acl_summary pas;
        unsigned short deny;
        int eflag = ((flags & NFS4_ACL_TYPE_DEFAULT) ?
-                                        NFS4_INHERITANCE_FLAGS : 0);
+                NFS4_INHERITANCE_FLAGS | NFS4_ACE_INHERIT_ONLY_ACE : 0);
        BUG_ON(pacl->a_count < 3);
        summarize_posix_acl(pacl, &pas);
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 9e4067999209..af360705e551 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -750,9 +750,8 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
                status = nfserr_clid_inuse;
                if (!cmp_creds(&conf->cl_cred, &rqstp->rq_cred)
                                || conf->cl_addr != sin->sin_addr.s_addr) {
-                        printk("NFSD: setclientid: string in use by client"
+                        dprintk("NFSD: setclientid: string in use by client"
-                        "(clientid %08x/%08x)\n",
+                                "at %u.%u.%u.%u\n", NIPQUAD(conf->cl_addr));
-                        conf->cl_clientid.cl_boot, conf->cl_clientid.cl_id);
                        goto out;
                }
        }
@@ -3261,7 +3260,6 @@ __nfs4_state_shutdown(void)
                unhash_delegation(dp);
        }
-        cancel_delayed_work(&laundromat_work);
        nfsd4_shutdown_recdir();
        nfs4_init = 0;
 }
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index c2660cbfcd96..8d995bcef806 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -17,7 +17,6 @@
 #include <linux/stat.h>
 #include <linux/dcache.h>
 #include <linux/mount.h>
-#include <asm/pgtable.h>
 #include <linux/sunrpc/clnt.h>
 #include <linux/sunrpc/svc.h>
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index f27e5378caf2..a0c8667caa72 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -27,6 +27,7 @@
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/highmem.h>
+#include <linux/swap.h>
 #define MLOG_MASK_PREFIX ML_DISK_ALLOC
 #include <cluster/masklog.h>
@@ -34,6 +35,7 @@
 #include "ocfs2.h"
 #include "alloc.h"
+#include "aops.h"
 #include "dlmglue.h"
 #include "extent_map.h"
 #include "inode.h"
@@ -47,63 +49,243 @@
 #include "buffer_head_io.h"
-static int ocfs2_extent_contig(struct inode *inode,
+static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
-                               struct ocfs2_extent_rec *ext,
-                               u64 blkno);
-static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
+/*
-                                     handle_t *handle,
+ * Structures which describe a path through a btree, and functions to
-                                     struct inode *inode,
+ * manipulate them.
-                                     int wanted,
+ *
-                                     struct ocfs2_alloc_context *meta_ac,
+ * The idea here is to be as generic as possible with the tree
-                                     struct buffer_head *bhs[]);
+ * manipulation code.
+ */
+struct ocfs2_path_item {
+        struct buffer_head              *bh;
+        struct ocfs2_extent_list        *el;
+};
-static int ocfs2_add_branch(struct ocfs2_super *osb,
+#define OCFS2_MAX_PATH_DEPTH    5
-                            handle_t *handle,
-                            struct inode *inode,
-                            struct buffer_head *fe_bh,
-                            struct buffer_head *eb_bh,
-                            struct buffer_head *last_eb_bh,
-                            struct ocfs2_alloc_context *meta_ac);
-static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
+struct ocfs2_path {
-                                  handle_t *handle,
+        int                     p_tree_depth;
-                                  struct inode *inode,
+        struct ocfs2_path_item  p_node[OCFS2_MAX_PATH_DEPTH];
-                                  struct buffer_head *fe_bh,
+};
-                                  struct ocfs2_alloc_context *meta_ac,
-                                  struct buffer_head **ret_new_eb_bh);
-static int ocfs2_do_insert_extent(struct ocfs2_super *osb,
+#define path_root_bh(_path) ((_path)->p_node[0].bh)
-                                  handle_t *handle,
+#define path_root_el(_path) ((_path)->p_node[0].el)
-                                  struct inode *inode,
+#define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh)
-                                  struct buffer_head *fe_bh,
+#define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el)
-                                  u64 blkno,
+#define path_num_items(_path) ((_path)->p_tree_depth + 1)
-                                  u32 new_clusters);
-static int ocfs2_find_branch_target(struct ocfs2_super *osb,
+/*
-                                    struct inode *inode,
+ * Reset the actual path elements so that we can re-use the structure
-                                    struct buffer_head *fe_bh,
+ * to build another path. Generally, this involves freeing the buffer
-                                    struct buffer_head **target_bh);
+ * heads.
+ */
+static void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root)
+{
+        int i, start = 0, depth = 0;
+        struct ocfs2_path_item *node;
-static int ocfs2_find_new_last_ext_blk(struct ocfs2_super *osb,
+        if (keep_root)
-                                       struct inode *inode,
+                start = 1;
-                                       struct ocfs2_dinode *fe,
-                                       unsigned int new_i_clusters,
+        for(i = start; i < path_num_items(path); i++) {
-                                       struct buffer_head *old_last_eb,
+                node = &path->p_node[i];
-                                       struct buffer_head **new_last_eb);
+                brelse(node->bh);
+                node->bh = NULL;
+                node->el = NULL;
+        }
+        /*
+         * Tree depth may change during truncate, or insert. If we're
+         * keeping the root extent list, then make sure that our path
+         * structure reflects the proper depth.
+         */
+        if (keep_root)
+                depth = le16_to_cpu(path_root_el(path)->l_tree_depth);
+        path->p_tree_depth = depth;
+}
+static void ocfs2_free_path(struct ocfs2_path *path)
+{
+        if (path) {
+                ocfs2_reinit_path(path, 0);
+                kfree(path);
+        }
+}
+/*
+ * Make the *dest path the same as src and re-initialize src path to
+ * have a root only.
+ */
+static void ocfs2_mv_path(struct ocfs2_path *dest, struct ocfs2_path *src)
+{
+        int i;
+        BUG_ON(path_root_bh(dest) != path_root_bh(src));
+        for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
+                brelse(dest->p_node[i].bh);
+                dest->p_node[i].bh = src->p_node[i].bh;
+                dest->p_node[i].el = src->p_node[i].el;
+                src->p_node[i].bh = NULL;
+                src->p_node[i].el = NULL;
+        }
+}
+/*
+ * Insert an extent block at given index.
+ *
+ * This will not take an additional reference on eb_bh.
+ */
+static inline void ocfs2_path_insert_eb(struct ocfs2_path *path, int index,
+                                        struct buffer_head *eb_bh)
+{
+        struct ocfs2_extent_block *eb = (struct ocfs2_extent_block *)eb_bh->b_data;
+        /*
+         * Right now, no root bh is an extent block, so this helps
+         * catch code errors with dinode trees. The assertion can be
+         * safely removed if we ever need to insert extent block
+         * structures at the root.
+         */
+        BUG_ON(index == 0);
+        path->p_node[index].bh = eb_bh;
+        path->p_node[index].el = &eb->h_list;
+}
+static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh,
+                                         struct ocfs2_extent_list *root_el)
+{
+        struct ocfs2_path *path;
+        BUG_ON(le16_to_cpu(root_el->l_tree_depth) >= OCFS2_MAX_PATH_DEPTH);
+        path = kzalloc(sizeof(*path), GFP_NOFS);
+        if (path) {
+                path->p_tree_depth = le16_to_cpu(root_el->l_tree_depth);
+                get_bh(root_bh);
+                path_root_bh(path) = root_bh;
+                path_root_el(path) = root_el;
+        }
+        return path;
+}
+/*
+ * Allocate and initialize a new path based on a disk inode tree.
+ */
+static struct ocfs2_path *ocfs2_new_inode_path(struct buffer_head *di_bh)
+{
+        struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+        struct ocfs2_extent_list *el = &di->id2.i_list;
+        return ocfs2_new_path(di_bh, el);
+}
+/*
+ * Convenience function to journal all components in a path.
+ */
+static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle,
+                                     struct ocfs2_path *path)
+{
+        int i, ret = 0;
+        if (!path)
+                goto out;
+        for(i = 0; i < path_num_items(path); i++) {
+                ret = ocfs2_journal_access(handle, inode, path->p_node[i].bh,
+                                           OCFS2_JOURNAL_ACCESS_WRITE);
+                if (ret < 0) {
+                        mlog_errno(ret);
+                        goto out;
+                }
+        }
+out:
+        return ret;
+}
+enum ocfs2_contig_type {
+        CONTIG_NONE = 0,
+        CONTIG_LEFT,
+        CONTIG_RIGHT
+};
-static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
-static int ocfs2_extent_contig(struct inode *inode,
+/*
-                               struct ocfs2_extent_rec *ext,
+ * NOTE: ocfs2_block_extent_contig(), ocfs2_extents_adjacent() and
-                               u64 blkno)
+ * ocfs2_extent_contig() only work properly against leaf nodes!
+ */
+static int ocfs2_block_extent_contig(struct super_block *sb,
+                                     struct ocfs2_extent_rec *ext,
+                                     u64 blkno)
+{
+        u64 blk_end = le64_to_cpu(ext->e_blkno);
+        blk_end += ocfs2_clusters_to_blocks(sb,
+                                    le16_to_cpu(ext->e_leaf_clusters));
+        return blkno == blk_end;
+}
+static int ocfs2_extents_adjacent(struct ocfs2_extent_rec *left,
+                                  struct ocfs2_extent_rec *right)
+{
+        u32 left_range;
+        left_range = le32_to_cpu(left->e_cpos) +
+                le16_to_cpu(left->e_leaf_clusters);
+        return (left_range == le32_to_cpu(right->e_cpos));
+}
+static enum ocfs2_contig_type
+        ocfs2_extent_contig(struct inode *inode,
+                            struct ocfs2_extent_rec *ext,
+                            struct ocfs2_extent_rec *insert_rec)
 {
-        return blkno == (le64_to_cpu(ext->e_blkno) +
+        u64 blkno = le64_to_cpu(insert_rec->e_blkno);
-                         ocfs2_clusters_to_blocks(inode->i_sb,
-                                                  le32_to_cpu(ext->e_clusters)));
+        if (ocfs2_extents_adjacent(ext, insert_rec) &&
+            ocfs2_block_extent_contig(inode->i_sb, ext, blkno))
+                        return CONTIG_RIGHT;
+        blkno = le64_to_cpu(ext->e_blkno);
+        if (ocfs2_extents_adjacent(insert_rec, ext) &&
+            ocfs2_block_extent_contig(inode->i_sb, insert_rec, blkno))
+                return CONTIG_LEFT;
+        return CONTIG_NONE;
 }
 /*
+ * NOTE: We can have pretty much any combination of contiguousness and
+ * appending.
+ *
+ * The usefulness of APPEND_TAIL is more in that it lets us know that
+ * we'll have to update the path to that leaf.
+ */
+enum ocfs2_append_type {
+        APPEND_NONE = 0,
+        APPEND_TAIL,
+};
+struct ocfs2_insert_type {
+        enum ocfs2_append_type  ins_appending;
+        enum ocfs2_contig_type  ins_contig;
+        int                     ins_contig_index;
+        int                     ins_free_records;
+        int                     ins_tree_depth;
+};
+/*
 * How many free extents have we got before we need more meta data?
 */
 int ocfs2_num_free_extents(struct ocfs2_super *osb,
@@ -242,6 +424,28 @@ bail:
 }
 /*
+ * Helper function for ocfs2_add_branch() and ocfs2_shift_tree_depth().
+ *
+ * Returns the sum of the rightmost extent rec logical offset and
+ * cluster count.
+ *
+ * ocfs2_add_branch() uses this to determine what logical cluster
+ * value should be populated into the leftmost new branch records.
+ *
+ * ocfs2_shift_tree_depth() uses this to determine the # clusters
+ * value for the new topmost tree record.
+ */
+static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list  *el)
+{
+        int i;
+        i = le16_to_cpu(el->l_next_free_rec) - 1;
+        return le32_to_cpu(el->l_recs[i].e_cpos) +
+                ocfs2_rec_clusters(el, &el->l_recs[i]);
+}
+/*
 * Add an entire tree branch to our inode. eb_bh is the extent block
 * to start at, if we don't want to start the branch at the dinode
 * structure.
@@ -250,7 +454,7 @@ bail:
 * for the new last extent block.
 *
 * the new branch will be 'empty' in the sense that every block will
- * contain a single record with e_clusters == 0.
+ * contain a single record with cluster count == 0.
 */
 static int ocfs2_add_branch(struct ocfs2_super *osb,
                            handle_t *handle,
@@ -268,6 +472,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
        struct ocfs2_extent_block *eb;
        struct ocfs2_extent_list  *eb_el;
        struct ocfs2_extent_list  *el;
+        u32 new_cpos;
        mlog_entry_void();
@@ -302,6 +507,9 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
                goto bail;
        }
+        eb = (struct ocfs2_extent_block *)last_eb_bh->b_data;
+        new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list);
        /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
         * linked with the rest of the tree.
         * conversly, new_eb_bhs[0] is the new bottommost leaf.
@@ -330,9 +538,18 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
                eb->h_next_leaf_blk = 0;
                eb_el->l_tree_depth = cpu_to_le16(i);
                eb_el->l_next_free_rec = cpu_to_le16(1);
-                eb_el->l_recs[0].e_cpos = fe->i_clusters;
+                /*
+                 * This actually counts as an empty extent as its
+                 * cluster count == 0
+                 */
+                eb_el->l_recs[0].e_cpos = cpu_to_le32(new_cpos);
                eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno);
-                eb_el->l_recs[0].e_clusters = cpu_to_le32(0);
+                /*
+                 * eb_el isn't always an interior node, but even leaf
+                 * nodes want a zero'd flags and reserved field so
+                 * this gets the whole 32 bits regardless of use.
+                 */
+                eb_el->l_recs[0].e_int_clusters = cpu_to_le32(0);
                if (!eb_el->l_tree_depth)
                        new_last_eb_blk = le64_to_cpu(eb->h_blkno);
@@ -376,8 +593,8 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
         * either be on the fe, or the extent block passed in. */
        i = le16_to_cpu(el->l_next_free_rec);
        el->l_recs[i].e_blkno = cpu_to_le64(next_blkno);
-        el->l_recs[i].e_cpos = fe->i_clusters;
+        el->l_recs[i].e_cpos = cpu_to_le32(new_cpos);
-        el->l_recs[i].e_clusters = 0;
+        el->l_recs[i].e_int_clusters = 0;
        le16_add_cpu(&el->l_next_free_rec, 1);
        /* fe needs a new last extent block pointer, as does the
@@ -425,6 +642,7 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
                                  struct buffer_head **ret_new_eb_bh)
 {
        int status, i;
+        u32 new_clusters;
        struct buffer_head *new_eb_bh = NULL;
        struct ocfs2_dinode *fe;
        struct ocfs2_extent_block *eb;
@@ -461,11 +679,8 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
        /* copy the fe data into the new extent block */
        eb_el->l_tree_depth = fe_el->l_tree_depth;
        eb_el->l_next_free_rec = fe_el->l_next_free_rec;
-        for(i = 0; i < le16_to_cpu(fe_el->l_next_free_rec); i++) {
+        for(i = 0; i < le16_to_cpu(fe_el->l_next_free_rec); i++)
-                eb_el->l_recs[i].e_cpos = fe_el->l_recs[i].e_cpos;
+                eb_el->l_recs[i] = fe_el->l_recs[i];
-                eb_el->l_recs[i].e_clusters = fe_el->l_recs[i].e_clusters;
-                eb_el->l_recs[i].e_blkno = fe_el->l_recs[i].e_blkno;
-        }
        status = ocfs2_journal_dirty(handle, new_eb_bh);
        if (status < 0) {
@@ -480,16 +695,15 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
                goto bail;
        }
+        new_clusters = ocfs2_sum_rightmost_rec(eb_el);
        /* update fe now */
        le16_add_cpu(&fe_el->l_tree_depth, 1);
        fe_el->l_recs[0].e_cpos = 0;
        fe_el->l_recs[0].e_blkno = eb->h_blkno;
-        fe_el->l_recs[0].e_clusters = fe->i_clusters;
+        fe_el->l_recs[0].e_int_clusters = cpu_to_le32(new_clusters);
-        for(i = 1; i < le16_to_cpu(fe_el->l_next_free_rec); i++) {
+        for(i = 1; i < le16_to_cpu(fe_el->l_next_free_rec); i++)
-                fe_el->l_recs[i].e_cpos = 0;
+                memset(&fe_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
-                fe_el->l_recs[i].e_clusters = 0;
-                fe_el->l_recs[i].e_blkno = 0;
-        }
        fe_el->l_next_free_rec = cpu_to_le16(1);
        /* If this is our 1st tree depth shift, then last_eb_blk
@@ -515,199 +729,6 @@ bail:
 }
 /*
- * Expects the tree to already have room in the rightmost leaf for the
- * extent.  Updates all the extent blocks (and the dinode) on the way
- * down.
- */
-static int ocfs2_do_insert_extent(struct ocfs2_super *osb,
-                                  handle_t *handle,
-                                  struct inode *inode,
-                                  struct buffer_head *fe_bh,
-                                  u64 start_blk,
-                                  u32 new_clusters)
-{
-        int status, i, num_bhs = 0;
-        u64 next_blkno;
-        u16 next_free;
-        struct buffer_head **eb_bhs = NULL;
-        struct ocfs2_dinode *fe;
-        struct ocfs2_extent_block *eb;
-        struct ocfs2_extent_list  *el;
-        mlog_entry_void();
-        status = ocfs2_journal_access(handle, inode, fe_bh,
-                                      OCFS2_JOURNAL_ACCESS_WRITE);
-        if (status < 0) {
-                mlog_errno(status);
-                goto bail;
-        }
-        fe = (struct ocfs2_dinode *) fe_bh->b_data;
-        el = &fe->id2.i_list;
-        if (el->l_tree_depth) {
-                /* This is another operation where we want to be
-                 * careful about our tree updates. An error here means
-                 * none of the previous changes we made should roll
-                 * forward. As a result, we have to record the buffers
-                 * for this part of the tree in an array and reserve a
-                 * journal write to them before making any changes. */
-                num_bhs = le16_to_cpu(fe->id2.i_list.l_tree_depth);
-                eb_bhs = kcalloc(num_bhs, sizeof(struct buffer_head *),
-                                 GFP_KERNEL);
-                if (!eb_bhs) {
-                        status = -ENOMEM;
-                        mlog_errno(status);
-                        goto bail;
-                }
-                i = 0;
-                while(el->l_tree_depth) {
-                        next_free = le16_to_cpu(el->l_next_free_rec);
-                        if (next_free == 0) {
-                                ocfs2_error(inode->i_sb,
-                                            "Dinode %llu has a bad extent list",
-                                            (unsigned long long)OCFS2_I(inode)->ip_blkno);
-                                status = -EIO;
-                                goto bail;
-                        }
-                        next_blkno = le64_to_cpu(el->l_recs[next_free - 1].e_blkno);
-                        BUG_ON(i >= num_bhs);
-                        status = ocfs2_read_block(osb, next_blkno, &eb_bhs[i],
-                                                  OCFS2_BH_CACHED, inode);
-                        if (status < 0) {
-                                mlog_errno(status);
-                                goto bail;
-                        }
-                        eb = (struct ocfs2_extent_block *) eb_bhs[i]->b_data;
-                        if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
-                                OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb,
-                                                                 eb);
-                                status = -EIO;
-                                goto bail;
-                        }
-                        status = ocfs2_journal_access(handle, inode, eb_bhs[i],
-                                                      OCFS2_JOURNAL_ACCESS_WRITE);
-                        if (status < 0) {
-                                mlog_errno(status);
-                                goto bail;
-                        }
-                        el = &eb->h_list;
-                        i++;
-                        /* When we leave this loop, eb_bhs[num_bhs - 1] will
-                         * hold the bottom-most leaf extent block. */
-                }
-                BUG_ON(el->l_tree_depth);
-                el = &fe->id2.i_list;
-                /* If we have tree depth, then the fe update is
-                 * trivial, and we want to switch el out for the
-                 * bottom-most leaf in order to update it with the
-                 * actual extent data below. */
-                next_free = le16_to_cpu(el->l_next_free_rec);
-                if (next_free == 0) {
-                        ocfs2_error(inode->i_sb,
-                                    "Dinode %llu has a bad extent list",
-                                    (unsigned long long)OCFS2_I(inode)->ip_blkno);
-                        status = -EIO;
-                        goto bail;
-                }
-                le32_add_cpu(&el->l_recs[next_free - 1].e_clusters,
-                             new_clusters);
-                /* (num_bhs - 1) to avoid the leaf */
-                for(i = 0; i < (num_bhs - 1); i++) {
-                        eb = (struct ocfs2_extent_block *) eb_bhs[i]->b_data;
-                        el = &eb->h_list;
-                        /* finally, make our actual change to the
-                         * intermediate extent blocks. */
-                        next_free = le16_to_cpu(el->l_next_free_rec);
-                        le32_add_cpu(&el->l_recs[next_free - 1].e_clusters,
-                                     new_clusters);
-                        status = ocfs2_journal_dirty(handle, eb_bhs[i]);
-                        if (status < 0)
-                                mlog_errno(status);
-                }
-                BUG_ON(i != (num_bhs - 1));
-                /* note that the leaf block wasn't touched in
-                 * the loop above */
-                eb = (struct ocfs2_extent_block *) eb_bhs[num_bhs - 1]->b_data;
-                el = &eb->h_list;
-                BUG_ON(el->l_tree_depth);
-        }
-        /* yay, we can finally add the actual extent now! */
-        i = le16_to_cpu(el->l_next_free_rec) - 1;
-        if (le16_to_cpu(el->l_next_free_rec) &&
-            ocfs2_extent_contig(inode, &el->l_recs[i], start_blk)) {
-                le32_add_cpu(&el->l_recs[i].e_clusters, new_clusters);
-        } else if (le16_to_cpu(el->l_next_free_rec) &&
-                   (le32_to_cpu(el->l_recs[i].e_clusters) == 0)) {
-                /* having an empty extent at eof is legal. */
-                if (el->l_recs[i].e_cpos != fe->i_clusters) {
-                        ocfs2_error(inode->i_sb,
-                                    "Dinode %llu trailing extent is bad: "
-                                    "cpos (%u) != number of clusters (%u)",
-                                    (unsigned long long)OCFS2_I(inode)->ip_blkno,
-                                    le32_to_cpu(el->l_recs[i].e_cpos),
-                                    le32_to_cpu(fe->i_clusters));
-                        status = -EIO;
-                        goto bail;
-                }
-                el->l_recs[i].e_blkno = cpu_to_le64(start_blk);
-                el->l_recs[i].e_clusters = cpu_to_le32(new_clusters);
-        } else {
-                /* No contiguous record, or no empty record at eof, so
-                 * we add a new one. */
-                BUG_ON(le16_to_cpu(el->l_next_free_rec) >=
-                       le16_to_cpu(el->l_count));
-                i = le16_to_cpu(el->l_next_free_rec);
-                el->l_recs[i].e_blkno = cpu_to_le64(start_blk);
-                el->l_recs[i].e_clusters = cpu_to_le32(new_clusters);
-                el->l_recs[i].e_cpos = fe->i_clusters;
-                le16_add_cpu(&el->l_next_free_rec, 1);
-        }
-        /*
-         * extent_map errors are not fatal, so they are ignored outside
-         * of flushing the thing.
-         */
-        status = ocfs2_extent_map_append(inode, &el->l_recs[i],
-                                         new_clusters);
-        if (status) {
-                mlog_errno(status);
-                ocfs2_extent_map_drop(inode, le32_to_cpu(fe->i_clusters));
-        }
-        status = ocfs2_journal_dirty(handle, fe_bh);
-        if (status < 0)
-                mlog_errno(status);
-        if (fe->id2.i_list.l_tree_depth) {
-                status = ocfs2_journal_dirty(handle, eb_bhs[num_bhs - 1]);
-                if (status < 0)
-                        mlog_errno(status);
-        }
-        status = 0;
-bail:
-        if (eb_bhs) {
-                for (i = 0; i < num_bhs; i++)
-                        if (eb_bhs[i])
-                                brelse(eb_bhs[i]);
-                kfree(eb_bhs);
-        }
-        mlog_exit(status);
-        return status;
-}
-/*
 * Should only be called when there is no space left in any of the
 * leaf nodes. What we want to do is find the lowest tree depth
 * non-leaf extent block with room for new records. There are three
@@ -807,53 +828,1548 @@ bail:
        return status;
 }
-/* the caller needs to update fe->i_clusters */
+/*
-int ocfs2_insert_extent(struct ocfs2_super *osb,
+ * This is only valid for leaf nodes, which are the only ones that can
-                        handle_t *handle,
+ * have empty extents anyway.
-                        struct inode *inode,
+ */
-                        struct buffer_head *fe_bh,
+static inline int ocfs2_is_empty_extent(struct ocfs2_extent_rec *rec)
-                        u64 start_blk,
-                        u32 new_clusters,
-                        struct ocfs2_alloc_context *meta_ac)
 {
-        int status, i, shift;
+        return !rec->e_leaf_clusters;
-        struct buffer_head *last_eb_bh = NULL;
+}
+/*
+ * This function will discard the rightmost extent record.
+ */
+static void ocfs2_shift_records_right(struct ocfs2_extent_list *el)
+{
+        int next_free = le16_to_cpu(el->l_next_free_rec);
+        int count = le16_to_cpu(el->l_count);
+        unsigned int num_bytes;
+        BUG_ON(!next_free);
+        /* This will cause us to go off the end of our extent list. */
+        BUG_ON(next_free >= count);
+        num_bytes = sizeof(struct ocfs2_extent_rec) * next_free;
+        memmove(&el->l_recs[1], &el->l_recs[0], num_bytes);
+}
+static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el,
+                              struct ocfs2_extent_rec *insert_rec)
+{
+        int i, insert_index, next_free, has_empty, num_bytes;
+        u32 insert_cpos = le32_to_cpu(insert_rec->e_cpos);
+        struct ocfs2_extent_rec *rec;
+        next_free = le16_to_cpu(el->l_next_free_rec);
+        has_empty = ocfs2_is_empty_extent(&el->l_recs[0]);
+        BUG_ON(!next_free);
+        /* The tree code before us didn't allow enough room in the leaf. */
+        if (el->l_next_free_rec == el->l_count && !has_empty)
+                BUG();
+        /*
+         * The easiest way to approach this is to just remove the
+         * empty extent and temporarily decrement next_free.
+         */
+        if (has_empty) {
+                /*
+                 * If next_free was 1 (only an empty extent), this
+                 * loop won't execute, which is fine. We still want
+                 * the decrement below to happen.
+                 */
+                for(i = 0; i < (next_free - 1); i++)
+                        el->l_recs[i] = el->l_recs[i+1];
+                next_free--;
+        }
+        /*
+         * Figure out what the new record index should be.
+         */
+        for(i = 0; i < next_free; i++) {
+                rec = &el->l_recs[i];
+                if (insert_cpos < le32_to_cpu(rec->e_cpos))
+                        break;
+        }
+        insert_index = i;
+        mlog(0, "ins %u: index %d, has_empty %d, next_free %d, count %d\n",
+             insert_cpos, insert_index, has_empty, next_free, le16_to_cpu(el->l_count));
+        BUG_ON(insert_index < 0);
+        BUG_ON(insert_index >= le16_to_cpu(el->l_count));
+        BUG_ON(insert_index > next_free);
+        /*
+         * No need to memmove if we're just adding to the tail.
+         */
+        if (insert_index != next_free) {
+                BUG_ON(next_free >= le16_to_cpu(el->l_count));
+                num_bytes = next_free - insert_index;
+                num_bytes *= sizeof(struct ocfs2_extent_rec);
+                memmove(&el->l_recs[insert_index + 1],
+                        &el->l_recs[insert_index],
+                        num_bytes);
+        }
+        /*
+         * Either we had an empty extent, and need to re-increment or
+         * there was no empty extent on a non full rightmost leaf node,
+         * in which case we still need to increment.
+         */
+        next_free++;
+        el->l_next_free_rec = cpu_to_le16(next_free);
+        /*
+         * Make sure none of the math above just messed up our tree.
+         */
+        BUG_ON(le16_to_cpu(el->l_next_free_rec) > le16_to_cpu(el->l_count));
+        el->l_recs[insert_index] = *insert_rec;
+}
+/*
+ * Create an empty extent record.
+ *
+ * l_next_free_rec may be updated.
+ *
+ * If an empty extent already exists do nothing.
+ */
+static void ocfs2_create_empty_extent(struct ocfs2_extent_list *el)
+{
+        int next_free = le16_to_cpu(el->l_next_free_rec);
+        BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
+        if (next_free == 0)
+                goto set_and_inc;
+        if (ocfs2_is_empty_extent(&el->l_recs[0]))
+                return;
+        mlog_bug_on_msg(el->l_count == el->l_next_free_rec,
+                        "Asked to create an empty extent in a full list:\n"
+                        "count = %u, tree depth = %u",
+                        le16_to_cpu(el->l_count),
+                        le16_to_cpu(el->l_tree_depth));
+        ocfs2_shift_records_right(el);
+set_and_inc:
+        le16_add_cpu(&el->l_next_free_rec, 1);
+        memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
+}
+/*
+ * For a rotation which involves two leaf nodes, the "root node" is
+ * the lowest level tree node which contains a path to both leaves. This
+ * resulting set of information can be used to form a complete "subtree".
+ *
+ * This function is passed two full paths from the dinode down to a
+ * pair of adjacent leaves. Its task is to figure out which path
+ * index contains the subtree root - this can be the root index itself
+ * in a worst-case rotation.
+ *
+ * The array index of the subtree root is passed back.
+ */
+static int ocfs2_find_subtree_root(struct inode *inode,
+                                   struct ocfs2_path *left,
+                                   struct ocfs2_path *right)
+{
+        int i = 0;
+        /*
+         * Check that the caller passed in two paths from the same tree.
+         */
+        BUG_ON(path_root_bh(left) != path_root_bh(right));
+        do {
+                i++;
+                /*
+                 * The caller didn't pass two adjacent paths.
+                 */
+                mlog_bug_on_msg(i > left->p_tree_depth,
+                                "Inode %lu, left depth %u, right depth %u\n"
+                                "left leaf blk %llu, right leaf blk %llu\n",
+                                inode->i_ino, left->p_tree_depth,
+                                right->p_tree_depth,
+                                (unsigned long long)path_leaf_bh(left)->b_blocknr,
+                                (unsigned long long)path_leaf_bh(right)->b_blocknr);
+        } while (left->p_node[i].bh->b_blocknr ==
+                 right->p_node[i].bh->b_blocknr);
+        return i - 1;
+}
+typedef void (path_insert_t)(void *, struct buffer_head *);
+/*
+ * Traverse a btree path in search of cpos, starting at root_el.
+ *
+ * This code can be called with a cpos larger than the tree, in which
+ * case it will return the rightmost path.
+ */
+static int __ocfs2_find_path(struct inode *inode,
+                             struct ocfs2_extent_list *root_el, u32 cpos,
+                             path_insert_t *func, void *data)
+{
+        int i, ret = 0;
+        u32 range;
+        u64 blkno;
        struct buffer_head *bh = NULL;
-        struct ocfs2_dinode *fe;
        struct ocfs2_extent_block *eb;
-        struct ocfs2_extent_list  *el;
+        struct ocfs2_extent_list *el;
+        struct ocfs2_extent_rec *rec;
+        struct ocfs2_inode_info *oi = OCFS2_I(inode);
-        mlog_entry_void();
+        el = root_el;
+        while (el->l_tree_depth) {
+                if (le16_to_cpu(el->l_next_free_rec) == 0) {
+                        ocfs2_error(inode->i_sb,
+                                    "Inode %llu has empty extent list at "
+                                    "depth %u\n",
+                                    (unsigned long long)oi->ip_blkno,
+                                    le16_to_cpu(el->l_tree_depth));
+                        ret = -EROFS;
+                        goto out;
-        mlog(0, "add %u clusters starting at block %llu to inode %llu\n",
+                }
-             new_clusters, (unsigned long long)start_blk,
-             (unsigned long long)OCFS2_I(inode)->ip_blkno);
-        fe = (struct ocfs2_dinode *) fe_bh->b_data;
+                for(i = 0; i < le16_to_cpu(el->l_next_free_rec) - 1; i++) {
-        el = &fe->id2.i_list;
+                        rec = &el->l_recs[i];
+                        /*
+                         * In the case that cpos is off the allocation
+                         * tree, this should just wind up returning the
+                         * rightmost record.
+                         */
+                        range = le32_to_cpu(rec->e_cpos) +
+                                ocfs2_rec_clusters(el, rec);
+                        if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
+                            break;
+                }
-        if (el->l_tree_depth) {
+                blkno = le64_to_cpu(el->l_recs[i].e_blkno);
-                /* jump to end of tree */
+                if (blkno == 0) {
-                status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
+                        ocfs2_error(inode->i_sb,
-                                          &last_eb_bh, OCFS2_BH_CACHED, inode);
+                                    "Inode %llu has bad blkno in extent list "
-                if (status < 0) {
+                                    "at depth %u (index %d)\n",
-                        mlog_exit(status);
+                                    (unsigned long long)oi->ip_blkno,
-                        goto bail;
+                                    le16_to_cpu(el->l_tree_depth), i);
+                        ret = -EROFS;
+                        goto out;
                }
-                eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
+                brelse(bh);
+                bh = NULL;
+                ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno,
+                                       &bh, OCFS2_BH_CACHED, inode);
+                if (ret) {
+                        mlog_errno(ret);
+                        goto out;
+                }
+                eb = (struct ocfs2_extent_block *) bh->b_data;
                el = &eb->h_list;
+                if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
+                        OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
+                        ret = -EIO;
+                        goto out;
+                }
+                if (le16_to_cpu(el->l_next_free_rec) >
+                    le16_to_cpu(el->l_count)) {
+                        ocfs2_error(inode->i_sb,
+                                    "Inode %llu has bad count in extent list "
+                                    "at block %llu (next free=%u, count=%u)\n",
+                                    (unsigned long long)oi->ip_blkno,
+                                    (unsigned long long)bh->b_blocknr,
+                                    le16_to_cpu(el->l_next_free_rec),
+                                    le16_to_cpu(el->l_count));
+                        ret = -EROFS;
+                        goto out;
+                }
+                if (func)
+                        func(data, bh);
+        }
+out:
+        /*
+         * Catch any trailing bh that the loop didn't handle.
+         */
+        brelse(bh);
+        return ret;
+}
+/*
+ * Given an initialized path (that is, it has a valid root extent
+ * list), this function will traverse the btree in search of the path
+ * which would contain cpos.
+ *
+ * The path traveled is recorded in the path structure.
+ *
+ * Note that this will not do any comparisons on leaf node extent
+ * records, so it will work fine in the case that we just added a tree
+ * branch.
+ */
+struct find_path_data {
+        int index;
+        struct ocfs2_path *path;
+};
+static void find_path_ins(void *data, struct buffer_head *bh)
+{
+        struct find_path_data *fp = data;
+        get_bh(bh);
+        ocfs2_path_insert_eb(fp->path, fp->index, bh);
+        fp->index++;
+}
+static int ocfs2_find_path(struct inode *inode, struct ocfs2_path *path,
+                           u32 cpos)
+{
+        struct find_path_data data;
+        data.index = 1;
+        data.path = path;
+        return __ocfs2_find_path(inode, path_root_el(path), cpos,
+                                 find_path_ins, &data);
+}
+static void find_leaf_ins(void *data, struct buffer_head *bh)
+{
+        struct ocfs2_extent_block *eb =(struct ocfs2_extent_block *)bh->b_data;
+        struct ocfs2_extent_list *el = &eb->h_list;
+        struct buffer_head **ret = data;
+        /* We want to retain only the leaf block. */
+        if (le16_to_cpu(el->l_tree_depth) == 0) {
+                get_bh(bh);
+                *ret = bh;
+        }
+}
+/*
+ * Find the leaf block in the tree which would contain cpos. No
+ * checking of the actual leaf is done.
+ *
+ * Some paths want to call this instead of allocating a path structure
+ * and calling ocfs2_find_path().
+ *
+ * This function doesn't handle non btree extent lists.
+ */
+int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el,
+                    u32 cpos, struct buffer_head **leaf_bh)
+{
+        int ret;
+        struct buffer_head *bh = NULL;
+        ret = __ocfs2_find_path(inode, root_el, cpos, find_leaf_ins, &bh);
+        if (ret) {
+                mlog_errno(ret);
+                goto out;
+        }
+        *leaf_bh = bh;
+out:
+        return ret;
+}
+/*
+ * Adjust the adjacent records (left_rec, right_rec) involved in a rotation.
+ *
+ * Basically, we've moved stuff around at the bottom of the tree and
+ * we need to fix up the extent records above the changes to reflect
+ * the new changes.
+ *
+ * left_rec: the record on the left.
+ * left_child_el: is the child list pointed to by left_rec
+ * right_rec: the record to the right of left_rec
+ * right_child_el: is the child list pointed to by right_rec
+ *
+ * By definition, this only works on interior nodes.
+ */
+static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec,
+                                  struct ocfs2_extent_list *left_child_el,
+                                  struct ocfs2_extent_rec *right_rec,
+                                  struct ocfs2_extent_list *right_child_el)
+{
+        u32 left_clusters, right_end;
+        /*
+         * Interior nodes never have holes. Their cpos is the cpos of
+         * the leftmost record in their child list. Their cluster
+         * count covers the full theoretical range of their child list
+         * - the range between their cpos and the cpos of the record
+         * immediately to their right.
+         */
+        left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos);
+        left_clusters -= le32_to_cpu(left_rec->e_cpos);
+        left_rec->e_int_clusters = cpu_to_le32(left_clusters);
+        /*
+         * Calculate the rightmost cluster count boundary before
+         * moving cpos - we will need to adjust clusters after
+         * updating e_cpos to keep the same highest cluster count.
+         */
+        right_end = le32_to_cpu(right_rec->e_cpos);
+        right_end += le32_to_cpu(right_rec->e_int_clusters);
+        right_rec->e_cpos = left_rec->e_cpos;
+        le32_add_cpu(&right_rec->e_cpos, left_clusters);
+        right_end -= le32_to_cpu(right_rec->e_cpos);
+        right_rec->e_int_clusters = cpu_to_le32(right_end);
+}
+/*
+ * Adjust the adjacent root node records involved in a
+ * rotation. left_el_blkno is passed in as a key so that we can easily
+ * find its index in the root list.
+ */
+static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el,
+                                      struct ocfs2_extent_list *left_el,
+                                      struct ocfs2_extent_list *right_el,
+                                      u64 left_el_blkno)
+{
+        int i;
+        BUG_ON(le16_to_cpu(root_el->l_tree_depth) <=
+               le16_to_cpu(left_el->l_tree_depth));
+        for(i = 0; i < le16_to_cpu(root_el->l_next_free_rec) - 1; i++) {
+                if (le64_to_cpu(root_el->l_recs[i].e_blkno) == left_el_blkno)
+                        break;
+        }
+        /*
+         * The path walking code should have never returned a root and
+         * two paths which are not adjacent.
+         */
+        BUG_ON(i >= (le16_to_cpu(root_el->l_next_free_rec) - 1));
+        ocfs2_adjust_adjacent_records(&root_el->l_recs[i], left_el,
+                                      &root_el->l_recs[i + 1], right_el);
+}
+/*
+ * We've changed a leaf block (in right_path) and need to reflect that
+ * change back up the subtree.
+ *
+ * This happens in multiple places:
+ *   - When we've moved an extent record from the left path leaf to the right
+ *     path leaf to make room for an empty extent in the left path leaf.
+ *   - When our insert into the right path leaf is at the leftmost edge
+ *     and requires an update of the path immediately to its left. This
+ *     can occur at the end of some types of rotation and appending inserts.
+ */
+static void ocfs2_complete_edge_insert(struct inode *inode, handle_t *handle,
+                                       struct ocfs2_path *left_path,
+                                       struct ocfs2_path *right_path,
+                                       int subtree_index)
+{
+        int ret, i, idx;
+        struct ocfs2_extent_list *el, *left_el, *right_el;
+        struct ocfs2_extent_rec *left_rec, *right_rec;
+        struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
+        /*
+         * Update the counts and position values within all the
+         * interior nodes to reflect the leaf rotation we just did.
+         *
+         * The root node is handled below the loop.
+         *
+         * We begin the loop with right_el and left_el pointing to the
+         * leaf lists and work our way up.
+         *
+         * NOTE: within this loop, left_el and right_el always refer
+         * to the *child* lists.
+         */
+        left_el = path_leaf_el(left_path);
+        right_el = path_leaf_el(right_path);
+        for(i = left_path->p_tree_depth - 1; i > subtree_index; i--) {
+                mlog(0, "Adjust records at index %u\n", i);
+                /*
+                 * One nice property of knowing that all of these
+                 * nodes are below the root is that we only deal with
+                 * the leftmost right node record and the rightmost
+                 * left node record.
+                 */
+                el = left_path->p_node[i].el;
+                idx = le16_to_cpu(left_el->l_next_free_rec) - 1;
+                left_rec = &el->l_recs[idx];
+                el = right_path->p_node[i].el;
+                right_rec = &el->l_recs[0];
+                ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec,
+                                              right_el);
+                ret = ocfs2_journal_dirty(handle, left_path->p_node[i].bh);
+                if (ret)
+                        mlog_errno(ret);
+                ret = ocfs2_journal_dirty(handle, right_path->p_node[i].bh);
+                if (ret)
+                        mlog_errno(ret);
+                /*
+                 * Setup our list pointers now so that the current
+                 * parents become children in the next iteration.
+                 */
+                left_el = left_path->p_node[i].el;
+                right_el = right_path->p_node[i].el;
+        }
+        /*
+         * At the root node, adjust the two adjacent records which
+         * begin our path to the leaves.
+         */
+        el = left_path->p_node[subtree_index].el;
+        left_el = left_path->p_node[subtree_index + 1].el;
+        right_el = right_path->p_node[subtree_index + 1].el;
+        ocfs2_adjust_root_records(el, left_el, right_el,
+                                  left_path->p_node[subtree_index + 1].bh->b_blocknr);
+        root_bh = left_path->p_node[subtree_index].bh;
+        ret = ocfs2_journal_dirty(handle, root_bh);
+        if (ret)
+                mlog_errno(ret);
+}
+static int ocfs2_rotate_subtree_right(struct inode *inode,
+                                      handle_t *handle,
+                                      struct ocfs2_path *left_path,
+                                      struct ocfs2_path *right_path,
+                                      int subtree_index)
+{
+        int ret, i;
+        struct buffer_head *right_leaf_bh;
+        struct buffer_head *left_leaf_bh = NULL;
+        struct buffer_head *root_bh;
+        struct ocfs2_extent_list *right_el, *left_el;
+        struct ocfs2_extent_rec move_rec;
+        left_leaf_bh = path_leaf_bh(left_path);
+        left_el = path_leaf_el(left_path);
+        if (left_el->l_next_free_rec != left_el->l_count) {
+                ocfs2_error(inode->i_sb,
+                            "Inode %llu has non-full interior leaf node %llu"
+                            "(next free = %u)",
+                            (unsigned long long)OCFS2_I(inode)->ip_blkno,
+                            (unsigned long long)left_leaf_bh->b_blocknr,
+                            le16_to_cpu(left_el->l_next_free_rec));
+                return -EROFS;
+        }
+        /*
+         * This extent block may already have an empty record, so we
+         * return early if so.
+         */
+        if (ocfs2_is_empty_extent(&left_el->l_recs[0]))
+                return 0;
+        root_bh = left_path->p_node[subtree_index].bh;
+        BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
+        ret = ocfs2_journal_access(handle, inode, root_bh,
+                                   OCFS2_JOURNAL_ACCESS_WRITE);
+        if (ret) {
+                mlog_errno(ret);
+                goto out;
+        }
+        for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
+                ret = ocfs2_journal_access(handle, inode,
+                                           right_path->p_node[i].bh,
+                                           OCFS2_JOURNAL_ACCESS_WRITE);
+                if (ret) {
+                        mlog_errno(ret);
+                        goto out;
+                }
+                ret = ocfs2_journal_access(handle, inode,
+                                           left_path->p_node[i].bh,
+                                           OCFS2_JOURNAL_ACCESS_WRITE);
+                if (ret) {
+                        mlog_errno(ret);
+                        goto out;
+                }
+        }
+        right_leaf_bh = path_leaf_bh(right_path);
+        right_el = path_leaf_el(right_path);
+        /* This is a code error, not a disk corruption. */
+        mlog_bug_on_msg(!right_el->l_next_free_rec, "Inode %llu: Rotate fails "
+                        "because rightmost leaf block %llu is empty\n",
+                        (unsigned long long)OCFS2_I(inode)->ip_blkno,
+                        (unsigned long long)right_leaf_bh->b_blocknr);
+        ocfs2_create_empty_extent(right_el);
+        ret = ocfs2_journal_dirty(handle, right_leaf_bh);
+        if (ret) {
+                mlog_errno(ret);
+                goto out;
+        }
+        /* Do the copy now. */
+        i = le16_to_cpu(left_el->l_next_free_rec) - 1;
+        move_rec = left_el->l_recs[i];
+        right_el->l_recs[0] = move_rec;
+        /*
+         * Clear out the record we just copied and shift everything
+         * over, leaving an empty extent in the left leaf.
+         *
+         * We temporarily subtract from next_free_rec so that the
+         * shift will lose the tail record (which is now defunct).
+         */
+        le16_add_cpu(&left_el->l_next_free_rec, -1);
+        ocfs2_shift_records_right(left_el);
+        memset(&left_el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
+        le16_add_cpu(&left_el->l_next_free_rec, 1);
+        ret = ocfs2_journal_dirty(handle, left_leaf_bh);
+        if (ret) {
+                mlog_errno(ret);
+                goto out;
+        }
+        ocfs2_complete_edge_insert(inode, handle, left_path, right_path,
+                                subtree_index);
+out:
+        return ret;
+}
+/*
+ * Given a full path, determine what cpos value would return us a path
+ * containing the leaf immediately to the left of the current one.
+ *
+ * Will return zero if the path passed in is already the leftmost path.
+ */
+static int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
+                                         struct ocfs2_path *path, u32 *cpos)
+{
+        int i, j, ret = 0;
+        u64 blkno;
+        struct ocfs2_extent_list *el;
+        BUG_ON(path->p_tree_depth == 0);
+        *cpos = 0;
+        blkno = path_leaf_bh(path)->b_blocknr;
+        /* Start at the tree node just above the leaf and work our way up. */
+        i = path->p_tree_depth - 1;
+        while (i >= 0) {
+                el = path->p_node[i].el;
+                /*
+                 * Find the extent record just before the one in our
+                 * path.
+                 */
+                for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) {
+                        if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) {
+                                if (j == 0) {
+                                        if (i == 0) {
+                                                /*
+                                                 * We've determined that the
+                                                 * path specified is already
+                                                 * the leftmost one - return a
+                                                 * cpos of zero.
+                                                 */
+                                                goto out;
+                                        }
+                                        /*
+                                         * The leftmost record points to our
+                                         * leaf - we need to travel up the
+                                         * tree one level.
+                                         */
+                                        goto next_node;
+                                }
+                                *cpos = le32_to_cpu(el->l_recs[j - 1].e_cpos);
+                                *cpos = *cpos + ocfs2_rec_clusters(el,
+                                                           &el->l_recs[j - 1]);
+                                *cpos = *cpos - 1;
+                                goto out;
+                        }
+                }
+                /*
+                 * If we got here, we never found a valid node where
+                 * the tree indicated one should be.
+                 */
+                ocfs2_error(sb,
+                            "Invalid extent tree at extent block %llu\n",
+                            (unsigned long long)blkno);
+                ret = -EROFS;
+                goto out;
+next_node:
+                blkno = path->p_node[i].bh->b_blocknr;
+                i--;
+        }
+out:
+        return ret;
+}
+static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth,
+                                           struct ocfs2_path *path)
+{
+        int credits = (path->p_tree_depth - subtree_depth) * 2 + 1;
+        if (handle->h_buffer_credits < credits)
+                return ocfs2_extend_trans(handle, credits);
+        return 0;
+}
+/*
+ * Trap the case where we're inserting into the theoretical range past
+ * the _actual_ left leaf range. Otherwise, we'll rotate a record
+ * whose cpos is less than ours into the right leaf.
+ *
+ * It's only necessary to look at the rightmost record of the left
+ * leaf because the logic that calls us should ensure that the
+ * theoretical ranges in the path components above the leaves are
+ * correct.
+ */
+static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path,
+                                                 u32 insert_cpos)
+{
+        struct ocfs2_extent_list *left_el;
+        struct ocfs2_extent_rec *rec;
+        int next_free;
+        left_el = path_leaf_el(left_path);
+        next_free = le16_to_cpu(left_el->l_next_free_rec);
+        rec = &left_el->l_recs[next_free - 1];
+        if (insert_cpos > le32_to_cpu(rec->e_cpos))
+                return 1;
+        return 0;
+}
+/*
+ * Rotate all the records in a btree right one record, starting at insert_cpos.
+ *
+ * The path to the rightmost leaf should be passed in.
+ *
+ * The array is assumed to be large enough to hold an entire path (tree depth).
+ *
+ * Upon successful return from this function:
+ *
+ * - The 'right_path' array will contain a path to the leaf block
+ *   whose range contains insert_cpos.
+ * - That leaf block will have a single empty extent in list index 0.
+ * - In the case that the rotation requires a post-insert update,
+ *   *ret_left_path will contain a valid path which can be passed to
+ *   ocfs2_insert_path().
+ */
+static int ocfs2_rotate_tree_right(struct inode *inode,
+                                   handle_t *handle,
+                                   u32 insert_cpos,
+                                   struct ocfs2_path *right_path,
+                                   struct ocfs2_path **ret_left_path)
+{
+        int ret, start;
+        u32 cpos;
+        struct ocfs2_path *left_path = NULL;
+        *ret_left_path = NULL;
+        left_path = ocfs2_new_path(path_root_bh(right_path),
+                                   path_root_el(right_path));
+        if (!left_path) {
+                ret = -ENOMEM;
+                mlog_errno(ret);
+                goto out;
+        }
+        ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path, &cpos);
+        if (ret) {
+                mlog_errno(ret);
+                goto out;
+        }
+        mlog(0, "Insert: %u, first left path cpos: %u\n", insert_cpos, cpos);
+        /*
+         * What we want to do here is:
+         *
+         * 1) Start with the rightmost path.
+         *
+         * 2) Determine a path to the leaf block directly to the left
+         *    of that leaf.
+         *
+         * 3) Determine the 'subtree root' - the lowest level tree node
+         *    which contains a path to both leaves.
+         *
+         * 4) Rotate the subtree.
+         *
+         * 5) Find the next subtree by considering the left path to be
+         *    the new right path.
+         *
+         * The check at the top of this while loop also accepts
+         * insert_cpos == cpos because cpos is only a _theoretical_
+         * value to get us the left path - insert_cpos might very well
+         * be filling that hole.
+         *
+         * Stop at a cpos of '0' because we either started at the
+         * leftmost branch (i.e., a tree with one branch and a
+         * rotation inside of it), or we've gone as far as we can in
+         * rotating subtrees.
+         */
+        while (cpos && insert_cpos <= cpos) {
+                mlog(0, "Rotating a tree: ins. cpos: %u, left path cpos: %u\n",
+                     insert_cpos, cpos);
+                ret = ocfs2_find_path(inode, left_path, cpos);
+                if (ret) {
+                        mlog_errno(ret);
+                        goto out;
+                }
+                mlog_bug_on_msg(path_leaf_bh(left_path) ==
+                                path_leaf_bh(right_path),
+                                "Inode %lu: error during insert of %u "
+                                "(left path cpos %u) results in two identical "
+                                "paths ending at %llu\n",
+                                inode->i_ino, insert_cpos, cpos,
+                                (unsigned long long)
+                                path_leaf_bh(left_path)->b_blocknr);
+                if (ocfs2_rotate_requires_path_adjustment(left_path,
+                                                          insert_cpos)) {
+                        mlog(0, "Path adjustment required\n");
+                        /*
+                         * We've rotated the tree as much as we
+                         * should. The rest is up to
+                         * ocfs2_insert_path() to complete, after the
+                         * record insertion. We indicate this
+                         * situation by returning the left path.
+                         *
+                         * The reason we don't adjust the records here
+                         * before the record insert is that an error
+                         * later might break the rule where a parent
+                         * record e_cpos will reflect the actual
+                         * e_cpos of the 1st nonempty record of the
+                         * child list.
+                         */
+                        *ret_left_path = left_path;
+                        goto out_ret_path;
+                }
+                start = ocfs2_find_subtree_root(inode, left_path, right_path);
+                mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n",
+                     start,
+                     (unsigned long long) right_path->p_node[start].bh->b_blocknr,
+                     right_path->p_tree_depth);
+                ret = ocfs2_extend_rotate_transaction(handle, start,
+                                                      right_path);
+                if (ret) {
+                        mlog_errno(ret);
+                        goto out;
+                }
+                ret = ocfs2_rotate_subtree_right(inode, handle, left_path,
+                                                 right_path, start);
+                if (ret) {
+                        mlog_errno(ret);
+                        goto out;
+                }
+                /*
+                 * There is no need to re-read the next right path
+                 * as we know that it'll be our current left
+                 * path. Optimize by copying values instead.
+                 */
+                ocfs2_mv_path(right_path, left_path);
+                ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path,
+                                                    &cpos);
+                if (ret) {
+                        mlog_errno(ret);
+                        goto out;
+                }
+        }
+out:
+        ocfs2_free_path(left_path);
+out_ret_path:
+        return ret;
+}
+/*
+ * Do the final bits of extent record insertion at the target leaf
+ * list. If this leaf is part of an allocation tree, it is assumed
+ * that the tree above has been prepared.
+ *
+ * insert_rec: record to add; its fields are stored little-endian.
+ * el:         leaf extent list to modify (l_tree_depth must be 0).
+ * insert:     precomputed insert type - contiguity, contig index and
+ *             append type are read from it.
+ * inode:      only used for i_ino in the BUG message below.
+ *
+ * Exactly one of four cases runs, tested in this order: merge with a
+ * contiguous record, fill an empty leaf, append at the tail, or fall
+ * through to ocfs2_rotate_leaf().
+ */
+static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec,
+                                 struct ocfs2_extent_list *el,
+                                 struct ocfs2_insert_type *insert,
+                                 struct inode *inode)
+{
+        int i = insert->ins_contig_index;
+        unsigned int range;
+        struct ocfs2_extent_rec *rec;
+        BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
+        /*
+         * Contiguous insert - either left or right.
+         *
+         * The record at ins_contig_index grows by the new cluster
+         * count. For CONTIG_LEFT it also takes over the start
+         * (e_cpos and e_blkno) of insert_rec.
+         */
+        if (insert->ins_contig != CONTIG_NONE) {
+                rec = &el->l_recs[i];
+                if (insert->ins_contig == CONTIG_LEFT) {
+                        rec->e_blkno = insert_rec->e_blkno;
+                        rec->e_cpos = insert_rec->e_cpos;
+                }
+                le16_add_cpu(&rec->e_leaf_clusters,
+                             le16_to_cpu(insert_rec->e_leaf_clusters));
+                return;
+        }
+        /*
+         * Handle insert into an empty leaf.
+         *
+         * "Empty" means no records at all, or a single record for
+         * which ocfs2_is_empty_extent() is true. The new record
+         * becomes l_recs[0].
+         */
+        if (le16_to_cpu(el->l_next_free_rec) == 0 ||
+            ((le16_to_cpu(el->l_next_free_rec) == 1) &&
+             ocfs2_is_empty_extent(&el->l_recs[0]))) {
+                el->l_recs[0] = *insert_rec;
+                el->l_next_free_rec = cpu_to_le16(1);
+                return;
+        }
+        /*
+         * Appending insert.
+         *
+         * The new record goes in the slot right after the current
+         * last record. It must not start before that record ends,
+         * and the leaf must still have a free slot - both are
+         * checked with BUG-style assertions.
+         */
+        if (insert->ins_appending == APPEND_TAIL) {
+                i = le16_to_cpu(el->l_next_free_rec) - 1;
+                rec = &el->l_recs[i];
+                range = le32_to_cpu(rec->e_cpos)
+                        + le16_to_cpu(rec->e_leaf_clusters);
+                BUG_ON(le32_to_cpu(insert_rec->e_cpos) < range);
+                mlog_bug_on_msg(le16_to_cpu(el->l_next_free_rec) >=
+                                le16_to_cpu(el->l_count),
+                                "inode %lu, depth %u, count %u, next free %u, "
+                                "rec.cpos %u, rec.clusters %u, "
+                                "insert.cpos %u, insert.clusters %u\n",
+                                inode->i_ino,
+                                le16_to_cpu(el->l_tree_depth),
+                                le16_to_cpu(el->l_count),
+                                le16_to_cpu(el->l_next_free_rec),
+                                le32_to_cpu(el->l_recs[i].e_cpos),
+                                le16_to_cpu(el->l_recs[i].e_leaf_clusters),
+                                le32_to_cpu(insert_rec->e_cpos),
+                                le16_to_cpu(insert_rec->e_leaf_clusters));
+                i++;
+                el->l_recs[i] = *insert_rec;
+                le16_add_cpu(&el->l_next_free_rec, 1);
+                return;
+        }
+        /*
+         * Ok, we have to rotate.
+         *
+         * At this point, it is safe to assume that inserting into an
+         * empty leaf and appending to a leaf have both been handled
+         * above.
+         *
+         * This leaf needs to have space, either by the empty 1st
+         * extent record, or by virtue of an l_next_free_rec < l_count.
+         */
+        ocfs2_rotate_leaf(el, insert_rec);
+}
+/*
+ * Add 'clusters' to the little-endian i_clusters field of 'di' and
+ * mirror the new total into OCFS2_I(inode)->ip_clusters. Only the
+ * ip_clusters store is done under ip_lock.
+ */
+static inline void ocfs2_update_dinode_clusters(struct inode *inode,
+                                                struct ocfs2_dinode *di,
+                                                u32 clusters)
+{
+        le32_add_cpu(&di->i_clusters, clusters);
+        spin_lock(&OCFS2_I(inode)->ip_lock);
+        OCFS2_I(inode)->ip_clusters = le32_to_cpu(di->i_clusters);
+        spin_unlock(&OCFS2_I(inode)->ip_lock);
+}
+/*
+ * Prepare the interior nodes of right_path for a tail append of
+ * insert_rec. The leaf itself is not modified here.
+ *
+ * For every non-leaf extent list on right_path (root included), the
+ * last used record's e_int_clusters is rewritten so that the record
+ * ends where insert_rec ends.
+ *
+ * If the leaf of right_path is empty (no records, or only an empty
+ * first record) and it is not the leftmost leaf, the path to the
+ * leaf on its left is looked up and handed back in *ret_left_path.
+ * On success the caller takes over that path (it may be NULL). On
+ * error *ret_left_path stays NULL and any allocated path is freed.
+ *
+ * Returns 0 on success, -ENOMEM if the left path cannot be
+ * allocated, -EIO if an interior list has no records, or the error
+ * from the path lookup / journal access helpers.
+ */
+static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
+                                    struct ocfs2_extent_rec *insert_rec,
+                                    struct ocfs2_path *right_path,
+                                    struct ocfs2_path **ret_left_path)
+{
+        int ret, i, next_free;
+        struct buffer_head *bh;
+        struct ocfs2_extent_list *el;
+        struct ocfs2_path *left_path = NULL;
+        *ret_left_path = NULL;
+        /*
+         * This shouldn't happen for non-trees. The extent rec cluster
+         * count manipulation below only works for interior nodes.
+         */
+        BUG_ON(right_path->p_tree_depth == 0);
+        /*
+         * If our appending insert is at the leftmost edge of a leaf,
+         * then we might need to update the rightmost records of the
+         * neighboring path.
+         */
+        el = path_leaf_el(right_path);
+        next_free = le16_to_cpu(el->l_next_free_rec);
+        if (next_free == 0 ||
+            (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0]))) {
+                u32 left_cpos;
+                ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path,
+                                                    &left_cpos);
+                if (ret) {
+                        mlog_errno(ret);
+                        goto out;
+                }
+                mlog(0, "Append may need a left path update. cpos: %u, "
+                     "left_cpos: %u\n", le32_to_cpu(insert_rec->e_cpos),
+                     left_cpos);
+                /*
+                 * No need to worry if the append is already in the
+                 * leftmost leaf.
+                 */
+                if (left_cpos) {
+                        left_path = ocfs2_new_path(path_root_bh(right_path),
+                                                   path_root_el(right_path));
+                        if (!left_path) {
+                                ret = -ENOMEM;
+                                mlog_errno(ret);
+                                goto out;
+                        }
+                        ret = ocfs2_find_path(inode, left_path, left_cpos);
+                        if (ret) {
+                                mlog_errno(ret);
+                                goto out;
+                        }
+                        /*
+                         * ocfs2_insert_path() will pass the left_path to the
+                         * journal for us.
+                         */
+                }
+        }
+        ret = ocfs2_journal_access_path(inode, handle, right_path);
+        if (ret) {
+                mlog_errno(ret);
+                goto out;
+        }
+        /* Walk from the root down, stopping before the leaf. */
+        el = path_root_el(right_path);
+        bh = path_root_bh(right_path);
+        i = 0;
+        while (1) {
+                struct ocfs2_extent_rec *rec;
+                next_free = le16_to_cpu(el->l_next_free_rec);
+                if (next_free == 0) {
+                        ocfs2_error(inode->i_sb,
+                                    "Dinode %llu has a bad extent list",
+                                    (unsigned long long)OCFS2_I(inode)->ip_blkno);
+                        ret = -EIO;
+                        goto out;
+                }
+                /*
+                 * e_int_clusters = (insert cpos + insert clusters)
+                 *                  - this record's cpos
+                 * i.e. the last record is stretched to end where
+                 * insert_rec ends. The first store copies the raw
+                 * little-endian e_cpos, the adds work on top of it.
+                 */
+                rec = &el->l_recs[next_free - 1];
+                rec->e_int_clusters = insert_rec->e_cpos;
+                le32_add_cpu(&rec->e_int_clusters,
+                             le16_to_cpu(insert_rec->e_leaf_clusters));
+                le32_add_cpu(&rec->e_int_clusters,
+                             -le32_to_cpu(rec->e_cpos));
+                /* A failure here is only logged; the walk carries on. */
+                ret = ocfs2_journal_dirty(handle, bh);
+                if (ret)
+                        mlog_errno(ret);
+                /* Don't touch the leaf node */
+                if (++i >= right_path->p_tree_depth)
+                        break;
+                bh = right_path->p_node[i].bh;
+                el = right_path->p_node[i].el;
+        }
+        *ret_left_path = left_path;
+        ret = 0;
+out:
+        if (ret != 0)
+                ocfs2_free_path(left_path);
+        return ret;
+}
+/*
+ * This function only does inserts on an allocation b-tree. For dinode
+ * lists, ocfs2_insert_at_leaf() is called directly.
+ *
+ * right_path is the path we want to do the actual insert
+ * in. left_path should only be passed in if we need to update that
+ * portion of the tree after an edge insert.
+ *
+ * Both paths are handed to ocfs2_journal_access_path(). When a
+ * left_path is present, ocfs2_extend_trans() is first called with the
+ * handle's current h_buffer_credits plus left_path->p_tree_depth.
+ *
+ * Returns 0 on success or the error from the journal access /
+ * transaction extend calls. A failing ocfs2_journal_dirty() on the
+ * leaf is logged but does not change the return value.
+ */
+static int ocfs2_insert_path(struct inode *inode,
+                             handle_t *handle,
+                             struct ocfs2_path *left_path,
+                             struct ocfs2_path *right_path,
+                             struct ocfs2_extent_rec *insert_rec,
+                             struct ocfs2_insert_type *insert)
+{
+        int ret, subtree_index;
+        struct buffer_head *leaf_bh = path_leaf_bh(right_path);
+        struct ocfs2_extent_list *el;
+        /*
+         * Pass both paths to the journal. The majority of inserts
+         * will be touching all components anyway.
+         */
+        ret = ocfs2_journal_access_path(inode, handle, right_path);
+        if (ret < 0) {
+                mlog_errno(ret);
+                goto out;
+        }
+        if (left_path) {
+                int credits = handle->h_buffer_credits;
+                /*
+                 * There's a chance that left_path got passed back to
+                 * us without being accounted for in the
+                 * journal. Extend our transaction here to be sure we
+                 * can change those blocks.
+                 */
+                credits += left_path->p_tree_depth;
+                ret = ocfs2_extend_trans(handle, credits);
+                if (ret < 0) {
+                        mlog_errno(ret);
+                        goto out;
+                }
+                ret = ocfs2_journal_access_path(inode, handle, left_path);
+                if (ret < 0) {
+                        mlog_errno(ret);
+                        goto out;
+                }
+        }
+        el = path_leaf_el(right_path);
+        ocfs2_insert_at_leaf(insert_rec, el, insert, inode);
+        ret = ocfs2_journal_dirty(handle, leaf_bh);
+        if (ret)
+                mlog_errno(ret);
+        if (left_path) {
+                /*
+                 * The rotate code has indicated that we need to fix
+                 * up portions of the tree after the insert.
+                 *
+                 * XXX: Should we extend the transaction here?
+                 */
+                subtree_index = ocfs2_find_subtree_root(inode, left_path,
+                                                        right_path);
+                ocfs2_complete_edge_insert(inode, handle, left_path,
+                                           right_path, subtree_index);
+        }
+        ret = 0;
+out:
+        return ret;
+}
+/*
+ * Insert insert_rec into the extent tree rooted at the dinode in
+ * di_bh, using the insert type that was worked out beforehand.
+ *
+ * di_bh is journaled for write first. With a tree depth of 0 the
+ * record goes straight into the dinode's own extent list. Otherwise
+ * a path to the target leaf is looked up, the tree is rotated right
+ * or prepared for a tail append where the insert type calls for it,
+ * and ocfs2_insert_path() does the actual leaf insert.
+ *
+ * On success the dinode cluster count is bumped by
+ * insert_rec->e_leaf_clusters and di_bh is marked dirty; the value
+ * returned is then the result of that final ocfs2_journal_dirty().
+ * Any earlier failure is returned as is, skipping the cluster
+ * update. Both paths are freed before returning.
+ */
+static int ocfs2_do_insert_extent(struct inode *inode,
+                                  handle_t *handle,
+                                  struct buffer_head *di_bh,
+                                  struct ocfs2_extent_rec *insert_rec,
+                                  struct ocfs2_insert_type *type)
+{
+        int ret, rotate = 0;
+        u32 cpos;
+        struct ocfs2_path *right_path = NULL;
+        struct ocfs2_path *left_path = NULL;
+        struct ocfs2_dinode *di;
+        struct ocfs2_extent_list *el;
+        di = (struct ocfs2_dinode *) di_bh->b_data;
+        el = &di->id2.i_list;
+        ret = ocfs2_journal_access(handle, inode, di_bh,
+                                   OCFS2_JOURNAL_ACCESS_WRITE);
+        if (ret) {
+                mlog_errno(ret);
+                goto out;
+        }
+        if (le16_to_cpu(el->l_tree_depth) == 0) {
+                ocfs2_insert_at_leaf(insert_rec, el, type, inode);
+                goto out_update_clusters;
+        }
+        right_path = ocfs2_new_inode_path(di_bh);
+        if (!right_path) {
+                ret = -ENOMEM;
+                mlog_errno(ret);
+                goto out;
+        }
+        /*
+         * Determine the path to start with. Rotations need the
+         * rightmost path, everything else can go directly to the
+         * target leaf.
+         */
+        cpos = le32_to_cpu(insert_rec->e_cpos);
+        if (type->ins_appending == APPEND_NONE &&
+            type->ins_contig == CONTIG_NONE) {
+                rotate = 1;
+                cpos = UINT_MAX;
+        }
+        ret = ocfs2_find_path(inode, right_path, cpos);
+        if (ret) {
+                mlog_errno(ret);
+                goto out;
+        }
+        /*
+         * Rotations and appends need special treatment - they modify
+         * parts of the tree above them.
+         *
+         * Both might pass back a path immediately to the left of the
+         * one being inserted to. This will cause
+         * ocfs2_insert_path() to modify the rightmost records of
+         * left_path to account for an edge insert.
+         *
+         * XXX: When modifying this code, keep in mind that an insert
+         * can wind up skipping both of these two special cases...
+         */
+        if (rotate) {
+                ret = ocfs2_rotate_tree_right(inode, handle,
+                                              le32_to_cpu(insert_rec->e_cpos),
+                                              right_path, &left_path);
+                if (ret) {
+                        mlog_errno(ret);
+                        goto out;
+                }
+        } else if (type->ins_appending == APPEND_TAIL
+                   && type->ins_contig != CONTIG_LEFT) {
+                ret = ocfs2_append_rec_to_path(inode, handle, insert_rec,
+                                               right_path, &left_path);
+                if (ret) {
+                        mlog_errno(ret);
+                        goto out;
+                }
+        }
+        ret = ocfs2_insert_path(inode, handle, left_path, right_path,
+                                insert_rec, type);
+        if (ret) {
+                mlog_errno(ret);
+                goto out;
+        }
+out_update_clusters:
+        ocfs2_update_dinode_clusters(inode, di,
+                                     le16_to_cpu(insert_rec->e_leaf_clusters));
+        ret = ocfs2_journal_dirty(handle, di_bh);
+        if (ret)
+                mlog_errno(ret);
+out:
+        ocfs2_free_path(left_path);
+        ocfs2_free_path(right_path);
+        return ret;
+}
+/*
+ * Work out whether insert_rec is contiguous with a record already in
+ * the leaf list 'el' (l_tree_depth must be 0).
+ *
+ * The used records are scanned in order and the scan stops at the
+ * first one for which ocfs2_extent_contig() returns something other
+ * than CONTIG_NONE; its index is stored in insert->ins_contig_index.
+ * insert->ins_contig always receives the final contig type. When no
+ * record matches, ins_contig is CONTIG_NONE and ins_contig_index is
+ * left as the caller set it.
+ */
+static void ocfs2_figure_contig_type(struct inode *inode,
+                                     struct ocfs2_insert_type *insert,
+                                     struct ocfs2_extent_list *el,
+                                     struct ocfs2_extent_rec *insert_rec)
+{
+        int i;
+        enum ocfs2_contig_type contig_type = CONTIG_NONE;
+        BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
+        for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
+                contig_type = ocfs2_extent_contig(inode, &el->l_recs[i],
+                                                  insert_rec);
+                if (contig_type != CONTIG_NONE) {
+                        insert->ins_contig_index = i;
+                        break;
+                }
+        }
+        insert->ins_contig = contig_type;
+}
+/*
+ * This should only be called against the rightmost leaf extent list.
+ *
+ * ocfs2_figure_appending_type() will figure out whether we'll have to
+ * insert at the tail of the rightmost leaf.
+ *
+ * This should also work against the dinode list for trees with 0
+ * depth. If we consider the dinode list to be the rightmost leaf node
+ * then the logic here makes sense.
+ *
+ * insert->ins_appending is set to APPEND_TAIL when the list has no
+ * records, when its only record is an empty extent, or when
+ * insert_rec starts at or past the end of the last record. In every
+ * other case it is left at APPEND_NONE.
+ */
+static void ocfs2_figure_appending_type(struct ocfs2_insert_type *insert,
+                                        struct ocfs2_extent_list *el,
+                                        struct ocfs2_extent_rec *insert_rec)
+{
+        int i;
+        u32 cpos = le32_to_cpu(insert_rec->e_cpos);
+        struct ocfs2_extent_rec *rec;
+        insert->ins_appending = APPEND_NONE;
+        BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
+        if (!el->l_next_free_rec)
+                goto set_tail_append;
+        if (ocfs2_is_empty_extent(&el->l_recs[0])) {
+                /* Were all records empty? */
+                if (le16_to_cpu(el->l_next_free_rec) == 1)
+                        goto set_tail_append;
        }
-        /* Can we allocate without adding/shifting tree bits? */
        i = le16_to_cpu(el->l_next_free_rec) - 1;
-        if (le16_to_cpu(el->l_next_free_rec) == 0
+        rec = &el->l_recs[i];
-            || (le16_to_cpu(el->l_next_free_rec) < le16_to_cpu(el->l_count))
-            || le32_to_cpu(el->l_recs[i].e_clusters) == 0
+        if (cpos >=
-            || ocfs2_extent_contig(inode, &el->l_recs[i], start_blk))
+            (le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)))
-                goto out_add;
+                goto set_tail_append;
+        return;
+set_tail_append:
+        insert->ins_appending = APPEND_TAIL;
+}
+/*
+ * Helper function called at the beginning of an insert.
+ *
+ * This computes a few things that are commonly used in the process of
+ * inserting into the btree:
+ *   - Whether the new extent is contiguous with an existing one.
+ *   - The current tree depth.
+ *   - Whether the insert is an appending one.
+ *   - The total # of free records in the tree.
+ *
+ * All of the information is stored on the ocfs2_insert_type
+ * structure.
+ *
+ * NOTE(review): ins_free_records is computed from a single extent
+ * list only - the dinode list for depth 0, otherwise the list in the
+ * block read from di->i_last_eb_blk - not from the whole tree.
+ *
+ * For a tree with depth, that block's buffer_head is handed back in
+ * *last_eb_bh on success and released on failure. For depth 0 the
+ * function returns 0 early and leaves *last_eb_bh untouched.
+ *
+ * Returns 0 on success, -ENOMEM if the path cannot be allocated, or
+ * the error from ocfs2_read_block() / ocfs2_find_path().
+ */
+static int ocfs2_figure_insert_type(struct inode *inode,
+                                    struct buffer_head *di_bh,
+                                    struct buffer_head **last_eb_bh,
+                                    struct ocfs2_extent_rec *insert_rec,
+                                    struct ocfs2_insert_type *insert)
+{
+        int ret;
+        struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+        struct ocfs2_extent_block *eb;
+        struct ocfs2_extent_list *el;
+        struct ocfs2_path *path = NULL;
+        struct buffer_head *bh = NULL;
+        el = &di->id2.i_list;
+        insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth);
+        if (el->l_tree_depth) {
+                /*
+                 * If we have tree depth, we read in the
+                 * rightmost extent block ahead of time as
+                 * ocfs2_figure_insert_type() and ocfs2_add_branch()
+                 * may want it later.
+                 */
+                ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
+                                       le64_to_cpu(di->i_last_eb_blk), &bh,
+                                       OCFS2_BH_CACHED, inode);
+                if (ret) {
+                        /*
+                         * NOTE(review): every other error path in
+                         * this function logs with mlog_errno();
+                         * mlog_exit() here looks unintended - confirm.
+                         */
+                        mlog_exit(ret);
+                        goto out;
+                }
+                eb = (struct ocfs2_extent_block *) bh->b_data;
+                el = &eb->h_list;
+        }
+        /*
+         * Unless we have a contiguous insert, we'll need to know if
+         * there is room left in our allocation tree for another
+         * extent record.
+         *
+         * XXX: This test is simplistic, we can search for empty
+         * extent records too.
+         */
+        insert->ins_free_records = le16_to_cpu(el->l_count) -
+                le16_to_cpu(el->l_next_free_rec);
+        if (!insert->ins_tree_depth) {
+                ocfs2_figure_contig_type(inode, insert, el, insert_rec);
+                ocfs2_figure_appending_type(insert, el, insert_rec);
+                return 0;
+        }
+        path = ocfs2_new_inode_path(di_bh);
+        if (!path) {
+                ret = -ENOMEM;
+                mlog_errno(ret);
+                goto out;
+        }
+        /*
+         * In the case that we're inserting past what the tree
+         * currently accounts for, ocfs2_find_path() will return for
+         * us the rightmost tree path. This is accounted for below in
+         * the appending code.
+         */
+        ret = ocfs2_find_path(inode, path, le32_to_cpu(insert_rec->e_cpos));
+        if (ret) {
+                mlog_errno(ret);
+                goto out;
+        }
+        el = path_leaf_el(path);
+        /*
+         * Now that we have the path, there are two things we want to determine:
+         * 1) Contiguousness (also set contig_index if this is so)
+         *
+         * 2) Are we doing an append? We can trivially break this up
+         *     into two types of appends: simple record append, or a
+         *     rotate inside the tail leaf.
+         */
+        ocfs2_figure_contig_type(inode, insert, el, insert_rec);
+        /*
+         * The insert code isn't quite ready to deal with all cases of
+         * left contiguousness. Specifically, if it's an insert into
+         * the 1st record in a leaf, it will require the adjustment of
+         * cluster count on the last record of the path directly to its
+         * left. For now, just catch that case and fool the layers
+         * above us. This works just fine for tree_depth == 0, which
+         * is why we allow that above.
+         */
+        if (insert->ins_contig == CONTIG_LEFT &&
+            insert->ins_contig_index == 0)
+                insert->ins_contig = CONTIG_NONE;
+        /*
+         * Ok, so we can simply compare against last_eb to figure out
+         * whether the path doesn't exist. This will only happen in
+         * the case that we're doing a tail append, so maybe we can
+         * take advantage of that information somehow.
+         */
+        if (le64_to_cpu(di->i_last_eb_blk) == path_leaf_bh(path)->b_blocknr) {
+                /*
+                 * Ok, ocfs2_find_path() returned us the rightmost
+                 * tree path. This might be an appending insert. There are
+                 * two cases:
+                 *    1) We're doing a true append at the tail:
+                 *      -This might even be off the end of the leaf
+                 *    2) We're "appending" by rotating in the tail
+                 */
+                ocfs2_figure_appending_type(insert, el, insert_rec);
+        }
+out:
+        ocfs2_free_path(path);
+        if (ret == 0)
+                *last_eb_bh = bh;
+        else
+                brelse(bh);
+        return ret;
+}
+/*
+ * Insert an extent into an inode btree.
+ *
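+ * NOTE(review): ocfs2_do_insert_extent() already adds the new clusters
+ * to the dinode's i_clusters, so the older remark that "the caller
+ * needs to update fe->i_clusters" looks stale - confirm.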
+ */
+int ocfs2_insert_extent(struct ocfs2_super *osb,
+                        handle_t *handle,
+                        struct inode *inode,
+                        struct buffer_head *fe_bh,
+                        u32 cpos,
+                        u64 start_blk,
+                        u32 new_clusters,
+                        struct ocfs2_alloc_context *meta_ac)
+{
+        int status, shift;
+        struct buffer_head *last_eb_bh = NULL;
+        struct buffer_head *bh = NULL;
+        struct ocfs2_insert_type insert = {0, };
+        struct ocfs2_extent_rec rec;
+        mlog(0, "add %u clusters at position %u to inode %llu\n",
+             new_clusters, cpos, (unsigned long long)OCFS2_I(inode)->ip_blkno);
+        mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) &&
+                        (OCFS2_I(inode)->ip_clusters != cpos),
+                        "Device %s, asking for sparse allocation: inode %llu, "
+                        "cpos %u, clusters %u\n",
+                        osb->dev_str,
+                        (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos,
+                        OCFS2_I(inode)->ip_clusters);
+        memset(&rec, 0, sizeof(rec));
+        rec.e_cpos = cpu_to_le32(cpos);
+        rec.e_blkno = cpu_to_le64(start_blk);
+        rec.e_leaf_clusters = cpu_to_le16(new_clusters);
+        status = ocfs2_figure_insert_type(inode, fe_bh, &last_eb_bh, &rec,
+                                          &insert);
+        if (status < 0) {
+                mlog_errno(status);
+                goto bail;
+        }
-        mlog(0, "ocfs2_allocate_extent: couldn't do a simple add, traversing "
+        mlog(0, "Insert.appending: %u, Insert.Contig: %u, "
-             "tree now.\n");
+             "Insert.contig_index: %d, Insert.free_records: %d, "
+             "Insert.tree_depth: %d\n",
+             insert.ins_appending, insert.ins_contig, insert.ins_contig_index,
+             insert.ins_free_records, insert.ins_tree_depth);
+        /*
+         * Avoid growing the tree unless we're out of records and the
+         * insert type requires one.
+         */
+        if (insert.ins_contig != CONTIG_NONE || insert.ins_free_records)
+                goto out_add;
        shift = ocfs2_find_branch_target(osb, inode, fe_bh, &bh);
        if (shift < 0) {
@@ -866,13 +2382,9 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
         * and didn't find room for any more extents - we need to add
         * another tree level */
        if (shift) {
-                /* if we hit a leaf, we'd better be empty :) */
-                BUG_ON(le16_to_cpu(el->l_next_free_rec) !=
-                       le16_to_cpu(el->l_count));
                BUG_ON(bh);
-                mlog(0, "ocfs2_allocate_extent: need to shift tree depth "
+                mlog(0, "need to shift tree depth "
-                     "(current = %u)\n",
+                     "(current = %d)\n", insert.ins_tree_depth);
-                     le16_to_cpu(fe->id2.i_list.l_tree_depth));
                /* ocfs2_shift_tree_depth will return us a buffer with
                 * the new extent block (so we can pass that to
@@ -883,15 +2395,16 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
                        mlog_errno(status);
                        goto bail;
                }
+                insert.ins_tree_depth++;
                /* Special case: we have room now if we shifted from
                 * tree_depth 0 */
-                if (fe->id2.i_list.l_tree_depth == cpu_to_le16(1))
+                if (insert.ins_tree_depth == 1)
                        goto out_add;
        }
        /* call ocfs2_add_branch to add the final part of the tree with
         * the new data. */
-        mlog(0, "ocfs2_allocate_extent: add branch. bh = %p\n", bh);
+        mlog(0, "add branch. bh = %p\n", bh);
        status = ocfs2_add_branch(osb, handle, inode, fe_bh, bh, last_eb_bh,
                                  meta_ac);
        if (status < 0) {
@@ -900,11 +2413,12 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
        }
 out_add:
-        /* Finally, we can add clusters. */
+        /* Finally, we can add clusters. This might rotate the tree for us. */
-        status = ocfs2_do_insert_extent(osb, handle, inode, fe_bh,
+        status = ocfs2_do_insert_extent(inode, handle, fe_bh, &rec, &insert);
-                                        start_blk, new_clusters);
        if (status < 0)
                mlog_errno(status);
+        else
+                ocfs2_extent_map_insert_rec(inode, &rec);
 bail:
        if (bh)
@@ -1447,168 +2961,389 @@ int ocfs2_truncate_log_init(struct ocfs2_super *osb)
 * block will be deleted, and if it will, what the new last extent
 * block will be so we can update his h_next_leaf_blk field, as well
 * as the dinodes i_last_eb_blk */
-static int ocfs2_find_new_last_ext_blk(struct ocfs2_super *osb,
+static int ocfs2_find_new_last_ext_blk(struct inode *inode,
-                                       struct inode *inode,
+                                       unsigned int clusters_to_del,
-                                       struct ocfs2_dinode *fe,
+                                       struct ocfs2_path *path,
-                                       u32 new_i_clusters,
-                                       struct buffer_head *old_last_eb,
                                       struct buffer_head **new_last_eb)
 {
-        int i, status = 0;
+        int next_free, ret = 0;
-        u64 block = 0;
+        u32 cpos;
+        struct ocfs2_extent_rec *rec;
        struct ocfs2_extent_block *eb;
        struct ocfs2_extent_list *el;
        struct buffer_head *bh = NULL;
        *new_last_eb = NULL;
-        if (!OCFS2_IS_VALID_DINODE(fe)) {
-                OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
-                status = -EIO;
-                goto bail;
-        }
        /* we have no tree, so of course, no last_eb. */
-        if (!fe->id2.i_list.l_tree_depth)
+        if (!path->p_tree_depth)
-                goto bail;
+                goto out;
        /* trunc to zero special case - this makes tree_depth = 0
         * regardless of what it is.  */
-        if (!new_i_clusters)
+        if (OCFS2_I(inode)->ip_clusters == clusters_to_del)
-                goto bail;
+                goto out;
-        eb = (struct ocfs2_extent_block *) old_last_eb->b_data;
+        el = path_leaf_el(path);
-        el = &(eb->h_list);
        BUG_ON(!el->l_next_free_rec);
-        /* Make sure that this guy will actually be empty after we
+        /*
-         * clear away the data. */
+         * Make sure that this extent list will actually be empty
-        if (le32_to_cpu(el->l_recs[0].e_cpos) < new_i_clusters)
+         * after we clear away the data. We can shortcut out if
-                goto bail;
+         * there's more than one non-empty extent in the
+         * list. Otherwise, a check of the remaining extent is
+         * necessary.
+         */
+        next_free = le16_to_cpu(el->l_next_free_rec);
+        rec = NULL;
+        if (ocfs2_is_empty_extent(&el->l_recs[0])) {
+                if (next_free > 2)
+                        goto out;
-        /* Ok, at this point, we know that last_eb will definitely
+                /* We may have a valid extent in index 1, check it. */
-         * change, so lets traverse the tree and find the second to
+                if (next_free == 2)
-         * last extent block. */
+                        rec = &el->l_recs[1];
-        el = &(fe->id2.i_list);
-        /* go down the tree, */
+                /*
-        do {
+                 * Fall through - no more nonempty extents, so we want
-                for(i = (le16_to_cpu(el->l_next_free_rec) - 1); i >= 0; i--) {
+                 * to delete this leaf.
-                        if (le32_to_cpu(el->l_recs[i].e_cpos) <
+                 */
-                            new_i_clusters) {
+        } else {
-                                block = le64_to_cpu(el->l_recs[i].e_blkno);
+                if (next_free > 1)
-                                break;
+                        goto out;
-                        }
+                rec = &el->l_recs[0];
+        }
+        if (rec) {
+                /*
+                 * Check if we'll only be trimming off the end of this
+                 * cluster.
+                 */
+                if (le16_to_cpu(rec->e_leaf_clusters) > clusters_to_del)
+                        goto out;
+        }
+        ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos);
+        if (ret) {
+                mlog_errno(ret);
+                goto out;
+        }
+        ret = ocfs2_find_leaf(inode, path_root_el(path), cpos, &bh);
+        if (ret) {
+                mlog_errno(ret);
+                goto out;
+        }
+        eb = (struct ocfs2_extent_block *) bh->b_data;
+        el = &eb->h_list;
+        if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
+                OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
+                ret = -EROFS;
+                goto out;
+        }
+        *new_last_eb = bh;
+        get_bh(*new_last_eb);
+        mlog(0, "returning block %llu, (cpos: %u)\n",
+             (unsigned long long)le64_to_cpu(eb->h_blkno), cpos);
+out:
+        brelse(bh);
+        return ret;
+}
+/*
+ * Trim some clusters off the rightmost edge of a tree. Only called
+ * during truncate.
+ *
+ * The caller needs to:
+ *   - start journaling of each path component.
+ *   - compute and fully set up any new last ext block
+ */
+static int ocfs2_trim_tree(struct inode *inode, struct ocfs2_path *path,
+                           handle_t *handle, struct ocfs2_truncate_context *tc,
+                           u32 clusters_to_del, u64 *delete_start)
+{
+        int ret, i, index = path->p_tree_depth;
+        u32 new_edge = 0;
+        u64 deleted_eb = 0;
+        struct buffer_head *bh;
+        struct ocfs2_extent_list *el;
+        struct ocfs2_extent_rec *rec;
+        *delete_start = 0;
+        while (index >= 0) {
+                bh = path->p_node[index].bh;
+                el = path->p_node[index].el;
+                mlog(0, "traveling tree (index = %d, block = %llu)\n",
+                     index,  (unsigned long long)bh->b_blocknr);
+                BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
+                if (index !=
+                    (path->p_tree_depth - le16_to_cpu(el->l_tree_depth))) {
+                        ocfs2_error(inode->i_sb,
+                                    "Inode %lu has invalid ext. block %llu",
+                                    inode->i_ino,
+                                    (unsigned long long)bh->b_blocknr);
+                        ret = -EROFS;
+                        goto out;
                }
-                BUG_ON(i < 0);
-                if (bh) {
+find_tail_record:
-                        brelse(bh);
+                i = le16_to_cpu(el->l_next_free_rec) - 1;
-                        bh = NULL;
+                rec = &el->l_recs[i];
+                mlog(0, "Extent list before: record %d: (%u, %u, %llu), "
+                     "next = %u\n", i, le32_to_cpu(rec->e_cpos),
+                     ocfs2_rec_clusters(el, rec),
+                     (unsigned long long)le64_to_cpu(rec->e_blkno),
+                     le16_to_cpu(el->l_next_free_rec));
+                BUG_ON(ocfs2_rec_clusters(el, rec) < clusters_to_del);
+                if (le16_to_cpu(el->l_tree_depth) == 0) {
+                        /*
+                         * If the leaf block contains a single empty
+                         * extent and no records, we can just remove
+                         * the block.
+                         */
+                        if (i == 0 && ocfs2_is_empty_extent(rec)) {
+                                memset(rec, 0,
+                                       sizeof(struct ocfs2_extent_rec));
+                                el->l_next_free_rec = cpu_to_le16(0);
+                                goto delete;
+                        }
+                        /*
+                         * Remove any empty extents by shifting things
+                         * left. That should make life much easier on
+                         * the code below. This condition is rare
+                         * enough that we shouldn't see a performance
+                         * hit.
+                         */
+                        if (ocfs2_is_empty_extent(&el->l_recs[0])) {
+                                le16_add_cpu(&el->l_next_free_rec, -1);
+                                for(i = 0;
+                                    i < le16_to_cpu(el->l_next_free_rec); i++)
+                                        el->l_recs[i] = el->l_recs[i + 1];
+                                memset(&el->l_recs[i], 0,
+                                       sizeof(struct ocfs2_extent_rec));
+                                /*
+                                 * We've modified our extent list. The
+                                 * simplest way to handle this change
+                                 * is to begin the search from the
+                                 * start again.
+                                 */
+                                goto find_tail_record;
+                        }
+                        le16_add_cpu(&rec->e_leaf_clusters, -clusters_to_del);
+                        /*
+                         * We'll use "new_edge" on our way back up the
+                         * tree to know what our rightmost cpos is.
+                         */
+                        new_edge = le16_to_cpu(rec->e_leaf_clusters);
+                        new_edge += le32_to_cpu(rec->e_cpos);
+                        /*
+                         * The caller will use this to delete data blocks.
+                         */
+                        *delete_start = le64_to_cpu(rec->e_blkno)
+                                + ocfs2_clusters_to_blocks(inode->i_sb,
+                                        le16_to_cpu(rec->e_leaf_clusters));
+                        /*
+                         * If it's now empty, remove this record.
+                         */
+                        if (le16_to_cpu(rec->e_leaf_clusters) == 0) {
+                                memset(rec, 0,
+                                       sizeof(struct ocfs2_extent_rec));
+                                le16_add_cpu(&el->l_next_free_rec, -1);
+                        }
+                } else {
+                        if (le64_to_cpu(rec->e_blkno) == deleted_eb) {
+                                memset(rec, 0,
+                                       sizeof(struct ocfs2_extent_rec));
+                                le16_add_cpu(&el->l_next_free_rec, -1);
+                                goto delete;
+                        }
+                        /* Can this actually happen? */
+                        if (le16_to_cpu(el->l_next_free_rec) == 0)
+                                goto delete;
+                        /*
+                         * We never actually deleted any clusters
+                         * because our leaf was empty. There's no
+                         * reason to adjust the rightmost edge then.
+                         */
+                        if (new_edge == 0)
+                                goto delete;
+                        rec->e_int_clusters = cpu_to_le32(new_edge);
+                        le32_add_cpu(&rec->e_int_clusters,
+                                     -le32_to_cpu(rec->e_cpos));
+                         /*
+                          * A deleted child record should have been
+                          * caught above.
+                          */
+                         BUG_ON(le32_to_cpu(rec->e_int_clusters) == 0);
                }
-                status = ocfs2_read_block(osb, block, &bh, OCFS2_BH_CACHED,
+delete:
-                                         inode);
+                ret = ocfs2_journal_dirty(handle, bh);
-                if (status < 0) {
+                if (ret) {
-                        mlog_errno(status);
+                        mlog_errno(ret);
-                        goto bail;
+                        goto out;
                }
-                eb = (struct ocfs2_extent_block *) bh->b_data;
-                el = &eb->h_list;
+                mlog(0, "extent list container %llu, after: record %d: "
-                if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
+                     "(%u, %u, %llu), next = %u.\n",
-                        OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
+                     (unsigned long long)bh->b_blocknr, i,
-                        status = -EIO;
+                     le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec),
-                        goto bail;
+                     (unsigned long long)le64_to_cpu(rec->e_blkno),
+                     le16_to_cpu(el->l_next_free_rec));
+                /*
+                 * We must be careful to only attempt delete of an
+                 * extent block (and not the root inode block).
+                 */
+                if (index > 0 && le16_to_cpu(el->l_next_free_rec) == 0) {
+                        struct ocfs2_extent_block *eb =
+                                (struct ocfs2_extent_block *)bh->b_data;
+                        /*
+                         * Save this for use when processing the
+                         * parent block.
+                         */
+                        deleted_eb = le64_to_cpu(eb->h_blkno);
+                        mlog(0, "deleting this extent block.\n");
+                        ocfs2_remove_from_cache(inode, bh);
+                        BUG_ON(ocfs2_rec_clusters(el, &el->l_recs[0]));
+                        BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos));
+                        BUG_ON(le64_to_cpu(el->l_recs[0].e_blkno));
+                        if (le16_to_cpu(eb->h_suballoc_slot) == 0) {
+                                /*
+                                 * This code only understands how to
+                                 * lock the suballocator in slot 0,
+                                 * which is fine because allocation is
+                                 * only ever done out of that
+                                 * suballocator too. A future version
+                                 * might change that however, so avoid
+                                 * a free if we don't know how to
+                                 * handle it. This way an fs incompat
+                                 * bit will not be necessary.
+                                 */
+                                ret = ocfs2_free_extent_block(handle,
+                                                              tc->tc_ext_alloc_inode,
+                                                              tc->tc_ext_alloc_bh,
+                                                              eb);
+                                /* An error here is not fatal. */
+                                if (ret < 0)
+                                        mlog_errno(ret);
+                        }
+                } else {
+                        deleted_eb = 0;
                }
-        } while (el->l_tree_depth);
-        *new_last_eb = bh;
+                index--;
-        get_bh(*new_last_eb);
+        }
-        mlog(0, "returning block %llu\n",
-             (unsigned long long)le64_to_cpu(eb->h_blkno));
-bail:
-        if (bh)
-                brelse(bh);
-        return status;
+        ret = 0;
+out:
+        return ret;
 }
 static int ocfs2_do_truncate(struct ocfs2_super *osb,
                             unsigned int clusters_to_del,
                             struct inode *inode,
                             struct buffer_head *fe_bh,
-                             struct buffer_head *old_last_eb_bh,
                             handle_t *handle,
-                             struct ocfs2_truncate_context *tc)
+                             struct ocfs2_truncate_context *tc,
+                             struct ocfs2_path *path)
 {
-        int status, i, depth;
+        int status;
        struct ocfs2_dinode *fe;
-        struct ocfs2_extent_block *eb;
        struct ocfs2_extent_block *last_eb = NULL;
        struct ocfs2_extent_list *el;
-        struct buffer_head *eb_bh = NULL;
        struct buffer_head *last_eb_bh = NULL;
-        u64 next_eb = 0;
        u64 delete_blk = 0;
        fe = (struct ocfs2_dinode *) fe_bh->b_data;
-        status = ocfs2_find_new_last_ext_blk(osb,
+        status = ocfs2_find_new_last_ext_blk(inode, clusters_to_del,
-                                             inode,
+                                             path, &last_eb_bh);
-                                             fe,
-                                             le32_to_cpu(fe->i_clusters) -
-                                                        clusters_to_del,
-                                             old_last_eb_bh,
-                                             &last_eb_bh);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
        }
-        if (last_eb_bh)
-                last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
-        status = ocfs2_journal_access(handle, inode, fe_bh,
+        /*
-                                      OCFS2_JOURNAL_ACCESS_WRITE);
+         * Each component will be touched, so we might as well journal
+         * here to avoid having to handle errors later.
+         */
+        status = ocfs2_journal_access_path(inode, handle, path);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
        }
+        if (last_eb_bh) {
+                status = ocfs2_journal_access(handle, inode, last_eb_bh,
+                                              OCFS2_JOURNAL_ACCESS_WRITE);
+                if (status < 0) {
+                        mlog_errno(status);
+                        goto bail;
+                }
+                last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
+        }
        el = &(fe->id2.i_list);
+        /*
+         * Lower levels depend on this never happening, but it's best
+         * to check it up here before changing the tree.
+         */
+        if (el->l_tree_depth && el->l_recs[0].e_int_clusters == 0) {
+                ocfs2_error(inode->i_sb,
+                            "Inode %lu has an empty extent record, depth %u\n",
+                            inode->i_ino, le16_to_cpu(el->l_tree_depth));
+                status = -EROFS;
+                goto bail;
+        }
        spin_lock(&OCFS2_I(inode)->ip_lock);
        OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) -
                                      clusters_to_del;
        spin_unlock(&OCFS2_I(inode)->ip_lock);
        le32_add_cpu(&fe->i_clusters, -clusters_to_del);
-        fe->i_mtime = cpu_to_le64(CURRENT_TIME.tv_sec);
-        fe->i_mtime_nsec = cpu_to_le32(CURRENT_TIME.tv_nsec);
-        i = le16_to_cpu(el->l_next_free_rec) - 1;
-        BUG_ON(le32_to_cpu(el->l_recs[i].e_clusters) < clusters_to_del);
-        le32_add_cpu(&el->l_recs[i].e_clusters, -clusters_to_del);
-        /* tree depth zero, we can just delete the clusters, otherwise
-         * we need to record the offset of the next level extent block
-         * as we may overwrite it. */
-        if (!el->l_tree_depth)
-                delete_blk = le64_to_cpu(el->l_recs[i].e_blkno)
-                        + ocfs2_clusters_to_blocks(osb->sb,
-                                        le32_to_cpu(el->l_recs[i].e_clusters));
-        else
-                next_eb = le64_to_cpu(el->l_recs[i].e_blkno);
-        if (!el->l_recs[i].e_clusters) {
+        status = ocfs2_trim_tree(inode, path, handle, tc,
-                /* if we deleted the whole extent record, then clear
+                                 clusters_to_del, &delete_blk);
-                 * out the other fields and update the extent
+        if (status) {
-                 * list. For depth > 0 trees, we've already recorded
+                mlog_errno(status);
-                 * the extent block in 'next_eb' */
+                goto bail;
-                el->l_recs[i].e_cpos = 0;
-                el->l_recs[i].e_blkno = 0;
-                BUG_ON(!el->l_next_free_rec);
-                le16_add_cpu(&el->l_next_free_rec, -1);
        }
-        depth = le16_to_cpu(el->l_tree_depth);
+        if (le32_to_cpu(fe->i_clusters) == 0) {
-        if (!fe->i_clusters) {
                /* trunc to zero is a special case. */
                el->l_tree_depth = 0;
                fe->i_last_eb_blk = 0;
@@ -1625,12 +3360,6 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
                /* If there will be a new last extent block, then by
                 * definition, there cannot be any leaves to the right of
                 * him. */
-                status = ocfs2_journal_access(handle, inode, last_eb_bh,
-                                              OCFS2_JOURNAL_ACCESS_WRITE);
-                if (status < 0) {
-                        mlog_errno(status);
-                        goto bail;
-                }
                last_eb->h_next_leaf_blk = 0;
                status = ocfs2_journal_dirty(handle, last_eb_bh);
                if (status < 0) {
@@ -1639,123 +3368,247 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
                }
        }
-        /* if our tree depth > 0, update all the tree blocks below us. */
+        if (delete_blk) {
-        while (depth) {
+                status = ocfs2_truncate_log_append(osb, handle, delete_blk,
-                mlog(0, "traveling tree (depth = %d, next_eb = %llu)\n",
+                                                   clusters_to_del);
-                     depth,  (unsigned long long)next_eb);
-                status = ocfs2_read_block(osb, next_eb, &eb_bh,
-                                          OCFS2_BH_CACHED, inode);
                if (status < 0) {
                        mlog_errno(status);
                        goto bail;
                }
-                eb = (struct ocfs2_extent_block *)eb_bh->b_data;
+        }
-                if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
+        status = 0;
-                        OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
+bail:
-                        status = -EIO;
-                        goto bail;
+        mlog_exit(status);
+        return status;
+}
+static int ocfs2_writeback_zero_func(handle_t *handle, struct buffer_head *bh)
+{
+        set_buffer_uptodate(bh);
+        mark_buffer_dirty(bh);
+        return 0;
+}
+static int ocfs2_ordered_zero_func(handle_t *handle, struct buffer_head *bh)
+{
+        set_buffer_uptodate(bh);
+        mark_buffer_dirty(bh);
+        return ocfs2_journal_dirty_data(handle, bh);
+}
+static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t isize,
+                                     struct page **pages, int numpages,
+                                     u64 phys, handle_t *handle)
+{
+        int i, ret, partial = 0;
+        void *kaddr;
+        struct page *page;
+        unsigned int from, to = PAGE_CACHE_SIZE;
+        struct super_block *sb = inode->i_sb;
+        BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb)));
+        if (numpages == 0)
+                goto out;
+        from = isize & (PAGE_CACHE_SIZE - 1); /* 1st page offset */
+        if (PAGE_CACHE_SHIFT > OCFS2_SB(sb)->s_clustersize_bits) {
+                /*
+                 * Since 'from' has been capped to a value below page
+                 * size, this calculation won't be able to overflow
+                 * 'to'
+                 */
+                to = ocfs2_align_bytes_to_clusters(sb, from);
+                /*
+                 * The truncate tail in this case should never contain
+                 * more than one page at maximum. The loop below also
+                 * assumes this.
+                 */
+                BUG_ON(numpages != 1);
+        }
+        for(i = 0; i < numpages; i++) {
+                page = pages[i];
+                BUG_ON(from > PAGE_CACHE_SIZE);
+                BUG_ON(to > PAGE_CACHE_SIZE);
+                ret = ocfs2_map_page_blocks(page, &phys, inode, from, to, 0);
+                if (ret)
+                        mlog_errno(ret);
+                kaddr = kmap_atomic(page, KM_USER0);
+                memset(kaddr + from, 0, to - from);
+                kunmap_atomic(kaddr, KM_USER0);
+                /*
+                 * Need to set the buffers we zero'd into uptodate
+                 * here if they aren't - ocfs2_map_page_blocks()
+                 * might've skipped some
+                 */
+                if (ocfs2_should_order_data(inode)) {
+                        ret = walk_page_buffers(handle,
+                                                page_buffers(page),
+                                                from, to, &partial,
+                                                ocfs2_ordered_zero_func);
+                        if (ret < 0)
+                                mlog_errno(ret);
+                } else {
+                        ret = walk_page_buffers(handle, page_buffers(page),
+                                                from, to, &partial,
+                                                ocfs2_writeback_zero_func);
+                        if (ret < 0)
+                                mlog_errno(ret);
                }
-                el = &(eb->h_list);
-                status = ocfs2_journal_access(handle, inode, eb_bh,
+                if (!partial)
-                                              OCFS2_JOURNAL_ACCESS_WRITE);
+                        SetPageUptodate(page);
-                if (status < 0) {
-                        mlog_errno(status);
+                flush_dcache_page(page);
-                        goto bail;
+                /*
+                 * Every page after the 1st one should be completely zero'd.
+                 */
+                from = 0;
+        }
+out:
+        if (pages) {
+                for (i = 0; i < numpages; i++) {
+                        page = pages[i];
+                        unlock_page(page);
+                        mark_page_accessed(page);
+                        page_cache_release(page);
                }
+        }
+}
-                BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
+static int ocfs2_grab_eof_pages(struct inode *inode, loff_t isize, struct page **pages,
-                BUG_ON(depth != (le16_to_cpu(el->l_tree_depth) + 1));
+                                int *num, u64 *phys)
+{
+        int i, numpages = 0, ret = 0;
+        unsigned int csize = OCFS2_SB(inode->i_sb)->s_clustersize;
+        unsigned int ext_flags;
+        struct super_block *sb = inode->i_sb;
+        struct address_space *mapping = inode->i_mapping;
+        unsigned long index;
+        u64 next_cluster_bytes;
+        BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb)));
+        /* Cluster boundary, so we don't need to grab any pages. */
+        if ((isize & (csize - 1)) == 0)
+                goto out;
-                i = le16_to_cpu(el->l_next_free_rec) - 1;
+        ret = ocfs2_extent_map_get_blocks(inode, isize >> sb->s_blocksize_bits,
+                                          phys, NULL, &ext_flags);
+        if (ret) {
+                mlog_errno(ret);
+                goto out;
+        }
-                mlog(0, "extent block %llu, before: record %d: "
+        /* Tail is a hole. */
-                     "(%u, %u, %llu), next = %u\n",
+        if (*phys == 0)
-                     (unsigned long long)le64_to_cpu(eb->h_blkno), i,
+                goto out;
-                     le32_to_cpu(el->l_recs[i].e_cpos),
-                     le32_to_cpu(el->l_recs[i].e_clusters),
-                     (unsigned long long)le64_to_cpu(el->l_recs[i].e_blkno),
-                     le16_to_cpu(el->l_next_free_rec));
-                BUG_ON(le32_to_cpu(el->l_recs[i].e_clusters) < clusters_to_del);
+        /* Tail is marked as unwritten, we can count on write to zero
-                le32_add_cpu(&el->l_recs[i].e_clusters, -clusters_to_del);
+         * in that case. */
+        if (ext_flags & OCFS2_EXT_UNWRITTEN)
-                next_eb = le64_to_cpu(el->l_recs[i].e_blkno);
+                goto out;
-                /* bottom-most block requires us to delete data.*/
-                if (!el->l_tree_depth)
-                        delete_blk = le64_to_cpu(el->l_recs[i].e_blkno)
-                                + ocfs2_clusters_to_blocks(osb->sb,
-                                        le32_to_cpu(el->l_recs[i].e_clusters));
-                if (!el->l_recs[i].e_clusters) {
-                        el->l_recs[i].e_cpos = 0;
-                        el->l_recs[i].e_blkno = 0;
-                        BUG_ON(!el->l_next_free_rec);
-                        le16_add_cpu(&el->l_next_free_rec, -1);
-                }
-                mlog(0, "extent block %llu, after: record %d: "
-                     "(%u, %u, %llu), next = %u\n",
-                     (unsigned long long)le64_to_cpu(eb->h_blkno), i,
-                     le32_to_cpu(el->l_recs[i].e_cpos),
-                     le32_to_cpu(el->l_recs[i].e_clusters),
-                     (unsigned long long)le64_to_cpu(el->l_recs[i].e_blkno),
-                     le16_to_cpu(el->l_next_free_rec));
-                status = ocfs2_journal_dirty(handle, eb_bh);
+        next_cluster_bytes = ocfs2_align_bytes_to_clusters(inode->i_sb, isize);
-                if (status < 0) {
+        index = isize >> PAGE_CACHE_SHIFT;
-                        mlog_errno(status);
+        do {
-                        goto bail;
+                pages[numpages] = grab_cache_page(mapping, index);
+                if (!pages[numpages]) {
+                        ret = -ENOMEM;
+                        mlog_errno(ret);
+                        goto out;
                }
-                if (!el->l_next_free_rec) {
+                numpages++;
-                        mlog(0, "deleting this extent block.\n");
+                index++;
+        } while (index < (next_cluster_bytes >> PAGE_CACHE_SHIFT));
-                        ocfs2_remove_from_cache(inode, eb_bh);
-                        BUG_ON(el->l_recs[0].e_clusters);
+out:
-                        BUG_ON(el->l_recs[0].e_cpos);
+        if (ret != 0) {
-                        BUG_ON(el->l_recs[0].e_blkno);
+                if (pages) {
-                        if (eb->h_suballoc_slot == 0) {
+                        for (i = 0; i < numpages; i++) {
-                                /*
+                                if (pages[i]) {
-                                 * This code only understands how to
+                                        unlock_page(pages[i]);
-                                 * lock the suballocator in slot 0,
+                                        page_cache_release(pages[i]);
-                                 * which is fine because allocation is
-                                 * only ever done out of that
-                                 * suballocator too. A future version
-                                 * might change that however, so avoid
-                                 * a free if we don't know how to
-                                 * handle it. This way an fs incompat
-                                 * bit will not be necessary.
-                                 */
-                                status = ocfs2_free_extent_block(handle,
-                                                                 tc->tc_ext_alloc_inode,
-                                                                 tc->tc_ext_alloc_bh,
-                                                                 eb);
-                                if (status < 0) {
-                                        mlog_errno(status);
-                                        goto bail;
                                }
                        }
                }
-                brelse(eb_bh);
+                numpages = 0;
-                eb_bh = NULL;
-                depth--;
        }
-        BUG_ON(!delete_blk);
+        *num = numpages;
-        status = ocfs2_truncate_log_append(osb, handle, delete_blk,
-                                           clusters_to_del);
+        return ret;
-        if (status < 0) {
+}
-                mlog_errno(status);
-                goto bail;
+/*
+ * Zero the area past i_size but still within an allocated
+ * cluster. This avoids exposing nonzero data on subsequent file
+ * extends.
+ *
+ * We need to call this before i_size is updated on the inode because
+ * otherwise block_write_full_page() will skip writeout of pages past
+ * i_size. The new_i_size parameter is passed for this reason.
+ */
+int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle,
+                                 u64 new_i_size)
+{
+        int ret, numpages;
+        loff_t endbyte;
+        struct page **pages = NULL;
+        u64 phys;
+        /*
+         * File systems which don't support sparse files zero on every
+         * extend.
+         */
+        if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
+                return 0;
+        pages = kcalloc(ocfs2_pages_per_cluster(inode->i_sb),
+                        sizeof(struct page *), GFP_NOFS);
+        if (pages == NULL) {
+                ret = -ENOMEM;
+                mlog_errno(ret);
+                goto out;
        }
-        status = 0;
-bail:
+        ret = ocfs2_grab_eof_pages(inode, new_i_size, pages, &numpages, &phys);
-        if (!status)
+        if (ret) {
-                ocfs2_extent_map_trunc(inode, le32_to_cpu(fe->i_clusters));
+                mlog_errno(ret);
-        else
+                goto out;
-                ocfs2_extent_map_drop(inode, 0);
+        }
-        mlog_exit(status);
-        return status;
+        if (numpages == 0)
+                goto out;
+        ocfs2_zero_cluster_pages(inode, new_i_size, pages, numpages, phys,
+                                 handle);
+        /*
+         * Initiate writeout of the pages we zero'd here. We don't
+         * wait on them - the truncate_inode_pages() call later will
+         * do that for us.
+         */
+        endbyte = ocfs2_align_bytes_to_clusters(inode->i_sb, new_i_size);
+        ret = do_sync_mapping_range(inode->i_mapping, new_i_size,
+                                    endbyte - 1, SYNC_FILE_RANGE_WRITE);
+        if (ret)
+                mlog_errno(ret);
+out:
+        if (pages)
+                kfree(pages);
+        return ret;
 }
 /*
@@ -1770,82 +3623,90 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
                          struct ocfs2_truncate_context *tc)
 {
        int status, i, credits, tl_sem = 0;
-        u32 clusters_to_del, target_i_clusters;
+        u32 clusters_to_del, new_highest_cpos, range;
-        u64 last_eb = 0;
-        struct ocfs2_dinode *fe;
-        struct ocfs2_extent_block *eb;
        struct ocfs2_extent_list *el;
-        struct buffer_head *last_eb_bh;
        handle_t *handle = NULL;
        struct inode *tl_inode = osb->osb_tl_inode;
+        struct ocfs2_path *path = NULL;
        mlog_entry_void();
        down_write(&OCFS2_I(inode)->ip_alloc_sem);
-        target_i_clusters = ocfs2_clusters_for_bytes(osb->sb,
+        new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb,
                                                     i_size_read(inode));
-        last_eb_bh = tc->tc_last_eb_bh;
+        path = ocfs2_new_inode_path(fe_bh);
-        tc->tc_last_eb_bh = NULL;
+        if (!path) {
+                status = -ENOMEM;
+                mlog_errno(status);
+                goto bail;
+        }
-        fe = (struct ocfs2_dinode *) fe_bh->b_data;
+        ocfs2_extent_map_trunc(inode, new_highest_cpos);
-        if (fe->id2.i_list.l_tree_depth) {
-                eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
-                el = &eb->h_list;
-        } else
-                el = &fe->id2.i_list;
-        last_eb = le64_to_cpu(fe->i_last_eb_blk);
 start:
-        mlog(0, "ocfs2_commit_truncate: fe->i_clusters = %u, "
+        /*
-             "last_eb = %llu, fe->i_last_eb_blk = %llu, "
+         * Check that we still have allocation to delete.
-             "fe->id2.i_list.l_tree_depth = %u last_eb_bh = %p\n",
+         */
-             le32_to_cpu(fe->i_clusters), (unsigned long long)last_eb,
+        if (OCFS2_I(inode)->ip_clusters == 0) {
-             (unsigned long long)le64_to_cpu(fe->i_last_eb_blk),
+                status = 0;
-             le16_to_cpu(fe->id2.i_list.l_tree_depth), last_eb_bh);
+                goto bail;
+        }
-        if (last_eb != le64_to_cpu(fe->i_last_eb_blk)) {
-                mlog(0, "last_eb changed!\n");
-                BUG_ON(!fe->id2.i_list.l_tree_depth);
-                last_eb = le64_to_cpu(fe->i_last_eb_blk);
-                /* i_last_eb_blk may have changed, read it if
-                 * necessary. We don't have to worry about the
-                 * truncate to zero case here (where there becomes no
-                 * last_eb) because we never loop back after our work
-                 * is done. */
-                if (last_eb_bh) {
-                        brelse(last_eb_bh);
-                        last_eb_bh = NULL;
-                }
-                status = ocfs2_read_block(osb, last_eb,
+        /*
-                                          &last_eb_bh, OCFS2_BH_CACHED,
+         * Truncate always works against the rightmost tree branch.
-                                          inode);
+         */
-                if (status < 0) {
+        status = ocfs2_find_path(inode, path, UINT_MAX);
-                        mlog_errno(status);
+        if (status) {
-                        goto bail;
+                mlog_errno(status);
-                }
+                goto bail;
-                eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
+        }
-                if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
-                        OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
+        mlog(0, "inode->ip_clusters = %u, tree_depth = %u\n",
-                        status = -EIO;
+             OCFS2_I(inode)->ip_clusters, path->p_tree_depth);
-                        goto bail;
-                }
+        /*
-                el = &(eb->h_list);
+         * By now, el will point to the extent list on the bottom most
+         * portion of this tree. Only the tail record is considered in
+         * each pass.
+         *
+         * We handle the following cases, in order:
+         * - empty extent: delete the remaining branch
+         * - remove the entire record
+         * - remove a partial record
+         * - no record needs to be removed (truncate has completed)
+         */
+        el = path_leaf_el(path);
+        if (le16_to_cpu(el->l_next_free_rec) == 0) {
+                ocfs2_error(inode->i_sb,
+                            "Inode %llu has empty extent block at %llu\n",
+                            (unsigned long long)OCFS2_I(inode)->ip_blkno,
+                            (unsigned long long)path_leaf_bh(path)->b_blocknr);
+                status = -EROFS;
+                goto bail;
        }
-        /* by now, el will point to the extent list on the bottom most
-         * portion of this tree. */
        i = le16_to_cpu(el->l_next_free_rec) - 1;
-        if (le32_to_cpu(el->l_recs[i].e_cpos) >= target_i_clusters)
+        range = le32_to_cpu(el->l_recs[i].e_cpos) +
-                clusters_to_del = le32_to_cpu(el->l_recs[i].e_clusters);
+                ocfs2_rec_clusters(el, &el->l_recs[i]);
-        else
+        if (i == 0 && ocfs2_is_empty_extent(&el->l_recs[i])) {
-                clusters_to_del = (le32_to_cpu(el->l_recs[i].e_clusters) +
+                clusters_to_del = 0;
+        } else if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_highest_cpos) {
+                clusters_to_del = ocfs2_rec_clusters(el, &el->l_recs[i]);
+        } else if (range > new_highest_cpos) {
+                clusters_to_del = (ocfs2_rec_clusters(el, &el->l_recs[i]) +
                                   le32_to_cpu(el->l_recs[i].e_cpos)) -
-                                  target_i_clusters;
+                                  new_highest_cpos;
+        } else {
+                status = 0;
+                goto bail;
+        }
-        mlog(0, "clusters_to_del = %u in this pass\n", clusters_to_del);
+        mlog(0, "clusters_to_del = %u in this pass, tail blk=%llu\n",
+             clusters_to_del, (unsigned long long)path_leaf_bh(path)->b_blocknr);
+        BUG_ON(clusters_to_del == 0);
        mutex_lock(&tl_inode->i_mutex);
        tl_sem = 1;
@@ -1861,7 +3722,8 @@ start:
        }
        credits = ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del,
-                                                fe, el);
+                                                (struct ocfs2_dinode *)fe_bh->b_data,
+                                                el);
        handle = ocfs2_start_trans(osb, credits);
        if (IS_ERR(handle)) {
                status = PTR_ERR(handle);
@@ -1870,13 +3732,8 @@ start:
                goto bail;
        }
-        inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+        status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh, handle,
-        status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
+                                   tc, path);
-        if (status < 0)
-                mlog_errno(status);
-        status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh,
-                                   last_eb_bh, handle, tc);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
@@ -1888,9 +3745,14 @@ start:
        ocfs2_commit_trans(osb, handle);
        handle = NULL;
-        BUG_ON(le32_to_cpu(fe->i_clusters) < target_i_clusters);
+        ocfs2_reinit_path(path, 1);
-        if (le32_to_cpu(fe->i_clusters) > target_i_clusters)
-                goto start;
+        /*
+         * The check above will catch the case where we've truncated
+         * away all allocation.
+         */
+        goto start;
 bail:
        up_write(&OCFS2_I(inode)->ip_alloc_sem);
@@ -1902,8 +3764,7 @@ bail:
        if (handle)
                ocfs2_commit_trans(osb, handle);
-        if (last_eb_bh)
+        ocfs2_free_path(path);
-                brelse(last_eb_bh);
        /* This will drop the ext_alloc cluster lock for us */
        ocfs2_free_truncate_context(tc);
@@ -1912,7 +3773,6 @@ bail:
        return status;
 }
 /*
 * Expects the inode to already be locked. This will figure out which
 * inodes need to be locked and will put them on the returned truncate
@@ -1923,7 +3783,7 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb,
                           struct buffer_head *fe_bh,
                           struct ocfs2_truncate_context **tc)
 {
-        int status, metadata_delete;
+        int status, metadata_delete, i;
        unsigned int new_i_clusters;
        struct ocfs2_dinode *fe;
        struct ocfs2_extent_block *eb;
@@ -1944,21 +3804,6 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb,
             "%llu\n", fe->i_clusters, new_i_clusters,
             (unsigned long long)fe->i_size);
-        if (le32_to_cpu(fe->i_clusters) <= new_i_clusters) {
-                ocfs2_error(inode->i_sb, "Dinode %llu has cluster count "
-                            "%u and size %llu whereas struct inode has "
-                            "cluster count %u and size %llu which caused an "
-                            "invalid truncate to %u clusters.",
-                            (unsigned long long)le64_to_cpu(fe->i_blkno),
-                            le32_to_cpu(fe->i_clusters),
-                            (unsigned long long)le64_to_cpu(fe->i_size),
-                            OCFS2_I(inode)->ip_clusters, i_size_read(inode),
-                            new_i_clusters);
-                mlog_meta_lvb(ML_ERROR, &OCFS2_I(inode)->ip_meta_lockres);
-                status = -EIO;
-                goto bail;
-        }
        *tc = kzalloc(sizeof(struct ocfs2_truncate_context), GFP_KERNEL);
        if (!(*tc)) {
                status = -ENOMEM;
@@ -1986,7 +3831,15 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb,
                        goto bail;
                }
                el = &(eb->h_list);
-                if (le32_to_cpu(el->l_recs[0].e_cpos) >= new_i_clusters)
+                i = 0;
+                if (ocfs2_is_empty_extent(&el->l_recs[0]))
+                        i = 1;
+                /*
+                 * XXX: Should we check that next_free_rec contains
+                 * the extent?
+                 */
+                if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_i_clusters)
                        metadata_delete = 1;
        }
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 0b82e8044325..fbcb5934a081 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -31,7 +31,8 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
                        handle_t *handle,
                        struct inode *inode,
                        struct buffer_head *fe_bh,
-                        u64 blkno,
+                        u32 cpos,
+                        u64 start_blk,
                        u32 new_clusters,
                        struct ocfs2_alloc_context *meta_ac);
 int ocfs2_num_free_extents(struct ocfs2_super *osb,
@@ -70,6 +71,8 @@ struct ocfs2_truncate_context {
        struct buffer_head *tc_last_eb_bh;
 };
+int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle,
+                                 u64 new_i_size);
 int ocfs2_prepare_truncate(struct ocfs2_super *osb,
                           struct inode *inode,
                           struct buffer_head *fe_bh,
@@ -79,4 +82,26 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
                          struct buffer_head *fe_bh,
                          struct ocfs2_truncate_context *tc);
+int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el,
+                    u32 cpos, struct buffer_head **leaf_bh);
+/*
+ * Helper function to look at the # of clusters in an extent record.
+ *
+ * Returns the cluster count of 'rec' in CPU byte order, read from the
+ * 32-bit e_int_clusters field when el->l_tree_depth is non-zero and
+ * from the 16-bit e_leaf_clusters field when it is zero.
+ *
+ * NOTE(review): 'rec' is presumably one of el->l_recs[] (the callers
+ * visible in this patch pass &el->l_recs[i]) - confirm for other callers.
+ */
+static inline unsigned int ocfs2_rec_clusters(struct ocfs2_extent_list *el,
+                                              struct ocfs2_extent_rec *rec)
+{
+        /*
+         * Cluster count in extent records is slightly different
+         * between interior nodes and leaf nodes. This is to support
+         * unwritten extents which need a flags field in leaf node
+         * records, thus shrinking the available space for a clusters
+         * field.
+         */
+        if (el->l_tree_depth)
+                return le32_to_cpu(rec->e_int_clusters);
+        else
+                return le16_to_cpu(rec->e_leaf_clusters);
+}
 #endif /* OCFS2_ALLOC_H */
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 93628b02ef5d..56963e6c46c0 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -24,6 +24,8 @@
 #include <linux/highmem.h>
 #include <linux/pagemap.h>
 #include <asm/byteorder.h>
+#include <linux/swap.h>
+#include <linux/pipe_fs_i.h>
 #define MLOG_MASK_PREFIX ML_FILE_IO
 #include <cluster/masklog.h>
@@ -37,6 +39,7 @@
 #include "file.h"
 #include "inode.h"
 #include "journal.h"
+#include "suballoc.h"
 #include "super.h"
 #include "symlink.h"
@@ -134,7 +137,9 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock,
                           struct buffer_head *bh_result, int create)
 {
        int err = 0;
+        unsigned int ext_flags;
        u64 p_blkno, past_eof;
+        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
        mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode,
                   (unsigned long long)iblock, bh_result, create);
@@ -149,17 +154,8 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock,
                goto bail;
        }
-        /* this can happen if another node truncs after our extend! */
+        err = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, NULL,
-        spin_lock(&OCFS2_I(inode)->ip_lock);
+                                          &ext_flags);
-        if (iblock >= ocfs2_clusters_to_blocks(inode->i_sb,
-                                               OCFS2_I(inode)->ip_clusters))
-                err = -EIO;
-        spin_unlock(&OCFS2_I(inode)->ip_lock);
-        if (err)
-                goto bail;
-        err = ocfs2_extent_map_get_blocks(inode, iblock, 1, &p_blkno,
-                                          NULL);
        if (err) {
                mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, "
                     "%llu, NULL)\n", err, inode, (unsigned long long)iblock,
@@ -167,22 +163,39 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock,
                goto bail;
        }
-        map_bh(bh_result, inode->i_sb, p_blkno);
+        /*
+         * ocfs2 never allocates in this function - the only time we
-        if (bh_result->b_blocknr == 0) {
+         * need to use BH_New is when we're extending i_size on a file
-                err = -EIO;
+         * system which doesn't support holes, in which case BH_New
-                mlog(ML_ERROR, "iblock = %llu p_blkno = %llu blkno=(%llu)\n",
+         * allows block_prepare_write() to zero.
-                     (unsigned long long)iblock,
+         */
-                     (unsigned long long)p_blkno,
+        mlog_bug_on_msg(create && p_blkno == 0 && ocfs2_sparse_alloc(osb),
-                     (unsigned long long)OCFS2_I(inode)->ip_blkno);
+                        "ino %lu, iblock %llu\n", inode->i_ino,
-        }
+                        (unsigned long long)iblock);
+        /* Treat the unwritten extent as a hole for zeroing purposes. */
+        if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))
+                map_bh(bh_result, inode->i_sb, p_blkno);
+        if (!ocfs2_sparse_alloc(osb)) {
+                if (p_blkno == 0) {
+                        err = -EIO;
+                        mlog(ML_ERROR,
+                             "iblock = %llu p_blkno = %llu blkno=(%llu)\n",
+                             (unsigned long long)iblock,
+                             (unsigned long long)p_blkno,
+                             (unsigned long long)OCFS2_I(inode)->ip_blkno);
+                        mlog(ML_ERROR, "Size %llu, clusters %u\n", (unsigned long long)i_size_read(inode), OCFS2_I(inode)->ip_clusters);
+                        dump_stack();
+                }
-        past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
+                past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
-        mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino,
+                mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino,
-             (unsigned long long)past_eof);
+                     (unsigned long long)past_eof);
-        if (create && (iblock >= past_eof))
+                if (create && (iblock >= past_eof))
-                set_buffer_new(bh_result);
+                        set_buffer_new(bh_result);
+        }
 bail:
        if (err < 0)
@@ -276,8 +289,11 @@ static int ocfs2_writepage(struct page *page, struct writeback_control *wbc)
        return ret;
 }
-/* This can also be called from ocfs2_write_zero_page() which has done
+/*
- * it's own cluster locking. */
+ * This is called from ocfs2_write_zero_page() which has handled its
+ * own cluster locking and has ensured allocation exists for those
+ * blocks to be written.
+ */
 int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page,
                               unsigned from, unsigned to)
 {
@@ -292,44 +308,17 @@ int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page,
        return ret;
 }
-/*
- * ocfs2_prepare_write() can be an outer-most ocfs2 call when it is called
- * from loopback.  It must be able to perform its own locking around
- * ocfs2_get_block().
- */
-static int ocfs2_prepare_write(struct file *file, struct page *page,
-                               unsigned from, unsigned to)
-{
-        struct inode *inode = page->mapping->host;
-        int ret;
-        mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to);
-        ret = ocfs2_meta_lock_with_page(inode, NULL, 0, page);
-        if (ret != 0) {
-                mlog_errno(ret);
-                goto out;
-        }
-        ret = ocfs2_prepare_write_nolock(inode, page, from, to);
-        ocfs2_meta_unlock(inode, 0);
-out:
-        mlog_exit(ret);
-        return ret;
-}
 /* Taken from ext3. We don't necessarily need the full blown
 * functionality yet, but IMHO it's better to cut and paste the whole
 * thing so we can avoid introducing our own bugs (and easily pick up
 * their fixes when they happen) --Mark */
-static int walk_page_buffers(   handle_t *handle,
+int walk_page_buffers(  handle_t *handle,
-                                struct buffer_head *head,
+                        struct buffer_head *head,
-                                unsigned from,
+                        unsigned from,
-                                unsigned to,
+                        unsigned to,
-                                int *partial,
+                        int *partial,
-                                int (*fn)(      handle_t *handle,
+                        int (*fn)(      handle_t *handle,
-                                                struct buffer_head *bh))
+                                        struct buffer_head *bh))
 {
        struct buffer_head *bh;
        unsigned block_start, block_end;
@@ -388,95 +377,6 @@ out:
        return handle;
 }
-static int ocfs2_commit_write(struct file *file, struct page *page,
-                              unsigned from, unsigned to)
-{
-        int ret;
-        struct buffer_head *di_bh = NULL;
-        struct inode *inode = page->mapping->host;
-        handle_t *handle = NULL;
-        struct ocfs2_dinode *di;
-        mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to);
-        /* NOTE: ocfs2_file_aio_write has ensured that it's safe for
-         * us to continue here without rechecking the I/O against
-         * changed inode values.
-         *
-         * 1) We're currently holding the inode alloc lock, so no
-         *    nodes can change it underneath us.
-         *
-         * 2) We've had to take the metadata lock at least once
-         *    already to check for extending writes, suid removal, etc.
-         *    The meta data update code then ensures that we don't get a
-         *    stale inode allocation image (i_size, i_clusters, etc).
-         */
-        ret = ocfs2_meta_lock_with_page(inode, &di_bh, 1, page);
-        if (ret != 0) {
-                mlog_errno(ret);
-                goto out;
-        }
-        ret = ocfs2_data_lock_with_page(inode, 1, page);
-        if (ret != 0) {
-                mlog_errno(ret);
-                goto out_unlock_meta;
-        }
-        handle = ocfs2_start_walk_page_trans(inode, page, from, to);
-        if (IS_ERR(handle)) {
-                ret = PTR_ERR(handle);
-                goto out_unlock_data;
-        }
-        /* Mark our buffer early. We'd rather catch this error up here
-         * as opposed to after a successful commit_write which would
-         * require us to set back inode->i_size. */
-        ret = ocfs2_journal_access(handle, inode, di_bh,
-                                   OCFS2_JOURNAL_ACCESS_WRITE);
-        if (ret < 0) {
-                mlog_errno(ret);
-                goto out_commit;
-        }
-        /* might update i_size */
-        ret = generic_commit_write(file, page, from, to);
-        if (ret < 0) {
-                mlog_errno(ret);
-                goto out_commit;
-        }
-        di = (struct ocfs2_dinode *)di_bh->b_data;
-        /* ocfs2_mark_inode_dirty() is too heavy to use here. */
-        inode->i_mtime = inode->i_ctime = CURRENT_TIME;
-        di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
-        di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
-        inode->i_blocks = ocfs2_align_bytes_to_sectors((u64)(i_size_read(inode)));
-        di->i_size = cpu_to_le64((u64)i_size_read(inode));
-        ret = ocfs2_journal_dirty(handle, di_bh);
-        if (ret < 0) {
-                mlog_errno(ret);
-                goto out_commit;
-        }
-out_commit:
-        ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
-out_unlock_data:
-        ocfs2_data_unlock(inode, 1);
-out_unlock_meta:
-        ocfs2_meta_unlock(inode, 1);
-out:
-        if (di_bh)
-                brelse(di_bh);
-        mlog_exit(ret);
-        return ret;
-}
 static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
 {
        sector_t status;
@@ -499,8 +399,7 @@ static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
                down_read(&OCFS2_I(inode)->ip_alloc_sem);
        }
-        err = ocfs2_extent_map_get_blocks(inode, block, 1, &p_blkno,
+        err = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL, NULL);
-                                          NULL);
        if (!INODE_JOURNAL(inode)) {
                up_read(&OCFS2_I(inode)->ip_alloc_sem);
@@ -540,8 +439,8 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
                                     struct buffer_head *bh_result, int create)
 {
        int ret;
-        u64 p_blkno, inode_blocks;
+        u64 p_blkno, inode_blocks, contig_blocks;
-        int contig_blocks;
+        unsigned int ext_flags;
        unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
        unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits;
@@ -549,33 +448,20 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
         * nicely aligned and of the right size, so there's no need
         * for us to check any of that. */
-        spin_lock(&OCFS2_I(inode)->ip_lock);
+        inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
-        inode_blocks = ocfs2_clusters_to_blocks(inode->i_sb,
-                                                OCFS2_I(inode)->ip_clusters);
-        /*
-         * For a read which begins past the end of file, we return a hole.
-         */
-        if (!create && (iblock >= inode_blocks)) {
-                spin_unlock(&OCFS2_I(inode)->ip_lock);
-                ret = 0;
-                goto bail;
-        }
        /*
         * Any write past EOF is not allowed because we'd be extending.
         */
        if (create && (iblock + max_blocks) > inode_blocks) {
-                spin_unlock(&OCFS2_I(inode)->ip_lock);
                ret = -EIO;
                goto bail;
        }
-        spin_unlock(&OCFS2_I(inode)->ip_lock);
        /* This figures out the size of the next contiguous block, and
         * our logical offset */
-        ret = ocfs2_extent_map_get_blocks(inode, iblock, 1, &p_blkno,
+        ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
-                                          &contig_blocks);
+                                          &contig_blocks, &ext_flags);
        if (ret) {
                mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
                     (unsigned long long)iblock);
@@ -583,7 +469,37 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
                goto bail;
        }
-        map_bh(bh_result, inode->i_sb, p_blkno);
+        if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)) && !p_blkno) {
+                ocfs2_error(inode->i_sb,
+                            "Inode %llu has a hole at block %llu\n",
+                            (unsigned long long)OCFS2_I(inode)->ip_blkno,
+                            (unsigned long long)iblock);
+                ret = -EROFS;
+                goto bail;
+        }
+        /*
+         * get_more_blocks() expects us to describe a hole by clearing
+         * the mapped bit on bh_result.
+         *
+         * Consider an unwritten extent as a hole.
+         */
+        if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))
+                map_bh(bh_result, inode->i_sb, p_blkno);
+        else {
+                /*
+                 * ocfs2_prepare_inode_for_write() should have caught
+                 * the case where we'd be filling a hole and triggered
+                 * a buffered write instead.
+                 */
+                if (create) {
+                        ret = -EIO;
+                        mlog_errno(ret);
+                        goto bail;
+                }
+                clear_buffer_mapped(bh_result);
+        }
        /* make sure we don't map more than max_blocks blocks here as
           that's all the kernel will handle at this point. */
@@ -606,12 +522,38 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
                             void *private)
 {
        struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
+        int level;
        /* this io's submitter should not have unlocked this before we could */
        BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
        ocfs2_iocb_clear_rw_locked(iocb);
-        up_read(&inode->i_alloc_sem);
-        ocfs2_rw_unlock(inode, 0);
+        level = ocfs2_iocb_rw_locked_level(iocb);
+        if (!level)
+                up_read(&inode->i_alloc_sem);
+        ocfs2_rw_unlock(inode, level);
+}
+/*
+ * ocfs2_invalidatepage() and ocfs2_releasepage() are shamelessly stolen
+ * from ext3.  PageChecked() bits have been removed as OCFS2 does not
+ * do journalled data.
+ *
+ * ocfs2_invalidatepage() fetches the journal_t reached through the
+ * page's mapping->host->i_sb and hands the page and offset unchanged
+ * to journal_invalidatepage().
+ */
+static void ocfs2_invalidatepage(struct page *page, unsigned long offset)
+{
+        journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal;
+        journal_invalidatepage(journal, page, offset);
+}
+static int ocfs2_releasepage(struct page *page, gfp_t wait)
+{
+        journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal; /* journal_t reached via the page's host inode superblock */
+        if (!page_has_buffers(page))
+                return 0; /* no buffers on the page: return without calling into the journal */
+        return journal_try_to_free_buffers(journal, page, wait); /* journal layer's result is returned as-is */
 }
 static ssize_t ocfs2_direct_IO(int rw,
@@ -626,23 +568,27 @@ static ssize_t ocfs2_direct_IO(int rw,
        mlog_entry_void();
-        /*
+        if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) {
-         * We get PR data locks even for O_DIRECT.  This allows
+                /*
-         * concurrent O_DIRECT I/O but doesn't let O_DIRECT with
+                 * We get PR data locks even for O_DIRECT.  This
-         * extending and buffered zeroing writes race.  If they did
+                 * allows concurrent O_DIRECT I/O but doesn't let
-         * race then the buffered zeroing could be written back after
+                 * O_DIRECT with extending and buffered zeroing writes
-         * the O_DIRECT I/O.  It's one thing to tell people not to mix
+                 * race.  If they did race then the buffered zeroing
-         * buffered and O_DIRECT writes, but expecting them to
+                 * could be written back after the O_DIRECT I/O.  It's
-         * understand that file extension is also an implicit buffered
+                 * one thing to tell people not to mix buffered and
-         * write is too much.  By getting the PR we force writeback of
+                 * O_DIRECT writes, but expecting them to understand
-         * the buffered zeroing before proceeding.
+                 * that file extension is also an implicit buffered
-         */
+                 * write is too much.  By getting the PR we force
-        ret = ocfs2_data_lock(inode, 0);
+                 * writeback of the buffered zeroing before
-        if (ret < 0) {
+                 * proceeding.
-                mlog_errno(ret);
+                 */
-                goto out;
+                ret = ocfs2_data_lock(inode, 0);
+                if (ret < 0) {
+                        mlog_errno(ret);
+                        goto out;
+                }
+                ocfs2_data_unlock(inode, 0);
        }
-        ocfs2_data_unlock(inode, 0);
        ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
                                            inode->i_sb->s_bdev, iov, offset,
@@ -654,12 +600,719 @@ out:
        return ret;
 }
+/*
+ * Work out which byte range of a page cache page is covered by the
+ * logical cluster 'cpos'.
+ *
+ * Unless a page is larger than a cluster, the range is the whole
+ * page.  Otherwise it is the slot that 'cpos' occupies among the
+ * clusters sharing the page.  Either output pointer may be NULL when
+ * the caller does not need that value.
+ */
+static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb,
+                                            u32 cpos,
+                                            unsigned int *start,
+                                            unsigned int *end)
+{
+        unsigned int first = 0;
+        unsigned int last = PAGE_CACHE_SIZE;
+        if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) {
+                /* Clusters per page, and our slot among them. */
+                unsigned int per_page =
+                        1 << (PAGE_CACHE_SHIFT - osb->s_clustersize_bits);
+                unsigned int slot = cpos % per_page;
+                first = slot << osb->s_clustersize_bits;
+                last = first + osb->s_clustersize;
+        }
+        BUG_ON(first > PAGE_SIZE);
+        BUG_ON(last > PAGE_SIZE);
+        if (start)
+                *start = first;
+        if (end)
+                *end = last;
+}
+/*
+ * Zero the parts of this page's cluster region that lie outside of
+ * ['from', 'to'), i.e. 'from' and 'to' bound the bytes to leave alone.
+ *
+ * When pagesize > clustersize, nothing outside of the cluster
+ * boundary within the page is touched.
+ *
+ * Passing from == to == 0 means "zero the entire cluster region".
+ */
+static void ocfs2_clear_page_regions(struct page *page,
+                                     struct ocfs2_super *osb, u32 cpos,
+                                     unsigned from, unsigned to)
+{
+        unsigned int c_start, c_end;
+        void *map;
+        ocfs2_figure_cluster_boundaries(osb, cpos, &c_start, &c_end);
+        map = kmap_atomic(page, KM_USER0);
+        if (!from && !to) {
+                /* No region to preserve - wipe the whole cluster. */
+                memset(map + c_start, 0, c_end - c_start);
+        } else {
+                /* Head of the cluster, before the preserved bytes. */
+                if (from > c_start)
+                        memset(map + c_start, 0, from - c_start);
+                /* Tail of the cluster, after the preserved bytes. */
+                if (to < c_end)
+                        memset(map + to, 0, c_end - to);
+        }
+        kunmap_atomic(map, KM_USER0);
+}
+/*
+ * Some of this taken from block_prepare_write(). We already have our
+ * mapping by now though, and the entire write will be allocating or
+ * it won't, so not much need to use BH_New.
+ *
+ * This will also skip zeroing, which is handled externally.
+ *
+ * Buffers are attached to the page if it has none.  Every buffer that
+ * overlaps ['from', 'to') is mapped to *p_blkno if it is not mapped
+ * yet, and *p_blkno is advanced by one for each such buffer; buffers
+ * outside of the range are skipped without advancing it.  In-range
+ * buffers which are only partially covered by the range and are not
+ * up to date are read from disk and waited for.  Only the buffer
+ * holding 'from' and the one holding the end of the range can be
+ * partial, which is why wait[] has two slots.
+ *
+ * Returns 0 on success or -EIO if one of those reads failed.  On
+ * -EIO with 'new' set, the buffers overlapping the range are zeroed
+ * and marked up to date and dirty before returning.
+ */
+int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
+                          struct inode *inode, unsigned int from,
+                          unsigned int to, int new)
+{
+        int ret = 0;
+        struct buffer_head *head, *bh, *wait[2], **wait_bh = wait;
+        unsigned int block_end, block_start;
+        unsigned int bsize = 1 << inode->i_blkbits;
+        if (!page_has_buffers(page))
+                create_empty_buffers(page, bsize, 0);
+        head = page_buffers(page);
+        for (bh = head, block_start = 0; bh != head || !block_start;
+             bh = bh->b_this_page, block_start += bsize) {
+                block_end = block_start + bsize;
+                /*
+                 * Ignore blocks outside of our i/o range -
+                 * they may belong to unallocated clusters.
+                 */
+                if (block_start >= to || block_end <= from) {
+                        if (PageUptodate(page))
+                                set_buffer_uptodate(bh);
+                        continue;
+                }
+                /*
+                 * For an allocating write with cluster size >= page
+                 * size, we always write the entire page.
+                 */
+                if (buffer_new(bh))
+                        clear_buffer_new(bh);
+                if (!buffer_mapped(bh)) {
+                        map_bh(bh, inode->i_sb, *p_blkno);
+                        unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
+                }
+                if (PageUptodate(page)) {
+                        if (!buffer_uptodate(bh))
+                                set_buffer_uptodate(bh);
+                } else if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
+                     (block_start < from || block_end > to)) {
+                        ll_rw_block(READ, 1, &bh);
+                        *wait_bh++=bh;
+                }
+                *p_blkno = *p_blkno + 1;
+        }
+        /*
+         * If we issued read requests - let them complete.
+         */
+        while(wait_bh > wait) {
+                wait_on_buffer(*--wait_bh);
+                if (!buffer_uptodate(*wait_bh))
+                        ret = -EIO;
+        }
+        if (ret == 0 || !new)
+                return ret;
+        /*
+         * If we get -EIO above, zero out any newly allocated blocks
+         * to avoid exposing stale data.
+         */
+        bh = head;
+        block_start = 0;
+        do {
+                void *kaddr;
+                block_end = block_start + bsize;
+                if (block_end <= from)
+                        goto next_bh;
+                if (block_start >= to)
+                        break;
+                kaddr = kmap_atomic(page, KM_USER0);
+                memset(kaddr+block_start, 0, bh->b_size);
+                flush_dcache_page(page);
+                kunmap_atomic(kaddr, KM_USER0);
+                set_buffer_uptodate(bh);
+                mark_buffer_dirty(bh);
+next_bh:
+                block_start = block_end;
+                bh = bh->b_this_page;
+        } while (bh != head);
+        return ret;
+}
+/*
+ * This will copy user data from the buffer page in the splice
+ * context.
+ *
+ * For now, we ignore SPLICE_F_MOVE as that would require some extra
+ * communication out all the way to ocfs2_write().
+ *
+ * The blocks of wc->w_this_page are mapped first (the whole cluster
+ * region for a new page, otherwise just the range being written) and
+ * the data is then copied out of the pipe buffer.
+ *
+ * On success, *ret_from and *ret_to describe the byte range of the
+ * page that was filled and the number of bytes copied is returned.
+ * If mapping the blocks fails, the negative error code is returned,
+ * nothing is copied and *ret_from / *ret_to are left untouched.
+ */
+int ocfs2_map_and_write_splice_data(struct inode *inode,
+                                  struct ocfs2_write_ctxt *wc, u64 *p_blkno,
+                                  unsigned int *ret_from, unsigned int *ret_to)
+{
+        int ret;
+        unsigned int to, from, cluster_start, cluster_end;
+        char *src, *dst;
+        struct ocfs2_splice_write_priv *sp = wc->w_private;
+        struct pipe_buffer *buf = sp->s_buf;
+        unsigned long bytes, src_from;
+        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+        ocfs2_figure_cluster_boundaries(osb, wc->w_cpos, &cluster_start,
+                                        &cluster_end);
+        from = sp->s_offset;
+        src_from = sp->s_buf_offset;
+        bytes = wc->w_count;
+        if (wc->w_large_pages) {
+                /*
+                 * For cluster size < page size, we have to
+                 * calculate pos within the cluster and obey
+                 * the rightmost boundary.
+                 */
+                bytes = min(bytes, (unsigned long)(osb->s_clustersize
+                                   - (wc->w_pos & (osb->s_clustersize - 1))));
+        }
+        to = from + bytes;
+        if (wc->w_this_page_new)
+                ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode,
+                                            cluster_start, cluster_end, 1);
+        else
+                ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode,
+                                            from, to, 0);
+        if (ret) {
+                /*
+                 * Hand the error back as-is. Reporting 'bytes' here
+                 * would make the caller believe the copy happened.
+                 */
+                mlog_errno(ret);
+                goto out;
+        }
+        BUG_ON(from > PAGE_CACHE_SIZE);
+        BUG_ON(to > PAGE_CACHE_SIZE);
+        BUG_ON(from > osb->s_clustersize);
+        BUG_ON(to > osb->s_clustersize);
+        src = buf->ops->map(sp->s_pipe, buf, 1);
+        dst = kmap_atomic(wc->w_this_page, KM_USER1);
+        memcpy(dst + from, src + src_from, bytes);
+        /* kunmap_atomic() wants the mapped address, not the page. */
+        kunmap_atomic(dst, KM_USER1);
+        buf->ops->unmap(sp->s_pipe, buf, src);
+        wc->w_finished_copy = 1;
+        *ret_from = from;
+        *ret_to = to;
+        ret = bytes;
+out:
+        return ret;
+}
+/*
+ * This will copy user data from the iovec in the buffered write
+ * context.
+ *
+ * The blocks of wc->w_this_page are mapped first (the whole cluster
+ * region for a new page, otherwise just the range being written) and
+ * the data is then copied from bp->b_src_buf into the page.
+ *
+ * On success, *ret_from and *ret_to describe the byte range of the
+ * page that was filled and the number of bytes copied is returned.
+ * If mapping the blocks fails, the negative error code is returned,
+ * nothing is copied and *ret_from / *ret_to are left untouched.
+ */
+int ocfs2_map_and_write_user_data(struct inode *inode,
+                                  struct ocfs2_write_ctxt *wc, u64 *p_blkno,
+                                  unsigned int *ret_from, unsigned int *ret_to)
+{
+        int ret;
+        unsigned int to, from, cluster_start, cluster_end;
+        unsigned long bytes, src_from;
+        char *dst;
+        struct ocfs2_buffered_write_priv *bp = wc->w_private;
+        const struct iovec *cur_iov = bp->b_cur_iov;
+        char __user *buf;
+        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+        ocfs2_figure_cluster_boundaries(osb, wc->w_cpos, &cluster_start,
+                                        &cluster_end);
+        buf = cur_iov->iov_base + bp->b_cur_off;
+        src_from = (unsigned long)buf & ~PAGE_CACHE_MASK;
+        from = wc->w_pos & (PAGE_CACHE_SIZE - 1);
+        /*
+         * This is a lot of comparisons, but it reads quite
+         * easily, which is important here.
+         */
+        /* Stay within the src page */
+        bytes = PAGE_SIZE - src_from;
+        /* Stay within the vector */
+        bytes = min(bytes,
+                    (unsigned long)(cur_iov->iov_len - bp->b_cur_off));
+        /* Stay within count */
+        bytes = min(bytes, (unsigned long)wc->w_count);
+        /* Stay within the target cluster or page, whichever is smaller */
+        if (wc->w_large_pages) {
+                /*
+                 * For cluster size < page size, we have to
+                 * calculate pos within the cluster and obey
+                 * the rightmost boundary.
+                 */
+                bytes = min(bytes, (unsigned long)(osb->s_clustersize
+                                   - (wc->w_pos & (osb->s_clustersize - 1))));
+        } else {
+                /*
+                 * cluster size > page size is the most common
+                 * case - we just stay within the target page
+                 * boundary.
+                 */
+                bytes = min(bytes, PAGE_CACHE_SIZE - from);
+        }
+        to = from + bytes;
+        if (wc->w_this_page_new)
+                ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode,
+                                            cluster_start, cluster_end, 1);
+        else
+                ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode,
+                                            from, to, 0);
+        if (ret) {
+                /*
+                 * Hand the error back as-is. Reporting 'bytes' here
+                 * would make the caller believe the copy happened.
+                 */
+                mlog_errno(ret);
+                goto out;
+        }
+        BUG_ON(from > PAGE_CACHE_SIZE);
+        BUG_ON(to > PAGE_CACHE_SIZE);
+        BUG_ON(from > osb->s_clustersize);
+        BUG_ON(to > osb->s_clustersize);
+        dst = kmap(wc->w_this_page);
+        memcpy(dst + from, bp->b_src_buf + src_from, bytes);
+        kunmap(wc->w_this_page);
+        /*
+         * XXX: This is slow, but simple. The caller of
+         * ocfs2_buffered_write_cluster() is responsible for
+         * passing through the iovecs, so it's difficult to
+         * predict what our next step is in here after our
+         * initial write. A future version should be pushing
+         * that iovec manipulation further down.
+         *
+         * By setting this, we indicate that a copy from user
+         * data was done, and subsequent calls for this
+         * cluster will skip copying more data.
+         */
+        wc->w_finished_copy = 1;
+        *ret_from = from;
+        *ret_to = to;
+        ret = bytes;
+out:
+        return ret;
+}
+/*
+ * Map, fill and write a page to disk.
+ *
+ * The work of copying data is done via callback.  Newly allocated
+ * pages which don't take user data will be zero'd (set 'new' to
+ * indicate an allocating write)
+ *
+ * The callback (wc->w_write_data_page) is only invoked for the page
+ * which holds wc->w_pos, and only while wc->w_finished_copy is still
+ * clear.  Any other page must belong to an allocating write and just
+ * has its blocks mapped over the whole cluster region.  Afterwards
+ * the page's buffers are handed to ocfs2_journal_dirty_data() when
+ * ocfs2_should_order_data() says so, and the range is committed with
+ * block_commit_write().
+ *
+ * Returns a negative error code or the number of bytes copied into
+ * the page.
+ *
+ * NOTE(review): once the callback has copied data, failures from
+ * walk_page_buffers() or block_commit_write() are only logged - the
+ * copied byte count is still returned.  Confirm this is intended.
+ */
+int ocfs2_write_data_page(struct inode *inode, handle_t *handle,
+                          u64 *p_blkno, struct page *page,
+                          struct ocfs2_write_ctxt *wc, int new)
+{
+        int ret, copied = 0;
+        unsigned int from = 0, to = 0;
+        unsigned int cluster_start, cluster_end;
+        unsigned int zero_from = 0, zero_to = 0;
+        ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), wc->w_cpos,
+                                        &cluster_start, &cluster_end);
+        if ((wc->w_pos >> PAGE_CACHE_SHIFT) == page->index
+            && !wc->w_finished_copy) {
+                wc->w_this_page = page;
+                wc->w_this_page_new = new;
+                ret = wc->w_write_data_page(inode, wc, p_blkno, &from, &to);
+                if (ret < 0) {
+                        mlog_errno(ret);
+                        goto out;
+                }
+                copied = ret;
+                zero_from = from;
+                zero_to = to;
+                if (new) {
+                        from = cluster_start;
+                        to = cluster_end;
+                }
+        } else {
+                /*
+                 * If we haven't allocated the new page yet, we
+                 * shouldn't be writing it out without copying user
+                 * data. This is likely a math error from the caller.
+                 */
+                BUG_ON(!new);
+                from = cluster_start;
+                to = cluster_end;
+                ret = ocfs2_map_page_blocks(page, p_blkno, inode,
+                                            cluster_start, cluster_end, 1);
+                if (ret) {
+                        mlog_errno(ret);
+                        goto out;
+                }
+        }
+        /*
+         * Parts of newly allocated pages need to be zero'd.
+         *
+         * Above, we have also rewritten 'to' and 'from' - as far as
+         * the rest of the function is concerned, the entire cluster
+         * range inside of a page needs to be written.
+         *
+         * We can skip this if the page is up to date - it's already
+         * been zero'd from being read in as a hole.
+         */
+        if (new && !PageUptodate(page))
+                ocfs2_clear_page_regions(page, OCFS2_SB(inode->i_sb),
+                                         wc->w_cpos, zero_from, zero_to);
+        flush_dcache_page(page);
+        if (ocfs2_should_order_data(inode)) {
+                ret = walk_page_buffers(handle,
+                                        page_buffers(page),
+                                        from, to, NULL,
+                                        ocfs2_journal_dirty_data);
+                if (ret < 0)
+                        mlog_errno(ret);
+        }
+        /*
+         * We don't use generic_commit_write() because we need to
+         * handle our own i_size update.
+         */
+        ret = block_commit_write(page, from, to);
+        if (ret)
+                mlog_errno(ret);
+out:
+        return copied ? copied : ret;
+}
+/*
+ * Do the actual write of some data into an inode. Optionally allocate
+ * in order to fulfill the write.
+ *
+ * cpos is the logical cluster offset within the file to write at
+ *
+ * 'phys' is the physical mapping of that offset. a 'phys' value of
+ * zero indicates that allocation is required. In this case, data_ac
+ * and meta_ac should be valid (meta_ac can be null if metadata
+ * allocation isn't required).
+ *
+ * Returns the total number of bytes the page writer copied if that
+ * is non-zero, otherwise zero or a negative error code.
+ */
+static ssize_t ocfs2_write(struct file *file, u32 phys, handle_t *handle,
+                           struct buffer_head *di_bh,
+                           struct ocfs2_alloc_context *data_ac,
+                           struct ocfs2_alloc_context *meta_ac,
+                           struct ocfs2_write_ctxt *wc)
+{
+        int ret, i, numpages = 1, new;
+        unsigned int copied = 0;
+        u32 tmp_pos;
+        u64 v_blkno, p_blkno;
+        struct address_space *mapping = file->f_mapping;
+        struct inode *inode = mapping->host;
+        unsigned long index, start;
+        struct page **cpages;
+        new = phys == 0 ? 1 : 0;
+        /*
+         * Figure out how many pages we'll be manipulating here. For
+         * non allocating write, we just change the one
+         * page. Otherwise, we'll need a whole cluster's worth.
+         */
+        if (new)
+                numpages = ocfs2_pages_per_cluster(inode->i_sb);
+        cpages = kzalloc(sizeof(*cpages) * numpages, GFP_NOFS);
+        if (!cpages) {
+                ret = -ENOMEM;
+                mlog_errno(ret);
+                return ret;
+        }
+        /*
+         * Fill our page array first. That way we've grabbed enough so
+         * that we can zero and flush if we error after adding the
+         * extent.
+         */
+        if (new) {
+                start = ocfs2_align_clusters_to_page_index(inode->i_sb,
+                                                           wc->w_cpos);
+                v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, wc->w_cpos);
+        } else {
+                start = wc->w_pos >> PAGE_CACHE_SHIFT;
+                v_blkno = wc->w_pos >> inode->i_sb->s_blocksize_bits;
+        }
+        for(i = 0; i < numpages; i++) {
+                index = start + i;
+                cpages[i] = grab_cache_page(mapping, index);
+                if (!cpages[i]) {
+                        ret = -ENOMEM;
+                        mlog_errno(ret);
+                        goto out;
+                }
+        }
+        if (new) {
+                /*
+                 * This is safe to call with the page locks - it won't take
+                 * any additional semaphores or cluster locks.
+                 */
+                tmp_pos = wc->w_cpos;
+                ret = ocfs2_do_extend_allocation(OCFS2_SB(inode->i_sb), inode,
+                                                 &tmp_pos, 1, di_bh, handle,
+                                                 data_ac, meta_ac, NULL);
+                /*
+                 * This shouldn't happen because we must have already
+                 * calculated the correct meta data allocation required. The
+                 * internal tree allocation code should know how to increase
+                 * transaction credits itself.
+                 *
+                 * If need be, we could handle -EAGAIN for a
+                 * RESTART_TRANS here.
+                 */
+                mlog_bug_on_msg(ret == -EAGAIN,
+                                "Inode %llu: EAGAIN return during allocation.\n",
+                                (unsigned long long)OCFS2_I(inode)->ip_blkno);
+                if (ret < 0) {
+                        mlog_errno(ret);
+                        goto out;
+                }
+        }
+        ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL,
+                                          NULL);
+        if (ret < 0) {
+                /*
+                 * XXX: Should we go readonly here?
+                 */
+                mlog_errno(ret);
+                goto out;
+        }
+        BUG_ON(p_blkno == 0);
+        for(i = 0; i < numpages; i++) {
+                ret = ocfs2_write_data_page(inode, handle, &p_blkno, cpages[i],
+                                            wc, new);
+                if (ret < 0) {
+                        mlog_errno(ret);
+                        goto out;
+                }
+                copied += ret;
+        }
+out:
+        for(i = 0; i < numpages; i++) {
+                /*
+                 * grab_cache_page() may have failed part way through
+                 * filling the array - the slots we never reached are
+                 * still NULL from kzalloc() and must not be touched.
+                 */
+                if (!cpages[i])
+                        continue;
+                unlock_page(cpages[i]);
+                mark_page_accessed(cpages[i]);
+                page_cache_release(cpages[i]);
+        }
+        kfree(cpages);
+        return copied ? copied : ret;
+}
+/*
+ * Prime a write context for a single call into the buffered write
+ * path: record the position and length of the write, the cluster
+ * that position falls in, whether pages are larger than clusters,
+ * and the page filler callback along with its private data.
+ */
+static void ocfs2_write_ctxt_init(struct ocfs2_write_ctxt *wc,
+                                  struct ocfs2_super *osb, loff_t pos,
+                                  size_t count, ocfs2_page_writer *cb,
+                                  void *cb_priv)
+{
+        /* Filler callback and what it operates on. */
+        wc->w_write_data_page = cb;
+        wc->w_private = cb_priv;
+        /* Nothing has been copied yet. */
+        wc->w_finished_copy = 0;
+        /* Where, and how much, to write. */
+        wc->w_pos = pos;
+        wc->w_count = count;
+        wc->w_cpos = wc->w_pos >> osb->s_clustersize_bits;
+        wc->w_large_pages =
+                unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits) ? 1 : 0;
+}
+/*
+ * Write a cluster to an inode. The cluster may not be allocated yet,
+ * in which case it will be. This only exists for buffered writes -
+ * O_DIRECT takes a more "traditional" path through the kernel.
+ *
+ * The caller is responsible for incrementing pos, written counts, etc
+ *
+ * For file systems that don't support sparse files, pre-allocation
+ * and page zeroing up until cpos should be done prior to this
+ * function call.
+ *
+ * Callers should be holding i_sem, and the rw cluster lock.
+ *
+ * Resources are taken in this order: ocfs2_meta_lock() on the inode,
+ * ip_alloc_sem for write, ocfs2_lock_allocators() (only when the
+ * cluster has no physical mapping yet), ocfs2_data_lock() and finally
+ * the journal transaction.  They are released in reverse order at
+ * the out_* labels.
+ *
+ * After ocfs2_write() succeeds, i_size is raised if the write went
+ * past it, and the size and mtime/ctime are copied into the dinode
+ * buffer, which is then journalled.
+ *
+ * Returns the number of user bytes written, or less than zero for
+ * error.
+ *
+ * NOTE(review): if ocfs2_journal_access() or ocfs2_journal_dirty()
+ * fails after bytes were written, the error is only logged and the
+ * byte count is still returned.  Confirm this is intended.
+ */
+ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos,
+                                     size_t count, ocfs2_page_writer *actor,
+                                     void *priv)
+{
+        int ret, credits = OCFS2_INODE_UPDATE_CREDITS;
+        ssize_t written = 0;
+        u32 phys;
+        struct inode *inode = file->f_mapping->host;
+        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+        struct buffer_head *di_bh = NULL;
+        struct ocfs2_dinode *di;
+        struct ocfs2_alloc_context *data_ac = NULL;
+        struct ocfs2_alloc_context *meta_ac = NULL;
+        handle_t *handle;
+        struct ocfs2_write_ctxt wc;
+        ocfs2_write_ctxt_init(&wc, osb, pos, count, actor, priv);
+        ret = ocfs2_meta_lock(inode, &di_bh, 1);
+        if (ret) {
+                mlog_errno(ret);
+                goto out;
+        }
+        di = (struct ocfs2_dinode *)di_bh->b_data;
+        /*
+         * Take alloc sem here to prevent concurrent lookups. That way
+         * the mapping, zeroing and tree manipulation within
+         * ocfs2_write() will be safe against ->readpage(). This
+         * should also serve to lock out allocation from a shared
+         * writeable region.
+         */
+        down_write(&OCFS2_I(inode)->ip_alloc_sem);
+        ret = ocfs2_get_clusters(inode, wc.w_cpos, &phys, NULL, NULL);
+        if (ret) {
+                mlog_errno(ret);
+                goto out_meta;
+        }
+        /* phys == 0 means that allocation is required. */
+        if (phys == 0) {
+                ret = ocfs2_lock_allocators(inode, di, 1, &data_ac, &meta_ac);
+                if (ret) {
+                        mlog_errno(ret);
+                        goto out_meta;
+                }
+                credits = ocfs2_calc_extend_credits(inode->i_sb, di, 1);
+        }
+        ret = ocfs2_data_lock(inode, 1);
+        if (ret) {
+                mlog_errno(ret);
+                goto out_meta;
+        }
+        handle = ocfs2_start_trans(osb, credits);
+        if (IS_ERR(handle)) {
+                ret = PTR_ERR(handle);
+                mlog_errno(ret);
+                goto out_data;
+        }
+        written = ocfs2_write(file, phys, handle, di_bh, data_ac,
+                              meta_ac, &wc);
+        if (written < 0) {
+                ret = written;
+                mlog_errno(ret);
+                goto out_commit;
+        }
+        ret = ocfs2_journal_access(handle, inode, di_bh,
+                                   OCFS2_JOURNAL_ACCESS_WRITE);
+        if (ret) {
+                mlog_errno(ret);
+                goto out_commit;
+        }
+        pos += written;
+        if (pos > inode->i_size) {
+                i_size_write(inode, pos);
+                mark_inode_dirty(inode);
+        }
+        inode->i_blocks = ocfs2_inode_sector_count(inode);
+        di->i_size = cpu_to_le64((u64)i_size_read(inode));
+        inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+        di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
+        di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
+        ret = ocfs2_journal_dirty(handle, di_bh);
+        if (ret)
+                mlog_errno(ret);
+out_commit:
+        ocfs2_commit_trans(osb, handle);
+out_data:
+        ocfs2_data_unlock(inode, 1);
+out_meta:
+        up_write(&OCFS2_I(inode)->ip_alloc_sem);
+        ocfs2_meta_unlock(inode, 1);
+out:
+        brelse(di_bh);
+        if (data_ac)
+                ocfs2_free_alloc_context(data_ac);
+        if (meta_ac)
+                ocfs2_free_alloc_context(meta_ac);
+        return written ? written : ret;
+}
 const struct address_space_operations ocfs2_aops = {
        .readpage       = ocfs2_readpage,
        .writepage      = ocfs2_writepage,
-        .prepare_write  = ocfs2_prepare_write,
-        .commit_write   = ocfs2_commit_write,
        .bmap           = ocfs2_bmap,
        .sync_page      = block_sync_page,
-        .direct_IO      = ocfs2_direct_IO
+        .direct_IO      = ocfs2_direct_IO,
+        .invalidatepage = ocfs2_invalidatepage,
+        .releasepage    = ocfs2_releasepage,
+        .migratepage    = buffer_migrate_page,
 };
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index f446a15eab88..45821d479b5a 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -30,12 +30,83 @@ handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
                                                         unsigned from,
                                                         unsigned to);
+int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
+                          struct inode *inode, unsigned int from,
+                          unsigned int to, int new);
+int walk_page_buffers(  handle_t *handle,
+                        struct buffer_head *head,
+                        unsigned from,
+                        unsigned to,
+                        int *partial,
+                        int (*fn)(      handle_t *handle,
+                                        struct buffer_head *bh));
+struct ocfs2_write_ctxt;
+typedef int (ocfs2_page_writer)(struct inode *, struct ocfs2_write_ctxt *,
+                                u64 *, unsigned int *, unsigned int *);
+ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos,
+                                     size_t count, ocfs2_page_writer *actor,
+                                     void *priv);
+struct ocfs2_write_ctxt {
+        size_t                          w_count; /* Upper bound on bytes to copy */
+        loff_t                          w_pos; /* File position of the write */
+        u32                             w_cpos; /* w_pos >> cluster size bits */
+        unsigned int                    w_finished_copy; /* Set by the filler once it has copied */
+        /* This is true if page_size > cluster_size */
+        unsigned int                    w_large_pages;
+        /* Filler callback and private data */
+        ocfs2_page_writer               *w_write_data_page;
+        void                            *w_private;
+        /* Only valid for the filler callback */
+        struct page                     *w_this_page;
+        unsigned int                    w_this_page_new;
+};
+struct ocfs2_buffered_write_priv {
+        char                            *b_src_buf; /* Copy source; presumably a mapping of the user page - confirm with caller */
+        const struct iovec              *b_cur_iov; /* Current iovec */
+        size_t                          b_cur_off; /* Offset in the
+                                                    * current iovec */
+};
+int ocfs2_map_and_write_user_data(struct inode *inode,
+                                  struct ocfs2_write_ctxt *wc,
+                                  u64 *p_blkno,
+                                  unsigned int *ret_from,
+                                  unsigned int *ret_to);
+struct ocfs2_splice_write_priv {
+        struct splice_desc              *s_sd;
+        struct pipe_buffer              *s_buf; /* Pipe buffer the data is copied from */
+        struct pipe_inode_info          *s_pipe; /* Passed to s_buf's map/unmap ops */
+        /* Neither offset value is ever larger than one page */
+        unsigned int                    s_offset; /* Destination offset in the page */
+        unsigned int                    s_buf_offset; /* Source offset in the mapped buffer */
+};
+int ocfs2_map_and_write_splice_data(struct inode *inode,
+                                    struct ocfs2_write_ctxt *wc,
+                                    u64 *p_blkno,
+                                    unsigned int *ret_from,
+                                    unsigned int *ret_to);
 /* all ocfs2_dio_end_io()'s fault */
 #define ocfs2_iocb_is_rw_locked(iocb) \
        test_bit(0, (unsigned long *)&iocb->private)
-#define ocfs2_iocb_set_rw_locked(iocb) \
+static inline void ocfs2_iocb_set_rw_locked(struct kiocb *iocb, int level)
-        set_bit(0, (unsigned long *)&iocb->private)
+{
+        set_bit(0, (unsigned long *)&iocb->private);
+        if (level)
+                set_bit(1, (unsigned long *)&iocb->private);
+        else
+                clear_bit(1, (unsigned long *)&iocb->private);
+}
 #define ocfs2_iocb_clear_rw_locked(iocb) \
        clear_bit(0, (unsigned long *)&iocb->private)
+#define ocfs2_iocb_rw_locked_level(iocb) \
+        test_bit(1, (unsigned long *)&iocb->private)
 #endif /* OCFS2_FILE_H */
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 5a9779bb9236..eba282da500e 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -1234,6 +1234,7 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
                                     const char *page,
                                     size_t count)
 {
+        struct task_struct *hb_task;
        long fd;
        int sectsize;
        char *p = (char *)page;
@@ -1319,20 +1320,28 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
         */
        atomic_set(&reg->hr_steady_iterations, O2HB_LIVE_THRESHOLD + 1);
-        reg->hr_task = kthread_run(o2hb_thread, reg, "o2hb-%s",
+        hb_task = kthread_run(o2hb_thread, reg, "o2hb-%s",
-                                   reg->hr_item.ci_name);
+                              reg->hr_item.ci_name);
-        if (IS_ERR(reg->hr_task)) {
+        if (IS_ERR(hb_task)) {
-                ret = PTR_ERR(reg->hr_task);
+                ret = PTR_ERR(hb_task);
                mlog_errno(ret);
-                reg->hr_task = NULL;
                goto out;
        }
+        spin_lock(&o2hb_live_lock);
+        reg->hr_task = hb_task;
+        spin_unlock(&o2hb_live_lock);
        ret = wait_event_interruptible(o2hb_steady_queue,
                                atomic_read(&reg->hr_steady_iterations) == 0);
        if (ret) {
-                kthread_stop(reg->hr_task);
+                spin_lock(&o2hb_live_lock);
+                hb_task = reg->hr_task;
                reg->hr_task = NULL;
+                spin_unlock(&o2hb_live_lock);
+                if (hb_task)
+                        kthread_stop(hb_task);
                goto out;
        }
@@ -1354,10 +1363,17 @@ out:
 static ssize_t o2hb_region_pid_read(struct o2hb_region *reg,
                                      char *page)
 {
-        if (!reg->hr_task)
+        pid_t pid = 0;
+        spin_lock(&o2hb_live_lock);
+        if (reg->hr_task)
+                pid = reg->hr_task->pid;
+        spin_unlock(&o2hb_live_lock);
+        if (!pid)
                return 0;
-        return sprintf(page, "%u\n", reg->hr_task->pid);
+        return sprintf(page, "%u\n", pid);
 }
 struct o2hb_region_attribute {
@@ -1495,13 +1511,17 @@ out:
 static void o2hb_heartbeat_group_drop_item(struct config_group *group,
                                           struct config_item *item)
 {
+        struct task_struct *hb_task;
        struct o2hb_region *reg = to_o2hb_region(item);
        /* stop the thread when the user removes the region dir */
-        if (reg->hr_task) {
+        spin_lock(&o2hb_live_lock);
-                kthread_stop(reg->hr_task);
+        hb_task = reg->hr_task;
-                reg->hr_task = NULL;
+        reg->hr_task = NULL;
-        }
+        spin_unlock(&o2hb_live_lock);
+        if (hb_task)
+                kthread_stop(hb_task);
        config_item_put(item);
 }
@@ -1682,7 +1702,7 @@ out:
 }
 EXPORT_SYMBOL_GPL(o2hb_register_callback);
-int o2hb_unregister_callback(struct o2hb_callback_func *hc)
+void o2hb_unregister_callback(struct o2hb_callback_func *hc)
 {
        BUG_ON(hc->hc_magic != O2HB_CB_MAGIC);
@@ -1690,15 +1710,13 @@ int o2hb_unregister_callback(struct o2hb_callback_func *hc)
             __builtin_return_address(0), hc);
        if (list_empty(&hc->hc_item))
-                return 0;
+                return;
        down_write(&o2hb_callback_sem);
        list_del_init(&hc->hc_item);
        up_write(&o2hb_callback_sem);
-        return 0;
 }
 EXPORT_SYMBOL_GPL(o2hb_unregister_callback);
diff --git a/fs/ocfs2/cluster/heartbeat.h b/fs/ocfs2/cluster/heartbeat.h
index cac6223206a9..cc6d40b39771 100644
--- a/fs/ocfs2/cluster/heartbeat.h
+++ b/fs/ocfs2/cluster/heartbeat.h
@@ -70,7 +70,7 @@ void o2hb_setup_callback(struct o2hb_callback_func *hc,
                         void *data,
                         int priority);
 int o2hb_register_callback(struct o2hb_callback_func *hc);
-int o2hb_unregister_callback(struct o2hb_callback_func *hc);
+void o2hb_unregister_callback(struct o2hb_callback_func *hc);
 void o2hb_fill_node_map(unsigned long *map,
                        unsigned bytes);
 void o2hb_init(void);
diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c
index 4705d659fe57..bbacf7da48a4 100644
--- a/fs/ocfs2/cluster/quorum.c
+++ b/fs/ocfs2/cluster/quorum.c
@@ -46,6 +46,7 @@
 #include <linux/kernel.h>
 #include <linux/slab.h>
 #include <linux/workqueue.h>
+#include <linux/reboot.h>
 #include "heartbeat.h"
 #include "nodemanager.h"
@@ -72,7 +73,9 @@ static void o2quo_fence_self(void)
        /* panic spins with interrupts enabled.  with preempt
         * threads can still schedule, etc, etc */
        o2hb_stop_all_regions();
-        panic("ocfs2 is very sorry to be fencing this system by panicing\n");
+        printk("ocfs2 is very sorry to be fencing this system by restarting\n");
+        emergency_restart();
 }
 /* Indicate that a timeout occured on a hearbeat region write. The
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 1718215fc018..69caf3e12fea 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -1638,17 +1638,8 @@ static void o2net_hb_node_up_cb(struct o2nm_node *node, int node_num,
 void o2net_unregister_hb_callbacks(void)
 {
-        int ret;
+        o2hb_unregister_callback(&o2net_hb_up);
+        o2hb_unregister_callback(&o2net_hb_down);
-        ret = o2hb_unregister_callback(&o2net_hb_up);
-        if (ret < 0)
-                mlog(ML_ERROR, "Status return %d unregistering heartbeat up "
-                     "callback!\n", ret);
-        ret = o2hb_unregister_callback(&o2net_hb_down);
-        if (ret < 0)
-                mlog(ML_ERROR, "Status return %d unregistering heartbeat down "
-                     "callback!\n", ret);
 }
 int o2net_register_hb_callbacks(void)
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
index 4dae5df5e467..9606111fe89d 100644
--- a/fs/ocfs2/cluster/tcp_internal.h
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -38,6 +38,9 @@
 * locking semantics of the file system using the protocol.  It should 
 * be somewhere else, I'm sure, but right now it isn't.
 *
+ * New in version 8:
+ *      - Replace delete inode votes with a cluster lock
+ *
 * New in version 7:
 *      - DLM join domain includes the live nodemap
 *
@@ -57,7 +60,7 @@
 *      - full 64 bit i_size in the metadata lock lvbs
 *      - introduction of "rw" lock and pushing meta/data locking down
 */
-#define O2NET_PROTOCOL_VERSION 7ULL
+#define O2NET_PROTOCOL_VERSION 8ULL
 struct o2net_handshake {
        __be64  protocol_version;
        __be64  connector_id;
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 66821e178167..67e6866a2a4f 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -358,15 +358,17 @@ int ocfs2_do_extend_dir(struct super_block *sb,
 {
        int status;
        int extend;
-        u64 p_blkno;
+        u64 p_blkno, v_blkno;
        spin_lock(&OCFS2_I(dir)->ip_lock);
        extend = (i_size_read(dir) == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters));
        spin_unlock(&OCFS2_I(dir)->ip_lock);
        if (extend) {
-                status = ocfs2_do_extend_allocation(OCFS2_SB(sb), dir, 1,
+                u32 offset = OCFS2_I(dir)->ip_clusters;
-                                                    parent_fe_bh, handle,
+                status = ocfs2_do_extend_allocation(OCFS2_SB(sb), dir, &offset,
+                                                    1, parent_fe_bh, handle,
                                                    data_ac, meta_ac, NULL);
                BUG_ON(status == -EAGAIN);
                if (status < 0) {
@@ -375,9 +377,8 @@ int ocfs2_do_extend_dir(struct super_block *sb,
                }
        }
-        status = ocfs2_extent_map_get_blocks(dir, (dir->i_blocks >>
+        v_blkno = ocfs2_blocks_for_bytes(sb, i_size_read(dir));
-                                                   (sb->s_blocksize_bits - 9)),
+        status = ocfs2_extent_map_get_blocks(dir, v_blkno, &p_blkno, NULL, NULL);
-                                             1, &p_blkno, NULL);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
@@ -486,7 +487,7 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
        dir_i_size += dir->i_sb->s_blocksize;
        i_size_write(dir, dir_i_size);
-        dir->i_blocks = ocfs2_align_bytes_to_sectors(dir_i_size);
+        dir->i_blocks = ocfs2_inode_sector_count(dir);
        status = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh);
        if (status < 0) {
                mlog_errno(status);
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 6087c4749fee..d836b98dd99a 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -138,8 +138,10 @@ static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm);
 void __dlm_unhash_lockres(struct dlm_lock_resource *lockres)
 {
-        hlist_del_init(&lockres->hash_node);
+        if (!hlist_unhashed(&lockres->hash_node)) {
-        dlm_lockres_put(lockres);
+                hlist_del_init(&lockres->hash_node);
+                dlm_lockres_put(lockres);
+        }
 }
 void __dlm_insert_lockres(struct dlm_ctxt *dlm,
@@ -428,11 +430,10 @@ redo_bucket:
                        dlm_lockres_put(res);
-                        cond_resched_lock(&dlm->spinlock);
                        if (dropped)
                                goto redo_bucket;
                }
+                cond_resched_lock(&dlm->spinlock);
                num += n;
                mlog(0, "%s: touched %d lockreses in bucket %d "
                     "(tot=%d)\n", dlm->name, n, i, num);
@@ -655,6 +656,8 @@ void dlm_unregister_domain(struct dlm_ctxt *dlm)
                dlm_kick_thread(dlm, NULL);
                while (dlm_migrate_all_locks(dlm)) {
+                        /* Give dlm_thread time to purge the lockres' */
+                        msleep(500);
                        mlog(0, "%s: more migration to do\n", dlm->name);
                }
                dlm_mark_domain_leaving(dlm);
@@ -1031,7 +1034,7 @@ static int dlm_try_to_join_domain(struct dlm_ctxt *dlm)
 {
        int status = 0, tmpstat, node;
        struct domain_join_ctxt *ctxt;
-        enum dlm_query_join_response response;
+        enum dlm_query_join_response response = JOIN_DISALLOW;
        mlog_entry("%p", dlm);
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 77e4e6169a0d..6edffca99d98 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -2424,6 +2424,57 @@ static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data)
        dlm_lockres_put(res);
 }
+/* Checks whether the lockres can be migrated. Returns 0 if yes, < 0
+ * if not. If 0, numlocks is set to the number of locks in the lockres.
+ */
+static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
+                                      struct dlm_lock_resource *res,
+                                      int *numlocks)
+{
+        int ret;
+        int i;
+        int count = 0;
+        struct list_head *queue, *iter;
+        struct dlm_lock *lock;
+        assert_spin_locked(&res->spinlock);
+        ret = -EINVAL;
+        if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
+                mlog(0, "cannot migrate lockres with unknown owner!\n");
+                goto leave;
+        }
+        if (res->owner != dlm->node_num) {
+                mlog(0, "cannot migrate lockres this node doesn't own!\n");
+                goto leave;
+        }
+        ret = 0;
+        queue = &res->granted;
+        for (i = 0; i < 3; i++) {
+                list_for_each(iter, queue) {
+                        lock = list_entry(iter, struct dlm_lock, list);
+                        ++count;
+                        if (lock->ml.node == dlm->node_num) {
+                                mlog(0, "found a lock owned by this node still "
+                                     "on the %s queue!  will not migrate this "
+                                     "lockres\n", (i == 0 ? "granted" :
+                                                   (i == 1 ? "converting" :
+                                                    "blocked")));
+                                ret = -ENOTEMPTY;
+                                goto leave;
+                        }
+                }
+                queue++;
+        }
+        *numlocks = count;
+        mlog(0, "migrateable lockres having %d locks\n", *numlocks);
+leave:
+        return ret;
+}
 /*
 * DLM_MIGRATE_LOCKRES
@@ -2437,14 +2488,12 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
        struct dlm_master_list_entry *mle = NULL;
        struct dlm_master_list_entry *oldmle = NULL;
        struct dlm_migratable_lockres *mres = NULL;
-        int ret = -EINVAL;
+        int ret = 0;
        const char *name;
        unsigned int namelen;
        int mle_added = 0;
-        struct list_head *queue, *iter;
+        int numlocks;
-        int i;
+        int wake = 0;
-        struct dlm_lock *lock;
-        int empty = 1, wake = 0;
        if (!dlm_grab(dlm))
                return -EINVAL;
@@ -2458,42 +2507,16 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
         * ensure this lockres is a proper candidate for migration
         */
        spin_lock(&res->spinlock);
-        if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
+        ret = dlm_is_lockres_migrateable(dlm, res, &numlocks);
-                mlog(0, "cannot migrate lockres with unknown owner!\n");
+        if (ret < 0) {
-                spin_unlock(&res->spinlock);
-                goto leave;
-        }
-        if (res->owner != dlm->node_num) {
-                mlog(0, "cannot migrate lockres this node doesn't own!\n");
                spin_unlock(&res->spinlock);
                goto leave;
        }
-        mlog(0, "checking queues...\n");
-        queue = &res->granted;
-        for (i=0; i<3; i++) {
-                list_for_each(iter, queue) {
-                        lock = list_entry (iter, struct dlm_lock, list);
-                        empty = 0;
-                        if (lock->ml.node == dlm->node_num) {
-                                mlog(0, "found a lock owned by this node "
-                                     "still on the %s queue!  will not "
-                                     "migrate this lockres\n",
-                                     i==0 ? "granted" :
-                                     (i==1 ? "converting" : "blocked"));
-                                spin_unlock(&res->spinlock);
-                                ret = -ENOTEMPTY;
-                                goto leave;
-                        }
-                }
-                queue++;
-        }
-        mlog(0, "all locks on this lockres are nonlocal.  continuing\n");
        spin_unlock(&res->spinlock);
        /* no work to do */
-        if (empty) {
+        if (numlocks == 0) {
                mlog(0, "no locks were found on this lockres! done!\n");
-                ret = 0;
                goto leave;
        }
@@ -2729,15 +2752,26 @@ int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
 {
        int ret;
        int lock_dropped = 0;
+        int numlocks;
+        spin_lock(&res->spinlock);
        if (res->owner != dlm->node_num) {
                if (!__dlm_lockres_unused(res)) {
                        mlog(ML_ERROR, "%s:%.*s: this node is not master, "
                             "trying to free this but locks remain\n",
                             dlm->name, res->lockname.len, res->lockname.name);
                }
+                spin_unlock(&res->spinlock);
+                goto leave;
+        }
+        /* No need to migrate a lockres having no locks */
+        ret = dlm_is_lockres_migrateable(dlm, res, &numlocks);
+        if (ret >= 0 && numlocks == 0) {
+                spin_unlock(&res->spinlock);
                goto leave;
        }
+        spin_unlock(&res->spinlock);
        /* Wheee! Migrate lockres here! Will sleep so drop spinlock. */
        spin_unlock(&dlm->spinlock);
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 6d4a83d50152..c1807a42c49f 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -611,6 +611,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
                        }
                } while (status != 0);
+                spin_lock(&dlm_reco_state_lock);
                switch (ndata->state) {
                        case DLM_RECO_NODE_DATA_INIT:
                        case DLM_RECO_NODE_DATA_FINALIZE_SENT:
@@ -641,6 +642,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
                                     ndata->node_num, dead_node);
                                break;
                }
+                spin_unlock(&dlm_reco_state_lock);
        }
        mlog(0, "done requesting all lock info\n");
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 8ffa0916eb86..2b264c6ba039 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -256,18 +256,14 @@ static void dlm_run_purge_list(struct dlm_ctxt *dlm,
                        break;
                }
-                mlog(0, "removing lockres %.*s:%p from purgelist\n",
+                dlm_lockres_get(lockres);
-                     lockres->lockname.len, lockres->lockname.name, lockres);
-                list_del_init(&lockres->purge);
-                dlm_lockres_put(lockres);
-                dlm->purge_count--;
                /* This may drop and reacquire the dlm spinlock if it
                 * has to do migration. */
-                mlog(0, "calling dlm_purge_lockres!\n");
                if (dlm_purge_lockres(dlm, lockres))
                        BUG();
-                mlog(0, "DONE calling dlm_purge_lockres!\n");
+                dlm_lockres_put(lockres);
                /* Avoid adding any scheduling latencies */
                cond_resched_lock(&dlm->spinlock);
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index e335541727f9..27e43b0c0eae 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -225,11 +225,17 @@ static struct ocfs2_lock_res_ops ocfs2_dentry_lops = {
        .flags          = 0,
 };
+static struct ocfs2_lock_res_ops ocfs2_inode_open_lops = {
+        .get_osb        = ocfs2_get_inode_osb,
+        .flags          = 0,
+};
 static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
 {
        return lockres->l_type == OCFS2_LOCK_TYPE_META ||
                lockres->l_type == OCFS2_LOCK_TYPE_DATA ||
-                lockres->l_type == OCFS2_LOCK_TYPE_RW;
+                lockres->l_type == OCFS2_LOCK_TYPE_RW ||
+                lockres->l_type == OCFS2_LOCK_TYPE_OPEN;
 }
 static inline struct inode *ocfs2_lock_res_inode(struct ocfs2_lock_res *lockres)
@@ -373,6 +379,9 @@ void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
                case OCFS2_LOCK_TYPE_DATA:
                        ops = &ocfs2_inode_data_lops;
                        break;
+                case OCFS2_LOCK_TYPE_OPEN:
+                        ops = &ocfs2_inode_open_lops;
+                        break;
                default:
                        mlog_bug_on_msg(1, "type: %d\n", type);
                        ops = NULL; /* thanks, gcc */
@@ -1129,6 +1138,12 @@ int ocfs2_create_new_inode_locks(struct inode *inode)
                goto bail;
        }
+        ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_open_lockres, 0, 0);
+        if (ret) {
+                mlog_errno(ret);
+                goto bail;
+        }
 bail:
        mlog_exit(ret);
        return ret;
@@ -1182,6 +1197,99 @@ void ocfs2_rw_unlock(struct inode *inode, int write)
        mlog_exit_void();
 }
+/*
+ * ocfs2_open_lock always get PR mode lock.
+ */
+int ocfs2_open_lock(struct inode *inode)
+{
+        int status = 0;
+        struct ocfs2_lock_res *lockres;
+        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+        BUG_ON(!inode);
+        mlog_entry_void();
+        mlog(0, "inode %llu take PRMODE open lock\n",
+             (unsigned long long)OCFS2_I(inode)->ip_blkno);
+        if (ocfs2_mount_local(osb))
+                goto out;
+        lockres = &OCFS2_I(inode)->ip_open_lockres;
+        status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres,
+                                    LKM_PRMODE, 0, 0);
+        if (status < 0)
+                mlog_errno(status);
+out:
+        mlog_exit(status);
+        return status;
+}
+int ocfs2_try_open_lock(struct inode *inode, int write)
+{
+        int status = 0, level;
+        struct ocfs2_lock_res *lockres;
+        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+        BUG_ON(!inode);
+        mlog_entry_void();
+        mlog(0, "inode %llu try to take %s open lock\n",
+             (unsigned long long)OCFS2_I(inode)->ip_blkno,
+             write ? "EXMODE" : "PRMODE");
+        if (ocfs2_mount_local(osb))
+                goto out;
+        lockres = &OCFS2_I(inode)->ip_open_lockres;
+        level = write ? LKM_EXMODE : LKM_PRMODE;
+        /*
+         * The file system may already holding a PRMODE/EXMODE open lock.
+         * Since we pass LKM_NOQUEUE, the request won't block waiting on
+         * other nodes and the -EAGAIN will indicate to the caller that
+         * this inode is still in use.
+         */
+        status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres,
+                                    level, LKM_NOQUEUE, 0);
+out:
+        mlog_exit(status);
+        return status;
+}
+/*
+ * ocfs2_open_unlock unlock PR and EX mode open locks.
+ */
+void ocfs2_open_unlock(struct inode *inode)
+{
+        struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_open_lockres;
+        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+        mlog_entry_void();
+        mlog(0, "inode %llu drop open lock\n",
+             (unsigned long long)OCFS2_I(inode)->ip_blkno);
+        if (ocfs2_mount_local(osb))
+                goto out;
+        if(lockres->l_ro_holders)
+                ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres,
+                                     LKM_PRMODE);
+        if(lockres->l_ex_holders)
+                ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres,
+                                     LKM_EXMODE);
+out:
+        mlog_exit_void();
+}
 int ocfs2_data_lock_full(struct inode *inode,
                         int write,
                         int arg_flags)
@@ -1387,8 +1495,7 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
        if (S_ISLNK(inode->i_mode) && !oi->ip_clusters)
                inode->i_blocks = 0;
        else
-                inode->i_blocks =
+                inode->i_blocks = ocfs2_inode_sector_count(inode);
-                        ocfs2_align_bytes_to_sectors(i_size_read(inode));
        inode->i_uid     = be32_to_cpu(lvb->lvb_iuid);
        inode->i_gid     = be32_to_cpu(lvb->lvb_igid);
@@ -1479,12 +1586,15 @@ static int ocfs2_meta_lock_update(struct inode *inode,
 {
        int status = 0;
        struct ocfs2_inode_info *oi = OCFS2_I(inode);
-        struct ocfs2_lock_res *lockres = NULL;
+        struct ocfs2_lock_res *lockres = &oi->ip_meta_lockres;
        struct ocfs2_dinode *fe;
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
        mlog_entry_void();
+        if (ocfs2_mount_local(osb))
+                goto bail;
        spin_lock(&oi->ip_lock);
        if (oi->ip_flags & OCFS2_INODE_DELETED) {
                mlog(0, "Orphaned inode %llu was deleted while we "
@@ -1496,22 +1606,16 @@ static int ocfs2_meta_lock_update(struct inode *inode,
        }
        spin_unlock(&oi->ip_lock);
-        if (!ocfs2_mount_local(osb)) {
+        if (!ocfs2_should_refresh_lock_res(lockres))
-                lockres = &oi->ip_meta_lockres;
+                goto bail;
-                if (!ocfs2_should_refresh_lock_res(lockres))
-                        goto bail;
-        }
        /* This will discard any caching information we might have had
         * for the inode metadata. */
        ocfs2_metadata_cache_purge(inode);
-        /* will do nothing for inode types that don't use the extent
-         * map (directories, bitmap files, etc) */
        ocfs2_extent_map_trunc(inode, 0);
-        if (lockres && ocfs2_meta_lvb_is_trustable(inode, lockres)) {
+        if (ocfs2_meta_lvb_is_trustable(inode, lockres)) {
                mlog(0, "Trusting LVB on inode %llu\n",
                     (unsigned long long)oi->ip_blkno);
                ocfs2_refresh_inode_from_lvb(inode);
@@ -1558,8 +1662,7 @@ static int ocfs2_meta_lock_update(struct inode *inode,
        status = 0;
 bail_refresh:
-        if (lockres)
+        ocfs2_complete_lock_res_refresh(lockres, status);
-                ocfs2_complete_lock_res_refresh(lockres, status);
 bail:
        mlog_exit(status);
        return status;
@@ -1630,7 +1733,6 @@ int ocfs2_meta_lock_full(struct inode *inode,
                wait_event(osb->recovery_event,
                           ocfs2_node_map_is_empty(osb, &osb->recovery_map));
-        acquired = 0;
        lockres = &OCFS2_I(inode)->ip_meta_lockres;
        level = ex ? LKM_EXMODE : LKM_PRMODE;
        dlm_flags = 0;
@@ -2458,13 +2560,20 @@ int ocfs2_drop_inode_locks(struct inode *inode)
         * ocfs2_clear_inode has done it for us. */
        err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
-                              &OCFS2_I(inode)->ip_data_lockres);
+                              &OCFS2_I(inode)->ip_open_lockres);
        if (err < 0)
                mlog_errno(err);
        status = err;
        err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
+                              &OCFS2_I(inode)->ip_data_lockres);
+        if (err < 0)
+                mlog_errno(err);
+        if (err < 0 && !status)
+                status = err;
+        err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
                              &OCFS2_I(inode)->ip_meta_lockres);
        if (err < 0)
                mlog_errno(err);
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index c343fca68cf1..59cb566e7983 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -80,6 +80,9 @@ void ocfs2_data_unlock(struct inode *inode,
                       int write);
 int ocfs2_rw_lock(struct inode *inode, int write);
 void ocfs2_rw_unlock(struct inode *inode, int write);
+int ocfs2_open_lock(struct inode *inode);
+int ocfs2_try_open_lock(struct inode *inode, int write);
+void ocfs2_open_unlock(struct inode *inode);
 int ocfs2_meta_lock_atime(struct inode *inode,
                          struct vfsmount *vfsmnt,
                          int *level);
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index 80ac69f11d9f..ba2b2ab1c6e4 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -3,8 +3,7 @@
 *
 * extent_map.c
 *
- * In-memory extent map for OCFS2.  Man, this code was prettier in
+ * Block/Cluster mapping functions
- * the library.
 *
 * Copyright (C) 2004 Oracle.  All rights reserved.
 *
@@ -26,1016 +25,528 @@
 #include <linux/fs.h>
 #include <linux/init.h>
 #include <linux/types.h>
-#include <linux/slab.h>
-#include <linux/rbtree.h>
 #define MLOG_MASK_PREFIX ML_EXTENT_MAP
 #include <cluster/masklog.h>
 #include "ocfs2.h"
+#include "alloc.h"
 #include "extent_map.h"
 #include "inode.h"
 #include "super.h"
 #include "buffer_head_io.h"
 /*
- * SUCK SUCK SUCK
+ * The extent caching implementation is intentionally trivial.
- * Our headers are so bad that struct ocfs2_extent_map is in ocfs.h
- */
-struct ocfs2_extent_map_entry {
-        struct rb_node e_node;
-        int e_tree_depth;
-        struct ocfs2_extent_rec e_rec;
-};
-struct ocfs2_em_insert_context {
-        int need_left;
-        int need_right;
-        struct ocfs2_extent_map_entry *new_ent;
-        struct ocfs2_extent_map_entry *old_ent;
-        struct ocfs2_extent_map_entry *left_ent;
-        struct ocfs2_extent_map_entry *right_ent;
-};
-static struct kmem_cache *ocfs2_em_ent_cachep = NULL;
-static struct ocfs2_extent_map_entry *
-ocfs2_extent_map_lookup(struct ocfs2_extent_map *em,
-                        u32 cpos, u32 clusters,
-                        struct rb_node ***ret_p,
-                        struct rb_node **ret_parent);
-static int ocfs2_extent_map_insert(struct inode *inode,
-                                   struct ocfs2_extent_rec *rec,
-                                   int tree_depth);
-static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em,
-                                         struct ocfs2_extent_map_entry *ent);
-static int ocfs2_extent_map_find_leaf(struct inode *inode,
-                                      u32 cpos, u32 clusters,
-                                      struct ocfs2_extent_list *el);
-static int ocfs2_extent_map_lookup_read(struct inode *inode,
-                                        u32 cpos, u32 clusters,
-                                        struct ocfs2_extent_map_entry **ret_ent);
-static int ocfs2_extent_map_try_insert(struct inode *inode,
-                                       struct ocfs2_extent_rec *rec,
-                                       int tree_depth,
-                                       struct ocfs2_em_insert_context *ctxt);
-/* returns 1 only if the rec contains all the given clusters -- that is that
- * rec's cpos is <= the cluster cpos and that the rec endpoint (cpos +
- * clusters) is >= the argument's endpoint */
-static int ocfs2_extent_rec_contains_clusters(struct ocfs2_extent_rec *rec,
-                                              u32 cpos, u32 clusters)
-{
-        if (le32_to_cpu(rec->e_cpos) > cpos)
-                return 0;
-        if (cpos + clusters > le32_to_cpu(rec->e_cpos) + 
-                              le32_to_cpu(rec->e_clusters))
-                return 0;
-        return 1;
-}
-/*
- * Find an entry in the tree that intersects the region passed in.
- * Note that this will find straddled intervals, it is up to the
- * callers to enforce any boundary conditions.
- *
- * Callers must hold ip_lock.  This lookup is not guaranteed to return
- * a tree_depth 0 match, and as such can race inserts if the lock
- * were not held.
 *
- * The rb_node garbage lets insertion share the search.  Trivial
+ * We only cache a small number of extents stored directly on the
- * callers pass NULL.
+ * inode, so linear order operations are acceptable. If we ever want
+ * to increase the size of the extent map, then these algorithms must
+ * get smarter.
 */
-static struct ocfs2_extent_map_entry *
-ocfs2_extent_map_lookup(struct ocfs2_extent_map *em,
+void ocfs2_extent_map_init(struct inode *inode)
-                        u32 cpos, u32 clusters,
-                        struct rb_node ***ret_p,
-                        struct rb_node **ret_parent)
 {
-        struct rb_node **p = &em->em_extents.rb_node;
+        struct ocfs2_inode_info *oi = OCFS2_I(inode);
-        struct rb_node *parent = NULL;
-        struct ocfs2_extent_map_entry *ent = NULL;
-        while (*p)
-        {
-                parent = *p;
-                ent = rb_entry(parent, struct ocfs2_extent_map_entry,
-                               e_node);
-                if ((cpos + clusters) <= le32_to_cpu(ent->e_rec.e_cpos)) {
-                        p = &(*p)->rb_left;
-                        ent = NULL;
-                } else if (cpos >= (le32_to_cpu(ent->e_rec.e_cpos) +
-                                    le32_to_cpu(ent->e_rec.e_clusters))) {
-                        p = &(*p)->rb_right;
-                        ent = NULL;
-                } else
-                        break;
-        }
-        if (ret_p != NULL)
+        oi->ip_extent_map.em_num_items = 0;
-                *ret_p = p;
+        INIT_LIST_HEAD(&oi->ip_extent_map.em_list);
-        if (ret_parent != NULL)
-                *ret_parent = parent;
-        return ent;
 }
-/*
+static void __ocfs2_extent_map_lookup(struct ocfs2_extent_map *em,
- * Find the leaf containing the interval we want.  While we're on our
+                                      unsigned int cpos,
- * way down the tree, fill in every record we see at any depth, because
+                                      struct ocfs2_extent_map_item **ret_emi)
- * we might want it later.
- *
- * Note that this code is run without ip_lock.  That's because it
- * sleeps while reading.  If someone is also filling the extent list at
- * the same time we are, we might have to restart.
- */
-static int ocfs2_extent_map_find_leaf(struct inode *inode,
-                                      u32 cpos, u32 clusters,
-                                      struct ocfs2_extent_list *el)
 {
-        int i, ret;
+        unsigned int range;
-        struct buffer_head *eb_bh = NULL;
+        struct ocfs2_extent_map_item *emi;
-        u64 blkno;
-        u32 rec_end;
-        struct ocfs2_extent_block *eb;
-        struct ocfs2_extent_rec *rec;
-        /*
-         * The bh data containing the el cannot change here, because
-         * we hold alloc_sem.  So we can do this without other
-         * locks.
-         */
-        while (el->l_tree_depth)
-        {
-                blkno = 0;
-                for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
-                        rec = &el->l_recs[i];
-                        rec_end = (le32_to_cpu(rec->e_cpos) +
-                                   le32_to_cpu(rec->e_clusters));
-                        ret = -EBADR;
-                        if (rec_end > OCFS2_I(inode)->ip_clusters) {
-                                mlog_errno(ret);
-                                ocfs2_error(inode->i_sb,
-                                            "Extent %d at e_blkno %llu of inode %llu goes past ip_clusters of %u\n",
-                                            i,
-                                            (unsigned long long)le64_to_cpu(rec->e_blkno),
-                                            (unsigned long long)OCFS2_I(inode)->ip_blkno,
-                                            OCFS2_I(inode)->ip_clusters);
-                                goto out_free;
-                        }
-                        if (rec_end <= cpos) {
-                                ret = ocfs2_extent_map_insert(inode, rec,
-                                                le16_to_cpu(el->l_tree_depth));
-                                if (ret && (ret != -EEXIST)) {
-                                        mlog_errno(ret);
-                                        goto out_free;
-                                }
-                                continue;
-                        }
-                        if ((cpos + clusters) <= le32_to_cpu(rec->e_cpos)) {
-                                ret = ocfs2_extent_map_insert(inode, rec,
-                                                le16_to_cpu(el->l_tree_depth));
-                                if (ret && (ret != -EEXIST)) {
-                                        mlog_errno(ret);
-                                        goto out_free;
-                                }
-                                continue;
-                        }
-                        /*
+        *ret_emi = NULL;
-                         * We've found a record that matches our
-                         * interval.  We don't insert it because we're
-                         * about to traverse it.
-                         */
-                        /* Check to see if we're stradling */
-                        ret = -ESRCH;
-                        if (!ocfs2_extent_rec_contains_clusters(rec,
-                                                                cpos,
-                                                                clusters)) {
-                                mlog_errno(ret);
-                                goto out_free;
-                        }
-                        /*
+        list_for_each_entry(emi, &em->em_list, ei_list) {
-                         * If we've already found a record, the el has
+                range = emi->ei_cpos + emi->ei_clusters;
-                         * two records covering the same interval.
-                         * EEEK!
-                         */
-                        ret = -EBADR;
-                        if (blkno) {
-                                mlog_errno(ret);
-                                ocfs2_error(inode->i_sb,
-                                            "Multiple extents for (cpos = %u, clusters = %u) on inode %llu; e_blkno %llu and rec %d at e_blkno %llu\n",
-                                            cpos, clusters,
-                                            (unsigned long long)OCFS2_I(inode)->ip_blkno,
-                                            (unsigned long long)blkno, i,
-                                            (unsigned long long)le64_to_cpu(rec->e_blkno));
-                                goto out_free;
-                        }
-                        blkno = le64_to_cpu(rec->e_blkno);
+                if (cpos >= emi->ei_cpos && cpos < range) {
-                }
+                        list_move(&emi->ei_list, &em->em_list);
-                /*
+                        *ret_emi = emi;
-                 * We don't support holes, and we're still up
+                        break;
-                 * in the branches, so we'd better have found someone
-                 */
-                ret = -EBADR;
-                if (!blkno) {
-                        ocfs2_error(inode->i_sb,
-                                    "No record found for (cpos = %u, clusters = %u) on inode %llu\n",
-                                    cpos, clusters,
-                                    (unsigned long long)OCFS2_I(inode)->ip_blkno);
-                        mlog_errno(ret);
-                        goto out_free;
-                }
-                if (eb_bh) {
-                        brelse(eb_bh);
-                        eb_bh = NULL;
-                }
-                ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
-                                       blkno, &eb_bh, OCFS2_BH_CACHED,
-                                       inode);
-                if (ret) {
-                        mlog_errno(ret);
-                        goto out_free;
-                }
-                eb = (struct ocfs2_extent_block *)eb_bh->b_data;
-                if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
-                        OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
-                        ret = -EIO;
-                        goto out_free;
                }
-                el = &eb->h_list;
        }
+}
-        BUG_ON(el->l_tree_depth);
+static int ocfs2_extent_map_lookup(struct inode *inode, unsigned int cpos,
+                                   unsigned int *phys, unsigned int *len,
-        for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
+                                   unsigned int *flags)
-                rec = &el->l_recs[i];
+{
+        unsigned int coff;
-                if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) >
+        struct ocfs2_inode_info *oi = OCFS2_I(inode);
-                    OCFS2_I(inode)->ip_clusters) {
+        struct ocfs2_extent_map_item *emi;
-                        ret = -EBADR;
-                        mlog_errno(ret);
+        spin_lock(&oi->ip_lock);
-                        ocfs2_error(inode->i_sb,
-                                    "Extent %d at e_blkno %llu of inode %llu goes past ip_clusters of %u\n",
+        __ocfs2_extent_map_lookup(&oi->ip_extent_map, cpos, &emi);
-                                    i,
+        if (emi) {
-                                    (unsigned long long)le64_to_cpu(rec->e_blkno),
+                coff = cpos - emi->ei_cpos;
-                                    (unsigned long long)OCFS2_I(inode)->ip_blkno,
+                *phys = emi->ei_phys + coff;
-                                    OCFS2_I(inode)->ip_clusters);
+                if (len)
-                        return ret;
+                        *len = emi->ei_clusters - coff;
-                }
+                if (flags)
+                        *flags = emi->ei_flags;
-                ret = ocfs2_extent_map_insert(inode, rec,
-                                              le16_to_cpu(el->l_tree_depth));
-                if (ret && (ret != -EEXIST)) {
-                        mlog_errno(ret);
-                        goto out_free;
-                }
        }
-        ret = 0;
+        spin_unlock(&oi->ip_lock);
-out_free:
+        if (emi == NULL)
-        if (eb_bh)
+                return -ENOENT;
-                brelse(eb_bh);
-        return ret;
+        return 0;
 }
 /*
- * This lookup actually will read from disk.  It has one invariant:
+ * Forget about all clusters equal to or greater than cpos.
- * It will never re-traverse blocks.  This means that all inserts should
- * be new regions or more granular regions (both allowed by insert).
 */
-static int ocfs2_extent_map_lookup_read(struct inode *inode,
+void ocfs2_extent_map_trunc(struct inode *inode, unsigned int cpos)
-                                        u32 cpos,
-                                        u32 clusters,
-                                        struct ocfs2_extent_map_entry **ret_ent)
 {
-        int ret;
+        struct list_head *p, *n;
-        u64 blkno;
+        struct ocfs2_extent_map_item *emi;
-        struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
+        struct ocfs2_inode_info *oi = OCFS2_I(inode);
-        struct ocfs2_extent_map_entry *ent;
+        struct ocfs2_extent_map *em = &oi->ip_extent_map;
-        struct buffer_head *bh = NULL;
+        LIST_HEAD(tmp_list);
-        struct ocfs2_extent_block *eb;
+        unsigned int range;
-        struct ocfs2_dinode *di;
-        struct ocfs2_extent_list *el;
+        spin_lock(&oi->ip_lock);
+        list_for_each_safe(p, n, &em->em_list) {
-        spin_lock(&OCFS2_I(inode)->ip_lock);
+                emi = list_entry(p, struct ocfs2_extent_map_item, ei_list);
-        ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL);
-        if (ent) {
+                if (emi->ei_cpos >= cpos) {
-                if (!ent->e_tree_depth) {
+                        /* Full truncate of this record. */
-                        spin_unlock(&OCFS2_I(inode)->ip_lock);
+                        list_move(&emi->ei_list, &tmp_list);
-                        *ret_ent = ent;
+                        BUG_ON(em->em_num_items == 0);
-                        return 0;
+                        em->em_num_items--;
-                }
+                        continue;
-                blkno = le64_to_cpu(ent->e_rec.e_blkno);
-                spin_unlock(&OCFS2_I(inode)->ip_lock);
-                ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno, &bh,
-                                       OCFS2_BH_CACHED, inode);
-                if (ret) {
-                        mlog_errno(ret);
-                        if (bh)
-                                brelse(bh);
-                        return ret;
                }
-                eb = (struct ocfs2_extent_block *)bh->b_data;
-                if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
-                        OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
-                        brelse(bh);
-                        return -EIO;
-                }
-                el = &eb->h_list;
-        } else {
-                spin_unlock(&OCFS2_I(inode)->ip_lock);
-                ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
+                range = emi->ei_cpos + emi->ei_clusters;
-                                       OCFS2_I(inode)->ip_blkno, &bh,
+                if (range > cpos) {
-                                       OCFS2_BH_CACHED, inode);
+                        /* Partial truncate */
-                if (ret) {
+                        emi->ei_clusters = cpos - emi->ei_cpos;
-                        mlog_errno(ret);
-                        if (bh)
-                                brelse(bh);
-                        return ret;
                }
-                di = (struct ocfs2_dinode *)bh->b_data;
-                if (!OCFS2_IS_VALID_DINODE(di)) {
-                        brelse(bh);
-                        OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, di);
-                        return -EIO;
-                }
-                el = &di->id2.i_list;
-        }
-        ret = ocfs2_extent_map_find_leaf(inode, cpos, clusters, el);
-        brelse(bh);
-        if (ret) {
-                mlog_errno(ret);
-                return ret;
        }
+        spin_unlock(&oi->ip_lock);
-        ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL);
+        list_for_each_safe(p, n, &tmp_list) {
-        if (!ent) {
+                emi = list_entry(p, struct ocfs2_extent_map_item, ei_list);
-                ret = -ESRCH;
+                list_del(&emi->ei_list);
-                mlog_errno(ret);
+                kfree(emi);
-                return ret;
        }
-        /* FIXME: Make sure this isn't a corruption */
-        BUG_ON(ent->e_tree_depth);
-        *ret_ent = ent;
-        return 0;
 }
 /*
- * Callers must hold ip_lock.  This can insert pieces of the tree,
+ * Is any part of emi2 contained within emi1
+ *
+ * Returns 1 when the logical start or the logical end of emi2 falls
+ * inside the logical range of emi1, 0 otherwise. Only the two end
+ * points of emi2 are tested, so 0 is also returned when emi2 strictly
+ * encloses emi1; a caller that wants a full overlap test has to check
+ * both argument orders.
- * thus racing lookup if the lock weren't held.
 */
-static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em,
+static int ocfs2_ei_is_contained(struct ocfs2_extent_map_item *emi1,
-                                         struct ocfs2_extent_map_entry *ent)
+                                 struct ocfs2_extent_map_item *emi2)
 {
-        struct rb_node **p, *parent;
+        unsigned int range1, range2;
-        struct ocfs2_extent_map_entry *old_ent;
-        old_ent = ocfs2_extent_map_lookup(em, le32_to_cpu(ent->e_rec.e_cpos),
+        /*
-                                          le32_to_cpu(ent->e_rec.e_clusters),
+         * Check if logical start of emi2 is inside emi1
-                                          &p, &parent);
+         */
-        if (old_ent)
+        range1 = emi1->ei_cpos + emi1->ei_clusters;
-                return -EEXIST;
+        if (emi2->ei_cpos >= emi1->ei_cpos && emi2->ei_cpos < range1)
+                return 1;
-        rb_link_node(&ent->e_node, parent, p);
+        /*
-        rb_insert_color(&ent->e_node, &em->em_extents);
+         * Check if logical end of emi2 is inside emi1
+         */
+        range2 = emi2->ei_cpos + emi2->ei_clusters;
+        if (range2 > emi1->ei_cpos && range2 <= range1)
+                return 1;
        return 0;
 }
+static void ocfs2_copy_emi_fields(struct ocfs2_extent_map_item *dest,
+                                  struct ocfs2_extent_map_item *src)
+{
+        dest->ei_cpos = src->ei_cpos;
+        dest->ei_phys = src->ei_phys;
+        dest->ei_clusters = src->ei_clusters;
+        dest->ei_flags = src->ei_flags;
+}
 /*
- * Simple rule: on any return code other than -EAGAIN, anything left
+ * Try to merge emi with ins. Returns 1 if merge succeeds, zero
- * in the insert_context will be freed.
+ * otherwise.
+ *
+ * A merge either extends emi, when ins is contiguous with it both
+ * logically (ei_cpos) and physically (ei_phys) and carries the same
+ * ei_flags, or overwrites emi with the fields of ins when the two
+ * logical ranges overlap. Only emi is written; ins is left untouched.
- *
- * Simple rule #2: A return code of -EEXIST from this function or
- * its calls to ocfs2_extent_map_insert_entry() signifies that another
- * thread beat us to the insert.  It is not an actual error, but it
- * tells the caller we have no more work to do.
 */
-static int ocfs2_extent_map_try_insert(struct inode *inode,
+static int ocfs2_try_to_merge_extent_map(struct ocfs2_extent_map_item *emi,
-                                       struct ocfs2_extent_rec *rec,
+                                         struct ocfs2_extent_map_item *ins)
-                                       int tree_depth,
-                                       struct ocfs2_em_insert_context *ctxt)
 {
-        int ret;
-        struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
-        struct ocfs2_extent_map_entry *old_ent;
-        ctxt->need_left = 0;
-        ctxt->need_right = 0;
-        ctxt->old_ent = NULL;
-        spin_lock(&OCFS2_I(inode)->ip_lock);
-        ret = ocfs2_extent_map_insert_entry(em, ctxt->new_ent);
-        if (!ret) {
-                ctxt->new_ent = NULL;
-                goto out_unlock;
-        }
-        /* Since insert_entry failed, the map MUST have old_ent */
-        old_ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos),
-                                          le32_to_cpu(rec->e_clusters),
-                                          NULL, NULL);
-        BUG_ON(!old_ent);
-        if (old_ent->e_tree_depth < tree_depth) {
-                /* Another thread beat us to the lower tree_depth */
-                ret = -EEXIST;
-                goto out_unlock;
-        }
-        if (old_ent->e_tree_depth == tree_depth) {
-                /*
-                 * Another thread beat us to this tree_depth.
-                 * Let's make sure we agree with that thread (the
-                 * extent_rec should be identical).
-                 */
-                if (!memcmp(rec, &old_ent->e_rec,
-                            sizeof(struct ocfs2_extent_rec)))
-                        ret = 0;
-                else
-                        /* FIXME: Should this be ESRCH/EBADR??? */
-                        ret = -EEXIST;
-                goto out_unlock;
-        }
        /*
-         * We do it in this order specifically so that no actual tree
+         * Handle contiguousness
+         *
+         * ins may directly follow emi (first test) or directly precede
+         * it (second test). In both cases the logical offsets (ei_cpos)
+         * and the physical offsets (ei_phys) must each line up with
+         * their own counterpart - a logical end is compared with a
+         * logical start, never with a physical one.
-         * changes occur until we have all the pieces we need.  We
-         * don't want malloc failures to leave an inconsistent tree.
-         * Whenever we drop the lock, another process could be
-         * inserting.  Also note that, if another process just beat us
-         * to an insert, we might not need the same pieces we needed
-         * the first go round.  In the end, the pieces we need will
-         * be used, and the pieces we don't will be freed.
         */
-        ctxt->need_left = !!(le32_to_cpu(rec->e_cpos) >
+        if (ins->ei_phys == (emi->ei_phys + emi->ei_clusters) &&
-                             le32_to_cpu(old_ent->e_rec.e_cpos));
+            ins->ei_cpos == (emi->ei_cpos + emi->ei_clusters) &&
-        ctxt->need_right = !!((le32_to_cpu(old_ent->e_rec.e_cpos) +
+            ins->ei_flags == emi->ei_flags) {
-                               le32_to_cpu(old_ent->e_rec.e_clusters)) >
+                emi->ei_clusters += ins->ei_clusters;
-                              (le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)));
+                return 1;
-        ret = -EAGAIN;
+        } else if ((ins->ei_phys + ins->ei_clusters) == emi->ei_phys &&
-        if (ctxt->need_left) {
+                   (ins->ei_cpos + ins->ei_clusters) == emi->ei_cpos &&
-                if (!ctxt->left_ent)
+                   ins->ei_flags == emi->ei_flags) {
-                        goto out_unlock;
+                emi->ei_phys = ins->ei_phys;
-                *(ctxt->left_ent) = *old_ent;
+                emi->ei_cpos = ins->ei_cpos;
-                ctxt->left_ent->e_rec.e_clusters =
+                emi->ei_clusters += ins->ei_clusters;
-                        cpu_to_le32(le32_to_cpu(rec->e_cpos) -
+                return 1;
-                                    le32_to_cpu(ctxt->left_ent->e_rec.e_cpos));
-        }
-        if (ctxt->need_right) {
-                if (!ctxt->right_ent)
-                        goto out_unlock;
-                *(ctxt->right_ent) = *old_ent;
-                ctxt->right_ent->e_rec.e_cpos =
-                        cpu_to_le32(le32_to_cpu(rec->e_cpos) +
-                                    le32_to_cpu(rec->e_clusters));
-                ctxt->right_ent->e_rec.e_clusters =
-                        cpu_to_le32((le32_to_cpu(old_ent->e_rec.e_cpos) +
-                                     le32_to_cpu(old_ent->e_rec.e_clusters)) -
-                                    le32_to_cpu(ctxt->right_ent->e_rec.e_cpos));
-        }
-        rb_erase(&old_ent->e_node, &em->em_extents);
-        /* Now that he's erased, set him up for deletion */
-        ctxt->old_ent = old_ent;
-        if (ctxt->need_left) {
-                ret = ocfs2_extent_map_insert_entry(em,
-                                                    ctxt->left_ent);
-                if (ret)
-                        goto out_unlock;
-                ctxt->left_ent = NULL;
        }
-        if (ctxt->need_right) {
+        /*
-                ret = ocfs2_extent_map_insert_entry(em,
+         * Overlapping extents - this shouldn't happen unless we've
-                                                    ctxt->right_ent);
+         * split an extent to change its flags. That is exceedingly
-                if (ret)
+         * rare, so there's no sense in trying to optimize it yet.
-                        goto out_unlock;
+         */
-                ctxt->right_ent = NULL;
+        if (ocfs2_ei_is_contained(emi, ins) ||
+            ocfs2_ei_is_contained(ins, emi)) {
+                ocfs2_copy_emi_fields(emi, ins);
+                return 1;
        }
-        ret = ocfs2_extent_map_insert_entry(em, ctxt->new_ent);
+        /* No merge was possible. */
+        return 0;
-        if (!ret)
-                ctxt->new_ent = NULL;
-out_unlock:
-        spin_unlock(&OCFS2_I(inode)->ip_lock);
-        return ret;
 }
+/*
-static int ocfs2_extent_map_insert(struct inode *inode,
+ * In order to reduce complexity on the caller, this insert function
-                                   struct ocfs2_extent_rec *rec,
+ * is intentionally liberal in what it will accept.
-                                   int tree_depth)
+ *
+ * The only rule is that the truncate call *must* be used whenever
+ * records have been deleted. This avoids inserting overlapping
+ * records with different physical mappings.
+ */
+void ocfs2_extent_map_insert_rec(struct inode *inode,
+                                 struct ocfs2_extent_rec *rec)
 {
-        int ret;
+        struct ocfs2_inode_info *oi = OCFS2_I(inode);
-        struct ocfs2_em_insert_context ctxt = {0, };
+        struct ocfs2_extent_map *em = &oi->ip_extent_map;
+        struct ocfs2_extent_map_item *emi, *new_emi = NULL;
-        if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) >
+        struct ocfs2_extent_map_item ins;
-            OCFS2_I(inode)->ip_map.em_clusters) {
-                ret = -EBADR;
+        ins.ei_cpos = le32_to_cpu(rec->e_cpos);
-                mlog_errno(ret);
+        ins.ei_phys = ocfs2_blocks_to_clusters(inode->i_sb,
-                return ret;
+                                               le64_to_cpu(rec->e_blkno));
+        ins.ei_clusters = le16_to_cpu(rec->e_leaf_clusters);
+        ins.ei_flags = rec->e_flags;
+search:
+        spin_lock(&oi->ip_lock);
+        list_for_each_entry(emi, &em->em_list, ei_list) {
+                if (ocfs2_try_to_merge_extent_map(emi, &ins)) {
+                        list_move(&emi->ei_list, &em->em_list);
+                        spin_unlock(&oi->ip_lock);
+                        goto out;
+                }
        }
-        /* Zero e_clusters means a truncated tail record.  It better be EOF */
+        /*
-        if (!rec->e_clusters) {
+         * No item could be merged.
-                if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) !=
+         *
-                    OCFS2_I(inode)->ip_map.em_clusters) {
+         * Either allocate and add a new item, or overwrite the least recently
-                        ret = -EBADR;
+         * inserted.
-                        mlog_errno(ret);
+         */
-                        ocfs2_error(inode->i_sb,
-                                    "Zero e_clusters on non-tail extent record at e_blkno %llu on inode %llu\n",
-                                    (unsigned long long)le64_to_cpu(rec->e_blkno),
-                                    (unsigned long long)OCFS2_I(inode)->ip_blkno);
-                        return ret;
-                }
-                /* Ignore the truncated tail */
+        if (em->em_num_items < OCFS2_MAX_EXTENT_MAP_ITEMS) {
-                return 0;
+                if (new_emi == NULL) {
-        }
+                        spin_unlock(&oi->ip_lock);
-        ret = -ENOMEM;
+                        new_emi = kmalloc(sizeof(*new_emi), GFP_NOFS);
-        ctxt.new_ent = kmem_cache_alloc(ocfs2_em_ent_cachep,
+                        if (new_emi == NULL)
-                                        GFP_NOFS);
+                                goto out;
-        if (!ctxt.new_ent) {
-                mlog_errno(ret);
-                return ret;
-        }
-        ctxt.new_ent->e_rec = *rec;
+                        goto search;
-        ctxt.new_ent->e_tree_depth = tree_depth;
-        do {
-                ret = -ENOMEM;
-                if (ctxt.need_left && !ctxt.left_ent) {
-                        ctxt.left_ent =
-                                kmem_cache_alloc(ocfs2_em_ent_cachep,
-                                                 GFP_NOFS);
-                        if (!ctxt.left_ent)
-                                break;
-                }
-                if (ctxt.need_right && !ctxt.right_ent) {
-                        ctxt.right_ent =
-                                kmem_cache_alloc(ocfs2_em_ent_cachep,
-                                                 GFP_NOFS);
-                        if (!ctxt.right_ent)
-                                break;
                }
-                ret = ocfs2_extent_map_try_insert(inode, rec,
+                ocfs2_copy_emi_fields(new_emi, &ins);
-                                                  tree_depth, &ctxt);
+                list_add(&new_emi->ei_list, &em->em_list);
-        } while (ret == -EAGAIN);
+                em->em_num_items++;
+                new_emi = NULL;
-        if ((ret < 0) && (ret != -EEXIST))
+        } else {
-                mlog_errno(ret);
+                BUG_ON(list_empty(&em->em_list) || em->em_num_items == 0);
+                emi = list_entry(em->em_list.prev,
+                                 struct ocfs2_extent_map_item, ei_list);
+                list_move(&emi->ei_list, &em->em_list);
+                ocfs2_copy_emi_fields(emi, &ins);
+        }
-        if (ctxt.left_ent)
+        spin_unlock(&oi->ip_lock);
-                kmem_cache_free(ocfs2_em_ent_cachep, ctxt.left_ent);
-        if (ctxt.right_ent)
-                kmem_cache_free(ocfs2_em_ent_cachep, ctxt.right_ent);
-        if (ctxt.old_ent)
-                kmem_cache_free(ocfs2_em_ent_cachep, ctxt.old_ent);
-        if (ctxt.new_ent)
-                kmem_cache_free(ocfs2_em_ent_cachep, ctxt.new_ent);
-        return ret;
+out:
+        if (new_emi)
+                kfree(new_emi);
 }
 /*
- * Append this record to the tail of the extent map.  It must be
+ * Return the 1st index within el which contains an extent start
- * tree_depth 0.  The record might be an extension of an existing
+ * larger than v_cluster.
- * record, and as such that needs to be handled.  eg:
- *
- * Existing record in the extent map:
- *
- *      cpos = 10, len = 10
- *      |---------|
- *
- * New Record:
- *
- *      cpos = 10, len = 20
- *      |------------------|
- *
- * The passed record is the new on-disk record.  The new_clusters value
- * is how many clusters were added to the file.  If the append is a
- * contiguous append, the new_clusters has been added to
- * rec->e_clusters.  If the append is an entirely new extent, then
- * rec->e_clusters is == new_clusters.
 */
-int ocfs2_extent_map_append(struct inode *inode,
+static int ocfs2_search_for_hole_index(struct ocfs2_extent_list *el,
-                            struct ocfs2_extent_rec *rec,
+                                       u32 v_cluster)
-                            u32 new_clusters)
 {
-        int ret;
+        int i;
-        struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
+        struct ocfs2_extent_rec *rec;
-        struct ocfs2_extent_map_entry *ent;
-        struct ocfs2_extent_rec *old;
-        BUG_ON(!new_clusters);
-        BUG_ON(le32_to_cpu(rec->e_clusters) < new_clusters);
-        if (em->em_clusters < OCFS2_I(inode)->ip_clusters) {
+        for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
-                /*
+                rec = &el->l_recs[i];
-                 * Size changed underneath us on disk.  Drop any
-                 * straddling records and update our idea of
-                 * i_clusters
-                 */
-                ocfs2_extent_map_drop(inode, em->em_clusters - 1);
-                em->em_clusters = OCFS2_I(inode)->ip_clusters;
-        }
-        mlog_bug_on_msg((le32_to_cpu(rec->e_cpos) +
+                if (v_cluster < le32_to_cpu(rec->e_cpos))
-                         le32_to_cpu(rec->e_clusters)) !=
+                        break;
-                        (em->em_clusters + new_clusters),
-                        "Inode %llu:\n"
-                        "rec->e_cpos = %u + rec->e_clusters = %u = %u\n"
-                        "em->em_clusters = %u + new_clusters = %u = %u\n",
-                        (unsigned long long)OCFS2_I(inode)->ip_blkno,
-                        le32_to_cpu(rec->e_cpos), le32_to_cpu(rec->e_clusters),
-                        le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters),
-                        em->em_clusters, new_clusters,
-                        em->em_clusters + new_clusters);
-        em->em_clusters += new_clusters;
-        ret = -ENOENT;
-        if (le32_to_cpu(rec->e_clusters) > new_clusters) {
-                /* This is a contiguous append */
-                ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos), 1,
-                                              NULL, NULL);
-                if (ent) {
-                        old = &ent->e_rec;
-                        BUG_ON((le32_to_cpu(rec->e_cpos) +
-                                le32_to_cpu(rec->e_clusters)) !=
-                                 (le32_to_cpu(old->e_cpos) +
-                                  le32_to_cpu(old->e_clusters) +
-                                  new_clusters));
-                        if (ent->e_tree_depth == 0) {
-                                BUG_ON(le32_to_cpu(old->e_cpos) !=
-                                       le32_to_cpu(rec->e_cpos));
-                                BUG_ON(le64_to_cpu(old->e_blkno) !=
-                                       le64_to_cpu(rec->e_blkno));
-                                ret = 0;
-                        }
-                        /*
-                         * Let non-leafs fall through as -ENOENT to
-                         * force insertion of the new leaf.
-                         */
-                        le32_add_cpu(&old->e_clusters, new_clusters);
-                }
        }
-        if (ret == -ENOENT)
+        return i;
-                ret = ocfs2_extent_map_insert(inode, rec, 0);
-        if (ret < 0)
-                mlog_errno(ret);
-        return ret;
 }
-#if 0
-/* Code here is included but defined out as it completes the extent
- * map api and may be used in the future. */
 /*
- * Look up the record containing this cluster offset.  This record is
+ * Figure out the size of a hole which starts at v_cluster within the given
- * part of the extent map.  Do not free it.  Any changes you make to
+ * extent list.
- * it will reflect in the extent map.  So, if your last extent
- * is (cpos = 10, clusters = 10) and you truncate the file by 5
- * clusters, you can do:
 *
- * ret = ocfs2_extent_map_get_rec(em, orig_size - 5, &rec);
+ * If there is no more allocation past v_cluster, we return the maximum
- * rec->e_clusters -= 5;
+ * cluster size minus v_cluster.
 *
- * The lookup does not read from disk.  If the map isn't filled in for
+ * If we have in-inode extents, then el points to the dinode list and
- * an entry, you won't find it.
+ * eb_bh is NULL. Otherwise, eb_bh should point to the extent block
- *
+ * containing el.
- * Also note that the returned record is valid until alloc_sem is
- * dropped.  After that, truncate and extend can happen.  Caveat Emptor.
 */
-int ocfs2_extent_map_get_rec(struct inode *inode, u32 cpos,
+static int ocfs2_figure_hole_clusters(struct inode *inode,
-                             struct ocfs2_extent_rec **rec,
+                                      struct ocfs2_extent_list *el,
-                             int *tree_depth)
+                                      struct buffer_head *eb_bh,
+                                      u32 v_cluster,
+                                      u32 *num_clusters)
 {
-        int ret = -ENOENT;
+        int ret, i;
-        struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
+        struct buffer_head *next_eb_bh = NULL;
-        struct ocfs2_extent_map_entry *ent;
+        struct ocfs2_extent_block *eb, *next_eb;
-        *rec = NULL;
+        i = ocfs2_search_for_hole_index(el, v_cluster);
-        if (cpos >= OCFS2_I(inode)->ip_clusters)
+        if (i == le16_to_cpu(el->l_next_free_rec) && eb_bh) {
-                return -EINVAL;
+                eb = (struct ocfs2_extent_block *)eb_bh->b_data;
-        if (cpos >= em->em_clusters) {
                /*
-                 * Size changed underneath us on disk.  Drop any
+                 * Check the next leaf for any extents.
-                 * straddling records and update our idea of
-                 * i_clusters
                 */
-                ocfs2_extent_map_drop(inode, em->em_clusters - 1);
-                em->em_clusters = OCFS2_I(inode)->ip_clusters ;
-        }
-        ent = ocfs2_extent_map_lookup(&OCFS2_I(inode)->ip_map, cpos, 1,
-                                      NULL, NULL);
-        if (ent) {
+                if (le64_to_cpu(eb->h_next_leaf_blk) == 0ULL)
-                *rec = &ent->e_rec;
+                        goto no_more_extents;
-                if (tree_depth)
-                        *tree_depth = ent->e_tree_depth;
-                ret = 0;
-        }
-        return ret;
+                ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
-}
+                                       le64_to_cpu(eb->h_next_leaf_blk),
+                                       &next_eb_bh, OCFS2_BH_CACHED, inode);
+                if (ret) {
+                        mlog_errno(ret);
+                        goto out;
+                }
+                next_eb = (struct ocfs2_extent_block *)next_eb_bh->b_data;
-int ocfs2_extent_map_get_clusters(struct inode *inode,
+                if (!OCFS2_IS_VALID_EXTENT_BLOCK(next_eb)) {
-                                  u32 v_cpos, int count,
+                        ret = -EROFS;
-                                  u32 *p_cpos, int *ret_count)
+                        OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, next_eb);
-{
+                        goto out;
-        int ret;
+                }
-        u32 coff, ccount;
-        struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
-        struct ocfs2_extent_map_entry *ent = NULL;
-        *p_cpos = ccount = 0;
+                el = &next_eb->h_list;
-        if ((v_cpos + count) > OCFS2_I(inode)->ip_clusters)
+                i = ocfs2_search_for_hole_index(el, v_cluster);
-                return -EINVAL;
+        }
-        if ((v_cpos + count) > em->em_clusters) {
+no_more_extents:
+        if (i == le16_to_cpu(el->l_next_free_rec)) {
                /*
-                 * Size changed underneath us on disk.  Drop any
+                 * We're at the end of our existing allocation. Just
-                 * straddling records and update our idea of
+                 * return the maximum number of clusters we could
-                 * i_clusters
+                 * possibly allocate.
                 */
-                ocfs2_extent_map_drop(inode, em->em_clusters - 1);
+                *num_clusters = UINT_MAX - v_cluster;
-                em->em_clusters = OCFS2_I(inode)->ip_clusters;
+        } else {
+                *num_clusters = le32_to_cpu(el->l_recs[i].e_cpos) - v_cluster;
        }
+        ret = 0;
+out:
+        brelse(next_eb_bh);
+        return ret;
+}
-        ret = ocfs2_extent_map_lookup_read(inode, v_cpos, count, &ent);
+/*
-        if (ret)
+ * Return the index of the extent record which contains cluster #v_cluster.
-                return ret;
+ * -1 is returned if it was not found.
+ *
+ * Should work fine on interior and exterior nodes.
+ */
+static int ocfs2_search_extent_list(struct ocfs2_extent_list *el,
+                                    u32 v_cluster)
+{
+        int ret = -1;
+        int i;
+        struct ocfs2_extent_rec *rec;
+        u32 rec_end, rec_start, clusters;
-        if (ent) {
+        for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
-                /* We should never find ourselves straddling an interval */
+                rec = &el->l_recs[i];
-                if (!ocfs2_extent_rec_contains_clusters(&ent->e_rec,
-                                                        v_cpos,
-                                                        count))
-                        return -ESRCH;
-                coff = v_cpos - le32_to_cpu(ent->e_rec.e_cpos);
+                rec_start = le32_to_cpu(rec->e_cpos);
-                *p_cpos = ocfs2_blocks_to_clusters(inode->i_sb,
+                clusters = ocfs2_rec_clusters(el, rec);
-                                le64_to_cpu(ent->e_rec.e_blkno)) +
-                          coff;
-                if (ret_count)
+                rec_end = rec_start + clusters;
-                        *ret_count = le32_to_cpu(ent->e_rec.e_clusters) - coff;
-                return 0;
+                if (v_cluster >= rec_start && v_cluster < rec_end) {
+                        ret = i;
+                        break;
+                }
        }
+        return ret;
-        return -ENOENT;
 }
-#endif  /*  0  */
+int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
+                       u32 *p_cluster, u32 *num_clusters,
-int ocfs2_extent_map_get_blocks(struct inode *inode,
+                       unsigned int *extent_flags)
-                                u64 v_blkno, int count,
-                                u64 *p_blkno, int *ret_count)
 {
-        int ret;
+        int ret, i;
-        u64 boff;
+        unsigned int flags = 0;
-        u32 cpos, clusters;
+        struct buffer_head *di_bh = NULL;
-        int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
+        struct buffer_head *eb_bh = NULL;
-        struct ocfs2_extent_map_entry *ent = NULL;
+        struct ocfs2_dinode *di;
-        struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
+        struct ocfs2_extent_block *eb;
+        struct ocfs2_extent_list *el;
        struct ocfs2_extent_rec *rec;
+        u32 coff;
-        *p_blkno = 0;
+        ret = ocfs2_extent_map_lookup(inode, v_cluster, p_cluster,
+                                      num_clusters, extent_flags);
-        cpos = ocfs2_blocks_to_clusters(inode->i_sb, v_blkno);
+        if (ret == 0)
-        clusters = ocfs2_blocks_to_clusters(inode->i_sb,
+                goto out;
-                                            (u64)count + bpc - 1);
-        if ((cpos + clusters) > OCFS2_I(inode)->ip_clusters) {
-                ret = -EINVAL;
-                mlog_errno(ret);
-                return ret;
-        }
-        if ((cpos + clusters) > em->em_clusters) {
-                /*
-                 * Size changed underneath us on disk.  Drop any
-                 * straddling records and update our idea of
-                 * i_clusters
-                 */
-                ocfs2_extent_map_drop(inode, em->em_clusters - 1);
-                em->em_clusters = OCFS2_I(inode)->ip_clusters;
-        }
-        ret = ocfs2_extent_map_lookup_read(inode, cpos, clusters, &ent);
+        ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), OCFS2_I(inode)->ip_blkno,
+                               &di_bh, OCFS2_BH_CACHED, inode);
        if (ret) {
                mlog_errno(ret);
-                return ret;
+                goto out;
        }
-        if (ent)
+        di = (struct ocfs2_dinode *) di_bh->b_data;
-        {
+        el = &di->id2.i_list;
-                rec = &ent->e_rec;
-                /* We should never find ourselves straddling an interval */
+        if (el->l_tree_depth) {
-                if (!ocfs2_extent_rec_contains_clusters(rec, cpos, clusters)) {
+                ret = ocfs2_find_leaf(inode, el, v_cluster, &eb_bh);
-                        ret = -ESRCH;
+                if (ret) {
                        mlog_errno(ret);
-                        return ret;
+                        goto out;
                }
-                boff = ocfs2_clusters_to_blocks(inode->i_sb, cpos -
+                eb = (struct ocfs2_extent_block *) eb_bh->b_data;
-                                                le32_to_cpu(rec->e_cpos));
+                el = &eb->h_list;
-                boff += (v_blkno & (u64)(bpc - 1));
-                *p_blkno = le64_to_cpu(rec->e_blkno) + boff;
-                if (ret_count) {
+                if (el->l_tree_depth) {
-                        *ret_count = ocfs2_clusters_to_blocks(inode->i_sb,
+                        ocfs2_error(inode->i_sb,
-                                        le32_to_cpu(rec->e_clusters)) - boff;
+                                    "Inode %lu has non zero tree depth in "
+                                    "leaf block %llu\n", inode->i_ino,
+                                    (unsigned long long)eb_bh->b_blocknr);
+                        ret = -EROFS;
+                        goto out;
                }
-                return 0;
        }
-        return -ENOENT;
+        i = ocfs2_search_extent_list(el, v_cluster);
-}
+        if (i == -1) {
+                /*
-int ocfs2_extent_map_init(struct inode *inode)
+                 * A hole was found. Return some canned values that
-{
+                 * callers can key on. If asked for, num_clusters will
-        struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
+                 * be populated with the size of the hole.
+                 */
-        em->em_extents = RB_ROOT;
+                *p_cluster = 0;
-        em->em_clusters = 0;
+                if (num_clusters) {
+                        ret = ocfs2_figure_hole_clusters(inode, el, eb_bh,
-        return 0;
+                                                         v_cluster,
-}
+                                                         num_clusters);
+                        if (ret) {
-/* Needs the lock */
+                                mlog_errno(ret);
-static void __ocfs2_extent_map_drop(struct inode *inode,
+                                goto out;
-                                    u32 new_clusters,
+                        }
-                                    struct rb_node **free_head,
+                }
-                                    struct ocfs2_extent_map_entry **tail_ent)
+        } else {
-{
+                rec = &el->l_recs[i];
-        struct rb_node *node, *next;
-        struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
-        struct ocfs2_extent_map_entry *ent;
-        *free_head = NULL;
+                BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos));
-        ent = NULL;
+                if (!rec->e_blkno) {
-        node = rb_last(&em->em_extents);
+                        ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
-        while (node)
+                                    "record (%u, %u, 0)", inode->i_ino,
-        {
+                                    le32_to_cpu(rec->e_cpos),
-                next = rb_prev(node);
+                                    ocfs2_rec_clusters(el, rec));
+                        ret = -EROFS;
+                        goto out;
+                }
-                ent = rb_entry(node, struct ocfs2_extent_map_entry,
+                coff = v_cluster - le32_to_cpu(rec->e_cpos);
-                               e_node);
-                if (le32_to_cpu(ent->e_rec.e_cpos) < new_clusters)
-                        break;
-                rb_erase(&ent->e_node, &em->em_extents);
+                *p_cluster = ocfs2_blocks_to_clusters(inode->i_sb,
+                                                    le64_to_cpu(rec->e_blkno));
+                *p_cluster = *p_cluster + coff;
-                node->rb_right = *free_head;
+                if (num_clusters)
-                *free_head = node;
+                        *num_clusters = ocfs2_rec_clusters(el, rec) - coff;
-                ent = NULL;
+                flags = rec->e_flags;
-                node = next;
-        }
-        /* Do we have an entry straddling new_clusters? */
+                ocfs2_extent_map_insert_rec(inode, rec);
-        if (tail_ent) {
-                if (ent &&
-                    ((le32_to_cpu(ent->e_rec.e_cpos) +
-                      le32_to_cpu(ent->e_rec.e_clusters)) > new_clusters))
-                        *tail_ent = ent;
-                else
-                        *tail_ent = NULL;
        }
-}
-static void __ocfs2_extent_map_drop_cleanup(struct rb_node *free_head)
-{
-        struct rb_node *node;
-        struct ocfs2_extent_map_entry *ent;
-        while (free_head) {
+        if (extent_flags)
-                node = free_head;
+                *extent_flags = flags;
-                free_head = node->rb_right;
-                ent = rb_entry(node, struct ocfs2_extent_map_entry,
+out:
-                               e_node);
+        brelse(di_bh);
-                kmem_cache_free(ocfs2_em_ent_cachep, ent);
+        brelse(eb_bh);
-        }
+        return ret;
 }
 /*
- * Remove all entries past new_clusters, inclusive of an entry that
+ * This expects alloc_sem to be held. The allocation cannot change at
- * contains new_clusters.  This is effectively a cache forget.
+ * all while the map is in the process of being updated.
- *
- * If you want to also clip the last extent by some number of clusters,
- * you need to call ocfs2_extent_map_trunc().
- * This code does not check or modify ip_clusters.
 */
-int ocfs2_extent_map_drop(struct inode *inode, u32 new_clusters)
+int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno,
+                                u64 *ret_count, unsigned int *extent_flags)
 {
-        struct rb_node *free_head = NULL;
+        int ret;
-        struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
+        int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
-        struct ocfs2_extent_map_entry *ent;
+        u32 cpos, num_clusters, p_cluster;
+        u64 boff = 0;
-        spin_lock(&OCFS2_I(inode)->ip_lock);
-        __ocfs2_extent_map_drop(inode, new_clusters, &free_head, &ent);
+        cpos = ocfs2_blocks_to_clusters(inode->i_sb, v_blkno);
-        if (ent) {
+        ret = ocfs2_get_clusters(inode, cpos, &p_cluster, &num_clusters,
-                rb_erase(&ent->e_node, &em->em_extents);
+                                 extent_flags);
-                ent->e_node.rb_right = free_head;
+        if (ret) {
-                free_head = &ent->e_node;
+                mlog_errno(ret);
+                goto out;
        }
-        spin_unlock(&OCFS2_I(inode)->ip_lock);
+        /*
+         * p_cluster == 0 indicates a hole.
-        if (free_head)
+         */
-                __ocfs2_extent_map_drop_cleanup(free_head);
+        if (p_cluster) {
+                boff = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
-        return 0;
+                boff += (v_blkno & (u64)(bpc - 1));
-}
+        }
-/*
- * Remove all entries past new_clusters and also clip any extent
- * straddling new_clusters, if there is one.  This does not check
- * or modify ip_clusters
- */
-int ocfs2_extent_map_trunc(struct inode *inode, u32 new_clusters)
-{
-        struct rb_node *free_head = NULL;
-        struct ocfs2_extent_map_entry *ent = NULL;
-        spin_lock(&OCFS2_I(inode)->ip_lock);
-        __ocfs2_extent_map_drop(inode, new_clusters, &free_head, &ent);
-        if (ent)
-                ent->e_rec.e_clusters = cpu_to_le32(new_clusters -
-                                               le32_to_cpu(ent->e_rec.e_cpos));
-        OCFS2_I(inode)->ip_map.em_clusters = new_clusters;
-        spin_unlock(&OCFS2_I(inode)->ip_lock);
-        if (free_head)
-                __ocfs2_extent_map_drop_cleanup(free_head);
-        return 0;
-}
-int __init init_ocfs2_extent_maps(void)
+        *p_blkno = boff;
-{
-        ocfs2_em_ent_cachep =
-                kmem_cache_create("ocfs2_em_ent",
-                                  sizeof(struct ocfs2_extent_map_entry),
-                                  0, SLAB_HWCACHE_ALIGN, NULL, NULL);
-        if (!ocfs2_em_ent_cachep)
-                return -ENOMEM;
-        return 0;
+        if (ret_count) {
-}
+                *ret_count = ocfs2_clusters_to_blocks(inode->i_sb, num_clusters);
+                *ret_count -= v_blkno & (u64)(bpc - 1);
+        }
-void exit_ocfs2_extent_maps(void)
+out:
-{
+        return ret;
-        kmem_cache_destroy(ocfs2_em_ent_cachep);
 }
diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h
index fa3745efa886..de91e3e41a22 100644
--- a/fs/ocfs2/extent_map.h
+++ b/fs/ocfs2/extent_map.h
@@ -25,22 +25,29 @@
 #ifndef _EXTENT_MAP_H
 #define _EXTENT_MAP_H
-int init_ocfs2_extent_maps(void);
+struct ocfs2_extent_map_item {
-void exit_ocfs2_extent_maps(void);
+        unsigned int                    ei_cpos;
+        unsigned int                    ei_phys;
+        unsigned int                    ei_clusters;
+        unsigned int                    ei_flags;
-/*
+        struct list_head                ei_list;
- * EVERY CALL here except _init, _trunc, and _drop expects alloc_sem
+};
- * to be held.  The allocation cannot change at all while the map is
- * in the process of being updated.
+#define OCFS2_MAX_EXTENT_MAP_ITEMS                      3
- */
+struct ocfs2_extent_map {
-int ocfs2_extent_map_init(struct inode *inode);
+        unsigned int                    em_num_items;
-int ocfs2_extent_map_append(struct inode *inode,
+        struct list_head                em_list;
-                            struct ocfs2_extent_rec *rec,
+};
-                            u32 new_clusters);
-int ocfs2_extent_map_get_blocks(struct inode *inode,
+void ocfs2_extent_map_init(struct inode *inode);
-                                u64 v_blkno, int count,
+void ocfs2_extent_map_trunc(struct inode *inode, unsigned int cluster);
-                                u64 *p_blkno, int *ret_count);
+void ocfs2_extent_map_insert_rec(struct inode *inode,
-int ocfs2_extent_map_drop(struct inode *inode, u32 new_clusters);
+                                 struct ocfs2_extent_rec *rec);
-int ocfs2_extent_map_trunc(struct inode *inode, u32 new_clusters);
+int ocfs2_get_clusters(struct inode *inode, u32 v_cluster, u32 *p_cluster,
+                       u32 *num_clusters, unsigned int *extent_flags);
+int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno,
+                                u64 *ret_count, unsigned int *extent_flags);
 #endif  /* _EXTENT_MAP_H */
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index f2cd3bf9efb2..520a2a6d7670 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -33,6 +33,7 @@
 #include <linux/sched.h>
 #include <linux/pipe_fs_i.h>
 #include <linux/mount.h>
+#include <linux/writeback.h>
 #define MLOG_MASK_PREFIX ML_INODE
 #include <cluster/masklog.h>
@@ -215,7 +216,7 @@ int ocfs2_set_inode_size(handle_t *handle,
        mlog_entry_void();
        i_size_write(inode, new_i_size);
-        inode->i_blocks = ocfs2_align_bytes_to_sectors(new_i_size);
+        inode->i_blocks = ocfs2_inode_sector_count(inode);
        inode->i_ctime = inode->i_mtime = CURRENT_TIME;
        status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
@@ -261,6 +262,7 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
 {
        int status;
        handle_t *handle;
+        struct ocfs2_dinode *di;
        mlog_entry_void();
@@ -274,12 +276,39 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
                goto out;
        }
-        status = ocfs2_set_inode_size(handle, inode, fe_bh, new_i_size);
+        status = ocfs2_journal_access(handle, inode, fe_bh,
+                                      OCFS2_JOURNAL_ACCESS_WRITE);
+        if (status < 0) {
+                mlog_errno(status);
+                goto out_commit;
+        }
+        /*
+         * Do this before setting i_size.
+         */
+        status = ocfs2_zero_tail_for_truncate(inode, handle, new_i_size);
+        if (status) {
+                mlog_errno(status);
+                goto out_commit;
+        }
+        i_size_write(inode, new_i_size);
+        inode->i_blocks = ocfs2_align_bytes_to_sectors(new_i_size);
+        inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+        di = (struct ocfs2_dinode *) fe_bh->b_data;
+        di->i_size = cpu_to_le64(new_i_size);
+        di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
+        di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+        status = ocfs2_journal_dirty(handle, fe_bh);
        if (status < 0)
                mlog_errno(status);
+out_commit:
        ocfs2_commit_trans(osb, handle);
 out:
        mlog_exit(status);
        return status;
 }
@@ -342,19 +371,6 @@ static int ocfs2_truncate_file(struct inode *inode,
                mlog_errno(status);
                goto bail;
        }
-        ocfs2_data_unlock(inode, 1);
-        if (le32_to_cpu(fe->i_clusters) ==
-            ocfs2_clusters_for_bytes(osb->sb, new_i_size)) {
-                mlog(0, "fe->i_clusters = %u, so we do a simple truncate\n",
-                     fe->i_clusters);
-                /* No allocation change is required, so lets fast path
-                 * this truncate. */
-                status = ocfs2_simple_size_update(inode, di_bh, new_i_size);
-                if (status < 0)
-                        mlog_errno(status);
-                goto bail;
-        }
        /* alright, we're going to need to do a full blown alloc size
         * change. Orphan the inode so that recovery can complete the
@@ -363,22 +379,25 @@ static int ocfs2_truncate_file(struct inode *inode,
        status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size);
        if (status < 0) {
                mlog_errno(status);
-                goto bail;
+                goto bail_unlock_data;
        }
        status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc);
        if (status < 0) {
                mlog_errno(status);
-                goto bail;
+                goto bail_unlock_data;
        }
        status = ocfs2_commit_truncate(osb, inode, di_bh, tc);
        if (status < 0) {
                mlog_errno(status);
-                goto bail;
+                goto bail_unlock_data;
        }
        /* TODO: orphan dir cleanup here. */
+bail_unlock_data:
+        ocfs2_data_unlock(inode, 1);
 bail:
        mlog_exit(status);
@@ -397,6 +416,7 @@ bail:
 */
 int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
                               struct inode *inode,
+                               u32 *logical_offset,
                               u32 clusters_to_add,
                               struct buffer_head *fe_bh,
                               handle_t *handle,
@@ -460,18 +480,14 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
        block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
        mlog(0, "Allocating %u clusters at block %u for inode %llu\n",
             num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
-        status = ocfs2_insert_extent(osb, handle, inode, fe_bh, block,
+        status = ocfs2_insert_extent(osb, handle, inode, fe_bh,
-                                     num_bits, meta_ac);
+                                     *logical_offset, block, num_bits,
+                                     meta_ac);
        if (status < 0) {
                mlog_errno(status);
                goto leave;
        }
-        le32_add_cpu(&fe->i_clusters, num_bits);
-        spin_lock(&OCFS2_I(inode)->ip_lock);
-        OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
-        spin_unlock(&OCFS2_I(inode)->ip_lock);
        status = ocfs2_journal_dirty(handle, fe_bh);
        if (status < 0) {
                mlog_errno(status);
@@ -479,6 +495,7 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
        }
        clusters_to_add -= num_bits;
+        *logical_offset += num_bits;
        if (clusters_to_add) {
                mlog(0, "need to alloc once more, clusters = %u, wanted = "
@@ -494,14 +511,87 @@ leave:
        return status;
 }
+/*
+ * For a given allocation, determine which allocators will need to be
+ * accessed, and lock them, reserving the appropriate number of bits.
+ *
+ * Called from ocfs2_extend_allocation() for file systems which don't
+ * support holes, and from ocfs2_write() for file systems which
+ * understand sparse inodes.
+ */
+int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
+                          u32 clusters_to_add,
+                          struct ocfs2_alloc_context **data_ac,
+                          struct ocfs2_alloc_context **meta_ac)
+{
+        int ret, num_free_extents;
+        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+        *meta_ac = NULL;
+        *data_ac = NULL;
+        mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, "
+             "clusters_to_add = %u\n",
+             (unsigned long long)OCFS2_I(inode)->ip_blkno, i_size_read(inode),
+             le32_to_cpu(di->i_clusters), clusters_to_add);
+        num_free_extents = ocfs2_num_free_extents(osb, inode, di);
+        if (num_free_extents < 0) {
+                ret = num_free_extents;
+                mlog_errno(ret);
+                goto out;
+        }
+        /*
+         * Sparse allocation file systems need to be more conservative
+         * with reserving room for expansion - the actual allocation
+         * happens while we've got a journal handle open so re-taking
+         * a cluster lock (because we ran out of room for another
+         * extent) will violate ordering rules.
+         *
+         * Most of the time we'll only be seeing this 1 cluster at a time
+         * anyway.
+         */
+        if (!num_free_extents ||
+            (ocfs2_sparse_alloc(osb) && num_free_extents < clusters_to_add)) {
+                ret = ocfs2_reserve_new_metadata(osb, di, meta_ac);
+                if (ret < 0) {
+                        if (ret != -ENOSPC)
+                                mlog_errno(ret);
+                        goto out;
+                }
+        }
+        ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac);
+        if (ret < 0) {
+                if (ret != -ENOSPC)
+                        mlog_errno(ret);
+                goto out;
+        }
+out:
+        if (ret) {
+                if (*meta_ac) {
+                        ocfs2_free_alloc_context(*meta_ac);
+                        *meta_ac = NULL;
+                }
+                /*
+                 * We cannot have an error and a non null *data_ac.
+                 */
+        }
+        return ret;
+}
 static int ocfs2_extend_allocation(struct inode *inode,
                                   u32 clusters_to_add)
 {
        int status = 0;
        int restart_func = 0;
        int drop_alloc_sem = 0;
-        int credits, num_free_extents;
+        int credits;
-        u32 prev_clusters;
+        u32 prev_clusters, logical_start;
        struct buffer_head *bh = NULL;
        struct ocfs2_dinode *fe = NULL;
        handle_t *handle = NULL;
@@ -512,6 +602,12 @@ static int ocfs2_extend_allocation(struct inode *inode,
        mlog_entry("(clusters_to_add = %u)\n", clusters_to_add);
+        /*
+         * This function only exists for file systems which don't
+         * support holes.
+         */
+        BUG_ON(ocfs2_sparse_alloc(osb));
        status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh,
                                  OCFS2_BH_CACHED, inode);
        if (status < 0) {
@@ -526,39 +622,11 @@ static int ocfs2_extend_allocation(struct inode *inode,
                goto leave;
        }
+        logical_start = OCFS2_I(inode)->ip_clusters;
 restart_all:
        BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);
-        mlog(0, "extend inode %llu, i_size = %lld, fe->i_clusters = %u, "
-             "clusters_to_add = %u\n",
-             (unsigned long long)OCFS2_I(inode)->ip_blkno, i_size_read(inode),
-             fe->i_clusters, clusters_to_add);
-        num_free_extents = ocfs2_num_free_extents(osb,
-                                                  inode,
-                                                  fe);
-        if (num_free_extents < 0) {
-                status = num_free_extents;
-                mlog_errno(status);
-                goto leave;
-        }
-        if (!num_free_extents) {
-                status = ocfs2_reserve_new_metadata(osb, fe, &meta_ac);
-                if (status < 0) {
-                        if (status != -ENOSPC)
-                                mlog_errno(status);
-                        goto leave;
-                }
-        }
-        status = ocfs2_reserve_clusters(osb, clusters_to_add, &data_ac);
-        if (status < 0) {
-                if (status != -ENOSPC)
-                        mlog_errno(status);
-                goto leave;
-        }
        /* blocks peope in read/write from reading our allocation
         * until we're done changing it. We depend on i_mutex to block
         * other extend/truncate calls while we're here. Ordering wrt
@@ -566,6 +634,13 @@ restart_all:
        down_write(&OCFS2_I(inode)->ip_alloc_sem);
        drop_alloc_sem = 1;
+        status = ocfs2_lock_allocators(inode, fe, clusters_to_add, &data_ac,
+                                       &meta_ac);
+        if (status) {
+                mlog_errno(status);
+                goto leave;
+        }
        credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add);
        handle = ocfs2_start_trans(osb, credits);
        if (IS_ERR(handle)) {
@@ -590,6 +665,7 @@ restarted_transaction:
        status = ocfs2_do_extend_allocation(osb,
                                            inode,
+                                            &logical_start,
                                            clusters_to_add,
                                            bh,
                                            handle,
@@ -778,7 +854,7 @@ static int ocfs2_extend_file(struct inode *inode,
                             size_t tail_to_skip)
 {
        int ret = 0;
-        u32 clusters_to_add;
+        u32 clusters_to_add = 0;
        BUG_ON(!tail_to_skip && !di_bh);
@@ -790,6 +866,11 @@ static int ocfs2_extend_file(struct inode *inode,
                goto out;
        BUG_ON(new_i_size < i_size_read(inode));
+        if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) {
+                BUG_ON(tail_to_skip != 0);
+                goto out_update_size;
+        }
        clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size) - 
                OCFS2_I(inode)->ip_clusters;
@@ -825,6 +906,7 @@ static int ocfs2_extend_file(struct inode *inode,
                goto out_unlock;
        }
+out_update_size:
        if (!tail_to_skip) {
                /* We're being called from ocfs2_setattr() which wants
                 * us to update i_size */
@@ -834,7 +916,8 @@ static int ocfs2_extend_file(struct inode *inode,
        }
 out_unlock:
-        ocfs2_data_unlock(inode, 1);
+        if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
+                ocfs2_data_unlock(inode, 1);
 out:
        return ret;
@@ -972,7 +1055,8 @@ int ocfs2_permission(struct inode *inode, int mask, struct nameidata *nd)
        ret = ocfs2_meta_lock(inode, NULL, 0);
        if (ret) {
-                mlog_errno(ret);
+                if (ret != -ENOENT)
+                        mlog_errno(ret);
                goto out;
        }
@@ -1035,10 +1119,49 @@ out:
        return ret;
 }
+/*
+ * Will look for holes and unwritten extents in the range starting at
+ * pos for count bytes (inclusive).
+ */
+static int ocfs2_check_range_for_holes(struct inode *inode, loff_t pos,
+                                       size_t count)
+{
+        int ret = 0;
+        unsigned int extent_flags;
+        u32 cpos, clusters, extent_len, phys_cpos;
+        struct super_block *sb = inode->i_sb;
+        cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits;
+        clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos;
+        while (clusters) {
+                ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len,
+                                         &extent_flags);
+                if (ret < 0) {
+                        mlog_errno(ret);
+                        goto out;
+                }
+                if (phys_cpos == 0 || (extent_flags & OCFS2_EXT_UNWRITTEN)) {
+                        ret = 1;
+                        break;
+                }
+                if (extent_len > clusters)
+                        extent_len = clusters;
+                clusters -= extent_len;
+                cpos += extent_len;
+        }
+out:
+        return ret;
+}
 static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
                                         loff_t *ppos,
                                         size_t count,
-                                         int appending)
+                                         int appending,
+                                         int *direct_io)
 {
        int ret = 0, meta_level = appending;
        struct inode *inode = dentry->d_inode;
@@ -1089,6 +1212,49 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
                } else {
                        saved_pos = *ppos;
                }
+                if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) {
+                        loff_t end = saved_pos + count;
+                        /*
+                         * Skip the O_DIRECT checks if we don't need
+                         * them.
+                         */
+                        if (!direct_io || !(*direct_io))
+                                break;
+                        /*
+                         * Allowing concurrent direct writes means
+                         * i_size changes wouldn't be synchronized, so
+                         * one node could wind up truncating another
+                         * node's writes.
+                         */
+                        if (end > i_size_read(inode)) {
+                                *direct_io = 0;
+                                break;
+                        }
+                        /*
+                         * We don't fill holes during direct io, so
+                         * check for them here. If any are found, the
+                         * caller will have to retake some cluster
+                         * locks and initiate the io as buffered.
+                         */
+                        ret = ocfs2_check_range_for_holes(inode, saved_pos,
+                                                          count);
+                        if (ret == 1) {
+                                *direct_io = 0;
+                                ret = 0;
+                        } else if (ret < 0)
+                                mlog_errno(ret);
+                        break;
+                }
+                /*
+                 * The rest of this loop is concerned with legacy file
+                 * systems which don't support sparse files.
+                 */
                newsize = count + saved_pos;
                mlog(0, "pos=%lld newsize=%lld cursize=%lld\n",
@@ -1141,55 +1307,264 @@ out:
        return ret;
 }
+/*
+ * Advance an iovec cursor by a number of bytes.
+ *
+ * @iovp:  in/out, the segment the cursor currently points into
+ * @basep: in/out, byte offset of the cursor within *iovp
+ * @bytes: number of bytes to step over
+ *
+ * The loop body always runs once, so a call with @bytes == 0 still moves
+ * the cursor to the start of the next segment when it sits exactly at the
+ * end of the current one.
+ */
+static inline void
+ocfs2_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes)
+{
+        const struct iovec *iov = *iovp;
+        size_t base = *basep;
+        do {
+                /*
+                 * Both operands are size_t; keep the result in a size_t
+                 * too so that a large step is never truncated.
+                 */
+                size_t copy = min(bytes, iov->iov_len - base);
+                bytes -= copy;
+                base += copy;
+                /* Current segment exhausted: move on to the next one. */
+                if (iov->iov_len == base) {
+                        iov++;
+                        base = 0;
+                }
+        } while (bytes);
+        *iovp = iov;
+        *basep = base;
+}
+/*
+ * Set up bp->b_src_buf for the data found iov_offset bytes into cur_iov.
+ *
+ * When the address limit is KERNEL_DS the buffer is used directly and NULL
+ * is returned. Otherwise the single user page containing the buffer is
+ * pinned and kmap()ed; bp->b_src_buf then holds the address of the start
+ * of that mapped page (the offset of the data within the page is not
+ * applied here) and the pinned page is returned so that it can be handed
+ * to ocfs2_put_write_source() later.
+ *
+ * Returns ERR_PTR(-EFAULT) if the user page could not be pinned.
+ */
+static struct page * ocfs2_get_write_source(struct ocfs2_buffered_write_priv *bp,
+                                            const struct iovec *cur_iov,
+                                            size_t iov_offset)
+{
+        int ret;
+        char *buf;
+        struct page *src_page = NULL;
+        buf = cur_iov->iov_base + iov_offset;
+        if (!segment_eq(get_fs(), KERNEL_DS)) {
+                /*
+                 * Pull in the user page. We want to do this outside
+                 * of the meta data locks in order to preserve locking
+                 * order in case of page fault.
+                 *
+                 * get_user_pages() walks the address space of
+                 * current->mm and must be called with mmap_sem held.
+                 */
+                down_read(&current->mm->mmap_sem);
+                ret = get_user_pages(current, current->mm,
+                                     (unsigned long)buf & PAGE_CACHE_MASK, 1,
+                                     0, 0, &src_page, NULL);
+                up_read(&current->mm->mmap_sem);
+                if (ret == 1)
+                        bp->b_src_buf = kmap(src_page);
+                else
+                        src_page = ERR_PTR(-EFAULT);
+        } else {
+                bp->b_src_buf = buf;
+        }
+        return src_page;
+}
+/*
+ * Release a source page: unmap it and drop the page reference. A NULL
+ * page is a no-op. bp is not used.
+ */
+static void ocfs2_put_write_source(struct ocfs2_buffered_write_priv *bp,
+                                   struct page *page)
+{
+        if (!page)
+                return;
+        kunmap(page);
+        page_cache_release(page);
+}
+/*
+ * Write count bytes from the iovec array to file at *ppos through
+ * ocfs2_buffered_write_cluster(), one call per loop iteration.
+ *
+ * o_direct_written is the number of bytes of iov that are skipped before
+ * the first copy. nr_segs is not used by this function.
+ *
+ * *ppos is advanced by every successful step. Returns the total number of
+ * bytes written if that is non-zero, otherwise the error (or 0) from the
+ * step that stopped the loop.
+ */
+static ssize_t ocfs2_file_buffered_write(struct file *file, loff_t *ppos,
+                                         const struct iovec *iov,
+                                         unsigned long nr_segs,
+                                         size_t count,
+                                         ssize_t o_direct_written)
+{
+        int ret = 0;
+        ssize_t copied, total = 0;
+        size_t iov_offset = 0;
+        const struct iovec *cur_iov = iov;
+        struct ocfs2_buffered_write_priv bp;
+        struct page *page;
+        /*
+         * handle partial DIO write.  Adjust cur_iov if needed.
+         */
+        ocfs2_set_next_iovec(&cur_iov, &iov_offset, o_direct_written);
+        do {
+                /* Record the current iovec position for the write callback. */
+                bp.b_cur_off = iov_offset;
+                bp.b_cur_iov = cur_iov;
+                /* Pin and map the source before entering the cluster write. */
+                page = ocfs2_get_write_source(&bp, cur_iov, iov_offset);
+                if (IS_ERR(page)) {
+                        ret = PTR_ERR(page);
+                        goto out;
+                }
+                copied = ocfs2_buffered_write_cluster(file, *ppos, count,
+                                                      ocfs2_map_and_write_user_data,
+                                                      &bp);
+                /* The source is released whether or not the write succeeded. */
+                ocfs2_put_write_source(&bp, page);
+                if (copied < 0) {
+                        mlog_errno(copied);
+                        ret = copied;
+                        goto out;
+                }
+                /* Account for this step and move the iovec cursor past it. */
+                total += copied;
+                *ppos = *ppos + copied;
+                count -= copied;
+                ocfs2_set_next_iovec(&cur_iov, &iov_offset, copied);
+        } while(count);
+out:
+        /* Partial progress takes precedence over a later error. */
+        return total ? total : ret;
+}
+/*
+ * Validate an iovec array before a write.
+ *
+ * @iov:     segments to check
+ * @counted: out, sum of the lengths of the accepted segments
+ * @nr_segs: in/out, number of segments; shrunk to the number of leading
+ *           segments that passed access_ok() when a later one fails
+ *
+ * Returns -EINVAL if a segment length or the running total is negative
+ * when viewed as ssize_t, -EFAULT if the very first segment fails
+ * access_ok(), and 0 otherwise.
+ */
+static int ocfs2_check_iovec(const struct iovec *iov, size_t *counted,
+                             unsigned long *nr_segs)
+{
+        size_t total = 0;       /* bytes in the segments accepted so far */
+        unsigned long i;
+        for (i = 0; i < *nr_segs; i++) {
+                const struct iovec *cur = iov + i;
+                /*
+                 * A negative segment length, or a cumulative length
+                 * that wraps negative, is an invalid request.
+                 */
+                total += cur->iov_len;
+                if (unlikely((ssize_t)(total|cur->iov_len) < 0))
+                        return -EINVAL;
+                if (!access_ok(VERIFY_READ, cur->iov_base, cur->iov_len)) {
+                        if (i == 0)
+                                return -EFAULT;
+                        /* Keep the good prefix, drop this segment. */
+                        *nr_segs = i;
+                        total -= cur->iov_len;
+                        break;
+                }
+        }
+        *counted = total;
+        return 0;
+}
 static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
                                    const struct iovec *iov,
                                    unsigned long nr_segs,
                                    loff_t pos)
 {
-        int ret, rw_level, have_alloc_sem = 0;
+        int ret, direct_io, appending, rw_level, have_alloc_sem  = 0;
-        struct file *filp = iocb->ki_filp;
+        int can_do_direct, sync = 0;
-        struct inode *inode = filp->f_path.dentry->d_inode;
+        ssize_t written = 0;
-        int appending = filp->f_flags & O_APPEND ? 1 : 0;
+        size_t ocount;          /* original count */
+        size_t count;           /* after file limit checks */
-        mlog_entry("(0x%p, %u, '%.*s')\n", filp,
+        loff_t *ppos = &iocb->ki_pos;
+        struct file *file = iocb->ki_filp;
+        struct inode *inode = file->f_path.dentry->d_inode;
+        mlog_entry("(0x%p, %u, '%.*s')\n", file,
                   (unsigned int)nr_segs,
-                   filp->f_path.dentry->d_name.len,
+                   file->f_path.dentry->d_name.len,
-                   filp->f_path.dentry->d_name.name);
+                   file->f_path.dentry->d_name.name);
-        /* happy write of zero bytes */
        if (iocb->ki_left == 0)
                return 0;
+        ret = ocfs2_check_iovec(iov, &ocount, &nr_segs);
+        if (ret)
+                return ret;
+        count = ocount;
+        vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
+        appending = file->f_flags & O_APPEND ? 1 : 0;
+        direct_io = file->f_flags & O_DIRECT ? 1 : 0;
        mutex_lock(&inode->i_mutex);
+relock:
        /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */
-        if (filp->f_flags & O_DIRECT) {
+        if (direct_io) {
-                have_alloc_sem = 1;
                down_read(&inode->i_alloc_sem);
+                have_alloc_sem = 1;
        }
        /* concurrent O_DIRECT writes are allowed */
-        rw_level = (filp->f_flags & O_DIRECT) ? 0 : 1;
+        rw_level = !direct_io;
        ret = ocfs2_rw_lock(inode, rw_level);
        if (ret < 0) {
-                rw_level = -1;
                mlog_errno(ret);
-                goto out;
+                goto out_sems;
        }
-        ret = ocfs2_prepare_inode_for_write(filp->f_path.dentry, &iocb->ki_pos,
+        can_do_direct = direct_io;
-                                            iocb->ki_left, appending);
+        ret = ocfs2_prepare_inode_for_write(file->f_path.dentry, ppos,
+                                            iocb->ki_left, appending,
+                                            &can_do_direct);
        if (ret < 0) {
                mlog_errno(ret);
                goto out;
        }
-        /* communicate with ocfs2_dio_end_io */
+        /*
-        ocfs2_iocb_set_rw_locked(iocb);
+         * We can't complete the direct I/O as requested, fall back to
+         * buffered I/O.
+         */
+        if (direct_io && !can_do_direct) {
+                ocfs2_rw_unlock(inode, rw_level);
+                up_read(&inode->i_alloc_sem);
+                have_alloc_sem = 0;
+                rw_level = -1;
-        ret = generic_file_aio_write_nolock(iocb, iov, nr_segs, iocb->ki_pos);
+                direct_io = 0;
+                sync = 1;
+                goto relock;
+        }
+        if (!sync && ((file->f_flags & O_SYNC) || IS_SYNC(inode)))
+                sync = 1;
+        /*
+         * XXX: Is it ok to execute these checks a second time?
+         */
+        ret = generic_write_checks(file, ppos, &count, S_ISBLK(inode->i_mode));
+        if (ret)
+                goto out;
+        /*
+         * Set pos so that sync_page_range_nolock() below understands
+         * where to start from. We might've moved it around via the
+         * calls above. The range we want to actually sync starts from
+         * *ppos here.
+         *
+         */
+        pos = *ppos;
+        /* communicate with ocfs2_dio_end_io */
+        ocfs2_iocb_set_rw_locked(iocb, rw_level);
+        if (direct_io) {
+                written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos,
+                                                    ppos, count, ocount);
+                if (written < 0) {
+                        ret = written;
+                        goto out_dio;
+                }
+        } else {
+                written = ocfs2_file_buffered_write(file, ppos, iov, nr_segs,
+                                                    count, written);
+                if (written < 0) {
+                        ret = written;
+                        /*
+                         * -EFAULT and -ENOSPC are not logged; every
+                         * other error is. The two tests must be joined
+                         * with &&: with || the condition is always true.
+                         */
+                        if (ret != -EFAULT && ret != -ENOSPC)
+                                mlog_errno(ret);
+                        goto out;
+                }
+        }
+out_dio:
        /* buffered aio wouldn't have proper lock coverage today */
-        BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT));
+        BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT));
        /* 
         * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io
@@ -1207,13 +1582,102 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
        }
 out:
+        if (rw_level != -1)
+                ocfs2_rw_unlock(inode, rw_level);
+out_sems:
        if (have_alloc_sem)
                up_read(&inode->i_alloc_sem);
-        if (rw_level != -1) 
-                ocfs2_rw_unlock(inode, rw_level);
+        if (written > 0 && sync) {
+                ssize_t err;
+                err = sync_page_range_nolock(inode, file->f_mapping, pos, count);
+                if (err < 0)
+                        written = err;
+        }
        mutex_unlock(&inode->i_mutex);
        mlog_exit(ret);
+        return written ? written : ret;
+}
+/*
+ * Splice actor: write the contents of one pipe buffer to sd->file starting
+ * at sd->pos, using ocfs2_buffered_write_cluster().
+ *
+ * At most the remainder of the page cache page containing sd->pos is
+ * written per call. Returns the number of bytes written if any; otherwise
+ * 0 or the negative error from pinning the buffer or from the cluster
+ * write.
+ */
+static int ocfs2_splice_write_actor(struct pipe_inode_info *pipe,
+                                    struct pipe_buffer *buf,
+                                    struct splice_desc *sd)
+{
+        int ret, count, total = 0;
+        ssize_t copied = 0;
+        struct ocfs2_splice_write_priv sp;
+        ret = buf->ops->pin(pipe, buf);
+        if (ret)
+                goto out;
+        sp.s_sd = sd;
+        sp.s_buf = buf;
+        sp.s_pipe = pipe;
+        /* Offset of sd->pos within its page cache page. */
+        sp.s_offset = sd->pos & ~PAGE_CACHE_MASK;
+        sp.s_buf_offset = buf->offset;
+        /* Clamp the request so it does not cross a page boundary. */
+        count = sd->len;
+        if (count + sp.s_offset > PAGE_CACHE_SIZE)
+                count = PAGE_CACHE_SIZE - sp.s_offset;
+        do {
+                /*
+                 * splice wants us to copy up to one page at a
+                 * time. For pagesize > cluster size, this means we
+                 * might enter ocfs2_buffered_write_cluster() more
+                 * than once, so keep track of our progress here.
+                 */
+                copied = ocfs2_buffered_write_cluster(sd->file,
+                                                      (loff_t)sd->pos + total,
+                                                      count,
+                                                      ocfs2_map_and_write_splice_data,
+                                                      &sp);
+                if (copied < 0) {
+                        mlog_errno(copied);
+                        ret = copied;
+                        goto out;
+                }
+                /* Advance the file-side and pipe-buffer-side offsets together. */
+                count -= copied;
+                sp.s_offset += copied;
+                sp.s_buf_offset += copied;
+                total += copied;
+        } while (count);
+        ret = 0;
+out:
+        /* Partial progress takes precedence over a later error. */
+        return total ? total : ret;
+}
+/*
+ * Splice data from pipe into out via ocfs2_splice_write_actor().
+ *
+ * On a positive result *ppos is advanced by the number of bytes spliced
+ * and, if the file was opened O_SYNC or the inode is IS_SYNC, data and
+ * metadata are synced with generic_osync_inode(); an error from that sync
+ * replaces the byte count. Otherwise the result of __splice_from_pipe()
+ * is returned as is.
+ */
+static ssize_t __ocfs2_file_splice_write(struct pipe_inode_info *pipe,
+                                         struct file *out,
+                                         loff_t *ppos,
+                                         size_t len,
+                                         unsigned int flags)
+{
+        int ret, err;
+        struct address_space *mapping = out->f_mapping;
+        struct inode *inode = mapping->host;
+        ret = __splice_from_pipe(pipe, out, ppos, len, flags,
+                                 ocfs2_splice_write_actor);
+        /* Nothing was written: no position update, nothing to sync. */
+        if (ret <= 0)
+                return ret;
+        *ppos += ret;
+        if (likely(!(out->f_flags & O_SYNC) && !IS_SYNC(inode)))
+                return ret;
+        err = generic_osync_inode(inode, mapping,
+                                  OSYNC_METADATA|OSYNC_DATA);
+        if (err)
+                ret = err;
         return ret;
  }
@@ -1239,14 +1703,15 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
                goto out;
        }
-        ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, ppos, len, 0);
+        ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, ppos, len, 0,
+                                            NULL);
        if (ret < 0) {
                mlog_errno(ret);
                goto out_unlock;
        }
        /* ok, we're done with i_size and alloc work */
-        ret = generic_file_splice_write_nolock(pipe, out, ppos, len, flags);
+        ret = __ocfs2_file_splice_write(pipe, out, ppos, len, flags);
 out_unlock:
        ocfs2_rw_unlock(inode, 1);
@@ -1323,7 +1788,7 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
                }
                rw_level = 0;
                /* communicate with ocfs2_dio_end_io */
-                ocfs2_iocb_set_rw_locked(iocb);
+                ocfs2_iocb_set_rw_locked(iocb, rw_level);
        }
        /*
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index cc973f01f6ce..2c4460fced52 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -39,12 +39,17 @@ enum ocfs2_alloc_restarted {
 };
 int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
                               struct inode *inode,
+                               u32 *cluster_start,
                               u32 clusters_to_add,
                               struct buffer_head *fe_bh,
                               handle_t *handle,
                               struct ocfs2_alloc_context *data_ac,
                               struct ocfs2_alloc_context *meta_ac,
                               enum ocfs2_alloc_restarted *reason);
+int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
+                          u32 clusters_to_add,
+                          struct ocfs2_alloc_context **data_ac,
+                          struct ocfs2_alloc_context **meta_ac);
 int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
 int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
                  struct kstat *stat);
diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c
index 8fc52d6d0ce7..b25ef63781ba 100644
--- a/fs/ocfs2/heartbeat.c
+++ b/fs/ocfs2/heartbeat.c
@@ -164,8 +164,10 @@ int ocfs2_register_hb_callbacks(struct ocfs2_super *osb)
        }
        status = o2hb_register_callback(&osb->osb_hb_up);
-        if (status < 0)
+        if (status < 0) {
                mlog_errno(status);
+                o2hb_unregister_callback(&osb->osb_hb_down);
+        }
 bail:
        return status;
@@ -173,18 +175,11 @@ bail:
 void ocfs2_clear_hb_callbacks(struct ocfs2_super *osb)
 {
-        int status;
        if (ocfs2_mount_local(osb))
                return;
-        status = o2hb_unregister_callback(&osb->osb_hb_down);
+        o2hb_unregister_callback(&osb->osb_hb_down);
-        if (status < 0)
+        o2hb_unregister_callback(&osb->osb_hb_up);
-                mlog_errno(status);
-        status = o2hb_unregister_callback(&osb->osb_hb_up);
-        if (status < 0)
-                mlog_errno(status);
 }
 void ocfs2_stop_heartbeat(struct ocfs2_super *osb)
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 28ab56f2b98c..21a605079c62 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -89,24 +89,6 @@ void ocfs2_set_inode_flags(struct inode *inode)
                inode->i_flags |= S_DIRSYNC;
 }
-struct inode *ocfs2_ilookup_for_vote(struct ocfs2_super *osb,
-                                     u64 blkno,
-                                     int delete_vote)
-{
-        struct ocfs2_find_inode_args args;
-        /* ocfs2_ilookup_for_vote should *only* be called from the
-         * vote thread */
-        BUG_ON(current != osb->vote_task);
-        args.fi_blkno = blkno;
-        args.fi_flags = OCFS2_FI_FLAG_NOWAIT;
-        if (delete_vote)
-                args.fi_flags |= OCFS2_FI_FLAG_DELETE;
-        args.fi_ino = ino_from_blkno(osb->sb, blkno);
-        return ilookup5(osb->sb, args.fi_ino, ocfs2_find_actor, &args);
-}
 struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, int flags)
 {
        struct inode *inode = NULL;
@@ -182,28 +164,6 @@ static int ocfs2_find_actor(struct inode *inode, void *opaque)
        if (oi->ip_blkno != args->fi_blkno)
                goto bail;
-        /* OCFS2_FI_FLAG_NOWAIT is *only* set from
-         * ocfs2_ilookup_for_vote which won't create an inode for one
-         * that isn't found. The vote thread which doesn't want to get
-         * an inode which is in the process of going away - otherwise
-         * the call to __wait_on_freeing_inode in find_inode_fast will
-         * cause it to deadlock on an inode which may be waiting on a
-         * vote (or lock release) in delete_inode */
-        if ((args->fi_flags & OCFS2_FI_FLAG_NOWAIT) &&
-            (inode->i_state & (I_FREEING|I_CLEAR))) {
-                /* As stated above, we're not going to return an
-                 * inode.  In the case of a delete vote, the voting
-                 * code is going to signal the other node to go
-                 * ahead. Mark that state here, so this freeing inode
-                 * has the state when it gets to delete_inode. */
-                if (args->fi_flags & OCFS2_FI_FLAG_DELETE) {
-                        spin_lock(&oi->ip_lock);
-                        ocfs2_mark_inode_remotely_deleted(inode);
-                        spin_unlock(&oi->ip_lock);
-                }
-                goto bail;
-        }
        ret = 1;
 bail:
        mlog_exit(ret);
@@ -261,6 +221,9 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
                goto bail;
        }
+        OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
+        OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr);
        inode->i_version = 1;
        inode->i_generation = le32_to_cpu(fe->i_generation);
        inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev));
@@ -272,8 +235,7 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
        if (S_ISLNK(inode->i_mode) && !fe->i_clusters)
                inode->i_blocks = 0;
        else
-                inode->i_blocks =
+                inode->i_blocks = ocfs2_inode_sector_count(inode);
-                        ocfs2_align_bytes_to_sectors(le64_to_cpu(fe->i_size));
        inode->i_mapping->a_ops = &ocfs2_aops;
        inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime);
        inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec);
@@ -288,10 +250,6 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
                     (unsigned long long)OCFS2_I(inode)->ip_blkno,
                     (unsigned long long)fe->i_blkno);
-        OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
-        OCFS2_I(inode)->ip_orphaned_slot = OCFS2_INVALID_SLOT;
-        OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr);
        inode->i_nlink = le16_to_cpu(fe->i_links_count);
        if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL))
@@ -347,6 +305,9 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
                ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_meta_lockres,
                                          OCFS2_LOCK_TYPE_META, 0, inode);
+                ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_open_lockres,
+                                          OCFS2_LOCK_TYPE_OPEN, 0, inode);
        }
        ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_rw_lockres,
@@ -421,7 +382,7 @@ static int ocfs2_read_locked_inode(struct inode *inode,
         * cluster lock before trusting anything anyway.
         */
        can_lock = !(args->fi_flags & OCFS2_FI_FLAG_SYSFILE)
-                && !(args->fi_flags & OCFS2_FI_FLAG_NOLOCK)
+                && !(args->fi_flags & OCFS2_FI_FLAG_ORPHAN_RECOVERY)
                && !ocfs2_mount_local(osb);
        /*
@@ -438,7 +399,17 @@ static int ocfs2_read_locked_inode(struct inode *inode,
                                  OCFS2_LOCK_TYPE_META,
                                  generation, inode);
+        ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_open_lockres,
+                                  OCFS2_LOCK_TYPE_OPEN,
+                                  0, inode);
        if (can_lock) {
+                status = ocfs2_open_lock(inode);
+                if (status) {
+                        make_bad_inode(inode);
+                        mlog_errno(status);
+                        return status;
+                }
                status = ocfs2_meta_lock(inode, NULL, 0);
                if (status) {
                        make_bad_inode(inode);
@@ -447,6 +418,14 @@ static int ocfs2_read_locked_inode(struct inode *inode,
                }
        }
+        if (args->fi_flags & OCFS2_FI_FLAG_ORPHAN_RECOVERY) {
+                status = ocfs2_try_open_lock(inode, 0);
+                if (status) {
+                        make_bad_inode(inode);  
+                        return status;
+                }
+        }
        status = ocfs2_read_block(osb, args->fi_blkno, &bh, 0,
                                  can_lock ? inode : NULL);
        if (status < 0) {
@@ -507,50 +486,56 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
                                     struct buffer_head *fe_bh)
 {
        int status = 0;
-        handle_t *handle = NULL;
        struct ocfs2_truncate_context *tc = NULL;
        struct ocfs2_dinode *fe;
+        handle_t *handle = NULL;
        mlog_entry_void();
        fe = (struct ocfs2_dinode *) fe_bh->b_data;
-        /* zero allocation, zero truncate :) */
+        if (fe->i_clusters) {
-        if (!fe->i_clusters)
+                handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
-                goto bail;
+                if (IS_ERR(handle)) {
+                        status = PTR_ERR(handle);
+                        mlog_errno(status);
+                        goto out;
+                }
-        handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
+                status = ocfs2_journal_access(handle, inode, fe_bh,
-        if (IS_ERR(handle)) {
+                                              OCFS2_JOURNAL_ACCESS_WRITE);
-                status = PTR_ERR(handle);
+                if (status < 0) {
-                handle = NULL;
+                        mlog_errno(status);
-                mlog_errno(status);
+                        goto out;
-                goto bail;
+                }
-        }
-        status = ocfs2_set_inode_size(handle, inode, fe_bh, 0ULL);
+                i_size_write(inode, 0);
-        if (status < 0) {
-                mlog_errno(status);
-                goto bail;
-        }
-        ocfs2_commit_trans(osb, handle);
+                status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
-        handle = NULL;
+                if (status < 0) {
+                        mlog_errno(status);
+                        goto out;
+                }
-        status = ocfs2_prepare_truncate(osb, inode, fe_bh, &tc);
+                ocfs2_commit_trans(osb, handle);
-        if (status < 0) {
+                handle = NULL;
-                mlog_errno(status);
-                goto bail;
-        }
-        status = ocfs2_commit_truncate(osb, inode, fe_bh, tc);
+                status = ocfs2_prepare_truncate(osb, inode, fe_bh, &tc);
-        if (status < 0) {
+                if (status < 0) {
-                mlog_errno(status);
+                        mlog_errno(status);
-                goto bail;
+                        goto out;
+                }
+                status = ocfs2_commit_truncate(osb, inode, fe_bh, tc);
+                if (status < 0) {
+                        mlog_errno(status);
+                        goto out;
+                }
        }
-bail:
+out:
        if (handle)
                ocfs2_commit_trans(osb, handle);
        mlog_exit(status);
        return status;
 }
@@ -678,10 +663,10 @@ static int ocfs2_wipe_inode(struct inode *inode,
        struct inode *orphan_dir_inode = NULL;
        struct buffer_head *orphan_dir_bh = NULL;
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+        struct ocfs2_dinode *di;
-        /* We've already voted on this so it should be readonly - no
+        di = (struct ocfs2_dinode *) di_bh->b_data;
-         * spinlock needed. */
+        orphaned_slot = le16_to_cpu(di->i_orphaned_slot);
-        orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot;
        status = ocfs2_check_orphan_recovery_state(osb, orphaned_slot);
        if (status)
@@ -839,11 +824,20 @@ static int ocfs2_query_inode_wipe(struct inode *inode,
                goto bail;
        }
-        status = ocfs2_request_delete_vote(inode);
+        /*
-        /* -EBUSY means that other nodes are still using the
+         * This is how ocfs2 determines whether an inode is still live
-         * inode. We're done here though, so avoid doing anything on
+         * within the cluster. Every node takes a shared read lock on
-         * disk and let them worry about deleting it. */
+         * the inode open lock in ocfs2_read_locked_inode(). When we
-        if (status == -EBUSY) {
+         * get to ->delete_inode(), each node tries to convert its
+         * lock to an exclusive. Trylocks are serialized by the inode
+         * meta data lock. If the upconvert succeeds, we know the inode
+         * is no longer live and can be deleted.
+         *
+         * Though we call this with the meta data lock held, the
+         * trylock keeps us from ABBA deadlock.
+         */
+        status = ocfs2_try_open_lock(inode, 1);
+        if (status == -EAGAIN) {
                status = 0;
                mlog(0, "Skipping delete of %llu because it is in use on"
                     "other nodes\n", (unsigned long long)oi->ip_blkno);
@@ -854,21 +848,10 @@ static int ocfs2_query_inode_wipe(struct inode *inode,
                goto bail;
        }
-        spin_lock(&oi->ip_lock);
+        *wipe = 1;
-        if (oi->ip_orphaned_slot == OCFS2_INVALID_SLOT) {
+        mlog(0, "Inode %llu is ok to wipe from orphan dir %u\n",
-                /* Nobody knew which slot this inode was orphaned
+             (unsigned long long)oi->ip_blkno,
-                 * into. This may happen during node death and
+             le16_to_cpu(di->i_orphaned_slot));
-                 * recovery knows how to clean it up so we can safely
-                 * ignore this inode for now on. */
-                mlog(0, "Nobody knew where inode %llu was orphaned!\n",
-                     (unsigned long long)oi->ip_blkno);
-        } else {
-                *wipe = 1;
-                mlog(0, "Inode %llu is ok to wipe from orphan dir %d\n",
-                     (unsigned long long)oi->ip_blkno, oi->ip_orphaned_slot);
-        }
-        spin_unlock(&oi->ip_lock);
 bail:
        return status;
@@ -1001,11 +984,16 @@ void ocfs2_clear_inode(struct inode *inode)
        mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL,
                        "Inode=%lu\n", inode->i_ino);
+        /* The open lock replaces the delete_inode vote and has been held
+         * until now; it is time to unlock the PR and EX open locks. */
+        ocfs2_open_unlock(inode);
        /* Do these before all the other work so that we don't bounce
         * the vote thread while waiting to destroy the locks. */
        ocfs2_mark_lockres_freeing(&oi->ip_rw_lockres);
        ocfs2_mark_lockres_freeing(&oi->ip_meta_lockres);
        ocfs2_mark_lockres_freeing(&oi->ip_data_lockres);
+        ocfs2_mark_lockres_freeing(&oi->ip_open_lockres);
        /* We very well may get a clear_inode before all an inodes
         * metadata has hit disk. Of course, we can't drop any cluster
@@ -1020,8 +1008,7 @@ void ocfs2_clear_inode(struct inode *inode)
                        "Clear inode of %llu, inode has io markers\n",
                        (unsigned long long)oi->ip_blkno);
-        ocfs2_extent_map_drop(inode, 0);
+        ocfs2_extent_map_trunc(inode, 0);
-        ocfs2_extent_map_init(inode);
        status = ocfs2_drop_inode_locks(inode);
        if (status < 0)
@@ -1030,6 +1017,7 @@ void ocfs2_clear_inode(struct inode *inode)
        ocfs2_lock_res_free(&oi->ip_rw_lockres);
        ocfs2_lock_res_free(&oi->ip_meta_lockres);
        ocfs2_lock_res_free(&oi->ip_data_lockres);
+        ocfs2_lock_res_free(&oi->ip_open_lockres);
        ocfs2_metadata_cache_purge(inode);
@@ -1086,9 +1074,6 @@ void ocfs2_drop_inode(struct inode *inode)
        mlog(0, "Drop inode %llu, nlink = %u, ip_flags = 0x%x\n",
             (unsigned long long)oi->ip_blkno, inode->i_nlink, oi->ip_flags);
-        /* Testing ip_orphaned_slot here wouldn't work because we may
-         * not have gotten a delete_inode vote from any other nodes
-         * yet. */
        if (oi->ip_flags & OCFS2_INODE_MAYBE_ORPHANED)
                generic_delete_inode(inode);
        else
@@ -1121,8 +1106,8 @@ struct buffer_head *ocfs2_bread(struct inode *inode,
                return NULL;
        }
-        tmperr = ocfs2_extent_map_get_blocks(inode, block, 1,
+        tmperr = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL,
-                                             &p_blkno, NULL);
+                                             NULL);
        if (tmperr < 0) {
                mlog_errno(tmperr);
                goto fail;
@@ -1259,7 +1244,7 @@ void ocfs2_refresh_inode(struct inode *inode,
        if (S_ISLNK(inode->i_mode) && le32_to_cpu(fe->i_clusters) == 0)
                inode->i_blocks = 0;
        else
-                inode->i_blocks = ocfs2_align_bytes_to_sectors(i_size_read(inode));
+                inode->i_blocks = ocfs2_inode_sector_count(inode);
        inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime);
        inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec);
        inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime);
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 1a7dd2945b34..03ae075869ee 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -26,6 +26,8 @@
 #ifndef OCFS2_INODE_H
 #define OCFS2_INODE_H
+#include "extent_map.h"
 /* OCFS2 Inode Private Data */
 struct ocfs2_inode_info
 {
@@ -34,6 +36,7 @@ struct ocfs2_inode_info
        struct ocfs2_lock_res           ip_rw_lockres;
        struct ocfs2_lock_res           ip_meta_lockres;
        struct ocfs2_lock_res           ip_data_lockres;
+        struct ocfs2_lock_res           ip_open_lockres;
        /* protects allocation changes on this inode. */
        struct rw_semaphore             ip_alloc_sem;
@@ -42,9 +45,7 @@ struct ocfs2_inode_info
        spinlock_t                      ip_lock;
        u32                             ip_open_count;
        u32                             ip_clusters;
-        struct ocfs2_extent_map         ip_map;
        struct list_head                ip_io_markers;
-        int                             ip_orphaned_slot;
        struct mutex                    ip_io_mutex;
@@ -64,6 +65,8 @@ struct ocfs2_inode_info
        struct ocfs2_caching_info       ip_metadata_cache;
+        struct ocfs2_extent_map         ip_extent_map;
        struct inode                    vfs_inode;
 };
@@ -117,14 +120,9 @@ void ocfs2_delete_inode(struct inode *inode);
 void ocfs2_drop_inode(struct inode *inode);
 /* Flags for ocfs2_iget() */
-#define OCFS2_FI_FLAG_NOWAIT    0x1
+#define OCFS2_FI_FLAG_SYSFILE           0x4
-#define OCFS2_FI_FLAG_DELETE    0x2
+#define OCFS2_FI_FLAG_ORPHAN_RECOVERY   0x8
-#define OCFS2_FI_FLAG_SYSFILE   0x4
-#define OCFS2_FI_FLAG_NOLOCK    0x8
 struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, int flags);
-struct inode *ocfs2_ilookup_for_vote(struct ocfs2_super *osb,
-                                     u64 blkno,
-                                     int delete_vote);
 int ocfs2_inode_init_private(struct inode *inode);
 int ocfs2_inode_revalidate(struct dentry *dentry);
 int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
@@ -144,4 +142,11 @@ int ocfs2_aio_write(struct file *file, struct kiocb *req, struct iocb *iocb);
 void ocfs2_set_inode_flags(struct inode *inode);
+/*
+ * Return the number of 512-byte sectors covered by the clusters recorded
+ * in this inode's ip_clusters.
+ *
+ * ip_clusters is a u32, so it is converted to blkcnt_t *before* the shift.
+ * Shifting first would be carried out in 32 bits and silently wrap for
+ * large files, no matter how wide blkcnt_t is.
+ */
+static inline blkcnt_t ocfs2_inode_sector_count(struct inode *inode)
+{
+        /* cluster-size bits minus the 9 bits of a 512-byte sector */
+        int c_to_s_bits = OCFS2_SB(inode->i_sb)->s_clustersize_bits - 9;
+        return (blkcnt_t)OCFS2_I(inode)->ip_clusters << c_to_s_bits;
+}
 #endif /* OCFS2_INODE_H */
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 825cb0ae1b4c..5a8a90d1c787 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -649,29 +649,20 @@ bail:
 static int ocfs2_force_read_journal(struct inode *inode)
 {
        int status = 0;
-        int i, p_blocks;
+        int i;
-        u64 v_blkno, p_blkno;
+        u64 v_blkno, p_blkno, p_blocks, num_blocks;
-#define CONCURRENT_JOURNAL_FILL 32
+#define CONCURRENT_JOURNAL_FILL 32ULL
        struct buffer_head *bhs[CONCURRENT_JOURNAL_FILL];
        mlog_entry_void();
-        BUG_ON(inode->i_blocks !=
-                     ocfs2_align_bytes_to_sectors(i_size_read(inode)));
        memset(bhs, 0, sizeof(struct buffer_head *) * CONCURRENT_JOURNAL_FILL);
-        mlog(0, "Force reading %llu blocks\n",
+        num_blocks = ocfs2_blocks_for_bytes(inode->i_sb, inode->i_size);
-                (unsigned long long)(inode->i_blocks >>
-                        (inode->i_sb->s_blocksize_bits - 9)));
        v_blkno = 0;
-        while (v_blkno <
+        while (v_blkno < num_blocks) {
-               (inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9))) {
                status = ocfs2_extent_map_get_blocks(inode, v_blkno,
-                                                     1, &p_blkno,
+                                                     &p_blkno, &p_blocks, NULL);
-                                                     &p_blocks);
                if (status < 0) {
                        mlog_errno(status);
                        goto bail;
@@ -1306,7 +1297,7 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb,
                                continue;
                        iter = ocfs2_iget(osb, le64_to_cpu(de->inode),
-                                          OCFS2_FI_FLAG_NOLOCK);
+                                          OCFS2_FI_FLAG_ORPHAN_RECOVERY);
                        if (IS_ERR(iter))
                                continue;
@@ -1418,7 +1409,6 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
                /* Set the proper information to get us going into
                 * ocfs2_delete_inode. */
                oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED;
-                oi->ip_orphaned_slot = slot;
                spin_unlock(&oi->ip_lock);
                iput(inode);
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index d026b4f27757..3db5de4506da 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -390,7 +390,7 @@ static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb,
        /* We may be deleting metadata blocks, so metadata alloc dinode +
           one desc. block for each possible delete. */
        if (tree_depth && next_free == 1 &&
-            le32_to_cpu(last_el->l_recs[i].e_clusters) == clusters_to_del)
+            ocfs2_rec_clusters(last_el, &last_el->l_recs[i]) == clusters_to_del)
                credits += 1 + tree_depth;
        /* update to the truncate log. */
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 51b020447683..af01158b39f5 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -85,8 +85,11 @@ int ocfs2_mmap(struct file *file, struct vm_area_struct *vma)
        int ret = 0, lock_level = 0;
        struct ocfs2_super *osb = OCFS2_SB(file->f_dentry->d_inode->i_sb);
-        /* We don't want to support shared writable mappings yet. */
+        /*
-        if (!ocfs2_mount_local(osb) &&
+         * Only support shared writeable mmap for local mounts which
+         * don't know about holes.
+         */
+        if ((!ocfs2_mount_local(osb) || ocfs2_sparse_alloc(osb)) &&
            ((vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_MAYSHARE)) &&
            ((vma->vm_flags & VM_WRITE) || (vma->vm_flags & VM_MAYWRITE))) {
                mlog(0, "disallow shared writable mmaps %lx\n", vma->vm_flags);
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index f7fa52bb3f6b..2bcf353fd7c5 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -175,8 +175,6 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
        inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0);
        if (IS_ERR(inode)) {
-                mlog(ML_ERROR, "Unable to create inode %llu\n",
-                     (unsigned long long)blkno);
                ret = ERR_PTR(-EACCES);
                goto bail_unlock;
        }
@@ -189,7 +187,6 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
         * unlink. */
        spin_lock(&oi->ip_lock);
        oi->ip_flags &= ~OCFS2_INODE_MAYBE_ORPHANED;
-        oi->ip_orphaned_slot = OCFS2_INVALID_SLOT;
        spin_unlock(&oi->ip_lock);
 bail_add:
@@ -288,7 +285,7 @@ static int ocfs2_fill_new_dir(struct ocfs2_super *osb,
        i_size_write(inode, inode->i_sb->s_blocksize);
        inode->i_nlink = 2;
-        inode->i_blocks = ocfs2_align_bytes_to_sectors(inode->i_sb->s_blocksize);
+        inode->i_blocks = ocfs2_inode_sector_count(inode);
        status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
        if (status < 0) {
                mlog_errno(status);
@@ -1098,7 +1095,7 @@ static int ocfs2_rename(struct inode *old_dir,
                        BUG();
        }
-        /* Assume a directory heirarchy thusly:
+        /* Assume a directory hierarchy thusly:
         * a/b/c
         * a/d
         * a,b,c, and d are all directories.
@@ -1486,8 +1483,7 @@ static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
        struct buffer_head **bhs = NULL;
        const char *c;
        struct super_block *sb = osb->sb;
-        u64 p_blkno;
+        u64 p_blkno, p_blocks;
-        int p_blocks;
        int virtual, blocks, status, i, bytes_left;
        bytes_left = i_size_read(inode) + 1;
@@ -1514,8 +1510,8 @@ static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
                goto bail;
        }
-        status = ocfs2_extent_map_get_blocks(inode, 0, 1, &p_blkno,
+        status = ocfs2_extent_map_get_blocks(inode, 0, &p_blkno, &p_blocks,
-                                             &p_blocks);
+                                             NULL);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
@@ -1674,8 +1670,11 @@ static int ocfs2_symlink(struct inode *dir,
        inode->i_rdev = 0;
        newsize = l - 1;
        if (l > ocfs2_fast_symlink_chars(sb)) {
+                u32 offset = 0;
                inode->i_op = &ocfs2_symlink_inode_operations;
-                status = ocfs2_do_extend_allocation(osb, inode, 1, new_fe_bh,
+                status = ocfs2_do_extend_allocation(osb, inode, &offset, 1,
+                                                    new_fe_bh,
                                                    handle, data_ac, NULL,
                                                    NULL);
                if (status < 0) {
@@ -1689,7 +1688,7 @@ static int ocfs2_symlink(struct inode *dir,
                        goto bail;
                }
                i_size_write(inode, newsize);
-                inode->i_blocks = ocfs2_align_bytes_to_sectors(newsize);
+                inode->i_blocks = ocfs2_inode_sector_count(inode);
        } else {
                inode->i_op = &ocfs2_fast_symlink_inode_operations;
                memcpy((char *) fe->id2.i_symlink, symname, l);
@@ -2222,9 +2221,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
        /* Record which orphan dir our inode now resides
         * in. delete_inode will use this to determine which orphan
         * dir to lock. */
-        spin_lock(&OCFS2_I(inode)->ip_lock);
+        fe->i_orphaned_slot = cpu_to_le16(osb->slot_num);
-        OCFS2_I(inode)->ip_orphaned_slot = osb->slot_num;
-        spin_unlock(&OCFS2_I(inode)->ip_lock);
        mlog(0, "Inode %llu orphaned in slot %d\n",
             (unsigned long long)OCFS2_I(inode)->ip_blkno, osb->slot_num);
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index db8e77cd35d3..82cc92dcf8a6 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -46,11 +46,6 @@
 #include "endian.h"
 #include "ocfs2_lockid.h"
-struct ocfs2_extent_map {
-        u32             em_clusters;
-        struct rb_root  em_extents;
-};
 /* Most user visible OCFS2 inodes will have very few pieces of
 * metadata, but larger files (including bitmaps, etc) must be taken
 * into account when designing an access scheme. We allow a small
@@ -303,6 +298,13 @@ static inline int ocfs2_should_order_data(struct inode *inode)
        return 1;
 }
+/*
+ * Report whether the OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC bit is set in
+ * osb->s_feature_incompat: 1 if it is, 0 otherwise.
+ */
+static inline int ocfs2_sparse_alloc(struct ocfs2_super *osb)
+{
+        return (osb->s_feature_incompat &
+                OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC) ? 1 : 0;
+}
 /* set / clear functions because cluster events can make these happen
 * in parallel so we want the transitions to be atomic. this also
 * means that any future flags osb_flags must be protected by spinlock
@@ -461,6 +463,49 @@ static inline unsigned long ocfs2_align_bytes_to_sectors(u64 bytes)
        return (unsigned long)((bytes + 511) >> 9);
 }
+/*
+ * Convert a page index into a cluster count by shifting it by the
+ * difference between PAGE_CACHE_SHIFT and the cluster-size bits.
+ * The result is truncated to 32 bits.
+ */
+static inline unsigned int ocfs2_page_index_to_clusters(struct super_block *sb,
+                                                        unsigned long pg_index)
+{
+        unsigned int cluster_bits = OCFS2_SB(sb)->s_clustersize_bits;
+        /* Pages larger than clusters: each page spans several clusters. */
+        if (unlikely(PAGE_CACHE_SHIFT > cluster_bits))
+                return (u32)(pg_index << (PAGE_CACHE_SHIFT - cluster_bits));
+        /* Clusters larger than pages: several pages share one cluster. */
+        if (PAGE_CACHE_SHIFT < cluster_bits)
+                return (u32)(pg_index >> (cluster_bits - PAGE_CACHE_SHIFT));
+        /* Same size: the page index is the cluster number. */
+        return (u32)pg_index;
+}
+/*
+ * Find the 1st page index which covers the given clusters.
+ *
+ * The cluster count is widened to unsigned long before any shifting.
+ * Shifting the u32 argument directly would wrap at 32 bits when clusters
+ * are larger than pages, even where unsigned long could hold the result.
+ */
+static inline unsigned long ocfs2_align_clusters_to_page_index(struct super_block *sb,
+                                                        u32 clusters)
+{
+        unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits;
+        unsigned long index = clusters;
+        if (PAGE_CACHE_SHIFT > cbits) {
+                /* Pages are larger than clusters. */
+                index >>= PAGE_CACHE_SHIFT - cbits;
+        } else if (PAGE_CACHE_SHIFT < cbits) {
+                /* Clusters are larger than pages. */
+                index <<= cbits - PAGE_CACHE_SHIFT;
+        }
+        return index;
+}
+/*
+ * Number of page-cache pages spanned by one cluster. This is 1 whenever
+ * a cluster is no larger than a page.
+ */
+static inline unsigned int ocfs2_pages_per_cluster(struct super_block *sb)
+{
+        unsigned int cluster_bits = OCFS2_SB(sb)->s_clustersize_bits;
+        if (PAGE_CACHE_SHIFT >= cluster_bits)
+                return 1;
+        return 1 << (cluster_bits - PAGE_CACHE_SHIFT);
+}
 #define ocfs2_set_bit ext2_set_bit
 #define ocfs2_clear_bit ext2_clear_bit
 #define ocfs2_test_bit ext2_test_bit
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index e61e218f5e0b..71306479c68f 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -86,7 +86,8 @@
        OCFS2_SB(sb)->s_feature_incompat &= ~(mask)
 #define OCFS2_FEATURE_COMPAT_SUPP       OCFS2_FEATURE_COMPAT_BACKUP_SB
-#define OCFS2_FEATURE_INCOMPAT_SUPP     OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT
+#define OCFS2_FEATURE_INCOMPAT_SUPP     (OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT \
+                                         | OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC)
 #define OCFS2_FEATURE_RO_COMPAT_SUPP    0
 /*
@@ -155,6 +156,12 @@
 #define OCFS2_FL_MODIFIABLE     (0x000100FF)    /* User modifiable flags */
 /*
+ * Extent record flags (e_node.leaf.flags)
+ */
+#define OCFS2_EXT_UNWRITTEN     (0x01)  /* Extent is allocated but
+                                         * unwritten */
+/*
 * ioctl commands
 */
 #define OCFS2_IOC_GETFLAGS      _IOR('f', 1, long)
@@ -282,10 +289,21 @@ static unsigned char ocfs2_type_by_mode[S_IFMT >> S_SHIFT] = {
 /*
 * On disk extent record for OCFS2
 * It describes a range of clusters on disk.
+ *
+ * Length fields are divided into interior and leaf node versions.
+ * This leaves room for a flags field (OCFS2_EXT_*) in the leaf nodes.
 */
 struct ocfs2_extent_rec {
 /*00*/  __le32 e_cpos;          /* Offset into the file, in clusters */
-        __le32 e_clusters;      /* Clusters covered by this extent */
+        union {
+                __le32 e_int_clusters; /* Clusters covered by all children */
+                struct {
+                        __le16 e_leaf_clusters; /* Clusters covered by this
+                                                   extent */
+                        __u8 e_reserved1;
+                        __u8 e_flags; /* Extent flags */
+                };
+        };
        __le64 e_blkno;         /* Physical disk offset, in blocks */
 /*10*/
 };
@@ -311,7 +329,10 @@ struct ocfs2_extent_list {
 /*00*/  __le16 l_tree_depth;            /* Extent tree depth from this
                                           point.  0 means data extents
                                           hang directly off this
-                                           header (a leaf) */
+                                           header (a leaf)
+                                           NOTE: The high 8 bits cannot be
+                                           used - tree_depth is never that big.
+                                        */
        __le16 l_count;                 /* Number of extent records */
        __le16 l_next_free_rec;         /* Next unused extent slot */
        __le16 l_reserved1;
@@ -446,7 +467,9 @@ struct ocfs2_dinode {
        __le32 i_ctime_nsec;
        __le32 i_mtime_nsec;
        __le32 i_attr;
-        __le32 i_reserved1;
+        __le16 i_orphaned_slot;         /* Only valid when OCFS2_ORPHANED_FL
+                                           was set in i_flags */
+        __le16 i_reserved1;
 /*70*/  __le64 i_reserved2[8];
 /*B8*/  union {
                __le64 i_pad1;          /* Generic way to refer to this
diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h
index 4d5d5655c185..4ca02b1c38ac 100644
--- a/fs/ocfs2/ocfs2_lockid.h
+++ b/fs/ocfs2/ocfs2_lockid.h
@@ -44,6 +44,7 @@ enum ocfs2_lock_type {
        OCFS2_LOCK_TYPE_RENAME,
        OCFS2_LOCK_TYPE_RW,
        OCFS2_LOCK_TYPE_DENTRY,
+        OCFS2_LOCK_TYPE_OPEN,
        OCFS2_NUM_LOCK_TYPES
 };
@@ -69,6 +70,9 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type)
                case OCFS2_LOCK_TYPE_DENTRY:
                        c = 'N';
                        break;
+                case OCFS2_LOCK_TYPE_OPEN:
+                        c = 'O';
+                        break;
                default:
                        c = '\0';
        }
@@ -85,6 +89,7 @@ static char *ocfs2_lock_type_strings[] = {
         * important job it does, anyway. */
        [OCFS2_LOCK_TYPE_RW] = "Write/Read",
        [OCFS2_LOCK_TYPE_DENTRY] = "Dentry",
+        [OCFS2_LOCK_TYPE_OPEN] = "Open",
 };
 static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type)
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index 2d3ac32cb74e..d921a28329dc 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -197,7 +197,7 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb)
                goto bail;
        }
-        status = ocfs2_extent_map_get_blocks(inode, 0ULL, 1, &blkno, NULL);
+        status = ocfs2_extent_map_get_blocks(inode, 0ULL, &blkno, NULL, NULL);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 6dbb11762759..0da655ae5d6f 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -381,8 +381,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
                                             le32_to_cpu(fe->i_clusters)));
        spin_unlock(&OCFS2_I(alloc_inode)->ip_lock);
        i_size_write(alloc_inode, le64_to_cpu(fe->i_size));
-        alloc_inode->i_blocks =
+        alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode);
-                ocfs2_align_bytes_to_sectors(i_size_read(alloc_inode));
        status = 0;
 bail:
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 6534f92424dd..5c9e8243691f 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -806,9 +806,6 @@ static int __init ocfs2_init(void)
        ocfs2_print_version();
-        if (init_ocfs2_extent_maps())
-                return -ENOMEM;
        status = init_ocfs2_uptodate_cache();
        if (status < 0) {
                mlog_errno(status);
@@ -837,7 +834,6 @@ leave:
        if (status < 0) {
                ocfs2_free_mem_caches();
                exit_ocfs2_uptodate_cache();
-                exit_ocfs2_extent_maps();
        }
        mlog_exit(status);
@@ -863,8 +859,6 @@ static void __exit ocfs2_exit(void)
        unregister_filesystem(&ocfs2_fs_type);
-        exit_ocfs2_extent_maps();
        exit_ocfs2_uptodate_cache();
        mlog_exit_void();
@@ -963,6 +957,7 @@ static void ocfs2_inode_init_once(void *data,
                ocfs2_lock_res_init_once(&oi->ip_rw_lockres);
                ocfs2_lock_res_init_once(&oi->ip_meta_lockres);
                ocfs2_lock_res_init_once(&oi->ip_data_lockres);
+                ocfs2_lock_res_init_once(&oi->ip_open_lockres);
                ocfs2_metadata_cache_init(&oi->vfs_inode);
diff --git a/fs/ocfs2/vote.c b/fs/ocfs2/vote.c
index f30e63b9910c..4f82a2f0efef 100644
--- a/fs/ocfs2/vote.c
+++ b/fs/ocfs2/vote.c
@@ -63,17 +63,10 @@ struct ocfs2_msg_hdr
        __be32 h_node_num;    /* node sending this particular message. */
 };
-/* OCFS2_MAX_FILENAME_LEN is 255 characters, but we want to align this
- * for the network. */
-#define OCFS2_VOTE_FILENAME_LEN 256
 struct ocfs2_vote_msg
 {
        struct ocfs2_msg_hdr v_hdr;
-        union {
+        __be32 v_reserved1;
-                __be32 v_generic1;
-                __be32 v_orphaned_slot; /* Used during delete votes */
-                __be32 v_nlink;         /* Used during unlink votes */
-        } md1;                          /* Message type dependant 1 */
 };
 /* Responses are given these values to maintain backwards
@@ -86,7 +79,6 @@ struct ocfs2_response_msg
 {
        struct ocfs2_msg_hdr r_hdr;
        __be32 r_response;
-        __be32 r_orphaned_slot;
 };
 struct ocfs2_vote_work {
@@ -96,7 +88,6 @@ struct ocfs2_vote_work {
 enum ocfs2_vote_request {
        OCFS2_VOTE_REQ_INVALID = 0,
-        OCFS2_VOTE_REQ_DELETE,
        OCFS2_VOTE_REQ_MOUNT,
        OCFS2_VOTE_REQ_UMOUNT,
        OCFS2_VOTE_REQ_LAST
@@ -151,135 +142,23 @@ static void ocfs2_process_umount_request(struct ocfs2_super *osb,
        ocfs2_node_map_set_bit(osb, &osb->umount_map, node_num);
 }
-void ocfs2_mark_inode_remotely_deleted(struct inode *inode)
-{
-        struct ocfs2_inode_info *oi = OCFS2_I(inode);
-        assert_spin_locked(&oi->ip_lock);
-        /* We set the SKIP_DELETE flag on the inode so we don't try to
-         * delete it in delete_inode ourselves, thus avoiding
-         * unecessary lock pinging. If the other node failed to wipe
-         * the inode as a result of a crash, then recovery will pick
-         * up the slack. */
-        oi->ip_flags |= OCFS2_INODE_DELETED|OCFS2_INODE_SKIP_DELETE;
-}
-static int ocfs2_process_delete_request(struct inode *inode,
-                                        int *orphaned_slot)
-{
-        int response = OCFS2_RESPONSE_BUSY;
-        mlog(0, "DELETE vote on inode %lu, read lnk_cnt = %u, slot = %d\n",
-             inode->i_ino, inode->i_nlink, *orphaned_slot);
-        spin_lock(&OCFS2_I(inode)->ip_lock);
-        /* Whatever our vote response is, we want to make sure that
-         * the orphaned slot is recorded properly on this node *and*
-         * on the requesting node. Technically, if the requesting node
-         * did not know which slot the inode is orphaned in but we
-         * respond with BUSY he doesn't actually need the orphaned
-         * slot, but it doesn't hurt to do it here anyway. */
-        if ((*orphaned_slot) != OCFS2_INVALID_SLOT) {
-                mlog_bug_on_msg(OCFS2_I(inode)->ip_orphaned_slot !=
-                                OCFS2_INVALID_SLOT &&
-                                OCFS2_I(inode)->ip_orphaned_slot !=
-                                (*orphaned_slot),
-                                "Inode %llu: This node thinks it's "
-                                "orphaned in slot %d, messaged it's in %d\n",
-                                (unsigned long long)OCFS2_I(inode)->ip_blkno,
-                                OCFS2_I(inode)->ip_orphaned_slot,
-                                *orphaned_slot);
-                mlog(0, "Setting orphaned slot for inode %llu to %d\n",
-                     (unsigned long long)OCFS2_I(inode)->ip_blkno,
-                     *orphaned_slot);
-                OCFS2_I(inode)->ip_orphaned_slot = *orphaned_slot;
-        } else {
-                mlog(0, "Sending back orphaned slot %d for inode %llu\n",
-                     OCFS2_I(inode)->ip_orphaned_slot,
-                     (unsigned long long)OCFS2_I(inode)->ip_blkno);
-                *orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot;
-        }
-        /* vote no if the file is still open. */
-        if (OCFS2_I(inode)->ip_open_count) {
-                mlog(0, "open count = %u\n",
-                     OCFS2_I(inode)->ip_open_count);
-                spin_unlock(&OCFS2_I(inode)->ip_lock);
-                goto done;
-        }
-        spin_unlock(&OCFS2_I(inode)->ip_lock);
-        /* directories are a bit ugly... What if someone is sitting in
-         * it? We want to make sure the inode is removed completely as
-         * a result of the iput in process_vote. */
-        if (S_ISDIR(inode->i_mode) && (atomic_read(&inode->i_count) != 1)) {
-                mlog(0, "i_count = %u\n", atomic_read(&inode->i_count));
-                goto done;
-        }
-        if (filemap_fdatawrite(inode->i_mapping)) {
-                mlog(ML_ERROR, "Could not sync inode %llu for delete!\n",
-                     (unsigned long long)OCFS2_I(inode)->ip_blkno);
-                goto done;
-        }
-        sync_mapping_buffers(inode->i_mapping);
-        truncate_inode_pages(inode->i_mapping, 0);
-        ocfs2_extent_map_trunc(inode, 0);
-        spin_lock(&OCFS2_I(inode)->ip_lock);
-        /* double check open count - someone might have raced this
-         * thread into ocfs2_file_open while we were writing out
-         * data. If we're to allow a wipe of this inode now, we *must*
-         * hold the spinlock until we've marked it. */
-        if (OCFS2_I(inode)->ip_open_count) {
-                mlog(0, "Raced to wipe! open count = %u\n",
-                     OCFS2_I(inode)->ip_open_count);
-                spin_unlock(&OCFS2_I(inode)->ip_lock);
-                goto done;
-        }
-        /* Mark the inode as being wiped from disk. */
-        ocfs2_mark_inode_remotely_deleted(inode);
-        spin_unlock(&OCFS2_I(inode)->ip_lock);
-        /* Not sure this is necessary anymore. */
-        d_prune_aliases(inode);
-        /* If we get here, then we're voting 'yes', so commit the
-         * delete on our side. */
-        response = OCFS2_RESPONSE_OK;
-done:
-        return response;
-}
 static void ocfs2_process_vote(struct ocfs2_super *osb,
                               struct ocfs2_vote_msg *msg)
 {
        int net_status, vote_response;
-        int orphaned_slot = 0;
+        unsigned int node_num;
-        unsigned int node_num, generation;
        u64 blkno;
        enum ocfs2_vote_request request;
-        struct inode *inode = NULL;
        struct ocfs2_msg_hdr *hdr = &msg->v_hdr;
        struct ocfs2_response_msg response;
        /* decode the network mumbo jumbo into local variables. */
        request = be32_to_cpu(hdr->h_request);
        blkno = be64_to_cpu(hdr->h_blkno);
-        generation = be32_to_cpu(hdr->h_generation);
        node_num = be32_to_cpu(hdr->h_node_num);
-        if (request == OCFS2_VOTE_REQ_DELETE)
-                orphaned_slot = be32_to_cpu(msg->md1.v_orphaned_slot);
-        mlog(0, "processing vote: request = %u, blkno = %llu, "
+        mlog(0, "processing vote: request = %u, blkno = %llu, node_num = %u\n",
-             "generation = %u, node_num = %u, priv1 = %u\n", request,
+             request, (unsigned long long)blkno, node_num);
-             (unsigned long long)blkno, generation, node_num,
-             be32_to_cpu(msg->md1.v_generic1));
        if (!ocfs2_is_valid_vote_request(request)) {
                mlog(ML_ERROR, "Invalid vote request %d from node %u\n",
@@ -302,52 +181,6 @@ static void ocfs2_process_vote(struct ocfs2_super *osb,
                break;
        }
-        /* We cannot process the remaining message types before we're
-         * fully mounted. It's perfectly safe however to send a 'yes'
-         * response as we can't possibly have any of the state they're
-         * asking us to modify yet. */
-        if (atomic_read(&osb->vol_state) == VOLUME_INIT)
-                goto respond;
-        /* If we get here, then the request is against an inode. */
-        inode = ocfs2_ilookup_for_vote(osb, blkno,
-                                       request == OCFS2_VOTE_REQ_DELETE);
-        /* Not finding the inode is perfectly valid - it means we're
-         * not interested in what the other node is about to do to it
-         * so in those cases we automatically respond with an
-         * affirmative. Cluster locking ensures that we won't race
-         * interest in the inode with this vote request. */
-        if (!inode)
-                goto respond;
-        /* Check generation values. It's possible for us to get a
-         * request against a stale inode. If so then we proceed as if
-         * we had not found an inode in the first place. */
-        if (inode->i_generation != generation) {
-                mlog(0, "generation passed %u != inode generation = %u, "
-                     "ip_flags = %x, ip_blkno = %llu, msg %llu, i_count = %u, "
-                     "message type = %u\n", generation, inode->i_generation,
-                     OCFS2_I(inode)->ip_flags,
-                     (unsigned long long)OCFS2_I(inode)->ip_blkno,
-                     (unsigned long long)blkno, atomic_read(&inode->i_count),
-                     request);
-                iput(inode);
-                inode = NULL;
-                goto respond;
-        }
-        switch (request) {
-        case OCFS2_VOTE_REQ_DELETE:
-                vote_response = ocfs2_process_delete_request(inode,
-                                                             &orphaned_slot);
-                break;
-        default:
-                mlog(ML_ERROR, "node %u, invalid request: %u\n",
-                     node_num, request);
-                vote_response = OCFS2_RESPONSE_BAD_MSG;
-        }
 respond:
        /* Response structure is small so we just put it on the stack
         * and stuff it inline. */
@@ -357,7 +190,6 @@ respond:
        response.r_hdr.h_generation = hdr->h_generation;
        response.r_hdr.h_node_num = cpu_to_be32(osb->node_num);
        response.r_response = cpu_to_be32(vote_response);
-        response.r_orphaned_slot = cpu_to_be32(orphaned_slot);
        net_status = o2net_send_message(OCFS2_MESSAGE_TYPE_RESPONSE,
                                        osb->net_key,
@@ -373,9 +205,6 @@ respond:
            && net_status != -ENOTCONN)
                mlog(ML_ERROR, "message to node %u fails with error %d!\n",
                     node_num, net_status);
-        if (inode)
-                iput(inode);
 }
 static void ocfs2_vote_thread_do_work(struct ocfs2_super *osb)
@@ -634,8 +463,7 @@ bail:
 static struct ocfs2_vote_msg * ocfs2_new_vote_request(struct ocfs2_super *osb,
                                                      u64 blkno,
                                                      unsigned int generation,
-                                                      enum ocfs2_vote_request type,
+                                                      enum ocfs2_vote_request type)
-                                                      u32 priv)
 {
        struct ocfs2_vote_msg *request;
        struct ocfs2_msg_hdr *hdr;
@@ -651,8 +479,6 @@ static struct ocfs2_vote_msg * ocfs2_new_vote_request(struct ocfs2_super *osb,
                hdr->h_request = cpu_to_be32(type);
                hdr->h_blkno = cpu_to_be64(blkno);
                hdr->h_generation = cpu_to_be32(generation);
-                request->md1.v_generic1 = cpu_to_be32(priv);
        }
        return request;
@@ -664,7 +490,7 @@ static int ocfs2_do_request_vote(struct ocfs2_super *osb,
                                 struct ocfs2_vote_msg *request,
                                 struct ocfs2_net_response_cb *callback)
 {
-        int status, response;
+        int status, response = -EBUSY;
        unsigned int response_id;
        struct ocfs2_msg_hdr *hdr;
@@ -686,109 +512,12 @@ bail:
        return status;
 }
-static int ocfs2_request_vote(struct inode *inode,
-                              struct ocfs2_vote_msg *request,
-                              struct ocfs2_net_response_cb *callback)
-{
-        int status;
-        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-        if (ocfs2_inode_is_new(inode))
-                return 0;
-        status = -EAGAIN;
-        while (status == -EAGAIN) {
-                if (!(osb->s_mount_opt & OCFS2_MOUNT_NOINTR) &&
-                    signal_pending(current))
-                        return -ERESTARTSYS;
-                status = ocfs2_super_lock(osb, 0);
-                if (status < 0) {
-                        mlog_errno(status);
-                        break;
-                }
-                status = 0;
-                if (!ocfs2_node_map_is_only(osb, &osb->mounted_map,
-                                           osb->node_num))
-                        status = ocfs2_do_request_vote(osb, request, callback);
-                ocfs2_super_unlock(osb, 0);
-        }
-        return status;
-}
-static void ocfs2_delete_response_cb(void *priv,
-                                     struct ocfs2_response_msg *resp)
-{
-        int orphaned_slot, node;
-        struct inode *inode = priv;
-        orphaned_slot = be32_to_cpu(resp->r_orphaned_slot);
-        node = be32_to_cpu(resp->r_hdr.h_node_num);
-        mlog(0, "node %d tells us that inode %llu is orphaned in slot %d\n",
-             node, (unsigned long long)OCFS2_I(inode)->ip_blkno,
-             orphaned_slot);
-        /* The other node may not actually know which slot the inode
-         * is orphaned in. */
-        if (orphaned_slot == OCFS2_INVALID_SLOT)
-                return;
-        /* Ok, the responding node knows which slot this inode is
-         * orphaned in. We verify that the information is correct and
-         * then record this in the inode. ocfs2_delete_inode will use
-         * this information to determine which lock to take. */
-        spin_lock(&OCFS2_I(inode)->ip_lock);
-        mlog_bug_on_msg(OCFS2_I(inode)->ip_orphaned_slot != orphaned_slot &&
-                        OCFS2_I(inode)->ip_orphaned_slot
-                        != OCFS2_INVALID_SLOT, "Inode %llu: Node %d says it's "
-                        "orphaned in slot %d, we think it's in %d\n",
-                        (unsigned long long)OCFS2_I(inode)->ip_blkno,
-                        be32_to_cpu(resp->r_hdr.h_node_num),
-                        orphaned_slot, OCFS2_I(inode)->ip_orphaned_slot);
-        OCFS2_I(inode)->ip_orphaned_slot = orphaned_slot;
-        spin_unlock(&OCFS2_I(inode)->ip_lock);
-}
-int ocfs2_request_delete_vote(struct inode *inode)
-{
-        int orphaned_slot, status;
-        struct ocfs2_net_response_cb delete_cb;
-        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-        struct ocfs2_vote_msg *request;
-        spin_lock(&OCFS2_I(inode)->ip_lock);
-        orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot;
-        spin_unlock(&OCFS2_I(inode)->ip_lock);
-        delete_cb.rc_cb = ocfs2_delete_response_cb;
-        delete_cb.rc_priv = inode;
-        mlog(0, "Inode %llu, we start thinking orphaned slot is %d\n",
-             (unsigned long long)OCFS2_I(inode)->ip_blkno, orphaned_slot);
-        status = -ENOMEM;
-        request = ocfs2_new_vote_request(osb, OCFS2_I(inode)->ip_blkno,
-                                         inode->i_generation,
-                                         OCFS2_VOTE_REQ_DELETE, orphaned_slot);
-        if (request) {
-                status = ocfs2_request_vote(inode, request, &delete_cb);
-                kfree(request);
-        }
-        return status;
-}
 int ocfs2_request_mount_vote(struct ocfs2_super *osb)
 {
        int status;
        struct ocfs2_vote_msg *request = NULL;
-        request = ocfs2_new_vote_request(osb, 0ULL, 0,
+        request = ocfs2_new_vote_request(osb, 0ULL, 0, OCFS2_VOTE_REQ_MOUNT);
-                                         OCFS2_VOTE_REQ_MOUNT, 0);
        if (!request) {
                status = -ENOMEM;
                goto bail;
@@ -821,8 +550,7 @@ int ocfs2_request_umount_vote(struct ocfs2_super *osb)
        int status;
        struct ocfs2_vote_msg *request = NULL;
-        request = ocfs2_new_vote_request(osb, 0ULL, 0,
+        request = ocfs2_new_vote_request(osb, 0ULL, 0, OCFS2_VOTE_REQ_UMOUNT);
-                                         OCFS2_VOTE_REQ_UMOUNT, 0);
        if (!request) {
                status = -ENOMEM;
                goto bail;
@@ -969,7 +697,6 @@ static int ocfs2_handle_vote_message(struct o2net_msg *msg,
             be32_to_cpu(work->w_msg.v_hdr.h_generation));
        mlog(0, "h_node_num = %u\n",
             be32_to_cpu(work->w_msg.v_hdr.h_node_num));
-        mlog(0, "v_generic1 = %u\n", be32_to_cpu(work->w_msg.md1.v_generic1));
        spin_lock(&osb->vote_task_lock);
        list_add_tail(&work->w_list, &osb->vote_list);
diff --git a/fs/ocfs2/vote.h b/fs/ocfs2/vote.h
index 53ebc1c69e56..9ea46f62de31 100644
--- a/fs/ocfs2/vote.h
+++ b/fs/ocfs2/vote.h
@@ -38,14 +38,11 @@ static inline void ocfs2_kick_vote_thread(struct ocfs2_super *osb)
        wake_up(&osb->vote_event);
 }
-int ocfs2_request_delete_vote(struct inode *inode);
 int ocfs2_request_mount_vote(struct ocfs2_super *osb);
 int ocfs2_request_umount_vote(struct ocfs2_super *osb);
 int ocfs2_register_net_handlers(struct ocfs2_super *osb);
 void ocfs2_unregister_net_handlers(struct ocfs2_super *osb);
-void ocfs2_mark_inode_remotely_deleted(struct inode *inode);
 void ocfs2_remove_node_from_vote_queues(struct ocfs2_super *osb,
                                        int node_num);
 #endif
diff --git a/fs/partitions/Kconfig b/fs/partitions/Kconfig
index 74552c60b671..6e8bb66fe619 100644
--- a/fs/partitions/Kconfig
+++ b/fs/partitions/Kconfig
@@ -235,5 +235,4 @@ config EFI_PARTITION
        select CRC32
        help
          Say Y here if you would like to use hard disks under Linux which
-          were partitioned using EFI GPT.  Presently only useful on the
+          were partitioned using EFI GPT.
-          IA-64 platform.
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index ac32a2e8540c..8a7d0035ad7a 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -180,7 +180,7 @@ check_partition(struct gendisk *hd, struct block_device *bdev)
        }
        if (res > 0)
                return state;
-        if (!err)
+        if (err)
        /* The partition is unrecognized. So report I/O errors if there were any */
                res = err;
        if (!res)
@@ -358,8 +358,7 @@ void delete_partition(struct gendisk *disk, int part)
        p->ios[0] = p->ios[1] = 0;
        p->sectors[0] = p->sectors[1] = 0;
        sysfs_remove_link(&p->kobj, "subsystem");
-        if (p->holder_dir)
+        kobject_unregister(p->holder_dir);
-                kobject_unregister(p->holder_dir);
        kobject_uevent(&p->kobj, KOBJ_REMOVE);
        kobject_del(&p->kobj);
        kobject_put(&p->kobj);
@@ -542,7 +541,7 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
        if (!get_capacity(disk) || !(state = check_partition(disk, bdev)))
                return 0;
        if (IS_ERR(state))      /* I/O error reading the partition table */
-                return PTR_ERR(state);
+                return -EIO;
        for (p = 1; p < state->limit; p++) {
                sector_t size = state->parts[p].size;
                sector_t from = state->parts[p].from;
@@ -603,10 +602,8 @@ void del_gendisk(struct gendisk *disk)
        disk->stamp = 0;
        kobject_uevent(&disk->kobj, KOBJ_REMOVE);
-        if (disk->holder_dir)
+        kobject_unregister(disk->holder_dir);
-                kobject_unregister(disk->holder_dir);
+        kobject_unregister(disk->slave_dir);
-        if (disk->slave_dir)
-                kobject_unregister(disk->slave_dir);
        if (disk->driverfs_dev) {
                char *disk_name = make_block_name(disk);
                sysfs_remove_link(&disk->kobj, "device");
diff --git a/fs/pipe.c b/fs/pipe.c
index 68090e84f589..ebafde7d6aba 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -16,6 +16,7 @@
 #include <linux/uio.h>
 #include <linux/highmem.h>
 #include <linux/pagemap.h>
+#include <linux/audit.h>
 #include <asm/uaccess.h>
 #include <asm/ioctls.h>
@@ -985,6 +986,10 @@ int do_pipe(int *fd)
                goto err_fdr;
        fdw = error;
+        error = audit_fd_pair(fdr, fdw);
+        if (error < 0)
+                goto err_fdw;
        fd_install(fdr, fr);
        fd_install(fdw, fw);
        fd[0] = fdr;
@@ -992,6 +997,8 @@ int do_pipe(int *fd)
        return 0;
+ err_fdw:
+        put_unused_fd(fdw);
 err_fdr:
        put_unused_fd(fdr);
 err_read_pipe:
diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index a6b3a8f878f0..bce38e3f06cb 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -8,8 +8,9 @@ proc-y			:= nommu.o task_nommu.o
 proc-$(CONFIG_MMU)      := mmu.o task_mmu.o
 proc-y       += inode.o root.o base.o generic.o array.o \
-                proc_tty.o proc_misc.o proc_sysctl.o
+                proc_tty.o proc_misc.o
+proc-$(CONFIG_PROC_SYSCTL)      += proc_sysctl.o
 proc-$(CONFIG_PROC_KCORE)       += kcore.o
 proc-$(CONFIG_PROC_VMCORE)      += vmcore.o
 proc-$(CONFIG_PROC_DEVICETREE)  += proc_devtree.o
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 4f5745af8c19..989af5e55d1b 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -581,7 +581,7 @@ out_no_task:
 #ifndef mem_write
 /* This is a security hazard */
-static ssize_t mem_write(struct file * file, const char * buf,
+static ssize_t mem_write(struct file * file, const char __user *buf,
                         size_t count, loff_t *ppos)
 {
        int copied;
@@ -1558,29 +1558,20 @@ static ssize_t proc_pid_attr_read(struct file * file, char __user * buf,
                                  size_t count, loff_t *ppos)
 {
        struct inode * inode = file->f_path.dentry->d_inode;
-        unsigned long page;
+        char *p = NULL;
        ssize_t length;
        struct task_struct *task = get_proc_task(inode);
-        length = -ESRCH;
        if (!task)
-                goto out_no_task;
+                return -ESRCH;
-        if (count > PAGE_SIZE)
-                count = PAGE_SIZE;
-        length = -ENOMEM;
-        if (!(page = __get_free_page(GFP_KERNEL)))
-                goto out;
        length = security_getprocattr(task,
                                      (char*)file->f_path.dentry->d_name.name,
-                                      (void*)page, count);
+                                      &p);
-        if (length >= 0)
-                length = simple_read_from_buffer(buf, count, ppos, (char *)page, length);
-        free_page(page);
-out:
        put_task_struct(task);
-out_no_task:
+        if (length > 0)
+                length = simple_read_from_buffer(buf, count, ppos, p, length);
+        kfree(p);
        return length;
 }
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index c932aa65e198..f771889183c3 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -11,7 +11,11 @@
 #include <linux/proc_fs.h>
+#ifdef CONFIG_PROC_SYSCTL
 extern int proc_sys_init(void);
+#else
+static inline void proc_sys_init(void) { }
+#endif
 struct vmalloc_info {
        unsigned long   used;
diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c
index abdf068bc27f..eca471bc8512 100644
--- a/fs/proc/proc_devtree.c
+++ b/fs/proc/proc_devtree.c
@@ -38,7 +38,7 @@ static int property_read_proc(char *page, char **start, off_t off,
                n = count;
        else
                *eof = 1;
-        memcpy(page, pp->value + off, n);
+        memcpy(page, (char *)pp->value + off, n);
        *start = page;
        return n;
 }
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 5834a744c2a9..41f17037f738 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -79,9 +79,7 @@ void __init proc_root_init(void)
        proc_device_tree_init();
 #endif
        proc_bus = proc_mkdir("bus", NULL);
-#ifdef CONFIG_SYSCTL
        proc_sys_init();
-#endif
 }
 static int proc_root_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat
diff --git a/fs/reiserfs/item_ops.c b/fs/reiserfs/item_ops.c
index b9b423b22a8b..9475557ab499 100644
--- a/fs/reiserfs/item_ops.c
+++ b/fs/reiserfs/item_ops.c
@@ -23,7 +23,7 @@ static void sd_decrement_key(struct cpu_key *key)
 {
        key->on_disk_key.k_objectid--;
        set_cpu_key_k_type(key, TYPE_ANY);
-        set_cpu_key_k_offset(key, (loff_t) (-1));
+        set_cpu_key_k_offset(key, (loff_t)(~0ULL >> 1));
 }
 static int sd_is_left_mergeable(struct reiserfs_key *key, unsigned long bsize)
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index f01389fd162e..2cac56210e2b 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -54,82 +54,48 @@
 static struct reiserfs_xattr_handler *find_xattr_handler_prefix(const char
                                                                *prefix);
-static struct dentry *create_xa_root(struct super_block *sb)
+/* Returns the dentry referring to the root of the extended attribute
+ * directory tree. If it has already been retrieved, it is used. If it
+ * hasn't been created and the flags indicate creation is allowed, we
+ * attempt to create it. On error, we return a pointer-encoded error.
+ */
+static struct dentry *get_xa_root(struct super_block *sb, int flags)
 {
        struct dentry *privroot = dget(REISERFS_SB(sb)->priv_root);
        struct dentry *xaroot;
        /* This needs to be created at mount-time */
        if (!privroot)
-                return ERR_PTR(-EOPNOTSUPP);
+                return ERR_PTR(-ENODATA);
-        xaroot = lookup_one_len(XAROOT_NAME, privroot, strlen(XAROOT_NAME));
+        mutex_lock_nested(&privroot->d_inode->i_mutex, I_MUTEX_XATTR);
-        if (IS_ERR(xaroot)) {
+        if (REISERFS_SB(sb)->xattr_root) {
+                xaroot = dget(REISERFS_SB(sb)->xattr_root);
                goto out;
-        } else if (!xaroot->d_inode) {
-                int err;
-                mutex_lock(&privroot->d_inode->i_mutex);
-                err =
-                    privroot->d_inode->i_op->mkdir(privroot->d_inode, xaroot,
-                                                   0700);
-                mutex_unlock(&privroot->d_inode->i_mutex);
-                if (err) {
-                        dput(xaroot);
-                        dput(privroot);
-                        return ERR_PTR(err);
-                }
-                REISERFS_SB(sb)->xattr_root = dget(xaroot);
        }
-      out:
-        dput(privroot);
-        return xaroot;
-}
-/* This will return a dentry, or error, refering to the xa root directory.
- * If the xa root doesn't exist yet, the dentry will be returned without
- * an associated inode. This dentry can be used with ->mkdir to create
- * the xa directory. */
-static struct dentry *__get_xa_root(struct super_block *s)
-{
-        struct dentry *privroot = dget(REISERFS_SB(s)->priv_root);
-        struct dentry *xaroot = NULL;
-        if (IS_ERR(privroot) || !privroot)
-                return privroot;
        xaroot = lookup_one_len(XAROOT_NAME, privroot, strlen(XAROOT_NAME));
        if (IS_ERR(xaroot)) {
                goto out;
        } else if (!xaroot->d_inode) {
-                dput(xaroot);
+                int err = -ENODATA;
-                xaroot = NULL;
+                if (flags == 0 || flags & XATTR_CREATE)
-                goto out;
+                        err = privroot->d_inode->i_op->mkdir(privroot->d_inode,
+                                                             xaroot, 0700);
+                if (err) {
+                        dput(xaroot);
+                        xaroot = ERR_PTR(err);
+                        goto out;
+                }
        }
+        REISERFS_SB(sb)->xattr_root = dget(xaroot);
-        REISERFS_SB(s)->xattr_root = dget(xaroot);
      out:
+        mutex_unlock(&privroot->d_inode->i_mutex);
        dput(privroot);
        return xaroot;
 }
-/* Returns the dentry (or NULL) referring to the root of the extended
- * attribute directory tree. If it has already been retrieved, it is used.
- * Otherwise, we attempt to retrieve it from disk. It may also return
- * a pointer-encoded error.
- */
-static inline struct dentry *get_xa_root(struct super_block *s)
-{
-        struct dentry *dentry = dget(REISERFS_SB(s)->xattr_root);
-        if (!dentry)
-                dentry = __get_xa_root(s);
-        return dentry;
-}
 /* Opens the directory corresponding to the inode's extended attribute store.
 * If flags allow, the tree to the directory may be created. If creation is
 * prohibited, -ENODATA is returned. */
@@ -138,21 +104,11 @@ static struct dentry *open_xa_dir(const struct inode *inode, int flags)
        struct dentry *xaroot, *xadir;
        char namebuf[17];
-        xaroot = get_xa_root(inode->i_sb);
+        xaroot = get_xa_root(inode->i_sb, flags);
-        if (IS_ERR(xaroot)) {
+        if (IS_ERR(xaroot))
                return xaroot;
-        } else if (!xaroot) {
-                if (flags == 0 || flags & XATTR_CREATE) {
-                        xaroot = create_xa_root(inode->i_sb);
-                        if (IS_ERR(xaroot))
-                                return xaroot;
-                }
-                if (!xaroot)
-                        return ERR_PTR(-ENODATA);
-        }
        /* ok, we have xaroot open */
        snprintf(namebuf, sizeof(namebuf), "%X.%X",
                 le32_to_cpu(INODE_PKEY(inode)->k_objectid),
                 inode->i_generation);
@@ -821,7 +777,7 @@ int reiserfs_delete_xattrs(struct inode *inode)
        /* Leftovers besides . and .. -- that's not good. */
        if (dir->d_inode->i_nlink <= 2) {
-                root = get_xa_root(inode->i_sb);
+                root = get_xa_root(inode->i_sb, XATTR_REPLACE);
                reiserfs_write_lock_xattrs(inode->i_sb);
                err = vfs_rmdir(root->d_inode, dir);
                reiserfs_write_unlock_xattrs(inode->i_sb);
diff --git a/fs/smbfs/request.c b/fs/smbfs/request.c
index 42261dbdf60f..723f7c667661 100644
--- a/fs/smbfs/request.c
+++ b/fs/smbfs/request.c
@@ -181,6 +181,7 @@ static int smb_setup_request(struct smb_request *req)
        req->rq_errno = 0;
        req->rq_fragment = 0;
        kfree(req->rq_trans2buffer);
+        req->rq_trans2buffer = NULL;
        return 0;
 }
diff --git a/fs/splice.c b/fs/splice.c
index 2fca6ebf4cc2..5428b0ff3b6f 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -576,76 +576,21 @@ static int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
        if (this_len + offset > PAGE_CACHE_SIZE)
                this_len = PAGE_CACHE_SIZE - offset;
-        /*
-         * Reuse buf page, if SPLICE_F_MOVE is set and we are doing a full
-         * page.
-         */
-        if ((sd->flags & SPLICE_F_MOVE) && this_len == PAGE_CACHE_SIZE) {
-                /*
-                 * If steal succeeds, buf->page is now pruned from the
-                 * pagecache and we can reuse it. The page will also be
-                 * locked on successful return.
-                 */
-                if (buf->ops->steal(pipe, buf))
-                        goto find_page;
-                page = buf->page;
-                if (add_to_page_cache(page, mapping, index, GFP_KERNEL)) {
-                        unlock_page(page);
-                        goto find_page;
-                }
-                page_cache_get(page);
-                if (!(buf->flags & PIPE_BUF_FLAG_LRU))
-                        lru_cache_add(page);
-        } else {
 find_page:
-                page = find_lock_page(mapping, index);
+        page = find_lock_page(mapping, index);
-                if (!page) {
+        if (!page) {
-                        ret = -ENOMEM;
+                ret = -ENOMEM;
-                        page = page_cache_alloc_cold(mapping);
+                page = page_cache_alloc_cold(mapping);
-                        if (unlikely(!page))
+                if (unlikely(!page))
-                                goto out_ret;
+                        goto out_ret;
-                        /*
-                         * This will also lock the page
-                         */
-                        ret = add_to_page_cache_lru(page, mapping, index,
-                                                    GFP_KERNEL);
-                        if (unlikely(ret))
-                                goto out;
-                }
                /*
-                 * We get here with the page locked. If the page is also
+                 * This will also lock the page
-                 * uptodate, we don't need to do more. If it isn't, we
-                 * may need to bring it in if we are not going to overwrite
-                 * the full page.
                 */
-                if (!PageUptodate(page)) {
+                ret = add_to_page_cache_lru(page, mapping, index,
-                        if (this_len < PAGE_CACHE_SIZE) {
+                                            GFP_KERNEL);
-                                ret = mapping->a_ops->readpage(file, page);
+                if (unlikely(ret))
-                                if (unlikely(ret))
+                        goto out;
-                                        goto out;
-                                lock_page(page);
-                                if (!PageUptodate(page)) {
-                                        /*
-                                         * Page got invalidated, repeat.
-                                         */
-                                        if (!page->mapping) {
-                                                unlock_page(page);
-                                                page_cache_release(page);
-                                                goto find_page;
-                                        }
-                                        ret = -EIO;
-                                        goto out;
-                                }
-                        } else
-                                SetPageUptodate(page);
-                }
        }
        ret = mapping->a_ops->prepare_write(file, page, offset, offset+this_len);
@@ -682,18 +627,25 @@ find_page:
        }
        ret = mapping->a_ops->commit_write(file, page, offset, offset+this_len);
-        if (!ret) {
+        if (ret) {
+                if (ret == AOP_TRUNCATED_PAGE) {
+                        page_cache_release(page);
+                        goto find_page;
+                }
+                if (ret < 0)
+                        goto out;
                /*
-                 * Return the number of bytes written and mark page as
+                 * Partial write has happened, so 'ret' already initialized by
-                 * accessed, we are now done!
+                 * number of bytes written. There is nothing we have to do here.
                 */
+        } else
                ret = this_len;
-                mark_page_accessed(page);
+        /*
-                balance_dirty_pages_ratelimited(mapping);
+         * Return the number of bytes written and mark page as
-        } else if (ret == AOP_TRUNCATED_PAGE) {
+         * accessed, we are now done!
-                page_cache_release(page);
+         */
-                goto find_page;
+        mark_page_accessed(page);
-        }
+        balance_dirty_pages_ratelimited(mapping);
 out:
        page_cache_release(page);
        unlock_page(page);
@@ -706,9 +658,9 @@ out_ret:
 * key here is the 'actor' worker passed in that actually moves the data
 * to the wanted destination. See pipe_to_file/pipe_to_sendpage above.
 */
-static ssize_t __splice_from_pipe(struct pipe_inode_info *pipe,
+ssize_t __splice_from_pipe(struct pipe_inode_info *pipe,
-                                  struct file *out, loff_t *ppos, size_t len,
+                           struct file *out, loff_t *ppos, size_t len,
-                                  unsigned int flags, splice_actor *actor)
+                           unsigned int flags, splice_actor *actor)
 {
        int ret, do_wakeup, err;
        struct splice_desc sd;
@@ -802,6 +754,7 @@ static ssize_t __splice_from_pipe(struct pipe_inode_info *pipe,
        return ret;
 }
+EXPORT_SYMBOL(__splice_from_pipe);
 ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
                         loff_t *ppos, size_t len, unsigned int flags,
diff --git a/fs/stack.c b/fs/stack.c
index 8ffb880d2f46..67716f6a1a4a 100644
--- a/fs/stack.c
+++ b/fs/stack.c
@@ -20,11 +20,6 @@ EXPORT_SYMBOL_GPL(fsstack_copy_inode_size);
 void fsstack_copy_attr_all(struct inode *dest, const struct inode *src,
                                int (*get_nlinks)(struct inode *))
 {
-        if (!get_nlinks)
-                dest->i_nlink = src->i_nlink;
-        else
-                dest->i_nlink = (*get_nlinks)(dest);
        dest->i_mode = src->i_mode;
        dest->i_uid = src->i_uid;
        dest->i_gid = src->i_gid;
@@ -34,5 +29,14 @@ void fsstack_copy_attr_all(struct inode *dest, const struct inode *src,
        dest->i_ctime = src->i_ctime;
        dest->i_blkbits = src->i_blkbits;
        dest->i_flags = src->i_flags;
+        /*
+         * Update the nlinks AFTER updating the above fields, because the
+         * get_nlinks callback may depend on them.
+         */
+        if (!get_nlinks)
+                dest->i_nlink = src->i_nlink;
+        else
+                dest->i_nlink = (*get_nlinks)(dest);
 }
 EXPORT_SYMBOL_GPL(fsstack_copy_attr_all);
diff --git a/fs/super.c b/fs/super.c
index 60b1e50cbf53..8341e4e1d738 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -725,16 +725,6 @@ static int test_bdev_super(struct super_block *s, void *data)
        return (void *)s->s_bdev == data;
 }
-static void bdev_uevent(struct block_device *bdev, enum kobject_action action)
-{
-        if (bdev->bd_disk) {
-                if (bdev->bd_part)
-                        kobject_uevent(&bdev->bd_part->kobj, action);
-                else
-                        kobject_uevent(&bdev->bd_disk->kobj, action);
-        }
-}
 int get_sb_bdev(struct file_system_type *fs_type,
        int flags, const char *dev_name, void *data,
        int (*fill_super)(struct super_block *, void *, int),
@@ -782,7 +772,6 @@ int get_sb_bdev(struct file_system_type *fs_type,
                }
                s->s_flags |= MS_ACTIVE;
-                bdev_uevent(bdev, KOBJ_MOUNT);
        }
        return simple_set_mnt(mnt, s);
@@ -801,7 +790,6 @@ void kill_block_super(struct super_block *sb)
 {
        struct block_device *bdev = sb->s_bdev;
-        bdev_uevent(bdev, KOBJ_UMOUNT);
        generic_shutdown_super(sb);
        sync_blockdev(bdev);
        close_bdev_excl(bdev);
diff --git a/fs/sync.c b/fs/sync.c
index d0feff61e6aa..5cb9e7e43383 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -239,13 +239,11 @@ out:
 /*
 * `endbyte' is inclusive
 */
-int do_sync_file_range(struct file *file, loff_t offset, loff_t endbyte,
+int do_sync_mapping_range(struct address_space *mapping, loff_t offset,
-                        unsigned int flags)
+                          loff_t endbyte, unsigned int flags)
 {
        int ret;
-        struct address_space *mapping;
-        mapping = file->f_mapping;
        if (!mapping) {
                ret = -EINVAL;
                goto out;
@@ -275,4 +273,4 @@ int do_sync_file_range(struct file *file, loff_t offset, loff_t endbyte,
 out:
        return ret;
 }
-EXPORT_SYMBOL_GPL(do_sync_file_range);
+EXPORT_SYMBOL_GPL(do_sync_mapping_range);
diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c
index d3b9f5f07db1..8ea2a51ce883 100644
--- a/fs/sysfs/bin.c
+++ b/fs/sysfs/bin.c
@@ -59,7 +59,7 @@ read(struct file * file, char __user * userbuf, size_t count, loff_t * off)
        if (copy_to_user(userbuf, buffer, count))
                return -EFAULT;
-        pr_debug("offs = %lld, *off = %lld, count = %zd\n", offs, *off, count);
+        pr_debug("offs = %lld, *off = %lld, count = %d\n", offs, *off, count);
        *off = offs + count;
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 8813990304fe..85a668680f82 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -431,6 +431,8 @@ int sysfs_move_dir(struct kobject *kobj, struct kobject *new_parent)
        new_parent_dentry = new_parent ?
                new_parent->dentry : sysfs_mount->mnt_sb->s_root;
+        if (old_parent_dentry->d_inode == new_parent_dentry->d_inode)
+                return 0;       /* nothing to move */
 again:
        mutex_lock(&old_parent_dentry->d_inode->i_mutex);
        if (!mutex_trylock(&new_parent_dentry->d_inode->i_mutex)) {
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index c0e117649a4d..db0413a411d6 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -54,7 +54,7 @@ static struct sysfs_ops subsys_sysfs_ops = {
 /**
 *      add_to_collection - add buffer to a collection
 *      @buffer:        buffer to be added
- *      @node           inode of set to add to
+ *      @node:          inode of set to add to
 */
 static inline void
@@ -168,12 +168,12 @@ sysfs_read_file(struct file *file, char __user *buf, size_t count, loff_t *ppos)
        ssize_t retval = 0;
        down(&buffer->sem);
-        if (buffer->orphaned) {
-                retval = -ENODEV;
-                goto out;
-        }
        if (buffer->needs_read_fill) {
-                if ((retval = fill_read_buffer(file->f_path.dentry,buffer)))
+                if (buffer->orphaned)
+                        retval = -ENODEV;
+                else
+                        retval = fill_read_buffer(file->f_path.dentry,buffer);
+                if (retval)
                        goto out;
        }
        pr_debug("%s: count = %zd, ppos = %lld, buf = %s\n",
@@ -502,6 +502,30 @@ int sysfs_create_file(struct kobject * kobj, const struct attribute * attr)
 /**
+ * sysfs_add_file_to_group - add an attribute file to a pre-existing group.
+ * @kobj: object we're acting for.
+ * @attr: attribute descriptor.
+ * @group: group name.
+ */
+int sysfs_add_file_to_group(struct kobject *kobj,
+                const struct attribute *attr, const char *group)
+{
+        struct dentry *dir;
+        int error;
+        dir = lookup_one_len(group, kobj->dentry, strlen(group));
+        if (IS_ERR(dir))
+                error = PTR_ERR(dir);
+        else {
+                error = sysfs_add_file(dir, attr, SYSFS_KOBJ_ATTR);
+                dput(dir);
+        }
+        return error;
+}
+EXPORT_SYMBOL_GPL(sysfs_add_file_to_group);
+/**
 * sysfs_update_file - update the modified timestamp on an object attribute.
 * @kobj: object we're acting for.
 * @attr: attribute descriptor.
@@ -586,6 +610,88 @@ void sysfs_remove_file(struct kobject * kobj, const struct attribute * attr)
 }
+/**
+ * sysfs_remove_file_from_group - remove an attribute file from a group.
+ * @kobj: object we're acting for.
+ * @attr: attribute descriptor.
+ * @group: group name.
+ */
+void sysfs_remove_file_from_group(struct kobject *kobj,
+                const struct attribute *attr, const char *group)
+{
+        struct dentry *dir;
+        dir = lookup_one_len(group, kobj->dentry, strlen(group));
+        if (!IS_ERR(dir)) {
+                sysfs_hash_and_remove(dir, attr->name);
+                dput(dir);
+        }
+}
+EXPORT_SYMBOL_GPL(sysfs_remove_file_from_group);
+struct sysfs_schedule_callback_struct {
+        struct kobject          *kobj;
+        void                    (*func)(void *);
+        void                    *data;
+        struct module           *owner;
+        struct work_struct      work;
+};
+static void sysfs_schedule_callback_work(struct work_struct *work)
+{
+        struct sysfs_schedule_callback_struct *ss = container_of(work,
+                        struct sysfs_schedule_callback_struct, work);
+        (ss->func)(ss->data);
+        kobject_put(ss->kobj);
+        module_put(ss->owner);
+        kfree(ss);
+}
+/**
+ * sysfs_schedule_callback - helper to schedule a callback for a kobject
+ * @kobj: object we're acting for.
+ * @func: callback function to invoke later.
+ * @data: argument to pass to @func.
+ * @owner: module owning the callback code
+ *
+ * sysfs attribute methods must not unregister themselves or their parent
+ * kobject (which would amount to the same thing).  Attempts to do so will
+ * deadlock, since unregistration is mutually exclusive with driver
+ * callbacks.
+ *
+ * Instead methods can call this routine, which will attempt to allocate
+ * and schedule a workqueue request to call back @func with @data as its
+ * argument in the workqueue's process context.  @kobj will be pinned
+ * until @func returns.
+ *
+ * Returns 0 if the request was submitted, -ENOMEM if storage could not
+ * be allocated, -ENODEV if a reference to @owner isn't available.
+ */
+int sysfs_schedule_callback(struct kobject *kobj, void (*func)(void *),
+                void *data, struct module *owner)
+{
+        struct sysfs_schedule_callback_struct *ss;
+        if (!try_module_get(owner))
+                return -ENODEV;
+        ss = kmalloc(sizeof(*ss), GFP_KERNEL);
+        if (!ss) {
+                module_put(owner);
+                return -ENOMEM;
+        }
+        kobject_get(kobj);
+        ss->kobj = kobj;
+        ss->func = func;
+        ss->data = data;
+        ss->owner = owner;
+        INIT_WORK(&ss->work, sysfs_schedule_callback_work);
+        schedule_work(&ss->work);
+        return 0;
+}
+EXPORT_SYMBOL_GPL(sysfs_schedule_callback);
 EXPORT_SYMBOL_GPL(sysfs_create_file);
 EXPORT_SYMBOL_GPL(sysfs_remove_file);
 EXPORT_SYMBOL_GPL(sysfs_update_file);
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index b20951c93761..52eed2a7a5ef 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -70,9 +70,11 @@ void sysfs_remove_group(struct kobject * kobj,
 {
        struct dentry * dir;
-        if (grp->name)
+        if (grp->name) {
-                dir = lookup_one_len(grp->name, kobj->dentry,
+                dir = lookup_one_len_kern(grp->name, kobj->dentry,
                                strlen(grp->name));
+                BUG_ON(IS_ERR(dir));
+        }
        else
                dir = dget(kobj->dentry);
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index dd1344b007f5..4de5c6b89918 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -222,11 +222,12 @@ const unsigned char * sysfs_get_name(struct sysfs_dirent *sd)
 static inline void orphan_all_buffers(struct inode *node)
 {
-        struct sysfs_buffer_collection *set = node->i_private;
+        struct sysfs_buffer_collection *set;
        struct sysfs_buffer *buf;
        mutex_lock_nested(&node->i_mutex, I_MUTEX_CHILD);
-        if (node->i_private) {
+        set = node->i_private;
+        if (set) {
                list_for_each_entry(buf, &set->associates, associates) {
                        down(&buf->sem);
                        buf->orphaned = 1;
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index d976b0005549..a77c57e5a6d5 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -1,3 +1,14 @@
+struct sysfs_dirent {
+        atomic_t                s_count;
+        struct list_head        s_sibling;
+        struct list_head        s_children;
+        void                    * s_element;
+        int                     s_type;
+        umode_t                 s_mode;
+        struct dentry           * s_dentry;
+        struct iattr            * s_iattr;
+        atomic_t                s_event;
+};
 extern struct vfsmount * sysfs_mount;
 extern struct kmem_cache *sysfs_dir_cachep;
diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index bcc44084e004..841ac25fd950 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -244,62 +244,87 @@ failed:
 * We can come here from ufs_writepage or ufs_prepare_write,
 * locked_page is argument of these functions, so we already lock it.
 */
-static void ufs_change_blocknr(struct inode *inode, unsigned int beg,
+static void ufs_change_blocknr(struct inode *inode, sector_t beg,
-                               unsigned int count, unsigned int oldb,
+                               unsigned int count, sector_t oldb,
-                               unsigned int newb, struct page *locked_page)
+                               sector_t newb, struct page *locked_page)
 {
-        const unsigned mask = (1 << (PAGE_CACHE_SHIFT - inode->i_blkbits)) - 1;
+        const unsigned blks_per_page =
+                1 << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+        const unsigned mask = blks_per_page - 1;
        struct address_space * const mapping = inode->i_mapping;
-        pgoff_t index, cur_index;
+        pgoff_t index, cur_index, last_index;
-        unsigned end, pos, j;
+        unsigned pos, j, lblock;
+        sector_t end, i;
        struct page *page;
        struct buffer_head *head, *bh;
-        UFSD("ENTER, ino %lu, count %u, oldb %u, newb %u\n",
+        UFSD("ENTER, ino %lu, count %u, oldb %llu, newb %llu\n",
-              inode->i_ino, count, oldb, newb);
+              inode->i_ino, count,
+             (unsigned long long)oldb, (unsigned long long)newb);
        BUG_ON(!locked_page);
        BUG_ON(!PageLocked(locked_page));
        cur_index = locked_page->index;
+        end = count + beg;
-        for (end = count + beg; beg < end; beg = (beg | mask) + 1) {
+        last_index = end >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
-                index = beg >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
+        for (i = beg; i < end; i = (i | mask) + 1) {
+                index = i >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
                if (likely(cur_index != index)) {
                        page = ufs_get_locked_page(mapping, index);
-                        if (!page || IS_ERR(page)) /* it was truncated or EIO */
+                        if (!page)/* it was truncated */
+                                continue;
+                        if (IS_ERR(page)) {/* or EIO */
+                                ufs_error(inode->i_sb, __FUNCTION__,
+                                          "read of page %llu failed\n",
+                                          (unsigned long long)index);
                                continue;
+                        }
                } else
                        page = locked_page;
                head = page_buffers(page);
                bh = head;
-                pos = beg & mask;
+                pos = i & mask;
                for (j = 0; j < pos; ++j)
                        bh = bh->b_this_page;
-                j = 0;
+                if (unlikely(index == last_index))
+                        lblock = end & mask;
+                else
+                        lblock = blks_per_page;
                do {
-                        if (buffer_mapped(bh)) {
+                        if (j >= lblock)
-                                pos = bh->b_blocknr - oldb;
+                                break;
-                                if (pos < count) {
+                        pos = (i - beg) + j;
-                                        UFSD(" change from %llu to %llu\n",
-                                             (unsigned long long)pos + oldb,
+                        if (!buffer_mapped(bh))
-                                             (unsigned long long)pos + newb);
+                                        map_bh(bh, inode->i_sb, oldb + pos);
-                                        bh->b_blocknr = newb + pos;
+                        if (!buffer_uptodate(bh)) {
-                                        unmap_underlying_metadata(bh->b_bdev,
+                                ll_rw_block(READ, 1, &bh);
-                                                                  bh->b_blocknr);
+                                wait_on_buffer(bh);
-                                        mark_buffer_dirty(bh);
+                                if (!buffer_uptodate(bh)) {
-                                        ++j;
+                                        ufs_error(inode->i_sb, __FUNCTION__,
+                                                  "read of block failed\n");
+                                        break;
                                }
                        }
+                        UFSD(" change from %llu to %llu, pos %u\n",
+                             (unsigned long long)pos + oldb,
+                             (unsigned long long)pos + newb, pos);
+                        bh->b_blocknr = newb + pos;
+                        unmap_underlying_metadata(bh->b_bdev,
+                                                  bh->b_blocknr);
+                        mark_buffer_dirty(bh);
+                        ++j;
                        bh = bh->b_this_page;
                } while (bh != head);
-                if (j)
-                        set_page_dirty(page);
                if (likely(cur_index != index))
                        ufs_put_locked_page(page);
        }
@@ -457,8 +482,9 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment,
        if (result) {
                ufs_clear_frags(inode, result + oldcount, newcount - oldcount,
                                locked_page != NULL);
-                ufs_change_blocknr(inode, fragment - oldcount, oldcount, tmp,
+                ufs_change_blocknr(inode, fragment - oldcount, oldcount,
-                                   result, locked_page);
+                                   uspi->s_sbbase + tmp,
+                                   uspi->s_sbbase + result, locked_page);
                ufs_cpu_to_data_ptr(sb, p, result);
                *err = 0;
                UFS_I(inode)->i_lastfrag = max_t(u32, UFS_I(inode)->i_lastfrag, fragment + count);
diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c
index b868878009b6..c28a8b6f2feb 100644
--- a/fs/ufs/ialloc.c
+++ b/fs/ufs/ialloc.c
@@ -343,9 +343,8 @@ cg_found:
                lock_buffer(bh);
                ufs2_inode = (struct ufs2_inode *)bh->b_data;
                ufs2_inode += ufs_inotofsbo(inode->i_ino);
-                ufs2_inode->ui_birthtime.tv_sec =
+                ufs2_inode->ui_birthtime = cpu_to_fs64(sb, CURRENT_TIME.tv_sec);
-                        cpu_to_fs32(sb, CURRENT_TIME_SEC.tv_sec);
+                ufs2_inode->ui_birthnsec = cpu_to_fs32(sb, CURRENT_TIME.tv_nsec);
-                ufs2_inode->ui_birthtime.tv_usec = 0;
                mark_buffer_dirty(bh);
                unlock_buffer(bh);
                if (sb->s_flags & MS_SYNCHRONOUS)
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index fb34ad03e224..f18b79122fa3 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -212,7 +212,7 @@ repeat:
                        brelse (result);
                        goto repeat;
                } else {
-                        *phys = tmp + blockoff;
+                        *phys = uspi->s_sbbase + tmp + blockoff;
                        return NULL;
                }
        }
@@ -282,9 +282,9 @@ repeat:
        }
        if (!phys) {
-                result = sb_getblk(sb, tmp + blockoff);
+                result = sb_getblk(sb, uspi->s_sbbase + tmp + blockoff);
        } else {
-                *phys = tmp + blockoff;
+                *phys = uspi->s_sbbase + tmp + blockoff;
                result = NULL;
                *err = 0;
                *new = 1;
@@ -368,7 +368,7 @@ repeat:
                        brelse (result);
                        goto repeat;
                } else {
-                        *phys = tmp + blockoff;
+                        *phys = uspi->s_sbbase + tmp + blockoff;
                        goto out;
                }
        }
@@ -389,9 +389,9 @@ repeat:
        if (!phys) {
-                result = sb_getblk(sb, tmp + blockoff);
+                result = sb_getblk(sb, uspi->s_sbbase + tmp + blockoff);
        } else {
-                *phys = tmp + blockoff;
+                *phys = uspi->s_sbbase + tmp + blockoff;
                *new = 1;
        }
@@ -601,7 +601,7 @@ static void ufs_set_inode_ops(struct inode *inode)
                                   ufs_get_inode_dev(inode->i_sb, UFS_I(inode)));
 }
-static void ufs1_read_inode(struct inode *inode, struct ufs_inode *ufs_inode)
+static int ufs1_read_inode(struct inode *inode, struct ufs_inode *ufs_inode)
 {
        struct ufs_inode_info *ufsi = UFS_I(inode);
        struct super_block *sb = inode->i_sb;
@@ -613,8 +613,10 @@ static void ufs1_read_inode(struct inode *inode, struct ufs_inode *ufs_inode)
         */
        inode->i_mode = mode = fs16_to_cpu(sb, ufs_inode->ui_mode);
        inode->i_nlink = fs16_to_cpu(sb, ufs_inode->ui_nlink);
-        if (inode->i_nlink == 0)
+        if (inode->i_nlink == 0) {
                ufs_error (sb, "ufs_read_inode", "inode %lu has zero nlink\n", inode->i_ino);
+                return -1;
+        }
        
        /*
         * Linux now has 32-bit uid and gid, so we can support EFT.
@@ -643,9 +645,10 @@ static void ufs1_read_inode(struct inode *inode, struct ufs_inode *ufs_inode)
                for (i = 0; i < (UFS_NDADDR + UFS_NINDIR) * 4; i++)
                        ufsi->i_u1.i_symlink[i] = ufs_inode->ui_u2.ui_symlink[i];
        }
+        return 0;
 }
-static void ufs2_read_inode(struct inode *inode, struct ufs2_inode *ufs2_inode)
+static int ufs2_read_inode(struct inode *inode, struct ufs2_inode *ufs2_inode)
 {
        struct ufs_inode_info *ufsi = UFS_I(inode);
        struct super_block *sb = inode->i_sb;
@@ -658,8 +661,10 @@ static void ufs2_read_inode(struct inode *inode, struct ufs2_inode *ufs2_inode)
         */
        inode->i_mode = mode = fs16_to_cpu(sb, ufs2_inode->ui_mode);
        inode->i_nlink = fs16_to_cpu(sb, ufs2_inode->ui_nlink);
-        if (inode->i_nlink == 0)
+        if (inode->i_nlink == 0) {
                ufs_error (sb, "ufs_read_inode", "inode %lu has zero nlink\n", inode->i_ino);
+                return -1;
+        }
        /*
         * Linux now has 32-bit uid and gid, so we can support EFT.
@@ -668,12 +673,12 @@ static void ufs2_read_inode(struct inode *inode, struct ufs2_inode *ufs2_inode)
        inode->i_gid = fs32_to_cpu(sb, ufs2_inode->ui_gid);
        inode->i_size = fs64_to_cpu(sb, ufs2_inode->ui_size);
-        inode->i_atime.tv_sec = fs32_to_cpu(sb, ufs2_inode->ui_atime.tv_sec);
+        inode->i_atime.tv_sec = fs64_to_cpu(sb, ufs2_inode->ui_atime);
-        inode->i_ctime.tv_sec = fs32_to_cpu(sb, ufs2_inode->ui_ctime.tv_sec);
+        inode->i_ctime.tv_sec = fs64_to_cpu(sb, ufs2_inode->ui_ctime);
-        inode->i_mtime.tv_sec = fs32_to_cpu(sb, ufs2_inode->ui_mtime.tv_sec);
+        inode->i_mtime.tv_sec = fs64_to_cpu(sb, ufs2_inode->ui_mtime);
-        inode->i_mtime.tv_nsec = 0;
+        inode->i_atime.tv_nsec = fs32_to_cpu(sb, ufs2_inode->ui_atimensec);
-        inode->i_atime.tv_nsec = 0;
+        inode->i_ctime.tv_nsec = fs32_to_cpu(sb, ufs2_inode->ui_ctimensec);
-        inode->i_ctime.tv_nsec = 0;
+        inode->i_mtime.tv_nsec = fs32_to_cpu(sb, ufs2_inode->ui_mtimensec);
        inode->i_blocks = fs64_to_cpu(sb, ufs2_inode->ui_blocks);
        inode->i_generation = fs32_to_cpu(sb, ufs2_inode->ui_gen);
        ufsi->i_flags = fs32_to_cpu(sb, ufs2_inode->ui_flags);
@@ -690,6 +695,7 @@ static void ufs2_read_inode(struct inode *inode, struct ufs2_inode *ufs2_inode)
                for (i = 0; i < (UFS_NDADDR + UFS_NINDIR) * 4; i++)
                        ufsi->i_u1.i_symlink[i] = ufs2_inode->ui_u2.ui_symlink[i];
        }
+        return 0;
 }
 void ufs_read_inode(struct inode * inode)
@@ -698,6 +704,7 @@ void ufs_read_inode(struct inode * inode)
        struct super_block * sb;
        struct ufs_sb_private_info * uspi;
        struct buffer_head * bh;
+        int err;
        UFSD("ENTER, ino %lu\n", inode->i_ino);
@@ -720,14 +727,17 @@ void ufs_read_inode(struct inode * inode)
        if ((UFS_SB(sb)->s_flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) {
                struct ufs2_inode *ufs2_inode = (struct ufs2_inode *)bh->b_data;
-                ufs2_read_inode(inode,
+                err = ufs2_read_inode(inode,
-                                ufs2_inode + ufs_inotofsbo(inode->i_ino));
+                                      ufs2_inode + ufs_inotofsbo(inode->i_ino));
        } else {
                struct ufs_inode *ufs_inode = (struct ufs_inode *)bh->b_data;
-                ufs1_read_inode(inode, ufs_inode + ufs_inotofsbo(inode->i_ino));
+                err = ufs1_read_inode(inode,
+                                      ufs_inode + ufs_inotofsbo(inode->i_ino));
        }
+        if (err)
+                goto bad_inode;
        inode->i_version++;
        ufsi->i_lastfrag =
                (inode->i_size + uspi->s_fsize - 1) >> uspi->s_fshift;
@@ -803,12 +813,12 @@ static void ufs2_update_inode(struct inode *inode, struct ufs2_inode *ufs_inode)
        ufs_inode->ui_gid = cpu_to_fs32(sb, inode->i_gid);
        ufs_inode->ui_size = cpu_to_fs64(sb, inode->i_size);
-        ufs_inode->ui_atime.tv_sec = cpu_to_fs32(sb, inode->i_atime.tv_sec);
+        ufs_inode->ui_atime = cpu_to_fs64(sb, inode->i_atime.tv_sec);
-        ufs_inode->ui_atime.tv_usec = 0;
+        ufs_inode->ui_atimensec = cpu_to_fs32(sb, inode->i_atime.tv_nsec);
-        ufs_inode->ui_ctime.tv_sec = cpu_to_fs32(sb, inode->i_ctime.tv_sec);
+        ufs_inode->ui_ctime = cpu_to_fs64(sb, inode->i_ctime.tv_sec);
-        ufs_inode->ui_ctime.tv_usec = 0;
+        ufs_inode->ui_ctimensec = cpu_to_fs32(sb, inode->i_ctime.tv_nsec);
-        ufs_inode->ui_mtime.tv_sec = cpu_to_fs32(sb, inode->i_mtime.tv_sec);
+        ufs_inode->ui_mtime = cpu_to_fs64(sb, inode->i_mtime.tv_sec);
-        ufs_inode->ui_mtime.tv_usec = 0;
+        ufs_inode->ui_mtimensec = cpu_to_fs32(sb, inode->i_mtime.tv_nsec);
        ufs_inode->ui_blocks = cpu_to_fs64(sb, inode->i_blocks);
        ufs_inode->ui_flags = cpu_to_fs32(sb, ufsi->i_flags);
@@ -888,6 +898,8 @@ void ufs_delete_inode (struct inode * inode)
        loff_t old_i_size;
        truncate_inode_pages(&inode->i_data, 0);
+        if (is_bad_inode(inode))
+                goto no_delete;
        /*UFS_I(inode)->i_dtime = CURRENT_TIME;*/
        lock_kernel();
        mark_inode_dirty(inode);
@@ -898,4 +910,7 @@ void ufs_delete_inode (struct inode * inode)
                ufs_warning(inode->i_sb, __FUNCTION__, "ufs_truncate failed\n");
        ufs_free_inode (inode);
        unlock_kernel();
+        return;
+no_delete:
+        clear_inode(inode);     /* We must guarantee clearing of inode... */
 }
diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c
index 749581fa7729..79c54c85fb58 100644
--- a/fs/ufs/truncate.c
+++ b/fs/ufs/truncate.c
@@ -74,7 +74,7 @@ static int ufs_trunc_direct(struct inode *inode)
        unsigned i, tmp;
        int retry;
        
-        UFSD("ENTER\n");
+        UFSD("ENTER: ino %lu\n", inode->i_ino);
        sb = inode->i_sb;
        uspi = UFS_SB(sb)->s_uspi;
@@ -96,8 +96,8 @@ static int ufs_trunc_direct(struct inode *inode)
                block2 = ufs_fragstoblks (frag3);
        }
-        UFSD("frag1 %llu, frag2 %llu, block1 %llu, block2 %llu, frag3 %llu,"
+        UFSD("ino %lu, frag1 %llu, frag2 %llu, block1 %llu, block2 %llu,"
-             " frag4 %llu\n",
+             " frag3 %llu, frag4 %llu\n", inode->i_ino,
             (unsigned long long)frag1, (unsigned long long)frag2,
             (unsigned long long)block1, (unsigned long long)block2,
             (unsigned long long)frag3, (unsigned long long)frag4);
@@ -163,7 +163,7 @@ next1:
        mark_inode_dirty(inode);
 next3:
-        UFSD("EXIT\n");
+        UFSD("EXIT: ino %lu\n", inode->i_ino);
        return retry;
 }
@@ -248,7 +248,7 @@ static int ufs_trunc_indirect(struct inode *inode, u64 offset, void *p)
        }
        ubh_brelse (ind_ubh);
        
-        UFSD("EXIT\n");
+        UFSD("EXIT: ino %lu\n", inode->i_ino);
        
        return retry;
 }
@@ -262,7 +262,7 @@ static int ufs_trunc_dindirect(struct inode *inode, u64 offset, void *p)
        void *dind;
        int retry = 0;
        
-        UFSD("ENTER\n");
+        UFSD("ENTER: ino %lu\n", inode->i_ino);
        
        sb = inode->i_sb;
        uspi = UFS_SB(sb)->s_uspi;
@@ -312,7 +312,7 @@ static int ufs_trunc_dindirect(struct inode *inode, u64 offset, void *p)
        }
        ubh_brelse (dind_bh);
        
-        UFSD("EXIT\n");
+        UFSD("EXIT: ino %lu\n", inode->i_ino);
        
        return retry;
 }
@@ -327,7 +327,7 @@ static int ufs_trunc_tindirect(struct inode *inode)
        void *tind, *p;
        int retry;
        
-        UFSD("ENTER\n");
+        UFSD("ENTER: ino %lu\n", inode->i_ino);
        retry = 0;
        
@@ -348,7 +348,7 @@ static int ufs_trunc_tindirect(struct inode *inode)
        }
        for (i = tindirect_block ; i < uspi->s_apb ; i++) {
-                tind = ubh_get_addr32 (tind_bh, i);
+                tind = ubh_get_data_ptr(uspi, tind_bh, i);
                retry |= ufs_trunc_dindirect(inode, UFS_NDADDR + 
                        uspi->s_apb + ((i + 1) << uspi->s_2apbshift), tind);
                ubh_mark_buffer_dirty(tind_bh);
@@ -372,19 +372,21 @@ static int ufs_trunc_tindirect(struct inode *inode)
        }
        ubh_brelse (tind_bh);
        
-        UFSD("EXIT\n");
+        UFSD("EXIT: ino %lu\n", inode->i_ino);
        return retry;
 }
 static int ufs_alloc_lastblock(struct inode *inode)
 {
        int err = 0;
+        struct super_block *sb = inode->i_sb;
        struct address_space *mapping = inode->i_mapping;
-        struct ufs_sb_private_info *uspi = UFS_SB(inode->i_sb)->s_uspi;
+        struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
        unsigned i, end;
        sector_t lastfrag;
        struct page *lastpage;
        struct buffer_head *bh;
+        u64 phys64;
        lastfrag = (i_size_read(inode) + uspi->s_fsize - 1) >> uspi->s_fshift;
@@ -424,6 +426,20 @@ static int ufs_alloc_lastblock(struct inode *inode)
               set_page_dirty(lastpage);
       }
+       if (lastfrag >= UFS_IND_FRAGMENT) {
+               end = uspi->s_fpb - ufs_fragnum(lastfrag) - 1;
+               phys64 = bh->b_blocknr + 1;
+               for (i = 0; i < end; ++i) {
+                       bh = sb_getblk(sb, i + phys64);
+                       lock_buffer(bh);
+                       memset(bh->b_data, 0, sb->s_blocksize);
+                       set_buffer_uptodate(bh);
+                       mark_buffer_dirty(bh);
+                       unlock_buffer(bh);
+                       sync_dirty_buffer(bh);
+                       brelse(bh);
+               }
+       }
 out_unlock:
       ufs_put_locked_page(lastpage);
 out:
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index e2bea6a661f0..69e9e80735d2 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -1829,11 +1829,11 @@ xfs_buf_init(void)
        if (!xfs_buf_zone)
                goto out_free_trace_buf;
-        xfslogd_workqueue = create_freezeable_workqueue("xfslogd");
+        xfslogd_workqueue = create_workqueue("xfslogd");
        if (!xfslogd_workqueue)
                goto out_free_buf_zone;
-        xfsdatad_workqueue = create_freezeable_workqueue("xfsdatad");
+        xfsdatad_workqueue = create_workqueue("xfsdatad");
        if (!xfsdatad_workqueue)
                goto out_destroy_xfslogd_workqueue;
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 1a4103ca593c..2f2c40db562e 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -900,7 +900,7 @@ static struct quotactl_ops xfs_quotactl_operations = {
        .set_xquota             = xfs_fs_setxquota,
 };
-STATIC struct file_system_type xfs_fs_type = {
+static struct file_system_type xfs_fs_type = {
        .owner                  = THIS_MODULE,
        .name                   = "xfs",
        .get_sb                 = xfs_fs_get_sb,
author	Dmitry Torokhov <dtor@insightbb.com>	2007-05-01 00:24:54 -0400
committer	Dmitry Torokhov <dtor@insightbb.com>	2007-05-01 00:24:54 -0400
commit	bc95f3669f5e6f63cf0b84fe4922c3c6dd4aa775 (patch)
tree	427fcf2a7287c16d4b5aa6cbf494d59579a6a8b1 /fs
parent	3d29cdff999c37b3876082278a8134a0642a02cd (diff)
parent	dc87c3985e9b442c60994308a96f887579addc39 (diff)