Diffstat (limited to 'fs')
-rw-r--r--  fs/Kconfig | 3
-rw-r--r--  fs/afs/Makefile | 1
-rw-r--r--  fs/afs/afs.h | 8
-rw-r--r--  fs/afs/afs_fs.h | 3
-rw-r--r--  fs/afs/callback.c | 3
-rw-r--r--  fs/afs/dir.c | 1
-rw-r--r--  fs/afs/file.c | 2
-rw-r--r--  fs/afs/flock.c | 558
-rw-r--r--  fs/afs/fsclient.c | 155
-rw-r--r--  fs/afs/internal.h | 30
-rw-r--r--  fs/afs/main.c | 1
-rw-r--r--  fs/afs/misc.c | 1
-rw-r--r--  fs/afs/proc.c | 81
-rw-r--r--  fs/afs/super.c | 3
-rw-r--r--  fs/afs/vnode.c | 132
-rw-r--r--  fs/anon_inodes.c | 10
-rw-r--r--  fs/binfmt_elf.c | 109
-rw-r--r--  fs/block_dev.c | 63
-rw-r--r--  fs/buffer.c | 63
-rw-r--r--  fs/cifs/cifsfs.c | 1
-rw-r--r--  fs/cifs/connect.c | 1
-rw-r--r--  fs/cifs/export.c | 1
-rw-r--r--  fs/compat_ioctl.c | 4
-rw-r--r--  fs/configfs/configfs_internal.h | 7
-rw-r--r--  fs/configfs/dir.c | 289
-rw-r--r--  fs/configfs/file.c | 28
-rw-r--r--  fs/configfs/item.c | 29
-rw-r--r--  fs/dcache.c | 7
-rw-r--r--  fs/dlm/config.c | 20
-rw-r--r--  fs/dquot.c | 7
-rw-r--r--  fs/drop_caches.c | 2
-rw-r--r--  fs/ecryptfs/inode.c | 4
-rw-r--r--  fs/efs/namei.c | 32
-rw-r--r--  fs/efs/super.c | 2
-rw-r--r--  fs/exportfs/expfs.c | 439
-rw-r--r--  fs/ext2/file.c | 6
-rw-r--r--  fs/ext2/super.c | 21
-rw-r--r--  fs/ext3/inode.c | 2
-rw-r--r--  fs/ext3/namei.c | 10
-rw-r--r--  fs/ext3/super.c | 52
-rw-r--r--  fs/ext4/balloc.c | 2
-rw-r--r--  fs/ext4/namei.c | 10
-rw-r--r--  fs/ext4/super.c | 49
-rw-r--r--  fs/fat/dir.c | 31
-rw-r--r--  fs/fat/fatent.c | 7
-rw-r--r--  fs/fat/inode.c | 4
-rw-r--r--  fs/freevxfs/vxfs_dir.h | 2
-rw-r--r--  fs/gfs2/eaops.c | 1
-rw-r--r--  fs/gfs2/ops_export.c | 1
-rw-r--r--  fs/hfsplus/btree.c | 4
-rw-r--r--  fs/hfsplus/dir.c | 2
-rw-r--r--  fs/hfsplus/hfsplus_fs.h | 4
-rw-r--r--  fs/hfsplus/inode.c | 5
-rw-r--r--  fs/hfsplus/super.c | 4
-rw-r--r--  fs/hfsplus/unicode.c | 230
-rw-r--r--  fs/hugetlbfs/inode.c | 96
-rw-r--r--  fs/inode.c | 17
-rw-r--r--  fs/ioctl.c | 22
-rw-r--r--  fs/isofs/dir.c | 87
-rw-r--r--  fs/isofs/inode.c | 417
-rw-r--r--  fs/isofs/isofs.h | 1
-rw-r--r--  fs/isofs/joliet.c | 10
-rw-r--r--  fs/isofs/namei.c | 26
-rw-r--r--  fs/jbd/commit.c | 3
-rw-r--r--  fs/jbd/revoke.c | 5
-rw-r--r--  fs/jbd2/commit.c | 3
-rw-r--r--  fs/jbd2/revoke.c | 5
-rw-r--r--  fs/jffs2/background.c | 1
-rw-r--r--  fs/jfs/jfs_inode.h | 1
-rw-r--r--  fs/jfs/namei.c | 32
-rw-r--r--  fs/jfs/super.c | 2
-rw-r--r--  fs/lockd/svc.c | 29
-rw-r--r--  fs/mbcache.c | 9
-rw-r--r--  fs/namespace.c | 23
-rw-r--r--  fs/ncpfs/file.c | 2
-rw-r--r--  fs/nfs/callback.c | 2
-rw-r--r--  fs/nfs/client.c | 54
-rw-r--r--  fs/nfs/super.c | 10
-rw-r--r--  fs/nfsd/auth.c | 18
-rw-r--r--  fs/nfsd/export.c | 289
-rw-r--r--  fs/nfsd/lockd.c | 1
-rw-r--r--  fs/nfsd/nfs4acl.c | 12
-rw-r--r--  fs/nfsd/nfs4callback.c | 2
-rw-r--r--  fs/nfsd/nfs4idmap.c | 13
-rw-r--r--  fs/nfsd/nfs4proc.c | 35
-rw-r--r--  fs/nfsd/nfs4state.c | 46
-rw-r--r--  fs/nfsd/nfs4xdr.c | 101
-rw-r--r--  fs/nfsd/nfsctl.c | 3
-rw-r--r--  fs/nfsd/nfsfh.c | 51
-rw-r--r--  fs/nfsd/nfsproc.c | 3
-rw-r--r--  fs/nfsd/nfssvc.c | 12
-rw-r--r--  fs/nfsd/vfs.c | 110
-rw-r--r--  fs/nls/Makefile | 2
-rw-r--r--  fs/ntfs/namei.c | 1
-rw-r--r--  fs/ocfs2/alloc.c | 2676
-rw-r--r--  fs/ocfs2/alloc.h | 43
-rw-r--r--  fs/ocfs2/aops.c | 1015
-rw-r--r--  fs/ocfs2/aops.h | 61
-rw-r--r--  fs/ocfs2/cluster/heartbeat.c | 96
-rw-r--r--  fs/ocfs2/cluster/heartbeat.h | 6
-rw-r--r--  fs/ocfs2/cluster/nodemanager.c | 42
-rw-r--r--  fs/ocfs2/cluster/nodemanager.h | 5
-rw-r--r--  fs/ocfs2/cluster/tcp.c | 21
-rw-r--r--  fs/ocfs2/dir.c | 2
-rw-r--r--  fs/ocfs2/dlm/dlmdomain.c | 8
-rw-r--r--  fs/ocfs2/dlm/dlmmaster.c | 40
-rw-r--r--  fs/ocfs2/dlm/dlmrecovery.c | 79
-rw-r--r--  fs/ocfs2/dlmglue.c | 6
-rw-r--r--  fs/ocfs2/endian.h | 5
-rw-r--r--  fs/ocfs2/export.h | 2
-rw-r--r--  fs/ocfs2/extent_map.c | 41
-rw-r--r--  fs/ocfs2/file.c | 702
-rw-r--r--  fs/ocfs2/file.h | 10
-rw-r--r--  fs/ocfs2/heartbeat.c | 10
-rw-r--r--  fs/ocfs2/ioctl.c | 15
-rw-r--r--  fs/ocfs2/journal.c | 6
-rw-r--r--  fs/ocfs2/journal.h | 2
-rw-r--r--  fs/ocfs2/mmap.c | 167
-rw-r--r--  fs/ocfs2/namei.c | 2
-rw-r--r--  fs/ocfs2/ocfs2.h | 14
-rw-r--r--  fs/ocfs2/ocfs2_fs.h | 33
-rw-r--r--  fs/ocfs2/slot_map.c | 12
-rw-r--r--  fs/ocfs2/suballoc.c | 46
-rw-r--r--  fs/ocfs2/suballoc.h | 17
-rw-r--r--  fs/ocfs2/super.c | 27
-rw-r--r--  fs/ocfs2/super.h | 2
-rw-r--r--  fs/open.c | 14
-rw-r--r--  fs/partitions/acorn.c | 9
-rw-r--r--  fs/partitions/ldm.c | 137
-rw-r--r--  fs/partitions/ldm.h | 2
-rw-r--r--  fs/proc/array.c | 68
-rw-r--r--  fs/proc/base.c | 85
-rw-r--r--  fs/proc/generic.c | 52
-rw-r--r--  fs/proc/inode.c | 254
-rw-r--r--  fs/proc/proc_misc.c | 7
-rw-r--r--  fs/proc/proc_tty.c | 15
-rw-r--r--  fs/quota.c | 118
-rw-r--r--  fs/ramfs/inode.c | 1
-rw-r--r--  fs/reiserfs/file.c | 1
-rw-r--r--  fs/reiserfs/inode.c | 1
-rw-r--r--  fs/reiserfs/super.c | 1
-rw-r--r--  fs/seq_file.c | 18
-rw-r--r--  fs/splice.c | 4
-rw-r--r--  fs/super.c | 1
-rw-r--r--  fs/udf/crc.c | 4
-rw-r--r--  fs/udf/ialloc.c | 9
-rw-r--r--  fs/udf/inode.c | 51
-rw-r--r--  fs/ufs/super.c | 5
-rw-r--r--  fs/utimes.c | 2
-rw-r--r--  fs/xfs/linux-2.6/xfs_buf.c | 14
-rw-r--r--  fs/xfs/linux-2.6/xfs_super.c | 1
-rw-r--r--  fs/xfs/linux-2.6/xfs_super.h | 2
-rw-r--r--  fs/xfs/quota/xfs_qm.c | 10
153 files changed, 7910 insertions, 2292 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index 94b9d861bf9b..613df554728d 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -991,7 +991,7 @@ config TMPFS_POSIX_ACL
 
 config HUGETLBFS
 	bool "HugeTLB file system support"
-	depends on X86 || IA64 || PPC64 || SPARC64 || SUPERH || BROKEN
+	depends on X86 || IA64 || PPC64 || SPARC64 || (SUPERH && MMU) || BROKEN
 	help
 	  hugetlbfs is a filesystem backing for HugeTLB pages, based on
 	  ramfs. For architectures that support it, say Y here and read
@@ -1675,6 +1675,7 @@ config NFSD_V3_ACL
 config NFSD_V4
 	bool "Provide NFSv4 server support (EXPERIMENTAL)"
 	depends on NFSD_V3 && EXPERIMENTAL
+	select RPCSEC_GSS_KRB5
 	help
 	  If you would like to include the NFSv4 server as well as the NFSv2
 	  and NFSv3 servers, say Y here.  This feature is experimental, and
diff --git a/fs/afs/Makefile b/fs/afs/Makefile
index 73ce561f3ea0..a66671082cfb 100644
--- a/fs/afs/Makefile
+++ b/fs/afs/Makefile
@@ -8,6 +8,7 @@ kafs-objs := \
 	cmservice.o \
 	dir.o \
 	file.o \
+	flock.o \
 	fsclient.o \
 	inode.o \
 	main.o \
diff --git a/fs/afs/afs.h b/fs/afs/afs.h
index 245257948140..c548aa346f0d 100644
--- a/fs/afs/afs.h
+++ b/fs/afs/afs.h
@@ -37,6 +37,13 @@ typedef enum {
 	AFS_FTYPE_SYMLINK	= 3,
 } afs_file_type_t;
 
+typedef enum {
+	AFS_LOCK_READ		= 0,	/* read lock request */
+	AFS_LOCK_WRITE		= 1,	/* write lock request */
+} afs_lock_type_t;
+
+#define AFS_LOCKWAIT	(5 * 60) /* time until a lock times out (seconds) */
+
 /*
  * AFS file identifier
  */
@@ -120,6 +127,7 @@ struct afs_file_status {
 	struct afs_fid		parent;		/* parent dir ID for non-dirs only */
 	time_t			mtime_client;	/* last time client changed data */
 	time_t			mtime_server;	/* last time server changed data */
+	s32			lock_count;	/* file lock count (0=UNLK, -1=WRLCK, +ve=#RDLCK) */
 };
 
 /*
diff --git a/fs/afs/afs_fs.h b/fs/afs/afs_fs.h
index a18c374ebe08..eb647323d8f0 100644
--- a/fs/afs/afs_fs.h
+++ b/fs/afs/afs_fs.h
@@ -31,6 +31,9 @@ enum AFS_FS_Operations {
 	FSGETVOLUMEINFO		= 148,	/* AFS Get information about a volume */
 	FSGETVOLUMESTATUS	= 149,	/* AFS Get volume status information */
 	FSGETROOTVOLUME		= 151,	/* AFS Get root volume name */
+	FSSETLOCK		= 156,	/* AFS Request a file lock */
+	FSEXTENDLOCK		= 157,	/* AFS Extend a file lock */
+	FSRELEASELOCK		= 158,	/* AFS Release a file lock */
 	FSLOOKUP		= 161,	/* AFS lookup file in directory */
 	FSFETCHDATA64		= 65537, /* AFS Fetch file data */
 	FSSTOREDATA64		= 65538, /* AFS Store file data */
diff --git a/fs/afs/callback.c b/fs/afs/callback.c
index bacf518c6fa8..b8243945818d 100644
--- a/fs/afs/callback.c
+++ b/fs/afs/callback.c
@@ -125,6 +125,9 @@ static void afs_break_callback(struct afs_server *server,
 		spin_unlock(&server->cb_lock);
 
 		queue_work(afs_callback_update_worker, &vnode->cb_broken_work);
+		if (list_empty(&vnode->granted_locks) &&
+		    !list_empty(&vnode->pending_locks))
+			afs_lock_may_be_available(vnode);
 		spin_unlock(&vnode->lock);
 	}
 }
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 546c59522eb1..33fe39ad4e03 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -44,6 +44,7 @@ const struct file_operations afs_dir_file_operations = {
 	.open		= afs_dir_open,
 	.release	= afs_release,
 	.readdir	= afs_readdir,
+	.lock		= afs_lock,
 };
 
 const struct inode_operations afs_dir_inode_operations = {
diff --git a/fs/afs/file.c b/fs/afs/file.c
index aede7eb66dd4..525f7c56e068 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -34,6 +34,8 @@ const struct file_operations afs_file_operations = {
 	.mmap		= generic_file_readonly_mmap,
 	.splice_read	= generic_file_splice_read,
 	.fsync		= afs_fsync,
+	.lock		= afs_lock,
+	.flock		= afs_flock,
 };
 
 const struct inode_operations afs_file_inode_operations = {
diff --git a/fs/afs/flock.c b/fs/afs/flock.c
new file mode 100644
index 000000000000..8f07f8d1bfa9
--- /dev/null
+++ b/fs/afs/flock.c
@@ -0,0 +1,558 @@
+/* AFS file locking support
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/smp_lock.h>
+#include "internal.h"
+
+#define AFS_LOCK_GRANTED	0
+#define AFS_LOCK_PENDING	1
+
+static void afs_fl_copy_lock(struct file_lock *new, struct file_lock *fl);
+static void afs_fl_release_private(struct file_lock *fl);
+
+static struct workqueue_struct *afs_lock_manager;
+
+static struct file_lock_operations afs_lock_ops = {
+	.fl_copy_lock		= afs_fl_copy_lock,
+	.fl_release_private	= afs_fl_release_private,
+};
+
+/*
+ * initialise the lock manager thread if it isn't already running
+ */
+static int afs_init_lock_manager(void)
+{
+	if (!afs_lock_manager) {
+		afs_lock_manager = create_singlethread_workqueue("kafs_lockd");
+		if (!afs_lock_manager)
+			return -ENOMEM;
+	}
+	return 0;
+}
+
+/*
+ * destroy the lock manager thread if it's running
+ */
+void __exit afs_kill_lock_manager(void)
+{
+	if (afs_lock_manager)
+		destroy_workqueue(afs_lock_manager);
+}
+
+/*
+ * if the callback is broken on this vnode, then the lock may now be available
+ */
+void afs_lock_may_be_available(struct afs_vnode *vnode)
+{
+	_enter("{%x:%u}", vnode->fid.vid, vnode->fid.vnode);
+
+	queue_delayed_work(afs_lock_manager, &vnode->lock_work, 0);
+}
+
+/*
+ * the lock will time out in 5 minutes unless we extend it, so schedule
+ * extension in a bit less than that time
+ */
+static void afs_schedule_lock_extension(struct afs_vnode *vnode)
+{
+	queue_delayed_work(afs_lock_manager, &vnode->lock_work,
+			   AFS_LOCKWAIT * HZ / 2);
+}
+
+/*
+ * do work for a lock, including:
+ * - probing for a lock we're waiting on but didn't get immediately
+ * - extending a lock that's close to timing out
+ */
+void afs_lock_work(struct work_struct *work)
+{
+	struct afs_vnode *vnode =
+		container_of(work, struct afs_vnode, lock_work.work);
+	struct file_lock *fl;
+	afs_lock_type_t type;
+	struct key *key;
+	int ret;
+
+	_enter("{%x:%u}", vnode->fid.vid, vnode->fid.vnode);
+
+	spin_lock(&vnode->lock);
+
+	if (test_bit(AFS_VNODE_UNLOCKING, &vnode->flags)) {
+		_debug("unlock");
+		spin_unlock(&vnode->lock);
+
+		/* attempt to release the server lock; if it fails, we just
+		 * wait 5 minutes and it'll time out anyway */
+		ret = afs_vnode_release_lock(vnode, vnode->unlock_key);
+		if (ret < 0)
+			printk(KERN_WARNING "AFS:"
+			       " Failed to release lock on {%x:%x} error %d\n",
+			       vnode->fid.vid, vnode->fid.vnode, ret);
+
+		spin_lock(&vnode->lock);
+		key_put(vnode->unlock_key);
+		vnode->unlock_key = NULL;
+		clear_bit(AFS_VNODE_UNLOCKING, &vnode->flags);
+	}
+
+	/* if we've got a lock, then it must be time to extend that lock as AFS
+	 * locks time out after 5 minutes */
+	if (!list_empty(&vnode->granted_locks)) {
+		_debug("extend");
+
+		if (test_and_set_bit(AFS_VNODE_LOCKING, &vnode->flags))
+			BUG();
+		fl = list_entry(vnode->granted_locks.next,
+				struct file_lock, fl_u.afs.link);
+		key = key_get(fl->fl_file->private_data);
+		spin_unlock(&vnode->lock);
+
+		ret = afs_vnode_extend_lock(vnode, key);
+		clear_bit(AFS_VNODE_LOCKING, &vnode->flags);
+		key_put(key);
+		switch (ret) {
+		case 0:
+			afs_schedule_lock_extension(vnode);
+			break;
+		default:
+			/* ummm... we failed to extend the lock - retry
+			 * extension shortly */
+			printk(KERN_WARNING "AFS:"
+			       " Failed to extend lock on {%x:%x} error %d\n",
+			       vnode->fid.vid, vnode->fid.vnode, ret);
+			queue_delayed_work(afs_lock_manager, &vnode->lock_work,
+					   HZ * 10);
+			break;
+		}
+		_leave(" [extend]");
+		return;
+	}
+
+	/* if we don't have a granted lock, then we must've been called back by
+	 * the server, and so it might be possible to get a lock we're
+	 * currently waiting for */
+	if (!list_empty(&vnode->pending_locks)) {
+		_debug("get");
+
+		if (test_and_set_bit(AFS_VNODE_LOCKING, &vnode->flags))
+			BUG();
+		fl = list_entry(vnode->pending_locks.next,
+				struct file_lock, fl_u.afs.link);
+		key = key_get(fl->fl_file->private_data);
+		type = (fl->fl_type == F_RDLCK) ?
+			AFS_LOCK_READ : AFS_LOCK_WRITE;
+		spin_unlock(&vnode->lock);
+
+		ret = afs_vnode_set_lock(vnode, key, type);
+		clear_bit(AFS_VNODE_LOCKING, &vnode->flags);
+		switch (ret) {
+		case -EWOULDBLOCK:
+			_debug("blocked");
+			break;
+		case 0:
+			_debug("acquired");
+			if (type == AFS_LOCK_READ)
+				set_bit(AFS_VNODE_READLOCKED, &vnode->flags);
+			else
+				set_bit(AFS_VNODE_WRITELOCKED, &vnode->flags);
+			ret = AFS_LOCK_GRANTED;
+		default:
+			spin_lock(&vnode->lock);
+			/* the pending lock may have been withdrawn due to a
+			 * signal */
+			if (list_entry(vnode->pending_locks.next,
+				       struct file_lock, fl_u.afs.link) == fl) {
+				fl->fl_u.afs.state = ret;
+				if (ret == AFS_LOCK_GRANTED)
+					list_move_tail(&fl->fl_u.afs.link,
+						       &vnode->granted_locks);
+				else
+					list_del_init(&fl->fl_u.afs.link);
+				wake_up(&fl->fl_wait);
+				spin_unlock(&vnode->lock);
+			} else {
+				_debug("withdrawn");
+				clear_bit(AFS_VNODE_READLOCKED, &vnode->flags);
+				clear_bit(AFS_VNODE_WRITELOCKED, &vnode->flags);
+				spin_unlock(&vnode->lock);
+				afs_vnode_release_lock(vnode, key);
+				if (!list_empty(&vnode->pending_locks))
+					afs_lock_may_be_available(vnode);
+			}
+			break;
+		}
+		key_put(key);
+		_leave(" [pend]");
+		return;
+	}
+
+	/* looks like the lock request was withdrawn on a signal */
+	spin_unlock(&vnode->lock);
+	_leave(" [no locks]");
+}
+
+/*
+ * pass responsibility for the unlocking of a vnode on the server to the
+ * manager thread, lest a pending signal in the calling thread interrupt
+ * AF_RXRPC
+ * - the caller must hold the vnode lock
+ */
+static void afs_defer_unlock(struct afs_vnode *vnode, struct key *key)
+{
+	cancel_delayed_work(&vnode->lock_work);
+	if (!test_and_clear_bit(AFS_VNODE_READLOCKED, &vnode->flags) &&
+	    !test_and_clear_bit(AFS_VNODE_WRITELOCKED, &vnode->flags))
+		BUG();
+	if (test_and_set_bit(AFS_VNODE_UNLOCKING, &vnode->flags))
+		BUG();
+	vnode->unlock_key = key_get(key);
+	afs_lock_may_be_available(vnode);
+}
+
+/*
+ * request a lock on a file on the server
+ */
+static int afs_do_setlk(struct file *file, struct file_lock *fl)
+{
+	struct afs_vnode *vnode = AFS_FS_I(file->f_mapping->host);
+	afs_lock_type_t type;
+	struct key *key = file->private_data;
+	int ret;
+
+	_enter("{%x:%u},%u", vnode->fid.vid, vnode->fid.vnode, fl->fl_type);
+
+	/* only whole-file locks are supported */
+	if (fl->fl_start != 0 || fl->fl_end != OFFSET_MAX)
+		return -EINVAL;
+
+	ret = afs_init_lock_manager();
+	if (ret < 0)
+		return ret;
+
+	fl->fl_ops = &afs_lock_ops;
+	INIT_LIST_HEAD(&fl->fl_u.afs.link);
+	fl->fl_u.afs.state = AFS_LOCK_PENDING;
+
+	type = (fl->fl_type == F_RDLCK) ? AFS_LOCK_READ : AFS_LOCK_WRITE;
+
+	lock_kernel();
+
+	/* make sure we've got a callback on this file and that our view of the
+	 * data version is up to date */
+	ret = afs_vnode_fetch_status(vnode, NULL, key);
+	if (ret < 0)
+		goto error;
+
+	if (vnode->status.lock_count != 0 && !(fl->fl_flags & FL_SLEEP)) {
+		ret = -EAGAIN;
+		goto error;
+	}
+
+	spin_lock(&vnode->lock);
+
+	if (list_empty(&vnode->pending_locks)) {
+		/* if there's no-one else with a lock on this vnode, then we
+		 * need to ask the server for a lock */
+		if (list_empty(&vnode->granted_locks)) {
+			_debug("not locked");
+			ASSERTCMP(vnode->flags &
+				  ((1 << AFS_VNODE_LOCKING) |
+				   (1 << AFS_VNODE_READLOCKED) |
+				   (1 << AFS_VNODE_WRITELOCKED)), ==, 0);
+			list_add_tail(&fl->fl_u.afs.link, &vnode->pending_locks);
+			set_bit(AFS_VNODE_LOCKING, &vnode->flags);
+			spin_unlock(&vnode->lock);
+
+			ret = afs_vnode_set_lock(vnode, key, type);
+			clear_bit(AFS_VNODE_LOCKING, &vnode->flags);
+			switch (ret) {
+			case 0:
+				goto acquired_server_lock;
+			case -EWOULDBLOCK:
+				spin_lock(&vnode->lock);
+				ASSERT(list_empty(&vnode->granted_locks));
+				ASSERTCMP(vnode->pending_locks.next, ==,
+					  &fl->fl_u.afs.link);
+				goto wait;
+			default:
+				spin_lock(&vnode->lock);
+				list_del_init(&fl->fl_u.afs.link);
+				spin_unlock(&vnode->lock);
+				goto error;
+			}
+		}
+
+		/* if we've already got a readlock on the server and no waiting
+		 * writelocks, then we might be able to instantly grant another
+		 * readlock */
+		if (type == AFS_LOCK_READ &&
+		    vnode->flags & (1 << AFS_VNODE_READLOCKED)) {
+			_debug("instant readlock");
+			ASSERTCMP(vnode->flags &
+				  ((1 << AFS_VNODE_LOCKING) |
+				   (1 << AFS_VNODE_WRITELOCKED)), ==, 0);
+			ASSERT(!list_empty(&vnode->granted_locks));
+			goto sharing_existing_lock;
+		}
+	}
+
+	/* otherwise, we need to wait for a local lock to become available */
+	_debug("wait local");
+	list_add_tail(&fl->fl_u.afs.link, &vnode->pending_locks);
+wait:
+	if (!(fl->fl_flags & FL_SLEEP)) {
+		_debug("noblock");
+		ret = -EAGAIN;
+		goto abort_attempt;
+	}
+	spin_unlock(&vnode->lock);
+
+	/* now we need to sleep and wait for the lock manager thread to get the
+	 * lock from the server */
+	_debug("sleep");
+	ret = wait_event_interruptible(fl->fl_wait,
+				       fl->fl_u.afs.state <= AFS_LOCK_GRANTED);
+	if (fl->fl_u.afs.state <= AFS_LOCK_GRANTED) {
+		ret = fl->fl_u.afs.state;
+		if (ret < 0)
+			goto error;
+		spin_lock(&vnode->lock);
+		goto given_lock;
+	}
+
+	/* we were interrupted, but someone may still be in the throes of
+	 * giving us the lock */
+	_debug("intr");
+	ASSERTCMP(ret, ==, -ERESTARTSYS);
+
+	spin_lock(&vnode->lock);
+	if (fl->fl_u.afs.state <= AFS_LOCK_GRANTED) {
+		ret = fl->fl_u.afs.state;
+		if (ret < 0) {
+			spin_unlock(&vnode->lock);
+			goto error;
+		}
+		goto given_lock;
+	}
+
+abort_attempt:
+	/* we aren't going to get the lock, either because we're unwilling to
+	 * wait, or because some signal happened */
+	_debug("abort");
+	if (list_empty(&vnode->granted_locks) &&
+	    vnode->pending_locks.next == &fl->fl_u.afs.link) {
+		if (vnode->pending_locks.prev != &fl->fl_u.afs.link) {
+			/* kick the next pending lock into having a go */
+			list_del_init(&fl->fl_u.afs.link);
+			afs_lock_may_be_available(vnode);
+		}
+	} else {
+		list_del_init(&fl->fl_u.afs.link);
+	}
+	spin_unlock(&vnode->lock);
+	goto error;
+
+acquired_server_lock:
+	/* we've acquired a server lock, but it needs to be renewed after 5
+	 * mins */
+	spin_lock(&vnode->lock);
+	afs_schedule_lock_extension(vnode);
+	if (type == AFS_LOCK_READ)
+		set_bit(AFS_VNODE_READLOCKED, &vnode->flags);
+	else
+		set_bit(AFS_VNODE_WRITELOCKED, &vnode->flags);
+sharing_existing_lock:
+	/* the lock has been granted as far as we're concerned... */
+	fl->fl_u.afs.state = AFS_LOCK_GRANTED;
+	list_move_tail(&fl->fl_u.afs.link, &vnode->granted_locks);
+given_lock:
+	/* ... but we do still need to get the VFS's blessing */
+	ASSERT(!(vnode->flags & (1 << AFS_VNODE_LOCKING)));
+	ASSERT((vnode->flags & ((1 << AFS_VNODE_READLOCKED) |
+				(1 << AFS_VNODE_WRITELOCKED))) != 0);
+	ret = posix_lock_file(file, fl, NULL);
+	if (ret < 0)
+		goto vfs_rejected_lock;
+	spin_unlock(&vnode->lock);
+
+	/* again, make sure we've got a callback on this file and, again, make
+	 * sure that our view of the data version is up to date (we ignore
+	 * errors incurred here and deal with the consequences elsewhere) */
+	afs_vnode_fetch_status(vnode, NULL, key);
+
+error:
+	unlock_kernel();
+	_leave(" = %d", ret);
+	return ret;
+
+vfs_rejected_lock:
+	/* the VFS rejected the lock we just obtained, so we have to discard
+	 * what we just got */
+	_debug("vfs refused %d", ret);
+	list_del_init(&fl->fl_u.afs.link);
+	if (list_empty(&vnode->granted_locks))
+		afs_defer_unlock(vnode, key);
+	spin_unlock(&vnode->lock);
+	goto abort_attempt;
+}
+
+/*
+ * unlock on a file on the server
+ */
+static int afs_do_unlk(struct file *file, struct file_lock *fl)
+{
+	struct afs_vnode *vnode = AFS_FS_I(file->f_mapping->host);
+	struct key *key = file->private_data;
+	int ret;
+
+	_enter("{%x:%u},%u", vnode->fid.vid, vnode->fid.vnode, fl->fl_type);
+
+	/* only whole-file unlocks are supported */
+	if (fl->fl_start != 0 || fl->fl_end != OFFSET_MAX)
+		return -EINVAL;
+
+	fl->fl_ops = &afs_lock_ops;
+	INIT_LIST_HEAD(&fl->fl_u.afs.link);
+	fl->fl_u.afs.state = AFS_LOCK_PENDING;
+
+	spin_lock(&vnode->lock);
+	ret = posix_lock_file(file, fl, NULL);
+	if (ret < 0) {
+		spin_unlock(&vnode->lock);
+		_leave(" = %d [vfs]", ret);
+		return ret;
+	}
+
+	/* discard the server lock only if all granted locks are gone */
+	if (list_empty(&vnode->granted_locks))
+		afs_defer_unlock(vnode, key);
+	spin_unlock(&vnode->lock);
+	_leave(" = 0");
+	return 0;
+}
+
+/*
+ * return information about a lock we currently hold, if indeed we hold one
+ */
+static int afs_do_getlk(struct file *file, struct file_lock *fl)
+{
+	struct afs_vnode *vnode = AFS_FS_I(file->f_mapping->host);
+	struct key *key = file->private_data;
+	int ret, lock_count;
+
+	_enter("");
+
+	fl->fl_type = F_UNLCK;
+
+	mutex_lock(&vnode->vfs_inode.i_mutex);
+
+	/* check local lock records first */
+	ret = 0;
+	if (posix_test_lock(file, fl) == 0) {
+		/* no local locks; consult the server */
+		ret = afs_vnode_fetch_status(vnode, NULL, key);
+		if (ret < 0)
+			goto error;
+		lock_count = vnode->status.lock_count;
+		if (lock_count) {
+			if (lock_count > 0)
+				fl->fl_type = F_RDLCK;
+			else
+				fl->fl_type = F_WRLCK;
+			fl->fl_start = 0;
+			fl->fl_end = OFFSET_MAX;
+		}
+	}
+
+error:
+	mutex_unlock(&vnode->vfs_inode.i_mutex);
+	_leave(" = %d [%hd]", ret, fl->fl_type);
+	return ret;
+}
+
+/*
+ * manage POSIX locks on a file
+ */
+int afs_lock(struct file *file, int cmd, struct file_lock *fl)
+{
+	struct afs_vnode *vnode = AFS_FS_I(file->f_dentry->d_inode);
+
+	_enter("{%x:%u},%d,{t=%x,fl=%x,r=%Ld:%Ld}",
+	       vnode->fid.vid, vnode->fid.vnode, cmd,
+	       fl->fl_type, fl->fl_flags,
+	       (long long) fl->fl_start, (long long) fl->fl_end);
+
+	/* AFS doesn't support mandatory locks */
+	if ((vnode->vfs_inode.i_mode & (S_ISGID | S_IXGRP)) == S_ISGID &&
+	    fl->fl_type != F_UNLCK)
+		return -ENOLCK;
+
+	if (IS_GETLK(cmd))
+		return afs_do_getlk(file, fl);
+	if (fl->fl_type == F_UNLCK)
+		return afs_do_unlk(file, fl);
+	return afs_do_setlk(file, fl);
+}
+
+/*
+ * manage FLOCK locks on a file
+ */
+int afs_flock(struct file *file, int cmd, struct file_lock *fl)
+{
+	struct afs_vnode *vnode = AFS_FS_I(file->f_dentry->d_inode);
+
+	_enter("{%x:%u},%d,{t=%x,fl=%x}",
+	       vnode->fid.vid, vnode->fid.vnode, cmd,
+	       fl->fl_type, fl->fl_flags);
+
+	/*
+	 * No BSD flocks over NFS allowed.
+	 * Note: we could try to fake a POSIX lock request here by
+	 * using ((u32) filp | 0x80000000) or some such as the pid.
+	 * Not sure whether that would be unique, though, or whether
+	 * that would break in other places.
+	 */
+	if (!(fl->fl_flags & FL_FLOCK))
+		return -ENOLCK;
+
+	/* we're simulating flock() locks using posix locks on the server */
+	fl->fl_owner = (fl_owner_t) file;
+	fl->fl_start = 0;
+	fl->fl_end = OFFSET_MAX;
+
+	if (fl->fl_type == F_UNLCK)
+		return afs_do_unlk(file, fl);
+	return afs_do_setlk(file, fl);
+}
+
+/*
+ * the POSIX lock management core VFS code copies the lock record and adds the
+ * copy into its own list, so we need to add that copy to the vnode's lock
+ * queue in the same place as the original (which will be deleted shortly
+ * after)
+ */
+static void afs_fl_copy_lock(struct file_lock *new, struct file_lock *fl)
+{
+	_enter("");
+
+	list_add(&new->fl_u.afs.link, &fl->fl_u.afs.link);
+}
+
+/*
+ * need to remove this lock from the vnode queue when it's removed from the
+ * VFS's list
+ */
+static void afs_fl_release_private(struct file_lock *fl)
+{
+	_enter("");
+
+	list_del_init(&fl->fl_u.afs.link);
+}
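
[Note on usage: since afs_do_setlk() and afs_do_unlk() above return -EINVAL for anything other than the 0..OFFSET_MAX range, the only pattern this code serves from userspace is a whole-file fcntl(2) lock. A minimal, hypothetical userspace sketch follows; the /afs path is invented and the fcntl usage is plain POSIX, nothing AFS-specific is assumed.]

    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            /* hypothetical file on an AFS mount */
            int fd = open("/afs/example.com/data/shared.db", O_RDWR);
            struct flock fl = {
                    .l_type   = F_WRLCK,    /* maps to AFS_LOCK_WRITE */
                    .l_whence = SEEK_SET,
                    .l_start  = 0,          /* whole file: fl_start 0 ... */
                    .l_len    = 0,          /* ... fl_end OFFSET_MAX in the kernel */
            };

            if (fd < 0) {
                    perror("open");
                    return 1;
            }
            /* F_SETLKW sets FL_SLEEP, so the kernel may block on the lock manager */
            if (fcntl(fd, F_SETLKW, &fl) < 0)
                    perror("fcntl(F_SETLKW)");

            /* ... I/O under the lock; kafs renews the server lock every ~2.5 min ... */

            fl.l_type = F_UNLCK;
            fcntl(fd, F_SETLK, &fl);
            close(fd);
            return 0;
    }
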
diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index 5dff1308b6f0..023b95b0d9d7 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -67,7 +67,7 @@ static void xdr_decode_AFSFetchStatus(const __be32 **_bp,
 	EXTRACT(status->group);
 	bp++; /* sync counter */
 	data_version |= (u64) ntohl(*bp++) << 32;
-	bp++; /* lock count */
+	EXTRACT(status->lock_count);
 	size |= (u64) ntohl(*bp++) << 32;
 	bp++; /* spare 4 */
 	*_bp = bp;
@@ -1748,3 +1748,156 @@ int afs_fs_get_volume_status(struct afs_server *server,
 
 	return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
 }
+
+/*
+ * deliver reply data to an FS.SetLock, FS.ExtendLock or FS.ReleaseLock
+ */
+static int afs_deliver_fs_xxxx_lock(struct afs_call *call,
+				    struct sk_buff *skb, bool last)
+{
+	const __be32 *bp;
+
+	_enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
+
+	afs_transfer_reply(call, skb);
+	if (!last)
+		return 0;
+
+	if (call->reply_size != call->reply_max)
+		return -EBADMSG;
+
+	/* unmarshall the reply once we've received all of it */
+	bp = call->buffer;
+	/* xdr_decode_AFSVolSync(&bp, call->replyX); */
+
+	_leave(" = 0 [done]");
+	return 0;
+}
+
+/*
+ * FS.SetLock operation type
+ */
+static const struct afs_call_type afs_RXFSSetLock = {
+	.name		= "FS.SetLock",
+	.deliver	= afs_deliver_fs_xxxx_lock,
+	.abort_to_error	= afs_abort_to_error,
+	.destructor	= afs_flat_call_destructor,
+};
+
+/*
+ * FS.ExtendLock operation type
+ */
+static const struct afs_call_type afs_RXFSExtendLock = {
+	.name		= "FS.ExtendLock",
+	.deliver	= afs_deliver_fs_xxxx_lock,
+	.abort_to_error	= afs_abort_to_error,
+	.destructor	= afs_flat_call_destructor,
+};
+
+/*
+ * FS.ReleaseLock operation type
+ */
+static const struct afs_call_type afs_RXFSReleaseLock = {
+	.name		= "FS.ReleaseLock",
+	.deliver	= afs_deliver_fs_xxxx_lock,
+	.abort_to_error	= afs_abort_to_error,
+	.destructor	= afs_flat_call_destructor,
+};
+
+/*
+ * get a lock on a file
+ */
+int afs_fs_set_lock(struct afs_server *server,
+		    struct key *key,
+		    struct afs_vnode *vnode,
+		    afs_lock_type_t type,
+		    const struct afs_wait_mode *wait_mode)
+{
+	struct afs_call *call;
+	__be32 *bp;
+
+	_enter("");
+
+	call = afs_alloc_flat_call(&afs_RXFSSetLock, 5 * 4, 6 * 4);
+	if (!call)
+		return -ENOMEM;
+
+	call->key = key;
+	call->reply = vnode;
+	call->service_id = FS_SERVICE;
+	call->port = htons(AFS_FS_PORT);
+
+	/* marshall the parameters */
+	bp = call->request;
+	*bp++ = htonl(FSSETLOCK);
+	*bp++ = htonl(vnode->fid.vid);
+	*bp++ = htonl(vnode->fid.vnode);
+	*bp++ = htonl(vnode->fid.unique);
+	*bp++ = htonl(type);
+
+	return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
+}
+
+/*
+ * extend a lock on a file
+ */
+int afs_fs_extend_lock(struct afs_server *server,
+		       struct key *key,
+		       struct afs_vnode *vnode,
+		       const struct afs_wait_mode *wait_mode)
+{
+	struct afs_call *call;
+	__be32 *bp;
+
+	_enter("");
+
+	call = afs_alloc_flat_call(&afs_RXFSExtendLock, 4 * 4, 6 * 4);
+	if (!call)
+		return -ENOMEM;
+
+	call->key = key;
+	call->reply = vnode;
+	call->service_id = FS_SERVICE;
+	call->port = htons(AFS_FS_PORT);
+
+	/* marshall the parameters */
+	bp = call->request;
+	*bp++ = htonl(FSEXTENDLOCK);
+	*bp++ = htonl(vnode->fid.vid);
+	*bp++ = htonl(vnode->fid.vnode);
+	*bp++ = htonl(vnode->fid.unique);
+
+	return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
+}
+
+/*
+ * release a lock on a file
+ */
+int afs_fs_release_lock(struct afs_server *server,
+			struct key *key,
+			struct afs_vnode *vnode,
+			const struct afs_wait_mode *wait_mode)
+{
+	struct afs_call *call;
+	__be32 *bp;
+
+	_enter("");
+
+	call = afs_alloc_flat_call(&afs_RXFSReleaseLock, 4 * 4, 6 * 4);
+	if (!call)
+		return -ENOMEM;
+
+	call->key = key;
+	call->reply = vnode;
+	call->service_id = FS_SERVICE;
+	call->port = htons(AFS_FS_PORT);
+
+	/* marshall the parameters */
+	bp = call->request;
+	*bp++ = htonl(FSRELEASELOCK);
+	*bp++ = htonl(vnode->fid.vid);
+	*bp++ = htonl(vnode->fid.vnode);
+	*bp++ = htonl(vnode->fid.unique);
+
+	return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
+}
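
[Note on the wire format: the three RPCs marshalled above are each a short run of big-endian 32-bit words, with FS.SetLock additionally carrying the lock type. A standalone sketch of the same layout in plain C follows; it is illustrative only, the struct and function names are invented, and htonl() is the userspace one from <arpa/inet.h>.]

    #include <arpa/inet.h>
    #include <stdint.h>

    #define FSSETLOCK 156  /* operation code, as defined in afs_fs.h above */

    struct example_fid { uint32_t vid, vnode, unique; };

    /* mirrors the marshalling in afs_fs_set_lock(): 5 x 4 bytes */
    static void marshal_set_lock(uint32_t *bp, const struct example_fid *fid,
                                 uint32_t type /* 0 = read, 1 = write */)
    {
            *bp++ = htonl(FSSETLOCK);    /* opcode */
            *bp++ = htonl(fid->vid);     /* volume ID */
            *bp++ = htonl(fid->vnode);   /* vnode number */
            *bp++ = htonl(fid->unique);  /* uniquifier */
            *bp++ = htonl(type);         /* AFS_LOCK_READ / AFS_LOCK_WRITE */
    }
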
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 2c55dd94a1de..6306438f331f 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -351,10 +351,18 @@ struct afs_vnode {
 #define AFS_VNODE_ZAP_DATA	3	/* set if vnode's data should be invalidated */
 #define AFS_VNODE_DELETED	4	/* set if vnode deleted on server */
 #define AFS_VNODE_MOUNTPOINT	5	/* set if vnode is a mountpoint symlink */
+#define AFS_VNODE_LOCKING	6	/* set if waiting for lock on vnode */
+#define AFS_VNODE_READLOCKED	7	/* set if vnode is read-locked on the server */
+#define AFS_VNODE_WRITELOCKED	8	/* set if vnode is write-locked on the server */
+#define AFS_VNODE_UNLOCKING	9	/* set if vnode is being unlocked on the server */
 
 	long			acl_order;	/* ACL check count (callback break count) */
 
 	struct list_head	writebacks;	/* alterations in pagecache that need writing */
+	struct list_head	pending_locks;	/* locks waiting to be granted */
+	struct list_head	granted_locks;	/* locks granted on this file */
+	struct delayed_work	lock_work;	/* work to be done in locking */
+	struct key		*unlock_key;	/* key to be used in unlocking */
 
 	/* outstanding callback notification on this file */
 	struct rb_node		server_rb;	/* link in server->fs_vnodes */
@@ -474,6 +482,15 @@ extern int afs_open(struct inode *, struct file *);
 extern int afs_release(struct inode *, struct file *);
 
 /*
+ * flock.c
+ */
+extern void __exit afs_kill_lock_manager(void);
+extern void afs_lock_work(struct work_struct *);
+extern void afs_lock_may_be_available(struct afs_vnode *);
+extern int afs_lock(struct file *, int, struct file_lock *);
+extern int afs_flock(struct file *, int, struct file_lock *);
+
+/*
  * fsclient.c
  */
 extern int afs_fs_fetch_file_status(struct afs_server *, struct key *,
@@ -513,6 +530,15 @@ extern int afs_fs_get_volume_status(struct afs_server *, struct key *,
 				    struct afs_vnode *,
 				    struct afs_volume_status *,
 				    const struct afs_wait_mode *);
+extern int afs_fs_set_lock(struct afs_server *, struct key *,
+			   struct afs_vnode *, afs_lock_type_t,
+			   const struct afs_wait_mode *);
+extern int afs_fs_extend_lock(struct afs_server *, struct key *,
+			      struct afs_vnode *,
+			      const struct afs_wait_mode *);
+extern int afs_fs_release_lock(struct afs_server *, struct key *,
+			       struct afs_vnode *,
+			       const struct afs_wait_mode *);
 
 /*
  * inode.c
@@ -681,6 +707,10 @@ extern int afs_vnode_store_data(struct afs_writeback *, pgoff_t, pgoff_t,
 extern int afs_vnode_setattr(struct afs_vnode *, struct key *, struct iattr *);
 extern int afs_vnode_get_volume_status(struct afs_vnode *, struct key *,
 				       struct afs_volume_status *);
+extern int afs_vnode_set_lock(struct afs_vnode *, struct key *,
+			      afs_lock_type_t);
+extern int afs_vnode_extend_lock(struct afs_vnode *, struct key *);
+extern int afs_vnode_release_lock(struct afs_vnode *, struct key *);
 
 /*
  * volume.c
diff --git a/fs/afs/main.c b/fs/afs/main.c
index cd21195bbb24..0f60f6b35769 100644
--- a/fs/afs/main.c
+++ b/fs/afs/main.c
@@ -168,6 +168,7 @@ static void __exit afs_exit(void)
 	printk(KERN_INFO "kAFS: Red Hat AFS client v0.1 unregistering.\n");
 
 	afs_fs_exit();
+	afs_kill_lock_manager();
 	afs_close_socket();
 	afs_purge_servers();
 	afs_callback_update_kill();
diff --git a/fs/afs/misc.c b/fs/afs/misc.c
index d1a889c40742..2d33a5f7d218 100644
--- a/fs/afs/misc.c
+++ b/fs/afs/misc.c
@@ -35,6 +35,7 @@ int afs_abort_to_error(u32 abort_code)
 	case VOVERQUOTA:	return -EDQUOT;
 	case VBUSY:		return -EBUSY;
 	case VMOVED:		return -ENXIO;
+	case 0x2f6df0a:		return -EWOULDBLOCK;
 	case 0x2f6df0c:		return -EACCES;
 	case 0x2f6df0f:		return -EBUSY;
 	case 0x2f6df10:		return -EEXIST;
diff --git a/fs/afs/proc.c b/fs/afs/proc.c
index 13df512aea9e..6edb56683b9a 100644
--- a/fs/afs/proc.c
+++ b/fs/afs/proc.c
@@ -201,23 +201,9 @@ static int afs_proc_cells_open(struct inode *inode, struct file *file)
  */
 static void *afs_proc_cells_start(struct seq_file *m, loff_t *_pos)
 {
-	struct list_head *_p;
-	loff_t pos = *_pos;
-
 	/* lock the list against modification */
 	down_read(&afs_proc_cells_sem);
-
-	/* allow for the header line */
-	if (!pos)
-		return (void *) 1;
-	pos--;
-
-	/* find the n'th element in the list */
-	list_for_each(_p, &afs_proc_cells)
-		if (!pos--)
-			break;
-
-	return _p != &afs_proc_cells ? _p : NULL;
+	return seq_list_start_head(&afs_proc_cells, *_pos);
 }
 
 /*
@@ -225,14 +211,7 @@ static void *afs_proc_cells_start(struct seq_file *m, loff_t *_pos)
  */
 static void *afs_proc_cells_next(struct seq_file *p, void *v, loff_t *pos)
 {
-	struct list_head *_p;
-
-	(*pos)++;
-
-	_p = v;
-	_p = v == (void *) 1 ? afs_proc_cells.next : _p->next;
-
-	return _p != &afs_proc_cells ? _p : NULL;
+	return seq_list_next(v, &afs_proc_cells, pos);
 }
 
 /*
@@ -250,7 +229,7 @@ static int afs_proc_cells_show(struct seq_file *m, void *v)
 {
 	struct afs_cell *cell = list_entry(v, struct afs_cell, proc_link);
 
-	if (v == (void *) 1) {
+	if (v == &afs_proc_cells) {
 		/* display header on line 1 */
 		seq_puts(m, "USE NAME\n");
 		return 0;
@@ -503,26 +482,13 @@ static int afs_proc_cell_volumes_release(struct inode *inode, struct file *file)
  */
 static void *afs_proc_cell_volumes_start(struct seq_file *m, loff_t *_pos)
 {
-	struct list_head *_p;
 	struct afs_cell *cell = m->private;
-	loff_t pos = *_pos;
 
 	_enter("cell=%p pos=%Ld", cell, *_pos);
 
 	/* lock the list against modification */
 	down_read(&cell->vl_sem);
-
-	/* allow for the header line */
-	if (!pos)
-		return (void *) 1;
-	pos--;
-
-	/* find the n'th element in the list */
-	list_for_each(_p, &cell->vl_list)
-		if (!pos--)
-			break;
-
-	return _p != &cell->vl_list ? _p : NULL;
+	return seq_list_start_head(&cell->vl_list, *_pos);
 }
 
 /*
@@ -531,17 +497,10 @@ static void *afs_proc_cell_volumes_start(struct seq_file *m, loff_t *_pos)
 static void *afs_proc_cell_volumes_next(struct seq_file *p, void *v,
 					loff_t *_pos)
 {
-	struct list_head *_p;
 	struct afs_cell *cell = p->private;
 
 	_enter("cell=%p pos=%Ld", cell, *_pos);
-
-	(*_pos)++;
-
-	_p = v;
-	_p = (v == (void *) 1) ? cell->vl_list.next : _p->next;
-
-	return (_p != &cell->vl_list) ? _p : NULL;
+	return seq_list_next(v, &cell->vl_list, _pos);
 }
 
 /*
@@ -569,11 +528,12 @@ const char afs_vlocation_states[][4] = {
  */
 static int afs_proc_cell_volumes_show(struct seq_file *m, void *v)
 {
+	struct afs_cell *cell = m->private;
 	struct afs_vlocation *vlocation =
 		list_entry(v, struct afs_vlocation, link);
 
 	/* display header on line 1 */
-	if (v == (void *) 1) {
+	if (v == &cell->vl_list) {
 		seq_puts(m, "USE STT VLID[0] VLID[1] VLID[2] NAME\n");
 		return 0;
 	}
@@ -734,26 +694,13 @@ static int afs_proc_cell_servers_release(struct inode *inode,
 static void *afs_proc_cell_servers_start(struct seq_file *m, loff_t *_pos)
 	__acquires(m->private->servers_lock)
 {
-	struct list_head *_p;
 	struct afs_cell *cell = m->private;
-	loff_t pos = *_pos;
 
 	_enter("cell=%p pos=%Ld", cell, *_pos);
 
 	/* lock the list against modification */
 	read_lock(&cell->servers_lock);
-
-	/* allow for the header line */
-	if (!pos)
-		return (void *) 1;
-	pos--;
-
-	/* find the n'th element in the list */
-	list_for_each(_p, &cell->servers)
-		if (!pos--)
-			break;
-
-	return _p != &cell->servers ? _p : NULL;
+	return seq_list_start_head(&cell->servers, *_pos);
 }
 
 /*
@@ -762,17 +709,10 @@ static void *afs_proc_cell_servers_start(struct seq_file *m, loff_t *_pos)
 static void *afs_proc_cell_servers_next(struct seq_file *p, void *v,
 					loff_t *_pos)
 {
-	struct list_head *_p;
 	struct afs_cell *cell = p->private;
 
 	_enter("cell=%p pos=%Ld", cell, *_pos);
-
-	(*_pos)++;
-
-	_p = v;
-	_p = v == (void *) 1 ? cell->servers.next : _p->next;
-
-	return _p != &cell->servers ? _p : NULL;
+	return seq_list_next(v, &cell->servers, _pos);
 }
 
 /*
@@ -791,11 +731,12 @@ static void afs_proc_cell_servers_stop(struct seq_file *p, void *v)
  */
 static int afs_proc_cell_servers_show(struct seq_file *m, void *v)
 {
+	struct afs_cell *cell = m->private;
 	struct afs_server *server = list_entry(v, struct afs_server, link);
 	char ipaddr[20];
 
 	/* display header on line 1 */
-	if (v == (void *) 1) {
+	if (v == &cell->servers) {
 		seq_puts(m, "USE ADDR STATE\n");
 		return 0;
 	}
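
[Note on the pattern: every conversion in this file has the same shape - the hand-rolled list walk and the (void *) 1 header sentinel give way to seq_list_start_head()/seq_list_next(), which hand back the list head itself as the header sentinel. A compact sketch of the resulting pattern for a hypothetical list follows; my_list, my_lock and struct my_item are invented, while the seq_list_* helpers are the real <linux/seq_file.h> ones used above.]

    #include <linux/list.h>
    #include <linux/mutex.h>
    #include <linux/seq_file.h>

    static LIST_HEAD(my_list);
    static DEFINE_MUTEX(my_lock);

    struct my_item {
            struct list_head link;
            int value;
    };

    static void *my_seq_start(struct seq_file *m, loff_t *pos)
    {
            mutex_lock(&my_lock);
            /* at *pos == 0 this returns &my_list itself, the header sentinel */
            return seq_list_start_head(&my_list, *pos);
    }

    static void *my_seq_next(struct seq_file *m, void *v, loff_t *pos)
    {
            return seq_list_next(v, &my_list, pos);
    }

    static void my_seq_stop(struct seq_file *m, void *v)
    {
            mutex_unlock(&my_lock);
    }

    static int my_seq_show(struct seq_file *m, void *v)
    {
            if (v == &my_list) {            /* header line */
                    seq_puts(m, "VALUE\n");
                    return 0;
            }
            seq_printf(m, "%d\n", list_entry(v, struct my_item, link)->value);
            return 0;
    }
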
diff --git a/fs/afs/super.c b/fs/afs/super.c
index 2e8496ba1205..993cdf1cce3a 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -460,6 +460,9 @@ static void afs_i_init_once(void *_vnode, struct kmem_cache *cachep,
 	spin_lock_init(&vnode->writeback_lock);
 	spin_lock_init(&vnode->lock);
 	INIT_LIST_HEAD(&vnode->writebacks);
+	INIT_LIST_HEAD(&vnode->pending_locks);
+	INIT_LIST_HEAD(&vnode->granted_locks);
+	INIT_DELAYED_WORK(&vnode->lock_work, afs_lock_work);
 	INIT_WORK(&vnode->cb_broken_work, afs_broken_callback_work);
 }
465 468
diff --git a/fs/afs/vnode.c b/fs/afs/vnode.c
index 232c55dc245d..2f05c4fc2a70 100644
--- a/fs/afs/vnode.c
+++ b/fs/afs/vnode.c
@@ -561,7 +561,7 @@ no_server:
 /*
  * create a hard link
  */
-extern int afs_vnode_link(struct afs_vnode *dvnode, struct afs_vnode *vnode,
+int afs_vnode_link(struct afs_vnode *dvnode, struct afs_vnode *vnode,
 		   struct key *key, const char *name)
 {
 	struct afs_server *server;
@@ -887,11 +887,6 @@ int afs_vnode_get_volume_status(struct afs_vnode *vnode, struct key *key,
 	       vnode->fid.unique,
 	       key_serial(key));
 
-	/* this op will fetch the status */
-	spin_lock(&vnode->lock);
-	vnode->update_cnt++;
-	spin_unlock(&vnode->lock);
-
 	do {
 		/* pick a server to query */
 		server = afs_volume_pick_fileserver(vnode);
@@ -905,20 +900,127 @@ int afs_vnode_get_volume_status(struct afs_vnode *vnode, struct key *key,
 	} while (!afs_volume_release_fileserver(vnode, server, ret));
 
 	/* adjust the flags */
-	if (ret == 0) {
-		afs_vnode_finalise_status_update(vnode, server);
+	if (ret == 0)
+		afs_put_server(server);
+
+	_leave(" = %d", ret);
+	return ret;
+
+no_server:
+	return PTR_ERR(server);
+}
+
+/*
+ * get a lock on a file
+ */
+int afs_vnode_set_lock(struct afs_vnode *vnode, struct key *key,
+		       afs_lock_type_t type)
+{
+	struct afs_server *server;
+	int ret;
+
+	_enter("%s{%x:%u.%u},%x,%u",
+	       vnode->volume->vlocation->vldb.name,
+	       vnode->fid.vid,
+	       vnode->fid.vnode,
+	       vnode->fid.unique,
+	       key_serial(key), type);
+
+	do {
+		/* pick a server to query */
+		server = afs_volume_pick_fileserver(vnode);
+		if (IS_ERR(server))
+			goto no_server;
+
+		_debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr));
+
+		ret = afs_fs_set_lock(server, key, vnode, type, &afs_sync_call);
+
+	} while (!afs_volume_release_fileserver(vnode, server, ret));
+
+	/* adjust the flags */
+	if (ret == 0)
+		afs_put_server(server);
+
+	_leave(" = %d", ret);
+	return ret;
+
+no_server:
+	return PTR_ERR(server);
+}
+
+/*
+ * extend a lock on a file
+ */
+int afs_vnode_extend_lock(struct afs_vnode *vnode, struct key *key)
+{
+	struct afs_server *server;
+	int ret;
+
+	_enter("%s{%x:%u.%u},%x",
+	       vnode->volume->vlocation->vldb.name,
+	       vnode->fid.vid,
+	       vnode->fid.vnode,
+	       vnode->fid.unique,
+	       key_serial(key));
+
+	do {
+		/* pick a server to query */
+		server = afs_volume_pick_fileserver(vnode);
+		if (IS_ERR(server))
+			goto no_server;
+
+		_debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr));
+
+		ret = afs_fs_extend_lock(server, key, vnode, &afs_sync_call);
+
+	} while (!afs_volume_release_fileserver(vnode, server, ret));
+
+	/* adjust the flags */
+	if (ret == 0)
+		afs_put_server(server);
+
+	_leave(" = %d", ret);
+	return ret;
+
+no_server:
+	return PTR_ERR(server);
+}
+
+/*
+ * release a lock on a file
+ */
+int afs_vnode_release_lock(struct afs_vnode *vnode, struct key *key)
+{
+	struct afs_server *server;
+	int ret;
+
+	_enter("%s{%x:%u.%u},%x",
+	       vnode->volume->vlocation->vldb.name,
+	       vnode->fid.vid,
+	       vnode->fid.vnode,
+	       vnode->fid.unique,
+	       key_serial(key));
+
+	do {
+		/* pick a server to query */
+		server = afs_volume_pick_fileserver(vnode);
+		if (IS_ERR(server))
+			goto no_server;
+
+		_debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr));
+
+		ret = afs_fs_release_lock(server, key, vnode, &afs_sync_call);
+
+	} while (!afs_volume_release_fileserver(vnode, server, ret));
+
+	/* adjust the flags */
+	if (ret == 0)
 		afs_put_server(server);
-	} else {
-		afs_vnode_status_update_failed(vnode, ret);
-	}
 
 	_leave(" = %d", ret);
 	return ret;
 
 no_server:
-	spin_lock(&vnode->lock);
-	vnode->update_cnt--;
-	ASSERTCMP(vnode->update_cnt, >=, 0);
-	spin_unlock(&vnode->lock);
 	return PTR_ERR(server);
 }
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index edc67486238f..b4a75880f6fd 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -53,7 +53,7 @@ static struct dentry_operations anon_inodefs_dentry_operations = {
 };
 
 /**
- * anon_inode_getfd - creates a new file instance by hooking it up to and
+ * anon_inode_getfd - creates a new file instance by hooking it up to an
  *                    anonymous inode, and a dentry that describe the "class"
  *                    of the file
  *
@@ -66,7 +66,7 @@ static struct dentry_operations anon_inodefs_dentry_operations = {
  *
  * Creates a new file by hooking it on a single inode. This is useful for files
  * that do not need to have a full-fledged inode in order to operate correctly.
- * All the files created with anon_inode_getfd() will share a single inode, by
+ * All the files created with anon_inode_getfd() will share a single inode,
  * hence saving memory and avoiding code duplication for the file/inode/dentry
  * setup.
  */
@@ -142,9 +142,9 @@ err_put_filp:
 EXPORT_SYMBOL_GPL(anon_inode_getfd);
 
 /*
- * A single inode exist for all anon_inode files. Contrary to pipes,
- * anon_inode inodes has no per-instance data associated, so we can avoid
- * the allocation of multiple of them.
+ * A single inode exists for all anon_inode files. Contrary to pipes,
+ * anon_inode inodes have no associated per-instance data, so we need
+ * only allocate one of them.
  */
 static struct inode *anon_inode_mkinode(void)
 {
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 08e4414b8374..a27e42bf3400 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -45,7 +45,7 @@
 
 static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs);
 static int load_elf_library(struct file *);
-static unsigned long elf_map (struct file *, unsigned long, struct elf_phdr *, int, int);
+static unsigned long elf_map (struct file *, unsigned long, struct elf_phdr *, int, int, unsigned long);
 
 /*
  * If we don't support core dumping, then supply a NULL so we
@@ -80,7 +80,7 @@ static struct linux_binfmt elf_format = {
 	.hasvdso	= 1
 };
 
-#define BAD_ADDR(x) ((unsigned long)(x) >= TASK_SIZE)
+#define BAD_ADDR(x) IS_ERR_VALUE(x)
 
 static int set_brk(unsigned long start, unsigned long end)
 {
@@ -285,33 +285,70 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
 #ifndef elf_map
 
 static unsigned long elf_map(struct file *filep, unsigned long addr,
-		struct elf_phdr *eppnt, int prot, int type)
+		struct elf_phdr *eppnt, int prot, int type,
+		unsigned long total_size)
 {
 	unsigned long map_addr;
-	unsigned long pageoffset = ELF_PAGEOFFSET(eppnt->p_vaddr);
+	unsigned long size = eppnt->p_filesz + ELF_PAGEOFFSET(eppnt->p_vaddr);
+	unsigned long off = eppnt->p_offset - ELF_PAGEOFFSET(eppnt->p_vaddr);
+	addr = ELF_PAGESTART(addr);
+	size = ELF_PAGEALIGN(size);
 
-	down_write(&current->mm->mmap_sem);
 	/* mmap() will return -EINVAL if given a zero size, but a
 	 * segment with zero filesize is perfectly valid */
-	if (eppnt->p_filesz + pageoffset)
-		map_addr = do_mmap(filep, ELF_PAGESTART(addr),
-				   eppnt->p_filesz + pageoffset, prot, type,
-				   eppnt->p_offset - pageoffset);
-	else
-		map_addr = ELF_PAGESTART(addr);
+	if (!size)
+		return addr;
+
+	down_write(&current->mm->mmap_sem);
+	/*
+	 * total_size is the size of the ELF (interpreter) image.
+	 * The _first_ mmap needs to know the full size, otherwise
+	 * randomization might put this image into an overlapping
+	 * position with the ELF binary image. (since size < total_size)
+	 * So we first map the 'big' image - and unmap the remainder at
+	 * the end. (which unmap is needed for ELF images with holes.)
+	 */
+	if (total_size) {
+		total_size = ELF_PAGEALIGN(total_size);
+		map_addr = do_mmap(filep, addr, total_size, prot, type, off);
+		if (!BAD_ADDR(map_addr))
+			do_munmap(current->mm, map_addr+size, total_size-size);
+	} else
+		map_addr = do_mmap(filep, addr, size, prot, type, off);
+
 	up_write(&current->mm->mmap_sem);
 	return(map_addr);
 }
 
 #endif /* !elf_map */
 
+static unsigned long total_mapping_size(struct elf_phdr *cmds, int nr)
+{
+	int i, first_idx = -1, last_idx = -1;
+
+	for (i = 0; i < nr; i++) {
+		if (cmds[i].p_type == PT_LOAD) {
+			last_idx = i;
+			if (first_idx == -1)
+				first_idx = i;
+		}
+	}
+	if (first_idx == -1)
+		return 0;
+
+	return cmds[last_idx].p_vaddr + cmds[last_idx].p_memsz -
+				ELF_PAGESTART(cmds[first_idx].p_vaddr);
+}
+
+
 /* This is much more generalized than the library routine read function,
    so we keep this separate.  Technically the library read function
    is only provided so that we can read a.out libraries that have
    an ELF header */
 
 static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
-		struct file *interpreter, unsigned long *interp_load_addr)
+		struct file *interpreter, unsigned long *interp_map_addr,
+		unsigned long no_base)
 {
 	struct elf_phdr *elf_phdata;
 	struct elf_phdr *eppnt;
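
[Note: a quick worked check of total_mapping_size() introduced above - it returns the span from the first PT_LOAD's page-started base to the end of the last PT_LOAD, which is exactly what the single 'big' do_mmap() in elf_map() must reserve so that the later MAP_FIXED segment maps land inside it. A hypothetical userspace test of the same arithmetic follows; Elf64_Phdr comes from <elf.h> and the layout values are invented.]

    #include <assert.h>
    #include <elf.h>

    /* same arithmetic as total_mapping_size(); assume the first segment's
     * p_vaddr is page-aligned so ELF_PAGESTART() would be a no-op here */
    static unsigned long span(const Elf64_Phdr *p, int nr)
    {
            int i, first = -1, last = -1;

            for (i = 0; i < nr; i++)
                    if (p[i].p_type == PT_LOAD) {
                            last = i;
                            if (first == -1)
                                    first = i;
                    }
            return first == -1 ? 0 :
                    p[last].p_vaddr + p[last].p_memsz - p[first].p_vaddr;
    }

    int main(void)
    {
            /* invented PIE layout: text at 0x0, data at 0x2000 */
            Elf64_Phdr phdrs[2] = {
                    { .p_type = PT_LOAD, .p_vaddr = 0x0,    .p_memsz = 0x1000 },
                    { .p_type = PT_LOAD, .p_vaddr = 0x2000, .p_memsz = 0x500  },
            };
            assert(span(phdrs, 2) == 0x2500);   /* 0x2000 + 0x500 - 0x0 */
            return 0;
    }
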
@@ -319,6 +356,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
 	int load_addr_set = 0;
 	unsigned long last_bss = 0, elf_bss = 0;
 	unsigned long error = ~0UL;
+	unsigned long total_size;
 	int retval, i, size;
 
 	/* First of all, some simple consistency checks */
@@ -357,6 +395,12 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
 		goto out_close;
 	}
 
+	total_size = total_mapping_size(elf_phdata, interp_elf_ex->e_phnum);
+	if (!total_size) {
+		error = -EINVAL;
+		goto out_close;
+	}
+
 	eppnt = elf_phdata;
 	for (i = 0; i < interp_elf_ex->e_phnum; i++, eppnt++) {
 		if (eppnt->p_type == PT_LOAD) {
@@ -374,9 +418,14 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
 			vaddr = eppnt->p_vaddr;
 			if (interp_elf_ex->e_type == ET_EXEC || load_addr_set)
 				elf_type |= MAP_FIXED;
+			else if (no_base && interp_elf_ex->e_type == ET_DYN)
+				load_addr = -vaddr;
 
 			map_addr = elf_map(interpreter, load_addr + vaddr,
-					   eppnt, elf_prot, elf_type);
+					   eppnt, elf_prot, elf_type, total_size);
+			total_size = 0;
+			if (!*interp_map_addr)
+				*interp_map_addr = map_addr;
 			error = map_addr;
 			if (BAD_ADDR(map_addr))
 				goto out_close;
@@ -442,8 +491,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
442 goto out_close; 491 goto out_close;
443 } 492 }
444 493
445 *interp_load_addr = load_addr; 494 error = load_addr;
446 error = ((unsigned long)interp_elf_ex->e_entry) + load_addr;
447 495
448out_close: 496out_close:
449 kfree(elf_phdata); 497 kfree(elf_phdata);
@@ -540,7 +588,8 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
540 int elf_exec_fileno; 588 int elf_exec_fileno;
541 int retval, i; 589 int retval, i;
542 unsigned int size; 590 unsigned int size;
543 unsigned long elf_entry, interp_load_addr = 0; 591 unsigned long elf_entry;
592 unsigned long interp_load_addr = 0;
544 unsigned long start_code, end_code, start_data, end_data; 593 unsigned long start_code, end_code, start_data, end_data;
545 unsigned long reloc_func_desc = 0; 594 unsigned long reloc_func_desc = 0;
546 char passed_fileno[6]; 595 char passed_fileno[6];
@@ -808,9 +857,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
808 current->mm->start_stack = bprm->p; 857 current->mm->start_stack = bprm->p;
809 858
810 /* Now we do a little grungy work by mmaping the ELF image into 859 /* Now we do a little grungy work by mmaping the ELF image into
811 the correct location in memory. At this point, we assume that 860 the correct location in memory. */
812 the image should be loaded at fixed address, not at a variable
813 address. */
814 for(i = 0, elf_ppnt = elf_phdata; 861 for(i = 0, elf_ppnt = elf_phdata;
815 i < loc->elf_ex.e_phnum; i++, elf_ppnt++) { 862 i < loc->elf_ex.e_phnum; i++, elf_ppnt++) {
816 int elf_prot = 0, elf_flags; 863 int elf_prot = 0, elf_flags;
@@ -864,11 +911,15 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
864 * default mmap base, as well as whatever program they 911 * default mmap base, as well as whatever program they
865 * might try to exec. This is because the brk will 912 * might try to exec. This is because the brk will
866 * follow the loader, and is not movable. */ 913 * follow the loader, and is not movable. */
914#ifdef CONFIG_X86
915 load_bias = 0;
916#else
867 load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr); 917 load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr);
918#endif
868 } 919 }
869 920
870 error = elf_map(bprm->file, load_bias + vaddr, elf_ppnt, 921 error = elf_map(bprm->file, load_bias + vaddr, elf_ppnt,
871 elf_prot, elf_flags); 922 elf_prot, elf_flags, 0);
872 if (BAD_ADDR(error)) { 923 if (BAD_ADDR(error)) {
873 send_sig(SIGKILL, current, 0); 924 send_sig(SIGKILL, current, 0);
874 retval = IS_ERR((void *)error) ? 925 retval = IS_ERR((void *)error) ?
@@ -944,13 +995,25 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
944 } 995 }
945 996
946 if (elf_interpreter) { 997 if (elf_interpreter) {
947 if (interpreter_type == INTERPRETER_AOUT) 998 if (interpreter_type == INTERPRETER_AOUT) {
948 elf_entry = load_aout_interp(&loc->interp_ex, 999 elf_entry = load_aout_interp(&loc->interp_ex,
949 interpreter); 1000 interpreter);
950 else 1001 } else {
1002 unsigned long uninitialized_var(interp_map_addr);
1003
951 elf_entry = load_elf_interp(&loc->interp_elf_ex, 1004 elf_entry = load_elf_interp(&loc->interp_elf_ex,
952 interpreter, 1005 interpreter,
953 &interp_load_addr); 1006 &interp_map_addr,
1007 load_bias);
1008 if (!BAD_ADDR(elf_entry)) {
1009 /*
1010 * load_elf_interp() returns relocation
1011 * adjustment
1012 */
1013 interp_load_addr = elf_entry;
1014 elf_entry += loc->interp_elf_ex.e_entry;
1015 }
1016 }
954 if (BAD_ADDR(elf_entry)) { 1017 if (BAD_ADDR(elf_entry)) {
955 force_sig(SIGSEGV, current); 1018 force_sig(SIGSEGV, current);
956 retval = IS_ERR((void *)elf_entry) ? 1019 retval = IS_ERR((void *)elf_entry) ?
diff --git a/fs/block_dev.c b/fs/block_dev.c
index b3e9bfa748cf..3635315e3b99 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -588,12 +588,10 @@ EXPORT_SYMBOL(bdget);
588 588
589long nr_blockdev_pages(void) 589long nr_blockdev_pages(void)
590{ 590{
591 struct list_head *p; 591 struct block_device *bdev;
592 long ret = 0; 592 long ret = 0;
593 spin_lock(&bdev_lock); 593 spin_lock(&bdev_lock);
594 list_for_each(p, &all_bdevs) { 594 list_for_each_entry(bdev, &all_bdevs, bd_list) {
595 struct block_device *bdev;
596 bdev = list_entry(p, struct block_device, bd_list);
597 ret += bdev->bd_inode->i_mapping->nrpages; 595 ret += bdev->bd_inode->i_mapping->nrpages;
598 } 596 }
599 spin_unlock(&bdev_lock); 597 spin_unlock(&bdev_lock);
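The nr_blockdev_pages() change above is the standard list_for_each() to list_for_each_entry() conversion, which folds the explicit list_entry() call into the iterator. A stripped-down before/after sketch with a hypothetical node type:

#include <linux/list.h>

struct my_node {
        struct list_head list;
        long pages;
};

/* Before: open-coded iteration plus list_entry(). */
static long count_before(struct list_head *head)
{
        struct list_head *p;
        long ret = 0;

        list_for_each(p, head) {
                struct my_node *n = list_entry(p, struct my_node, list);
                ret += n->pages;
        }
        return ret;
}

/* After: list_for_each_entry() hides the container lookup. */
static long count_after(struct list_head *head)
{
        struct my_node *n;
        long ret = 0;

        list_for_each_entry(n, head, list)
                ret += n->pages;
        return ret;
}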
@@ -874,7 +872,7 @@ static struct bd_holder *find_bd_holder(struct block_device *bdev,
874 */ 872 */
875static int add_bd_holder(struct block_device *bdev, struct bd_holder *bo) 873static int add_bd_holder(struct block_device *bdev, struct bd_holder *bo)
876{ 874{
877 int ret; 875 int err;
878 876
879 if (!bo) 877 if (!bo)
880 return -EINVAL; 878 return -EINVAL;
@@ -882,15 +880,18 @@ static int add_bd_holder(struct block_device *bdev, struct bd_holder *bo)
882 if (!bd_holder_grab_dirs(bdev, bo)) 880 if (!bd_holder_grab_dirs(bdev, bo))
883 return -EBUSY; 881 return -EBUSY;
884 882
885 ret = add_symlink(bo->sdir, bo->sdev); 883 err = add_symlink(bo->sdir, bo->sdev);
886 if (ret == 0) { 884 if (err)
887 ret = add_symlink(bo->hdir, bo->hdev); 885 return err;
888 if (ret) 886
889 del_symlink(bo->sdir, bo->sdev); 887 err = add_symlink(bo->hdir, bo->hdev);
888 if (err) {
889 del_symlink(bo->sdir, bo->sdev);
890 return err;
890 } 891 }
891 if (ret == 0) 892
892 list_add_tail(&bo->list, &bdev->bd_holder_list); 893 list_add_tail(&bo->list, &bdev->bd_holder_list);
893 return ret; 894 return 0;
894} 895}
895 896
896/** 897/**
@@ -948,7 +949,7 @@ static struct bd_holder *del_bd_holder(struct block_device *bdev,
948static int bd_claim_by_kobject(struct block_device *bdev, void *holder, 949static int bd_claim_by_kobject(struct block_device *bdev, void *holder,
949 struct kobject *kobj) 950 struct kobject *kobj)
950{ 951{
951 int res; 952 int err;
952 struct bd_holder *bo, *found; 953 struct bd_holder *bo, *found;
953 954
954 if (!kobj) 955 if (!kobj)
@@ -959,21 +960,24 @@ static int bd_claim_by_kobject(struct block_device *bdev, void *holder,
959 return -ENOMEM; 960 return -ENOMEM;
960 961
961 mutex_lock(&bdev->bd_mutex); 962 mutex_lock(&bdev->bd_mutex);
962 res = bd_claim(bdev, holder);
963 if (res == 0) {
964 found = find_bd_holder(bdev, bo);
965 if (found == NULL) {
966 res = add_bd_holder(bdev, bo);
967 if (res)
968 bd_release(bdev);
969 }
970 }
971 963
972 if (res || found) 964 err = bd_claim(bdev, holder);
973 free_bd_holder(bo); 965 if (err)
974 mutex_unlock(&bdev->bd_mutex); 966 goto fail;
975 967
976 return res; 968 found = find_bd_holder(bdev, bo);
969 if (found)
970 goto fail;
971
972 err = add_bd_holder(bdev, bo);
973 if (err)
974 bd_release(bdev);
975 else
976 bo = NULL;
977fail:
978 mutex_unlock(&bdev->bd_mutex);
979 free_bd_holder(bo);
980 return err;
977} 981}
978 982
979/** 983/**
@@ -987,15 +991,12 @@ static int bd_claim_by_kobject(struct block_device *bdev, void *holder,
987static void bd_release_from_kobject(struct block_device *bdev, 991static void bd_release_from_kobject(struct block_device *bdev,
988 struct kobject *kobj) 992 struct kobject *kobj)
989{ 993{
990 struct bd_holder *bo;
991
992 if (!kobj) 994 if (!kobj)
993 return; 995 return;
994 996
995 mutex_lock(&bdev->bd_mutex); 997 mutex_lock(&bdev->bd_mutex);
996 bd_release(bdev); 998 bd_release(bdev);
997 if ((bo = del_bd_holder(bdev, kobj))) 999 free_bd_holder(del_bd_holder(bdev, kobj));
998 free_bd_holder(bo);
999 mutex_unlock(&bdev->bd_mutex); 1000 mutex_unlock(&bdev->bd_mutex);
1000} 1001}
1001 1002
diff --git a/fs/buffer.c b/fs/buffer.c
index aa68206bd517..0f9006714230 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -356,7 +356,7 @@ static void free_more_memory(void)
356 for_each_online_pgdat(pgdat) { 356 for_each_online_pgdat(pgdat) {
357 zones = pgdat->node_zonelists[gfp_zone(GFP_NOFS)].zones; 357 zones = pgdat->node_zonelists[gfp_zone(GFP_NOFS)].zones;
358 if (*zones) 358 if (*zones)
359 try_to_free_pages(zones, GFP_NOFS); 359 try_to_free_pages(zones, 0, GFP_NOFS);
360 } 360 }
361} 361}
362 362
@@ -676,6 +676,39 @@ void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
676EXPORT_SYMBOL(mark_buffer_dirty_inode); 676EXPORT_SYMBOL(mark_buffer_dirty_inode);
677 677
678/* 678/*
679 * Mark the page dirty, tag it dirty in the radix tree, and mark the inode
680 * dirty.
681 *
682 * If warn is true, then emit a warning if the page is not uptodate and has
683 * not been truncated.
684 */
685static int __set_page_dirty(struct page *page,
686 struct address_space *mapping, int warn)
687{
688 if (unlikely(!mapping))
689 return !TestSetPageDirty(page);
690
691 if (TestSetPageDirty(page))
692 return 0;
693
694 write_lock_irq(&mapping->tree_lock);
695 if (page->mapping) { /* Race with truncate? */
696 WARN_ON_ONCE(warn && !PageUptodate(page));
697
698 if (mapping_cap_account_dirty(mapping)) {
699 __inc_zone_page_state(page, NR_FILE_DIRTY);
700 task_io_account_write(PAGE_CACHE_SIZE);
701 }
702 radix_tree_tag_set(&mapping->page_tree,
703 page_index(page), PAGECACHE_TAG_DIRTY);
704 }
705 write_unlock_irq(&mapping->tree_lock);
706 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
707
708 return 1;
709}
710
711/*
679 * Add a page to the dirty page list. 712 * Add a page to the dirty page list.
680 * 713 *
681 * It is a sad fact of life that this function is called from several places 714 * It is a sad fact of life that this function is called from several places
@@ -702,7 +735,7 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode);
702 */ 735 */
703int __set_page_dirty_buffers(struct page *page) 736int __set_page_dirty_buffers(struct page *page)
704{ 737{
705 struct address_space * const mapping = page_mapping(page); 738 struct address_space *mapping = page_mapping(page);
706 739
707 if (unlikely(!mapping)) 740 if (unlikely(!mapping))
708 return !TestSetPageDirty(page); 741 return !TestSetPageDirty(page);
@@ -719,21 +752,7 @@ int __set_page_dirty_buffers(struct page *page)
719 } 752 }
720 spin_unlock(&mapping->private_lock); 753 spin_unlock(&mapping->private_lock);
721 754
722 if (TestSetPageDirty(page)) 755 return __set_page_dirty(page, mapping, 1);
723 return 0;
724
725 write_lock_irq(&mapping->tree_lock);
726 if (page->mapping) { /* Race with truncate? */
727 if (mapping_cap_account_dirty(mapping)) {
728 __inc_zone_page_state(page, NR_FILE_DIRTY);
729 task_io_account_write(PAGE_CACHE_SIZE);
730 }
731 radix_tree_tag_set(&mapping->page_tree,
732 page_index(page), PAGECACHE_TAG_DIRTY);
733 }
734 write_unlock_irq(&mapping->tree_lock);
735 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
736 return 1;
737} 756}
738EXPORT_SYMBOL(__set_page_dirty_buffers); 757EXPORT_SYMBOL(__set_page_dirty_buffers);
739 758
@@ -982,7 +1001,7 @@ grow_dev_page(struct block_device *bdev, sector_t block,
982 struct buffer_head *bh; 1001 struct buffer_head *bh;
983 1002
984 page = find_or_create_page(inode->i_mapping, index, 1003 page = find_or_create_page(inode->i_mapping, index,
985 mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS); 1004 (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS)|__GFP_MOVABLE);
986 if (!page) 1005 if (!page)
987 return NULL; 1006 return NULL;
988 1007
@@ -1026,11 +1045,6 @@ failed:
1026/* 1045/*
1027 * Create buffers for the specified block device block's page. If 1046 * Create buffers for the specified block device block's page. If
1028 * that page was dirty, the buffers are set dirty also. 1047 * that page was dirty, the buffers are set dirty also.
1029 *
1030 * Except that's a bug. Attaching dirty buffers to a dirty
1031 * blockdev's page can result in filesystem corruption, because
1032 * some of those buffers may be aliases of filesystem data.
1033 * grow_dev_page() will go BUG() if this happens.
1034 */ 1048 */
1035static int 1049static int
1036grow_buffers(struct block_device *bdev, sector_t block, int size) 1050grow_buffers(struct block_device *bdev, sector_t block, int size)
@@ -1137,8 +1151,9 @@ __getblk_slow(struct block_device *bdev, sector_t block, int size)
1137 */ 1151 */
1138void fastcall mark_buffer_dirty(struct buffer_head *bh) 1152void fastcall mark_buffer_dirty(struct buffer_head *bh)
1139{ 1153{
1154 WARN_ON_ONCE(!buffer_uptodate(bh));
1140 if (!buffer_dirty(bh) && !test_set_buffer_dirty(bh)) 1155 if (!buffer_dirty(bh) && !test_set_buffer_dirty(bh))
1141 __set_page_dirty_nobuffers(bh->b_page); 1156 __set_page_dirty(bh->b_page, page_mapping(bh->b_page), 0);
1142} 1157}
1143 1158
1144/* 1159/*
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 8b0cbf4a4ad0..bd0f2f2353ce 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -849,6 +849,7 @@ static int cifs_oplock_thread(void * dummyarg)
849 __u16 netfid; 849 __u16 netfid;
850 int rc; 850 int rc;
851 851
852 set_freezable();
852 do { 853 do {
853 if (try_to_freeze()) 854 if (try_to_freeze())
854 continue; 855 continue;
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index f4e92661b223..0a1b8bd1dfcb 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -363,6 +363,7 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server)
363 GFP_KERNEL); 363 GFP_KERNEL);
364 } 364 }
365 365
366 set_freezable();
366 while (!kthread_should_stop()) { 367 while (!kthread_should_stop()) {
367 if (try_to_freeze()) 368 if (try_to_freeze())
368 continue; 369 continue;
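Both cifs kernel threads gain the same two lines. Kernel threads are created non-freezable by default, so without set_freezable() the try_to_freeze() call in the loop never parks the thread across suspend. A minimal sketch of the idiom (hypothetical thread body, not cifs code):

#include <linux/freezer.h>
#include <linux/kthread.h>

static int my_service_thread(void *unused)
{
        set_freezable();        /* kthreads are non-freezable by default */
        while (!kthread_should_stop()) {
                if (try_to_freeze())
                        continue;       /* we were frozen; re-check state */
                /* ... service one unit of work, then sleep ... */
        }
        return 0;
}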
diff --git a/fs/cifs/export.c b/fs/cifs/export.c
index 1d716392c3aa..96df1d51fdc3 100644
--- a/fs/cifs/export.c
+++ b/fs/cifs/export.c
@@ -29,6 +29,7 @@
29 */ 29 */
30 30
31#include <linux/fs.h> 31#include <linux/fs.h>
32#include <linux/exportfs.h>
32 33
33#ifdef CONFIG_CIFS_EXPERIMENTAL 34#ifdef CONFIG_CIFS_EXPERIMENTAL
34 35
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 6b44cdc96fac..e440a7b95d02 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -63,6 +63,7 @@
63#include <linux/wireless.h> 63#include <linux/wireless.h>
64#include <linux/atalk.h> 64#include <linux/atalk.h>
65#include <linux/blktrace_api.h> 65#include <linux/blktrace_api.h>
66#include <linux/loop.h>
66 67
67#include <net/bluetooth/bluetooth.h> 68#include <net/bluetooth/bluetooth.h>
68#include <net/bluetooth/hci.h> 69#include <net/bluetooth/hci.h>
@@ -3489,6 +3490,9 @@ HANDLE_IOCTL(LPSETTIMEOUT, lp_timeout_trans)
3489 3490
3490IGNORE_IOCTL(VFAT_IOCTL_READDIR_BOTH32) 3491IGNORE_IOCTL(VFAT_IOCTL_READDIR_BOTH32)
3491IGNORE_IOCTL(VFAT_IOCTL_READDIR_SHORT32) 3492IGNORE_IOCTL(VFAT_IOCTL_READDIR_SHORT32)
3493
3494/* loop */
3495IGNORE_IOCTL(LOOP_CLR_FD)
3492}; 3496};
3493 3497
3494#define IOCTL_HASHSIZE 256 3498#define IOCTL_HASHSIZE 256
diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h
index 7b48c034b312..3b0185fdf9a4 100644
--- a/fs/configfs/configfs_internal.h
+++ b/fs/configfs/configfs_internal.h
@@ -29,10 +29,11 @@
29 29
30struct configfs_dirent { 30struct configfs_dirent {
31 atomic_t s_count; 31 atomic_t s_count;
32 int s_dependent_count;
32 struct list_head s_sibling; 33 struct list_head s_sibling;
33 struct list_head s_children; 34 struct list_head s_children;
34 struct list_head s_links; 35 struct list_head s_links;
35 void * s_element; 36 void * s_element;
36 int s_type; 37 int s_type;
37 umode_t s_mode; 38 umode_t s_mode;
38 struct dentry * s_dentry; 39 struct dentry * s_dentry;
@@ -41,8 +42,8 @@ struct configfs_dirent {
41 42
42#define CONFIGFS_ROOT 0x0001 43#define CONFIGFS_ROOT 0x0001
43#define CONFIGFS_DIR 0x0002 44#define CONFIGFS_DIR 0x0002
44#define CONFIGFS_ITEM_ATTR 0x0004 45#define CONFIGFS_ITEM_ATTR 0x0004
45#define CONFIGFS_ITEM_LINK 0x0020 46#define CONFIGFS_ITEM_LINK 0x0020
46#define CONFIGFS_USET_DIR 0x0040 47#define CONFIGFS_USET_DIR 0x0040
47#define CONFIGFS_USET_DEFAULT 0x0080 48#define CONFIGFS_USET_DEFAULT 0x0080
48#define CONFIGFS_USET_DROPPING 0x0100 49#define CONFIGFS_USET_DROPPING 0x0100
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 5e6e37e58f36..2f436d4f1d6d 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -355,6 +355,10 @@ static int configfs_detach_prep(struct dentry *dentry)
355 /* Mark that we've taken i_mutex */ 355 /* Mark that we've taken i_mutex */
356 sd->s_type |= CONFIGFS_USET_DROPPING; 356 sd->s_type |= CONFIGFS_USET_DROPPING;
357 357
358 /*
359 * Yup, recursive. If there's a problem, blame
360 * deep nesting of default_groups
361 */
358 ret = configfs_detach_prep(sd->s_dentry); 362 ret = configfs_detach_prep(sd->s_dentry);
359 if (!ret) 363 if (!ret)
360 continue; 364 continue;
@@ -562,7 +566,7 @@ static int populate_groups(struct config_group *group)
562 566
563/* 567/*
564 * All of link_obj/unlink_obj/link_group/unlink_group require that 568 * All of link_obj/unlink_obj/link_group/unlink_group require that
565 * subsys->su_sem is held. 569 * subsys->su_mutex is held.
566 */ 570 */
567 571
568static void unlink_obj(struct config_item *item) 572static void unlink_obj(struct config_item *item)
@@ -714,6 +718,28 @@ static void configfs_detach_group(struct config_item *item)
714} 718}
715 719
716/* 720/*
721 * After the item has been detached from the filesystem view, we are
722 * ready to tear it out of the hierarchy. Notify the client before
723 * we do that so they can perform any cleanup that requires
724 * navigating the hierarchy. A client does not need to provide this
725 * callback. The subsystem mutex MUST be held by the caller, and
726 * references must be valid for both items. It also assumes the
727 * caller has validated ci_type.
728 */
729static void client_disconnect_notify(struct config_item *parent_item,
730 struct config_item *item)
731{
732 struct config_item_type *type;
733
734 type = parent_item->ci_type;
735 BUG_ON(!type);
736
737 if (type->ct_group_ops && type->ct_group_ops->disconnect_notify)
738 type->ct_group_ops->disconnect_notify(to_config_group(parent_item),
739 item);
740}
741
742/*
717 * Drop the initial reference from make_item()/make_group() 743 * Drop the initial reference from make_item()/make_group()
718 * This function assumes that reference is held on item 744 * This function assumes that reference is held on item
719 * and that item holds a valid reference to the parent. Also, it 745 * and that item holds a valid reference to the parent. Also, it
@@ -733,11 +759,244 @@ static void client_drop_item(struct config_item *parent_item,
733 */ 759 */
734 if (type->ct_group_ops && type->ct_group_ops->drop_item) 760 if (type->ct_group_ops && type->ct_group_ops->drop_item)
735 type->ct_group_ops->drop_item(to_config_group(parent_item), 761 type->ct_group_ops->drop_item(to_config_group(parent_item),
736 item); 762 item);
737 else 763 else
738 config_item_put(item); 764 config_item_put(item);
739} 765}
740 766
767#ifdef DEBUG
768static void configfs_dump_one(struct configfs_dirent *sd, int level)
769{
770 printk(KERN_INFO "%*s\"%s\":\n", level, " ", configfs_get_name(sd));
771
772#define type_print(_type) if (sd->s_type & _type) printk(KERN_INFO "%*s %s\n", level, " ", #_type);
773 type_print(CONFIGFS_ROOT);
774 type_print(CONFIGFS_DIR);
775 type_print(CONFIGFS_ITEM_ATTR);
776 type_print(CONFIGFS_ITEM_LINK);
777 type_print(CONFIGFS_USET_DIR);
778 type_print(CONFIGFS_USET_DEFAULT);
779 type_print(CONFIGFS_USET_DROPPING);
780#undef type_print
781}
782
783static int configfs_dump(struct configfs_dirent *sd, int level)
784{
785 struct configfs_dirent *child_sd;
786 int ret = 0;
787
788 configfs_dump_one(sd, level);
789
790 if (!(sd->s_type & (CONFIGFS_DIR|CONFIGFS_ROOT)))
791 return 0;
792
793 list_for_each_entry(child_sd, &sd->s_children, s_sibling) {
794 ret = configfs_dump(child_sd, level + 2);
795 if (ret)
796 break;
797 }
798
799 return ret;
800}
801#endif
802
803
804/*
805 * configfs_depend_item() and configfs_undepend_item()
806 *
807 * WARNING: Do not call these from a configfs callback!
808 *
809 * This describes these functions and their helpers.
810 *
811 * Allow another kernel system to depend on a config_item. If this
812 * happens, the item cannot go away until the dependent can live without
813 * it. The idea is to give client modules as simple an interface as
814 * possible. When a system asks them to depend on an item, they just
815 * call configfs_depend_item(). If the item is live and the client
816 * driver is in good shape, we'll happily do the work for them.
817 *
818 * Why is the locking complex? Because configfs uses the VFS to handle
819 * all locking, but this function is called outside the normal
820 * VFS->configfs path. So it must take VFS locks to prevent the
821 * VFS->configfs stuff (configfs_mkdir(), configfs_rmdir(), etc). This is
822 * why you can't call these functions underneath configfs callbacks.
823 *
824 * Note, btw, that this can be called at *any* time, even when a configfs
825 * subsystem isn't registered, or when configfs is loading or unloading.
826 * Just like configfs_register_subsystem(). So we take the same
827 * precautions. We pin the filesystem. We lock each i_mutex _in_order_
828 * on our way down the tree. If we can find the target item in the
829 * configfs tree, it must be part of the subsystem tree as well, so we
830 * do not need the subsystem mutex. Holding the i_mutex chain locks
831 * out mkdir() and rmdir(), who might be racing us.
832 */
833
834/*
835 * configfs_depend_prep()
836 *
837 * Only subdirectories count here. Files (CONFIGFS_NOT_PINNED) are
838 * attributes. This is similar to, but not the same as, configfs_detach_prep().
839 * Note that configfs_detach_prep() expects the parent to be locked when it
840 * is called, but we lock the parent *inside* configfs_depend_prep(). We
841 * do that so we can unlock it if we find nothing.
842 *
843 * Here we do a depth-first search of the dentry hierarchy looking for
844 * our object. We take i_mutex on each step of the way down. IT IS
845 * ESSENTIAL THAT i_mutex LOCKING IS ORDERED. If we come back up a branch,
846 * we'll drop the i_mutex.
847 *
848 * If the target is not found, -ENOENT is bubbled up and we have released
849 * all locks. If the target was found, the locks will be cleared by
850 * configfs_depend_rollback().
851 *
852 * This adds a requirement that all config_items be unique!
853 *
854 * This is recursive because the locking traversal is tricky. There isn't
855 * much on the stack, though, so folks that need this function - be careful
856 * about your stack! Patches will be accepted to make it iterative.
857 */
858static int configfs_depend_prep(struct dentry *origin,
859 struct config_item *target)
860{
861 struct configfs_dirent *child_sd, *sd = origin->d_fsdata;
862 int ret = 0;
863
864 BUG_ON(!origin || !sd);
865
866 /* Lock this guy on the way down */
867 mutex_lock(&sd->s_dentry->d_inode->i_mutex);
868 if (sd->s_element == target) /* Boo-yah */
869 goto out;
870
871 list_for_each_entry(child_sd, &sd->s_children, s_sibling) {
872 if (child_sd->s_type & CONFIGFS_DIR) {
873 ret = configfs_depend_prep(child_sd->s_dentry,
874 target);
875 if (!ret)
876 goto out; /* Child path boo-yah */
877 }
878 }
879
880 /* We looped all our children and didn't find target */
881 mutex_unlock(&sd->s_dentry->d_inode->i_mutex);
882 ret = -ENOENT;
883
884out:
885 return ret;
886}
887
888/*
889 * This is ONLY called if configfs_depend_prep() did its job. So we can
890 * trust the entire path from item back up to origin.
891 *
892 * We walk backwards from item, unlocking each i_mutex. We finish by
893 * unlocking origin.
894 */
895static void configfs_depend_rollback(struct dentry *origin,
896 struct config_item *item)
897{
898 struct dentry *dentry = item->ci_dentry;
899
900 while (dentry != origin) {
901 mutex_unlock(&dentry->d_inode->i_mutex);
902 dentry = dentry->d_parent;
903 }
904
905 mutex_unlock(&origin->d_inode->i_mutex);
906}
907
908int configfs_depend_item(struct configfs_subsystem *subsys,
909 struct config_item *target)
910{
911 int ret;
912 struct configfs_dirent *p, *root_sd, *subsys_sd = NULL;
913 struct config_item *s_item = &subsys->su_group.cg_item;
914
915 /*
916 * Pin the configfs filesystem. This means we can safely access
917 * the root of the configfs filesystem.
918 */
919 ret = configfs_pin_fs();
920 if (ret)
921 return ret;
922
923 /*
924 * Next, lock the root directory. We're going to check that the
925 * subsystem is really registered, and so we need to lock out
926 * configfs_[un]register_subsystem().
927 */
928 mutex_lock(&configfs_sb->s_root->d_inode->i_mutex);
929
930 root_sd = configfs_sb->s_root->d_fsdata;
931
932 list_for_each_entry(p, &root_sd->s_children, s_sibling) {
933 if (p->s_type & CONFIGFS_DIR) {
934 if (p->s_element == s_item) {
935 subsys_sd = p;
936 break;
937 }
938 }
939 }
940
941 if (!subsys_sd) {
942 ret = -ENOENT;
943 goto out_unlock_fs;
944 }
945
946 /* Ok, now we can trust subsys/s_item */
947
948 /* Scan the tree, locking i_mutex recursively, return 0 if found */
949 ret = configfs_depend_prep(subsys_sd->s_dentry, target);
950 if (ret)
951 goto out_unlock_fs;
952
953 /* We hold all i_mutexes from the subsystem down to the target */
954 p = target->ci_dentry->d_fsdata;
955 p->s_dependent_count += 1;
956
957 configfs_depend_rollback(subsys_sd->s_dentry, target);
958
959out_unlock_fs:
960 mutex_unlock(&configfs_sb->s_root->d_inode->i_mutex);
961
962 /*
963 * If we succeeded, the fs is pinned via other methods. If not,
964 * we're done with it anyway. So release_fs() is always right.
965 */
966 configfs_release_fs();
967
968 return ret;
969}
970EXPORT_SYMBOL(configfs_depend_item);
971
972/*
973 * Release the dependent linkage. This is much simpler than
974 * configfs_depend_item() because we know that the client driver is
975 * pinned, thus the subsystem is pinned, and therefore configfs is pinned.
976 */
977void configfs_undepend_item(struct configfs_subsystem *subsys,
978 struct config_item *target)
979{
980 struct configfs_dirent *sd;
981
982 /*
983 * Since we can trust everything is pinned, we just need i_mutex
984 * on the item.
985 */
986 mutex_lock(&target->ci_dentry->d_inode->i_mutex);
987
988 sd = target->ci_dentry->d_fsdata;
989 BUG_ON(sd->s_dependent_count < 1);
990
991 sd->s_dependent_count -= 1;
992
993 /*
994 * After this unlock, we cannot trust the item to stay alive!
995 * DO NOT REFERENCE item after this unlock.
996 */
997 mutex_unlock(&target->ci_dentry->d_inode->i_mutex);
998}
999EXPORT_SYMBOL(configfs_undepend_item);
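A hypothetical client of the new interface could look like the sketch below (my_use_item and the work in the middle are assumptions, not part of this patch). Per the warning above, the pin must be taken from outside any configfs callback, and an error return simply means the target item is already gone:

#include <linux/configfs.h>

static int my_use_item(struct configfs_subsystem *my_subsys,
                       struct config_item *target)
{
        int ret;

        ret = configfs_depend_item(my_subsys, target);  /* may return -ENOENT */
        if (ret)
                return ret;

        /* target cannot be rmdir()ed here; safe to use it. */

        configfs_undepend_item(my_subsys, target);
        return 0;
}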
741 1000
742static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) 1001static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
743{ 1002{
@@ -783,7 +1042,7 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
783 1042
784 snprintf(name, dentry->d_name.len + 1, "%s", dentry->d_name.name); 1043 snprintf(name, dentry->d_name.len + 1, "%s", dentry->d_name.name);
785 1044
786 down(&subsys->su_sem); 1045 mutex_lock(&subsys->su_mutex);
787 group = NULL; 1046 group = NULL;
788 item = NULL; 1047 item = NULL;
789 if (type->ct_group_ops->make_group) { 1048 if (type->ct_group_ops->make_group) {
@@ -797,7 +1056,7 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
797 if (item) 1056 if (item)
798 link_obj(parent_item, item); 1057 link_obj(parent_item, item);
799 } 1058 }
800 up(&subsys->su_sem); 1059 mutex_unlock(&subsys->su_mutex);
801 1060
802 kfree(name); 1061 kfree(name);
803 if (!item) { 1062 if (!item) {
@@ -841,13 +1100,16 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
841out_unlink: 1100out_unlink:
842 if (ret) { 1101 if (ret) {
843 /* Tear down everything we built up */ 1102 /* Tear down everything we built up */
844 down(&subsys->su_sem); 1103 mutex_lock(&subsys->su_mutex);
1104
1105 client_disconnect_notify(parent_item, item);
845 if (group) 1106 if (group)
846 unlink_group(group); 1107 unlink_group(group);
847 else 1108 else
848 unlink_obj(item); 1109 unlink_obj(item);
849 client_drop_item(parent_item, item); 1110 client_drop_item(parent_item, item);
850 up(&subsys->su_sem); 1111
1112 mutex_unlock(&subsys->su_mutex);
851 1113
852 if (module_got) 1114 if (module_got)
853 module_put(owner); 1115 module_put(owner);
@@ -881,6 +1143,13 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
881 if (sd->s_type & CONFIGFS_USET_DEFAULT) 1143 if (sd->s_type & CONFIGFS_USET_DEFAULT)
882 return -EPERM; 1144 return -EPERM;
883 1145
1146 /*
1147 * Here's where we check for dependents. We're protected by
1148 * i_mutex.
1149 */
1150 if (sd->s_dependent_count)
1151 return -EBUSY;
1152
884 /* Get a working ref until we have the child */ 1153 /* Get a working ref until we have the child */
885 parent_item = configfs_get_config_item(dentry->d_parent); 1154 parent_item = configfs_get_config_item(dentry->d_parent);
886 subsys = to_config_group(parent_item)->cg_subsys; 1155 subsys = to_config_group(parent_item)->cg_subsys;
@@ -910,17 +1179,19 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
910 if (sd->s_type & CONFIGFS_USET_DIR) { 1179 if (sd->s_type & CONFIGFS_USET_DIR) {
911 configfs_detach_group(item); 1180 configfs_detach_group(item);
912 1181
913 down(&subsys->su_sem); 1182 mutex_lock(&subsys->su_mutex);
1183 client_disconnect_notify(parent_item, item);
914 unlink_group(to_config_group(item)); 1184 unlink_group(to_config_group(item));
915 } else { 1185 } else {
916 configfs_detach_item(item); 1186 configfs_detach_item(item);
917 1187
918 down(&subsys->su_sem); 1188 mutex_lock(&subsys->su_mutex);
1189 client_disconnect_notify(parent_item, item);
919 unlink_obj(item); 1190 unlink_obj(item);
920 } 1191 }
921 1192
922 client_drop_item(parent_item, item); 1193 client_drop_item(parent_item, item);
923 up(&subsys->su_sem); 1194 mutex_unlock(&subsys->su_mutex);
924 1195
925 /* Drop our reference from above */ 1196 /* Drop our reference from above */
926 config_item_put(item); 1197 config_item_put(item);
diff --git a/fs/configfs/file.c b/fs/configfs/file.c
index 3527c7c6def8..a3658f9a082c 100644
--- a/fs/configfs/file.c
+++ b/fs/configfs/file.c
@@ -27,19 +27,26 @@
27#include <linux/fs.h> 27#include <linux/fs.h>
28#include <linux/module.h> 28#include <linux/module.h>
29#include <linux/slab.h> 29#include <linux/slab.h>
30#include <linux/mutex.h>
30#include <asm/uaccess.h> 31#include <asm/uaccess.h>
31#include <asm/semaphore.h>
32 32
33#include <linux/configfs.h> 33#include <linux/configfs.h>
34#include "configfs_internal.h" 34#include "configfs_internal.h"
35 35
36/*
37 * A simple attribute can only be 4096 characters. Why 4k? Because the
38 * original code limited it to PAGE_SIZE. That's a bad idea, though,
39 * because an attribute of 16k on ia64 won't work on x86. So we limit to
40 * 4k, our minimum common page size.
41 */
42#define SIMPLE_ATTR_SIZE 4096
36 43
37struct configfs_buffer { 44struct configfs_buffer {
38 size_t count; 45 size_t count;
39 loff_t pos; 46 loff_t pos;
40 char * page; 47 char * page;
41 struct configfs_item_operations * ops; 48 struct configfs_item_operations * ops;
42 struct semaphore sem; 49 struct mutex mutex;
43 int needs_read_fill; 50 int needs_read_fill;
44}; 51};
45 52
@@ -69,7 +76,7 @@ static int fill_read_buffer(struct dentry * dentry, struct configfs_buffer * buf
69 76
70 count = ops->show_attribute(item,attr,buffer->page); 77 count = ops->show_attribute(item,attr,buffer->page);
71 buffer->needs_read_fill = 0; 78 buffer->needs_read_fill = 0;
72 BUG_ON(count > (ssize_t)PAGE_SIZE); 79 BUG_ON(count > (ssize_t)SIMPLE_ATTR_SIZE);
73 if (count >= 0) 80 if (count >= 0)
74 buffer->count = count; 81 buffer->count = count;
75 else 82 else
@@ -102,7 +109,7 @@ configfs_read_file(struct file *file, char __user *buf, size_t count, loff_t *pp
102 struct configfs_buffer * buffer = file->private_data; 109 struct configfs_buffer * buffer = file->private_data;
103 ssize_t retval = 0; 110 ssize_t retval = 0;
104 111
105 down(&buffer->sem); 112 mutex_lock(&buffer->mutex);
106 if (buffer->needs_read_fill) { 113 if (buffer->needs_read_fill) {
107 if ((retval = fill_read_buffer(file->f_path.dentry,buffer))) 114 if ((retval = fill_read_buffer(file->f_path.dentry,buffer)))
108 goto out; 115 goto out;
@@ -112,7 +119,7 @@ configfs_read_file(struct file *file, char __user *buf, size_t count, loff_t *pp
112 retval = simple_read_from_buffer(buf, count, ppos, buffer->page, 119 retval = simple_read_from_buffer(buf, count, ppos, buffer->page,
113 buffer->count); 120 buffer->count);
114out: 121out:
115 up(&buffer->sem); 122 mutex_unlock(&buffer->mutex);
116 return retval; 123 return retval;
117} 124}
118 125
@@ -137,8 +144,8 @@ fill_write_buffer(struct configfs_buffer * buffer, const char __user * buf, size
137 if (!buffer->page) 144 if (!buffer->page)
138 return -ENOMEM; 145 return -ENOMEM;
139 146
140 if (count >= PAGE_SIZE) 147 if (count >= SIMPLE_ATTR_SIZE)
141 count = PAGE_SIZE - 1; 148 count = SIMPLE_ATTR_SIZE - 1;
142 error = copy_from_user(buffer->page,buf,count); 149 error = copy_from_user(buffer->page,buf,count);
143 buffer->needs_read_fill = 1; 150 buffer->needs_read_fill = 1;
144 /* if buf is assumed to contain a string, terminate it by \0, 151 /* if buf is assumed to contain a string, terminate it by \0,
@@ -193,13 +200,13 @@ configfs_write_file(struct file *file, const char __user *buf, size_t count, lof
193 struct configfs_buffer * buffer = file->private_data; 200 struct configfs_buffer * buffer = file->private_data;
194 ssize_t len; 201 ssize_t len;
195 202
196 down(&buffer->sem); 203 mutex_lock(&buffer->mutex);
197 len = fill_write_buffer(buffer, buf, count); 204 len = fill_write_buffer(buffer, buf, count);
198 if (len > 0) 205 if (len > 0)
199 len = flush_write_buffer(file->f_path.dentry, buffer, count); 206 len = flush_write_buffer(file->f_path.dentry, buffer, count);
200 if (len > 0) 207 if (len > 0)
201 *ppos += len; 208 *ppos += len;
202 up(&buffer->sem); 209 mutex_unlock(&buffer->mutex);
203 return len; 210 return len;
204} 211}
205 212
@@ -253,7 +260,7 @@ static int check_perm(struct inode * inode, struct file * file)
253 error = -ENOMEM; 260 error = -ENOMEM;
254 goto Enomem; 261 goto Enomem;
255 } 262 }
256 init_MUTEX(&buffer->sem); 263 mutex_init(&buffer->mutex);
257 buffer->needs_read_fill = 1; 264 buffer->needs_read_fill = 1;
258 buffer->ops = ops; 265 buffer->ops = ops;
259 file->private_data = buffer; 266 file->private_data = buffer;
@@ -292,6 +299,7 @@ static int configfs_release(struct inode * inode, struct file * filp)
292 if (buffer) { 299 if (buffer) {
293 if (buffer->page) 300 if (buffer->page)
294 free_page((unsigned long)buffer->page); 301 free_page((unsigned long)buffer->page);
302 mutex_destroy(&buffer->mutex);
295 kfree(buffer); 303 kfree(buffer);
296 } 304 }
297 return 0; 305 return 0;
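The buffer locking change above is the routine semaphore-to-mutex conversion: init_MUTEX() becomes mutex_init(), down()/up() become mutex_lock()/mutex_unlock(), and the release path gains a mutex_destroy(). In outline, for a hypothetical structure:

#include <linux/mutex.h>
#include <linux/slab.h>

struct my_buffer {
        struct mutex mutex;             /* was: struct semaphore sem */
        char *page;
};

static struct my_buffer *my_buffer_alloc(void)
{
        struct my_buffer *b = kzalloc(sizeof(*b), GFP_KERNEL);

        if (b)
                mutex_init(&b->mutex);  /* was: init_MUTEX(&b->sem) */
        return b;
}

static void my_buffer_free(struct my_buffer *b)
{
        mutex_destroy(&b->mutex);       /* mutexes have an explicit destroy */
        kfree(b);
}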
diff --git a/fs/configfs/item.c b/fs/configfs/item.c
index 24421209f854..76dc4c3e5d51 100644
--- a/fs/configfs/item.c
+++ b/fs/configfs/item.c
@@ -62,7 +62,6 @@ void config_item_init(struct config_item * item)
62 * dynamically allocated string that @item->ci_name points to. 62 * dynamically allocated string that @item->ci_name points to.
63 * Otherwise, use the static @item->ci_namebuf array. 63 * Otherwise, use the static @item->ci_namebuf array.
64 */ 64 */
65
66int config_item_set_name(struct config_item * item, const char * fmt, ...) 65int config_item_set_name(struct config_item * item, const char * fmt, ...)
67{ 66{
68 int error = 0; 67 int error = 0;
@@ -139,12 +138,7 @@ struct config_item * config_item_get(struct config_item * item)
139 return item; 138 return item;
140} 139}
141 140
142/** 141static void config_item_cleanup(struct config_item * item)
143 * config_item_cleanup - free config_item resources.
144 * @item: item.
145 */
146
147void config_item_cleanup(struct config_item * item)
148{ 142{
149 struct config_item_type * t = item->ci_type; 143 struct config_item_type * t = item->ci_type;
150 struct config_group * s = item->ci_group; 144 struct config_group * s = item->ci_group;
@@ -179,39 +173,35 @@ void config_item_put(struct config_item * item)
179 kref_put(&item->ci_kref, config_item_release); 173 kref_put(&item->ci_kref, config_item_release);
180} 174}
181 175
182
183/** 176/**
184 * config_group_init - initialize a group for use 177 * config_group_init - initialize a group for use
185 * @k: group 178 * @k: group
186 */ 179 */
187
188void config_group_init(struct config_group *group) 180void config_group_init(struct config_group *group)
189{ 181{
190 config_item_init(&group->cg_item); 182 config_item_init(&group->cg_item);
191 INIT_LIST_HEAD(&group->cg_children); 183 INIT_LIST_HEAD(&group->cg_children);
192} 184}
193 185
194
195/** 186/**
196 * config_group_find_obj - search for item in group. 187 * config_group_find_item - search for item in group.
197 * @group: group we're looking in. 188 * @group: group we're looking in.
198 * @name: item's name. 189 * @name: item's name.
199 * 190 *
200 * Lock group via @group->cg_subsys, and iterate over @group->cg_list, 191 * Iterate over @group->cg_children, looking for a matching config_item.
201 * looking for a matching config_item. If matching item is found 192 * If a matching item is found, take a reference and return the item.
202 * take a reference and return the item. 193 * Caller must have locked the group via @group->cg_subsys->su_mutex.
203 */ 194 */
204 195struct config_item *config_group_find_item(struct config_group *group,
205struct config_item * config_group_find_obj(struct config_group * group, const char * name) 196 const char *name)
206{ 197{
207 struct list_head * entry; 198 struct list_head * entry;
208 struct config_item * ret = NULL; 199 struct config_item * ret = NULL;
209 200
210 /* XXX LOCKING! */
211 list_for_each(entry,&group->cg_children) { 201 list_for_each(entry,&group->cg_children) {
212 struct config_item * item = to_item(entry); 202 struct config_item * item = to_item(entry);
213 if (config_item_name(item) && 203 if (config_item_name(item) &&
214 !strcmp(config_item_name(item), name)) { 204 !strcmp(config_item_name(item), name)) {
215 ret = config_item_get(item); 205 ret = config_item_get(item);
216 break; 206 break;
217 } 207 }
@@ -219,9 +209,8 @@ struct config_item * config_group_find_obj(struct config_group * group, const ch
219 return ret; 209 return ret;
220} 210}
221 211
222
223EXPORT_SYMBOL(config_item_init); 212EXPORT_SYMBOL(config_item_init);
224EXPORT_SYMBOL(config_group_init); 213EXPORT_SYMBOL(config_group_init);
225EXPORT_SYMBOL(config_item_get); 214EXPORT_SYMBOL(config_item_get);
226EXPORT_SYMBOL(config_item_put); 215EXPORT_SYMBOL(config_item_put);
227EXPORT_SYMBOL(config_group_find_obj); 216EXPORT_SYMBOL(config_group_find_item);
diff --git a/fs/dcache.c b/fs/dcache.c
index 0e73aa0a0e8b..cb9d05056b54 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -883,6 +883,11 @@ static int shrink_dcache_memory(int nr, gfp_t gfp_mask)
883 return (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure; 883 return (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure;
884} 884}
885 885
886static struct shrinker dcache_shrinker = {
887 .shrink = shrink_dcache_memory,
888 .seeks = DEFAULT_SEEKS,
889};
890
886/** 891/**
887 * d_alloc - allocate a dcache entry 892 * d_alloc - allocate a dcache entry
888 * @parent: parent of entry to allocate 893 * @parent: parent of entry to allocate
@@ -2115,7 +2120,7 @@ static void __init dcache_init(unsigned long mempages)
2115 dentry_cache = KMEM_CACHE(dentry, 2120 dentry_cache = KMEM_CACHE(dentry,
2116 SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD); 2121 SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD);
2117 2122
2118 set_shrinker(DEFAULT_SEEKS, shrink_dcache_memory); 2123 register_shrinker(&dcache_shrinker);
2119 2124
2120 /* Hash may have been set up in dcache_init_early */ 2125 /* Hash may have been set up in dcache_init_early */
2121 if (!hashdist) 2126 if (!hashdist)
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index 5069b2cb5a1f..2f8e3c81bc19 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -133,14 +133,6 @@ static ssize_t cluster_set(struct cluster *cl, unsigned int *cl_field,
133 return len; 133 return len;
134} 134}
135 135
136#define __CONFIGFS_ATTR(_name,_mode,_read,_write) { \
137 .attr = { .ca_name = __stringify(_name), \
138 .ca_mode = _mode, \
139 .ca_owner = THIS_MODULE }, \
140 .show = _read, \
141 .store = _write, \
142}
143
144#define CLUSTER_ATTR(name, check_zero) \ 136#define CLUSTER_ATTR(name, check_zero) \
145static ssize_t name##_write(struct cluster *cl, const char *buf, size_t len) \ 137static ssize_t name##_write(struct cluster *cl, const char *buf, size_t len) \
146{ \ 138{ \
@@ -615,7 +607,7 @@ static struct clusters clusters_root = {
615int dlm_config_init(void) 607int dlm_config_init(void)
616{ 608{
617 config_group_init(&clusters_root.subsys.su_group); 609 config_group_init(&clusters_root.subsys.su_group);
618 init_MUTEX(&clusters_root.subsys.su_sem); 610 mutex_init(&clusters_root.subsys.su_mutex);
619 return configfs_register_subsystem(&clusters_root.subsys); 611 return configfs_register_subsystem(&clusters_root.subsys);
620} 612}
621 613
@@ -759,9 +751,9 @@ static struct space *get_space(char *name)
759 if (!space_list) 751 if (!space_list)
760 return NULL; 752 return NULL;
761 753
762 down(&space_list->cg_subsys->su_sem); 754 mutex_lock(&space_list->cg_subsys->su_mutex);
763 i = config_group_find_obj(space_list, name); 755 i = config_group_find_item(space_list, name);
764 up(&space_list->cg_subsys->su_sem); 756 mutex_unlock(&space_list->cg_subsys->su_mutex);
765 757
766 return to_space(i); 758 return to_space(i);
767} 759}
@@ -780,7 +772,7 @@ static struct comm *get_comm(int nodeid, struct sockaddr_storage *addr)
780 if (!comm_list) 772 if (!comm_list)
781 return NULL; 773 return NULL;
782 774
783 down(&clusters_root.subsys.su_sem); 775 mutex_lock(&clusters_root.subsys.su_mutex);
784 776
785 list_for_each_entry(i, &comm_list->cg_children, ci_entry) { 777 list_for_each_entry(i, &comm_list->cg_children, ci_entry) {
786 cm = to_comm(i); 778 cm = to_comm(i);
@@ -800,7 +792,7 @@ static struct comm *get_comm(int nodeid, struct sockaddr_storage *addr)
800 break; 792 break;
801 } 793 }
802 } 794 }
803 up(&clusters_root.subsys.su_sem); 795 mutex_unlock(&clusters_root.subsys.su_mutex);
804 796
805 if (!found) 797 if (!found)
806 cm = NULL; 798 cm = NULL;
diff --git a/fs/dquot.c b/fs/dquot.c
index 8819d281500c..7e273151f589 100644
--- a/fs/dquot.c
+++ b/fs/dquot.c
@@ -538,6 +538,11 @@ static int shrink_dqcache_memory(int nr, gfp_t gfp_mask)
538 return (dqstats.free_dquots / 100) * sysctl_vfs_cache_pressure; 538 return (dqstats.free_dquots / 100) * sysctl_vfs_cache_pressure;
539} 539}
540 540
541static struct shrinker dqcache_shrinker = {
542 .shrink = shrink_dqcache_memory,
543 .seeks = DEFAULT_SEEKS,
544};
545
541/* 546/*
542 * Put reference to dquot 547 * Put reference to dquot
543 * NOTE: If you change this function please check whether dqput_blocks() works right... 548 * NOTE: If you change this function please check whether dqput_blocks() works right...
@@ -1870,7 +1875,7 @@ static int __init dquot_init(void)
1870 printk("Dquot-cache hash table entries: %ld (order %ld, %ld bytes)\n", 1875 printk("Dquot-cache hash table entries: %ld (order %ld, %ld bytes)\n",
1871 nr_hash, order, (PAGE_SIZE << order)); 1876 nr_hash, order, (PAGE_SIZE << order));
1872 1877
1873 set_shrinker(DEFAULT_SEEKS, shrink_dqcache_memory); 1878 register_shrinker(&dqcache_shrinker);
1874 1879
1875 return 0; 1880 return 0;
1876} 1881}
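Both fs/dcache.c and fs/dquot.c above move from set_shrinker() to the struct-based API, which also makes the shrinker unregisterable again. A hypothetical cache would follow the same shape (my_cache_objects and the reclaim work are placeholders):

#include <linux/dcache.h>        /* sysctl_vfs_cache_pressure */
#include <linux/mm.h>
#include <linux/module.h>

static unsigned long my_cache_objects;  /* hypothetical population count */

static int my_cache_shrink(int nr, gfp_t gfp_mask)
{
        if (nr) {
                /* free up to 'nr' entries here, honouring gfp_mask */
        }
        /* Scale the remaining count the way the VFS caches do. */
        return (my_cache_objects / 100) * sysctl_vfs_cache_pressure;
}

static struct shrinker my_shrinker = {
        .shrink = my_cache_shrink,
        .seeks  = DEFAULT_SEEKS,
};

static int __init my_cache_init(void)
{
        register_shrinker(&my_shrinker);
        return 0;
}

static void __exit my_cache_exit(void)
{
        unregister_shrinker(&my_shrinker);
}

module_init(my_cache_init);
module_exit(my_cache_exit);
MODULE_LICENSE("GPL");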
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index 03ea7696fe39..59375efcf39d 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -20,7 +20,7 @@ static void drop_pagecache_sb(struct super_block *sb)
20 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { 20 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
21 if (inode->i_state & (I_FREEING|I_WILL_FREE)) 21 if (inode->i_state & (I_FREEING|I_WILL_FREE))
22 continue; 22 continue;
23 invalidate_mapping_pages(inode->i_mapping, 0, -1); 23 __invalidate_mapping_pages(inode->i_mapping, 0, -1, true);
24 } 24 }
25 spin_unlock(&inode_lock); 25 spin_unlock(&inode_lock);
26} 26}
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 83e94fedd4e9..e77a2ec71aa5 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -282,7 +282,7 @@ static struct dentry *ecryptfs_lookup(struct inode *dir, struct dentry *dentry,
282 struct dentry *lower_dentry; 282 struct dentry *lower_dentry;
283 struct vfsmount *lower_mnt; 283 struct vfsmount *lower_mnt;
284 char *encoded_name; 284 char *encoded_name;
285 unsigned int encoded_namelen; 285 int encoded_namelen;
286 struct ecryptfs_crypt_stat *crypt_stat = NULL; 286 struct ecryptfs_crypt_stat *crypt_stat = NULL;
287 struct ecryptfs_mount_crypt_stat *mount_crypt_stat; 287 struct ecryptfs_mount_crypt_stat *mount_crypt_stat;
288 char *page_virt = NULL; 288 char *page_virt = NULL;
@@ -473,7 +473,7 @@ static int ecryptfs_symlink(struct inode *dir, struct dentry *dentry,
473 struct dentry *lower_dir_dentry; 473 struct dentry *lower_dir_dentry;
474 umode_t mode; 474 umode_t mode;
475 char *encoded_symname; 475 char *encoded_symname;
476 unsigned int encoded_symlen; 476 int encoded_symlen;
477 struct ecryptfs_crypt_stat *crypt_stat = NULL; 477 struct ecryptfs_crypt_stat *crypt_stat = NULL;
478 478
479 lower_dentry = ecryptfs_dentry_to_lower(dentry); 479 lower_dentry = ecryptfs_dentry_to_lower(dentry);
diff --git a/fs/efs/namei.c b/fs/efs/namei.c
index ed4a207fe22a..5276b19423c1 100644
--- a/fs/efs/namei.c
+++ b/fs/efs/namei.c
@@ -75,6 +75,38 @@ struct dentry *efs_lookup(struct inode *dir, struct dentry *dentry, struct namei
75 return NULL; 75 return NULL;
76} 76}
77 77
78struct dentry *efs_get_dentry(struct super_block *sb, void *vobjp)
79{
80 __u32 *objp = vobjp;
81 unsigned long ino = objp[0];
82 __u32 generation = objp[1];
83 struct inode *inode;
84 struct dentry *result;
85
86 if (ino == 0)
87 return ERR_PTR(-ESTALE);
88 inode = iget(sb, ino);
89 if (inode == NULL)
90 return ERR_PTR(-ENOMEM);
91
92 if (is_bad_inode(inode) ||
93 (generation && inode->i_generation != generation)) {
94 result = ERR_PTR(-ESTALE);
95 goto out_iput;
96 }
97
98 result = d_alloc_anon(inode);
99 if (!result) {
100 result = ERR_PTR(-ENOMEM);
101 goto out_iput;
102 }
103 return result;
104
105 out_iput:
106 iput(inode);
107 return result;
108}
109
78struct dentry *efs_get_parent(struct dentry *child) 110struct dentry *efs_get_parent(struct dentry *child)
79{ 111{
80 struct dentry *parent; 112 struct dentry *parent;
diff --git a/fs/efs/super.c b/fs/efs/super.c
index e0a6839e68ae..d360c81f3a72 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -11,6 +11,7 @@
11#include <linux/efs_fs.h> 11#include <linux/efs_fs.h>
12#include <linux/efs_vh.h> 12#include <linux/efs_vh.h>
13#include <linux/efs_fs_sb.h> 13#include <linux/efs_fs_sb.h>
14#include <linux/exportfs.h>
14#include <linux/slab.h> 15#include <linux/slab.h>
15#include <linux/buffer_head.h> 16#include <linux/buffer_head.h>
16#include <linux/vfs.h> 17#include <linux/vfs.h>
@@ -113,6 +114,7 @@ static const struct super_operations efs_superblock_operations = {
113}; 114};
114 115
115static struct export_operations efs_export_ops = { 116static struct export_operations efs_export_ops = {
117 .get_dentry = efs_get_dentry,
116 .get_parent = efs_get_parent, 118 .get_parent = efs_get_parent,
117}; 119};
118 120
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index e98f6cd7200c..8adb32a9387a 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -1,15 +1,45 @@
1 1
2#include <linux/exportfs.h>
2#include <linux/fs.h> 3#include <linux/fs.h>
3#include <linux/file.h> 4#include <linux/file.h>
4#include <linux/module.h> 5#include <linux/module.h>
6#include <linux/mount.h>
5#include <linux/namei.h> 7#include <linux/namei.h>
6 8
7struct export_operations export_op_default; 9#define dprintk(fmt, args...) do{}while(0)
8 10
9#define CALL(ops,fun) ((ops->fun)?(ops->fun):export_op_default.fun)
10 11
11#define dprintk(fmt, args...) do{}while(0) 12static int get_name(struct dentry *dentry, char *name,
13 struct dentry *child);
14
15
16static struct dentry *exportfs_get_dentry(struct super_block *sb, void *obj)
17{
18 struct dentry *result = ERR_PTR(-ESTALE);
19
20 if (sb->s_export_op->get_dentry) {
21 result = sb->s_export_op->get_dentry(sb, obj);
22 if (!result)
23 result = ERR_PTR(-ESTALE);
24 }
25
26 return result;
27}
28
29static int exportfs_get_name(struct dentry *dir, char *name,
30 struct dentry *child)
31{
32 struct export_operations *nop = dir->d_sb->s_export_op;
12 33
34 if (nop->get_name)
35 return nop->get_name(dir, name, child);
36 else
37 return get_name(dir, name, child);
38}
39
40/*
41 * Check if the dentry or any of its aliases is acceptable.
42 */
13static struct dentry * 43static struct dentry *
14find_acceptable_alias(struct dentry *result, 44find_acceptable_alias(struct dentry *result,
15 int (*acceptable)(void *context, struct dentry *dentry), 45 int (*acceptable)(void *context, struct dentry *dentry),
@@ -17,6 +47,9 @@ find_acceptable_alias(struct dentry *result,
17{ 47{
18 struct dentry *dentry, *toput = NULL; 48 struct dentry *dentry, *toput = NULL;
19 49
50 if (acceptable(context, result))
51 return result;
52
20 spin_lock(&dcache_lock); 53 spin_lock(&dcache_lock);
21 list_for_each_entry(dentry, &result->d_inode->i_dentry, d_alias) { 54 list_for_each_entry(dentry, &result->d_inode->i_dentry, d_alias) {
22 dget_locked(dentry); 55 dget_locked(dentry);
@@ -37,130 +70,50 @@ find_acceptable_alias(struct dentry *result,
37 return NULL; 70 return NULL;
38} 71}
39 72
40/** 73/*
41 * find_exported_dentry - helper routine to implement export_operations->decode_fh 74 * Find root of a disconnected subtree and return a reference to it.
42 * @sb: The &super_block identifying the filesystem
43 * @obj: An opaque identifier of the object to be found - passed to
44 * get_inode
45 * @parent: An optional opaque identifier of the parent of the object.
46 * @acceptable: A function used to test possible &dentries to see if they are
47 * acceptable
48 * @context: A parameter to @acceptable so that it knows on what basis to
49 * judge.
50 *
51 * find_exported_dentry is the central helper routine to enable file systems
52 * to provide the decode_fh() export_operation. Its main task is to take
53 * an &inode, find or create an appropriate &dentry structure, and possibly
54 * splice this into the dcache in the correct place.
55 *
56 * The decode_fh() operation provided by the filesystem should call
57 * find_exported_dentry() with the same parameters that it received except
58 * that instead of the file handle fragment, pointers to opaque identifiers
59 * for the object and optionally its parent are passed. The default decode_fh
60 * routine passes one pointer to the start of the filehandle fragment, and
61 * one 8 bytes into the fragment. It is expected that most filesystems will
62 * take this approach, though the offset to the parent identifier may well be
63 * different.
64 *
65 * find_exported_dentry() will call get_dentry to get a dentry pointer from
66 * the file system. If any &dentry in the d_alias list is acceptable, it will
67 * be returned. Otherwise find_exported_dentry() will attempt to splice a new
68 * &dentry into the dcache using get_name() and get_parent() to find the
69 * appropriate place.
70 */ 75 */
71 76static struct dentry *
72struct dentry * 77find_disconnected_root(struct dentry *dentry)
73find_exported_dentry(struct super_block *sb, void *obj, void *parent,
74 int (*acceptable)(void *context, struct dentry *de),
75 void *context)
76{ 78{
77 struct dentry *result = NULL; 79 dget(dentry);
78 struct dentry *target_dir; 80 spin_lock(&dentry->d_lock);
79 int err; 81 while (!IS_ROOT(dentry) &&
80 struct export_operations *nops = sb->s_export_op; 82 (dentry->d_parent->d_flags & DCACHE_DISCONNECTED)) {
81 struct dentry *alias; 83 struct dentry *parent = dentry->d_parent;
82 int noprogress; 84 dget(parent);
83 char nbuf[NAME_MAX+1]; 85 spin_unlock(&dentry->d_lock);
84 86 dput(dentry);
85 /* 87 dentry = parent;
86 * Attempt to find the inode. 88 spin_lock(&dentry->d_lock);
87 */
88 result = CALL(sb->s_export_op,get_dentry)(sb,obj);
89 err = -ESTALE;
90 if (result == NULL)
91 goto err_out;
92 if (IS_ERR(result)) {
93 err = PTR_ERR(result);
94 goto err_out;
95 } 89 }
96 if (S_ISDIR(result->d_inode->i_mode) && 90 spin_unlock(&dentry->d_lock);
97 (result->d_flags & DCACHE_DISCONNECTED)) { 91 return dentry;
98 /* it is an unconnected directory, we must connect it */ 92}
99 ;
100 } else {
101 if (acceptable(context, result))
102 return result;
103 if (S_ISDIR(result->d_inode->i_mode)) {
104 err = -EACCES;
105 goto err_result;
106 }
107 93
108 alias = find_acceptable_alias(result, acceptable, context);
109 if (alias)
110 return alias;
111 }
112
113 /* It's a directory, or we are required to confirm the file's
114 * location in the tree based on the parent information
115 */
116 dprintk("find_exported_dentry: need to look harder for %s/%d\n",sb->s_id,*(int*)obj);
117 if (S_ISDIR(result->d_inode->i_mode))
118 target_dir = dget(result);
119 else {
120 if (parent == NULL)
121 goto err_result;
122 94
123 target_dir = CALL(sb->s_export_op,get_dentry)(sb,parent); 95/*
124 if (IS_ERR(target_dir)) 96 * Make sure target_dir is fully connected to the dentry tree.
125 err = PTR_ERR(target_dir); 97 *
126 if (target_dir == NULL || IS_ERR(target_dir)) 98 * It may already be, as the flag isn't always updated when connection happens.
127 goto err_result; 99 */
128 } 100static int
129 /* 101reconnect_path(struct super_block *sb, struct dentry *target_dir)
130 * Now we need to make sure that target_dir is properly connected. 102{
131 * It may already be, as the flag isn't always updated when connection 103 char nbuf[NAME_MAX+1];
132 * happens. 104 int noprogress = 0;
133 * So, we walk up parent links until we find a connected directory, 105 int err = -ESTALE;
134 * or we run out of directories. Then we find the parent, find
135 * the name of the child in that parent, and do a lookup.
136 * This should connect the child into the parent
137 * We then repeat.
138 */
139 106
140 /* it is possible that a confused file system might not let us complete 107 /*
108 * It is possible that a confused file system might not let us complete
141 * the path to the root. For example, if get_parent returns a directory 109 * the path to the root. For example, if get_parent returns a directory
142 * in which we cannot find a name for the child. While this implies a 110 * in which we cannot find a name for the child. While this implies a
143 * very sick filesystem we don't want it to cause knfsd to spin. Hence 111 * very sick filesystem we don't want it to cause knfsd to spin. Hence
144 * the noprogress counter. If we go through the loop 10 times (2 is 112 * the noprogress counter. If we go through the loop 10 times (2 is
145 * probably enough) without getting anywhere, we just give up 113 * probably enough) without getting anywhere, we just give up
146 */ 114 */
147 noprogress= 0;
148 while (target_dir->d_flags & DCACHE_DISCONNECTED && noprogress++ < 10) { 115 while (target_dir->d_flags & DCACHE_DISCONNECTED && noprogress++ < 10) {
149 struct dentry *pd = target_dir; 116 struct dentry *pd = find_disconnected_root(target_dir);
150
151 dget(pd);
152 spin_lock(&pd->d_lock);
153 while (!IS_ROOT(pd) &&
154 (pd->d_parent->d_flags&DCACHE_DISCONNECTED)) {
155 struct dentry *parent = pd->d_parent;
156
157 dget(parent);
158 spin_unlock(&pd->d_lock);
159 dput(pd);
160 pd = parent;
161 spin_lock(&pd->d_lock);
162 }
163 spin_unlock(&pd->d_lock);
164 117
165 if (!IS_ROOT(pd)) { 118 if (!IS_ROOT(pd)) {
166 /* must have found a connected parent - great */ 119 /* must have found a connected parent - great */
@@ -175,29 +128,40 @@ find_exported_dentry(struct super_block *sb, void *obj, void *parent,
175 spin_unlock(&pd->d_lock); 128 spin_unlock(&pd->d_lock);
176 noprogress = 0; 129 noprogress = 0;
177 } else { 130 } else {
178 /* we have hit the top of a disconnected path. Try 131 /*
179 * to find parent and connect 132 * We have hit the top of a disconnected path, try to
180 * note: racing with some other process renaming a 133 * find parent and connect.
181 * directory isn't much of a problem here. If someone 134 *
182 * renames the directory, it will end up properly 135 * Racing with some other process renaming a directory
183 * connected, which is what we want 136 * isn't much of a problem here. If someone renames
137 * the directory, it will end up properly connected,
138 * which is what we want
139 *
140 * Getting the parent can't be supported generically,
141 * the locking is too icky.
142 *
143 * Instead we just return EACCES. If server reboots
144 * or inodes get flushed, you lose
184 */ 145 */
185 struct dentry *ppd; 146 struct dentry *ppd = ERR_PTR(-EACCES);
186 struct dentry *npd; 147 struct dentry *npd;
187 148
188 mutex_lock(&pd->d_inode->i_mutex); 149 mutex_lock(&pd->d_inode->i_mutex);
189 ppd = CALL(nops,get_parent)(pd); 150 if (sb->s_export_op->get_parent)
151 ppd = sb->s_export_op->get_parent(pd);
190 mutex_unlock(&pd->d_inode->i_mutex); 152 mutex_unlock(&pd->d_inode->i_mutex);
191 153
192 if (IS_ERR(ppd)) { 154 if (IS_ERR(ppd)) {
193 err = PTR_ERR(ppd); 155 err = PTR_ERR(ppd);
194 dprintk("find_exported_dentry: get_parent of %ld failed, err %d\n", 156 dprintk("%s: get_parent of %ld failed, err %d\n",
195 pd->d_inode->i_ino, err); 157 __FUNCTION__, pd->d_inode->i_ino, err);
196 dput(pd); 158 dput(pd);
197 break; 159 break;
198 } 160 }
199 dprintk("find_exported_dentry: find name of %lu in %lu\n", pd->d_inode->i_ino, ppd->d_inode->i_ino); 161
200 err = CALL(nops,get_name)(ppd, nbuf, pd); 162 dprintk("%s: find name of %lu in %lu\n", __FUNCTION__,
163 pd->d_inode->i_ino, ppd->d_inode->i_ino);
164 err = exportfs_get_name(ppd, nbuf, pd);
201 if (err) { 165 if (err) {
202 dput(ppd); 166 dput(ppd);
203 dput(pd); 167 dput(pd);
@@ -208,13 +172,14 @@ find_exported_dentry(struct super_block *sb, void *obj, void *parent,
208 continue; 172 continue;
209 break; 173 break;
210 } 174 }
211 dprintk("find_exported_dentry: found name: %s\n", nbuf); 175 dprintk("%s: found name: %s\n", __FUNCTION__, nbuf);
212 mutex_lock(&ppd->d_inode->i_mutex); 176 mutex_lock(&ppd->d_inode->i_mutex);
213 npd = lookup_one_len(nbuf, ppd, strlen(nbuf)); 177 npd = lookup_one_len(nbuf, ppd, strlen(nbuf));
214 mutex_unlock(&ppd->d_inode->i_mutex); 178 mutex_unlock(&ppd->d_inode->i_mutex);
215 if (IS_ERR(npd)) { 179 if (IS_ERR(npd)) {
216 err = PTR_ERR(npd); 180 err = PTR_ERR(npd);
217 dprintk("find_exported_dentry: lookup failed: %d\n", err); 181 dprintk("%s: lookup failed: %d\n",
182 __FUNCTION__, err);
218 dput(ppd); 183 dput(ppd);
219 dput(pd); 184 dput(pd);
220 break; 185 break;
@@ -227,7 +192,7 @@ find_exported_dentry(struct super_block *sb, void *obj, void *parent,
227 if (npd == pd) 192 if (npd == pd)
228 noprogress = 0; 193 noprogress = 0;
229 else 194 else
230 printk("find_exported_dentry: npd != pd\n"); 195 printk("%s: npd != pd\n", __FUNCTION__);
231 dput(npd); 196 dput(npd);
232 dput(ppd); 197 dput(ppd);
233 if (IS_ROOT(pd)) { 198 if (IS_ROOT(pd)) {
@@ -243,15 +208,101 @@ find_exported_dentry(struct super_block *sb, void *obj, void *parent,
243 /* something went wrong - oh-well */ 208 /* something went wrong - oh-well */
244 if (!err) 209 if (!err)
245 err = -ESTALE; 210 err = -ESTALE;
246 goto err_target; 211 return err;
247 } 212 }
248 	/* if we weren't after a directory, have one more step to go */
249 	if (result != target_dir) {
250 		struct dentry *nresult;
251 		err = CALL(nops,get_name)(target_dir, nbuf, result);
213
214 	return 0;
215 }
216
217/**
218 * find_exported_dentry - helper routine to implement export_operations->decode_fh
219 * @sb: The &super_block identifying the filesystem
220 * @obj: An opaque identifier of the object to be found - passed to
221 * get_inode
222 * @parent: An optional opaque identifier of the parent of the object.
223 * @acceptable: A function used to test possible &dentries to see if they are
224 * acceptable
225 * @context: A parameter to @acceptable so that it knows on what basis to
226 * judge.
227 *
228 * find_exported_dentry is the central helper routine to enable file systems
229 * to provide the decode_fh() export_operation. Its main task is to take
230 * an &inode, find or create an appropriate &dentry structure, and possibly
231 * splice this into the dcache in the correct place.
232 *
233 * The decode_fh() operation provided by the filesystem should call
234 * find_exported_dentry() with the same parameters that it received except
235 * that instead of the file handle fragment, pointers to opaque identifiers
236 * for the object and optionally its parent are passed. The default decode_fh
237 * routine passes one pointer to the start of the filehandle fragment, and
238 * one 8 bytes into the fragment. It is expected that most filesystems will
239 * take this approach, though the offset to the parent identifier may well be
240 * different.
241 *
242 * find_exported_dentry() will call get_dentry to get a dentry pointer from
243 * the file system. If any &dentry in the d_alias list is acceptable, it will
244 * be returned. Otherwise find_exported_dentry() will attempt to splice a new
245 * &dentry into the dcache using get_name() and get_parent() to find the
246 * appropriate place.
247 */
248
249struct dentry *
250find_exported_dentry(struct super_block *sb, void *obj, void *parent,
251 int (*acceptable)(void *context, struct dentry *de),
252 void *context)
253{
254 struct dentry *result, *alias;
255 int err = -ESTALE;
256
257 /*
258 * Attempt to find the inode.
259 */
260 result = exportfs_get_dentry(sb, obj);
261 if (IS_ERR(result))
262 return result;
263
264 if (S_ISDIR(result->d_inode->i_mode)) {
265 if (!(result->d_flags & DCACHE_DISCONNECTED)) {
266 if (acceptable(context, result))
267 return result;
268 err = -EACCES;
269 goto err_result;
270 }
271
272 err = reconnect_path(sb, result);
273 if (err)
274 goto err_result;
275 } else {
276 struct dentry *target_dir, *nresult;
277 char nbuf[NAME_MAX+1];
278
279 alias = find_acceptable_alias(result, acceptable, context);
280 if (alias)
281 return alias;
282
283 if (parent == NULL)
284 goto err_result;
285
286 target_dir = exportfs_get_dentry(sb,parent);
287 if (IS_ERR(target_dir)) {
288 err = PTR_ERR(target_dir);
289 goto err_result;
290 }
291
292 err = reconnect_path(sb, target_dir);
293 if (err) {
294 dput(target_dir);
295 goto err_result;
296 }
297
298 /*
299 * As we weren't after a directory, have one more step to go.
300 */
301 err = exportfs_get_name(target_dir, nbuf, result);
252 if (!err) { 302 if (!err) {
253 mutex_lock(&target_dir->d_inode->i_mutex); 303 mutex_lock(&target_dir->d_inode->i_mutex);
254 nresult = lookup_one_len(nbuf, target_dir, strlen(nbuf)); 304 nresult = lookup_one_len(nbuf, target_dir,
305 strlen(nbuf));
255 mutex_unlock(&target_dir->d_inode->i_mutex); 306 mutex_unlock(&target_dir->d_inode->i_mutex);
256 if (!IS_ERR(nresult)) { 307 if (!IS_ERR(nresult)) {
257 if (nresult->d_inode) { 308 if (nresult->d_inode) {
@@ -261,11 +312,8 @@ find_exported_dentry(struct super_block *sb, void *obj, void *parent,
261 dput(nresult); 312 dput(nresult);
262 } 313 }
263 } 314 }
315 dput(target_dir);
264 } 316 }
265 dput(target_dir);
266 /* now result is properly connected, it is our best bet */
267 if (acceptable(context, result))
268 return result;
269 317
270 alias = find_acceptable_alias(result, acceptable, context); 318 alias = find_acceptable_alias(result, acceptable, context);
271 if (alias) 319 if (alias)
@@ -275,32 +323,16 @@ find_exported_dentry(struct super_block *sb, void *obj, void *parent,
275 dput(result); 323 dput(result);
276 /* It might be justifiable to return ESTALE here, 324 /* It might be justifiable to return ESTALE here,
277 * but the filehandle at least looks reasonably good 325 * but the filehandle at least looks reasonably good
278 * and it just be a permission problem, so returning 326 * and it may just be a permission problem, so returning
279 * -EACCES is safer 327 * -EACCES is safer
280 */ 328 */
281 return ERR_PTR(-EACCES); 329 return ERR_PTR(-EACCES);
282 330
283 err_target:
284 dput(target_dir);
285 err_result: 331 err_result:
286 dput(result); 332 dput(result);
287 err_out:
288 return ERR_PTR(err); 333 return ERR_PTR(err);
289} 334}
290 335
291
292
293static struct dentry *get_parent(struct dentry *child)
294{
295 /* get_parent cannot be supported generically, the locking
296 * is too icky.
297 * instead, we just return EACCES. If server reboots or inodes
298 * get flushed, you lose
299 */
300 return ERR_PTR(-EACCES);
301}
302
303
304struct getdents_callback { 336struct getdents_callback {
305 	char *name;	/* name that was found. It already points to a 337 	char *name;	/* name that was found. It already points to a
306 	 * buffer NAME_MAX+1 is size */ 338 	 * buffer NAME_MAX+1 is size */
@@ -390,61 +422,6 @@ out:
390 return error; 422 return error;
391} 423}
392 424
393
394static struct dentry *export_iget(struct super_block *sb, unsigned long ino, __u32 generation)
395{
396
397 /* iget isn't really right if the inode is currently unallocated!!
398 * This should really all be done inside each filesystem
399 *
400 * ext2fs' read_inode has been strengthened to return a bad_inode if
401 * the inode had been deleted.
402 *
403 * Currently we don't know the generation for parent directory, so
404 * a generation of 0 means "accept any"
405 */
406 struct inode *inode;
407 struct dentry *result;
408 if (ino == 0)
409 return ERR_PTR(-ESTALE);
410 inode = iget(sb, ino);
411 if (inode == NULL)
412 return ERR_PTR(-ENOMEM);
413 if (is_bad_inode(inode)
414 || (generation && inode->i_generation != generation)
415 ) {
416 /* we didn't find the right inode.. */
417 dprintk("fh_verify: Inode %lu, Bad count: %d %d or version %u %u\n",
418 inode->i_ino,
419 inode->i_nlink, atomic_read(&inode->i_count),
420 inode->i_generation,
421 generation);
422
423 iput(inode);
424 return ERR_PTR(-ESTALE);
425 }
426 /* now to find a dentry.
427 * If possible, get a well-connected one
428 */
429 result = d_alloc_anon(inode);
430 if (!result) {
431 iput(inode);
432 return ERR_PTR(-ENOMEM);
433 }
434 return result;
435}
436
437
438static struct dentry *get_object(struct super_block *sb, void *vobjp)
439{
440 __u32 *objp = vobjp;
441 unsigned long ino = objp[0];
442 __u32 generation = objp[1];
443
444 return export_iget(sb, ino, generation);
445}
446
447
448/** 425/**
449 * export_encode_fh - default export_operations->encode_fh function 426 * export_encode_fh - default export_operations->encode_fh function
450 * @dentry: the dentry to encode 427 * @dentry: the dentry to encode
@@ -517,16 +494,40 @@ static struct dentry *export_decode_fh(struct super_block *sb, __u32 *fh, int fh
517 acceptable, context); 494 acceptable, context);
518} 495}
519 496
520 struct export_operations export_op_default = {
521 	.decode_fh	= export_decode_fh,
522 	.encode_fh	= export_encode_fh,
523
524 	.get_name	= get_name,
525 	.get_parent	= get_parent,
526 	.get_dentry	= get_object,
527 };
528
529 EXPORT_SYMBOL(export_op_default);
497 int exportfs_encode_fh(struct dentry *dentry, __u32 *fh, int *max_len,
498 		int connectable)
499 {
500 	struct export_operations *nop = dentry->d_sb->s_export_op;
501 	int error;
502
503 	if (nop->encode_fh)
504 		error = nop->encode_fh(dentry, fh, max_len, connectable);
505 	else
506 		error = export_encode_fh(dentry, fh, max_len, connectable);
507
508 	return error;
509 }
510 EXPORT_SYMBOL_GPL(exportfs_encode_fh);
511
512 struct dentry *exportfs_decode_fh(struct vfsmount *mnt, __u32 *fh, int fh_len,
513 		int fileid_type, int (*acceptable)(void *, struct dentry *),
514 		void *context)
515 {
516 	struct export_operations *nop = mnt->mnt_sb->s_export_op;
517 	struct dentry *result;
518
519 	if (nop->decode_fh) {
520 		result = nop->decode_fh(mnt->mnt_sb, fh, fh_len, fileid_type,
521 				acceptable, context);
522 	} else {
523 		result = export_decode_fh(mnt->mnt_sb, fh, fh_len, fileid_type,
524 				acceptable, context);
525 	}
526
527 	return result;
528 }
529 EXPORT_SYMBOL_GPL(exportfs_decode_fh);
530
530 EXPORT_SYMBOL(find_exported_dentry); 531 EXPORT_SYMBOL(find_exported_dentry);
531  532
532 MODULE_LICENSE("GPL"); 533 MODULE_LICENSE("GPL");
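
[Aside: the reconnection walk that this file refactors into reconnect_path() is easier to follow outside the dcache machinery. Below is a minimal user-space sketch, assuming a hypothetical node structure in place of struct dentry; find_disconnected_root() comes from the patch, but the struct, the connected flag, and main() are invented for illustration, and the get_parent()/get_name() round trip is elided.]

#include <stdbool.h>
#include <stdio.h>

struct node {
	struct node *parent;
	bool connected;
	const char *name;
};

/* climb to the topmost still-disconnected ancestor, like find_disconnected_root() */
static struct node *find_disconnected_root(struct node *n)
{
	while (n->parent && !n->parent->connected)
		n = n->parent;
	return n;
}

static int reconnect(struct node *target)
{
	int noprogress = 0;

	while (!target->connected && noprogress++ < 10) {
		struct node *top = find_disconnected_root(target);

		if (top->parent && top->parent->connected) {
			/* found a connected parent: the whole chain below is reachable */
			for (struct node *n = target; n != top->parent; n = n->parent)
				n->connected = true;
			noprogress = 0;
		}
		/* else: ask the filesystem for the parent; omitted in this sketch */
	}
	return target->connected ? 0 : -1;
}

int main(void)
{
	struct node root = { NULL, true, "/" };
	struct node a = { &root, false, "a" };
	struct node b = { &a, false, "b" };

	printf("reconnect: %d\n", reconnect(&b));	/* prints 0 */
	return 0;
}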
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 04afeecaaef3..ab7961260c49 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -24,9 +24,9 @@
24#include "acl.h" 24#include "acl.h"
25 25
26/* 26/*
27 * Called when an inode is released. Note that this is different 27 * Called when filp is released. This happens when all file descriptors
28 * from ext2_open_file: open gets called at every open, but release 28 * for a single struct file are closed. Note that different open() calls
29 * gets called only when /all/ the files are closed. 29 * for the same file yield different struct file structures.
30 */ 30 */
31static int ext2_release_file (struct inode * inode, struct file * filp) 31static int ext2_release_file (struct inode * inode, struct file * filp)
32{ 32{
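
[Aside: the rewritten comment deserves a concrete illustration. In this user-space sketch, dup() copies the descriptor but not the open file description, so the kernel-side ->release() runs only once, when the last copy is closed; a second open() creates an independent struct file. The file path here is arbitrary.]

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd1 = open("/etc/hostname", O_RDONLY);
	if (fd1 < 0)
		return 1;
	int fd2 = dup(fd1);                        /* same struct file underneath */
	int fd3 = open("/etc/hostname", O_RDONLY); /* a second struct file */

	lseek(fd1, 3, SEEK_SET);
	printf("fd2 offset: %ld\n", (long)lseek(fd2, 0, SEEK_CUR)); /* 3: shared  */
	printf("fd3 offset: %ld\n", (long)lseek(fd3, 0, SEEK_CUR)); /* 0: private */

	close(fd1);	/* description still open via fd2 */
	close(fd2);	/* last reference: ->release() fires here */
	close(fd3);
	return 0;
}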
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 5de5061eb331..3eefa97fe204 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -25,6 +25,7 @@
25#include <linux/parser.h> 25#include <linux/parser.h>
26#include <linux/random.h> 26#include <linux/random.h>
27#include <linux/buffer_head.h> 27#include <linux/buffer_head.h>
28#include <linux/exportfs.h>
28#include <linux/smp_lock.h> 29#include <linux/smp_lock.h>
29#include <linux/vfs.h> 30#include <linux/vfs.h>
30#include <linux/seq_file.h> 31#include <linux/seq_file.h>
@@ -1099,15 +1100,18 @@ static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf)
1099 struct super_block *sb = dentry->d_sb; 1100 struct super_block *sb = dentry->d_sb;
1100 struct ext2_sb_info *sbi = EXT2_SB(sb); 1101 struct ext2_sb_info *sbi = EXT2_SB(sb);
1101 struct ext2_super_block *es = sbi->s_es; 1102 struct ext2_super_block *es = sbi->s_es;
1102 unsigned long overhead;
1103 int i;
1104 u64 fsid; 1103 u64 fsid;
1105 1104
1106 if (test_opt (sb, MINIX_DF)) 1105 if (test_opt (sb, MINIX_DF))
1107 overhead = 0; 1106 sbi->s_overhead_last = 0;
1108 else { 1107 else if (sbi->s_blocks_last != le32_to_cpu(es->s_blocks_count)) {
1108 unsigned long i, overhead = 0;
1109 smp_rmb();
1110
1109 /* 1111 /*
1110 * Compute the overhead (FS structures) 1112 * Compute the overhead (FS structures). This is constant
1113 * for a given filesystem unless the number of block groups
1114 * changes so we cache the previous value until it does.
1111 */ 1115 */
1112 1116
1113 /* 1117 /*
@@ -1131,17 +1135,22 @@ static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf)
1131 */ 1135 */
1132 overhead += (sbi->s_groups_count * 1136 overhead += (sbi->s_groups_count *
1133 (2 + sbi->s_itb_per_group)); 1137 (2 + sbi->s_itb_per_group));
1138 sbi->s_overhead_last = overhead;
1139 smp_wmb();
1140 sbi->s_blocks_last = le32_to_cpu(es->s_blocks_count);
1134 } 1141 }
1135 1142
1136 buf->f_type = EXT2_SUPER_MAGIC; 1143 buf->f_type = EXT2_SUPER_MAGIC;
1137 buf->f_bsize = sb->s_blocksize; 1144 buf->f_bsize = sb->s_blocksize;
1138 buf->f_blocks = le32_to_cpu(es->s_blocks_count) - overhead; 1145 buf->f_blocks = le32_to_cpu(es->s_blocks_count) - sbi->s_overhead_last;
1139 buf->f_bfree = ext2_count_free_blocks(sb); 1146 buf->f_bfree = ext2_count_free_blocks(sb);
1147 es->s_free_blocks_count = cpu_to_le32(buf->f_bfree);
1140 buf->f_bavail = buf->f_bfree - le32_to_cpu(es->s_r_blocks_count); 1148 buf->f_bavail = buf->f_bfree - le32_to_cpu(es->s_r_blocks_count);
1141 if (buf->f_bfree < le32_to_cpu(es->s_r_blocks_count)) 1149 if (buf->f_bfree < le32_to_cpu(es->s_r_blocks_count))
1142 buf->f_bavail = 0; 1150 buf->f_bavail = 0;
1143 buf->f_files = le32_to_cpu(es->s_inodes_count); 1151 buf->f_files = le32_to_cpu(es->s_inodes_count);
1144 buf->f_ffree = ext2_count_free_inodes(sb); 1152 buf->f_ffree = ext2_count_free_inodes(sb);
1153 es->s_free_inodes_count = cpu_to_le32(buf->f_ffree);
1145 buf->f_namelen = EXT2_NAME_LEN; 1154 buf->f_namelen = EXT2_NAME_LEN;
1146 fsid = le64_to_cpup((void *)es->s_uuid) ^ 1155 fsid = le64_to_cpup((void *)es->s_uuid) ^
1147 le64_to_cpup((void *)es->s_uuid + sizeof(u64)); 1156 le64_to_cpup((void *)es->s_uuid + sizeof(u64));
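
[Aside: the statfs change caches the overhead computation, keyed on the block count. A stripped-down sketch of the pattern, with field names mirroring the patch; the smp_rmb()/smp_wmb() pairing and the real per-group walk are omitted.]

#include <stdio.h>

struct fs_stats {
	unsigned long blocks_last;	/* input the cache was computed for */
	unsigned long overhead_last;	/* cached result */
};

static unsigned long compute_overhead(unsigned long blocks)
{
	return blocks / 100;	/* stand-in for walking all block groups */
}

static unsigned long get_overhead(struct fs_stats *st, unsigned long blocks)
{
	if (st->blocks_last != blocks) {
		st->overhead_last = compute_overhead(blocks);
		st->blocks_last = blocks;
	}
	return st->overhead_last;
}

int main(void)
{
	struct fs_stats st = { 0, 0 };

	printf("%lu\n", get_overhead(&st, 1000));	/* computed   */
	printf("%lu\n", get_overhead(&st, 1000));	/* cached     */
	printf("%lu\n", get_overhead(&st, 2000));	/* recomputed */
	return 0;
}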
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 2a85ddee4740..de4e3161e479 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -3195,7 +3195,7 @@ int ext3_change_inode_journal_flag(struct inode *inode, int val)
3195 */ 3195 */
3196 3196
3197 journal = EXT3_JOURNAL(inode); 3197 journal = EXT3_JOURNAL(inode);
3198 if (is_journal_aborted(journal) || IS_RDONLY(inode)) 3198 if (is_journal_aborted(journal))
3199 return -EROFS; 3199 return -EROFS;
3200 3200
3201 journal_lock_updates(journal); 3201 journal_lock_updates(journal);
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 9bb046df827a..1586807b8177 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -1019,6 +1019,11 @@ static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry, str
1019 1019
1020 if (!inode) 1020 if (!inode)
1021 return ERR_PTR(-EACCES); 1021 return ERR_PTR(-EACCES);
1022
1023 if (is_bad_inode(inode)) {
1024 iput(inode);
1025 return ERR_PTR(-ENOENT);
1026 }
1022 } 1027 }
1023 return d_splice_alias(inode, dentry); 1028 return d_splice_alias(inode, dentry);
1024} 1029}
@@ -1054,6 +1059,11 @@ struct dentry *ext3_get_parent(struct dentry *child)
1054 if (!inode) 1059 if (!inode)
1055 return ERR_PTR(-EACCES); 1060 return ERR_PTR(-EACCES);
1056 1061
1062 if (is_bad_inode(inode)) {
1063 iput(inode);
1064 return ERR_PTR(-ENOENT);
1065 }
1066
1057 parent = d_alloc_anon(inode); 1067 parent = d_alloc_anon(inode);
1058 if (!parent) { 1068 if (!parent) {
1059 iput(inode); 1069 iput(inode);
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 6e3062913a92..4f84dc86628a 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -29,12 +29,14 @@
29#include <linux/parser.h> 29#include <linux/parser.h>
30#include <linux/smp_lock.h> 30#include <linux/smp_lock.h>
31#include <linux/buffer_head.h> 31#include <linux/buffer_head.h>
32#include <linux/exportfs.h>
32#include <linux/vfs.h> 33#include <linux/vfs.h>
33#include <linux/random.h> 34#include <linux/random.h>
34#include <linux/mount.h> 35#include <linux/mount.h>
35#include <linux/namei.h> 36#include <linux/namei.h>
36#include <linux/quotaops.h> 37#include <linux/quotaops.h>
37#include <linux/seq_file.h> 38#include <linux/seq_file.h>
39#include <linux/log2.h>
38 40
39#include <asm/uaccess.h> 41#include <asm/uaccess.h>
40 42
@@ -459,6 +461,14 @@ static struct inode *ext3_alloc_inode(struct super_block *sb)
459 461
460static void ext3_destroy_inode(struct inode *inode) 462static void ext3_destroy_inode(struct inode *inode)
461{ 463{
464 if (!list_empty(&(EXT3_I(inode)->i_orphan))) {
465 printk("EXT3 Inode %p: orphan list check failed!\n",
466 EXT3_I(inode));
467 print_hex_dump(KERN_INFO, "", DUMP_PREFIX_ADDRESS, 16, 4,
468 EXT3_I(inode), sizeof(struct ext3_inode_info),
469 false);
470 dump_stack();
471 }
462 kmem_cache_free(ext3_inode_cachep, EXT3_I(inode)); 472 kmem_cache_free(ext3_inode_cachep, EXT3_I(inode));
463} 473}
464 474
@@ -1566,7 +1576,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1566 sbi->s_inode_size = le16_to_cpu(es->s_inode_size); 1576 sbi->s_inode_size = le16_to_cpu(es->s_inode_size);
1567 sbi->s_first_ino = le32_to_cpu(es->s_first_ino); 1577 sbi->s_first_ino = le32_to_cpu(es->s_first_ino);
1568 if ((sbi->s_inode_size < EXT3_GOOD_OLD_INODE_SIZE) || 1578 if ((sbi->s_inode_size < EXT3_GOOD_OLD_INODE_SIZE) ||
1569 (sbi->s_inode_size & (sbi->s_inode_size - 1)) || 1579 (!is_power_of_2(sbi->s_inode_size)) ||
1570 (sbi->s_inode_size > blocksize)) { 1580 (sbi->s_inode_size > blocksize)) {
1571 printk (KERN_ERR 1581 printk (KERN_ERR
1572 "EXT3-fs: unsupported inode size: %d\n", 1582 "EXT3-fs: unsupported inode size: %d\n",
@@ -2075,6 +2085,7 @@ static int ext3_create_journal(struct super_block * sb,
2075 unsigned int journal_inum) 2085 unsigned int journal_inum)
2076{ 2086{
2077 journal_t *journal; 2087 journal_t *journal;
2088 int err;
2078 2089
2079 if (sb->s_flags & MS_RDONLY) { 2090 if (sb->s_flags & MS_RDONLY) {
2080 printk(KERN_ERR "EXT3-fs: readonly filesystem when trying to " 2091 printk(KERN_ERR "EXT3-fs: readonly filesystem when trying to "
@@ -2082,13 +2093,15 @@ static int ext3_create_journal(struct super_block * sb,
2082 return -EROFS; 2093 return -EROFS;
2083 } 2094 }
2084 2095
2085 if (!(journal = ext3_get_journal(sb, journal_inum))) 2096 journal = ext3_get_journal(sb, journal_inum);
2097 if (!journal)
2086 return -EINVAL; 2098 return -EINVAL;
2087 2099
2088 printk(KERN_INFO "EXT3-fs: creating new journal on inode %u\n", 2100 printk(KERN_INFO "EXT3-fs: creating new journal on inode %u\n",
2089 journal_inum); 2101 journal_inum);
2090 2102
2091 if (journal_create(journal)) { 2103 err = journal_create(journal);
2104 if (err) {
2092 printk(KERN_ERR "EXT3-fs: error creating journal.\n"); 2105 printk(KERN_ERR "EXT3-fs: error creating journal.\n");
2093 journal_destroy(journal); 2106 journal_destroy(journal);
2094 return -EIO; 2107 return -EIO;
@@ -2139,12 +2152,14 @@ static void ext3_mark_recovery_complete(struct super_block * sb,
2139 2152
2140 journal_lock_updates(journal); 2153 journal_lock_updates(journal);
2141 journal_flush(journal); 2154 journal_flush(journal);
2155 lock_super(sb);
2142 if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER) && 2156 if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER) &&
2143 sb->s_flags & MS_RDONLY) { 2157 sb->s_flags & MS_RDONLY) {
2144 EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); 2158 EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
2145 sb->s_dirt = 0; 2159 sb->s_dirt = 0;
2146 ext3_commit_super(sb, es, 1); 2160 ext3_commit_super(sb, es, 1);
2147 } 2161 }
2162 unlock_super(sb);
2148 journal_unlock_updates(journal); 2163 journal_unlock_updates(journal);
2149} 2164}
2150 2165
@@ -2333,7 +2348,13 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
2333 (sbi->s_mount_state & EXT3_VALID_FS)) 2348 (sbi->s_mount_state & EXT3_VALID_FS))
2334 es->s_state = cpu_to_le16(sbi->s_mount_state); 2349 es->s_state = cpu_to_le16(sbi->s_mount_state);
2335 2350
2351 /*
2352 * We have to unlock super so that we can wait for
2353 * transactions.
2354 */
2355 unlock_super(sb);
2336 ext3_mark_recovery_complete(sb, es); 2356 ext3_mark_recovery_complete(sb, es);
2357 lock_super(sb);
2337 } else { 2358 } else {
2338 __le32 ret; 2359 __le32 ret;
2339 if ((ret = EXT3_HAS_RO_COMPAT_FEATURE(sb, 2360 if ((ret = EXT3_HAS_RO_COMPAT_FEATURE(sb,
@@ -2406,19 +2427,19 @@ static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf)
2406 struct super_block *sb = dentry->d_sb; 2427 struct super_block *sb = dentry->d_sb;
2407 struct ext3_sb_info *sbi = EXT3_SB(sb); 2428 struct ext3_sb_info *sbi = EXT3_SB(sb);
2408 struct ext3_super_block *es = sbi->s_es; 2429 struct ext3_super_block *es = sbi->s_es;
2409 ext3_fsblk_t overhead;
2410 int i;
2411 u64 fsid; 2430 u64 fsid;
2412 2431
2413 if (test_opt (sb, MINIX_DF)) 2432 if (test_opt(sb, MINIX_DF)) {
2414 overhead = 0; 2433 sbi->s_overhead_last = 0;
2415 else { 2434 } else if (sbi->s_blocks_last != le32_to_cpu(es->s_blocks_count)) {
2416 unsigned long ngroups; 2435 unsigned long ngroups = sbi->s_groups_count, i;
2417 ngroups = EXT3_SB(sb)->s_groups_count; 2436 ext3_fsblk_t overhead = 0;
2418 smp_rmb(); 2437 smp_rmb();
2419 2438
2420 /* 2439 /*
2421 * Compute the overhead (FS structures) 2440 * Compute the overhead (FS structures). This is constant
2441 * for a given filesystem unless the number of block groups
2442 * changes so we cache the previous value until it does.
2422 */ 2443 */
2423 2444
2424 /* 2445 /*
@@ -2442,18 +2463,23 @@ static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf)
2442 * Every block group has an inode bitmap, a block 2463 * Every block group has an inode bitmap, a block
2443 * bitmap, and an inode table. 2464 * bitmap, and an inode table.
2444 */ 2465 */
2445 overhead += (ngroups * (2 + EXT3_SB(sb)->s_itb_per_group)); 2466 overhead += ngroups * (2 + sbi->s_itb_per_group);
2467 sbi->s_overhead_last = overhead;
2468 smp_wmb();
2469 sbi->s_blocks_last = le32_to_cpu(es->s_blocks_count);
2446 } 2470 }
2447 2471
2448 buf->f_type = EXT3_SUPER_MAGIC; 2472 buf->f_type = EXT3_SUPER_MAGIC;
2449 buf->f_bsize = sb->s_blocksize; 2473 buf->f_bsize = sb->s_blocksize;
2450 buf->f_blocks = le32_to_cpu(es->s_blocks_count) - overhead; 2474 buf->f_blocks = le32_to_cpu(es->s_blocks_count) - sbi->s_overhead_last;
2451 buf->f_bfree = percpu_counter_sum(&sbi->s_freeblocks_counter); 2475 buf->f_bfree = percpu_counter_sum(&sbi->s_freeblocks_counter);
2476 es->s_free_blocks_count = cpu_to_le32(buf->f_bfree);
2452 buf->f_bavail = buf->f_bfree - le32_to_cpu(es->s_r_blocks_count); 2477 buf->f_bavail = buf->f_bfree - le32_to_cpu(es->s_r_blocks_count);
2453 if (buf->f_bfree < le32_to_cpu(es->s_r_blocks_count)) 2478 if (buf->f_bfree < le32_to_cpu(es->s_r_blocks_count))
2454 buf->f_bavail = 0; 2479 buf->f_bavail = 0;
2455 buf->f_files = le32_to_cpu(es->s_inodes_count); 2480 buf->f_files = le32_to_cpu(es->s_inodes_count);
2456 buf->f_ffree = percpu_counter_sum(&sbi->s_freeinodes_counter); 2481 buf->f_ffree = percpu_counter_sum(&sbi->s_freeinodes_counter);
2482 es->s_free_inodes_count = cpu_to_le32(buf->f_ffree);
2457 buf->f_namelen = EXT3_NAME_LEN; 2483 buf->f_namelen = EXT3_NAME_LEN;
2458 fsid = le64_to_cpup((void *)es->s_uuid) ^ 2484 fsid = le64_to_cpup((void *)es->s_uuid) ^
2459 le64_to_cpup((void *)es->s_uuid + sizeof(u64)); 2485 le64_to_cpup((void *)es->s_uuid + sizeof(u64));
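
[Aside: is_power_of_2(), introduced above in place of the open-coded test, relies on the usual bit trick: a power of two has exactly one bit set, so n & (n - 1) clears it to zero. A self-contained check of the equivalence:]

#include <assert.h>
#include <stdbool.h>

static bool is_power_of_2(unsigned long n)
{
	return n != 0 && (n & (n - 1)) == 0;
}

int main(void)
{
	assert(is_power_of_2(128));	/* a valid inode size */
	assert(!is_power_of_2(0));	/* zero is excluded   */
	assert(!is_power_of_2(96));
	return 0;
}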
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 3b64bb16c727..9de54ae48dee 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -1585,7 +1585,7 @@ allocated:
1585 ret_block = grp_alloc_blk + ext4_group_first_block_no(sb, group_no); 1585 ret_block = grp_alloc_blk + ext4_group_first_block_no(sb, group_no);
1586 1586
1587 if (in_range(ext4_block_bitmap(sb, gdp), ret_block, num) || 1587 if (in_range(ext4_block_bitmap(sb, gdp), ret_block, num) ||
1588 in_range(ext4_block_bitmap(sb, gdp), ret_block, num) || 1588 in_range(ext4_inode_bitmap(sb, gdp), ret_block, num) ||
1589 in_range(ret_block, ext4_inode_table(sb, gdp), 1589 in_range(ret_block, ext4_inode_table(sb, gdp),
1590 EXT4_SB(sb)->s_itb_per_group) || 1590 EXT4_SB(sb)->s_itb_per_group) ||
1591 in_range(ret_block + num - 1, ext4_inode_table(sb, gdp), 1591 in_range(ret_block + num - 1, ext4_inode_table(sb, gdp),
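
[Aside: this one-line fix replaces a duplicated block-bitmap test with the intended inode-bitmap test. A sketch of the overlap check, with in_range() written out the way ext4 defines it — treat the macro body as an assumption of this sketch:]

#include <stdio.h>

#define in_range(b, first, len)  ((b) >= (first) && (b) <= (first) + (len) - 1)

int main(void)
{
	unsigned long block_bitmap = 100, inode_bitmap = 101, ret_block = 101;

	/* the old code tested block_bitmap twice, missing this overlap */
	if (in_range(block_bitmap, ret_block, 1) ||
	    in_range(inode_bitmap, ret_block, 1))
		printf("allocation overlaps fs metadata\n");
	return 0;
}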
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 2811e5720ad0..2de339dd7554 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1017,6 +1017,11 @@ static struct dentry *ext4_lookup(struct inode * dir, struct dentry *dentry, str
1017 1017
1018 if (!inode) 1018 if (!inode)
1019 return ERR_PTR(-EACCES); 1019 return ERR_PTR(-EACCES);
1020
1021 if (is_bad_inode(inode)) {
1022 iput(inode);
1023 return ERR_PTR(-ENOENT);
1024 }
1020 } 1025 }
1021 return d_splice_alias(inode, dentry); 1026 return d_splice_alias(inode, dentry);
1022} 1027}
@@ -1052,6 +1057,11 @@ struct dentry *ext4_get_parent(struct dentry *child)
1052 if (!inode) 1057 if (!inode)
1053 return ERR_PTR(-EACCES); 1058 return ERR_PTR(-EACCES);
1054 1059
1060 if (is_bad_inode(inode)) {
1061 iput(inode);
1062 return ERR_PTR(-ENOENT);
1063 }
1064
1055 parent = d_alloc_anon(inode); 1065 parent = d_alloc_anon(inode);
1056 if (!parent) { 1066 if (!parent) {
1057 iput(inode); 1067 iput(inode);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 175b68c60968..b806e689c4aa 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -29,6 +29,7 @@
29#include <linux/parser.h> 29#include <linux/parser.h>
30#include <linux/smp_lock.h> 30#include <linux/smp_lock.h>
31#include <linux/buffer_head.h> 31#include <linux/buffer_head.h>
32#include <linux/exportfs.h>
32#include <linux/vfs.h> 33#include <linux/vfs.h>
33#include <linux/random.h> 34#include <linux/random.h>
34#include <linux/mount.h> 35#include <linux/mount.h>
@@ -510,6 +511,14 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
510 511
511static void ext4_destroy_inode(struct inode *inode) 512static void ext4_destroy_inode(struct inode *inode)
512{ 513{
514 if (!list_empty(&(EXT4_I(inode)->i_orphan))) {
515 printk("EXT4 Inode %p: orphan list check failed!\n",
516 EXT4_I(inode));
517 print_hex_dump(KERN_INFO, "", DUMP_PREFIX_ADDRESS, 16, 4,
518 EXT4_I(inode), sizeof(struct ext4_inode_info),
519 true);
520 dump_stack();
521 }
513 kmem_cache_free(ext4_inode_cachep, EXT4_I(inode)); 522 kmem_cache_free(ext4_inode_cachep, EXT4_I(inode));
514} 523}
515 524
@@ -2150,6 +2159,7 @@ static int ext4_create_journal(struct super_block * sb,
2150 unsigned int journal_inum) 2159 unsigned int journal_inum)
2151{ 2160{
2152 journal_t *journal; 2161 journal_t *journal;
2162 int err;
2153 2163
2154 if (sb->s_flags & MS_RDONLY) { 2164 if (sb->s_flags & MS_RDONLY) {
2155 printk(KERN_ERR "EXT4-fs: readonly filesystem when trying to " 2165 printk(KERN_ERR "EXT4-fs: readonly filesystem when trying to "
@@ -2157,13 +2167,15 @@ static int ext4_create_journal(struct super_block * sb,
2157 return -EROFS; 2167 return -EROFS;
2158 } 2168 }
2159 2169
2160 if (!(journal = ext4_get_journal(sb, journal_inum))) 2170 journal = ext4_get_journal(sb, journal_inum);
2171 if (!journal)
2161 return -EINVAL; 2172 return -EINVAL;
2162 2173
2163 printk(KERN_INFO "EXT4-fs: creating new journal on inode %u\n", 2174 printk(KERN_INFO "EXT4-fs: creating new journal on inode %u\n",
2164 journal_inum); 2175 journal_inum);
2165 2176
2166 if (jbd2_journal_create(journal)) { 2177 err = jbd2_journal_create(journal);
2178 if (err) {
2167 printk(KERN_ERR "EXT4-fs: error creating journal.\n"); 2179 printk(KERN_ERR "EXT4-fs: error creating journal.\n");
2168 jbd2_journal_destroy(journal); 2180 jbd2_journal_destroy(journal);
2169 return -EIO; 2181 return -EIO;
@@ -2214,12 +2226,14 @@ static void ext4_mark_recovery_complete(struct super_block * sb,
2214 2226
2215 jbd2_journal_lock_updates(journal); 2227 jbd2_journal_lock_updates(journal);
2216 jbd2_journal_flush(journal); 2228 jbd2_journal_flush(journal);
2229 lock_super(sb);
2217 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER) && 2230 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER) &&
2218 sb->s_flags & MS_RDONLY) { 2231 sb->s_flags & MS_RDONLY) {
2219 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); 2232 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
2220 sb->s_dirt = 0; 2233 sb->s_dirt = 0;
2221 ext4_commit_super(sb, es, 1); 2234 ext4_commit_super(sb, es, 1);
2222 } 2235 }
2236 unlock_super(sb);
2223 jbd2_journal_unlock_updates(journal); 2237 jbd2_journal_unlock_updates(journal);
2224} 2238}
2225 2239
@@ -2408,7 +2422,13 @@ static int ext4_remount (struct super_block * sb, int * flags, char * data)
2408 (sbi->s_mount_state & EXT4_VALID_FS)) 2422 (sbi->s_mount_state & EXT4_VALID_FS))
2409 es->s_state = cpu_to_le16(sbi->s_mount_state); 2423 es->s_state = cpu_to_le16(sbi->s_mount_state);
2410 2424
2425 /*
2426 * We have to unlock super so that we can wait for
2427 * transactions.
2428 */
2429 unlock_super(sb);
2411 ext4_mark_recovery_complete(sb, es); 2430 ext4_mark_recovery_complete(sb, es);
2431 lock_super(sb);
2412 } else { 2432 } else {
2413 __le32 ret; 2433 __le32 ret;
2414 if ((ret = EXT4_HAS_RO_COMPAT_FEATURE(sb, 2434 if ((ret = EXT4_HAS_RO_COMPAT_FEATURE(sb,
@@ -2481,19 +2501,19 @@ static int ext4_statfs (struct dentry * dentry, struct kstatfs * buf)
2481 struct super_block *sb = dentry->d_sb; 2501 struct super_block *sb = dentry->d_sb;
2482 struct ext4_sb_info *sbi = EXT4_SB(sb); 2502 struct ext4_sb_info *sbi = EXT4_SB(sb);
2483 struct ext4_super_block *es = sbi->s_es; 2503 struct ext4_super_block *es = sbi->s_es;
2484 ext4_fsblk_t overhead;
2485 int i;
2486 u64 fsid; 2504 u64 fsid;
2487 2505
2488 if (test_opt (sb, MINIX_DF)) 2506 if (test_opt(sb, MINIX_DF)) {
2489 overhead = 0; 2507 sbi->s_overhead_last = 0;
2490 else { 2508 } else if (sbi->s_blocks_last != le32_to_cpu(es->s_blocks_count)) {
2491 unsigned long ngroups; 2509 unsigned long ngroups = sbi->s_groups_count, i;
2492 ngroups = EXT4_SB(sb)->s_groups_count; 2510 ext4_fsblk_t overhead = 0;
2493 smp_rmb(); 2511 smp_rmb();
2494 2512
2495 /* 2513 /*
2496 * Compute the overhead (FS structures) 2514 * Compute the overhead (FS structures). This is constant
2515 * for a given filesystem unless the number of block groups
2516 * changes so we cache the previous value until it does.
2497 */ 2517 */
2498 2518
2499 /* 2519 /*
@@ -2517,18 +2537,23 @@ static int ext4_statfs (struct dentry * dentry, struct kstatfs * buf)
2517 * Every block group has an inode bitmap, a block 2537 * Every block group has an inode bitmap, a block
2518 * bitmap, and an inode table. 2538 * bitmap, and an inode table.
2519 */ 2539 */
2520 overhead += (ngroups * (2 + EXT4_SB(sb)->s_itb_per_group)); 2540 overhead += ngroups * (2 + sbi->s_itb_per_group);
2541 sbi->s_overhead_last = overhead;
2542 smp_wmb();
2543 sbi->s_blocks_last = le32_to_cpu(es->s_blocks_count);
2521 } 2544 }
2522 2545
2523 buf->f_type = EXT4_SUPER_MAGIC; 2546 buf->f_type = EXT4_SUPER_MAGIC;
2524 buf->f_bsize = sb->s_blocksize; 2547 buf->f_bsize = sb->s_blocksize;
2525 buf->f_blocks = ext4_blocks_count(es) - overhead; 2548 buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last;
2526 buf->f_bfree = percpu_counter_sum(&sbi->s_freeblocks_counter); 2549 buf->f_bfree = percpu_counter_sum(&sbi->s_freeblocks_counter);
2550 es->s_free_blocks_count = cpu_to_le32(buf->f_bfree);
2527 buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es); 2551 buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es);
2528 if (buf->f_bfree < ext4_r_blocks_count(es)) 2552 if (buf->f_bfree < ext4_r_blocks_count(es))
2529 buf->f_bavail = 0; 2553 buf->f_bavail = 0;
2530 buf->f_files = le32_to_cpu(es->s_inodes_count); 2554 buf->f_files = le32_to_cpu(es->s_inodes_count);
2531 buf->f_ffree = percpu_counter_sum(&sbi->s_freeinodes_counter); 2555 buf->f_ffree = percpu_counter_sum(&sbi->s_freeinodes_counter);
2556 es->s_free_inodes_count = cpu_to_le32(buf->f_ffree);
2532 buf->f_namelen = EXT4_NAME_LEN; 2557 buf->f_namelen = EXT4_NAME_LEN;
2533 fsid = le64_to_cpup((void *)es->s_uuid) ^ 2558 fsid = le64_to_cpup((void *)es->s_uuid) ^
2534 le64_to_cpup((void *)es->s_uuid + sizeof(u64)); 2559 le64_to_cpup((void *)es->s_uuid + sizeof(u64));
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index ccf161dffb63..72cbcd61bd95 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -313,7 +313,7 @@ int fat_search_long(struct inode *inode, const unsigned char *name,
313 wchar_t bufuname[14]; 313 wchar_t bufuname[14];
314 unsigned char xlate_len, nr_slots; 314 unsigned char xlate_len, nr_slots;
315 wchar_t *unicode = NULL; 315 wchar_t *unicode = NULL;
316 unsigned char work[8], bufname[260]; /* 256 + 4 */ 316 unsigned char work[MSDOS_NAME], bufname[260]; /* 256 + 4 */
317 int uni_xlate = sbi->options.unicode_xlate; 317 int uni_xlate = sbi->options.unicode_xlate;
318 int utf8 = sbi->options.utf8; 318 int utf8 = sbi->options.utf8;
319 int anycase = (sbi->options.name_check != 's'); 319 int anycase = (sbi->options.name_check != 's');
@@ -351,7 +351,8 @@ parse_record:
351 if (work[0] == 0x05) 351 if (work[0] == 0x05)
352 work[0] = 0xE5; 352 work[0] = 0xE5;
353 for (i = 0, j = 0, last_u = 0; i < 8;) { 353 for (i = 0, j = 0, last_u = 0; i < 8;) {
354 if (!work[i]) break; 354 if (!work[i])
355 break;
355 chl = fat_shortname2uni(nls_disk, &work[i], 8 - i, 356 chl = fat_shortname2uni(nls_disk, &work[i], 8 - i,
356 &bufuname[j++], opt_shortname, 357 &bufuname[j++], opt_shortname,
357 de->lcase & CASE_LOWER_BASE); 358 de->lcase & CASE_LOWER_BASE);
@@ -365,13 +366,15 @@ parse_record:
365 } 366 }
366 j = last_u; 367 j = last_u;
367 fat_short2uni(nls_disk, ".", 1, &bufuname[j++]); 368 fat_short2uni(nls_disk, ".", 1, &bufuname[j++]);
368 for (i = 0; i < 3;) { 369 for (i = 8; i < MSDOS_NAME;) {
369 if (!de->ext[i]) break; 370 if (!work[i])
370 chl = fat_shortname2uni(nls_disk, &de->ext[i], 3 - i, 371 break;
372 chl = fat_shortname2uni(nls_disk, &work[i],
373 MSDOS_NAME - i,
371 &bufuname[j++], opt_shortname, 374 &bufuname[j++], opt_shortname,
372 de->lcase & CASE_LOWER_EXT); 375 de->lcase & CASE_LOWER_EXT);
373 if (chl <= 1) { 376 if (chl <= 1) {
374 if (de->ext[i] != ' ') 377 if (work[i] != ' ')
375 last_u = j; 378 last_u = j;
376 } else { 379 } else {
377 last_u = j; 380 last_u = j;
@@ -445,7 +448,7 @@ static int __fat_readdir(struct inode *inode, struct file *filp, void *dirent,
445 int fill_len; 448 int fill_len;
446 wchar_t bufuname[14]; 449 wchar_t bufuname[14];
447 wchar_t *unicode = NULL; 450 wchar_t *unicode = NULL;
448 unsigned char c, work[8], bufname[56], *ptname = bufname; 451 unsigned char c, work[MSDOS_NAME], bufname[56], *ptname = bufname;
449 unsigned long lpos, dummy, *furrfu = &lpos; 452 unsigned long lpos, dummy, *furrfu = &lpos;
450 int uni_xlate = sbi->options.unicode_xlate; 453 int uni_xlate = sbi->options.unicode_xlate;
451 int isvfat = sbi->options.isvfat; 454 int isvfat = sbi->options.isvfat;
@@ -527,7 +530,8 @@ parse_record:
527 if (work[0] == 0x05) 530 if (work[0] == 0x05)
528 work[0] = 0xE5; 531 work[0] = 0xE5;
529 for (i = 0, j = 0, last = 0, last_u = 0; i < 8;) { 532 for (i = 0, j = 0, last = 0, last_u = 0; i < 8;) {
530 if (!(c = work[i])) break; 533 if (!(c = work[i]))
534 break;
531 chl = fat_shortname2uni(nls_disk, &work[i], 8 - i, 535 chl = fat_shortname2uni(nls_disk, &work[i], 8 - i,
532 &bufuname[j++], opt_shortname, 536 &bufuname[j++], opt_shortname,
533 de->lcase & CASE_LOWER_BASE); 537 de->lcase & CASE_LOWER_BASE);
@@ -549,9 +553,10 @@ parse_record:
549 j = last_u; 553 j = last_u;
550 fat_short2uni(nls_disk, ".", 1, &bufuname[j++]); 554 fat_short2uni(nls_disk, ".", 1, &bufuname[j++]);
551 ptname[i++] = '.'; 555 ptname[i++] = '.';
552 for (i2 = 0; i2 < 3;) { 556 for (i2 = 8; i2 < MSDOS_NAME;) {
553 if (!(c = de->ext[i2])) break; 557 if (!(c = work[i2]))
554 chl = fat_shortname2uni(nls_disk, &de->ext[i2], 3 - i2, 558 break;
559 chl = fat_shortname2uni(nls_disk, &work[i2], MSDOS_NAME - i2,
555 &bufuname[j++], opt_shortname, 560 &bufuname[j++], opt_shortname,
556 de->lcase & CASE_LOWER_EXT); 561 de->lcase & CASE_LOWER_EXT);
557 if (chl <= 1) { 562 if (chl <= 1) {
@@ -563,8 +568,8 @@ parse_record:
563 } 568 }
564 } else { 569 } else {
565 last_u = j; 570 last_u = j;
566 for (chi = 0; chi < chl && i2 < 3; chi++) { 571 for (chi = 0; chi < chl && i2 < MSDOS_NAME; chi++) {
567 ptname[i++] = de->ext[i2++]; 572 ptname[i++] = work[i2++];
568 last = i; 573 last = i;
569 } 574 }
570 } 575 }
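
[Aside: the dir.c changes drop the separate de->ext pointer and treat the on-disk short name as one 11-byte work[MSDOS_NAME] buffer: eight space-padded base characters followed by three extension characters, with no dot stored. An illustrative decoder — plain C, ASCII names only; real entries also need the 0x05/0xE5 first-byte fixup shown in the hunk:]

#include <stdio.h>

#define MSDOS_NAME 11

static void fat_pretty_name(const unsigned char work[MSDOS_NAME], char *out)
{
	int i, n = 0;

	for (i = 0; i < 8 && work[i] != ' '; i++)
		out[n++] = work[i];
	if (work[8] != ' ') {
		out[n++] = '.';
		for (i = 8; i < MSDOS_NAME && work[i] != ' '; i++)
			out[n++] = work[i];
	}
	out[n] = '\0';
}

int main(void)
{
	char buf[13];	/* 8 + '.' + 3 + NUL */

	fat_pretty_name((const unsigned char *)"README  TXT", buf);
	printf("%s\n", buf);	/* README.TXT */
	return 0;
}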
diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index ab171ea8e869..2c1b73fb82ae 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -17,6 +17,8 @@ struct fatent_operations {
17 int (*ent_next)(struct fat_entry *); 17 int (*ent_next)(struct fat_entry *);
18}; 18};
19 19
20static DEFINE_SPINLOCK(fat12_entry_lock);
21
20static void fat12_ent_blocknr(struct super_block *sb, int entry, 22static void fat12_ent_blocknr(struct super_block *sb, int entry,
21 int *offset, sector_t *blocknr) 23 int *offset, sector_t *blocknr)
22{ 24{
@@ -116,10 +118,13 @@ static int fat12_ent_get(struct fat_entry *fatent)
116 u8 **ent12_p = fatent->u.ent12_p; 118 u8 **ent12_p = fatent->u.ent12_p;
117 int next; 119 int next;
118 120
121 spin_lock(&fat12_entry_lock);
119 if (fatent->entry & 1) 122 if (fatent->entry & 1)
120 next = (*ent12_p[0] >> 4) | (*ent12_p[1] << 4); 123 next = (*ent12_p[0] >> 4) | (*ent12_p[1] << 4);
121 else 124 else
122 next = (*ent12_p[1] << 8) | *ent12_p[0]; 125 next = (*ent12_p[1] << 8) | *ent12_p[0];
126 spin_unlock(&fat12_entry_lock);
127
123 next &= 0x0fff; 128 next &= 0x0fff;
124 if (next >= BAD_FAT12) 129 if (next >= BAD_FAT12)
125 next = FAT_ENT_EOF; 130 next = FAT_ENT_EOF;
@@ -151,6 +156,7 @@ static void fat12_ent_put(struct fat_entry *fatent, int new)
151 if (new == FAT_ENT_EOF) 156 if (new == FAT_ENT_EOF)
152 new = EOF_FAT12; 157 new = EOF_FAT12;
153 158
159 spin_lock(&fat12_entry_lock);
154 if (fatent->entry & 1) { 160 if (fatent->entry & 1) {
155 *ent12_p[0] = (new << 4) | (*ent12_p[0] & 0x0f); 161 *ent12_p[0] = (new << 4) | (*ent12_p[0] & 0x0f);
156 *ent12_p[1] = new >> 4; 162 *ent12_p[1] = new >> 4;
@@ -158,6 +164,7 @@ static void fat12_ent_put(struct fat_entry *fatent, int new)
158 *ent12_p[0] = new & 0xff; 164 *ent12_p[0] = new & 0xff;
159 *ent12_p[1] = (*ent12_p[1] & 0xf0) | (new >> 8); 165 *ent12_p[1] = (*ent12_p[1] & 0xf0) | (new >> 8);
160 } 166 }
167 spin_unlock(&fat12_entry_lock);
161 168
162 mark_buffer_dirty(fatent->bhs[0]); 169 mark_buffer_dirty(fatent->bhs[0]);
163 if (fatent->nr_bhs == 2) 170 if (fatent->nr_bhs == 2)
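
[Aside: fat12_entry_lock exists because FAT12 packs two 12-bit entries into three bytes, so neighbouring entries share a byte and an unserialized reader can observe a half-updated value. A user-space model of the same packing; the offset math is the standard entry * 1.5, where the kernel reaches the bytes via its ent12_p[] pointers instead:]

#include <stdio.h>

/* entries i and i+1 occupy 3 bytes; the middle byte is shared */
static unsigned fat12_get(const unsigned char *fat, unsigned entry)
{
	unsigned off = entry + (entry / 2);	/* entry * 1.5 */

	if (entry & 1)
		return (fat[off] >> 4) | (fat[off + 1] << 4);
	return fat[off] | ((fat[off + 1] & 0x0f) << 8);
}

static void fat12_put(unsigned char *fat, unsigned entry, unsigned val)
{
	unsigned off = entry + (entry / 2);

	if (entry & 1) {
		fat[off] = (val << 4) | (fat[off] & 0x0f);
		fat[off + 1] = val >> 4;
	} else {
		fat[off] = val & 0xff;
		fat[off + 1] = (fat[off + 1] & 0xf0) | (val >> 8);
	}
}

int main(void)
{
	unsigned char fat[3] = { 0 };

	fat12_put(fat, 0, 0xABC);
	fat12_put(fat, 1, 0x123);
	printf("%03X %03X\n", fat12_get(fat, 0), fat12_get(fat, 1)); /* ABC 123 */
	return 0;
}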
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 479722d89667..0a7ddb39a593 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -20,6 +20,7 @@
20#include <linux/pagemap.h> 20#include <linux/pagemap.h>
21#include <linux/mpage.h> 21#include <linux/mpage.h>
22#include <linux/buffer_head.h> 22#include <linux/buffer_head.h>
23#include <linux/exportfs.h>
23#include <linux/mount.h> 24#include <linux/mount.h>
24#include <linux/vfs.h> 25#include <linux/vfs.h>
25#include <linux/parser.h> 26#include <linux/parser.h>
@@ -354,8 +355,7 @@ static int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de)
354 } else { /* not a directory */ 355 } else { /* not a directory */
355 inode->i_generation |= 1; 356 inode->i_generation |= 1;
356 inode->i_mode = MSDOS_MKMODE(de->attr, 357 inode->i_mode = MSDOS_MKMODE(de->attr,
357 ((sbi->options.showexec && 358 ((sbi->options.showexec && !is_exec(de->name + 8))
358 !is_exec(de->ext))
359 ? S_IRUGO|S_IWUGO : S_IRWXUGO) 359 ? S_IRUGO|S_IWUGO : S_IRWXUGO)
360 & ~sbi->options.fs_fmask) | S_IFREG; 360 & ~sbi->options.fs_fmask) | S_IFREG;
361 MSDOS_I(inode)->i_start = le16_to_cpu(de->start); 361 MSDOS_I(inode)->i_start = le16_to_cpu(de->start);
diff --git a/fs/freevxfs/vxfs_dir.h b/fs/freevxfs/vxfs_dir.h
index 8a4dfef1ddad..3c96d6e63978 100644
--- a/fs/freevxfs/vxfs_dir.h
+++ b/fs/freevxfs/vxfs_dir.h
@@ -80,7 +80,7 @@ struct vxfs_direct {
80 * a d_name with size len. 80 * a d_name with size len.
81 */ 81 */
82#define VXFS_DIRPAD 4 82#define VXFS_DIRPAD 4
83#define VXFS_NAMEMIN ((int)((struct vxfs_direct *)0)->d_name) 83#define VXFS_NAMEMIN offsetof(struct vxfs_direct, d_name)
84#define VXFS_DIRROUND(len) ((VXFS_DIRPAD + (len) - 1) & ~(VXFS_DIRPAD -1)) 84#define VXFS_DIRROUND(len) ((VXFS_DIRPAD + (len) - 1) & ~(VXFS_DIRPAD -1))
85#define VXFS_DIRLEN(len) (VXFS_DIRROUND(VXFS_NAMEMIN + (len))) 85#define VXFS_DIRLEN(len) (VXFS_DIRROUND(VXFS_NAMEMIN + (len)))
86 86
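
[Aside: the VXFS_NAMEMIN change swaps a null-pointer dereference trick for offsetof(), which yields the same constant without undefined behaviour. A minimal demonstration using a stand-in struct, not the real vxfs_direct layout:]

#include <stddef.h>
#include <stdio.h>

struct direct {
	unsigned int	d_ino;
	unsigned short	d_reclen;
	unsigned short	d_namelen;
	char		d_name[1];
};

int main(void)
{
	/* 8 on common ABIs: 4 + 2 + 2 bytes precede d_name */
	printf("name offset: %zu\n", offsetof(struct direct, d_name));
	return 0;
}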
diff --git a/fs/gfs2/eaops.c b/fs/gfs2/eaops.c
index c1f44009853f..1ab3e9d73886 100644
--- a/fs/gfs2/eaops.c
+++ b/fs/gfs2/eaops.c
@@ -11,6 +11,7 @@
11#include <linux/spinlock.h> 11#include <linux/spinlock.h>
12#include <linux/completion.h> 12#include <linux/completion.h>
13#include <linux/buffer_head.h> 13#include <linux/buffer_head.h>
14#include <linux/capability.h>
14#include <linux/xattr.h> 15#include <linux/xattr.h>
15#include <linux/gfs2_ondisk.h> 16#include <linux/gfs2_ondisk.h>
16#include <linux/lm_interface.h> 17#include <linux/lm_interface.h>
diff --git a/fs/gfs2/ops_export.c b/fs/gfs2/ops_export.c
index 99ea5659bc2c..b8312edee0e4 100644
--- a/fs/gfs2/ops_export.c
+++ b/fs/gfs2/ops_export.c
@@ -11,6 +11,7 @@
11#include <linux/spinlock.h> 11#include <linux/spinlock.h>
12#include <linux/completion.h> 12#include <linux/completion.h>
13#include <linux/buffer_head.h> 13#include <linux/buffer_head.h>
14#include <linux/exportfs.h>
14#include <linux/gfs2_ondisk.h> 15#include <linux/gfs2_ondisk.h>
15#include <linux/crc32.h> 16#include <linux/crc32.h>
16#include <linux/lm_interface.h> 17#include <linux/lm_interface.h>
diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c
index 90ebab753d30..050d29c0a5b5 100644
--- a/fs/hfsplus/btree.c
+++ b/fs/hfsplus/btree.c
@@ -62,8 +62,10 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
62 if ((HFSPLUS_SB(sb).flags & HFSPLUS_SB_HFSX) && 62 if ((HFSPLUS_SB(sb).flags & HFSPLUS_SB_HFSX) &&
63 (head->key_type == HFSPLUS_KEY_BINARY)) 63 (head->key_type == HFSPLUS_KEY_BINARY))
64 tree->keycmp = hfsplus_cat_bin_cmp_key; 64 tree->keycmp = hfsplus_cat_bin_cmp_key;
65 else 65 else {
66 tree->keycmp = hfsplus_cat_case_cmp_key; 66 tree->keycmp = hfsplus_cat_case_cmp_key;
67 HFSPLUS_SB(sb).flags |= HFSPLUS_SB_CASEFOLD;
68 }
67 } else { 69 } else {
68 printk(KERN_ERR "hfs: unknown B*Tree requested\n"); 70 printk(KERN_ERR "hfs: unknown B*Tree requested\n");
69 goto fail_page; 71 goto fail_page;
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index 80b5682a2273..1955ee61251c 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -36,6 +36,8 @@ static struct dentry *hfsplus_lookup(struct inode *dir, struct dentry *dentry,
36 u16 type; 36 u16 type;
37 37
38 sb = dir->i_sb; 38 sb = dir->i_sb;
39
40 dentry->d_op = &hfsplus_dentry_operations;
39 dentry->d_fsdata = NULL; 41 dentry->d_fsdata = NULL;
40 hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd); 42 hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd);
41 hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, &dentry->d_name); 43 hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, &dentry->d_name);
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index 3915635b4470..d9f5eda6d039 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -150,6 +150,7 @@ struct hfsplus_sb_info {
150#define HFSPLUS_SB_NODECOMPOSE 0x0002 150#define HFSPLUS_SB_NODECOMPOSE 0x0002
151#define HFSPLUS_SB_FORCE 0x0004 151#define HFSPLUS_SB_FORCE 0x0004
152#define HFSPLUS_SB_HFSX 0x0008 152#define HFSPLUS_SB_HFSX 0x0008
153#define HFSPLUS_SB_CASEFOLD 0x0010
153 154
154 155
155struct hfsplus_inode_info { 156struct hfsplus_inode_info {
@@ -321,6 +322,7 @@ void hfsplus_file_truncate(struct inode *);
321/* inode.c */ 322/* inode.c */
322extern const struct address_space_operations hfsplus_aops; 323extern const struct address_space_operations hfsplus_aops;
323extern const struct address_space_operations hfsplus_btree_aops; 324extern const struct address_space_operations hfsplus_btree_aops;
325extern struct dentry_operations hfsplus_dentry_operations;
324 326
325void hfsplus_inode_read_fork(struct inode *, struct hfsplus_fork_raw *); 327void hfsplus_inode_read_fork(struct inode *, struct hfsplus_fork_raw *);
326void hfsplus_inode_write_fork(struct inode *, struct hfsplus_fork_raw *); 328void hfsplus_inode_write_fork(struct inode *, struct hfsplus_fork_raw *);
@@ -353,6 +355,8 @@ int hfsplus_strcasecmp(const struct hfsplus_unistr *, const struct hfsplus_unist
353int hfsplus_strcmp(const struct hfsplus_unistr *, const struct hfsplus_unistr *); 355int hfsplus_strcmp(const struct hfsplus_unistr *, const struct hfsplus_unistr *);
354int hfsplus_uni2asc(struct super_block *, const struct hfsplus_unistr *, char *, int *); 356int hfsplus_uni2asc(struct super_block *, const struct hfsplus_unistr *, char *, int *);
355int hfsplus_asc2uni(struct super_block *, struct hfsplus_unistr *, const char *, int); 357int hfsplus_asc2uni(struct super_block *, struct hfsplus_unistr *, const char *, int);
358int hfsplus_hash_dentry(struct dentry *dentry, struct qstr *str);
359int hfsplus_compare_dentry(struct dentry *dentry, struct qstr *s1, struct qstr *s2);
356 360
357/* wrapper.c */ 361/* wrapper.c */
358int hfsplus_read_wrapper(struct super_block *); 362int hfsplus_read_wrapper(struct super_block *);
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 409ce5429c91..6f7c662174db 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -131,6 +131,11 @@ const struct address_space_operations hfsplus_aops = {
131 .writepages = hfsplus_writepages, 131 .writepages = hfsplus_writepages,
132}; 132};
133 133
134struct dentry_operations hfsplus_dentry_operations = {
135 .d_hash = hfsplus_hash_dentry,
136 .d_compare = hfsplus_compare_dentry,
137};
138
134static struct dentry *hfsplus_file_lookup(struct inode *dir, struct dentry *dentry, 139static struct dentry *hfsplus_file_lookup(struct inode *dir, struct dentry *dentry,
135 struct nameidata *nd) 140 struct nameidata *nd)
136{ 141{
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index ebd1b380cbbc..6d87a2a9534d 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -283,11 +283,10 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
283 struct nls_table *nls = NULL; 283 struct nls_table *nls = NULL;
284 int err = -EINVAL; 284 int err = -EINVAL;
285 285
286 sbi = kmalloc(sizeof(struct hfsplus_sb_info), GFP_KERNEL); 286 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
287 if (!sbi) 287 if (!sbi)
288 return -ENOMEM; 288 return -ENOMEM;
289 289
290 memset(sbi, 0, sizeof(HFSPLUS_SB(sb)));
291 sb->s_fs_info = sbi; 290 sb->s_fs_info = sbi;
292 INIT_HLIST_HEAD(&sbi->rsrc_inodes); 291 INIT_HLIST_HEAD(&sbi->rsrc_inodes);
293 hfsplus_fill_defaults(sbi); 292 hfsplus_fill_defaults(sbi);
@@ -381,6 +380,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
381 iput(root); 380 iput(root);
382 goto cleanup; 381 goto cleanup;
383 } 382 }
383 sb->s_root->d_op = &hfsplus_dentry_operations;
384 384
385 str.len = sizeof(HFSP_HIDDENDIR_NAME) - 1; 385 str.len = sizeof(HFSP_HIDDENDIR_NAME) - 1;
386 str.name = HFSP_HIDDENDIR_NAME; 386 str.name = HFSP_HIDDENDIR_NAME;
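
[Aside: kzalloc(sizeof(*sbi), ...) allocates and zeroes in one step and ties the size to the pointed-to type, so it cannot drift from the allocation the way a separate memset() size expression can. A user-space analogue of the idiom:]

#include <stdlib.h>

struct sb_info { int flags; /* ... */ };

int main(void)
{
	struct sb_info *sbi = calloc(1, sizeof(*sbi)); /* like kzalloc() */
	if (!sbi)
		return 1;
	free(sbi);
	return 0;
}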
diff --git a/fs/hfsplus/unicode.c b/fs/hfsplus/unicode.c
index 689c8bd721fb..9e10f9444b64 100644
--- a/fs/hfsplus/unicode.c
+++ b/fs/hfsplus/unicode.c
@@ -239,61 +239,201 @@ out:
239 return res; 239 return res;
240} 240}
241 241
242int hfsplus_asc2uni(struct super_block *sb, struct hfsplus_unistr *ustr, const char *astr, int len) 242/*
243 * Convert one or more ASCII characters into a single unicode character.
244 * Returns the number of ASCII characters corresponding to the unicode char.
245 */
246static inline int asc2unichar(struct super_block *sb, const char *astr, int len,
247 wchar_t *uc)
243{ 248{
244 struct nls_table *nls = HFSPLUS_SB(sb).nls; 249 int size = HFSPLUS_SB(sb).nls->char2uni(astr, len, uc);
245 int size, off, decompose; 250 if (size <= 0) {
251 *uc = '?';
252 size = 1;
253 }
254 switch (*uc) {
255 case 0x2400:
256 *uc = 0;
257 break;
258 case ':':
259 *uc = '/';
260 break;
261 }
262 return size;
263}
264
265/* Decomposes a single unicode character. */
266static inline u16 *decompose_unichar(wchar_t uc, int *size)
267{
268 int off;
269
270 off = hfsplus_decompose_table[(uc >> 12) & 0xf];
271 if (off == 0 || off == 0xffff)
272 return NULL;
273
274 off = hfsplus_decompose_table[off + ((uc >> 8) & 0xf)];
275 if (!off)
276 return NULL;
277
278 off = hfsplus_decompose_table[off + ((uc >> 4) & 0xf)];
279 if (!off)
280 return NULL;
281
282 off = hfsplus_decompose_table[off + (uc & 0xf)];
283 *size = off & 3;
284 if (*size == 0)
285 return NULL;
286 return hfsplus_decompose_table + (off / 4);
287}
288
289int hfsplus_asc2uni(struct super_block *sb, struct hfsplus_unistr *ustr,
290 const char *astr, int len)
291{
292 int size, dsize, decompose;
293 u16 *dstr, outlen = 0;
246 wchar_t c; 294 wchar_t c;
247 u16 outlen = 0;
248 295
249 decompose = !(HFSPLUS_SB(sb).flags & HFSPLUS_SB_NODECOMPOSE); 296 decompose = !(HFSPLUS_SB(sb).flags & HFSPLUS_SB_NODECOMPOSE);
250
251 while (outlen < HFSPLUS_MAX_STRLEN && len > 0) { 297 while (outlen < HFSPLUS_MAX_STRLEN && len > 0) {
252 size = nls->char2uni(astr, len, &c); 298 size = asc2unichar(sb, astr, len, &c);
253 if (size <= 0) { 299
254 c = '?'; 300 if (decompose && (dstr = decompose_unichar(c, &dsize))) {
255 size = 1; 301 if (outlen + dsize > HFSPLUS_MAX_STRLEN)
256 }
257 astr += size;
258 len -= size;
259 switch (c) {
260 case 0x2400:
261 c = 0;
262 break;
263 case ':':
264 c = '/';
265 break;
266 }
267 if (c >= 0xc0 && decompose) {
268 off = hfsplus_decompose_table[(c >> 12) & 0xf];
269 if (!off)
270 goto done;
271 if (off == 0xffff) {
272 goto done;
273 }
274 off = hfsplus_decompose_table[off + ((c >> 8) & 0xf)];
275 if (!off)
276 goto done;
277 off = hfsplus_decompose_table[off + ((c >> 4) & 0xf)];
278 if (!off)
279 goto done;
280 off = hfsplus_decompose_table[off + (c & 0xf)];
281 size = off & 3;
282 if (!size)
283 goto done;
284 off /= 4;
285 if (outlen + size > HFSPLUS_MAX_STRLEN)
286 break; 302 break;
287 do { 303 do {
288 ustr->unicode[outlen++] = cpu_to_be16(hfsplus_decompose_table[off++]); 304 ustr->unicode[outlen++] = cpu_to_be16(*dstr++);
289 } while (--size > 0); 305 } while (--dsize > 0);
290 continue; 306 } else
291 } 307 ustr->unicode[outlen++] = cpu_to_be16(c);
292 done: 308
293 ustr->unicode[outlen++] = cpu_to_be16(c); 309 astr += size;
310 len -= size;
294 } 311 }
295 ustr->length = cpu_to_be16(outlen); 312 ustr->length = cpu_to_be16(outlen);
296 if (len > 0) 313 if (len > 0)
297 return -ENAMETOOLONG; 314 return -ENAMETOOLONG;
298 return 0; 315 return 0;
299} 316}
317
318/*
319 * Hash a string to an integer as appropriate for the HFS+ filesystem.
320 * Composed unicode characters are decomposed and case-folding is performed
321 * if the appropriate bits are (un)set on the superblock.
322 */
323int hfsplus_hash_dentry(struct dentry *dentry, struct qstr *str)
324{
325 struct super_block *sb = dentry->d_sb;
326 const char *astr;
327 const u16 *dstr;
328 int casefold, decompose, size, dsize, len;
329 unsigned long hash;
330 wchar_t c;
331 u16 c2;
332
333 casefold = (HFSPLUS_SB(sb).flags & HFSPLUS_SB_CASEFOLD);
334 decompose = !(HFSPLUS_SB(sb).flags & HFSPLUS_SB_NODECOMPOSE);
335 hash = init_name_hash();
336 astr = str->name;
337 len = str->len;
338 while (len > 0) {
339 size = asc2unichar(sb, astr, len, &c);
340 astr += size;
341 len -= size;
342
343 if (decompose && (dstr = decompose_unichar(c, &dsize))) {
344 do {
345 c2 = *dstr++;
346 if (!casefold || (c2 = case_fold(c2)))
347 hash = partial_name_hash(c2, hash);
348 } while (--dsize > 0);
349 } else {
350 c2 = c;
351 if (!casefold || (c2 = case_fold(c2)))
352 hash = partial_name_hash(c2, hash);
353 }
354 }
355 str->hash = end_name_hash(hash);
356
357 return 0;
358}
359
360/*
361 * Compare strings with HFS+ filename ordering.
362 * Composed unicode characters are decomposed and case-folding is performed
363 * if the appropriate bits are (un)set on the superblock.
364 */
365int hfsplus_compare_dentry(struct dentry *dentry, struct qstr *s1, struct qstr *s2)
366{
367 struct super_block *sb = dentry->d_sb;
368 int casefold, decompose, size;
369 int dsize1, dsize2, len1, len2;
370 const u16 *dstr1, *dstr2;
371 const char *astr1, *astr2;
372 u16 c1, c2;
373 wchar_t c;
374
375 casefold = (HFSPLUS_SB(sb).flags & HFSPLUS_SB_CASEFOLD);
376 decompose = !(HFSPLUS_SB(sb).flags & HFSPLUS_SB_NODECOMPOSE);
377 astr1 = s1->name;
378 len1 = s1->len;
379 astr2 = s2->name;
380 len2 = s2->len;
381 dsize1 = dsize2 = 0;
382 dstr1 = dstr2 = NULL;
383
384 while (len1 > 0 && len2 > 0) {
385 if (!dsize1) {
386 size = asc2unichar(sb, astr1, len1, &c);
387 astr1 += size;
388 len1 -= size;
389
390 if (!decompose || !(dstr1 = decompose_unichar(c, &dsize1))) {
391 c1 = c;
392 dstr1 = &c1;
393 dsize1 = 1;
394 }
395 }
396
397 if (!dsize2) {
398 size = asc2unichar(sb, astr2, len2, &c);
399 astr2 += size;
400 len2 -= size;
401
402 if (!decompose || !(dstr2 = decompose_unichar(c, &dsize2))) {
403 c2 = c;
404 dstr2 = &c2;
405 dsize2 = 1;
406 }
407 }
408
409 c1 = *dstr1;
410 c2 = *dstr2;
411 if (casefold) {
412 if (!(c1 = case_fold(c1))) {
413 dstr1++;
414 dsize1--;
415 continue;
416 }
417 if (!(c2 = case_fold(c2))) {
418 dstr2++;
419 dsize2--;
420 continue;
421 }
422 }
423 if (c1 < c2)
424 return -1;
425 else if (c1 > c2)
426 return 1;
427
428 dstr1++;
429 dsize1--;
430 dstr2++;
431 dsize2--;
432 }
433
434 if (len1 < len2)
435 return -1;
436 if (len1 > len2)
437 return 1;
438 return 0;
439}
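
[Aside: decompose_unichar() above walks a four-level table indexed by successive nibbles of the character; zero means "no decomposition", and the final word packs a table offset in its high bits with a length in its low two bits. The sketch below rebuilds that walk over a toy table — the data is invented for illustration and is not real HFS+ decomposition data:]

#include <stdio.h>

static const unsigned short table[66] = {
	[1]  = 16,	/* level 0, indexed by (uc >> 12) & 0xf	*/
	[16] = 32,	/* level 1, indexed by (uc >> 8) & 0xf	*/
	[32] = 48,	/* level 2, indexed by (uc >> 4) & 0xf	*/
	[48] = 258,	/* level 3: 258/4 = 64, 258&3 = len 2	*/
	[64] = 0x0041,	/* decomposition: 'A' + combining acute	*/
	[65] = 0x0301,
};

static const unsigned short *decompose(unsigned uc, int *size)
{
	int off = table[(uc >> 12) & 0xf];
	if (off == 0 || off == 0xffff)
		return NULL;
	off = table[off + ((uc >> 8) & 0xf)];
	if (!off)
		return NULL;
	off = table[off + ((uc >> 4) & 0xf)];
	if (!off)
		return NULL;
	off = table[off + (uc & 0xf)];
	*size = off & 3;
	if (*size == 0)
		return NULL;
	return table + (off / 4);
}

int main(void)
{
	int n;
	const unsigned short *d = decompose(0x1000, &n);

	if (d)
		printf("%d code points: %04X %04X\n", n, d[0], d[1]);
	return 0;
}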
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index e6b46b3ac2fe..d145cb79c30a 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -13,15 +13,18 @@
13#include <linux/fs.h> 13#include <linux/fs.h>
14#include <linux/mount.h> 14#include <linux/mount.h>
15#include <linux/file.h> 15#include <linux/file.h>
16#include <linux/kernel.h>
16#include <linux/writeback.h> 17#include <linux/writeback.h>
17#include <linux/pagemap.h> 18#include <linux/pagemap.h>
18#include <linux/highmem.h> 19#include <linux/highmem.h>
19#include <linux/init.h> 20#include <linux/init.h>
20#include <linux/string.h> 21#include <linux/string.h>
21#include <linux/capability.h> 22#include <linux/capability.h>
23#include <linux/ctype.h>
22#include <linux/backing-dev.h> 24#include <linux/backing-dev.h>
23#include <linux/hugetlb.h> 25#include <linux/hugetlb.h>
24#include <linux/pagevec.h> 26#include <linux/pagevec.h>
27#include <linux/parser.h>
25#include <linux/mman.h> 28#include <linux/mman.h>
26#include <linux/quotaops.h> 29#include <linux/quotaops.h>
27#include <linux/slab.h> 30#include <linux/slab.h>
@@ -47,6 +50,21 @@ static struct backing_dev_info hugetlbfs_backing_dev_info = {
47 50
48int sysctl_hugetlb_shm_group; 51int sysctl_hugetlb_shm_group;
49 52
53enum {
54 Opt_size, Opt_nr_inodes,
55 Opt_mode, Opt_uid, Opt_gid,
56 Opt_err,
57};
58
59static match_table_t tokens = {
60 {Opt_size, "size=%s"},
61 {Opt_nr_inodes, "nr_inodes=%s"},
62 {Opt_mode, "mode=%o"},
63 {Opt_uid, "uid=%u"},
64 {Opt_gid, "gid=%u"},
65 {Opt_err, NULL},
66};
67
50static void huge_pagevec_release(struct pagevec *pvec) 68static void huge_pagevec_release(struct pagevec *pvec)
51{ 69{
52 int i; 70 int i;
@@ -594,46 +612,73 @@ static const struct super_operations hugetlbfs_ops = {
594static int 612static int
595hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig) 613hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig)
596{ 614{
597 char *opt, *value, *rest; 615 char *p, *rest;
616 substring_t args[MAX_OPT_ARGS];
617 int option;
598 618
599 if (!options) 619 if (!options)
600 return 0; 620 return 0;
601 while ((opt = strsep(&options, ",")) != NULL) { 621
602 if (!*opt) 622 while ((p = strsep(&options, ",")) != NULL) {
623 int token;
624 if (!*p)
603 continue; 625 continue;
604 626
605 value = strchr(opt, '='); 627 token = match_token(p, tokens, args);
606 if (!value || !*value) 628 switch (token) {
607 return -EINVAL; 629 case Opt_uid:
608 else 630 if (match_int(&args[0], &option))
609 *value++ = '\0'; 631 goto bad_val;
610 632 pconfig->uid = option;
611 if (!strcmp(opt, "uid")) 633 break;
612 pconfig->uid = simple_strtoul(value, &value, 0); 634
613 else if (!strcmp(opt, "gid")) 635 case Opt_gid:
614 pconfig->gid = simple_strtoul(value, &value, 0); 636 if (match_int(&args[0], &option))
615 else if (!strcmp(opt, "mode")) 637 goto bad_val;
616 pconfig->mode = simple_strtoul(value,&value,0) & 0777U; 638 pconfig->gid = option;
617 else if (!strcmp(opt, "size")) { 639 break;
618 unsigned long long size = memparse(value, &rest); 640
641 case Opt_mode:
642 if (match_octal(&args[0], &option))
643 goto bad_val;
644 pconfig->mode = option & 0777U;
645 break;
646
647 case Opt_size: {
648 unsigned long long size;
649 /* memparse() will accept a K/M/G without a digit */
650 if (!isdigit(*args[0].from))
651 goto bad_val;
652 size = memparse(args[0].from, &rest);
619 if (*rest == '%') { 653 if (*rest == '%') {
620 size <<= HPAGE_SHIFT; 654 size <<= HPAGE_SHIFT;
621 size *= max_huge_pages; 655 size *= max_huge_pages;
622 do_div(size, 100); 656 do_div(size, 100);
623 rest++;
624 } 657 }
625 pconfig->nr_blocks = (size >> HPAGE_SHIFT); 658 pconfig->nr_blocks = (size >> HPAGE_SHIFT);
626 value = rest; 659 break;
627 } else if (!strcmp(opt,"nr_inodes")) { 660 }
628 pconfig->nr_inodes = memparse(value, &rest); 661
629 value = rest; 662 case Opt_nr_inodes:
630 } else 663 /* memparse() will accept a K/M/G without a digit */
631 return -EINVAL; 664 if (!isdigit(*args[0].from))
665 goto bad_val;
666 pconfig->nr_inodes = memparse(args[0].from, &rest);
667 break;
632 668
633 if (*value) 669 default:
670 printk(KERN_ERR "hugetlbfs: Bad mount option: \"%s\"\n",
671 p);
634 return -EINVAL; 672 return -EINVAL;
673 break;
674 }
635 } 675 }
636 return 0; 676 return 0;
677
678bad_val:
679 printk(KERN_ERR "hugetlbfs: Bad value '%s' for mount option '%s'\n",
680 args[0].from, p);
681 return 1;
637} 682}
638 683
639static int 684static int
@@ -651,7 +696,6 @@ hugetlbfs_fill_super(struct super_block *sb, void *data, int silent)
651 config.gid = current->fsgid; 696 config.gid = current->fsgid;
652 config.mode = 0755; 697 config.mode = 0755;
653 ret = hugetlbfs_parse_options(data, &config); 698 ret = hugetlbfs_parse_options(data, &config);
654
655 if (ret) 699 if (ret)
656 return ret; 700 return ret;
657 701
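
The hugetlbfs_parse_options() rewrite above is the standard lib/parser conversion: a match_table_t maps option patterns to tokens, match_token() does the splitting and pattern matching, and match_int()/match_octal() replace open-coded simple_strtoul() calls. A hedged sketch of the same pattern for a hypothetical filesystem with a single uid= option follows; the myfs names are invented, while the parser calls are the real <linux/parser.h> API.

/*
 * Hedged sketch of the lib/parser pattern adopted above, for a
 * hypothetical filesystem with one "uid=%u" mount option.
 */
#include <linux/parser.h>
#include <linux/string.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/types.h>

struct myfs_config { uid_t uid; };

enum { Opt_uid, Opt_err };

static match_table_t myfs_tokens = {
	{Opt_uid, "uid=%u"},
	{Opt_err, NULL},
};

static int myfs_parse_options(char *options, struct myfs_config *cfg)
{
	substring_t args[MAX_OPT_ARGS];
	char *p;
	int option;

	if (!options)
		return 0;

	while ((p = strsep(&options, ",")) != NULL) {
		if (!*p)
			continue;

		switch (match_token(p, myfs_tokens, args)) {
		case Opt_uid:
			/* match_int() returns non-zero on a malformed value */
			if (match_int(&args[0], &option))
				return -EINVAL;
			cfg->uid = option;
			break;
		default:
			printk(KERN_ERR "myfs: unknown option \"%s\"\n", p);
			return -EINVAL;
		}
	}
	return 0;
}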
diff --git a/fs/inode.c b/fs/inode.c
index 9a012cc5b6cd..320e088d0b28 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -145,7 +145,7 @@ static struct inode *alloc_inode(struct super_block *sb)
145 mapping->a_ops = &empty_aops; 145 mapping->a_ops = &empty_aops;
146 mapping->host = inode; 146 mapping->host = inode;
147 mapping->flags = 0; 147 mapping->flags = 0;
148 mapping_set_gfp_mask(mapping, GFP_HIGHUSER); 148 mapping_set_gfp_mask(mapping, GFP_HIGHUSER_PAGECACHE);
149 mapping->assoc_mapping = NULL; 149 mapping->assoc_mapping = NULL;
150 mapping->backing_dev_info = &default_backing_dev_info; 150 mapping->backing_dev_info = &default_backing_dev_info;
151 151
@@ -462,6 +462,11 @@ static int shrink_icache_memory(int nr, gfp_t gfp_mask)
462 return (inodes_stat.nr_unused / 100) * sysctl_vfs_cache_pressure; 462 return (inodes_stat.nr_unused / 100) * sysctl_vfs_cache_pressure;
463} 463}
464 464
465static struct shrinker icache_shrinker = {
466 .shrink = shrink_icache_memory,
467 .seeks = DEFAULT_SEEKS,
468};
469
465static void __wait_on_freeing_inode(struct inode *inode); 470static void __wait_on_freeing_inode(struct inode *inode);
466/* 471/*
467 * Called with the inode lock held. 472 * Called with the inode lock held.
@@ -519,7 +524,13 @@ repeat:
519 * new_inode - obtain an inode 524 * new_inode - obtain an inode
520 * @sb: superblock 525 * @sb: superblock
521 * 526 *
522 * Allocates a new inode for given superblock. 527 * Allocates a new inode for given superblock. The default gfp_mask
528 * for allocations related to inode->i_mapping is GFP_HIGHUSER_PAGECACHE.
529 * If HIGHMEM pages are unsuitable or it is known that pages allocated
530 * for the page cache are not reclaimable or migratable,
531 * mapping_set_gfp_mask() must be called with suitable flags on the
532 * newly created inode's mapping
533 *
523 */ 534 */
524struct inode *new_inode(struct super_block *sb) 535struct inode *new_inode(struct super_block *sb)
525{ 536{
@@ -1379,7 +1390,7 @@ void __init inode_init(unsigned long mempages)
1379 SLAB_MEM_SPREAD), 1390 SLAB_MEM_SPREAD),
1380 init_once, 1391 init_once,
1381 NULL); 1392 NULL);
1382 set_shrinker(DEFAULT_SEEKS, shrink_icache_memory); 1393 register_shrinker(&icache_shrinker);
1383 1394
1384 /* Hash may have been set up in inode_init_early */ 1395 /* Hash may have been set up in inode_init_early */
1385 if (!hashdist) 1396 if (!hashdist)
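
The fs/inode.c hunks above convert the icache from the old set_shrinker(seeks, fn) registration to a caller-owned struct shrinker. A hedged sketch of the new API in module form; the my_* names and the object counter are invented, but the callback contract (return the population when nr is 0, otherwise scan and return the remainder) is the one shrink_icache_memory() follows.

/*
 * Hedged sketch of the struct shrinker registration used above.
 */
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/mm.h>

static int my_cached_objects;	/* invented stand-in for a real cache */

static int my_cache_shrink(int nr, gfp_t gfp_mask)
{
	if (nr) {
		/* ...drop up to nr cached objects here... */
		my_cached_objects -= min(nr, my_cached_objects);
	}
	return my_cached_objects;
}

static struct shrinker my_shrinker = {
	.shrink	= my_cache_shrink,
	.seeks	= DEFAULT_SEEKS,
};

static int __init my_init(void)
{
	register_shrinker(&my_shrinker);
	return 0;
}

static void __exit my_exit(void)
{
	unregister_shrinker(&my_shrinker);
}

module_init(my_init);
module_exit(my_exit);
MODULE_LICENSE("GPL");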
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 8c90cbc903fa..c2a773e8620b 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -12,7 +12,6 @@
12#include <linux/fs.h> 12#include <linux/fs.h>
13#include <linux/security.h> 13#include <linux/security.h>
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/kallsyms.h>
16 15
17#include <asm/uaccess.h> 16#include <asm/uaccess.h>
18#include <asm/ioctls.h> 17#include <asm/ioctls.h>
@@ -21,7 +20,6 @@ static long do_ioctl(struct file *filp, unsigned int cmd,
21 unsigned long arg) 20 unsigned long arg)
22{ 21{
23 int error = -ENOTTY; 22 int error = -ENOTTY;
24 void *f;
25 23
26 if (!filp->f_op) 24 if (!filp->f_op)
27 goto out; 25 goto out;
@@ -31,16 +29,10 @@ static long do_ioctl(struct file *filp, unsigned int cmd,
31 if (error == -ENOIOCTLCMD) 29 if (error == -ENOIOCTLCMD)
32 error = -EINVAL; 30 error = -EINVAL;
33 goto out; 31 goto out;
34 } else if ((f = filp->f_op->ioctl)) { 32 } else if (filp->f_op->ioctl) {
35 lock_kernel(); 33 lock_kernel();
36 if (!filp->f_op->ioctl) { 34 error = filp->f_op->ioctl(filp->f_path.dentry->d_inode,
37 printk("%s: ioctl %p disappeared\n", __FUNCTION__, f); 35 filp, cmd, arg);
38 print_symbol("symbol: %s\n", (unsigned long)f);
39 dump_stack();
40 } else {
41 error = filp->f_op->ioctl(filp->f_path.dentry->d_inode,
42 filp, cmd, arg);
43 }
44 unlock_kernel(); 36 unlock_kernel();
45 } 37 }
46 38
@@ -182,11 +174,3 @@ asmlinkage long sys_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg)
182 out: 174 out:
183 return error; 175 return error;
184} 176}
185
186/*
187 * Platforms implementing 32 bit compatibility ioctl handlers in
188 * modules need this exported
189 */
190#ifdef CONFIG_COMPAT
191EXPORT_SYMBOL(sys_ioctl);
192#endif
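
With the "ioctl disappeared" debugging gone, do_ioctl() above simply calls the legacy ->ioctl() method under the BKL. For reference, a hedged sketch of the handler shape this path dispatches to; MYFS_IOC_GETCOUNT and the counter are invented, and unknown commands conventionally return -ENOTTY.

/*
 * Hedged sketch of a legacy ->ioctl() handler. Runs under the BKL
 * when reached via the path shown above.
 */
#include <linux/fs.h>
#include <linux/errno.h>
#include <linux/ioctl.h>
#include <asm/uaccess.h>

#define MYFS_IOC_GETCOUNT	_IOR('M', 1, int)	/* invented command */

static int myfs_count;

static int myfs_ioctl(struct inode *inode, struct file *filp,
		      unsigned int cmd, unsigned long arg)
{
	switch (cmd) {
	case MYFS_IOC_GETCOUNT:
		if (put_user(myfs_count, (int __user *)arg))
			return -EFAULT;
		return 0;
	default:
		return -ENOTTY;
	}
}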
diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c
index 0e94c31cad9b..1ba407c64df1 100644
--- a/fs/isofs/dir.c
+++ b/fs/isofs/dir.c
@@ -7,34 +7,18 @@
7 * 7 *
8 * Steve Beynon : Missing last directory entries fixed 8 * Steve Beynon : Missing last directory entries fixed
9 * (stephen@askone.demon.co.uk) : 21st June 1996 9 * (stephen@askone.demon.co.uk) : 21st June 1996
10 * 10 *
11 * isofs directory handling functions 11 * isofs directory handling functions
12 */ 12 */
13#include <linux/smp_lock.h> 13#include <linux/smp_lock.h>
14#include "isofs.h" 14#include "isofs.h"
15 15
16static int isofs_readdir(struct file *, void *, filldir_t);
17
18const struct file_operations isofs_dir_operations =
19{
20 .read = generic_read_dir,
21 .readdir = isofs_readdir,
22};
23
24/*
25 * directories can handle most operations...
26 */
27const struct inode_operations isofs_dir_inode_operations =
28{
29 .lookup = isofs_lookup,
30};
31
32int isofs_name_translate(struct iso_directory_record *de, char *new, struct inode *inode) 16int isofs_name_translate(struct iso_directory_record *de, char *new, struct inode *inode)
33{ 17{
34 char * old = de->name; 18 char * old = de->name;
35 int len = de->name_len[0]; 19 int len = de->name_len[0];
36 int i; 20 int i;
37 21
38 for (i = 0; i < len; i++) { 22 for (i = 0; i < len; i++) {
39 unsigned char c = old[i]; 23 unsigned char c = old[i];
40 if (!c) 24 if (!c)
@@ -62,22 +46,27 @@ int isofs_name_translate(struct iso_directory_record *de, char *new, struct inod
62} 46}
63 47
64/* Acorn extensions written by Matthew Wilcox <willy@bofh.ai> 1998 */ 48/* Acorn extensions written by Matthew Wilcox <willy@bofh.ai> 1998 */
65int get_acorn_filename(struct iso_directory_record * de, 49int get_acorn_filename(struct iso_directory_record *de,
66 char * retname, struct inode * inode) 50 char *retname, struct inode *inode)
67{ 51{
68 int std; 52 int std;
69 unsigned char * chr; 53 unsigned char *chr;
70 int retnamlen = isofs_name_translate(de, retname, inode); 54 int retnamlen = isofs_name_translate(de, retname, inode);
71 if (retnamlen == 0) return 0; 55
56 if (retnamlen == 0)
57 return 0;
72 std = sizeof(struct iso_directory_record) + de->name_len[0]; 58 std = sizeof(struct iso_directory_record) + de->name_len[0];
73 if (std & 1) std++; 59 if (std & 1)
74 if ((*((unsigned char *) de) - std) != 32) return retnamlen; 60 std++;
61 if ((*((unsigned char *) de) - std) != 32)
62 return retnamlen;
75 chr = ((unsigned char *) de) + std; 63 chr = ((unsigned char *) de) + std;
76 if (strncmp(chr, "ARCHIMEDES", 10)) return retnamlen; 64 if (strncmp(chr, "ARCHIMEDES", 10))
77 if ((*retname == '_') && ((chr[19] & 1) == 1)) *retname = '!'; 65 return retnamlen;
66 if ((*retname == '_') && ((chr[19] & 1) == 1))
67 *retname = '!';
78 if (((de->flags[0] & 2) == 0) && (chr[13] == 0xff) 68 if (((de->flags[0] & 2) == 0) && (chr[13] == 0xff)
79 && ((chr[12] & 0xf0) == 0xf0)) 69 && ((chr[12] & 0xf0) == 0xf0)) {
80 {
81 retname[retnamlen] = ','; 70 retname[retnamlen] = ',';
82 sprintf(retname+retnamlen+1, "%3.3x", 71 sprintf(retname+retnamlen+1, "%3.3x",
83 ((chr[12] & 0xf) << 8) | chr[11]); 72 ((chr[12] & 0xf) << 8) | chr[11]);
@@ -91,7 +80,7 @@ int get_acorn_filename(struct iso_directory_record * de,
91 */ 80 */
92static int do_isofs_readdir(struct inode *inode, struct file *filp, 81static int do_isofs_readdir(struct inode *inode, struct file *filp,
93 void *dirent, filldir_t filldir, 82 void *dirent, filldir_t filldir,
94 char * tmpname, struct iso_directory_record * tmpde) 83 char *tmpname, struct iso_directory_record *tmpde)
95{ 84{
96 unsigned long bufsize = ISOFS_BUFFER_SIZE(inode); 85 unsigned long bufsize = ISOFS_BUFFER_SIZE(inode);
97 unsigned char bufbits = ISOFS_BUFFER_BITS(inode); 86 unsigned char bufbits = ISOFS_BUFFER_BITS(inode);
@@ -121,9 +110,11 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp,
121 110
122 de_len = *(unsigned char *) de; 111 de_len = *(unsigned char *) de;
123 112
124 /* If the length byte is zero, we should move on to the next 113 /*
125 CDROM sector. If we are at the end of the directory, we 114 * If the length byte is zero, we should move on to the next
126 kick out of the while loop. */ 115 * CDROM sector. If we are at the end of the directory, we
116 * kick out of the while loop.
117 */
127 118
128 if (de_len == 0) { 119 if (de_len == 0) {
129 brelse(bh); 120 brelse(bh);
@@ -157,11 +148,10 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp,
157 148
158 if (first_de) { 149 if (first_de) {
159 isofs_normalize_block_and_offset(de, 150 isofs_normalize_block_and_offset(de,
160 &block_saved, 151 &block_saved,
161 &offset_saved); 152 &offset_saved);
162 inode_number = isofs_get_ino(block_saved, 153 inode_number = isofs_get_ino(block_saved,
163 offset_saved, 154 offset_saved, bufbits);
164 bufbits);
165 } 155 }
166 156
167 if (de->flags[-sbi->s_high_sierra] & 0x80) { 157 if (de->flags[-sbi->s_high_sierra] & 0x80) {
@@ -199,7 +189,7 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp,
199 */ 189 */
200 if ((sbi->s_hide == 'y' && 190 if ((sbi->s_hide == 'y' &&
201 (de->flags[-sbi->s_high_sierra] & 1)) || 191 (de->flags[-sbi->s_high_sierra] & 1)) ||
202 (sbi->s_showassoc =='n' && 192 (sbi->s_showassoc =='n' &&
203 (de->flags[-sbi->s_high_sierra] & 4))) { 193 (de->flags[-sbi->s_high_sierra] & 4))) {
204 filp->f_pos += de_len; 194 filp->f_pos += de_len;
205 continue; 195 continue;
@@ -240,7 +230,8 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp,
240 230
241 continue; 231 continue;
242 } 232 }
243 if (bh) brelse(bh); 233 if (bh)
234 brelse(bh);
244 return 0; 235 return 0;
245} 236}
246 237
@@ -253,8 +244,8 @@ static int isofs_readdir(struct file *filp,
253 void *dirent, filldir_t filldir) 244 void *dirent, filldir_t filldir)
254{ 245{
255 int result; 246 int result;
256 char * tmpname; 247 char *tmpname;
257 struct iso_directory_record * tmpde; 248 struct iso_directory_record *tmpde;
258 struct inode *inode = filp->f_path.dentry->d_inode; 249 struct inode *inode = filp->f_path.dentry->d_inode;
259 250
260 tmpname = (char *)__get_free_page(GFP_KERNEL); 251 tmpname = (char *)__get_free_page(GFP_KERNEL);
@@ -270,3 +261,19 @@ static int isofs_readdir(struct file *filp,
270 unlock_kernel(); 261 unlock_kernel();
271 return result; 262 return result;
272} 263}
264
265const struct file_operations isofs_dir_operations =
266{
267 .read = generic_read_dir,
268 .readdir = isofs_readdir,
269};
270
271/*
272 * directories can handle most operations...
273 */
274const struct inode_operations isofs_dir_inode_operations =
275{
276 .lookup = isofs_lookup,
277};
278
279
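
The reshuffle above is purely about declaration order: with isofs_dir_operations and isofs_dir_inode_operations now defined after isofs_readdir(), the static forward declaration at the top of the file can go. The pattern in miniature, with invented names:

#include <linux/fs.h>

static int myfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
{
	/* A real implementation would walk the directory here. */
	return 0;
}

/* Defined after the function it points to: no prototype needed. */
const struct file_operations myfs_dir_operations = {
	.read		= generic_read_dir,
	.readdir	= myfs_readdir,
};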
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 5c3eecf7542e..4f5418be0590 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -73,20 +73,20 @@ static void isofs_destroy_inode(struct inode *inode)
73 kmem_cache_free(isofs_inode_cachep, ISOFS_I(inode)); 73 kmem_cache_free(isofs_inode_cachep, ISOFS_I(inode));
74} 74}
75 75
76static void init_once(void *foo, struct kmem_cache * cachep, unsigned long flags) 76static void init_once(void *foo, struct kmem_cache *cachep, unsigned long flags)
77{ 77{
78 struct iso_inode_info *ei = foo; 78 struct iso_inode_info *ei = foo;
79 79
80 inode_init_once(&ei->vfs_inode); 80 inode_init_once(&ei->vfs_inode);
81} 81}
82 82
83static int init_inodecache(void) 83static int init_inodecache(void)
84{ 84{
85 isofs_inode_cachep = kmem_cache_create("isofs_inode_cache", 85 isofs_inode_cachep = kmem_cache_create("isofs_inode_cache",
86 sizeof(struct iso_inode_info), 86 sizeof(struct iso_inode_info),
87 0, (SLAB_RECLAIM_ACCOUNT| 87 0, (SLAB_RECLAIM_ACCOUNT|
88 SLAB_MEM_SPREAD), 88 SLAB_MEM_SPREAD),
89 init_once, NULL); 89 init_once, NULL);
90 if (isofs_inode_cachep == NULL) 90 if (isofs_inode_cachep == NULL)
91 return -ENOMEM; 91 return -ENOMEM;
92 return 0; 92 return 0;
@@ -150,9 +150,9 @@ struct iso9660_options{
150 uid_t uid; 150 uid_t uid;
151 char *iocharset; 151 char *iocharset;
152 unsigned char utf8; 152 unsigned char utf8;
153 /* LVE */ 153 /* LVE */
154 s32 session; 154 s32 session;
155 s32 sbsector; 155 s32 sbsector;
156}; 156};
157 157
158/* 158/*
@@ -197,7 +197,7 @@ isofs_hashi_common(struct dentry *dentry, struct qstr *qstr, int ms)
197 hash = init_name_hash(); 197 hash = init_name_hash();
198 while (len--) { 198 while (len--) {
199 c = tolower(*name++); 199 c = tolower(*name++);
200 hash = partial_name_hash(tolower(c), hash); 200 hash = partial_name_hash(c, hash);
201 } 201 }
202 qstr->hash = end_name_hash(hash); 202 qstr->hash = end_name_hash(hash);
203 203
@@ -360,10 +360,12 @@ static int parse_options(char *options, struct iso9660_options *popt)
360 popt->check = 'u'; /* unset */ 360 popt->check = 'u'; /* unset */
361 popt->nocompress = 0; 361 popt->nocompress = 0;
362 popt->blocksize = 1024; 362 popt->blocksize = 1024;
363 popt->mode = S_IRUGO | S_IXUGO; /* r-x for all. The disc could 363 popt->mode = S_IRUGO | S_IXUGO; /*
364 be shared with DOS machines so 364 * r-x for all. The disc could
365 virtually anything could be 365 * be shared with DOS machines so
366 a valid executable. */ 366 * virtually anything could be
367 * a valid executable.
368 */
367 popt->gid = 0; 369 popt->gid = 0;
368 popt->uid = 0; 370 popt->uid = 0;
369 popt->iocharset = NULL; 371 popt->iocharset = NULL;
@@ -503,30 +505,30 @@ static unsigned int isofs_get_last_session(struct super_block *sb, s32 session)
503 Te.cdte_format=CDROM_LBA; 505 Te.cdte_format=CDROM_LBA;
504 i = ioctl_by_bdev(bdev, CDROMREADTOCENTRY, (unsigned long) &Te); 506 i = ioctl_by_bdev(bdev, CDROMREADTOCENTRY, (unsigned long) &Te);
505 if (!i) { 507 if (!i) {
506 printk(KERN_DEBUG "Session %d start %d type %d\n", 508 printk(KERN_DEBUG "ISOFS: Session %d start %d type %d\n",
507 session, Te.cdte_addr.lba, 509 session, Te.cdte_addr.lba,
508 Te.cdte_ctrl&CDROM_DATA_TRACK); 510 Te.cdte_ctrl&CDROM_DATA_TRACK);
509 if ((Te.cdte_ctrl&CDROM_DATA_TRACK) == 4) 511 if ((Te.cdte_ctrl&CDROM_DATA_TRACK) == 4)
510 return Te.cdte_addr.lba; 512 return Te.cdte_addr.lba;
511 } 513 }
512 514
513 printk(KERN_ERR "Invalid session number or type of track\n"); 515 printk(KERN_ERR "ISOFS: Invalid session number or type of track\n");
514 } 516 }
515 i = ioctl_by_bdev(bdev, CDROMMULTISESSION, (unsigned long) &ms_info); 517 i = ioctl_by_bdev(bdev, CDROMMULTISESSION, (unsigned long) &ms_info);
516 if (session > 0) 518 if (session > 0)
517 printk(KERN_ERR "Invalid session number\n"); 519 printk(KERN_ERR "ISOFS: Invalid session number\n");
518#if 0 520#if 0
519 printk("isofs.inode: CDROMMULTISESSION: rc=%d\n",i); 521 printk(KERN_DEBUG "isofs.inode: CDROMMULTISESSION: rc=%d\n",i);
520 if (i==0) { 522 if (i==0) {
521 printk("isofs.inode: XA disk: %s\n",ms_info.xa_flag?"yes":"no"); 523 printk(KERN_DEBUG "isofs.inode: XA disk: %s\n",ms_info.xa_flag?"yes":"no");
522 printk("isofs.inode: vol_desc_start = %d\n", ms_info.addr.lba); 524 printk(KERN_DEBUG "isofs.inode: vol_desc_start = %d\n", ms_info.addr.lba);
523 } 525 }
524#endif 526#endif
525 if (i==0) 527 if (i==0)
526#if WE_OBEY_THE_WRITTEN_STANDARDS 528#if WE_OBEY_THE_WRITTEN_STANDARDS
527 if (ms_info.xa_flag) /* necessary for a valid ms_info.addr */ 529 if (ms_info.xa_flag) /* necessary for a valid ms_info.addr */
528#endif 530#endif
529 vol_desc_start=ms_info.addr.lba; 531 vol_desc_start=ms_info.addr.lba;
530 return vol_desc_start; 532 return vol_desc_start;
531} 533}
532 534
@@ -538,20 +540,20 @@ static unsigned int isofs_get_last_session(struct super_block *sb, s32 session)
538 */ 540 */
539static int isofs_fill_super(struct super_block *s, void *data, int silent) 541static int isofs_fill_super(struct super_block *s, void *data, int silent)
540{ 542{
541 struct buffer_head * bh = NULL, *pri_bh = NULL; 543 struct buffer_head *bh = NULL, *pri_bh = NULL;
542 struct hs_primary_descriptor * h_pri = NULL; 544 struct hs_primary_descriptor *h_pri = NULL;
543 struct iso_primary_descriptor * pri = NULL; 545 struct iso_primary_descriptor *pri = NULL;
544 struct iso_supplementary_descriptor *sec = NULL; 546 struct iso_supplementary_descriptor *sec = NULL;
545 struct iso_directory_record * rootp; 547 struct iso_directory_record *rootp;
546 int joliet_level = 0; 548 struct inode *inode;
547 int iso_blknum, block; 549 struct iso9660_options opt;
548 int orig_zonesize; 550 struct isofs_sb_info *sbi;
549 int table; 551 unsigned long first_data_zone;
550 unsigned int vol_desc_start; 552 int joliet_level = 0;
551 unsigned long first_data_zone; 553 int iso_blknum, block;
552 struct inode * inode; 554 int orig_zonesize;
553 struct iso9660_options opt; 555 int table;
554 struct isofs_sb_info * sbi; 556 unsigned int vol_desc_start;
555 557
556 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); 558 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
557 if (!sbi) 559 if (!sbi)
@@ -577,72 +579,73 @@ static int isofs_fill_super(struct super_block *s, void *data, int silent)
577 vol_desc_start = (opt.sbsector != -1) ? 579 vol_desc_start = (opt.sbsector != -1) ?
578 opt.sbsector : isofs_get_last_session(s,opt.session); 580 opt.sbsector : isofs_get_last_session(s,opt.session);
579 581
580 for (iso_blknum = vol_desc_start+16; 582 for (iso_blknum = vol_desc_start+16;
581 iso_blknum < vol_desc_start+100; iso_blknum++) 583 iso_blknum < vol_desc_start+100; iso_blknum++) {
582 { 584 struct hs_volume_descriptor *hdp;
583 struct hs_volume_descriptor * hdp; 585 struct iso_volume_descriptor *vdp;
584 struct iso_volume_descriptor * vdp; 586
585 587 block = iso_blknum << (ISOFS_BLOCK_BITS - s->s_blocksize_bits);
586 block = iso_blknum << (ISOFS_BLOCK_BITS - s->s_blocksize_bits); 588 if (!(bh = sb_bread(s, block)))
587 if (!(bh = sb_bread(s, block))) 589 goto out_no_read;
588 goto out_no_read; 590
589 591 vdp = (struct iso_volume_descriptor *)bh->b_data;
590 vdp = (struct iso_volume_descriptor *)bh->b_data; 592 hdp = (struct hs_volume_descriptor *)bh->b_data;
591 hdp = (struct hs_volume_descriptor *)bh->b_data; 593
592 594 /*
593 /* Due to the overlapping physical location of the descriptors, 595 * Due to the overlapping physical location of the descriptors,
594 * ISO CDs can match hdp->id==HS_STANDARD_ID as well. To ensure 596 * ISO CDs can match hdp->id==HS_STANDARD_ID as well. To ensure
595 * proper identification in this case, we first check for ISO. 597 * proper identification in this case, we first check for ISO.
596 */ 598 */
597 if (strncmp (vdp->id, ISO_STANDARD_ID, sizeof vdp->id) == 0) { 599 if (strncmp (vdp->id, ISO_STANDARD_ID, sizeof vdp->id) == 0) {
598 if (isonum_711 (vdp->type) == ISO_VD_END) 600 if (isonum_711(vdp->type) == ISO_VD_END)
599 break; 601 break;
600 if (isonum_711 (vdp->type) == ISO_VD_PRIMARY) { 602 if (isonum_711(vdp->type) == ISO_VD_PRIMARY) {
601 if (pri == NULL) { 603 if (pri == NULL) {
602 pri = (struct iso_primary_descriptor *)vdp; 604 pri = (struct iso_primary_descriptor *)vdp;
603 /* Save the buffer in case we need it ... */ 605 /* Save the buffer in case we need it ... */
604 pri_bh = bh; 606 pri_bh = bh;
605 bh = NULL; 607 bh = NULL;
606 } 608 }
607 } 609 }
608#ifdef CONFIG_JOLIET 610#ifdef CONFIG_JOLIET
609 else if (isonum_711 (vdp->type) == ISO_VD_SUPPLEMENTARY) { 611 else if (isonum_711(vdp->type) == ISO_VD_SUPPLEMENTARY) {
610 sec = (struct iso_supplementary_descriptor *)vdp; 612 sec = (struct iso_supplementary_descriptor *)vdp;
611 if (sec->escape[0] == 0x25 && sec->escape[1] == 0x2f) { 613 if (sec->escape[0] == 0x25 && sec->escape[1] == 0x2f) {
612 if (opt.joliet == 'y') { 614 if (opt.joliet == 'y') {
613 if (sec->escape[2] == 0x40) { 615 if (sec->escape[2] == 0x40)
614 joliet_level = 1; 616 joliet_level = 1;
615 } else if (sec->escape[2] == 0x43) { 617 else if (sec->escape[2] == 0x43)
616 joliet_level = 2; 618 joliet_level = 2;
617 } else if (sec->escape[2] == 0x45) { 619 else if (sec->escape[2] == 0x45)
618 joliet_level = 3; 620 joliet_level = 3;
619 } 621
620 printk(KERN_DEBUG"ISO 9660 Extensions: Microsoft Joliet Level %d\n", 622 printk(KERN_DEBUG "ISO 9660 Extensions: "
621 joliet_level); 623 "Microsoft Joliet Level %d\n",
624 joliet_level);
625 }
626 goto root_found;
627 } else {
628 /* Unknown supplementary volume descriptor */
629 sec = NULL;
630 }
622 } 631 }
623 goto root_found;
624 } else {
625 /* Unknown supplementary volume descriptor */
626 sec = NULL;
627 }
628 }
629#endif 632#endif
630 } else { 633 } else {
631 if (strncmp (hdp->id, HS_STANDARD_ID, sizeof hdp->id) == 0) { 634 if (strncmp (hdp->id, HS_STANDARD_ID, sizeof hdp->id) == 0) {
632 if (isonum_711 (hdp->type) != ISO_VD_PRIMARY) 635 if (isonum_711(hdp->type) != ISO_VD_PRIMARY)
633 goto out_freebh; 636 goto out_freebh;
634 637
635 sbi->s_high_sierra = 1; 638 sbi->s_high_sierra = 1;
636 opt.rock = 'n'; 639 opt.rock = 'n';
637 h_pri = (struct hs_primary_descriptor *)vdp; 640 h_pri = (struct hs_primary_descriptor *)vdp;
638 goto root_found; 641 goto root_found;
642 }
639 } 643 }
640 }
641 644
642 /* Just skip any volume descriptors we don't recognize */ 645 /* Just skip any volume descriptors we don't recognize */
643 646
644 brelse(bh); 647 brelse(bh);
645 bh = NULL; 648 bh = NULL;
646 } 649 }
647 /* 650 /*
648 * If we fall through, either no volume descriptor was found, 651 * If we fall through, either no volume descriptor was found,
@@ -657,24 +660,24 @@ static int isofs_fill_super(struct super_block *s, void *data, int silent)
657root_found: 660root_found:
658 661
659 if (joliet_level && (pri == NULL || opt.rock == 'n')) { 662 if (joliet_level && (pri == NULL || opt.rock == 'n')) {
660 /* This is the case of Joliet with the norock mount flag. 663 /* This is the case of Joliet with the norock mount flag.
661 * A disc with both Joliet and Rock Ridge is handled later 664 * A disc with both Joliet and Rock Ridge is handled later
662 */ 665 */
663 pri = (struct iso_primary_descriptor *) sec; 666 pri = (struct iso_primary_descriptor *) sec;
664 } 667 }
665 668
666 if(sbi->s_high_sierra){ 669 if(sbi->s_high_sierra){
667 rootp = (struct iso_directory_record *) h_pri->root_directory_record; 670 rootp = (struct iso_directory_record *) h_pri->root_directory_record;
668 sbi->s_nzones = isonum_733 (h_pri->volume_space_size); 671 sbi->s_nzones = isonum_733(h_pri->volume_space_size);
669 sbi->s_log_zone_size = isonum_723 (h_pri->logical_block_size); 672 sbi->s_log_zone_size = isonum_723(h_pri->logical_block_size);
670 sbi->s_max_size = isonum_733(h_pri->volume_space_size); 673 sbi->s_max_size = isonum_733(h_pri->volume_space_size);
671 } else { 674 } else {
672 if (!pri) 675 if (!pri)
673 goto out_freebh; 676 goto out_freebh;
674 rootp = (struct iso_directory_record *) pri->root_directory_record; 677 rootp = (struct iso_directory_record *) pri->root_directory_record;
675 sbi->s_nzones = isonum_733 (pri->volume_space_size); 678 sbi->s_nzones = isonum_733(pri->volume_space_size);
676 sbi->s_log_zone_size = isonum_723 (pri->logical_block_size); 679 sbi->s_log_zone_size = isonum_723(pri->logical_block_size);
677 sbi->s_max_size = isonum_733(pri->volume_space_size); 680 sbi->s_max_size = isonum_733(pri->volume_space_size);
678 } 681 }
679 682
680 sbi->s_ninodes = 0; /* No way to figure this out easily */ 683 sbi->s_ninodes = 0; /* No way to figure this out easily */
@@ -687,42 +690,43 @@ root_found:
687 * blocks that were 512 bytes (which should only very rarely 690 * blocks that were 512 bytes (which should only very rarely
688 * happen.) 691 * happen.)
689 */ 692 */
690 if(orig_zonesize < opt.blocksize) 693 if (orig_zonesize < opt.blocksize)
691 goto out_bad_size; 694 goto out_bad_size;
692 695
693 /* RDE: convert log zone size to bit shift */ 696 /* RDE: convert log zone size to bit shift */
694 switch (sbi->s_log_zone_size) 697 switch (sbi->s_log_zone_size) {
695 { case 512: sbi->s_log_zone_size = 9; break; 698 case 512: sbi->s_log_zone_size = 9; break;
696 case 1024: sbi->s_log_zone_size = 10; break; 699 case 1024: sbi->s_log_zone_size = 10; break;
697 case 2048: sbi->s_log_zone_size = 11; break; 700 case 2048: sbi->s_log_zone_size = 11; break;
698 701
699 default: 702 default:
700 goto out_bad_zone_size; 703 goto out_bad_zone_size;
701 } 704 }
702 705
703 s->s_magic = ISOFS_SUPER_MAGIC; 706 s->s_magic = ISOFS_SUPER_MAGIC;
704 s->s_maxbytes = 0xffffffff; /* We can handle files up to 4 GB */ 707 s->s_maxbytes = 0xffffffff; /* We can handle files up to 4 GB */
705 708
706 /* The CDROM is read-only, has no nodes (devices) on it, and since 709 /*
707 all of the files appear to be owned by root, we really do not want 710 * The CDROM is read-only, has no nodes (devices) on it, and since
708 to allow suid. (suid or devices will not show up unless we have 711 * all of the files appear to be owned by root, we really do not want
709 Rock Ridge extensions) */ 712 * to allow suid. (suid or devices will not show up unless we have
713 * Rock Ridge extensions)
714 */
710 715
711 s->s_flags |= MS_RDONLY /* | MS_NODEV | MS_NOSUID */; 716 s->s_flags |= MS_RDONLY /* | MS_NODEV | MS_NOSUID */;
712 717
713 /* Set this for reference. Its not currently used except on write 718 /* Set this for reference. Its not currently used except on write
714 which we don't have .. */ 719 which we don't have .. */
715 720
716 first_data_zone = isonum_733 (rootp->extent) + 721 first_data_zone = isonum_733(rootp->extent) +
717 isonum_711 (rootp->ext_attr_length); 722 isonum_711(rootp->ext_attr_length);
718 sbi->s_firstdatazone = first_data_zone; 723 sbi->s_firstdatazone = first_data_zone;
719#ifndef BEQUIET 724#ifndef BEQUIET
720 printk(KERN_DEBUG "Max size:%ld Log zone size:%ld\n", 725 printk(KERN_DEBUG "ISOFS: Max size:%ld Log zone size:%ld\n",
721 sbi->s_max_size, 726 sbi->s_max_size, 1UL << sbi->s_log_zone_size);
722 1UL << sbi->s_log_zone_size); 727 printk(KERN_DEBUG "ISOFS: First datazone:%ld\n", sbi->s_firstdatazone);
723 printk(KERN_DEBUG "First datazone:%ld\n", sbi->s_firstdatazone);
724 if(sbi->s_high_sierra) 728 if(sbi->s_high_sierra)
725 printk(KERN_DEBUG "Disc in High Sierra format.\n"); 729 printk(KERN_DEBUG "ISOFS: Disc in High Sierra format.\n");
726#endif 730#endif
727 731
728 /* 732 /*
@@ -737,8 +741,8 @@ root_found:
737 pri = (struct iso_primary_descriptor *) sec; 741 pri = (struct iso_primary_descriptor *) sec;
738 rootp = (struct iso_directory_record *) 742 rootp = (struct iso_directory_record *)
739 pri->root_directory_record; 743 pri->root_directory_record;
740 first_data_zone = isonum_733 (rootp->extent) + 744 first_data_zone = isonum_733(rootp->extent) +
741 isonum_711 (rootp->ext_attr_length); 745 isonum_711(rootp->ext_attr_length);
742 } 746 }
743 747
744 /* 748 /*
@@ -771,7 +775,7 @@ root_found:
771 775
772#ifdef CONFIG_JOLIET 776#ifdef CONFIG_JOLIET
773 if (joliet_level && opt.utf8 == 0) { 777 if (joliet_level && opt.utf8 == 0) {
774 char * p = opt.iocharset ? opt.iocharset : CONFIG_NLS_DEFAULT; 778 char *p = opt.iocharset ? opt.iocharset : CONFIG_NLS_DEFAULT;
775 sbi->s_nls_iocharset = load_nls(p); 779 sbi->s_nls_iocharset = load_nls(p);
776 if (! sbi->s_nls_iocharset) { 780 if (! sbi->s_nls_iocharset) {
777 /* Fail only if explicit charset specified */ 781 /* Fail only if explicit charset specified */
@@ -821,7 +825,7 @@ root_found:
821 sbi->s_rock = 0; 825 sbi->s_rock = 0;
822 if (sbi->s_firstdatazone != first_data_zone) { 826 if (sbi->s_firstdatazone != first_data_zone) {
823 sbi->s_firstdatazone = first_data_zone; 827 sbi->s_firstdatazone = first_data_zone;
824 printk(KERN_DEBUG 828 printk(KERN_DEBUG
825 "ISOFS: changing to secondary root\n"); 829 "ISOFS: changing to secondary root\n");
826 iput(inode); 830 iput(inode);
827 inode = isofs_iget(s, sbi->s_firstdatazone, 0); 831 inode = isofs_iget(s, sbi->s_firstdatazone, 0);
@@ -830,8 +834,10 @@ root_found:
830 834
831 if (opt.check == 'u') { 835 if (opt.check == 'u') {
832 /* Only Joliet is case insensitive by default */ 836 /* Only Joliet is case insensitive by default */
833 if (joliet_level) opt.check = 'r'; 837 if (joliet_level)
834 else opt.check = 's'; 838 opt.check = 'r';
839 else
840 opt.check = 's';
835 } 841 }
836 sbi->s_joliet_level = joliet_level; 842 sbi->s_joliet_level = joliet_level;
837 843
@@ -846,8 +852,10 @@ root_found:
846 goto out_no_root; 852 goto out_no_root;
847 853
848 table = 0; 854 table = 0;
849 if (joliet_level) table += 2; 855 if (joliet_level)
850 if (opt.check == 'r') table++; 856 table += 2;
857 if (opt.check == 'r')
858 table++;
851 s->s_root->d_op = &isofs_dentry_ops[table]; 859 s->s_root->d_op = &isofs_dentry_ops[table];
852 860
853 kfree(opt.iocharset); 861 kfree(opt.iocharset);
@@ -858,10 +866,10 @@ root_found:
858 * Display error messages and free resources. 866 * Display error messages and free resources.
859 */ 867 */
860out_bad_root: 868out_bad_root:
861 printk(KERN_WARNING "isofs_fill_super: root inode not initialized\n"); 869 printk(KERN_WARNING "%s: root inode not initialized\n", __func__);
862 goto out_iput; 870 goto out_iput;
863out_no_root: 871out_no_root:
864 printk(KERN_WARNING "isofs_fill_super: get root inode failed\n"); 872 printk(KERN_WARNING "%s: get root inode failed\n", __func__);
865out_iput: 873out_iput:
866 iput(inode); 874 iput(inode);
867#ifdef CONFIG_JOLIET 875#ifdef CONFIG_JOLIET
@@ -870,21 +878,20 @@ out_iput:
870#endif 878#endif
871 goto out_freesbi; 879 goto out_freesbi;
872out_no_read: 880out_no_read:
873 printk(KERN_WARNING "isofs_fill_super: " 881 printk(KERN_WARNING "%s: bread failed, dev=%s, iso_blknum=%d, block=%d\n",
874 "bread failed, dev=%s, iso_blknum=%d, block=%d\n", 882 __func__, s->s_id, iso_blknum, block);
875 s->s_id, iso_blknum, block);
876 goto out_freesbi; 883 goto out_freesbi;
877out_bad_zone_size: 884out_bad_zone_size:
878 printk(KERN_WARNING "Bad logical zone size %ld\n", 885 printk(KERN_WARNING "ISOFS: Bad logical zone size %ld\n",
879 sbi->s_log_zone_size); 886 sbi->s_log_zone_size);
880 goto out_freebh; 887 goto out_freebh;
881out_bad_size: 888out_bad_size:
882 printk(KERN_WARNING "Logical zone size(%d) < hardware blocksize(%u)\n", 889 printk(KERN_WARNING "ISOFS: Logical zone size(%d) < hardware blocksize(%u)\n",
883 orig_zonesize, opt.blocksize); 890 orig_zonesize, opt.blocksize);
884 goto out_freebh; 891 goto out_freebh;
885out_unknown_format: 892out_unknown_format:
886 if (!silent) 893 if (!silent)
887 printk(KERN_WARNING "Unable to identify CD-ROM format.\n"); 894 printk(KERN_WARNING "ISOFS: Unable to identify CD-ROM format.\n");
888 895
889out_freebh: 896out_freebh:
890 brelse(bh); 897 brelse(bh);
@@ -902,7 +909,7 @@ static int isofs_statfs (struct dentry *dentry, struct kstatfs *buf)
902 buf->f_type = ISOFS_SUPER_MAGIC; 909 buf->f_type = ISOFS_SUPER_MAGIC;
903 buf->f_bsize = sb->s_blocksize; 910 buf->f_bsize = sb->s_blocksize;
904 buf->f_blocks = (ISOFS_SB(sb)->s_nzones 911 buf->f_blocks = (ISOFS_SB(sb)->s_nzones
905 << (ISOFS_SB(sb)->s_log_zone_size - sb->s_blocksize_bits)); 912 << (ISOFS_SB(sb)->s_log_zone_size - sb->s_blocksize_bits));
906 buf->f_bfree = 0; 913 buf->f_bfree = 0;
907 buf->f_bavail = 0; 914 buf->f_bavail = 0;
908 buf->f_files = ISOFS_SB(sb)->s_ninodes; 915 buf->f_files = ISOFS_SB(sb)->s_ninodes;
@@ -931,20 +938,20 @@ int isofs_get_blocks(struct inode *inode, sector_t iblock_s,
931 938
932 rv = 0; 939 rv = 0;
933 if (iblock < 0 || iblock != iblock_s) { 940 if (iblock < 0 || iblock != iblock_s) {
934 printk("isofs_get_blocks: block number too large\n"); 941 printk(KERN_DEBUG "%s: block number too large\n", __func__);
935 goto abort; 942 goto abort;
936 } 943 }
937 944
938 b_off = iblock; 945 b_off = iblock;
939 946
940 offset = 0; 947 offset = 0;
941 firstext = ei->i_first_extent; 948 firstext = ei->i_first_extent;
942 sect_size = ei->i_section_size >> ISOFS_BUFFER_BITS(inode); 949 sect_size = ei->i_section_size >> ISOFS_BUFFER_BITS(inode);
943 nextblk = ei->i_next_section_block; 950 nextblk = ei->i_next_section_block;
944 nextoff = ei->i_next_section_offset; 951 nextoff = ei->i_next_section_offset;
945 section = 0; 952 section = 0;
946 953
947 while ( nblocks ) { 954 while (nblocks) {
948 /* If we are *way* beyond the end of the file, print a message. 955 /* If we are *way* beyond the end of the file, print a message.
949 * Access beyond the end of the file up to the next page boundary 956 * Access beyond the end of the file up to the next page boundary
950 * is normal, however because of the way the page cache works. 957 * is normal, however because of the way the page cache works.
@@ -953,11 +960,11 @@ int isofs_get_blocks(struct inode *inode, sector_t iblock_s,
953 * I/O errors. 960 * I/O errors.
954 */ 961 */
955 if (b_off > ((inode->i_size + PAGE_CACHE_SIZE - 1) >> ISOFS_BUFFER_BITS(inode))) { 962 if (b_off > ((inode->i_size + PAGE_CACHE_SIZE - 1) >> ISOFS_BUFFER_BITS(inode))) {
956 printk("isofs_get_blocks: block >= EOF (%ld, %ld)\n", 963 printk(KERN_DEBUG "%s: block >= EOF (%ld, %ld)\n",
957 iblock, (unsigned long) inode->i_size); 964 __func__, iblock, (unsigned long) inode->i_size);
958 goto abort; 965 goto abort;
959 } 966 }
960 967
961 /* On the last section, nextblk == 0, section size is likely to 968 /* On the last section, nextblk == 0, section size is likely to
962 * exceed sect_size by a partial block, and access beyond the 969 * exceed sect_size by a partial block, and access beyond the
963 * end of the file will reach beyond the section size, too. 970 * end of the file will reach beyond the section size, too.
@@ -976,20 +983,21 @@ int isofs_get_blocks(struct inode *inode, sector_t iblock_s,
976 iput(ninode); 983 iput(ninode);
977 984
978 if (++section > 100) { 985 if (++section > 100) {
979 printk("isofs_get_blocks: More than 100 file sections ?!?, aborting...\n"); 986 printk(KERN_DEBUG "%s: More than 100 file sections ?!?"
980 printk("isofs_get_blocks: block=%ld firstext=%u sect_size=%u " 987 " aborting...\n", __func__);
981 "nextblk=%lu nextoff=%lu\n", 988 printk(KERN_DEBUG "%s: block=%ld firstext=%u sect_size=%u "
982 iblock, firstext, (unsigned) sect_size, 989 "nextblk=%lu nextoff=%lu\n", __func__,
983 nextblk, nextoff); 990 iblock, firstext, (unsigned) sect_size,
991 nextblk, nextoff);
984 goto abort; 992 goto abort;
985 } 993 }
986 } 994 }
987 995
988 if ( *bh ) { 996 if (*bh) {
989 map_bh(*bh, inode->i_sb, firstext + b_off - offset); 997 map_bh(*bh, inode->i_sb, firstext + b_off - offset);
990 } else { 998 } else {
991 *bh = sb_getblk(inode->i_sb, firstext+b_off-offset); 999 *bh = sb_getblk(inode->i_sb, firstext+b_off-offset);
992 if ( !*bh ) 1000 if (!*bh)
993 goto abort; 1001 goto abort;
994 } 1002 }
995 bh++; /* Next buffer head */ 1003 bh++; /* Next buffer head */
@@ -1010,7 +1018,7 @@ static int isofs_get_block(struct inode *inode, sector_t iblock,
1010 struct buffer_head *bh_result, int create) 1018 struct buffer_head *bh_result, int create)
1011{ 1019{
1012 if (create) { 1020 if (create) {
1013 printk("isofs_get_block: Kernel tries to allocate a block\n"); 1021 printk(KERN_DEBUG "%s: Kernel tries to allocate a block\n", __func__);
1014 return -EROFS; 1022 return -EROFS;
1015 } 1023 }
1016 1024
@@ -1070,11 +1078,11 @@ static int isofs_read_level3_size(struct inode *inode)
1070{ 1078{
1071 unsigned long bufsize = ISOFS_BUFFER_SIZE(inode); 1079 unsigned long bufsize = ISOFS_BUFFER_SIZE(inode);
1072 int high_sierra = ISOFS_SB(inode->i_sb)->s_high_sierra; 1080 int high_sierra = ISOFS_SB(inode->i_sb)->s_high_sierra;
1073 struct buffer_head * bh = NULL; 1081 struct buffer_head *bh = NULL;
1074 unsigned long block, offset, block_saved, offset_saved; 1082 unsigned long block, offset, block_saved, offset_saved;
1075 int i = 0; 1083 int i = 0;
1076 int more_entries = 0; 1084 int more_entries = 0;
1077 struct iso_directory_record * tmpde = NULL; 1085 struct iso_directory_record *tmpde = NULL;
1078 struct iso_inode_info *ei = ISOFS_I(inode); 1086 struct iso_inode_info *ei = ISOFS_I(inode);
1079 1087
1080 inode->i_size = 0; 1088 inode->i_size = 0;
@@ -1089,7 +1097,7 @@ static int isofs_read_level3_size(struct inode *inode)
1089 offset = ei->i_iget5_offset; 1097 offset = ei->i_iget5_offset;
1090 1098
1091 do { 1099 do {
1092 struct iso_directory_record * de; 1100 struct iso_directory_record *de;
1093 unsigned int de_len; 1101 unsigned int de_len;
1094 1102
1095 if (!bh) { 1103 if (!bh) {
@@ -1163,10 +1171,9 @@ out_noread:
1163 return -EIO; 1171 return -EIO;
1164 1172
1165out_toomany: 1173out_toomany:
1166 printk(KERN_INFO "isofs_read_level3_size: " 1174 printk(KERN_INFO "%s: More than 100 file sections ?!?, aborting...\n"
1167 "More than 100 file sections ?!?, aborting...\n" 1175 "isofs_read_level3_size: inode=%lu\n",
1168 "isofs_read_level3_size: inode=%lu\n", 1176 __func__, inode->i_ino);
1169 inode->i_ino);
1170 goto out; 1177 goto out;
1171} 1178}
1172 1179
@@ -1177,9 +1184,9 @@ static void isofs_read_inode(struct inode *inode)
1177 unsigned long bufsize = ISOFS_BUFFER_SIZE(inode); 1184 unsigned long bufsize = ISOFS_BUFFER_SIZE(inode);
1178 unsigned long block; 1185 unsigned long block;
1179 int high_sierra = sbi->s_high_sierra; 1186 int high_sierra = sbi->s_high_sierra;
1180 struct buffer_head * bh = NULL; 1187 struct buffer_head *bh = NULL;
1181 struct iso_directory_record * de; 1188 struct iso_directory_record *de;
1182 struct iso_directory_record * tmpde = NULL; 1189 struct iso_directory_record *tmpde = NULL;
1183 unsigned int de_len; 1190 unsigned int de_len;
1184 unsigned long offset; 1191 unsigned long offset;
1185 struct iso_inode_info *ei = ISOFS_I(inode); 1192 struct iso_inode_info *ei = ISOFS_I(inode);
@@ -1199,7 +1206,7 @@ static void isofs_read_inode(struct inode *inode)
1199 1206
1200 tmpde = kmalloc(de_len, GFP_KERNEL); 1207 tmpde = kmalloc(de_len, GFP_KERNEL);
1201 if (tmpde == NULL) { 1208 if (tmpde == NULL) {
1202 printk(KERN_INFO "isofs_read_inode: out of memory\n"); 1209 printk(KERN_INFO "%s: out of memory\n", __func__);
1203 goto fail; 1210 goto fail;
1204 } 1211 }
1205 memcpy(tmpde, bh->b_data + offset, frag1); 1212 memcpy(tmpde, bh->b_data + offset, frag1);
@@ -1212,24 +1219,26 @@ static void isofs_read_inode(struct inode *inode)
1212 } 1219 }
1213 1220
1214 inode->i_ino = isofs_get_ino(ei->i_iget5_block, 1221 inode->i_ino = isofs_get_ino(ei->i_iget5_block,
1215 ei->i_iget5_offset, 1222 ei->i_iget5_offset,
1216 ISOFS_BUFFER_BITS(inode)); 1223 ISOFS_BUFFER_BITS(inode));
1217 1224
1218 /* Assume it is a normal-format file unless told otherwise */ 1225 /* Assume it is a normal-format file unless told otherwise */
1219 ei->i_file_format = isofs_file_normal; 1226 ei->i_file_format = isofs_file_normal;
1220 1227
1221 if (de->flags[-high_sierra] & 2) { 1228 if (de->flags[-high_sierra] & 2) {
1222 inode->i_mode = S_IRUGO | S_IXUGO | S_IFDIR; 1229 inode->i_mode = S_IRUGO | S_IXUGO | S_IFDIR;
1223 inode->i_nlink = 1; /* Set to 1. We know there are 2, but 1230 inode->i_nlink = 1; /*
1224 the find utility tries to optimize 1231 * Set to 1. We know there are 2, but
1225 if it is 2, and it screws up. It is 1232 * the find utility tries to optimize
1226 easier to give 1 which tells find to 1233 * if it is 2, and it screws up. It is
1227 do it the hard way. */ 1234 * easier to give 1 which tells find to
1235 * do it the hard way.
1236 */
1228 } else { 1237 } else {
1229 /* Everybody gets to read the file. */ 1238 /* Everybody gets to read the file. */
1230 inode->i_mode = sbi->s_mode; 1239 inode->i_mode = sbi->s_mode;
1231 inode->i_nlink = 1; 1240 inode->i_nlink = 1;
1232 inode->i_mode |= S_IFREG; 1241 inode->i_mode |= S_IFREG;
1233 } 1242 }
1234 inode->i_uid = sbi->s_uid; 1243 inode->i_uid = sbi->s_uid;
1235 inode->i_gid = sbi->s_gid; 1244 inode->i_gid = sbi->s_gid;
@@ -1239,13 +1248,14 @@ static void isofs_read_inode(struct inode *inode)
1239 ei->i_format_parm[1] = 0; 1248 ei->i_format_parm[1] = 0;
1240 ei->i_format_parm[2] = 0; 1249 ei->i_format_parm[2] = 0;
1241 1250
1242 ei->i_section_size = isonum_733 (de->size); 1251 ei->i_section_size = isonum_733(de->size);
1243 if (de->flags[-high_sierra] & 0x80) { 1252 if (de->flags[-high_sierra] & 0x80) {
1244 if(isofs_read_level3_size(inode)) goto fail; 1253 if(isofs_read_level3_size(inode))
1254 goto fail;
1245 } else { 1255 } else {
1246 ei->i_next_section_block = 0; 1256 ei->i_next_section_block = 0;
1247 ei->i_next_section_offset = 0; 1257 ei->i_next_section_offset = 0;
1248 inode->i_size = isonum_733 (de->size); 1258 inode->i_size = isonum_733(de->size);
1249 } 1259 }
1250 1260
1251 /* 1261 /*
@@ -1258,23 +1268,24 @@ static void isofs_read_inode(struct inode *inode)
1258 inode->i_size &= 0x00ffffff; 1268 inode->i_size &= 0x00ffffff;
1259 1269
1260 if (de->interleave[0]) { 1270 if (de->interleave[0]) {
1261 printk("Interleaved files not (yet) supported.\n"); 1271 printk(KERN_DEBUG "ISOFS: Interleaved files not (yet) supported.\n");
1262 inode->i_size = 0; 1272 inode->i_size = 0;
1263 } 1273 }
1264 1274
1265 /* I have no idea what file_unit_size is used for, so 1275 /* I have no idea what file_unit_size is used for, so
1266 we will flag it for now */ 1276 we will flag it for now */
1267 if (de->file_unit_size[0] != 0) { 1277 if (de->file_unit_size[0] != 0) {
1268 printk("File unit size != 0 for ISO file (%ld).\n", 1278 printk(KERN_DEBUG "ISOFS: File unit size != 0 for ISO file (%ld).\n",
1269 inode->i_ino); 1279 inode->i_ino);
1270 } 1280 }
1271 1281
1272 /* I have no idea what other flag bits are used for, so 1282 /* I have no idea what other flag bits are used for, so
1273 we will flag it for now */ 1283 we will flag it for now */
1274#ifdef DEBUG 1284#ifdef DEBUG
1275 if((de->flags[-high_sierra] & ~2)!= 0){ 1285 if((de->flags[-high_sierra] & ~2)!= 0){
1276 printk("Unusual flag settings for ISO file (%ld %x).\n", 1286 printk(KERN_DEBUG "ISOFS: Unusual flag settings for ISO file "
1277 inode->i_ino, de->flags[-high_sierra]); 1287 "(%ld %x).\n",
1288 inode->i_ino, de->flags[-high_sierra]);
1278 } 1289 }
1279#endif 1290#endif
1280 1291
@@ -1285,11 +1296,11 @@ static void isofs_read_inode(struct inode *inode)
1285 inode->i_atime.tv_nsec = 1296 inode->i_atime.tv_nsec =
1286 inode->i_ctime.tv_nsec = 0; 1297 inode->i_ctime.tv_nsec = 0;
1287 1298
1288 ei->i_first_extent = (isonum_733 (de->extent) + 1299 ei->i_first_extent = (isonum_733(de->extent) +
1289 isonum_711 (de->ext_attr_length)); 1300 isonum_711(de->ext_attr_length));
1290 1301
1291 /* Set the number of blocks for stat() - should be done before RR */ 1302 /* Set the number of blocks for stat() - should be done before RR */
1292 inode->i_blocks = (inode->i_size + 511) >> 9; 1303 inode->i_blocks = (inode->i_size + 511) >> 9;
1293 1304
1294 /* 1305 /*
1295 * Now test for possible Rock Ridge extensions which will override 1306 * Now test for possible Rock Ridge extensions which will override
@@ -1306,7 +1317,7 @@ static void isofs_read_inode(struct inode *inode)
1306 /* Install the inode operations vector */ 1317 /* Install the inode operations vector */
1307 if (S_ISREG(inode->i_mode)) { 1318 if (S_ISREG(inode->i_mode)) {
1308 inode->i_fop = &generic_ro_fops; 1319 inode->i_fop = &generic_ro_fops;
1309 switch ( ei->i_file_format ) { 1320 switch (ei->i_file_format) {
1310#ifdef CONFIG_ZISOFS 1321#ifdef CONFIG_ZISOFS
1311 case isofs_file_compressed: 1322 case isofs_file_compressed:
1312 inode->i_data.a_ops = &zisofs_aops; 1323 inode->i_data.a_ops = &zisofs_aops;
@@ -1350,7 +1361,7 @@ static int isofs_iget5_test(struct inode *ino, void *data)
1350 struct isofs_iget5_callback_data *d = 1361 struct isofs_iget5_callback_data *d =
1351 (struct isofs_iget5_callback_data*)data; 1362 (struct isofs_iget5_callback_data*)data;
1352 return (i->i_iget5_block == d->block) 1363 return (i->i_iget5_block == d->block)
1353 && (i->i_iget5_offset == d->offset); 1364 && (i->i_iget5_offset == d->offset);
1354} 1365}
1355 1366
1356static int isofs_iget5_set(struct inode *ino, void *data) 1367static int isofs_iget5_set(struct inode *ino, void *data)
@@ -1384,7 +1395,7 @@ struct inode *isofs_iget(struct super_block *sb,
1384 hashval = (block << sb->s_blocksize_bits) | offset; 1395 hashval = (block << sb->s_blocksize_bits) | offset;
1385 1396
1386 inode = iget5_locked(sb, hashval, &isofs_iget5_test, 1397 inode = iget5_locked(sb, hashval, &isofs_iget5_test,
1387 &isofs_iget5_set, &data); 1398 &isofs_iget5_set, &data);
1388 1399
1389 if (inode && (inode->i_state & I_NEW)) { 1400 if (inode && (inode->i_state & I_NEW)) {
1390 sb->s_op->read_inode(inode); 1401 sb->s_op->read_inode(inode);
@@ -1398,7 +1409,7 @@ static int isofs_get_sb(struct file_system_type *fs_type,
1398 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 1409 int flags, const char *dev_name, void *data, struct vfsmount *mnt)
1399{ 1410{
1400 return get_sb_bdev(fs_type, flags, dev_name, data, isofs_fill_super, 1411 return get_sb_bdev(fs_type, flags, dev_name, data, isofs_fill_super,
1401 mnt); 1412 mnt);
1402} 1413}
1403 1414
1404static struct file_system_type iso9660_fs_type = { 1415static struct file_system_type iso9660_fs_type = {
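
Most of the isofs/inode.c churn above is message hygiene: every printk() gains a severity level plus either an "ISOFS:" prefix or __func__, so the logs identify both the subsystem and the reporting function. The target style in miniature; the MYFS names are placeholders.

#include <linux/kernel.h>

static void myfs_report(unsigned long blk)
{
	/* severity + subsystem prefix + __func__ for local diagnostics */
	printk(KERN_DEBUG "MYFS: %s: reading block %lu\n", __func__, blk);
	printk(KERN_WARNING "MYFS: bad block %lu\n", blk);
}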
diff --git a/fs/isofs/isofs.h b/fs/isofs/isofs.h
index efe2872cd4e3..a07e67b1ea7f 100644
--- a/fs/isofs/isofs.h
+++ b/fs/isofs/isofs.h
@@ -1,5 +1,6 @@
1#include <linux/fs.h> 1#include <linux/fs.h>
2#include <linux/buffer_head.h> 2#include <linux/buffer_head.h>
3#include <linux/exportfs.h>
3#include <linux/iso_fs.h> 4#include <linux/iso_fs.h>
4#include <asm/unaligned.h> 5#include <asm/unaligned.h>
5 6
diff --git a/fs/isofs/joliet.c b/fs/isofs/joliet.c
index fb8fe7a9ddc6..92c14b850e9c 100644
--- a/fs/isofs/joliet.c
+++ b/fs/isofs/joliet.c
@@ -80,22 +80,20 @@ get_joliet_filename(struct iso_directory_record * de, unsigned char *outname, st
80 80
81 if (utf8) { 81 if (utf8) {
82 len = wcsntombs_be(outname, de->name, 82 len = wcsntombs_be(outname, de->name,
83 de->name_len[0] >> 1, PAGE_SIZE); 83 de->name_len[0] >> 1, PAGE_SIZE);
84 } else { 84 } else {
85 len = uni16_to_x8(outname, (__be16 *) de->name, 85 len = uni16_to_x8(outname, (__be16 *) de->name,
86 de->name_len[0] >> 1, nls); 86 de->name_len[0] >> 1, nls);
87 } 87 }
88 if ((len > 2) && (outname[len-2] == ';') && (outname[len-1] == '1')) { 88 if ((len > 2) && (outname[len-2] == ';') && (outname[len-1] == '1'))
89 len -= 2; 89 len -= 2;
90 }
91 90
92 /* 91 /*
93 * Windows doesn't like periods at the end of a name, 92 * Windows doesn't like periods at the end of a name,
94 * so neither do we 93 * so neither do we
95 */ 94 */
96 while (len >= 2 && (outname[len-1] == '.')) { 95 while (len >= 2 && (outname[len-1] == '.'))
97 len--; 96 len--;
98 }
99 97
100 return len; 98 return len;
101} 99}
diff --git a/fs/isofs/namei.c b/fs/isofs/namei.c
index c04b3a14a3e9..c8c7e5138a01 100644
--- a/fs/isofs/namei.c
+++ b/fs/isofs/namei.c
@@ -15,7 +15,7 @@
15 * some sanity tests. 15 * some sanity tests.
16 */ 16 */
17static int 17static int
18isofs_cmp(struct dentry * dentry, const char * compare, int dlen) 18isofs_cmp(struct dentry *dentry, const char *compare, int dlen)
19{ 19{
20 struct qstr qstr; 20 struct qstr qstr;
21 21
@@ -48,24 +48,24 @@ isofs_cmp(struct dentry * dentry, const char * compare, int dlen)
48 */ 48 */
49static unsigned long 49static unsigned long
50isofs_find_entry(struct inode *dir, struct dentry *dentry, 50isofs_find_entry(struct inode *dir, struct dentry *dentry,
51 unsigned long *block_rv, unsigned long* offset_rv, 51 unsigned long *block_rv, unsigned long *offset_rv,
52 char * tmpname, struct iso_directory_record * tmpde) 52 char *tmpname, struct iso_directory_record *tmpde)
53{ 53{
54 unsigned long bufsize = ISOFS_BUFFER_SIZE(dir); 54 unsigned long bufsize = ISOFS_BUFFER_SIZE(dir);
55 unsigned char bufbits = ISOFS_BUFFER_BITS(dir); 55 unsigned char bufbits = ISOFS_BUFFER_BITS(dir);
56 unsigned long block, f_pos, offset, block_saved, offset_saved; 56 unsigned long block, f_pos, offset, block_saved, offset_saved;
57 struct buffer_head * bh = NULL; 57 struct buffer_head *bh = NULL;
58 struct isofs_sb_info *sbi = ISOFS_SB(dir->i_sb); 58 struct isofs_sb_info *sbi = ISOFS_SB(dir->i_sb);
59 59
60 if (!ISOFS_I(dir)->i_first_extent) 60 if (!ISOFS_I(dir)->i_first_extent)
61 return 0; 61 return 0;
62 62
63 f_pos = 0; 63 f_pos = 0;
64 offset = 0; 64 offset = 0;
65 block = 0; 65 block = 0;
66 66
67 while (f_pos < dir->i_size) { 67 while (f_pos < dir->i_size) {
68 struct iso_directory_record * de; 68 struct iso_directory_record *de;
69 int de_len, match, i, dlen; 69 int de_len, match, i, dlen;
70 char *dpnt; 70 char *dpnt;
71 71
@@ -114,7 +114,7 @@ isofs_find_entry(struct inode *dir, struct dentry *dentry,
114 114
115 if (sbi->s_rock && 115 if (sbi->s_rock &&
116 ((i = get_rock_ridge_filename(de, tmpname, dir)))) { 116 ((i = get_rock_ridge_filename(de, tmpname, dir)))) {
117 dlen = i; /* possibly -1 */ 117 dlen = i; /* possibly -1 */
118 dpnt = tmpname; 118 dpnt = tmpname;
119#ifdef CONFIG_JOLIET 119#ifdef CONFIG_JOLIET
120 } else if (sbi->s_joliet_level) { 120 } else if (sbi->s_joliet_level) {
@@ -145,8 +145,8 @@ isofs_find_entry(struct inode *dir, struct dentry *dentry,
145 isofs_normalize_block_and_offset(de, 145 isofs_normalize_block_and_offset(de,
146 &block_saved, 146 &block_saved,
147 &offset_saved); 147 &offset_saved);
148 *block_rv = block_saved; 148 *block_rv = block_saved;
149 *offset_rv = offset_saved; 149 *offset_rv = offset_saved;
150 brelse(bh); 150 brelse(bh);
151 return 1; 151 return 1;
152 } 152 }
@@ -155,7 +155,7 @@ isofs_find_entry(struct inode *dir, struct dentry *dentry,
155 return 0; 155 return 0;
156} 156}
157 157
158struct dentry *isofs_lookup(struct inode * dir, struct dentry * dentry, struct nameidata *nd) 158struct dentry *isofs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
159{ 159{
160 int found; 160 int found;
161 unsigned long block, offset; 161 unsigned long block, offset;
@@ -170,9 +170,9 @@ struct dentry *isofs_lookup(struct inode * dir, struct dentry * dentry, struct n
170 170
171 lock_kernel(); 171 lock_kernel();
172 found = isofs_find_entry(dir, dentry, 172 found = isofs_find_entry(dir, dentry,
173 &block, &offset, 173 &block, &offset,
174 page_address(page), 174 page_address(page),
175 1024 + page_address(page)); 175 1024 + page_address(page));
176 __free_page(page); 176 __free_page(page);
177 177
178 inode = NULL; 178 inode = NULL;
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 1facfaff97cb..a003d50edcdb 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -887,7 +887,8 @@ restart_loop:
887 journal->j_committing_transaction = NULL; 887 journal->j_committing_transaction = NULL;
888 spin_unlock(&journal->j_state_lock); 888 spin_unlock(&journal->j_state_lock);
889 889
890 if (commit_transaction->t_checkpoint_list == NULL) { 890 if (commit_transaction->t_checkpoint_list == NULL &&
891 commit_transaction->t_checkpoint_io_list == NULL) {
891 __journal_drop_transaction(journal, commit_transaction); 892 __journal_drop_transaction(journal, commit_transaction);
892 } else { 893 } else {
893 if (journal->j_checkpoint_transactions == NULL) { 894 if (journal->j_checkpoint_transactions == NULL) {
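
The one-line fix above (mirrored in jbd2 below) tightens the condition for dropping a committed transaction: it must also wait for the checkpoint I/O list to empty, or buffers still queued for checkpoint writeback would lose their transaction. The invariant, reduced to a sketch with stand-in types:

#include <stddef.h>

/* Stand-in for transaction_t; only the two checkpoint lists matter here. */
struct fake_transaction {
	void *t_checkpoint_list;
	void *t_checkpoint_io_list;
};

/* A transaction may be dropped only when both lists have drained. */
static int can_drop_transaction(const struct fake_transaction *t)
{
	return t->t_checkpoint_list == NULL &&
	       t->t_checkpoint_io_list == NULL;
}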
diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c
index 824e3b7d4ec1..8db2fa25170b 100644
--- a/fs/jbd/revoke.c
+++ b/fs/jbd/revoke.c
@@ -68,6 +68,7 @@
68#include <linux/list.h> 68#include <linux/list.h>
69#include <linux/init.h> 69#include <linux/init.h>
70#endif 70#endif
71#include <linux/log2.h>
71 72
72static struct kmem_cache *revoke_record_cache; 73static struct kmem_cache *revoke_record_cache;
73static struct kmem_cache *revoke_table_cache; 74static struct kmem_cache *revoke_table_cache;
@@ -211,7 +212,7 @@ int journal_init_revoke(journal_t *journal, int hash_size)
211 journal->j_revoke = journal->j_revoke_table[0]; 212 journal->j_revoke = journal->j_revoke_table[0];
212 213
213 /* Check that the hash_size is a power of two */ 214 /* Check that the hash_size is a power of two */
214 J_ASSERT ((hash_size & (hash_size-1)) == 0); 215 J_ASSERT(is_power_of_2(hash_size));
215 216
216 journal->j_revoke->hash_size = hash_size; 217 journal->j_revoke->hash_size = hash_size;
217 218
@@ -238,7 +239,7 @@ int journal_init_revoke(journal_t *journal, int hash_size)
238 journal->j_revoke = journal->j_revoke_table[1]; 239 journal->j_revoke = journal->j_revoke_table[1];
239 240
240 /* Check that the hash_size is a power of two */ 241 /* Check that the hash_size is a power of two */
241 J_ASSERT ((hash_size & (hash_size-1)) == 0); 242 J_ASSERT(is_power_of_2(hash_size));
242 243
243 journal->j_revoke->hash_size = hash_size; 244 journal->j_revoke->hash_size = hash_size;
244 245
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 2856e1100a5f..c0f59d1b13dc 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -896,7 +896,8 @@ restart_loop:
896 journal->j_committing_transaction = NULL; 896 journal->j_committing_transaction = NULL;
897 spin_unlock(&journal->j_state_lock); 897 spin_unlock(&journal->j_state_lock);
898 898
899 if (commit_transaction->t_checkpoint_list == NULL) { 899 if (commit_transaction->t_checkpoint_list == NULL &&
900 commit_transaction->t_checkpoint_io_list == NULL) {
900 __jbd2_journal_drop_transaction(journal, commit_transaction); 901 __jbd2_journal_drop_transaction(journal, commit_transaction);
901 } else { 902 } else {
902 if (journal->j_checkpoint_transactions == NULL) { 903 if (journal->j_checkpoint_transactions == NULL) {
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index 9246e763da78..28cac049a56b 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -68,6 +68,7 @@
 #include <linux/list.h>
 #include <linux/init.h>
 #endif
+#include <linux/log2.h>
 
 static struct kmem_cache *jbd2_revoke_record_cache;
 static struct kmem_cache *jbd2_revoke_table_cache;
@@ -212,7 +213,7 @@ int jbd2_journal_init_revoke(journal_t *journal, int hash_size)
 	journal->j_revoke = journal->j_revoke_table[0];
 
 	/* Check that the hash_size is a power of two */
-	J_ASSERT ((hash_size & (hash_size-1)) == 0);
+	J_ASSERT(is_power_of_2(hash_size));
 
 	journal->j_revoke->hash_size = hash_size;
 
@@ -239,7 +240,7 @@ int jbd2_journal_init_revoke(journal_t *journal, int hash_size)
 	journal->j_revoke = journal->j_revoke_table[1];
 
 	/* Check that the hash_size is a power of two */
-	J_ASSERT ((hash_size & (hash_size-1)) == 0);
+	J_ASSERT(is_power_of_2(hash_size));
 
 	journal->j_revoke->hash_size = hash_size;
 
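The four J_ASSERT conversions above are behavior-preserving except at zero: the open-coded test accepts hash_size == 0, while the helper rejects it. For reference, is_power_of_2() as provided by the newly included <linux/log2.h>:

static inline __attribute__((const))
bool is_power_of_2(unsigned long n)
{
	/* true for 1, 2, 4, 8, ...; false for 0 and everything else */
	return (n != 0 && ((n & (n - 1)) == 0));
}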
diff --git a/fs/jffs2/background.c b/fs/jffs2/background.c
index 0c82dfcfd246..143c5530caf3 100644
--- a/fs/jffs2/background.c
+++ b/fs/jffs2/background.c
@@ -81,6 +81,7 @@ static int jffs2_garbage_collect_thread(void *_c)
 
 	set_user_nice(current, 10);
 
+	set_freezable();
 	for (;;) {
 		allow_signal(SIGHUP);
 
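Kernel threads are nonfreezable by default in this series, so every daemon that should stop during suspend must now opt in. A minimal sketch of the pattern (the thread body here is illustrative, not taken from this patch):

#include <linux/freezer.h>
#include <linux/kthread.h>

static int example_daemon(void *data)
{
	set_freezable();		/* opt back in to the freezer */
	while (!kthread_should_stop()) {
		try_to_freeze();	/* park here across suspend/resume */
		/* ... perform one unit of background work ... */
	}
	return 0;
}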
diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h
index 2374b595f2e1..f0ec72b263f1 100644
--- a/fs/jfs/jfs_inode.h
+++ b/fs/jfs/jfs_inode.h
@@ -32,6 +32,7 @@ extern void jfs_truncate_nolock(struct inode *, loff_t);
 extern void jfs_free_zero_link(struct inode *);
 extern struct dentry *jfs_get_parent(struct dentry *dentry);
 extern void jfs_get_inode_flags(struct jfs_inode_info *);
+extern struct dentry *jfs_get_dentry(struct super_block *sb, void *vobjp);
 extern void jfs_set_inode_flags(struct inode *);
 extern int jfs_get_block(struct inode *, sector_t, struct buffer_head *, int);
 
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 25161c4121e4..932797ba433b 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -1477,6 +1477,38 @@ static struct dentry *jfs_lookup(struct inode *dip, struct dentry *dentry, struc
 	return dentry;
 }
 
+struct dentry *jfs_get_dentry(struct super_block *sb, void *vobjp)
+{
+	__u32 *objp = vobjp;
+	unsigned long ino = objp[0];
+	__u32 generation = objp[1];
+	struct inode *inode;
+	struct dentry *result;
+
+	if (ino == 0)
+		return ERR_PTR(-ESTALE);
+	inode = iget(sb, ino);
+	if (inode == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	if (is_bad_inode(inode) ||
+	    (generation && inode->i_generation != generation)) {
+		result = ERR_PTR(-ESTALE);
+		goto out_iput;
+	}
+
+	result = d_alloc_anon(inode);
+	if (!result) {
+		result = ERR_PTR(-ENOMEM);
+		goto out_iput;
+	}
+	return result;
+
+ out_iput:
+	iput(inode);
+	return result;
+}
+
 struct dentry *jfs_get_parent(struct dentry *dentry)
 {
 	struct super_block *sb = dentry->d_inode->i_sb;
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 20e4ac1c79a3..929fceca7999 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -27,6 +27,7 @@
 #include <linux/kthread.h>
 #include <linux/posix_acl.h>
 #include <linux/buffer_head.h>
+#include <linux/exportfs.h>
 #include <asm/uaccess.h>
 #include <linux/seq_file.h>
 
@@ -737,6 +738,7 @@ static const struct super_operations jfs_super_operations = {
 };
 
 static struct export_operations jfs_export_operations = {
+	.get_dentry	= jfs_get_dentry,
 	.get_parent	= jfs_get_parent,
 };
 
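For orientation: exportfs passes get_dentry the (inode number, generation) pair that the default filehandle encoder packed on the way out. A hedged sketch of the decode-side call (caller code assumed, not part of this patch):

	__u32 obj[2] = { ino, generation };	/* as packed at encode time */
	struct dentry *de = jfs_get_dentry(sb, obj);

	if (IS_ERR(de))
		return PTR_ERR(de);	/* -ESTALE if the inode was freed or reused */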
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 26809325469c..82e2192a0d5c 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -25,6 +25,7 @@
 #include <linux/smp.h>
 #include <linux/smp_lock.h>
 #include <linux/mutex.h>
+#include <linux/freezer.h>
 
 #include <linux/sunrpc/types.h>
 #include <linux/sunrpc/stats.h>
@@ -75,18 +76,31 @@ static const int nlm_port_min = 0, nlm_port_max = 65535;
 
 static struct ctl_table_header * nlm_sysctl_table;
 
-static unsigned long set_grace_period(void)
+static unsigned long get_lockd_grace_period(void)
 {
-	unsigned long grace_period;
-
 	/* Note: nlm_timeout should always be nonzero */
 	if (nlm_grace_period)
-		grace_period = ((nlm_grace_period + nlm_timeout - 1)
-				/ nlm_timeout) * nlm_timeout * HZ;
+		return roundup(nlm_grace_period, nlm_timeout) * HZ;
 	else
-		grace_period = nlm_timeout * 5 * HZ;
+		return nlm_timeout * 5 * HZ;
+}
+
+unsigned long get_nfs_grace_period(void)
+{
+	unsigned long lockdgrace = get_lockd_grace_period();
+	unsigned long nfsdgrace = 0;
+
+	if (nlmsvc_ops)
+		nfsdgrace = nlmsvc_ops->get_grace_period();
+
+	return max(lockdgrace, nfsdgrace);
+}
+EXPORT_SYMBOL(get_nfs_grace_period);
+
+static unsigned long set_grace_period(void)
+{
 	nlmsvc_grace_period = 1;
-	return grace_period + jiffies;
+	return get_nfs_grace_period() + jiffies;
 }
 
 static inline void clear_grace_period(void)
@@ -119,6 +133,7 @@ lockd(struct svc_rqst *rqstp)
 	complete(&lockd_start_done);
 
 	daemonize("lockd");
+	set_freezable();
 
 	/* Process request with signals blocked, but allow SIGKILL. */
 	allow_signal(SIGKILL);
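roundup() from <linux/kernel.h> rounds its first argument up to the next multiple of the second, which is exactly what the removed arithmetic spelled out long-hand:

#define roundup(x, y) ((((x) + ((y) - 1)) / (y)) * (y))

/* Example: nlm_grace_period = 13, nlm_timeout = 5:
 *   roundup(13, 5) = ((13 + 4) / 5) * 5 = 15 seconds, then scaled by HZ. */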
diff --git a/fs/mbcache.c b/fs/mbcache.c
index deeb9dc062d9..fbb1d02f8791 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -100,7 +100,6 @@ struct mb_cache {
 static LIST_HEAD(mb_cache_list);
 static LIST_HEAD(mb_cache_lru_list);
 static DEFINE_SPINLOCK(mb_cache_spinlock);
-static struct shrinker *mb_shrinker;
 
 static inline int
 mb_cache_indexes(struct mb_cache *cache)
@@ -118,6 +117,10 @@ mb_cache_indexes(struct mb_cache *cache)
 
 static int mb_cache_shrink_fn(int nr_to_scan, gfp_t gfp_mask);
 
+static struct shrinker mb_cache_shrinker = {
+	.shrink = mb_cache_shrink_fn,
+	.seeks = DEFAULT_SEEKS,
+};
 
 static inline int
 __mb_cache_entry_is_hashed(struct mb_cache_entry *ce)
@@ -662,13 +665,13 @@ mb_cache_entry_find_next(struct mb_cache_entry *prev, int index,
 
 static int __init init_mbcache(void)
 {
-	mb_shrinker = set_shrinker(DEFAULT_SEEKS, mb_cache_shrink_fn);
+	register_shrinker(&mb_cache_shrinker);
 	return 0;
 }
 
 static void __exit exit_mbcache(void)
 {
-	remove_shrinker(mb_shrinker);
+	unregister_shrinker(&mb_cache_shrinker);
 }
 
 module_init(init_mbcache)
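The register_shrinker() conversion moves the struct shrinker into the caller, so registration no longer performs a hidden allocation that can fail. The shape of the new API, with illustrative names:

static int example_shrink(int nr_to_scan, gfp_t gfp_mask)
{
	/* free up to nr_to_scan cached objects; return how many remain */
	return 0;
}

static struct shrinker example_shrinker = {
	.shrink	= example_shrink,
	.seeks	= DEFAULT_SEEKS,
};

/* register_shrinker(&example_shrinker) at module init pairs with
 * unregister_shrinker(&example_shrinker) at exit; neither allocates. */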
diff --git a/fs/namespace.c b/fs/namespace.c
index b696e3a0d18f..4198003d7e18 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -28,6 +28,7 @@
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
 #include "pnode.h"
+#include "internal.h"
 
 /* spinlock for vfsmount related operations, inplace of dcache_lock */
 __cacheline_aligned_in_smp DEFINE_SPINLOCK(vfsmount_lock);
@@ -320,22 +321,16 @@ EXPORT_SYMBOL(mnt_unpin);
 static void *m_start(struct seq_file *m, loff_t *pos)
 {
 	struct mnt_namespace *n = m->private;
-	struct list_head *p;
-	loff_t l = *pos;
 
 	down_read(&namespace_sem);
-	list_for_each(p, &n->list)
-		if (!l--)
-			return list_entry(p, struct vfsmount, mnt_list);
-	return NULL;
+	return seq_list_start(&n->list, *pos);
 }
 
 static void *m_next(struct seq_file *m, void *v, loff_t *pos)
 {
 	struct mnt_namespace *n = m->private;
-	struct list_head *p = ((struct vfsmount *)v)->mnt_list.next;
-	(*pos)++;
-	return p == &n->list ? NULL : list_entry(p, struct vfsmount, mnt_list);
+
+	return seq_list_next(v, &n->list, pos);
 }
 
 static void m_stop(struct seq_file *m, void *v)
@@ -350,7 +345,7 @@ static inline void mangle(struct seq_file *m, const char *s)
 
 static int show_vfsmnt(struct seq_file *m, void *v)
 {
-	struct vfsmount *mnt = v;
+	struct vfsmount *mnt = list_entry(v, struct vfsmount, mnt_list);
 	int err = 0;
 	static struct proc_fs_info {
 		int flag;
@@ -405,7 +400,7 @@ struct seq_operations mounts_op = {
 
 static int show_vfsstat(struct seq_file *m, void *v)
 {
-	struct vfsmount *mnt = v;
+	struct vfsmount *mnt = list_entry(v, struct vfsmount, mnt_list);
 	int err = 0;
 
 	/* device */
@@ -1457,7 +1452,7 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
 
 	new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL);
 	if (!new_ns)
-		return NULL;
+		return ERR_PTR(-ENOMEM);
 
 	atomic_set(&new_ns->count, 1);
 	INIT_LIST_HEAD(&new_ns->list);
@@ -1471,7 +1466,7 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
 	if (!new_ns->root) {
 		up_write(&namespace_sem);
 		kfree(new_ns);
-		return NULL;
+		return ERR_PTR(-ENOMEM);
 	}
 	spin_lock(&vfsmount_lock);
 	list_add_tail(&new_ns->list, &new_ns->root->mnt_list);
@@ -1515,7 +1510,7 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
 	return new_ns;
 }
 
-struct mnt_namespace *copy_mnt_ns(int flags, struct mnt_namespace *ns,
+struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
 		struct fs_struct *new_fs)
 {
 	struct mnt_namespace *new_ns;
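The seq_list_* helpers replace the open-coded list cursors removed above. Simplified versions of the fs/seq_file.c implementations, for reference:

struct list_head *seq_list_start(struct list_head *head, loff_t pos)
{
	struct list_head *lh;

	list_for_each(lh, head)
		if (pos-- == 0)
			return lh;	/* the pos'th entry */
	return NULL;			/* ran off the end of the list */
}

struct list_head *seq_list_next(void *v, struct list_head *head, loff_t *ppos)
{
	struct list_head *lh = ((struct list_head *)v)->next;

	++*ppos;
	return lh == head ? NULL : lh;	/* NULL terminates the walk */
}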
diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c
index d3152f8d95c6..2b145de45b39 100644
--- a/fs/ncpfs/file.c
+++ b/fs/ncpfs/file.c
@@ -203,7 +203,6 @@ ncp_file_write(struct file *file, const char __user *buf, size_t count, loff_t *
 
 	if (pos + count > MAX_NON_LFS && !(file->f_flags&O_LARGEFILE)) {
 		if (pos >= MAX_NON_LFS) {
-			send_sig(SIGXFSZ, current, 0);
 			return -EFBIG;
 		}
 		if (count > MAX_NON_LFS - (u32)pos) {
@@ -212,7 +211,6 @@ ncp_file_write(struct file *file, const char __user *buf, size_t count, loff_t *
 	}
 	if (pos >= inode->i_sb->s_maxbytes) {
 		if (count || pos > inode->i_sb->s_maxbytes) {
-			send_sig(SIGXFSZ, current, 0);
 			return -EFBIG;
 		}
 	}
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 75f309c8741a..a796be5051bf 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -14,6 +14,7 @@
 #include <linux/sunrpc/svcsock.h>
 #include <linux/nfs_fs.h>
 #include <linux/mutex.h>
+#include <linux/freezer.h>
 
 #include <net/inet_sock.h>
 
@@ -67,6 +68,7 @@ static void nfs_callback_svc(struct svc_rqst *rqstp)
 	daemonize("nfsv4-svc");
 	/* Process request with signals blocked, but allow SIGKILL. */
 	allow_signal(SIGKILL);
+	set_freezable();
 
 	complete(&nfs_callback_info.started);
 
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index ccb455053ee4..a49f9feff776 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -1206,23 +1206,9 @@ static int nfs_server_list_open(struct inode *inode, struct file *file)
  */
 static void *nfs_server_list_start(struct seq_file *m, loff_t *_pos)
 {
-	struct list_head *_p;
-	loff_t pos = *_pos;
-
 	/* lock the list against modification */
 	spin_lock(&nfs_client_lock);
-
-	/* allow for the header line */
-	if (!pos)
-		return SEQ_START_TOKEN;
-	pos--;
-
-	/* find the n'th element in the list */
-	list_for_each(_p, &nfs_client_list)
-		if (!pos--)
-			break;
-
-	return _p != &nfs_client_list ? _p : NULL;
+	return seq_list_start_head(&nfs_client_list, *_pos);
 }
 
 /*
@@ -1230,14 +1216,7 @@ static void *nfs_server_list_start(struct seq_file *m, loff_t *_pos)
  */
 static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos)
 {
-	struct list_head *_p;
-
-	(*pos)++;
-
-	_p = v;
-	_p = (v == SEQ_START_TOKEN) ? nfs_client_list.next : _p->next;
-
-	return _p != &nfs_client_list ? _p : NULL;
+	return seq_list_next(v, &nfs_client_list, pos);
 }
 
 /*
@@ -1256,7 +1235,7 @@ static int nfs_server_list_show(struct seq_file *m, void *v)
 	struct nfs_client *clp;
 
 	/* display header on line 1 */
-	if (v == SEQ_START_TOKEN) {
+	if (v == &nfs_client_list) {
 		seq_puts(m, "NV SERVER PORT USE HOSTNAME\n");
 		return 0;
 	}
@@ -1297,23 +1276,9 @@ static int nfs_volume_list_open(struct inode *inode, struct file *file)
  */
 static void *nfs_volume_list_start(struct seq_file *m, loff_t *_pos)
 {
-	struct list_head *_p;
-	loff_t pos = *_pos;
-
 	/* lock the list against modification */
 	spin_lock(&nfs_client_lock);
-
-	/* allow for the header line */
-	if (!pos)
-		return SEQ_START_TOKEN;
-	pos--;
-
-	/* find the n'th element in the list */
-	list_for_each(_p, &nfs_volume_list)
-		if (!pos--)
-			break;
-
-	return _p != &nfs_volume_list ? _p : NULL;
+	return seq_list_start_head(&nfs_volume_list, *_pos);
 }
 
 /*
@@ -1321,14 +1286,7 @@ static void *nfs_volume_list_start(struct seq_file *m, loff_t *_pos)
  */
 static void *nfs_volume_list_next(struct seq_file *p, void *v, loff_t *pos)
 {
-	struct list_head *_p;
-
-	(*pos)++;
-
-	_p = v;
-	_p = (v == SEQ_START_TOKEN) ? nfs_volume_list.next : _p->next;
-
-	return _p != &nfs_volume_list ? _p : NULL;
+	return seq_list_next(v, &nfs_volume_list, pos);
 }
 
 /*
@@ -1349,7 +1307,7 @@ static int nfs_volume_list_show(struct seq_file *m, void *v)
 	char dev[8], fsid[17];
 
 	/* display header on line 1 */
-	if (v == SEQ_START_TOKEN) {
+	if (v == &nfs_volume_list) {
 		seq_puts(m, "NV SERVER PORT DEV FSID\n");
 		return 0;
 	}
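seq_list_start_head() differs from seq_list_start() only in returning the list head itself at position 0; ->show() then treats v == &list as the header token, which is why SEQ_START_TOKEN disappears above. Simplified:

struct list_head *seq_list_start_head(struct list_head *head, loff_t pos)
{
	if (!pos)
		return head;		/* position 0: caller prints the header */
	return seq_list_start(head, pos - 1);
}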
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index a2b1af89ca1a..adffe1615c51 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -300,7 +300,10 @@ static const struct super_operations nfs4_sops = {
 };
 #endif
 
-static struct shrinker *acl_shrinker;
+static struct shrinker acl_shrinker = {
+	.shrink = nfs_access_cache_shrinker,
+	.seeks = DEFAULT_SEEKS,
+};
 
 /*
  * Register the NFS filesystems
@@ -321,7 +324,7 @@ int __init register_nfs_fs(void)
 	if (ret < 0)
 		goto error_2;
 #endif
-	acl_shrinker = set_shrinker(DEFAULT_SEEKS, nfs_access_cache_shrinker);
+	register_shrinker(&acl_shrinker);
 	return 0;
 
 #ifdef CONFIG_NFS_V4
@@ -339,8 +342,7 @@ error_0:
  */
 void __exit unregister_nfs_fs(void)
 {
-	if (acl_shrinker != NULL)
-		remove_shrinker(acl_shrinker);
+	unregister_shrinker(&acl_shrinker);
 #ifdef CONFIG_NFS_V4
 	unregister_filesystem(&nfs4_fs_type);
 	nfs_unregister_sysctl();
diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c
index 6e92b0fe5323..cf61dc8ae942 100644
--- a/fs/nfsd/auth.c
+++ b/fs/nfsd/auth.c
@@ -12,17 +12,31 @@
 
 #define	CAP_NFSD_MASK (CAP_FS_MASK|CAP_TO_MASK(CAP_SYS_RESOURCE))
 
+static int nfsexp_flags(struct svc_rqst *rqstp, struct svc_export *exp)
+{
+	struct exp_flavor_info *f;
+	struct exp_flavor_info *end = exp->ex_flavors + exp->ex_nflavors;
+
+	for (f = exp->ex_flavors; f < end; f++) {
+		if (f->pseudoflavor == rqstp->rq_flavor)
+			return f->flags;
+	}
+	return exp->ex_flags;
+
+}
+
 int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp)
 {
 	struct svc_cred	cred = rqstp->rq_cred;
 	int i;
+	int flags = nfsexp_flags(rqstp, exp);
 	int ret;
 
-	if (exp->ex_flags & NFSEXP_ALLSQUASH) {
+	if (flags & NFSEXP_ALLSQUASH) {
 		cred.cr_uid = exp->ex_anon_uid;
 		cred.cr_gid = exp->ex_anon_gid;
 		cred.cr_group_info = groups_alloc(0);
-	} else if (exp->ex_flags & NFSEXP_ROOTSQUASH) {
+	} else if (flags & NFSEXP_ROOTSQUASH) {
 		struct group_info *gi;
 		if (!cred.cr_uid)
 			cred.cr_uid = exp->ex_anon_uid;
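With secinfo entries attached to an export, squashing can now vary by authentication flavor. Illustrative only — the flavor/flags pairing below is an assumed example, not taken from this patch:

/* Suppose the export carries
 *	ex_flavors[0] = { .pseudoflavor = RPC_AUTH_GSS_KRB5, .flags = 0 }
 * while ex_flags has NFSEXP_ROOTSQUASH set.  A krb5 request matches the
 * secinfo entry and escapes root-squashing; an AUTH_UNIX request falls
 * through to exp->ex_flags and is squashed as before. */
int flags = nfsexp_flags(rqstp, exp);

if ((flags & NFSEXP_ROOTSQUASH) && !cred.cr_uid)
	cred.cr_uid = exp->ex_anon_uid;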
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 79bd03b8bbf8..c7bbf460b009 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -26,12 +26,15 @@
 #include <linux/mount.h>
 #include <linux/hash.h>
 #include <linux/module.h>
+#include <linux/exportfs.h>
 
 #include <linux/sunrpc/svc.h>
 #include <linux/nfsd/nfsd.h>
 #include <linux/nfsd/nfsfh.h>
 #include <linux/nfsd/syscall.h>
 #include <linux/lockd/bind.h>
+#include <linux/sunrpc/msg_prot.h>
+#include <linux/sunrpc/gss_api.h>
 
 #define NFSDDBG_FACILITY	NFSDDBG_EXPORT
 
@@ -451,8 +454,48 @@ out_free_all:
 	return err;
 }
 
+static int secinfo_parse(char **mesg, char *buf, struct svc_export *exp)
+{
+	int listsize, err;
+	struct exp_flavor_info *f;
+
+	err = get_int(mesg, &listsize);
+	if (err)
+		return err;
+	if (listsize < 0 || listsize > MAX_SECINFO_LIST)
+		return -EINVAL;
+
+	for (f = exp->ex_flavors; f < exp->ex_flavors + listsize; f++) {
+		err = get_int(mesg, &f->pseudoflavor);
+		if (err)
+			return err;
+		/*
+		 * Just a quick sanity check; we could also try to check
+		 * whether this pseudoflavor is supported, but at worst
+		 * an unsupported pseudoflavor on the export would just
+		 * be a pseudoflavor that won't match the flavor of any
+		 * authenticated request.  The administrator will
+		 * probably discover the problem when someone fails to
+		 * authenticate.
+		 */
+		if (f->pseudoflavor < 0)
+			return -EINVAL;
+		err = get_int(mesg, &f->flags);
+		if (err)
+			return err;
+		/* Only some flags are allowed to differ between flavors: */
+		if (~NFSEXP_SECINFO_FLAGS & (f->flags ^ exp->ex_flags))
+			return -EINVAL;
+	}
+	exp->ex_nflavors = listsize;
+	return 0;
+}
+
 #else /* CONFIG_NFSD_V4 */
-static inline int fsloc_parse(char **mesg, char *buf, struct nfsd4_fs_locations *fsloc) { return 0; }
+static inline int
+fsloc_parse(char **mesg, char *buf, struct nfsd4_fs_locations *fsloc){return 0;}
+static inline int
+secinfo_parse(char **mesg, char *buf, struct svc_export *exp) { return 0; }
 #endif
 
 static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
@@ -476,6 +519,9 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
 
 	exp.ex_uuid = NULL;
 
+	/* secinfo */
+	exp.ex_nflavors = 0;
+
 	if (mesg[mlen-1] != '\n')
 		return -EINVAL;
 	mesg[mlen-1] = 0;
@@ -553,7 +599,9 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
 				if (exp.ex_uuid == NULL)
 					err = -ENOMEM;
 			}
-		} else
+		} else if (strcmp(buf, "secinfo") == 0)
+			err = secinfo_parse(&mesg, buf, &exp);
+		else
 			/* quietly ignore unknown words and anything
 			 * following. Newer user-space can try to set
 			 * new values, then see what the result was.
@@ -593,6 +641,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
 
 static void exp_flags(struct seq_file *m, int flag, int fsid,
 		uid_t anonu, uid_t anong, struct nfsd4_fs_locations *fslocs);
+static void show_secinfo(struct seq_file *m, struct svc_export *exp);
 
 static int svc_export_show(struct seq_file *m,
 			   struct cache_detail *cd,
@@ -622,6 +671,7 @@ static int svc_export_show(struct seq_file *m,
 				seq_printf(m, "%02x", exp->ex_uuid[i]);
 			}
 		}
+		show_secinfo(m, exp);
 	}
 	seq_puts(m, ")\n");
 	return 0;
@@ -654,6 +704,7 @@ static void export_update(struct cache_head *cnew, struct cache_head *citem)
 {
 	struct svc_export *new = container_of(cnew, struct svc_export, h);
 	struct svc_export *item = container_of(citem, struct svc_export, h);
+	int i;
 
 	new->ex_flags = item->ex_flags;
 	new->ex_anon_uid = item->ex_anon_uid;
@@ -669,6 +720,10 @@ static void export_update(struct cache_head *cnew, struct cache_head *citem)
 	item->ex_fslocs.locations_count = 0;
 	new->ex_fslocs.migrated = item->ex_fslocs.migrated;
 	item->ex_fslocs.migrated = 0;
+	new->ex_nflavors = item->ex_nflavors;
+	for (i = 0; i < MAX_SECINFO_LIST; i++) {
+		new->ex_flavors[i] = item->ex_flavors[i];
+	}
 }
 
 static struct cache_head *svc_export_alloc(void)
@@ -738,16 +793,18 @@ exp_find_key(svc_client *clp, int fsid_type, u32 *fsidv, struct cache_req *reqp)
 	int err;
 
 	if (!clp)
-		return NULL;
+		return ERR_PTR(-ENOENT);
 
 	key.ek_client = clp;
 	key.ek_fsidtype = fsid_type;
 	memcpy(key.ek_fsid, fsidv, key_len(fsid_type));
 
 	ek = svc_expkey_lookup(&key);
-	if (ek != NULL)
-		if ((err = cache_check(&svc_expkey_cache, &ek->h, reqp)))
-			ek = ERR_PTR(err);
+	if (ek == NULL)
+		return ERR_PTR(-ENOMEM);
+	err = cache_check(&svc_expkey_cache, &ek->h, reqp);
+	if (err)
+		return ERR_PTR(err);
 	return ek;
 }
 
@@ -808,30 +865,21 @@ exp_get_by_name(svc_client *clp, struct vfsmount *mnt, struct dentry *dentry,
 		struct cache_req *reqp)
 {
 	struct svc_export *exp, key;
+	int err;
 
 	if (!clp)
-		return NULL;
+		return ERR_PTR(-ENOENT);
 
 	key.ex_client = clp;
 	key.ex_mnt = mnt;
 	key.ex_dentry = dentry;
 
 	exp = svc_export_lookup(&key);
-	if (exp != NULL) {
-		int err;
-
-		err = cache_check(&svc_export_cache, &exp->h, reqp);
-		switch (err) {
-		case 0: break;
-		case -EAGAIN:
-		case -ETIMEDOUT:
-			exp = ERR_PTR(err);
-			break;
-		default:
-			exp = NULL;
-		}
-	}
-
+	if (exp == NULL)
+		return ERR_PTR(-ENOMEM);
+	err = cache_check(&svc_export_cache, &exp->h, reqp);
+	if (err)
+		return ERR_PTR(err);
 	return exp;
 }
 
@@ -847,7 +895,7 @@ exp_parent(svc_client *clp, struct vfsmount *mnt, struct dentry *dentry,
 	dget(dentry);
 	exp = exp_get_by_name(clp, mnt, dentry, reqp);
 
-	while (exp == NULL && !IS_ROOT(dentry)) {
+	while (PTR_ERR(exp) == -ENOENT && !IS_ROOT(dentry)) {
 		struct dentry *parent;
 
 		parent = dget_parent(dentry);
@@ -900,7 +948,7 @@ static void exp_fsid_unhash(struct svc_export *exp)
 		return;
 
 	ek = exp_get_fsid_key(exp->ex_client, exp->ex_fsid);
-	if (ek && !IS_ERR(ek)) {
+	if (!IS_ERR(ek)) {
 		ek->h.expiry_time = get_seconds()-1;
 		cache_put(&ek->h, &svc_expkey_cache);
 	}
@@ -938,7 +986,7 @@ static void exp_unhash(struct svc_export *exp)
 	struct inode *inode = exp->ex_dentry->d_inode;
 
 	ek = exp_get_key(exp->ex_client, inode->i_sb->s_dev, inode->i_ino);
-	if (ek && !IS_ERR(ek)) {
+	if (!IS_ERR(ek)) {
 		ek->h.expiry_time = get_seconds()-1;
 		cache_put(&ek->h, &svc_expkey_cache);
 	}
@@ -989,13 +1037,12 @@ exp_export(struct nfsctl_export *nxp)
 
 	/* must make sure there won't be an ex_fsid clash */
 	if ((nxp->ex_flags & NFSEXP_FSID) &&
-	    (fsid_key = exp_get_fsid_key(clp, nxp->ex_dev)) &&
-	    !IS_ERR(fsid_key) &&
+	    (!IS_ERR(fsid_key = exp_get_fsid_key(clp, nxp->ex_dev))) &&
 	    fsid_key->ek_mnt &&
 	    (fsid_key->ek_mnt != nd.mnt || fsid_key->ek_dentry != nd.dentry) )
 		goto finish;
 
-	if (exp) {
+	if (!IS_ERR(exp)) {
 		/* just a flags/id/fsid update */
 
 		exp_fsid_unhash(exp);
@@ -1104,7 +1151,7 @@ exp_unexport(struct nfsctl_export *nxp)
 	err = -EINVAL;
 	exp = exp_get_by_name(dom, nd.mnt, nd.dentry, NULL);
 	path_release(&nd);
-	if (!exp)
+	if (IS_ERR(exp))
 		goto out_domain;
 
 	exp_do_unexport(exp);
@@ -1149,10 +1196,6 @@ exp_rootfh(svc_client *clp, char *path, struct knfsd_fh *f, int maxsize)
 		err = PTR_ERR(exp);
 		goto out;
 	}
-	if (!exp) {
-		dprintk("nfsd: exp_rootfh export not found.\n");
-		goto out;
-	}
 
 	/*
 	 * fh must be initialized before calling fh_compose
@@ -1176,17 +1219,130 @@ exp_find(struct auth_domain *clp, int fsid_type, u32 *fsidv,
 {
 	struct svc_export *exp;
 	struct svc_expkey *ek = exp_find_key(clp, fsid_type, fsidv, reqp);
-	if (!ek || IS_ERR(ek))
+	if (IS_ERR(ek))
 		return ERR_PTR(PTR_ERR(ek));
 
 	exp = exp_get_by_name(clp, ek->ek_mnt, ek->ek_dentry, reqp);
 	cache_put(&ek->h, &svc_expkey_cache);
 
-	if (!exp || IS_ERR(exp))
+	if (IS_ERR(exp))
 		return ERR_PTR(PTR_ERR(exp));
 	return exp;
 }
 
+__be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp)
+{
+	struct exp_flavor_info *f;
+	struct exp_flavor_info *end = exp->ex_flavors + exp->ex_nflavors;
+
+	/* legacy gss-only clients are always OK: */
+	if (exp->ex_client == rqstp->rq_gssclient)
+		return 0;
+	/* ip-address based client; check sec= export option: */
+	for (f = exp->ex_flavors; f < end; f++) {
+		if (f->pseudoflavor == rqstp->rq_flavor)
+			return 0;
+	}
+	/* defaults in absence of sec= options: */
+	if (exp->ex_nflavors == 0) {
+		if (rqstp->rq_flavor == RPC_AUTH_NULL ||
+		    rqstp->rq_flavor == RPC_AUTH_UNIX)
+			return 0;
+	}
+	return nfserr_wrongsec;
+}
+
+/*
+ * Uses rq_client and rq_gssclient to find an export; uses rq_client (an
+ * auth_unix client) if it's available and has secinfo information;
+ * otherwise, will try to use rq_gssclient.
+ *
+ * Called from functions that handle requests; functions that do work on
+ * behalf of mountd are passed a single client name to use, and should
+ * use exp_get_by_name() or exp_find().
+ */
+struct svc_export *
+rqst_exp_get_by_name(struct svc_rqst *rqstp, struct vfsmount *mnt,
+		struct dentry *dentry)
+{
+	struct svc_export *gssexp, *exp = NULL;
+
+	if (rqstp->rq_client == NULL)
+		goto gss;
+
+	/* First try the auth_unix client: */
+	exp = exp_get_by_name(rqstp->rq_client, mnt, dentry,
+						&rqstp->rq_chandle);
+	if (PTR_ERR(exp) == -ENOENT)
+		goto gss;
+	if (IS_ERR(exp))
+		return exp;
+	/* If it has secinfo, assume there are no gss/... clients */
+	if (exp->ex_nflavors > 0)
+		return exp;
+gss:
+	/* Otherwise, try falling back on gss client */
+	if (rqstp->rq_gssclient == NULL)
+		return exp;
+	gssexp = exp_get_by_name(rqstp->rq_gssclient, mnt, dentry,
+						&rqstp->rq_chandle);
+	if (PTR_ERR(gssexp) == -ENOENT)
+		return exp;
+	if (exp && !IS_ERR(exp))
+		exp_put(exp);
+	return gssexp;
+}
+
+struct svc_export *
+rqst_exp_find(struct svc_rqst *rqstp, int fsid_type, u32 *fsidv)
+{
+	struct svc_export *gssexp, *exp = NULL;
+
+	if (rqstp->rq_client == NULL)
+		goto gss;
+
+	/* First try the auth_unix client: */
+	exp = exp_find(rqstp->rq_client, fsid_type, fsidv, &rqstp->rq_chandle);
+	if (PTR_ERR(exp) == -ENOENT)
+		goto gss;
+	if (IS_ERR(exp))
+		return exp;
+	/* If it has secinfo, assume there are no gss/... clients */
+	if (exp->ex_nflavors > 0)
+		return exp;
+gss:
+	/* Otherwise, try falling back on gss client */
+	if (rqstp->rq_gssclient == NULL)
+		return exp;
+	gssexp = exp_find(rqstp->rq_gssclient, fsid_type, fsidv,
+						&rqstp->rq_chandle);
+	if (PTR_ERR(gssexp) == -ENOENT)
+		return exp;
+	if (exp && !IS_ERR(exp))
+		exp_put(exp);
+	return gssexp;
+}
+
+struct svc_export *
+rqst_exp_parent(struct svc_rqst *rqstp, struct vfsmount *mnt,
+		struct dentry *dentry)
+{
+	struct svc_export *exp;
+
+	dget(dentry);
+	exp = rqst_exp_get_by_name(rqstp, mnt, dentry);
+
+	while (PTR_ERR(exp) == -ENOENT && !IS_ROOT(dentry)) {
+		struct dentry *parent;
+
+		parent = dget_parent(dentry);
+		dput(dentry);
+		dentry = parent;
+		exp = rqst_exp_get_by_name(rqstp, mnt, dentry);
+	}
+	dput(dentry);
+	return exp;
+}
 
 /*
  * Called when we need the filehandle for the root of the pseudofs,
@@ -1194,8 +1350,7 @@ exp_find(struct auth_domain *clp, int fsid_type, u32 *fsidv,
 * export point with fsid==0
 */
 __be32
-exp_pseudoroot(struct auth_domain *clp, struct svc_fh *fhp,
-	       struct cache_req *creq)
+exp_pseudoroot(struct svc_rqst *rqstp, struct svc_fh *fhp)
 {
 	struct svc_export *exp;
 	__be32 rv;
@@ -1203,12 +1358,16 @@ exp_pseudoroot(struct auth_domain *clp, struct svc_fh *fhp,
 
 	mk_fsid(FSID_NUM, fsidv, 0, 0, 0, NULL);
 
-	exp = exp_find(clp, FSID_NUM, fsidv, creq);
+	exp = rqst_exp_find(rqstp, FSID_NUM, fsidv);
+	if (PTR_ERR(exp) == -ENOENT)
+		return nfserr_perm;
 	if (IS_ERR(exp))
 		return nfserrno(PTR_ERR(exp));
-	if (exp == NULL)
-		return nfserr_perm;
 	rv = fh_compose(fhp, exp, exp->ex_dentry, NULL);
+	if (rv)
+		goto out;
+	rv = check_nfsd_access(exp, rqstp);
+out:
 	exp_put(exp);
 	return rv;
 }
@@ -1296,28 +1455,62 @@ static struct flags {
 	{ 0, {"", ""}}
 };
 
-static void exp_flags(struct seq_file *m, int flag, int fsid,
-		uid_t anonu, uid_t anong, struct nfsd4_fs_locations *fsloc)
+static void show_expflags(struct seq_file *m, int flags, int mask)
 {
-	int first = 0;
 	struct flags *flg;
+	int state, first = 0;
 
 	for (flg = expflags; flg->flag; flg++) {
-		int state = (flg->flag & flag)?0:1;
+		if (flg->flag & ~mask)
+			continue;
+		state = (flg->flag & flags) ? 0 : 1;
 		if (*flg->name[state])
 			seq_printf(m, "%s%s", first++?",":"", flg->name[state]);
 	}
+}
+
+static void show_secinfo_flags(struct seq_file *m, int flags)
+{
+	seq_printf(m, ",");
+	show_expflags(m, flags, NFSEXP_SECINFO_FLAGS);
+}
+
+static void show_secinfo(struct seq_file *m, struct svc_export *exp)
+{
+	struct exp_flavor_info *f;
+	struct exp_flavor_info *end = exp->ex_flavors + exp->ex_nflavors;
+	int lastflags = 0, first = 0;
+
+	if (exp->ex_nflavors == 0)
+		return;
+	for (f = exp->ex_flavors; f < end; f++) {
+		if (first || f->flags != lastflags) {
+			if (!first)
+				show_secinfo_flags(m, lastflags);
+			seq_printf(m, ",sec=%d", f->pseudoflavor);
+			lastflags = f->flags;
+		} else {
+			seq_printf(m, ":%d", f->pseudoflavor);
+		}
+	}
+	show_secinfo_flags(m, lastflags);
+}
+
+static void exp_flags(struct seq_file *m, int flag, int fsid,
+		uid_t anonu, uid_t anong, struct nfsd4_fs_locations *fsloc)
+{
+	show_expflags(m, flag, NFSEXP_ALLFLAGS);
 	if (flag & NFSEXP_FSID)
-		seq_printf(m, "%sfsid=%d", first++?",":"", fsid);
+		seq_printf(m, ",fsid=%d", fsid);
 	if (anonu != (uid_t)-2 && anonu != (0x10000-2))
-		seq_printf(m, "%sanonuid=%d", first++?",":"", anonu);
+		seq_printf(m, ",anonuid=%d", anonu);
 	if (anong != (gid_t)-2 && anong != (0x10000-2))
-		seq_printf(m, "%sanongid=%d", first++?",":"", anong);
+		seq_printf(m, ",anongid=%d", anong);
 	if (fsloc && fsloc->locations_count > 0) {
 		char *loctype = (fsloc->migrated) ? "refer" : "replicas";
 		int i;
 
-		seq_printf(m, "%s%s=", first++?",":"", loctype);
+		seq_printf(m, ",%s=", loctype);
 		seq_escape(m, fsloc->locations[0].path, ",;@ \t\n\\");
 		seq_putc(m, '@');
 		seq_escape(m, fsloc->locations[0].hosts, ",;@ \t\n\\");
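Much of this file's churn is one convention change: export lookups no longer return NULL for "not found"; every failure, including -ENOENT, travels in the pointer itself. The <linux/err.h> idiom, as a minimal sketch:

#include <linux/err.h>

static long demo(struct svc_export *exp)	/* hypothetical helper */
{
	if (IS_ERR(exp))		/* true for any ERR_PTR(-errno) value */
		return PTR_ERR(exp);	/* recover the errno, e.g. -ENOENT */
	return 0;			/* a real pointer: success */
}

Callers that used to test for NULL now test PTR_ERR(exp) == -ENOENT, which is exactly what the exp_parent() and rqst_exp_parent() loops above do.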
diff --git a/fs/nfsd/lockd.c b/fs/nfsd/lockd.c
index 221acd1f11f6..9e4a568a5013 100644
--- a/fs/nfsd/lockd.c
+++ b/fs/nfsd/lockd.c
@@ -65,6 +65,7 @@ nlm_fclose(struct file *filp)
 static struct nlmsvc_binding	nfsd_nlm_ops = {
 	.fopen		= nlm_fopen,		/* open file for locking */
 	.fclose		= nlm_fclose,		/* close file */
+	.get_grace_period = get_nfs4_grace_period,
 };
 
 void
diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c
index cc3b7badd486..b6ed38380ab8 100644
--- a/fs/nfsd/nfs4acl.c
+++ b/fs/nfsd/nfs4acl.c
@@ -183,8 +183,13 @@ static void
 summarize_posix_acl(struct posix_acl *acl, struct posix_acl_summary *pas)
 {
 	struct posix_acl_entry *pa, *pe;
-	pas->users = 0;
-	pas->groups = 0;
+
+	/*
+	 * Only pas.users and pas.groups need initialization; previous
+	 * posix_acl_valid() calls ensure that the other fields will be
+	 * initialized in the following loop.  But, just to placate gcc:
+	 */
+	memset(pas, 0, sizeof(*pas));
 	pas->mask = 07;
 
 	pe = acl->a_entries + acl->a_count;
@@ -732,13 +737,16 @@ int nfs4_acl_nfsv4_to_posix(struct nfs4_acl *acl, struct posix_acl **pacl,
 	*pacl = posix_state_to_acl(&effective_acl_state, flags);
 	if (IS_ERR(*pacl)) {
 		ret = PTR_ERR(*pacl);
+		*pacl = NULL;
 		goto out_dstate;
 	}
 	*dpacl = posix_state_to_acl(&default_acl_state,
 				flags | NFS4_ACL_TYPE_DEFAULT);
 	if (IS_ERR(*dpacl)) {
 		ret = PTR_ERR(*dpacl);
+		*dpacl = NULL;
 		posix_acl_release(*pacl);
+		*pacl = NULL;
 		goto out_dstate;
 	}
 	sort_pacl(*pacl);
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 5443c52b57aa..31d6633c7fe4 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -75,7 +75,7 @@ enum nfs_cb_opnum4 {
 #define op_enc_sz			1
 #define op_dec_sz			2
 #define enc_nfs4_fh_sz			(1 + (NFS4_FHSIZE >> 2))
-#define enc_stateid_sz			16
+#define enc_stateid_sz			(NFS4_STATEID_SIZE >> 2)
 #define NFS4_enc_cb_recall_sz		(cb_compound_enc_hdr_sz + \
 					1 + enc_stateid_sz + \
 					enc_nfs4_fh_sz)
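(These enc_*_sz constants count 32-bit XDR words — note enc_nfs4_fh_sz just above. NFS4_STATEID_SIZE is 16 bytes, so the new expression evaluates to 4 words; the old literal 16 appears to have over-reserved the stateid by a factor of four.)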
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index 45aa21ce6784..2cf9a9a2d89c 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -587,6 +587,15 @@ idmap_lookup(struct svc_rqst *rqstp,
 	return ret;
 }
 
+static char *
+rqst_authname(struct svc_rqst *rqstp)
+{
+	struct auth_domain *clp;
+
+	clp = rqstp->rq_gssclient ? rqstp->rq_gssclient : rqstp->rq_client;
+	return clp->name;
+}
+
 static int
 idmap_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen,
 		uid_t *id)
@@ -600,7 +609,7 @@ idmap_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen
 		return -EINVAL;
 	memcpy(key.name, name, namelen);
 	key.name[namelen] = '\0';
-	strlcpy(key.authname, rqstp->rq_client->name, sizeof(key.authname));
+	strlcpy(key.authname, rqst_authname(rqstp), sizeof(key.authname));
 	ret = idmap_lookup(rqstp, nametoid_lookup, &key, &nametoid_cache, &item);
 	if (ret == -ENOENT)
 		ret = -ESRCH; /* nfserr_badname */
@@ -620,7 +629,7 @@ idmap_id_to_name(struct svc_rqst *rqstp, int type, uid_t id, char *name)
 	};
 	int ret;
 
-	strlcpy(key.authname, rqstp->rq_client->name, sizeof(key.authname));
+	strlcpy(key.authname, rqst_authname(rqstp), sizeof(key.authname));
 	ret = idmap_lookup(rqstp, idtoname_lookup, &key, &idtoname_cache, &item);
 	if (ret == -ENOENT)
 		return sprintf(name, "%u", id);
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 8522729830db..3c627128e205 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -47,6 +47,7 @@
 #include <linux/nfsd/state.h>
 #include <linux/nfsd/xdr4.h>
 #include <linux/nfs4_acl.h>
+#include <linux/sunrpc/gss_api.h>
 
 #define NFSDDBG_FACILITY		NFSDDBG_PROC
 
@@ -286,8 +287,7 @@ nfsd4_putrootfh(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	__be32 status;
 
 	fh_put(&cstate->current_fh);
-	status = exp_pseudoroot(rqstp->rq_client, &cstate->current_fh,
-			      &rqstp->rq_chandle);
+	status = exp_pseudoroot(rqstp, &cstate->current_fh);
 	return status;
 }
 
@@ -474,8 +474,8 @@ nfsd4_lookupp(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	__be32 ret;
 
 	fh_init(&tmp_fh, NFS4_FHSIZE);
-	if((ret = exp_pseudoroot(rqstp->rq_client, &tmp_fh,
-			      &rqstp->rq_chandle)) != 0)
+	ret = exp_pseudoroot(rqstp, &tmp_fh);
+	if (ret)
 		return ret;
 	if (tmp_fh.fh_dentry == cstate->current_fh.fh_dentry) {
 		fh_put(&tmp_fh);
@@ -611,6 +611,30 @@ nfsd4_rename(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 }
 
 static __be32
+nfsd4_secinfo(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+	      struct nfsd4_secinfo *secinfo)
+{
+	struct svc_fh resfh;
+	struct svc_export *exp;
+	struct dentry *dentry;
+	__be32 err;
+
+	fh_init(&resfh, NFS4_FHSIZE);
+	err = nfsd_lookup_dentry(rqstp, &cstate->current_fh,
+				    secinfo->si_name, secinfo->si_namelen,
+				    &exp, &dentry);
+	if (err)
+		return err;
+	if (dentry->d_inode == NULL) {
+		exp_put(exp);
+		err = nfserr_noent;
+	} else
+		secinfo->si_exp = exp;
+	dput(dentry);
+	return err;
+}
+
+static __be32
 nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	      struct nfsd4_setattr *setattr)
 {
@@ -1009,6 +1033,9 @@ static struct nfsd4_operation nfsd4_ops[OP_RELEASE_LOCKOWNER+1] = {
 	[OP_SAVEFH] = {
 		.op_func = (nfsd4op_func)nfsd4_savefh,
 	},
+	[OP_SECINFO] = {
+		.op_func = (nfsd4op_func)nfsd4_secinfo,
+	},
 	[OP_SETATTR] = {
 		.op_func = (nfsd4op_func)nfsd4_setattr,
 	},
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 8c52913d7cb6..e4a4c87ec8c6 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -49,8 +49,10 @@
 #include <linux/nfsd/state.h>
 #include <linux/nfsd/xdr4.h>
 #include <linux/namei.h>
+#include <linux/swap.h>
 #include <linux/mutex.h>
 #include <linux/lockd/bind.h>
+#include <linux/module.h>
 
 #define NFSDDBG_FACILITY                NFSDDBG_PROC
 
@@ -149,6 +151,7 @@ get_nfs4_file(struct nfs4_file *fi)
 }
 
 static int num_delegations;
+unsigned int max_delegations;
 
 /*
  * Open owner state (share locks)
@@ -192,7 +195,9 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
 	struct nfs4_callback *cb = &stp->st_stateowner->so_client->cl_callback;
 
 	dprintk("NFSD alloc_init_deleg\n");
-	if (num_delegations > STATEID_HASH_SIZE * 4)
+	if (fp->fi_had_conflict)
+		return NULL;
+	if (num_delegations > max_delegations)
 		return NULL;
 	dp = kmem_cache_alloc(deleg_slab, GFP_KERNEL);
 	if (dp == NULL)
@@ -999,6 +1004,7 @@ alloc_init_file(struct inode *ino)
 		list_add(&fp->fi_hash, &file_hashtbl[hashval]);
 		fp->fi_inode = igrab(ino);
 		fp->fi_id = current_fileid++;
+		fp->fi_had_conflict = false;
 		return fp;
 	}
 	return NULL;
@@ -1325,6 +1331,7 @@ do_recall(void *__dp)
 {
 	struct nfs4_delegation *dp = __dp;
 
+	dp->dl_file->fi_had_conflict = true;
 	nfsd4_cb_recall(dp);
 	return 0;
 }
@@ -3190,20 +3197,49 @@ nfsd4_load_reboot_recovery_data(void)
 		printk("NFSD: Failure reading reboot recovery data\n");
 }
 
+unsigned long
+get_nfs4_grace_period(void)
+{
+	return max(user_lease_time, lease_time) * HZ;
+}
+
+/*
+ * Since the lifetime of a delegation isn't limited to that of an open, a
+ * client may quite reasonably hang on to a delegation as long as it has
+ * the inode cached.  This becomes an obvious problem the first time a
+ * client's inode cache approaches the size of the server's total memory.
+ *
+ * For now we avoid this problem by imposing a hard limit on the number
+ * of delegations, which varies according to the server's memory size.
+ */
+static void
+set_max_delegations(void)
+{
+	/*
+	 * Allow at most 4 delegations per megabyte of RAM.  Quick
+	 * estimates suggest that in the worst case (where every delegation
+	 * is for a different inode), a delegation could take about 1.5K,
+	 * giving a worst case usage of about 6% of memory.
+	 */
+	max_delegations = nr_free_buffer_pages() >> (20 - 2 - PAGE_SHIFT);
+}
+
 /* initialization to perform when the nfsd service is started: */
 
 static void
 __nfs4_state_start(void)
 {
-	time_t grace_time;
+	unsigned long grace_time;
 
 	boot_time = get_seconds();
-	grace_time = max(user_lease_time, lease_time);
+	grace_time = get_nfs_grace_period();
 	lease_time = user_lease_time;
 	in_grace = 1;
-	printk("NFSD: starting %ld-second grace period\n", grace_time);
+	printk(KERN_INFO "NFSD: starting %ld-second grace period\n",
+	       grace_time/HZ);
 	laundry_wq = create_singlethread_workqueue("nfsd4");
-	queue_delayed_work(laundry_wq, &laundromat_work, grace_time*HZ);
+	queue_delayed_work(laundry_wq, &laundromat_work, grace_time);
+	set_max_delegations();
 }
 
 int
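The shift in set_max_delegations() is pure unit conversion: nr_free_buffer_pages() counts pages of 2^PAGE_SHIFT bytes, dividing by 2^20 converts to megabytes, and subtracting 2 from the shift multiplies by 4. With 4 KiB pages (PAGE_SHIFT = 12) the shift is 20 - 2 - 12 = 6, so 1 GiB of page cache (262144 pages) allows 262144 >> 6 = 4096 delegations — the advertised 4 per megabyte.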
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 15809dfd88a5..b3d55c6747fd 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -56,6 +56,8 @@
56#include <linux/nfsd_idmap.h> 56#include <linux/nfsd_idmap.h>
57#include <linux/nfs4.h> 57#include <linux/nfs4.h>
58#include <linux/nfs4_acl.h> 58#include <linux/nfs4_acl.h>
59#include <linux/sunrpc/gss_api.h>
60#include <linux/sunrpc/svcauth_gss.h>
59 61
60#define NFSDDBG_FACILITY NFSDDBG_XDR 62#define NFSDDBG_FACILITY NFSDDBG_XDR
61 63
@@ -819,6 +821,23 @@ nfsd4_decode_renew(struct nfsd4_compoundargs *argp, clientid_t *clientid)
819} 821}
820 822
821static __be32 823static __be32
824nfsd4_decode_secinfo(struct nfsd4_compoundargs *argp,
825 struct nfsd4_secinfo *secinfo)
826{
827 DECODE_HEAD;
828
829 READ_BUF(4);
830 READ32(secinfo->si_namelen);
831 READ_BUF(secinfo->si_namelen);
832 SAVEMEM(secinfo->si_name, secinfo->si_namelen);
833 status = check_filename(secinfo->si_name, secinfo->si_namelen,
834 nfserr_noent);
835 if (status)
836 return status;
837 DECODE_TAIL;
838}
839
840static __be32
822nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *setattr) 841nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *setattr)
823{ 842{
824 DECODE_HEAD; 843 DECODE_HEAD;
@@ -1131,6 +1150,9 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
1131 case OP_SAVEFH: 1150 case OP_SAVEFH:
1132 op->status = nfs_ok; 1151 op->status = nfs_ok;
1133 break; 1152 break;
1153 case OP_SECINFO:
1154 op->status = nfsd4_decode_secinfo(argp, &op->u.secinfo);
1155 break;
1134 case OP_SETATTR: 1156 case OP_SETATTR:
1135 op->status = nfsd4_decode_setattr(argp, &op->u.setattr); 1157 op->status = nfsd4_decode_setattr(argp, &op->u.setattr);
1136 break; 1158 break;
@@ -1296,7 +1318,7 @@ static char *nfsd4_path(struct svc_rqst *rqstp, struct svc_export *exp, __be32 *
1296 char *path, *rootpath; 1318 char *path, *rootpath;
1297 1319
1298 fh_init(&tmp_fh, NFS4_FHSIZE); 1320 fh_init(&tmp_fh, NFS4_FHSIZE);
1299 *stat = exp_pseudoroot(rqstp->rq_client, &tmp_fh, &rqstp->rq_chandle); 1321 *stat = exp_pseudoroot(rqstp, &tmp_fh);
1300 if (*stat) 1322 if (*stat)
1301 return NULL; 1323 return NULL;
1302 rootpath = tmp_fh.fh_export->ex_path; 1324 rootpath = tmp_fh.fh_export->ex_path;
@@ -1847,11 +1869,19 @@ nfsd4_encode_dirent_fattr(struct nfsd4_readdir *cd,
1847 if (d_mountpoint(dentry)) { 1869 if (d_mountpoint(dentry)) {
1848 int err; 1870 int err;
1849 1871
1872 /*
1873 * Why the heck aren't we just using nfsd_lookup??
1874 * Different "."/".." handling? Something else?
1875 * At least, add a comment here to explain....
1876 */
1850 err = nfsd_cross_mnt(cd->rd_rqstp, &dentry, &exp); 1877 err = nfsd_cross_mnt(cd->rd_rqstp, &dentry, &exp);
1851 if (err) { 1878 if (err) {
1852 nfserr = nfserrno(err); 1879 nfserr = nfserrno(err);
1853 goto out_put; 1880 goto out_put;
1854 } 1881 }
1882 nfserr = check_nfsd_access(exp, cd->rd_rqstp);
1883 if (nfserr)
1884 goto out_put;
1855 1885
1856 } 1886 }
1857 nfserr = nfsd4_encode_fattr(NULL, exp, dentry, p, buflen, cd->rd_bmval, 1887 nfserr = nfsd4_encode_fattr(NULL, exp, dentry, p, buflen, cd->rd_bmval,
@@ -2419,6 +2449,72 @@ nfsd4_encode_rename(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_
2419 } 2449 }
2420} 2450}
2421 2451
2452static void
2453nfsd4_encode_secinfo(struct nfsd4_compoundres *resp, int nfserr,
2454 struct nfsd4_secinfo *secinfo)
2455{
2456 int i = 0;
2457 struct svc_export *exp = secinfo->si_exp;
2458 u32 nflavs;
2459 struct exp_flavor_info *flavs;
2460 struct exp_flavor_info def_flavs[2];
2461 ENCODE_HEAD;
2462
2463 if (nfserr)
2464 goto out;
2465 if (exp->ex_nflavors) {
2466 flavs = exp->ex_flavors;
2467 nflavs = exp->ex_nflavors;
2468 } else { /* Handling of some defaults in absence of real secinfo: */
2469 flavs = def_flavs;
2470 if (exp->ex_client->flavour->flavour == RPC_AUTH_UNIX) {
2471 nflavs = 2;
2472 flavs[0].pseudoflavor = RPC_AUTH_UNIX;
2473 flavs[1].pseudoflavor = RPC_AUTH_NULL;
2474 } else if (exp->ex_client->flavour->flavour == RPC_AUTH_GSS) {
2475 nflavs = 1;
2476 flavs[0].pseudoflavor
2477 = svcauth_gss_flavor(exp->ex_client);
2478 } else {
2479 nflavs = 1;
2480 flavs[0].pseudoflavor
2481 = exp->ex_client->flavour->flavour;
2482 }
2483 }
2484
2485 RESERVE_SPACE(4);
2486 WRITE32(nflavs);
2487 ADJUST_ARGS();
2488 for (i = 0; i < nflavs; i++) {
2489 u32 flav = flavs[i].pseudoflavor;
2490 struct gss_api_mech *gm = gss_mech_get_by_pseudoflavor(flav);
2491
2492 if (gm) {
2493 RESERVE_SPACE(4);
2494 WRITE32(RPC_AUTH_GSS);
2495 ADJUST_ARGS();
2496 RESERVE_SPACE(4 + gm->gm_oid.len);
2497 WRITE32(gm->gm_oid.len);
2498 WRITEMEM(gm->gm_oid.data, gm->gm_oid.len);
2499 ADJUST_ARGS();
2500 RESERVE_SPACE(4);
2501 WRITE32(0); /* qop */
2502 ADJUST_ARGS();
2503 RESERVE_SPACE(4);
2504 WRITE32(gss_pseudoflavor_to_service(gm, flav));
2505 ADJUST_ARGS();
2506 gss_mech_put(gm);
2507 } else {
2508 RESERVE_SPACE(4);
2509 WRITE32(flav);
2510 ADJUST_ARGS();
2511 }
2512 }
2513out:
2514 if (exp)
2515 exp_put(exp);
2516}
2517
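For reference, the wire format produced by nfsd4_encode_secinfo() above is: a 32-bit flavor count, then per flavor either a bare 32-bit pseudoflavor or, for GSS, an RPC_AUTH_GSS tag followed by { mechanism OID, qop, service }. A minimal userspace sketch of that layout follows; put32() and encode_gss_entry() are illustrative names, not kernel interfaces, and XDR's 4-byte padding of the opaque OID is noted but omitted for brevity.

    /*
     * Userspace sketch of one SECINFO reply, illustration only; the
     * kernel builds this with RESERVE_SPACE/WRITE32/WRITEMEM instead.
     */
    #include <stdint.h>
    #include <string.h>
    #include <arpa/inet.h>              /* htonl() */

    static uint8_t *put32(uint8_t *p, uint32_t v)
    {
        uint32_t be = htonl(v);
        memcpy(p, &be, 4);
        return p + 4;
    }

    /* One GSS entry: flavor tag, OID as an opaque<>, qop, service. */
    static uint8_t *encode_gss_entry(uint8_t *p, const uint8_t *oid,
                                     uint32_t oid_len, uint32_t service)
    {
        p = put32(p, 6);                /* RPC_AUTH_GSS */
        p = put32(p, oid_len);
        memcpy(p, oid, oid_len);        /* real XDR pads this to 4 bytes */
        p += oid_len;
        p = put32(p, 0);                /* qop is always zero here */
        return put32(p, service);
    }

    int main(void)
    {
        /* krb5 mechanism OID bytes, as carried in gm_oid by the kernel. */
        uint8_t oid[] = { 0x2a, 0x86, 0x48, 0x86, 0xf7, 0x12,
                          0x01, 0x02, 0x02 };
        uint8_t buf[64], *p = buf;

        p = put32(p, 2);                /* nflavs */
        p = encode_gss_entry(p, oid, sizeof(oid), 1);  /* e.g. svc none */
        p = put32(p, 1);                /* plain entry: RPC_AUTH_UNIX */
        return !(p - buf);              /* 33 bytes written */
    }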
2422/* 2518/*
2423 * The SETATTR encode routine is special -- it always encodes a bitmap, 2519 * The SETATTR encode routine is special -- it always encodes a bitmap,
2424 * regardless of the error status. 2520 * regardless of the error status.
@@ -2559,6 +2655,9 @@ nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op)
2559 break; 2655 break;
2560 case OP_SAVEFH: 2656 case OP_SAVEFH:
2561 break; 2657 break;
2658 case OP_SECINFO:
2659 nfsd4_encode_secinfo(resp, op->status, &op->u.secinfo);
2660 break;
2562 case OP_SETATTR: 2661 case OP_SETATTR:
2563 nfsd4_encode_setattr(resp, op->status, &op->u.setattr); 2662 nfsd4_encode_setattr(resp, op->status, &op->u.setattr);
2564 break; 2663 break;
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 71c686dc7257..baac89d917ca 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -35,7 +35,6 @@
35#include <linux/nfsd/cache.h> 35#include <linux/nfsd/cache.h>
36#include <linux/nfsd/xdr.h> 36#include <linux/nfsd/xdr.h>
37#include <linux/nfsd/syscall.h> 37#include <linux/nfsd/syscall.h>
38#include <linux/nfsd/interface.h>
39 38
40#include <asm/uaccess.h> 39#include <asm/uaccess.h>
41 40
@@ -245,7 +244,7 @@ static ssize_t write_getfs(struct file *file, char *buf, size_t size)
245 } 244 }
246 exp_readunlock(); 245 exp_readunlock();
247 if (err == 0) 246 if (err == 0)
248 err = res->fh_size + (int)&((struct knfsd_fh*)0)->fh_base; 247 err = res->fh_size + offsetof(struct knfsd_fh, fh_base);
249 out: 248 out:
250 return err; 249 return err;
251} 250}
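The nfsctl.c hunk above swaps the hand-rolled null-pointer cast for offsetof(). The old expression `(int)&((struct knfsd_fh*)0)->fh_base` is the classic pre-standard idiom: it is formally undefined behavior and truncates the offset to int on 64-bit targets, while offsetof() from <stddef.h> is the well-defined, size_t-wide equivalent. A tiny standalone illustration (the struct is a stand-in, not the real struct knfsd_fh):

    #include <stddef.h>
    #include <stdio.h>

    struct fh_like {                /* stand-in for struct knfsd_fh */
        unsigned int fh_size;
        char fh_base[64];
    };

    int main(void)
    {
        /* Well-defined, and correctly sized on every platform. */
        printf("fh_base starts at byte %zu\n",
               offsetof(struct fh_like, fh_base));
        return 0;
    }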
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 6ca2d24fc216..0eb464a39aae 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -15,10 +15,12 @@
15#include <linux/string.h> 15#include <linux/string.h>
16#include <linux/stat.h> 16#include <linux/stat.h>
17#include <linux/dcache.h> 17#include <linux/dcache.h>
18#include <linux/exportfs.h>
18#include <linux/mount.h> 19#include <linux/mount.h>
19 20
20#include <linux/sunrpc/clnt.h> 21#include <linux/sunrpc/clnt.h>
21#include <linux/sunrpc/svc.h> 22#include <linux/sunrpc/svc.h>
23#include <linux/sunrpc/svcauth_gss.h>
22#include <linux/nfsd/nfsd.h> 24#include <linux/nfsd/nfsd.h>
23 25
24#define NFSDDBG_FACILITY NFSDDBG_FH 26#define NFSDDBG_FACILITY NFSDDBG_FH
@@ -27,10 +29,6 @@
27static int nfsd_nr_verified; 29static int nfsd_nr_verified;
28static int nfsd_nr_put; 30static int nfsd_nr_put;
29 31
30extern struct export_operations export_op_default;
31
32#define CALL(ops,fun) ((ops->fun)?(ops->fun):export_op_default.fun)
33
34/* 32/*
35 * our acceptability function. 33 * our acceptability function.
36 * if NOSUBTREECHECK, accept anything 34 * if NOSUBTREECHECK, accept anything
@@ -123,8 +121,6 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access)
123 int data_left = fh->fh_size/4; 121 int data_left = fh->fh_size/4;
124 122
125 error = nfserr_stale; 123 error = nfserr_stale;
126 if (rqstp->rq_client == NULL)
127 goto out;
128 if (rqstp->rq_vers > 2) 124 if (rqstp->rq_vers > 2)
129 error = nfserr_badhandle; 125 error = nfserr_badhandle;
130 if (rqstp->rq_vers == 4 && fh->fh_size == 0) 126 if (rqstp->rq_vers == 4 && fh->fh_size == 0)
@@ -148,7 +144,7 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access)
148 fh->fh_fsid[1] = fh->fh_fsid[2]; 144 fh->fh_fsid[1] = fh->fh_fsid[2];
149 } 145 }
150 if ((data_left -= len)<0) goto out; 146 if ((data_left -= len)<0) goto out;
151 exp = exp_find(rqstp->rq_client, fh->fh_fsid_type, datap, &rqstp->rq_chandle); 147 exp = rqst_exp_find(rqstp, fh->fh_fsid_type, datap);
152 datap += len; 148 datap += len;
153 } else { 149 } else {
154 dev_t xdev; 150 dev_t xdev;
@@ -159,19 +155,17 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access)
159 xdev = old_decode_dev(fh->ofh_xdev); 155 xdev = old_decode_dev(fh->ofh_xdev);
160 xino = u32_to_ino_t(fh->ofh_xino); 156 xino = u32_to_ino_t(fh->ofh_xino);
161 mk_fsid(FSID_DEV, tfh, xdev, xino, 0, NULL); 157 mk_fsid(FSID_DEV, tfh, xdev, xino, 0, NULL);
162 exp = exp_find(rqstp->rq_client, FSID_DEV, tfh, 158 exp = rqst_exp_find(rqstp, FSID_DEV, tfh);
163 &rqstp->rq_chandle);
164 } 159 }
165 160
166 if (IS_ERR(exp) && (PTR_ERR(exp) == -EAGAIN 161 error = nfserr_stale;
167 || PTR_ERR(exp) == -ETIMEDOUT)) { 162 if (PTR_ERR(exp) == -ENOENT)
168 error = nfserrno(PTR_ERR(exp));
169 goto out; 163 goto out;
170 }
171 164
172 error = nfserr_stale; 165 if (IS_ERR(exp)) {
173 if (!exp || IS_ERR(exp)) 166 error = nfserrno(PTR_ERR(exp));
174 goto out; 167 goto out;
168 }
175 169
176 /* Check if the request originated from a secure port. */ 170 /* Check if the request originated from a secure port. */
177 error = nfserr_perm; 171 error = nfserr_perm;
@@ -211,11 +205,9 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access)
211 if (fileid_type == 0) 205 if (fileid_type == 0)
212 dentry = dget(exp->ex_dentry); 206 dentry = dget(exp->ex_dentry);
213 else { 207 else {
214 struct export_operations *nop = exp->ex_mnt->mnt_sb->s_export_op; 208 dentry = exportfs_decode_fh(exp->ex_mnt, datap,
215 dentry = CALL(nop,decode_fh)(exp->ex_mnt->mnt_sb, 209 data_left, fileid_type,
216 datap, data_left, 210 nfsd_acceptable, exp);
217 fileid_type,
218 nfsd_acceptable, exp);
219 } 211 }
220 if (dentry == NULL) 212 if (dentry == NULL)
221 goto out; 213 goto out;
@@ -257,8 +249,19 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access)
257 if (error) 249 if (error)
258 goto out; 250 goto out;
259 251
252 if (!(access & MAY_LOCK)) {
253 /*
254 * pseudoflavor restrictions are not enforced on NLM,
255 * which clients virtually always use auth_sys for,
256 * even while using RPCSEC_GSS for NFS.
257 */
258 error = check_nfsd_access(exp, rqstp);
259 if (error)
260 goto out;
261 }
262
260 /* Finally, check access permissions. */ 263 /* Finally, check access permissions. */
261 error = nfsd_permission(exp, dentry, access); 264 error = nfsd_permission(rqstp, exp, dentry, access);
262 265
263 if (error) { 266 if (error) {
264 dprintk("fh_verify: %s/%s permission failure, " 267 dprintk("fh_verify: %s/%s permission failure, "
@@ -286,15 +289,13 @@ out:
286static inline int _fh_update(struct dentry *dentry, struct svc_export *exp, 289static inline int _fh_update(struct dentry *dentry, struct svc_export *exp,
287 __u32 *datap, int *maxsize) 290 __u32 *datap, int *maxsize)
288{ 291{
289 struct export_operations *nop = exp->ex_mnt->mnt_sb->s_export_op;
290
291 if (dentry == exp->ex_dentry) { 292 if (dentry == exp->ex_dentry) {
292 *maxsize = 0; 293 *maxsize = 0;
293 return 0; 294 return 0;
294 } 295 }
295 296
296 return CALL(nop,encode_fh)(dentry, datap, maxsize, 297 return exportfs_encode_fh(dentry, datap, maxsize,
297 !(exp->ex_flags&NFSEXP_NOSUBTREECHECK)); 298 !(exp->ex_flags & NFSEXP_NOSUBTREECHECK));
298} 299}
299 300
300/* 301/*
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index b2c7147aa921..977a71f64e19 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -278,7 +278,8 @@ nfsd_proc_create(struct svc_rqst *rqstp, struct nfsd_createargs *argp,
278 * echo thing > device-special-file-or-pipe 278 * echo thing > device-special-file-or-pipe
279 * by doing a CREATE with type==0 279 * by doing a CREATE with type==0
280 */ 280 */
281 nfserr = nfsd_permission(newfhp->fh_export, 281 nfserr = nfsd_permission(rqstp,
282 newfhp->fh_export,
282 newfhp->fh_dentry, 283 newfhp->fh_dentry,
283 MAY_WRITE|MAY_LOCAL_ACCESS); 284 MAY_WRITE|MAY_LOCAL_ACCESS);
284 if (nfserr && nfserr != nfserr_rofs) 285 if (nfserr && nfserr != nfserr_rofs)
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index ff55950efb43..a8c89ae4c743 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -19,6 +19,7 @@
19#include <linux/slab.h> 19#include <linux/slab.h>
20#include <linux/smp.h> 20#include <linux/smp.h>
21#include <linux/smp_lock.h> 21#include <linux/smp_lock.h>
22#include <linux/freezer.h>
22#include <linux/fs_struct.h> 23#include <linux/fs_struct.h>
23 24
24#include <linux/sunrpc/types.h> 25#include <linux/sunrpc/types.h>
@@ -432,6 +433,7 @@ nfsd(struct svc_rqst *rqstp)
432 * dirty pages. 433 * dirty pages.
433 */ 434 */
434 current->flags |= PF_LESS_THROTTLE; 435 current->flags |= PF_LESS_THROTTLE;
436 set_freezable();
435 437
436 /* 438 /*
437 * The main request loop 439 * The main request loop
@@ -492,6 +494,15 @@ out:
492 module_put_and_exit(0); 494 module_put_and_exit(0);
493} 495}
494 496
497static __be32 map_new_errors(u32 vers, __be32 nfserr)
498{
499 if (nfserr == nfserr_jukebox && vers == 2)
500 return nfserr_dropit;
501 if (nfserr == nfserr_wrongsec && vers < 4)
502 return nfserr_acces;
503 return nfserr;
504}
505
495int 506int
496nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp) 507nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
497{ 508{
@@ -534,6 +545,7 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
534 545
535 /* Now call the procedure handler, and encode NFS status. */ 546 /* Now call the procedure handler, and encode NFS status. */
536 nfserr = proc->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp); 547 nfserr = proc->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp);
548 nfserr = map_new_errors(rqstp->rq_vers, nfserr);
537 if (nfserr == nfserr_jukebox && rqstp->rq_vers == 2) 549 if (nfserr == nfserr_jukebox && rqstp->rq_vers == 2)
538 nfserr = nfserr_dropit; 550 nfserr = nfserr_dropit;
539 if (nfserr == nfserr_dropit) { 551 if (nfserr == nfserr_dropit) {
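map_new_errors() above centralizes the protocol downgrades for errors that post-date a given NFS version: v2 has no way to say "try again later", so jukebox becomes an internal drop (the client simply retransmits), and wrongsec exists only in v4, so older clients get a plain access error. A quick userspace check of that table; the numeric values are illustrative stand-ins for the kernel's __be32 constants:

    #include <assert.h>
    #include <stdint.h>

    enum {                          /* stand-in error codes */
        nfserr_acces    = 13,
        nfserr_jukebox  = 10008,
        nfserr_wrongsec = 10016,
        nfserr_dropit   = 30000,
    };

    static uint32_t map_new_errors(uint32_t vers, uint32_t nfserr)
    {
        if (nfserr == nfserr_jukebox && vers == 2)
            return nfserr_dropit;   /* v2 cannot express "retry later" */
        if (nfserr == nfserr_wrongsec && vers < 4)
            return nfserr_acces;    /* wrongsec is NFSv4-only */
        return nfserr;
    }

    int main(void)
    {
        assert(map_new_errors(2, nfserr_jukebox) == nfserr_dropit);
        assert(map_new_errors(3, nfserr_wrongsec) == nfserr_acces);
        assert(map_new_errors(4, nfserr_wrongsec) == nfserr_wrongsec);
        return 0;
    }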
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 945b1cedde2b..e90f4a8a1d01 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -113,7 +113,7 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp,
113 113
114 while (follow_down(&mnt,&mounts)&&d_mountpoint(mounts)); 114 while (follow_down(&mnt,&mounts)&&d_mountpoint(mounts));
115 115
116 exp2 = exp_get_by_name(exp->ex_client, mnt, mounts, &rqstp->rq_chandle); 116 exp2 = rqst_exp_get_by_name(rqstp, mnt, mounts);
117 if (IS_ERR(exp2)) { 117 if (IS_ERR(exp2)) {
118 err = PTR_ERR(exp2); 118 err = PTR_ERR(exp2);
119 dput(mounts); 119 dput(mounts);
@@ -135,21 +135,10 @@ out:
135 return err; 135 return err;
136} 136}
137 137
138/*
139 * Look up one component of a pathname.
140 * N.B. After this call _both_ fhp and resfh need an fh_put
141 *
142 * If the lookup would cross a mountpoint, and the mounted filesystem
143 * is exported to the client with NFSEXP_NOHIDE, then the lookup is
144 * accepted as it stands and the mounted directory is
145 * returned. Otherwise the covered directory is returned.
146 * NOTE: this mountpoint crossing is not supported properly by all
147 * clients and is explicitly disallowed for NFSv3
148 * NeilBrown <neilb@cse.unsw.edu.au>
149 */
150__be32 138__be32
151nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name, 139nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
152 int len, struct svc_fh *resfh) 140 const char *name, int len,
141 struct svc_export **exp_ret, struct dentry **dentry_ret)
153{ 142{
154 struct svc_export *exp; 143 struct svc_export *exp;
155 struct dentry *dparent; 144 struct dentry *dparent;
@@ -168,8 +157,6 @@ nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name,
168 exp = fhp->fh_export; 157 exp = fhp->fh_export;
169 exp_get(exp); 158 exp_get(exp);
170 159
171 err = nfserr_acces;
172
173 /* Lookup the name, but don't follow links */ 160 /* Lookup the name, but don't follow links */
174 if (isdotent(name, len)) { 161 if (isdotent(name, len)) {
175 if (len==1) 162 if (len==1)
@@ -190,17 +177,15 @@ nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name,
190 dput(dentry); 177 dput(dentry);
191 dentry = dp; 178 dentry = dp;
192 179
193 exp2 = exp_parent(exp->ex_client, mnt, dentry, 180 exp2 = rqst_exp_parent(rqstp, mnt, dentry);
194 &rqstp->rq_chandle); 181 if (PTR_ERR(exp2) == -ENOENT) {
195 if (IS_ERR(exp2)) { 182 dput(dentry);
183 dentry = dget(dparent);
184 } else if (IS_ERR(exp2)) {
196 host_err = PTR_ERR(exp2); 185 host_err = PTR_ERR(exp2);
197 dput(dentry); 186 dput(dentry);
198 mntput(mnt); 187 mntput(mnt);
199 goto out_nfserr; 188 goto out_nfserr;
200 }
201 if (!exp2) {
202 dput(dentry);
203 dentry = dget(dparent);
204 } else { 189 } else {
205 exp_put(exp); 190 exp_put(exp);
206 exp = exp2; 191 exp = exp2;
@@ -223,6 +208,41 @@ nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name,
223 } 208 }
224 } 209 }
225 } 210 }
211 *dentry_ret = dentry;
212 *exp_ret = exp;
213 return 0;
214
215out_nfserr:
216 exp_put(exp);
217 return nfserrno(host_err);
218}
219
220/*
221 * Look up one component of a pathname.
222 * N.B. After this call _both_ fhp and resfh need an fh_put
223 *
224 * If the lookup would cross a mountpoint, and the mounted filesystem
225 * is exported to the client with NFSEXP_NOHIDE, then the lookup is
226 * accepted as it stands and the mounted directory is
227 * returned. Otherwise the covered directory is returned.
228 * NOTE: this mountpoint crossing is not supported properly by all
229 * clients and is explicitly disallowed for NFSv3
230 * NeilBrown <neilb@cse.unsw.edu.au>
231 */
232__be32
233nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name,
234 int len, struct svc_fh *resfh)
235{
236 struct svc_export *exp;
237 struct dentry *dentry;
238 __be32 err;
239
240 err = nfsd_lookup_dentry(rqstp, fhp, name, len, &exp, &dentry);
241 if (err)
242 return err;
243 err = check_nfsd_access(exp, rqstp);
244 if (err)
245 goto out;
226 /* 246 /*
227 * Note: we compose the file handle now, but as the 247 * Note: we compose the file handle now, but as the
228 * dentry may be negative, it may need to be updated. 248 * dentry may be negative, it may need to be updated.
@@ -230,16 +250,13 @@ nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name,
230 err = fh_compose(resfh, exp, dentry, fhp); 250 err = fh_compose(resfh, exp, dentry, fhp);
231 if (!err && !dentry->d_inode) 251 if (!err && !dentry->d_inode)
232 err = nfserr_noent; 252 err = nfserr_noent;
233 dput(dentry);
234out: 253out:
254 dput(dentry);
235 exp_put(exp); 255 exp_put(exp);
236 return err; 256 return err;
237
238out_nfserr:
239 err = nfserrno(host_err);
240 goto out;
241} 257}
242 258
259
243/* 260/*
244 * Set various file attributes. 261 * Set various file attributes.
245 * N.B. After this call fhp needs an fh_put 262 * N.B. After this call fhp needs an fh_put
@@ -311,7 +328,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
311 /* The size case is special. It changes the file as well as the attributes. */ 328 /* The size case is special. It changes the file as well as the attributes. */
312 if (iap->ia_valid & ATTR_SIZE) { 329 if (iap->ia_valid & ATTR_SIZE) {
313 if (iap->ia_size < inode->i_size) { 330 if (iap->ia_size < inode->i_size) {
314 err = nfsd_permission(fhp->fh_export, dentry, MAY_TRUNC|MAY_OWNER_OVERRIDE); 331 err = nfsd_permission(rqstp, fhp->fh_export, dentry, MAY_TRUNC|MAY_OWNER_OVERRIDE);
315 if (err) 332 if (err)
316 goto out; 333 goto out;
317 } 334 }
@@ -435,7 +452,7 @@ nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp,
435 /* Get inode */ 452 /* Get inode */
436 error = fh_verify(rqstp, fhp, 0 /* S_IFREG */, MAY_SATTR); 453 error = fh_verify(rqstp, fhp, 0 /* S_IFREG */, MAY_SATTR);
437 if (error) 454 if (error)
438 goto out; 455 return error;
439 456
440 dentry = fhp->fh_dentry; 457 dentry = fhp->fh_dentry;
441 inode = dentry->d_inode; 458 inode = dentry->d_inode;
@@ -444,33 +461,25 @@ nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp,
444 461
445 host_error = nfs4_acl_nfsv4_to_posix(acl, &pacl, &dpacl, flags); 462 host_error = nfs4_acl_nfsv4_to_posix(acl, &pacl, &dpacl, flags);
446 if (host_error == -EINVAL) { 463 if (host_error == -EINVAL) {
447 error = nfserr_attrnotsupp; 464 return nfserr_attrnotsupp;
448 goto out;
449 } else if (host_error < 0) 465 } else if (host_error < 0)
450 goto out_nfserr; 466 goto out_nfserr;
451 467
452 host_error = set_nfsv4_acl_one(dentry, pacl, POSIX_ACL_XATTR_ACCESS); 468 host_error = set_nfsv4_acl_one(dentry, pacl, POSIX_ACL_XATTR_ACCESS);
453 if (host_error < 0) 469 if (host_error < 0)
454 goto out_nfserr; 470 goto out_release;
455 471
456 if (S_ISDIR(inode->i_mode)) { 472 if (S_ISDIR(inode->i_mode))
457 host_error = set_nfsv4_acl_one(dentry, dpacl, POSIX_ACL_XATTR_DEFAULT); 473 host_error = set_nfsv4_acl_one(dentry, dpacl, POSIX_ACL_XATTR_DEFAULT);
458 if (host_error < 0)
459 goto out_nfserr;
460 }
461
462 error = nfs_ok;
463 474
464out: 475out_release:
465 posix_acl_release(pacl); 476 posix_acl_release(pacl);
466 posix_acl_release(dpacl); 477 posix_acl_release(dpacl);
467 return (error);
468out_nfserr: 478out_nfserr:
469 if (host_error == -EOPNOTSUPP) 479 if (host_error == -EOPNOTSUPP)
470 error = nfserr_attrnotsupp; 480 return nfserr_attrnotsupp;
471 else 481 else
472 error = nfserrno(host_error); 482 return nfserrno(host_error);
473 goto out;
474} 483}
475 484
476static struct posix_acl * 485static struct posix_acl *
@@ -607,7 +616,7 @@ nfsd_access(struct svc_rqst *rqstp, struct svc_fh *fhp, u32 *access, u32 *suppor
607 616
608 sresult |= map->access; 617 sresult |= map->access;
609 618
610 err2 = nfsd_permission(export, dentry, map->how); 619 err2 = nfsd_permission(rqstp, export, dentry, map->how);
611 switch (err2) { 620 switch (err2) {
612 case nfs_ok: 621 case nfs_ok:
613 result |= map->access; 622 result |= map->access;
@@ -1034,7 +1043,7 @@ nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
1034 __be32 err; 1043 __be32 err;
1035 1044
1036 if (file) { 1045 if (file) {
1037 err = nfsd_permission(fhp->fh_export, fhp->fh_dentry, 1046 err = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry,
1038 MAY_READ|MAY_OWNER_OVERRIDE); 1047 MAY_READ|MAY_OWNER_OVERRIDE);
1039 if (err) 1048 if (err)
1040 goto out; 1049 goto out;
@@ -1063,7 +1072,7 @@ nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
1063 __be32 err = 0; 1072 __be32 err = 0;
1064 1073
1065 if (file) { 1074 if (file) {
1066 err = nfsd_permission(fhp->fh_export, fhp->fh_dentry, 1075 err = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry,
1067 MAY_WRITE|MAY_OWNER_OVERRIDE); 1076 MAY_WRITE|MAY_OWNER_OVERRIDE);
1068 if (err) 1077 if (err)
1069 goto out; 1078 goto out;
@@ -1792,7 +1801,8 @@ nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat)
1792 * Check for a user's access permissions to this inode. 1801 * Check for a user's access permissions to this inode.
1793 */ 1802 */
1794__be32 1803__be32
1795nfsd_permission(struct svc_export *exp, struct dentry *dentry, int acc) 1804nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
1805 struct dentry *dentry, int acc)
1796{ 1806{
1797 struct inode *inode = dentry->d_inode; 1807 struct inode *inode = dentry->d_inode;
1798 int err; 1808 int err;
@@ -1823,7 +1833,7 @@ nfsd_permission(struct svc_export *exp, struct dentry *dentry, int acc)
1823 */ 1833 */
1824 if (!(acc & MAY_LOCAL_ACCESS)) 1834 if (!(acc & MAY_LOCAL_ACCESS))
1825 if (acc & (MAY_WRITE | MAY_SATTR | MAY_TRUNC)) { 1835 if (acc & (MAY_WRITE | MAY_SATTR | MAY_TRUNC)) {
1826 if (EX_RDONLY(exp) || IS_RDONLY(inode)) 1836 if (EX_RDONLY(exp, rqstp) || IS_RDONLY(inode))
1827 return nfserr_rofs; 1837 return nfserr_rofs;
1828 if (/* (acc & MAY_WRITE) && */ IS_IMMUTABLE(inode)) 1838 if (/* (acc & MAY_WRITE) && */ IS_IMMUTABLE(inode))
1829 return nfserr_perm; 1839 return nfserr_perm;
diff --git a/fs/nls/Makefile b/fs/nls/Makefile
index a7ade138d684..f499dd7c3905 100644
--- a/fs/nls/Makefile
+++ b/fs/nls/Makefile
@@ -36,11 +36,9 @@ obj-$(CONFIG_NLS_ISO8859_6) += nls_iso8859-6.o
36obj-$(CONFIG_NLS_ISO8859_7) += nls_iso8859-7.o 36obj-$(CONFIG_NLS_ISO8859_7) += nls_iso8859-7.o
37obj-$(CONFIG_NLS_ISO8859_8) += nls_cp1255.o 37obj-$(CONFIG_NLS_ISO8859_8) += nls_cp1255.o
38obj-$(CONFIG_NLS_ISO8859_9) += nls_iso8859-9.o 38obj-$(CONFIG_NLS_ISO8859_9) += nls_iso8859-9.o
39obj-$(CONFIG_NLS_ISO8859_10) += nls_iso8859-10.o
40obj-$(CONFIG_NLS_ISO8859_13) += nls_iso8859-13.o 39obj-$(CONFIG_NLS_ISO8859_13) += nls_iso8859-13.o
41obj-$(CONFIG_NLS_ISO8859_14) += nls_iso8859-14.o 40obj-$(CONFIG_NLS_ISO8859_14) += nls_iso8859-14.o
42obj-$(CONFIG_NLS_ISO8859_15) += nls_iso8859-15.o 41obj-$(CONFIG_NLS_ISO8859_15) += nls_iso8859-15.o
43obj-$(CONFIG_NLS_KOI8_R) += nls_koi8-r.o 42obj-$(CONFIG_NLS_KOI8_R) += nls_koi8-r.o
44obj-$(CONFIG_NLS_KOI8_U) += nls_koi8-u.o nls_koi8-ru.o 43obj-$(CONFIG_NLS_KOI8_U) += nls_koi8-u.o nls_koi8-ru.o
45obj-$(CONFIG_NLS_ABC) += nls_abc.o
46obj-$(CONFIG_NLS_UTF8) += nls_utf8.o 44obj-$(CONFIG_NLS_UTF8) += nls_utf8.o
diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c
index bff01a54675a..e93c6142b23c 100644
--- a/fs/ntfs/namei.c
+++ b/fs/ntfs/namei.c
@@ -21,6 +21,7 @@
21 */ 21 */
22 22
23#include <linux/dcache.h> 23#include <linux/dcache.h>
24#include <linux/exportfs.h>
24#include <linux/security.h> 25#include <linux/security.h>
25 26
26#include "attrib.h" 27#include "attrib.h"
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 19712a7d145f..f5e11f4fa952 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -50,6 +50,8 @@
50#include "buffer_head_io.h" 50#include "buffer_head_io.h"
51 51
52static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc); 52static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
53static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
54 struct ocfs2_extent_block *eb);
53 55
54/* 56/*
55 * Structures which describe a path through a btree, and functions to 57 * Structures which describe a path through a btree, and functions to
@@ -117,6 +119,31 @@ static void ocfs2_free_path(struct ocfs2_path *path)
117} 119}
118 120
119/* 121/*
122 * Copy all the elements of src into dest. After this call, src could be
123 * freed without affecting dest.
124 *
125 * Both paths should have the same root. Any non-root elements of dest
126 * will be freed.
127 */
128static void ocfs2_cp_path(struct ocfs2_path *dest, struct ocfs2_path *src)
129{
130 int i;
131
132 BUG_ON(path_root_bh(dest) != path_root_bh(src));
133 BUG_ON(path_root_el(dest) != path_root_el(src));
134
135 ocfs2_reinit_path(dest, 1);
136
137 for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
138 dest->p_node[i].bh = src->p_node[i].bh;
139 dest->p_node[i].el = src->p_node[i].el;
140
141 if (dest->p_node[i].bh)
142 get_bh(dest->p_node[i].bh);
143 }
144}
145
146/*
120 * Make the *dest path the same as src and re-initialize src path to 147 * Make the *dest path the same as src and re-initialize src path to
121 * have a root only. 148 * have a root only.
122 */ 149 */
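Note the get_bh() in the copy loop of ocfs2_cp_path() above: after the copy, both paths reference the same buffer_heads, so every shared block needs an extra reference; ocfs2_mv_path() by contrast transfers ownership and takes none. A toy userspace refcount model of the same contract (struct buf and get/put are stand-ins, not kernel APIs):

    #include <assert.h>

    struct buf { int refs; };

    static void get(struct buf *b) { b->refs++; }
    static void put(struct buf *b) { assert(b->refs > 0); b->refs--; }

    int main(void)
    {
        struct buf blk = { .refs = 1 };      /* held by src */
        struct buf *src = &blk, *dest;

        dest = src;                          /* "cp": share the block... */
        get(dest);                           /* ...so take a new reference */
        assert(blk.refs == 2);

        put(src);                            /* dropping src's reference... */
        assert(blk.refs == 1);               /* ...leaves dest's copy valid */
        return 0;
    }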
@@ -212,10 +239,41 @@ out:
212 return ret; 239 return ret;
213} 240}
214 241
242/*
243 * Return the index of the extent record which contains cluster #v_cluster.
244 * -1 is returned if it was not found.
245 *
246 * Should work fine on interior and exterior nodes.
247 */
248int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster)
249{
250 int ret = -1;
251 int i;
252 struct ocfs2_extent_rec *rec;
253 u32 rec_end, rec_start, clusters;
254
255 for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
256 rec = &el->l_recs[i];
257
258 rec_start = le32_to_cpu(rec->e_cpos);
259 clusters = ocfs2_rec_clusters(el, rec);
260
261 rec_end = rec_start + clusters;
262
263 if (v_cluster >= rec_start && v_cluster < rec_end) {
264 ret = i;
265 break;
266 }
267 }
268
269 return ret;
270}
271
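The same search in runnable userspace form, over a simplified record type; a virtual cluster that falls in a hole between records yields -1, just as above:

    #include <stdio.h>
    #include <stdint.h>

    struct rec { uint32_t cpos, clusters; };

    /* Index of the record containing v_cluster, or -1 if none does. */
    static int search_extent_list(const struct rec *recs, int n, uint32_t v)
    {
        for (int i = 0; i < n; i++)
            if (v >= recs[i].cpos &&
                v < recs[i].cpos + recs[i].clusters)
                return i;
        return -1;
    }

    int main(void)
    {
        struct rec el[] = { { 0, 8 }, { 8, 4 }, { 16, 16 } };

        printf("%d %d %d\n",
               search_extent_list(el, 3, 9),    /*  1 */
               search_extent_list(el, 3, 13),   /* -1: hole at 12-15 */
               search_extent_list(el, 3, 31));  /*  2 */
        return 0;
    }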
215enum ocfs2_contig_type { 272enum ocfs2_contig_type {
216 CONTIG_NONE = 0, 273 CONTIG_NONE = 0,
217 CONTIG_LEFT, 274 CONTIG_LEFT,
218 CONTIG_RIGHT 275 CONTIG_RIGHT,
276 CONTIG_LEFTRIGHT,
219}; 277};
220 278
221 279
@@ -253,6 +311,14 @@ static enum ocfs2_contig_type
253{ 311{
254 u64 blkno = le64_to_cpu(insert_rec->e_blkno); 312 u64 blkno = le64_to_cpu(insert_rec->e_blkno);
255 313
314 /*
315 * Refuse to coalesce extent records with different flag
316 * fields - we don't want to mix unwritten extents with user
317 * data.
318 */
319 if (ext->e_flags != insert_rec->e_flags)
320 return CONTIG_NONE;
321
256 if (ocfs2_extents_adjacent(ext, insert_rec) && 322 if (ocfs2_extents_adjacent(ext, insert_rec) &&
257 ocfs2_block_extent_contig(inode->i_sb, ext, blkno)) 323 ocfs2_block_extent_contig(inode->i_sb, ext, blkno))
258 return CONTIG_RIGHT; 324 return CONTIG_RIGHT;
@@ -277,7 +343,14 @@ enum ocfs2_append_type {
277 APPEND_TAIL, 343 APPEND_TAIL,
278}; 344};
279 345
346enum ocfs2_split_type {
347 SPLIT_NONE = 0,
348 SPLIT_LEFT,
349 SPLIT_RIGHT,
350};
351
280struct ocfs2_insert_type { 352struct ocfs2_insert_type {
353 enum ocfs2_split_type ins_split;
281 enum ocfs2_append_type ins_appending; 354 enum ocfs2_append_type ins_appending;
282 enum ocfs2_contig_type ins_contig; 355 enum ocfs2_contig_type ins_contig;
283 int ins_contig_index; 356 int ins_contig_index;
@@ -285,6 +358,13 @@ struct ocfs2_insert_type {
285 int ins_tree_depth; 358 int ins_tree_depth;
286}; 359};
287 360
361struct ocfs2_merge_ctxt {
362 enum ocfs2_contig_type c_contig_type;
363 int c_has_empty_extent;
364 int c_split_covers_rec;
365 int c_used_tail_recs;
366};
367
288/* 368/*
289 * How many free extents have we got before we need more meta data? 369 * How many free extents have we got before we need more meta data?
290 */ 370 */
@@ -384,13 +464,7 @@ static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
384 strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE); 464 strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
385 eb->h_blkno = cpu_to_le64(first_blkno); 465 eb->h_blkno = cpu_to_le64(first_blkno);
386 eb->h_fs_generation = cpu_to_le32(osb->fs_generation); 466 eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
387
388#ifndef OCFS2_USE_ALL_METADATA_SUBALLOCATORS
389 /* we always use slot zero's suballocator */
390 eb->h_suballoc_slot = 0;
391#else
392 eb->h_suballoc_slot = cpu_to_le16(osb->slot_num); 467 eb->h_suballoc_slot = cpu_to_le16(osb->slot_num);
393#endif
394 eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start); 468 eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start);
395 eb->h_list.l_count = 469 eb->h_list.l_count =
396 cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb)); 470 cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
@@ -461,7 +535,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
461 struct inode *inode, 535 struct inode *inode,
462 struct buffer_head *fe_bh, 536 struct buffer_head *fe_bh,
463 struct buffer_head *eb_bh, 537 struct buffer_head *eb_bh,
464 struct buffer_head *last_eb_bh, 538 struct buffer_head **last_eb_bh,
465 struct ocfs2_alloc_context *meta_ac) 539 struct ocfs2_alloc_context *meta_ac)
466{ 540{
467 int status, new_blocks, i; 541 int status, new_blocks, i;
@@ -476,7 +550,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
476 550
477 mlog_entry_void(); 551 mlog_entry_void();
478 552
479 BUG_ON(!last_eb_bh); 553 BUG_ON(!last_eb_bh || !*last_eb_bh);
480 554
481 fe = (struct ocfs2_dinode *) fe_bh->b_data; 555 fe = (struct ocfs2_dinode *) fe_bh->b_data;
482 556
@@ -507,7 +581,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
507 goto bail; 581 goto bail;
508 } 582 }
509 583
510 eb = (struct ocfs2_extent_block *)last_eb_bh->b_data; 584 eb = (struct ocfs2_extent_block *)(*last_eb_bh)->b_data;
511 new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list); 585 new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list);
512 586
513 /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be 587 /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
@@ -568,7 +642,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
568 * journal_dirty erroring as it won't unless we've aborted the 642 * journal_dirty erroring as it won't unless we've aborted the
569 * handle (in which case we would never be here) so reserving 643 * handle (in which case we would never be here) so reserving
570 * the write with journal_access is all we need to do. */ 644 * the write with journal_access is all we need to do. */
571 status = ocfs2_journal_access(handle, inode, last_eb_bh, 645 status = ocfs2_journal_access(handle, inode, *last_eb_bh,
572 OCFS2_JOURNAL_ACCESS_WRITE); 646 OCFS2_JOURNAL_ACCESS_WRITE);
573 if (status < 0) { 647 if (status < 0) {
574 mlog_errno(status); 648 mlog_errno(status);
@@ -601,10 +675,10 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
601 * next_leaf on the previously last-extent-block. */ 675 * next_leaf on the previously last-extent-block. */
602 fe->i_last_eb_blk = cpu_to_le64(new_last_eb_blk); 676 fe->i_last_eb_blk = cpu_to_le64(new_last_eb_blk);
603 677
604 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; 678 eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
605 eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk); 679 eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk);
606 680
607 status = ocfs2_journal_dirty(handle, last_eb_bh); 681 status = ocfs2_journal_dirty(handle, *last_eb_bh);
608 if (status < 0) 682 if (status < 0)
609 mlog_errno(status); 683 mlog_errno(status);
610 status = ocfs2_journal_dirty(handle, fe_bh); 684 status = ocfs2_journal_dirty(handle, fe_bh);
@@ -616,6 +690,14 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
616 mlog_errno(status); 690 mlog_errno(status);
617 } 691 }
618 692
693 /*
694 * Some callers want to track the rightmost leaf so pass it
695 * back here.
696 */
697 brelse(*last_eb_bh);
698 get_bh(new_eb_bhs[0]);
699 *last_eb_bh = new_eb_bhs[0];
700
619 status = 0; 701 status = 0;
620bail: 702bail:
621 if (new_eb_bhs) { 703 if (new_eb_bhs) {
@@ -829,6 +911,87 @@ bail:
829} 911}
830 912
831/* 913/*
914 * Grow a b-tree so that it has more records.
915 *
916 * We might shift the tree depth in which case existing paths should
917 * be considered invalid.
918 *
919 * Tree depth after the grow is returned via *final_depth.
920 *
921 * *last_eb_bh will be updated by ocfs2_add_branch().
922 */
923static int ocfs2_grow_tree(struct inode *inode, handle_t *handle,
924 struct buffer_head *di_bh, int *final_depth,
925 struct buffer_head **last_eb_bh,
926 struct ocfs2_alloc_context *meta_ac)
927{
928 int ret, shift;
929 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
930 int depth = le16_to_cpu(di->id2.i_list.l_tree_depth);
931 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
932 struct buffer_head *bh = NULL;
933
934 BUG_ON(meta_ac == NULL);
935
936 shift = ocfs2_find_branch_target(osb, inode, di_bh, &bh);
937 if (shift < 0) {
938 ret = shift;
939 mlog_errno(ret);
940 goto out;
941 }
942
943 /* We traveled all the way to the bottom of the allocation tree
944 * and didn't find room for any more extents - we need to add
945 * another tree level */
946 if (shift) {
947 BUG_ON(bh);
948 mlog(0, "need to shift tree depth (current = %d)\n", depth);
949
950 /* ocfs2_shift_tree_depth will return us a buffer with
951 * the new extent block (so we can pass that to
952 * ocfs2_add_branch). */
953 ret = ocfs2_shift_tree_depth(osb, handle, inode, di_bh,
954 meta_ac, &bh);
955 if (ret < 0) {
956 mlog_errno(ret);
957 goto out;
958 }
959 depth++;
960 if (depth == 1) {
961 /*
962 * Special case: we have room now if we shifted from
963 * tree_depth 0, so no more work needs to be done.
964 *
965 * We won't be calling add_branch, so pass
966 * back *last_eb_bh as the new leaf. At depth
967 * zero, it should always be null so there's
968 * no reason to brelse.
969 */
970 BUG_ON(*last_eb_bh);
971 get_bh(bh);
972 *last_eb_bh = bh;
973 goto out;
974 }
975 }
976
977 /* call ocfs2_add_branch to add the final part of the tree with
978 * the new data. */
979 mlog(0, "add branch. bh = %p\n", bh);
980 ret = ocfs2_add_branch(osb, handle, inode, di_bh, bh, last_eb_bh,
981 meta_ac);
982 if (ret < 0) {
983 mlog_errno(ret);
984 goto out;
985 }
986
987out:
988 if (final_depth)
989 *final_depth = depth;
990 brelse(bh);
991 return ret;
992}
993
994/*
832 * This is only valid for leaf nodes, which are the only ones that can 995 * This is only valid for leaf nodes, which are the only ones that can
833 * have empty extents anyway. 996 * have empty extents anyway.
834 */ 997 */
@@ -934,6 +1097,22 @@ static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el,
934 1097
935} 1098}
936 1099
1100static void ocfs2_remove_empty_extent(struct ocfs2_extent_list *el)
1101{
1102 int size, num_recs = le16_to_cpu(el->l_next_free_rec);
1103
1104 BUG_ON(num_recs == 0);
1105
1106 if (ocfs2_is_empty_extent(&el->l_recs[0])) {
1107 num_recs--;
1108 size = num_recs * sizeof(struct ocfs2_extent_rec);
1109 memmove(&el->l_recs[0], &el->l_recs[1], size);
1110 memset(&el->l_recs[num_recs], 0,
1111 sizeof(struct ocfs2_extent_rec));
1112 el->l_next_free_rec = cpu_to_le16(num_recs);
1113 }
1114}
1115
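ocfs2_remove_empty_extent() compacts the leaf in place: drop the empty record from slot 0, shift the remaining records left one slot, and zero the vacated tail slot. A userspace sketch of the same shift, modeling an "empty extent" as clusters == 0:

    #include <stdio.h>
    #include <string.h>

    struct rec { unsigned cpos, clusters; };

    static void remove_empty_extent(struct rec *recs, unsigned *next_free)
    {
        /* An empty extent can only live in slot 0. */
        if (*next_free && recs[0].clusters == 0) {
            (*next_free)--;
            memmove(&recs[0], &recs[1], *next_free * sizeof(recs[0]));
            memset(&recs[*next_free], 0, sizeof(recs[0]));
        }
    }

    int main(void)
    {
        struct rec el[3] = { { 0, 0 }, { 4, 8 }, { 12, 4 } };
        unsigned next_free = 3;

        remove_empty_extent(el, &next_free);
        printf("next_free=%u first={%u,%u}\n",
               next_free, el[0].cpos, el[0].clusters);  /* 2 and {4,8} */
        return 0;
    }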
937/* 1116/*
938 * Create an empty extent record . 1117 * Create an empty extent record .
939 * 1118 *
@@ -1211,6 +1390,10 @@ static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec,
1211 * immediately to their right. 1390 * immediately to their right.
1212 */ 1391 */
1213 left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos); 1392 left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos);
1393 if (ocfs2_is_empty_extent(&right_child_el->l_recs[0])) {
1394 BUG_ON(le16_to_cpu(right_child_el->l_next_free_rec) <= 1);
1395 left_clusters = le32_to_cpu(right_child_el->l_recs[1].e_cpos);
1396 }
1214 left_clusters -= le32_to_cpu(left_rec->e_cpos); 1397 left_clusters -= le32_to_cpu(left_rec->e_cpos);
1215 left_rec->e_int_clusters = cpu_to_le32(left_clusters); 1398 left_rec->e_int_clusters = cpu_to_le32(left_clusters);
1216 1399
@@ -1531,10 +1714,16 @@ out:
1531 return ret; 1714 return ret;
1532} 1715}
1533 1716
1717/*
1718 * Extend the transaction by enough credits to complete the rotation,
1719 * and still leave at least the original number of credits allocated
1720 * to this transaction.
1721 */
1534static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth, 1722static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth,
1723 int op_credits,
1535 struct ocfs2_path *path) 1724 struct ocfs2_path *path)
1536{ 1725{
1537 int credits = (path->p_tree_depth - subtree_depth) * 2 + 1; 1726 int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits;
1538 1727
1539 if (handle->h_buffer_credits < credits) 1728 if (handle->h_buffer_credits < credits)
1540 return ocfs2_extend_trans(handle, credits); 1729 return ocfs2_extend_trans(handle, credits);
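Worked example of the credit formula above: rotating at a subtree rooted one level below the root of a depth-3 tree, while keeping 20 credits reserved for the caller's own operation, needs (3 - 1) * 2 + 1 + 20 = 25 buffer credits, roughly a left and a right block at each level below the subtree root, plus the subtree root itself, plus op_credits. The numbers are made up for illustration:

    /* Credit math from ocfs2_extend_rotate_transaction() (sketch). */
    #include <stdio.h>

    int main(void)
    {
        int tree_depth = 3, subtree_depth = 1, op_credits = 20;
        int credits = (tree_depth - subtree_depth) * 2 + 1 + op_credits;

        printf("credits = %d\n", credits);   /* 25 */
        return 0;
    }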
@@ -1568,6 +1757,29 @@ static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path,
1568 return 0; 1757 return 0;
1569} 1758}
1570 1759
1760static int ocfs2_leftmost_rec_contains(struct ocfs2_extent_list *el, u32 cpos)
1761{
1762 int next_free = le16_to_cpu(el->l_next_free_rec);
1763 unsigned int range;
1764 struct ocfs2_extent_rec *rec;
1765
1766 if (next_free == 0)
1767 return 0;
1768
1769 rec = &el->l_recs[0];
1770 if (ocfs2_is_empty_extent(rec)) {
1771 /* Empty list. */
1772 if (next_free == 1)
1773 return 0;
1774 rec = &el->l_recs[1];
1775 }
1776
1777 range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
1778 if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
1779 return 1;
1780 return 0;
1781}
1782
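ocfs2_leftmost_rec_contains() in userspace form: skip a possible empty extent in slot 0, then range-test the first real record (simplified types, illustration only):

    #include <stdio.h>

    struct rec { unsigned cpos, clusters; };

    static int leftmost_contains(const struct rec *recs, int next_free,
                                 unsigned cpos)
    {
        const struct rec *rec;

        if (next_free == 0)
            return 0;
        rec = &recs[0];
        if (rec->clusters == 0) {       /* empty extent in slot 0 */
            if (next_free == 1)
                return 0;               /* nothing but the empty extent */
            rec = &recs[1];
        }
        return cpos >= rec->cpos && cpos < rec->cpos + rec->clusters;
    }

    int main(void)
    {
        struct rec el[] = { { 0, 0 }, { 16, 8 } };  /* empty + one record */

        printf("%d %d\n",
               leftmost_contains(el, 2, 20),        /* 1 */
               leftmost_contains(el, 2, 24));       /* 0 */
        return 0;
    }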
1571/* 1783/*
1572 * Rotate all the records in a btree right one record, starting at insert_cpos. 1784 * Rotate all the records in a btree right one record, starting at insert_cpos.
1573 * 1785 *
@@ -1586,11 +1798,12 @@ static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path,
1586 */ 1798 */
1587static int ocfs2_rotate_tree_right(struct inode *inode, 1799static int ocfs2_rotate_tree_right(struct inode *inode,
1588 handle_t *handle, 1800 handle_t *handle,
1801 enum ocfs2_split_type split,
1589 u32 insert_cpos, 1802 u32 insert_cpos,
1590 struct ocfs2_path *right_path, 1803 struct ocfs2_path *right_path,
1591 struct ocfs2_path **ret_left_path) 1804 struct ocfs2_path **ret_left_path)
1592{ 1805{
1593 int ret, start; 1806 int ret, start, orig_credits = handle->h_buffer_credits;
1594 u32 cpos; 1807 u32 cpos;
1595 struct ocfs2_path *left_path = NULL; 1808 struct ocfs2_path *left_path = NULL;
1596 1809
@@ -1657,9 +1870,9 @@ static int ocfs2_rotate_tree_right(struct inode *inode,
1657 (unsigned long long) 1870 (unsigned long long)
1658 path_leaf_bh(left_path)->b_blocknr); 1871 path_leaf_bh(left_path)->b_blocknr);
1659 1872
1660 if (ocfs2_rotate_requires_path_adjustment(left_path, 1873 if (split == SPLIT_NONE &&
1874 ocfs2_rotate_requires_path_adjustment(left_path,
1661 insert_cpos)) { 1875 insert_cpos)) {
1662 mlog(0, "Path adjustment required\n");
1663 1876
1664 /* 1877 /*
1665 * We've rotated the tree as much as we 1878 * We've rotated the tree as much as we
@@ -1687,7 +1900,7 @@ static int ocfs2_rotate_tree_right(struct inode *inode,
1687 right_path->p_tree_depth); 1900 right_path->p_tree_depth);
1688 1901
1689 ret = ocfs2_extend_rotate_transaction(handle, start, 1902 ret = ocfs2_extend_rotate_transaction(handle, start,
1690 right_path); 1903 orig_credits, right_path);
1691 if (ret) { 1904 if (ret) {
1692 mlog_errno(ret); 1905 mlog_errno(ret);
1693 goto out; 1906 goto out;
@@ -1700,6 +1913,24 @@ static int ocfs2_rotate_tree_right(struct inode *inode,
1700 goto out; 1913 goto out;
1701 } 1914 }
1702 1915
1916 if (split != SPLIT_NONE &&
1917 ocfs2_leftmost_rec_contains(path_leaf_el(right_path),
1918 insert_cpos)) {
1919 /*
1920 * A rotate moves the rightmost left leaf
1921 * record over to the leftmost right leaf
1922 * slot. If we're doing an extent split
1923 * instead of a real insert, then we have to
1924 * check that the extent to be split wasn't
1925 * just moved over. If it was, then we can
1926 * exit here, passing left_path back -
1927 * ocfs2_split_extent() is smart enough to
1928 * search both leaves.
1929 */
1930 *ret_left_path = left_path;
1931 goto out_ret_path;
1932 }
1933
1703 /* 1934 /*
1704 * There is no need to re-read the next right path 1935 * There is no need to re-read the next right path
1705 * as we know that it'll be our current left 1936 * as we know that it'll be our current left
@@ -1722,6 +1953,1031 @@ out_ret_path:
1722 return ret; 1953 return ret;
1723} 1954}
1724 1955
1956static void ocfs2_update_edge_lengths(struct inode *inode, handle_t *handle,
1957 struct ocfs2_path *path)
1958{
1959 int i, idx;
1960 struct ocfs2_extent_rec *rec;
1961 struct ocfs2_extent_list *el;
1962 struct ocfs2_extent_block *eb;
1963 u32 range;
1964
1965 /* Path should always be rightmost. */
1966 eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
1967 BUG_ON(eb->h_next_leaf_blk != 0ULL);
1968
1969 el = &eb->h_list;
1970 BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
1971 idx = le16_to_cpu(el->l_next_free_rec) - 1;
1972 rec = &el->l_recs[idx];
1973 range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
1974
1975 for (i = 0; i < path->p_tree_depth; i++) {
1976 el = path->p_node[i].el;
1977 idx = le16_to_cpu(el->l_next_free_rec) - 1;
1978 rec = &el->l_recs[idx];
1979
1980 rec->e_int_clusters = cpu_to_le32(range);
1981 le32_add_cpu(&rec->e_int_clusters, -le32_to_cpu(rec->e_cpos));
1982
1983 ocfs2_journal_dirty(handle, path->p_node[i].bh);
1984 }
1985}
1986
1987static void ocfs2_unlink_path(struct inode *inode, handle_t *handle,
1988 struct ocfs2_cached_dealloc_ctxt *dealloc,
1989 struct ocfs2_path *path, int unlink_start)
1990{
1991 int ret, i;
1992 struct ocfs2_extent_block *eb;
1993 struct ocfs2_extent_list *el;
1994 struct buffer_head *bh;
1995
1996 for(i = unlink_start; i < path_num_items(path); i++) {
1997 bh = path->p_node[i].bh;
1998
1999 eb = (struct ocfs2_extent_block *)bh->b_data;
2000 /*
2001 * Not all nodes might have had their final count
2002 * decremented by the caller - handle this here.
2003 */
2004 el = &eb->h_list;
2005 if (le16_to_cpu(el->l_next_free_rec) > 1) {
2006 mlog(ML_ERROR,
2007 "Inode %llu, attempted to remove extent block "
2008 "%llu with %u records\n",
2009 (unsigned long long)OCFS2_I(inode)->ip_blkno,
2010 (unsigned long long)le64_to_cpu(eb->h_blkno),
2011 le16_to_cpu(el->l_next_free_rec));
2012
2013 ocfs2_journal_dirty(handle, bh);
2014 ocfs2_remove_from_cache(inode, bh);
2015 continue;
2016 }
2017
2018 el->l_next_free_rec = 0;
2019 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2020
2021 ocfs2_journal_dirty(handle, bh);
2022
2023 ret = ocfs2_cache_extent_block_free(dealloc, eb);
2024 if (ret)
2025 mlog_errno(ret);
2026
2027 ocfs2_remove_from_cache(inode, bh);
2028 }
2029}
2030
2031static void ocfs2_unlink_subtree(struct inode *inode, handle_t *handle,
2032 struct ocfs2_path *left_path,
2033 struct ocfs2_path *right_path,
2034 int subtree_index,
2035 struct ocfs2_cached_dealloc_ctxt *dealloc)
2036{
2037 int i;
2038 struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
2039 struct ocfs2_extent_list *root_el = left_path->p_node[subtree_index].el;
2040 struct ocfs2_extent_list *el;
2041 struct ocfs2_extent_block *eb;
2042
2043 el = path_leaf_el(left_path);
2044
2045 eb = (struct ocfs2_extent_block *)right_path->p_node[subtree_index + 1].bh->b_data;
2046
2047 for(i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++)
2048 if (root_el->l_recs[i].e_blkno == eb->h_blkno)
2049 break;
2050
2051 BUG_ON(i >= le16_to_cpu(root_el->l_next_free_rec));
2052
2053 memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
2054 le16_add_cpu(&root_el->l_next_free_rec, -1);
2055
2056 eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2057 eb->h_next_leaf_blk = 0;
2058
2059 ocfs2_journal_dirty(handle, root_bh);
2060 ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
2061
2062 ocfs2_unlink_path(inode, handle, dealloc, right_path,
2063 subtree_index + 1);
2064}
2065
2066static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
2067 struct ocfs2_path *left_path,
2068 struct ocfs2_path *right_path,
2069 int subtree_index,
2070 struct ocfs2_cached_dealloc_ctxt *dealloc,
2071 int *deleted)
2072{
2073 int ret, i, del_right_subtree = 0, right_has_empty = 0;
2074 struct buffer_head *root_bh, *di_bh = path_root_bh(right_path);
2075 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
2076 struct ocfs2_extent_list *right_leaf_el, *left_leaf_el;
2077 struct ocfs2_extent_block *eb;
2078
2079 *deleted = 0;
2080
2081 right_leaf_el = path_leaf_el(right_path);
2082 left_leaf_el = path_leaf_el(left_path);
2083 root_bh = left_path->p_node[subtree_index].bh;
2084 BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
2085
2086 if (!ocfs2_is_empty_extent(&left_leaf_el->l_recs[0]))
2087 return 0;
2088
2089 eb = (struct ocfs2_extent_block *)path_leaf_bh(right_path)->b_data;
2090 if (ocfs2_is_empty_extent(&right_leaf_el->l_recs[0])) {
2091 /*
2092 * It's legal for us to proceed if the right leaf is
2093 * the rightmost one and it has an empty extent. There
2094 * are two cases to handle - whether the leaf will be
2095 * empty after removal or not. If the leaf isn't empty
2096 * then just remove the empty extent up front. The
2097 * next block will handle empty leaves by flagging
2098 * them for unlink.
2099 *
2100 * Non-rightmost leaves will throw -EAGAIN and the
2101 * caller can manually move the subtree and retry.
2102 */
2103
2104 if (eb->h_next_leaf_blk != 0ULL)
2105 return -EAGAIN;
2106
2107 if (le16_to_cpu(right_leaf_el->l_next_free_rec) > 1) {
2108 ret = ocfs2_journal_access(handle, inode,
2109 path_leaf_bh(right_path),
2110 OCFS2_JOURNAL_ACCESS_WRITE);
2111 if (ret) {
2112 mlog_errno(ret);
2113 goto out;
2114 }
2115
2116 ocfs2_remove_empty_extent(right_leaf_el);
2117 } else
2118 right_has_empty = 1;
2119 }
2120
2121 if (eb->h_next_leaf_blk == 0ULL &&
2122 le16_to_cpu(right_leaf_el->l_next_free_rec) == 1) {
2123 /*
2124 * We have to update i_last_eb_blk during the meta
2125 * data delete.
2126 */
2127 ret = ocfs2_journal_access(handle, inode, di_bh,
2128 OCFS2_JOURNAL_ACCESS_WRITE);
2129 if (ret) {
2130 mlog_errno(ret);
2131 goto out;
2132 }
2133
2134 del_right_subtree = 1;
2135 }
2136
2137 /*
2138 * Getting here with an empty extent in the right path implies
2139 * that it's the rightmost path and will be deleted.
2140 */
2141 BUG_ON(right_has_empty && !del_right_subtree);
2142
2143 ret = ocfs2_journal_access(handle, inode, root_bh,
2144 OCFS2_JOURNAL_ACCESS_WRITE);
2145 if (ret) {
2146 mlog_errno(ret);
2147 goto out;
2148 }
2149
2150 for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
2151 ret = ocfs2_journal_access(handle, inode,
2152 right_path->p_node[i].bh,
2153 OCFS2_JOURNAL_ACCESS_WRITE);
2154 if (ret) {
2155 mlog_errno(ret);
2156 goto out;
2157 }
2158
2159 ret = ocfs2_journal_access(handle, inode,
2160 left_path->p_node[i].bh,
2161 OCFS2_JOURNAL_ACCESS_WRITE);
2162 if (ret) {
2163 mlog_errno(ret);
2164 goto out;
2165 }
2166 }
2167
2168 if (!right_has_empty) {
2169 /*
2170 * Only do this if we're moving a real
2171 * record. Otherwise, the action is delayed until
2172 * after removal of the right path in which case we
2173 * can do a simple shift to remove the empty extent.
2174 */
2175 ocfs2_rotate_leaf(left_leaf_el, &right_leaf_el->l_recs[0]);
2176 memset(&right_leaf_el->l_recs[0], 0,
2177 sizeof(struct ocfs2_extent_rec));
2178 }
2179 if (eb->h_next_leaf_blk == 0ULL) {
2180 /*
2181 * Move recs over to get rid of empty extent, decrease
2182 * next_free. This is allowed to remove the last
2183 * extent in our leaf (setting l_next_free_rec to
2184 * zero) - the delete code below won't care.
2185 */
2186 ocfs2_remove_empty_extent(right_leaf_el);
2187 }
2188
2189 ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
2190 if (ret)
2191 mlog_errno(ret);
2192 ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
2193 if (ret)
2194 mlog_errno(ret);
2195
2196 if (del_right_subtree) {
2197 ocfs2_unlink_subtree(inode, handle, left_path, right_path,
2198 subtree_index, dealloc);
2199 ocfs2_update_edge_lengths(inode, handle, left_path);
2200
2201 eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2202 di->i_last_eb_blk = eb->h_blkno;
2203
2204 /*
2205 * Removal of the extent in the left leaf was skipped
2206 * above so we could delete the right path
2207 * first.
2208 */
2209 if (right_has_empty)
2210 ocfs2_remove_empty_extent(left_leaf_el);
2211
2212 ret = ocfs2_journal_dirty(handle, di_bh);
2213 if (ret)
2214 mlog_errno(ret);
2215
2216 *deleted = 1;
2217 } else
2218 ocfs2_complete_edge_insert(inode, handle, left_path, right_path,
2219 subtree_index);
2220
2221out:
2222 return ret;
2223}
2224
2225/*
2226 * Given a full path, determine what cpos value would return us a path
2227 * containing the leaf immediately to the right of the current one.
2228 *
2229 * Will return zero if the path passed in is already the rightmost path.
2230 *
2231 * This looks similar, but is subtly different to
2232 * ocfs2_find_cpos_for_left_leaf().
2233 */
2234static int ocfs2_find_cpos_for_right_leaf(struct super_block *sb,
2235 struct ocfs2_path *path, u32 *cpos)
2236{
2237 int i, j, ret = 0;
2238 u64 blkno;
2239 struct ocfs2_extent_list *el;
2240
2241 *cpos = 0;
2242
2243 if (path->p_tree_depth == 0)
2244 return 0;
2245
2246 blkno = path_leaf_bh(path)->b_blocknr;
2247
2248 /* Start at the tree node just above the leaf and work our way up. */
2249 i = path->p_tree_depth - 1;
2250 while (i >= 0) {
2251 int next_free;
2252
2253 el = path->p_node[i].el;
2254
2255 /*
2256 * Find the extent record just after the one in our
2257 * path.
2258 */
2259 next_free = le16_to_cpu(el->l_next_free_rec);
2260 for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) {
2261 if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) {
2262 if (j == (next_free - 1)) {
2263 if (i == 0) {
2264 /*
2265 * We've determined that the
2266 * path specified is already
2267 * the rightmost one - return a
2268 * cpos of zero.
2269 */
2270 goto out;
2271 }
2272 /*
2273 * The rightmost record points to our
2274 * leaf - we need to travel up the
2275 * tree one level.
2276 */
2277 goto next_node;
2278 }
2279
2280 *cpos = le32_to_cpu(el->l_recs[j + 1].e_cpos);
2281 goto out;
2282 }
2283 }
2284
2285 /*
2286 * If we got here, we never found a valid node where
2287 * the tree indicated one should be.
2288 */
2289 ocfs2_error(sb,
2290 "Invalid extent tree at extent block %llu\n",
2291 (unsigned long long)blkno);
2292 ret = -EROFS;
2293 goto out;
2294
2295next_node:
2296 blkno = path->p_node[i].bh->b_blocknr;
2297 i--;
2298 }
2299
2300out:
2301 return ret;
2302}
2303
2304static int ocfs2_rotate_rightmost_leaf_left(struct inode *inode,
2305 handle_t *handle,
2306 struct buffer_head *bh,
2307 struct ocfs2_extent_list *el)
2308{
2309 int ret;
2310
2311 if (!ocfs2_is_empty_extent(&el->l_recs[0]))
2312 return 0;
2313
2314 ret = ocfs2_journal_access(handle, inode, bh,
2315 OCFS2_JOURNAL_ACCESS_WRITE);
2316 if (ret) {
2317 mlog_errno(ret);
2318 goto out;
2319 }
2320
2321 ocfs2_remove_empty_extent(el);
2322
2323 ret = ocfs2_journal_dirty(handle, bh);
2324 if (ret)
2325 mlog_errno(ret);
2326
2327out:
2328 return ret;
2329}
2330
2331static int __ocfs2_rotate_tree_left(struct inode *inode,
2332 handle_t *handle, int orig_credits,
2333 struct ocfs2_path *path,
2334 struct ocfs2_cached_dealloc_ctxt *dealloc,
2335 struct ocfs2_path **empty_extent_path)
2336{
2337 int ret, subtree_root, deleted;
2338 u32 right_cpos;
2339 struct ocfs2_path *left_path = NULL;
2340 struct ocfs2_path *right_path = NULL;
2341
2342 BUG_ON(!ocfs2_is_empty_extent(&(path_leaf_el(path)->l_recs[0])));
2343
2344 *empty_extent_path = NULL;
2345
2346 ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, path,
2347 &right_cpos);
2348 if (ret) {
2349 mlog_errno(ret);
2350 goto out;
2351 }
2352
2353 left_path = ocfs2_new_path(path_root_bh(path),
2354 path_root_el(path));
2355 if (!left_path) {
2356 ret = -ENOMEM;
2357 mlog_errno(ret);
2358 goto out;
2359 }
2360
2361 ocfs2_cp_path(left_path, path);
2362
2363 right_path = ocfs2_new_path(path_root_bh(path),
2364 path_root_el(path));
2365 if (!right_path) {
2366 ret = -ENOMEM;
2367 mlog_errno(ret);
2368 goto out;
2369 }
2370
2371 while (right_cpos) {
2372 ret = ocfs2_find_path(inode, right_path, right_cpos);
2373 if (ret) {
2374 mlog_errno(ret);
2375 goto out;
2376 }
2377
2378 subtree_root = ocfs2_find_subtree_root(inode, left_path,
2379 right_path);
2380
2381 mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n",
2382 subtree_root,
2383 (unsigned long long)
2384 right_path->p_node[subtree_root].bh->b_blocknr,
2385 right_path->p_tree_depth);
2386
2387 ret = ocfs2_extend_rotate_transaction(handle, subtree_root,
2388 orig_credits, left_path);
2389 if (ret) {
2390 mlog_errno(ret);
2391 goto out;
2392 }
2393
2394 ret = ocfs2_rotate_subtree_left(inode, handle, left_path,
2395 right_path, subtree_root,
2396 dealloc, &deleted);
2397 if (ret == -EAGAIN) {
2398 /*
2399 * The rotation has to temporarily stop due to
2400 * the right subtree having an empty
2401 * extent. Pass it back to the caller for a
2402 * fixup.
2403 */
2404 *empty_extent_path = right_path;
2405 right_path = NULL;
2406 goto out;
2407 }
2408 if (ret) {
2409 mlog_errno(ret);
2410 goto out;
2411 }
2412
2413 /*
2414 * The subtree rotate might have removed records on
2415 * the rightmost edge. If so, then rotation is
2416 * complete.
2417 */
2418 if (deleted)
2419 break;
2420
2421 ocfs2_mv_path(left_path, right_path);
2422
2423 ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, left_path,
2424 &right_cpos);
2425 if (ret) {
2426 mlog_errno(ret);
2427 goto out;
2428 }
2429 }
2430
2431out:
2432 ocfs2_free_path(right_path);
2433 ocfs2_free_path(left_path);
2434
2435 return ret;
2436}
2437
2438static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
2439 struct ocfs2_path *path,
2440 struct ocfs2_cached_dealloc_ctxt *dealloc)
2441{
2442 int ret, subtree_index;
2443 u32 cpos;
2444 struct ocfs2_path *left_path = NULL;
2445 struct ocfs2_dinode *di;
2446 struct ocfs2_extent_block *eb;
2447 struct ocfs2_extent_list *el;
2448
2449 /*
2450 * XXX: This code assumes that the root is an inode, which is
2451 * true for now but may change as tree code gets generic.
2452 */
2453 di = (struct ocfs2_dinode *)path_root_bh(path)->b_data;
2454 if (!OCFS2_IS_VALID_DINODE(di)) {
2455 ret = -EIO;
2456 ocfs2_error(inode->i_sb,
2457 "Inode %llu has invalid path root",
2458 (unsigned long long)OCFS2_I(inode)->ip_blkno);
2459 goto out;
2460 }
2461
2462 /*
2463 * There are two ways we handle this, depending on
2464 * whether path is the only existing one.
2465 */
2466 ret = ocfs2_extend_rotate_transaction(handle, 0,
2467 handle->h_buffer_credits,
2468 path);
2469 if (ret) {
2470 mlog_errno(ret);
2471 goto out;
2472 }
2473
2474 ret = ocfs2_journal_access_path(inode, handle, path);
2475 if (ret) {
2476 mlog_errno(ret);
2477 goto out;
2478 }
2479
2480 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos);
2481 if (ret) {
2482 mlog_errno(ret);
2483 goto out;
2484 }
2485
2486 if (cpos) {
2487 /*
2488 * We have a path to the left of this one - it needs
2489 * an update too.
2490 */
2491 left_path = ocfs2_new_path(path_root_bh(path),
2492 path_root_el(path));
2493 if (!left_path) {
2494 ret = -ENOMEM;
2495 mlog_errno(ret);
2496 goto out;
2497 }
2498
2499 ret = ocfs2_find_path(inode, left_path, cpos);
2500 if (ret) {
2501 mlog_errno(ret);
2502 goto out;
2503 }
2504
2505 ret = ocfs2_journal_access_path(inode, handle, left_path);
2506 if (ret) {
2507 mlog_errno(ret);
2508 goto out;
2509 }
2510
2511 subtree_index = ocfs2_find_subtree_root(inode, left_path, path);
2512
2513 ocfs2_unlink_subtree(inode, handle, left_path, path,
2514 subtree_index, dealloc);
2515 ocfs2_update_edge_lengths(inode, handle, left_path);
2516
2517 eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2518 di->i_last_eb_blk = eb->h_blkno;
2519 } else {
2520 /*
2521 * 'path' is also the leftmost path which
2522 * means it must be the only one. This gets
2523 * handled differently because we want to
2524 * revert the inode back to having extents
2525 * in-line.
2526 */
2527 ocfs2_unlink_path(inode, handle, dealloc, path, 1);
2528
2529 el = &di->id2.i_list;
2530 el->l_tree_depth = 0;
2531 el->l_next_free_rec = 0;
2532 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2533
2534 di->i_last_eb_blk = 0;
2535 }
2536
2537 ocfs2_journal_dirty(handle, path_root_bh(path));
2538
2539out:
2540 ocfs2_free_path(left_path);
2541 return ret;
2542}
2543
2544/*
2545 * Left rotation of btree records.
2546 *
2547 * In many ways, this is (unsurprisingly) the opposite of right
2548 * rotation. We start at some non-rightmost path containing an empty
2549 * extent in the leaf block. The code works its way to the rightmost
2550 * path by rotating records to the left in every subtree.
2551 *
2552 * This is used by any code which reduces the number of extent records
2553 * in a leaf. After removal, an empty record should be placed in the
2554 * leftmost list position.
2555 *
2556 * This won't handle a length update of the rightmost path records if
2557 * the rightmost tree leaf record is removed, so the caller is
2558 * responsible for detecting and correcting that.
2559 */
2560static int ocfs2_rotate_tree_left(struct inode *inode, handle_t *handle,
2561 struct ocfs2_path *path,
2562 struct ocfs2_cached_dealloc_ctxt *dealloc)
2563{
2564 int ret, orig_credits = handle->h_buffer_credits;
2565 struct ocfs2_path *tmp_path = NULL, *restart_path = NULL;
2566 struct ocfs2_extent_block *eb;
2567 struct ocfs2_extent_list *el;
2568
2569 el = path_leaf_el(path);
2570 if (!ocfs2_is_empty_extent(&el->l_recs[0]))
2571 return 0;
2572
2573 if (path->p_tree_depth == 0) {
2574rightmost_no_delete:
2575 /*
2576 * In-inode extents. This is trivially handled, so do
2577 * it up front.
2578 */
2579 ret = ocfs2_rotate_rightmost_leaf_left(inode, handle,
2580 path_leaf_bh(path),
2581 path_leaf_el(path));
2582 if (ret)
2583 mlog_errno(ret);
2584 goto out;
2585 }
2586
2587 /*
2588 * Handle the rightmost branch now. There are several cases:
2589 * 1) simple rotation leaving records in there. That's trivial.
2590 * 2) rotation requiring a branch delete - there's no more
2591 * records left. Two cases of this:
2592 * a) There are branches to the left.
2593 * b) This is also the leftmost (the only) branch.
2594 *
2595 * 1) is handled via ocfs2_rotate_rightmost_leaf_left()
2596 * 2a) we need the left branch so that we can update it with the unlink
2597 * 2b) we need to bring the inode back to inline extents.
2598 */
2599
2600 eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
2601 el = &eb->h_list;
2602 if (eb->h_next_leaf_blk == 0) {
2603 /*
2604 * This gets a bit tricky if we're going to delete the
2605 * rightmost path. Get the other cases out of the way
2606 * first.
2607 */
2608 if (le16_to_cpu(el->l_next_free_rec) > 1)
2609 goto rightmost_no_delete;
2610
2611 if (le16_to_cpu(el->l_next_free_rec) == 0) {
2612 ret = -EIO;
2613 ocfs2_error(inode->i_sb,
2614 "Inode %llu has empty extent block at %llu",
2615 (unsigned long long)OCFS2_I(inode)->ip_blkno,
2616 (unsigned long long)le64_to_cpu(eb->h_blkno));
2617 goto out;
2618 }
2619
2620 /*
2621 * XXX: The caller cannot trust "path" any more after
2622 * this as it will have been deleted. What do we do?
2623 *
2624 * In theory the rotate-for-merge code will never get
2625 * here because it'll always ask for a rotate in a
2626 * nonempty list.
2627 */
2628
2629 ret = ocfs2_remove_rightmost_path(inode, handle, path,
2630 dealloc);
2631 if (ret)
2632 mlog_errno(ret);
2633 goto out;
2634 }
2635
2636 /*
2637 * Now we can loop, remembering the path we get from -EAGAIN
2638 * and restarting from there.
2639 */
2640try_rotate:
2641 ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits, path,
2642 dealloc, &restart_path);
2643 if (ret && ret != -EAGAIN) {
2644 mlog_errno(ret);
2645 goto out;
2646 }
2647
2648 while (ret == -EAGAIN) {
2649 tmp_path = restart_path;
2650 restart_path = NULL;
2651
2652 ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits,
2653 tmp_path, dealloc,
2654 &restart_path);
2655 if (ret && ret != -EAGAIN) {
2656 mlog_errno(ret);
2657 goto out;
2658 }
2659
2660 ocfs2_free_path(tmp_path);
2661 tmp_path = NULL;
2662
2663 if (ret == 0)
2664 goto try_rotate;
2665 }
2666
2667out:
2668 ocfs2_free_path(tmp_path);
2669 ocfs2_free_path(restart_path);
2670 return ret;
2671}
2672
2673static void ocfs2_cleanup_merge(struct ocfs2_extent_list *el,
2674 int index)
2675{
2676 struct ocfs2_extent_rec *rec = &el->l_recs[index];
2677 unsigned int size;
2678
2679 if (rec->e_leaf_clusters == 0) {
2680 /*
2681 * We consumed all of the merged-from record. An empty
2682 * extent cannot exist anywhere but the 1st array
2683 * position, so move things over if the merged-from
2684 * record doesn't occupy that position.
2685 *
2686 * This creates a new empty extent so the caller
2687 * should be smart enough to have removed any existing
2688 * ones.
2689 */
2690 if (index > 0) {
2691 BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0]));
2692 size = index * sizeof(struct ocfs2_extent_rec);
2693 memmove(&el->l_recs[1], &el->l_recs[0], size);
2694 }
2695
2696 /*
2697 * Always memset - the caller doesn't check whether it
2698 * created an empty extent, so there could be junk in
2699 * the other fields.
2700 */
2701 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2702 }
2703}
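
[Editor's note: the slot shuffle above is compact; a minimal userspace model may help. Plain types stand in for the little-endian on-disk fields; struct rec is hypothetical and is reused by the later sketches.]

#include <string.h>

struct rec { unsigned int cpos, clusters; unsigned long long blkno; };

/* Model of ocfs2_cleanup_merge(): if the merged-from record at 'index'
 * was fully consumed, slide records 0..index-1 up by one slot and
 * leave the empty record in position 0. */
static void cleanup_merge(struct rec *recs, int index)
{
	if (recs[index].clusters == 0) {
		if (index > 0)
			memmove(&recs[1], &recs[0],
				index * sizeof(struct rec));
		memset(&recs[0], 0, sizeof(struct rec));
	}
}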
2704
2705/*
2706 * Remove split_rec clusters from the record at index and merge them
2707 * onto the beginning of the record at index + 1.
2708 */
2709static int ocfs2_merge_rec_right(struct inode *inode, struct buffer_head *bh,
2710 handle_t *handle,
2711 struct ocfs2_extent_rec *split_rec,
2712 struct ocfs2_extent_list *el, int index)
2713{
2714 int ret;
2715 unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
2716 struct ocfs2_extent_rec *left_rec;
2717 struct ocfs2_extent_rec *right_rec;
2718
2719 BUG_ON(index >= le16_to_cpu(el->l_next_free_rec));
2720
2721 left_rec = &el->l_recs[index];
2722 right_rec = &el->l_recs[index + 1];
2723
2724 ret = ocfs2_journal_access(handle, inode, bh,
2725 OCFS2_JOURNAL_ACCESS_WRITE);
2726 if (ret) {
2727 mlog_errno(ret);
2728 goto out;
2729 }
2730
2731 le16_add_cpu(&left_rec->e_leaf_clusters, -split_clusters);
2732
2733 le32_add_cpu(&right_rec->e_cpos, -split_clusters);
2734 le64_add_cpu(&right_rec->e_blkno,
2735 -ocfs2_clusters_to_blocks(inode->i_sb, split_clusters));
2736 le16_add_cpu(&right_rec->e_leaf_clusters, split_clusters);
2737
2738 ocfs2_cleanup_merge(el, index);
2739
2740 ret = ocfs2_journal_dirty(handle, bh);
2741 if (ret)
2742 mlog_errno(ret);
2743
2744out:
2745 return ret;
2746}
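
[Editor's note: continuing the model, the merge-right arithmetic transfers split_clusters from the tail of the left record to the head of the right one. The numbers in the comment are purely illustrative, assuming 8 blocks per cluster.]

/* Donate 'split' clusters from the tail of 'left' to the head of
 * 'right'; the two regions must be logically and physically adjacent. */
static void merge_right(struct rec *left, struct rec *right,
			unsigned int split, unsigned int blks_per_clu)
{
	left->clusters -= split;
	right->cpos -= split;
	right->blkno -= (unsigned long long)split * blks_per_clu;
	right->clusters += split;
}

/*
 * e.g. left  = {cpos 0, 8 clusters, blkno 1000}
 *      right = {cpos 8, 4 clusters, blkno 1064}   (8 blocks/cluster)
 * merge_right(&left, &right, 2, 8) yields
 *      left  = {cpos 0, 6 clusters, blkno 1000}
 *      right = {cpos 6, 6 clusters, blkno 1048}
 */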
2747
2748/*
2749 * Remove split_rec clusters from the record at index and merge them
2750 * onto the tail of the record at index - 1.
2751 */
2752static int ocfs2_merge_rec_left(struct inode *inode, struct buffer_head *bh,
2753 handle_t *handle,
2754 struct ocfs2_extent_rec *split_rec,
2755 struct ocfs2_extent_list *el, int index)
2756{
2757 int ret, has_empty_extent = 0;
2758 unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
2759 struct ocfs2_extent_rec *left_rec;
2760 struct ocfs2_extent_rec *right_rec;
2761
2762 BUG_ON(index <= 0);
2763
2764 left_rec = &el->l_recs[index - 1];
2765 right_rec = &el->l_recs[index];
2766 if (ocfs2_is_empty_extent(&el->l_recs[0]))
2767 has_empty_extent = 1;
2768
2769 ret = ocfs2_journal_access(handle, inode, bh,
2770 OCFS2_JOURNAL_ACCESS_WRITE);
2771 if (ret) {
2772 mlog_errno(ret);
2773 goto out;
2774 }
2775
2776 if (has_empty_extent && index == 1) {
2777 /*
2778 * The easy case - we can just plop the record right in.
2779 */
2780 *left_rec = *split_rec;
2781
2782 has_empty_extent = 0;
2783 } else {
2784 le16_add_cpu(&left_rec->e_leaf_clusters, split_clusters);
2785 }
2786
2787 le32_add_cpu(&right_rec->e_cpos, split_clusters);
2788 le64_add_cpu(&right_rec->e_blkno,
2789 ocfs2_clusters_to_blocks(inode->i_sb, split_clusters));
2790 le16_add_cpu(&right_rec->e_leaf_clusters, -split_clusters);
2791
2792 ocfs2_cleanup_merge(el, index);
2793
2794 ret = ocfs2_journal_dirty(handle, bh);
2795 if (ret)
2796 mlog_errno(ret);
2797
2798out:
2799 return ret;
2800}
2801
2802static int ocfs2_try_to_merge_extent(struct inode *inode,
2803 handle_t *handle,
2804 struct ocfs2_path *left_path,
2805 int split_index,
2806 struct ocfs2_extent_rec *split_rec,
2807 struct ocfs2_cached_dealloc_ctxt *dealloc,
2808 struct ocfs2_merge_ctxt *ctxt)
2809
2810{
2811 int ret = 0, delete_tail_recs = 0;
2812 struct ocfs2_extent_list *el = path_leaf_el(left_path);
2813 struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
2814
2815 BUG_ON(ctxt->c_contig_type == CONTIG_NONE);
2816
2817 if (ctxt->c_split_covers_rec) {
2818 delete_tail_recs++;
2819
2820 if (ctxt->c_contig_type == CONTIG_LEFTRIGHT ||
2821 ctxt->c_has_empty_extent)
2822 delete_tail_recs++;
2823
2824 if (ctxt->c_has_empty_extent) {
2825 /*
2826 * The merge code will need to create an empty
2827 * extent to take the place of the newly
2828 * emptied slot. Remove any pre-existing empty
2829 * extents - having more than one in a leaf is
2830 * illegal.
2831 */
2832 ret = ocfs2_rotate_tree_left(inode, handle, left_path,
2833 dealloc);
2834 if (ret) {
2835 mlog_errno(ret);
2836 goto out;
2837 }
2838 split_index--;
2839 rec = &el->l_recs[split_index];
2840 }
2841 }
2842
2843 if (ctxt->c_contig_type == CONTIG_LEFTRIGHT) {
2844 /*
2845 * Left-right contig implies this.
2846 */
2847 BUG_ON(!ctxt->c_split_covers_rec);
2848 BUG_ON(split_index == 0);
2849
2850 /*
2851 * Since the leftright insert always covers the entire
2852 * extent, this call will delete the insert record
2853 * entirely, resulting in an empty extent record added to
2854 * the extent block.
2855 *
2856 * Since the adding of an empty extent shifts
2857 * everything back to the right, there's no need to
2858 * update split_index here.
2859 */
2860 ret = ocfs2_merge_rec_left(inode, path_leaf_bh(left_path),
2861 handle, split_rec, el, split_index);
2862 if (ret) {
2863 mlog_errno(ret);
2864 goto out;
2865 }
2866
2867 /*
2868 * We can only get this from logic error above.
2869 */
2870 BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));
2871
2872 /*
2873 * The left merge left us with an empty extent, remove
2874 * it.
2875 */
2876 ret = ocfs2_rotate_tree_left(inode, handle, left_path, dealloc);
2877 if (ret) {
2878 mlog_errno(ret);
2879 goto out;
2880 }
2881 split_index--;
2882 rec = &el->l_recs[split_index];
2883
2884 /*
2885 * Note that we don't pass split_rec here on purpose -
2886 * we've merged it into the left side.
2887 */
2888 ret = ocfs2_merge_rec_right(inode, path_leaf_bh(left_path),
2889 handle, rec, el, split_index);
2890 if (ret) {
2891 mlog_errno(ret);
2892 goto out;
2893 }
2894
2895 BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));
2896
2897 ret = ocfs2_rotate_tree_left(inode, handle, left_path,
2898 dealloc);
2899 /*
2900 * Error from this last rotate is not critical, so
2901 * print but don't bubble it up.
2902 */
2903 if (ret)
2904 mlog_errno(ret);
2905 ret = 0;
2906 } else {
2907 /*
2908 * Merge a record to the left or right.
2909 *
2910 * 'contig_type' is relative to the existing record,
2911 * so for example, if we're "right contig", it's to
2912 * the record on the left (hence the left merge).
2913 */
2914 if (ctxt->c_contig_type == CONTIG_RIGHT) {
2915 ret = ocfs2_merge_rec_left(inode,
2916 path_leaf_bh(left_path),
2917 handle, split_rec, el,
2918 split_index);
2919 if (ret) {
2920 mlog_errno(ret);
2921 goto out;
2922 }
2923 } else {
2924 ret = ocfs2_merge_rec_right(inode,
2925 path_leaf_bh(left_path),
2926 handle, split_rec, el,
2927 split_index);
2928 if (ret) {
2929 mlog_errno(ret);
2930 goto out;
2931 }
2932 }
2933
2934 if (ctxt->c_split_covers_rec) {
2935 /*
2936 * The merge may have left an empty extent in
2937 * our leaf. Try to rotate it away.
2938 */
2939 ret = ocfs2_rotate_tree_left(inode, handle, left_path,
2940 dealloc);
2941 if (ret)
2942 mlog_errno(ret);
2943 ret = 0;
2944 }
2945 }
2946
2947out:
2948 return ret;
2949}
2950
2951static void ocfs2_subtract_from_rec(struct super_block *sb,
2952 enum ocfs2_split_type split,
2953 struct ocfs2_extent_rec *rec,
2954 struct ocfs2_extent_rec *split_rec)
2955{
2956 u64 len_blocks;
2957
2958 len_blocks = ocfs2_clusters_to_blocks(sb,
2959 le16_to_cpu(split_rec->e_leaf_clusters));
2960
2961 if (split == SPLIT_LEFT) {
2962 /*
2963 * Region is on the left edge of the existing
2964 * record.
2965 */
2966 le32_add_cpu(&rec->e_cpos,
2967 le16_to_cpu(split_rec->e_leaf_clusters));
2968 le64_add_cpu(&rec->e_blkno, len_blocks);
2969 le16_add_cpu(&rec->e_leaf_clusters,
2970 -le16_to_cpu(split_rec->e_leaf_clusters));
2971 } else {
2972 /*
2973 * Region is on the right edge of the existing
2974 * record.
2975 */
2976 le16_add_cpu(&rec->e_leaf_clusters,
2977 -le16_to_cpu(split_rec->e_leaf_clusters));
2978 }
2979}
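
[Editor's note: in the same terms, ocfs2_subtract_from_rec() is the split-side primitive; a sketch using the struct rec model from above.]

/* SPLIT_LEFT carves the split region off the front of 'rec', so its
 * logical and physical start both advance... */
static void subtract_from(struct rec *rec, unsigned int split,
			  unsigned int blks_per_clu, int split_left)
{
	if (split_left) {
		rec->cpos += split;
		rec->blkno += (unsigned long long)split * blks_per_clu;
	}
	/* ...while SPLIT_RIGHT only trims the length. */
	rec->clusters -= split;
}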
2980
1725 2981/*
1726 2982 * Do the final bits of extent record insertion at the target leaf
1727 2983 * list. If this leaf is part of an allocation tree, it is assumed
@@ -1738,6 +2994,15 @@ static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec,
1738 2994
1739 2995 BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
1740 2996
2997 if (insert->ins_split != SPLIT_NONE) {
2998 i = ocfs2_search_extent_list(el, le32_to_cpu(insert_rec->e_cpos));
2999 BUG_ON(i == -1);
3000 rec = &el->l_recs[i];
3001 ocfs2_subtract_from_rec(inode->i_sb, insert->ins_split, rec,
3002 insert_rec);
3003 goto rotate;
3004 }
3005
1741 3006 /*
1742 3007 * Contiguous insert - either left or right.
1743 3008 */
@@ -1792,6 +3057,7 @@ static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec,
1792 3057 return;
1793 3058 }
1794 3059
3060rotate:
1795 3061 /*
1796 3062 * Ok, we have to rotate.
1797 3063 *
@@ -1815,13 +3081,53 @@ static inline void ocfs2_update_dinode_clusters(struct inode *inode,
1815 3081 spin_unlock(&OCFS2_I(inode)->ip_lock);
1816 3082}
1817 3083
3084static void ocfs2_adjust_rightmost_records(struct inode *inode,
3085 handle_t *handle,
3086 struct ocfs2_path *path,
3087 struct ocfs2_extent_rec *insert_rec)
3088{
3089 int ret, i, next_free;
3090 struct buffer_head *bh;
3091 struct ocfs2_extent_list *el;
3092 struct ocfs2_extent_rec *rec;
3093
3094 /*
3095 * Update everything except the leaf block.
3096 */
3097 for (i = 0; i < path->p_tree_depth; i++) {
3098 bh = path->p_node[i].bh;
3099 el = path->p_node[i].el;
3100
3101 next_free = le16_to_cpu(el->l_next_free_rec);
3102 if (next_free == 0) {
3103 ocfs2_error(inode->i_sb,
3104 "Dinode %llu has a bad extent list",
3105 (unsigned long long)OCFS2_I(inode)->ip_blkno);
3106 ret = -EIO;
3107 return;
3108 }
3109
3110 rec = &el->l_recs[next_free - 1];
3111
3112 rec->e_int_clusters = insert_rec->e_cpos;
3113 le32_add_cpu(&rec->e_int_clusters,
3114 le16_to_cpu(insert_rec->e_leaf_clusters));
3115 le32_add_cpu(&rec->e_int_clusters,
3116 -le32_to_cpu(rec->e_cpos));
3117
3118 ret = ocfs2_journal_dirty(handle, bh);
3119 if (ret)
3120 mlog_errno(ret);
3121
3122 }
3123}
3124
1818 3125static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
1819 3126 struct ocfs2_extent_rec *insert_rec,
1820 3127 struct ocfs2_path *right_path,
1821 3128 struct ocfs2_path **ret_left_path)
1822 3129{
1823 int ret, i, next_free;
3130 int ret, next_free;
1824 struct buffer_head *bh;
1825 3131 struct ocfs2_extent_list *el;
1826 3132 struct ocfs2_path *left_path = NULL;
1827 3133
@@ -1887,40 +3193,7 @@ static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
1887 3193 goto out;
1888 3194 }
1889 3195
1890 el = path_root_el(right_path);
3196 ocfs2_adjust_rightmost_records(inode, handle, right_path, insert_rec);
1891 bh = path_root_bh(right_path);
1892 i = 0;
1893 while (1) {
1894 struct ocfs2_extent_rec *rec;
1895
1896 next_free = le16_to_cpu(el->l_next_free_rec);
1897 if (next_free == 0) {
1898 ocfs2_error(inode->i_sb,
1899 "Dinode %llu has a bad extent list",
1900 (unsigned long long)OCFS2_I(inode)->ip_blkno);
1901 ret = -EIO;
1902 goto out;
1903 }
1904
1905 rec = &el->l_recs[next_free - 1];
1906
1907 rec->e_int_clusters = insert_rec->e_cpos;
1908 le32_add_cpu(&rec->e_int_clusters,
1909 le16_to_cpu(insert_rec->e_leaf_clusters));
1910 le32_add_cpu(&rec->e_int_clusters,
1911 -le32_to_cpu(rec->e_cpos));
1912
1913 ret = ocfs2_journal_dirty(handle, bh);
1914 if (ret)
1915 mlog_errno(ret);
1916
1917 /* Don't touch the leaf node */
1918 if (++i >= right_path->p_tree_depth)
1919 break;
1920
1921 bh = right_path->p_node[i].bh;
1922 el = right_path->p_node[i].el;
1923 }
1924 3197
1925 3198 *ret_left_path = left_path;
1926 3199 ret = 0;
@@ -1931,6 +3204,83 @@ out:
1931 3204 return ret;
1932 3205}
1933 3206
3207static void ocfs2_split_record(struct inode *inode,
3208 struct ocfs2_path *left_path,
3209 struct ocfs2_path *right_path,
3210 struct ocfs2_extent_rec *split_rec,
3211 enum ocfs2_split_type split)
3212{
3213 int index;
3214 u32 cpos = le32_to_cpu(split_rec->e_cpos);
3215 struct ocfs2_extent_list *left_el = NULL, *right_el, *insert_el, *el;
3216 struct ocfs2_extent_rec *rec, *tmprec;
3217
3218 right_el = path_leaf_el(right_path);
3219 if (left_path)
3220 left_el = path_leaf_el(left_path);
3221
3222 el = right_el;
3223 insert_el = right_el;
3224 index = ocfs2_search_extent_list(el, cpos);
3225 if (index != -1) {
3226 if (index == 0 && left_path) {
3227 BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0]));
3228
3229 /*
3230 * This typically means that the record
3231 * started in the left path but moved to the
3232 * right as a result of rotation. We either
3233 * move the existing record to the left, or we
3234 * do the later insert there.
3235 *
3236 * In this case, the left path should always
3237 * exist as the rotate code will have passed
3238 * it back for a post-insert update.
3239 */
3240
3241 if (split == SPLIT_LEFT) {
3242 /*
3243 * It's a left split. Since we know
3244 * that the rotate code gave us an
3245 * empty extent in the left path, we
3246 * can just do the insert there.
3247 */
3248 insert_el = left_el;
3249 } else {
3250 /*
3251 * Right split - we have to move the
3252 * existing record over to the left
3253 * leaf. The insert will be into the
3254 * newly created empty extent in the
3255 * right leaf.
3256 */
3257 tmprec = &right_el->l_recs[index];
3258 ocfs2_rotate_leaf(left_el, tmprec);
3259 el = left_el;
3260
3261 memset(tmprec, 0, sizeof(*tmprec));
3262 index = ocfs2_search_extent_list(left_el, cpos);
3263 BUG_ON(index == -1);
3264 }
3265 }
3266 } else {
3267 BUG_ON(!left_path);
3268 BUG_ON(!ocfs2_is_empty_extent(&left_el->l_recs[0]));
3269 /*
3270 * Left path is easy - we can just allow the insert to
3271 * happen.
3272 */
3273 el = left_el;
3274 insert_el = left_el;
3275 index = ocfs2_search_extent_list(el, cpos);
3276 BUG_ON(index == -1);
3277 }
3278
3279 rec = &el->l_recs[index];
3280 ocfs2_subtract_from_rec(inode->i_sb, split, rec, split_rec);
3281 ocfs2_rotate_leaf(insert_el, split_rec);
3282}
3283
1934 3284/*
1935 3285 * This function only does inserts on an allocation b-tree. For dinode
1936 3286 * lists, ocfs2_insert_at_leaf() is called directly.
@@ -1948,7 +3298,6 @@ static int ocfs2_insert_path(struct inode *inode,
1948 3298{
1949 3299 int ret, subtree_index;
1950 3300 struct buffer_head *leaf_bh = path_leaf_bh(right_path);
1951 struct ocfs2_extent_list *el;
1952 3301
1953 3302 /*
1954 3303 * Pass both paths to the journal. The majority of inserts
@@ -1984,9 +3333,18 @@ static int ocfs2_insert_path(struct inode *inode,
1984 3333 }
1985 3334 }
1986 3335
1987 el = path_leaf_el(right_path);
3336 if (insert->ins_split != SPLIT_NONE) {
3337 /*
3338 * We could call ocfs2_insert_at_leaf() for some types
3339 * of splits, but it's easier to just let one separate
3340 * function sort it all out.
3341 */
3342 ocfs2_split_record(inode, left_path, right_path,
3343 insert_rec, insert->ins_split);
3344 } else
3345 ocfs2_insert_at_leaf(insert_rec, path_leaf_el(right_path),
3346 insert, inode);
1988 3347
1989 ocfs2_insert_at_leaf(insert_rec, el, insert, inode);
1990 3348 ret = ocfs2_journal_dirty(handle, leaf_bh);
1991 3349 if (ret)
1992 3350 mlog_errno(ret);
@@ -2075,7 +3433,7 @@ static int ocfs2_do_insert_extent(struct inode *inode,
2075 3433 * can wind up skipping both of these two special cases...
2076 3434 */
2077 3435 if (rotate) {
2078 ret = ocfs2_rotate_tree_right(inode, handle,
3436 ret = ocfs2_rotate_tree_right(inode, handle, type->ins_split,
2079 3437 le32_to_cpu(insert_rec->e_cpos),
2080 3438 right_path, &left_path);
2081 3439 if (ret) {
@@ -2100,8 +3458,9 @@ static int ocfs2_do_insert_extent(struct inode *inode,
2100 3458 }
2101 3459
2102 3460out_update_clusters:
2103 ocfs2_update_dinode_clusters(inode, di,
2104 le16_to_cpu(insert_rec->e_leaf_clusters));
3461 if (type->ins_split == SPLIT_NONE)
3462 ocfs2_update_dinode_clusters(inode, di,
3463 le16_to_cpu(insert_rec->e_leaf_clusters));
2105 3464
2106 3465 ret = ocfs2_journal_dirty(handle, di_bh);
2107 3466 if (ret)
@@ -2114,6 +3473,44 @@ out:
2114 3473 return ret;
2115 3474}
2116 3475
3476static enum ocfs2_contig_type
3477ocfs2_figure_merge_contig_type(struct inode *inode,
3478 struct ocfs2_extent_list *el, int index,
3479 struct ocfs2_extent_rec *split_rec)
3480{
3481 struct ocfs2_extent_rec *rec;
3482 enum ocfs2_contig_type ret = CONTIG_NONE;
3483
3484 /*
3485 * We're careful to check for an empty extent record here -
3486 * the merge code will know what to do if it sees one.
3487 */
3488
3489 if (index > 0) {
3490 rec = &el->l_recs[index - 1];
3491 if (index == 1 && ocfs2_is_empty_extent(rec)) {
3492 if (split_rec->e_cpos == el->l_recs[index].e_cpos)
3493 ret = CONTIG_RIGHT;
3494 } else {
3495 ret = ocfs2_extent_contig(inode, rec, split_rec);
3496 }
3497 }
3498
3499 if (index < (le16_to_cpu(el->l_next_free_rec) - 1)) {
3500 enum ocfs2_contig_type contig_type;
3501
3502 rec = &el->l_recs[index + 1];
3503 contig_type = ocfs2_extent_contig(inode, rec, split_rec);
3504
3505 if (contig_type == CONTIG_LEFT && ret == CONTIG_RIGHT)
3506 ret = CONTIG_LEFTRIGHT;
3507 else if (ret == CONTIG_NONE)
3508 ret = contig_type;
3509 }
3510
3511 return ret;
3512}
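
[Editor's note: ignoring the empty-extent special case and the physical block-contiguity test that ocfs2_extent_contig() also performs, the classification reduces to a check against the logical neighbours. A rough standalone sketch, reusing struct rec from earlier.]

enum contig { NONE, LEFT, RIGHT, LEFTRIGHT };

/* 'prev'/'next' may be NULL at the leaf edges. RIGHT means the split
 * record continues the record on its left, and vice versa. */
static enum contig classify(const struct rec *prev, const struct rec *next,
			     const struct rec *split)
{
	int after_prev = prev && prev->cpos + prev->clusters == split->cpos;
	int before_next = next && split->cpos + split->clusters == next->cpos;

	if (after_prev && before_next)
		return LEFTRIGHT;
	if (after_prev)
		return RIGHT;
	if (before_next)
		return LEFT;
	return NONE;
}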
3513
2117 3514static void ocfs2_figure_contig_type(struct inode *inode,
2118 3515 struct ocfs2_insert_type *insert,
2119 3516 struct ocfs2_extent_list *el,
@@ -2205,6 +3602,8 @@ static int ocfs2_figure_insert_type(struct inode *inode,
2205 3602 struct ocfs2_path *path = NULL;
2206 3603 struct buffer_head *bh = NULL;
2207 3604
3605 insert->ins_split = SPLIT_NONE;
3606
2208 3607 el = &di->id2.i_list;
2209 3608 insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth);
2210 3609
@@ -2327,9 +3726,10 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
2327 3726 u32 cpos,
2328 3727 u64 start_blk,
2329 3728 u32 new_clusters,
3729 u8 flags,
2330 3730 struct ocfs2_alloc_context *meta_ac)
2331 3731{
2332 int status, shift;
3732 int status;
2333 3733 struct buffer_head *last_eb_bh = NULL;
2334 3734 struct buffer_head *bh = NULL;
2335 3735 struct ocfs2_insert_type insert = {0, };
@@ -2350,6 +3750,7 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
2350 3750 rec.e_cpos = cpu_to_le32(cpos);
2351 3751 rec.e_blkno = cpu_to_le64(start_blk);
2352 3752 rec.e_leaf_clusters = cpu_to_le16(new_clusters);
3753 rec.e_flags = flags;
2353 3754
2354 3755 status = ocfs2_figure_insert_type(inode, fe_bh, &last_eb_bh, &rec,
2355 3756 &insert);
@@ -2364,55 +3765,16 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
2364 3765 insert.ins_appending, insert.ins_contig, insert.ins_contig_index,
2365 3766 insert.ins_free_records, insert.ins_tree_depth);
2366 3767
2367 /*
2368 * Avoid growing the tree unless we're out of records and the
2369 * insert type requres one.
2370 */
2371 if (insert.ins_contig != CONTIG_NONE || insert.ins_free_records)
3768 if (insert.ins_contig == CONTIG_NONE && insert.ins_free_records == 0) {
3769 status = ocfs2_grow_tree(inode, handle, fe_bh,
3770 &insert.ins_tree_depth, &last_eb_bh,
3771 meta_ac);
3772 if (status) {
2372 goto out_add;
2373
2374 shift = ocfs2_find_branch_target(osb, inode, fe_bh, &bh);
2375 if (shift < 0) {
2376 status = shift;
2377 mlog_errno(status);
2378 goto bail;
2379 }
2380
2381 /* We traveled all the way to the bottom of the allocation tree
2382 * and didn't find room for any more extents - we need to add
2383 * another tree level */
2384 if (shift) {
2385 BUG_ON(bh);
2386 mlog(0, "need to shift tree depth "
2387 "(current = %d)\n", insert.ins_tree_depth);
2388
2389 /* ocfs2_shift_tree_depth will return us a buffer with
2390 * the new extent block (so we can pass that to
2391 * ocfs2_add_branch). */
2392 status = ocfs2_shift_tree_depth(osb, handle, inode, fe_bh,
2393 meta_ac, &bh);
2394 if (status < 0) {
2395 3773 mlog_errno(status);
2396 3774 goto bail;
2397 3775 }
2398 insert.ins_tree_depth++;
2399 /* Special case: we have room now if we shifted from
2400 * tree_depth 0 */
2401 if (insert.ins_tree_depth == 1)
2402 goto out_add;
2403 }
2404
2405 /* call ocfs2_add_branch to add the final part of the tree with
2406 * the new data. */
2407 mlog(0, "add branch. bh = %p\n", bh);
2408 status = ocfs2_add_branch(osb, handle, inode, fe_bh, bh, last_eb_bh,
2409 meta_ac);
2410 if (status < 0) {
2411 mlog_errno(status);
2412 goto bail;
2413 3776 }
2414 3777
2415out_add:
2416 3778 /* Finally, we can add clusters. This might rotate the tree for us. */
2417 3779 status = ocfs2_do_insert_extent(inode, handle, fe_bh, &rec, &insert);
2418 3780 if (status < 0)
@@ -2431,7 +3793,720 @@ bail:
2431 3793 return status;
2432 3794}
2433 3795
2434static inline int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb)
3796static void ocfs2_make_right_split_rec(struct super_block *sb,
3797 struct ocfs2_extent_rec *split_rec,
3798 u32 cpos,
3799 struct ocfs2_extent_rec *rec)
3800{
3801 u32 rec_cpos = le32_to_cpu(rec->e_cpos);
3802 u32 rec_range = rec_cpos + le16_to_cpu(rec->e_leaf_clusters);
3803
3804 memset(split_rec, 0, sizeof(struct ocfs2_extent_rec));
3805
3806 split_rec->e_cpos = cpu_to_le32(cpos);
3807 split_rec->e_leaf_clusters = cpu_to_le16(rec_range - cpos);
3808
3809 split_rec->e_blkno = rec->e_blkno;
3810 le64_add_cpu(&split_rec->e_blkno,
3811 ocfs2_clusters_to_blocks(sb, cpos - rec_cpos));
3812
3813 split_rec->e_flags = rec->e_flags;
3814}
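
[Editor's note: a worked example with hypothetical numbers - splitting rec = {cpos 100, 8 clusters, blkno B} at cpos 104 leaves the right-hand remainder:]

/*
 * split_rec.e_cpos          = 104
 * split_rec.e_leaf_clusters = (100 + 8) - 104 = 4
 * split_rec.e_blkno         = B + ocfs2_clusters_to_blocks(sb, 104 - 100)
 * split_rec.e_flags         = rec.e_flags
 */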
3815
3816static int ocfs2_split_and_insert(struct inode *inode,
3817 handle_t *handle,
3818 struct ocfs2_path *path,
3819 struct buffer_head *di_bh,
3820 struct buffer_head **last_eb_bh,
3821 int split_index,
3822 struct ocfs2_extent_rec *orig_split_rec,
3823 struct ocfs2_alloc_context *meta_ac)
3824{
3825 int ret = 0, depth;
3826 unsigned int insert_range, rec_range, do_leftright = 0;
3827 struct ocfs2_extent_rec tmprec;
3828 struct ocfs2_extent_list *rightmost_el;
3829 struct ocfs2_extent_rec rec;
3830 struct ocfs2_extent_rec split_rec = *orig_split_rec;
3831 struct ocfs2_insert_type insert;
3832 struct ocfs2_extent_block *eb;
3833 struct ocfs2_dinode *di;
3834
3835leftright:
3836 /*
3837 * Store a copy of the record on the stack - it might move
3838 * around as the tree is manipulated below.
3839 */
3840 rec = path_leaf_el(path)->l_recs[split_index];
3841
3842 di = (struct ocfs2_dinode *)di_bh->b_data;
3843 rightmost_el = &di->id2.i_list;
3844
3845 depth = le16_to_cpu(rightmost_el->l_tree_depth);
3846 if (depth) {
3847 BUG_ON(!(*last_eb_bh));
3848 eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
3849 rightmost_el = &eb->h_list;
3850 }
3851
3852 if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
3853 le16_to_cpu(rightmost_el->l_count)) {
3854 int old_depth = depth;
3855
3856 ret = ocfs2_grow_tree(inode, handle, di_bh, &depth, last_eb_bh,
3857 meta_ac);
3858 if (ret) {
3859 mlog_errno(ret);
3860 goto out;
3861 }
3862
3863 if (old_depth != depth) {
3864 eb = (struct ocfs2_extent_block *)(*last_eb_bh)->b_data;
3865 rightmost_el = &eb->h_list;
3866 }
3867 }
3868
3869 memset(&insert, 0, sizeof(struct ocfs2_insert_type));
3870 insert.ins_appending = APPEND_NONE;
3871 insert.ins_contig = CONTIG_NONE;
3872 insert.ins_free_records = le16_to_cpu(rightmost_el->l_count)
3873 - le16_to_cpu(rightmost_el->l_next_free_rec);
3874 insert.ins_tree_depth = depth;
3875
3876 insert_range = le32_to_cpu(split_rec.e_cpos) +
3877 le16_to_cpu(split_rec.e_leaf_clusters);
3878 rec_range = le32_to_cpu(rec.e_cpos) +
3879 le16_to_cpu(rec.e_leaf_clusters);
3880
3881 if (split_rec.e_cpos == rec.e_cpos) {
3882 insert.ins_split = SPLIT_LEFT;
3883 } else if (insert_range == rec_range) {
3884 insert.ins_split = SPLIT_RIGHT;
3885 } else {
3886 /*
3887 * Left/right split. We fake this as a right split
3888 * first and then make a second pass as a left split.
3889 */
3890 insert.ins_split = SPLIT_RIGHT;
3891
3892 ocfs2_make_right_split_rec(inode->i_sb, &tmprec, insert_range,
3893 &rec);
3894
3895 split_rec = tmprec;
3896
3897 BUG_ON(do_leftright);
3898 do_leftright = 1;
3899 }
3900
3901 ret = ocfs2_do_insert_extent(inode, handle, di_bh, &split_rec,
3902 &insert);
3903 if (ret) {
3904 mlog_errno(ret);
3905 goto out;
3906 }
3907
3908 if (do_leftright == 1) {
3909 u32 cpos;
3910 struct ocfs2_extent_list *el;
3911
3912 do_leftright++;
3913 split_rec = *orig_split_rec;
3914
3915 ocfs2_reinit_path(path, 1);
3916
3917 cpos = le32_to_cpu(split_rec.e_cpos);
3918 ret = ocfs2_find_path(inode, path, cpos);
3919 if (ret) {
3920 mlog_errno(ret);
3921 goto out;
3922 }
3923
3924 el = path_leaf_el(path);
3925 split_index = ocfs2_search_extent_list(el, cpos);
3926 goto leftright;
3927 }
3928out:
3929
3930 return ret;
3931}
3932
3933/*
3934 * Mark part or all of the extent record at split_index in the leaf
3935 * pointed to by path as written. This removes the unwritten
3936 * extent flag.
3937 *
3938 * Care is taken to handle contiguousness so as to not grow the tree.
3939 *
3940 * meta_ac is not strictly necessary - we only truly need it if growth
3941 * of the tree is required. All other cases will degrade into a less
3942 * optimal tree layout.
3943 *
3944 * last_eb_bh should be the rightmost leaf block for any inode with a
3945 * btree. Since a split may grow the tree or a merge might shrink it, the caller cannot trust the contents of that buffer after this call.
3946 *
3947 * This code is optimized for readability - several passes might be
3948 * made over certain portions of the tree. All of those blocks will
3949 * have been brought into cache (and pinned via the journal), so the
3950 * extra overhead is not expressed in terms of disk reads.
3951 */
3952static int __ocfs2_mark_extent_written(struct inode *inode,
3953 struct buffer_head *di_bh,
3954 handle_t *handle,
3955 struct ocfs2_path *path,
3956 int split_index,
3957 struct ocfs2_extent_rec *split_rec,
3958 struct ocfs2_alloc_context *meta_ac,
3959 struct ocfs2_cached_dealloc_ctxt *dealloc)
3960{
3961 int ret = 0;
3962 struct ocfs2_extent_list *el = path_leaf_el(path);
3963 struct buffer_head *eb_bh, *last_eb_bh = NULL;
3964 struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
3965 struct ocfs2_merge_ctxt ctxt;
3966 struct ocfs2_extent_list *rightmost_el;
3967
3968 if (!(rec->e_flags & OCFS2_EXT_UNWRITTEN)) {
3969 ret = -EIO;
3970 mlog_errno(ret);
3971 goto out;
3972 }
3973
3974 if (le32_to_cpu(rec->e_cpos) > le32_to_cpu(split_rec->e_cpos) ||
3975 ((le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)) <
3976 (le32_to_cpu(split_rec->e_cpos) + le16_to_cpu(split_rec->e_leaf_clusters)))) {
3977 ret = -EIO;
3978 mlog_errno(ret);
3979 goto out;
3980 }
3981
3982 eb_bh = path_leaf_bh(path);
3983 ret = ocfs2_journal_access(handle, inode, eb_bh,
3984 OCFS2_JOURNAL_ACCESS_WRITE);
3985 if (ret) {
3986 mlog_errno(ret);
3987 goto out;
3988 }
3989
3990 ctxt.c_contig_type = ocfs2_figure_merge_contig_type(inode, el,
3991 split_index,
3992 split_rec);
3993
3994 /*
3995 * The core merge / split code wants to know how much room is
3996 * left in this inode's allocation tree, so we pass the
3997 * rightmost extent list.
3998 */
3999 if (path->p_tree_depth) {
4000 struct ocfs2_extent_block *eb;
4001 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
4002
4003 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
4004 le64_to_cpu(di->i_last_eb_blk),
4005 &last_eb_bh, OCFS2_BH_CACHED, inode);
4006 if (ret) {
4007 mlog_errno(ret);
4008 goto out;
4009 }
4010
4011 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
4012 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
4013 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
4014 ret = -EROFS;
4015 goto out;
4016 }
4017
4018 rightmost_el = &eb->h_list;
4019 } else
4020 rightmost_el = path_root_el(path);
4021
4022 ctxt.c_used_tail_recs = le16_to_cpu(rightmost_el->l_next_free_rec);
4023 if (ctxt.c_used_tail_recs > 0 &&
4024 ocfs2_is_empty_extent(&rightmost_el->l_recs[0]))
4025 ctxt.c_used_tail_recs--;
4026
4027 if (rec->e_cpos == split_rec->e_cpos &&
4028 rec->e_leaf_clusters == split_rec->e_leaf_clusters)
4029 ctxt.c_split_covers_rec = 1;
4030 else
4031 ctxt.c_split_covers_rec = 0;
4032
4033 ctxt.c_has_empty_extent = ocfs2_is_empty_extent(&el->l_recs[0]);
4034
4035 mlog(0, "index: %d, contig: %u, used_tail_recs: %u, "
4036 "has_empty: %u, split_covers: %u\n", split_index,
4037 ctxt.c_contig_type, ctxt.c_used_tail_recs,
4038 ctxt.c_has_empty_extent, ctxt.c_split_covers_rec);
4039
4040 if (ctxt.c_contig_type == CONTIG_NONE) {
4041 if (ctxt.c_split_covers_rec)
4042 el->l_recs[split_index] = *split_rec;
4043 else
4044 ret = ocfs2_split_and_insert(inode, handle, path, di_bh,
4045 &last_eb_bh, split_index,
4046 split_rec, meta_ac);
4047 if (ret)
4048 mlog_errno(ret);
4049 } else {
4050 ret = ocfs2_try_to_merge_extent(inode, handle, path,
4051 split_index, split_rec,
4052 dealloc, &ctxt);
4053 if (ret)
4054 mlog_errno(ret);
4055 }
4056
4057 ocfs2_journal_dirty(handle, eb_bh);
4058
4059out:
4060 brelse(last_eb_bh);
4061 return ret;
4062}
4063
4064/*
4065 * Mark the already-existing extent at cpos as written for len clusters.
4066 *
4067 * If the existing extent is larger than the request, initiate a
4068 * split. An attempt will be made at merging with adjacent extents.
4069 *
4070 * The caller is responsible for passing down meta_ac if we'll need it.
4071 */
4072int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *di_bh,
4073 handle_t *handle, u32 cpos, u32 len, u32 phys,
4074 struct ocfs2_alloc_context *meta_ac,
4075 struct ocfs2_cached_dealloc_ctxt *dealloc)
4076{
4077 int ret, index;
4078 u64 start_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys);
4079 struct ocfs2_extent_rec split_rec;
4080 struct ocfs2_path *left_path = NULL;
4081 struct ocfs2_extent_list *el;
4082
4083 mlog(0, "Inode %lu cpos %u, len %u, phys %u (%llu)\n",
4084 inode->i_ino, cpos, len, phys, (unsigned long long)start_blkno);
4085
4086 if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) {
4087 ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents "
4088 "that are being written to, but the feature bit "
4089 "is not set in the super block.",
4090 (unsigned long long)OCFS2_I(inode)->ip_blkno);
4091 ret = -EROFS;
4092 goto out;
4093 }
4094
4095 /*
4096 * XXX: This should be fixed up so that we just re-insert the
4097 * next extent records.
4098 */
4099 ocfs2_extent_map_trunc(inode, 0);
4100
4101 left_path = ocfs2_new_inode_path(di_bh);
4102 if (!left_path) {
4103 ret = -ENOMEM;
4104 mlog_errno(ret);
4105 goto out;
4106 }
4107
4108 ret = ocfs2_find_path(inode, left_path, cpos);
4109 if (ret) {
4110 mlog_errno(ret);
4111 goto out;
4112 }
4113 el = path_leaf_el(left_path);
4114
4115 index = ocfs2_search_extent_list(el, cpos);
4116 if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
4117 ocfs2_error(inode->i_sb,
4118 "Inode %llu has an extent at cpos %u which can no "
4119 "longer be found.\n",
4120 (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos);
4121 ret = -EROFS;
4122 goto out;
4123 }
4124
4125 memset(&split_rec, 0, sizeof(struct ocfs2_extent_rec));
4126 split_rec.e_cpos = cpu_to_le32(cpos);
4127 split_rec.e_leaf_clusters = cpu_to_le16(len);
4128 split_rec.e_blkno = cpu_to_le64(start_blkno);
4129 split_rec.e_flags = path_leaf_el(left_path)->l_recs[index].e_flags;
4130 split_rec.e_flags &= ~OCFS2_EXT_UNWRITTEN;
4131
4132 ret = __ocfs2_mark_extent_written(inode, di_bh, handle, left_path,
4133 index, &split_rec, meta_ac, dealloc);
4134 if (ret)
4135 mlog_errno(ret);
4136
4137out:
4138 ocfs2_free_path(left_path);
4139 return ret;
4140}
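
[Editor's note: a sketch of the expected call pattern, assuming the caller has already taken the inode meta lock and computed 'credits' and 'meta_ac'; ocfs2_init_dealloc_ctxt() is the small initializer this patch pairs with ocfs2_run_deallocs().]

	struct ocfs2_cached_dealloc_ctxt dealloc;
	handle_t *handle;
	int ret;

	ocfs2_init_dealloc_ctxt(&dealloc);

	handle = ocfs2_start_trans(osb, credits);
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	/* Clear the unwritten flag on len clusters at cpos / phys. */
	ret = ocfs2_mark_extent_written(inode, di_bh, handle, cpos, len,
					phys, meta_ac, &dealloc);
	if (ret)
		mlog_errno(ret);

	ocfs2_commit_trans(osb, handle);

	/* Deferred suballocator frees happen outside the transaction. */
	ocfs2_run_deallocs(osb, &dealloc);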
4141
4142static int ocfs2_split_tree(struct inode *inode, struct buffer_head *di_bh,
4143 handle_t *handle, struct ocfs2_path *path,
4144 int index, u32 new_range,
4145 struct ocfs2_alloc_context *meta_ac)
4146{
4147 int ret, depth, credits = handle->h_buffer_credits;
4148 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
4149 struct buffer_head *last_eb_bh = NULL;
4150 struct ocfs2_extent_block *eb;
4151 struct ocfs2_extent_list *rightmost_el, *el;
4152 struct ocfs2_extent_rec split_rec;
4153 struct ocfs2_extent_rec *rec;
4154 struct ocfs2_insert_type insert;
4155
4156 /*
4157 * Setup the record to split before we grow the tree.
4158 */
4159 el = path_leaf_el(path);
4160 rec = &el->l_recs[index];
4161 ocfs2_make_right_split_rec(inode->i_sb, &split_rec, new_range, rec);
4162
4163 depth = path->p_tree_depth;
4164 if (depth > 0) {
4165 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
4166 le64_to_cpu(di->i_last_eb_blk),
4167 &last_eb_bh, OCFS2_BH_CACHED, inode);
4168 if (ret < 0) {
4169 mlog_errno(ret);
4170 goto out;
4171 }
4172
4173 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
4174 rightmost_el = &eb->h_list;
4175 } else
4176 rightmost_el = path_leaf_el(path);
4177
4178 credits += path->p_tree_depth + ocfs2_extend_meta_needed(di);
4179 ret = ocfs2_extend_trans(handle, credits);
4180 if (ret) {
4181 mlog_errno(ret);
4182 goto out;
4183 }
4184
4185 if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
4186 le16_to_cpu(rightmost_el->l_count)) {
4187 int old_depth = depth;
4188
4189 ret = ocfs2_grow_tree(inode, handle, di_bh, &depth, &last_eb_bh,
4190 meta_ac);
4191 if (ret) {
4192 mlog_errno(ret);
4193 goto out;
4194 }
4195
4196 if (old_depth != depth) {
4197 eb = (struct ocfs2_extent_block *)last_eb_bh->b_data;
4198 rightmost_el = &eb->h_list;
4199 }
4200 }
4201
4202 memset(&insert, 0, sizeof(struct ocfs2_insert_type));
4203 insert.ins_appending = APPEND_NONE;
4204 insert.ins_contig = CONTIG_NONE;
4205 insert.ins_split = SPLIT_RIGHT;
4206 insert.ins_free_records = le16_to_cpu(rightmost_el->l_count)
4207 - le16_to_cpu(rightmost_el->l_next_free_rec);
4208 insert.ins_tree_depth = depth;
4209
4210 ret = ocfs2_do_insert_extent(inode, handle, di_bh, &split_rec, &insert);
4211 if (ret)
4212 mlog_errno(ret);
4213
4214out:
4215 brelse(last_eb_bh);
4216 return ret;
4217}
4218
4219static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
4220 struct ocfs2_path *path, int index,
4221 struct ocfs2_cached_dealloc_ctxt *dealloc,
4222 u32 cpos, u32 len)
4223{
4224 int ret;
4225 u32 left_cpos, rec_range, trunc_range;
4226 int wants_rotate = 0, is_rightmost_tree_rec = 0;
4227 struct super_block *sb = inode->i_sb;
4228 struct ocfs2_path *left_path = NULL;
4229 struct ocfs2_extent_list *el = path_leaf_el(path);
4230 struct ocfs2_extent_rec *rec;
4231 struct ocfs2_extent_block *eb;
4232
4233 if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) {
4234 ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc);
4235 if (ret) {
4236 mlog_errno(ret);
4237 goto out;
4238 }
4239
4240 index--;
4241 }
4242
4243 if (index == (le16_to_cpu(el->l_next_free_rec) - 1) &&
4244 path->p_tree_depth) {
4245 /*
4246 * Check whether this is the rightmost tree record. If
4247 * we remove all of this record or part of its right
4248 * edge then an update of the record lengths above it
4249 * will be required.
4250 */
4251 eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
4252 if (eb->h_next_leaf_blk == 0)
4253 is_rightmost_tree_rec = 1;
4254 }
4255
4256 rec = &el->l_recs[index];
4257 if (index == 0 && path->p_tree_depth &&
4258 le32_to_cpu(rec->e_cpos) == cpos) {
4259 /*
4260 * Changing the leftmost offset (via partial or whole
4261 * record truncate) of an interior (or rightmost) path
4262 * means we have to update the subtree that is formed
4263 * by this leaf and the one to its left.
4264 *
4265 * There are two cases we can skip:
4266 * 1) Path is the leftmost one in our inode tree.
4267 * 2) The leaf is rightmost and will be empty after
4268 * we remove the extent record - the rotate code
4269 * knows how to update the newly formed edge.
4270 */
4271
4272 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path,
4273 &left_cpos);
4274 if (ret) {
4275 mlog_errno(ret);
4276 goto out;
4277 }
4278
4279 if (left_cpos && le16_to_cpu(el->l_next_free_rec) > 1) {
4280 left_path = ocfs2_new_path(path_root_bh(path),
4281 path_root_el(path));
4282 if (!left_path) {
4283 ret = -ENOMEM;
4284 mlog_errno(ret);
4285 goto out;
4286 }
4287
4288 ret = ocfs2_find_path(inode, left_path, left_cpos);
4289 if (ret) {
4290 mlog_errno(ret);
4291 goto out;
4292 }
4293 }
4294 }
4295
4296 ret = ocfs2_extend_rotate_transaction(handle, 0,
4297 handle->h_buffer_credits,
4298 path);
4299 if (ret) {
4300 mlog_errno(ret);
4301 goto out;
4302 }
4303
4304 ret = ocfs2_journal_access_path(inode, handle, path);
4305 if (ret) {
4306 mlog_errno(ret);
4307 goto out;
4308 }
4309
4310 ret = ocfs2_journal_access_path(inode, handle, left_path);
4311 if (ret) {
4312 mlog_errno(ret);
4313 goto out;
4314 }
4315
4316 rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
4317 trunc_range = cpos + len;
4318
4319 if (le32_to_cpu(rec->e_cpos) == cpos && rec_range == trunc_range) {
4320 int next_free;
4321
4322 memset(rec, 0, sizeof(*rec));
4323 ocfs2_cleanup_merge(el, index);
4324 wants_rotate = 1;
4325
4326 next_free = le16_to_cpu(el->l_next_free_rec);
4327 if (is_rightmost_tree_rec && next_free > 1) {
4328 /*
4329 * We skip the edge update if this path will
4330 * be deleted by the rotate code.
4331 */
4332 rec = &el->l_recs[next_free - 1];
4333 ocfs2_adjust_rightmost_records(inode, handle, path,
4334 rec);
4335 }
4336 } else if (le32_to_cpu(rec->e_cpos) == cpos) {
4337 /* Remove leftmost portion of the record. */
4338 le32_add_cpu(&rec->e_cpos, len);
4339 le64_add_cpu(&rec->e_blkno, ocfs2_clusters_to_blocks(sb, len));
4340 le16_add_cpu(&rec->e_leaf_clusters, -len);
4341 } else if (rec_range == trunc_range) {
4342 /* Remove rightmost portion of the record */
4343 le16_add_cpu(&rec->e_leaf_clusters, -len);
4344 if (is_rightmost_tree_rec)
4345 ocfs2_adjust_rightmost_records(inode, handle, path, rec);
4346 } else {
4347 /* Caller should have trapped this. */
4348 mlog(ML_ERROR, "Inode %llu: Invalid record truncate: (%u, %u) "
4349 "(%u, %u)\n", (unsigned long long)OCFS2_I(inode)->ip_blkno,
4350 le32_to_cpu(rec->e_cpos),
4351 le16_to_cpu(rec->e_leaf_clusters), cpos, len);
4352 BUG();
4353 }
4354
4355 if (left_path) {
4356 int subtree_index;
4357
4358 subtree_index = ocfs2_find_subtree_root(inode, left_path, path);
4359 ocfs2_complete_edge_insert(inode, handle, left_path, path,
4360 subtree_index);
4361 }
4362
4363 ocfs2_journal_dirty(handle, path_leaf_bh(path));
4364
4365 ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc);
4366 if (ret) {
4367 mlog_errno(ret);
4368 goto out;
4369 }
4370
4371out:
4372 ocfs2_free_path(left_path);
4373 return ret;
4374}
4375
4376int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh,
4377 u32 cpos, u32 len, handle_t *handle,
4378 struct ocfs2_alloc_context *meta_ac,
4379 struct ocfs2_cached_dealloc_ctxt *dealloc)
4380{
4381 int ret, index;
4382 u32 rec_range, trunc_range;
4383 struct ocfs2_extent_rec *rec;
4384 struct ocfs2_extent_list *el;
4385 struct ocfs2_path *path;
4386
4387 ocfs2_extent_map_trunc(inode, 0);
4388
4389 path = ocfs2_new_inode_path(di_bh);
4390 if (!path) {
4391 ret = -ENOMEM;
4392 mlog_errno(ret);
4393 goto out;
4394 }
4395
4396 ret = ocfs2_find_path(inode, path, cpos);
4397 if (ret) {
4398 mlog_errno(ret);
4399 goto out;
4400 }
4401
4402 el = path_leaf_el(path);
4403 index = ocfs2_search_extent_list(el, cpos);
4404 if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
4405 ocfs2_error(inode->i_sb,
4406 "Inode %llu has an extent at cpos %u which can no "
4407 "longer be found.\n",
4408 (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos);
4409 ret = -EROFS;
4410 goto out;
4411 }
4412
4413 /*
4414 * We have 3 cases of extent removal:
4415 * 1) Range covers the entire extent rec
4416 * 2) Range begins or ends on one edge of the extent rec
4417 * 3) Range is in the middle of the extent rec (no shared edges)
4418 *
4419 * For case 1 we remove the extent rec and left rotate to
4420 * fill the hole.
4421 *
4422 * For case 2 we just shrink the existing extent rec, with a
4423 * tree update if the shrinking edge is also the edge of an
4424 * extent block.
4425 *
4426 * For case 3 we do a right split to turn the extent rec into
4427 * something case 2 can handle.
4428 */
4429 rec = &el->l_recs[index];
4430 rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
4431 trunc_range = cpos + len;
4432
4433 BUG_ON(cpos < le32_to_cpu(rec->e_cpos) || trunc_range > rec_range);
4434
4435 mlog(0, "Inode %llu, remove (cpos %u, len %u). Existing index %d "
4436 "(cpos %u, len %u)\n",
4437 (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos, len, index,
4438 le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec));
4439
4440 if (le32_to_cpu(rec->e_cpos) == cpos || rec_range == trunc_range) {
4441 ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc,
4442 cpos, len);
4443 if (ret) {
4444 mlog_errno(ret);
4445 goto out;
4446 }
4447 } else {
4448 ret = ocfs2_split_tree(inode, di_bh, handle, path, index,
4449 trunc_range, meta_ac);
4450 if (ret) {
4451 mlog_errno(ret);
4452 goto out;
4453 }
4454
4455 /*
4456 * The split could have manipulated the tree enough to
4457 * move the record location, so we have to look for it again.
4458 */
4459 ocfs2_reinit_path(path, 1);
4460
4461 ret = ocfs2_find_path(inode, path, cpos);
4462 if (ret) {
4463 mlog_errno(ret);
4464 goto out;
4465 }
4466
4467 el = path_leaf_el(path);
4468 index = ocfs2_search_extent_list(el, cpos);
4469 if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
4470 ocfs2_error(inode->i_sb,
4471 "Inode %llu: split at cpos %u lost record.",
4472 (unsigned long long)OCFS2_I(inode)->ip_blkno,
4473 cpos);
4474 ret = -EROFS;
4475 goto out;
4476 }
4477
4478 /*
4479 * Double check our values here. If anything is fishy,
4480 * it's easier to catch it at the top level.
4481 */
4482 rec = &el->l_recs[index];
4483 rec_range = le32_to_cpu(rec->e_cpos) +
4484 ocfs2_rec_clusters(el, rec);
4485 if (rec_range != trunc_range) {
4486 ocfs2_error(inode->i_sb,
4487 "Inode %llu: error after split at cpos %u"
4488 "trunc len %u, existing record is (%u,%u)",
4489 (unsigned long long)OCFS2_I(inode)->ip_blkno,
4490 cpos, len, le32_to_cpu(rec->e_cpos),
4491 ocfs2_rec_clusters(el, rec));
4492 ret = -EROFS;
4493 goto out;
4494 }
4495
4496 ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc,
4497 cpos, len);
4498 if (ret) {
4499 mlog_errno(ret);
4500 goto out;
4501 }
4502 }
4503
4504out:
4505 ocfs2_free_path(path);
4506 return ret;
4507}
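
[Editor's note: the three-case comment above boils down to comparing rec_range and trunc_range; a small sketch of the classification, mirroring the test in the function.]

/* rec covers [cpos, cpos + clusters); the request removes
 * [r_cpos, r_cpos + r_len), which must lie inside the record. */
static const char *removal_case(unsigned int cpos, unsigned int clusters,
				unsigned int r_cpos, unsigned int r_len)
{
	unsigned int rec_range = cpos + clusters;
	unsigned int trunc_range = r_cpos + r_len;

	if (r_cpos == cpos && trunc_range == rec_range)
		return "case 1: whole record - remove and rotate left";
	if (r_cpos == cpos || trunc_range == rec_range)
		return "case 2: shared edge - shrink the record";
	return "case 3: interior - right split, then case 2";
}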
4508
4509int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb)
2435 4510{
2436 4511 struct buffer_head *tl_bh = osb->osb_tl_bh;
2437 4512 struct ocfs2_dinode *di;
@@ -2464,10 +4539,10 @@ static int ocfs2_truncate_log_can_coalesce(struct ocfs2_truncate_log *tl,
2464 4539 return current_tail == new_start;
2465 4540}
2466 4541
2467static int ocfs2_truncate_log_append(struct ocfs2_super *osb,
4542int ocfs2_truncate_log_append(struct ocfs2_super *osb,
2468 4543 handle_t *handle,
2469 4544 u64 start_blk,
2470 4545 unsigned int num_clusters)
2471 4546{
2472 4547 int status, index;
2473 4548 unsigned int start_cluster, tl_count;
@@ -2623,7 +4698,7 @@ bail:
2623 4698}
2624 4699
2625 4700/* Expects you to already be holding tl_inode->i_mutex */
2626static int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
4701int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
2627 4702{
2628 4703 int status;
2629 4704 unsigned int num_to_flush;
@@ -2957,6 +5032,219 @@ int ocfs2_truncate_log_init(struct ocfs2_super *osb)
2957 5032 return status;
2958 5033}
2959 5034
5035/*
5036 * Delayed de-allocation of suballocator blocks.
5037 *
5038 * Some sets of block de-allocations might involve multiple suballocator inodes.
5039 *
5040 * The locking for this can get extremely complicated, especially when
5041 * the suballocator inodes to delete from aren't known until deep
5042 * within an unrelated codepath.
5043 *
5044 * ocfs2_extent_block structures are a good example of this - an inode
5045 * btree could have been grown by any number of nodes each allocating
5046 * out of their own suballoc inode.
5047 *
5048 * These structures allow the delay of block de-allocation until a
5049 * later time, when locking of multiple cluster inodes won't cause
5050 * deadlock.
5051 */
5052
5053/*
5054 * Describes a single block free from a suballocator
5055 */
5056struct ocfs2_cached_block_free {
5057 struct ocfs2_cached_block_free *free_next;
5058 u64 free_blk;
5059 unsigned int free_bit;
5060};
5061
5062struct ocfs2_per_slot_free_list {
5063 struct ocfs2_per_slot_free_list *f_next_suballocator;
5064 int f_inode_type;
5065 int f_slot;
5066 struct ocfs2_cached_block_free *f_first;
5067};
5068
5069static int ocfs2_free_cached_items(struct ocfs2_super *osb,
5070 int sysfile_type,
5071 int slot,
5072 struct ocfs2_cached_block_free *head)
5073{
5074 int ret;
5075 u64 bg_blkno;
5076 handle_t *handle;
5077 struct inode *inode;
5078 struct buffer_head *di_bh = NULL;
5079 struct ocfs2_cached_block_free *tmp;
5080
5081 inode = ocfs2_get_system_file_inode(osb, sysfile_type, slot);
5082 if (!inode) {
5083 ret = -EINVAL;
5084 mlog_errno(ret);
5085 goto out;
5086 }
5087
5088 mutex_lock(&inode->i_mutex);
5089
5090 ret = ocfs2_meta_lock(inode, &di_bh, 1);
5091 if (ret) {
5092 mlog_errno(ret);
5093 goto out_mutex;
5094 }
5095
5096 handle = ocfs2_start_trans(osb, OCFS2_SUBALLOC_FREE);
5097 if (IS_ERR(handle)) {
5098 ret = PTR_ERR(handle);
5099 mlog_errno(ret);
5100 goto out_unlock;
5101 }
5102
5103 while (head) {
5104 bg_blkno = ocfs2_which_suballoc_group(head->free_blk,
5105 head->free_bit);
5106 mlog(0, "Free bit: (bit %u, blkno %llu)\n",
5107 head->free_bit, (unsigned long long)head->free_blk);
5108
5109 ret = ocfs2_free_suballoc_bits(handle, inode, di_bh,
5110 head->free_bit, bg_blkno, 1);
5111 if (ret) {
5112 mlog_errno(ret);
5113 goto out_journal;
5114 }
5115
5116 ret = ocfs2_extend_trans(handle, OCFS2_SUBALLOC_FREE);
5117 if (ret) {
5118 mlog_errno(ret);
5119 goto out_journal;
5120 }
5121
5122 tmp = head;
5123 head = head->free_next;
5124 kfree(tmp);
5125 }
5126
5127out_journal:
5128 ocfs2_commit_trans(osb, handle);
5129
5130out_unlock:
5131 ocfs2_meta_unlock(inode, 1);
5132 brelse(di_bh);
5133out_mutex:
5134 mutex_unlock(&inode->i_mutex);
5135 iput(inode);
5136out:
5137 while (head) {
5138 /* Premature exit may have left some dangling items. */
5139 tmp = head;
5140 head = head->free_next;
5141 kfree(tmp);
5142 }
5143
5144 return ret;
5145}
5146
5147int ocfs2_run_deallocs(struct ocfs2_super *osb,
5148 struct ocfs2_cached_dealloc_ctxt *ctxt)
5149{
5150 int ret = 0, ret2;
5151 struct ocfs2_per_slot_free_list *fl;
5152
5153 if (!ctxt)
5154 return 0;
5155
5156 while (ctxt->c_first_suballocator) {
5157 fl = ctxt->c_first_suballocator;
5158
5159 if (fl->f_first) {
5160 mlog(0, "Free items: (type %u, slot %d)\n",
5161 fl->f_inode_type, fl->f_slot);
5162 ret2 = ocfs2_free_cached_items(osb, fl->f_inode_type,
5163 fl->f_slot, fl->f_first);
5164 if (ret2)
5165 mlog_errno(ret2);
5166 if (!ret)
5167 ret = ret2;
5168 }
5169
5170 ctxt->c_first_suballocator = fl->f_next_suballocator;
5171 kfree(fl);
5172 }
5173
5174 return ret;
5175}
5176
5177static struct ocfs2_per_slot_free_list *
5178ocfs2_find_per_slot_free_list(int type,
5179 int slot,
5180 struct ocfs2_cached_dealloc_ctxt *ctxt)
5181{
5182 struct ocfs2_per_slot_free_list *fl = ctxt->c_first_suballocator;
5183
5184 while (fl) {
5185 if (fl->f_inode_type == type && fl->f_slot == slot)
5186 return fl;
5187
5188 fl = fl->f_next_suballocator;
5189 }
5190
5191 fl = kmalloc(sizeof(*fl), GFP_NOFS);
5192 if (fl) {
5193 fl->f_inode_type = type;
5194 fl->f_slot = slot;
5195 fl->f_first = NULL;
5196 fl->f_next_suballocator = ctxt->c_first_suballocator;
5197
5198 ctxt->c_first_suballocator = fl;
5199 }
5200 return fl;
5201}
5202
5203static int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
5204 int type, int slot, u64 blkno,
5205 unsigned int bit)
5206{
5207 int ret;
5208 struct ocfs2_per_slot_free_list *fl;
5209 struct ocfs2_cached_block_free *item;
5210
5211 fl = ocfs2_find_per_slot_free_list(type, slot, ctxt);
5212 if (fl == NULL) {
5213 ret = -ENOMEM;
5214 mlog_errno(ret);
5215 goto out;
5216 }
5217
5218 item = kmalloc(sizeof(*item), GFP_NOFS);
5219 if (item == NULL) {
5220 ret = -ENOMEM;
5221 mlog_errno(ret);
5222 goto out;
5223 }
5224
5225 mlog(0, "Insert: (type %d, slot %u, bit %u, blk %llu)\n",
5226 type, slot, bit, (unsigned long long)blkno);
5227
5228 item->free_blk = blkno;
5229 item->free_bit = bit;
5230 item->free_next = fl->f_first;
5231
5232 fl->f_first = item;
5233
5234 ret = 0;
5235out:
5236 return ret;
5237}
5238
5239static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
5240 struct ocfs2_extent_block *eb)
5241{
5242 return ocfs2_cache_block_dealloc(ctxt, EXTENT_ALLOC_SYSTEM_INODE,
5243 le16_to_cpu(eb->h_suballoc_slot),
5244 le64_to_cpu(eb->h_blkno),
5245 le16_to_cpu(eb->h_suballoc_bit));
5246}
5247
2960 5248/* This function will figure out whether the currently last extent
2961 5249 * block will be deleted, and if it will, what the new last extent
2962 5250 * block will be so we can update his h_next_leaf_blk field, as well
@@ -3238,27 +5526,10 @@ delete:
3238 5526 BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos));
3239 5527 BUG_ON(le64_to_cpu(el->l_recs[0].e_blkno));
3240 5528
3241 if (le16_to_cpu(eb->h_suballoc_slot) == 0) {
3242 /*
3243 * This code only understands how to
3244 * lock the suballocator in slot 0,
3245 * which is fine because allocation is
3246 * only ever done out of that
3247 * suballocator too. A future version
3248 * might change that however, so avoid
3249 * a free if we don't know how to
3250 * handle it. This way an fs incompat
3251 * bit will not be necessary.
3252 */
3253 ret = ocfs2_free_extent_block(handle,
3254 tc->tc_ext_alloc_inode,
3255 tc->tc_ext_alloc_bh,
3256 eb);
3257
3258 /* An error here is not fatal. */
3259 if (ret < 0)
3260 mlog_errno(ret);
3261 }
5529 ret = ocfs2_cache_extent_block_free(&tc->tc_dealloc, eb);
5530 /* An error here is not fatal. */
5531 if (ret < 0)
5532 mlog_errno(ret);
3262 5533 } else {
3263 5534 deleted_eb = 0;
3264 5535 }
@@ -3397,9 +5668,9 @@ static int ocfs2_ordered_zero_func(handle_t *handle, struct buffer_head *bh)
3397 5668 return ocfs2_journal_dirty_data(handle, bh);
3398 5669}
3399 5670
3400static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t isize,
3401 struct page **pages, int numpages,
3402 u64 phys, handle_t *handle)
5671static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t start,
5672 loff_t end, struct page **pages,
5673 int numpages, u64 phys, handle_t *handle)
3403 5674{
3404 5675 int i, ret, partial = 0;
3405 5676 void *kaddr;
@@ -3412,26 +5683,14 @@ static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t isize,
3412 5683 if (numpages == 0)
3413 5684 goto out;
3414 5685
3415 from = isize & (PAGE_CACHE_SIZE - 1); /* 1st page offset */
5686 to = PAGE_CACHE_SIZE;
3416 if (PAGE_CACHE_SHIFT > OCFS2_SB(sb)->s_clustersize_bits) {
3417 /*
3418 * Since 'from' has been capped to a value below page
3419 * size, this calculation won't be able to overflow
3420 * 'to'
3421 */
3422 to = ocfs2_align_bytes_to_clusters(sb, from);
3423
3424 /*
3425 * The truncate tail in this case should never contain
3426 * more than one page at maximum. The loop below also
3427 * assumes this.
3428 */
3429 BUG_ON(numpages != 1);
3430 }
3431
3432 for(i = 0; i < numpages; i++) { 5687 for(i = 0; i < numpages; i++) {
3433 page = pages[i]; 5688 page = pages[i];
3434 5689
5690 from = start & (PAGE_CACHE_SIZE - 1);
5691 if ((end >> PAGE_CACHE_SHIFT) == page->index)
5692 to = end & (PAGE_CACHE_SIZE - 1);
5693
3435 5694 BUG_ON(from > PAGE_CACHE_SIZE);
3436 5695 BUG_ON(to > PAGE_CACHE_SIZE);
3437 5696
@@ -3468,10 +5727,7 @@ static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t isize,
3468 5727
3469 5728 flush_dcache_page(page);
3470 5729
3471 /*
5730 start = (page->index + 1) << PAGE_CACHE_SHIFT;
3472 * Every page after the 1st one should be completely zero'd.
3473 */
3474 from = 0;
3475 5731 }
3476 5732out:
3477 5733 if (pages) {
@@ -3484,24 +5740,26 @@ out:
3484 5740 }
3485 5741}
3486 5742
3487static int ocfs2_grab_eof_pages(struct inode *inode, loff_t isize, struct page **pages,
3488 int *num, u64 *phys)
5743static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end,
5744 struct page **pages, int *num, u64 *phys)
3489 5745{
3490 5746 int i, numpages = 0, ret = 0;
3491 unsigned int csize = OCFS2_SB(inode->i_sb)->s_clustersize;
3492 5747 unsigned int ext_flags;
3493 5748 struct super_block *sb = inode->i_sb;
3494 5749 struct address_space *mapping = inode->i_mapping;
3495 5750 unsigned long index;
3496 u64 next_cluster_bytes;
5751 loff_t last_page_bytes;
3497 5752
3498 BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb))); 5753 BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb)));
5754 BUG_ON(start > end);
3499 5755
3500 /* Cluster boundary, so we don't need to grab any pages. */ 5756 if (start == end)
3501 if ((isize & (csize - 1)) == 0)
3502 goto out; 5757 goto out;
3503 5758
3504 ret = ocfs2_extent_map_get_blocks(inode, isize >> sb->s_blocksize_bits, 5759 BUG_ON(start >> OCFS2_SB(sb)->s_clustersize_bits !=
5760 (end - 1) >> OCFS2_SB(sb)->s_clustersize_bits);
5761
5762 ret = ocfs2_extent_map_get_blocks(inode, start >> sb->s_blocksize_bits,
3505 phys, NULL, &ext_flags); 5763 phys, NULL, &ext_flags);
3506 if (ret) { 5764 if (ret) {
3507 mlog_errno(ret); 5765 mlog_errno(ret);
@@ -3517,8 +5775,8 @@ static int ocfs2_grab_eof_pages(struct inode *inode, loff_t isize, struct page *
3517 if (ext_flags & OCFS2_EXT_UNWRITTEN) 5775 if (ext_flags & OCFS2_EXT_UNWRITTEN)
3518 goto out; 5776 goto out;
3519 5777
3520 next_cluster_bytes = ocfs2_align_bytes_to_clusters(inode->i_sb, isize); 5778 last_page_bytes = PAGE_ALIGN(end);
3521 index = isize >> PAGE_CACHE_SHIFT; 5779 index = start >> PAGE_CACHE_SHIFT;
3522 do { 5780 do {
3523 pages[numpages] = grab_cache_page(mapping, index); 5781 pages[numpages] = grab_cache_page(mapping, index);
3524 if (!pages[numpages]) { 5782 if (!pages[numpages]) {
@@ -3529,7 +5787,7 @@ static int ocfs2_grab_eof_pages(struct inode *inode, loff_t isize, struct page *
3529 5787
3530 numpages++; 5788 numpages++;
3531 index++; 5789 index++;
3532 } while (index < (next_cluster_bytes >> PAGE_CACHE_SHIFT)); 5790 } while (index < (last_page_bytes >> PAGE_CACHE_SHIFT));
3533 5791
3534out: 5792out:
3535 if (ret != 0) { 5793 if (ret != 0) {
@@ -3558,11 +5816,10 @@ out:
3558 * otherwise block_write_full_page() will skip writeout of pages past 5816 * otherwise block_write_full_page() will skip writeout of pages past
3559 * i_size. The new_i_size parameter is passed for this reason. 5817 * i_size. The new_i_size parameter is passed for this reason.
3560 */ 5818 */
3561int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle, 5819int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
3562 u64 new_i_size) 5820 u64 range_start, u64 range_end)
3563{ 5821{
3564 int ret, numpages; 5822 int ret, numpages;
3565 loff_t endbyte;
3566 struct page **pages = NULL; 5823 struct page **pages = NULL;
3567 u64 phys; 5824 u64 phys;
3568 5825
@@ -3581,7 +5838,8 @@ int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle,
3581 goto out; 5838 goto out;
3582 } 5839 }
3583 5840
3584 ret = ocfs2_grab_eof_pages(inode, new_i_size, pages, &numpages, &phys); 5841 ret = ocfs2_grab_eof_pages(inode, range_start, range_end, pages,
5842 &numpages, &phys);
3585 if (ret) { 5843 if (ret) {
3586 mlog_errno(ret); 5844 mlog_errno(ret);
3587 goto out; 5845 goto out;
@@ -3590,17 +5848,16 @@ int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle,
3590 if (numpages == 0) 5848 if (numpages == 0)
3591 goto out; 5849 goto out;
3592 5850
3593 ocfs2_zero_cluster_pages(inode, new_i_size, pages, numpages, phys, 5851 ocfs2_zero_cluster_pages(inode, range_start, range_end, pages,
3594 handle); 5852 numpages, phys, handle);
3595 5853
3596 /* 5854 /*
3597 * Initiate writeout of the pages we zero'd here. We don't 5855 * Initiate writeout of the pages we zero'd here. We don't
3598 * wait on them - the truncate_inode_pages() call later will 5856 * wait on them - the truncate_inode_pages() call later will
3599 * do that for us. 5857 * do that for us.
3600 */ 5858 */
3601 endbyte = ocfs2_align_bytes_to_clusters(inode->i_sb, new_i_size); 5859 ret = do_sync_mapping_range(inode->i_mapping, range_start,
3602 ret = do_sync_mapping_range(inode->i_mapping, new_i_size, 5860 range_end - 1, SYNC_FILE_RANGE_WRITE);
3603 endbyte - 1, SYNC_FILE_RANGE_WRITE);
3604 if (ret) 5861 if (ret)
3605 mlog_errno(ret); 5862 mlog_errno(ret);
3606 5863
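ocfs2_zero_range_for_truncate() now takes an explicit byte range instead of deriving one from new_i_size. A hedged sketch of a caller (the real caller lives in the truncate path outside this excerpt) zeroing from the new EOF to its cluster boundary:

	u64 start = new_i_size;
	u64 end = ocfs2_align_bytes_to_clusters(inode->i_sb, new_i_size);

	if (end > start) {
		ret = ocfs2_zero_range_for_truncate(inode, handle,
						    start, end);
		if (ret)
			mlog_errno(ret);
	}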
@@ -3631,8 +5888,6 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
3631 5888
3632 mlog_entry_void(); 5889 mlog_entry_void();
3633 5890
3634 down_write(&OCFS2_I(inode)->ip_alloc_sem);
3635
3636 new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb, 5891 new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb,
3637 i_size_read(inode)); 5892 i_size_read(inode));
3638 5893
@@ -3754,7 +6009,6 @@ start:
3754 goto start; 6009 goto start;
3755 6010
3756bail: 6011bail:
3757 up_write(&OCFS2_I(inode)->ip_alloc_sem);
3758 6012
3759 ocfs2_schedule_truncate_log_flush(osb, 1); 6013 ocfs2_schedule_truncate_log_flush(osb, 1);
3760 6014
@@ -3764,6 +6018,8 @@ bail:
3764 if (handle) 6018 if (handle)
3765 ocfs2_commit_trans(osb, handle); 6019 ocfs2_commit_trans(osb, handle);
3766 6020
6021 ocfs2_run_deallocs(osb, &tc->tc_dealloc);
6022
3767 ocfs2_free_path(path); 6023 ocfs2_free_path(path);
3768 6024
3769 /* This will drop the ext_alloc cluster lock for us */ 6025 /* This will drop the ext_alloc cluster lock for us */
@@ -3774,23 +6030,18 @@ bail:
3774} 6030}
3775 6031
3776/* 6032/*
3777 * Expects the inode to already be locked. This will figure out which 6033 * Expects the inode to already be locked.
3778 * inodes need to be locked and will put them on the returned truncate
3779 * context.
3780 */ 6034 */
3781int ocfs2_prepare_truncate(struct ocfs2_super *osb, 6035int ocfs2_prepare_truncate(struct ocfs2_super *osb,
3782 struct inode *inode, 6036 struct inode *inode,
3783 struct buffer_head *fe_bh, 6037 struct buffer_head *fe_bh,
3784 struct ocfs2_truncate_context **tc) 6038 struct ocfs2_truncate_context **tc)
3785{ 6039{
3786 int status, metadata_delete, i; 6040 int status;
3787 unsigned int new_i_clusters; 6041 unsigned int new_i_clusters;
3788 struct ocfs2_dinode *fe; 6042 struct ocfs2_dinode *fe;
3789 struct ocfs2_extent_block *eb; 6043 struct ocfs2_extent_block *eb;
3790 struct ocfs2_extent_list *el;
3791 struct buffer_head *last_eb_bh = NULL; 6044 struct buffer_head *last_eb_bh = NULL;
3792 struct inode *ext_alloc_inode = NULL;
3793 struct buffer_head *ext_alloc_bh = NULL;
3794 6045
3795 mlog_entry_void(); 6046 mlog_entry_void();
3796 6047
@@ -3810,12 +6061,9 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb,
3810 mlog_errno(status); 6061 mlog_errno(status);
3811 goto bail; 6062 goto bail;
3812 } 6063 }
6064 ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc);
3813 6065
3814 metadata_delete = 0;
3815 if (fe->id2.i_list.l_tree_depth) { 6066 if (fe->id2.i_list.l_tree_depth) {
3816 /* If we have a tree, then the truncate may result in
3817 * metadata deletes. Figure this out from the
3818 * rightmost leaf block.*/
3819 status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk), 6067 status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
3820 &last_eb_bh, OCFS2_BH_CACHED, inode); 6068 &last_eb_bh, OCFS2_BH_CACHED, inode);
3821 if (status < 0) { 6069 if (status < 0) {
@@ -3830,43 +6078,10 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb,
3830 status = -EIO; 6078 status = -EIO;
3831 goto bail; 6079 goto bail;
3832 } 6080 }
3833 el = &(eb->h_list);
3834
3835 i = 0;
3836 if (ocfs2_is_empty_extent(&el->l_recs[0]))
3837 i = 1;
3838 /*
3839 * XXX: Should we check that next_free_rec contains
3840 * the extent?
3841 */
3842 if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_i_clusters)
3843 metadata_delete = 1;
3844 } 6081 }
3845 6082
3846 (*tc)->tc_last_eb_bh = last_eb_bh; 6083 (*tc)->tc_last_eb_bh = last_eb_bh;
3847 6084
3848 if (metadata_delete) {
3849 mlog(0, "Will have to delete metadata for this trunc. "
3850 "locking allocator.\n");
3851 ext_alloc_inode = ocfs2_get_system_file_inode(osb, EXTENT_ALLOC_SYSTEM_INODE, 0);
3852 if (!ext_alloc_inode) {
3853 status = -ENOMEM;
3854 mlog_errno(status);
3855 goto bail;
3856 }
3857
3858 mutex_lock(&ext_alloc_inode->i_mutex);
3859 (*tc)->tc_ext_alloc_inode = ext_alloc_inode;
3860
3861 status = ocfs2_meta_lock(ext_alloc_inode, &ext_alloc_bh, 1);
3862 if (status < 0) {
3863 mlog_errno(status);
3864 goto bail;
3865 }
3866 (*tc)->tc_ext_alloc_bh = ext_alloc_bh;
3867 (*tc)->tc_ext_alloc_locked = 1;
3868 }
3869
3870 status = 0; 6085 status = 0;
3871bail: 6086bail:
3872 if (status < 0) { 6087 if (status < 0) {
@@ -3880,16 +6095,13 @@ bail:
3880 6095
3881static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc) 6096static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc)
3882{ 6097{
3883 if (tc->tc_ext_alloc_inode) { 6098 /*
3884 if (tc->tc_ext_alloc_locked) 6099 * The caller is responsible for completing deallocation
3885 ocfs2_meta_unlock(tc->tc_ext_alloc_inode, 1); 6100 * before freeing the context.
3886 6101 */
3887 mutex_unlock(&tc->tc_ext_alloc_inode->i_mutex); 6102 if (tc->tc_dealloc.c_first_suballocator != NULL)
3888 iput(tc->tc_ext_alloc_inode); 6103 mlog(ML_NOTICE,
3889 } 6104 "Truncate completion has non-empty dealloc context\n");
3890
3891 if (tc->tc_ext_alloc_bh)
3892 brelse(tc->tc_ext_alloc_bh);
3893 6105
3894 if (tc->tc_last_eb_bh) 6106 if (tc->tc_last_eb_bh)
3895 brelse(tc->tc_last_eb_bh); 6107 brelse(tc->tc_last_eb_bh);
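Taken together, the alloc.c hunks move all suballocator locking out of truncate setup and into ocfs2_run_deallocs(). A sketch of the resulting lifecycle (error handling elided; assumes the inode is already locked, as the comment above ocfs2_prepare_truncate() requires):

	struct ocfs2_truncate_context *tc = NULL;

	status = ocfs2_prepare_truncate(osb, inode, fe_bh, &tc);
	if (status < 0)
		goto bail;

	/* ... zero the new tail range, update i_size ... */

	status = ocfs2_commit_truncate(osb, inode, fe_bh, tc);
	/* ocfs2_commit_truncate() runs ocfs2_run_deallocs() on
	 * tc->tc_dealloc and frees the context itself. */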
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index fbcb5934a081..990df48ae8d3 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -34,7 +34,17 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
34 u32 cpos, 34 u32 cpos,
35 u64 start_blk, 35 u64 start_blk,
36 u32 new_clusters, 36 u32 new_clusters,
37 u8 flags,
37 struct ocfs2_alloc_context *meta_ac); 38 struct ocfs2_alloc_context *meta_ac);
39struct ocfs2_cached_dealloc_ctxt;
40int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *di_bh,
41 handle_t *handle, u32 cpos, u32 len, u32 phys,
42 struct ocfs2_alloc_context *meta_ac,
43 struct ocfs2_cached_dealloc_ctxt *dealloc);
44int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh,
45 u32 cpos, u32 len, handle_t *handle,
46 struct ocfs2_alloc_context *meta_ac,
47 struct ocfs2_cached_dealloc_ctxt *dealloc);
38int ocfs2_num_free_extents(struct ocfs2_super *osb, 48int ocfs2_num_free_extents(struct ocfs2_super *osb,
39 struct inode *inode, 49 struct inode *inode,
40 struct ocfs2_dinode *fe); 50 struct ocfs2_dinode *fe);
@@ -62,17 +72,41 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
62 struct ocfs2_dinode **tl_copy); 72 struct ocfs2_dinode **tl_copy);
63int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb, 73int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
64 struct ocfs2_dinode *tl_copy); 74 struct ocfs2_dinode *tl_copy);
75int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb);
76int ocfs2_truncate_log_append(struct ocfs2_super *osb,
77 handle_t *handle,
78 u64 start_blk,
79 unsigned int num_clusters);
80int __ocfs2_flush_truncate_log(struct ocfs2_super *osb);
81
82/*
 83 * Process-local structure which describes the block unlinks done
84 * during an operation. This is populated via
85 * ocfs2_cache_block_dealloc().
86 *
87 * ocfs2_run_deallocs() should be called after the potentially
88 * de-allocating routines. No journal handles should be open, and most
89 * locks should have been dropped.
90 */
91struct ocfs2_cached_dealloc_ctxt {
92 struct ocfs2_per_slot_free_list *c_first_suballocator;
93};
94static inline void ocfs2_init_dealloc_ctxt(struct ocfs2_cached_dealloc_ctxt *c)
95{
96 c->c_first_suballocator = NULL;
97}
98int ocfs2_run_deallocs(struct ocfs2_super *osb,
99 struct ocfs2_cached_dealloc_ctxt *ctxt);
65 100
66struct ocfs2_truncate_context { 101struct ocfs2_truncate_context {
67 struct inode *tc_ext_alloc_inode; 102 struct ocfs2_cached_dealloc_ctxt tc_dealloc;
68 struct buffer_head *tc_ext_alloc_bh;
69 int tc_ext_alloc_locked; /* is it cluster locked? */ 103 int tc_ext_alloc_locked; /* is it cluster locked? */
70 /* these get destroyed once it's passed to ocfs2_commit_truncate. */ 104 /* these get destroyed once it's passed to ocfs2_commit_truncate. */
71 struct buffer_head *tc_last_eb_bh; 105 struct buffer_head *tc_last_eb_bh;
72}; 106};
73 107
74int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle, 108int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
75 u64 new_i_size); 109 u64 range_start, u64 range_end);
76int ocfs2_prepare_truncate(struct ocfs2_super *osb, 110int ocfs2_prepare_truncate(struct ocfs2_super *osb,
77 struct inode *inode, 111 struct inode *inode,
78 struct buffer_head *fe_bh, 112 struct buffer_head *fe_bh,
@@ -84,6 +118,7 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
84 118
85int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el, 119int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el,
86 u32 cpos, struct buffer_head **leaf_bh); 120 u32 cpos, struct buffer_head **leaf_bh);
121int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster);
87 122
88/* 123/*
89 * Helper function to look at the # of clusters in an extent record. 124 * Helper function to look at the # of clusters in an extent record.
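The two declarations added above are exercised later in this patch; for reference, the call into ocfs2_mark_extent_written() as it appears in ocfs2_write_cluster() in aops.c:

	ret = ocfs2_mark_extent_written(inode, wc->w_di_bh,
					wc->w_handle, cpos, 1, phys,
					meta_ac, &wc->w_dealloc);
	if (ret < 0)
		mlog_errno(ret);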
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index a480b09c79b9..84bf6e79de23 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -684,6 +684,8 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
684 bh = bh->b_this_page, block_start += bsize) { 684 bh = bh->b_this_page, block_start += bsize) {
685 block_end = block_start + bsize; 685 block_end = block_start + bsize;
686 686
687 clear_buffer_new(bh);
688
687 /* 689 /*
688 * Ignore blocks outside of our i/o range - 690 * Ignore blocks outside of our i/o range -
689 * they may belong to unallocated clusters. 691 * they may belong to unallocated clusters.
@@ -698,9 +700,8 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
698 * For an allocating write with cluster size >= page 700 * For an allocating write with cluster size >= page
699 * size, we always write the entire page. 701 * size, we always write the entire page.
700 */ 702 */
701 703 if (new)
702 if (buffer_new(bh)) 704 set_buffer_new(bh);
703 clear_buffer_new(bh);
704 705
705 if (!buffer_mapped(bh)) { 706 if (!buffer_mapped(bh)) {
706 map_bh(bh, inode->i_sb, *p_blkno); 707 map_bh(bh, inode->i_sb, *p_blkno);
@@ -711,7 +712,8 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
711 if (!buffer_uptodate(bh)) 712 if (!buffer_uptodate(bh))
712 set_buffer_uptodate(bh); 713 set_buffer_uptodate(bh);
713 } else if (!buffer_uptodate(bh) && !buffer_delay(bh) && 714 } else if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
714 (block_start < from || block_end > to)) { 715 !buffer_new(bh) &&
716 (block_start < from || block_end > to)) {
715 ll_rw_block(READ, 1, &bh); 717 ll_rw_block(READ, 1, &bh);
716 *wait_bh++=bh; 718 *wait_bh++=bh;
717 } 719 }
@@ -738,18 +740,13 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
738 bh = head; 740 bh = head;
739 block_start = 0; 741 block_start = 0;
740 do { 742 do {
741 void *kaddr;
742
743 block_end = block_start + bsize; 743 block_end = block_start + bsize;
744 if (block_end <= from) 744 if (block_end <= from)
745 goto next_bh; 745 goto next_bh;
746 if (block_start >= to) 746 if (block_start >= to)
747 break; 747 break;
748 748
749 kaddr = kmap_atomic(page, KM_USER0); 749 zero_user_page(page, block_start, bh->b_size, KM_USER0);
750 memset(kaddr+block_start, 0, bh->b_size);
751 flush_dcache_page(page);
752 kunmap_atomic(kaddr, KM_USER0);
753 set_buffer_uptodate(bh); 750 set_buffer_uptodate(bh);
754 mark_buffer_dirty(bh); 751 mark_buffer_dirty(bh);
755 752
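The hunk above is a pure refactor: the open-coded map/zero/flush sequence collapses into the zero_user_page() helper, which performs the equivalent steps:

	/* before */
	kaddr = kmap_atomic(page, KM_USER0);
	memset(kaddr + block_start, 0, bh->b_size);
	flush_dcache_page(page);
	kunmap_atomic(kaddr, KM_USER0);

	/* after */
	zero_user_page(page, block_start, bh->b_size, KM_USER0);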
@@ -761,217 +758,240 @@ next_bh:
761 return ret; 758 return ret;
762} 759}
763 760
761#if (PAGE_CACHE_SIZE >= OCFS2_MAX_CLUSTERSIZE)
762#define OCFS2_MAX_CTXT_PAGES 1
763#else
764#define OCFS2_MAX_CTXT_PAGES (OCFS2_MAX_CLUSTERSIZE / PAGE_CACHE_SIZE)
765#endif
766
767#define OCFS2_MAX_CLUSTERS_PER_PAGE (PAGE_CACHE_SIZE / OCFS2_MIN_CLUSTERSIZE)
768
764/* 769/*
765 * This will copy user data from the buffer page in the splice 770 * Describe the state of a single cluster to be written to.
766 * context.
767 *
768 * For now, we ignore SPLICE_F_MOVE as that would require some extra
769 * communication out all the way to ocfs2_write().
770 */ 771 */
771int ocfs2_map_and_write_splice_data(struct inode *inode, 772struct ocfs2_write_cluster_desc {
772 struct ocfs2_write_ctxt *wc, u64 *p_blkno, 773 u32 c_cpos;
773 unsigned int *ret_from, unsigned int *ret_to) 774 u32 c_phys;
775 /*
776 * Give this a unique field because c_phys eventually gets
777 * filled.
778 */
779 unsigned c_new;
780 unsigned c_unwritten;
781};
782
783static inline int ocfs2_should_zero_cluster(struct ocfs2_write_cluster_desc *d)
774{ 784{
775 int ret; 785 return d->c_new || d->c_unwritten;
776 unsigned int to, from, cluster_start, cluster_end; 786}
777 char *src, *dst;
778 struct ocfs2_splice_write_priv *sp = wc->w_private;
779 struct pipe_buffer *buf = sp->s_buf;
780 unsigned long bytes, src_from;
781 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
782 787
783 ocfs2_figure_cluster_boundaries(osb, wc->w_cpos, &cluster_start, 788struct ocfs2_write_ctxt {
784 &cluster_end); 789 /* Logical cluster position / len of write */
790 u32 w_cpos;
791 u32 w_clen;
785 792
786 from = sp->s_offset; 793 struct ocfs2_write_cluster_desc w_desc[OCFS2_MAX_CLUSTERS_PER_PAGE];
787 src_from = sp->s_buf_offset;
788 bytes = wc->w_count;
789 794
790 if (wc->w_large_pages) { 795 /*
791 /* 796 * This is true if page_size > cluster_size.
792 * For cluster size < page size, we have to 797 *
793 * calculate pos within the cluster and obey 798 * It triggers a set of special cases during write which might
794 * the rightmost boundary. 799 * have to deal with allocating writes to partial pages.
795 */ 800 */
796 bytes = min(bytes, (unsigned long)(osb->s_clustersize 801 unsigned int w_large_pages;
797 - (wc->w_pos & (osb->s_clustersize - 1)))); 802
798 } 803 /*
799 to = from + bytes; 804 * Pages involved in this write.
805 *
806 * w_target_page is the page being written to by the user.
807 *
808 * w_pages is an array of pages which always contains
809 * w_target_page, and in the case of an allocating write with
810 * page_size < cluster size, it will contain zero'd and mapped
811 * pages adjacent to w_target_page which need to be written
 812 * out so that future reads from that region will get
 813 * zeros.
814 */
815 struct page *w_pages[OCFS2_MAX_CTXT_PAGES];
816 unsigned int w_num_pages;
817 struct page *w_target_page;
800 818
801 BUG_ON(from > PAGE_CACHE_SIZE); 819 /*
802 BUG_ON(to > PAGE_CACHE_SIZE); 820 * ocfs2_write_end() uses this to know what the real range to
803 BUG_ON(from < cluster_start); 821 * write in the target should be.
804 BUG_ON(to > cluster_end); 822 */
823 unsigned int w_target_from;
824 unsigned int w_target_to;
805 825
806 if (wc->w_this_page_new) 826 /*
807 ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode, 827 * We could use journal_current_handle() but this is cleaner,
808 cluster_start, cluster_end, 1); 828 * IMHO -Mark
809 else 829 */
810 ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode, 830 handle_t *w_handle;
811 from, to, 0); 831
812 if (ret) { 832 struct buffer_head *w_di_bh;
813 mlog_errno(ret); 833
814 goto out; 834 struct ocfs2_cached_dealloc_ctxt w_dealloc;
835};
836
837static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc)
838{
839 int i;
840
841 for(i = 0; i < wc->w_num_pages; i++) {
842 if (wc->w_pages[i] == NULL)
843 continue;
844
845 unlock_page(wc->w_pages[i]);
846 mark_page_accessed(wc->w_pages[i]);
847 page_cache_release(wc->w_pages[i]);
815 } 848 }
816 849
817 src = buf->ops->map(sp->s_pipe, buf, 1); 850 brelse(wc->w_di_bh);
818 dst = kmap_atomic(wc->w_this_page, KM_USER1); 851 kfree(wc);
819 memcpy(dst + from, src + src_from, bytes); 852}
820 kunmap_atomic(wc->w_this_page, KM_USER1); 853
821 buf->ops->unmap(sp->s_pipe, buf, src); 854static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp,
855 struct ocfs2_super *osb, loff_t pos,
856 unsigned len, struct buffer_head *di_bh)
857{
858 struct ocfs2_write_ctxt *wc;
859
860 wc = kzalloc(sizeof(struct ocfs2_write_ctxt), GFP_NOFS);
861 if (!wc)
862 return -ENOMEM;
822 863
823 wc->w_finished_copy = 1; 864 wc->w_cpos = pos >> osb->s_clustersize_bits;
865 wc->w_clen = ocfs2_clusters_for_bytes(osb->sb, len);
866 get_bh(di_bh);
867 wc->w_di_bh = di_bh;
824 868
825 *ret_from = from; 869 if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits))
826 *ret_to = to; 870 wc->w_large_pages = 1;
827out: 871 else
872 wc->w_large_pages = 0;
873
874 ocfs2_init_dealloc_ctxt(&wc->w_dealloc);
875
876 *wcp = wc;
828 877
829 return bytes ? (unsigned int)bytes : ret; 878 return 0;
830} 879}
831 880
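A sketch of the allocate/release pairing for the new write context (both helpers are defined above; note that ocfs2_free_write_ctxt() also unlocks and releases every page in w_pages and brelse's w_di_bh):

	struct ocfs2_write_ctxt *wc;

	ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh);
	if (ret)
		return ret;

	/* ... fill w_desc, grab pages, copy the user data ... */

	ocfs2_free_write_ctxt(wc);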
832/* 881/*
833 * This will copy user data from the iovec in the buffered write 882 * If a page has any new buffers, zero them out here, and mark them uptodate
834 * context. 883 * and dirty so they'll be written out (in order to prevent uninitialised
884 * block data from leaking). And clear the new bit.
835 */ 885 */
836int ocfs2_map_and_write_user_data(struct inode *inode, 886static void ocfs2_zero_new_buffers(struct page *page, unsigned from, unsigned to)
837 struct ocfs2_write_ctxt *wc, u64 *p_blkno,
838 unsigned int *ret_from, unsigned int *ret_to)
839{ 887{
840 int ret; 888 unsigned int block_start, block_end;
841 unsigned int to, from, cluster_start, cluster_end; 889 struct buffer_head *head, *bh;
842 unsigned long bytes, src_from;
843 char *dst;
844 struct ocfs2_buffered_write_priv *bp = wc->w_private;
845 const struct iovec *cur_iov = bp->b_cur_iov;
846 char __user *buf;
847 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
848 890
849 ocfs2_figure_cluster_boundaries(osb, wc->w_cpos, &cluster_start, 891 BUG_ON(!PageLocked(page));
850 &cluster_end); 892 if (!page_has_buffers(page))
893 return;
851 894
852 buf = cur_iov->iov_base + bp->b_cur_off; 895 bh = head = page_buffers(page);
853 src_from = (unsigned long)buf & ~PAGE_CACHE_MASK; 896 block_start = 0;
897 do {
898 block_end = block_start + bh->b_size;
854 899
855 from = wc->w_pos & (PAGE_CACHE_SIZE - 1); 900 if (buffer_new(bh)) {
901 if (block_end > from && block_start < to) {
902 if (!PageUptodate(page)) {
903 unsigned start, end;
856 904
857 /* 905 start = max(from, block_start);
858 * This is a lot of comparisons, but it reads quite 906 end = min(to, block_end);
859 * easily, which is important here.
860 */
861 /* Stay within the src page */
862 bytes = PAGE_SIZE - src_from;
863 /* Stay within the vector */
864 bytes = min(bytes,
865 (unsigned long)(cur_iov->iov_len - bp->b_cur_off));
866 /* Stay within count */
867 bytes = min(bytes, (unsigned long)wc->w_count);
868 /*
869 * For clustersize > page size, just stay within
870 * target page, otherwise we have to calculate pos
871 * within the cluster and obey the rightmost
872 * boundary.
873 */
874 if (wc->w_large_pages) {
875 /*
876 * For cluster size < page size, we have to
877 * calculate pos within the cluster and obey
878 * the rightmost boundary.
879 */
880 bytes = min(bytes, (unsigned long)(osb->s_clustersize
881 - (wc->w_pos & (osb->s_clustersize - 1))));
882 } else {
883 /*
884 * cluster size > page size is the most common
885 * case - we just stay within the target page
886 * boundary.
887 */
888 bytes = min(bytes, PAGE_CACHE_SIZE - from);
889 }
890 907
891 to = from + bytes; 908 zero_user_page(page, start, end - start, KM_USER0);
909 set_buffer_uptodate(bh);
910 }
892 911
893 BUG_ON(from > PAGE_CACHE_SIZE); 912 clear_buffer_new(bh);
894 BUG_ON(to > PAGE_CACHE_SIZE); 913 mark_buffer_dirty(bh);
895 BUG_ON(from < cluster_start); 914 }
896 BUG_ON(to > cluster_end); 915 }
897 916
898 if (wc->w_this_page_new) 917 block_start = block_end;
899 ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode, 918 bh = bh->b_this_page;
900 cluster_start, cluster_end, 1); 919 } while (bh != head);
901 else 920}
902 ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode,
903 from, to, 0);
904 if (ret) {
905 mlog_errno(ret);
906 goto out;
907 }
908 921
909 dst = kmap(wc->w_this_page); 922/*
 910 memcpy(dst + from, bp->b_src_buf + src_from, bytes); 923 * Only called when we have a failure during an allocating write, to
 911 kunmap(wc->w_this_page); 924 * write zeros to the newly allocated region.
925 */
926static void ocfs2_write_failure(struct inode *inode,
927 struct ocfs2_write_ctxt *wc,
928 loff_t user_pos, unsigned user_len)
929{
930 int i;
931 unsigned from, to;
932 struct page *tmppage;
912 933
913 /* 934 ocfs2_zero_new_buffers(wc->w_target_page, user_pos, user_len);
914 * XXX: This is slow, but simple. The caller of
915 * ocfs2_buffered_write_cluster() is responsible for
916 * passing through the iovecs, so it's difficult to
917 * predict what our next step is in here after our
918 * initial write. A future version should be pushing
919 * that iovec manipulation further down.
920 *
921 * By setting this, we indicate that a copy from user
922 * data was done, and subsequent calls for this
923 * cluster will skip copying more data.
924 */
925 wc->w_finished_copy = 1;
926 935
927 *ret_from = from; 936 if (wc->w_large_pages) {
928 *ret_to = to; 937 from = wc->w_target_from;
929out: 938 to = wc->w_target_to;
939 } else {
940 from = 0;
941 to = PAGE_CACHE_SIZE;
942 }
943
944 for(i = 0; i < wc->w_num_pages; i++) {
945 tmppage = wc->w_pages[i];
930 946
931 return bytes ? (unsigned int)bytes : ret; 947 if (ocfs2_should_order_data(inode))
948 walk_page_buffers(wc->w_handle, page_buffers(tmppage),
949 from, to, NULL,
950 ocfs2_journal_dirty_data);
951
952 block_commit_write(tmppage, from, to);
953 }
932} 954}
933 955
934/* 956static int ocfs2_prepare_page_for_write(struct inode *inode, u64 *p_blkno,
935 * Map, fill and write a page to disk. 957 struct ocfs2_write_ctxt *wc,
936 * 958 struct page *page, u32 cpos,
937 * The work of copying data is done via callback. Newly allocated 959 loff_t user_pos, unsigned user_len,
938 * pages which don't take user data will be zero'd (set 'new' to 960 int new)
939 * indicate an allocating write)
940 *
941 * Returns a negative error code or the number of bytes copied into
942 * the page.
943 */
944static int ocfs2_write_data_page(struct inode *inode, handle_t *handle,
945 u64 *p_blkno, struct page *page,
946 struct ocfs2_write_ctxt *wc, int new)
947{ 961{
948 int ret, copied = 0; 962 int ret;
949 unsigned int from = 0, to = 0; 963 unsigned int map_from = 0, map_to = 0;
950 unsigned int cluster_start, cluster_end; 964 unsigned int cluster_start, cluster_end;
951 unsigned int zero_from = 0, zero_to = 0; 965 unsigned int user_data_from = 0, user_data_to = 0;
952 966
953 ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), wc->w_cpos, 967 ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), cpos,
954 &cluster_start, &cluster_end); 968 &cluster_start, &cluster_end);
955 969
956 if ((wc->w_pos >> PAGE_CACHE_SHIFT) == page->index 970 if (page == wc->w_target_page) {
957 && !wc->w_finished_copy) { 971 map_from = user_pos & (PAGE_CACHE_SIZE - 1);
958 972 map_to = map_from + user_len;
959 wc->w_this_page = page; 973
960 wc->w_this_page_new = new; 974 if (new)
961 ret = wc->w_write_data_page(inode, wc, p_blkno, &from, &to); 975 ret = ocfs2_map_page_blocks(page, p_blkno, inode,
962 if (ret < 0) { 976 cluster_start, cluster_end,
977 new);
978 else
979 ret = ocfs2_map_page_blocks(page, p_blkno, inode,
980 map_from, map_to, new);
981 if (ret) {
963 mlog_errno(ret); 982 mlog_errno(ret);
964 goto out; 983 goto out;
965 } 984 }
966 985
967 copied = ret; 986 user_data_from = map_from;
968 987 user_data_to = map_to;
969 zero_from = from;
970 zero_to = to;
971 if (new) { 988 if (new) {
972 from = cluster_start; 989 map_from = cluster_start;
973 to = cluster_end; 990 map_to = cluster_end;
974 } 991 }
992
993 wc->w_target_from = map_from;
994 wc->w_target_to = map_to;
975 } else { 995 } else {
976 /* 996 /*
977 * If we haven't allocated the new page yet, we 997 * If we haven't allocated the new page yet, we
@@ -980,11 +1000,11 @@ static int ocfs2_write_data_page(struct inode *inode, handle_t *handle,
980 */ 1000 */
981 BUG_ON(!new); 1001 BUG_ON(!new);
982 1002
983 from = cluster_start; 1003 map_from = cluster_start;
984 to = cluster_end; 1004 map_to = cluster_end;
985 1005
986 ret = ocfs2_map_page_blocks(page, p_blkno, inode, 1006 ret = ocfs2_map_page_blocks(page, p_blkno, inode,
987 cluster_start, cluster_end, 1); 1007 cluster_start, cluster_end, new);
988 if (ret) { 1008 if (ret) {
989 mlog_errno(ret); 1009 mlog_errno(ret);
990 goto out; 1010 goto out;
@@ -1003,108 +1023,113 @@ static int ocfs2_write_data_page(struct inode *inode, handle_t *handle,
1003 */ 1023 */
1004 if (new && !PageUptodate(page)) 1024 if (new && !PageUptodate(page))
1005 ocfs2_clear_page_regions(page, OCFS2_SB(inode->i_sb), 1025 ocfs2_clear_page_regions(page, OCFS2_SB(inode->i_sb),
1006 wc->w_cpos, zero_from, zero_to); 1026 cpos, user_data_from, user_data_to);
1007 1027
1008 flush_dcache_page(page); 1028 flush_dcache_page(page);
1009 1029
1010 if (ocfs2_should_order_data(inode)) {
1011 ret = walk_page_buffers(handle,
1012 page_buffers(page),
1013 from, to, NULL,
1014 ocfs2_journal_dirty_data);
1015 if (ret < 0)
1016 mlog_errno(ret);
1017 }
1018
1019 /*
1020 * We don't use generic_commit_write() because we need to
1021 * handle our own i_size update.
1022 */
1023 ret = block_commit_write(page, from, to);
1024 if (ret)
1025 mlog_errno(ret);
1026out: 1030out:
1027 1031 return ret;
1028 return copied ? copied : ret;
1029} 1032}
1030 1033
1031/* 1034/*
 1032 * Do the actual write of some data into an inode. Optionally allocate 1035 * This function will only grab one cluster's worth of pages.
1033 * in order to fulfill the write.
1034 *
1035 * cpos is the logical cluster offset within the file to write at
1036 *
1037 * 'phys' is the physical mapping of that offset. a 'phys' value of
1038 * zero indicates that allocation is required. In this case, data_ac
1039 * and meta_ac should be valid (meta_ac can be null if metadata
1040 * allocation isn't required).
1041 */ 1036 */
1042static ssize_t ocfs2_write(struct file *file, u32 phys, handle_t *handle, 1037static int ocfs2_grab_pages_for_write(struct address_space *mapping,
1043 struct buffer_head *di_bh, 1038 struct ocfs2_write_ctxt *wc,
1044 struct ocfs2_alloc_context *data_ac, 1039 u32 cpos, loff_t user_pos, int new,
1045 struct ocfs2_alloc_context *meta_ac, 1040 struct page *mmap_page)
1046 struct ocfs2_write_ctxt *wc)
1047{ 1041{
1048 int ret, i, numpages = 1, new; 1042 int ret = 0, i;
1049 unsigned int copied = 0; 1043 unsigned long start, target_index, index;
1050 u32 tmp_pos;
1051 u64 v_blkno, p_blkno;
1052 struct address_space *mapping = file->f_mapping;
1053 struct inode *inode = mapping->host; 1044 struct inode *inode = mapping->host;
1054 unsigned long index, start;
1055 struct page **cpages;
1056 1045
1057 new = phys == 0 ? 1 : 0; 1046 target_index = user_pos >> PAGE_CACHE_SHIFT;
1058 1047
1059 /* 1048 /*
1060 * Figure out how many pages we'll be manipulating here. For 1049 * Figure out how many pages we'll be manipulating here. For
 1061 * non-allocating write, we just change the one 1050 * non-allocating write, we just change the one
 1062 * page. Otherwise, we'll need a whole cluster's worth. 1051 * page. Otherwise, we'll need a whole cluster's worth.
1063 */ 1052 */
1064 if (new)
1065 numpages = ocfs2_pages_per_cluster(inode->i_sb);
1066
1067 cpages = kzalloc(sizeof(*cpages) * numpages, GFP_NOFS);
1068 if (!cpages) {
1069 ret = -ENOMEM;
1070 mlog_errno(ret);
1071 return ret;
1072 }
1073
1074 /*
1075 * Fill our page array first. That way we've grabbed enough so
1076 * that we can zero and flush if we error after adding the
1077 * extent.
1078 */
1079 if (new) { 1053 if (new) {
1080 start = ocfs2_align_clusters_to_page_index(inode->i_sb, 1054 wc->w_num_pages = ocfs2_pages_per_cluster(inode->i_sb);
1081 wc->w_cpos); 1055 start = ocfs2_align_clusters_to_page_index(inode->i_sb, cpos);
1082 v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, wc->w_cpos);
1083 } else { 1056 } else {
1084 start = wc->w_pos >> PAGE_CACHE_SHIFT; 1057 wc->w_num_pages = 1;
1085 v_blkno = wc->w_pos >> inode->i_sb->s_blocksize_bits; 1058 start = target_index;
1086 } 1059 }
1087 1060
1088 for(i = 0; i < numpages; i++) { 1061 for(i = 0; i < wc->w_num_pages; i++) {
1089 index = start + i; 1062 index = start + i;
1090 1063
1091 cpages[i] = find_or_create_page(mapping, index, GFP_NOFS); 1064 if (index == target_index && mmap_page) {
1092 if (!cpages[i]) { 1065 /*
1093 ret = -ENOMEM; 1066 * ocfs2_pagemkwrite() is a little different
1094 mlog_errno(ret); 1067 * and wants us to directly use the page
1095 goto out; 1068 * passed in.
1069 */
1070 lock_page(mmap_page);
1071
1072 if (mmap_page->mapping != mapping) {
1073 unlock_page(mmap_page);
1074 /*
1075 * Sanity check - the locking in
1076 * ocfs2_pagemkwrite() should ensure
1077 * that this code doesn't trigger.
1078 */
1079 ret = -EINVAL;
1080 mlog_errno(ret);
1081 goto out;
1082 }
1083
1084 page_cache_get(mmap_page);
1085 wc->w_pages[i] = mmap_page;
1086 } else {
1087 wc->w_pages[i] = find_or_create_page(mapping, index,
1088 GFP_NOFS);
1089 if (!wc->w_pages[i]) {
1090 ret = -ENOMEM;
1091 mlog_errno(ret);
1092 goto out;
1093 }
1096 } 1094 }
1095
1096 if (index == target_index)
1097 wc->w_target_page = wc->w_pages[i];
1097 } 1098 }
1099out:
1100 return ret;
1101}
1102
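A worked example of the page math above, assuming 16K clusters and 4K pages, for an allocating write at user_pos == 20000:

	/*
	 *   target_index = 20000 >> 12      = 4
	 *   cpos         = 20000 >> 14      = 1
	 *   start        = (1 << 14) >> 12  = 4
	 *   w_num_pages  = 16384 / 4096     = 4    (pages 4..7)
	 *
	 * The target page is the first page grabbed; the next three
	 * complete the cluster so it can be zeroed and written out.
	 */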
1103/*
 1104 * Prepare a single cluster for writing into the file.
1105 */
1106static int ocfs2_write_cluster(struct address_space *mapping,
1107 u32 phys, unsigned int unwritten,
1108 struct ocfs2_alloc_context *data_ac,
1109 struct ocfs2_alloc_context *meta_ac,
1110 struct ocfs2_write_ctxt *wc, u32 cpos,
1111 loff_t user_pos, unsigned user_len)
1112{
1113 int ret, i, new, should_zero = 0;
1114 u64 v_blkno, p_blkno;
1115 struct inode *inode = mapping->host;
1116
1117 new = phys == 0 ? 1 : 0;
1118 if (new || unwritten)
1119 should_zero = 1;
1098 1120
1099 if (new) { 1121 if (new) {
1122 u32 tmp_pos;
1123
1100 /* 1124 /*
1101 * This is safe to call with the page locks - it won't take 1125 * This is safe to call with the page locks - it won't take
1102 * any additional semaphores or cluster locks. 1126 * any additional semaphores or cluster locks.
1103 */ 1127 */
1104 tmp_pos = wc->w_cpos; 1128 tmp_pos = cpos;
1105 ret = ocfs2_do_extend_allocation(OCFS2_SB(inode->i_sb), inode, 1129 ret = ocfs2_do_extend_allocation(OCFS2_SB(inode->i_sb), inode,
1106 &tmp_pos, 1, di_bh, handle, 1130 &tmp_pos, 1, 0, wc->w_di_bh,
1107 data_ac, meta_ac, NULL); 1131 wc->w_handle, data_ac,
1132 meta_ac, NULL);
1108 /* 1133 /*
1109 * This shouldn't happen because we must have already 1134 * This shouldn't happen because we must have already
1110 * calculated the correct meta data allocation required. The 1135 * calculated the correct meta data allocation required. The
@@ -1121,159 +1146,433 @@ static ssize_t ocfs2_write(struct file *file, u32 phys, handle_t *handle,
1121 mlog_errno(ret); 1146 mlog_errno(ret);
1122 goto out; 1147 goto out;
1123 } 1148 }
1149 } else if (unwritten) {
1150 ret = ocfs2_mark_extent_written(inode, wc->w_di_bh,
1151 wc->w_handle, cpos, 1, phys,
1152 meta_ac, &wc->w_dealloc);
1153 if (ret < 0) {
1154 mlog_errno(ret);
1155 goto out;
1156 }
1124 } 1157 }
1125 1158
1159 if (should_zero)
1160 v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, cpos);
1161 else
1162 v_blkno = user_pos >> inode->i_sb->s_blocksize_bits;
1163
1164 /*
1165 * The only reason this should fail is due to an inability to
1166 * find the extent added.
1167 */
1126 ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL, 1168 ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL,
1127 NULL); 1169 NULL);
1128 if (ret < 0) { 1170 if (ret < 0) {
 1129 1171 ocfs2_error(inode->i_sb, "Corrupt extent map for inode %llu, "
1130 /* 1172 "at logical block %llu",
1131 * XXX: Should we go readonly here? 1173 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1132 */ 1174 (unsigned long long)v_blkno);
1133
1134 mlog_errno(ret);
1135 goto out; 1175 goto out;
1136 } 1176 }
1137 1177
1138 BUG_ON(p_blkno == 0); 1178 BUG_ON(p_blkno == 0);
1139 1179
1140 for(i = 0; i < numpages; i++) { 1180 for(i = 0; i < wc->w_num_pages; i++) {
1141 ret = ocfs2_write_data_page(inode, handle, &p_blkno, cpages[i], 1181 int tmpret;
1142 wc, new); 1182
1143 if (ret < 0) { 1183 tmpret = ocfs2_prepare_page_for_write(inode, &p_blkno, wc,
1144 mlog_errno(ret); 1184 wc->w_pages[i], cpos,
1145 goto out; 1185 user_pos, user_len,
1186 should_zero);
1187 if (tmpret) {
1188 mlog_errno(tmpret);
1189 if (ret == 0)
 1190 ret = tmpret;
1146 } 1191 }
1147
1148 copied += ret;
1149 } 1192 }
1150 1193
1194 /*
1195 * We only have cleanup to do in case of allocating write.
1196 */
1197 if (ret && new)
1198 ocfs2_write_failure(inode, wc, user_pos, user_len);
1199
1151out: 1200out:
1152 for(i = 0; i < numpages; i++) { 1201
1153 unlock_page(cpages[i]); 1202 return ret;
1154 mark_page_accessed(cpages[i]); 1203}
1155 page_cache_release(cpages[i]); 1204
1205static int ocfs2_write_cluster_by_desc(struct address_space *mapping,
1206 struct ocfs2_alloc_context *data_ac,
1207 struct ocfs2_alloc_context *meta_ac,
1208 struct ocfs2_write_ctxt *wc,
1209 loff_t pos, unsigned len)
1210{
1211 int ret, i;
1212 struct ocfs2_write_cluster_desc *desc;
1213
1214 for (i = 0; i < wc->w_clen; i++) {
1215 desc = &wc->w_desc[i];
1216
1217 ret = ocfs2_write_cluster(mapping, desc->c_phys,
1218 desc->c_unwritten, data_ac, meta_ac,
1219 wc, desc->c_cpos, pos, len);
1220 if (ret) {
1221 mlog_errno(ret);
1222 goto out;
1223 }
1156 } 1224 }
1157 kfree(cpages);
1158 1225
1159 return copied ? copied : ret; 1226 ret = 0;
1227out:
1228 return ret;
1160} 1229}
1161 1230
1162static void ocfs2_write_ctxt_init(struct ocfs2_write_ctxt *wc, 1231/*
1163 struct ocfs2_super *osb, loff_t pos, 1232 * ocfs2_write_end() wants to know which parts of the target page it
1164 size_t count, ocfs2_page_writer *cb, 1233 * should complete the write on. It's easiest to compute them ahead of
1165 void *cb_priv) 1234 * time when a more complete view of the write is available.
1235 */
1236static void ocfs2_set_target_boundaries(struct ocfs2_super *osb,
1237 struct ocfs2_write_ctxt *wc,
1238 loff_t pos, unsigned len, int alloc)
1166{ 1239{
1167 wc->w_count = count; 1240 struct ocfs2_write_cluster_desc *desc;
1168 wc->w_pos = pos;
1169 wc->w_cpos = wc->w_pos >> osb->s_clustersize_bits;
1170 wc->w_finished_copy = 0;
1171 1241
1172 if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) 1242 wc->w_target_from = pos & (PAGE_CACHE_SIZE - 1);
1173 wc->w_large_pages = 1; 1243 wc->w_target_to = wc->w_target_from + len;
1174 else
1175 wc->w_large_pages = 0;
1176 1244
1177 wc->w_write_data_page = cb; 1245 if (alloc == 0)
1178 wc->w_private = cb_priv; 1246 return;
1247
1248 /*
1249 * Allocating write - we may have different boundaries based
1250 * on page size and cluster size.
1251 *
1252 * NOTE: We can no longer compute one value from the other as
 1253 * the actual write length and user-provided length may be
1254 * different.
1255 */
1256
1257 if (wc->w_large_pages) {
1258 /*
1259 * We only care about the 1st and last cluster within
1260 * our range and whether they should be zero'd or not. Either
1261 * value may be extended out to the start/end of a
1262 * newly allocated cluster.
1263 */
1264 desc = &wc->w_desc[0];
1265 if (ocfs2_should_zero_cluster(desc))
1266 ocfs2_figure_cluster_boundaries(osb,
1267 desc->c_cpos,
1268 &wc->w_target_from,
1269 NULL);
1270
1271 desc = &wc->w_desc[wc->w_clen - 1];
1272 if (ocfs2_should_zero_cluster(desc))
1273 ocfs2_figure_cluster_boundaries(osb,
1274 desc->c_cpos,
1275 NULL,
1276 &wc->w_target_to);
1277 } else {
1278 wc->w_target_from = 0;
1279 wc->w_target_to = PAGE_CACHE_SIZE;
1280 }
1179} 1281}
1180 1282
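A worked example of the boundary extension, assuming 8K pages and 4K clusters (so w_large_pages == 1): an allocating write of len == 1000 at pos == 5000 starts with w_target_from == 5000 and w_target_to == 6000. The write lands in cluster 1, which spans bytes [4096, 8192) of page 0, so both boundaries get pushed out to cover the newly allocated cluster:

	/*
	 *   w_target_from: 5000 -> 4096   (cluster start within the page)
	 *   w_target_to:   6000 -> 8192   (cluster end within the page)
	 */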
1181/* 1283/*
1182 * Write a cluster to an inode. The cluster may not be allocated yet, 1284 * Populate each single-cluster write descriptor in the write context
1183 * in which case it will be. This only exists for buffered writes - 1285 * with information about the i/o to be done.
1184 * O_DIRECT takes a more "traditional" path through the kernel.
1185 *
1186 * The caller is responsible for incrementing pos, written counts, etc
1187 * 1286 *
1188 * For file systems that don't support sparse files, pre-allocation 1287 * Returns the number of clusters that will have to be allocated, as
1189 * and page zeroing up until cpos should be done prior to this 1288 * well as a worst case estimate of the number of extent records that
1190 * function call. 1289 * would have to be created during a write to an unwritten region.
1191 *
1192 * Callers should be holding i_sem, and the rw cluster lock.
1193 *
1194 * Returns the number of user bytes written, or less than zero for
1195 * error.
1196 */ 1290 */
1197ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos, 1291static int ocfs2_populate_write_desc(struct inode *inode,
1198 size_t count, ocfs2_page_writer *actor, 1292 struct ocfs2_write_ctxt *wc,
1199 void *priv) 1293 unsigned int *clusters_to_alloc,
1294 unsigned int *extents_to_split)
1295{
1296 int ret;
1297 struct ocfs2_write_cluster_desc *desc;
1298 unsigned int num_clusters = 0;
1299 unsigned int ext_flags = 0;
1300 u32 phys = 0;
1301 int i;
1302
1303 *clusters_to_alloc = 0;
1304 *extents_to_split = 0;
1305
1306 for (i = 0; i < wc->w_clen; i++) {
1307 desc = &wc->w_desc[i];
1308 desc->c_cpos = wc->w_cpos + i;
1309
1310 if (num_clusters == 0) {
1311 /*
1312 * Need to look up the next extent record.
1313 */
1314 ret = ocfs2_get_clusters(inode, desc->c_cpos, &phys,
1315 &num_clusters, &ext_flags);
1316 if (ret) {
1317 mlog_errno(ret);
1318 goto out;
1319 }
1320
1321 /*
1322 * Assume worst case - that we're writing in
1323 * the middle of the extent.
1324 *
1325 * We can assume that the write proceeds from
1326 * left to right, in which case the extent
1327 * insert code is smart enough to coalesce the
1328 * next splits into the previous records created.
1329 */
1330 if (ext_flags & OCFS2_EXT_UNWRITTEN)
1331 *extents_to_split = *extents_to_split + 2;
1332 } else if (phys) {
1333 /*
1334 * Only increment phys if it doesn't describe
1335 * a hole.
1336 */
1337 phys++;
1338 }
1339
1340 desc->c_phys = phys;
1341 if (phys == 0) {
1342 desc->c_new = 1;
1343 *clusters_to_alloc = *clusters_to_alloc + 1;
1344 }
1345 if (ext_flags & OCFS2_EXT_UNWRITTEN)
1346 desc->c_unwritten = 1;
1347
1348 num_clusters--;
1349 }
1350
1351 ret = 0;
1352out:
1353 return ret;
1354}
1355
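An example walk through the descriptor loop for a hypothetical two-cluster write, where cluster 0 of the range is a hole and cluster 1 sits in an unwritten extent:

	/*
	 *   i == 0: phys == 0           -> c_new = 1,
	 *                                  clusters_to_alloc = 1
	 *   i == 1: OCFS2_EXT_UNWRITTEN -> c_unwritten = 1,
	 *                                  extents_to_split = 2
	 *
	 * The split estimate is the worst case of writing into the
	 * middle of the unwritten extent, per the comment above.
	 */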
1356int ocfs2_write_begin_nolock(struct address_space *mapping,
1357 loff_t pos, unsigned len, unsigned flags,
1358 struct page **pagep, void **fsdata,
1359 struct buffer_head *di_bh, struct page *mmap_page)
1200{ 1360{
1201 int ret, credits = OCFS2_INODE_UPDATE_CREDITS; 1361 int ret, credits = OCFS2_INODE_UPDATE_CREDITS;
1202 ssize_t written = 0; 1362 unsigned int clusters_to_alloc, extents_to_split;
1203 u32 phys; 1363 struct ocfs2_write_ctxt *wc;
1204 struct inode *inode = file->f_mapping->host; 1364 struct inode *inode = mapping->host;
1205 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1365 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1206 struct buffer_head *di_bh = NULL;
1207 struct ocfs2_dinode *di; 1366 struct ocfs2_dinode *di;
1208 struct ocfs2_alloc_context *data_ac = NULL; 1367 struct ocfs2_alloc_context *data_ac = NULL;
1209 struct ocfs2_alloc_context *meta_ac = NULL; 1368 struct ocfs2_alloc_context *meta_ac = NULL;
1210 handle_t *handle; 1369 handle_t *handle;
1211 struct ocfs2_write_ctxt wc;
1212
1213 ocfs2_write_ctxt_init(&wc, osb, pos, count, actor, priv);
1214 1370
1215 ret = ocfs2_meta_lock(inode, &di_bh, 1); 1371 ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh);
1216 if (ret) { 1372 if (ret) {
1217 mlog_errno(ret); 1373 mlog_errno(ret);
1218 goto out; 1374 return ret;
1219 } 1375 }
1220 di = (struct ocfs2_dinode *)di_bh->b_data;
1221
1222 /*
1223 * Take alloc sem here to prevent concurrent lookups. That way
1224 * the mapping, zeroing and tree manipulation within
1225 * ocfs2_write() will be safe against ->readpage(). This
1226 * should also serve to lock out allocation from a shared
1227 * writeable region.
1228 */
1229 down_write(&OCFS2_I(inode)->ip_alloc_sem);
1230 1376
1231 ret = ocfs2_get_clusters(inode, wc.w_cpos, &phys, NULL, NULL); 1377 ret = ocfs2_populate_write_desc(inode, wc, &clusters_to_alloc,
1378 &extents_to_split);
1232 if (ret) { 1379 if (ret) {
1233 mlog_errno(ret); 1380 mlog_errno(ret);
1234 goto out_meta; 1381 goto out;
1235 } 1382 }
1236 1383
1237 /* phys == 0 means that allocation is required. */ 1384 di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
1238 if (phys == 0) { 1385
1239 ret = ocfs2_lock_allocators(inode, di, 1, &data_ac, &meta_ac); 1386 /*
1387 * We set w_target_from, w_target_to here so that
1388 * ocfs2_write_end() knows which range in the target page to
1389 * write out. An allocation requires that we write the entire
1390 * cluster range.
1391 */
1392 if (clusters_to_alloc || extents_to_split) {
1393 /*
1394 * XXX: We are stretching the limits of
1395 * ocfs2_lock_allocators(). It greatly over-estimates
1396 * the work to be done.
1397 */
1398 ret = ocfs2_lock_allocators(inode, di, clusters_to_alloc,
1399 extents_to_split, &data_ac, &meta_ac);
1240 if (ret) { 1400 if (ret) {
1241 mlog_errno(ret); 1401 mlog_errno(ret);
1242 goto out_meta; 1402 goto out;
1243 } 1403 }
1244 1404
1245 credits = ocfs2_calc_extend_credits(inode->i_sb, di, 1); 1405 credits = ocfs2_calc_extend_credits(inode->i_sb, di,
1246 } 1406 clusters_to_alloc);
1247 1407
1248 ret = ocfs2_data_lock(inode, 1);
1249 if (ret) {
1250 mlog_errno(ret);
1251 goto out_meta;
1252 } 1408 }
1253 1409
1410 ocfs2_set_target_boundaries(osb, wc, pos, len,
1411 clusters_to_alloc + extents_to_split);
1412
1254 handle = ocfs2_start_trans(osb, credits); 1413 handle = ocfs2_start_trans(osb, credits);
1255 if (IS_ERR(handle)) { 1414 if (IS_ERR(handle)) {
1256 ret = PTR_ERR(handle); 1415 ret = PTR_ERR(handle);
1257 mlog_errno(ret); 1416 mlog_errno(ret);
1258 goto out_data; 1417 goto out;
1259 } 1418 }
1260 1419
1261 written = ocfs2_write(file, phys, handle, di_bh, data_ac, 1420 wc->w_handle = handle;
1262 meta_ac, &wc); 1421
1263 if (written < 0) { 1422 /*
1264 ret = written; 1423 * We don't want this to fail in ocfs2_write_end(), so do it
1424 * here.
1425 */
1426 ret = ocfs2_journal_access(handle, inode, wc->w_di_bh,
1427 OCFS2_JOURNAL_ACCESS_WRITE);
1428 if (ret) {
1265 mlog_errno(ret); 1429 mlog_errno(ret);
1266 goto out_commit; 1430 goto out_commit;
1267 } 1431 }
1268 1432
1269 ret = ocfs2_journal_access(handle, inode, di_bh, 1433 /*
1270 OCFS2_JOURNAL_ACCESS_WRITE); 1434 * Fill our page array first. That way we've grabbed enough so
1435 * that we can zero and flush if we error after adding the
1436 * extent.
1437 */
1438 ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos,
1439 clusters_to_alloc + extents_to_split,
1440 mmap_page);
1271 if (ret) { 1441 if (ret) {
1272 mlog_errno(ret); 1442 mlog_errno(ret);
1273 goto out_commit; 1443 goto out_commit;
1274 } 1444 }
1275 1445
1276 pos += written; 1446 ret = ocfs2_write_cluster_by_desc(mapping, data_ac, meta_ac, wc, pos,
1447 len);
1448 if (ret) {
1449 mlog_errno(ret);
1450 goto out_commit;
1451 }
1452
1453 if (data_ac)
1454 ocfs2_free_alloc_context(data_ac);
1455 if (meta_ac)
1456 ocfs2_free_alloc_context(meta_ac);
1457
1458 *pagep = wc->w_target_page;
1459 *fsdata = wc;
1460 return 0;
1461out_commit:
1462 ocfs2_commit_trans(osb, handle);
1463
1464out:
1465 ocfs2_free_write_ctxt(wc);
1466
1467 if (data_ac)
1468 ocfs2_free_alloc_context(data_ac);
1469 if (meta_ac)
1470 ocfs2_free_alloc_context(meta_ac);
1471 return ret;
1472}
1473
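A hedged sketch of how the _nolock pair composes; this mirrors ocfs2_write_begin()/ocfs2_write_end() below, and the mmap fault path is expected to pass its own page as mmap_page:

	ret = ocfs2_write_begin_nolock(mapping, pos, len, flags, &page,
				       &fsdata, di_bh, NULL);
	if (ret)
		goto out;

	/* ... copy 'copied' bytes of user data into 'page' ... */

	copied = ocfs2_write_end_nolock(mapping, pos, len, copied,
					page, fsdata);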
1474int ocfs2_write_begin(struct file *file, struct address_space *mapping,
1475 loff_t pos, unsigned len, unsigned flags,
1476 struct page **pagep, void **fsdata)
1477{
1478 int ret;
1479 struct buffer_head *di_bh = NULL;
1480 struct inode *inode = mapping->host;
1481
1482 ret = ocfs2_meta_lock(inode, &di_bh, 1);
1483 if (ret) {
1484 mlog_errno(ret);
1485 return ret;
1486 }
1487
1488 /*
1489 * Take alloc sem here to prevent concurrent lookups. That way
1490 * the mapping, zeroing and tree manipulation within
1491 * ocfs2_write() will be safe against ->readpage(). This
1492 * should also serve to lock out allocation from a shared
1493 * writeable region.
1494 */
1495 down_write(&OCFS2_I(inode)->ip_alloc_sem);
1496
1497 ret = ocfs2_data_lock(inode, 1);
1498 if (ret) {
1499 mlog_errno(ret);
1500 goto out_fail;
1501 }
1502
1503 ret = ocfs2_write_begin_nolock(mapping, pos, len, flags, pagep,
1504 fsdata, di_bh, NULL);
1505 if (ret) {
1506 mlog_errno(ret);
1507 goto out_fail_data;
1508 }
1509
1510 brelse(di_bh);
1511
1512 return 0;
1513
1514out_fail_data:
1515 ocfs2_data_unlock(inode, 1);
1516out_fail:
1517 up_write(&OCFS2_I(inode)->ip_alloc_sem);
1518
1519 brelse(di_bh);
1520 ocfs2_meta_unlock(inode, 1);
1521
1522 return ret;
1523}
1524
1525int ocfs2_write_end_nolock(struct address_space *mapping,
1526 loff_t pos, unsigned len, unsigned copied,
1527 struct page *page, void *fsdata)
1528{
1529 int i;
1530 unsigned from, to, start = pos & (PAGE_CACHE_SIZE - 1);
1531 struct inode *inode = mapping->host;
1532 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1533 struct ocfs2_write_ctxt *wc = fsdata;
1534 struct ocfs2_dinode *di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
1535 handle_t *handle = wc->w_handle;
1536 struct page *tmppage;
1537
1538 if (unlikely(copied < len)) {
1539 if (!PageUptodate(wc->w_target_page))
1540 copied = 0;
1541
1542 ocfs2_zero_new_buffers(wc->w_target_page, start+copied,
1543 start+len);
1544 }
1545 flush_dcache_page(wc->w_target_page);
1546
1547 for(i = 0; i < wc->w_num_pages; i++) {
1548 tmppage = wc->w_pages[i];
1549
1550 if (tmppage == wc->w_target_page) {
1551 from = wc->w_target_from;
1552 to = wc->w_target_to;
1553
1554 BUG_ON(from > PAGE_CACHE_SIZE ||
1555 to > PAGE_CACHE_SIZE ||
1556 to < from);
1557 } else {
1558 /*
1559 * Pages adjacent to the target (if any) imply
1560 * a hole-filling write in which case we want
1561 * to flush their entire range.
1562 */
1563 from = 0;
1564 to = PAGE_CACHE_SIZE;
1565 }
1566
1567 if (ocfs2_should_order_data(inode))
1568 walk_page_buffers(wc->w_handle, page_buffers(tmppage),
1569 from, to, NULL,
1570 ocfs2_journal_dirty_data);
1571
1572 block_commit_write(tmppage, from, to);
1573 }
1574
1575 pos += copied;
1277 if (pos > inode->i_size) { 1576 if (pos > inode->i_size) {
1278 i_size_write(inode, pos); 1577 i_size_write(inode, pos);
1279 mark_inode_dirty(inode); 1578 mark_inode_dirty(inode);
@@ -1283,29 +1582,31 @@ ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos,
1283 inode->i_mtime = inode->i_ctime = CURRENT_TIME; 1582 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
1284 di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); 1583 di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
1285 di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); 1584 di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
1585 ocfs2_journal_dirty(handle, wc->w_di_bh);
1286 1586
1287 ret = ocfs2_journal_dirty(handle, di_bh);
1288 if (ret)
1289 mlog_errno(ret);
1290
1291out_commit:
1292 ocfs2_commit_trans(osb, handle); 1587 ocfs2_commit_trans(osb, handle);
1293 1588
1294out_data: 1589 ocfs2_run_deallocs(osb, &wc->w_dealloc);
1295 ocfs2_data_unlock(inode, 1); 1590
1591 ocfs2_free_write_ctxt(wc);
1592
1593 return copied;
1594}
1595
1596int ocfs2_write_end(struct file *file, struct address_space *mapping,
1597 loff_t pos, unsigned len, unsigned copied,
1598 struct page *page, void *fsdata)
1599{
1600 int ret;
1601 struct inode *inode = mapping->host;
1296 1602
1297out_meta: 1603 ret = ocfs2_write_end_nolock(mapping, pos, len, copied, page, fsdata);
1604
1605 ocfs2_data_unlock(inode, 1);
1298 up_write(&OCFS2_I(inode)->ip_alloc_sem); 1606 up_write(&OCFS2_I(inode)->ip_alloc_sem);
1299 ocfs2_meta_unlock(inode, 1); 1607 ocfs2_meta_unlock(inode, 1);
1300 1608
1301out: 1609 return ret;
1302 brelse(di_bh);
1303 if (data_ac)
1304 ocfs2_free_alloc_context(data_ac);
1305 if (meta_ac)
1306 ocfs2_free_alloc_context(meta_ac);
1307
1308 return written ? written : ret;
1309} 1610}
1310 1611
1311const struct address_space_operations ocfs2_aops = { 1612const struct address_space_operations ocfs2_aops = {
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index 45821d479b5a..389579bd64e3 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -42,57 +42,22 @@ int walk_page_buffers( handle_t *handle,
42 int (*fn)( handle_t *handle, 42 int (*fn)( handle_t *handle,
43 struct buffer_head *bh)); 43 struct buffer_head *bh));
44 44
45struct ocfs2_write_ctxt; 45int ocfs2_write_begin(struct file *file, struct address_space *mapping,
46typedef int (ocfs2_page_writer)(struct inode *, struct ocfs2_write_ctxt *, 46 loff_t pos, unsigned len, unsigned flags,
47 u64 *, unsigned int *, unsigned int *); 47 struct page **pagep, void **fsdata);
48 48
49ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos, 49int ocfs2_write_end(struct file *file, struct address_space *mapping,
50 size_t count, ocfs2_page_writer *actor, 50 loff_t pos, unsigned len, unsigned copied,
51 void *priv); 51 struct page *page, void *fsdata);
52 52
53struct ocfs2_write_ctxt { 53int ocfs2_write_end_nolock(struct address_space *mapping,
54 size_t w_count; 54 loff_t pos, unsigned len, unsigned copied,
55 loff_t w_pos; 55 struct page *page, void *fsdata);
56 u32 w_cpos;
57 unsigned int w_finished_copy;
58 56
59 /* This is true if page_size > cluster_size */ 57int ocfs2_write_begin_nolock(struct address_space *mapping,
60 unsigned int w_large_pages; 58 loff_t pos, unsigned len, unsigned flags,
61 59 struct page **pagep, void **fsdata,
62 /* Filler callback and private data */ 60 struct buffer_head *di_bh, struct page *mmap_page);
63 ocfs2_page_writer *w_write_data_page;
64 void *w_private;
65
66 /* Only valid for the filler callback */
67 struct page *w_this_page;
68 unsigned int w_this_page_new;
69};
70
71struct ocfs2_buffered_write_priv {
72 char *b_src_buf;
73 const struct iovec *b_cur_iov; /* Current iovec */
74 size_t b_cur_off; /* Offset in the
75 * current iovec */
76};
77int ocfs2_map_and_write_user_data(struct inode *inode,
78 struct ocfs2_write_ctxt *wc,
79 u64 *p_blkno,
80 unsigned int *ret_from,
81 unsigned int *ret_to);
82
83struct ocfs2_splice_write_priv {
84 struct splice_desc *s_sd;
85 struct pipe_buffer *s_buf;
86 struct pipe_inode_info *s_pipe;
87 /* Neither offset value is ever larger than one page */
88 unsigned int s_offset;
89 unsigned int s_buf_offset;
90};
91int ocfs2_map_and_write_splice_data(struct inode *inode,
92 struct ocfs2_write_ctxt *wc,
93 u64 *p_blkno,
94 unsigned int *ret_from,
95 unsigned int *ret_to);
96 61
97/* all ocfs2_dio_end_io()'s fault */ 62/* all ocfs2_dio_end_io()'s fault */
98#define ocfs2_iocb_is_rw_locked(iocb) \ 63#define ocfs2_iocb_is_rw_locked(iocb) \
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 979113479c66..2bd7f788cf34 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -1335,6 +1335,7 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
1335 ret = wait_event_interruptible(o2hb_steady_queue, 1335 ret = wait_event_interruptible(o2hb_steady_queue,
1336 atomic_read(&reg->hr_steady_iterations) == 0); 1336 atomic_read(&reg->hr_steady_iterations) == 0);
1337 if (ret) { 1337 if (ret) {
1338 /* We got interrupted (hello ptrace!). Clean up */
1338 spin_lock(&o2hb_live_lock); 1339 spin_lock(&o2hb_live_lock);
1339 hb_task = reg->hr_task; 1340 hb_task = reg->hr_task;
1340 reg->hr_task = NULL; 1341 reg->hr_task = NULL;
@@ -1345,7 +1346,16 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
1345 goto out; 1346 goto out;
1346 } 1347 }
1347 1348
1348 ret = count; 1349 /* Ok, we were woken. Make sure it wasn't by drop_item() */
1350 spin_lock(&o2hb_live_lock);
1351 hb_task = reg->hr_task;
1352 spin_unlock(&o2hb_live_lock);
1353
1354 if (hb_task)
1355 ret = count;
1356 else
1357 ret = -EIO;
1358
1349out: 1359out:
1350 if (filp) 1360 if (filp)
1351 fput(filp); 1361 fput(filp);
@@ -1523,6 +1533,15 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group,
1523 if (hb_task) 1533 if (hb_task)
1524 kthread_stop(hb_task); 1534 kthread_stop(hb_task);
1525 1535
1536 /*
1537 * If we're racing a dev_write(), we need to wake them. They will
 1538 * check reg->hr_task.
1539 */
1540 if (atomic_read(&reg->hr_steady_iterations) != 0) {
1541 atomic_set(&reg->hr_steady_iterations, 0);
1542 wake_up(&o2hb_steady_queue);
1543 }
1544
1526 config_item_put(item); 1545 config_item_put(item);
1527} 1546}
1528 1547
@@ -1665,7 +1684,67 @@ void o2hb_setup_callback(struct o2hb_callback_func *hc,
1665} 1684}
1666EXPORT_SYMBOL_GPL(o2hb_setup_callback); 1685EXPORT_SYMBOL_GPL(o2hb_setup_callback);
1667 1686
1668int o2hb_register_callback(struct o2hb_callback_func *hc) 1687static struct o2hb_region *o2hb_find_region(const char *region_uuid)
1688{
1689 struct o2hb_region *p, *reg = NULL;
1690
1691 assert_spin_locked(&o2hb_live_lock);
1692
1693 list_for_each_entry(p, &o2hb_all_regions, hr_all_item) {
1694 if (!strcmp(region_uuid, config_item_name(&p->hr_item))) {
1695 reg = p;
1696 break;
1697 }
1698 }
1699
1700 return reg;
1701}
1702
1703static int o2hb_region_get(const char *region_uuid)
1704{
1705 int ret = 0;
1706 struct o2hb_region *reg;
1707
1708 spin_lock(&o2hb_live_lock);
1709
1710 reg = o2hb_find_region(region_uuid);
1711 if (!reg)
1712 ret = -ENOENT;
1713 spin_unlock(&o2hb_live_lock);
1714
1715 if (ret)
1716 goto out;
1717
1718 ret = o2nm_depend_this_node();
1719 if (ret)
1720 goto out;
1721
1722 ret = o2nm_depend_item(&reg->hr_item);
1723 if (ret)
1724 o2nm_undepend_this_node();
1725
1726out:
1727 return ret;
1728}
1729
1730static void o2hb_region_put(const char *region_uuid)
1731{
1732 struct o2hb_region *reg;
1733
1734 spin_lock(&o2hb_live_lock);
1735
1736 reg = o2hb_find_region(region_uuid);
1737
1738 spin_unlock(&o2hb_live_lock);
1739
1740 if (reg) {
1741 o2nm_undepend_item(&reg->hr_item);
1742 o2nm_undepend_this_node();
1743 }
1744}
1745
1746int o2hb_register_callback(const char *region_uuid,
1747 struct o2hb_callback_func *hc)
1669{ 1748{
1670 struct o2hb_callback_func *tmp; 1749 struct o2hb_callback_func *tmp;
1671 struct list_head *iter; 1750 struct list_head *iter;
@@ -1681,6 +1760,12 @@ int o2hb_register_callback(struct o2hb_callback_func *hc)
1681 goto out; 1760 goto out;
1682 } 1761 }
1683 1762
1763 if (region_uuid) {
1764 ret = o2hb_region_get(region_uuid);
1765 if (ret)
1766 goto out;
1767 }
1768
1684 down_write(&o2hb_callback_sem); 1769 down_write(&o2hb_callback_sem);
1685 1770
1686 list_for_each(iter, &hbcall->list) { 1771 list_for_each(iter, &hbcall->list) {
@@ -1702,16 +1787,21 @@ out:
1702} 1787}
1703EXPORT_SYMBOL_GPL(o2hb_register_callback); 1788EXPORT_SYMBOL_GPL(o2hb_register_callback);
1704 1789
1705void o2hb_unregister_callback(struct o2hb_callback_func *hc) 1790void o2hb_unregister_callback(const char *region_uuid,
1791 struct o2hb_callback_func *hc)
1706{ 1792{
1707 BUG_ON(hc->hc_magic != O2HB_CB_MAGIC); 1793 BUG_ON(hc->hc_magic != O2HB_CB_MAGIC);
1708 1794
1709 mlog(ML_HEARTBEAT, "on behalf of %p for funcs %p\n", 1795 mlog(ML_HEARTBEAT, "on behalf of %p for funcs %p\n",
1710 __builtin_return_address(0), hc); 1796 __builtin_return_address(0), hc);
1711 1797
1798 /* XXX Can this happen _with_ a region reference? */
1712 if (list_empty(&hc->hc_item)) 1799 if (list_empty(&hc->hc_item))
1713 return; 1800 return;
1714 1801
1802 if (region_uuid)
1803 o2hb_region_put(region_uuid);
1804
1715 down_write(&o2hb_callback_sem); 1805 down_write(&o2hb_callback_sem);
1716 1806
1717 list_del_init(&hc->hc_item); 1807 list_del_init(&hc->hc_item);
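
Taken together, the first three hunks close a race between a process sleeping in o2hb_region_dev_write() and a concurrent o2hb_heartbeat_group_drop_item(): drop_item() now forces the wait condition true and wakes the queue, and the woken writer re-reads reg->hr_task under o2hb_live_lock to tell "region went steady" (return count) from "region torn down" (return -EIO). Condensed from the hunks above:

	/* writer side, after wait_event_interruptible() returns 0 */
	spin_lock(&o2hb_live_lock);
	hb_task = reg->hr_task;		/* NULL if drop_item() ran */
	spin_unlock(&o2hb_live_lock);
	ret = hb_task ? count : -EIO;

	/* teardown side, after stopping the heartbeat thread */
	if (atomic_read(&reg->hr_steady_iterations) != 0) {
		atomic_set(&reg->hr_steady_iterations, 0); /* satisfy the wait */
		wake_up(&o2hb_steady_queue);
	}
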
diff --git a/fs/ocfs2/cluster/heartbeat.h b/fs/ocfs2/cluster/heartbeat.h
index cc6d40b39771..35397dd5ecdb 100644
--- a/fs/ocfs2/cluster/heartbeat.h
+++ b/fs/ocfs2/cluster/heartbeat.h
@@ -69,8 +69,10 @@ void o2hb_setup_callback(struct o2hb_callback_func *hc,
 			 o2hb_cb_func *func,
 			 void *data,
 			 int priority);
-int o2hb_register_callback(struct o2hb_callback_func *hc);
-void o2hb_unregister_callback(struct o2hb_callback_func *hc);
+int o2hb_register_callback(const char *region_uuid,
+			   struct o2hb_callback_func *hc);
+void o2hb_unregister_callback(const char *region_uuid,
+			      struct o2hb_callback_func *hc);
 void o2hb_fill_node_map(unsigned long *map,
 			unsigned bytes);
 void o2hb_init(void);
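
Callers that care about one heartbeat region now pass its uuid so registration also pins the region (via o2hb_region_get() above); global consumers pass NULL, as the o2net and dlm updates below do. A hedged usage sketch, where my_uuid and my_down_cb are placeholder names:

	static struct o2hb_callback_func my_cb;

	o2hb_setup_callback(&my_cb, O2HB_NODE_DOWN_CB, my_down_cb, NULL, 0);
	ret = o2hb_register_callback(my_uuid, &my_cb);	/* takes a region ref */
	if (!ret) {
		/* ... */
		o2hb_unregister_callback(my_uuid, &my_cb); /* drops the ref */
	}
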
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index 9f5ad0f01ce0..af2070da308b 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -900,6 +900,46 @@ static struct o2nm_cluster_group o2nm_cluster_group = {
 	},
 };
 
+int o2nm_depend_item(struct config_item *item)
+{
+	return configfs_depend_item(&o2nm_cluster_group.cs_subsys, item);
+}
+
+void o2nm_undepend_item(struct config_item *item)
+{
+	configfs_undepend_item(&o2nm_cluster_group.cs_subsys, item);
+}
+
+int o2nm_depend_this_node(void)
+{
+	int ret = 0;
+	struct o2nm_node *local_node;
+
+	local_node = o2nm_get_node_by_num(o2nm_this_node());
+	if (!local_node) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ret = o2nm_depend_item(&local_node->nd_item);
+	o2nm_node_put(local_node);
+
+out:
+	return ret;
+}
+
+void o2nm_undepend_this_node(void)
+{
+	struct o2nm_node *local_node;
+
+	local_node = o2nm_get_node_by_num(o2nm_this_node());
+	BUG_ON(!local_node);
+
+	o2nm_undepend_item(&local_node->nd_item);
+	o2nm_node_put(local_node);
+}
+
+
 static void __exit exit_o2nm(void)
 {
 	if (ocfs2_table_header)
@@ -934,7 +974,7 @@ static int __init init_o2nm(void)
 		goto out_sysctl;
 
 	config_group_init(&o2nm_cluster_group.cs_subsys.su_group);
-	init_MUTEX(&o2nm_cluster_group.cs_subsys.su_sem);
+	mutex_init(&o2nm_cluster_group.cs_subsys.su_mutex);
 	ret = configfs_register_subsystem(&o2nm_cluster_group.cs_subsys);
 	if (ret) {
 		printk(KERN_ERR "nodemanager: Registration returned %d\n", ret);
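
The new helpers wrap configfs_depend_item(), which pins a configfs item so userspace cannot rmdir it until the matching undepend call. o2hb_region_get() above shows the intended nesting, condensed here with its unwind on failure:

	ret = o2nm_depend_this_node();		/* pin our own node item */
	if (!ret) {
		ret = o2nm_depend_item(&reg->hr_item);	/* then the region */
		if (ret)
			o2nm_undepend_this_node();	/* unwind */
	}
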
diff --git a/fs/ocfs2/cluster/nodemanager.h b/fs/ocfs2/cluster/nodemanager.h
index 070522138ae2..7c860361b8dd 100644
--- a/fs/ocfs2/cluster/nodemanager.h
+++ b/fs/ocfs2/cluster/nodemanager.h
@@ -77,4 +77,9 @@ struct o2nm_node *o2nm_get_node_by_ip(__be32 addr);
 void o2nm_node_get(struct o2nm_node *node);
 void o2nm_node_put(struct o2nm_node *node);
 
+int o2nm_depend_item(struct config_item *item);
+void o2nm_undepend_item(struct config_item *item);
+int o2nm_depend_this_node(void);
+void o2nm_undepend_this_node(void);
+
 #endif /* O2CLUSTER_NODEMANAGER_H */
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 0b229a9c7952..f0bdfd944c44 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -261,14 +261,12 @@ out:
 
 static void o2net_complete_nodes_nsw(struct o2net_node *nn)
 {
-	struct list_head *iter, *tmp;
+	struct o2net_status_wait *nsw, *tmp;
 	unsigned int num_kills = 0;
-	struct o2net_status_wait *nsw;
 
 	assert_spin_locked(&nn->nn_lock);
 
-	list_for_each_safe(iter, tmp, &nn->nn_status_list) {
-		nsw = list_entry(iter, struct o2net_status_wait, ns_node_item);
+	list_for_each_entry_safe(nsw, tmp, &nn->nn_status_list, ns_node_item) {
 		o2net_complete_nsw_locked(nn, nsw, O2NET_ERR_DIED, 0);
 		num_kills++;
 	}
@@ -764,13 +762,10 @@ EXPORT_SYMBOL_GPL(o2net_register_handler);
 
 void o2net_unregister_handler_list(struct list_head *list)
 {
-	struct list_head *pos, *n;
-	struct o2net_msg_handler *nmh;
+	struct o2net_msg_handler *nmh, *n;
 
 	write_lock(&o2net_handler_lock);
-	list_for_each_safe(pos, n, list) {
-		nmh = list_entry(pos, struct o2net_msg_handler,
-				 nh_unregister_item);
+	list_for_each_entry_safe(nmh, n, list, nh_unregister_item) {
 		mlog(ML_TCP, "unregistering handler func %p type %u key %08x\n",
 		     nmh->nh_func, nmh->nh_msg_type, nmh->nh_key);
 		rb_erase(&nmh->nh_node, &o2net_handler_tree);
@@ -1638,8 +1633,8 @@ static void o2net_hb_node_up_cb(struct o2nm_node *node, int node_num,
 
 void o2net_unregister_hb_callbacks(void)
 {
-	o2hb_unregister_callback(&o2net_hb_up);
-	o2hb_unregister_callback(&o2net_hb_down);
+	o2hb_unregister_callback(NULL, &o2net_hb_up);
+	o2hb_unregister_callback(NULL, &o2net_hb_down);
 }
 
 int o2net_register_hb_callbacks(void)
@@ -1651,9 +1646,9 @@ int o2net_register_hb_callbacks(void)
 	o2hb_setup_callback(&o2net_hb_up, O2HB_NODE_UP_CB,
 			    o2net_hb_node_up_cb, NULL, O2NET_HB_PRI);
 
-	ret = o2hb_register_callback(&o2net_hb_up);
+	ret = o2hb_register_callback(NULL, &o2net_hb_up);
 	if (ret == 0)
-		ret = o2hb_register_callback(&o2net_hb_down);
+		ret = o2hb_register_callback(NULL, &o2net_hb_down);
 
 	if (ret)
 		o2net_unregister_hb_callbacks();
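
These hunks, like the dlm conversions below, replace open-coded list_for_each()/list_entry() pairs with the typed list_for_each_entry() iterators. The _safe variants stash the next entry before the loop body runs, which is what makes deleting the current node legal; a minimal sketch of the idiom (the kfree() stands in for whatever teardown the real loops do):

	struct o2net_msg_handler *nmh, *tmp;

	list_for_each_entry_safe(nmh, tmp, list, nh_unregister_item) {
		list_del_init(&nmh->nh_unregister_item); /* safe: tmp is cached */
		kfree(nmh);
	}
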
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index c441ef1f2bad..0d5fdde959c8 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -368,7 +368,7 @@ int ocfs2_do_extend_dir(struct super_block *sb,
 	u32 offset = OCFS2_I(dir)->ip_clusters;
 
 	status = ocfs2_do_extend_allocation(OCFS2_SB(sb), dir, &offset,
-					    1, parent_fe_bh, handle,
+					    1, 0, parent_fe_bh, handle,
 					    data_ac, meta_ac, NULL);
 	BUG_ON(status == -EAGAIN);
 	if (status < 0) {
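
The extra 0 threads through the new mark_unwritten parameter added to ocfs2_do_extend_allocation() in the file.c hunks below; directory data is always written immediately, so directories never request unwritten extents:

	/* new signature, per the file.c hunk below */
	status = ocfs2_do_extend_allocation(OCFS2_SB(sb), dir, &offset,
					    1, 0 /* mark_unwritten */,
					    parent_fe_bh, handle,
					    data_ac, meta_ac, NULL);
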
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index d836b98dd99a..6954565b8ccb 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -1128,8 +1128,8 @@ bail:
 
 static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm)
 {
-	o2hb_unregister_callback(&dlm->dlm_hb_up);
-	o2hb_unregister_callback(&dlm->dlm_hb_down);
+	o2hb_unregister_callback(NULL, &dlm->dlm_hb_up);
+	o2hb_unregister_callback(NULL, &dlm->dlm_hb_down);
 	o2net_unregister_handler_list(&dlm->dlm_domain_handlers);
 }
 
@@ -1141,13 +1141,13 @@ static int dlm_register_domain_handlers(struct dlm_ctxt *dlm)
 
 	o2hb_setup_callback(&dlm->dlm_hb_down, O2HB_NODE_DOWN_CB,
 			    dlm_hb_node_down_cb, dlm, DLM_HB_NODE_DOWN_PRI);
-	status = o2hb_register_callback(&dlm->dlm_hb_down);
+	status = o2hb_register_callback(NULL, &dlm->dlm_hb_down);
 	if (status)
 		goto bail;
 
 	o2hb_setup_callback(&dlm->dlm_hb_up, O2HB_NODE_UP_CB,
 			    dlm_hb_node_up_cb, dlm, DLM_HB_NODE_UP_PRI);
-	status = o2hb_register_callback(&dlm->dlm_hb_up);
+	status = o2hb_register_callback(NULL, &dlm->dlm_hb_up);
 	if (status)
 		goto bail;
 
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 6edffca99d98..65b2b9b92688 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -192,25 +192,20 @@ static void dlm_print_one_mle(struct dlm_master_list_entry *mle)
 static void dlm_dump_mles(struct dlm_ctxt *dlm)
 {
 	struct dlm_master_list_entry *mle;
-	struct list_head *iter;
 
 	mlog(ML_NOTICE, "dumping all mles for domain %s:\n", dlm->name);
 	spin_lock(&dlm->master_lock);
-	list_for_each(iter, &dlm->master_list) {
-		mle = list_entry(iter, struct dlm_master_list_entry, list);
+	list_for_each_entry(mle, &dlm->master_list, list)
 		dlm_print_one_mle(mle);
-	}
 	spin_unlock(&dlm->master_lock);
 }
 
 int dlm_dump_all_mles(const char __user *data, unsigned int len)
 {
-	struct list_head *iter;
 	struct dlm_ctxt *dlm;
 
 	spin_lock(&dlm_domain_lock);
-	list_for_each(iter, &dlm_domains) {
-		dlm = list_entry (iter, struct dlm_ctxt, list);
+	list_for_each_entry(dlm, &dlm_domains, list) {
 		mlog(ML_NOTICE, "found dlm: %p, name=%s\n", dlm, dlm->name);
 		dlm_dump_mles(dlm);
 	}
@@ -454,12 +449,10 @@ static int dlm_find_mle(struct dlm_ctxt *dlm,
 			char *name, unsigned int namelen)
 {
 	struct dlm_master_list_entry *tmpmle;
-	struct list_head *iter;
 
 	assert_spin_locked(&dlm->master_lock);
 
-	list_for_each(iter, &dlm->master_list) {
-		tmpmle = list_entry(iter, struct dlm_master_list_entry, list);
+	list_for_each_entry(tmpmle, &dlm->master_list, list) {
 		if (!dlm_mle_equal(dlm, tmpmle, name, namelen))
 			continue;
 		dlm_get_mle(tmpmle);
@@ -472,13 +465,10 @@ void dlm_hb_event_notify_attached(struct dlm_ctxt *dlm, int idx, int node_up)
 void dlm_hb_event_notify_attached(struct dlm_ctxt *dlm, int idx, int node_up)
 {
 	struct dlm_master_list_entry *mle;
-	struct list_head *iter;
 
 	assert_spin_locked(&dlm->spinlock);
 
-	list_for_each(iter, &dlm->mle_hb_events) {
-		mle = list_entry(iter, struct dlm_master_list_entry,
-				 hb_events);
+	list_for_each_entry(mle, &dlm->mle_hb_events, hb_events) {
 		if (node_up)
 			dlm_mle_node_up(dlm, mle, NULL, idx);
 		else
@@ -2434,7 +2424,7 @@ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
 	int ret;
 	int i;
 	int count = 0;
-	struct list_head *queue, *iter;
+	struct list_head *queue;
 	struct dlm_lock *lock;
 
 	assert_spin_locked(&res->spinlock);
@@ -2453,8 +2443,7 @@ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
 	ret = 0;
 	queue = &res->granted;
 	for (i = 0; i < 3; i++) {
-		list_for_each(iter, queue) {
-			lock = list_entry(iter, struct dlm_lock, list);
+		list_for_each_entry(lock, queue, list) {
 			++count;
 			if (lock->ml.node == dlm->node_num) {
 				mlog(0, "found a lock owned by this node still "
@@ -2923,18 +2912,16 @@ again:
 static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm,
 				      struct dlm_lock_resource *res)
 {
-	struct list_head *iter, *iter2;
 	struct list_head *queue = &res->granted;
 	int i, bit;
-	struct dlm_lock *lock;
+	struct dlm_lock *lock, *next;
 
 	assert_spin_locked(&res->spinlock);
 
 	BUG_ON(res->owner == dlm->node_num);
 
 	for (i=0; i<3; i++) {
-		list_for_each_safe(iter, iter2, queue) {
-			lock = list_entry (iter, struct dlm_lock, list);
+		list_for_each_entry_safe(lock, next, queue, list) {
 			if (lock->ml.node != dlm->node_num) {
 				mlog(0, "putting lock for node %u\n",
 				     lock->ml.node);
@@ -2976,7 +2963,6 @@ static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm,
 {
 	int i;
 	struct list_head *queue = &res->granted;
-	struct list_head *iter;
 	struct dlm_lock *lock;
 	int nodenum;
 
@@ -2984,10 +2970,9 @@ static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm,
 
 	spin_lock(&res->spinlock);
 	for (i=0; i<3; i++) {
-		list_for_each(iter, queue) {
+		list_for_each_entry(lock, queue, list) {
 			/* up to the caller to make sure this node
 			 * is alive */
-			lock = list_entry (iter, struct dlm_lock, list);
 			if (lock->ml.node != dlm->node_num) {
 				spin_unlock(&res->spinlock);
 				return lock->ml.node;
@@ -3234,8 +3219,7 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
 
 void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node)
 {
-	struct list_head *iter, *iter2;
-	struct dlm_master_list_entry *mle;
+	struct dlm_master_list_entry *mle, *next;
 	struct dlm_lock_resource *res;
 	unsigned int hash;
 
@@ -3245,9 +3229,7 @@ top:
 
 	/* clean the master list */
 	spin_lock(&dlm->master_lock);
-	list_for_each_safe(iter, iter2, &dlm->master_list) {
-		mle = list_entry(iter, struct dlm_master_list_entry, list);
-
+	list_for_each_entry_safe(mle, next, &dlm->master_list, list) {
 		BUG_ON(mle->type != DLM_MLE_BLOCK &&
 		       mle->type != DLM_MLE_MASTER &&
 		       mle->type != DLM_MLE_MIGRATION);
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 671c4ed58ee2..a2c33160bfd6 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -158,8 +158,7 @@ void dlm_dispatch_work(struct work_struct *work)
 	struct dlm_ctxt *dlm =
 		container_of(work, struct dlm_ctxt, dispatched_work);
 	LIST_HEAD(tmp_list);
-	struct list_head *iter, *iter2;
-	struct dlm_work_item *item;
+	struct dlm_work_item *item, *next;
 	dlm_workfunc_t *workfunc;
 	int tot=0;
 
@@ -167,13 +166,12 @@ void dlm_dispatch_work(struct work_struct *work)
 	list_splice_init(&dlm->work_list, &tmp_list);
 	spin_unlock(&dlm->work_lock);
 
-	list_for_each_safe(iter, iter2, &tmp_list) {
+	list_for_each_entry(item, &tmp_list, list) {
 		tot++;
 	}
 	mlog(0, "%s: work thread has %d work items\n", dlm->name, tot);
 
-	list_for_each_safe(iter, iter2, &tmp_list) {
-		item = list_entry(iter, struct dlm_work_item, list);
+	list_for_each_entry_safe(item, next, &tmp_list, list) {
 		workfunc = item->func;
 		list_del_init(&item->list);
 
@@ -549,7 +547,6 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
 {
 	int status = 0;
 	struct dlm_reco_node_data *ndata;
-	struct list_head *iter;
 	int all_nodes_done;
 	int destroy = 0;
 	int pass = 0;
@@ -567,8 +564,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
 
 	/* safe to access the node data list without a lock, since this
 	 * process is the only one to change the list */
-	list_for_each(iter, &dlm->reco.node_data) {
-		ndata = list_entry (iter, struct dlm_reco_node_data, list);
+	list_for_each_entry(ndata, &dlm->reco.node_data, list) {
 		BUG_ON(ndata->state != DLM_RECO_NODE_DATA_INIT);
 		ndata->state = DLM_RECO_NODE_DATA_REQUESTING;
 
@@ -655,9 +651,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
 	 * done, or if anyone died */
 	all_nodes_done = 1;
 	spin_lock(&dlm_reco_state_lock);
-	list_for_each(iter, &dlm->reco.node_data) {
-		ndata = list_entry (iter, struct dlm_reco_node_data, list);
-
+	list_for_each_entry(ndata, &dlm->reco.node_data, list) {
 		mlog(0, "checking recovery state of node %u\n",
 		     ndata->node_num);
 		switch (ndata->state) {
@@ -774,16 +768,14 @@ static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node)
 
 static void dlm_destroy_recovery_area(struct dlm_ctxt *dlm, u8 dead_node)
 {
-	struct list_head *iter, *iter2;
-	struct dlm_reco_node_data *ndata;
+	struct dlm_reco_node_data *ndata, *next;
 	LIST_HEAD(tmplist);
 
 	spin_lock(&dlm_reco_state_lock);
 	list_splice_init(&dlm->reco.node_data, &tmplist);
 	spin_unlock(&dlm_reco_state_lock);
 
-	list_for_each_safe(iter, iter2, &tmplist) {
-		ndata = list_entry (iter, struct dlm_reco_node_data, list);
+	list_for_each_entry_safe(ndata, next, &tmplist, list) {
 		list_del_init(&ndata->list);
 		kfree(ndata);
 	}
@@ -876,7 +868,6 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data)
 	struct dlm_lock_resource *res;
 	struct dlm_ctxt *dlm;
 	LIST_HEAD(resources);
-	struct list_head *iter;
 	int ret;
 	u8 dead_node, reco_master;
 	int skip_all_done = 0;
@@ -920,8 +911,7 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data)
 
 	/* any errors returned will be due to the new_master dying,
 	 * the dlm_reco_thread should detect this */
-	list_for_each(iter, &resources) {
-		res = list_entry (iter, struct dlm_lock_resource, recovering);
+	list_for_each_entry(res, &resources, recovering) {
 		ret = dlm_send_one_lockres(dlm, res, mres, reco_master,
 					   DLM_MRES_RECOVERY);
 		if (ret < 0) {
@@ -983,7 +973,6 @@ int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data,
 {
 	struct dlm_ctxt *dlm = data;
 	struct dlm_reco_data_done *done = (struct dlm_reco_data_done *)msg->buf;
-	struct list_head *iter;
 	struct dlm_reco_node_data *ndata = NULL;
 	int ret = -EINVAL;
 
@@ -1000,8 +989,7 @@ int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data,
 	     dlm->reco.dead_node, done->node_idx, dlm->node_num);
 
 	spin_lock(&dlm_reco_state_lock);
-	list_for_each(iter, &dlm->reco.node_data) {
-		ndata = list_entry (iter, struct dlm_reco_node_data, list);
+	list_for_each_entry(ndata, &dlm->reco.node_data, list) {
 		if (ndata->node_num != done->node_idx)
 			continue;
 
@@ -1049,13 +1037,11 @@ static void dlm_move_reco_locks_to_list(struct dlm_ctxt *dlm,
 					struct list_head *list,
 					u8 dead_node)
 {
-	struct dlm_lock_resource *res;
-	struct list_head *iter, *iter2;
+	struct dlm_lock_resource *res, *next;
 	struct dlm_lock *lock;
 
 	spin_lock(&dlm->spinlock);
-	list_for_each_safe(iter, iter2, &dlm->reco.resources) {
-		res = list_entry (iter, struct dlm_lock_resource, recovering);
+	list_for_each_entry_safe(res, next, &dlm->reco.resources, recovering) {
 		/* always prune any $RECOVERY entries for dead nodes,
 		 * otherwise hangs can occur during later recovery */
 		if (dlm_is_recovery_lock(res->lockname.name,
@@ -1169,7 +1155,7 @@ static void dlm_init_migratable_lockres(struct dlm_migratable_lockres *mres,
 					u8 flags, u8 master)
 {
 	/* mres here is one full page */
-	memset(mres, 0, PAGE_SIZE);
+	clear_page(mres);
 	mres->lockname_len = namelen;
 	memcpy(mres->lockname, lockname, namelen);
 	mres->num_locks = 0;
@@ -1252,7 +1238,7 @@ int dlm_send_one_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
 			 struct dlm_migratable_lockres *mres,
 			 u8 send_to, u8 flags)
 {
-	struct list_head *queue, *iter;
+	struct list_head *queue;
 	int total_locks, i;
 	u64 mig_cookie = 0;
 	struct dlm_lock *lock;
@@ -1278,9 +1264,7 @@ int dlm_send_one_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
 	total_locks = 0;
 	for (i=DLM_GRANTED_LIST; i<=DLM_BLOCKED_LIST; i++) {
 		queue = dlm_list_idx_to_ptr(res, i);
-		list_for_each(iter, queue) {
-			lock = list_entry (iter, struct dlm_lock, list);
-
+		list_for_each_entry(lock, queue, list) {
 			/* add another lock. */
 			total_locks++;
 			if (!dlm_add_lock_to_array(lock, mres, i))
@@ -1717,7 +1701,6 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
 	struct dlm_lockstatus *lksb = NULL;
 	int ret = 0;
 	int i, j, bad;
-	struct list_head *iter;
 	struct dlm_lock *lock = NULL;
 	u8 from = O2NM_MAX_NODES;
 	unsigned int added = 0;
@@ -1755,8 +1738,7 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
 			spin_lock(&res->spinlock);
 			for (j = DLM_GRANTED_LIST; j <= DLM_BLOCKED_LIST; j++) {
 				tmpq = dlm_list_idx_to_ptr(res, j);
-				list_for_each(iter, tmpq) {
-					lock = list_entry (iter, struct dlm_lock, list);
+				list_for_each_entry(lock, tmpq, list) {
 					if (lock->ml.cookie != ml->cookie)
 						lock = NULL;
 					else
@@ -1930,8 +1912,8 @@ void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm,
 				       struct dlm_lock_resource *res)
 {
 	int i;
-	struct list_head *queue, *iter, *iter2;
-	struct dlm_lock *lock;
+	struct list_head *queue;
+	struct dlm_lock *lock, *next;
 
 	res->state |= DLM_LOCK_RES_RECOVERING;
 	if (!list_empty(&res->recovering)) {
@@ -1947,8 +1929,7 @@ void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm,
 	/* find any pending locks and put them back on proper list */
 	for (i=DLM_BLOCKED_LIST; i>=DLM_GRANTED_LIST; i--) {
 		queue = dlm_list_idx_to_ptr(res, i);
-		list_for_each_safe(iter, iter2, queue) {
-			lock = list_entry (iter, struct dlm_lock, list);
+		list_for_each_entry_safe(lock, next, queue, list) {
 			dlm_lock_get(lock);
 			if (lock->convert_pending) {
 				/* move converting lock back to granted */
@@ -2013,18 +1994,15 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
 					      u8 dead_node, u8 new_master)
 {
 	int i;
-	struct list_head *iter, *iter2;
 	struct hlist_node *hash_iter;
 	struct hlist_head *bucket;
-
-	struct dlm_lock_resource *res;
+	struct dlm_lock_resource *res, *next;
 
 	mlog_entry_void();
 
 	assert_spin_locked(&dlm->spinlock);
 
-	list_for_each_safe(iter, iter2, &dlm->reco.resources) {
-		res = list_entry (iter, struct dlm_lock_resource, recovering);
+	list_for_each_entry_safe(res, next, &dlm->reco.resources, recovering) {
 		if (res->owner == dead_node) {
 			list_del_init(&res->recovering);
 			spin_lock(&res->spinlock);
@@ -2099,7 +2077,7 @@ static inline int dlm_lvb_needs_invalidation(struct dlm_lock *lock, int local)
 static void dlm_revalidate_lvb(struct dlm_ctxt *dlm,
 			       struct dlm_lock_resource *res, u8 dead_node)
 {
-	struct list_head *iter, *queue;
+	struct list_head *queue;
 	struct dlm_lock *lock;
 	int blank_lvb = 0, local = 0;
 	int i;
@@ -2121,8 +2099,7 @@ static void dlm_revalidate_lvb(struct dlm_ctxt *dlm,
 
 	for (i=DLM_GRANTED_LIST; i<=DLM_CONVERTING_LIST; i++) {
 		queue = dlm_list_idx_to_ptr(res, i);
-		list_for_each(iter, queue) {
-			lock = list_entry (iter, struct dlm_lock, list);
+		list_for_each_entry(lock, queue, list) {
 			if (lock->ml.node == search_node) {
 				if (dlm_lvb_needs_invalidation(lock, local)) {
 					/* zero the lksb lvb and lockres lvb */
@@ -2143,8 +2120,7 @@ static void dlm_revalidate_lvb(struct dlm_ctxt *dlm,
 static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
 				struct dlm_lock_resource *res, u8 dead_node)
 {
-	struct list_head *iter, *tmpiter;
-	struct dlm_lock *lock;
+	struct dlm_lock *lock, *next;
 	unsigned int freed = 0;
 
 	/* this node is the lockres master:
@@ -2155,24 +2131,21 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
 	assert_spin_locked(&res->spinlock);
 
 	/* TODO: check pending_asts, pending_basts here */
-	list_for_each_safe(iter, tmpiter, &res->granted) {
-		lock = list_entry (iter, struct dlm_lock, list);
+	list_for_each_entry_safe(lock, next, &res->granted, list) {
 		if (lock->ml.node == dead_node) {
 			list_del_init(&lock->list);
 			dlm_lock_put(lock);
 			freed++;
 		}
 	}
-	list_for_each_safe(iter, tmpiter, &res->converting) {
-		lock = list_entry (iter, struct dlm_lock, list);
+	list_for_each_entry_safe(lock, next, &res->converting, list) {
 		if (lock->ml.node == dead_node) {
 			list_del_init(&lock->list);
 			dlm_lock_put(lock);
 			freed++;
 		}
 	}
-	list_for_each_safe(iter, tmpiter, &res->blocked) {
-		lock = list_entry (iter, struct dlm_lock, list);
+	list_for_each_entry_safe(lock, next, &res->blocked, list) {
 		if (lock->ml.node == dead_node) {
 			list_del_init(&lock->list);
 			dlm_lock_put(lock);
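
One change here is not an iterator conversion: dlm_init_migratable_lockres() now uses clear_page() instead of memset(). The two are interchangeable only because mres really is one full, page-aligned page (the in-line comment says as much); clear_page() may take an arch-optimized page-clearing path but offers no partial or unaligned variant:

	/* equivalent only for a page-sized, page-aligned buffer */
	memset(mres, 0, PAGE_SIZE);	/* generic byte-wise clear */
	clear_page(mres);		/* arch-optimized, assumes alignment */
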
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index d1bd305ef0d7..f71250ed166f 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -600,15 +600,13 @@ static inline int ocfs2_highest_compat_lock_level(int level)
 static void lockres_set_flags(struct ocfs2_lock_res *lockres,
 			      unsigned long newflags)
 {
-	struct list_head *pos, *tmp;
-	struct ocfs2_mask_waiter *mw;
+	struct ocfs2_mask_waiter *mw, *tmp;
 
 	assert_spin_locked(&lockres->l_lock);
 
 	lockres->l_flags = newflags;
 
-	list_for_each_safe(pos, tmp, &lockres->l_mask_waiters) {
-		mw = list_entry(pos, struct ocfs2_mask_waiter, mw_item);
+	list_for_each_entry_safe(mw, tmp, &lockres->l_mask_waiters, mw_item) {
 		if ((lockres->l_flags & mw->mw_mask) != mw->mw_goal)
 			continue;
 
diff --git a/fs/ocfs2/endian.h b/fs/ocfs2/endian.h
index f226b2207628..ff257628af16 100644
--- a/fs/ocfs2/endian.h
+++ b/fs/ocfs2/endian.h
@@ -32,6 +32,11 @@ static inline void le32_add_cpu(__le32 *var, u32 val)
 	*var = cpu_to_le32(le32_to_cpu(*var) + val);
 }
 
+static inline void le64_add_cpu(__le64 *var, u64 val)
+{
+	*var = cpu_to_le64(le64_to_cpu(*var) + val);
+}
+
 static inline void le32_and_cpu(__le32 *var, u32 val)
 {
 	*var = cpu_to_le32(le32_to_cpu(*var) & val);
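
le64_add_cpu() mirrors the existing le32 helper exactly: decode from little-endian, add in CPU byte order, re-encode. Typical use is bumping a 64-bit on-disk field in place; i_size here is just an illustrative __le64 field:

	__le64 *field = &di->i_size;	/* any on-disk __le64 field */

	le64_add_cpu(field, bytes_added);	/* stays little-endian in memory */
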
diff --git a/fs/ocfs2/export.h b/fs/ocfs2/export.h
index 5b77ee7866ef..e08bed9e45a0 100644
--- a/fs/ocfs2/export.h
+++ b/fs/ocfs2/export.h
@@ -26,6 +26,8 @@
 #ifndef OCFS2_EXPORT_H
 #define OCFS2_EXPORT_H
 
+#include <linux/exportfs.h>
+
 extern struct export_operations ocfs2_export_ops;
 
 #endif /* OCFS2_EXPORT_H */
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index ba2b2ab1c6e4..03c1d365c78b 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -109,17 +109,14 @@ static int ocfs2_extent_map_lookup(struct inode *inode, unsigned int cpos,
  */
 void ocfs2_extent_map_trunc(struct inode *inode, unsigned int cpos)
 {
-	struct list_head *p, *n;
-	struct ocfs2_extent_map_item *emi;
+	struct ocfs2_extent_map_item *emi, *n;
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
 	struct ocfs2_extent_map *em = &oi->ip_extent_map;
 	LIST_HEAD(tmp_list);
 	unsigned int range;
 
 	spin_lock(&oi->ip_lock);
-	list_for_each_safe(p, n, &em->em_list) {
-		emi = list_entry(p, struct ocfs2_extent_map_item, ei_list);
-
+	list_for_each_entry_safe(emi, n, &em->em_list, ei_list) {
 		if (emi->ei_cpos >= cpos) {
 			/* Full truncate of this record. */
 			list_move(&emi->ei_list, &tmp_list);
@@ -136,8 +133,7 @@ void ocfs2_extent_map_trunc(struct inode *inode, unsigned int cpos)
 	}
 	spin_unlock(&oi->ip_lock);
 
-	list_for_each_safe(p, n, &tmp_list) {
-		emi = list_entry(p, struct ocfs2_extent_map_item, ei_list);
+	list_for_each_entry_safe(emi, n, &tmp_list, ei_list) {
 		list_del(&emi->ei_list);
 		kfree(emi);
 	}
@@ -377,37 +373,6 @@ out:
 	return ret;
 }
 
-/*
- * Return the index of the extent record which contains cluster #v_cluster.
- * -1 is returned if it was not found.
- *
- * Should work fine on interior and exterior nodes.
- */
-static int ocfs2_search_extent_list(struct ocfs2_extent_list *el,
-				    u32 v_cluster)
-{
-	int ret = -1;
-	int i;
-	struct ocfs2_extent_rec *rec;
-	u32 rec_end, rec_start, clusters;
-
-	for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
-		rec = &el->l_recs[i];
-
-		rec_start = le32_to_cpu(rec->e_cpos);
-		clusters = ocfs2_rec_clusters(el, rec);
-
-		rec_end = rec_start + clusters;
-
-		if (v_cluster >= rec_start && v_cluster < rec_end) {
-			ret = i;
-			break;
-		}
-	}
-
-	return ret;
-}
-
 int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
 		       u32 *p_cluster, u32 *num_clusters,
 		       unsigned int *extent_flags)
411int ocfs2_get_clusters(struct inode *inode, u32 v_cluster, 376int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
412 u32 *p_cluster, u32 *num_clusters, 377 u32 *p_cluster, u32 *num_clusters,
413 unsigned int *extent_flags) 378 unsigned int *extent_flags)
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 4979b6675717..f04c7aa834cb 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -263,6 +263,7 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
 	int status;
 	handle_t *handle;
 	struct ocfs2_dinode *di;
+	u64 cluster_bytes;
 
 	mlog_entry_void();
 
@@ -286,7 +287,9 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
 	/*
 	 * Do this before setting i_size.
 	 */
-	status = ocfs2_zero_tail_for_truncate(inode, handle, new_i_size);
+	cluster_bytes = ocfs2_align_bytes_to_clusters(inode->i_sb, new_i_size);
+	status = ocfs2_zero_range_for_truncate(inode, handle, new_i_size,
+					       cluster_bytes);
 	if (status) {
 		mlog_errno(status);
 		goto out_commit;
@@ -326,9 +329,6 @@ static int ocfs2_truncate_file(struct inode *inode,
 	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
 	     (unsigned long long)new_i_size);
 
-	unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1);
-	truncate_inode_pages(inode->i_mapping, new_i_size);
-
 	fe = (struct ocfs2_dinode *) di_bh->b_data;
 	if (!OCFS2_IS_VALID_DINODE(fe)) {
 		OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
@@ -363,16 +363,23 @@ static int ocfs2_truncate_file(struct inode *inode,
 	if (new_i_size == le64_to_cpu(fe->i_size))
 		goto bail;
 
+	down_write(&OCFS2_I(inode)->ip_alloc_sem);
+
 	/* This forces other nodes to sync and drop their pages. Do
 	 * this even if we have a truncate without allocation change -
 	 * ocfs2 cluster sizes can be much greater than page size, so
 	 * we have to truncate them anyway.  */
 	status = ocfs2_data_lock(inode, 1);
 	if (status < 0) {
+		up_write(&OCFS2_I(inode)->ip_alloc_sem);
+
 		mlog_errno(status);
 		goto bail;
 	}
 
+	unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1);
+	truncate_inode_pages(inode->i_mapping, new_i_size);
+
 	/* alright, we're going to need to do a full blown alloc size
 	 * change. Orphan the inode so that recovery can complete the
 	 * truncate if necessary. This does the task of marking
@@ -399,6 +406,8 @@ static int ocfs2_truncate_file(struct inode *inode,
 bail_unlock_data:
 	ocfs2_data_unlock(inode, 1);
 
+	up_write(&OCFS2_I(inode)->ip_alloc_sem);
+
 bail:
 
 	mlog_exit(status);
@@ -419,6 +428,7 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
 			       struct inode *inode,
 			       u32 *logical_offset,
 			       u32 clusters_to_add,
+			       int mark_unwritten,
 			       struct buffer_head *fe_bh,
 			       handle_t *handle,
 			       struct ocfs2_alloc_context *data_ac,
@@ -431,9 +441,13 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
 	enum ocfs2_alloc_restarted reason = RESTART_NONE;
 	u32 bit_off, num_bits;
 	u64 block;
+	u8 flags = 0;
 
 	BUG_ON(!clusters_to_add);
 
+	if (mark_unwritten)
+		flags = OCFS2_EXT_UNWRITTEN;
+
 	free_extents = ocfs2_num_free_extents(osb, inode, fe);
 	if (free_extents < 0) {
 		status = free_extents;
@@ -483,7 +497,7 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
 	     num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
 	status = ocfs2_insert_extent(osb, handle, inode, fe_bh,
 				     *logical_offset, block, num_bits,
-				     meta_ac);
+				     flags, meta_ac);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
@@ -516,25 +530,31 @@ leave:
  * For a given allocation, determine which allocators will need to be
  * accessed, and lock them, reserving the appropriate number of bits.
  *
- * Called from ocfs2_extend_allocation() for file systems which don't
- * support holes, and from ocfs2_write() for file systems which
- * understand sparse inodes.
+ * Sparse file systems call this from ocfs2_write_begin_nolock()
+ * and ocfs2_allocate_unwritten_extents().
+ *
+ * File systems which don't support holes call this from
+ * ocfs2_extend_allocation().
  */
 int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
-			  u32 clusters_to_add,
+			  u32 clusters_to_add, u32 extents_to_split,
 			  struct ocfs2_alloc_context **data_ac,
 			  struct ocfs2_alloc_context **meta_ac)
 {
-	int ret, num_free_extents;
+	int ret = 0, num_free_extents;
+	unsigned int max_recs_needed = clusters_to_add + 2 * extents_to_split;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
 	*meta_ac = NULL;
-	*data_ac = NULL;
+	if (data_ac)
+		*data_ac = NULL;
+
+	BUG_ON(clusters_to_add != 0 && data_ac == NULL);
 
 	mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, "
-	     "clusters_to_add = %u\n",
+	     "clusters_to_add = %u, extents_to_split = %u\n",
 	     (unsigned long long)OCFS2_I(inode)->ip_blkno, i_size_read(inode),
-	     le32_to_cpu(di->i_clusters), clusters_to_add);
+	     le32_to_cpu(di->i_clusters), clusters_to_add, extents_to_split);
 
 	num_free_extents = ocfs2_num_free_extents(osb, inode, di);
 	if (num_free_extents < 0) {
@@ -552,9 +572,12 @@ int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
 	 *
 	 * Most of the time we'll only be seeing this 1 cluster at a time
 	 * anyway.
+	 *
+	 * Always lock for any unwritten extents - we might want to
+	 * add blocks during a split.
 	 */
 	if (!num_free_extents ||
-	    (ocfs2_sparse_alloc(osb) && num_free_extents < clusters_to_add)) {
+	    (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) {
 		ret = ocfs2_reserve_new_metadata(osb, di, meta_ac);
 		if (ret < 0) {
 			if (ret != -ENOSPC)
@@ -563,6 +586,9 @@ int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
 		}
 	}
 
+	if (clusters_to_add == 0)
+		goto out;
+
 	ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac);
 	if (ret < 0) {
 		if (ret != -ENOSPC)
@@ -585,14 +611,13 @@ out:
 	return ret;
 }
 
-static int ocfs2_extend_allocation(struct inode *inode,
-				   u32 clusters_to_add)
+static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
+				     u32 clusters_to_add, int mark_unwritten)
 {
 	int status = 0;
 	int restart_func = 0;
-	int drop_alloc_sem = 0;
 	int credits;
-	u32 prev_clusters, logical_start;
+	u32 prev_clusters;
 	struct buffer_head *bh = NULL;
 	struct ocfs2_dinode *fe = NULL;
 	handle_t *handle = NULL;
@@ -607,7 +632,7 @@ static int ocfs2_extend_allocation(struct inode *inode,
 	 * This function only exists for file systems which don't
 	 * support holes.
 	 */
-	BUG_ON(ocfs2_sparse_alloc(osb));
+	BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb));
 
 	status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh,
 				  OCFS2_BH_CACHED, inode);
@@ -623,19 +648,10 @@ static int ocfs2_extend_allocation(struct inode *inode,
 		goto leave;
 	}
 
-	logical_start = OCFS2_I(inode)->ip_clusters;
-
 restart_all:
 	BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);
 
-	/* blocks peope in read/write from reading our allocation
-	 * until we're done changing it. We depend on i_mutex to block
-	 * other extend/truncate calls while we're here. Ordering wrt
-	 * start_trans is important here -- always do it before! */
-	down_write(&OCFS2_I(inode)->ip_alloc_sem);
-	drop_alloc_sem = 1;
-
-	status = ocfs2_lock_allocators(inode, fe, clusters_to_add, &data_ac,
+	status = ocfs2_lock_allocators(inode, fe, clusters_to_add, 0, &data_ac,
 				       &meta_ac);
 	if (status) {
 		mlog_errno(status);
@@ -668,6 +684,7 @@ restarted_transaction:
 					   inode,
 					   &logical_start,
 					   clusters_to_add,
+					   mark_unwritten,
 					   bh,
 					   handle,
 					   data_ac,
@@ -720,10 +737,6 @@ restarted_transaction:
 	     OCFS2_I(inode)->ip_clusters, i_size_read(inode));
 
 leave:
-	if (drop_alloc_sem) {
-		up_write(&OCFS2_I(inode)->ip_alloc_sem);
-		drop_alloc_sem = 0;
-	}
 	if (handle) {
 		ocfs2_commit_trans(osb, handle);
 		handle = NULL;
@@ -749,6 +762,25 @@ leave:
 	return status;
 }
 
+static int ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
+				   u32 clusters_to_add, int mark_unwritten)
+{
+	int ret;
+
+	/*
+	 * The alloc sem blocks peope in read/write from reading our
+	 * allocation until we're done changing it. We depend on
+	 * i_mutex to block other extend/truncate calls while we're
+	 * here.
+	 */
+	down_write(&OCFS2_I(inode)->ip_alloc_sem);
+	ret = __ocfs2_extend_allocation(inode, logical_start, clusters_to_add,
+					mark_unwritten);
+	up_write(&OCFS2_I(inode)->ip_alloc_sem);
+
+	return ret;
+}
+
 /* Some parts of this taken from generic_cont_expand, which turned out
  * to be too fragile to do exactly what we need without us having to
  * worry about recursive locking in ->prepare_write() and
@@ -890,7 +922,9 @@ static int ocfs2_extend_file(struct inode *inode,
 	}
 
 	if (clusters_to_add) {
-		ret = ocfs2_extend_allocation(inode, clusters_to_add);
+		ret = ocfs2_extend_allocation(inode,
+					      OCFS2_I(inode)->ip_clusters,
+					      clusters_to_add, 0);
 		if (ret < 0) {
 			mlog_errno(ret);
 			goto out_unlock;
@@ -995,6 +1029,13 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
 		goto bail_unlock;
 	}
 
+	/*
+	 * This will intentionally not wind up calling vmtruncate(),
+	 * since all the work for a size change has been done above.
+	 * Otherwise, we could get into problems with truncate as
+	 * ip_alloc_sem is used there to protect against i_size
+	 * changes.
+	 */
 	status = inode_setattr(inode, attr);
 	if (status < 0) {
 		mlog_errno(status);
@@ -1070,17 +1111,16 @@ out:
 	return ret;
 }
 
-static int ocfs2_write_remove_suid(struct inode *inode)
+static int __ocfs2_write_remove_suid(struct inode *inode,
+				     struct buffer_head *bh)
 {
 	int ret;
-	struct buffer_head *bh = NULL;
-	struct ocfs2_inode_info *oi = OCFS2_I(inode);
 	handle_t *handle;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	struct ocfs2_dinode *di;
 
 	mlog_entry("(Inode %llu, mode 0%o)\n",
-		   (unsigned long long)oi->ip_blkno, inode->i_mode);
+		   (unsigned long long)OCFS2_I(inode)->ip_blkno, inode->i_mode);
 
 	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
 	if (handle == NULL) {
@@ -1089,17 +1129,11 @@ static int ocfs2_write_remove_suid(struct inode *inode)
 		goto out;
 	}
 
-	ret = ocfs2_read_block(osb, oi->ip_blkno, &bh, OCFS2_BH_CACHED, inode);
-	if (ret < 0) {
-		mlog_errno(ret);
-		goto out_trans;
-	}
-
 	ret = ocfs2_journal_access(handle, inode, bh,
 				   OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret < 0) {
 		mlog_errno(ret);
-		goto out_bh;
+		goto out_trans;
 	}
 
 	inode->i_mode &= ~S_ISUID;
@@ -1112,8 +1146,7 @@ static int ocfs2_write_remove_suid(struct inode *inode)
 	ret = ocfs2_journal_dirty(handle, bh);
 	if (ret < 0)
 		mlog_errno(ret);
-out_bh:
-	brelse(bh);
+
out_trans:
 	ocfs2_commit_trans(osb, handle);
 out:
@@ -1159,6 +1192,460 @@ out:
 	return ret;
 }
 
+static int ocfs2_write_remove_suid(struct inode *inode)
+{
+	int ret;
+	struct buffer_head *bh = NULL;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+
+	ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
+			       oi->ip_blkno, &bh, OCFS2_BH_CACHED, inode);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = __ocfs2_write_remove_suid(inode, bh);
+out:
+	brelse(bh);
+	return ret;
+}
+
+/*
+ * Allocate enough extents to cover the region starting at byte offset
+ * start for len bytes. Existing extents are skipped, any extents
+ * added are marked as "unwritten".
+ */
+static int ocfs2_allocate_unwritten_extents(struct inode *inode,
+					    u64 start, u64 len)
+{
+	int ret;
+	u32 cpos, phys_cpos, clusters, alloc_size;
+
+	/*
+	 * We consider both start and len to be inclusive.
+	 */
+	cpos = start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
+	clusters = ocfs2_clusters_for_bytes(inode->i_sb, start + len);
+	clusters -= cpos;
+
+	while (clusters) {
+		ret = ocfs2_get_clusters(inode, cpos, &phys_cpos,
+					 &alloc_size, NULL);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		/*
+		 * Hole or existing extent len can be arbitrary, so
+		 * cap it to our own allocation request.
+		 */
+		if (alloc_size > clusters)
+			alloc_size = clusters;
+
+		if (phys_cpos) {
+			/*
+			 * We already have an allocation at this
+			 * region so we can safely skip it.
+			 */
+			goto next;
+		}
+
+		ret = __ocfs2_extend_allocation(inode, cpos, alloc_size, 1);
+		if (ret) {
+			if (ret != -ENOSPC)
+				mlog_errno(ret);
+			goto out;
+		}
+
+next:
+		cpos += alloc_size;
+		clusters -= alloc_size;
+	}
+
+	ret = 0;
+out:
+	return ret;
+}
+
+static int __ocfs2_remove_inode_range(struct inode *inode,
+				      struct buffer_head *di_bh,
+				      u32 cpos, u32 phys_cpos, u32 len,
+				      struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+	int ret;
+	u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct inode *tl_inode = osb->osb_tl_inode;
+	handle_t *handle;
+	struct ocfs2_alloc_context *meta_ac = NULL;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+
+	ret = ocfs2_lock_allocators(inode, di, 0, 1, NULL, &meta_ac);
+	if (ret) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	mutex_lock(&tl_inode->i_mutex);
+
+	if (ocfs2_truncate_log_needs_flush(osb)) {
+		ret = __ocfs2_flush_truncate_log(osb);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS);
+	if (handle == NULL) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_journal_access(handle, inode, di_bh,
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_remove_extent(inode, di_bh, cpos, len, handle, meta_ac,
+				  dealloc);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	OCFS2_I(inode)->ip_clusters -= len;
+	di->i_clusters = cpu_to_le32(OCFS2_I(inode)->ip_clusters);
+
+	ret = ocfs2_journal_dirty(handle, di_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len);
+	if (ret)
+		mlog_errno(ret);
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+out:
+	mutex_unlock(&tl_inode->i_mutex);
+
+	if (meta_ac)
+		ocfs2_free_alloc_context(meta_ac);
+
+	return ret;
+}
+
+/*
+ * Truncate a byte range, avoiding pages within partial clusters. This
+ * preserves those pages for the zeroing code to write to.
+ */
+static void ocfs2_truncate_cluster_pages(struct inode *inode, u64 byte_start,
+					 u64 byte_len)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	loff_t start, end;
+	struct address_space *mapping = inode->i_mapping;
+
+	start = (loff_t)ocfs2_align_bytes_to_clusters(inode->i_sb, byte_start);
+	end = byte_start + byte_len;
+	end = end & ~(osb->s_clustersize - 1);
+
+	if (start < end) {
+		unmap_mapping_range(mapping, start, end - start, 0);
+		truncate_inode_pages_range(mapping, start, end - 1);
+	}
+}
+
+static int ocfs2_zero_partial_clusters(struct inode *inode,
+				       u64 start, u64 len)
+{
+	int ret = 0;
+	u64 tmpend, end = start + len;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	unsigned int csize = osb->s_clustersize;
+	handle_t *handle;
1375
1376 /*
1377 * The "start" and "end" values are NOT necessarily part of
1378 * the range whose allocation is being deleted. Rather, this
1379 * is what the user passed in with the request. We must zero
1380 * partial clusters here. There's no need to worry about
1381 * physical allocation - the zeroing code knows to skip holes.
1382 */
1383 mlog(0, "byte start: %llu, end: %llu\n",
1384 (unsigned long long)start, (unsigned long long)end);
1385
1386 /*
1387 * If both edges are on a cluster boundary then there's no
1388 * zeroing required as the region is part of the allocation to
1389 * be truncated.
1390 */
1391 if ((start & (csize - 1)) == 0 && (end & (csize - 1)) == 0)
1392 goto out;
1393
1394 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
1395 if (handle == NULL) {
1396 ret = -ENOMEM;
1397 mlog_errno(ret);
1398 goto out;
1399 }
1400
1401 /*
1402 * We want to get the byte offset of the end of the 1st cluster.
1403 */
1404 tmpend = (u64)osb->s_clustersize + (start & ~(osb->s_clustersize - 1));
1405 if (tmpend > end)
1406 tmpend = end;
1407
1408 mlog(0, "1st range: start: %llu, tmpend: %llu\n",
1409 (unsigned long long)start, (unsigned long long)tmpend);
1410
1411 ret = ocfs2_zero_range_for_truncate(inode, handle, start, tmpend);
1412 if (ret)
1413 mlog_errno(ret);
1414
1415 if (tmpend < end) {
1416 /*
1417 * This may make start and end equal, but the zeroing
1418 * code will skip any work in that case so there's no
1419 * need to catch it up here.
1420 */
1421 start = end & ~(osb->s_clustersize - 1);
1422
1423 mlog(0, "2nd range: start: %llu, end: %llu\n",
1424 (unsigned long long)start, (unsigned long long)end);
1425
1426 ret = ocfs2_zero_range_for_truncate(inode, handle, start, end);
1427 if (ret)
1428 mlog_errno(ret);
1429 }
1430
1431 ocfs2_commit_trans(osb, handle);
1432out:
1433 return ret;
1434}
1435
1436static int ocfs2_remove_inode_range(struct inode *inode,
1437 struct buffer_head *di_bh, u64 byte_start,
1438 u64 byte_len)
1439{
1440 int ret = 0;
1441 u32 trunc_start, trunc_len, cpos, phys_cpos, alloc_size;
1442 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1443 struct ocfs2_cached_dealloc_ctxt dealloc;
1444
1445 ocfs2_init_dealloc_ctxt(&dealloc);
1446
1447 if (byte_len == 0)
1448 return 0;
1449
1450 trunc_start = ocfs2_clusters_for_bytes(osb->sb, byte_start);
1451 trunc_len = (byte_start + byte_len) >> osb->s_clustersize_bits;
1452 if (trunc_len >= trunc_start)
1453 trunc_len -= trunc_start;
1454 else
1455 trunc_len = 0;
1456
1457 mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, clen: %u\n",
1458 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1459 (unsigned long long)byte_start,
1460 (unsigned long long)byte_len, trunc_start, trunc_len);
1461
1462 ret = ocfs2_zero_partial_clusters(inode, byte_start, byte_len);
1463 if (ret) {
1464 mlog_errno(ret);
1465 goto out;
1466 }
1467
1468 cpos = trunc_start;
1469 while (trunc_len) {
1470 ret = ocfs2_get_clusters(inode, cpos, &phys_cpos,
1471 &alloc_size, NULL);
1472 if (ret) {
1473 mlog_errno(ret);
1474 goto out;
1475 }
1476
1477 if (alloc_size > trunc_len)
1478 alloc_size = trunc_len;
1479
1480 /* Only do work for non-holes */
1481 if (phys_cpos != 0) {
1482 ret = __ocfs2_remove_inode_range(inode, di_bh, cpos,
1483 phys_cpos, alloc_size,
1484 &dealloc);
1485 if (ret) {
1486 mlog_errno(ret);
1487 goto out;
1488 }
1489 }
1490
1491 cpos += alloc_size;
1492 trunc_len -= alloc_size;
1493 }
1494
1495 ocfs2_truncate_cluster_pages(inode, byte_start, byte_len);
1496
1497out:
1498 ocfs2_schedule_truncate_log_flush(osb, 1);
1499 ocfs2_run_deallocs(osb, &dealloc);
1500
1501 return ret;
1502}
1503
1504/*
1505 * Parts of this function taken from xfs_change_file_space()
1506 */
1507int ocfs2_change_file_space(struct file *file, unsigned int cmd,
1508 struct ocfs2_space_resv *sr)
1509{
1510 int ret;
1511 s64 llen;
1512 struct inode *inode = file->f_path.dentry->d_inode;
1513 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1514 struct buffer_head *di_bh = NULL;
1515 handle_t *handle;
1516 unsigned long long max_off = ocfs2_max_file_offset(inode->i_sb->s_blocksize_bits);
1517
1518 if ((cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) &&
1519 !ocfs2_writes_unwritten_extents(osb))
1520 return -ENOTTY;
1521 else if ((cmd == OCFS2_IOC_UNRESVSP || cmd == OCFS2_IOC_UNRESVSP64) &&
1522 !ocfs2_sparse_alloc(osb))
1523 return -ENOTTY;
1524
1525 if (!S_ISREG(inode->i_mode))
1526 return -EINVAL;
1527
1528 if (!(file->f_mode & FMODE_WRITE))
1529 return -EBADF;
1530
1531 if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
1532 return -EROFS;
1533
1534 mutex_lock(&inode->i_mutex);
1535
1536 /*
1537 * This prevents concurrent writes on other nodes
1538 */
1539 ret = ocfs2_rw_lock(inode, 1);
1540 if (ret) {
1541 mlog_errno(ret);
1542 goto out;
1543 }
1544
1545 ret = ocfs2_meta_lock(inode, &di_bh, 1);
1546 if (ret) {
1547 mlog_errno(ret);
1548 goto out_rw_unlock;
1549 }
1550
1551 if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
1552 ret = -EPERM;
1553 goto out_meta_unlock;
1554 }
1555
1556 switch (sr->l_whence) {
1557 case 0: /*SEEK_SET*/
1558 break;
1559 case 1: /*SEEK_CUR*/
1560 sr->l_start += file->f_pos;
1561 break;
1562 case 2: /*SEEK_END*/
1563 sr->l_start += i_size_read(inode);
1564 break;
1565 default:
1566 ret = -EINVAL;
1567 goto out_meta_unlock;
1568 }
1569 sr->l_whence = 0;
1570
1571 llen = sr->l_len > 0 ? sr->l_len - 1 : sr->l_len;
1572
1573 if (sr->l_start < 0
1574 || sr->l_start > max_off
1575 || (sr->l_start + llen) < 0
1576 || (sr->l_start + llen) > max_off) {
1577 ret = -EINVAL;
1578 goto out_meta_unlock;
1579 }
1580
1581 if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) {
1582 if (sr->l_len <= 0) {
1583 ret = -EINVAL;
1584 goto out_meta_unlock;
1585 }
1586 }
1587
1588 if (should_remove_suid(file->f_path.dentry)) {
1589 ret = __ocfs2_write_remove_suid(inode, di_bh);
1590 if (ret) {
1591 mlog_errno(ret);
1592 goto out_meta_unlock;
1593 }
1594 }
1595
1596 down_write(&OCFS2_I(inode)->ip_alloc_sem);
1597 switch (cmd) {
1598 case OCFS2_IOC_RESVSP:
1599 case OCFS2_IOC_RESVSP64:
1600 /*
1601 * This takes unsigned offsets, but the signed ones we
1602 * pass have been checked against overflow above.
1603 */
1604 ret = ocfs2_allocate_unwritten_extents(inode, sr->l_start,
1605 sr->l_len);
1606 break;
1607 case OCFS2_IOC_UNRESVSP:
1608 case OCFS2_IOC_UNRESVSP64:
1609 ret = ocfs2_remove_inode_range(inode, di_bh, sr->l_start,
1610 sr->l_len);
1611 break;
1612 default:
1613 ret = -EINVAL;
1614 }
1615 up_write(&OCFS2_I(inode)->ip_alloc_sem);
1616 if (ret) {
1617 mlog_errno(ret);
1618 goto out_meta_unlock;
1619 }
1620
1621 /*
1622 * We update c/mtime for these changes
1623 */
1624 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
1625 if (IS_ERR(handle)) {
1626 ret = PTR_ERR(handle);
1627 mlog_errno(ret);
1628 goto out_meta_unlock;
1629 }
1630
1631 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
1632 ret = ocfs2_mark_inode_dirty(handle, inode, di_bh);
1633 if (ret < 0)
1634 mlog_errno(ret);
1635
1636 ocfs2_commit_trans(osb, handle);
1637
1638out_meta_unlock:
1639 brelse(di_bh);
1640 ocfs2_meta_unlock(inode, 1);
1641out_rw_unlock:
1642 ocfs2_rw_unlock(inode, 1);
1643
1644 mutex_unlock(&inode->i_mutex);
1645out:
1646 return ret;
1647}
1648
1162static int ocfs2_prepare_inode_for_write(struct dentry *dentry, 1649static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
1163 loff_t *ppos, 1650 loff_t *ppos,
1164 size_t count, 1651 size_t count,
@@ -1329,15 +1816,16 @@ ocfs2_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes)
1329 *basep = base; 1816 *basep = base;
1330} 1817}
1331 1818
1332static struct page * ocfs2_get_write_source(struct ocfs2_buffered_write_priv *bp, 1819static struct page * ocfs2_get_write_source(char **ret_src_buf,
1333 const struct iovec *cur_iov, 1820 const struct iovec *cur_iov,
1334 size_t iov_offset) 1821 size_t iov_offset)
1335{ 1822{
1336 int ret; 1823 int ret;
1337 char *buf; 1824 char *buf = cur_iov->iov_base + iov_offset;
1338 struct page *src_page = NULL; 1825 struct page *src_page = NULL;
1826 unsigned long off;
1339 1827
1340 buf = cur_iov->iov_base + iov_offset; 1828 off = (unsigned long)(buf) & ~PAGE_CACHE_MASK;
1341 1829
1342 if (!segment_eq(get_fs(), KERNEL_DS)) { 1830 if (!segment_eq(get_fs(), KERNEL_DS)) {
1343 /* 1831 /*
@@ -1349,18 +1837,17 @@ static struct page * ocfs2_get_write_source(struct ocfs2_buffered_write_priv *bp
1349 (unsigned long)buf & PAGE_CACHE_MASK, 1, 1837 (unsigned long)buf & PAGE_CACHE_MASK, 1,
1350 0, 0, &src_page, NULL); 1838 0, 0, &src_page, NULL);
1351 if (ret == 1) 1839 if (ret == 1)
1352 bp->b_src_buf = kmap(src_page); 1840 *ret_src_buf = kmap(src_page) + off;
1353 else 1841 else
1354 src_page = ERR_PTR(-EFAULT); 1842 src_page = ERR_PTR(-EFAULT);
1355 } else { 1843 } else {
1356 bp->b_src_buf = buf; 1844 *ret_src_buf = buf;
1357 } 1845 }
1358 1846
1359 return src_page; 1847 return src_page;
1360} 1848}
1361 1849
1362static void ocfs2_put_write_source(struct ocfs2_buffered_write_priv *bp, 1850static void ocfs2_put_write_source(struct page *page)
1363 struct page *page)
1364{ 1851{
1365 if (page) { 1852 if (page) {
1366 kunmap(page); 1853 kunmap(page);
@@ -1376,10 +1863,12 @@ static ssize_t ocfs2_file_buffered_write(struct file *file, loff_t *ppos,
1376{ 1863{
1377 int ret = 0; 1864 int ret = 0;
1378 ssize_t copied, total = 0; 1865 ssize_t copied, total = 0;
1379 size_t iov_offset = 0; 1866 size_t iov_offset = 0, bytes;
1867 loff_t pos;
1380 const struct iovec *cur_iov = iov; 1868 const struct iovec *cur_iov = iov;
1381 struct ocfs2_buffered_write_priv bp; 1869 struct page *user_page, *page;
1382 struct page *page; 1870 char *buf, *dst;
1871 void *fsdata;
1383 1872
1384 /* 1873 /*
1385 * handle partial DIO write. Adjust cur_iov if needed. 1874 * handle partial DIO write. Adjust cur_iov if needed.
@@ -1387,21 +1876,38 @@ static ssize_t ocfs2_file_buffered_write(struct file *file, loff_t *ppos,
1387 ocfs2_set_next_iovec(&cur_iov, &iov_offset, o_direct_written); 1876 ocfs2_set_next_iovec(&cur_iov, &iov_offset, o_direct_written);
1388 1877
1389 do { 1878 do {
1390 bp.b_cur_off = iov_offset; 1879 pos = *ppos;
1391 bp.b_cur_iov = cur_iov;
1392 1880
1393 page = ocfs2_get_write_source(&bp, cur_iov, iov_offset); 1881 user_page = ocfs2_get_write_source(&buf, cur_iov, iov_offset);
1394 if (IS_ERR(page)) { 1882 if (IS_ERR(user_page)) {
1395 ret = PTR_ERR(page); 1883 ret = PTR_ERR(user_page);
1396 goto out; 1884 goto out;
1397 } 1885 }
1398 1886
1399 copied = ocfs2_buffered_write_cluster(file, *ppos, count, 1887 /* Stay within our page boundaries */
1400 ocfs2_map_and_write_user_data, 1888 bytes = min((PAGE_CACHE_SIZE - ((unsigned long)pos & ~PAGE_CACHE_MASK)),
1401 &bp); 1889 (PAGE_CACHE_SIZE - ((unsigned long)buf & ~PAGE_CACHE_MASK)));
1890 /* Stay within the vector boundary */
1891 bytes = min_t(size_t, bytes, cur_iov->iov_len - iov_offset);
1892 /* Stay within count */
1893 bytes = min(bytes, count);
1894
1895 page = NULL;
1896 ret = ocfs2_write_begin(file, file->f_mapping, pos, bytes, 0,
1897 &page, &fsdata);
1898 if (ret) {
1899 mlog_errno(ret);
1900 goto out;
1901 }
1402 1902
1403 ocfs2_put_write_source(&bp, page); 1903 dst = kmap_atomic(page, KM_USER0);
1904 memcpy(dst + (pos & (PAGE_CACHE_SIZE - 1)), buf, bytes);
1905 kunmap_atomic(dst, KM_USER0);
1906 flush_dcache_page(page);
1907 ocfs2_put_write_source(user_page);
1404 1908
1909 copied = ocfs2_write_end(file, file->f_mapping, pos, bytes,
1910 bytes, page, fsdata);
1405 if (copied < 0) { 1911 if (copied < 0) {
1406 mlog_errno(copied); 1912 mlog_errno(copied);
1407 ret = copied; 1913 ret = copied;
@@ -1409,7 +1915,7 @@ static ssize_t ocfs2_file_buffered_write(struct file *file, loff_t *ppos,
1409 } 1915 }
1410 1916
1411 total += copied; 1917 total += copied;
1412 *ppos = *ppos + copied; 1918 *ppos = pos + copied;
1413 count -= copied; 1919 count -= copied;
1414 1920
1415 ocfs2_set_next_iovec(&cur_iov, &iov_offset, copied); 1921 ocfs2_set_next_iovec(&cur_iov, &iov_offset, copied);
@@ -1579,52 +2085,46 @@ static int ocfs2_splice_write_actor(struct pipe_inode_info *pipe,
1579 struct pipe_buffer *buf, 2085 struct pipe_buffer *buf,
1580 struct splice_desc *sd) 2086 struct splice_desc *sd)
1581{ 2087{
1582 int ret, count, total = 0; 2088 int ret, count;
1583 ssize_t copied = 0; 2089 ssize_t copied = 0;
1584 struct ocfs2_splice_write_priv sp; 2090 struct file *file = sd->u.file;
2091 unsigned int offset;
2092 struct page *page = NULL;
2093 void *fsdata;
2094 char *src, *dst;
1585 2095
1586 ret = buf->ops->confirm(pipe, buf); 2096 ret = buf->ops->confirm(pipe, buf);
1587 if (ret) 2097 if (ret)
1588 goto out; 2098 goto out;
1589 2099
1590 sp.s_sd = sd; 2100 offset = sd->pos & ~PAGE_CACHE_MASK;
1591 sp.s_buf = buf;
1592 sp.s_pipe = pipe;
1593 sp.s_offset = sd->pos & ~PAGE_CACHE_MASK;
1594 sp.s_buf_offset = buf->offset;
1595
1596 count = sd->len; 2101 count = sd->len;
1597 if (count + sp.s_offset > PAGE_CACHE_SIZE) 2102 if (count + offset > PAGE_CACHE_SIZE)
1598 count = PAGE_CACHE_SIZE - sp.s_offset; 2103 count = PAGE_CACHE_SIZE - offset;
1599 2104
1600 do { 2105 ret = ocfs2_write_begin(file, file->f_mapping, sd->pos, count, 0,
1601 /* 2106 &page, &fsdata);
1602 * splice wants us to copy up to one page at a 2107 if (ret) {
1603 * time. For pagesize > cluster size, this means we 2108 mlog_errno(ret);
1604 * might enter ocfs2_buffered_write_cluster() more 2109 goto out;
1605 * than once, so keep track of our progress here. 2110 }
1606 */
1607 copied = ocfs2_buffered_write_cluster(sd->u.file,
1608 (loff_t)sd->pos + total,
1609 count,
1610 ocfs2_map_and_write_splice_data,
1611 &sp);
1612 if (copied < 0) {
1613 mlog_errno(copied);
1614 ret = copied;
1615 goto out;
1616 }
1617 2111
1618 count -= copied; 2112 src = buf->ops->map(pipe, buf, 1);
1619 sp.s_offset += copied; 2113 dst = kmap_atomic(page, KM_USER1);
1620 sp.s_buf_offset += copied; 2114 memcpy(dst + offset, src + buf->offset, count);
1621 total += copied; 2115 kunmap_atomic(page, KM_USER1);
1622 } while (count); 2116 buf->ops->unmap(pipe, buf, src);
1623 2117
1624 ret = 0; 2118 copied = ocfs2_write_end(file, file->f_mapping, sd->pos, count, count,
2119 page, fsdata);
2120 if (copied < 0) {
2121 mlog_errno(copied);
2122 ret = copied;
2123 goto out;
2124 }
1625out: 2125out:
1626 2126
1627 return total ? total : ret; 2127 return copied ? copied : ret;
1628} 2128}
1629 2129
1630static ssize_t __ocfs2_file_splice_write(struct pipe_inode_info *pipe, 2130static ssize_t __ocfs2_file_splice_write(struct pipe_inode_info *pipe,
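A note on the fs/ocfs2/file.c changes above. The UNRESVSP hole-punch path splits the work in two: ocfs2_zero_partial_clusters() zeroes the partial clusters at either edge of the request, then ocfs2_remove_inode_range() deallocates only the whole clusters in between. Worked example, assuming a 64KB cluster size: punching [10KB, 200KB) zeroes [10KB, 64KB) and [192KB, 200KB), and frees clusters 1 and 2, i.e. [64KB, 192KB).

The rewritten buffered-write loop also deserves a second look: it clamps each copy three ways before calling ocfs2_write_begin(). A minimal user-space rendering of that computation (the PAGE_SIZE constant and min_sz() helper are ours, standing in for PAGE_CACHE_SIZE and the kernel's min()/min_t()):

#include <stddef.h>
#include <stdint.h>

#define PAGE_SIZE 4096UL	/* stand-in for PAGE_CACHE_SIZE */

static size_t min_sz(size_t a, size_t b)
{
	return a < b ? a : b;
}

/* Never cross the destination page in the file, never cross the page
 * backing the user buffer, and never exceed the current iovec or the
 * remaining count. */
static size_t write_chunk_bytes(uint64_t pos, const void *buf,
				size_t iov_remaining, size_t count)
{
	size_t bytes = min_sz(PAGE_SIZE - (pos & (PAGE_SIZE - 1)),
			      PAGE_SIZE - ((uintptr_t)buf & (PAGE_SIZE - 1)));

	bytes = min_sz(bytes, iov_remaining);
	return min_sz(bytes, count);
}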
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index a4dd1fa1822b..36fe27f268ee 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -39,15 +39,16 @@ enum ocfs2_alloc_restarted {
39}; 39};
40int ocfs2_do_extend_allocation(struct ocfs2_super *osb, 40int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
41 struct inode *inode, 41 struct inode *inode,
42 u32 *cluster_start, 42 u32 *logical_offset,
43 u32 clusters_to_add, 43 u32 clusters_to_add,
44 int mark_unwritten,
44 struct buffer_head *fe_bh, 45 struct buffer_head *fe_bh,
45 handle_t *handle, 46 handle_t *handle,
46 struct ocfs2_alloc_context *data_ac, 47 struct ocfs2_alloc_context *data_ac,
47 struct ocfs2_alloc_context *meta_ac, 48 struct ocfs2_alloc_context *meta_ac,
48 enum ocfs2_alloc_restarted *reason); 49 enum ocfs2_alloc_restarted *reason_ret);
49int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di, 50int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
50 u32 clusters_to_add, 51 u32 clusters_to_add, u32 extents_to_split,
51 struct ocfs2_alloc_context **data_ac, 52 struct ocfs2_alloc_context **data_ac,
52 struct ocfs2_alloc_context **meta_ac); 53 struct ocfs2_alloc_context **meta_ac);
53int ocfs2_setattr(struct dentry *dentry, struct iattr *attr); 54int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
@@ -61,4 +62,7 @@ int ocfs2_should_update_atime(struct inode *inode,
61int ocfs2_update_inode_atime(struct inode *inode, 62int ocfs2_update_inode_atime(struct inode *inode,
62 struct buffer_head *bh); 63 struct buffer_head *bh);
63 64
65int ocfs2_change_file_space(struct file *file, unsigned int cmd,
66 struct ocfs2_space_resv *sr);
67
64#endif /* OCFS2_FILE_H */ 68#endif /* OCFS2_FILE_H */
diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c
index b25ef63781ba..352eb4a13f98 100644
--- a/fs/ocfs2/heartbeat.c
+++ b/fs/ocfs2/heartbeat.c
@@ -157,16 +157,16 @@ int ocfs2_register_hb_callbacks(struct ocfs2_super *osb)
157 if (ocfs2_mount_local(osb)) 157 if (ocfs2_mount_local(osb))
158 return 0; 158 return 0;
159 159
160 status = o2hb_register_callback(&osb->osb_hb_down); 160 status = o2hb_register_callback(osb->uuid_str, &osb->osb_hb_down);
161 if (status < 0) { 161 if (status < 0) {
162 mlog_errno(status); 162 mlog_errno(status);
163 goto bail; 163 goto bail;
164 } 164 }
165 165
166 status = o2hb_register_callback(&osb->osb_hb_up); 166 status = o2hb_register_callback(osb->uuid_str, &osb->osb_hb_up);
167 if (status < 0) { 167 if (status < 0) {
168 mlog_errno(status); 168 mlog_errno(status);
169 o2hb_unregister_callback(&osb->osb_hb_down); 169 o2hb_unregister_callback(osb->uuid_str, &osb->osb_hb_down);
170 } 170 }
171 171
172bail: 172bail:
@@ -178,8 +178,8 @@ void ocfs2_clear_hb_callbacks(struct ocfs2_super *osb)
178 if (ocfs2_mount_local(osb)) 178 if (ocfs2_mount_local(osb))
179 return; 179 return;
180 180
181 o2hb_unregister_callback(&osb->osb_hb_down); 181 o2hb_unregister_callback(osb->uuid_str, &osb->osb_hb_down);
182 o2hb_unregister_callback(&osb->osb_hb_up); 182 o2hb_unregister_callback(osb->uuid_str, &osb->osb_hb_up);
183} 183}
184 184
185void ocfs2_stop_heartbeat(struct ocfs2_super *osb) 185void ocfs2_stop_heartbeat(struct ocfs2_super *osb)
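The heartbeat change is mechanical but worth spelling out: o2hb_register_callback() and o2hb_unregister_callback() now take the mount's uuid_str, scoping each callback to that heartbeat region rather than the global heartbeat, which is why the error path and ocfs2_clear_hb_callbacks() must name the region when unregistering too.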
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index f3ad21ad9aed..bd68c3f2afbe 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -14,6 +14,7 @@
14#include "ocfs2.h" 14#include "ocfs2.h"
15#include "alloc.h" 15#include "alloc.h"
16#include "dlmglue.h" 16#include "dlmglue.h"
17#include "file.h"
17#include "inode.h" 18#include "inode.h"
18#include "journal.h" 19#include "journal.h"
19 20
@@ -115,6 +116,7 @@ int ocfs2_ioctl(struct inode * inode, struct file * filp,
115{ 116{
116 unsigned int flags; 117 unsigned int flags;
117 int status; 118 int status;
119 struct ocfs2_space_resv sr;
118 120
119 switch (cmd) { 121 switch (cmd) {
120 case OCFS2_IOC_GETFLAGS: 122 case OCFS2_IOC_GETFLAGS:
@@ -130,6 +132,14 @@ int ocfs2_ioctl(struct inode * inode, struct file * filp,
130 132
131 return ocfs2_set_inode_attr(inode, flags, 133 return ocfs2_set_inode_attr(inode, flags,
132 OCFS2_FL_MODIFIABLE); 134 OCFS2_FL_MODIFIABLE);
135 case OCFS2_IOC_RESVSP:
136 case OCFS2_IOC_RESVSP64:
137 case OCFS2_IOC_UNRESVSP:
138 case OCFS2_IOC_UNRESVSP64:
139 if (copy_from_user(&sr, (int __user *) arg, sizeof(sr)))
140 return -EFAULT;
141
142 return ocfs2_change_file_space(filp, cmd, &sr);
133 default: 143 default:
134 return -ENOTTY; 144 return -ENOTTY;
135 } 145 }
@@ -148,6 +158,11 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
148 case OCFS2_IOC32_SETFLAGS: 158 case OCFS2_IOC32_SETFLAGS:
149 cmd = OCFS2_IOC_SETFLAGS; 159 cmd = OCFS2_IOC_SETFLAGS;
150 break; 160 break;
161 case OCFS2_IOC_RESVSP:
162 case OCFS2_IOC_RESVSP64:
163 case OCFS2_IOC_UNRESVSP:
164 case OCFS2_IOC_UNRESVSP64:
165 break;
151 default: 166 default:
152 return -ENOIOCTLCMD; 167 return -ENOIOCTLCMD;
153 } 168 }
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index dc1188081720..dbfb20bb27ea 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -722,8 +722,7 @@ void ocfs2_complete_recovery(struct work_struct *work)
722 container_of(work, struct ocfs2_journal, j_recovery_work); 722 container_of(work, struct ocfs2_journal, j_recovery_work);
723 struct ocfs2_super *osb = journal->j_osb; 723 struct ocfs2_super *osb = journal->j_osb;
724 struct ocfs2_dinode *la_dinode, *tl_dinode; 724 struct ocfs2_dinode *la_dinode, *tl_dinode;
725 struct ocfs2_la_recovery_item *item; 725 struct ocfs2_la_recovery_item *item, *n;
726 struct list_head *p, *n;
727 LIST_HEAD(tmp_la_list); 726 LIST_HEAD(tmp_la_list);
728 727
729 mlog_entry_void(); 728 mlog_entry_void();
@@ -734,8 +733,7 @@ void ocfs2_complete_recovery(struct work_struct *work)
734 list_splice_init(&journal->j_la_cleanups, &tmp_la_list); 733 list_splice_init(&journal->j_la_cleanups, &tmp_la_list);
735 spin_unlock(&journal->j_lock); 734 spin_unlock(&journal->j_lock);
736 735
737 list_for_each_safe(p, n, &tmp_la_list) { 736 list_for_each_entry_safe(item, n, &tmp_la_list, lri_list) {
738 item = list_entry(p, struct ocfs2_la_recovery_item, lri_list);
739 list_del_init(&item->lri_list); 737 list_del_init(&item->lri_list);
740 738
741 mlog(0, "Complete recovery for slot %d\n", item->lri_slot); 739 mlog(0, "Complete recovery for slot %d\n", item->lri_slot);
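The journal.c hunk is a straight conversion from list_for_each_safe() plus a manual list_entry() to list_for_each_entry_safe(). A minimal sketch of the idiom (the struct body and the kfree() are illustrative):

#include <linux/list.h>
#include <linux/slab.h>

struct la_item {
	struct list_head lri_list;
	int lri_slot;
};

/* The _safe variant caches the next entry in 'n', so the current
 * entry may be unlinked and freed while the walk continues. */
static void drain_list(struct list_head *head)
{
	struct la_item *item, *n;

	list_for_each_entry_safe(item, n, head, lri_list) {
		list_del_init(&item->lri_list);
		kfree(item);
	}
}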
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 3db5de4506da..ce60aab013aa 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -289,6 +289,8 @@ int ocfs2_journal_dirty_data(handle_t *handle,
289#define OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC (OCFS2_SUBALLOC_FREE \ 289#define OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC (OCFS2_SUBALLOC_FREE \
290 + OCFS2_TRUNCATE_LOG_UPDATE) 290 + OCFS2_TRUNCATE_LOG_UPDATE)
291 291
292#define OCFS2_REMOVE_EXTENT_CREDITS (OCFS2_TRUNCATE_LOG_UPDATE + OCFS2_INODE_UPDATE_CREDITS)
293
292/* data block for new dir/symlink, 2 for bitmap updates (bitmap fe + 294/* data block for new dir/symlink, 2 for bitmap updates (bitmap fe +
293 * bitmap block for the new bit) */ 295 * bitmap block for the new bit) */
294#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2) 296#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2)
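The new OCFS2_REMOVE_EXTENT_CREDITS value follows from what __ocfs2_remove_inode_range() dirties in a single transaction: the truncate log when the freed extent is queued (OCFS2_TRUNCATE_LOG_UPDATE) plus the inode's dinode block for the i_clusters update (OCFS2_INODE_UPDATE_CREDITS).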
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index af01158b39f5..d79aa12137d2 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -37,11 +37,29 @@
37 37
38#include "ocfs2.h" 38#include "ocfs2.h"
39 39
40#include "aops.h"
40#include "dlmglue.h" 41#include "dlmglue.h"
41#include "file.h" 42#include "file.h"
42#include "inode.h" 43#include "inode.h"
43#include "mmap.h" 44#include "mmap.h"
44 45
46static inline int ocfs2_vm_op_block_sigs(sigset_t *blocked, sigset_t *oldset)
47{
48 /* The best way to deal with signals in the vm path is
49 * to block them upfront, rather than allowing the
50 * locking paths to return -ERESTARTSYS. */
51 sigfillset(blocked);
52
53 /* We should technically never get a bad return value
54 * from sigprocmask */
55 return sigprocmask(SIG_BLOCK, blocked, oldset);
56}
57
58static inline int ocfs2_vm_op_unblock_sigs(sigset_t *oldset)
59{
60 return sigprocmask(SIG_SETMASK, oldset, NULL);
61}
62
45static struct page *ocfs2_nopage(struct vm_area_struct * area, 63static struct page *ocfs2_nopage(struct vm_area_struct * area,
46 unsigned long address, 64 unsigned long address,
47 int *type) 65 int *type)
@@ -53,14 +71,7 @@ static struct page *ocfs2_nopage(struct vm_area_struct * area,
53 mlog_entry("(area=%p, address=%lu, type=%p)\n", area, address, 71 mlog_entry("(area=%p, address=%lu, type=%p)\n", area, address,
54 type); 72 type);
55 73
56 /* The best way to deal with signals in this path is 74 ret = ocfs2_vm_op_block_sigs(&blocked, &oldset);
57 * to block them upfront, rather than allowing the
58 * locking paths to return -ERESTARTSYS. */
59 sigfillset(&blocked);
60
61 /* We should technically never get a bad ret return
62 * from sigprocmask */
63 ret = sigprocmask(SIG_BLOCK, &blocked, &oldset);
64 if (ret < 0) { 75 if (ret < 0) {
65 mlog_errno(ret); 76 mlog_errno(ret);
66 goto out; 77 goto out;
@@ -68,7 +79,7 @@ static struct page *ocfs2_nopage(struct vm_area_struct * area,
68 79
69 page = filemap_nopage(area, address, type); 80 page = filemap_nopage(area, address, type);
70 81
71 ret = sigprocmask(SIG_SETMASK, &oldset, NULL); 82 ret = ocfs2_vm_op_unblock_sigs(&oldset);
72 if (ret < 0) 83 if (ret < 0)
73 mlog_errno(ret); 84 mlog_errno(ret);
74out: 85out:
@@ -76,28 +87,136 @@ out:
76 return page; 87 return page;
77} 88}
78 89
79static struct vm_operations_struct ocfs2_file_vm_ops = { 90static int __ocfs2_page_mkwrite(struct inode *inode, struct buffer_head *di_bh,
80 .nopage = ocfs2_nopage, 91 struct page *page)
81}; 92{
93 int ret;
94 struct address_space *mapping = inode->i_mapping;
95 loff_t pos = page->index << PAGE_CACHE_SHIFT;
96 unsigned int len = PAGE_CACHE_SIZE;
97 pgoff_t last_index;
98 struct page *locked_page = NULL;
99 void *fsdata;
100 loff_t size = i_size_read(inode);
82 101
83int ocfs2_mmap(struct file *file, struct vm_area_struct *vma) 102 /*
103 * Another node might have truncated while we were waiting on
104 * cluster locks.
105 */
106 last_index = size >> PAGE_CACHE_SHIFT;
107 if (page->index > last_index) {
108 ret = -EINVAL;
109 goto out;
110 }
111
112 /*
113 * The i_size check above doesn't catch the case where nodes
114 * truncated and then re-extended the file. We'll re-check the
115 * page mapping after taking the page lock inside of
116 * ocfs2_write_begin_nolock().
117 */
118 if (!PageUptodate(page) || page->mapping != inode->i_mapping) {
119 ret = -EINVAL;
120 goto out;
121 }
122
123 /*
124 * Call ocfs2_write_begin() and ocfs2_write_end() to take
125 * advantage of the allocation code there. We pass a write
126 * length of the whole page (chopped to i_size) to make sure
127 * the whole thing is allocated.
128 *
129 * Since we know the page is up to date, we don't have to
130 * worry about ocfs2_write_begin() skipping some buffer reads
131 * because the "write" would invalidate their data.
132 */
133 if (page->index == last_index)
134 len = size & ~PAGE_CACHE_MASK;
135
136 ret = ocfs2_write_begin_nolock(mapping, pos, len, 0, &locked_page,
137 &fsdata, di_bh, page);
138 if (ret) {
139 if (ret != -ENOSPC)
140 mlog_errno(ret);
141 goto out;
142 }
143
144 ret = ocfs2_write_end_nolock(mapping, pos, len, len, locked_page,
145 fsdata);
146 if (ret < 0) {
147 mlog_errno(ret);
148 goto out;
149 }
150 BUG_ON(ret != len);
151 ret = 0;
152out:
153 return ret;
154}
155
156static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
84{ 157{
85 int ret = 0, lock_level = 0; 158 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
86 struct ocfs2_super *osb = OCFS2_SB(file->f_dentry->d_inode->i_sb); 159 struct buffer_head *di_bh = NULL;
160 sigset_t blocked, oldset;
161 int ret, ret2;
162
163 ret = ocfs2_vm_op_block_sigs(&blocked, &oldset);
164 if (ret < 0) {
165 mlog_errno(ret);
166 return ret;
167 }
168
169 /*
170 * The cluster locks taken will block a truncate from another
171 * node. Taking the data lock will also ensure that we don't
172 * attempt page truncation as part of a downconvert.
173 */
174 ret = ocfs2_meta_lock(inode, &di_bh, 1);
175 if (ret < 0) {
176 mlog_errno(ret);
177 goto out;
178 }
87 179
88 /* 180 /*
89 * Only support shared writeable mmap for local mounts which 181 * The alloc sem should be enough to serialize with
90 * don't know about holes. 182 * ocfs2_truncate_file() changing i_size as well as any thread
183 * modifying the inode btree.
91 */ 184 */
92 if ((!ocfs2_mount_local(osb) || ocfs2_sparse_alloc(osb)) && 185 down_write(&OCFS2_I(inode)->ip_alloc_sem);
93 ((vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_MAYSHARE)) && 186
94 ((vma->vm_flags & VM_WRITE) || (vma->vm_flags & VM_MAYWRITE))) { 187 ret = ocfs2_data_lock(inode, 1);
95 mlog(0, "disallow shared writable mmaps %lx\n", vma->vm_flags); 188 if (ret < 0) {
96 /* This is -EINVAL because generic_file_readonly_mmap 189 mlog_errno(ret);
97 * returns it in a similar situation. */ 190 goto out_meta_unlock;
98 return -EINVAL;
99 } 191 }
100 192
193 ret = __ocfs2_page_mkwrite(inode, di_bh, page);
194
195 ocfs2_data_unlock(inode, 1);
196
197out_meta_unlock:
198 up_write(&OCFS2_I(inode)->ip_alloc_sem);
199
200 brelse(di_bh);
201 ocfs2_meta_unlock(inode, 1);
202
203out:
204 ret2 = ocfs2_vm_op_unblock_sigs(&oldset);
205 if (ret2 < 0)
206 mlog_errno(ret2);
207
208 return ret;
209}
210
211static struct vm_operations_struct ocfs2_file_vm_ops = {
212 .nopage = ocfs2_nopage,
213 .page_mkwrite = ocfs2_page_mkwrite,
214};
215
216int ocfs2_mmap(struct file *file, struct vm_area_struct *vma)
217{
218 int ret = 0, lock_level = 0;
219
101 ret = ocfs2_meta_lock_atime(file->f_dentry->d_inode, 220 ret = ocfs2_meta_lock_atime(file->f_dentry->d_inode,
102 file->f_vfsmnt, &lock_level); 221 file->f_vfsmnt, &lock_level);
103 if (ret < 0) { 222 if (ret < 0) {
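->page_mkwrite is driven entirely by write faults on shared mappings, which is also why ocfs2_mmap() can drop its old rejection of shared writable mappings: allocation and cluster locking now happen at fault time. A user-space sketch of what reaches ocfs2_page_mkwrite() (the file path is illustrative):

#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/mnt/ocfs2/file", O_RDWR);
	char *p;

	if (fd < 0)
		return 1;
	p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED)
		return 1;
	p[0] = 'x';		/* first store faults -> ocfs2_page_mkwrite() */
	msync(p, 4096, MS_SYNC);
	munmap(p, 4096);
	return close(fd);
}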
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 36289e6295ce..d430fdab16e9 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -1674,7 +1674,7 @@ static int ocfs2_symlink(struct inode *dir,
1674 u32 offset = 0; 1674 u32 offset = 0;
1675 1675
1676 inode->i_op = &ocfs2_symlink_inode_operations; 1676 inode->i_op = &ocfs2_symlink_inode_operations;
1677 status = ocfs2_do_extend_allocation(osb, inode, &offset, 1, 1677 status = ocfs2_do_extend_allocation(osb, inode, &offset, 1, 0,
1678 new_fe_bh, 1678 new_fe_bh,
1679 handle, data_ac, NULL, 1679 handle, data_ac, NULL,
1680 NULL); 1680 NULL);
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index a860633e833f..5cc90a40b3c5 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -219,6 +219,7 @@ struct ocfs2_super
219 u16 max_slots; 219 u16 max_slots;
220 s16 node_num; 220 s16 node_num;
221 s16 slot_num; 221 s16 slot_num;
222 s16 preferred_slot;
222 int s_sectsize_bits; 223 int s_sectsize_bits;
223 int s_clustersize; 224 int s_clustersize;
224 int s_clustersize_bits; 225 int s_clustersize_bits;
@@ -305,6 +306,19 @@ static inline int ocfs2_sparse_alloc(struct ocfs2_super *osb)
305 return 0; 306 return 0;
306} 307}
307 308
309static inline int ocfs2_writes_unwritten_extents(struct ocfs2_super *osb)
310{
311 /*
312 * Support for sparse files is a pre-requisite
313 */
314 if (!ocfs2_sparse_alloc(osb))
315 return 0;
316
317 if (osb->s_feature_ro_compat & OCFS2_FEATURE_RO_COMPAT_UNWRITTEN)
318 return 1;
319 return 0;
320}
321
308/* set / clear functions because cluster events can make these happen 322/* set / clear functions because cluster events can make these happen
309 * in parallel so we want the transitions to be atomic. this also 323 * in parallel so we want the transitions to be atomic. this also
310 * means that any future flags osb_flags must be protected by spinlock 324 * means that any future flags osb_flags must be protected by spinlock
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index f0d9eb08547a..82f8a75b207e 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -88,7 +88,7 @@
88#define OCFS2_FEATURE_COMPAT_SUPP OCFS2_FEATURE_COMPAT_BACKUP_SB 88#define OCFS2_FEATURE_COMPAT_SUPP OCFS2_FEATURE_COMPAT_BACKUP_SB
89#define OCFS2_FEATURE_INCOMPAT_SUPP (OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT \ 89#define OCFS2_FEATURE_INCOMPAT_SUPP (OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT \
90 | OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC) 90 | OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC)
91#define OCFS2_FEATURE_RO_COMPAT_SUPP 0 91#define OCFS2_FEATURE_RO_COMPAT_SUPP OCFS2_FEATURE_RO_COMPAT_UNWRITTEN
92 92
93/* 93/*
94 * Heartbeat-only devices are missing journals and other files. The 94 * Heartbeat-only devices are missing journals and other files. The
@@ -116,6 +116,11 @@
116 */ 116 */
117#define OCFS2_FEATURE_COMPAT_BACKUP_SB 0x0001 117#define OCFS2_FEATURE_COMPAT_BACKUP_SB 0x0001
118 118
119/*
120 * Unwritten extents support.
121 */
122#define OCFS2_FEATURE_RO_COMPAT_UNWRITTEN 0x0001
123
119/* The byte offset of the first backup block will be 1G. 124/* The byte offset of the first backup block will be 1G.
120 * The following will be 4G, 16G, 64G, 256G and 1T. 125 * The following will be 4G, 16G, 64G, 256G and 1T.
121 */ 126 */
@@ -170,6 +175,32 @@
170#define OCFS2_IOC32_SETFLAGS _IOW('f', 2, int) 175#define OCFS2_IOC32_SETFLAGS _IOW('f', 2, int)
171 176
172/* 177/*
178 * Space reservation / allocation / free ioctls and argument structure
179 * are designed to be compatible with XFS.
180 *
181 * ALLOCSP* and FREESP* are not and will never be supported, but are
182 * included here for completeness.
183 */
184struct ocfs2_space_resv {
185 __s16 l_type;
186 __s16 l_whence;
187 __s64 l_start;
188 __s64 l_len; /* len == 0 means until end of file */
189 __s32 l_sysid;
190 __u32 l_pid;
191 __s32 l_pad[4]; /* reserve area */
192};
193
194#define OCFS2_IOC_ALLOCSP _IOW ('X', 10, struct ocfs2_space_resv)
195#define OCFS2_IOC_FREESP _IOW ('X', 11, struct ocfs2_space_resv)
196#define OCFS2_IOC_RESVSP _IOW ('X', 40, struct ocfs2_space_resv)
197#define OCFS2_IOC_UNRESVSP _IOW ('X', 41, struct ocfs2_space_resv)
198#define OCFS2_IOC_ALLOCSP64 _IOW ('X', 36, struct ocfs2_space_resv)
199#define OCFS2_IOC_FREESP64 _IOW ('X', 37, struct ocfs2_space_resv)
200#define OCFS2_IOC_RESVSP64 _IOW ('X', 42, struct ocfs2_space_resv)
201#define OCFS2_IOC_UNRESVSP64 _IOW ('X', 43, struct ocfs2_space_resv)
202
203/*
173 * Journal Flags (ocfs2_dinode.id1.journal1.i_flags) 204 * Journal Flags (ocfs2_dinode.id1.journal1.i_flags)
174 */ 205 */
175#define OCFS2_JOURNAL_DIRTY_FL (0x00000001) /* Journal needs recovery */ 206#define OCFS2_JOURNAL_DIRTY_FL (0x00000001) /* Journal needs recovery */
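A user-space sketch of the two supported ioctl pairs, using the struct ocfs2_space_resv layout defined above (the file path and the header's install location are assumptions):

#include <fcntl.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <ocfs2/ocfs2_fs.h>	/* assumed location of the OCFS2_IOC_* defines */

int main(void)
{
	struct ocfs2_space_resv sr;
	int fd = open("/mnt/ocfs2/file", O_RDWR);

	if (fd < 0)
		return 1;

	memset(&sr, 0, sizeof(sr));
	sr.l_whence = 0;		/* SEEK_SET */
	sr.l_start = 0;
	sr.l_len = 1024 * 1024;		/* reserve 1MB as unwritten extents */
	if (ioctl(fd, OCFS2_IOC_RESVSP64, &sr) < 0)
		return 1;

	sr.l_start = 512 * 1024;	/* punch 64KB out of the middle */
	sr.l_len = 64 * 1024;
	if (ioctl(fd, OCFS2_IOC_UNRESVSP64, &sr) < 0)
		return 1;

	return close(fd);
}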
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index d8b79067dc14..af4882b62cfa 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -121,17 +121,25 @@ static s16 __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
121 return ret; 121 return ret;
122} 122}
123 123
124static s16 __ocfs2_find_empty_slot(struct ocfs2_slot_info *si) 124static s16 __ocfs2_find_empty_slot(struct ocfs2_slot_info *si, s16 preferred)
125{ 125{
126 int i; 126 int i;
127 s16 ret = OCFS2_INVALID_SLOT; 127 s16 ret = OCFS2_INVALID_SLOT;
128 128
129 if (preferred >= 0 && preferred < si->si_num_slots) {
130 if (OCFS2_INVALID_SLOT == si->si_global_node_nums[preferred]) {
131 ret = preferred;
132 goto out;
133 }
134 }
135
129 for(i = 0; i < si->si_num_slots; i++) { 136 for(i = 0; i < si->si_num_slots; i++) {
130 if (OCFS2_INVALID_SLOT == si->si_global_node_nums[i]) { 137 if (OCFS2_INVALID_SLOT == si->si_global_node_nums[i]) {
131 ret = (s16) i; 138 ret = (s16) i;
132 break; 139 break;
133 } 140 }
134 } 141 }
142out:
135 return ret; 143 return ret;
136} 144}
137 145
@@ -248,7 +256,7 @@ int ocfs2_find_slot(struct ocfs2_super *osb)
248 if (slot == OCFS2_INVALID_SLOT) { 256 if (slot == OCFS2_INVALID_SLOT) {
249 /* if no slot yet, then just take 1st available 257 /* if no slot yet, then just take 1st available
250 * one. */ 258 * one. */
251 slot = __ocfs2_find_empty_slot(si); 259 slot = __ocfs2_find_empty_slot(si, osb->preferred_slot);
252 if (slot == OCFS2_INVALID_SLOT) { 260 if (slot == OCFS2_INVALID_SLOT) {
253 spin_unlock(&si->si_lock); 261 spin_unlock(&si->si_lock);
254 mlog(ML_ERROR, "no free slots available!\n"); 262 mlog(ML_ERROR, "no free slots available!\n");
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index e3437626d183..d9c5c9fcb30f 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -98,14 +98,6 @@ static int ocfs2_relink_block_group(handle_t *handle,
98 u16 chain); 98 u16 chain);
99static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg, 99static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
100 u32 wanted); 100 u32 wanted);
101static int ocfs2_free_suballoc_bits(handle_t *handle,
102 struct inode *alloc_inode,
103 struct buffer_head *alloc_bh,
104 unsigned int start_bit,
105 u64 bg_blkno,
106 unsigned int count);
107static inline u64 ocfs2_which_suballoc_group(u64 block,
108 unsigned int bit);
109static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode, 101static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
110 u64 bg_blkno, 102 u64 bg_blkno,
111 u16 bg_bit_off); 103 u16 bg_bit_off);
@@ -496,13 +488,7 @@ int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
496 488
497 (*ac)->ac_bits_wanted = ocfs2_extend_meta_needed(fe); 489 (*ac)->ac_bits_wanted = ocfs2_extend_meta_needed(fe);
498 (*ac)->ac_which = OCFS2_AC_USE_META; 490 (*ac)->ac_which = OCFS2_AC_USE_META;
499
500#ifndef OCFS2_USE_ALL_METADATA_SUBALLOCATORS
501 slot = 0;
502#else
503 slot = osb->slot_num; 491 slot = osb->slot_num;
504#endif
505
506 (*ac)->ac_group_search = ocfs2_block_group_search; 492 (*ac)->ac_group_search = ocfs2_block_group_search;
507 493
508 status = ocfs2_reserve_suballoc_bits(osb, (*ac), 494 status = ocfs2_reserve_suballoc_bits(osb, (*ac),
@@ -1626,12 +1612,12 @@ bail:
1626/* 1612/*
1627 * expects the suballoc inode to already be locked. 1613 * expects the suballoc inode to already be locked.
1628 */ 1614 */
1629static int ocfs2_free_suballoc_bits(handle_t *handle, 1615int ocfs2_free_suballoc_bits(handle_t *handle,
1630 struct inode *alloc_inode, 1616 struct inode *alloc_inode,
1631 struct buffer_head *alloc_bh, 1617 struct buffer_head *alloc_bh,
1632 unsigned int start_bit, 1618 unsigned int start_bit,
1633 u64 bg_blkno, 1619 u64 bg_blkno,
1634 unsigned int count) 1620 unsigned int count)
1635{ 1621{
1636 int status = 0; 1622 int status = 0;
1637 u32 tmp_used; 1623 u32 tmp_used;
@@ -1703,13 +1689,6 @@ bail:
1703 return status; 1689 return status;
1704} 1690}
1705 1691
1706static inline u64 ocfs2_which_suballoc_group(u64 block, unsigned int bit)
1707{
1708 u64 group = block - (u64) bit;
1709
1710 return group;
1711}
1712
1713int ocfs2_free_dinode(handle_t *handle, 1692int ocfs2_free_dinode(handle_t *handle,
1714 struct inode *inode_alloc_inode, 1693 struct inode *inode_alloc_inode,
1715 struct buffer_head *inode_alloc_bh, 1694 struct buffer_head *inode_alloc_bh,
@@ -1723,19 +1702,6 @@ int ocfs2_free_dinode(handle_t *handle,
1723 inode_alloc_bh, bit, bg_blkno, 1); 1702 inode_alloc_bh, bit, bg_blkno, 1);
1724} 1703}
1725 1704
1726int ocfs2_free_extent_block(handle_t *handle,
1727 struct inode *eb_alloc_inode,
1728 struct buffer_head *eb_alloc_bh,
1729 struct ocfs2_extent_block *eb)
1730{
1731 u64 blk = le64_to_cpu(eb->h_blkno);
1732 u16 bit = le16_to_cpu(eb->h_suballoc_bit);
1733 u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
1734
1735 return ocfs2_free_suballoc_bits(handle, eb_alloc_inode, eb_alloc_bh,
1736 bit, bg_blkno, 1);
1737}
1738
1739int ocfs2_free_clusters(handle_t *handle, 1705int ocfs2_free_clusters(handle_t *handle,
1740 struct inode *bitmap_inode, 1706 struct inode *bitmap_inode,
1741 struct buffer_head *bitmap_bh, 1707 struct buffer_head *bitmap_bh,
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index 1a3c94cb9250..f212dc01a84b 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -86,20 +86,29 @@ int ocfs2_claim_clusters(struct ocfs2_super *osb,
86 u32 *cluster_start, 86 u32 *cluster_start,
87 u32 *num_clusters); 87 u32 *num_clusters);
88 88
89int ocfs2_free_suballoc_bits(handle_t *handle,
90 struct inode *alloc_inode,
91 struct buffer_head *alloc_bh,
92 unsigned int start_bit,
93 u64 bg_blkno,
94 unsigned int count);
89int ocfs2_free_dinode(handle_t *handle, 95int ocfs2_free_dinode(handle_t *handle,
90 struct inode *inode_alloc_inode, 96 struct inode *inode_alloc_inode,
91 struct buffer_head *inode_alloc_bh, 97 struct buffer_head *inode_alloc_bh,
92 struct ocfs2_dinode *di); 98 struct ocfs2_dinode *di);
93int ocfs2_free_extent_block(handle_t *handle,
94 struct inode *eb_alloc_inode,
95 struct buffer_head *eb_alloc_bh,
96 struct ocfs2_extent_block *eb);
97int ocfs2_free_clusters(handle_t *handle, 99int ocfs2_free_clusters(handle_t *handle,
98 struct inode *bitmap_inode, 100 struct inode *bitmap_inode,
99 struct buffer_head *bitmap_bh, 101 struct buffer_head *bitmap_bh,
100 u64 start_blk, 102 u64 start_blk,
101 unsigned int num_clusters); 103 unsigned int num_clusters);
102 104
105static inline u64 ocfs2_which_suballoc_group(u64 block, unsigned int bit)
106{
107 u64 group = block - (u64) bit;
108
109 return group;
110}
111
103static inline u32 ocfs2_cluster_from_desc(struct ocfs2_super *osb, 112static inline u32 ocfs2_cluster_from_desc(struct ocfs2_super *osb,
104 u64 bg_blkno) 113 u64 bg_blkno)
105{ 114{
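Moving ocfs2_which_suballoc_group() into the header works because group descriptors sit at the base of each allocation group: a block at block number N allocated from suballoc bit B belongs to the group whose descriptor is at N - B. For example, an extent block at block 5000 with h_suballoc_bit 120 maps to the group descriptor at block 4880.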
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 86b559c7dce9..3a5a1ed09ac9 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -82,7 +82,8 @@ MODULE_AUTHOR("Oracle");
82MODULE_LICENSE("GPL"); 82MODULE_LICENSE("GPL");
83 83
84static int ocfs2_parse_options(struct super_block *sb, char *options, 84static int ocfs2_parse_options(struct super_block *sb, char *options,
85 unsigned long *mount_opt, int is_remount); 85 unsigned long *mount_opt, s16 *slot,
86 int is_remount);
86static void ocfs2_put_super(struct super_block *sb); 87static void ocfs2_put_super(struct super_block *sb);
87static int ocfs2_mount_volume(struct super_block *sb); 88static int ocfs2_mount_volume(struct super_block *sb);
88static int ocfs2_remount(struct super_block *sb, int *flags, char *data); 89static int ocfs2_remount(struct super_block *sb, int *flags, char *data);
@@ -114,8 +115,6 @@ static void ocfs2_write_super(struct super_block *sb);
114static struct inode *ocfs2_alloc_inode(struct super_block *sb); 115static struct inode *ocfs2_alloc_inode(struct super_block *sb);
115static void ocfs2_destroy_inode(struct inode *inode); 116static void ocfs2_destroy_inode(struct inode *inode);
116 117
117static unsigned long long ocfs2_max_file_offset(unsigned int blockshift);
118
119static const struct super_operations ocfs2_sops = { 118static const struct super_operations ocfs2_sops = {
120 .statfs = ocfs2_statfs, 119 .statfs = ocfs2_statfs,
121 .alloc_inode = ocfs2_alloc_inode, 120 .alloc_inode = ocfs2_alloc_inode,
@@ -140,6 +139,7 @@ enum {
140 Opt_data_ordered, 139 Opt_data_ordered,
141 Opt_data_writeback, 140 Opt_data_writeback,
142 Opt_atime_quantum, 141 Opt_atime_quantum,
142 Opt_slot,
143 Opt_err, 143 Opt_err,
144}; 144};
145 145
@@ -154,6 +154,7 @@ static match_table_t tokens = {
154 {Opt_data_ordered, "data=ordered"}, 154 {Opt_data_ordered, "data=ordered"},
155 {Opt_data_writeback, "data=writeback"}, 155 {Opt_data_writeback, "data=writeback"},
156 {Opt_atime_quantum, "atime_quantum=%u"}, 156 {Opt_atime_quantum, "atime_quantum=%u"},
157 {Opt_slot, "preferred_slot=%u"},
157 {Opt_err, NULL} 158 {Opt_err, NULL}
158}; 159};
159 160
@@ -318,7 +319,7 @@ static void ocfs2_destroy_inode(struct inode *inode)
318/* From xfs_super.c:xfs_max_file_offset 319/* From xfs_super.c:xfs_max_file_offset
319 * Copyright (c) 2000-2004 Silicon Graphics, Inc. 320 * Copyright (c) 2000-2004 Silicon Graphics, Inc.
320 */ 321 */
321static unsigned long long ocfs2_max_file_offset(unsigned int blockshift) 322unsigned long long ocfs2_max_file_offset(unsigned int blockshift)
322{ 323{
323 unsigned int pagefactor = 1; 324 unsigned int pagefactor = 1;
324 unsigned int bitshift = BITS_PER_LONG - 1; 325 unsigned int bitshift = BITS_PER_LONG - 1;
@@ -355,9 +356,10 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
355 int incompat_features; 356 int incompat_features;
356 int ret = 0; 357 int ret = 0;
357 unsigned long parsed_options; 358 unsigned long parsed_options;
359 s16 slot;
358 struct ocfs2_super *osb = OCFS2_SB(sb); 360 struct ocfs2_super *osb = OCFS2_SB(sb);
359 361
360 if (!ocfs2_parse_options(sb, data, &parsed_options, 1)) { 362 if (!ocfs2_parse_options(sb, data, &parsed_options, &slot, 1)) {
361 ret = -EINVAL; 363 ret = -EINVAL;
362 goto out; 364 goto out;
363 } 365 }
@@ -534,6 +536,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
534 struct dentry *root; 536 struct dentry *root;
535 int status, sector_size; 537 int status, sector_size;
536 unsigned long parsed_opt; 538 unsigned long parsed_opt;
539 s16 slot;
537 struct inode *inode = NULL; 540 struct inode *inode = NULL;
538 struct ocfs2_super *osb = NULL; 541 struct ocfs2_super *osb = NULL;
539 struct buffer_head *bh = NULL; 542 struct buffer_head *bh = NULL;
@@ -541,7 +544,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
541 544
542 mlog_entry("%p, %p, %i", sb, data, silent); 545 mlog_entry("%p, %p, %i", sb, data, silent);
543 546
544 if (!ocfs2_parse_options(sb, data, &parsed_opt, 0)) { 547 if (!ocfs2_parse_options(sb, data, &parsed_opt, &slot, 0)) {
545 status = -EINVAL; 548 status = -EINVAL;
546 goto read_super_error; 549 goto read_super_error;
547 } 550 }
@@ -571,6 +574,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
571 brelse(bh); 574 brelse(bh);
572 bh = NULL; 575 bh = NULL;
573 osb->s_mount_opt = parsed_opt; 576 osb->s_mount_opt = parsed_opt;
577 osb->preferred_slot = slot;
574 578
575 sb->s_magic = OCFS2_SUPER_MAGIC; 579 sb->s_magic = OCFS2_SUPER_MAGIC;
576 580
@@ -713,6 +717,7 @@ static struct file_system_type ocfs2_fs_type = {
713static int ocfs2_parse_options(struct super_block *sb, 717static int ocfs2_parse_options(struct super_block *sb,
714 char *options, 718 char *options,
715 unsigned long *mount_opt, 719 unsigned long *mount_opt,
720 s16 *slot,
716 int is_remount) 721 int is_remount)
717{ 722{
718 int status; 723 int status;
@@ -722,6 +727,7 @@ static int ocfs2_parse_options(struct super_block *sb,
722 options ? options : "(none)"); 727 options ? options : "(none)");
723 728
724 *mount_opt = 0; 729 *mount_opt = 0;
730 *slot = OCFS2_INVALID_SLOT;
725 731
726 if (!options) { 732 if (!options) {
727 status = 1; 733 status = 1;
@@ -782,6 +788,15 @@ static int ocfs2_parse_options(struct super_block *sb,
782 else 788 else
783 osb->s_atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM; 789 osb->s_atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM;
784 break; 790 break;
791 case Opt_slot:
792 option = 0;
793 if (match_int(&args[0], &option)) {
794 status = 0;
795 goto bail;
796 }
797 if (option)
798 *slot = (s16)option;
799 break;
785 default: 800 default:
786 mlog(ML_ERROR, 801 mlog(ML_ERROR,
787 "Unrecognized mount option \"%s\" " 802 "Unrecognized mount option \"%s\" "
diff --git a/fs/ocfs2/super.h b/fs/ocfs2/super.h
index 783f5270f2a1..3b9cb3d0b008 100644
--- a/fs/ocfs2/super.h
+++ b/fs/ocfs2/super.h
@@ -45,4 +45,6 @@ void __ocfs2_abort(struct super_block *sb,
45 45
46#define ocfs2_abort(sb, fmt, args...) __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##args) 46#define ocfs2_abort(sb, fmt, args...) __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##args)
47 47
48unsigned long long ocfs2_max_file_offset(unsigned int blockshift);
49
48#endif /* OCFS2_SUPER_H */ 50#endif /* OCFS2_SUPER_H */
diff --git a/fs/open.c b/fs/open.c
index 0d515d161974..be6a457f4226 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -855,7 +855,7 @@ EXPORT_SYMBOL(dentry_open);
855/* 855/*
856 * Find an empty file descriptor entry, and mark it busy. 856 * Find an empty file descriptor entry, and mark it busy.
857 */ 857 */
858int get_unused_fd(void) 858int get_unused_fd_flags(int flags)
859{ 859{
860 struct files_struct * files = current->files; 860 struct files_struct * files = current->files;
861 int fd, error; 861 int fd, error;
@@ -891,7 +891,10 @@ repeat:
891 } 891 }
892 892
893 FD_SET(fd, fdt->open_fds); 893 FD_SET(fd, fdt->open_fds);
894 FD_CLR(fd, fdt->close_on_exec); 894 if (flags & O_CLOEXEC)
895 FD_SET(fd, fdt->close_on_exec);
896 else
897 FD_CLR(fd, fdt->close_on_exec);
895 files->next_fd = fd + 1; 898 files->next_fd = fd + 1;
896#if 1 899#if 1
897 /* Sanity check */ 900 /* Sanity check */
@@ -907,6 +910,11 @@ out:
907 return error; 910 return error;
908} 911}
909 912
913int get_unused_fd(void)
914{
915 return get_unused_fd_flags(0);
916}
917
910EXPORT_SYMBOL(get_unused_fd); 918EXPORT_SYMBOL(get_unused_fd);
911 919
912static void __put_unused_fd(struct files_struct *files, unsigned int fd) 920static void __put_unused_fd(struct files_struct *files, unsigned int fd)
@@ -959,7 +967,7 @@ long do_sys_open(int dfd, const char __user *filename, int flags, int mode)
959 int fd = PTR_ERR(tmp); 967 int fd = PTR_ERR(tmp);
960 968
961 if (!IS_ERR(tmp)) { 969 if (!IS_ERR(tmp)) {
962 fd = get_unused_fd(); 970 fd = get_unused_fd_flags(flags);
963 if (fd >= 0) { 971 if (fd >= 0) {
964 struct file *f = do_filp_open(dfd, tmp, flags, mode); 972 struct file *f = do_filp_open(dfd, tmp, flags, mode);
965 if (IS_ERR(f)) { 973 if (IS_ERR(f)) {
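The point of taking the flags in get_unused_fd_flags() is atomicity: the descriptor never exists without FD_CLOEXEC set, so a concurrent fork()+exec() in another thread cannot leak it. From user space (file path illustrative):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/etc/hostname", O_RDONLY | O_CLOEXEC);

	if (fd < 0)
		return 1;
	/* No separate fcntl(F_SETFD) step, hence no race window. */
	printf("close-on-exec set: %d\n",
	       (fcntl(fd, F_GETFD) & FD_CLOEXEC) != 0);
	return close(fd);
}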
diff --git a/fs/partitions/acorn.c b/fs/partitions/acorn.c
index e3491328596b..3d3e16631472 100644
--- a/fs/partitions/acorn.c
+++ b/fs/partitions/acorn.c
@@ -25,6 +25,8 @@
25#define PARTITION_RISCIX_SCSI 2 25#define PARTITION_RISCIX_SCSI 2
26#define PARTITION_LINUX 9 26#define PARTITION_LINUX 9
27 27
28#if defined(CONFIG_ACORN_PARTITION_CUMANA) || \
29 defined(CONFIG_ACORN_PARTITION_ADFS)
28static struct adfs_discrecord * 30static struct adfs_discrecord *
29adfs_partition(struct parsed_partitions *state, char *name, char *data, 31adfs_partition(struct parsed_partitions *state, char *name, char *data,
30 unsigned long first_sector, int slot) 32 unsigned long first_sector, int slot)
@@ -48,6 +50,7 @@ adfs_partition(struct parsed_partitions *state, char *name, char *data,
48 put_partition(state, slot, first_sector, nr_sects); 50 put_partition(state, slot, first_sector, nr_sects);
49 return dr; 51 return dr;
50} 52}
53#endif
51 54
52#ifdef CONFIG_ACORN_PARTITION_RISCIX 55#ifdef CONFIG_ACORN_PARTITION_RISCIX
53 56
@@ -65,6 +68,8 @@ struct riscix_record {
65 struct riscix_part part[8]; 68 struct riscix_part part[8];
66}; 69};
67 70
71#if defined(CONFIG_ACORN_PARTITION_CUMANA) || \
72 defined(CONFIG_ACORN_PARTITION_ADFS)
68static int 73static int
69riscix_partition(struct parsed_partitions *state, struct block_device *bdev, 74riscix_partition(struct parsed_partitions *state, struct block_device *bdev,
70 unsigned long first_sect, int slot, unsigned long nr_sects) 75 unsigned long first_sect, int slot, unsigned long nr_sects)
@@ -105,6 +110,7 @@ riscix_partition(struct parsed_partitions *state, struct block_device *bdev,
105 return slot; 110 return slot;
106} 111}
107#endif 112#endif
113#endif
108 114
109#define LINUX_NATIVE_MAGIC 0xdeafa1de 115#define LINUX_NATIVE_MAGIC 0xdeafa1de
110#define LINUX_SWAP_MAGIC 0xdeafab1e 116#define LINUX_SWAP_MAGIC 0xdeafab1e
@@ -115,6 +121,8 @@ struct linux_part {
115 __le32 nr_sects; 121 __le32 nr_sects;
116}; 122};
117 123
124#if defined(CONFIG_ACORN_PARTITION_CUMANA) || \
125 defined(CONFIG_ACORN_PARTITION_ADFS)
118static int 126static int
119linux_partition(struct parsed_partitions *state, struct block_device *bdev, 127linux_partition(struct parsed_partitions *state, struct block_device *bdev,
120 unsigned long first_sect, int slot, unsigned long nr_sects) 128 unsigned long first_sect, int slot, unsigned long nr_sects)
@@ -146,6 +154,7 @@ linux_partition(struct parsed_partitions *state, struct block_device *bdev,
146 put_dev_sector(sect); 154 put_dev_sector(sect);
147 return slot; 155 return slot;
148} 156}
157#endif
149 158
150#ifdef CONFIG_ACORN_PARTITION_CUMANA 159#ifdef CONFIG_ACORN_PARTITION_CUMANA
151int 160int
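The new #if defined(CONFIG_ACORN_PARTITION_CUMANA) || defined(CONFIG_ACORN_PARTITION_ADFS) guards compile adfs_partition(), riscix_partition() and linux_partition() only when a partition parser that actually calls them is configured, silencing defined-but-unused warnings in the other configurations.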
diff --git a/fs/partitions/ldm.c b/fs/partitions/ldm.c
index 99873a2b4cbc..e7dd1d4e3473 100644
--- a/fs/partitions/ldm.c
+++ b/fs/partitions/ldm.c
@@ -677,15 +677,24 @@ static bool ldm_create_data_partitions (struct parsed_partitions *pp,
677 * Return: -1 Error, the calculated offset exceeded the size of the buffer 677 * Return: -1 Error, the calculated offset exceeded the size of the buffer
678 * n OK, a range-checked offset into buffer 678 * n OK, a range-checked offset into buffer
679 */ 679 */
680static int ldm_relative (const u8 *buffer, int buflen, int base, int offset) 680static int ldm_relative(const u8 *buffer, int buflen, int base, int offset)
681{ 681{
682 682
683 base += offset; 683 base += offset;
684 if ((!buffer) || (offset < 0) || (base > buflen)) 684 if (!buffer || offset < 0 || base > buflen) {
685 if (!buffer)
686 ldm_error("!buffer");
687 if (offset < 0)
688 ldm_error("offset (%d) < 0", offset);
689 if (base > buflen)
690 ldm_error("base (%d) > buflen (%d)", base, buflen);
685 return -1; 691 return -1;
686 if ((base + buffer[base]) >= buflen) 692 }
693 if (base + buffer[base] >= buflen) {
694 ldm_error("base (%d) + buffer[base] (%d) >= buflen (%d)", base,
695 buffer[base], buflen);
687 return -1; 696 return -1;
688 697 }
689 return buffer[base] + offset + 1; 698 return buffer[base] + offset + 1;
690} 699}
691 700
@@ -1054,60 +1063,98 @@ static bool ldm_parse_prt3(const u8 *buffer, int buflen, struct vblk *vb)
1054 * Return: 'true' @vb contains a Volume VBLK 1063 * Return: 'true' @vb contains a Volume VBLK
1055 * 'false' @vb contents are not defined 1064 * 'false' @vb contents are not defined
1056 */ 1065 */
1057static bool ldm_parse_vol5 (const u8 *buffer, int buflen, struct vblk *vb) 1066static bool ldm_parse_vol5(const u8 *buffer, int buflen, struct vblk *vb)
1058{ 1067{
1059 int r_objid, r_name, r_vtype, r_child, r_size, r_id1, r_id2, r_size2; 1068 int r_objid, r_name, r_vtype, r_disable_drive_letter, r_child, r_size;
1060 int r_drive, len; 1069 int r_id1, r_id2, r_size2, r_drive, len;
1061 struct vblk_volu *volu; 1070 struct vblk_volu *volu;
1062 1071
1063 BUG_ON (!buffer || !vb); 1072 BUG_ON(!buffer || !vb);
1064 1073 r_objid = ldm_relative(buffer, buflen, 0x18, 0);
1065 r_objid = ldm_relative (buffer, buflen, 0x18, 0); 1074 if (r_objid < 0) {
1066 r_name = ldm_relative (buffer, buflen, 0x18, r_objid); 1075 ldm_error("r_objid %d < 0", r_objid);
1067 r_vtype = ldm_relative (buffer, buflen, 0x18, r_name); 1076 return false;
1068 r_child = ldm_relative (buffer, buflen, 0x2E, r_vtype); 1077 }
1069 r_size = ldm_relative (buffer, buflen, 0x3E, r_child); 1078 r_name = ldm_relative(buffer, buflen, 0x18, r_objid);
1070 1079 if (r_name < 0) {
1071 if (buffer[0x12] & VBLK_FLAG_VOLU_ID1) 1080 ldm_error("r_name %d < 0", r_name);
1072 r_id1 = ldm_relative (buffer, buflen, 0x53, r_size); 1081 return false;
1073 else 1082 }
1083 r_vtype = ldm_relative(buffer, buflen, 0x18, r_name);
1084 if (r_vtype < 0) {
1085 ldm_error("r_vtype %d < 0", r_vtype);
1086 return false;
1087 }
1088 r_disable_drive_letter = ldm_relative(buffer, buflen, 0x18, r_vtype);
1089 if (r_disable_drive_letter < 0) {
1090 ldm_error("r_disable_drive_letter %d < 0",
1091 r_disable_drive_letter);
1092 return false;
1093 }
1094 r_child = ldm_relative(buffer, buflen, 0x2D, r_disable_drive_letter);
1095 if (r_child < 0) {
1096 ldm_error("r_child %d < 0", r_child);
1097 return false;
1098 }
1099 r_size = ldm_relative(buffer, buflen, 0x3D, r_child);
1100 if (r_size < 0) {
1101 ldm_error("r_size %d < 0", r_size);
1102 return false;
1103 }
1104 if (buffer[0x12] & VBLK_FLAG_VOLU_ID1) {
1105 r_id1 = ldm_relative(buffer, buflen, 0x52, r_size);
1106 if (r_id1 < 0) {
1107 ldm_error("r_id1 %d < 0", r_id1);
1108 return false;
1109 }
1110 } else
1074 r_id1 = r_size; 1111 r_id1 = r_size;
1075 1112 if (buffer[0x12] & VBLK_FLAG_VOLU_ID2) {
1076 if (buffer[0x12] & VBLK_FLAG_VOLU_ID2) 1113 r_id2 = ldm_relative(buffer, buflen, 0x52, r_id1);
1077 r_id2 = ldm_relative (buffer, buflen, 0x53, r_id1); 1114 if (r_id2 < 0) {
1078 else 1115 ldm_error("r_id2 %d < 0", r_id2);
1116 return false;
1117 }
1118 } else
1079 r_id2 = r_id1; 1119 r_id2 = r_id1;
1080 1120 if (buffer[0x12] & VBLK_FLAG_VOLU_SIZE) {
1081 if (buffer[0x12] & VBLK_FLAG_VOLU_SIZE) 1121 r_size2 = ldm_relative(buffer, buflen, 0x52, r_id2);
1082 r_size2 = ldm_relative (buffer, buflen, 0x53, r_id2); 1122 if (r_size2 < 0) {
1083 else 1123 ldm_error("r_size2 %d < 0", r_size2);
1124 return false;
1125 }
1126 } else
1084 r_size2 = r_id2; 1127 r_size2 = r_id2;
1085 1128 if (buffer[0x12] & VBLK_FLAG_VOLU_DRIVE) {
1086 if (buffer[0x12] & VBLK_FLAG_VOLU_DRIVE) 1129 r_drive = ldm_relative(buffer, buflen, 0x52, r_size2);
1087 r_drive = ldm_relative (buffer, buflen, 0x53, r_size2); 1130 if (r_drive < 0) {
1088 else 1131 ldm_error("r_drive %d < 0", r_drive);
1132 return false;
1133 }
1134 } else
1089 r_drive = r_size2; 1135 r_drive = r_size2;
1090
1091 len = r_drive; 1136 len = r_drive;
1092 if (len < 0) 1137 if (len < 0) {
1138 ldm_error("len %d < 0", len);
1093 return false; 1139 return false;
1094 1140 }
1095 len += VBLK_SIZE_VOL5; 1141 len += VBLK_SIZE_VOL5;
1096 if (len != BE32 (buffer + 0x14)) 1142 if (len > BE32(buffer + 0x14)) {
1143 ldm_error("len %d > BE32(buffer + 0x14) %d", len,
1144 BE32(buffer + 0x14));
1097 return false; 1145 return false;
1098 1146 }
1099 volu = &vb->vblk.volu; 1147 volu = &vb->vblk.volu;
1100 1148 ldm_get_vstr(buffer + 0x18 + r_name, volu->volume_type,
1101 ldm_get_vstr (buffer + 0x18 + r_name, volu->volume_type, 1149 sizeof(volu->volume_type));
1102 sizeof (volu->volume_type)); 1150 memcpy(volu->volume_state, buffer + 0x18 + r_disable_drive_letter,
1103 memcpy (volu->volume_state, buffer + 0x19 + r_vtype, 1151 sizeof(volu->volume_state));
1104 sizeof (volu->volume_state)); 1152 volu->size = ldm_get_vnum(buffer + 0x3D + r_child);
1105 volu->size = ldm_get_vnum (buffer + 0x3E + r_child); 1153 volu->partition_type = buffer[0x41 + r_size];
1106 volu->partition_type = buffer[0x42 + r_size]; 1154 memcpy(volu->guid, buffer + 0x42 + r_size, sizeof(volu->guid));
1107 memcpy (volu->guid, buffer + 0x43 + r_size, sizeof (volu->guid));
1108 if (buffer[0x12] & VBLK_FLAG_VOLU_DRIVE) { 1155 if (buffer[0x12] & VBLK_FLAG_VOLU_DRIVE) {
1109 ldm_get_vstr (buffer + 0x53 + r_size, volu->drive_hint, 1156 ldm_get_vstr(buffer + 0x52 + r_size, volu->drive_hint,
1110 sizeof (volu->drive_hint)); 1157 sizeof(volu->drive_hint));
1111 } 1158 }
1112 return true; 1159 return true;
1113} 1160}
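The VBLK flag bits in buffer[0x12] gate optional fields, which is why each r_* above either advances through ldm_relative() or carries the previous offset forward unchanged. The recurring shape, condensed into a schematic fragment (cur is a stand-in name):

/* Optional field: consume it only when its flag is set; otherwise the
 * running offset passes straight through to the next field. */
if (buffer[0x12] & VBLK_FLAG_VOLU_ID1) {
	cur = ldm_relative(buffer, buflen, 0x52, cur);
	if (cur < 0)
		return false;	/* range check failed: reject the VBLK */
}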
diff --git a/fs/partitions/ldm.h b/fs/partitions/ldm.h
index d2e6a3046939..80f63b5fdd9f 100644
--- a/fs/partitions/ldm.h
+++ b/fs/partitions/ldm.h
@@ -68,7 +68,7 @@ struct parsed_partitions;
68#define VBLK_SIZE_DSK3 12 68#define VBLK_SIZE_DSK3 12
69#define VBLK_SIZE_DSK4 45 69#define VBLK_SIZE_DSK4 45
70#define VBLK_SIZE_PRT3 28 70#define VBLK_SIZE_PRT3 28
71#define VBLK_SIZE_VOL5 59 71#define VBLK_SIZE_VOL5 58
72 72
73/* component types */ 73/* component types */
74#define COMP_STRIPE 0x01 /* Stripe-set */ 74#define COMP_STRIPE 0x01 /* Stripe-set */
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 98e78e2f18d6..965625a0977d 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -62,6 +62,8 @@
62#include <linux/mman.h> 62#include <linux/mman.h>
63#include <linux/proc_fs.h> 63#include <linux/proc_fs.h>
64#include <linux/ioport.h> 64#include <linux/ioport.h>
65#include <linux/uaccess.h>
66#include <linux/io.h>
65#include <linux/mm.h> 67#include <linux/mm.h>
66#include <linux/hugetlb.h> 68#include <linux/hugetlb.h>
67#include <linux/pagemap.h> 69#include <linux/pagemap.h>
@@ -76,9 +78,7 @@
76#include <linux/rcupdate.h> 78#include <linux/rcupdate.h>
77#include <linux/delayacct.h> 79#include <linux/delayacct.h>
78 80
79#include <asm/uaccess.h>
80#include <asm/pgtable.h> 81#include <asm/pgtable.h>
81#include <asm/io.h>
82#include <asm/processor.h> 82#include <asm/processor.h>
83#include "internal.h" 83#include "internal.h"
84 84
@@ -87,10 +87,10 @@
87do { memcpy(buffer, string, strlen(string)); \ 87do { memcpy(buffer, string, strlen(string)); \
88 buffer += strlen(string); } while (0) 88 buffer += strlen(string); } while (0)
89 89
90static inline char * task_name(struct task_struct *p, char * buf) 90static inline char *task_name(struct task_struct *p, char *buf)
91{ 91{
92 int i; 92 int i;
93 char * name; 93 char *name;
94 char tcomm[sizeof(p->comm)]; 94 char tcomm[sizeof(p->comm)];
95 95
96 get_task_comm(tcomm, p); 96 get_task_comm(tcomm, p);
@@ -138,7 +138,7 @@ static const char *task_state_array[] = {
138 "X (dead)" /* 32 */ 138 "X (dead)" /* 32 */
139}; 139};
140 140
141static inline const char * get_task_state(struct task_struct *tsk) 141static inline const char *get_task_state(struct task_struct *tsk)
142{ 142{
143 unsigned int state = (tsk->state & (TASK_RUNNING | 143 unsigned int state = (tsk->state & (TASK_RUNNING |
144 TASK_INTERRUPTIBLE | 144 TASK_INTERRUPTIBLE |
@@ -156,7 +156,7 @@ static inline const char * get_task_state(struct task_struct *tsk)
156 return *p; 156 return *p;
157} 157}
158 158
159static inline char * task_state(struct task_struct *p, char *buffer) 159static inline char *task_state(struct task_struct *p, char *buffer)
160{ 160{
161 struct group_info *group_info; 161 struct group_info *group_info;
162 int g; 162 int g;
@@ -172,8 +172,8 @@ static inline char * task_state(struct task_struct *p, char *buffer)
172 "Uid:\t%d\t%d\t%d\t%d\n" 172 "Uid:\t%d\t%d\t%d\t%d\n"
173 "Gid:\t%d\t%d\t%d\t%d\n", 173 "Gid:\t%d\t%d\t%d\t%d\n",
174 get_task_state(p), 174 get_task_state(p),
175 p->tgid, p->pid, 175 p->tgid, p->pid,
176 pid_alive(p) ? rcu_dereference(p->real_parent)->tgid : 0, 176 pid_alive(p) ? rcu_dereference(p->real_parent)->tgid : 0,
177 pid_alive(p) && p->ptrace ? rcu_dereference(p->parent)->pid : 0, 177 pid_alive(p) && p->ptrace ? rcu_dereference(p->parent)->pid : 0,
178 p->uid, p->euid, p->suid, p->fsuid, 178 p->uid, p->euid, p->suid, p->fsuid,
179 p->gid, p->egid, p->sgid, p->fsgid); 179 p->gid, p->egid, p->sgid, p->fsgid);
@@ -191,15 +191,15 @@ static inline char * task_state(struct task_struct *p, char *buffer)
191 get_group_info(group_info); 191 get_group_info(group_info);
192 task_unlock(p); 192 task_unlock(p);
193 193
194 for (g = 0; g < min(group_info->ngroups,NGROUPS_SMALL); g++) 194 for (g = 0; g < min(group_info->ngroups, NGROUPS_SMALL); g++)
195 buffer += sprintf(buffer, "%d ", GROUP_AT(group_info,g)); 195 buffer += sprintf(buffer, "%d ", GROUP_AT(group_info, g));
196 put_group_info(group_info); 196 put_group_info(group_info);
197 197
198 buffer += sprintf(buffer, "\n"); 198 buffer += sprintf(buffer, "\n");
199 return buffer; 199 return buffer;
200} 200}
201 201
202static char * render_sigset_t(const char *header, sigset_t *set, char *buffer) 202static char *render_sigset_t(const char *header, sigset_t *set, char *buffer)
203{ 203{
204 int i, len; 204 int i, len;
205 205
@@ -239,7 +239,7 @@ static void collect_sigign_sigcatch(struct task_struct *p, sigset_t *ign,
239 } 239 }
240} 240}
241 241
242static inline char * task_sig(struct task_struct *p, char *buffer) 242static inline char *task_sig(struct task_struct *p, char *buffer)
243{ 243{
244 unsigned long flags; 244 unsigned long flags;
245 sigset_t pending, shpending, blocked, ignored, caught; 245 sigset_t pending, shpending, blocked, ignored, caught;
@@ -289,14 +289,23 @@ static inline char *task_cap(struct task_struct *p, char *buffer)
289 cap_t(p->cap_effective)); 289 cap_t(p->cap_effective));
290} 290}
291 291
292int proc_pid_status(struct task_struct *task, char * buffer) 292static inline char *task_context_switch_counts(struct task_struct *p,
293 char *buffer)
293{ 294{
294 char * orig = buffer; 295 return buffer + sprintf(buffer, "voluntary_ctxt_switches:\t%lu\n"
296 "nonvoluntary_ctxt_switches:\t%lu\n",
297 p->nvcsw,
298 p->nivcsw);
299}
300
301int proc_pid_status(struct task_struct *task, char *buffer)
302{
303 char *orig = buffer;
295 struct mm_struct *mm = get_task_mm(task); 304 struct mm_struct *mm = get_task_mm(task);
296 305
297 buffer = task_name(task, buffer); 306 buffer = task_name(task, buffer);
298 buffer = task_state(task, buffer); 307 buffer = task_state(task, buffer);
299 308
300 if (mm) { 309 if (mm) {
301 buffer = task_mem(mm, buffer); 310 buffer = task_mem(mm, buffer);
302 mmput(mm); 311 mmput(mm);
@@ -307,6 +316,7 @@ int proc_pid_status(struct task_struct *task, char * buffer)
307#if defined(CONFIG_S390) 316#if defined(CONFIG_S390)
308 buffer = task_show_regs(task, buffer); 317 buffer = task_show_regs(task, buffer);
309#endif 318#endif
319 buffer = task_context_switch_counts(task, buffer);
310 return buffer - orig; 320 return buffer - orig;
311} 321}
312 322
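The two new counters surface as lines in /proc/<pid>/status. A quick userspace check (illustrative sketch; reads the calling process's own file):

#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/self/status", "r");

	if (!f)
		return 1;
	/* Matches both voluntary_ctxt_switches and nonvoluntary_ctxt_switches. */
	while (fgets(line, sizeof(line), f))
		if (strstr(line, "_ctxt_switches"))
			fputs(line, stdout);
	fclose(f);
	return 0;
}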
@@ -332,7 +342,7 @@ static clock_t task_utime(struct task_struct *p)
332 342
333static clock_t task_stime(struct task_struct *p) 343static clock_t task_stime(struct task_struct *p)
334{ 344{
335 clock_t stime = cputime_to_clock_t(p->stime); 345 clock_t stime;
336 346
337 /* 347 /*
338 * Use CFS's precise accounting. (we subtract utime from 348 * Use CFS's precise accounting. (we subtract utime from
@@ -344,8 +354,7 @@ static clock_t task_stime(struct task_struct *p)
344 return stime; 354 return stime;
345} 355}
346 356
347 357static int do_task_stat(struct task_struct *task, char *buffer, int whole)
348static int do_task_stat(struct task_struct *task, char * buffer, int whole)
349{ 358{
350 unsigned long vsize, eip, esp, wchan = ~0UL; 359 unsigned long vsize, eip, esp, wchan = ~0UL;
351 long priority, nice; 360 long priority, nice;
@@ -353,7 +362,7 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole)
353 sigset_t sigign, sigcatch; 362 sigset_t sigign, sigcatch;
354 char state; 363 char state;
355 int res; 364 int res;
356 pid_t ppid = 0, pgid = -1, sid = -1; 365 pid_t ppid = 0, pgid = -1, sid = -1;
357 int num_threads = 0; 366 int num_threads = 0;
358 struct mm_struct *mm; 367 struct mm_struct *mm;
359 unsigned long long start_time; 368 unsigned long long start_time;
@@ -424,7 +433,7 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole)
424 } 433 }
425 rcu_read_unlock(); 434 rcu_read_unlock();
426 435
427 if (!whole || num_threads<2) 436 if (!whole || num_threads < 2)
428 wchan = get_wchan(task); 437 wchan = get_wchan(task);
429 if (!whole) { 438 if (!whole) {
430 min_flt = task->min_flt; 439 min_flt = task->min_flt;
@@ -440,12 +449,13 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole)
440 449
441 /* Temporary variable needed for gcc-2.96 */ 450 /* Temporary variable needed for gcc-2.96 */
442 /* convert timespec -> nsec*/ 451 /* convert timespec -> nsec*/
443 start_time = (unsigned long long)task->start_time.tv_sec * NSEC_PER_SEC 452 start_time =
444 + task->start_time.tv_nsec; 453 (unsigned long long)task->real_start_time.tv_sec * NSEC_PER_SEC
454 + task->real_start_time.tv_nsec;
445 /* convert nsec -> ticks */ 455 /* convert nsec -> ticks */
446 start_time = nsec_to_clock_t(start_time); 456 start_time = nsec_to_clock_t(start_time);
447 457
448 res = sprintf(buffer,"%d (%s) %c %d %d %d %d %d %u %lu \ 458 res = sprintf(buffer, "%d (%s) %c %d %d %d %d %d %u %lu \
449%lu %lu %lu %lu %lu %ld %ld %ld %ld %d 0 %llu %lu %ld %lu %lu %lu %lu %lu \ 459%lu %lu %lu %lu %lu %ld %ld %ld %ld %d 0 %llu %lu %ld %lu %lu %lu %lu %lu \
450%lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu\n", 460%lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu\n",
451 task->pid, 461 task->pid,
@@ -471,7 +481,7 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole)
471 start_time, 481 start_time,
472 vsize, 482 vsize,
473 mm ? get_mm_rss(mm) : 0, 483 mm ? get_mm_rss(mm) : 0,
474 rsslim, 484 rsslim,
475 mm ? mm->start_code : 0, 485 mm ? mm->start_code : 0,
476 mm ? mm->end_code : 0, 486 mm ? mm->end_code : 0,
477 mm ? mm->start_stack : 0, 487 mm ? mm->start_stack : 0,
@@ -493,17 +503,17 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole)
493 task->rt_priority, 503 task->rt_priority,
494 task->policy, 504 task->policy,
495 (unsigned long long)delayacct_blkio_ticks(task)); 505 (unsigned long long)delayacct_blkio_ticks(task));
496 if(mm) 506 if (mm)
497 mmput(mm); 507 mmput(mm);
498 return res; 508 return res;
499} 509}
500 510
501int proc_tid_stat(struct task_struct *task, char * buffer) 511int proc_tid_stat(struct task_struct *task, char *buffer)
502{ 512{
503 return do_task_stat(task, buffer, 0); 513 return do_task_stat(task, buffer, 0);
504} 514}
505 515
506int proc_tgid_stat(struct task_struct *task, char * buffer) 516int proc_tgid_stat(struct task_struct *task, char *buffer)
507{ 517{
508 return do_task_stat(task, buffer, 1); 518 return do_task_stat(task, buffer, 1);
509} 519}
@@ -512,12 +522,12 @@ int proc_pid_statm(struct task_struct *task, char *buffer)
512{ 522{
513 int size = 0, resident = 0, shared = 0, text = 0, lib = 0, data = 0; 523 int size = 0, resident = 0, shared = 0, text = 0, lib = 0, data = 0;
514 struct mm_struct *mm = get_task_mm(task); 524 struct mm_struct *mm = get_task_mm(task);
515 525
516 if (mm) { 526 if (mm) {
517 size = task_statm(mm, &shared, &text, &data, &resident); 527 size = task_statm(mm, &shared, &text, &data, &resident);
518 mmput(mm); 528 mmput(mm);
519 } 529 }
520 530
521 return sprintf(buffer,"%d %d %d %d %d %d %d\n", 531 return sprintf(buffer, "%d %d %d %d %d %d %d\n",
522 size, resident, shared, text, lib, data, 0); 532 size, resident, shared, text, lib, data, 0);
523} 533}
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 46ea5d56e1bb..42cb4f5613b6 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -67,7 +67,6 @@
67#include <linux/mount.h> 67#include <linux/mount.h>
68#include <linux/security.h> 68#include <linux/security.h>
69#include <linux/ptrace.h> 69#include <linux/ptrace.h>
70#include <linux/seccomp.h>
71#include <linux/cpuset.h> 70#include <linux/cpuset.h>
72#include <linux/audit.h> 71#include <linux/audit.h>
73#include <linux/poll.h> 72#include <linux/poll.h>
@@ -204,12 +203,17 @@ static int proc_pid_environ(struct task_struct *task, char * buffer)
204 int res = 0; 203 int res = 0;
205 struct mm_struct *mm = get_task_mm(task); 204 struct mm_struct *mm = get_task_mm(task);
206 if (mm) { 205 if (mm) {
207 unsigned int len = mm->env_end - mm->env_start; 206 unsigned int len;
207
208 res = -ESRCH;
209 if (!ptrace_may_attach(task))
210 goto out;
211
212 len = mm->env_end - mm->env_start;
208 if (len > PAGE_SIZE) 213 if (len > PAGE_SIZE)
209 len = PAGE_SIZE; 214 len = PAGE_SIZE;
210 res = access_process_vm(task, mm->env_start, buffer, len, 0); 215 res = access_process_vm(task, mm->env_start, buffer, len, 0);
211 if (!ptrace_may_attach(task)) 216out:
212 res = -ESRCH;
213 mmput(mm); 217 mmput(mm);
214 } 218 }
215 return res; 219 return res;
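The point of the reordering: the old code ran access_process_vm() first and only then consulted ptrace_may_attach(), so the target's environment was copied before the permission decision. The fixed flow is check-then-copy, condensed:

res = -ESRCH;
if (!ptrace_may_attach(task))
	goto out;			/* refused: nothing was read */
len = mm->env_end - mm->env_start;
if (len > PAGE_SIZE)
	len = PAGE_SIZE;
res = access_process_vm(task, mm->env_start, buffer, len, 0);
out:
mmput(mm);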
@@ -279,7 +283,7 @@ static int proc_pid_auxv(struct task_struct *task, char *buffer)
279static int proc_pid_wchan(struct task_struct *task, char *buffer) 283static int proc_pid_wchan(struct task_struct *task, char *buffer)
280{ 284{
281 unsigned long wchan; 285 unsigned long wchan;
282 char symname[KSYM_NAME_LEN+1]; 286 char symname[KSYM_NAME_LEN];
283 287
284 wchan = get_wchan(task); 288 wchan = get_wchan(task);
285 289
@@ -812,71 +816,6 @@ static const struct file_operations proc_loginuid_operations = {
812}; 816};
813#endif 817#endif
814 818
815#ifdef CONFIG_SECCOMP
816static ssize_t seccomp_read(struct file *file, char __user *buf,
817 size_t count, loff_t *ppos)
818{
819 struct task_struct *tsk = get_proc_task(file->f_dentry->d_inode);
820 char __buf[20];
821 size_t len;
822
823 if (!tsk)
824 return -ESRCH;
825 /* no need to print the trailing zero, so use only len */
826 len = sprintf(__buf, "%u\n", tsk->seccomp.mode);
827 put_task_struct(tsk);
828
829 return simple_read_from_buffer(buf, count, ppos, __buf, len);
830}
831
832static ssize_t seccomp_write(struct file *file, const char __user *buf,
833 size_t count, loff_t *ppos)
834{
835 struct task_struct *tsk = get_proc_task(file->f_dentry->d_inode);
836 char __buf[20], *end;
837 unsigned int seccomp_mode;
838 ssize_t result;
839
840 result = -ESRCH;
841 if (!tsk)
842 goto out_no_task;
843
844 /* can set it only once to be even more secure */
845 result = -EPERM;
846 if (unlikely(tsk->seccomp.mode))
847 goto out;
848
849 result = -EFAULT;
850 memset(__buf, 0, sizeof(__buf));
851 count = min(count, sizeof(__buf) - 1);
852 if (copy_from_user(__buf, buf, count))
853 goto out;
854
855 seccomp_mode = simple_strtoul(__buf, &end, 0);
856 if (*end == '\n')
857 end++;
858 result = -EINVAL;
859 if (seccomp_mode && seccomp_mode <= NR_SECCOMP_MODES) {
860 tsk->seccomp.mode = seccomp_mode;
861 set_tsk_thread_flag(tsk, TIF_SECCOMP);
862 } else
863 goto out;
864 result = -EIO;
865 if (unlikely(!(end - __buf)))
866 goto out;
867 result = end - __buf;
868out:
869 put_task_struct(tsk);
870out_no_task:
871 return result;
872}
873
874static const struct file_operations proc_seccomp_operations = {
875 .read = seccomp_read,
876 .write = seccomp_write,
877};
878#endif /* CONFIG_SECCOMP */
879
880#ifdef CONFIG_FAULT_INJECTION 819#ifdef CONFIG_FAULT_INJECTION
881static ssize_t proc_fault_inject_read(struct file * file, char __user * buf, 820static ssize_t proc_fault_inject_read(struct file * file, char __user * buf,
882 size_t count, loff_t *ppos) 821 size_t count, loff_t *ppos)
@@ -2037,9 +1976,6 @@ static const struct pid_entry tgid_base_stuff[] = {
2037 REG("numa_maps", S_IRUGO, numa_maps), 1976 REG("numa_maps", S_IRUGO, numa_maps),
2038#endif 1977#endif
2039 REG("mem", S_IRUSR|S_IWUSR, mem), 1978 REG("mem", S_IRUSR|S_IWUSR, mem),
2040#ifdef CONFIG_SECCOMP
2041 REG("seccomp", S_IRUSR|S_IWUSR, seccomp),
2042#endif
2043 LNK("cwd", cwd), 1979 LNK("cwd", cwd),
2044 LNK("root", root), 1980 LNK("root", root),
2045 LNK("exe", exe), 1981 LNK("exe", exe),
@@ -2324,9 +2260,6 @@ static const struct pid_entry tid_base_stuff[] = {
2324 REG("numa_maps", S_IRUGO, numa_maps), 2260 REG("numa_maps", S_IRUGO, numa_maps),
2325#endif 2261#endif
2326 REG("mem", S_IRUSR|S_IWUSR, mem), 2262 REG("mem", S_IRUSR|S_IWUSR, mem),
2327#ifdef CONFIG_SECCOMP
2328 REG("seccomp", S_IRUSR|S_IWUSR, seccomp),
2329#endif
2330 LNK("cwd", cwd), 2263 LNK("cwd", cwd),
2331 LNK("root", root), 2264 LNK("root", root),
2332 LNK("exe", exe), 2265 LNK("exe", exe),
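The removed /proc/<pid>/seccomp file was superseded by prctl(PR_SET_SECCOMP), added around the same time; a minimal user of the replacement interface (userspace sketch; the constant is defined defensively in case this era's headers do not export it):

#include <sys/prctl.h>
#include <unistd.h>

#ifndef PR_SET_SECCOMP
#define PR_SET_SECCOMP	22	/* from <linux/prctl.h> of this era */
#endif

int main(void)
{
	/* Mode 1 ("strict"): afterwards only read, write, _exit and
	 * sigreturn are permitted; any other syscall is fatal. */
	if (prctl(PR_SET_SECCOMP, 1) < 0)
		return 1;
	write(1, "sandboxed\n", 10);
	_exit(0);
}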
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 8a40e15f5ecb..b5e7155d30d8 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -20,6 +20,7 @@
20#include <linux/namei.h> 20#include <linux/namei.h>
21#include <linux/bitops.h> 21#include <linux/bitops.h>
22#include <linux/spinlock.h> 22#include <linux/spinlock.h>
23#include <linux/completion.h>
23#include <asm/uaccess.h> 24#include <asm/uaccess.h>
24 25
25#include "internal.h" 26#include "internal.h"
@@ -529,12 +530,6 @@ static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp
529 return -EAGAIN; 530 return -EAGAIN;
530 dp->low_ino = i; 531 dp->low_ino = i;
531 532
532 spin_lock(&proc_subdir_lock);
533 dp->next = dir->subdir;
534 dp->parent = dir;
535 dir->subdir = dp;
536 spin_unlock(&proc_subdir_lock);
537
538 if (S_ISDIR(dp->mode)) { 533 if (S_ISDIR(dp->mode)) {
539 if (dp->proc_iops == NULL) { 534 if (dp->proc_iops == NULL) {
540 dp->proc_fops = &proc_dir_operations; 535 dp->proc_fops = &proc_dir_operations;
@@ -550,6 +545,13 @@ static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp
550 if (dp->proc_iops == NULL) 545 if (dp->proc_iops == NULL)
551 dp->proc_iops = &proc_file_inode_operations; 546 dp->proc_iops = &proc_file_inode_operations;
552 } 547 }
548
549 spin_lock(&proc_subdir_lock);
550 dp->next = dir->subdir;
551 dp->parent = dir;
552 dir->subdir = dp;
553 spin_unlock(&proc_subdir_lock);
554
553 return 0; 555 return 0;
554} 556}
555 557
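Deferring the dir->subdir linking until after ->proc_fops/->proc_iops are filled in follows the publish-last rule: an entry must not become reachable from the shared tree while half-initialized. Schematically (a fragment of the flow above):

/* 1. initialize the entry completely ... */
dp->proc_fops = &proc_dir_operations;
dp->proc_iops = &proc_dir_inode_operations;

/* 2. ... then publish it under the lock, where lookups can find it */
spin_lock(&proc_subdir_lock);
dp->next = dir->subdir;
dp->parent = dir;
dir->subdir = dp;
spin_unlock(&proc_subdir_lock);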
@@ -613,6 +615,9 @@ static struct proc_dir_entry *proc_create(struct proc_dir_entry **parent,
613 ent->namelen = len; 615 ent->namelen = len;
614 ent->mode = mode; 616 ent->mode = mode;
615 ent->nlink = nlink; 617 ent->nlink = nlink;
618 ent->pde_users = 0;
619 spin_lock_init(&ent->pde_unload_lock);
620 ent->pde_unload_completion = NULL;
616 out: 621 out:
617 return ent; 622 return ent;
618} 623}
@@ -649,9 +654,6 @@ struct proc_dir_entry *proc_mkdir_mode(const char *name, mode_t mode,
649 654
650 ent = proc_create(&parent, name, S_IFDIR | mode, 2); 655 ent = proc_create(&parent, name, S_IFDIR | mode, 2);
651 if (ent) { 656 if (ent) {
652 ent->proc_fops = &proc_dir_operations;
653 ent->proc_iops = &proc_dir_inode_operations;
654
655 if (proc_register(parent, ent) < 0) { 657 if (proc_register(parent, ent) < 0) {
656 kfree(ent); 658 kfree(ent);
657 ent = NULL; 659 ent = NULL;
@@ -686,10 +688,6 @@ struct proc_dir_entry *create_proc_entry(const char *name, mode_t mode,
686 688
687 ent = proc_create(&parent,name,mode,nlink); 689 ent = proc_create(&parent,name,mode,nlink);
688 if (ent) { 690 if (ent) {
689 if (S_ISDIR(mode)) {
690 ent->proc_fops = &proc_dir_operations;
691 ent->proc_iops = &proc_dir_inode_operations;
692 }
693 if (proc_register(parent, ent) < 0) { 691 if (proc_register(parent, ent) < 0) {
694 kfree(ent); 692 kfree(ent);
695 ent = NULL; 693 ent = NULL;
@@ -734,9 +732,35 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
734 de = *p; 732 de = *p;
735 *p = de->next; 733 *p = de->next;
736 de->next = NULL; 734 de->next = NULL;
735
736 spin_lock(&de->pde_unload_lock);
737 /*
738 * Stop accepting new callers into module. If you're
739 * dynamically allocating ->proc_fops, save a pointer somewhere.
740 */
741 de->proc_fops = NULL;
742 /* Wait until all existing callers into module are done. */
743 if (de->pde_users > 0) {
744 DECLARE_COMPLETION_ONSTACK(c);
745
746 if (!de->pde_unload_completion)
747 de->pde_unload_completion = &c;
748
749 spin_unlock(&de->pde_unload_lock);
750 spin_unlock(&proc_subdir_lock);
751
752 wait_for_completion(de->pde_unload_completion);
753
754 spin_lock(&proc_subdir_lock);
755 goto continue_removing;
756 }
757 spin_unlock(&de->pde_unload_lock);
758
759continue_removing:
737 if (S_ISDIR(de->mode)) 760 if (S_ISDIR(de->mode))
738 parent->nlink--; 761 parent->nlink--;
739 proc_kill_inodes(de); 762 if (!S_ISREG(de->mode))
763 proc_kill_inodes(de);
740 de->nlink = 0; 764 de->nlink = 0;
741 WARN_ON(de->subdir); 765 WARN_ON(de->subdir);
742 if (!atomic_read(&de->count)) 766 if (!atomic_read(&de->count))
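This removal path pairs with the pde_users accounting introduced in fs/proc/inode.c below: each proc_reg_* wrapper pins the entry under pde_unload_lock before calling into the module, and remove_proc_entry() clears ->proc_fops and then sleeps on the completion until the count drains. The two-sided protocol, schematically (method is a stand-in name):

/* user side -- see the proc_reg_* wrappers that follow */
spin_lock(&pde->pde_unload_lock);
if (!pde->proc_fops) {			/* removal in progress: refuse */
	spin_unlock(&pde->pde_unload_lock);
	return -EIO;
}
pde->pde_users++;			/* keep the module's fops alive */
method = pde->proc_fops->read;		/* snapshot under the lock */
spin_unlock(&pde->pde_unload_lock);

/* ... call method() with no locks held ... */

pde_users_dec(pde);			/* last user wakes the remover */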
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index d5ce65c68d7b..dd28e86ab422 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -10,6 +10,7 @@
10#include <linux/mm.h> 10#include <linux/mm.h>
11#include <linux/string.h> 11#include <linux/string.h>
12#include <linux/stat.h> 12#include <linux/stat.h>
13#include <linux/completion.h>
13#include <linux/file.h> 14#include <linux/file.h>
14#include <linux/limits.h> 15#include <linux/limits.h>
15#include <linux/init.h> 16#include <linux/init.h>
@@ -140,6 +141,251 @@ static const struct super_operations proc_sops = {
140 .remount_fs = proc_remount, 141 .remount_fs = proc_remount,
141}; 142};
142 143
144static void pde_users_dec(struct proc_dir_entry *pde)
145{
146 spin_lock(&pde->pde_unload_lock);
147 pde->pde_users--;
148 if (pde->pde_unload_completion && pde->pde_users == 0)
149 complete(pde->pde_unload_completion);
150 spin_unlock(&pde->pde_unload_lock);
151}
152
153static loff_t proc_reg_llseek(struct file *file, loff_t offset, int whence)
154{
155 struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
156 loff_t rv = -EINVAL;
157 loff_t (*llseek)(struct file *, loff_t, int);
158
159 spin_lock(&pde->pde_unload_lock);
160 /*
161 * remove_proc_entry() is going to delete PDE (as part of module
162 * cleanup sequence). No new callers into module allowed.
163 */
164 if (!pde->proc_fops) {
165 spin_unlock(&pde->pde_unload_lock);
166 return rv;
167 }
168 /*
169 * Bump refcount so that remove_proc_entry will wait for ->llseek to
170 * complete.
171 */
172 pde->pde_users++;
173 /*
174 * Save function pointer under lock, to protect against ->proc_fops
175 * NULL'ifying right after ->pde_unload_lock is dropped.
176 */
177 llseek = pde->proc_fops->llseek;
178 spin_unlock(&pde->pde_unload_lock);
179
180 if (!llseek)
181 llseek = default_llseek;
182 rv = llseek(file, offset, whence);
183
184 pde_users_dec(pde);
185 return rv;
186}
187
188static ssize_t proc_reg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
189{
190 struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
191 ssize_t rv = -EIO;
192 ssize_t (*read)(struct file *, char __user *, size_t, loff_t *);
193
194 spin_lock(&pde->pde_unload_lock);
195 if (!pde->proc_fops) {
196 spin_unlock(&pde->pde_unload_lock);
197 return rv;
198 }
199 pde->pde_users++;
200 read = pde->proc_fops->read;
201 spin_unlock(&pde->pde_unload_lock);
202
203 if (read)
204 rv = read(file, buf, count, ppos);
205
206 pde_users_dec(pde);
207 return rv;
208}
209
210static ssize_t proc_reg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
211{
212 struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
213 ssize_t rv = -EIO;
214 ssize_t (*write)(struct file *, const char __user *, size_t, loff_t *);
215
216 spin_lock(&pde->pde_unload_lock);
217 if (!pde->proc_fops) {
218 spin_unlock(&pde->pde_unload_lock);
219 return rv;
220 }
221 pde->pde_users++;
222 write = pde->proc_fops->write;
223 spin_unlock(&pde->pde_unload_lock);
224
225 if (write)
226 rv = write(file, buf, count, ppos);
227
228 pde_users_dec(pde);
229 return rv;
230}
231
232static unsigned int proc_reg_poll(struct file *file, struct poll_table_struct *pts)
233{
234 struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
235 unsigned int rv = 0;
236 unsigned int (*poll)(struct file *, struct poll_table_struct *);
237
238 spin_lock(&pde->pde_unload_lock);
239 if (!pde->proc_fops) {
240 spin_unlock(&pde->pde_unload_lock);
241 return rv;
242 }
243 pde->pde_users++;
244 poll = pde->proc_fops->poll;
245 spin_unlock(&pde->pde_unload_lock);
246
247 if (poll)
248 rv = poll(file, pts);
249
250 pde_users_dec(pde);
251 return rv;
252}
253
254static long proc_reg_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
255{
256 struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
257 long rv = -ENOTTY;
258 long (*unlocked_ioctl)(struct file *, unsigned int, unsigned long);
259 int (*ioctl)(struct inode *, struct file *, unsigned int, unsigned long);
260
261 spin_lock(&pde->pde_unload_lock);
262 if (!pde->proc_fops) {
263 spin_unlock(&pde->pde_unload_lock);
264 return rv;
265 }
266 pde->pde_users++;
267 unlocked_ioctl = pde->proc_fops->unlocked_ioctl;
268 ioctl = pde->proc_fops->ioctl;
269 spin_unlock(&pde->pde_unload_lock);
270
271 if (unlocked_ioctl) {
272 rv = unlocked_ioctl(file, cmd, arg);
273 if (rv == -ENOIOCTLCMD)
274 rv = -EINVAL;
275 } else if (ioctl) {
276 lock_kernel();
277 rv = ioctl(file->f_path.dentry->d_inode, file, cmd, arg);
278 unlock_kernel();
279 }
280
281 pde_users_dec(pde);
282 return rv;
283}
284
285#ifdef CONFIG_COMPAT
286static long proc_reg_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
287{
288 struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
289 long rv = -ENOTTY;
290 long (*compat_ioctl)(struct file *, unsigned int, unsigned long);
291
292 spin_lock(&pde->pde_unload_lock);
293 if (!pde->proc_fops) {
294 spin_unlock(&pde->pde_unload_lock);
295 return rv;
296 }
297 pde->pde_users++;
298 compat_ioctl = pde->proc_fops->compat_ioctl;
299 spin_unlock(&pde->pde_unload_lock);
300
301 if (compat_ioctl)
302 rv = compat_ioctl(file, cmd, arg);
303
304 pde_users_dec(pde);
305 return rv;
306}
307#endif
308
309static int proc_reg_mmap(struct file *file, struct vm_area_struct *vma)
310{
311 struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
312 int rv = -EIO;
313 int (*mmap)(struct file *, struct vm_area_struct *);
314
315 spin_lock(&pde->pde_unload_lock);
316 if (!pde->proc_fops) {
317 spin_unlock(&pde->pde_unload_lock);
318 return rv;
319 }
320 pde->pde_users++;
321 mmap = pde->proc_fops->mmap;
322 spin_unlock(&pde->pde_unload_lock);
323
324 if (mmap)
325 rv = mmap(file, vma);
326
327 pde_users_dec(pde);
328 return rv;
329}
330
331static int proc_reg_open(struct inode *inode, struct file *file)
332{
333 struct proc_dir_entry *pde = PDE(inode);
334 int rv = 0;
335 int (*open)(struct inode *, struct file *);
336
337 spin_lock(&pde->pde_unload_lock);
338 if (!pde->proc_fops) {
339 spin_unlock(&pde->pde_unload_lock);
340 return rv;
341 }
342 pde->pde_users++;
343 open = pde->proc_fops->open;
344 spin_unlock(&pde->pde_unload_lock);
345
346 if (open)
347 rv = open(inode, file);
348
349 pde_users_dec(pde);
350 return rv;
351}
352
353static int proc_reg_release(struct inode *inode, struct file *file)
354{
355 struct proc_dir_entry *pde = PDE(inode);
356 int rv = 0;
357 int (*release)(struct inode *, struct file *);
358
359 spin_lock(&pde->pde_unload_lock);
360 if (!pde->proc_fops) {
361 spin_unlock(&pde->pde_unload_lock);
362 return rv;
363 }
364 pde->pde_users++;
365 release = pde->proc_fops->release;
366 spin_unlock(&pde->pde_unload_lock);
367
368 if (release)
369 rv = release(inode, file);
370
371 pde_users_dec(pde);
372 return rv;
373}
374
375static const struct file_operations proc_reg_file_ops = {
376 .llseek = proc_reg_llseek,
377 .read = proc_reg_read,
378 .write = proc_reg_write,
379 .poll = proc_reg_poll,
380 .unlocked_ioctl = proc_reg_unlocked_ioctl,
381#ifdef CONFIG_COMPAT
382 .compat_ioctl = proc_reg_compat_ioctl,
383#endif
384 .mmap = proc_reg_mmap,
385 .open = proc_reg_open,
386 .release = proc_reg_release,
387};
388
143struct inode *proc_get_inode(struct super_block *sb, unsigned int ino, 389struct inode *proc_get_inode(struct super_block *sb, unsigned int ino,
144 struct proc_dir_entry *de) 390 struct proc_dir_entry *de)
145{ 391{
@@ -166,8 +412,12 @@ struct inode *proc_get_inode(struct super_block *sb, unsigned int ino,
166 inode->i_nlink = de->nlink; 412 inode->i_nlink = de->nlink;
167 if (de->proc_iops) 413 if (de->proc_iops)
168 inode->i_op = de->proc_iops; 414 inode->i_op = de->proc_iops;
169 if (de->proc_fops) 415 if (de->proc_fops) {
170 inode->i_fop = de->proc_fops; 416 if (S_ISREG(inode->i_mode))
417 inode->i_fop = &proc_reg_file_ops;
418 else
419 inode->i_fop = de->proc_fops;
420 }
171 } 421 }
172 422
173 return inode; 423 return inode;
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 5fd49e47f83a..d24b8d46059a 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -105,6 +105,7 @@ static int uptime_read_proc(char *page, char **start, off_t off,
105 cputime_t idletime = cputime_add(init_task.utime, init_task.stime); 105 cputime_t idletime = cputime_add(init_task.utime, init_task.stime);
106 106
107 do_posix_clock_monotonic_gettime(&uptime); 107 do_posix_clock_monotonic_gettime(&uptime);
108 monotonic_to_bootbased(&uptime);
108 cputime_to_timespec(idletime, &idle); 109 cputime_to_timespec(idletime, &idle);
109 len = sprintf(page,"%lu.%02lu %lu.%02lu\n", 110 len = sprintf(page,"%lu.%02lu %lu.%02lu\n",
110 (unsigned long) uptime.tv_sec, 111 (unsigned long) uptime.tv_sec,
@@ -443,12 +444,12 @@ static int show_stat(struct seq_file *p, void *v)
443 unsigned long jif; 444 unsigned long jif;
444 cputime64_t user, nice, system, idle, iowait, irq, softirq, steal; 445 cputime64_t user, nice, system, idle, iowait, irq, softirq, steal;
445 u64 sum = 0; 446 u64 sum = 0;
447 struct timespec boottime;
446 448
447 user = nice = system = idle = iowait = 449 user = nice = system = idle = iowait =
448 irq = softirq = steal = cputime64_zero; 450 irq = softirq = steal = cputime64_zero;
449 jif = - wall_to_monotonic.tv_sec; 451 getboottime(&boottime);
450 if (wall_to_monotonic.tv_nsec) 452 jif = boottime.tv_sec;
451 --jif;
452 453
453 for_each_possible_cpu(i) { 454 for_each_possible_cpu(i) {
454 int j; 455 int j;
diff --git a/fs/proc/proc_tty.c b/fs/proc/proc_tty.c
index b3a473b0a191..22846225acfa 100644
--- a/fs/proc/proc_tty.c
+++ b/fs/proc/proc_tty.c
@@ -69,7 +69,7 @@ static void show_tty_range(struct seq_file *m, struct tty_driver *p,
69 69
70static int show_tty_driver(struct seq_file *m, void *v) 70static int show_tty_driver(struct seq_file *m, void *v)
71{ 71{
72 struct tty_driver *p = v; 72 struct tty_driver *p = list_entry(v, struct tty_driver, tty_drivers);
73 dev_t from = MKDEV(p->major, p->minor_start); 73 dev_t from = MKDEV(p->major, p->minor_start);
74 dev_t to = from + p->num; 74 dev_t to = from + p->num;
75 75
@@ -106,22 +106,13 @@ static int show_tty_driver(struct seq_file *m, void *v)
106/* iterator */ 106/* iterator */
107static void *t_start(struct seq_file *m, loff_t *pos) 107static void *t_start(struct seq_file *m, loff_t *pos)
108{ 108{
109 struct list_head *p;
110 loff_t l = *pos;
111
112 mutex_lock(&tty_mutex); 109 mutex_lock(&tty_mutex);
113 list_for_each(p, &tty_drivers) 110 return seq_list_start(&tty_drivers, *pos);
114 if (!l--)
115 return list_entry(p, struct tty_driver, tty_drivers);
116 return NULL;
117} 111}
118 112
119static void *t_next(struct seq_file *m, void *v, loff_t *pos) 113static void *t_next(struct seq_file *m, void *v, loff_t *pos)
120{ 114{
121 struct list_head *p = ((struct tty_driver *)v)->tty_drivers.next; 115 return seq_list_next(v, &tty_drivers, pos);
122 (*pos)++;
123 return p==&tty_drivers ? NULL :
124 list_entry(p, struct tty_driver, tty_drivers);
125} 116}
126 117
127static void t_stop(struct seq_file *m, void *v) 118static void t_stop(struct seq_file *m, void *v)
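seq_list_start()/seq_list_next() fold the open-coded list walk into library helpers; any list-backed seq_file iterator reduces to this shape (sketch with a hypothetical my_list/my_lock):

static LIST_HEAD(my_list);
static DEFINE_MUTEX(my_lock);

static void *my_start(struct seq_file *m, loff_t *pos)
{
	mutex_lock(&my_lock);
	return seq_list_start(&my_list, *pos);	/* NULL once *pos is past the end */
}

static void *my_next(struct seq_file *m, void *v, loff_t *pos)
{
	return seq_list_next(v, &my_list, pos);	/* advances *pos */
}

static void my_stop(struct seq_file *m, void *v)
{
	mutex_unlock(&my_lock);
}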
diff --git a/fs/quota.c b/fs/quota.c
index 9f237d6182c9..e6577ac15a6c 100644
--- a/fs/quota.c
+++ b/fs/quota.c
@@ -10,12 +10,14 @@
10#include <linux/slab.h> 10#include <linux/slab.h>
11#include <asm/current.h> 11#include <asm/current.h>
12#include <asm/uaccess.h> 12#include <asm/uaccess.h>
13#include <linux/compat.h>
13#include <linux/kernel.h> 14#include <linux/kernel.h>
14#include <linux/security.h> 15#include <linux/security.h>
15#include <linux/syscalls.h> 16#include <linux/syscalls.h>
16#include <linux/buffer_head.h> 17#include <linux/buffer_head.h>
17#include <linux/capability.h> 18#include <linux/capability.h>
18#include <linux/quotaops.h> 19#include <linux/quotaops.h>
20#include <linux/types.h>
19 21
20/* Check validity of generic quotactl commands */ 22/* Check validity of generic quotactl commands */
21static int generic_quotactl_valid(struct super_block *sb, int type, int cmd, qid_t id) 23static int generic_quotactl_valid(struct super_block *sb, int type, int cmd, qid_t id)
@@ -384,3 +386,119 @@ asmlinkage long sys_quotactl(unsigned int cmd, const char __user *special, qid_t
384 386
385 return ret; 387 return ret;
386} 388}
389
390#if defined(CONFIG_X86_64) || defined(CONFIG_IA64)
391/*
392 * This code works only for 32 bit quota tools over 64 bit OS (x86_64, ia64)
393 * and is necessary due to alignment problems.
394 */
395struct compat_if_dqblk {
396 compat_u64 dqb_bhardlimit;
397 compat_u64 dqb_bsoftlimit;
398 compat_u64 dqb_curspace;
399 compat_u64 dqb_ihardlimit;
400 compat_u64 dqb_isoftlimit;
401 compat_u64 dqb_curinodes;
402 compat_u64 dqb_btime;
403 compat_u64 dqb_itime;
404 compat_uint_t dqb_valid;
405};
406
407/* XFS structures */
408struct compat_fs_qfilestat {
409 compat_u64 dqb_bhardlimit;
410 compat_u64 qfs_nblks;
411 compat_uint_t qfs_nextents;
412};
413
414struct compat_fs_quota_stat {
415 __s8 qs_version;
416 __u16 qs_flags;
417 __s8 qs_pad;
418 struct compat_fs_qfilestat qs_uquota;
419 struct compat_fs_qfilestat qs_gquota;
420 compat_uint_t qs_incoredqs;
421 compat_int_t qs_btimelimit;
422 compat_int_t qs_itimelimit;
423 compat_int_t qs_rtbtimelimit;
424 __u16 qs_bwarnlimit;
425 __u16 qs_iwarnlimit;
426};
427
428asmlinkage long sys32_quotactl(unsigned int cmd, const char __user *special,
429 qid_t id, void __user *addr)
430{
431 unsigned int cmds;
432 struct if_dqblk __user *dqblk;
433 struct compat_if_dqblk __user *compat_dqblk;
434 struct fs_quota_stat __user *fsqstat;
435 struct compat_fs_quota_stat __user *compat_fsqstat;
436 compat_uint_t data;
437 u16 xdata;
438 long ret;
439
440 cmds = cmd >> SUBCMDSHIFT;
441
442 switch (cmds) {
443 case Q_GETQUOTA:
444 dqblk = compat_alloc_user_space(sizeof(struct if_dqblk));
445 compat_dqblk = addr;
446 ret = sys_quotactl(cmd, special, id, dqblk);
447 if (ret)
448 break;
449 if (copy_in_user(compat_dqblk, dqblk, sizeof(*compat_dqblk)) ||
450 get_user(data, &dqblk->dqb_valid) ||
451 put_user(data, &compat_dqblk->dqb_valid))
452 ret = -EFAULT;
453 break;
454 case Q_SETQUOTA:
455 dqblk = compat_alloc_user_space(sizeof(struct if_dqblk));
456 compat_dqblk = addr;
457 ret = -EFAULT;
458 if (copy_in_user(dqblk, compat_dqblk, sizeof(*compat_dqblk)) ||
459 get_user(data, &compat_dqblk->dqb_valid) ||
460 put_user(data, &dqblk->dqb_valid))
461 break;
462 ret = sys_quotactl(cmd, special, id, dqblk);
463 break;
464 case Q_XGETQSTAT:
465 fsqstat = compat_alloc_user_space(sizeof(struct fs_quota_stat));
466 compat_fsqstat = addr;
467 ret = sys_quotactl(cmd, special, id, fsqstat);
468 if (ret)
469 break;
470 ret = -EFAULT;
471 /* Copying qs_version, qs_flags, qs_pad */
472 if (copy_in_user(compat_fsqstat, fsqstat,
473 offsetof(struct compat_fs_quota_stat, qs_uquota)))
474 break;
475 /* Copying qs_uquota */
476 if (copy_in_user(&compat_fsqstat->qs_uquota,
477 &fsqstat->qs_uquota,
478 sizeof(compat_fsqstat->qs_uquota)) ||
479 get_user(data, &fsqstat->qs_uquota.qfs_nextents) ||
480 put_user(data, &compat_fsqstat->qs_uquota.qfs_nextents))
481 break;
482 /* Copying qs_gquota */
483 if (copy_in_user(&compat_fsqstat->qs_gquota,
484 &fsqstat->qs_gquota,
485 sizeof(compat_fsqstat->qs_gquota)) ||
486 get_user(data, &fsqstat->qs_gquota.qfs_nextents) ||
487 put_user(data, &compat_fsqstat->qs_gquota.qfs_nextents))
488 break;
489 /* Copying the rest */
490 if (copy_in_user(&compat_fsqstat->qs_incoredqs,
491 &fsqstat->qs_incoredqs,
492 sizeof(struct compat_fs_quota_stat) -
493 offsetof(struct compat_fs_quota_stat, qs_incoredqs)) ||
494 get_user(xdata, &fsqstat->qs_iwarnlimit) ||
495 put_user(xdata, &compat_fsqstat->qs_iwarnlimit))
496 break;
497 ret = 0;
498 break;
499 default:
500 ret = sys_quotactl(cmd, special, id, addr);
501 }
502 return ret;
503}
504#endif
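All three translated commands use the standard compat bounce pattern: allocate a native-layout struct in user-addressable space with compat_alloc_user_space(), convert field by field with copy_in_user()/get_user()/put_user(), and delegate to the native syscall. The Q_GETQUOTA arm, condensed:

dqblk = compat_alloc_user_space(sizeof(struct if_dqblk));
ret = sys_quotactl(cmd, special, id, dqblk);	/* fill native layout */
if (!ret &&
    (copy_in_user(compat_dqblk, dqblk, sizeof(*compat_dqblk)) ||
     get_user(data, &dqblk->dqb_valid) ||	/* dqb_valid sits past the */
     put_user(data, &compat_dqblk->dqb_valid)))	/* differently-packed tail */
	ret = -EFAULT;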
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index d40d22b347b7..ef2b46d099ff 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -60,6 +60,7 @@ struct inode *ramfs_get_inode(struct super_block *sb, int mode, dev_t dev)
60 inode->i_blocks = 0; 60 inode->i_blocks = 0;
61 inode->i_mapping->a_ops = &ramfs_aops; 61 inode->i_mapping->a_ops = &ramfs_aops;
62 inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info; 62 inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info;
63 mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
63 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 64 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
64 switch (mode & S_IFMT) { 65 switch (mode & S_IFMT) {
65 default: 66 default:
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index 30eebfb1b2d8..2070aeee2a52 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -1305,7 +1305,6 @@ static ssize_t reiserfs_file_write(struct file *file, /* the file we are going t
1305 if (get_inode_item_key_version (inode) == KEY_FORMAT_3_5 && 1305 if (get_inode_item_key_version (inode) == KEY_FORMAT_3_5 &&
1306 *ppos + count > MAX_NON_LFS) { 1306 *ppos + count > MAX_NON_LFS) {
1307 if (*ppos >= MAX_NON_LFS) { 1307 if (*ppos >= MAX_NON_LFS) {
1308 send_sig(SIGXFSZ, current, 0);
1309 return -EFBIG; 1308 return -EFBIG;
1310 } 1309 }
1311 if (count > MAX_NON_LFS - (unsigned long)*ppos) 1310 if (count > MAX_NON_LFS - (unsigned long)*ppos)
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 1272d11399fb..ddde489f1cb2 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -7,6 +7,7 @@
7#include <linux/reiserfs_fs.h> 7#include <linux/reiserfs_fs.h>
8#include <linux/reiserfs_acl.h> 8#include <linux/reiserfs_acl.h>
9#include <linux/reiserfs_xattr.h> 9#include <linux/reiserfs_xattr.h>
10#include <linux/exportfs.h>
10#include <linux/smp_lock.h> 11#include <linux/smp_lock.h>
11#include <linux/pagemap.h> 12#include <linux/pagemap.h>
12#include <linux/highmem.h> 13#include <linux/highmem.h>
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index b4ac9119200e..5a93cfe1a032 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -21,6 +21,7 @@
21#include <linux/init.h> 21#include <linux/init.h>
22#include <linux/blkdev.h> 22#include <linux/blkdev.h>
23#include <linux/buffer_head.h> 23#include <linux/buffer_head.h>
24#include <linux/exportfs.h>
24#include <linux/vfs.h> 25#include <linux/vfs.h>
25#include <linux/mnt_namespace.h> 26#include <linux/mnt_namespace.h>
26#include <linux/mount.h> 27#include <linux/mount.h>
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 49194a4e6b91..bbb19be260ce 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -177,21 +177,23 @@ EXPORT_SYMBOL(seq_read);
177 177
178static int traverse(struct seq_file *m, loff_t offset) 178static int traverse(struct seq_file *m, loff_t offset)
179{ 179{
180 loff_t pos = 0; 180 loff_t pos = 0, index;
181 int error = 0; 181 int error = 0;
182 void *p; 182 void *p;
183 183
184 m->version = 0; 184 m->version = 0;
185 m->index = 0; 185 index = 0;
186 m->count = m->from = 0; 186 m->count = m->from = 0;
187 if (!offset) 187 if (!offset) {
188 m->index = index;
188 return 0; 189 return 0;
190 }
189 if (!m->buf) { 191 if (!m->buf) {
190 m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL); 192 m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL);
191 if (!m->buf) 193 if (!m->buf)
192 return -ENOMEM; 194 return -ENOMEM;
193 } 195 }
194 p = m->op->start(m, &m->index); 196 p = m->op->start(m, &index);
195 while (p) { 197 while (p) {
196 error = PTR_ERR(p); 198 error = PTR_ERR(p);
197 if (IS_ERR(p)) 199 if (IS_ERR(p))
@@ -204,15 +206,17 @@ static int traverse(struct seq_file *m, loff_t offset)
204 if (pos + m->count > offset) { 206 if (pos + m->count > offset) {
205 m->from = offset - pos; 207 m->from = offset - pos;
206 m->count -= m->from; 208 m->count -= m->from;
209 m->index = index;
207 break; 210 break;
208 } 211 }
209 pos += m->count; 212 pos += m->count;
210 m->count = 0; 213 m->count = 0;
211 if (pos == offset) { 214 if (pos == offset) {
212 m->index++; 215 index++;
216 m->index = index;
213 break; 217 break;
214 } 218 }
215 p = m->op->next(m, p, &m->index); 219 p = m->op->next(m, p, &index);
216 } 220 }
217 m->op->stop(m, p); 221 m->op->stop(m, p);
218 return error; 222 return error;
@@ -260,8 +264,8 @@ loff_t seq_lseek(struct file *file, loff_t offset, int origin)
260 } 264 }
261 } 265 }
262 } 266 }
263 mutex_unlock(&m->lock);
264 file->f_version = m->version; 267 file->f_version = m->version;
268 mutex_unlock(&m->lock);
265 return retval; 269 return retval;
266} 270}
267EXPORT_SYMBOL(seq_lseek); 271EXPORT_SYMBOL(seq_lseek);
diff --git a/fs/splice.c b/fs/splice.c
index 6c9828651e6f..53fc2082a468 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1061,8 +1061,9 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
1061 1061
1062 while (len) { 1062 while (len) {
1063 size_t read_len; 1063 size_t read_len;
1064 loff_t pos = sd->pos;
1064 1065
1065 ret = do_splice_to(in, &sd->pos, pipe, len, flags); 1066 ret = do_splice_to(in, &pos, pipe, len, flags);
1066 if (unlikely(ret <= 0)) 1067 if (unlikely(ret <= 0))
1067 goto out_release; 1068 goto out_release;
1068 1069
@@ -1080,6 +1081,7 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
1080 1081
1081 bytes += ret; 1082 bytes += ret;
1082 len -= ret; 1083 len -= ret;
1084 sd->pos = pos;
1083 1085
1084 if (ret < read_len) 1086 if (ret < read_len)
1085 goto out_release; 1087 goto out_release;
diff --git a/fs/super.c b/fs/super.c
index 5260d620c555..fc8ebedc6bed 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -884,6 +884,7 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void
884 error = type->get_sb(type, flags, name, data, mnt); 884 error = type->get_sb(type, flags, name, data, mnt);
885 if (error < 0) 885 if (error < 0)
886 goto out_free_secdata; 886 goto out_free_secdata;
887 BUG_ON(!mnt->mnt_sb);
887 888
888 error = security_sb_kern_mount(mnt->mnt_sb, secdata); 889 error = security_sb_kern_mount(mnt->mnt_sb, secdata);
889 if (error) 890 if (error)
diff --git a/fs/udf/crc.c b/fs/udf/crc.c
index 1b82a4adc2f7..ef2bfaa19d75 100644
--- a/fs/udf/crc.c
+++ b/fs/udf/crc.c
@@ -106,8 +106,8 @@ int main(void)
106{ 106{
107 unsigned short x; 107 unsigned short x;
108 108
109 x = udf_crc16(bytes, sizeof bytes); 109 x = udf_crc(bytes, sizeof bytes);
110 printf("udf_crc16: calculated = %4.4x, correct = %4.4x\n", x, 0x3299U); 110 printf("udf_crc: calculated = %4.4x, correct = %4.4x\n", x, 0x3299U);
111 111
112 return 0; 112 return 0;
113} 113}
diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c
index 8206983f2ebf..10f3188738af 100644
--- a/fs/udf/ialloc.c
+++ b/fs/udf/ialloc.c
@@ -50,7 +50,7 @@ void udf_free_inode(struct inode * inode)
50 else 50 else
51 UDF_SB_LVIDIU(sb)->numFiles = 51 UDF_SB_LVIDIU(sb)->numFiles =
52 cpu_to_le32(le32_to_cpu(UDF_SB_LVIDIU(sb)->numFiles) - 1); 52 cpu_to_le32(le32_to_cpu(UDF_SB_LVIDIU(sb)->numFiles) - 1);
53 53
54 mark_buffer_dirty(sbi->s_lvidbh); 54 mark_buffer_dirty(sbi->s_lvidbh);
55 } 55 }
56 mutex_unlock(&sbi->s_alloc_mutex); 56 mutex_unlock(&sbi->s_alloc_mutex);
@@ -136,6 +136,13 @@ struct inode * udf_new_inode (struct inode *dir, int mode, int * err)
136 UDF_I_EFE(inode) = 0; 136 UDF_I_EFE(inode) = 0;
137 UDF_I_DATA(inode) = kzalloc(inode->i_sb->s_blocksize - sizeof(struct fileEntry), GFP_KERNEL); 137 UDF_I_DATA(inode) = kzalloc(inode->i_sb->s_blocksize - sizeof(struct fileEntry), GFP_KERNEL);
138 } 138 }
139 if (!UDF_I_DATA(inode))
140 {
141 iput(inode);
142 *err = -ENOMEM;
143 mutex_unlock(&sbi->s_alloc_mutex);
144 return NULL;
145 }
139 if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_AD_IN_ICB)) 146 if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_AD_IN_ICB))
140 UDF_I_ALLOCTYPE(inode) = ICBTAG_FLAG_AD_IN_ICB; 147 UDF_I_ALLOCTYPE(inode) = ICBTAG_FLAG_AD_IN_ICB;
141 else if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD)) 148 else if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD))
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index bf7de0bdbab3..5b82e489af78 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -49,6 +49,7 @@ MODULE_LICENSE("GPL");
49static mode_t udf_convert_permissions(struct fileEntry *); 49static mode_t udf_convert_permissions(struct fileEntry *);
50static int udf_update_inode(struct inode *, int); 50static int udf_update_inode(struct inode *, int);
51static void udf_fill_inode(struct inode *, struct buffer_head *); 51static void udf_fill_inode(struct inode *, struct buffer_head *);
52static int udf_alloc_i_data(struct inode *inode, size_t size);
52static struct buffer_head *inode_getblk(struct inode *, sector_t, int *, 53static struct buffer_head *inode_getblk(struct inode *, sector_t, int *,
53 long *, int *); 54 long *, int *);
54static int8_t udf_insert_aext(struct inode *, struct extent_position, 55static int8_t udf_insert_aext(struct inode *, struct extent_position,
@@ -734,7 +735,7 @@ static void udf_split_extents(struct inode *inode, int *c, int offset, int newbl
734 (*c) ++; 735 (*c) ++;
735 (*endnum) ++; 736 (*endnum) ++;
736 } 737 }
737 738
738 laarr[curr].extLocation.logicalBlockNum = newblocknum; 739 laarr[curr].extLocation.logicalBlockNum = newblocknum;
739 if (etype == (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30)) 740 if (etype == (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30))
740 laarr[curr].extLocation.partitionReferenceNum = 741 laarr[curr].extLocation.partitionReferenceNum =
@@ -836,7 +837,7 @@ static void udf_prealloc_extents(struct inode *inode, int c, int lastblock,
836 { 837 {
837 numalloc -= elen; 838 numalloc -= elen;
838 if (*endnum > (i+1)) 839 if (*endnum > (i+1))
839 memmove(&laarr[i], &laarr[i+1], 840 memmove(&laarr[i], &laarr[i+1],
840 sizeof(long_ad) * (*endnum - (i+1))); 841 sizeof(long_ad) * (*endnum - (i+1)));
841 i --; 842 i --;
842 (*endnum) --; 843 (*endnum) --;
@@ -1024,7 +1025,7 @@ void udf_truncate(struct inode * inode)
1024 { 1025 {
1025 block_truncate_page(inode->i_mapping, inode->i_size, udf_get_block); 1026 block_truncate_page(inode->i_mapping, inode->i_size, udf_get_block);
1026 udf_truncate_extents(inode); 1027 udf_truncate_extents(inode);
1027 } 1028 }
1028 1029
1029 inode->i_mtime = inode->i_ctime = current_fs_time(inode->i_sb); 1030 inode->i_mtime = inode->i_ctime = current_fs_time(inode->i_sb);
1030 if (IS_SYNC(inode)) 1031 if (IS_SYNC(inode))
@@ -1087,10 +1088,10 @@ __udf_read_inode(struct inode *inode)
1087 { 1088 {
1088 kernel_lb_addr loc; 1089 kernel_lb_addr loc;
1089 ie = (struct indirectEntry *)ibh->b_data; 1090 ie = (struct indirectEntry *)ibh->b_data;
1090 1091
1091 loc = lelb_to_cpu(ie->indirectICB.extLocation); 1092 loc = lelb_to_cpu(ie->indirectICB.extLocation);
1092 1093
1093 if (ie->indirectICB.extLength && 1094 if (ie->indirectICB.extLength &&
1094 (nbh = udf_read_ptagged(inode->i_sb, loc, 0, &ident))) 1095 (nbh = udf_read_ptagged(inode->i_sb, loc, 0, &ident)))
1095 { 1096 {
1096 if (ident == TAG_IDENT_FE || 1097 if (ident == TAG_IDENT_FE ||
@@ -1156,14 +1157,22 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
1156 { 1157 {
1157 UDF_I_EFE(inode) = 1; 1158 UDF_I_EFE(inode) = 1;
1158 UDF_I_USE(inode) = 0; 1159 UDF_I_USE(inode) = 0;
1159 UDF_I_DATA(inode) = kmalloc(inode->i_sb->s_blocksize - sizeof(struct extendedFileEntry), GFP_KERNEL); 1160 if (udf_alloc_i_data(inode, inode->i_sb->s_blocksize - sizeof(struct extendedFileEntry)))
1161 {
1162 make_bad_inode(inode);
1163 return;
1164 }
1160 memcpy(UDF_I_DATA(inode), bh->b_data + sizeof(struct extendedFileEntry), inode->i_sb->s_blocksize - sizeof(struct extendedFileEntry)); 1165 memcpy(UDF_I_DATA(inode), bh->b_data + sizeof(struct extendedFileEntry), inode->i_sb->s_blocksize - sizeof(struct extendedFileEntry));
1161 } 1166 }
1162 else if (le16_to_cpu(fe->descTag.tagIdent) == TAG_IDENT_FE) 1167 else if (le16_to_cpu(fe->descTag.tagIdent) == TAG_IDENT_FE)
1163 { 1168 {
1164 UDF_I_EFE(inode) = 0; 1169 UDF_I_EFE(inode) = 0;
1165 UDF_I_USE(inode) = 0; 1170 UDF_I_USE(inode) = 0;
1166 UDF_I_DATA(inode) = kmalloc(inode->i_sb->s_blocksize - sizeof(struct fileEntry), GFP_KERNEL); 1171 if (udf_alloc_i_data(inode, inode->i_sb->s_blocksize - sizeof(struct fileEntry)))
1172 {
1173 make_bad_inode(inode);
1174 return;
1175 }
1167 memcpy(UDF_I_DATA(inode), bh->b_data + sizeof(struct fileEntry), inode->i_sb->s_blocksize - sizeof(struct fileEntry)); 1176 memcpy(UDF_I_DATA(inode), bh->b_data + sizeof(struct fileEntry), inode->i_sb->s_blocksize - sizeof(struct fileEntry));
1168 } 1177 }
1169 else if (le16_to_cpu(fe->descTag.tagIdent) == TAG_IDENT_USE) 1178 else if (le16_to_cpu(fe->descTag.tagIdent) == TAG_IDENT_USE)
@@ -1173,7 +1182,11 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
1173 UDF_I_LENALLOC(inode) = 1182 UDF_I_LENALLOC(inode) =
1174 le32_to_cpu( 1183 le32_to_cpu(
1175 ((struct unallocSpaceEntry *)bh->b_data)->lengthAllocDescs); 1184 ((struct unallocSpaceEntry *)bh->b_data)->lengthAllocDescs);
1176 UDF_I_DATA(inode) = kmalloc(inode->i_sb->s_blocksize - sizeof(struct unallocSpaceEntry), GFP_KERNEL); 1185 if (udf_alloc_i_data(inode, inode->i_sb->s_blocksize - sizeof(struct unallocSpaceEntry)))
1186 {
1187 make_bad_inode(inode);
1188 return;
1189 }
1177 memcpy(UDF_I_DATA(inode), bh->b_data + sizeof(struct unallocSpaceEntry), inode->i_sb->s_blocksize - sizeof(struct unallocSpaceEntry)); 1190 memcpy(UDF_I_DATA(inode), bh->b_data + sizeof(struct unallocSpaceEntry), inode->i_sb->s_blocksize - sizeof(struct unallocSpaceEntry));
1178 return; 1191 return;
1179 } 1192 }
@@ -1191,7 +1204,7 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
1191 inode->i_nlink = le16_to_cpu(fe->fileLinkCount); 1204 inode->i_nlink = le16_to_cpu(fe->fileLinkCount);
1192 if (!inode->i_nlink) 1205 if (!inode->i_nlink)
1193 inode->i_nlink = 1; 1206 inode->i_nlink = 1;
1194 1207
1195 inode->i_size = le64_to_cpu(fe->informationLength); 1208 inode->i_size = le64_to_cpu(fe->informationLength);
1196 UDF_I_LENEXTENTS(inode) = inode->i_size; 1209 UDF_I_LENEXTENTS(inode) = inode->i_size;
1197 1210
@@ -1243,7 +1256,7 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
1243 } 1256 }
1244 else 1257 else
1245 { 1258 {
1246 inode->i_blocks = le64_to_cpu(efe->logicalBlocksRecorded) << 1259 inode->i_blocks = le64_to_cpu(efe->logicalBlocksRecorded) <<
1247 (inode->i_sb->s_blocksize_bits - 9); 1260 (inode->i_sb->s_blocksize_bits - 9);
1248 1261
1249 if ( udf_stamp_to_time(&convtime, &convtime_usec, 1262 if ( udf_stamp_to_time(&convtime, &convtime_usec,
@@ -1374,6 +1387,20 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
1374 } 1387 }
1375} 1388}
1376 1389
1390static int udf_alloc_i_data(struct inode *inode, size_t size)
1391{
1392 UDF_I_DATA(inode) = kmalloc(size, GFP_KERNEL);
1393
1394 if (!UDF_I_DATA(inode))
1395 {
1396 printk(KERN_ERR "udf:udf_alloc_i_data (ino %ld) no free memory\n",
1397 inode->i_ino);
1398 return -ENOMEM;
1399 }
1400
1401 return 0;
1402}
1403
1377static mode_t 1404static mode_t
1378udf_convert_permissions(struct fileEntry *fe) 1405udf_convert_permissions(struct fileEntry *fe)
1379{ 1406{
@@ -2072,7 +2099,7 @@ int8_t udf_delete_aext(struct inode *inode, struct extent_position epos,
2072 mark_buffer_dirty_inode(oepos.bh, inode); 2099 mark_buffer_dirty_inode(oepos.bh, inode);
2073 } 2100 }
2074 } 2101 }
2075 2102
2076 brelse(epos.bh); 2103 brelse(epos.bh);
2077 brelse(oepos.bh); 2104 brelse(oepos.bh);
2078 return (elen >> 30); 2105 return (elen >> 30);
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 22ff6ed55ce9..2b3011689e89 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -87,6 +87,7 @@
87#include <linux/smp_lock.h> 87#include <linux/smp_lock.h>
88#include <linux/buffer_head.h> 88#include <linux/buffer_head.h>
89#include <linux/vfs.h> 89#include <linux/vfs.h>
90#include <linux/log2.h>
90 91
91#include "swab.h" 92#include "swab.h"
92#include "util.h" 93#include "util.h"
@@ -854,7 +855,7 @@ magic_found:
854 uspi->s_fmask = fs32_to_cpu(sb, usb1->fs_fmask); 855 uspi->s_fmask = fs32_to_cpu(sb, usb1->fs_fmask);
855 uspi->s_fshift = fs32_to_cpu(sb, usb1->fs_fshift); 856 uspi->s_fshift = fs32_to_cpu(sb, usb1->fs_fshift);
856 857
857 if (uspi->s_fsize & (uspi->s_fsize - 1)) { 858 if (!is_power_of_2(uspi->s_fsize)) {
858 printk(KERN_ERR "ufs_read_super: fragment size %u is not a power of 2\n", 859 printk(KERN_ERR "ufs_read_super: fragment size %u is not a power of 2\n",
859 uspi->s_fsize); 860 uspi->s_fsize);
860 goto failed; 861 goto failed;
@@ -869,7 +870,7 @@ magic_found:
869 uspi->s_fsize); 870 uspi->s_fsize);
870 goto failed; 871 goto failed;
871 } 872 }
872 if (uspi->s_bsize & (uspi->s_bsize - 1)) { 873 if (!is_power_of_2(uspi->s_bsize)) {
873 printk(KERN_ERR "ufs_read_super: block size %u is not a power of 2\n", 874 printk(KERN_ERR "ufs_read_super: block size %u is not a power of 2\n",
874 uspi->s_bsize); 875 uspi->s_bsize);
875 goto failed; 876 goto failed;
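is_power_of_2() from the newly included <linux/log2.h> is the named form of the familiar bit trick, with the zero case handled explicitly; it is roughly:

/* Approximate shape of the helper in include/linux/log2.h. */
static inline __attribute__((const)) bool is_power_of_2(unsigned long n)
{
	return n != 0 && (n & (n - 1)) == 0;
}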
diff --git a/fs/utimes.c b/fs/utimes.c
index b3c88952465f..83a7e69e706c 100644
--- a/fs/utimes.c
+++ b/fs/utimes.c
@@ -106,7 +106,7 @@ long do_utimes(int dfd, char __user *filename, struct timespec *times, int flags
 	if (IS_IMMUTABLE(inode))
 		goto dput_and_out;
 
-	if (current->fsuid != inode->i_uid) {
+	if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER)) {
 		if (f) {
 			if (!(f->f_mode & FMODE_WRITE))
 				goto dput_and_out;
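
The utimes change lets a caller with CAP_FOWNER (typically root) set explicit timestamps on files it does not own; before, only the literal owner skipped this block, and every other caller, capable or not, fell through to the write-access checks. Condensed, the logic after the change reads roughly like this (a sketch; the real function also covers the pathname-based branch and the exact error codes):

	if (current->fsuid != inode->i_uid && !capable(CAP_FOWNER)) {
		/* neither owner nor privileged */
		if (f) {
			/* fd-based utimes: the descriptor must be writable */
			if (!(f->f_mode & FMODE_WRITE))
				goto dput_and_out;
		}
	}
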
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 2df63622354e..b0f0e58866de 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -35,10 +35,13 @@
 #include <linux/freezer.h>
 
 static kmem_zone_t *xfs_buf_zone;
-static struct shrinker *xfs_buf_shake;
 STATIC int xfsbufd(void *);
 STATIC int xfsbufd_wakeup(int, gfp_t);
 STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int);
+static struct shrinker xfs_buf_shake = {
+	.shrink = xfsbufd_wakeup,
+	.seeks = DEFAULT_SEEKS,
+};
 
 static struct workqueue_struct *xfslogd_workqueue;
 struct workqueue_struct *xfsdatad_workqueue;
@@ -1832,14 +1835,9 @@ xfs_buf_init(void)
 	if (!xfsdatad_workqueue)
 		goto out_destroy_xfslogd_workqueue;
 
-	xfs_buf_shake = set_shrinker(DEFAULT_SEEKS, xfsbufd_wakeup);
-	if (!xfs_buf_shake)
-		goto out_destroy_xfsdatad_workqueue;
-
+	register_shrinker(&xfs_buf_shake);
 	return 0;
 
- out_destroy_xfsdatad_workqueue:
-	destroy_workqueue(xfsdatad_workqueue);
 out_destroy_xfslogd_workqueue:
 	destroy_workqueue(xfslogd_workqueue);
 out_free_buf_zone:
@@ -1854,7 +1852,7 @@ xfs_buf_init(void)
 void
 xfs_buf_terminate(void)
 {
-	remove_shrinker(xfs_buf_shake);
+	unregister_shrinker(&xfs_buf_shake);
 	destroy_workqueue(xfsdatad_workqueue);
 	destroy_workqueue(xfslogd_workqueue);
 	kmem_zone_destroy(xfs_buf_zone);
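
Both xfs_buf.c hunks are part of the tree-wide shrinker API conversion in this series: set_shrinker() allocated the struct shrinker internally and could therefore fail, forcing callers to carry an unwind path, while register_shrinker() takes a caller-embedded, statically initialized structure and cannot fail, which is why the out_destroy_xfsdatad_workqueue label disappears above. The two patterns side by side, as a sketch (my_shrink stands in for any `int (*)(int nr_to_scan, gfp_t gfp_mask)` callback):

	/* Old API: registration allocates, so it may fail. */
	static struct shrinker *shaker;
	shaker = set_shrinker(DEFAULT_SEEKS, my_shrink);
	if (!shaker)
		goto out_err;
	/* ... */
	remove_shrinker(shaker);

	/* New API: the caller owns the structure; registration cannot fail. */
	static struct shrinker my_shrinker = {
		.shrink	= my_shrink,
		.seeks	= DEFAULT_SEEKS,
	};
	register_shrinker(&my_shrinker);
	/* ... */
	unregister_shrinker(&my_shrinker);
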
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 06894cf00b12..4528f9a3f304 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -562,6 +562,7 @@ xfssyncd(
 	bhv_vfs_sync_work_t	*work, *n;
 	LIST_HEAD		(tmp);
 
+	set_freezable();
 	timeleft = xfs_syncd_centisecs * msecs_to_jiffies(10);
 	for (;;) {
 		timeleft = schedule_timeout_interruptible(timeleft);
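
set_freezable() is needed because, as of the freezer rework in this same series, kernel threads start non-freezable by default; a daemon that should be parked across suspend must opt in explicitly and keep calling try_to_freeze() in its loop, as xfssyncd already does further down. A minimal freezable-loop sketch with a hypothetical thread function:

	#include <linux/freezer.h>
	#include <linux/kthread.h>

	static int my_daemon(void *arg)
	{
		set_freezable();	/* opt in: kthreads default to non-freezable */
		while (!kthread_should_stop()) {
			schedule_timeout_interruptible(HZ);
			try_to_freeze();	/* blocks here while the system suspends */
			/* ... periodic work ... */
		}
		return 0;
	}
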
diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h
index 33dd1ca13245..201cc3273c84 100644
--- a/fs/xfs/linux-2.6/xfs_super.h
+++ b/fs/xfs/linux-2.6/xfs_super.h
@@ -18,6 +18,8 @@
 #ifndef __XFS_SUPER_H__
 #define __XFS_SUPER_H__
 
+#include <linux/exportfs.h>
+
 #ifdef CONFIG_XFS_DMAPI
 # define vfs_insertdmapi(vfs)	vfs_insertops(vfsp, &xfs_dmops)
 # define vfs_initdmapi()	dmapi_init()
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 7def4c699343..2d274b23ade5 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -62,7 +62,6 @@ uint ndquot;
 
 kmem_zone_t	*qm_dqzone;
 kmem_zone_t	*qm_dqtrxzone;
-static struct shrinker *xfs_qm_shaker;
 
 static cred_t	xfs_zerocr;
 
@@ -78,6 +77,11 @@ STATIC int xfs_qm_init_quotainos(xfs_mount_t *);
78STATIC int xfs_qm_init_quotainfo(xfs_mount_t *); 77STATIC int xfs_qm_init_quotainfo(xfs_mount_t *);
79STATIC int xfs_qm_shake(int, gfp_t); 78STATIC int xfs_qm_shake(int, gfp_t);
80 79
80static struct shrinker xfs_qm_shaker = {
81 .shrink = xfs_qm_shake,
82 .seeks = DEFAULT_SEEKS,
83};
84
81#ifdef DEBUG 85#ifdef DEBUG
82extern mutex_t qcheck_lock; 86extern mutex_t qcheck_lock;
83#endif 87#endif
@@ -149,7 +153,7 @@ xfs_Gqm_init(void)
 	} else
 		xqm->qm_dqzone = qm_dqzone;
 
-	xfs_qm_shaker = set_shrinker(DEFAULT_SEEKS, xfs_qm_shake);
+	register_shrinker(&xfs_qm_shaker);
 
 	/*
 	 * The t_dqinfo portion of transactions.
@@ -181,7 +185,7 @@ xfs_qm_destroy(
 
 	ASSERT(xqm != NULL);
 	ASSERT(xqm->qm_nrefs == 0);
-	remove_shrinker(xfs_qm_shaker);
+	unregister_shrinker(&xfs_qm_shaker);
 	hsize = xqm->qm_dqhashmask + 1;
 	for (i = 0; i < hsize; i++) {
 		xfs_qm_list_destroy(&(xqm->qm_usr_dqhtable[i]));