aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@woody.linux-foundation.org>2007-07-16 13:52:55 -0400
committerLinus Torvalds <torvalds@woody.linux-foundation.org>2007-07-16 13:52:55 -0400
commitadd096909da63ef32d6766f6771c07c9f16c6ee5 (patch)
tree58594bcf68cbb6f777d5270d098ab8ca69cbaee3 /fs
parente245befce7af0a1e1347079ed62695b059594bd4 (diff)
parent54c57dc3b6578356c0a428c767d4bf080254a2ee (diff)
Merge branch 'upstream-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mfasheh/ocfs2
* 'upstream-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mfasheh/ocfs2: (32 commits) [PATCH] ocfs2: zero_user_page conversion ocfs2: Support xfs style space reservation ioctls ocfs2: support for removing file regions ocfs2: update truncate handling of partial clusters ocfs2: btree support for removal of arbirtrary extents ocfs2: Support creation of unwritten extents ocfs2: support writing of unwritten extents ocfs2: small cleanup of ocfs2_write_begin_nolock() ocfs2: btree changes for unwritten extents ocfs2: abstract btree growing calls ocfs2: use all extent block suballocators ocfs2: plug truncate into cached dealloc routines ocfs2: simplify deallocation locking ocfs2: harden buffer check during mapping of page blocks ocfs2: shared writeable mmap ocfs2: factor out write aops into nolock variants ocfs2: rework ocfs2_buffered_write_cluster() ocfs2: take ip_alloc_sem during entire truncate ocfs2: Add "preferred slot" mount option [KJ PATCH] Replacing memset(<addr>,0,PAGE_SIZE) with clear_page() in fs/ocfs2/dlm/dlmrecovery.c ...
Diffstat (limited to 'fs')
-rw-r--r--fs/configfs/configfs_internal.h7
-rw-r--r--fs/configfs/dir.c289
-rw-r--r--fs/configfs/file.c28
-rw-r--r--fs/configfs/item.c29
-rw-r--r--fs/dlm/config.c20
-rw-r--r--fs/ocfs2/alloc.c2676
-rw-r--r--fs/ocfs2/alloc.h43
-rw-r--r--fs/ocfs2/aops.c1015
-rw-r--r--fs/ocfs2/aops.h61
-rw-r--r--fs/ocfs2/cluster/heartbeat.c96
-rw-r--r--fs/ocfs2/cluster/heartbeat.h6
-rw-r--r--fs/ocfs2/cluster/nodemanager.c42
-rw-r--r--fs/ocfs2/cluster/nodemanager.h5
-rw-r--r--fs/ocfs2/cluster/tcp.c21
-rw-r--r--fs/ocfs2/dir.c2
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c8
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c40
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c79
-rw-r--r--fs/ocfs2/dlmglue.c6
-rw-r--r--fs/ocfs2/endian.h5
-rw-r--r--fs/ocfs2/extent_map.c41
-rw-r--r--fs/ocfs2/file.c702
-rw-r--r--fs/ocfs2/file.h10
-rw-r--r--fs/ocfs2/heartbeat.c10
-rw-r--r--fs/ocfs2/ioctl.c15
-rw-r--r--fs/ocfs2/journal.c6
-rw-r--r--fs/ocfs2/journal.h2
-rw-r--r--fs/ocfs2/mmap.c167
-rw-r--r--fs/ocfs2/namei.c2
-rw-r--r--fs/ocfs2/ocfs2.h14
-rw-r--r--fs/ocfs2/ocfs2_fs.h33
-rw-r--r--fs/ocfs2/slot_map.c12
-rw-r--r--fs/ocfs2/suballoc.c46
-rw-r--r--fs/ocfs2/suballoc.h17
-rw-r--r--fs/ocfs2/super.c27
-rw-r--r--fs/ocfs2/super.h2
36 files changed, 4548 insertions, 1036 deletions
diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h
index 7b48c034b312..3b0185fdf9a4 100644
--- a/fs/configfs/configfs_internal.h
+++ b/fs/configfs/configfs_internal.h
@@ -29,10 +29,11 @@
29 29
30struct configfs_dirent { 30struct configfs_dirent {
31 atomic_t s_count; 31 atomic_t s_count;
32 int s_dependent_count;
32 struct list_head s_sibling; 33 struct list_head s_sibling;
33 struct list_head s_children; 34 struct list_head s_children;
34 struct list_head s_links; 35 struct list_head s_links;
35 void * s_element; 36 void * s_element;
36 int s_type; 37 int s_type;
37 umode_t s_mode; 38 umode_t s_mode;
38 struct dentry * s_dentry; 39 struct dentry * s_dentry;
@@ -41,8 +42,8 @@ struct configfs_dirent {
41 42
42#define CONFIGFS_ROOT 0x0001 43#define CONFIGFS_ROOT 0x0001
43#define CONFIGFS_DIR 0x0002 44#define CONFIGFS_DIR 0x0002
44#define CONFIGFS_ITEM_ATTR 0x0004 45#define CONFIGFS_ITEM_ATTR 0x0004
45#define CONFIGFS_ITEM_LINK 0x0020 46#define CONFIGFS_ITEM_LINK 0x0020
46#define CONFIGFS_USET_DIR 0x0040 47#define CONFIGFS_USET_DIR 0x0040
47#define CONFIGFS_USET_DEFAULT 0x0080 48#define CONFIGFS_USET_DEFAULT 0x0080
48#define CONFIGFS_USET_DROPPING 0x0100 49#define CONFIGFS_USET_DROPPING 0x0100
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 5e6e37e58f36..2f436d4f1d6d 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -355,6 +355,10 @@ static int configfs_detach_prep(struct dentry *dentry)
355 /* Mark that we've taken i_mutex */ 355 /* Mark that we've taken i_mutex */
356 sd->s_type |= CONFIGFS_USET_DROPPING; 356 sd->s_type |= CONFIGFS_USET_DROPPING;
357 357
358 /*
359 * Yup, recursive. If there's a problem, blame
360 * deep nesting of default_groups
361 */
358 ret = configfs_detach_prep(sd->s_dentry); 362 ret = configfs_detach_prep(sd->s_dentry);
359 if (!ret) 363 if (!ret)
360 continue; 364 continue;
@@ -562,7 +566,7 @@ static int populate_groups(struct config_group *group)
562 566
563/* 567/*
564 * All of link_obj/unlink_obj/link_group/unlink_group require that 568 * All of link_obj/unlink_obj/link_group/unlink_group require that
565 * subsys->su_sem is held. 569 * subsys->su_mutex is held.
566 */ 570 */
567 571
568static void unlink_obj(struct config_item *item) 572static void unlink_obj(struct config_item *item)
@@ -714,6 +718,28 @@ static void configfs_detach_group(struct config_item *item)
714} 718}
715 719
716/* 720/*
721 * After the item has been detached from the filesystem view, we are
722 * ready to tear it out of the hierarchy. Notify the client before
723 * we do that so they can perform any cleanup that requires
724 * navigating the hierarchy. A client does not need to provide this
725 * callback. The subsystem semaphore MUST be held by the caller, and
726 * references must be valid for both items. It also assumes the
727 * caller has validated ci_type.
728 */
729static void client_disconnect_notify(struct config_item *parent_item,
730 struct config_item *item)
731{
732 struct config_item_type *type;
733
734 type = parent_item->ci_type;
735 BUG_ON(!type);
736
737 if (type->ct_group_ops && type->ct_group_ops->disconnect_notify)
738 type->ct_group_ops->disconnect_notify(to_config_group(parent_item),
739 item);
740}
741
742/*
717 * Drop the initial reference from make_item()/make_group() 743 * Drop the initial reference from make_item()/make_group()
718 * This function assumes that reference is held on item 744 * This function assumes that reference is held on item
719 * and that item holds a valid reference to the parent. Also, it 745 * and that item holds a valid reference to the parent. Also, it
@@ -733,11 +759,244 @@ static void client_drop_item(struct config_item *parent_item,
733 */ 759 */
734 if (type->ct_group_ops && type->ct_group_ops->drop_item) 760 if (type->ct_group_ops && type->ct_group_ops->drop_item)
735 type->ct_group_ops->drop_item(to_config_group(parent_item), 761 type->ct_group_ops->drop_item(to_config_group(parent_item),
736 item); 762 item);
737 else 763 else
738 config_item_put(item); 764 config_item_put(item);
739} 765}
740 766
767#ifdef DEBUG
768static void configfs_dump_one(struct configfs_dirent *sd, int level)
769{
770 printk(KERN_INFO "%*s\"%s\":\n", level, " ", configfs_get_name(sd));
771
772#define type_print(_type) if (sd->s_type & _type) printk(KERN_INFO "%*s %s\n", level, " ", #_type);
773 type_print(CONFIGFS_ROOT);
774 type_print(CONFIGFS_DIR);
775 type_print(CONFIGFS_ITEM_ATTR);
776 type_print(CONFIGFS_ITEM_LINK);
777 type_print(CONFIGFS_USET_DIR);
778 type_print(CONFIGFS_USET_DEFAULT);
779 type_print(CONFIGFS_USET_DROPPING);
780#undef type_print
781}
782
783static int configfs_dump(struct configfs_dirent *sd, int level)
784{
785 struct configfs_dirent *child_sd;
786 int ret = 0;
787
788 configfs_dump_one(sd, level);
789
790 if (!(sd->s_type & (CONFIGFS_DIR|CONFIGFS_ROOT)))
791 return 0;
792
793 list_for_each_entry(child_sd, &sd->s_children, s_sibling) {
794 ret = configfs_dump(child_sd, level + 2);
795 if (ret)
796 break;
797 }
798
799 return ret;
800}
801#endif
802
803
804/*
805 * configfs_depend_item() and configfs_undepend_item()
806 *
807 * WARNING: Do not call these from a configfs callback!
808 *
809 * This describes these functions and their helpers.
810 *
811 * Allow another kernel system to depend on a config_item. If this
812 * happens, the item cannot go away until the dependant can live without
813 * it. The idea is to give client modules as simple an interface as
814 * possible. When a system asks them to depend on an item, they just
815 * call configfs_depend_item(). If the item is live and the client
816 * driver is in good shape, we'll happily do the work for them.
817 *
818 * Why is the locking complex? Because configfs uses the VFS to handle
819 * all locking, but this function is called outside the normal
820 * VFS->configfs path. So it must take VFS locks to prevent the
821 * VFS->configfs stuff (configfs_mkdir(), configfs_rmdir(), etc). This is
822 * why you can't call these functions underneath configfs callbacks.
823 *
824 * Note, btw, that this can be called at *any* time, even when a configfs
825 * subsystem isn't registered, or when configfs is loading or unloading.
826 * Just like configfs_register_subsystem(). So we take the same
827 * precautions. We pin the filesystem. We lock each i_mutex _in_order_
828 * on our way down the tree. If we can find the target item in the
829 * configfs tree, it must be part of the subsystem tree as well, so we
830 * do not need the subsystem semaphore. Holding the i_mutex chain locks
831 * out mkdir() and rmdir(), who might be racing us.
832 */
833
834/*
835 * configfs_depend_prep()
836 *
837 * Only subdirectories count here. Files (CONFIGFS_NOT_PINNED) are
838 * attributes. This is similar but not the same to configfs_detach_prep().
839 * Note that configfs_detach_prep() expects the parent to be locked when it
840 * is called, but we lock the parent *inside* configfs_depend_prep(). We
841 * do that so we can unlock it if we find nothing.
842 *
843 * Here we do a depth-first search of the dentry hierarchy looking for
844 * our object. We take i_mutex on each step of the way down. IT IS
845 * ESSENTIAL THAT i_mutex LOCKING IS ORDERED. If we come back up a branch,
846 * we'll drop the i_mutex.
847 *
848 * If the target is not found, -ENOENT is bubbled up and we have released
849 * all locks. If the target was found, the locks will be cleared by
850 * configfs_depend_rollback().
851 *
852 * This adds a requirement that all config_items be unique!
853 *
854 * This is recursive because the locking traversal is tricky. There isn't
855 * much on the stack, though, so folks that need this function - be careful
856 * about your stack! Patches will be accepted to make it iterative.
857 */
858static int configfs_depend_prep(struct dentry *origin,
859 struct config_item *target)
860{
861 struct configfs_dirent *child_sd, *sd = origin->d_fsdata;
862 int ret = 0;
863
864 BUG_ON(!origin || !sd);
865
866 /* Lock this guy on the way down */
867 mutex_lock(&sd->s_dentry->d_inode->i_mutex);
868 if (sd->s_element == target) /* Boo-yah */
869 goto out;
870
871 list_for_each_entry(child_sd, &sd->s_children, s_sibling) {
872 if (child_sd->s_type & CONFIGFS_DIR) {
873 ret = configfs_depend_prep(child_sd->s_dentry,
874 target);
875 if (!ret)
876 goto out; /* Child path boo-yah */
877 }
878 }
879
880 /* We looped all our children and didn't find target */
881 mutex_unlock(&sd->s_dentry->d_inode->i_mutex);
882 ret = -ENOENT;
883
884out:
885 return ret;
886}
887
888/*
889 * This is ONLY called if configfs_depend_prep() did its job. So we can
890 * trust the entire path from item back up to origin.
891 *
892 * We walk backwards from item, unlocking each i_mutex. We finish by
893 * unlocking origin.
894 */
895static void configfs_depend_rollback(struct dentry *origin,
896 struct config_item *item)
897{
898 struct dentry *dentry = item->ci_dentry;
899
900 while (dentry != origin) {
901 mutex_unlock(&dentry->d_inode->i_mutex);
902 dentry = dentry->d_parent;
903 }
904
905 mutex_unlock(&origin->d_inode->i_mutex);
906}
907
908int configfs_depend_item(struct configfs_subsystem *subsys,
909 struct config_item *target)
910{
911 int ret;
912 struct configfs_dirent *p, *root_sd, *subsys_sd = NULL;
913 struct config_item *s_item = &subsys->su_group.cg_item;
914
915 /*
916 * Pin the configfs filesystem. This means we can safely access
917 * the root of the configfs filesystem.
918 */
919 ret = configfs_pin_fs();
920 if (ret)
921 return ret;
922
923 /*
924 * Next, lock the root directory. We're going to check that the
925 * subsystem is really registered, and so we need to lock out
926 * configfs_[un]register_subsystem().
927 */
928 mutex_lock(&configfs_sb->s_root->d_inode->i_mutex);
929
930 root_sd = configfs_sb->s_root->d_fsdata;
931
932 list_for_each_entry(p, &root_sd->s_children, s_sibling) {
933 if (p->s_type & CONFIGFS_DIR) {
934 if (p->s_element == s_item) {
935 subsys_sd = p;
936 break;
937 }
938 }
939 }
940
941 if (!subsys_sd) {
942 ret = -ENOENT;
943 goto out_unlock_fs;
944 }
945
946 /* Ok, now we can trust subsys/s_item */
947
948 /* Scan the tree, locking i_mutex recursively, return 0 if found */
949 ret = configfs_depend_prep(subsys_sd->s_dentry, target);
950 if (ret)
951 goto out_unlock_fs;
952
953 /* We hold all i_mutexes from the subsystem down to the target */
954 p = target->ci_dentry->d_fsdata;
955 p->s_dependent_count += 1;
956
957 configfs_depend_rollback(subsys_sd->s_dentry, target);
958
959out_unlock_fs:
960 mutex_unlock(&configfs_sb->s_root->d_inode->i_mutex);
961
962 /*
963 * If we succeeded, the fs is pinned via other methods. If not,
964 * we're done with it anyway. So release_fs() is always right.
965 */
966 configfs_release_fs();
967
968 return ret;
969}
970EXPORT_SYMBOL(configfs_depend_item);
971
972/*
973 * Release the dependent linkage. This is much simpler than
974 * configfs_depend_item() because we know that that the client driver is
975 * pinned, thus the subsystem is pinned, and therefore configfs is pinned.
976 */
977void configfs_undepend_item(struct configfs_subsystem *subsys,
978 struct config_item *target)
979{
980 struct configfs_dirent *sd;
981
982 /*
983 * Since we can trust everything is pinned, we just need i_mutex
984 * on the item.
985 */
986 mutex_lock(&target->ci_dentry->d_inode->i_mutex);
987
988 sd = target->ci_dentry->d_fsdata;
989 BUG_ON(sd->s_dependent_count < 1);
990
991 sd->s_dependent_count -= 1;
992
993 /*
994 * After this unlock, we cannot trust the item to stay alive!
995 * DO NOT REFERENCE item after this unlock.
996 */
997 mutex_unlock(&target->ci_dentry->d_inode->i_mutex);
998}
999EXPORT_SYMBOL(configfs_undepend_item);
741 1000
742static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) 1001static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
743{ 1002{
@@ -783,7 +1042,7 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
783 1042
784 snprintf(name, dentry->d_name.len + 1, "%s", dentry->d_name.name); 1043 snprintf(name, dentry->d_name.len + 1, "%s", dentry->d_name.name);
785 1044
786 down(&subsys->su_sem); 1045 mutex_lock(&subsys->su_mutex);
787 group = NULL; 1046 group = NULL;
788 item = NULL; 1047 item = NULL;
789 if (type->ct_group_ops->make_group) { 1048 if (type->ct_group_ops->make_group) {
@@ -797,7 +1056,7 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
797 if (item) 1056 if (item)
798 link_obj(parent_item, item); 1057 link_obj(parent_item, item);
799 } 1058 }
800 up(&subsys->su_sem); 1059 mutex_unlock(&subsys->su_mutex);
801 1060
802 kfree(name); 1061 kfree(name);
803 if (!item) { 1062 if (!item) {
@@ -841,13 +1100,16 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
841out_unlink: 1100out_unlink:
842 if (ret) { 1101 if (ret) {
843 /* Tear down everything we built up */ 1102 /* Tear down everything we built up */
844 down(&subsys->su_sem); 1103 mutex_lock(&subsys->su_mutex);
1104
1105 client_disconnect_notify(parent_item, item);
845 if (group) 1106 if (group)
846 unlink_group(group); 1107 unlink_group(group);
847 else 1108 else
848 unlink_obj(item); 1109 unlink_obj(item);
849 client_drop_item(parent_item, item); 1110 client_drop_item(parent_item, item);
850 up(&subsys->su_sem); 1111
1112 mutex_unlock(&subsys->su_mutex);
851 1113
852 if (module_got) 1114 if (module_got)
853 module_put(owner); 1115 module_put(owner);
@@ -881,6 +1143,13 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
881 if (sd->s_type & CONFIGFS_USET_DEFAULT) 1143 if (sd->s_type & CONFIGFS_USET_DEFAULT)
882 return -EPERM; 1144 return -EPERM;
883 1145
1146 /*
1147 * Here's where we check for dependents. We're protected by
1148 * i_mutex.
1149 */
1150 if (sd->s_dependent_count)
1151 return -EBUSY;
1152
884 /* Get a working ref until we have the child */ 1153 /* Get a working ref until we have the child */
885 parent_item = configfs_get_config_item(dentry->d_parent); 1154 parent_item = configfs_get_config_item(dentry->d_parent);
886 subsys = to_config_group(parent_item)->cg_subsys; 1155 subsys = to_config_group(parent_item)->cg_subsys;
@@ -910,17 +1179,19 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
910 if (sd->s_type & CONFIGFS_USET_DIR) { 1179 if (sd->s_type & CONFIGFS_USET_DIR) {
911 configfs_detach_group(item); 1180 configfs_detach_group(item);
912 1181
913 down(&subsys->su_sem); 1182 mutex_lock(&subsys->su_mutex);
1183 client_disconnect_notify(parent_item, item);
914 unlink_group(to_config_group(item)); 1184 unlink_group(to_config_group(item));
915 } else { 1185 } else {
916 configfs_detach_item(item); 1186 configfs_detach_item(item);
917 1187
918 down(&subsys->su_sem); 1188 mutex_lock(&subsys->su_mutex);
1189 client_disconnect_notify(parent_item, item);
919 unlink_obj(item); 1190 unlink_obj(item);
920 } 1191 }
921 1192
922 client_drop_item(parent_item, item); 1193 client_drop_item(parent_item, item);
923 up(&subsys->su_sem); 1194 mutex_unlock(&subsys->su_mutex);
924 1195
925 /* Drop our reference from above */ 1196 /* Drop our reference from above */
926 config_item_put(item); 1197 config_item_put(item);
diff --git a/fs/configfs/file.c b/fs/configfs/file.c
index 3527c7c6def8..a3658f9a082c 100644
--- a/fs/configfs/file.c
+++ b/fs/configfs/file.c
@@ -27,19 +27,26 @@
27#include <linux/fs.h> 27#include <linux/fs.h>
28#include <linux/module.h> 28#include <linux/module.h>
29#include <linux/slab.h> 29#include <linux/slab.h>
30#include <linux/mutex.h>
30#include <asm/uaccess.h> 31#include <asm/uaccess.h>
31#include <asm/semaphore.h>
32 32
33#include <linux/configfs.h> 33#include <linux/configfs.h>
34#include "configfs_internal.h" 34#include "configfs_internal.h"
35 35
36/*
37 * A simple attribute can only be 4096 characters. Why 4k? Because the
38 * original code limited it to PAGE_SIZE. That's a bad idea, though,
39 * because an attribute of 16k on ia64 won't work on x86. So we limit to
40 * 4k, our minimum common page size.
41 */
42#define SIMPLE_ATTR_SIZE 4096
36 43
37struct configfs_buffer { 44struct configfs_buffer {
38 size_t count; 45 size_t count;
39 loff_t pos; 46 loff_t pos;
40 char * page; 47 char * page;
41 struct configfs_item_operations * ops; 48 struct configfs_item_operations * ops;
42 struct semaphore sem; 49 struct mutex mutex;
43 int needs_read_fill; 50 int needs_read_fill;
44}; 51};
45 52
@@ -69,7 +76,7 @@ static int fill_read_buffer(struct dentry * dentry, struct configfs_buffer * buf
69 76
70 count = ops->show_attribute(item,attr,buffer->page); 77 count = ops->show_attribute(item,attr,buffer->page);
71 buffer->needs_read_fill = 0; 78 buffer->needs_read_fill = 0;
72 BUG_ON(count > (ssize_t)PAGE_SIZE); 79 BUG_ON(count > (ssize_t)SIMPLE_ATTR_SIZE);
73 if (count >= 0) 80 if (count >= 0)
74 buffer->count = count; 81 buffer->count = count;
75 else 82 else
@@ -102,7 +109,7 @@ configfs_read_file(struct file *file, char __user *buf, size_t count, loff_t *pp
102 struct configfs_buffer * buffer = file->private_data; 109 struct configfs_buffer * buffer = file->private_data;
103 ssize_t retval = 0; 110 ssize_t retval = 0;
104 111
105 down(&buffer->sem); 112 mutex_lock(&buffer->mutex);
106 if (buffer->needs_read_fill) { 113 if (buffer->needs_read_fill) {
107 if ((retval = fill_read_buffer(file->f_path.dentry,buffer))) 114 if ((retval = fill_read_buffer(file->f_path.dentry,buffer)))
108 goto out; 115 goto out;
@@ -112,7 +119,7 @@ configfs_read_file(struct file *file, char __user *buf, size_t count, loff_t *pp
112 retval = simple_read_from_buffer(buf, count, ppos, buffer->page, 119 retval = simple_read_from_buffer(buf, count, ppos, buffer->page,
113 buffer->count); 120 buffer->count);
114out: 121out:
115 up(&buffer->sem); 122 mutex_unlock(&buffer->mutex);
116 return retval; 123 return retval;
117} 124}
118 125
@@ -137,8 +144,8 @@ fill_write_buffer(struct configfs_buffer * buffer, const char __user * buf, size
137 if (!buffer->page) 144 if (!buffer->page)
138 return -ENOMEM; 145 return -ENOMEM;
139 146
140 if (count >= PAGE_SIZE) 147 if (count >= SIMPLE_ATTR_SIZE)
141 count = PAGE_SIZE - 1; 148 count = SIMPLE_ATTR_SIZE - 1;
142 error = copy_from_user(buffer->page,buf,count); 149 error = copy_from_user(buffer->page,buf,count);
143 buffer->needs_read_fill = 1; 150 buffer->needs_read_fill = 1;
144 /* if buf is assumed to contain a string, terminate it by \0, 151 /* if buf is assumed to contain a string, terminate it by \0,
@@ -193,13 +200,13 @@ configfs_write_file(struct file *file, const char __user *buf, size_t count, lof
193 struct configfs_buffer * buffer = file->private_data; 200 struct configfs_buffer * buffer = file->private_data;
194 ssize_t len; 201 ssize_t len;
195 202
196 down(&buffer->sem); 203 mutex_lock(&buffer->mutex);
197 len = fill_write_buffer(buffer, buf, count); 204 len = fill_write_buffer(buffer, buf, count);
198 if (len > 0) 205 if (len > 0)
199 len = flush_write_buffer(file->f_path.dentry, buffer, count); 206 len = flush_write_buffer(file->f_path.dentry, buffer, count);
200 if (len > 0) 207 if (len > 0)
201 *ppos += len; 208 *ppos += len;
202 up(&buffer->sem); 209 mutex_unlock(&buffer->mutex);
203 return len; 210 return len;
204} 211}
205 212
@@ -253,7 +260,7 @@ static int check_perm(struct inode * inode, struct file * file)
253 error = -ENOMEM; 260 error = -ENOMEM;
254 goto Enomem; 261 goto Enomem;
255 } 262 }
256 init_MUTEX(&buffer->sem); 263 mutex_init(&buffer->mutex);
257 buffer->needs_read_fill = 1; 264 buffer->needs_read_fill = 1;
258 buffer->ops = ops; 265 buffer->ops = ops;
259 file->private_data = buffer; 266 file->private_data = buffer;
@@ -292,6 +299,7 @@ static int configfs_release(struct inode * inode, struct file * filp)
292 if (buffer) { 299 if (buffer) {
293 if (buffer->page) 300 if (buffer->page)
294 free_page((unsigned long)buffer->page); 301 free_page((unsigned long)buffer->page);
302 mutex_destroy(&buffer->mutex);
295 kfree(buffer); 303 kfree(buffer);
296 } 304 }
297 return 0; 305 return 0;
diff --git a/fs/configfs/item.c b/fs/configfs/item.c
index 24421209f854..76dc4c3e5d51 100644
--- a/fs/configfs/item.c
+++ b/fs/configfs/item.c
@@ -62,7 +62,6 @@ void config_item_init(struct config_item * item)
62 * dynamically allocated string that @item->ci_name points to. 62 * dynamically allocated string that @item->ci_name points to.
63 * Otherwise, use the static @item->ci_namebuf array. 63 * Otherwise, use the static @item->ci_namebuf array.
64 */ 64 */
65
66int config_item_set_name(struct config_item * item, const char * fmt, ...) 65int config_item_set_name(struct config_item * item, const char * fmt, ...)
67{ 66{
68 int error = 0; 67 int error = 0;
@@ -139,12 +138,7 @@ struct config_item * config_item_get(struct config_item * item)
139 return item; 138 return item;
140} 139}
141 140
142/** 141static void config_item_cleanup(struct config_item * item)
143 * config_item_cleanup - free config_item resources.
144 * @item: item.
145 */
146
147void config_item_cleanup(struct config_item * item)
148{ 142{
149 struct config_item_type * t = item->ci_type; 143 struct config_item_type * t = item->ci_type;
150 struct config_group * s = item->ci_group; 144 struct config_group * s = item->ci_group;
@@ -179,39 +173,35 @@ void config_item_put(struct config_item * item)
179 kref_put(&item->ci_kref, config_item_release); 173 kref_put(&item->ci_kref, config_item_release);
180} 174}
181 175
182
183/** 176/**
184 * config_group_init - initialize a group for use 177 * config_group_init - initialize a group for use
185 * @k: group 178 * @k: group
186 */ 179 */
187
188void config_group_init(struct config_group *group) 180void config_group_init(struct config_group *group)
189{ 181{
190 config_item_init(&group->cg_item); 182 config_item_init(&group->cg_item);
191 INIT_LIST_HEAD(&group->cg_children); 183 INIT_LIST_HEAD(&group->cg_children);
192} 184}
193 185
194
195/** 186/**
196 * config_group_find_obj - search for item in group. 187 * config_group_find_item - search for item in group.
197 * @group: group we're looking in. 188 * @group: group we're looking in.
198 * @name: item's name. 189 * @name: item's name.
199 * 190 *
200 * Lock group via @group->cg_subsys, and iterate over @group->cg_list, 191 * Iterate over @group->cg_list, looking for a matching config_item.
201 * looking for a matching config_item. If matching item is found 192 * If matching item is found take a reference and return the item.
202 * take a reference and return the item. 193 * Caller must have locked group via @group->cg_subsys->su_mtx.
203 */ 194 */
204 195struct config_item *config_group_find_item(struct config_group *group,
205struct config_item * config_group_find_obj(struct config_group * group, const char * name) 196 const char *name)
206{ 197{
207 struct list_head * entry; 198 struct list_head * entry;
208 struct config_item * ret = NULL; 199 struct config_item * ret = NULL;
209 200
210 /* XXX LOCKING! */
211 list_for_each(entry,&group->cg_children) { 201 list_for_each(entry,&group->cg_children) {
212 struct config_item * item = to_item(entry); 202 struct config_item * item = to_item(entry);
213 if (config_item_name(item) && 203 if (config_item_name(item) &&
214 !strcmp(config_item_name(item), name)) { 204 !strcmp(config_item_name(item), name)) {
215 ret = config_item_get(item); 205 ret = config_item_get(item);
216 break; 206 break;
217 } 207 }
@@ -219,9 +209,8 @@ struct config_item * config_group_find_obj(struct config_group * group, const ch
219 return ret; 209 return ret;
220} 210}
221 211
222
223EXPORT_SYMBOL(config_item_init); 212EXPORT_SYMBOL(config_item_init);
224EXPORT_SYMBOL(config_group_init); 213EXPORT_SYMBOL(config_group_init);
225EXPORT_SYMBOL(config_item_get); 214EXPORT_SYMBOL(config_item_get);
226EXPORT_SYMBOL(config_item_put); 215EXPORT_SYMBOL(config_item_put);
227EXPORT_SYMBOL(config_group_find_obj); 216EXPORT_SYMBOL(config_group_find_item);
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index 5069b2cb5a1f..2f8e3c81bc19 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -133,14 +133,6 @@ static ssize_t cluster_set(struct cluster *cl, unsigned int *cl_field,
133 return len; 133 return len;
134} 134}
135 135
136#define __CONFIGFS_ATTR(_name,_mode,_read,_write) { \
137 .attr = { .ca_name = __stringify(_name), \
138 .ca_mode = _mode, \
139 .ca_owner = THIS_MODULE }, \
140 .show = _read, \
141 .store = _write, \
142}
143
144#define CLUSTER_ATTR(name, check_zero) \ 136#define CLUSTER_ATTR(name, check_zero) \
145static ssize_t name##_write(struct cluster *cl, const char *buf, size_t len) \ 137static ssize_t name##_write(struct cluster *cl, const char *buf, size_t len) \
146{ \ 138{ \
@@ -615,7 +607,7 @@ static struct clusters clusters_root = {
615int dlm_config_init(void) 607int dlm_config_init(void)
616{ 608{
617 config_group_init(&clusters_root.subsys.su_group); 609 config_group_init(&clusters_root.subsys.su_group);
618 init_MUTEX(&clusters_root.subsys.su_sem); 610 mutex_init(&clusters_root.subsys.su_mutex);
619 return configfs_register_subsystem(&clusters_root.subsys); 611 return configfs_register_subsystem(&clusters_root.subsys);
620} 612}
621 613
@@ -759,9 +751,9 @@ static struct space *get_space(char *name)
759 if (!space_list) 751 if (!space_list)
760 return NULL; 752 return NULL;
761 753
762 down(&space_list->cg_subsys->su_sem); 754 mutex_lock(&space_list->cg_subsys->su_mutex);
763 i = config_group_find_obj(space_list, name); 755 i = config_group_find_item(space_list, name);
764 up(&space_list->cg_subsys->su_sem); 756 mutex_unlock(&space_list->cg_subsys->su_mutex);
765 757
766 return to_space(i); 758 return to_space(i);
767} 759}
@@ -780,7 +772,7 @@ static struct comm *get_comm(int nodeid, struct sockaddr_storage *addr)
780 if (!comm_list) 772 if (!comm_list)
781 return NULL; 773 return NULL;
782 774
783 down(&clusters_root.subsys.su_sem); 775 mutex_lock(&clusters_root.subsys.su_mutex);
784 776
785 list_for_each_entry(i, &comm_list->cg_children, ci_entry) { 777 list_for_each_entry(i, &comm_list->cg_children, ci_entry) {
786 cm = to_comm(i); 778 cm = to_comm(i);
@@ -800,7 +792,7 @@ static struct comm *get_comm(int nodeid, struct sockaddr_storage *addr)
800 break; 792 break;
801 } 793 }
802 } 794 }
803 up(&clusters_root.subsys.su_sem); 795 mutex_unlock(&clusters_root.subsys.su_mutex);
804 796
805 if (!found) 797 if (!found)
806 cm = NULL; 798 cm = NULL;
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 19712a7d145f..f5e11f4fa952 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -50,6 +50,8 @@
50#include "buffer_head_io.h" 50#include "buffer_head_io.h"
51 51
52static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc); 52static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
53static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
54 struct ocfs2_extent_block *eb);
53 55
54/* 56/*
55 * Structures which describe a path through a btree, and functions to 57 * Structures which describe a path through a btree, and functions to
@@ -117,6 +119,31 @@ static void ocfs2_free_path(struct ocfs2_path *path)
117} 119}
118 120
119/* 121/*
122 * All the elements of src into dest. After this call, src could be freed
123 * without affecting dest.
124 *
125 * Both paths should have the same root. Any non-root elements of dest
126 * will be freed.
127 */
128static void ocfs2_cp_path(struct ocfs2_path *dest, struct ocfs2_path *src)
129{
130 int i;
131
132 BUG_ON(path_root_bh(dest) != path_root_bh(src));
133 BUG_ON(path_root_el(dest) != path_root_el(src));
134
135 ocfs2_reinit_path(dest, 1);
136
137 for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
138 dest->p_node[i].bh = src->p_node[i].bh;
139 dest->p_node[i].el = src->p_node[i].el;
140
141 if (dest->p_node[i].bh)
142 get_bh(dest->p_node[i].bh);
143 }
144}
145
146/*
120 * Make the *dest path the same as src and re-initialize src path to 147 * Make the *dest path the same as src and re-initialize src path to
121 * have a root only. 148 * have a root only.
122 */ 149 */
@@ -212,10 +239,41 @@ out:
212 return ret; 239 return ret;
213} 240}
214 241
242/*
243 * Return the index of the extent record which contains cluster #v_cluster.
244 * -1 is returned if it was not found.
245 *
246 * Should work fine on interior and exterior nodes.
247 */
248int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster)
249{
250 int ret = -1;
251 int i;
252 struct ocfs2_extent_rec *rec;
253 u32 rec_end, rec_start, clusters;
254
255 for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
256 rec = &el->l_recs[i];
257
258 rec_start = le32_to_cpu(rec->e_cpos);
259 clusters = ocfs2_rec_clusters(el, rec);
260
261 rec_end = rec_start + clusters;
262
263 if (v_cluster >= rec_start && v_cluster < rec_end) {
264 ret = i;
265 break;
266 }
267 }
268
269 return ret;
270}
271
215enum ocfs2_contig_type { 272enum ocfs2_contig_type {
216 CONTIG_NONE = 0, 273 CONTIG_NONE = 0,
217 CONTIG_LEFT, 274 CONTIG_LEFT,
218 CONTIG_RIGHT 275 CONTIG_RIGHT,
276 CONTIG_LEFTRIGHT,
219}; 277};
220 278
221 279
@@ -253,6 +311,14 @@ static enum ocfs2_contig_type
253{ 311{
254 u64 blkno = le64_to_cpu(insert_rec->e_blkno); 312 u64 blkno = le64_to_cpu(insert_rec->e_blkno);
255 313
314 /*
315 * Refuse to coalesce extent records with different flag
316 * fields - we don't want to mix unwritten extents with user
317 * data.
318 */
319 if (ext->e_flags != insert_rec->e_flags)
320 return CONTIG_NONE;
321
256 if (ocfs2_extents_adjacent(ext, insert_rec) && 322 if (ocfs2_extents_adjacent(ext, insert_rec) &&
257 ocfs2_block_extent_contig(inode->i_sb, ext, blkno)) 323 ocfs2_block_extent_contig(inode->i_sb, ext, blkno))
258 return CONTIG_RIGHT; 324 return CONTIG_RIGHT;
@@ -277,7 +343,14 @@ enum ocfs2_append_type {
277 APPEND_TAIL, 343 APPEND_TAIL,
278}; 344};
279 345
346enum ocfs2_split_type {
347 SPLIT_NONE = 0,
348 SPLIT_LEFT,
349 SPLIT_RIGHT,
350};
351
280struct ocfs2_insert_type { 352struct ocfs2_insert_type {
353 enum ocfs2_split_type ins_split;
281 enum ocfs2_append_type ins_appending; 354 enum ocfs2_append_type ins_appending;
282 enum ocfs2_contig_type ins_contig; 355 enum ocfs2_contig_type ins_contig;
283 int ins_contig_index; 356 int ins_contig_index;
@@ -285,6 +358,13 @@ struct ocfs2_insert_type {
285 int ins_tree_depth; 358 int ins_tree_depth;
286}; 359};
287 360
361struct ocfs2_merge_ctxt {
362 enum ocfs2_contig_type c_contig_type;
363 int c_has_empty_extent;
364 int c_split_covers_rec;
365 int c_used_tail_recs;
366};
367
288/* 368/*
289 * How many free extents have we got before we need more meta data? 369 * How many free extents have we got before we need more meta data?
290 */ 370 */
@@ -384,13 +464,7 @@ static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
384 strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE); 464 strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
385 eb->h_blkno = cpu_to_le64(first_blkno); 465 eb->h_blkno = cpu_to_le64(first_blkno);
386 eb->h_fs_generation = cpu_to_le32(osb->fs_generation); 466 eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
387
388#ifndef OCFS2_USE_ALL_METADATA_SUBALLOCATORS
389 /* we always use slot zero's suballocator */
390 eb->h_suballoc_slot = 0;
391#else
392 eb->h_suballoc_slot = cpu_to_le16(osb->slot_num); 467 eb->h_suballoc_slot = cpu_to_le16(osb->slot_num);
393#endif
394 eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start); 468 eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start);
395 eb->h_list.l_count = 469 eb->h_list.l_count =
396 cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb)); 470 cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
@@ -461,7 +535,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
461 struct inode *inode, 535 struct inode *inode,
462 struct buffer_head *fe_bh, 536 struct buffer_head *fe_bh,
463 struct buffer_head *eb_bh, 537 struct buffer_head *eb_bh,
464 struct buffer_head *last_eb_bh, 538 struct buffer_head **last_eb_bh,
465 struct ocfs2_alloc_context *meta_ac) 539 struct ocfs2_alloc_context *meta_ac)
466{ 540{
467 int status, new_blocks, i; 541 int status, new_blocks, i;
@@ -476,7 +550,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
476 550
477 mlog_entry_void(); 551 mlog_entry_void();
478 552
479 BUG_ON(!last_eb_bh); 553 BUG_ON(!last_eb_bh || !*last_eb_bh);
480 554
481 fe = (struct ocfs2_dinode *) fe_bh->b_data; 555 fe = (struct ocfs2_dinode *) fe_bh->b_data;
482 556
@@ -507,7 +581,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
507 goto bail; 581 goto bail;
508 } 582 }
509 583
510 eb = (struct ocfs2_extent_block *)last_eb_bh->b_data; 584 eb = (struct ocfs2_extent_block *)(*last_eb_bh)->b_data;
511 new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list); 585 new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list);
512 586
513 /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be 587 /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
@@ -568,7 +642,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
568 * journal_dirty erroring as it won't unless we've aborted the 642 * journal_dirty erroring as it won't unless we've aborted the
569 * handle (in which case we would never be here) so reserving 643 * handle (in which case we would never be here) so reserving
570 * the write with journal_access is all we need to do. */ 644 * the write with journal_access is all we need to do. */
571 status = ocfs2_journal_access(handle, inode, last_eb_bh, 645 status = ocfs2_journal_access(handle, inode, *last_eb_bh,
572 OCFS2_JOURNAL_ACCESS_WRITE); 646 OCFS2_JOURNAL_ACCESS_WRITE);
573 if (status < 0) { 647 if (status < 0) {
574 mlog_errno(status); 648 mlog_errno(status);
@@ -601,10 +675,10 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
601 * next_leaf on the previously last-extent-block. */ 675 * next_leaf on the previously last-extent-block. */
602 fe->i_last_eb_blk = cpu_to_le64(new_last_eb_blk); 676 fe->i_last_eb_blk = cpu_to_le64(new_last_eb_blk);
603 677
604 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; 678 eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
605 eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk); 679 eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk);
606 680
607 status = ocfs2_journal_dirty(handle, last_eb_bh); 681 status = ocfs2_journal_dirty(handle, *last_eb_bh);
608 if (status < 0) 682 if (status < 0)
609 mlog_errno(status); 683 mlog_errno(status);
610 status = ocfs2_journal_dirty(handle, fe_bh); 684 status = ocfs2_journal_dirty(handle, fe_bh);
@@ -616,6 +690,14 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
616 mlog_errno(status); 690 mlog_errno(status);
617 } 691 }
618 692
693 /*
694 * Some callers want to track the rightmost leaf so pass it
695 * back here.
696 */
697 brelse(*last_eb_bh);
698 get_bh(new_eb_bhs[0]);
699 *last_eb_bh = new_eb_bhs[0];
700
619 status = 0; 701 status = 0;
620bail: 702bail:
621 if (new_eb_bhs) { 703 if (new_eb_bhs) {
@@ -829,6 +911,87 @@ bail:
829} 911}
830 912
831/* 913/*
914 * Grow a b-tree so that it has more records.
915 *
916 * We might shift the tree depth in which case existing paths should
917 * be considered invalid.
918 *
919 * Tree depth after the grow is returned via *final_depth.
920 *
921 * *last_eb_bh will be updated by ocfs2_add_branch().
922 */
923static int ocfs2_grow_tree(struct inode *inode, handle_t *handle,
924 struct buffer_head *di_bh, int *final_depth,
925 struct buffer_head **last_eb_bh,
926 struct ocfs2_alloc_context *meta_ac)
927{
928 int ret, shift;
929 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
930 int depth = le16_to_cpu(di->id2.i_list.l_tree_depth);
931 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
932 struct buffer_head *bh = NULL;
933
934 BUG_ON(meta_ac == NULL);
935
936 shift = ocfs2_find_branch_target(osb, inode, di_bh, &bh);
937 if (shift < 0) {
938 ret = shift;
939 mlog_errno(ret);
940 goto out;
941 }
942
943 /* We traveled all the way to the bottom of the allocation tree
944 * and didn't find room for any more extents - we need to add
945 * another tree level */
946 if (shift) {
947 BUG_ON(bh);
948 mlog(0, "need to shift tree depth (current = %d)\n", depth);
949
950 /* ocfs2_shift_tree_depth will return us a buffer with
951 * the new extent block (so we can pass that to
952 * ocfs2_add_branch). */
953 ret = ocfs2_shift_tree_depth(osb, handle, inode, di_bh,
954 meta_ac, &bh);
955 if (ret < 0) {
956 mlog_errno(ret);
957 goto out;
958 }
959 depth++;
960 if (depth == 1) {
961 /*
962 * Special case: we have room now if we shifted from
963 * tree_depth 0, so no more work needs to be done.
964 *
965 * We won't be calling add_branch, so pass
966 * back *last_eb_bh as the new leaf. At depth
967 * zero, it should always be null so there's
968 * no reason to brelse.
969 */
970 BUG_ON(*last_eb_bh);
971 get_bh(bh);
972 *last_eb_bh = bh;
973 goto out;
974 }
975 }
976
977 /* call ocfs2_add_branch to add the final part of the tree with
978 * the new data. */
979 mlog(0, "add branch. bh = %p\n", bh);
980 ret = ocfs2_add_branch(osb, handle, inode, di_bh, bh, last_eb_bh,
981 meta_ac);
982 if (ret < 0) {
983 mlog_errno(ret);
984 goto out;
985 }
986
987out:
988 if (final_depth)
989 *final_depth = depth;
990 brelse(bh);
991 return ret;
992}
993
994/*
832 * This is only valid for leaf nodes, which are the only ones that can 995 * This is only valid for leaf nodes, which are the only ones that can
833 * have empty extents anyway. 996 * have empty extents anyway.
834 */ 997 */
@@ -934,6 +1097,22 @@ static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el,
934 1097
935} 1098}
936 1099
1100static void ocfs2_remove_empty_extent(struct ocfs2_extent_list *el)
1101{
1102 int size, num_recs = le16_to_cpu(el->l_next_free_rec);
1103
1104 BUG_ON(num_recs == 0);
1105
1106 if (ocfs2_is_empty_extent(&el->l_recs[0])) {
1107 num_recs--;
1108 size = num_recs * sizeof(struct ocfs2_extent_rec);
1109 memmove(&el->l_recs[0], &el->l_recs[1], size);
1110 memset(&el->l_recs[num_recs], 0,
1111 sizeof(struct ocfs2_extent_rec));
1112 el->l_next_free_rec = cpu_to_le16(num_recs);
1113 }
1114}
1115
937/* 1116/*
938 * Create an empty extent record . 1117 * Create an empty extent record .
939 * 1118 *
@@ -1211,6 +1390,10 @@ static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec,
1211 * immediately to their right. 1390 * immediately to their right.
1212 */ 1391 */
1213 left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos); 1392 left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos);
1393 if (ocfs2_is_empty_extent(&right_child_el->l_recs[0])) {
1394 BUG_ON(le16_to_cpu(right_child_el->l_next_free_rec) <= 1);
1395 left_clusters = le32_to_cpu(right_child_el->l_recs[1].e_cpos);
1396 }
1214 left_clusters -= le32_to_cpu(left_rec->e_cpos); 1397 left_clusters -= le32_to_cpu(left_rec->e_cpos);
1215 left_rec->e_int_clusters = cpu_to_le32(left_clusters); 1398 left_rec->e_int_clusters = cpu_to_le32(left_clusters);
1216 1399
@@ -1531,10 +1714,16 @@ out:
1531 return ret; 1714 return ret;
1532} 1715}
1533 1716
1717/*
1718 * Extend the transaction by enough credits to complete the rotation,
1719 * and still leave at least the original number of credits allocated
1720 * to this transaction.
1721 */
1534static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth, 1722static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth,
1723 int op_credits,
1535 struct ocfs2_path *path) 1724 struct ocfs2_path *path)
1536{ 1725{
1537 int credits = (path->p_tree_depth - subtree_depth) * 2 + 1; 1726 int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits;
1538 1727
1539 if (handle->h_buffer_credits < credits) 1728 if (handle->h_buffer_credits < credits)
1540 return ocfs2_extend_trans(handle, credits); 1729 return ocfs2_extend_trans(handle, credits);
@@ -1568,6 +1757,29 @@ static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path,
1568 return 0; 1757 return 0;
1569} 1758}
1570 1759
1760static int ocfs2_leftmost_rec_contains(struct ocfs2_extent_list *el, u32 cpos)
1761{
1762 int next_free = le16_to_cpu(el->l_next_free_rec);
1763 unsigned int range;
1764 struct ocfs2_extent_rec *rec;
1765
1766 if (next_free == 0)
1767 return 0;
1768
1769 rec = &el->l_recs[0];
1770 if (ocfs2_is_empty_extent(rec)) {
1771 /* Empty list. */
1772 if (next_free == 1)
1773 return 0;
1774 rec = &el->l_recs[1];
1775 }
1776
1777 range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
1778 if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
1779 return 1;
1780 return 0;
1781}
1782
1571/* 1783/*
1572 * Rotate all the records in a btree right one record, starting at insert_cpos. 1784 * Rotate all the records in a btree right one record, starting at insert_cpos.
1573 * 1785 *
@@ -1586,11 +1798,12 @@ static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path,
1586 */ 1798 */
1587static int ocfs2_rotate_tree_right(struct inode *inode, 1799static int ocfs2_rotate_tree_right(struct inode *inode,
1588 handle_t *handle, 1800 handle_t *handle,
1801 enum ocfs2_split_type split,
1589 u32 insert_cpos, 1802 u32 insert_cpos,
1590 struct ocfs2_path *right_path, 1803 struct ocfs2_path *right_path,
1591 struct ocfs2_path **ret_left_path) 1804 struct ocfs2_path **ret_left_path)
1592{ 1805{
1593 int ret, start; 1806 int ret, start, orig_credits = handle->h_buffer_credits;
1594 u32 cpos; 1807 u32 cpos;
1595 struct ocfs2_path *left_path = NULL; 1808 struct ocfs2_path *left_path = NULL;
1596 1809
@@ -1657,9 +1870,9 @@ static int ocfs2_rotate_tree_right(struct inode *inode,
1657 (unsigned long long) 1870 (unsigned long long)
1658 path_leaf_bh(left_path)->b_blocknr); 1871 path_leaf_bh(left_path)->b_blocknr);
1659 1872
1660 if (ocfs2_rotate_requires_path_adjustment(left_path, 1873 if (split == SPLIT_NONE &&
1874 ocfs2_rotate_requires_path_adjustment(left_path,
1661 insert_cpos)) { 1875 insert_cpos)) {
1662 mlog(0, "Path adjustment required\n");
1663 1876
1664 /* 1877 /*
1665 * We've rotated the tree as much as we 1878 * We've rotated the tree as much as we
@@ -1687,7 +1900,7 @@ static int ocfs2_rotate_tree_right(struct inode *inode,
1687 right_path->p_tree_depth); 1900 right_path->p_tree_depth);
1688 1901
1689 ret = ocfs2_extend_rotate_transaction(handle, start, 1902 ret = ocfs2_extend_rotate_transaction(handle, start,
1690 right_path); 1903 orig_credits, right_path);
1691 if (ret) { 1904 if (ret) {
1692 mlog_errno(ret); 1905 mlog_errno(ret);
1693 goto out; 1906 goto out;
@@ -1700,6 +1913,24 @@ static int ocfs2_rotate_tree_right(struct inode *inode,
1700 goto out; 1913 goto out;
1701 } 1914 }
1702 1915
1916 if (split != SPLIT_NONE &&
1917 ocfs2_leftmost_rec_contains(path_leaf_el(right_path),
1918 insert_cpos)) {
1919 /*
1920 * A rotate moves the rightmost left leaf
1921 * record over to the leftmost right leaf
1922 * slot. If we're doing an extent split
1923 * instead of a real insert, then we have to
1924 * check that the extent to be split wasn't
1925 * just moved over. If it was, then we can
1926 * exit here, passing left_path back -
1927 * ocfs2_split_extent() is smart enough to
1928 * search both leaves.
1929 */
1930 *ret_left_path = left_path;
1931 goto out_ret_path;
1932 }
1933
1703 /* 1934 /*
1704 * There is no need to re-read the next right path 1935 * There is no need to re-read the next right path
1705 * as we know that it'll be our current left 1936 * as we know that it'll be our current left
@@ -1722,6 +1953,1031 @@ out_ret_path:
1722 return ret; 1953 return ret;
1723} 1954}
1724 1955
1956static void ocfs2_update_edge_lengths(struct inode *inode, handle_t *handle,
1957 struct ocfs2_path *path)
1958{
1959 int i, idx;
1960 struct ocfs2_extent_rec *rec;
1961 struct ocfs2_extent_list *el;
1962 struct ocfs2_extent_block *eb;
1963 u32 range;
1964
1965 /* Path should always be rightmost. */
1966 eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
1967 BUG_ON(eb->h_next_leaf_blk != 0ULL);
1968
1969 el = &eb->h_list;
1970 BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
1971 idx = le16_to_cpu(el->l_next_free_rec) - 1;
1972 rec = &el->l_recs[idx];
1973 range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
1974
1975 for (i = 0; i < path->p_tree_depth; i++) {
1976 el = path->p_node[i].el;
1977 idx = le16_to_cpu(el->l_next_free_rec) - 1;
1978 rec = &el->l_recs[idx];
1979
1980 rec->e_int_clusters = cpu_to_le32(range);
1981 le32_add_cpu(&rec->e_int_clusters, -le32_to_cpu(rec->e_cpos));
1982
1983 ocfs2_journal_dirty(handle, path->p_node[i].bh);
1984 }
1985}
1986
1987static void ocfs2_unlink_path(struct inode *inode, handle_t *handle,
1988 struct ocfs2_cached_dealloc_ctxt *dealloc,
1989 struct ocfs2_path *path, int unlink_start)
1990{
1991 int ret, i;
1992 struct ocfs2_extent_block *eb;
1993 struct ocfs2_extent_list *el;
1994 struct buffer_head *bh;
1995
1996 for(i = unlink_start; i < path_num_items(path); i++) {
1997 bh = path->p_node[i].bh;
1998
1999 eb = (struct ocfs2_extent_block *)bh->b_data;
2000 /*
2001 * Not all nodes might have had their final count
2002 * decremented by the caller - handle this here.
2003 */
2004 el = &eb->h_list;
2005 if (le16_to_cpu(el->l_next_free_rec) > 1) {
2006 mlog(ML_ERROR,
2007 "Inode %llu, attempted to remove extent block "
2008 "%llu with %u records\n",
2009 (unsigned long long)OCFS2_I(inode)->ip_blkno,
2010 (unsigned long long)le64_to_cpu(eb->h_blkno),
2011 le16_to_cpu(el->l_next_free_rec));
2012
2013 ocfs2_journal_dirty(handle, bh);
2014 ocfs2_remove_from_cache(inode, bh);
2015 continue;
2016 }
2017
2018 el->l_next_free_rec = 0;
2019 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2020
2021 ocfs2_journal_dirty(handle, bh);
2022
2023 ret = ocfs2_cache_extent_block_free(dealloc, eb);
2024 if (ret)
2025 mlog_errno(ret);
2026
2027 ocfs2_remove_from_cache(inode, bh);
2028 }
2029}
2030
2031static void ocfs2_unlink_subtree(struct inode *inode, handle_t *handle,
2032 struct ocfs2_path *left_path,
2033 struct ocfs2_path *right_path,
2034 int subtree_index,
2035 struct ocfs2_cached_dealloc_ctxt *dealloc)
2036{
2037 int i;
2038 struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
2039 struct ocfs2_extent_list *root_el = left_path->p_node[subtree_index].el;
2040 struct ocfs2_extent_list *el;
2041 struct ocfs2_extent_block *eb;
2042
2043 el = path_leaf_el(left_path);
2044
2045 eb = (struct ocfs2_extent_block *)right_path->p_node[subtree_index + 1].bh->b_data;
2046
2047 for(i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++)
2048 if (root_el->l_recs[i].e_blkno == eb->h_blkno)
2049 break;
2050
2051 BUG_ON(i >= le16_to_cpu(root_el->l_next_free_rec));
2052
2053 memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
2054 le16_add_cpu(&root_el->l_next_free_rec, -1);
2055
2056 eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2057 eb->h_next_leaf_blk = 0;
2058
2059 ocfs2_journal_dirty(handle, root_bh);
2060 ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
2061
2062 ocfs2_unlink_path(inode, handle, dealloc, right_path,
2063 subtree_index + 1);
2064}
2065
2066static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
2067 struct ocfs2_path *left_path,
2068 struct ocfs2_path *right_path,
2069 int subtree_index,
2070 struct ocfs2_cached_dealloc_ctxt *dealloc,
2071 int *deleted)
2072{
2073 int ret, i, del_right_subtree = 0, right_has_empty = 0;
2074 struct buffer_head *root_bh, *di_bh = path_root_bh(right_path);
2075 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
2076 struct ocfs2_extent_list *right_leaf_el, *left_leaf_el;
2077 struct ocfs2_extent_block *eb;
2078
2079 *deleted = 0;
2080
2081 right_leaf_el = path_leaf_el(right_path);
2082 left_leaf_el = path_leaf_el(left_path);
2083 root_bh = left_path->p_node[subtree_index].bh;
2084 BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
2085
2086 if (!ocfs2_is_empty_extent(&left_leaf_el->l_recs[0]))
2087 return 0;
2088
2089 eb = (struct ocfs2_extent_block *)path_leaf_bh(right_path)->b_data;
2090 if (ocfs2_is_empty_extent(&right_leaf_el->l_recs[0])) {
2091 /*
2092 * It's legal for us to proceed if the right leaf is
2093 * the rightmost one and it has an empty extent. There
2094 * are two cases to handle - whether the leaf will be
2095 * empty after removal or not. If the leaf isn't empty
2096 * then just remove the empty extent up front. The
2097 * next block will handle empty leaves by flagging
2098 * them for unlink.
2099 *
2100 * Non rightmost leaves will throw -EAGAIN and the
2101 * caller can manually move the subtree and retry.
2102 */
2103
2104 if (eb->h_next_leaf_blk != 0ULL)
2105 return -EAGAIN;
2106
2107 if (le16_to_cpu(right_leaf_el->l_next_free_rec) > 1) {
2108 ret = ocfs2_journal_access(handle, inode,
2109 path_leaf_bh(right_path),
2110 OCFS2_JOURNAL_ACCESS_WRITE);
2111 if (ret) {
2112 mlog_errno(ret);
2113 goto out;
2114 }
2115
2116 ocfs2_remove_empty_extent(right_leaf_el);
2117 } else
2118 right_has_empty = 1;
2119 }
2120
2121 if (eb->h_next_leaf_blk == 0ULL &&
2122 le16_to_cpu(right_leaf_el->l_next_free_rec) == 1) {
2123 /*
2124 * We have to update i_last_eb_blk during the meta
2125 * data delete.
2126 */
2127 ret = ocfs2_journal_access(handle, inode, di_bh,
2128 OCFS2_JOURNAL_ACCESS_WRITE);
2129 if (ret) {
2130 mlog_errno(ret);
2131 goto out;
2132 }
2133
2134 del_right_subtree = 1;
2135 }
2136
2137 /*
2138 * Getting here with an empty extent in the right path implies
2139 * that it's the rightmost path and will be deleted.
2140 */
2141 BUG_ON(right_has_empty && !del_right_subtree);
2142
2143 ret = ocfs2_journal_access(handle, inode, root_bh,
2144 OCFS2_JOURNAL_ACCESS_WRITE);
2145 if (ret) {
2146 mlog_errno(ret);
2147 goto out;
2148 }
2149
2150 for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
2151 ret = ocfs2_journal_access(handle, inode,
2152 right_path->p_node[i].bh,
2153 OCFS2_JOURNAL_ACCESS_WRITE);
2154 if (ret) {
2155 mlog_errno(ret);
2156 goto out;
2157 }
2158
2159 ret = ocfs2_journal_access(handle, inode,
2160 left_path->p_node[i].bh,
2161 OCFS2_JOURNAL_ACCESS_WRITE);
2162 if (ret) {
2163 mlog_errno(ret);
2164 goto out;
2165 }
2166 }
2167
2168 if (!right_has_empty) {
2169 /*
2170 * Only do this if we're moving a real
2171 * record. Otherwise, the action is delayed until
2172 * after removal of the right path in which case we
2173 * can do a simple shift to remove the empty extent.
2174 */
2175 ocfs2_rotate_leaf(left_leaf_el, &right_leaf_el->l_recs[0]);
2176 memset(&right_leaf_el->l_recs[0], 0,
2177 sizeof(struct ocfs2_extent_rec));
2178 }
2179 if (eb->h_next_leaf_blk == 0ULL) {
2180 /*
2181 * Move recs over to get rid of empty extent, decrease
2182 * next_free. This is allowed to remove the last
2183 * extent in our leaf (setting l_next_free_rec to
2184 * zero) - the delete code below won't care.
2185 */
2186 ocfs2_remove_empty_extent(right_leaf_el);
2187 }
2188
2189 ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
2190 if (ret)
2191 mlog_errno(ret);
2192 ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
2193 if (ret)
2194 mlog_errno(ret);
2195
2196 if (del_right_subtree) {
2197 ocfs2_unlink_subtree(inode, handle, left_path, right_path,
2198 subtree_index, dealloc);
2199 ocfs2_update_edge_lengths(inode, handle, left_path);
2200
2201 eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2202 di->i_last_eb_blk = eb->h_blkno;
2203
2204 /*
2205 * Removal of the extent in the left leaf was skipped
2206 * above so we could delete the right path
2207 * 1st.
2208 */
2209 if (right_has_empty)
2210 ocfs2_remove_empty_extent(left_leaf_el);
2211
2212 ret = ocfs2_journal_dirty(handle, di_bh);
2213 if (ret)
2214 mlog_errno(ret);
2215
2216 *deleted = 1;
2217 } else
2218 ocfs2_complete_edge_insert(inode, handle, left_path, right_path,
2219 subtree_index);
2220
2221out:
2222 return ret;
2223}
2224
2225/*
2226 * Given a full path, determine what cpos value would return us a path
2227 * containing the leaf immediately to the right of the current one.
2228 *
2229 * Will return zero if the path passed in is already the rightmost path.
2230 *
2231 * This looks similar, but is subtly different to
2232 * ocfs2_find_cpos_for_left_leaf().
2233 */
2234static int ocfs2_find_cpos_for_right_leaf(struct super_block *sb,
2235 struct ocfs2_path *path, u32 *cpos)
2236{
2237 int i, j, ret = 0;
2238 u64 blkno;
2239 struct ocfs2_extent_list *el;
2240
2241 *cpos = 0;
2242
2243 if (path->p_tree_depth == 0)
2244 return 0;
2245
2246 blkno = path_leaf_bh(path)->b_blocknr;
2247
2248 /* Start at the tree node just above the leaf and work our way up. */
2249 i = path->p_tree_depth - 1;
2250 while (i >= 0) {
2251 int next_free;
2252
2253 el = path->p_node[i].el;
2254
2255 /*
2256 * Find the extent record just after the one in our
2257 * path.
2258 */
2259 next_free = le16_to_cpu(el->l_next_free_rec);
2260 for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) {
2261 if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) {
2262 if (j == (next_free - 1)) {
2263 if (i == 0) {
2264 /*
2265 * We've determined that the
2266 * path specified is already
2267 * the rightmost one - return a
2268 * cpos of zero.
2269 */
2270 goto out;
2271 }
2272 /*
2273 * The rightmost record points to our
2274 * leaf - we need to travel up the
2275 * tree one level.
2276 */
2277 goto next_node;
2278 }
2279
2280 *cpos = le32_to_cpu(el->l_recs[j + 1].e_cpos);
2281 goto out;
2282 }
2283 }
2284
2285 /*
2286 * If we got here, we never found a valid node where
2287 * the tree indicated one should be.
2288 */
2289 ocfs2_error(sb,
2290 "Invalid extent tree at extent block %llu\n",
2291 (unsigned long long)blkno);
2292 ret = -EROFS;
2293 goto out;
2294
2295next_node:
2296 blkno = path->p_node[i].bh->b_blocknr;
2297 i--;
2298 }
2299
2300out:
2301 return ret;
2302}
2303
2304static int ocfs2_rotate_rightmost_leaf_left(struct inode *inode,
2305 handle_t *handle,
2306 struct buffer_head *bh,
2307 struct ocfs2_extent_list *el)
2308{
2309 int ret;
2310
2311 if (!ocfs2_is_empty_extent(&el->l_recs[0]))
2312 return 0;
2313
2314 ret = ocfs2_journal_access(handle, inode, bh,
2315 OCFS2_JOURNAL_ACCESS_WRITE);
2316 if (ret) {
2317 mlog_errno(ret);
2318 goto out;
2319 }
2320
2321 ocfs2_remove_empty_extent(el);
2322
2323 ret = ocfs2_journal_dirty(handle, bh);
2324 if (ret)
2325 mlog_errno(ret);
2326
2327out:
2328 return ret;
2329}
2330
2331static int __ocfs2_rotate_tree_left(struct inode *inode,
2332 handle_t *handle, int orig_credits,
2333 struct ocfs2_path *path,
2334 struct ocfs2_cached_dealloc_ctxt *dealloc,
2335 struct ocfs2_path **empty_extent_path)
2336{
2337 int ret, subtree_root, deleted;
2338 u32 right_cpos;
2339 struct ocfs2_path *left_path = NULL;
2340 struct ocfs2_path *right_path = NULL;
2341
2342 BUG_ON(!ocfs2_is_empty_extent(&(path_leaf_el(path)->l_recs[0])));
2343
2344 *empty_extent_path = NULL;
2345
2346 ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, path,
2347 &right_cpos);
2348 if (ret) {
2349 mlog_errno(ret);
2350 goto out;
2351 }
2352
2353 left_path = ocfs2_new_path(path_root_bh(path),
2354 path_root_el(path));
2355 if (!left_path) {
2356 ret = -ENOMEM;
2357 mlog_errno(ret);
2358 goto out;
2359 }
2360
2361 ocfs2_cp_path(left_path, path);
2362
2363 right_path = ocfs2_new_path(path_root_bh(path),
2364 path_root_el(path));
2365 if (!right_path) {
2366 ret = -ENOMEM;
2367 mlog_errno(ret);
2368 goto out;
2369 }
2370
2371 while (right_cpos) {
2372 ret = ocfs2_find_path(inode, right_path, right_cpos);
2373 if (ret) {
2374 mlog_errno(ret);
2375 goto out;
2376 }
2377
2378 subtree_root = ocfs2_find_subtree_root(inode, left_path,
2379 right_path);
2380
2381 mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n",
2382 subtree_root,
2383 (unsigned long long)
2384 right_path->p_node[subtree_root].bh->b_blocknr,
2385 right_path->p_tree_depth);
2386
2387 ret = ocfs2_extend_rotate_transaction(handle, subtree_root,
2388 orig_credits, left_path);
2389 if (ret) {
2390 mlog_errno(ret);
2391 goto out;
2392 }
2393
2394 ret = ocfs2_rotate_subtree_left(inode, handle, left_path,
2395 right_path, subtree_root,
2396 dealloc, &deleted);
2397 if (ret == -EAGAIN) {
2398 /*
2399 * The rotation has to temporarily stop due to
2400 * the right subtree having an empty
2401 * extent. Pass it back to the caller for a
2402 * fixup.
2403 */
2404 *empty_extent_path = right_path;
2405 right_path = NULL;
2406 goto out;
2407 }
2408 if (ret) {
2409 mlog_errno(ret);
2410 goto out;
2411 }
2412
2413 /*
2414 * The subtree rotate might have removed records on
2415 * the rightmost edge. If so, then rotation is
2416 * complete.
2417 */
2418 if (deleted)
2419 break;
2420
2421 ocfs2_mv_path(left_path, right_path);
2422
2423 ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, left_path,
2424 &right_cpos);
2425 if (ret) {
2426 mlog_errno(ret);
2427 goto out;
2428 }
2429 }
2430
2431out:
2432 ocfs2_free_path(right_path);
2433 ocfs2_free_path(left_path);
2434
2435 return ret;
2436}
2437
2438static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
2439 struct ocfs2_path *path,
2440 struct ocfs2_cached_dealloc_ctxt *dealloc)
2441{
2442 int ret, subtree_index;
2443 u32 cpos;
2444 struct ocfs2_path *left_path = NULL;
2445 struct ocfs2_dinode *di;
2446 struct ocfs2_extent_block *eb;
2447 struct ocfs2_extent_list *el;
2448
2449 /*
2450 * XXX: This code assumes that the root is an inode, which is
2451 * true for now but may change as tree code gets generic.
2452 */
2453 di = (struct ocfs2_dinode *)path_root_bh(path)->b_data;
2454 if (!OCFS2_IS_VALID_DINODE(di)) {
2455 ret = -EIO;
2456 ocfs2_error(inode->i_sb,
2457 "Inode %llu has invalid path root",
2458 (unsigned long long)OCFS2_I(inode)->ip_blkno);
2459 goto out;
2460 }
2461
2462 /*
2463 * There are two ways we handle this, depending on
2464 * whether path is the only existing one.
2465 */
2466 ret = ocfs2_extend_rotate_transaction(handle, 0,
2467 handle->h_buffer_credits,
2468 path);
2469 if (ret) {
2470 mlog_errno(ret);
2471 goto out;
2472 }
2473
2474 ret = ocfs2_journal_access_path(inode, handle, path);
2475 if (ret) {
2476 mlog_errno(ret);
2477 goto out;
2478 }
2479
2480 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos);
2481 if (ret) {
2482 mlog_errno(ret);
2483 goto out;
2484 }
2485
2486 if (cpos) {
2487 /*
2488 * We have a path to the left of this one - it needs
2489 * an update too.
2490 */
2491 left_path = ocfs2_new_path(path_root_bh(path),
2492 path_root_el(path));
2493 if (!left_path) {
2494 ret = -ENOMEM;
2495 mlog_errno(ret);
2496 goto out;
2497 }
2498
2499 ret = ocfs2_find_path(inode, left_path, cpos);
2500 if (ret) {
2501 mlog_errno(ret);
2502 goto out;
2503 }
2504
2505 ret = ocfs2_journal_access_path(inode, handle, left_path);
2506 if (ret) {
2507 mlog_errno(ret);
2508 goto out;
2509 }
2510
2511 subtree_index = ocfs2_find_subtree_root(inode, left_path, path);
2512
2513 ocfs2_unlink_subtree(inode, handle, left_path, path,
2514 subtree_index, dealloc);
2515 ocfs2_update_edge_lengths(inode, handle, left_path);
2516
2517 eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2518 di->i_last_eb_blk = eb->h_blkno;
2519 } else {
2520 /*
2521 * 'path' is also the leftmost path which
2522 * means it must be the only one. This gets
2523 * handled differently because we want to
2524 * revert the inode back to having extents
2525 * in-line.
2526 */
2527 ocfs2_unlink_path(inode, handle, dealloc, path, 1);
2528
2529 el = &di->id2.i_list;
2530 el->l_tree_depth = 0;
2531 el->l_next_free_rec = 0;
2532 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2533
2534 di->i_last_eb_blk = 0;
2535 }
2536
2537 ocfs2_journal_dirty(handle, path_root_bh(path));
2538
2539out:
2540 ocfs2_free_path(left_path);
2541 return ret;
2542}
2543
2544/*
2545 * Left rotation of btree records.
2546 *
2547 * In many ways, this is (unsurprisingly) the opposite of right
2548 * rotation. We start at some non-rightmost path containing an empty
2549 * extent in the leaf block. The code works its way to the rightmost
2550 * path by rotating records to the left in every subtree.
2551 *
2552 * This is used by any code which reduces the number of extent records
2553 * in a leaf. After removal, an empty record should be placed in the
2554 * leftmost list position.
2555 *
2556 * This won't handle a length update of the rightmost path records if
2557 * the rightmost tree leaf record is removed so the caller is
2558 * responsible for detecting and correcting that.
2559 */
2560static int ocfs2_rotate_tree_left(struct inode *inode, handle_t *handle,
2561 struct ocfs2_path *path,
2562 struct ocfs2_cached_dealloc_ctxt *dealloc)
2563{
2564 int ret, orig_credits = handle->h_buffer_credits;
2565 struct ocfs2_path *tmp_path = NULL, *restart_path = NULL;
2566 struct ocfs2_extent_block *eb;
2567 struct ocfs2_extent_list *el;
2568
2569 el = path_leaf_el(path);
2570 if (!ocfs2_is_empty_extent(&el->l_recs[0]))
2571 return 0;
2572
2573 if (path->p_tree_depth == 0) {
2574rightmost_no_delete:
2575 /*
2576 * In-inode extents. This is trivially handled, so do
2577 * it up front.
2578 */
2579 ret = ocfs2_rotate_rightmost_leaf_left(inode, handle,
2580 path_leaf_bh(path),
2581 path_leaf_el(path));
2582 if (ret)
2583 mlog_errno(ret);
2584 goto out;
2585 }
2586
2587 /*
2588 * Handle rightmost branch now. There are several cases:
2589 * 1) simple rotation leaving records in there. That's trivial.
2590 * 2) rotation requiring a branch delete - there's no more
2591 * records left. Two cases of this:
2592 * a) There are branches to the left.
2593 * b) This is also the leftmost (the only) branch.
2594 *
2595 * 1) is handled via ocfs2_rotate_rightmost_leaf_left()
2596 * 2a) we need the left branch so that we can update it with the unlink
2597 * 2b) we need to bring the inode back to inline extents.
2598 */
2599
2600 eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
2601 el = &eb->h_list;
2602 if (eb->h_next_leaf_blk == 0) {
2603 /*
2604 * This gets a bit tricky if we're going to delete the
2605 * rightmost path. Get the other cases out of the way
2606 * 1st.
2607 */
2608 if (le16_to_cpu(el->l_next_free_rec) > 1)
2609 goto rightmost_no_delete;
2610
2611 if (le16_to_cpu(el->l_next_free_rec) == 0) {
2612 ret = -EIO;
2613 ocfs2_error(inode->i_sb,
2614 "Inode %llu has empty extent block at %llu",
2615 (unsigned long long)OCFS2_I(inode)->ip_blkno,
2616 (unsigned long long)le64_to_cpu(eb->h_blkno));
2617 goto out;
2618 }
2619
2620 /*
2621 * XXX: The caller can not trust "path" any more after
2622 * this as it will have been deleted. What do we do?
2623 *
2624 * In theory the rotate-for-merge code will never get
2625 * here because it'll always ask for a rotate in a
2626 * nonempty list.
2627 */
2628
2629 ret = ocfs2_remove_rightmost_path(inode, handle, path,
2630 dealloc);
2631 if (ret)
2632 mlog_errno(ret);
2633 goto out;
2634 }
2635
2636 /*
2637 * Now we can loop, remembering the path we get from -EAGAIN
2638 * and restarting from there.
2639 */
2640try_rotate:
2641 ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits, path,
2642 dealloc, &restart_path);
2643 if (ret && ret != -EAGAIN) {
2644 mlog_errno(ret);
2645 goto out;
2646 }
2647
2648 while (ret == -EAGAIN) {
2649 tmp_path = restart_path;
2650 restart_path = NULL;
2651
2652 ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits,
2653 tmp_path, dealloc,
2654 &restart_path);
2655 if (ret && ret != -EAGAIN) {
2656 mlog_errno(ret);
2657 goto out;
2658 }
2659
2660 ocfs2_free_path(tmp_path);
2661 tmp_path = NULL;
2662
2663 if (ret == 0)
2664 goto try_rotate;
2665 }
2666
2667out:
2668 ocfs2_free_path(tmp_path);
2669 ocfs2_free_path(restart_path);
2670 return ret;
2671}
2672
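Illustrative sketch (not part of this patch): what a single leaf sees during a left rotation - the empty record in slot 0 is squeezed out and next_free drops by one. The types are simplified host-endian stand-ins; the kernel version also has to journal the leaf buffer around the modification, which is omitted here.

#include <stdio.h>
#include <string.h>
#include <stdint.h>

struct xrec  { uint32_t cpos; uint16_t clusters; };
struct xlist { uint16_t next_free; struct xrec recs[4]; };

static int rec_is_empty(const struct xrec *r) { return r->clusters == 0; }

static void remove_empty_extent(struct xlist *el)
{
	if (!el->next_free || !rec_is_empty(&el->recs[0]))
		return;
	/* shift recs[1..next_free-1] down one slot, then clear the vacated tail */
	memmove(&el->recs[0], &el->recs[1],
		(el->next_free - 1) * sizeof(struct xrec));
	memset(&el->recs[el->next_free - 1], 0, sizeof(struct xrec));
	el->next_free--;
}

int main(void)
{
	struct xlist el = { .next_free = 3,
			    .recs = { { 0, 0 }, { 0, 8 }, { 8, 4 } } };

	remove_empty_extent(&el);
	for (int i = 0; i < el.next_free; i++)	/* prints (0,8) then (8,4) */
		printf("rec %d: cpos %u, clusters %u\n", i,
		       (unsigned)el.recs[i].cpos, (unsigned)el.recs[i].clusters);
	return 0;
}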
2673static void ocfs2_cleanup_merge(struct ocfs2_extent_list *el,
2674 int index)
2675{
2676 struct ocfs2_extent_rec *rec = &el->l_recs[index];
2677 unsigned int size;
2678
2679 if (rec->e_leaf_clusters == 0) {
2680 /*
2681 * We consumed all of the merged-from record. An empty
2682 * extent cannot exist anywhere but the 1st array
2683 * position, so move things over if the merged-from
2684 * record doesn't occupy that position.
2685 *
2686 * This creates a new empty extent so the caller
2687 * should be smart enough to have removed any existing
2688 * ones.
2689 */
2690 if (index > 0) {
2691 BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0]));
2692 size = index * sizeof(struct ocfs2_extent_rec);
2693 memmove(&el->l_recs[1], &el->l_recs[0], size);
2694 }
2695
2696 /*
2697 * Always memset - the caller doesn't check whether it
2698 * created an empty extent, so there could be junk in
2699 * the other fields.
2700 */
2701 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2702 }
2703}
2704
2705/*
2706 * Remove split_rec clusters from the record at index and merge them
2707 * onto the beginning of the record at index + 1.
2708 */
2709static int ocfs2_merge_rec_right(struct inode *inode, struct buffer_head *bh,
2710 handle_t *handle,
2711 struct ocfs2_extent_rec *split_rec,
2712 struct ocfs2_extent_list *el, int index)
2713{
2714 int ret;
2715 unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
2716 struct ocfs2_extent_rec *left_rec;
2717 struct ocfs2_extent_rec *right_rec;
2718
2719 BUG_ON(index >= le16_to_cpu(el->l_next_free_rec));
2720
2721 left_rec = &el->l_recs[index];
2722 right_rec = &el->l_recs[index + 1];
2723
2724 ret = ocfs2_journal_access(handle, inode, bh,
2725 OCFS2_JOURNAL_ACCESS_WRITE);
2726 if (ret) {
2727 mlog_errno(ret);
2728 goto out;
2729 }
2730
2731 le16_add_cpu(&left_rec->e_leaf_clusters, -split_clusters);
2732
2733 le32_add_cpu(&right_rec->e_cpos, -split_clusters);
2734 le64_add_cpu(&right_rec->e_blkno,
2735 -ocfs2_clusters_to_blocks(inode->i_sb, split_clusters));
2736 le16_add_cpu(&right_rec->e_leaf_clusters, split_clusters);
2737
2738 ocfs2_cleanup_merge(el, index);
2739
2740 ret = ocfs2_journal_dirty(handle, bh);
2741 if (ret)
2742 mlog_errno(ret);
2743
2744out:
2745 return ret;
2746}
2747
2748/*
2749 * Remove split_rec clusters from the record at index and merge them
2750 * onto the tail of the record at index - 1.
2751 */
2752static int ocfs2_merge_rec_left(struct inode *inode, struct buffer_head *bh,
2753 handle_t *handle,
2754 struct ocfs2_extent_rec *split_rec,
2755 struct ocfs2_extent_list *el, int index)
2756{
2757 int ret, has_empty_extent = 0;
2758 unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
2759 struct ocfs2_extent_rec *left_rec;
2760 struct ocfs2_extent_rec *right_rec;
2761
2762 BUG_ON(index <= 0);
2763
2764 left_rec = &el->l_recs[index - 1];
2765 right_rec = &el->l_recs[index];
2766 if (ocfs2_is_empty_extent(&el->l_recs[0]))
2767 has_empty_extent = 1;
2768
2769 ret = ocfs2_journal_access(handle, inode, bh,
2770 OCFS2_JOURNAL_ACCESS_WRITE);
2771 if (ret) {
2772 mlog_errno(ret);
2773 goto out;
2774 }
2775
2776 if (has_empty_extent && index == 1) {
2777 /*
2778 * The easy case - we can just plop the record right in.
2779 */
2780 *left_rec = *split_rec;
2781
2782 has_empty_extent = 0;
2783 } else {
2784 le16_add_cpu(&left_rec->e_leaf_clusters, split_clusters);
2785 }
2786
2787 le32_add_cpu(&right_rec->e_cpos, split_clusters);
2788 le64_add_cpu(&right_rec->e_blkno,
2789 ocfs2_clusters_to_blocks(inode->i_sb, split_clusters));
2790 le16_add_cpu(&right_rec->e_leaf_clusters, -split_clusters);
2791
2792 ocfs2_cleanup_merge(el, index);
2793
2794 ret = ocfs2_journal_dirty(handle, bh);
2795 if (ret)
2796 mlog_errno(ret);
2797
2798out:
2799 return ret;
2800}
2801
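Illustrative sketch (not part of this patch): the cluster arithmetic behind ocfs2_merge_rec_left(), on simplified host-endian records. Moving n clusters off the front of the right record onto the tail of the left one shifts the right record's cpos/blkno forward and shrinks it by the same amount; the clusters-to-blocks ratio here is an assumed constant for the example.

#include <stdio.h>
#include <stdint.h>

#define CLUSTER_BLOCKS 8	/* assumed clusters-to-blocks ratio for the example */

struct xrec { uint32_t cpos; uint64_t blkno; uint16_t clusters; };

static void merge_left(struct xrec *left, struct xrec *right, uint16_t n)
{
	left->clusters  += n;			/* left record grows at its tail */
	right->cpos     += n;			/* right record starts n clusters later */
	right->blkno    += (uint64_t)n * CLUSTER_BLOCKS;
	right->clusters -= n;			/* and is n clusters shorter */
}

int main(void)
{
	struct xrec left  = { .cpos = 0, .blkno = 100, .clusters = 4 };
	struct xrec right = { .cpos = 4, .blkno = 200, .clusters = 6 };

	merge_left(&left, &right, 2);
	/* left:  (0, 100, 6)   right: (6, 216, 4) */
	printf("left:  cpos %u, blkno %llu, clusters %u\n",
	       (unsigned)left.cpos, (unsigned long long)left.blkno, (unsigned)left.clusters);
	printf("right: cpos %u, blkno %llu, clusters %u\n",
	       (unsigned)right.cpos, (unsigned long long)right.blkno, (unsigned)right.clusters);
	return 0;
}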
2802static int ocfs2_try_to_merge_extent(struct inode *inode,
2803 handle_t *handle,
2804 struct ocfs2_path *left_path,
2805 int split_index,
2806 struct ocfs2_extent_rec *split_rec,
2807 struct ocfs2_cached_dealloc_ctxt *dealloc,
2808 struct ocfs2_merge_ctxt *ctxt)
2809
2810{
2811 int ret = 0, delete_tail_recs = 0;
2812 struct ocfs2_extent_list *el = path_leaf_el(left_path);
2813 struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
2814
2815 BUG_ON(ctxt->c_contig_type == CONTIG_NONE);
2816
2817 if (ctxt->c_split_covers_rec) {
2818 delete_tail_recs++;
2819
2820 if (ctxt->c_contig_type == CONTIG_LEFTRIGHT ||
2821 ctxt->c_has_empty_extent)
2822 delete_tail_recs++;
2823
2824 if (ctxt->c_has_empty_extent) {
2825 /*
2826 * The merge code will need to create an empty
2827 * extent to take the place of the newly
2828 * emptied slot. Remove any pre-existing empty
2829 * extents - having more than one in a leaf is
2830 * illegal.
2831 */
2832 ret = ocfs2_rotate_tree_left(inode, handle, left_path,
2833 dealloc);
2834 if (ret) {
2835 mlog_errno(ret);
2836 goto out;
2837 }
2838 split_index--;
2839 rec = &el->l_recs[split_index];
2840 }
2841 }
2842
2843 if (ctxt->c_contig_type == CONTIG_LEFTRIGHT) {
2844 /*
2845 * Left-right contig implies this.
2846 */
2847 BUG_ON(!ctxt->c_split_covers_rec);
2848 BUG_ON(split_index == 0);
2849
2850 /*
2851 * Since the leftright insert always covers the entire
2852 * extent, this call will delete the insert record
2853 * entirely, resulting in an empty extent record added to
2854 * the extent block.
2855 *
2856 * Since the adding of an empty extent shifts
2857 * everything back to the right, there's no need to
2858 * update split_index here.
2859 */
2860 ret = ocfs2_merge_rec_left(inode, path_leaf_bh(left_path),
2861 handle, split_rec, el, split_index);
2862 if (ret) {
2863 mlog_errno(ret);
2864 goto out;
2865 }
2866
2867 /*
2868 * We can only get this from logic error above.
2869 */
2870 BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));
2871
2872 /*
2873 * The left merge left us with an empty extent, remove
2874 * it.
2875 */
2876 ret = ocfs2_rotate_tree_left(inode, handle, left_path, dealloc);
2877 if (ret) {
2878 mlog_errno(ret);
2879 goto out;
2880 }
2881 split_index--;
2882 rec = &el->l_recs[split_index];
2883
2884 /*
2885 * Note that we don't pass split_rec here on purpose -
2886 * we've merged it into the left side.
2887 */
2888 ret = ocfs2_merge_rec_right(inode, path_leaf_bh(left_path),
2889 handle, rec, el, split_index);
2890 if (ret) {
2891 mlog_errno(ret);
2892 goto out;
2893 }
2894
2895 BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));
2896
2897 ret = ocfs2_rotate_tree_left(inode, handle, left_path,
2898 dealloc);
2899 /*
2900 * Error from this last rotate is not critical, so
2901 * print but don't bubble it up.
2902 */
2903 if (ret)
2904 mlog_errno(ret);
2905 ret = 0;
2906 } else {
2907 /*
2908 * Merge a record to the left or right.
2909 *
2910 * 'contig_type' is relative to the existing record,
2911 * so for example, if we're "right contig", it's to
2912 * the record on the left (hence the left merge).
2913 */
2914 if (ctxt->c_contig_type == CONTIG_RIGHT) {
2915 ret = ocfs2_merge_rec_left(inode,
2916 path_leaf_bh(left_path),
2917 handle, split_rec, el,
2918 split_index);
2919 if (ret) {
2920 mlog_errno(ret);
2921 goto out;
2922 }
2923 } else {
2924 ret = ocfs2_merge_rec_right(inode,
2925 path_leaf_bh(left_path),
2926 handle, split_rec, el,
2927 split_index);
2928 if (ret) {
2929 mlog_errno(ret);
2930 goto out;
2931 }
2932 }
2933
2934 if (ctxt->c_split_covers_rec) {
2935 /*
2936 * The merge may have left an empty extent in
2937 * our leaf. Try to rotate it away.
2938 */
2939 ret = ocfs2_rotate_tree_left(inode, handle, left_path,
2940 dealloc);
2941 if (ret)
2942 mlog_errno(ret);
2943 ret = 0;
2944 }
2945 }
2946
2947out:
2948 return ret;
2949}
2950
2951static void ocfs2_subtract_from_rec(struct super_block *sb,
2952 enum ocfs2_split_type split,
2953 struct ocfs2_extent_rec *rec,
2954 struct ocfs2_extent_rec *split_rec)
2955{
2956 u64 len_blocks;
2957
2958 len_blocks = ocfs2_clusters_to_blocks(sb,
2959 le16_to_cpu(split_rec->e_leaf_clusters));
2960
2961 if (split == SPLIT_LEFT) {
2962 /*
2963 * Region is on the left edge of the existing
2964 * record.
2965 */
2966 le32_add_cpu(&rec->e_cpos,
2967 le16_to_cpu(split_rec->e_leaf_clusters));
2968 le64_add_cpu(&rec->e_blkno, len_blocks);
2969 le16_add_cpu(&rec->e_leaf_clusters,
2970 -le16_to_cpu(split_rec->e_leaf_clusters));
2971 } else {
2972 /*
2973 * Region is on the right edge of the existing
2974 * record.
2975 */
2976 le16_add_cpu(&rec->e_leaf_clusters,
2977 -le16_to_cpu(split_rec->e_leaf_clusters));
2978 }
2979}
2980
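Illustrative sketch (not part of this patch): what ocfs2_subtract_from_rec() does to an existing record for each split type, using simplified host-endian fields and an assumed clusters-to-blocks ratio. A SPLIT_LEFT carves clusters off the front of the record (cpos/blkno advance), a SPLIT_RIGHT carves them off the tail (only the length shrinks).

#include <stdio.h>
#include <stdint.h>

#define CLUSTER_BLOCKS 8	/* assumed clusters-to-blocks ratio for the example */

enum split_type { SPLIT_LEFT, SPLIT_RIGHT };

struct xrec { uint32_t cpos; uint64_t blkno; uint16_t clusters; };

static void subtract_from_rec(enum split_type split, struct xrec *rec, uint16_t split_clusters)
{
	if (split == SPLIT_LEFT) {
		/* region removed from the left edge of the record */
		rec->cpos  += split_clusters;
		rec->blkno += (uint64_t)split_clusters * CLUSTER_BLOCKS;
	}
	/* either way the record loses split_clusters of length */
	rec->clusters -= split_clusters;
}

int main(void)
{
	struct xrec a = { .cpos = 10, .blkno = 80, .clusters = 6 };
	struct xrec b = a;

	subtract_from_rec(SPLIT_LEFT, &a, 2);	/* a: cpos 12, blkno 96, clusters 4 */
	subtract_from_rec(SPLIT_RIGHT, &b, 2);	/* b: cpos 10, blkno 80, clusters 4 */
	printf("left split:  (%u, %llu, %u)\n",
	       (unsigned)a.cpos, (unsigned long long)a.blkno, (unsigned)a.clusters);
	printf("right split: (%u, %llu, %u)\n",
	       (unsigned)b.cpos, (unsigned long long)b.blkno, (unsigned)b.clusters);
	return 0;
}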
1725/* 2981/*
1726 * Do the final bits of extent record insertion at the target leaf 2982 * Do the final bits of extent record insertion at the target leaf
1727 * list. If this leaf is part of an allocation tree, it is assumed 2983 * list. If this leaf is part of an allocation tree, it is assumed
@@ -1738,6 +2994,15 @@ static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec,
1738 2994
1739 BUG_ON(le16_to_cpu(el->l_tree_depth) != 0); 2995 BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
1740 2996
2997 if (insert->ins_split != SPLIT_NONE) {
2998 i = ocfs2_search_extent_list(el, le32_to_cpu(insert_rec->e_cpos));
2999 BUG_ON(i == -1);
3000 rec = &el->l_recs[i];
3001 ocfs2_subtract_from_rec(inode->i_sb, insert->ins_split, rec,
3002 insert_rec);
3003 goto rotate;
3004 }
3005
1741 /* 3006 /*
1742 * Contiguous insert - either left or right. 3007 * Contiguous insert - either left or right.
1743 */ 3008 */
@@ -1792,6 +3057,7 @@ static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec,
1792 return; 3057 return;
1793 } 3058 }
1794 3059
3060rotate:
1795 /* 3061 /*
1796 * Ok, we have to rotate. 3062 * Ok, we have to rotate.
1797 * 3063 *
@@ -1815,13 +3081,53 @@ static inline void ocfs2_update_dinode_clusters(struct inode *inode,
1815 spin_unlock(&OCFS2_I(inode)->ip_lock); 3081 spin_unlock(&OCFS2_I(inode)->ip_lock);
1816} 3082}
1817 3083
3084static void ocfs2_adjust_rightmost_records(struct inode *inode,
3085 handle_t *handle,
3086 struct ocfs2_path *path,
3087 struct ocfs2_extent_rec *insert_rec)
3088{
3089 int ret, i, next_free;
3090 struct buffer_head *bh;
3091 struct ocfs2_extent_list *el;
3092 struct ocfs2_extent_rec *rec;
3093
3094 /*
3095 * Update everything except the leaf block.
3096 */
3097 for (i = 0; i < path->p_tree_depth; i++) {
3098 bh = path->p_node[i].bh;
3099 el = path->p_node[i].el;
3100
3101 next_free = le16_to_cpu(el->l_next_free_rec);
3102 if (next_free == 0) {
3103 ocfs2_error(inode->i_sb,
3104 "Dinode %llu has a bad extent list",
3105 (unsigned long long)OCFS2_I(inode)->ip_blkno);
3106 ret = -EIO;
3107 return;
3108 }
3109
3110 rec = &el->l_recs[next_free - 1];
3111
3112 rec->e_int_clusters = insert_rec->e_cpos;
3113 le32_add_cpu(&rec->e_int_clusters,
3114 le16_to_cpu(insert_rec->e_leaf_clusters));
3115 le32_add_cpu(&rec->e_int_clusters,
3116 -le32_to_cpu(rec->e_cpos));
3117
3118 ret = ocfs2_journal_dirty(handle, bh);
3119 if (ret)
3120 mlog_errno(ret);
3121
3122 }
3123}
3124
1818static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle, 3125static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
1819 struct ocfs2_extent_rec *insert_rec, 3126 struct ocfs2_extent_rec *insert_rec,
1820 struct ocfs2_path *right_path, 3127 struct ocfs2_path *right_path,
1821 struct ocfs2_path **ret_left_path) 3128 struct ocfs2_path **ret_left_path)
1822{ 3129{
1823 int ret, i, next_free; 3130 int ret, next_free;
1824 struct buffer_head *bh;
1825 struct ocfs2_extent_list *el; 3131 struct ocfs2_extent_list *el;
1826 struct ocfs2_path *left_path = NULL; 3132 struct ocfs2_path *left_path = NULL;
1827 3133
@@ -1887,40 +3193,7 @@ static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
1887 goto out; 3193 goto out;
1888 } 3194 }
1889 3195
1890 el = path_root_el(right_path); 3196 ocfs2_adjust_rightmost_records(inode, handle, right_path, insert_rec);
1891 bh = path_root_bh(right_path);
1892 i = 0;
1893 while (1) {
1894 struct ocfs2_extent_rec *rec;
1895
1896 next_free = le16_to_cpu(el->l_next_free_rec);
1897 if (next_free == 0) {
1898 ocfs2_error(inode->i_sb,
1899 "Dinode %llu has a bad extent list",
1900 (unsigned long long)OCFS2_I(inode)->ip_blkno);
1901 ret = -EIO;
1902 goto out;
1903 }
1904
1905 rec = &el->l_recs[next_free - 1];
1906
1907 rec->e_int_clusters = insert_rec->e_cpos;
1908 le32_add_cpu(&rec->e_int_clusters,
1909 le16_to_cpu(insert_rec->e_leaf_clusters));
1910 le32_add_cpu(&rec->e_int_clusters,
1911 -le32_to_cpu(rec->e_cpos));
1912
1913 ret = ocfs2_journal_dirty(handle, bh);
1914 if (ret)
1915 mlog_errno(ret);
1916
1917 /* Don't touch the leaf node */
1918 if (++i >= right_path->p_tree_depth)
1919 break;
1920
1921 bh = right_path->p_node[i].bh;
1922 el = right_path->p_node[i].el;
1923 }
1924 3197
1925 *ret_left_path = left_path; 3198 *ret_left_path = left_path;
1926 ret = 0; 3199 ret = 0;
@@ -1931,6 +3204,83 @@ out:
1931 return ret; 3204 return ret;
1932} 3205}
1933 3206
3207static void ocfs2_split_record(struct inode *inode,
3208 struct ocfs2_path *left_path,
3209 struct ocfs2_path *right_path,
3210 struct ocfs2_extent_rec *split_rec,
3211 enum ocfs2_split_type split)
3212{
3213 int index;
3214 u32 cpos = le32_to_cpu(split_rec->e_cpos);
3215 struct ocfs2_extent_list *left_el = NULL, *right_el, *insert_el, *el;
3216 struct ocfs2_extent_rec *rec, *tmprec;
3217
3218	right_el = path_leaf_el(right_path);
3219 if (left_path)
3220 left_el = path_leaf_el(left_path);
3221
3222 el = right_el;
3223 insert_el = right_el;
3224 index = ocfs2_search_extent_list(el, cpos);
3225 if (index != -1) {
3226 if (index == 0 && left_path) {
3227 BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0]));
3228
3229 /*
3230 * This typically means that the record
3231 * started in the left path but moved to the
3232 * right as a result of rotation. We either
3233 * move the existing record to the left, or we
3234 * do the later insert there.
3235 *
3236 * In this case, the left path should always
3237 * exist as the rotate code will have passed
3238 * it back for a post-insert update.
3239 */
3240
3241 if (split == SPLIT_LEFT) {
3242 /*
3243 * It's a left split. Since we know
3244 * that the rotate code gave us an
3245 * empty extent in the left path, we
3246 * can just do the insert there.
3247 */
3248 insert_el = left_el;
3249 } else {
3250 /*
3251 * Right split - we have to move the
3252 * existing record over to the left
3253 * leaf. The insert will be into the
3254 * newly created empty extent in the
3255 * right leaf.
3256 */
3257 tmprec = &right_el->l_recs[index];
3258 ocfs2_rotate_leaf(left_el, tmprec);
3259 el = left_el;
3260
3261 memset(tmprec, 0, sizeof(*tmprec));
3262 index = ocfs2_search_extent_list(left_el, cpos);
3263 BUG_ON(index == -1);
3264 }
3265 }
3266 } else {
3267 BUG_ON(!left_path);
3268 BUG_ON(!ocfs2_is_empty_extent(&left_el->l_recs[0]));
3269 /*
3270 * Left path is easy - we can just allow the insert to
3271 * happen.
3272 */
3273 el = left_el;
3274 insert_el = left_el;
3275 index = ocfs2_search_extent_list(el, cpos);
3276 BUG_ON(index == -1);
3277 }
3278
3279 rec = &el->l_recs[index];
3280 ocfs2_subtract_from_rec(inode->i_sb, split, rec, split_rec);
3281 ocfs2_rotate_leaf(insert_el, split_rec);
3282}
3283
1934/* 3284/*
1935 * This function only does inserts on an allocation b-tree. For dinode 3285 * This function only does inserts on an allocation b-tree. For dinode
1936 * lists, ocfs2_insert_at_leaf() is called directly. 3286 * lists, ocfs2_insert_at_leaf() is called directly.
@@ -1948,7 +3298,6 @@ static int ocfs2_insert_path(struct inode *inode,
1948{ 3298{
1949 int ret, subtree_index; 3299 int ret, subtree_index;
1950 struct buffer_head *leaf_bh = path_leaf_bh(right_path); 3300 struct buffer_head *leaf_bh = path_leaf_bh(right_path);
1951 struct ocfs2_extent_list *el;
1952 3301
1953 /* 3302 /*
1954 * Pass both paths to the journal. The majority of inserts 3303 * Pass both paths to the journal. The majority of inserts
@@ -1984,9 +3333,18 @@ static int ocfs2_insert_path(struct inode *inode,
1984 } 3333 }
1985 } 3334 }
1986 3335
1987 el = path_leaf_el(right_path); 3336 if (insert->ins_split != SPLIT_NONE) {
3337 /*
3338 * We could call ocfs2_insert_at_leaf() for some types
3339 * of splits, but it's easier to just let one separate
3340 * function sort it all out.
3341 */
3342 ocfs2_split_record(inode, left_path, right_path,
3343 insert_rec, insert->ins_split);
3344 } else
3345 ocfs2_insert_at_leaf(insert_rec, path_leaf_el(right_path),
3346 insert, inode);
1988 3347
1989 ocfs2_insert_at_leaf(insert_rec, el, insert, inode);
1990 ret = ocfs2_journal_dirty(handle, leaf_bh); 3348 ret = ocfs2_journal_dirty(handle, leaf_bh);
1991 if (ret) 3349 if (ret)
1992 mlog_errno(ret); 3350 mlog_errno(ret);
@@ -2075,7 +3433,7 @@ static int ocfs2_do_insert_extent(struct inode *inode,
2075 * can wind up skipping both of these two special cases... 3433 * can wind up skipping both of these two special cases...
2076 */ 3434 */
2077 if (rotate) { 3435 if (rotate) {
2078 ret = ocfs2_rotate_tree_right(inode, handle, 3436 ret = ocfs2_rotate_tree_right(inode, handle, type->ins_split,
2079 le32_to_cpu(insert_rec->e_cpos), 3437 le32_to_cpu(insert_rec->e_cpos),
2080 right_path, &left_path); 3438 right_path, &left_path);
2081 if (ret) { 3439 if (ret) {
@@ -2100,8 +3458,9 @@ static int ocfs2_do_insert_extent(struct inode *inode,
2100 } 3458 }
2101 3459
2102out_update_clusters: 3460out_update_clusters:
2103 ocfs2_update_dinode_clusters(inode, di, 3461 if (type->ins_split == SPLIT_NONE)
2104 le16_to_cpu(insert_rec->e_leaf_clusters)); 3462 ocfs2_update_dinode_clusters(inode, di,
3463 le16_to_cpu(insert_rec->e_leaf_clusters));
2105 3464
2106 ret = ocfs2_journal_dirty(handle, di_bh); 3465 ret = ocfs2_journal_dirty(handle, di_bh);
2107 if (ret) 3466 if (ret)
@@ -2114,6 +3473,44 @@ out:
2114 return ret; 3473 return ret;
2115} 3474}
2116 3475
3476static enum ocfs2_contig_type
3477ocfs2_figure_merge_contig_type(struct inode *inode,
3478 struct ocfs2_extent_list *el, int index,
3479 struct ocfs2_extent_rec *split_rec)
3480{
3481 struct ocfs2_extent_rec *rec;
3482 enum ocfs2_contig_type ret = CONTIG_NONE;
3483
3484 /*
3485 * We're careful to check for an empty extent record here -
3486 * the merge code will know what to do if it sees one.
3487 */
3488
3489 if (index > 0) {
3490 rec = &el->l_recs[index - 1];
3491 if (index == 1 && ocfs2_is_empty_extent(rec)) {
3492 if (split_rec->e_cpos == el->l_recs[index].e_cpos)
3493 ret = CONTIG_RIGHT;
3494 } else {
3495 ret = ocfs2_extent_contig(inode, rec, split_rec);
3496 }
3497 }
3498
3499 if (index < (le16_to_cpu(el->l_next_free_rec) - 1)) {
3500 enum ocfs2_contig_type contig_type;
3501
3502 rec = &el->l_recs[index + 1];
3503 contig_type = ocfs2_extent_contig(inode, rec, split_rec);
3504
3505 if (contig_type == CONTIG_LEFT && ret == CONTIG_RIGHT)
3506 ret = CONTIG_LEFTRIGHT;
3507 else if (ret == CONTIG_NONE)
3508 ret = contig_type;
3509 }
3510
3511 return ret;
3512}
3513
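Illustrative sketch (not part of this patch): the merge-direction decision made by ocfs2_figure_merge_contig_type(). The contiguity test here only compares cpos/blkno adjacency on simplified host-endian records with an assumed clusters-to-blocks ratio; the kernel version goes through ocfs2_extent_contig() and special-cases an empty record at index 1. Note the naming convention from the comment above ocfs2_try_to_merge_extent(): "right contig" means the new piece merges onto the record to its left.

#include <stdio.h>
#include <stdint.h>

#define CLUSTER_BLOCKS 8	/* assumed clusters-to-blocks ratio for the example */

enum contig { CONTIG_NONE, CONTIG_LEFT, CONTIG_RIGHT, CONTIG_LEFTRIGHT };

struct xrec { uint32_t cpos; uint64_t blkno; uint16_t clusters; };

/* does 'left' end exactly where 'right' begins, both logically and physically? */
static int adjacent(const struct xrec *left, const struct xrec *right)
{
	return left->cpos + left->clusters == right->cpos &&
	       left->blkno + (uint64_t)left->clusters * CLUSTER_BLOCKS == right->blkno;
}

static enum contig merge_contig_type(const struct xrec *prev, const struct xrec *next,
				     const struct xrec *split)
{
	int merges_left  = prev && adjacent(prev, split);	/* "right contig" w.r.t. prev */
	int merges_right = next && adjacent(split, next);	/* "left contig" w.r.t. next */

	if (merges_left && merges_right)
		return CONTIG_LEFTRIGHT;
	if (merges_left)
		return CONTIG_RIGHT;
	if (merges_right)
		return CONTIG_LEFT;
	return CONTIG_NONE;
}

int main(void)
{
	struct xrec prev  = { .cpos = 0, .blkno = 0,  .clusters = 4 };
	struct xrec split = { .cpos = 4, .blkno = 32, .clusters = 2 };
	struct xrec next  = { .cpos = 6, .blkno = 48, .clusters = 4 };

	printf("%d\n", merge_contig_type(&prev, &next, &split));	/* 3 = CONTIG_LEFTRIGHT */
	return 0;
}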
2117static void ocfs2_figure_contig_type(struct inode *inode, 3514static void ocfs2_figure_contig_type(struct inode *inode,
2118 struct ocfs2_insert_type *insert, 3515 struct ocfs2_insert_type *insert,
2119 struct ocfs2_extent_list *el, 3516 struct ocfs2_extent_list *el,
@@ -2205,6 +3602,8 @@ static int ocfs2_figure_insert_type(struct inode *inode,
2205 struct ocfs2_path *path = NULL; 3602 struct ocfs2_path *path = NULL;
2206 struct buffer_head *bh = NULL; 3603 struct buffer_head *bh = NULL;
2207 3604
3605 insert->ins_split = SPLIT_NONE;
3606
2208 el = &di->id2.i_list; 3607 el = &di->id2.i_list;
2209 insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth); 3608 insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth);
2210 3609
@@ -2327,9 +3726,10 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
2327 u32 cpos, 3726 u32 cpos,
2328 u64 start_blk, 3727 u64 start_blk,
2329 u32 new_clusters, 3728 u32 new_clusters,
3729 u8 flags,
2330 struct ocfs2_alloc_context *meta_ac) 3730 struct ocfs2_alloc_context *meta_ac)
2331{ 3731{
2332 int status, shift; 3732 int status;
2333 struct buffer_head *last_eb_bh = NULL; 3733 struct buffer_head *last_eb_bh = NULL;
2334 struct buffer_head *bh = NULL; 3734 struct buffer_head *bh = NULL;
2335 struct ocfs2_insert_type insert = {0, }; 3735 struct ocfs2_insert_type insert = {0, };
@@ -2350,6 +3750,7 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
2350 rec.e_cpos = cpu_to_le32(cpos); 3750 rec.e_cpos = cpu_to_le32(cpos);
2351 rec.e_blkno = cpu_to_le64(start_blk); 3751 rec.e_blkno = cpu_to_le64(start_blk);
2352 rec.e_leaf_clusters = cpu_to_le16(new_clusters); 3752 rec.e_leaf_clusters = cpu_to_le16(new_clusters);
3753 rec.e_flags = flags;
2353 3754
2354 status = ocfs2_figure_insert_type(inode, fe_bh, &last_eb_bh, &rec, 3755 status = ocfs2_figure_insert_type(inode, fe_bh, &last_eb_bh, &rec,
2355 &insert); 3756 &insert);
@@ -2364,55 +3765,16 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
2364 insert.ins_appending, insert.ins_contig, insert.ins_contig_index, 3765 insert.ins_appending, insert.ins_contig, insert.ins_contig_index,
2365 insert.ins_free_records, insert.ins_tree_depth); 3766 insert.ins_free_records, insert.ins_tree_depth);
2366 3767
2367 /* 3768 if (insert.ins_contig == CONTIG_NONE && insert.ins_free_records == 0) {
2368 * Avoid growing the tree unless we're out of records and the 3769 status = ocfs2_grow_tree(inode, handle, fe_bh,
2369 * insert type requires one. 3770 &insert.ins_tree_depth, &last_eb_bh,
2370 */ 3771 meta_ac);
2371 if (insert.ins_contig != CONTIG_NONE || insert.ins_free_records) 3772 if (status) {
2372 goto out_add;
2373
2374 shift = ocfs2_find_branch_target(osb, inode, fe_bh, &bh);
2375 if (shift < 0) {
2376 status = shift;
2377 mlog_errno(status);
2378 goto bail;
2379 }
2380
2381 /* We traveled all the way to the bottom of the allocation tree
2382 * and didn't find room for any more extents - we need to add
2383 * another tree level */
2384 if (shift) {
2385 BUG_ON(bh);
2386 mlog(0, "need to shift tree depth "
2387 "(current = %d)\n", insert.ins_tree_depth);
2388
2389 /* ocfs2_shift_tree_depth will return us a buffer with
2390 * the new extent block (so we can pass that to
2391 * ocfs2_add_branch). */
2392 status = ocfs2_shift_tree_depth(osb, handle, inode, fe_bh,
2393 meta_ac, &bh);
2394 if (status < 0) {
2395 mlog_errno(status); 3773 mlog_errno(status);
2396 goto bail; 3774 goto bail;
2397 } 3775 }
2398 insert.ins_tree_depth++;
2399 /* Special case: we have room now if we shifted from
2400 * tree_depth 0 */
2401 if (insert.ins_tree_depth == 1)
2402 goto out_add;
2403 }
2404
2405 /* call ocfs2_add_branch to add the final part of the tree with
2406 * the new data. */
2407 mlog(0, "add branch. bh = %p\n", bh);
2408 status = ocfs2_add_branch(osb, handle, inode, fe_bh, bh, last_eb_bh,
2409 meta_ac);
2410 if (status < 0) {
2411 mlog_errno(status);
2412 goto bail;
2413 } 3776 }
2414 3777
2415out_add:
2416 /* Finally, we can add clusters. This might rotate the tree for us. */ 3778 /* Finally, we can add clusters. This might rotate the tree for us. */
2417 status = ocfs2_do_insert_extent(inode, handle, fe_bh, &rec, &insert); 3779 status = ocfs2_do_insert_extent(inode, handle, fe_bh, &rec, &insert);
2418 if (status < 0) 3780 if (status < 0)
@@ -2431,7 +3793,720 @@ bail:
2431 return status; 3793 return status;
2432} 3794}
2433 3795
2434static inline int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb) 3796static void ocfs2_make_right_split_rec(struct super_block *sb,
3797 struct ocfs2_extent_rec *split_rec,
3798 u32 cpos,
3799 struct ocfs2_extent_rec *rec)
3800{
3801 u32 rec_cpos = le32_to_cpu(rec->e_cpos);
3802 u32 rec_range = rec_cpos + le16_to_cpu(rec->e_leaf_clusters);
3803
3804 memset(split_rec, 0, sizeof(struct ocfs2_extent_rec));
3805
3806 split_rec->e_cpos = cpu_to_le32(cpos);
3807 split_rec->e_leaf_clusters = cpu_to_le16(rec_range - cpos);
3808
3809 split_rec->e_blkno = rec->e_blkno;
3810 le64_add_cpu(&split_rec->e_blkno,
3811 ocfs2_clusters_to_blocks(sb, cpos - rec_cpos));
3812
3813 split_rec->e_flags = rec->e_flags;
3814}
3815
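Illustrative sketch (not part of this patch): the arithmetic in ocfs2_make_right_split_rec() on simplified host-endian records with an assumed clusters-to-blocks ratio. Given an existing record and a split point cpos inside it, the new right-hand piece starts at cpos, carries the remaining length, and has its blkno advanced by the clusters that stay on the left.

#include <stdio.h>
#include <stdint.h>

#define CLUSTER_BLOCKS 8	/* assumed clusters-to-blocks ratio for the example */

struct xrec { uint32_t cpos; uint64_t blkno; uint16_t clusters; };

static struct xrec make_right_split_rec(const struct xrec *rec, uint32_t cpos)
{
	uint32_t rec_end = rec->cpos + rec->clusters;
	struct xrec split = {
		.cpos     = cpos,
		.clusters = (uint16_t)(rec_end - cpos),
		.blkno    = rec->blkno + (uint64_t)(cpos - rec->cpos) * CLUSTER_BLOCKS,
	};

	return split;
}

int main(void)
{
	struct xrec rec   = { .cpos = 100, .blkno = 1000, .clusters = 10 };
	struct xrec right = make_right_split_rec(&rec, 104);

	/* right piece: cpos 104, clusters 6, blkno 1032 */
	printf("cpos %u, clusters %u, blkno %llu\n",
	       (unsigned)right.cpos, (unsigned)right.clusters,
	       (unsigned long long)right.blkno);
	return 0;
}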
3816static int ocfs2_split_and_insert(struct inode *inode,
3817 handle_t *handle,
3818 struct ocfs2_path *path,
3819 struct buffer_head *di_bh,
3820 struct buffer_head **last_eb_bh,
3821 int split_index,
3822 struct ocfs2_extent_rec *orig_split_rec,
3823 struct ocfs2_alloc_context *meta_ac)
3824{
3825 int ret = 0, depth;
3826 unsigned int insert_range, rec_range, do_leftright = 0;
3827 struct ocfs2_extent_rec tmprec;
3828 struct ocfs2_extent_list *rightmost_el;
3829 struct ocfs2_extent_rec rec;
3830 struct ocfs2_extent_rec split_rec = *orig_split_rec;
3831 struct ocfs2_insert_type insert;
3832 struct ocfs2_extent_block *eb;
3833 struct ocfs2_dinode *di;
3834
3835leftright:
3836 /*
3837 * Store a copy of the record on the stack - it might move
3838 * around as the tree is manipulated below.
3839 */
3840 rec = path_leaf_el(path)->l_recs[split_index];
3841
3842 di = (struct ocfs2_dinode *)di_bh->b_data;
3843 rightmost_el = &di->id2.i_list;
3844
3845 depth = le16_to_cpu(rightmost_el->l_tree_depth);
3846 if (depth) {
3847 BUG_ON(!(*last_eb_bh));
3848 eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
3849 rightmost_el = &eb->h_list;
3850 }
3851
3852 if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
3853 le16_to_cpu(rightmost_el->l_count)) {
3854 int old_depth = depth;
3855
3856 ret = ocfs2_grow_tree(inode, handle, di_bh, &depth, last_eb_bh,
3857 meta_ac);
3858 if (ret) {
3859 mlog_errno(ret);
3860 goto out;
3861 }
3862
3863 if (old_depth != depth) {
3864 eb = (struct ocfs2_extent_block *)(*last_eb_bh)->b_data;
3865 rightmost_el = &eb->h_list;
3866 }
3867 }
3868
3869 memset(&insert, 0, sizeof(struct ocfs2_insert_type));
3870 insert.ins_appending = APPEND_NONE;
3871 insert.ins_contig = CONTIG_NONE;
3872 insert.ins_free_records = le16_to_cpu(rightmost_el->l_count)
3873 - le16_to_cpu(rightmost_el->l_next_free_rec);
3874 insert.ins_tree_depth = depth;
3875
3876 insert_range = le32_to_cpu(split_rec.e_cpos) +
3877 le16_to_cpu(split_rec.e_leaf_clusters);
3878 rec_range = le32_to_cpu(rec.e_cpos) +
3879 le16_to_cpu(rec.e_leaf_clusters);
3880
3881 if (split_rec.e_cpos == rec.e_cpos) {
3882 insert.ins_split = SPLIT_LEFT;
3883 } else if (insert_range == rec_range) {
3884 insert.ins_split = SPLIT_RIGHT;
3885 } else {
3886 /*
3887 * Left/right split. We fake this as a right split
3888 * first and then make a second pass as a left split.
3889 */
3890 insert.ins_split = SPLIT_RIGHT;
3891
3892 ocfs2_make_right_split_rec(inode->i_sb, &tmprec, insert_range,
3893 &rec);
3894
3895 split_rec = tmprec;
3896
3897 BUG_ON(do_leftright);
3898 do_leftright = 1;
3899 }
3900
3901 ret = ocfs2_do_insert_extent(inode, handle, di_bh, &split_rec,
3902 &insert);
3903 if (ret) {
3904 mlog_errno(ret);
3905 goto out;
3906 }
3907
3908 if (do_leftright == 1) {
3909 u32 cpos;
3910 struct ocfs2_extent_list *el;
3911
3912 do_leftright++;
3913 split_rec = *orig_split_rec;
3914
3915 ocfs2_reinit_path(path, 1);
3916
3917 cpos = le32_to_cpu(split_rec.e_cpos);
3918 ret = ocfs2_find_path(inode, path, cpos);
3919 if (ret) {
3920 mlog_errno(ret);
3921 goto out;
3922 }
3923
3924 el = path_leaf_el(path);
3925 split_index = ocfs2_search_extent_list(el, cpos);
3926 goto leftright;
3927 }
3928out:
3929
3930 return ret;
3931}
3932
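Illustrative sketch (not part of this patch): how ocfs2_split_and_insert() picks a split type from the ranges involved, using plain host-endian values. SPLIT_BOTH is a stand-in name for the interior case only; the kernel code expresses that case by faking a right split first and looping back for the left split via the do_leftright counter rather than with an enum value.

#include <stdio.h>
#include <stdint.h>

enum split_type { SPLIT_LEFT, SPLIT_RIGHT, SPLIT_BOTH /* right pass, then left pass */ };

static enum split_type pick_split(uint32_t rec_cpos, uint32_t rec_clusters,
				  uint32_t split_cpos, uint32_t split_clusters)
{
	uint32_t rec_range   = rec_cpos + rec_clusters;
	uint32_t split_range = split_cpos + split_clusters;

	if (split_cpos == rec_cpos)
		return SPLIT_LEFT;	/* shares the left edge of the record */
	if (split_range == rec_range)
		return SPLIT_RIGHT;	/* shares the right edge of the record */
	return SPLIT_BOTH;		/* interior region: two passes needed */
}

int main(void)
{
	printf("%d\n", pick_split(0, 10, 0, 4));	/* 0 = SPLIT_LEFT  */
	printf("%d\n", pick_split(0, 10, 6, 4));	/* 1 = SPLIT_RIGHT */
	printf("%d\n", pick_split(0, 10, 3, 4));	/* 2 = SPLIT_BOTH  */
	return 0;
}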
3933/*
3934 * Mark part or all of the extent record at split_index in the leaf
3935 * pointed to by path as written. This removes the unwritten
3936 * extent flag.
3937 *
3938 * Care is taken to handle contiguousness so as to not grow the tree.
3939 *
3940 * meta_ac is not strictly necessary - we only truly need it if growth
3941 * of the tree is required. All other cases will degrade into a less
3942 * optimal tree layout.
3943 *
3944 * last_eb_bh should be the rightmost leaf block for any inode with a
3945 * btree. Since a split may grow the tree or a merge might shrink it, the caller cannot trust the contents of that buffer after this call.
3946 *
3947 * This code is optimized for readability - several passes might be
3948 * made over certain portions of the tree. All of those blocks will
3949 * have been brought into cache (and pinned via the journal), so the
3950 * extra overhead is not expressed in terms of disk reads.
3951 */
3952static int __ocfs2_mark_extent_written(struct inode *inode,
3953 struct buffer_head *di_bh,
3954 handle_t *handle,
3955 struct ocfs2_path *path,
3956 int split_index,
3957 struct ocfs2_extent_rec *split_rec,
3958 struct ocfs2_alloc_context *meta_ac,
3959 struct ocfs2_cached_dealloc_ctxt *dealloc)
3960{
3961 int ret = 0;
3962 struct ocfs2_extent_list *el = path_leaf_el(path);
3963 struct buffer_head *eb_bh, *last_eb_bh = NULL;
3964 struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
3965 struct ocfs2_merge_ctxt ctxt;
3966 struct ocfs2_extent_list *rightmost_el;
3967
3968	if (!(rec->e_flags & OCFS2_EXT_UNWRITTEN)) {
3969 ret = -EIO;
3970 mlog_errno(ret);
3971 goto out;
3972 }
3973
3974 if (le32_to_cpu(rec->e_cpos) > le32_to_cpu(split_rec->e_cpos) ||
3975 ((le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)) <
3976 (le32_to_cpu(split_rec->e_cpos) + le16_to_cpu(split_rec->e_leaf_clusters)))) {
3977 ret = -EIO;
3978 mlog_errno(ret);
3979 goto out;
3980 }
3981
3982 eb_bh = path_leaf_bh(path);
3983 ret = ocfs2_journal_access(handle, inode, eb_bh,
3984 OCFS2_JOURNAL_ACCESS_WRITE);
3985 if (ret) {
3986 mlog_errno(ret);
3987 goto out;
3988 }
3989
3990 ctxt.c_contig_type = ocfs2_figure_merge_contig_type(inode, el,
3991 split_index,
3992 split_rec);
3993
3994 /*
3995 * The core merge / split code wants to know how much room is
3996 * left in this inode's allocation tree, so we pass the
3997 * rightmost extent list.
3998 */
3999 if (path->p_tree_depth) {
4000 struct ocfs2_extent_block *eb;
4001 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
4002
4003 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
4004 le64_to_cpu(di->i_last_eb_blk),
4005 &last_eb_bh, OCFS2_BH_CACHED, inode);
4006 if (ret) {
4007 mlog_exit(ret);
4008 goto out;
4009 }
4010
4011 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
4012 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
4013 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
4014 ret = -EROFS;
4015 goto out;
4016 }
4017
4018 rightmost_el = &eb->h_list;
4019 } else
4020 rightmost_el = path_root_el(path);
4021
4022 ctxt.c_used_tail_recs = le16_to_cpu(rightmost_el->l_next_free_rec);
4023 if (ctxt.c_used_tail_recs > 0 &&
4024 ocfs2_is_empty_extent(&rightmost_el->l_recs[0]))
4025 ctxt.c_used_tail_recs--;
4026
4027 if (rec->e_cpos == split_rec->e_cpos &&
4028 rec->e_leaf_clusters == split_rec->e_leaf_clusters)
4029 ctxt.c_split_covers_rec = 1;
4030 else
4031 ctxt.c_split_covers_rec = 0;
4032
4033 ctxt.c_has_empty_extent = ocfs2_is_empty_extent(&el->l_recs[0]);
4034
4035 mlog(0, "index: %d, contig: %u, used_tail_recs: %u, "
4036 "has_empty: %u, split_covers: %u\n", split_index,
4037 ctxt.c_contig_type, ctxt.c_used_tail_recs,
4038 ctxt.c_has_empty_extent, ctxt.c_split_covers_rec);
4039
4040 if (ctxt.c_contig_type == CONTIG_NONE) {
4041 if (ctxt.c_split_covers_rec)
4042 el->l_recs[split_index] = *split_rec;
4043 else
4044 ret = ocfs2_split_and_insert(inode, handle, path, di_bh,
4045 &last_eb_bh, split_index,
4046 split_rec, meta_ac);
4047 if (ret)
4048 mlog_errno(ret);
4049 } else {
4050 ret = ocfs2_try_to_merge_extent(inode, handle, path,
4051 split_index, split_rec,
4052 dealloc, &ctxt);
4053 if (ret)
4054 mlog_errno(ret);
4055 }
4056
4057 ocfs2_journal_dirty(handle, eb_bh);
4058
4059out:
4060 brelse(last_eb_bh);
4061 return ret;
4062}
4063
4064/*
4065 * Mark the already-existing extent at cpos as written for len clusters.
4066 *
4067 * If the existing extent is larger than the request, initiate a
4068 * split. An attempt will be made at merging with adjacent extents.
4069 *
4070 * The caller is responsible for passing down meta_ac if we'll need it.
4071 */
4072int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *di_bh,
4073 handle_t *handle, u32 cpos, u32 len, u32 phys,
4074 struct ocfs2_alloc_context *meta_ac,
4075 struct ocfs2_cached_dealloc_ctxt *dealloc)
4076{
4077 int ret, index;
4078 u64 start_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys);
4079 struct ocfs2_extent_rec split_rec;
4080 struct ocfs2_path *left_path = NULL;
4081 struct ocfs2_extent_list *el;
4082
4083 mlog(0, "Inode %lu cpos %u, len %u, phys %u (%llu)\n",
4084 inode->i_ino, cpos, len, phys, (unsigned long long)start_blkno);
4085
4086 if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) {
4087 ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents "
4088 "that are being written to, but the feature bit "
4089 "is not set in the super block.",
4090 (unsigned long long)OCFS2_I(inode)->ip_blkno);
4091 ret = -EROFS;
4092 goto out;
4093 }
4094
4095 /*
4096 * XXX: This should be fixed up so that we just re-insert the
4097 * next extent records.
4098 */
4099 ocfs2_extent_map_trunc(inode, 0);
4100
4101 left_path = ocfs2_new_inode_path(di_bh);
4102 if (!left_path) {
4103 ret = -ENOMEM;
4104 mlog_errno(ret);
4105 goto out;
4106 }
4107
4108 ret = ocfs2_find_path(inode, left_path, cpos);
4109 if (ret) {
4110 mlog_errno(ret);
4111 goto out;
4112 }
4113 el = path_leaf_el(left_path);
4114
4115 index = ocfs2_search_extent_list(el, cpos);
4116 if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
4117 ocfs2_error(inode->i_sb,
4118 "Inode %llu has an extent at cpos %u which can no "
4119 "longer be found.\n",
4120 (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos);
4121 ret = -EROFS;
4122 goto out;
4123 }
4124
4125 memset(&split_rec, 0, sizeof(struct ocfs2_extent_rec));
4126 split_rec.e_cpos = cpu_to_le32(cpos);
4127 split_rec.e_leaf_clusters = cpu_to_le16(len);
4128 split_rec.e_blkno = cpu_to_le64(start_blkno);
4129 split_rec.e_flags = path_leaf_el(left_path)->l_recs[index].e_flags;
4130 split_rec.e_flags &= ~OCFS2_EXT_UNWRITTEN;
4131
4132 ret = __ocfs2_mark_extent_written(inode, di_bh, handle, left_path,
4133 index, &split_rec, meta_ac, dealloc);
4134 if (ret)
4135 mlog_errno(ret);
4136
4137out:
4138 ocfs2_free_path(left_path);
4139 return ret;
4140}
4141
4142static int ocfs2_split_tree(struct inode *inode, struct buffer_head *di_bh,
4143 handle_t *handle, struct ocfs2_path *path,
4144 int index, u32 new_range,
4145 struct ocfs2_alloc_context *meta_ac)
4146{
4147 int ret, depth, credits = handle->h_buffer_credits;
4148 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
4149 struct buffer_head *last_eb_bh = NULL;
4150 struct ocfs2_extent_block *eb;
4151 struct ocfs2_extent_list *rightmost_el, *el;
4152 struct ocfs2_extent_rec split_rec;
4153 struct ocfs2_extent_rec *rec;
4154 struct ocfs2_insert_type insert;
4155
4156 /*
4157 * Setup the record to split before we grow the tree.
4158 */
4159 el = path_leaf_el(path);
4160 rec = &el->l_recs[index];
4161 ocfs2_make_right_split_rec(inode->i_sb, &split_rec, new_range, rec);
4162
4163 depth = path->p_tree_depth;
4164 if (depth > 0) {
4165 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
4166 le64_to_cpu(di->i_last_eb_blk),
4167 &last_eb_bh, OCFS2_BH_CACHED, inode);
4168 if (ret < 0) {
4169 mlog_errno(ret);
4170 goto out;
4171 }
4172
4173 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
4174 rightmost_el = &eb->h_list;
4175 } else
4176 rightmost_el = path_leaf_el(path);
4177
4178 credits += path->p_tree_depth + ocfs2_extend_meta_needed(di);
4179 ret = ocfs2_extend_trans(handle, credits);
4180 if (ret) {
4181 mlog_errno(ret);
4182 goto out;
4183 }
4184
4185 if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
4186 le16_to_cpu(rightmost_el->l_count)) {
4187 int old_depth = depth;
4188
4189 ret = ocfs2_grow_tree(inode, handle, di_bh, &depth, &last_eb_bh,
4190 meta_ac);
4191 if (ret) {
4192 mlog_errno(ret);
4193 goto out;
4194 }
4195
4196 if (old_depth != depth) {
4197 eb = (struct ocfs2_extent_block *)last_eb_bh->b_data;
4198 rightmost_el = &eb->h_list;
4199 }
4200 }
4201
4202 memset(&insert, 0, sizeof(struct ocfs2_insert_type));
4203 insert.ins_appending = APPEND_NONE;
4204 insert.ins_contig = CONTIG_NONE;
4205 insert.ins_split = SPLIT_RIGHT;
4206 insert.ins_free_records = le16_to_cpu(rightmost_el->l_count)
4207 - le16_to_cpu(rightmost_el->l_next_free_rec);
4208 insert.ins_tree_depth = depth;
4209
4210 ret = ocfs2_do_insert_extent(inode, handle, di_bh, &split_rec, &insert);
4211 if (ret)
4212 mlog_errno(ret);
4213
4214out:
4215 brelse(last_eb_bh);
4216 return ret;
4217}
4218
4219static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
4220 struct ocfs2_path *path, int index,
4221 struct ocfs2_cached_dealloc_ctxt *dealloc,
4222 u32 cpos, u32 len)
4223{
4224 int ret;
4225 u32 left_cpos, rec_range, trunc_range;
4226 int wants_rotate = 0, is_rightmost_tree_rec = 0;
4227 struct super_block *sb = inode->i_sb;
4228 struct ocfs2_path *left_path = NULL;
4229 struct ocfs2_extent_list *el = path_leaf_el(path);
4230 struct ocfs2_extent_rec *rec;
4231 struct ocfs2_extent_block *eb;
4232
4233 if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) {
4234 ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc);
4235 if (ret) {
4236 mlog_errno(ret);
4237 goto out;
4238 }
4239
4240 index--;
4241 }
4242
4243 if (index == (le16_to_cpu(el->l_next_free_rec) - 1) &&
4244 path->p_tree_depth) {
4245 /*
4246 * Check whether this is the rightmost tree record. If
4247 * we remove all of this record or part of its right
4248 * edge then an update of the record lengths above it
4249 * will be required.
4250 */
4251 eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
4252 if (eb->h_next_leaf_blk == 0)
4253 is_rightmost_tree_rec = 1;
4254 }
4255
4256 rec = &el->l_recs[index];
4257 if (index == 0 && path->p_tree_depth &&
4258 le32_to_cpu(rec->e_cpos) == cpos) {
4259 /*
4260 * Changing the leftmost offset (via partial or whole
4261 * record truncate) of an interior (or rightmost) path
4262 * means we have to update the subtree that is formed
4263 * by this leaf and the one to its left.
4264 *
4265 * There are two cases we can skip:
4266 * 1) Path is the leftmost one in our inode tree.
4267 * 2) The leaf is rightmost and will be empty after
4268 * we remove the extent record - the rotate code
4269 * knows how to update the newly formed edge.
4270 */
4271
4272 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path,
4273 &left_cpos);
4274 if (ret) {
4275 mlog_errno(ret);
4276 goto out;
4277 }
4278
4279 if (left_cpos && le16_to_cpu(el->l_next_free_rec) > 1) {
4280 left_path = ocfs2_new_path(path_root_bh(path),
4281 path_root_el(path));
4282 if (!left_path) {
4283 ret = -ENOMEM;
4284 mlog_errno(ret);
4285 goto out;
4286 }
4287
4288 ret = ocfs2_find_path(inode, left_path, left_cpos);
4289 if (ret) {
4290 mlog_errno(ret);
4291 goto out;
4292 }
4293 }
4294 }
4295
4296 ret = ocfs2_extend_rotate_transaction(handle, 0,
4297 handle->h_buffer_credits,
4298 path);
4299 if (ret) {
4300 mlog_errno(ret);
4301 goto out;
4302 }
4303
4304 ret = ocfs2_journal_access_path(inode, handle, path);
4305 if (ret) {
4306 mlog_errno(ret);
4307 goto out;
4308 }
4309
4310 ret = ocfs2_journal_access_path(inode, handle, left_path);
4311 if (ret) {
4312 mlog_errno(ret);
4313 goto out;
4314 }
4315
4316 rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
4317 trunc_range = cpos + len;
4318
4319 if (le32_to_cpu(rec->e_cpos) == cpos && rec_range == trunc_range) {
4320 int next_free;
4321
4322 memset(rec, 0, sizeof(*rec));
4323 ocfs2_cleanup_merge(el, index);
4324 wants_rotate = 1;
4325
4326 next_free = le16_to_cpu(el->l_next_free_rec);
4327 if (is_rightmost_tree_rec && next_free > 1) {
4328 /*
4329 * We skip the edge update if this path will
4330 * be deleted by the rotate code.
4331 */
4332 rec = &el->l_recs[next_free - 1];
4333 ocfs2_adjust_rightmost_records(inode, handle, path,
4334 rec);
4335 }
4336 } else if (le32_to_cpu(rec->e_cpos) == cpos) {
4337 /* Remove leftmost portion of the record. */
4338 le32_add_cpu(&rec->e_cpos, len);
4339 le64_add_cpu(&rec->e_blkno, ocfs2_clusters_to_blocks(sb, len));
4340 le16_add_cpu(&rec->e_leaf_clusters, -len);
4341 } else if (rec_range == trunc_range) {
4342 /* Remove rightmost portion of the record */
4343 le16_add_cpu(&rec->e_leaf_clusters, -len);
4344 if (is_rightmost_tree_rec)
4345 ocfs2_adjust_rightmost_records(inode, handle, path, rec);
4346 } else {
4347 /* Caller should have trapped this. */
4348 mlog(ML_ERROR, "Inode %llu: Invalid record truncate: (%u, %u) "
4349 "(%u, %u)\n", (unsigned long long)OCFS2_I(inode)->ip_blkno,
4350 le32_to_cpu(rec->e_cpos),
4351 le16_to_cpu(rec->e_leaf_clusters), cpos, len);
4352 BUG();
4353 }
4354
4355 if (left_path) {
4356 int subtree_index;
4357
4358 subtree_index = ocfs2_find_subtree_root(inode, left_path, path);
4359 ocfs2_complete_edge_insert(inode, handle, left_path, path,
4360 subtree_index);
4361 }
4362
4363 ocfs2_journal_dirty(handle, path_leaf_bh(path));
4364
4365 ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc);
4366 if (ret) {
4367 mlog_errno(ret);
4368 goto out;
4369 }
4370
4371out:
4372 ocfs2_free_path(left_path);
4373 return ret;
4374}
4375
4376int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh,
4377 u32 cpos, u32 len, handle_t *handle,
4378 struct ocfs2_alloc_context *meta_ac,
4379 struct ocfs2_cached_dealloc_ctxt *dealloc)
4380{
4381 int ret, index;
4382 u32 rec_range, trunc_range;
4383 struct ocfs2_extent_rec *rec;
4384 struct ocfs2_extent_list *el;
4385 struct ocfs2_path *path;
4386
4387 ocfs2_extent_map_trunc(inode, 0);
4388
4389 path = ocfs2_new_inode_path(di_bh);
4390 if (!path) {
4391 ret = -ENOMEM;
4392 mlog_errno(ret);
4393 goto out;
4394 }
4395
4396 ret = ocfs2_find_path(inode, path, cpos);
4397 if (ret) {
4398 mlog_errno(ret);
4399 goto out;
4400 }
4401
4402 el = path_leaf_el(path);
4403 index = ocfs2_search_extent_list(el, cpos);
4404 if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
4405 ocfs2_error(inode->i_sb,
4406 "Inode %llu has an extent at cpos %u which can no "
4407 "longer be found.\n",
4408 (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos);
4409 ret = -EROFS;
4410 goto out;
4411 }
4412
4413 /*
4414 * We have 3 cases of extent removal:
4415 * 1) Range covers the entire extent rec
4416 * 2) Range begins or ends on one edge of the extent rec
4417 * 3) Range is in the middle of the extent rec (no shared edges)
4418 *
4419 * For case 1 we remove the extent rec and left rotate to
4420 * fill the hole.
4421 *
4422 * For case 2 we just shrink the existing extent rec, with a
4423 * tree update if the shrinking edge is also the edge of an
4424 * extent block.
4425 *
4426 * For case 3 we do a right split to turn the extent rec into
4427 * something case 2 can handle.
4428 */
4429 rec = &el->l_recs[index];
4430 rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
4431 trunc_range = cpos + len;
4432
4433 BUG_ON(cpos < le32_to_cpu(rec->e_cpos) || trunc_range > rec_range);
4434
4435 mlog(0, "Inode %llu, remove (cpos %u, len %u). Existing index %d "
4436 "(cpos %u, len %u)\n",
4437 (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos, len, index,
4438 le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec));
4439
4440 if (le32_to_cpu(rec->e_cpos) == cpos || rec_range == trunc_range) {
4441 ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc,
4442 cpos, len);
4443 if (ret) {
4444 mlog_errno(ret);
4445 goto out;
4446 }
4447 } else {
4448 ret = ocfs2_split_tree(inode, di_bh, handle, path, index,
4449 trunc_range, meta_ac);
4450 if (ret) {
4451 mlog_errno(ret);
4452 goto out;
4453 }
4454
4455 /*
4456 * The split could have manipulated the tree enough to
4457 * move the record location, so we have to look for it again.
4458 */
4459 ocfs2_reinit_path(path, 1);
4460
4461 ret = ocfs2_find_path(inode, path, cpos);
4462 if (ret) {
4463 mlog_errno(ret);
4464 goto out;
4465 }
4466
4467 el = path_leaf_el(path);
4468 index = ocfs2_search_extent_list(el, cpos);
4469 if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
4470 ocfs2_error(inode->i_sb,
4471 "Inode %llu: split at cpos %u lost record.",
4472 (unsigned long long)OCFS2_I(inode)->ip_blkno,
4473 cpos);
4474 ret = -EROFS;
4475 goto out;
4476 }
4477
4478 /*
4479 * Double check our values here. If anything is fishy,
4480 * it's easier to catch it at the top level.
4481 */
4482 rec = &el->l_recs[index];
4483 rec_range = le32_to_cpu(rec->e_cpos) +
4484 ocfs2_rec_clusters(el, rec);
4485 if (rec_range != trunc_range) {
4486 ocfs2_error(inode->i_sb,
4487 "Inode %llu: error after split at cpos %u"
4488 "trunc len %u, existing record is (%u,%u)",
4489 (unsigned long long)OCFS2_I(inode)->ip_blkno,
4490 cpos, len, le32_to_cpu(rec->e_cpos),
4491 ocfs2_rec_clusters(el, rec));
4492 ret = -EROFS;
4493 goto out;
4494 }
4495
4496 ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc,
4497 cpos, len);
4498 if (ret) {
4499 mlog_errno(ret);
4500 goto out;
4501 }
4502 }
4503
4504out:
4505 ocfs2_free_path(path);
4506 return ret;
4507}
4508
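Illustrative sketch (not part of this patch): the case analysis described in the comment inside ocfs2_remove_extent(), on plain host-endian values. Cases 1 and 2 go straight to the record truncate; case 3 needs a right split first so the interior range gains a shared edge. The enum names are stand-ins for the three comment cases, not kernel identifiers.

#include <stdio.h>
#include <stdint.h>

enum removal_case {
	REMOVE_WHOLE_REC,	/* case 1: range covers the entire record */
	REMOVE_EDGE,		/* case 2: range shares one edge          */
	REMOVE_MIDDLE,		/* case 3: interior range, split first    */
};

static enum removal_case classify(uint32_t rec_cpos, uint32_t rec_clusters,
				  uint32_t cpos, uint32_t len)
{
	uint32_t rec_range   = rec_cpos + rec_clusters;
	uint32_t trunc_range = cpos + len;

	if (cpos == rec_cpos && trunc_range == rec_range)
		return REMOVE_WHOLE_REC;
	if (cpos == rec_cpos || trunc_range == rec_range)
		return REMOVE_EDGE;
	return REMOVE_MIDDLE;
}

int main(void)
{
	printf("%d\n", classify(0, 10, 0, 10));	/* 0 = REMOVE_WHOLE_REC */
	printf("%d\n", classify(0, 10, 6, 4));	/* 1 = REMOVE_EDGE      */
	printf("%d\n", classify(0, 10, 3, 4));	/* 2 = REMOVE_MIDDLE    */
	return 0;
}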
4509int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb)
2435{ 4510{
2436 struct buffer_head *tl_bh = osb->osb_tl_bh; 4511 struct buffer_head *tl_bh = osb->osb_tl_bh;
2437 struct ocfs2_dinode *di; 4512 struct ocfs2_dinode *di;
@@ -2464,10 +4539,10 @@ static int ocfs2_truncate_log_can_coalesce(struct ocfs2_truncate_log *tl,
2464 return current_tail == new_start; 4539 return current_tail == new_start;
2465} 4540}
2466 4541
2467static int ocfs2_truncate_log_append(struct ocfs2_super *osb, 4542int ocfs2_truncate_log_append(struct ocfs2_super *osb,
2468 handle_t *handle, 4543 handle_t *handle,
2469 u64 start_blk, 4544 u64 start_blk,
2470 unsigned int num_clusters) 4545 unsigned int num_clusters)
2471{ 4546{
2472 int status, index; 4547 int status, index;
2473 unsigned int start_cluster, tl_count; 4548 unsigned int start_cluster, tl_count;
@@ -2623,7 +4698,7 @@ bail:
2623} 4698}
2624 4699
2625/* Expects you to already be holding tl_inode->i_mutex */ 4700/* Expects you to already be holding tl_inode->i_mutex */
2626static int __ocfs2_flush_truncate_log(struct ocfs2_super *osb) 4701int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
2627{ 4702{
2628 int status; 4703 int status;
2629 unsigned int num_to_flush; 4704 unsigned int num_to_flush;
@@ -2957,6 +5032,219 @@ int ocfs2_truncate_log_init(struct ocfs2_super *osb)
2957 return status; 5032 return status;
2958} 5033}
2959 5034
5035/*
5036 * Delayed de-allocation of suballocator blocks.
5037 *
5038 * Some sets of block de-allocations might involve multiple suballocator inodes.
5039 *
5040 * The locking for this can get extremely complicated, especially when
5041 * the suballocator inodes to delete from aren't known until deep
5042 * within an unrelated codepath.
5043 *
5044 * ocfs2_extent_block structures are a good example of this - an inode
5045 * btree could have been grown by any number of nodes each allocating
5046 * out of their own suballoc inode.
5047 *
5048 * These structures allow the delay of block de-allocation until a
5049 * later time, when locking of multiple cluster inodes won't cause
5050 * deadlock.
5051 */
5052
5053/*
5054 * Describes a single block free from a suballocator
5055 */
5056struct ocfs2_cached_block_free {
5057 struct ocfs2_cached_block_free *free_next;
5058 u64 free_blk;
5059 unsigned int free_bit;
5060};
5061
5062struct ocfs2_per_slot_free_list {
5063 struct ocfs2_per_slot_free_list *f_next_suballocator;
5064 int f_inode_type;
5065 int f_slot;
5066 struct ocfs2_cached_block_free *f_first;
5067};
5068
5069static int ocfs2_free_cached_items(struct ocfs2_super *osb,
5070 int sysfile_type,
5071 int slot,
5072 struct ocfs2_cached_block_free *head)
5073{
5074 int ret;
5075 u64 bg_blkno;
5076 handle_t *handle;
5077 struct inode *inode;
5078 struct buffer_head *di_bh = NULL;
5079 struct ocfs2_cached_block_free *tmp;
5080
5081 inode = ocfs2_get_system_file_inode(osb, sysfile_type, slot);
5082 if (!inode) {
5083 ret = -EINVAL;
5084 mlog_errno(ret);
5085 goto out;
5086 }
5087
5088 mutex_lock(&inode->i_mutex);
5089
5090 ret = ocfs2_meta_lock(inode, &di_bh, 1);
5091 if (ret) {
5092 mlog_errno(ret);
5093 goto out_mutex;
5094 }
5095
5096 handle = ocfs2_start_trans(osb, OCFS2_SUBALLOC_FREE);
5097 if (IS_ERR(handle)) {
5098 ret = PTR_ERR(handle);
5099 mlog_errno(ret);
5100 goto out_unlock;
5101 }
5102
5103 while (head) {
5104 bg_blkno = ocfs2_which_suballoc_group(head->free_blk,
5105 head->free_bit);
5106 mlog(0, "Free bit: (bit %u, blkno %llu)\n",
5107 head->free_bit, (unsigned long long)head->free_blk);
5108
5109 ret = ocfs2_free_suballoc_bits(handle, inode, di_bh,
5110 head->free_bit, bg_blkno, 1);
5111 if (ret) {
5112 mlog_errno(ret);
5113 goto out_journal;
5114 }
5115
5116 ret = ocfs2_extend_trans(handle, OCFS2_SUBALLOC_FREE);
5117 if (ret) {
5118 mlog_errno(ret);
5119 goto out_journal;
5120 }
5121
5122 tmp = head;
5123 head = head->free_next;
5124 kfree(tmp);
5125 }
5126
5127out_journal:
5128 ocfs2_commit_trans(osb, handle);
5129
5130out_unlock:
5131 ocfs2_meta_unlock(inode, 1);
5132 brelse(di_bh);
5133out_mutex:
5134 mutex_unlock(&inode->i_mutex);
5135 iput(inode);
5136out:
5137 while(head) {
5138 /* Premature exit may have left some dangling items. */
5139 tmp = head;
5140 head = head->free_next;
5141 kfree(tmp);
5142 }
5143
5144 return ret;
5145}
5146
5147int ocfs2_run_deallocs(struct ocfs2_super *osb,
5148 struct ocfs2_cached_dealloc_ctxt *ctxt)
5149{
5150 int ret = 0, ret2;
5151 struct ocfs2_per_slot_free_list *fl;
5152
5153 if (!ctxt)
5154 return 0;
5155
5156 while (ctxt->c_first_suballocator) {
5157 fl = ctxt->c_first_suballocator;
5158
5159 if (fl->f_first) {
5160 mlog(0, "Free items: (type %u, slot %d)\n",
5161 fl->f_inode_type, fl->f_slot);
5162 ret2 = ocfs2_free_cached_items(osb, fl->f_inode_type,
5163 fl->f_slot, fl->f_first);
5164 if (ret2)
5165 mlog_errno(ret2);
5166 if (!ret)
5167 ret = ret2;
5168 }
5169
5170 ctxt->c_first_suballocator = fl->f_next_suballocator;
5171 kfree(fl);
5172 }
5173
5174 return ret;
5175}
5176
5177static struct ocfs2_per_slot_free_list *
5178ocfs2_find_per_slot_free_list(int type,
5179 int slot,
5180 struct ocfs2_cached_dealloc_ctxt *ctxt)
5181{
5182 struct ocfs2_per_slot_free_list *fl = ctxt->c_first_suballocator;
5183
5184 while (fl) {
5185 if (fl->f_inode_type == type && fl->f_slot == slot)
5186 return fl;
5187
5188 fl = fl->f_next_suballocator;
5189 }
5190
5191 fl = kmalloc(sizeof(*fl), GFP_NOFS);
5192 if (fl) {
5193 fl->f_inode_type = type;
5194 fl->f_slot = slot;
5195 fl->f_first = NULL;
5196 fl->f_next_suballocator = ctxt->c_first_suballocator;
5197
5198 ctxt->c_first_suballocator = fl;
5199 }
5200 return fl;
5201}
5202
5203static int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
5204 int type, int slot, u64 blkno,
5205 unsigned int bit)
5206{
5207 int ret;
5208 struct ocfs2_per_slot_free_list *fl;
5209 struct ocfs2_cached_block_free *item;
5210
5211 fl = ocfs2_find_per_slot_free_list(type, slot, ctxt);
5212 if (fl == NULL) {
5213 ret = -ENOMEM;
5214 mlog_errno(ret);
5215 goto out;
5216 }
5217
5218 item = kmalloc(sizeof(*item), GFP_NOFS);
5219 if (item == NULL) {
5220 ret = -ENOMEM;
5221 mlog_errno(ret);
5222 goto out;
5223 }
5224
5225 mlog(0, "Insert: (type %d, slot %u, bit %u, blk %llu)\n",
5226 type, slot, bit, (unsigned long long)blkno);
5227
5228 item->free_blk = blkno;
5229 item->free_bit = bit;
5230 item->free_next = fl->f_first;
5231
5232 fl->f_first = item;
5233
5234 ret = 0;
5235out:
5236 return ret;
5237}
5238
5239static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
5240 struct ocfs2_extent_block *eb)
5241{
5242 return ocfs2_cache_block_dealloc(ctxt, EXTENT_ALLOC_SYSTEM_INODE,
5243 le16_to_cpu(eb->h_suballoc_slot),
5244 le64_to_cpu(eb->h_blkno),
5245 le16_to_cpu(eb->h_suballoc_bit));
5246}
5247
2960/* This function will figure out whether the currently last extent 5248/* This function will figure out whether the currently last extent
2961 * block will be deleted, and if it will, what the new last extent 5249 * block will be deleted, and if it will, what the new last extent
2962 * block will be so we can update his h_next_leaf_blk field, as well 5250 * block will be so we can update his h_next_leaf_blk field, as well
@@ -3238,27 +5526,10 @@ delete:
3238 BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos)); 5526 BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos));
3239 BUG_ON(le64_to_cpu(el->l_recs[0].e_blkno)); 5527 BUG_ON(le64_to_cpu(el->l_recs[0].e_blkno));
3240 5528
3241 if (le16_to_cpu(eb->h_suballoc_slot) == 0) { 5529 ret = ocfs2_cache_extent_block_free(&tc->tc_dealloc, eb);
3242 /* 5530 /* An error here is not fatal. */
3243 * This code only understands how to 5531 if (ret < 0)
3244 * lock the suballocator in slot 0, 5532 mlog_errno(ret);
3245 * which is fine because allocation is
3246 * only ever done out of that
3247 * suballocator too. A future version
3248 * might change that however, so avoid
3249 * a free if we don't know how to
3250 * handle it. This way an fs incompat
3251 * bit will not be necessary.
3252 */
3253 ret = ocfs2_free_extent_block(handle,
3254 tc->tc_ext_alloc_inode,
3255 tc->tc_ext_alloc_bh,
3256 eb);
3257
3258 /* An error here is not fatal. */
3259 if (ret < 0)
3260 mlog_errno(ret);
3261 }
3262 } else { 5533 } else {
3263 deleted_eb = 0; 5534 deleted_eb = 0;
3264 } 5535 }
@@ -3397,9 +5668,9 @@ static int ocfs2_ordered_zero_func(handle_t *handle, struct buffer_head *bh)
3397 return ocfs2_journal_dirty_data(handle, bh); 5668 return ocfs2_journal_dirty_data(handle, bh);
3398} 5669}
3399 5670
3400static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t isize, 5671static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t start,
3401 struct page **pages, int numpages, 5672 loff_t end, struct page **pages,
3402 u64 phys, handle_t *handle) 5673 int numpages, u64 phys, handle_t *handle)
3403{ 5674{
3404 int i, ret, partial = 0; 5675 int i, ret, partial = 0;
3405 void *kaddr; 5676 void *kaddr;
@@ -3412,26 +5683,14 @@ static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t isize,
3412 if (numpages == 0) 5683 if (numpages == 0)
3413 goto out; 5684 goto out;
3414 5685
3415 from = isize & (PAGE_CACHE_SIZE - 1); /* 1st page offset */ 5686 to = PAGE_CACHE_SIZE;
3416 if (PAGE_CACHE_SHIFT > OCFS2_SB(sb)->s_clustersize_bits) {
3417 /*
3418 * Since 'from' has been capped to a value below page
3419 * size, this calculation won't be able to overflow
3420 * 'to'
3421 */
3422 to = ocfs2_align_bytes_to_clusters(sb, from);
3423
3424 /*
3425 * The truncate tail in this case should never contain
3426 * more than one page at maximum. The loop below also
3427 * assumes this.
3428 */
3429 BUG_ON(numpages != 1);
3430 }
3431
3432 for(i = 0; i < numpages; i++) { 5687 for(i = 0; i < numpages; i++) {
3433 page = pages[i]; 5688 page = pages[i];
3434 5689
5690 from = start & (PAGE_CACHE_SIZE - 1);
5691 if ((end >> PAGE_CACHE_SHIFT) == page->index)
5692 to = end & (PAGE_CACHE_SIZE - 1);
5693
3435 BUG_ON(from > PAGE_CACHE_SIZE); 5694 BUG_ON(from > PAGE_CACHE_SIZE);
3436 BUG_ON(to > PAGE_CACHE_SIZE); 5695 BUG_ON(to > PAGE_CACHE_SIZE);
3437 5696
@@ -3468,10 +5727,7 @@ static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t isize,
3468 5727
3469 flush_dcache_page(page); 5728 flush_dcache_page(page);
3470 5729
3471 /* 5730 start = (page->index + 1) << PAGE_CACHE_SHIFT;
3472 * Every page after the 1st one should be completely zero'd.
3473 */
3474 from = 0;
3475 } 5731 }
3476out: 5732out:
3477 if (pages) { 5733 if (pages) {
@@ -3484,24 +5740,26 @@ out:
3484 } 5740 }
3485} 5741}
3486 5742
3487static int ocfs2_grab_eof_pages(struct inode *inode, loff_t isize, struct page **pages, 5743static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end,
3488 int *num, u64 *phys) 5744 struct page **pages, int *num, u64 *phys)
3489{ 5745{
3490 int i, numpages = 0, ret = 0; 5746 int i, numpages = 0, ret = 0;
3491 unsigned int csize = OCFS2_SB(inode->i_sb)->s_clustersize;
3492 unsigned int ext_flags; 5747 unsigned int ext_flags;
3493 struct super_block *sb = inode->i_sb; 5748 struct super_block *sb = inode->i_sb;
3494 struct address_space *mapping = inode->i_mapping; 5749 struct address_space *mapping = inode->i_mapping;
3495 unsigned long index; 5750 unsigned long index;
3496 u64 next_cluster_bytes; 5751 loff_t last_page_bytes;
3497 5752
3498 BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb))); 5753 BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb)));
5754 BUG_ON(start > end);
3499 5755
3500 /* Cluster boundary, so we don't need to grab any pages. */ 5756 if (start == end)
3501 if ((isize & (csize - 1)) == 0)
3502 goto out; 5757 goto out;
3503 5758
3504 ret = ocfs2_extent_map_get_blocks(inode, isize >> sb->s_blocksize_bits, 5759 BUG_ON(start >> OCFS2_SB(sb)->s_clustersize_bits !=
5760 (end - 1) >> OCFS2_SB(sb)->s_clustersize_bits);
5761
5762 ret = ocfs2_extent_map_get_blocks(inode, start >> sb->s_blocksize_bits,
3505 phys, NULL, &ext_flags); 5763 phys, NULL, &ext_flags);
3506 if (ret) { 5764 if (ret) {
3507 mlog_errno(ret); 5765 mlog_errno(ret);
@@ -3517,8 +5775,8 @@ static int ocfs2_grab_eof_pages(struct inode *inode, loff_t isize, struct page *
3517 if (ext_flags & OCFS2_EXT_UNWRITTEN) 5775 if (ext_flags & OCFS2_EXT_UNWRITTEN)
3518 goto out; 5776 goto out;
3519 5777
3520 next_cluster_bytes = ocfs2_align_bytes_to_clusters(inode->i_sb, isize); 5778 last_page_bytes = PAGE_ALIGN(end);
3521 index = isize >> PAGE_CACHE_SHIFT; 5779 index = start >> PAGE_CACHE_SHIFT;
3522 do { 5780 do {
3523 pages[numpages] = grab_cache_page(mapping, index); 5781 pages[numpages] = grab_cache_page(mapping, index);
3524 if (!pages[numpages]) { 5782 if (!pages[numpages]) {
@@ -3529,7 +5787,7 @@ static int ocfs2_grab_eof_pages(struct inode *inode, loff_t isize, struct page *
3529 5787
3530 numpages++; 5788 numpages++;
3531 index++; 5789 index++;
3532 } while (index < (next_cluster_bytes >> PAGE_CACHE_SHIFT)); 5790 } while (index < (last_page_bytes >> PAGE_CACHE_SHIFT));
3533 5791
3534out: 5792out:
3535 if (ret != 0) { 5793 if (ret != 0) {
@@ -3558,11 +5816,10 @@ out:
3558 * otherwise block_write_full_page() will skip writeout of pages past 5816 * otherwise block_write_full_page() will skip writeout of pages past
3559 * i_size. The new_i_size parameter is passed for this reason. 5817 * i_size. The new_i_size parameter is passed for this reason.
3560 */ 5818 */
3561int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle, 5819int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
3562 u64 new_i_size) 5820 u64 range_start, u64 range_end)
3563{ 5821{
3564 int ret, numpages; 5822 int ret, numpages;
3565 loff_t endbyte;
3566 struct page **pages = NULL; 5823 struct page **pages = NULL;
3567 u64 phys; 5824 u64 phys;
3568 5825
@@ -3581,7 +5838,8 @@ int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle,
3581 goto out; 5838 goto out;
3582 } 5839 }
3583 5840
3584 ret = ocfs2_grab_eof_pages(inode, new_i_size, pages, &numpages, &phys); 5841 ret = ocfs2_grab_eof_pages(inode, range_start, range_end, pages,
5842 &numpages, &phys);
3585 if (ret) { 5843 if (ret) {
3586 mlog_errno(ret); 5844 mlog_errno(ret);
3587 goto out; 5845 goto out;
@@ -3590,17 +5848,16 @@ int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle,
3590 if (numpages == 0) 5848 if (numpages == 0)
3591 goto out; 5849 goto out;
3592 5850
3593 ocfs2_zero_cluster_pages(inode, new_i_size, pages, numpages, phys, 5851 ocfs2_zero_cluster_pages(inode, range_start, range_end, pages,
3594 handle); 5852 numpages, phys, handle);
3595 5853
3596 /* 5854 /*
3597 * Initiate writeout of the pages we zero'd here. We don't 5855 * Initiate writeout of the pages we zero'd here. We don't
3598 * wait on them - the truncate_inode_pages() call later will 5856 * wait on them - the truncate_inode_pages() call later will
3599 * do that for us. 5857 * do that for us.
3600 */ 5858 */
3601 endbyte = ocfs2_align_bytes_to_clusters(inode->i_sb, new_i_size); 5859 ret = do_sync_mapping_range(inode->i_mapping, range_start,
3602 ret = do_sync_mapping_range(inode->i_mapping, new_i_size, 5860 range_end - 1, SYNC_FILE_RANGE_WRITE);
3603 endbyte - 1, SYNC_FILE_RANGE_WRITE);
3604 if (ret) 5861 if (ret)
3605 mlog_errno(ret); 5862 mlog_errno(ret);
3606 5863
@@ -3631,8 +5888,6 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
3631 5888
3632 mlog_entry_void(); 5889 mlog_entry_void();
3633 5890
3634 down_write(&OCFS2_I(inode)->ip_alloc_sem);
3635
3636 new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb, 5891 new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb,
3637 i_size_read(inode)); 5892 i_size_read(inode));
3638 5893
@@ -3754,7 +6009,6 @@ start:
3754 goto start; 6009 goto start;
3755 6010
3756bail: 6011bail:
3757 up_write(&OCFS2_I(inode)->ip_alloc_sem);
3758 6012
3759 ocfs2_schedule_truncate_log_flush(osb, 1); 6013 ocfs2_schedule_truncate_log_flush(osb, 1);
3760 6014
@@ -3764,6 +6018,8 @@ bail:
3764 if (handle) 6018 if (handle)
3765 ocfs2_commit_trans(osb, handle); 6019 ocfs2_commit_trans(osb, handle);
3766 6020
6021 ocfs2_run_deallocs(osb, &tc->tc_dealloc);
6022
3767 ocfs2_free_path(path); 6023 ocfs2_free_path(path);
3768 6024
3769 /* This will drop the ext_alloc cluster lock for us */ 6025 /* This will drop the ext_alloc cluster lock for us */
@@ -3774,23 +6030,18 @@ bail:
3774} 6030}
3775 6031
3776/* 6032/*
3777 * Expects the inode to already be locked. This will figure out which 6033 * Expects the inode to already be locked.
3778 * inodes need to be locked and will put them on the returned truncate
3779 * context.
3780 */ 6034 */
3781int ocfs2_prepare_truncate(struct ocfs2_super *osb, 6035int ocfs2_prepare_truncate(struct ocfs2_super *osb,
3782 struct inode *inode, 6036 struct inode *inode,
3783 struct buffer_head *fe_bh, 6037 struct buffer_head *fe_bh,
3784 struct ocfs2_truncate_context **tc) 6038 struct ocfs2_truncate_context **tc)
3785{ 6039{
3786 int status, metadata_delete, i; 6040 int status;
3787 unsigned int new_i_clusters; 6041 unsigned int new_i_clusters;
3788 struct ocfs2_dinode *fe; 6042 struct ocfs2_dinode *fe;
3789 struct ocfs2_extent_block *eb; 6043 struct ocfs2_extent_block *eb;
3790 struct ocfs2_extent_list *el;
3791 struct buffer_head *last_eb_bh = NULL; 6044 struct buffer_head *last_eb_bh = NULL;
3792 struct inode *ext_alloc_inode = NULL;
3793 struct buffer_head *ext_alloc_bh = NULL;
3794 6045
3795 mlog_entry_void(); 6046 mlog_entry_void();
3796 6047
@@ -3810,12 +6061,9 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb,
3810 mlog_errno(status); 6061 mlog_errno(status);
3811 goto bail; 6062 goto bail;
3812 } 6063 }
6064 ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc);
3813 6065
3814 metadata_delete = 0;
3815 if (fe->id2.i_list.l_tree_depth) { 6066 if (fe->id2.i_list.l_tree_depth) {
3816 /* If we have a tree, then the truncate may result in
3817 * metadata deletes. Figure this out from the
3818 * rightmost leaf block.*/
3819 status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk), 6067 status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
3820 &last_eb_bh, OCFS2_BH_CACHED, inode); 6068 &last_eb_bh, OCFS2_BH_CACHED, inode);
3821 if (status < 0) { 6069 if (status < 0) {
@@ -3830,43 +6078,10 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb,
3830 status = -EIO; 6078 status = -EIO;
3831 goto bail; 6079 goto bail;
3832 } 6080 }
3833 el = &(eb->h_list);
3834
3835 i = 0;
3836 if (ocfs2_is_empty_extent(&el->l_recs[0]))
3837 i = 1;
3838 /*
3839 * XXX: Should we check that next_free_rec contains
3840 * the extent?
3841 */
3842 if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_i_clusters)
3843 metadata_delete = 1;
3844 } 6081 }
3845 6082
3846 (*tc)->tc_last_eb_bh = last_eb_bh; 6083 (*tc)->tc_last_eb_bh = last_eb_bh;
3847 6084
3848 if (metadata_delete) {
3849 mlog(0, "Will have to delete metadata for this trunc. "
3850 "locking allocator.\n");
3851 ext_alloc_inode = ocfs2_get_system_file_inode(osb, EXTENT_ALLOC_SYSTEM_INODE, 0);
3852 if (!ext_alloc_inode) {
3853 status = -ENOMEM;
3854 mlog_errno(status);
3855 goto bail;
3856 }
3857
3858 mutex_lock(&ext_alloc_inode->i_mutex);
3859 (*tc)->tc_ext_alloc_inode = ext_alloc_inode;
3860
3861 status = ocfs2_meta_lock(ext_alloc_inode, &ext_alloc_bh, 1);
3862 if (status < 0) {
3863 mlog_errno(status);
3864 goto bail;
3865 }
3866 (*tc)->tc_ext_alloc_bh = ext_alloc_bh;
3867 (*tc)->tc_ext_alloc_locked = 1;
3868 }
3869
3870 status = 0; 6085 status = 0;
3871bail: 6086bail:
3872 if (status < 0) { 6087 if (status < 0) {
@@ -3880,16 +6095,13 @@ bail:
3880 6095
3881static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc) 6096static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc)
3882{ 6097{
3883 if (tc->tc_ext_alloc_inode) { 6098 /*
3884 if (tc->tc_ext_alloc_locked) 6099 * The caller is responsible for completing deallocation
3885 ocfs2_meta_unlock(tc->tc_ext_alloc_inode, 1); 6100 * before freeing the context.
3886 6101 */
3887 mutex_unlock(&tc->tc_ext_alloc_inode->i_mutex); 6102 if (tc->tc_dealloc.c_first_suballocator != NULL)
3888 iput(tc->tc_ext_alloc_inode); 6103 mlog(ML_NOTICE,
3889 } 6104 "Truncate completion has non-empty dealloc context\n");
3890
3891 if (tc->tc_ext_alloc_bh)
3892 brelse(tc->tc_ext_alloc_bh);
3893 6105
3894 if (tc->tc_last_eb_bh) 6106 if (tc->tc_last_eb_bh)
3895 brelse(tc->tc_last_eb_bh); 6107 brelse(tc->tc_last_eb_bh);
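
The delayed de-allocation scheme introduced in alloc.c above boils down to a small bookkeeping pattern: each block that should go back to a suballocator is recorded on a per-slot list instead of being freed on the spot, and the accumulated lists are drained later, once the unrelated cluster locks and journal handles are out of the way. The following is a minimal, self-contained userspace sketch of that list-handling pattern only; all names are hypothetical and this is not the ocfs2 API itself.

#include <stdio.h>
#include <stdlib.h>

/* One deferred free, loosely modelled on ocfs2_cached_block_free. */
struct deferred_free {
	struct deferred_free *next;
	unsigned long long blkno;
	unsigned int bit;
};

/* One list per suballocator slot, like ocfs2_per_slot_free_list. */
struct slot_list {
	struct slot_list *next;
	int slot;
	struct deferred_free *first;
};

/* Record a block for later freeing instead of freeing it now. */
static int defer_free(struct slot_list **ctxt, int slot,
		      unsigned long long blkno, unsigned int bit)
{
	struct slot_list *sl;
	struct deferred_free *df;

	for (sl = *ctxt; sl && sl->slot != slot; sl = sl->next)
		;
	if (!sl) {
		sl = calloc(1, sizeof(*sl));
		if (!sl)
			return -1;
		sl->slot = slot;
		sl->next = *ctxt;
		*ctxt = sl;
	}

	df = malloc(sizeof(*df));
	if (!df)
		return -1;
	df->blkno = blkno;
	df->bit = bit;
	df->next = sl->first;
	sl->first = df;
	return 0;
}

/* Later, with no other locks held, drain every per-slot list. */
static void run_deferred_frees(struct slot_list **ctxt)
{
	while (*ctxt) {
		struct slot_list *sl = *ctxt;

		while (sl->first) {
			struct deferred_free *df = sl->first;

			printf("slot %d: free blkno %llu, bit %u\n",
			       sl->slot, df->blkno, df->bit);
			sl->first = df->next;
			free(df);
		}
		*ctxt = sl->next;
		free(sl);
	}
}

int main(void)
{
	struct slot_list *ctxt = NULL;

	defer_free(&ctxt, 0, 2048ULL, 3);
	defer_free(&ctxt, 1, 8192ULL, 11);
	defer_free(&ctxt, 0, 2049ULL, 4);
	run_deferred_frees(&ctxt);
	return 0;
}
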
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index fbcb5934a081..990df48ae8d3 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -34,7 +34,17 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
34 u32 cpos, 34 u32 cpos,
35 u64 start_blk, 35 u64 start_blk,
36 u32 new_clusters, 36 u32 new_clusters,
37 u8 flags,
37 struct ocfs2_alloc_context *meta_ac); 38 struct ocfs2_alloc_context *meta_ac);
39struct ocfs2_cached_dealloc_ctxt;
40int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *di_bh,
41 handle_t *handle, u32 cpos, u32 len, u32 phys,
42 struct ocfs2_alloc_context *meta_ac,
43 struct ocfs2_cached_dealloc_ctxt *dealloc);
44int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh,
45 u32 cpos, u32 len, handle_t *handle,
46 struct ocfs2_alloc_context *meta_ac,
47 struct ocfs2_cached_dealloc_ctxt *dealloc);
38int ocfs2_num_free_extents(struct ocfs2_super *osb, 48int ocfs2_num_free_extents(struct ocfs2_super *osb,
39 struct inode *inode, 49 struct inode *inode,
40 struct ocfs2_dinode *fe); 50 struct ocfs2_dinode *fe);
@@ -62,17 +72,41 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
62 struct ocfs2_dinode **tl_copy); 72 struct ocfs2_dinode **tl_copy);
63int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb, 73int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
64 struct ocfs2_dinode *tl_copy); 74 struct ocfs2_dinode *tl_copy);
75int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb);
76int ocfs2_truncate_log_append(struct ocfs2_super *osb,
77 handle_t *handle,
78 u64 start_blk,
79 unsigned int num_clusters);
80int __ocfs2_flush_truncate_log(struct ocfs2_super *osb);
81
82/*
83 * Process local structure which describes the block unlinks done
84 * during an operation. This is populated via
85 * ocfs2_cache_block_dealloc().
86 *
87 * ocfs2_run_deallocs() should be called after the potentially
88 * de-allocating routines. No journal handles should be open, and most
89 * locks should have been dropped.
90 */
91struct ocfs2_cached_dealloc_ctxt {
92 struct ocfs2_per_slot_free_list *c_first_suballocator;
93};
94static inline void ocfs2_init_dealloc_ctxt(struct ocfs2_cached_dealloc_ctxt *c)
95{
96 c->c_first_suballocator = NULL;
97}
98int ocfs2_run_deallocs(struct ocfs2_super *osb,
99 struct ocfs2_cached_dealloc_ctxt *ctxt);
65 100
66struct ocfs2_truncate_context { 101struct ocfs2_truncate_context {
67 struct inode *tc_ext_alloc_inode; 102 struct ocfs2_cached_dealloc_ctxt tc_dealloc;
68 struct buffer_head *tc_ext_alloc_bh;
69 int tc_ext_alloc_locked; /* is it cluster locked? */ 103 int tc_ext_alloc_locked; /* is it cluster locked? */
70 /* these get destroyed once it's passed to ocfs2_commit_truncate. */ 104 /* these get destroyed once it's passed to ocfs2_commit_truncate. */
71 struct buffer_head *tc_last_eb_bh; 105 struct buffer_head *tc_last_eb_bh;
72}; 106};
73 107
74int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle, 108int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
75 u64 new_i_size); 109 u64 range_start, u64 range_end);
76int ocfs2_prepare_truncate(struct ocfs2_super *osb, 110int ocfs2_prepare_truncate(struct ocfs2_super *osb,
77 struct inode *inode, 111 struct inode *inode,
78 struct buffer_head *fe_bh, 112 struct buffer_head *fe_bh,
@@ -84,6 +118,7 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
84 118
85int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el, 119int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el,
86 u32 cpos, struct buffer_head **leaf_bh); 120 u32 cpos, struct buffer_head **leaf_bh);
121int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster);
87 122
88/* 123/*
89 * Helper function to look at the # of clusters in an extent record. 124 * Helper function to look at the # of clusters in an extent record.
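
The dealloc context declared in this header comes with a usage contract: initialize it, hand it to any routine that may unlink metadata blocks, and call ocfs2_run_deallocs() only after the journal handle has been committed and the heavier locks dropped. A rough sketch of that call order, using the helpers declared in this header and assuming the usual inode/di_bh/handle/meta_ac/osb variables are already set up (error paths and locking elided, so this is illustrative rather than a complete function):

	struct ocfs2_cached_dealloc_ctxt dealloc;
	int ret;

	ocfs2_init_dealloc_ctxt(&dealloc);

	/* Routines that may unlink extent blocks take the context along. */
	ret = ocfs2_remove_extent(inode, di_bh, cpos, len, handle,
				  meta_ac, &dealloc);
	if (ret)
		mlog_errno(ret);

	ocfs2_commit_trans(osb, handle);	/* no journal handle may stay open */

	/* Now, with locks dropped, give the cached blocks back. */
	ret = ocfs2_run_deallocs(osb, &dealloc);
	if (ret)
		mlog_errno(ret);
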
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index a480b09c79b9..84bf6e79de23 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -684,6 +684,8 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
684 bh = bh->b_this_page, block_start += bsize) { 684 bh = bh->b_this_page, block_start += bsize) {
685 block_end = block_start + bsize; 685 block_end = block_start + bsize;
686 686
687 clear_buffer_new(bh);
688
687 /* 689 /*
688 * Ignore blocks outside of our i/o range - 690 * Ignore blocks outside of our i/o range -
689 * they may belong to unallocated clusters. 691 * they may belong to unallocated clusters.
@@ -698,9 +700,8 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
698 * For an allocating write with cluster size >= page 700 * For an allocating write with cluster size >= page
699 * size, we always write the entire page. 701 * size, we always write the entire page.
700 */ 702 */
701 703 if (new)
702 if (buffer_new(bh)) 704 set_buffer_new(bh);
703 clear_buffer_new(bh);
704 705
705 if (!buffer_mapped(bh)) { 706 if (!buffer_mapped(bh)) {
706 map_bh(bh, inode->i_sb, *p_blkno); 707 map_bh(bh, inode->i_sb, *p_blkno);
@@ -711,7 +712,8 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
711 if (!buffer_uptodate(bh)) 712 if (!buffer_uptodate(bh))
712 set_buffer_uptodate(bh); 713 set_buffer_uptodate(bh);
713 } else if (!buffer_uptodate(bh) && !buffer_delay(bh) && 714 } else if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
714 (block_start < from || block_end > to)) { 715 !buffer_new(bh) &&
716 (block_start < from || block_end > to)) {
715 ll_rw_block(READ, 1, &bh); 717 ll_rw_block(READ, 1, &bh);
716 *wait_bh++=bh; 718 *wait_bh++=bh;
717 } 719 }
@@ -738,18 +740,13 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
738 bh = head; 740 bh = head;
739 block_start = 0; 741 block_start = 0;
740 do { 742 do {
741 void *kaddr;
742
743 block_end = block_start + bsize; 743 block_end = block_start + bsize;
744 if (block_end <= from) 744 if (block_end <= from)
745 goto next_bh; 745 goto next_bh;
746 if (block_start >= to) 746 if (block_start >= to)
747 break; 747 break;
748 748
749 kaddr = kmap_atomic(page, KM_USER0); 749 zero_user_page(page, block_start, bh->b_size, KM_USER0);
750 memset(kaddr+block_start, 0, bh->b_size);
751 flush_dcache_page(page);
752 kunmap_atomic(kaddr, KM_USER0);
753 set_buffer_uptodate(bh); 750 set_buffer_uptodate(bh);
754 mark_buffer_dirty(bh); 751 mark_buffer_dirty(bh);
755 752
@@ -761,217 +758,240 @@ next_bh:
761 return ret; 758 return ret;
762} 759}
763 760
761#if (PAGE_CACHE_SIZE >= OCFS2_MAX_CLUSTERSIZE)
762#define OCFS2_MAX_CTXT_PAGES 1
763#else
764#define OCFS2_MAX_CTXT_PAGES (OCFS2_MAX_CLUSTERSIZE / PAGE_CACHE_SIZE)
765#endif
766
767#define OCFS2_MAX_CLUSTERS_PER_PAGE (PAGE_CACHE_SIZE / OCFS2_MIN_CLUSTERSIZE)
768
764/* 769/*
765 * This will copy user data from the buffer page in the splice 770 * Describe the state of a single cluster to be written to.
766 * context.
767 *
768 * For now, we ignore SPLICE_F_MOVE as that would require some extra
769 * communication out all the way to ocfs2_write().
770 */ 771 */
771int ocfs2_map_and_write_splice_data(struct inode *inode, 772struct ocfs2_write_cluster_desc {
772 struct ocfs2_write_ctxt *wc, u64 *p_blkno, 773 u32 c_cpos;
773 unsigned int *ret_from, unsigned int *ret_to) 774 u32 c_phys;
775 /*
776 * Give this a unique field because c_phys eventually gets
777 * filled.
778 */
779 unsigned c_new;
780 unsigned c_unwritten;
781};
782
783static inline int ocfs2_should_zero_cluster(struct ocfs2_write_cluster_desc *d)
774{ 784{
775 int ret; 785 return d->c_new || d->c_unwritten;
776 unsigned int to, from, cluster_start, cluster_end; 786}
777 char *src, *dst;
778 struct ocfs2_splice_write_priv *sp = wc->w_private;
779 struct pipe_buffer *buf = sp->s_buf;
780 unsigned long bytes, src_from;
781 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
782 787
783 ocfs2_figure_cluster_boundaries(osb, wc->w_cpos, &cluster_start, 788struct ocfs2_write_ctxt {
784 &cluster_end); 789 /* Logical cluster position / len of write */
790 u32 w_cpos;
791 u32 w_clen;
785 792
786 from = sp->s_offset; 793 struct ocfs2_write_cluster_desc w_desc[OCFS2_MAX_CLUSTERS_PER_PAGE];
787 src_from = sp->s_buf_offset;
788 bytes = wc->w_count;
789 794
790 if (wc->w_large_pages) { 795 /*
791 /* 796 * This is true if page_size > cluster_size.
792 * For cluster size < page size, we have to 797 *
793 * calculate pos within the cluster and obey 798 * It triggers a set of special cases during write which might
794 * the rightmost boundary. 799 * have to deal with allocating writes to partial pages.
795 */ 800 */
796 bytes = min(bytes, (unsigned long)(osb->s_clustersize 801 unsigned int w_large_pages;
797 - (wc->w_pos & (osb->s_clustersize - 1)))); 802
798 } 803 /*
799 to = from + bytes; 804 * Pages involved in this write.
805 *
806 * w_target_page is the page being written to by the user.
807 *
808 * w_pages is an array of pages which always contains
809 * w_target_page, and in the case of an allocating write with
810 * page_size < cluster size, it will contain zero'd and mapped
811 * pages adjacent to w_target_page which need to be written
812 * out so that future reads from that region will get
813 * zeros.
814 */
815 struct page *w_pages[OCFS2_MAX_CTXT_PAGES];
816 unsigned int w_num_pages;
817 struct page *w_target_page;
800 818
801 BUG_ON(from > PAGE_CACHE_SIZE); 819 /*
802 BUG_ON(to > PAGE_CACHE_SIZE); 820 * ocfs2_write_end() uses this to know what the real range to
803 BUG_ON(from < cluster_start); 821 * write in the target should be.
804 BUG_ON(to > cluster_end); 822 */
823 unsigned int w_target_from;
824 unsigned int w_target_to;
805 825
806 if (wc->w_this_page_new) 826 /*
807 ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode, 827 * We could use journal_current_handle() but this is cleaner,
808 cluster_start, cluster_end, 1); 828 * IMHO -Mark
809 else 829 */
810 ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode, 830 handle_t *w_handle;
811 from, to, 0); 831
812 if (ret) { 832 struct buffer_head *w_di_bh;
813 mlog_errno(ret); 833
814 goto out; 834 struct ocfs2_cached_dealloc_ctxt w_dealloc;
835};
836
837static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc)
838{
839 int i;
840
841 for(i = 0; i < wc->w_num_pages; i++) {
842 if (wc->w_pages[i] == NULL)
843 continue;
844
845 unlock_page(wc->w_pages[i]);
846 mark_page_accessed(wc->w_pages[i]);
847 page_cache_release(wc->w_pages[i]);
815 } 848 }
816 849
817 src = buf->ops->map(sp->s_pipe, buf, 1); 850 brelse(wc->w_di_bh);
818 dst = kmap_atomic(wc->w_this_page, KM_USER1); 851 kfree(wc);
819 memcpy(dst + from, src + src_from, bytes); 852}
820 kunmap_atomic(wc->w_this_page, KM_USER1); 853
821 buf->ops->unmap(sp->s_pipe, buf, src); 854static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp,
855 struct ocfs2_super *osb, loff_t pos,
856 unsigned len, struct buffer_head *di_bh)
857{
858 struct ocfs2_write_ctxt *wc;
859
860 wc = kzalloc(sizeof(struct ocfs2_write_ctxt), GFP_NOFS);
861 if (!wc)
862 return -ENOMEM;
822 863
823 wc->w_finished_copy = 1; 864 wc->w_cpos = pos >> osb->s_clustersize_bits;
865 wc->w_clen = ocfs2_clusters_for_bytes(osb->sb, len);
866 get_bh(di_bh);
867 wc->w_di_bh = di_bh;
824 868
825 *ret_from = from; 869 if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits))
826 *ret_to = to; 870 wc->w_large_pages = 1;
827out: 871 else
872 wc->w_large_pages = 0;
873
874 ocfs2_init_dealloc_ctxt(&wc->w_dealloc);
875
876 *wcp = wc;
828 877
829 return bytes ? (unsigned int)bytes : ret; 878 return 0;
830} 879}
831 880
832/* 881/*
833 * This will copy user data from the iovec in the buffered write 882 * If a page has any new buffers, zero them out here, and mark them uptodate
834 * context. 883 * and dirty so they'll be written out (in order to prevent uninitialised
884 * block data from leaking). And clear the new bit.
835 */ 885 */
836int ocfs2_map_and_write_user_data(struct inode *inode, 886static void ocfs2_zero_new_buffers(struct page *page, unsigned from, unsigned to)
837 struct ocfs2_write_ctxt *wc, u64 *p_blkno,
838 unsigned int *ret_from, unsigned int *ret_to)
839{ 887{
840 int ret; 888 unsigned int block_start, block_end;
841 unsigned int to, from, cluster_start, cluster_end; 889 struct buffer_head *head, *bh;
842 unsigned long bytes, src_from;
843 char *dst;
844 struct ocfs2_buffered_write_priv *bp = wc->w_private;
845 const struct iovec *cur_iov = bp->b_cur_iov;
846 char __user *buf;
847 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
848 890
849 ocfs2_figure_cluster_boundaries(osb, wc->w_cpos, &cluster_start, 891 BUG_ON(!PageLocked(page));
850 &cluster_end); 892 if (!page_has_buffers(page))
893 return;
851 894
852 buf = cur_iov->iov_base + bp->b_cur_off; 895 bh = head = page_buffers(page);
853 src_from = (unsigned long)buf & ~PAGE_CACHE_MASK; 896 block_start = 0;
897 do {
898 block_end = block_start + bh->b_size;
854 899
855 from = wc->w_pos & (PAGE_CACHE_SIZE - 1); 900 if (buffer_new(bh)) {
901 if (block_end > from && block_start < to) {
902 if (!PageUptodate(page)) {
903 unsigned start, end;
856 904
857 /* 905 start = max(from, block_start);
858 * This is a lot of comparisons, but it reads quite 906 end = min(to, block_end);
859 * easily, which is important here.
860 */
861 /* Stay within the src page */
862 bytes = PAGE_SIZE - src_from;
863 /* Stay within the vector */
864 bytes = min(bytes,
865 (unsigned long)(cur_iov->iov_len - bp->b_cur_off));
866 /* Stay within count */
867 bytes = min(bytes, (unsigned long)wc->w_count);
868 /*
869 * For clustersize > page size, just stay within
870 * target page, otherwise we have to calculate pos
871 * within the cluster and obey the rightmost
872 * boundary.
873 */
874 if (wc->w_large_pages) {
875 /*
876 * For cluster size < page size, we have to
877 * calculate pos within the cluster and obey
878 * the rightmost boundary.
879 */
880 bytes = min(bytes, (unsigned long)(osb->s_clustersize
881 - (wc->w_pos & (osb->s_clustersize - 1))));
882 } else {
883 /*
884 * cluster size > page size is the most common
885 * case - we just stay within the target page
886 * boundary.
887 */
888 bytes = min(bytes, PAGE_CACHE_SIZE - from);
889 }
890 907
891 to = from + bytes; 908 zero_user_page(page, start, end - start, KM_USER0);
909 set_buffer_uptodate(bh);
910 }
892 911
893 BUG_ON(from > PAGE_CACHE_SIZE); 912 clear_buffer_new(bh);
894 BUG_ON(to > PAGE_CACHE_SIZE); 913 mark_buffer_dirty(bh);
895 BUG_ON(from < cluster_start); 914 }
896 BUG_ON(to > cluster_end); 915 }
897 916
898 if (wc->w_this_page_new) 917 block_start = block_end;
899 ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode, 918 bh = bh->b_this_page;
900 cluster_start, cluster_end, 1); 919 } while (bh != head);
901 else 920}
902 ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode,
903 from, to, 0);
904 if (ret) {
905 mlog_errno(ret);
906 goto out;
907 }
908 921
909 dst = kmap(wc->w_this_page); 922/*
910 memcpy(dst + from, bp->b_src_buf + src_from, bytes); 923 * Only called when we have a failure during allocating write to write
911 kunmap(wc->w_this_page); 924 * zero's to the newly allocated region.
925 */
926static void ocfs2_write_failure(struct inode *inode,
927 struct ocfs2_write_ctxt *wc,
928 loff_t user_pos, unsigned user_len)
929{
930 int i;
931 unsigned from, to;
932 struct page *tmppage;
912 933
913 /* 934 ocfs2_zero_new_buffers(wc->w_target_page, user_pos, user_len);
914 * XXX: This is slow, but simple. The caller of
915 * ocfs2_buffered_write_cluster() is responsible for
916 * passing through the iovecs, so it's difficult to
917 * predict what our next step is in here after our
918 * initial write. A future version should be pushing
919 * that iovec manipulation further down.
920 *
921 * By setting this, we indicate that a copy from user
922 * data was done, and subsequent calls for this
923 * cluster will skip copying more data.
924 */
925 wc->w_finished_copy = 1;
926 935
927 *ret_from = from; 936 if (wc->w_large_pages) {
928 *ret_to = to; 937 from = wc->w_target_from;
929out: 938 to = wc->w_target_to;
939 } else {
940 from = 0;
941 to = PAGE_CACHE_SIZE;
942 }
943
944 for(i = 0; i < wc->w_num_pages; i++) {
945 tmppage = wc->w_pages[i];
930 946
931 return bytes ? (unsigned int)bytes : ret; 947 if (ocfs2_should_order_data(inode))
948 walk_page_buffers(wc->w_handle, page_buffers(tmppage),
949 from, to, NULL,
950 ocfs2_journal_dirty_data);
951
952 block_commit_write(tmppage, from, to);
953 }
932} 954}
933 955
934/* 956static int ocfs2_prepare_page_for_write(struct inode *inode, u64 *p_blkno,
935 * Map, fill and write a page to disk. 957 struct ocfs2_write_ctxt *wc,
936 * 958 struct page *page, u32 cpos,
937 * The work of copying data is done via callback. Newly allocated 959 loff_t user_pos, unsigned user_len,
938 * pages which don't take user data will be zero'd (set 'new' to 960 int new)
939 * indicate an allocating write)
940 *
941 * Returns a negative error code or the number of bytes copied into
942 * the page.
943 */
944static int ocfs2_write_data_page(struct inode *inode, handle_t *handle,
945 u64 *p_blkno, struct page *page,
946 struct ocfs2_write_ctxt *wc, int new)
947{ 961{
948 int ret, copied = 0; 962 int ret;
949 unsigned int from = 0, to = 0; 963 unsigned int map_from = 0, map_to = 0;
950 unsigned int cluster_start, cluster_end; 964 unsigned int cluster_start, cluster_end;
951 unsigned int zero_from = 0, zero_to = 0; 965 unsigned int user_data_from = 0, user_data_to = 0;
952 966
953 ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), wc->w_cpos, 967 ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), cpos,
954 &cluster_start, &cluster_end); 968 &cluster_start, &cluster_end);
955 969
956 if ((wc->w_pos >> PAGE_CACHE_SHIFT) == page->index 970 if (page == wc->w_target_page) {
957 && !wc->w_finished_copy) { 971 map_from = user_pos & (PAGE_CACHE_SIZE - 1);
958 972 map_to = map_from + user_len;
959 wc->w_this_page = page; 973
960 wc->w_this_page_new = new; 974 if (new)
961 ret = wc->w_write_data_page(inode, wc, p_blkno, &from, &to); 975 ret = ocfs2_map_page_blocks(page, p_blkno, inode,
962 if (ret < 0) { 976 cluster_start, cluster_end,
977 new);
978 else
979 ret = ocfs2_map_page_blocks(page, p_blkno, inode,
980 map_from, map_to, new);
981 if (ret) {
963 mlog_errno(ret); 982 mlog_errno(ret);
964 goto out; 983 goto out;
965 } 984 }
966 985
967 copied = ret; 986 user_data_from = map_from;
968 987 user_data_to = map_to;
969 zero_from = from;
970 zero_to = to;
971 if (new) { 988 if (new) {
972 from = cluster_start; 989 map_from = cluster_start;
973 to = cluster_end; 990 map_to = cluster_end;
974 } 991 }
992
993 wc->w_target_from = map_from;
994 wc->w_target_to = map_to;
975 } else { 995 } else {
976 /* 996 /*
977 * If we haven't allocated the new page yet, we 997 * If we haven't allocated the new page yet, we
@@ -980,11 +1000,11 @@ static int ocfs2_write_data_page(struct inode *inode, handle_t *handle,
980 */ 1000 */
981 BUG_ON(!new); 1001 BUG_ON(!new);
982 1002
983 from = cluster_start; 1003 map_from = cluster_start;
984 to = cluster_end; 1004 map_to = cluster_end;
985 1005
986 ret = ocfs2_map_page_blocks(page, p_blkno, inode, 1006 ret = ocfs2_map_page_blocks(page, p_blkno, inode,
987 cluster_start, cluster_end, 1); 1007 cluster_start, cluster_end, new);
988 if (ret) { 1008 if (ret) {
989 mlog_errno(ret); 1009 mlog_errno(ret);
990 goto out; 1010 goto out;
@@ -1003,108 +1023,113 @@ static int ocfs2_write_data_page(struct inode *inode, handle_t *handle,
1003 */ 1023 */
1004 if (new && !PageUptodate(page)) 1024 if (new && !PageUptodate(page))
1005 ocfs2_clear_page_regions(page, OCFS2_SB(inode->i_sb), 1025 ocfs2_clear_page_regions(page, OCFS2_SB(inode->i_sb),
1006 wc->w_cpos, zero_from, zero_to); 1026 cpos, user_data_from, user_data_to);
1007 1027
1008 flush_dcache_page(page); 1028 flush_dcache_page(page);
1009 1029
1010 if (ocfs2_should_order_data(inode)) {
1011 ret = walk_page_buffers(handle,
1012 page_buffers(page),
1013 from, to, NULL,
1014 ocfs2_journal_dirty_data);
1015 if (ret < 0)
1016 mlog_errno(ret);
1017 }
1018
1019 /*
1020 * We don't use generic_commit_write() because we need to
1021 * handle our own i_size update.
1022 */
1023 ret = block_commit_write(page, from, to);
1024 if (ret)
1025 mlog_errno(ret);
1026out: 1030out:
1027 1031 return ret;
1028 return copied ? copied : ret;
1029} 1032}
1030 1033
1031/* 1034/*
1032 * Do the actual write of some data into an inode. Optionally allocate 1035 * This function will only grab one clusters worth of pages.
1033 * in order to fulfill the write.
1034 *
1035 * cpos is the logical cluster offset within the file to write at
1036 *
1037 * 'phys' is the physical mapping of that offset. a 'phys' value of
1038 * zero indicates that allocation is required. In this case, data_ac
1039 * and meta_ac should be valid (meta_ac can be null if metadata
1040 * allocation isn't required).
1041 */ 1036 */
1042static ssize_t ocfs2_write(struct file *file, u32 phys, handle_t *handle, 1037static int ocfs2_grab_pages_for_write(struct address_space *mapping,
1043 struct buffer_head *di_bh, 1038 struct ocfs2_write_ctxt *wc,
1044 struct ocfs2_alloc_context *data_ac, 1039 u32 cpos, loff_t user_pos, int new,
1045 struct ocfs2_alloc_context *meta_ac, 1040 struct page *mmap_page)
1046 struct ocfs2_write_ctxt *wc)
1047{ 1041{
1048 int ret, i, numpages = 1, new; 1042 int ret = 0, i;
1049 unsigned int copied = 0; 1043 unsigned long start, target_index, index;
1050 u32 tmp_pos;
1051 u64 v_blkno, p_blkno;
1052 struct address_space *mapping = file->f_mapping;
1053 struct inode *inode = mapping->host; 1044 struct inode *inode = mapping->host;
1054 unsigned long index, start;
1055 struct page **cpages;
1056 1045
1057 new = phys == 0 ? 1 : 0; 1046 target_index = user_pos >> PAGE_CACHE_SHIFT;
1058 1047
1059 /* 1048 /*
1060 * Figure out how many pages we'll be manipulating here. For 1049 * Figure out how many pages we'll be manipulating here. For
1061 * non allocating write, we just change the one 1050 * non allocating write, we just change the one
1062 * page. Otherwise, we'll need a whole clusters worth. 1051 * page. Otherwise, we'll need a whole clusters worth.
1063 */ 1052 */
1064 if (new)
1065 numpages = ocfs2_pages_per_cluster(inode->i_sb);
1066
1067 cpages = kzalloc(sizeof(*cpages) * numpages, GFP_NOFS);
1068 if (!cpages) {
1069 ret = -ENOMEM;
1070 mlog_errno(ret);
1071 return ret;
1072 }
1073
1074 /*
1075 * Fill our page array first. That way we've grabbed enough so
1076 * that we can zero and flush if we error after adding the
1077 * extent.
1078 */
1079 if (new) { 1053 if (new) {
1080 start = ocfs2_align_clusters_to_page_index(inode->i_sb, 1054 wc->w_num_pages = ocfs2_pages_per_cluster(inode->i_sb);
1081 wc->w_cpos); 1055 start = ocfs2_align_clusters_to_page_index(inode->i_sb, cpos);
1082 v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, wc->w_cpos);
1083 } else { 1056 } else {
1084 start = wc->w_pos >> PAGE_CACHE_SHIFT; 1057 wc->w_num_pages = 1;
1085 v_blkno = wc->w_pos >> inode->i_sb->s_blocksize_bits; 1058 start = target_index;
1086 } 1059 }
1087 1060
1088 for(i = 0; i < numpages; i++) { 1061 for(i = 0; i < wc->w_num_pages; i++) {
1089 index = start + i; 1062 index = start + i;
1090 1063
1091 cpages[i] = find_or_create_page(mapping, index, GFP_NOFS); 1064 if (index == target_index && mmap_page) {
1092 if (!cpages[i]) { 1065 /*
1093 ret = -ENOMEM; 1066 * ocfs2_pagemkwrite() is a little different
1094 mlog_errno(ret); 1067 * and wants us to directly use the page
1095 goto out; 1068 * passed in.
1069 */
1070 lock_page(mmap_page);
1071
1072 if (mmap_page->mapping != mapping) {
1073 unlock_page(mmap_page);
1074 /*
1075 * Sanity check - the locking in
1076 * ocfs2_pagemkwrite() should ensure
1077 * that this code doesn't trigger.
1078 */
1079 ret = -EINVAL;
1080 mlog_errno(ret);
1081 goto out;
1082 }
1083
1084 page_cache_get(mmap_page);
1085 wc->w_pages[i] = mmap_page;
1086 } else {
1087 wc->w_pages[i] = find_or_create_page(mapping, index,
1088 GFP_NOFS);
1089 if (!wc->w_pages[i]) {
1090 ret = -ENOMEM;
1091 mlog_errno(ret);
1092 goto out;
1093 }
1096 } 1094 }
1095
1096 if (index == target_index)
1097 wc->w_target_page = wc->w_pages[i];
1097 } 1098 }
1099out:
1100 return ret;
1101}
1102
1103/*
1104 * Prepare a single cluster for write one cluster into the file.
1105 */
1106static int ocfs2_write_cluster(struct address_space *mapping,
1107 u32 phys, unsigned int unwritten,
1108 struct ocfs2_alloc_context *data_ac,
1109 struct ocfs2_alloc_context *meta_ac,
1110 struct ocfs2_write_ctxt *wc, u32 cpos,
1111 loff_t user_pos, unsigned user_len)
1112{
1113 int ret, i, new, should_zero = 0;
1114 u64 v_blkno, p_blkno;
1115 struct inode *inode = mapping->host;
1116
1117 new = phys == 0 ? 1 : 0;
1118 if (new || unwritten)
1119 should_zero = 1;
1098 1120
1099 if (new) { 1121 if (new) {
1122 u32 tmp_pos;
1123
1100 /* 1124 /*
1101 * This is safe to call with the page locks - it won't take 1125 * This is safe to call with the page locks - it won't take
1102 * any additional semaphores or cluster locks. 1126 * any additional semaphores or cluster locks.
1103 */ 1127 */
1104 tmp_pos = wc->w_cpos; 1128 tmp_pos = cpos;
1105 ret = ocfs2_do_extend_allocation(OCFS2_SB(inode->i_sb), inode, 1129 ret = ocfs2_do_extend_allocation(OCFS2_SB(inode->i_sb), inode,
1106 &tmp_pos, 1, di_bh, handle, 1130 &tmp_pos, 1, 0, wc->w_di_bh,
1107 data_ac, meta_ac, NULL); 1131 wc->w_handle, data_ac,
1132 meta_ac, NULL);
1108 /* 1133 /*
1109 * This shouldn't happen because we must have already 1134 * This shouldn't happen because we must have already
1110 * calculated the correct meta data allocation required. The 1135 * calculated the correct meta data allocation required. The
@@ -1121,159 +1146,433 @@ static ssize_t ocfs2_write(struct file *file, u32 phys, handle_t *handle,
1121 mlog_errno(ret); 1146 mlog_errno(ret);
1122 goto out; 1147 goto out;
1123 } 1148 }
1149 } else if (unwritten) {
1150 ret = ocfs2_mark_extent_written(inode, wc->w_di_bh,
1151 wc->w_handle, cpos, 1, phys,
1152 meta_ac, &wc->w_dealloc);
1153 if (ret < 0) {
1154 mlog_errno(ret);
1155 goto out;
1156 }
1124 } 1157 }
1125 1158
1159 if (should_zero)
1160 v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, cpos);
1161 else
1162 v_blkno = user_pos >> inode->i_sb->s_blocksize_bits;
1163
1164 /*
1165 * The only reason this should fail is due to an inability to
1166 * find the extent added.
1167 */
1126 ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL, 1168 ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL,
1127 NULL); 1169 NULL);
1128 if (ret < 0) { 1170 if (ret < 0) {
1129 1171 ocfs2_error(inode->i_sb, "Corrupting extent for inode %llu, "
1130 /* 1172 "at logical block %llu",
1131 * XXX: Should we go readonly here? 1173 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1132 */ 1174 (unsigned long long)v_blkno);
1133
1134 mlog_errno(ret);
1135 goto out; 1175 goto out;
1136 } 1176 }
1137 1177
1138 BUG_ON(p_blkno == 0); 1178 BUG_ON(p_blkno == 0);
1139 1179
1140 for(i = 0; i < numpages; i++) { 1180 for(i = 0; i < wc->w_num_pages; i++) {
1141 ret = ocfs2_write_data_page(inode, handle, &p_blkno, cpages[i], 1181 int tmpret;
1142 wc, new); 1182
1143 if (ret < 0) { 1183 tmpret = ocfs2_prepare_page_for_write(inode, &p_blkno, wc,
1144 mlog_errno(ret); 1184 wc->w_pages[i], cpos,
1145 goto out; 1185 user_pos, user_len,
1186 should_zero);
1187 if (tmpret) {
1188 mlog_errno(tmpret);
1189 if (ret == 0)
1190 ret = tmpret;
1146 } 1191 }
1147
1148 copied += ret;
1149 } 1192 }
1150 1193
1194 /*
1195 * We only have cleanup to do in case of allocating write.
1196 */
1197 if (ret && new)
1198 ocfs2_write_failure(inode, wc, user_pos, user_len);
1199
1151out: 1200out:
1152 for(i = 0; i < numpages; i++) { 1201
1153 unlock_page(cpages[i]); 1202 return ret;
1154 mark_page_accessed(cpages[i]); 1203}
1155 page_cache_release(cpages[i]); 1204
1205static int ocfs2_write_cluster_by_desc(struct address_space *mapping,
1206 struct ocfs2_alloc_context *data_ac,
1207 struct ocfs2_alloc_context *meta_ac,
1208 struct ocfs2_write_ctxt *wc,
1209 loff_t pos, unsigned len)
1210{
1211 int ret, i;
1212 struct ocfs2_write_cluster_desc *desc;
1213
1214 for (i = 0; i < wc->w_clen; i++) {
1215 desc = &wc->w_desc[i];
1216
1217 ret = ocfs2_write_cluster(mapping, desc->c_phys,
1218 desc->c_unwritten, data_ac, meta_ac,
1219 wc, desc->c_cpos, pos, len);
1220 if (ret) {
1221 mlog_errno(ret);
1222 goto out;
1223 }
1156 } 1224 }
1157 kfree(cpages);
1158 1225
1159 return copied ? copied : ret; 1226 ret = 0;
1227out:
1228 return ret;
1160} 1229}
1161 1230
1162static void ocfs2_write_ctxt_init(struct ocfs2_write_ctxt *wc, 1231/*
1163 struct ocfs2_super *osb, loff_t pos, 1232 * ocfs2_write_end() wants to know which parts of the target page it
1164 size_t count, ocfs2_page_writer *cb, 1233 * should complete the write on. It's easiest to compute them ahead of
1165 void *cb_priv) 1234 * time when a more complete view of the write is available.
1235 */
1236static void ocfs2_set_target_boundaries(struct ocfs2_super *osb,
1237 struct ocfs2_write_ctxt *wc,
1238 loff_t pos, unsigned len, int alloc)
1166{ 1239{
1167 wc->w_count = count; 1240 struct ocfs2_write_cluster_desc *desc;
1168 wc->w_pos = pos;
1169 wc->w_cpos = wc->w_pos >> osb->s_clustersize_bits;
1170 wc->w_finished_copy = 0;
1171 1241
1172 if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) 1242 wc->w_target_from = pos & (PAGE_CACHE_SIZE - 1);
1173 wc->w_large_pages = 1; 1243 wc->w_target_to = wc->w_target_from + len;
1174 else
1175 wc->w_large_pages = 0;
1176 1244
1177 wc->w_write_data_page = cb; 1245 if (alloc == 0)
1178 wc->w_private = cb_priv; 1246 return;
1247
1248 /*
1249 * Allocating write - we may have different boundaries based
1250 * on page size and cluster size.
1251 *
1252 * NOTE: We can no longer compute one value from the other as
1253 * the actual write length and user provided length may be
1254 * different.
1255 */
1256
1257 if (wc->w_large_pages) {
1258 /*
1259 * We only care about the 1st and last cluster within
1260 * our range and whether they should be zero'd or not. Either
1261 * value may be extended out to the start/end of a
1262 * newly allocated cluster.
1263 */
1264 desc = &wc->w_desc[0];
1265 if (ocfs2_should_zero_cluster(desc))
1266 ocfs2_figure_cluster_boundaries(osb,
1267 desc->c_cpos,
1268 &wc->w_target_from,
1269 NULL);
1270
1271 desc = &wc->w_desc[wc->w_clen - 1];
1272 if (ocfs2_should_zero_cluster(desc))
1273 ocfs2_figure_cluster_boundaries(osb,
1274 desc->c_cpos,
1275 NULL,
1276 &wc->w_target_to);
1277 } else {
1278 wc->w_target_from = 0;
1279 wc->w_target_to = PAGE_CACHE_SIZE;
1280 }
1179} 1281}
1180 1282
1181/* 1283/*
1182 * Write a cluster to an inode. The cluster may not be allocated yet, 1284 * Populate each single-cluster write descriptor in the write context
1183 * in which case it will be. This only exists for buffered writes - 1285 * with information about the i/o to be done.
1184 * O_DIRECT takes a more "traditional" path through the kernel.
1185 *
1186 * The caller is responsible for incrementing pos, written counts, etc
1187 * 1286 *
1188 * For file systems that don't support sparse files, pre-allocation 1287 * Returns the number of clusters that will have to be allocated, as
1189 * and page zeroing up until cpos should be done prior to this 1288 * well as a worst case estimate of the number of extent records that
1190 * function call. 1289 * would have to be created during a write to an unwritten region.
1191 *
1192 * Callers should be holding i_sem, and the rw cluster lock.
1193 *
1194 * Returns the number of user bytes written, or less than zero for
1195 * error.
1196 */ 1290 */
1197ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos, 1291static int ocfs2_populate_write_desc(struct inode *inode,
1198 size_t count, ocfs2_page_writer *actor, 1292 struct ocfs2_write_ctxt *wc,
1199 void *priv) 1293 unsigned int *clusters_to_alloc,
1294 unsigned int *extents_to_split)
1295{
1296 int ret;
1297 struct ocfs2_write_cluster_desc *desc;
1298 unsigned int num_clusters = 0;
1299 unsigned int ext_flags = 0;
1300 u32 phys = 0;
1301 int i;
1302
1303 *clusters_to_alloc = 0;
1304 *extents_to_split = 0;
1305
1306 for (i = 0; i < wc->w_clen; i++) {
1307 desc = &wc->w_desc[i];
1308 desc->c_cpos = wc->w_cpos + i;
1309
1310 if (num_clusters == 0) {
1311 /*
1312 * Need to look up the next extent record.
1313 */
1314 ret = ocfs2_get_clusters(inode, desc->c_cpos, &phys,
1315 &num_clusters, &ext_flags);
1316 if (ret) {
1317 mlog_errno(ret);
1318 goto out;
1319 }
1320
1321 /*
1322 * Assume worst case - that we're writing in
1323 * the middle of the extent.
1324 *
1325 * We can assume that the write proceeds from
1326 * left to right, in which case the extent
1327 * insert code is smart enough to coalesce the
1328 * next splits into the previous records created.
1329 */
1330 if (ext_flags & OCFS2_EXT_UNWRITTEN)
1331 *extents_to_split = *extents_to_split + 2;
1332 } else if (phys) {
1333 /*
1334 * Only increment phys if it doesn't describe
1335 * a hole.
1336 */
1337 phys++;
1338 }
1339
1340 desc->c_phys = phys;
1341 if (phys == 0) {
1342 desc->c_new = 1;
1343 *clusters_to_alloc = *clusters_to_alloc + 1;
1344 }
1345 if (ext_flags & OCFS2_EXT_UNWRITTEN)
1346 desc->c_unwritten = 1;
1347
1348 num_clusters--;
1349 }
1350
1351 ret = 0;
1352out:
1353 return ret;
1354}
1355
1356int ocfs2_write_begin_nolock(struct address_space *mapping,
1357 loff_t pos, unsigned len, unsigned flags,
1358 struct page **pagep, void **fsdata,
1359 struct buffer_head *di_bh, struct page *mmap_page)
1200{ 1360{
1201 int ret, credits = OCFS2_INODE_UPDATE_CREDITS; 1361 int ret, credits = OCFS2_INODE_UPDATE_CREDITS;
1202 ssize_t written = 0; 1362 unsigned int clusters_to_alloc, extents_to_split;
1203 u32 phys; 1363 struct ocfs2_write_ctxt *wc;
1204 struct inode *inode = file->f_mapping->host; 1364 struct inode *inode = mapping->host;
1205 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1365 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1206 struct buffer_head *di_bh = NULL;
1207 struct ocfs2_dinode *di; 1366 struct ocfs2_dinode *di;
1208 struct ocfs2_alloc_context *data_ac = NULL; 1367 struct ocfs2_alloc_context *data_ac = NULL;
1209 struct ocfs2_alloc_context *meta_ac = NULL; 1368 struct ocfs2_alloc_context *meta_ac = NULL;
1210 handle_t *handle; 1369 handle_t *handle;
1211 struct ocfs2_write_ctxt wc;
1212
1213 ocfs2_write_ctxt_init(&wc, osb, pos, count, actor, priv);
1214 1370
1215 ret = ocfs2_meta_lock(inode, &di_bh, 1); 1371 ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh);
1216 if (ret) { 1372 if (ret) {
1217 mlog_errno(ret); 1373 mlog_errno(ret);
1218 goto out; 1374 return ret;
1219 } 1375 }
1220 di = (struct ocfs2_dinode *)di_bh->b_data;
1221
1222 /*
1223 * Take alloc sem here to prevent concurrent lookups. That way
1224 * the mapping, zeroing and tree manipulation within
1225 * ocfs2_write() will be safe against ->readpage(). This
1226 * should also serve to lock out allocation from a shared
1227 * writeable region.
1228 */
1229 down_write(&OCFS2_I(inode)->ip_alloc_sem);
1230 1376
1231 ret = ocfs2_get_clusters(inode, wc.w_cpos, &phys, NULL, NULL); 1377 ret = ocfs2_populate_write_desc(inode, wc, &clusters_to_alloc,
1378 &extents_to_split);
1232 if (ret) { 1379 if (ret) {
1233 mlog_errno(ret); 1380 mlog_errno(ret);
1234 goto out_meta; 1381 goto out;
1235 } 1382 }
1236 1383
1237 /* phys == 0 means that allocation is required. */ 1384 di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
1238 if (phys == 0) { 1385
1239 ret = ocfs2_lock_allocators(inode, di, 1, &data_ac, &meta_ac); 1386 /*
1387 * We set w_target_from, w_target_to here so that
1388 * ocfs2_write_end() knows which range in the target page to
1389 * write out. An allocation requires that we write the entire
1390 * cluster range.
1391 */
1392 if (clusters_to_alloc || extents_to_split) {
1393 /*
1394 * XXX: We are stretching the limits of
1395 * ocfs2_lock_allocators(). It greatly over-estimates
1396 * the work to be done.
1397 */
1398 ret = ocfs2_lock_allocators(inode, di, clusters_to_alloc,
1399 extents_to_split, &data_ac, &meta_ac);
1240 if (ret) { 1400 if (ret) {
1241 mlog_errno(ret); 1401 mlog_errno(ret);
1242 goto out_meta; 1402 goto out;
1243 } 1403 }
1244 1404
1245 credits = ocfs2_calc_extend_credits(inode->i_sb, di, 1); 1405 credits = ocfs2_calc_extend_credits(inode->i_sb, di,
1246 } 1406 clusters_to_alloc);
1247 1407
1248 ret = ocfs2_data_lock(inode, 1);
1249 if (ret) {
1250 mlog_errno(ret);
1251 goto out_meta;
1252 } 1408 }
1253 1409
1410 ocfs2_set_target_boundaries(osb, wc, pos, len,
1411 clusters_to_alloc + extents_to_split);
1412
1254 handle = ocfs2_start_trans(osb, credits); 1413 handle = ocfs2_start_trans(osb, credits);
1255 if (IS_ERR(handle)) { 1414 if (IS_ERR(handle)) {
1256 ret = PTR_ERR(handle); 1415 ret = PTR_ERR(handle);
1257 mlog_errno(ret); 1416 mlog_errno(ret);
1258 goto out_data; 1417 goto out;
1259 } 1418 }
1260 1419
1261 written = ocfs2_write(file, phys, handle, di_bh, data_ac, 1420 wc->w_handle = handle;
1262 meta_ac, &wc); 1421
1263 if (written < 0) { 1422 /*
1264 ret = written; 1423 * We don't want this to fail in ocfs2_write_end(), so do it
1424 * here.
1425 */
1426 ret = ocfs2_journal_access(handle, inode, wc->w_di_bh,
1427 OCFS2_JOURNAL_ACCESS_WRITE);
1428 if (ret) {
1265 mlog_errno(ret); 1429 mlog_errno(ret);
1266 goto out_commit; 1430 goto out_commit;
1267 } 1431 }
1268 1432
1269 ret = ocfs2_journal_access(handle, inode, di_bh, 1433 /*
1270 OCFS2_JOURNAL_ACCESS_WRITE); 1434 * Fill our page array first. That way we've grabbed enough so
1435 * that we can zero and flush if we error after adding the
1436 * extent.
1437 */
1438 ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos,
1439 clusters_to_alloc + extents_to_split,
1440 mmap_page);
1271 if (ret) { 1441 if (ret) {
1272 mlog_errno(ret); 1442 mlog_errno(ret);
1273 goto out_commit; 1443 goto out_commit;
1274 } 1444 }
1275 1445
1276 pos += written; 1446 ret = ocfs2_write_cluster_by_desc(mapping, data_ac, meta_ac, wc, pos,
1447 len);
1448 if (ret) {
1449 mlog_errno(ret);
1450 goto out_commit;
1451 }
1452
1453 if (data_ac)
1454 ocfs2_free_alloc_context(data_ac);
1455 if (meta_ac)
1456 ocfs2_free_alloc_context(meta_ac);
1457
1458 *pagep = wc->w_target_page;
1459 *fsdata = wc;
1460 return 0;
1461out_commit:
1462 ocfs2_commit_trans(osb, handle);
1463
1464out:
1465 ocfs2_free_write_ctxt(wc);
1466
1467 if (data_ac)
1468 ocfs2_free_alloc_context(data_ac);
1469 if (meta_ac)
1470 ocfs2_free_alloc_context(meta_ac);
1471 return ret;
1472}
1473
1474int ocfs2_write_begin(struct file *file, struct address_space *mapping,
1475 loff_t pos, unsigned len, unsigned flags,
1476 struct page **pagep, void **fsdata)
1477{
1478 int ret;
1479 struct buffer_head *di_bh = NULL;
1480 struct inode *inode = mapping->host;
1481
1482 ret = ocfs2_meta_lock(inode, &di_bh, 1);
1483 if (ret) {
1484 mlog_errno(ret);
1485 return ret;
1486 }
1487
1488 /*
1489 * Take alloc sem here to prevent concurrent lookups. That way
1490 * the mapping, zeroing and tree manipulation within
1491 * ocfs2_write() will be safe against ->readpage(). This
1492 * should also serve to lock out allocation from a shared
1493 * writeable region.
1494 */
1495 down_write(&OCFS2_I(inode)->ip_alloc_sem);
1496
1497 ret = ocfs2_data_lock(inode, 1);
1498 if (ret) {
1499 mlog_errno(ret);
1500 goto out_fail;
1501 }
1502
1503 ret = ocfs2_write_begin_nolock(mapping, pos, len, flags, pagep,
1504 fsdata, di_bh, NULL);
1505 if (ret) {
1506 mlog_errno(ret);
1507 goto out_fail_data;
1508 }
1509
1510 brelse(di_bh);
1511
1512 return 0;
1513
1514out_fail_data:
1515 ocfs2_data_unlock(inode, 1);
1516out_fail:
1517 up_write(&OCFS2_I(inode)->ip_alloc_sem);
1518
1519 brelse(di_bh);
1520 ocfs2_meta_unlock(inode, 1);
1521
1522 return ret;
1523}
1524
1525int ocfs2_write_end_nolock(struct address_space *mapping,
1526 loff_t pos, unsigned len, unsigned copied,
1527 struct page *page, void *fsdata)
1528{
1529 int i;
1530 unsigned from, to, start = pos & (PAGE_CACHE_SIZE - 1);
1531 struct inode *inode = mapping->host;
1532 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1533 struct ocfs2_write_ctxt *wc = fsdata;
1534 struct ocfs2_dinode *di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
1535 handle_t *handle = wc->w_handle;
1536 struct page *tmppage;
1537
1538 if (unlikely(copied < len)) {
1539 if (!PageUptodate(wc->w_target_page))
1540 copied = 0;
1541
1542 ocfs2_zero_new_buffers(wc->w_target_page, start+copied,
1543 start+len);
1544 }
1545 flush_dcache_page(wc->w_target_page);
1546
1547 for(i = 0; i < wc->w_num_pages; i++) {
1548 tmppage = wc->w_pages[i];
1549
1550 if (tmppage == wc->w_target_page) {
1551 from = wc->w_target_from;
1552 to = wc->w_target_to;
1553
1554 BUG_ON(from > PAGE_CACHE_SIZE ||
1555 to > PAGE_CACHE_SIZE ||
1556 to < from);
1557 } else {
1558 /*
1559 * Pages adjacent to the target (if any) imply
1560 * a hole-filling write in which case we want
1561 * to flush their entire range.
1562 */
1563 from = 0;
1564 to = PAGE_CACHE_SIZE;
1565 }
1566
1567 if (ocfs2_should_order_data(inode))
1568 walk_page_buffers(wc->w_handle, page_buffers(tmppage),
1569 from, to, NULL,
1570 ocfs2_journal_dirty_data);
1571
1572 block_commit_write(tmppage, from, to);
1573 }
1574
1575 pos += copied;
1277 if (pos > inode->i_size) { 1576 if (pos > inode->i_size) {
1278 i_size_write(inode, pos); 1577 i_size_write(inode, pos);
1279 mark_inode_dirty(inode); 1578 mark_inode_dirty(inode);
@@ -1283,29 +1582,31 @@ ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos,
1283 inode->i_mtime = inode->i_ctime = CURRENT_TIME; 1582 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
1284 di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); 1583 di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
1285 di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); 1584 di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
1585 ocfs2_journal_dirty(handle, wc->w_di_bh);
1286 1586
1287 ret = ocfs2_journal_dirty(handle, di_bh);
1288 if (ret)
1289 mlog_errno(ret);
1290
1291out_commit:
1292 ocfs2_commit_trans(osb, handle); 1587 ocfs2_commit_trans(osb, handle);
1293 1588
1294out_data: 1589 ocfs2_run_deallocs(osb, &wc->w_dealloc);
1295 ocfs2_data_unlock(inode, 1); 1590
1591 ocfs2_free_write_ctxt(wc);
1592
1593 return copied;
1594}
1595
1596int ocfs2_write_end(struct file *file, struct address_space *mapping,
1597 loff_t pos, unsigned len, unsigned copied,
1598 struct page *page, void *fsdata)
1599{
1600 int ret;
1601 struct inode *inode = mapping->host;
1296 1602
1297out_meta: 1603 ret = ocfs2_write_end_nolock(mapping, pos, len, copied, page, fsdata);
1604
1605 ocfs2_data_unlock(inode, 1);
1298 up_write(&OCFS2_I(inode)->ip_alloc_sem); 1606 up_write(&OCFS2_I(inode)->ip_alloc_sem);
1299 ocfs2_meta_unlock(inode, 1); 1607 ocfs2_meta_unlock(inode, 1);
1300 1608
1301out: 1609 return ret;
1302 brelse(di_bh);
1303 if (data_ac)
1304 ocfs2_free_alloc_context(data_ac);
1305 if (meta_ac)
1306 ocfs2_free_alloc_context(meta_ac);
1307
1308 return written ? written : ret;
1309} 1610}
1310 1611
1311const struct address_space_operations ocfs2_aops = { 1612const struct address_space_operations ocfs2_aops = {
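The ocfs2_write_begin()/ocfs2_write_end() pair above is driven in the usual begin/copy/end fashion: begin takes the cluster locks, reserves and grabs the target pages, end dirties the buffers, updates i_size and drops the locks. A minimal sketch of one chunk that stays within a single page, assuming the era's kmap_atomic()/PAGE_CACHE_SIZE helpers from <linux/highmem.h>; the wrapper name is hypothetical and error handling is pared to the minimum:

	/* Illustrative only -- not part of the patch. */
	static int example_write_chunk(struct file *file,
				       struct address_space *mapping,
				       loff_t pos, unsigned len,
				       const char *buf)
	{
		struct page *page;
		void *fsdata;
		char *kaddr;
		int ret;

		/* Cluster locks, allocation and page grabbing happen here. */
		ret = ocfs2_write_begin(file, mapping, pos, len, 0,
					&page, &fsdata);
		if (ret)
			return ret;

		/* Copy the caller's bytes into the target page returned above;
		 * this sketch assumes pos + len does not cross a page. */
		kaddr = kmap_atomic(page, KM_USER0);
		memcpy(kaddr + (pos & (PAGE_CACHE_SIZE - 1)), buf, len);
		kunmap_atomic(kaddr, KM_USER0);

		/* Dirties the buffers, bumps i_size if needed and drops the
		 * data/meta locks and ip_alloc_sem taken by write_begin. */
		return ocfs2_write_end(file, mapping, pos, len, len,
				       page, fsdata);
	}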
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index 45821d479b5a..389579bd64e3 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -42,57 +42,22 @@ int walk_page_buffers( handle_t *handle,
42 int (*fn)( handle_t *handle, 42 int (*fn)( handle_t *handle,
43 struct buffer_head *bh)); 43 struct buffer_head *bh));
44 44
45struct ocfs2_write_ctxt; 45int ocfs2_write_begin(struct file *file, struct address_space *mapping,
46typedef int (ocfs2_page_writer)(struct inode *, struct ocfs2_write_ctxt *, 46 loff_t pos, unsigned len, unsigned flags,
47 u64 *, unsigned int *, unsigned int *); 47 struct page **pagep, void **fsdata);
48 48
49ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos, 49int ocfs2_write_end(struct file *file, struct address_space *mapping,
50 size_t count, ocfs2_page_writer *actor, 50 loff_t pos, unsigned len, unsigned copied,
51 void *priv); 51 struct page *page, void *fsdata);
52 52
53struct ocfs2_write_ctxt { 53int ocfs2_write_end_nolock(struct address_space *mapping,
54 size_t w_count; 54 loff_t pos, unsigned len, unsigned copied,
55 loff_t w_pos; 55 struct page *page, void *fsdata);
56 u32 w_cpos;
57 unsigned int w_finished_copy;
58 56
59 /* This is true if page_size > cluster_size */ 57int ocfs2_write_begin_nolock(struct address_space *mapping,
60 unsigned int w_large_pages; 58 loff_t pos, unsigned len, unsigned flags,
61 59 struct page **pagep, void **fsdata,
62 /* Filler callback and private data */ 60 struct buffer_head *di_bh, struct page *mmap_page);
63 ocfs2_page_writer *w_write_data_page;
64 void *w_private;
65
66 /* Only valid for the filler callback */
67 struct page *w_this_page;
68 unsigned int w_this_page_new;
69};
70
71struct ocfs2_buffered_write_priv {
72 char *b_src_buf;
73 const struct iovec *b_cur_iov; /* Current iovec */
74 size_t b_cur_off; /* Offset in the
75 * current iovec */
76};
77int ocfs2_map_and_write_user_data(struct inode *inode,
78 struct ocfs2_write_ctxt *wc,
79 u64 *p_blkno,
80 unsigned int *ret_from,
81 unsigned int *ret_to);
82
83struct ocfs2_splice_write_priv {
84 struct splice_desc *s_sd;
85 struct pipe_buffer *s_buf;
86 struct pipe_inode_info *s_pipe;
87 /* Neither offset value is ever larger than one page */
88 unsigned int s_offset;
89 unsigned int s_buf_offset;
90};
91int ocfs2_map_and_write_splice_data(struct inode *inode,
92 struct ocfs2_write_ctxt *wc,
93 u64 *p_blkno,
94 unsigned int *ret_from,
95 unsigned int *ret_to);
96 61
97/* all ocfs2_dio_end_io()'s fault */ 62/* all ocfs2_dio_end_io()'s fault */
98#define ocfs2_iocb_is_rw_locked(iocb) \ 63#define ocfs2_iocb_is_rw_locked(iocb) \
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 979113479c66..2bd7f788cf34 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -1335,6 +1335,7 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
1335 ret = wait_event_interruptible(o2hb_steady_queue, 1335 ret = wait_event_interruptible(o2hb_steady_queue,
1336 atomic_read(&reg->hr_steady_iterations) == 0); 1336 atomic_read(&reg->hr_steady_iterations) == 0);
1337 if (ret) { 1337 if (ret) {
1338 /* We got interrupted (hello ptrace!). Clean up */
1338 spin_lock(&o2hb_live_lock); 1339 spin_lock(&o2hb_live_lock);
1339 hb_task = reg->hr_task; 1340 hb_task = reg->hr_task;
1340 reg->hr_task = NULL; 1341 reg->hr_task = NULL;
@@ -1345,7 +1346,16 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
1345 goto out; 1346 goto out;
1346 } 1347 }
1347 1348
1348 ret = count; 1349 /* Ok, we were woken. Make sure it wasn't by drop_item() */
1350 spin_lock(&o2hb_live_lock);
1351 hb_task = reg->hr_task;
1352 spin_unlock(&o2hb_live_lock);
1353
1354 if (hb_task)
1355 ret = count;
1356 else
1357 ret = -EIO;
1358
1349out: 1359out:
1350 if (filp) 1360 if (filp)
1351 fput(filp); 1361 fput(filp);
@@ -1523,6 +1533,15 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group,
1523 if (hb_task) 1533 if (hb_task)
1524 kthread_stop(hb_task); 1534 kthread_stop(hb_task);
1525 1535
1536 /*
1537 * If we're racing a dev_write(), we need to wake them. They will
1538 * check reg->hr_task
1539 */
1540 if (atomic_read(&reg->hr_steady_iterations) != 0) {
1541 atomic_set(&reg->hr_steady_iterations, 0);
1542 wake_up(&o2hb_steady_queue);
1543 }
1544
1526 config_item_put(item); 1545 config_item_put(item);
1527} 1546}
1528 1547
@@ -1665,7 +1684,67 @@ void o2hb_setup_callback(struct o2hb_callback_func *hc,
1665} 1684}
1666EXPORT_SYMBOL_GPL(o2hb_setup_callback); 1685EXPORT_SYMBOL_GPL(o2hb_setup_callback);
1667 1686
1668int o2hb_register_callback(struct o2hb_callback_func *hc) 1687static struct o2hb_region *o2hb_find_region(const char *region_uuid)
1688{
1689 struct o2hb_region *p, *reg = NULL;
1690
1691 assert_spin_locked(&o2hb_live_lock);
1692
1693 list_for_each_entry(p, &o2hb_all_regions, hr_all_item) {
1694 if (!strcmp(region_uuid, config_item_name(&p->hr_item))) {
1695 reg = p;
1696 break;
1697 }
1698 }
1699
1700 return reg;
1701}
1702
1703static int o2hb_region_get(const char *region_uuid)
1704{
1705 int ret = 0;
1706 struct o2hb_region *reg;
1707
1708 spin_lock(&o2hb_live_lock);
1709
1710 reg = o2hb_find_region(region_uuid);
1711 if (!reg)
1712 ret = -ENOENT;
1713 spin_unlock(&o2hb_live_lock);
1714
1715 if (ret)
1716 goto out;
1717
1718 ret = o2nm_depend_this_node();
1719 if (ret)
1720 goto out;
1721
1722 ret = o2nm_depend_item(&reg->hr_item);
1723 if (ret)
1724 o2nm_undepend_this_node();
1725
1726out:
1727 return ret;
1728}
1729
1730static void o2hb_region_put(const char *region_uuid)
1731{
1732 struct o2hb_region *reg;
1733
1734 spin_lock(&o2hb_live_lock);
1735
1736 reg = o2hb_find_region(region_uuid);
1737
1738 spin_unlock(&o2hb_live_lock);
1739
1740 if (reg) {
1741 o2nm_undepend_item(&reg->hr_item);
1742 o2nm_undepend_this_node();
1743 }
1744}
1745
1746int o2hb_register_callback(const char *region_uuid,
1747 struct o2hb_callback_func *hc)
1669{ 1748{
1670 struct o2hb_callback_func *tmp; 1749 struct o2hb_callback_func *tmp;
1671 struct list_head *iter; 1750 struct list_head *iter;
@@ -1681,6 +1760,12 @@ int o2hb_register_callback(struct o2hb_callback_func *hc)
1681 goto out; 1760 goto out;
1682 } 1761 }
1683 1762
1763 if (region_uuid) {
1764 ret = o2hb_region_get(region_uuid);
1765 if (ret)
1766 goto out;
1767 }
1768
1684 down_write(&o2hb_callback_sem); 1769 down_write(&o2hb_callback_sem);
1685 1770
1686 list_for_each(iter, &hbcall->list) { 1771 list_for_each(iter, &hbcall->list) {
@@ -1702,16 +1787,21 @@ out:
1702} 1787}
1703EXPORT_SYMBOL_GPL(o2hb_register_callback); 1788EXPORT_SYMBOL_GPL(o2hb_register_callback);
1704 1789
1705void o2hb_unregister_callback(struct o2hb_callback_func *hc) 1790void o2hb_unregister_callback(const char *region_uuid,
1791 struct o2hb_callback_func *hc)
1706{ 1792{
1707 BUG_ON(hc->hc_magic != O2HB_CB_MAGIC); 1793 BUG_ON(hc->hc_magic != O2HB_CB_MAGIC);
1708 1794
1709 mlog(ML_HEARTBEAT, "on behalf of %p for funcs %p\n", 1795 mlog(ML_HEARTBEAT, "on behalf of %p for funcs %p\n",
1710 __builtin_return_address(0), hc); 1796 __builtin_return_address(0), hc);
1711 1797
1798 /* XXX Can this happen _with_ a region reference? */
1712 if (list_empty(&hc->hc_item)) 1799 if (list_empty(&hc->hc_item))
1713 return; 1800 return;
1714 1801
1802 if (region_uuid)
1803 o2hb_region_put(region_uuid);
1804
1715 down_write(&o2hb_callback_sem); 1805 down_write(&o2hb_callback_sem);
1716 1806
1717 list_del_init(&hc->hc_item); 1807 list_del_init(&hc->hc_item);
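With the new uuid argument, heartbeat consumers can tie their callbacks to a specific region: registration pins the region's configfs item and the local node through the o2nm dependency helpers, and unregistration drops those pins. A hedged sketch of a region-aware consumer (callback body, priority value and names are hypothetical; the in-tree o2net and dlm callers below keep passing NULL for the old global behaviour):

	/* Illustrative only -- not part of the patch. */
	static struct o2hb_callback_func example_hb_down;

	static void example_node_down_cb(struct o2nm_node *node, int node_num,
					 void *data)
	{
		/* React to a node in this region going down. */
	}

	static int example_attach(const char *region_uuid, void *priv)
	{
		o2hb_setup_callback(&example_hb_down, O2HB_NODE_DOWN_CB,
				    example_node_down_cb, priv, 0);

		/* Pins the region item and the local node until we detach. */
		return o2hb_register_callback(region_uuid, &example_hb_down);
	}

	static void example_detach(const char *region_uuid)
	{
		/* Drops the configfs dependencies taken at registration. */
		o2hb_unregister_callback(region_uuid, &example_hb_down);
	}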
diff --git a/fs/ocfs2/cluster/heartbeat.h b/fs/ocfs2/cluster/heartbeat.h
index cc6d40b39771..35397dd5ecdb 100644
--- a/fs/ocfs2/cluster/heartbeat.h
+++ b/fs/ocfs2/cluster/heartbeat.h
@@ -69,8 +69,10 @@ void o2hb_setup_callback(struct o2hb_callback_func *hc,
69 o2hb_cb_func *func, 69 o2hb_cb_func *func,
70 void *data, 70 void *data,
71 int priority); 71 int priority);
72int o2hb_register_callback(struct o2hb_callback_func *hc); 72int o2hb_register_callback(const char *region_uuid,
73void o2hb_unregister_callback(struct o2hb_callback_func *hc); 73 struct o2hb_callback_func *hc);
74void o2hb_unregister_callback(const char *region_uuid,
75 struct o2hb_callback_func *hc);
74void o2hb_fill_node_map(unsigned long *map, 76void o2hb_fill_node_map(unsigned long *map,
75 unsigned bytes); 77 unsigned bytes);
76void o2hb_init(void); 78void o2hb_init(void);
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index 9f5ad0f01ce0..af2070da308b 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -900,6 +900,46 @@ static struct o2nm_cluster_group o2nm_cluster_group = {
900 }, 900 },
901}; 901};
902 902
903int o2nm_depend_item(struct config_item *item)
904{
905 return configfs_depend_item(&o2nm_cluster_group.cs_subsys, item);
906}
907
908void o2nm_undepend_item(struct config_item *item)
909{
910 configfs_undepend_item(&o2nm_cluster_group.cs_subsys, item);
911}
912
913int o2nm_depend_this_node(void)
914{
915 int ret = 0;
916 struct o2nm_node *local_node;
917
918 local_node = o2nm_get_node_by_num(o2nm_this_node());
919 if (!local_node) {
920 ret = -EINVAL;
921 goto out;
922 }
923
924 ret = o2nm_depend_item(&local_node->nd_item);
925 o2nm_node_put(local_node);
926
927out:
928 return ret;
929}
930
931void o2nm_undepend_this_node(void)
932{
933 struct o2nm_node *local_node;
934
935 local_node = o2nm_get_node_by_num(o2nm_this_node());
936 BUG_ON(!local_node);
937
938 o2nm_undepend_item(&local_node->nd_item);
939 o2nm_node_put(local_node);
940}
941
942
903static void __exit exit_o2nm(void) 943static void __exit exit_o2nm(void)
904{ 944{
905 if (ocfs2_table_header) 945 if (ocfs2_table_header)
@@ -934,7 +974,7 @@ static int __init init_o2nm(void)
934 goto out_sysctl; 974 goto out_sysctl;
935 975
936 config_group_init(&o2nm_cluster_group.cs_subsys.su_group); 976 config_group_init(&o2nm_cluster_group.cs_subsys.su_group);
937 init_MUTEX(&o2nm_cluster_group.cs_subsys.su_sem); 977 mutex_init(&o2nm_cluster_group.cs_subsys.su_mutex);
938 ret = configfs_register_subsystem(&o2nm_cluster_group.cs_subsys); 978 ret = configfs_register_subsystem(&o2nm_cluster_group.cs_subsys);
939 if (ret) { 979 if (ret) {
940 printk(KERN_ERR "nodemanager: Registration returned %d\n", ret); 980 printk(KERN_ERR "nodemanager: Registration returned %d\n", ret);
diff --git a/fs/ocfs2/cluster/nodemanager.h b/fs/ocfs2/cluster/nodemanager.h
index 070522138ae2..7c860361b8dd 100644
--- a/fs/ocfs2/cluster/nodemanager.h
+++ b/fs/ocfs2/cluster/nodemanager.h
@@ -77,4 +77,9 @@ struct o2nm_node *o2nm_get_node_by_ip(__be32 addr);
77void o2nm_node_get(struct o2nm_node *node); 77void o2nm_node_get(struct o2nm_node *node);
78void o2nm_node_put(struct o2nm_node *node); 78void o2nm_node_put(struct o2nm_node *node);
79 79
80int o2nm_depend_item(struct config_item *item);
81void o2nm_undepend_item(struct config_item *item);
82int o2nm_depend_this_node(void);
83void o2nm_undepend_this_node(void);
84
80#endif /* O2CLUSTER_NODEMANAGER_H */ 85#endif /* O2CLUSTER_NODEMANAGER_H */
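A short sketch of how the dependency helpers declared above are meant to be used (caller name hypothetical): the depend call pins the local node's configfs item so userspace cannot remove it mid-operation, and fails if no local node has been configured yet.

	/* Illustrative only -- not part of the patch. */
	static int example_do_pinned_work(void)
	{
		int ret;

		ret = o2nm_depend_this_node();
		if (ret)
			return ret;

		/* ... work that needs the local node to stay configured ... */

		o2nm_undepend_this_node();
		return 0;
	}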
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 0b229a9c7952..f0bdfd944c44 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -261,14 +261,12 @@ out:
261 261
262static void o2net_complete_nodes_nsw(struct o2net_node *nn) 262static void o2net_complete_nodes_nsw(struct o2net_node *nn)
263{ 263{
264 struct list_head *iter, *tmp; 264 struct o2net_status_wait *nsw, *tmp;
265 unsigned int num_kills = 0; 265 unsigned int num_kills = 0;
266 struct o2net_status_wait *nsw;
267 266
268 assert_spin_locked(&nn->nn_lock); 267 assert_spin_locked(&nn->nn_lock);
269 268
270 list_for_each_safe(iter, tmp, &nn->nn_status_list) { 269 list_for_each_entry_safe(nsw, tmp, &nn->nn_status_list, ns_node_item) {
271 nsw = list_entry(iter, struct o2net_status_wait, ns_node_item);
272 o2net_complete_nsw_locked(nn, nsw, O2NET_ERR_DIED, 0); 270 o2net_complete_nsw_locked(nn, nsw, O2NET_ERR_DIED, 0);
273 num_kills++; 271 num_kills++;
274 } 272 }
@@ -764,13 +762,10 @@ EXPORT_SYMBOL_GPL(o2net_register_handler);
764 762
765void o2net_unregister_handler_list(struct list_head *list) 763void o2net_unregister_handler_list(struct list_head *list)
766{ 764{
767 struct list_head *pos, *n; 765 struct o2net_msg_handler *nmh, *n;
768 struct o2net_msg_handler *nmh;
769 766
770 write_lock(&o2net_handler_lock); 767 write_lock(&o2net_handler_lock);
771 list_for_each_safe(pos, n, list) { 768 list_for_each_entry_safe(nmh, n, list, nh_unregister_item) {
772 nmh = list_entry(pos, struct o2net_msg_handler,
773 nh_unregister_item);
774 mlog(ML_TCP, "unregistering handler func %p type %u key %08x\n", 769 mlog(ML_TCP, "unregistering handler func %p type %u key %08x\n",
775 nmh->nh_func, nmh->nh_msg_type, nmh->nh_key); 770 nmh->nh_func, nmh->nh_msg_type, nmh->nh_key);
776 rb_erase(&nmh->nh_node, &o2net_handler_tree); 771 rb_erase(&nmh->nh_node, &o2net_handler_tree);
@@ -1638,8 +1633,8 @@ static void o2net_hb_node_up_cb(struct o2nm_node *node, int node_num,
1638 1633
1639void o2net_unregister_hb_callbacks(void) 1634void o2net_unregister_hb_callbacks(void)
1640{ 1635{
1641 o2hb_unregister_callback(&o2net_hb_up); 1636 o2hb_unregister_callback(NULL, &o2net_hb_up);
1642 o2hb_unregister_callback(&o2net_hb_down); 1637 o2hb_unregister_callback(NULL, &o2net_hb_down);
1643} 1638}
1644 1639
1645int o2net_register_hb_callbacks(void) 1640int o2net_register_hb_callbacks(void)
@@ -1651,9 +1646,9 @@ int o2net_register_hb_callbacks(void)
1651 o2hb_setup_callback(&o2net_hb_up, O2HB_NODE_UP_CB, 1646 o2hb_setup_callback(&o2net_hb_up, O2HB_NODE_UP_CB,
1652 o2net_hb_node_up_cb, NULL, O2NET_HB_PRI); 1647 o2net_hb_node_up_cb, NULL, O2NET_HB_PRI);
1653 1648
1654 ret = o2hb_register_callback(&o2net_hb_up); 1649 ret = o2hb_register_callback(NULL, &o2net_hb_up);
1655 if (ret == 0) 1650 if (ret == 0)
1656 ret = o2hb_register_callback(&o2net_hb_down); 1651 ret = o2hb_register_callback(NULL, &o2net_hb_down);
1657 1652
1658 if (ret) 1653 if (ret)
1659 o2net_unregister_hb_callbacks(); 1654 o2net_unregister_hb_callbacks();
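Most of the dlm, dlmglue and extent_map hunks below repeat the mechanical conversion shown here in tcp.c: list_for_each()/list_for_each_safe() plus an explicit list_entry() become list_for_each_entry()/list_for_each_entry_safe(), so the struct list_head cursor variables disappear and the loop body works directly on the containing structure. A generic before/after sketch with hypothetical type and function names:

	/* Illustrative only -- not part of the patch. */
	struct example_item {
		struct list_head	link;
	};

	static void example_drain(struct list_head *head)
	{
		struct example_item *item, *next;

		/* 'next' keeps iteration safe while the current entry is
		 * unlinked and freed. */
		list_for_each_entry_safe(item, next, head, link) {
			list_del(&item->link);
			kfree(item);
		}
	}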
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index c441ef1f2bad..0d5fdde959c8 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -368,7 +368,7 @@ int ocfs2_do_extend_dir(struct super_block *sb,
368 u32 offset = OCFS2_I(dir)->ip_clusters; 368 u32 offset = OCFS2_I(dir)->ip_clusters;
369 369
370 status = ocfs2_do_extend_allocation(OCFS2_SB(sb), dir, &offset, 370 status = ocfs2_do_extend_allocation(OCFS2_SB(sb), dir, &offset,
371 1, parent_fe_bh, handle, 371 1, 0, parent_fe_bh, handle,
372 data_ac, meta_ac, NULL); 372 data_ac, meta_ac, NULL);
373 BUG_ON(status == -EAGAIN); 373 BUG_ON(status == -EAGAIN);
374 if (status < 0) { 374 if (status < 0) {
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index d836b98dd99a..6954565b8ccb 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -1128,8 +1128,8 @@ bail:
1128 1128
1129static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm) 1129static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm)
1130{ 1130{
1131 o2hb_unregister_callback(&dlm->dlm_hb_up); 1131 o2hb_unregister_callback(NULL, &dlm->dlm_hb_up);
1132 o2hb_unregister_callback(&dlm->dlm_hb_down); 1132 o2hb_unregister_callback(NULL, &dlm->dlm_hb_down);
1133 o2net_unregister_handler_list(&dlm->dlm_domain_handlers); 1133 o2net_unregister_handler_list(&dlm->dlm_domain_handlers);
1134} 1134}
1135 1135
@@ -1141,13 +1141,13 @@ static int dlm_register_domain_handlers(struct dlm_ctxt *dlm)
1141 1141
1142 o2hb_setup_callback(&dlm->dlm_hb_down, O2HB_NODE_DOWN_CB, 1142 o2hb_setup_callback(&dlm->dlm_hb_down, O2HB_NODE_DOWN_CB,
1143 dlm_hb_node_down_cb, dlm, DLM_HB_NODE_DOWN_PRI); 1143 dlm_hb_node_down_cb, dlm, DLM_HB_NODE_DOWN_PRI);
1144 status = o2hb_register_callback(&dlm->dlm_hb_down); 1144 status = o2hb_register_callback(NULL, &dlm->dlm_hb_down);
1145 if (status) 1145 if (status)
1146 goto bail; 1146 goto bail;
1147 1147
1148 o2hb_setup_callback(&dlm->dlm_hb_up, O2HB_NODE_UP_CB, 1148 o2hb_setup_callback(&dlm->dlm_hb_up, O2HB_NODE_UP_CB,
1149 dlm_hb_node_up_cb, dlm, DLM_HB_NODE_UP_PRI); 1149 dlm_hb_node_up_cb, dlm, DLM_HB_NODE_UP_PRI);
1150 status = o2hb_register_callback(&dlm->dlm_hb_up); 1150 status = o2hb_register_callback(NULL, &dlm->dlm_hb_up);
1151 if (status) 1151 if (status)
1152 goto bail; 1152 goto bail;
1153 1153
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 6edffca99d98..65b2b9b92688 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -192,25 +192,20 @@ static void dlm_print_one_mle(struct dlm_master_list_entry *mle)
192static void dlm_dump_mles(struct dlm_ctxt *dlm) 192static void dlm_dump_mles(struct dlm_ctxt *dlm)
193{ 193{
194 struct dlm_master_list_entry *mle; 194 struct dlm_master_list_entry *mle;
195 struct list_head *iter;
196 195
197 mlog(ML_NOTICE, "dumping all mles for domain %s:\n", dlm->name); 196 mlog(ML_NOTICE, "dumping all mles for domain %s:\n", dlm->name);
198 spin_lock(&dlm->master_lock); 197 spin_lock(&dlm->master_lock);
199 list_for_each(iter, &dlm->master_list) { 198 list_for_each_entry(mle, &dlm->master_list, list)
200 mle = list_entry(iter, struct dlm_master_list_entry, list);
201 dlm_print_one_mle(mle); 199 dlm_print_one_mle(mle);
202 }
203 spin_unlock(&dlm->master_lock); 200 spin_unlock(&dlm->master_lock);
204} 201}
205 202
206int dlm_dump_all_mles(const char __user *data, unsigned int len) 203int dlm_dump_all_mles(const char __user *data, unsigned int len)
207{ 204{
208 struct list_head *iter;
209 struct dlm_ctxt *dlm; 205 struct dlm_ctxt *dlm;
210 206
211 spin_lock(&dlm_domain_lock); 207 spin_lock(&dlm_domain_lock);
212 list_for_each(iter, &dlm_domains) { 208 list_for_each_entry(dlm, &dlm_domains, list) {
213 dlm = list_entry (iter, struct dlm_ctxt, list);
214 mlog(ML_NOTICE, "found dlm: %p, name=%s\n", dlm, dlm->name); 209 mlog(ML_NOTICE, "found dlm: %p, name=%s\n", dlm, dlm->name);
215 dlm_dump_mles(dlm); 210 dlm_dump_mles(dlm);
216 } 211 }
@@ -454,12 +449,10 @@ static int dlm_find_mle(struct dlm_ctxt *dlm,
454 char *name, unsigned int namelen) 449 char *name, unsigned int namelen)
455{ 450{
456 struct dlm_master_list_entry *tmpmle; 451 struct dlm_master_list_entry *tmpmle;
457 struct list_head *iter;
458 452
459 assert_spin_locked(&dlm->master_lock); 453 assert_spin_locked(&dlm->master_lock);
460 454
461 list_for_each(iter, &dlm->master_list) { 455 list_for_each_entry(tmpmle, &dlm->master_list, list) {
462 tmpmle = list_entry(iter, struct dlm_master_list_entry, list);
463 if (!dlm_mle_equal(dlm, tmpmle, name, namelen)) 456 if (!dlm_mle_equal(dlm, tmpmle, name, namelen))
464 continue; 457 continue;
465 dlm_get_mle(tmpmle); 458 dlm_get_mle(tmpmle);
@@ -472,13 +465,10 @@ static int dlm_find_mle(struct dlm_ctxt *dlm,
472void dlm_hb_event_notify_attached(struct dlm_ctxt *dlm, int idx, int node_up) 465void dlm_hb_event_notify_attached(struct dlm_ctxt *dlm, int idx, int node_up)
473{ 466{
474 struct dlm_master_list_entry *mle; 467 struct dlm_master_list_entry *mle;
475 struct list_head *iter;
476 468
477 assert_spin_locked(&dlm->spinlock); 469 assert_spin_locked(&dlm->spinlock);
478 470
479 list_for_each(iter, &dlm->mle_hb_events) { 471 list_for_each_entry(mle, &dlm->mle_hb_events, hb_events) {
480 mle = list_entry(iter, struct dlm_master_list_entry,
481 hb_events);
482 if (node_up) 472 if (node_up)
483 dlm_mle_node_up(dlm, mle, NULL, idx); 473 dlm_mle_node_up(dlm, mle, NULL, idx);
484 else 474 else
@@ -2434,7 +2424,7 @@ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
2434 int ret; 2424 int ret;
2435 int i; 2425 int i;
2436 int count = 0; 2426 int count = 0;
2437 struct list_head *queue, *iter; 2427 struct list_head *queue;
2438 struct dlm_lock *lock; 2428 struct dlm_lock *lock;
2439 2429
2440 assert_spin_locked(&res->spinlock); 2430 assert_spin_locked(&res->spinlock);
@@ -2453,8 +2443,7 @@ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
2453 ret = 0; 2443 ret = 0;
2454 queue = &res->granted; 2444 queue = &res->granted;
2455 for (i = 0; i < 3; i++) { 2445 for (i = 0; i < 3; i++) {
2456 list_for_each(iter, queue) { 2446 list_for_each_entry(lock, queue, list) {
2457 lock = list_entry(iter, struct dlm_lock, list);
2458 ++count; 2447 ++count;
2459 if (lock->ml.node == dlm->node_num) { 2448 if (lock->ml.node == dlm->node_num) {
2460 mlog(0, "found a lock owned by this node still " 2449 mlog(0, "found a lock owned by this node still "
@@ -2923,18 +2912,16 @@ again:
2923static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm, 2912static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm,
2924 struct dlm_lock_resource *res) 2913 struct dlm_lock_resource *res)
2925{ 2914{
2926 struct list_head *iter, *iter2;
2927 struct list_head *queue = &res->granted; 2915 struct list_head *queue = &res->granted;
2928 int i, bit; 2916 int i, bit;
2929 struct dlm_lock *lock; 2917 struct dlm_lock *lock, *next;
2930 2918
2931 assert_spin_locked(&res->spinlock); 2919 assert_spin_locked(&res->spinlock);
2932 2920
2933 BUG_ON(res->owner == dlm->node_num); 2921 BUG_ON(res->owner == dlm->node_num);
2934 2922
2935 for (i=0; i<3; i++) { 2923 for (i=0; i<3; i++) {
2936 list_for_each_safe(iter, iter2, queue) { 2924 list_for_each_entry_safe(lock, next, queue, list) {
2937 lock = list_entry (iter, struct dlm_lock, list);
2938 if (lock->ml.node != dlm->node_num) { 2925 if (lock->ml.node != dlm->node_num) {
2939 mlog(0, "putting lock for node %u\n", 2926 mlog(0, "putting lock for node %u\n",
2940 lock->ml.node); 2927 lock->ml.node);
@@ -2976,7 +2963,6 @@ static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm,
2976{ 2963{
2977 int i; 2964 int i;
2978 struct list_head *queue = &res->granted; 2965 struct list_head *queue = &res->granted;
2979 struct list_head *iter;
2980 struct dlm_lock *lock; 2966 struct dlm_lock *lock;
2981 int nodenum; 2967 int nodenum;
2982 2968
@@ -2984,10 +2970,9 @@ static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm,
2984 2970
2985 spin_lock(&res->spinlock); 2971 spin_lock(&res->spinlock);
2986 for (i=0; i<3; i++) { 2972 for (i=0; i<3; i++) {
2987 list_for_each(iter, queue) { 2973 list_for_each_entry(lock, queue, list) {
2988 /* up to the caller to make sure this node 2974 /* up to the caller to make sure this node
2989 * is alive */ 2975 * is alive */
2990 lock = list_entry (iter, struct dlm_lock, list);
2991 if (lock->ml.node != dlm->node_num) { 2976 if (lock->ml.node != dlm->node_num) {
2992 spin_unlock(&res->spinlock); 2977 spin_unlock(&res->spinlock);
2993 return lock->ml.node; 2978 return lock->ml.node;
@@ -3234,8 +3219,7 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
3234 3219
3235void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node) 3220void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node)
3236{ 3221{
3237 struct list_head *iter, *iter2; 3222 struct dlm_master_list_entry *mle, *next;
3238 struct dlm_master_list_entry *mle;
3239 struct dlm_lock_resource *res; 3223 struct dlm_lock_resource *res;
3240 unsigned int hash; 3224 unsigned int hash;
3241 3225
@@ -3245,9 +3229,7 @@ top:
3245 3229
3246 /* clean the master list */ 3230 /* clean the master list */
3247 spin_lock(&dlm->master_lock); 3231 spin_lock(&dlm->master_lock);
3248 list_for_each_safe(iter, iter2, &dlm->master_list) { 3232 list_for_each_entry_safe(mle, next, &dlm->master_list, list) {
3249 mle = list_entry(iter, struct dlm_master_list_entry, list);
3250
3251 BUG_ON(mle->type != DLM_MLE_BLOCK && 3233 BUG_ON(mle->type != DLM_MLE_BLOCK &&
3252 mle->type != DLM_MLE_MASTER && 3234 mle->type != DLM_MLE_MASTER &&
3253 mle->type != DLM_MLE_MIGRATION); 3235 mle->type != DLM_MLE_MIGRATION);
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 671c4ed58ee2..a2c33160bfd6 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -158,8 +158,7 @@ void dlm_dispatch_work(struct work_struct *work)
158 struct dlm_ctxt *dlm = 158 struct dlm_ctxt *dlm =
159 container_of(work, struct dlm_ctxt, dispatched_work); 159 container_of(work, struct dlm_ctxt, dispatched_work);
160 LIST_HEAD(tmp_list); 160 LIST_HEAD(tmp_list);
161 struct list_head *iter, *iter2; 161 struct dlm_work_item *item, *next;
162 struct dlm_work_item *item;
163 dlm_workfunc_t *workfunc; 162 dlm_workfunc_t *workfunc;
164 int tot=0; 163 int tot=0;
165 164
@@ -167,13 +166,12 @@ void dlm_dispatch_work(struct work_struct *work)
167 list_splice_init(&dlm->work_list, &tmp_list); 166 list_splice_init(&dlm->work_list, &tmp_list);
168 spin_unlock(&dlm->work_lock); 167 spin_unlock(&dlm->work_lock);
169 168
170 list_for_each_safe(iter, iter2, &tmp_list) { 169 list_for_each_entry(item, &tmp_list, list) {
171 tot++; 170 tot++;
172 } 171 }
173 mlog(0, "%s: work thread has %d work items\n", dlm->name, tot); 172 mlog(0, "%s: work thread has %d work items\n", dlm->name, tot);
174 173
175 list_for_each_safe(iter, iter2, &tmp_list) { 174 list_for_each_entry_safe(item, next, &tmp_list, list) {
176 item = list_entry(iter, struct dlm_work_item, list);
177 workfunc = item->func; 175 workfunc = item->func;
178 list_del_init(&item->list); 176 list_del_init(&item->list);
179 177
@@ -549,7 +547,6 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
549{ 547{
550 int status = 0; 548 int status = 0;
551 struct dlm_reco_node_data *ndata; 549 struct dlm_reco_node_data *ndata;
552 struct list_head *iter;
553 int all_nodes_done; 550 int all_nodes_done;
554 int destroy = 0; 551 int destroy = 0;
555 int pass = 0; 552 int pass = 0;
@@ -567,8 +564,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
567 564
568 /* safe to access the node data list without a lock, since this 565 /* safe to access the node data list without a lock, since this
569 * process is the only one to change the list */ 566 * process is the only one to change the list */
570 list_for_each(iter, &dlm->reco.node_data) { 567 list_for_each_entry(ndata, &dlm->reco.node_data, list) {
571 ndata = list_entry (iter, struct dlm_reco_node_data, list);
572 BUG_ON(ndata->state != DLM_RECO_NODE_DATA_INIT); 568 BUG_ON(ndata->state != DLM_RECO_NODE_DATA_INIT);
573 ndata->state = DLM_RECO_NODE_DATA_REQUESTING; 569 ndata->state = DLM_RECO_NODE_DATA_REQUESTING;
574 570
@@ -655,9 +651,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
655 * done, or if anyone died */ 651 * done, or if anyone died */
656 all_nodes_done = 1; 652 all_nodes_done = 1;
657 spin_lock(&dlm_reco_state_lock); 653 spin_lock(&dlm_reco_state_lock);
658 list_for_each(iter, &dlm->reco.node_data) { 654 list_for_each_entry(ndata, &dlm->reco.node_data, list) {
659 ndata = list_entry (iter, struct dlm_reco_node_data, list);
660
661 mlog(0, "checking recovery state of node %u\n", 655 mlog(0, "checking recovery state of node %u\n",
662 ndata->node_num); 656 ndata->node_num);
663 switch (ndata->state) { 657 switch (ndata->state) {
@@ -774,16 +768,14 @@ static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node)
774 768
775static void dlm_destroy_recovery_area(struct dlm_ctxt *dlm, u8 dead_node) 769static void dlm_destroy_recovery_area(struct dlm_ctxt *dlm, u8 dead_node)
776{ 770{
777 struct list_head *iter, *iter2; 771 struct dlm_reco_node_data *ndata, *next;
778 struct dlm_reco_node_data *ndata;
779 LIST_HEAD(tmplist); 772 LIST_HEAD(tmplist);
780 773
781 spin_lock(&dlm_reco_state_lock); 774 spin_lock(&dlm_reco_state_lock);
782 list_splice_init(&dlm->reco.node_data, &tmplist); 775 list_splice_init(&dlm->reco.node_data, &tmplist);
783 spin_unlock(&dlm_reco_state_lock); 776 spin_unlock(&dlm_reco_state_lock);
784 777
785 list_for_each_safe(iter, iter2, &tmplist) { 778 list_for_each_entry_safe(ndata, next, &tmplist, list) {
786 ndata = list_entry (iter, struct dlm_reco_node_data, list);
787 list_del_init(&ndata->list); 779 list_del_init(&ndata->list);
788 kfree(ndata); 780 kfree(ndata);
789 } 781 }
@@ -876,7 +868,6 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data)
876 struct dlm_lock_resource *res; 868 struct dlm_lock_resource *res;
877 struct dlm_ctxt *dlm; 869 struct dlm_ctxt *dlm;
878 LIST_HEAD(resources); 870 LIST_HEAD(resources);
879 struct list_head *iter;
880 int ret; 871 int ret;
881 u8 dead_node, reco_master; 872 u8 dead_node, reco_master;
882 int skip_all_done = 0; 873 int skip_all_done = 0;
@@ -920,8 +911,7 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data)
920 911
921 /* any errors returned will be due to the new_master dying, 912 /* any errors returned will be due to the new_master dying,
922 * the dlm_reco_thread should detect this */ 913 * the dlm_reco_thread should detect this */
923 list_for_each(iter, &resources) { 914 list_for_each_entry(res, &resources, recovering) {
924 res = list_entry (iter, struct dlm_lock_resource, recovering);
925 ret = dlm_send_one_lockres(dlm, res, mres, reco_master, 915 ret = dlm_send_one_lockres(dlm, res, mres, reco_master,
926 DLM_MRES_RECOVERY); 916 DLM_MRES_RECOVERY);
927 if (ret < 0) { 917 if (ret < 0) {
@@ -983,7 +973,6 @@ int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data,
983{ 973{
984 struct dlm_ctxt *dlm = data; 974 struct dlm_ctxt *dlm = data;
985 struct dlm_reco_data_done *done = (struct dlm_reco_data_done *)msg->buf; 975 struct dlm_reco_data_done *done = (struct dlm_reco_data_done *)msg->buf;
986 struct list_head *iter;
987 struct dlm_reco_node_data *ndata = NULL; 976 struct dlm_reco_node_data *ndata = NULL;
988 int ret = -EINVAL; 977 int ret = -EINVAL;
989 978
@@ -1000,8 +989,7 @@ int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data,
1000 dlm->reco.dead_node, done->node_idx, dlm->node_num); 989 dlm->reco.dead_node, done->node_idx, dlm->node_num);
1001 990
1002 spin_lock(&dlm_reco_state_lock); 991 spin_lock(&dlm_reco_state_lock);
1003 list_for_each(iter, &dlm->reco.node_data) { 992 list_for_each_entry(ndata, &dlm->reco.node_data, list) {
1004 ndata = list_entry (iter, struct dlm_reco_node_data, list);
1005 if (ndata->node_num != done->node_idx) 993 if (ndata->node_num != done->node_idx)
1006 continue; 994 continue;
1007 995
@@ -1049,13 +1037,11 @@ static void dlm_move_reco_locks_to_list(struct dlm_ctxt *dlm,
1049 struct list_head *list, 1037 struct list_head *list,
1050 u8 dead_node) 1038 u8 dead_node)
1051{ 1039{
1052 struct dlm_lock_resource *res; 1040 struct dlm_lock_resource *res, *next;
1053 struct list_head *iter, *iter2;
1054 struct dlm_lock *lock; 1041 struct dlm_lock *lock;
1055 1042
1056 spin_lock(&dlm->spinlock); 1043 spin_lock(&dlm->spinlock);
1057 list_for_each_safe(iter, iter2, &dlm->reco.resources) { 1044 list_for_each_entry_safe(res, next, &dlm->reco.resources, recovering) {
1058 res = list_entry (iter, struct dlm_lock_resource, recovering);
1059 /* always prune any $RECOVERY entries for dead nodes, 1045 /* always prune any $RECOVERY entries for dead nodes,
1060 * otherwise hangs can occur during later recovery */ 1046 * otherwise hangs can occur during later recovery */
1061 if (dlm_is_recovery_lock(res->lockname.name, 1047 if (dlm_is_recovery_lock(res->lockname.name,
@@ -1169,7 +1155,7 @@ static void dlm_init_migratable_lockres(struct dlm_migratable_lockres *mres,
1169 u8 flags, u8 master) 1155 u8 flags, u8 master)
1170{ 1156{
1171 /* mres here is one full page */ 1157 /* mres here is one full page */
1172 memset(mres, 0, PAGE_SIZE); 1158 clear_page(mres);
1173 mres->lockname_len = namelen; 1159 mres->lockname_len = namelen;
1174 memcpy(mres->lockname, lockname, namelen); 1160 memcpy(mres->lockname, lockname, namelen);
1175 mres->num_locks = 0; 1161 mres->num_locks = 0;
@@ -1252,7 +1238,7 @@ int dlm_send_one_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
1252 struct dlm_migratable_lockres *mres, 1238 struct dlm_migratable_lockres *mres,
1253 u8 send_to, u8 flags) 1239 u8 send_to, u8 flags)
1254{ 1240{
1255 struct list_head *queue, *iter; 1241 struct list_head *queue;
1256 int total_locks, i; 1242 int total_locks, i;
1257 u64 mig_cookie = 0; 1243 u64 mig_cookie = 0;
1258 struct dlm_lock *lock; 1244 struct dlm_lock *lock;
@@ -1278,9 +1264,7 @@ int dlm_send_one_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
1278 total_locks = 0; 1264 total_locks = 0;
1279 for (i=DLM_GRANTED_LIST; i<=DLM_BLOCKED_LIST; i++) { 1265 for (i=DLM_GRANTED_LIST; i<=DLM_BLOCKED_LIST; i++) {
1280 queue = dlm_list_idx_to_ptr(res, i); 1266 queue = dlm_list_idx_to_ptr(res, i);
1281 list_for_each(iter, queue) { 1267 list_for_each_entry(lock, queue, list) {
1282 lock = list_entry (iter, struct dlm_lock, list);
1283
1284 /* add another lock. */ 1268 /* add another lock. */
1285 total_locks++; 1269 total_locks++;
1286 if (!dlm_add_lock_to_array(lock, mres, i)) 1270 if (!dlm_add_lock_to_array(lock, mres, i))
@@ -1717,7 +1701,6 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
1717 struct dlm_lockstatus *lksb = NULL; 1701 struct dlm_lockstatus *lksb = NULL;
1718 int ret = 0; 1702 int ret = 0;
1719 int i, j, bad; 1703 int i, j, bad;
1720 struct list_head *iter;
1721 struct dlm_lock *lock = NULL; 1704 struct dlm_lock *lock = NULL;
1722 u8 from = O2NM_MAX_NODES; 1705 u8 from = O2NM_MAX_NODES;
1723 unsigned int added = 0; 1706 unsigned int added = 0;
@@ -1755,8 +1738,7 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
1755 spin_lock(&res->spinlock); 1738 spin_lock(&res->spinlock);
1756 for (j = DLM_GRANTED_LIST; j <= DLM_BLOCKED_LIST; j++) { 1739 for (j = DLM_GRANTED_LIST; j <= DLM_BLOCKED_LIST; j++) {
1757 tmpq = dlm_list_idx_to_ptr(res, j); 1740 tmpq = dlm_list_idx_to_ptr(res, j);
1758 list_for_each(iter, tmpq) { 1741 list_for_each_entry(lock, tmpq, list) {
1759 lock = list_entry (iter, struct dlm_lock, list);
1760 if (lock->ml.cookie != ml->cookie) 1742 if (lock->ml.cookie != ml->cookie)
1761 lock = NULL; 1743 lock = NULL;
1762 else 1744 else
@@ -1930,8 +1912,8 @@ void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm,
1930 struct dlm_lock_resource *res) 1912 struct dlm_lock_resource *res)
1931{ 1913{
1932 int i; 1914 int i;
1933 struct list_head *queue, *iter, *iter2; 1915 struct list_head *queue;
1934 struct dlm_lock *lock; 1916 struct dlm_lock *lock, *next;
1935 1917
1936 res->state |= DLM_LOCK_RES_RECOVERING; 1918 res->state |= DLM_LOCK_RES_RECOVERING;
1937 if (!list_empty(&res->recovering)) { 1919 if (!list_empty(&res->recovering)) {
@@ -1947,8 +1929,7 @@ void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm,
1947 /* find any pending locks and put them back on proper list */ 1929 /* find any pending locks and put them back on proper list */
1948 for (i=DLM_BLOCKED_LIST; i>=DLM_GRANTED_LIST; i--) { 1930 for (i=DLM_BLOCKED_LIST; i>=DLM_GRANTED_LIST; i--) {
1949 queue = dlm_list_idx_to_ptr(res, i); 1931 queue = dlm_list_idx_to_ptr(res, i);
1950 list_for_each_safe(iter, iter2, queue) { 1932 list_for_each_entry_safe(lock, next, queue, list) {
1951 lock = list_entry (iter, struct dlm_lock, list);
1952 dlm_lock_get(lock); 1933 dlm_lock_get(lock);
1953 if (lock->convert_pending) { 1934 if (lock->convert_pending) {
1954 /* move converting lock back to granted */ 1935 /* move converting lock back to granted */
@@ -2013,18 +1994,15 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
2013 u8 dead_node, u8 new_master) 1994 u8 dead_node, u8 new_master)
2014{ 1995{
2015 int i; 1996 int i;
2016 struct list_head *iter, *iter2;
2017 struct hlist_node *hash_iter; 1997 struct hlist_node *hash_iter;
2018 struct hlist_head *bucket; 1998 struct hlist_head *bucket;
2019 1999 struct dlm_lock_resource *res, *next;
2020 struct dlm_lock_resource *res;
2021 2000
2022 mlog_entry_void(); 2001 mlog_entry_void();
2023 2002
2024 assert_spin_locked(&dlm->spinlock); 2003 assert_spin_locked(&dlm->spinlock);
2025 2004
2026 list_for_each_safe(iter, iter2, &dlm->reco.resources) { 2005 list_for_each_entry_safe(res, next, &dlm->reco.resources, recovering) {
2027 res = list_entry (iter, struct dlm_lock_resource, recovering);
2028 if (res->owner == dead_node) { 2006 if (res->owner == dead_node) {
2029 list_del_init(&res->recovering); 2007 list_del_init(&res->recovering);
2030 spin_lock(&res->spinlock); 2008 spin_lock(&res->spinlock);
@@ -2099,7 +2077,7 @@ static inline int dlm_lvb_needs_invalidation(struct dlm_lock *lock, int local)
2099static void dlm_revalidate_lvb(struct dlm_ctxt *dlm, 2077static void dlm_revalidate_lvb(struct dlm_ctxt *dlm,
2100 struct dlm_lock_resource *res, u8 dead_node) 2078 struct dlm_lock_resource *res, u8 dead_node)
2101{ 2079{
2102 struct list_head *iter, *queue; 2080 struct list_head *queue;
2103 struct dlm_lock *lock; 2081 struct dlm_lock *lock;
2104 int blank_lvb = 0, local = 0; 2082 int blank_lvb = 0, local = 0;
2105 int i; 2083 int i;
@@ -2121,8 +2099,7 @@ static void dlm_revalidate_lvb(struct dlm_ctxt *dlm,
2121 2099
2122 for (i=DLM_GRANTED_LIST; i<=DLM_CONVERTING_LIST; i++) { 2100 for (i=DLM_GRANTED_LIST; i<=DLM_CONVERTING_LIST; i++) {
2123 queue = dlm_list_idx_to_ptr(res, i); 2101 queue = dlm_list_idx_to_ptr(res, i);
2124 list_for_each(iter, queue) { 2102 list_for_each_entry(lock, queue, list) {
2125 lock = list_entry (iter, struct dlm_lock, list);
2126 if (lock->ml.node == search_node) { 2103 if (lock->ml.node == search_node) {
2127 if (dlm_lvb_needs_invalidation(lock, local)) { 2104 if (dlm_lvb_needs_invalidation(lock, local)) {
2128 /* zero the lksb lvb and lockres lvb */ 2105 /* zero the lksb lvb and lockres lvb */
@@ -2143,8 +2120,7 @@ static void dlm_revalidate_lvb(struct dlm_ctxt *dlm,
2143static void dlm_free_dead_locks(struct dlm_ctxt *dlm, 2120static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
2144 struct dlm_lock_resource *res, u8 dead_node) 2121 struct dlm_lock_resource *res, u8 dead_node)
2145{ 2122{
2146 struct list_head *iter, *tmpiter; 2123 struct dlm_lock *lock, *next;
2147 struct dlm_lock *lock;
2148 unsigned int freed = 0; 2124 unsigned int freed = 0;
2149 2125
2150 /* this node is the lockres master: 2126 /* this node is the lockres master:
@@ -2155,24 +2131,21 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
2155 assert_spin_locked(&res->spinlock); 2131 assert_spin_locked(&res->spinlock);
2156 2132
2157 /* TODO: check pending_asts, pending_basts here */ 2133 /* TODO: check pending_asts, pending_basts here */
2158 list_for_each_safe(iter, tmpiter, &res->granted) { 2134 list_for_each_entry_safe(lock, next, &res->granted, list) {
2159 lock = list_entry (iter, struct dlm_lock, list);
2160 if (lock->ml.node == dead_node) { 2135 if (lock->ml.node == dead_node) {
2161 list_del_init(&lock->list); 2136 list_del_init(&lock->list);
2162 dlm_lock_put(lock); 2137 dlm_lock_put(lock);
2163 freed++; 2138 freed++;
2164 } 2139 }
2165 } 2140 }
2166 list_for_each_safe(iter, tmpiter, &res->converting) { 2141 list_for_each_entry_safe(lock, next, &res->converting, list) {
2167 lock = list_entry (iter, struct dlm_lock, list);
2168 if (lock->ml.node == dead_node) { 2142 if (lock->ml.node == dead_node) {
2169 list_del_init(&lock->list); 2143 list_del_init(&lock->list);
2170 dlm_lock_put(lock); 2144 dlm_lock_put(lock);
2171 freed++; 2145 freed++;
2172 } 2146 }
2173 } 2147 }
2174 list_for_each_safe(iter, tmpiter, &res->blocked) { 2148 list_for_each_entry_safe(lock, next, &res->blocked, list) {
2175 lock = list_entry (iter, struct dlm_lock, list);
2176 if (lock->ml.node == dead_node) { 2149 if (lock->ml.node == dead_node) {
2177 list_del_init(&lock->list); 2150 list_del_init(&lock->list);
2178 dlm_lock_put(lock); 2151 dlm_lock_put(lock);
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index d1bd305ef0d7..f71250ed166f 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -600,15 +600,13 @@ static inline int ocfs2_highest_compat_lock_level(int level)
600static void lockres_set_flags(struct ocfs2_lock_res *lockres, 600static void lockres_set_flags(struct ocfs2_lock_res *lockres,
601 unsigned long newflags) 601 unsigned long newflags)
602{ 602{
603 struct list_head *pos, *tmp; 603 struct ocfs2_mask_waiter *mw, *tmp;
604 struct ocfs2_mask_waiter *mw;
605 604
606 assert_spin_locked(&lockres->l_lock); 605 assert_spin_locked(&lockres->l_lock);
607 606
608 lockres->l_flags = newflags; 607 lockres->l_flags = newflags;
609 608
610 list_for_each_safe(pos, tmp, &lockres->l_mask_waiters) { 609 list_for_each_entry_safe(mw, tmp, &lockres->l_mask_waiters, mw_item) {
611 mw = list_entry(pos, struct ocfs2_mask_waiter, mw_item);
612 if ((lockres->l_flags & mw->mw_mask) != mw->mw_goal) 610 if ((lockres->l_flags & mw->mw_mask) != mw->mw_goal)
613 continue; 611 continue;
614 612
diff --git a/fs/ocfs2/endian.h b/fs/ocfs2/endian.h
index f226b2207628..ff257628af16 100644
--- a/fs/ocfs2/endian.h
+++ b/fs/ocfs2/endian.h
@@ -32,6 +32,11 @@ static inline void le32_add_cpu(__le32 *var, u32 val)
32 *var = cpu_to_le32(le32_to_cpu(*var) + val); 32 *var = cpu_to_le32(le32_to_cpu(*var) + val);
33} 33}
34 34
35static inline void le64_add_cpu(__le64 *var, u64 val)
36{
37 *var = cpu_to_le64(le64_to_cpu(*var) + val);
38}
39
35static inline void le32_and_cpu(__le32 *var, u32 val) 40static inline void le32_and_cpu(__le32 *var, u32 val)
36{ 41{
37 *var = cpu_to_le32(le32_to_cpu(*var) & val); 42 *var = cpu_to_le32(le32_to_cpu(*var) & val);
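le64_add_cpu() mirrors the existing le32 helper: convert the little-endian field to CPU order, add, convert back and store. A trivial usage sketch (variable name hypothetical):

	__le64 counter = cpu_to_le64(100);
	le64_add_cpu(&counter, 28);	/* counter now holds cpu_to_le64(128) */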
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index ba2b2ab1c6e4..03c1d365c78b 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -109,17 +109,14 @@ static int ocfs2_extent_map_lookup(struct inode *inode, unsigned int cpos,
109 */ 109 */
110void ocfs2_extent_map_trunc(struct inode *inode, unsigned int cpos) 110void ocfs2_extent_map_trunc(struct inode *inode, unsigned int cpos)
111{ 111{
112 struct list_head *p, *n; 112 struct ocfs2_extent_map_item *emi, *n;
113 struct ocfs2_extent_map_item *emi;
114 struct ocfs2_inode_info *oi = OCFS2_I(inode); 113 struct ocfs2_inode_info *oi = OCFS2_I(inode);
115 struct ocfs2_extent_map *em = &oi->ip_extent_map; 114 struct ocfs2_extent_map *em = &oi->ip_extent_map;
116 LIST_HEAD(tmp_list); 115 LIST_HEAD(tmp_list);
117 unsigned int range; 116 unsigned int range;
118 117
119 spin_lock(&oi->ip_lock); 118 spin_lock(&oi->ip_lock);
120 list_for_each_safe(p, n, &em->em_list) { 119 list_for_each_entry_safe(emi, n, &em->em_list, ei_list) {
121 emi = list_entry(p, struct ocfs2_extent_map_item, ei_list);
122
123 if (emi->ei_cpos >= cpos) { 120 if (emi->ei_cpos >= cpos) {
124 /* Full truncate of this record. */ 121 /* Full truncate of this record. */
125 list_move(&emi->ei_list, &tmp_list); 122 list_move(&emi->ei_list, &tmp_list);
@@ -136,8 +133,7 @@ void ocfs2_extent_map_trunc(struct inode *inode, unsigned int cpos)
136 } 133 }
137 spin_unlock(&oi->ip_lock); 134 spin_unlock(&oi->ip_lock);
138 135
139 list_for_each_safe(p, n, &tmp_list) { 136 list_for_each_entry_safe(emi, n, &tmp_list, ei_list) {
140 emi = list_entry(p, struct ocfs2_extent_map_item, ei_list);
141 list_del(&emi->ei_list); 137 list_del(&emi->ei_list);
142 kfree(emi); 138 kfree(emi);
143 } 139 }
@@ -377,37 +373,6 @@ out:
377 return ret; 373 return ret;
378} 374}
379 375
380/*
381 * Return the index of the extent record which contains cluster #v_cluster.
382 * -1 is returned if it was not found.
383 *
384 * Should work fine on interior and exterior nodes.
385 */
386static int ocfs2_search_extent_list(struct ocfs2_extent_list *el,
387 u32 v_cluster)
388{
389 int ret = -1;
390 int i;
391 struct ocfs2_extent_rec *rec;
392 u32 rec_end, rec_start, clusters;
393
394 for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
395 rec = &el->l_recs[i];
396
397 rec_start = le32_to_cpu(rec->e_cpos);
398 clusters = ocfs2_rec_clusters(el, rec);
399
400 rec_end = rec_start + clusters;
401
402 if (v_cluster >= rec_start && v_cluster < rec_end) {
403 ret = i;
404 break;
405 }
406 }
407
408 return ret;
409}
410
411int ocfs2_get_clusters(struct inode *inode, u32 v_cluster, 376int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
412 u32 *p_cluster, u32 *num_clusters, 377 u32 *p_cluster, u32 *num_clusters,
413 unsigned int *extent_flags) 378 unsigned int *extent_flags)
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 4979b6675717..f04c7aa834cb 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -263,6 +263,7 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
263 int status; 263 int status;
264 handle_t *handle; 264 handle_t *handle;
265 struct ocfs2_dinode *di; 265 struct ocfs2_dinode *di;
266 u64 cluster_bytes;
266 267
267 mlog_entry_void(); 268 mlog_entry_void();
268 269
@@ -286,7 +287,9 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
286 /* 287 /*
287 * Do this before setting i_size. 288 * Do this before setting i_size.
288 */ 289 */
289 status = ocfs2_zero_tail_for_truncate(inode, handle, new_i_size); 290 cluster_bytes = ocfs2_align_bytes_to_clusters(inode->i_sb, new_i_size);
291 status = ocfs2_zero_range_for_truncate(inode, handle, new_i_size,
292 cluster_bytes);
290 if (status) { 293 if (status) {
291 mlog_errno(status); 294 mlog_errno(status);
292 goto out_commit; 295 goto out_commit;
@@ -326,9 +329,6 @@ static int ocfs2_truncate_file(struct inode *inode,
326 (unsigned long long)OCFS2_I(inode)->ip_blkno, 329 (unsigned long long)OCFS2_I(inode)->ip_blkno,
327 (unsigned long long)new_i_size); 330 (unsigned long long)new_i_size);
328 331
329 unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1);
330 truncate_inode_pages(inode->i_mapping, new_i_size);
331
332 fe = (struct ocfs2_dinode *) di_bh->b_data; 332 fe = (struct ocfs2_dinode *) di_bh->b_data;
333 if (!OCFS2_IS_VALID_DINODE(fe)) { 333 if (!OCFS2_IS_VALID_DINODE(fe)) {
334 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); 334 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
@@ -363,16 +363,23 @@ static int ocfs2_truncate_file(struct inode *inode,
363 if (new_i_size == le64_to_cpu(fe->i_size)) 363 if (new_i_size == le64_to_cpu(fe->i_size))
364 goto bail; 364 goto bail;
365 365
366 down_write(&OCFS2_I(inode)->ip_alloc_sem);
367
366 /* This forces other nodes to sync and drop their pages. Do 368 /* This forces other nodes to sync and drop their pages. Do
367 * this even if we have a truncate without allocation change - 369 * this even if we have a truncate without allocation change -
368 * ocfs2 cluster sizes can be much greater than page size, so 370 * ocfs2 cluster sizes can be much greater than page size, so
369 * we have to truncate them anyway. */ 371 * we have to truncate them anyway. */
370 status = ocfs2_data_lock(inode, 1); 372 status = ocfs2_data_lock(inode, 1);
371 if (status < 0) { 373 if (status < 0) {
374 up_write(&OCFS2_I(inode)->ip_alloc_sem);
375
372 mlog_errno(status); 376 mlog_errno(status);
373 goto bail; 377 goto bail;
374 } 378 }
375 379
380 unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1);
381 truncate_inode_pages(inode->i_mapping, new_i_size);
382
376 /* alright, we're going to need to do a full blown alloc size 383 /* alright, we're going to need to do a full blown alloc size
377 * change. Orphan the inode so that recovery can complete the 384 * change. Orphan the inode so that recovery can complete the
378 * truncate if necessary. This does the task of marking 385 * truncate if necessary. This does the task of marking
@@ -399,6 +406,8 @@ static int ocfs2_truncate_file(struct inode *inode,
399bail_unlock_data: 406bail_unlock_data:
400 ocfs2_data_unlock(inode, 1); 407 ocfs2_data_unlock(inode, 1);
401 408
409 up_write(&OCFS2_I(inode)->ip_alloc_sem);
410
402bail: 411bail:
403 412
404 mlog_exit(status); 413 mlog_exit(status);
@@ -419,6 +428,7 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
419 struct inode *inode, 428 struct inode *inode,
420 u32 *logical_offset, 429 u32 *logical_offset,
421 u32 clusters_to_add, 430 u32 clusters_to_add,
431 int mark_unwritten,
422 struct buffer_head *fe_bh, 432 struct buffer_head *fe_bh,
423 handle_t *handle, 433 handle_t *handle,
424 struct ocfs2_alloc_context *data_ac, 434 struct ocfs2_alloc_context *data_ac,
@@ -431,9 +441,13 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
431 enum ocfs2_alloc_restarted reason = RESTART_NONE; 441 enum ocfs2_alloc_restarted reason = RESTART_NONE;
432 u32 bit_off, num_bits; 442 u32 bit_off, num_bits;
433 u64 block; 443 u64 block;
444 u8 flags = 0;
434 445
435 BUG_ON(!clusters_to_add); 446 BUG_ON(!clusters_to_add);
436 447
448 if (mark_unwritten)
449 flags = OCFS2_EXT_UNWRITTEN;
450
437 free_extents = ocfs2_num_free_extents(osb, inode, fe); 451 free_extents = ocfs2_num_free_extents(osb, inode, fe);
438 if (free_extents < 0) { 452 if (free_extents < 0) {
439 status = free_extents; 453 status = free_extents;
@@ -483,7 +497,7 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
483 num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno); 497 num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
484 status = ocfs2_insert_extent(osb, handle, inode, fe_bh, 498 status = ocfs2_insert_extent(osb, handle, inode, fe_bh,
485 *logical_offset, block, num_bits, 499 *logical_offset, block, num_bits,
486 meta_ac); 500 flags, meta_ac);
487 if (status < 0) { 501 if (status < 0) {
488 mlog_errno(status); 502 mlog_errno(status);
489 goto leave; 503 goto leave;
@@ -516,25 +530,31 @@ leave:
516 * For a given allocation, determine which allocators will need to be 530 * For a given allocation, determine which allocators will need to be
517 * accessed, and lock them, reserving the appropriate number of bits. 531 * accessed, and lock them, reserving the appropriate number of bits.
518 * 532 *
519 * Called from ocfs2_extend_allocation() for file systems which don't 533 * Sparse file systems call this from ocfs2_write_begin_nolock()
520 * support holes, and from ocfs2_write() for file systems which 534 * and ocfs2_allocate_unwritten_extents().
521 * understand sparse inodes. 535 *
536 * File systems which don't support holes call this from
537 * ocfs2_extend_allocation().
522 */ 538 */
523int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di, 539int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
524 u32 clusters_to_add, 540 u32 clusters_to_add, u32 extents_to_split,
525 struct ocfs2_alloc_context **data_ac, 541 struct ocfs2_alloc_context **data_ac,
526 struct ocfs2_alloc_context **meta_ac) 542 struct ocfs2_alloc_context **meta_ac)
527{ 543{
528 int ret, num_free_extents; 544 int ret = 0, num_free_extents;
545 unsigned int max_recs_needed = clusters_to_add + 2 * extents_to_split;
529 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 546 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
530 547
531 *meta_ac = NULL; 548 *meta_ac = NULL;
532 *data_ac = NULL; 549 if (data_ac)
550 *data_ac = NULL;
551
552 BUG_ON(clusters_to_add != 0 && data_ac == NULL);
533 553
534 mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, " 554 mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, "
535 "clusters_to_add = %u\n", 555 "clusters_to_add = %u, extents_to_split = %u\n",
536 (unsigned long long)OCFS2_I(inode)->ip_blkno, i_size_read(inode), 556 (unsigned long long)OCFS2_I(inode)->ip_blkno, i_size_read(inode),
537 le32_to_cpu(di->i_clusters), clusters_to_add); 557 le32_to_cpu(di->i_clusters), clusters_to_add, extents_to_split);
538 558
539 num_free_extents = ocfs2_num_free_extents(osb, inode, di); 559 num_free_extents = ocfs2_num_free_extents(osb, inode, di);
540 if (num_free_extents < 0) { 560 if (num_free_extents < 0) {
@@ -552,9 +572,12 @@ int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
552 * 572 *
553 * Most of the time we'll only be seeing this 1 cluster at a time 573 * Most of the time we'll only be seeing this 1 cluster at a time
554 * anyway. 574 * anyway.
575 *
576 * Always lock for any unwritten extents - we might want to
577 * add blocks during a split.
555 */ 578 */
556 if (!num_free_extents || 579 if (!num_free_extents ||
557 (ocfs2_sparse_alloc(osb) && num_free_extents < clusters_to_add)) { 580 (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) {
558 ret = ocfs2_reserve_new_metadata(osb, di, meta_ac); 581 ret = ocfs2_reserve_new_metadata(osb, di, meta_ac);
559 if (ret < 0) { 582 if (ret < 0) {
560 if (ret != -ENOSPC) 583 if (ret != -ENOSPC)
@@ -563,6 +586,9 @@ int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
563 } 586 }
564 } 587 }
565 588
589 if (clusters_to_add == 0)
590 goto out;
591
566 ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac); 592 ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac);
567 if (ret < 0) { 593 if (ret < 0) {
568 if (ret != -ENOSPC) 594 if (ret != -ENOSPC)
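Illustrative arithmetic for the new reservation bound, assuming a split can at worst turn one extent record into three (unwritten head, written middle, unwritten tail), i.e. up to two extra records per split: a write that adds 4 clusters and may split 2 unwritten extents checks against max_recs_needed = 4 + 2 * 2 = 8 records, and on a sparse file system new metadata is reserved whenever the inode's free extent records fall short of that bound.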
@@ -585,14 +611,13 @@ out:
585 return ret; 611 return ret;
586} 612}
587 613
588static int ocfs2_extend_allocation(struct inode *inode, 614static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
589 u32 clusters_to_add) 615 u32 clusters_to_add, int mark_unwritten)
590{ 616{
591 int status = 0; 617 int status = 0;
592 int restart_func = 0; 618 int restart_func = 0;
593 int drop_alloc_sem = 0;
594 int credits; 619 int credits;
595 u32 prev_clusters, logical_start; 620 u32 prev_clusters;
596 struct buffer_head *bh = NULL; 621 struct buffer_head *bh = NULL;
597 struct ocfs2_dinode *fe = NULL; 622 struct ocfs2_dinode *fe = NULL;
598 handle_t *handle = NULL; 623 handle_t *handle = NULL;
@@ -607,7 +632,7 @@ static int ocfs2_extend_allocation(struct inode *inode,
607 * This function only exists for file systems which don't 632 * This function only exists for file systems which don't
608 * support holes. 633 * support holes.
609 */ 634 */
610 BUG_ON(ocfs2_sparse_alloc(osb)); 635 BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb));
611 636
612 status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh, 637 status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh,
613 OCFS2_BH_CACHED, inode); 638 OCFS2_BH_CACHED, inode);
@@ -623,19 +648,10 @@ static int ocfs2_extend_allocation(struct inode *inode,
623 goto leave; 648 goto leave;
624 } 649 }
625 650
626 logical_start = OCFS2_I(inode)->ip_clusters;
627
628restart_all: 651restart_all:
629 BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters); 652 BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);
630 653
631 /* blocks peope in read/write from reading our allocation 654 status = ocfs2_lock_allocators(inode, fe, clusters_to_add, 0, &data_ac,
632 * until we're done changing it. We depend on i_mutex to block
633 * other extend/truncate calls while we're here. Ordering wrt
634 * start_trans is important here -- always do it before! */
635 down_write(&OCFS2_I(inode)->ip_alloc_sem);
636 drop_alloc_sem = 1;
637
638 status = ocfs2_lock_allocators(inode, fe, clusters_to_add, &data_ac,
639 &meta_ac); 655 &meta_ac);
640 if (status) { 656 if (status) {
641 mlog_errno(status); 657 mlog_errno(status);
@@ -668,6 +684,7 @@ restarted_transaction:
668 inode, 684 inode,
669 &logical_start, 685 &logical_start,
670 clusters_to_add, 686 clusters_to_add,
687 mark_unwritten,
671 bh, 688 bh,
672 handle, 689 handle,
673 data_ac, 690 data_ac,
@@ -720,10 +737,6 @@ restarted_transaction:
720 OCFS2_I(inode)->ip_clusters, i_size_read(inode)); 737 OCFS2_I(inode)->ip_clusters, i_size_read(inode));
721 738
722leave: 739leave:
723 if (drop_alloc_sem) {
724 up_write(&OCFS2_I(inode)->ip_alloc_sem);
725 drop_alloc_sem = 0;
726 }
727 if (handle) { 740 if (handle) {
728 ocfs2_commit_trans(osb, handle); 741 ocfs2_commit_trans(osb, handle);
729 handle = NULL; 742 handle = NULL;
@@ -749,6 +762,25 @@ leave:
749 return status; 762 return status;
750} 763}
751 764
765static int ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
766 u32 clusters_to_add, int mark_unwritten)
767{
768 int ret;
769
770 /*
771	 * The alloc sem blocks people in read/write from reading our
772 * allocation until we're done changing it. We depend on
773 * i_mutex to block other extend/truncate calls while we're
774 * here.
775 */
776 down_write(&OCFS2_I(inode)->ip_alloc_sem);
777 ret = __ocfs2_extend_allocation(inode, logical_start, clusters_to_add,
778 mark_unwritten);
779 up_write(&OCFS2_I(inode)->ip_alloc_sem);
780
781 return ret;
782}
783
752/* Some parts of this taken from generic_cont_expand, which turned out 784/* Some parts of this taken from generic_cont_expand, which turned out
753 * to be too fragile to do exactly what we need without us having to 785 * to be too fragile to do exactly what we need without us having to
754 * worry about recursive locking in ->prepare_write() and 786 * worry about recursive locking in ->prepare_write() and
@@ -890,7 +922,9 @@ static int ocfs2_extend_file(struct inode *inode,
890 } 922 }
891 923
892 if (clusters_to_add) { 924 if (clusters_to_add) {
893 ret = ocfs2_extend_allocation(inode, clusters_to_add); 925 ret = ocfs2_extend_allocation(inode,
926 OCFS2_I(inode)->ip_clusters,
927 clusters_to_add, 0);
894 if (ret < 0) { 928 if (ret < 0) {
895 mlog_errno(ret); 929 mlog_errno(ret);
896 goto out_unlock; 930 goto out_unlock;
@@ -995,6 +1029,13 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
995 goto bail_unlock; 1029 goto bail_unlock;
996 } 1030 }
997 1031
1032 /*
1033 * This will intentionally not wind up calling vmtruncate(),
1034 * since all the work for a size change has been done above.
1035 * Otherwise, we could get into problems with truncate as
1036 * ip_alloc_sem is used there to protect against i_size
1037 * changes.
1038 */
998 status = inode_setattr(inode, attr); 1039 status = inode_setattr(inode, attr);
999 if (status < 0) { 1040 if (status < 0) {
1000 mlog_errno(status); 1041 mlog_errno(status);
@@ -1070,17 +1111,16 @@ out:
1070 return ret; 1111 return ret;
1071} 1112}
1072 1113
1073static int ocfs2_write_remove_suid(struct inode *inode) 1114static int __ocfs2_write_remove_suid(struct inode *inode,
1115 struct buffer_head *bh)
1074{ 1116{
1075 int ret; 1117 int ret;
1076 struct buffer_head *bh = NULL;
1077 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1078 handle_t *handle; 1118 handle_t *handle;
1079 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1119 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1080 struct ocfs2_dinode *di; 1120 struct ocfs2_dinode *di;
1081 1121
1082 mlog_entry("(Inode %llu, mode 0%o)\n", 1122 mlog_entry("(Inode %llu, mode 0%o)\n",
1083 (unsigned long long)oi->ip_blkno, inode->i_mode); 1123 (unsigned long long)OCFS2_I(inode)->ip_blkno, inode->i_mode);
1084 1124
1085 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 1125 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
1086 if (handle == NULL) { 1126 if (handle == NULL) {
@@ -1089,17 +1129,11 @@ static int ocfs2_write_remove_suid(struct inode *inode)
1089 goto out; 1129 goto out;
1090 } 1130 }
1091 1131
1092 ret = ocfs2_read_block(osb, oi->ip_blkno, &bh, OCFS2_BH_CACHED, inode);
1093 if (ret < 0) {
1094 mlog_errno(ret);
1095 goto out_trans;
1096 }
1097
1098 ret = ocfs2_journal_access(handle, inode, bh, 1132 ret = ocfs2_journal_access(handle, inode, bh,
1099 OCFS2_JOURNAL_ACCESS_WRITE); 1133 OCFS2_JOURNAL_ACCESS_WRITE);
1100 if (ret < 0) { 1134 if (ret < 0) {
1101 mlog_errno(ret); 1135 mlog_errno(ret);
1102 goto out_bh; 1136 goto out_trans;
1103 } 1137 }
1104 1138
1105 inode->i_mode &= ~S_ISUID; 1139 inode->i_mode &= ~S_ISUID;
@@ -1112,8 +1146,7 @@ static int ocfs2_write_remove_suid(struct inode *inode)
1112 ret = ocfs2_journal_dirty(handle, bh); 1146 ret = ocfs2_journal_dirty(handle, bh);
1113 if (ret < 0) 1147 if (ret < 0)
1114 mlog_errno(ret); 1148 mlog_errno(ret);
1115out_bh: 1149
1116 brelse(bh);
1117out_trans: 1150out_trans:
1118 ocfs2_commit_trans(osb, handle); 1151 ocfs2_commit_trans(osb, handle);
1119out: 1152out:
@@ -1159,6 +1192,460 @@ out:
1159 return ret; 1192 return ret;
1160} 1193}
1161 1194
1195static int ocfs2_write_remove_suid(struct inode *inode)
1196{
1197 int ret;
1198 struct buffer_head *bh = NULL;
1199 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1200
1201 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
1202 oi->ip_blkno, &bh, OCFS2_BH_CACHED, inode);
1203 if (ret < 0) {
1204 mlog_errno(ret);
1205 goto out;
1206 }
1207
1208 ret = __ocfs2_write_remove_suid(inode, bh);
1209out:
1210 brelse(bh);
1211 return ret;
1212}
1213
1214/*
1215 * Allocate enough extents to cover the region starting at byte offset
1216 * start for len bytes. Existing extents are skipped; any extents
1217 * added are marked as "unwritten".
1218 */
1219static int ocfs2_allocate_unwritten_extents(struct inode *inode,
1220 u64 start, u64 len)
1221{
1222 int ret;
1223 u32 cpos, phys_cpos, clusters, alloc_size;
1224
1225 /*
1226 * We consider both start and len to be inclusive.
1227 */
1228 cpos = start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
1229 clusters = ocfs2_clusters_for_bytes(inode->i_sb, start + len);
1230 clusters -= cpos;
1231
1232 while (clusters) {
1233 ret = ocfs2_get_clusters(inode, cpos, &phys_cpos,
1234 &alloc_size, NULL);
1235 if (ret) {
1236 mlog_errno(ret);
1237 goto out;
1238 }
1239
1240 /*
1241 * Hole or existing extent len can be arbitrary, so
1242 * cap it to our own allocation request.
1243 */
1244 if (alloc_size > clusters)
1245 alloc_size = clusters;
1246
1247 if (phys_cpos) {
1248 /*
1249 * We already have an allocation at this
1250 * region so we can safely skip it.
1251 */
1252 goto next;
1253 }
1254
1255 ret = __ocfs2_extend_allocation(inode, cpos, alloc_size, 1);
1256 if (ret) {
1257 if (ret != -ENOSPC)
1258 mlog_errno(ret);
1259 goto out;
1260 }
1261
1262next:
1263 cpos += alloc_size;
1264 clusters -= alloc_size;
1265 }
1266
1267 ret = 0;
1268out:
1269 return ret;
1270}
1271
1272static int __ocfs2_remove_inode_range(struct inode *inode,
1273 struct buffer_head *di_bh,
1274 u32 cpos, u32 phys_cpos, u32 len,
1275 struct ocfs2_cached_dealloc_ctxt *dealloc)
1276{
1277 int ret;
1278 u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
1279 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1280 struct inode *tl_inode = osb->osb_tl_inode;
1281 handle_t *handle;
1282 struct ocfs2_alloc_context *meta_ac = NULL;
1283 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
1284
1285 ret = ocfs2_lock_allocators(inode, di, 0, 1, NULL, &meta_ac);
1286 if (ret) {
1287 mlog_errno(ret);
1288 return ret;
1289 }
1290
1291 mutex_lock(&tl_inode->i_mutex);
1292
1293 if (ocfs2_truncate_log_needs_flush(osb)) {
1294 ret = __ocfs2_flush_truncate_log(osb);
1295 if (ret < 0) {
1296 mlog_errno(ret);
1297 goto out;
1298 }
1299 }
1300
1301 handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS);
1302 if (handle == NULL) {
1303 ret = -ENOMEM;
1304 mlog_errno(ret);
1305 goto out;
1306 }
1307
1308 ret = ocfs2_journal_access(handle, inode, di_bh,
1309 OCFS2_JOURNAL_ACCESS_WRITE);
1310 if (ret) {
1311 mlog_errno(ret);
1312 goto out;
1313 }
1314
1315 ret = ocfs2_remove_extent(inode, di_bh, cpos, len, handle, meta_ac,
1316 dealloc);
1317 if (ret) {
1318 mlog_errno(ret);
1319 goto out_commit;
1320 }
1321
1322 OCFS2_I(inode)->ip_clusters -= len;
1323 di->i_clusters = cpu_to_le32(OCFS2_I(inode)->ip_clusters);
1324
1325 ret = ocfs2_journal_dirty(handle, di_bh);
1326 if (ret) {
1327 mlog_errno(ret);
1328 goto out_commit;
1329 }
1330
1331 ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len);
1332 if (ret)
1333 mlog_errno(ret);
1334
1335out_commit:
1336 ocfs2_commit_trans(osb, handle);
1337out:
1338 mutex_unlock(&tl_inode->i_mutex);
1339
1340 if (meta_ac)
1341 ocfs2_free_alloc_context(meta_ac);
1342
1343 return ret;
1344}
1345
1346/*
1347 * Truncate a byte range, avoiding pages within partial clusters. This
1348 * preserves those pages for the zeroing code to write to.
1349 */
1350static void ocfs2_truncate_cluster_pages(struct inode *inode, u64 byte_start,
1351 u64 byte_len)
1352{
1353 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1354 loff_t start, end;
1355 struct address_space *mapping = inode->i_mapping;
1356
1357 start = (loff_t)ocfs2_align_bytes_to_clusters(inode->i_sb, byte_start);
1358 end = byte_start + byte_len;
1359 end = end & ~(osb->s_clustersize - 1);
1360
1361 if (start < end) {
1362 unmap_mapping_range(mapping, start, end - start, 0);
1363 truncate_inode_pages_range(mapping, start, end - 1);
1364 }
1365}
1366
1367static int ocfs2_zero_partial_clusters(struct inode *inode,
1368 u64 start, u64 len)
1369{
1370 int ret = 0;
1371 u64 tmpend, end = start + len;
1372 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1373 unsigned int csize = osb->s_clustersize;
1374 handle_t *handle;
1375
1376 /*
1377 * The "start" and "end" values are NOT necessarily part of
1378 * the range whose allocation is being deleted. Rather, this
1379 * is what the user passed in with the request. We must zero
1380 * partial clusters here. There's no need to worry about
1381 * physical allocation - the zeroing code knows to skip holes.
1382 */
1383 mlog(0, "byte start: %llu, end: %llu\n",
1384 (unsigned long long)start, (unsigned long long)end);
1385
1386 /*
1387 * If both edges are on a cluster boundary then there's no
1388 * zeroing required as the region is part of the allocation to
1389 * be truncated.
1390 */
1391 if ((start & (csize - 1)) == 0 && (end & (csize - 1)) == 0)
1392 goto out;
1393
1394 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
1395 if (handle == NULL) {
1396 ret = -ENOMEM;
1397 mlog_errno(ret);
1398 goto out;
1399 }
1400
1401 /*
1402 * We want to get the byte offset of the end of the 1st cluster.
1403 */
1404 tmpend = (u64)osb->s_clustersize + (start & ~(osb->s_clustersize - 1));
1405 if (tmpend > end)
1406 tmpend = end;
1407
1408 mlog(0, "1st range: start: %llu, tmpend: %llu\n",
1409 (unsigned long long)start, (unsigned long long)tmpend);
1410
1411 ret = ocfs2_zero_range_for_truncate(inode, handle, start, tmpend);
1412 if (ret)
1413 mlog_errno(ret);
1414
1415 if (tmpend < end) {
1416 /*
1417 * This may make start and end equal, but the zeroing
1418 * code will skip any work in that case so there's no
1419 * need to catch it up here.
1420 */
1421 start = end & ~(osb->s_clustersize - 1);
1422
1423 mlog(0, "2nd range: start: %llu, end: %llu\n",
1424 (unsigned long long)start, (unsigned long long)end);
1425
1426 ret = ocfs2_zero_range_for_truncate(inode, handle, start, end);
1427 if (ret)
1428 mlog_errno(ret);
1429 }
1430
1431 ocfs2_commit_trans(osb, handle);
1432out:
1433 return ret;
1434}
1435
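To make the boundary arithmetic in ocfs2_zero_partial_clusters() above concrete, here is a minimal userspace sketch (a 64KiB cluster size and the byte offsets are assumptions for illustration) that prints the one or two ranges the function would hand to ocfs2_zero_range_for_truncate():

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t csize = 64 * 1024;		/* assumed cluster size */
	uint64_t start = 100 * 1024;		/* example unaligned start */
	uint64_t len = 300 * 1024;
	uint64_t end = start + len;
	uint64_t tmpend;

	if ((start & (csize - 1)) == 0 && (end & (csize - 1)) == 0) {
		puts("both edges cluster aligned: nothing to zero");
		return 0;
	}
	/* byte offset of the end of the first (partial) cluster */
	tmpend = (start & ~(csize - 1)) + csize;
	if (tmpend > end)
		tmpend = end;
	printf("zero 1st range: [%llu, %llu)\n",
	       (unsigned long long)start, (unsigned long long)tmpend);
	if (tmpend < end)
		printf("zero 2nd range: [%llu, %llu)\n",
		       (unsigned long long)(end & ~(csize - 1)),
		       (unsigned long long)end);
	return 0;
}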
1436static int ocfs2_remove_inode_range(struct inode *inode,
1437 struct buffer_head *di_bh, u64 byte_start,
1438 u64 byte_len)
1439{
1440 int ret = 0;
1441 u32 trunc_start, trunc_len, cpos, phys_cpos, alloc_size;
1442 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1443 struct ocfs2_cached_dealloc_ctxt dealloc;
1444
1445 ocfs2_init_dealloc_ctxt(&dealloc);
1446
1447 if (byte_len == 0)
1448 return 0;
1449
1450 trunc_start = ocfs2_clusters_for_bytes(osb->sb, byte_start);
1451 trunc_len = (byte_start + byte_len) >> osb->s_clustersize_bits;
1452 if (trunc_len >= trunc_start)
1453 trunc_len -= trunc_start;
1454 else
1455 trunc_len = 0;
1456
1457 mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, clen: %u\n",
1458 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1459 (unsigned long long)byte_start,
1460 (unsigned long long)byte_len, trunc_start, trunc_len);
1461
1462 ret = ocfs2_zero_partial_clusters(inode, byte_start, byte_len);
1463 if (ret) {
1464 mlog_errno(ret);
1465 goto out;
1466 }
1467
1468 cpos = trunc_start;
1469 while (trunc_len) {
1470 ret = ocfs2_get_clusters(inode, cpos, &phys_cpos,
1471 &alloc_size, NULL);
1472 if (ret) {
1473 mlog_errno(ret);
1474 goto out;
1475 }
1476
1477 if (alloc_size > trunc_len)
1478 alloc_size = trunc_len;
1479
1480 /* Only do work for non-holes */
1481 if (phys_cpos != 0) {
1482 ret = __ocfs2_remove_inode_range(inode, di_bh, cpos,
1483 phys_cpos, alloc_size,
1484 &dealloc);
1485 if (ret) {
1486 mlog_errno(ret);
1487 goto out;
1488 }
1489 }
1490
1491 cpos += alloc_size;
1492 trunc_len -= alloc_size;
1493 }
1494
1495 ocfs2_truncate_cluster_pages(inode, byte_start, byte_len);
1496
1497out:
1498 ocfs2_schedule_truncate_log_flush(osb, 1);
1499 ocfs2_run_deallocs(osb, &dealloc);
1500
1501 return ret;
1502}
1503
1504/*
1505 * Parts of this function taken from xfs_change_file_space()
1506 */
1507int ocfs2_change_file_space(struct file *file, unsigned int cmd,
1508 struct ocfs2_space_resv *sr)
1509{
1510 int ret;
1511 s64 llen;
1512 struct inode *inode = file->f_path.dentry->d_inode;
1513 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1514 struct buffer_head *di_bh = NULL;
1515 handle_t *handle;
1516 unsigned long long max_off = ocfs2_max_file_offset(inode->i_sb->s_blocksize_bits);
1517
1518 if ((cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) &&
1519 !ocfs2_writes_unwritten_extents(osb))
1520 return -ENOTTY;
1521 else if ((cmd == OCFS2_IOC_UNRESVSP || cmd == OCFS2_IOC_UNRESVSP64) &&
1522 !ocfs2_sparse_alloc(osb))
1523 return -ENOTTY;
1524
1525 if (!S_ISREG(inode->i_mode))
1526 return -EINVAL;
1527
1528 if (!(file->f_mode & FMODE_WRITE))
1529 return -EBADF;
1530
1531 if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
1532 return -EROFS;
1533
1534 mutex_lock(&inode->i_mutex);
1535
1536 /*
1537 * This prevents concurrent writes on other nodes
1538 */
1539 ret = ocfs2_rw_lock(inode, 1);
1540 if (ret) {
1541 mlog_errno(ret);
1542 goto out;
1543 }
1544
1545 ret = ocfs2_meta_lock(inode, &di_bh, 1);
1546 if (ret) {
1547 mlog_errno(ret);
1548 goto out_rw_unlock;
1549 }
1550
1551 if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
1552 ret = -EPERM;
1553 goto out_meta_unlock;
1554 }
1555
1556 switch (sr->l_whence) {
1557 case 0: /*SEEK_SET*/
1558 break;
1559 case 1: /*SEEK_CUR*/
1560 sr->l_start += file->f_pos;
1561 break;
1562 case 2: /*SEEK_END*/
1563 sr->l_start += i_size_read(inode);
1564 break;
1565 default:
1566 ret = -EINVAL;
1567 goto out_meta_unlock;
1568 }
1569 sr->l_whence = 0;
1570
1571 llen = sr->l_len > 0 ? sr->l_len - 1 : sr->l_len;
1572
1573 if (sr->l_start < 0
1574 || sr->l_start > max_off
1575 || (sr->l_start + llen) < 0
1576 || (sr->l_start + llen) > max_off) {
1577 ret = -EINVAL;
1578 goto out_meta_unlock;
1579 }
1580
1581 if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) {
1582 if (sr->l_len <= 0) {
1583 ret = -EINVAL;
1584 goto out_meta_unlock;
1585 }
1586 }
1587
1588 if (should_remove_suid(file->f_path.dentry)) {
1589 ret = __ocfs2_write_remove_suid(inode, di_bh);
1590 if (ret) {
1591 mlog_errno(ret);
1592 goto out_meta_unlock;
1593 }
1594 }
1595
1596 down_write(&OCFS2_I(inode)->ip_alloc_sem);
1597 switch (cmd) {
1598 case OCFS2_IOC_RESVSP:
1599 case OCFS2_IOC_RESVSP64:
1600 /*
1601 * This takes unsigned offsets, but the signed ones we
1602 * pass have been checked against overflow above.
1603 */
1604 ret = ocfs2_allocate_unwritten_extents(inode, sr->l_start,
1605 sr->l_len);
1606 break;
1607 case OCFS2_IOC_UNRESVSP:
1608 case OCFS2_IOC_UNRESVSP64:
1609 ret = ocfs2_remove_inode_range(inode, di_bh, sr->l_start,
1610 sr->l_len);
1611 break;
1612 default:
1613 ret = -EINVAL;
1614 }
1615 up_write(&OCFS2_I(inode)->ip_alloc_sem);
1616 if (ret) {
1617 mlog_errno(ret);
1618 goto out_meta_unlock;
1619 }
1620
1621 /*
1622 * We update c/mtime for these changes
1623 */
1624 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
1625 if (IS_ERR(handle)) {
1626 ret = PTR_ERR(handle);
1627 mlog_errno(ret);
1628 goto out_meta_unlock;
1629 }
1630
1631 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
1632 ret = ocfs2_mark_inode_dirty(handle, inode, di_bh);
1633 if (ret < 0)
1634 mlog_errno(ret);
1635
1636 ocfs2_commit_trans(osb, handle);
1637
1638out_meta_unlock:
1639 brelse(di_bh);
1640 ocfs2_meta_unlock(inode, 1);
1641out_rw_unlock:
1642 ocfs2_rw_unlock(inode, 1);
1643
1644 mutex_unlock(&inode->i_mutex);
1645out:
1646 return ret;
1647}
1648
1162static int ocfs2_prepare_inode_for_write(struct dentry *dentry, 1649static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
1163 loff_t *ppos, 1650 loff_t *ppos,
1164 size_t count, 1651 size_t count,
@@ -1329,15 +1816,16 @@ ocfs2_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes)
1329 *basep = base; 1816 *basep = base;
1330} 1817}
1331 1818
1332static struct page * ocfs2_get_write_source(struct ocfs2_buffered_write_priv *bp, 1819static struct page * ocfs2_get_write_source(char **ret_src_buf,
1333 const struct iovec *cur_iov, 1820 const struct iovec *cur_iov,
1334 size_t iov_offset) 1821 size_t iov_offset)
1335{ 1822{
1336 int ret; 1823 int ret;
1337 char *buf; 1824 char *buf = cur_iov->iov_base + iov_offset;
1338 struct page *src_page = NULL; 1825 struct page *src_page = NULL;
1826 unsigned long off;
1339 1827
1340 buf = cur_iov->iov_base + iov_offset; 1828 off = (unsigned long)(buf) & ~PAGE_CACHE_MASK;
1341 1829
1342 if (!segment_eq(get_fs(), KERNEL_DS)) { 1830 if (!segment_eq(get_fs(), KERNEL_DS)) {
1343 /* 1831 /*
@@ -1349,18 +1837,17 @@ static struct page * ocfs2_get_write_source(struct ocfs2_buffered_write_priv *bp
1349 (unsigned long)buf & PAGE_CACHE_MASK, 1, 1837 (unsigned long)buf & PAGE_CACHE_MASK, 1,
1350 0, 0, &src_page, NULL); 1838 0, 0, &src_page, NULL);
1351 if (ret == 1) 1839 if (ret == 1)
1352 bp->b_src_buf = kmap(src_page); 1840 *ret_src_buf = kmap(src_page) + off;
1353 else 1841 else
1354 src_page = ERR_PTR(-EFAULT); 1842 src_page = ERR_PTR(-EFAULT);
1355 } else { 1843 } else {
1356 bp->b_src_buf = buf; 1844 *ret_src_buf = buf;
1357 } 1845 }
1358 1846
1359 return src_page; 1847 return src_page;
1360} 1848}
1361 1849
1362static void ocfs2_put_write_source(struct ocfs2_buffered_write_priv *bp, 1850static void ocfs2_put_write_source(struct page *page)
1363 struct page *page)
1364{ 1851{
1365 if (page) { 1852 if (page) {
1366 kunmap(page); 1853 kunmap(page);
@@ -1376,10 +1863,12 @@ static ssize_t ocfs2_file_buffered_write(struct file *file, loff_t *ppos,
1376{ 1863{
1377 int ret = 0; 1864 int ret = 0;
1378 ssize_t copied, total = 0; 1865 ssize_t copied, total = 0;
1379 size_t iov_offset = 0; 1866 size_t iov_offset = 0, bytes;
1867 loff_t pos;
1380 const struct iovec *cur_iov = iov; 1868 const struct iovec *cur_iov = iov;
1381 struct ocfs2_buffered_write_priv bp; 1869 struct page *user_page, *page;
1382 struct page *page; 1870 char *buf, *dst;
1871 void *fsdata;
1383 1872
1384 /* 1873 /*
1385 * handle partial DIO write. Adjust cur_iov if needed. 1874 * handle partial DIO write. Adjust cur_iov if needed.
@@ -1387,21 +1876,38 @@ static ssize_t ocfs2_file_buffered_write(struct file *file, loff_t *ppos,
1387 ocfs2_set_next_iovec(&cur_iov, &iov_offset, o_direct_written); 1876 ocfs2_set_next_iovec(&cur_iov, &iov_offset, o_direct_written);
1388 1877
1389 do { 1878 do {
1390 bp.b_cur_off = iov_offset; 1879 pos = *ppos;
1391 bp.b_cur_iov = cur_iov;
1392 1880
1393 page = ocfs2_get_write_source(&bp, cur_iov, iov_offset); 1881 user_page = ocfs2_get_write_source(&buf, cur_iov, iov_offset);
1394 if (IS_ERR(page)) { 1882 if (IS_ERR(user_page)) {
1395 ret = PTR_ERR(page); 1883 ret = PTR_ERR(user_page);
1396 goto out; 1884 goto out;
1397 } 1885 }
1398 1886
1399 copied = ocfs2_buffered_write_cluster(file, *ppos, count, 1887 /* Stay within our page boundaries */
1400 ocfs2_map_and_write_user_data, 1888 bytes = min((PAGE_CACHE_SIZE - ((unsigned long)pos & ~PAGE_CACHE_MASK)),
1401 &bp); 1889 (PAGE_CACHE_SIZE - ((unsigned long)buf & ~PAGE_CACHE_MASK)));
1890 /* Stay within the vector boundary */
1891 bytes = min_t(size_t, bytes, cur_iov->iov_len - iov_offset);
1892 /* Stay within count */
1893 bytes = min(bytes, count);
1894
1895 page = NULL;
1896 ret = ocfs2_write_begin(file, file->f_mapping, pos, bytes, 0,
1897 &page, &fsdata);
1898 if (ret) {
1899 mlog_errno(ret);
1900 goto out;
1901 }
1402 1902
1403 ocfs2_put_write_source(&bp, page); 1903 dst = kmap_atomic(page, KM_USER0);
1904 memcpy(dst + (pos & (PAGE_CACHE_SIZE - 1)), buf, bytes);
1905 kunmap_atomic(dst, KM_USER0);
1906 flush_dcache_page(page);
1907 ocfs2_put_write_source(user_page);
1404 1908
1909 copied = ocfs2_write_end(file, file->f_mapping, pos, bytes,
1910 bytes, page, fsdata);
1405 if (copied < 0) { 1911 if (copied < 0) {
1406 mlog_errno(copied); 1912 mlog_errno(copied);
1407 ret = copied; 1913 ret = copied;
@@ -1409,7 +1915,7 @@ static ssize_t ocfs2_file_buffered_write(struct file *file, loff_t *ppos,
1409 } 1915 }
1410 1916
1411 total += copied; 1917 total += copied;
1412 *ppos = *ppos + copied; 1918 *ppos = pos + copied;
1413 count -= copied; 1919 count -= copied;
1414 1920
1415 ocfs2_set_next_iovec(&cur_iov, &iov_offset, copied); 1921 ocfs2_set_next_iovec(&cur_iov, &iov_offset, copied);
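The rewritten loop above clamps each copy to the smallest of four limits: the rest of the destination page at *ppos, the rest of the kmapped source user page, the rest of the current iovec segment, and the remaining count. A minimal userspace sketch of that arithmetic, assuming 4KiB pages; the names are placeholders, not taken from the patch:

#include <stddef.h>
#include <stdio.h>

#define ASSUMED_PAGE_SIZE 4096UL

/* Placeholder restatement of the per-iteration clamping; user_addr stands in
 * for the numeric value of cur_iov->iov_base + iov_offset. */
static size_t clamp_write_bytes(unsigned long pos, unsigned long user_addr,
				size_t iov_remaining, size_t count)
{
	size_t bytes = ASSUMED_PAGE_SIZE - (pos & (ASSUMED_PAGE_SIZE - 1));
	size_t src = ASSUMED_PAGE_SIZE - (user_addr & (ASSUMED_PAGE_SIZE - 1));

	if (bytes > src)
		bytes = src;		/* stay within the source user page */
	if (bytes > iov_remaining)
		bytes = iov_remaining;	/* stay within the iovec segment */
	if (bytes > count)
		bytes = count;		/* stay within the overall request */
	return bytes;
}

int main(void)
{
	/* pos 6 bytes short of a page boundary dominates the other limits */
	printf("%zu\n", clamp_write_bytes(4090, 100, 512, 4096));
	return 0;
}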
@@ -1579,52 +2085,46 @@ static int ocfs2_splice_write_actor(struct pipe_inode_info *pipe,
1579 struct pipe_buffer *buf, 2085 struct pipe_buffer *buf,
1580 struct splice_desc *sd) 2086 struct splice_desc *sd)
1581{ 2087{
1582 int ret, count, total = 0; 2088 int ret, count;
1583 ssize_t copied = 0; 2089 ssize_t copied = 0;
1584 struct ocfs2_splice_write_priv sp; 2090 struct file *file = sd->u.file;
2091 unsigned int offset;
2092 struct page *page = NULL;
2093 void *fsdata;
2094 char *src, *dst;
1585 2095
1586 ret = buf->ops->confirm(pipe, buf); 2096 ret = buf->ops->confirm(pipe, buf);
1587 if (ret) 2097 if (ret)
1588 goto out; 2098 goto out;
1589 2099
1590 sp.s_sd = sd; 2100 offset = sd->pos & ~PAGE_CACHE_MASK;
1591 sp.s_buf = buf;
1592 sp.s_pipe = pipe;
1593 sp.s_offset = sd->pos & ~PAGE_CACHE_MASK;
1594 sp.s_buf_offset = buf->offset;
1595
1596 count = sd->len; 2101 count = sd->len;
1597 if (count + sp.s_offset > PAGE_CACHE_SIZE) 2102 if (count + offset > PAGE_CACHE_SIZE)
1598 count = PAGE_CACHE_SIZE - sp.s_offset; 2103 count = PAGE_CACHE_SIZE - offset;
1599 2104
1600 do { 2105 ret = ocfs2_write_begin(file, file->f_mapping, sd->pos, count, 0,
1601 /* 2106 &page, &fsdata);
1602 * splice wants us to copy up to one page at a 2107 if (ret) {
1603 * time. For pagesize > cluster size, this means we 2108 mlog_errno(ret);
1604 * might enter ocfs2_buffered_write_cluster() more 2109 goto out;
1605 * than once, so keep track of our progress here. 2110 }
1606 */
1607 copied = ocfs2_buffered_write_cluster(sd->u.file,
1608 (loff_t)sd->pos + total,
1609 count,
1610 ocfs2_map_and_write_splice_data,
1611 &sp);
1612 if (copied < 0) {
1613 mlog_errno(copied);
1614 ret = copied;
1615 goto out;
1616 }
1617 2111
1618 count -= copied; 2112 src = buf->ops->map(pipe, buf, 1);
1619 sp.s_offset += copied; 2113 dst = kmap_atomic(page, KM_USER1);
1620 sp.s_buf_offset += copied; 2114 memcpy(dst + offset, src + buf->offset, count);
1621 total += copied; 2115 	kunmap_atomic(dst, KM_USER1);
1622 } while (count); 2116 buf->ops->unmap(pipe, buf, src);
1623 2117
1624 ret = 0; 2118 copied = ocfs2_write_end(file, file->f_mapping, sd->pos, count, count,
2119 page, fsdata);
2120 if (copied < 0) {
2121 mlog_errno(copied);
2122 ret = copied;
2123 goto out;
2124 }
1625out: 2125out:
1626 2126
1627 return total ? total : ret; 2127 return copied ? copied : ret;
1628} 2128}
1629 2129
1630static ssize_t __ocfs2_file_splice_write(struct pipe_inode_info *pipe, 2130static ssize_t __ocfs2_file_splice_write(struct pipe_inode_info *pipe,
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index a4dd1fa1822b..36fe27f268ee 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -39,15 +39,16 @@ enum ocfs2_alloc_restarted {
39}; 39};
40int ocfs2_do_extend_allocation(struct ocfs2_super *osb, 40int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
41 struct inode *inode, 41 struct inode *inode,
42 u32 *cluster_start, 42 u32 *logical_offset,
43 u32 clusters_to_add, 43 u32 clusters_to_add,
44 int mark_unwritten,
44 struct buffer_head *fe_bh, 45 struct buffer_head *fe_bh,
45 handle_t *handle, 46 handle_t *handle,
46 struct ocfs2_alloc_context *data_ac, 47 struct ocfs2_alloc_context *data_ac,
47 struct ocfs2_alloc_context *meta_ac, 48 struct ocfs2_alloc_context *meta_ac,
48 enum ocfs2_alloc_restarted *reason); 49 enum ocfs2_alloc_restarted *reason_ret);
49int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di, 50int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
50 u32 clusters_to_add, 51 u32 clusters_to_add, u32 extents_to_split,
51 struct ocfs2_alloc_context **data_ac, 52 struct ocfs2_alloc_context **data_ac,
52 struct ocfs2_alloc_context **meta_ac); 53 struct ocfs2_alloc_context **meta_ac);
53int ocfs2_setattr(struct dentry *dentry, struct iattr *attr); 54int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
@@ -61,4 +62,7 @@ int ocfs2_should_update_atime(struct inode *inode,
61int ocfs2_update_inode_atime(struct inode *inode, 62int ocfs2_update_inode_atime(struct inode *inode,
62 struct buffer_head *bh); 63 struct buffer_head *bh);
63 64
65int ocfs2_change_file_space(struct file *file, unsigned int cmd,
66 struct ocfs2_space_resv *sr);
67
64#endif /* OCFS2_FILE_H */ 68#endif /* OCFS2_FILE_H */
diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c
index b25ef63781ba..352eb4a13f98 100644
--- a/fs/ocfs2/heartbeat.c
+++ b/fs/ocfs2/heartbeat.c
@@ -157,16 +157,16 @@ int ocfs2_register_hb_callbacks(struct ocfs2_super *osb)
157 if (ocfs2_mount_local(osb)) 157 if (ocfs2_mount_local(osb))
158 return 0; 158 return 0;
159 159
160 status = o2hb_register_callback(&osb->osb_hb_down); 160 status = o2hb_register_callback(osb->uuid_str, &osb->osb_hb_down);
161 if (status < 0) { 161 if (status < 0) {
162 mlog_errno(status); 162 mlog_errno(status);
163 goto bail; 163 goto bail;
164 } 164 }
165 165
166 status = o2hb_register_callback(&osb->osb_hb_up); 166 status = o2hb_register_callback(osb->uuid_str, &osb->osb_hb_up);
167 if (status < 0) { 167 if (status < 0) {
168 mlog_errno(status); 168 mlog_errno(status);
169 o2hb_unregister_callback(&osb->osb_hb_down); 169 o2hb_unregister_callback(osb->uuid_str, &osb->osb_hb_down);
170 } 170 }
171 171
172bail: 172bail:
@@ -178,8 +178,8 @@ void ocfs2_clear_hb_callbacks(struct ocfs2_super *osb)
178 if (ocfs2_mount_local(osb)) 178 if (ocfs2_mount_local(osb))
179 return; 179 return;
180 180
181 o2hb_unregister_callback(&osb->osb_hb_down); 181 o2hb_unregister_callback(osb->uuid_str, &osb->osb_hb_down);
182 o2hb_unregister_callback(&osb->osb_hb_up); 182 o2hb_unregister_callback(osb->uuid_str, &osb->osb_hb_up);
183} 183}
184 184
185void ocfs2_stop_heartbeat(struct ocfs2_super *osb) 185void ocfs2_stop_heartbeat(struct ocfs2_super *osb)
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index f3ad21ad9aed..bd68c3f2afbe 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -14,6 +14,7 @@
14#include "ocfs2.h" 14#include "ocfs2.h"
15#include "alloc.h" 15#include "alloc.h"
16#include "dlmglue.h" 16#include "dlmglue.h"
17#include "file.h"
17#include "inode.h" 18#include "inode.h"
18#include "journal.h" 19#include "journal.h"
19 20
@@ -115,6 +116,7 @@ int ocfs2_ioctl(struct inode * inode, struct file * filp,
115{ 116{
116 unsigned int flags; 117 unsigned int flags;
117 int status; 118 int status;
119 struct ocfs2_space_resv sr;
118 120
119 switch (cmd) { 121 switch (cmd) {
120 case OCFS2_IOC_GETFLAGS: 122 case OCFS2_IOC_GETFLAGS:
@@ -130,6 +132,14 @@ int ocfs2_ioctl(struct inode * inode, struct file * filp,
130 132
131 return ocfs2_set_inode_attr(inode, flags, 133 return ocfs2_set_inode_attr(inode, flags,
132 OCFS2_FL_MODIFIABLE); 134 OCFS2_FL_MODIFIABLE);
135 case OCFS2_IOC_RESVSP:
136 case OCFS2_IOC_RESVSP64:
137 case OCFS2_IOC_UNRESVSP:
138 case OCFS2_IOC_UNRESVSP64:
139 if (copy_from_user(&sr, (int __user *) arg, sizeof(sr)))
140 return -EFAULT;
141
142 return ocfs2_change_file_space(filp, cmd, &sr);
133 default: 143 default:
134 return -ENOTTY; 144 return -ENOTTY;
135 } 145 }
@@ -148,6 +158,11 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
148 case OCFS2_IOC32_SETFLAGS: 158 case OCFS2_IOC32_SETFLAGS:
149 cmd = OCFS2_IOC_SETFLAGS; 159 cmd = OCFS2_IOC_SETFLAGS;
150 break; 160 break;
161 case OCFS2_IOC_RESVSP:
162 case OCFS2_IOC_RESVSP64:
163 case OCFS2_IOC_UNRESVSP:
164 case OCFS2_IOC_UNRESVSP64:
165 break;
151 default: 166 default:
152 return -ENOIOCTLCMD; 167 return -ENOIOCTLCMD;
153 } 168 }
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index dc1188081720..dbfb20bb27ea 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -722,8 +722,7 @@ void ocfs2_complete_recovery(struct work_struct *work)
722 container_of(work, struct ocfs2_journal, j_recovery_work); 722 container_of(work, struct ocfs2_journal, j_recovery_work);
723 struct ocfs2_super *osb = journal->j_osb; 723 struct ocfs2_super *osb = journal->j_osb;
724 struct ocfs2_dinode *la_dinode, *tl_dinode; 724 struct ocfs2_dinode *la_dinode, *tl_dinode;
725 struct ocfs2_la_recovery_item *item; 725 struct ocfs2_la_recovery_item *item, *n;
726 struct list_head *p, *n;
727 LIST_HEAD(tmp_la_list); 726 LIST_HEAD(tmp_la_list);
728 727
729 mlog_entry_void(); 728 mlog_entry_void();
@@ -734,8 +733,7 @@ void ocfs2_complete_recovery(struct work_struct *work)
734 list_splice_init(&journal->j_la_cleanups, &tmp_la_list); 733 list_splice_init(&journal->j_la_cleanups, &tmp_la_list);
735 spin_unlock(&journal->j_lock); 734 spin_unlock(&journal->j_lock);
736 735
737 list_for_each_safe(p, n, &tmp_la_list) { 736 list_for_each_entry_safe(item, n, &tmp_la_list, lri_list) {
738 item = list_entry(p, struct ocfs2_la_recovery_item, lri_list);
739 list_del_init(&item->lri_list); 737 list_del_init(&item->lri_list);
740 738
741 mlog(0, "Complete recovery for slot %d\n", item->lri_slot); 739 mlog(0, "Complete recovery for slot %d\n", item->lri_slot);
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 3db5de4506da..ce60aab013aa 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -289,6 +289,8 @@ int ocfs2_journal_dirty_data(handle_t *handle,
289#define OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC (OCFS2_SUBALLOC_FREE \ 289#define OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC (OCFS2_SUBALLOC_FREE \
290 + OCFS2_TRUNCATE_LOG_UPDATE) 290 + OCFS2_TRUNCATE_LOG_UPDATE)
291 291
292#define OCFS2_REMOVE_EXTENT_CREDITS (OCFS2_TRUNCATE_LOG_UPDATE + OCFS2_INODE_UPDATE_CREDITS)
293
292/* data block for new dir/symlink, 2 for bitmap updates (bitmap fe + 294/* data block for new dir/symlink, 2 for bitmap updates (bitmap fe +
293 * bitmap block for the new bit) */ 295 * bitmap block for the new bit) */
294#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2) 296#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2)
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index af01158b39f5..d79aa12137d2 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -37,11 +37,29 @@
37 37
38#include "ocfs2.h" 38#include "ocfs2.h"
39 39
40#include "aops.h"
40#include "dlmglue.h" 41#include "dlmglue.h"
41#include "file.h" 42#include "file.h"
42#include "inode.h" 43#include "inode.h"
43#include "mmap.h" 44#include "mmap.h"
44 45
46static inline int ocfs2_vm_op_block_sigs(sigset_t *blocked, sigset_t *oldset)
47{
48 /* The best way to deal with signals in the vm path is
49 * to block them upfront, rather than allowing the
50 * locking paths to return -ERESTARTSYS. */
51 sigfillset(blocked);
52
53 /* We should technically never get a bad return value
54 * from sigprocmask */
55 return sigprocmask(SIG_BLOCK, blocked, oldset);
56}
57
58static inline int ocfs2_vm_op_unblock_sigs(sigset_t *oldset)
59{
60 return sigprocmask(SIG_SETMASK, oldset, NULL);
61}
62
45static struct page *ocfs2_nopage(struct vm_area_struct * area, 63static struct page *ocfs2_nopage(struct vm_area_struct * area,
46 unsigned long address, 64 unsigned long address,
47 int *type) 65 int *type)
@@ -53,14 +71,7 @@ static struct page *ocfs2_nopage(struct vm_area_struct * area,
53 mlog_entry("(area=%p, address=%lu, type=%p)\n", area, address, 71 mlog_entry("(area=%p, address=%lu, type=%p)\n", area, address,
54 type); 72 type);
55 73
56 /* The best way to deal with signals in this path is 74 ret = ocfs2_vm_op_block_sigs(&blocked, &oldset);
57 * to block them upfront, rather than allowing the
58 * locking paths to return -ERESTARTSYS. */
59 sigfillset(&blocked);
60
61 /* We should technically never get a bad ret return
62 * from sigprocmask */
63 ret = sigprocmask(SIG_BLOCK, &blocked, &oldset);
64 if (ret < 0) { 75 if (ret < 0) {
65 mlog_errno(ret); 76 mlog_errno(ret);
66 goto out; 77 goto out;
@@ -68,7 +79,7 @@ static struct page *ocfs2_nopage(struct vm_area_struct * area,
68 79
69 page = filemap_nopage(area, address, type); 80 page = filemap_nopage(area, address, type);
70 81
71 ret = sigprocmask(SIG_SETMASK, &oldset, NULL); 82 ret = ocfs2_vm_op_unblock_sigs(&oldset);
72 if (ret < 0) 83 if (ret < 0)
73 mlog_errno(ret); 84 mlog_errno(ret);
74out: 85out:
@@ -76,28 +87,136 @@ out:
76 return page; 87 return page;
77} 88}
78 89
79static struct vm_operations_struct ocfs2_file_vm_ops = { 90static int __ocfs2_page_mkwrite(struct inode *inode, struct buffer_head *di_bh,
80 .nopage = ocfs2_nopage, 91 struct page *page)
81}; 92{
93 int ret;
94 struct address_space *mapping = inode->i_mapping;
95 loff_t pos = page->index << PAGE_CACHE_SHIFT;
96 unsigned int len = PAGE_CACHE_SIZE;
97 pgoff_t last_index;
98 struct page *locked_page = NULL;
99 void *fsdata;
100 loff_t size = i_size_read(inode);
82 101
83int ocfs2_mmap(struct file *file, struct vm_area_struct *vma) 102 /*
103 * Another node might have truncated while we were waiting on
104 * cluster locks.
105 */
106 last_index = size >> PAGE_CACHE_SHIFT;
107 if (page->index > last_index) {
108 ret = -EINVAL;
109 goto out;
110 }
111
112 /*
113 * The i_size check above doesn't catch the case where nodes
114 * truncated and then re-extended the file. We'll re-check the
115 * page mapping after taking the page lock inside of
116 * ocfs2_write_begin_nolock().
117 */
118 if (!PageUptodate(page) || page->mapping != inode->i_mapping) {
119 ret = -EINVAL;
120 goto out;
121 }
122
123 /*
124 * Call ocfs2_write_begin() and ocfs2_write_end() to take
125 * advantage of the allocation code there. We pass a write
126 * length of the whole page (chopped to i_size) to make sure
127 * the whole thing is allocated.
128 *
129 * Since we know the page is up to date, we don't have to
130 * worry about ocfs2_write_begin() skipping some buffer reads
131 * because the "write" would invalidate their data.
132 */
133 if (page->index == last_index)
134 len = size & ~PAGE_CACHE_MASK;
135
136 ret = ocfs2_write_begin_nolock(mapping, pos, len, 0, &locked_page,
137 &fsdata, di_bh, page);
138 if (ret) {
139 if (ret != -ENOSPC)
140 mlog_errno(ret);
141 goto out;
142 }
143
144 ret = ocfs2_write_end_nolock(mapping, pos, len, len, locked_page,
145 fsdata);
146 if (ret < 0) {
147 mlog_errno(ret);
148 goto out;
149 }
150 BUG_ON(ret != len);
151 ret = 0;
152out:
153 return ret;
154}
155
156static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
84{ 157{
85 int ret = 0, lock_level = 0; 158 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
86 struct ocfs2_super *osb = OCFS2_SB(file->f_dentry->d_inode->i_sb); 159 struct buffer_head *di_bh = NULL;
160 sigset_t blocked, oldset;
161 int ret, ret2;
162
163 ret = ocfs2_vm_op_block_sigs(&blocked, &oldset);
164 if (ret < 0) {
165 mlog_errno(ret);
166 return ret;
167 }
168
169 /*
170 * The cluster locks taken will block a truncate from another
171 * node. Taking the data lock will also ensure that we don't
172 * attempt page truncation as part of a downconvert.
173 */
174 ret = ocfs2_meta_lock(inode, &di_bh, 1);
175 if (ret < 0) {
176 mlog_errno(ret);
177 goto out;
178 }
87 179
88 /* 180 /*
89 * Only support shared writeable mmap for local mounts which 181 * The alloc sem should be enough to serialize with
90 * don't know about holes. 182 * ocfs2_truncate_file() changing i_size as well as any thread
183 * modifying the inode btree.
91 */ 184 */
92 if ((!ocfs2_mount_local(osb) || ocfs2_sparse_alloc(osb)) && 185 down_write(&OCFS2_I(inode)->ip_alloc_sem);
93 ((vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_MAYSHARE)) && 186
94 ((vma->vm_flags & VM_WRITE) || (vma->vm_flags & VM_MAYWRITE))) { 187 ret = ocfs2_data_lock(inode, 1);
95 mlog(0, "disallow shared writable mmaps %lx\n", vma->vm_flags); 188 if (ret < 0) {
96 /* This is -EINVAL because generic_file_readonly_mmap 189 mlog_errno(ret);
97 * returns it in a similar situation. */ 190 goto out_meta_unlock;
98 return -EINVAL;
99 } 191 }
100 192
193 ret = __ocfs2_page_mkwrite(inode, di_bh, page);
194
195 ocfs2_data_unlock(inode, 1);
196
197out_meta_unlock:
198 up_write(&OCFS2_I(inode)->ip_alloc_sem);
199
200 brelse(di_bh);
201 ocfs2_meta_unlock(inode, 1);
202
203out:
204 ret2 = ocfs2_vm_op_unblock_sigs(&oldset);
205 if (ret2 < 0)
206 mlog_errno(ret2);
207
208 return ret;
209}
210
211static struct vm_operations_struct ocfs2_file_vm_ops = {
212 .nopage = ocfs2_nopage,
213 .page_mkwrite = ocfs2_page_mkwrite,
214};
215
216int ocfs2_mmap(struct file *file, struct vm_area_struct *vma)
217{
218 int ret = 0, lock_level = 0;
219
101 ret = ocfs2_meta_lock_atime(file->f_dentry->d_inode, 220 ret = ocfs2_meta_lock_atime(file->f_dentry->d_inode,
102 file->f_vfsmnt, &lock_level); 221 file->f_vfsmnt, &lock_level);
103 if (ret < 0) { 222 if (ret < 0) {
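The page_mkwrite handler added above is what lets a shared writable mapping safely dirty pages that may still need cluster locking and allocation. A minimal userspace sketch of the access pattern it serves; the path is a placeholder, and ftruncate() only ensures the mapped page exists:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/mnt/ocfs2/shared.dat", O_RDWR | O_CREAT, 0644);
	char *map;

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (ftruncate(fd, 4096) < 0) {
		perror("ftruncate");
		close(fd);
		return 1;
	}
	/* The first store into a MAP_SHARED page faults through
	 * ->page_mkwrite(), where ocfs2 now takes its cluster locks and
	 * allocates backing space before the write may proceed. */
	map = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (map == MAP_FAILED) {
		perror("mmap");
		close(fd);
		return 1;
	}
	memcpy(map, "hello", 5);
	msync(map, 4096, MS_SYNC);
	munmap(map, 4096);
	close(fd);
	return 0;
}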
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 36289e6295ce..d430fdab16e9 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -1674,7 +1674,7 @@ static int ocfs2_symlink(struct inode *dir,
1674 u32 offset = 0; 1674 u32 offset = 0;
1675 1675
1676 inode->i_op = &ocfs2_symlink_inode_operations; 1676 inode->i_op = &ocfs2_symlink_inode_operations;
1677 status = ocfs2_do_extend_allocation(osb, inode, &offset, 1, 1677 status = ocfs2_do_extend_allocation(osb, inode, &offset, 1, 0,
1678 new_fe_bh, 1678 new_fe_bh,
1679 handle, data_ac, NULL, 1679 handle, data_ac, NULL,
1680 NULL); 1680 NULL);
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index a860633e833f..5cc90a40b3c5 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -219,6 +219,7 @@ struct ocfs2_super
219 u16 max_slots; 219 u16 max_slots;
220 s16 node_num; 220 s16 node_num;
221 s16 slot_num; 221 s16 slot_num;
222 s16 preferred_slot;
222 int s_sectsize_bits; 223 int s_sectsize_bits;
223 int s_clustersize; 224 int s_clustersize;
224 int s_clustersize_bits; 225 int s_clustersize_bits;
@@ -305,6 +306,19 @@ static inline int ocfs2_sparse_alloc(struct ocfs2_super *osb)
305 return 0; 306 return 0;
306} 307}
307 308
309static inline int ocfs2_writes_unwritten_extents(struct ocfs2_super *osb)
310{
311 /*
312 * Support for sparse files is a pre-requisite
313 */
314 if (!ocfs2_sparse_alloc(osb))
315 return 0;
316
317 if (osb->s_feature_ro_compat & OCFS2_FEATURE_RO_COMPAT_UNWRITTEN)
318 return 1;
319 return 0;
320}
321
308/* set / clear functions because cluster events can make these happen 322/* set / clear functions because cluster events can make these happen
309 * in parallel so we want the transitions to be atomic. this also 323 * in parallel so we want the transitions to be atomic. this also
310 * means that any future flags osb_flags must be protected by spinlock 324 * means that any future flags osb_flags must be protected by spinlock
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index f0d9eb08547a..82f8a75b207e 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -88,7 +88,7 @@
88#define OCFS2_FEATURE_COMPAT_SUPP OCFS2_FEATURE_COMPAT_BACKUP_SB 88#define OCFS2_FEATURE_COMPAT_SUPP OCFS2_FEATURE_COMPAT_BACKUP_SB
89#define OCFS2_FEATURE_INCOMPAT_SUPP (OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT \ 89#define OCFS2_FEATURE_INCOMPAT_SUPP (OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT \
90 | OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC) 90 | OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC)
91#define OCFS2_FEATURE_RO_COMPAT_SUPP 0 91#define OCFS2_FEATURE_RO_COMPAT_SUPP OCFS2_FEATURE_RO_COMPAT_UNWRITTEN
92 92
93/* 93/*
94 * Heartbeat-only devices are missing journals and other files. The 94 * Heartbeat-only devices are missing journals and other files. The
@@ -116,6 +116,11 @@
116 */ 116 */
117#define OCFS2_FEATURE_COMPAT_BACKUP_SB 0x0001 117#define OCFS2_FEATURE_COMPAT_BACKUP_SB 0x0001
118 118
119/*
120 * Unwritten extents support.
121 */
122#define OCFS2_FEATURE_RO_COMPAT_UNWRITTEN 0x0001
123
119/* The byte offset of the first backup block will be 1G. 124/* The byte offset of the first backup block will be 1G.
120 * The following will be 4G, 16G, 64G, 256G and 1T. 125 * The following will be 4G, 16G, 64G, 256G and 1T.
121 */ 126 */
@@ -170,6 +175,32 @@
170#define OCFS2_IOC32_SETFLAGS _IOW('f', 2, int) 175#define OCFS2_IOC32_SETFLAGS _IOW('f', 2, int)
171 176
172/* 177/*
178 * Space reservation / allocation / free ioctls and argument structure
179 * are designed to be compatible with XFS.
180 *
181 * ALLOCSP* and FREESP* are not and will never be supported, but are
182 * included here for completeness.
183 */
184struct ocfs2_space_resv {
185 __s16 l_type;
186 __s16 l_whence;
187 __s64 l_start;
188 __s64 l_len; /* len == 0 means until end of file */
189 __s32 l_sysid;
190 __u32 l_pid;
191 __s32 l_pad[4]; /* reserve area */
192};
193
194#define OCFS2_IOC_ALLOCSP _IOW ('X', 10, struct ocfs2_space_resv)
195#define OCFS2_IOC_FREESP _IOW ('X', 11, struct ocfs2_space_resv)
196#define OCFS2_IOC_RESVSP _IOW ('X', 40, struct ocfs2_space_resv)
197#define OCFS2_IOC_UNRESVSP _IOW ('X', 41, struct ocfs2_space_resv)
198#define OCFS2_IOC_ALLOCSP64 _IOW ('X', 36, struct ocfs2_space_resv)
199#define OCFS2_IOC_FREESP64 _IOW ('X', 37, struct ocfs2_space_resv)
200#define OCFS2_IOC_RESVSP64 _IOW ('X', 42, struct ocfs2_space_resv)
201#define OCFS2_IOC_UNRESVSP64 _IOW ('X', 43, struct ocfs2_space_resv)
202
203/*
173 * Journal Flags (ocfs2_dinode.id1.journal1.i_flags) 204 * Journal Flags (ocfs2_dinode.id1.journal1.i_flags)
174 */ 205 */
175#define OCFS2_JOURNAL_DIRTY_FL (0x00000001) /* Journal needs recovery */ 206#define OCFS2_JOURNAL_DIRTY_FL (0x00000001) /* Journal needs recovery */
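For context, a minimal userspace sketch of invoking the new XFS-compatible reservation ioctl; the struct is redeclared locally to mirror the layout above, and the file path is a placeholder (a real program would use the installed ocfs2 headers instead):

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/ioctl.h>
#include <unistd.h>

struct ocfs2_space_resv {
	int16_t		l_type;
	int16_t		l_whence;
	int64_t		l_start;
	int64_t		l_len;		/* 0 means until end of file */
	int32_t		l_sysid;
	uint32_t	l_pid;
	int32_t		l_pad[4];
};

#define OCFS2_IOC_RESVSP64	_IOW('X', 42, struct ocfs2_space_resv)

int main(void)
{
	struct ocfs2_space_resv sr = {
		.l_whence = 0,			/* SEEK_SET */
		.l_start = 0,
		.l_len = 1024 * 1024,		/* reserve 1MiB as unwritten */
	};
	int fd = open("/mnt/ocfs2/prealloc.dat", O_RDWR | O_CREAT, 0644);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (ioctl(fd, OCFS2_IOC_RESVSP64, &sr) < 0)
		perror("OCFS2_IOC_RESVSP64");
	close(fd);
	return 0;
}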
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index d8b79067dc14..af4882b62cfa 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -121,17 +121,25 @@ static s16 __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
121 return ret; 121 return ret;
122} 122}
123 123
124static s16 __ocfs2_find_empty_slot(struct ocfs2_slot_info *si) 124static s16 __ocfs2_find_empty_slot(struct ocfs2_slot_info *si, s16 preferred)
125{ 125{
126 int i; 126 int i;
127 s16 ret = OCFS2_INVALID_SLOT; 127 s16 ret = OCFS2_INVALID_SLOT;
128 128
129 if (preferred >= 0 && preferred < si->si_num_slots) {
130 if (OCFS2_INVALID_SLOT == si->si_global_node_nums[preferred]) {
131 ret = preferred;
132 goto out;
133 }
134 }
135
129 for(i = 0; i < si->si_num_slots; i++) { 136 for(i = 0; i < si->si_num_slots; i++) {
130 if (OCFS2_INVALID_SLOT == si->si_global_node_nums[i]) { 137 if (OCFS2_INVALID_SLOT == si->si_global_node_nums[i]) {
131 ret = (s16) i; 138 ret = (s16) i;
132 break; 139 break;
133 } 140 }
134 } 141 }
142out:
135 return ret; 143 return ret;
136} 144}
137 145
@@ -248,7 +256,7 @@ int ocfs2_find_slot(struct ocfs2_super *osb)
248 if (slot == OCFS2_INVALID_SLOT) { 256 if (slot == OCFS2_INVALID_SLOT) {
249 /* if no slot yet, then just take 1st available 257 /* if no slot yet, then just take 1st available
250 * one. */ 258 * one. */
251 slot = __ocfs2_find_empty_slot(si); 259 slot = __ocfs2_find_empty_slot(si, osb->preferred_slot);
252 if (slot == OCFS2_INVALID_SLOT) { 260 if (slot == OCFS2_INVALID_SLOT) {
253 spin_unlock(&si->si_lock); 261 spin_unlock(&si->si_lock);
254 mlog(ML_ERROR, "no free slots available!\n"); 262 mlog(ML_ERROR, "no free slots available!\n");
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index e3437626d183..d9c5c9fcb30f 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -98,14 +98,6 @@ static int ocfs2_relink_block_group(handle_t *handle,
98 u16 chain); 98 u16 chain);
99static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg, 99static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
100 u32 wanted); 100 u32 wanted);
101static int ocfs2_free_suballoc_bits(handle_t *handle,
102 struct inode *alloc_inode,
103 struct buffer_head *alloc_bh,
104 unsigned int start_bit,
105 u64 bg_blkno,
106 unsigned int count);
107static inline u64 ocfs2_which_suballoc_group(u64 block,
108 unsigned int bit);
109static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode, 101static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
110 u64 bg_blkno, 102 u64 bg_blkno,
111 u16 bg_bit_off); 103 u16 bg_bit_off);
@@ -496,13 +488,7 @@ int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
496 488
497 (*ac)->ac_bits_wanted = ocfs2_extend_meta_needed(fe); 489 (*ac)->ac_bits_wanted = ocfs2_extend_meta_needed(fe);
498 (*ac)->ac_which = OCFS2_AC_USE_META; 490 (*ac)->ac_which = OCFS2_AC_USE_META;
499
500#ifndef OCFS2_USE_ALL_METADATA_SUBALLOCATORS
501 slot = 0;
502#else
503 slot = osb->slot_num; 491 slot = osb->slot_num;
504#endif
505
506 (*ac)->ac_group_search = ocfs2_block_group_search; 492 (*ac)->ac_group_search = ocfs2_block_group_search;
507 493
508 status = ocfs2_reserve_suballoc_bits(osb, (*ac), 494 status = ocfs2_reserve_suballoc_bits(osb, (*ac),
@@ -1626,12 +1612,12 @@ bail:
1626/* 1612/*
1627 * expects the suballoc inode to already be locked. 1613 * expects the suballoc inode to already be locked.
1628 */ 1614 */
1629static int ocfs2_free_suballoc_bits(handle_t *handle, 1615int ocfs2_free_suballoc_bits(handle_t *handle,
1630 struct inode *alloc_inode, 1616 struct inode *alloc_inode,
1631 struct buffer_head *alloc_bh, 1617 struct buffer_head *alloc_bh,
1632 unsigned int start_bit, 1618 unsigned int start_bit,
1633 u64 bg_blkno, 1619 u64 bg_blkno,
1634 unsigned int count) 1620 unsigned int count)
1635{ 1621{
1636 int status = 0; 1622 int status = 0;
1637 u32 tmp_used; 1623 u32 tmp_used;
@@ -1703,13 +1689,6 @@ bail:
1703 return status; 1689 return status;
1704} 1690}
1705 1691
1706static inline u64 ocfs2_which_suballoc_group(u64 block, unsigned int bit)
1707{
1708 u64 group = block - (u64) bit;
1709
1710 return group;
1711}
1712
1713int ocfs2_free_dinode(handle_t *handle, 1692int ocfs2_free_dinode(handle_t *handle,
1714 struct inode *inode_alloc_inode, 1693 struct inode *inode_alloc_inode,
1715 struct buffer_head *inode_alloc_bh, 1694 struct buffer_head *inode_alloc_bh,
@@ -1723,19 +1702,6 @@ int ocfs2_free_dinode(handle_t *handle,
1723 inode_alloc_bh, bit, bg_blkno, 1); 1702 inode_alloc_bh, bit, bg_blkno, 1);
1724} 1703}
1725 1704
1726int ocfs2_free_extent_block(handle_t *handle,
1727 struct inode *eb_alloc_inode,
1728 struct buffer_head *eb_alloc_bh,
1729 struct ocfs2_extent_block *eb)
1730{
1731 u64 blk = le64_to_cpu(eb->h_blkno);
1732 u16 bit = le16_to_cpu(eb->h_suballoc_bit);
1733 u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
1734
1735 return ocfs2_free_suballoc_bits(handle, eb_alloc_inode, eb_alloc_bh,
1736 bit, bg_blkno, 1);
1737}
1738
1739int ocfs2_free_clusters(handle_t *handle, 1705int ocfs2_free_clusters(handle_t *handle,
1740 struct inode *bitmap_inode, 1706 struct inode *bitmap_inode,
1741 struct buffer_head *bitmap_bh, 1707 struct buffer_head *bitmap_bh,
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index 1a3c94cb9250..f212dc01a84b 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -86,20 +86,29 @@ int ocfs2_claim_clusters(struct ocfs2_super *osb,
86 u32 *cluster_start, 86 u32 *cluster_start,
87 u32 *num_clusters); 87 u32 *num_clusters);
88 88
89int ocfs2_free_suballoc_bits(handle_t *handle,
90 struct inode *alloc_inode,
91 struct buffer_head *alloc_bh,
92 unsigned int start_bit,
93 u64 bg_blkno,
94 unsigned int count);
89int ocfs2_free_dinode(handle_t *handle, 95int ocfs2_free_dinode(handle_t *handle,
90 struct inode *inode_alloc_inode, 96 struct inode *inode_alloc_inode,
91 struct buffer_head *inode_alloc_bh, 97 struct buffer_head *inode_alloc_bh,
92 struct ocfs2_dinode *di); 98 struct ocfs2_dinode *di);
93int ocfs2_free_extent_block(handle_t *handle,
94 struct inode *eb_alloc_inode,
95 struct buffer_head *eb_alloc_bh,
96 struct ocfs2_extent_block *eb);
97int ocfs2_free_clusters(handle_t *handle, 99int ocfs2_free_clusters(handle_t *handle,
98 struct inode *bitmap_inode, 100 struct inode *bitmap_inode,
99 struct buffer_head *bitmap_bh, 101 struct buffer_head *bitmap_bh,
100 u64 start_blk, 102 u64 start_blk,
101 unsigned int num_clusters); 103 unsigned int num_clusters);
102 104
105static inline u64 ocfs2_which_suballoc_group(u64 block, unsigned int bit)
106{
107 u64 group = block - (u64) bit;
108
109 return group;
110}
111
103static inline u32 ocfs2_cluster_from_desc(struct ocfs2_super *osb, 112static inline u32 ocfs2_cluster_from_desc(struct ocfs2_super *osb,
104 u64 bg_blkno) 113 u64 bg_blkno)
105{ 114{
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 86b559c7dce9..3a5a1ed09ac9 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -82,7 +82,8 @@ MODULE_AUTHOR("Oracle");
82MODULE_LICENSE("GPL"); 82MODULE_LICENSE("GPL");
83 83
84static int ocfs2_parse_options(struct super_block *sb, char *options, 84static int ocfs2_parse_options(struct super_block *sb, char *options,
85 unsigned long *mount_opt, int is_remount); 85 unsigned long *mount_opt, s16 *slot,
86 int is_remount);
86static void ocfs2_put_super(struct super_block *sb); 87static void ocfs2_put_super(struct super_block *sb);
87static int ocfs2_mount_volume(struct super_block *sb); 88static int ocfs2_mount_volume(struct super_block *sb);
88static int ocfs2_remount(struct super_block *sb, int *flags, char *data); 89static int ocfs2_remount(struct super_block *sb, int *flags, char *data);
@@ -114,8 +115,6 @@ static void ocfs2_write_super(struct super_block *sb);
114static struct inode *ocfs2_alloc_inode(struct super_block *sb); 115static struct inode *ocfs2_alloc_inode(struct super_block *sb);
115static void ocfs2_destroy_inode(struct inode *inode); 116static void ocfs2_destroy_inode(struct inode *inode);
116 117
117static unsigned long long ocfs2_max_file_offset(unsigned int blockshift);
118
119static const struct super_operations ocfs2_sops = { 118static const struct super_operations ocfs2_sops = {
120 .statfs = ocfs2_statfs, 119 .statfs = ocfs2_statfs,
121 .alloc_inode = ocfs2_alloc_inode, 120 .alloc_inode = ocfs2_alloc_inode,
@@ -140,6 +139,7 @@ enum {
140 Opt_data_ordered, 139 Opt_data_ordered,
141 Opt_data_writeback, 140 Opt_data_writeback,
142 Opt_atime_quantum, 141 Opt_atime_quantum,
142 Opt_slot,
143 Opt_err, 143 Opt_err,
144}; 144};
145 145
@@ -154,6 +154,7 @@ static match_table_t tokens = {
154 {Opt_data_ordered, "data=ordered"}, 154 {Opt_data_ordered, "data=ordered"},
155 {Opt_data_writeback, "data=writeback"}, 155 {Opt_data_writeback, "data=writeback"},
156 {Opt_atime_quantum, "atime_quantum=%u"}, 156 {Opt_atime_quantum, "atime_quantum=%u"},
157 {Opt_slot, "preferred_slot=%u"},
157 {Opt_err, NULL} 158 {Opt_err, NULL}
158}; 159};
159 160
@@ -318,7 +319,7 @@ static void ocfs2_destroy_inode(struct inode *inode)
318/* From xfs_super.c:xfs_max_file_offset 319/* From xfs_super.c:xfs_max_file_offset
319 * Copyright (c) 2000-2004 Silicon Graphics, Inc. 320 * Copyright (c) 2000-2004 Silicon Graphics, Inc.
320 */ 321 */
321static unsigned long long ocfs2_max_file_offset(unsigned int blockshift) 322unsigned long long ocfs2_max_file_offset(unsigned int blockshift)
322{ 323{
323 unsigned int pagefactor = 1; 324 unsigned int pagefactor = 1;
324 unsigned int bitshift = BITS_PER_LONG - 1; 325 unsigned int bitshift = BITS_PER_LONG - 1;
@@ -355,9 +356,10 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
355 int incompat_features; 356 int incompat_features;
356 int ret = 0; 357 int ret = 0;
357 unsigned long parsed_options; 358 unsigned long parsed_options;
359 s16 slot;
358 struct ocfs2_super *osb = OCFS2_SB(sb); 360 struct ocfs2_super *osb = OCFS2_SB(sb);
359 361
360 if (!ocfs2_parse_options(sb, data, &parsed_options, 1)) { 362 if (!ocfs2_parse_options(sb, data, &parsed_options, &slot, 1)) {
361 ret = -EINVAL; 363 ret = -EINVAL;
362 goto out; 364 goto out;
363 } 365 }
@@ -534,6 +536,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
534 struct dentry *root; 536 struct dentry *root;
535 int status, sector_size; 537 int status, sector_size;
536 unsigned long parsed_opt; 538 unsigned long parsed_opt;
539 s16 slot;
537 struct inode *inode = NULL; 540 struct inode *inode = NULL;
538 struct ocfs2_super *osb = NULL; 541 struct ocfs2_super *osb = NULL;
539 struct buffer_head *bh = NULL; 542 struct buffer_head *bh = NULL;
@@ -541,7 +544,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
541 544
542 mlog_entry("%p, %p, %i", sb, data, silent); 545 mlog_entry("%p, %p, %i", sb, data, silent);
543 546
544 if (!ocfs2_parse_options(sb, data, &parsed_opt, 0)) { 547 if (!ocfs2_parse_options(sb, data, &parsed_opt, &slot, 0)) {
545 status = -EINVAL; 548 status = -EINVAL;
546 goto read_super_error; 549 goto read_super_error;
547 } 550 }
@@ -571,6 +574,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
571 brelse(bh); 574 brelse(bh);
572 bh = NULL; 575 bh = NULL;
573 osb->s_mount_opt = parsed_opt; 576 osb->s_mount_opt = parsed_opt;
577 osb->preferred_slot = slot;
574 578
575 sb->s_magic = OCFS2_SUPER_MAGIC; 579 sb->s_magic = OCFS2_SUPER_MAGIC;
576 580
@@ -713,6 +717,7 @@ static struct file_system_type ocfs2_fs_type = {
713static int ocfs2_parse_options(struct super_block *sb, 717static int ocfs2_parse_options(struct super_block *sb,
714 char *options, 718 char *options,
715 unsigned long *mount_opt, 719 unsigned long *mount_opt,
720 s16 *slot,
716 int is_remount) 721 int is_remount)
717{ 722{
718 int status; 723 int status;
@@ -722,6 +727,7 @@ static int ocfs2_parse_options(struct super_block *sb,
722 options ? options : "(none)"); 727 options ? options : "(none)");
723 728
724 *mount_opt = 0; 729 *mount_opt = 0;
730 *slot = OCFS2_INVALID_SLOT;
725 731
726 if (!options) { 732 if (!options) {
727 status = 1; 733 status = 1;
@@ -782,6 +788,15 @@ static int ocfs2_parse_options(struct super_block *sb,
782 else 788 else
783 osb->s_atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM; 789 osb->s_atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM;
784 break; 790 break;
791 case Opt_slot:
792 option = 0;
793 if (match_int(&args[0], &option)) {
794 status = 0;
795 goto bail;
796 }
797 if (option)
798 *slot = (s16)option;
799 break;
785 default: 800 default:
786 mlog(ML_ERROR, 801 mlog(ML_ERROR,
787 "Unrecognized mount option \"%s\" " 802 "Unrecognized mount option \"%s\" "
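A minimal userspace sketch of requesting the new preferred_slot option via mount(2); the device and mount point are placeholders, and as the slot_map.c hunk above shows, the kernel falls back to the first free slot if the preferred one is already taken:

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	if (mount("/dev/sdb1", "/mnt/ocfs2", "ocfs2", 0, "preferred_slot=2")) {
		perror("mount");
		return 1;
	}
	return 0;
}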
diff --git a/fs/ocfs2/super.h b/fs/ocfs2/super.h
index 783f5270f2a1..3b9cb3d0b008 100644
--- a/fs/ocfs2/super.h
+++ b/fs/ocfs2/super.h
@@ -45,4 +45,6 @@ void __ocfs2_abort(struct super_block *sb,
45 45
46#define ocfs2_abort(sb, fmt, args...) __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##args) 46#define ocfs2_abort(sb, fmt, args...) __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##args)
47 47
48unsigned long long ocfs2_max_file_offset(unsigned int blockshift);
49
48#endif /* OCFS2_SUPER_H */ 50#endif /* OCFS2_SUPER_H */